diff --git a/q2_vsearch/_cluster_sequences.py b/q2_vsearch/_cluster_sequences.py index a8cbafa..8419bae 100644 --- a/q2_vsearch/_cluster_sequences.py +++ b/q2_vsearch/_cluster_sequences.py @@ -17,30 +17,47 @@ from ._cluster_features import run_command -def dereplicate_sequences(sequences: QIIME1DemuxDirFmt, - derep_prefix: bool = False, - min_seq_length: int = 1, - min_unique_size: int = 1 - ) -> (biom.Table, DNAFASTAFormat): +def dereplicate_sequences( + sequences: QIIME1DemuxDirFmt, + derep_prefix: bool = False, + min_seq_length: int = 1, + min_unique_size: int = 1, +) -> (biom.Table, DNAFASTAFormat): dereplicated_sequences = DNAFASTAFormat() + with tempfile.NamedTemporaryFile(mode='w+') as out_uc: seqs_fp = '%s/seqs.fna' % str(sequences) - cmd = ['vsearch', - '--derep_prefix' if derep_prefix else '--derep_fulllength', - seqs_fp, - '--output', str(dereplicated_sequences), - '--relabel_sha1', '--relabel_keep', - '--uc', out_uc.name, - '--xsize', - '--minseqlength', str(min_seq_length), - '--minuniquesize', str(min_unique_size), - '--fasta_width', '0'] + cmd = [ + 'vsearch', + '--derep_prefix' if derep_prefix else '--derep_fulllength', + seqs_fp, + '--output', str(dereplicated_sequences), + '--relabel_sha1', + '--relabel_keep', + '--uc', out_uc.name, + '--xsize', + '--minseqlength', str(min_seq_length), + '--minuniquesize', str(min_unique_size), + '--fasta_width', '0' + ] run_command(cmd) + out_uc.seek(0) table = parse_uc(out_uc) - id_map = {e.metadata['description']: e.metadata['id'] - for e in skbio.io.read(str(dereplicated_sequences), - constructor=skbio.DNA, - format='fasta')} - table.update_ids(id_map=id_map, axis='observation') + + id_map = { + e.metadata['description']: e.metadata['id'] for e in skbio.io.read( + str(dereplicated_sequences), + constructor=skbio.DNA, + format='fasta' + ) + } + + # keep only sequence ids that are in the dereplicated sequences + dereplicated_ids = list(id_map.keys()) + table = table.filter(dereplicated_ids, axis='observation') + + # rename feature ids to sha1 sequence hashes + table = table.update_ids(id_map=id_map, axis='observation') + return table, dereplicated_sequences diff --git a/q2_vsearch/tests/test_cluster_sequences.py b/q2_vsearch/tests/test_cluster_sequences.py index 50395b0..b6840cc 100644 --- a/q2_vsearch/tests/test_cluster_sequences.py +++ b/q2_vsearch/tests/test_cluster_sequences.py @@ -95,6 +95,56 @@ def test_dereplicate_sequences_min_length(self): 'description': 's2_42'})] self.assertEqual(obs_seqs, exp_seqs) + def test_dereplicate_sequences_min_unique_size(self): + input_sequences_fp = self.get_data_path('seqs-1') + input_sequences = QIIME1DemuxDirFmt(input_sequences_fp, 'r') + + exp_table = biom.Table( + np.array([[2, 1], [0, 2]]), + [ + '4574b947a0159c0da35a1f30f989681a1d9f64ef', + '16a1263bde4f2f99422630d1bb87935c4236d1ba' + ], + ['sample1', 's2'] + ) + + with redirected_stdio(stderr=os.devnull): + obs_table, obs_sequences = dereplicate_sequences( + sequences=input_sequences, + min_unique_size=2 + ) + + # order of identifiers is important for biom.Table equality + obs_table = obs_table.sort_order( + exp_table.ids(axis='observation'), axis='observation' + ) + self.assertEqual(obs_table, exp_table) + + # sequences are reverse-sorted by abundance in output + # sequence s2_2 is missing from output because it has an abundance of 1 + obs_seqs = list( + skbio.io.read( + str(obs_sequences), constructor=skbio.DNA, format='fasta' + ) + ) + exp_seqs = [ + skbio.DNA( + 'AAACGTTACGGTTAACTATACATGCAGAAGACTAATCGG', + metadata={ + 'id': ('4574b947a0159c0da35a1f30f989681a1d9f64ef'), + 'description': 'sample1_1' + } + ), + skbio.DNA( + 'ACGTACGTACGTACGTACGTACGTACGTACGTGCATGGTGCGACCG', + metadata={ + 'id': ('16a1263bde4f2f99422630d1bb87935c4236d1ba'), + 'description': 's2_42' + } + ) + ] + self.assertEqual(obs_seqs, exp_seqs) + def test_dereplicate_sequences_underscores_in_ids(self): input_sequences_fp = self.get_data_path('seqs-2') input_sequences = QIIME1DemuxDirFmt(input_sequences_fp, 'r')