Skip to content

Commit

Permalink
BUG: allow usage of min_unique_size parameter (#102)
Browse files Browse the repository at this point in the history
  • Loading branch information
colinvwood authored Jan 23, 2025
1 parent 68e90ae commit 86fb84e
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 20 deletions.
57 changes: 37 additions & 20 deletions q2_vsearch/_cluster_sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,30 +17,47 @@
from ._cluster_features import run_command


def dereplicate_sequences(sequences: QIIME1DemuxDirFmt,
derep_prefix: bool = False,
min_seq_length: int = 1,
min_unique_size: int = 1
) -> (biom.Table, DNAFASTAFormat):
def dereplicate_sequences(
sequences: QIIME1DemuxDirFmt,
derep_prefix: bool = False,
min_seq_length: int = 1,
min_unique_size: int = 1,
) -> (biom.Table, DNAFASTAFormat):
dereplicated_sequences = DNAFASTAFormat()

with tempfile.NamedTemporaryFile(mode='w+') as out_uc:
seqs_fp = '%s/seqs.fna' % str(sequences)
cmd = ['vsearch',
'--derep_prefix' if derep_prefix else '--derep_fulllength',
seqs_fp,
'--output', str(dereplicated_sequences),
'--relabel_sha1', '--relabel_keep',
'--uc', out_uc.name,
'--xsize',
'--minseqlength', str(min_seq_length),
'--minuniquesize', str(min_unique_size),
'--fasta_width', '0']
cmd = [
'vsearch',
'--derep_prefix' if derep_prefix else '--derep_fulllength',
seqs_fp,
'--output', str(dereplicated_sequences),
'--relabel_sha1',
'--relabel_keep',
'--uc', out_uc.name,
'--xsize',
'--minseqlength', str(min_seq_length),
'--minuniquesize', str(min_unique_size),
'--fasta_width', '0'
]
run_command(cmd)

out_uc.seek(0)
table = parse_uc(out_uc)
id_map = {e.metadata['description']: e.metadata['id']
for e in skbio.io.read(str(dereplicated_sequences),
constructor=skbio.DNA,
format='fasta')}
table.update_ids(id_map=id_map, axis='observation')

id_map = {
e.metadata['description']: e.metadata['id'] for e in skbio.io.read(
str(dereplicated_sequences),
constructor=skbio.DNA,
format='fasta'
)
}

# keep only sequence ids that are in the dereplicated sequences
dereplicated_ids = list(id_map.keys())
table = table.filter(dereplicated_ids, axis='observation')

# rename feature ids to sha1 sequence hashes
table = table.update_ids(id_map=id_map, axis='observation')

return table, dereplicated_sequences
50 changes: 50 additions & 0 deletions q2_vsearch/tests/test_cluster_sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,56 @@ def test_dereplicate_sequences_min_length(self):
'description': 's2_42'})]
self.assertEqual(obs_seqs, exp_seqs)

def test_dereplicate_sequences_min_unique_size(self):
input_sequences_fp = self.get_data_path('seqs-1')
input_sequences = QIIME1DemuxDirFmt(input_sequences_fp, 'r')

exp_table = biom.Table(
np.array([[2, 1], [0, 2]]),
[
'4574b947a0159c0da35a1f30f989681a1d9f64ef',
'16a1263bde4f2f99422630d1bb87935c4236d1ba'
],
['sample1', 's2']
)

with redirected_stdio(stderr=os.devnull):
obs_table, obs_sequences = dereplicate_sequences(
sequences=input_sequences,
min_unique_size=2
)

# order of identifiers is important for biom.Table equality
obs_table = obs_table.sort_order(
exp_table.ids(axis='observation'), axis='observation'
)
self.assertEqual(obs_table, exp_table)

# sequences are reverse-sorted by abundance in output
# sequence s2_2 is missing from output because it has an abundance of 1
obs_seqs = list(
skbio.io.read(
str(obs_sequences), constructor=skbio.DNA, format='fasta'
)
)
exp_seqs = [
skbio.DNA(
'AAACGTTACGGTTAACTATACATGCAGAAGACTAATCGG',
metadata={
'id': ('4574b947a0159c0da35a1f30f989681a1d9f64ef'),
'description': 'sample1_1'
}
),
skbio.DNA(
'ACGTACGTACGTACGTACGTACGTACGTACGTGCATGGTGCGACCG',
metadata={
'id': ('16a1263bde4f2f99422630d1bb87935c4236d1ba'),
'description': 's2_42'
}
)
]
self.assertEqual(obs_seqs, exp_seqs)

def test_dereplicate_sequences_underscores_in_ids(self):
input_sequences_fp = self.get_data_path('seqs-2')
input_sequences = QIIME1DemuxDirFmt(input_sequences_fp, 'r')
Expand Down

0 comments on commit 86fb84e

Please sign in to comment.