Skip to content

Commit

Permalink
ingest/joined-ncbi: Dedup by sample id in strain
Browse files Browse the repository at this point in the history
Resolves <#95>

After appending the Andersen lab metadata to the NCBI metadata, dedup
the records by sample id within the strain name. Then the sequences are
filtered by the final strain names in the metadata TSV.
  • Loading branch information
joverlee521 committed Oct 11, 2024
1 parent e98c98c commit 58c0508
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 35 deletions.
44 changes: 44 additions & 0 deletions ingest/build-configs/ncbi/bin/dedup-by-sample-id
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/usr/bin/env python3
"""
Deduplicate records by the sample id of the `strain` field.
For example, the following strain names have duplicate sample id 24-005334-001
- A/chicken/Ohio/24-005334-001/2024
- A/Chicken/USA/24-005334-001-original/2024
Only keeps the first record of duplicates and keeps record if the sample id
cannot be parsed from `strain`.
"""
import json
import re
from sys import stderr, stdin, stdout


SAMPLE_ID_REGEX = r"^A\/[^\/]*\/[^\/]*\/(\d{2}-\d{6}-\d{3})[^\/]*\/\d{4}$"
STRAIN_FIELD = "strain"


if __name__ == "__main__":
seen_sample_ids = set()

for index, record in enumerate(stdin):
record = json.loads(record).copy()

strain = record[STRAIN_FIELD]
sample_id_matches = re.match(SAMPLE_ID_REGEX, strain)
if sample_id_matches is not None:
sample_id = sample_id_matches.group(1)
if sample_id in seen_sample_ids:
print(
f"Dropping record {index!r} because it has a duplicate ",
f"sample ID {sample_id!r} in the strain name {strain!r}",
file=stderr
)
continue

seen_sample_ids.add(sample_id)

# Not every strain name has a matching sample id
# Just keep records that don't match
json.dump(record, stdout, allow_nan=False, indent=None, separators=',:')
print()
77 changes: 42 additions & 35 deletions ingest/build-configs/ncbi/rules/join_ncbi_andersen.smk
Original file line number Diff line number Diff line change
Expand Up @@ -26,66 +26,73 @@ rule select_missing_metadata:
"""


rule select_missing_strain_names:
rule append_missing_metadata_to_ncbi:
input:
ncbi_metadata = "ncbi/results/metadata.tsv",
missing_metadata = "joined-ncbi/data/missing_metadata.tsv",
output:
missing_sequence_ids = "joined-ncbi/data/missing_sequence_ids.txt",
joined_metadata = "joined-ncbi/data/all_metadata.tsv",
params:
sequence_id_column = config["curate"]["output_id_field"],
source_column_name = config["join_ncbi_andersen"]["source_column_name"],
ncbi_source = config["join_ncbi_andersen"]["ncbi_source"],
andersen_source = config["join_ncbi_andersen"]["andersen_source"],
dedup_column = config["join_ncbi_andersen"]["dedup_column"],
shell:
"""
tsv-select -H -f {params.sequence_id_column} \
{input.missing_metadata} \
> {output.missing_sequence_ids}
tsv-append \
--source-header {params.source_column_name} \
--file {params.ncbi_source}={input.ncbi_metadata} \
--file {params.andersen_source}={input.missing_metadata} \
| tsv-uniq -H -f {params.dedup_column} \
> {output.joined_metadata}
"""


rule select_missing_sequences:
rule dedup_metadata_by_sample_id:
input:
missing_sequence_ids = "joined-ncbi/data/missing_sequence_ids.txt",
andersen_sequences = "andersen-lab/results/sequences_{segment}.fasta",
all_joined_metadata="joined-ncbi/data/all_metadata.tsv",
output:
missing_sequences = "joined-ncbi/data/missing_sequences_{segment}.fasta",
joined_metadata="joined-ncbi/results/metadata.tsv",
params:
id_field=config["curate"]["output_id_field"],
log:
"logs/joined-ncbi/dedup_metadata_by_sample_id.txt",
shell:
"""
seqkit grep -f {input.missing_sequence_ids} \
{input.andersen_sequences} \
> {output.missing_sequences}
r"""
(augur curate passthru \
--id-column {params.id_field:q} \
--metadata {input.all_joined_metadata:q} \
| ./build-configs/ncbi/bin/dedup-by-sample-id \
| augur curate passthru \
--output-metadata {output.joined_metadata}) 2>> {log}
"""


rule append_missing_metadata_to_ncbi:
rule select_final_strain_names:
input:
ncbi_metadata = "ncbi/results/metadata.tsv",
missing_metadata = "joined-ncbi/data/missing_metadata.tsv",
joined_metadata="joined-ncbi/results/metadata.tsv",
output:
joined_metadata = "joined-ncbi/results/metadata.tsv",
strain_names = "joined-ncbi/data/final_strain_names.txt",
params:
source_column_name = config["join_ncbi_andersen"]["source_column_name"],
ncbi_source = config["join_ncbi_andersen"]["ncbi_source"],
andersen_source = config["join_ncbi_andersen"]["andersen_source"],
dedup_column = config["join_ncbi_andersen"]["dedup_column"],
sequence_id_column = config["curate"]["output_id_field"],
shell:
"""
tsv-append \
--source-header {params.source_column_name} \
--file {params.ncbi_source}={input.ncbi_metadata} \
--file {params.andersen_source}={input.missing_metadata} \
| tsv-uniq -H -f {params.dedup_column} \
> {output.joined_metadata}
r"""
tsv-select -H -f {params.sequence_id_column} \
{input.joined_metadata} \
> {output.strain_names}
"""


rule append_missing_sequences_to_ncbi:
rule select_final_sequences:
input:
strain_names = "joined-ncbi/data/final_strain_names.txt",
ncbi_sequences = "ncbi/results/sequences_{segment}.fasta",
missing_sequences = "joined-ncbi/data/missing_sequences_{segment}.fasta",
andersen_sequences = "andersen-lab/results/sequences_{segment}.fasta",
output:
joined_sequences = "joined-ncbi/results/sequences_{segment}.fasta",
shell:
"""
cat {input.ncbi_sequences} {input.missing_sequences} \
| seqkit rmdup \
> {output.joined_sequences}
r"""
cat {input.ncbi_sequences} {input.andersen_sequences} \
| seqkit grep -f {input.strain_names} \
> {output.joined_sequences}
"""

0 comments on commit 58c0508

Please sign in to comment.