From 12e8964a8586c3dd0726d1486e557c4b1a5b6218 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 11 Oct 2024 16:48:27 -0700 Subject: [PATCH] ingest: Dedup by sample id within NCBI and Andersen lab data In investigating the duplicates dropped from the joined-ncbi metadata, I realized that these duplicates were not purely from the merge of the two data sources. This commit deduplicates by sample id in the upstream metadata as well. There's no need to change the processing of sequence FASTAs at this point because they are still matched by their respective accessions instead of strain name. --- ingest/build-configs/ncbi/rules/curate.smk | 3 ++- ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/ingest/build-configs/ncbi/rules/curate.smk b/ingest/build-configs/ncbi/rules/curate.smk index 0e025ce..a5bde36 100644 --- a/ingest/build-configs/ncbi/rules/curate.smk +++ b/ingest/build-configs/ncbi/rules/curate.smk @@ -123,11 +123,12 @@ rule split_curated_ndjson_by_segment: benchmark: "ncbi/benchmarks/{segment}/split_curated_ndjson_by_segment.txt" shell: - """ + r""" (cat {input.curated_ndjson} \ | ./build-configs/ncbi/bin/filter-ndjson-by-segment \ --segment {wildcards.segment} \ | ./build-configs/ncbi/bin/dedup-by-strain \ + | ./build-configs/ncbi/bin/dedup-by-sample-id \ | augur curate passthru \ --output-metadata {output.metadata} \ --output-fasta {output.sequences} \ diff --git a/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk b/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk index 6852aa6..2d31d35 100644 --- a/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk +++ b/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk @@ -127,11 +127,12 @@ rule curate_metadata: expected_date_formats=['%Y-%m-%d', '%Y', '%Y-%m-%d %H:%M:%S', '%Y-%m-%dT%H:%M:%SZ'], annotations_id=config["curate"]["annotations_id"], shell: - """ - augur curate normalize-strings \ + r""" + (augur curate normalize-strings \ --metadata {input.metadata} \ | ./build-configs/ncbi/bin/curate-andersen-lab-data \ | ./build-configs/ncbi/bin/dedup-by-strain \ + | ./build-configs/ncbi/bin/dedup-by-sample-id \ | augur curate format-dates \ --date-fields {params.date_fields:q} \ --expected-date-formats {params.expected_date_formats:q} \ @@ -143,7 +144,7 @@ rule curate_metadata: --annotations {input.annotations} \ --id-field {params.annotations_id} \ | augur curate passthru \ - --output-metadata {output.metadata} 2>> {log} + --output-metadata {output.metadata}) 2>> {log} """ rule match_metadata_and_segment_fasta: