diff --git a/.gitignore b/.gitignore index 1d3eff1..840ed05 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ benchmarks/ ingest/fauna/ ingest/andersen-lab/ ingest/ncbi/ +ingest/joined-ncbi/ # Sensitive environment variables environment* diff --git a/README.md b/README.md index 63f0ebf..66f749a 100755 --- a/README.md +++ b/README.md @@ -74,7 +74,7 @@ nextstrain build \ --env AWS_SECRET_ACCESS_KEY \ . \ --snakefile Snakefile.genome \ - --config s3_src=s3://nextstrain-data/files/workflows/avian-flu/h5n1/ncbi + --config s3_src=s3://nextstrain-data/files/workflows/avian-flu/h5n1 ``` Currently this is only set up for the "h5n1-cattle-outbreak" build using NCBI data, diff --git a/Snakefile.genome b/Snakefile.genome index bcd02d9..27bd8be 100644 --- a/Snakefile.genome +++ b/Snakefile.genome @@ -1,8 +1,10 @@ include: "rules/common.smk" +SUPPORTED_LOCAL_SOURCES = ["ncbi", "andersen-lab", "joined-ncbi"] + if LOCAL_INGEST: - assert INGEST_SOURCE == "ncbi", \ - "Full genome build is only set up for locat ingest from 'ncbi'." + assert INGEST_SOURCE in SUPPORTED_LOCAL_SOURCES, \ + f"Full genome build is only set up for local ingest from {SUPPORTED_LOCAL_SOURCES}." 
else: assert S3_SRC.startswith("s3://nextstrain-data/"), \ "Full genome build is only set up for data from the public S3 bucket" diff --git a/config/auspice_config_h5n1-cattle-outbreak.json b/config/auspice_config_h5n1-cattle-outbreak.json index 7548457..1de23e7 100755 --- a/config/auspice_config_h5n1-cattle-outbreak.json +++ b/config/auspice_config_h5n1-cattle-outbreak.json @@ -100,6 +100,11 @@ "key": "submitting_lab", "title": "Submitting Lab", "type": "categorical" + }, + { + "key": "data_source", + "title": "Data Source", + "type": "categorical" } ], "geo_resolutions": [ @@ -123,6 +128,7 @@ "submitting_lab" ], "metadata_columns": [ - "genbank_accession" + "genbank_accession", + "sra_accessions" ] } diff --git a/config/description_h5n1-cattle-outbreak.md b/config/description_h5n1-cattle-outbreak.md index 7414468..24adc8e 100644 --- a/config/description_h5n1-cattle-outbreak.md +++ b/config/description_h5n1-cattle-outbreak.md @@ -1,3 +1,6 @@ We gratefully acknowledge the authors, originating and submitting laboratories of the genetic sequences and metadata for sharing their work. Please note that although data generators have generously shared data in an open fashion, that does not mean there should be free license to publish on this data. Data generators should be cited where possible and collaborations should be sought in some circumstances. Please try to avoid scooping someone else's work. Reach out if uncertain. Genomic data from the ongoing outbreak of H5N1 in cattle in the US was shared by the [National Veterinary Services Laboratories (NVSL)](https://www.aphis.usda.gov/labs/about-nvsl) of the [Animal and Plant Health Inspection Service (APHIS)](https://www.aphis.usda.gov/) of the U.S. Department of Agriculture (USDA) in an open fashion to NCBI GenBank. + +NCBI GenBank data is supplemented with publicly available consensus sequences and metadata +from Andersen lab's [avian-influenza repo](https://github.com/andersen-lab/avian-influenza). 
diff --git a/config/dropped_strains_h5n1-cattle-outbreak.txt b/config/dropped_strains_h5n1-cattle-outbreak.txt index b23361e..9650479 100755 --- a/config/dropped_strains_h5n1-cattle-outbreak.txt +++ b/config/dropped_strains_h5n1-cattle-outbreak.txt @@ -1,3 +1,14 @@ +# Duplicated entries from Andersen Lab +A/Cattle/USA/24-008749-004-v/2024 # Duplicate of A/cattle/Texas/24-008749-004/2024 +A/Cattle/USA/24-008749-006-original/2024 # Duplicate of A/cattle/Texas/24-008749-006-original/2024 +A/Cattle/USA/24-008749-006-v/2024 # Duplicate of A/cattle/Texas/24-008749-006-original/2024 +A/Cattle/USA/24-008766-001-original/2024 # Duplicate of A/cattle/Kansas/24-008766-001/2024 +A/Cattle/USA/24-008766-001-v/2024 # Duplicate of A/cattle/Kansas/24-008766-001/2024 +A/Cattle/USA/24-009027-002-original/2024 # Duplicate of A/cattle/Michigan/24-009027-002/2024 +A/Cattle/USA/24-009027-002-v/2024 # Duplicate of A/cattle/Michigan/24-009027-002/2024 +A/PEFA/USA/24-005915-001-original/2024 # Duplicate of A/Peregrinefalcon/California/24-005915-001/2024 +A/Skunk/USA/24-006483-001-original/2024 # Duplicate of A/skunk/NewMexico/24-006483-001/2024 + # Dropping these strains from include due to excess private mutations A/cattle/NorthCarolina/24-010327-002/2024 A/cattle/Texas/24-009495-007/2024 @@ -86,4 +97,3 @@ A/turkey/SouthDakota/24-007379-002/2024 A/turkeyvulture/California/24-007419-001/2024 A/westerngull/California/24-004708-001/2024 A/woodduck/NorthCarolina/W24-026/2024 - diff --git a/ingest/README.md b/ingest/README.md index 5881825..a846104 100644 --- a/ingest/README.md +++ b/ingest/README.md @@ -35,6 +35,18 @@ nextstrain build . ingest_andersen_lab --configfile build-configs/ncbi/defaults/ The results will be available in `andersen-lab/results/`. +### Ingest and join NCBI GenBank and Andersen lab data + +To ingest and join NCBI GenBank and Andersen lab data, we deduplicate records using SRA accessions. 
+Andersen lab data with SRA accessions _not_ found in the NCBI GenBank data are appended +to the metadata TSV and sequence FASTA files by running: + +``` +nextstrain build . ingest_joined_ncbi --configfile build-configs/ncbi/defaults/config.yaml +``` + +This results in files `metadata.tsv`, `sequences_ha.fasta`, etc... under `joined-ncbi/results/`. + #### Upload to S3 To run both NCBI Genbank and Andersent Lab ingests _and_ upload results to S3, @@ -49,9 +61,10 @@ nextstrain build \ --configfile build-configs/ncbi/defaults/config.yaml ``` -The workflow compresses and uploads the local files to S3 to corresponding paths -under `s3://nextstrain-data/files/workflows/avian-flu/h5n1/ncbi` and -`s3://nextstrain-data/files/workflows/avian-flu/h5n1/andersen-lab`. +The workflow compresses and uploads the local files to S3 to corresponding paths under +- joined-ncbi = `s3://nextstrain-data/files/workflows/avian-flu/h5n1` +- ncbi = `s3://nextstrain-data/files/workflows/avian-flu/h5n1/ncbi` +- andersen-lab = `s3://nextstrain-data/files/workflows/avian-flu/h5n1/andersen-lab`. ### Ingest and upload data from fauna to S3 diff --git a/ingest/Snakefile b/ingest/Snakefile index eb3839c..7cb238f 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -3,7 +3,7 @@ path_to_fauna = '../fauna' # Use default configuration values. Override with Snakemake's --configfile/--config options. 
configfile: "defaults/config.yaml" -SUPPORTED_DATA_SOURCES = ["fauna", "ncbi", "andersen-lab"] +SUPPORTED_DATA_SOURCES = ["fauna", "ncbi", "andersen-lab", "joined-ncbi"] wildcard_constraints: segment = "|".join(config["segments"]), diff --git a/ingest/build-configs/ncbi/Snakefile b/ingest/build-configs/ncbi/Snakefile index 1b2c62b..c1052d9 100644 --- a/ingest/build-configs/ncbi/Snakefile +++ b/ingest/build-configs/ncbi/Snakefile @@ -8,7 +8,7 @@ configfile: "build-configs/ncbi/defaults/config.yaml" # Sanity check that the requested segments match our ncbi_segments map assert all(segment in config["ncbi_segments"].keys() for segment in config["segments"]) -NCBI_DATA_SOURCES = ["ncbi", "andersen-lab"] +NCBI_DATA_SOURCES = ["ncbi", "andersen-lab", "joined-ncbi"] rule ingest_ncbi: input: @@ -26,6 +26,14 @@ rule ingest_andersen_lab: "andersen-lab/results/metadata.tsv", +rule ingest_joined_ncbi: + input: + expand([ + "joined-ncbi/results/sequences_{segment}.fasta", + ], segment=config["segments"]), + "joined-ncbi/results/metadata.tsv", + + # Uploads all results for NCBI and Andersen Lab ingests rule upload_all_ncbi: input: @@ -39,3 +47,4 @@ rule upload_all_ncbi: include: "rules/ingest_andersen_lab.smk" include: "rules/fetch_from_ncbi.smk" include: "rules/curate.smk" +include: "rules/join_ncbi_andersen.smk" diff --git a/ingest/build-configs/ncbi/bin/curate_andersen_lab_data b/ingest/build-configs/ncbi/bin/curate-andersen-lab-data similarity index 96% rename from ingest/build-configs/ncbi/bin/curate_andersen_lab_data rename to ingest/build-configs/ncbi/bin/curate-andersen-lab-data index 5cc636a..42e9690 100755 --- a/ingest/build-configs/ncbi/bin/curate_andersen_lab_data +++ b/ingest/build-configs/ncbi/bin/curate-andersen-lab-data @@ -30,6 +30,8 @@ NEXTSTRAIN_RECORD = { 'PMID': '?', 'gisaid_clade': '?', 'h5_clade': '?', + 'genbank_accession': '?', + 'sra_accessions': '?', } @@ -40,6 +42,7 @@ def create_new_record(anderson_record: dict) -> dict: """ new_record = 
copy.deepcopy(NEXTSTRAIN_RECORD) new_record['isolate_id'] = anderson_record['Run'] + new_record['sra_accessions'] = anderson_record['Run'] new_record['division'] = anderson_record['US State'] new_record['location'] = anderson_record['US State'] diff --git a/ingest/build-configs/ncbi/defaults/config.yaml b/ingest/build-configs/ncbi/defaults/config.yaml index af71ec9..b0a5046 100644 --- a/ingest/build-configs/ncbi/defaults/config.yaml +++ b/ingest/build-configs/ncbi/defaults/config.yaml @@ -126,9 +126,17 @@ curate: - gisaid_clade - h5_clade - genbank_accession + - sra_accessions s3_dst: ncbi: s3://nextstrain-data/files/workflows/avian-flu/h5n1/ncbi andersen-lab: s3://nextstrain-data/files/workflows/avian-flu/h5n1/andersen-lab + joined-ncbi: s3://nextstrain-data/files/workflows/avian-flu/h5n1 cloudfront_domain: data.nextstrain.org + +join_ncbi_andersen: + match_field: sra_accessions + source_column_name: data_source + ncbi_source: genbank + andersen_source: andersen-lab diff --git a/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk b/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk index 25dfb51..86c07f8 100644 --- a/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk +++ b/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk @@ -83,7 +83,7 @@ rule curate_metadata: """ augur curate normalize-strings \ --metadata {input.metadata} \ - | ./build-configs/ncbi/bin/curate_andersen_lab_data \ + | ./build-configs/ncbi/bin/curate-andersen-lab-data \ | ./vendored/apply-geolocation-rules \ --geolocation-rules {input.geolocation_rules} \ | augur curate passthru \ @@ -99,8 +99,12 @@ rule match_metadata_and_segment_fasta: metadata = "andersen-lab/data/metadata.tsv", fasta = "andersen-lab/data/{segment}.fasta" output: - metadata = "andersen-lab/data/metadata_{segment}.tsv", - fasta = "andersen-lab/results/sequences_{segment}.fasta" + metadata = "andersen-lab/data/matched_metadata_{segment}.tsv", + fasta = "andersen-lab/results/sequences_{segment}.fasta", + 
params: + input_id_field="isolate_id", + sequence_field="sequence", + output_id_field=config["curate"]["output_id_field"], log: "andersen-lab/logs/match_segment_metadata_and_fasta/{segment}.txt", shell: @@ -108,13 +112,34 @@ augur curate passthru \ --metadata {input.metadata} \ --fasta {input.fasta} \ - --seq-id-column isolate_id \ - --seq-field sequence \ + --seq-id-column {params.input_id_field} \ + --seq-field {params.sequence_field} \ --unmatched-reporting warn \ --duplicate-reporting warn \ --output-metadata {output.metadata} \ --output-fasta {output.fasta} \ - --output-id-field strain \ - --output-seq-field sequence \ + --output-id-field {params.output_id_field} \ + --output-seq-field {params.sequence_field} \ 2> {log} """ + + +rule reorder_metadata_columns: + """ + Using tsv-select to reorder the columns of the Andersen lab metadata to + exactly match the NCBI metadata columns. Ensures that we don't accidentally + append the wrong columns in joining steps. + + tsv-select will exit with error if the column does not exist. + """ + input: + metadata = "andersen-lab/data/matched_metadata_{segment}.tsv", + output: + reordered_metadata = "andersen-lab/data/metadata_{segment}.tsv" + params: + metadata_fields=",".join(config["curate"]["metadata_columns"]), + shell: + """ + tsv-select -H -f {params.metadata_fields} \ + {input.metadata} > {output.reordered_metadata} + """ diff --git a/ingest/build-configs/ncbi/rules/join_ncbi_andersen.smk b/ingest/build-configs/ncbi/rules/join_ncbi_andersen.smk new file mode 100644 index 0000000..9e62933 --- /dev/null +++ b/ingest/build-configs/ncbi/rules/join_ncbi_andersen.smk @@ -0,0 +1,87 @@ +""" +This part of the workflow handles the joining and deduplication of NCBI and +Andersen lab data. +""" + + +rule select_missing_metadata: + """ + Uses tsv-join --exclude flag to exclude matching records so we are + left with any missing metadata from the Andersen lab. 
+ """ + input: + ncbi_metadata = "ncbi/results/metadata.tsv", + andersen_metadata = "andersen-lab/results/metadata.tsv", + output: + missing_metadata = "joined-ncbi/data/missing_metadata.tsv", + params: + match_field = config["join_ncbi_andersen"]["match_field"], + shell: + """ + tsv-join -H \ + --exclude \ + --filter-file {input.ncbi_metadata} \ + --key-fields {params.match_field} \ + {input.andersen_metadata} > {output.missing_metadata} + """ + + +rule select_missing_strain_names: + input: + missing_metadata = "joined-ncbi/data/missing_metadata.tsv", + output: + missing_sequence_ids = "joined-ncbi/data/missing_sequence_ids.txt", + params: + sequence_id_column = config["curate"]["output_id_field"], + shell: + """ + tsv-select -H -f {params.sequence_id_column} \ + {input.missing_metadata} \ + > {output.missing_sequence_ids} + """ + + +rule select_missing_sequences: + input: + missing_sequence_ids = "joined-ncbi/data/missing_sequence_ids.txt", + andersen_sequences = "andersen-lab/results/sequences_{segment}.fasta", + output: + missing_sequences = "joined-ncbi/data/missing_sequences_{segment}.fasta", + shell: + """ + seqkit grep -f {input.missing_sequence_ids} \ + {input.andersen_sequences} \ + > {output.missing_sequences} + """ + + +rule append_missing_metadata_to_ncbi: + input: + ncbi_metadata = "ncbi/results/metadata.tsv", + missing_metadata = "joined-ncbi/data/missing_metadata.tsv", + output: + joined_metadata = "joined-ncbi/results/metadata.tsv", + params: + source_column_name = config["join_ncbi_andersen"]["source_column_name"], + ncbi_source = config["join_ncbi_andersen"]["ncbi_source"], + andersen_source = config["join_ncbi_andersen"]["andersen_source"], + shell: + """ + tsv-append \ + --source-header {params.source_column_name} \ + --file {params.ncbi_source}={input.ncbi_metadata} \ + --file {params.andersen_source}={input.missing_metadata} \ + > {output.joined_metadata} + """ + + +rule append_missing_sequences_to_ncbi: + input: + ncbi_sequences = 
"ncbi/results/sequences_{segment}.fasta", + missing_sequences = "joined-ncbi/data/missing_sequences_{segment}.fasta", + output: + joined_sequences = "joined-ncbi/results/sequences_{segment}.fasta", + shell: + """ + cat {input.ncbi_sequences} {input.missing_sequences} > {output.joined_sequences} + """