WIP: Ingest from NCBI Datasets + Entrez #39

Closed
wants to merge 7 commits into from
6 changes: 6 additions & 0 deletions ingest/Snakefile
@@ -20,3 +20,9 @@ rule upload_all:

include: "rules/upload_from_fauna.smk"
include: "rules/ingest_andersen_lab.smk"

# Allow users to import custom rules provided via the config.
if "custom_rules" in config:
    for rule_file in config["custom_rules"]:
        include: rule_file
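The conditional include above can be sketched in plain Python, assuming `config` behaves like the dict Snakemake builds from `--configfile`/`--config` (the file path is the one from this PR's default config):

```python
# Sketch of the conditional-include logic, with a plain dict standing in for
# Snakemake's `config` object.
config = {"custom_rules": ["build-configs/ncbi/Snakefile"]}

included = []
if "custom_rules" in config:
    for rule_file in config["custom_rules"]:
        # The real Snakefile does `include: rule_file` here; we just record it.
        included.append(rule_file)
```

If `custom_rules` is absent from the config, the loop never runs and the core workflow is unchanged.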
42 changes: 42 additions & 0 deletions ingest/build-configs/ncbi/README.md
@@ -0,0 +1,42 @@
# NCBI Ingest

This workflow ingests public data from NCBI, specifically for the H5N1 outbreak
in the U.S. in 2024.

## Workflow Usage

The workflow can be run from the top-level pathogen repo directory:
```
nextstrain build ingest ingest_ncbi --configfile build-configs/ncbi/defaults/config.yaml
```

Alternatively, the workflow can be run from within the ingest directory:
```
cd ingest
nextstrain build . ingest_ncbi --configfile build-configs/ncbi/defaults/config.yaml
```

This produces the default outputs of the NCBI ingest workflow:

- metadata = ncbi/results/metadata.tsv
- sequences = ncbi/results/sequences.fasta


## Defaults

The defaults directory contains all of the default configurations for the NCBI ingest workflow.

[defaults/config.yaml](defaults/config.yaml) contains all of the default configuration parameters
used for the ingest workflow. Use Snakemake's `--configfile`/`--config`
options to override these default values.

## Snakefile and rules

The rules directory contains separate Snakefiles (`*.smk`) as modules of the core ingest workflow.
The modules of the workflow are in separate files to keep the main ingest [Snakefile](Snakefile) succinct and organized.

The `workdir` is hardcoded to be the ingest directory so all filepaths for
inputs/outputs should be relative to the ingest directory.

Modules are all [included](https://snakemake.readthedocs.io/en/stable/snakefiles/modularization.html#includes)
in the main Snakefile in the order that they are expected to run.
15 changes: 15 additions & 0 deletions ingest/build-configs/ncbi/Snakefile
@@ -0,0 +1,15 @@
"""
This is the NCBI ingest Snakefile that orchestrates the NCBI ingest
workflow and defines its default outputs.
"""
# Use default configuration values. Override with Snakemake's --configfile/--config options.
configfile: "build-configs/ncbi/defaults/config.yaml"

rule ingest_ncbi:
    input:
        "ncbi/results/sequences.fasta",
        "ncbi/results/metadata.tsv",

# Include file paths are relative to this Snakefile
include: "rules/fetch_from_ncbi.smk"
include: "rules/curate.smk"
106 changes: 106 additions & 0 deletions ingest/build-configs/ncbi/bin/fetch_from_ncbi_entrez_with_accessions
@@ -0,0 +1,106 @@
#!/usr/bin/env python3
"""
Fetch metadata from NCBI Entrez for the provided file of GenBank accessions.
"""
import argparse
from typing import Iterator, List
from Bio import SeqIO, Entrez, SeqRecord
from augur.io.metadata import write_records_to_tsv

# To use the efetch API, the docs indicate only around 10,000 records should be fetched per request
# https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
# However, in my testing with HepB, the max records returned was 9,999
# - Jover, 16 August 2023
BATCH_SIZE = 9999
Entrez.email = "[email protected]"


def parse_args():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--genbank-accessions", required=True,
        help="File of GenBank accessions to fetch from Entrez, with one accession per line.")
    parser.add_argument("--source-fields", nargs="+",
        default=["strain", "serotype", "segment"],
        help="Qualifier fields to parse from the source feature of GenBank records")
    parser.add_argument("--id-column-name", default="accession_version",
        help="Column name to use for the GenBank record id field.")
    parser.add_argument("--output", required=True,
        help="TSV output file. Accepts '-' to output TSV to stdout.")
    return parser.parse_args()


def read_accessions(accessions_file: str) -> List[str]:
    """
    Reads each line from the provided *accessions_file* and
    returns the accessions as a list of strings.

    Expects the first line of the file to be a header and skips it.
    """
    accessions = []
    with open(accessions_file) as file:
        # Skip header line
        next(file)
        for line in file:
            accessions.append(line.rstrip())

    return accessions


def fetch_from_efetch(accessions: List[str]) -> Iterator[Iterator[SeqRecord.SeqRecord]]:
    """
    Fetches the GenBank records for the provided *accessions* from
    Entrez EFetch in batches of `BATCH_SIZE` and yields an iterator of
    individual SeqRecords.
    """
    for i in range(0, len(accessions), BATCH_SIZE):
        batched_accessions = accessions[i:i + BATCH_SIZE]
        handle = Entrez.efetch(
            db='nucleotide',
            id=batched_accessions,
            rettype="gb",
            retmode="text",
        )
        yield SeqIO.parse(handle, "genbank")


def parse_genbank_records(record_batches: Iterator[Iterator[SeqRecord.SeqRecord]],
                          id_field_name: str,
                          source_fields: List[str]) -> Iterator[dict]:
    """
    Parse the provided GenBank *record_batches* to pull out the
    requested *source_fields* from the 'source' feature's qualifiers.

    Yields the parsed records as dicts with the GenBank accession (versioned)
    and the requested *source_fields*.
    """
    for batch in record_batches:
        for record in batch:
            source_features = [
                feature for feature in record.features
                if feature.type == 'source'
            ]

            assert len(source_features) == 1, \
                "GenBank records should have one 'source' feature"

            source_qualifiers = source_features[0].qualifiers
            requested_record = {}
            requested_record[id_field_name] = record.id
            requested_record.update({
                # Qualifier fields are returned as lists even though they only have one value
                field: source_qualifiers.get(field, [""])[0]
                for field in source_fields
            })

            yield requested_record


if __name__ == '__main__':
    args = parse_args()

    accessions = read_accessions(args.genbank_accessions)
    fetched_records = fetch_from_efetch(accessions)
    parsed_records = parse_genbank_records(fetched_records,
                                           args.id_column_name,
                                           args.source_fields)
    write_records_to_tsv(parsed_records, args.output)
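The source-qualifier handling in `parse_genbank_records` can be sketched with plain dicts standing in for the Biopython objects (the accession and strain values below are hypothetical):

```python
# Qualifier values come back from Biopython as single-element lists, so the
# parser takes the first element and falls back to "" for missing fields.
source_qualifiers = {
    "strain": ["A/example/2024"],   # hypothetical values
    "serotype": ["H5N1"],
    # note: no "segment" qualifier on this record
}
source_fields = ["strain", "serotype", "segment"]

requested_record = {"accession_version": "XX000001.1"}  # hypothetical accession
requested_record.update({
    field: source_qualifiers.get(field, [""])[0]
    for field in source_fields
})
```

Missing qualifiers therefore surface as empty strings in the output TSV rather than raising an error.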
6 changes: 6 additions & 0 deletions ingest/build-configs/ncbi/defaults/annotations.tsv
@@ -0,0 +1,6 @@
# Manually curated annotations TSV file
# The TSV should not have a header and should have exactly three columns:
# id to match existing metadata, field name, and field value
# If there are multiple annotations for the same id and field, then the last value is used
# Lines starting with '#' are treated as comments
# Any '#' after the field value is treated as a comment.
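A minimal reader for this three-column format might look like the sketch below (the rows are hypothetical, and the real pipeline's reader may differ):

```python
import csv
import io

# Hypothetical annotation rows illustrating the format described above.
annotations_tsv = io.StringIO(
    "# comment line\n"
    "XX000001.1\tserotype\tH5N1\n"
    "XX000001.1\tserotype\tH5N2\t# later value overrides the earlier one\n"
)

annotations = {}
for row in csv.reader(annotations_tsv, delimiter="\t"):
    # Skip blank lines and comment lines
    if not row or row[0].startswith("#"):
        continue
    record_id, field, value = row[0], row[1], row[2]
    annotations[(record_id, field)] = value  # last value wins
```

Reading into a dict keyed on `(id, field)` makes the "last value is used" rule fall out naturally from dict assignment.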
134 changes: 134 additions & 0 deletions ingest/build-configs/ncbi/defaults/config.yaml
@@ -0,0 +1,134 @@
# Config for the NCBI ingest workflow

custom_rules:
- "build-configs/ncbi/Snakefile"

# NCBI taxon id for "Influenza A virus"; records are then further
# labeled with a `serotype` (e.g. H5N1)
ncbi_taxon_id: "11320"
# Subset to get data for the H5N1 outbreak in the U.S. in 2024.
# Also needed because pulling data for 11320 results in an
# invalid zip archive error when trying to pull the full dataset.
# <https://github.com/ncbi/datasets/issues/360>
ncbi_released_after: "04/01/2024"
ncbi_geo_location: "North America"
ncbi_join_field: "accession_version"

# The list of NCBI Datasets fields to include from NCBI Datasets output
# These need to be the "mnemonics" of the NCBI Datasets fields, see docs for full list of fields
# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
# Note: the "accession" field MUST be provided to match with the sequences
ncbi_datasets_fields:
- accession
- sourcedb
- sra-accs
- isolate-lineage
- geo-region
- geo-location
- isolate-collection-date
- release-date
- update-date
- length
- host-name
- isolate-lineage-source
- biosample-acc
- submitter-names
- submitter-affiliation
- submitter-country

# Fields to parse from the GenBank records that are fetched with Entrez
entrez_source_fields:
- strain
- serotype
- segment

# Config parameters related to the curate pipeline
curate:
  # URL pointing to public generalized geolocation rules
  # For the Nextstrain team, this is currently
  # "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv"
  geolocation_rules_url: "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv"
  # The path to the local geolocation rules within the pathogen repo
  # The path should be relative to the ingest directory.
  local_geolocation_rules: "defaults/geolocation_rules.tsv"
  # Map of field names to change, where the key is the original field name and the value is the new field name
  # The original field names should match the ncbi_datasets_fields provided above.
  # This is the first step in the pipeline, so any references to field names in the configs below should use the new field names
  field_map:
    accession: accession
    accession_version: accession_version
    sourcedb: database
    sra-accs: sra_accessions
    strain: strain
    isolate-lineage: isolate
    serotype: serotype
    segment: segment
    geo-region: region
    geo-location: location
    isolate-collection-date: date
    release-date: date_released
    update-date: date_updated
    length: length
    host-name: host
    isolate-lineage-source: sample_type
    biosample-acc: biosample_accessions
    submitter-names: authors
    submitter-affiliation: institution
    submitter-country: submitter_country
  # Standardized strain name regex
  # Currently accepts any characters because we do not have a clear standard for strain names across pathogens
  strain_regex: "^.+$"
  # Backup strain name fields to use if "strain" doesn't match the regex above
  strain_backup_fields: ["isolate", "accession"]
  # List of date fields to standardize to ISO format YYYY-MM-DD
  date_fields: ["date", "date_released", "date_updated"]
  # List of expected date formats that are present in the date fields provided above
  # These date formats should use directives expected by datetime
  # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
  expected_date_formats: ["%Y", "%Y-%m", "%Y-%m-%d", "%Y-%m-%dT%H:%M:%SZ"]
  titlecase:
    # List of string fields to titlecase
    fields: ["region", "country", "division", "location"]
    # List of abbreviations that should not be cast to titlecase; they are kept uppercase
    abbreviations: ["USA"]
    # Articles that should not be cast to titlecase
    articles: [
      "and", "d", "de", "del", "des", "di", "do", "en", "l", "la", "las", "le",
      "los", "nad", "of", "op", "sur", "the", "y"
    ]
  # Metadata field that contains the list of authors associated with the sequence
  authors_field: "authors"
  # Default value to use if the authors field is empty
  authors_default_value: "?"
  # Name to use for the generated abbreviated authors field
  abbr_authors_field: "abbr_authors"
  # Path to the manual annotations file
  # The path should be relative to the ingest directory
  annotations: "build-configs/ncbi/defaults/annotations.tsv"
  # The ID field in the metadata to use to merge the manual annotations
  annotations_id: "accession"
  # The ID field in the metadata to use as the sequence id in the output FASTA file
  output_id_field: "accession"
  # The field in the NDJSON record that contains the actual genomic sequence
  output_sequence_field: "sequence"
  # The list of metadata columns to keep in the final output of the curation pipeline.
  metadata_columns: [
    "accession",
    "accession_version",
    "strain",
    "serotype",
    "segment",
    "date",
    "region",
    "country",
    "division",
    "location",
    "length",
    "host",
    "date_released",
    "date_updated",
    "sra_accessions",
    "authors",
    "abbr_authors",
    "institution",
  ]
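Standardizing a date against the `expected_date_formats` list can be sketched as a first-match loop over `datetime.strptime` (note this is an assumption about the approach: the sketch pads incomplete dates with defaults like `-01`, whereas the real curate pipeline may mask missing parts instead):

```python
# Try each expected format in turn and emit ISO YYYY-MM-DD on the first match.
from datetime import datetime

expected_date_formats = ["%Y", "%Y-%m", "%Y-%m-%d", "%Y-%m-%dT%H:%M:%SZ"]

def standardize(value: str) -> str:
    for fmt in expected_date_formats:
        try:
            return datetime.strptime(value, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue  # this format didn't match; try the next one
    raise ValueError(f"Unrecognized date format: {value!r}")
```

For example, `standardize("2024-04-01T12:00:00Z")` yields `"2024-04-01"`, since only the timestamp format matches the full string.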