wip: script to fetch from ncbi entrez

nextstrain · May 23, 2024 · 98453c3 · 98453c3
1 parent bf089ac
commit 98453c3
Showing 1 changed file with 62 additions and 0 deletions.
diff --git a/ingest/scripts/fetch_from_ncbi_entrez_with_accessions b/ingest/scripts/fetch_from_ncbi_entrez_with_accessions
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+"""
+Fetch metadata from NCBI Entrez for the provided file of GenBank accessions.
+"""
+import argparse
+from typing import Iterator
+from Bio import SeqIO, Entrez, SeqRecord
+
+# To use the efetch API, the docs indicate only around 10,000 records should be fetched per request
+# https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
+# However, in my testing with HepB, the max records returned was 9,999
+#   - Jover, 16 August 2023
+BATCH_SIZE = 9999
+Entrez.email = "[email protected]"
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--genbank-accessions", required=True,
+        help="File of GenBank accessions to fetch from Entrez, with one accession per line.")
+    parser.add_argument("--output", required=True, help="TSV output file.")
+    return parser.parse_args()
+
+
+def read_accessions(accessions_file: str) -> str:
+    """
+    Reads each line from the provided *accessions_file* and
+    returns it them as a single comma-delimited string.
+
+    Expects the first line of the file to be a header and skips it.
+    """
+    accessions = []
+    with open(accessions_file) as file:
+        # Skip header line
+        next(file)
+        for line in file:
+            accessions.append(line.rstrip())
+
+    return ",".join(accessions)
+
+
+def fetch_from_efetch(accessions: str) -> Iterator[SeqRecord.SeqRecord]:
+    """
+    Fetches the GenBank records for the provided *accessions* from
+    Entrez EFetch and returns an iterator of individual SeqRecords.
+    """
+    handle = Entrez.efetch(
+        db='nucleotide',
+        id=accessions,
+        rettype="gb",
+        retmode="text",
+    )
+
+    return SeqIO.parse(handle, "genbank")
+
+
+if __name__ == '__main__':
+    args = parse_args()
+
+    accessions = read_accessions(args.genbank_accessions)
+    for record in fetch_from_efetch(accessions):
+        print(record)