diff --git a/ingest/scripts/fetch_from_ncbi_entrez_with_accessions b/ingest/scripts/fetch_from_ncbi_entrez_with_accessions new file mode 100755 index 0000000..6768a6e --- /dev/null +++ b/ingest/scripts/fetch_from_ncbi_entrez_with_accessions @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 +""" +Fetch metadata from NCBI Entrez for the provided file of GenBank accessions. +""" +import argparse +from typing import Iterator +from Bio import SeqIO, Entrez, SeqRecord + +# To use the efetch API, the docs indicate only around 10,000 records should be fetched per request +# https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch +# However, in my testing with HepB, the max records returned was 9,999 +# - Jover, 16 August 2023 +BATCH_SIZE = 9999 +Entrez.email = "hello@nextstrain.org" + + +def parse_args(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--genbank-accessions", required=True, + help="File of GenBank accessions to fetch from Entrez, with one accession per line.") + parser.add_argument("--output", required=True, help="TSV output file.") + return parser.parse_args() + + +def read_accessions(accessions_file: str) -> str: + """ + Reads each line from the provided *accessions_file* and + returns it them as a single comma-delimited string. + + Expects the first line of the file to be a header and skips it. + """ + accessions = [] + with open(accessions_file) as file: + # Skip header line + next(file) + for line in file: + accessions.append(line.rstrip()) + + return ",".join(accessions) + + +def fetch_from_efetch(accessions: str) -> Iterator[SeqRecord.SeqRecord]: + """ + Fetches the GenBank records for the provided *accessions* from + Entrez EFetch and returns an iterator of individual SeqRecords. + """ + handle = Entrez.efetch( + db='nucleotide', + id=accessions, + rettype="gb", + retmode="text", + ) + + return SeqIO.parse(handle, "genbank") + + +if __name__ == '__main__': + args = parse_args() + + accessions = read_accessions(args.genbank_accessions) + for record in fetch_from_efetch(accessions): + print(record)