Skip to content

Commit

Permalink
wip: script to fetch from ncbi entrez
Browse files Browse the repository at this point in the history
  • Loading branch information
joverlee521 committed May 23, 2024
1 parent bf089ac commit 98453c3
Showing 1 changed file with 62 additions and 0 deletions.
62 changes: 62 additions & 0 deletions ingest/scripts/fetch_from_ncbi_entrez_with_accessions
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/usr/bin/env python3
"""
Fetch metadata from NCBI Entrez for the provided file of GenBank accessions.
"""
import argparse
from typing import Iterator
from Bio import SeqIO, Entrez, SeqRecord

# To use the efetch API, the docs indicate only around 10,000 records should be fetched per request
# https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
# However, in my testing with HepB, the max records returned was 9,999
# - Jover, 16 August 2023
BATCH_SIZE = 9999
Entrez.email = "[email protected]"


def parse_args():
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--genbank-accessions", required=True,
help="File of GenBank accessions to fetch from Entrez, with one accession per line.")
parser.add_argument("--output", required=True, help="TSV output file.")
return parser.parse_args()


def read_accessions(accessions_file: str) -> str:
"""
Reads each line from the provided *accessions_file* and
returns it them as a single comma-delimited string.
Expects the first line of the file to be a header and skips it.
"""
accessions = []
with open(accessions_file) as file:
# Skip header line
next(file)
for line in file:
accessions.append(line.rstrip())

return ",".join(accessions)


def fetch_from_efetch(accessions: str) -> Iterator[SeqRecord.SeqRecord]:
"""
Fetches the GenBank records for the provided *accessions* from
Entrez EFetch and returns an iterator of individual SeqRecords.
"""
handle = Entrez.efetch(
db='nucleotide',
id=accessions,
rettype="gb",
retmode="text",
)

return SeqIO.parse(handle, "genbank")


if __name__ == '__main__':
args = parse_args()

accessions = read_accessions(args.genbank_accessions)
for record in fetch_from_efetch(accessions):
print(record)

0 comments on commit 98453c3

Please sign in to comment.