-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
wip: script to fetch from ncbi entrez
- Loading branch information
1 parent
bf089ac
commit 98453c3
Showing
1 changed file
with
62 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
#!/usr/bin/env python3 | ||
""" | ||
Fetch metadata from NCBI Entrez for the provided file of GenBank accessions. | ||
""" | ||
import argparse | ||
from typing import Iterator | ||
from Bio import SeqIO, Entrez, SeqRecord | ||
|
||
# To use the efetch API, the docs indicate only around 10,000 records should be fetched per request | ||
# https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch | ||
# However, in my testing with HepB, the max records returned was 9,999 | ||
# - Jover, 16 August 2023 | ||
BATCH_SIZE = 9999 | ||
Entrez.email = "[email protected]" | ||
|
||
|
||
def parse_args(): | ||
parser = argparse.ArgumentParser(description=__doc__) | ||
parser.add_argument("--genbank-accessions", required=True, | ||
help="File of GenBank accessions to fetch from Entrez, with one accession per line.") | ||
parser.add_argument("--output", required=True, help="TSV output file.") | ||
return parser.parse_args() | ||
|
||
|
||
def read_accessions(accessions_file: str) -> str: | ||
""" | ||
Reads each line from the provided *accessions_file* and | ||
returns it them as a single comma-delimited string. | ||
Expects the first line of the file to be a header and skips it. | ||
""" | ||
accessions = [] | ||
with open(accessions_file) as file: | ||
# Skip header line | ||
next(file) | ||
for line in file: | ||
accessions.append(line.rstrip()) | ||
|
||
return ",".join(accessions) | ||
|
||
|
||
def fetch_from_efetch(accessions: str) -> Iterator[SeqRecord.SeqRecord]: | ||
""" | ||
Fetches the GenBank records for the provided *accessions* from | ||
Entrez EFetch and returns an iterator of individual SeqRecords. | ||
""" | ||
handle = Entrez.efetch( | ||
db='nucleotide', | ||
id=accessions, | ||
rettype="gb", | ||
retmode="text", | ||
) | ||
|
||
return SeqIO.parse(handle, "genbank") | ||
|
||
|
||
if __name__ == '__main__': | ||
args = parse_args() | ||
|
||
accessions = read_accessions(args.genbank_accessions) | ||
for record in fetch_from_efetch(accessions): | ||
print(record) |