Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve ingest for Andersen lab/SRA sequences #114

Merged
merged 5 commits into from
Dec 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 17 additions & 4 deletions ingest/build-configs/ncbi/bin/curate-andersen-lab-data
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@ Parses NDJSON record from stdin and outputs new record to stdout.
"""
import copy
import json
import re
from datetime import datetime
from enum import Enum
from sys import stdin, stdout, stderr
from augur.curate.parse_genbank_location import parse_location


NEXTSTRAIN_RECORD = {
Expand Down Expand Up @@ -45,8 +47,13 @@ def create_new_record(anderson_record: dict) -> dict:
new_record['sra_accessions'] = anderson_record['Run']
new_record['region'] = anderson_record['geo_loc_name_country_continent']
new_record['country'] = anderson_record['geo_loc_name_country']
new_record['division'] = anderson_record.get('US State', '')
new_record['location'] = anderson_record.get('US State', '')
# Parse the geolocation as the GenBank format `country:division,location`
new_record = parse_location(new_record, 'country')
# Fall back to `US State` for division and/or location when they were not parsed
if new_record['division'] == '':
new_record['division'] = anderson_record.get('US State', '')
if new_record['location'] == '':
new_record['location'] = anderson_record.get('US State', '')
new_record['host'] = anderson_record['Host']
new_record['date_released'] = anderson_record['ReleaseDate']

Expand Down Expand Up @@ -85,12 +92,18 @@ def construct_strain_name(record: dict, sample_id: str) -> str:
to include host, country, and year.

Removes all spaces in the constructed strain name because they are not
allowed in the downstream phylogenetic workflow.
allowed in the downstream phylogenetic workflow. Also replaces invalid
characters with `_` to match iqtree¹ so augur tree will not modify strain
names and cause a mismatch between the tree and the alignment FASTA.²

¹ <https://github.com/iqtree/iqtree2/blob/74da454bbd98d6ecb8cb955975a50de59785fbde/utils/tools.cpp#L607>
² <https://github.com/nextstrain/avian-flu/issues/113>
"""
host = record['host']
country = record['country']
year = parse_year(record['date'])
return f"A/{host}/{country}/{sample_id}/{year}".replace(" ", "")
strain = f"A/{host}/{country}/{sample_id}/{year}".replace(" ", "")
return re.sub(r'[^\w\_\-\.\|\/]', '_', strain)


def parse_year(date_string: str) -> str:
Expand Down
3 changes: 3 additions & 0 deletions ingest/build-configs/ncbi/defaults/annotations.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,6 @@ A/ALPACA/USA/24-014328-007/2024 division Idaho # Based on news reports https://w
A/Alpaca/USA/24-015080-001/2024 division Idaho # Based on news reports https://www.cidrap.umn.edu/avian-influenza-bird-flu/alpacas-infected-h5n1-avian-flu-idaho
A/House-Mouse/USA/24-014780-002/2024 division New Mexico # Based on https://wahis.woah.org/#/in-review/4451?reportId=167771&fromPage=event-dashboard-url
A/House-Mouse/USA/24-014782-003/2024 division New Mexico # Based on https://wahis.woah.org/#/in-review/4451?reportId=167771&fromPage=event-dashboard-url
A/PETFOOD/USA/24-037325-011/2024 host avian # This sequence is likely from cat food, which is turkey https://apps.oregon.gov/oregon-newsroom/OR/ODA/Posts/Post/morasch-meats-voluntary-recall-feline-raw-pet-food-hpai
A/PETFOOD/USA/24-037325-012/2024 host avian # This sequence is likely from cat food, which is turkey https://apps.oregon.gov/oregon-newsroom/OR/ODA/Posts/Post/morasch-meats-voluntary-recall-feline-raw-pet-food-hpai
A/PETFOOD/USA/24-037325-013/2024 host avian # This sequence is likely from cat food, which is turkey https://apps.oregon.gov/oregon-newsroom/OR/ODA/Posts/Post/morasch-meats-voluntary-recall-feline-raw-pet-food-hpai
11 changes: 6 additions & 5 deletions ingest/build-configs/ncbi/defaults/host-map.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,10 @@ Cygnus olor Avian
Dairy cattle Cattle
domestic cat Nonhuman Mammal
domestic-cat Nonhuman Mammal
Dromaius novaehollandiae Avian
duck Avian
EMU Avian
Emu Avian
environment Environment
EURASIAN COLLARED DOVE Avian
Falco peregrinus Avian
Expand Down Expand Up @@ -78,9 +81,11 @@ Quiscalus quiscula Avian
raccoon Nonhuman Mammal
red fox Nonhuman Mammal
red tailed hawk Avian
Rock Pigeon Avian
skunk Nonhuman Mammal
snow goose Avian
Streptopelia decaocto Avian
tiger Nonhuman Mammal
Turdus merula Avian
turkey Avian
turkey vulture Avian
Expand All @@ -89,9 +94,5 @@ Vulpes vulpes Nonhuman Mammal
western gull Avian
western kingbird Avian
western sandpiper Avian
Wild-Bird Avian
EMU Avian
Emu Avian
Dromaius novaehollandiae Avian
White-winged Dove Avian
Rock Pigeon Avian
Wild-Bird Avian
2 changes: 2 additions & 0 deletions ingest/defaults/geolocation_rules.tsv
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
/USA/*/* North America/USA/*/*
# Better for an unknown division to be listed as unknown than as "USA"
North America/USA// North America/USA/?/?
North America/USA/USA/ North America/USA/?/?
Expand All @@ -6,5 +7,6 @@ North America/USA/CA/CA North America/USA/California/California
North America/USA/KS/KS North America/USA/Kansas/Kansas
North America/USA/MI/MI North America/USA/Michigan/Michigan
North America/USA/NM/NM North America/USA/New Mexico/New Mexico
North America/USA/OR/ North America/USA/Oregon/Oregon
North America/USA/TX/TX North America/USA/Texas/Texas
North America/USA/WY/WY North America/USA/Wyoming/Wyoming
Loading