Skip to content

Commit

Permalink
Merge pull request #114 from nextstrain/improve-ingest-andersen
Browse files Browse the repository at this point in the history
Improve ingest for Andersen lab/SRA sequences
  • Loading branch information
joverlee521 authored Dec 30, 2024
2 parents 0780396 + 058b27d commit fe37067
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 9 deletions.
21 changes: 17 additions & 4 deletions ingest/build-configs/ncbi/bin/curate-andersen-lab-data
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@ Parses NDJSON record from stdin and outputs new record to stdout.
"""
import copy
import json
import re
from datetime import datetime
from enum import Enum
from sys import stdin, stdout, stderr
from augur.curate.parse_genbank_location import parse_location


NEXTSTRAIN_RECORD = {
Expand Down Expand Up @@ -45,8 +47,13 @@ def create_new_record(anderson_record: dict) -> dict:
new_record['sra_accessions'] = anderson_record['Run']
new_record['region'] = anderson_record['geo_loc_name_country_continent']
new_record['country'] = anderson_record['geo_loc_name_country']
new_record['division'] = anderson_record.get('US State', '')
new_record['location'] = anderson_record.get('US State', '')
# Parse the geolocation as the GenBank format `country:division,location`
new_record = parse_location(new_record, 'country')
# Try to fill `US State` if division and location were not parsed
if new_record['division'] == '':
new_record['division'] = anderson_record.get('US State', '')
if new_record['location'] == '':
new_record['location'] = anderson_record.get('US State', '')
new_record['host'] = anderson_record['Host']
new_record['date_released'] = anderson_record['ReleaseDate']

Expand Down Expand Up @@ -85,12 +92,18 @@ def construct_strain_name(record: dict, sample_id: str) -> str:
to include host, country, and year.
Removes all spaces in the constructed strain name because they are not
allowed in the downstream phylogenetic workflow.
allowed in the downstream phylogenetic workflow. Also replaces invalid
characters with `_` to match iqtree¹ so augur tree will not modify strain
names and cause a mismatch between the tree and the alignment FASTA.²
¹ <https://github.com/iqtree/iqtree2/blob/74da454bbd98d6ecb8cb955975a50de59785fbde/utils/tools.cpp#L607>
² <https://github.com/nextstrain/avian-flu/issues/113>
"""
host = record['host']
country = record['country']
year = parse_year(record['date'])
return f"A/{host}/{country}/{sample_id}/{year}".replace(" ", "")
strain = f"A/{host}/{country}/{sample_id}/{year}".replace(" ", "")
return re.sub(r'[^\w\_\-\.\|\/]', '_', strain)


def parse_year(date_string: str) -> str:
Expand Down
3 changes: 3 additions & 0 deletions ingest/build-configs/ncbi/defaults/annotations.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,6 @@ A/ALPACA/USA/24-014328-007/2024 division Idaho # Based on news reports https://w
A/Alpaca/USA/24-015080-001/2024 division Idaho # Based on news reports https://www.cidrap.umn.edu/avian-influenza-bird-flu/alpacas-infected-h5n1-avian-flu-idaho
A/House-Mouse/USA/24-014780-002/2024 division New Mexico # Based on https://wahis.woah.org/#/in-review/4451?reportId=167771&fromPage=event-dashboard-url
A/House-Mouse/USA/24-014782-003/2024 division New Mexico # Based on https://wahis.woah.org/#/in-review/4451?reportId=167771&fromPage=event-dashboard-url
A/PETFOOD/USA/24-037325-011/2024 host avian # This sequence is likely from cat food, which is turkey https://apps.oregon.gov/oregon-newsroom/OR/ODA/Posts/Post/morasch-meats-voluntary-recall-feline-raw-pet-food-hpai
A/PETFOOD/USA/24-037325-012/2024 host avian # This sequence is likely from cat food, which is turkey https://apps.oregon.gov/oregon-newsroom/OR/ODA/Posts/Post/morasch-meats-voluntary-recall-feline-raw-pet-food-hpai
A/PETFOOD/USA/24-037325-013/2024 host avian # This sequence is likely from cat food, which is turkey https://apps.oregon.gov/oregon-newsroom/OR/ODA/Posts/Post/morasch-meats-voluntary-recall-feline-raw-pet-food-hpai
11 changes: 6 additions & 5 deletions ingest/build-configs/ncbi/defaults/host-map.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,10 @@ Cygnus olor Avian
Dairy cattle Cattle
domestic cat Nonhuman Mammal
domestic-cat Nonhuman Mammal
Dromaius novaehollandiae Avian
duck Avian
EMU Avian
Emu Avian
environment Environment
EURASIAN COLLARED DOVE Avian
Falco peregrinus Avian
Expand Down Expand Up @@ -78,9 +81,11 @@ Quiscalus quiscula Avian
raccoon Nonhuman Mammal
red fox Nonhuman Mammal
red tailed hawk Avian
Rock Pigeon Avian
skunk Nonhuman Mammal
snow goose Avian
Streptopelia decaocto Avian
tiger Nonhuman Mammal
Turdus merula Avian
turkey Avian
turkey vulture Avian
Expand All @@ -89,9 +94,5 @@ Vulpes vulpes Nonhuman Mammal
western gull Avian
western kingbird Avian
western sandpiper Avian
Wild-Bird Avian
EMU Avian
Emu Avian
Dromaius novaehollandiae Avian
White-winged Dove Avian
Rock Pigeon Avian
Wild-Bird Avian
2 changes: 2 additions & 0 deletions ingest/defaults/geolocation_rules.tsv
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
/USA/*/* North America/USA/*/*
# Better for an unknown division to be listed as unknown than as "USA"
North America/USA// North America/USA/?/?
North America/USA/USA/ North America/USA/?/?
Expand All @@ -6,5 +7,6 @@ North America/USA/CA/CA North America/USA/California/California
North America/USA/KS/KS North America/USA/Kansas/Kansas
North America/USA/MI/MI North America/USA/Michigan/Michigan
North America/USA/NM/NM North America/USA/New Mexico/New Mexico
North America/USA/OR/ North America/USA/Oregon/Oregon
North America/USA/TX/TX North America/USA/Texas/Texas
North America/USA/WY/WY North America/USA/Wyoming/Wyoming

0 comments on commit fe37067

Please sign in to comment.