diff --git a/ingest/build-configs/ncbi/bin/curate-andersen-lab-data b/ingest/build-configs/ncbi/bin/curate-andersen-lab-data index d309a54..d847e09 100755 --- a/ingest/build-configs/ncbi/bin/curate-andersen-lab-data +++ b/ingest/build-configs/ncbi/bin/curate-andersen-lab-data @@ -7,9 +7,11 @@ Parses NDJSON record from stdin and outputs new record to stdout. """ import copy import json +import re from datetime import datetime from enum import Enum from sys import stdin, stdout, stderr +from augur.curate.parse_genbank_location import parse_location NEXTSTRAIN_RECORD = { @@ -45,8 +47,13 @@ def create_new_record(anderson_record: dict) -> dict: new_record['sra_accessions'] = anderson_record['Run'] new_record['region'] = anderson_record['geo_loc_name_country_continent'] new_record['country'] = anderson_record['geo_loc_name_country'] - new_record['division'] = anderson_record.get('US State', '') - new_record['location'] = anderson_record.get('US State', '') + # Parse the geolocation as the GenBank format `country:division,location` + new_record = parse_location(new_record, 'country') + # Try to fill `US State` if division and location were not parsed + if new_record['division'] == '': + new_record['division'] = anderson_record.get('US State', '') + if new_record['location'] == '': + new_record['location'] = anderson_record.get('US State', '') new_record['host'] = anderson_record['Host'] new_record['date_released'] = anderson_record['ReleaseDate'] @@ -85,12 +92,18 @@ def construct_strain_name(record: dict, sample_id: str) -> str: to include host, country, and year. Removes all spaces in the constructed strain name because they are not - allowed in the downstream phylogenetic workflow. + allowed in the downstream phylogenetic workflow. 
Also replaces invalid + characters with `_` to match iqtree¹ so augur tree will not modify strain + names and cause a mismatch between the tree and the alignment FASTA.² + + ¹ + ² """ host = record['host'] country = record['country'] year = parse_year(record['date']) - return f"A/{host}/{country}/{sample_id}/{year}".replace(" ", "") + strain = f"A/{host}/{country}/{sample_id}/{year}".replace(" ", "") + return re.sub(r'[^\w\_\-\.\|\/]', '_', strain) def parse_year(date_string: str) -> str: diff --git a/ingest/build-configs/ncbi/defaults/annotations.tsv b/ingest/build-configs/ncbi/defaults/annotations.tsv index 7595c51..14a8e1e 100644 --- a/ingest/build-configs/ncbi/defaults/annotations.tsv +++ b/ingest/build-configs/ncbi/defaults/annotations.tsv @@ -10,3 +10,6 @@ A/ALPACA/USA/24-014328-007/2024 division Idaho # Based on news reports https://w A/Alpaca/USA/24-015080-001/2024 division Idaho # Based on news reports https://www.cidrap.umn.edu/avian-influenza-bird-flu/alpacas-infected-h5n1-avian-flu-idaho A/House-Mouse/USA/24-014780-002/2024 division New Mexico # Based on https://wahis.woah.org/#/in-review/4451?reportId=167771&fromPage=event-dashboard-url A/House-Mouse/USA/24-014782-003/2024 division New Mexico # Based on https://wahis.woah.org/#/in-review/4451?reportId=167771&fromPage=event-dashboard-url +A/PETFOOD/USA/24-037325-011/2024 host avian # This sequence is likely from cat food, which is turkey https://apps.oregon.gov/oregon-newsroom/OR/ODA/Posts/Post/morasch-meats-voluntary-recall-feline-raw-pet-food-hpai +A/PETFOOD/USA/24-037325-012/2024 host avian # This sequence is likely from cat food, which is turkey https://apps.oregon.gov/oregon-newsroom/OR/ODA/Posts/Post/morasch-meats-voluntary-recall-feline-raw-pet-food-hpai +A/PETFOOD/USA/24-037325-013/2024 host avian # This sequence is likely from cat food, which is turkey https://apps.oregon.gov/oregon-newsroom/OR/ODA/Posts/Post/morasch-meats-voluntary-recall-feline-raw-pet-food-hpai diff --git 
a/ingest/build-configs/ncbi/defaults/host-map.tsv b/ingest/build-configs/ncbi/defaults/host-map.tsv index bf32eaa..766c0b6 100644 --- a/ingest/build-configs/ncbi/defaults/host-map.tsv +++ b/ingest/build-configs/ncbi/defaults/host-map.tsv @@ -37,7 +37,10 @@ Cygnus olor Avian Dairy cattle Cattle domestic cat Nonhuman Mammal domestic-cat Nonhuman Mammal +Dromaius novaehollandiae Avian duck Avian +EMU Avian +Emu Avian environment Environment EURASIAN COLLARED DOVE Avian Falco peregrinus Avian @@ -78,9 +81,11 @@ Quiscalus quiscula Avian raccoon Nonhuman Mammal red fox Nonhuman Mammal red tailed hawk Avian +Rock Pigeon Avian skunk Nonhuman Mammal snow goose Avian Streptopelia decaocto Avian +tiger Nonhuman Mammal Turdus merula Avian turkey Avian turkey vulture Avian @@ -89,9 +94,5 @@ Vulpes vulpes Nonhuman Mammal western gull Avian western kingbird Avian western sandpiper Avian -Wild-Bird Avian -EMU Avian -Emu Avian -Dromaius novaehollandiae Avian White-winged Dove Avian -Rock Pigeon Avian +Wild-Bird Avian diff --git a/ingest/defaults/geolocation_rules.tsv b/ingest/defaults/geolocation_rules.tsv index 23e2e7a..b7ebfed 100644 --- a/ingest/defaults/geolocation_rules.tsv +++ b/ingest/defaults/geolocation_rules.tsv @@ -1,3 +1,4 @@ +/USA/*/* North America/USA/*/* # Better for an unknown division to be listed as unknown than as "USA" North America/USA// North America/USA/?/? North America/USA/USA/ North America/USA/?/? @@ -6,5 +7,6 @@ North America/USA/CA/CA North America/USA/California/California North America/USA/KS/KS North America/USA/Kansas/Kansas North America/USA/MI/MI North America/USA/Michigan/Michigan North America/USA/NM/NM North America/USA/New Mexico/New Mexico +North America/USA/OR/ North America/USA/Oregon/Oregon North America/USA/TX/TX North America/USA/Texas/Texas North America/USA/WY/WY North America/USA/Wyoming/Wyoming