Skip to content

Commit

Permalink
Merge pull request #54 from nextstrain/switch-andersen-metadata
Browse files Browse the repository at this point in the history
Switch Andersen lab metadata
  • Loading branch information
joverlee521 authored Jun 10, 2024
2 parents b8de6b2 + a0e6d61 commit 0ec0870
Show file tree
Hide file tree
Showing 4 changed files with 130 additions and 64 deletions.
75 changes: 22 additions & 53 deletions ingest/build-configs/ncbi/bin/curate-andersen-lab-data
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ NEXTSTRAIN_RECORD = {
'virus': 'avian_flu',
'isolate_id': '?',
'date': '?',
'region': 'North America',
'country': 'USA',
'region': '?',
'country': '?',
'division': '?',
'location': '?',
'host': '?',
Expand All @@ -43,77 +43,46 @@ def create_new_record(anderson_record: dict) -> dict:
new_record = copy.deepcopy(NEXTSTRAIN_RECORD)
new_record['isolate_id'] = anderson_record['Run']
new_record['sra_accessions'] = anderson_record['Run']
new_record['division'] = anderson_record['US State']
new_record['location'] = anderson_record['US State']
new_record['region'] = anderson_record['geo_loc_name_country_continent']
new_record['country'] = anderson_record['geo_loc_name_country']
new_record['division'] = anderson_record.get('US State', '')
new_record['location'] = anderson_record.get('US State', '')
new_record['host'] = anderson_record['Host']

new_record['date'] = use_date_when_available(anderson_record)
center_name = parse_center_name(anderson_record['Center Name'])
new_record['originating_lab'] = center_name
new_record['submitting_lab'] = center_name

new_record['host'] = parse_host_group(anderson_record['Host'])
new_record['date'] = parse_date(anderson_record['Date'])
new_record['strain'] = f'A/{anderson_record["Host"]}/{new_record["country"]}/{anderson_record["isolate"]}/{parse_year(new_record["date"])}'
return new_record


def parse_center_name(center_name: str) -> str:
    """
    Return the submitting center name, normalized for output.

    The exact value 'USDA-NVSL' has its hyphen replaced with a space
    (yielding 'USDA NVSL'); every other center name is returned
    unchanged.
    """
    normalized = center_name.replace('-', ' ') if center_name == 'USDA-NVSL' else center_name
    return normalized


def parse_host_group(host: str) -> str:
"""
Bin `host` into the HOST_GROUPS
def use_date_when_available(andersen_record: dict) -> str:
"""
# Replace with enum.StrEnum starting with Python 3.11
class HOST_GROUPS(str, Enum):
AVIAN = 'Avian'
CATTLE = 'Cattle'
NONHUMAN_MAMMAL = 'Nonhuman Mammal'

known_hosts = {
'Blackbird': HOST_GROUPS.AVIAN,
'Cat': HOST_GROUPS.NONHUMAN_MAMMAL,
'Cattle': HOST_GROUPS.CATTLE,
'CAGO': HOST_GROUPS.AVIAN,
'Chicken': HOST_GROUPS.AVIAN,
'Grackle': HOST_GROUPS.AVIAN,
'Goose': HOST_GROUPS.AVIAN,
'PEFA': HOST_GROUPS.AVIAN,
'Skunk': HOST_GROUPS.NONHUMAN_MAMMAL,
'Raccoon': HOST_GROUPS.NONHUMAN_MAMMAL,
}

host_group = known_hosts.get(host)
if host_group is None:
print(f"WARNING: unable to group unknown host {host!r}", file=stderr)
return host

return host_group


def parse_date(date_string: str) -> str:
Give the old date field `Date` precedence since they are more specific
"""
If date_string is empty, 'NA', or includes a `?`, then returns the
hardcoded `2024-XX-XX` date.
old_date_field = andersen_record.get("Date", "")
old_date_uncertain = "NA" in old_date_field or "?" in old_date_field

Otherwise return the original date_string.
"""
default_date = '2024-XX-XX'
if old_date_field and not old_date_uncertain:
return old_date_field

return andersen_record["Collection_Date"]

if not date_string or date_string == 'NA' or '?' in date_string:
return default_date

return date_string
def parse_center_name(center_name: str) -> str:
    """
    Normalize a sequencing center name.

    Only the literal string 'USDA-NVSL' is rewritten, with its hyphen
    swapped for a space; all other names pass through untouched.
    """
    # Guard clause: anything other than the known hyphenated name is
    # returned as-is.
    if center_name != 'USDA-NVSL':
        return center_name
    return center_name.replace('-', ' ')


def parse_year(date_string: str) -> str:
"""
Parse the year from the provided `date_string`
"""
date_formats = ['%Y-%m-%d', '%Y-XX-XX']
date_formats = ['%Y-%m-%d', '%Y']
for date_format in date_formats:
try:
parsed_date = datetime.strptime(date_string, date_format)
Expand Down
11 changes: 8 additions & 3 deletions ingest/build-configs/ncbi/bin/transform-host
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,24 @@ from sys import stderr, stdin, stdout
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--host-map", required=True,
help="TSV file that maps `old_name` to the `new_name`.")
help="TSV file that maps `old_name` to the `new_name`. " +
"The `old_name` value is case-insensitive")
return parser.parse_args()


def parse_host_map(host_map_file: str) -> dict:
"""
Parse the provided *host_map_file* into a dictionary, where the key
is the `old_name` and the values are the `new_name`.
The `old_name` is transformed to lowercase to support case-insensitive
comparisons.
"""
host_map = {}
with open(host_map_file) as tsv_file:
reader = csv.DictReader(tsv_file, delimiter="\t")
for row in reader:
host_map[row["old_name"]] = row["new_name"]
host_map[row["old_name"].lower()] = row["new_name"]

return host_map

Expand All @@ -37,7 +41,8 @@ if __name__ == "__main__":
record = json.loads(record).copy()

host_field = record.get("host")
host = host_map.get(host_field)
# Do case-insensitive comparison of host_field to host_map
host = host_map.get(host_field.lower() if isinstance(host_field, str) else "")
if host_field is None:
print(
f"WARNING: Unable to transform host in record {index!r} " +
Expand Down
37 changes: 37 additions & 0 deletions ingest/build-configs/ncbi/defaults/host-map.tsv
Original file line number Diff line number Diff line change
@@ -1,39 +1,76 @@
old_name new_name
alpaca Nonhuman Mammal
american crow Avian
american wigeon Avian
Anas platyrhynchos Avian
Anatidae Avian
Anser caerulescens Avian
Arenaria interpres Avian
Aythya americana Avian
bald eagle Avian
black billed magpie Avian
blackbird Avian
Bos taurus Cattle
Branta canadensis Avian
Bubo virginianus Avian
Buteo jamaicensis Avian
cago Avian
Calidris alba Avian
canada goose Avian
Capra hircus Nonhuman Mammal
cat Nonhuman Mammal
Cathartes aura Avian
cattle Cattle
Chenonetta jubata Avian
chicken Avian
Columbidae Avian
common raven Avian
comon-grackle Avian
Corvus Avian
Corvus brachyrhynchos Avian
Corvus corax Avian
Cygnus olor Avian
Dairy cattle Cattle
domestic cat Nonhuman Mammal
domestic-cat Nonhuman Mammal
duck Avian
environment Environment
Falco peregrinus Avian
Feliformia Nonhuman Mammal
feline Nonhuman Mammal
Felis catus Nonhuman Mammal
Gallus gallus Avian
ganada goose Avian
goat Nonhuman Mammal
goose Avian
grackle Avian
great horned owl Avian
Haliaeetus leucocephalus Avian
harris hawk Avian
harris-hawk Avian
hawk Avian
Homo sapiens Human
Icteridae Avian
Larus occidentalis Avian
Lophodytes cucullatus Avian
mallard Avian
Meleagris gallopavo Avian
Mephitidae Nonhuman Mammal
mountain lion Nonhuman Mammal
mountain_lion Nonhuman Mammal
mute swan Avian
Panthera leo Nonhuman Mammal
pefa Avian
Pelecanus erythrorhynchos Avian
pigeon Avian
Procyon lotor Nonhuman Mammal
raccoon Nonhuman Mammal
red fox Nonhuman Mammal
red tailed hawk Avian
skunk Nonhuman Mammal
snow goose Avian
Turdus merula Avian
turkey Avian
turkey vulture Avian
western gull Avian
western sandpiper Avian
71 changes: 63 additions & 8 deletions ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk
Original file line number Diff line number Diff line change
Expand Up @@ -16,22 +16,63 @@ rule fetch_andersen_lab_repo:
> {output.andersen_lab_repo}
"""

rule extract_metadata:

rule extract_old_metadata:
input:
andersen_lab_repo = "andersen-lab/data/avian-influenza.tar.gz"
output:
metadata = "andersen-lab/data/PRJNA1102327_metadata.csv"
metadata = "andersen-lab/data/PRJNA1102327_old_metadata.csv"
params:
output_dir = lambda wildcards, output: Path(output.metadata).parent
metadata_file_path = "metadata/PRJNA1102327_metadata.csv",
fields_to_keep = "Run,Date,US State",
shell:
"""
tar xz --file={input.andersen_lab_repo} \
--strip-components=2 \
-C {params.output_dir} \
tar xz -O --file={input.andersen_lab_repo} \
--wildcards \
"*/{params.metadata_file_path:q}" \
| csvtk cut -f {params.fields_to_keep:q} \
> {output.metadata}
"""


rule extract_automated_metadata:
input:
andersen_lab_repo = "andersen-lab/data/avian-influenza.tar.gz"
output:
metadata = "andersen-lab/data/PRJNA1102327_automated_metadata.csv"
params:
metadata_file_path = "metadata/SraRunTable_PRJNA1102327_automated.csv",
shell:
"""
tar xz -O --file={input.andersen_lab_repo} \
--wildcards \
"*/metadata/PRJNA1102327_metadata.csv"
"*/{params.metadata_file_path:q}" \
> {output.metadata}
"""


rule join_old_and_automated_metadata:
"""
Join the extra fields from the old metadata CSV to the automated metadata
to fill in additional data that is no longer included.
"""
input:
old_metadata = "andersen-lab/data/PRJNA1102327_old_metadata.csv",
automated_metadata = "andersen-lab/data/PRJNA1102327_automated_metadata.csv",
output:
metadata = "andersen-lab/data/PRJNA1102327_metadata.csv",
params:
join_field = "Run",
shell:
"""
csvtk join -f {params.join_field:q} \
--left-join \
{input.automated_metadata} \
{input.old_metadata} \
> {output.metadata}
"""


rule extract_consensus_sequences:
input:
andersen_lab_repo = "andersen-lab/data/avian-influenza.tar.gz"
Expand Down Expand Up @@ -74,18 +115,32 @@ rule rename_and_concatenate_segment_fastas:
rule curate_metadata:
input:
metadata = "andersen-lab/data/PRJNA1102327_metadata.csv",
geolocation_rules = "defaults/geolocation_rules.tsv"
geolocation_rules = "defaults/geolocation_rules.tsv",
annotations=config["curate"]["annotations"],
output:
metadata = "andersen-lab/data/metadata.tsv"
log:
"andersen-lab/logs/curate_metadata.txt",
params:
host_map=config["curate"]["host_map"],
date_fields=['date'],
expected_date_formats=['%Y-%m-%d', '%Y'],
annotations_id=config["curate"]["annotations_id"],
shell:
"""
augur curate normalize-strings \
--metadata {input.metadata} \
| ./build-configs/ncbi/bin/curate-andersen-lab-data \
| augur curate format-dates \
--date-fields {params.date_fields} \
--expected-date-formats {params.expected_date_formats} \
| ./build-configs/ncbi/bin/transform-host \
--host-map {params.host_map} \
| ./vendored/apply-geolocation-rules \
--geolocation-rules {input.geolocation_rules} \
| ./vendored/merge-user-metadata \
--annotations {input.annotations} \
--id-field {params.annotations_id} \
| augur curate passthru \
--output-metadata {output.metadata} 2>> {log}
"""
Expand Down

0 comments on commit 0ec0870

Please sign in to comment.