From 9bbc61c977d6a1c7ec5262e2f9b6d0282aa66e18 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 10 Jun 2024 11:54:48 -0700 Subject: [PATCH 1/8] ingest/andersen-lab: Switch to automated metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous metadata file in the Andersen lab repo is no longer getting updated. It was last updated April 26, 2024.¹ The repo now automatically updates a new metadata CSV file since https://github.com/andersen-lab/avian-influenza/pull/2 ¹ https://github.com/andersen-lab/avian-influenza/blob/f04267ba8d2fcace275bd5b01e120abf036e3fec/metadata/PRJNA1102327_metadata.csv --- ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk b/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk index 86c07f8..e9e1978 100644 --- a/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk +++ b/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk @@ -22,14 +22,13 @@ rule extract_metadata: output: metadata = "andersen-lab/data/PRJNA1102327_metadata.csv" params: - output_dir = lambda wildcards, output: Path(output.metadata).parent + metadata_file_path = "metadata/SraRunTable_PRJNA1102327_automated.csv", shell: """ - tar xz --file={input.andersen_lab_repo} \ - --strip-components=2 \ - -C {params.output_dir} \ + tar xz -O --file={input.andersen_lab_repo} \ --wildcards \ - "*/metadata/PRJNA1102327_metadata.csv" + "*/{params.metadata_file_path:q}" \ + > {output.metadata} """ rule extract_consensus_sequences: From 60df51790eb1d8d11df8545c9d16ca84b600d062 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 10 Jun 2024 12:58:15 -0700 Subject: [PATCH 2/8] ingest/andersen-lab: Fix critical errors Fixes critical errors that prevented the andersen-lab ingest workflow from running due to the switch to the new automated CSV file in the previous commit. 
There will be a series of other improvements in subsequent commits to clean up the workflow. --- ingest/build-configs/ncbi/bin/curate-andersen-lab-data | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ingest/build-configs/ncbi/bin/curate-andersen-lab-data b/ingest/build-configs/ncbi/bin/curate-andersen-lab-data index 42e9690..3163217 100755 --- a/ingest/build-configs/ncbi/bin/curate-andersen-lab-data +++ b/ingest/build-configs/ncbi/bin/curate-andersen-lab-data @@ -43,15 +43,13 @@ def create_new_record(anderson_record: dict) -> dict: new_record = copy.deepcopy(NEXTSTRAIN_RECORD) new_record['isolate_id'] = anderson_record['Run'] new_record['sra_accessions'] = anderson_record['Run'] - new_record['division'] = anderson_record['US State'] - new_record['location'] = anderson_record['US State'] center_name = parse_center_name(anderson_record['Center Name']) new_record['originating_lab'] = center_name new_record['submitting_lab'] = center_name new_record['host'] = parse_host_group(anderson_record['Host']) - new_record['date'] = parse_date(anderson_record['Date']) + new_record['date'] = parse_date(anderson_record['Collection_Date']) new_record['strain'] = f'A/{anderson_record["Host"]}/{new_record["country"]}/{anderson_record["isolate"]}/{parse_year(new_record["date"])}' return new_record @@ -113,7 +111,7 @@ def parse_year(date_string: str) -> str: """ Parse the year from the provided `date_string` """ - date_formats = ['%Y-%m-%d', '%Y-XX-XX'] + date_formats = ['%Y-%m-%d', '%Y', '%Y-XX-XX'] for date_format in date_formats: try: parsed_date = datetime.strptime(date_string, date_format) From fc580daca10bb49421187c8603d5eee2f028a1ee Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 10 Jun 2024 13:05:58 -0700 Subject: [PATCH 3/8] ingest/andersen-lab: Stop hardcoding region/country The automated CSV file from Andersen lab includes the region and country columns as `geo_loc_name_country_continent` and `geo_loc_name_country` respectively, so we no longer 
have to hardcode their values. --- ingest/build-configs/ncbi/bin/curate-andersen-lab-data | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ingest/build-configs/ncbi/bin/curate-andersen-lab-data b/ingest/build-configs/ncbi/bin/curate-andersen-lab-data index 3163217..c5db9cb 100755 --- a/ingest/build-configs/ncbi/bin/curate-andersen-lab-data +++ b/ingest/build-configs/ncbi/bin/curate-andersen-lab-data @@ -17,8 +17,8 @@ NEXTSTRAIN_RECORD = { 'virus': 'avian_flu', 'isolate_id': '?', 'date': '?', - 'region': 'North America', - 'country': 'USA', + 'region': '?', + 'country': '?', 'division': '?', 'location': '?', 'host': '?', @@ -43,6 +43,8 @@ def create_new_record(anderson_record: dict) -> dict: new_record = copy.deepcopy(NEXTSTRAIN_RECORD) new_record['isolate_id'] = anderson_record['Run'] new_record['sra_accessions'] = anderson_record['Run'] + new_record['region'] = anderson_record['geo_loc_name_country_continent'] + new_record['country'] = anderson_record['geo_loc_name_country'] center_name = parse_center_name(anderson_record['Center Name']) new_record['originating_lab'] = center_name From 93fa2492ee4c760401f24edbf38f0f3867394458 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 10 Jun 2024 13:19:15 -0700 Subject: [PATCH 4/8] ingest/transform-host: case-insensitive comparisons of host values Preparing for using the same script for the Andersen lab ingest, where the `old_name` for host values include more variations on capitalizations. 
--- ingest/build-configs/ncbi/bin/transform-host | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/ingest/build-configs/ncbi/bin/transform-host b/ingest/build-configs/ncbi/bin/transform-host index eb5d765..a073012 100755 --- a/ingest/build-configs/ncbi/bin/transform-host +++ b/ingest/build-configs/ncbi/bin/transform-host @@ -11,7 +11,8 @@ from sys import stderr, stdin, stdout def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--host-map", required=True, - help="TSV file that maps `old_name` to the `new_name`.") + help="TSV file that maps `old_name` to the `new_name`. " + + "The `old_name` value is case-insensitive") return parser.parse_args() @@ -19,12 +20,15 @@ def parse_host_map(host_map_file: str) -> dict: """ Parse the provided *host_map_file* into a dictionary, where the key is the `old_name` and the values are the `new_name`. + + The `old_name` is transformed to lowercase to support case-insensitive + comparisons. """ host_map = {} with open(host_map_file) as tsv_file: reader = csv.DictReader(tsv_file, delimiter="\t") for row in reader: - host_map[row["old_name"]] = row["new_name"] + host_map[row["old_name"].lower()] = row["new_name"] return host_map @@ -37,7 +41,8 @@ if __name__ == "__main__": record = json.loads(record).copy() host_field = record.get("host") - host = host_map.get(host_field) + # Do case-insensitive comparison of host_field to host_map + host = host_map.get(host_field.lower() if isinstance(host_field, str) else "") if host_field is None: print( f"WARNING: Unable to transform host in record {index!r} " + From ff92a046781cca1798f00a0699d1cad4049f6995 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 10 Jun 2024 13:38:38 -0700 Subject: [PATCH 5/8] ingest/andersen-lab: Use `transform-host` to curate hosts The automated metadata has a lot more host values so it makes more sense to use the same `transform-host` script as the NCBI ingest. 
This also helps to ensure that they all use the same standard hosts. --- .../ncbi/bin/curate-andersen-lab-data | 33 +---------------- .../build-configs/ncbi/defaults/host-map.tsv | 37 +++++++++++++++++++ .../ncbi/rules/ingest_andersen_lab.smk | 4 ++ 3 files changed, 42 insertions(+), 32 deletions(-) diff --git a/ingest/build-configs/ncbi/bin/curate-andersen-lab-data b/ingest/build-configs/ncbi/bin/curate-andersen-lab-data index c5db9cb..eee05e5 100755 --- a/ingest/build-configs/ncbi/bin/curate-andersen-lab-data +++ b/ingest/build-configs/ncbi/bin/curate-andersen-lab-data @@ -45,12 +45,12 @@ def create_new_record(anderson_record: dict) -> dict: new_record['sra_accessions'] = anderson_record['Run'] new_record['region'] = anderson_record['geo_loc_name_country_continent'] new_record['country'] = anderson_record['geo_loc_name_country'] + new_record['host'] = anderson_record['Host'] center_name = parse_center_name(anderson_record['Center Name']) new_record['originating_lab'] = center_name new_record['submitting_lab'] = center_name - new_record['host'] = parse_host_group(anderson_record['Host']) new_record['date'] = parse_date(anderson_record['Collection_Date']) new_record['strain'] = f'A/{anderson_record["Host"]}/{new_record["country"]}/{anderson_record["isolate"]}/{parse_year(new_record["date"])}' return new_record @@ -63,37 +63,6 @@ def parse_center_name(center_name: str) -> str: return center_name -def parse_host_group(host: str) -> str: - """ - Bin `host` into the HOST_GROUPS - """ - # Replace with enum.StrEnum starting with Python 3.11 - class HOST_GROUPS(str, Enum): - AVIAN = 'Avian' - CATTLE = 'Cattle' - NONHUMAN_MAMMAL = 'Nonhuman Mammal' - - known_hosts = { - 'Blackbird': HOST_GROUPS.AVIAN, - 'Cat': HOST_GROUPS.NONHUMAN_MAMMAL, - 'Cattle': HOST_GROUPS.CATTLE, - 'CAGO': HOST_GROUPS.AVIAN, - 'Chicken': HOST_GROUPS.AVIAN, - 'Grackle': HOST_GROUPS.AVIAN, - 'Goose': HOST_GROUPS.AVIAN, - 'PEFA': HOST_GROUPS.AVIAN, - 'Skunk': HOST_GROUPS.NONHUMAN_MAMMAL, - 'Raccoon': 
HOST_GROUPS.NONHUMAN_MAMMAL, - } - - host_group = known_hosts.get(host) - if host_group is None: - print(f"WARNING: unable to group unknown host {host!r}", file=stderr) - return host - - return host_group - - def parse_date(date_string: str) -> str: """ If date_string is empty, 'NA', or includes a `?`, then returns the diff --git a/ingest/build-configs/ncbi/defaults/host-map.tsv b/ingest/build-configs/ncbi/defaults/host-map.tsv index 59e3edd..e8674bb 100644 --- a/ingest/build-configs/ncbi/defaults/host-map.tsv +++ b/ingest/build-configs/ncbi/defaults/host-map.tsv @@ -1,39 +1,76 @@ old_name new_name +alpaca Nonhuman Mammal +american crow Avian +american wigeon Avian Anas platyrhynchos Avian Anatidae Avian Anser caerulescens Avian Arenaria interpres Avian Aythya americana Avian +bald eagle Avian +black billed magpie Avian +blackbird Avian Bos taurus Cattle Branta canadensis Avian Bubo virginianus Avian Buteo jamaicensis Avian +cago Avian Calidris alba Avian +canada goose Avian Capra hircus Nonhuman Mammal +cat Nonhuman Mammal Cathartes aura Avian +cattle Cattle Chenonetta jubata Avian +chicken Avian Columbidae Avian +common raven Avian +comon-grackle Avian Corvus Avian Corvus brachyrhynchos Avian Corvus corax Avian Cygnus olor Avian Dairy cattle Cattle +domestic cat Nonhuman Mammal +domestic-cat Nonhuman Mammal +duck Avian environment Environment Falco peregrinus Avian Feliformia Nonhuman Mammal +feline Nonhuman Mammal Felis catus Nonhuman Mammal Gallus gallus Avian +ganada goose Avian +goat Nonhuman Mammal +goose Avian grackle Avian +great horned owl Avian Haliaeetus leucocephalus Avian +harris hawk Avian harris-hawk Avian +hawk Avian Homo sapiens Human Icteridae Avian Larus occidentalis Avian Lophodytes cucullatus Avian +mallard Avian Meleagris gallopavo Avian Mephitidae Nonhuman Mammal +mountain lion Nonhuman Mammal mountain_lion Nonhuman Mammal +mute swan Avian Panthera leo Nonhuman Mammal +pefa Avian Pelecanus erythrorhynchos Avian +pigeon Avian Procyon lotor 
Nonhuman Mammal +raccoon Nonhuman Mammal +red fox Nonhuman Mammal +red tailed hawk Avian +skunk Nonhuman Mammal +snow goose Avian Turdus merula Avian +turkey Avian +turkey vulture Avian +western gull Avian +western sandpiper Avian diff --git a/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk b/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk index e9e1978..6e0815c 100644 --- a/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk +++ b/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk @@ -78,11 +78,15 @@ rule curate_metadata: metadata = "andersen-lab/data/metadata.tsv" log: "andersen-lab/logs/curate_metadata.txt", + params: + host_map=config["curate"]["host_map"], shell: """ augur curate normalize-strings \ --metadata {input.metadata} \ | ./build-configs/ncbi/bin/curate-andersen-lab-data \ + | ./build-configs/ncbi/bin/transform-host \ + --host-map {params.host_map} \ | ./vendored/apply-geolocation-rules \ --geolocation-rules {input.geolocation_rules} \ | augur curate passthru \ From 2fd458530ff5b7248fce7add4e4e1f5492b53c6f Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 10 Jun 2024 13:52:59 -0700 Subject: [PATCH 6/8] ingest/andersen-lab: Use `augur curate format-dates` Instead of doing custom date transformations in `curate-andersen-lab-data`, just use `augur curate format-dates`. 
--- .../ncbi/bin/curate-andersen-lab-data | 19 ++----------------- .../ncbi/rules/ingest_andersen_lab.smk | 5 +++++ 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/ingest/build-configs/ncbi/bin/curate-andersen-lab-data b/ingest/build-configs/ncbi/bin/curate-andersen-lab-data index eee05e5..0009f65 100755 --- a/ingest/build-configs/ncbi/bin/curate-andersen-lab-data +++ b/ingest/build-configs/ncbi/bin/curate-andersen-lab-data @@ -46,12 +46,12 @@ def create_new_record(anderson_record: dict) -> dict: new_record['region'] = anderson_record['geo_loc_name_country_continent'] new_record['country'] = anderson_record['geo_loc_name_country'] new_record['host'] = anderson_record['Host'] + new_record['date'] = anderson_record['Collection_Date'] center_name = parse_center_name(anderson_record['Center Name']) new_record['originating_lab'] = center_name new_record['submitting_lab'] = center_name - new_record['date'] = parse_date(anderson_record['Collection_Date']) new_record['strain'] = f'A/{anderson_record["Host"]}/{new_record["country"]}/{anderson_record["isolate"]}/{parse_year(new_record["date"])}' return new_record @@ -63,26 +63,11 @@ def parse_center_name(center_name: str) -> str: return center_name -def parse_date(date_string: str) -> str: - """ - If date_string is empty, 'NA', or includes a `?`, then returns the - hardcoded `2024-XX-XX` date. - - Otherwise return the original date_string. - """ - default_date = '2024-XX-XX' - - if not date_string or date_string == 'NA' or '?' 
in date_string: - return default_date - - return date_string - - def parse_year(date_string: str) -> str: """ Parse the year from the provided `date_string` """ - date_formats = ['%Y-%m-%d', '%Y', '%Y-XX-XX'] + date_formats = ['%Y'] for date_format in date_formats: try: parsed_date = datetime.strptime(date_string, date_format) diff --git a/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk b/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk index 6e0815c..5cb9946 100644 --- a/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk +++ b/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk @@ -80,11 +80,16 @@ rule curate_metadata: "andersen-lab/logs/curate_metadata.txt", params: host_map=config["curate"]["host_map"], + date_fields=['date'], + expected_date_formats=['%Y'], shell: """ augur curate normalize-strings \ --metadata {input.metadata} \ | ./build-configs/ncbi/bin/curate-andersen-lab-data \ + | augur curate format-dates \ + --date-fields {params.date_fields} \ + --expected-date-formats {params.expected_date_formats} \ | ./build-configs/ncbi/bin/transform-host \ --host-map {params.host_map} \ | ./vendored/apply-geolocation-rules \ From 8786ee069f65ec522cd45c32bb3fe36b68df2cb6 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 10 Jun 2024 14:05:25 -0700 Subject: [PATCH 7/8] ingest/andersen-lab: Use `merge-user-metadata` Allow users to add annotations for the Andersen lab data. 
--- ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk b/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk index 5cb9946..d6ae9f7 100644 --- a/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk +++ b/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk @@ -73,7 +73,8 @@ rule rename_and_concatenate_segment_fastas: rule curate_metadata: input: metadata = "andersen-lab/data/PRJNA1102327_metadata.csv", - geolocation_rules = "defaults/geolocation_rules.tsv" + geolocation_rules = "defaults/geolocation_rules.tsv", + annotations=config["curate"]["annotations"], output: metadata = "andersen-lab/data/metadata.tsv" log: @@ -82,6 +83,7 @@ rule curate_metadata: host_map=config["curate"]["host_map"], date_fields=['date'], expected_date_formats=['%Y'], + annotations_id=config["curate"]["annotations_id"], shell: """ augur curate normalize-strings \ @@ -94,6 +96,9 @@ rule curate_metadata: --host-map {params.host_map} \ | ./vendored/apply-geolocation-rules \ --geolocation-rules {input.geolocation_rules} \ + | ./vendored/merge-user-metadata \ + --annotations {input.annotations} \ + --id-field {params.annotations_id} \ | augur curate passthru \ --output-metadata {output.metadata} 2>> {log} """ From a0e6d61c511e4bc60a8e235c84637b344988c293 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 10 Jun 2024 14:37:22 -0700 Subject: [PATCH 8/8] ingest/andersen-lab: Join old and automated metadata I didn't want to use the user annotations for 150+ records' date and division data, so I opted to just join the old metadata with the new automated metadata. 
--- .../ncbi/bin/curate-andersen-lab-data | 19 +++++++- .../ncbi/rules/ingest_andersen_lab.smk | 48 +++++++++++++++++-- 2 files changed, 62 insertions(+), 5 deletions(-) diff --git a/ingest/build-configs/ncbi/bin/curate-andersen-lab-data b/ingest/build-configs/ncbi/bin/curate-andersen-lab-data index 0009f65..7f93792 100755 --- a/ingest/build-configs/ncbi/bin/curate-andersen-lab-data +++ b/ingest/build-configs/ncbi/bin/curate-andersen-lab-data @@ -45,9 +45,11 @@ def create_new_record(anderson_record: dict) -> dict: new_record['sra_accessions'] = anderson_record['Run'] new_record['region'] = anderson_record['geo_loc_name_country_continent'] new_record['country'] = anderson_record['geo_loc_name_country'] + new_record['division'] = anderson_record.get('US State', '') + new_record['location'] = anderson_record.get('US State', '') new_record['host'] = anderson_record['Host'] - new_record['date'] = anderson_record['Collection_Date'] + new_record['date'] = use_date_when_available(anderson_record) center_name = parse_center_name(anderson_record['Center Name']) new_record['originating_lab'] = center_name new_record['submitting_lab'] = center_name @@ -56,6 +58,19 @@ def create_new_record(anderson_record: dict) -> dict: return new_record +def use_date_when_available(andersen_record: dict) -> str: + """ + Give the old date field `Date` precedence since they are more specific + """ + old_date_field = andersen_record.get("Date", "") + old_date_uncertain = "NA" in old_date_field or "?" 
in old_date_field + + if old_date_field and not old_date_uncertain: + return old_date_field + + return andersen_record["Collection_Date"] + + def parse_center_name(center_name: str) -> str: if center_name == 'USDA-NVSL': return center_name.replace('-', ' ') @@ -67,7 +82,7 @@ def parse_year(date_string: str) -> str: """ Parse the year from the provided `date_string` """ - date_formats = ['%Y'] + date_formats = ['%Y-%m-%d', '%Y'] for date_format in date_formats: try: parsed_date = datetime.strptime(date_string, date_format) diff --git a/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk b/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk index d6ae9f7..fb607b0 100644 --- a/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk +++ b/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk @@ -16,11 +16,30 @@ rule fetch_andersen_lab_repo: > {output.andersen_lab_repo} """ -rule extract_metadata: + +rule extract_old_metadata: + input: + andersen_lab_repo = "andersen-lab/data/avian-influenza.tar.gz" + output: + metadata = "andersen-lab/data/PRJNA1102327_old_metadata.csv" + params: + metadata_file_path = "metadata/PRJNA1102327_metadata.csv", + fields_to_keep = "Run,Date,US State", + shell: + """ + tar xz -O --file={input.andersen_lab_repo} \ + --wildcards \ + "*/{params.metadata_file_path:q}" \ + | csvtk cut -f {params.fields_to_keep:q} \ + > {output.metadata} + """ + + +rule extract_automated_metadata: input: andersen_lab_repo = "andersen-lab/data/avian-influenza.tar.gz" output: - metadata = "andersen-lab/data/PRJNA1102327_metadata.csv" + metadata = "andersen-lab/data/PRJNA1102327_automated_metadata.csv" params: metadata_file_path = "metadata/SraRunTable_PRJNA1102327_automated.csv", shell: @@ -31,6 +50,29 @@ rule extract_metadata: > {output.metadata} """ + +rule join_old_and_automated_metadata: + """ + Join the extra fields from the old metadata CSV to the automated metadata + to fill in additional data that is no longer included. 
+ """ + input: + old_metadata = "andersen-lab/data/PRJNA1102327_old_metadata.csv", + automated_metadata = "andersen-lab/data/PRJNA1102327_automated_metadata.csv", + output: + metadata = "andersen-lab/data/PRJNA1102327_metadata.csv", + params: + join_field = "Run", + shell: + """ + csvtk join -f {params.join_field:q} \ + --left-join \ + {input.automated_metadata} \ + {input.old_metadata} \ + > {output.metadata} + """ + + rule extract_consensus_sequences: input: andersen_lab_repo = "andersen-lab/data/avian-influenza.tar.gz" @@ -82,7 +124,7 @@ rule curate_metadata: params: host_map=config["curate"]["host_map"], date_fields=['date'], - expected_date_formats=['%Y'], + expected_date_formats=['%Y-%m-%d', '%Y'], annotations_id=config["curate"]["annotations_id"], shell: """