def construct_strain_name(record: dict, sample_id: str) -> str:
    """
    Build the strain name for *sample_id*, laid out as
    ``A/<host>/<country>/<sample_id>/<year>``.

    The host, country, and date are read from *record*; the year component
    is obtained by passing the record's date to `parse_year`.

    Every space character is dropped from the assembled name because spaces
    are not allowed in the downstream phylogenetic workflow.
    """
    # Keyword arguments are evaluated left to right, so the record lookups
    # happen in host -> country -> date order.
    strain_name = "A/{host}/{country}/{sample_id}/{year}".format(
        host=record['host'],
        country=record['country'],
        sample_id=sample_id,
        year=parse_year(record['date']),
    )
    # Strip spaces from the whole name, not just from individual fields.
    return strain_name.replace(" ", "")