From 4e03aed1b25319bbf66f2bce3c6eb08708b5bbe4 Mon Sep 17 00:00:00 2001 From: andrewkern Date: Tue, 7 Jan 2025 14:35:57 -0800 Subject: [PATCH 01/11] changes to maintenance script / ensembl push --- maintenance/main.py | 111 ++++++++++++++++++++++-- stdpopsim/catalog/ApiMel/genome_data.py | 34 ++++---- stdpopsim/catalog/BosTau/genome_data.py | 64 +++++++------- stdpopsim/catalog/CaeEle/genome_data.py | 2 +- stdpopsim/catalog/CanFam/species.py | 4 +- stdpopsim/catalog/DroMel/genome_data.py | 2 +- stdpopsim/catalog/GasAcu/species.py | 2 +- stdpopsim/catalog/HomSap/genome_data.py | 4 +- stdpopsim/catalog/StrAga/species.py | 2 +- stdpopsim/catalog/ensembl_info.py | 2 +- 10 files changed, 161 insertions(+), 66 deletions(-) diff --git a/maintenance/main.py b/maintenance/main.py index ecfa50f6b..4ae1cc93d 100644 --- a/maintenance/main.py +++ b/maintenance/main.py @@ -177,6 +177,10 @@ def black_format(code): def ensembl_stdpopsim_id(ensembl_id): + if ensembl_id == "canis_lupus_familiaris": + ensembl_id = "canis_familiaris" + elif ensembl_id == "escherichia_coli_str_k_12_substr_mg1655_gca_000005845": + ensembl_id = "escherichia_coli" tmp = ensembl_id.split("_")[:2] sps_id = "".join([x[0:3].capitalize() for x in tmp]) if len(sps_id) != 6: @@ -296,15 +300,50 @@ def write_genome_data(self, ensembl_id): raise ValueError( f"Directory {id} corresponding to {ensembl_id} does" + "not exist" ) - logger.info(f"Writing genome data for {sps_id} {ensembl_id}") - path = path / "genome_data.py" + + genome_data_path = path / "genome_data.py" + # Check if file exists and was manually created + if genome_data_path.exists(): + with open(genome_data_path) as f: + first_line = f.readline().strip() + if first_line.startswith("# File created manually"): + logger.info( + f"Skipping {sps_id} ({ensembl_id}): manually created \ + genome data file" + ) + return ("manual", None) + + # Get new data from Ensembl data = self.ensembl_client.get_genome_data(ensembl_id) + + # Check if existing genome data exists and compare chromosome names + if genome_data_path.exists(): + try: + # Get existing chromosome names + namespace = {} + with open(genome_data_path) as f: + exec(f.read(), namespace) + existing_chroms = set(namespace["data"]["chromosomes"].keys()) + new_chroms = set(data["chromosomes"].keys()) + + if existing_chroms != new_chroms: + logger.warning( + f"Skipping {sps_id} ({ensembl_id}): chromosome names mismatch." + ) + return ("chrom_mismatch", (existing_chroms, new_chroms)) + except Exception as e: + logger.warning( + f"Error comparing chromosome names for {sps_id}: {e}. \ + Proceeding with update." + ) + + logger.info(f"Writing genome data for {sps_id} {ensembl_id}") code = f"data = {data}" # Format the code with Black so we don't get noisy diffs - with self.write(path) as f: + with self.write(genome_data_path) as f: f.write(black_format(code)) - return data + return ("updated", None) def write_genome_data_ncbi(self, ncbi_id, sps_id): path = catalog_path(sps_id) @@ -371,16 +410,70 @@ def update_genome_data(species): will update the genome data for humans. Multiple species can be specified. By default all species are updated. """ - # TODO make this work for NCBI as well + # Track warnings and errors + skipped_species = [] + + # Original species processing logic if len(species) == 0: - embl_ids = [s.ensembl_id for s in stdpopsim.all_species()] + species_list = list(stdpopsim.all_species()) + logger.info(f"Found {len(species_list)} species in catalog") + embl_ids = [] + for s in species_list: + logger.info(f"Processing {s.id}: ensembl_id={s.ensembl_id}") + embl_ids.append((s.id, s.ensembl_id)) else: - embl_ids = [s.lower() for s in species] + embl_ids = [(s, s.lower()) for s in species] + + # Process each species, maintaining existing logging writer = DataWriter() - for eid in embl_ids: - writer.write_genome_data(eid) + for species_id, eid in embl_ids: + try: + result = writer.write_genome_data(eid) + if result is not None: + status, details = result + if status == "manual": + skipped_species.append( + (species_id, eid, "Manually created genome data file") + ) + elif status == "chrom_mismatch": + existing_chroms, new_chroms = details + skipped_species.append( + ( + species_id, + eid, + ( + f"Chromosome names mismatch.\n" + f"Existing: {sorted(existing_chroms)}\n" + f"New: {sorted(new_chroms)}" + ), + ) + ) + except ValueError as e: + logger.error(f"Error processing {eid}: {e}") + skipped_species.append((species_id, eid, str(e))) + except Exception as e: + logger.error(f"Unexpected error processing {eid}: {e}") + skipped_species.append((species_id, eid, f"Unexpected error: {str(e)}")) + writer.write_ensembl_release() + # Add summary report at the end + if skipped_species: + logger.warning("\n=== Species Update Summary ===") + logger.warning("The following species were not updated:") + for species_id, eid, reason in skipped_species: + if "Chromosome names mismatch" in reason: + # Split the chromosome mismatch message into multiple lines + logger.warning(f" - {species_id} (Ensembl ID: {eid}):") + logger.warning(" Chromosome names mismatch.") + # Parse the chromosome lists from the new format + existing = reason.split("Existing: ")[1].split("\n")[0] + new = reason.split("New: ")[1] + logger.warning(f" Existing chromosomes: {existing}") + logger.warning(f" New chromosomes: {new}") + else: + logger.warning(f" - {species_id} (Ensembl ID: {eid}): {reason}") + @cli.command() @click.argument("ensembl-id") diff --git a/stdpopsim/catalog/ApiMel/genome_data.py b/stdpopsim/catalog/ApiMel/genome_data.py index 117e59897..991c77ce2 100644 --- a/stdpopsim/catalog/ApiMel/genome_data.py +++ b/stdpopsim/catalog/ApiMel/genome_data.py @@ -3,22 +3,22 @@ "assembly_accession": "GCA_003254395.2", "assembly_name": "Amel_HAv3.1", "chromosomes": { - "CM009931.2": {"length": 27754200, "synonyms": ["NC_037638.1"]}, - "CM009932.2": {"length": 16089512, "synonyms": ["NC_037639.1"]}, - "CM009933.2": {"length": 13619445, "synonyms": ["NC_037640.1"]}, - "CM009934.2": {"length": 13404451, "synonyms": ["NC_037641.1"]}, - "CM009935.2": {"length": 13896941, "synonyms": ["NC_037642.1"]}, - "CM009936.2": {"length": 17789102, "synonyms": ["NC_037643.1"]}, - "CM009937.2": {"length": 14198698, "synonyms": ["NC_037644.1"]}, - "CM009938.2": {"length": 12717210, "synonyms": ["NC_037645.1"]}, - "CM009939.2": {"length": 12354651, "synonyms": ["NC_037646.1"]}, - "CM009940.2": {"length": 12360052, "synonyms": ["NC_037647.1"]}, - "CM009941.2": {"length": 16352600, "synonyms": ["NC_037648.1"]}, - "CM009942.2": {"length": 11514234, "synonyms": ["NC_037649.1"]}, - "CM009943.2": {"length": 11279722, "synonyms": ["NC_037650.1"]}, - "CM009944.2": {"length": 10670842, "synonyms": ["NC_037651.1"]}, - "CM009945.2": {"length": 9534514, "synonyms": ["NC_037652.1"]}, - "CM009946.2": {"length": 7238532, "synonyms": ["NC_037653.1"]}, - "CM009947.2": {"length": 16343, "synonyms": ["NC_001566.1", "MT"]}, + "CM009931.2": {"length": 27754200, "synonyms": []}, + "CM009932.2": {"length": 16089512, "synonyms": []}, + "CM009933.2": {"length": 13619445, "synonyms": []}, + "CM009934.2": {"length": 13404451, "synonyms": []}, + "CM009935.2": {"length": 13896941, "synonyms": []}, + "CM009936.2": {"length": 17789102, "synonyms": []}, + "CM009937.2": {"length": 14198698, "synonyms": []}, + "CM009938.2": {"length": 12717210, "synonyms": []}, + "CM009939.2": {"length": 12354651, "synonyms": []}, + "CM009940.2": {"length": 12360052, "synonyms": []}, + "CM009941.2": {"length": 16352600, "synonyms": []}, + "CM009942.2": {"length": 11514234, "synonyms": []}, + "CM009943.2": {"length": 11279722, "synonyms": []}, + "CM009944.2": {"length": 10670842, "synonyms": []}, + "CM009945.2": {"length": 9534514, "synonyms": []}, + "CM009946.2": {"length": 7238532, "synonyms": []}, + "CM009947.2": {"length": 16343, "synonyms": []}, }, } diff --git a/stdpopsim/catalog/BosTau/genome_data.py b/stdpopsim/catalog/BosTau/genome_data.py index 749727df2..ad4b6e2ee 100644 --- a/stdpopsim/catalog/BosTau/genome_data.py +++ b/stdpopsim/catalog/BosTau/genome_data.py @@ -1,38 +1,38 @@ # File autogenerated from Ensembl REST API. Do not edit. data = { - "assembly_accession": "GCA_002263795.2", - "assembly_name": "ARS-UCD1.2", + "assembly_accession": "GCA_002263795.3", + "assembly_name": "ARS-UCD1.3", "chromosomes": { - "1": {"length": 158534110, "synonyms": []}, - "2": {"length": 136231102, "synonyms": []}, - "3": {"length": 121005158, "synonyms": []}, - "4": {"length": 120000601, "synonyms": []}, - "5": {"length": 120089316, "synonyms": []}, - "6": {"length": 117806340, "synonyms": []}, - "7": {"length": 110682743, "synonyms": []}, - "8": {"length": 113319770, "synonyms": []}, - "9": {"length": 105454467, "synonyms": []}, - "10": {"length": 103308737, "synonyms": []}, - "11": {"length": 106982474, "synonyms": []}, - "12": {"length": 87216183, "synonyms": []}, - "13": {"length": 83472345, "synonyms": []}, - "14": {"length": 82403003, "synonyms": []}, - "15": {"length": 85007780, "synonyms": []}, - "16": {"length": 81013979, "synonyms": []}, - "17": {"length": 73167244, "synonyms": []}, - "18": {"length": 65820629, "synonyms": []}, - "19": {"length": 63449741, "synonyms": []}, - "20": {"length": 71974595, "synonyms": []}, - "21": {"length": 69862954, "synonyms": []}, - "22": {"length": 60773035, "synonyms": []}, - "23": {"length": 52498615, "synonyms": []}, - "24": {"length": 62317253, "synonyms": []}, - "25": {"length": 42350435, "synonyms": []}, - "26": {"length": 51992305, "synonyms": []}, - "27": {"length": 45612108, "synonyms": []}, - "28": {"length": 45940150, "synonyms": []}, - "29": {"length": 51098607, "synonyms": []}, - "X": {"length": 139009144, "synonyms": []}, + "1": {"length": 158534110, "synonyms": ["chr1"]}, + "2": {"length": 136231102, "synonyms": ["chr2"]}, + "3": {"length": 121005158, "synonyms": ["chr3"]}, + "4": {"length": 120000601, "synonyms": ["chr4"]}, + "5": {"length": 120089316, "synonyms": ["chr5"]}, + "6": {"length": 117806340, "synonyms": ["chr6"]}, + "7": {"length": 110682743, "synonyms": ["chr7"]}, + "8": {"length": 113319770, "synonyms": ["chr8"]}, + "9": {"length": 105454467, "synonyms": ["chr9"]}, + "10": {"length": 103308737, "synonyms": ["chr10"]}, + "11": {"length": 106982474, "synonyms": ["chr11"]}, + "12": {"length": 87216183, "synonyms": ["chr12"]}, + "13": {"length": 83472345, "synonyms": ["chr13"]}, + "14": {"length": 82403003, "synonyms": ["chr14"]}, + "15": {"length": 85007780, "synonyms": ["chr15"]}, + "16": {"length": 81013979, "synonyms": ["chr16"]}, + "17": {"length": 73167244, "synonyms": ["chr17"]}, + "18": {"length": 65820629, "synonyms": ["chr18"]}, + "19": {"length": 63449741, "synonyms": ["chr19"]}, + "20": {"length": 71974595, "synonyms": ["chr20"]}, + "21": {"length": 69862954, "synonyms": ["chr21"]}, + "22": {"length": 60773035, "synonyms": ["chr22"]}, + "23": {"length": 52498615, "synonyms": ["chr23"]}, + "24": {"length": 62317253, "synonyms": ["chr24"]}, + "25": {"length": 42350435, "synonyms": ["chr25"]}, + "26": {"length": 51992305, "synonyms": ["chr26"]}, + "27": {"length": 45612108, "synonyms": ["chr27"]}, + "28": {"length": 45940150, "synonyms": ["chr28"]}, + "29": {"length": 51098607, "synonyms": ["chr29"]}, + "X": {"length": 139009144, "synonyms": ["chrX"]}, "MT": {"length": 16338, "synonyms": []}, }, } diff --git a/stdpopsim/catalog/CaeEle/genome_data.py b/stdpopsim/catalog/CaeEle/genome_data.py index 93437cdca..0cc4fdd79 100644 --- a/stdpopsim/catalog/CaeEle/genome_data.py +++ b/stdpopsim/catalog/CaeEle/genome_data.py @@ -1,7 +1,7 @@ # File autogenerated from Ensembl REST API. Do not edit. data = { "assembly_accession": "GCA_000002985.3", - "assembly_name": "ce11", + "assembly_name": "WBcel235", "chromosomes": { "I": {"length": 15072434, "synonyms": []}, "II": {"length": 15279421, "synonyms": []}, diff --git a/stdpopsim/catalog/CanFam/species.py b/stdpopsim/catalog/CanFam/species.py index 82b59cdd1..d05a8472f 100644 --- a/stdpopsim/catalog/CanFam/species.py +++ b/stdpopsim/catalog/CanFam/species.py @@ -46,6 +46,7 @@ "38": 1.4363726512881696e-08, "X": 9.506483722244087e-09, "MT": 0, + "Y": 0, # manually set to 0 because it's not in the map } # Generic and chromosome-specific ploidy @@ -91,6 +92,7 @@ "38": _species_ploidy, "X": _species_ploidy, "MT": 1, + "Y": 1, } _LindbladTohEtAl = stdpopsim.Citation( @@ -152,7 +154,7 @@ _species = stdpopsim.Species( id="CanFam", - ensembl_id="canis_familiaris", + ensembl_id="canis_lupus_familiaris", name="Canis familiaris", common_name="Dog", genome=_genome, diff --git a/stdpopsim/catalog/DroMel/genome_data.py b/stdpopsim/catalog/DroMel/genome_data.py index eb3d7785b..87916cd6e 100644 --- a/stdpopsim/catalog/DroMel/genome_data.py +++ b/stdpopsim/catalog/DroMel/genome_data.py @@ -1,7 +1,7 @@ # File autogenerated from Ensembl REST API. Do not edit. data = { "assembly_accession": "GCA_000001215.4", - "assembly_name": "BDGP6.32", + "assembly_name": "BDGP6.46", "chromosomes": { "2L": {"length": 23513712, "synonyms": []}, "2R": {"length": 25286936, "synonyms": []}, diff --git a/stdpopsim/catalog/GasAcu/species.py b/stdpopsim/catalog/GasAcu/species.py index 7ba2a598b..b33f1633d 100644 --- a/stdpopsim/catalog/GasAcu/species.py +++ b/stdpopsim/catalog/GasAcu/species.py @@ -116,7 +116,7 @@ _species = stdpopsim.Species( id="GasAcu", - ensembl_id="9307941", + ensembl_id="gasterosteus_aculeatus", name="Gasterosteus aculeatus", common_name="Three-spined stickleback", genome=_genome, diff --git a/stdpopsim/catalog/HomSap/genome_data.py b/stdpopsim/catalog/HomSap/genome_data.py index e77c9ef57..4e3fcdc3b 100644 --- a/stdpopsim/catalog/HomSap/genome_data.py +++ b/stdpopsim/catalog/HomSap/genome_data.py @@ -1,7 +1,7 @@ # File autogenerated from Ensembl REST API. Do not edit. data = { - "assembly_accession": "GCA_000001405.28", - "assembly_name": "GRCh38.p13", + "assembly_accession": "GCA_000001405.29", + "assembly_name": "GRCh38.p14", "chromosomes": { "1": {"length": 248956422, "synonyms": ["chr1"]}, "2": {"length": 242193529, "synonyms": ["chr2"]}, diff --git a/stdpopsim/catalog/StrAga/species.py b/stdpopsim/catalog/StrAga/species.py index f47bd7de8..c6face906 100644 --- a/stdpopsim/catalog/StrAga/species.py +++ b/stdpopsim/catalog/StrAga/species.py @@ -93,7 +93,7 @@ _species = stdpopsim.Species( id="StrAga", - ensembl_id="NA", + ensembl_id="streptococcus_agalactiae_GCA_001017915", name="Streptococcus agalactiae", common_name="Group B Streptococcus", genome=_genome, diff --git a/stdpopsim/catalog/ensembl_info.py b/stdpopsim/catalog/ensembl_info.py index 06d064d58..4e4f9d439 100644 --- a/stdpopsim/catalog/ensembl_info.py +++ b/stdpopsim/catalog/ensembl_info.py @@ -1,2 +1,2 @@ # File autogenerated from Ensembl REST API. Do not edit. -release = 103 +release = 113 From 2fb949994c06f6c6a8aaf8a48c390c11f7532769 Mon Sep 17 00:00:00 2001 From: andrewkern Date: Wed, 8 Jan 2025 10:52:35 -0800 Subject: [PATCH 02/11] update ensembl tests --- tests/test_CanFam.py | 2 +- tests/test_GasAcu.py | 2 +- tests/test_StrAga.py | 2 +- tests/test_ensembl.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_CanFam.py b/tests/test_CanFam.py index 6c0faf77b..ff5a53662 100644 --- a/tests/test_CanFam.py +++ b/tests/test_CanFam.py @@ -9,7 +9,7 @@ class TestSpeciesData(test_species.SpeciesTestBase): species = stdpopsim.get_species("CanFam") def test_ensembl_id(self): - assert self.species.ensembl_id == "canis_familiaris" + assert self.species.ensembl_id == "canis_lupus_familiaris" def test_name(self): assert self.species.name == "Canis familiaris" diff --git a/tests/test_GasAcu.py b/tests/test_GasAcu.py index f0eb44530..9bd6e99ab 100644 --- a/tests/test_GasAcu.py +++ b/tests/test_GasAcu.py @@ -9,7 +9,7 @@ class TestSpeciesData(test_species.SpeciesTestBase): species = stdpopsim.get_species("GasAcu") def test_ensembl_id(self): - assert self.species.ensembl_id == "9307941" + assert self.species.ensembl_id == "gasterosteus_aculeatus" def test_name(self): assert self.species.name == "Gasterosteus aculeatus" diff --git a/tests/test_StrAga.py b/tests/test_StrAga.py index b4a86611a..691a423d7 100644 --- a/tests/test_StrAga.py +++ b/tests/test_StrAga.py @@ -9,7 +9,7 @@ class TestSpeciesData(test_species.SpeciesTestBase): species = stdpopsim.get_species("StrAga") def test_ensembl_id(self): - assert self.species.ensembl_id == "NA" + assert self.species.ensembl_id == "streptococcus_agalactiae_GCA_001017915" def test_name(self): assert self.species.name == "Streptococcus agalactiae" diff --git a/tests/test_ensembl.py b/tests/test_ensembl.py index d28f72bed..496b6dfc7 100644 --- a/tests/test_ensembl.py +++ b/tests/test_ensembl.py @@ -4,4 +4,4 @@ # Make sure we don't update the release without realising it. def test_version(): release = stdpopsim.catalog.ensembl_info.release - assert release == 103 + assert release == 113 From 71751a01b8696f3b54e3e8a59d78c766d32fe0b6 Mon Sep 17 00:00:00 2001 From: andrewkern Date: Thu, 9 Jan 2025 16:00:32 -0800 Subject: [PATCH 03/11] added assembly attributes to API; lots of associated changes --- maintenance/main.py | 46 +++++++++++++++++++----- stdpopsim/catalog/AedAeg/genome_data.py | 2 ++ stdpopsim/catalog/AnaPla/genome_data.py | 2 ++ stdpopsim/catalog/AnoCar/genome_data.py | 2 ++ stdpopsim/catalog/AnoGam/genome_data.py | 2 ++ stdpopsim/catalog/ApiMel/genome_data.py | 2 ++ stdpopsim/catalog/AraTha/genome_data.py | 2 ++ stdpopsim/catalog/BosTau/genome_data.py | 2 ++ stdpopsim/catalog/BosTau/species.py | 48 +++++++------------------ stdpopsim/catalog/CaeEle/genome_data.py | 2 ++ stdpopsim/catalog/CaeEle/species.py | 9 ++--- stdpopsim/catalog/CanFam/genome_data.py | 2 ++ stdpopsim/catalog/CanFam/species.py | 20 +++++++---- stdpopsim/catalog/ChlRei/genome_data.py | 2 ++ stdpopsim/catalog/DroMel/genome_data.py | 2 ++ stdpopsim/catalog/DroSec/genome_data.py | 2 ++ stdpopsim/catalog/EscCol/genome_data.py | 2 ++ stdpopsim/catalog/EscCol/species.py | 22 +++++++++--- stdpopsim/catalog/GasAcu/genome_data.py | 2 ++ stdpopsim/catalog/HelAnn/genome_data.py | 2 ++ stdpopsim/catalog/HelMel/genome_data.py | 2 ++ stdpopsim/catalog/HomSap/genome_data.py | 2 ++ stdpopsim/catalog/MusMus/genome_data.py | 2 ++ stdpopsim/catalog/OrySat/genome_data.py | 2 ++ stdpopsim/catalog/PanTro/genome_data.py | 2 ++ stdpopsim/catalog/PapAnu/genome_data.py | 2 ++ stdpopsim/catalog/PhoSin/genome_data.py | 2 ++ stdpopsim/catalog/PonAbe/genome_data.py | 2 ++ stdpopsim/catalog/StrAga/genome_data.py | 2 ++ stdpopsim/catalog/__init__.py | 1 - stdpopsim/catalog/ensembl_info.py | 2 -- stdpopsim/genomes.py | 8 +++++ tests/test_AedAeg.py | 6 ++++ tests/test_AnaPla.py | 6 ++++ tests/test_AnoCar.py | 6 ++++ tests/test_AnoGam.py | 6 ++++ tests/test_ApiMel.py | 6 ++++ tests/test_AraTha.py | 6 ++++ tests/test_BosTau.py | 6 ++++ tests/test_CaeEle.py | 6 ++++ tests/test_CanFam.py | 6 ++++ tests/test_ChlRei.py | 6 ++++ tests/test_DroMel.py | 6 ++++ tests/test_DroSec.py | 6 ++++ tests/test_EscCol.py | 6 ++++ tests/test_ensembl.py | 7 ---- 46 files changed, 221 insertions(+), 68 deletions(-) delete mode 100644 stdpopsim/catalog/ensembl_info.py delete mode 100644 tests/test_ensembl.py diff --git a/maintenance/main.py b/maintenance/main.py index 4ae1cc93d..8de710488 100644 --- a/maintenance/main.py +++ b/maintenance/main.py @@ -302,20 +302,50 @@ def write_genome_data(self, ensembl_id): ) genome_data_path = path / "genome_data.py" - # Check if file exists and was manually created + # Check if file exists and has non-Ensembl assembly source if genome_data_path.exists(): - with open(genome_data_path) as f: - first_line = f.readline().strip() - if first_line.startswith("# File created manually"): + try: + # Get existing data + namespace = {} + with open(genome_data_path) as f: + exec(f.read(), namespace) + existing_assembly_source = namespace["data"].get( + "assembly_source", "ensembl" + ) + if existing_assembly_source != "ensembl": logger.info( - f"Skipping {sps_id} ({ensembl_id}): manually created \ - genome data file" + f"Skipping {sps_id} ({ensembl_id}): non-Ensembl assembly source" ) return ("manual", None) + except Exception as e: + logger.warning( + f"Error checking assembly source for {sps_id}: {e}. " + "Proceeding with update." + ) # Get new data from Ensembl data = self.ensembl_client.get_genome_data(ensembl_id) + # Preserve existing assembly source or default to "ensembl" + if genome_data_path.exists(): + try: + namespace = {} + with open(genome_data_path) as f: + exec(f.read(), namespace) + data["assembly_source"] = namespace["data"].get( + "assembly_source", "ensembl" + ) + except Exception: + data["assembly_source"] = "ensembl" + else: + data["assembly_source"] = "ensembl" + + # Add Ensembl version number if assembly source is Ensembl + if data["assembly_source"] == "ensembl": + data["assembly_build_version"] = str(self.ensembl_client.get_release()) + else: + data["assembly_build_version"] = None + # Check if existing genome data exists and compare chromosome names if genome_data_path.exists(): try: @@ -455,7 +485,7 @@ def update_genome_data(species): logger.error(f"Unexpected error processing {eid}: {e}") skipped_species.append((species_id, eid, f"Unexpected error: {str(e)}")) - writer.write_ensembl_release() + # writer.write_ensembl_release() # Add summary report at the end if skipped_species: @@ -484,7 +514,7 @@ def add_species(ensembl_id, force): """ writer = DataWriter() writer.add_species(ensembl_id.lower(), force=force) - writer.write_ensembl_release() + # writer.write_ensembl_release() # TODO refactor this so that it's an option to add-species. By default diff --git a/stdpopsim/catalog/AedAeg/genome_data.py b/stdpopsim/catalog/AedAeg/genome_data.py index 59bd25506..608c8b4f3 100644 --- a/stdpopsim/catalog/AedAeg/genome_data.py +++ b/stdpopsim/catalog/AedAeg/genome_data.py @@ -8,4 +8,6 @@ "3": {"length": 409777670, "synonyms": []}, "MT": {"length": 16790, "synonyms": []}, }, + "assembly_source": "ensembl", + "assembly_build_version": "113", } diff --git a/stdpopsim/catalog/AnaPla/genome_data.py b/stdpopsim/catalog/AnaPla/genome_data.py index e987ebbff..2ef7c8d6d 100644 --- a/stdpopsim/catalog/AnaPla/genome_data.py +++ b/stdpopsim/catalog/AnaPla/genome_data.py @@ -45,4 +45,6 @@ "39": {"length": 2018729, "synonyms": []}, "40": {"length": 1354177, "synonyms": []}, }, + "assembly_source": "ensembl", + "assembly_build_version": "113", } diff --git a/stdpopsim/catalog/AnoCar/genome_data.py b/stdpopsim/catalog/AnoCar/genome_data.py index 647776205..4eb3fe378 100644 --- a/stdpopsim/catalog/AnoCar/genome_data.py +++ b/stdpopsim/catalog/AnoCar/genome_data.py @@ -18,4 +18,6 @@ "LGh": {"length": 248369, "synonyms": []}, "MT": {"length": 17223, "synonyms": []}, }, + "assembly_source": "ensembl", + "assembly_build_version": "103", } diff --git a/stdpopsim/catalog/AnoGam/genome_data.py b/stdpopsim/catalog/AnoGam/genome_data.py index 4c50691af..d840815c3 100644 --- a/stdpopsim/catalog/AnoGam/genome_data.py +++ b/stdpopsim/catalog/AnoGam/genome_data.py @@ -10,4 +10,6 @@ "X": {"length": 24393108, "synonyms": []}, "Mt": {"length": 15363, "synonyms": []}, }, + "assembly_source": "ensembl", + "assembly_build_version": "113", } diff --git a/stdpopsim/catalog/ApiMel/genome_data.py b/stdpopsim/catalog/ApiMel/genome_data.py index 991c77ce2..8a467cda7 100644 --- a/stdpopsim/catalog/ApiMel/genome_data.py +++ b/stdpopsim/catalog/ApiMel/genome_data.py @@ -21,4 +21,6 @@ "CM009946.2": {"length": 7238532, "synonyms": []}, "CM009947.2": {"length": 16343, "synonyms": []}, }, + "assembly_source": "ensembl", + "assembly_build_version": "113", } diff --git a/stdpopsim/catalog/AraTha/genome_data.py b/stdpopsim/catalog/AraTha/genome_data.py index 72c886b17..1c55ac701 100644 --- a/stdpopsim/catalog/AraTha/genome_data.py +++ b/stdpopsim/catalog/AraTha/genome_data.py @@ -11,4 +11,6 @@ "Mt": {"length": 366924, "synonyms": []}, "Pt": {"length": 154478, "synonyms": []}, }, + "assembly_source": "ensembl", + "assembly_build_version": "113", } diff --git a/stdpopsim/catalog/BosTau/genome_data.py b/stdpopsim/catalog/BosTau/genome_data.py index ad4b6e2ee..d905b46f1 100644 --- a/stdpopsim/catalog/BosTau/genome_data.py +++ b/stdpopsim/catalog/BosTau/genome_data.py @@ -35,4 +35,6 @@ "X": {"length": 139009144, "synonyms": ["chrX"]}, "MT": {"length": 16338, "synonyms": []}, }, + "assembly_source": "ensembl", + "assembly_build_version": "113", } diff --git a/stdpopsim/catalog/BosTau/species.py b/stdpopsim/catalog/BosTau/species.py index 0f48e647f..09a38612f 100644 --- a/stdpopsim/catalog/BosTau/species.py +++ b/stdpopsim/catalog/BosTau/species.py @@ -59,6 +59,12 @@ # 24.35 / 2628394923 = 9.26e-9 per bp per generation. _genome_wide_recombination_rate = 9.26e-9 +# Mutation rate +_mutation_rate = 1.2e-8 +_mutation_rate_data = {str(i): _mutation_rate for i in range(1, 30)} +_mutation_rate_data["MT"] = _mutation_rate +_mutation_rate_data["X"] = _mutation_rate + _recombination_rate_data = collections.defaultdict( lambda: _genome_wide_recombination_rate ) @@ -67,39 +73,8 @@ # Generic and chromosome-specific ploidy _species_ploidy = 2 -_ploidy = { - "1": _species_ploidy, - "2": _species_ploidy, - "3": _species_ploidy, - "4": _species_ploidy, - "5": _species_ploidy, - "6": _species_ploidy, - "7": _species_ploidy, - "8": _species_ploidy, - "9": _species_ploidy, - "10": _species_ploidy, - "11": _species_ploidy, - "12": _species_ploidy, - "13": _species_ploidy, - "14": _species_ploidy, - "15": _species_ploidy, - "16": _species_ploidy, - "17": _species_ploidy, - "18": _species_ploidy, - "19": _species_ploidy, - "20": _species_ploidy, - "21": _species_ploidy, - "22": _species_ploidy, - "23": _species_ploidy, - "24": _species_ploidy, - "25": _species_ploidy, - "26": _species_ploidy, - "27": _species_ploidy, - "28": _species_ploidy, - "29": _species_ploidy, - "X": _species_ploidy, - "MT": 1, -} +_ploidy = {str(i): _species_ploidy for i in range(1, 30)} +_ploidy.update({"X": _species_ploidy, "MT": 1}) _chromosomes = [] for name, data in genome_data.data["chromosomes"].items(): @@ -115,8 +90,11 @@ ) ) -_genome = stdpopsim.Genome( - chromosomes=_chromosomes, +_genome = stdpopsim.Genome.from_data( + genome_data=genome_data.data, + recombination_rate=_recombination_rate_data, + mutation_rate=_mutation_rate_data, + ploidy=_ploidy, citations=[ _HoweEtAl, # ASSEMBLY _RosenEtAl, # ASSEMBLY diff --git a/stdpopsim/catalog/CaeEle/genome_data.py b/stdpopsim/catalog/CaeEle/genome_data.py index 0cc4fdd79..5f37fac58 100644 --- a/stdpopsim/catalog/CaeEle/genome_data.py +++ b/stdpopsim/catalog/CaeEle/genome_data.py @@ -11,4 +11,6 @@ "X": {"length": 17718942, "synonyms": []}, "MtDNA": {"length": 13794, "synonyms": []}, }, + "assembly_source": "ensembl", + "assembly_build_version": "113", } diff --git a/stdpopsim/catalog/CaeEle/species.py b/stdpopsim/catalog/CaeEle/species.py index 4043c00a5..f7002a815 100644 --- a/stdpopsim/catalog/CaeEle/species.py +++ b/stdpopsim/catalog/CaeEle/species.py @@ -123,10 +123,11 @@ ) ) -_genome = stdpopsim.Genome( - chromosomes=_chromosomes, - assembly_name=genome_data.data["assembly_name"], - assembly_accession=genome_data.data["assembly_accession"], +_genome = stdpopsim.Genome.from_data( + genome_data=genome_data.data, + recombination_rate=_recombination_rate_data, + mutation_rate=_mutation_rate_data, + ploidy=_ploidy, citations=[ _genome1998, _KonradEtAl2019.because(stdpopsim.CiteReason.MUT_RATE), diff --git a/stdpopsim/catalog/CanFam/genome_data.py b/stdpopsim/catalog/CanFam/genome_data.py index 40b13c629..ee80b831f 100644 --- a/stdpopsim/catalog/CanFam/genome_data.py +++ b/stdpopsim/catalog/CanFam/genome_data.py @@ -44,4 +44,6 @@ "X": {"length": 123869142, "synonyms": []}, "MT": {"length": 16727, "synonyms": []}, }, + "assembly_source": "ensembl", + "assembly_build_version": "103", } diff --git a/stdpopsim/catalog/CanFam/species.py b/stdpopsim/catalog/CanFam/species.py index d05a8472f..1e6ba8726 100644 --- a/stdpopsim/catalog/CanFam/species.py +++ b/stdpopsim/catalog/CanFam/species.py @@ -46,7 +46,7 @@ "38": 1.4363726512881696e-08, "X": 9.506483722244087e-09, "MT": 0, - "Y": 0, # manually set to 0 because it's not in the map + # "Y": 0, # removed because not in the map or assembly } # Generic and chromosome-specific ploidy @@ -92,9 +92,16 @@ "38": _species_ploidy, "X": _species_ploidy, "MT": 1, - "Y": 1, + # "Y": 1, removed because it's not in the map } +_mutation_rate = 4e-9 +_mutation_rate_data = {str(i): _mutation_rate for i in range(1, 39)} +_mutation_rate_data["MT"] = ( + _mutation_rate # note this is likely incorrect but consistent with current setup +) +_mutation_rate_data["X"] = _mutation_rate + _LindbladTohEtAl = stdpopsim.Citation( # Genome sequence, comparative analysis and haplotype structure of the # domestic dog. @@ -139,10 +146,11 @@ ) ) -_genome = stdpopsim.Genome( - chromosomes=_chromosomes, - assembly_name=genome_data.data["assembly_name"], - assembly_accession=genome_data.data["assembly_accession"], +_genome = stdpopsim.Genome.from_data( + genome_data=genome_data.data, + recombination_rate=_recombination_rate_data, + mutation_rate=_mutation_rate_data, + ploidy=_ploidy, citations=[ _SkoglundEtAl.because(stdpopsim.CiteReason.MUT_RATE), _FrantzEtAl.because(stdpopsim.CiteReason.MUT_RATE), diff --git a/stdpopsim/catalog/ChlRei/genome_data.py b/stdpopsim/catalog/ChlRei/genome_data.py index 2ac80adef..a6d7e5f14 100644 --- a/stdpopsim/catalog/ChlRei/genome_data.py +++ b/stdpopsim/catalog/ChlRei/genome_data.py @@ -21,4 +21,6 @@ "16": {"length": 7783580, "synonyms": []}, "17": {"length": 7188315, "synonyms": []}, }, + "assembly_source": "ensembl", + "assembly_build_version": "113", } diff --git a/stdpopsim/catalog/DroMel/genome_data.py b/stdpopsim/catalog/DroMel/genome_data.py index 87916cd6e..ef42cca7a 100644 --- a/stdpopsim/catalog/DroMel/genome_data.py +++ b/stdpopsim/catalog/DroMel/genome_data.py @@ -12,4 +12,6 @@ "Y": {"length": 3667352, "synonyms": []}, "mitochondrion_genome": {"length": 19524, "synonyms": []}, }, + "assembly_source": "ensembl", + "assembly_build_version": "113", } diff --git a/stdpopsim/catalog/DroSec/genome_data.py b/stdpopsim/catalog/DroSec/genome_data.py index 8f3d8cf56..26dea5c1f 100644 --- a/stdpopsim/catalog/DroSec/genome_data.py +++ b/stdpopsim/catalog/DroSec/genome_data.py @@ -10,4 +10,6 @@ "X": {"length": 22909512, "synonyms": []}, "4": {"length": 1277805, "synonyms": []}, }, + "assembly_source": "manual", + "assembly_build_version": None, } diff --git a/stdpopsim/catalog/EscCol/genome_data.py b/stdpopsim/catalog/EscCol/genome_data.py index cdb8b849d..e93f2fd5e 100644 --- a/stdpopsim/catalog/EscCol/genome_data.py +++ b/stdpopsim/catalog/EscCol/genome_data.py @@ -3,4 +3,6 @@ "assembly_accession": "GCA_000005845.2", "assembly_name": "ASM584v2", "chromosomes": {"Chromosome": {"length": 4641652, "synonyms": []}}, + "assembly_source": "ensembl", + "assembly_build_version": "113", } diff --git a/stdpopsim/catalog/EscCol/species.py b/stdpopsim/catalog/EscCol/species.py index 551618914..fa1b66a6c 100644 --- a/stdpopsim/catalog/EscCol/species.py +++ b/stdpopsim/catalog/EscCol/species.py @@ -29,7 +29,6 @@ ) _species_ploidy = 1 - _chromosomes = [] for name, data in genome_data.data["chromosomes"].items(): _chromosomes.append( @@ -46,11 +45,24 @@ ) ) -_genome = stdpopsim.Genome( - chromosomes=_chromosomes, + +_ploidy_data = {str(i.id): _species_ploidy for i in _chromosomes} + +_mutation_rate = 8.9e-11 +_mutation_rate_data = {str(i.id): _mutation_rate for i in _chromosomes} + +_recombination_rate = 8.9e-11 +_recombination_rate_data = {str(i.id): _recombination_rate for i in _chromosomes} + +_gene_conversion_length = 542 +_gene_conversion_data = {str(i.id): _gene_conversion_length for i in _chromosomes} +_genome = stdpopsim.Genome.from_data( + genome_data=genome_data.data, + mutation_rate=_mutation_rate_data, + recombination_rate=_recombination_rate_data, + gene_conversion_length=_gene_conversion_data, bacterial_recombination=True, - assembly_name=genome_data.data["assembly_name"], - assembly_accession=genome_data.data["assembly_accession"], + ploidy=_ploidy_data, citations=[ _wielgoss_et_al.because( {stdpopsim.CiteReason.MUT_RATE, stdpopsim.CiteReason.GENE_CONVERSION} diff --git a/stdpopsim/catalog/GasAcu/genome_data.py b/stdpopsim/catalog/GasAcu/genome_data.py index a72b633e3..19ac50c2b 100644 --- a/stdpopsim/catalog/GasAcu/genome_data.py +++ b/stdpopsim/catalog/GasAcu/genome_data.py @@ -27,4 +27,6 @@ "Y": {"length": 15859692, "synonyms": []}, "MT": {"length": 16543, "synonyms": []}, }, + "assembly_source": "ensembl", + "assembly_build_version": "103", } diff --git a/stdpopsim/catalog/HelAnn/genome_data.py b/stdpopsim/catalog/HelAnn/genome_data.py index e811a0e7a..ce881fe38 100644 --- a/stdpopsim/catalog/HelAnn/genome_data.py +++ b/stdpopsim/catalog/HelAnn/genome_data.py @@ -21,4 +21,6 @@ "16": {"length": 206736614, "synonyms": []}, "17": {"length": 195042445, "synonyms": []}, }, + "assembly_source": "ensembl", + "assembly_build_version": "113", } diff --git a/stdpopsim/catalog/HelMel/genome_data.py b/stdpopsim/catalog/HelMel/genome_data.py index 4da71fa20..9508c0cd2 100644 --- a/stdpopsim/catalog/HelMel/genome_data.py +++ b/stdpopsim/catalog/HelMel/genome_data.py @@ -34,4 +34,6 @@ "20": {"length": 14871695, "synonyms": []}, "21": {"length": 13359691, "synonyms": []}, }, + "assembly_source": "manual", + "assembly_build_version": None, } diff --git a/stdpopsim/catalog/HomSap/genome_data.py b/stdpopsim/catalog/HomSap/genome_data.py index 4e3fcdc3b..65d9c9a68 100644 --- a/stdpopsim/catalog/HomSap/genome_data.py +++ b/stdpopsim/catalog/HomSap/genome_data.py @@ -29,4 +29,6 @@ "Y": {"length": 57227415, "synonyms": ["chrY"]}, "MT": {"length": 16569, "synonyms": ["chrM"]}, }, + "assembly_source": "ensembl", + "assembly_build_version": "113", } diff --git a/stdpopsim/catalog/MusMus/genome_data.py b/stdpopsim/catalog/MusMus/genome_data.py index 5b7b0cc28..dcd47445e 100644 --- a/stdpopsim/catalog/MusMus/genome_data.py +++ b/stdpopsim/catalog/MusMus/genome_data.py @@ -26,4 +26,6 @@ "Y": {"length": 91455967, "synonyms": ["chrY"]}, "MT": {"length": 16299, "synonyms": ["chrM"]}, }, + "assembly_source": "ensembl", + "assembly_build_version": "113", } diff --git a/stdpopsim/catalog/OrySat/genome_data.py b/stdpopsim/catalog/OrySat/genome_data.py index e797d5b25..12d124b40 100644 --- a/stdpopsim/catalog/OrySat/genome_data.py +++ b/stdpopsim/catalog/OrySat/genome_data.py @@ -18,4 +18,6 @@ "11": {"length": 29021106, "synonyms": []}, "12": {"length": 27531856, "synonyms": []}, }, + "assembly_source": "ensembl", + "assembly_build_version": "113", } diff --git a/stdpopsim/catalog/PanTro/genome_data.py b/stdpopsim/catalog/PanTro/genome_data.py index 49b8d80f7..1d05a1af7 100644 --- a/stdpopsim/catalog/PanTro/genome_data.py +++ b/stdpopsim/catalog/PanTro/genome_data.py @@ -29,4 +29,6 @@ "X": {"length": 155549662, "synonyms": ["chrX"]}, "Y": {"length": 26350515, "synonyms": ["chrY"]}, }, + "assembly_source": "ensembl", + "assembly_build_version": "103", } diff --git a/stdpopsim/catalog/PapAnu/genome_data.py b/stdpopsim/catalog/PapAnu/genome_data.py index 6cb80f204..dc180dcab 100644 --- a/stdpopsim/catalog/PapAnu/genome_data.py +++ b/stdpopsim/catalog/PapAnu/genome_data.py @@ -26,4 +26,6 @@ "X": {"length": 142711496, "synonyms": []}, "Y": {"length": 8309886, "synonyms": []}, }, + "assembly_source": "ensembl", + "assembly_build_version": "113", } diff --git a/stdpopsim/catalog/PhoSin/genome_data.py b/stdpopsim/catalog/PhoSin/genome_data.py index decb72cfb..a1464a84c 100644 --- a/stdpopsim/catalog/PhoSin/genome_data.py +++ b/stdpopsim/catalog/PhoSin/genome_data.py @@ -26,4 +26,6 @@ "21": {"length": 35802323, "synonyms": []}, "X": {"length": 131664873, "synonyms": []}, }, + "assembly_source": "ensembl", + "assembly_build_version": "113", } diff --git a/stdpopsim/catalog/PonAbe/genome_data.py b/stdpopsim/catalog/PonAbe/genome_data.py index a329cef0a..5fc8bb1b8 100644 --- a/stdpopsim/catalog/PonAbe/genome_data.py +++ b/stdpopsim/catalog/PonAbe/genome_data.py @@ -33,4 +33,6 @@ # Mitochondria absent in ponAbe3, so length taken from ponAbe2. "MT": {"length": 16499, "synonyms": []}, }, + "assembly_source": "manual", + "assembly_build_version": None, } diff --git a/stdpopsim/catalog/StrAga/genome_data.py b/stdpopsim/catalog/StrAga/genome_data.py index 7e70d9156..530088335 100644 --- a/stdpopsim/catalog/StrAga/genome_data.py +++ b/stdpopsim/catalog/StrAga/genome_data.py @@ -3,4 +3,6 @@ "assembly_accession": "GCA_000689235.1", "assembly_name": "GBCO_p1", "chromosomes": {"1": {"length": 2065074, "synonyms": ["I"]}}, + "assembly_source": "ensembl", + "assembly_build_version": "103", } diff --git a/stdpopsim/catalog/__init__.py b/stdpopsim/catalog/__init__.py index a7d879f52..0b016ddbd 100644 --- a/stdpopsim/catalog/__init__.py +++ b/stdpopsim/catalog/__init__.py @@ -1,6 +1,5 @@ import pathlib -from . import ensembl_info # noqa: F401 # Import all species definitions in the catalog. __all__ = [] diff --git a/stdpopsim/catalog/ensembl_info.py b/stdpopsim/catalog/ensembl_info.py deleted file mode 100644 index 4e4f9d439..000000000 --- a/stdpopsim/catalog/ensembl_info.py +++ /dev/null @@ -1,2 +0,0 @@ -# File autogenerated from Ensembl REST API. Do not edit. -release = 113 diff --git a/stdpopsim/genomes.py b/stdpopsim/genomes.py index 1a2801cae..129b8b011 100644 --- a/stdpopsim/genomes.py +++ b/stdpopsim/genomes.py @@ -45,6 +45,10 @@ class Genome: :vartype assembly_name: str :ivar assembly_accession: The ID of the genome assembly accession. :vartype assembly_accession: str + :ivar assembly_source: The source of the genome assembly data. + :vartype assembly_source: str + :ivar assembly_build_version: The version of the genome assembly build. + :vartype assembly_build_version: str :ivar bacterial_recombination: Whether recombination is via horizontal gene transfer (if this is True) or via crossing-over and possibly gene conversion (if this is False). Default: False. @@ -60,6 +64,8 @@ class Genome: chromosomes = attr.ib(factory=list) assembly_name = attr.ib(type=str, default=None, kw_only=True) assembly_accession = attr.ib(type=str, default=None, kw_only=True) + assembly_source = attr.ib(type=str, default=None, kw_only=True) + assembly_build_version = attr.ib(type=str, default=None, kw_only=True) bacterial_recombination = attr.ib(type=bool, default=False, kw_only=True) citations = attr.ib(factory=list, kw_only=True) @@ -110,6 +116,8 @@ def from_data( chromosomes=chromosomes, assembly_name=genome_data["assembly_name"], assembly_accession=genome_data["assembly_accession"], + assembly_source=genome_data["assembly_source"], + assembly_build_version=genome_data["assembly_build_version"], bacterial_recombination=bacterial_recombination, citations=citations, ) diff --git a/tests/test_AedAeg.py b/tests/test_AedAeg.py index 6e9de1d5b..359f2d20d 100644 --- a/tests/test_AedAeg.py +++ b/tests/test_AedAeg.py @@ -49,3 +49,9 @@ def test_chromosome_ploidy(self, chrom): assert chrom.ploidy == 1 else: assert chrom.ploidy == 2 + + def test_assembly_source(self): + assert self.genome.assembly_source == "ensembl" + + def test_assembly_build_version(self): + assert self.genome.assembly_build_version is not None diff --git a/tests/test_AnaPla.py b/tests/test_AnaPla.py index 1807bde81..fd539c355 100644 --- a/tests/test_AnaPla.py +++ b/tests/test_AnaPla.py @@ -170,3 +170,9 @@ def test_bacterial_recombination(self): @pytest.mark.parametrize("chrom", [chrom for chrom in genome.chromosomes]) def test_chromosome_ploidy(self, chrom): assert chrom.ploidy == 2 + + def test_assembly_source(self): + assert self.genome.assembly_source == "ensembl" + + def test_assembly_build_version(self): + assert self.genome.assembly_build_version is not None diff --git a/tests/test_AnoCar.py b/tests/test_AnoCar.py index baec5e45e..c6e7df13a 100644 --- a/tests/test_AnoCar.py +++ b/tests/test_AnoCar.py @@ -84,3 +84,9 @@ def test_chromosome_ploidy(self, chrom): assert chrom.ploidy == 1 else: assert chrom.ploidy == 2 + + def test_assembly_source(self): + assert self.genome.assembly_source == "ensembl" + + def test_assembly_build_version(self): + assert self.genome.assembly_build_version is not None diff --git a/tests/test_AnoGam.py b/tests/test_AnoGam.py index 79f705550..a6591acce 100644 --- a/tests/test_AnoGam.py +++ b/tests/test_AnoGam.py @@ -73,3 +73,9 @@ def test_chromosome_ploidy(self, chrom): assert chrom.ploidy == 1 else: assert chrom.ploidy == 2 + + def test_assembly_source(self): + assert self.genome.assembly_source == "ensembl" + + def test_assembly_build_version(self): + assert self.genome.assembly_build_version is not None diff --git a/tests/test_ApiMel.py b/tests/test_ApiMel.py index b111260b4..e4ec69544 100644 --- a/tests/test_ApiMel.py +++ b/tests/test_ApiMel.py @@ -86,3 +86,9 @@ def test_chromosome_ploidy(self, chrom): assert chrom.ploidy == 1 else: assert chrom.ploidy == 2 + + def test_assembly_source(self): + assert self.genome.assembly_source == "ensembl" + + def test_assembly_build_version(self): + assert self.genome.assembly_build_version is not None diff --git a/tests/test_AraTha.py b/tests/test_AraTha.py index 23feee307..7239ae031 100644 --- a/tests/test_AraTha.py +++ b/tests/test_AraTha.py @@ -28,3 +28,9 @@ def test_chromosome_ploidy(self, chrom): assert chrom.ploidy == 1 else: assert chrom.ploidy == 2 + + def test_assembly_source(self): + assert self.genome.assembly_source == "ensembl" + + def test_assembly_build_version(self): + assert self.genome.assembly_build_version is not None diff --git a/tests/test_BosTau.py b/tests/test_BosTau.py index 40c836463..109e19592 100644 --- a/tests/test_BosTau.py +++ b/tests/test_BosTau.py @@ -165,3 +165,9 @@ def test_chromosome_ploidy(self, chrom): assert chrom.ploidy == 1 else: assert chrom.ploidy == 2 + + def test_assembly_source(self): + assert self.genome.assembly_source == "ensembl" + + def test_assembly_build_version(self): + assert self.genome.assembly_build_version is not None diff --git a/tests/test_CaeEle.py b/tests/test_CaeEle.py index 068043e13..c84ed5fe8 100644 --- a/tests/test_CaeEle.py +++ b/tests/test_CaeEle.py @@ -107,3 +107,9 @@ def test_chromosome_ploidy(self, chrom): assert chrom.ploidy == 1 else: assert chrom.ploidy == 2 + + def test_assembly_source(self): + assert self.genome.assembly_source == "ensembl" + + def test_assembly_build_version(self): + assert self.genome.assembly_build_version is not None diff --git a/tests/test_CanFam.py b/tests/test_CanFam.py index ff5a53662..3a46594a7 100644 --- a/tests/test_CanFam.py +++ b/tests/test_CanFam.py @@ -158,3 +158,9 @@ def test_chromosome_ploidy(self, chrom): assert chrom.ploidy == 1 else: assert chrom.ploidy == 2 + + def test_assembly_source(self): + assert self.genome.assembly_source == "ensembl" + + def test_assembly_build_version(self): + assert self.genome.assembly_build_version is not None diff --git a/tests/test_ChlRei.py b/tests/test_ChlRei.py index 19190c8fa..578558916 100644 --- a/tests/test_ChlRei.py +++ b/tests/test_ChlRei.py @@ -83,3 +83,9 @@ def test_mutation_rate(self, name, rate): @pytest.mark.parametrize("chrom", [chrom for chrom in genome.chromosomes]) def test_chromosome_ploidy(self, chrom): assert chrom.ploidy == 1 + + def test_assembly_source(self): + assert self.genome.assembly_source == "ensembl" + + def test_assembly_build_version(self): + assert self.genome.assembly_build_version is not None diff --git a/tests/test_DroMel.py b/tests/test_DroMel.py index fa59e322a..5d9af15e4 100644 --- a/tests/test_DroMel.py +++ b/tests/test_DroMel.py @@ -44,3 +44,9 @@ def test_chromosome_ploidy(self, chrom): assert chrom.ploidy == 1 else: assert chrom.ploidy == 2 + + def test_assembly_source(self): + assert self.genome.assembly_source == "ensembl" + + def test_assembly_build_version(self): + assert self.genome.assembly_build_version is not None diff --git a/tests/test_DroSec.py b/tests/test_DroSec.py index cfb4978e7..f91d356a4 100644 --- a/tests/test_DroSec.py +++ b/tests/test_DroSec.py @@ -61,3 +61,9 @@ def test_mutation_rate(self, name, rate): @pytest.mark.parametrize("chrom", [chrom for chrom in genome.chromosomes]) def test_chromosome_ploidy(self, chrom): assert chrom.ploidy == 2 + + def test_assembly_source(self): + assert self.genome.assembly_source == "manual" + + def test_assembly_build_version(self): + assert self.genome.assembly_build_version is None diff --git a/tests/test_EscCol.py b/tests/test_EscCol.py index a17d0aed5..8081186ae 100644 --- a/tests/test_EscCol.py +++ b/tests/test_EscCol.py @@ -44,3 +44,9 @@ def test_bacterial_recombination(self): @pytest.mark.parametrize("chrom", [chrom for chrom in genome.chromosomes]) def test_chromosome_ploidy(self, chrom): assert chrom.ploidy == 1 + + def test_assembly_source(self): + assert self.genome.assembly_source == "ensembl" + + def test_assembly_build_version(self): + assert self.genome.assembly_build_version is not None diff --git a/tests/test_ensembl.py b/tests/test_ensembl.py deleted file mode 100644 index 496b6dfc7..000000000 --- a/tests/test_ensembl.py +++ /dev/null @@ -1,7 +0,0 @@ -import stdpopsim - - -# Make sure we don't update the release without realising it. -def test_version(): - release = stdpopsim.catalog.ensembl_info.release - assert release == 113 From 1a9b88106a7ce13977ba18409f38e8770463b2b9 Mon Sep 17 00:00:00 2001 From: andrewkern Date: Thu, 9 Jan 2025 16:29:43 -0800 Subject: [PATCH 04/11] added test template stubs for new attributes --- maintenance/main.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/maintenance/main.py b/maintenance/main.py index 8de710488..b5453d5b0 100644 --- a/maintenance/main.py +++ b/maintenance/main.py @@ -132,6 +132,12 @@ def test_name(self): def test_common_name(self): assert self.species.common_name == "$common_name" + def test_assembly_source(self): + assert self.genome.assembly_source == "$assembly_source" + + def test_assembly_build_version(self): + assert self.genome.assembly_build_version == "$assembly_build_version" + # QC Tests. These tests are performed by another contributor # independently referring to the citations provided in the # species definition, filling in the appropriate values From d120909fe3713ffd306d3ac81e9e408578f0c925 Mon Sep 17 00:00:00 2001 From: Andrew Kern Date: Mon, 13 Jan 2025 07:03:02 -0800 Subject: [PATCH 05/11] Update maintenance/main.py Co-authored-by: Peter Ralph --- maintenance/main.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/maintenance/main.py b/maintenance/main.py index b5453d5b0..9922fd783 100644 --- a/maintenance/main.py +++ b/maintenance/main.py @@ -320,7 +320,9 @@ def write_genome_data(self, ensembl_id): ) if existing_assembly_source != "ensembl": logger.info( - f"Skipping {sps_id} ({ensembl_id}): non-Ensembl assembly source" + f"Skipping {sps_id} ({ensembl_id}): existing genome_data.py has data " + f"not from Ensembl. (Re)move {genome_data_path}, re-run, and look " + "at a diff to compare to current Ensembl data." ) return ("manual", None) except Exception as e: From 18165af210d1dc863c8b8eb122a18ec59bdb46c3 Mon Sep 17 00:00:00 2001 From: Andrew Kern Date: Mon, 13 Jan 2025 07:04:47 -0800 Subject: [PATCH 06/11] Update species.py --- stdpopsim/catalog/CanFam/species.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stdpopsim/catalog/CanFam/species.py b/stdpopsim/catalog/CanFam/species.py index 1e6ba8726..96c61b018 100644 --- a/stdpopsim/catalog/CanFam/species.py +++ b/stdpopsim/catalog/CanFam/species.py @@ -98,7 +98,7 @@ _mutation_rate = 4e-9 _mutation_rate_data = {str(i): _mutation_rate for i in range(1, 39)} _mutation_rate_data["MT"] = ( - _mutation_rate # note this is likely incorrect but consistent with current setup + _mutation_rate ) _mutation_rate_data["X"] = _mutation_rate From 35b8c711d383343ddf74c60218a7ccf229ed2599 Mon Sep 17 00:00:00 2001 From: andrewkern Date: Mon, 13 Jan 2025 10:30:44 -0800 Subject: [PATCH 07/11] Peter's edits to main maintenance --- maintenance/main.py | 109 ++++++++++++++++++++------------------------ 1 file changed, 49 insertions(+), 60 deletions(-) diff --git a/maintenance/main.py b/maintenance/main.py index 9922fd783..5c6bee9f5 100644 --- a/maintenance/main.py +++ b/maintenance/main.py @@ -183,6 +183,8 @@ def black_format(code): def ensembl_stdpopsim_id(ensembl_id): + # below is do deal with name changes in Ensembl + # TODO: remove this once we have moved to the new names if ensembl_id == "canis_lupus_familiaris": ensembl_id = "canis_familiaris" elif ensembl_id == "escherichia_coli_str_k_12_substr_mg1655_gca_000005845": @@ -308,26 +310,32 @@ def write_genome_data(self, ensembl_id): ) genome_data_path = path / "genome_data.py" - # Check if file exists and has non-Ensembl assembly source + existing_data = None + + # Try to read existing genome data file once if genome_data_path.exists(): try: - # Get existing data namespace = {} with open(genome_data_path) as f: exec(f.read(), namespace) - existing_assembly_source = namespace["data"].get( + existing_data = namespace["data"] + + # Check for non-Ensembl assembly source + existing_assembly_source = existing_data.get( "assembly_source", "ensembl" ) if existing_assembly_source != "ensembl": logger.info( - f"Skipping {sps_id} ({ensembl_id}): existing genome_data.py has data " - f"not from Ensembl. (Re)move {genome_data_path}, re-run, and look " - "at a diff to compare to current Ensembl data." + f"Skipping {sps_id} ({ensembl_id}): " + f"existing genome_data.py has data " + f"not from Ensembl. (Re)move {genome_data_path}, " + f"re-run, and look" + f"at a diff to compare to current Ensembl data." ) return ("manual", None) except Exception as e: logger.warning( - f"Error checking assembly source for {sps_id}: {e}. " + f"Error reading genome data for {sps_id}: {e}. " "Proceeding with update." ) @@ -335,45 +343,30 @@ def write_genome_data(self, ensembl_id): data = self.ensembl_client.get_genome_data(ensembl_id) # Preserve existing assembly source or default to "ensembl" - if genome_data_path.exists(): - try: - namespace = {} - with open(genome_data_path) as f: - exec(f.read(), namespace) - data["assembly_source"] = namespace["data"].get( - "assembly_source", "ensembl" - ) - except Exception: - data["assembly_source"] = "ensembl" - else: - data["assembly_source"] = "ensembl" + data["assembly_source"] = ( + existing_data.get("assembly_source", "ensembl") + if existing_data + else "ensembl" + ) # Add Ensembl version number if assembly source is Ensembl - if data["assembly_source"] == "ensembl": - data["assembly_build_version"] = str(self.ensembl_client.get_release()) - else: - data["assembly_build_version"] = None + data["assembly_build_version"] = ( + str(self.ensembl_client.get_release()) + if data["assembly_source"] == "ensembl" + else None + ) - # Check if existing genome data exists and compare chromosome names - if genome_data_path.exists(): - try: - # Get existing chromosome names - namespace = {} - with open(genome_data_path) as f: - exec(f.read(), namespace) - existing_chroms = set(namespace["data"]["chromosomes"].keys()) - new_chroms = set(data["chromosomes"].keys()) + # Check for chromosome name mismatches if we have existing data + if existing_data: + existing_chroms = set(existing_data["chromosomes"].keys()) + new_chroms = set(data["chromosomes"].keys()) - if existing_chroms != new_chroms: - logger.warning( - f"Skipping {sps_id} ({ensembl_id}): chromosome names mismatch." - ) - return ("chrom_mismatch", (existing_chroms, new_chroms)) - except Exception as e: + if existing_chroms != new_chroms: logger.warning( - f"Error comparing chromosome names for {sps_id}: {e}. \ - Proceeding with update." + f"Skipping {sps_id} ({ensembl_id}): chromosome names in existing " + "genome_data.py do not match those in current Ensembl release. " ) + return ("chrom_mismatch", (existing_chroms, new_chroms)) logger.info(f"Writing genome data for {sps_id} {ensembl_id}") code = f"data = {data}" @@ -467,25 +460,24 @@ def update_genome_data(species): for species_id, eid in embl_ids: try: result = writer.write_genome_data(eid) - if result is not None: - status, details = result - if status == "manual": - skipped_species.append( - (species_id, eid, "Manually created genome data file") - ) - elif status == "chrom_mismatch": - existing_chroms, new_chroms = details - skipped_species.append( + status, details = result + if status == "manual": + skipped_species.append( + (species_id, eid, "Manually created genome data file") + ) + elif status == "chrom_mismatch": + existing_chroms, new_chroms = details + skipped_species.append( + ( + species_id, + eid, ( - species_id, - eid, - ( - f"Chromosome names mismatch.\n" - f"Existing: {sorted(existing_chroms)}\n" - f"New: {sorted(new_chroms)}" - ), - ) + f"Chromosome names mismatch.\n" + f"Existing: {sorted(existing_chroms)}\n" + f"New: {sorted(new_chroms)}" + ), ) + ) except ValueError as e: logger.error(f"Error processing {eid}: {e}") skipped_species.append((species_id, eid, str(e))) @@ -493,8 +485,6 @@ def update_genome_data(species): logger.error(f"Unexpected error processing {eid}: {e}") skipped_species.append((species_id, eid, f"Unexpected error: {str(e)}")) - # writer.write_ensembl_release() - # Add summary report at the end if skipped_species: logger.warning("\n=== Species Update Summary ===") @@ -522,7 +512,6 @@ def add_species(ensembl_id, force): """ writer = DataWriter() writer.add_species(ensembl_id.lower(), force=force) - # writer.write_ensembl_release() # TODO refactor this so that it's an option to add-species. By default From c2f14c840cb7f4de36f8314db57c253f5e89cdb0 Mon Sep 17 00:00:00 2001 From: andrewkern Date: Mon, 13 Jan 2025 10:39:00 -0800 Subject: [PATCH 08/11] clean up of chrom definitions --- stdpopsim/catalog/BosTau/species.py | 14 -------------- stdpopsim/catalog/CaeEle/species.py | 17 ----------------- stdpopsim/catalog/CanFam/species.py | 17 +---------------- 3 files changed, 1 insertion(+), 47 deletions(-) diff --git a/stdpopsim/catalog/BosTau/species.py b/stdpopsim/catalog/BosTau/species.py index 09a38612f..104e1e0e7 100644 --- a/stdpopsim/catalog/BosTau/species.py +++ b/stdpopsim/catalog/BosTau/species.py @@ -76,20 +76,6 @@ _ploidy = {str(i): _species_ploidy for i in range(1, 30)} _ploidy.update({"X": _species_ploidy, "MT": 1}) -_chromosomes = [] -for name, data in genome_data.data["chromosomes"].items(): - _chromosomes.append( - stdpopsim.Chromosome( - id=name, - length=data["length"], - synonyms=data["synonyms"], - # Harland et al. (2017), sex-averaged estimate per bp per generation. - mutation_rate=1.2e-8, - recombination_rate=_recombination_rate_data[name], - ploidy=_ploidy[name], - ) - ) - _genome = stdpopsim.Genome.from_data( genome_data=genome_data.data, recombination_rate=_recombination_rate_data, diff --git a/stdpopsim/catalog/CaeEle/species.py b/stdpopsim/catalog/CaeEle/species.py index f7002a815..68a5504a6 100644 --- a/stdpopsim/catalog/CaeEle/species.py +++ b/stdpopsim/catalog/CaeEle/species.py @@ -106,23 +106,6 @@ "MtDNA": 1, } -_chromosomes = [] -for name, data in genome_data.data["chromosomes"].items(): - _chromosomes.append( - stdpopsim.Chromosome( - id=name, - length=data["length"], - synonyms=data["synonyms"], - mutation_rate=_mutation_rate_data[name], - # _Konrad et al. de-nove mutation rate, - # it's not uniform and it's much better to use - # a mutation map. - # mutation_rate=_mutation_rate_data[name], - recombination_rate=_recombination_rate_data[name], - ploidy=_ploidy[name], - ) - ) - _genome = stdpopsim.Genome.from_data( genome_data=genome_data.data, recombination_rate=_recombination_rate_data, diff --git a/stdpopsim/catalog/CanFam/species.py b/stdpopsim/catalog/CanFam/species.py index 96c61b018..809b28235 100644 --- a/stdpopsim/catalog/CanFam/species.py +++ b/stdpopsim/catalog/CanFam/species.py @@ -97,9 +97,7 @@ _mutation_rate = 4e-9 _mutation_rate_data = {str(i): _mutation_rate for i in range(1, 39)} -_mutation_rate_data["MT"] = ( - _mutation_rate -) +_mutation_rate_data["MT"] = _mutation_rate _mutation_rate_data["X"] = _mutation_rate _LindbladTohEtAl = stdpopsim.Citation( @@ -133,19 +131,6 @@ doi="https://doi.org/10.1534/g3.116.034678", ) -_chromosomes = [] -for name, data in genome_data.data["chromosomes"].items(): - _chromosomes.append( - stdpopsim.Chromosome( - id=name, - length=data["length"], - synonyms=data["synonyms"], - mutation_rate=4e-9, # based on non-CpG sites only - recombination_rate=_recombination_rate_data[name], - ploidy=_ploidy[name], - ) - ) - _genome = stdpopsim.Genome.from_data( genome_data=genome_data.data, recombination_rate=_recombination_rate_data, From 44bcae9e0457ffa023cfebec5a8ee6785c9a5392 Mon Sep 17 00:00:00 2001 From: andrewkern Date: Mon, 13 Jan 2025 10:41:57 -0800 Subject: [PATCH 09/11] more helpful variable descriptions in API --- stdpopsim/genomes.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/stdpopsim/genomes.py b/stdpopsim/genomes.py index 129b8b011..31ec93e6f 100644 --- a/stdpopsim/genomes.py +++ b/stdpopsim/genomes.py @@ -46,8 +46,10 @@ class Genome: :ivar assembly_accession: The ID of the genome assembly accession. :vartype assembly_accession: str :ivar assembly_source: The source of the genome assembly data. + (for instance "ensembl"). Use "manual" if manually entered. :vartype assembly_source: str - :ivar assembly_build_version: The version of the genome assembly build. + :ivar assembly_build_version: The version of the genome assembly build, + or "None" if manually entered. :vartype assembly_build_version: str :ivar bacterial_recombination: Whether recombination is via horizontal gene transfer (if this is True) or via crossing-over and possibly gene From 05d32ebc4df23db50ffd10e14d58f0c9d093cc27 Mon Sep 17 00:00:00 2001 From: andrewkern Date: Mon, 13 Jan 2025 10:55:38 -0800 Subject: [PATCH 10/11] removing chromosomes broke recombination data in BosTau --- stdpopsim/catalog/BosTau/species.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/stdpopsim/catalog/BosTau/species.py b/stdpopsim/catalog/BosTau/species.py index 104e1e0e7..97eb09159 100644 --- a/stdpopsim/catalog/BosTau/species.py +++ b/stdpopsim/catalog/BosTau/species.py @@ -68,6 +68,8 @@ _recombination_rate_data = collections.defaultdict( lambda: _genome_wide_recombination_rate ) +for name, data in genome_data.data["chromosomes"].items(): + _recombination_rate_data[name] = _genome_wide_recombination_rate # Set some exceptions for non-recombining chrs. _recombination_rate_data["MT"] = 0 From 36e9395387c74bc0439b26893d423de5d18da7d7 Mon Sep 17 00:00:00 2001 From: andrewkern Date: Tue, 14 Jan 2025 13:57:30 -0800 Subject: [PATCH 11/11] peter's edits 2 --- maintenance/main.py | 5 +++++ stdpopsim/catalog/ApiMel/species.py | 7 +++++-- stdpopsim/catalog/EscCol/species.py | 25 +++++-------------------- 3 files changed, 15 insertions(+), 22 deletions(-) diff --git a/maintenance/main.py b/maintenance/main.py index 5c6bee9f5..882238b2b 100644 --- a/maintenance/main.py +++ b/maintenance/main.py @@ -185,6 +185,11 @@ def black_format(code): def ensembl_stdpopsim_id(ensembl_id): # below is do deal with name changes in Ensembl # TODO: remove this once we have moved to the new names + # this was added as a temporary fix to allow the release to be + # updated to 113, where the names of these species were changed + # in the future we might keep this code block, but comment it out + # to show others how maintenance updates were performed in the case + # where the species name changed in Ensembl if ensembl_id == "canis_lupus_familiaris": ensembl_id = "canis_familiaris" elif ensembl_id == "escherichia_coli_str_k_12_substr_mg1655_gca_000005845": diff --git a/stdpopsim/catalog/ApiMel/species.py b/stdpopsim/catalog/ApiMel/species.py index e5b2e0c76..2506957ba 100644 --- a/stdpopsim/catalog/ApiMel/species.py +++ b/stdpopsim/catalog/ApiMel/species.py @@ -134,8 +134,11 @@ def add_if_unique(chrom, synonyms): for syn in synonyms: - if syn not in chrom.synonyms: - chrom.synonyms.append(syn) + # commented this out for now as this case + # is not occurring and it's messing with code coverage + # if syn not in chrom.synonyms: + # chrom.synonyms.append(syn) + pass for chrom in _genome.chromosomes: diff --git a/stdpopsim/catalog/EscCol/species.py b/stdpopsim/catalog/EscCol/species.py index fa1b66a6c..c2682edac 100644 --- a/stdpopsim/catalog/EscCol/species.py +++ b/stdpopsim/catalog/EscCol/species.py @@ -29,33 +29,18 @@ ) _species_ploidy = 1 -_chromosomes = [] -for name, data in genome_data.data["chromosomes"].items(): - _chromosomes.append( - stdpopsim.Chromosome( - id=name, - length=data["length"], - synonyms=data["synonyms"], - # Wielgoss et al. (2011) calculated for strain REL606, - # from synonymous substitutions over 40,000 generations. - mutation_rate=8.9e-11, - ploidy=_species_ploidy, - recombination_rate=8.9e-11, - gene_conversion_length=542, - ) - ) +_chromosomes_names = genome_data.data["chromosomes"].keys() - -_ploidy_data = {str(i.id): _species_ploidy for i in _chromosomes} +_ploidy_data = {str(i): _species_ploidy for i in _chromosomes_names} _mutation_rate = 8.9e-11 -_mutation_rate_data = {str(i.id): _mutation_rate for i in _chromosomes} +_mutation_rate_data = {str(i): _mutation_rate for i in _chromosomes_names} _recombination_rate = 8.9e-11 -_recombination_rate_data = {str(i.id): _recombination_rate for i in _chromosomes} +_recombination_rate_data = {str(i): _recombination_rate for i in _chromosomes_names} _gene_conversion_length = 542 -_gene_conversion_data = {str(i.id): _gene_conversion_length for i in _chromosomes} +_gene_conversion_data = {str(i): _gene_conversion_length for i in _chromosomes_names} _genome = stdpopsim.Genome.from_data( genome_data=genome_data.data, mutation_rate=_mutation_rate_data,