Skip to content

Commit

Permalink
added assembly attributes to API; lots of associated changes
Browse files (browse the repository at this point in the history)
  • Loading branch information
andrewkern committed Jan 10, 2025
1 parent 2fb9499 commit 71751a0
Show file tree
Hide file tree
Showing 46 changed files with 221 additions and 68 deletions.
46 changes: 38 additions & 8 deletions maintenance/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,20 +302,50 @@ def write_genome_data(self, ensembl_id):
)

genome_data_path = path / "genome_data.py"
# Check if file exists and was manually created
# Check if file exists and has non-Ensembl assembly source
if genome_data_path.exists():
with open(genome_data_path) as f:
first_line = f.readline().strip()
if first_line.startswith("# File created manually"):
try:
# Get existing data
namespace = {}
with open(genome_data_path) as f:
exec(f.read(), namespace)
existing_assembly_source = namespace["data"].get(
"assembly_source", "ensembl"
)
if existing_assembly_source != "ensembl":
logger.info(
f"Skipping {sps_id} ({ensembl_id}): manually created \
genome data file"
f"Skipping {sps_id} ({ensembl_id}): non-Ensembl assembly source"
)
return ("manual", None)
except Exception as e:
logger.warning(
f"Error checking assembly source for {sps_id}: {e}. "
"Proceeding with update."
)

# Get new data from Ensembl
data = self.ensembl_client.get_genome_data(ensembl_id)

# Preserve existing assembly source or default to "ensembl"
if genome_data_path.exists():
try:
namespace = {}
with open(genome_data_path) as f:
exec(f.read(), namespace)
data["assembly_source"] = namespace["data"].get(
"assembly_source", "ensembl"
)
except Exception:
data["assembly_source"] = "ensembl"
else:
data["assembly_source"] = "ensembl"

# Add Ensembl version number if assembly source is Ensembl
if data["assembly_source"] == "ensembl":
data["assembly_build_version"] = str(self.ensembl_client.get_release())
else:
data["assembly_build_version"] = None

# Check if existing genome data exists and compare chromosome names
if genome_data_path.exists():
try:
Expand Down Expand Up @@ -455,7 +485,7 @@ def update_genome_data(species):
logger.error(f"Unexpected error processing {eid}: {e}")
skipped_species.append((species_id, eid, f"Unexpected error: {str(e)}"))

writer.write_ensembl_release()
# writer.write_ensembl_release()

# Add summary report at the end
if skipped_species:
Expand Down Expand Up @@ -484,7 +514,7 @@ def add_species(ensembl_id, force):
"""
writer = DataWriter()
writer.add_species(ensembl_id.lower(), force=force)
writer.write_ensembl_release()
# writer.write_ensembl_release()


# TODO refactor this so that it's an option to add-species. By default
Expand Down
2 changes: 2 additions & 0 deletions stdpopsim/catalog/AedAeg/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,6 @@
"3": {"length": 409777670, "synonyms": []},
"MT": {"length": 16790, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
2 changes: 2 additions & 0 deletions stdpopsim/catalog/AnaPla/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,6 @@
"39": {"length": 2018729, "synonyms": []},
"40": {"length": 1354177, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
2 changes: 2 additions & 0 deletions stdpopsim/catalog/AnoCar/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,6 @@
"LGh": {"length": 248369, "synonyms": []},
"MT": {"length": 17223, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "103",
}
2 changes: 2 additions & 0 deletions stdpopsim/catalog/AnoGam/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,6 @@
"X": {"length": 24393108, "synonyms": []},
"Mt": {"length": 15363, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
2 changes: 2 additions & 0 deletions stdpopsim/catalog/ApiMel/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,6 @@
"CM009946.2": {"length": 7238532, "synonyms": []},
"CM009947.2": {"length": 16343, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
2 changes: 2 additions & 0 deletions stdpopsim/catalog/AraTha/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,6 @@
"Mt": {"length": 366924, "synonyms": []},
"Pt": {"length": 154478, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
2 changes: 2 additions & 0 deletions stdpopsim/catalog/BosTau/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,6 @@
"X": {"length": 139009144, "synonyms": ["chrX"]},
"MT": {"length": 16338, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
48 changes: 13 additions & 35 deletions stdpopsim/catalog/BosTau/species.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@
# 24.35 / 2628394923 = 9.26e-9 per bp per generation.
_genome_wide_recombination_rate = 9.26e-9

# Mutation rate
_mutation_rate = 1.2e-8
_mutation_rate_data = {str(i): _mutation_rate for i in range(1, 30)}
_mutation_rate_data["MT"] = _mutation_rate
_mutation_rate_data["X"] = _mutation_rate

_recombination_rate_data = collections.defaultdict(
lambda: _genome_wide_recombination_rate
)
Expand All @@ -67,39 +73,8 @@

# Generic and chromosome-specific ploidy
_species_ploidy = 2
_ploidy = {
"1": _species_ploidy,
"2": _species_ploidy,
"3": _species_ploidy,
"4": _species_ploidy,
"5": _species_ploidy,
"6": _species_ploidy,
"7": _species_ploidy,
"8": _species_ploidy,
"9": _species_ploidy,
"10": _species_ploidy,
"11": _species_ploidy,
"12": _species_ploidy,
"13": _species_ploidy,
"14": _species_ploidy,
"15": _species_ploidy,
"16": _species_ploidy,
"17": _species_ploidy,
"18": _species_ploidy,
"19": _species_ploidy,
"20": _species_ploidy,
"21": _species_ploidy,
"22": _species_ploidy,
"23": _species_ploidy,
"24": _species_ploidy,
"25": _species_ploidy,
"26": _species_ploidy,
"27": _species_ploidy,
"28": _species_ploidy,
"29": _species_ploidy,
"X": _species_ploidy,
"MT": 1,
}
_ploidy = {str(i): _species_ploidy for i in range(1, 30)}
_ploidy.update({"X": _species_ploidy, "MT": 1})

_chromosomes = []
for name, data in genome_data.data["chromosomes"].items():
Expand All @@ -115,8 +90,11 @@
)
)

_genome = stdpopsim.Genome(
chromosomes=_chromosomes,
_genome = stdpopsim.Genome.from_data(
genome_data=genome_data.data,
recombination_rate=_recombination_rate_data,
mutation_rate=_mutation_rate_data,
ploidy=_ploidy,
citations=[
_HoweEtAl, # ASSEMBLY
_RosenEtAl, # ASSEMBLY
Expand Down
2 changes: 2 additions & 0 deletions stdpopsim/catalog/CaeEle/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,6 @@
"X": {"length": 17718942, "synonyms": []},
"MtDNA": {"length": 13794, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
9 changes: 5 additions & 4 deletions stdpopsim/catalog/CaeEle/species.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,10 +123,11 @@
)
)

_genome = stdpopsim.Genome(
chromosomes=_chromosomes,
assembly_name=genome_data.data["assembly_name"],
assembly_accession=genome_data.data["assembly_accession"],
_genome = stdpopsim.Genome.from_data(
genome_data=genome_data.data,
recombination_rate=_recombination_rate_data,
mutation_rate=_mutation_rate_data,
ploidy=_ploidy,
citations=[
_genome1998,
_KonradEtAl2019.because(stdpopsim.CiteReason.MUT_RATE),
Expand Down
2 changes: 2 additions & 0 deletions stdpopsim/catalog/CanFam/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,6 @@
"X": {"length": 123869142, "synonyms": []},
"MT": {"length": 16727, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "103",
}
20 changes: 14 additions & 6 deletions stdpopsim/catalog/CanFam/species.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
"38": 1.4363726512881696e-08,
"X": 9.506483722244087e-09,
"MT": 0,
"Y": 0, # manually set to 0 because it's not in the map
# "Y": 0, # removed because not in the map or assembly
}

# Generic and chromosome-specific ploidy
Expand Down Expand Up @@ -92,9 +92,16 @@
"38": _species_ploidy,
"X": _species_ploidy,
"MT": 1,
"Y": 1,
# "Y": 1, removed because it's not in the map
}

_mutation_rate = 4e-9
_mutation_rate_data = {str(i): _mutation_rate for i in range(1, 39)}
_mutation_rate_data["MT"] = (
_mutation_rate # note this is likely incorrect but consistent with current setup
)
_mutation_rate_data["X"] = _mutation_rate

_LindbladTohEtAl = stdpopsim.Citation(
# Genome sequence, comparative analysis and haplotype structure of the
# domestic dog.
Expand Down Expand Up @@ -139,10 +146,11 @@
)
)

_genome = stdpopsim.Genome(
chromosomes=_chromosomes,
assembly_name=genome_data.data["assembly_name"],
assembly_accession=genome_data.data["assembly_accession"],
_genome = stdpopsim.Genome.from_data(
genome_data=genome_data.data,
recombination_rate=_recombination_rate_data,
mutation_rate=_mutation_rate_data,
ploidy=_ploidy,
citations=[
_SkoglundEtAl.because(stdpopsim.CiteReason.MUT_RATE),
_FrantzEtAl.because(stdpopsim.CiteReason.MUT_RATE),
Expand Down
2 changes: 2 additions & 0 deletions stdpopsim/catalog/ChlRei/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,6 @@
"16": {"length": 7783580, "synonyms": []},
"17": {"length": 7188315, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
2 changes: 2 additions & 0 deletions stdpopsim/catalog/DroMel/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,6 @@
"Y": {"length": 3667352, "synonyms": []},
"mitochondrion_genome": {"length": 19524, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
2 changes: 2 additions & 0 deletions stdpopsim/catalog/DroSec/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,6 @@
"X": {"length": 22909512, "synonyms": []},
"4": {"length": 1277805, "synonyms": []},
},
"assembly_source": "manual",
"assembly_build_version": None,
}
2 changes: 2 additions & 0 deletions stdpopsim/catalog/EscCol/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,6 @@
"assembly_accession": "GCA_000005845.2",
"assembly_name": "ASM584v2",
"chromosomes": {"Chromosome": {"length": 4641652, "synonyms": []}},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
22 changes: 17 additions & 5 deletions stdpopsim/catalog/EscCol/species.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
)

_species_ploidy = 1

_chromosomes = []
for name, data in genome_data.data["chromosomes"].items():
_chromosomes.append(
Expand All @@ -46,11 +45,24 @@
)
)

_genome = stdpopsim.Genome(
chromosomes=_chromosomes,

_ploidy_data = {str(i.id): _species_ploidy for i in _chromosomes}

_mutation_rate = 8.9e-11
_mutation_rate_data = {str(i.id): _mutation_rate for i in _chromosomes}

_recombination_rate = 8.9e-11
_recombination_rate_data = {str(i.id): _recombination_rate for i in _chromosomes}

_gene_conversion_length = 542
_gene_conversion_data = {str(i.id): _gene_conversion_length for i in _chromosomes}
_genome = stdpopsim.Genome.from_data(
genome_data=genome_data.data,
mutation_rate=_mutation_rate_data,
recombination_rate=_recombination_rate_data,
gene_conversion_length=_gene_conversion_data,
bacterial_recombination=True,
assembly_name=genome_data.data["assembly_name"],
assembly_accession=genome_data.data["assembly_accession"],
ploidy=_ploidy_data,
citations=[
_wielgoss_et_al.because(
{stdpopsim.CiteReason.MUT_RATE, stdpopsim.CiteReason.GENE_CONVERSION}
Expand Down
2 changes: 2 additions & 0 deletions stdpopsim/catalog/GasAcu/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,6 @@
"Y": {"length": 15859692, "synonyms": []},
"MT": {"length": 16543, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "103",
}
2 changes: 2 additions & 0 deletions stdpopsim/catalog/HelAnn/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,6 @@
"16": {"length": 206736614, "synonyms": []},
"17": {"length": 195042445, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
2 changes: 2 additions & 0 deletions stdpopsim/catalog/HelMel/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,6 @@
"20": {"length": 14871695, "synonyms": []},
"21": {"length": 13359691, "synonyms": []},
},
"assembly_source": "manual",
"assembly_build_version": None,
}
2 changes: 2 additions & 0 deletions stdpopsim/catalog/HomSap/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,6 @@
"Y": {"length": 57227415, "synonyms": ["chrY"]},
"MT": {"length": 16569, "synonyms": ["chrM"]},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
2 changes: 2 additions & 0 deletions stdpopsim/catalog/MusMus/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,6 @@
"Y": {"length": 91455967, "synonyms": ["chrY"]},
"MT": {"length": 16299, "synonyms": ["chrM"]},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
2 changes: 2 additions & 0 deletions stdpopsim/catalog/OrySat/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,6 @@
"11": {"length": 29021106, "synonyms": []},
"12": {"length": 27531856, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
2 changes: 2 additions & 0 deletions stdpopsim/catalog/PanTro/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,6 @@
"X": {"length": 155549662, "synonyms": ["chrX"]},
"Y": {"length": 26350515, "synonyms": ["chrY"]},
},
"assembly_source": "ensembl",
"assembly_build_version": "103",
}
2 changes: 2 additions & 0 deletions stdpopsim/catalog/PapAnu/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,6 @@
"X": {"length": 142711496, "synonyms": []},
"Y": {"length": 8309886, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
2 changes: 2 additions & 0 deletions stdpopsim/catalog/PhoSin/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,6 @@
"21": {"length": 35802323, "synonyms": []},
"X": {"length": 131664873, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
Loading

0 comments on commit 71751a0

Please sign in to comment.