Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

changes to maintenance script / ensembl push #1646

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
142 changes: 131 additions & 11 deletions maintenance/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,12 @@ def test_name(self):
def test_common_name(self):
assert self.species.common_name == "$common_name"

def test_assembly_source(self):
assert self.genome.assembly_source == "$assembly_source"

def test_assembly_build_version(self):
assert self.genome.assembly_build_version == "$assembly_build_version"

# QC Tests. These tests are performed by another contributor
# independently referring to the citations provided in the
# species definition, filling in the appropriate values
Expand Down Expand Up @@ -177,6 +183,12 @@ def black_format(code):


def ensembl_stdpopsim_id(ensembl_id):
# below is to deal with name changes in Ensembl
# TODO: remove this once we have moved to the new names
if ensembl_id == "canis_lupus_familiaris":
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

perhaps insert a comment explaining what this is doing?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean, explaining why this is necessary

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And, actually, I don't understand why it's necessary. I see below that now species.ensembl_id == "canis_lupus_familiaris", so where does ensembl_id equal "canis_familiaris"? Just missing something here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hah this is because of the transition that happened when running the maintenance script! now i bet i can take it out

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay - could you either take this out, or say in the comment exactly what needs to happen to take it out? (It says "once we have moved to the new names" - but: moved what names? where? I am confused. =)

ensembl_id = "canis_familiaris"
elif ensembl_id == "escherichia_coli_str_k_12_substr_mg1655_gca_000005845":
ensembl_id = "escherichia_coli"
tmp = ensembl_id.split("_")[:2]
sps_id = "".join([x[0:3].capitalize() for x in tmp])
if len(sps_id) != 6:
Expand Down Expand Up @@ -296,15 +308,73 @@ def write_genome_data(self, ensembl_id):
raise ValueError(
f"Directory {id} corresponding to {ensembl_id} does" + "not exist"
)
logger.info(f"Writing genome data for {sps_id} {ensembl_id}")
path = path / "genome_data.py"

genome_data_path = path / "genome_data.py"
existing_data = None

# Try to read existing genome data file once
if genome_data_path.exists():
try:
namespace = {}
with open(genome_data_path) as f:
exec(f.read(), namespace)
existing_data = namespace["data"]

# Check for non-Ensembl assembly source
existing_assembly_source = existing_data.get(
"assembly_source", "ensembl"
)
if existing_assembly_source != "ensembl":
logger.info(
f"Skipping {sps_id} ({ensembl_id}): "
f"existing genome_data.py has data "
f"not from Ensembl. (Re)move {genome_data_path}, "
f"re-run, and look"
f"at a diff to compare to current Ensembl data."
)
return ("manual", None)
except Exception as e:
logger.warning(
f"Error reading genome data for {sps_id}: {e}. "
"Proceeding with update."
)

# Get new data from Ensembl
data = self.ensembl_client.get_genome_data(ensembl_id)

# Preserve existing assembly source or default to "ensembl"
data["assembly_source"] = (
existing_data.get("assembly_source", "ensembl")
if existing_data
else "ensembl"
)

# Add Ensembl version number if assembly source is Ensembl
data["assembly_build_version"] = (
str(self.ensembl_client.get_release())
if data["assembly_source"] == "ensembl"
else None
)

# Check for chromosome name mismatches if we have existing data
if existing_data:
existing_chroms = set(existing_data["chromosomes"].keys())
new_chroms = set(data["chromosomes"].keys())

if existing_chroms != new_chroms:
logger.warning(
f"Skipping {sps_id} ({ensembl_id}): chromosome names in existing "
"genome_data.py do not match those in current Ensembl release. "
)
return ("chrom_mismatch", (existing_chroms, new_chroms))

logger.info(f"Writing genome data for {sps_id} {ensembl_id}")
code = f"data = {data}"

# Format the code with Black so we don't get noisy diffs
with self.write(path) as f:
with self.write(genome_data_path) as f:
f.write(black_format(code))
return data
return ("updated", None)
petrelharp marked this conversation as resolved.
Show resolved Hide resolved

def write_genome_data_ncbi(self, ncbi_id, sps_id):
path = catalog_path(sps_id)
Expand Down Expand Up @@ -371,15 +441,66 @@ def update_genome_data(species):
will update the genome data for humans. Multiple species can
be specified. By default all species are updated.
"""
# TODO make this work for NCBI as well
# Track warnings and errors
skipped_species = []

# Original species processing logic
if len(species) == 0:
embl_ids = [s.ensembl_id for s in stdpopsim.all_species()]
species_list = list(stdpopsim.all_species())
logger.info(f"Found {len(species_list)} species in catalog")
embl_ids = []
for s in species_list:
logger.info(f"Processing {s.id}: ensembl_id={s.ensembl_id}")
embl_ids.append((s.id, s.ensembl_id))
else:
embl_ids = [s.lower() for s in species]
embl_ids = [(s, s.lower()) for s in species]

# Process each species, maintaining existing logging
writer = DataWriter()
for eid in embl_ids:
writer.write_genome_data(eid)
writer.write_ensembl_release()
for species_id, eid in embl_ids:
try:
result = writer.write_genome_data(eid)
status, details = result
if status == "manual":
skipped_species.append(
(species_id, eid, "Manually created genome data file")
)
elif status == "chrom_mismatch":
existing_chroms, new_chroms = details
skipped_species.append(
(
species_id,
eid,
(
f"Chromosome names mismatch.\n"
f"Existing: {sorted(existing_chroms)}\n"
f"New: {sorted(new_chroms)}"
),
)
)
except ValueError as e:
logger.error(f"Error processing {eid}: {e}")
skipped_species.append((species_id, eid, str(e)))
except Exception as e:
logger.error(f"Unexpected error processing {eid}: {e}")
skipped_species.append((species_id, eid, f"Unexpected error: {str(e)}"))

# Add summary report at the end
if skipped_species:
logger.warning("\n=== Species Update Summary ===")
logger.warning("The following species were not updated:")
for species_id, eid, reason in skipped_species:
if "Chromosome names mismatch" in reason:
# Split the chromosome mismatch message into multiple lines
logger.warning(f" - {species_id} (Ensembl ID: {eid}):")
logger.warning(" Chromosome names mismatch.")
# Parse the chromosome lists from the new format
existing = reason.split("Existing: ")[1].split("\n")[0]
new = reason.split("New: ")[1]
logger.warning(f" Existing chromosomes: {existing}")
logger.warning(f" New chromosomes: {new}")
else:
logger.warning(f" - {species_id} (Ensembl ID: {eid}): {reason}")


@cli.command()
Expand All @@ -391,7 +512,6 @@ def add_species(ensembl_id, force):
"""
writer = DataWriter()
writer.add_species(ensembl_id.lower(), force=force)
writer.write_ensembl_release()


# TODO refactor this so that it's an option to add-species. By default
Expand Down
2 changes: 2 additions & 0 deletions stdpopsim/catalog/AedAeg/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,6 @@
"3": {"length": 409777670, "synonyms": []},
"MT": {"length": 16790, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
2 changes: 2 additions & 0 deletions stdpopsim/catalog/AnaPla/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,6 @@
"39": {"length": 2018729, "synonyms": []},
"40": {"length": 1354177, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
2 changes: 2 additions & 0 deletions stdpopsim/catalog/AnoCar/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,6 @@
"LGh": {"length": 248369, "synonyms": []},
"MT": {"length": 17223, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "103",
}
2 changes: 2 additions & 0 deletions stdpopsim/catalog/AnoGam/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,6 @@
"X": {"length": 24393108, "synonyms": []},
"Mt": {"length": 15363, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
36 changes: 19 additions & 17 deletions stdpopsim/catalog/ApiMel/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,24 @@
"assembly_accession": "GCA_003254395.2",
"assembly_name": "Amel_HAv3.1",
"chromosomes": {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, this is why Codecov is complaining (no more synonyms, so this bit of code in ApiMel/species.py is not needed)
Screenshot from 2025-01-13 21-54-18

"CM009931.2": {"length": 27754200, "synonyms": ["NC_037638.1"]},
"CM009932.2": {"length": 16089512, "synonyms": ["NC_037639.1"]},
"CM009933.2": {"length": 13619445, "synonyms": ["NC_037640.1"]},
"CM009934.2": {"length": 13404451, "synonyms": ["NC_037641.1"]},
"CM009935.2": {"length": 13896941, "synonyms": ["NC_037642.1"]},
"CM009936.2": {"length": 17789102, "synonyms": ["NC_037643.1"]},
"CM009937.2": {"length": 14198698, "synonyms": ["NC_037644.1"]},
"CM009938.2": {"length": 12717210, "synonyms": ["NC_037645.1"]},
"CM009939.2": {"length": 12354651, "synonyms": ["NC_037646.1"]},
"CM009940.2": {"length": 12360052, "synonyms": ["NC_037647.1"]},
"CM009941.2": {"length": 16352600, "synonyms": ["NC_037648.1"]},
"CM009942.2": {"length": 11514234, "synonyms": ["NC_037649.1"]},
"CM009943.2": {"length": 11279722, "synonyms": ["NC_037650.1"]},
"CM009944.2": {"length": 10670842, "synonyms": ["NC_037651.1"]},
"CM009945.2": {"length": 9534514, "synonyms": ["NC_037652.1"]},
"CM009946.2": {"length": 7238532, "synonyms": ["NC_037653.1"]},
"CM009947.2": {"length": 16343, "synonyms": ["NC_001566.1", "MT"]},
petrelharp marked this conversation as resolved.
Show resolved Hide resolved
"CM009931.2": {"length": 27754200, "synonyms": []},
"CM009932.2": {"length": 16089512, "synonyms": []},
"CM009933.2": {"length": 13619445, "synonyms": []},
"CM009934.2": {"length": 13404451, "synonyms": []},
"CM009935.2": {"length": 13896941, "synonyms": []},
"CM009936.2": {"length": 17789102, "synonyms": []},
"CM009937.2": {"length": 14198698, "synonyms": []},
"CM009938.2": {"length": 12717210, "synonyms": []},
"CM009939.2": {"length": 12354651, "synonyms": []},
"CM009940.2": {"length": 12360052, "synonyms": []},
"CM009941.2": {"length": 16352600, "synonyms": []},
"CM009942.2": {"length": 11514234, "synonyms": []},
"CM009943.2": {"length": 11279722, "synonyms": []},
"CM009944.2": {"length": 10670842, "synonyms": []},
"CM009945.2": {"length": 9534514, "synonyms": []},
"CM009946.2": {"length": 7238532, "synonyms": []},
"CM009947.2": {"length": 16343, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
2 changes: 2 additions & 0 deletions stdpopsim/catalog/AraTha/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,6 @@
"Mt": {"length": 366924, "synonyms": []},
"Pt": {"length": 154478, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
66 changes: 34 additions & 32 deletions stdpopsim/catalog/BosTau/genome_data.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,40 @@
# File autogenerated from Ensembl REST API. Do not edit.
data = {
"assembly_accession": "GCA_002263795.2",
"assembly_name": "ARS-UCD1.2",
"assembly_accession": "GCA_002263795.3",
"assembly_name": "ARS-UCD1.3",
"chromosomes": {
"1": {"length": 158534110, "synonyms": []},
"2": {"length": 136231102, "synonyms": []},
"3": {"length": 121005158, "synonyms": []},
"4": {"length": 120000601, "synonyms": []},
"5": {"length": 120089316, "synonyms": []},
"6": {"length": 117806340, "synonyms": []},
"7": {"length": 110682743, "synonyms": []},
"8": {"length": 113319770, "synonyms": []},
"9": {"length": 105454467, "synonyms": []},
"10": {"length": 103308737, "synonyms": []},
"11": {"length": 106982474, "synonyms": []},
"12": {"length": 87216183, "synonyms": []},
"13": {"length": 83472345, "synonyms": []},
"14": {"length": 82403003, "synonyms": []},
"15": {"length": 85007780, "synonyms": []},
"16": {"length": 81013979, "synonyms": []},
"17": {"length": 73167244, "synonyms": []},
"18": {"length": 65820629, "synonyms": []},
"19": {"length": 63449741, "synonyms": []},
"20": {"length": 71974595, "synonyms": []},
"21": {"length": 69862954, "synonyms": []},
"22": {"length": 60773035, "synonyms": []},
"23": {"length": 52498615, "synonyms": []},
"24": {"length": 62317253, "synonyms": []},
"25": {"length": 42350435, "synonyms": []},
"26": {"length": 51992305, "synonyms": []},
"27": {"length": 45612108, "synonyms": []},
"28": {"length": 45940150, "synonyms": []},
"29": {"length": 51098607, "synonyms": []},
"X": {"length": 139009144, "synonyms": []},
"1": {"length": 158534110, "synonyms": ["chr1"]},
"2": {"length": 136231102, "synonyms": ["chr2"]},
"3": {"length": 121005158, "synonyms": ["chr3"]},
"4": {"length": 120000601, "synonyms": ["chr4"]},
"5": {"length": 120089316, "synonyms": ["chr5"]},
"6": {"length": 117806340, "synonyms": ["chr6"]},
"7": {"length": 110682743, "synonyms": ["chr7"]},
"8": {"length": 113319770, "synonyms": ["chr8"]},
"9": {"length": 105454467, "synonyms": ["chr9"]},
"10": {"length": 103308737, "synonyms": ["chr10"]},
"11": {"length": 106982474, "synonyms": ["chr11"]},
"12": {"length": 87216183, "synonyms": ["chr12"]},
"13": {"length": 83472345, "synonyms": ["chr13"]},
"14": {"length": 82403003, "synonyms": ["chr14"]},
"15": {"length": 85007780, "synonyms": ["chr15"]},
"16": {"length": 81013979, "synonyms": ["chr16"]},
"17": {"length": 73167244, "synonyms": ["chr17"]},
"18": {"length": 65820629, "synonyms": ["chr18"]},
"19": {"length": 63449741, "synonyms": ["chr19"]},
"20": {"length": 71974595, "synonyms": ["chr20"]},
"21": {"length": 69862954, "synonyms": ["chr21"]},
"22": {"length": 60773035, "synonyms": ["chr22"]},
"23": {"length": 52498615, "synonyms": ["chr23"]},
"24": {"length": 62317253, "synonyms": ["chr24"]},
"25": {"length": 42350435, "synonyms": ["chr25"]},
"26": {"length": 51992305, "synonyms": ["chr26"]},
"27": {"length": 45612108, "synonyms": ["chr27"]},
"28": {"length": 45940150, "synonyms": ["chr28"]},
"29": {"length": 51098607, "synonyms": ["chr29"]},
"X": {"length": 139009144, "synonyms": ["chrX"]},
"MT": {"length": 16338, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
Loading
Loading