Skip to content

Commit

Permalink
Improve tooling around adding species.
Browse files Browse the repository at this point in the history
  • Loading branch information
jeromekelleher committed Mar 10, 2021
1 parent 25f9799 commit f295c1b
Show file tree
Hide file tree
Showing 16 changed files with 295 additions and 89 deletions.
15 changes: 13 additions & 2 deletions maintenance/ensembl.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
import urllib.request


logger = logging.getLogger("ensembl")


class EnsemblRestClient:
"""
A client for the Ensembl REST API. Based on the example code
Expand Down Expand Up @@ -46,7 +49,7 @@ def _sleep_if_needed(self):
if self.num_requests >= self.max_requests_per_second:
delta = time.time() - self.last_request_time
if delta < 1:
logging.debug("Rate limiting REST API")
logger.info("Rate limiting REST API")
time.sleep(1 - delta)
self.num_requests = 0
else:
Expand All @@ -61,9 +64,10 @@ def get(self, endpoint, headers=None, params=None):
"""
self._sleep_if_needed()
request = self._make_request(endpoint, headers, params)
logging.debug("making request to %s", request.full_url)
logger.info("making request to %s", request.full_url)
response = urllib.request.urlopen(request)
content = response.read()
logger.debug("Response: %s", content)
data = json.loads(content)
return data

Expand All @@ -78,6 +82,13 @@ def get_release(self):
assert len(releases) == 1
return releases[0]

def get_species_data(self, ensembl_id):
"""
Returns species information for the specified ensembl_id.
"""
output = self.get(endpoint=f"/info/genomes/{ensembl_id}")
return output

def get_genome_data(self, ensembl_id):
"""
Returns the genome data for the specified Ensembl species
Expand Down
180 changes: 140 additions & 40 deletions maintenance/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,49 +7,82 @@
import shutil
import string
import pathlib
import logging

import click
import black
import daiquiri

import stdpopsim
from . import ensembl

logger = logging.getLogger("maint")

species_template = string.Template(
"""
import stdpopsim
from . import genome_data
_chromosomes = []
for name, data in genome_data.data["chromosomes"].items():
_chromosomes.append(
stdpopsim.Chromosome(
id=name,
length=data["length"],
synonyms=data["synonyms"],
mutation_rate=0, # FILL ME IN
recombination_rate=0, # FILL ME IN
)
)
_genome = stdpopsim.Genome(
chromosomes=_chromosomes,
mutation_rate_citations=[], # ADD CITATIONS
recombination_rate_citations=[], # ADD CITATIONS
assembly_name=genome_data.data["assembly_name"],
assembly_accession=genome_data.data["assembly_accession"],
assembly_citations=[],
# [The following are notes for implementers and should be deleted
# once the recombination rates have been inserted]
# This is the per-chromosome recombination rate, typically the mean
# rate along the chromosome.
# Values in this dictionary are set to -1 by default, so you have
# to update each one. These should be derived from the most reliable
# data and how they were obtained should be documented here.
# The appropriate citation must be added to the list of
# recombination_rate_citations in the Genome object.
_recombination_rate = $chromosome_rate_dict
# [The following are notes for implementers and should be deleted
# once the mutation rates have been inserted]
# This is the per-chromosome mutation rate, typically the mean
# rate along the chromosome. If per chromosome rates are not available,
# the same value should be used for each chromosome. In this case,
# please use a variable to store this value, rather than repeating
# the same numerical constant, e.g.
# _mutation_rate = {
# 1: _overall_rate,
# 2: _overall_rate,
# ...
# Values in this dictionary are set to -1 by default, so you have
# to update each one. These should be derived from the most reliable
# data and how they were obtained should be documented here.
# The appropriate citation must be added to the list of
# mutation_rate_citations in the Genome object.
_mutation_rate = $chromosome_rate_dict
_genome = stdpopsim.Genome.from_data(
genome_data.data,
recombination_rate=_recombination_rate,
mutation_rate=_mutation_rate
)
# [Implementers: you must add citations for the values that are
# provided. Do this like:
# _genome.recombination_rate_citations.append(
# stdpopsim.Citation(author=x, date=y, doi=z))
# _genome.mutation_rate_citations.append(
# stdpopsim.Citation(author=x, date=y, doi=z))
_species = stdpopsim.Species(
id="$sps_id",
name="FIXME",
common_name="FIXME",
ensembl_id="$ensembl_id",
name="$scientific_name",
common_name="$common_name",
genome=_genome,
generation_time=0, # FIXME
generation_time_citations=[],
population_size=0, # FIXME
# [Implementers: you must provide an estimate of the generation_time.
# Please also add a citation for this.]
generation_time=0,
# [Implementers: you must provide an estimate of the population size.
# TODO: give a definition of what this should be.
# Please also add a citation for this.]
population_size=0,
population_size_citations=[],
generation_time_citations=[],
)
stdpopsim.register_species(_species)
Expand All @@ -58,20 +91,54 @@

species_test_template = string.Template(
"""
import pytest
import stdpopsim
from tests import test_species
class TestSpecies(test_species.SpeciesTestBase):
class TestSpeciesData(test_species.SpeciesTestBase):
species = stdpopsim.get_species("$sps_id")
# TODO specific tests for species data.
def test_ensembl_id(self):
assert self.species.ensembl_id == "$ensembl_id"
class TestGenome(test_species.GenomeTestBase):
def test_name(self):
assert self.species.name == "$scientific_name"
def test_common_name(self):
assert self.species.common_name == "$common_name"
# QC Tests. These tests are performed by another contributor
# independently referring to the citations provided in the
# species definition, filling in the appropriate values
# and deleting the pytest "skip" annotations.
@pytest.mark.skip("Population size QC not done yet")
def test_qc_population_size(self):
assert self.species.population_size == -1
@pytest.mark.skip("Generation time QC not done yet")
def test_qc_generation_time(self):
assert self.species.generation_time == -1
class TestGenomeData(test_species.GenomeTestBase):
genome = stdpopsim.get_species("$sps_id").genome
@pytest.mark.skip("Recombination rate QC not done yet")
@pytest.mark.parametrize(
["name", "rate"],
$chromosome_rate_dict.items())
def test_recombination_rate(self, name, rate):
assert pytest.approx(rate, self.genome.get_chromosome(name).recombination_rate)
@pytest.mark.skip("Mutation rate QC not done yet")
@pytest.mark.parametrize(
["name", "rate"],
$chromosome_rate_dict.items())
def test_mutation_rate(self, name, rate):
assert pytest.approx(rate, self.genome.get_chromosome(name).mutation_rate)
"""
)

Expand All @@ -92,7 +159,7 @@ def catalog_path(sps_id):
return pathlib.Path(f"stdpopsim/catalog/{sps_id}")


def write_catalog_stub(path, sps_id, ensembl_id):
def write_catalog_stub(*, path, sps_id, ensembl_id, species_data, genome_data):
"""
Writes stub files to the catalog for a new species.
"""
Expand All @@ -102,15 +169,32 @@ def write_catalog_stub(path, sps_id, ensembl_id):
print('"""', file=f)
print("from . import species # noqa: F401", file=f)

species_code = species_template.substitute(sps_id=sps_id)
scientific_name = species_data["scientific_name"]
common_name = species_data["display_name"]
logger.info(f"{sps_id}: name={scientific_name}, common_name={common_name}")
chr_names = genome_data["chromosomes"].keys()
chromosome_rate_template = {name: -1 for name in chr_names}
species_code = species_template.substitute(
ensembl_id=ensembl_id,
sps_id=sps_id,
scientific_name=scientific_name,
common_name=common_name,
chromosome_rate_dict=chromosome_rate_template,
)
path = path / "species.py"
click.echo(f"Writing species definition stub to {path}")
logger.info(f"Writing species definition stub to {path}")
with open(path, "w") as f:
f.write(black_format(species_code))

test_code = species_test_template.substitute(sps_id=sps_id)
test_code = species_test_template.substitute(
ensembl_id=ensembl_id,
sps_id=sps_id,
scientific_name=scientific_name,
common_name=common_name,
chromosome_rate_dict=chromosome_rate_template,
)
test_path = pathlib.Path("tests") / f"test_{sps_id}.py"
click.echo(f"Writing species test stub to {test_path}")
logger.info(f"Writing species test stub to {test_path}")
with open(test_path, "w") as f:
f.write(black_format(test_code))

Expand All @@ -132,13 +216,20 @@ def write(self, path):

def add_species(self, ensembl_id, force=False):
sps_id = ensembl_stdpopsim_id(ensembl_id)
click.echo(f"Adding new species {sps_id} for Ensembl ID {ensembl_id}")
logger.info(f"Adding new species {sps_id} for Ensembl ID {ensembl_id}")
root = catalog_path(sps_id)
if force:
shutil.rmtree(root, ignore_errors=True)
root.mkdir()
self.write_genome_data(ensembl_id)
write_catalog_stub(root, sps_id, ensembl_id)
genome_data = self.write_genome_data(ensembl_id)
species_data = self.ensembl_client.get_species_data(ensembl_id)
write_catalog_stub(
path=root,
sps_id=sps_id,
ensembl_id=ensembl_id,
species_data=species_data,
genome_data=genome_data,
)

def write_genome_data(self, ensembl_id):
sps_id = ensembl_stdpopsim_id(ensembl_id)
Expand All @@ -147,18 +238,19 @@ def write_genome_data(self, ensembl_id):
raise ValueError(
f"Directory {id} corresponding to {ensembl_id} does" + "not exist"
)
click.echo(f"Writing genome data for {sps_id} {ensembl_id}")
logger.info(f"Writing genome data for {sps_id} {ensembl_id}")
path = path / "genome_data.py"
data = self.ensembl_client.get_genome_data(ensembl_id)
code = f"data = {data}"

# Format the code with Black so we don't get noisy diffs
with self.write(path) as f:
f.write(black_format(code))
return data

def write_ensembl_release(self):
release = self.ensembl_client.get_release()
click.echo(f"Using Ensembl release {release}")
logger.info(f"Using Ensembl release {release}")
path = pathlib.Path("stdpopsim/catalog/ensembl_info.py")
code = f"release = {release}"
with self.write(path) as f:
Expand All @@ -171,8 +263,16 @@ def write_ensembl_release(self):


@click.group()
def cli():
pass
@click.option("--debug", is_flag=True)
def cli(debug):
log_level = logging.INFO
if debug:
log_level = logging.DEBUG
daiquiri.setup(level=log_level)

# Black's logging is very noisy
black_logger = logging.getLogger("blib2to3")
black_logger.setLevel(logging.CRITICAL)


@cli.command()
Expand Down Expand Up @@ -215,7 +315,7 @@ def add_species(ensembl_id, force):
Add a new species to the catalog using its ensembl ID.
"""
writer = DataWriter()
writer.add_species(ensembl_id, force=force)
writer.add_species(ensembl_id.lower(), force=force)
writer.write_ensembl_release()


Expand Down
11 changes: 6 additions & 5 deletions stdpopsim/catalog/AraTha/species.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,23 +31,23 @@
mutation_rate_citations=[
stdpopsim.Citation(
author="Ossowski et al.",
year="2010",
year=2010,
doi="https://doi.org/10.1126/science.1180677",
reasons={stdpopsim.CiteReason.MUT_RATE},
)
],
recombination_rate_citations=[
stdpopsim.Citation(
author="Huber et al.",
year="2014",
year=2014,
doi="https://doi.org/10.1093/molbev/msu247",
reasons={stdpopsim.CiteReason.REC_RATE},
)
],
assembly_citations=[
stdpopsim.Citation(
doi="https://doi.org/10.1093/nar/gkm965",
year="2007",
year=2007,
author="Swarbreck et al.",
reasons={stdpopsim.CiteReason.ASSEMBLY},
)
Expand All @@ -56,14 +56,15 @@

_species = stdpopsim.Species(
id="AraTha",
ensembl_id="arabidopsis_thaliana",
name="Arabidopsis thaliana",
common_name="A. thaliana",
genome=_genome,
generation_time=1.0,
generation_time_citations=[
stdpopsim.Citation(
doi="https://doi.org/10.1890/0012-9658(2002)083[1006:GTINSO]2.0.CO;2",
year="2002",
year=2002,
author="Donohue",
reasons={stdpopsim.CiteReason.GEN_TIME},
)
Expand All @@ -72,7 +73,7 @@
population_size_citations=[
stdpopsim.Citation(
doi="https://doi.org/10.1016/j.cell.2016.05.063",
year="2016",
year=2016,
author="1001GenomesConsortium",
reasons={stdpopsim.CiteReason.POP_SIZE},
)
Expand Down
1 change: 1 addition & 0 deletions stdpopsim/catalog/BosTau/species.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@

_species = stdpopsim.Species(
id="BosTau",
ensembl_id="bos_taurus",
name="Bos Taurus",
common_name="Cattle",
genome=_genome,
Expand Down
1 change: 1 addition & 0 deletions stdpopsim/catalog/CanFam/species.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@

_species = stdpopsim.Species(
id="CanFam",
ensembl_id="canis_familiaris",
name="Canis familiaris",
common_name="Dog",
genome=_genome,
Expand Down
Loading

0 comments on commit f295c1b

Please sign in to comment.