From 1a18f882dd1ac1e5079c7eb0d1788a08f8be6e7b Mon Sep 17 00:00:00 2001 From: Jeffrey Adrion Date: Thu, 19 Nov 2020 00:07:26 +0000 Subject: [PATCH] changed name of DroMel map and added liftover documentation --- docs/development.rst | 83 ++++++++++++++++++++++++++++ maintenance/liftOver_comeron2012.py | 8 +-- stdpopsim/catalog/DroMel/__init__.py | 35 ++++++++++++ 3 files changed, 122 insertions(+), 4 deletions(-) diff --git a/docs/development.rst b/docs/development.rst index a01ca9ba2..b5ad6d2e2 100644 --- a/docs/development.rst +++ b/docs/development.rst @@ -962,6 +962,89 @@ Once all this is done, submit a PR containing the code changes and wait for dire on whom to send the compressed archive of genetic maps to (currently Andrew Kern is the primary uploader but please wait to send files to him until directed). +************************** +Lifting over a genetic map +************************** +Existing genetic maps will need to be lifted over to a new assembly, if and when the +current assembly is updated in `stdpopsim`. This process can be partially automated by running +the liftOver maintenance code. + +First, you must download and install the ``liftOver`` executable from the +`UCSC Genome Browser Store `_. +Next, you must download the appropriate chain files, again from UCSC +(see `UCSC Genome Browser downloads +`_ for more details). +To validate the remapping between assemblies it is required to have chain files +corresponding to both directions of the liftOver +(e.g. `hg19ToHg38.over.chain.gz` and `hg38ToHg19.over.chain.gz`) as in the +example below. + +An example of the process for +lifting over the `GeneticMap` ``"HapMapII_GRCh37"`` to the ``"Hg19"`` assembly +is shown below: + +.. code-block:: sh + + python /maintenance/liftOver_catalog.py \ + --species HomSap \ + --map HapMapII_GRCh37 \ + --chainFile hg19ToHg38.over.chain.gz \ + --validationChain hg38ToHg19.over.chain.gz \ + --winLen 1000 \ + --useAdjacentAvg \ + --retainIntermediates \ + --gapThresh 1000000 + +Here, the argument ``"--winLen"`` corresponds to the size of the window over which a weighted +average of recombination rates is taken when comparing the original map with the +back-lifted map (for validation purposes only). The argument ``"--gapThresh"`` is used to select a threshold for +which gaps in the new assembly longer than the ``"--gapThresh"`` will be set with a +recombination rate equal to 0.0000, instead of an average rate. The type of average rate used for gaps +shorter than the ``"--gapThresh"`` is determined either by using the mean rate of two most adjacent windows +or by using the mean rate for the entire chromosome, using options ``"--useAdjacentAvg"`` or +``"--useChromosomeAvg"``` respectively. + +Validation plots will automatically be generated in the ``"/liftOver_validation/"`` +directory. Intermediate files created by the ``liftOver`` executable will be saved +for inspection in the ``"/liftOver_intermediates/"``, only if the +``"--retainInermediates"`` option is used. Once the user has inspected the validation plots +and deemed the liftOver process to be sufficiently accurate, they can proceed to generating +the SHA256 checksum. + +The SHA256 checksum of the new genetic map tarball can be obtained using the +``sha256sum`` command from GNU coreutils. If this is not available on your +system, the following can instead be used: + +.. code-block:: sh + + python -c 'from stdpopsim.utils import sha256; print(sha256("genetic_map.tgz"))' + +The newly lifted over maps will be formatted in a compressed archive and +automatically named using the assembly name from the chain file. +This file will be sent to one of the `stdpopsim` uploaders for placement in the +AWS cloud, once the new map is approved. Finally, you must add a `GeneticMap` +object to the file named for your species in the `catalog` directory (the same one in +which the genome is defined) as shown in `Adding a genetic map`_. + +Again, once all this is done, submit a PR containing the code changes and wait for +directions on whom to send the compressed archive of genetic maps to +(currently Andrew Kern is the primary uploader but please wait to send files +to him until directed). + +.. note:: + + The ``GeneticMap`` named ``"ComeronCrossoverV2_dm6"`` for ``"DroMel"`` + was generated by similar code (albeit slightly different + compared to that shown above) using the following command: + +.. code-block:: sh + + python /maintenance/liftOver_comeron2012.py \ + --winLen 1000 \ + --gapThresh 1000000 \ + --useAdjacentAvg \ + --retainIntermediates + **************** Coding standards **************** diff --git a/maintenance/liftOver_comeron2012.py b/maintenance/liftOver_comeron2012.py index a7332db1f..c16c3c290 100644 --- a/maintenance/liftOver_comeron2012.py +++ b/maintenance/liftOver_comeron2012.py @@ -79,7 +79,7 @@ def main(): intermediatesDir_val = os.path.join(cwd, "liftOver_validation_intermediates") validationDir = os.path.join(cwd, "liftOver_validation") preliftDir = os.path.join(tmpDir, "prelift") - postliftDir = os.path.join(tmpDir, "comeron2012_maps") + postliftDir = os.path.join(tmpDir, "comeron2012v2_maps") postliftDir_val = os.path.join( tmpDir, newAssemb + "_liftedOverTo_{assemb}".format(assemb=orig_label) ) @@ -108,7 +108,7 @@ def main(): validationChain = os.path.join(tmpDir, "dm6ToDm3.over.chain.gz") urllib.request.urlretrieve(chainUrl, filename=chainFile) urllib.request.urlretrieve(validationUrl, filename=validationChain) - mapUrl = "ftp://ftp.flybase.org/flybase/associated_files/" "Comeron.2012.10.15.xlsx" + mapUrl = "ftp://ftp.flybase.org/flybase/associated_files/Comeron.2012.10.15.xlsx" mapFile = os.path.join(tmpDir, "Comeron.2012.10.15.xlsx") urllib.request.urlretrieve(mapUrl, filename=mapFile) @@ -135,7 +135,7 @@ def main(): outFiles.append( os.path.join( postliftDir, - "genetic_map_comeron2012_{assemb}_chr{id}.txt".format( + "genetic_map_comeron2012v2_{assemb}_chr{id}.txt".format( assemb=newAssemb, id=chrom ), ) @@ -188,7 +188,7 @@ def main(): # Tar new files and remove temporary directory print("Creating new tarball...") - outTarFile = os.path.join(cwd, "comeron2012_maps.tar.gz") + outTarFile = os.path.join(cwd, "comeron2012v2_maps.tar.gz") with tarfile.open(outTarFile, "w:gz") as tb: tb.add(postliftDir, arcname=os.path.basename(postliftDir)) shutil.rmtree(tmpDir) diff --git a/stdpopsim/catalog/DroMel/__init__.py b/stdpopsim/catalog/DroMel/__init__.py index 590e1ca10..6cfd10782 100644 --- a/stdpopsim/catalog/DroMel/__init__.py +++ b/stdpopsim/catalog/DroMel/__init__.py @@ -108,7 +108,42 @@ ) ], ) +_species.add_genetic_map(_gm) +_gm = stdpopsim.GeneticMap( + species=_species, + id="ComeronCrossoverV2_dm6", + description="Crossover map from meioses products of 8 lab crosses", + long_description=""" + The crossover map from a study of 8 crosses of 12 highly + inbred lines of D. melanogaster. This is based on the + products of 5,860 female meioses from whole genome sequencing data. + Recombination rates were calculated from the density of individual + recombination events that were detected in crosses. This map was + subsequently lifted over to the dm6 assembly using the available + maintenance code command: + + python liftOver_comeron2012.py \ + --winLen 1000 \ + --gapThresh 1000000 \ + --useAdjacentAvg \ + --retainIntermediates + """, + url=( + "https://stdpopsim.s3-us-west-2.amazonaws.com/genetic_maps/" + "DroMel/comeron2012v2_maps.tar.gz" + ), + sha256="79484e6042c36b06946af672cabf2ec3012b6bfc6e0018517c8e7be4769cd334", + file_pattern="genetic_map_comeron2012v2_dm6_chr{id}.txt", + citations=[ + stdpopsim.Citation( + author="Comeron et al", + doi="https://doi.org/10.1371/journal.pgen.1002905", + year=2012, + reasons={stdpopsim.CiteReason.GEN_MAP}, + ) + ], +) _species.add_genetic_map(_gm)