From a27d3717286b7e48db0d6b51ffe34e1960cbee6e Mon Sep 17 00:00:00 2001 From: chriscrsmith Date: Thu, 3 Aug 2023 19:56:38 +0000 Subject: [PATCH] half-entered new dfe got proportion added dom coefficients almost finished added lethal mutations Update dfes.py add uniform DFE (closes #1481) h-s relationship (closes #1488) formatting test dominance coeffs actually in there remove deprecated set-output (see https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/ ) and bump cache package version bumps remove python 3.7 as it is end-of-life and creates install problems for scikit-allel make sure the tests are testing what they're supposed to test chris comments changelog clarify manual more chris comments bump cache?!?! using homsap mu using homsap mu flake8 delete stray print different neutral prop Update stdpopsim/catalog/HomSap/dfes.py Co-authored-by: Peter Ralph Update stdpopsim/catalog/HomSap/dfes.py Co-authored-by: Peter Ralph Update stdpopsim/catalog/HomSap/dfes.py Co-authored-by: Peter Ralph updates to new dfe Apply suggestions from code review reduce cache num again fixxxxxxxxxxxxxxxxxxxxxxxxx bump cache trying changeing tests.yml trying changeing tests.yml reduce cache num again --- .github/workflows/tests.yml | 9 +- CHANGELOG.rst | 16 +++ requirements/CI/requirements.txt | 16 ++- stdpopsim/catalog/HomSap/dfes.py | 53 ++++++++++ stdpopsim/dfe.py | 108 +++++++++++++++++-- stdpopsim/slim_engine.py | 175 ++++++++++++++++++++++--------- tests/test_dfes.py | 100 +++++++++++++++++- tests/test_slim_engine.py | 163 ++++++++++++++++++++++++---- 8 files changed, 546 insertions(+), 94 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index cce1ca159..b70a48801 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -18,9 +18,8 @@ jobs: fail-fast: false matrix: os: [ubuntu-20.04, macos-11, windows-latest] - python: [3.7, "3.10"] + python: [3.8, "3.10"] env: - CONDA_ENV_NAME: stdpopsim OS: ${{ matrix.os }} PYTHON: ${{ matrix.python }} @@ -36,7 +35,7 @@ jobs: - name: find conda id: find-conda run: | - echo "::set-output name=CONDA::$CONDA" + echo "name=CONDA::$CONDA" >> $GITHUB_OUTPUT - name: fix conda permissions if: runner.os == 'macOS' @@ -49,7 +48,7 @@ jobs: uses: actions/cache@v2 env: # Increase this to reset the cache if the key hasn't changed. - CACHE_NUM: 2 + CACHE_NUM: 4 with: path: | ${{ steps.find-conda.outputs.CONDA }}/envs/${{ env.CONDA_ENV_NAME }} @@ -60,7 +59,7 @@ jobs: uses: conda-incubator/setup-miniconda@v2 if: steps.cache.outputs.cache-hit != 'true' with: - activate-environment: ${{ env.CONDA_ENV_NAME }} + activate-environment: stdpopsim python-version: ${{ matrix.python }} channels: conda-forge channel-priority: strict diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 82741977d..6db710898 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,22 @@ [0.2.1] - 2023-XX-XX -------------------- +**Breaking changes**: + +- To add the possibility for a relationship between dominance and selection + coefficient, now each stdpopsim MutationType might have more than one + underlying SLiM mutation type; so, where this is recorded in top-level + metadata (under `ts.metadata["stdpopsim"]["DFEs"]`) is now a list + instead of a single value. This will not affect anyone who is not + parsing the metadata related to DFEs. + +**New features**: + +- *Relationship between dominance and selection coefficient:* + Added the `dominance_coeff_list` argument to `MutationType`, allowing + for DFEs with a discretized relationship between h and s. + (:user:`petrelharp`, :pr:`1498`) + **New species**: - Mus musculus (:user:`peterdfields`, :pr:`1437`). diff --git a/requirements/CI/requirements.txt b/requirements/CI/requirements.txt index 37288de45..a2a805a09 100644 --- a/requirements/CI/requirements.txt +++ b/requirements/CI/requirements.txt @@ -1,10 +1,10 @@ coverage==6.3.3 flake8==5.0.4 -pytest==7.2.1 -pytest-cov==4.0.0 -pytest-xdist==3.2.0 +pytest==7.3.2 +pytest-cov==4.1.0 +pytest-xdist==3.3.1 setuptools-scm==7.1.0 -sphinx==4.3.2 +sphinx==5.0.2 sphinx-argparse==0.4.0 sphinx-issues==3.0.1 sphinx_rtd_theme==1.2.0 @@ -14,11 +14,9 @@ attrs==21.4.0 appdirs==1.4.4 humanize==4.6.0 pyslim==1.0.1 -numpy==1.21.5; python_version=='3.7' -numpy==1.22.3; python_version>='3.8' -scipy; python_version=='3.7' -scipy==1.10.1; python_version>='3.8' -scikit-allel==1.3.5 +numpy==1.22.3 +scipy==1.10.1 +scikit-allel==1.3.6 biopython==1.80 demes==0.2.2 demesdraw==0.3.1 diff --git a/stdpopsim/catalog/HomSap/dfes.py b/stdpopsim/catalog/HomSap/dfes.py index ee97bc4bc..83e1254a5 100644 --- a/stdpopsim/catalog/HomSap/dfes.py +++ b/stdpopsim/catalog/HomSap/dfes.py @@ -147,3 +147,56 @@ def _HuberLogNormalDFE(): _species.add_dfe(_HuberLogNormalDFE()) + + +def _KyriazisDFE(): + id = "Mixed_K23" + description = "Deleterious Gamma DFE with additional lethals" + long_description = """ + The DFE estimated from human data recommended in Kyriazis et al. + (2023), https://doi.org/10.1086/726736, for general use. + This model is similar to the Kim et al. (2017) DFE based on human + genetic data, modified to include the dominance distribution from + Henn et al. (2016). + The model is also augmented with an additional proportion of 0.3% of + recessive lethals, based on the analysis of Wade et al. (2023). + """ + citations = [ + stdpopsim.Citation( + author="Kyriazis et al.", + year=2023, + doi="https://doi.org/10.1086/726736", + reasons={stdpopsim.CiteReason.DFE}, + ) + ] + neutral = stdpopsim.MutationType() + gamma_mean = -0.0131 + gamma_shape = 0.186 + coefs = [0.45, 0.2, 0.05, 0] + breaks = [0.001, 0.01, 0.1] + gamma = stdpopsim.MutationType( + dominance_coeff_list=coefs, + dominance_coeff_breaks=breaks, + distribution_type="g", # gamma distribution + distribution_args=[gamma_mean, gamma_shape], + ) + lethal = stdpopsim.MutationType( + distribution_type="f", # fixed value + distribution_args=[-1], # fitness in SLiM for homozygotes is multiiplied by 1+s + dominance_coeff=0, + ) + proportion_deleterious = 2.31 / (1 + 2.31) + lethal_prop = proportion_deleterious * 0.003 # 0.3% lethals + gamma_prop = proportion_deleterious - lethal_prop + neutral_prop = 1 - proportion_deleterious + return stdpopsim.DFE( + id=id, + description=description, + long_description=long_description, + mutation_types=[neutral, gamma, lethal], + proportions=[neutral_prop, gamma_prop, lethal_prop], + citations=citations, + ) + + +_species.add_dfe(_KyriazisDFE()) diff --git a/stdpopsim/dfe.py b/stdpopsim/dfe.py index 78ac8e7c1..9136fe82a 100644 --- a/stdpopsim/dfe.py +++ b/stdpopsim/dfe.py @@ -8,11 +8,17 @@ import numpy as np +def _copy_converter(x): + if isinstance(x, list): + x = x.copy() + return x + + @attr.s(kw_only=True) class MutationType(object): """ - Class representing a "type" of mutation. The design closely (exactly) - mirrors SLiM's MutationType class. + Class representing a "type" of mutation. The design closely mirrors SLiM's + MutationType class. The main thing that mutation types carry is a way of drawing a selection coefficient for each new mutation. This ``distribution_type`` should be one @@ -23,6 +29,7 @@ class MutationType(object): - ``g``: gamma, two parameters (mean, shape) - ``n``: normal, two parameters (mean, SD) - ``w``: Weibull, two parameters (scale, shape) + - ``u``: Uniform, two parameters (min, max) - ``lp``: positive logNormal, two parameters (mean and sd on log scale; see rlnorm) - ``ln``: negative logNormal, two parameters (mean and sd on log scale; see rlnorm) @@ -31,28 +38,99 @@ class MutationType(object): exponential and gamma, a negative mean can be provided, obtaining always negative values. + Instead of a single dominance coefficient (which would be specified with + `dominance_coeff`), a discretized relationship between dominance and + selection coefficient can be implemented: if dominance_coeff_list is + provided, mutations with selection coefficient s for which + dominance_coeff_breaks[k-1] <= s <= dominance_coeff_breaks[k] will have + dominance coefficient dominance_coeff[k]. In other words, the first entry + of dominance_coeff_list applies to any mutations with selection coefficient + below the first entry of dominance_coeff_breaks; the second entry of + dominance_coeff_list applies to mutations with selection coefficient + between the first and second entries of dominance_coeff_breaks, and so + forth. The list of breaks must therefore be of length one less than the + list of dominance coefficients. + :ivar distribution_type: A one-letter abbreviation for the distribution of fitness effects that each new mutation of this type draws from (see below). :vartype distribution_type: str :ivar distribution_args: Arguments for the distribution type. :vartype distribution_type: list - :ivar dominance_coeff: A single dominance coefficient (negative = underdominance, + :ivar dominance_coeff: The dominance coefficient (negative = underdominance, 0 = recessive, 0.5 = additive, 1.0 = completely dominant, > 1.0 = overdominant) + Default: 0.5. :vartype dominance_coeff: float :ivar convert_to_substitution: Whether to retain any fixed mutations in the simulation: if not, we cannot ask about their frequency once fixed. (Either way, they will remain in the tree sequence). :vartype convert_to_substitution: bool + :ivar dominance_coeff_list: Either None (the default) or a list of floats describing + a list of dominance coefficients, to apply to different selection coefficients + (see details). Cannot be specified along with dominance_coeff. + :vartype dominance_coeff_list: list of floats + :ivar dominance_coeff_breaks: Either None (the default) or a list of floats + describing the intervals of selection coefficient over which each of the entries + of dominance_coeff_list applies (see details). Must be of length one shorter than + dominance_coeff_list. + :vartype dominance_coeff_breaks: list of floats """ - dominance_coeff = attr.ib(default=0.5, type=float) + dominance_coeff = attr.ib(default=None, type=float) distribution_type = attr.ib(default="f", type=str) - distribution_args = attr.ib(factory=lambda: [0], type=list) + distribution_args = attr.ib( + factory=lambda: [0], type=list, converter=_copy_converter + ) convert_to_substitution = attr.ib(default=True, type=bool) + dominance_coeff_list = attr.ib(default=None, type=list, converter=_copy_converter) + dominance_coeff_breaks = attr.ib(default=None, type=list, converter=_copy_converter) def __attrs_post_init__(self): - if not isinstance(self.dominance_coeff, (float, int)): - raise ValueError("dominance_coeff must be a number.") + if self.dominance_coeff is None and self.dominance_coeff_list is None: + self.dominance_coeff = 0.5 + + if self.dominance_coeff is not None: + if (self.dominance_coeff_list is not None) or ( + self.dominance_coeff_breaks is not None + ): + raise ValueError( + "Cannot specify both dominance_coeff and dominance_coeff_list." + ) + if not isinstance(self.dominance_coeff, (float, int)): + raise ValueError("dominance_coeff must be a number.") + if not np.isfinite(self.dominance_coeff): + raise ValueError( + f"Invalid dominance coefficient {self.dominance_coeff}." + ) + + if self.dominance_coeff_list is not None: + # disallow the inefficient and annoying length-one case + if len(self.dominance_coeff_list) < 2: + raise ValueError("dominance_coeff_list must have at least 2 elements.") + for h in self.dominance_coeff_list: + if not isinstance(h, (float, int)): + raise ValueError("dominance_coeff_list must be a list of numbers.") + if not np.isfinite(h): + raise ValueError(f"Invalid dominance coefficient {h}.") + if self.dominance_coeff_breaks is None: + raise ValueError( + "A list of dominance coefficients provided but no breaks." + ) + if len(self.dominance_coeff_list) != len(self.dominance_coeff_breaks) + 1: + raise ValueError( + "len(dominance_coeff_list) must be equal " + "to len(dominance_coeff_breaks) + 1" + ) + lb = -1 * np.inf + for b in self.dominance_coeff_breaks: + if not isinstance(b, (float, int)): + raise ValueError( + "dominance_coeff_breaks must be a list of numbers." + ) + if not np.isfinite(b): + raise ValueError(f"Invalid dominance coefficient break {b}.") + if b < lb: + raise ValueError("dominance_coeff_breaks must be nondecreasing.") + lb = b if not isinstance(self.distribution_type, str): raise ValueError("distribution_type must be str.") @@ -69,9 +147,6 @@ def __attrs_post_init__(self): if not isinstance(self.convert_to_substitution, bool): raise ValueError("convert_to_substitution must be bool.") - if not np.isfinite(self.dominance_coeff): - raise ValueError(f"Invalid dominance coefficient {self.dominance_coeff}.") - # To add a new distribution type: validate the # distribution_args here, and add unit tests. if self.distribution_type == "f": @@ -143,6 +218,19 @@ def __attrs_post_init__(self): f"return {sign}rlnorm(1, {logmean} + log(Q), {logsd});" ] self.distribution_type = "s" + elif self.distribution_type == "u": + # Uniform + if ( + len(self.distribution_args) != 2 + or self.distribution_args[0] > self.distribution_args[1] + ): + raise ValueError( + "Uniformly-distributed sel. coefs. (distribution_type='u') " + "use a (min, max) parameterisation, with min <= max." + ) + umin, umax = self.distribution_args + self.distribution_args = [f"return runif(1, Q * {umin}, Q * {umax});"] + self.distribution_type = "s" else: raise ValueError( f"{self.distribution_type} is not a supported distribution type." diff --git a/stdpopsim/slim_engine.py b/stdpopsim/slim_engine.py index 69b134d3e..f5d163601 100644 --- a/stdpopsim/slim_engine.py +++ b/stdpopsim/slim_engine.py @@ -503,6 +503,22 @@ } } + // Setup mutation callbacks. + // For each stdpopsim mutation type with an h-s relationship + // we have a sequence of assigned SLiM mutation types; + // the first is the one that gets produced by mutation, + // and the remainder are assigned by a mutation callback. + for (i in seqAlong(mut_types_with_callbacks)) { + mt = mut_types_with_callbacks[i]; + sim.registerMutationCallback(NULL, + "{s = mut.selectionCoeff; " + + "k = findInterval(s, dominance_coeff_breaks_" + mt + "); " + + "mut.setMutationType(dominance_coeff_types_" + mt + "[k]); " + + "return T;}", + mt + ); + } + // Sample individuals. for (i in 0:(ncol(sampling_episodes)-1)) { pop = drop(sampling_episodes[0,i]); @@ -539,7 +555,6 @@ """ - _slim_logfile = """ // Optional logfile output: // Fitness values are only available in early(), @@ -669,7 +684,11 @@ "items": { "type": "object", "properties": { - "dominance_coeff": {"type": "number"}, + "dominance_coeff": {"type": ["number", "null"]}, + "dominance_coeff_list": {"type": ["array", "null"]}, + "dominance_coeff_breaks": { + "type": ["array", "null"] + }, "distribution_type": {"type": "string"}, "distribution_args": { "type": "array", @@ -680,7 +699,7 @@ "type": "array", "items": {"type": "number"}, }, - "slim_mutation_type_id": {"type": "integer"}, + "slim_mutation_type_id": {"type": "array"}, "is_neutral": {"type": "boolean"}, }, }, @@ -759,21 +778,38 @@ def _enum_dfe_and_intervals(contig): def _dfe_to_mtypes(contig): """ - Assigns a mutation type id to each of the mutation types contained in this + Assigns mutation type ids to each of the mutation types contained in this `Contig`. This function will return a dictionary with `len(contig.dfe_list)` elements, in which the position of the DFE in `contig.dfe_list` is the key. - For each DFE, the dictionary holds a list of tuples that contain the an - assigned mutation type id (used in SLiM) and the `MutationType` object. + For each DFE, the dictionary holds a list of tuples that contain the + assigned mutation type ids (used in SLiM) and the `MutationType` object. This is necessary so that we use the same numeric ids to the mutation types in the _add_dfes_to_metadata and slim_makescript steps. + + The assigned mutation type ids is stored as a tuple, that is only of length + greater than one if the mutation type is of a sort that is implemented + using more than one SLiM mutation type; currently, these are only mutation + types with a discretized h-s relationship, and (a) only the first of these + mutation types are actually assigned a positive mutation rate in SLiM, but + (b) all mutations of the first type are transformed to the other mutation + types in the list, and hence never end up in the final tree sequence. (See + Recipe 10.6 in the SLiM manual, "Varying the dominance coefficient among + mutations".) """ mid = 0 dfe_to_mtypes = {} for i, d, _ in _enum_dfe_and_intervals(contig): dfe_to_mtypes[i] = [] for mt in d.mutation_types: - dfe_to_mtypes[i].append((mid, mt)) + # the first mutation type will be the one that has the mutation rate + # and each new mutation gets converted to one of the *other* types; + mid_list = [mid] mid += 1 + if mt.dominance_coeff_list is not None: + for _ in mt.dominance_coeff_list: + mid_list.append(mid) + mid += 1 + dfe_to_mtypes[i].append((tuple(mid_list), mt)) return dfe_to_mtypes @@ -798,9 +834,9 @@ def _get_json(obj): for i, d, ints in _enum_dfe_and_intervals(contig): dfes[i]["intervals"] = ints.tolist() dfes[i]["proportions"] = d.proportions - for j, (mid, mt) in enumerate(dfe_to_mtypes[i]): + for j, (mid_list, mt) in enumerate(dfe_to_mtypes[i]): add = { - "slim_mutation_type_id": mid, + "slim_mutation_type_id": list(mid_list), "is_neutral": mt.is_neutral, } dfes[i]["mutation_types"][j].update(add) @@ -1012,7 +1048,7 @@ def fix_time(event): f"refers to a DFE with intervals {dfe_intervals}, not " f"to a single site." ) - mt_id, mt = slim_mutation_ids[dfe_index][0] + mt_id_list, mt = slim_mutation_ids[dfe_index][0] if not mt.distribution_type == "f": raise ValueError( f"The single site id '{ee.single_site_id}' referenced by " @@ -1021,7 +1057,7 @@ def fix_time(event): f"fixed fitness coefficient." ) coordinate = dfe_intervals[0, 0] - mutation_type_id = mt_id + mutation_type_id = mt_id_list[0] if hasattr(ee, "start_time") and hasattr(ee, "end_time"): # Now that GenerationAfter times have been accounted for, we can # properly catch invalid start/end times. @@ -1176,47 +1212,66 @@ def matrix2str( return "".join(s) # Genomic element types. + mutation_callbacks = {} dfe_mtypes = _dfe_to_mtypes(contig) for j, d, ints in _enum_dfe_and_intervals(contig): - # Mutation types. + # Mutation types and proportions. mut_type_list = [] + mut_props_list = [] assert len(dfe_mtypes[j]) == len(d.mutation_types) - for mid, m in dfe_mtypes[j]: - if len(m.Q_scaled_index) >= 1: - distrib_args = [str(arg) for arg in m.distribution_args] - for k in m.Q_scaled_index: - distrib_args[m.Q_scaled_index[k]] = ( - "Q * " + distrib_args[m.Q_scaled_index[k]] + for mt_index, (mid_list, mt) in enumerate(dfe_mtypes[j]): + if len(mt.Q_scaled_index) >= 1: + distrib_args = [str(arg) for arg in mt.distribution_args] + for k in mt.Q_scaled_index: + distrib_args[mt.Q_scaled_index[k]] = ( + "Q * " + distrib_args[mt.Q_scaled_index[k]] ) distrib_args = ", ".join(distrib_args) # dealing with distributions given by "s" Eidos script, else: - distrib_args = "; ".join([f'"{a}"' for a in m.distribution_args]) - mut_type_list.append(mid) - printsc( - f" initializeMutationType({mid}, {m.dominance_coeff}, " - f'"{m.distribution_type}", {distrib_args});' - ) - if not m.convert_to_substitution: - # T is the default for WF simulations. - printsc(f" m{mid}.convertToSubstitution = F;") - # TODO: when msprime.SLiMMutationModel supports stacking policy, - # set policy such that there's at most a single mutation per-site - # and individual - # printsc(f" m{mid}.mutationStackGroup = 0;") - # printsc(f" m{mid}.mutationStackPolicy = 'l';") - mut_types = ", ".join([str(mt) for mt in mut_type_list]) + distrib_args = "; ".join([f'"{a}"' for a in mt.distribution_args]) + if mt.dominance_coeff_list is None: + h_list = [mt.dominance_coeff] + else: + # this first value will apply only to mutations that are never kept, + # and so has no effect + h_list = [0.5] + h_list.extend(mt.dominance_coeff_list) + # record here what we'll need to set up the callbacks in script + mutation_callbacks[mid_list[0]] = { + "dominance_coeff_breaks": mt.dominance_coeff_breaks, + "mutation_types": mid_list[1:], + } + assert len(mid_list) == len(h_list) + first_mt = True + for mid, h in zip(mid_list, h_list): + # We will assign a proportion of 0.0 to mutation types that are neutral + # unless all the mutation types in a DFE are neutral. In such case, + # SLiM would not allow all mutation types in a genomic element type to + # have 0.0 proportion. And the proportions in SLiM won't matter anyway, + # because all the intervals in this fully neutral DFE will have 0.0 + # mutation rate anyway. + # Furthermore, for mutation types with a discretized h-s relationship, + # we only mutate to the first assigned mutation type, and remaining ones + # are produced by a mutation callback. + mut_type_list.append(mid) + use_prop = first_mt and ((not mt.is_neutral) or d.is_neutral) + p = d.proportions[mt_index] if use_prop else 0.0 + mut_props_list.append(p) + printsc( + f" initializeMutationType({mid}, {h}, " + f'"{mt.distribution_type}", {distrib_args});' + ) + if not mt.convert_to_substitution: + # T is the default for WF simulations. + printsc(f" m{mid}.convertToSubstitution = F;") + # TODO: when msprime.SLiMMutationModel supports stacking policy, + # set policy such that there's at most a single mutation per-site + # and individual + # printsc(f" mt{mid}.mutationStackGroup = 0;") + # printsc(f" mt{mid}.mutationStackPolicy = 'l';") + first_mt = False mut_types = ", ".join([str(mt) for mt in mut_type_list]) - # We will assign a proportion of 0.0 to mutation types that are neutral - # unless all the mutation types in a DFE are neutral. In such case, - # SLiM would not allow all mutation types in a genomic element type to - # have 0.0 proportion. And the proportions in SLiM won't matter anyway, - # because all the intervals in this fully neutral DFE will have 0.0 - # mutation rate anyway. - mut_props_list = [ - prop if (not mt.is_neutral) or d.is_neutral else 0.0 - for prop, mt in zip(d.proportions, d.mutation_types) - ] mut_props = ", ".join(map(str, mut_props_list)) printsc( f" initializeGenomicElementType({j}, c({mut_types}), c({mut_props}));" @@ -1351,6 +1406,30 @@ def matrix2str( ) printsc() + # Mutation callbacks. + printsc( + " // Mutations callbacks for h-s relationships, " + "one variable for each callback." + ) + mut_types_with_callbacks = list(mutation_callbacks.keys()) + printsc( + ' defineConstant("mut_types_with_callbacks", c(' + + ", ".join(map(str, mut_types_with_callbacks)) + + "));" + ) + for mt in mut_types_with_callbacks: + printsc( + f' defineConstant("dominance_coeff_types_{mt}", c(' + + ", ".join(map(str, mutation_callbacks[mt]["mutation_types"])) + + "));" + ) + + printsc( + f' defineConstant("dominance_coeff_breaks_{mt}", c(-INF, ' + + ", ".join(map(str, mutation_callbacks[mt]["dominance_coeff_breaks"])) + + ", INF));" + ) + # Allele frequency conditioning op_types = ", ".join( f'"{op}"' for op in stdpopsim.ext.ConditionOnAlleleFrequency.op_types @@ -1721,7 +1800,7 @@ def _recap_and_rescale( max_id = -1 for dfe in metadata["stdpopsim"]["DFEs"]: for mt in dfe["mutation_types"]: - max_id = max(mt["slim_mutation_type_id"], max_id) + max_id = max(max(mt["slim_mutation_type_id"]), max_id) recap_dfe = { "id": "recapitation", "description": "DFE used in recapitation", @@ -1734,7 +1813,9 @@ def _recap_and_rescale( "distribution_args": [0], "convert_to_substitution": False, "Q_scaled_index": [0], - "slim_mutation_type_id": max_id + 1, + "slim_mutation_type_id": [ + max_id + 1, + ], "is_neutral": True, } ], @@ -1797,12 +1878,12 @@ def _get_msp_rate_map(breaks, is_this_dfe, rate): # Use msprime.SLiMMutationModel rather than msprime.JC69 # for neutral DFEs. This ensures that there will be a - # 'selection_coef' key in the mutation metadata (e.g. the + # 'selection_coef' key in the mutation metadata (so the # mutation metadata structure will be consistent across the # tree sequence). # TODO: set stacking policy to "l" when supported model = msprime.SLiMMutationModel( - type=mt["slim_mutation_type_id"], + type=mt["slim_mutation_type_id"][0], next_id=_get_next_id(ts), ) diff --git a/tests/test_dfes.py b/tests/test_dfes.py index 54b16b305..c8fb2a053 100644 --- a/tests/test_dfes.py +++ b/tests/test_dfes.py @@ -32,9 +32,9 @@ def test_Q_scaled_index(self): "n": ([0.5, 1], [0, 1]), "ln": ([0.5, 1], []), "lp": ([0.5, 1], []), + "u": ([-0.5, 1], []), } for t in mut_params: - print(f"{t}\t{mut_params[t]}") if t == "f": mt = dfe.MutationType( distribution_type=t, @@ -157,6 +157,14 @@ def test_create_bad_mutation_type_message(self): distribution_args=[1, -2], ) + # Uniformly-distributed selection coefficients + for bad_args in ([0], [1, 2, 3], [3, -2]): + with pytest.raises(ValueError, match="use a .min, max"): + dfe.MutationType( + distribution_type="u", + distribution_args=bad_args, + ) + # Lognormally-distributed selection coefficients for dt in ["lp", "ln"]: with pytest.raises( @@ -204,13 +212,14 @@ def test_mutation_types(self): "e": ([0.1], [10], [5000], [0]), "n": ([-0.1, 0.2], [0.1, 0.1], [50, 50]), "w": ([0.1, 0.2], [0.1, 0.1], [50, 50]), + "u": ([0.1, 0.2], [0.1, 0.1], [-5, 50]), "lp": ([-0.1, 0.2], [0.1, 0.1], [50, 50]), "ln": ([-0.1, 0.2], [0.1, 0.1], [50, 50]), } for t in mut_params: for p in mut_params[t]: mt = dfe.MutationType(distribution_type=t, distribution_args=p) - if t in ("lp", "ln"): + if t in ("lp", "ln", "u"): assert mt.distribution_type == "s" else: assert mt.distribution_type == t @@ -225,6 +234,7 @@ def test_bad_mutation_types(self): "e": ([], [0, 1], [0.1, 0.4, 0.5], [np.inf]), "n": ([], [0.1, -1], [0.1, 0.4, 0.5], [0.1], [0.3, np.inf]), "w": ([], [-0.1, 1], [0.1, -1], [0.1, 0.4, 0.5], [0.1], [np.inf, 2.3]), + "u": ([], [-0.1, -0.5], [0.1, 0.4, 0.5], [0.1], [2.3, np.inf]), "lp": ([], [0.1, -1], [0.1, 0.4, 0.5], [0.1], [0.1, np.inf]), "ln": ([], [0.1, -1], [0.1, 0.4, 0.5], [0.1], [0.1, np.inf]), } @@ -248,9 +258,35 @@ def test_dominance_coeff(self): mt = dfe.MutationType(dominance_coeff=dominance_coeff) assert mt.dominance_coeff == dominance_coeff + def test_dominance_coeff_list(self): + for dcl, dcb in ( + ([-0.1, 0.7, 1.2], [-2.1, 1.0]), + ([-0.1, -0.7], [-2.1]), + ): + mt = dfe.MutationType(dominance_coeff_list=dcl, dominance_coeff_breaks=dcb) + assert np.allclose(dcl, mt.dominance_coeff_list) + assert np.allclose(dcb, mt.dominance_coeff_breaks) + + def test_pass_by_value(self): + # make sure that for the arguments that are lists + # we can't post-hoc modify them (and thus bypass validation) + val = 0.5 + x = [val] + mt = dfe.MutationType(distribution_args=x) + x[0] = 2 * val + 1 + assert mt.distribution_args[0] == val + x = [val, val] + mt = dfe.MutationType(dominance_coeff_list=x, dominance_coeff_breaks=[0.0]) + x[0] = 2 * val + 1 + assert mt.dominance_coeff_list[0] == val + x = [val] + mt = dfe.MutationType(dominance_coeff_list=[0.0, 0.0], dominance_coeff_breaks=x) + x[0] = 2 * val + 1 + assert mt.dominance_coeff_breaks[0] == val + def test_bad_dominance_coeff(self): - for dominance_coeff in (np.inf, np.nan, "abc", [], None, {}): - with pytest.raises(ValueError): + for dominance_coeff in (np.inf, np.nan, "abc", [], {}): + with pytest.raises(ValueError, match="dominance.coeff"): dfe.MutationType(dominance_coeff=dominance_coeff) def test_bad_distribution_type(self): @@ -258,6 +294,62 @@ def test_bad_distribution_type(self): with pytest.raises(ValueError): dfe.MutationType(distribution_type=distribution_type) + def test_bad_dominance_coeff_list(self): + dcl = [-0.1, 0.7, 1.2] + dcb = [-2.1, 1.0] + # can't specify both dominance_coeff and list + with pytest.raises(ValueError, match="both dominance_coeff and"): + dfe.MutationType( + dominance_coeff=0.5, + dominance_coeff_list=dcl, + dominance_coeff_breaks=dcb, + ) + with pytest.raises(ValueError, match="both dominance_coeff and"): + dfe.MutationType( + dominance_coeff=0.5, + dominance_coeff_list=dcl, + ) + with pytest.raises(ValueError, match="both dominance_coeff and"): + dfe.MutationType( + dominance_coeff=0.5, + dominance_coeff_breaks=dcb, + ) + # must have both coeffs and breaks + with pytest.raises(ValueError, match="dominance.*no breaks"): + dfe.MutationType(dominance_coeff_list=dcl) + # must have at least 2 bins + with pytest.raises(ValueError, match="dominance.*at least 2"): + dfe.MutationType( + dominance_coeff_list=[0.2], + dominance_coeff_breaks=[], + ) + # list must be one longer than breaks + for x in ([], [0.0], dcl): + with pytest.raises(ValueError, match="dominance.*equal to"): + dfe.MutationType( + dominance_coeff_list=dcl, + dominance_coeff_breaks=x, + ) + # bad coefficients + for x in (np.inf, np.nan, "abc", [], {}): + with pytest.raises(ValueError, match="dominance.coeff"): + dfe.MutationType( + dominance_coeff_list=[x] + dcl[1:], + dominance_coeff_breaks=dcb, + ) + # bad breaks + for x in (np.inf, np.nan, "abc", [], {}): + with pytest.raises(ValueError, match="dominance.*break"): + dfe.MutationType( + dominance_coeff_list=dcl, + dominance_coeff_breaks=[x] + dcb[1:], + ) + with pytest.raises(ValueError, match="nondecreasing"): + dfe.MutationType( + dominance_coeff_list=dcl, + dominance_coeff_breaks=list(reversed(dcb)), + ) + class TestAllDFEs: """ diff --git a/tests/test_slim_engine.py b/tests/test_slim_engine.py index d0e30a562..31bce9c73 100644 --- a/tests/test_slim_engine.py +++ b/tests/test_slim_engine.py @@ -1164,7 +1164,16 @@ class TestGenomicElementTypes(PiecewiseConstantSizeMixin): for t, params in mut_params.items() for p in params ] - assert len(example_mut_types) == 10 + for dcl, dcb in [([0.0, 1.0], [0.0]), ([-0.2, 1.4, 0.5], [-0.1, 0.1])]: + for t in ("f", "e", "n"): + mt = stdpopsim.MutationType( + distribution_type=t, + distribution_args=mut_params[t][0], + dominance_coeff_list=dcl, + dominance_coeff_breaks=dcb, + ) + example_mut_types.append((t, mt)) + assert len(example_mut_types) == 16 def get_example_dfes(self): # this is in a function because scoping is weird @@ -1202,7 +1211,6 @@ def test_slim_simulates_dfes(self): contig = get_test_contig() # make theta = 40 contig.mutation_rate = 20 / (1000 * contig.recombination_map.sequence_length) - print(contig.mutation_rate) dfes = [ stdpopsim.DFE( id=str(j), @@ -1232,7 +1240,6 @@ def test_slim_simulates_dfes(self): ts.tables.sites.position < breaks[j + 1], ) )[0] - print(mt) assert len(sites) > 0 for k in sites: s = ts.site(k) @@ -1297,12 +1304,18 @@ def _slim_proportions(dfe): is_dfe_neutral &= mt["is_neutral"] # Recall a mutation type will have proportion 0.0 in SLiM if it is # neutral and not all the mutation types in the corresponding DFE are - # neutral. SLiM does not allow a genomicelementtype to have all 0.0 + # neutral. SLiM does not allow a genomicElementType to have all 0.0 # proportions. - return [ - prop if (not mt["is_neutral"]) or is_dfe_neutral else 0.0 - for prop, mt in zip(dfe["proportions"], dfe["mutation_types"]) - ] + prop_list = [] + for prop, mt in zip(dfe["proportions"], dfe["mutation_types"]): + first_mt = True + for i in mt["slim_mutation_type_id"]: + nonzero_prop = ( + (not mt["is_neutral"]) or is_dfe_neutral + ) and first_mt + prop_list.append(prop if nonzero_prop else 0.0) + first_mt = False + return prop_list for i, dfe in enumerate(ts.metadata["stdpopsim"]["DFEs"]): if dfe["id"] == "recapitation": @@ -1319,12 +1332,43 @@ def _slim_proportions(dfe): "intervalStarts": int_starts, "mutationFractions": _slim_proportions(dfe), "mutationTypes": [ - mt["slim_mutation_type_id"] for mt in dfe["mutation_types"] + mid + for mt in dfe["mutation_types"] + for mid in mt["slim_mutation_type_id"] ], } - ge = self.slim_metadata_key0(ge_types, str(i)) assert ge == ge_from_meta + def verify_discretized_dominance(self, contig, ts): + # check that the dummy first mutation type of discretized + # h-s relationship mutation types are absent + mut_type_counts = collections.Counter( + x["mutation_type"] + for m in ts.mutations() + for x in m.metadata["mutation_list"] + ) + metadata_ids = [x["id"] for x in ts.metadata["stdpopsim"]["DFEs"]] + slim_mt_info = ts.metadata["SLiM"]["user_metadata"]["mutationTypes"][0] + has_recap = metadata_ids[-1] == "recapitation" + slim_to_mt_map = {} + assert len(contig.dfe_list) + has_recap == len(ts.metadata["stdpopsim"]["DFEs"]) + for dfe, ts_metadata in zip(contig.dfe_list, ts.metadata["stdpopsim"]["DFEs"]): + assert dfe.id == ts_metadata["id"] + assert len(dfe.mutation_types) == len(ts_metadata["mutation_types"]) + for mt, md in zip(dfe.mutation_types, ts_metadata["mutation_types"]): + if mt.dominance_coeff_list is not None: + assert len(mt.dominance_coeff_list) + 1 == len( + md["slim_mutation_type_id"] + ) + first_type = md["slim_mutation_type_id"][0] + assert mut_type_counts[first_type] == 0 + for h, k in zip( + mt.dominance_coeff_list, md["slim_mutation_type_id"][1:] + ): + assert str(k) in slim_mt_info + assert np.allclose(slim_mt_info[str(k)][0]["dominanceCoeff"], h) + slim_to_mt_map[k] = mt + def verify_genomic_elements(self, contig, ts): # compare information about genomic elements as recorded # by SLiM itself in metadata to what we think it should be @@ -1335,6 +1379,7 @@ def verify_genomic_elements(self, contig, ts): ts.metadata["SLiM"]["user_metadata"], "genomicElementTypes" ) self.verify_dfes_metadata(ts, ge_types) + self.verify_discretized_dominance(contig, ts) assert len(contig.dfe_list) == len(ge_types) for j, (dfe, intervals) in enumerate( zip(contig.dfe_list, contig.interval_list) @@ -1343,13 +1388,21 @@ def verify_genomic_elements(self, contig, ts): ge = self.slim_metadata_key0(ge_types, str(j)) # checking that the neutral mutations have 0.0 proportion unless # all the mutations are neutral in this dfe - for mt, dfe_prop, slim_prop in zip( - dfe.mutation_types, dfe.proportions, ge["mutationFractions"] - ): + assert len(dfe.mutation_types) == len(dfe.proportions) + ge_index = 0 + for mt, dfe_prop in zip(dfe.mutation_types, dfe.proportions): + slim_prop = ge["mutationFractions"][ge_index] + ge_index += 1 if mt.is_neutral and not dfe.is_neutral: assert slim_prop == 0.0 else: assert slim_prop == dfe_prop + if mt.dominance_coeff_list is not None: + for _ in mt.dominance_coeff_list: + slim_prop = ge["mutationFractions"][ge_index] + assert slim_prop == 0.0 + ge_index += 1 + assert ge_index == len(ge["mutationFractions"]) # "+1" because SLiM's intervals are closed on both ends, # stdpopsim's are closed on the left, open on the right slim_intervals = np.column_stack( @@ -1359,20 +1412,33 @@ def verify_genomic_elements(self, contig, ts): ] ) assert_array_equal(intervals, slim_intervals) - slim_mut_types = ge["mutationTypes"] - assert len(dfe.mutation_types) == len(slim_mut_types) - for mt, mt_id in zip(dfe.mutation_types, slim_mut_types): + ge_index = 0 + for mt in dfe.mutation_types: + mt_id = ge["mutationTypes"][ge_index] + ge_index += 1 assert str(mt_id) in mut_types slim_mt = self.slim_metadata_key0(mut_types, str(mt_id)) - assert mt.dominance_coeff == self.slim_metadata_key0( - slim_mt, "dominanceCoeff" - ) + if mt.dominance_coeff_list is None: + assert mt.dominance_coeff == self.slim_metadata_key0( + slim_mt, "dominanceCoeff" + ) + else: + assert 0.5 == self.slim_metadata_key0(slim_mt, "dominanceCoeff") + for h in mt.dominance_coeff_list: + mt_id = ge["mutationTypes"][ge_index] + ge_index += 1 + assert str(mt_id) in mut_types + slim_mt = self.slim_metadata_key0(mut_types, str(mt_id)) + assert np.allclose( + h, self.slim_metadata_key0(slim_mt, "dominanceCoeff") + ) assert mt.distribution_type == self.slim_metadata_key0( slim_mt, "distributionType" ) assert len(mt.distribution_args) == len(slim_mt["distributionParams"]) for a, b in zip(mt.distribution_args, slim_mt["distributionParams"]): assert a == b + assert ge_index == len(ge["mutationTypes"]) def test_no_dfe(self): contig = get_test_contig() @@ -1630,6 +1696,65 @@ def test_chromosomal_segment(self): self.verify_mutation_rates(contig, ts) assert int(ts.sequence_length) == right - left + def test_dominance_coeff_list(self): + # test that discretized h-s relationships work + contig = get_test_contig() + L = int(contig.length) + emts = [emt for _, emt in self.example_mut_types[10:12]] + for emt in emts: + assert emt.dominance_coeff_list is not None + dfe0 = stdpopsim.DFE( + id="dfe", + description="the first one", + long_description="hello world", + proportions=[1.0, 0.0], + mutation_types=emts, + ) + emts = [emt for _, emt in self.example_mut_types[12:15]] + for emt in emts: + assert emt.dominance_coeff_list is not None + dfe1 = stdpopsim.DFE( + id="dfe", + description="the second one", + long_description="I'm different but have the same name! =( =(", + proportions=[0.2, 0.7, 0.1], + mutation_types=emts, + ) + contig.add_dfe(np.array([[0, 0.5 * L]], dtype="int"), dfe0) + contig.add_dfe(np.array([[0.2 * L, 0.4 * L]], dtype="int"), dfe0) + contig.add_dfe(np.array([[0.45 * L, L]], dtype="int"), dfe1) + contig.add_dfe(np.array([[0.7 * L, 0.9 * L]], dtype="int"), dfe0) + assert len(contig.dfe_list) == 5 + engine = stdpopsim.get_engine("slim") + contig.mutation_rate *= 10 + ts = engine.simulate( + demographic_model=self.model, + contig=contig, + samples=self.samples, + verbosity=3, # to get metadata output + ) + assert len(ts.metadata["stdpopsim"]["DFEs"]) == len(contig.dfe_list) + 1 + # slim mutation type IDs with dominance coeff lists: + mut_id_haslist = {} + for dfe in ts.metadata["stdpopsim"]["DFEs"]: + for mt in dfe["mutation_types"]: + haslist = ( + "dominance_coeff_list" in mt + and mt["dominance_coeff_list"] is not None + ) + for slim_id in mt["slim_mutation_type_id"]: + assert slim_id not in mut_id_haslist + mut_id_haslist[slim_id] = haslist + num_target_muts = 0 + for mut in ts.mutations(): + for md in mut.metadata["mutation_list"]: + if mut_id_haslist[md["mutation_type"]]: + num_target_muts += 1 + # the number 20 is not important, just want to make sure we have *some* + assert num_target_muts > 20 + self.verify_genomic_elements(contig, ts) + self.verify_mutation_rates(contig, ts) + @pytest.mark.skipif(IS_WINDOWS, reason="SLiM not available on windows") class TestLogfile(PiecewiseConstantSizeMixin):