diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index cce1ca159..b70a48801 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -18,9 +18,8 @@ jobs: fail-fast: false matrix: os: [ubuntu-20.04, macos-11, windows-latest] - python: [3.7, "3.10"] + python: [3.8, "3.10"] env: - CONDA_ENV_NAME: stdpopsim OS: ${{ matrix.os }} PYTHON: ${{ matrix.python }} @@ -36,7 +35,7 @@ jobs: - name: find conda id: find-conda run: | - echo "::set-output name=CONDA::$CONDA" + echo "name=CONDA::$CONDA" >> $GITHUB_OUTPUT - name: fix conda permissions if: runner.os == 'macOS' @@ -49,7 +48,7 @@ jobs: uses: actions/cache@v2 env: # Increase this to reset the cache if the key hasn't changed. - CACHE_NUM: 2 + CACHE_NUM: 4 with: path: | ${{ steps.find-conda.outputs.CONDA }}/envs/${{ env.CONDA_ENV_NAME }} @@ -60,7 +59,7 @@ jobs: uses: conda-incubator/setup-miniconda@v2 if: steps.cache.outputs.cache-hit != 'true' with: - activate-environment: ${{ env.CONDA_ENV_NAME }} + activate-environment: stdpopsim python-version: ${{ matrix.python }} channels: conda-forge channel-priority: strict diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 82741977d..6db710898 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,22 @@ [0.2.1] - 2023-XX-XX -------------------- +**Breaking changes**: + +- To add the possibility for a relationship between dominance and selection + coefficient, now each stdpopsim MutationType might have more than one + underlying SLiM mutation type; so, where this is recorded in top-level + metadata (under `ts.metadata["stdpopsim"]["DFEs"]`) is now a list + instead of a single value. This will not affect anyone who is not + parsing the metadata related to DFEs. + +**New features**: + +- *Relationship between dominance and selection coefficient:* + Added the `dominance_coeff_list` argument to `MutationType`, allowing + for DFEs with a discretized relationship between h and s. + (:user:`petrelharp`, :pr:`1498`) + **New species**: - Mus musculus (:user:`peterdfields`, :pr:`1437`). diff --git a/requirements/CI/requirements.txt b/requirements/CI/requirements.txt index 37288de45..a2a805a09 100644 --- a/requirements/CI/requirements.txt +++ b/requirements/CI/requirements.txt @@ -1,10 +1,10 @@ coverage==6.3.3 flake8==5.0.4 -pytest==7.2.1 -pytest-cov==4.0.0 -pytest-xdist==3.2.0 +pytest==7.3.2 +pytest-cov==4.1.0 +pytest-xdist==3.3.1 setuptools-scm==7.1.0 -sphinx==4.3.2 +sphinx==5.0.2 sphinx-argparse==0.4.0 sphinx-issues==3.0.1 sphinx_rtd_theme==1.2.0 @@ -14,11 +14,9 @@ attrs==21.4.0 appdirs==1.4.4 humanize==4.6.0 pyslim==1.0.1 -numpy==1.21.5; python_version=='3.7' -numpy==1.22.3; python_version>='3.8' -scipy; python_version=='3.7' -scipy==1.10.1; python_version>='3.8' -scikit-allel==1.3.5 +numpy==1.22.3 +scipy==1.10.1 +scikit-allel==1.3.6 biopython==1.80 demes==0.2.2 demesdraw==0.3.1 diff --git a/stdpopsim/catalog/HomSap/dfes.py b/stdpopsim/catalog/HomSap/dfes.py index ee97bc4bc..83e1254a5 100644 --- a/stdpopsim/catalog/HomSap/dfes.py +++ b/stdpopsim/catalog/HomSap/dfes.py @@ -147,3 +147,56 @@ def _HuberLogNormalDFE(): _species.add_dfe(_HuberLogNormalDFE()) + + +def _KyriazisDFE(): + id = "Mixed_K23" + description = "Deleterious Gamma DFE with additional lethals" + long_description = """ + The DFE estimated from human data recommended in Kyriazis et al. + (2023), https://doi.org/10.1086/726736, for general use. + This model is similar to the Kim et al. (2017) DFE based on human + genetic data, modified to include the dominance distribution from + Henn et al. (2016). + The model is also augmented with an additional proportion of 0.3% of + recessive lethals, based on the analysis of Wade et al. (2023). + """ + citations = [ + stdpopsim.Citation( + author="Kyriazis et al.", + year=2023, + doi="https://doi.org/10.1086/726736", + reasons={stdpopsim.CiteReason.DFE}, + ) + ] + neutral = stdpopsim.MutationType() + gamma_mean = -0.0131 + gamma_shape = 0.186 + coefs = [0.45, 0.2, 0.05, 0] + breaks = [0.001, 0.01, 0.1] + gamma = stdpopsim.MutationType( + dominance_coeff_list=coefs, + dominance_coeff_breaks=breaks, + distribution_type="g", # gamma distribution + distribution_args=[gamma_mean, gamma_shape], + ) + lethal = stdpopsim.MutationType( + distribution_type="f", # fixed value + distribution_args=[-1], # fitness in SLiM for homozygotes is multiiplied by 1+s + dominance_coeff=0, + ) + proportion_deleterious = 2.31 / (1 + 2.31) + lethal_prop = proportion_deleterious * 0.003 # 0.3% lethals + gamma_prop = proportion_deleterious - lethal_prop + neutral_prop = 1 - proportion_deleterious + return stdpopsim.DFE( + id=id, + description=description, + long_description=long_description, + mutation_types=[neutral, gamma, lethal], + proportions=[neutral_prop, gamma_prop, lethal_prop], + citations=citations, + ) + + +_species.add_dfe(_KyriazisDFE()) diff --git a/stdpopsim/dfe.py b/stdpopsim/dfe.py index 78ac8e7c1..9136fe82a 100644 --- a/stdpopsim/dfe.py +++ b/stdpopsim/dfe.py @@ -8,11 +8,17 @@ import numpy as np +def _copy_converter(x): + if isinstance(x, list): + x = x.copy() + return x + + @attr.s(kw_only=True) class MutationType(object): """ - Class representing a "type" of mutation. The design closely (exactly) - mirrors SLiM's MutationType class. + Class representing a "type" of mutation. The design closely mirrors SLiM's + MutationType class. The main thing that mutation types carry is a way of drawing a selection coefficient for each new mutation. This ``distribution_type`` should be one @@ -23,6 +29,7 @@ class MutationType(object): - ``g``: gamma, two parameters (mean, shape) - ``n``: normal, two parameters (mean, SD) - ``w``: Weibull, two parameters (scale, shape) + - ``u``: Uniform, two parameters (min, max) - ``lp``: positive logNormal, two parameters (mean and sd on log scale; see rlnorm) - ``ln``: negative logNormal, two parameters (mean and sd on log scale; see rlnorm) @@ -31,28 +38,99 @@ class MutationType(object): exponential and gamma, a negative mean can be provided, obtaining always negative values. + Instead of a single dominance coefficient (which would be specified with + `dominance_coeff`), a discretized relationship between dominance and + selection coefficient can be implemented: if dominance_coeff_list is + provided, mutations with selection coefficient s for which + dominance_coeff_breaks[k-1] <= s <= dominance_coeff_breaks[k] will have + dominance coefficient dominance_coeff[k]. In other words, the first entry + of dominance_coeff_list applies to any mutations with selection coefficient + below the first entry of dominance_coeff_breaks; the second entry of + dominance_coeff_list applies to mutations with selection coefficient + between the first and second entries of dominance_coeff_breaks, and so + forth. The list of breaks must therefore be of length one less than the + list of dominance coefficients. + :ivar distribution_type: A one-letter abbreviation for the distribution of fitness effects that each new mutation of this type draws from (see below). :vartype distribution_type: str :ivar distribution_args: Arguments for the distribution type. :vartype distribution_type: list - :ivar dominance_coeff: A single dominance coefficient (negative = underdominance, + :ivar dominance_coeff: The dominance coefficient (negative = underdominance, 0 = recessive, 0.5 = additive, 1.0 = completely dominant, > 1.0 = overdominant) + Default: 0.5. :vartype dominance_coeff: float :ivar convert_to_substitution: Whether to retain any fixed mutations in the simulation: if not, we cannot ask about their frequency once fixed. (Either way, they will remain in the tree sequence). :vartype convert_to_substitution: bool + :ivar dominance_coeff_list: Either None (the default) or a list of floats describing + a list of dominance coefficients, to apply to different selection coefficients + (see details). Cannot be specified along with dominance_coeff. + :vartype dominance_coeff_list: list of floats + :ivar dominance_coeff_breaks: Either None (the default) or a list of floats + describing the intervals of selection coefficient over which each of the entries + of dominance_coeff_list applies (see details). Must be of length one shorter than + dominance_coeff_list. + :vartype dominance_coeff_breaks: list of floats """ - dominance_coeff = attr.ib(default=0.5, type=float) + dominance_coeff = attr.ib(default=None, type=float) distribution_type = attr.ib(default="f", type=str) - distribution_args = attr.ib(factory=lambda: [0], type=list) + distribution_args = attr.ib( + factory=lambda: [0], type=list, converter=_copy_converter + ) convert_to_substitution = attr.ib(default=True, type=bool) + dominance_coeff_list = attr.ib(default=None, type=list, converter=_copy_converter) + dominance_coeff_breaks = attr.ib(default=None, type=list, converter=_copy_converter) def __attrs_post_init__(self): - if not isinstance(self.dominance_coeff, (float, int)): - raise ValueError("dominance_coeff must be a number.") + if self.dominance_coeff is None and self.dominance_coeff_list is None: + self.dominance_coeff = 0.5 + + if self.dominance_coeff is not None: + if (self.dominance_coeff_list is not None) or ( + self.dominance_coeff_breaks is not None + ): + raise ValueError( + "Cannot specify both dominance_coeff and dominance_coeff_list." + ) + if not isinstance(self.dominance_coeff, (float, int)): + raise ValueError("dominance_coeff must be a number.") + if not np.isfinite(self.dominance_coeff): + raise ValueError( + f"Invalid dominance coefficient {self.dominance_coeff}." + ) + + if self.dominance_coeff_list is not None: + # disallow the inefficient and annoying length-one case + if len(self.dominance_coeff_list) < 2: + raise ValueError("dominance_coeff_list must have at least 2 elements.") + for h in self.dominance_coeff_list: + if not isinstance(h, (float, int)): + raise ValueError("dominance_coeff_list must be a list of numbers.") + if not np.isfinite(h): + raise ValueError(f"Invalid dominance coefficient {h}.") + if self.dominance_coeff_breaks is None: + raise ValueError( + "A list of dominance coefficients provided but no breaks." + ) + if len(self.dominance_coeff_list) != len(self.dominance_coeff_breaks) + 1: + raise ValueError( + "len(dominance_coeff_list) must be equal " + "to len(dominance_coeff_breaks) + 1" + ) + lb = -1 * np.inf + for b in self.dominance_coeff_breaks: + if not isinstance(b, (float, int)): + raise ValueError( + "dominance_coeff_breaks must be a list of numbers." + ) + if not np.isfinite(b): + raise ValueError(f"Invalid dominance coefficient break {b}.") + if b < lb: + raise ValueError("dominance_coeff_breaks must be nondecreasing.") + lb = b if not isinstance(self.distribution_type, str): raise ValueError("distribution_type must be str.") @@ -69,9 +147,6 @@ def __attrs_post_init__(self): if not isinstance(self.convert_to_substitution, bool): raise ValueError("convert_to_substitution must be bool.") - if not np.isfinite(self.dominance_coeff): - raise ValueError(f"Invalid dominance coefficient {self.dominance_coeff}.") - # To add a new distribution type: validate the # distribution_args here, and add unit tests. if self.distribution_type == "f": @@ -143,6 +218,19 @@ def __attrs_post_init__(self): f"return {sign}rlnorm(1, {logmean} + log(Q), {logsd});" ] self.distribution_type = "s" + elif self.distribution_type == "u": + # Uniform + if ( + len(self.distribution_args) != 2 + or self.distribution_args[0] > self.distribution_args[1] + ): + raise ValueError( + "Uniformly-distributed sel. coefs. (distribution_type='u') " + "use a (min, max) parameterisation, with min <= max." + ) + umin, umax = self.distribution_args + self.distribution_args = [f"return runif(1, Q * {umin}, Q * {umax});"] + self.distribution_type = "s" else: raise ValueError( f"{self.distribution_type} is not a supported distribution type." diff --git a/stdpopsim/slim_engine.py b/stdpopsim/slim_engine.py index 69b134d3e..f5d163601 100644 --- a/stdpopsim/slim_engine.py +++ b/stdpopsim/slim_engine.py @@ -503,6 +503,22 @@ } } + // Setup mutation callbacks. + // For each stdpopsim mutation type with an h-s relationship + // we have a sequence of assigned SLiM mutation types; + // the first is the one that gets produced by mutation, + // and the remainder are assigned by a mutation callback. + for (i in seqAlong(mut_types_with_callbacks)) { + mt = mut_types_with_callbacks[i]; + sim.registerMutationCallback(NULL, + "{s = mut.selectionCoeff; " + + "k = findInterval(s, dominance_coeff_breaks_" + mt + "); " + + "mut.setMutationType(dominance_coeff_types_" + mt + "[k]); " + + "return T;}", + mt + ); + } + // Sample individuals. for (i in 0:(ncol(sampling_episodes)-1)) { pop = drop(sampling_episodes[0,i]); @@ -539,7 +555,6 @@ """ - _slim_logfile = """ // Optional logfile output: // Fitness values are only available in early(), @@ -669,7 +684,11 @@ "items": { "type": "object", "properties": { - "dominance_coeff": {"type": "number"}, + "dominance_coeff": {"type": ["number", "null"]}, + "dominance_coeff_list": {"type": ["array", "null"]}, + "dominance_coeff_breaks": { + "type": ["array", "null"] + }, "distribution_type": {"type": "string"}, "distribution_args": { "type": "array", @@ -680,7 +699,7 @@ "type": "array", "items": {"type": "number"}, }, - "slim_mutation_type_id": {"type": "integer"}, + "slim_mutation_type_id": {"type": "array"}, "is_neutral": {"type": "boolean"}, }, }, @@ -759,21 +778,38 @@ def _enum_dfe_and_intervals(contig): def _dfe_to_mtypes(contig): """ - Assigns a mutation type id to each of the mutation types contained in this + Assigns mutation type ids to each of the mutation types contained in this `Contig`. This function will return a dictionary with `len(contig.dfe_list)` elements, in which the position of the DFE in `contig.dfe_list` is the key. - For each DFE, the dictionary holds a list of tuples that contain the an - assigned mutation type id (used in SLiM) and the `MutationType` object. + For each DFE, the dictionary holds a list of tuples that contain the + assigned mutation type ids (used in SLiM) and the `MutationType` object. This is necessary so that we use the same numeric ids to the mutation types in the _add_dfes_to_metadata and slim_makescript steps. + + The assigned mutation type ids is stored as a tuple, that is only of length + greater than one if the mutation type is of a sort that is implemented + using more than one SLiM mutation type; currently, these are only mutation + types with a discretized h-s relationship, and (a) only the first of these + mutation types are actually assigned a positive mutation rate in SLiM, but + (b) all mutations of the first type are transformed to the other mutation + types in the list, and hence never end up in the final tree sequence. (See + Recipe 10.6 in the SLiM manual, "Varying the dominance coefficient among + mutations".) """ mid = 0 dfe_to_mtypes = {} for i, d, _ in _enum_dfe_and_intervals(contig): dfe_to_mtypes[i] = [] for mt in d.mutation_types: - dfe_to_mtypes[i].append((mid, mt)) + # the first mutation type will be the one that has the mutation rate + # and each new mutation gets converted to one of the *other* types; + mid_list = [mid] mid += 1 + if mt.dominance_coeff_list is not None: + for _ in mt.dominance_coeff_list: + mid_list.append(mid) + mid += 1 + dfe_to_mtypes[i].append((tuple(mid_list), mt)) return dfe_to_mtypes @@ -798,9 +834,9 @@ def _get_json(obj): for i, d, ints in _enum_dfe_and_intervals(contig): dfes[i]["intervals"] = ints.tolist() dfes[i]["proportions"] = d.proportions - for j, (mid, mt) in enumerate(dfe_to_mtypes[i]): + for j, (mid_list, mt) in enumerate(dfe_to_mtypes[i]): add = { - "slim_mutation_type_id": mid, + "slim_mutation_type_id": list(mid_list), "is_neutral": mt.is_neutral, } dfes[i]["mutation_types"][j].update(add) @@ -1012,7 +1048,7 @@ def fix_time(event): f"refers to a DFE with intervals {dfe_intervals}, not " f"to a single site." ) - mt_id, mt = slim_mutation_ids[dfe_index][0] + mt_id_list, mt = slim_mutation_ids[dfe_index][0] if not mt.distribution_type == "f": raise ValueError( f"The single site id '{ee.single_site_id}' referenced by " @@ -1021,7 +1057,7 @@ def fix_time(event): f"fixed fitness coefficient." ) coordinate = dfe_intervals[0, 0] - mutation_type_id = mt_id + mutation_type_id = mt_id_list[0] if hasattr(ee, "start_time") and hasattr(ee, "end_time"): # Now that GenerationAfter times have been accounted for, we can # properly catch invalid start/end times. @@ -1176,47 +1212,66 @@ def matrix2str( return "".join(s) # Genomic element types. + mutation_callbacks = {} dfe_mtypes = _dfe_to_mtypes(contig) for j, d, ints in _enum_dfe_and_intervals(contig): - # Mutation types. + # Mutation types and proportions. mut_type_list = [] + mut_props_list = [] assert len(dfe_mtypes[j]) == len(d.mutation_types) - for mid, m in dfe_mtypes[j]: - if len(m.Q_scaled_index) >= 1: - distrib_args = [str(arg) for arg in m.distribution_args] - for k in m.Q_scaled_index: - distrib_args[m.Q_scaled_index[k]] = ( - "Q * " + distrib_args[m.Q_scaled_index[k]] + for mt_index, (mid_list, mt) in enumerate(dfe_mtypes[j]): + if len(mt.Q_scaled_index) >= 1: + distrib_args = [str(arg) for arg in mt.distribution_args] + for k in mt.Q_scaled_index: + distrib_args[mt.Q_scaled_index[k]] = ( + "Q * " + distrib_args[mt.Q_scaled_index[k]] ) distrib_args = ", ".join(distrib_args) # dealing with distributions given by "s" Eidos script, else: - distrib_args = "; ".join([f'"{a}"' for a in m.distribution_args]) - mut_type_list.append(mid) - printsc( - f" initializeMutationType({mid}, {m.dominance_coeff}, " - f'"{m.distribution_type}", {distrib_args});' - ) - if not m.convert_to_substitution: - # T is the default for WF simulations. - printsc(f" m{mid}.convertToSubstitution = F;") - # TODO: when msprime.SLiMMutationModel supports stacking policy, - # set policy such that there's at most a single mutation per-site - # and individual - # printsc(f" m{mid}.mutationStackGroup = 0;") - # printsc(f" m{mid}.mutationStackPolicy = 'l';") - mut_types = ", ".join([str(mt) for mt in mut_type_list]) + distrib_args = "; ".join([f'"{a}"' for a in mt.distribution_args]) + if mt.dominance_coeff_list is None: + h_list = [mt.dominance_coeff] + else: + # this first value will apply only to mutations that are never kept, + # and so has no effect + h_list = [0.5] + h_list.extend(mt.dominance_coeff_list) + # record here what we'll need to set up the callbacks in script + mutation_callbacks[mid_list[0]] = { + "dominance_coeff_breaks": mt.dominance_coeff_breaks, + "mutation_types": mid_list[1:], + } + assert len(mid_list) == len(h_list) + first_mt = True + for mid, h in zip(mid_list, h_list): + # We will assign a proportion of 0.0 to mutation types that are neutral + # unless all the mutation types in a DFE are neutral. In such case, + # SLiM would not allow all mutation types in a genomic element type to + # have 0.0 proportion. And the proportions in SLiM won't matter anyway, + # because all the intervals in this fully neutral DFE will have 0.0 + # mutation rate anyway. + # Furthermore, for mutation types with a discretized h-s relationship, + # we only mutate to the first assigned mutation type, and remaining ones + # are produced by a mutation callback. + mut_type_list.append(mid) + use_prop = first_mt and ((not mt.is_neutral) or d.is_neutral) + p = d.proportions[mt_index] if use_prop else 0.0 + mut_props_list.append(p) + printsc( + f" initializeMutationType({mid}, {h}, " + f'"{mt.distribution_type}", {distrib_args});' + ) + if not mt.convert_to_substitution: + # T is the default for WF simulations. + printsc(f" m{mid}.convertToSubstitution = F;") + # TODO: when msprime.SLiMMutationModel supports stacking policy, + # set policy such that there's at most a single mutation per-site + # and individual + # printsc(f" mt{mid}.mutationStackGroup = 0;") + # printsc(f" mt{mid}.mutationStackPolicy = 'l';") + first_mt = False mut_types = ", ".join([str(mt) for mt in mut_type_list]) - # We will assign a proportion of 0.0 to mutation types that are neutral - # unless all the mutation types in a DFE are neutral. In such case, - # SLiM would not allow all mutation types in a genomic element type to - # have 0.0 proportion. And the proportions in SLiM won't matter anyway, - # because all the intervals in this fully neutral DFE will have 0.0 - # mutation rate anyway. - mut_props_list = [ - prop if (not mt.is_neutral) or d.is_neutral else 0.0 - for prop, mt in zip(d.proportions, d.mutation_types) - ] mut_props = ", ".join(map(str, mut_props_list)) printsc( f" initializeGenomicElementType({j}, c({mut_types}), c({mut_props}));" @@ -1351,6 +1406,30 @@ def matrix2str( ) printsc() + # Mutation callbacks. + printsc( + " // Mutations callbacks for h-s relationships, " + "one variable for each callback." + ) + mut_types_with_callbacks = list(mutation_callbacks.keys()) + printsc( + ' defineConstant("mut_types_with_callbacks", c(' + + ", ".join(map(str, mut_types_with_callbacks)) + + "));" + ) + for mt in mut_types_with_callbacks: + printsc( + f' defineConstant("dominance_coeff_types_{mt}", c(' + + ", ".join(map(str, mutation_callbacks[mt]["mutation_types"])) + + "));" + ) + + printsc( + f' defineConstant("dominance_coeff_breaks_{mt}", c(-INF, ' + + ", ".join(map(str, mutation_callbacks[mt]["dominance_coeff_breaks"])) + + ", INF));" + ) + # Allele frequency conditioning op_types = ", ".join( f'"{op}"' for op in stdpopsim.ext.ConditionOnAlleleFrequency.op_types @@ -1721,7 +1800,7 @@ def _recap_and_rescale( max_id = -1 for dfe in metadata["stdpopsim"]["DFEs"]: for mt in dfe["mutation_types"]: - max_id = max(mt["slim_mutation_type_id"], max_id) + max_id = max(max(mt["slim_mutation_type_id"]), max_id) recap_dfe = { "id": "recapitation", "description": "DFE used in recapitation", @@ -1734,7 +1813,9 @@ def _recap_and_rescale( "distribution_args": [0], "convert_to_substitution": False, "Q_scaled_index": [0], - "slim_mutation_type_id": max_id + 1, + "slim_mutation_type_id": [ + max_id + 1, + ], "is_neutral": True, } ], @@ -1797,12 +1878,12 @@ def _get_msp_rate_map(breaks, is_this_dfe, rate): # Use msprime.SLiMMutationModel rather than msprime.JC69 # for neutral DFEs. This ensures that there will be a - # 'selection_coef' key in the mutation metadata (e.g. the + # 'selection_coef' key in the mutation metadata (so the # mutation metadata structure will be consistent across the # tree sequence). # TODO: set stacking policy to "l" when supported model = msprime.SLiMMutationModel( - type=mt["slim_mutation_type_id"], + type=mt["slim_mutation_type_id"][0], next_id=_get_next_id(ts), ) diff --git a/tests/test_dfes.py b/tests/test_dfes.py index 54b16b305..c8fb2a053 100644 --- a/tests/test_dfes.py +++ b/tests/test_dfes.py @@ -32,9 +32,9 @@ def test_Q_scaled_index(self): "n": ([0.5, 1], [0, 1]), "ln": ([0.5, 1], []), "lp": ([0.5, 1], []), + "u": ([-0.5, 1], []), } for t in mut_params: - print(f"{t}\t{mut_params[t]}") if t == "f": mt = dfe.MutationType( distribution_type=t, @@ -157,6 +157,14 @@ def test_create_bad_mutation_type_message(self): distribution_args=[1, -2], ) + # Uniformly-distributed selection coefficients + for bad_args in ([0], [1, 2, 3], [3, -2]): + with pytest.raises(ValueError, match="use a .min, max"): + dfe.MutationType( + distribution_type="u", + distribution_args=bad_args, + ) + # Lognormally-distributed selection coefficients for dt in ["lp", "ln"]: with pytest.raises( @@ -204,13 +212,14 @@ def test_mutation_types(self): "e": ([0.1], [10], [5000], [0]), "n": ([-0.1, 0.2], [0.1, 0.1], [50, 50]), "w": ([0.1, 0.2], [0.1, 0.1], [50, 50]), + "u": ([0.1, 0.2], [0.1, 0.1], [-5, 50]), "lp": ([-0.1, 0.2], [0.1, 0.1], [50, 50]), "ln": ([-0.1, 0.2], [0.1, 0.1], [50, 50]), } for t in mut_params: for p in mut_params[t]: mt = dfe.MutationType(distribution_type=t, distribution_args=p) - if t in ("lp", "ln"): + if t in ("lp", "ln", "u"): assert mt.distribution_type == "s" else: assert mt.distribution_type == t @@ -225,6 +234,7 @@ def test_bad_mutation_types(self): "e": ([], [0, 1], [0.1, 0.4, 0.5], [np.inf]), "n": ([], [0.1, -1], [0.1, 0.4, 0.5], [0.1], [0.3, np.inf]), "w": ([], [-0.1, 1], [0.1, -1], [0.1, 0.4, 0.5], [0.1], [np.inf, 2.3]), + "u": ([], [-0.1, -0.5], [0.1, 0.4, 0.5], [0.1], [2.3, np.inf]), "lp": ([], [0.1, -1], [0.1, 0.4, 0.5], [0.1], [0.1, np.inf]), "ln": ([], [0.1, -1], [0.1, 0.4, 0.5], [0.1], [0.1, np.inf]), } @@ -248,9 +258,35 @@ def test_dominance_coeff(self): mt = dfe.MutationType(dominance_coeff=dominance_coeff) assert mt.dominance_coeff == dominance_coeff + def test_dominance_coeff_list(self): + for dcl, dcb in ( + ([-0.1, 0.7, 1.2], [-2.1, 1.0]), + ([-0.1, -0.7], [-2.1]), + ): + mt = dfe.MutationType(dominance_coeff_list=dcl, dominance_coeff_breaks=dcb) + assert np.allclose(dcl, mt.dominance_coeff_list) + assert np.allclose(dcb, mt.dominance_coeff_breaks) + + def test_pass_by_value(self): + # make sure that for the arguments that are lists + # we can't post-hoc modify them (and thus bypass validation) + val = 0.5 + x = [val] + mt = dfe.MutationType(distribution_args=x) + x[0] = 2 * val + 1 + assert mt.distribution_args[0] == val + x = [val, val] + mt = dfe.MutationType(dominance_coeff_list=x, dominance_coeff_breaks=[0.0]) + x[0] = 2 * val + 1 + assert mt.dominance_coeff_list[0] == val + x = [val] + mt = dfe.MutationType(dominance_coeff_list=[0.0, 0.0], dominance_coeff_breaks=x) + x[0] = 2 * val + 1 + assert mt.dominance_coeff_breaks[0] == val + def test_bad_dominance_coeff(self): - for dominance_coeff in (np.inf, np.nan, "abc", [], None, {}): - with pytest.raises(ValueError): + for dominance_coeff in (np.inf, np.nan, "abc", [], {}): + with pytest.raises(ValueError, match="dominance.coeff"): dfe.MutationType(dominance_coeff=dominance_coeff) def test_bad_distribution_type(self): @@ -258,6 +294,62 @@ def test_bad_distribution_type(self): with pytest.raises(ValueError): dfe.MutationType(distribution_type=distribution_type) + def test_bad_dominance_coeff_list(self): + dcl = [-0.1, 0.7, 1.2] + dcb = [-2.1, 1.0] + # can't specify both dominance_coeff and list + with pytest.raises(ValueError, match="both dominance_coeff and"): + dfe.MutationType( + dominance_coeff=0.5, + dominance_coeff_list=dcl, + dominance_coeff_breaks=dcb, + ) + with pytest.raises(ValueError, match="both dominance_coeff and"): + dfe.MutationType( + dominance_coeff=0.5, + dominance_coeff_list=dcl, + ) + with pytest.raises(ValueError, match="both dominance_coeff and"): + dfe.MutationType( + dominance_coeff=0.5, + dominance_coeff_breaks=dcb, + ) + # must have both coeffs and breaks + with pytest.raises(ValueError, match="dominance.*no breaks"): + dfe.MutationType(dominance_coeff_list=dcl) + # must have at least 2 bins + with pytest.raises(ValueError, match="dominance.*at least 2"): + dfe.MutationType( + dominance_coeff_list=[0.2], + dominance_coeff_breaks=[], + ) + # list must be one longer than breaks + for x in ([], [0.0], dcl): + with pytest.raises(ValueError, match="dominance.*equal to"): + dfe.MutationType( + dominance_coeff_list=dcl, + dominance_coeff_breaks=x, + ) + # bad coefficients + for x in (np.inf, np.nan, "abc", [], {}): + with pytest.raises(ValueError, match="dominance.coeff"): + dfe.MutationType( + dominance_coeff_list=[x] + dcl[1:], + dominance_coeff_breaks=dcb, + ) + # bad breaks + for x in (np.inf, np.nan, "abc", [], {}): + with pytest.raises(ValueError, match="dominance.*break"): + dfe.MutationType( + dominance_coeff_list=dcl, + dominance_coeff_breaks=[x] + dcb[1:], + ) + with pytest.raises(ValueError, match="nondecreasing"): + dfe.MutationType( + dominance_coeff_list=dcl, + dominance_coeff_breaks=list(reversed(dcb)), + ) + class TestAllDFEs: """ diff --git a/tests/test_slim_engine.py b/tests/test_slim_engine.py index d0e30a562..31bce9c73 100644 --- a/tests/test_slim_engine.py +++ b/tests/test_slim_engine.py @@ -1164,7 +1164,16 @@ class TestGenomicElementTypes(PiecewiseConstantSizeMixin): for t, params in mut_params.items() for p in params ] - assert len(example_mut_types) == 10 + for dcl, dcb in [([0.0, 1.0], [0.0]), ([-0.2, 1.4, 0.5], [-0.1, 0.1])]: + for t in ("f", "e", "n"): + mt = stdpopsim.MutationType( + distribution_type=t, + distribution_args=mut_params[t][0], + dominance_coeff_list=dcl, + dominance_coeff_breaks=dcb, + ) + example_mut_types.append((t, mt)) + assert len(example_mut_types) == 16 def get_example_dfes(self): # this is in a function because scoping is weird @@ -1202,7 +1211,6 @@ def test_slim_simulates_dfes(self): contig = get_test_contig() # make theta = 40 contig.mutation_rate = 20 / (1000 * contig.recombination_map.sequence_length) - print(contig.mutation_rate) dfes = [ stdpopsim.DFE( id=str(j), @@ -1232,7 +1240,6 @@ def test_slim_simulates_dfes(self): ts.tables.sites.position < breaks[j + 1], ) )[0] - print(mt) assert len(sites) > 0 for k in sites: s = ts.site(k) @@ -1297,12 +1304,18 @@ def _slim_proportions(dfe): is_dfe_neutral &= mt["is_neutral"] # Recall a mutation type will have proportion 0.0 in SLiM if it is # neutral and not all the mutation types in the corresponding DFE are - # neutral. SLiM does not allow a genomicelementtype to have all 0.0 + # neutral. SLiM does not allow a genomicElementType to have all 0.0 # proportions. - return [ - prop if (not mt["is_neutral"]) or is_dfe_neutral else 0.0 - for prop, mt in zip(dfe["proportions"], dfe["mutation_types"]) - ] + prop_list = [] + for prop, mt in zip(dfe["proportions"], dfe["mutation_types"]): + first_mt = True + for i in mt["slim_mutation_type_id"]: + nonzero_prop = ( + (not mt["is_neutral"]) or is_dfe_neutral + ) and first_mt + prop_list.append(prop if nonzero_prop else 0.0) + first_mt = False + return prop_list for i, dfe in enumerate(ts.metadata["stdpopsim"]["DFEs"]): if dfe["id"] == "recapitation": @@ -1319,12 +1332,43 @@ def _slim_proportions(dfe): "intervalStarts": int_starts, "mutationFractions": _slim_proportions(dfe), "mutationTypes": [ - mt["slim_mutation_type_id"] for mt in dfe["mutation_types"] + mid + for mt in dfe["mutation_types"] + for mid in mt["slim_mutation_type_id"] ], } - ge = self.slim_metadata_key0(ge_types, str(i)) assert ge == ge_from_meta + def verify_discretized_dominance(self, contig, ts): + # check that the dummy first mutation type of discretized + # h-s relationship mutation types are absent + mut_type_counts = collections.Counter( + x["mutation_type"] + for m in ts.mutations() + for x in m.metadata["mutation_list"] + ) + metadata_ids = [x["id"] for x in ts.metadata["stdpopsim"]["DFEs"]] + slim_mt_info = ts.metadata["SLiM"]["user_metadata"]["mutationTypes"][0] + has_recap = metadata_ids[-1] == "recapitation" + slim_to_mt_map = {} + assert len(contig.dfe_list) + has_recap == len(ts.metadata["stdpopsim"]["DFEs"]) + for dfe, ts_metadata in zip(contig.dfe_list, ts.metadata["stdpopsim"]["DFEs"]): + assert dfe.id == ts_metadata["id"] + assert len(dfe.mutation_types) == len(ts_metadata["mutation_types"]) + for mt, md in zip(dfe.mutation_types, ts_metadata["mutation_types"]): + if mt.dominance_coeff_list is not None: + assert len(mt.dominance_coeff_list) + 1 == len( + md["slim_mutation_type_id"] + ) + first_type = md["slim_mutation_type_id"][0] + assert mut_type_counts[first_type] == 0 + for h, k in zip( + mt.dominance_coeff_list, md["slim_mutation_type_id"][1:] + ): + assert str(k) in slim_mt_info + assert np.allclose(slim_mt_info[str(k)][0]["dominanceCoeff"], h) + slim_to_mt_map[k] = mt + def verify_genomic_elements(self, contig, ts): # compare information about genomic elements as recorded # by SLiM itself in metadata to what we think it should be @@ -1335,6 +1379,7 @@ def verify_genomic_elements(self, contig, ts): ts.metadata["SLiM"]["user_metadata"], "genomicElementTypes" ) self.verify_dfes_metadata(ts, ge_types) + self.verify_discretized_dominance(contig, ts) assert len(contig.dfe_list) == len(ge_types) for j, (dfe, intervals) in enumerate( zip(contig.dfe_list, contig.interval_list) @@ -1343,13 +1388,21 @@ def verify_genomic_elements(self, contig, ts): ge = self.slim_metadata_key0(ge_types, str(j)) # checking that the neutral mutations have 0.0 proportion unless # all the mutations are neutral in this dfe - for mt, dfe_prop, slim_prop in zip( - dfe.mutation_types, dfe.proportions, ge["mutationFractions"] - ): + assert len(dfe.mutation_types) == len(dfe.proportions) + ge_index = 0 + for mt, dfe_prop in zip(dfe.mutation_types, dfe.proportions): + slim_prop = ge["mutationFractions"][ge_index] + ge_index += 1 if mt.is_neutral and not dfe.is_neutral: assert slim_prop == 0.0 else: assert slim_prop == dfe_prop + if mt.dominance_coeff_list is not None: + for _ in mt.dominance_coeff_list: + slim_prop = ge["mutationFractions"][ge_index] + assert slim_prop == 0.0 + ge_index += 1 + assert ge_index == len(ge["mutationFractions"]) # "+1" because SLiM's intervals are closed on both ends, # stdpopsim's are closed on the left, open on the right slim_intervals = np.column_stack( @@ -1359,20 +1412,33 @@ def verify_genomic_elements(self, contig, ts): ] ) assert_array_equal(intervals, slim_intervals) - slim_mut_types = ge["mutationTypes"] - assert len(dfe.mutation_types) == len(slim_mut_types) - for mt, mt_id in zip(dfe.mutation_types, slim_mut_types): + ge_index = 0 + for mt in dfe.mutation_types: + mt_id = ge["mutationTypes"][ge_index] + ge_index += 1 assert str(mt_id) in mut_types slim_mt = self.slim_metadata_key0(mut_types, str(mt_id)) - assert mt.dominance_coeff == self.slim_metadata_key0( - slim_mt, "dominanceCoeff" - ) + if mt.dominance_coeff_list is None: + assert mt.dominance_coeff == self.slim_metadata_key0( + slim_mt, "dominanceCoeff" + ) + else: + assert 0.5 == self.slim_metadata_key0(slim_mt, "dominanceCoeff") + for h in mt.dominance_coeff_list: + mt_id = ge["mutationTypes"][ge_index] + ge_index += 1 + assert str(mt_id) in mut_types + slim_mt = self.slim_metadata_key0(mut_types, str(mt_id)) + assert np.allclose( + h, self.slim_metadata_key0(slim_mt, "dominanceCoeff") + ) assert mt.distribution_type == self.slim_metadata_key0( slim_mt, "distributionType" ) assert len(mt.distribution_args) == len(slim_mt["distributionParams"]) for a, b in zip(mt.distribution_args, slim_mt["distributionParams"]): assert a == b + assert ge_index == len(ge["mutationTypes"]) def test_no_dfe(self): contig = get_test_contig() @@ -1630,6 +1696,65 @@ def test_chromosomal_segment(self): self.verify_mutation_rates(contig, ts) assert int(ts.sequence_length) == right - left + def test_dominance_coeff_list(self): + # test that discretized h-s relationships work + contig = get_test_contig() + L = int(contig.length) + emts = [emt for _, emt in self.example_mut_types[10:12]] + for emt in emts: + assert emt.dominance_coeff_list is not None + dfe0 = stdpopsim.DFE( + id="dfe", + description="the first one", + long_description="hello world", + proportions=[1.0, 0.0], + mutation_types=emts, + ) + emts = [emt for _, emt in self.example_mut_types[12:15]] + for emt in emts: + assert emt.dominance_coeff_list is not None + dfe1 = stdpopsim.DFE( + id="dfe", + description="the second one", + long_description="I'm different but have the same name! =( =(", + proportions=[0.2, 0.7, 0.1], + mutation_types=emts, + ) + contig.add_dfe(np.array([[0, 0.5 * L]], dtype="int"), dfe0) + contig.add_dfe(np.array([[0.2 * L, 0.4 * L]], dtype="int"), dfe0) + contig.add_dfe(np.array([[0.45 * L, L]], dtype="int"), dfe1) + contig.add_dfe(np.array([[0.7 * L, 0.9 * L]], dtype="int"), dfe0) + assert len(contig.dfe_list) == 5 + engine = stdpopsim.get_engine("slim") + contig.mutation_rate *= 10 + ts = engine.simulate( + demographic_model=self.model, + contig=contig, + samples=self.samples, + verbosity=3, # to get metadata output + ) + assert len(ts.metadata["stdpopsim"]["DFEs"]) == len(contig.dfe_list) + 1 + # slim mutation type IDs with dominance coeff lists: + mut_id_haslist = {} + for dfe in ts.metadata["stdpopsim"]["DFEs"]: + for mt in dfe["mutation_types"]: + haslist = ( + "dominance_coeff_list" in mt + and mt["dominance_coeff_list"] is not None + ) + for slim_id in mt["slim_mutation_type_id"]: + assert slim_id not in mut_id_haslist + mut_id_haslist[slim_id] = haslist + num_target_muts = 0 + for mut in ts.mutations(): + for md in mut.metadata["mutation_list"]: + if mut_id_haslist[md["mutation_type"]]: + num_target_muts += 1 + # the number 20 is not important, just want to make sure we have *some* + assert num_target_muts > 20 + self.verify_genomic_elements(contig, ts) + self.verify_mutation_rates(contig, ts) + @pytest.mark.skipif(IS_WINDOWS, reason="SLiM not available on windows") class TestLogfile(PiecewiseConstantSizeMixin):