diff --git a/.gitignore b/.gitignore index 8ad720caa..777fd2adc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # Cache with transcript/protein pickle files .gpsea_cache +dev/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/README.md b/README.md index 2a498eb42..c4b00093d 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,8 @@ ![PyPi downloads](https://img.shields.io/pypi/dm/gpsea.svg?label=Pypi%20downloads) ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/gpsea) -GPSEA is a Python library for discovery of genotype-phenotype associations. +GPSEA (Genotypes and Phenotypes - Statistical Evaluation of Associations, pronounced "G"-"P"-"C") is a Python package designed to support genotype-phenotype correlation analysis. + See the [Tutorial](https://monarch-initiative.github.io/gpsea/stable/tutorial.html) and a comprehensive [User guide](https://monarch-initiative.github.io/gpsea/stable/user-guide/index.html) diff --git a/docs/conf.py b/docs/conf.py index 6233764c3..c311c2f80 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -60,9 +60,9 @@ # built documents. # # The short X.Y version. -version = u'0.3' +version = u'0.4' # The full version, including alpha/beta/rc tags. -release = u'0.3.0' +release = u'0.4.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/index.rst b/docs/index.rst index f9478fa29..5bcadd386 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -10,8 +10,8 @@ A key question in biology and human genetics concerns the relationships between genetics, the focus is generally placed on the study of whether specific disease-causing alleles are associated with specific phenotypic manifestations of the disease. -`GPSEA` (genotypes and phenotypes - study and evaluation of associations) is a Python package designed to support genotype-phenotype correlation analysis. -We pronounce GPSEA as "G"-"P"-"C". The input to `GPSEA` is a collection of `Global Alliance for Genomics and Health (GA4GH) Phenopackets `_. +`GPSEA` (Genotypes and Phenotypes - Statistical Evaluation of Associations, pronounced "G"-"P"-"C") is a Python package designed to support genotype-phenotype correlation analysis. +The input to `GPSEA` is a collection of `Global Alliance for Genomics and Health (GA4GH) Phenopackets `_. `gpsea` ingests data from these phenopackets and performs analysis of the correlation of specific variants, variant types (e.g., missense vs. premature termination codon), or variant location in protein motifs or other features. The phenotypic abnormalities are represented by `Human Phenotype Ontology (HPO) `_ terms. diff --git a/docs/report/tbx5_all_variants.html b/docs/report/tbx5_all_variants.html new file mode 100644 index 000000000..a96181629 --- /dev/null +++ b/docs/report/tbx5_all_variants.html @@ -0,0 +1,583 @@ + + + + + Cohort + + + + +

GPSEA cohort analysis: All variant alleles

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

Variant alleles

+ A total of 53 unique alleles were identified in the cohort. +
Variant keyVariant (cDNA)Variant (protein)EffectsCount
12_114385521_114385521_C_Tc.710G>Ap.Arg237Glnmissense22
12_114401830_114401830_C_Tc.238G>Ap.Gly80Argmissense20
12_114385563_114385563_G_Ac.668C>Tp.Thr223Metmissense8
12_114398675_114398675_G_Tc.408C>Ap.Tyr136Terstop gained6
12_114399514_114399514_A_Cc.361T>Gp.Trp121Glymissense, splice region5
12_114403792_114403792_C_CGc.106_107insCp.Ser36ThrfsTer25frameshift5
12_114398682_114398682_C_CGc.400dupp.Arg134ProfsTer49frameshift5
12_114385474_114385474_A_Gc.755+2T>CNonesplice donor4
12_114398656_114398656_C_CGc.426dupp.Ala143ArgfsTer40frameshift4
12_114366360_114366360_C_Tc.787G>Ap.Val263Metmissense4
12_114385522_114385522_G_Ac.709C>Tp.Arg237Trpmissense4
12_114403798_114403798_G_GCc.100dupp.Ala34GlyfsTer27frameshift4
12_114399613_114399613_T_Ac.262A>Tp.Lys88Terstop gained3
12_114401827_114401827_T_Ac.241A>Tp.Arg81Trpmissense, splice region3
12_114366312_114366312_G_Ac.835C>Tp.Arg279Terstop gained3
12_114366366_114366366_T_Ac.781A>Tp.Ser261Cysmissense3
12_114403798_114403799_GC_Gc.100delp.Ala34ProfsTer32frameshift3
12_114385475_114385475_C_Tc.755+1G>ANonesplice donor3
12_114401853_114401853_G_Tc.215C>Ap.Thr72Lysmissense3
12_114385521_114385521_C_Gc.710G>Cp.Arg237Promissense2
12_114398568_114398568_C_Ac.510+5G>TNonesplice donor 5th base, intronic2
12_114366207_114366208_GC_Gc.939delp.Gln315ArgfsTer79frameshift2
12_114366274_114366274_G_Tc.873C>Ap.Tyr291Terstop gained2
12_114366267_114366267_C_Ac.880G>Tp.Glu294Terstop gained2
12_114398578_114398579_CA_Cc.504delp.Phe168LeufsTer6frameshift2
12_114394762_114394763_CA_Cc.641delp.Val214GlyfsTer12frameshift2
12_114398666_114398667_TG_Tc.416delp.Pro139GlnfsTer11frameshift2
12_114403754_114403754_G_Tc.145C>Ap.Gln49Lysmissense, splice region2
12_114385550_114385550_A_AATTATTCTCAGc.680_681insCTGAGAATAATp.Ile227_Glu228insTerinframe insertion, stop retainined2
12_114366348_114366349_CT_Cc.798delp.Val267TrpfsTer127frameshift1
12_114399559_114399559_T_Cc.316A>Gp.Ile106Valmissense1
12_114356064_114356065_TA_Tc.1024delp.Tyr342ThrfsTer52frameshift1
12_114355755_114355756_TG_Tc.1333delp.His445MetfsTer137frameshift1
12_114398632_114398632_G_Ac.451C>Tp.Gln151Terstop gained1
12_114401907_114401907_A_Gc.161T>Cp.Ile54Thrmissense1
12_114355723_114355723_G_Ac.1366C>Tp.Gln456Terstop gained1
12_114398602_114398602_T_Gc.481A>Cp.Thr161Promissense1
12_114398708_114398709_GC_Gc.374delp.Gly125AlafsTer25frameshift1
12_114385553_114385553_C_Ac.678G>Tp.Lys226Asnmissense1
12_114399633_114399633_C_Gc.243-1G>CNonesplice acceptor1
12_114401873_114401874_TA_Tc.194delp.Leu65GlnfsTer10frameshift1
12_114399594_114399594_A_Cc.281T>Gp.Leu94Argmissense1
12_114399622_114399622_G_Tc.253C>Ap.Pro85Thrmissense1
12_114394743_114394746_TGTG_Tc.658_660delp.His220delinframe deletion1
12_114394820_114394820_C_Gc.584G>Cp.Gly195Alamissense1
12_114401846_114401846_C_Gc.222G>Cp.Met74Ilemissense1
12_114366241_114366242_CT_Cc.905delp.Gln302ArgfsTer92frameshift1
12_114355784_114355785_CA_Cc.1304delp.Leu435ArgfsTer147frameshift1
12_114398626_114398627_CG_Cc.456delp.Val153SerfsTer21frameshift1
12_114403859_114403859_G_Tc.40C>Ap.Pro14Thrmissense1
12_114399625_114399629_ACATC_Ac.246_249delp.Met83PhefsTer6frameshift1
12_114394817_114394817_G_Cc.587C>Gp.Ser196Terstop gained1
12_114401921_114401921_C_Gc.148-1G>CNonesplice acceptor1
+ + + + + \ No newline at end of file diff --git a/docs/report/tbx5_frameshift_vs_missense.csv b/docs/report/tbx5_frameshift_vs_missense.csv index 0bc1e5c6c..4bffc6ea1 100644 --- a/docs/report/tbx5_frameshift_vs_missense.csv +++ b/docs/report/tbx5_frameshift_vs_missense.csv @@ -1,22 +1,18 @@ -"Genotype group: Missense, Frameshift",Missense,Missense,Frameshift,Frameshift,, +Allele group,Missense,Missense,Frameshift,Frameshift,, ,Count,Percent,Count,Percent,Corrected p values,p values -Ventricular septal defect [HP:0001629],31/60,52%,19/19,100%,0.0009552459156234353,5.6190936213143254e-05 -Abnormal atrioventricular conduction [HP:0005150],0/22,0%,3/3,100%,0.003695652173913043,0.00043478260869565214 -Atrioventricular block [HP:0001678],0/22,0%,2/2,100%,0.015398550724637682,0.0036231884057971015 -Heart block [HP:0012722],0/22,0%,2/2,100%,0.015398550724637682,0.0036231884057971015 -Absent thumb [HP:0009777],12/71,17%,14/31,45%,0.0191369345329502,0.005628510156750059 -Patent ductus arteriosus [HP:0001643],3/37,8%,2/2,100%,0.038236617183985605,0.01349527665317139 -Triphalangeal thumb [HP:0001199],13/72,18%,13/32,41%,0.062175372424826694,0.02560162393963452 -Cardiac conduction abnormality [HP:0031546],14/36,39%,3/3,100%,0.15811357916621074,0.07440639019586388 -Secundum atrial septal defect [HP:0001684],14/35,40%,4/22,18%,0.2690764879148444,0.1424522583078588 -Muscular ventricular septal defect [HP:0011623],6/59,10%,6/25,24%,0.2868675985983051,0.1687456462342971 -Pulmonary arterial hypertension [HP:0002092],4/6,67%,0/2,0%,0.6623376623376622,0.42857142857142855 -Hypoplasia of the ulna [HP:0003022],1/12,8%,2/10,20%,0.8095238095238093,0.5714285714285713 +Ventricular septal defect [HP:0001629],31/60,52%,19/19,100%,0.0008990549794102921,5.6190936213143254e-05 +Abnormal atrioventricular conduction [HP:0005150],0/22,0%,3/3,100%,0.003478260869565217,0.00043478260869565214 +Atrioventricular block [HP:0001678],0/22,0%,2/2,100%,0.01932367149758454,0.0036231884057971015 +Absent thumb [HP:0009777],12/71,17%,14/31,45%,0.022514040627000236,0.005628510156750059 +Patent ductus arteriosus [HP:0001643],3/37,8%,2/2,100%,0.04318488529014845,0.01349527665317139 +Triphalangeal thumb [HP:0001199],13/72,18%,13/32,41%,0.06827099717235872,0.02560162393963452 +Cardiac conduction abnormality [HP:0031546],14/36,39%,3/3,100%,0.17007174901911745,0.07440639019586388 +Secundum atrial septal defect [HP:0001684],14/35,40%,4/22,18%,0.2849045166157176,0.1424522583078588 +Muscular ventricular septal defect [HP:0011623],6/59,10%,6/25,24%,0.29999225997208373,0.1687456462342971 +Pulmonary arterial hypertension [HP:0002092],4/6,67%,0/2,0%,0.6857142857142857,0.42857142857142855 +Hypoplasia of the ulna [HP:0003022],1/12,8%,2/10,20%,0.831168831168831,0.5714285714285713 Hypoplasia of the radius [HP:0002984],30/62,48%,6/14,43%,1.0,0.7735491022101784 Short thumb [HP:0009778],11/41,27%,8/30,27%,1.0,1.0 +Atrial septal defect [HP:0001631],42/44,95%,20/20,100%,1.0,1.0 Absent radius [HP:0003974],7/32,22%,6/25,24%,1.0,1.0 Short humerus [HP:0005792],7/17,41%,4/9,44%,1.0,1.0 -Atrial septal defect [HP:0001631],42/44,95%,20/20,100%,1.0,1.0 -Abnormal ventricular septum morphology [HP:0010438],31/31,100%,19/19,100%,, -Abnormal cardiac ventricle morphology [HP:0001713],31/31,100%,19/19,100%,, -Abnormal heart morphology [HP:0001627],62/62,100%,30/30,100%,, diff --git a/docs/report/tbx5_frameshift_vs_missense.mtc_report.html b/docs/report/tbx5_frameshift_vs_missense.mtc_report.html index b6ac17472..14f949c39 100644 --- a/docs/report/tbx5_frameshift_vs_missense.mtc_report.html +++ b/docs/report/tbx5_frameshift_vs_missense.mtc_report.html @@ -48,9 +48,9 @@

Phenotype testing report

Phenotype MTC filter: HPO MTC filter

Multiple testing correction: fdr_bh

-

Performed statistical tests for 17 out of the total of 260 HPO terms.

+

Performed statistical tests for 16 out of the total of 260 HPO terms.

- + @@ -59,80 +59,45 @@

Phenotype testing report

- - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 0a583b39e..54bf4bc89 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -84,7 +84,7 @@ a :class:`~gpsea.model.Cohort`: ... phenopackets=phenopackets, ... cohort_creator=cohort_creator, ... ) -Patients Created: ... +Individuals Processed: ... and we will check that there are no Q/C issues: @@ -103,7 +103,13 @@ We loaded the patient data into a `cohort` which is ready for the next steps. Explore cohort ^^^^^^^^^^^^^^ -We can now explore the cohort to see how many patients are included. +GPSEA helps with gaining insight into the cohort by providing + + +Show cohort summary +------------------- + +The summary report provides an overview about the HPO terms, variants, diseases, and variant effects that occurr most frequently: >>> from gpsea.view import CohortViewable >>> viewer = CohortViewable(hpo) @@ -121,6 +127,10 @@ We can now explore the cohort to see how many patients are included. from IPython.display import HTML, display display(HTML(report)) + +Plot distribution of variants with respect to the protein sequence +------------------------------------------------------------------ + Now we can show the distribution of variants with respect to the encoded protein. We first obtain `tx_coordinates` (:class:`~gpsea.model.TranscriptCoordinates`) and `protein_meta` (:class:`~gpsea.model.ProteinMetadata`) @@ -154,25 +164,44 @@ and we follow with plotting the diagram of the mutations on the protein: :width: 600px +.. _show-cohort-variants: + +Summarize all variant alleles +----------------------------- + +We can prepare a table of all variant alleles that occurr in the cohort. +Each table row corresponds to a single allele and lists the variant key, +the predicted effect on the transcript (*cDNA*) and protein of interest, +the variant effects, and the number of patients who present +with one or more variant alleles (*Count*): + +>>> from gpsea.view import CohortVariantViewer +>>> viewer = CohortVariantViewer(tx_id=tx_id) +>>> report = viewer.process(cohort=cohort) +>>> with open('docs/report/tbx5_all_variants.html', 'w') as fh: # doctest: +SKIP +... _ = fh.write(report) + +.. raw:: html + :file: report/tbx5_all_variants.html + + Prepare genotype and phenotype predicates ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ We will create a predicate to bin patients into group -depending on presence of a missense and frameshift variant to test +depending on presence of a single allele of a missense or frameshift variant to test if there is a difference between frameshift and non-frameshift variants in the individuals of the *TBX5* cohort. >>> from gpsea.model import VariantEffect ->>> from gpsea.analysis.predicate.genotype import VariantPredicates, groups_predicate ->>> gt_predicate = groups_predicate( -... predicates=( -... VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id), -... VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) -... ), -... group_names=('Missense', 'Frameshift'), +>>> from gpsea.analysis.predicate.genotype import VariantPredicates, monoallelic_predicate +>>> gt_predicate = monoallelic_predicate( +... a_predicate=VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id), +... b_predicate=VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id), +... names=('Missense', 'Frameshift') ... ) >>> gt_predicate.display_question() -'Genotype group: Missense, Frameshift' +'Allele group: Missense, Frameshift' .. note:: @@ -246,15 +275,15 @@ Now we can perform the analysis and investigate the results. ... pheno_predicates=pheno_predicates, ... ) >>> result.total_tests -17 +16 -We only tested 1y HPO terms. This is despite the individuals being collectively annotated with +We only tested 16 HPO terms. This is despite the individuals being collectively annotated with 260 direct and indirect HPO terms >>> len(result.phenotypes) 260 -We can show the reasoning behind *not* testing 243 (`260 - 17`) HPO terms +We can show the reasoning behind *not* testing 244 (`260 - 16`) HPO terms by exploring the phenotype MTC filtering report. >>> from gpsea.view import MtcStatsViewer @@ -266,11 +295,11 @@ by exploring the phenotype MTC filtering report. .. raw:: html :file: report/tbx5_frameshift_vs_missense.mtc_report.html -and these are the top 20 HPO terms ordered by the p value corrected with the Benjamini-Hochberg procedure: +and these are the tested HPO terms ordered by the p value corrected with the Benjamini-Hochberg procedure: ->>> from gpsea.analysis.predicate import PatientCategories ->>> summary_df = result.summarize(hpo, PatientCategories.YES) ->>> summary_df.head(20).to_csv('docs/report/tbx5_frameshift_vs_missense.csv') # doctest: +SKIP +>>> from gpsea.view import summarize_hpo_analysis +>>> summary_df = summarize_hpo_analysis(hpo, result) +>>> summary_df.to_csv('docs/report/tbx5_frameshift_vs_missense.csv') # doctest: +SKIP .. csv-table:: *TBX5* frameshift vs missense :file: report/tbx5_frameshift_vs_missense.csv @@ -283,4 +312,4 @@ was observed in 31/60 (52%) patients with a missense variant but it was observed in 19/19 (100%) patients with a frameshift variant. Fisher exact test computed a p value of `~0.0000562` and the p value corrected by Benjamini-Hochberg procedure -is `~0.000955`. +is `~0.000899`. diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst index f3e0569ab..08097f40f 100644 --- a/docs/user-guide/index.rst +++ b/docs/user-guide/index.rst @@ -4,7 +4,13 @@ User guide ========== -TODO - write high level overview and bridge to individual sections. +GPSEA allows users to perform many different kinds of genotype-phenotype correlation (GPCs) analysis. See the :ref:`tutorial` for an introduction. +In general, the analysis will include steps for data input, exploration of the cohort to assist in generating hypotheses about potential GPCs to test, +corresponding choice of genotype and phenotype predicates to perform the test, choice of statistical test, and approach to multiple-testing correction (mtc). +The pages shown in the table of contents provide more information about each step. + + + .. toctree:: :maxdepth: 1 @@ -13,7 +19,6 @@ TODO - write high level overview and bridge to individual sections. input-data exploratory predicates - phenotype_predicates stats mtc glossary diff --git a/docs/user-guide/input-data.rst b/docs/user-guide/input-data.rst index b94379b36..1abd1ab8f 100644 --- a/docs/user-guide/input-data.rst +++ b/docs/user-guide/input-data.rst @@ -80,7 +80,7 @@ loader function: >>> from gpsea.preprocessing import load_phenopackets >>> cohort, qc_results = load_phenopackets(phenopackets, cohort_creator) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE -Patients Created: ... +Individuals Processed: ... >>> len(cohort) 19 diff --git a/docs/user-guide/mtc.rst b/docs/user-guide/mtc.rst index 070539d19..7f47e2286 100644 --- a/docs/user-guide/mtc.rst +++ b/docs/user-guide/mtc.rst @@ -192,62 +192,96 @@ We use static constructor :func:`~gpsea.analysis.mtc_filter.HpoMtcFilter.default for creating :class:`~gpsea.analysis.mtc_filter.HpoMtcFilter`. The constructor takes a threshold as an argument (e.g. 20% in the example above) and the method's logic is made up of 8 individual heuristics -designed to skip testing the HPO terms that are unlikely to yield significant or interesting results: - -#. Skip terms that occur very rarely - The ``term_frequency_threshold`` determines the mininum proportion of individuals - with direct or indirect annotation by the HPO term to test. - We check each of the genotype groups (e.g., MISSENSE vs. not-MISSENSE), and we only retain a term for testing - if the proportion of individuals in at least one genotype group is greater than or equal to ``term_frequency_threshold``. - - This is because of our assumption that even if there is statistical significance, - if a term is only seen in (for example) 7% of individuals in the MISSENSE group and 2% in the not-MISSENSE group, - the term is unlikely to be of great interest because it is rare. - -#. Skip terms if no cell has more than one count - In a related heuristic, we skip terms if no genotype group has more than one count. - This is not completely redundant with the previous condition, - because some terms may have a small number of total observations. - -#. Skip terms if all counts are identical to counts for a child term - Let's say a term such as - `Posterior polar cataract (HP:0001115) `_ - was observed in 7 of 11 individuals with MISSENSE variants - and in 3 of 8 individuals with NONSENSE variants. - If we find the same patient counts (7 of 11 and 3 of 8) in the parent term - `Polar cataract HP:0010696 `_, - then we choose to not test the parent term. - - This is because the more specific an HPO term is, - the more information it has (the more interesting the correlation would be if it exists), - and the result of the Fisher Exact test for *Polar cataract* - would be exactly the same as for *Posterior polar cataract*. - -#. Skip terms if genotypes have same HPO proportions - If both (or all) of the genotype groups have the same proportion of individuals - observed to be annotated to an HPO term, e.g., both are 50%, then skip the term, - because it is not possible that the Fisher exact test will return a significant result. - -#. Skip terms if there are no HPO observations in a group - If one of the genotype groups has neither observed nor excluded observations for an HPO term, skip it. - -#. Skipping terms that are not descendents of `Phenotypic abnormality (HP:0000118) `_ - The HPO has a number of other branches that describe modes of inheritance, - past medical history, and clinical modifiers. - We do not think it makes much sense to test for enrichment of these terms, - and so they are filtered out. - -#. Skipping "general" level terms - All the direct children of the root phenotype term - `Phenotypic abnormality (HP:0000118) `_ are skipped, - because of the assumption that if there is a valid signal, - it will derive from one of the more specific descendents. - - For instance, `Abnormality of the nervous system (HP:0000707) `_ - is a child of *Phenotypic abnormality*, and this assumption implies - that if there is a signal from the nervous system, - it will lead to at least one of the descendents of - *Abnormality of the nervous system* being significant. - - See :ref:`general-hpo-terms` section for details. +designed to skip testing the HPO terms that are unlikely to yield significant or interesting results. + +`HMF01` - Skip terms that occur very rarely +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``term_frequency_threshold`` determines the mininum proportion of individuals +with direct or indirect annotation by the HPO term to test. +We check each of the genotype groups (e.g., MISSENSE vs. not-MISSENSE), +and we only retain a term for testing if the proportion of individuals +in at least one genotype group is greater than +or equal to ``term_frequency_threshold``. +This is because of our assumption that even if there is statistical significance, +if a term is only seen in (for example) 7% of individuals +in the MISSENSE group and 2% in the not-MISSENSE group, +the term is unlikely to be of great interest because it is rare. + + +`HMF02` - Skip terms if no genotype group has more than one count +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In a related heuristic, we skip terms if no genotype group has more +than one count. This is not completely redundant with the previous condition, +because some terms may have a small number of total observations. + + +`HMF03` - Skip terms if all counts are identical to counts for a child term +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Let's say a term such as +`Posterior polar cataract (HP:0001115) `_ +was observed in 7 of 11 individuals with MISSENSE variants +and in 3 of 8 individuals with NONSENSE variants. +If we find the same patient counts (7 of 11 and 3 of 8) in the parent term +`Polar cataract HP:0010696 `_, +then we choose to not test the parent term. + +This is because the more specific an HPO term is, +the more information it has (the more interesting the correlation would be if it exists), +and the result of a test, such as the Fisher Exact test, would be exactly the same +for *Polar cataract* as for *Posterior polar cataract*. + + +`HMF04` - Skip terms if genotypes have same HPO proportions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If both (or all) of the genotype groups have the same proportion of individuals +observed to be annotated to an HPO term, e.g., both are 50%, then skip the term, +because it is not possible that the Fisher exact test will return a significant result. + + +`HMF05` - Skip term if one of the genotype groups has neither observed nor excluded observations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Skip terms if there are no HPO observations in a group. + + +`HMF06` - Skip term if underpowered for 2x2 or 2x3 analysis +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If the individuals are binned into 2 phenotype groups and 2 genotype groups (2x2) +and the total count of patients in all genotype-phenotype groups is less than 7, +or into 2 phenotype groups and 3 genotype groups (2x3) and the total count of patients +is less than 6, then there is a lack even of the nominal statistical power +and the counts can never be significant. + + +`HMF07` - Skipping terms that are not descendents of *Phenotypic abnormality* +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The HPO has a number of other branches that describe modes of inheritance, +past medical history, and clinical modifiers. +We do not think it makes much sense to test for enrichment of these terms, +so, all terms that are not descendants of +`Phenotypic abnormality `_ are filtered out. + + +`HMF08` - Skipping "general" level terms +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +All the direct children of the root phenotype term +`Phenotypic abnormality (HP:0000118) `_ +are skipped, because of the assumption that if there is a valid signal, +it will derive from one of the more specific descendents. + +For instance, +`Abnormality of the nervous system `_ +(HP:0000707) is a child of *Phenotypic abnormality*, and this assumption implies +that if there is a signal from the nervous system, +it will lead to at least one of the descendents of +*Abnormality of the nervous system* being significant. + +See :ref:`general-hpo-terms` section for details. diff --git a/docs/user-guide/predicates.rst b/docs/user-guide/predicates.rst index 785c3ef0c..bfd533b78 100644 --- a/docs/user-guide/predicates.rst +++ b/docs/user-guide/predicates.rst @@ -4,502 +4,35 @@ Predicates ========== -Searching for genotype-phenotype associations usually requires to partition -the individuals into several discrete groups to allow testing for the inter-group differences. +Searching for genotype-phenotype associations usually requires that +the individuals are partitioned into two or more discrete groups to allow testing for the inter-group differences. GPSEA reflects these requirements with its predicate API. -Perhaps unsurprisingly, a predicate must be capable of partitioning the individuals into two or more groups. + + +A **predicate** must be capable of partitioning the individuals into two or more groups. The groups must be *exclusive* - each individual must be assigned at most into one group. -Moreover, the groups should be *exhaustive* and cover maximum of the possible states. +In general, it is desirable that the groups cover all or at least the majority of the cohort being analyzed to maximize statistical power. However, the predicate is allowed to return `None` if the individual cannot be assigned. -In result, the individual will be omitted from the downstream analysis. +As a result, the individual will be omitted from the downstream analysis. -Predicates serve both *genotype* and *phenotype* prongs of the analysis. +Predicates serve both the *genotype* and *phenotype* prongs of the analysis. Genotype predicates (:class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate`) assign the :class:`~gpsea.model.Patient` into a group (mostly) based on the variant information, while the phenotype predicates (:class:`~gpsea.analysis.predicate.phenotype.PhenotypePolyPredicate`) use the HPO terms to assign a group. -It is literally impossible to use GPSEA without the predicates -because all analyses need at least one predicate (typically a *genotype* predicate). -Luckily, the purpose of this guide is to show all that is to know about predicates. -We will first discuss the genotype predicates and end with phenotype predicates. - -.. _genotype-predicates: - -******************* -Genotype predicates -******************* - -A genotype predicate seeks to divide the individuals along an axis that is orthogonal to phenotypes. -Typically, this includes using the genotype data, such as presence of a missense variant -in a heterozygous genotype. However, other categorical variables, -such as diagnoses (TODO - add link to disease predicate) or cluster ids can also be used. - -The genotype predicates test the individual for a presence of variants that meet certain inclusion criteria. -The testing is done in two steps. First, we count the alleles -of the matching variants and then we interpret the count, possibly including factors -such as the expected mode of inheritance and sex, to assign the individual into a group. -Finding the matching variants is what -the :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` is all about. - - -TODO: wordsmith -We must first create the variant predicate and then wrap it in genotype predicate. - - -Variant predicates -================== - -GPSEA uses the :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` class -to test if a :class:`~gpsea.model.Variant` meets the inclusion criteria. -The variant predicate can leverage multiple primary data: - -+------------------------+-------------------------------------------------------------------------------------------------+ -| Primary data source | Example | -+========================+=================================================================================================+ -| Allele | the variant being a deletion or a single nucleotide variant (SNV) | -+------------------------+-------------------------------------------------------------------------------------------------+ -| Genome | overlaps of a target genomic region | -+------------------------+-------------------------------------------------------------------------------------------------+ -| Functional annotation | variant is predicted to lead to a missense change or affect an exon of certain transcript | -+------------------------+-------------------------------------------------------------------------------------------------+ -| Protein data | variant is located in a region encoding a protein domain, protein feature type | -+------------------------+-------------------------------------------------------------------------------------------------+ - -which demands a considerable amount of flexibility for creating the predicate. - -As a rule of thumb, the predicates for testing basic conditions are available off the shelf, -and they can be used as building block for testing for more complex conditions, -such as testing if the variant is "a missense or synonymous variant located in exon 6 of transcript `NM_013275.6`". - -Let's demonstrate this on few examples. -We will load a cohort of 19 subjects with variants in *RERE*, -leading to `Holt-Oram syndrome MIM:142900 `_. -The the clinical signs and symptoms of the subjects were encoded into HPO terms -along with the pathogenic *RERE* variant. - -Let's load the phenopackets, as previously described in greater detail the :ref:`input-data` section. -Briefly, we first load HPO: - ->>> import hpotk ->>> store = hpotk.configure_ontology_store() ->>> hpo = store.load_minimal_hpo(release='v2024-07-01') - -then, we configure the cohort creator: - ->>> from gpsea.preprocessing import configure_caching_cohort_creator ->>> cohort_creator = configure_caching_cohort_creator(hpo) - -which we use to create a :class:`~gpsea.model.Cohort` from a bunch of phenopackets -from the release `0.1.18` of `Phenopacket Store `_. -We will load 19 individuals with mutations in *RERE* gene: - ->>> from ppktstore.registry import configure_phenopacket_registry ->>> registry = configure_phenopacket_registry() ->>> with registry.open_phenopacket_store(release='0.1.18') as ps: -... phenopackets = tuple(ps.iter_cohort_phenopackets('RERE')) ->>> len(phenopackets) -19 - -and we will convert the phenopacket into a :class:`~gpsea.model.Cohort`: - ->>> from gpsea.preprocessing import load_phenopackets ->>> cohort, _ = load_phenopackets(phenopackets, cohort_creator) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE -Patients Created: ... - -To demonstrate the predicate API, we will use the variant ``1_8358231_8358231_T_C`` that corresponds -to a pathogenic variant `VCV000522858.5 `_ -that replaces the histidine encoded by the 1435th codon of `NM_001042681.2` with arginine: ``NM_001042681.2(RERE):c.4304A>G (p.His1435Arg)``. - ->>> variant_key_of_interest = '1_8358231_8358231_T_C' ->>> variant = cohort.get_variant_by_key(variant_key_of_interest) - -Building blocks ---------------- - -We can check that the variant overlaps with *RERE*: - ->>> from gpsea.analysis.predicate.genotype import VariantPredicates ->>> gene = VariantPredicates.gene('RERE') ->>> gene.test(variant) -True - -it overlaps with the *MANE* transcript: - ->>> rere_mane_tx_id = 'NM_001042681.2' ->>> tx = VariantPredicates.transcript(rere_mane_tx_id) ->>> tx.test(variant) -True - -it in fact overlaps with the exon 20: - ->>> exon20 = VariantPredicates.exon(exon=20, tx_id=rere_mane_tx_id) ->>> exon20.test(variant) -True - -and leads to a missense mutation with respect to the MANE transcript: - ->>> from gpsea.model import VariantEffect ->>> missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id=rere_mane_tx_id) ->>> missense.test(variant) -True - -See :class:`~gpsea.analysis.predicate.genotype.VariantPredicates` -for more info on the predicates available off the shelf. - - -Complex conditions ------------------- - -We can combine the building blocks to test for more elaborate conditions. -For instance, we can test if the variant meets *any* or several conditions: - ->>> nonsense = VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id=rere_mane_tx_id) ->>> missense_or_nonsense = missense | nonsense ->>> missense_or_nonsense.test(variant) -True - -or *all* conditions: - ->>> missense_and_exon20 = missense & exon20 ->>> missense_and_exon20.test(variant) -True - -The `VariantPredicate` overloads Python ``&`` (AND) and ``|`` (OR) operators to build a compound predicate from lower level building blocks. - -Therefore, there is nothing that prevents us to combine the predicates into multi-level tests, -such as testing if the variant is a *"chromosomal deletion" or a deletion which removes at least 50 bp*: - ->>> from gpsea.model import VariantClass ->>> chromosomal_deletion = "SO:1000029" ->>> predicate = VariantPredicates.structural_type(chromosomal_deletion) | (VariantPredicates.variant_class(VariantClass.DEL) & VariantPredicates.change_length("<=", -50)) ->>> predicate.get_question() -'(structural type is SO:1000029 OR (variant class is DEL AND change length <= -50))' - - -Inverting conditions --------------------- - -Sometimes we may want to test the variant for a condition that must *not* be met. -For instance, we may want to test if the variant is a deletion -that is *not* predicted to shift the transcript reading frame. -One of doing this would be to build a compound predicates -for all variant effects except of :class:`~gpsea.model.VariantEffect.FRAMESHIFT_VARIANT`: - ->>> non_frameshift_effects = ( -... VariantEffect.SYNONYMOUS_VARIANT, VariantEffect.MISSENSE_VARIANT, VariantEffect.INTRON_VARIANT, -... # and many more effects.. -... ) ->>> non_frameshift_predicate = VariantPredicates.all(VariantPredicates.variant_effect(eff, tx_id=rere_mane_tx_id) for eff in non_frameshift_effects) - -However, this is clearly tedious and it would be much better implemented -by a simple logical not of a predicate for a frameshift variant effect. - -To support this, `VariantPredicate` implements *logical inversion* -which corresponds to Python's ``~`` operator (tilde), to wrap -the underlying predicate and to invert its test result. - -This is how we can use the predicate inversion to build the predicate for non-frameshift deletions: - ->>> non_frameshift_del = ~VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id=rere_mane_tx_id) & VariantPredicates.variant_class(VariantClass.DEL) ->>> non_frameshift_del.get_question() -'(NOT FRAMESHIFT_VARIANT on NM_001042681.2 AND variant class is DEL)' - -Note the presence of a tilde ``~`` before the variant effect predicate and resulting ``NOT`` in the predicate question. - -The variant predicate offers a flexible API for testing if variants meet a condition. -However, the genotype phenotype correlations are done on the individual level -and the variant predicates are used as a component of the genotype predicate. -The next sections show how to use variant predicates to assign individuals into groups. - - -Mode of inheritance predicate -============================= - -The :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` -assigns the individual into a group based on the number of alleles -that match a condition specified by a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate`. -The :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` supports -the following Mendelian modes of inheritance (MoI): - - -+-----------------------+-----------------------------------+------------------+------------------------+ -| Mode of inheritance | Sex | Allele count | Genotype category | -+=======================+===================================+==================+========================+ -| Autosomal dominant | `*` | 0 | `HOM_REF` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | 1 | `HET` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | :math:`\ge 2` | ``None`` | -+-----------------------+-----------------------------------+------------------+------------------------+ -| Autosomal recessive | `*` | 0 | `HOM_REF` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | 1 | `HET` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | 2 | `BIALLELIC_ALT` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | :math:`\ge 3` | ``None`` | -+-----------------------+-----------------------------------+------------------+------------------------+ -| X-linked dominant | `*` | 0 | `HOM_REF` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | 1 | `HET` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | :math:`\ge 2` | ``None`` | -+-----------------------+-----------------------------------+------------------+------------------------+ -| X-linked recessive | `*` | 0 | `HOM_REF` | -+ +-----------------------------------+------------------+------------------------+ -| | :class:`~gpsea.model.Sex.FEMALE` | 1 | `HET` | -+ + +------------------+------------------------+ -| | | 2 | `BIALLELIC_ALT` | -+ + +------------------+------------------------+ -| | | :math:`\ge 3` | ``None`` | -+ +-----------------------------------+------------------+------------------------+ -| | :class:`~gpsea.model.Sex.MALE` | 1 | `HEMI` | -+ + +------------------+------------------------+ -| | | :math:`\ge 2` | ``None`` | -+-----------------------+-----------------------------------+------------------+------------------------+ - -.. note:: - - `BIALLELIC_ALT` includes both homozygous and compound heterozygous genotypes. - -Clinical judgment should be used to choose the MoI for the cohort analysis. -Then a predicate for the desired MoI can be created by one of -:class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` static constructors: - -* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_dominant` -* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` -* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.x_dominant` -* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.x_recessive` - -All constructors take an instance -of :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` as an argument. - - -Example -------- - -Here we show seting up a predicate for grouping individuals based on -having a variant that leads to a frameshift or to a stop gain to a fictional transcript ``NM_1234.5`` -to test differences between the genotypes of a disease with an autosomal recessive MoI. - -First, we set up a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` -for testing if a variant meets the condition: - ->>> from gpsea.model import VariantEffect ->>> from gpsea.analysis.predicate.genotype import VariantPredicates ->>> tx_id = 'NM_1234.5' ->>> is_frameshift_or_stop_gain = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) \ -... | VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id) ->>> is_frameshift_or_stop_gain.get_question() -'(FRAMESHIFT_VARIANT on NM_1234.5 OR STOP_GAINED on NM_1234.5)' - -Next, we use :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` -for assigning a patient into a genotype group: - ->>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate ->>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain) ->>> gt_predicate.display_question() -'Which genotype group does the patient fit in: HOM_REF, HET, BIALLELIC_ALT' - -The `gt_predicate` can be used in downstream analysis, such as in :class: - - -.. _filtering-predicate: - -Filtering predicate -=================== - -Sometimes a predicate can bin individuals into more genotype groups than necessary and there may be need -to consider only a subset of the groups. A `GenotypePolyPredicate` -created by :class:`~gpsea.analysis.predicate.genotype.filtering_predicate` can retain only a subset -of the target categorizations of interest. - -Example -------- - -Let's suppose we want test the genotype-phenotype association between variants -that lead to frameshift or a stop gain in a fictional transcript `NM_1234.5`, -and we are specifically interested in comparing the heterozygous variants -in a biallelic alternative allele genotypes (homozygous alternate and compound heterozygous). - -First, we set up a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` -for testing if a variant introduces a premature stop codon or leads to the shift of the reading frame: - ->>> from gpsea.model import VariantEffect ->>> from gpsea.analysis.predicate.genotype import VariantPredicates ->>> tx_id = 'NM_1234.5' ->>> is_frameshift_or_stop_gain = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) \ -... | VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id) ->>> is_frameshift_or_stop_gain.get_question() -'(FRAMESHIFT_VARIANT on NM_1234.5 OR STOP_GAINED on NM_1234.5)' - -Then, we create :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` -to bin according to a genotype group: - ->>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate ->>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain) ->>> gt_predicate.display_question() -'Which genotype group does the patient fit in: HOM_REF, HET, BIALLELIC_ALT' - -We see that the `gt_predicate` bins the patients into three groups: - ->>> cats = gt_predicate.get_categorizations() ->>> cats -(Categorization(category=HOM_REF), Categorization(category=HET), Categorization(category=BIALLELIC_ALT)) - -We wrap the categorizations of interest along with the `gt_predicate` by the `filtering_predicate` function, -and we will get a :class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate` -that includes only the categories of interest: - ->>> from gpsea.analysis.predicate.genotype import filtering_predicate ->>> fgt_predicate = filtering_predicate( -... predicate=gt_predicate, -... targets=(cats[1], cats[2]), -... ) ->>> fgt_predicate.display_question() -'Which genotype group does the patient fit in: HET, BIALLELIC_ALT' - - -.. _groups-predicate: - -Groups predicate -================ - -Sometimes, all we want is to compare if there is a difference between individuals -who include one or more alleles of variant $X$ vs. individuals with variants $Y$, -vs. individuals with variants $Z$, where $X$, $Y$ and $Z$ are variant predicates. -We can do this with a *groups* predicate. - -The :func:`~gpsea.analysis.predicate.genotype.groups_predicate` -takes *n* variant predicates and *n* group labels, and it will assign the patients -into the respective groups if one or more matching allele is found. -However, only one predicate is allowed to return a non-zero allele count. -Otherwise, the patient is assigned with ``None`` and excluded from the analysis. - -Example -------- - -Here we show how to build a :class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate` -for testing if the individual has at least one missense vs. frameshift vs. synonymous variant. - ->>> from gpsea.model import VariantEffect ->>> from gpsea.analysis.predicate.genotype import VariantPredicates, groups_predicate ->>> tx_id = 'NM_1234.5' ->>> gt_predicate = groups_predicate( -... predicates=( -... VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id), -... VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id), -... VariantPredicates.variant_effect(VariantEffect.SYNONYMOUS_VARIANT, tx_id), -... ), -... group_names=('Missense', 'Frameshift', 'Synonymous'), -... ) ->>> gt_predicate.display_question() -'Genotype group: Missense, Frameshift, Synonymous' - - -.. _sex-predicate: - -Partition by the sex of the individual -====================================== - -It is easy to investigate the phenotypic differences between females and males. -The :func:`~gpsea.analysis.predicate.genotype.sex_predicate` provides a predicate -for partitioning based on the sex of the individual: - ->>> from gpsea.analysis.predicate.genotype import sex_predicate ->>> gt_predicate = sex_predicate() ->>> gt_predicate.display_question() -'Sex of the individual: FEMALE, MALE' - -The individuals with :class:`~gpsea.model.Sex.UNKNOWN_SEX` will be omitted from the analysis. - - -.. _diagnosis-predicate: - -Partition by a diagnosis -======================== - -It is also possible to bin the individuals based on a diagnosis. -The :func:`~gpsea.analysis.predicate.genotype.diagnosis_predicate` -prepares a genotype predicate for assigning an individual into a diagnosis group: - ->>> from gpsea.analysis.predicate.genotype import diagnosis_predicate ->>> gt_predicate = diagnosis_predicate( -... diagnoses=('OMIM:154700', 'OMIM:129600'), -... labels=('Marfan syndrome', 'Ectopia lentis, familial'), -... ) ->>> gt_predicate.display_question() -'What disease was diagnosed: OMIM:154700, OMIM:129600' - -Note, an individual must match only one diagnosis group. Any individuals labeled with two or more diagnoses -(e.g. an individual with both *Marfan syndrome* and *Ectopia lentis, familial*) -will be automatically omitted from the analysis. - - -.. _phenotype-predicates: - -******************** -Phenotype predicates -******************** - -The phenotype predicate assigns the individual into a group with respect to tested phenotype. -Typically, the phenotype corresponds to a clinical sign or symptom encoded into an HPO term. - - -Propagating phenotype predicate -=============================== - -When testing for presence or absence of an HPO term, the propagating phenotype predicate -leverages the :ref:`true-path-rule` to take advantage of the HPO hierarchy. -In result, an individual annotated with a term is implicitly annotated with all its ancestors. -For instance, an individual annotated with `Ectopia lentis `_ -is also annotated with `Abnormal lens morphology `_, -`Abnormal anterior eye segment morphology `_, -`Abnormal eye morphology `_, ... - -Similarly, all descendants of a term, whose presence was specifically excluded in an individual, -are implicitly excluded. - -:class:`~gpsea.analysis.predicate.phenotype.PropagatingPhenotypePredicate` implements this logic. - -Example -------- - -Here we show how to set up :class:`~gpsea.analysis.predicate.phenotype.PropagatingPhenotypePredicate` -to test for a presence of `Abnormal lens morphology `_. - - ->>> from gpsea.analysis.predicate.phenotype import PropagatingPhenotypePredicate ->>> query = hpotk.TermId.from_curie('HP:0000517') ->>> pheno_predicate = PropagatingPhenotypePredicate( -... hpo=hpo, -... query=query, -... ) ->>> pheno_predicate.display_question() -'Is Abnormal lens morphology present in the patient: Yes, No' - - -TODO: explain ``missing_implies_phenotype_excluded`` - +All GPSEA analyses need at least one predicate (typically a *genotype* predicate) and many require both *genotype* and *phenotype* predicates. +The following pages provide more information. -Predicates for all cohort phenotypes -==================================== -Constructing phenotype predicates for all HPO terms of a cohort sounds a bit tedious. -The :func:`~gpsea.analysis.predicate.phenotype.prepare_predicates_for_terms_of_interest` -function cuts down the tedium: ->>> from gpsea.analysis.predicate.phenotype import prepare_predicates_for_terms_of_interest ->>> pheno_predicates = prepare_predicates_for_terms_of_interest( -... cohort=cohort, -... hpo=hpo, -... ) ->>> len(pheno_predicates) -301 +.. toctree:: + :maxdepth: 1 + :caption: Contents: -and prepares predicates for testing 301 HPO terms of the *RERE* cohort. + predicates/phenotype_predicates + predicates/genotype_predicates ******* diff --git a/docs/user-guide/devries.rst b/docs/user-guide/predicates/devries.rst similarity index 100% rename from docs/user-guide/devries.rst rename to docs/user-guide/predicates/devries.rst diff --git a/docs/user-guide/predicates/diagnosis_predicate.rst b/docs/user-guide/predicates/diagnosis_predicate.rst new file mode 100644 index 000000000..386af20a0 --- /dev/null +++ b/docs/user-guide/predicates/diagnosis_predicate.rst @@ -0,0 +1,21 @@ +.. _diagnosis-predicate: + +======================== +Partition by a diagnosis +======================== + +It is also possible to bin the individuals based on a diagnosis. +The :func:`~gpsea.analysis.predicate.genotype.diagnosis_predicate` +prepares a genotype predicate for assigning an individual into a diagnosis group: + +>>> from gpsea.analysis.predicate.genotype import diagnosis_predicate +>>> gt_predicate = diagnosis_predicate( +... diagnoses=('OMIM:154700', 'OMIM:129600'), +... labels=('Marfan syndrome', 'Ectopia lentis, familial'), +... ) +>>> gt_predicate.display_question() +'What disease was diagnosed: OMIM:154700, OMIM:129600' + +Note, an individual must match only one diagnosis group. Any individuals labeled with two or more diagnoses +(e.g. an individual with both *Marfan syndrome* and *Ectopia lentis, familial*) +will be automatically omitted from the analysis. \ No newline at end of file diff --git a/docs/user-guide/predicates/genotype_predicates.rst b/docs/user-guide/predicates/genotype_predicates.rst new file mode 100644 index 000000000..f1ff3bf74 --- /dev/null +++ b/docs/user-guide/predicates/genotype_predicates.rst @@ -0,0 +1,32 @@ +.. _genotype-predicates: + +=================== +Genotype Predicates +=================== + + +A genotype predicate seeks to divide the individuals along an axis that is orthogonal to phenotypes. +Typically, this includes using the genotype data, such as presence of a missense variant +in a heterozygous genotype. However, other categorical variables, +such as diagnoses (TODO - add link to disease predicate) or cluster ids can also be used. + +The genotype predicates test the individual for a presence of variants that meet certain inclusion criteria. +The testing is done in two steps. First, we count the alleles +of the matching variants and then we interpret the count, possibly including factors +such as the expected mode of inheritance and sex, to assign the individual into a group. +Finding the matching variants is what +the :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` is all about. + + +.. toctree:: + :maxdepth: 1 + :caption: Contents: + + variant_predicates + mode_of_inheritance_predicate + male_female_predicate + diagnosis_predicate + groups_predicate + + + diff --git a/docs/user-guide/predicates/groups_predicate.rst b/docs/user-guide/predicates/groups_predicate.rst new file mode 100644 index 000000000..a61428976 --- /dev/null +++ b/docs/user-guide/predicates/groups_predicate.rst @@ -0,0 +1,41 @@ +.. _groups-predicate: + +================ +Groups Predicate +================ + + + +Sometimes, all we want is to compare if there is a difference between individuals +who include one or more alleles of variant `X` vs. individuals with variants `Y`, +vs. individuals with variants `Z`, where `X`, `Y` and `Z` are variant predicates. +We can do this with a *groups* predicate. + +The :func:`~gpsea.analysis.predicate.genotype.groups_predicate` +takes *n* variant predicates and *n* group labels, and it will assign the patients +into the respective groups if one or more matching allele is found. +However, only one predicate is allowed to return a non-zero allele count. +Otherwise, the patient is assigned with ``None`` and excluded from the analysis. + +Example +------- + +Here we show how to build a :class:`~gpsea.analysis.predicate.genotype.GenotypePolyPredicate` +for testing if the individual has at least one missense vs. frameshift vs. synonymous variant. + +>>> from gpsea.model import VariantEffect +>>> from gpsea.analysis.predicate.genotype import VariantPredicates, groups_predicate +>>> tx_id = 'NM_1234.5' +>>> gt_predicate = groups_predicate( +... predicates=( +... VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id), +... VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id), +... VariantPredicates.variant_effect(VariantEffect.SYNONYMOUS_VARIANT, tx_id), +... ), +... group_names=('Missense', 'Frameshift', 'Synonymous'), +... ) +>>> gt_predicate.display_question() +'Genotype group: Missense, Frameshift, Synonymous' + + + diff --git a/docs/user-guide/predicates/hpo_predicate.rst b/docs/user-guide/predicates/hpo_predicate.rst new file mode 100644 index 000000000..a5fc0549a --- /dev/null +++ b/docs/user-guide/predicates/hpo_predicate.rst @@ -0,0 +1,87 @@ +.. _hpo-predicate: + + +HPO predicate +============= + +When testing for presence or absence of an HPO term, the :class:`~gpsea.analysis.predicate.phenotype.HpoPredicate` +leverages the :ref:`true-path-rule` to take advantage of the HPO hierarchy. +In result, an individual annotated with a term is implicitly annotated with all its ancestors. +For instance, an individual annotated with `Ectopia lentis `_ +is also annotated with `Abnormal lens morphology `_, +`Abnormal anterior eye segment morphology `_, +`Abnormal eye morphology `_, ... + +Similarly, all descendants of a term, whose presence was specifically excluded in an individual, +are implicitly excluded. + +Example +------- + +Here we show how to set up :class:`~gpsea.analysis.predicate.phenotype.HpoPredicate` +to test for a presence of `Abnormal lens morphology `_. + +We need to load :class:`~hpotk.MinimalOntology` with HPO data to access the HPO hierarchy: + +>>> import hpotk +>>> store = hpotk.configure_ontology_store() +>>> hpo = store.load_minimal_hpo(release='v2024-07-01') + +and now we can set up a predicate to test for presence of *Abnormal lens morphology*: + +>>> from gpsea.analysis.predicate.phenotype import HpoPredicate +>>> query = hpotk.TermId.from_curie('HP:0000517') +>>> pheno_predicate = HpoPredicate( +... hpo=hpo, +... query=query, +... ) +>>> pheno_predicate.display_question() +'Is Abnormal lens morphology present in the patient: Yes, No' + + + +missing_implies_phenotype_excluded +---------------------------------- + +In many cases, published reports of clinical data about individuals with rare diseases describes phenotypic features that were observed, but do not +provide a comprehensive list of features that were explicitly excluded. By default, GPSEA will only include features that are recorded as observed or excluded in a phenopacket. +Setting this argument to True will cause "n/a" entries to be set to "excluded". We provide this option for exploration but do not recommend its use for the +final analysis unless the assumption behind it is known to be true. + + + +Predicates for all cohort phenotypes +==================================== + +Constructing phenotype predicates for all HPO terms of a cohort sounds a bit tedious. +The :func:`~gpsea.analysis.predicate.phenotype.prepare_predicates_for_terms_of_interest` +function cuts down the tedium. + +For a given phenopacket collection (e.g. 156 patients with mutations in *WWOX* gene included in Phenopacket Store version `0.1.18`) + +>>> from ppktstore.registry import configure_phenopacket_registry +>>> registry = configure_phenopacket_registry() +>>> with registry.open_phenopacket_store(release='0.1.18') as ps: +... phenopackets = tuple(ps.iter_cohort_phenopackets('TBX5')) +>>> len(phenopackets) +156 + +processed into a cohort + +>>> from gpsea.preprocessing import configure_caching_cohort_creator, load_phenopackets +>>> cohort_creator = configure_caching_cohort_creator(hpo) +>>> cohort, _ = load_phenopackets(phenopackets, cohort_creator) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE +Individuals Processed: ... + + +we can create HPO predicates for testing all 260 HPO terms used in the cohort + +>>> from gpsea.analysis.predicate.phenotype import prepare_predicates_for_terms_of_interest +>>> pheno_predicates = prepare_predicates_for_terms_of_interest( +... cohort=cohort, +... hpo=hpo, +... ) +>>> len(pheno_predicates) +260 + +and subject the predicates into further analysis, such as :class:`~gpsea.analysis.pcats.HpoTermAnalysis`. diff --git a/docs/user-guide/predicates/male_female_predicate.rst b/docs/user-guide/predicates/male_female_predicate.rst new file mode 100644 index 000000000..3bed237ec --- /dev/null +++ b/docs/user-guide/predicates/male_female_predicate.rst @@ -0,0 +1,21 @@ +.. _male-female-predicate: + +Partition by the sex of the individual +====================================== + +It is easy to investigate the phenotypic differences between females and males. +The :func:`~gpsea.analysis.predicate.genotype.sex_predicate` provides a predicate +for partitioning based on the sex of the individual: + +>>> from gpsea.analysis.predicate.genotype import sex_predicate +>>> gt_predicate = sex_predicate() +>>> gt_predicate.display_question() +'Sex of the individual: FEMALE, MALE' + +The individuals with :class:`~gpsea.model.Sex.UNKNOWN_SEX` will be omitted from the analysis. + +Note that we have implemented this predicate as a genotype predicate, because it is used in +place of other genotype predicates. Currently, it is not possible to compare the distribution of genotypes across sexes. + + + diff --git a/docs/user-guide/predicates/mode_of_inheritance_predicate.rst b/docs/user-guide/predicates/mode_of_inheritance_predicate.rst new file mode 100644 index 000000000..6c4178e8a --- /dev/null +++ b/docs/user-guide/predicates/mode_of_inheritance_predicate.rst @@ -0,0 +1,93 @@ +.. _mode-of-inheritance-predicate: + +============================== +Mode of Inheritance Predicates +============================== + +There are five basic modes of inheritance for single-gene diseases: autosomal dominant, +autosomal recessive, X-linked dominant, X-linked recessive, and mitochondrial +(See `Understanding Genetics, Appendix B `_). + + +The :class:`~gpsea.analysis.predicate.genotype.autosomal_dominant` +and :class:`~gpsea.analysis.predicate.genotype.autosomal_recessive` +assigns the individual into a group based on the number of the alleles +observed in the individual. +GPSEA supports the following Mendelian modes of inheritance (MoI): + + ++-----------------------+------------------+------------------------+ +| Mode of inheritance | Allele count | Genotype category | ++=======================+==================+========================+ +| Autosomal dominant | 0 | `HOM_REF` | ++ +------------------+------------------------+ +| | 1 | `HET` | ++ +------------------+------------------------+ +| | :math:`\ge 2` | ``None`` | ++-----------------------+------------------+------------------------+ +| Autosomal recessive | 0 | `HOM_REF` | ++ +------------------+------------------------+ +| | 1 | `HET` | ++ +------------------+------------------------+ +| | 2 | `BIALLELIC_ALT` | ++ +------------------+------------------------+ +| | :math:`\ge 3` | ``None`` | ++-----------------------+------------------+------------------------+ + + +.. note:: + + `BIALLELIC_ALT` includes both homozygous and compound heterozygous genotypes. + +Clinical judgment should be used to choose the MoI for the cohort analysis. +Then a predicate for the desired MoI can be created by calling one +of the following methods: + +* :func:`~gpsea.analysis.predicate.genotype.autosomal_dominant` +* :func:`~gpsea.analysis.predicate.genotype.autosomal_recessive` + +By default, the MoI predicates will use *all* variants recorded in the individual. +However, a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` +can be provided to select a variant subset, if necessary. + + +Assign individuals into genotype groups +--------------------------------------- + +Here we show seting up a predicate for grouping individuals for differences +between genotypes of a disease with an autosomal recessive MoI. + +We use :class:`~gpsea.analysis.predicate.genotype.autosomal_recessive` +to create the predicate: + +>>> from gpsea.analysis.predicate.genotype import autosomal_recessive +>>> gt_predicate = autosomal_recessive() +>>> gt_predicate.display_question() +'What is the genotype group: HOM_REF, HET, BIALLELIC_ALT' + +The predicate will use *all* recorded variants to determine if the individual belongs into +homozygous reference (`HOM_REF`), heterozygous (`HET`), or biallelic alternative (`BIALLELIC_ALT`) +category. + + +Use a subset of variants for choosing the genotype group +-------------------------------------------------------- + +To select specific variants, a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` +can be registered with the MoI predicate. +For instance, the following can be done to only consider the variants that lead +to a missense change on a fictional transcript ``NM_1234.5`` +when assigning the genotype group. We set up the variant predicate: + +>>> from gpsea.model import VariantEffect +>>> from gpsea.analysis.predicate.genotype import VariantPredicates +>>> tx_id = 'NM_1234.5' +>>> is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id) +>>> is_missense.get_question() +'MISSENSE_VARIANT on NM_1234.5' + +and we use it to create the MoI predicate: + +>>> gt_predicate = autosomal_recessive(is_missense) +>>> gt_predicate.display_question() +'What is the genotype group: HOM_REF, HET, BIALLELIC_ALT' diff --git a/docs/user-guide/phenotype_predicates.rst b/docs/user-guide/predicates/phenotype_predicates.rst similarity index 94% rename from docs/user-guide/phenotype_predicates.rst rename to docs/user-guide/predicates/phenotype_predicates.rst index 86572657f..b8991a290 100644 --- a/docs/user-guide/phenotype_predicates.rst +++ b/docs/user-guide/predicates/phenotype_predicates.rst @@ -15,4 +15,5 @@ TODO -- separate explanations for HPO (Fisher), scores (Mann Whitney) and surviv :maxdepth: 1 :caption: Contents: + hpo_predicate devries diff --git a/docs/user-guide/predicates/variant_predicates.rst b/docs/user-guide/predicates/variant_predicates.rst new file mode 100644 index 000000000..1053fbbbf --- /dev/null +++ b/docs/user-guide/predicates/variant_predicates.rst @@ -0,0 +1,168 @@ +.. _variant_predicates: + +================== +Variant Predicates +================== + + +GPSEA uses the :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` class +to test if a :class:`~gpsea.model.Variant` meets the inclusion criteria. +The variant predicate can leverage multiple primary data: + ++------------------------+-------------------------------------------------------------------------------------------------+ +| Primary data source | Example | ++========================+=================================================================================================+ +| Allele | the variant being a deletion or a single nucleotide variant (SNV) | ++------------------------+-------------------------------------------------------------------------------------------------+ +| Genome | overlaps of a target genomic region | ++------------------------+-------------------------------------------------------------------------------------------------+ +| Functional annotation | variant is predicted to lead to a missense change or affect an exon of certain transcript | ++------------------------+-------------------------------------------------------------------------------------------------+ +| Protein data | variant is located in a region encoding a protein domain, protein feature type | ++------------------------+-------------------------------------------------------------------------------------------------+ + +which demands a considerable amount of flexibility for creating the predicate. + +As a rule of thumb, the predicates for testing basic conditions are available off the shelf, +and they can be used as building block for testing for more complex conditions, +such as testing if the variant is "a missense or synonymous variant located in exon 6 of transcript `NM_013275.6`". + +Let's demonstrate this on few examples. +We will load a cohort of 19 subjects with variants in *RERE*, +leading to `Holt-Oram syndrome MIM:142900 `_. +The the clinical signs and symptoms of the subjects were encoded into HPO terms +along with the pathogenic *RERE* variant. + +Let's load the phenopackets, as previously described in greater detail the :ref:`input-data` section. +Briefly, we first load HPO: + +>>> import hpotk +>>> store = hpotk.configure_ontology_store() +>>> hpo = store.load_minimal_hpo(release='v2024-07-01') + +then, we configure the cohort creator: + +>>> from gpsea.preprocessing import configure_caching_cohort_creator +>>> cohort_creator = configure_caching_cohort_creator(hpo) + +which we use to create a :class:`~gpsea.model.Cohort` from a bunch of phenopackets +from the release `0.1.18` of `Phenopacket Store `_. +We will load 19 individuals with mutations in *RERE* gene: + +>>> from ppktstore.registry import configure_phenopacket_registry +>>> registry = configure_phenopacket_registry() +>>> with registry.open_phenopacket_store(release='0.1.18') as ps: +... phenopackets = tuple(ps.iter_cohort_phenopackets('RERE')) +>>> len(phenopackets) +19 + +and we will convert the phenopacket into a :class:`~gpsea.model.Cohort`: + +>>> from gpsea.preprocessing import load_phenopackets +>>> cohort, _ = load_phenopackets(phenopackets, cohort_creator) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE +Individuals Processed: ... + +To demonstrate the predicate API, we will use the variant ``1_8358231_8358231_T_C`` that corresponds +to a pathogenic variant `VCV000522858.5 `_ +that replaces the histidine encoded by the 1435th codon of `NM_001042681.2` with arginine: ``NM_001042681.2(RERE):c.4304A>G (p.His1435Arg)``. + +>>> variant_key_of_interest = '1_8358231_8358231_T_C' +>>> variant = cohort.get_variant_by_key(variant_key_of_interest) + +Building blocks +--------------- + +We can check that the variant overlaps with *RERE*: + +>>> from gpsea.analysis.predicate.genotype import VariantPredicates +>>> gene = VariantPredicates.gene('RERE') +>>> gene.test(variant) +True + +it overlaps with the *MANE* transcript: + +>>> rere_mane_tx_id = 'NM_001042681.2' +>>> tx = VariantPredicates.transcript(rere_mane_tx_id) +>>> tx.test(variant) +True + +it in fact overlaps with the exon 20: + +>>> exon20 = VariantPredicates.exon(exon=20, tx_id=rere_mane_tx_id) +>>> exon20.test(variant) +True + +and leads to a missense mutation with respect to the MANE transcript: + +>>> from gpsea.model import VariantEffect +>>> missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id=rere_mane_tx_id) +>>> missense.test(variant) +True + +See :class:`~gpsea.analysis.predicate.genotype.VariantPredicates` +for more info on the predicates available off the shelf. + + +Complex conditions +------------------ + +We can combine the building blocks to test for more elaborate conditions. +For instance, we can test if the variant meets *any* or several conditions: + +>>> nonsense = VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id=rere_mane_tx_id) +>>> missense_or_nonsense = missense | nonsense +>>> missense_or_nonsense.test(variant) +True + +or *all* conditions: + +>>> missense_and_exon20 = missense & exon20 +>>> missense_and_exon20.test(variant) +True + +The `VariantPredicate` overloads Python ``&`` (AND) and ``|`` (OR) operators to build a compound predicate from lower level building blocks. + +Therefore, there is nothing that prevents us to combine the predicates into multi-level tests, +such as testing if the variant is a *"chromosomal deletion" or a deletion which removes at least 50 bp*: + +>>> from gpsea.model import VariantClass +>>> chromosomal_deletion = "SO:1000029" +>>> predicate = VariantPredicates.structural_type(chromosomal_deletion) | (VariantPredicates.variant_class(VariantClass.DEL) & VariantPredicates.change_length("<=", -50)) +>>> predicate.get_question() +'(structural type is SO:1000029 OR (variant class is DEL AND change length <= -50))' + + +Inverting conditions +-------------------- + +Sometimes we may want to test the variant for a condition that must *not* be met. +For instance, we may want to test if the variant is a deletion +that is *not* predicted to shift the transcript reading frame. +One of doing this would be to build a compound predicates +for all variant effects except of :class:`~gpsea.model.VariantEffect.FRAMESHIFT_VARIANT`: + +>>> non_frameshift_effects = ( +... VariantEffect.SYNONYMOUS_VARIANT, VariantEffect.MISSENSE_VARIANT, VariantEffect.INTRON_VARIANT, +... # and many more effects.. +... ) +>>> non_frameshift_predicate = VariantPredicates.all(VariantPredicates.variant_effect(eff, tx_id=rere_mane_tx_id) for eff in non_frameshift_effects) + +However, this is clearly tedious and it would be much better implemented +by a simple logical not of a predicate for a frameshift variant effect. + +To support this, `VariantPredicate` implements *logical inversion* +which corresponds to Python's ``~`` operator (tilde), to wrap +the underlying predicate and to invert its test result. + +This is how we can use the predicate inversion to build the predicate for non-frameshift deletions: + +>>> non_frameshift_del = ~VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id=rere_mane_tx_id) & VariantPredicates.variant_class(VariantClass.DEL) +>>> non_frameshift_del.get_question() +'(NOT FRAMESHIFT_VARIANT on NM_001042681.2 AND variant class is DEL)' + +Note the presence of a tilde ``~`` before the variant effect predicate and resulting ``NOT`` in the predicate question. + +The variant predicate offers a flexible API for testing if variants meet a condition. +However, the genotype phenotype correlations are done on the individual level +and the variant predicates are used as a component of the genotype predicate. +The next sections show how to use variant predicates to assign individuals into groups. diff --git a/docs/user-guide/report/tbx5_frameshift.csv b/docs/user-guide/report/tbx5_frameshift.csv index a96700cfb..39c76f0bb 100644 --- a/docs/user-guide/report/tbx5_frameshift.csv +++ b/docs/user-guide/report/tbx5_frameshift.csv @@ -1,22 +1,18 @@ -"Which genotype group does the patient fit in: HOM_REF, HET",HOM_REF,HOM_REF,HET,HET,, +What is the genotype group,HOM_REF,HOM_REF,HET,HET,, ,Count,Percent,Count,Percent,Corrected p values,p values -Ventricular septal defect [HP:0001629],42/71,59%,19/19,100%,0.00411275392326226,0.00024192670136836825 -Abnormal atrioventricular conduction [HP:0005150],1/23,4%,3/3,100%,0.01307692307692308,0.0015384615384615387 -Absent thumb [HP:0009777],18/100,18%,14/31,45%,0.021044138590779585,0.003713671516019927 -Atrioventricular block [HP:0001678],1/23,4%,2/2,100%,0.034,0.01 -Heart block [HP:0012722],1/23,4%,2/2,100%,0.034,0.01 -Patent ductus arteriosus [HP:0001643],6/40,15%,2/2,100%,0.09214092140921408,0.03252032520325203 -Secundum atrial septal defect [HP:0001684],23/55,42%,4/22,18%,0.1440020479198931,0.06544319142266644 -Triphalangeal thumb [HP:0001199],23/99,23%,13/32,41%,0.1440020479198931,0.06932119159387057 -Cardiac conduction abnormality [HP:0031546],15/37,41%,3/3,100%,0.1440020479198931,0.08259109311740892 -Muscular ventricular septal defect [HP:0011623],8/84,10%,6/25,24%,0.1440020479198931,0.08470708701170182 -Pulmonary arterial hypertension [HP:0002092],8/14,57%,0/2,0%,0.6899307928951143,0.4666666666666667 -Short thumb [HP:0009778],25/69,36%,8/30,27%,0.6899307928951143,0.48700997145537483 +Ventricular septal defect [HP:0001629],42/71,59%,19/19,100%,0.003870827221893892,0.00024192670136836825 +Abnormal atrioventricular conduction [HP:0005150],1/23,4%,3/3,100%,0.01230769230769231,0.0015384615384615387 +Absent thumb [HP:0009777],18/100,18%,14/31,45%,0.01980624808543961,0.003713671516019927 +Atrioventricular block [HP:0001678],1/23,4%,2/2,100%,0.04,0.01 +Patent ductus arteriosus [HP:0001643],6/40,15%,2/2,100%,0.10406504065040649,0.03252032520325203 +Secundum atrial septal defect [HP:0001684],23/55,42%,4/22,18%,0.15059037690969213,0.06544319142266644 +Triphalangeal thumb [HP:0001199],23/99,23%,13/32,41%,0.15059037690969213,0.06932119159387057 +Cardiac conduction abnormality [HP:0031546],15/37,41%,3/3,100%,0.15059037690969213,0.08259109311740892 +Muscular ventricular septal defect [HP:0011623],8/84,10%,6/25,24%,0.15059037690969213,0.08470708701170182 +Pulmonary arterial hypertension [HP:0002092],8/14,57%,0/2,0%,0.708378140298727,0.4666666666666667 +Short thumb [HP:0009778],25/69,36%,8/30,27%,0.708378140298727,0.48700997145537483 Absent radius [HP:0003974],9/43,21%,6/25,24%,1.0,0.7703831604944444 Atrial septal defect [HP:0001631],63/65,97%,20/20,100%,1.0,1.0 Hypoplasia of the radius [HP:0002984],34/75,45%,6/14,43%,1.0,1.0 Hypoplasia of the ulna [HP:0003022],3/17,18%,2/10,20%,1.0,1.0 Short humerus [HP:0005792],8/21,38%,4/9,44%,1.0,1.0 -Abnormal atrial septum morphology [HP:0011994],64/64,100%,20/20,100%,, -Abnormal cardiac septum morphology [HP:0001671],89/89,100%,28/28,100%,, -Abnormal heart morphology [HP:0001627],89/89,100%,30/30,100%,, diff --git a/docs/user-guide/report/tbx5_frameshift.mtc_report.html b/docs/user-guide/report/tbx5_frameshift.mtc_report.html index fbc4a9de4..242057f92 100644 --- a/docs/user-guide/report/tbx5_frameshift.mtc_report.html +++ b/docs/user-guide/report/tbx5_frameshift.mtc_report.html @@ -48,9 +48,9 @@

Phenotype testing report

Phenotype MTC filter: HPO MTC filter

Multiple testing correction: fdr_bh

-

Performed statistical tests for 17 out of the total of 260 HPO terms.

+

Performed statistical tests for 16 out of the total of 260 HPO terms.

Using HPO MTC filter, 243 term(s) were omitted from statistical analysis.Using HPO MTC filter, 244 term(s) were omitted from statistical analysis.
Code
TODOHMF01 Skipping term with maximum frequency that was less than threshold 0.2 51
TODOSkipping general term44
TODOSkipping term because all genotypes have same HPO observed proportions41
TODOSkipping term with only 2 observations (not powered for 2x2)26
TODOSkipping term with only 1 observations (not powered for 2x2)24
TODOSkipping term with only 3 observations (not powered for 2x2)22HMF02Skipping term because no genotype has more than one observed HPO count3
TODOSkipping term with only 4 observations (not powered for 2x2)14HMF03Skipping term because of a child term with the same individual counts1
TODOSkipping term with only 6 observations (not powered for 2x2)12HMF04Skipping term because all genotypes have same HPO observed proportions41
TODOSkipping term with only 5 observations (not powered for 2x2)4HMF05Skipping term because one genotype had zero observations2
TODOSkipping term because no genotype has more than one observed HPO count3HMF06Skipping term with less than 7 observations (not powered for 2x2)102
TODOSkipping term because one genotype had zero observations2HMF08Skipping general term44
- + @@ -59,80 +59,45 @@

Phenotype testing report

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst index 55757ef30..18882892a 100644 --- a/docs/user-guide/stats.rst +++ b/docs/user-guide/stats.rst @@ -102,7 +102,7 @@ with the default cohort creator: >>> from gpsea.preprocessing import configure_caching_cohort_creator, load_phenopackets >>> cohort_creator = configure_caching_cohort_creator(hpo) >>> cohort, qc_results = load_phenopackets(phenopackets, cohort_creator) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE -Patients Created: ... +Individuals Processed: ... >>> qc_results.summarize() # doctest: +SKIP Validated under none policy No errors or warnings were found @@ -127,7 +127,7 @@ First, we create a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` to test if the variant leads to a frameshift (in this case): >>> from gpsea.model import VariantEffect ->>> from gpsea.analysis.predicate.genotype import VariantPredicates, boolean_predicate +>>> from gpsea.analysis.predicate.genotype import VariantPredicates >>> tx_id = 'NM_181486.4' >>> is_frameshift = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) >>> is_frameshift.get_question() @@ -136,10 +136,10 @@ to test if the variant leads to a frameshift (in this case): and then we choose the expected mode of inheritance to test. In case of *TBX5*, we expect the autosomal dominant mode of inheritance: ->>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate ->>> gt_predicate = ModeOfInheritancePredicate.autosomal_dominant(is_frameshift) +>>> from gpsea.analysis.predicate.genotype import autosomal_dominant +>>> gt_predicate = autosomal_dominant(is_frameshift) >>> gt_predicate.display_question() -'Which genotype group does the patient fit in: HOM_REF, HET' +'What is the genotype group: HOM_REF, HET' `gt_predicate` will assign the patients with no frameshift variant allele into `HOM_REF` group and the patients with one frameshift allele will be assigned into `HET` group. @@ -230,10 +230,10 @@ We can now execute the analysis: >>> len(result.phenotypes) 260 >>> result.total_tests -17 +16 -Thanks to Phenotype MTC filter, we only tested 17 out of 260 terms. +Thanks to Phenotype MTC filter, we only tested 16 out of 260 terms. We can learn more by showing the MTC filter report: >>> from gpsea.view import MtcStatsViewer @@ -251,12 +251,11 @@ Genotype phenotype associations ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Last, let's explore the associations. The results include a table with all tested HPO terms -ordered by the corrected p value (Benjamini-Hochberg FDR). -Here we show the top 20 table rows: +ordered by the corrected p value (Benjamini-Hochberg FDR): ->>> from gpsea.analysis.predicate import PatientCategories ->>> summary_df = result.summarize(hpo, PatientCategories.YES) ->>> summary_df.head(20).to_csv('docs/user-guide/report/tbx5_frameshift.csv') # doctest: +SKIP +>>> from gpsea.view import summarize_hpo_analysis +>>> summary_df = summarize_hpo_analysis(hpo, result) +>>> summary_df.to_csv('docs/user-guide/report/tbx5_frameshift.csv') # doctest: +SKIP .. csv-table:: *TBX5* frameshift vs rest :file: report/tbx5_frameshift.csv @@ -270,7 +269,7 @@ was observed in 31/60 (52%) patients with a missense variant but it was observed in 19/19 (100%) patients with a frameshift variant. Fisher exact test computed a p value of `~0.000242` and the p value corrected by Benjamini-Hochberg procedure -is `~0.00411`. +is `~0.00387`. The table includes all HPO terms of the cohort, including the terms that were not selected for testing and thus have no associated p value. @@ -345,7 +344,7 @@ which we will use to preprocess the cohort >>> from gpsea.preprocessing import load_phenopackets >>> cohort, _ = load_phenopackets(phenopackets, cohort_creator) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE -Patients Created: ... +Individuals Processed: ... resulting in a cohort consisting of 19 individuals diff --git a/src/gpsea/__init__.py b/src/gpsea/__init__.py index 402fc6ccb..297ea01d2 100644 --- a/src/gpsea/__init__.py +++ b/src/gpsea/__init__.py @@ -2,4 +2,4 @@ GPSEA is a library for analyzing genotype-phenotype correlations in cohorts of rare disease patients. """ -__version__ = "0.3.0" +__version__ = "0.4.0" diff --git a/src/gpsea/analysis/mtc_filter/__init__.py b/src/gpsea/analysis/mtc_filter/__init__.py index 4497ac343..57f0d6149 100644 --- a/src/gpsea/analysis/mtc_filter/__init__.py +++ b/src/gpsea/analysis/mtc_filter/__init__.py @@ -5,10 +5,10 @@ See :ref:`MTC filters ` section for more info. """ -from ._impl import PhenotypeMtcFilter, PhenotypeMtcResult +from ._impl import PhenotypeMtcFilter, PhenotypeMtcResult, PhenotypeMtcIssue from ._impl import UseAllTermsMtcFilter, SpecifiedTermsMtcFilter, HpoMtcFilter __all__ = [ - 'PhenotypeMtcFilter', 'PhenotypeMtcResult', + 'PhenotypeMtcFilter', 'PhenotypeMtcResult', 'PhenotypeMtcIssue', 'UseAllTermsMtcFilter', 'SpecifiedTermsMtcFilter', 'HpoMtcFilter', ] diff --git a/src/gpsea/analysis/mtc_filter/_impl.py b/src/gpsea/analysis/mtc_filter/_impl.py index c52d10b9c..ad691c403 100644 --- a/src/gpsea/analysis/mtc_filter/_impl.py +++ b/src/gpsea/analysis/mtc_filter/_impl.py @@ -1,10 +1,10 @@ import abc +import dataclasses import typing from collections import deque import hpotk -from matplotlib import axis import numpy as np import pandas as pd @@ -12,51 +12,83 @@ from ..predicate.phenotype import PhenotypePolyPredicate, P +@dataclasses.dataclass(eq=True, frozen=True) +class PhenotypeMtcIssue: + """ + The container for data available regarding the reason why a phenotype was filtered out. + """ + + code: str + """ + A `str` with a unique code of the issue. + """ + + reason: str + """ + A human-friendly explanation of the issue. + """ + + class PhenotypeMtcResult: """ `PhenotypeMtcResult` represents a result of :class:`PhenotypeMtcFilter` for a single phenotype. The phenotype can either pass the filter, in order to be included in the downstream analysis (:meth:`is_passed`) - of be filtered out (:meth:`is_filtered_out`) in which case :attr:`reason` must be available. + of be filtered out (:meth:`is_filtered_out`) in which case :attr:`mtc_issue` with more context + regarding the culprit must be available. """ @staticmethod def ok() -> "PhenotypeMtcResult": # A singleton would be nice here... - return PhenotypeMtcResult(status=True, reason=None) + return PhenotypeMtcResult(status=True, issue=None) @staticmethod - def fail(reason: str) -> "PhenotypeMtcResult": - return PhenotypeMtcResult(status=False, reason=reason) + def fail(code: str, reason: str) -> "PhenotypeMtcResult": + issue = PhenotypeMtcIssue(code=code, reason=reason) + return PhenotypeMtcResult(status=False, issue=issue) def __init__( self, status: bool, - reason: typing.Optional[str], + issue: typing.Optional[PhenotypeMtcIssue], ): + assert isinstance(status, bool) + if status: + assert issue is None, '`issue` must NOT be provided if the HPO term passed the MTC filter' + else: + assert issue is not None, '`issue` must be provided if the HPO term failed the MTC filter' + self._status = status - self._reason = reason + self._issue = issue def is_passed(self) -> bool: return self._status def is_filtered_out(self) -> bool: return not self._status - + + @property + def mtc_issue(self) -> typing.Optional[PhenotypeMtcIssue]: + return self._issue + @property def reason(self) -> typing.Optional[str]: - return self._reason + if self.mtc_issue is None: + return None + else: + return self.mtc_issue.reason def __eq__(self, value: object) -> bool: return isinstance(value, PhenotypeMtcResult) \ and self._status == value._status \ - and self._reason == value._reason + and self._issue == value._issue def __hash__(self) -> int: - return hash((self._status, self._reason)) + return hash((self._status, self._issue)) def __str__(self) -> str: - return f'PhenotypeMtcResult(status={self._status}, reason={self._reason})' + return f'PhenotypeMtcResult(status={self._status}, issue={self._issue})' def __repr__(self) -> str: return str(self) @@ -71,6 +103,14 @@ class PhenotypeMtcFilter(typing.Generic[P], metaclass=abc.ABCMeta): Therefore, the expected input asks for :class:`hpotk.TermId` items. For instance, `n_usable` is a mapping from an *HPO term* to an `int` with the count of the patients categorized according to the HPO term. + + :attr:`PhenotypeMtcFilter.OK` is returned for HPO terms that pass MTC filtering + and *should* be included in the analysis. + """ + + OK = PhenotypeMtcResult.ok() + """ + The MTC result for the phenotypes that pass the filtering and *should* be included in the analysis. """ @abc.abstractmethod @@ -91,6 +131,13 @@ def filter( """ pass + @abc.abstractmethod + def possible_results(self) -> typing.Collection[PhenotypeMtcResult]: + """ + Return all possible result types which the `PhenotypeMtcFilter` can produce. + """ + pass + @abc.abstractmethod def filter_method_name(self) -> str: """ @@ -106,9 +153,6 @@ class UseAllTermsMtcFilter(PhenotypeMtcFilter[typing.Any]): See :ref:`use-all-terms-strategy` section for more info. """ - def __init__(self): - self._ok = PhenotypeMtcResult.ok() - def filter( self, gt_predicate: GenotypePolyPredicate, @@ -116,7 +160,10 @@ def filter( counts: typing.Sequence[pd.DataFrame], ) -> typing.Sequence[PhenotypeMtcResult]: # Always OK! 😏 - return tuple(self._ok for _ in ph_predicates) + return tuple(PhenotypeMtcFilter.OK for _ in ph_predicates) + + def possible_results(self) -> typing.Collection[PhenotypeMtcResult]: + return (PhenotypeMtcFilter.OK,) def filter_method_name(self) -> str: return "All HPO terms" @@ -133,6 +180,11 @@ class SpecifiedTermsMtcFilter(PhenotypeMtcFilter[hpotk.TermId]): See :ref:`specify-terms-strategy` section for more info. """ + + NON_SPECIFIED_TERM = PhenotypeMtcResult.fail(code="ST1", reason="Non-specified term") + """ + The MTC filtering result returned when an HPO term does not belong among the selection of terms to be tested. + """ def __init__( self, @@ -142,8 +194,6 @@ def __init__( Args: terms_to_test: an iterable of TermIds representing the terms to test """ - self._ok = PhenotypeMtcResult.ok() - self._fail = PhenotypeMtcResult.fail("Non-specified term") self._terms_to_test_set = set(terms_to_test) def filter( @@ -155,10 +205,16 @@ def filter( results = [] for predicate in ph_predicates: if predicate.phenotype in self._terms_to_test_set: - results.append(self._ok) + results.append(SpecifiedTermsMtcFilter.OK) else: - results.append(self._fail) + results.append(SpecifiedTermsMtcFilter.NON_SPECIFIED_TERM) return tuple(results) + + def possible_results(self) -> typing.Collection[PhenotypeMtcResult]: + return ( + PhenotypeMtcFilter.OK, + SpecifiedTermsMtcFilter.NON_SPECIFIED_TERM, + ) def filter_method_name(self) -> str: return "Specified terms MTC filter" @@ -174,16 +230,21 @@ class HpoMtcFilter(PhenotypeMtcFilter[hpotk.TermId]): We recommend creating an instance using the :func:`default_filter` static factory method. """ - # TODO: this has been here before. Is it still valid? - # 2. Do not perform a test if the counts in the genotype categories do not even have nominal statistical power - # for i in range(2,6): - # for j in range(2,6): - # # create a table with the most extreme possible counts. If this is not significant, then - # # other counts also cannot be - # two_by_two_table = [[i, 0], [0, j]] - # oddsr, p = scipy.stats.fisher_exact(two_by_two_table, alternative='two-sided') - # This code reveals that (2,4), (4,2), (2,3), (3,3), (2,2) and (3,2) are not powered so we can always skip them - # ## TODO -- similar calculate for 3x2 + NO_GENOTYPE_HAS_MORE_THAN_ONE_HPO = PhenotypeMtcResult.fail( + "HMF02", "Skipping term because no genotype has more than one observed HPO count", + ) + SAME_COUNT_AS_THE_ONLY_CHILD = PhenotypeMtcResult.fail( + "HMF03", "Skipping term because of a child term with the same individual counts", + ) + SKIPPING_SAME_OBSERVED_PROPORTIONS = PhenotypeMtcResult.fail( + "HMF04", + "Skipping term because all genotypes have same HPO observed proportions", + ) + SKIPPING_SINCE_ONE_GENOTYPE_HAD_ZERO_OBSERVATIONS = PhenotypeMtcResult.fail( + "HMF05", "Skipping term because one genotype had zero observations" + ) + SKIPPING_NON_PHENOTYPE_TERM = PhenotypeMtcResult.fail("HMF07", "Skipping non phenotype term") + SKIPPING_GENERAL_TERM = PhenotypeMtcResult.fail("HMF08", "Skipping general term") @staticmethod def default_filter( @@ -262,13 +323,40 @@ def __init__( """ self._hpo = hpo self._hpo_term_frequency_filter = term_frequency_threshold - # The following numbers of total observations in the genotype groups can never be significant, - # so we just skip them see above explanation - self._powerless_set = {(2, 4), (4, 2), (2, 3), (3, 3), (2, 2), (3, 2)} - # thus if the total count is 6 or less, there is no power - CAN WE SIMPLIFY? self._general_hpo_terms = set(general_hpo_terms) - self._ok = PhenotypeMtcResult.ok() + + self._below_frequency_threshold = PhenotypeMtcResult.fail( + code="HMF01", + reason="Skipping term with maximum frequency that was" + f" less than threshold {self._hpo_term_frequency_filter}", + ) + + # Do not perform a test if the counts in the genotype categories do not even have nominal statistical power + # The following numbers of total observations in the genotype groups can never be significant, + # so we just skip them. + # ``` + # for i in range(2,6): + # for j in range(2,6): + # # create a table with the most extreme possible counts. If this is not significant, then + # # other counts also cannot be + # two_by_two_table = [[i, 0], [0, j]] + # oddsr, p = scipy.stats.fisher_exact(two_by_two_table, alternative='two-sided') + # ``` + # This code reveals that (2,4), (4,2), (2,3), (3,3), (2,2) and (3,2) are not powered so we can always skip them + # Thus if the total count is 6 or less, there is no power. + self._min_observations_for_2_by_2 = 7 + self._not_powered_for_2_by_2 = PhenotypeMtcResult.fail( + code="HMF06", + reason=f"Skipping term with less than {self._min_observations_for_2_by_2} observations" + " (not powered for 2x2)", + ) + self._min_observations_for_2_by_3 = 6 + self._not_powered_for_2_by_3 = PhenotypeMtcResult.fail( + code="HMF06", + reason=f"Skipping term with less than {self._min_observations_for_2_by_3} observations" + " (not powered for 2x3)", + ) def filter( self, @@ -290,46 +378,39 @@ def filter( continue if term_id in self._general_hpo_terms: - results[idx] = PhenotypeMtcResult.fail("Skipping general term") + results[idx] = HpoMtcFilter.SKIPPING_GENERAL_TERM continue if not self._hpo.graph.is_ancestor_of( hpotk.constants.hpo.base.PHENOTYPIC_ABNORMALITY, term_id ): - results[idx] = PhenotypeMtcResult.fail("Skipping non phenotype term") + results[idx] = HpoMtcFilter.SKIPPING_NON_PHENOTYPE_TERM continue - # get total number of observations - # if term_id not in all_counts: - # reason_for_filtering_out["Skipping non-target term"] += 1 - # continue - ph_predicate = ph_predicates[idx] contingency_matrix = counts[idx] - total = contingency_matrix.sum().sum() + max_freq = HpoMtcFilter.get_maximum_group_observed_HPO_frequency( contingency_matrix, ph_predicate=ph_predicate, ) if max_freq < self._hpo_term_frequency_filter: - reason = ( - "Skipping term with maximum frequency " - f"that was less than threshold {self._hpo_term_frequency_filter}" - ) - results[idx] = PhenotypeMtcResult.fail(reason) + results[idx] = self._below_frequency_threshold continue - if contingency_matrix.shape == (2, 2) and total < 7: - reason = f"Skipping term with only {total} observations (not powered for 2x2)" - results[idx] = PhenotypeMtcResult.fail(reason) + total_count = contingency_matrix.sum().sum() + if contingency_matrix.shape == (2, 2) and total_count < self._min_observations_for_2_by_2: + results[idx] = self._not_powered_for_2_by_2 + continue + elif contingency_matrix.shape == (2, 3) and total_count < self._min_observations_for_2_by_3: + results[idx] = self._not_powered_for_2_by_3 continue if not HpoMtcFilter.some_cell_has_greater_than_one_count( counts=contingency_matrix, ph_predicate=ph_predicate, ): - reason = "Skipping term because no genotype has more than one observed HPO count" - results[idx] = PhenotypeMtcResult.fail(reason) + results[idx] = HpoMtcFilter.NO_GENOTYPE_HAS_MORE_THAN_ONE_HPO continue elif HpoMtcFilter.genotypes_have_same_hpo_proportions( @@ -337,29 +418,40 @@ def filter( gt_predicate=gt_predicate, ph_predicate=ph_predicate, ): - reason = "Skipping term because all genotypes have same HPO observed proportions" - results[idx] = PhenotypeMtcResult.fail(reason) + results[idx] = HpoMtcFilter.SKIPPING_SAME_OBSERVED_PROPORTIONS continue elif HpoMtcFilter.one_genotype_has_zero_hpo_observations( counts=contingency_matrix, gt_predicate=gt_predicate, ): - reason = "Skipping term because one genotype had zero observations" - results[idx] = PhenotypeMtcResult.fail(reason) + results[idx] = HpoMtcFilter.SKIPPING_SINCE_ONE_GENOTYPE_HAD_ZERO_OBSERVATIONS continue - # TODO: the code below actually did not work. - # for child_term_id in self._hpo.graph.get_children(term_id): - # if child_term_id in tested_counts_pf: - # if tested_counts_pf[child_term_id].equals(contingency_matrix): - # # TODO: should we make the match fuzzier by adding a tolerance instead of the exact matches? - # reason = "Child term with same counts previously tested" - # reason_for_filtering_out[reason] += 1 - # continue - + # Check if the term has exactly one child with a very similar number of individuals + # in the genotype and phenotype groups. + child_contingency_matrix = None + for child in self._hpo.graph.get_children(term_id): + if child in p_to_idx: + child_idx = p_to_idx[child] + if child_contingency_matrix is None: + child_contingency_matrix = counts[child_idx] + else: + # Found 2nd child of this term. + # We always want to test a term with + # >1 children in the target set. + child_contingency_matrix = None + break + + if child_contingency_matrix is not None: + if (contingency_matrix - child_contingency_matrix).abs().max(axis=None) < 1: + # Do not test if the count is exactly the same to the counts in the only child term. + results[idx] = HpoMtcFilter.SAME_COUNT_AS_THE_ONLY_CHILD + continue + + # ## # The term should be tested if we get here. - results[idx] = self._ok + results[idx] = PhenotypeMtcFilter.OK # There should be no `None` elements in `results` at this point. if any(r is None for r in results): @@ -375,6 +467,18 @@ def filter( # Ignoring the type hint, because we checked the type match above. return tuple(results) # type: ignore + def possible_results(self) -> typing.Collection[PhenotypeMtcResult]: + return ( + PhenotypeMtcFilter.OK, + HpoMtcFilter.SKIPPING_GENERAL_TERM, + HpoMtcFilter.SKIPPING_NON_PHENOTYPE_TERM, + self._below_frequency_threshold, + self._not_powered_for_2_by_2, + HpoMtcFilter.NO_GENOTYPE_HAS_MORE_THAN_ONE_HPO, + HpoMtcFilter.SKIPPING_SAME_OBSERVED_PROPORTIONS, + HpoMtcFilter.SKIPPING_SINCE_ONE_GENOTYPE_HAD_ZERO_OBSERVATIONS, + ) + def filter_method_name(self) -> str: return "HPO MTC filter" diff --git a/src/gpsea/analysis/pcats/_impl.py b/src/gpsea/analysis/pcats/_impl.py index e8045d2d0..c7eb6a616 100644 --- a/src/gpsea/analysis/pcats/_impl.py +++ b/src/gpsea/analysis/pcats/_impl.py @@ -13,7 +13,6 @@ from gpsea.model import Patient from gpsea.analysis.pcats.stats import CountStatistic -from ..predicate import PatientCategory from ..predicate.genotype import GenotypePolyPredicate from ..predicate.phenotype import P, PhenotypePolyPredicate from ..mtc_filter import PhenotypeMtcFilter, PhenotypeMtcResult @@ -101,14 +100,6 @@ class MultiPhenotypeAnalysisResult(typing.Generic[P], metaclass=abc.ABCMeta): with the order determined by the phenotype order. """ - @property - @abc.abstractmethod - def phenotypes(self) -> typing.Sequence[P]: - """ - Get the tested phenotypes. - """ - pass - @property @abc.abstractmethod def n_usable(self) -> typing.Sequence[int]: @@ -165,95 +156,6 @@ def total_tests(self) -> int: """ return sum(1 for pval in self.pvals if not math.isnan(pval)) - def summarize( - self, - hpo: hpotk.MinimalOntology, - target_phenotype_category: PatientCategory, - ) -> pd.DataFrame: - """ - Create a data frame with summary of the genotype phenotype analysis. - - The *rows* of the frame correspond to the analyzed HPO terms. - - The columns of the data frame have `Count` and `Percentage` per used genotype predicate. - - **Example** - - If we use :class:`~gpsea.analysis.predicate.genotype.VariantEffectPredicate` - which can compare phenotype with and without a missense variant, we will have a data frame - that looks like this:: - - MISSENSE_VARIANT on `NM_1234.5` No Yes - Count Percent Count Percent Corrected p value p value - Arachnodactyly [HP:0001166] 1/10 10% 13/16 81% 0.020299 0.000781 - Abnormality of the musculature [HP:0003011] 6/6 100% 11/11 100% 1.000000 1.000000 - Abnormal nervous system physiology [HP:0012638] 9/9 100% 15/15 100% 1.000000 1.000000 - ... ... ... ... ... ... ... - """ - - # TODO: this should be plotted by a formatter. - # Row index: a list of tested HPO terms - pheno_idx = pd.Index(self.phenotypes) - # Column index: multiindex of counts and percentages for all genotype predicate groups - gt_idx = pd.MultiIndex.from_product( - # TODO: fix the below - iterables=(self._gt_predicate.get_categories(), ("Count", "Percent")), - names=(self._gt_predicate.get_question_base(), None), - ) - - # We'll fill this frame with data - df = pd.DataFrame(index=pheno_idx, columns=gt_idx) - - for phenotype, count in zip(self.phenotypes, self.all_counts): - gt_totals = ( - count.sum() - ) # Sum across the phenotype categories (collapse the rows). - for gt_cat in count.columns: - cnt = count.loc[target_phenotype_category, gt_cat] - total = gt_totals[gt_cat] - df.loc[phenotype, (gt_cat, "Count")] = f"{cnt}/{total}" - pct = 0 if total == 0 else round(cnt * 100 / total) - df.loc[phenotype, (gt_cat, "Percent")] = f"{pct}%" - - # Add columns with p values and corrected p values (if present) - p_val_col_name = "p values" - corrected_p_val_col_name = "Corrected p values" - if self.corrected_pvals is not None: - df.insert( - df.shape[1], ("", corrected_p_val_col_name), self.corrected_pvals - ) - df.insert(df.shape[1], ("", p_val_col_name), self.pvals) - - # Format the index values: `HP:0001250` -> `Seizure [HP:0001250]` if the index members are HPO terms - # or just use the term ID CURIE otherwise (e.g. `OMIM:123000`). - labeled_idx = df.index.map( - lambda term_id: MultiPhenotypeAnalysisResult._format_term_id(hpo, term_id) - ) - - # Last, sort by corrected p value or just p value - df = df.set_index(labeled_idx) - if self.corrected_pvals is not None: - return df.sort_values( - by=[("", corrected_p_val_col_name), ("", p_val_col_name)] - ) - else: - return df.sort_values(by=("", p_val_col_name)) - - @staticmethod - def _format_term_id( - hpo: hpotk.MinimalOntology, - term_id: hpotk.TermId, - ) -> str: - """ - Format a `term_id` as a `str`. HPO term ID is formatted as ` []` whereas other term IDs - are formatted as CURIEs (e.g. `OMIM:123000`). - """ - if term_id.prefix == "HP": - term_name = hpo.get_term_name(term_id) - return f"{term_name} [{term_id.value}]" - else: - return term_id.value - class MultiPhenotypeAnalysis(typing.Generic[P], metaclass=abc.ABCMeta): @@ -377,7 +279,8 @@ def _compute_nominal_pvals( def _apply_mtc( self, pvals: typing.Sequence[float], - ) -> typing.Optional[typing.Sequence[float]]: + ) -> typing.Sequence[float]: + assert self._mtc_correction is not None _, corrected_pvals, _, _ = multitest.multipletests( pvals=pvals, alpha=self._mtc_alpha, @@ -392,14 +295,14 @@ class BaseMultiPhenotypeAnalysisResult(typing.Generic[P], MultiPhenotypeAnalysis def __init__( self, - phenotypes: typing.Sequence[P], + pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]], n_usable: typing.Sequence[int], all_counts: typing.Sequence[pd.DataFrame], pvals: typing.Sequence[float], corrected_pvals: typing.Optional[typing.Sequence[float]], gt_predicate: GenotypePolyPredicate, ): - self._phenotypes = tuple(phenotypes) + self._pheno_predicates = tuple(pheno_predicates) self._n_usable = tuple(n_usable) self._all_counts = tuple(all_counts) self._pvals = tuple(pvals) @@ -409,9 +312,19 @@ def __init__( assert isinstance(gt_predicate, GenotypePolyPredicate) self._gt_predicate = gt_predicate + @property + def gt_predicate(self) -> GenotypePolyPredicate: + return self._gt_predicate + + @property + def pheno_predicates( + self, + ) -> typing.Sequence[PhenotypePolyPredicate[P]]: + return self._pheno_predicates + @property def phenotypes(self) -> typing.Sequence[P]: - return self._phenotypes + return tuple(p.phenotype for p in self._pheno_predicates) @property def n_usable(self) -> typing.Sequence[int]: @@ -431,7 +344,7 @@ def corrected_pvals(self) -> typing.Optional[typing.Sequence[float]]: def __eq__(self, other): return isinstance(other, BaseMultiPhenotypeAnalysisResult) \ - and self._phenotypes == other._phenotypes \ + and self._pheno_predicates == other._pheno_predicates \ and self._n_usable == other._n_usable \ and self._all_counts == other._all_counts \ and self._pvals == other._pvals \ @@ -441,7 +354,7 @@ def __eq__(self, other): def __hash__(self): # NOTE: if a field is added, the hash method of the subclasses must be updated as well. return hash(( - self._phenotypes, self._n_usable, self._all_counts, + self._pheno_predicates, self._n_usable, self._all_counts, self._pvals, self._corrected_pvals, self._gt_predicate, )) @@ -474,10 +387,8 @@ def _compute_result( else: corrected_pvals = self._apply_mtc(pvals=pvals) - phenotypes = tuple(p.phenotype for p in pheno_predicates) - return BaseMultiPhenotypeAnalysisResult( - phenotypes=phenotypes, + pheno_predicates=pheno_predicates, n_usable=n_usable, all_counts=all_counts, pvals=pvals, @@ -496,7 +407,7 @@ class HpoTermAnalysisResult(BaseMultiPhenotypeAnalysisResult[hpotk.TermId]): def __init__( self, - phenotypes: typing.Sequence[hpotk.TermId], + pheno_predicates: typing.Iterable[PhenotypePolyPredicate[hpotk.TermId]], n_usable: typing.Sequence[int], all_counts: typing.Sequence[pd.DataFrame], pvals: typing.Sequence[float], @@ -507,7 +418,7 @@ def __init__( mtc_name: typing.Optional[str], ): super().__init__( - phenotypes=phenotypes, + pheno_predicates=pheno_predicates, n_usable=n_usable, all_counts=all_counts, pvals=pvals, @@ -549,7 +460,7 @@ def __eq__(self, other): def __hash__(self): return hash(( - self._phenotypes, self._n_usable, self._all_counts, + self._pheno_predicates, self._n_usable, self._all_counts, self._pvals, self._corrected_pvals, self._gt_predicate, self._mtc_filter_name, self._mtc_filter_results, self._mtc_name, )) @@ -590,7 +501,6 @@ def _compute_result( pheno_predicates = tuple(pheno_predicates) if len(pheno_predicates) == 0: raise ValueError("No phenotype predicates were provided") - phenotypes = tuple(p.phenotype for p in pheno_predicates) # 1 - Count the patients n_usable, all_counts = apply_predicates_on_patients( @@ -625,7 +535,7 @@ def _compute_result( corrected_pvals[mtc_mask] = self._apply_mtc(pvals=pvals[mtc_mask]) return HpoTermAnalysisResult( - phenotypes=phenotypes, + pheno_predicates=pheno_predicates, n_usable=n_usable, all_counts=all_counts, pvals=pvals, diff --git a/src/gpsea/analysis/predicate/genotype/__init__.py b/src/gpsea/analysis/predicate/genotype/__init__.py index 23ad8cfbb..9bf0b26aa 100644 --- a/src/gpsea/analysis/predicate/genotype/__init__.py +++ b/src/gpsea/analysis/predicate/genotype/__init__.py @@ -1,13 +1,17 @@ from ._api import GenotypePolyPredicate from ._api import VariantPredicate from ._counter import AlleleCounter -from ._gt_predicates import boolean_predicate, groups_predicate, filtering_predicate, sex_predicate, diagnosis_predicate -from ._gt_predicates import ModeOfInheritancePredicate +from ._gt_predicates import groups_predicate, sex_predicate, diagnosis_predicate +from ._gt_predicates import autosomal_dominant, autosomal_recessive +from ._gt_predicates import monoallelic_predicate, biallelic_predicate +from ._gt_predicates import ModeOfInheritancePredicate # TODO: remove before 1.0.0 from ._variant import VariantPredicates, ProteinPredicates __all__ = [ 'GenotypePolyPredicate', - 'boolean_predicate', 'groups_predicate', 'filtering_predicate', 'sex_predicate', 'diagnosis_predicate', + 'groups_predicate', 'sex_predicate', 'diagnosis_predicate', + 'autosomal_dominant', 'autosomal_recessive', + 'monoallelic_predicate', 'biallelic_predicate', 'ModeOfInheritancePredicate', 'AlleleCounter', 'VariantPredicate', 'VariantPredicates', 'ProteinPredicates', diff --git a/src/gpsea/analysis/predicate/genotype/_counter.py b/src/gpsea/analysis/predicate/genotype/_counter.py index c74be9098..8ad77d555 100644 --- a/src/gpsea/analysis/predicate/genotype/_counter.py +++ b/src/gpsea/analysis/predicate/genotype/_counter.py @@ -8,13 +8,14 @@ class AlleleCounter: :param predicate: a :class:`VariantPredicate` for selecting the target variants. """ - # TODO: this class should probably be an implementation detail, + # TODO: this class should probably be an implementation detail, # and not a public member of the package. def __init__( self, predicate: VariantPredicate, ): + assert isinstance(predicate, VariantPredicate) self._predicate = predicate def get_question(self) -> str: diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index 3a9301273..c5dbe07f4 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -1,7 +1,6 @@ -from cProfile import label import dataclasses -import enum import typing +import warnings from collections import defaultdict @@ -9,79 +8,11 @@ from gpsea.model import Patient, Sex -from .._api import Categorization, PatientCategory, PatientCategories +from .._api import Categorization, PatientCategory from ._api import GenotypePolyPredicate from ._api import VariantPredicate from ._counter import AlleleCounter - - -class AlleleCountingGenotypeBooleanPredicate(GenotypePolyPredicate): - # NOT PART OF THE PUBLIC API - """ - The predicate tests presence of at least one matching allele in the patient. - """ - YES = Categorization(PatientCategories.YES) - NO = Categorization(PatientCategories.NO) - - @staticmethod - def for_variant_predicate(predicate: VariantPredicate): - allele_counter = AlleleCounter(predicate=predicate) - return AlleleCountingGenotypeBooleanPredicate(allele_counter=allele_counter) - - def __init__(self, allele_counter: AlleleCounter): - self._allele_counter = allele_counter - - def get_categorizations(self) -> typing.Sequence[Categorization]: - """ - The predicate bins a patient into - :attr:`AlleleCountingGenotypeBooleanPredicate.NO` - or :class:`AlleleCountingGenotypeBooleanPredicate.YES` category. - """ - return ( - AlleleCountingGenotypeBooleanPredicate.YES, - AlleleCountingGenotypeBooleanPredicate.NO, - ) - - def get_question_base(self) -> str: - return self._allele_counter.get_question() - - def test(self, patient: Patient) -> typing.Optional[Categorization]: - self._check_patient(patient) - - allele_count = self._allele_counter.count(patient) - if allele_count > 0: - return AlleleCountingGenotypeBooleanPredicate.YES - elif allele_count == 0: - return AlleleCountingGenotypeBooleanPredicate.NO - else: - raise ValueError( - f"Allele counter should return a non-negative allele count: {allele_count}" - ) - - def __eq__(self, value: object) -> bool: - return ( - isinstance(value, AlleleCountingGenotypeBooleanPredicate) - and self._allele_counter == value._allele_counter - ) - - def __hash__(self) -> int: - return hash((self._allele_counter,)) - - def __str__(self) -> str: - return f"AlleleCountingGenotypeBooleanPredicate(allele_counter={self._allele_counter})" - - def __repr__(self) -> str: - return str(self) - - -def boolean_predicate(variant_predicate: VariantPredicate) -> GenotypePolyPredicate: - """ - Create a genotype boolean predicate from given `variant_predicate` - to test for presence of at least one matching allele in the patient. - """ - return AlleleCountingGenotypeBooleanPredicate.for_variant_predicate( - predicate=variant_predicate, - ) +from ._variant import VariantPredicates class AlleleCountingGroupsPredicate(GenotypePolyPredicate): @@ -186,125 +117,247 @@ def groups_predicate( ) -class FilteringGenotypePolyPredicate(GenotypePolyPredicate): - # NOT PART OF THE PUBLIC API +class PolyCountingGenotypePredicate(GenotypePolyPredicate): @staticmethod - def create( - predicate: "GenotypePolyPredicate", - targets: typing.Collection[Categorization], - ) -> "FilteringGenotypePolyPredicate": - # At least 2 target categorizations must be provided - if len(targets) <= 1: - raise ValueError( - f"At least 2 target categorizations must be provided but got {len(targets)}" - ) - - good_boys = tuple(isinstance(cat, Categorization) for cat in targets) - if not all(good_boys): - offenders = ", ".join( - str(i) for i, is_instance in enumerate(good_boys) if not is_instance - ) - raise ValueError( - f"The targets at following indices are not categorizations: [{offenders}]" - ) + def monoallelic( + a_predicate: VariantPredicate, + b_predicate: VariantPredicate, + names: typing.Tuple[str, str], + ): + count2cat = { + (1, 0): Categorization(PatientCategory(cat_id=0, name=names[0], description=f"Monoallelic {names[0]}")), + (0, 1): Categorization(PatientCategory(cat_id=1, name=names[1], description=f"Monoallelic {names[1]}")), + } - # All `allowed` categorizations must in fact be present in the `base` predicate. - cats_are_in_fact_present = tuple( - cat in predicate.get_categorizations() for cat in targets + return PolyCountingGenotypePredicate._for_predicates_and_categories( + count2cat=count2cat, + a_predicate=a_predicate, + b_predicate=b_predicate, ) - if not all(cats_are_in_fact_present): - missing = ", ".join( - c.category.name - for c, is_present in zip(targets, cats_are_in_fact_present) - if not is_present - ) - raise ValueError(f"Some from the categories are not present: {missing}") - if len(targets) == predicate.n_categorizations(): - raise ValueError( - f"It makes no sense to subset the a predicate with {predicate.n_categorizations()} categorizations " - f"with the same number ({len(targets)}) of targets" - ) + @staticmethod + def biallelic( + a_predicate: VariantPredicate, + b_predicate: VariantPredicate, + names: typing.Tuple[str, str], + ): + count2cat = { + (2, 0): Categorization( + PatientCategory( + cat_id=0, name=f'{names[0]}/{names[0]}', description=f"Biallelic {names[0]}", + ) + ), + (1, 1): Categorization( + PatientCategory( + cat_id=1, name=f'{names[0]}/{names[1]}', description=f"{names[0]}/{names[1]}", + ), + ), + (0, 2): Categorization( + PatientCategory( + cat_id=2, name=f'{names[1]}/{names[1]}', description=f"Biallelic {names[1]}" + ), + ), + } - return FilteringGenotypePolyPredicate( - predicate=predicate, - allowed=targets, + return PolyCountingGenotypePredicate._for_predicates_and_categories( + count2cat=count2cat, + a_predicate=a_predicate, + b_predicate=b_predicate, + ) + + @staticmethod + def _for_predicates_and_categories( + count2cat: typing.Mapping[typing.Tuple[int, int], Categorization], + a_predicate: VariantPredicate, + b_predicate: VariantPredicate, + ) -> "PolyCountingGenotypePredicate": + return PolyCountingGenotypePredicate( + a_counter=AlleleCounter(a_predicate), + b_counter=AlleleCounter(b_predicate), + count2cat=count2cat, ) def __init__( self, - predicate: "GenotypePolyPredicate", - allowed: typing.Iterable[Categorization], + count2cat: typing.Mapping[typing.Tuple[int, int], Categorization], + a_counter: AlleleCounter, + b_counter: AlleleCounter, ): - self._predicate = predicate - self._allowed = tuple(allowed) + self._count2cat = dict(count2cat) + self._categorizations = tuple(count2cat.values()) + self._a_counter = a_counter + self._b_counter = b_counter + self._hash = self._compute_hash() + + def _compute_hash(self) -> int: + hash_value = 17 + + self._groups = defaultdict(list) + for count, cat in self._count2cat.items(): + hash_value += 13 * hash(count) + hash_value += 13 * hash(cat) + + hash_value += 23 * hash(self._a_counter) + hash_value += 23 * hash(self._b_counter) + + return hash_value def get_categorizations(self) -> typing.Sequence[Categorization]: - return self._allowed + return self._categorizations def get_question_base(self) -> str: - return self._predicate.get_question_base() + return 'Allele group' def test(self, patient: Patient) -> typing.Optional[Categorization]: - cat = self._predicate.test(patient) - if cat in self._allowed: - return cat - else: - return None + self._check_patient(patient) - def __repr__(self): - return f"FilteringGenotypePolyPredicate(predicate={self._predicate}, allowed={self._allowed})" + a_count = self._a_counter.count(patient) + b_count = self._b_counter.count(patient) + counts = (a_count, b_count) + + return self._count2cat.get(counts, None) + + def __eq__(self, value: object) -> bool: + return isinstance(value, PolyCountingGenotypePredicate) \ + and self._count2cat == value._count2cat \ + and self._a_counter == value._a_counter \ + and self._b_counter == value._b_counter + + def __hash__(self) -> int: + return self._hash -def filtering_predicate( - predicate: GenotypePolyPredicate, - targets: typing.Collection[Categorization], +def monoallelic_predicate( + a_predicate: VariantPredicate, + b_predicate: VariantPredicate, + names: typing.Tuple[str, str] = ('A', 'B'), ) -> GenotypePolyPredicate: """ - Filtering predicate applies the base `predicate` but only returns the categorizations - from the provided `targets` collection. + The predicate bins patient into one of two groups, `A` and `B`, + based on presence of *exactly* one allele of a variant + that meets the predicate criteria. + + The number of alleles :math:`count_{A}` and :math:`count_{B}` + is computed using `a_predicate` and `b_predicate` + and the individual is assigned into a group + based on the following table: + + +-----------+-------------------+-------------------+ + | Group | :math:`count_{A}` | :math:`count_{B}` | + +===========+===================+===================+ + | A | 1 | 0 | + +-----------+-------------------+-------------------+ + | B | 0 | 1 | + +-----------+-------------------+-------------------+ + + The individuals with different allele counts + (e.g. :math:`count_{A} = 0` and :math:`count_{B} = 2`) + are assigned into the ``None`` group and, thus, omitted from the analysis. + + :param a_predicate: predicate to test if the variants + meet the criteria of the first group (named `A` by default). + :param b_predicate: predicate to test if the variants + meet the criteria of the second group (named `B` by default). + :param names: group names (default ``('A', 'B')``). + """ + return PolyCountingGenotypePredicate.monoallelic( + a_predicate=a_predicate, + b_predicate=b_predicate, + names=names, + ) - This can be useful if only some of the categorizations are interesting. - For instance, if we only seek to compare the differences between heterozygous and hemizygous variants, - but the predicate also bins the patients into homozygous reference, and biallelic alt genotype groups. - See the :ref:`filtering-predicate` section for an example. +def biallelic_predicate( + a_predicate: VariantPredicate, + b_predicate: VariantPredicate, + names: typing.Tuple[str, str] = ('A', 'B'), +) -> GenotypePolyPredicate: + """ + The predicate bins patient into one of the three groups, + `AA`, `AB`, and `BB`, + based on presence of one or two variant alleles + that meet the predicate criteria. + + The number of alleles :math:`count_{A}` and :math:`count_{B}` + is computed using `a_predicate` and `b_predicate` + and the individual is assigned into a group + based on the following table: + + +-----------+-------------------+-------------------+ + | Group | :math:`count_{A}` | :math:`count_{B}` | + +===========+===================+===================+ + | AA | 2 | 0 | + +-----------+-------------------+-------------------+ + | AB | 1 | 1 | + +-----------+-------------------+-------------------+ + | AA | 0 | 2 | + +-----------+-------------------+-------------------+ + + The individuals with different allele counts + (e.g. :math:`count_{A} = 1` and :math:`count_{B} = 2`) + are assigned into the ``None`` group and will be, thus, + omitted from the analysis. + + :param a_predicate: predicate to test if the variants + meet the criteria of the first group (named `A` by default). + :param b_predicate: predicate to test if the variants + meet the criteria of the second group (named `B` by default). + :param names: group names (default ``('A', 'B')``). + """ + return PolyCountingGenotypePredicate.biallelic( + a_predicate=a_predicate, + b_predicate=b_predicate, + names=names, + ) - The `predicate` is checked for being able to produce the all items in `targets` - and the `targets` must include at least 2 categorizations. - :param predicate: the base predicate whose categorizations are subject to filteration. - :param targets: the categorizations to retain +def autosomal_dominant( + variant_predicate: typing.Optional[VariantPredicate] = None, +) -> GenotypePolyPredicate: """ - return FilteringGenotypePolyPredicate.create( - predicate=predicate, - targets=targets, - ) + Create a predicate that assigns the patient either + into homozygous reference or heterozygous + group in line with the autosomal dominant mode of inheritance. + :param variant_predicate: a predicate for choosing the variants for testing + or `None` if all variants should be used. + """ + if variant_predicate is None: + variant_predicate = VariantPredicates.true() -@dataclasses.dataclass(eq=True, frozen=True) -class GenotypeGroup: - allele_count: int - sex: typing.Optional[Sex] - categorization: Categorization + return ModeOfInheritancePredicate._from_moi_info( + variant_predicate=variant_predicate, + mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_dominant(), + ) -class MendelianInheritanceAspect(enum.Enum): - AUTOSOMAL = 0 - """ - Related to chromosomes that do *not* determine the sex of an individual. +def autosomal_recessive( + variant_predicate: typing.Optional[VariantPredicate] = None, +) -> GenotypePolyPredicate: """ + Create a predicate that assigns the patient either into + homozygous reference, heterozygous, or biallelic alternative allele + (homozygous alternative or compound heterozygous) + group in line with the autosomal recessive mode of inheritance. - GONOSOMAL = 1 - """ - Related to chromosomes that determine the sex of an individual. + :param variant_predicate: a predicate for choosing the variants for testing + or `None` if all variants should be used """ + if variant_predicate is None: + variant_predicate = VariantPredicates.true() + + return ModeOfInheritancePredicate._from_moi_info( + variant_predicate=variant_predicate, + mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_recessive(), + ) - MITOCHONDRIAL = 2 - """ - Related to mitochondrial DNA. - """ + +@dataclasses.dataclass(eq=True, frozen=True) +class GenotypeGroup: + allele_count: int + sex: typing.Optional[Sex] + categorization: Categorization class ModeOfInheritanceInfo: @@ -332,13 +385,6 @@ class ModeOfInheritanceInfo: description="Homozygous alternate or compound heterozygous", ), ) - HEMI = Categorization( - PatientCategory( - cat_id=3, - name="HEMI", - description="Hemizygous", - ), - ) @staticmethod def autosomal_dominant() -> "ModeOfInheritanceInfo": @@ -355,7 +401,6 @@ def autosomal_dominant() -> "ModeOfInheritanceInfo": ), ) return ModeOfInheritanceInfo( - mendelian_inheritance_aspect=MendelianInheritanceAspect.AUTOSOMAL, groups=groups, ) @@ -379,62 +424,11 @@ def autosomal_recessive() -> "ModeOfInheritanceInfo": ), ) return ModeOfInheritanceInfo( - mendelian_inheritance_aspect=MendelianInheritanceAspect.AUTOSOMAL, - groups=groups, - ) - - @staticmethod - def x_dominant() -> "ModeOfInheritanceInfo": - groups = ( - GenotypeGroup( - allele_count=0, - sex=None, - categorization=ModeOfInheritanceInfo.HOM_REF, - ), - GenotypeGroup( - allele_count=1, - sex=None, - categorization=ModeOfInheritanceInfo.HET, - ), - ) - return ModeOfInheritanceInfo( - mendelian_inheritance_aspect=MendelianInheritanceAspect.GONOSOMAL, - groups=groups, - ) - - @staticmethod - def x_recessive() -> "ModeOfInheritanceInfo": - groups = ( - GenotypeGroup( - allele_count=0, - sex=None, - categorization=ModeOfInheritanceInfo.HOM_REF, - ), - GenotypeGroup( - allele_count=1, - sex=Sex.FEMALE, - categorization=ModeOfInheritanceInfo.HET, - ), - GenotypeGroup( - allele_count=2, - sex=Sex.FEMALE, - categorization=ModeOfInheritanceInfo.BIALLELIC_ALT, - ), - GenotypeGroup( - allele_count=1, - sex=Sex.MALE, - categorization=ModeOfInheritanceInfo.HEMI, - ), - ) - - return ModeOfInheritanceInfo( - mendelian_inheritance_aspect=MendelianInheritanceAspect.GONOSOMAL, groups=groups, ) def __init__( self, - mendelian_inheritance_aspect: MendelianInheritanceAspect, groups: typing.Iterable[GenotypeGroup], ): # We want this to be hashable but also keep a non-hashable dict @@ -442,10 +436,6 @@ def __init__( # The correctness depends on two default dicts with same keys and values # comparing equal. hash_value = 17 - assert isinstance(mendelian_inheritance_aspect, MendelianInheritanceAspect) - self._aspect = mendelian_inheritance_aspect - - hash_value += 31 * hash(self._aspect) self._groups = defaultdict(list) for group in groups: @@ -460,10 +450,6 @@ def groups(self) -> typing.Iterator[GenotypeGroup]: # Flatten `values()` which is an iterable of lists. return (group for meta_group in self._groups.values() for group in meta_group) - @property - def mendelian_inheritance_aspect(self) -> MendelianInheritanceAspect: - return self._aspect - def get_groups_for_allele_count( self, allele_count: int, @@ -474,19 +460,9 @@ def get_groups_for_allele_count( # No group for this allele count is OK return () - def is_autosomal(self) -> bool: - return self._aspect == MendelianInheritanceAspect.AUTOSOMAL - - def is_gonosomal(self) -> bool: - return self._aspect == MendelianInheritanceAspect.GONOSOMAL - - def is_mitochondrial(self) -> bool: - return self._aspect == MendelianInheritanceAspect.MITOCHONDRIAL - def __eq__(self, value: object) -> bool: return ( isinstance(value, ModeOfInheritanceInfo) - and self._aspect == value._aspect and self._groups == value._groups ) @@ -494,7 +470,7 @@ def __hash__(self) -> int: return self._hash def __str__(self) -> str: - return f"ModeOfInheritanceInfo(aspect={self._aspect}, groups={self._groups})" + return f"ModeOfInheritanceInfo(groups={self._groups})" def __repr__(self) -> str: return str(self) @@ -508,8 +484,8 @@ class ModeOfInheritancePredicate(GenotypePolyPredicate): @staticmethod def autosomal_dominant( - variant_predicate: VariantPredicate, - ) -> "ModeOfInheritancePredicate": + variant_predicate: typing.Optional[VariantPredicate] = None, + ) -> GenotypePolyPredicate: """ Create a predicate that assigns the patient either into homozygous reference or heterozygous @@ -517,15 +493,18 @@ def autosomal_dominant( :param variant_predicate: a predicate for choosing the variants for testing. """ - return ModeOfInheritancePredicate.from_moi_info( - variant_predicate=variant_predicate, - mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_dominant(), + # TODO: remove before 1.0.0 + warnings.warn( + "Use `gpsea.analysis.predicate.genotype.autosomal_dominant` instead", + DeprecationWarning, stacklevel=2, ) + return autosomal_dominant(variant_predicate) + @staticmethod def autosomal_recessive( - variant_predicate: VariantPredicate, - ) -> "ModeOfInheritancePredicate": + variant_predicate: typing.Optional[VariantPredicate] = None, + ) -> GenotypePolyPredicate: """ Create a predicate that assigns the patient either into homozygous reference, heterozygous, or biallelic alternative allele @@ -534,46 +513,16 @@ def autosomal_recessive( :param variant_predicate: a predicate for choosing the variants for testing. """ - return ModeOfInheritancePredicate.from_moi_info( - variant_predicate=variant_predicate, - mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_recessive(), + # TODO: remove before 1.0.0 + warnings.warn( + "Use `gpsea.analysis.predicate.genotype.autosomal_recessive` instead", + DeprecationWarning, stacklevel=2, ) - @staticmethod - def x_dominant( - variant_predicate: VariantPredicate, - ) -> "ModeOfInheritancePredicate": - """ - Create a predicate that assigns the patient either into - homozygous reference or heterozygous - group in line with the X-linked dominant mode of inheritance. - - :param variant_predicate: a predicate for choosing the variants for testing. - """ - return ModeOfInheritancePredicate.from_moi_info( - variant_predicate=variant_predicate, - mode_of_inheritance_data=ModeOfInheritanceInfo.x_dominant(), - ) + return autosomal_recessive(variant_predicate) @staticmethod - def x_recessive( - variant_predicate: VariantPredicate, - ) -> "ModeOfInheritancePredicate": - """ - Create a predicate that assigns the patient either into - homozygous reference, heterozygous, biallelic alternative allele - (homozygous alternative or compound heterozygous), or hemizygous - group in line with the X-linked recessive mode of inheritance. - - :param variant_predicate: a predicate for choosing the variants for testing. - """ - return ModeOfInheritancePredicate.from_moi_info( - variant_predicate=variant_predicate, - mode_of_inheritance_data=ModeOfInheritanceInfo.x_recessive(), - ) - - @staticmethod - def from_moi_info( + def _from_moi_info( variant_predicate: VariantPredicate, mode_of_inheritance_data: ModeOfInheritanceInfo, ) -> "ModeOfInheritancePredicate": @@ -605,7 +554,7 @@ def __init__( ) if issues: raise ValueError("Cannot create predicate: {}".format(", ".join(issues))) - self._question = "Which genotype group does the patient fit in" + self._question = "What is the genotype group" def get_categorizations(self) -> typing.Sequence[Categorization]: return self._categorizations @@ -619,41 +568,12 @@ def test( ) -> typing.Optional[Categorization]: self._check_patient(patient) - if self._moi_info.is_autosomal(): - allele_count = self._allele_counter.count(patient) - groups = self._moi_info.get_groups_for_allele_count(allele_count) - if len(groups) == 1: - return groups[0].categorization - else: - return None - elif self._moi_info.is_gonosomal(): - if patient.sex.is_provided(): - allele_count = self._allele_counter.count(patient) - groups = self._moi_info.get_groups_for_allele_count(allele_count) - if len(groups) == 0: - # Unable to assign the individual. - return None - elif len(groups) == 1: - # We can only assign into one category no matter what the individual's sex is. - return groups[0].categorization - else: - # We choose depending on the sex. - for group in groups: - if group.sex is not None and group.sex == patient.sex: - return group.categorization - return None - else: - # We must have patient's sex - # to do any meaningful analysis - # in the non-autosomal scenario. - return None - - elif self._moi_info.is_mitochondrial(): - # Cannot deal with mitochondrial inheritance right now. - return None + allele_count = self._allele_counter.count(patient) + groups = self._moi_info.get_groups_for_allele_count(allele_count) + if len(groups) == 1: + return groups[0].categorization else: - # Bug, please report to the developers - raise ValueError("Unexpected mode of inheritance condition") + return None def __eq__(self, value: object) -> bool: return ( @@ -719,6 +639,12 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: else: return None + def __eq__(self, value: object) -> bool: + return isinstance(value, SexGenotypePredicate) + + def __hash__(self) -> int: + return 31 + INSTANCE = SexGenotypePredicate() @@ -727,7 +653,7 @@ def sex_predicate() -> GenotypePolyPredicate: """ Get a genotype predicate for categorizing patients by their :class:`~gpsea.model.Sex`. - See the :ref:`sex-predicate` section for an example. + See the :ref:`male-female-predicate` section for an example. """ return INSTANCE @@ -748,7 +674,7 @@ def create( pass else: raise ValueError(f"{d} is neither `str` nor `hpotk.TermId`") - + diagnosis_ids.append(d) if labels is None: @@ -782,12 +708,13 @@ def __init__( self._categorizations = tuple( sorted(categorizations.values(), key=lambda c: c.category.cat_id) ) + self._hash = hash(tuple(categorizations.items())) def get_categorizations(self) -> typing.Sequence[Categorization]: return self._categorizations def get_question_base(self) -> str: - return 'What disease was diagnosed' + return "What disease was diagnosed" def test(self, patient: Patient) -> typing.Optional[Categorization]: categorization = None @@ -797,15 +724,22 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: except KeyError: # No match for this disease, no problem. continue - + if categorization is None: # First time we found a candidate disease categorization = candidate else: # Ambiguous match. We found several matching diagnoses! return None - + return categorization + + def __eq__(self, value: object) -> bool: + return isinstance(value, DiagnosisPredicate) \ + and self._id2cat == value._id2cat + + def __hash__(self) -> int: + return self._hash def diagnosis_predicate( diff --git a/src/gpsea/analysis/predicate/genotype/_predicates.py b/src/gpsea/analysis/predicate/genotype/_predicates.py index 03fae44cd..b1ac93e6f 100644 --- a/src/gpsea/analysis/predicate/genotype/_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_predicates.py @@ -10,10 +10,41 @@ from ._api import VariantPredicate +class AlwaysTrueVariantPredicate(VariantPredicate): + """ + `AlwaysTrueVariantPredicate` returns `True` for any variant. + """ + + @staticmethod + def get_instance() -> "AlwaysTrueVariantPredicate": + return ALWAYS_TRUE + + def get_question(self) -> str: + return "" + + def test(self, variant: Variant) -> bool: + return True + + def __eq__(self, value: object) -> bool: + return isinstance(value, AlwaysTrueVariantPredicate) + + def __hash__(self) -> int: + return 17 + + def __str__(self): + return repr(self) + + def __repr__(self): + return 'AlwaysTrueVariantPredicate()' + + +ALWAYS_TRUE = AlwaysTrueVariantPredicate() + + class VariantEffectPredicate(VariantPredicate): """ `VariantEffectPredicate` is a `VariantPredicate` that sets up testing - for a specific `VariantEffect` on a given transcript ID. + for a specific `VariantEffect` on a given transcript ID. Args: effect (VariantEffect): the variant effect to search for @@ -28,16 +59,6 @@ def get_question(self) -> str: return f'{self._effect.name} on {self._tx_id}' def test(self, variant: Variant) -> bool: - """Tests if the `Variant` causes the specified `VariantEffect` - on the given transcript. - - Args: - variant (Variant): a - - Returns: - bool: _description_ - """ - tx_anno = variant.get_tx_anno_by_tx_id(self._tx_id) if tx_anno is None: return False @@ -98,7 +119,7 @@ def __repr__(self): class VariantGenePredicate(VariantPredicate): """ - `VariantGenePredicate` tests if the variant + `VariantGenePredicate` tests if the variant is annotated to affect a gene denoted by a `symbol`. Args: @@ -134,7 +155,7 @@ def __repr__(self): class VariantTranscriptPredicate(VariantPredicate): """ - `VariantTranscriptPredicate` tests if the variant + `VariantTranscriptPredicate` tests if the variant is annotated to affect a transcript with `tx_id` accession. Args: @@ -170,24 +191,23 @@ def __repr__(self): class VariantExonPredicate(VariantPredicate): """ - `VariantExonPredicate` tests if the variant affects + `VariantExonPredicate` tests if the variant affects *n*-th exon of the transcript of interest. .. warning:: - We use 1-based numbering to number the exons, + We use 1-based numbering to number the exons, not the usual 0-based numbering of the computer science. - Therefore, the first exon of the transcript + Therefore, the first exon of the transcript has ``exon_number==1``, the second exon is ``2``, and so on ... .. warning:: - We do not check if the `exon_number` spans + We do not check if the `exon_number` spans beyond the number of exons of the given `transcript_id`! - Therefore, ``exon_number==10,000`` will effectively - return :attr:`GenotypeBooleanPredicate.FALSE` - for *all* patients!!! 😱 - Well, at least the patients of the *Homo sapiens sapiens* taxon... + Therefore, ``exon_number==10,000`` will effectively + return `False` for *all* variants!!! 😱 + Well, at least the genomic variants of the *Homo sapiens sapiens* taxon... :param exon: a positive `int` of the target exon :param tx_id: the accession of the transcript of interest, e.g. `NM_123456.7` @@ -195,10 +215,12 @@ class VariantExonPredicate(VariantPredicate): def __init__( self, - exon: int, + exon: int, tx_id: str, ): + assert isinstance(exon, int) and exon > 0, '`exon` must be a positive `int`' self._exon = exon + assert isinstance(tx_id, str) self._tx_id = tx_id def get_question(self) -> str: @@ -337,20 +359,20 @@ def __repr__(self): def _decode_operator(op: str) -> typing.Callable[[int, int], bool]: - if op == '<': - return operator.lt - elif op == '<=': - return operator.le - elif op == '==': - return operator.eq - elif op == '!=': - return operator.ne - elif op == '>=': - return operator.ge - elif op == '>': - return operator.gt - else: - raise ValueError(f'Unsupported operator {op}') + if op == '<': + return operator.lt + elif op == '<=': + return operator.le + elif op == '==': + return operator.eq + elif op == '!=': + return operator.ne + elif op == '>=': + return operator.ge + elif op == '>': + return operator.gt + else: + raise ValueError(f'Unsupported operator {op}') class ChangeLengthPredicate(VariantPredicate): @@ -394,7 +416,7 @@ def __repr__(self): class RefAlleleLengthPredicate(VariantPredicate): """ - `RefAlleleLengthPredicate` tests if the length of the variant's reference allele + `RefAlleleLengthPredicate` tests if the length of the variant's reference allele is greater than, equal, or less than certain value. """ @@ -455,7 +477,8 @@ def __init__( self._tx_id = tx_id def get_question(self) -> str: - return f'variant affects aminoacid(s) between {self._region.start} and {self._region.end} on protein encoded by transcript {self._tx_id}' + return f'variant affects aminoacid(s) between {self._region.start} and {self._region.end} ' \ + f'on protein encoded by transcript {self._tx_id}' def test(self, variant: Variant) -> bool: tx_anno = variant.get_tx_anno_by_tx_id(self._tx_id) @@ -494,9 +517,9 @@ class ProteinFeatureTypePredicate(VariantPredicate): """ def __init__( - self, - feature_type: FeatureType, - tx_id: str, + self, + feature_type: FeatureType, + tx_id: str, protein_metadata_service: ProteinMetadataService, ): self._feature_type = feature_type @@ -504,7 +527,8 @@ def __init__( self._prot_service = protein_metadata_service def get_question(self) -> str: - return f'variant affects {self._feature_type.name} feature type on the protein encoded by transcript {self._tx_id}' + return f'variant affects {self._feature_type.name} feature type on the protein ' \ + f'encoded by transcript {self._tx_id}' def test(self, variant: Variant) -> bool: tx_anno = variant.get_tx_anno_by_tx_id(self._tx_id) @@ -519,7 +543,7 @@ def test(self, variant: Variant) -> bool: return False protein_meta = self._prot_service.annotate(protein_id) - for feat in protein_meta.protein_features: + for feat in protein_meta.protein_features: if feat.feature_type == self._feature_type and location.overlaps_with(feat.info.region): return True return False @@ -538,7 +562,11 @@ def __str__(self): return repr(self) def __repr__(self): - return f'ProteinFeatureTypePredicate(feature_type={self._feature_type}, tx_id={self._tx_id}, protein_metadata_service={self._prot_service})' + return 'ProteinFeatureTypePredicate(' \ + f'feature_type={self._feature_type}, ' \ + f'tx_id={self._tx_id}, ' \ + f'protein_metadata_service={self._prot_service}' \ + ')' class ProteinFeaturePredicate(VariantPredicate): @@ -574,7 +602,7 @@ def test(self, variant: Variant) -> bool: return False protein = self._prot_service.annotate(protein_id) - for feat in protein.protein_features: + for feat in protein.protein_features: if feat.info.name == self._feature_name and location.overlaps_with(feat.info.region): return True return False @@ -593,4 +621,7 @@ def __str__(self): return repr(self) def __repr__(self): - return f'ProteinFeaturePredicate(feature_name={self._feature_name}, tx_id={self._tx_id}, protein_metadata_service={self._prot_service})' + return f'ProteinFeaturePredicate(' \ + f'feature_name={self._feature_name}, ' \ + f'tx_id={self._tx_id}, ' \ + f'protein_metadata_service={self._prot_service})' diff --git a/src/gpsea/analysis/predicate/genotype/_variant.py b/src/gpsea/analysis/predicate/genotype/_variant.py index 54d397352..99f23638b 100644 --- a/src/gpsea/analysis/predicate/genotype/_variant.py +++ b/src/gpsea/analysis/predicate/genotype/_variant.py @@ -18,6 +18,14 @@ class VariantPredicates: that are relatively simple to configure. """ + @staticmethod + def true() -> VariantPredicate: + """ + Prepare an absolutely inclusive :class:`VariantPredicate` - a predicate that returns `True` + for any variant whatsoever. + """ + return AlwaysTrueVariantPredicate.get_instance() + @staticmethod def all(predicates: typing.Iterable[VariantPredicate]) -> VariantPredicate: """ @@ -140,6 +148,21 @@ def exon( """ Prepare a :class:`VariantPredicate` that tests if the variant overlaps with an exon of a specific transcript. + .. warning:: + + We use 1-based numbering to number the exons, + not the usual 0-based numbering of the computer science. + Therefore, the first exon of the transcript + has ``exon_number==1``, the second exon is ``2``, and so on ... + + .. warning:: + + We do not check if the `exon_number` spans + beyond the number of exons of the given `transcript_id`! + Therefore, ``exon_number==10,000`` will effectively return `False` + for *all* variants!!! 😱 + Well, at least the genome variants of the *Homo sapiens sapiens* taxon... + Args: exon: a non-negative `int` with the index of the target exon (e.g. `0` for the 1st exon, `1` for the 2nd, ...) diff --git a/src/gpsea/analysis/predicate/phenotype/__init__.py b/src/gpsea/analysis/predicate/phenotype/__init__.py index 2efa1e553..a70f954b9 100644 --- a/src/gpsea/analysis/predicate/phenotype/__init__.py +++ b/src/gpsea/analysis/predicate/phenotype/__init__.py @@ -6,13 +6,13 @@ or using the phenotype features encoded into HPO terms (:class:`PropagatingPhenotypePredicate`). """ -from ._pheno import PhenotypePolyPredicate, PropagatingPhenotypePredicate +from ._pheno import PhenotypePolyPredicate, HpoPredicate from ._pheno import DiseasePresencePredicate from ._pheno import PhenotypeCategorization, P from ._util import prepare_predicates_for_terms_of_interest, prepare_hpo_terms_of_interest __all__ = [ - 'PhenotypePolyPredicate', 'PropagatingPhenotypePredicate', + 'PhenotypePolyPredicate', 'HpoPredicate', 'DiseasePresencePredicate', 'PhenotypeCategorization', 'P', 'prepare_predicates_for_terms_of_interest', 'prepare_hpo_terms_of_interest', diff --git a/src/gpsea/analysis/predicate/phenotype/_pheno.py b/src/gpsea/analysis/predicate/phenotype/_pheno.py index 3b2d9a7b4..004ea6722 100644 --- a/src/gpsea/analysis/predicate/phenotype/_pheno.py +++ b/src/gpsea/analysis/predicate/phenotype/_pheno.py @@ -92,13 +92,15 @@ def present_phenotype_category(self) -> PatientCategory: return self.present_phenotype_categorization.category -class PropagatingPhenotypePredicate(PhenotypePolyPredicate[hpotk.TermId]): +class HpoPredicate(PhenotypePolyPredicate[hpotk.TermId]): """ - `PropagatingPhenotypePredicate` tests if a patient is annotated with an HPO term. + `HpoPredicate` tests if a patient is annotated with an HPO term. Note, `query` must be a term of the provided `hpo`! - :param hpo: HPO object + See :ref:`hpo-predicate` section for an example usage. + + :param hpo: HPO ontology :param query: the HPO term to test :param missing_implies_phenotype_excluded: `True` if lack of an explicit annotation implies term's absence`. """ @@ -195,8 +197,17 @@ def test( return None + def __eq__(self, value: object) -> bool: + return isinstance(value, HpoPredicate) \ + and self._hpo.version == value._hpo.version \ + and self._query == value._query \ + and self._missing_implies_phenotype_excluded == value._missing_implies_phenotype_excluded + + def __hash__(self) -> int: + return hash((self._hpo.version, self._query, self._missing_implies_phenotype_excluded)) + def __repr__(self): - return f"PropagatingPhenotypeBooleanPredicate(query={self._query})" + return f"HpoPredicate(query={self._query})" class DiseasePresencePredicate(PhenotypePolyPredicate[hpotk.TermId]): @@ -262,5 +273,12 @@ def test( return self._diagnosis_excluded + def __eq__(self, value: object) -> bool: + return isinstance(value, DiseasePresencePredicate) \ + and self._query == value._query + + def __hash__(self) -> int: + return hash((self._query,)) + def __repr__(self): return f"DiseasePresencePredicate(query={self._query})" diff --git a/src/gpsea/analysis/predicate/phenotype/_util.py b/src/gpsea/analysis/predicate/phenotype/_util.py index ac73dd02c..1a0e33d59 100644 --- a/src/gpsea/analysis/predicate/phenotype/_util.py +++ b/src/gpsea/analysis/predicate/phenotype/_util.py @@ -4,7 +4,7 @@ import hpotk -from ._pheno import PhenotypePolyPredicate, PropagatingPhenotypePredicate +from ._pheno import PhenotypePolyPredicate, HpoPredicate from gpsea.model import Patient @@ -26,7 +26,7 @@ def prepare_predicates_for_terms_of_interest( (either directly or indirectly) for the term to be included in the analysis. """ return tuple( - PropagatingPhenotypePredicate( + HpoPredicate( hpo=hpo, query=term, missing_implies_phenotype_excluded=missing_implies_excluded, diff --git a/src/gpsea/data/__init__.py b/src/gpsea/data/__init__.py deleted file mode 100644 index e0372ba7c..000000000 --- a/src/gpsea/data/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from ._toy import get_toy_cohort - -__all__ = ['get_toy_cohort'] diff --git a/src/gpsea/data/_toy.py b/src/gpsea/data/_toy.py deleted file mode 100644 index 24a3b9c4b..000000000 --- a/src/gpsea/data/_toy.py +++ /dev/null @@ -1,314 +0,0 @@ -from hpotk import TermId - -from gpsea.model import * -from gpsea.model.genome import Contig, GenomicRegion, Strand - -CONTIG = Contig("1", "GB_ACC", "REFSEQ_NAME", "UCSC_NAME", 1_000) - - -def make_region(start: int, end: int) -> GenomicRegion: - return GenomicRegion(CONTIG, start, end, Strand.POSITIVE) - - -def get_toy_cohort() -> Cohort: - # TODO - Lauren - implement sample patients from the terms below - # - Arachnodactyly HP:0001166 - # - Focal clonic seizure HP:0002266 - # - Perimembranous ventricular septal defect HP:0011682 - # - Hepatosplenomegaly HP:0001433 - # - Tubularization of Bowman capsule HP:0032648 - # - Intercostal muscle weakness HP:0004878 - # - Enuresis nocturna HP:0010677 - # - Spasticity HP:0001257 - # - Chronic pancreatitis HP:0006280 - - arachnodactyly_T = Phenotype(TermId.from_curie("HP:0001166"), True) - focal_clonic_seizure_T = Phenotype(TermId.from_curie("HP:0002266"), True) - seizure_T = Phenotype(TermId.from_curie("HP:0001250"), True) - spasticity_T = Phenotype(TermId.from_curie("HP:0001257"), True) - arachnodactyly_F = Phenotype(TermId.from_curie("HP:0001166"), False) - focal_clonic_seizure_F = Phenotype(TermId.from_curie("HP:0002266"), False) - seizure_F = Phenotype(TermId.from_curie("HP:0001250"), False) - spasticity_F = Phenotype(TermId.from_curie("HP:0001257"), False) - - Disease_T = Disease(TermId.from_curie("OMIM:001234"), "Test present disease", True) - Disease_F = Disease(TermId.from_curie("OMIM:001234"), "Test absent disease", False) - - snv = Variant.create_variant_from_scratch( - VariantCoordinates(make_region(280, 281), "A", "G", 0), - "FakeGene", - "NM_1234.5", - "NM_1234.5:c.180A>G", - False, - [VariantEffect.MISSENSE_VARIANT], - [1], - "NP_09876.5", - "NP_09876.5:p.Unk200", - 60, - 60, - Genotypes.from_mapping( - { - SampleLabels("A"): Genotype.HETEROZYGOUS, - SampleLabels("B"): Genotype.HETEROZYGOUS, - SampleLabels("C"): Genotype.HOMOZYGOUS_ALTERNATE, - SampleLabels("D"): Genotype.HETEROZYGOUS, - SampleLabels("E"): Genotype.HETEROZYGOUS, - SampleLabels("G"): Genotype.HETEROZYGOUS, - SampleLabels("J"): Genotype.HETEROZYGOUS, - SampleLabels("K"): Genotype.HETEROZYGOUS, - SampleLabels("M"): Genotype.HETEROZYGOUS, - SampleLabels("N"): Genotype.HOMOZYGOUS_ALTERNATE, - SampleLabels("P"): Genotype.HETEROZYGOUS, - SampleLabels("Q"): Genotype.HOMOZYGOUS_ALTERNATE, - SampleLabels("R"): Genotype.HETEROZYGOUS, - SampleLabels("T"): Genotype.HETEROZYGOUS, - SampleLabels("V"): Genotype.HETEROZYGOUS, - SampleLabels("Y"): Genotype.HETEROZYGOUS, - } - ), - ) - deletion = Variant.create_variant_from_scratch( - VariantCoordinates(make_region(360, 363), "TTC", "T", -2), - "FakeGene", - "NM_1234.5", - "NM_1234.5:c.261_263del", - False, - [VariantEffect.FRAMESHIFT_VARIANT], - [2], - "NP_09876.5", - "NP_09876.5:p.Unk200", - 86, - 87, - Genotypes.from_mapping( - { - SampleLabels("D"): Genotype.HETEROZYGOUS, - SampleLabels("F"): Genotype.HETEROZYGOUS, - SampleLabels("G"): Genotype.HETEROZYGOUS, - SampleLabels("H"): Genotype.HETEROZYGOUS, - SampleLabels("I"): Genotype.HOMOZYGOUS_ALTERNATE, - SampleLabels("L"): Genotype.HETEROZYGOUS, - SampleLabels("O"): Genotype.HETEROZYGOUS, - SampleLabels("R"): Genotype.HETEROZYGOUS, - SampleLabels("S"): Genotype.HOMOZYGOUS_ALTERNATE, - SampleLabels("U"): Genotype.HETEROZYGOUS, - SampleLabels("W"): Genotype.HETEROZYGOUS, - SampleLabels("X"): Genotype.HETEROZYGOUS, - SampleLabels("Z"): Genotype.HETEROZYGOUS, - } - ), - ) - het_dup = Variant.create_variant_from_scratch( - VariantCoordinates(make_region(175, 176), "T", "TG", 1), - "FakeGene", - "NM_1234.5", - "NM_1234.5:c.75A>G", - False, - [VariantEffect.FRAMESHIFT_VARIANT], - [1], - "NP_09876.5", - "NP_09876.5:p.Unk200", - 25, - 25, - Genotypes.empty(), - ) # Not used in the patients below, hence `empty()`. - hom_dup = Variant.create_variant_from_scratch( - VariantCoordinates(make_region(175, 176), "T", "TG", 1), - "FakeGene", - "NM_1234.5", - "NM_1234.5:c.75A>G", - False, - [VariantEffect.FRAMESHIFT_VARIANT], - [1], - "NP_09876.5", - "NP_09876.5:p.Unk200", - 25, - 25, - Genotypes.empty(), - ) # Not used in the patients below, hence `empty()`. - - patients = ( - Patient.from_raw_parts( - SampleLabels("A"), - sex=None, - phenotypes=(arachnodactyly_T, spasticity_F, seizure_T), - variants=[snv], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("B"), - sex=None, - phenotypes=(arachnodactyly_T, seizure_T, spasticity_T), - variants=[snv], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("C"), - sex=None, - phenotypes=(arachnodactyly_F, spasticity_T, seizure_T), - variants=[snv], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("D"), - sex=None, - phenotypes=(arachnodactyly_T, spasticity_T, seizure_T), - variants=[snv, deletion], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("E"), - sex=None, - phenotypes=(arachnodactyly_T, spasticity_T, seizure_F), - variants=[snv], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("F"), - sex=None, - phenotypes=(arachnodactyly_F, spasticity_F, seizure_T), - variants=[deletion], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("G"), - sex=None, - phenotypes=(arachnodactyly_T, seizure_T, spasticity_T), - variants=[snv, deletion], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("H"), - sex=None, - phenotypes=(arachnodactyly_T, seizure_T, spasticity_F), - variants=[deletion], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("I"), - sex=None, - phenotypes=(arachnodactyly_F, spasticity_F, seizure_T), - variants=[deletion], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("J"), - sex=None, - phenotypes=(arachnodactyly_T, seizure_T, spasticity_T), - variants=[snv], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("K"), - sex=None, - phenotypes=(arachnodactyly_F, spasticity_T, seizure_T), - variants=[snv], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("L"), - sex=None, - phenotypes=(arachnodactyly_F, seizure_F, spasticity_F), - variants=[deletion], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("M"), - sex=None, - phenotypes=(arachnodactyly_T, seizure_F, spasticity_T), - variants=[snv], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("N"), - sex=None, - phenotypes=(arachnodactyly_F, seizure_T, spasticity_F), - variants=[snv], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("O"), - sex=None, - phenotypes=(arachnodactyly_F, seizure_F, spasticity_T), - variants=[deletion], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("P"), - sex=None, - phenotypes=(arachnodactyly_T, seizure_T, spasticity_F), - variants=[snv], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("Q"), - sex=None, - phenotypes=(arachnodactyly_T, seizure_F, spasticity_F), - variants=[snv], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("R"), - sex=None, - phenotypes=(arachnodactyly_T, seizure_T, spasticity_F), - variants=[snv, deletion], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("S"), - sex=None, - phenotypes=(arachnodactyly_F, seizure_T, spasticity_T), - variants=[deletion], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("T"), - sex=None, - phenotypes=(arachnodactyly_T, seizure_F, spasticity_T), - variants=[snv], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("U"), - sex=None, - phenotypes=(arachnodactyly_F, seizure_T, spasticity_T), - variants=[deletion], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("V"), - sex=None, - phenotypes=(arachnodactyly_T, seizure_T, spasticity_T), - variants=[snv], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("W"), - sex=None, - phenotypes=(arachnodactyly_F, seizure_T, spasticity_T), - variants=[deletion], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("X"), - sex=None, - phenotypes=(arachnodactyly_F, seizure_T, spasticity_T), - variants=[deletion], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("Y"), - sex=None, - phenotypes=(arachnodactyly_T, seizure_T, spasticity_T), - variants=[snv], - diseases=[Disease_T], - ), - Patient.from_raw_parts( - SampleLabels("Z"), - sex=None, - phenotypes=(arachnodactyly_F, seizure_T, spasticity_T), - variants=[deletion], - diseases=[Disease_T], - ), - ) - - return Cohort.from_patients(patients) diff --git a/src/gpsea/model/_variant_effects.py b/src/gpsea/model/_variant_effects.py index e15452e08..9567592d4 100644 --- a/src/gpsea/model/_variant_effects.py +++ b/src/gpsea/model/_variant_effects.py @@ -1,7 +1,10 @@ -from enum import Enum +import enum +import typing +import hpotk -class VariantEffect(Enum): + +class VariantEffect(enum.Enum): """ `VariantEffect` represents consequences of a variant on transcript that are supported by GPSEA. @@ -47,7 +50,7 @@ class VariantEffect(Enum): THREE_PRIME_UTR_VARIANT = "SO:0001624" NON_CODING_TRANSCRIPT_EXON_VARIANT = "SO:0001792" INTRON_VARIANT = "SO:0001627" - NMD_TRANSCRIPT_VARIANT = "SO:0001621", + NMD_TRANSCRIPT_VARIANT = "SO:0001621" NON_CODING_TRANSCRIPT_VARIANT = "SO:0001619" UPSTREAM_GENE_VARIANT = "SO:0001631" DOWNSTREAM_GENE_VARIANT = "SO:0001632" @@ -62,6 +65,43 @@ class VariantEffect(Enum): INTERGENIC_VARIANT = "SO:0001628" SEQUENCE_VARIANT = "SO:0001060" + def to_display(self) -> str: + """ + Get a concise name of the variant effect that is suitable for showing to humans. + + Example + ^^^^^^^ + + >>> from gpsea.model import VariantEffect + >>> VariantEffect.MISSENSE_VARIANT.to_display() + 'missense' + >>> VariantEffect.SPLICE_DONOR_5TH_BASE_VARIANT.to_display() + 'splice donor 5th base' + + :returns: a `str` with the name or `'n/a'` if the variant effect was not assigned a concise name. + """ + return effect_to_display.get(self, "n/a") + + @staticmethod + def structural_so_id_to_display(so_term: typing.Union[hpotk.TermId, str]) -> str: + """ + Get a `str` with a concise name for representing a Sequence Ontology (SO) term identifier. + + Example + ^^^^^^^ + + >>> from gpsea.model import VariantEffect + >>> VariantEffect.structural_so_id_to_display('SO:1000029') + 'chromosomal deletion' + + :param so_term: a CURIE `str` or a :class:`~hpotk.TermId` with the query SO term. + :returns: a `str` with the concise name for the SO term or `'n/a'` if a name has not been assigned yet. + """ + if isinstance(so_term, hpotk.TermId): + so_term = so_term.value + + return so_id_to_display.get(so_term, "n/a") + def __init__(self, curie: str): self._curie = curie @@ -75,3 +115,54 @@ def curie(self) -> str: def __str__(self) -> str: return self.name.lower() + + +effect_to_display = { + VariantEffect.TRANSCRIPT_ABLATION: "transcript ablation", + VariantEffect.SPLICE_ACCEPTOR_VARIANT: "splice acceptor", + VariantEffect.SPLICE_DONOR_VARIANT: "splice donor", + VariantEffect.STOP_GAINED: "stop gained", + VariantEffect.FRAMESHIFT_VARIANT: "frameshift", + VariantEffect.STOP_LOST: "stop lost", + VariantEffect.START_LOST: "start lost", + VariantEffect.TRANSCRIPT_AMPLIFICATION: "transcript amplification", + VariantEffect.INFRAME_INSERTION: "inframe insertion", + VariantEffect.INFRAME_DELETION: "inframe deletion", + VariantEffect.MISSENSE_VARIANT: "missense", + VariantEffect.PROTEIN_ALTERING_VARIANT: "protein altering", + VariantEffect.SPLICE_REGION_VARIANT: "splice region", + VariantEffect.SPLICE_DONOR_5TH_BASE_VARIANT: "splice donor 5th base", + VariantEffect.SPLICE_DONOR_REGION_VARIANT: "splice donor", + VariantEffect.SPLICE_POLYPYRIMIDINE_TRACT_VARIANT: "splice polypyrimidine", + VariantEffect.INCOMPLETE_TERMINAL_CODON_VARIANT: "incomplete terminal codon", + VariantEffect.START_RETAINED_VARIANT: "start retained", + VariantEffect.STOP_RETAINED_VARIANT: "stop retainined", + VariantEffect.SYNONYMOUS_VARIANT: "synonymous", + VariantEffect.CODING_SEQUENCE_VARIANT: "coding sequence", + VariantEffect.MATURE_MIRNA_VARIANT: "mature miRNA", + VariantEffect.FIVE_PRIME_UTR_VARIANT: "5UTR", + VariantEffect.THREE_PRIME_UTR_VARIANT: "3UTR", + VariantEffect.NON_CODING_TRANSCRIPT_EXON_VARIANT: "non-coding transcript exon", + VariantEffect.INTRON_VARIANT: "intronic", + VariantEffect.NMD_TRANSCRIPT_VARIANT: "NMD transcript", + VariantEffect.NON_CODING_TRANSCRIPT_VARIANT: "non-coding transcript", + VariantEffect.UPSTREAM_GENE_VARIANT: "upstream of gene", + VariantEffect.DOWNSTREAM_GENE_VARIANT: "downstream of gene", + VariantEffect.TFBS_ABLATION: "TFBS ablation", + VariantEffect.TFBS_AMPLIFICATION: "TFBS amplification", + VariantEffect.TF_BINDING_SITE_VARIANT: "TFBS binding site", + VariantEffect.REGULATORY_REGION_ABLATION: "regulatory region ablation", + VariantEffect.REGULATORY_REGION_AMPLIFICATION: "regulatory region amplification", + VariantEffect.FEATURE_ELONGATION: "feature elongation", + VariantEffect.REGULATORY_REGION_VARIANT: "regulatory region", + VariantEffect.FEATURE_TRUNCATION: "feature truncation", + VariantEffect.INTERGENIC_VARIANT: "intergenic", + VariantEffect.SEQUENCE_VARIANT: "sequence variant", +} + +so_id_to_display = { + "SO:1000029": "chromosomal deletion", + "SO:1000037": "chromosomal duplication", + "SO:1000030": "chromosomal_inversion", + "SO:1000044": "chromosomal_translocation", +} diff --git a/src/gpsea/preprocessing/_config.py b/src/gpsea/preprocessing/_config.py index 32d8b255c..7c7b3e100 100644 --- a/src/gpsea/preprocessing/_config.py +++ b/src/gpsea/preprocessing/_config.py @@ -455,7 +455,7 @@ def load_phenopackets( # Turn phenopackets into a cohort using the cohort creator. # Keep track of the progress by wrapping the list of phenopackets # with TQDM 😎 - cohort_iter = tqdm(phenopackets, desc="Patients Created", file=sys.stdout) + cohort_iter = tqdm(phenopackets, desc="Individuals Processed", file=sys.stdout, unit="individuals") notepad = cohort_creator.prepare_notepad("Phenopackets") cohort = cohort_creator.process(cohort_iter, notepad) diff --git a/src/gpsea/preprocessing/_uniprot.py b/src/gpsea/preprocessing/_uniprot.py index d1adb98b5..ec17ac0d0 100644 --- a/src/gpsea/preprocessing/_uniprot.py +++ b/src/gpsea/preprocessing/_uniprot.py @@ -10,10 +10,8 @@ class UniprotProteinMetadataService(ProteinMetadataService): """A class that creates ProteinMetadata objects from data found with the Uniprot REST API. - More info on the Uniprot REST API here - https://www.uniprot.org/help/programmatic_access - - Methods: - annotate(protein_id:str): Gets metadata and creates ProteinMetadata for given protein ID + More info on the Uniprot REST API are + in the `Programmatic access `_ section. """ def __init__( @@ -27,8 +25,8 @@ def __init__( @staticmethod def parse_uniprot_json( - payload: typing.Mapping[str, typing.Any], - protein_id: str, + payload: typing.Mapping[str, typing.Any], + protein_id: str, ) -> ProteinMetadata: """ Try to extract `ProteinMetadata` corresponding to `protein_id` from the Uniprot JSON `payload`. @@ -44,39 +42,52 @@ def parse_uniprot_json( results = payload['results'] if len(results) == 0: raise ValueError(f"No proteins found for ID {protein_id}. Please verify refseq ID.") + elif len(results) == 1: + # In case we get only one result, let's use it! + return UniprotProteinMetadataService._extract_metadata( + protein_id=protein_id, + data=results[0], + ) + else: + for protein in results: + if any(uni['id'] == protein_id for uni in protein['uniProtKBCrossReferences']): + return UniprotProteinMetadataService._extract_metadata( + protein_id=protein_id, + data=protein, + ) - for protein in results: - if any(uni['id'] == protein_id for uni in protein['uniProtKBCrossReferences']): - # `protein` has a cross-reference to the `protein_id` of interest - try: - protein_name = protein['proteinDescription']['recommendedName']['fullName']['value'] - except KeyError: - protein_name = protein['proteinDescription']['submissionNames'][0]['fullName']['value'] + raise ValueError(f'Could not find an entry for {protein_id} in Uniprot response') - all_features_list = [] - try: - for feature in protein['features']: - feature_start = int(feature['location']['start']['value']) - feature_end = int(feature['location']['end']['value']) - feature_name = feature['description'] - feature_info = FeatureInfo( - feature_name, - Region(start=feature_start, end=feature_end), - ) - feature_type = FeatureType[feature['type'].upper()] - protein_feature = ProteinFeature.create(feature_info, feature_type) - all_features_list.append(protein_feature) - except KeyError: - raise ValueError(f"No features for {protein_id}") + @staticmethod + def _extract_metadata(protein_id: str, data: typing.Mapping[str, typing.Any]) -> ProteinMetadata: + # `protein` has a cross-reference to the `protein_id` of interest + try: + protein_name = data['proteinDescription']['recommendedName']['fullName']['value'] + except KeyError: + protein_name = data['proteinDescription']['submissionNames'][0]['fullName']['value'] - try: - protein_length = protein["sequence"]["length"] - except KeyError as e: - raise ValueError(e) + all_features_list = [] + try: + for feature in data['features']: + feature_start = int(feature['location']['start']['value']) + feature_end = int(feature['location']['end']['value']) + feature_name = feature['description'] + feature_info = FeatureInfo( + feature_name, + Region(start=feature_start, end=feature_end), + ) + feature_type = FeatureType[feature['type'].upper()] + protein_feature = ProteinFeature.create(feature_info, feature_type) + all_features_list.append(protein_feature) + except KeyError: + raise ValueError(f"No features for {protein_id}") - return ProteinMetadata(protein_id, protein_name, all_features_list, protein_length) + try: + protein_length = data["sequence"]["length"] + except KeyError as e: + raise ValueError(e) - raise ValueError(f'Could not find an entry for {protein_id} in Uniprot response') + return ProteinMetadata(protein_id, protein_name, all_features_list, protein_length) def annotate(self, protein_id: str) -> ProteinMetadata: """ @@ -96,4 +107,5 @@ def annotate(self, protein_id: str) -> ProteinMetadata: raise ValueError(f"only works with a RefSeq database ID (e.g. NP_037407.4), but we got {protein_id}") api_url = self._url % protein_id response = requests.get(api_url, timeout=self._timeout).json() + return UniprotProteinMetadataService.parse_uniprot_json(response, protein_id) diff --git a/src/gpsea/view/__init__.py b/src/gpsea/view/__init__.py index 6eb13dd61..d6a6c22ec 100644 --- a/src/gpsea/view/__init__.py +++ b/src/gpsea/view/__init__.py @@ -1,5 +1,7 @@ from ._cohort import CohortViewable +from ._cohort_variant_viewer import CohortVariantViewer from ._disease import DiseaseViewable +from ._phenotype_analysis import summarize_hpo_analysis from ._protein_viewer import ProteinViewable from ._protein_visualizable import ProteinVisualizable from ._stats import MtcStatsViewer @@ -8,10 +10,12 @@ from ._formatter import VariantFormatter __all__ = [ + 'CohortVariantViewer', 'CohortViewable', 'ProteinVisualizer', 'ProteinVisualizable', 'ProteinViewable', 'DiseaseViewable', 'MtcStatsViewer', + 'summarize_hpo_analysis', 'VariantTranscriptVisualizer', 'VariantFormatter', ] diff --git a/src/gpsea/view/_cohort_variant_viewer.py b/src/gpsea/view/_cohort_variant_viewer.py new file mode 100644 index 000000000..0933ce02e --- /dev/null +++ b/src/gpsea/view/_cohort_variant_viewer.py @@ -0,0 +1,158 @@ +import typing + +from jinja2 import Environment, PackageLoader +from collections import namedtuple, defaultdict + +from gpsea.model import Cohort, Variant, VariantEffect +from ._formatter import VariantFormatter + + +ToDisplay = namedtuple('ToDisplay', ['hgvs_cdna', 'hgvsp', 'variant_effects']) + +VariantData = namedtuple('VariantData', ['variant_key', 'hgvs_cdna', 'hgvsp', 'variant_effects']) + + +class CohortVariantViewer: + """ + `CohortVariantViewer` creates an HTML report with the cohort variants. + + The report can be either written into an HTML file or displayed in a Jupyter notebook. + + See :ref:show-cohort-variants: for an example usage. + """ + + def __init__( + self, + tx_id: str + ): + """ + Args: + tx_id (str): The transcript identifier (Usually, the MANE RefSeq transcript, that should start with "NM_") + """ + environment = Environment(loader=PackageLoader('gpsea.view', 'templates')) + self._cohort_template = environment.get_template("all_variants.html") + self._var_formatter = VariantFormatter(tx_id) + if not tx_id.startswith("NM"): + print(f"[WARNING] Non-RefSeq transcript id: {tx_id}") + self._transcript_id = tx_id + + def process( + self, + cohort: Cohort, + only_hgvs: bool = True + ) -> str: + """ + Create an HTML that should be shown with ``display(HTML(..))`` of the ipython package. + + Args: + cohort (Cohort): The cohort being analyzed in the current notebook. + only_hgvs (bool): Do not show the transcript ID part of the HGVS annotation, just the annotation. + + Returns: + str: an HTML string with parameterized template for rendering + """ + context = self._prepare_context(cohort, only_hgvs=only_hgvs) + return self._cohort_template.render(context) + + def _prepare_context( + self, + cohort: Cohort, + only_hgvs: bool, + ) -> typing.Mapping[str, typing.Any]: + variant_count_dictionary = defaultdict(int) + variant_effect_count_dictionary = defaultdict(int) + variant_key_to_variant = dict() + for var in cohort.all_variants(): + var_key = var.variant_info.variant_key + vdata = self._get_variant_data(var, only_hgvs) + variant_key_to_variant[var_key] = vdata + variant_count_dictionary[var_key] += 1 + for v_eff in vdata.variant_effects: + variant_effect_count_dictionary[v_eff] += 1 + variant_counts = list() + variant_effect_counts = list() + for var_key, count in variant_count_dictionary.items(): + var_data = variant_key_to_variant[var_key] + variant_counts.append( + { + "variant_key": var_data.variant_key, + "variant": var_data.hgvs_cdna, + "variant_name": var_data.hgvs_cdna, + "protein_name": var_data.hgvsp, + "variant_effects": ", ".join(var_data.variant_effects), + "count": count, + } + ) + for v_effect, count in variant_effect_count_dictionary.items(): + variant_effect_counts.append( + { + "effect": v_effect, + "count": count + } + ) + variant_counts = sorted(variant_counts, key=lambda row: row.get("count"), reverse=True) + variant_effect_counts = sorted(variant_effect_counts, key=lambda row: row.get("count"), reverse=True) + + # The following dictionary is used by the Jinja2 HTML template + return { + "has_transcript": False, + "variant_count_list": variant_counts, + "variant_effect_count_list": variant_effect_counts, + "total_unique_allele_count": len(variant_counts) + } + + def _get_variant_data( + self, + variant: Variant, + only_hgvs: bool + ) -> VariantData: + """ + Get user-friendly strings (e.g., HGVS for our target transcript) to match to the chromosomal strings + Args: + variant (Variant): The variant to be formatted. + only_hgvs (bool): do not show the transcript ID part of the HGVS annotation, just the annotation. + + Returns: + VariantData: a named tuple with variant data formatted for human consumption. + """ + variant_key = variant.variant_info.variant_key + if variant.variant_info.has_sv_info(): + sv_info = variant.variant_info.sv_info + gene_symbol = sv_info.gene_symbol + display = f"SV involving {gene_symbol}" + effect = VariantEffect.structural_so_id_to_display(so_term=sv_info.structural_type) + return VariantData( + variant_key=variant_key, + hgvs_cdna=display, + hgvsp="p.?", + variant_effects=[effect], + ) + else: + variant_key = variant.variant_info.variant_key + display = self._var_formatter.format_as_string(variant) + tx_annotation = variant.get_tx_anno_by_tx_id(self._transcript_id) + if tx_annotation is not None: + hgvsp = tx_annotation.hgvsp + var_effects = [var_eff.to_display() for var_eff in tx_annotation.variant_effects] + else: + hgvsp = None + var_effects = [] + if only_hgvs: + # do not show the transcript id + fields_dna = display.split(":") + if len(fields_dna) > 1: + display_hgvs_cDNA = fields_dna[1] + else: + display_hgvs_cDNA = fields_dna[0] + + fields_ps = hgvsp.split(":") if hgvsp is not None else (None,) + if len(fields_ps) > 1: + hgvsp = fields_ps[1] + else: + hgvsp = fields_ps[0] + return VariantData( + variant_key=variant_key, + hgvs_cdna=display_hgvs_cDNA, + hgvsp=hgvsp, + variant_effects=var_effects, + ) diff --git a/src/gpsea/view/_phenotype_analysis.py b/src/gpsea/view/_phenotype_analysis.py new file mode 100644 index 000000000..256c3fb3e --- /dev/null +++ b/src/gpsea/view/_phenotype_analysis.py @@ -0,0 +1,76 @@ +import hpotk +import pandas as pd + +from gpsea.analysis.pcats import HpoTermAnalysisResult + + +def summarize_hpo_analysis( + hpo: hpotk.MinimalOntology, + result: HpoTermAnalysisResult, +) -> pd.DataFrame: + """ + Create a dataframe with counts, frequencies, and p values for the tested HPO terms. + + The HPO terms that were not tested will *not* be included in the frame. + + :param hpo: HPO data. + :param result: the HPO term analysis results to show. + """ + assert isinstance(result, HpoTermAnalysisResult) + + # Row index: a list of tested HPO terms + pheno_idx = pd.Index(result.phenotypes) + # Column index: multiindex of counts and percentages for all genotype predicate groups + gt_idx = pd.MultiIndex.from_product( + iterables=(result.gt_predicate.get_categories(), ("Count", "Percent")), + names=(result.gt_predicate.get_question_base(), None), + ) + + # We'll fill this frame with data + df = pd.DataFrame(index=pheno_idx, columns=gt_idx) + + for ph_predicate, count in zip(result.pheno_predicates, result.all_counts): + # Sum across the phenotype categories (collapse the rows). + gt_totals = count.sum() + + for gt_cat in count.columns: + cnt = count.loc[ph_predicate.present_phenotype_category, gt_cat] + total = gt_totals[gt_cat] + df.loc[ph_predicate.phenotype, (gt_cat, "Count")] = f"{cnt}/{total}" + pct = 0 if total == 0 else round(cnt * 100 / total) + df.loc[ph_predicate.phenotype, (gt_cat, "Percent")] = f"{pct}%" + + # Add columns with p values and corrected p values (if present) + p_val_col_name = "p values" + corrected_p_val_col_name = "Corrected p values" + if result.corrected_pvals is not None: + df.insert(df.shape[1], ("", corrected_p_val_col_name), result.corrected_pvals) + df.insert(df.shape[1], ("", p_val_col_name), result.pvals) + + # Format the index values: `HP:0001250` -> `Seizure [HP:0001250]` if the index members are HPO terms + # or just use the term ID CURIE otherwise (e.g. `OMIM:123000`). + labeled_idx = df.index.map(lambda term_id: format_term_id(hpo, term_id)) + + # Last, sort by corrected p value or just p value + df = df.set_index(labeled_idx) + # and only report the tested HPO terms + with_p_value = df[("", p_val_col_name)].notna() + if result.corrected_pvals is not None: + return df.sort_values(by=[("", corrected_p_val_col_name), ("", p_val_col_name)]).loc[with_p_value] + else: + return df.sort_values(by=("", p_val_col_name)).loc[with_p_value] + + +def format_term_id( + hpo: hpotk.MinimalOntology, + term_id: hpotk.TermId, +) -> str: + """ + Format a `term_id` as a `str`. HPO term ID is formatted as ` []` whereas other term IDs + are formatted as CURIEs (e.g. `OMIM:123000`). + """ + if term_id.prefix == "HP": + term_name = hpo.get_term_name(term_id) + return f"{term_name} [{term_id.value}]" + else: + return term_id.value diff --git a/src/gpsea/view/_stats.py b/src/gpsea/view/_stats.py index b232ad151..5d9e1ab18 100644 --- a/src/gpsea/view/_stats.py +++ b/src/gpsea/view/_stats.py @@ -43,12 +43,19 @@ def _prepare_context( counts = Counter() for result in report.mtc_filter_results: if result.is_filtered_out(): - counts[result.reason] += 1 + counts[result.mtc_issue] += 1 n_skipped = 0 - reason_to_count = list() - for reason, count in sorted(counts.items(), key=lambda item: item[1], reverse=True): - reason_to_count.append({"reason": reason, "count": count}) + issue_to_count = list() + for mtc_issue, count in sorted( + counts.items(), + key=lambda issue2count: (issue2count[0].code, issue2count[0].reason) + ): + issue_to_count.append({ + "code": mtc_issue.code, + "reason": mtc_issue.reason, + "count": count, + }) n_skipped += count n_all = len(report.phenotypes) @@ -61,5 +68,5 @@ def _prepare_context( "skipped_hpo_count": n_skipped, "tested_hpo_count": n_tested, "total_hpo_count": n_all, - "reason_to_count": reason_to_count, + "issue_to_count": issue_to_count, } diff --git a/src/gpsea/view/templates/all_variants.html b/src/gpsea/view/templates/all_variants.html new file mode 100644 index 000000000..9640bd1af --- /dev/null +++ b/src/gpsea/view/templates/all_variants.html @@ -0,0 +1,117 @@ + + + + + Cohort + + + + +

GPSEA cohort analysis: All variant alleles

+ + +
Using HPO MTC filter, 243 term(s) were omitted from statistical analysis.Using HPO MTC filter, 244 term(s) were omitted from statistical analysis.
Code
TODOSkipping term with only 2 observations (not powered for 2x2)51
TODOSkipping term because all genotypes have same HPO observed proportions50
TODOSkipping general term44
TODOSkipping term with only 3 observations (not powered for 2x2)27
TODOSkipping term with only 6 observations (not powered for 2x2)19HMF01Skipping term with maximum frequency that was less than threshold 0.210
TODOSkipping term with only 4 observations (not powered for 2x2)16HMF02Skipping term because no genotype has more than one observed HPO count4
TODOSkipping term with only 5 observations (not powered for 2x2)10HMF03Skipping term because of a child term with the same individual counts1
TODOSkipping term with maximum frequency that was less than threshold 0.210HMF04Skipping term because all genotypes have same HPO observed proportions50
TODOSkipping term with only 1 observations (not powered for 2x2)7HMF05Skipping term because one genotype had zero observations5
TODOSkipping term because one genotype had zero observations5HMF06Skipping term with less than 7 observations (not powered for 2x2)130
TODOSkipping term because no genotype has more than one observed HPO count4HMF08Skipping general term44
+ + + + + + + + + + + + {% for vdata in variant_count_list %} + + + + + + + + {% endfor %} + +
+

Variant alleles

+ A total of {{ total_unique_allele_count }} unique alleles were identified in the cohort. +
Variant keyVariant (cDNA)Variant (protein)EffectsCount
{{ vdata.variant_key }}{{ vdata.variant_name }}{{ vdata.protein_name }}{{ vdata.variant_effects }}{{ vdata.count }}
+ + + + + diff --git a/src/gpsea/view/templates/stats.html b/src/gpsea/view/templates/stats.html index 2a75c055a..3fd48ed5e 100644 --- a/src/gpsea/view/templates/stats.html +++ b/src/gpsea/view/templates/stats.html @@ -57,10 +57,9 @@

Phenotype testing report

Reason Count - {% for skipped in reason_to_count %} + {% for skipped in issue_to_count %} - - TODO + {{ skipped.code }} {{ skipped.reason }} {{ skipped.count }} diff --git a/tests/analysis/pcats/test_hpo_term_analysis.py b/tests/analysis/pcats/test_hpo_term_analysis.py index dbd8d508f..9d18f87a1 100644 --- a/tests/analysis/pcats/test_hpo_term_analysis.py +++ b/tests/analysis/pcats/test_hpo_term_analysis.py @@ -57,25 +57,25 @@ def test_compare_genotype_vs_phenotypes( assert results is not None assert results.total_tests == 4 - assert results.n_usable == (35, 18, 13, 25, 23) + assert results.n_usable == (17, 7, 5, 13, 12) assert results.pvals == pytest.approx( [ - 0.0721291631224236, - 1.0, + 0.35294117647058815, + 0.48571428571428565, float("nan"), - 0.35921595820909313, - 0.6668461434917216, + 0.1048951048951049, + 1., ], nan_ok=True, ) assert results.corrected_pvals is not None assert results.corrected_pvals == pytest.approx( [ - 0.2885166524896944, - 1.0, + 0.6476190476190475, + 0.6476190476190475, float("nan"), - 0.7184319164181863, - 0.8891281913222954, + 0.4195804195804196, + 1.0, ], nan_ok=True, ) diff --git a/tests/analysis/predicate/genotype/conftest.py b/tests/analysis/predicate/genotype/conftest.py index 7138e93ff..037e37121 100644 --- a/tests/analysis/predicate/genotype/conftest.py +++ b/tests/analysis/predicate/genotype/conftest.py @@ -220,7 +220,7 @@ def patient_w_frameshift( @pytest.fixture(scope="package") -def genesis_mutation( +def genesis_missense_mutation( genome_build: GenomeBuild, adam_label: SampleLabels, eve_label: SampleLabels, @@ -268,6 +268,54 @@ def genesis_mutation( ) +@pytest.fixture(scope="package") +def genesis_synonymous_mutation( + genome_build: GenomeBuild, + adam_label: SampleLabels, + eve_label: SampleLabels, + cain_label: SampleLabels, +) -> Variant: + chr22 = genome_build.contig_by_name("chr22") + assert chr22 is not None + return Variant( + variant_info=VariantInfo( + variant_coordinates=VariantCoordinates( + region=GenomicRegion( + contig=chr22, + start=200, + end=201, + strand=Strand.POSITIVE, + ), + ref="T", + alt="G", + change_length=0, + ) + ), + tx_annotations=( + TranscriptAnnotation( + gene_id="a_gene", + tx_id="tx:xyz", + hgvs_cdna=None, + is_preferred=True, + variant_effects=( + VariantEffect.SYNONYMOUS_VARIANT, + ), + affected_exons=(5,), + protein_id="pt:xyz", + hgvsp=None, + protein_effect_coordinates=Region(80, 81), + ), + ), + genotypes=Genotypes.from_mapping( + { + adam_label: Genotype.HETEROZYGOUS, + eve_label: Genotype.HOMOZYGOUS_REFERENCE, + cain_label: Genotype.HOMOZYGOUS_REFERENCE, + } + ), + ) + + @pytest.fixture(scope="package") def adam_label() -> SampleLabels: return SampleLabels("Adam") @@ -276,14 +324,18 @@ def adam_label() -> SampleLabels: @pytest.fixture(scope="package") def adam( adam_label: SampleLabels, - genesis_mutation: Variant, + genesis_missense_mutation: Variant, + genesis_synonymous_mutation: Variant, ) -> Patient: return Patient( adam_label, sex=Sex.MALE, phenotypes=(), diseases=(), - variants=(genesis_mutation,), + variants=( + genesis_missense_mutation, + genesis_synonymous_mutation, + ), ) @@ -295,14 +347,18 @@ def eve_label() -> SampleLabels: @pytest.fixture(scope="package") def eve( eve_label: SampleLabels, - genesis_mutation: Variant, + genesis_missense_mutation: Variant, + genesis_synonymous_mutation: Variant, ) -> Patient: return Patient( eve_label, sex=Sex.FEMALE, phenotypes=(), diseases=(), - variants=(genesis_mutation,), + variants=( + genesis_missense_mutation, + genesis_synonymous_mutation, + ), ) @@ -314,14 +370,18 @@ def cain_label() -> SampleLabels: @pytest.fixture(scope="package") def cain( cain_label: SampleLabels, - genesis_mutation: Variant, + genesis_missense_mutation: Variant, + genesis_synonymous_mutation: Variant, ) -> Patient: return Patient( cain_label, sex=Sex.MALE, phenotypes=(), diseases=(), - variants=(genesis_mutation,), + variants=( + genesis_missense_mutation, + genesis_synonymous_mutation, + ), ) @@ -336,7 +396,7 @@ def cain( @pytest.fixture(scope="package") -def white_mutation( +def white_missense_mutation( genome_build: GenomeBuild, walt_label: SampleLabels, skyler_label: SampleLabels, @@ -364,7 +424,7 @@ def white_mutation( gene_id="a_gene", tx_id="tx:xyz", hgvs_cdna=None, - is_preferred=False, + is_preferred=True, variant_effects=( VariantEffect.MISSENSE_VARIANT, VariantEffect.SPLICE_DONOR_VARIANT, @@ -386,6 +446,56 @@ def white_mutation( ) +@pytest.fixture(scope="package") +def white_synonymous_mutation( + genome_build: GenomeBuild, + walt_label: SampleLabels, + skyler_label: SampleLabels, + flynn_label: SampleLabels, + holly_label: SampleLabels, +) -> Variant: + chr22 = genome_build.contig_by_name("chr22") + assert chr22 is not None + return Variant( + variant_info=VariantInfo( + variant_coordinates=VariantCoordinates( + region=GenomicRegion( + contig=chr22, + start=200, + end=201, + strand=Strand.POSITIVE, + ), + ref="T", + alt="G", + change_length=0, + ) + ), + tx_annotations=( + TranscriptAnnotation( + gene_id="a_gene", + tx_id="tx:xyz", + hgvs_cdna=None, + is_preferred=True, + variant_effects=( + VariantEffect.SYNONYMOUS_VARIANT, + ), + affected_exons=(5,), + protein_id="pt:xyz", + hgvsp=None, + protein_effect_coordinates=Region(80, 81), + ), + ), + genotypes=Genotypes.from_mapping( + { + walt_label: Genotype.HETEROZYGOUS, + skyler_label: Genotype.HETEROZYGOUS, + flynn_label: Genotype.HOMOZYGOUS_REFERENCE, + holly_label: Genotype.HOMOZYGOUS_ALTERNATE, + } + ), + ) + + @pytest.fixture(scope="package") def walt_label() -> SampleLabels: return SampleLabels("Walt") @@ -394,14 +504,18 @@ def walt_label() -> SampleLabels: @pytest.fixture(scope="package") def walt( walt_label: SampleLabels, - white_mutation: Variant, + white_missense_mutation: Variant, + white_synonymous_mutation: Variant, ) -> Patient: return Patient( walt_label, sex=Sex.MALE, phenotypes=(), diseases=(), - variants=(white_mutation,), + variants=( + white_missense_mutation, + white_synonymous_mutation, + ), ) @@ -413,14 +527,18 @@ def skyler_label() -> SampleLabels: @pytest.fixture(scope="package") def skyler( skyler_label: SampleLabels, - white_mutation: Variant, + white_missense_mutation: Variant, + white_synonymous_mutation: Variant, ) -> Patient: return Patient( skyler_label, sex=Sex.FEMALE, phenotypes=(), diseases=(), - variants=(white_mutation,), + variants=( + white_missense_mutation, + white_synonymous_mutation, + ), ) @@ -432,14 +550,18 @@ def flynn_label() -> SampleLabels: @pytest.fixture(scope="package") def flynn( flynn_label: SampleLabels, - white_mutation: Variant, + white_missense_mutation: Variant, + white_synonymous_mutation: Variant, ) -> Patient: return Patient( flynn_label, sex=Sex.MALE, phenotypes=(), diseases=(), - variants=(white_mutation,), + variants=( + white_missense_mutation, + white_synonymous_mutation, + ), ) @@ -451,14 +573,18 @@ def holly_label() -> SampleLabels: @pytest.fixture(scope="package") def holly( holly_label: SampleLabels, - white_mutation: Variant, + white_missense_mutation: Variant, + white_synonymous_mutation: Variant, ) -> Patient: return Patient( holly_label, sex=Sex.FEMALE, phenotypes=(), diseases=(), - variants=(white_mutation,), + variants=( + white_missense_mutation, + white_synonymous_mutation, + ), ) diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py index 61ff29c30..425c63f50 100644 --- a/tests/analysis/predicate/genotype/test_gt_predicates.py +++ b/tests/analysis/predicate/genotype/test_gt_predicates.py @@ -1,16 +1,16 @@ -import typing - import pytest from gpsea.model import * from gpsea.analysis.predicate.genotype import ( GenotypePolyPredicate, groups_predicate, - filtering_predicate, sex_predicate, + monoallelic_predicate, + biallelic_predicate, + autosomal_dominant, + autosomal_recessive, VariantPredicates, VariantPredicate, - ModeOfInheritancePredicate, ) @@ -103,7 +103,7 @@ def test_autosomal_dominant( request: pytest.FixtureRequest, ): patient = request.getfixturevalue(patient_name) - predicate = ModeOfInheritancePredicate.autosomal_dominant(variant_predicate) + predicate = autosomal_dominant(variant_predicate) categorization = predicate.test(patient) @@ -114,21 +114,19 @@ def test_autosomal_dominant( @pytest.mark.parametrize( "patient_name,name", [ - ("walt", "HET"), - ("skyler", "HET"), - ("flynn", "BIALLELIC_ALT"), - ("holly", "HOM_REF"), + ("adam", "HET"), # 0/0 & 0/1 + ("eve", "HET"), # 0/1 & 0/0 + ("cain", "HET"), # 0/1 & 0/0 ], ) - def test_autosomal_recessive( + def test_autosomal_dominant__with_default_predicate( self, patient_name: str, name: str, - variant_predicate: VariantPredicate, request: pytest.FixtureRequest, ): patient = request.getfixturevalue(patient_name) - predicate = ModeOfInheritancePredicate.autosomal_recessive(variant_predicate) + predicate = autosomal_dominant() categorization = predicate.test(patient) @@ -139,12 +137,13 @@ def test_autosomal_recessive( @pytest.mark.parametrize( "patient_name,name", [ - ("adam", "HOM_REF"), - ("eve", "HET"), - ("cain", "HET"), + ("walt", "HET"), + ("skyler", "HET"), + ("flynn", "BIALLELIC_ALT"), + ("holly", "HOM_REF"), ], ) - def test_x_dominant( + def test_autosomal_recessive( self, patient_name: str, name: str, @@ -152,7 +151,7 @@ def test_x_dominant( request: pytest.FixtureRequest, ): patient = request.getfixturevalue(patient_name) - predicate = ModeOfInheritancePredicate.x_dominant(variant_predicate) + predicate = autosomal_recessive(variant_predicate) categorization = predicate.test(patient) @@ -163,21 +162,21 @@ def test_x_dominant( @pytest.mark.parametrize( "patient_name,name", [ - ("anakin", "HOM_REF"), - ("padme", "HET"), - ("luke", "HEMI"), - ("leia", "HET"), + # The White family has two variants: + ("walt", "BIALLELIC_ALT"), # 0/1 & 0/1 + ("skyler", "BIALLELIC_ALT"), # 0/1 & 0/1 + ("flynn", "BIALLELIC_ALT"), # 1/1 & 0/0 + ("holly", "BIALLELIC_ALT"), # 0/0 & 1/1 ], ) - def test_x_recessive( + def test_autosomal_recessive__with_default_predicate( self, patient_name: str, name: str, - variant_predicate: VariantPredicate, request: pytest.FixtureRequest, ): patient = request.getfixturevalue(patient_name) - predicate = ModeOfInheritancePredicate.x_recessive(variant_predicate) + predicate = autosomal_recessive() categorization = predicate.test(patient) @@ -186,84 +185,76 @@ def test_x_recessive( assert categorization.category.name == name -class TestFilteringPredicate: - - @pytest.fixture(scope="class") - def x_recessive_gt_predicate(self) -> GenotypePolyPredicate: - affects_suox = VariantPredicates.gene("SUOX") - return ModeOfInheritancePredicate.x_recessive( - variant_predicate=affects_suox, - ) +class TestAllelePredicates: @pytest.mark.parametrize( - "indices, expected", + "individual_name,expected_name", [ - ((0, 1), 2), - ((1, 0), 2), - ((1, 2), 2), + ("adam", "B"), # 0/0 & 0/1 + ("eve", "A"), # 0/1 & 0/0 + ("cain", "A"), # 0/1 & 0/0 ], ) - def test_filtering_predicate( + def test_monoallelic_predicate_ad_family( self, - indices: typing.Sequence[int], - expected: int, - x_recessive_gt_predicate: GenotypePolyPredicate, + individual_name: str, + expected_name: str, + request: pytest.FixtureRequest, ): - cats = x_recessive_gt_predicate.get_categorizations() - targets = [cats[i] for i in indices] - predicate = filtering_predicate( - predicate=x_recessive_gt_predicate, - targets=targets, - ) + is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) + is_synonymous = VariantPredicates.variant_effect(VariantEffect.SYNONYMOUS_VARIANT, TX_ID) + gt_predicate = monoallelic_predicate(is_missense, is_synonymous) + individual = request.getfixturevalue(individual_name) - actual = len(predicate.get_categorizations()) + actual_cat = gt_predicate.test(individual) - assert actual == expected + assert actual_cat is not None + assert actual_cat.category.name == expected_name - def test_filtering_predicate__explodes_when_not_subsetting( + def test_monoallelic_predicate__general_stuff( self, - x_recessive_gt_predicate: GenotypePolyPredicate, ): - with pytest.raises(ValueError) as ve: - filtering_predicate( - predicate=x_recessive_gt_predicate, - targets=x_recessive_gt_predicate.get_categorizations(), - ) - - assert ( - ve.value.args[0] - == "It makes no sense to subset the a predicate with 4 categorizations with the same number (4) of targets" - ) + is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) + is_synonymous = VariantPredicates.variant_effect(VariantEffect.SYNONYMOUS_VARIANT, TX_ID) + + gt_predicate = monoallelic_predicate(is_missense, is_synonymous) + + assert gt_predicate.display_question() == 'Allele group: A, B' - def test_filtering_predicate__explodes_when_using_random_junk( + @pytest.mark.parametrize( + "individual_name,expected_name", + [ + ("walt", "A/B"), # 0/1 & 0/1 + ("skyler", "A/B"), # 0/1 & 0/1 + ("flynn", "A/A"), # 1/1 & 0/0 + ("holly", "B/B"), # 0/0 & 1/1 + ], + ) + def test_biallelic_predicate( self, - x_recessive_gt_predicate: GenotypePolyPredicate, + individual_name: str, + expected_name: str, + request: pytest.FixtureRequest, ): - with pytest.raises(ValueError) as ve: - filtering_predicate( - predicate=x_recessive_gt_predicate, - targets=(0, 1), - ) - - assert ( - ve.value.args[0] - == "The targets at following indices are not categorizations: [0, 1]" - ) + is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) + is_synonymous = VariantPredicates.variant_effect(VariantEffect.SYNONYMOUS_VARIANT, TX_ID) + gt_predicate = biallelic_predicate(is_missense, is_synonymous) + individual = request.getfixturevalue(individual_name) + + actual_cat = gt_predicate.test(individual) - def test_filtering_predicate__explodes_when_using_one_category( + assert actual_cat is not None + assert actual_cat.category.name == expected_name + + def test_biallelic_predicate__general_stuff( self, - x_recessive_gt_predicate: GenotypePolyPredicate, ): - with pytest.raises(ValueError) as ve: - filtering_predicate( - predicate=x_recessive_gt_predicate, - targets=(x_recessive_gt_predicate.get_categorizations()[0],), - ) - - assert ( - ve.value.args[0] - == "At least 2 target categorizations must be provided but got 1" - ) + is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, TX_ID) + is_synonymous = VariantPredicates.variant_effect(VariantEffect.SYNONYMOUS_VARIANT, TX_ID) + + gt_predicate = biallelic_predicate(is_missense, is_synonymous) + + assert gt_predicate.display_question() == 'Allele group: A/A, A/B, B/B' class TestSexPredicate: diff --git a/tests/analysis/predicate/genotype/test_predicates.py b/tests/analysis/predicate/genotype/test_predicates.py index a076adacf..e85e3a3fb 100644 --- a/tests/analysis/predicate/genotype/test_predicates.py +++ b/tests/analysis/predicate/genotype/test_predicates.py @@ -1,12 +1,19 @@ import pytest -from gpsea.analysis.predicate.genotype import VariantPredicates, ProteinPredicates, VariantPredicate +from gpsea.analysis.predicate.genotype import VariantPredicates, ProteinPredicates from gpsea.model import * from gpsea.model.genome import * class TestVariantPredicates: + def test_always_true_predicate( + self, + suox_cohort: Cohort, + ): + predicate = VariantPredicates.true() + assert all(predicate.test(v) for v in suox_cohort.all_variants()) + @pytest.mark.parametrize( 'effect, expected', [ @@ -46,7 +53,7 @@ def test_variant_key_predicate( @pytest.mark.parametrize( 'exon, expected', [ - (0, False), + (1, False), (4, True), (5, False), ] @@ -61,6 +68,11 @@ def test_exon_predicate( assert predicate.test(missense_variant) == expected + def test_exon_predicate_fails_on_invalid_exon(self): + with pytest.raises(AssertionError) as e: + VariantPredicates.exon(0, tx_id='tx:xyz') + assert e.value.args[0] == '`exon` must be a positive `int`' + @pytest.mark.parametrize( 'tx_id, expected', [ diff --git a/tests/analysis/predicate/test_phenotype.py b/tests/analysis/predicate/test_phenotype.py new file mode 100644 index 000000000..609417a90 --- /dev/null +++ b/tests/analysis/predicate/test_phenotype.py @@ -0,0 +1,31 @@ +import hpotk + +from gpsea.model import Cohort + +from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate +from gpsea.analysis.predicate.phenotype import prepare_hpo_terms_of_interest, prepare_predicates_for_terms_of_interest + + +def test_prepare_hpo_terms_of_interest( + suox_cohort: Cohort, + hpo: hpotk.MinimalOntology, +): + terms = prepare_hpo_terms_of_interest( + cohort=suox_cohort, + hpo=hpo, + ) + + assert len(terms) == 66 + + +def test_prepare_predicates_for_terms_of_interest( + suox_cohort: Cohort, + hpo: hpotk.MinimalOntology, +): + predicates = prepare_predicates_for_terms_of_interest( + cohort=suox_cohort, + hpo=hpo, + ) + + assert len(predicates) == 66 + assert all(isinstance(p, PhenotypePolyPredicate) for p in predicates) diff --git a/tests/analysis/test_mtc_filter.py b/tests/analysis/test_mtc_filter.py index d904cfe43..b0ab43deb 100644 --- a/tests/analysis/test_mtc_filter.py +++ b/tests/analysis/test_mtc_filter.py @@ -1,4 +1,3 @@ -from itertools import count import typing import hpotk @@ -8,7 +7,7 @@ from gpsea.analysis.mtc_filter import HpoMtcFilter, SpecifiedTermsMtcFilter from gpsea.analysis.predicate.genotype import GenotypePolyPredicate -from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate, PropagatingPhenotypePredicate +from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate, HpoPredicate from gpsea.analysis.pcats import apply_predicates_on_patients from gpsea.model import Cohort @@ -55,7 +54,7 @@ def ph_predicate( For the purpose of testing counts, let's pretend the counts were created by this predicate. """ - return PropagatingPhenotypePredicate( + return HpoPredicate( hpo=hpo, query=hpotk.TermId.from_curie("HP:0001250"), # Seizure missing_implies_phenotype_excluded=False, @@ -68,8 +67,8 @@ def prepare_counts_df( ph_predicate: PhenotypePolyPredicate[hpotk.TermId], ): values = np.array(counts).reshape((2, 2)) - index = pd.Index(gt_predicate.get_categories()) - columns = pd.Index(ph_predicate.get_categories()) + index = pd.Index(ph_predicate.get_categories()) + columns = pd.Index(gt_predicate.get_categories()) return pd.DataFrame(data=values, index=index, columns=columns) @pytest.mark.parametrize( @@ -83,7 +82,7 @@ def prepare_counts_df( ) def test_one_genotype_has_zero_hpo_observations( self, - counts: typing.Tuple[int], + counts: typing.Sequence[int], expected: bool, gt_predicate: GenotypePolyPredicate, ph_predicate: PhenotypePolyPredicate[hpotk.TermId], @@ -201,7 +200,7 @@ def test_filter_terms_to_test( reasons = [r.reason for r in mtc_report] assert reasons == [ None, None, - 'Skipping term because all genotypes have same HPO observed proportions', + 'Skipping term with less than 7 observations (not powered for 2x2)', None, None, ] @@ -264,7 +263,7 @@ def test_min_observed_HPO_threshold( for i, p in enumerate(suox_pheno_predicates) } - # Ectopia lentis HP:0001083 (6 9 1 2), freqs are 6/15=0.4 and 1/3=0.33 + # Ectopia lentis HP:0001083 (1 2 3 1), freqs are 1/4=0.25 and 3/4=0.75 idx = curie2idx["HP:0001083"] ectopia = patient_counts[idx] ectopia_predicate = suox_pheno_predicates[idx] @@ -272,9 +271,9 @@ def test_min_observed_HPO_threshold( ectopia, ph_predicate=ectopia_predicate, ) - assert max_f == pytest.approx(0.4, abs=EPSILON) + assert max_f == pytest.approx(0.75, abs=EPSILON) - # Seizure HP:0001250 (17 7 11 0), freqs are 17/24=0.7083 and 11/11=1 + # Seizure HP:0001250 (11 5 0 1), freqs are 11/11=1.0 and 5/6=0.8333333 idx = curie2idx["HP:0001250"] seizure = patient_counts[idx] seizure_predicate = suox_pheno_predicates[idx] @@ -284,7 +283,7 @@ def test_min_observed_HPO_threshold( ) assert max_f == pytest.approx(1.0, abs=EPSILON) - # Sulfocysteinuria HP:0032350 (11 0 2 0), freqs are both 1 + # Sulfocysteinuria HP:0032350 (2 3 0 0), freqs are both 1 idx = curie2idx["HP:0032350"] sulfocysteinuria = patient_counts[idx] sulfocysteinuria_predicate = suox_pheno_predicates[idx] @@ -294,7 +293,7 @@ def test_min_observed_HPO_threshold( ) assert max_f == pytest.approx(1.0, abs=EPSILON) - # Neurodevelopmental delay HP:0012758 (4 13 4 4), freqs are 4/17 = 0.235 and 4/8=0.5 + # Neurodevelopmental delay HP:0012758 (4 0 4 5), freqs are 4/8 = 0.5 and 0/5=0.0 idx = curie2idx["HP:0012758"] ndelay = patient_counts[idx] ndelay_predicate = suox_pheno_predicates[idx] @@ -304,7 +303,7 @@ def test_min_observed_HPO_threshold( ) assert max_f == pytest.approx(0.5, abs=EPSILON) - # Hypertonia HP:0001276 (7 9 4 3) fresa are 7/16=0.4375 and 4/7=0.5714 + # Hypertonia HP:0001276 (4 2 3 3) freqs are 4/7=0.4375 and 2/5=0.5714 idx = curie2idx["HP:0001276"] hypertonia = patient_counts[idx] hypertonia_predicate = suox_pheno_predicates[idx] diff --git a/tests/conftest.py b/tests/conftest.py index b66b55e7d..900e7f45a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,11 +3,15 @@ import typing import hpotk +import numpy as np +import pandas as pd import pytest -from gpsea.analysis.predicate.genotype import GenotypePolyPredicate, VariantPredicates, boolean_predicate -from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate, PropagatingPhenotypePredicate -from gpsea.io import GpseaJSONEncoder, GpseaJSONDecoder +from gpsea.analysis.mtc_filter import PhenotypeMtcResult +from gpsea.analysis.pcats import HpoTermAnalysisResult +from gpsea.analysis.predicate.genotype import GenotypePolyPredicate, VariantPredicates, autosomal_dominant +from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate, HpoPredicate +from gpsea.io import GpseaJSONDecoder from gpsea.model import * from gpsea.model.genome import GRCh38, GenomicRegion, Region, Strand, GenomeBuild from ._protein_test_service import ProteinTestMetadataService @@ -32,6 +36,7 @@ def pytest_collection_modifyitems(config, items): if "online" in item.keywords: item.add_marker(skip_online) + @pytest.fixture(scope='session') def fpath_project_dir(fpath_test_dir: str) -> str: """ @@ -106,12 +111,17 @@ def suox_cohort( @pytest.fixture(scope='session') -def suox_gt_predicate() -> GenotypePolyPredicate: - # To bin the patients to a group with >1 MISSENSE variant or 0 MISSENSE variants. - suox_mane_tx_id = 'NM_001032386.2' - return boolean_predicate( +def suox_mane_tx_id() -> str: + return 'NM_001032386.2' + + +@pytest.fixture(scope='session') +def suox_gt_predicate( + suox_mane_tx_id: str, +) -> GenotypePolyPredicate: + return autosomal_dominant( variant_predicate=VariantPredicates.variant_effect( - effect=VariantEffect.MISSENSE_VARIANT, + effect=VariantEffect.MISSENSE_VARIANT, tx_id=suox_mane_tx_id ) ) @@ -128,29 +138,80 @@ def suox_pheno_predicates( Note, these are just a *SUBSET* of all phenotypes that can be tested for in the *SUOX* cohort. """ return ( - PropagatingPhenotypePredicate( + HpoPredicate( hpo=hpo, query=hpotk.TermId.from_curie('HP:0001250'), # Seizure ), - PropagatingPhenotypePredicate( + HpoPredicate( hpo=hpo, query=hpotk.TermId.from_curie('HP:0001083'), # Ectopia lentis ), - PropagatingPhenotypePredicate( + HpoPredicate( hpo=hpo, query=hpotk.TermId.from_curie('HP:0032350'), # Sulfocysteinuria ), - PropagatingPhenotypePredicate( + HpoPredicate( hpo=hpo, query=hpotk.TermId.from_curie('HP:0012758'), # Neurodevelopmental delay ), - PropagatingPhenotypePredicate( + HpoPredicate( hpo=hpo, query=hpotk.TermId.from_curie('HP:0001276'), # Hypertonia ), ) +@pytest.fixture +def hpo_result( + suox_pheno_predicates: typing.Sequence[PhenotypePolyPredicate[hpotk.TermId]], + suox_gt_predicate: GenotypePolyPredicate, +) -> HpoTermAnalysisResult: + return HpoTermAnalysisResult( + pheno_predicates=suox_pheno_predicates, + n_usable=(40, 20, 30, 10, 100), + all_counts=tuple( + make_count_df(count, suox_gt_predicate, ph_pred) + for count, ph_pred in zip( + [ + (10, 5, 15, 10), + (5, 2, 8, 5), + (10, 5, 5, 10), + (2, 3, 2, 3), + (10, 25, 35, 30), + ], + suox_pheno_predicates, + ) + ), + pvals=(0.01, 0.04, 0.3, 0.2, 0.7), + corrected_pvals=(0.04, 0.1, 0.7, 0.5, 1.0), + gt_predicate=suox_gt_predicate, + mtc_filter_name="MTC filter name", + mtc_filter_results=( + PhenotypeMtcResult.ok(), + PhenotypeMtcResult.ok(), + PhenotypeMtcResult.ok(), + PhenotypeMtcResult.ok(), + PhenotypeMtcResult.ok(), + ), + mtc_name="MTC name", + ) + + +def make_count_df( + counts: typing.Tuple[int, int, int, int], + gt_predicate: GenotypePolyPredicate, + ph_predicate: PhenotypePolyPredicate[hpotk.TermId], +) -> pd.DataFrame: + rows = tuple(ph_predicate.get_categories()) + cols = tuple(gt_predicate.get_categories()) + data = np.array(counts).reshape((len(rows), len(cols))) + return pd.DataFrame( + data=data, + index=pd.Index(rows), + columns=pd.Index(cols), + ) + + @pytest.fixture(scope='session') def fpath_suox_tx_coordinates(fpath_test_data_dir: str) -> str: suox_mane_tx_id = 'NM_001032386.2' diff --git a/tests/preprocessing/data/uniprot_response/ITPR1_HUMAN.json b/tests/preprocessing/data/uniprot_response/ITPR1_HUMAN.json new file mode 100644 index 000000000..4e5fdb0c0 --- /dev/null +++ b/tests/preprocessing/data/uniprot_response/ITPR1_HUMAN.json @@ -0,0 +1,446 @@ +{ + "results": [ + { + "entryType": "UniProtKB reviewed (Swiss-Prot)", + "primaryAccession": "Q14643", + "uniProtkbId": "ITPR1_HUMAN", + "proteinDescription": { + "recommendedName": { + "fullName": { + "evidences": [ + { + "evidenceCode": "ECO:0000305" + } + ], + "value": "Inositol 1,4,5-trisphosphate-gated calcium channel ITPR1" + } + }, + "alternativeNames": [ + { + "fullName": { + "value": "IP3 receptor isoform 1" + }, + "shortNames": [ + { + "value": "IP3R 1" + }, + { + "evidences": [ + { + "evidenceCode": "ECO:0000303", + "source": "PubMed", + "id": "7945203" + } + ], + "value": "InsP3R1" + } + ] + }, + { + "fullName": { + "evidences": [ + { + "evidenceCode": "ECO:0000303", + "source": "PubMed", + "id": "8648241" + } + ], + "value": "Inositol 1,4,5 trisphosphate receptor" + } + }, + { + "fullName": { + "evidences": [ + { + "evidenceCode": "ECO:0000303", + "source": "PubMed", + "id": "7500840" + } + ], + "value": "Inositol 1,4,5-trisphosphate receptor type 1" + } + }, + { + "fullName": { + "evidences": [ + { + "evidenceCode": "ECO:0000303", + "source": "PubMed", + "id": "7852357" + }, + { + "evidenceCode": "ECO:0000303", + "source": "PubMed", + "id": "7945203" + } + ], + "value": "Type 1 inositol 1,4,5-trisphosphate receptor" + }, + "shortNames": [ + { + "value": "Type 1 InsP3 receptor" + } + ] + } + ] + }, + "genes": [ + { + "geneName": { + "evidences": [ + { + "evidenceCode": "ECO:0000303", + "source": "PubMed", + "id": "7852357" + }, + { + "evidenceCode": "ECO:0000312", + "source": "HGNC", + "id": "HGNC:6180" + } + ], + "value": "ITPR1" + }, + "synonyms": [ + { + "evidences": [ + { + "evidenceCode": "ECO:0000303", + "source": "PubMed", + "id": "7945203" + } + ], + "value": "INSP3R1" + } + ] + } + ], + "features": [ + { + "type": "Domain", + "location": { + "start": { + "value": 112, + "modifier": "EXACT" + }, + "end": { + "value": 166, + "modifier": "EXACT" + } + }, + "description": "MIR 1", + "evidences": [ + { + "evidenceCode": "ECO:0000255", + "source": "PROSITE-ProRule", + "id": "PRU00131" + } + ] + }, + { + "type": "Domain", + "location": { + "start": { + "value": 173, + "modifier": "EXACT" + }, + "end": { + "value": 223, + "modifier": "EXACT" + } + }, + "description": "MIR 2", + "evidences": [ + { + "evidenceCode": "ECO:0000255", + "source": "PROSITE-ProRule", + "id": "PRU00131" + } + ] + }, + { + "type": "Domain", + "location": { + "start": { + "value": 231, + "modifier": "EXACT" + }, + "end": { + "value": 287, + "modifier": "EXACT" + } + }, + "description": "MIR 3", + "evidences": [ + { + "evidenceCode": "ECO:0000255", + "source": "PROSITE-ProRule", + "id": "PRU00131" + } + ] + }, + { + "type": "Domain", + "location": { + "start": { + "value": 294, + "modifier": "EXACT" + }, + "end": { + "value": 373, + "modifier": "EXACT" + } + }, + "description": "MIR 4", + "evidences": [ + { + "evidenceCode": "ECO:0000255", + "source": "PROSITE-ProRule", + "id": "PRU00131" + } + ] + }, + { + "type": "Domain", + "location": { + "start": { + "value": 379, + "modifier": "EXACT" + }, + "end": { + "value": 435, + "modifier": "EXACT" + } + }, + "description": "MIR 5", + "evidences": [ + { + "evidenceCode": "ECO:0000255", + "source": "PROSITE-ProRule", + "id": "PRU00131" + } + ] + }, + { + "type": "Region", + "location": { + "start": { + "value": 1015, + "modifier": "EXACT" + }, + "end": { + "value": 1036, + "modifier": "EXACT" + } + }, + "description": "Disordered", + "evidences": [ + { + "evidenceCode": "ECO:0000256", + "source": "SAM", + "id": "MobiDB-lite" + } + ] + }, + { + "type": "Region", + "location": { + "start": { + "value": 1146, + "modifier": "EXACT" + }, + "end": { + "value": 1178, + "modifier": "EXACT" + } + }, + "description": "Disordered", + "evidences": [ + { + "evidenceCode": "ECO:0000256", + "source": "SAM", + "id": "MobiDB-lite" + } + ] + }, + { + "type": "Region", + "location": { + "start": { + "value": 1708, + "modifier": "EXACT" + }, + "end": { + "value": 1740, + "modifier": "EXACT" + } + }, + "description": "Disordered", + "evidences": [ + { + "evidenceCode": "ECO:0000256", + "source": "SAM", + "id": "MobiDB-lite" + } + ] + }, + { + "type": "Region", + "location": { + "start": { + "value": 1760, + "modifier": "EXACT" + }, + "end": { + "value": 1796, + "modifier": "EXACT" + } + }, + "description": "Disordered", + "evidences": [ + { + "evidenceCode": "ECO:0000256", + "source": "SAM", + "id": "MobiDB-lite" + } + ] + }, + { + "type": "Region", + "location": { + "start": { + "value": 1890, + "modifier": "EXACT" + }, + "end": { + "value": 1915, + "modifier": "EXACT" + } + }, + "description": "Disordered", + "evidences": [ + { + "evidenceCode": "ECO:0000256", + "source": "SAM", + "id": "MobiDB-lite" + } + ] + }, + { + "type": "Region", + "location": { + "start": { + "value": 1939, + "modifier": "EXACT" + }, + "end": { + "value": 1960, + "modifier": "EXACT" + } + }, + "description": "Disordered", + "evidences": [ + { + "evidenceCode": "ECO:0000256", + "source": "SAM", + "id": "MobiDB-lite" + } + ] + }, + { + "type": "Region", + "location": { + "start": { + "value": 2472, + "modifier": "EXACT" + }, + "end": { + "value": 2537, + "modifier": "EXACT" + } + }, + "description": "Interaction with ERP44", + "evidences": [ + { + "evidenceCode": "ECO:0000250", + "source": "UniProtKB", + "id": "P11881" + } + ] + }, + { + "type": "Region", + "location": { + "start": { + "value": 2729, + "modifier": "EXACT" + }, + "end": { + "value": 2758, + "modifier": "EXACT" + } + }, + "description": "Disordered", + "evidences": [ + { + "evidenceCode": "ECO:0000256", + "source": "SAM", + "id": "MobiDB-lite" + } + ] + } + ], + "uniProtKBCrossReferences": [ + { + "database": "RefSeq", + "id": "NP_001093422.2", + "properties": [ + { + "key": "NucleotideSequenceId", + "value": "NM_001099952.2" + } + ], + "isoformId": "Q14643-3" + }, + { + "database": "RefSeq", + "id": "NP_001161744.1", + "properties": [ + { + "key": "NucleotideSequenceId", + "value": "NM_001168272.1" + } + ], + "isoformId": "Q14643-2" + }, + { + "database": "RefSeq", + "id": "NP_002213.5", + "properties": [ + { + "key": "NucleotideSequenceId", + "value": "NM_002222.5" + } + ], + "isoformId": "Q14643-4" + }, + { + "database": "RefSeq", + "id": "XP_011531985.1", + "properties": [ + { + "key": "NucleotideSequenceId", + "value": "XM_011533683.2" + } + ] + } + ], + "sequence": { + "length": 2758 + }, + "extraAttributes": { + "uniParcId": "UPI00015E0851" + } + } + ] +} \ No newline at end of file diff --git a/tests/preprocessing/test_uniprot.py b/tests/preprocessing/test_uniprot.py index f87e478e0..461ca8631 100644 --- a/tests/preprocessing/test_uniprot.py +++ b/tests/preprocessing/test_uniprot.py @@ -15,6 +15,11 @@ def fpath_uniprot_response_dir( return os.path.join(fpath_preprocessing_data_dir, "uniprot_response") +def read_json_payload(path: str) -> typing.Mapping[str, typing.Any]: + with open(path) as fh: + return json.load(fh) + + class TestUniprotProteinMetadataService: @pytest.fixture @@ -27,8 +32,7 @@ def zn462_human_uniprot_json( fpath_uniprot_response_dir: str, ) -> typing.Mapping[str, typing.Any]: fpath_zn462_human = os.path.join(fpath_uniprot_response_dir, "ZN462_HUMAN.json") - with open(fpath_zn462_human) as f: - return json.load(f) + return read_json_payload(fpath_zn462_human) def test_zn462( self, @@ -44,6 +48,24 @@ def test_zn462( assert protein_metadata.label == "Ankyrin repeat domain-containing protein 11" assert len(protein_metadata.protein_features) == 16 assert protein_metadata.protein_length == 2663 + + def test_itpr1( + self, + fpath_uniprot_response_dir: str, + ): + response_json_path = os.path.join(fpath_uniprot_response_dir, 'ITPR1_HUMAN.json') + response_json = read_json_payload(response_json_path) + + protein_id = "NP_001365381.1" + protein_metadata = UniprotProteinMetadataService.parse_uniprot_json( + payload=response_json, + protein_id=protein_id, + ) + + assert protein_metadata.protein_id == protein_id + assert protein_metadata.label == "Inositol 1,4,5-trisphosphate-gated calcium channel ITPR1" + assert len(protein_metadata.protein_features) == 13 + assert protein_metadata.protein_length == 2758 @pytest.mark.skip("Run manually to regenerate SUOX `NP_001027558.1` metadata") def test_fetch_suox_protein_metadata( diff --git a/tests/test_predicates.py b/tests/test_predicates.py index c87ce0adc..71308b8e8 100644 --- a/tests/test_predicates.py +++ b/tests/test_predicates.py @@ -2,10 +2,9 @@ import pytest from gpsea.analysis.predicate import PatientCategory, PatientCategories -from gpsea.analysis.predicate.phenotype import PropagatingPhenotypePredicate, DiseasePresencePredicate +from gpsea.analysis.predicate.phenotype import HpoPredicate, DiseasePresencePredicate from gpsea.analysis.predicate.genotype import * -from gpsea.model import Cohort, Patient, FeatureType, VariantEffect -from gpsea.model.genome import Region +from gpsea.model import Cohort, Patient from gpsea.preprocessing import ProteinMetadataService @@ -16,7 +15,7 @@ def find_patient(pat_id: str, cohort: Cohort) -> Patient: raise ValueError(f'Could not find patient {pat_id}') -class TestPropagatingPhenotypeBooleanPredicate: +class TestHpoPredicate: @pytest.mark.parametrize('curie, patient_id, expected', # Patient "HetSingleVar" has Phenotypes: @@ -46,7 +45,7 @@ def test_phenotype_predicate__present_or_excluded( ): patient = find_patient(patient_id, toy_cohort) term_id = hpotk.TermId.from_curie(curie) - predicate = PropagatingPhenotypePredicate(hpo=hpo, query=term_id) + predicate = HpoPredicate(hpo=hpo, query=term_id) actual = predicate.test(patient) assert actual.phenotype == term_id @@ -60,7 +59,7 @@ def test_phenotype_predicate__unknown( # Not Measured and not Observed - 'HP:0006280', # Chronic pancreatitis patient = find_patient('HetSingleVar', toy_cohort) term_id = hpotk.TermId.from_curie('HP:0006280') - predicate = PropagatingPhenotypePredicate(hpo=hpo, query=term_id) + predicate = HpoPredicate(hpo=hpo, query=term_id) actual = predicate.test(patient) assert actual is None @@ -89,100 +88,7 @@ def test_disease_predicate( assert actual.category == patient_category -@pytest.mark.parametrize('patient_id, variant_effect, expected_result', - (['HetSingleVar', VariantEffect.FRAMESHIFT_VARIANT, PatientCategories.YES], - ['HetSingleVar', VariantEffect.MISSENSE_VARIANT, PatientCategories.NO], - ['HetDoubleVar1', VariantEffect.STOP_GAINED, PatientCategories.YES], - ['HomoVar', VariantEffect.FRAMESHIFT_VARIANT, PatientCategories.YES], - ['LargeCNV', VariantEffect.STOP_LOST, PatientCategories.YES], - ['LargeCNV', VariantEffect.FEATURE_TRUNCATION, PatientCategories.YES])) -def test_VariantEffectPredicate(patient_id: str, - variant_effect: VariantEffect, - expected_result: PatientCategory, - toy_cohort: Cohort): - patient = find_patient(patient_id, toy_cohort) - predicate = boolean_predicate(VariantPredicates.variant_effect(effect=variant_effect, tx_id='NM_013275.6')) - result = predicate.test(patient) - assert result.category == expected_result - - -@pytest.mark.parametrize('patient_id, variant, hasVarResult', - (['HetSingleVar', '16_89279850_89279850_G_GC', PatientCategories.YES], - # the `HetSingleVar` patient does NOT have the variant. - ['HetSingleVar', '16_89279708_89279725_AGTGTTCGGGGCGGGGCC_A', PatientCategories.NO], - # but `HetDoubleVar2` below DOES have the variant. - ['HetDoubleVar2', '16_89279708_89279725_AGTGTTCGGGGCGGGGCC_A', PatientCategories.YES], - ['HetDoubleVar1', '16_89284601_89284602_GG_A', PatientCategories.YES], - ['HetDoubleVar1', '16_89280752_89280752_G_T', PatientCategories.YES], - # the `HomoVar` patient does NOT have the variant - ['HomoVar', '16_89280752_89280752_G_T', PatientCategories.NO], - ['HomoVar', '16_89279458_89279459_TG_T', PatientCategories.YES], - ['LargeCNV', '16_89190071_89439815_DEL', PatientCategories.YES])) -def test_VariantKeyPredicate(patient_id, variant, hasVarResult, toy_cohort): - predicate = boolean_predicate(VariantPredicates.variant_key(key=variant)) - patient = find_patient(patient_id, toy_cohort) - result = predicate.test(patient) - assert result.category == hasVarResult - - -@pytest.mark.parametrize('patient_id, exon, hasVarResult', - (['HetSingleVar', 9, PatientCategories.YES], - ['HetSingleVar', 13, PatientCategories.NO], - ['HetDoubleVar1', 9, PatientCategories.YES], - ['HetDoubleVar2', 10, PatientCategories.YES], - ['HetDoubleVar2', 9, PatientCategories.YES], - ['HomoVar', 10, PatientCategories.NO], - ['HomoVar', 9, PatientCategories.YES], - ['LargeCNV', 1, PatientCategories.NO], - ['LargeCNV', 13, PatientCategories.YES])) -def test_ExonPredicate(patient_id, exon, hasVarResult, toy_cohort): - patient = find_patient(patient_id, toy_cohort) - predicate = VariantPredicates.exon(exon=exon, tx_id='NM_013275.6') - predicate = boolean_predicate(predicate) - result = predicate.test(patient) - assert result.category == hasVarResult - @pytest.fixture(scope='module') def protein_predicates(protein_test_service: ProteinMetadataService) -> ProteinPredicates: return ProteinPredicates(protein_metadata_service=protein_test_service) - - -@pytest.mark.parametrize('patient_id, feature_type, hasVarResult', - (['HetDoubleVar2', FeatureType.REGION, PatientCategories.YES], - ['HetDoubleVar2', FeatureType.REPEAT, PatientCategories.NO], - ['HetSingleVar', FeatureType.REGION, PatientCategories.YES], - ['HomoVar', FeatureType.REGION, PatientCategories.YES], - ['HetDoubleVar1', FeatureType.REPEAT, PatientCategories.NO])) -## TODO Why do CNV not show as affecting a feature? -##['LargeCNV', FeatureType.REGION , HETEROZYGOUS])) -def test_ProteinFeatureTypePredicate(patient_id, feature_type, hasVarResult, toy_cohort, protein_predicates): - patient = find_patient(patient_id, toy_cohort) - predicate = boolean_predicate(protein_predicates.protein_feature_type(feature_type=feature_type, tx_id='NM_013275.6')) - result = predicate.test(patient) - assert result.category == hasVarResult - - -@pytest.mark.parametrize('patient_id, feature, hasVarResult', - (['HetDoubleVar2', 'Disordered', PatientCategories.YES], - ['HetDoubleVar2', 'BadFeature', PatientCategories.NO], - ['HetSingleVar', 'Disordered', PatientCategories.YES], - ['HomoVar', 'Disordered', PatientCategories.YES], - ['HetDoubleVar1', 'Disordered', PatientCategories.YES])) -def test_ProteinFeaturePredicate(patient_id, feature, hasVarResult, toy_cohort, protein_predicates): - predicate = boolean_predicate(protein_predicates.protein_feature(feature_id=feature, tx_id='NM_013275.6')) - patient = find_patient(patient_id, toy_cohort) - result = predicate.test(patient) - assert result.category == hasVarResult - -@pytest.mark.parametrize('patient_id, region, hasVarResult', - (['HetDoubleVar2', Region(2000, 2500), PatientCategories.YES], - ['HetDoubleVar2', Region(1000, 1500), PatientCategories.NO], - ['HomoVar', Region(2000, 2500), PatientCategories.YES], - ['HetSingleVar', Region(2000, 2500), PatientCategories.YES], - ['HetDoubleVar1', Region(600, 650), PatientCategories.YES])) -def test_ProteinRegionPredicate(patient_id, region, hasVarResult, toy_cohort): - predicate = boolean_predicate(VariantPredicates.region(region=region, tx_id='NM_013275.6')) - patient = find_patient(patient_id, toy_cohort) - result = predicate.test(patient) - assert result.category == hasVarResult diff --git a/tests/view/test_hpo_phenotype_result_viewer.py b/tests/view/test_hpo_phenotype_result_viewer.py new file mode 100644 index 000000000..12e0df74f --- /dev/null +++ b/tests/view/test_hpo_phenotype_result_viewer.py @@ -0,0 +1,16 @@ +import hpotk +import pytest + +from gpsea.analysis.pcats import HpoTermAnalysisResult +from gpsea.view import summarize_hpo_analysis + + +@pytest.mark.skip("Only for manual control") +def test_summarize( + self, + hpo: hpotk.MinimalOntology, + hpo_result: HpoTermAnalysisResult, +): + df = summarize_hpo_analysis(hpo=hpo, result=hpo_result) + + print(df) diff --git a/tests/view/test_stats.py b/tests/view/test_stats.py index 691f4f6a4..f557a3e79 100644 --- a/tests/view/test_stats.py +++ b/tests/view/test_stats.py @@ -7,6 +7,7 @@ from gpsea.analysis.pcats import HpoTermAnalysisResult from gpsea.analysis.predicate import PatientCategories from gpsea.analysis.predicate.genotype import GenotypePolyPredicate +from gpsea.analysis.predicate.phenotype import HpoPredicate from gpsea.analysis.mtc_filter import PhenotypeMtcResult from gpsea.view import MtcStatsViewer @@ -16,12 +17,19 @@ class TestStatsViewable: @pytest.fixture(scope='class') def hpo_term_analysis_result( self, + hpo: hpotk.MinimalOntology, suox_gt_predicate: GenotypePolyPredicate, ) -> HpoTermAnalysisResult: return HpoTermAnalysisResult( - phenotypes=( - hpotk.TermId.from_curie('HP:0001166'), # Arachnodactyly - hpotk.TermId.from_curie('HP:0001250'), # Seizure + pheno_predicates=( + HpoPredicate( + hpo=hpo, + query=hpotk.TermId.from_curie('HP:0001166'), # Arachnodactyly + ), + HpoPredicate( + hpo=hpo, + query=hpotk.TermId.from_curie('HP:0001250'), # Seizure + ), ), n_usable=(40, 20), all_counts=( @@ -41,7 +49,7 @@ def hpo_term_analysis_result( gt_predicate=suox_gt_predicate, mtc_filter_name='Random MTC filter', mtc_filter_results=( - PhenotypeMtcResult.fail("Not too interesting"), + PhenotypeMtcResult.fail("RMF01", "Not too interesting"), PhenotypeMtcResult.ok(), ), mtc_name='fdr_bh', diff --git a/tests/view/test_variant_viewer.py b/tests/view/test_variant_viewer.py new file mode 100644 index 000000000..31fe12bbd --- /dev/null +++ b/tests/view/test_variant_viewer.py @@ -0,0 +1,16 @@ +import pytest + +from gpsea.model import Cohort +from gpsea.view import CohortVariantViewer + + +@pytest.mark.skip("For manual run only") +def test_viewer( + suox_mane_tx_id: str, + suox_cohort: Cohort, +): + viewer = CohortVariantViewer(tx_id=suox_mane_tx_id) + html = viewer.process(suox_cohort) + + with open("all_variants.html", "w") as fh: + fh.write(html)