convert to warning function

Sage-Bionetworks · Feb 1, 2025 · 7d95435 · 7d95435
1 parent 27208f8
commit 7d95435
Show file tree

Hide file tree

Showing 2 changed files with 64 additions and 115 deletions.
diff --git a/geniesp/bpc_redcap_export_mapping.py b/geniesp/bpc_redcap_export_mapping.py
@@ -46,13 +46,6 @@
     "data_CNA.txt",
 ]
 
-ONCOTREE_CODE_TO_COHORT_MAP = {
-    "RCC": "RENAL",
-    "OVARY": "OVARIAN",
-    "MEL": "MELANOMA",
-    "EGC": "ESOPHAGO"
-}
-
 def get_file_data(
     syn: Synapse, mappingdf: pd.DataFrame, sampletype: str, cohort: str = "NSCLC"
 ) -> dict:
@@ -567,19 +560,26 @@ def _convert_to_int(value):
         return float('nan')
 
 
-def map_oncotree_codes_to_cohort_name(oncotree_dict : dict) -> dict:
-    """Maps oncotree codes for certain codes
-
+def check_oncotree_codes(
+    df: pd.DataFrame, oncotree_dict: Dict[str, Dict[str, str]]
+) -> None:
+    """Log a warning listing ONCOTREE_CODE values absent from oncotree_dict.
+
+    Codes are compared upper-cased, as in the oncotree_dict[code.upper()] lookups.
+
     Args:
-        oncotree_dict (dict): oncotree code mappings
-
-    Returns:
-        dict: remapped oncotree codes
+        df (pd.DataFrame): input data with an ONCOTREE_CODE column
+        oncotree_dict (Dict[str, Dict[str, str]]): official oncotree codes
     """
-    remapped_oncotree_dict = {
-        ONCOTREE_CODE_TO_COHORT_MAP.get(code.upper(), code): val for code, val in oncotree_dict.items()
-    }
-    return remapped_oncotree_dict
+    codes_in_df = df["ONCOTREE_CODE"].unique().tolist()
+    # Report in sorted order (key=str tolerates non-string values) for stable logs.
+    invalid = [c for c in codes_in_df if str(c).upper() not in oncotree_dict]
+    if invalid:
+        logging.warning(
+            "There are invalid values in ONCOTREE_CODE column in the clinical df. "
+            f"They are: {sorted(invalid, key=str)}."
+        )
+
 
 class BpcProjectRunner(metaclass=ABCMeta):
     """BPC redcap to cbioportal export"""
@@ -1972,24 +1972,24 @@ def create_and_write_case_lists(
             "http://oncotree.mskcc.org/api/tumorTypes/tree?version=oncotree_2018_06_01"
         )
         oncotree_dict = process_functions.get_oncotree_code_mappings(oncotreelink)
-        remapped_oncotree_dict = map_oncotree_codes_to_cohort_name(oncotree_dict)
+        check_oncotree_codes(df = merged_clinicaldf, oncotree_dict = oncotree_dict)
 
         # Map cancer type and cancer type detailed
         # This is to create case list files
         merged_clinicaldf["CANCER_TYPE"] = [
-            remapped_oncotree_dict[code.upper()].get("CANCER_TYPE", float("nan"))
+            oncotree_dict[code.upper()].get("CANCER_TYPE", float("nan"))
             for code in merged_clinicaldf["ONCOTREE_CODE"]
         ]
         merged_clinicaldf["CANCER_TYPE_DETAILED"] = [
-            remapped_oncotree_dict[code.upper()].get("CANCER_TYPE_DETAILED", float("nan"))
+            oncotree_dict[code.upper()].get("CANCER_TYPE_DETAILED", float("nan"))
             for code in merged_clinicaldf["ONCOTREE_CODE"]
         ]
         merged_clinicaldf["ONCOTREE_PRIMARY_NODE"] = [
-            remapped_oncotree_dict[code.upper()].get("ONCOTREE_PRIMARY_NODE", float("nan"))
+            oncotree_dict[code.upper()].get("ONCOTREE_PRIMARY_NODE", float("nan"))
             for code in merged_clinicaldf["ONCOTREE_CODE"]
         ]
         merged_clinicaldf["ONCOTREE_SECONDARY_NODE"] = [
-            remapped_oncotree_dict[code.upper()].get("ONCOTREE_SECONDARY_NODE", float("nan"))
+            oncotree_dict[code.upper()].get("ONCOTREE_SECONDARY_NODE", float("nan"))
             for code in merged_clinicaldf["ONCOTREE_CODE"]
         ]
         # Remove duplicated sample ids (there shouldn't be any)

diff --git a/tests/test_bpc_redcap_export_mapping.py b/tests/test_bpc_redcap_export_mapping.py
@@ -1,3 +1,4 @@
+import logging
 import pytest
 from unittest import mock
 
@@ -6,6 +7,8 @@
 
 from geniesp import bpc_redcap_export_mapping as bpc_export
 
+LOGGER = logging.getLogger(__name__)
+
 
 @pytest.fixture
 def mock_syn():
@@ -115,107 +118,53 @@ def test_that_parse_drug_mappings(input_mapping, var_names, output_mapping):
     assert result == output_mapping
 
 
-@pytest.mark.parametrize(
-    "input_mapping, output_mapping",
+@pytest.mark.parametrize(
+    "input_data, oncotree_dict, expected_warning",
     [
         (
+            pd.DataFrame(
+                dict(
+                    ONCOTREE_CODE=["Renal Cell Carcinoma", "Renal Clear Cell Carcinoma"]
+                )
+            ),
             {
-                "RCC": {
-                    "CANCER_TYPE": "Renal cancer",
-                    "CANCER_TYPE_DETAILED": "Renal cancer detailed",
-                },
-                "OVARY": {
-                    "CANCER_TYPE": "Ovarian Cancer",
-                    "CANCER_TYPE_DETAILED": "Ovarian cancer detailed",
-                },
-            },
-            {
-                "RENAL": {
-                    "CANCER_TYPE": "Renal cancer",
-                    "CANCER_TYPE_DETAILED": "Renal cancer detailed",
-                },
-                "OVARIAN": {
-                    "CANCER_TYPE": "Ovarian Cancer",
-                    "CANCER_TYPE_DETAILED": "Ovarian cancer detailed",
-                },
-            },
-        ),
-        (
-            {
-                "BONE": {
-                    "CANCER_TYPE": "Bone Cancer",
-                    "CANCER_TYPE_DETAILED": "Bone cancer detailed",
-                },
-                "BRAIN": {
-                    "CANCER_TYPE": "Brain Cancer",
-                    "CANCER_TYPE_DETAILED": "Brain cancer detailed",
-                },
-            },
-            {
-                "BONE": {
-                    "CANCER_TYPE": "Bone Cancer",
-                    "CANCER_TYPE_DETAILED": "Bone cancer detailed",
-                },
-                "BRAIN": {
-                    "CANCER_TYPE": "Brain Cancer",
-                    "CANCER_TYPE_DETAILED": "Brain cancer detailed",
-                },
-            },
-        ),
-        (
-            {
-                "RCC": {
-                    "CANCER_TYPE": "Renal cancer",
-                    "CANCER_TYPE_DETAILED": "Renal cancer detailed",
-                },
-                "BONE": {
-                    "CANCER_TYPE": "Bone Cancer",
-                    "CANCER_TYPE_DETAILED": "Bone cancer detailed",
-                },
-            },
-            {
-                "RENAL": {
-                    "CANCER_TYPE": "Renal cancer",
-                    "CANCER_TYPE_DETAILED": "Renal cancer detailed",
-                },
-                "BONE": {
-                    "CANCER_TYPE": "Bone Cancer",
-                    "CANCER_TYPE_DETAILED": "Bone cancer detailed",
-                },
+                "RCC": {"CANCER_TYPE": "Renal Cell Carcinoma"},
+                "OVARY": {"CANCER_TYPE": "Ovarian Cancer"},
             },
+            "There are invalid values in ONCOTREE_CODE column in the clinical df. They are: ['Renal Cell Carcinoma', 'Renal Clear Cell Carcinoma']",
         ),
         (
+            pd.DataFrame(dict(ONCOTREE_CODE=["Renal Cell Carcinoma", "RCC"])),
             {
-                "rCC": {
-                    "CANCER_TYPE": "Renal cancer",
-                    "CANCER_TYPE_DETAILED": "Renal cancer detailed",
-                },
-                "OvArY": {
-                    "CANCER_TYPE": "Ovarian Cancer",
-                    "CANCER_TYPE_DETAILED": "Ovarian cancer detailed",
-                },
-            },
-            {
-                "RENAL": {
-                    "CANCER_TYPE": "Renal cancer",
-                    "CANCER_TYPE_DETAILED": "Renal cancer detailed",
-                },
-                "OVARIAN": {
-                    "CANCER_TYPE": "Ovarian Cancer",
-                    "CANCER_TYPE_DETAILED": "Ovarian cancer detailed",
-                },
+                "RCC": {"CANCER_TYPE": "Renal Cell Carcinoma"},
+                "OVARY": {"CANCER_TYPE": "Ovarian Cancer"},
             },
+            "There are invalid values in ONCOTREE_CODE column in the clinical df. They are: ['Renal Cell Carcinoma']",
         ),
     ],
-    ids=[
-        "all_remapped",
-        "no_codes_to_remap",
-        "some_codes_to_remap",
-        "diff_code_casing",
-    ],
+    ids=["all_invalid", "some_invalid"],
 )
-def test_that_map_oncotree_codes_to_cohort_name_returns_expected_remapped_values(
-    input_mapping, output_mapping
+def test_that_check_oncotree_codes_gives_expected_warning_when_invalid_codes(
+    caplog, input_data, oncotree_dict, expected_warning
 ):
-    result = bpc_export.map_oncotree_codes_to_cohort_name(oncotree_dict=input_mapping)
-    assert result == output_mapping
+    with caplog.at_level(logging.WARNING):
+        bpc_export.check_oncotree_codes(df=input_data, oncotree_dict=oncotree_dict)
+    assert expected_warning in caplog.text  # substring of the captured log text
+
+
+def test_that_check_oncotree_codes_gives_no_warning_when_all_codes_valid(
+    caplog,
+):
+    """No warning is logged when every ONCOTREE_CODE is a key of oncotree_dict."""
+    # Built inline: nothing parametrizes this test, so these cannot be arguments.
+    input_data = pd.DataFrame(dict(ONCOTREE_CODE=["RCC", "OVARY"]))
+    oncotree_dict = {
+        "RCC": {"CANCER_TYPE": "Renal Cell Carcinoma"},
+        "OVARY": {"CANCER_TYPE": "Ovarian Cancer"},
+    }
+    with caplog.at_level(logging.WARNING):
+        bpc_export.check_oncotree_codes(df=input_data, oncotree_dict=oncotree_dict)
+    assert (
+        "There are invalid values in ONCOTREE_CODE column in the clinical df."
+        not in caplog.text
+    )