Skip to content

Commit

Permalink
convert to warning function
Browse files Browse the repository at this point in the history
  • Loading branch information
rxu17 committed Feb 1, 2025
1 parent 27208f8 commit 7d95435
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 115 deletions.
46 changes: 23 additions & 23 deletions geniesp/bpc_redcap_export_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,6 @@
"data_CNA.txt",
]

# Oncotree codes (keys) paired with the cohort names (values) substituted for
# them. NOTE(review): lookups elsewhere in this module upper-case the code
# before consulting this table, so keys are presumably expected to be
# upper-case -- confirm before adding entries.
ONCOTREE_CODE_TO_COHORT_MAP = {
"RCC": "RENAL",
"OVARY": "OVARIAN",
"MEL": "MELANOMA",
"EGC": "ESOPHAGO"
}

def get_file_data(
syn: Synapse, mappingdf: pd.DataFrame, sampletype: str, cohort: str = "NSCLC"
) -> dict:
Expand Down Expand Up @@ -567,19 +560,26 @@ def _convert_to_int(value):
return float('nan')


def check_oncotree_codes(
    df: pd.DataFrame,
    oncotree_dict: Dict[str, Dict[str, str]],
) -> None:
    """Log a warning listing ONCOTREE_CODE values that are not official codes.

    Each distinct value of the ``ONCOTREE_CODE`` column is upper-cased (when
    it is a string) and looked up among the keys of ``oncotree_dict``. Values
    without a match are reported, in their original spelling and in a stable
    sorted order, in a single warning. Nothing is logged when every value
    matches. Unknown codes never raise here and ``df`` is not modified.

    Args:
        df (pd.DataFrame): input data; must contain an ONCOTREE_CODE column
        oncotree_dict (Dict[str, Dict[str, str]]): official oncotree code
            mappings, keyed by oncotree code
    """
    known_codes = set(oncotree_dict)

    def _is_known(code) -> bool:
        # The mapping is consumed with ``code.upper()`` when cancer types are
        # assigned, so compare the same way; otherwise codes that differ only
        # in case would be reported as invalid although they resolve fine.
        # Non-string values (e.g. NaN) are compared as-is.
        lookup = code.upper() if isinstance(code, str) else code
        return lookup in known_codes

    # ``key=str`` keeps the sort well-defined even if the column mixes
    # strings with missing values such as NaN.
    invalid_codes = sorted(
        (code for code in df["ONCOTREE_CODE"].unique().tolist() if not _is_known(code)),
        key=str,
    )
    if invalid_codes:
        # Trailing space matters: adjacent literals are concatenated verbatim.
        logging.warning(
            "There are invalid values in ONCOTREE_CODE column in the clinical df. "
            f"They are: {invalid_codes}."
        )


class BpcProjectRunner(metaclass=ABCMeta):
"""BPC redcap to cbioportal export"""
Expand Down Expand Up @@ -1972,24 +1972,24 @@ def create_and_write_case_lists(
"http://oncotree.mskcc.org/api/tumorTypes/tree?version=oncotree_2018_06_01"
)
oncotree_dict = process_functions.get_oncotree_code_mappings(oncotreelink)
remapped_oncotree_dict = map_oncotree_codes_to_cohort_name(oncotree_dict)
check_oncotree_codes(df = merged_clinicaldf, oncotree_dict = oncotree_dict)

# Map cancer type and cancer type detailed
# This is to create case list files
merged_clinicaldf["CANCER_TYPE"] = [
remapped_oncotree_dict[code.upper()].get("CANCER_TYPE", float("nan"))
oncotree_dict[code.upper()].get("CANCER_TYPE", float("nan"))
for code in merged_clinicaldf["ONCOTREE_CODE"]
]
merged_clinicaldf["CANCER_TYPE_DETAILED"] = [
remapped_oncotree_dict[code.upper()].get("CANCER_TYPE_DETAILED", float("nan"))
oncotree_dict[code.upper()].get("CANCER_TYPE_DETAILED", float("nan"))
for code in merged_clinicaldf["ONCOTREE_CODE"]
]
merged_clinicaldf["ONCOTREE_PRIMARY_NODE"] = [
remapped_oncotree_dict[code.upper()].get("ONCOTREE_PRIMARY_NODE", float("nan"))
oncotree_dict[code.upper()].get("ONCOTREE_PRIMARY_NODE", float("nan"))
for code in merged_clinicaldf["ONCOTREE_CODE"]
]
merged_clinicaldf["ONCOTREE_SECONDARY_NODE"] = [
remapped_oncotree_dict[code.upper()].get("ONCOTREE_SECONDARY_NODE", float("nan"))
oncotree_dict[code.upper()].get("ONCOTREE_SECONDARY_NODE", float("nan"))
for code in merged_clinicaldf["ONCOTREE_CODE"]
]
# Remove duplicated sample ids (there shouldn't be any)
Expand Down
133 changes: 41 additions & 92 deletions tests/test_bpc_redcap_export_mapping.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
import pytest
from unittest import mock

Expand All @@ -6,6 +7,8 @@

from geniesp import bpc_redcap_export_mapping as bpc_export

LOGGER = logging.getLogger(__name__)


@pytest.fixture
def mock_syn():
Expand Down Expand Up @@ -115,107 +118,53 @@ def test_that_parse_drug_mappings(input_mapping, var_names, output_mapping):
assert result == output_mapping


@pytest.mark.parametrize(
    "input_data, oncotree_dict, expected_invalid_codes",
    [
        (
            pd.DataFrame(
                dict(
                    ONCOTREE_CODE=["Renal Cell Carcinoma", "Renal Clear Cell Carcinoma"]
                )
            ),
            {
                "RCC": {"CANCER_TYPE": "Renal Cell Carcinoma"},
                "OVARY": {"CANCER_TYPE": "Ovarian Cancer"},
            },
            ["Renal Cell Carcinoma", "Renal Clear Cell Carcinoma"],
        ),
        (
            pd.DataFrame(dict(ONCOTREE_CODE=["Renal Cell Carcinoma", "RCC"])),
            {
                "RCC": {"CANCER_TYPE": "Renal Cell Carcinoma"},
                "OVARY": {"CANCER_TYPE": "Ovarian Cancer"},
            },
            ["Renal Cell Carcinoma"],
        ),
    ],
    ids=["all_invalid", "some_invalid"],
)
def test_that_check_oncotree_codes_gives_expected_warning_when_invalid_codes(
    caplog, input_data, oncotree_dict, expected_invalid_codes
):
    """Invalid ONCOTREE_CODE values are named in the warning; valid ones are not.

    The assertions deliberately avoid matching the whole message: the codes
    are collected through a set, so their order in the message is not
    something this test should depend on.
    """
    with caplog.at_level(logging.WARNING):
        bpc_export.check_oncotree_codes(df=input_data, oncotree_dict=oncotree_dict)
    assert (
        "There are invalid values in ONCOTREE_CODE column in the clinical df."
        in caplog.text
    )
    # The message embeds the list of codes, so each code shows up as its repr.
    for code in input_data["ONCOTREE_CODE"]:
        assert (repr(code) in caplog.text) == (code in expected_invalid_codes)


def test_that_check_oncotree_codes_gives_no_warning_when_all_codes_valid(caplog):
    """No warning is logged when every ONCOTREE_CODE is an official code."""
    # Built locally: this test is not parametrized, so the inputs must not be
    # declared as test arguments (pytest would look for fixtures of that name).
    input_data = pd.DataFrame(dict(ONCOTREE_CODE=["RCC", "OVARY"]))
    oncotree_dict = {
        "RCC": {"CANCER_TYPE": "Renal Cell Carcinoma"},
        "OVARY": {"CANCER_TYPE": "Ovarian Cancer"},
    }
    with caplog.at_level(logging.WARNING):
        bpc_export.check_oncotree_codes(df=input_data, oncotree_dict=oncotree_dict)
    assert (
        "There are invalid values in ONCOTREE_CODE column in the clinical df."
        not in caplog.text
    )

0 comments on commit 7d95435

Please sign in to comment.