diff --git a/scripts/config.yaml b/scripts/config.yaml new file mode 100644 index 0000000..14969fb --- /dev/null +++ b/scripts/config.yaml @@ -0,0 +1,50 @@ +dataset: + 'Cancer-level dataset': + id: syn22296816 + file: ca_dx_derived.csv + 'Patient-level dataset': + id: syn22296817 + file: pt_derived.csv + 'Regimen-Cancer level dataset': + id: syn22296818 + file: ca_drugs_derived.csv + 'Imaging-level dataset': + id: syn22296819 + file: prissmm_image_derived.csv + 'Pathology-report level dataset': + id: syn22296820 + file: prissmm_path_derived.csv + 'Med Onc Note level dataset': + id: syn22296822 + file: prissmm_md_derived.csv + 'Cancer panel test level dataset': + id: syn22296823 + file: cpt_derived.csv + 'Cancer-level index dataset': + id: syn22314486 + file: ca_dx_derived_index.csv + 'Cancer-level non-index dataset': + id: syn22314497 + file: ca_dx_derived_non_index.csv + 'Hemeonc dataset': + id: syn23561688 + file: hemonc_mapping_cbio.csv + 'PRISSMM Tumor Marker level dataset': + id: syn23561700 + file: prissmm_tm_derived.csv + 'Cancer-Directed Radiation Therapy dataset': + id: syn25931923 + file: ca_radtx_derived.csv +check: + 1: + function: check_code_name_empty + implemented: 1 + deprecated: 0 + description: Code is empty. + request: Please remove the row or fill in the code name. + 2: + function: check_code_name_absent + implemented: 1 + deprecated: 0 + description: Code does not exist in associated dataset. + request: Please check the code name and associated dataset. 
"""
Description: Validate the BPC to cBioPortal mapping file.
Author: Haley Hunter-Zinck
Date: 2022-01-27
"""

import argparse
import logging
import re

import pandas as pd
import synapseclient
from synapseclient import Synapse
from synapseclient.core.exceptions import (
    SynapseAuthenticationError,
    SynapseNoCredentialsError,
)
import yaml


def check_code_name_empty(df: pd.DataFrame, syn: Synapse, config: dict) -> list:
    """Check for any code that is empty.

    Args:
        df: dataframe representing map
        syn: Synapse object (unused; kept so all checks share one signature)
        config: configuration parameters (unused; kept so all checks share one signature)

    Returns:
        list of code entries that are missing (NaN).
    """
    return list(df.loc[pd.isna(df["code"]), "code"])


def check_code_name_absent(df: pd.DataFrame, syn: Synapse, config: dict) -> list:
    """Check for any code name that does not appear in its associated data file.

    Args:
        df: dataframe representing map
        syn: Synapse object used to download each configured dataset
        config: configuration parameters (provides dataset Synapse IDs)

    Returns:
        list of code names absent from their associated dataset's columns.
    """
    absent = []
    for dataset in config["dataset"]:
        data = pd.read_csv(
            syn.get(config["dataset"][dataset]["id"])["path"], low_memory=False
        )
        # column names are the authoritative set of codes for this dataset;
        # a set gives O(1) membership tests below
        code_data = set(data.columns)

        # codes associated with this dataset and of type derived or curated
        data_type = df["data_type"].str.lower()
        code_map = list(
            df.loc[
                (df["dataset"] == dataset)
                & data_type.isin(["derived", "curated"])
            ]["code"]
        )

        # do not check wildcard (trailing '*') or NA code names
        code_map = [
            code
            for code in code_map
            if not pd.isna(code)
            and not re.match(r"^.+[*]$", str(code).strip())
        ]

        absent.extend(set(code_map) - code_data)
    return absent


def format_result(codes: list, config: dict, check_no: int) -> pd.DataFrame:
    """Format one check's output for an interpretable log file.

    Args:
        codes: offending code names reported by a check
        config: configuration parameters (provides check descriptions/requests)
        check_no: check number for which to format results

    Returns:
        dataframe with one row per offending code, annotated with the
        check number, its description, and the requested action.
    """
    formatted = pd.DataFrame()
    formatted["code"] = codes
    # scalar assignments broadcast to every row (and are no-ops when empty)
    formatted["check_no"] = str(check_no)
    formatted["description"] = config["check"][check_no]["description"]
    formatted["action"] = config["check"][check_no]["request"]
    return formatted


def create_function_map() -> dict:
    """Map check function names (as used in config.yaml) to callables."""
    return {
        "check_code_name_absent": check_code_name_absent,
        "check_code_name_empty": check_code_name_empty,
    }


def validate_map(
    synapse_id: str, syn: Synapse, config: dict, version: int
) -> pd.DataFrame:
    """Run all implemented checks on the mapping file.

    Args:
        synapse_id: Synapse ID of mapping file
        syn: Synapse object
        config: configuration parameters
        version: version number of Synapse ID, or the string "None" for current

    Returns:
        dataframe with one row per detected error, annotated with metadata.
    """
    fxns = create_function_map()
    # "None" (string) is the argparse default meaning "current version"
    if version == "None":
        df = pd.read_csv(syn.get(synapse_id)["path"])
    else:
        df = pd.read_csv(syn.get(synapse_id, version=version)["path"])

    results = []
    for check_no in config["check"]:
        logging.info(f"Check {check_no}...")

        spec = config["check"][check_no]
        if spec["implemented"] and not spec["deprecated"]:
            result = fxns[spec["function"]](df, syn, config)
            formatted = format_result(result, config, check_no)
            results.append(formatted)
            # report this check's count, not the running total
            logging.info(f"  Found {formatted.shape[0]} error(s).")
        else:
            logging.info("  Check deprecated or not implemented.")

    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    errors = pd.concat(results, ignore_index=True) if results else pd.DataFrame()

    errors.insert(0, "issue", range(1, errors.shape[0] + 1, 1))

    return errors


def build_parser():
    """Build the command-line argument parser."""
    parser = argparse.ArgumentParser(
        description="Checks validity of BPC to cBioPortal mapping file "
    )
    parser.add_argument(
        "synapse_id",
        metavar="SYNAPSE_ID",
        type=str,
        help="Synapse ID of mapping file",
    )
    parser.add_argument(
        "--version",
        "-v",
        metavar="VERSION",
        type=str,
        default="None",
        help="Synapse entity version number " "(default: current)",
    )
    parser.add_argument(
        "--outfile",
        "-o",
        metavar="OUTFILE",
        type=str,
        default="output.csv",
        help="Name of output file " "(default: %(default)s)",
    )
    parser.add_argument(
        "--log",
        "-l",
        type=str,
        choices=["debug", "info", "warning", "error"],
        default="error",
        help="Set logging output level " "(default: %(default)s)",
    )
    return parser


def read_config(file: str) -> dict:
    """Read the YAML configuration file.

    Args:
        file: path to the YAML configuration file

    Returns:
        configuration parameters as a dict.

    Raises:
        yaml.YAMLError: if the file cannot be parsed.
    """
    with open(file, "r") as stream:
        try:
            return yaml.safe_load(stream)
        except yaml.YAMLError:
            # re-raise rather than silently returning None, which would
            # surface later as an opaque TypeError in the caller
            logging.exception(f"Could not parse configuration file '{file}'")
            raise


def synapse_login(synapse_config=synapseclient.client.CONFIG_FILE):
    """Login to Synapse.

    Args:
        synapse_config: Path to synapse configuration file.
            Defaults to ~/.synapseConfig
    Returns:
        Synapse connection
    """
    try:
        syn = synapseclient.Synapse(skip_checks=True, configPath=synapse_config)
        syn.login(silent=True)
    except (SynapseNoCredentialsError, SynapseAuthenticationError) as err:
        raise ValueError(
            "Login error: please make sure you have correctly "
            "configured your client. Instructions here: "
            "https://help.synapse.org/docs/Client-Configuration.1985446156.html. "
            "You can also create a Synapse Personal Access Token and set it "
            "as an environmental variable: "
            "SYNAPSE_AUTH_TOKEN=''"
        ) from err
    return syn


def main():
    args = build_parser().parse_args()

    # configure logging before logging in so login-time records are honored
    numeric_level = getattr(logging, args.log.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError("Invalid log level: %s" % args.log)
    logging.basicConfig(level=numeric_level)

    config = read_config("config.yaml")
    syn = synapse_login()

    res = validate_map(args.synapse_id, syn, config, args.version)
    res.to_csv(args.outfile, index=False)

    logging.info(f"Output written to '{args.outfile}'")


# --- from scripts/test_validate.py ---
def test__function_map():
    """Every function referenced in the config file must be in the function map."""
    config = read_config("config.yaml")
    fxn_map = create_function_map()

    fxn_config = {config["check"][check]["function"] for check in config["check"]}

    assert not (fxn_config - set(fxn_map.keys()))


if __name__ == "__main__":
    main()