diff --git a/src/scripts/notebooks/Mondo Mass Obsoletion - SME Review Summary.ipynb b/src/scripts/notebooks/Mondo Mass Obsoletion - SME Review Summary.ipynb new file mode 100644 index 0000000000..bea421ed9d --- /dev/null +++ b/src/scripts/notebooks/Mondo Mass Obsoletion - SME Review Summary.ipynb @@ -0,0 +1,1223 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4b43b98a", + "metadata": {}, + "source": [ + "## Mondo Mass Obsoletion - SME Review Summary\n", + "\n", + "This notebook summarizes the results of the SME Review of the Strategic Refinement (Mondo Mass Obsoletion) results to gauge whether classes predicted to leave the branch when their parent class is obsoleted should actually leave the branch.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b6d8b63e", + "metadata": {}, + "outputs": [], + "source": [ + "# Load packages\n", + "import os\n", + "import re\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "# Enable display of all columns in df\n", + "pd.set_option('display.max_columns', None)\n", + "\n", + "# Enable display of entire cell value\n", + "pd.set_option('display.max_colwidth', None)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2446c4ce", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "46" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# List all files and get their size\n", + "\n", + "path = \"./data\"\n", + "\n", + "# Get list of all files only in the given directory\n", + "data_files = lambda x : os.path.isfile(os.path.join(path,x))\n", + "files_list = filter(data_files, os.listdir(path))\n", + " \n", + "# Create a list of files in directory along with the size\n", + "file_metadata = [\n", + " (file, os.stat(os.path.join(path, file)).st_size)\n", + " for file in files_list\n", + "]\n", + "\n", + "# Display number of files\n", + "display(len(file_metadata))\n", + "\n", + "\n", + "# Iterate over list of files 
along with size\n", + "# for filename, size in file_metadata:\n", + "# print(\"{} : {} B\".format(filename, size))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "80948e20", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "** All Branch Files **\n", + "** Branch: Cancer-or-benign-tumor ( 5 ) ReviewerFiles: [('Cancer-or-benign-tumor_reviewer-JessicaMester.xlsx', 9095), ('Cancer-or-benign-tumor_reviewer-SumirPandit.xlsx', 9191), ('Cancer-or-benign-tumor_reviewer-GiocondaAlyea.xlsx', 9095), ('Cancer-or-benign-tumor_reviewer-1.xlsx', 9087), ('Cancer-or-benign-tumor_reviewer-NeomiCaban.xlsx', 9095)]\n", + "** Branch: Inflammatory-disease ( 3 ) ReviewerFiles: [('Inflammatory-disease_reviewer-MeganKraus.xlsx', 10548), ('Inflammatory-disease_reviewer-GiocondaAlyea.xlsx', 10431), ('Inflammatory-disease_reviewer-1.xlsx', 10032)]\n", + "** Branch: Musculoskeletal-system ( 2 ) ReviewerFiles: [('Musculoskeletal-system_reviewer-1.xlsx', 38813), ('Musculoskeletal-system_reviewer-GiocondaAlyea.xlsx', 39783)]\n", + "** Branch: Endocrine-Disease ( 3 ) ReviewerFiles: [('Endocrine-Disease_reviewer-JessicaMester.xlsx', 10115), ('Endocrine-Disease_reviewer-GiocondaAlyea.xlsx', 10115), ('Endocrine-Disease_reviewer-1.xlsx', 10106)]\n", + "** Branch: .DS ( 1 ) ReviewerFiles: [('.DS_Store', 6148)]\n", + "** Branch: Connective-tissue ( 2 ) ReviewerFiles: [('Connective-tissue_reviewer-AdaHamosh.xlsx', 6820), ('Connective-tissue_reviewer-1.xlsx', 6820)]\n", + "** Branch: Hematologic-disorder ( 3 ) ReviewerFiles: [('Hematologic-disorder_reviewer-GiocondaAlyea.xlsx', 9683), ('Hematologic-disorder_reviewer-JessicaMester.xlsx', 9683), ('Hematologic-disorder_reviewer-1.xlsx', 9674)]\n", + "** Branch: Immune-system-disorder ( 2 ) ReviewerFiles: [('Immune-system-disorder_reviewer-1.xlsx', 12989), ('Immune-system-disorder_reviewer-RachelSparks.xlsx', 13277)]\n", + "** Branch: Cardiovascular-Disorder ( 4 ) ReviewerFiles: 
[('Cardiovascular-Disorder_reviewer-1.xlsx', 15286), ('Cardiovascular-Disorder_reviewer-RachaelHuntley.xlsx', 15286), ('Cardiovascular-Disorder_reviewer-DylanGration.xlsx', 15344), ('Cardiovascular-Disorder_reviewer-GiocondaAlyea.xlsx', 15294)]\n", + "** Branch: Reproductive-system-disorder ( 4 ) ReviewerFiles: [('Reproductive-system-disorder_reviewer-GiocondaAlyea.xlsx', 12360), ('Reproductive-system-disorder_reviewer-1.xlsx', 11872), ('Reproductive-system-disorder_reviewer-PaolaRoncaglia.xlsx', 13491), ('Reproductive-system-disorder_reviewer-DylanGration.xlsx', 12359)]\n", + "** Branch: Integumentary-system-disorder ( 3 ) ReviewerFiles: [('Integumentary-system-disorder_reviewer-MeganKraus.xlsx', 22224), ('Integumentary-system-disorder_reviewer-1.xlsx', 18454), ('Integumentary-system-disorder_reviewer-GiocondaAlyea.xlsx', 19159)]\n", + "** Branch: Nervous-system-disorder ( 4 ) ReviewerFiles: [('Nervous-system-disorder_reviewer-DylanGration.xlsx', 22144), ('Nervous-system-disorder_reviewer-GiocondaAlyea.xlsx', 23142), ('Nervous-system-disorder_reviewer-1.xlsx', 23133), ('Nervous-system-disorder_reviewer-NeomiCaban.xlsx', 23142)]\n", + "** Branch: Metabolic-disease ( 2 ) ReviewerFiles: [('Metabolic-disease_reviewer-1.xlsx', 11296), ('Metabolic-disease_reviewer-AdaHamosh.xlsx', 11296)]\n", + "** Branch: Disorder-of-visual-system ( 2 ) ReviewerFiles: [('Disorder-of-visual-system_reviewer-1.xlsx', 22619), ('Disorder-of-visual-system_reviewer-GiocondaAlyea.xlsx', 23734)]\n", + "** Branch: Infectious-disease ( 1 ) ReviewerFiles: [('Infectious-disease_reviewer-1.xlsx', 6848)]\n", + "** Branch: Digestive-system-disorder ( 2 ) ReviewerFiles: [('Digestive-system-disorder_reviewer-1.xlsx', 10345), ('Digestive-system-disorder_reviewer-GiocondaAlyea.xlsx', 12291)]\n", + "** Branch: Respiratory-system ( 2 ) ReviewerFiles: [('Respiratory-system_reviewer-1.xlsx', 8920), ('Respiratory-system_reviewer-GiocondaAlyea.xlsx', 9172)]\n", + "** Branch: ~$Disorder-of-visual-system ( 1 ) 
ReviewerFiles: [('~$Disorder-of-visual-system_reviewer-GiocondaAlyea.xlsx', 165)]\n" + ] + } + ], + "source": [ + "# Create dictionary of file groups based on their branch name\n", + "\n", + "branches = {}\n", + "\n", + "seen = []\n", + "\n", + "for full_filename, size in file_metadata:\n", + " branch, reviewer = full_filename.split('_')\n", + "\n", + " if branch in seen:\n", + " branches[branch].append((full_filename, size))\n", + " else:\n", + " seen.append(branch)\n", + " branches[branch] = [(full_filename, size)]\n", + "\n", + "\n", + "print('** All Branch Files **')\n", + "for k,v in branches.items():\n", + " print(\"** Branch:\", k, '(',len(v), ')', \"ReviewerFiles:\", v)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "020dbc86", + "metadata": {}, + "outputs": [], + "source": [ + "# Find template file names\n", + "\n", + "templates = {}\n", + "\n", + "for branch, files in branches.items():\n", + " template_pattern = r'(.+)_reviewer-1\\.xlsx'\n", + " \n", + " # Find filename of template file\n", + " for file in files:\n", + " template_filename = re.search(template_pattern, file[0])\n", + " \n", + " if template_filename:\n", + " templates[file[0]] = file[1]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "8c61ceed", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "** Cancer-or-benign-tumor , NumReviewers: ( 4 ) ['Cancer-or-benign-tumor_reviewer-JessicaMester.xlsx', 'Cancer-or-benign-tumor_reviewer-SumirPandit.xlsx', 'Cancer-or-benign-tumor_reviewer-GiocondaAlyea.xlsx', 'Cancer-or-benign-tumor_reviewer-NeomiCaban.xlsx']\n", + "** Cardiovascular-Disorder , NumReviewers: ( 2 ) ['Cardiovascular-Disorder_reviewer-DylanGration.xlsx', 'Cardiovascular-Disorder_reviewer-GiocondaAlyea.xlsx']\n", + "** Connective-tissue , NumReviewers: ( 0 ) []\n", + "** Digestive-system-disorder , NumReviewers: ( 1 ) ['Digestive-system-disorder_reviewer-GiocondaAlyea.xlsx']\n", + "** 
Disorder-of-visual-system , NumReviewers: ( 1 ) ['Disorder-of-visual-system_reviewer-GiocondaAlyea.xlsx']\n", + "** Endocrine-Disease , NumReviewers: ( 2 ) ['Endocrine-Disease_reviewer-JessicaMester.xlsx', 'Endocrine-Disease_reviewer-GiocondaAlyea.xlsx']\n", + "** Hematologic-disorder , NumReviewers: ( 2 ) ['Hematologic-disorder_reviewer-GiocondaAlyea.xlsx', 'Hematologic-disorder_reviewer-JessicaMester.xlsx']\n", + "** Immune-system-disorder , NumReviewers: ( 1 ) ['Immune-system-disorder_reviewer-RachelSparks.xlsx']\n", + "** Infectious-disease , NumReviewers: ( 0 ) []\n", + "** Inflammatory-disease , NumReviewers: ( 2 ) ['Inflammatory-disease_reviewer-MeganKraus.xlsx', 'Inflammatory-disease_reviewer-GiocondaAlyea.xlsx']\n", + "** Integumentary-system-disorder , NumReviewers: ( 2 ) ['Integumentary-system-disorder_reviewer-MeganKraus.xlsx', 'Integumentary-system-disorder_reviewer-GiocondaAlyea.xlsx']\n", + "** Metabolic-disease , NumReviewers: ( 0 ) []\n", + "** Musculoskeletal-system , NumReviewers: ( 1 ) ['Musculoskeletal-system_reviewer-GiocondaAlyea.xlsx']\n", + "** Nervous-system-disorder , NumReviewers: ( 2 ) ['Nervous-system-disorder_reviewer-GiocondaAlyea.xlsx', 'Nervous-system-disorder_reviewer-NeomiCaban.xlsx']\n", + "** Reproductive-system-disorder , NumReviewers: ( 3 ) ['Reproductive-system-disorder_reviewer-GiocondaAlyea.xlsx', 'Reproductive-system-disorder_reviewer-PaolaRoncaglia.xlsx', 'Reproductive-system-disorder_reviewer-DylanGration.xlsx']\n", + "** Respiratory-system , NumReviewers: ( 1 ) ['Respiratory-system_reviewer-GiocondaAlyea.xlsx']\n" + ] + } + ], + "source": [ + "# Check which files were reviewed by comparing file size of \"template\" file to the other files\n", + "# NOTE: Even though some reviewer files were (slightly) larger than the \"template\", there was\n", + "# no reviewer data in the files to analyze \n", + "\n", + "data_files = {}\n", + "\n", + "for template_filename, template_size in templates.items():\n", + " filenames = []\n", 
+ "\n", + " branch_name = template_filename.split('_')[0]\n", + " \n", + " for filename, size in branches[branch_name]:\n", + " if filename != template_filename:\n", + " if size > template_size:\n", + " filenames.append(filename)\n", + " \n", + " data_files[branch_name] = filenames\n", + " \n", + "\n", + "# Display results\n", + "for key in sorted(data_files.keys()):\n", + " print('**', key, ', NumReviewers: (', len(data_files[key]), ')', data_files[key])\n", + " \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c88469c0", + "metadata": {}, + "outputs": [], + "source": [ + "# # Analyze the branch data_files (in batch)\n", + "# # NOTE: Batch analysis needs more work. Analyze each set of branch files separately/manually\n", + "\n", + "# df = pd.DataFrame()\n", + "\n", + "# for branch_name, files in data_files.items():\n", + "# for f in files:\n", + "# data = pd.read_excel(f\"data/{f}\")\n", + "\n", + "# df = pd.concat(pd.read_excel(f\"data/{excel_file}\") for excel_file in files)\n", + "\n", + "# df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "9bc77100", + "metadata": {}, + "source": [ + "---\n", + "### Analyze Individual Branches in Batch --> too many empty files" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "cbcdcf46", + "metadata": {}, + "outputs": [], + "source": [ + "#####\n", + "# Analyze \"Cancer-or-benign-tumor\" SME file\n", + "# NOTE: While all file_paths are files greater in size than the template file,\n", + "# only \"Cancer-or-benign-tumor_reviewer-SumirPandit.xlsx\" has any data entered. 
\n", + "#####\n", + "\n", + "\n", + "branch_name = \"Cancer-or-benign-tumor\" \n", + "file_paths = ['Cancer-or-benign-tumor_reviewer-JessicaMester.xlsx', 'Cancer-or-benign-tumor_reviewer-SumirPandit.xlsx', 'Cancer-or-benign-tumor_reviewer-GiocondaAlyea.xlsx', 'Cancer-or-benign-tumor_reviewer-NeomiCaban.xlsx']\n", + "\n", + "\n", + "selected_columns_indices = [0, 1, 2, 3, 4]\n", + "\n", + "\n", + "# Read the first file into a DataFrame\n", + "merged_dataframe = pd.read_excel(f\"data/{file_paths[0]}\", usecols=selected_columns_indices)\n", + "\n", + "# Iterate through the remaining file paths and merge them based on column 'Mondo ID'\n", + "for file_path in file_paths[1:]:\n", + " # Read the next file into a DataFrame\n", + " next_dataframe = pd.read_excel(f\"data/{file_path}\", usecols=[0,2, 3, 4])\n", + " \n", + " # Extract the reviewer name from the file path\n", + " reviewer_name = file_path.split('-')[-1].split('.')[0]\n", + "\n", + " # Define custom suffixes based on the reviewer name\n", + " suffix_left = f'_left_{reviewer_name}'\n", + " suffix_right = f'_right_{reviewer_name}'\n", + "\n", + " # Merge based on column A with custom suffixes\n", + " merged_dataframe = pd.merge(merged_dataframe, next_dataframe, on='Mondo ID', how='outer', suffixes=('', f'_{reviewer_name}')) # suffixes=(suffix_left, suffix_right)\n", + "\n", + " \n", + " \n", + "# # Define custom suffixes based on the file index\n", + "# suffix_left = f'_left_{file_paths.index(file_path)}'\n", + "# suffix_right = f'_right_{file_paths.index(file_path)}'\n", + "\n", + "# # Merge based on column A\n", + "# merged_dataframe = pd.merge(merged_dataframe, next_dataframe, on='Mondo ID', how='outer', suffixes=(suffix_left, suffix_right))\n", + "\n", + "\n", + "# Print the resulting merged DataFrame\n", + "# merged_dataframe.head()" + ] + }, + { + "cell_type": "markdown", + "id": "8325b923", + "metadata": {}, + "source": [ + "### Result for \"Cancer-or-benign-tumor\"\n", + "\n", + "Only 1 reviewer has added 
any data and very few files have any reviewer data so let's just manually list out which files have data!\n" + ] + }, + { + "cell_type": "markdown", + "id": "807b9f18", + "metadata": {}, + "source": [ + "---\n", + "---\n", + "### Check Manually which branches have any review data\n", + "\n", + "---\n", + "- Cancer-or-benign-tumor_reviewer-SumirPandit.xlsx\n", + "\n", + "---\n", + "- Cardiovascular-Disorder_reviewer-DylanGration.xlsx\n", + "\n", + "---\n", + "- Digestive-system-disorder_reviewer-GiocondaAlyea.xlsx\n", + "\n", + "---\n", + "- Disorder-of-visual-system_reviewer-GiocondaAlyea.xlsx\n", + "\n", + "---\n", + "- Immune-system-disorder_reviewer-RachelSparks.xlsx\n", + "\n", + "---\n", + "- Inflammatory-disease_reviewer-GiocondaAlyea.xlsx\n", + "- Inflammatory-disease_reviewer-MeganKraus.xlsx\n", + "\n", + "---\n", + "- Integumentary-system-disorder_reviewer-GiocondaAlyea.xlsx\n", + "- Integumentary-system-disorder_reviewer-MeganKraus.xlsx\n", + "\n", + "---\n", + "- Musculoskeletal-system_reviewer-GiocondaAlyea.xlsx\n", + "\n", + "---\n", + "- Reproductive-system-disorder_reviewer-PaolaRoncaglia.xlsx\n", + "\n", + "---\n", + "- Respiratory-system_reviewer-GiocondaAlyea.xlsx\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "0c2c270f", + "metadata": {}, + "source": [ + "---\n", + "## Read in Mass Obsoletion Report" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "b313b177", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
curielabelprevious_parentslatest_parentsadded_parentsremoved_parentsobsoleted_parentsprevious_brancheslatest_branchesis_branch_assignment_changedadded_branchesremoved_branches
0MONDO:001628146,XX ovotesticular disorder of sex development46,XX disorder of gonadal development(MONDO:0017961)disorder of sexual differentiation(MONDO:0002145)disorder of sexual differentiation(MONDO:0002145)46,XX disorder of gonadal development(MONDO:0017961)46,XX disorder of gonadal development(MONDO:0017961)disorder of development or morphogenesis(MONDO:0021147), endocrine system disorder(MONDO:0005151), reproductive system disorder(MONDO:0005039)disorder of development or morphogenesis(MONDO:0021147), endocrine system disorder(MONDO:0005151), reproductive system disorder(MONDO:0005039)FalseNaNNaN
1MONDO:001960846,XX disorder of sex development induced by maternal-derived androgen46,XX disorder of sex development induced by androgens excess(MONDO:0020039)NaNNaN46,XX disorder of sex development induced by androgens excess(MONDO:0020039)46,XX disorder of sex development induced by androgens excess(MONDO:0020039)disorder of development or morphogenesis(MONDO:0021147), endocrine system disorder(MONDO:0005151), reproductive system disorder(MONDO:0005039)NaNTrueNaNreproductive system disorder(MONDO:0005039), disorder of development or morphogenesis(MONDO:0021147), endocrine system disorder(MONDO:0005151)
2MONDO:001959346,XX disorder of sex development induced by fetal androgens excessfemale reproductive system disorder(MONDO:0002263), 46,XX disorder of sex development induced by androgens excess(MONDO:0020039)NaNNaNfemale reproductive system disorder(MONDO:0002263), 46,XX disorder of sex development induced by androgens excess(MONDO:0020039)46,XX disorder of sex development induced by androgens excess(MONDO:0020039)disorder of development or morphogenesis(MONDO:0021147), endocrine system disorder(MONDO:0005151), reproductive system disorder(MONDO:0005039)NaNTrueNaNreproductive system disorder(MONDO:0005039), disorder of development or morphogenesis(MONDO:0021147), endocrine system disorder(MONDO:0005151)
3MONDO:001796246,XX disorder of sex development induced by fetoplacental androgens excessfemale reproductive system disorder(MONDO:0002263), 46,XX disorder of sex development induced by androgens excess(MONDO:0020039)NaNNaNfemale reproductive system disorder(MONDO:0002263), 46,XX disorder of sex development induced by androgens excess(MONDO:0020039)46,XX disorder of sex development induced by androgens excess(MONDO:0020039)disorder of development or morphogenesis(MONDO:0021147), endocrine system disorder(MONDO:0005151), reproductive system disorder(MONDO:0005039)NaNTrueNaNreproductive system disorder(MONDO:0005039), disorder of development or morphogenesis(MONDO:0021147), endocrine system disorder(MONDO:0005151)
4MONDO:0014421glucocorticoid resistancedevelopmental anomaly of metabolic origin(MONDO:0015327), 46,XX disorder of sex development induced by fetal androgens excess(MONDO:0019593), adrenogenital syndrome(MONDO:0015898)adrenogenital syndrome(MONDO:0015898)NaNdevelopmental anomaly of metabolic origin(MONDO:0015327), 46,XX disorder of sex development induced by fetal androgens excess(MONDO:0019593)46,XX disorder of sex development induced by fetal androgens excess(MONDO:0019593)disorder of development or morphogenesis(MONDO:0021147), endocrine system disorder(MONDO:0005151), metabolic disease(MONDO:0005066), reproductive system disorder(MONDO:0005039)endocrine system disorder(MONDO:0005151), metabolic disease(MONDO:0005066)TrueNaNreproductive system disorder(MONDO:0005039), disorder of development or morphogenesis(MONDO:0021147)
\n", + "
" + ], + "text/plain": [ + " curie \\\n", + "0 MONDO:0016281 \n", + "1 MONDO:0019608 \n", + "2 MONDO:0019593 \n", + "3 MONDO:0017962 \n", + "4 MONDO:0014421 \n", + "\n", + " label \\\n", + "0 46,XX ovotesticular disorder of sex development \n", + "1 46,XX disorder of sex development induced by maternal-derived androgen \n", + "2 46,XX disorder of sex development induced by fetal androgens excess \n", + "3 46,XX disorder of sex development induced by fetoplacental androgens excess \n", + "4 glucocorticoid resistance \n", + "\n", + " previous_parents \\\n", + "0 46,XX disorder of gonadal development(MONDO:0017961) \n", + "1 46,XX disorder of sex development induced by androgens excess(MONDO:0020039) \n", + "2 female reproductive system disorder(MONDO:0002263), 46,XX disorder of sex development induced by androgens excess(MONDO:0020039) \n", + "3 female reproductive system disorder(MONDO:0002263), 46,XX disorder of sex development induced by androgens excess(MONDO:0020039) \n", + "4 developmental anomaly of metabolic origin(MONDO:0015327), 46,XX disorder of sex development induced by fetal androgens excess(MONDO:0019593), adrenogenital syndrome(MONDO:0015898) \n", + "\n", + " latest_parents \\\n", + "0 disorder of sexual differentiation(MONDO:0002145) \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 adrenogenital syndrome(MONDO:0015898) \n", + "\n", + " added_parents \\\n", + "0 disorder of sexual differentiation(MONDO:0002145) \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "\n", + " removed_parents \\\n", + "0 46,XX disorder of gonadal development(MONDO:0017961) \n", + "1 46,XX disorder of sex development induced by androgens excess(MONDO:0020039) \n", + "2 female reproductive system disorder(MONDO:0002263), 46,XX disorder of sex development induced by androgens excess(MONDO:0020039) \n", + "3 female reproductive system disorder(MONDO:0002263), 46,XX disorder of sex development induced by androgens excess(MONDO:0020039) \n", + "4 developmental 
anomaly of metabolic origin(MONDO:0015327), 46,XX disorder of sex development induced by fetal androgens excess(MONDO:0019593) \n", + "\n", + " obsoleted_parents \\\n", + "0 46,XX disorder of gonadal development(MONDO:0017961) \n", + "1 46,XX disorder of sex development induced by androgens excess(MONDO:0020039) \n", + "2 46,XX disorder of sex development induced by androgens excess(MONDO:0020039) \n", + "3 46,XX disorder of sex development induced by androgens excess(MONDO:0020039) \n", + "4 46,XX disorder of sex development induced by fetal androgens excess(MONDO:0019593) \n", + "\n", + " previous_branches \\\n", + "0 disorder of development or morphogenesis(MONDO:0021147), endocrine system disorder(MONDO:0005151), reproductive system disorder(MONDO:0005039) \n", + "1 disorder of development or morphogenesis(MONDO:0021147), endocrine system disorder(MONDO:0005151), reproductive system disorder(MONDO:0005039) \n", + "2 disorder of development or morphogenesis(MONDO:0021147), endocrine system disorder(MONDO:0005151), reproductive system disorder(MONDO:0005039) \n", + "3 disorder of development or morphogenesis(MONDO:0021147), endocrine system disorder(MONDO:0005151), reproductive system disorder(MONDO:0005039) \n", + "4 disorder of development or morphogenesis(MONDO:0021147), endocrine system disorder(MONDO:0005151), metabolic disease(MONDO:0005066), reproductive system disorder(MONDO:0005039) \n", + "\n", + " latest_branches \\\n", + "0 disorder of development or morphogenesis(MONDO:0021147), endocrine system disorder(MONDO:0005151), reproductive system disorder(MONDO:0005039) \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 endocrine system disorder(MONDO:0005151), metabolic disease(MONDO:0005066) \n", + "\n", + " is_branch_assignment_changed added_branches \\\n", + "0 False NaN \n", + "1 True NaN \n", + "2 True NaN \n", + "3 True NaN \n", + "4 True NaN \n", + "\n", + " removed_branches \n", + "0 NaN \n", + "1 reproductive system disorder(MONDO:0005039), 
disorder of development or morphogenesis(MONDO:0021147), endocrine system disorder(MONDO:0005151) \n", + "2 reproductive system disorder(MONDO:0005039), disorder of development or morphogenesis(MONDO:0021147), endocrine system disorder(MONDO:0005151) \n", + "3 reproductive system disorder(MONDO:0005039), disorder of development or morphogenesis(MONDO:0021147), endocrine system disorder(MONDO:0005151) \n", + "4 reproductive system disorder(MONDO:0005039), disorder of development or morphogenesis(MONDO:0021147) " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load file\n", + "report_df = pd.read_csv('mass_obsoletion_qc_report_15Nov2023.csv', sep=',')\n", + "report_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "5de7bb45", + "metadata": {}, + "source": [ + "---\n", + "### Check for Agreement with Cancer or benign tumor data from Reviewer\n", + "\n", + "#### Step 1 (Cancer or benign tumor)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "f3551923", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Mondo ID columns_marked_as_true\n", + "0 MONDO:0019328 [no]\n", + "1 MONDO:0017046 [yes]\n", + "2 MONDO:0017571 [unsure]\n", + "3 MONDO:0015756 [yes]\n", + "4 MONDO:0008876 [no]\n", + "5 MONDO:0033954 [no]\n", + "6 MONDO:0015757 [yes]\n", + "7 MONDO:0019329 [no]\n", + "8 MONDO:0008594 [unsure]\n", + "9 MONDO:0016725 [yes]\n", + "10 MONDO:0016745 [yes]\n", + "11 MONDO:0019476 [yes]\n", + "12 MONDO:0015185 [no]\n", + "13 MONDO:0018070 [unsure]\n", + "14 MONDO:0018192 [yes]\n", + "15 MONDO:0018717 [no]\n", + "16 MONDO:0044350 [no]\n", + "17 MONDO:0011605 [no]\n", + "18 MONDO:0015811 [yes]\n", + "19 MONDO:0015812 [yes]\n", + "20 MONDO:0008390 [no]\n", + "21 MONDO:0011512 [unsure]\n" + ] + } + ], + "source": [ + "# Copy report_df\n", + "report_copy_cancer_df = report_df.copy()\n", + "\n", + "# Read in reviewer file\n", + "reviewer_df 
= pd.read_excel('data/Cancer-or-benign-tumor_reviewer-SumirPandit.xlsx', usecols=['Mondo ID', 'yes', 'no', 'unsure', 'comment'])\n", + "reviewer_df.head()\n", + "\n", + "# # Find which column has the first True value for each row --> sometimes reviewers added yes/no and unsure\n", + "# reviewer_df['first_true_column'] = reviewer_df[['yes', 'no', 'unsure']].idxmax(axis=1)\n", + "# # Display the result\n", + "# print(reviewer_df[['Mondo ID', 'first_true_column']])\n", + "\n", + "\n", + "# Find which column has the first True value for each row\n", + "reviewer_df['columns_marked_as_true'] = reviewer_df[['yes', 'no', 'unsure']].apply(lambda row: row.index[row].tolist(), axis=1)\n", + "# Display the result\n", + "print(reviewer_df[['Mondo ID', 'columns_marked_as_true']])\n", + "\n", + "\n", + "# The form question was \"In your opinion, are the following diseases considered a \"cancer or benign tumor\"?\"\n", + "# Therefore, 'yes' means the term should 'Stay in the branch' and 'no' means it should 'Leave the Branch'." 
+ ] + }, + { + "cell_type": "markdown", + "id": "4817603d", + "metadata": {}, + "source": [ + "#### Step 2 (Cancer or benign tumor)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "f3a28969", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Mondo ID yes no unsure columns_marked_as_true\n", + "0 MONDO:0019328 False True False no\n", + "1 MONDO:0017046 True False False yes\n", + "3 MONDO:0015756 True False False yes\n", + "4 MONDO:0008876 False True False no\n", + "5 MONDO:0033954 False True False no\n", + "6 MONDO:0015757 True False False yes\n", + "7 MONDO:0019329 False True False no\n", + "9 MONDO:0016725 True False False yes\n", + "10 MONDO:0016745 True False False yes\n", + "11 MONDO:0019476 True False False yes\n", + "12 MONDO:0015185 False True False no\n", + "14 MONDO:0018192 True False False yes\n", + "15 MONDO:0018717 False True False no\n", + "16 MONDO:0044350 False True False no\n", + "17 MONDO:0011605 False True False no\n", + "18 MONDO:0015811 True False False yes\n", + "19 MONDO:0015812 True False False yes\n", + "20 MONDO:0008390 False True False no\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/rq/999gqdc13q1gx_pbf99hd6km0000gp/T/ipykernel_94444/3144103158.py:8: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df_filtered['columns_marked_as_true'] = df_filtered['columns_marked_as_true'].apply(lambda x: ', '.join(x))\n" + ] + } + ], + "source": [ + "# Check whether there is agreement between Reviewer and Curator Report\n", + "# to show that the term will 'Stay in the branch'\n", + "\n", + "#-------\n", + "# FIRST - filter dataframe to only contain rows where 
'columns_marked_as_true' is only yes or no, to remove\n", + "# Convert the lists to strings and keep rows where the value is 'yes' or 'no'\n", + "df_filtered = reviewer_df[reviewer_df['columns_marked_as_true'].apply(lambda x: isinstance(x, list) and any(value in x for value in ['yes', 'no']))]\n", + "\n", + "# Convert the values in 'columns_marked_as_true' to strings vs. lists\n", + "df_filtered['columns_marked_as_true'] = df_filtered['columns_marked_as_true'].apply(lambda x: ', '.join(x))\n", + "print(df_filtered)\n", + "\n", + "\n", + "# cancer_target_value = 'MONDO:0045024' #cancer or benign tumor(MONDO:0045024)\n", + "\n", + "# # Filter rows in reviewer_df where 'first_true_column' is 'yes'\n", + "# target_rows_reviewer_df = reviewer_df[reviewer_df['first_true_column'] == 'yes']\n", + "\n", + "# # Merge reviewer_df and report_df based on 'Mondo ID'\n", + "# stay_cancer_merged_df = pd.merge(target_rows_reviewer_df, report_copy_cancer_df, left_on='Mondo ID', right_on='curie', how='left')\n", + "# # stay_cancer_merged_df.head()\n", + "\n", + "\n", + "# # Check for the existence of 'MONDO:0045024' in the merged row\n", + "# stay_cancer_merged_df['target_exists_in_reportdf'] = stay_cancer_merged_df['latest_branches'].str.contains(cancer_target_value)\n", + "\n", + "# # Display the result\n", + "# display(stay_cancer_merged_df[['Mondo ID', 'latest_branches', 'target_exists_in_reportdf']])\n", + "\n", + "\n", + "# stay_cancer_merged_df['target_exists_in_reportdf'].value_counts()\n" + ] + }, + { + "cell_type": "markdown", + "id": "0ced1727", + "metadata": {}, + "source": [ + "#### Step 3 (Cancer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "afd96f47", + "metadata": {}, + "outputs": [], + "source": [ + "# Check whether there is agreement between Reviwer and Curator Report\n", + "# to show that the term will 'Leave the branch'\n", + "cancer_target_value = 'MONDO:0045024' #cancer or benign tumor(MONDO:0045024)\n", + "\n", + "# Filter rows in 
reviewer_df where the reviewer marked 'no' (and not 'yes').\n", + "# NOTE: 'first_true_column' is never created for reviewer_df (that code is commented out in Step 1),\n", + "# so select on the 'columns_marked_as_true' lists instead.\n", + "target_rows_reviewer_df = reviewer_df[reviewer_df['columns_marked_as_true'].apply(lambda marked: 'no' in marked and 'yes' not in marked)]\n", + "\n", + "# Merge reviewer_df and report_df based on 'Mondo ID'\n", + "leave_cancer_merged_df = pd.merge(target_rows_reviewer_df, report_copy_cancer_df, left_on='Mondo ID', right_on='curie', how='left')\n", + "# leave_cancer_merged_df.head()\n", + "\n", + "\n", + "# Check for the existence of 'MONDO:0045024' in the merged row\n", + "leave_cancer_merged_df['target_exists_in_reportdf'] = leave_cancer_merged_df['latest_branches'].str.contains(cancer_target_value)\n", + "\n", + "# Display the result\n", + "display(leave_cancer_merged_df[['Mondo ID', 'latest_branches', 'target_exists_in_reportdf']])\n", + "\n", + "\n", + "leave_cancer_merged_df['target_exists_in_reportdf'].value_counts()\n", + "\n", + "# NOTE: A result of 'True' when checking for 'Leave the branch' means there is _disagreement_ \n", + "# between reviewer and curator since the term _did_ remain in the branch." 
+ ] + }, + { + "cell_type": "markdown", + "id": "4e882980", + "metadata": {}, + "source": [ + "---\n", + "---\n", + "### Check for Agreement for Cardiovascular Disorder\n", + "\n", + "#### Step 1 (Cardio)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7b54bd0", + "metadata": {}, + "outputs": [], + "source": [ + "# Make a copy of report_df\n", + "report_copy_cardio_df = report_df.copy()\n", + "\n", + "# Read in reviewer file\n", + "cardio_reviewer_df = pd.read_excel('data/Cardiovascular-Disorder_reviewer-DylanGration.xlsx', usecols=['Mondo ID', 'yes', 'no', 'unsure'])\n", + "cardio_reviewer_df.head()\n", + "\n", + "# Find which column has the first True value for each row.\n", + "# NOTE: use the cardio reviewer's own marks (not the cancer reviewer_df). idxmax returns the first\n", + "# column ('yes') for a row with nothing marked, so such rows are set to NaN instead.\n", + "cardio_marks_df = cardio_reviewer_df[['yes', 'no', 'unsure']]\n", + "cardio_reviewer_df['first_true_column'] = cardio_marks_df.idxmax(axis=1).where(cardio_marks_df.any(axis=1))\n", + "\n", + "# Display the result\n", + "print(cardio_reviewer_df[['Mondo ID', 'first_true_column']])\n", + "\n", + "\n", + "# The form question was presumably \"In your opinion, are the following diseases considered a \"cardiovascular disorder\"?\" (wording assumed from the cancer form; confirm).\n", + "# Therefore, 'yes' means the term should 'Stay in the branch' and 'no' means it should 'Leave the Branch'.\n" + ] + }, + { + "cell_type": "markdown", + "id": "22f004c6", + "metadata": {}, + "source": [ + "#### Step 2 (Cardio)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4da68bd2", + "metadata": {}, + "outputs": [], + "source": [ + "# Check whether there is agreement between Reviewer and Curator Report\n", + "# to show that the term will 'Stay in the branch'\n", + "cardio_target_value = 'MONDO:0004995' #cardiovascular disorder(MONDO:0004995)\n", + "\n", + "# Filter rows in reviewer_df where 'first_true_column' is 'yes'\n", + "target_rows_cardio_reviewer_df = cardio_reviewer_df[cardio_reviewer_df['first_true_column'] == 'yes']\n", + "\n", + "# Merge reviewer_df and report_df based on 'Mondo ID'\n", + "stay_cardio_merged_df = pd.merge(target_rows_cardio_reviewer_df, report_copy_cardio_df, left_on='Mondo 
ID', right_on='curie', how='left')\n", + "# stay_cardio_merged_df.head()\n", + "\n", + "\n", + "# Check for the existence of 'MONDO:0004995' in the merged row\n", + "stay_cardio_merged_df['target_exists_in_reportdf'] = stay_cardio_merged_df['latest_branches'].str.contains(cardio_target_value)\n", + "\n", + "# Display the result\n", + "display(stay_cardio_merged_df[['Mondo ID', 'latest_branches', 'target_exists_in_reportdf']])\n", + "\n", + "\n", + "stay_cardio_merged_df['target_exists_in_reportdf'].value_counts()\n" + ] + }, + { + "cell_type": "markdown", + "id": "9ab8b771", + "metadata": {}, + "source": [ + "#### Step 3 (Cardio)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a043de1", + "metadata": {}, + "outputs": [], + "source": [ + "# Check whether there is agreement between Reviewer and Curator Report\n", + "# to show that the term will 'Leave the branch'\n", + "cardio_target_value = 'MONDO:0004995' # cardiovascular disorder (MONDO:0004995)\n", + "\n", + "# Filter rows in cardio_reviewer_df where 'first_true_column' is 'no'.\n", + "# The cardio reviewer frame is used so the selected rows come from the cardiovascular review.\n", + "target_rows_reviewer_df = cardio_reviewer_df[cardio_reviewer_df['first_true_column'] == 'no']\n", + "\n", + "# Merge the filtered reviewer rows with report_copy_cardio_df ('Mondo ID' on the left, 'curie' on the right)\n", + "leave_cardio_merged_df = pd.merge(target_rows_reviewer_df, report_copy_cardio_df, left_on='Mondo ID', right_on='curie', how='left')\n", + "# leave_cardio_merged_df.head()\n", + "\n", + "\n", + "# Check for the existence of cardio_target_value in 'latest_branches' of this cell's own merged frame\n", + "leave_cardio_merged_df['target_exists_in_reportdf'] = leave_cardio_merged_df['latest_branches'].str.contains(cardio_target_value)\n", + "\n", + "# Display the result\n", + "display(leave_cardio_merged_df[['Mondo ID', 'latest_branches', 'target_exists_in_reportdf']])\n", + "\n", + "\n", + "leave_cardio_merged_df['target_exists_in_reportdf'].value_counts()\n", + "\n", + "# NOTE: A result of 'True' when checking for 'Leave the branch' means there is _disagreement_ \n", + "# between reviewer and curator
since the term _did_ remain in the branch." + ] + }, + { + "cell_type": "markdown", + "id": "0268b330", + "metadata": {}, + "source": [ + "---\n", + "---\n", + "### Check for Agreement with Digestive data from Reviewer\n", + "\n", + "#### Step 1 (Digestive)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e355464", + "metadata": {}, + "outputs": [], + "source": [ + "# Make a copy of report_df\n", + "report_copy_digestive_df = report_df.copy()\n", + "\n", + "# Read in reviewer file\n", + "digestive_reviewer_df = pd.read_excel('data/Digestive-system-disorder_reviewer-GiocondaAlyea.xlsx', usecols=['Mondo ID', 'yes', 'no', 'unsure'])\n", + "digestive_reviewer_df.head()\n", + "\n", + "# Find which column has the first True value for each row.\n", + "# The answers are read from digestive_reviewer_df itself; reading them from another section's\n", + "# reviewer_df would label these Mondo IDs with a different reviewer's answers.\n", + "# NOTE(review): assuming boolean columns, idxmax returns 'yes' for a row where none of the\n", + "# three columns is True - confirm no such rows exist in this file.\n", + "digestive_reviewer_df['first_true_column'] = digestive_reviewer_df[['yes', 'no', 'unsure']].idxmax(axis=1)\n", + "\n", + "# Display the result\n", + "print(digestive_reviewer_df[['Mondo ID', 'first_true_column']])\n", + "\n", + "\n", + "# The form question was \"In your opinion, are the following diseases considered a \"cancer or benign tumor\"?\"\n", + "# NOTE(review): the quoted question names \"cancer or benign tumor\"; confirm the wording used on the digestive system disorder form.\n", + "# Therefore, 'yes' means the term should 'Stay in the branch' and 'no' means it should 'Leave the Branch'."
+ ] + }, + { + "cell_type": "markdown", + "id": "2b12e89a", + "metadata": {}, + "source": [ + "#### Step 2 (Digestive)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45453604", + "metadata": {}, + "outputs": [], + "source": [ + "# Check whether there is agreement between Reviewer and Curator Report\n", + "# to show that the term will 'Stay in the branch'\n", + "disgestive_target_value = 'MONDO:0004335' # digestive system disorder (MONDO:0004335)\n", + "\n", + "# Filter rows in digestive_reviewer_df where 'first_true_column' is 'yes'\n", + "target_rows_digestive_reviewer_df = digestive_reviewer_df[digestive_reviewer_df['first_true_column'] == 'yes']\n", + "\n", + "# Merge reviewer_df and report_df based on 'Mondo ID'\n", + "stay_digestive_merged_df = pd.merge(target_rows_digestive_reviewer_df, report_copy_digestive_df, left_on='Mondo ID', right_on='curie', how='left')\n", + "\n", + "# Check for the existence of 'MONDO:0004335' in the merged row\n", + "stay_digestive_merged_df['target_exists_in_reportdf'] = stay_digestive_merged_df['latest_branches'].str.contains(disgestive_target_value)\n", + "\n", + "# Display the result\n", + "display(stay_digestive_merged_df[['Mondo ID', 'latest_branches', 'target_exists_in_reportdf']])\n", + "\n", + "\n", + "stay_digestive_merged_df['target_exists_in_reportdf'].value_counts()\n" + ] + }, + { + "cell_type": "markdown", + "id": "897d2a67", + "metadata": {}, + "source": [ + "#### Step 3 (Digestive)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82291f21", + "metadata": {}, + "outputs": [], + "source": [ + "# Check whether there is agreement between Reviewer and Curator Report\n", + "# to show that the term will 'Leave the branch'\n", + "disgestive_target_value = 'MONDO:0004335'\n", + "\n", + "# Filter rows in digestive_reviewer_df where 'first_true_column' is 'no'\n", + "target_rows_reviewer_df = digestive_reviewer_df[digestive_reviewer_df['first_true_column'] == 'no']\n", + "\n", + "# Merge reviewer_df
and report_df based on 'Mondo ID'\n", + "leave_digestive_merged_df = pd.merge(target_rows_reviewer_df, report_copy_digestive_df, left_on='Mondo ID', right_on='curie', how='left')\n", + "\n", + "\n", + "# Check for the existence of disgestive_target_value ('MONDO:0004335') in the merged row.\n", + "# The digestive ID is used here; testing cardio_target_value would look for the cardiovascular branch instead.\n", + "leave_digestive_merged_df['target_exists_in_reportdf'] = leave_digestive_merged_df['latest_branches'].str.contains(disgestive_target_value)\n", + "\n", + "# Display the result\n", + "display(leave_digestive_merged_df[['Mondo ID', 'latest_branches', 'target_exists_in_reportdf']])\n", + "\n", + "\n", + "leave_digestive_merged_df['target_exists_in_reportdf'].value_counts()\n", + "\n", + "# NOTE: A result of 'True' when checking for 'Leave the branch' means there is _disagreement_ \n", + "# between reviewer and curator since the term _did_ remain in the branch." + ] + }, + { + "cell_type": "markdown", + "id": "0dfca0d1", + "metadata": {}, + "source": [ + "---\n", + "---\n", + "### Check for Agreement on Disorder of visual system\n", + "\n", + "#### Step 1 (Visual)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "0ae594c4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Mondo ID true_columns\n", + "0 MONDO:0007383 [yes]\n", + "1 MONDO:0017304 [yes]\n", + "2 MONDO:0017212 [yes, unsure]\n", + "3 MONDO:0010181 [yes]\n", + "4 MONDO:0008927 [yes]\n", + ".. ... ...\n", + "128 MONDO:0020252 []\n", + "129 MONDO:0018152 []\n", + "130 MONDO:0018997 []\n", + "131 MONDO:0009485 []\n", + "132 MONDO:0008397 []\n", + "\n", + "[133 rows x 2 columns]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Mondo IDyesnounsuretrue_columns
0MONDO:0007383TrueFalseFalse[yes]
1MONDO:0017304TrueFalseFalse[yes]
2MONDO:0017212TrueFalseTrue[yes, unsure]
3MONDO:0010181TrueFalseFalse[yes]
4MONDO:0008927TrueFalseFalse[yes]
..................
128MONDO:0020252FalseFalseFalse[]
129MONDO:0018152FalseFalseFalse[]
130MONDO:0018997FalseFalseFalse[]
131MONDO:0009485FalseFalseFalse[]
132MONDO:0008397FalseFalseFalse[]
\n", + "

133 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " Mondo ID yes no unsure true_columns\n", + "0 MONDO:0007383 True False False [yes]\n", + "1 MONDO:0017304 True False False [yes]\n", + "2 MONDO:0017212 True False True [yes, unsure]\n", + "3 MONDO:0010181 True False False [yes]\n", + "4 MONDO:0008927 True False False [yes]\n", + ".. ... ... ... ... ...\n", + "128 MONDO:0020252 False False False []\n", + "129 MONDO:0018152 False False False []\n", + "130 MONDO:0018997 False False False []\n", + "131 MONDO:0009485 False False False []\n", + "132 MONDO:0008397 False False False []\n", + "\n", + "[133 rows x 5 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Make a copy of report_df\n", + "report_copy_visual_df = report_df.copy()\n", + "\n", + "# Read in reviewer file\n", + "visual_reviewer_df = pd.read_excel('data/Disorder-of-visual-system_reviewer-GiocondaAlyea.xlsx', usecols=['Mondo ID', 'yes', 'no', 'unsure'])\n", + "visual_reviewer_df.head()\n", + "\n", + "# Find which column has the first True value for each row\n", + "# ORIG Analysis\n", + "# visual_reviewer_df['first_true_column'] = visual_reviewer_df[['yes', 'no', 'unsure']].idxmax(axis=1)\n", + "\n", + "# visual_reviewer_df[['yes', 'no', 'unsure']] = visual_reviewer_df[['yes', 'no', 'unsure']].astype(bool)\n", + "# visual_reviewer_df['first_true_column'] = visual_reviewer_df[['yes', 'no', 'unsure']].idxmax(axis=1, skipna=True)\n", + "\n", + "visual_reviewer_df['true_columns'] = visual_reviewer_df[['yes', 'no', 'unsure']].apply(lambda row: row.index[row].tolist(), axis=1)\n", + "\n", + "\n", + "# Display the result\n", + "print(visual_reviewer_df[['Mondo ID', 'true_columns']])\n", + "\n", + "\n", + "visual_reviewer_df.head(len(visual_reviewer_df))\n", + "\n", + "# The form question was \"In your opinion, are the following diseases considered a \"cancer or benign tumor\"?\"\n", + "# Therefore, 'yes' means the term should 'Stay in the branch' and 'no' means it 
should 'Leave the Branch'." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0153a95", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/scripts/notebooks/Mondo_reclassification_report_v2023-12-12.xlsx b/src/scripts/notebooks/Mondo_reclassification_report_v2023-12-12.xlsx new file mode 100644 index 0000000000..4b6c5d49e5 Binary files /dev/null and b/src/scripts/notebooks/Mondo_reclassification_report_v2023-12-12.xlsx differ