From 0313cb52cc030a693c5679a0dcc61b28dd883221 Mon Sep 17 00:00:00 2001 From: Mark Wilkinson Date: Wed, 7 Aug 2024 12:54:16 +0200 Subject: [PATCH] working environment for phenotypes --- content/FLAIR-GG/phenotypefrequency.ipynb | 113 ++++++++++++++++------ 1 file changed, 82 insertions(+), 31 deletions(-) diff --git a/content/FLAIR-GG/phenotypefrequency.ipynb b/content/FLAIR-GG/phenotypefrequency.ipynb index 5cce723..4f71e83 100644 --- a/content/FLAIR-GG/phenotypefrequency.ipynb +++ b/content/FLAIR-GG/phenotypefrequency.ipynb @@ -1,20 +1,13 @@ { "cells": [ - { - "cell_type": "markdown", - "id": "039f7a57-a580-4d29-b126-25b51f67140e", - "metadata": {}, - "source": [ - "# Results explorer for the EJP Counting Case\n", - "" - ] - }, { "cell_type": "markdown", "id": "ed7ed56c-0088-4335-8d0c-7b9fdf2def85", "metadata": {}, "source": [ - "## Welcome to the FLAIR-GG Counting Case Analytics Notebook\n", + "## Welcome to the FLAIR-GG Phenotype Frequency Analytics Notebook\n", + "\n", + "The Phenotype Frequency Data Service will count the number of times a phenotype appears in a registry (on a per-patient basis - only counts once per patient)\n", "\n", "Please run the first cell to set-up the analytics environment\n", "\n", @@ -71,25 +64,25 @@ "# type,frequency\n", "# http://purl.obolibrary.org/obo/NCIT_C131922,31\n", "# http://purl.obolibrary.org/obo/NCIT_C136154,22\n", - "# http://www.orpha.net/ORDO/Orphanet_98896,32\n", - "# http://purl.obolibrary.org/obo/NCIT_C20197,76\n", - "# http://purl.obolibrary.org/obo/NCIT_C68615,87\n", - "# http://purl.obolibrary.org/obo/NCIT_C83164,87\n", - "# http://purl.obolibrary.org/obo/NCIT_C13306,24\n", "\n", "response = requests.get(url)\n", "response = json.loads(response.content)\n", - "# print(response)\n", - "site = [\"sample1\", \"sample2\"]\n", - "count = [100, 3]\n", + "#print(response)\n", + "\n", + "data = {}\n", "for provider in response.keys():\n", " print(\"Provider: {}\".format(provider))\n", - " data = response[provider]\n", + " alllines = response[provider]\n", + " data[provider] = []\n", "\n", - " data = data.splitlines().pop()\n", + " lines = alllines.splitlines() # this is the CSV\n", + " lines.pop(0) # get rid of header\n", + " # print(data)\n", + " for line in iter(lines):\n", + " [pheno, freq] = line.split(\",\")\n", + " data[provider].append([pheno, freq])\n", "\n", - " site.append(provider)\n", - " count.append(int(data))\n", + "# print(data)\n", "print(\"DONE\")" ] }, @@ -100,15 +93,73 @@ "metadata": {}, "outputs": [], "source": [ - "source = pd.DataFrame({\n", - " 'site': site,\n", - " 'count': count\n", - "})\n", - "\n", - "alt.Chart(source).mark_bar().encode(\n", - " x='count',\n", - " y='site'\n", - ")" + "data_list = []\n", + "for provider, phenotypes in data.items():\n", + " for phenotype, frequency in phenotypes:\n", + " data_list.append({\"Provider\": provider, \"Phenotype\": phenotype, \"Frequency\": frequency})\n", + "\n", + "# Create a DataFrame\n", + "df = pd.DataFrame(data_list)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac295f7c", + "metadata": {}, + "outputs": [], + "source": [ + "# Simple Bar Chart\n", + "bar_chart = alt.Chart(df).mark_bar().encode(\n", + " x='Phenotype:N',\n", + " y='Frequency:Q',\n", + " color='Provider:N',\n", + " column='Provider:N'\n", + ").properties(\n", + " title='Phenotype Frequencies by Provider'\n", + ")\n", + "\n", + "bar_chart" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "86bee1fb", + "metadata": {}, + "outputs": [], + "source": [ + "# Grouped Bar Chart\n", + "grouped_bar_chart = alt.Chart(df).mark_bar().encode(\n", + " x=alt.X('Phenotype:N', title='Phenotype'),\n", + " y=alt.Y('Frequency:Q', title='Frequency'),\n", + " color=alt.Color('Provider:N', title='Provider'),\n", + " column=alt.Column('Provider:N', title='Provider')\n", + ").properties(\n", + " title='Grouped Phenotype Frequencies by Provider'\n", + ")\n", + "\n", + "grouped_bar_chart" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54916d07", + "metadata": {}, + "outputs": [], + "source": [ + "# Heatmap\n", + "heatmap = alt.Chart(df).mark_rect().encode(\n", + " x='Phenotype:N',\n", + " y='Provider:N',\n", + " color=alt.Color('Frequency:Q', scale=alt.Scale(scheme='viridis')),\n", + " tooltip=['Provider', 'Phenotype', 'Frequency']\n", + ").properties(\n", + " title='Phenotype Frequencies Heatmap'\n", + ")\n", + "\n", + "heatmap" ] }, {