diff --git a/script_config.yaml b/script_config.yaml index cd44df6..e217a0d 100644 --- a/script_config.yaml +++ b/script_config.yaml @@ -1,5 +1,5 @@ -database_location: "/mnt/biocluster/praktikum/tutorium_19/gnomad_db/data/exomes" # where to create the database, make sure you have space on your device. -gnomad_vcf_location: "/mnt/biocluster/praktikum/tutorium_19/gnomad_db/data/exomes" # where are your *.vcf.bgz located +database_location: "test_out" # where to create the database, make sure you have space on your device. +gnomad_vcf_location: "data" # where are your *.vcf.bgz located tables_location: "test_out" # where to store the preprocessed intermediate files, you can leave it like this script_locations: "test_out" # where to store the scripts, where you can check the progress of your jobs, you can leave it like this gnomad_version: "v4" # main gnomad_version version of the gnomAD vcf file (e.g., v2, v3, v4) diff --git a/scripts/GettingStartedwithGnomAD_DB.ipynb b/scripts/GettingStartedwithGnomAD_DB.ipynb index a58f8c9..66229fa 100644 --- a/scripts/GettingStartedwithGnomAD_DB.ipynb +++ b/scripts/GettingStartedwithGnomAD_DB.ipynb @@ -2,8 +2,8 @@ "cells": [ { "cell_type": "code", - "execution_count": null, - "id": "7303ebd0", + "execution_count": 1, + "id": "ac0fca47", "metadata": {}, "outputs": [], "source": [ @@ -14,7 +14,7 @@ }, { "cell_type": "markdown", - "id": "8d5a63f4", + "id": "acdaa43f", "metadata": {}, "source": [ "# Download SQLite preprocessed files\n", @@ -26,20 +26,44 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "f8529267", + "execution_count": 8, + "id": "13b2eb85", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting downloading...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "gnomad_db_wes_v4.0.sqlite3.gz?download=1: 7.31GB [12:40, 9.62MB/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting unzipping. This can take some time...\n", + "Database location: test_dir/gnomad_db.sqlite3\n", + "Done!\n" + ] + } + ], "source": [ "# uncomment if you actually want to download it\n", - "# download_link = \"https://zenodo.org/record/5045102/files/gnomad_db_v2.1.1.sqlite3.gz?download=1\"\n", - "# output_dir = \"test_dir\" # database_location\n", - "# gnomAD_DB.download_and_unzip(download_link, output_dir) " + "download_link = \"https://zenodo.org/records/10066310/files/gnomad_db_wes_v4.0.sqlite3.gz?download=1\"\n", + "output_dir = \"test_dir\" # database_location\n", + "gnomAD_DB.download_and_unzip(download_link, output_dir) " ] }, { "cell_type": "markdown", - "id": "6bd9a9da", + "id": "aab41c34", "metadata": {}, "source": [ "# Initialize Database" @@ -47,8 +71,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "c148a8df", + "execution_count": 9, + "id": "dc9b3e8c", "metadata": { "tags": [ "parameters" @@ -62,18 +86,32 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "057a03cf", + "execution_count": 11, + "id": "7f430055", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "AssertionError", + "evalue": "We don't support this version: v45. Please select one fo the following ones: ['v2', 'v3', 'v4']", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/tmp/user/31932/ipykernel_1570523/790803492.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# initialize database\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgnomAD_DB\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdatabase_location\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgnomad_version\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"v45\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/code/gnomAD_DB/gnomad_db/database.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, genodb_path, gnomad_version, parallel, cpu_count)\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 30\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgnomad_version\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parse_gnomad_version\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgnomad_version\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeys\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 31\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 32\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"base_columns\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgnomad_version\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/code/gnomAD_DB/gnomad_db/database.py\u001b[0m in \u001b[0;36m_parse_gnomad_version\u001b[0;34m(self, gnomad_version, supported_gnomad_versions)\u001b[0m\n\u001b[1;32m 179\u001b[0m \u001b[0mgnomad_version\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgnomad_version\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 180\u001b[0m \u001b[0mgnomad_version\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgnomad_version\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\".\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 181\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mgnomad_version\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msupported_gnomad_versions\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34mf\"We don't support this version: {gnomad_version}. Please select one fo the following ones: {supported_gnomad_versions}\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 182\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mgnomad_version\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 183\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mAssertionError\u001b[0m: We don't support this version: v45. Please select one fo the following ones: ['v2', 'v3', 'v4']" + ] + } + ], "source": [ "# initialize database\n", - "db = gnomAD_DB(database_location, gnomad_version=\"v3\")" + "db = gnomAD_DB(database_location, gnomad_version=\"v45\")" ] }, { "cell_type": "markdown", - "id": "6b664ad0", + "id": "a9e3006f", "metadata": {}, "source": [ "# Insert gnomAD variants into the database from single tsv file\n", @@ -83,7 +121,7 @@ { "cell_type": "code", "execution_count": null, - "id": "aa628e88", + "id": "8def6a52", "metadata": {}, "outputs": [], "source": [ @@ -98,7 +136,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2e0f8963", + "id": "34dbb770", "metadata": {}, "outputs": [], "source": [ @@ -108,7 +146,7 @@ }, { "cell_type": "markdown", - "id": "f7ee891c", + "id": "7a9243d2", "metadata": {}, "source": [ "# Query MAF" @@ -117,7 +155,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b196b5e6", + "id": "835e50b4", "metadata": {}, "outputs": [], "source": [ @@ -128,7 +166,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b3d2d014", + "id": "eb3a308d", "metadata": {}, "outputs": [], "source": [ @@ -138,7 +176,7 @@ }, { "cell_type": "markdown", - "id": "76b08258", + "id": "b879dad5", "metadata": {}, "source": [ "## You can pass a dataframe with variants\n", @@ -148,7 +186,7 @@ { "cell_type": "code", "execution_count": null, - "id": "00010e64", + "id": "36014921", "metadata": {}, "outputs": [], "source": [ @@ -158,7 +196,7 @@ { "cell_type": "code", "execution_count": null, - "id": "da779a49", + "id": "a7bfea3b", "metadata": {}, "outputs": [], "source": [ @@ -168,7 +206,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ac19d6eb", + "id": "0aaa8a58", "metadata": {}, "outputs": [], "source": [ @@ -178,7 +216,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bffce318", + "id": "88c20646", "metadata": {}, "outputs": [], "source": [ @@ -193,7 +231,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a4108e52", + "id": "ef74e6bd", "metadata": {}, "outputs": [], "source": [ @@ -202,7 +240,7 @@ }, { "cell_type": "markdown", - "id": "c4818aec", + "id": "b4261ffd", "metadata": {}, "source": [ "## You can pass a single string as a variant" @@ -210,18 +248,50 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "2e788375", + "execution_count": 7, + "id": "084c732a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "chrom 10\n", + "pos 95606780\n", + "ref A\n", + "alt C\n", + "filter PASS\n", + "AC 2.0\n", + "AN 628768.0\n", + "AF 0.000003\n", + "MQ 60.0\n", + "QD 12.1796\n", + "ReadPosRankSum 0.365\n", + "VarDP 412.0\n", + "AS_VQSLOD 5.5239\n", + "AC_grpmax 2.0\n", + "AN_grpmax 350092.0\n", + "AF_grpmax 0.000006\n", + "AF_eas 0.0\n", + "AF_nfe 0.000006\n", + "AF_fin 0.0\n", + "AF_afr 0.0\n", + "AF_asj 0.0\n", + "Name: 0, dtype: object" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "db.get_info_from_str(\"21:9825790:C>T\", \"AF\")" + "db.get_info_from_str(\"10:95606780:A>C\", \"*\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "478d114f", + "id": "e72d5071", "metadata": {}, "outputs": [], "source": [ @@ -230,7 +300,7 @@ }, { "cell_type": "markdown", - "id": "5b2d8caf", + "id": "b2cd63fc", "metadata": {}, "source": [ "## You can look for the MAF scores in an interval" @@ -239,7 +309,7 @@ { "cell_type": "code", "execution_count": null, - "id": "efa19fcc", + "id": "0e587bf1", "metadata": {}, "outputs": [], "source": [ diff --git a/scripts/createTSVtables.ipynb b/scripts/createTSVtables.ipynb index 0cb830b..dde1962 100644 --- a/scripts/createTSVtables.ipynb +++ b/scripts/createTSVtables.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": null, - "id": "04c119c1", + "id": "8feb2572", "metadata": { "papermill": { "duration": 0.336842, @@ -29,7 +29,7 @@ { "cell_type": "code", "execution_count": null, - "id": "713cfb12", + "id": "69750f17", "metadata": { "papermill": { "duration": 0.336842, @@ -48,7 +48,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a88b05a7", + "id": "f29e77bc", "metadata": { "papermill": { "duration": 0.014665, @@ -71,7 +71,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fd814421", + "id": "5c001a27", "metadata": { "papermill": { "duration": 0.014665, @@ -94,7 +94,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b8884615", + "id": "d107dcc8", "metadata": { "papermill": { "duration": 0.014665, @@ -116,7 +116,7 @@ { "cell_type": "code", "execution_count": null, - "id": "62457d3f", + "id": "461a81da", "metadata": { "papermill": { "duration": 0.008922, @@ -137,7 +137,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dd3dca18", + "id": "76cdd30e", "metadata": { "papermill": { "duration": 0.008863, @@ -157,7 +157,7 @@ { "cell_type": "code", "execution_count": null, - "id": "488eb2ba", + "id": "088135a0", "metadata": { "papermill": { "duration": 0.008863, @@ -184,7 +184,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9f099dcf", + "id": "b2c59fe7", "metadata": { "papermill": { "duration": 0.329741, @@ -204,7 +204,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f8103d87", + "id": "4c808880", "metadata": {}, "outputs": [], "source": [] @@ -229,7 +229,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.6" + "version": "3.7.0" }, "papermill": { "default_parameters": {}, diff --git a/setup.py b/setup.py index 8469a2a..7f9f885 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages setup(name='gnomad_db', - version='0.1.2', + version='0.1.3', description='This package scales the huge gnomAD files to a SQLite database, which is easy and fast to query. It extracts from a gnomAD vcf the minor allele frequency for each variant.', author='KalinNonchev', author_email='boo@foo.com',