Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
Kalin Nonchev committed Nov 4, 2023
1 parent 12d07b9 commit bbed0f3
Show file tree
Hide file tree
Showing 4 changed files with 120 additions and 50 deletions.
4 changes: 2 additions & 2 deletions script_config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
database_location: "/mnt/biocluster/praktikum/tutorium_19/gnomad_db/data/exomes" # where to create the database, make sure you have space on your device.
gnomad_vcf_location: "/mnt/biocluster/praktikum/tutorium_19/gnomad_db/data/exomes" # where your *.vcf.bgz files are located
database_location: "test_out" # where to create the database, make sure you have space on your device.
gnomad_vcf_location: "data" # where your *.vcf.bgz files are located
tables_location: "test_out" # where to store the preprocessed intermediate files, you can leave it like this
script_locations: "test_out" # where to store the scripts, where you can check the progress of your jobs, you can leave it like this
gnomad_version: "v4" # main version of the gnomAD vcf file (e.g., v2, v3, v4)
Expand Down
142 changes: 106 additions & 36 deletions scripts/GettingStartedwithGnomAD_DB.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "7303ebd0",
"execution_count": 1,
"id": "ac0fca47",
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -14,7 +14,7 @@
},
{
"cell_type": "markdown",
"id": "8d5a63f4",
"id": "acdaa43f",
"metadata": {},
"source": [
"# Download SQLite preprocessed files\n",
Expand All @@ -26,29 +26,53 @@
},
{
"cell_type": "code",
"execution_count": null,
"id": "f8529267",
"execution_count": 8,
"id": "13b2eb85",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Starting downloading...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"gnomad_db_wes_v4.0.sqlite3.gz?download=1: 7.31GB [12:40, 9.62MB/s] \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Starting unzipping. This can take some time...\n",
"Database location: test_dir/gnomad_db.sqlite3\n",
"Done!\n"
]
}
],
"source": [
"# uncomment if you actually want to download it\n",
"# download_link = \"https://zenodo.org/record/5045102/files/gnomad_db_v2.1.1.sqlite3.gz?download=1\"\n",
"# output_dir = \"test_dir\" # database_location\n",
"# gnomAD_DB.download_and_unzip(download_link, output_dir) "
"download_link = \"https://zenodo.org/records/10066310/files/gnomad_db_wes_v4.0.sqlite3.gz?download=1\"\n",
"output_dir = \"test_dir\" # database_location\n",
"gnomAD_DB.download_and_unzip(download_link, output_dir) "
]
},
{
"cell_type": "markdown",
"id": "6bd9a9da",
"id": "aab41c34",
"metadata": {},
"source": [
"# Initialize Database"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c148a8df",
"execution_count": 9,
"id": "dc9b3e8c",
"metadata": {
"tags": [
"parameters"
Expand All @@ -62,18 +86,32 @@
},
{
"cell_type": "code",
"execution_count": null,
"id": "057a03cf",
"execution_count": 11,
"id": "7f430055",
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "AssertionError",
"evalue": "We don't support this version: v45. Please select one fo the following ones: ['v2', 'v3', 'v4']",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/user/31932/ipykernel_1570523/790803492.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# initialize database\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgnomAD_DB\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdatabase_location\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgnomad_version\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"v45\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/code/gnomAD_DB/gnomad_db/database.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, genodb_path, gnomad_version, parallel, cpu_count)\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 30\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgnomad_version\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parse_gnomad_version\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgnomad_version\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeys\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 31\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 32\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"base_columns\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgnomad_version\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/code/gnomAD_DB/gnomad_db/database.py\u001b[0m in \u001b[0;36m_parse_gnomad_version\u001b[0;34m(self, gnomad_version, supported_gnomad_versions)\u001b[0m\n\u001b[1;32m 179\u001b[0m \u001b[0mgnomad_version\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgnomad_version\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 180\u001b[0m \u001b[0mgnomad_version\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgnomad_version\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\".\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 181\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mgnomad_version\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msupported_gnomad_versions\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34mf\"We don't support this version: {gnomad_version}. Please select one fo the following ones: {supported_gnomad_versions}\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 182\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mgnomad_version\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 183\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mAssertionError\u001b[0m: We don't support this version: v45. Please select one fo the following ones: ['v2', 'v3', 'v4']"
]
}
],
"source": [
"# initialize database\n",
"db = gnomAD_DB(database_location, gnomad_version=\"v3\")"
"db = gnomAD_DB(database_location, gnomad_version=\"v45\")"
]
},
{
"cell_type": "markdown",
"id": "6b664ad0",
"id": "a9e3006f",
"metadata": {},
"source": [
"# Insert gnomAD variants into the database from single tsv file\n",
Expand All @@ -83,7 +121,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "aa628e88",
"id": "8def6a52",
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -98,7 +136,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "2e0f8963",
"id": "34dbb770",
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -108,7 +146,7 @@
},
{
"cell_type": "markdown",
"id": "f7ee891c",
"id": "7a9243d2",
"metadata": {},
"source": [
"# Query MAF"
Expand All @@ -117,7 +155,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "b196b5e6",
"id": "835e50b4",
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -128,7 +166,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "b3d2d014",
"id": "eb3a308d",
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -138,7 +176,7 @@
},
{
"cell_type": "markdown",
"id": "76b08258",
"id": "b879dad5",
"metadata": {},
"source": [
"## You can pass a dataframe with variants\n",
Expand All @@ -148,7 +186,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "00010e64",
"id": "36014921",
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -158,7 +196,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "da779a49",
"id": "a7bfea3b",
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -168,7 +206,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "ac19d6eb",
"id": "0aaa8a58",
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -178,7 +216,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "bffce318",
"id": "88c20646",
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -193,7 +231,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "a4108e52",
"id": "ef74e6bd",
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -202,26 +240,58 @@
},
{
"cell_type": "markdown",
"id": "c4818aec",
"id": "b4261ffd",
"metadata": {},
"source": [
"## You can pass a single string as a variant"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2e788375",
"execution_count": 7,
"id": "084c732a",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"chrom 10\n",
"pos 95606780\n",
"ref A\n",
"alt C\n",
"filter PASS\n",
"AC 2.0\n",
"AN 628768.0\n",
"AF 0.000003\n",
"MQ 60.0\n",
"QD 12.1796\n",
"ReadPosRankSum 0.365\n",
"VarDP 412.0\n",
"AS_VQSLOD 5.5239\n",
"AC_grpmax 2.0\n",
"AN_grpmax 350092.0\n",
"AF_grpmax 0.000006\n",
"AF_eas 0.0\n",
"AF_nfe 0.000006\n",
"AF_fin 0.0\n",
"AF_afr 0.0\n",
"AF_asj 0.0\n",
"Name: 0, dtype: object"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"db.get_info_from_str(\"21:9825790:C>T\", \"AF\")"
"db.get_info_from_str(\"10:95606780:A>C\", \"*\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "478d114f",
"id": "e72d5071",
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -230,7 +300,7 @@
},
{
"cell_type": "markdown",
"id": "5b2d8caf",
"id": "b2cd63fc",
"metadata": {},
"source": [
"## You can look for the MAF scores in an interval"
Expand All @@ -239,7 +309,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "efa19fcc",
"id": "0e587bf1",
"metadata": {},
"outputs": [],
"source": [
Expand Down
Loading

0 comments on commit bbed0f3

Please sign in to comment.