From 79860bb6cca6e58db0c6ccb629111c324e9de68a Mon Sep 17 00:00:00 2001
From: Kalin Nonchev <50597791+KalinNonchev@users.noreply.github.com>
Date: Wed, 30 Jun 2021 18:24:13 +0200
Subject: [PATCH] Parallel and download function (#10)

* cpu count

* cpu count default

* add chrM columns in create script

* Update README.md

* Update README.md

* source db

* utils

* packages

* packages

* rm package

* package

* Update README.md

* rename

* Update README.md

* variable names

* final adj

Co-authored-by: Kalin Nonchev <nonchev@ouga06.cmm.in.tum.de>
---
 README.md                                 | 28 +++++++++++++++--
 gnomad_db/database.py                     | 19 +++++++++---
 gnomad_db/utils.py                        | 38 +++++++++++++++++++++++
 requirements.txt                          |  1 +
 scripts/GettingStartedwithGnomAD_DB.ipynb | 30 ++++++++++++++++--
 scripts/GettingStartedwithGnomAD_DB.py    | 14 +++++++++
 scripts/createTSVtables.ipynb             | 35 ++++++++++++++++++---
 scripts/createTSVtables.py                | 13 ++++++--
 setup.py                                  |  4 +--
 9 files changed, 164 insertions(+), 18 deletions(-)
 create mode 100644 gnomad_db/utils.py

diff --git a/README.md b/README.md
index acc7d43..26a6c8f 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,27 @@ It extracts from a gnomAD vcf the ["AF", "AF_afr", "AF_eas", "AF_fin", "AF_nfe",
 
 ###### The package works for all currently available gnomAD releases.(2021) 
 
-## 1. Data preprocessing and SQL database creation
+## 1. Download SQLite preprocessed files
+
+I have preprocessed and created sqlite3 files for gnomAD v2.1.1 and 3.1.1 for you, which can be easily downloaded from here. They contain all variants on the 24 standard chromosomes.
+
+gnomAD v3.1.1 (hg38, **759'302'267** variants) 25G zipped, 56G in total - https://zenodo.org/record/5045170/files/gnomad_db_v3.1.1.sqlite3.gz?download=1 \
+gnomAD v2.1.1 (hg19, **261'942'336** variants) 9G zipped, 20G in total - https://zenodo.org/record/5045102/files/gnomad_db_v2.1.1.sqlite3.gz?download=1 
+
+You can download it as:
+
+```python
+from gnomad_db.database import gnomAD_DB
+download_link = "https://zenodo.org/record/5045102/files/gnomad_db_v2.1.1.sqlite3.gz?download=1"
+output_dir = "test_dir" # database_location
+gnomAD_DB.download_and_unzip(download_link, output_dir)
+```
+#### NB this would take ~30min (network speed 10mb/s)
+
+
+or you can create the database by yourself. **However, I recommend to use the preprocessed files to save ressources and time**. If you do so, you can go to **2. API usage** and explore the package and its great features!
+
+## 1.1 Data preprocessing and SQL database creation
 
 Start by downloading the vcf files from gnomAD in a single directory:
 
@@ -60,7 +80,8 @@ database_location = "test_dir"
 db = gnomAD_DB(database_location)
 ```
 
-3. Insert some test variants to run the examples below
+3. Insert some test variants to run the examples below \
+**If you have downloaded the preprocessed sqlite3 files, you can skip this step as you already have variants**
 ```python
 # get some variants
 var_df = pd.read_csv("data/test_vcf_gnomad_chr21_10000.tsv.gz", sep="\t", names=db.columns, index_col=False)
@@ -69,7 +90,8 @@ var_df = pd.read_csv("data/test_vcf_gnomad_chr21_10000.tsv.gz", sep="\t", names=
 db.insert_variants(var_df)
 ```
 
-4. Query variant minor allele frequency
+4. Query variant minor allele frequency \
+**These example variants are assembled to hg38!**
 ```python
 # query some MAF scores
 dummy_var_df = pd.DataFrame({
diff --git a/gnomad_db/database.py b/gnomad_db/database.py
index 44f53a8..16c6fcc 100644
--- a/gnomad_db/database.py
+++ b/gnomad_db/database.py
@@ -4,13 +4,14 @@
 import pandas as pd
 import multiprocessing
 from joblib import Parallel, delayed
+from . import utils
 
 
 class gnomAD_DB:
     
-    def __init__(self, genodb_path, parallel=True):
+    def __init__(self, genodb_path, parallel=True, cpu_count=512):
         
-        self.cpu_count = int(multiprocessing.cpu_count())
+        self.cpu_count = min(cpu_count, int(multiprocessing.cpu_count()))
         self.parallel = parallel
 
         self.db_file = os.path.join(genodb_path, 'gnomad_db.sqlite3')
@@ -182,7 +183,9 @@ def get_mafs_for_interval(self, chrom: str, interval_start: int, interval_end: i
     
     
     def get_maf_from_str(self, var: str, query: str="AF") -> float:
-        # variant in form chrom:pos:ref>alt
+        """
+        get AF for variant in form chrom:pos:ref>alt
+        """
         
         chrom, pos, ref, alt = self._pack_from_str(var)
         
@@ -193,4 +196,12 @@ def get_maf_from_str(self, var: str, query: str="AF") -> float:
         where chrom = '{chrom}' AND pos = {pos} AND ref = '{ref}' AND alt = '{alt}';
         """
         
-        return self.query_direct(sql_query).squeeze()
\ No newline at end of file
+        return self.query_direct(sql_query).squeeze()
+    
+    
+    @staticmethod
+    def download_and_unzip(url, output_path):
+        """
+        download database and unzip file
+        """
+        utils.download_and_unzip_file(url, output_path)
\ No newline at end of file
diff --git a/gnomad_db/utils.py b/gnomad_db/utils.py
new file mode 100644
index 0000000..b7c3b66
--- /dev/null
+++ b/gnomad_db/utils.py
@@ -0,0 +1,38 @@
+import urllib.request
+import gzip
+import shutil
+from tqdm import tqdm
+import os
+import time
+
+
+class DownloadProgressBar(tqdm):
+    def update_to(self, b=1, bsize=1, tsize=None):
+        if tsize is not None:
+            self.total = tsize
+        self.update(b * bsize - self.n)
+
+
+def download_url(url, output_dir):
+    with DownloadProgressBar(unit='B', unit_scale=True,
+                             miniters=1, desc=url.split('/')[-1]) as t:
+        urllib.request.urlretrieve(url, filename=f"{output_dir}/gnomad_db.sqlite3.gz", reporthook=t.update_to)
+        time.sleep(5)
+
+def unzip(output_dir):
+    file_name_in = f"{output_dir}/gnomad_db.sqlite3.gz"
+    file_name_out = f"{output_dir}/gnomad_db.sqlite3"
+    with gzip.open(file_name_in, 'rb') as f_in:
+        with open(file_name_out, 'wb') as f_out:
+            shutil.copyfileobj(f_in, f_out)
+    time.sleep(5)
+    os.remove(file_name_in)
+    print(f"Database location: {file_name_out}")
+
+def download_and_unzip_file(url, output_dir):
+    print("Starting downloading...")
+    download_url(url, output_dir)
+    print("Starting unzipping. This can take some time...")
+    unzip(output_dir)
+    print("Done!")
+    
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 3e60185..000e2c1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ numpy
 pyyaml
 pytest
 joblib
+tqdm
\ No newline at end of file
diff --git a/scripts/GettingStartedwithGnomAD_DB.ipynb b/scripts/GettingStartedwithGnomAD_DB.ipynb
index 70746e7..447e338 100644
--- a/scripts/GettingStartedwithGnomAD_DB.ipynb
+++ b/scripts/GettingStartedwithGnomAD_DB.ipynb
@@ -3,7 +3,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "98d3ae32-d439-41bf-be89-3e6f52a3591f",
+   "id": "12c65da9-cb0c-42f6-a92c-cf0c2535ba13",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -12,6 +12,32 @@
     "import numpy as np"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "20ffe529-064d-482b-9267-9b0fd729070c",
+   "metadata": {},
+   "source": [
+    "# Download SQLite preprocessed files\n",
+    "\n",
+    "I have preprocessed and created sqlite3 files for gnomAD v2.1.1 and 3.1.1 for you, which can be easily downloaded from here. They contain all variants on the 24 standard chromosomes.\n",
+    "\n",
+    "gnomAD v3.1.1 (hg38, 759'302'267 variants) 25G zipped, 56G in total - https://zenodo.org/record/5045170/files/gnomad_db_v3.1.1.sqlite3.gz?download=1 \n",
+    "gnomAD v2.1.1 (hg19, 261'942'336 variants) 9G zipped, 20G in total - https://zenodo.org/record/5045102/files/gnomad_db_v2.1.1.sqlite3.gz?download=1 "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e525076f-8540-444f-912a-b411a224f0ec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# uncomment if you actually want to download it\n",
+    "# download_link = \"https://zenodo.org/record/5045102/files/gnomad_db_v2.1.1.sqlite3.gz?download=1\"\n",
+    "# output_dir = \"test_dir\" # database_location\n",
+    "# gnomAD_DB.download_and_unzip(download_link, output_dir) "
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "3fa70b8a-1d43-4929-bfee-4df0d071f9fa",
@@ -241,7 +267,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.8"
+   "version": "3.9.5"
   }
  },
  "nbformat": 4,
diff --git a/scripts/GettingStartedwithGnomAD_DB.py b/scripts/GettingStartedwithGnomAD_DB.py
index ec241e2..fed5665 100644
--- a/scripts/GettingStartedwithGnomAD_DB.py
+++ b/scripts/GettingStartedwithGnomAD_DB.py
@@ -18,6 +18,20 @@
 import pandas as pd
 import numpy as np
 
+# %% [markdown]
+# # Download SQLite preprocessed files
+#
+# I have preprocessed and created sqlite3 files for gnomAD v2.1.1 and 3.1.1 for you, which can be easily downloaded from here. They contain all variants on the 24 standard chromosomes.
+#
+# gnomAD v3.1.1 (hg38, 759'302'267 variants) 25G zipped, 56G in total - https://zenodo.org/record/5045170/files/gnomad_db_v3.1.1.sqlite3.gz?download=1 
+# gnomAD v2.1.1 (hg19, 261'942'336 variants) 9G zipped, 20G in total - https://zenodo.org/record/5045102/files/gnomad_db_v2.1.1.sqlite3.gz?download=1 
+
+# %%
+# uncomment if you actually want to download it
+# download_link = "https://zenodo.org/record/5045102/files/gnomad_db_v2.1.1.sqlite3.gz?download=1"
+# output_dir = "test_dir" # database_location
+# gnomAD_DB.download_and_unzip(download_link, output_dir) 
+
 # %% [markdown]
 # # Initialize Database
 
diff --git a/scripts/createTSVtables.ipynb b/scripts/createTSVtables.ipynb
index 08b84a7..dd0e164 100644
--- a/scripts/createTSVtables.ipynb
+++ b/scripts/createTSVtables.ipynb
@@ -21,6 +21,7 @@
     "from subprocess import PIPE, Popen\n",
     "import pandas as pd\n",
     "from joblib import Parallel, delayed\n",
+    "import multiprocessing\n",
     "import os"
    ]
   },
@@ -92,7 +93,27 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "efc70048-1fde-4f0e-af94-cc111f3a9250",
+   "id": "f3b2f898-f126-476b-bab5-d087e27bb0a7",
+   "metadata": {
+    "papermill": {
+     "duration": 0.008863,
+     "end_time": "2021-05-05T20:00:58.715794",
+     "exception": false,
+     "start_time": "2021-05-05T20:00:58.706931",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "cpu_count = int(multiprocessing.cpu_count())\n",
+    "cpu_count"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ae0419e4-55bd-484b-b2f2-981d67457504",
    "metadata": {
     "papermill": {
      "duration": 0.008863,
@@ -108,11 +129,15 @@
     "# extract needed columns\n",
     "# if running DIRECTLY from notebook, add module load i12g/bcftools; in the beginning of cmd\n",
     "def create_table(file, table_location):\n",
+    "    if \"chrM\" in file:\n",
+    "        query_string = \"%CHROM\\t%POS\\t%REF\\t%ALT\\t%AF_hom\\t%AF_het\\n\"\n",
+    "    else:\n",
+    "        query_string = \"%CHROM\\t%POS\\t%REF\\t%ALT\\t%AF\\t%AF_afr\\t%AF_eas\\t%AF_fin\\t%AF_nfe\\t%AF_asj\\t%AF_oth\\t%AF_popmax\\n\"\n",
     "    if not os.path.exists(table_location):\n",
-    "        cmd = f\"bcftools query -f '%CHROM\\t%POS\\t%REF\\t%ALT\\t%AF\\t%AF_afr\\t%AF_eas\\t%AF_fin\\t%AF_nfe\\t%AF_asj\\t%AF_oth\\t%AF_popmax\\n' {file} | gzip > {table_location}\"\n",
+    "        cmd = f\"bcftools query -f '{query_string}' {file} | gzip > {table_location}\"\n",
     "        p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)\n",
     "        print(p.communicate())\n",
-    "    \n"
+    "    "
    ]
   },
   {
@@ -132,7 +157,7 @@
    "outputs": [],
    "source": [
     "# run bcftools in parallel\n",
-    "Parallel(12)(delayed(create_table)(file, table_location) for file, table_location in tqdm(zip(files, tables_location)))"
+    "Parallel(cpu_count)(delayed(create_table)(file, table_location) for file, table_location in tqdm(zip(files, tables_location)))"
    ]
   }
  ],
@@ -155,7 +180,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.8"
+   "version": "3.9.5"
   },
   "papermill": {
    "default_parameters": {},
diff --git a/scripts/createTSVtables.py b/scripts/createTSVtables.py
index eddcf56..49b77d0 100644
--- a/scripts/createTSVtables.py
+++ b/scripts/createTSVtables.py
@@ -19,6 +19,7 @@
 from subprocess import PIPE, Popen
 import pandas as pd
 from joblib import Parallel, delayed
+import multiprocessing
 import os
 
 # %% papermill={"duration": 0.014665, "end_time": "2021-05-05T20:00:58.675108", "exception": false, "start_time": "2021-05-05T20:00:58.660443", "status": "completed"} tags=["parameters"]
@@ -36,13 +37,21 @@
 tables_location = [f'{tables_location}/{file.split("/")[-1].replace(".vcf.bgz", "")}.tsv.gz' for file in files]
 tables_location
 
+# %% papermill={"duration": 0.008863, "end_time": "2021-05-05T20:00:58.715794", "exception": false, "start_time": "2021-05-05T20:00:58.706931", "status": "completed"} tags=[]
+cpu_count = int(multiprocessing.cpu_count())
+cpu_count
+
 
 # %% papermill={"duration": 0.008863, "end_time": "2021-05-05T20:00:58.715794", "exception": false, "start_time": "2021-05-05T20:00:58.706931", "status": "completed"} tags=[]
 # extract needed columns
 # if running DIRECTLY from notebook, add module load i12g/bcftools; in the beginning of cmd
 def create_table(file, table_location):
+    if "chrM" in file:
+        query_string = "%CHROM\t%POS\t%REF\t%ALT\t%AF_hom\t%AF_het\n"
+    else:
+        query_string = "%CHROM\t%POS\t%REF\t%ALT\t%AF\t%AF_afr\t%AF_eas\t%AF_fin\t%AF_nfe\t%AF_asj\t%AF_oth\t%AF_popmax\n"
     if not os.path.exists(table_location):
-        cmd = f"bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\t%AF\t%AF_afr\t%AF_eas\t%AF_fin\t%AF_nfe\t%AF_asj\t%AF_oth\t%AF_popmax\n' {file} | gzip > {table_location}"
+        cmd = f"bcftools query -f '{query_string}' {file} | gzip > {table_location}"
         p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
         print(p.communicate())
     
@@ -50,4 +59,4 @@ def create_table(file, table_location):
 
 # %% papermill={"duration": 0.329741, "end_time": "2021-05-05T20:00:59.051392", "exception": false, "start_time": "2021-05-05T20:00:58.721651", "status": "completed"} tags=[]
 # run bcftools in parallel
-Parallel(12)(delayed(create_table)(file, table_location) for file, table_location in tqdm(zip(files, tables_location)))
+Parallel(cpu_count)(delayed(create_table)(file, table_location) for file, table_location in tqdm(zip(files, tables_location)))
diff --git a/setup.py b/setup.py
index 7771d89..a83be1e 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup, find_packages
 
 setup(name='gnomad_db',
-      version='0.0.4',
+      version='0.0.5',
       description='This package scales the huge gnomAD files to a SQLite database, which is easy and fast to query. It extracts from a gnomAD vcf the minor allele frequency for each variant.',
       author='KalinNonchev',
       author_email='boo@foo.com',
@@ -11,6 +11,6 @@
       url="https://github.com/KalinNonchev/gnomAD_MAF",
       packages=find_packages(),  # find packages
       include_package_data=True,
-      install_requires=['pandas', 'numpy', 'joblib'],  # external packages as dependencies,
+      install_requires=['pandas', 'numpy', 'joblib', 'tqdm'],  # external packages as dependencies,
       python_requires='>=3.6'
       )