From 79860bb6cca6e58db0c6ccb629111c324e9de68a Mon Sep 17 00:00:00 2001 From: Kalin Nonchev <50597791+KalinNonchev@users.noreply.github.com> Date: Wed, 30 Jun 2021 18:24:13 +0200 Subject: [PATCH] Parallel and download function (#10) * cpu count * cpu count default * add chrM columns in create script * Update README.md * Update README.md * source db * utils * packages * packages * rm package * package * Update README.md * rename * Update README.md * variable names * final adj Co-authored-by: Kalin Nonchev --- README.md | 28 +++++++++++++++-- gnomad_db/database.py | 19 +++++++++--- gnomad_db/utils.py | 38 +++++++++++++++++++++++ requirements.txt | 1 + scripts/GettingStartedwithGnomAD_DB.ipynb | 30 ++++++++++++++++-- scripts/GettingStartedwithGnomAD_DB.py | 14 +++++++++ scripts/createTSVtables.ipynb | 35 ++++++++++++++++++--- scripts/createTSVtables.py | 13 ++++++-- setup.py | 4 +-- 9 files changed, 164 insertions(+), 18 deletions(-) create mode 100644 gnomad_db/utils.py diff --git a/README.md b/README.md index acc7d43..26a6c8f 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,27 @@ It extracts from a gnomAD vcf the ["AF", "AF_afr", "AF_eas", "AF_fin", "AF_nfe", ###### The package works for all currently available gnomAD releases.(2021) -## 1. Data preprocessing and SQL database creation +## 1. Download SQLite preprocessed files + +I have preprocessed and created sqlite3 files for gnomAD v2.1.1 and 3.1.1 for you, which can be easily downloaded from here. They contain all variants on the 24 standard chromosomes. + +gnomAD v3.1.1 (hg38, **759'302'267** variants) 25G zipped, 56G in total - https://zenodo.org/record/5045170/files/gnomad_db_v3.1.1.sqlite3.gz?download=1 \ +gnomAD v2.1.1 (hg19, **261'942'336** variants) 9G zipped, 20G in total - https://zenodo.org/record/5045102/files/gnomad_db_v2.1.1.sqlite3.gz?download=1 + +You can download it as: + +```python +from gnomad_db.database import gnomAD_DB +download_link = "https://zenodo.org/record/5045102/files/gnomad_db_v2.1.1.sqlite3.gz?download=1" +output_dir = "test_dir" # database_location +gnomAD_DB.download_and_unzip(download_link, output_dir) +``` +#### NB this would take ~30min (network speed 10mb/s) + + +or you can create the database by yourself. **However, I recommend to use the preprocessed files to save ressources and time**. If you do so, you can go to **2. API usage** and explore the package and its great features! + +## 1.1 Data preprocessing and SQL database creation Start by downloading the vcf files from gnomAD in a single directory: @@ -60,7 +80,8 @@ database_location = "test_dir" db = gnomAD_DB(database_location) ``` -3. Insert some test variants to run the examples below +3. Insert some test variants to run the examples below \ +**If you have downloaded the preprocessed sqlite3 files, you can skip this step as you already have variants** ```python # get some variants var_df = pd.read_csv("data/test_vcf_gnomad_chr21_10000.tsv.gz", sep="\t", names=db.columns, index_col=False) @@ -69,7 +90,8 @@ var_df = pd.read_csv("data/test_vcf_gnomad_chr21_10000.tsv.gz", sep="\t", names= db.insert_variants(var_df) ``` -4. Query variant minor allele frequency +4. Query variant minor allele frequency \ +**These example variants are assembled to hg38!** ```python # query some MAF scores dummy_var_df = pd.DataFrame({ diff --git a/gnomad_db/database.py b/gnomad_db/database.py index 44f53a8..16c6fcc 100644 --- a/gnomad_db/database.py +++ b/gnomad_db/database.py @@ -4,13 +4,14 @@ import pandas as pd import multiprocessing from joblib import Parallel, delayed +from . import utils class gnomAD_DB: - def __init__(self, genodb_path, parallel=True): + def __init__(self, genodb_path, parallel=True, cpu_count=512): - self.cpu_count = int(multiprocessing.cpu_count()) + self.cpu_count = min(cpu_count, int(multiprocessing.cpu_count())) self.parallel = parallel self.db_file = os.path.join(genodb_path, 'gnomad_db.sqlite3') @@ -182,7 +183,9 @@ def get_mafs_for_interval(self, chrom: str, interval_start: int, interval_end: i def get_maf_from_str(self, var: str, query: str="AF") -> float: - # variant in form chrom:pos:ref>alt + """ + get AF for variant in form chrom:pos:ref>alt + """ chrom, pos, ref, alt = self._pack_from_str(var) @@ -193,4 +196,12 @@ def get_maf_from_str(self, var: str, query: str="AF") -> float: where chrom = '{chrom}' AND pos = {pos} AND ref = '{ref}' AND alt = '{alt}'; """ - return self.query_direct(sql_query).squeeze() \ No newline at end of file + return self.query_direct(sql_query).squeeze() + + + @staticmethod + def download_and_unzip(url, output_path): + """ + download database and unzip file + """ + utils.download_and_unzip_file(url, output_path) \ No newline at end of file diff --git a/gnomad_db/utils.py b/gnomad_db/utils.py new file mode 100644 index 0000000..b7c3b66 --- /dev/null +++ b/gnomad_db/utils.py @@ -0,0 +1,38 @@ +import urllib.request +import gzip +import shutil +from tqdm import tqdm +import os +import time + + +class DownloadProgressBar(tqdm): + def update_to(self, b=1, bsize=1, tsize=None): + if tsize is not None: + self.total = tsize + self.update(b * bsize - self.n) + + +def download_url(url, output_dir): + with DownloadProgressBar(unit='B', unit_scale=True, + miniters=1, desc=url.split('/')[-1]) as t: + urllib.request.urlretrieve(url, filename=f"{output_dir}/gnomad_db.sqlite3.gz", reporthook=t.update_to) + time.sleep(5) + +def unzip(output_dir): + file_name_in = f"{output_dir}/gnomad_db.sqlite3.gz" + file_name_out = f"{output_dir}/gnomad_db.sqlite3" + with gzip.open(file_name_in, 'rb') as f_in: + with open(file_name_out, 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) + time.sleep(5) + os.remove(file_name_in) + print(f"Database location: {file_name_out}") + +def download_and_unzip_file(url, output_dir): + print("Starting downloading...") + download_url(url, output_dir) + print("Starting unzipping. This can take some time...") + unzip(output_dir) + print("Done!") + \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 3e60185..000e2c1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ numpy pyyaml pytest joblib +tqdm \ No newline at end of file diff --git a/scripts/GettingStartedwithGnomAD_DB.ipynb b/scripts/GettingStartedwithGnomAD_DB.ipynb index 70746e7..447e338 100644 --- a/scripts/GettingStartedwithGnomAD_DB.ipynb +++ b/scripts/GettingStartedwithGnomAD_DB.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": null, - "id": "98d3ae32-d439-41bf-be89-3e6f52a3591f", + "id": "12c65da9-cb0c-42f6-a92c-cf0c2535ba13", "metadata": {}, "outputs": [], "source": [ @@ -12,6 +12,32 @@ "import numpy as np" ] }, + { + "cell_type": "markdown", + "id": "20ffe529-064d-482b-9267-9b0fd729070c", + "metadata": {}, + "source": [ + "# Download SQLite preprocessed files\n", + "\n", + "I have preprocessed and created sqlite3 files for gnomAD v2.1.1 and 3.1.1 for you, which can be easily downloaded from here. They contain all variants on the 24 standard chromosomes.\n", + "\n", + "gnomAD v3.1.1 (hg38, 759'302'267 variants) 25G zipped, 56G in total - https://zenodo.org/record/5045170/files/gnomad_db_v3.1.1.sqlite3.gz?download=1 \n", + "gnomAD v2.1.1 (hg19, 261'942'336 variants) 9G zipped, 20G in total - https://zenodo.org/record/5045102/files/gnomad_db_v2.1.1.sqlite3.gz?download=1 " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e525076f-8540-444f-912a-b411a224f0ec", + "metadata": {}, + "outputs": [], + "source": [ + "# uncomment if you actually want to download it\n", + "# download_link = \"https://zenodo.org/record/5045102/files/gnomad_db_v2.1.1.sqlite3.gz?download=1\"\n", + "# output_dir = \"test_dir\" # database_location\n", + "# gnomAD_DB.download_and_unzip(download_link, output_dir) " + ] + }, { "cell_type": "markdown", "id": "3fa70b8a-1d43-4929-bfee-4df0d071f9fa", @@ -241,7 +267,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.8" + "version": "3.9.5" } }, "nbformat": 4, diff --git a/scripts/GettingStartedwithGnomAD_DB.py b/scripts/GettingStartedwithGnomAD_DB.py index ec241e2..fed5665 100644 --- a/scripts/GettingStartedwithGnomAD_DB.py +++ b/scripts/GettingStartedwithGnomAD_DB.py @@ -18,6 +18,20 @@ import pandas as pd import numpy as np +# %% [markdown] +# # Download SQLite preprocessed files +# +# I have preprocessed and created sqlite3 files for gnomAD v2.1.1 and 3.1.1 for you, which can be easily downloaded from here. They contain all variants on the 24 standard chromosomes. +# +# gnomAD v3.1.1 (hg38, 759'302'267 variants) 25G zipped, 56G in total - https://zenodo.org/record/5045170/files/gnomad_db_v3.1.1.sqlite3.gz?download=1 +# gnomAD v2.1.1 (hg19, 261'942'336 variants) 9G zipped, 20G in total - https://zenodo.org/record/5045102/files/gnomad_db_v2.1.1.sqlite3.gz?download=1 + +# %% +# uncomment if you actually want to download it +# download_link = "https://zenodo.org/record/5045102/files/gnomad_db_v2.1.1.sqlite3.gz?download=1" +# output_dir = "test_dir" # database_location +# gnomAD_DB.download_and_unzip(download_link, output_dir) + # %% [markdown] # # Initialize Database diff --git a/scripts/createTSVtables.ipynb b/scripts/createTSVtables.ipynb index 08b84a7..dd0e164 100644 --- a/scripts/createTSVtables.ipynb +++ b/scripts/createTSVtables.ipynb @@ -21,6 +21,7 @@ "from subprocess import PIPE, Popen\n", "import pandas as pd\n", "from joblib import Parallel, delayed\n", + "import multiprocessing\n", "import os" ] }, @@ -92,7 +93,27 @@ { "cell_type": "code", "execution_count": null, - "id": "efc70048-1fde-4f0e-af94-cc111f3a9250", + "id": "f3b2f898-f126-476b-bab5-d087e27bb0a7", + "metadata": { + "papermill": { + "duration": 0.008863, + "end_time": "2021-05-05T20:00:58.715794", + "exception": false, + "start_time": "2021-05-05T20:00:58.706931", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "cpu_count = int(multiprocessing.cpu_count())\n", + "cpu_count" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae0419e4-55bd-484b-b2f2-981d67457504", "metadata": { "papermill": { "duration": 0.008863, @@ -108,11 +129,15 @@ "# extract needed columns\n", "# if running DIRECTLY from notebook, add module load i12g/bcftools; in the beginning of cmd\n", "def create_table(file, table_location):\n", + " if \"chrM\" in file:\n", + " query_string = \"%CHROM\\t%POS\\t%REF\\t%ALT\\t%AF_hom\\t%AF_het\\n\"\n", + " else:\n", + " query_string = \"%CHROM\\t%POS\\t%REF\\t%ALT\\t%AF\\t%AF_afr\\t%AF_eas\\t%AF_fin\\t%AF_nfe\\t%AF_asj\\t%AF_oth\\t%AF_popmax\\n\"\n", " if not os.path.exists(table_location):\n", - " cmd = f\"bcftools query -f '%CHROM\\t%POS\\t%REF\\t%ALT\\t%AF\\t%AF_afr\\t%AF_eas\\t%AF_fin\\t%AF_nfe\\t%AF_asj\\t%AF_oth\\t%AF_popmax\\n' {file} | gzip > {table_location}\"\n", + " cmd = f\"bcftools query -f '{query_string}' {file} | gzip > {table_location}\"\n", " p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)\n", " print(p.communicate())\n", - " \n" + " " ] }, { @@ -132,7 +157,7 @@ "outputs": [], "source": [ "# run bcftools in parallel\n", - "Parallel(12)(delayed(create_table)(file, table_location) for file, table_location in tqdm(zip(files, tables_location)))" + "Parallel(cpu_count)(delayed(create_table)(file, table_location) for file, table_location in tqdm(zip(files, tables_location)))" ] } ], @@ -155,7 +180,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.8" + "version": "3.9.5" }, "papermill": { "default_parameters": {}, diff --git a/scripts/createTSVtables.py b/scripts/createTSVtables.py index eddcf56..49b77d0 100644 --- a/scripts/createTSVtables.py +++ b/scripts/createTSVtables.py @@ -19,6 +19,7 @@ from subprocess import PIPE, Popen import pandas as pd from joblib import Parallel, delayed +import multiprocessing import os # %% papermill={"duration": 0.014665, "end_time": "2021-05-05T20:00:58.675108", "exception": false, "start_time": "2021-05-05T20:00:58.660443", "status": "completed"} tags=["parameters"] @@ -36,13 +37,21 @@ tables_location = [f'{tables_location}/{file.split("/")[-1].replace(".vcf.bgz", "")}.tsv.gz' for file in files] tables_location +# %% papermill={"duration": 0.008863, "end_time": "2021-05-05T20:00:58.715794", "exception": false, "start_time": "2021-05-05T20:00:58.706931", "status": "completed"} tags=[] +cpu_count = int(multiprocessing.cpu_count()) +cpu_count + # %% papermill={"duration": 0.008863, "end_time": "2021-05-05T20:00:58.715794", "exception": false, "start_time": "2021-05-05T20:00:58.706931", "status": "completed"} tags=[] # extract needed columns # if running DIRECTLY from notebook, add module load i12g/bcftools; in the beginning of cmd def create_table(file, table_location): + if "chrM" in file: + query_string = "%CHROM\t%POS\t%REF\t%ALT\t%AF_hom\t%AF_het\n" + else: + query_string = "%CHROM\t%POS\t%REF\t%ALT\t%AF\t%AF_afr\t%AF_eas\t%AF_fin\t%AF_nfe\t%AF_asj\t%AF_oth\t%AF_popmax\n" if not os.path.exists(table_location): - cmd = f"bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\t%AF\t%AF_afr\t%AF_eas\t%AF_fin\t%AF_nfe\t%AF_asj\t%AF_oth\t%AF_popmax\n' {file} | gzip > {table_location}" + cmd = f"bcftools query -f '{query_string}' {file} | gzip > {table_location}" p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE) print(p.communicate()) @@ -50,4 +59,4 @@ def create_table(file, table_location): # %% papermill={"duration": 0.329741, "end_time": "2021-05-05T20:00:59.051392", "exception": false, "start_time": "2021-05-05T20:00:58.721651", "status": "completed"} tags=[] # run bcftools in parallel -Parallel(12)(delayed(create_table)(file, table_location) for file, table_location in tqdm(zip(files, tables_location))) +Parallel(cpu_count)(delayed(create_table)(file, table_location) for file, table_location in tqdm(zip(files, tables_location))) diff --git a/setup.py b/setup.py index 7771d89..a83be1e 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages setup(name='gnomad_db', - version='0.0.4', + version='0.0.5', description='This package scales the huge gnomAD files to a SQLite database, which is easy and fast to query. It extracts from a gnomAD vcf the minor allele frequency for each variant.', author='KalinNonchev', author_email='boo@foo.com', @@ -11,6 +11,6 @@ url="https://github.com/KalinNonchev/gnomAD_MAF", packages=find_packages(), # find packages include_package_data=True, - install_requires=['pandas', 'numpy', 'joblib'], # external packages as dependencies, + install_requires=['pandas', 'numpy', 'joblib', 'tqdm'], # external packages as dependencies, python_requires='>=3.6' )