From df7b61c6e5be1854a7c906ce087cc36c9b880acd Mon Sep 17 00:00:00 2001 From: Kalin Nonchev <50597791+KalinNonchev@users.noreply.github.com> Date: Sat, 4 Nov 2023 10:07:08 +0100 Subject: [PATCH] update v4 (#24) * update v4 --------- Co-authored-by: Kalin Nonchev --- README.md | 28 +-- Snakefile | 6 +- gnomad_db/database.py | 17 +- gnomad_db/pkgdata/gnomad_columns.yaml | 25 ++- script_config.yaml | 2 +- scripts/GettingStartedwithGnomAD_DB.ipynb | 165 ++++++++++++------ scripts/GettingStartedwithGnomAD_DB.py | 110 ------------ scripts/README.md | 2 +- scripts/createTSVtables.ipynb | 47 +++-- scripts/createTSVtables.py | 70 -------- ...gnomad.sh => download_vcf_gnomad_3.1.2.sh} | 0 scripts/download_vcf_gnomad_4_exomes.sh | 50 ++++++ scripts/download_vcf_gnomad_4_genomes.sh | 50 ++++++ scripts/insertVariants.ipynb | 20 +-- scripts/insertVariants.py | 69 -------- setup.py | 2 +- test_dir/test_gnomad_db.py | 4 +- 17 files changed, 313 insertions(+), 354 deletions(-) delete mode 100644 scripts/GettingStartedwithGnomAD_DB.py delete mode 100644 scripts/createTSVtables.py rename scripts/{download_vcf_gnomad.sh => download_vcf_gnomad_3.1.2.sh} (100%) create mode 100644 scripts/download_vcf_gnomad_4_exomes.sh create mode 100644 scripts/download_vcf_gnomad_4_genomes.sh delete mode 100644 scripts/insertVariants.py diff --git a/README.md b/README.md index 829a740..d865e38 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,14 @@ # gnomAD_DB -### Changelog +#### Changelog -#### NEW version (July 2022) +#### NEW version (November 2023) + - release gnomAD WGS v4.0 and WES v4.0 + - `gnomad_version`=["v2"|"v3"|"v4"] argument has to be specified when initializing the database + - minor fixes + +#### version (July 2022) - release gnomAD WGS v3.1.2 - minor bug fixes @@ -12,23 +17,26 @@ - more available variant features present, check [here](https://github.com/KalinNonchev/gnomAD_DB/blob/master/gnomad_db/pkgdata/gnomad_columns.yaml) - `get_maf_from_df` renamed to `get_info_from_df` - 
`get_maf_from_str` renamed to `get_info_from_str` -- `genome`=["Grch37"|"Grch38"] argument have to be specified, when initializing the database +- [DEPRECATED 11.2023]`genome`=["Grch37"|"Grch38"] argument has to be specified when initializing the database +## Why and What [The Genome Aggregation Database (gnomAD)](https://gnomad.broadinstitute.org) is a resource developed by an international coalition of investigators, with the goal of aggregating and harmonizing both exome and genome sequencing data from a wide variety of large-scale sequencing projects, and making summary data available for the wider scientific community. This package scales the huge gnomAD files (on average ~120G/chrom) to a SQLite database with a size of 34G for WGS v2.1.1 (261.942.336 variants) and 98G for WGS v3.1.2 (about 759.302.267 variants), and allows scientists to look for various variant annotations present in gnomAD (i.e. Allele Count, Depth, Minor Allele Frequency, etc. - [here](https://github.com/KalinNonchev/gnomAD_DB/blob/master/gnomad_db/pkgdata/gnomad_columns.yaml) you can find all selected features given the genome version). (A query containing 300.000 variants takes ~40s.) -It extracts from a gnomAD vcf about 23 variant annotations. You can find further infromation about the exact fields [here](https://github.com/KalinNonchev/gnomAD_DB/blob/master/gnomad_db/pkgdata/gnomad_columns.yaml). +It extracts from a gnomAD vcf about 23 variant annotations. You can find further information about the exact fields [here](https://github.com/KalinNonchev/gnomAD_DB/blob/master/gnomad_db/pkgdata/gnomad_columns.yaml). ###### The package works for all currently available gnomAD releases.(July 2022) ## 1. Download SQLite preprocessed files -I have preprocessed and created sqlite3 files for gnomAD v2.1.1 and 3.1.2 for you, which can be easily downloaded from here. They contain all variants on the 24 standard chromosomes. 
+I have preprocessed and created sqlite3 files for gnomAD for you, which can be easily downloaded from here. They contain all variants on the 24 standard chromosomes. -gnomAD v3.1.2 (hg38, **759'302'267** variants) 46.2G zipped, 98G in total - https://zenodo.org/record/6818606/files/gnomad_db_v3.1.2.sqlite3.gz?download=1 \ -gnomAD v2.1.1 (hg19, **261'942'336** variants) 16.1G zipped, 48G in total - https://zenodo.org/record/5770384/files/gnomad_db_v2.1.1.sqlite3.gz?download=1 +- WGS gnomAD v4.0 (hg38, **759'302'267** variants) 36.1G zipped, 74G in total - https://zenodo.org/records/10066323/files/gnomad_db_wgs_v4.0.sqlite3.gz?download=1 +- WES gnomAD v4.0 (hg38, **161'417'006** variants) 7.3G zipped, 17G in total - https://zenodo.org/records/10066310/files/gnomad_db_wes_v4.0.sqlite3.gz?download=1 +- WGS gnomAD v3.1.2 (hg38, **759'302'267** variants) 46.2G zipped, 98G in total - https://zenodo.org/record/6818606/files/gnomad_db_v3.1.2.sqlite3.gz?download=1 +- WGS gnomAD v2.1.1 (hg19, **261'942'336** variants) 16.1G zipped, 48G in total - https://zenodo.org/record/5770384/files/gnomad_db_v2.1.1.sqlite3.gz?download=1 You can download it as: @@ -41,7 +49,7 @@ gnomAD_DB.download_and_unzip(download_link, output_dir) #### NB this would take ~30min (network speed 10mb/s) -or you can create the database by yourself. **However, I recommend to use the preprocessed files to save ressources and time**. If you do so, you can go to **2. API usage** and explore the package and its great features! +or you can create the database by yourself. **However, I recommend using the preprocessed files to save resources and time**. If you do so, you can go to **2. API usage** and explore the package and its great features! ## 2. API usage @@ -62,11 +70,11 @@ from gnomad_db.database import gnomAD_DB ``` 2. 
Initialize database connection \ -**Make sure to have the correct genome version!** +**Make sure to have the correct gnomad version!** ```python # pass dir database_location = "test_dir" -db = gnomAD_DB(database_location, genome="Grch38") +db = gnomAD_DB(database_location, gnomad_version="v3") ``` 3. Insert some test variants to run the examples below \ diff --git a/Snakefile b/Snakefile index 642b054..8607834 100644 --- a/Snakefile +++ b/Snakefile @@ -14,7 +14,7 @@ database_location = config['database_location'] gnomad_vcf_location = config['gnomad_vcf_location'] tables_location = config['tables_location'] script_locations = config['script_locations'] -genome = config['genome'] +gnomad_version = config['gnomad_version'] KERNEL = config['KERNEL'] @@ -32,7 +32,7 @@ rule extract_tables: message: "Running createTSVtables notebook..." shell: - "papermill {input.notebook} {output.notebook} -p gnomad_vcf_location {gnomad_vcf_location} -p tables_location {tables_location} -p genome {genome} -k {KERNEL}" + "papermill {input.notebook} {output.notebook} -p gnomad_vcf_location {gnomad_vcf_location} -p tables_location {tables_location} -p gnomad_version {gnomad_version} -k {KERNEL}" # -------------------------- INSSERT VARIANTS WITH MAF TO DATABASE ------------------------------ @@ -45,7 +45,7 @@ rule insert_variants: message: "Running insertVariants notebook..." 
shell: - "papermill {input.notebook} {output.notebook} -p database_location {database_location} -p tables_location {tables_location} -p genome {genome} -k {KERNEL}" + "papermill {input.notebook} {output.notebook} -p database_location {database_location} -p tables_location {tables_location} -p gnomad_version {gnomad_version} -k {KERNEL}" # -------------------------- INSSERT VARIANTS WITH MAF TO DATABASE ------------------------------ #rule create_GettingStartedNB: diff --git a/gnomad_db/database.py b/gnomad_db/database.py index 4910710..0ad6708 100644 --- a/gnomad_db/database.py +++ b/gnomad_db/database.py @@ -8,13 +8,13 @@ import yaml import pkg_resources + class gnomAD_DB: - def __init__(self, genodb_path, genome="Grch38", parallel=False, cpu_count=None): + def __init__(self, genodb_path, gnomad_version, parallel=False, cpu_count=None): self.parallel = parallel - self.genome = genome if self.parallel: self.cpu_count = cpu_count if isinstance(cpu_count, int) else int(multiprocessing.cpu_count()) @@ -26,7 +26,10 @@ def __init__(self, genodb_path, genome="Grch38", parallel=False, cpu_count=None) with open(columns_path) as f: columns = yaml.load(f, Loader=yaml.FullLoader) - self.columns = list(map(lambda x: x.lower(), columns["base_columns"])) + columns[self.genome] + + self.gnomad_version = self._parse_gnomad_version(gnomad_version, list(columns.keys())[1:]) + + self.columns = list(map(lambda x: x.lower(), columns["base_columns"])) + columns[self.gnomad_version] self.dict_columns = columns if not os.path.exists(self.db_file): @@ -41,7 +44,7 @@ def open_dbconn(self): def create_table(self): - value_columns = ",".join([f"{col} REAL" for col in self.dict_columns[self.genome]]) + value_columns = ",".join([f"{col} REAL" for col in self.dict_columns[self.gnomad_version]]) sql_create = f""" CREATE TABLE gnomad_db ( chrom TEXT, @@ -171,6 +174,12 @@ def _pack_from_str(self, var: str) -> str: ref = var[2].split(">")[0] alt = var[2].split(">")[1] return chrom, pos, ref, alt + + 
def _parse_gnomad_version(self, gnomad_version: str, supported_gnomad_versions: list) -> str: + gnomad_version = str(gnomad_version) + gnomad_version = gnomad_version.split(".")[-1] + assert gnomad_version in supported_gnomad_versions, f"We don't support this version: {gnomad_version}. Please select one fo the following ones: {supported_gnomad_versions}" + return gnomad_version def query_direct(self, sql_query: str): diff --git a/gnomad_db/pkgdata/gnomad_columns.yaml b/gnomad_db/pkgdata/gnomad_columns.yaml index df17e85..c1008e5 100644 --- a/gnomad_db/pkgdata/gnomad_columns.yaml +++ b/gnomad_db/pkgdata/gnomad_columns.yaml @@ -4,7 +4,7 @@ base_columns: - REF - ALT - FILTER -Grch37: +v2: - AC # Alternate allele count for samples - AN # Total number of alleles in samples - AF # Alternate allele frequency in samples @@ -23,7 +23,7 @@ Grch37: - AF_fin # Alternate allele frequency in XX samples of Finnish ancestry - AF_afr # Alternate allele frequency in samples of African/African-American ancestry - AF_asj # Alternate allele frequency in samples of Ashkenazi Jewish ancestry -Grch38: +v3: - AC # Alternate allele count for samples - AN # Total number of alleles in samples - AF # Alternate allele frequency in samples @@ -31,15 +31,30 @@ Grch38: - MQ # Root mean square of the mapping quality of reads across all samples - QD # Variant call confidence normalized by depth of sample reads supporting a variant - ReadPosRankSum # Z-score from Wilcoxon rank sum test of alternate vs. 
reference read position bias -# - DP # Depth of informative coverage for each sample; reads with MQ=255 or with bad mates are filtered - VarDP - AS_VQSLOD -# - VQSLOD # Log-odds ratio of being a true variant versus being a false positive under the trained VQSR Gaussian mixture model - AC_popmax # Allele count in the population with the maximum AF - AN_popmax # Total number of alleles in the population with the maximum AF - AF_popmax # Maximum allele frequency across populations (excluding samples of Ashkenazi - AF_eas # Alternate allele frequency in samples of East Asian ancestry -# - AF_oth # Alternate allele frequency in XY samples of Other ancestry # not supported anymore 9.07.22 + - AF_nfe # Alternate allele frequency in XY samples of Non-Finnish European ancestry + - AF_fin # Alternate allele frequency in XX samples of Finnish ancestry + - AF_afr # Alternate allele frequency in samples of African/African-American ancestry + - AF_asj # Alternate allele frequency in samples of Ashkenazi Jewish ancestry + +v4: + - AC # Alternate allele count for samples + - AN # Total number of alleles in samples + - AF # Alternate allele frequency in samples + - MQ # Root mean square of the mapping quality of reads across all samples + - QD # Variant call confidence normalized by depth of sample reads supporting a variant + - ReadPosRankSum # Z-score from Wilcoxon rank sum test of alternate vs. 
reference read position bias + - VarDP + - AS_VQSLOD + - AC_grpmax # Allele count in the population with the maximum AF + - AN_grpmax # Total number of alleles in the population with the maximum AF + - AF_grpmax # Maximum allele frequency across populations (excluding samples of Ashkenazi + - AF_eas # Alternate allele frequency in samples of East Asian ancestry - AF_nfe # Alternate allele frequency in XY samples of Non-Finnish European ancestry - AF_fin # Alternate allele frequency in XX samples of Finnish ancestry - AF_afr # Alternate allele frequency in samples of African/African-American ancestry diff --git a/script_config.yaml b/script_config.yaml index 88d8b7f..04c053b 100644 --- a/script_config.yaml +++ b/script_config.yaml @@ -2,5 +2,5 @@ database_location: "test_out" # where to create the database, make sure you have gnomad_vcf_location: "data" # where are your *.vcf.bgz located tables_location: "test_out" # where to store the preprocessed intermediate files, you can leave it like this script_locations: "test_out" # where to store the scripts, where you can check the progress of your jobs, you can leave it like this -genome: "Grch37" # genome version of the gnomAD vcf file (2.1.1 = Grch37, 3.1.1 = Grch38) +gnomad_version: "v2" # major gnomAD release version of the vcf file (e.g., v2, v3, v4) KERNEL: "gnomad_db" diff --git a/scripts/GettingStartedwithGnomAD_DB.ipynb b/scripts/GettingStartedwithGnomAD_DB.ipynb index a52fcc5..66229fa 100644 --- a/scripts/GettingStartedwithGnomAD_DB.ipynb +++ b/scripts/GettingStartedwithGnomAD_DB.ipynb @@ -3,21 +3,9 @@ { "cell_type": "code", "execution_count": 1, - "id": "12c65da9-cb0c-42f6-a92c-cf0c2535ba13", + "id": "ac0fca47", "metadata": {}, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'gnomad_db'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - 
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mgnomad_db\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdatabase\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mgnomAD_DB\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'gnomad_db'" - ] - } - ], + "outputs": [], "source": [ "from gnomad_db.database import gnomAD_DB\n", "import pandas as pd\n", @@ -26,33 +14,56 @@ }, { "cell_type": "markdown", - "id": "20ffe529-064d-482b-9267-9b0fd729070c", + "id": "acdaa43f", "metadata": {}, "source": [ "# Download SQLite preprocessed files\n", "\n", - "I have preprocessed and created sqlite3 files for gnomAD v2.1.1 and 3.1.1 for you, which can be easily downloaded from here. They contain all variants on the 24 standard chromosomes.\n", + "I have preprocessed and created sqlite3 files for gnomAD v2, v3, v4 for you, which can be easily downloaded from here. 
They contain all variants on the 24 standard chromosomes.\n", "\n", - "gnomAD v3.1.1 (hg38, 759'302'267 variants) 25G zipped, 56G in total - https://zenodo.org/record/5045170/files/gnomad_db_v3.1.1.sqlite3.gz?download=1 \n", - "gnomAD v2.1.1 (hg19, 261'942'336 variants) 9G zipped, 20G in total - https://zenodo.org/record/5045102/files/gnomad_db_v2.1.1.sqlite3.gz?download=1 " + "You can find the links to download the sqlite3 file in the README (https://github.com/KalinNonchev/gnomAD_DB)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "e525076f-8540-444f-912a-b411a224f0ec", + "execution_count": 8, + "id": "13b2eb85", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting downloading...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "gnomad_db_wes_v4.0.sqlite3.gz?download=1: 7.31GB [12:40, 9.62MB/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting unzipping. 
This can take some time...\n", + "Database location: test_dir/gnomad_db.sqlite3\n", + "Done!\n" + ] + } + ], "source": [ "# uncomment if you actually want to download it\n", - "# download_link = \"https://zenodo.org/record/5045102/files/gnomad_db_v2.1.1.sqlite3.gz?download=1\"\n", - "# output_dir = \"test_dir\" # database_location\n", - "# gnomAD_DB.download_and_unzip(download_link, output_dir) " + "download_link = \"https://zenodo.org/records/10066310/files/gnomad_db_wes_v4.0.sqlite3.gz?download=1\"\n", + "output_dir = \"test_dir\" # database_location\n", + "gnomAD_DB.download_and_unzip(download_link, output_dir) " ] }, { "cell_type": "markdown", - "id": "3fa70b8a-1d43-4929-bfee-4df0d071f9fa", + "id": "aab41c34", "metadata": {}, "source": [ "# Initialize Database" @@ -60,8 +71,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "2f7dc0cb-9748-4de8-b93d-0fad06b62f02", + "execution_count": 9, + "id": "dc9b3e8c", "metadata": { "tags": [ "parameters" @@ -75,18 +86,32 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "c357c0be-a00e-49a6-9159-07a700ec861a", + "execution_count": 11, + "id": "7f430055", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "AssertionError", + "evalue": "We don't support this version: v45. 
Please select one fo the following ones: ['v2', 'v3', 'v4']", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/tmp/user/31932/ipykernel_1570523/790803492.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# initialize database\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgnomAD_DB\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdatabase_location\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgnomad_version\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"v45\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/code/gnomAD_DB/gnomad_db/database.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, genodb_path, gnomad_version, parallel, cpu_count)\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 30\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgnomad_version\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parse_gnomad_version\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgnomad_version\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeys\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 31\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 32\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"base_columns\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgnomad_version\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/code/gnomAD_DB/gnomad_db/database.py\u001b[0m in \u001b[0;36m_parse_gnomad_version\u001b[0;34m(self, gnomad_version, supported_gnomad_versions)\u001b[0m\n\u001b[1;32m 179\u001b[0m \u001b[0mgnomad_version\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgnomad_version\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 180\u001b[0m \u001b[0mgnomad_version\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgnomad_version\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\".\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 181\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mgnomad_version\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msupported_gnomad_versions\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34mf\"We don't support this version: {gnomad_version}. Please select one fo the following ones: {supported_gnomad_versions}\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 182\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mgnomad_version\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 183\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mAssertionError\u001b[0m: We don't support this version: v45. 
Please select one fo the following ones: ['v2', 'v3', 'v4']" + ] + } + ], "source": [ "# initialize database\n", - "db = gnomAD_DB(database_location)" + "db = gnomAD_DB(database_location, gnomad_version=\"v45\")" ] }, { "cell_type": "markdown", - "id": "42952d85-f2fe-4fd8-baed-a2097082ffd0", + "id": "a9e3006f", "metadata": {}, "source": [ "# Insert gnomAD variants into the database from single tsv file\n", @@ -96,7 +121,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ac07902b-239b-479d-b81b-ecf37a67e163", + "id": "8def6a52", "metadata": {}, "outputs": [], "source": [ @@ -111,7 +136,7 @@ { "cell_type": "code", "execution_count": null, - "id": "809032d1-44bf-48b9-9535-a5ac3cdeb26b", + "id": "34dbb770", "metadata": {}, "outputs": [], "source": [ @@ -121,7 +146,7 @@ }, { "cell_type": "markdown", - "id": "6bd78960-0cc4-4585-8096-0da88d5737f6", + "id": "7a9243d2", "metadata": {}, "source": [ "# Query MAF" @@ -130,7 +155,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2f9dddfe-7ad7-4111-8b17-2199ee011db2", + "id": "835e50b4", "metadata": {}, "outputs": [], "source": [ @@ -141,7 +166,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bfcd6fa8-0e95-4cc3-85de-ee4eb46c5669", + "id": "eb3a308d", "metadata": {}, "outputs": [], "source": [ @@ -151,7 +176,7 @@ }, { "cell_type": "markdown", - "id": "7e73619c-fa1f-424b-ac70-0cf3bc746a6a", + "id": "b879dad5", "metadata": {}, "source": [ "## You can pass a dataframe with variants\n", @@ -161,7 +186,7 @@ { "cell_type": "code", "execution_count": null, - "id": "60d26665-c328-4d6c-a33c-02ec7888f01a", + "id": "36014921", "metadata": {}, "outputs": [], "source": [ @@ -171,7 +196,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0d47b008-4efc-4d7d-9fe7-b2257cb96ae0", + "id": "a7bfea3b", "metadata": {}, "outputs": [], "source": [ @@ -181,7 +206,7 @@ { "cell_type": "code", "execution_count": null, - "id": "29066ee1-379a-4af2-a384-1f9bcd00d2dd", + "id": "0aaa8a58", "metadata": {}, "outputs": 
[], "source": [ @@ -191,7 +216,7 @@ { "cell_type": "code", "execution_count": null, - "id": "35082a03-0bdb-4493-b85c-1151e826aa0f", + "id": "88c20646", "metadata": {}, "outputs": [], "source": [ @@ -206,7 +231,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2a08f971-b2d3-43cc-ac43-2b870d276c6b", + "id": "ef74e6bd", "metadata": {}, "outputs": [], "source": [ @@ -215,7 +240,7 @@ }, { "cell_type": "markdown", - "id": "30788bc6-64b1-4e69-a171-8fc819a160a6", + "id": "b4261ffd", "metadata": {}, "source": [ "## You can pass a single string as a variant" @@ -223,18 +248,50 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "a3aa9db7-e9eb-4b25-a2aa-abd2b7e4ceb8", + "execution_count": 7, + "id": "084c732a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "chrom 10\n", + "pos 95606780\n", + "ref A\n", + "alt C\n", + "filter PASS\n", + "AC 2.0\n", + "AN 628768.0\n", + "AF 0.000003\n", + "MQ 60.0\n", + "QD 12.1796\n", + "ReadPosRankSum 0.365\n", + "VarDP 412.0\n", + "AS_VQSLOD 5.5239\n", + "AC_grpmax 2.0\n", + "AN_grpmax 350092.0\n", + "AF_grpmax 0.000006\n", + "AF_eas 0.0\n", + "AF_nfe 0.000006\n", + "AF_fin 0.0\n", + "AF_afr 0.0\n", + "AF_asj 0.0\n", + "Name: 0, dtype: object" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "db.get_info_from_str(\"21:9825790:C>T\", \"AF\")" + "db.get_info_from_str(\"10:95606780:A>C\", \"*\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "ae316d44-33f8-49b7-a888-a982668c21bd", + "id": "e72d5071", "metadata": {}, "outputs": [], "source": [ @@ -243,7 +300,7 @@ }, { "cell_type": "markdown", - "id": "170a7abb-7e69-49f9-a875-fd7fed8650ad", + "id": "b2cd63fc", "metadata": {}, "source": [ "## You can look for the MAF scores in an interval" @@ -252,7 +309,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b62f9fe7-da67-44a2-b311-a51feb0071b5", + "id": "0e587bf1", "metadata": {}, "outputs": [], "source": 
[ @@ -265,9 +322,9 @@ "formats": "ipynb,py:percent" }, "kernelspec": { - "display_name": "utr_anno", + "display_name": "gnomad_db", "language": "python", - "name": "utr_anno" + "name": "gnomad_db" }, "language_info": { "codemirror_mode": { @@ -279,7 +336,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.5" + "version": "3.7.0" } }, "nbformat": 4, diff --git a/scripts/GettingStartedwithGnomAD_DB.py b/scripts/GettingStartedwithGnomAD_DB.py deleted file mode 100644 index d7d5b20..0000000 --- a/scripts/GettingStartedwithGnomAD_DB.py +++ /dev/null @@ -1,110 +0,0 @@ -# --- -# jupyter: -# jupytext: -# formats: ipynb,py:percent -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.10.2 -# kernelspec: -# display_name: utr_anno -# language: python -# name: utr_anno -# --- - -# %% -from gnomad_db.database import gnomAD_DB -import pandas as pd -import numpy as np - -# %% [markdown] -# # Download SQLite preprocessed files -# -# I have preprocessed and created sqlite3 files for gnomAD v2.1.1 and 3.1.1 for you, which can be easily downloaded from here. They contain all variants on the 24 standard chromosomes. 
-# -# gnomAD v3.1.1 (hg38, 759'302'267 variants) 25G zipped, 56G in total - https://zenodo.org/record/5045170/files/gnomad_db_v3.1.1.sqlite3.gz?download=1 -# gnomAD v2.1.1 (hg19, 261'942'336 variants) 9G zipped, 20G in total - https://zenodo.org/record/5045102/files/gnomad_db_v2.1.1.sqlite3.gz?download=1 - -# %% -# uncomment if you actually want to download it -# download_link = "https://zenodo.org/record/5045102/files/gnomad_db_v2.1.1.sqlite3.gz?download=1" -# output_dir = "test_dir" # database_location -# gnomAD_DB.download_and_unzip(download_link, output_dir) - -# %% [markdown] -# # Initialize Database - -# %% tags=["parameters"] -# pass dir -database_location = "test_dir" - -# %% -# initialize database -db = gnomAD_DB(database_location) - -# %% [markdown] -# # Insert gnomAD variants into the database from single tsv file -# Look into insertVariants notebook to do it for big vcf files - -# %% -# get some variants -var_df = pd.read_csv("data/test_vcf_gnomad_chr21_10000.tsv.gz", sep="\t", names=db.columns, index_col=False) -# preprocess missing values -# IMPORTANT: The database removes internally chr prefix (chr1->1) -var_df = var_df.replace(".", np.NaN) -var_df.head() - -# %% -# insert variants -db.insert_variants(var_df) - -# %% [markdown] -# # Query MAF - -# %% -# check db columns, which we can query -db.columns - -# %% -var_df = var_df[["chrom", "pos", "ref", "alt"]] -var_df.head() - -# %% [markdown] -# ## You can pass a dataframe with variants -# It should contain the columns: [chrom, pos, ref, alt] - -# %% -db.get_info_from_df(var_df, "AF").head() # only one columns - -# %% -db.get_info_from_df(var_df, "AF, AF_popmax").head() # multiple columns - -# %% -db.get_info_from_df(var_df, "*") # everything - -# %% -dummy_var_df = pd.DataFrame({ - "chrom": ["1", "21"], - "pos": [21, 9825790], - "ref": ["T", "C"], - "alt": ["G", "T"]}) -dummy_var_df - -# %% -db.get_info_from_df(dummy_var_df, "AF") - -# %% [markdown] -# ## You can pass a single string as a variant - -# 
%% -db.get_info_from_str("21:9825790:C>T", "AF") - -# %% -db.get_info_from_str("21:9825790:C>T", "*") - -# %% [markdown] -# ## You can look for the MAF scores in an interval - -# %% -db.get_info_for_interval(chrom=21, interval_start=9825780, interval_end=9825799, query="*") diff --git a/scripts/README.md b/scripts/README.md index 05d6bc9..356d993 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -12,7 +12,7 @@ database_location: "test_out" # where to create the database, make sure you have gnomad_vcf_location: "data" # where are your *.vcf.bgz located tables_location: "test_out" # where to store the preprocessed intermediate files, you can leave it like this script_locations: "test_out" # where to store the scripts, where you can check the progress of your jobs, you can leave it like this -genome: "Grch37" # genome version of the gnomAD vcf file (2.1.1 = Grch37, 3.1.1 = Grch38) +gnomad_version: "v4" # gnomAD version of the vcf file (e.g., v2, v3, v4) ``` Once this is done, run diff --git a/scripts/createTSVtables.ipynb b/scripts/createTSVtables.ipynb index 1173b02..dde1962 100644 --- a/scripts/createTSVtables.ipynb +++ b/scripts/createTSVtables.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ca9d6c26-56a1-46dc-aed6-93c282ee45a0", + "id": "8feb2572", "metadata": { "papermill": { "duration": 0.336842, @@ -29,7 +29,26 @@ { "cell_type": "code", "execution_count": null, - "id": "8ba7a210-a098-4d64-8a28-b394164447f3", + "id": "69750f17", + "metadata": { + "papermill": { + "duration": 0.336842, + "end_time": "2021-05-05T20:00:58.655105", + "exception": false, + "start_time": "2021-05-05T20:00:58.318263", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "#os.chdir(f'../')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f29e77bc", "metadata": { "papermill": { "duration": 0.014665, @@ -46,13 +65,13 @@ "source": [ "gnomad_vcf_location = \"test\"\n", "tables_location = 
\"test\"\n", - "genome = \"Grch38\"" + "gnomad_version = \"v4\"" ] }, { "cell_type": "code", "execution_count": null, - "id": "dbc66dfd-1ddb-4bbf-b315-4bc0c9039d47", + "id": "5c001a27", "metadata": { "papermill": { "duration": 0.014665, @@ -67,7 +86,7 @@ "source": [ "with open(\"gnomad_db/pkgdata/gnomad_columns.yaml\") as f:\n", " columns = yaml.load(f, Loader=yaml.FullLoader)\n", - "columns = columns[\"base_columns\"] + columns[genome]\n", + "columns = columns[\"base_columns\"] + columns[gnomad_version]\n", "print(len(columns))\n", "columns[:10]" ] @@ -75,7 +94,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5bdacc49-01a2-40a8-af9c-924f8eef0793", + "id": "d107dcc8", "metadata": { "papermill": { "duration": 0.014665, @@ -97,7 +116,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4f67a4d0-64df-4adc-854b-45ae069e1dd0", + "id": "461a81da", "metadata": { "papermill": { "duration": 0.008922, @@ -118,7 +137,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f3b2f898-f126-476b-bab5-d087e27bb0a7", + "id": "76cdd30e", "metadata": { "papermill": { "duration": 0.008863, @@ -138,7 +157,7 @@ { "cell_type": "code", "execution_count": null, - "id": "aa3b597d-cee5-404e-a2df-f9b64f708d99", + "id": "088135a0", "metadata": { "papermill": { "duration": 0.008863, @@ -165,7 +184,7 @@ { "cell_type": "code", "execution_count": null, - "id": "10632f00-164b-40e1-9860-149c93a01c45", + "id": "b2c59fe7", "metadata": { "papermill": { "duration": 0.329741, @@ -185,7 +204,7 @@ { "cell_type": "code", "execution_count": null, - "id": "be79507d-bcca-4d54-b2f1-cfdc3692967e", + "id": "4c808880", "metadata": {}, "outputs": [], "source": [] @@ -196,9 +215,9 @@ "formats": "ipynb,py:percent" }, "kernelspec": { - "display_name": "utr_anno", + "display_name": "gnomad_db", "language": "python", - "name": "utr_anno" + "name": "gnomad_db" }, "language_info": { "codemirror_mode": { @@ -210,7 +229,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": 
"ipython3", - "version": "3.9.5" + "version": "3.7.0" }, "papermill": { "default_parameters": {}, diff --git a/scripts/createTSVtables.py b/scripts/createTSVtables.py deleted file mode 100644 index 7a3393a..0000000 --- a/scripts/createTSVtables.py +++ /dev/null @@ -1,70 +0,0 @@ -# --- -# jupyter: -# jupytext: -# formats: ipynb,py:percent -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.10.2 -# kernelspec: -# display_name: utr_anno -# language: python -# name: utr_anno -# --- - -# %% papermill={"duration": 0.336842, "end_time": "2021-05-05T20:00:58.655105", "exception": false, "start_time": "2021-05-05T20:00:58.318263", "status": "completed"} tags=[] -import glob -from tqdm import tqdm -from subprocess import PIPE, Popen -import pandas as pd -from joblib import Parallel, delayed -import multiprocessing -import os -import yaml - -# %% papermill={"duration": 0.014665, "end_time": "2021-05-05T20:00:58.675108", "exception": false, "start_time": "2021-05-05T20:00:58.660443", "status": "completed"} tags=["parameters"] -gnomad_vcf_location = "test" -tables_location = "test" -genome = "Grch38" - -# %% papermill={"duration": 0.014665, "end_time": "2021-05-05T20:00:58.675108", "exception": false, "start_time": "2021-05-05T20:00:58.660443", "status": "completed"} tags=[] -with open("gnomad_db/pkgdata/gnomad_columns.yaml") as f: - columns = yaml.load(f, Loader=yaml.FullLoader) -columns = columns["base_columns"] + columns[genome] -print(len(columns)) -columns[:10] - -# %% papermill={"duration": 0.014665, "end_time": "2021-05-05T20:00:58.675108", "exception": false, "start_time": "2021-05-05T20:00:58.660443", "status": "completed"} tags=[] -# get gnomAD files -files = glob.glob(f"{gnomad_vcf_location}/*.bgz") -print(len(files)) -files - -# %% papermill={"duration": 0.008922, "end_time": "2021-05-05T20:00:58.701950", "exception": false, "start_time": "2021-05-05T20:00:58.693028", "status": "completed"} tags=[] -# 
write gnomAD files to these tables: -tables_location = [f'{tables_location}/{file.split("/")[-1].replace(".vcf.bgz", "")}.tsv.gz' for file in files] -tables_location - -# %% papermill={"duration": 0.008863, "end_time": "2021-05-05T20:00:58.715794", "exception": false, "start_time": "2021-05-05T20:00:58.706931", "status": "completed"} tags=[] -cpu_count = int(multiprocessing.cpu_count()) -cpu_count - - -# %% papermill={"duration": 0.008863, "end_time": "2021-05-05T20:00:58.715794", "exception": false, "start_time": "2021-05-05T20:00:58.706931", "status": "completed"} tags=[] -# extract needed columns -# if running DIRECTLY from notebook, add module load i12g/bcftools; in the beginning of cmd -def create_table(file, table_location): - query_string = "%" + "\t%".join(columns) + "\n" - if not os.path.exists(table_location): - cmd = f"bcftools query -f '{query_string}' {file} | gzip > {table_location}" - p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE) - print(p.communicate()) - - - -# %% papermill={"duration": 0.329741, "end_time": "2021-05-05T20:00:59.051392", "exception": false, "start_time": "2021-05-05T20:00:58.721651", "status": "completed"} tags=[] -# run bcftools in parallel -Parallel(cpu_count)(delayed(create_table)(file, table_location) for file, table_location in tqdm(zip(files, tables_location))) - -# %% diff --git a/scripts/download_vcf_gnomad.sh b/scripts/download_vcf_gnomad_3.1.2.sh similarity index 100% rename from scripts/download_vcf_gnomad.sh rename to scripts/download_vcf_gnomad_3.1.2.sh diff --git a/scripts/download_vcf_gnomad_4_exomes.sh b/scripts/download_vcf_gnomad_4_exomes.sh new file mode 100644 index 0000000..d14e453 --- /dev/null +++ b/scripts/download_vcf_gnomad_4_exomes.sh @@ -0,0 +1,50 @@ +### gnomAD v4.0 download VCF in parallel; total 2.3T +# chr 1 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr1.vcf.bgz & +# chr 2 +wget -c 
https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr2.vcf.bgz & +# chr 3 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr3.vcf.bgz & +# chr 4 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr4.vcf.bgz & +# chr 5 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr5.vcf.bgz & +# chr 6 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr6.vcf.bgz & +# chr 7 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr7.vcf.bgz & +# chr 8 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr8.vcf.bgz & +# chr 9 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr9.vcf.bgz & +# chr 10 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr10.vcf.bgz & +# chr 11 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr11.vcf.bgz & +# chr 12 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr12.vcf.bgz & +# chr 13 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr13.vcf.bgz & +# chr 14 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr14.vcf.bgz & +# chr 15 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr15.vcf.bgz & +# chr 16 +wget -c 
https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr16.vcf.bgz & +# chr 17 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr17.vcf.bgz & +# chr 18 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr18.vcf.bgz & +# chr 19 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr19.vcf.bgz & +# chr 20 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr20.vcf.bgz & +# chr 21 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr21.vcf.bgz & +# chr 22 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chr22.vcf.bgz & +# chr x +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chrX.vcf.bgz & +# chr y +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/exomes/gnomad.exomes.v4.0.sites.chrY.vcf.bgz & +wait diff --git a/scripts/download_vcf_gnomad_4_genomes.sh b/scripts/download_vcf_gnomad_4_genomes.sh new file mode 100644 index 0000000..ca4c136 --- /dev/null +++ b/scripts/download_vcf_gnomad_4_genomes.sh @@ -0,0 +1,50 @@ +### gnomAD v4.0 download VCF in parallel; total 2.3T +# chr 1 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr1.vcf.bgz & +# chr 2 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr2.vcf.bgz & +# chr 3 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr3.vcf.bgz & +# chr 4 +wget -c 
https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr4.vcf.bgz & +# chr 5 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr5.vcf.bgz & +# chr 6 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr6.vcf.bgz & +# chr 7 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr7.vcf.bgz & +# chr 8 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr8.vcf.bgz & +# chr 9 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr9.vcf.bgz & +# chr 10 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr10.vcf.bgz & +# chr 11 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr11.vcf.bgz & +# chr 12 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr12.vcf.bgz & +# chr 13 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr13.vcf.bgz & +# chr 14 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr14.vcf.bgz & +# chr 15 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr15.vcf.bgz & +# chr 16 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr16.vcf.bgz & +# chr 17 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr17.vcf.bgz & +# chr 18 +wget -c 
https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr18.vcf.bgz & +# chr 19 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr19.vcf.bgz & +# chr 20 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr20.vcf.bgz & +# chr 21 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr21.vcf.bgz & +# chr 22 +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr22.vcf.bgz & +# chr x +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chrX.vcf.bgz & +# chr y +wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chrY.vcf.bgz & +wait diff --git a/scripts/insertVariants.ipynb b/scripts/insertVariants.ipynb index 669814c..80c5a87 100644 --- a/scripts/insertVariants.ipynb +++ b/scripts/insertVariants.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2b5b398d-0ffe-4df9-8e57-5aaca0be0949", + "id": "e9cc5826", "metadata": {}, "outputs": [], "source": [ @@ -21,7 +21,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9e3b454f-40c6-4832-83a2-f3d446a514cf", + "id": "d685dde3", "metadata": { "tags": [ "parameters" @@ -31,24 +31,24 @@ "source": [ "database_location = \"test_out\"\n", "tables_location = \"test_out\"\n", - "genome = \"Grch38\"" + "gnomad_version = \"v4\"" ] }, { "cell_type": "code", "execution_count": null, - "id": "f99fd9fa-7c44-4f5d-a4c7-1e581af95cd2", + "id": "3d95c7aa", "metadata": {}, "outputs": [], "source": [ "# initialize database\n", - "db = gnomAD_DB(database_location, genome=genome)" + "db = gnomAD_DB(database_location, gnomad_version=gnomad_version)" ] }, { "cell_type": "code", "execution_count": 
null, - "id": "26e968a7-b0e6-4086-9453-d6cc33e3b7e7", + "id": "f739d4db", "metadata": {}, "outputs": [], "source": [ @@ -58,7 +58,7 @@ { "cell_type": "code", "execution_count": null, - "id": "581b04b7-4b0d-4be5-b73f-fff1ebdd7e94", + "id": "bcebb226", "metadata": {}, "outputs": [], "source": [ @@ -84,7 +84,7 @@ { "cell_type": "code", "execution_count": null, - "id": "845c883b-1dc4-467f-9e68-cebc6a48d0bf", + "id": "5baf39da", "metadata": {}, "outputs": [], "source": [ @@ -95,7 +95,7 @@ { "cell_type": "code", "execution_count": null, - "id": "065488a5-1151-4559-bb23-1bed1f056200", + "id": "5b005188", "metadata": {}, "outputs": [], "source": [ @@ -108,7 +108,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3227fc44-180a-4221-bb09-1eb5d291fa3a", + "id": "425b21a4", "metadata": {}, "outputs": [], "source": [] diff --git a/scripts/insertVariants.py b/scripts/insertVariants.py deleted file mode 100644 index 9c4b8b1..0000000 --- a/scripts/insertVariants.py +++ /dev/null @@ -1,69 +0,0 @@ -# --- -# jupyter: -# jupytext: -# formats: ipynb,py:percent -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.10.2 -# kernelspec: -# display_name: utr_anno -# language: python -# name: utr_anno -# --- - -# %% -import os -#os.chdir('../') - -from gnomad_db.database import gnomAD_DB -import numpy as np -import pandas as pd -import gzip -from tqdm import tqdm -import glob - -# %% tags=["parameters"] -database_location = "test_out" -tables_location = "test_out" -genome = "Grch38" - -# %% -# initialize database -db = gnomAD_DB(database_location, genome=genome) - -# %% -table_sep = "\t" - - -# %% -# read variants from tsv.gz table in batches -def load_batches(file, batch_size=500_000): - with gzip.open(file, "rb") as f: - batch = [] - for line in tqdm(f): - line = line.decode().rstrip() - if len(batch) == batch_size: - batch = pd.DataFrame(batch, columns=db.columns).replace(".", np.NaN) - yield batch - batch = [] - - 
batch.append(line.split(table_sep)) - - - if len(batch) != 0: - batch = pd.DataFrame(batch, columns=db.columns).replace(".", np.NaN) - yield batch - -# %% -tables = glob.glob(f"{tables_location}/*.tsv.gz") -tables - -# %% -for table in tables: - print(table) - for batch in load_batches(table): - db.insert_variants(batch) - -# %% diff --git a/setup.py b/setup.py index 8469a2a..7f9f885 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages setup(name='gnomad_db', - version='0.1.2', + version='0.1.3', description='This package scales the huge gnomAD files to a SQLite database, which is easy and fast to query. It extracts from a gnomAD vcf the minor allele frequency for each variant.', author='KalinNonchev', author_email='boo@foo.com', diff --git a/test_dir/test_gnomad_db.py b/test_dir/test_gnomad_db.py index 3970b33..2d257ae 100644 --- a/test_dir/test_gnomad_db.py +++ b/test_dir/test_gnomad_db.py @@ -9,10 +9,10 @@ def database(): with open("script_config.yaml", 'r') as stream: config = yaml.safe_load(stream) - genome = config["genome"] + gnomad_version = config["gnomad_version"] database_location = config['database_location'] - database = gnomAD_DB(database_location, genome=genome) + database = gnomAD_DB(database_location, gnomad_version=gnomad_version) var_df = pd.read_csv("data/test_vcf_gnomad_chr21_10000.tsv.gz", sep="\t", names=database.columns, index_col=False)