Skip to content

Commit

Permalink
added a url download function #126 #52
Browse files Browse the repository at this point in the history
  • Loading branch information
cb-Hades committed Jul 31, 2024
1 parent 0bbe3be commit 848794b
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 136 deletions.
160 changes: 28 additions & 132 deletions dev/gapfill-testing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -15,71 +15,6 @@
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/brune/miniconda3/envs/sprg/lib/python3.10/site-packages/pydantic/_internal/_config.py:322: UserWarning: Valid config keys have changed in V2:\n",
"* 'underscore_attrs_are_private' has been removed\n",
" warnings.warn(message, UserWarning)\n"
]
}
],
"source": [
"from refinegems.classes.gapfill import KEGGapFiller\n",
"from refinegems.utility.io import load_model"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"mpath = '/Users/brune/Documents/11_Test_Data/test_refinegems/test_gapfill/JCSC1435.xml'\n",
"\n",
"model = load_model(mpath,'libsbml')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'get_missing_genes' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[4], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m gff2 \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/Users/brune/Documents/11_Test_Data/test_refinegems/test_gapfill/JCSC1435_RefSeq.gff\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m----> 2\u001b[0m missing_genes \u001b[38;5;241m=\u001b[39m \u001b[43mget_missing_genes\u001b[49m(gff2,load_model(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/Users/brune/Documents/11_Test_Data/test_refinegems/test_gapfill/JCSC1435.xml\u001b[39m\u001b[38;5;124m'\u001b[39m,\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlibsbml\u001b[39m\u001b[38;5;124m'\u001b[39m))\n\u001b[1;32m 3\u001b[0m missing_genes\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# @TODO : some od the genes do not have an old locus tag -> what to do?\u001b[39;00m\n",
"\u001b[0;31mNameError\u001b[0m: name 'get_missing_genes' is not defined"
]
}
],
"source": [
"gff2 = '/Users/brune/Documents/11_Test_Data/test_refinegems/test_gapfill/JCSC1435_RefSeq.gff'\n",
"missing_genes = get_missing_genes(gff2,load_model('/Users/brune/Documents/11_Test_Data/test_refinegems/test_gapfill/JCSC1435.xml','libsbml'))\n",
"missing_genes\n",
"\n",
"# @TODO : some od the genes do not have an old locus tag -> what to do?"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/brune/miniconda3/envs/sprg/lib/python3.10/site-packages/pydantic/_internal/_config.py:322: UserWarning: Valid config keys have changed in V2:\n",
"* 'underscore_attrs_are_private' has been removed\n",
" warnings.warn(message, UserWarning)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
Expand Down Expand Up @@ -122,7 +57,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 4,
"metadata": {},
"outputs": [
{
Expand All @@ -146,92 +81,39 @@
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ec-code</th>\n",
" <th>ncbiprotein</th>\n",
" <th>id</th>\n",
" <th>equation</th>\n",
" <th>reference</th>\n",
" <th>is_transport</th>\n",
" <th>via</th>\n",
" <th>add_to_GPR</th>\n",
" <th>locus_tag</th>\n",
" <th>ec-code</th>\n",
" <th>UniProt</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5.6.2.2</td>\n",
" <td>[WP_011274363.1]</td>\n",
" <td>MNXR115632</td>\n",
" <td>1 MNXM12437@MNXD1 + 1 MNXM40333@MNXD1 + 1 MNXM...</td>\n",
" <td>metacycR:5.99.1.3-RXN</td>\n",
" <td>None</td>\n",
" <td>MetaNetX</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>5.6.2.2</td>\n",
" <td>[WP_011274363.1]</td>\n",
" <td>MNXR172894</td>\n",
" <td>1 MNXM1100221@MNXD1 + 1 MNXM40333@MNXD1 + 1 MN...</td>\n",
" <td>sabiorkR:15120</td>\n",
" <td>None</td>\n",
" <td>MetaNetX</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5.6.2.2</td>\n",
" <td>[WP_011274363.1]</td>\n",
" <td>MNXR172895</td>\n",
" <td>1 MNXM1100221@MNXD1 + 1 MNXM735047@MNXD1 + 1 M...</td>\n",
" <td>sabiorkR:15121</td>\n",
" <td>None</td>\n",
" <td>MetaNetX</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <td>WP_011274363.1</td>\n",
" <td>SH0005</td>\n",
" <td>5.6.2.2</td>\n",
" <td>[WP_011274363.1]</td>\n",
" <td>MNXR172896</td>\n",
" <td>1 MNXM1100223@MNXD1 + 1 MNXM40333@MNXD1 + 1 MN...</td>\n",
" <td>sabiorkR:15122</td>\n",
" <td>None</td>\n",
" <td>MetaNetX</td>\n",
" <td>None</td>\n",
" <td>[Q5HK03, Q8CQK4, Q6GKU0, Q2FKQ1, Q6GD85, P0A0K...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ec-code ncbiprotein id \\\n",
"0 5.6.2.2 [WP_011274363.1] MNXR115632 \n",
"1 5.6.2.2 [WP_011274363.1] MNXR172894 \n",
"2 5.6.2.2 [WP_011274363.1] MNXR172895 \n",
"3 5.6.2.2 [WP_011274363.1] MNXR172896 \n",
" ncbiprotein locus_tag ec-code \\\n",
"4 WP_011274363.1 SH0005 5.6.2.2 \n",
"\n",
" equation reference \\\n",
"0 1 MNXM12437@MNXD1 + 1 MNXM40333@MNXD1 + 1 MNXM... metacycR:5.99.1.3-RXN \n",
"1 1 MNXM1100221@MNXD1 + 1 MNXM40333@MNXD1 + 1 MN... sabiorkR:15120 \n",
"2 1 MNXM1100221@MNXD1 + 1 MNXM735047@MNXD1 + 1 M... sabiorkR:15121 \n",
"3 1 MNXM1100223@MNXD1 + 1 MNXM40333@MNXD1 + 1 MN... sabiorkR:15122 \n",
"\n",
" is_transport via add_to_GPR \n",
"0 None MetaNetX None \n",
"1 None MetaNetX None \n",
"2 None MetaNetX None \n",
"3 None MetaNetX None "
" UniProt \n",
"4 [Q5HK03, Q8CQK4, Q6GKU0, Q2FKQ1, Q6GD85, P0A0K... "
]
},
"execution_count": 7,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mapped_res[1]"
"mapped_res[0]"
]
},
{
Expand Down Expand Up @@ -270,6 +152,20 @@
" # download mapping\n",
" # (optionally) construct DIAMOND DB"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
44 changes: 40 additions & 4 deletions src/refinegems/utility/set_up.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,11 @@
# requirements
################################################################################

import requests

from importlib.resources import files
# from tqdm import tqdm
from pathlib import Path
from tqdm import tqdm
from typing import Literal

################################################################################
Expand All @@ -23,11 +26,44 @@
# functions
################################################################################

# --------------------------
# download databases / files
# --------------------------
# @TEST
# @TODO : add an entry point?
def download_url(dowload_type:Literal['SwissProt gapfill'],
directory:str=None,k:int=10):

# match URLS to type of database, that the user wants to download
match dowload_type:
case 'SwissProt gapfill':
swissprot_api = 'https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz'
swissprot_mapping_api = 'https://rest.uniprot.org/uniprotkb/stream?compressed=true&fields=accession%2Cxref_brenda%2Cec%2Csequence&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29'
urls = {'SwissProt.fasta':swissprot_api, 'SwissProt_mapping.tsv':swissprot_mapping_api}
case _:
mes = f'Unknown database or file: {name}'
raise ValueError(mes)

# download each file
for name,url in urls:
r = requests.get(url, stream=True) # open download stream
filename = Path(directory,name) if directory else Path(name)
with open(filename, 'wb') as f:
pbar = tqdm(desc=f'Downloading {name}',
unit="B", unit_scale=True, unit_divisor=1024,
total=int( r.headers['Content-Length'] )) # make the progress bar
pbar.clear() # clear 0% info
for chunk in r.iter_content(chunk_size=k*1024):
if chunk: # filter out keep-alive new chunks
pbar.update(len(chunk)) # update progress bar
f.write(chunk)
pbar.close()

# ---------------------
# handling config files
# ---------------------

# @TODO
# @TODO : sth for gapfilling?
def download_config(filename:str='./my_config.yaml', type=Literal['media','refinegems']):
"""Load a configuration file from the package and save a copy of it for the user to edit.
Expand Down Expand Up @@ -65,9 +101,9 @@ def copy_config_yaml(infile:str, outfile:str):

# @TODO
# refinegems config
case 'refinegems':
case 'gapfill':

# copy_config_yaml(PATH_REFINEGEMS_CONFIG, filename)
# copy_config_yaml(..., filename)
pass

# type not found
Expand Down

0 comments on commit 848794b

Please sign in to comment.