-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDownload_KEGG.py
142 lines (128 loc) · 5.56 KB
/
Download_KEGG.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/usr/bin/python
# Written in Python 3.8 in 2023 by A.L.O. Gaenssle
# MODULE: DOWNLOAD PROTEIN DATA from KEGG
# -> downloads information of protein entries by ID in chunks
# -> downloads all organism ids available on KEGG and their taxonomic classification
# -> downloads all neighbors within the given range of each given protein ID
import pandas as pd
from io import StringIO
import re
import Bio
from Bio.KEGG import REST
import urllib.request
import ssl
# NOTE(review): this swaps the ssl module's default HTTPS context factory for the
# unverified one, so HTTPS requests in this process that rely on that default
# (e.g. urllib) skip certificate verification. Presumably a workaround for
# certificate errors when contacting KEGG -- confirm it is still required.
ssl._create_default_https_context = ssl._create_unverified_context
##-------------------------------------------------------------------------------------------------
## DOWNLOAD FUNCTIONS -----------------------------------------------------------------------------
##-------------------------------------------------------------------------------------------------
## ================================================================================================
## Download all gene IDs associated with the supplied KEGG Orthology (KO)
def DownloadOrthology(Input):
    """Download all KEGG gene entries matching a search term.

    Queries the KEGG "find" operation on the "genes" database and parses the
    tab-separated reply (one gene ID and one description per line).

    Args:
        Input: Search term handed to REST.kegg_find, e.g. a KO identifier.

    Returns:
        A tuple (IDs, DataFrame): the list of gene IDs and a DataFrame with
        the columns "ID" and "Description". Both are empty when the reply
        contains no entries.
    """
    Download = REST.kegg_find("genes", Input).read()
    Rows = []
    for Line in Download.strip().split("\n"):
        # A blank line (e.g. an empty reply) would become a one-field row that
        # cannot be placed in the two-column frame.
        if not Line:
            continue
        # Split on the first tab only, so a tab inside the description cannot
        # create a third column.
        Fields = Line.split("\t", 1)
        if len(Fields) == 1:
            Fields.append(None)
        Rows.append(Fields)
    DataFrame = pd.DataFrame(Rows, columns=["ID", "Description"])
    return(DataFrame["ID"].to_list(), DataFrame)
## ================================================================================================
## Download all genome taxonomy from KEGG --> KEGG-list
def DownloadOrganismsTemp(Name="organism"):
    """Download the KEGG organism list and reduce it to a taxonomy table.

    The reply of the KEGG "list" operation is read as a tab-separated table
    after every ";" has been turned into a tab, so the lineage field is
    spread over separate columns.

    Args:
        Name: Database name handed to REST.kegg_list (default "organism").

    Returns:
        A DataFrame with the columns "orgID" and "Taxonomy", where
        "Taxonomy" is "<Kingdom>-<Phylum>".
    """
    print("Download organism taxonomy. . .")
    Columns = ["ID long", "orgID", "Organism", "Kingdom", "Phylum", "Class", "Order"]
    RawTable = REST.kegg_list(Name).read().replace(";", "\t")
    Table = pd.read_csv(StringIO(RawTable), sep="\t", names=Columns)
    Table["Taxonomy"] = Table["Kingdom"] + "-" + Table["Phylum"]
    return Table[["orgID", "Taxonomy"]]
##-------------------------------------------------------------------------------------------------
## SUB-FUNCTIONS OF DownloadNeighbors--------------------------------------------------------------
##-------------------------------------------------------------------------------------------------
## ================================================================================================
## Get list of indexes +/- range of the reference gene ID for KEGG
def GetNeighborIndices(Gene, Range, Step, Size=4):
    """Build the gene IDs located +/- Range positions around a reference gene.

    The numeric index of the gene is taken from the digits after the last "_"
    when the ID has such a suffix, otherwise from the last Size characters.
    Neighbor IDs are made by replacing that index and zero-padding it to the
    width of the original number.

    Args:
        Gene: Reference gene ID, e.g. "eco:b0002" or "abc:TAG_00120".
        Range: Number of neighbors to generate on each side.
        Step: Increment between the indices of consecutive genes.
        Size: Number of trailing characters holding the index when the ID has
            no "_<digits>" suffix.

    Returns:
        A tuple (IndexList, IndexDict): the neighbor IDs in ascending index
        order and a dict mapping each ID to its signed position relative to
        the reference gene (-1 = directly before, 1 = directly after).

    Raises:
        ValueError: If no numeric index can be read from the gene ID.
    """
    IndexList = []
    IndexDict = {}
    Prefix, Separator, Suffix = Gene.rpartition("_")
    if Separator and Suffix.isdigit():
        Label = Prefix + "_"
        Index = int(Suffix)
        Fill = len(Suffix)
    else:
        Label = Gene[:-Size]
        try:
            Index = int(Gene[-Size:])
        except ValueError as Error:
            raise ValueError(
                f"Cannot read a numeric index from the last {Size} "
                f"characters of gene ID {Gene!r}"
            ) from Error
        Fill = Size
    for i in range(Index - Range*Step, Index + Range*Step + 1, Step):
        # Skip the reference gene itself, and negative indices, which would
        # give malformed IDs such as "b-001" near the start of a genome.
        if i == Index or i < 0:
            continue
        NewID = Label + str(i).zfill(Fill)
        IndexList.append(NewID)
        # (i - Index) is always an exact multiple of Step.
        IndexDict[NewID] = (i - Index) // Step
    return(IndexList, IndexDict)
## ================================================================================================
## Download protein entries from KEGG -> in chunks of 10 gene IDs --> KEGG-get
def DownloadProteinEntries(IndexList, GeneID):
    """Download KEGG entries for a batch of gene IDs and split them apart.

    Args:
        IndexList: Gene IDs handed to REST.kegg_get in a single request.
        GeneID: Reference gene ID; unused here, kept for the existing call
            signature.

    Returns:
        A list with one item per downloaded entry; each item is the list of
        text lines of that entry (without the terminating "///" line). The
        list is empty when nothing was requested or the download failed.
    """
    # Nothing to request: avoid a pointless network round trip.
    if not IndexList:
        return([])
    try:
        Download = REST.kegg_get(IndexList).read()
    except Exception:
        # Best effort: a failed request is treated as "no entries".
        # Catching Exception instead of using a bare except keeps
        # KeyboardInterrupt and SystemExit working during long downloads.
        Download = ""
    Data = []
    Entry = []
    for Line in Download.split("\n"):
        if Line.startswith("///"):
            # "///" terminates an entry in the KEGG flat-file reply.
            Data.append(Entry)
            Entry = []
        else:
            Entry.append(Line)
    return(Data)
## ================================================================================================
## Download Info for each protein from KEGG
def GetDetailedData(Entry, GeneID, orgID):
    """Extract selected fields from the lines of one KEGG gene entry.

    Args:
        Entry: List of text lines belonging to a single entry.
        GeneID: ID of the reference gene, stored under "Ref".
        orgID: Organism code; stored under "orgID" and used as prefix of "ID".

    Returns:
        A dict that always holds "Ref", "ID", "orgID" and "Sequence", plus
        "Name", "KO-ID", "Organism", "Domain", "UniProt" and "Length" when
        the matching line is present. "Sequence" is the concatenation of all
        lines between the AASEQ line and the NTSEQ line (or the entry end).
    """
    Dict = {"Ref": GeneID, "ID": orgID, "orgID": orgID, "Sequence": ""}
    SequenceParts = []
    inAASeq = False
    for Line in Entry:
        # Collapse runs of whitespace so each field can be split on single spaces.
        Line = re.sub(r"\s\s+", " ", Line).strip()
        if inAASeq:
            # The amino acid sequence ends where the nucleotide sequence begins.
            if Line.startswith("NTSEQ"):
                break
            SequenceParts.append(Line)
            continue
        if Line.startswith("ENTRY"):
            Dict["ID"] += ":" + Line.split(" ", 2)[1]
        elif Line.startswith("NAME"):
            Dict["Name"] = Line.split(" ", 1)[1].replace("(GenBank)", "").strip()
        elif Line.startswith("ORTHOLOGY"):
            Dict["KO-ID"] = Line.split(" ", 2)[1].strip()
        elif Line.startswith(("ORGANISM", "VIRUS")):
            # Drop the keyword, then the first token after it.
            Line = Line.split(" ", 1)[1].strip()
            Dict["Organism"] = Line.split(" ", 1)[1]
        elif Line.startswith("MOTIF"):
            Dict["Domain"] = Line.split(" ", 1)[1].replace("Pfam:", "").strip()
        elif "UniProt" in Line:
            if "UniProt:" in Line:
                # Take the text after the label, so a line that also carries
                # the "DBLINKS" keyword does not store "UniProt: <accession>".
                Dict["UniProt"] = Line.split("UniProt:", 1)[1].strip()
            else:
                Dict["UniProt"] = Line.split(" ", 1)[1]
        elif Line.startswith("AASEQ"):
            Dict["Length"] = Line.split(" ", 1)[1]
            inAASeq = True
    Dict["Sequence"] = "".join(SequenceParts)
    return(Dict)
##-------------------------------------------------------------------------------------------------
## MAIN FUNCTION ----------------------------------------------------------------------------------
##-------------------------------------------------------------------------------------------------
## ================================================================================================
## Main function to download neighbors
def DownloadNeighbors(GeneID, Range, Step=1):
    """Download and parse the KEGG entries neighboring a reference gene.

    Args:
        GeneID: Reference gene ID in the form "<organism>:<gene>".
        Range: Number of neighbors to fetch on each side of the gene.
        Step: Increment between the indices of consecutive genes.

    Returns:
        A list of dicts as produced by GetDetailedData, each extended with
        "Pos", the signed position of the entry relative to the reference.
    """
    print(f"Download neighbors of {GeneID} (Increment={Step}) . . .")
    NeighborIDs, Offsets = GetNeighborIndices(GeneID, Range, Step)
    # More than 5 neighbors per side gives more than 10 IDs, which are
    # requested in batches of 10.
    Batches = (
        [NeighborIDs[Start:Start + 10] for Start in range(0, len(NeighborIDs), 10)]
        if Range > 5
        else [NeighborIDs]
    )
    RawEntries = [
        RawEntry
        for Batch in Batches
        for RawEntry in DownloadProteinEntries(Batch, GeneID)
    ]
    Organism = GeneID.split(":", 1)[0]
    Proteins = [GetDetailedData(RawEntry, GeneID, Organism) for RawEntry in RawEntries]
    for Protein in Proteins:
        Protein["Pos"] = Offsets[Protein["ID"]]
    return Proteins