-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathgeneLists.R
35 lines (25 loc) · 1.38 KB
/
geneLists.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# Download proteinatlas.tsv.zip from https://www.proteinatlas.org/about/download and uncompress it.
library(stringi)
# Load the two inputs of this script:
#   * dt          - the Protein Atlas table, read from the tab-separated download.
#   * featuredata - feature annotation restored from "some.Rds" via load().
#
# Deliberately no rm(list = ls()) here: wiping the global environment destroys
# whatever the caller had in their session. Instead, verify that load() really
# supplied `featuredata`, so a stale object of the same name that happens to be
# in the workspace can never be picked up silently.
dt <- read.delim(
  file = "~/GoogleDrive/Rstudio/10Xselector/proteinatlas.tsv",
  sep = "\t",
  stringsAsFactors = FALSE
)

# NOTE(review): a ".Rds" extension usually denotes a saveRDS() file, which
# load() cannot read - confirm this file was written with save().
# load() returns the names of the objects it restored.
loadedObjects <- load("some.Rds")
if (!("featuredata" %in% loadedObjects)) {
  stop("'some.Rds' did not provide an object named 'featuredata'", call. = FALSE)
}
# Cross-reference the row names of `featuredata` with the Ensembl IDs of the
# Protein Atlas table `dt`, copy the gene names over, and keep only the atlas
# rows that received one.

# Diagnostic: show every feature ID.
featureIds <- rownames(featuredata)
featureIds

# Index the atlas table by Ensembl ID so rows can be addressed by name.
rownames(dt) <- dt$Ensembl
atlasIds <- rownames(dt)

# Diagnostics: which feature IDs are absent from the atlas, and how many.
featureIds[!(featureIds %in% atlasIds)]
sum(!(featureIds %in% atlasIds))

# TRUE for each feature that has a matching atlas row.
featureIdx <- featureIds %in% atlasIds

# Write the gene name of every matched feature into a new `featureNames`
# column; atlas rows without a matching feature stay NA.
dt[featureIds[featureIdx], "featureNames"] <- featuredata[featureIdx, "Associated.Gene.Name"]

# Keep only the atlas rows that were matched to a feature.
dat <- dt[!is.na(dt[["featureNames"]]), ]
# Build one list entry per protein class, holding the Ensembl IDs of all genes
# annotated with that class.
#
# `Protein.class` is a comma-separated field. Each row is split and trimmed
# once, and class membership is tested by exact token match. Using grep() with
# the class name as pattern would treat the name as a regular expression and
# match substrings, so a class would also collect genes from any other class
# whose name merely contains it.
proteinClassTokens <- lapply(strsplit(dat$Protein.class, ","), stri_trim)

# Distinct class names in order of first appearance; drop NA and empty tokens
# (e.g. from "a,,b"), which are not usable as list names.
proteinClasses <- unique(as.character(unlist(proteinClassTokens)))
proteinClasses <- proteinClasses[!is.na(proteinClasses) & nzchar(proteinClasses)]

# lapply() over the names also copes with zero classes, where 1:length(x)
# would iterate over c(1, 0).
proteinClassList <- lapply(proteinClasses, function(proteinClass) {
  inClass <- vapply(
    proteinClassTokens,
    function(tokens) proteinClass %in% tokens,
    logical(1)
  )
  dat[inClass, "Ensembl"]
})
names(proteinClassList) <- proteinClasses
# Ensembl IDs of the genes that have an antibody listed in the atlas table.
# A gene "has an antibody" when its `Antibody` field is non-empty; the previous
# test (== "") selected exactly the genes WITHOUT one. NA is excluded
# explicitly because nzchar(NA) is TRUE and an NA index would add NA rows.
antibodyAvailable <- !is.na(dat$Antibody) & nzchar(dat$Antibody)
hasAntibody <- list(`has Antibody` = dat[antibodyAvailable, "Ensembl"])
# Build one list entry per subcellular location, holding the Ensembl IDs of all
# genes annotated with that location.
#
# `Subcellular.location` is split on "<br>". Each row is split and trimmed
# once, and membership is tested by exact token match. Using grep() with the
# location name as pattern would treat it as a regular expression and match
# substrings, e.g. "Nucleoli" would also collect genes annotated only with
# "Nucleoli fibrillar center".
locationTokens <- lapply(strsplit(dat$Subcellular.location, "<br>"), stri_trim)

# Distinct location names in order of first appearance; drop NA and empty
# tokens, which are not usable as list names.
locClasses <- unique(as.character(unlist(locationTokens)))
locClasses <- locClasses[!is.na(locClasses) & nzchar(locClasses)]

# lapply() over the names also copes with zero locations, where 1:length(x)
# would iterate over c(1, 0).
subCellularLocationList <- lapply(locClasses, function(locClass) {
  atLocation <- vapply(
    locationTokens,
    function(tokens) locClass %in% tokens,
    logical(1)
  )
  dat[atLocation, "Ensembl"]
})
names(subCellularLocationList) <- locClasses
# Bundle the three gene-list collections under their display names and write
# the bundle to "geneLists.RData" as a single object called `geneLists`.
geneLists <- list(proteinClassList, hasAntibody, subCellularLocationList)
names(geneLists) <- c("protein Classes", "Antibody", "subcellular Location")
save(geneLists, file = "geneLists.RData")