forked from mmkim1210/gene-list
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmunge-gene-list.Rmd
110 lines (85 loc) · 3.48 KB
/
munge-gene-list.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
---
title: "Compile Gene Lists"
output: html_document
editor_options:
  chunk_output_type: console
---
```{r setup, include=FALSE}
# Show each chunk's source code alongside its output in the rendered document.
knitr::opts_chunk$set(echo = TRUE)
# The global environment is intentionally left untouched: `rm(list = ls())`
# would wipe the caller's workspace when chunks are run interactively, and it
# does not unload packages or reset options anyway. Restart R for a clean
# session instead.
library(tidyverse)
library(readxl)
```
## Import Data
```{r, warning=FALSE, message=FALSE}
# SFARI Gene export: keep rows whose `gene-score` is 1 or whose `syndromic`
# column is 1.
SFARI <- "list/SFARI-Gene_genes_01-13-2021release_02-05-2021export.csv" %>%
  read_csv() %>%
  filter(`gene-score` == 1 | syndromic == 1)
nrow(SFARI) # 328 genes
# Rows of mmc7.xlsx whose `tadaFdrAscSscExomeSscAgpSmallDel` value is below 0.1.
ASD_Sanders <- read_excel("list/mmc7.xlsx") %>%
  filter(tadaFdrAscSscExomeSscAgpSmallDel < 0.1)
nrow(ASD_Sanders) # 65 genes
# Second sheet of Satterstrom.xlsx, skipping the first line of the sheet.
# The last three rows are dropped (presumably trailing non-gene rows such as
# footnotes; confirm against the spreadsheet).
ASD_Satterstrom <- read_excel("list/Satterstrom.xlsx", sheet = 2, skip = 1) %>%
  head(n = -3)
nrow(ASD_Satterstrom) # 102 genes
# Fourth sheet of mmc3.xlsx: keep rows with `qval` below 0.1, then strip
# double-quote characters from the gene symbols.
ASD_Ruzzo <- read_excel("list/mmc3.xlsx", sheet = 4) %>%
  filter(qval < 0.1) %>%
  mutate(`HGNC gene symbol` = gsub('[\"]', '', `HGNC gene symbol`))
nrow(ASD_Ruzzo) # 69 genes
# Keep only the rows whose `significant` column is TRUE.
DDD <- "list/41586_2020_2832_MOESM4_ESM.xlsx" %>%
  read_excel() %>%
  filter(significant == TRUE)
nrow(DDD) # 285 genes
# Meta-analysis results: keep rows with `P meta` below 2.2e-6 and attach gene
# symbols by hand.
# NOTE(review): the symbols are assigned by position, so this assumes the
# filter leaves exactly these ten rows in exactly this order; confirm against
# the input file whenever it is refreshed.
SCZ_SCHEMA <- read_csv("list/meta_results_2021_02_04_20_01_48.csv") %>%
  filter(`P meta` < 2.2e-6) %>%
  mutate(
    symbol = c(
      "SETD1A", "CUL1", "XPO7", "TRIO", "CACNA1G",
      "SP4", "GRIA3", "GRIN2A", "HERC1", "RB1CC1"
    )
  )
nrow(SCZ_SCHEMA) # 10 genes
# Two schizophrenia gene lists, each loaded as-is with no filtering.
SCZ_PsychENCODE <- read_csv(
  file = "list/INT-17_SCZ_High_Confidence_Gene_List.csv"
)
nrow(SCZ_PsychENCODE) # 321 genes

SCZ_HiC <- read_csv(file = "list/1-s2.0-S0920996419300891-mmc1.txt")
nrow(SCZ_HiC) # 455 genes
# Second sheet of the supplementary workbook, restricted to cells A5:X6891.
# `range` alone fixes where reading starts (row 5 supplies the column names);
# readxl gives `range` precedence over `skip`, so the former `skip = 4`
# argument had no effect and has been dropped.
# Rows with no entry in the significance column (column 24 of the range) are
# then removed.
ASD_ID_DDD_Coe <- read_excel(
  "list/41588_2018_288_MOESM3_ESM.xlsx",
  sheet = 2,
  range = "A5:X6891"
) %>%
  drop_na(`Significance (FDR ≤ 5%, count > 1)...24`)
nrow(ASD_ID_DDD_Coe) # 253 genes
# SynGO annotations; the table is kept whole and only the number of distinct
# human ortholog symbols is reported.
SynGO <- read_excel(path = "list/syngo_annotations.xlsx")
n_distinct(SynGO$`human ortholog gene symbol`) # 1,225 genes
# First sheet of mmc2.xls, skipping the first line of the sheet.
# Related papers worth checking:
#   https://www.cell.com/cell/pdf/S0092-8674(14)01105-2.pdf
#   https://www.cell.com/cell/pdf/S0092-8674(14)01106-4.pdf
FMRP <- read_excel(path = "list/mmc2.xls", sheet = 1, skip = 1)
nrow(FMRP) # 842 genes
# Keep rows whose `iCLIP occupancy` exceeds 0.2.
CELF4 <- read_excel("list/pgen.1003067.s004.xlsx")
CELF4 <- filter(CELF4, `iCLIP occupancy` > 0.2)
nrow(CELF4) # 2,803 genes
# First sheet of RBFOX-mmc2.xlsx, skipping the first line of the sheet.
# A row is kept only when the Rbfox2 tag count is above 4 and the combined
# Rbfox1 + Rbfox3 tag count is above 12.
RBFOX <- read_excel("list/RBFOX-mmc2.xlsx", sheet = 1, skip = 1)
RBFOX <- filter(
  RBFOX,
  `Rbfox2 tag count` > 4 & `Rbfox1 tag count` + `Rbfox3 tag count` > 12
)
n_distinct(RBFOX$`gene symbol`) # 2,049 genes
# Census table: keep the rows where `Tier` equals 1.
Cancer <- read_tsv("list/Census_all.tsv")
Cancer <- filter(Cancer, Tier == 1)
nrow(Cancer) # 576 genes
# GenCC submissions: keep the rows classified as "Definitive".
# NOTE(review): the file carries a .tsv extension but is parsed as
# comma-delimited with read_csv(); confirm the actual delimiter of the
# downloaded file.
GenCC <- read_csv("list/gencc-submissions.tsv")
GenCC <- filter(GenCC, classification_title == "Definitive")
nrow(GenCC) # 2,754 genes
# Two header-less, tab-delimited lists; with `col_names = FALSE` the first
# line is read as data rather than as column names.
Housekeeping <- read_tsv(file = "list/HK_genes.txt", col_names = FALSE)
nrow(Housekeeping) # 3,804 genes

MDEM <- read_tsv(file = "list/MDEM.txt", col_names = FALSE)
nrow(MDEM) # 70 genes
# Third sheet of the supplementary workbook; report the number of distinct
# values in its `Gene` column.
UKBB_rare <- read_excel(
  path = "list/41586_2021_4103_MOESM5_ESM.xlsx",
  sheet = 3
)
n_distinct(UKBB_rare$Gene) # 564 genes
```
## Combine Data
```{r}
# Named list with one vector of gene symbols per source. Only these seven
# lists are exported; the other tables loaded earlier are not part of this
# output.
geneset <- list(
  "ASD_Ruzzo" = ASD_Ruzzo$`HGNC gene symbol`,
  "ASD_Sanders" = ASD_Sanders$RefSeqGeneName,
  "ASD_Satterstrom" = ASD_Satterstrom$Gene,
  "DDD" = DDD$symbol,
  "SCZ_SCHEMA" = SCZ_SCHEMA$symbol,
  "SFARI" = SFARI$`gene-symbol`,
  "ASD_ID_DDD_Coe" = ASD_ID_DDD_Coe$`Ensembl Symbol`
)

# Stack the vectors into a long two-column table (`gene`, `source`): one row
# per gene/source pair, with the list names recorded in `source`.
df <- map_dfr(geneset, ~tibble(gene = .), .id = "source") %>% relocate(gene)
df

# write_tsv() does not create missing directories, so make sure the target
# folder exists before writing (a no-op when it is already there).
dir.create("output", showWarnings = FALSE, recursive = TRUE)
write_tsv(df, file = "output/risk_genes_brain.tsv")
```