diabetesgenes.R

#' We collated a data table of T1D gene annotations from various studies (see
#' code below) which helps annotating the computed scores in the downstream
#' analysis. We include the code as is as an example of how similar can be
#' achieved for other diseases.
#' The raw data supplied with the papers is stored in `helper-data` subdirectory
## =============================================================================

## Robertson CC, Inshaw JRJ, Onengut-Gumuscu S, Chen W-M, Santa Cruz DF, Yang H, et al. Fine-mapping, trans-ancestral and genomic analyses identify causal variants, cells, genes and drug targets for type 1 diabetes. Nat Genet. 2021;53: 962–971. doi:10.1038/s41588-021-00880-5
## 78 genomic regions
## 257 clumps where lead SNP p < 5E-8, 36 new rgions
## sed -i 's/\[\|\]//g' associations.csv
## X chromosome associations not reported
## hit in PTPN22/RSBN1 region on chr 1 is attributed to RSBN1 as nearest

## https://t1d.hugeamp.org/
helper.file <- file.path("helper-data", "associations.lifted.csv")
t1d.genes <- fread(helper.file, sep=",")
t1d.genes[, nearest := gsub('[\"\"', "", nearest, fixed=TRUE)]
t1d.genes[, nearest := gsub('\"\"]', "", nearest, fixed=TRUE)]
t1d.genes[snp=="rs2476601", nearest := "PTPN22"] # corrected
t1d.genes[, max.z := max(abs(zScore)), by=nearest]
setorder(t1d.genes, -max.z)
t1d.genes <- unique(t1d.genes, by="nearest") # retain top SNP
## 190 unique genes but many clumps overlap
## median clump size is 194 kb

## exeter lab list, excluding mitochondrial genes
t1d.monogenes <- c("ABCC8", "APPL1", "GCK", "HNF1A", "HNF4A", "INS", "KCNJ11", "NEUROD1", "PDX1", "RFX6 and ZFP57", "CEL", "CISD2", "DCAF17", "DNAJC3", "DUT", "DYRK1B", "GATA4", "GATA6", "HNF1B", "MANF", "MAFA", "PAX6", "PCBD1", "PIK3R1", "PPP1R15B", "SLC29A3", "TRMT10A", "WFS1", "ZBTB20", "AKT2", "CIDEC", "INSR", "LIPE", "LMNA", "PLIN1", "POLD1", "PPARG and ZMPSTE24", "ABCC8", "AGPAT2", "BSCL2", "CISD2", "CNOT1", "COQ2", "COQ9", "EIF2B1", "EIF2S3", "EIF2AK3", "FOXP3", "GATA4", "GATA6", "GCK", "GLIS3", "HNF1B", "IER3IP1", "IL2RA", "INS", "INSR", "KCNJ11", "LPL", "LRBA", "MNX1", "NEUROD1", "NEUROG3", "NKX2-2", "PDX1", "PTF1A", "RFX6", "SLC19A2", "SLC2A2", "STAT3", "WFS1", "ZFP57", "AIRE", "CTLA4", "DOCK8", "FOXP3", "IL2RA", "ITCH", "JAK1", "LRBA", "NFKB1", "SIRT1", "SLC29A3", "STAT1", "STAT3", "STAT5B", "TNFAIP3")

t1d.raregenes <- c("IFIH1", "PTPN22", "STK39", "LRP1B", "TYK2", "RBM17")
t1d.monogenes <- unique(c(t1d.monogenes, t1d.raregenes))

t1d.genes <- t1d.genes[, .(chromosome=chrom, position=pos, pValue,
                           clumpStart, clumpEnd), nearest]
## additional hits identified by testing for dominant/recessive models
helper.file <- file.path("helper-data", "NIHMS1701167-supplement-Supplementary_Tables.xlsx")
t1d.extragenes <- as.data.table(readxl::read_excel(helper.file, sheet="ST9", skip=2))
setnames(t1d.extragenes, "Candidate Gene", "nearest")
## clump range not given, use flanking regions of size 20kb
t1d.extragenes <- t1d.extragenes[, .(chromosome, position,
                                     clumpStart=position - 2E4, clumpEnd=position + 2E4,
                                     pValue=pmeta, nearest)]
t1d.hits <- rbind(t1d.genes, t1d.extragenes, fill=TRUE)
t1d.hits[, chromosome := factor(chromosome, levels=c(1:22, "X", "Y"))]

t1d.hits[, position.hg19 := position]
t1d.hits[, clumpStart.hg19 := clumpStart]
t1d.hits[, clumpEnd.hg19 := clumpEnd]

## liftover to hg38
data.dir <- "helper-data"
t1d.hits[, position := liftover.positions(chromosome, position, data.dir)]
t1d.hits[, clumpStart := liftover.positions(chromosome, clumpStart, data.dir)]
t1d.hits[, clumpEnd := liftover.positions(chromosome, clumpEnd, data.dir)]

## fix missing clumpStart generated by liftover
t1d.hits[is.na(clumpStart), clumpStart := clumpStart.hg19 + clumpEnd - clumpEnd.hg19]
t1d.hits[is.na(position), position := position.hg19 + clumpEnd - clumpEnd.hg19]

## Vujkovic M. Discovery of 318 new risk loci for type 2 diabetes and related vascular outcomes among 1.4 million participants in a multi-ancestry meta-analysis. Nat Genet. 2020 Jul;52(7):680-691. doi:10.1038/s41588-020-0637-y.
helper.file <- file.path("helper-data", "NIHMS1589535-supplement-1589535_Supp_Tab1-26.xlsx")
t2d.hits <- as.data.table(readxl::read_excel(helper.file, sheet="T5", skip=2))
t2d.hits <- t2d.hits[, .(CHR, BP, P, NearestGene)]
colnames(t2d.hits) <- c("chromosome", "position", "pValue", "nearest")
t2d.hits[, pValue := as.numeric(pValue)]

## Xue A, Genome-wide association analyses identify 143 risk variants and putative regulatory mechanisms for type 2 diabetes. Nat Commun. 2018 Jul 27;9(1):2941. doi: 10.1038/s41467-018-04951-w.
helper.file <- file.path("helper-data", "xueT2d2018hits.xlsx")
xue.new <- as.data.table(read_excel(helper.file))[, c(1:2, 8:9)] #.(CHR, BP, `P GWAS`, `Nearest gene`)]
colnames(xue.new) <-  c("chromosome", "position", "pValue", "nearest")
xue.new[, pValue := as.numeric(gsub("−", "-", pValue))]

t2d.hits <- rbind(t2d.hits, xue.new)
t2d.hits <- unique(na.omit(t2d.hits))
t2d.hits[, chromosome := factor(chromosome, levels=c(1:22, "X", "Y"))]

t1d.hits[, x.clumpStart := position.absolute(chromosome, clumpStart) - DMhits.flanksize.kb * 1000]
t1d.hits[, x.clumpEnd := position.absolute(chromosome, clumpEnd) + DMhits.flanksize.kb * 1000]

t2d.hits[, x.clumpStart := position.absolute(chromosome, position) - DMhits.flanksize.kb * 1000]
t2d.hits[, x.clumpEnd := position.absolute(chromosome, position) + DMhits.flanksize.kb * 1000]

################################################################
helper.file <- file.path("helper-data", "allgenes.RData")
load(helper.file)
allgenes[, gene_biotype := gsub("_", " ", gene_biotype)]

genes.info <- allgenes
setnames(genes.info, "gene_chrom", "chromosome")
genes.info[, x.gene_startpos := position.absolute(chromosome, gene_startpos)]
genes.info[, x.gene_endpos := position.absolute(chromosome, gene_endpos)]

## annotate genes.info with t1d-associated and t2d-associated genes that overlap the transcription site of the gene

setkey(genes.info, x.gene_startpos, x.gene_endpos)
setkey(t1d.hits, x.clumpStart, x.clumpEnd)
setkey(t2d.hits, x.clumpStart, x.clumpEnd)
## find overlaps between transcript sites in genes.info and clumps in t1d.hits
## FIXME: redo this with a 200 kb flank size

## overlap tables have one record for each overlap of a clump of SNP hits and a gene transcription site
t1d.overlaps <- foverlaps(genes.info[, .(gene_symbol, x.gene_startpos, x.gene_endpos)],
                          t1d.hits[, .(nearest, x.clumpStart, x.clumpEnd)], nomatch=NULL)

t2d.overlaps <- foverlaps(genes.info[, .(gene_symbol, x.gene_startpos, x.gene_endpos)],
                          t1d.hits[, .(nearest, x.clumpStart, x.clumpEnd)], nomatch=NULL)

## many genes appear more than once because the clump overlaps > 1 gene
## to match target genes with T1D hits, we should collapse by gene_symbol and paste together the "nearest" values.

## to annotate the target genes of cis- or trans- QTLs with T1D hits based on distance to a clump of T1D-associated SNPs, we should reuse these overlap tables and annotate the target gene with the pasted "nearest" genes.

## to annotate SNPs in results with T1D hits, we should find overlaps between SNP positions and clumps in t1d.hits, annotate SNPs with the "nearest" field, then collapse by SNP

## to annotate the clumps of SNPs contributing to QTLs with T1D hits, we should find overlaps between the clumps of eQTL SNPs and the clumps of SNPs in t1d.hits, and annotate the eQTL with the "nearest" field. This will identify for instance FUT2 as the "nearest" gene contributing to the cis-eQTL for NECTIN2.

setkey(genes.info, gene_symbol)

genes.info[, is.t1dgene := gene_symbol %in% t1d.hits$nearest]
genes.info[, is.t2dgene := gene_symbol %in% t2d.hits$nearest]

genes.neart1dgenes <- t1d.overlaps[, .(near.t1dgenes=paste(nearest, collapse=",")), by=gene_symbol]
genes.neart2dgenes <- t2d.overlaps[, .(near.t2dgenes=paste(nearest, collapse=",")), by=gene_symbol]

setkey(genes.neart1dgenes, gene_symbol)
setkey(genes.neart2dgenes, gene_symbol)

genes.info <- genes.neart1dgenes[genes.info]
genes.info <- genes.neart2dgenes[genes.info]