-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathannotation using BiomaRT.rtf
59 lines (57 loc) · 2.83 KB
/
annotation using BiomaRT.rtf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
{\rtf1\ansi\ansicpg1252\cocoartf2578
\cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fnil\fcharset0 HelveticaNeue;\f1\fnil\fcharset0 .AppleSystemUIFontMonospaced-Regular;}
{\colortbl;\red255\green255\blue255;\red27\green31\blue35;\red255\green255\blue255;}
{\*\expandedcolortbl;;\cssrgb\c14118\c16078\c18431;\cssrgb\c100000\c100000\c100000;}
\margl1440\margr1440\vieww28600\viewh17440\viewkind0
\deftab720
\pard\pardeftab720\sa320\partightenfactor0
\f0\fs32 \cf2 \cb3 \expnd0\expndtw0\kerning0
\outl0\strokewidth0 \strokec2 \
##R code for annotation using BiomaRT\
##Author Dr Upasna Srivastava\
##This Rscript was uses BiomaRt to create an annotation_tsv for S. scrofa (pig)\
\pard\pardeftab720\sl380\partightenfactor0
\f1\fs27\fsmilli13600 \cf2 library(plyr)\
library(biomaRt)\
\
mart = useMart("ENSEMBL_MART_ENSEMBL", host="www.ensembl.org") #Change host here to use older version\
pigMart<- useDataset("sscrofa_gene_ensembl",mart=mart)\
\
#Keeping track of current version used\
currentMarts <- listMarts()\
currentMarts$version[currentMarts$biomart == "ENSEMBL_MART_ENSEMBL"]\
#Ensembl Genes 97\
\
#Gets attributes using transcript id from biomaRt\
attr<- getBM(attributes=c("ensembl_gene_id", "entrezgene_id", "external_gene_name", "description"), mart=pigMart)\
\
rename_col_by_name <- function (df, old, new) \{\
#https://stackoverflow.com/a/16490387\
names(df)[names(df) == old] <- new\
return(df)\
\}\
\
#Standardize column name of unique ID to 'gene_id' - This corresponds to the unique ID used in GTF\
attr <- rename_col_by_name(attr, 'ensembl_gene_id', 'gene_id')\
\
#For queries with multiple entrez gene IDs, return these IDs in a single cell, as a string with commas separating the individual IDs\
collapsed.entrezIDs <- ddply(attr, .(gene_id), summarize, entrezgene_id=paste(entrezgene_id, collapse=","))\
\
#Sanity check. Gene symbols should be consistent, without conflicting results\
test1 <- ddply(attr, .(gene_id), summarize, external_gene_name=paste(external_gene_name, collapse=","))\
View(test1[grepl(",", test1$entrezgene_id),])\
#Another way of testing automagically\
test1.split <- strsplit(test1$external_gene_name[grepl(",", test1$external_gene_name)], ",")\
unique(sapply(test1.split, function(x) \{length(unique(x))\})) == c(1)\
\
#Sanity check. Descriptions are also consistent, no conflicting results\
test2 <- ddply(attr, .(gene_id), summarize, description=paste(description, collapse="@"))\
test2.split <- strsplit(test2$description[grepl("@", test2$description)], "@")\
unique(sapply(test2.split, function(x) \{length(unique(x))\})) == c(1)\
\
#Since entrez gene IDs are the only ones with conflicting entries, can merge our collapsed entrezIDs from before with the attr table\
\
mapping_table <- merge(collapsed.entrezIDs, attr, on="gene_id", all.x=T)\
\
write.table(mapping_table, "Sscrofa11.1_annotation.tsv", quote=F, sep="\\t", row.names=F)\cb1 \
}