-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathexample_script.r
128 lines (98 loc) · 6.06 KB
/
example_script.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/env Rscript
options(error=recover)
#this script prints out PDF pcoa plots and distance matrices, given an OTU table, phylogenetic tree, and metadata
source("UniFrac.r")
library(ape)
library(phangorn)
library(vegan)
otu.tab <- read.table("data/td_OTU_tag_mapped_lineage.txt", header=T, sep="\t", row.names=1, comment.char="", check.names=FALSE)
#remove taxonomy column to make otu count matrix numeric
taxonomy <- otu.tab$taxonomy
otu.tab <- otu.tab[-length(colnames(otu.tab))]
otu.tab <- t(as.matrix(otu.tab))
#sort taxa from most to least abundant
taxaOrder <- rev(order(apply(otu.tab,2,sum)))
taxonomy <- taxonomy[taxaOrder]
otu.tab <- otu.tab[,taxaOrder]
# read and root tree (rooted tree is required)
tree <- read.tree("data/fasttree_all_seed_OTUs.tre")
tree <- midpoint(tree)
# read metadata
MyMeta<- read.table("data/metadata.txt", header=T, sep="\t", row.names=1, comment.char="", check.names=FALSE)
# filter OTU table and metadata so that only samples which appear in both are retained
otu_indicies <- match(rownames(MyMeta),rownames(otu.tab))
otu_indicies <- otu_indicies[!is.na(otu_indicies)]
otu.tab <- otu.tab[otu_indicies,]
MyMetaOrdered <- MyMeta[match(rownames(otu.tab),rownames(MyMeta)),]
#rarefy data for unweighted unifrac
otu.tab.rarefy <- rrarefy(otu.tab, min(apply(otu.tab,1,sum)))
#calculate distance matrix
unweighted <- getDistanceMatrix(otu.tab.rarefy,tree,method="unweighted",verbose=TRUE)
weighted <- getDistanceMatrix(otu.tab,tree,method="weighted",verbose=TRUE)
information <- getDistanceMatrix(otu.tab,tree,method="information",verbose=TRUE)
ratio_no_log <- getDistanceMatrix(otu.tab,tree,method="ratio_no_log",verbose=TRUE)
#output distance matrices
write.table(unweighted,file="output/unweighted_distance_matrix.txt",sep="\t",quote=FALSE)
write.table(weighted,file="output/weighted_distance_matrix.txt",sep="\t",quote=FALSE)
write.table(information,file="output/information_distance_matrix.txt",sep="\t",quote=FALSE)
write.table(ratio_no_log,file="output/ratio_no_log_normalize_distance_matrix.txt",sep="\t",quote=FALSE)
#conditions (bv - bacterial vaginosis as scored by nugent/amsel, i - intermediate, n - normal/healthy)
groups <- MyMetaOrdered$n_status #levels bv, i, n
originalgroups <- groups
# change conditions so that samples which are more than 50% one taxa are colored by that taxa
otuSum <- apply(otu.tab,1,sum)
otuMax <- apply(otu.tab,1,max)
otuWhichMax <- apply(otu.tab,1,which.max)
otuDominated <- which(otuMax > otuSum/2)
otuMaxTax <- taxonomy[otuWhichMax]
#otuDominated <- c(otuDominated[which(as.numeric(otuMaxTax[otuDominated])==32)],otuDominated[which(as.numeric(otuMaxTax[otuDominated])==33)])
taxonomyGroups <- as.character(groups)
taxonomyGroups[otuDominated] <- as.character(otuMaxTax[otuDominated])
taxonomyGroups <- as.factor(taxonomyGroups)
groups <- taxonomyGroups
# assign appropriate names to single taxa dominated groups
newLevels <- levels(taxonomyGroups)
splittaxa <- strsplit(levels(taxonomyGroups),split=";")
for (i in 1:length(splittaxa)) {
if (length(splittaxa[[i]])>1) {
newLevels[i] <- paste(splittaxa[[i]][length(splittaxa[[i]])-1],splittaxa[[i]][length(splittaxa[[i]])])
}
else {
newLevels[i] <- splittaxa[[i]][1]
}
}
levels(taxonomyGroups) <- newLevels
unweighted.pcoa <- pcoa(unweighted)
weighted.pcoa <- pcoa(weighted)
information.pcoa <- pcoa(information)
ratio_no_log.pcoa <- pcoa(ratio_no_log)
#function to get variance explained for the PCOA component labels
getVarExplained <- function(vector) {
rawVarEx <- apply(vector,2,function(x) sd(x)*sd(x))
totalVarExplained <- sum(rawVarEx)
varEx <- rawVarEx/totalVarExplained
return(varEx)
}
unweighted.varEx <- getVarExplained(unweighted.pcoa$vectors)
weighted.varEx <- getVarExplained(weighted.pcoa$vectors)
information.varEx <- getVarExplained(information.pcoa$vectors)
ratio_no_log.varEx <- getVarExplained(ratio_no_log.pcoa$vectors)
#get vector version of distance matrices for correlation plots below
unweighted.vector <- unlist(unweighted[lower.tri(unweighted,diag=TRUE)])
weighted.vector <- unlist(weighted[lower.tri(weighted,diag=TRUE)])
information.vector <- unlist(information[lower.tri(information,diag=TRUE)])
ratio_no_log.vector <- unlist(ratio_no_log[lower.tri(ratio_no_log,diag=TRUE)])
pdf("output/pcoa_plots.pdf")
#plot pcoa plots
plot(unweighted.pcoa$vectors[,1],unweighted.pcoa$vectors[,2], col=groups,main="Unweighted UniFrac\nprincipal coordinates analysis",xlab=paste("First Component", round(unweighted.varEx[1],digits=3),"variance explained"),ylab=paste("Second Component", round(unweighted.varEx[2],digits=3),"variance explained"),pch=19,cex.lab=1.4,cex.main=2)
plot(weighted.pcoa$vectors[,1],weighted.pcoa$vectors[,2], col=groups,main="Weighted UniFrac\nprincipal coordinates analysis",xlab=paste("First Component", round(weighted.varEx[1],digits=3),"variance explained"),ylab=paste("Second Component", round(weighted.varEx[2],digits=3),"variance explained"),pch=19,cex.lab=1.4,cex.main=2)
legend(0.2,0.32,levels(taxonomyGroups),col=palette(),pch=19)
plot(information.pcoa$vectors[,1],information.pcoa$vectors[,2], col=groups,main="Information UniFrac\nprincipal coordinates analysis",xlab=paste("First Component", round(information.varEx[1],digits=3),"variance explained"),ylab=paste("Second Component", round(information.varEx[2],digits=3),"variance explained"),pch=19,cex.lab=1.4,cex.main=2)
plot(ratio_no_log.pcoa$vectors[,1],ratio_no_log.pcoa$vectors[,2], col=groups,main="Ratio UniFrac\nprincipal coordinates analysis",xlab=paste("First Component", round(ratio_no_log.varEx[1],digits=3),"variance explained"),ylab=paste("Second Component", round(ratio_no_log.varEx[2],digits=3),"variance explained"),pch=19,cex.lab=1.4,cex.main=2)
#plot correlation between different UniFrac modes
plot(unweighted.vector,information.vector,main="unweighted vs. information UniFrac")
plot(weighted.vector,information.vector,main="weighted vs. information UniFrac")
plot(unweighted.vector,weighted.vector,main="unweighted vs. weighted UniFrac")
plot(ratio_no_log.vector,information.vector,main="ratio vs. information UniFrac")
plot(ratio_no_log.vector,weighted.vector,main="ratio vs. weighted UniFrac")
dev.off()