-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtime-course_hclust.r
151 lines (122 loc) · 4.4 KB
/
time-course_hclust.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
setwd("C:/Users/echod/Documents/R_figures/briggsae_Deseq1_featurecounts_data")
library(tximport)
library(tximeta)
library(airway)
library(limma)
library(Glimma)
library(BiocManager)
library(edgeR)
library(DESeq2)
library(ggplot2)
library(tidyverse)
#load in the data
files <- c("day1r1.tabular", "day1r2.tabular", "day1r3.tabular",
"day3r1.tabular", "day3r2.tabular", "day3r3.tabular",
"day6r1.tabular", "day6r2.tabular", "day6r3.tabular",
"day9r2.tabular", "day9r3.tabular")
#have a look
read.delim(files[1], nrow=5)
#put into a matrix
x <- readDGE(files, columns=c(3,2))
class(x)
dim(x)
#rename samples based on their filenames
samplenames <- substring(colnames(x), 1, nchar(colnames(x)))
samplenames
#putting the samples in the matrix x$samples
colnames(x) <- samplenames
group <- as.factor(c("d1", "d1", "d1", "d3", "d3", "d3",
"d6", "d6", "d6", "d9", "d9"))
x$samples$group <- group
# make a regular matrix, for use in deseq2.
countdata <- x [["counts"]]
head(countdata,3)
age = c(1,1,1,3,3,3,6,6,6,9,9)
deseqset <- DESeqDataSetFromMatrix(countData = countdata,
design = ~ 1,
colData=data.frame(condition = as.factor(age)))
#filter out low-expressed genes
#filtering by 10 reads, going from 23169 to 15858 genes
nrow(deseqset)
keep <- rowSums(counts(deseqset) >= 10) >= 3
deseqsetf <-deseqset[keep,]
nrow(deseqsetf)
##############
#transform rnaseq reads to correct for a dependency between the mean and the variance
vsd <- vst(deseqsetf, blind=FALSE)
head(assay(vsd),3)
vsd_vec <- assay(vsd)
#we now have the corrected reads as the vector "vsd_vec", which is the same as the counts_transformed.csv
trans_cts <- read_csv("./hclust_tut_data/counts_transformed_cbr.csv")
sample_info <- read_csv("./hclust_tut_data/sample_info_cbr.csv")
# Summarise counts
trans_cts_mean <- trans_cts %>%
# convert to long format
pivot_longer(cols = day1r1:day9r3, names_to = "sample", values_to = "cts") %>%
# join with sample info table
full_join(sample_info, by = ("sample")) %>%
# for each gene
group_by(gene) %>%
# scale the cts column
mutate(cts_scaled = (cts - mean(cts))/sd(cts)) %>%
# for each gene, strain and minute
group_by(gene, day) %>%
# calculate the mean (scaled) cts
summarise(mean_cts_scaled = mean(cts_scaled),
nrep = n()) %>%
ungroup()
#look at result
head(trans_cts_mean)
# plot the relative expressed values
trans_cts_mean %>%
ggplot(aes(day, mean_cts_scaled)) +
geom_line(aes(group = gene), alpha = 0.2) +
geom_hline(yintercept = 0, colour = "brown", linetype = "dashed")
## onwards to clustering
# Create a matrix. this changes nothing substantial, just a different variable type
hclust_matrix <- trans_cts %>%
select(-gene) %>%
as.matrix()
# assign rownames
rownames(hclust_matrix) <- trans_cts$gene
hclust_matrix <- hclust_matrix %>%
# transpose the matrix so genes are as columns
t() %>%
# apply scalling to each column of the matrix (genes)
scale() %>%
# transpose back so genes are as rows again
t()
#calculate distance between genes (rows)
gene_dist <- dist(hclust_matrix)
#LET'S GET CLUSTERING
gene_hclust <- hclust(gene_dist, method = "complete")
# The default `plot()` function can be used to produce a simple dendrogram
plot(gene_hclust, labels = FALSE)
abline(h = 6.05, col = "brown", lwd = 2) # add horizontal line to illustrate cutting dendrogram
#to sort the genes by clusters, pick any height in the tree and cut it there.
#genes will be assorted to their dendrogram branch.
gene_cluster <- cutree(gene_hclust, k=5) %>%
#make into tibble
enframe() %>%
#rename columns
rename(gene = name, cluster = value)
#visualize
trans_cts_cluster <- trans_cts_mean %>%
inner_join(gene_cluster, by = "gene")
head(trans_cts_cluster)
#the plot
#I made alpha value very low, to show by transparency where most lines overlap
trans_cts_cluster %>%
ggplot(aes(day, mean_cts_scaled)) +
geom_line(aes(group = gene),
alpha = 0.5,
linewidth = 1,
) +
geom_line(stat = "summary",
fun = "median",
colour = "brown",
size = 1.5,
aes(group = 1)
) +
scale_x_continuous(breaks = c(1,3,6,9)) +
facet_grid(cols = vars(cluster))