-
Notifications
You must be signed in to change notification settings - Fork 50
/
Copy path13_ggtree_gallery.Rmd
379 lines (313 loc) · 13.8 KB
/
13_ggtree_gallery.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
\newpage
# Gallery of Reproducible Examples {#chapter13}
## Visualizing pairwise nucleotide sequence distance with a phylogenetic tree {#hpv58}
This example reproduces figure 1 of [@chen_ancient_2017]. It extracts accession numbers from tip labels of the HPV58 tree and calculates pairwise nucleotide sequence distances. The distance matrix is visualized as dot and line plots. This example demonstrates the ability to add multiple layers to a specific panel. As illustrated in Figure \@ref(fig:jv2017), the `geom_facet()` function displays sequence distances as a dot plot and then adds a layer of line plot to the same panel, *i.e.*, sequence distance. In addition, the tree in `geom_facet()` can be fully annotated with multiple layers (clade labels, bootstrap support values, *etc.*). The source code is modified from the supplemental file of [@yu_two_2018].
```{r message=FALSE}
library(TDbook)
library(tibble)
library(tidyr)
library(Biostrings)
library(treeio)
library(ggplot2)
library(ggtree)
# loaded from TDbook package
tree <- tree_HPV58
clade <- c(A3 = 92, A1 = 94, A2 = 108, B1 = 156,
B2 = 159, C = 163, D1 = 173, D2 = 176)
tree <- groupClade(tree, clade)
cols <- c(A1 = "#EC762F", A2 = "#CA6629", A3 = "#894418", B1 = "#0923FA",
B2 = "#020D87", C = "#000000", D1 = "#9ACD32",D2 = "#08630A")
## visualize the tree with tip labels and tree scale
p <- ggtree(tree, aes(color = group), ladderize = FALSE) %>%
rotate(rootnode(tree)) +
geom_tiplab(aes(label = paste0("italic('", label, "')")),
parse = TRUE, size = 2.5) +
geom_treescale(x = 0, y = 1, width = 0.002) +
scale_color_manual(values = c(cols, "black"),
na.value = "black", name = "Lineage",
breaks = c("A1", "A2", "A3", "B1", "B2", "C", "D1", "D2")) +
guides(color = guide_legend(override.aes = list(size = 5, shape = 15))) +
theme_tree2(legend.position = c(.1, .88))
## Optional
## add labels for monophyletic (A, C and D) and paraphyletic (B) groups
dat <- tibble(node = c(94, 108, 131, 92, 156, 159, 163, 173, 176,172),
name = c("A1", "A2", "A3", "A", "B1",
"B2", "C", "D1", "D2", "D"),
offset = c(0.003, 0.003, 0.003, 0.00315, 0.003,
0.003, 0.0031, 0.003, 0.003, 0.00315),
offset.text = c(-.001, -.001, -.001, 0.0002, -.001,
-.001, 0.0002, -.001, -.001, 0.0002),
barsize = c(1.2, 1.2, 1.2, 2, 1.2, 1.2, 3.2, 1.2, 1.2, 2),
extend = list(c(0, 0.5), 0.5, c(0.5, 0), 0, c(0, 0.5),
c(0.5, 0), 0, c(0, 0.5), c(0.5, 0), 0)
) %>%
dplyr::group_split(barsize)
p <- p +
geom_cladelab(
data = dat[[1]],
mapping = aes(
node = node,
label = name,
color = group,
offset = offset,
offset.text = offset.text,
extend = extend
),
barsize = 1.2,
fontface = 3,
align = TRUE
) +
geom_cladelab(
data = dat[[2]],
mapping = aes(
node = node,
label = name,
offset = offset,
offset.text =offset.text,
extend = extend
),
barcolor = "darkgrey",
textcolor = "darkgrey",
barsize = 2,
fontsize = 5,
fontface = 3,
align = TRUE
) +
geom_cladelab(
data = dat[[3]],
mapping = aes(
node = node,
label = name,
offset = offset,
offset.text = offset.text,
extend = extend
),
barcolor = "darkgrey",
textcolor = "darkgrey",
barsize = 3.2,
fontsize = 5,
fontface = 3,
align = TRUE
) +
geom_strip(65, 71, "italic(B)", color = "darkgrey",
offset = 0.00315, align = TRUE, offset.text = 0.0002,
barsize = 2, fontsize = 5, parse = TRUE)
## Optional
## display support values
p <- p + geom_nodelab(aes(subset = (node == 92), label = "*"),
color = "black", nudge_x = -.001, nudge_y = 1) +
geom_nodelab(aes(subset = (node == 155), label = "*"),
color = "black", nudge_x = -.0003, nudge_y = -1) +
geom_nodelab(aes(subset = (node == 158), label = "95/92/1.00"),
color = "black", nudge_x = -0.0001,
nudge_y = -1, hjust = 1) +
geom_nodelab(aes(subset = (node == 162), label = "98/97/1.00"),
color = "black", nudge_x = -0.0001,
nudge_y = -1, hjust = 1) +
geom_nodelab(aes(subset = (node == 172), label = "*"),
color = "black", nudge_x = -.0003, nudge_y = -1)
```
```{r eval=F}
## extract accession numbers from tip labels
tl <- tree$tip.label
acc <- sub("\\w+\\|", "", tl)
names(tl) <- acc
## read sequences from GenBank directly into R
## and convert the object to DNAStringSet
tipseq <- ape::read.GenBank(acc) %>% as.character %>%
lapply(., paste0, collapse = "") %>% unlist %>%
DNAStringSet
## align the sequences using muscle
tipseq_aln <- muscle::muscle(tipseq)
tipseq_aln <- DNAStringSet(tipseq_aln)
```
```{r echo=F}
## extract accession numbers from tip labels
tl <- tree$tip.label
acc <- sub("\\w+\\|", "", tl)
names(tl) <- acc
## writeXStringSet(tipseq_aln, file = "data/HPV58_aln.fas")
#tipseq_aln <- readDNAStringSet("data/HPV58_aln.fas")
tipseq_aln <- TDbook::dna_HPV58_aln %>%
as.character %>%
lapply(., paste0, collapse = "") %>%
unlist() %>%
Biostrings::DNAStringSet()
```
(ref:jv2017scap) Phylogeny of HPV58 complete genomes with dot and line plots of pairwise nucleotide sequence distances.
(ref:jv2017cap) **Phylogeny of HPV58 complete genomes with dot and line plots of pairwise nucleotide sequence distances**.
```{r jv2017, fig.width=12, fig.height=12, fig.cap="(ref:jv2017cap)", fig.scap="(ref:jv2017scap)", warning=FALSE, out.width='100%'}
## calculate pairwise hamming distances among sequences
tipseq_dist <- stringDist(tipseq_aln, method = "hamming")
## calculate the percentage of differences
tipseq_d <- as.matrix(tipseq_dist) / width(tipseq_aln[1]) * 100
## convert the matrix to a tidy data frame for facet_plot
dd <- as_tibble(tipseq_d)
dd$seq1 <- rownames(tipseq_d)
td <- gather(dd,seq2, dist, -seq1)
td$seq1 <- tl[td$seq1]
td$seq2 <- tl[td$seq2]
g <- p$data$group
names(g) <- p$data$label
td$clade <- g[td$seq2]
## visualize the sequence differences using dot plot and line plot
## and align the sequence difference plot to the tree using facet_plot
p2 <- p + geom_facet(panel = "Sequence Distance",
data = td, geom = geom_point, alpha = .6,
mapping = aes(x = dist, color = clade, shape = clade)) +
geom_facet(panel = "Sequence Distance",
data = td, geom = geom_path, alpha = .6,
mapping=aes(x = dist, group = seq2, color = clade)) +
scale_shape_manual(values = 1:8, guide = FALSE)
print(p2)
```
## Displaying Different Symbolic Points for Bootstrap Values. {#symbolic-bootstrap}
We can cut the bootstrap values into several intervals, *e.g.*, to indicate whether the clade is of high, moderate, or low support. Then we can use these intervals as categorical variables to set different colors or shapes of symbolic points to indicate the bootstrap values belong to which category (Figure \@ref(fig:bpinterval)).
(ref:bpintervalscap) Partitioning bootstrap values.
(ref:bpintervalcap) **Partitioning bootstrap values**. Bootstrap values were divided into three categories and this information was used to color circle points.
```{r include = FALSE}
## phytools also have a read.newick function
read.newick <- treeio::read.newick
```
```{r bpinterval, fig.width=7.5, fig.height=8.6, fig.cap="(ref:bpintervalcap)", fig.scap="(ref:bpintervalscap)", out.width='100%'}
library(treeio)
library(ggplot2)
library(ggtree)
library(TDbook)
tree <- read.newick(text=text_RMI_tree, node.label = "support")
root <- rootnode(tree)
ggtree(tree, color="black", size=1.5, linetype=1, right=TRUE) +
geom_tiplab(size=4.5, hjust = -0.060, fontface="bold") + xlim(0, 0.09) +
geom_point2(aes(subset=!isTip & node != root,
fill=cut(support, c(0, 700, 900, 1000))),
shape=21, size=4) +
theme_tree(legend.position=c(0.2, 0.2)) +
scale_fill_manual(values=c("white", "grey", "black"), guide='legend',
name='Bootstrap Percentage(BP)',
breaks=c('(900,1e+03]', '(700,900]', '(0,700]'),
labels=expression(BP>=90,70 <= BP * " < 90", BP < 70))
```
## Highlighting Different Groups {#phylo-grouping}
This example reproduces Figure 1 of [@larsen_identification_2019]. It used `groupOTU()` to add grouping information of chicken CTLDcps. The branch line type and color are defined based on this grouping information. Two groups of CTLDcps are highlighted in different background colors using `geom_hilight` (red for Group II and green for Group V). The avian-specific expansion of Group V with the subgroups of A and B- are labeled using `geom_cladelab` (Figure \@ref(fig:treeLarsen)).
(ref:treeLarsenscap) Phylogenetic tree of CTLDcps.
(ref:treeLarsencap) **Phylogenetic tree of CTLDcps**. Using different background colors, line types and colors, and clade labels to distinguish groups.
```{r treeLarsen, fig.cap="(ref:treeLarsencap)", fig.scap="(ref:treeLarsenscap)", fig.width=7.5, fig.height=6.3, out.width='100%'}
library(TDbook)
mytree <- tree_treenwk_30.4.19
# Define nodes for coloring later on
tiplab <- mytree$tip.label
cls <- tiplab[grep("^ch", tiplab)]
labeltree <- groupOTU(mytree, cls)
p <- ggtree(labeltree, aes(color=group, linetype=group), layout="circular") +
scale_color_manual(values = c("#efad29", "#63bbd4")) +
geom_nodepoint(color="black", size=0.1) +
geom_tiplab(size=2, color="black")
p2 <- flip(p, 136, 110) %>%
flip(141, 145) %>%
rotate(141) %>%
rotate(142) %>%
rotate(160) %>%
rotate(164) %>%
rotate(131)
### Group V and II coloring
dat <- data.frame(
node = c(110, 88, 156,136),
fill = c("#229f8a", "#229f8a", "#229f8a", "#f9311f")
)
p3 <- p2 +
geom_hilight(
data = dat,
mapping = aes(
node = node,
fill = I(fill)
),
alpha = 0.2,
extendto = 1.4
)
### Putting on a label on the avian specific expansion
p4 <- p3 +
geom_cladelab(
node = 113,
label = "Avian-specific expansion",
align = TRUE,
angle = -35,
offset.text = 0.05,
hjust = "center",
fontsize = 2,
offset = .2,
barsize = .2
)
### Adding the bootstrap values with subset used to remove all bootstraps < 50
p5 <- p4 +
geom_nodelab(
mapping = aes(
x = branch,
label = label,
subset = !is.na(as.numeric(label)) & as.numeric(label) > 50
),
size = 2,
color = "black",
nudge_y = 0.6
)
### Putting labels on the subgroups
p6 <- p5 +
geom_cladelab(
data = data.frame(
node = c(114, 121),
name = c("Subgroup A", "Subgroup B")
),
mapping = aes(
node = node,
label = name
),
align = TRUE,
offset = .05,
offset.text = .03,
hjust = "center",
barsize = .2,
fontsize = 2,
angle = "auto",
horizontal = FALSE
) +
theme(
legend.position = "none",
plot.margin = grid::unit(c(-15, -15, -15, -15), "mm")
)
print(p6)
```
## Phylogenetic Tree with Genome Locus Structure {#genome-locus}
The `geom_motif()` is defined in `r Biocpkg("ggtree")` and it is a wrapper layer of the `gggenes::geom_gene_arrow()`. The `geom_motif()` can automatically adjust genomic alignment by selective gene (via the `on` parameter) and can label genes via the `label` parameter. In the following example, we use `example_genes` dataset provided by `r CRANpkg("gggenes")`. As the dataset only provides genomic coordination of a set of genes, a phylogeny for the genomes needs to be constructed first. We calculate Jaccard similarity based on the ratio of overlapping genes among genomes and correspondingly determine genome distance. The BioNJ algorithm was applied to construct the tree. Then we can use `geom_facet()` to visualize the tree with the genomic structures (Figure \@ref(fig:gggenes)).
(ref:gggenesscap) Genomic features with a phylogenetic tree.
(ref:gggenescap) **Genomic features with a phylogenetic tree.**
```{r gggenes, fig.width=9, fig.height=4, fig.cap="(ref:gggenescap)", fig.scap="(ref:gggenesscap)", out.width='100%'}
library(dplyr)
library(ggplot2)
library(gggenes)
library(ggtree)
get_genes <- function(data, genome) {
filter(data, molecule == genome) %>% pull(gene)
}
g <- unique(example_genes[,1])
n <- length(g)
d <- matrix(nrow = n, ncol = n)
rownames(d) <- colnames(d) <- g
genes <- lapply(g, get_genes, data = example_genes)
for (i in 1:n) {
for (j in 1:i) {
jaccard_sim <- length(intersect(genes[[i]], genes[[j]])) /
length(union(genes[[i]], genes[[j]]))
d[j, i] <- d[i, j] <- 1 - jaccard_sim
}
}
tree <- ape::bionj(d)
p <- ggtree(tree, branch.length='none') +
geom_tiplab() + xlim_tree(5.5) +
geom_facet(mapping = aes(xmin = start, xmax = end, fill = gene),
data = example_genes, geom = geom_motif, panel = 'Alignment',
on = 'genE', label = 'gene', align = 'left') +
scale_fill_brewer(palette = "Set3") +
scale_x_continuous(expand=c(0,0)) +
theme(strip.text=element_blank(),
panel.spacing=unit(0, 'cm'))
facet_widths(p, widths=c(1,2))
```