This repository has been archived by the owner on Oct 2, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #55 from d3b-center/rjcorb/51-race-ethn-heatmaps
Generate summary enrichment heatmaps
- Loading branch information
Showing
21 changed files
with
311 additions
and
294 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,184 @@ | ||
# R Corbett 2023 | ||
# | ||
# Generate alluvial plot for PBTA ancestry project | ||
|
||
# load libraries and set directories | ||
library(data.table) | ||
library(tidyverse) | ||
# library(ComplexHeatmap) | ||
# library(RColorBrewer) | ||
# library(circlize) | ||
library(colorblindr) | ||
library(ggpubr) | ||
library(ggalluvial) | ||
library(cowplot) | ||
|
||
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git")) | ||
|
||
# Set file paths | ||
data_dir <- file.path(root_dir, "data") | ||
analysis_dir <- file.path(root_dir, "analyses", "add-histologies") | ||
results_dir <- file.path(analysis_dir, "results") | ||
input_dir <- file.path(analysis_dir, "input") | ||
plots_dir <- file.path(analysis_dir, "plots") | ||
|
||
# call plot theme | ||
source(file.path(root_dir, "figures", "theme.R")) | ||
|
||
# set file paths | ||
ancestry_file <- file.path(results_dir, "merged_ancestry_histology_data.tsv") | ||
|
||
# wrangle data | ||
ancestry <- read_tsv(ancestry_file) | ||
|
||
# consolidate unreported/unknown reported races and ethnicities | ||
ancestry <- ancestry %>% | ||
dplyr::mutate(race = case_when( | ||
race %in% c("Not Reported", "Other", "Reported Unknown", "Not Reported/Unknown", "Unknown") | is.na(race) ~ "Race Unknown", | ||
race == "White/Caucasian" ~ "White", | ||
race == "American Indian or Alaska Native" ~ "AI/AN", | ||
race == "Native Hawaiian or Other Pacific Islander" ~ "NHPI", | ||
race == "Black or African American" ~ "Black/Afr. Am.", | ||
race == "More Than One Race" ~ ">1 Race", | ||
TRUE ~ race | ||
)) %>% | ||
mutate(ethnicity = case_when( | ||
grepl("Reported|Available|Unavailable", ethnicity) | is.na(ethnicity) ~ "Unknown", | ||
grepl("Non-Hispanic|Not Hispanic", ethnicity) ~ "Not Hispanic/Latino", | ||
TRUE ~ "Hispanic/Latino" | ||
)) | ||
|
||
|
||
# Create data frame for alluvial plots, including only predicted_ancestry, race, and ethnicity columns | ||
alluvial_df <- as.data.frame(table(ancestry$predicted_ancestry, ancestry$race, ancestry$ethnicity)) %>% | ||
# format for alluvial plot | ||
dplyr::rename(predicted_ancestry = Var1, | ||
race = Var2, | ||
ethnicity = Var3) %>% | ||
to_lodes_form(axes = 1:3) %>% | ||
dplyr::rename(Group = stratum) %>% | ||
mutate(Group = factor(Group, levels = c("EAS", "SAS", "AFR", "AMR", "EUR", | ||
"Asian", "Black/Afr. Am.", "AI/AN", | ||
"NHPI", "White", ">1 Race", | ||
"Race Unknown", | ||
"Hispanic/Latino", | ||
"Not Hispanic/Latino", | ||
"Unknown"))) %>% | ||
mutate(race = case_when(Group %in% c("Asian", "Black/Afr. Am.", "AI/AN", | ||
"NHPI", "White", ">1 Race", | ||
"Race Unknown") ~ Group, | ||
TRUE ~ NA), | ||
predicted_ancestry = case_when(Group %in% c("EAS", "SAS", "AFR", "EUR", "AMR") ~ Group, | ||
TRUE ~ NA), | ||
ethnicity = case_when(Group %in% c("Hispanic/Latino", | ||
"Not Hispanic/Latino", | ||
"Unknown") ~ Group, | ||
TRUE ~ NA)) | ||
|
||
# Generate alluvial plot | ||
p1 <- ggplot(alluvial_df, aes(y = Freq, stratum = Group, alluvium = alluvium, x = x, fill = Group)) + | ||
geom_alluvium(show.legend = F) + | ||
geom_stratum(show.legend = F) + | ||
scale_fill_manual(values = c("Asian" = "#009E73", "SAS" = "#D55E00", "EAS" = "#009E73", | ||
"White" = "#0072B2", "EUR" = "#0072B2", | ||
"Black/Afr. Am." = "#E69F00", "AFR" = "#E69F00", | ||
"NHPI" = "skyblue", "AMR" = "#56B4E9", | ||
"AI/AN" = "#56B4E9", | ||
"Race Unknown" = "grey", ">1 Race" = "brown", | ||
"Hispanic/Latino" = "#56B4E9", | ||
"Not Hispanic/Latino" = "#0072B2", | ||
"Unknown" = "grey")) + | ||
xlab("") + | ||
ylab("Number of Patients") + | ||
scale_x_discrete(labels = c("predicted ancestry", "reported race", "reported ethnicity")) + | ||
theme_Publication() | ||
|
||
# Create separate ancestry, race, and ethnicity dfs for legend generation | ||
race_df <- data.frame(race = c("Asian", "Black/Afr. Am.", "AI/AN", | ||
"NHPI", "White", ">1 Race", | ||
"Race Unknown"), | ||
value = 1) | ||
ancestry_df <- data.frame(predicted_ancestry = c("EAS", "SAS", "AFR", "EUR", "AMR"), | ||
value = 1) | ||
ethnicity_df <- data.frame(ethnicity = c("Hispanic/Latino", | ||
"Not Hispanic/Latino", | ||
"Unknown"), | ||
value = 1) | ||
|
||
# plot race legend | ||
lgd_race <- ggplot(race_df, aes(x = value, y = factor(race, levels = c("Race Unknown", | ||
">1 Race", "White", | ||
"AI/AN", | ||
"NHPI", | ||
"Black/Afr. Am.", "Asian")), | ||
fill = race)) + | ||
geom_tile(show.legend = T, color = "black", | ||
lwd = 0.5, linetype = 1) + | ||
scale_fill_manual(values = c("Asian" = "#009E73", | ||
"White" = "#0072B2", | ||
"Black/Afr. Am." = "#E69F00", | ||
"AI/AN" = "#56B4E9", | ||
"NHPI" = "skyblue", | ||
">1 Race" = "brown", | ||
"Race Unknown" = "grey"), | ||
breaks = c("Asian", | ||
"Black/Afr. Am.", | ||
"AI/AN", | ||
"NHPI", | ||
"White", | ||
">1 Race", | ||
"Race Unknown")) + | ||
labs(fill = "Reported Race") + | ||
theme_Publication() | ||
leg_race <- get_legend(lgd_race) | ||
|
||
# Plot ancestry legend | ||
lgd_anc <- ggplot(ancestry_df, aes(x = value, y = factor(predicted_ancestry, | ||
levels = c("SAS","EAS","EUR","AMR","AFR")), | ||
fill = predicted_ancestry)) + | ||
geom_tile(show.legend = T, col = "black", | ||
lwd = 0.5, linetype = 1) + | ||
#xlim() + | ||
scale_fill_manual(values = c("SAS" = "#D55E00", | ||
"EAS" = "#009E73", | ||
"EUR" = "#0072B2", | ||
"AFR" = "#E69F00", | ||
"AMR" = "#56B4E9"), breaks = c("EAS", "SAS", "AFR", "AMR", "EUR")) + | ||
labs(fill = "Predicted Ancestry") + | ||
theme_Publication() | ||
leg_anc <- get_legend(lgd_anc) | ||
|
||
# Plot ethnicity legend | ||
lgd_ethn <- ggplot(ethnicity_df, aes(x = value, y = factor(ethnicity, | ||
levels = c("Hispanic/Latino", | ||
"Not Hispanic/Latino", | ||
"Unknown")), | ||
fill = ethnicity)) + | ||
geom_tile(show.legend = T, col = "black", | ||
lwd = 0.5, linetype = 1) + | ||
#xlim() + | ||
scale_fill_manual(values = c("Hispanic/Latino" = "#56B4E9", | ||
"Not Hispanic/Latino" = "#0072B2", | ||
"Unknown" = "grey")) + | ||
labs(fill = "Reported Ethnicity") + | ||
theme_Publication() | ||
leg_ethn <- get_legend(lgd_ethn) | ||
|
||
leg <- plot_grid(leg_anc, leg_race, leg_ethn, | ||
nrow = 3, | ||
align = "v", | ||
rel_heights = c(1.5, 0.4, 1.5)) | ||
|
||
|
||
final_p <- plot_grid(p1, leg, | ||
nrow = 1, | ||
align = "none", | ||
axis = "t", | ||
rel_widths = c(1,0.45)) | ||
|
||
pdf(file.path(plots_dir, "ancestry-race-ethnicity-alluvial.pdf"), | ||
width = 10, height = 6) | ||
|
||
final_p | ||
|
||
dev.off() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file modified
BIN
+4.31 KB
(100%)
analyses/add-histologies/plots/ancestry-race-ethnicity-alluvial.pdf
Binary file not shown.
Binary file renamed
BIN
+3.76 KB
...plots/plot_group_by_ancestry_unk_race.pdf → ...ots/ethnicity_ancestry_ct_enr_heatmap.pdf
Binary file not shown.
Binary file modified
BIN
+2.03 KB
(130%)
analyses/add-histologies/plots/lgg_subtype_by_predicted_ancestry.pdf
Binary file not shown.
Binary file modified
BIN
+1.66 KB
(130%)
analyses/add-histologies/plots/lgg_tumor_location_by_predicted_ancestry.pdf
Binary file not shown.
Binary file modified
BIN
+1.54 KB
(130%)
analyses/add-histologies/plots/lgg_tumor_resection_by_predicted_ancestry.pdf
Binary file not shown.
Binary file modified
BIN
+0 Bytes
(100%)
analyses/add-histologies/plots/low_major_ancestry_heatmap.pdf
Binary file not shown.
Binary file modified
BIN
+0 Bytes
(100%)
analyses/add-histologies/plots/major_predicted_ancestry_hist.pdf
Binary file not shown.
Binary file modified
BIN
+14 Bytes
(100%)
analyses/add-histologies/plots/plot_group_ancestry_ct_enr_heatmap.pdf
Binary file not shown.
Binary file modified
BIN
+746 Bytes
(110%)
analyses/add-histologies/plots/plot_group_ancestry_enrichment_heatmap.pdf
Binary file not shown.
Binary file not shown.
Binary file modified
BIN
+2 Bytes
(100%)
analyses/add-histologies/plots/predicted_ancestry_counts_by_ethnicity.pdf
Binary file not shown.
Binary file modified
BIN
+0 Bytes
(100%)
analyses/add-histologies/plots/predicted_ancestry_counts_by_race.pdf
Binary file not shown.
Binary file modified
BIN
+3 Bytes
(100%)
analyses/add-histologies/plots/predicted_ancestry_percent_by_ethnicity.pdf
Binary file not shown.
Binary file modified
BIN
+0 Bytes
(100%)
analyses/add-histologies/plots/predicted_ancestry_percent_by_race.pdf
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
# Functions for generating count and enrichment heatmaps | ||
# | ||
# Ryan Corbett | ||
# | ||
# 2024 | ||
|
||
# Function to create frequency count and enrichment heatmap for two selected variables. Significance of enrichment is calculated using hypergeometric tests | ||
|
||
plot_enr <- function(df, var1, var2, | ||
var1_names, var2_names, | ||
padjust = FALSE) { | ||
|
||
enr <- matrix(0, nrow(unique(df[,var1])), | ||
nrow(unique(df[,var2])), | ||
dimnames = list(var1_names, | ||
var2_names)) | ||
pval <- enr | ||
ct <- enr | ||
|
||
# loop through cancer groups to calculate enrichment | ||
for (i in 1:nrow(enr)){ | ||
no_var1 <- sum(unlist(df[,var1]) == rownames(enr)[i]) | ||
for (j in 1:ncol(enr)){ | ||
no_var2 <- sum(unlist(df[,var2]) == colnames(enr)[j] & !is.na(unlist(df[,var2]))) | ||
no_var1_var2 <- sum(unlist(df[,var1]) == rownames(enr)[i] & unlist(df[,var2]) == colnames(enr)[j]) | ||
ct[i,j] <- no_var1_var2 | ||
enr[i,j] <- (no_var1_var2/no_var2)/(no_var1/nrow(df)) | ||
pval[i,j] <- phyper(no_var1_var2, no_var1, nrow(df) - no_var1, no_var2, lower.tail = F) | ||
} | ||
} | ||
|
||
if (padjust == TRUE) { | ||
|
||
fdr <- t(apply(pval, 1, function(x) p.adjust(x, "fdr"))) | ||
sig_mat <- ifelse(fdr < 0.05 & enr > 1 & ct > 1, "*", "") | ||
|
||
} else { | ||
|
||
sig_mat <- ifelse(pval < 0.05 & enr > 1 & ct > 1, "*", "") | ||
|
||
} | ||
|
||
fill_mat <- matrix(glue::glue("{round(enr, 1)}{sig_mat}"), | ||
nrow(enr), ncol(enr)) | ||
|
||
ct_enr_mat <- matrix(glue::glue("{ct}\n({fill_mat})"), | ||
nrow(ct), ncol(ct)) | ||
|
||
col_fun = colorRamp2(c(0, ceiling(max(enr))), c("white", "orangered")) | ||
|
||
region_ht <- Heatmap(enr, | ||
name = "Odds ratio", | ||
cluster_rows = F, | ||
cluster_columns = F, | ||
rect_gp = gpar(col = "black", lwd = 2), | ||
col = col_fun, | ||
cell_fun = function(j, i, x, y, width, height, fill) { | ||
grid.text(sprintf("%s", ct_enr_mat[i, j]), x, y, gp = gpar(fontsize = 12)) | ||
}) | ||
|
||
return(region_ht) | ||
|
||
} |