Merge pull request #55 from d3b-center/rjcorb/51-race-ethn-heatmaps

Generate summary enrichment heatmaps
d3b-center · Feb 29, 2024 · f91742a · f91742a
2 parents a6ae544 + c1a141e
commit f91742a
Show file tree

Hide file tree

Showing 21 changed files with 311 additions and 294 deletions.
diff --git a/analyses/add-histologies/02-summary_stats.R b/analyses/add-histologies/02-summary_stats.R
diff --git a/analyses/add-histologies/03-alluvial_plot.R b/analyses/add-histologies/03-alluvial_plot.R
@@ -0,0 +1,184 @@
+# R Corbett 2023
+#
+# Generate alluvial plot for PBTA ancestry project
+
+# load libraries and set directories
+library(data.table)
+library(tidyverse)
+# library(ComplexHeatmap)
+# library(RColorBrewer)
+# library(circlize)
+library(colorblindr)
+library(ggpubr)
+library(ggalluvial)
+library(cowplot)
+
+root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
+
+# Set file paths
+data_dir <- file.path(root_dir, "data")
+analysis_dir <- file.path(root_dir, "analyses", "add-histologies")
+results_dir <- file.path(analysis_dir, "results")
+input_dir <- file.path(analysis_dir, "input")
+plots_dir <- file.path(analysis_dir, "plots")
+
+# call plot theme 
+source(file.path(root_dir, "figures", "theme.R"))
+
+# set file paths
+ancestry_file <- file.path(results_dir, "merged_ancestry_histology_data.tsv")
+
+# wrangle data
+ancestry <- read_tsv(ancestry_file)
+
+# consolidate unreported/unknown reported races and ethnicities 
+ancestry <- ancestry %>%
+  dplyr::mutate(race = case_when(
+    race %in% c("Not Reported", "Other", "Reported Unknown", "Not Reported/Unknown", "Unknown") | is.na(race) ~ "Race Unknown",
+    race == "White/Caucasian" ~ "White",
+    race == "American Indian or Alaska Native" ~ "AI/AN",
+    race == "Native Hawaiian or Other Pacific Islander" ~ "NHPI",
+    race == "Black or African American" ~ "Black/Afr. Am.",
+    race == "More Than One Race" ~ ">1 Race",
+    TRUE ~ race
+  )) %>%
+  mutate(ethnicity = case_when(
+    grepl("Reported|Available|Unavailable", ethnicity) | is.na(ethnicity) ~ "Unknown",
+    grepl("Non-Hispanic|Not Hispanic", ethnicity) ~ "Not Hispanic/Latino",
+    TRUE ~ "Hispanic/Latino"
+  ))
+
+
+# Create data frame for alluvial plots, including only predicted_ancestry, race, and ethnicity columns
+alluvial_df <- as.data.frame(table(ancestry$predicted_ancestry, ancestry$race, ancestry$ethnicity)) %>% 
+  # format for alluvial plot
+  dplyr::rename(predicted_ancestry = Var1, 
+                race = Var2,
+                ethnicity = Var3) %>%
+  to_lodes_form(axes = 1:3) %>% 
+  dplyr::rename(Group = stratum) %>% 
+  mutate(Group = factor(Group, levels = c("EAS", "SAS", "AFR", "AMR", "EUR",
+                                          "Asian", "Black/Afr. Am.", "AI/AN", 
+                                          "NHPI", "White", ">1 Race",
+                                          "Race Unknown", 
+                                          "Hispanic/Latino",
+                                          "Not Hispanic/Latino",
+                                          "Unknown"))) %>% 
+  mutate(race = case_when(Group %in% c("Asian", "Black/Afr. Am.", "AI/AN", 
+                                       "NHPI", "White", ">1 Race",
+                                       "Race Unknown") ~ Group, 
+                          TRUE ~ NA), 
+         predicted_ancestry = case_when(Group %in% c("EAS", "SAS", "AFR", "EUR", "AMR") ~ Group, 
+                                        TRUE ~ NA),
+         ethnicity = case_when(Group %in% c("Hispanic/Latino",
+                                            "Not Hispanic/Latino",
+                                            "Unknown") ~ Group,
+                               TRUE ~ NA))
+
+# Generate alluvial plot
+p1 <- ggplot(alluvial_df, aes(y = Freq, stratum = Group, alluvium = alluvium, x = x, fill = Group)) + 
+  geom_alluvium(show.legend = F) + 
+  geom_stratum(show.legend = F) +
+  scale_fill_manual(values = c("Asian" = "#009E73", "SAS" = "#D55E00", "EAS" = "#009E73",
+                               "White" = "#0072B2", "EUR" = "#0072B2",
+                               "Black/Afr. Am." = "#E69F00", "AFR" = "#E69F00",
+                               "NHPI" = "skyblue", "AMR" = "#56B4E9",
+                               "AI/AN" = "#56B4E9",
+                               "Race Unknown" = "grey", ">1 Race" = "brown",
+                               "Hispanic/Latino" = "#56B4E9",
+                               "Not Hispanic/Latino" = "#0072B2",
+                               "Unknown" = "grey")) +
+  xlab("") + 
+  ylab("Number of Patients") +
+  scale_x_discrete(labels = c("predicted ancestry", "reported race", "reported ethnicity")) + 
+  theme_Publication()
+
+# Create separate ancestry, race, and ethnicity dfs for legend generation
+race_df <- data.frame(race = c("Asian", "Black/Afr. Am.", "AI/AN", 
+                               "NHPI", "White", ">1 Race",
+                               "Race Unknown"), 
+                      value = 1)
+ancestry_df <- data.frame(predicted_ancestry = c("EAS", "SAS", "AFR", "EUR", "AMR"), 
+                          value = 1)
+ethnicity_df <- data.frame(ethnicity = c("Hispanic/Latino",
+                                         "Not Hispanic/Latino",
+                                         "Unknown"), 
+                           value = 1)
+
+# plot race legend
+lgd_race <- ggplot(race_df, aes(x = value, y = factor(race, levels = c("Race Unknown", 
+                                                                       ">1 Race", "White", 
+                                                                       "AI/AN",
+                                                                       "NHPI", 
+                                                                       "Black/Afr. Am.", "Asian")), 
+                                fill = race)) + 
+  geom_tile(show.legend = T, color = "black",
+            lwd = 0.5, linetype = 1) + 
+  scale_fill_manual(values = c("Asian" = "#009E73",  
+                               "White" = "#0072B2", 
+                               "Black/Afr. Am." = "#E69F00", 
+                               "AI/AN" = "#56B4E9",
+                               "NHPI" = "skyblue", 
+                               ">1 Race" = "brown",
+                               "Race Unknown" = "grey"),
+                    breaks = c("Asian",  
+                               "Black/Afr. Am.", 
+                               "AI/AN",
+                               "NHPI", 
+                               "White", 
+                               ">1 Race",
+                               "Race Unknown")) +
+  labs(fill = "Reported Race") +
+  theme_Publication()
+leg_race <- get_legend(lgd_race)
+
+# Plot ancestry legend
+lgd_anc <- ggplot(ancestry_df, aes(x = value, y = factor(predicted_ancestry, 
+                                                         levels = c("SAS","EAS","EUR","AMR","AFR")), 
+                                   fill = predicted_ancestry)) + 
+  geom_tile(show.legend = T, col = "black",
+            lwd = 0.5, linetype = 1) + 
+  #xlim() + 
+  scale_fill_manual(values = c("SAS" = "#D55E00", 
+                               "EAS" = "#009E73",
+                               "EUR" = "#0072B2", 
+                               "AFR" = "#E69F00", 
+                               "AMR" = "#56B4E9"), breaks = c("EAS", "SAS", "AFR", "AMR", "EUR")) +
+  labs(fill = "Predicted Ancestry") +
+  theme_Publication()
+leg_anc <- get_legend(lgd_anc)
+
+# Plot ethnicity legend
+lgd_ethn <- ggplot(ethnicity_df, aes(x = value, y = factor(ethnicity, 
+                                                           levels = c("Hispanic/Latino",
+                                                                      "Not Hispanic/Latino",
+                                                                      "Unknown")), 
+                                     fill = ethnicity)) + 
+  geom_tile(show.legend = T, col = "black",
+            lwd = 0.5, linetype = 1) + 
+  #xlim() + 
+  scale_fill_manual(values = c("Hispanic/Latino" = "#56B4E9",
+                               "Not Hispanic/Latino" = "#0072B2",
+                               "Unknown" = "grey")) +
+  labs(fill = "Reported Ethnicity") +
+  theme_Publication()
+leg_ethn <- get_legend(lgd_ethn)
+
+leg <- plot_grid(leg_anc, leg_race, leg_ethn,
+                 nrow = 3,
+                 align = "v",
+                 rel_heights = c(1.5, 0.4, 1.5))
+
+
+final_p <- plot_grid(p1, leg,
+                     nrow = 1,
+                     align = "none",
+                     axis = "t",
+                     rel_widths = c(1,0.45))
+
+pdf(file.path(plots_dir, "ancestry-race-ethnicity-alluvial.pdf"),
+    width = 10, height = 6)
+
+final_p
+
+dev.off()
diff --git a/analyses/add-histologies/README.md b/analyses/add-histologies/README.md
@@ -14,6 +14,8 @@ This analysis merges patient ancestry prediction data with OpenPedCan histologie
 
 - `02-summary_stats.R`; plots summary statistics including PCA plots, concordance between ancestry and reported race and ethnicity, plot group by ancestry, etc. 
 
+- `03-aluvial_plot.R`; generates alluvial plot showing relationship between genetic ancestry superpopulation, reported race, and reported ethnicity across cohort
+
 - `input/` files: 
   - `plot-mapping.tsv` table of `plot_group` assignments by `broad histology` and `cancer_group`
 
@@ -23,6 +25,7 @@ This analysis merges patient ancestry prediction data with OpenPedCan histologie
 - `plots/` files: 
   - `ancestry-pcs.pdf`; plot of somalier PCs 1-4
   - `ancestry-race-ethnicity-alluvial.pdf`; alluvial plot of cohort predicted ancestry, reported race, and reported ethnicity
+  - `ethnicity_ancestry_ct_enr_heatmap.pdf`; enrichment heatmap of genetic ancestry subpopulations among reported ethnicity groups
   - `lgg_subtype_by_predicted_ancestry.pdf`
   - `lgg_tumor_location_by_predicted_ancestry.pdf`
   - `lgg_tumor_resection_by_predicted_ancestry.pdf`
@@ -31,9 +34,9 @@ This analysis merges patient ancestry prediction data with OpenPedCan histologie
   - `plot_group_ancestry_ct_enr_heatmap.pdf`; 
   - `plot_group_ancestry_enrichment_heatmap.pdf`
   - `plot_group_by_ancestry.pdf`; plot group count by predicted ancestry 
-  - `plot_group_by_ancestry_unk_race.pdf`; plot group count for only those patients of unknown race
   - `predicted_ancestry_counts_by_ethnicity.pdf`
   - `predicted_ancestry_counts_by_race.pdf`
   - `predicted_ancestry_percent_by_ethnicity.pdf`
   - `predicted_ancestry_percent_by_race.pdf`
+  - `race_ancestry_ct_enr_heatmap.pdf`; enrichment heatmap of genetic ancestry subpopulations among reported race groups
 
diff --git a/analyses/add-histologies/plots/ancestry-pcs.pdf b/analyses/add-histologies/plots/ancestry-pcs.pdf
diff --git a/analyses/add-histologies/plots/ancestry-race-ethnicity-alluvial.pdf b/analyses/add-histologies/plots/ancestry-race-ethnicity-alluvial.pdf
diff --git a/...plots/plot_group_by_ancestry_unk_race.pdf → ...ots/ethnicity_ancestry_ct_enr_heatmap.pdf b/...plots/plot_group_by_ancestry_unk_race.pdf → ...ots/ethnicity_ancestry_ct_enr_heatmap.pdf
diff --git a/analyses/add-histologies/plots/lgg_subtype_by_predicted_ancestry.pdf b/analyses/add-histologies/plots/lgg_subtype_by_predicted_ancestry.pdf
diff --git a/analyses/add-histologies/plots/lgg_tumor_location_by_predicted_ancestry.pdf b/analyses/add-histologies/plots/lgg_tumor_location_by_predicted_ancestry.pdf
diff --git a/analyses/add-histologies/plots/lgg_tumor_resection_by_predicted_ancestry.pdf b/analyses/add-histologies/plots/lgg_tumor_resection_by_predicted_ancestry.pdf
diff --git a/analyses/add-histologies/plots/low_major_ancestry_heatmap.pdf b/analyses/add-histologies/plots/low_major_ancestry_heatmap.pdf
diff --git a/analyses/add-histologies/plots/major_predicted_ancestry_hist.pdf b/analyses/add-histologies/plots/major_predicted_ancestry_hist.pdf
diff --git a/analyses/add-histologies/plots/plot_group_ancestry_ct_enr_heatmap.pdf b/analyses/add-histologies/plots/plot_group_ancestry_ct_enr_heatmap.pdf
diff --git a/analyses/add-histologies/plots/plot_group_ancestry_enrichment_heatmap.pdf b/analyses/add-histologies/plots/plot_group_ancestry_enrichment_heatmap.pdf
diff --git a/analyses/add-histologies/plots/plot_group_by_ancestry.pdf b/analyses/add-histologies/plots/plot_group_by_ancestry.pdf
diff --git a/analyses/add-histologies/plots/predicted_ancestry_counts_by_ethnicity.pdf b/analyses/add-histologies/plots/predicted_ancestry_counts_by_ethnicity.pdf
diff --git a/analyses/add-histologies/plots/predicted_ancestry_counts_by_race.pdf b/analyses/add-histologies/plots/predicted_ancestry_counts_by_race.pdf
diff --git a/analyses/add-histologies/plots/predicted_ancestry_percent_by_ethnicity.pdf b/analyses/add-histologies/plots/predicted_ancestry_percent_by_ethnicity.pdf
diff --git a/analyses/add-histologies/plots/predicted_ancestry_percent_by_race.pdf b/analyses/add-histologies/plots/predicted_ancestry_percent_by_race.pdf
diff --git a/analyses/add-histologies/plots/race_ancestry_ct_enr_heatmap.pdf b/analyses/add-histologies/plots/race_ancestry_ct_enr_heatmap.pdf
diff --git a/analyses/add-histologies/run_module.sh b/analyses/add-histologies/run_module.sh
@@ -9,3 +9,5 @@ Rscript -e "rmarkdown::render('01-add_histologies.Rmd')"
 # Get summary stats
 Rscript --vanilla 02-summary_stats.R
 
+# Generate alluvial plot
+Rscript 03-alluvial_plot.R
diff --git a/analyses/add-histologies/util/heatmap_function.R b/analyses/add-histologies/util/heatmap_function.R
@@ -0,0 +1,63 @@
+# Functions for generating count and enrichment heatmaps
+#
+# Ryan Corbett
+#
+# 2024
+
+# Function to create frequency count and enrichment heatmap for two selected variables. Significance of enrichment is calculated using hypergeometric tests
+
+plot_enr <- function(df, var1, var2,
+                     var1_names, var2_names,
+                     padjust = FALSE) {
+
+  enr <- matrix(0, nrow(unique(df[,var1])),
+                        nrow(unique(df[,var2])),
+                dimnames = list(var1_names,
+                                var2_names))
+  pval <- enr
+  ct <- enr
+
+  # loop through cancer groups to calculate enrichment 
+  for (i in 1:nrow(enr)){
+    no_var1 <- sum(unlist(df[,var1]) == rownames(enr)[i])
+    for (j in 1:ncol(enr)){
+      no_var2 <- sum(unlist(df[,var2]) == colnames(enr)[j] & !is.na(unlist(df[,var2])))
+      no_var1_var2 <- sum(unlist(df[,var1]) == rownames(enr)[i] & unlist(df[,var2]) == colnames(enr)[j])
+      ct[i,j] <- no_var1_var2
+      enr[i,j] <- (no_var1_var2/no_var2)/(no_var1/nrow(df))
+      pval[i,j] <- phyper(no_var1_var2, no_var1, nrow(df) - no_var1, no_var2, lower.tail = F)
+    }
+  }
+
+  if (padjust == TRUE) {
+
+    fdr <- t(apply(pval, 1, function(x) p.adjust(x, "fdr")))
+    sig_mat <- ifelse(fdr < 0.05 & enr > 1 & ct > 1, "*", "")
+
+  } else {
+
+    sig_mat <- ifelse(pval < 0.05 & enr > 1 & ct > 1, "*", "")
+
+  }
+
+  fill_mat <- matrix(glue::glue("{round(enr, 1)}{sig_mat}"), 
+                     nrow(enr), ncol(enr))
+
+  ct_enr_mat <- matrix(glue::glue("{ct}\n({fill_mat})"),
+                       nrow(ct), ncol(ct))
+
+  col_fun = colorRamp2(c(0, ceiling(max(enr))), c("white", "orangered"))
+
+  region_ht <- Heatmap(enr,
+                       name = "Odds ratio",
+                       cluster_rows = F,
+                       cluster_columns = F,
+                       rect_gp = gpar(col = "black", lwd = 2),
+                       col = col_fun,
+                       cell_fun = function(j, i, x, y, width, height, fill) {
+                         grid.text(sprintf("%s", ct_enr_mat[i, j]), x, y, gp = gpar(fontsize = 12))
+                       })
+
+  return(region_ht)
+
+}