4. Differential Gene Expression Analysis - TCGA - Pan-cancer.Rmd

---
title: "Differential Gene Expression Analysis"
subtitle: "Pan-cancer | Necroptosis, Ferroptosis & Pyroptosis"
author: 
  - Kim Williame Lee^[De La Salle University, Manila, Philippines, kim_leejra@dlsu.edu.ph]
  - Mark Edward M. Gonzales^[De La Salle University, Manila, Philippines, gonzales.markedward@gmail.com]
  - Dr. Anish M.S. Shrestha^[De La Salle University, Manila, Philippines, anish.shrestha@dlsu.edu.ph]
output: html_notebook
---

## I. Preliminaries

### Loading libraries

```{r, warning=FALSE, message=FALSE}
library("tidyverse")
library("tibble")
library("msigdbr")
library("ggplot2")
library("TCGAbiolinks")
library("RNAseqQC")
library("DESeq2")
library("ensembldb")
library("purrr")
library("magrittr")
library("vsn")
library("matrixStats")
library("dplyr")
library("grex")
```

## II. Downloading the TCGA gene expression data 

Create a function for downloading TCGA gene expression data. 

For more detailed documentation, refer to `2. Differential Gene Expression Analysis - TCGA.Rmd`.

```{r}
query_and_filter_samples <- function(project) {
  query_tumor <- GDCquery(
    project = project,
    data.category = "Transcriptome Profiling",
    data.type = "Gene Expression Quantification",
    experimental.strategy = "RNA-Seq",
    workflow.type = "STAR - Counts",
    access = "open",
    sample.type = "Primary Tumor"
  )
  tumor <- getResults(query_tumor)

  query_normal <- GDCquery(
    project = project,
    data.category = "Transcriptome Profiling",
    data.type = "Gene Expression Quantification",
    experimental.strategy = "RNA-Seq",
    workflow.type = "STAR - Counts",
    access = "open",
    sample.type = "Solid Tissue Normal"
  )
  normal <- getResults(query_normal)

  submitter_ids <- inner_join(tumor, normal, by = "cases.submitter_id") %>%
    dplyr::select(cases.submitter_id)
  tumor <- tumor %>%
    dplyr::filter(cases.submitter_id %in% submitter_ids$cases.submitter_id)
  normal <- normal %>%
    dplyr::filter(cases.submitter_id %in% submitter_ids$cases.submitter_id)

  samples <- rbind(tumor, normal)
  unique(samples$sample_type)

  query_project <- GDCquery(
    project = project,
    data.category = "Transcriptome Profiling",
    data.type = "Gene Expression Quantification",
    experimental.strategy = "RNA-Seq",
    workflow.type = "STAR - Counts",
    access = "open",
    sample.type = c("Solid Tissue Normal", "Primary Tumor"),
    barcode = as.list(samples$sample.submitter_id)
  )

  # If this is your first time running this notebook (i.e., you have not yet downloaded the results of the query in the previous block),
  # uncomment the line below

  # GDCdownload(query_project)

  return(list(samples = samples, query_project = query_project))
}
```

Download the TCGA gene expression data for different cancer types.

Refer to this link for the list of TCGA cancer type abbreviations: https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations

```{r, message=FALSE}
projects <- c(
  "TCGA-LUSC", "TCGA-COAD", "TCGA-KICH", "TCGA-KIRC", "TCGA-PRAD",
  "TCGA-BRCA", "TCGA-HNSC", "TCGA-KIRP", "TCGA-LIHC", "TCGA-STAD",
  "TCGA-THCA", "TCGA-BLCA", "TCGA-LUAD", "TCGA-ESCA"
)

samples <- list()
project_data <- list()

for (project in projects) {
  result <- query_and_filter_samples(project)

  samples[[project]] <- result$samples
  project_data[[project]] <- result$query_project
}
```


Running the code block above should generate and populate a directory named `GDCdata`.

## III. Data preprocessing

Construct the RNA-seq count matrix for each cancer type.

```{r, echo = TRUE, message = FALSE, results="hide"}
tcga_data <- list()
tcga_matrix <- list()

for (project in projects) {
  tcga_data[[project]] <- GDCprepare(project_data[[project]], summarizedExperiment = TRUE)
}
```

```{r}
for (project in projects) {
  count_matrix <- assay(tcga_data[[project]], "unstranded")

  # Remove duplicate entries
  count_matrix_df <- data.frame(count_matrix)
  count_matrix_df <- count_matrix_df[!duplicated(count_matrix_df), ]
  count_matrix <- data.matrix(count_matrix_df)
  rownames(count_matrix) <- cleanid(rownames(count_matrix))
  count_matrix <- count_matrix[!(duplicated(rownames(count_matrix)) | duplicated(rownames(count_matrix), fromLast = TRUE)), ]

  tcga_matrix[[project]] <- count_matrix
}
```
Format the `samples` table so that it can be fed as input to DESeq2.

```{r}
for (project in projects) {
  rownames(samples[[project]]) <- samples[[project]]$cases
  samples[[project]] <- samples[[project]] %>%
    dplyr::select(case = "cases.submitter_id", type = "sample_type")
  samples[[project]]$type <- str_replace(samples[[project]]$type, "Solid Tissue Normal", "normal")
  samples[[project]]$type <- str_replace(samples[[project]]$type, "Primary Tumor", "tumor")
}
```

DESeq2 requires the row names of `samples` should be identical to the column names of `count_matrix`.

```{r, echo = TRUE, results="hide"}
for (project in projects) {
  colnames(tcga_matrix[[project]]) <- gsub(x = colnames(tcga_matrix[[project]]), pattern = "\\.", replacement = "-")
  tcga_matrix[[project]] <- tcga_matrix[[project]][, rownames(samples[[project]])]

  # Sanity check
  print(all(colnames(tcga_matrix[[project]]) == rownames(samples[[project]])))
}
```

## IV. Differential gene expression analysis

References: 

- Official documentation: https://www.bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html
- Good balance of theory and hands-on: https://hbctraining.github.io/DGE_workshop/lessons/04_DGE_DESeq2_analysis.html
- Quality control: https://cran.r-project.org/web/packages/RNAseqQC/vignettes/introduction.html

Construct the `DESeqDataSet` object for each cancer type.

```{r}
dds_results <- list()

for (project in projects) {
  dds_results[[project]] <- DESeqDataSetFromMatrix(
    countData = tcga_matrix[[project]],
    colData = samples[[project]],
    design = ~type
  )
}
```


### Regulated Cell Death

Refer to `1. Exploratory Data Analysis - MSigDB Gene Sets + GTEx TPM.Rmd` for more detailed documentation on obtaining the gene sets.

#### Necroptosis

Fetch the necroptosis gene set.

```{r}
necroptosis.genes <- msigdbr(species = "human", category = "C5", subcategory = "GO:BP") %>%
  dplyr::filter(gs_name == "GOBP_NECROPTOTIC_SIGNALING_PATHWAY")
necroptosis.genes
```
Filter the genes to include only those in the necroptosis gene set.

```{r, echo = TRUE, results="hide"}
tcga_necroptosis <- list()

for (project in projects) {
  rownames(necroptosis.genes) <- necroptosis.genes$ensembl_gene
  tcga_necroptosis[[project]] <- tcga_matrix[[project]][rownames(tcga_matrix[[project]]) %in% necroptosis.genes$ensembl_gene, ]
  tcga_necroptosis[[project]] <- tcga_necroptosis[[project]][, rownames(samples[[project]])]

  # Check if all samples in the counts dataframe are in the samples dataframe
  print(all(colnames(tcga_necroptosis[[project]]) == rownames((samples[[project]]))))
}
```

Perform differential gene expression analysis.

```{r}
dds_necroptosis <- list()
res_necroptosis <- list()

for (project in projects) {
  print(project)
  print("=============")

  dds <- DESeqDataSetFromMatrix(
    countData = tcga_necroptosis[[project]],
    colData = samples[[project]],
    design = ~type
  )
  dds <- filter_genes(dds, min_count = 10)
  dds$type <- relevel(dds$type, ref = "normal")
  dds_necroptosis[[project]] <- DESeq(dds)
  res_necroptosis[[project]] <- results(dds_necroptosis[[project]])
}
```

Prettify the display of results.

```{r}
deseq.bbl.data <- list()

for (project in projects) {
  deseq.results <- res_necroptosis[[project]]
  deseq.bbl.data[[project]] <- data.frame(
    row.names = rownames(deseq.results),
    baseMean = deseq.results$baseMean,
    log2FoldChange = deseq.results$log2FoldChange,
    lfcSE = deseq.results$lfcSE,
    stat = deseq.results$stat,
    pvalue = deseq.results$pvalue,
    padj = deseq.results$padj,
    cancer_type = project,
    gene_symbol = necroptosis.genes[rownames(deseq.results), "gene_symbol"]
  )
}

deseq.bbl.data.combined <- bind_rows(deseq.bbl.data)
deseq.bbl.data.combined <- dplyr::filter(deseq.bbl.data.combined, abs(log2FoldChange) >= 1.5 & padj < 0.05)
deseq.bbl.data.combined
```

Plot the results.

```{r}
sizes <- c("<10^-15" = 4, "10^-10" = 3, "10^-5" = 2, "0.05" = 1)

deseq.bbl.data.combined <- deseq.bbl.data.combined %>%
  mutate(fdr_category = cut(padj,
    breaks = c(-Inf, 1e-15, 1e-10, 1e-5, 0.05),
    labels = c("<10^-15", "10^-10", "10^-5", "0.05"),
    right = FALSE
  ))

top_genes <- deseq.bbl.data.combined %>%
  group_by(cancer_type) %>%
  mutate(rank = rank(-abs(log2FoldChange))) %>%
  dplyr::filter(rank <= 10) %>%
  ungroup()

ggplot(top_genes, aes(y = cancer_type, x = gene_symbol, size = fdr_category, fill = log2FoldChange)) +
  geom_point(alpha = 0.5, shape = 21, color = "black") +
  scale_size_manual(values = sizes) +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", limits = c(min(deseq.bbl.data.combined$log2FoldChange), max(deseq.bbl.data.combined$log2FoldChange))) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 9, angle = 90, hjust = 1)
  ) +
  theme(legend.position = "bottom") +
  theme(legend.position = "bottom") +
  labs(size = "FDR", fill = "log2 FC", y = "Cancer type", x = "Gene")
```

#### Ferroptosis

Fetch the ferroptosis gene set.

```{r}
ferroptosis.genes <- msigdbr(species = "human", category = "C2", subcategory = "CP:WIKIPATHWAYS") %>%
  dplyr::filter(gs_name == "WP_FERROPTOSIS")
ferroptosis.genes
```

Filter the genes to include only those in the ferroptosis gene set.

```{r, echo = TRUE, results="hide"}
tcga_ferroptosis <- list()

for (project in projects) {
  rownames(ferroptosis.genes) <- ferroptosis.genes$ensembl_gene
  tcga_ferroptosis[[project]] <- tcga_matrix[[project]][rownames(tcga_matrix[[project]]) %in% ferroptosis.genes$ensembl_gene, ]
  tcga_ferroptosis[[project]] <- tcga_ferroptosis[[project]][, rownames(samples[[project]])]

  # Check if all samples in the counts dataframe are in the samples dataframe
  print(all(colnames(tcga_ferroptosis[[project]]) == rownames((samples[[project]]))))
}
```
Perform differential gene expression analysis.

```{r}
dds_ferroptosis <- list()
res_ferroptosis <- list()

for (project in projects) {
  print(project)
  print("=============")

  dds <- DESeqDataSetFromMatrix(
    countData = tcga_ferroptosis[[project]],
    colData = samples[[project]],
    design = ~type
  )
  dds <- filter_genes(dds, min_count = 10)
  dds$type <- relevel(dds$type, ref = "normal")
  dds_ferroptosis[[project]] <- DESeq(dds)
  res_ferroptosis[[project]] <- results(dds_ferroptosis[[project]])
}
```

Prettify the display of results.

```{r}
deseq.bbl.data <- list()

for (project in projects) {
  deseq.results <- res_ferroptosis[[project]]
  deseq.bbl.data[[project]] <- data.frame(
    row.names = rownames(deseq.results),
    baseMean = deseq.results$baseMean,
    log2FoldChange = deseq.results$log2FoldChange,
    lfcSE = deseq.results$lfcSE,
    stat = deseq.results$stat,
    pvalue = deseq.results$pvalue,
    padj = deseq.results$padj,
    cancer_type = project,
    gene_symbol = ferroptosis.genes[rownames(deseq.results), "gene_symbol"]
  )
}

deseq.bbl.data.combined <- bind_rows(deseq.bbl.data)
deseq.bbl.data.combined <- dplyr::filter(deseq.bbl.data.combined, abs(log2FoldChange) >= 1.5 & padj < 0.05)
deseq.bbl.data.combined
```

Plot the results.

```{r}
deseq.bbl.data.combined <- deseq.bbl.data.combined %>%
  mutate(fdr_category = cut(padj,
    breaks = c(-Inf, 1e-15, 1e-10, 1e-5, 0.05),
    labels = c("<10^-15", "10^-10", "10^-5", "0.05"),
    right = FALSE
  ))

top_genes <- deseq.bbl.data.combined %>%
  group_by(cancer_type) %>%
  mutate(rank = rank(-abs(log2FoldChange))) %>%
  dplyr::filter(rank <= 10) %>%
  ungroup()

ggplot(top_genes, aes(y = cancer_type, x = gene_symbol, size = fdr_category, fill = log2FoldChange)) +
  geom_point(alpha = 0.5, shape = 21, color = "black") +
  scale_size_manual(values = sizes) +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", limits = c(min(deseq.bbl.data.combined$log2FoldChange), max(deseq.bbl.data.combined$log2FoldChange))) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 9, angle = 90, hjust = 1)
  ) +
  theme(legend.position = "bottom") +
  theme(legend.position = "bottom") +
  labs(size = "FDR", fill = "log2 FC", y = "Cancer type", x = "Gene")
```

#### Pyroptosis

Fetch the pyroptosis gene set.

```{r}
pyroptosis.genes <- msigdbr(species = "human", category = "C2", subcategory = "CP:REACTOME") %>%
  dplyr::filter(gs_name == "REACTOME_PYROPTOSIS")
pyroptosis.genes
```

Filter the genes to include only those in the pyroptosis gene set.

```{r, echo = TRUE, results="hide"}
tcga_pyroptosis <- list()

for (project in projects) {
  rownames(pyroptosis.genes) <- pyroptosis.genes$ensembl_gene
  tcga_pyroptosis[[project]] <- tcga_matrix[[project]][rownames(tcga_matrix[[project]]) %in% pyroptosis.genes$ensembl_gene, ]
  tcga_pyroptosis[[project]] <- tcga_pyroptosis[[project]][, rownames(samples[[project]])]

  # Check if all samples in the counts dataframe are in the samples dataframe
  print(all(colnames(tcga_pyroptosis[[project]]) == rownames((samples[[project]]))))
}
```
Perform differential gene expression analysis.

```{r}
dds_pyroptosis <- list()
res_pyroptosis <- list()

for (project in projects) {
  print(project)
  print("=============")

  dds <- DESeqDataSetFromMatrix(
    countData = tcga_pyroptosis[[project]],
    colData = samples[[project]],
    design = ~type
  )
  dds <- filter_genes(dds, min_count = 10)
  dds$type <- relevel(dds$type, ref = "normal")
  dds_pyroptosis[[project]] <- DESeq(dds)
  res_pyroptosis[[project]] <- results(dds_pyroptosis[[project]])
}
```

Prettify the display of results.

```{r}
deseq.bbl.data <- list()

for (project in projects) {
  deseq.results <- res_pyroptosis[[project]]
  deseq.bbl.data[[project]] <- data.frame(
    row.names = rownames(deseq.results),
    baseMean = deseq.results$baseMean,
    log2FoldChange = deseq.results$log2FoldChange,
    lfcSE = deseq.results$lfcSE,
    stat = deseq.results$stat,
    pvalue = deseq.results$pvalue,
    padj = deseq.results$padj,
    cancer_type = project,
    gene_symbol = pyroptosis.genes[rownames(deseq.results), "gene_symbol"]
  )
}

deseq.bbl.data.combined <- bind_rows(deseq.bbl.data)
deseq.bbl.data.combined <- dplyr::filter(deseq.bbl.data.combined, abs(log2FoldChange) >= 1.5 & padj < 0.05)
deseq.bbl.data.combined
```
Plot the results.

```{r}
deseq.bbl.data.combined <- deseq.bbl.data.combined %>%
  mutate(fdr_category = cut(padj,
    breaks = c(-Inf, 1e-15, 1e-10, 1e-5, 0.05),
    labels = c("<10^-15", "10^-10", "10^-5", "0.05"),
    right = FALSE
  ))

top_genes <- deseq.bbl.data.combined %>%
  group_by(cancer_type) %>%
  mutate(rank = rank(-abs(log2FoldChange))) %>%
  dplyr::filter(rank <= 10) %>%
  ungroup()

ggplot(top_genes, aes(y = cancer_type, x = gene_symbol, size = fdr_category, fill = log2FoldChange)) +
  geom_point(alpha = 0.5, shape = 21, color = "black") +
  scale_size_manual(values = sizes) +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", limits = c(min(deseq.bbl.data.combined$log2FoldChange), max(deseq.bbl.data.combined$log2FoldChange))) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 9, angle = 90, hjust = 1)
  ) +
  theme(legend.position = "bottom") +
  theme(legend.position = "bottom") +
  labs(size = "FDR", fill = "log2 FC", y = "Cancer type", x = "Gene")
```