From 4de25fec9ce7a00a89e9d6a95cd99ebe8e06e826 Mon Sep 17 00:00:00 2001 From: aadamk Date: Tue, 17 Sep 2024 15:40:29 -0400 Subject: [PATCH] various updates --- .../dge_pathway_analysis/01-dge_analysis_deseq.R | 9 --------- analyses/dge_pathway_analysis/run_analysis.sh | 2 +- analyses/methylation_analysis/01-limma_analysis.R | 15 ++++++++++----- analyses/methylation_analysis/run_analysis.sh | 10 ++++++---- 4 files changed, 17 insertions(+), 19 deletions(-) diff --git a/analyses/dge_pathway_analysis/01-dge_analysis_deseq.R b/analyses/dge_pathway_analysis/01-dge_analysis_deseq.R index f029fa9..93ff42d 100644 --- a/analyses/dge_pathway_analysis/01-dge_analysis_deseq.R +++ b/analyses/dge_pathway_analysis/01-dge_analysis_deseq.R @@ -55,15 +55,6 @@ expr_mat <- expr_mat %>% dplyr::select(mm_clusters$sample_id) stopifnot(identical(colnames(expr_mat), mm_clusters$sample_id)) -# use DCGA to filter out low count, low variance features -expr_mat <- DGCA::filterGenes( - inputMat = expr_mat, - filterTypes = c("central", "dispersion"), - filterDispersionType = "cv", - filterDispersionPercentile = 0.2, - sequential = TRUE -) - # use a for-loop clusters <- unique(mm_clusters$mm_cluster) output_df <- data.frame() diff --git a/analyses/dge_pathway_analysis/run_analysis.sh b/analyses/dge_pathway_analysis/run_analysis.sh index b43ba86..7eaf7e3 100644 --- a/analyses/dge_pathway_analysis/run_analysis.sh +++ b/analyses/dge_pathway_analysis/run_analysis.sh @@ -11,7 +11,7 @@ script_directory="$(perl -e 'use File::Basename; cd "$script_directory" || exit # Define directory and input files -data_dir="../../data" +data_dir="/sbgenomics/project-files/opc-v15" gtf_file="${data_dir}/v15/gencode.v39.primary_assembly.annotation.gtf.gz" count_file="${data_dir}/v15/gene-counts-rsem-expected_count-collapsed.rds" cluster_file="../intNMF/results/intnmf_clusters.tsv" diff --git a/analyses/methylation_analysis/01-limma_analysis.R b/analyses/methylation_analysis/01-limma_analysis.R index 8b21381..94c93a3 100644 --- a/analyses/methylation_analysis/01-limma_analysis.R +++ b/analyses/methylation_analysis/01-limma_analysis.R @@ -21,10 +21,15 @@ output_dir <- opt$output_dir dir.create(output_dir, showWarnings = F, recursive = T) # read m-values -methyl_m_values_full <- readRDS(opt$methyl_mat) +methyl_m_values_full <- readRDS(opt$methyl_mat) %>% + dplyr::mutate('Probe_ID' = make.names(Probe_ID)) %>% + unique(by = 'Probe_ID') %>% + tibble::column_to_rownames('Probe_ID') # read annotation -methyl_annot_full <- data.table::fread(opt$methyl_annot) +methyl_annot_full <- data.table::fread(opt$methyl_annot) %>% + dplyr::mutate('Probe_ID' = make.names(Probe_ID)) %>% + unique(by = 'Probe_ID') # create generalized function for gene feature analysis run_analysis <- function(methyl_m_values_full, @@ -33,7 +38,7 @@ run_analysis <- function(methyl_m_values_full, prefix) { # filter only to gene feature using probe annotation file methyl_annot <- methyl_annot_full %>% - filter(Gene_Feature %in% gene_feature_filter) %>% + dplyr::filter(Gene_Feature %in% gene_feature_filter) %>% unique() methyl_m_values <- methyl_m_values_full %>% filter(rownames(methyl_m_values_full) %in% methyl_annot$Probe_ID) @@ -78,8 +83,8 @@ run_analysis <- function(methyl_m_values_full, # differentially expressed probes toptable_output <- limma::topTable(fit.reduced, coef = 2, n = Inf) sigCpGs <- toptable_output %>% - filter(adj.P.Val < 0.05) %>% - mutate(cluster = clusters[i]) %>% + dplyr::filter(adj.P.Val < 0.05) %>% + dplyr::mutate(cluster = clusters[i]) %>% rownames_to_column("probes") print(head(sigCpGs)) diff --git a/analyses/methylation_analysis/run_analysis.sh b/analyses/methylation_analysis/run_analysis.sh index 136ae13..5b51a9f 100644 --- a/analyses/methylation_analysis/run_analysis.sh +++ b/analyses/methylation_analysis/run_analysis.sh @@ -10,12 +10,14 @@ script_directory="$(perl -e 'use File::Basename; print dirname(abs_path(@ARGV[0]));' -- "$0")" cd "$script_directory" || exit +echo "R_MAX_VSIZE=100Gb" > .Renviron + # Define directory and input files -data_subset="../data_preparation/data" -count_file="${data_subset}/gene-counts-rsem-expected_count-collapsed.rds" -methyl_m_file="${data_subset}/methyl-m-values.rds" +data_dir="/sbgenomics/project-files/opc-v15" +count_file="${data_dir}/v15/gene-counts-rsem-expected_count-collapsed.rds" +methyl_m_file="${data_dir}/v15/methyl-m-values.rds" cluster_file="../intNMF/results/intnmf_clusters.tsv" -methyl_annot_file="../../data/v15/infinium.gencode.v39.probe.annotations.tsv.gz" +methyl_annot_file="${data_dir}/v15/infinium.gencode.v39.probe.annotations.tsv.gz" # 1) limma analysis to identify differentially expressed CpG sites Rscript --vanilla 01-limma_analysis.R \