QC refactor mvp

ionox0 committed May 1, 2018
1 parent 554b1f2 commit 82f7346
Showing 124 changed files with 143 additions and 6,355,018 deletions.
7 changes: 4 additions & 3 deletions README.md
@@ -106,9 +106,10 @@ The same steps for testing can be used for a real run.
Note that there are several requirements when running on your own data:
1. The fields found in the sample manifest should match the examples in `test/test_data`
2. The sample IDs in the manifest must match somewhere in the fastq file names from the `-d` data folder
3. The `SAMPLE_CLASS` column of the manifest must contain only the values "Tumor" or "Normal"
4. Each "Tumor" sample must have at least one associated "Normal" sample
5. Each sample folder in the `-d` data folder must have three files that match the following:
3. The sample IDs in the manifest must match somewhere in the paths to the SampleSheet.csv files
4. The `SAMPLE_CLASS` column of the manifest must contain only the values "Tumor" or "Normal"
5. Each "Tumor" sample must have at least one associated "Normal" sample
6. Each sample folder in the `-d` data folder must have three files matching the following (a quick check is sketched below):
```
'_R1_001.fastq.gz'
'_R2_001.fastq.gz'
```
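Below is a minimal sketch (not part of the pipeline) of how a sample folder could be checked against these requirements; the helper name is illustrative, and the third pattern is left as a placeholder because it is truncated in the hunk above.

```python
import glob
import os

# The first two patterns come from the README list above; the third required
# file is truncated in this hunk, so it is left as a placeholder here.
REQUIRED_PATTERNS = [
    '*_R1_001.fastq.gz',
    '*_R2_001.fastq.gz',
    # '<third required pattern from the README>',
]

def sample_folder_is_complete(sample_folder):
    """Return True if the folder has at least one match for every required pattern."""
    return all(glob.glob(os.path.join(sample_folder, pattern))
               for pattern in REQUIRED_PATTERNS)
```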
74 changes: 40 additions & 34 deletions python_tools/constants.py
@@ -111,48 +111,44 @@
# Constants for QC files #
##########################

# Waltz Metrics Files Constants
# WALTZ Metrics Files Constants

# Use same string for sample ID everywhere
SAMPLE_ID_COLUMN = TITLE_FILE__SAMPLE_ID_COLUMN

TOTAL_MAPPED_COLUMN = 'total_mapped'
UNIQUE_MAPPED_COLUMN = 'unique_mapped'
TOTAL_READS_COLUMN = 'total_reads'
UNMAPPED_READS_COLUMN = 'unmapped_reads'
DUPLICATE_FRACTION_COLUMN = 'duplicate_fraction'
TOTAL_ON_TARGET_COLUMN = 'total_on_target'
UNIQUE_ON_TARGET_COLUMN = 'unique_on_target'
TOTAL_ON_TARGET_FRACTION_COLUMN = 'total_on_target_fraction'
UNIQUE_ON_TARGET_FRACTION_COLUMN = 'unique_on_target_fraction'

TOTAL_OFF_TARGET_FRACTION_COLUMN = 'total_off_target_fraction'

# File suffixes
WALTZ_READ_COUNTS_FILENAME_SUFFIX = '.read-counts'
WALTZ_FRAGMENT_SIZES_FILENAME_SUFFIX = '.fragment-sizes'
WALTZ_INTERVALS_FILENAME_SUFFIX = '-intervals.txt'
WALTZ_INTERVALS_WITHOUT_DUPLICATES_FILENAME_SUFFIX = '-intervals-without-duplicates.txt'
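These suffix constants are what the downstream QC steps use to pick out per-sample Waltz outputs from a flat file list (the same substring filter appears in `aggregate_bam_metrics.py` further down). A small sketch with hypothetical filenames:

```python
# Hypothetical Waltz output names, used only to illustrate the suffix filters.
waltz_outputs = [
    'Sample-1.read-counts',
    'Sample-1.fragment-sizes',
    'Sample-1-intervals.txt',
    'Sample-1-intervals-without-duplicates.txt',
]

read_count_files = [f for f in waltz_outputs if WALTZ_READ_COUNTS_FILENAME_SUFFIX in f]
intervals_files = [f for f in waltz_outputs if WALTZ_INTERVALS_FILENAME_SUFFIX in f]
# intervals_files keeps only 'Sample-1-intervals.txt'; the without-duplicates
# file does not contain the plain '-intervals.txt' substring.
```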

# todo
WALTZ_INTERVALS_FILE_HEADER = ['chromosome', 'start', 'stop', 'interval_name', 'fragment_size', '???', 'coverage', 'gc']
WALTZ_COVERAGE_FILE_HEADER = ['total_coverage', 'unique_coverage']
TOTAL_OFF_TARGET_FRACTION_COLUMN = 'total_off_target_fraction'

WALTZ_CHROMOSOME_COLUMN = 'chromosome'
WALTZ_START_COLUMN = 'start'
WALTZ_STOP_COLUMN = 'stop'
WALTZ_INTERVAL_NAME_COLUMN = 'interval_name'
WALTZ_FRAGMENT_SIZE_COLUMN = 'fragment_size'
WALTZ_PEAK_COVERAGE_COLUMN = 'peak_coverage'
WALTZ_AVERAGE_COVERAGE_COLUMN = 'average_coverage'
WALTZ_GC_CONTENT_COLUMN = 'gc'

WALTZ_READ_COUNTS_HEADER = [
'Bam',
TOTAL_READS_COLUMN,
UNMAPPED_READS_COLUMN,
TOTAL_MAPPED_COLUMN,
UNIQUE_MAPPED_COLUMN,
DUPLICATE_FRACTION_COLUMN,
TOTAL_ON_TARGET_COLUMN,
UNIQUE_ON_TARGET_COLUMN,
TOTAL_ON_TARGET_FRACTION_COLUMN,
UNIQUE_ON_TARGET_FRACTION_COLUMN
# todo
WALTZ_INTERVALS_FILE_HEADER = [
WALTZ_CHROMOSOME_COLUMN,
WALTZ_START_COLUMN,
WALTZ_STOP_COLUMN,
WALTZ_INTERVAL_NAME_COLUMN,
WALTZ_FRAGMENT_SIZE_COLUMN,
WALTZ_PEAK_COVERAGE_COLUMN,
WALTZ_AVERAGE_COVERAGE_COLUMN,
WALTZ_GC_CONTENT_COLUMN
]

WALTZ_COVERAGE_FILE_HEADER = ['total_coverage', 'unique_coverage']
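As a usage sketch, the intervals header constant lets a single Waltz intervals file be loaded with named columns; the tab separator and missing header row are assumptions based on how the QC modules read these files, and the path is illustrative.

```python
import pandas as pd

# Assumed format: tab-separated, no header row (as the QC modules appear to expect).
intervals = pd.read_csv('Sample-1-intervals.txt', sep='\t', header=None,
                        names=WALTZ_INTERVALS_FILE_HEADER)

# Column constants can then be used instead of bare strings:
mean_coverage = intervals[WALTZ_AVERAGE_COVERAGE_COLUMN].mean()
```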


# Agbm Metrics Files Constants
# AGBM Metrics Files Constants
#
# Column names
AGBM_COVERAGE_FILENAME = 'waltz-coverage.txt'
@@ -166,18 +162,29 @@

CHROMOSOME_COLUMN = 'chromosome'
START_COLUMN = 'start'
STOP_COLUMN = 'stop'
END_COLUMN = 'end'
INTERVAL_NAME_COLUMN = 'interval_name'
INTERVAL_NAME_COLUMN = WALTZ_INTERVAL_NAME_COLUMN
LENGTH_COLUMN = 'length'
GC_COLUMN = 'gc'
COVERAGE_COLUMN = 'coverage'
COVERAGE_COLUMN = WALTZ_AVERAGE_COVERAGE_COLUMN
COVERAGE_WITH_DUPLICATES_COLUMN = 'coverage_with_duplicates'
COVERAGE_WITHOUT_DUPLICATES_COLUMN = 'coverage_without_duplicates'

TOTAL_MAPPED_COLUMN = 'total_mapped'
UNIQUE_MAPPED_COLUMN = 'unique_mapped'
TOTAL_READS_COLUMN = 'total_reads'
UNMAPPED_READS_COLUMN = 'unmapped_reads'
DUPLICATE_FRACTION_COLUMN = 'duplicate_fraction'
TOTAL_ON_TARGET_COLUMN = 'total_on_target'
UNIQUE_ON_TARGET_COLUMN = 'unique_on_target'
TOTAL_ON_TARGET_FRACTION_COLUMN = 'total_on_target_fraction'
UNIQUE_ON_TARGET_FRACTION_COLUMN = 'unique_on_target_fraction'

# File Headers
AGBM_READ_COUNTS_HEADER = [
'bam',
SAMPLE_ID_COLUMN,
'bam',
TOTAL_READS_COLUMN,
UNMAPPED_READS_COLUMN,
TOTAL_MAPPED_COLUMN,
@@ -214,16 +221,15 @@
]


# TABLES MODULE Files Constants
#
# Labels for collapsing methods
TOTAL_LABEL = 'total'
PICARD_LABEL = 'picard'
MARIANAS_UNFILTERED_COLLAPSING_METHOD = 'marianas_unfiltered'
MARIANAS_SIMPLEX_DUPLEX_COLLAPSING_METHOD = 'marianas_simplex_duplex'
MARIANAS_DUPLEX_COLLAPSING_METHOD = 'marianas_duplex'


# Tables Module Files Constants
#
# Headers for tables
DUPLICATION_RATES_HEADER = ['Sample', 'method', 'duplication_rate']
INSERT_SIZE_PEAKS_HEADER = ['Sample', 'peak_total', 'peak_total_size', 'peak_unique', 'peak_unique_size']
31 changes: 19 additions & 12 deletions python_tools/workflow_tools/qc/aggregate_bam_metrics.py
@@ -94,19 +94,26 @@ def create_waltz_coverage_file(files):
for files in [input_files, unique_input_files]:
coverage_df = merge_files_across_samples(files)

coverage_df.columns = AGBM_INTERVALS_COVERAGE_SUM_FILE_HEADER
coverage_df['coverage_X_length'] = coverage_df['coverage'] * coverage_df['length']
coverage_df.columns = [SAMPLE_ID_COLUMN] + WALTZ_INTERVALS_FILE_HEADER
coverage_df['coverage_X_length'] = coverage_df[WALTZ_AVERAGE_COVERAGE_COLUMN] * coverage_df[WALTZ_FRAGMENT_SIZE_COLUMN]

intervals_count = len(coverage_df['interval_name'].unique())
intervals_count = len(coverage_df[INTERVAL_NAME_COLUMN].unique())

coverage_total = coverage_df['coverage_X_length'].groupby(coverage_df[SAMPLE_ID_COLUMN]).transform('sum')
coverage_df['coverage_total'] = coverage_total
coverage_df['coverage_avg'] = coverage_df['coverage_total'] / intervals_count

coverage_per_sample = coverage_df[[SAMPLE_ID_COLUMN, 'coverage_avg']]
coverage_per_sample = coverage_df[[SAMPLE_ID_COLUMN, WALTZ_INTERVAL_NAME_COLUMN, WALTZ_AVERAGE_COVERAGE_COLUMN]]
coverage_dfs.append(coverage_per_sample)

coverage_df = coverage_dfs[0].merge(coverage_dfs[1], on=SAMPLE_ID_COLUMN)
coverage_df = coverage_dfs[0].merge(coverage_dfs[1],
on=[SAMPLE_ID_COLUMN, WALTZ_INTERVAL_NAME_COLUMN],
suffixes=('_total', '_unique'))

coverage_df = coverage_df[[SAMPLE_ID_COLUMN,
WALTZ_AVERAGE_COVERAGE_COLUMN + '_total',
WALTZ_AVERAGE_COVERAGE_COLUMN + '_unique']]

coverage_df.columns = AGBM_COVERAGE_FILE_HEADER
to_csv(coverage_df, AGBM_COVERAGE_FILENAME)
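Because the total and unique coverage frames now carry identical column names, the merge leans on pandas suffixes to disambiguate the shared coverage column; a toy illustration (the column names here are stand-ins for `SAMPLE_ID_COLUMN` and the Waltz constants):

```python
import pandas as pd

# Stand-in frames for the total and unique per-interval coverage tables.
total = pd.DataFrame({'sample': ['S1'], 'interval_name': ['iv_1'], 'average_coverage': [120.0]})
unique = pd.DataFrame({'sample': ['S1'], 'interval_name': ['iv_1'], 'average_coverage': [80.0]})

merged = total.merge(unique, on=['sample', 'interval_name'], suffixes=('_total', '_unique'))

# Shared non-key columns come out as average_coverage_total / average_coverage_unique,
# which is what the column selection after the merge relies on.
print(merged.columns.tolist())
```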

@@ -118,10 +125,10 @@ def create_sum_of_coverage_dup_temp_file(files):
input_files = [f for f in files if WALTZ_INTERVALS_FILENAME_SUFFIX in f]

intervals_coverage_all = merge_files_across_samples(input_files)
intervals_coverage_all.columns = AGBM_INTERVALS_COVERAGE_SUM_FILE_HEADER
intervals_coverage_all.columns = [SAMPLE_ID_COLUMN] + WALTZ_INTERVALS_FILE_HEADER

# Todo: is interval_name the same as 0:5 for key?
togroupby = [SAMPLE_ID_COLUMN, 'chromosome', 'start', 'end', 'interval_name', 'length']
togroupby = [SAMPLE_ID_COLUMN, WALTZ_INTERVAL_NAME_COLUMN]
gc_coverage_sum_per_interval = intervals_coverage_all.groupby(togroupby).sum().reset_index()

# Todo: should 'gc' be averaged across all samples or come from just last sample?
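On the gc question in the todo: a plain `.sum()` after `groupby` sums every numeric column, gc included, so an explicit `agg` would be needed if gc should be averaged instead. A toy illustration with stand-in column names (an assumption, not the pipeline's current behavior):

```python
import pandas as pd

df = pd.DataFrame({
    'sample': ['S1', 'S1'],
    'interval_name': ['iv_1', 'iv_1'],
    'average_coverage': [100, 50],
    'gc': [0.4, 0.4],
})

# Plain .sum() sums gc as well (0.8 here), which may not be the intent.
summed = df.groupby(['sample', 'interval_name']).sum().reset_index()

# One possible alternative: sum coverage but average gc per group.
agged = (df.groupby(['sample', 'interval_name'])
           .agg({'average_coverage': 'sum', 'gc': 'mean'})
           .reset_index())
```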
@@ -137,9 +144,9 @@ def create_sum_of_coverage_nodup_temp_file(files):
input_files = [f for f in files if WALTZ_INTERVALS_WITHOUT_DUPLICATES_FILENAME_SUFFIX in f]

intervals_coverage_all = merge_files_across_samples(input_files)
intervals_coverage_all.columns = AGBM_INTERVALS_COVERAGE_SUM_FILE_HEADER
intervals_coverage_all.columns = [SAMPLE_ID_COLUMN] + WALTZ_INTERVALS_FILE_HEADER

togroupby = [SAMPLE_ID_COLUMN, 'interval_name']
togroupby = [SAMPLE_ID_COLUMN, INTERVAL_NAME_COLUMN]
gc_coverage_sum_per_interval = intervals_coverage_all.groupby(togroupby).sum().reset_index()

to_csv(gc_coverage_sum_per_interval, 't6')
@@ -151,10 +158,10 @@ def create_intervals_coverage_sum_file():
"""
t5 = pd.read_csv('t5', sep='\t')
t6 = pd.read_csv('t6', sep='\t')
t6 = t6[[SAMPLE_ID_COLUMN, 'interval_name', 'coverage']]
t6 = t6[[SAMPLE_ID_COLUMN, INTERVAL_NAME_COLUMN, WALTZ_AVERAGE_COVERAGE_COLUMN]]

intervals_coverage_sum = t5.merge(t6, on=[SAMPLE_ID_COLUMN, 'interval_name'])
intervals_coverage_sum.columns = AGBM_INTERVALS_COVERAGE_SUM_FILE_HEADER + ['coverage_without_duplicates']
intervals_coverage_sum = t5.merge(t6, on=[SAMPLE_ID_COLUMN, WALTZ_INTERVAL_NAME_COLUMN], suffixes=('_total', '_unique'))
# intervals_coverage_sum.columns = AGBM_INTERVALS_COVERAGE_SUM_FILE_HEADER + ['coverage_without_duplicates']
to_csv(intervals_coverage_sum, AGBM_INTERVALS_COVERAGE_SUM_FILENAME)


139 changes: 49 additions & 90 deletions python_tools/workflow_tools/qc/plotting-collapsed-bams.r
@@ -1,4 +1,4 @@
#!Rscript
#! /usr/bin/env Rscript

##################################################
# Innovation Laboratory
@@ -355,42 +355,20 @@ parse_sort_order = function(groups_file) {
}


#' Run this on a data.frame to provide consistent sample names to our sample ids
#' @param data data.frame with 'Sample' column
#' @param format either 'DASHES' or 'UNDERSCORES'
convert_sample_names = function(data, format) {
if (format == 'DASHES') {
data = data %>% mutate(Sample = gsub('_', '-', Sample))
} else if (format == 'UNDERSCORES') {
data = data %>% mutate(Sample = gsub('-', '_', Sample))
} else {
stop('Incorrect sample id format provided')
}
data
}


# Extract actual sample names from full filenames
#' Ex: sample_names = c('test_patient_T', 'test_patient_N')
#' test_patient_T_001_aln_srt_MD_IR_FX_BR --> test_patient_T
cleanup_sample_names = function(data, sample_names) {
data = convert_sample_names(data, 'DASHES')

# Ex: sample_names = c('test_patient_T', 'test_patient_N')
# test_patient_T_001_aln_srt_MD_IR_FX_BR --> test_patient_T
find.string <- paste(unlist(sample_names), collapse = "|")
find.string <- paste0('.*(', find.string, ').*', collapse='', sep='')
data = data %>% mutate(Sample = gsub(find.string, '\\1', Sample))

# Ex: ZS-msi-4506-pl-T01_IGO_05500_EF_41_S41_standard...
data = data %>% mutate(Sample = gsub('_standard.*', '', Sample))

# Ex: ZS-msi-4506-pl-T01_IGO_05500_EF_41_S41
# ^^^^
data = data %>% mutate(Sample = gsub('_.\\d\\d$', '', Sample))
data
}


# Main function that will provide all of the desired plots
main = function(args) {

# Read arguments specifying where the required tables are
# as well as where the plots should be put
# todo - should return an object or named list, instead of this indexed list
@@ -399,81 +377,62 @@
inDirWaltz = args[2]
outDir = args[3]
title_file = args[4]

title_df = read.table(title_file, sep='\t', header=TRUE)

# Define the output PDF file
date = format(Sys.time(), '%a-%b-%d-%Y_%H-%M-%S')
final_filename = paste('qc_results', gsub('[:|/]', '-', date), 'pdf', sep='.')
final_dest = paste(outDir, final_filename, sep="/")
pdf(file = final_dest, onefile = TRUE)

title_df = read.table(title_file, sep='\t', header=TRUE)

# Check if the title_file had the correct delimiter
if (length(colnames(title_df)) == 1) {
title_df = read.table(title_file, sep=',', header=TRUE)
}

# Put title file on first page of PDF
printTitle(title_df)

# Title file sample column is used as sort order
sort_order = unlist(title_df$Sample)
# We prefer dashes
title_df = convert_sample_names(title_df, 'DASHES')

availTabs = list.files(path = inDirTables)
if ('read-counts-total.txt' %in% availTabs) {
readCountsDataTotal = read.table(paste(inDirTables, 'read-counts-total.txt', sep = '/'), sep = '\t', head = TRUE)
dupRateData = read.table(paste(inDirTables, 'duplication-rates.txt', sep = '/'), sep = '\t', head = TRUE)
covPerInterval = read.table(paste(inDirTables, 'coverage-per-interval.txt', sep = '/'), sep = '\t', head = TRUE)
insertSizePeaks = read.table(paste(inDirTables, 'insert-size-peaks.txt', sep = '/'), sep = '\t', head = TRUE)
insertSizes = read.table(paste(inDirWaltz, 'fragment-sizes.txt', sep='/'), sep='\t', head=TRUE)

printhead = function(x) {
print(head(x))
}

# Perform some processing on some of our tables
dfList <- list(readCountsDataTotal, dupRateData, covPerInterval, insertSizePeaks, insertSizes)
dfList = lapply(dfList, convert_sample_names, 'DASHES')
dfList = lapply(dfList, cleanup_sample_names, sort_order)
dfList = lapply(dfList, sort_df, 'Sample', sort_order)
lapply(dfList, printhead)

dfList = lapply(dfList, mergeInTitleFileData, title_df)

readCountsDataTotal = dfList[[1]]
dupRateData = dfList[[2]]
covPerInterval = dfList[[3]]
insertSizePeaks = dfList[[4]]
insertSizes = dfList[[5]]

# Choose the plots that we want to run
print(plotDupFrac(dupRateData))
print(plotAlignGenome(readCountsDataTotal))
# print(plotCovDistPerInterval(covPerInterval))
print(plotOnTarget(readCountsDataTotal))
print(plotInsertSizeDistribution(insertSizes, insertSizePeaks))
print(plotCovDistPerIntervalLine(covPerInterval))
}

meanCovData = read.table(paste(inDirTables, 'coverage-agg.txt', sep='/'), sep='\t', head = TRUE)
gcAllSamples = read.table(paste(inDirTables, 'GC-bias-with-coverage-averages-over-all-samples.txt', sep='/'), sep='\t', head=TRUE)
gcEachSample = read.table(paste(inDirTables, 'GC-bias-with-coverage-averages-over-each-sample.txt', sep='/'), sep='\t', head=TRUE)

# Perform some processing on some of our (other) tables
dfList <- list(meanCovData, gcEachSample)
dfList = lapply(dfList, convert_sample_names, 'DASHES')
dfList = lapply(dfList, cleanup_sample_names, sort_order)
dfList = lapply(dfList, sort_df, 'Sample', sort_order)
dfList = lapply(dfList, mergeInTitleFileData, title_df)

meanCovData = dfList[[1]]
gcEachSample = dfList[[2]]

print(plotMeanCov(meanCovData))
print(plotGCwithCovAllSamples(gcAllSamples))
print(plotGCwithCovEachSample(gcEachSample, sort_order))
dev.off()
readCountsDataTotal = read.table(paste(inDirTables, 'read-counts-total.txt', sep='/'), sep='\t', head=TRUE)
dupRateData = read.table(paste(inDirTables, 'duplication-rates.txt', sep='/'), sep='\t', head=TRUE)
covPerInterval = read.table(paste(inDirTables, 'coverage-per-interval.txt', sep='/'), sep='\t', head=TRUE)
insertSizePeaks = read.table(paste(inDirTables, 'insert-size-peaks.txt', sep='/'), sep='\t', head=TRUE)
insertSizes = read.table(paste(inDirWaltz, 'fragment-sizes.txt', sep='/'), sep='\t', head=TRUE)
meanCovData = read.table(paste(inDirTables, 'coverage-agg.txt', sep='/'), sep='\t', head=TRUE)
gcAllSamples = read.table(paste(inDirTables, 'GC-bias-with-coverage-averages-over-all-samples.txt', sep='/'), sep='\t', head=TRUE)
gcEachSample = read.table(paste(inDirTables, 'GC-bias-with-coverage-averages-over-each-sample.txt', sep='/'), sep='\t', head=TRUE)

printhead = function(x) {
print(head(x))
}

# Perform some processing on some of our tables
dfList <- list(readCountsDataTotal, dupRateData, covPerInterval, insertSizePeaks, insertSizes, meanCovData, gcEachSample)
dfList = lapply(dfList, cleanup_sample_names, sort_order)
dfList = lapply(dfList, sort_df, 'Sample', sort_order)
lapply(dfList, printhead)

dfList = lapply(dfList, mergeInTitleFileData, title_df)

readCountsDataTotal = dfList[[1]]
dupRateData = dfList[[2]]
covPerInterval = dfList[[3]]
insertSizePeaks = dfList[[4]]
insertSizes = dfList[[5]]
meanCovData = dfList[[6]]
gcEachSample = dfList[[7]]

# Choose the plots that we want to run
print(plotDupFrac(dupRateData))
print(plotAlignGenome(readCountsDataTotal))
# print(plotCovDistPerInterval(covPerInterval))
print(plotOnTarget(readCountsDataTotal))
print(plotInsertSizeDistribution(insertSizes, insertSizePeaks))
print(plotCovDistPerIntervalLine(covPerInterval))
print(plotMeanCov(meanCovData))
print(plotGCwithCovAllSamples(gcAllSamples))
print(plotGCwithCovEachSample(gcEachSample, sort_order))

dev.off()
}

