plots titles and clarification

mskcc · Aug 1, 2018 · 2039d10 · 2039d10
1 parent c24141a
commit 2039d10
Show file tree

Hide file tree

Showing 3 changed files with 10 additions and 11 deletions.
diff --git a/python_tools/workflow_tools/qc/plots_module.r b/python_tools/workflow_tools/qc/plots_module.r
@@ -36,7 +36,7 @@ MY_THEME = theme(text = element_text(size=14),
   plot.margin = unit(c(.1, .1, .1, 1), 'in'))
 
 # Some title file columns will not be printed
-DROP_COLS = c('Pool', 'Pool_input', 'Barcode_index', 'PatientName', 'MAccession', 'Extracted_DNA_Yield')
+DROP_COLS = c('Pool', 'Pool_input', 'Barcode_index', 'PatientName', 'MAccession', 'Extracted_DNA_Yield', 'Barcode_index_1', 'Barcode_index_2')
 # Levels and sort order for collapsing methods
 LEVEL_C = c('TotalCoverage', 'All Unique', 'Simplex', 'Duplex')
 
@@ -189,7 +189,7 @@ plotGCwithCovAllSamples = function(data) {
 }
 
 
-#' Plot Coverage vs %GC content, separately for each sample 
+#' Plot Coverage vs %GC content, separately for each sample
 #' (for each collapsing method)
 #' @param data data.frame with the usual columns
 plotGCwithCovEachSample = function(data, sort_order) {
@@ -236,7 +236,7 @@ plotInsertSizeDistribution = function(insertSizes) {
 
   g = ggplot(insertSizes, aes(x=FragmentSize, y=total_frequency_fraction, colour=sample_and_peak)) +
     stat_smooth(size=.5, n=200, span=0.1, se=FALSE, method='loess', level=.01) +
-    ggtitle('Insert Size Distribution') +
+    ggtitle('Insert Size Distribution (from Unfiltered Pool A reads)') +
     xlab('Insert Size') +
     ylab('Frequency (%)') +
     labs(colour = "Sample, Peak Insert Size") +
@@ -257,7 +257,7 @@ plotCovDistPerIntervalLine = function(data) {
 
   g = ggplot(data) +
     geom_line(aes(x=coverage_scaled, colour=Sample), stat='density') +
-    ggtitle('Distribution of Coverages per Target Interval') +
+    ggtitle('Distribution of Coverages per Target Interval (from Total Reads, Pool A)') +
     scale_y_continuous('Frequency', label=format_comma) +
     scale_x_continuous('Coverage (median scaled)') + 
     coord_cartesian(xlim=c(0, 3)) +
@@ -295,8 +295,7 @@ print_title = function(title_df, coverage_df, inputs_yaml) {
       fg_params=list(cex = .6),
       padding=unit(c(5, 3), "mm")),
     colhead = list(fg_params=list(cex = 0.5)),
-    rowhead = list(fg_params=list(cex = 0.5))
-  )
+    rowhead = list(fg_params=list(cex = 0.5)))
 
   # Round to one decimal place
   coverage_df$average_coverage = format(round(coverage_df$average_coverage, 1), nsmall = 1)
@@ -456,7 +455,7 @@ parse_sort_order = function(groups_file) {
 }
 
 
-# Extract actual sample names from full filenames
+#' Extract actual sample names from full filenames
 #' Ex: sample_names = c('test_patient_T', 'test_patient_N')
 #' test_patient_T_001_aln_srt_MD_IR_FX_BR --> test_patient_T
 cleanup_sample_names = function(data, sample_names) {

diff --git a/python_tools/workflow_tools/qc/qc_wrapper.py b/python_tools/workflow_tools/qc/qc_wrapper.py
@@ -20,7 +20,7 @@
 
 def run_plots_module(tables_output_dir, plots_output_dir, title_file_path, inputs_yaml_path):
     """
-    Note: The following R script should be found in your Virtual
+    Note: The R script should be found in your Virtual
     environment PATH (/somewhere/virtualenv/bin/plots_module.r)
     after installing with `python setup.py install`
     """

diff --git a/python_tools/workflow_tools/qc/tables_module.py b/python_tools/workflow_tools/qc/tables_module.py
@@ -139,7 +139,7 @@ def get_gc_table(curr_method, intervals_filename_suffix, path):
 
         # todo - columns should be given constant labels:
         newDf = pd.DataFrame({
-            'method': [curr_method.replace('Waltz', '')] * len(curr_table),
+            'method': [curr_method] * len(curr_table),
             'Sample': [sample] * len(curr_table),
             'interval_name': curr_table.ix[:, 3],
             'coverage': curr_table.ix[:, 5],
@@ -280,7 +280,7 @@ def main(args):
     # Std, Pool A and B
     read_counts_table = get_read_counts_table(args.standard_waltz_pool_a, POOL_A_LABEL)
     coverage_table = get_coverage_table(args.standard_waltz_pool_a, POOL_A_LABEL)
-    # GC bias comes from A Targets
+    # GC Bias & Coverage Distribution per Interval graphs come from Standard Bam, A Targets
     gc_cov_int_table = get_gc_table(TOTAL_LABEL, WALTZ_INTERVALS_FILENAME_SUFFIX, args.standard_waltz_pool_a)
 
     read_counts_table = pd.concat([get_read_counts_table(args.standard_waltz_pool_b, POOL_B_LABEL), read_counts_table])
@@ -337,7 +337,7 @@ def main(args):
     gc_avg_table_all.to_csv(all_samples_coverage_filename, sep='\t', index=False)
     coverage_per_interval_table.to_csv(coverage_per_interval_filename, sep='\t', index=False)
 
-    # also copy the fragment-sizes.txt file, which the plots module also uses
+    # Fragment Sizes graph comes from Unfiltered Bam, Pool A Targets
     # todo: not clean
     import shutil
     frag_sizes_path = os.path.join(args.unfiltered_waltz_pool_a, 'fragment-sizes.txt')