QC refactor mvp

ionox0 committed May 1, 2018
1 parent 554b1f2 commit 82f7346
Showing 124 changed files with 143 additions and 6,355,018 deletions.
7 changes: 4 additions & 3 deletions README.md
@@ -106,9 +106,10 @@ The same steps for testing can be used for a real run.
Note that there are several requirements when running on your own data:
1. The fields found in the sample manifest should match the examples in `test/test_data`
2. The sample IDs in the manifest must match somewhere in the fastq file names from the `-d` data folder
3. The `SAMPLE_CLASS` column of the manifest must contain only the values "Tumor" or "Normal"
4. Each "Tumor" sample must have at least one associated "Normal" sample
5. Each sample folder in the `-d` data folder must have three files that match the following:
3. The sample IDs in the manifest must match somewhere in the paths to the SampleSheet.csv files
4. The `SAMPLE_CLASS` column of the manifest must contain only the values "Tumor" or "Normal"
5. Each "Tumor" sample must have at least one associated "Normal" sample
6. Each sample folder in the `-d` data folder must have three files matching the following (a quick check is sketched below):
```
'_R1_001.fastq.gz'
'_R2_001.fastq.gz'
```
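Below is a minimal sketch (not part of the pipeline) of how a sample folder could be checked against these requirements; the helper name is illustrative, and the third pattern is left as a placeholder because it is truncated in the hunk above.

```python
import glob
import os

# The first two patterns come from the README list above; the third required
# file is truncated in this hunk, so it is left as a placeholder here.
REQUIRED_PATTERNS = [
    '*_R1_001.fastq.gz',
    '*_R2_001.fastq.gz',
    # '<third required pattern from the README>',
]

def sample_folder_is_complete(sample_folder):
    """Return True if the folder has at least one match for every required pattern."""
    return all(glob.glob(os.path.join(sample_folder, pattern))
               for pattern in REQUIRED_PATTERNS)
```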
74 changes: 40 additions & 34 deletions python_tools/constants.py
@@ -111,48 +111,44 @@
# Constants for QC files #
##########################

# Waltz Metrics Files Constants
# WALTZ Metrics Files Constants

# Use same string for sample ID everywhere
SAMPLE_ID_COLUMN = TITLE_FILE__SAMPLE_ID_COLUMN

TOTAL_MAPPED_COLUMN = 'total_mapped'
UNIQUE_MAPPED_COLUMN = 'unique_mapped'
TOTAL_READS_COLUMN = 'total_reads'
UNMAPPED_READS_COLUMN = 'unmapped_reads'
DUPLICATE_FRACTION_COLUMN = 'duplicate_fraction'
TOTAL_ON_TARGET_COLUMN = 'total_on_target'
UNIQUE_ON_TARGET_COLUMN = 'unique_on_target'
TOTAL_ON_TARGET_FRACTION_COLUMN = 'total_on_target_fraction'
UNIQUE_ON_TARGET_FRACTION_COLUMN = 'unique_on_target_fraction'

TOTAL_OFF_TARGET_FRACTION_COLUMN = 'total_off_target_fraction'

# File suffixes
WALTZ_READ_COUNTS_FILENAME_SUFFIX = '.read-counts'
WALTZ_FRAGMENT_SIZES_FILENAME_SUFFIX = '.fragment-sizes'
WALTZ_INTERVALS_FILENAME_SUFFIX = '-intervals.txt'
WALTZ_INTERVALS_WITHOUT_DUPLICATES_FILENAME_SUFFIX = '-intervals-without-duplicates.txt'
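These suffix constants are what the downstream QC steps use to pick out per-sample Waltz outputs from a flat file list (the same substring filter appears in `aggregate_bam_metrics.py` further down). A small sketch with hypothetical filenames:

```python
# Hypothetical Waltz output names, used only to illustrate the suffix filters.
waltz_outputs = [
    'Sample-1.read-counts',
    'Sample-1.fragment-sizes',
    'Sample-1-intervals.txt',
    'Sample-1-intervals-without-duplicates.txt',
]

read_count_files = [f for f in waltz_outputs if WALTZ_READ_COUNTS_FILENAME_SUFFIX in f]
intervals_files = [f for f in waltz_outputs if WALTZ_INTERVALS_FILENAME_SUFFIX in f]
# intervals_files keeps only 'Sample-1-intervals.txt'; the without-duplicates
# file does not contain the plain '-intervals.txt' substring.
```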

# todo
WALTZ_INTERVALS_FILE_HEADER = ['chromosome', 'start', 'stop', 'interval_name', 'fragment_size', '???', 'coverage', 'gc']
WALTZ_COVERAGE_FILE_HEADER = ['total_coverage', 'unique_coverage']
TOTAL_OFF_TARGET_FRACTION_COLUMN = 'total_off_target_fraction'

WALTZ_CHROMOSOME_COLUMN = 'chromosome'
WALTZ_START_COLUMN = 'start'
WALTZ_STOP_COLUMN = 'stop'
WALTZ_INTERVAL_NAME_COLUMN = 'interval_name'
WALTZ_FRAGMENT_SIZE_COLUMN = 'fragment_size'
WALTZ_PEAK_COVERAGE_COLUMN = 'peak_coverage'
WALTZ_AVERAGE_COVERAGE_COLUMN = 'average_coverage'
WALTZ_GC_CONTENT_COLUMN = 'gc'

WALTZ_READ_COUNTS_HEADER = [
'Bam',
TOTAL_READS_COLUMN,
UNMAPPED_READS_COLUMN,
TOTAL_MAPPED_COLUMN,
UNIQUE_MAPPED_COLUMN,
DUPLICATE_FRACTION_COLUMN,
TOTAL_ON_TARGET_COLUMN,
UNIQUE_ON_TARGET_COLUMN,
TOTAL_ON_TARGET_FRACTION_COLUMN,
UNIQUE_ON_TARGET_FRACTION_COLUMN
# todo
WALTZ_INTERVALS_FILE_HEADER = [
WALTZ_CHROMOSOME_COLUMN,
WALTZ_START_COLUMN,
WALTZ_STOP_COLUMN,
WALTZ_INTERVAL_NAME_COLUMN,
WALTZ_FRAGMENT_SIZE_COLUMN,
WALTZ_PEAK_COVERAGE_COLUMN,
WALTZ_AVERAGE_COVERAGE_COLUMN,
WALTZ_GC_CONTENT_COLUMN
]

WALTZ_COVERAGE_FILE_HEADER = ['total_coverage', 'unique_coverage']
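As a usage sketch, the intervals header constant lets a single Waltz intervals file be loaded with named columns; the tab separator and missing header row are assumptions based on how the QC modules read these files, and the path is illustrative.

```python
import pandas as pd

# Assumed format: tab-separated, no header row (as the QC modules appear to expect).
intervals = pd.read_csv('Sample-1-intervals.txt', sep='\t', header=None,
                        names=WALTZ_INTERVALS_FILE_HEADER)

# Column constants can then be used instead of bare strings:
mean_coverage = intervals[WALTZ_AVERAGE_COVERAGE_COLUMN].mean()
```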


# Agbm Metrics Files Constants
# AGBM Metrics Files Constants
#
# Column names
AGBM_COVERAGE_FILENAME = 'waltz-coverage.txt'
@@ -166,18 +162,29 @@

CHROMOSOME_COLUMN = 'chromosome'
START_COLUMN = 'start'
STOP_COLUMN = 'stop'
END_COLUMN = 'end'
INTERVAL_NAME_COLUMN = 'interval_name'
INTERVAL_NAME_COLUMN = WALTZ_INTERVAL_NAME_COLUMN
LENGTH_COLUMN = 'length'
GC_COLUMN = 'gc'
COVERAGE_COLUMN = 'coverage'
COVERAGE_COLUMN = WALTZ_AVERAGE_COVERAGE_COLUMN
COVERAGE_WITH_DUPLICATES_COLUMN = 'coverage_with_duplicates'
COVERAGE_WITHOUT_DUPLICATES_COLUMN = 'coverage_without_duplicates'

TOTAL_MAPPED_COLUMN = 'total_mapped'
UNIQUE_MAPPED_COLUMN = 'unique_mapped'
TOTAL_READS_COLUMN = 'total_reads'
UNMAPPED_READS_COLUMN = 'unmapped_reads'
DUPLICATE_FRACTION_COLUMN = 'duplicate_fraction'
TOTAL_ON_TARGET_COLUMN = 'total_on_target'
UNIQUE_ON_TARGET_COLUMN = 'unique_on_target'
TOTAL_ON_TARGET_FRACTION_COLUMN = 'total_on_target_fraction'
UNIQUE_ON_TARGET_FRACTION_COLUMN = 'unique_on_target_fraction'

# File Headers
AGBM_READ_COUNTS_HEADER = [
'bam',
SAMPLE_ID_COLUMN,
'bam',
TOTAL_READS_COLUMN,
UNMAPPED_READS_COLUMN,
TOTAL_MAPPED_COLUMN,
@@ -214,16 +221,15 @@
]


# TABLES MODULE Files Constants
#
# Labels for collapsing methods
TOTAL_LABEL = 'total'
PICARD_LABEL = 'picard'
MARIANAS_UNFILTERED_COLLAPSING_METHOD = 'marianas_unfiltered'
MARIANAS_SIMPLEX_DUPLEX_COLLAPSING_METHOD = 'marianas_simplex_duplex'
MARIANAS_DUPLEX_COLLAPSING_METHOD = 'marianas_duplex'


# Tables Module Files Constants
#
# Headers for tables
DUPLICATION_RATES_HEADER = ['Sample', 'method', 'duplication_rate']
INSERT_SIZE_PEAKS_HEADER = ['Sample', 'peak_total', 'peak_total_size', 'peak_unique', 'peak_unique_size']
31 changes: 19 additions & 12 deletions python_tools/workflow_tools/qc/aggregate_bam_metrics.py
@@ -94,19 +94,26 @@ def create_waltz_coverage_file(files):
for files in [input_files, unique_input_files]:
coverage_df = merge_files_across_samples(files)

coverage_df.columns = AGBM_INTERVALS_COVERAGE_SUM_FILE_HEADER
coverage_df['coverage_X_length'] = coverage_df['coverage'] * coverage_df['length']
coverage_df.columns = [SAMPLE_ID_COLUMN] + WALTZ_INTERVALS_FILE_HEADER
coverage_df['coverage_X_length'] = coverage_df[WALTZ_AVERAGE_COVERAGE_COLUMN] * coverage_df[WALTZ_FRAGMENT_SIZE_COLUMN]

intervals_count = len(coverage_df['interval_name'].unique())
intervals_count = len(coverage_df[INTERVAL_NAME_COLUMN].unique())

coverage_total = coverage_df['coverage_X_length'].groupby(coverage_df[SAMPLE_ID_COLUMN]).transform('sum')
coverage_df['coverage_total'] = coverage_total
coverage_df['coverage_avg'] = coverage_df['coverage_total'] / intervals_count

coverage_per_sample = coverage_df[[SAMPLE_ID_COLUMN, 'coverage_avg']]
coverage_per_sample = coverage_df[[SAMPLE_ID_COLUMN, WALTZ_INTERVAL_NAME_COLUMN, WALTZ_AVERAGE_COVERAGE_COLUMN]]
coverage_dfs.append(coverage_per_sample)

coverage_df = coverage_dfs[0].merge(coverage_dfs[1], on=SAMPLE_ID_COLUMN)
coverage_df = coverage_dfs[0].merge(coverage_dfs[1],
on=[SAMPLE_ID_COLUMN, WALTZ_INTERVAL_NAME_COLUMN],
suffixes=('_total', '_unique'))

coverage_df = coverage_df[[SAMPLE_ID_COLUMN,
WALTZ_AVERAGE_COVERAGE_COLUMN + '_total',
WALTZ_AVERAGE_COVERAGE_COLUMN + '_unique']]

coverage_df.columns = AGBM_COVERAGE_FILE_HEADER
to_csv(coverage_df, AGBM_COVERAGE_FILENAME)
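Because the total and unique coverage frames now carry identical column names, the merge leans on pandas suffixes to disambiguate the shared coverage column; a toy illustration (the column names here are stand-ins for `SAMPLE_ID_COLUMN` and the Waltz constants):

```python
import pandas as pd

# Stand-in frames for the total and unique per-interval coverage tables.
total = pd.DataFrame({'sample': ['S1'], 'interval_name': ['iv_1'], 'average_coverage': [120.0]})
unique = pd.DataFrame({'sample': ['S1'], 'interval_name': ['iv_1'], 'average_coverage': [80.0]})

merged = total.merge(unique, on=['sample', 'interval_name'], suffixes=('_total', '_unique'))

# Shared non-key columns come out as average_coverage_total / average_coverage_unique,
# which is what the column selection after the merge relies on.
print(merged.columns.tolist())
```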

@@ -118,10 +125,10 @@ def create_sum_of_coverage_dup_temp_file(files):
input_files = [f for f in files if WALTZ_INTERVALS_FILENAME_SUFFIX in f]

intervals_coverage_all = merge_files_across_samples(input_files)
intervals_coverage_all.columns = AGBM_INTERVALS_COVERAGE_SUM_FILE_HEADER
intervals_coverage_all.columns = [SAMPLE_ID_COLUMN] + WALTZ_INTERVALS_FILE_HEADER

# Todo: is interval_name the same as 0:5 for key?
togroupby = [SAMPLE_ID_COLUMN, 'chromosome', 'start', 'end', 'interval_name', 'length']
togroupby = [SAMPLE_ID_COLUMN, WALTZ_INTERVAL_NAME_COLUMN]
gc_coverage_sum_per_interval = intervals_coverage_all.groupby(togroupby).sum().reset_index()

# Todo: should 'gc' be averaged across all samples or come from just last sample?
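On the gc question in the todo: a plain `.sum()` after `groupby` sums every numeric column, gc included, so an explicit `agg` would be needed if gc should be averaged instead. A toy illustration with stand-in column names (an assumption, not the pipeline's current behavior):

```python
import pandas as pd

df = pd.DataFrame({
    'sample': ['S1', 'S1'],
    'interval_name': ['iv_1', 'iv_1'],
    'average_coverage': [100, 50],
    'gc': [0.4, 0.4],
})

# Plain .sum() sums gc as well (0.8 here), which may not be the intent.
summed = df.groupby(['sample', 'interval_name']).sum().reset_index()

# One possible alternative: sum coverage but average gc per group.
agged = (df.groupby(['sample', 'interval_name'])
           .agg({'average_coverage': 'sum', 'gc': 'mean'})
           .reset_index())
```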
@@ -137,9 +144,9 @@ def create_sum_of_coverage_nodup_temp_file(files):
input_files = [f for f in files if WALTZ_INTERVALS_WITHOUT_DUPLICATES_FILENAME_SUFFIX in f]

intervals_coverage_all = merge_files_across_samples(input_files)
intervals_coverage_all.columns = AGBM_INTERVALS_COVERAGE_SUM_FILE_HEADER
intervals_coverage_all.columns = [SAMPLE_ID_COLUMN] + WALTZ_INTERVALS_FILE_HEADER

togroupby = [SAMPLE_ID_COLUMN, 'interval_name']
togroupby = [SAMPLE_ID_COLUMN, INTERVAL_NAME_COLUMN]
gc_coverage_sum_per_interval = intervals_coverage_all.groupby(togroupby).sum().reset_index()

to_csv(gc_coverage_sum_per_interval, 't6')
@@ -151,10 +158,10 @@ def create_intervals_coverage_sum_file():
"""
t5 = pd.read_csv('t5', sep='\t')
t6 = pd.read_csv('t6', sep='\t')
t6 = t6[[SAMPLE_ID_COLUMN, 'interval_name', 'coverage']]
t6 = t6[[SAMPLE_ID_COLUMN, INTERVAL_NAME_COLUMN, WALTZ_AVERAGE_COVERAGE_COLUMN]]

intervals_coverage_sum = t5.merge(t6, on=[SAMPLE_ID_COLUMN, 'interval_name'])
intervals_coverage_sum.columns = AGBM_INTERVALS_COVERAGE_SUM_FILE_HEADER + ['coverage_without_duplicates']
intervals_coverage_sum = t5.merge(t6, on=[SAMPLE_ID_COLUMN, WALTZ_INTERVAL_NAME_COLUMN], suffixes=('_total', '_unique'))
# intervals_coverage_sum.columns = AGBM_INTERVALS_COVERAGE_SUM_FILE_HEADER + ['coverage_without_duplicates']
to_csv(intervals_coverage_sum, AGBM_INTERVALS_COVERAGE_SUM_FILENAME)


139 changes: 49 additions & 90 deletions python_tools/workflow_tools/qc/plotting-collapsed-bams.r
@@ -1,4 +1,4 @@
#!Rscript
#! /usr/bin/env Rscript

##################################################
# Innovation Laboratory
@@ -355,42 +355,20 @@ parse_sort_order = function(groups_file) {
}


#' Run this on a data.frame to provide consistent sample names to our sample ids
#' @param data data.frame with 'Sample' column
#' @param format either 'DASHES' or 'UNDERSCORES'
convert_sample_names = function(data, format) {
if (format == 'DASHES') {
data = data %>% mutate(Sample = gsub('_', '-', Sample))
} else if (format == 'UNDERSCORES') {
data = data %>% mutate(Sample = gsub('-', '_', Sample))
} else {
stop('Incorrect sample id format provided')
}
data
}


# Extract actual sample names from full filenames
#' Ex: sample_names = c('test_patient_T', 'test_patient_N')
#' test_patient_T_001_aln_srt_MD_IR_FX_BR --> test_patient_T
cleanup_sample_names = function(data, sample_names) {
data = convert_sample_names(data, 'DASHES')

# Ex: sample_names = c('test_patient_T', 'test_patient_N')
# test_patient_T_001_aln_srt_MD_IR_FX_BR --> test_patient_T
find.string <- paste(unlist(sample_names), collapse = "|")
find.string <- paste0('.*(', find.string, ').*', collapse='', sep='')
data = data %>% mutate(Sample = gsub(find.string, '\\1', Sample))

# Ex: ZS-msi-4506-pl-T01_IGO_05500_EF_41_S41_standard...
data = data %>% mutate(Sample = gsub('_standard.*', '', Sample))

# Ex: ZS-msi-4506-pl-T01_IGO_05500_EF_41_S41
# ^^^^
data = data %>% mutate(Sample = gsub('_.\\d\\d$', '', Sample))
data
}


# Main function that will provide all of the desired plots
main = function(args) {

# Read arguments specifying where the required tables are
# as well as where the plots should be put
# todo - should return an object or named list, instead of this indexed list
@@ -399,81 +377,62 @@
inDirWaltz = args[2]
outDir = args[3]
title_file = args[4]

title_df = read.table(title_file, sep='\t', header=TRUE)

# Define the output PDF file
date = format(Sys.time(), '%a-%b-%d-%Y_%H-%M-%S')
final_filename = paste('qc_results', gsub('[:|/]', '-', date), 'pdf', sep='.')
final_dest = paste(outDir, final_filename, sep="/")
pdf(file = final_dest, onefile = TRUE)

title_df = read.table(title_file, sep='\t', header=TRUE)

# Check if the title_file had the correct delimiter
if (length(colnames(title_df)) == 1) {
title_df = read.table(title_file, sep=',', header=TRUE)
}

# Put title file on first page of PDF
printTitle(title_df)

# Title file sample column is used as sort order
sort_order = unlist(title_df$Sample)
# We prefer dashes
title_df = convert_sample_names(title_df, 'DASHES')

availTabs = list.files(path = inDirTables)
if ('read-counts-total.txt' %in% availTabs) {
readCountsDataTotal = read.table(paste(inDirTables, 'read-counts-total.txt', sep = '/'), sep = '\t', head = TRUE)
dupRateData = read.table(paste(inDirTables, 'duplication-rates.txt', sep = '/'), sep = '\t', head = TRUE)
covPerInterval = read.table(paste(inDirTables, 'coverage-per-interval.txt', sep = '/'), sep = '\t', head = TRUE)
insertSizePeaks = read.table(paste(inDirTables, 'insert-size-peaks.txt', sep = '/'), sep = '\t', head = TRUE)
insertSizes = read.table(paste(inDirWaltz, 'fragment-sizes.txt', sep='/'), sep='\t', head=TRUE)

printhead = function(x) {
print(head(x))
}

# Perform some processing on some of our tables
dfList <- list(readCountsDataTotal, dupRateData, covPerInterval, insertSizePeaks, insertSizes)
dfList = lapply(dfList, convert_sample_names, 'DASHES')
dfList = lapply(dfList, cleanup_sample_names, sort_order)
dfList = lapply(dfList, sort_df, 'Sample', sort_order)
lapply(dfList, printhead)

dfList = lapply(dfList, mergeInTitleFileData, title_df)

readCountsDataTotal = dfList[[1]]
dupRateData = dfList[[2]]
covPerInterval = dfList[[3]]
insertSizePeaks = dfList[[4]]
insertSizes = dfList[[5]]

# Choose the plots that we want to run
print(plotDupFrac(dupRateData))
print(plotAlignGenome(readCountsDataTotal))
# print(plotCovDistPerInterval(covPerInterval))
print(plotOnTarget(readCountsDataTotal))
print(plotInsertSizeDistribution(insertSizes, insertSizePeaks))
print(plotCovDistPerIntervalLine(covPerInterval))
}

meanCovData = read.table(paste(inDirTables, 'coverage-agg.txt', sep='/'), sep='\t', head = TRUE)
gcAllSamples = read.table(paste(inDirTables, 'GC-bias-with-coverage-averages-over-all-samples.txt', sep='/'), sep='\t', head=TRUE)
gcEachSample = read.table(paste(inDirTables, 'GC-bias-with-coverage-averages-over-each-sample.txt', sep='/'), sep='\t', head=TRUE)

# Perform some processing on some of our (other) tables
dfList <- list(meanCovData, gcEachSample)
dfList = lapply(dfList, convert_sample_names, 'DASHES')
dfList = lapply(dfList, cleanup_sample_names, sort_order)
dfList = lapply(dfList, sort_df, 'Sample', sort_order)
dfList = lapply(dfList, mergeInTitleFileData, title_df)

meanCovData = dfList[[1]]
gcEachSample = dfList[[2]]

print(plotMeanCov(meanCovData))
print(plotGCwithCovAllSamples(gcAllSamples))
print(plotGCwithCovEachSample(gcEachSample, sort_order))
dev.off()
readCountsDataTotal = read.table(paste(inDirTables, 'read-counts-total.txt', sep='/'), sep='\t', head=TRUE)
dupRateData = read.table(paste(inDirTables, 'duplication-rates.txt', sep='/'), sep='\t', head=TRUE)
covPerInterval = read.table(paste(inDirTables, 'coverage-per-interval.txt', sep='/'), sep='\t', head=TRUE)
insertSizePeaks = read.table(paste(inDirTables, 'insert-size-peaks.txt', sep='/'), sep='\t', head=TRUE)
insertSizes = read.table(paste(inDirWaltz, 'fragment-sizes.txt', sep='/'), sep='\t', head=TRUE)
meanCovData = read.table(paste(inDirTables, 'coverage-agg.txt', sep='/'), sep='\t', head=TRUE)
gcAllSamples = read.table(paste(inDirTables, 'GC-bias-with-coverage-averages-over-all-samples.txt', sep='/'), sep='\t', head=TRUE)
gcEachSample = read.table(paste(inDirTables, 'GC-bias-with-coverage-averages-over-each-sample.txt', sep='/'), sep='\t', head=TRUE)

printhead = function(x) {
print(head(x))
}

# Perform some processing on some of our tables
dfList <- list(readCountsDataTotal, dupRateData, covPerInterval, insertSizePeaks, insertSizes, meanCovData, gcEachSample)
dfList = lapply(dfList, cleanup_sample_names, sort_order)
dfList = lapply(dfList, sort_df, 'Sample', sort_order)
lapply(dfList, printhead)

dfList = lapply(dfList, mergeInTitleFileData, title_df)

readCountsDataTotal = dfList[[1]]
dupRateData = dfList[[2]]
covPerInterval = dfList[[3]]
insertSizePeaks = dfList[[4]]
insertSizes = dfList[[5]]
meanCovData = dfList[[6]]
gcEachSample = dfList[[7]]

# Choose the plots that we want to run
print(plotDupFrac(dupRateData))
print(plotAlignGenome(readCountsDataTotal))
# print(plotCovDistPerInterval(covPerInterval))
print(plotOnTarget(readCountsDataTotal))
print(plotInsertSizeDistribution(insertSizes, insertSizePeaks))
print(plotCovDistPerIntervalLine(covPerInterval))
print(plotMeanCov(meanCovData))
print(plotGCwithCovAllSamples(gcAllSamples))
print(plotGCwithCovEachSample(gcEachSample, sort_order))

dev.off()
}

