diff --git a/src/utils/distribution_validation.py b/src/utils/distribution_validation.py new file mode 100644 index 0000000..5d1b536 --- /dev/null +++ b/src/utils/distribution_validation.py @@ -0,0 +1,188 @@ +import pysam +import numpy as np +import os +import sys + +import matplotlib +matplotlib.use('Agg') + +import matplotlib.mlab as mlab +import matplotlib.pyplot as plt + +''' + Assumes all bam files in specified nanopore_prefix directory + are chromosomes in the NA12878 nanopore consortium. + + Returns full paths to all files in the directory that end in .bam +''' +def retrieve_nanopore_bams(nanopore_prefix): + return [x for x in os.listdir(nanopore_prefix) if '.bam' in x[-4:]] + +''' + Simple conversion function to turn counts in a histogram of + counts into percentages +''' +def count_to_distribution(nparray): + total_count = float(sum(nparray)) + return nparray / total_count + +''' + Gets the histogram from a list of coverages according to + a defined set of bins +''' +def coverage_to_histogram(coverage_list, bins): + temp_hist = np.histogram(coverage_list, bins=bins) + return np.array(temp_hist[0]) + +''' + Find the first and last read from a bam file. + + Set the START index to the first read, check every 1000 positions. + + Find the END index by checking every millionth position, + and then iterating after the last nonzero read by 1000's + to find the last read. +''' +def find_boundaries(data_prefix, filename, chr_label): + current_position = 0 + coverages = [] + bamfile = pysam.AlignmentFile(data_prefix + filename, "rb") + + while 1: + if bamfile.count(chr_label, current_position, current_position+1) > 0: + coverages.append(current_position) + break + current_position += 1000 + START = coverages[0] + + # Add nonzero reads to coverages, break once we've found + # 100 consecutive reads + consecutive = 0 + while consecutive < 100: + if bamfile.count(chr_label, current_position, current_position+1) > 0: + coverages.append(current_position) + consecutive = 0 + else: + consecutive += 1 + current_position += 1000000 + + end = coverages[-1:][0] + post = end + 1000000 + for current_position in range(end, post, 1000): + if bamfile.count(chr_label, current_position, current_position+1) > 0: + coverages.append(current_position) + + return START, coverages[-1:][0] + +''' + Dump the zero_positions list to a file +''' +def output_zeroes(seq_type, chr_label, zero_positions): + with open("{}.{}.zeros.txt".format(seq_type, chr_label), 'w') as zero_file: + for entry in zero_positions: + zero_file.write('{}\n'.format(entry)) + print("Dumping {}.{}.zeros.txt".format(seq_type, chr_label)) + zero_file.flush() + +''' + Process for reading coverages from each chromosome in + each bamfile, and producing the visualizations. +''' +def bam_management(START, END, STEPS, data_prefix, bamfilename, chr_label, outfile): + coverage_list = [] + hist_list = [] + zero_positions = [] + + bins = range(0,105,5) + running_total = np.zeros(len(bins)-1) + + # Get reads from START to END in BAMFILE + bamfile = pysam.AlignmentFile(data_prefix + bamfilename, "rb") + + # Print coverage for every STEPS + for i in range(START, END, STEPS): + value = bamfile.count(chr_label, i, i+1) + + # Debugging zero length problems + if value == 0: + zero_positions.append(i) + + # Coverage Distribution + coverage_list.append(value) + if len(coverage_list) > 1000: + running_total += coverage_to_histogram(coverage_list, bins) + coverage_list = [] + outfile.write('{}\n'.format(i)) + outfile.flush() + + # Pick up any stragglers + if len(coverage_list) > 0: + running_total += coverage_to_histogram(coverage_list, bins) + + bamfile.close() + + # change running_total to distribution + distribution = count_to_distribution(running_total) + num_bins = int((END - START) / STEPS) + seq_type = "Illumina" if chr_label not in bamfilename else "Nanopore" + outfile.write("Finished reading {}, now writing distribution.\n".format(chr_label)) + outfile.flush() + + # Dump zero positions + output_zeroes(seq_type, chr_label, zero_positions) + plot_distribution(distribution, seq_type, chr_label) + +def plot_distribution(distribution, seq_type, chr_label): + y_pos = np.arange(20)*5 + plt.clf() + plt.bar(y_pos, distribution, align='center', alpha=1.0, facecolor='black') + plt.xticks(y_pos, y_pos) + plt.ylabel('Frequency') + plt.xlabel('Coverage') + plt.title('{} {} Coverage Distribution'.format(seq_type, chr_label)) + print("Printing {}.{}.coverage_percentage_distribution.png".format(seq_type, chr_label)) + plt.savefig('{}.{}.coverage_percentage_distribution.png'.format(seq_type, chr_label)) + +if __name__=="__main__": + # Illumina NA12878 + #test_prefix = "/data/users/ryan/" + #test_file = "chr3.bam" + #test_label = "chr3" + + # Testing + #START, END = find_boundaries(test_prefix, test_file, test_label) + #bam_management(START, END, STEPS, test_prefix, test_file, test_label) + + # Nanopore chr3 + nanopore_prefix = '/data/users/common/nanopore/' + illumina_prefix = '/data/users/ryan/' + illumina_file = 'NA12878_S1.bam' + STEPS = 1000 + + # Nanopore Processing + for chr_file in retrieve_nanopore_bams(nanopore_prefix): + chr_label = chr_file.split('.')[0] + #if chr_label in ["chr1", "chr3", "chr6", "chr9", "chr11", "chr12", "chr13", "chr14", "chr16", "chr18", "chr19"]: + # continue + outfile = open("output.nanopore.{}.txt".format(chr_label), 'w') + print("Printing to output.nanopore.{}.txt".format(chr_label)) + sys.stdout.flush() + outfile.write("Starting nanopore validation for {}\n".format(chr_label)) + START, END = find_boundaries(nanopore_prefix, chr_file, chr_label) + outfile.write("Bounds: {} {}\n".format(START, END)) + outfile.flush() + bam_management(START, END, STEPS, nanopore_prefix, chr_file, chr_label, outfile) + outfile.close() + + # Illumina Processing + for i in range(1, 20): + chr_label = "chr" + str(i) + outfile = open("output.illumina.{}.txt".format(chr_label), 'w') + print("Printing to output.illumina.{}.txt".format(chr_label)) + sys.stdout.flush() + outfile.write("Starting illumina validation for {}\n".format(chr_label)) + START, END = find_boundaries(illumina_prefix, illumina_file, chr_label) + outfile.write("Bounds: {} {}\n".format(START, END)) + outfile.flush() + bam_management(START, END, STEPS, illumina_prefix, illumina_file, chr_label, outfile) + outfile.close() \ No newline at end of file