diff --git a/Dockerfile b/Dockerfile index fd86665..48a0a83 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,4 +22,6 @@ RUN git clone https://github.com/genome/bam-readcount.git /tmp/bam-readcount-0.7 rm -rf /tmp/bam-readcount-0.7.4 && \ ln -s /opt/bam-readcount/bin/bam-readcount /usr/bin/bam-readcount +COPY bam_readcount_helper.py /usr/bin/bam_readcount_helper.py + ENTRYPOINT ["/usr/bin/bam-readcount"] diff --git a/bam_readcount_helper.py b/bam_readcount_helper.py new file mode 100644 index 0000000..5b45803 --- /dev/null +++ b/bam_readcount_helper.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python + +import sys +import os +import vcfpy +import tempfile +import csv +from subprocess import Popen, PIPE + +def generate_region_list(hash): + fh = tempfile.NamedTemporaryFile('w', delete=False) + writer = csv.writer(fh, delimiter='\t') + for chr, positions in hash.items(): + for pos in sorted(positions.keys()): + writer.writerow([chr, pos, pos]) + fh.close() + return fh.name + +def filter_sites_in_hash(region_list, bam_file, ref_fasta, sample, output_dir, insertion_centric): + bam_readcount_cmd = ['/usr/bin/bam-readcount', '-f', ref_fasta, '-l', region_list, '-w', '0', '-b', '20'] + if insertion_centric: + bam_readcount_cmd.append('-i') + output_file = os.path.join(output_dir, sample + '_bam_readcount_indel.tsv') + else: + output_file = os.path.join(output_dir, sample + '_bam_readcount_snv.tsv') + bam_readcount_cmd.append(bam_file) + execution = Popen(bam_readcount_cmd, stdout=PIPE, stderr=PIPE) + stdout, stderr = execution.communicate() + if execution.returncode == 0: + with open(output_file, 'wb') as output_fh: + output_fh.write(stdout) + else: + sys.exit(stderr) + + +(script_name, vcf_filename, sample, ref_fasta, bam_file, output_dir)= sys.argv + +vcf_file = vcfpy.Reader.from_path(vcf_filename) +sample_index = vcf_file.samples.index(sample) + +rc_for_indel = {} +rc_for_snp = {} +for variant in vcf_file: + alleles = [variant.REF] + variant.ALT + genotype = variant.genotypes[sample_index] + gt_alt_alleles = [allele for allele in genotype if allele > 0] + used_gt_alt_alleles = [alleles[index] for index in gt_alt_alleles] + if len(used_gt_alt_alleles) > 0: + var = sorted(used_gt_alt_alleles)[0] + else: + var = "" + ref = variant.REF + chr = variant.CHROM + pos = variant.POS + + if len(ref) > 1 or len(var) > 1: + #it's an indel or mnp + if len(ref) == len(var) or (len(ref) > 1 and len(var) > 1): + sys.stderr.write("Complex variant or MNP will be skipped: %s\t%s\t%s\t%s\n" % (chr, pos, ref , var)) + continue + elif len(ref) > len(var): + #it's a deletion + pos += 1 + unmodified_ref = ref + ref = unmodified_ref[1] + var = "-%s" % unmodified_ref[1:] + else: + #it's an insertion + var = "+%s" % var[1:] + if chr not in rc_for_indel: + rc_for_indel[chr] = {} + if pos not in rc_for_indel[chr]: + rc_for_indel[chr][pos] = {} + if ref not in rc_for_indel[chr][pos]: + rc_for_indel[chr][pos][ref] = {} + rc_for_indel[chr][pos][ref] = variant + else: + #it's a SNP + if chr not in rc_for_snp: + rc_for_snp[chr] = {} + if pos not in rc_for_snp[chr]: + rc_for_snp[chr][pos] = {} + if ref not in rc_for_snp[chr][pos]: + rc_for_snp[chr][pos][ref] = {} + rc_for_snp[chr][pos][ref] = variant + +if len(rc_for_snp.keys()) > 0: + region_file = generate_region_list(rc_for_snp) + filter_sites_in_hash(region_file, bam_file, ref_fasta, sample, output_dir, False) +else: + output_file = os.path.join(output_dir, sample + '_bam_readcount_snv.tsv') + open(output_file, 'w').close() + +if len(rc_for_indel.keys()) > 0: + region_file = generate_region_list(rc_for_indel) + filter_sites_in_hash(region_file, bam_file, ref_fasta, sample, output_dir, True) +else: + output_file = os.path.join(output_dir, sample + '_bam_readcount_indel.tsv') + open(output_file, 'w').close()