Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add bam_readcount_helper script to container #2

Open
wants to merge 1 commit into
base: 0.7.4
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,6 @@ RUN git clone https://github.com/genome/bam-readcount.git /tmp/bam-readcount-0.7
rm -rf /tmp/bam-readcount-0.7.4 && \
ln -s /opt/bam-readcount/bin/bam-readcount /usr/bin/bam-readcount

COPY bam_readcount_helper.py /usr/bin/bam_readcount_helper.py

ENTRYPOINT ["/usr/bin/bam-readcount"]
99 changes: 99 additions & 0 deletions bam_readcount_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#!/usr/bin/env python

import sys
import os
import vcfpy
import tempfile
import csv
from subprocess import Popen, PIPE

def generate_region_list(hash):
fh = tempfile.NamedTemporaryFile('w', delete=False)
writer = csv.writer(fh, delimiter='\t')
for chr, positions in hash.items():
for pos in sorted(positions.keys()):
writer.writerow([chr, pos, pos])
fh.close()
return fh.name

def filter_sites_in_hash(region_list, bam_file, ref_fasta, sample, output_dir, insertion_centric):
bam_readcount_cmd = ['/usr/bin/bam-readcount', '-f', ref_fasta, '-l', region_list, '-w', '0', '-b', '20']
if insertion_centric:
bam_readcount_cmd.append('-i')
output_file = os.path.join(output_dir, sample + '_bam_readcount_indel.tsv')
else:
output_file = os.path.join(output_dir, sample + '_bam_readcount_snv.tsv')
bam_readcount_cmd.append(bam_file)
execution = Popen(bam_readcount_cmd, stdout=PIPE, stderr=PIPE)
stdout, stderr = execution.communicate()
if execution.returncode == 0:
with open(output_file, 'wb') as output_fh:
output_fh.write(stdout)
else:
sys.exit(stderr)


(script_name, vcf_filename, sample, ref_fasta, bam_file, output_dir)= sys.argv

vcf_file = vcfpy.Reader.from_path(vcf_filename)
sample_index = vcf_file.samples.index(sample)

rc_for_indel = {}
rc_for_snp = {}
for variant in vcf_file:
alleles = [variant.REF] + variant.ALT
genotype = variant.genotypes[sample_index]
gt_alt_alleles = [allele for allele in genotype if allele > 0]
used_gt_alt_alleles = [alleles[index] for index in gt_alt_alleles]
if len(used_gt_alt_alleles) > 0:
var = sorted(used_gt_alt_alleles)[0]
else:
var = ""
ref = variant.REF
chr = variant.CHROM
pos = variant.POS

if len(ref) > 1 or len(var) > 1:
#it's an indel or mnp
if len(ref) == len(var) or (len(ref) > 1 and len(var) > 1):
sys.stderr.write("Complex variant or MNP will be skipped: %s\t%s\t%s\t%s\n" % (chr, pos, ref , var))
continue
elif len(ref) > len(var):
#it's a deletion
pos += 1
unmodified_ref = ref
ref = unmodified_ref[1]
var = "-%s" % unmodified_ref[1:]
else:
#it's an insertion
var = "+%s" % var[1:]
if chr not in rc_for_indel:
rc_for_indel[chr] = {}
if pos not in rc_for_indel[chr]:
rc_for_indel[chr][pos] = {}
if ref not in rc_for_indel[chr][pos]:
rc_for_indel[chr][pos][ref] = {}
rc_for_indel[chr][pos][ref] = variant
else:
#it's a SNP
if chr not in rc_for_snp:
rc_for_snp[chr] = {}
if pos not in rc_for_snp[chr]:
rc_for_snp[chr][pos] = {}
if ref not in rc_for_snp[chr][pos]:
rc_for_snp[chr][pos][ref] = {}
rc_for_snp[chr][pos][ref] = variant

if len(rc_for_snp.keys()) > 0:
region_file = generate_region_list(rc_for_snp)
filter_sites_in_hash(region_file, bam_file, ref_fasta, sample, output_dir, False)
else:
output_file = os.path.join(output_dir, sample + '_bam_readcount_snv.tsv')
open(output_file, 'w').close()

if len(rc_for_indel.keys()) > 0:
region_file = generate_region_list(rc_for_indel)
filter_sites_in_hash(region_file, bam_file, ref_fasta, sample, output_dir, True)
else:
output_file = os.path.join(output_dir, sample + '_bam_readcount_indel.tsv')
open(output_file, 'w').close()