Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New version with CRAM-enabled genome/bam-readcount master #9

Open
wants to merge 19 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 45 additions & 13 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,35 @@
FROM mgibio/samtools:1.3.1
FROM ubuntu:focal

# tzdata config from https://stackoverflow.com/a/47909037
ENV DEBIAN_FRONTEND noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN true

RUN apt-get update && \
# Required build tools
apt-get install -y build-essential cmake && \
# git to fetch the repo and wget to populate vendor
apt-get install -y git wget && \
# Clean up
apt-get clean all && \
rm -rf /var/lib/apt/lists/* && rm -rf /tmp/* && rm -rf /var/tmp/*

RUN cd / && \
git clone https://github.com/genome/bam-readcount && \
cd bam-readcount && \
# For a specific tag enable the git checkout below
#git checkout cram-v0.0.2 && \
rm -rf build && \
mkdir build && \
cd build && \
cmake .. && \
make


# By adding zlib1g-dev don't appear to need mgibio/samtools:1.3.1 as a base
FROM ubuntu:focal
MAINTAINER John Garza <[email protected]>
ENV DEBIAN_FRONTEND noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN true

LABEL \
description="Image supporting the bam-readcount program"
Expand All @@ -10,25 +40,27 @@ RUN apt-get update && \
libcurl4-openssl-dev \
libssl-dev \
patch \
python \
python-pip \
git
python-is-python3 \
python3 \
python3-pip \
git \
automake \
autoconf \
libbz2-dev \
liblzma-dev \
zlib1g-dev

ENV SAMTOOLS_ROOT=/opt/samtools
RUN mkdir /opt/bam-readcount
RUN mkdir -p /opt/bam-readcount/bin

WORKDIR /opt/bam-readcount
RUN git clone https://github.com/genome/bam-readcount.git /tmp/bam-readcount-0.7.4 && \
git -C /tmp/bam-readcount-0.7.4 checkout v0.7.4 && \
cmake /tmp/bam-readcount-0.7.4 && \
make && \
rm -rf /tmp/bam-readcount-0.7.4 && \
ln -s /opt/bam-readcount/bin/bam-readcount /usr/bin/bam-readcount
COPY --from=0 /bam-readcount/build/bin/bam-readcount /opt/bam-readcount/bin/bam-readcount
RUN ln -s /opt/bam-readcount/bin/bam-readcount /usr/bin/bam-readcount

COPY bam_readcount_helper.py /usr/bin/bam_readcount_helper.py

RUN pip install --upgrade pip
RUN pip install cyvcf2
RUN pip3 install cyvcf2


#clear inherited entrypoint
ENTRYPOINT []
23 changes: 23 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
bam_readcount_helper-cwl
========================

Ubuntu Focal-based Docker image with `bam-readcount` and helper script


Resources
---------

https://github.com/genome/bam-readcount
`bam-readcount` and documentation


Usage
-----

`bam-readcount` and the helper script are installed as

/usr/bin/bam-readcount
/usr/bin/bam_readcount_helper.py



101 changes: 53 additions & 48 deletions bam_readcount_helper.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3

import sys
import os
Expand All @@ -16,13 +16,13 @@ def generate_region_list(hash):
fh.close()
return fh.name

def filter_sites_in_hash(region_list, bam_file, ref_fasta, sample, output_dir, insertion_centric):
bam_readcount_cmd = ['/usr/bin/bam-readcount', '-f', ref_fasta, '-l', region_list, '-w', '0', '-b', '20']
def filter_sites_in_hash(region_list, bam_file, ref_fasta, prefixed_sample, output_dir, insertion_centric, map_qual, base_qual):
bam_readcount_cmd = ['/usr/bin/bam-readcount', '-f', ref_fasta, '-l', region_list, '-w', '0', '-b', str(base_qual), '-q', str(map_qual)]
if insertion_centric:
bam_readcount_cmd.append('-i')
output_file = os.path.join(output_dir, sample + '_bam_readcount_indel.tsv')
output_file = os.path.join(output_dir, prefixed_sample + '_bam_readcount_indel.tsv')
else:
output_file = os.path.join(output_dir, sample + '_bam_readcount_snv.tsv')
output_file = os.path.join(output_dir, prefixed_sample + '_bam_readcount_snv.tsv')
bam_readcount_cmd.append(bam_file)
execution = Popen(bam_readcount_cmd, stdout=PIPE, stderr=PIPE)
stdout, stderr = execution.communicate()
Expand All @@ -32,70 +32,75 @@ def filter_sites_in_hash(region_list, bam_file, ref_fasta, sample, output_dir, i
else:
sys.exit(stderr)

#initializing these with default values
min_base_qual = 20
min_mapping_qual = 0

(script_name, vcf_filename, sample, ref_fasta, bam_file, output_dir)= sys.argv
if len(sys.argv) == 7:
(script_name, vcf_filename, sample, ref_fasta, bam_file, prefix, output_dir)= sys.argv
elif len(sys.argv) == 8:
(script_name, vcf_filename, sample, ref_fasta, bam_file, prefix, output_dir, min_base_qual)= sys.argv
elif len(sys.argv) == 9: #elif instead of else for explicit safety
(script_name, vcf_filename, sample, ref_fasta, bam_file, prefix, output_dir, min_base_qual, min_mapping_qual)= sys.argv

if prefix == 'NOPREFIX':
prefixed_sample = sample
else:
prefixed_sample = '_'.join([prefix, sample])

vcf_file = VCF(vcf_filename)
sample_index = vcf_file.samples.index(sample)

rc_for_indel = {}
rc_for_snp = {}
for variant in vcf_file:
alleles = [variant.REF] + variant.ALT
genotype = variant.genotypes[sample_index]
gt_alt_alleles = [allele for allele in genotype if allele > 0]
used_gt_alt_alleles = [alleles[index] for index in gt_alt_alleles]
if len(used_gt_alt_alleles) > 0:
var = sorted(used_gt_alt_alleles)[0]
else:
var = ""
ref = variant.REF
chr = variant.CHROM
start = variant.start
end = variant.end
pos = variant.POS

if len(ref) > 1 or len(var) > 1:
#it's an indel or mnp
if len(ref) == len(var) or (len(ref) > 1 and len(var) > 1):
sys.stderr.write("Complex variant or MNP will be skipped: %s\t%s\t%s\t%s\n" % (chr, pos, ref , var))
continue
elif len(ref) > len(var):
#it's a deletion
pos += 1
unmodified_ref = ref
ref = unmodified_ref[1]
var = "-%s" % unmodified_ref[1:]
for var in variant.ALT:
if len(ref) > 1 or len(var) > 1:
#it's an indel or mnp
if len(ref) == len(var) or (len(ref) > 1 and len(var) > 1):
sys.stderr.write("Complex variant or MNP will be skipped: %s\t%s\t%s\t%s\n" % (chr, pos, ref , var))
continue
elif len(ref) > len(var):
#it's a deletion
pos += 1
unmodified_ref = ref
ref = unmodified_ref[1]
var = "-%s" % unmodified_ref[1:]
else:
#it's an insertion
var = "+%s" % var[1:]
if chr not in rc_for_indel:
rc_for_indel[chr] = {}
if pos not in rc_for_indel[chr]:
rc_for_indel[chr][pos] = {}
if ref not in rc_for_indel[chr][pos]:
rc_for_indel[chr][pos][ref] = {}
rc_for_indel[chr][pos][ref] = variant
else:
#it's an insertion
var = "+%s" % var[1:]
if chr not in rc_for_indel:
rc_for_indel[chr] = {}
if pos not in rc_for_indel[chr]:
rc_for_indel[chr][pos] = {}
if ref not in rc_for_indel[chr][pos]:
rc_for_indel[chr][pos][ref] = {}
rc_for_indel[chr][pos][ref] = variant
else:
#it's a SNP
if chr not in rc_for_snp:
rc_for_snp[chr] = {}
if pos not in rc_for_snp[chr]:
rc_for_snp[chr][pos] = {}
if ref not in rc_for_snp[chr][pos]:
rc_for_snp[chr][pos][ref] = {}
rc_for_snp[chr][pos][ref] = variant
#it's a SNP
if chr not in rc_for_snp:
rc_for_snp[chr] = {}
if pos not in rc_for_snp[chr]:
rc_for_snp[chr][pos] = {}
if ref not in rc_for_snp[chr][pos]:
rc_for_snp[chr][pos][ref] = {}
rc_for_snp[chr][pos][ref] = variant

if len(rc_for_snp.keys()) > 0:
region_file = generate_region_list(rc_for_snp)
filter_sites_in_hash(region_file, bam_file, ref_fasta, sample, output_dir, False)
filter_sites_in_hash(region_file, bam_file, ref_fasta, prefixed_sample, output_dir, False, min_mapping_qual, min_base_qual)
else:
output_file = os.path.join(output_dir, sample + '_bam_readcount_snv.tsv')
output_file = os.path.join(output_dir, prefixed_sample + '_bam_readcount_snv.tsv')
open(output_file, 'w').close()

if len(rc_for_indel.keys()) > 0:
region_file = generate_region_list(rc_for_indel)
filter_sites_in_hash(region_file, bam_file, ref_fasta, sample, output_dir, True)
filter_sites_in_hash(region_file, bam_file, ref_fasta, prefixed_sample, output_dir, True, min_mapping_qual, min_base_qual)
else:
output_file = os.path.join(output_dir, sample + '_bam_readcount_indel.tsv')
output_file = os.path.join(output_dir, prefixed_sample + '_bam_readcount_indel.tsv')
open(output_file, 'w').close()