
Commit

Merge pull request #175 from timothymillar/minimise-file-handles
Version 0.9.2
timothymillar authored Mar 3, 2024
2 parents 92cb214 + 1e97aa5 commit 28983b8
Showing 44 changed files with 135 additions and 128 deletions.
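
Context for the diff below: per the branch name (minimise-file-handles) and issue #173, the goal is to stop holding one open alignment-file handle per sample bam for the lifetime of a job. Operating systems cap the number of file descriptors a process may hold open at once; a quick, illustrative way to inspect that cap in Python (not part of this commit):

```python
import resource

# Soft and hard limits on simultaneously open file descriptors.
# On many Linux systems the soft default is 1024, so a cohort with
# hundreds of sample bams, plus worker overhead, can approach it.
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print(f"open-file limit: soft={soft}, hard={hard}")
```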
CHANGELOG.md: 6 additions & 0 deletions
@@ -3,6 +3,12 @@
 ## Unreleased
 
 
+## Beta v0.9.2
+
+Bug Fixes:
+- Avoid holding alignment file handles open #173
+
+
 ## Beta v0.9.1
 
 New Features:
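
The fix is a pattern change rather than new behaviour: alignment files are now opened where they are used and closed promptly, instead of being opened once and cached for the whole run. A minimal sketch of the before/after pattern, with illustrative names (`sample_paths`, `fetch_read_names`) that do not appear in this diff:

```python
import pysam

# Before (the pattern removed in this PR): open every handle up front
# and hold it for the lifetime of the job.
#   handles = {s: pysam.AlignmentFile(p) for s, p in sample_paths.items()}

# After: open on demand; the context manager closes the handle even if
# an exception is raised mid-extraction. Assumes an indexed bam/cram.
def fetch_read_names(path, reference_path, contig, start, stop):
    with pysam.AlignmentFile(path, reference_filename=reference_path) as bam:
        return [read.query_name for read in bam.fetch(contig, start, stop)]
```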
Dockerfile: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ RUN apt-get update \
 
 RUN git clone https://github.com/PlantandFoodResearch/MCHap.git \
     && cd MCHap \
-    && git checkout v0.9.1 \
+    && git checkout v0.9.2 \
     && pip install -r requirements.txt \
     && python3 setup.py sdist \
     && python3 -m pip install dist/mchap-*tar.gz
docs/assemble.rst: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ MCHap assemble
 
 De novo assembly of micro-haplotypes.
 
-*(Last updated for MCHap version 0.9.1)*
+*(Last updated for MCHap version 0.9.2)*
 
 Background
 ----------
docs/call.rst: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ MCHap call
 
 Calling genotypes from known haplotypes.
 
-*(Last updated for MCHap version 0.9.1)*
+*(Last updated for MCHap version 0.9.2)*
 
 Background
 ----------
mchap/application/arguments.py: 14 additions & 5 deletions
@@ -717,7 +717,9 @@ def parse_sample_pools(samples, sample_bams, sample_pool_argument):
     return pools, pool_bams
 
 
-def parse_sample_bam_paths(bam_argument, sample_pool_argument, read_group_field):
+def parse_sample_bam_paths(
+    bam_argument, sample_pool_argument, read_group_field, reference_path
+):
     """Combine arguments relating to sample bam file specification.
 
     Parameters
@@ -738,7 +740,7 @@ def parse_sample_bam_paths(bam_argument, sample_pool_argument, read_group_field)
     textfile = False
     if len(bam_argument) == 1:
         try:
-            pysam.AlignmentFile(bam_argument[0])
+            pysam.AlignmentFile(bam_argument[0], reference_filename=reference_path)
         except ValueError:
             # not a bam
             textfile = True
@@ -747,7 +749,9 @@ def parse_sample_bam_paths(bam_argument, sample_pool_argument, read_group_field)
     else:
         bams = bam_argument
     if not textfile:
-        sample_bams = extract_sample_ids(bams, id=read_group_field)
+        sample_bams = extract_sample_ids(
+            bams, id=read_group_field, reference_path=reference_path
+        )
         samples = list(sample_bams)
 
     # case of plain-text filepath
@@ -761,7 +765,9 @@ def parse_sample_bam_paths(bam_argument, sample_pool_argument, read_group_field)
         if n_fields == 1:
             # list of bam paths
             bams = [line[0] for line in lines]
-            sample_bams = extract_sample_ids(bams, id=read_group_field)
+            sample_bams = extract_sample_ids(
+                bams, id=read_group_field, reference_path=reference_path
+            )
             samples = list(sample_bams)
         elif n_fields == 2:
             # list of sample-bam pairs
@@ -871,7 +877,10 @@ def collect_default_program_arguments(arguments):
     )
     # merge sample specific data with defaults
     samples, sample_bams = parse_sample_bam_paths(
-        arguments.bam, arguments.sample_pool[0], arguments.read_group_field[0]
+        arguments.bam,
+        arguments.sample_pool[0],
+        arguments.read_group_field[0],
+        reference_path=arguments.reference[0],
     )
     sample_ploidy = parse_sample_value_map(
         arguments.ploidy[0],
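
Threading `reference_path` through to every open is not just bookkeeping: a CRAM stores reads compressed against the reference, so htslib needs the reference sequence to decode them, and passing `reference_filename` avoids relying on the header's `UR` field or the `REF_PATH` environment variable (a comment elsewhere in this diff notes this is "important for cram performance"). A hedged example with illustrative file names:

```python
import pysam

# Without reference_filename, opening a CRAM may fail or stall while
# htslib tries to resolve the reference (e.g. via REF_PATH/REF_CACHE).
with pysam.AlignmentFile(
    "sample0.cram", reference_filename="reference.fasta"
) as bam:
    for rg in bam.header["RG"]:  # read groups carry the SM sample tag
        print(rg.get("SM"))
```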
mchap/application/baseclass.py: 21 additions & 30 deletions
@@ -99,16 +99,6 @@ def format_fields(self):
     def loci(self):
         raise NotImplementedError()
 
-    def alignment_files(self):
-        out = {}
-        for pool, pairs in self.sample_bams.items():
-            pairs = [
-                (sample, pysam.AlignmentFile(path, reference_filename=self.ref))
-                for sample, path in pairs
-            ]
-            out[pool] = pairs
-        return out
-
     def header_contigs(self):
         with pysam.VariantFile(self.vcf) as f:
             contigs = f.header.contigs.values()
@@ -135,7 +125,7 @@ def header(self):
         header = meta_fields + contigs + filters + info_fields + format_fields + columns
         return [str(line) for line in header]
 
-    def _locus_data(self, locus, alignment_files):
+    def _locus_data(self, locus, sample_bams):
         """Generate a LocusAssemblyData object for a given locus
         to be populated with data relating to a single vcf record.
         """
@@ -144,7 +134,7 @@ def _locus_data(self, locus, sample_bams):
         return LocusAssemblyData(
             locus=locus,
             samples=self.samples,
-            sample_bams=alignment_files,
+            sample_bams=sample_bams,
             sample_ploidy=self.sample_ploidy,
             sample_inbreeding=self.sample_inbreeding,
             infofields=infofields,
@@ -186,19 +176,22 @@ def encode_sample_reads(self, data):
             # of bam sample-path pairs.
             pairs = data.sample_bams[sample]
             read_chars, read_quals = [], []
-            for name, alignment_file in pairs:
-                chars, quals = extract_read_variants(
-                    data.locus,
-                    alignment_file=alignment_file,
-                    samples=name,
-                    id=self.read_group_field,
-                    min_quality=self.mapping_quality,
-                    skip_duplicates=self.skip_duplicates,
-                    skip_qcfail=self.skip_qcfail,
-                    skip_supplementary=self.skip_supplementary,
-                )[name]
-                read_chars.append(chars)
-                read_quals.append(quals)
+            for name, path in pairs:
+                with pysam.AlignmentFile(
+                    path, reference_filename=self.ref
+                ) as alignment_file:
+                    chars, quals = extract_read_variants(
+                        data.locus,
+                        alignment_file=alignment_file,
+                        samples=name,
+                        id=self.read_group_field,
+                        min_quality=self.mapping_quality,
+                        skip_duplicates=self.skip_duplicates,
+                        skip_qcfail=self.skip_qcfail,
+                        skip_supplementary=self.skip_supplementary,
+                    )[name]
+                    read_chars.append(chars)
+                    read_quals.append(quals)
             read_chars = np.concatenate(read_chars)
             read_quals = np.concatenate(read_quals)
 
@@ -343,7 +336,7 @@ def sumarise_vcf_record(self, data):
         data.infodata["AOP"] = prob_occurring.round(self.precision)
         return data
 
-    def call_locus(self, locus, alignment_files):
+    def call_locus(self, locus, sample_bams):
        """Call samples at a locus and formats resulting data
        into a VCF record line.
 
@@ -366,19 +359,17 @@ def call_locus(self, locus, sample_bams):
            VCF variant line.
        """
-        data = self._locus_data(locus, alignment_files)
+        data = self._locus_data(locus, sample_bams)
        self.encode_sample_reads(data)
        self.call_sample_genotypes(data)
        self.sumarise_sample_genotypes(data)
        self.sumarise_vcf_record(data)
        return data.format_vcf_record()
 
    def _assemble_loci_wrapped(self, loci):
-        # create single set of alignment files to use for every locus in the job
-        alignment_files = self.alignment_files()
        for locus in loci:
            try:
-                result = self.call_locus(locus, alignment_files)
+                result = self.call_locus(locus, self.sample_bams)
            except Exception as e:
                message = LOCUS_ASSEMBLY_ERROR.format(
                    name=locus.name,
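
The per-locus open in `encode_sample_reads` trades a small reopen cost (the bam/cram index is re-read on each open) for a bounded number of concurrent handles. The `with` statement is shorthand for a try/finally; a sketch of the equivalence, using an illustrative helper not present in the codebase:

```python
import pysam

def with_handle(path, reference_path, func):
    # Equivalent to `with pysam.AlignmentFile(...) as f: return func(f)`:
    # the handle is closed even if func raises.
    f = pysam.AlignmentFile(path, reference_filename=reference_path)
    try:
        return func(f)
    finally:
        f.close()

# e.g. with_handle("s.bam", "ref.fasta", lambda f: f.count("chr1", 0, 1000))
```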
mchap/application/find_snvs.py: 53 additions & 52 deletions
@@ -215,65 +215,69 @@ def _count_alleles(zeros, alleles):
     return
 
 
-def bam_samples(alignment_files, tag="SM"):
-    out = [None] * len(alignment_files)
-    for i, bam in enumerate(alignment_files):
-        read_groups = bam.header["RG"]
-        sample_id = read_groups[0][tag]
-        if len(read_groups) > 1:
-            for rg in read_groups:
-                if rg[tag] != sample_id:
-                    raise ValueError(
-                        "Expected one sample per bam but found {} and {} in {}".format(
-                            sample_id, rg[tag], bam.filename.decode()
-                        )
-                    )
-        out[i] = sample_id
+def bam_samples(bam_paths, reference_path, tag="SM"):
+    out = [None] * len(bam_paths)
+    for i, path in enumerate(bam_paths):
+        with pysam.AlignmentFile(path, reference_filename=reference_path) as bam:
+            read_groups = bam.header["RG"]
+            sample_id = read_groups[0][tag]
+            if len(read_groups) > 1:
+                for rg in read_groups:
+                    if rg[tag] != sample_id:
+                        raise ValueError(
+                            "Expected one sample per bam but found {} and {} in {}".format(
+                                sample_id, rg[tag], bam.filename.decode()
+                            )
+                        )
+            out[i] = sample_id
     return np.array(out)


 def bam_region_depths(
-    alignment_files,
+    bam_paths,
+    reference_path,
     contig,
     start,
     stop,
     dtype=np.int64,
     **kwargs,
 ):
-    n_samples = len(alignment_files)
+    n_samples = len(bam_paths)
     n_pos = stop - start
     shape = (n_pos, n_samples, 4)
     depths = np.zeros(shape, dtype=dtype)
-    for j, bam in enumerate(alignment_files):
-        for column in bam.pileup(
-            contig=contig,
-            start=start,
-            stop=stop,
-            truncate=True,
-            multiple_iterators=False,
-            **kwargs,
-        ):
-            # if start <= column.pos < stop:
-            i = column.pos - start
-            alleles = column.get_query_sequences()
-            if isinstance(alleles, list):
-                alleles = bases_to_indices(alleles)
-            _count_alleles(depths[i, j], alleles)
+    for j, path in enumerate(bam_paths):
+        with pysam.AlignmentFile(path, reference_filename=reference_path) as bam:
+            for column in bam.pileup(
+                contig=contig,
+                start=start,
+                stop=stop,
+                truncate=True,
+                multiple_iterators=False,
+                **kwargs,
+            ):
+                # if start <= column.pos < stop:
+                i = column.pos - start
+                alleles = column.get_query_sequences()
+                if isinstance(alleles, list):
+                    alleles = bases_to_indices(alleles)
+                _count_alleles(depths[i, j], alleles)
     return depths


 def write_vcf_header(
-    command, reference, info_fields=None, format_fields=None, samples=None
+    command, reference_path, info_fields=None, format_fields=None, samples=None
 ):
     vcfversion_header = str(headermeta.fileformat("v4.3"))
     date_header = str(headermeta.filedate())
     source_header = str(headermeta.source())
     command_header = str(headermeta.commandline(command))
-    reference_header = str(headermeta.reference(reference.filename.decode()))
-    contig_header = "\n".join(
-        str(headermeta.ContigHeader(s, i))
-        for s, i in zip(reference.references, reference.lengths)
-    )
+    with pysam.FastaFile(reference_path) as reference:
+        reference_header = str(headermeta.reference(reference.filename.decode()))
+        contig_header = "\n".join(
+            str(headermeta.ContigHeader(s, i))
+            for s, i in zip(reference.references, reference.lengths)
+        )
     components = [
         vcfversion_header,
         date_header,
@@ -400,8 +404,8 @@ def write_vcf_block(
     contig,
     start,
     stop,
-    reference,
-    alignment_files,
+    reference_path,
+    bam_paths,
     # sample_ploidy,
     # sample_inbreeding,
     # base_error_rate,
@@ -420,10 +424,12 @@ def write_vcf_block(
     assert start < stop
     variant_position = np.arange(start, stop)
     variant_contig = np.full(len(variant_position), contig)
-    variant_reference = np.array(list(reference.fetch(contig, start, stop).upper()))
+    with pysam.FastaFile(reference_path) as reference:
+        variant_reference = np.array(list(reference.fetch(contig, start, stop).upper()))
     variant_reference_index = bases_to_indices(variant_reference)
     allele_depth = bam_region_depths(
-        alignment_files,
+        bam_paths,
+        reference_path,
         contig,
         start,
         stop,
@@ -587,9 +593,8 @@ def main(command):
     bed = pd.read_table(bed_path, header=None)[[0, 1, 2]]
     bed.columns = ["contig", "start", "stop"]
     reference_path = args.reference[0]
-    reference = pysam.Fastafile(reference_path)
     samples, sample_bams = arguments.parse_sample_bam_paths(
-        args.bam, None, args.read_group_field[0]
+        args.bam, None, args.read_group_field[0], reference_path=reference_path
     )
     # sample_ploidy = arguments.parse_sample_value_map(
     #     args.ploidy[0],
@@ -611,13 +616,9 @@ def main(command):
     # create alignment file objects and reuse them throughout
     # this is important for cram performance!
     # also pass reference name explicitly for robustness
-    alignment_files = [
-        pysam.AlignmentFile(path, reference_filename=reference_path)
-        for path in bam_paths
-    ]
-    samples_found = bam_samples(alignment_files, tag=args.read_group_field[0]).astype(
-        "U"
-    )
+    samples_found = bam_samples(
+        bam_paths, reference_path, tag=args.read_group_field[0]
+    ).astype("U")
     mismatch = samples_found != samples
     if np.any(mismatch):
         raise IOError(
@@ -630,7 +631,7 @@ def main(command):
     format_fields = [formatfields.GT, formatfields.AD]
     write_vcf_header(
         command,
-        reference,
+        reference_path,
         samples=samples,
         info_fields=info_fields,
         format_fields=format_fields,
@@ -641,8 +642,8 @@ def main(command):
         interval.contig,
         interval.start,
         interval.stop,
-        reference,
-        alignment_files,
+        reference_path,
+        bam_paths,
         # sample_ploidy,
         # sample_inbreeding,
         # base_error_rate=args.base_error_rate[0],
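
`bam_region_depths` above now opens each bam inside the per-sample loop. The pileup pattern it relies on, reduced to a self-contained sketch (`simple_depths` is illustrative, not part of MCHap):

```python
import pysam

def simple_depths(path, reference_path, contig, start, stop):
    # truncate=True restricts columns to [start, stop); each column's
    # get_query_sequences() yields one base per overlapping read.
    counts = {}
    with pysam.AlignmentFile(path, reference_filename=reference_path) as bam:
        for column in bam.pileup(
            contig=contig, start=start, stop=stop, truncate=True
        ):
            for base in column.get_query_sequences():
                key = (column.reference_pos, base.upper())
                counts[key] = counts.get(key, 0) + 1
    return counts
```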
mchap/io/bam.py: 2 additions & 2 deletions
@@ -19,7 +19,7 @@
 ID_TAGS = {"ID", "SM"}
 
 
-def extract_sample_ids(bam_paths, id="SM"):
+def extract_sample_ids(bam_paths, id="SM", reference_path=None):
     """Extract sample id's from a list of bam files.
 
     Parameters
@@ -38,7 +38,7 @@ def extract_sample_ids(bam_paths, id="SM"):
     assert id in ID_TAGS
     data = {}
     for path in bam_paths:
-        bam = pysam.AlignmentFile(path)
+        bam = pysam.AlignmentFile(path, reference_filename=reference_path)
         # allow multiple read groups for a single sample within a bam file
         # but guard against duplicate sample identifier across multiple bams
         bam_data = {read_group[id]: path for read_group in bam.header["RG"]}
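
`extract_sample_ids` gains an optional `reference_path` defaulting to `None`, so existing bam-only callers are unaffected. A sketch of the contract it enforces, here wrapped in a context manager for symmetry with the rest of the PR (names are illustrative):

```python
import pysam

def sample_ids(bam_paths, reference_path=None, tag="SM"):
    # One path per sample id; the same id appearing in two different
    # bams is an error, mirroring the guard in mchap/io/bam.py.
    data = {}
    for path in bam_paths:
        with pysam.AlignmentFile(path, reference_filename=reference_path) as bam:
            for rg in bam.header["RG"]:
                if rg[tag] in data and data[rg[tag]] != path:
                    raise ValueError(f"Duplicate sample id {rg[tag]!r}")
                data[rg[tag]] = path
    return data
```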
(Diffs for the remaining changed files are not shown.)
