
Commit

Merge pull request #175 from timothymillar/minimise-file-handles
Version 0.9.2
timothymillar authored Mar 3, 2024
2 parents 92cb214 + 1e97aa5 commit 28983b8
Showing 44 changed files with 135 additions and 128 deletions.
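
Context for the diff below: per the branch name (minimise-file-handles) and issue #173, the goal is to stop holding one open alignment-file handle per sample bam for the lifetime of a job. Operating systems cap the number of file descriptors a process may hold open at once; a quick, illustrative way to inspect that cap in Python (not part of this commit):

```python
import resource

# Soft and hard limits on simultaneously open file descriptors.
# On many Linux systems the soft default is 1024, so a cohort with
# hundreds of sample bams, plus worker overhead, can approach it.
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print(f"open-file limit: soft={soft}, hard={hard}")
```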
CHANGELOG.md: 6 additions & 0 deletions
@@ -3,6 +3,12 @@
 ## Unreleased
 
 
+## Beta v0.9.2
+
+Bug Fixes:
+- Avoid holding alignment file handles open #173
+
+
 ## Beta v0.9.1
 
 New Features:
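
The fix is a pattern change rather than new behaviour: alignment files are now opened where they are used and closed promptly, instead of being opened once and cached for the whole run. A minimal sketch of the before/after pattern, with illustrative names (`sample_paths`, `fetch_read_names`) that do not appear in this diff:

```python
import pysam

# Before (the pattern removed in this PR): open every handle up front
# and hold it for the lifetime of the job.
#   handles = {s: pysam.AlignmentFile(p) for s, p in sample_paths.items()}

# After: open on demand; the context manager closes the handle even if
# an exception is raised mid-extraction. Assumes an indexed bam/cram.
def fetch_read_names(path, reference_path, contig, start, stop):
    with pysam.AlignmentFile(path, reference_filename=reference_path) as bam:
        return [read.query_name for read in bam.fetch(contig, start, stop)]
```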
Dockerfile: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ RUN apt-get update \
 
 RUN git clone https://github.com/PlantandFoodResearch/MCHap.git \
     && cd MCHap \
-    && git checkout v0.9.1 \
+    && git checkout v0.9.2 \
     && pip install -r requirements.txt \
     && python3 setup.py sdist \
     && python3 -m pip install dist/mchap-*tar.gz
docs/assemble.rst: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ MCHap assemble
 
 De novo assembly of micro-haplotypes.
 
-*(Last updated for MCHap version 0.9.1)*
+*(Last updated for MCHap version 0.9.2)*
 
 Background
 ----------
docs/call.rst: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ MCHap call
 
 Calling genotypes from known haplotypes.
 
-*(Last updated for MCHap version 0.9.1)*
+*(Last updated for MCHap version 0.9.2)*
 
 Background
 ----------
mchap/application/arguments.py: 14 additions & 5 deletions
@@ -717,7 +717,9 @@ def parse_sample_pools(samples, sample_bams, sample_pool_argument):
     return pools, pool_bams
 
 
-def parse_sample_bam_paths(bam_argument, sample_pool_argument, read_group_field):
+def parse_sample_bam_paths(
+    bam_argument, sample_pool_argument, read_group_field, reference_path
+):
     """Combine arguments relating to sample bam file specification.
 
     Parameters
@@ -738,7 +740,7 @@ def parse_sample_bam_paths(bam_argument, sample_pool_argument, read_group_field)
     textfile = False
     if len(bam_argument) == 1:
         try:
-            pysam.AlignmentFile(bam_argument[0])
+            pysam.AlignmentFile(bam_argument[0], reference_filename=reference_path)
         except ValueError:
             # not a bam
             textfile = True
@@ -747,7 +749,9 @@ def parse_sample_bam_paths(bam_argument, sample_pool_argument, read_group_field)
     else:
         bams = bam_argument
     if not textfile:
-        sample_bams = extract_sample_ids(bams, id=read_group_field)
+        sample_bams = extract_sample_ids(
+            bams, id=read_group_field, reference_path=reference_path
+        )
         samples = list(sample_bams)
 
     # case of plain-text filepath
@@ -761,7 +765,9 @@ def parse_sample_bam_paths(bam_argument, sample_pool_argument, read_group_field)
         if n_fields == 1:
             # list of bam paths
             bams = [line[0] for line in lines]
-            sample_bams = extract_sample_ids(bams, id=read_group_field)
+            sample_bams = extract_sample_ids(
+                bams, id=read_group_field, reference_path=reference_path
+            )
             samples = list(sample_bams)
         elif n_fields == 2:
             # list of sample-bam pairs
@@ -871,7 +877,10 @@ def collect_default_program_arguments(arguments):
     )
     # merge sample specific data with defaults
     samples, sample_bams = parse_sample_bam_paths(
-        arguments.bam, arguments.sample_pool[0], arguments.read_group_field[0]
+        arguments.bam,
+        arguments.sample_pool[0],
+        arguments.read_group_field[0],
+        reference_path=arguments.reference[0],
     )
     sample_ploidy = parse_sample_value_map(
         arguments.ploidy[0],
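
Threading `reference_path` through to every open is not just bookkeeping: a CRAM stores reads compressed against the reference, so htslib needs the reference sequence to decode them, and passing `reference_filename` avoids relying on the header's `UR` field or the `REF_PATH` environment variable (a comment elsewhere in this diff notes this is "important for cram performance"). A hedged example with illustrative file names:

```python
import pysam

# Without reference_filename, opening a CRAM may fail or stall while
# htslib tries to resolve the reference (e.g. via REF_PATH/REF_CACHE).
with pysam.AlignmentFile(
    "sample0.cram", reference_filename="reference.fasta"
) as bam:
    for rg in bam.header["RG"]:  # read groups carry the SM sample tag
        print(rg.get("SM"))
```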
mchap/application/baseclass.py: 21 additions & 30 deletions
@@ -99,16 +99,6 @@ def format_fields(self):
     def loci(self):
         raise NotImplementedError()
 
-    def alignment_files(self):
-        out = {}
-        for pool, pairs in self.sample_bams.items():
-            pairs = [
-                (sample, pysam.AlignmentFile(path, reference_filename=self.ref))
-                for sample, path in pairs
-            ]
-            out[pool] = pairs
-        return out
-
     def header_contigs(self):
         with pysam.VariantFile(self.vcf) as f:
             contigs = f.header.contigs.values()
@@ -135,7 +125,7 @@ def header(self):
         header = meta_fields + contigs + filters + info_fields + format_fields + columns
         return [str(line) for line in header]
 
-    def _locus_data(self, locus, alignment_files):
+    def _locus_data(self, locus, sample_bams):
         """Generate a LocusAssemblyData object for a given locus
         to be populated with data relating to a single vcf record.
         """
@@ -144,7 +134,7 @@ def _locus_data(self, locus, sample_bams):
         return LocusAssemblyData(
             locus=locus,
             samples=self.samples,
-            sample_bams=alignment_files,
+            sample_bams=sample_bams,
             sample_ploidy=self.sample_ploidy,
             sample_inbreeding=self.sample_inbreeding,
             infofields=infofields,
@@ -186,19 +176,22 @@ def encode_sample_reads(self, data):
             # of bam sample-path pairs.
             pairs = data.sample_bams[sample]
             read_chars, read_quals = [], []
-            for name, alignment_file in pairs:
-                chars, quals = extract_read_variants(
-                    data.locus,
-                    alignment_file=alignment_file,
-                    samples=name,
-                    id=self.read_group_field,
-                    min_quality=self.mapping_quality,
-                    skip_duplicates=self.skip_duplicates,
-                    skip_qcfail=self.skip_qcfail,
-                    skip_supplementary=self.skip_supplementary,
-                )[name]
-                read_chars.append(chars)
-                read_quals.append(quals)
+            for name, path in pairs:
+                with pysam.AlignmentFile(
+                    path, reference_filename=self.ref
+                ) as alignment_file:
+                    chars, quals = extract_read_variants(
+                        data.locus,
+                        alignment_file=alignment_file,
+                        samples=name,
+                        id=self.read_group_field,
+                        min_quality=self.mapping_quality,
+                        skip_duplicates=self.skip_duplicates,
+                        skip_qcfail=self.skip_qcfail,
+                        skip_supplementary=self.skip_supplementary,
+                    )[name]
+                    read_chars.append(chars)
+                    read_quals.append(quals)
             read_chars = np.concatenate(read_chars)
             read_quals = np.concatenate(read_quals)
 
@@ -343,7 +336,7 @@ def sumarise_vcf_record(self, data):
         data.infodata["AOP"] = prob_occurring.round(self.precision)
         return data
 
-    def call_locus(self, locus, alignment_files):
+    def call_locus(self, locus, sample_bams):
        """Call samples at a locus and formats resulting data
        into a VCF record line.
 
@@ -366,19 +359,17 @@ def call_locus(self, locus, sample_bams):
            VCF variant line.
        """
-        data = self._locus_data(locus, alignment_files)
+        data = self._locus_data(locus, sample_bams)
        self.encode_sample_reads(data)
        self.call_sample_genotypes(data)
        self.sumarise_sample_genotypes(data)
        self.sumarise_vcf_record(data)
        return data.format_vcf_record()
 
    def _assemble_loci_wrapped(self, loci):
-        # create single set of alignment files to use for every locus in the job
-        alignment_files = self.alignment_files()
        for locus in loci:
            try:
-                result = self.call_locus(locus, alignment_files)
+                result = self.call_locus(locus, self.sample_bams)
            except Exception as e:
                message = LOCUS_ASSEMBLY_ERROR.format(
                    name=locus.name,
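
The per-locus open in `encode_sample_reads` trades a small reopen cost (the bam/cram index is re-read on each open) for a bounded number of concurrent handles. The `with` statement is shorthand for a try/finally; a sketch of the equivalence, using an illustrative helper not present in the codebase:

```python
import pysam

def with_handle(path, reference_path, func):
    # Equivalent to `with pysam.AlignmentFile(...) as f: return func(f)`:
    # the handle is closed even if func raises.
    f = pysam.AlignmentFile(path, reference_filename=reference_path)
    try:
        return func(f)
    finally:
        f.close()

# e.g. with_handle("s.bam", "ref.fasta", lambda f: f.count("chr1", 0, 1000))
```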
mchap/application/find_snvs.py: 53 additions & 52 deletions
@@ -215,65 +215,69 @@ def _count_alleles(zeros, alleles):
     return
 
 
-def bam_samples(alignment_files, tag="SM"):
-    out = [None] * len(alignment_files)
-    for i, bam in enumerate(alignment_files):
-        read_groups = bam.header["RG"]
-        sample_id = read_groups[0][tag]
-        if len(read_groups) > 1:
-            for rg in read_groups:
-                if rg[tag] != sample_id:
-                    raise ValueError(
-                        "Expected one sample per bam but found {} and {} in {}".format(
-                            sample_id, rg[tag], bam.filename.decode()
-                        )
-                    )
-        out[i] = sample_id
+def bam_samples(bam_paths, reference_path, tag="SM"):
+    out = [None] * len(bam_paths)
+    for i, path in enumerate(bam_paths):
+        with pysam.AlignmentFile(path, reference_filename=reference_path) as bam:
+            read_groups = bam.header["RG"]
+            sample_id = read_groups[0][tag]
+            if len(read_groups) > 1:
+                for rg in read_groups:
+                    if rg[tag] != sample_id:
+                        raise ValueError(
+                            "Expected one sample per bam but found {} and {} in {}".format(
+                                sample_id, rg[tag], bam.filename.decode()
+                            )
+                        )
+            out[i] = sample_id
     return np.array(out)


 def bam_region_depths(
-    alignment_files,
+    bam_paths,
+    reference_path,
     contig,
     start,
     stop,
     dtype=np.int64,
     **kwargs,
 ):
-    n_samples = len(alignment_files)
+    n_samples = len(bam_paths)
     n_pos = stop - start
     shape = (n_pos, n_samples, 4)
     depths = np.zeros(shape, dtype=dtype)
-    for j, bam in enumerate(alignment_files):
-        for column in bam.pileup(
-            contig=contig,
-            start=start,
-            stop=stop,
-            truncate=True,
-            multiple_iterators=False,
-            **kwargs,
-        ):
-            # if start <= column.pos < stop:
-            i = column.pos - start
-            alleles = column.get_query_sequences()
-            if isinstance(alleles, list):
-                alleles = bases_to_indices(alleles)
-            _count_alleles(depths[i, j], alleles)
+    for j, path in enumerate(bam_paths):
+        with pysam.AlignmentFile(path, reference_filename=reference_path) as bam:
+            for column in bam.pileup(
+                contig=contig,
+                start=start,
+                stop=stop,
+                truncate=True,
+                multiple_iterators=False,
+                **kwargs,
+            ):
+                # if start <= column.pos < stop:
+                i = column.pos - start
+                alleles = column.get_query_sequences()
+                if isinstance(alleles, list):
+                    alleles = bases_to_indices(alleles)
+                _count_alleles(depths[i, j], alleles)
     return depths


 def write_vcf_header(
-    command, reference, info_fields=None, format_fields=None, samples=None
+    command, reference_path, info_fields=None, format_fields=None, samples=None
 ):
     vcfversion_header = str(headermeta.fileformat("v4.3"))
     date_header = str(headermeta.filedate())
     source_header = str(headermeta.source())
     command_header = str(headermeta.commandline(command))
-    reference_header = str(headermeta.reference(reference.filename.decode()))
-    contig_header = "\n".join(
-        str(headermeta.ContigHeader(s, i))
-        for s, i in zip(reference.references, reference.lengths)
-    )
+    with pysam.FastaFile(reference_path) as reference:
+        reference_header = str(headermeta.reference(reference.filename.decode()))
+        contig_header = "\n".join(
+            str(headermeta.ContigHeader(s, i))
+            for s, i in zip(reference.references, reference.lengths)
+        )
     components = [
         vcfversion_header,
         date_header,
@@ -400,8 +404,8 @@ def write_vcf_block(
     contig,
     start,
     stop,
-    reference,
-    alignment_files,
+    reference_path,
+    bam_paths,
     # sample_ploidy,
     # sample_inbreeding,
     # base_error_rate,
@@ -420,10 +424,12 @@ def write_vcf_block(
     assert start < stop
     variant_position = np.arange(start, stop)
     variant_contig = np.full(len(variant_position), contig)
-    variant_reference = np.array(list(reference.fetch(contig, start, stop).upper()))
+    with pysam.FastaFile(reference_path) as reference:
+        variant_reference = np.array(list(reference.fetch(contig, start, stop).upper()))
     variant_reference_index = bases_to_indices(variant_reference)
     allele_depth = bam_region_depths(
-        alignment_files,
+        bam_paths,
+        reference_path,
         contig,
         start,
         stop,
@@ -587,9 +593,8 @@ def main(command):
     bed = pd.read_table(bed_path, header=None)[[0, 1, 2]]
     bed.columns = ["contig", "start", "stop"]
     reference_path = args.reference[0]
-    reference = pysam.Fastafile(reference_path)
     samples, sample_bams = arguments.parse_sample_bam_paths(
-        args.bam, None, args.read_group_field[0]
+        args.bam, None, args.read_group_field[0], reference_path=reference_path
     )
     # sample_ploidy = arguments.parse_sample_value_map(
     #     args.ploidy[0],
@@ -611,13 +616,9 @@ def main(command):
     # create alignment file objects and reuse them throughout
     # this is important for cram performance!
     # also pass reference name explicitly for robustness
-    alignment_files = [
-        pysam.AlignmentFile(path, reference_filename=reference_path)
-        for path in bam_paths
-    ]
-    samples_found = bam_samples(alignment_files, tag=args.read_group_field[0]).astype(
-        "U"
-    )
+    samples_found = bam_samples(
+        bam_paths, reference_path, tag=args.read_group_field[0]
+    ).astype("U")
     mismatch = samples_found != samples
     if np.any(mismatch):
         raise IOError(
@@ -630,7 +631,7 @@ def main(command):
     format_fields = [formatfields.GT, formatfields.AD]
     write_vcf_header(
         command,
-        reference,
+        reference_path,
         samples=samples,
         info_fields=info_fields,
         format_fields=format_fields,
@@ -641,8 +642,8 @@ def main(command):
         interval.contig,
         interval.start,
         interval.stop,
-        reference,
-        alignment_files,
+        reference_path,
+        bam_paths,
         # sample_ploidy,
         # sample_inbreeding,
         # base_error_rate=args.base_error_rate[0],
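
`bam_region_depths` above now opens each bam inside the per-sample loop. The pileup pattern it relies on, reduced to a self-contained sketch (`simple_depths` is illustrative, not part of MCHap):

```python
import pysam

def simple_depths(path, reference_path, contig, start, stop):
    # truncate=True restricts columns to [start, stop); each column's
    # get_query_sequences() yields one base per overlapping read.
    counts = {}
    with pysam.AlignmentFile(path, reference_filename=reference_path) as bam:
        for column in bam.pileup(
            contig=contig, start=start, stop=stop, truncate=True
        ):
            for base in column.get_query_sequences():
                key = (column.reference_pos, base.upper())
                counts[key] = counts.get(key, 0) + 1
    return counts
```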
mchap/io/bam.py: 2 additions & 2 deletions
@@ -19,7 +19,7 @@
 ID_TAGS = {"ID", "SM"}
 
 
-def extract_sample_ids(bam_paths, id="SM"):
+def extract_sample_ids(bam_paths, id="SM", reference_path=None):
     """Extract sample id's from a list of bam files.
 
     Parameters
@@ -38,7 +38,7 @@ def extract_sample_ids(bam_paths, id="SM"):
     assert id in ID_TAGS
     data = {}
     for path in bam_paths:
-        bam = pysam.AlignmentFile(path)
+        bam = pysam.AlignmentFile(path, reference_filename=reference_path)
         # allow multiple read groups for a single sample within a bam file
         # but guard against duplicate sample identifier across multiple bams
         bam_data = {read_group[id]: path for read_group in bam.header["RG"]}
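
`extract_sample_ids` gains an optional `reference_path` defaulting to `None`, so existing bam-only callers are unaffected. A sketch of the contract it enforces, here wrapped in a context manager for symmetry with the rest of the PR (names are illustrative):

```python
import pysam

def sample_ids(bam_paths, reference_path=None, tag="SM"):
    # One path per sample id; the same id appearing in two different
    # bams is an error, mirroring the guard in mchap/io/bam.py.
    data = {}
    for path in bam_paths:
        with pysam.AlignmentFile(path, reference_filename=reference_path) as bam:
            for rg in bam.header["RG"]:
                if rg[tag] in data and data[rg[tag]] != path:
                    raise ValueError(f"Duplicate sample id {rg[tag]!r}")
                data[rg[tag]] = path
    return data
```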
(Diffs for the remaining changed files are not shown.)
