Illumina · mjung2019 · Jun 13, 2019 · Jun 13, 2019 · Jun 13, 2019 · Jun 13, 2019
diff --git a/README.md b/README.md
@@ -32,7 +32,7 @@ The library depends on the availability of the numpy package in the python insta
 > for (locus, genotype) in zip( names, genotypes ):  
 > &nbsp;&nbsp;sys.stdout.write( locus + "," + code2genotype[genotype] + "\n" )
 
-Also, see examples/gtc_final_report.py for an additional example of usage. 
+Also, see examples/gtc_final_report.py and examples/gtc_final_report_matrix.py for additional examples of usage. 
 
 ## GTC File Format
 The specification for the GTC file format is provided in docs/GTC_File_Format_v5.pdf

diff --git a/examples/gtc_final_report.py b/examples/gtc_final_report.py
@@ -2,7 +2,7 @@
 import argparse
 import os
 from datetime import datetime
-from IlluminaBeadArrayFiles import GenotypeCalls, BeadPoolManifest, code2genotype
+from IlluminaBeadArrayFiles import BeadPoolManifest, GenotypeCalls, code2genotype
 
 delim = "\t"
 
@@ -18,7 +18,7 @@
     sys.exit(-1)
 
 try:
-    manifest = BeadPoolManifest(args.manifest)
+    manifest = BeadPoolManifest.BeadPoolManifest(args.manifest)
 except:
     sys.stderr.write("Failed to read data from manifest\n")
     sys.exit(-1)
@@ -39,15 +39,17 @@
     output_handle.write(delim.join(["Total Samples", str(len(samples))]) + "\n")
 
     output_handle.write("[Data]\n")
-    output_handle.write(delim.join(["SNP Name", "Sample ID", "Chr", "MapInfo", "Alleles - AB", "Alleles - Plus", "Alleles - Forward"]) + "\n")
+    output_handle.write(delim.join(["SNP Name", "Sample ID", "Chr", "MapInfo", "Alleles - AB", "Alleles - Plus", "Alleles - Forward", "Alleles - TOP", "Genotype Score"]) + "\n")
     for gtc_file in samples:
         sys.stderr.write("Processing " + gtc_file + "\n")
         gtc_file = os.path.join(args.gtc_directory, gtc_file)
-        gtc = GenotypeCalls(gtc_file)
+        gtc = GenotypeCalls.GenotypeCalls(gtc_file)
         genotypes = gtc.get_genotypes()
         plus_strand_genotypes = gtc.get_base_calls_plus_strand(manifest.snps, manifest.ref_strands)
         forward_strand_genotypes = gtc.get_base_calls_forward_strand(manifest.snps, manifest.source_strands)
+        top_strand_genotypes = gtc.get_base_calls_TOP_strand(manifest.snps, manifest.ilmn_strands)
+        genotype_scores = gtc.get_genotype_scores()
 
         assert len(genotypes) == len(manifest.names)
-        for (name, chrom, map_info, genotype, ref_strand_genotype, source_strand_genotype) in zip(manifest.names, manifest.chroms, manifest.map_infos, genotypes, plus_strand_genotypes, forward_strand_genotypes):
-            output_handle.write(delim.join([name, os.path.basename(gtc_file)[:-4], chrom, str(map_info), code2genotype[genotype], ref_strand_genotype, source_strand_genotype]) + "\n")
+        for (name, chrom, map_info, genotype, ref_strand_genotype, source_strand_genotype, ilmn_strand_genotype, genotype_score) in zip(manifest.names, manifest.chroms, manifest.map_infos, genotypes, plus_strand_genotypes, forward_strand_genotypes, top_strand_genotypes, genotype_scores):
+            output_handle.write(delim.join([name, os.path.basename(gtc_file)[:-4], chrom, str(map_info), code2genotype[genotype], ref_strand_genotype, source_strand_genotype, ilmn_strand_genotype, str(genotype_score)]) + "\n")
diff --git a/examples/gtc_final_report_matrix.py b/examples/gtc_final_report_matrix.py
@@ -0,0 +1,146 @@
+import sys
+import argparse
+import os
+from datetime import datetime
+from IlluminaBeadArrayFiles import BeadPoolManifest, GenotypeCalls, code2genotype
+from pandas import DataFrame
+from numpy import around
+
+"""
+This example will allow the user to create matrix report files identical to the ones exported from GenomeStudio
+by chosing the genotype with or without the genotype scores.
+"""
+
+def build_dict(matrix = None, samplename = None, names = None, genotypes = None, genotype_scores = None):
+    if genotype_scores is not None:
+        for (name, genotype, genotype_score) in zip(names, genotypes, genotype_scores):
+            if samplename in matrix:
+                matrix[samplename][name] = genotype + "|" + str(genotype_score)
+            else:
+                matrix[samplename] = {}
+                matrix[samplename][name] = genotype + "|" + str(genotype_score)
+    else:
+        for (name, genotype) in zip(names, genotypes):
+            if samplename in matrix:
+                matrix[samplename][name] = genotype
+            else:
+                matrix[samplename] = {}
+                matrix[samplename][name] = genotype
+    return matrix
+
+delim = "\t"
+NUM_ARGUMENTS = 6
+
+parser = argparse.ArgumentParser("Generate a final report from a directory of GTC files")
+parser.add_argument("manifest", help="BPM manifest file")
+parser.add_argument("gtc_directory", help="Directory containing GTC files")
+parser.add_argument("output_file", help="Location to write report")
+parser.add_argument("--forward", help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --forward 1, print matrix with forward alleles")
+parser.add_argument("--forward_GC", help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --forward_GC 1, print matrix with forward alleles including genotype scores")
+parser.add_argument("--top", help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --top 1, print matrix with top alleles")
+parser.add_argument("--top_GC", help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --top_GC 1, print matrix with top alleles including genotype scores")
+parser.add_argument("--AB", help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --forward 1, print matrix with forward alleles")
+parser.add_argument("--AB_GC", help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --forward_GC 1, print matrix with forward alleles including genotype scores")
+parser.add_argument("--plus", help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --top 1, print matrix with top alleles")
+parser.add_argument("--plus_GC", help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --top_GC 1, print matrix with top alleles including genotype scores")
+
+args = parser.parse_args()
+
+if len(sys.argv) != NUM_ARGUMENTS:
+    sys.stderr.write("For matrix report user needs to provide either forward or top strand parameter with or without genotype score, can only build one report!\n")
+    sys.exit(-1)
+
+if os.path.isfile(args.output_file):
+    sys.stderr.write("Output file already exists, please delete and re-run\n")
+    sys.exit(-1)
+
+try:
+    manifest = BeadPoolManifest.BeadPoolManifest(args.manifest)
+except:
+    sys.stderr.write("Failed to read data from manifest\n")
+    sys.exit(-1)
+
+with open(args.output_file, "w") as output_handle:
+    output_handle.write("[Header]\n")
+    output_handle.write(delim.join(["Processing Date", datetime.now().strftime("%m/%d/%Y %I:%M %p")])+ "\n")
+    output_handle.write(delim.join(["Content", os.path.basename(args.manifest)]) + "\n")
+    output_handle.write(delim.join(["Num SNPs", str(len(manifest.names))]) + "\n")
+    output_handle.write(delim.join(["Total SNPs", str(len(manifest.names))]) + "\n")
+
+    samples = []
+    for gtc_file in os.listdir(args.gtc_directory):
+        if gtc_file.lower().endswith(".gtc"):
+            samples.append(gtc_file)
+
+    output_handle.write(delim.join(["Num Samples", str(len(samples))]) + "\n")
+    output_handle.write(delim.join(["Total Samples", str(len(samples))]) + "\n")
+    output_handle.write("[Data]\n")
+
+    matrix_forward = {}
+    matrix_forward_GC = {}
+    matrix_top = {}
+    matrix_top_GC = {}
+    matrix_plus = {}
+    matrix_plus_GC = {}
+    matrix_AB = {}
+    matrix_AB_GC = {}
+
+    for gtc_file in samples:
+        samplename = os.path.basename(gtc_file)[:-4]
+        sys.stderr.write("Processing " + gtc_file + "\n")
+        gtc_file = os.path.join(args.gtc_directory, gtc_file)
+        gtc = GenotypeCalls.GenotypeCalls(gtc_file)
+        genotypes = [code2genotype[genotype] for genotype in gtc.get_genotypes()]
+        assert len(genotypes) == len(manifest.names)
+        if args.plus or args.plus_GC:
+            plus_strand_genotypes = gtc.get_base_calls_plus_strand(manifest.snps, manifest.ref_strands)
+        if args.forward or args.forward_GC:
+            forward_strand_genotypes = gtc.get_base_calls_forward_strand(manifest.snps, manifest.source_strands)
+        if args.top or args.top_GC:
+            top_strand_genotypes = gtc.get_base_calls_TOP_strand(manifest.snps, manifest.ilmn_strands)
+        if args.forward_GC or args.top_GC or args.plus_GC or args.AB_GC:
+            genotype_scores = around(gtc.get_genotype_scores(), decimals = 4)
+        #build dictionary for pandas
+        if args.forward_GC:
+            matrix_forward_GC = build_dict(matrix = matrix_forward_GC, samplename = samplename, \
+                names = manifest.names, genotypes = forward_strand_genotypes, genotype_scores = genotype_scores)
+        elif args.forward:
+            matrix_forward = build_dict(matrix = matrix_forward, samplename = samplename, \
+                names = manifest.names, genotypes = forward_strand_genotypes)
+        elif args.top:
+            matrix_top = build_dict(matrix = matrix_top, samplename = samplename, \
+                names = manifest.names, genotypes = top_strand_genotypes)
+        elif args.top_GC:
+            matrix_top_GC = build_dict(matrix = matrix_top_GC, samplename = samplename, \
+                names = manifest.names, genotypes = top_strand_genotypes, genotype_scores = genotype_scores)
+        elif args.plus_GC:
+            matrix_plus_GC = build_dict(matrix = matrix_plus_GC, samplename = samplename, \
+                names = manifest.names, genotypes = plus_strand_genotypes, genotype_scores = genotype_scores)
+        elif args.plus:
+            matrix_plus = build_dict(matrix = matrix_plus, samplename = samplename, \
+                names = manifest.names, genotypes = plus_strand_genotypes)
+        elif args.AB:
+            matrix_AB = build_dict(matrix = matrix_AB, samplename = samplename, \
+                names = manifest.names, genotypes = genotypes)
+        elif args.AB_GC:
+            matrix_AB_GC = build_dict(matrix = matrix_AB_GC, samplename = samplename, \
+                names = manifest.names, genotypes = genotypes, genotype_scores = genotype_scores)
+    #create pandas dataframe from dictionaryand append to file
+    if args.forward_GC:
+        df = DataFrame.from_dict(matrix_forward_GC)
+    elif args.forward:
+        df = DataFrame.from_dict(matrix_forward)
+    elif args.top:
+        df = DataFrame.from_dict(matrix_top)
+    elif args.top_GC:
+        df = DataFrame.from_dict(matrix_top_GC)
+    elif args.plus_GC:
+        df = DataFrame.from_dict(matrix_plus_GC)
+    elif args.plus:
+        df = DataFrame.from_dict(matrix_plus)
+    elif args.AB:
+        df = DataFrame.from_dict(matrix_AB)
+    elif args.AB_GC:
+        df = DataFrame.from_dict(matrix_AB_GC)
+    df = df.reindex(manifest.names)
+    df.to_csv(output_handle, sep = delim)
diff --git a/module/BeadPoolManifest.py b/module/BeadPoolManifest.py
@@ -1,5 +1,6 @@
 from .BeadArrayUtility import read_int, read_string, read_byte
 
+
 class BeadPoolManifest(object):
     """
     Class for parsing binary (BPM) manifest file.
@@ -14,6 +15,7 @@ class BeadPoolManifest(object):
                                          list of normalization transforms read from GTC file
         ref_strands (list(int)): Reference strand annotation for loci (see RefStrand class)
         source_strands (list(int)): Source strand annotations for loci (see SourceStrand class)
+        ilmn_strands  (list(int)): Ilumina strand annotations for loci (see IlmnStrand class)
         num_loci (int): Number of loci in manifest
         manifest_name (string): Name of manifest
         control_config (string): Control description from manifest
@@ -39,6 +41,7 @@ def __init__(self, filename):
         self.normalization_lookups = []
         self.ref_strands = []
         self.source_strands = []
+        self.ilmn_strands = []
         self.num_loci = 0
         self.manifest_name = ""
         self.control_config = ""
@@ -100,6 +103,7 @@ def __parse_file(self, manifest_file):
             self.map_infos = [0] * self.num_loci
             self.ref_strands = [RefStrand.Unknown] * self.num_loci
             self.source_strands = [SourceStrand.Unknown] * self.num_loci
+            self.ilmn_strands = [IlmnStrand.Unknown] * self.num_loci
             for idx in xrange(self.num_loci):
                 locus_entry = LocusEntry(manifest_handle)
                 self.assay_types[name_lookup[locus_entry.name]] = locus_entry.assay_type
@@ -109,6 +113,7 @@ def __parse_file(self, manifest_file):
                 self.map_infos[name_lookup[locus_entry.name]] = locus_entry.map_info
                 self.ref_strands[name_lookup[locus_entry.name]] = locus_entry.ref_strand
                 self.source_strands[name_lookup[locus_entry.name]] = locus_entry.source_strand
+                self.ilmn_strands[name_lookup[locus_entry.name]] = locus_entry.ilmn_strand
 
             if len(self.normalization_ids) != len(self.assay_types):
                 raise Exception(
@@ -179,6 +184,71 @@ def from_string(source_strand):
             raise ValueError(
                 "Unexpected value for source strand " + source_strand)
 
+
+class IlmnStrand(object):
+    Unknown = 0
+    TOP = 1
+    BOT = 2
+    PLUS = 1
+    MINUS = 2
+
+    @staticmethod
+    def to_string(ilmn_strand):
+        """Get an integer representation of Illumina strand annotation
+
+        Args:
+            ilmn_strand (str) : string representation of Illumina strand annotation (e.g., "T")
+
+        Returns:
+            int : int representation of Illumina strand annotation (e.g. IlmnStrand.TOP)
+
+        Raises:
+            ValueError: Unexpected value for Illumina strand
+        """
+        if ilmn_strand == IlmnStrand.Unknown:
+            return "U"
+        elif ilmn_strand == IlmnStrand.TOP:
+            return "T"
+        elif ilmn_strand == IlmnStrand.BOT:
+            return "B"
+        elif ilmn_strand == IlmnStrand.PLUS:
+            return "P"
+        elif ilmn_strand == IlmnStrand.MINUS:
+            return "M"
+
+        else:
+            raise ValueError(
+                "Unexpected value for ilmn strand " + ilmn_strand)
+
+    @staticmethod
+    def from_string(ilmn_strand):
+        """
+        Get a string representation of Illumina strand annotation
+
+        Args:
+            ilmn_strand (int) : int representation of Illumina strand (e.g., IlmnStrand.TOP)
+
+        Returns:
+            str : string representation of Illumina strand annotation
+
+        Raises:
+            ValueError: Unexpected value for Illumina strand
+        """
+        if ilmn_strand == "U" or ilmn_strand == "":
+            return IlmnStrand.Unknown
+        if ilmn_strand == "T":
+            return IlmnStrand.TOP
+        elif ilmn_strand == "B":
+            return IlmnStrand.BOT
+        if ilmn_strand == "P":
+            return IlmnStrand.PLUS
+        elif ilmn_strand == "M":
+            return IlmnStrand.MINUS
+        else:
+            raise ValueError(
+                "Unexpected value for ilmn strand " + ilmn_strand)
+
+
 class RefStrand(object):
     Unknown = 0
     Plus = 1
@@ -188,13 +258,13 @@ class RefStrand(object):
     def to_string(ref_strand):
         """
         Get a string reprensetation of ref strand annotation
-
+        
         Args:
             ref_strand (int) : int representation of ref strand (e.g., RefStrand.Plus)
-
+        
         Returns:
             str : string representation of reference strand annotation
-
+        
         Raises:
             ValueError: Unexpected value for reference strand
         """
@@ -211,13 +281,13 @@ def to_string(ref_strand):
     @staticmethod
     def from_string(ref_strand):
         """Get an integer representation of ref strand annotation
-
+        
         Args:
             ref_strand (str) : string representation of reference strand annotation (e.g., "+")
-
+        
         Returns:
             int : int representation of reference strand annotation (e.g. RefStrand.Plus)
-
+        
         Raises:
             ValueError: Unexpected value for reference strand
         """
@@ -235,7 +305,7 @@ class LocusEntry(object):
     """
     Helper class representing a locus entry within a bead pool manifest. Current only support version
     6,7, and 8.
-
+    
     Attributes:
         ilmn_id (string) : IlmnID (probe identifier) of locus
         name (string): Name (variant identifier) of locus
@@ -247,6 +317,7 @@ class LocusEntry(object):
         address_b (int) : AddressB ID of locus (0 if none)
         ref_strand (int) : See RefStrand class
         source_strand (int) : See SourceStrand class
+        ilmn_strand (int) : See IlmnStrand class
     """
 
     def __init__(self, handle):
@@ -269,6 +340,7 @@ def __init__(self, handle):
         self.address_b = -1
         self.ref_strand = RefStrand.Unknown
         self.source_strand = SourceStrand.Unknown
+        self.ilmn_strand = IlmnStrand.Unknown
         self.__parse_file(handle)
 
     def __parse_file(self, handle):
@@ -308,6 +380,8 @@ def __parse_locus_version_6(self, handle):
         self.ilmn_id = read_string(handle)
         self.source_strand = SourceStrand.from_string(
             self.ilmn_id.split("_")[-2])
+        self.ilmn_strand = IlmnStrand.from_string(
+            self.ilmn_id.split("_")[-3])
         self.name = read_string(handle)
         for idx in xrange(3):
             read_string(handle)