Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added support for Top Alleles, added more report options #18

Open
wants to merge 5 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ The library depends on the availability of the numpy package in the python insta
> for (locus, genotype) in zip( names, genotypes ):
>   sys.stdout.write( locus + "," + code2genotype[genotype] + "\n" )

Also, see examples/gtc_final_report.py for an additional example of usage.
Also, see examples/gtc_final_report.py and examples/gtc_final_report_matrix.py for additional examples of usage.

## GTC File Format
The specification for the GTC file format is provided in docs/GTC_File_Format_v5.pdf
Expand Down
14 changes: 8 additions & 6 deletions examples/gtc_final_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import argparse
import os
from datetime import datetime
from IlluminaBeadArrayFiles import GenotypeCalls, BeadPoolManifest, code2genotype
from IlluminaBeadArrayFiles import BeadPoolManifest, GenotypeCalls, code2genotype

delim = "\t"

Expand All @@ -18,7 +18,7 @@
sys.exit(-1)

try:
manifest = BeadPoolManifest(args.manifest)
manifest = BeadPoolManifest.BeadPoolManifest(args.manifest)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the reference to this import need to change?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reference should be fine, checked with running gtf_final_report_matrix.py. There is some ambiguity as the module name and class name are the same.

except:
sys.stderr.write("Failed to read data from manifest\n")
sys.exit(-1)
Expand All @@ -39,15 +39,17 @@
output_handle.write(delim.join(["Total Samples", str(len(samples))]) + "\n")

output_handle.write("[Data]\n")
output_handle.write(delim.join(["SNP Name", "Sample ID", "Chr", "MapInfo", "Alleles - AB", "Alleles - Plus", "Alleles - Forward"]) + "\n")
output_handle.write(delim.join(["SNP Name", "Sample ID", "Chr", "MapInfo", "Alleles - AB", "Alleles - Plus", "Alleles - Forward", "Alleles - TOP", "Genotype Score"]) + "\n")
for gtc_file in samples:
sys.stderr.write("Processing " + gtc_file + "\n")
gtc_file = os.path.join(args.gtc_directory, gtc_file)
gtc = GenotypeCalls(gtc_file)
gtc = GenotypeCalls.GenotypeCalls(gtc_file)
genotypes = gtc.get_genotypes()
plus_strand_genotypes = gtc.get_base_calls_plus_strand(manifest.snps, manifest.ref_strands)
forward_strand_genotypes = gtc.get_base_calls_forward_strand(manifest.snps, manifest.source_strands)
top_strand_genotypes = gtc.get_base_calls_TOP_strand(manifest.snps, manifest.ilmn_strands)
genotype_scores = gtc.get_genotype_scores()

assert len(genotypes) == len(manifest.names)
for (name, chrom, map_info, genotype, ref_strand_genotype, source_strand_genotype) in zip(manifest.names, manifest.chroms, manifest.map_infos, genotypes, plus_strand_genotypes, forward_strand_genotypes):
output_handle.write(delim.join([name, os.path.basename(gtc_file)[:-4], chrom, str(map_info), code2genotype[genotype], ref_strand_genotype, source_strand_genotype]) + "\n")
for (name, chrom, map_info, genotype, ref_strand_genotype, source_strand_genotype, ilmn_strand_genotype, genotype_score) in zip(manifest.names, manifest.chroms, manifest.map_infos, genotypes, plus_strand_genotypes, forward_strand_genotypes, top_strand_genotypes, genotype_scores):
output_handle.write(delim.join([name, os.path.basename(gtc_file)[:-4], chrom, str(map_info), code2genotype[genotype], ref_strand_genotype, source_strand_genotype, ilmn_strand_genotype, str(genotype_score)]) + "\n")
146 changes: 146 additions & 0 deletions examples/gtc_final_report_matrix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
import sys
import argparse
import os
from datetime import datetime
from IlluminaBeadArrayFiles import BeadPoolManifest, GenotypeCalls, code2genotype
from pandas import DataFrame
from numpy import around

"""
This example will allow the user to create matrix report files identical to the ones exported from GenomeStudio
by chosing the genotype with or without the genotype scores.
"""

def build_dict(matrix = None, samplename = None, names = None, genotypes = None, genotype_scores = None):
if genotype_scores is not None:
for (name, genotype, genotype_score) in zip(names, genotypes, genotype_scores):
if samplename in matrix:
matrix[samplename][name] = genotype + "|" + str(genotype_score)
else:
matrix[samplename] = {}
matrix[samplename][name] = genotype + "|" + str(genotype_score)
else:
for (name, genotype) in zip(names, genotypes):
if samplename in matrix:
matrix[samplename][name] = genotype
else:
matrix[samplename] = {}
matrix[samplename][name] = genotype
return matrix

delim = "\t"
NUM_ARGUMENTS = 6

parser = argparse.ArgumentParser("Generate a final report from a directory of GTC files")
parser.add_argument("manifest", help="BPM manifest file")
parser.add_argument("gtc_directory", help="Directory containing GTC files")
parser.add_argument("output_file", help="Location to write report")
parser.add_argument("--forward", help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --forward 1, print matrix with forward alleles")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should there be an enumerated format option here?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see any harm in explicitly spelling out the arguments in the parser as long as we are not expecting many more cases. But if you like I can use an enumerator for the options. I would think the trade-off in better readability not that big though.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason you have to pass a "1" instead of using something like action='store_true' which would make something like args.forward eval to True? Also, why does the "help" have the entire command written, wouldn't that be redundant?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not really, you can pass action="store_true", default=False to the argument and then you are fine. Ok, I can make this change.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, concerning should we write the entire command. Normally I wouldn't bother but I got so many requests how to run commands for folks not familiar with the console, so doesn't hurt to be too obvious.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, added the changes with the latest commit

parser.add_argument("--forward_GC", help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --forward_GC 1, print matrix with forward alleles including genotype scores")
parser.add_argument("--top", help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --top 1, print matrix with top alleles")
parser.add_argument("--top_GC", help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --top_GC 1, print matrix with top alleles including genotype scores")
parser.add_argument("--AB", help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --forward 1, print matrix with forward alleles")
parser.add_argument("--AB_GC", help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --forward_GC 1, print matrix with forward alleles including genotype scores")
parser.add_argument("--plus", help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --top 1, print matrix with top alleles")
parser.add_argument("--plus_GC", help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --top_GC 1, print matrix with top alleles including genotype scores")

args = parser.parse_args()

if len(sys.argv) != NUM_ARGUMENTS:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The argument parser should be able to handle this error?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this case yes as only one matrix report is supported per run, also didn't make sense to me to add a default case.

sys.stderr.write("For matrix report user needs to provide either forward or top strand parameter with or without genotype score, can only build one report!\n")
sys.exit(-1)

if os.path.isfile(args.output_file):
sys.stderr.write("Output file already exists, please delete and re-run\n")
sys.exit(-1)

try:
manifest = BeadPoolManifest.BeadPoolManifest(args.manifest)
except:
sys.stderr.write("Failed to read data from manifest\n")
sys.exit(-1)

with open(args.output_file, "w") as output_handle:
output_handle.write("[Header]\n")
output_handle.write(delim.join(["Processing Date", datetime.now().strftime("%m/%d/%Y %I:%M %p")])+ "\n")
output_handle.write(delim.join(["Content", os.path.basename(args.manifest)]) + "\n")
output_handle.write(delim.join(["Num SNPs", str(len(manifest.names))]) + "\n")
output_handle.write(delim.join(["Total SNPs", str(len(manifest.names))]) + "\n")

samples = []
for gtc_file in os.listdir(args.gtc_directory):
if gtc_file.lower().endswith(".gtc"):
samples.append(gtc_file)

output_handle.write(delim.join(["Num Samples", str(len(samples))]) + "\n")
output_handle.write(delim.join(["Total Samples", str(len(samples))]) + "\n")
output_handle.write("[Data]\n")

matrix_forward = {}
matrix_forward_GC = {}
matrix_top = {}
matrix_top_GC = {}
matrix_plus = {}
matrix_plus_GC = {}
matrix_AB = {}
matrix_AB_GC = {}

for gtc_file in samples:
samplename = os.path.basename(gtc_file)[:-4]
sys.stderr.write("Processing " + gtc_file + "\n")
gtc_file = os.path.join(args.gtc_directory, gtc_file)
gtc = GenotypeCalls.GenotypeCalls(gtc_file)
genotypes = [code2genotype[genotype] for genotype in gtc.get_genotypes()]
assert len(genotypes) == len(manifest.names)
if args.plus or args.plus_GC:
plus_strand_genotypes = gtc.get_base_calls_plus_strand(manifest.snps, manifest.ref_strands)
if args.forward or args.forward_GC:
forward_strand_genotypes = gtc.get_base_calls_forward_strand(manifest.snps, manifest.source_strands)
if args.top or args.top_GC:
top_strand_genotypes = gtc.get_base_calls_TOP_strand(manifest.snps, manifest.ilmn_strands)
if args.forward_GC or args.top_GC or args.plus_GC or args.AB_GC:
genotype_scores = around(gtc.get_genotype_scores(), decimals = 4)
#build dictionary for pandas
if args.forward_GC:
matrix_forward_GC = build_dict(matrix = matrix_forward_GC, samplename = samplename, \
names = manifest.names, genotypes = forward_strand_genotypes, genotype_scores = genotype_scores)
elif args.forward:
matrix_forward = build_dict(matrix = matrix_forward, samplename = samplename, \
names = manifest.names, genotypes = forward_strand_genotypes)
elif args.top:
matrix_top = build_dict(matrix = matrix_top, samplename = samplename, \
names = manifest.names, genotypes = top_strand_genotypes)
elif args.top_GC:
matrix_top_GC = build_dict(matrix = matrix_top_GC, samplename = samplename, \
names = manifest.names, genotypes = top_strand_genotypes, genotype_scores = genotype_scores)
elif args.plus_GC:
matrix_plus_GC = build_dict(matrix = matrix_plus_GC, samplename = samplename, \
names = manifest.names, genotypes = plus_strand_genotypes, genotype_scores = genotype_scores)
elif args.plus:
matrix_plus = build_dict(matrix = matrix_plus, samplename = samplename, \
names = manifest.names, genotypes = plus_strand_genotypes)
elif args.AB:
matrix_AB = build_dict(matrix = matrix_AB, samplename = samplename, \
names = manifest.names, genotypes = genotypes)
elif args.AB_GC:
matrix_AB_GC = build_dict(matrix = matrix_AB_GC, samplename = samplename, \
names = manifest.names, genotypes = genotypes, genotype_scores = genotype_scores)
#create pandas dataframe from dictionaryand append to file
if args.forward_GC:
df = DataFrame.from_dict(matrix_forward_GC)
elif args.forward:
df = DataFrame.from_dict(matrix_forward)
elif args.top:
df = DataFrame.from_dict(matrix_top)
elif args.top_GC:
df = DataFrame.from_dict(matrix_top_GC)
elif args.plus_GC:
df = DataFrame.from_dict(matrix_plus_GC)
elif args.plus:
df = DataFrame.from_dict(matrix_plus)
elif args.AB:
df = DataFrame.from_dict(matrix_AB)
elif args.AB_GC:
df = DataFrame.from_dict(matrix_AB_GC)
df = df.reindex(manifest.names)
df.to_csv(output_handle, sep = delim)
88 changes: 81 additions & 7 deletions module/BeadPoolManifest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from .BeadArrayUtility import read_int, read_string, read_byte


class BeadPoolManifest(object):
"""
Class for parsing binary (BPM) manifest file.
Expand All @@ -14,6 +15,7 @@ class BeadPoolManifest(object):
list of normalization transforms read from GTC file
ref_strands (list(int)): Reference strand annotation for loci (see RefStrand class)
source_strands (list(int)): Source strand annotations for loci (see SourceStrand class)
ilmn_strands (list(int)): Ilumina strand annotations for loci (see IlmnStrand class)
num_loci (int): Number of loci in manifest
manifest_name (string): Name of manifest
control_config (string): Control description from manifest
Expand All @@ -39,6 +41,7 @@ def __init__(self, filename):
self.normalization_lookups = []
self.ref_strands = []
self.source_strands = []
self.ilmn_strands = []
self.num_loci = 0
self.manifest_name = ""
self.control_config = ""
Expand Down Expand Up @@ -100,6 +103,7 @@ def __parse_file(self, manifest_file):
self.map_infos = [0] * self.num_loci
self.ref_strands = [RefStrand.Unknown] * self.num_loci
self.source_strands = [SourceStrand.Unknown] * self.num_loci
self.ilmn_strands = [IlmnStrand.Unknown] * self.num_loci
for idx in xrange(self.num_loci):
locus_entry = LocusEntry(manifest_handle)
self.assay_types[name_lookup[locus_entry.name]] = locus_entry.assay_type
Expand All @@ -109,6 +113,7 @@ def __parse_file(self, manifest_file):
self.map_infos[name_lookup[locus_entry.name]] = locus_entry.map_info
self.ref_strands[name_lookup[locus_entry.name]] = locus_entry.ref_strand
self.source_strands[name_lookup[locus_entry.name]] = locus_entry.source_strand
self.ilmn_strands[name_lookup[locus_entry.name]] = locus_entry.ilmn_strand

if len(self.normalization_ids) != len(self.assay_types):
raise Exception(
Expand Down Expand Up @@ -179,6 +184,71 @@ def from_string(source_strand):
raise ValueError(
"Unexpected value for source strand " + source_strand)


class IlmnStrand(object):
Unknown = 0
TOP = 1
BOT = 2
PLUS = 1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These should have different values, otherwise to_string behavior below will be undesirable. If necessary, could generalize get_base_calls generic to take a list of report strands (instead of a single value)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed attribute assignments, can now be handled by get_base_calls_generic, see below.

MINUS = 2

@staticmethod
def to_string(ilmn_strand):
"""Get an integer representation of Illumina strand annotation

Args:
ilmn_strand (str) : string representation of Illumina strand annotation (e.g., "T")

Returns:
int : int representation of Illumina strand annotation (e.g. IlmnStrand.TOP)

Raises:
ValueError: Unexpected value for Illumina strand
"""
if ilmn_strand == IlmnStrand.Unknown:
return "U"
elif ilmn_strand == IlmnStrand.TOP:
return "T"
elif ilmn_strand == IlmnStrand.BOT:
return "B"
elif ilmn_strand == IlmnStrand.PLUS:
return "P"
elif ilmn_strand == IlmnStrand.MINUS:
return "M"

else:
raise ValueError(
"Unexpected value for ilmn strand " + ilmn_strand)

@staticmethod
def from_string(ilmn_strand):
"""
Get a string representation of Illumina strand annotation

Args:
ilmn_strand (int) : int representation of Illumina strand (e.g., IlmnStrand.TOP)

Returns:
str : string representation of Illumina strand annotation

Raises:
ValueError: Unexpected value for Illumina strand
"""
if ilmn_strand == "U" or ilmn_strand == "":
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't empty string also raise a ValueError?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, empty string is now an exception.

Question though:
for class RefStrand it states:
if ref_strand == "U" or ref_strand == "":
return RefStrand.Unknown

Is that the desired behavior for empty string?

return IlmnStrand.Unknown
if ilmn_strand == "T":
return IlmnStrand.TOP
elif ilmn_strand == "B":
return IlmnStrand.BOT
if ilmn_strand == "P":
return IlmnStrand.PLUS
elif ilmn_strand == "M":
return IlmnStrand.MINUS
else:
raise ValueError(
"Unexpected value for ilmn strand " + ilmn_strand)


class RefStrand(object):
Unknown = 0
Plus = 1
Expand All @@ -188,13 +258,13 @@ class RefStrand(object):
def to_string(ref_strand):
"""
Get a string reprensetation of ref strand annotation

Args:
ref_strand (int) : int representation of ref strand (e.g., RefStrand.Plus)

Returns:
str : string representation of reference strand annotation

Raises:
ValueError: Unexpected value for reference strand
"""
Expand All @@ -211,13 +281,13 @@ def to_string(ref_strand):
@staticmethod
def from_string(ref_strand):
"""Get an integer representation of ref strand annotation

Args:
ref_strand (str) : string representation of reference strand annotation (e.g., "+")

Returns:
int : int representation of reference strand annotation (e.g. RefStrand.Plus)

Raises:
ValueError: Unexpected value for reference strand
"""
Expand All @@ -235,7 +305,7 @@ class LocusEntry(object):
"""
Helper class representing a locus entry within a bead pool manifest. Current only support version
6,7, and 8.

Attributes:
ilmn_id (string) : IlmnID (probe identifier) of locus
name (string): Name (variant identifier) of locus
Expand All @@ -247,6 +317,7 @@ class LocusEntry(object):
address_b (int) : AddressB ID of locus (0 if none)
ref_strand (int) : See RefStrand class
source_strand (int) : See SourceStrand class
ilmn_strand (int) : See IlmnStrand class
"""

def __init__(self, handle):
Expand All @@ -269,6 +340,7 @@ def __init__(self, handle):
self.address_b = -1
self.ref_strand = RefStrand.Unknown
self.source_strand = SourceStrand.Unknown
self.ilmn_strand = IlmnStrand.Unknown
self.__parse_file(handle)

def __parse_file(self, handle):
Expand Down Expand Up @@ -308,6 +380,8 @@ def __parse_locus_version_6(self, handle):
self.ilmn_id = read_string(handle)
self.source_strand = SourceStrand.from_string(
self.ilmn_id.split("_")[-2])
self.ilmn_strand = IlmnStrand.from_string(
self.ilmn_id.split("_")[-3])
self.name = read_string(handle)
for idx in xrange(3):
read_string(handle)
Expand Down
Loading