-
Notifications
You must be signed in to change notification settings - Fork 34
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Added support for Top Alleles, added more report options #18
base: develop
Are you sure you want to change the base?
Changes from 1 commit
42b585f
515ac5c
844a5c9
37251d8
af03dda
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,146 @@ | ||
import sys | ||
import argparse | ||
import os | ||
from datetime import datetime | ||
from IlluminaBeadArrayFiles import BeadPoolManifest, GenotypeCalls, code2genotype | ||
from pandas import DataFrame | ||
from numpy import around | ||
|
||
""" | ||
This example will allow the user to create matrix report files identical to the ones exported from GenomeStudio | ||
by chosing the genotype with or without the genotype scores. | ||
""" | ||
|
||
def build_dict(matrix = None, samplename = None, names = None, genotypes = None, genotype_scores = None): | ||
if genotype_scores is not None: | ||
for (name, genotype, genotype_score) in zip(names, genotypes, genotype_scores): | ||
if samplename in matrix: | ||
matrix[samplename][name] = genotype + "|" + str(genotype_score) | ||
else: | ||
matrix[samplename] = {} | ||
matrix[samplename][name] = genotype + "|" + str(genotype_score) | ||
else: | ||
for (name, genotype) in zip(names, genotypes): | ||
if samplename in matrix: | ||
matrix[samplename][name] = genotype | ||
else: | ||
matrix[samplename] = {} | ||
matrix[samplename][name] = genotype | ||
return matrix | ||
|
||
delim = "\t" | ||
NUM_ARGUMENTS = 6 | ||
|
||
parser = argparse.ArgumentParser("Generate a final report from a directory of GTC files") | ||
parser.add_argument("manifest", help="BPM manifest file") | ||
parser.add_argument("gtc_directory", help="Directory containing GTC files") | ||
parser.add_argument("output_file", help="Location to write report") | ||
parser.add_argument("--forward", help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --forward 1, print matrix with forward alleles") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should there be an enumerated format option here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't see any harm in explicitly spelling out the arguments in the parser as long as we are not expecting many more cases. But if you like I can use an enumerator for the options. I would think the trade-off in better readability not that big though. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a reason you have to pass a "1" instead of using something like action='store_true' which would make something like args.forward eval to True? Also, why does the "help" have the entire command written, wouldn't that be redundant? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not really, you can pass action="store_true", default=False to the argument and then you are fine. Ok, I can make this change. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also, concerning should we write the entire command. Normally I wouldn't bother but I got so many requests how to run commands for folks not familiar with the console, so doesn't hurt to be too obvious. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, added the changes with the latest commit |
||
parser.add_argument("--forward_GC", help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --forward_GC 1, print matrix with forward alleles including genotype scores") | ||
parser.add_argument("--top", help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --top 1, print matrix with top alleles") | ||
parser.add_argument("--top_GC", help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --top_GC 1, print matrix with top alleles including genotype scores") | ||
parser.add_argument("--AB", help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --forward 1, print matrix with forward alleles") | ||
parser.add_argument("--AB_GC", help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --forward_GC 1, print matrix with forward alleles including genotype scores") | ||
parser.add_argument("--plus", help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --top 1, print matrix with top alleles") | ||
parser.add_argument("--plus_GC", help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --top_GC 1, print matrix with top alleles including genotype scores") | ||
|
||
args = parser.parse_args() | ||
|
||
if len(sys.argv) != NUM_ARGUMENTS: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The argument parser should be able to handle this error? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In this case yes as only one matrix report is supported per run, also didn't make sense to me to add a default case. |
||
sys.stderr.write("For matrix report user needs to provide either forward or top strand parameter with or without genotype score, can only build one report!\n") | ||
sys.exit(-1) | ||
|
||
if os.path.isfile(args.output_file): | ||
sys.stderr.write("Output file already exists, please delete and re-run\n") | ||
sys.exit(-1) | ||
|
||
try: | ||
manifest = BeadPoolManifest.BeadPoolManifest(args.manifest) | ||
except: | ||
sys.stderr.write("Failed to read data from manifest\n") | ||
sys.exit(-1) | ||
|
||
with open(args.output_file, "w") as output_handle: | ||
output_handle.write("[Header]\n") | ||
output_handle.write(delim.join(["Processing Date", datetime.now().strftime("%m/%d/%Y %I:%M %p")])+ "\n") | ||
output_handle.write(delim.join(["Content", os.path.basename(args.manifest)]) + "\n") | ||
output_handle.write(delim.join(["Num SNPs", str(len(manifest.names))]) + "\n") | ||
output_handle.write(delim.join(["Total SNPs", str(len(manifest.names))]) + "\n") | ||
|
||
samples = [] | ||
for gtc_file in os.listdir(args.gtc_directory): | ||
if gtc_file.lower().endswith(".gtc"): | ||
samples.append(gtc_file) | ||
|
||
output_handle.write(delim.join(["Num Samples", str(len(samples))]) + "\n") | ||
output_handle.write(delim.join(["Total Samples", str(len(samples))]) + "\n") | ||
output_handle.write("[Data]\n") | ||
|
||
matrix_forward = {} | ||
matrix_forward_GC = {} | ||
matrix_top = {} | ||
matrix_top_GC = {} | ||
matrix_plus = {} | ||
matrix_plus_GC = {} | ||
matrix_AB = {} | ||
matrix_AB_GC = {} | ||
|
||
for gtc_file in samples: | ||
samplename = os.path.basename(gtc_file)[:-4] | ||
sys.stderr.write("Processing " + gtc_file + "\n") | ||
gtc_file = os.path.join(args.gtc_directory, gtc_file) | ||
gtc = GenotypeCalls.GenotypeCalls(gtc_file) | ||
genotypes = [code2genotype[genotype] for genotype in gtc.get_genotypes()] | ||
assert len(genotypes) == len(manifest.names) | ||
if args.plus or args.plus_GC: | ||
plus_strand_genotypes = gtc.get_base_calls_plus_strand(manifest.snps, manifest.ref_strands) | ||
if args.forward or args.forward_GC: | ||
forward_strand_genotypes = gtc.get_base_calls_forward_strand(manifest.snps, manifest.source_strands) | ||
if args.top or args.top_GC: | ||
top_strand_genotypes = gtc.get_base_calls_TOP_strand(manifest.snps, manifest.ilmn_strands) | ||
if args.forward_GC or args.top_GC or args.plus_GC or args.AB_GC: | ||
genotype_scores = around(gtc.get_genotype_scores(), decimals = 4) | ||
#build dictionary for pandas | ||
if args.forward_GC: | ||
matrix_forward_GC = build_dict(matrix = matrix_forward_GC, samplename = samplename, \ | ||
names = manifest.names, genotypes = forward_strand_genotypes, genotype_scores = genotype_scores) | ||
elif args.forward: | ||
matrix_forward = build_dict(matrix = matrix_forward, samplename = samplename, \ | ||
names = manifest.names, genotypes = forward_strand_genotypes) | ||
elif args.top: | ||
matrix_top = build_dict(matrix = matrix_top, samplename = samplename, \ | ||
names = manifest.names, genotypes = top_strand_genotypes) | ||
elif args.top_GC: | ||
matrix_top_GC = build_dict(matrix = matrix_top_GC, samplename = samplename, \ | ||
names = manifest.names, genotypes = top_strand_genotypes, genotype_scores = genotype_scores) | ||
elif args.plus_GC: | ||
matrix_plus_GC = build_dict(matrix = matrix_plus_GC, samplename = samplename, \ | ||
names = manifest.names, genotypes = plus_strand_genotypes, genotype_scores = genotype_scores) | ||
elif args.plus: | ||
matrix_plus = build_dict(matrix = matrix_plus, samplename = samplename, \ | ||
names = manifest.names, genotypes = plus_strand_genotypes) | ||
elif args.AB: | ||
matrix_AB = build_dict(matrix = matrix_AB, samplename = samplename, \ | ||
names = manifest.names, genotypes = genotypes) | ||
elif args.AB_GC: | ||
matrix_AB_GC = build_dict(matrix = matrix_AB_GC, samplename = samplename, \ | ||
names = manifest.names, genotypes = genotypes, genotype_scores = genotype_scores) | ||
#create pandas dataframe from dictionaryand append to file | ||
if args.forward_GC: | ||
df = DataFrame.from_dict(matrix_forward_GC) | ||
elif args.forward: | ||
df = DataFrame.from_dict(matrix_forward) | ||
elif args.top: | ||
df = DataFrame.from_dict(matrix_top) | ||
elif args.top_GC: | ||
df = DataFrame.from_dict(matrix_top_GC) | ||
elif args.plus_GC: | ||
df = DataFrame.from_dict(matrix_plus_GC) | ||
elif args.plus: | ||
df = DataFrame.from_dict(matrix_plus) | ||
elif args.AB: | ||
df = DataFrame.from_dict(matrix_AB) | ||
elif args.AB_GC: | ||
df = DataFrame.from_dict(matrix_AB_GC) | ||
df = df.reindex(manifest.names) | ||
df.to_csv(output_handle, sep = delim) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
from .BeadArrayUtility import read_int, read_string, read_byte | ||
|
||
|
||
class BeadPoolManifest(object): | ||
""" | ||
Class for parsing binary (BPM) manifest file. | ||
|
@@ -14,6 +15,7 @@ class BeadPoolManifest(object): | |
list of normalization transforms read from GTC file | ||
ref_strands (list(int)): Reference strand annotation for loci (see RefStrand class) | ||
source_strands (list(int)): Source strand annotations for loci (see SourceStrand class) | ||
ilmn_strands (list(int)): Ilumina strand annotations for loci (see IlmnStrand class) | ||
num_loci (int): Number of loci in manifest | ||
manifest_name (string): Name of manifest | ||
control_config (string): Control description from manifest | ||
|
@@ -39,6 +41,7 @@ def __init__(self, filename): | |
self.normalization_lookups = [] | ||
self.ref_strands = [] | ||
self.source_strands = [] | ||
self.ilmn_strands = [] | ||
self.num_loci = 0 | ||
self.manifest_name = "" | ||
self.control_config = "" | ||
|
@@ -100,6 +103,7 @@ def __parse_file(self, manifest_file): | |
self.map_infos = [0] * self.num_loci | ||
self.ref_strands = [RefStrand.Unknown] * self.num_loci | ||
self.source_strands = [SourceStrand.Unknown] * self.num_loci | ||
self.ilmn_strands = [IlmnStrand.Unknown] * self.num_loci | ||
for idx in xrange(self.num_loci): | ||
locus_entry = LocusEntry(manifest_handle) | ||
self.assay_types[name_lookup[locus_entry.name]] = locus_entry.assay_type | ||
|
@@ -109,6 +113,7 @@ def __parse_file(self, manifest_file): | |
self.map_infos[name_lookup[locus_entry.name]] = locus_entry.map_info | ||
self.ref_strands[name_lookup[locus_entry.name]] = locus_entry.ref_strand | ||
self.source_strands[name_lookup[locus_entry.name]] = locus_entry.source_strand | ||
self.ilmn_strands[name_lookup[locus_entry.name]] = locus_entry.ilmn_strand | ||
|
||
if len(self.normalization_ids) != len(self.assay_types): | ||
raise Exception( | ||
|
@@ -179,6 +184,71 @@ def from_string(source_strand): | |
raise ValueError( | ||
"Unexpected value for source strand " + source_strand) | ||
|
||
|
||
class IlmnStrand(object): | ||
Unknown = 0 | ||
TOP = 1 | ||
BOT = 2 | ||
PLUS = 1 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These should have different values, otherwise to_string behavior below will be undesirable. If necessary, could generalize get_base_calls generic to take a list of report strands (instead of a single value) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Changed attribute assignments, can now be handled by get_base_calls_generic, see below. |
||
MINUS = 2 | ||
|
||
@staticmethod | ||
def to_string(ilmn_strand): | ||
"""Get an integer representation of Illumina strand annotation | ||
|
||
Args: | ||
ilmn_strand (str) : string representation of Illumina strand annotation (e.g., "T") | ||
|
||
Returns: | ||
int : int representation of Illumina strand annotation (e.g. IlmnStrand.TOP) | ||
|
||
Raises: | ||
ValueError: Unexpected value for Illumina strand | ||
""" | ||
if ilmn_strand == IlmnStrand.Unknown: | ||
return "U" | ||
elif ilmn_strand == IlmnStrand.TOP: | ||
return "T" | ||
elif ilmn_strand == IlmnStrand.BOT: | ||
return "B" | ||
elif ilmn_strand == IlmnStrand.PLUS: | ||
return "P" | ||
elif ilmn_strand == IlmnStrand.MINUS: | ||
return "M" | ||
|
||
else: | ||
raise ValueError( | ||
"Unexpected value for ilmn strand " + ilmn_strand) | ||
|
||
@staticmethod | ||
def from_string(ilmn_strand): | ||
""" | ||
Get a string representation of Illumina strand annotation | ||
|
||
Args: | ||
ilmn_strand (int) : int representation of Illumina strand (e.g., IlmnStrand.TOP) | ||
|
||
Returns: | ||
str : string representation of Illumina strand annotation | ||
|
||
Raises: | ||
ValueError: Unexpected value for Illumina strand | ||
""" | ||
if ilmn_strand == "U" or ilmn_strand == "": | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't empty string also raise a ValueError? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right, empty string is now an exception. Question though: Is that the desired behavior for empty string? |
||
return IlmnStrand.Unknown | ||
if ilmn_strand == "T": | ||
return IlmnStrand.TOP | ||
elif ilmn_strand == "B": | ||
return IlmnStrand.BOT | ||
if ilmn_strand == "P": | ||
return IlmnStrand.PLUS | ||
elif ilmn_strand == "M": | ||
return IlmnStrand.MINUS | ||
else: | ||
raise ValueError( | ||
"Unexpected value for ilmn strand " + ilmn_strand) | ||
|
||
|
||
class RefStrand(object): | ||
Unknown = 0 | ||
Plus = 1 | ||
|
@@ -188,13 +258,13 @@ class RefStrand(object): | |
def to_string(ref_strand): | ||
""" | ||
Get a string reprensetation of ref strand annotation | ||
|
||
Args: | ||
ref_strand (int) : int representation of ref strand (e.g., RefStrand.Plus) | ||
|
||
Returns: | ||
str : string representation of reference strand annotation | ||
|
||
Raises: | ||
ValueError: Unexpected value for reference strand | ||
""" | ||
|
@@ -211,13 +281,13 @@ def to_string(ref_strand): | |
@staticmethod | ||
def from_string(ref_strand): | ||
"""Get an integer representation of ref strand annotation | ||
|
||
Args: | ||
ref_strand (str) : string representation of reference strand annotation (e.g., "+") | ||
|
||
Returns: | ||
int : int representation of reference strand annotation (e.g. RefStrand.Plus) | ||
|
||
Raises: | ||
ValueError: Unexpected value for reference strand | ||
""" | ||
|
@@ -235,7 +305,7 @@ class LocusEntry(object): | |
""" | ||
Helper class representing a locus entry within a bead pool manifest. Current only support version | ||
6,7, and 8. | ||
|
||
Attributes: | ||
ilmn_id (string) : IlmnID (probe identifier) of locus | ||
name (string): Name (variant identifier) of locus | ||
|
@@ -247,6 +317,7 @@ class LocusEntry(object): | |
address_b (int) : AddressB ID of locus (0 if none) | ||
ref_strand (int) : See RefStrand class | ||
source_strand (int) : See SourceStrand class | ||
ilmn_strand (int) : See IlmnStrand class | ||
""" | ||
|
||
def __init__(self, handle): | ||
|
@@ -269,6 +340,7 @@ def __init__(self, handle): | |
self.address_b = -1 | ||
self.ref_strand = RefStrand.Unknown | ||
self.source_strand = SourceStrand.Unknown | ||
self.ilmn_strand = IlmnStrand.Unknown | ||
self.__parse_file(handle) | ||
|
||
def __parse_file(self, handle): | ||
|
@@ -308,6 +380,8 @@ def __parse_locus_version_6(self, handle): | |
self.ilmn_id = read_string(handle) | ||
self.source_strand = SourceStrand.from_string( | ||
self.ilmn_id.split("_")[-2]) | ||
self.ilmn_strand = IlmnStrand.from_string( | ||
self.ilmn_id.split("_")[-3]) | ||
self.name = read_string(handle) | ||
for idx in xrange(3): | ||
read_string(handle) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does the reference to this import need to change?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Reference should be fine, checked with running gtf_final_report_matrix.py. There is some ambiguity as the module name and class name are the same.