-
Notifications
You must be signed in to change notification settings - Fork 34
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Added support for Top Alleles, added more report options #18
base: develop
Are you sure you want to change the base?
Changes from all commits
42b585f
515ac5c
844a5c9
37251d8
af03dda
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import sys | ||
import argparse | ||
import os | ||
from datetime import datetime | ||
#import GenotypeCalls | ||
from IlluminaBeadArrayFiles import GenotypeCalls | ||
|
||
delim = "\t" | ||
|
||
parser = argparse.ArgumentParser("Generate a custom report from a directory of GTC files") | ||
parser.add_argument("gtc_directory", help="Directory containing GTC files") | ||
parser.add_argument("output_file", help="Location to write report") | ||
|
||
args = parser.parse_args() | ||
|
||
if os.path.isfile(args.output_file): | ||
sys.stderr.write("Output file already exists, please delete and re-run\n") | ||
sys.exit(-1) | ||
|
||
with open(args.output_file, "w") as output_handle: | ||
output_handle.write("[Header]\n") | ||
output_handle.write(delim.join(["Processing Date", datetime.now().strftime("%m/%d/%Y %I:%M %p")])+ "\n") | ||
samples = [] | ||
for gtc_file in os.listdir(args.gtc_directory): | ||
if gtc_file.lower().endswith(".gtc"): | ||
samples.append(gtc_file) | ||
index = 1 | ||
output_handle.write(delim.join(["Num Samples", str(len(samples))]) + "\n") | ||
output_handle.write(delim.join(["Total Samples", str(len(samples))]) + "\n") | ||
output_handle.write("[Data]\n") | ||
output_handle.write(delim.join(["Index","Sample ID", "Call Rate", "p05 Grn", "p50 Grn", "p90 Grn", "p05 Red", "p50 Red", \ | ||
"p90 Red","p10 GC", "p50 GC"]) + "\n") | ||
for gtc_file in samples: | ||
sys.stderr.write("Processing " + gtc_file + "\n") | ||
gtc_file = os.path.join(args.gtc_directory, gtc_file) | ||
gtc = GenotypeCalls.GenotypeCalls(gtc_file) | ||
call_rate = gtc.get_call_rate() | ||
percentiles_grn = gtc.get_percentiles_x() | ||
p05_grn = percentiles_grn[0] | ||
p50_grn = percentiles_grn[1] | ||
p90_grn = percentiles_grn[2] | ||
percentiles_red = gtc.get_percentiles_y() | ||
p05_red = percentiles_red[0] | ||
p50_red = percentiles_red[1] | ||
p90_red = percentiles_red[2] | ||
p10_GC = gtc.get_gc10() | ||
p50_GC = gtc.get_gc50() | ||
output_handle.write(delim.join([str(index), os.path.basename(gtc_file)[:-4], str(call_rate), str(p05_grn), \ | ||
str(p50_grn), str(p90_grn), str(p05_red), str(p50_red), str(p90_red), str(p10_GC), str(p50_GC)]) + "\n") | ||
index +=1 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
import sys | ||
import argparse | ||
import os | ||
from datetime import datetime | ||
from IlluminaBeadArrayFiles import BeadPoolManifest, GenotypeCalls, code2genotype | ||
from pandas import DataFrame | ||
from numpy import around | ||
|
||
""" | ||
This example will allow the user to create matrix report files identical to the ones exported from GenomeStudio | ||
by chosing the genotype with or without the genotype scores. | ||
""" | ||
|
||
def build_dict(matrix = None, samplename = None, names = None, genotypes = None, genotype_scores = None): | ||
if genotype_scores is not None: | ||
for (name, genotype, genotype_score) in zip(names, genotypes, genotype_scores): | ||
if samplename in matrix: | ||
matrix[samplename][name] = genotype + "|" + str(genotype_score) | ||
else: | ||
matrix[samplename] = {} | ||
matrix[samplename][name] = genotype + "|" + str(genotype_score) | ||
else: | ||
for (name, genotype) in zip(names, genotypes): | ||
if samplename in matrix: | ||
matrix[samplename][name] = genotype | ||
else: | ||
matrix[samplename] = {} | ||
matrix[samplename][name] = genotype | ||
return matrix | ||
|
||
delim = "\t" | ||
NUM_ARGUMENTS = 5 | ||
|
||
parser = argparse.ArgumentParser("Generate a final matrix report by specifying the report strand from a directory of GTC files") | ||
parser.add_argument("manifest", help="BPM manifest file") | ||
parser.add_argument("gtc_directory", help="Directory containing GTC files") | ||
parser.add_argument("output_file", help="Location to write report") | ||
parser.add_argument("--forward", action="store_true", default=False, help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --forward - print matrix with forward alleles") | ||
parser.add_argument("--forward_GC", action="store_true", default=False, help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --forward_GC - print matrix with forward alleles including genotype scores") | ||
parser.add_argument("--top", action="store_true", default=False, help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --top - print matrix with top alleles") | ||
parser.add_argument("--top_GC", action="store_true", default=False, help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --top_GC - print matrix with top alleles including genotype scores") | ||
parser.add_argument("--AB", action="store_true", default=False, help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --forward - print matrix with forward alleles") | ||
parser.add_argument("--AB_GC", action="store_true", default=False, help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --forward_GC - print matrix with forward alleles including genotype scores") | ||
parser.add_argument("--plus", action="store_true", default=False, help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --top - print matrix with top alleles") | ||
parser.add_argument("--plus_GC", action="store_true", default=False, help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --top_GC - print matrix with top alleles including genotype scores") | ||
|
||
|
||
args = parser.parse_args() | ||
|
||
if len(sys.argv) != NUM_ARGUMENTS: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The argument parser should be able to handle this error? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In this case yes as only one matrix report is supported per run, also didn't make sense to me to add a default case. |
||
sys.stderr.write("For matrix report user needs to provide either forward or top strand parameter with or without genotype score, can only build one report!\n") | ||
sys.exit(-1) | ||
|
||
if os.path.isfile(args.output_file): | ||
sys.stderr.write("Output file already exists, please delete and re-run\n") | ||
sys.exit(-1) | ||
|
||
try: | ||
manifest = BeadPoolManifest.BeadPoolManifest(args.manifest) | ||
except: | ||
sys.stderr.write("Failed to read data from manifest\n") | ||
sys.exit(-1) | ||
|
||
with open(args.output_file, "w") as output_handle: | ||
output_handle.write("[Header]\n") | ||
output_handle.write(delim.join(["Processing Date", datetime.now().strftime("%m/%d/%Y %I:%M %p")])+ "\n") | ||
output_handle.write(delim.join(["Content", os.path.basename(args.manifest)]) + "\n") | ||
output_handle.write(delim.join(["Num SNPs", str(len(manifest.names))]) + "\n") | ||
output_handle.write(delim.join(["Total SNPs", str(len(manifest.names))]) + "\n") | ||
|
||
samples = [] | ||
for gtc_file in os.listdir(args.gtc_directory): | ||
if gtc_file.lower().endswith(".gtc"): | ||
samples.append(gtc_file) | ||
|
||
output_handle.write(delim.join(["Num Samples", str(len(samples))]) + "\n") | ||
output_handle.write(delim.join(["Total Samples", str(len(samples))]) + "\n") | ||
output_handle.write("[Data]\n") | ||
|
||
matrix_forward = {} | ||
matrix_forward_GC = {} | ||
matrix_top = {} | ||
matrix_top_GC = {} | ||
matrix_plus = {} | ||
matrix_plus_GC = {} | ||
matrix_AB = {} | ||
matrix_AB_GC = {} | ||
|
||
for gtc_file in samples: | ||
samplename = os.path.basename(gtc_file)[:-4] | ||
sys.stderr.write("Processing " + gtc_file + "\n") | ||
gtc_file = os.path.join(args.gtc_directory, gtc_file) | ||
gtc = GenotypeCalls.GenotypeCalls(gtc_file) | ||
genotypes = [code2genotype[genotype] for genotype in gtc.get_genotypes()] | ||
assert len(genotypes) == len(manifest.names) | ||
if args.plus or args.plus_GC: | ||
plus_strand_genotypes = gtc.get_base_calls_plus_strand(manifest.snps, manifest.ref_strands) | ||
if args.forward or args.forward_GC: | ||
forward_strand_genotypes = gtc.get_base_calls_forward_strand(manifest.snps, manifest.source_strands) | ||
if args.top or args.top_GC: | ||
top_strand_genotypes = gtc.get_base_calls_TOP_strand(manifest.snps, manifest.ilmn_strands) | ||
if args.forward_GC or args.top_GC or args.plus_GC or args.AB_GC: | ||
genotype_scores = around(gtc.get_genotype_scores(), decimals = 4) | ||
#build dictionary for pandas | ||
if args.forward_GC: | ||
matrix_forward_GC = build_dict(matrix = matrix_forward_GC, samplename = samplename, \ | ||
names = manifest.names, genotypes = forward_strand_genotypes, genotype_scores = genotype_scores) | ||
elif args.forward: | ||
matrix_forward = build_dict(matrix = matrix_forward, samplename = samplename, \ | ||
names = manifest.names, genotypes = forward_strand_genotypes) | ||
elif args.top: | ||
matrix_top = build_dict(matrix = matrix_top, samplename = samplename, \ | ||
names = manifest.names, genotypes = top_strand_genotypes) | ||
elif args.top_GC: | ||
matrix_top_GC = build_dict(matrix = matrix_top_GC, samplename = samplename, \ | ||
names = manifest.names, genotypes = top_strand_genotypes, genotype_scores = genotype_scores) | ||
elif args.plus_GC: | ||
matrix_plus_GC = build_dict(matrix = matrix_plus_GC, samplename = samplename, \ | ||
names = manifest.names, genotypes = plus_strand_genotypes, genotype_scores = genotype_scores) | ||
elif args.plus: | ||
matrix_plus = build_dict(matrix = matrix_plus, samplename = samplename, \ | ||
names = manifest.names, genotypes = plus_strand_genotypes) | ||
elif args.AB: | ||
matrix_AB = build_dict(matrix = matrix_AB, samplename = samplename, \ | ||
names = manifest.names, genotypes = genotypes) | ||
elif args.AB_GC: | ||
matrix_AB_GC = build_dict(matrix = matrix_AB_GC, samplename = samplename, \ | ||
names = manifest.names, genotypes = genotypes, genotype_scores = genotype_scores) | ||
#create pandas dataframe from dictionaryand append to file | ||
if args.forward_GC: | ||
df = DataFrame.from_dict(matrix_forward_GC) | ||
elif args.forward: | ||
df = DataFrame.from_dict(matrix_forward) | ||
elif args.top: | ||
df = DataFrame.from_dict(matrix_top) | ||
elif args.top_GC: | ||
df = DataFrame.from_dict(matrix_top_GC) | ||
elif args.plus_GC: | ||
df = DataFrame.from_dict(matrix_plus_GC) | ||
elif args.plus: | ||
df = DataFrame.from_dict(matrix_plus) | ||
elif args.AB: | ||
df = DataFrame.from_dict(matrix_AB) | ||
elif args.AB_GC: | ||
df = DataFrame.from_dict(matrix_AB_GC) | ||
df = df.reindex(manifest.names) | ||
df.to_csv(output_handle, sep = delim) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does the reference to this import need to change?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Reference should be fine, checked with running gtf_final_report_matrix.py. There is some ambiguity as the module name and class name are the same.