Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added support for Top Alleles, added more report options #18

Open
wants to merge 5 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ The library depends on the availability of the numpy package in the python insta
> for (locus, genotype) in zip( names, genotypes ):
>   sys.stdout.write( locus + "," + code2genotype[genotype] + "\n" )

Also, see examples/gtc_final_report.py for an additional example of usage.
Also, see examples/gtc_final_report.py and examples/gtc_final_report_matrix.py for additional examples of usage.

## GTC File Format
The specification for the GTC file format is provided in docs/GTC_File_Format_v5.pdf
Expand Down
50 changes: 50 additions & 0 deletions examples/gtc_custom_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import sys
import argparse
import os
from datetime import datetime
#import GenotypeCalls
from IlluminaBeadArrayFiles import GenotypeCalls

delim = "\t"

parser = argparse.ArgumentParser("Generate a custom report from a directory of GTC files")
parser.add_argument("gtc_directory", help="Directory containing GTC files")
parser.add_argument("output_file", help="Location to write report")

args = parser.parse_args()

if os.path.isfile(args.output_file):
sys.stderr.write("Output file already exists, please delete and re-run\n")
sys.exit(-1)

with open(args.output_file, "w") as output_handle:
output_handle.write("[Header]\n")
output_handle.write(delim.join(["Processing Date", datetime.now().strftime("%m/%d/%Y %I:%M %p")])+ "\n")
samples = []
for gtc_file in os.listdir(args.gtc_directory):
if gtc_file.lower().endswith(".gtc"):
samples.append(gtc_file)
index = 1
output_handle.write(delim.join(["Num Samples", str(len(samples))]) + "\n")
output_handle.write(delim.join(["Total Samples", str(len(samples))]) + "\n")
output_handle.write("[Data]\n")
output_handle.write(delim.join(["Index","Sample ID", "Call Rate", "p05 Grn", "p50 Grn", "p90 Grn", "p05 Red", "p50 Red", \
"p90 Red","p10 GC", "p50 GC"]) + "\n")
for gtc_file in samples:
sys.stderr.write("Processing " + gtc_file + "\n")
gtc_file = os.path.join(args.gtc_directory, gtc_file)
gtc = GenotypeCalls.GenotypeCalls(gtc_file)
call_rate = gtc.get_call_rate()
percentiles_grn = gtc.get_percentiles_x()
p05_grn = percentiles_grn[0]
p50_grn = percentiles_grn[1]
p90_grn = percentiles_grn[2]
percentiles_red = gtc.get_percentiles_y()
p05_red = percentiles_red[0]
p50_red = percentiles_red[1]
p90_red = percentiles_red[2]
p10_GC = gtc.get_gc10()
p50_GC = gtc.get_gc50()
output_handle.write(delim.join([str(index), os.path.basename(gtc_file)[:-4], str(call_rate), str(p05_grn), \
str(p50_grn), str(p90_grn), str(p05_red), str(p50_red), str(p90_red), str(p10_GC), str(p50_GC)]) + "\n")
index +=1
14 changes: 8 additions & 6 deletions examples/gtc_final_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import argparse
import os
from datetime import datetime
from IlluminaBeadArrayFiles import GenotypeCalls, BeadPoolManifest, code2genotype
from IlluminaBeadArrayFiles import BeadPoolManifest, GenotypeCalls, code2genotype

delim = "\t"

Expand All @@ -18,7 +18,7 @@
sys.exit(-1)

try:
manifest = BeadPoolManifest(args.manifest)
manifest = BeadPoolManifest.BeadPoolManifest(args.manifest)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the reference to this import need to change?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reference should be fine, checked with running gtf_final_report_matrix.py. There is some ambiguity as the module name and class name are the same.

except:
sys.stderr.write("Failed to read data from manifest\n")
sys.exit(-1)
Expand All @@ -39,15 +39,17 @@
output_handle.write(delim.join(["Total Samples", str(len(samples))]) + "\n")

output_handle.write("[Data]\n")
output_handle.write(delim.join(["SNP Name", "Sample ID", "Chr", "MapInfo", "Alleles - AB", "Alleles - Plus", "Alleles - Forward"]) + "\n")
output_handle.write(delim.join(["SNP Name", "Sample ID", "Chr", "MapInfo", "Alleles - AB", "Alleles - Plus", "Alleles - Forward", "Alleles - TOP", "Genotype Score"]) + "\n")
for gtc_file in samples:
sys.stderr.write("Processing " + gtc_file + "\n")
gtc_file = os.path.join(args.gtc_directory, gtc_file)
gtc = GenotypeCalls(gtc_file)
gtc = GenotypeCalls.GenotypeCalls(gtc_file)
genotypes = gtc.get_genotypes()
plus_strand_genotypes = gtc.get_base_calls_plus_strand(manifest.snps, manifest.ref_strands)
forward_strand_genotypes = gtc.get_base_calls_forward_strand(manifest.snps, manifest.source_strands)
top_strand_genotypes = gtc.get_base_calls_TOP_strand(manifest.snps, manifest.ilmn_strands)
genotype_scores = gtc.get_genotype_scores()

assert len(genotypes) == len(manifest.names)
for (name, chrom, map_info, genotype, ref_strand_genotype, source_strand_genotype) in zip(manifest.names, manifest.chroms, manifest.map_infos, genotypes, plus_strand_genotypes, forward_strand_genotypes):
output_handle.write(delim.join([name, os.path.basename(gtc_file)[:-4], chrom, str(map_info), code2genotype[genotype], ref_strand_genotype, source_strand_genotype]) + "\n")
for (name, chrom, map_info, genotype, ref_strand_genotype, source_strand_genotype, ilmn_strand_genotype, genotype_score) in zip(manifest.names, manifest.chroms, manifest.map_infos, genotypes, plus_strand_genotypes, forward_strand_genotypes, top_strand_genotypes, genotype_scores):
output_handle.write(delim.join([name, os.path.basename(gtc_file)[:-4], chrom, str(map_info), code2genotype[genotype], ref_strand_genotype, source_strand_genotype, ilmn_strand_genotype, str(genotype_score)]) + "\n")
147 changes: 147 additions & 0 deletions examples/gtc_final_report_matrix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import sys
import argparse
import os
from datetime import datetime
from IlluminaBeadArrayFiles import BeadPoolManifest, GenotypeCalls, code2genotype
from pandas import DataFrame
from numpy import around

"""
This example will allow the user to create matrix report files identical to the ones exported from GenomeStudio
by chosing the genotype with or without the genotype scores.
"""

def build_dict(matrix = None, samplename = None, names = None, genotypes = None, genotype_scores = None):
if genotype_scores is not None:
for (name, genotype, genotype_score) in zip(names, genotypes, genotype_scores):
if samplename in matrix:
matrix[samplename][name] = genotype + "|" + str(genotype_score)
else:
matrix[samplename] = {}
matrix[samplename][name] = genotype + "|" + str(genotype_score)
else:
for (name, genotype) in zip(names, genotypes):
if samplename in matrix:
matrix[samplename][name] = genotype
else:
matrix[samplename] = {}
matrix[samplename][name] = genotype
return matrix

delim = "\t"
NUM_ARGUMENTS = 5

parser = argparse.ArgumentParser("Generate a final matrix report by specifying the report strand from a directory of GTC files")
parser.add_argument("manifest", help="BPM manifest file")
parser.add_argument("gtc_directory", help="Directory containing GTC files")
parser.add_argument("output_file", help="Location to write report")
parser.add_argument("--forward", action="store_true", default=False, help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --forward - print matrix with forward alleles")
parser.add_argument("--forward_GC", action="store_true", default=False, help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --forward_GC - print matrix with forward alleles including genotype scores")
parser.add_argument("--top", action="store_true", default=False, help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --top - print matrix with top alleles")
parser.add_argument("--top_GC", action="store_true", default=False, help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --top_GC - print matrix with top alleles including genotype scores")
parser.add_argument("--AB", action="store_true", default=False, help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --forward - print matrix with forward alleles")
parser.add_argument("--AB_GC", action="store_true", default=False, help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --forward_GC - print matrix with forward alleles including genotype scores")
parser.add_argument("--plus", action="store_true", default=False, help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --top - print matrix with top alleles")
parser.add_argument("--plus_GC", action="store_true", default=False, help="python gtc_final_report_matrix.py <path_to_manifest> <path_to_gtc_directory> <path_to_output_file> --top_GC - print matrix with top alleles including genotype scores")


args = parser.parse_args()

if len(sys.argv) != NUM_ARGUMENTS:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The argument parser should be able to handle this error?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this case yes as only one matrix report is supported per run, also didn't make sense to me to add a default case.

sys.stderr.write("For matrix report user needs to provide either forward or top strand parameter with or without genotype score, can only build one report!\n")
sys.exit(-1)

if os.path.isfile(args.output_file):
sys.stderr.write("Output file already exists, please delete and re-run\n")
sys.exit(-1)

try:
manifest = BeadPoolManifest.BeadPoolManifest(args.manifest)
except:
sys.stderr.write("Failed to read data from manifest\n")
sys.exit(-1)

with open(args.output_file, "w") as output_handle:
output_handle.write("[Header]\n")
output_handle.write(delim.join(["Processing Date", datetime.now().strftime("%m/%d/%Y %I:%M %p")])+ "\n")
output_handle.write(delim.join(["Content", os.path.basename(args.manifest)]) + "\n")
output_handle.write(delim.join(["Num SNPs", str(len(manifest.names))]) + "\n")
output_handle.write(delim.join(["Total SNPs", str(len(manifest.names))]) + "\n")

samples = []
for gtc_file in os.listdir(args.gtc_directory):
if gtc_file.lower().endswith(".gtc"):
samples.append(gtc_file)

output_handle.write(delim.join(["Num Samples", str(len(samples))]) + "\n")
output_handle.write(delim.join(["Total Samples", str(len(samples))]) + "\n")
output_handle.write("[Data]\n")

matrix_forward = {}
matrix_forward_GC = {}
matrix_top = {}
matrix_top_GC = {}
matrix_plus = {}
matrix_plus_GC = {}
matrix_AB = {}
matrix_AB_GC = {}

for gtc_file in samples:
samplename = os.path.basename(gtc_file)[:-4]
sys.stderr.write("Processing " + gtc_file + "\n")
gtc_file = os.path.join(args.gtc_directory, gtc_file)
gtc = GenotypeCalls.GenotypeCalls(gtc_file)
genotypes = [code2genotype[genotype] for genotype in gtc.get_genotypes()]
assert len(genotypes) == len(manifest.names)
if args.plus or args.plus_GC:
plus_strand_genotypes = gtc.get_base_calls_plus_strand(manifest.snps, manifest.ref_strands)
if args.forward or args.forward_GC:
forward_strand_genotypes = gtc.get_base_calls_forward_strand(manifest.snps, manifest.source_strands)
if args.top or args.top_GC:
top_strand_genotypes = gtc.get_base_calls_TOP_strand(manifest.snps, manifest.ilmn_strands)
if args.forward_GC or args.top_GC or args.plus_GC or args.AB_GC:
genotype_scores = around(gtc.get_genotype_scores(), decimals = 4)
#build dictionary for pandas
if args.forward_GC:
matrix_forward_GC = build_dict(matrix = matrix_forward_GC, samplename = samplename, \
names = manifest.names, genotypes = forward_strand_genotypes, genotype_scores = genotype_scores)
elif args.forward:
matrix_forward = build_dict(matrix = matrix_forward, samplename = samplename, \
names = manifest.names, genotypes = forward_strand_genotypes)
elif args.top:
matrix_top = build_dict(matrix = matrix_top, samplename = samplename, \
names = manifest.names, genotypes = top_strand_genotypes)
elif args.top_GC:
matrix_top_GC = build_dict(matrix = matrix_top_GC, samplename = samplename, \
names = manifest.names, genotypes = top_strand_genotypes, genotype_scores = genotype_scores)
elif args.plus_GC:
matrix_plus_GC = build_dict(matrix = matrix_plus_GC, samplename = samplename, \
names = manifest.names, genotypes = plus_strand_genotypes, genotype_scores = genotype_scores)
elif args.plus:
matrix_plus = build_dict(matrix = matrix_plus, samplename = samplename, \
names = manifest.names, genotypes = plus_strand_genotypes)
elif args.AB:
matrix_AB = build_dict(matrix = matrix_AB, samplename = samplename, \
names = manifest.names, genotypes = genotypes)
elif args.AB_GC:
matrix_AB_GC = build_dict(matrix = matrix_AB_GC, samplename = samplename, \
names = manifest.names, genotypes = genotypes, genotype_scores = genotype_scores)
#create pandas dataframe from dictionaryand append to file
if args.forward_GC:
df = DataFrame.from_dict(matrix_forward_GC)
elif args.forward:
df = DataFrame.from_dict(matrix_forward)
elif args.top:
df = DataFrame.from_dict(matrix_top)
elif args.top_GC:
df = DataFrame.from_dict(matrix_top_GC)
elif args.plus_GC:
df = DataFrame.from_dict(matrix_plus_GC)
elif args.plus:
df = DataFrame.from_dict(matrix_plus)
elif args.AB:
df = DataFrame.from_dict(matrix_AB)
elif args.AB_GC:
df = DataFrame.from_dict(matrix_AB_GC)
df = df.reindex(manifest.names)
df.to_csv(output_handle, sep = delim)
Loading