forked from petercombs/dicty
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPlotGCBias.py
128 lines (107 loc) · 3.48 KB
/
PlotGCBias.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import numpy as np
import pandas as pd
from argparse import ArgumentParser
from os import path
from matplotlib.pyplot import (
scatter,
hlines,
xlim,
ylim,
hist,
figure,
legend,
savefig,
subplot,
xlabel,
ylabel,
)
GC_COLS = [
"chr",
"start",
"stop",
"frac_at",
"frac_gc",
"num_A",
"num_C",
"num_G",
"num_T",
"num_N",
"num_oth",
"seq_len",
]
COV_COLS = ["Chrom", "start", "stop", "cov"]
def longest_common_suffix(list_of_strings):
reversed_strings = ["".join(s[::-1]) for s in list_of_strings]
reversed_lcs = path.commonprefix(reversed_strings)
lcs = "".join(reversed_lcs[::-1])
return lcs
def parse_args():
parser = ArgumentParser()
parser.add_argument("--step-size", "-s", default=1.0, type=float)
parser.add_argument("--output-dir", "-o", default=None)
parser.add_argument("gc_file")
parser.add_argument("window_coverage_bed", nargs="+")
args = parser.parse_args()
if args.output_dir is None:
args.output_dir = path.dirname(args.window_coverage_bed[0])
return args
def step_floor(value, stepsize):
return stepsize * np.floor(value / stepsize)
def step_ceil(value, stepsize):
return stepsize * np.ceil(value / stepsize)
if __name__ == "__main__":
args = parse_args()
gc = pd.read_csv(
args.gc_file, index_col=[0, 1, 2], names=GC_COLS, header=0, sep="\t"
)
gc_low = step_floor(gc.frac_gc.min(), args.step_size / 100)
gc_high = step_ceil(gc.frac_gc.max(), args.step_size / 100)
gc_steps = np.arange(gc_low, gc_high, args.step_size / 100)
common_path = path.commonprefix(args.window_coverage_bed)
common_suffix = longest_common_suffix(args.window_coverage_bed)
print("---->", common_suffix)
outdir = args.output_dir
for cov_file in args.window_coverage_bed:
cov = pd.read_csv(
cov_file, header=None, names=COV_COLS, index_col=[0, 1, 2], sep="\t"
)
# total_cov = sum(gc["seq_len"] * cov["cov"])
gc_cov = pd.Series(index=gc_steps, data=np.nan)
total_len = pd.Series(index=gc_steps, data=np.nan)
for bin_lo, bin_hi in zip(gc_cov.index, gc_cov.index[1:]):
try:
q = gc.query("{} <= frac_gc < {}".format(bin_lo, bin_hi))
gc_cov[bin_lo] = sum(cov.loc[q.index, "cov"])
total_len[bin_lo] = sum(q["seq_len"])
except KeyError:
gc_cov[bin_lo] = 0
total_len[bin_lo] = sum(q["seq_len"])
y = gc_cov / total_len
figure(1, figsize=(8, 6))
subplot(2, 1, 1)
plotname = cov_file.replace(common_path, "").replace(common_suffix, "")
print(plotname)
scatter(gc_cov.index * 100, y / y.mean(), s=4, label=plotname)
figure()
hlines(
1,
gc_cov.index.min() * 100,
gc_cov.index.max() * 100,
"k",
linestyles="dotted",
)
scatter(gc_cov.index * 100, y / y.mean(), label=plotname)
ylabel("Normed Coverage")
xlabel("% GC")
savefig(path.join(outdir, "{}_gc_cov_normed.png".format(plotname)), dpi=300)
figure(1)
hlines(
1, gc_cov.index.min() * 100, gc_cov.index.max() * 100, "k", linestyles="dotted"
)
ylabel("Normed Coverage")
legend()
subplot(2, 1, 2)
hist(gc.frac_gc * 100, bins=gc_steps * 100)
xlabel("% GC")
ylabel("Density")
savefig(path.join(args.output_dir, "gc_cov_normed.png"), dpi=300)