-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathutils.py
312 lines (262 loc) · 15.5 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
import math
import awkward as ak
import numpy as np
def read_input(inputfiles):
import h5py
for i, fname in enumerate(inputfiles):
print("read file", fname)
with h5py.File(fname, 'r') as h5f:
if i == 0:
X = h5f['X'][:]
Y = h5f['Y'][:]
else:
X = np.concatenate((X, h5f['X']), axis=0)
Y = np.concatenate((Y, h5f['Y']), axis=0)
print("finish reading files")
return X, Y
def convertXY2PtPhi(arrayXY):
# convert from array with [:,0] as X and [:,1] as Y to [:,0] as pt and [:,1] as phi
nevents = arrayXY.shape[0]
arrayPtPhi = np.zeros((nevents, 2))
arrayPtPhi[:, 0] = np.sqrt((arrayXY[:, 0]**2 + arrayXY[:, 1]**2))
arrayPtPhi[:, 1] = np.arctan2(arrayXY[:, 1], arrayXY[:, 0])
return arrayPtPhi
def preProcessing(A, normFac, EVT=None):
""" pre-processing input """
norm = normFac
pt = A[:, :, 0:1] / norm
px = A[:, :, 1:2] / norm
py = A[:, :, 2:3] / norm
eta = A[:, :, 3:4]
phi = A[:, :, 4:5]
puppi = A[:, :, 5:6]
# remove outliers
pt[np.where(np.abs(pt > 500/norm))] = 0.
px[np.where(np.abs(px > 500/norm))] = 0.
py[np.where(np.abs(py > 500/norm))] = 0.
inputs = np.concatenate((pt, eta, phi, puppi), axis=2)
pxpy = np.concatenate((px, py), axis=2)
inputs_cat0 = A[:, :, 6] # encoded PF pdgId
inputs_cat1 = A[:, :, 7] # encoded PF charge
return inputs, pxpy, inputs_cat0, inputs_cat1
def MakePlots(trueXY, mlXY, puppiXY, path_out):
# make the 1d distribution, response, resolution,
# and response-corrected resolution plots
# input has [:,0] as X and [:,1] as Y
import matplotlib.pyplot as plt
import mplhep as hep
plt.style.use(hep.style.CMS)
true_ptPhi = convertXY2PtPhi(trueXY)
ml_ptPhi = convertXY2PtPhi(mlXY)
puppi_ptPhi = convertXY2PtPhi(puppiXY)
# [:,0] is pt; [:,1] is phi
Make1DHists(trueXY[:, 0], mlXY[:, 0], puppiXY[:, 0], -400, 400, 40, False, 'MET X [GeV]', 'A.U.', f'{path_out}MET_x.png')
Make1DHists(trueXY[:, 1], mlXY[:, 1], puppiXY[:, 1], -400, 400, 40, False, 'MET Y [GeV]', 'A.U.', f'{path_out}MET_y.png')
Make1DHists(true_ptPhi[:, 0], ml_ptPhi[:, 0], puppi_ptPhi[:, 0], 0, 400, 40, False, 'MET Pt [GeV]', 'A.U.', f'{path_out}MET_pt.png')
# do statistics
from scipy.stats import binned_statistic
nbins = 20
binnings = np.linspace(0, 400, num=nbins+1) # create 20 bins for pt from 0 to 400 GeV
phiBinnings = np.linspace(-3.15, 3.15, num=nbins+1)
truth_means, bin_edges, binnumber = binned_statistic(true_ptPhi[:, 0], true_ptPhi[:, 0], statistic='mean', bins=binnings, range=(0, 400))
ml_means, _, _ = binned_statistic(true_ptPhi[:, 0], ml_ptPhi[:, 0],
statistic='mean', bins=binnings, range=(0, 400))
puppi_means, _, _ = binned_statistic(true_ptPhi[:, 0], puppi_ptPhi[:, 0],
statistic='mean', bins=binnings, range=(0, 400))
# plot response
plt.figure()
plt.hlines(truth_means/truth_means, bin_edges[:-1], bin_edges[1:], colors='k', lw=5,
label='Truth', linestyles='solid')
plt.hlines(ml_means/truth_means, bin_edges[:-1], bin_edges[1:], colors='r', lw=5,
label='ML', linestyles='solid')
plt.hlines(puppi_means/truth_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=5,
label='PUPPI', linestyles='solid')
plt.xlim(0, 400.0)
plt.ylim(0, 1.1)
plt.xlabel('Truth MET [GeV]')
plt.legend(loc='lower right')
plt.ylabel('<MET Estimation>/<MET Truth>')
plt.savefig(f"{path_out}MET_response.png")
plt.close()
# width of a distribution at 1 standard deviation
def resolqt(y):
return(np.percentile(y, 84)-np.percentile(y, 16))/2.0
# response correction factors
# the events are split into 20 bins based on true_pt and get assigned the corresponding `truth_means/ml_means` of all events in that bin
# when ml/puppi_pt is scaled by this response correction, the mean of this distributiobn will coincide with the mean of true_pt
responseCorrection_ml = np.take(truth_means/ml_means, np.digitize(true_ptPhi[:, 0], binnings)-1, mode='clip')
responseCorrection_puppi = np.take(truth_means/puppi_means, np.digitize(true_ptPhi[:, 0], binnings)-1, mode='clip')
# Phi calculation
Phi_diff_ml = true_ptPhi[:, 1] - ml_ptPhi[:, 1]
Phi_diff_ml = np.where(Phi_diff_ml < -math.pi, Phi_diff_ml + 2*math.pi, Phi_diff_ml)
Phi_diff_ml = np.where(Phi_diff_ml > math.pi, Phi_diff_ml - 2*math.pi, Phi_diff_ml)
Phi_diff_puppi = true_ptPhi[:, 1] - puppi_ptPhi[:, 1]
Phi_diff_puppi = np.where(Phi_diff_puppi < -math.pi, Phi_diff_puppi + 2*math.pi, Phi_diff_puppi)
Phi_diff_puppi = np.where(Phi_diff_puppi > math.pi, Phi_diff_puppi - 2*math.pi, Phi_diff_puppi)
# compute resolutions inside all 20 bins
# the dirstribution true_pt - respCor*ml/puppi_pt should be centered at 0
bin_resolX_ml, bin_edges, binnumber = binned_statistic(true_ptPhi[:, 0], trueXY[:, 0] - mlXY[:, 0] * responseCorrection_ml,
statistic=resolqt, bins=binnings, range=(0, 400))
bin_resolY_ml, _, _ = binned_statistic(true_ptPhi[:, 0], trueXY[:, 1] - mlXY[:, 1] * responseCorrection_ml,
statistic=resolqt, bins=binnings, range=(0, 400))
bin_resolPt_ml, _, _ = binned_statistic(true_ptPhi[:, 0], true_ptPhi[:, 0] - ml_ptPhi[:, 0] * responseCorrection_ml,
statistic=resolqt, bins=binnings, range=(0, 400))
bin_resolPhi_ml, bin_edgesPhi, binnumberPhi = binned_statistic(true_ptPhi[:, 1], Phi_diff_ml,
statistic=resolqt, bins=phiBinnings, range=(-3.15, 3.15))
bin_resolX_puppi, _, _ = binned_statistic(true_ptPhi[:, 0], trueXY[:, 0] - puppiXY[:, 0] * responseCorrection_puppi,
statistic=resolqt, bins=binnings, range=(0, 400))
bin_resolY_puppi, _, _ = binned_statistic(true_ptPhi[:, 0], trueXY[:, 1] - puppiXY[:, 1] * responseCorrection_puppi,
statistic=resolqt, bins=binnings, range=(0, 400))
bin_resolPt_puppi, _, _ = binned_statistic(true_ptPhi[:, 0], true_ptPhi[:, 0] - puppi_ptPhi[:, 0] * responseCorrection_puppi,
statistic=resolqt, bins=binnings, range=(0, 400))
bin_resolPhi_puppi, _, _ = binned_statistic(true_ptPhi[:, 1], Phi_diff_puppi,
statistic=resolqt, bins=phiBinnings, range=(-3.15, 3.15))
# calclate the resolution "magnitude" inside all 20 bins
bin_resolXYmagnitude_ml = (bin_resolX_ml**2+bin_resolY_ml**2)**.5
bin_resolXYmagnitude_puppi = (bin_resolX_puppi**2+bin_resolY_puppi**2)**.5
bin_resolXYmagnitude_difference = bin_resolXYmagnitude_puppi - bin_resolXYmagnitude_ml
# transverse MET resolution difference
bin_resolPt_difference = bin_resolPt_puppi - bin_resolPt_ml
# compute the resolution over the entire dataset (1 bin)
average_xRes_ml = resolqt(trueXY[:, 0] - mlXY[:, 0] * responseCorrection_ml)
average_yRes_ml = resolqt(trueXY[:, 1] - mlXY[:, 1] * responseCorrection_ml)
average_ptRes_ml = resolqt(true_ptPhi[:, 0] - ml_ptPhi[:, 0] * responseCorrection_ml)
average_xRes_puppi = resolqt(trueXY[:, 0] - puppiXY[:, 0] * responseCorrection_puppi)
average_yRes_puppi = resolqt(trueXY[:, 1] - puppiXY[:, 1] * responseCorrection_puppi)
average_ptRes_puppi = resolqt(true_ptPhi[:, 0] - puppi_ptPhi[:, 0] * responseCorrection_puppi)
# and the resolution "magnitudes" and the corresponding difference between the puppi and ml predictions
averageXYmag_Res_puppi = (average_xRes_puppi**2+average_yRes_puppi**2)**(.5)
averageXYmag_Res_ml = (average_xRes_ml**2+average_yRes_ml**2)**(.5)
averageXYmag_Res_difference = averageXYmag_Res_puppi-averageXYmag_Res_ml
averagePt_Res_difference = average_ptRes_puppi - average_ptRes_ml
# these two similar metrics can be used to compare the performance across trainings
# for now, i will compute both
# the square root of the number of events in each bin
nEvents_inBin, _ = np.histogram(binnumber, bins=nbins, range=(1, nbins))
rootN = np.sqrt(nEvents_inBin)
nEvents_inBin_phi, _ = np.histogram(binnumberPhi, bins=nbins, range=(1, nbins))
rootN_phi = np.sqrt(nEvents_inBin_phi)
# is used to calculate the error bars for each bin = res/rootN
# locations of error bars
binWidth = binnings[1] # =20
# +8 and +12 put the error bars slightly off the center of the horizontal lines
binCenter = binWidth/2
leftOfBinCenter = .4*binWidth # =8
rightOfBinCenter = .6*binWidth # =12
fig1 = plt.figure(figsize=(15, 12), tight_layout=True)
fig2 = plt.figure(figsize=(15, 12), tight_layout=True)
plt.subplots_adjust(wspace=.2,
hspace=0)
# plot x resolution 20 bins
ax11 = fig1.add_subplot(2, 2, 1)
ax11.hlines(bin_resolX_ml, bin_edges[:-1], bin_edges[1:], colors='r', lw=3,
label='ML', linestyles='solid')
ax11.hlines(bin_resolX_puppi, bin_edges[:-1], bin_edges[1:], colors='g', lw=3,
label='PUPPI', linestyles='solid')
ax11.errorbar(bin_edges[:-1]+rightOfBinCenter, bin_resolX_ml,
yerr=bin_resolX_ml/rootN, fmt='none', color='r')
ax11.errorbar(bin_edges[:-1]+leftOfBinCenter, bin_resolX_puppi,
yerr=bin_resolX_puppi/rootN, fmt='none', color='g')
ax11.grid()
ax11.set_ylabel(r'$\sigma(MET)$ [GeV]', fontsize=19)
ax11.set_title('MET-x Resolution', fontsize=22)
# plot y resolutions 20 bins
ax12 = fig1.add_subplot(2, 2, 2, sharey=ax11)
ax12.hlines(bin_resolY_ml, bin_edges[:-1], bin_edges[1:], colors='r', lw=3,
label='$\\sigma_{ML}$', linestyles='solid')
ax12.hlines(bin_resolY_puppi, bin_edges[:-1], bin_edges[1:], colors='g', lw=3,
label='$\\sigma_{PUPPI}$', linestyles='solid')
ax12.errorbar(bin_edges[:-1]+rightOfBinCenter, bin_resolY_ml,
yerr=bin_resolY_ml/rootN, fmt='none', color='r')
ax12.errorbar(bin_edges[:-1]+leftOfBinCenter, bin_resolY_puppi,
yerr=bin_resolY_puppi/rootN, fmt='none', color='g')
ax12.legend(loc='upper center', prop={'size': 19})
ax12.grid()
ax12.set_title('MET-y Resolution', fontsize=22)
# plot resolution XY magnitude absolute differences
ax13 = fig1.add_subplot(2, 2, 3, sharex=ax11)
ax13.hlines(bin_resolXYmagnitude_difference, bin_edges[:-1], bin_edges[1:], lw=5, linestyles='solid')
ax13.axhline(y=0, color='black', linestyle='-')
ax13.set_xlabel('Truth MET [GeV]', fontsize=19)
ax13.set_ylabel(r'$\sigma_{PUPPI} - \sigma_{ML}$ [GeV]', fontsize=19)
ax13.grid()
ax13.set_title('Absolute Resolution Differences (PUPPI - ML)', fontsize=22)
# relative differences
ax14 = fig1.add_subplot(2, 2, 4, sharex=ax12)
ax14.hlines(bin_resolXYmagnitude_difference/truth_means, bin_edges[:-1], bin_edges[1:], lw=5, linestyles='solid')
ax14.axhline(y=0, color='black', linestyle='-')
ax14.set_ylim(min(bin_resolXYmagnitude_difference/truth_means)-.1, max(bin_resolXYmagnitude_difference/truth_means)+.1)
ax14.set_xlabel('Truth MET [GeV]', fontsize=19)
ax14.set_ylabel(r'$(\sigma_{PUPPI} - \sigma_{ML})$ / $\mu_{bin}$', fontsize=19)
ax14.grid()
ax14.set_title('Relative Resolution Differences', fontsize=22)
trainingName = path_out.split('/')[-2]
fig1.text(0, 1.06, f'training: {trainingName}', fontsize=19)
fig1.text(0, 1.03, r'$\sigma_{PUPPI} - \sigma_{ML}=\sigma_{xyDIF}$; $Mean(\sigma_{xyDIF})=$'+f'{round(averageXYmag_Res_difference,3)}', fontsize=19)
# plot pt resolutions 20 bins
ax21 = fig2.add_subplot(2, 2, 1)
ax21.hlines(bin_resolPt_ml, bin_edges[:-1], bin_edges[1:], colors='r', lw=3,
label='ML', linestyles='solid')
ax21.hlines(bin_resolPt_puppi, bin_edges[:-1], bin_edges[1:], colors='g', lw=3,
label='PUPPI', linestyles='solid')
ax21.errorbar(bin_edges[:-1]+rightOfBinCenter, bin_resolPt_ml,
yerr=bin_resolPt_ml/rootN, fmt='none', color='r')
ax21.errorbar(bin_edges[:-1]+leftOfBinCenter, bin_resolPt_puppi,
yerr=bin_resolPt_puppi/rootN, fmt='none', color='g')
ax21.set_xlabel('truth met [gev]', fontsize=19)
ax21.set_ylabel(r'$\sigma(MET)$ [GeV]', fontsize=20)
ax21.grid()
ax21.set_title('MET-pt Resolution', fontsize=22)
# plot phi resolutions 20 bins
ax22 = fig2.add_subplot(2, 2, 2)
ax22.hlines(bin_resolPhi_ml, bin_edgesPhi[:-1], bin_edgesPhi[1:], colors='r', lw=3,
label='$\\sigma_{ML}$', linestyles='solid')
ax22.hlines(bin_resolPhi_puppi, bin_edgesPhi[:-1], bin_edgesPhi[1:], colors='g', lw=3,
label='$\\sigma_{PUPPI}$', linestyles='solid')
ax22.errorbar(bin_edgesPhi[:-1]+.13, bin_resolPhi_ml,
yerr=bin_resolPhi_ml/rootN_phi, fmt='none', color='r')
ax22.errorbar(bin_edgesPhi[:-1]+.17, bin_resolPhi_puppi,
yerr=bin_resolPhi_puppi/rootN_phi, fmt='none', color='g')
ax22.set_ylabel('radian', fontsize=20)
ax22.set_ylim(0., 1.)
ax22.grid()
ax22.set_xlabel(r'$\phi$ angle', fontsize=19)
ax22.legend(loc='upper center', prop={'size': 19})
ax22.set_title(r'MET-$\Phi$ Resolution', fontsize=20)
# plot resolution pt absolute differences
ax23 = fig2.add_subplot(2, 2, 3, sharex=ax11)
ax23.hlines(bin_resolPt_difference, bin_edges[:-1], bin_edges[1:], lw=5, linestyles='solid')
ax23.axhline(y=0, color='black', linestyle='-')
ax23.set_xlabel('Truth MET [GeV]', fontsize=19)
ax23.set_ylabel(r'$\sigma_{PUPPI} - \sigma_{ML}$ [GeV]', fontsize=20)
ax23.grid()
ax23.set_title('Absolute Resolution Differences', fontsize=22)
# relative differences
ax24 = fig2.add_subplot(2, 2, 4, sharex=ax12)
ax24.hlines(bin_resolPt_difference/truth_means, bin_edges[:-1], bin_edges[1:], lw=5, linestyles='solid')
ax24.axhline(y=0, color='black', linestyle='-')
ax24.set_ylim(min(bin_resolPt_difference/truth_means)-.1, max(bin_resolPt_difference/truth_means)+.1)
ax24.set_xlabel('Truth MET [GeV]', fontsize=19)
ax24.set_ylabel(r'$(\sigma_{PUPPI} - \sigma_{ML})$ / $\mu_{bin}$', fontsize=19)
ax24.grid()
ax24.set_title(f'Relative Resolution Differences', fontsize=22)
fig2.text(0, 1.06, f'training: {trainingName}', fontsize=19)
fig2.text(0, 1.03, r'$\sigma_{PUPPI} - \sigma_{ML}=\sigma_{DIF}$; $Mean(\sigma_{DIF})=$'+f'{round(averagePt_Res_difference,3)}', fontsize=19)
fig1.savefig(f"{path_out}XY_resolution_plots.png", bbox_inches="tight")
fig2.savefig(f"{path_out}pt_resolution_plots.png", bbox_inches="tight")
def Make1DHists(truth, ML, PUPPI, xmin=0, xmax=400, nbins=100, density=False, xname="pt [GeV]", yname="A.U.", outputname="1ddistribution.png"):
import matplotlib.pyplot as plt
import mplhep as hep
plt.style.use(hep.style.CMS)
plt.figure(figsize=(10, 8))
plt.hist(truth, bins=nbins, range=(xmin, xmax), density=density, histtype='step', facecolor='k', label='Truth')
plt.hist(ML, bins=nbins, range=(xmin, xmax), density=density, histtype='step', facecolor='r', label='ML')
plt.hist(PUPPI, bins=nbins, range=(xmin, xmax), density=density, histtype='step', facecolor='g', label='PUPPI')
plt.yscale('log')
plt.legend(loc='upper right')
plt.xlabel(xname)
plt.ylabel(yname)
plt.savefig(outputname)
plt.close()
def to_np_array(ak_array, maxN=100, pad=0):
return ak.fill_none(ak.pad_none(ak_array, maxN, clip=True, axis=-1), pad).to_numpy()