-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathplot.py
141 lines (121 loc) · 4.48 KB
/
plot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import math
import argparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tikzplotlib
from glob import glob
from scipy import stats
# Command-line interface. Both input options take a path or glob pattern for
# .csv files; each data source is optional and is plotted only when given.
parser = argparse.ArgumentParser(
    description='Plot fping/iperf delay distribution.',
)
parser.add_argument(
    '--iperf',
    type=str,
    help='iperf .csv file.',
)
parser.add_argument(
    '--fping',
    type=str,
    help='fping .csv file.',
)
# Stored as args.utc_offset; defaults to 2 hours.
parser.add_argument(
    '--utc-offset',
    type=float,
    default=2,
    help='Hours offset from UTC.',
)
def load_df(pattern, utc_offset=None, min_hour=0, max_hour=24):
    '''Load all .csv files matching pattern, concatenate them, and return as a df.

    Column names are stripped of surrounding whitespace. Derived columns:

    * 'seconds' - computed from 'ms' when only 'ms' is present.
    * 'KB', 'MB', 'bin_kb', 'bin_mb' - added when a 'bytes' column is present.
    * 'timestamp' - converted from epoch seconds to datetimes.
    * 'hour' - hour of day of the (shifted) timestamp.

    Args:
        pattern: glob pattern selecting the .csv files to read.
        utc_offset: hours added to 'timestamp' before conversion; None or 0
            leaves the timestamps unchanged.
        min_hour: samples with hour < min_hour are dropped.
        max_hour: samples with hour > max_hour are dropped.

    Returns:
        The concatenated DataFrame with a unique integer index.

    Raises:
        ValueError: if no file matches pattern.
    '''
    # Sorted so that row order does not depend on directory listing order.
    filenames = sorted(glob(pattern))
    if not filenames:
        raise ValueError(f'no files match pattern {pattern!r}')

    # ignore_index gives every row a unique label. Without it each file keeps
    # its own 0..n-1 labels, and the label-based drop below would also remove
    # rows from *other* files that happen to share a label.
    df = pd.concat(
        (pd.read_csv(filename) for filename in filenames),
        ignore_index=True,
    )
    df.rename(columns=lambda x: x.strip(), inplace=True)

    if 'ms' in df and 'seconds' not in df:
        df['seconds'] = df['ms'] / 1e3

    if 'bytes' in df:
        df['KB'] = df['bytes'] / 1e3
        df['MB'] = df['bytes'] / 1e6

        # NOTE(review): pd.cut with an integer creates that many equal-width
        # bins over the observed [min, max] range, so a bin label is not
        # necessarily the integer KB/MB value - confirm this is intended.
        # bin by kb
        max_bin = math.ceil(df['KB'].max())
        df['bin_kb'] = pd.cut(df['KB'], max_bin, labels=False)

        # bin by MB
        max_bin = math.ceil(df['MB'].max())
        df['bin_mb'] = pd.cut(df['MB'], max_bin, labels=False)

    # convert timestamp time to datetime
    if utc_offset:
        df['timestamp'] += utc_offset * 3600
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
    df['hour'] = df['timestamp'].dt.hour

    # drop samples outside min_hour, max_hour
    outside = (df['hour'] < min_hour) | (df['hour'] > max_hour)
    df.drop(index=df.index[outside], inplace=True)
    return df
def pdf_from_series(series, bins='log', base=1.01):
    '''Return the PDF of the data samples in the series.

    Args:
        series: the samples (a pandas Series).
        bins: 'log' to build geometrically spaced bins in which every edge is
            `base` times the previous one, spanning from the smallest to (at
            least) the largest sample. Any other value is forwarded unchanged
            to np.histogram.
        base: ratio between consecutive bin edges when bins == 'log'.

    Returns:
        (pdf, bin_edges). pdf is the histogram density with a zero prepended,
        so that it has the same length as bin_edges; pdf[i] for i >= 1 is the
        density over [bin_edges[i-1], bin_edges[i]].

    Raises:
        ValueError: if bins == 'log' and the series is empty or contains
            non-positive samples (their logarithm is undefined).
    '''
    # isinstance guard: comparing an array of explicit edges to a string is
    # not a plain boolean.
    if isinstance(bins, str) and bins == 'log':
        lo, hi = series.min(), series.max()
        if not lo > 0:  # also catches NaN from an empty series
            raise ValueError('log-spaced bins require strictly positive samples')
        # Number of base-ratio steps from the smallest to the largest sample;
        # at least one so that a constant series still gets a bin.
        num = max(1, math.ceil(math.log(hi / lo) / math.log(base)))
        # max() guards against rounding leaving the last edge just below hi.
        top = max(hi, lo * base**num)
        # num bins need num + 1 edges; geomspace pins the end points exactly,
        # so the smallest sample is never lost below the first edge.
        bins = np.geomspace(lo, top, num=num + 1)
    pdf, bin_edges = np.histogram(series, bins=bins, density=True)
    pdf = np.insert(pdf, 0, 0)
    return pdf, bin_edges
def cdf_from_series(series):
    '''Return the CDF of the data samples in series.

    Returns:
        (cdf, bin_edges), where cdf[i] is the fraction of the histogrammed
        samples that lie at or below bin_edges[i]; cdf[0] is 0 and cdf[-1]
        is 1.
    '''
    pdf, bin_edges = pdf_from_series(series)
    # pdf holds *densities* (with a leading zero pad), and the bins are not of
    # equal width, so each density has to be weighted by its bin width to get
    # the probability mass before accumulating.
    mass = pdf[1:] * np.diff(bin_edges)
    cdf = np.concatenate(([0.0], np.cumsum(mass)))
    # Normalise away accumulated rounding error so that the CDF ends at 1.
    cdf /= cdf[-1]
    return cdf, bin_edges
def fit_exponential(series, q=0.01):
    '''Fit an exponential RV to the samples in series.

    Outliers are trimmed before fitting: samples at or below the q-quantile
    are discarded first, then samples at or above the 0.99-quantile of what
    remains.

    Args:
        series: the samples (a pandas Series).
        q: lower quantile used as the trimming threshold.

    Returns:
        (loc, scale) as estimated by scipy.stats.expon.fit on the trimmed
        samples.
    '''
    lower = series.quantile(q=q)  # filter outliers
    series = series.loc[series > lower]
    series = series.loc[series < series.quantile(0.99)]
    loc, scale = stats.expon.fit(series)
    return loc, scale
def plot_distribution(df, type='ccdf', label=None, fit_distribution=True, plotf=None):
    '''Plot the distribution of the 'seconds' column of df.

    Args:
        df: DataFrame with a 'seconds' column.
        type: which curve to draw: 'pdf', 'cdf' or 'ccdf' (1 - cdf).
        label: legend label of the empirical curve.
        fit_distribution: also draw a fitted exponential as a dashed line in
            the same colour; only implemented for type 'ccdf'.
        plotf: plotting function called as plotf(x, y, fmt, ...); defaults to
            plt.loglog.
    '''
    assert type in ['pdf', 'cdf', 'ccdf']
    draw = plt.loglog if plotf is None else plotf
    delays = df['seconds']

    if type == 'pdf':
        values, edges = pdf_from_series(delays)
    else:
        # 'cdf' and 'ccdf' share the CDF; the latter is its complement.
        values, edges = cdf_from_series(delays)
        if type == 'ccdf':
            values = 1-values
    lines = draw(edges, values, '-', label=label)

    if not fit_distribution:
        return

    assert type == 'ccdf', 'not implemented'
    loc, scale = fit_exponential(delays)
    print(f'loc={loc} scale={scale}, fitted mean={loc+scale}, empirical mean: {df["seconds"].mean()}')
    # Reuse the colour of the empirical curve for its fitted counterpart.
    fitted_ccdf = 1-stats.expon.cdf(edges, loc=loc, scale=scale)
    draw(edges, fitted_ccdf, '--', color=lines[-1].get_color())
def main():
    '''Parse the command line, load the requested data and plot the delay CCDFs.

    The figure is exported to 'ccdf.tex' via tikzplotlib and then shown.
    '''
    args = parser.parse_args()

    # Forward --utc-offset so the option actually takes effect.
    fping_df = None
    if args.fping:
        fping_df = load_df(args.fping, utc_offset=args.utc_offset)
    iperf_df = None
    if args.iperf:
        iperf_df = load_df(args.iperf, utc_offset=args.utc_offset)

    # delay histograms
    distribution_type = 'ccdf'
    fit_distribution = False
    plotf = plt.loglog
    plt.figure()
    if fping_df is not None:
        plot_distribution(
            fping_df, label='Ping', plotf=plotf,
            type=distribution_type, fit_distribution=fit_distribution,
        )
    if iperf_df is not None:
        bs = [1, 2, 3, 10]
        for b in bs:
            g = iperf_df.loc[iperf_df['bin_mb'] == b]
            print(f'{b} MB: {len(g)} samples')
            if g.empty:
                # Nothing to plot; a histogram of zero samples would raise.
                continue
            plot_distribution(
                g, label=f'{b} MB', plotf=plotf,
                type=distribution_type, fit_distribution=fit_distribution,
            )

    plt.xlim(1e-5, 1)
    plt.ylim(1e-4, 1)
    plt.title('Network delay')
    plt.xlabel('Delay t [s]')
    plt.ylabel('Pr(delay > t)')
    plt.grid()
    plt.legend()
    tikzplotlib.clean_figure()
    tikzplotlib.save('ccdf.tex')
    plt.show()


if __name__ == '__main__':
    main()