-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfast_gather_summaries.py
executable file
·72 lines (52 loc) · 2.32 KB
/
fast_gather_summaries.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python
import glob
import os
import pandas as pd
import numpy as np
import sqlite3
def construct_runname(inpath, replaces=('_glance', '_sci', '_meta', '_ss', '_ddf')):
    """Construct a run name from a directory path.

    The run name is the final component of ``inpath`` (after normalising
    the path, so a trailing separator is ignored) with every substring
    listed in ``replaces`` removed.

    Parameters
    ----------
    inpath : str
        Path to a directory; only its last component is used.
    replaces : iterable of str
        Substrings to delete from the directory name. They are removed in
        the order given and wherever they occur in the name, not only at
        the end.

    Returns
    -------
    str
        The directory name with the listed substrings stripped out.
    """
    # normpath drops any trailing separator so basename does not return ''.
    result = os.path.basename(os.path.normpath(inpath))
    for rstring in replaces:
        result = result.replace(rstring, '')
    return result
def fast_gather(dirname='.', dbfilename='resultsDb_sqlite.db'):
    """Gather the summary statistics of many results databases into one table.

    Every immediate sub-directory of ``dirname`` that contains a file named
    ``dbfilename`` is opened as a SQLite database, and the joined contents
    of its ``summarystats`` and ``metrics`` tables are read. Each database
    contributes one row, labelled with the run name derived from its
    directory by ``construct_runname``.

    Parameters
    ----------
    dirname : str
        Directory whose immediate sub-directories are searched.
    dbfilename : str
        File name of the SQLite database expected in each sub-directory.

    Returns
    -------
    pandas.DataFrame
        One row per unique run name (sorted) and one column per unique
        "<metric_name> <metric_info_label> <summary_name>" label (sorted).
        Cells hold ``summary_value``; combinations a run does not provide
        are NaN. If several directories map to the same run name their
        values share a row, with later directories (in sorted order)
        overwriting earlier ones for identical columns. The frame is empty
        when no database is found.
    """
    # Sort so that the result does not depend on filesystem listing order.
    potential_dirs = sorted(glob.glob(dirname + '/*/'))
    db_files = []
    run_names = []
    for dname in potential_dirs:
        fname = os.path.join(dname, dbfilename)
        if os.path.isfile(fname):
            db_files.append(fname)
            run_names.append(construct_runname(dname))

    # Query to grab all the summary stats
    sql_q = 'select metrics.metric_name, metrics.metric_info_label, summarystats.summary_name, summarystats.summary_value '
    sql_q += 'FROM summarystats INNER JOIN metrics ON metrics.metric_id=summarystats.metric_id'

    rows = []
    for row_name, fname in zip(run_names, db_files):
        con = sqlite3.connect(fname)
        try:
            temp_df = pd.read_sql(sql_q, con)
        finally:
            # Release the database handle even if the query fails.
            con.close()

        # One column label per summary statistic. str() keeps this working
        # for empty results and for labels that are not plain strings.
        col_names = [
            ' '.join((str(metric), str(label), str(summary)))
            for metric, label, summary in zip(temp_df['metric_name'],
                                              temp_df['metric_info_label'],
                                              temp_df['summary_name'])
        ]

        # Make a single-row DataFrame for this run
        values = temp_df['summary_value'].values
        row = pd.DataFrame(values.reshape([1, values.size]),
                           columns=col_names, index=[row_name])
        rows.append(row)

    # Create final large DataFrame to hold everything, pre-filled with NaN
    if rows:
        all_cols = np.unique(np.concatenate([r.columns.values for r in rows]))
    else:
        # np.concatenate refuses an empty list; fall back to no columns.
        all_cols = np.array([], dtype=object)
    u_names = np.unique(run_names)
    result_df = pd.DataFrame(np.full([u_names.size, all_cols.size], np.nan),
                             columns=all_cols, index=u_names)

    # Put each row into the final DataFrame
    for row_name, row in zip(run_names, rows):
        if row.shape[1] == 0:
            # Nothing to copy for a database without summary statistics.
            continue
        # A single .loc[row, cols] call writes into result_df itself;
        # chained indexing (.loc[row][cols]) may assign to a temporary copy.
        result_df.loc[row_name, row.columns] = np.ravel(row.values)
    return result_df
if __name__ == '__main__':
    # Script entry point: gather the summary statistics found below the
    # current directory and store the table in summary.h5 under key "stats".
    fast_gather().to_hdf('summary.h5', key="stats")