-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmonitor_experiment.py
134 lines (110 loc) · 4.73 KB
/
monitor_experiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import hashlib
import os
import time
def get_file_hash(filename):
    """Return the SHA-256 hex digest of the contents of *filename*.

    The file is opened in binary mode and consumed in 8192-byte pieces, so the
    whole file is never held in memory at once.
    """
    digest = hashlib.sha256()
    with open(filename, "rb") as stream:
        # iter() with a sentinel stops at the first empty read, i.e. end of file.
        for block in iter(lambda: stream.read(8192), b""):
            digest.update(block)
    return digest.hexdigest()
def get_file_hash_with_mtime(filename):
    """Return a SHA-256 hex digest covering the file's mtime and its contents.

    The hash is seeded with the string form of ``os.path.getmtime(filename)``
    before the file bytes are fed in, so the digest changes when the
    modification time changes even if the contents do not.
    """
    seed = str(os.path.getmtime(filename)).encode()
    with open(filename, "rb") as stream:
        digest = hashlib.sha256(seed)
        for block in iter(lambda: stream.read(8192), b""):
            digest.update(block)
    return digest.hexdigest()
def get_number_of_lines(filename):
    """Return the number of lines in the text file *filename*.

    A final line without a trailing newline still counts as a line; an empty
    file yields 0.

    The file is decoded as UTF-8 with ``errors="replace"``: only the line
    breaks matter here, so a stray undecodable byte (or a locale whose default
    encoding differs from the file's) must not abort the count with a
    ``UnicodeDecodeError``.
    """
    with open(filename, "r", encoding="utf-8", errors="replace") as file:
        return sum(1 for _ in file)
def check_for_new_experiment(current_hash, matrix_file):
    """Detect whether *matrix_file* differs from the version behind *current_hash*.

    Returns:
        ``(new_hash, line_count)`` when the file's current hash differs from
        *current_hash*, where ``line_count`` is the raw number of lines in the
        file; ``(None, None)`` when the hash is unchanged.
    """
    latest_hash = get_file_hash(matrix_file)
    if latest_hash == current_hash:
        return None, None
    return latest_hash, get_number_of_lines(matrix_file)
def get_latest_mtime(directory):
    """Return the most recent modification time of any file under *directory*.

    The directory tree is walked recursively with ``os.walk``.

    Returns:
        The largest mtime found (seconds since the epoch, as returned by
        ``os.path.getmtime``), or ``None`` when no file could be inspected,
        e.g. the tree contains no files or *directory* does not exist.
    """
    latest = None
    for root, _, files in os.walk(directory):
        for name in files:
            try:
                file_mtime = os.path.getmtime(os.path.join(root, name))
            except FileNotFoundError:
                # The folder may be written to while it is being scanned: a file
                # can vanish between the walk and the stat, and a dangling
                # symlink is listed but cannot be stat'ed. Skip such entries.
                continue
            if latest is None or file_mtime > latest:
                latest = file_mtime
    return latest
def get_earliest_mtime(directory):
    """Return the oldest modification time of any file under *directory*.

    The directory tree is walked recursively with ``os.walk``.

    Returns:
        The smallest mtime found (seconds since the epoch, as returned by
        ``os.path.getmtime``), or ``None`` when no file could be inspected,
        e.g. the tree contains no files or *directory* does not exist.
    """
    earliest = None
    for root, _, files in os.walk(directory):
        for name in files:
            try:
                file_mtime = os.path.getmtime(os.path.join(root, name))
            except FileNotFoundError:
                # The folder may be written to while it is being scanned: a file
                # can vanish between the walk and the stat, and a dangling
                # symlink is listed but cannot be stat'ed. Skip such entries.
                continue
            if earliest is None or file_mtime < earliest:
                earliest = file_mtime
    return earliest
def check_experiment_status(exp_folder):
    """Count started and finished runs below *exp_folder*.

    Every sub-directory of *exp_folder* is inspected as one run; plain files
    at the top level are ignored. A run is counted as started when its folder
    contains an entry named ``parameters.env``, and additionally as finished
    when that folder holds more than one entry.

    Returns:
        A ``(started_runs, finished_runs)`` tuple of integers.
    """
    started = 0
    finished = 0
    for entry in os.listdir(exp_folder):
        candidate = os.path.join(exp_folder, entry)
        if os.path.isdir(candidate):
            contents = os.listdir(candidate)
            if "parameters.env" not in contents:
                continue
            started += 1
            # Anything besides parameters.env is taken as evidence of output.
            if len(contents) > 1:
                finished += 1
    return started, finished
def _format_clock(seconds, with_hours=False):
    """Render a duration in seconds as ``MM:SS`` (or ``HH:MM:SS``).

    The value is rounded to whole seconds *before* it is split into fields, so
    the seconds (or minutes) field can never be displayed as ``60``. Negative
    durations are shown as zero.
    """
    minutes, secs = divmod(int(round(max(seconds, 0))), 60)
    if with_hours:
        hours, minutes = divmod(minutes, 60)
        return f"{hours:02,d}:{minutes:02d}:{secs:02d}"
    return f"{minutes:02,d}:{secs:02d}"


def monitor(repeat, interval_seconds=1, matrix_file="runs_table.csv", exp_folder="./runs", debug=False):
    """Print the progress of the experiment stored in *exp_folder*.

    Args:
        repeat: When true, keep reporting forever, sleeping *interval_seconds*
            between passes; when false, report once and return.
        interval_seconds: Pause between passes when *repeat* is true.
        matrix_file: Runs table; its line count minus one is used as the
            number of runs in the experiment.
        exp_folder: Folder holding one sub-directory per run.
        debug: When true, also print how long the monitor has been running.

    The start and end of the experiment are taken from the oldest and newest
    file modification times under *exp_folder*. If the folder is missing, or
    exists but does not contain any file yet, "No experiment in progress." is
    printed instead of a report.

    Raises:
        FileNotFoundError: If *exp_folder* contains files but *matrix_file*
            does not exist.
    """
    if debug:
        # Taken once, before the loop: with repeat=True the debug line reports
        # the time elapsed since monitor() was called, not the time per pass.
        start_monitor = time.time()
    while True:
        experiment_start_time = get_earliest_mtime(exp_folder) if os.path.exists(exp_folder) else None
        if experiment_start_time is None:
            # Missing folder, or a folder without any file yet: there are no
            # timestamps to report on.
            print("No experiment in progress.", end="\r", flush=True)
        else:
            # presumably the "- 1" discounts a header row in the runs table — confirm
            experiment_length = get_number_of_lines(matrix_file) - 1
            print(f"\nExperiment started at {time.ctime(experiment_start_time)}", end="\r", flush=True)
            started_runs, finished_runs = check_experiment_status(exp_folder)
            experiment_end_time = get_latest_mtime(exp_folder)
            if experiment_end_time is None:
                # Files disappeared between the two scans; report a zero-length span.
                experiment_end_time = experiment_start_time
            experiment_duration = experiment_end_time - experiment_start_time
            experiment_duration_till_now = time.time() - experiment_start_time
            if finished_runs == experiment_length:
                # A table with no runs would otherwise divide by zero here.
                seconds_per_run = experiment_duration / experiment_length if experiment_length else 0.0
                print(
                    f"\nExperiment finished at {time.ctime(experiment_end_time)}.\n"
                    f"Total {_format_clock(experiment_duration)}. "
                    f"{experiment_length} runs, {seconds_per_run:.2f} seconds per run.",
                    end="\r",
                    flush=True,
                )
            else:
                logstr = (
                    f" ({_format_clock(experiment_duration_till_now)} ago)"
                    f"\nRuns: running={started_runs - finished_runs:3.0f}, "
                    f"finished={finished_runs:3.0f}, "
                    f"permutations={experiment_length:3.0f}"
                )
                if finished_runs > 0:
                    run_speed = experiment_duration / finished_runs
                    logstr += f", {run_speed:5.1f}s per run"
                    # More finished runs than table rows must not yield a negative ETA.
                    runs_left = max(experiment_length - finished_runs, 0)
                    eta = runs_left * run_speed
                    logstr += f", ETA {_format_clock(eta, with_hours=True)}"
                print(logstr)
        if debug:
            print(f"monitor took {time.time() - start_monitor:,.3f} seconds", end="\r", flush=True)
        if not repeat:
            break  # Exit the loop if not repeating
        time.sleep(interval_seconds)
if __name__ == "__main__":
    # Usage: <script> [interval_seconds]
    # With an interval the monitor keeps polling at that period; without one
    # it prints a single report and exits.
    import sys  # explicit import: ``os.sys`` only works because ``os`` happens to import ``sys``

    if len(sys.argv) > 1:
        monitor(repeat=True, interval_seconds=int(sys.argv[1]))
    else:
        monitor(repeat=False)