forked from diux-dev/cluster
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbackend.py
290 lines (209 loc) · 8.78 KB
/
backend.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
"""Interface for job launching backend."""
# Job launcher Python API: https://docs.google.com/document/d/1yTkb4IPJXOUaEWksQPCH7q0sjqHgBf3f70cWzfoFboc/edit
# AWS job launcher (concepts): https://docs.google.com/document/d/1IbVn8_ckfVO3Z9gIiE0b9K3UrBRRiO9HYZvXSkPXGuw/edit
import os
import glob
import threading
import util as u
# aws_backend.py
# tmux_backend.py
LOGDIR_PREFIX='/efs/runs'
"""
backend = aws_backend # alternatively, backend=tmux_backend to launch jobs locally in separate tmux sessions
run = backend.make_run("helloworld") # sets up /efs/runs/helloworld
worker_job = run.make_job("worker", instance_type="g3.4xlarge", num_tasks=4, ami=ami, setup_script=setup_script)
ps_job = run.make_job("ps", instance_type="c5.xlarge", num_tasks=4, ami=ami, setup_script=setup_script)
setup_tf_config(worker_job, ps_job)
ps_job.run("python cifar10_main.py --num_gpus=0") # runs command on each task
worker_job.run("python cifar10_main.py --num_gpus=4")
tb_job = run.make_job("tb", instance_type="m4.xlarge", num_tasks=1, public_port=6006)
tb_job.run("tensorboard --logdir=%s --port=%d" %(run.logdir, 6006))
# when job has one task, job.tasks[0].ip can be accessed as job.ip
print("See TensorBoard progress on %s:%d" %(tb_job.ip, 6006))
print("To interact with workers: %s" %(worker_job.connect_instructions))
To reconnect to existing job:
"""
def set_global_logdir_prefix(logdir_prefix):
    """Rebind the module-level LOGDIR_PREFIX to ``logdir_prefix``.

    The assignment targets this module's global namespace, so every later
    read of LOGDIR_PREFIX in this module sees the new value.
    """
    # Writing through globals() is equivalent to a ``global`` declaration
    # followed by a plain assignment.
    globals()["LOGDIR_PREFIX"] = logdir_prefix
# todo: rename to "start_run" instead of make_run?
def make_run(name):
    """Set up a "run" called ``name`` (for instance "training run").

    This version is only a placeholder: it always raises
    NotImplementedError.
    """
    raise NotImplementedError
# def make_job(run_name, job_name, **kwargs):
# """Initializes Job object. It will reuse existing cluster resources if the job with given parameters has already been launched."""
# raise NotImplementedError()
class Run:
    """A set of jobs that share statistics.

    Example: a training run holding a gradient worker job, a parameter
    server job and a TensorBoard visualizer job, all writing checkpoints
    and event files to one shared directory.

    The methods below read ``self.jobs`` and ``self.name``; neither is
    assigned in this class, so they presumably come from a concrete
    backend implementation (confirm against the backends).
    """

    def __init__(self, name, install_script=None):
        """Create a run.

        When ``install_script`` is given it serves as the default install
        script for all jobs (the Job constructor can override it).
        Not implemented here: always raises NotImplementedError.
        """
        raise NotImplementedError()

    def make_job(self, name, num_tasks=1, install_script=None, **kwargs):
        """Create a job inside this run.

        With ``install_script`` left as None the install script associated
        with the Run is used. Not implemented here: always raises
        NotImplementedError.
        """
        raise NotImplementedError()

    def run(self, *args, **kwargs):
        """Forward ``run(*args, **kwargs)`` to each job, in list order."""
        for member in self.jobs:
            member.run(*args, **kwargs)

    def run_and_capture_output(self, *args, **kwargs):
        """Run the command on the first job only and return its result."""
        first_job = self.jobs[0]
        return first_job.run_and_capture_output(*args, **kwargs)

    def _run_raw(self, *args, **kwargs):
        """Forward ``_run_raw(*args, **kwargs)`` to each job, in list order."""
        for member in self.jobs:
            member._run_raw(*args, **kwargs)

    def upload(self, *args, **kwargs):
        """Forward ``upload(*args, **kwargs)`` to each job, in list order."""
        for member in self.jobs:
            member.upload(*args, **kwargs)

    def log(self, message, *args):
        """Print a timestamped message, tagged with the run name.

        If extra positional ``args`` are supplied, ``message`` is treated
        as a %-format string and interpolated with them first.
        """
        stamp = u.current_timestamp()
        text = message % args if args else message
        print("%s %s: %s" % (stamp, self.name, text))
class Job:
    """A group of tasks that are driven together.

    Most methods fan a call out to every entry of ``self.tasks``; the
    properties at the bottom forward to the first task only.
    """

    def __init__(self):
        # Starts empty; the list is filled in outside this class
        # (presumably by a concrete backend -- confirm).
        self.tasks = []

    def run_async(self, cmd, *args, **kwargs):
        """Call ``run`` with ``sync=False`` added to the given arguments."""
        self.run(cmd, *args, sync=False, **kwargs)

    def run(self, cmd, *args, **kwargs):
        """Run ``cmd`` on each task of the job, one after another."""
        for member in self.tasks:
            member.run(cmd, *args, **kwargs)

    def run_and_capture_output(self, cmd, *args, **kwargs):
        """Run ``cmd`` on the first task only and return what it returns."""
        first = self.tasks[0]
        return first.run_and_capture_output(cmd, *args, **kwargs)

    def _run_raw(self, *args, **kwargs):
        """Forward ``_run_raw`` to each task of the job."""
        for member in self.tasks:
            member._run_raw(*args, **kwargs)

    def run_async_join(self, cmd, *args, **kwargs):
        """Run ``cmd`` on all tasks in parallel threads, then wait for all."""
        self.async_join(lambda task: task.run(cmd, *args, **kwargs))

    def upload(self, *args, **kwargs):
        """Forward ``upload`` to each task of the job, one after another."""
        for member in self.tasks:
            member.upload(*args, **kwargs)

    def upload_async(self, *args, **kwargs):
        """Forward ``upload`` to all tasks in parallel threads, then wait."""
        self.async_join(lambda task: task.upload(*args, **kwargs))

    def async_join(self, task_fn):
        """Call ``task_fn(task)`` for every task, each in its own thread.

        Blocks until all threads have finished. Exceptions (subclasses of
        Exception) raised inside the threads are collected; if there are
        any, the first one collected is re-raised in the calling thread.
        """
        failures = []

        def guarded(task):
            # Record the error instead of letting it die with the thread,
            # so that it can be propagated to the caller afterwards.
            try:
                task_fn(task)
            except Exception as err:
                failures.append(err)

        workers = []
        for i, task in enumerate(self.tasks):
            workers.append(
                threading.Thread(name=f't_{i}', target=guarded, args=[task]))
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
        if failures:
            raise failures[0]

    # todo: rename to initialize
    def wait_until_ready(self):
        """Wait until all tasks in the job are available and initialized."""
        # Tasks are waited on sequentially, in list order.
        for member in self.tasks:
            member.wait_until_ready()
        # todo: initialization should start async in constructor instead of here

    # The properties below redirect to the first task.
    @property
    def ip(self):
        first = self.tasks[0]
        return first.ip

    @property
    def public_ip(self):
        first = self.tasks[0]
        return first.public_ip

    @property
    def port(self):
        first = self.tasks[0]
        return first.port

    @property
    def public_port(self):
        first = self.tasks[0]
        return first.public_port

    @property
    def connect_instructions(self):
        first = self.tasks[0]
        return first.connect_instructions

    @property
    def logdir(self):
        # ``_run`` is not assigned anywhere in this class; it has to be
        # set elsewhere before this property is read.
        owner = self._run
        return owner.logdir

    @property
    def instance(self):
        first = self.tasks[0]
        return first.instance
class Task:
    """Interface for a single task of a job.

    Most methods are stubs that raise NotImplementedError. Several
    implemented members read attributes that are not assigned in this
    class (``self.job``, ``self.id``, ``self._port``), so those must be
    set elsewhere before use.
    """

    def run(self, cmd, sync, ignore_errors):
        """Runs command on given task."""
        raise NotImplementedError()

    def _run_raw(self, cmd, sync, ignore_errors):
        """Runs command directly on the task, skipping tmux interface. Use if
        want to create/manage additional tmux sessions manually."""
        raise NotImplementedError()

    def run_async(self, cmd, *args, **kwargs):
        """Calls ``run`` with ``sync=False`` added to the given arguments."""
        self.run(cmd, *args, sync=False, **kwargs)

    def _upload_handler(self, line):
        """Handle following types of commands.

        Individual files, ie
           %upload file.txt
        Glob expressions, ie
           %upload *.py

        A leading ``~`` or ``~user`` in the file name is expanded to the
        corresponding home directory. Every path matched by the glob is
        passed to ``self.upload``; a pattern that matches nothing uploads
        nothing.

        Raises:
          AssertionError: if ``line`` is not exactly ``%upload <pattern>``.
        """
        toks = line.split()
        assert len(toks) == 2, "expected '%%upload <pattern>', got %r" % (line,)
        assert toks[0] == '%upload', "not an %%upload command: %r" % (line,)
        # expanduser only touches a leading "~"/"~user", so tildes elsewhere
        # in the name (e.g. "backup~1.py") are left alone, and it does not
        # fail when the HOME environment variable is missing.
        pattern = os.path.expanduser(toks[1])
        for fn in glob.glob(pattern):
            self.upload(fn)

    def upload(self, local_fn, remote_fn=None, skip_existing=False):
        """Uploads given file to the task. If remote_fn is not specified, dumps it
        into task current directory with the same name."""
        raise NotImplementedError()

    def download(self, remote_fn, local_fn=None):
        """Downloads remote file to current directory."""
        raise NotImplementedError()

    @property
    def ip(self):
        raise NotImplementedError()

    @property
    def public_ip(self):
        """Helper method to provide a publicly facing ip for given task when
        tasks run on a different network than user (ie, AWS internal vs. user's
        laptop)"""
        raise NotImplementedError()

    @property
    def logdir(self):
        """Log directory of the job this task belongs to (``self.job.logdir``)."""
        return self.job.logdir

    @property
    def port(self):
        """This is (the main) internal port that this task will use for
        communicating with other tasks. When using TensorFlow, this would be the
        port on which TensorFlow server is listening."""
        return self._port

    @property
    def public_port(self):
        """This is a port that's used to access task from public internet.
        On AWS it tends to be fixed because it's set by underlying infrastructure
        (security group), defer implementation to backend."""
        raise NotImplementedError()

    def log(self, message, *args):
        """Log to client console.

        If extra positional ``args`` are given, ``message`` is used as a
        %-format string and interpolated with them. The printed line is
        prefixed with a timestamp, ``self.id`` and the job name.
        """
        ts = u.current_timestamp()
        if args:
            message = message % args
        print("%s %d.%s: %s" % (ts, self.id, self.job.name, message))

    def file_write(self, fn, contents):
        """Write string contents to file fn in task."""
        raise NotImplementedError()

    def file_read(self, fn):
        """Read contents of file and return it as string."""
        raise NotImplementedError()

    def file_exists(self, fn):
        """Return true if file exists in task current directory."""
        raise NotImplementedError()

    def stream_file(self, fn):
        """Streams task-local file to console (path relative to taskdir)."""
        raise NotImplementedError()

    def _ossystem(self, cmd):
        """Runs ``cmd`` through ``os.system``; the exit status is discarded."""
        # self.log(cmd)
        os.system(cmd)