about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Jonathan Herman <hermanjl@cs.unc.edu> 2013-04-29 16:50:23 -0400
committer Jonathan Herman <hermanjl@cs.unc.edu> 2013-04-29 16:50:23 -0400
commit 7e32c3915e7ea27d2533d99a22fa53ef923198f5 (patch)
tree 5bd8d8a3ed6861e039a683f47a953d2f7a22d8b1
parent 7545402506aa76261e18d85af585ff0ac1cf05c1 (diff)
Added run_exps.py option to --retry failed experiments.
If the retry flag is specified, failed experiments will be re-run after all other experiments have run. They can be re-run at most 5 times. This commit required a refactoring of run_exps.py to clean up the main experiment running loop.
-rw-r--r-- gen/edf_generators.py 1
-rwxr-xr-x gen_exps.py 15
-rw-r--r-- run/experiment.py 50
-rwxr-xr-x run_exps.py 246
4 files changed, 194 insertions, 118 deletions
diff --git a/gen/edf_generators.py b/gen/edf_generators.py
index a722c21..8e4b8df 100644
--- a/gen/edf_generators.py
+++ b/gen/edf_generators.py
@@ -28,6 +28,7 @@ class EdfGenerator(gen.Generator):
28 pdist = self._create_dist('period', 28 pdist = self._create_dist('period',
29 exp_params['periods'], 29 exp_params['periods'],
30 gen.NAMED_PERIODS) 30 gen.NAMED_PERIODS)
31
31 udist = self._create_dist('utilization', 32 udist = self._create_dist('utilization',
32 exp_params['utils'], 33 exp_params['utils'],
33 gen.NAMED_UTILIZATIONS) 34 gen.NAMED_UTILIZATIONS)
diff --git a/gen_exps.py b/gen_exps.py
index b847661..00ce27b 100755
--- a/gen_exps.py
+++ b/gen_exps.py
@@ -43,6 +43,14 @@ def load_file(fname):
43 except: 43 except:
44 raise IOError("Invalid generation file: %s" % fname) 44 raise IOError("Invalid generation file: %s" % fname)
45 45
46def print_descriptions(described):
47 for generator in described.split(','):
48 if generator not in gen.get_generators():
49 sys.stderr.write("No generator '%s'\n" % generator)
50 else:
51 print("Generator '%s', " % generator)
52 gen.get_generators()[generator]().print_help()
53
46def main(): 54def main():
47 opts, args = parse_args() 55 opts, args = parse_args()
48 56
@@ -50,12 +58,7 @@ def main():
50 if opts.list_gens: 58 if opts.list_gens:
51 print(", ".join(gen.get_generators())) 59 print(", ".join(gen.get_generators()))
52 if opts.described != None: 60 if opts.described != None:
53 for generator in opts.described.split(','): 61 print_descriptions(opts.described)
54 if generator not in gen.get_generators():
55 sys.stderr.write("No generator '%s'\n" % generator)
56 else:
57 print("Generator '%s', " % generator)
58 gen.get_generators()[generator]().print_help()
59 if opts.list_gens or opts.described: 62 if opts.list_gens or opts.described:
60 return 0 63 return 0
61 64
diff --git a/run/experiment.py b/run/experiment.py
index b0e46b6..9a70414 100644
--- a/run/experiment.py
+++ b/run/experiment.py
@@ -2,6 +2,7 @@ import os
2import time 2import time
3import run.litmus_util as lu 3import run.litmus_util as lu
4import shutil as sh 4import shutil as sh
5
5from operator import methodcaller 6from operator import methodcaller
6 7
7class ExperimentException(Exception): 8class ExperimentException(Exception):
@@ -69,21 +70,24 @@ class Experiment(object):
69 executable.cwd = self.working_dir 70 executable.cwd = self.working_dir
70 map(assign_cwd, self.executables) 71 map(assign_cwd, self.executables)
71 72
72 def __kill_all(self): 73 def __try_kill_all(self):
73 if lu.waiting_tasks(): 74 try:
74 released = lu.release_tasks() 75 if lu.waiting_tasks():
75 self.log("Re-released %d tasks" % released) 76 released = lu.release_tasks()
77 self.log("Re-released %d tasks" % released)
76 78
77 time.sleep(1) 79 time.sleep(1)
78 80
79 self.log("Killing all tasks") 81 self.log("Killing all tasks")
80 for e in self.executables: 82 for e in self.executables:
81 try: 83 try:
82 e.kill() 84 e.kill()
83 except: 85 except:
84 pass 86 pass
85 87
86 time.sleep(1) 88 time.sleep(1)
89 except:
90 self.log("Failed to kill all tasks.")
87 91
88 def __strip_path(self, path): 92 def __strip_path(self, path):
89 '''Shorten path to something more readable.''' 93 '''Shorten path to something more readable.'''
@@ -138,7 +142,7 @@ class Experiment(object):
138 now_ready = lu.waiting_tasks() 142 now_ready = lu.waiting_tasks()
139 if now_ready != num_ready: 143 if now_ready != num_ready:
140 wait_start = time.time() 144 wait_start = time.time()
141 num_ready = lu.now_ready 145 num_ready = now_ready
142 146
143 def __run_tasks(self): 147 def __run_tasks(self):
144 self.log("Starting %d tasks" % len(self.executables)) 148 self.log("Starting %d tasks" % len(self.executables))
@@ -185,6 +189,7 @@ class Experiment(object):
185 189
186 sched = lu.scheduler() 190 sched = lu.scheduler()
187 if sched != "Linux": 191 if sched != "Linux":
192 self.log("Switching back to Linux scheduler")
188 try: 193 try:
189 lu.switch_scheduler("Linux") 194 lu.switch_scheduler("Linux")
190 except: 195 except:
@@ -229,6 +234,8 @@ class Experiment(object):
229 self.log("Stopping regular tracers") 234 self.log("Stopping regular tracers")
230 map(methodcaller('stop_tracing'), self.regular_tracers) 235 map(methodcaller('stop_tracing'), self.regular_tracers)
231 236
237 os.system('sync')
238
232 def log(self, msg): 239 def log(self, msg):
233 print("[Exp %s]: %s" % (self.name, msg)) 240 print("[Exp %s]: %s" % (self.name, msg))
234 241
@@ -236,6 +243,7 @@ class Experiment(object):
236 self.__to_linux() 243 self.__to_linux()
237 244
238 succ = False 245 succ = False
246 exception = None
239 try: 247 try:
240 self.__setup() 248 self.__setup()
241 249
@@ -244,16 +252,20 @@ class Experiment(object):
244 self.log("Saving results in %s" % self.finished_dir) 252 self.log("Saving results in %s" % self.finished_dir)
245 succ = True 253 succ = True
246 except Exception as e: 254 except Exception as e:
255 exception = e
256
247 # Give time for whatever failed to finish failing 257 # Give time for whatever failed to finish failing
248 time.sleep(2) 258 time.sleep(2)
249 self.__kill_all()
250 259
251 raise e 260 self.__try_kill_all()
252 finally:
253 self.__teardown()
254 finally: 261 finally:
255 self.log("Switching back to Linux scheduler") 262 try:
256 self.__to_linux() 263 self.__teardown()
264 self.__to_linux()
265 except Exception as e:
266 exception = exception or e
267 finally:
268 if exception: raise exception
257 269
258 if succ: 270 if succ:
259 self.__save_results() 271 self.__save_results()
diff --git a/run_exps.py b/run_exps.py
index a15018d..1d46b45 100755
--- a/run_exps.py
+++ b/run_exps.py
@@ -3,6 +3,7 @@ from __future__ import print_function
3 3
4import common as com 4import common as com
5import os 5import os
6import pprint
6import re 7import re
7import shutil 8import shutil
8import sys 9import sys
@@ -11,17 +12,26 @@ import run.tracer as trace
11from config.config import PARAMS,DEFAULTS 12from config.config import PARAMS,DEFAULTS
12from collections import namedtuple 13from collections import namedtuple
13from optparse import OptionParser 14from optparse import OptionParser
15from parse.enum import Enum
14from run.executable.executable import Executable 16from run.executable.executable import Executable
15from run.experiment import Experiment,ExperimentDone,SystemCorrupted 17from run.experiment import Experiment,ExperimentDone,SystemCorrupted
16from run.proc_entry import ProcEntry 18from run.proc_entry import ProcEntry
17 19
20'''Maximum times an experiment will be retried'''
21MAX_RETRY = 5
22
18'''Customizable experiment parameters''' 23'''Customizable experiment parameters'''
19ExpParams = namedtuple('ExpParams', ['scheduler', 'duration', 'tracers', 24ExpParams = namedtuple('ExpParams', ['scheduler', 'duration', 'tracers',
20 'kernel', 'config_options', 'file_params', 25 'kernel', 'config_options', 'file_params',
21 'pre_script', 'post_script']) 26 'pre_script', 'post_script'])
27'''Tracked with each experiment'''
28ExpState = Enum(['Failed', 'Succeeded', 'Invalid', 'Done', 'None'])
29ExpData = com.recordtype('ExpData', ['name', 'params', 'sched_file', 'out_dir',
30 'retries', 'state'])
22'''Comparison of requested versus actual kernel compile parameter value''' 31'''Comparison of requested versus actual kernel compile parameter value'''
23ConfigResult = namedtuple('ConfigResult', ['param', 'wanted', 'actual']) 32ConfigResult = namedtuple('ConfigResult', ['param', 'wanted', 'actual'])
24 33
34
25class InvalidKernel(Exception): 35class InvalidKernel(Exception):
26 def __init__(self, kernel): 36 def __init__(self, kernel):
27 self.kernel = kernel 37 self.kernel = kernel
@@ -72,6 +82,9 @@ def parse_args():
72 parser.add_option('-e', '--email', metavar='username@server', 82 parser.add_option('-e', '--email', metavar='username@server',
73 dest='email', default=None, 83 dest='email', default=None,
74 help='send an email when all experiments complete') 84 help='send an email when all experiments complete')
85 parser.add_option('-r', '--retry', dest='retry',
86 action='store_true', default=False,
87 help='retry failed experiments')
75 88
76 return parser.parse_args() 89 return parser.parse_args()
77 90
@@ -252,65 +265,118 @@ def make_exp_params(cmd_scheduler, cmd_duration, sched_dir, param_file):
252 config_options=copts, tracers=tracers, file_params=fparams, 265 config_options=copts, tracers=tracers, file_params=fparams,
253 pre_script=pre_script, post_script=post_script) 266 pre_script=pre_script, post_script=post_script)
254 267
255def run_experiment(name, sched_file, exp_params, out_dir, 268def run_experiment(data, start_message, ignore, jabber):
256 start_message, ignore, jabber):
257 '''Load and parse data from files and run result.''' 269 '''Load and parse data from files and run result.'''
258 if not os.path.isfile(sched_file): 270 if not os.path.isfile(data.sched_file):
259 raise IOError("Cannot find schedule file: %s" % sched_file) 271 raise IOError("Cannot find schedule file: %s" % data.sched_file)
260 272
261 dir_name, fname = os.path.split(sched_file) 273 dir_name, fname = os.path.split(data.sched_file)
262 work_dir = "%s/tmp" % dir_name 274 work_dir = "%s/tmp" % dir_name
263 275
264 procs, execs = load_schedule(name, sched_file, exp_params.duration) 276 procs, execs = load_schedule(data.name, data.sched_file, data.params.duration)
265 277
266 exp = Experiment(name, exp_params.scheduler, work_dir, out_dir, 278 exp = Experiment(data.name, data.params.scheduler, work_dir,
267 procs, execs, exp_params.tracers) 279 data.out_dir, procs, execs, data.params.tracers)
268 280
269 exp.log(start_message) 281 exp.log(start_message)
270 282
271 if not ignore: 283 if not ignore:
272 verify_environment(exp_params) 284 verify_environment(data.params)
273 285
274 run_script(exp_params.pre_script, exp, dir_name, work_dir) 286 run_script(data.params.pre_script, exp, dir_name, work_dir)
275 287
276 exp.run_exp() 288 exp.run_exp()
277 289
278 run_script(exp_params.post_script, exp, dir_name, out_dir) 290 run_script(data.params.post_script, exp, dir_name, data.out_dir)
279 291
280 if jabber: 292 if jabber:
281 jabber.send("Completed '%s'" % name) 293 jabber.send("Completed '%s'" % data.name)
282 294
283 # Save parameters used to run experiment in out_dir 295 # Save parameters used to run dataeriment in out_dir
284 out_params = dict(exp_params.file_params.items() + 296 out_params = dict([(PARAMS['sched'], data.params.scheduler),
285 [(PARAMS['sched'], exp_params.scheduler),
286 (PARAMS['tasks'], len(execs)), 297 (PARAMS['tasks'], len(execs)),
287 (PARAMS['dur'], exp_params.duration)]) 298 (PARAMS['dur'], data.params.duration)] +
299 data.params.file_params.items())
288 300
289 # Feather-trace clock frequency saved for accurate overhead parsing 301 # Feather-trace clock frequency saved for accurate overhead parsing
290 ft_freq = com.ft_freq() 302 ft_freq = com.ft_freq()
291 if ft_freq: 303 if ft_freq:
292 out_params[PARAMS['cycles']] = ft_freq 304 out_params[PARAMS['cycles']] = ft_freq
293 305
294 with open("%s/%s" % (out_dir, DEFAULTS['params_file']), 'w') as f: 306 out_param_f = "%s/%s" % (data.out_dir, DEFAULTS['params_file'])
295 f.write(str(out_params)) 307 with open(out_param_f, 'w') as f:
308 pprint.pprint(out_params, f)
296 309
297 310
298def get_exps(opts, args): 311def make_paths(exp, opts, out_base_dir):
299 '''Return list of experiment files or directories''' 312 '''Translate experiment name to (schedule file, output directory) paths'''
300 if args: 313 path = "%s/%s" % (os.getcwd(), exp)
301 return args 314 out_dir = "%s/%s" % (out_base_dir, os.path.split(exp.strip('/'))[1])
315
316 if not os.path.exists(path):
317 raise IOError("Invalid experiment: %s" % path)
318
319 if opts.force and os.path.exists(out_dir):
320 shutil.rmtree(out_dir)
302 321
303 # Default to sched_file > generated dirs 322 if os.path.isdir(path):
304 if os.path.exists(opts.sched_file): 323 sched_file = "%s/%s" % (path, opts.sched_file)
305 sys.stderr.write("Reading schedule from %s.\n" % opts.sched_file)
306 return [opts.sched_file]
307 elif os.path.exists(DEFAULTS['out-gen']):
308 sys.stderr.write("Reading schedules from %s/*.\n" % DEFAULTS['out-gen'])
309 sched_dirs = os.listdir(DEFAULTS['out-gen'])
310 return ['%s/%s' % (DEFAULTS['out-gen'], d) for d in sched_dirs]
311 else: 324 else:
312 sys.stderr.write("Run with -h to view options.\n"); 325 sched_file = path
313 sys.exit(1) 326
327 return sched_file, out_dir
328
329
330def get_common_header(args):
331 common = ""
332 done = False
333
334 if len(args) == 1:
335 return common
336
337 while not done:
338 common += args[0][len(common)]
339 for path in args:
340 if path.find(common, 0, len(common)):
341 done = True
342 break
343
344 return common[:len(common)-1]
345
346
347def get_exps(opts, args, out_base_dir):
348 '''Return list of ExpDatas'''
349
350 if not args:
351 if os.path.exists(opts.sched_file):
352 # Default to sched_file in current directory
353 sys.stderr.write("Reading schedule from %s.\n" % opts.sched_file)
354 args = [opts.sched_file]
355 elif os.path.exists(DEFAULTS['out-gen']):
356 # Then try experiments created by gen_exps
357 sys.stderr.write("Reading schedules from %s/*.\n" % DEFAULTS['out-gen'])
358 sched_dirs = os.listdir(DEFAULTS['out-gen'])
359 args = ['%s/%s' % (DEFAULTS['out-gen'], d) for d in sched_dirs]
360 else:
361 sys.stderr.write("Run with -h to view options.\n");
362 sys.exit(1)
363
364 # Part of arg paths which is identical for each arg
365 common = get_common_header(args)
366
367 exps = []
368 for path in args:
369 sched_file, out_dir = make_paths(path, opts, out_base_dir)
370 name = path[len(common):]
371
372 sched_dir = os.path.split(sched_file)[0]
373 exp_params = make_exp_params(opts.scheduler, opts.duration,
374 sched_dir, opts.param_file)
375
376 exps += [ExpData(name, exp_params, sched_file, out_dir,
377 0, ExpState.None)]
378
379 return exps
314 380
315 381
316def setup_jabber(target): 382def setup_jabber(target):
@@ -338,32 +404,53 @@ def setup_email(target):
338 return None 404 return None
339 405
340 406
341def make_paths(exp, out_base_dir, opts): 407def run_exps(exps, opts):
342 '''Translate experiment name to (schedule file, output directory) paths''' 408 jabber = setup_jabber(opts.jabber) if opts.jabber else None
343 path = "%s/%s" % (os.getcwd(), exp)
344 out_dir = "%s/%s" % (out_base_dir, os.path.split(exp.strip('/'))[1])
345
346 if not os.path.exists(path):
347 raise IOError("Invalid experiment: %s" % path)
348 409
349 if opts.force and os.path.exists(out_dir): 410 exps_remaining = list(enumerate(exps))
350 shutil.rmtree(out_dir) 411 while exps_remaining:
412 i, exp = exps_remaining.pop(0)
351 413
352 if os.path.isdir(path): 414 verb = "Loading" if exp.state == ExpState.None else "Re-running failed"
353 sched_file = "%s/%s" % (path, opts.sched_file) 415 start_message = "%s experiment %d of %d." % (verb, i+1, len(exps))
354 else:
355 sched_file = path
356 416
357 return sched_file, out_dir 417 try:
418 run_experiment(exp, start_message, opts.ignore, jabber)
419 exp.state = ExpState.Succeeded
420 except KeyboardInterrupt:
421 sys.stderr.write("Keyboard interrupt, quitting\n")
422 break
423 except ExperimentDone:
424 sys.stderr.write("Experiment already completed at '%s'\n" % exp.out_dir)
425 exp.state = ExpState.Done
426 except (InvalidKernel, InvalidConfig) as e:
427 sys.stderr.write("Invalid environment for experiment '%s'\n" % exp.name)
428 sys.stderr.write("%s\n" % e)
429 exp.state = ExpState.Invalid
430 except SystemCorrupted as e:
431 sys.stderr.write("System is corrupted! Fix state before continuing.\n")
432 sys.stderr.write("%s\n" % e)
433 exp.state = ExpState.Failed
434 if not opts.retry:
435 break
436 else:
437 sys.stderr.write("Remaining experiments may fail\n")
438 except Exception as e:
439 sys.stderr.write("Failed experiment %s\n" % exp.name)
440 sys.stderr.write("%s\n" % e)
441 exp.state = ExpState.Failed
358 442
443 if exp.state is ExpState.Failed and opts.retry:
444 if exp.retries < MAX_RETRY:
445 exps_remaining += [(i, exp)]
446 exp.retries += 1
447 else:
448 sys.stderr.write("Hit maximum retries of %d\n" % MAX_RETRY)
359 449
360def main(): 450def main():
361 opts, args = parse_args() 451 opts, args = parse_args()
362 452
363 exps = get_exps(opts, args) 453 email = setup_email(opts.email) if opts.email else None
364
365 jabber = setup_jabber(opts.jabber) if opts.jabber else None
366 email = setup_email(opts.email) if opts.email else None
367 454
368 out_base = os.path.abspath(opts.out_dir) 455 out_base = os.path.abspath(opts.out_dir)
369 created = False 456 created = False
@@ -371,62 +458,35 @@ def main():
371 created = True 458 created = True
372 os.mkdir(out_base) 459 os.mkdir(out_base)
373 460
374 ran = done = succ = failed = invalid = 0 461 exps = get_exps(opts, args, out_base)
375 462
376 for i, exp in enumerate(exps): 463 run_exps(exps, opts)
377 sched_file, out_dir = make_paths(exp, out_base, opts)
378 sched_dir = os.path.split(sched_file)[0]
379 464
380 try: 465 def state_count(state):
381 start_message = "Loading experiment %d of %d." % (i+1, len(exps)) 466 return len(filter(lambda x: x.state is state, exps))
382 exp_params = make_exp_params(opts.scheduler, opts.duration,
383 sched_dir, opts.param_file)
384 467
385 run_experiment(exp, sched_file, exp_params, out_dir, 468 ran = len(filter(lambda x: x.state is not ExpState.None, exps))
386 start_message, opts.ignore, jabber) 469 succ = state_count(ExpState.Succeeded)
387
388 succ += 1
389 except ExperimentDone:
390 sys.stderr.write("Experiment '%s' already completed " % exp +
391 "at '%s'\n" % out_base)
392 done += 1
393 except (InvalidKernel, InvalidConfig) as e:
394 sys.stderr.write("Invalid environment for experiment '%s'\n" % exp)
395 sys.stderr.write("%s\n" % e)
396 invalid += 1
397 except KeyboardInterrupt:
398 sys.stderr.write("Keyboard interrupt, quitting\n")
399 break
400 except SystemCorrupted as e:
401 sys.stderr.write("System is corrupted! Fix state before continuing.\n")
402 sys.stderr.write("%s\n" % e)
403 break
404 except Exception as e:
405 sys.stderr.write("Failed experiment %s\n" % exp)
406 sys.stderr.write("%s\n" % e)
407 failed += 1
408
409 ran += 1
410
411 # Clean out directory if it failed immediately
412 if not os.listdir(out_base) and created and not succ:
413 os.rmdir(out_base)
414 470
415 message = "Experiments ran:\t%d of %d" % (ran, len(exps)) +\ 471 message = "Experiments ran:\t%d of %d" % (ran, len(exps)) +\
416 "\n Successful:\t\t%d" % succ +\ 472 "\n Successful:\t\t%d" % succ +\
417 "\n Failed:\t\t%d" % failed +\ 473 "\n Failed:\t\t%d" % state_count(ExpState.Failed) +\
418 "\n Already Done:\t\t%d" % done +\ 474 "\n Already Done:\t\t%d" % state_count(ExpState.Done) +\
419 "\n Invalid Environment:\t%d" % invalid 475 "\n Invalid Environment:\t%d" % state_count(ExpState.Invalid)
420 476
421 print(message) 477 print(message)
422 478
479 if email:
480 email.send(message)
481 email.close()
482
423 if succ: 483 if succ:
424 sys.stderr.write("Successful experiment data saved in %s.\n" % 484 sys.stderr.write("Successful experiment data saved in %s.\n" %
425 opts.out_dir) 485 opts.out_dir)
486 elif not os.listdir(out_base) and created:
487 # Remove directory if no data was put into it
488 os.rmdir(out_base)
426 489
427 if email:
428 email.send(message)
429 email.close()
430 490
431if __name__ == '__main__': 491if __name__ == '__main__':
432 main() 492 main()