aboutsummaryrefslogtreecommitdiffstats
path: root/run_exps.py
diff options
context:
space:
mode:
authorJonathan Herman <hermanjl@cs.unc.edu>2013-04-29 16:50:23 -0400
committerJonathan Herman <hermanjl@cs.unc.edu>2013-04-29 16:50:23 -0400
commit7e32c3915e7ea27d2533d99a22fa53ef923198f5 (patch)
tree5bd8d8a3ed6861e039a683f47a953d2f7a22d8b1 /run_exps.py
parent7545402506aa76261e18d85af585ff0ac1cf05c1 (diff)
Added run_exps.py option to --retry failed experiments.
If the retry flag is specified, failed experiments will be re-run after all other experiments have run. They can be re-run at most 5 times. This commit required a refactoring of run_exps.py to clean up the main experiment running loop.
Diffstat (limited to 'run_exps.py')
-rwxr-xr-xrun_exps.py246
1 files changed, 153 insertions, 93 deletions
diff --git a/run_exps.py b/run_exps.py
index a15018d..1d46b45 100755
--- a/run_exps.py
+++ b/run_exps.py
@@ -3,6 +3,7 @@ from __future__ import print_function
3 3
4import common as com 4import common as com
5import os 5import os
6import pprint
6import re 7import re
7import shutil 8import shutil
8import sys 9import sys
@@ -11,17 +12,26 @@ import run.tracer as trace
11from config.config import PARAMS,DEFAULTS 12from config.config import PARAMS,DEFAULTS
12from collections import namedtuple 13from collections import namedtuple
13from optparse import OptionParser 14from optparse import OptionParser
15from parse.enum import Enum
14from run.executable.executable import Executable 16from run.executable.executable import Executable
15from run.experiment import Experiment,ExperimentDone,SystemCorrupted 17from run.experiment import Experiment,ExperimentDone,SystemCorrupted
16from run.proc_entry import ProcEntry 18from run.proc_entry import ProcEntry
17 19
20'''Maximum times an experiment will be retried'''
21MAX_RETRY = 5
22
18'''Customizable experiment parameters''' 23'''Customizable experiment parameters'''
19ExpParams = namedtuple('ExpParams', ['scheduler', 'duration', 'tracers', 24ExpParams = namedtuple('ExpParams', ['scheduler', 'duration', 'tracers',
20 'kernel', 'config_options', 'file_params', 25 'kernel', 'config_options', 'file_params',
21 'pre_script', 'post_script']) 26 'pre_script', 'post_script'])
27'''Tracked with each experiment'''
28ExpState = Enum(['Failed', 'Succeeded', 'Invalid', 'Done', 'None'])
29ExpData = com.recordtype('ExpData', ['name', 'params', 'sched_file', 'out_dir',
30 'retries', 'state'])
22'''Comparison of requested versus actual kernel compile parameter value''' 31'''Comparison of requested versus actual kernel compile parameter value'''
23ConfigResult = namedtuple('ConfigResult', ['param', 'wanted', 'actual']) 32ConfigResult = namedtuple('ConfigResult', ['param', 'wanted', 'actual'])
24 33
34
25class InvalidKernel(Exception): 35class InvalidKernel(Exception):
26 def __init__(self, kernel): 36 def __init__(self, kernel):
27 self.kernel = kernel 37 self.kernel = kernel
@@ -72,6 +82,9 @@ def parse_args():
72 parser.add_option('-e', '--email', metavar='username@server', 82 parser.add_option('-e', '--email', metavar='username@server',
73 dest='email', default=None, 83 dest='email', default=None,
74 help='send an email when all experiments complete') 84 help='send an email when all experiments complete')
85 parser.add_option('-r', '--retry', dest='retry',
86 action='store_true', default=False,
87 help='retry failed experiments')
75 88
76 return parser.parse_args() 89 return parser.parse_args()
77 90
@@ -252,65 +265,118 @@ def make_exp_params(cmd_scheduler, cmd_duration, sched_dir, param_file):
252 config_options=copts, tracers=tracers, file_params=fparams, 265 config_options=copts, tracers=tracers, file_params=fparams,
253 pre_script=pre_script, post_script=post_script) 266 pre_script=pre_script, post_script=post_script)
254 267
255def run_experiment(name, sched_file, exp_params, out_dir, 268def run_experiment(data, start_message, ignore, jabber):
256 start_message, ignore, jabber):
257 '''Load and parse data from files and run result.''' 269 '''Load and parse data from files and run result.'''
258 if not os.path.isfile(sched_file): 270 if not os.path.isfile(data.sched_file):
259 raise IOError("Cannot find schedule file: %s" % sched_file) 271 raise IOError("Cannot find schedule file: %s" % data.sched_file)
260 272
261 dir_name, fname = os.path.split(sched_file) 273 dir_name, fname = os.path.split(data.sched_file)
262 work_dir = "%s/tmp" % dir_name 274 work_dir = "%s/tmp" % dir_name
263 275
264 procs, execs = load_schedule(name, sched_file, exp_params.duration) 276 procs, execs = load_schedule(data.name, data.sched_file, data.params.duration)
265 277
266 exp = Experiment(name, exp_params.scheduler, work_dir, out_dir, 278 exp = Experiment(data.name, data.params.scheduler, work_dir,
267 procs, execs, exp_params.tracers) 279 data.out_dir, procs, execs, data.params.tracers)
268 280
269 exp.log(start_message) 281 exp.log(start_message)
270 282
271 if not ignore: 283 if not ignore:
272 verify_environment(exp_params) 284 verify_environment(data.params)
273 285
274 run_script(exp_params.pre_script, exp, dir_name, work_dir) 286 run_script(data.params.pre_script, exp, dir_name, work_dir)
275 287
276 exp.run_exp() 288 exp.run_exp()
277 289
278 run_script(exp_params.post_script, exp, dir_name, out_dir) 290 run_script(data.params.post_script, exp, dir_name, data.out_dir)
279 291
280 if jabber: 292 if jabber:
281 jabber.send("Completed '%s'" % name) 293 jabber.send("Completed '%s'" % data.name)
282 294
283 # Save parameters used to run experiment in out_dir 295 # Save parameters used to run experiment in out_dir
284 out_params = dict(exp_params.file_params.items() + 296 out_params = dict([(PARAMS['sched'], data.params.scheduler),
285 [(PARAMS['sched'], exp_params.scheduler),
286 (PARAMS['tasks'], len(execs)), 297 (PARAMS['tasks'], len(execs)),
287 (PARAMS['dur'], exp_params.duration)]) 298 (PARAMS['dur'], data.params.duration)] +
299 data.params.file_params.items())
288 300
289 # Feather-trace clock frequency saved for accurate overhead parsing 301 # Feather-trace clock frequency saved for accurate overhead parsing
290 ft_freq = com.ft_freq() 302 ft_freq = com.ft_freq()
291 if ft_freq: 303 if ft_freq:
292 out_params[PARAMS['cycles']] = ft_freq 304 out_params[PARAMS['cycles']] = ft_freq
293 305
294 with open("%s/%s" % (out_dir, DEFAULTS['params_file']), 'w') as f: 306 out_param_f = "%s/%s" % (data.out_dir, DEFAULTS['params_file'])
295 f.write(str(out_params)) 307 with open(out_param_f, 'w') as f:
308 pprint.pprint(out_params, f)
296 309
297 310
298def get_exps(opts, args): 311def make_paths(exp, opts, out_base_dir):
299 '''Return list of experiment files or directories''' 312 '''Translate experiment name to (schedule file, output directory) paths'''
300 if args: 313 path = "%s/%s" % (os.getcwd(), exp)
301 return args 314 out_dir = "%s/%s" % (out_base_dir, os.path.split(exp.strip('/'))[1])
315
316 if not os.path.exists(path):
317 raise IOError("Invalid experiment: %s" % path)
318
319 if opts.force and os.path.exists(out_dir):
320 shutil.rmtree(out_dir)
302 321
303 # Default to sched_file > generated dirs 322 if os.path.isdir(path):
304 if os.path.exists(opts.sched_file): 323 sched_file = "%s/%s" % (path, opts.sched_file)
305 sys.stderr.write("Reading schedule from %s.\n" % opts.sched_file)
306 return [opts.sched_file]
307 elif os.path.exists(DEFAULTS['out-gen']):
308 sys.stderr.write("Reading schedules from %s/*.\n" % DEFAULTS['out-gen'])
309 sched_dirs = os.listdir(DEFAULTS['out-gen'])
310 return ['%s/%s' % (DEFAULTS['out-gen'], d) for d in sched_dirs]
311 else: 324 else:
312 sys.stderr.write("Run with -h to view options.\n"); 325 sched_file = path
313 sys.exit(1) 326
327 return sched_file, out_dir
328
329
def get_common_header(args):
    '''Return the leading string shared by every path in args.

    The comparison is character-wise, not per path component, so
    ['exps/sched1', 'exps/sched2'] yields 'exps/sched'.

    args -- sequence of path strings

    Returns '' when args holds fewer than two paths: a lone path has
    nothing to be compared against, and callers strip the returned prefix
    to build a name, so the path must be left whole.
    '''
    if len(args) < 2:
        return ""

    # commonprefix stops at the shortest path, so a path that is itself a
    # prefix of all the others (or an exact duplicate) is handled instead
    # of indexing past its end.
    return os.path.commonprefix(args)
345
346
def get_exps(opts, args, out_base_dir):
    '''Return a list of ExpData records, one per experiment to run.

    opts         -- parsed command-line options; sched_file, scheduler,
                    duration and param_file are read here, and the whole
                    object is handed on to make_paths
    args         -- experiment paths from the command line, may be empty
    out_base_dir -- directory under which each experiment's output goes

    With no args, opts.sched_file is used if it exists; otherwise every
    entry of DEFAULTS['out-gen'] is used if that exists; otherwise a hint
    is written to stderr and the program exits with status 1.

    Each record starts with zero retries and the 'None' state. Exceptions
    from make_paths and make_exp_params are not caught here.
    '''
    if not args:
        if os.path.exists(opts.sched_file):
            # Default to sched_file in current directory
            sys.stderr.write("Reading schedule from %s.\n" % opts.sched_file)
            args = [opts.sched_file]
        elif os.path.exists(DEFAULTS['out-gen']):
            # Then try experiments created by gen_exps
            sys.stderr.write("Reading schedules from %s/*.\n" %
                             DEFAULTS['out-gen'])
            sched_dirs = os.listdir(DEFAULTS['out-gen'])
            args = ['%s/%s' % (DEFAULTS['out-gen'], d) for d in sched_dirs]
        else:
            sys.stderr.write("Run with -h to view options.\n")
            sys.exit(1)

    # Part of arg paths which is identical for each arg
    common = get_common_header(args)

    # 'None' is a reserved word on Python 3, so the enum member is fetched
    # by name; this is the same object as ExpState.None on Python 2.
    initial_state = getattr(ExpState, 'None')

    exps = []
    for path in args:
        sched_file, out_dir = make_paths(path, opts, out_base_dir)

        # Stripping the shared prefix leaves nothing when a path equals
        # that prefix; fall back to the full path so no experiment ends
        # up with an empty name in messages.
        name = path[len(common):] or path

        sched_dir = os.path.split(sched_file)[0]
        exp_params = make_exp_params(opts.scheduler, opts.duration,
                                     sched_dir, opts.param_file)

        exps.append(ExpData(name, exp_params, sched_file, out_dir,
                            0, initial_state))

    return exps
314 380
315 381
316def setup_jabber(target): 382def setup_jabber(target):
@@ -338,32 +404,53 @@ def setup_email(target):
338 return None 404 return None
339 405
340 406
341def make_paths(exp, out_base_dir, opts): 407def run_exps(exps, opts):
342 '''Translate experiment name to (schedule file, output directory) paths''' 408 jabber = setup_jabber(opts.jabber) if opts.jabber else None
343 path = "%s/%s" % (os.getcwd(), exp)
344 out_dir = "%s/%s" % (out_base_dir, os.path.split(exp.strip('/'))[1])
345
346 if not os.path.exists(path):
347 raise IOError("Invalid experiment: %s" % path)
348 409
349 if opts.force and os.path.exists(out_dir): 410 exps_remaining = list(enumerate(exps))
350 shutil.rmtree(out_dir) 411 while exps_remaining:
412 i, exp = exps_remaining.pop(0)
351 413
352 if os.path.isdir(path): 414 verb = "Loading" if exp.state == ExpState.None else "Re-running failed"
353 sched_file = "%s/%s" % (path, opts.sched_file) 415 start_message = "%s experiment %d of %d." % (verb, i+1, len(exps))
354 else:
355 sched_file = path
356 416
357 return sched_file, out_dir 417 try:
418 run_experiment(exp, start_message, opts.ignore, jabber)
419 exp.state = ExpState.Succeeded
420 except KeyboardInterrupt:
421 sys.stderr.write("Keyboard interrupt, quitting\n")
422 break
423 except ExperimentDone:
424 sys.stderr.write("Experiment already completed at '%s'\n" % exp.out_dir)
425 exp.state = ExpState.Done
426 except (InvalidKernel, InvalidConfig) as e:
427 sys.stderr.write("Invalid environment for experiment '%s'\n" % exp.name)
428 sys.stderr.write("%s\n" % e)
429 exp.state = ExpState.Invalid
430 except SystemCorrupted as e:
431 sys.stderr.write("System is corrupted! Fix state before continuing.\n")
432 sys.stderr.write("%s\n" % e)
433 exp.state = ExpState.Failed
434 if not opts.retry:
435 break
436 else:
437 sys.stderr.write("Remaining experiments may fail\n")
438 except Exception as e:
439 sys.stderr.write("Failed experiment %s\n" % exp.name)
440 sys.stderr.write("%s\n" % e)
441 exp.state = ExpState.Failed
358 442
443 if exp.state is ExpState.Failed and opts.retry:
444 if exp.retries < MAX_RETRY:
445 exps_remaining += [(i, exp)]
446 exp.retries += 1
447 else:
448 sys.stderr.write("Hit maximum retries of %d\n" % MAX_RETRY)
359 449
360def main(): 450def main():
361 opts, args = parse_args() 451 opts, args = parse_args()
362 452
363 exps = get_exps(opts, args) 453 email = setup_email(opts.email) if opts.email else None
364
365 jabber = setup_jabber(opts.jabber) if opts.jabber else None
366 email = setup_email(opts.email) if opts.email else None
367 454
368 out_base = os.path.abspath(opts.out_dir) 455 out_base = os.path.abspath(opts.out_dir)
369 created = False 456 created = False
@@ -371,62 +458,35 @@ def main():
371 created = True 458 created = True
372 os.mkdir(out_base) 459 os.mkdir(out_base)
373 460
374 ran = done = succ = failed = invalid = 0 461 exps = get_exps(opts, args, out_base)
375 462
376 for i, exp in enumerate(exps): 463 run_exps(exps, opts)
377 sched_file, out_dir = make_paths(exp, out_base, opts)
378 sched_dir = os.path.split(sched_file)[0]
379 464
380 try: 465 def state_count(state):
381 start_message = "Loading experiment %d of %d." % (i+1, len(exps)) 466 return len(filter(lambda x: x.state is state, exps))
382 exp_params = make_exp_params(opts.scheduler, opts.duration,
383 sched_dir, opts.param_file)
384 467
385 run_experiment(exp, sched_file, exp_params, out_dir, 468 ran = len(filter(lambda x: x.state is not ExpState.None, exps))
386 start_message, opts.ignore, jabber) 469 succ = state_count(ExpState.Succeeded)
387
388 succ += 1
389 except ExperimentDone:
390 sys.stderr.write("Experiment '%s' already completed " % exp +
391 "at '%s'\n" % out_base)
392 done += 1
393 except (InvalidKernel, InvalidConfig) as e:
394 sys.stderr.write("Invalid environment for experiment '%s'\n" % exp)
395 sys.stderr.write("%s\n" % e)
396 invalid += 1
397 except KeyboardInterrupt:
398 sys.stderr.write("Keyboard interrupt, quitting\n")
399 break
400 except SystemCorrupted as e:
401 sys.stderr.write("System is corrupted! Fix state before continuing.\n")
402 sys.stderr.write("%s\n" % e)
403 break
404 except Exception as e:
405 sys.stderr.write("Failed experiment %s\n" % exp)
406 sys.stderr.write("%s\n" % e)
407 failed += 1
408
409 ran += 1
410
411 # Clean out directory if it failed immediately
412 if not os.listdir(out_base) and created and not succ:
413 os.rmdir(out_base)
414 470
415 message = "Experiments ran:\t%d of %d" % (ran, len(exps)) +\ 471 message = "Experiments ran:\t%d of %d" % (ran, len(exps)) +\
416 "\n Successful:\t\t%d" % succ +\ 472 "\n Successful:\t\t%d" % succ +\
417 "\n Failed:\t\t%d" % failed +\ 473 "\n Failed:\t\t%d" % state_count(ExpState.Failed) +\
418 "\n Already Done:\t\t%d" % done +\ 474 "\n Already Done:\t\t%d" % state_count(ExpState.Done) +\
419 "\n Invalid Environment:\t%d" % invalid 475 "\n Invalid Environment:\t%d" % state_count(ExpState.Invalid)
420 476
421 print(message) 477 print(message)
422 478
479 if email:
480 email.send(message)
481 email.close()
482
423 if succ: 483 if succ:
424 sys.stderr.write("Successful experiment data saved in %s.\n" % 484 sys.stderr.write("Successful experiment data saved in %s.\n" %
425 opts.out_dir) 485 opts.out_dir)
486 elif not os.listdir(out_base) and created:
487 # Remove directory if no data was put into it
488 os.rmdir(out_base)
426 489
427 if email:
428 email.send(message)
429 email.close()
430 490
431if __name__ == '__main__': 491if __name__ == '__main__':
432 main() 492 main()