diff options
author | Jonathan Herman <hermanjl@cs.unc.edu> | 2013-04-29 16:50:23 -0400 |
---|---|---|
committer | Jonathan Herman <hermanjl@cs.unc.edu> | 2013-04-29 16:50:23 -0400 |
commit | 7e32c3915e7ea27d2533d99a22fa53ef923198f5 (patch) | |
tree | 5bd8d8a3ed6861e039a683f47a953d2f7a22d8b1 /run_exps.py | |
parent | 7545402506aa76261e18d85af585ff0ac1cf05c1 (diff) |
Added a --retry option to run_exps.py for re-running failed experiments.
If the retry flag is specified, failed experiments will be re-run after all
other experiments have run. They can be re-run at most 5 times.
This commit required a refactoring of run_exps.py to clean up the main
experiment running loop.
Diffstat (limited to 'run_exps.py')
-rwxr-xr-x | run_exps.py | 246 |
1 files changed, 153 insertions, 93 deletions
diff --git a/run_exps.py b/run_exps.py index a15018d..1d46b45 100755 --- a/run_exps.py +++ b/run_exps.py | |||
@@ -3,6 +3,7 @@ from __future__ import print_function | |||
3 | 3 | ||
4 | import common as com | 4 | import common as com |
5 | import os | 5 | import os |
6 | import pprint | ||
6 | import re | 7 | import re |
7 | import shutil | 8 | import shutil |
8 | import sys | 9 | import sys |
@@ -11,17 +12,26 @@ import run.tracer as trace | |||
11 | from config.config import PARAMS,DEFAULTS | 12 | from config.config import PARAMS,DEFAULTS |
12 | from collections import namedtuple | 13 | from collections import namedtuple |
13 | from optparse import OptionParser | 14 | from optparse import OptionParser |
15 | from parse.enum import Enum | ||
14 | from run.executable.executable import Executable | 16 | from run.executable.executable import Executable |
15 | from run.experiment import Experiment,ExperimentDone,SystemCorrupted | 17 | from run.experiment import Experiment,ExperimentDone,SystemCorrupted |
16 | from run.proc_entry import ProcEntry | 18 | from run.proc_entry import ProcEntry |
17 | 19 | ||
20 | '''Maximum times an experiment will be retried''' | ||
21 | MAX_RETRY = 5 | ||
22 | |||
18 | '''Customizable experiment parameters''' | 23 | '''Customizable experiment parameters''' |
19 | ExpParams = namedtuple('ExpParams', ['scheduler', 'duration', 'tracers', | 24 | ExpParams = namedtuple('ExpParams', ['scheduler', 'duration', 'tracers', |
20 | 'kernel', 'config_options', 'file_params', | 25 | 'kernel', 'config_options', 'file_params', |
21 | 'pre_script', 'post_script']) | 26 | 'pre_script', 'post_script']) |
27 | '''Tracked with each experiment''' | ||
28 | ExpState = Enum(['Failed', 'Succeeded', 'Invalid', 'Done', 'None']) | ||
29 | ExpData = com.recordtype('ExpData', ['name', 'params', 'sched_file', 'out_dir', | ||
30 | 'retries', 'state']) | ||
22 | '''Comparison of requested versus actual kernel compile parameter value''' | 31 | '''Comparison of requested versus actual kernel compile parameter value''' |
23 | ConfigResult = namedtuple('ConfigResult', ['param', 'wanted', 'actual']) | 32 | ConfigResult = namedtuple('ConfigResult', ['param', 'wanted', 'actual']) |
24 | 33 | ||
34 | |||
25 | class InvalidKernel(Exception): | 35 | class InvalidKernel(Exception): |
26 | def __init__(self, kernel): | 36 | def __init__(self, kernel): |
27 | self.kernel = kernel | 37 | self.kernel = kernel |
@@ -72,6 +82,9 @@ def parse_args(): | |||
72 | parser.add_option('-e', '--email', metavar='username@server', | 82 | parser.add_option('-e', '--email', metavar='username@server', |
73 | dest='email', default=None, | 83 | dest='email', default=None, |
74 | help='send an email when all experiments complete') | 84 | help='send an email when all experiments complete') |
85 | parser.add_option('-r', '--retry', dest='retry', | ||
86 | action='store_true', default=False, | ||
87 | help='retry failed experiments') | ||
75 | 88 | ||
76 | return parser.parse_args() | 89 | return parser.parse_args() |
77 | 90 | ||
@@ -252,65 +265,118 @@ def make_exp_params(cmd_scheduler, cmd_duration, sched_dir, param_file): | |||
252 | config_options=copts, tracers=tracers, file_params=fparams, | 265 | config_options=copts, tracers=tracers, file_params=fparams, |
253 | pre_script=pre_script, post_script=post_script) | 266 | pre_script=pre_script, post_script=post_script) |
254 | 267 | ||
255 | def run_experiment(name, sched_file, exp_params, out_dir, | 268 | def run_experiment(data, start_message, ignore, jabber): |
256 | start_message, ignore, jabber): | ||
257 | '''Load and parse data from files and run result.''' | 269 | '''Load and parse data from files and run result.''' |
258 | if not os.path.isfile(sched_file): | 270 | if not os.path.isfile(data.sched_file): |
259 | raise IOError("Cannot find schedule file: %s" % sched_file) | 271 | raise IOError("Cannot find schedule file: %s" % data.sched_file) |
260 | 272 | ||
261 | dir_name, fname = os.path.split(sched_file) | 273 | dir_name, fname = os.path.split(data.sched_file) |
262 | work_dir = "%s/tmp" % dir_name | 274 | work_dir = "%s/tmp" % dir_name |
263 | 275 | ||
264 | procs, execs = load_schedule(name, sched_file, exp_params.duration) | 276 | procs, execs = load_schedule(data.name, data.sched_file, data.params.duration) |
265 | 277 | ||
266 | exp = Experiment(name, exp_params.scheduler, work_dir, out_dir, | 278 | exp = Experiment(data.name, data.params.scheduler, work_dir, |
267 | procs, execs, exp_params.tracers) | 279 | data.out_dir, procs, execs, data.params.tracers) |
268 | 280 | ||
269 | exp.log(start_message) | 281 | exp.log(start_message) |
270 | 282 | ||
271 | if not ignore: | 283 | if not ignore: |
272 | verify_environment(exp_params) | 284 | verify_environment(data.params) |
273 | 285 | ||
274 | run_script(exp_params.pre_script, exp, dir_name, work_dir) | 286 | run_script(data.params.pre_script, exp, dir_name, work_dir) |
275 | 287 | ||
276 | exp.run_exp() | 288 | exp.run_exp() |
277 | 289 | ||
278 | run_script(exp_params.post_script, exp, dir_name, out_dir) | 290 | run_script(data.params.post_script, exp, dir_name, data.out_dir) |
279 | 291 | ||
280 | if jabber: | 292 | if jabber: |
281 | jabber.send("Completed '%s'" % name) | 293 | jabber.send("Completed '%s'" % data.name) |
282 | 294 | ||
283 | # Save parameters used to run experiment in out_dir | 295 | # Save parameters used to run dataeriment in out_dir |
284 | out_params = dict(exp_params.file_params.items() + | 296 | out_params = dict([(PARAMS['sched'], data.params.scheduler), |
285 | [(PARAMS['sched'], exp_params.scheduler), | ||
286 | (PARAMS['tasks'], len(execs)), | 297 | (PARAMS['tasks'], len(execs)), |
287 | (PARAMS['dur'], exp_params.duration)]) | 298 | (PARAMS['dur'], data.params.duration)] + |
299 | data.params.file_params.items()) | ||
288 | 300 | ||
289 | # Feather-trace clock frequency saved for accurate overhead parsing | 301 | # Feather-trace clock frequency saved for accurate overhead parsing |
290 | ft_freq = com.ft_freq() | 302 | ft_freq = com.ft_freq() |
291 | if ft_freq: | 303 | if ft_freq: |
292 | out_params[PARAMS['cycles']] = ft_freq | 304 | out_params[PARAMS['cycles']] = ft_freq |
293 | 305 | ||
294 | with open("%s/%s" % (out_dir, DEFAULTS['params_file']), 'w') as f: | 306 | out_param_f = "%s/%s" % (data.out_dir, DEFAULTS['params_file']) |
295 | f.write(str(out_params)) | 307 | with open(out_param_f, 'w') as f: |
308 | pprint.pprint(out_params, f) | ||
296 | 309 | ||
297 | 310 | ||
298 | def get_exps(opts, args): | 311 | def make_paths(exp, opts, out_base_dir): |
299 | '''Return list of experiment files or directories''' | 312 | '''Translate experiment name to (schedule file, output directory) paths''' |
300 | if args: | 313 | path = "%s/%s" % (os.getcwd(), exp) |
301 | return args | 314 | out_dir = "%s/%s" % (out_base_dir, os.path.split(exp.strip('/'))[1]) |
315 | |||
316 | if not os.path.exists(path): | ||
317 | raise IOError("Invalid experiment: %s" % path) | ||
318 | |||
319 | if opts.force and os.path.exists(out_dir): | ||
320 | shutil.rmtree(out_dir) | ||
302 | 321 | ||
303 | # Default to sched_file > generated dirs | 322 | if os.path.isdir(path): |
304 | if os.path.exists(opts.sched_file): | 323 | sched_file = "%s/%s" % (path, opts.sched_file) |
305 | sys.stderr.write("Reading schedule from %s.\n" % opts.sched_file) | ||
306 | return [opts.sched_file] | ||
307 | elif os.path.exists(DEFAULTS['out-gen']): | ||
308 | sys.stderr.write("Reading schedules from %s/*.\n" % DEFAULTS['out-gen']) | ||
309 | sched_dirs = os.listdir(DEFAULTS['out-gen']) | ||
310 | return ['%s/%s' % (DEFAULTS['out-gen'], d) for d in sched_dirs] | ||
311 | else: | 324 | else: |
312 | sys.stderr.write("Run with -h to view options.\n"); | 325 | sched_file = path |
313 | sys.exit(1) | 326 | |
327 | return sched_file, out_dir | ||
328 | |||
329 | |||
330 | def get_common_header(args): | ||
331 | common = "" | ||
332 | done = False | ||
333 | |||
334 | if len(args) == 1: | ||
335 | return common | ||
336 | |||
337 | while not done: | ||
338 | common += args[0][len(common)] | ||
339 | for path in args: | ||
340 | if path.find(common, 0, len(common)): | ||
341 | done = True | ||
342 | break | ||
343 | |||
344 | return common[:len(common)-1] | ||
345 | |||
346 | |||
347 | def get_exps(opts, args, out_base_dir): | ||
348 | '''Return list of ExpDatas''' | ||
349 | |||
350 | if not args: | ||
351 | if os.path.exists(opts.sched_file): | ||
352 | # Default to sched_file in current directory | ||
353 | sys.stderr.write("Reading schedule from %s.\n" % opts.sched_file) | ||
354 | args = [opts.sched_file] | ||
355 | elif os.path.exists(DEFAULTS['out-gen']): | ||
356 | # Then try experiments created by gen_exps | ||
357 | sys.stderr.write("Reading schedules from %s/*.\n" % DEFAULTS['out-gen']) | ||
358 | sched_dirs = os.listdir(DEFAULTS['out-gen']) | ||
359 | args = ['%s/%s' % (DEFAULTS['out-gen'], d) for d in sched_dirs] | ||
360 | else: | ||
361 | sys.stderr.write("Run with -h to view options.\n"); | ||
362 | sys.exit(1) | ||
363 | |||
364 | # Part of arg paths which is identical for each arg | ||
365 | common = get_common_header(args) | ||
366 | |||
367 | exps = [] | ||
368 | for path in args: | ||
369 | sched_file, out_dir = make_paths(path, opts, out_base_dir) | ||
370 | name = path[len(common):] | ||
371 | |||
372 | sched_dir = os.path.split(sched_file)[0] | ||
373 | exp_params = make_exp_params(opts.scheduler, opts.duration, | ||
374 | sched_dir, opts.param_file) | ||
375 | |||
376 | exps += [ExpData(name, exp_params, sched_file, out_dir, | ||
377 | 0, ExpState.None)] | ||
378 | |||
379 | return exps | ||
314 | 380 | ||
315 | 381 | ||
316 | def setup_jabber(target): | 382 | def setup_jabber(target): |
@@ -338,32 +404,53 @@ def setup_email(target): | |||
338 | return None | 404 | return None |
339 | 405 | ||
340 | 406 | ||
341 | def make_paths(exp, out_base_dir, opts): | 407 | def run_exps(exps, opts): |
342 | '''Translate experiment name to (schedule file, output directory) paths''' | 408 | jabber = setup_jabber(opts.jabber) if opts.jabber else None |
343 | path = "%s/%s" % (os.getcwd(), exp) | ||
344 | out_dir = "%s/%s" % (out_base_dir, os.path.split(exp.strip('/'))[1]) | ||
345 | |||
346 | if not os.path.exists(path): | ||
347 | raise IOError("Invalid experiment: %s" % path) | ||
348 | 409 | ||
349 | if opts.force and os.path.exists(out_dir): | 410 | exps_remaining = list(enumerate(exps)) |
350 | shutil.rmtree(out_dir) | 411 | while exps_remaining: |
412 | i, exp = exps_remaining.pop(0) | ||
351 | 413 | ||
352 | if os.path.isdir(path): | 414 | verb = "Loading" if exp.state == ExpState.None else "Re-running failed" |
353 | sched_file = "%s/%s" % (path, opts.sched_file) | 415 | start_message = "%s experiment %d of %d." % (verb, i+1, len(exps)) |
354 | else: | ||
355 | sched_file = path | ||
356 | 416 | ||
357 | return sched_file, out_dir | 417 | try: |
418 | run_experiment(exp, start_message, opts.ignore, jabber) | ||
419 | exp.state = ExpState.Succeeded | ||
420 | except KeyboardInterrupt: | ||
421 | sys.stderr.write("Keyboard interrupt, quitting\n") | ||
422 | break | ||
423 | except ExperimentDone: | ||
424 | sys.stderr.write("Experiment already completed at '%s'\n" % exp.out_dir) | ||
425 | exp.state = ExpState.Done | ||
426 | except (InvalidKernel, InvalidConfig) as e: | ||
427 | sys.stderr.write("Invalid environment for experiment '%s'\n" % exp.name) | ||
428 | sys.stderr.write("%s\n" % e) | ||
429 | exp.state = ExpState.Invalid | ||
430 | except SystemCorrupted as e: | ||
431 | sys.stderr.write("System is corrupted! Fix state before continuing.\n") | ||
432 | sys.stderr.write("%s\n" % e) | ||
433 | exp.state = ExpState.Failed | ||
434 | if not opts.retry: | ||
435 | break | ||
436 | else: | ||
437 | sys.stderr.write("Remaining experiments may fail\n") | ||
438 | except Exception as e: | ||
439 | sys.stderr.write("Failed experiment %s\n" % exp.name) | ||
440 | sys.stderr.write("%s\n" % e) | ||
441 | exp.state = ExpState.Failed | ||
358 | 442 | ||
443 | if exp.state is ExpState.Failed and opts.retry: | ||
444 | if exp.retries < MAX_RETRY: | ||
445 | exps_remaining += [(i, exp)] | ||
446 | exp.retries += 1 | ||
447 | else: | ||
448 | sys.stderr.write("Hit maximum retries of %d\n" % MAX_RETRY) | ||
359 | 449 | ||
360 | def main(): | 450 | def main(): |
361 | opts, args = parse_args() | 451 | opts, args = parse_args() |
362 | 452 | ||
363 | exps = get_exps(opts, args) | 453 | email = setup_email(opts.email) if opts.email else None |
364 | |||
365 | jabber = setup_jabber(opts.jabber) if opts.jabber else None | ||
366 | email = setup_email(opts.email) if opts.email else None | ||
367 | 454 | ||
368 | out_base = os.path.abspath(opts.out_dir) | 455 | out_base = os.path.abspath(opts.out_dir) |
369 | created = False | 456 | created = False |
@@ -371,62 +458,35 @@ def main(): | |||
371 | created = True | 458 | created = True |
372 | os.mkdir(out_base) | 459 | os.mkdir(out_base) |
373 | 460 | ||
374 | ran = done = succ = failed = invalid = 0 | 461 | exps = get_exps(opts, args, out_base) |
375 | 462 | ||
376 | for i, exp in enumerate(exps): | 463 | run_exps(exps, opts) |
377 | sched_file, out_dir = make_paths(exp, out_base, opts) | ||
378 | sched_dir = os.path.split(sched_file)[0] | ||
379 | 464 | ||
380 | try: | 465 | def state_count(state): |
381 | start_message = "Loading experiment %d of %d." % (i+1, len(exps)) | 466 | return len(filter(lambda x: x.state is state, exps)) |
382 | exp_params = make_exp_params(opts.scheduler, opts.duration, | ||
383 | sched_dir, opts.param_file) | ||
384 | 467 | ||
385 | run_experiment(exp, sched_file, exp_params, out_dir, | 468 | ran = len(filter(lambda x: x.state is not ExpState.None, exps)) |
386 | start_message, opts.ignore, jabber) | 469 | succ = state_count(ExpState.Succeeded) |
387 | |||
388 | succ += 1 | ||
389 | except ExperimentDone: | ||
390 | sys.stderr.write("Experiment '%s' already completed " % exp + | ||
391 | "at '%s'\n" % out_base) | ||
392 | done += 1 | ||
393 | except (InvalidKernel, InvalidConfig) as e: | ||
394 | sys.stderr.write("Invalid environment for experiment '%s'\n" % exp) | ||
395 | sys.stderr.write("%s\n" % e) | ||
396 | invalid += 1 | ||
397 | except KeyboardInterrupt: | ||
398 | sys.stderr.write("Keyboard interrupt, quitting\n") | ||
399 | break | ||
400 | except SystemCorrupted as e: | ||
401 | sys.stderr.write("System is corrupted! Fix state before continuing.\n") | ||
402 | sys.stderr.write("%s\n" % e) | ||
403 | break | ||
404 | except Exception as e: | ||
405 | sys.stderr.write("Failed experiment %s\n" % exp) | ||
406 | sys.stderr.write("%s\n" % e) | ||
407 | failed += 1 | ||
408 | |||
409 | ran += 1 | ||
410 | |||
411 | # Clean out directory if it failed immediately | ||
412 | if not os.listdir(out_base) and created and not succ: | ||
413 | os.rmdir(out_base) | ||
414 | 470 | ||
415 | message = "Experiments ran:\t%d of %d" % (ran, len(exps)) +\ | 471 | message = "Experiments ran:\t%d of %d" % (ran, len(exps)) +\ |
416 | "\n Successful:\t\t%d" % succ +\ | 472 | "\n Successful:\t\t%d" % succ +\ |
417 | "\n Failed:\t\t%d" % failed +\ | 473 | "\n Failed:\t\t%d" % state_count(ExpState.Failed) +\ |
418 | "\n Already Done:\t\t%d" % done +\ | 474 | "\n Already Done:\t\t%d" % state_count(ExpState.Done) +\ |
419 | "\n Invalid Environment:\t%d" % invalid | 475 | "\n Invalid Environment:\t%d" % state_count(ExpState.Invalid) |
420 | 476 | ||
421 | print(message) | 477 | print(message) |
422 | 478 | ||
479 | if email: | ||
480 | email.send(message) | ||
481 | email.close() | ||
482 | |||
423 | if succ: | 483 | if succ: |
424 | sys.stderr.write("Successful experiment data saved in %s.\n" % | 484 | sys.stderr.write("Successful experiment data saved in %s.\n" % |
425 | opts.out_dir) | 485 | opts.out_dir) |
486 | elif not os.listdir(out_base) and created: | ||
487 | # Remove directory if no data was put into it | ||
488 | os.rmdir(out_base) | ||
426 | 489 | ||
427 | if email: | ||
428 | email.send(message) | ||
429 | email.close() | ||
430 | 490 | ||
431 | if __name__ == '__main__': | 491 | if __name__ == '__main__': |
432 | main() | 492 | main() |