diff options
author | Jonathan Herman <hermanjl@cs.unc.edu> | 2013-05-01 15:48:01 -0400 |
---|---|---|
committer | Jonathan Herman <hermanjl@cs.unc.edu> | 2013-05-01 15:48:01 -0400 |
commit | cd9f1b026cc5c4526dfbd2f7b1c5f39edb6a7309 (patch) | |
tree | 5b6221e55d7f50c88a574ed4f57ff7efd9b7103d /run_exps.py | |
parent | 94cc65997d237ddeab24d396f06bb93bc0644a9d (diff) |
Added --crontab option to run_exps.py
This will use crontab to automatically restart the machine and resume
the script when the machine crashes. An additional option, -k
(--kill-crontab), is provided to cancel this operation.
Diffstat (limited to 'run_exps.py')
-rwxr-xr-x | run_exps.py | 82 |
1 file changed, 71 insertions, 11 deletions
diff --git a/run_exps.py b/run_exps.py index 1bad2a3..21666a9 100755 --- a/run_exps.py +++ b/run_exps.py | |||
@@ -3,10 +3,12 @@ from __future__ import print_function | |||
3 | 3 | ||
4 | import common as com | 4 | import common as com |
5 | import os | 5 | import os |
6 | import pickle | ||
6 | import pprint | 7 | import pprint |
7 | import re | 8 | import re |
8 | import shutil | 9 | import shutil |
9 | import sys | 10 | import sys |
11 | import run.crontab as cron | ||
10 | import run.tracer as trace | 12 | import run.tracer as trace |
11 | 13 | ||
12 | from config.config import PARAMS,DEFAULTS,FILES | 14 | from config.config import PARAMS,DEFAULTS,FILES |
@@ -17,9 +19,6 @@ from run.executable.executable import Executable | |||
17 | from run.experiment import Experiment,ExperimentDone,SystemCorrupted | 19 | from run.experiment import Experiment,ExperimentDone,SystemCorrupted |
18 | from run.proc_entry import ProcEntry | 20 | from run.proc_entry import ProcEntry |
19 | 21 | ||
20 | '''Maximum times an experiment will be retried''' | ||
21 | MAX_RETRY = 5 | ||
22 | |||
23 | '''Customizable experiment parameters''' | 22 | '''Customizable experiment parameters''' |
24 | ExpParams = namedtuple('ExpParams', ['scheduler', 'duration', 'tracers', | 23 | ExpParams = namedtuple('ExpParams', ['scheduler', 'duration', 'tracers', |
25 | 'kernel', 'config_options', 'file_params', | 24 | 'kernel', 'config_options', 'file_params', |
@@ -31,6 +30,11 @@ ExpData = com.recordtype('ExpData', ['name', 'params', 'sched_file', 'out_dir', | |||
31 | '''Comparison of requested versus actual kernel compile parameter value''' | 30 | '''Comparison of requested versus actual kernel compile parameter value''' |
32 | ConfigResult = namedtuple('ConfigResult', ['param', 'wanted', 'actual']) | 31 | ConfigResult = namedtuple('ConfigResult', ['param', 'wanted', 'actual']) |
33 | 32 | ||
33 | '''Maximum times an experiment will be retried''' | ||
34 | MAX_RETRY = 5 | ||
35 | '''Location experiment retry count is stored''' | ||
36 | TRIES_FNAME = ".tries.pkl" | ||
37 | |||
34 | 38 | ||
35 | class InvalidKernel(Exception): | 39 | class InvalidKernel(Exception): |
36 | def __init__(self, kernel): | 40 | def __init__(self, kernel): |
@@ -88,6 +92,9 @@ def parse_args(): | |||
88 | action='store_true', default=False, | 92 | action='store_true', default=False, |
89 | help='use crontab to resume interrupted script after ' | 93 | help='use crontab to resume interrupted script after ' |
90 | 'system restarts. implies --retry') | 94 | 'system restarts. implies --retry') |
95 | group.add_option('-k', '--kill-crontab', dest='kill', | ||
96 | action='store_true', default=False, | ||
97 | help='kill existing script crontabs and exit') | ||
91 | parser.add_option_group(group) | 98 | parser.add_option_group(group) |
92 | 99 | ||
93 | return parser.parse_args() | 100 | return parser.parse_args() |
@@ -314,7 +321,7 @@ def run_experiment(data, start_message, ignore, jabber): | |||
314 | 321 | ||
315 | def make_paths(exp, opts, out_base_dir): | 322 | def make_paths(exp, opts, out_base_dir): |
316 | '''Translate experiment name to (schedule file, output directory) paths''' | 323 | '''Translate experiment name to (schedule file, output directory) paths''' |
317 | path = "%s/%s" % (os.getcwd(), exp) | 324 | path = os.path.abspath(exp) |
318 | out_dir = "%s/%s" % (out_base_dir, os.path.split(exp.strip('/'))[1]) | 325 | out_dir = "%s/%s" % (out_base_dir, os.path.split(exp.strip('/'))[1]) |
319 | 326 | ||
320 | if not os.path.exists(path): | 327 | if not os.path.exists(path): |
@@ -408,10 +415,35 @@ def setup_email(target): | |||
408 | return None | 415 | return None |
409 | 416 | ||
410 | 417 | ||
418 | def tries_file(exp): | ||
419 | return "%s/%s" % (os.path.split(exp.sched_file)[0], TRIES_FNAME) | ||
420 | |||
421 | |||
422 | def get_tries(exp): | ||
423 | if not os.path.exists(tries_file(exp)): | ||
424 | return 0 | ||
425 | with open(tries_file(exp), 'r') as f: | ||
426 | return int(pickle.load(f)) | ||
427 | |||
428 | |||
429 | def set_tries(exp, val): | ||
430 | if not val: | ||
431 | if os.path.exists(tries_file(exp)): | ||
432 | os.remove(tries_file(exp)) | ||
433 | else: | ||
434 | with open(tries_file(exp), 'w') as f: | ||
435 | pickle.dump(str(val), f) | ||
436 | os.system('sync') | ||
437 | |||
438 | |||
411 | def run_exps(exps, opts): | 439 | def run_exps(exps, opts): |
412 | jabber = setup_jabber(opts.jabber) if opts.jabber else None | 440 | jabber = setup_jabber(opts.jabber) if opts.jabber else None |
413 | 441 | ||
414 | exps_remaining = list(enumerate(exps)) | 442 | # Give each experiment a unique id |
443 | exps_remaining = enumerate(exps) | ||
444 | # But run experiments which have failed the most last | ||
445 | exps_remaining = sorted(exps_remaining, key=lambda x: get_tries(x[1])) | ||
446 | |||
415 | while exps_remaining: | 447 | while exps_remaining: |
416 | i, exp = exps_remaining.pop(0) | 448 | i, exp = exps_remaining.pop(0) |
417 | 449 | ||
@@ -419,17 +451,26 @@ def run_exps(exps, opts): | |||
419 | start_message = "%s experiment %d of %d." % (verb, i+1, len(exps)) | 451 | start_message = "%s experiment %d of %d." % (verb, i+1, len(exps)) |
420 | 452 | ||
421 | try: | 453 | try: |
454 | set_tries(exp, get_tries(exp) + 1) | ||
455 | if get_tries(exp) > MAX_RETRY: | ||
456 | raise Exception("Hit maximum retries of %d" % MAX_RETRY) | ||
457 | |||
422 | run_experiment(exp, start_message, opts.ignore, jabber) | 458 | run_experiment(exp, start_message, opts.ignore, jabber) |
459 | |||
460 | set_tries(exp, 0) | ||
423 | exp.state = ExpState.Succeeded | 461 | exp.state = ExpState.Succeeded |
424 | except KeyboardInterrupt: | 462 | except KeyboardInterrupt: |
425 | sys.stderr.write("Keyboard interrupt, quitting\n") | 463 | sys.stderr.write("Keyboard interrupt, quitting\n") |
464 | set_tries(exp, get_tries(exp) - 1) | ||
426 | break | 465 | break |
427 | except ExperimentDone: | 466 | except ExperimentDone: |
428 | sys.stderr.write("Experiment already completed at '%s'\n" % exp.out_dir) | 467 | sys.stderr.write("Experiment already completed at '%s'\n" % exp.out_dir) |
468 | set_tries(exp, 0) | ||
429 | exp.state = ExpState.Done | 469 | exp.state = ExpState.Done |
430 | except (InvalidKernel, InvalidConfig) as e: | 470 | except (InvalidKernel, InvalidConfig) as e: |
431 | sys.stderr.write("Invalid environment for experiment '%s'\n" % exp.name) | 471 | sys.stderr.write("Invalid environment for experiment '%s'\n" % exp.name) |
432 | sys.stderr.write("%s\n" % e) | 472 | sys.stderr.write("%s\n" % e) |
473 | set_tries(exp, get_tries(exp) - 1) | ||
433 | exp.state = ExpState.Invalid | 474 | exp.state = ExpState.Invalid |
434 | except SystemCorrupted as e: | 475 | except SystemCorrupted as e: |
435 | sys.stderr.write("System is corrupted! Fix state before continuing.\n") | 476 | sys.stderr.write("System is corrupted! Fix state before continuing.\n") |
@@ -445,17 +486,19 @@ def run_exps(exps, opts): | |||
445 | exp.state = ExpState.Failed | 486 | exp.state = ExpState.Failed |
446 | 487 | ||
447 | if exp.state is ExpState.Failed and opts.retry: | 488 | if exp.state is ExpState.Failed and opts.retry: |
448 | if exp.retries < MAX_RETRY: | 489 | exps_remaining += [(i, exp)] |
449 | exps_remaining += [(i, exp)] | 490 | |
450 | exp.retries += 1 | ||
451 | else: | ||
452 | sys.stderr.write("Hit maximum retries of %d\n" % MAX_RETRY) | ||
453 | 491 | ||
454 | def main(): | 492 | def main(): |
455 | opts, args = parse_args() | 493 | opts, args = parse_args() |
456 | 494 | ||
495 | if opts.kill: | ||
496 | cron.kill_boot_job() | ||
497 | sys.exit(1) | ||
498 | |||
457 | email = setup_email(opts.email) if opts.email else None | 499 | email = setup_email(opts.email) if opts.email else None |
458 | 500 | ||
501 | # Create base output directory for run data | ||
459 | out_base = os.path.abspath(opts.out_dir) | 502 | out_base = os.path.abspath(opts.out_dir) |
460 | created = False | 503 | created = False |
461 | if not os.path.exists(out_base): | 504 | if not os.path.exists(out_base): |
@@ -464,7 +507,24 @@ def main(): | |||
464 | 507 | ||
465 | exps = get_exps(opts, args, out_base) | 508 | exps = get_exps(opts, args, out_base) |
466 | 509 | ||
467 | run_exps(exps, opts) | 510 | if opts.crontab: |
511 | # Resume script on startup | ||
512 | opts.retry = True | ||
513 | cron.install_boot_job(['f', '--forced'], | ||
514 | "Stop with %s -k" % com.get_cmd()) | ||
515 | |||
516 | if opts.force or not opts.retry: | ||
517 | cron.clean_output() | ||
518 | for e in exps: | ||
519 | set_tries(e, 0) | ||
520 | |||
521 | try: | ||
522 | run_exps(exps, opts) | ||
523 | finally: | ||
524 | # Remove persistent state | ||
525 | for e in exps: | ||
526 | set_tries(e, 0) | ||
527 | cron.remove_boot_job() | ||
468 | 528 | ||
469 | def state_count(state): | 529 | def state_count(state): |
470 | return len(filter(lambda x: x.state is state, exps)) | 530 | return len(filter(lambda x: x.state is state, exps)) |