diff options
| author | Jonathan Herman <hermanjl@cs.unc.edu> | 2013-05-01 15:48:01 -0400 |
|---|---|---|
| committer | Jonathan Herman <hermanjl@cs.unc.edu> | 2013-05-01 15:48:01 -0400 |
| commit | cd9f1b026cc5c4526dfbd2f7b1c5f39edb6a7309 (patch) | |
| tree | 5b6221e55d7f50c88a574ed4f57ff7efd9b7103d | |
| parent | 94cc65997d237ddeab24d396f06bb93bc0644a9d (diff) | |
Added --crontab option to run_exps.py
This will use crontab to automatically restart the machine and resume
the script when the machine crashes. An additional option, -k, is provided
to cancel this operation.
| -rw-r--r-- | common.py | 5 | ||||
| -rw-r--r-- | run/crontab.py | 151 | ||||
| -rw-r--r-- | run/experiment.py | 8 | ||||
| -rwxr-xr-x | run_exps.py | 82 |
4 files changed, 234 insertions, 12 deletions
| @@ -182,7 +182,7 @@ def ft_freq(): | |||
| 182 | 182 | ||
| 183 | 183 | ||
| 184 | def kernel(): | 184 | def kernel(): |
| 185 | return subprocess.check_output(["uname", "-r"]) | 185 | return subprocess.check_output(["uname", "-r"]).strip("\n") |
| 186 | 186 | ||
| 187 | def is_executable(fname): | 187 | def is_executable(fname): |
| 188 | '''Return whether the file passed in is executable''' | 188 | '''Return whether the file passed in is executable''' |
| @@ -210,3 +210,6 @@ def log_once(id, msg = None, indent = True): | |||
| 210 | if indent: | 210 | if indent: |
| 211 | msg = ' ' + msg.strip('\t').replace('\n', '\n\t') | 211 | msg = ' ' + msg.strip('\t').replace('\n', '\n\t') |
| 212 | sys.stderr.write('\n' + msg.strip('\n') + '\n') | 212 | sys.stderr.write('\n' + msg.strip('\n') + '\n') |
| 213 | |||
| 214 | def get_cmd(): | ||
| 215 | return os.path.split(sys.argv[0])[1] | ||
diff --git a/run/crontab.py b/run/crontab.py new file mode 100644 index 0000000..87d71b1 --- /dev/null +++ b/run/crontab.py | |||
| @@ -0,0 +1,151 @@ | |||
| 1 | from __future__ import print_function | ||
| 2 | |||
| 3 | import common | ||
| 4 | import os | ||
| 5 | import re | ||
| 6 | import sys | ||
| 7 | |||
| 8 | from subprocess import Popen, PIPE, check_output | ||
| 9 | |||
| 10 | PANIC_DUR = 10 | ||
| 11 | DELAY = 30 | ||
| 12 | DELAY_INTERVAL = 10 | ||
| 13 | |||
| 14 | def get_cron_data(): | ||
| 15 | try: | ||
| 16 | return check_output(['crontab', '-l']) | ||
| 17 | except: | ||
| 18 | return "" | ||
| 19 | |||
| 20 | def wall(message): | ||
| 21 | '''A wall command with no header''' | ||
| 22 | return "echo '%s' | wall -n" % message | ||
| 23 | |||
| 24 | def sanitize(args, ignored): | ||
| 25 | ret_args = [] | ||
| 26 | for a in args: | ||
| 27 | if a in ignored: | ||
| 28 | continue | ||
| 29 | if '-' == a[0] and '--' != a[0:2]: | ||
| 30 | for i in ignored: | ||
| 31 | a = a.replace(i, '') | ||
| 32 | ret_args += [a] | ||
| 33 | return ret_args | ||
| 34 | |||
| 35 | def get_outfname(): | ||
| 36 | return "cron-%s.txt" % common.get_cmd() | ||
| 37 | |||
| 38 | def get_boot_cron(ignored_params, extra=""): | ||
| 39 | '''Turn current python script into a crontab reboot entry''' | ||
| 40 | job_args = sanitize(sys.argv, ignored_params) | ||
| 41 | job = " ".join(job_args) | ||
| 42 | out_fname = get_outfname() | ||
| 43 | |||
| 44 | short_job = " ".join([common.get_cmd()] + job_args[1:]) | ||
| 45 | msg = "Job '%s' will write output to '%s'" % (short_job, out_fname) | ||
| 46 | |||
| 47 | sys.stderr.write("%s %d seconds after reboot.\n" % (msg, DELAY)) | ||
| 48 | |||
| 49 | # Create sleep and wall commands which will countdown DELAY seconds | ||
| 50 | # before executing the job | ||
| 51 | cmds = ["sleep %d" % DELAY_INTERVAL] | ||
| 52 | delay_rem = DELAY - DELAY_INTERVAL | ||
| 53 | while delay_rem > 0: | ||
| 54 | wmsg = "Restarting experiments in %d seconds. %s" % (delay_rem, extra) | ||
| 55 | cmds += [wall(wmsg)] | ||
| 56 | cmds += ["sleep %d" % min(DELAY_INTERVAL, delay_rem)] | ||
| 57 | delay_rem -= DELAY_INTERVAL | ||
| 58 | delay_cmd = ";".join(cmds) | ||
| 59 | |||
| 60 | # Create command which will only execute if the same kernel is running | ||
| 61 | kern = common.kernel() | ||
| 62 | fail_wall = wall("Need matching kernel '%s' to run!" % kern) | ||
| 63 | run_cmd = "echo '%s' | grep -q `uname -r` && %s && %s && %s >> %s 2>>%s || %s" %\ | ||
| 64 | (kern, wall(msg), wall("Starting..."), job, out_fname, out_fname, fail_wall) | ||
| 65 | |||
| 66 | return "@reboot cd %s; %s; %s;" % (os.getcwd(), delay_cmd, run_cmd) | ||
| 67 | |||
| 68 | def set_panic_restart(bool_val): | ||
| 69 | '''Enable / disable restart on panics''' | ||
| 70 | if bool_val: | ||
| 71 | sys.stderr.write("Kernel will reboot after panic.\n") | ||
| 72 | dur = PANIC_DUR | ||
| 73 | else: | ||
| 74 | sys.stderr.write("Kernel will no longer reboot after panic.\n") | ||
| 75 | dur = 0 | ||
| 76 | |||
| 77 | check_output(['sysctl', '-w', "kernel.panic=%d" % dur, | ||
| 78 | "kernel.panic_on_oops=%d" % dur]) | ||
| 79 | |||
| 80 | def write_cron_data(data): | ||
| 81 | '''Write new crontab entry. No blank lines are written''' | ||
| 82 | |||
| 83 | # I don't know why "^\s*$" doesn't match, hence this ugly regex | ||
| 84 | data = re.sub(r"\n\s*\n", "\n", data, re.M) | ||
| 85 | |||
| 86 | sp = Popen(["crontab", "-"], stdin=PIPE) | ||
| 87 | stdout, stderr = sp.communicate(input=data) | ||
| 88 | |||
| 89 | def install_path(): | ||
| 90 | '''Place the current path in the crontab entry''' | ||
| 91 | data = get_cron_data() | ||
| 92 | curr_line = re.findall(r"PATH=.*", data) | ||
| 93 | |||
| 94 | if curr_line: | ||
| 95 | curr_paths = re.findall(r"((?:\/\w+)+)", curr_line[0]) | ||
| 96 | data = re.sub(curr_line[0], "", data) | ||
| 97 | else: | ||
| 98 | curr_paths = [] | ||
| 99 | curr_paths = set(curr_paths) | ||
| 100 | |||
| 101 | for path in os.environ["PATH"].split(os.pathsep): | ||
| 102 | curr_paths.add(path) | ||
| 103 | |||
| 104 | data = "PATH=" + os.pathsep.join(curr_paths) + "\n" + data | ||
| 105 | |||
| 106 | write_cron_data(data) | ||
| 107 | |||
| 108 | def install_boot_job(ignored_params, reboot_message): | ||
| 109 | '''Re-run the current python script on system reboot using crontab''' | ||
| 110 | remove_boot_job() | ||
| 111 | |||
| 112 | data = get_cron_data() | ||
| 113 | job = get_boot_cron(ignored_params, reboot_message) | ||
| 114 | |||
| 115 | set_panic_restart(True) | ||
| 116 | |||
| 117 | write_cron_data(data + job + "\n") | ||
| 118 | |||
| 119 | if job not in get_cron_data(): | ||
| 120 | raise IOError("Failed to write %s into cron!" % job) | ||
| 121 | else: | ||
| 122 | install_path() | ||
| 123 | |||
| 124 | def clean_output(): | ||
| 125 | fname = get_outfname() | ||
| 126 | if os.path.exists(fname): | ||
| 127 | os.remove(fname) | ||
| 128 | |||
| 129 | def kill_boot_job(): | ||
| 130 | remove_boot_job() | ||
| 131 | |||
| 132 | cmd = common.get_cmd() | ||
| 133 | |||
| 134 | procs = check_output("ps -eo pid,args".split(" ")) | ||
| 135 | pairs = re.findall("(\d+) (.*)", procs) | ||
| 136 | |||
| 137 | for pid, args in pairs: | ||
| 138 | if re.search(r"/bin/sh -c.*%s"%cmd, args): | ||
| 139 | sys.stderr.write("Killing job %s\n" % pid) | ||
| 140 | check_output(("kill -9 %s" % pid).split(" ")) | ||
| 141 | |||
| 142 | def remove_boot_job(): | ||
| 143 | '''Remove installed reboot job from crontab''' | ||
| 144 | data = get_cron_data() | ||
| 145 | regex = re.compile(r".*%s.*" % re.escape(common.get_cmd()), re.M) | ||
| 146 | |||
| 147 | if regex.search(data): | ||
| 148 | new_cron = regex.sub("", data) | ||
| 149 | write_cron_data(new_cron) | ||
| 150 | |||
| 151 | set_panic_restart(False) | ||
diff --git a/run/experiment.py b/run/experiment.py index 9a70414..da0e32e 100644 --- a/run/experiment.py +++ b/run/experiment.py | |||
| @@ -35,6 +35,9 @@ class Experiment(object): | |||
| 35 | self.exec_err = None | 35 | self.exec_err = None |
| 36 | self.tracer_types = tracer_types | 36 | self.tracer_types = tracer_types |
| 37 | 37 | ||
| 38 | self.regular_tracers = [] | ||
| 39 | self.exact_tracers = [] | ||
| 40 | |||
| 38 | def __setup_tracers(self): | 41 | def __setup_tracers(self): |
| 39 | tracers = [ t(self.working_dir) for t in self.tracer_types ] | 42 | tracers = [ t(self.working_dir) for t in self.tracer_types ] |
| 40 | 43 | ||
| @@ -55,8 +58,13 @@ class Experiment(object): | |||
| 55 | Experiment.INTERRUPTED_DIR) | 58 | Experiment.INTERRUPTED_DIR) |
| 56 | interrupted = "%s/%s" % (os.path.split(self.working_dir)[0], | 59 | interrupted = "%s/%s" % (os.path.split(self.working_dir)[0], |
| 57 | Experiment.INTERRUPTED_DIR) | 60 | Experiment.INTERRUPTED_DIR) |
| 61 | old_int = "%s/%s" % (self.working_dir, Experiment.INTERRUPTED_DIR) | ||
| 62 | |||
| 58 | if os.path.exists(interrupted): | 63 | if os.path.exists(interrupted): |
| 59 | sh.rmtree(interrupted) | 64 | sh.rmtree(interrupted) |
| 65 | if os.path.exists(old_int): | ||
| 66 | sh.rmtree(old_int) | ||
| 67 | |||
| 60 | os.rename(self.working_dir, interrupted) | 68 | os.rename(self.working_dir, interrupted) |
| 61 | 69 | ||
| 62 | os.mkdir(self.working_dir) | 70 | os.mkdir(self.working_dir) |
diff --git a/run_exps.py b/run_exps.py index 1bad2a3..21666a9 100755 --- a/run_exps.py +++ b/run_exps.py | |||
| @@ -3,10 +3,12 @@ from __future__ import print_function | |||
| 3 | 3 | ||
| 4 | import common as com | 4 | import common as com |
| 5 | import os | 5 | import os |
| 6 | import pickle | ||
| 6 | import pprint | 7 | import pprint |
| 7 | import re | 8 | import re |
| 8 | import shutil | 9 | import shutil |
| 9 | import sys | 10 | import sys |
| 11 | import run.crontab as cron | ||
| 10 | import run.tracer as trace | 12 | import run.tracer as trace |
| 11 | 13 | ||
| 12 | from config.config import PARAMS,DEFAULTS,FILES | 14 | from config.config import PARAMS,DEFAULTS,FILES |
| @@ -17,9 +19,6 @@ from run.executable.executable import Executable | |||
| 17 | from run.experiment import Experiment,ExperimentDone,SystemCorrupted | 19 | from run.experiment import Experiment,ExperimentDone,SystemCorrupted |
| 18 | from run.proc_entry import ProcEntry | 20 | from run.proc_entry import ProcEntry |
| 19 | 21 | ||
| 20 | '''Maximum times an experiment will be retried''' | ||
| 21 | MAX_RETRY = 5 | ||
| 22 | |||
| 23 | '''Customizable experiment parameters''' | 22 | '''Customizable experiment parameters''' |
| 24 | ExpParams = namedtuple('ExpParams', ['scheduler', 'duration', 'tracers', | 23 | ExpParams = namedtuple('ExpParams', ['scheduler', 'duration', 'tracers', |
| 25 | 'kernel', 'config_options', 'file_params', | 24 | 'kernel', 'config_options', 'file_params', |
| @@ -31,6 +30,11 @@ ExpData = com.recordtype('ExpData', ['name', 'params', 'sched_file', 'out_dir', | |||
| 31 | '''Comparison of requested versus actual kernel compile parameter value''' | 30 | '''Comparison of requested versus actual kernel compile parameter value''' |
| 32 | ConfigResult = namedtuple('ConfigResult', ['param', 'wanted', 'actual']) | 31 | ConfigResult = namedtuple('ConfigResult', ['param', 'wanted', 'actual']) |
| 33 | 32 | ||
| 33 | '''Maximum times an experiment will be retried''' | ||
| 34 | MAX_RETRY = 5 | ||
| 35 | '''Location experiment retry count is stored''' | ||
| 36 | TRIES_FNAME = ".tries.pkl" | ||
| 37 | |||
| 34 | 38 | ||
| 35 | class InvalidKernel(Exception): | 39 | class InvalidKernel(Exception): |
| 36 | def __init__(self, kernel): | 40 | def __init__(self, kernel): |
| @@ -88,6 +92,9 @@ def parse_args(): | |||
| 88 | action='store_true', default=False, | 92 | action='store_true', default=False, |
| 89 | help='use crontab to resume interrupted script after ' | 93 | help='use crontab to resume interrupted script after ' |
| 90 | 'system restarts. implies --retry') | 94 | 'system restarts. implies --retry') |
| 95 | group.add_option('-k', '--kill-crontab', dest='kill', | ||
| 96 | action='store_true', default=False, | ||
| 97 | help='kill existing script crontabs and exit') | ||
| 91 | parser.add_option_group(group) | 98 | parser.add_option_group(group) |
| 92 | 99 | ||
| 93 | return parser.parse_args() | 100 | return parser.parse_args() |
| @@ -314,7 +321,7 @@ def run_experiment(data, start_message, ignore, jabber): | |||
| 314 | 321 | ||
| 315 | def make_paths(exp, opts, out_base_dir): | 322 | def make_paths(exp, opts, out_base_dir): |
| 316 | '''Translate experiment name to (schedule file, output directory) paths''' | 323 | '''Translate experiment name to (schedule file, output directory) paths''' |
| 317 | path = "%s/%s" % (os.getcwd(), exp) | 324 | path = os.path.abspath(exp) |
| 318 | out_dir = "%s/%s" % (out_base_dir, os.path.split(exp.strip('/'))[1]) | 325 | out_dir = "%s/%s" % (out_base_dir, os.path.split(exp.strip('/'))[1]) |
| 319 | 326 | ||
| 320 | if not os.path.exists(path): | 327 | if not os.path.exists(path): |
| @@ -408,10 +415,35 @@ def setup_email(target): | |||
| 408 | return None | 415 | return None |
| 409 | 416 | ||
| 410 | 417 | ||
| 418 | def tries_file(exp): | ||
| 419 | return "%s/%s" % (os.path.split(exp.sched_file)[0], TRIES_FNAME) | ||
| 420 | |||
| 421 | |||
| 422 | def get_tries(exp): | ||
| 423 | if not os.path.exists(tries_file(exp)): | ||
| 424 | return 0 | ||
| 425 | with open(tries_file(exp), 'r') as f: | ||
| 426 | return int(pickle.load(f)) | ||
| 427 | |||
| 428 | |||
| 429 | def set_tries(exp, val): | ||
| 430 | if not val: | ||
| 431 | if os.path.exists(tries_file(exp)): | ||
| 432 | os.remove(tries_file(exp)) | ||
| 433 | else: | ||
| 434 | with open(tries_file(exp), 'w') as f: | ||
| 435 | pickle.dump(str(val), f) | ||
| 436 | os.system('sync') | ||
| 437 | |||
| 438 | |||
| 411 | def run_exps(exps, opts): | 439 | def run_exps(exps, opts): |
| 412 | jabber = setup_jabber(opts.jabber) if opts.jabber else None | 440 | jabber = setup_jabber(opts.jabber) if opts.jabber else None |
| 413 | 441 | ||
| 414 | exps_remaining = list(enumerate(exps)) | 442 | # Give each experiment a unique id |
| 443 | exps_remaining = enumerate(exps) | ||
| 444 | # But run experiments which have failed the most last | ||
| 445 | exps_remaining = sorted(exps_remaining, key=lambda x: get_tries(x[1])) | ||
| 446 | |||
| 415 | while exps_remaining: | 447 | while exps_remaining: |
| 416 | i, exp = exps_remaining.pop(0) | 448 | i, exp = exps_remaining.pop(0) |
| 417 | 449 | ||
| @@ -419,17 +451,26 @@ def run_exps(exps, opts): | |||
| 419 | start_message = "%s experiment %d of %d." % (verb, i+1, len(exps)) | 451 | start_message = "%s experiment %d of %d." % (verb, i+1, len(exps)) |
| 420 | 452 | ||
| 421 | try: | 453 | try: |
| 454 | set_tries(exp, get_tries(exp) + 1) | ||
| 455 | if get_tries(exp) > MAX_RETRY: | ||
| 456 | raise Exception("Hit maximum retries of %d" % MAX_RETRY) | ||
| 457 | |||
| 422 | run_experiment(exp, start_message, opts.ignore, jabber) | 458 | run_experiment(exp, start_message, opts.ignore, jabber) |
| 459 | |||
| 460 | set_tries(exp, 0) | ||
| 423 | exp.state = ExpState.Succeeded | 461 | exp.state = ExpState.Succeeded |
| 424 | except KeyboardInterrupt: | 462 | except KeyboardInterrupt: |
| 425 | sys.stderr.write("Keyboard interrupt, quitting\n") | 463 | sys.stderr.write("Keyboard interrupt, quitting\n") |
| 464 | set_tries(exp, get_tries(exp) - 1) | ||
| 426 | break | 465 | break |
| 427 | except ExperimentDone: | 466 | except ExperimentDone: |
| 428 | sys.stderr.write("Experiment already completed at '%s'\n" % exp.out_dir) | 467 | sys.stderr.write("Experiment already completed at '%s'\n" % exp.out_dir) |
| 468 | set_tries(exp, 0) | ||
| 429 | exp.state = ExpState.Done | 469 | exp.state = ExpState.Done |
| 430 | except (InvalidKernel, InvalidConfig) as e: | 470 | except (InvalidKernel, InvalidConfig) as e: |
| 431 | sys.stderr.write("Invalid environment for experiment '%s'\n" % exp.name) | 471 | sys.stderr.write("Invalid environment for experiment '%s'\n" % exp.name) |
| 432 | sys.stderr.write("%s\n" % e) | 472 | sys.stderr.write("%s\n" % e) |
| 473 | set_tries(exp, get_tries(exp) - 1) | ||
| 433 | exp.state = ExpState.Invalid | 474 | exp.state = ExpState.Invalid |
| 434 | except SystemCorrupted as e: | 475 | except SystemCorrupted as e: |
| 435 | sys.stderr.write("System is corrupted! Fix state before continuing.\n") | 476 | sys.stderr.write("System is corrupted! Fix state before continuing.\n") |
| @@ -445,17 +486,19 @@ def run_exps(exps, opts): | |||
| 445 | exp.state = ExpState.Failed | 486 | exp.state = ExpState.Failed |
| 446 | 487 | ||
| 447 | if exp.state is ExpState.Failed and opts.retry: | 488 | if exp.state is ExpState.Failed and opts.retry: |
| 448 | if exp.retries < MAX_RETRY: | 489 | exps_remaining += [(i, exp)] |
| 449 | exps_remaining += [(i, exp)] | 490 | |
| 450 | exp.retries += 1 | ||
| 451 | else: | ||
| 452 | sys.stderr.write("Hit maximum retries of %d\n" % MAX_RETRY) | ||
| 453 | 491 | ||
| 454 | def main(): | 492 | def main(): |
| 455 | opts, args = parse_args() | 493 | opts, args = parse_args() |
| 456 | 494 | ||
| 495 | if opts.kill: | ||
| 496 | cron.kill_boot_job() | ||
| 497 | sys.exit(1) | ||
| 498 | |||
| 457 | email = setup_email(opts.email) if opts.email else None | 499 | email = setup_email(opts.email) if opts.email else None |
| 458 | 500 | ||
| 501 | # Create base output directory for run data | ||
| 459 | out_base = os.path.abspath(opts.out_dir) | 502 | out_base = os.path.abspath(opts.out_dir) |
| 460 | created = False | 503 | created = False |
| 461 | if not os.path.exists(out_base): | 504 | if not os.path.exists(out_base): |
| @@ -464,7 +507,24 @@ def main(): | |||
| 464 | 507 | ||
| 465 | exps = get_exps(opts, args, out_base) | 508 | exps = get_exps(opts, args, out_base) |
| 466 | 509 | ||
| 467 | run_exps(exps, opts) | 510 | if opts.crontab: |
| 511 | # Resume script on startup | ||
| 512 | opts.retry = True | ||
| 513 | cron.install_boot_job(['f', '--forced'], | ||
| 514 | "Stop with %s -k" % com.get_cmd()) | ||
| 515 | |||
| 516 | if opts.force or not opts.retry: | ||
| 517 | cron.clean_output() | ||
| 518 | for e in exps: | ||
| 519 | set_tries(e, 0) | ||
| 520 | |||
| 521 | try: | ||
| 522 | run_exps(exps, opts) | ||
| 523 | finally: | ||
| 524 | # Remove persistent state | ||
| 525 | for e in exps: | ||
| 526 | set_tries(e, 0) | ||
| 527 | cron.remove_boot_job() | ||
| 468 | 528 | ||
| 469 | def state_count(state): | 529 | def state_count(state): |
| 470 | return len(filter(lambda x: x.state is state, exps)) | 530 | return len(filter(lambda x: x.state is state, exps)) |
