diff options
author | Jonathan Herman <hermanjl@cs.unc.edu> | 2013-05-01 15:48:01 -0400 |
---|---|---|
committer | Jonathan Herman <hermanjl@cs.unc.edu> | 2013-05-01 15:48:01 -0400 |
commit | cd9f1b026cc5c4526dfbd2f7b1c5f39edb6a7309 (patch) | |
tree | 5b6221e55d7f50c88a574ed4f57ff7efd9b7103d | |
parent | 94cc65997d237ddeab24d396f06bb93bc0644a9d (diff) |
Added --crontab option to run_exps.py
This will use crontab to automatically restart the machine and resume
the script when the machine crashes. An additional option, -k, is provided
to cancel this operation.
-rw-r--r-- | common.py | 5 | ||||
-rw-r--r-- | run/crontab.py | 151 | ||||
-rw-r--r-- | run/experiment.py | 8 | ||||
-rwxr-xr-x | run_exps.py | 82 |
4 files changed, 234 insertions, 12 deletions
@@ -182,7 +182,7 @@ def ft_freq(): | |||
182 | 182 | ||
183 | 183 | ||
184 | def kernel(): | 184 | def kernel(): |
185 | return subprocess.check_output(["uname", "-r"]) | 185 | return subprocess.check_output(["uname", "-r"]).strip("\n") |
186 | 186 | ||
187 | def is_executable(fname): | 187 | def is_executable(fname): |
188 | '''Return whether the file passed in is executable''' | 188 | '''Return whether the file passed in is executable''' |
@@ -210,3 +210,6 @@ def log_once(id, msg = None, indent = True): | |||
210 | if indent: | 210 | if indent: |
211 | msg = ' ' + msg.strip('\t').replace('\n', '\n\t') | 211 | msg = ' ' + msg.strip('\t').replace('\n', '\n\t') |
212 | sys.stderr.write('\n' + msg.strip('\n') + '\n') | 212 | sys.stderr.write('\n' + msg.strip('\n') + '\n') |
213 | |||
214 | def get_cmd(): | ||
215 | return os.path.split(sys.argv[0])[1] | ||
diff --git a/run/crontab.py b/run/crontab.py new file mode 100644 index 0000000..87d71b1 --- /dev/null +++ b/run/crontab.py | |||
@@ -0,0 +1,151 @@ | |||
1 | from __future__ import print_function | ||
2 | |||
3 | import common | ||
4 | import os | ||
5 | import re | ||
6 | import sys | ||
7 | |||
8 | from subprocess import Popen, PIPE, check_output | ||
9 | |||
10 | PANIC_DUR = 10 | ||
11 | DELAY = 30 | ||
12 | DELAY_INTERVAL = 10 | ||
13 | |||
14 | def get_cron_data(): | ||
15 | try: | ||
16 | return check_output(['crontab', '-l']) | ||
17 | except: | ||
18 | return "" | ||
19 | |||
20 | def wall(message): | ||
21 | '''A wall command with no header''' | ||
22 | return "echo '%s' | wall -n" % message | ||
23 | |||
24 | def sanitize(args, ignored): | ||
25 | ret_args = [] | ||
26 | for a in args: | ||
27 | if a in ignored: | ||
28 | continue | ||
29 | if '-' == a[0] and '--' != a[0:2]: | ||
30 | for i in ignored: | ||
31 | a = a.replace(i, '') | ||
32 | ret_args += [a] | ||
33 | return ret_args | ||
34 | |||
35 | def get_outfname(): | ||
36 | return "cron-%s.txt" % common.get_cmd() | ||
37 | |||
38 | def get_boot_cron(ignored_params, extra=""): | ||
39 | '''Turn current python script into a crontab reboot entry''' | ||
40 | job_args = sanitize(sys.argv, ignored_params) | ||
41 | job = " ".join(job_args) | ||
42 | out_fname = get_outfname() | ||
43 | |||
44 | short_job = " ".join([common.get_cmd()] + job_args[1:]) | ||
45 | msg = "Job '%s' will write output to '%s'" % (short_job, out_fname) | ||
46 | |||
47 | sys.stderr.write("%s %d seconds after reboot.\n" % (msg, DELAY)) | ||
48 | |||
49 | # Create sleep and wall commands which will countdown DELAY seconds | ||
50 | # before executing the job | ||
51 | cmds = ["sleep %d" % DELAY_INTERVAL] | ||
52 | delay_rem = DELAY - DELAY_INTERVAL | ||
53 | while delay_rem > 0: | ||
54 | wmsg = "Restarting experiments in %d seconds. %s" % (delay_rem, extra) | ||
55 | cmds += [wall(wmsg)] | ||
56 | cmds += ["sleep %d" % min(DELAY_INTERVAL, delay_rem)] | ||
57 | delay_rem -= DELAY_INTERVAL | ||
58 | delay_cmd = ";".join(cmds) | ||
59 | |||
60 | # Create command which will only execute if the same kernel is running | ||
61 | kern = common.kernel() | ||
62 | fail_wall = wall("Need matching kernel '%s' to run!" % kern) | ||
63 | run_cmd = "echo '%s' | grep -q `uname -r` && %s && %s && %s >> %s 2>>%s || %s" %\ | ||
64 | (kern, wall(msg), wall("Starting..."), job, out_fname, out_fname, fail_wall) | ||
65 | |||
66 | return "@reboot cd %s; %s; %s;" % (os.getcwd(), delay_cmd, run_cmd) | ||
67 | |||
68 | def set_panic_restart(bool_val): | ||
69 | '''Enable / disable restart on panics''' | ||
70 | if bool_val: | ||
71 | sys.stderr.write("Kernel will reboot after panic.\n") | ||
72 | dur = PANIC_DUR | ||
73 | else: | ||
74 | sys.stderr.write("Kernel will no longer reboot after panic.\n") | ||
75 | dur = 0 | ||
76 | |||
77 | check_output(['sysctl', '-w', "kernel.panic=%d" % dur, | ||
78 | "kernel.panic_on_oops=%d" % dur]) | ||
79 | |||
80 | def write_cron_data(data): | ||
81 | '''Write new crontab entry. No blank lines are written''' | ||
82 | |||
83 | # I don't know why "^\s*$" doesn't match, hence this ugly regex | ||
84 | data = re.sub(r"\n\s*\n", "\n", data, re.M) | ||
85 | |||
86 | sp = Popen(["crontab", "-"], stdin=PIPE) | ||
87 | stdout, stderr = sp.communicate(input=data) | ||
88 | |||
89 | def install_path(): | ||
90 | '''Place the current path in the crontab entry''' | ||
91 | data = get_cron_data() | ||
92 | curr_line = re.findall(r"PATH=.*", data) | ||
93 | |||
94 | if curr_line: | ||
95 | curr_paths = re.findall(r"((?:\/\w+)+)", curr_line[0]) | ||
96 | data = re.sub(curr_line[0], "", data) | ||
97 | else: | ||
98 | curr_paths = [] | ||
99 | curr_paths = set(curr_paths) | ||
100 | |||
101 | for path in os.environ["PATH"].split(os.pathsep): | ||
102 | curr_paths.add(path) | ||
103 | |||
104 | data = "PATH=" + os.pathsep.join(curr_paths) + "\n" + data | ||
105 | |||
106 | write_cron_data(data) | ||
107 | |||
108 | def install_boot_job(ignored_params, reboot_message): | ||
109 | '''Re-run the current python script on system reboot using crontab''' | ||
110 | remove_boot_job() | ||
111 | |||
112 | data = get_cron_data() | ||
113 | job = get_boot_cron(ignored_params, reboot_message) | ||
114 | |||
115 | set_panic_restart(True) | ||
116 | |||
117 | write_cron_data(data + job + "\n") | ||
118 | |||
119 | if job not in get_cron_data(): | ||
120 | raise IOError("Failed to write %s into cron!" % job) | ||
121 | else: | ||
122 | install_path() | ||
123 | |||
124 | def clean_output(): | ||
125 | fname = get_outfname() | ||
126 | if os.path.exists(fname): | ||
127 | os.remove(fname) | ||
128 | |||
129 | def kill_boot_job(): | ||
130 | remove_boot_job() | ||
131 | |||
132 | cmd = common.get_cmd() | ||
133 | |||
134 | procs = check_output("ps -eo pid,args".split(" ")) | ||
135 | pairs = re.findall("(\d+) (.*)", procs) | ||
136 | |||
137 | for pid, args in pairs: | ||
138 | if re.search(r"/bin/sh -c.*%s"%cmd, args): | ||
139 | sys.stderr.write("Killing job %s\n" % pid) | ||
140 | check_output(("kill -9 %s" % pid).split(" ")) | ||
141 | |||
142 | def remove_boot_job(): | ||
143 | '''Remove installed reboot job from crontab''' | ||
144 | data = get_cron_data() | ||
145 | regex = re.compile(r".*%s.*" % re.escape(common.get_cmd()), re.M) | ||
146 | |||
147 | if regex.search(data): | ||
148 | new_cron = regex.sub("", data) | ||
149 | write_cron_data(new_cron) | ||
150 | |||
151 | set_panic_restart(False) | ||
diff --git a/run/experiment.py b/run/experiment.py index 9a70414..da0e32e 100644 --- a/run/experiment.py +++ b/run/experiment.py | |||
@@ -35,6 +35,9 @@ class Experiment(object): | |||
35 | self.exec_err = None | 35 | self.exec_err = None |
36 | self.tracer_types = tracer_types | 36 | self.tracer_types = tracer_types |
37 | 37 | ||
38 | self.regular_tracers = [] | ||
39 | self.exact_tracers = [] | ||
40 | |||
38 | def __setup_tracers(self): | 41 | def __setup_tracers(self): |
39 | tracers = [ t(self.working_dir) for t in self.tracer_types ] | 42 | tracers = [ t(self.working_dir) for t in self.tracer_types ] |
40 | 43 | ||
@@ -55,8 +58,13 @@ class Experiment(object): | |||
55 | Experiment.INTERRUPTED_DIR) | 58 | Experiment.INTERRUPTED_DIR) |
56 | interrupted = "%s/%s" % (os.path.split(self.working_dir)[0], | 59 | interrupted = "%s/%s" % (os.path.split(self.working_dir)[0], |
57 | Experiment.INTERRUPTED_DIR) | 60 | Experiment.INTERRUPTED_DIR) |
61 | old_int = "%s/%s" % (self.working_dir, Experiment.INTERRUPTED_DIR) | ||
62 | |||
58 | if os.path.exists(interrupted): | 63 | if os.path.exists(interrupted): |
59 | sh.rmtree(interrupted) | 64 | sh.rmtree(interrupted) |
65 | if os.path.exists(old_int): | ||
66 | sh.rmtree(old_int) | ||
67 | |||
60 | os.rename(self.working_dir, interrupted) | 68 | os.rename(self.working_dir, interrupted) |
61 | 69 | ||
62 | os.mkdir(self.working_dir) | 70 | os.mkdir(self.working_dir) |
diff --git a/run_exps.py b/run_exps.py index 1bad2a3..21666a9 100755 --- a/run_exps.py +++ b/run_exps.py | |||
@@ -3,10 +3,12 @@ from __future__ import print_function | |||
3 | 3 | ||
4 | import common as com | 4 | import common as com |
5 | import os | 5 | import os |
6 | import pickle | ||
6 | import pprint | 7 | import pprint |
7 | import re | 8 | import re |
8 | import shutil | 9 | import shutil |
9 | import sys | 10 | import sys |
11 | import run.crontab as cron | ||
10 | import run.tracer as trace | 12 | import run.tracer as trace |
11 | 13 | ||
12 | from config.config import PARAMS,DEFAULTS,FILES | 14 | from config.config import PARAMS,DEFAULTS,FILES |
@@ -17,9 +19,6 @@ from run.executable.executable import Executable | |||
17 | from run.experiment import Experiment,ExperimentDone,SystemCorrupted | 19 | from run.experiment import Experiment,ExperimentDone,SystemCorrupted |
18 | from run.proc_entry import ProcEntry | 20 | from run.proc_entry import ProcEntry |
19 | 21 | ||
20 | '''Maximum times an experiment will be retried''' | ||
21 | MAX_RETRY = 5 | ||
22 | |||
23 | '''Customizable experiment parameters''' | 22 | '''Customizable experiment parameters''' |
24 | ExpParams = namedtuple('ExpParams', ['scheduler', 'duration', 'tracers', | 23 | ExpParams = namedtuple('ExpParams', ['scheduler', 'duration', 'tracers', |
25 | 'kernel', 'config_options', 'file_params', | 24 | 'kernel', 'config_options', 'file_params', |
@@ -31,6 +30,11 @@ ExpData = com.recordtype('ExpData', ['name', 'params', 'sched_file', 'out_dir', | |||
31 | '''Comparison of requested versus actual kernel compile parameter value''' | 30 | '''Comparison of requested versus actual kernel compile parameter value''' |
32 | ConfigResult = namedtuple('ConfigResult', ['param', 'wanted', 'actual']) | 31 | ConfigResult = namedtuple('ConfigResult', ['param', 'wanted', 'actual']) |
33 | 32 | ||
33 | '''Maximum times an experiment will be retried''' | ||
34 | MAX_RETRY = 5 | ||
35 | '''Location experiment retry count is stored''' | ||
36 | TRIES_FNAME = ".tries.pkl" | ||
37 | |||
34 | 38 | ||
35 | class InvalidKernel(Exception): | 39 | class InvalidKernel(Exception): |
36 | def __init__(self, kernel): | 40 | def __init__(self, kernel): |
@@ -88,6 +92,9 @@ def parse_args(): | |||
88 | action='store_true', default=False, | 92 | action='store_true', default=False, |
89 | help='use crontab to resume interrupted script after ' | 93 | help='use crontab to resume interrupted script after ' |
90 | 'system restarts. implies --retry') | 94 | 'system restarts. implies --retry') |
95 | group.add_option('-k', '--kill-crontab', dest='kill', | ||
96 | action='store_true', default=False, | ||
97 | help='kill existing script crontabs and exit') | ||
91 | parser.add_option_group(group) | 98 | parser.add_option_group(group) |
92 | 99 | ||
93 | return parser.parse_args() | 100 | return parser.parse_args() |
@@ -314,7 +321,7 @@ def run_experiment(data, start_message, ignore, jabber): | |||
314 | 321 | ||
315 | def make_paths(exp, opts, out_base_dir): | 322 | def make_paths(exp, opts, out_base_dir): |
316 | '''Translate experiment name to (schedule file, output directory) paths''' | 323 | '''Translate experiment name to (schedule file, output directory) paths''' |
317 | path = "%s/%s" % (os.getcwd(), exp) | 324 | path = os.path.abspath(exp) |
318 | out_dir = "%s/%s" % (out_base_dir, os.path.split(exp.strip('/'))[1]) | 325 | out_dir = "%s/%s" % (out_base_dir, os.path.split(exp.strip('/'))[1]) |
319 | 326 | ||
320 | if not os.path.exists(path): | 327 | if not os.path.exists(path): |
@@ -408,10 +415,35 @@ def setup_email(target): | |||
408 | return None | 415 | return None |
409 | 416 | ||
410 | 417 | ||
418 | def tries_file(exp): | ||
419 | return "%s/%s" % (os.path.split(exp.sched_file)[0], TRIES_FNAME) | ||
420 | |||
421 | |||
422 | def get_tries(exp): | ||
423 | if not os.path.exists(tries_file(exp)): | ||
424 | return 0 | ||
425 | with open(tries_file(exp), 'r') as f: | ||
426 | return int(pickle.load(f)) | ||
427 | |||
428 | |||
429 | def set_tries(exp, val): | ||
430 | if not val: | ||
431 | if os.path.exists(tries_file(exp)): | ||
432 | os.remove(tries_file(exp)) | ||
433 | else: | ||
434 | with open(tries_file(exp), 'w') as f: | ||
435 | pickle.dump(str(val), f) | ||
436 | os.system('sync') | ||
437 | |||
438 | |||
411 | def run_exps(exps, opts): | 439 | def run_exps(exps, opts): |
412 | jabber = setup_jabber(opts.jabber) if opts.jabber else None | 440 | jabber = setup_jabber(opts.jabber) if opts.jabber else None |
413 | 441 | ||
414 | exps_remaining = list(enumerate(exps)) | 442 | # Give each experiment a unique id |
443 | exps_remaining = enumerate(exps) | ||
444 | # But run experiments which have failed the most last | ||
445 | exps_remaining = sorted(exps_remaining, key=lambda x: get_tries(x[1])) | ||
446 | |||
415 | while exps_remaining: | 447 | while exps_remaining: |
416 | i, exp = exps_remaining.pop(0) | 448 | i, exp = exps_remaining.pop(0) |
417 | 449 | ||
@@ -419,17 +451,26 @@ def run_exps(exps, opts): | |||
419 | start_message = "%s experiment %d of %d." % (verb, i+1, len(exps)) | 451 | start_message = "%s experiment %d of %d." % (verb, i+1, len(exps)) |
420 | 452 | ||
421 | try: | 453 | try: |
454 | set_tries(exp, get_tries(exp) + 1) | ||
455 | if get_tries(exp) > MAX_RETRY: | ||
456 | raise Exception("Hit maximum retries of %d" % MAX_RETRY) | ||
457 | |||
422 | run_experiment(exp, start_message, opts.ignore, jabber) | 458 | run_experiment(exp, start_message, opts.ignore, jabber) |
459 | |||
460 | set_tries(exp, 0) | ||
423 | exp.state = ExpState.Succeeded | 461 | exp.state = ExpState.Succeeded |
424 | except KeyboardInterrupt: | 462 | except KeyboardInterrupt: |
425 | sys.stderr.write("Keyboard interrupt, quitting\n") | 463 | sys.stderr.write("Keyboard interrupt, quitting\n") |
464 | set_tries(exp, get_tries(exp) - 1) | ||
426 | break | 465 | break |
427 | except ExperimentDone: | 466 | except ExperimentDone: |
428 | sys.stderr.write("Experiment already completed at '%s'\n" % exp.out_dir) | 467 | sys.stderr.write("Experiment already completed at '%s'\n" % exp.out_dir) |
468 | set_tries(exp, 0) | ||
429 | exp.state = ExpState.Done | 469 | exp.state = ExpState.Done |
430 | except (InvalidKernel, InvalidConfig) as e: | 470 | except (InvalidKernel, InvalidConfig) as e: |
431 | sys.stderr.write("Invalid environment for experiment '%s'\n" % exp.name) | 471 | sys.stderr.write("Invalid environment for experiment '%s'\n" % exp.name) |
432 | sys.stderr.write("%s\n" % e) | 472 | sys.stderr.write("%s\n" % e) |
473 | set_tries(exp, get_tries(exp) - 1) | ||
433 | exp.state = ExpState.Invalid | 474 | exp.state = ExpState.Invalid |
434 | except SystemCorrupted as e: | 475 | except SystemCorrupted as e: |
435 | sys.stderr.write("System is corrupted! Fix state before continuing.\n") | 476 | sys.stderr.write("System is corrupted! Fix state before continuing.\n") |
@@ -445,17 +486,19 @@ def run_exps(exps, opts): | |||
445 | exp.state = ExpState.Failed | 486 | exp.state = ExpState.Failed |
446 | 487 | ||
447 | if exp.state is ExpState.Failed and opts.retry: | 488 | if exp.state is ExpState.Failed and opts.retry: |
448 | if exp.retries < MAX_RETRY: | 489 | exps_remaining += [(i, exp)] |
449 | exps_remaining += [(i, exp)] | 490 | |
450 | exp.retries += 1 | ||
451 | else: | ||
452 | sys.stderr.write("Hit maximum retries of %d\n" % MAX_RETRY) | ||
453 | 491 | ||
454 | def main(): | 492 | def main(): |
455 | opts, args = parse_args() | 493 | opts, args = parse_args() |
456 | 494 | ||
495 | if opts.kill: | ||
496 | cron.kill_boot_job() | ||
497 | sys.exit(1) | ||
498 | |||
457 | email = setup_email(opts.email) if opts.email else None | 499 | email = setup_email(opts.email) if opts.email else None |
458 | 500 | ||
501 | # Create base output directory for run data | ||
459 | out_base = os.path.abspath(opts.out_dir) | 502 | out_base = os.path.abspath(opts.out_dir) |
460 | created = False | 503 | created = False |
461 | if not os.path.exists(out_base): | 504 | if not os.path.exists(out_base): |
@@ -464,7 +507,24 @@ def main(): | |||
464 | 507 | ||
465 | exps = get_exps(opts, args, out_base) | 508 | exps = get_exps(opts, args, out_base) |
466 | 509 | ||
467 | run_exps(exps, opts) | 510 | if opts.crontab: |
511 | # Resume script on startup | ||
512 | opts.retry = True | ||
513 | cron.install_boot_job(['f', '--forced'], | ||
514 | "Stop with %s -k" % com.get_cmd()) | ||
515 | |||
516 | if opts.force or not opts.retry: | ||
517 | cron.clean_output() | ||
518 | for e in exps: | ||
519 | set_tries(e, 0) | ||
520 | |||
521 | try: | ||
522 | run_exps(exps, opts) | ||
523 | finally: | ||
524 | # Remove persistent state | ||
525 | for e in exps: | ||
526 | set_tries(e, 0) | ||
527 | cron.remove_boot_job() | ||
468 | 528 | ||
469 | def state_count(state): | 529 | def state_count(state): |
470 | return len(filter(lambda x: x.state is state, exps)) | 530 | return len(filter(lambda x: x.state is state, exps)) |