aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJonathan Herman <hermanjl@cs.unc.edu>2013-05-01 15:48:01 -0400
committerJonathan Herman <hermanjl@cs.unc.edu>2013-05-01 15:48:01 -0400
commitcd9f1b026cc5c4526dfbd2f7b1c5f39edb6a7309 (patch)
tree5b6221e55d7f50c88a574ed4f57ff7efd9b7103d
parent94cc65997d237ddeab24d396f06bb93bc0644a9d (diff)
Added --crontab option to run_exps.py
This will use crontab to automatically restart the machine and resume the script when the machine crashes. An additional option, -k, is provided to cancel this operation.
-rw-r--r--common.py5
-rw-r--r--run/crontab.py151
-rw-r--r--run/experiment.py8
-rwxr-xr-xrun_exps.py82
4 files changed, 234 insertions, 12 deletions
diff --git a/common.py b/common.py
index 7abf0f2..9ea2915 100644
--- a/common.py
+++ b/common.py
@@ -182,7 +182,7 @@ def ft_freq():
182 182
183 183
184def kernel(): 184def kernel():
185 return subprocess.check_output(["uname", "-r"]) 185 return subprocess.check_output(["uname", "-r"]).strip("\n")
186 186
187def is_executable(fname): 187def is_executable(fname):
188 '''Return whether the file passed in is executable''' 188 '''Return whether the file passed in is executable'''
@@ -210,3 +210,6 @@ def log_once(id, msg = None, indent = True):
210 if indent: 210 if indent:
211 msg = ' ' + msg.strip('\t').replace('\n', '\n\t') 211 msg = ' ' + msg.strip('\t').replace('\n', '\n\t')
212 sys.stderr.write('\n' + msg.strip('\n') + '\n') 212 sys.stderr.write('\n' + msg.strip('\n') + '\n')
213
214def get_cmd():
215 return os.path.split(sys.argv[0])[1]
diff --git a/run/crontab.py b/run/crontab.py
new file mode 100644
index 0000000..87d71b1
--- /dev/null
+++ b/run/crontab.py
@@ -0,0 +1,151 @@
1from __future__ import print_function
2
3import common
4import os
5import re
6import sys
7
8from subprocess import Popen, PIPE, check_output
9
10PANIC_DUR = 10
11DELAY = 30
12DELAY_INTERVAL = 10
13
14def get_cron_data():
15 try:
16 return check_output(['crontab', '-l'])
17 except:
18 return ""
19
20def wall(message):
21 '''A wall command with no header'''
22 return "echo '%s' | wall -n" % message
23
24def sanitize(args, ignored):
25 ret_args = []
26 for a in args:
27 if a in ignored:
28 continue
29 if '-' == a[0] and '--' != a[0:2]:
30 for i in ignored:
31 a = a.replace(i, '')
32 ret_args += [a]
33 return ret_args
34
35def get_outfname():
36 return "cron-%s.txt" % common.get_cmd()
37
38def get_boot_cron(ignored_params, extra=""):
39 '''Turn current python script into a crontab reboot entry'''
40 job_args = sanitize(sys.argv, ignored_params)
41 job = " ".join(job_args)
42 out_fname = get_outfname()
43
44 short_job = " ".join([common.get_cmd()] + job_args[1:])
45 msg = "Job '%s' will write output to '%s'" % (short_job, out_fname)
46
47 sys.stderr.write("%s %d seconds after reboot.\n" % (msg, DELAY))
48
49 # Create sleep and wall commands which will countdown DELAY seconds
50 # before executing the job
51 cmds = ["sleep %d" % DELAY_INTERVAL]
52 delay_rem = DELAY - DELAY_INTERVAL
53 while delay_rem > 0:
54 wmsg = "Restarting experiments in %d seconds. %s" % (delay_rem, extra)
55 cmds += [wall(wmsg)]
56 cmds += ["sleep %d" % min(DELAY_INTERVAL, delay_rem)]
57 delay_rem -= DELAY_INTERVAL
58 delay_cmd = ";".join(cmds)
59
60 # Create command which will only execute if the same kernel is running
61 kern = common.kernel()
62 fail_wall = wall("Need matching kernel '%s' to run!" % kern)
63 run_cmd = "echo '%s' | grep -q `uname -r` && %s && %s && %s >> %s 2>>%s || %s" %\
64 (kern, wall(msg), wall("Starting..."), job, out_fname, out_fname, fail_wall)
65
66 return "@reboot cd %s; %s; %s;" % (os.getcwd(), delay_cmd, run_cmd)
67
68def set_panic_restart(bool_val):
69 '''Enable / disable restart on panics'''
70 if bool_val:
71 sys.stderr.write("Kernel will reboot after panic.\n")
72 dur = PANIC_DUR
73 else:
74 sys.stderr.write("Kernel will no longer reboot after panic.\n")
75 dur = 0
76
77 check_output(['sysctl', '-w', "kernel.panic=%d" % dur,
78 "kernel.panic_on_oops=%d" % dur])
79
80def write_cron_data(data):
81 '''Write new crontab entry. No blank lines are written'''
82
83 # I don't know why "^\s*$" doesn't match, hence this ugly regex
84 data = re.sub(r"\n\s*\n", "\n", data, re.M)
85
86 sp = Popen(["crontab", "-"], stdin=PIPE)
87 stdout, stderr = sp.communicate(input=data)
88
89def install_path():
90 '''Place the current path in the crontab entry'''
91 data = get_cron_data()
92 curr_line = re.findall(r"PATH=.*", data)
93
94 if curr_line:
95 curr_paths = re.findall(r"((?:\/\w+)+)", curr_line[0])
96 data = re.sub(curr_line[0], "", data)
97 else:
98 curr_paths = []
99 curr_paths = set(curr_paths)
100
101 for path in os.environ["PATH"].split(os.pathsep):
102 curr_paths.add(path)
103
104 data = "PATH=" + os.pathsep.join(curr_paths) + "\n" + data
105
106 write_cron_data(data)
107
108def install_boot_job(ignored_params, reboot_message):
109 '''Re-run the current python script on system reboot using crontab'''
110 remove_boot_job()
111
112 data = get_cron_data()
113 job = get_boot_cron(ignored_params, reboot_message)
114
115 set_panic_restart(True)
116
117 write_cron_data(data + job + "\n")
118
119 if job not in get_cron_data():
120 raise IOError("Failed to write %s into cron!" % job)
121 else:
122 install_path()
123
124def clean_output():
125 fname = get_outfname()
126 if os.path.exists(fname):
127 os.remove(fname)
128
129def kill_boot_job():
130 remove_boot_job()
131
132 cmd = common.get_cmd()
133
134 procs = check_output("ps -eo pid,args".split(" "))
135 pairs = re.findall("(\d+) (.*)", procs)
136
137 for pid, args in pairs:
138 if re.search(r"/bin/sh -c.*%s"%cmd, args):
139 sys.stderr.write("Killing job %s\n" % pid)
140 check_output(("kill -9 %s" % pid).split(" "))
141
142def remove_boot_job():
143 '''Remove installed reboot job from crontab'''
144 data = get_cron_data()
145 regex = re.compile(r".*%s.*" % re.escape(common.get_cmd()), re.M)
146
147 if regex.search(data):
148 new_cron = regex.sub("", data)
149 write_cron_data(new_cron)
150
151 set_panic_restart(False)
diff --git a/run/experiment.py b/run/experiment.py
index 9a70414..da0e32e 100644
--- a/run/experiment.py
+++ b/run/experiment.py
@@ -35,6 +35,9 @@ class Experiment(object):
35 self.exec_err = None 35 self.exec_err = None
36 self.tracer_types = tracer_types 36 self.tracer_types = tracer_types
37 37
38 self.regular_tracers = []
39 self.exact_tracers = []
40
38 def __setup_tracers(self): 41 def __setup_tracers(self):
39 tracers = [ t(self.working_dir) for t in self.tracer_types ] 42 tracers = [ t(self.working_dir) for t in self.tracer_types ]
40 43
@@ -55,8 +58,13 @@ class Experiment(object):
55 Experiment.INTERRUPTED_DIR) 58 Experiment.INTERRUPTED_DIR)
56 interrupted = "%s/%s" % (os.path.split(self.working_dir)[0], 59 interrupted = "%s/%s" % (os.path.split(self.working_dir)[0],
57 Experiment.INTERRUPTED_DIR) 60 Experiment.INTERRUPTED_DIR)
61 old_int = "%s/%s" % (self.working_dir, Experiment.INTERRUPTED_DIR)
62
58 if os.path.exists(interrupted): 63 if os.path.exists(interrupted):
59 sh.rmtree(interrupted) 64 sh.rmtree(interrupted)
65 if os.path.exists(old_int):
66 sh.rmtree(old_int)
67
60 os.rename(self.working_dir, interrupted) 68 os.rename(self.working_dir, interrupted)
61 69
62 os.mkdir(self.working_dir) 70 os.mkdir(self.working_dir)
diff --git a/run_exps.py b/run_exps.py
index 1bad2a3..21666a9 100755
--- a/run_exps.py
+++ b/run_exps.py
@@ -3,10 +3,12 @@ from __future__ import print_function
3 3
4import common as com 4import common as com
5import os 5import os
6import pickle
6import pprint 7import pprint
7import re 8import re
8import shutil 9import shutil
9import sys 10import sys
11import run.crontab as cron
10import run.tracer as trace 12import run.tracer as trace
11 13
12from config.config import PARAMS,DEFAULTS,FILES 14from config.config import PARAMS,DEFAULTS,FILES
@@ -17,9 +19,6 @@ from run.executable.executable import Executable
17from run.experiment import Experiment,ExperimentDone,SystemCorrupted 19from run.experiment import Experiment,ExperimentDone,SystemCorrupted
18from run.proc_entry import ProcEntry 20from run.proc_entry import ProcEntry
19 21
20'''Maximum times an experiment will be retried'''
21MAX_RETRY = 5
22
23'''Customizable experiment parameters''' 22'''Customizable experiment parameters'''
24ExpParams = namedtuple('ExpParams', ['scheduler', 'duration', 'tracers', 23ExpParams = namedtuple('ExpParams', ['scheduler', 'duration', 'tracers',
25 'kernel', 'config_options', 'file_params', 24 'kernel', 'config_options', 'file_params',
@@ -31,6 +30,11 @@ ExpData = com.recordtype('ExpData', ['name', 'params', 'sched_file', 'out_dir',
31'''Comparison of requested versus actual kernel compile parameter value''' 30'''Comparison of requested versus actual kernel compile parameter value'''
32ConfigResult = namedtuple('ConfigResult', ['param', 'wanted', 'actual']) 31ConfigResult = namedtuple('ConfigResult', ['param', 'wanted', 'actual'])
33 32
33'''Maximum times an experiment will be retried'''
34MAX_RETRY = 5
35'''Location experiment retry count is stored'''
36TRIES_FNAME = ".tries.pkl"
37
34 38
35class InvalidKernel(Exception): 39class InvalidKernel(Exception):
36 def __init__(self, kernel): 40 def __init__(self, kernel):
@@ -88,6 +92,9 @@ def parse_args():
88 action='store_true', default=False, 92 action='store_true', default=False,
89 help='use crontab to resume interrupted script after ' 93 help='use crontab to resume interrupted script after '
90 'system restarts. implies --retry') 94 'system restarts. implies --retry')
95 group.add_option('-k', '--kill-crontab', dest='kill',
96 action='store_true', default=False,
97 help='kill existing script crontabs and exit')
91 parser.add_option_group(group) 98 parser.add_option_group(group)
92 99
93 return parser.parse_args() 100 return parser.parse_args()
@@ -314,7 +321,7 @@ def run_experiment(data, start_message, ignore, jabber):
314 321
315def make_paths(exp, opts, out_base_dir): 322def make_paths(exp, opts, out_base_dir):
316 '''Translate experiment name to (schedule file, output directory) paths''' 323 '''Translate experiment name to (schedule file, output directory) paths'''
317 path = "%s/%s" % (os.getcwd(), exp) 324 path = os.path.abspath(exp)
318 out_dir = "%s/%s" % (out_base_dir, os.path.split(exp.strip('/'))[1]) 325 out_dir = "%s/%s" % (out_base_dir, os.path.split(exp.strip('/'))[1])
319 326
320 if not os.path.exists(path): 327 if not os.path.exists(path):
@@ -408,10 +415,35 @@ def setup_email(target):
408 return None 415 return None
409 416
410 417
418def tries_file(exp):
419 return "%s/%s" % (os.path.split(exp.sched_file)[0], TRIES_FNAME)
420
421
422def get_tries(exp):
423 if not os.path.exists(tries_file(exp)):
424 return 0
425 with open(tries_file(exp), 'r') as f:
426 return int(pickle.load(f))
427
428
429def set_tries(exp, val):
430 if not val:
431 if os.path.exists(tries_file(exp)):
432 os.remove(tries_file(exp))
433 else:
434 with open(tries_file(exp), 'w') as f:
435 pickle.dump(str(val), f)
436 os.system('sync')
437
438
411def run_exps(exps, opts): 439def run_exps(exps, opts):
412 jabber = setup_jabber(opts.jabber) if opts.jabber else None 440 jabber = setup_jabber(opts.jabber) if opts.jabber else None
413 441
414 exps_remaining = list(enumerate(exps)) 442 # Give each experiment a unique id
443 exps_remaining = enumerate(exps)
444 # But run experiments which have failed the most last
445 exps_remaining = sorted(exps_remaining, key=lambda x: get_tries(x[1]))
446
415 while exps_remaining: 447 while exps_remaining:
416 i, exp = exps_remaining.pop(0) 448 i, exp = exps_remaining.pop(0)
417 449
@@ -419,17 +451,26 @@ def run_exps(exps, opts):
419 start_message = "%s experiment %d of %d." % (verb, i+1, len(exps)) 451 start_message = "%s experiment %d of %d." % (verb, i+1, len(exps))
420 452
421 try: 453 try:
454 set_tries(exp, get_tries(exp) + 1)
455 if get_tries(exp) > MAX_RETRY:
456 raise Exception("Hit maximum retries of %d" % MAX_RETRY)
457
422 run_experiment(exp, start_message, opts.ignore, jabber) 458 run_experiment(exp, start_message, opts.ignore, jabber)
459
460 set_tries(exp, 0)
423 exp.state = ExpState.Succeeded 461 exp.state = ExpState.Succeeded
424 except KeyboardInterrupt: 462 except KeyboardInterrupt:
425 sys.stderr.write("Keyboard interrupt, quitting\n") 463 sys.stderr.write("Keyboard interrupt, quitting\n")
464 set_tries(exp, get_tries(exp) - 1)
426 break 465 break
427 except ExperimentDone: 466 except ExperimentDone:
428 sys.stderr.write("Experiment already completed at '%s'\n" % exp.out_dir) 467 sys.stderr.write("Experiment already completed at '%s'\n" % exp.out_dir)
468 set_tries(exp, 0)
429 exp.state = ExpState.Done 469 exp.state = ExpState.Done
430 except (InvalidKernel, InvalidConfig) as e: 470 except (InvalidKernel, InvalidConfig) as e:
431 sys.stderr.write("Invalid environment for experiment '%s'\n" % exp.name) 471 sys.stderr.write("Invalid environment for experiment '%s'\n" % exp.name)
432 sys.stderr.write("%s\n" % e) 472 sys.stderr.write("%s\n" % e)
473 set_tries(exp, get_tries(exp) - 1)
433 exp.state = ExpState.Invalid 474 exp.state = ExpState.Invalid
434 except SystemCorrupted as e: 475 except SystemCorrupted as e:
435 sys.stderr.write("System is corrupted! Fix state before continuing.\n") 476 sys.stderr.write("System is corrupted! Fix state before continuing.\n")
@@ -445,17 +486,19 @@ def run_exps(exps, opts):
445 exp.state = ExpState.Failed 486 exp.state = ExpState.Failed
446 487
447 if exp.state is ExpState.Failed and opts.retry: 488 if exp.state is ExpState.Failed and opts.retry:
448 if exp.retries < MAX_RETRY: 489 exps_remaining += [(i, exp)]
449 exps_remaining += [(i, exp)] 490
450 exp.retries += 1
451 else:
452 sys.stderr.write("Hit maximum retries of %d\n" % MAX_RETRY)
453 491
454def main(): 492def main():
455 opts, args = parse_args() 493 opts, args = parse_args()
456 494
495 if opts.kill:
496 cron.kill_boot_job()
497 sys.exit(1)
498
457 email = setup_email(opts.email) if opts.email else None 499 email = setup_email(opts.email) if opts.email else None
458 500
501 # Create base output directory for run data
459 out_base = os.path.abspath(opts.out_dir) 502 out_base = os.path.abspath(opts.out_dir)
460 created = False 503 created = False
461 if not os.path.exists(out_base): 504 if not os.path.exists(out_base):
@@ -464,7 +507,24 @@ def main():
464 507
465 exps = get_exps(opts, args, out_base) 508 exps = get_exps(opts, args, out_base)
466 509
467 run_exps(exps, opts) 510 if opts.crontab:
511 # Resume script on startup
512 opts.retry = True
513 cron.install_boot_job(['f', '--forced'],
514 "Stop with %s -k" % com.get_cmd())
515
516 if opts.force or not opts.retry:
517 cron.clean_output()
518 for e in exps:
519 set_tries(e, 0)
520
521 try:
522 run_exps(exps, opts)
523 finally:
524 # Remove persistent state
525 for e in exps:
526 set_tries(e, 0)
527 cron.remove_boot_job()
468 528
469 def state_count(state): 529 def state_count(state):
470 return len(filter(lambda x: x.state is state, exps)) 530 return len(filter(lambda x: x.state is state, exps))