diff options
| author | Jonathan Herman <hermanjl@cs.unc.edu> | 2013-05-01 15:48:01 -0400 |
|---|---|---|
| committer | Jonathan Herman <hermanjl@cs.unc.edu> | 2013-05-01 15:48:01 -0400 |
| commit | cd9f1b026cc5c4526dfbd2f7b1c5f39edb6a7309 (patch) | |
| tree | 5b6221e55d7f50c88a574ed4f57ff7efd9b7103d /run | |
| parent | 94cc65997d237ddeab24d396f06bb93bc0644a9d (diff) | |
Added --crontab option to run_exps.py
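This installs a crontab @reboot entry which resumes the script once the
machine is back up, and configures the kernel to reboot automatically after
a panic, so experiments continue when the machine crashes. An additional
option, -k, is provided to cancel this operation.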
Diffstat (limited to 'run')
| -rw-r--r-- | run/crontab.py | 151 | ||||
| -rw-r--r-- | run/experiment.py | 8 |
2 files changed, 159 insertions, 0 deletions
diff --git a/run/crontab.py b/run/crontab.py new file mode 100644 index 0000000..87d71b1 --- /dev/null +++ b/run/crontab.py | |||
| @@ -0,0 +1,151 @@ | |||
| 1 | from __future__ import print_function | ||
| 2 | |||
| 3 | import common | ||
| 4 | import os | ||
| 5 | import re | ||
| 6 | import sys | ||
| 7 | |||
| 8 | from subprocess import Popen, PIPE, check_output | ||
| 9 | |||
# Value written to the kernel.panic sysctl when restart-on-panic is enabled
PANIC_DUR = 10
# Total number of seconds the reboot job waits before starting the script
DELAY = 30
# Number of seconds between the countdown messages sent during DELAY
DELAY_INTERVAL = 10
| 13 | |||
def get_cron_data():
    '''Return the output of `crontab -l`, or "" if it cannot be read.

    A non-zero exit status (e.g. the user has no crontab yet) and a missing
    crontab executable are both treated as an empty crontab.
    '''
    from subprocess import CalledProcessError

    try:
        return check_output(['crontab', '-l'])
    except (CalledProcessError, OSError):
        # Only swallow the expected failures; a bare except would also hide
        # KeyboardInterrupt and genuine programming errors.
        return ""
| 19 | |||
def wall(message):
    '''A wall command with no header

    Returns the shell command as a string. Single quotes inside message are
    escaped so that the text reaches wall exactly as given.
    '''
    # Inside a single-quoted shell word a quote cannot be backslash-escaped;
    # close the word, emit an escaped quote, then reopen it.
    quoted = message.replace("'", "'\\''")
    return "echo '%s' | wall -n" % quoted
| 23 | |||
def sanitize(args, ignored):
    '''Return a copy of args without the options listed in ignored.

    Arguments equal to an ignored entry are dropped. Every ignored entry is
    also removed from short-option clusters (a single leading '-'), so that
    e.g. '-fc' becomes '-c' when 'f' is ignored. A cluster which consisted
    only of ignored options is dropped entirely instead of leaving a bare
    '-' behind. Empty arguments are passed through untouched.
    '''
    ret_args = []
    for a in args:
        if a in ignored:
            continue
        # startswith() is safe for empty strings, unlike indexing a[0]
        if a.startswith('-') and not a.startswith('--'):
            stripped = a
            for i in ignored:
                stripped = stripped.replace(i, '')
            # Nothing but the dash is left: the whole cluster was ignored.
            # A literal '-' argument given by the user is still kept.
            if stripped == '-' and a != '-':
                continue
            a = stripped
        ret_args.append(a)
    return ret_args
| 34 | |||
def get_outfname():
    '''Name of the cron output file, derived from the command name'''
    cmd = common.get_cmd()
    return "cron-%s.txt" % cmd
| 37 | |||
def get_boot_cron(ignored_params, extra=""):
    '''Build the @reboot crontab line which re-runs the current script.

    ignored_params -- options removed from sys.argv via sanitize()
    extra          -- text appended to every countdown message

    The returned line changes into the current working directory, counts
    down DELAY seconds with wall messages, and then runs the job only if
    the running kernel matches common.kernel().
    '''
    argv = sanitize(sys.argv, ignored_params)
    full_job = " ".join(argv)
    log_name = get_outfname()

    # Shorter form of the job, used only in messages shown to the user
    display_job = " ".join([common.get_cmd()] + argv[1:])
    msg = "Job '%s' will write output to '%s'" % (display_job, log_name)

    sys.stderr.write("%s %d seconds after reboot.\n" % (msg, DELAY))

    # Sleep DELAY seconds in DELAY_INTERVAL sized steps, broadcasting the
    # remaining time before each step after the first
    steps = ["sleep %d" % DELAY_INTERVAL]
    for remaining in range(DELAY - DELAY_INTERVAL, 0, -DELAY_INTERVAL):
        notice = "Restarting experiments in %d seconds. %s" % (remaining, extra)
        steps.append(wall(notice))
        steps.append("sleep %d" % min(DELAY_INTERVAL, remaining))
    countdown = ";".join(steps)

    # The job is chained behind a check that `uname -r` matches the kernel
    # which was running when this entry was created.
    # NOTE(review): the trailing '||' branch also runs when the job itself
    # exits non-zero, so the kernel-mismatch message can appear after a
    # failed job -- confirm whether that is intended.
    kern = common.kernel()
    mismatch = wall("Need matching kernel '%s' to run!" % kern)
    guarded_job = "echo '%s' | grep -q `uname -r` && %s && %s && %s >> %s 2>>%s || %s" % (
        kern, wall(msg), wall("Starting..."), full_job, log_name, log_name,
        mismatch)

    return "@reboot cd %s; %s; %s;" % (os.getcwd(), countdown, guarded_job)
| 67 | |||
def set_panic_restart(bool_val):
    '''Enable / disable restart on panics

    When bool_val is true, kernel.panic is set to PANIC_DUR and
    kernel.panic_on_oops is switched on; otherwise both are reset to 0.
    Raises CalledProcessError if sysctl exits with a non-zero status.
    '''
    if bool_val:
        sys.stderr.write("Kernel will reboot after panic.\n")
        dur = PANIC_DUR
        on_oops = 1
    else:
        sys.stderr.write("Kernel will no longer reboot after panic.\n")
        dur = 0
        on_oops = 0

    # panic_on_oops is an on/off switch, not a duration, so it must be
    # written as 0 or 1 rather than as PANIC_DUR.
    check_output(['sysctl', '-w', "kernel.panic=%d" % dur,
                  "kernel.panic_on_oops=%d" % on_oops])
| 79 | |||
def write_cron_data(data):
    '''Write new crontab entry. No blank lines are written

    data replaces the whole crontab of the current user; it is piped to
    `crontab -` after whitespace-only lines have been removed.
    '''
    # The flags have to be passed by keyword: the fourth positional argument
    # of re.sub() is `count`, so passing re.M there silently limited the
    # number of substitutions and never enabled multi-line matching.
    data = re.sub(r"^\s*\n", "", data, flags=re.M)

    sp = Popen(["crontab", "-"], stdin=PIPE)
    sp.communicate(input=data)
| 88 | |||
def install_path():
    '''Place the current path in the crontab entry

    The directories of an existing PATH line in the crontab are kept, in
    order, and the directories of this process' PATH are appended behind
    them without duplicates. The merged PATH line is written as the first
    line of the crontab.
    '''
    data = get_cron_data()

    # Anchor at the start of a line so that e.g. LD_LIBRARY_PATH= is not
    # mistaken for the PATH setting
    match = re.search(r"^[ \t]*PATH=(.*)$", data, re.M)
    if match:
        # Split on the separator instead of guessing with a regex, so that
        # directories containing '-' or '.' survive intact
        paths = match.group(1).split(os.pathsep)
        # Cut the old line out by position; using its text as a regex
        # pattern would misbehave on paths with regex metacharacters
        data = data[:match.start()] + data[match.end():]
    else:
        paths = []
    paths += os.environ.get("PATH", "").split(os.pathsep)

    # The order of PATH entries is significant, so de-duplicate with a list
    # rather than a set
    merged = []
    for path in paths:
        path = path.strip()
        if path and path not in merged:
            merged.append(path)

    data = "PATH=" + os.pathsep.join(merged) + "\n" + data

    write_cron_data(data)
| 107 | |||
def install_boot_job(ignored_params, reboot_message):
    '''Re-run the current python script on system reboot using crontab

    ignored_params -- options which must not be passed to the re-run script
    reboot_message -- text added to the countdown messages after reboot

    Raises IOError if the entry is not present in the crontab afterwards.
    '''
    # Drop any entry left over from a previous installation first
    remove_boot_job()

    existing = get_cron_data()
    entry = get_boot_cron(ignored_params, reboot_message)

    set_panic_restart(True)

    write_cron_data(existing + entry + "\n")

    # Read the crontab back to make sure the entry was really stored
    if entry in get_cron_data():
        install_path()
    else:
        raise IOError("Failed to write %s into cron!" % entry)
| 123 | |||
def clean_output():
    '''Delete the cron output file if it exists'''
    target = get_outfname()
    if not os.path.exists(target):
        return
    os.remove(target)
| 128 | |||
def kill_boot_job():
    '''Remove the reboot job and kill the shells still running it

    Every process whose command line looks like "/bin/sh -c ... <command>"
    is sent SIGKILL. Processes which have already exited are skipped; any
    other failure to signal a process raises OSError.
    '''
    import errno
    import signal

    remove_boot_job()

    # Escape the command name, as remove_boot_job() does, so characters
    # such as '.' are matched literally
    job_re = re.compile(r"/bin/sh -c.*%s" % re.escape(common.get_cmd()))

    procs = check_output(["ps", "-eo", "pid,args"])

    for pid, args in re.findall(r"(\d+) (.*)", procs):
        if not job_re.search(args):
            continue
        sys.stderr.write("Killing job %s\n" % pid)
        try:
            os.kill(int(pid), signal.SIGKILL)
        except OSError as err:
            # The process may have exited since ps ran; keep going so the
            # remaining matches are still killed
            if err.errno != errno.ESRCH:
                raise
| 141 | |||
def remove_boot_job():
    '''Remove installed reboot job from crontab

    Every crontab line which mentions the command name is blanked out, and
    restart on panic is switched off again in all cases.
    '''
    crontab_text = get_cron_data()
    entry_re = re.compile(r".*%s.*" % re.escape(common.get_cmd()), re.M)

    # Only rewrite the crontab when there is actually something to remove
    if entry_re.search(crontab_text) is not None:
        write_cron_data(entry_re.sub("", crontab_text))

    set_panic_restart(False)
diff --git a/run/experiment.py b/run/experiment.py index 9a70414..da0e32e 100644 --- a/run/experiment.py +++ b/run/experiment.py | |||
| @@ -35,6 +35,9 @@ class Experiment(object): | |||
| 35 | self.exec_err = None | 35 | self.exec_err = None |
| 36 | self.tracer_types = tracer_types | 36 | self.tracer_types = tracer_types |
| 37 | 37 | ||
| 38 | self.regular_tracers = [] | ||
| 39 | self.exact_tracers = [] | ||
| 40 | |||
| 38 | def __setup_tracers(self): | 41 | def __setup_tracers(self): |
| 39 | tracers = [ t(self.working_dir) for t in self.tracer_types ] | 42 | tracers = [ t(self.working_dir) for t in self.tracer_types ] |
| 40 | 43 | ||
| @@ -55,8 +58,13 @@ class Experiment(object): | |||
| 55 | Experiment.INTERRUPTED_DIR) | 58 | Experiment.INTERRUPTED_DIR) |
| 56 | interrupted = "%s/%s" % (os.path.split(self.working_dir)[0], | 59 | interrupted = "%s/%s" % (os.path.split(self.working_dir)[0], |
| 57 | Experiment.INTERRUPTED_DIR) | 60 | Experiment.INTERRUPTED_DIR) |
| 61 | old_int = "%s/%s" % (self.working_dir, Experiment.INTERRUPTED_DIR) | ||
| 62 | |||
| 58 | if os.path.exists(interrupted): | 63 | if os.path.exists(interrupted): |
| 59 | sh.rmtree(interrupted) | 64 | sh.rmtree(interrupted) |
| 65 | if os.path.exists(old_int): | ||
| 66 | sh.rmtree(old_int) | ||
| 67 | |||
| 60 | os.rename(self.working_dir, interrupted) | 68 | os.rename(self.working_dir, interrupted) |
| 61 | 69 | ||
| 62 | os.mkdir(self.working_dir) | 70 | os.mkdir(self.working_dir) |
