diff options
author | Jonathan Herman <hermanjl@cs.unc.edu> | 2013-05-01 15:48:01 -0400 |
---|---|---|
committer | Jonathan Herman <hermanjl@cs.unc.edu> | 2013-05-01 15:48:01 -0400 |
commit | cd9f1b026cc5c4526dfbd2f7b1c5f39edb6a7309 (patch) | |
tree | 5b6221e55d7f50c88a574ed4f57ff7efd9b7103d /run/crontab.py | |
parent | 94cc65997d237ddeab24d396f06bb93bc0644a9d (diff) |
Added --crontab option to run_exps.py
This will use crontab to automatically restart the machine and resume
the script when the machine crashes. An additional option, -k, is provided
to cancel this operation.
Diffstat (limited to 'run/crontab.py')
-rw-r--r-- | run/crontab.py | 151 |
1 files changed, 151 insertions, 0 deletions
diff --git a/run/crontab.py b/run/crontab.py new file mode 100644 index 0000000..87d71b1 --- /dev/null +++ b/run/crontab.py | |||
@@ -0,0 +1,151 @@ | |||
1 | from __future__ import print_function | ||
2 | |||
3 | import common | ||
4 | import os | ||
5 | import re | ||
6 | import sys | ||
7 | |||
8 | from subprocess import Popen, PIPE, check_output | ||
9 | |||
# Value written to the kernel.panic sysctl when reboot-on-panic is enabled
# (see set_panic_restart).
PANIC_DUR = 10
# Seconds the @reboot entry waits after boot before re-running the job
# (see get_boot_cron).
DELAY = 30
# Seconds between the countdown wall messages emitted during that wait.
DELAY_INTERVAL = 10
13 | |||
def get_cron_data():
    '''Return the output of `crontab -l`, or "" if it cannot be read.

    A non-zero exit status (for example, the user has no crontab yet) or a
    `crontab` executable that cannot be launched is treated as an empty
    crontab. Any other exception propagates to the caller.
    '''
    # Imported locally: the module header only pulls in Popen, PIPE and
    # check_output from subprocess.
    from subprocess import CalledProcessError

    try:
        return check_output(['crontab', '-l'])
    except (CalledProcessError, OSError):
        # CalledProcessError: crontab exited with a non-zero status.
        # OSError: the crontab executable could not be started.
        return ""
19 | |||
def wall(message):
    '''Return a shell command that broadcasts `message` with a header-less wall.

    The message is wrapped in single quotes for the shell. Embedded single
    quotes are escaped so they are printed literally instead of terminating
    the quoted string.
    '''
    # Nothing can be escaped inside single quotes, so close the quote, emit an
    # escaped quote, and reopen it:  '  ->  '\''
    escaped = message.replace("'", "'\\''")
    return "echo '%s' | wall -n" % escaped
23 | |||
def sanitize(args, ignored):
    '''Return a copy of `args` with every ignored option removed.

    args    -- list of command-line arguments (e.g. sys.argv)
    ignored -- strings to drop. An argument equal to one of them is removed
               entirely; each is also deleted from inside arguments that
               start with a single '-' (bundled short options), so ignoring
               'f' turns '-fc' into '-c'.

    Arguments starting with '--' and arguments without a leading '-' are only
    removed on an exact match. Empty-string arguments pass through unchanged.
    '''
    ret_args = []
    for arg in args:
        if arg in ignored:
            continue
        # startswith() rather than arg[0], so an empty argument cannot raise
        # IndexError.
        if arg.startswith('-') and not arg.startswith('--'):
            for ign in ignored:
                arg = arg.replace(ign, '')
        ret_args.append(arg)
    return ret_args
34 | |||
def get_outfname():
    '''Name of the file that receives the output of the cron-launched job.'''
    cmd_name = common.get_cmd()
    return "cron-%s.txt" % cmd_name
37 | |||
def get_boot_cron(ignored_params, extra=""):
    '''Turn the current python script into a crontab @reboot entry.

    ignored_params -- options stripped from sys.argv (see sanitize) before the
                      command line is written into the entry
    extra          -- text appended to every countdown message

    Returns one crontab line which, at boot, changes to the current working
    directory, counts down DELAY seconds with wall messages, and then re-runs
    this script with stdout and stderr appended to get_outfname(). The job is
    only started when `uname -r` matches the kernel reported by
    common.kernel(); otherwise a wall message says so.
    '''
    job_args = sanitize(sys.argv, ignored_params)
    job = " ".join(job_args)
    out_fname = get_outfname()

    short_job = " ".join([common.get_cmd()] + job_args[1:])
    msg = "Job '%s' will write output to '%s'" % (short_job, out_fname)

    sys.stderr.write("%s %d seconds after reboot.\n" % (msg, DELAY))

    # Create sleep and wall commands which will countdown DELAY seconds
    # before executing the job. The first sleep is capped at DELAY so the
    # total wait never exceeds what the message above promised.
    cmds = ["sleep %d" % min(DELAY_INTERVAL, DELAY)]
    delay_rem = DELAY - DELAY_INTERVAL
    while delay_rem > 0:
        wmsg = "Restarting experiments in %d seconds. %s" % (delay_rem, extra)
        cmds += [wall(wmsg)]
        cmds += ["sleep %d" % min(DELAY_INTERVAL, delay_rem)]
        delay_rem -= DELAY_INTERVAL
    delay_cmd = ";".join(cmds)

    # Create command which will only execute if the same kernel is running.
    # An explicit if/else is used instead of `check && job || fail`: with the
    # latter, the "need matching kernel" message would also be broadcast
    # whenever the job itself exited with a non-zero status.
    kern = common.kernel()
    fail_wall = wall("Need matching kernel '%s' to run!" % kern)
    kern_check = "echo '%s' | grep -q `uname -r`" % kern
    start_cmd = "%s && %s && %s >> %s 2>>%s" % \
        (wall(msg), wall("Starting..."), job, out_fname, out_fname)
    run_cmd = "if %s; then %s; else %s; fi" % (kern_check, start_cmd, fail_wall)

    return "@reboot cd %s; %s; %s;" % (os.getcwd(), delay_cmd, run_cmd)
67 | |||
def set_panic_restart(bool_val):
    '''Enable / disable restart on panics.

    bool_val -- True:  set kernel.panic to PANIC_DUR and turn
                       kernel.panic_on_oops on
                False: set both back to 0

    Raises subprocess.CalledProcessError if `sysctl -w` exits non-zero.
    '''
    if bool_val:
        sys.stderr.write("Kernel will reboot after panic.\n")
        dur = PANIC_DUR
        on_oops = 1
    else:
        sys.stderr.write("Kernel will no longer reboot after panic.\n")
        dur = 0
        on_oops = 0

    # kernel.panic takes a timeout, while kernel.panic_on_oops is an on/off
    # switch, so it is given 0/1 rather than the duration.
    check_output(['sysctl', '-w', "kernel.panic=%d" % dur,
                  "kernel.panic_on_oops=%d" % on_oops])
79 | |||
def write_cron_data(data):
    '''Write new crontab contents. No blank lines are written.

    data -- the complete crontab text; it replaces the user's current crontab
            by being piped to `crontab -`
    '''
    # Drop empty and whitespace-only lines. The flag must be passed by
    # keyword: the fourth positional parameter of re.sub is `count`, so a
    # positional re.M would be read as "replace at most 8 times" and the
    # multiline flag would never take effect.
    data = re.sub(r"^\s*\n", "", data, flags=re.M)

    sp = Popen(["crontab", "-"], stdin=PIPE)
    sp.communicate(input=data)
88 | |||
def install_path():
    '''Place the current path in the crontab entry.

    Merges the directories in this process's PATH with those of any PATH=
    lines already in the crontab (environment entries first, duplicates and
    empty entries dropped, order otherwise preserved), removes the old PATH=
    lines, and writes the merged value as the first line of the crontab.
    '''
    data = get_cron_data()

    # Anchored to the start of a line so that e.g. LD_LIBRARY_PATH=... or a
    # PATH= inside a job's command is not mistaken for the PATH setting.
    path_re = re.compile(r"^PATH=(.*)$", re.M)

    cron_paths = []
    for value in path_re.findall(data):
        # Split on the path separator rather than matching \w+, which would
        # truncate directories containing '-' or '.'.
        cron_paths += value.strip().strip("\"'").split(os.pathsep)
    data = path_re.sub("", data)

    env_paths = os.environ.get("PATH", "").split(os.pathsep)

    # Order-preserving de-duplication: lookup order in PATH is significant,
    # so a plain set cannot be used to merge the two lists.
    merged = []
    seen = set()
    for path in env_paths + cron_paths:
        if path and path not in seen:
            seen.add(path)
            merged.append(path)

    data = "PATH=" + os.pathsep.join(merged) + "\n" + data

    write_cron_data(data)
107 | |||
def install_boot_job(ignored_params, reboot_message):
    '''Use crontab to re-run the current python script on system reboot.

    ignored_params -- options to strip from the command line stored in cron
    reboot_message -- extra text for the countdown messages shown after boot

    Raises IOError when the new entry is not found in the crontab after
    writing it.
    '''
    # Drop any entry left over from a previous run before adding a new one.
    remove_boot_job()

    existing = get_cron_data()
    entry = get_boot_cron(ignored_params, reboot_message)

    set_panic_restart(True)

    write_cron_data(existing + entry + "\n")

    # Read the crontab back to confirm the entry was stored.
    if entry not in get_cron_data():
        raise IOError("Failed to write %s into cron!" % entry)

    install_path()
123 | |||
def clean_output():
    '''Delete the cron job's output file, if there is one.'''
    out_path = get_outfname()
    if not os.path.exists(out_path):
        return
    os.remove(out_path)
128 | |||
def kill_boot_job():
    '''Remove the reboot job from crontab and kill running instances of it.

    A running instance is any process listed by `ps -eo pid,args` whose
    command line contains "/bin/sh -c" followed, anywhere later on the line,
    by this command's name. Each match is sent SIGKILL. A kill that fails is
    reported on stderr and does not stop the remaining ones.
    '''
    # Imported locally: the module header does not import signal.
    import signal

    remove_boot_job()

    # Escape the name so characters such as '.' match literally, the same way
    # remove_boot_job builds its pattern.
    job_re = re.compile(r"/bin/sh -c.*%s" % re.escape(common.get_cmd()))

    procs = check_output(["ps", "-eo", "pid,args"])
    for pid, args in re.findall(r"(\d+) (.*)", procs):
        if not job_re.search(args):
            continue
        sys.stderr.write("Killing job %s\n" % pid)
        try:
            os.kill(int(pid), signal.SIGKILL)
        except OSError as err:
            # The process may have exited since ps ran; keep going so the
            # other matches are still killed.
            sys.stderr.write("Could not kill %s: %s\n" % (pid, err))
141 | |||
def remove_boot_job():
    '''Take this command's reboot job out of crontab and turn off panic restart.'''
    crontab = get_cron_data()
    pattern = r".*%s.*" % re.escape(common.get_cmd())
    entry_re = re.compile(pattern, re.M)

    # Rewrite the crontab only when a line mentioning this command exists.
    if entry_re.search(crontab):
        write_cron_data(entry_re.sub("", crontab))

    set_panic_restart(False)