Added --crontab option to run_exps.py

This will use crontab to automatically restart the machine and resume the script when the machine crashes. An additional option, -k, is provided to cancel this operation.
author: Jonathan Herman <hermanjl@cs.unc.edu> 2013-05-01 15:48:01 -0400
committer: Jonathan Herman <hermanjl@cs.unc.edu> 2013-05-01 15:48:01 -0400
commit: cd9f1b026cc5c4526dfbd2f7b1c5f39edb6a7309 (patch)
tree: 5b6221e55d7f50c88a574ed4f57ff7efd9b7103d
parent: 94cc65997d237ddeab24d396f06bb93bc0644a9d (diff)
4 files changed, 234 insertions, 12 deletions
diff --git a/common.py b/common.py
index 7abf0f2..9ea2915 100644
--- a/common.py
+++ b/common.py
@@ -182,7 +182,7 @@ def ft_freq():
 def kernel():
-    return subprocess.check_output(["uname", "-r"])
+    return subprocess.check_output(["uname", "-r"]).strip("\n")
 def is_executable(fname):
    '''Return whether the file passed in is executable'''
@@ -210,3 +210,6 @@ def log_once(id, msg = None, indent = True):
        if indent:
            msg = '   ' + msg.strip('\t').replace('\n', '\n\t')
        sys.stderr.write('\n' + msg.strip('\n') + '\n')
+def get_cmd():
+    return os.path.split(sys.argv[0])[1]
diff --git a/run/crontab.py b/run/crontab.py
new file mode 100644
index 0000000..87d71b1
--- /dev/null
+++ b/run/crontab.py
@@ -0,0 +1,151 @@
+from __future__ import print_function
+import common
+import os
+import re
+import sys
+from subprocess import Popen, PIPE, check_output
+PANIC_DUR = 10
+DELAY = 30
+DELAY_INTERVAL = 10
+def get_cron_data():
+    try:
+        return check_output(['crontab', '-l'])
+    except:
+        return ""
+def wall(message):
+    '''A wall command with no header'''
+    return "echo '%s' | wall -n" % message
+def sanitize(args, ignored):
+    ret_args = []
+    for a in args:
+        if a in ignored:
+            continue
+        if '-' == a[0] and '--' != a[0:2]:
+            for i in ignored:
+                a = a.replace(i, '')
+        ret_args += [a]
+    return ret_args
+def get_outfname():
+    return "cron-%s.txt" % common.get_cmd()
+def get_boot_cron(ignored_params, extra=""):
+    '''Turn current python script into a crontab reboot entry'''
+    job_args = sanitize(sys.argv, ignored_params)
+    job = " ".join(job_args)
+    out_fname = get_outfname()
+    short_job = " ".join([common.get_cmd()] + job_args[1:])
+    msg = "Job '%s' will write output to '%s'" % (short_job, out_fname)
+    sys.stderr.write("%s %d seconds after reboot.\n" % (msg, DELAY))
+    # Create sleep and wall commands which will countdown DELAY seconds
+    # before executing the job
+    cmds = ["sleep %d" % DELAY_INTERVAL]
+    delay_rem = DELAY - DELAY_INTERVAL
+    while delay_rem > 0:
+        wmsg = "Restarting experiments in %d seconds. %s" % (delay_rem, extra)
+        cmds += [wall(wmsg)]
+        cmds += ["sleep %d" % min(DELAY_INTERVAL, delay_rem)]
+        delay_rem -= DELAY_INTERVAL
+    delay_cmd = ";".join(cmds)
+    # Create command which will only execute if the same kernel is running
+    kern = common.kernel()
+    fail_wall = wall("Need matching kernel '%s' to run!" % kern)
+    run_cmd = "echo '%s' | grep -q `uname -r` && %s && %s && %s >> %s 2>>%s || %s" %\
+      (kern, wall(msg), wall("Starting..."), job, out_fname, out_fname, fail_wall)
+    return "@reboot cd %s; %s; %s;" % (os.getcwd(), delay_cmd, run_cmd)
+def set_panic_restart(bool_val):
+    '''Enable / disable restart on panics'''
+    if bool_val:
+        sys.stderr.write("Kernel will reboot after panic.\n")
+        dur = PANIC_DUR
+    else:
+        sys.stderr.write("Kernel will no longer reboot after panic.\n")
+        dur = 0
+    check_output(['sysctl', '-w', "kernel.panic=%d" % dur,
+                  "kernel.panic_on_oops=%d" % dur])
+def write_cron_data(data):
+    '''Write new crontab entry. No blank lines are written'''
+    # I don't know why "^\s*$" doesn't match, hence this ugly regex
+    data = re.sub(r"\n\s*\n", "\n", data, re.M)
+    sp = Popen(["crontab", "-"], stdin=PIPE)
+    stdout, stderr = sp.communicate(input=data)
+def install_path():
+    '''Place the current path in the crontab entry'''
+    data = get_cron_data()
+    curr_line = re.findall(r"PATH=.*", data)
+    if curr_line:
+        curr_paths = re.findall(r"((?:\/\w+)+)", curr_line[0])
+        data = re.sub(curr_line[0], "", data)
+    else:
+        curr_paths = []
+    curr_paths = set(curr_paths)
+    for path in os.environ["PATH"].split(os.pathsep):
+        curr_paths.add(path)
+    data = "PATH=" + os.pathsep.join(curr_paths) + "\n" + data
+    write_cron_data(data)
+def install_boot_job(ignored_params, reboot_message):
+    '''Re-run the current python script on system reboot using crontab'''
+    remove_boot_job()
+    data = get_cron_data()
+    job  = get_boot_cron(ignored_params, reboot_message)
+    set_panic_restart(True)
+    write_cron_data(data + job + "\n")
+    if job not in get_cron_data():
+        raise IOError("Failed to write %s into cron!" % job)
+    else:
+        install_path()
+def clean_output():
+    fname = get_outfname()
+    if os.path.exists(fname):
+        os.remove(fname)
+def kill_boot_job():
+    remove_boot_job()
+    cmd = common.get_cmd()
+    procs = check_output("ps -eo pid,args".split(" "))
+    pairs = re.findall("(\d+) (.*)", procs)
+    for pid, args in pairs:
+        if re.search(r"/bin/sh -c.*%s"%cmd, args):
+            sys.stderr.write("Killing job %s\n" % pid)
+            check_output(("kill -9 %s" % pid).split(" "))
+def remove_boot_job():
+    '''Remove installed reboot job from crontab'''
+    data  = get_cron_data()
+    regex = re.compile(r".*%s.*" % re.escape(common.get_cmd()), re.M)
+    if regex.search(data):
+        new_cron = regex.sub("", data)
+        write_cron_data(new_cron)
+        set_panic_restart(False)
diff --git a/run/experiment.py b/run/experiment.py
index 9a70414..da0e32e 100644
--- a/run/experiment.py
+++ b/run/experiment.py
@@ -35,6 +35,9 @@ class Experiment(object):
        self.exec_err = None
        self.tracer_types = tracer_types
+        self.regular_tracers = []
+        self.exact_tracers = []
    def __setup_tracers(self):
        tracers = [ t(self.working_dir) for t in self.tracer_types ]
@@ -55,8 +58,13 @@ class Experiment(object):
                     Experiment.INTERRUPTED_DIR)
            interrupted = "%s/%s" % (os.path.split(self.working_dir)[0],
                                     Experiment.INTERRUPTED_DIR)
+            old_int = "%s/%s" % (self.working_dir, Experiment.INTERRUPTED_DIR)
            if os.path.exists(interrupted):
                sh.rmtree(interrupted)
+            if os.path.exists(old_int):
+                sh.rmtree(old_int)
            os.rename(self.working_dir, interrupted)
        os.mkdir(self.working_dir)
diff --git a/run_exps.py b/run_exps.py
index 1bad2a3..21666a9 100755
--- a/run_exps.py
+++ b/run_exps.py
@@ -3,10 +3,12 @@ from __future__ import print_function
 import common as com
 import os
+import pickle
 import pprint
 import re
 import shutil
 import sys
+import run.crontab as cron
 import run.tracer as trace
 from config.config import PARAMS,DEFAULTS,FILES
@@ -17,9 +19,6 @@ from run.executable.executable import Executable
 from run.experiment import Experiment,ExperimentDone,SystemCorrupted
 from run.proc_entry import ProcEntry
-'''Maximum times an experiment will be retried'''
-MAX_RETRY = 5
 '''Customizable experiment parameters'''
 ExpParams = namedtuple('ExpParams', ['scheduler', 'duration', 'tracers',
                                     'kernel', 'config_options', 'file_params',
@@ -31,6 +30,11 @@ ExpData  = com.recordtype('ExpData', ['name', 'params', 'sched_file', 'out_dir',
 '''Comparison of requested versus actual kernel compile parameter value'''
 ConfigResult = namedtuple('ConfigResult', ['param', 'wanted', 'actual'])
+'''Maximum times an experiment will be retried'''
+MAX_RETRY = 5
+'''Location experiment retry count is stored'''
+TRIES_FNAME = ".tries.pkl"
 class InvalidKernel(Exception):
    def __init__(self, kernel):
@@ -88,6 +92,9 @@ def parse_args():
                     action='store_true', default=False,
                     help='use crontab to resume interrupted script after '
                     'system restarts. implies --retry')
+    group.add_option('-k', '--kill-crontab', dest='kill',
+                     action='store_true', default=False,
+                     help='kill existing script crontabs and exit')
    parser.add_option_group(group)
    return parser.parse_args()
@@ -314,7 +321,7 @@ def run_experiment(data, start_message, ignore, jabber):
 def make_paths(exp, opts, out_base_dir):
    '''Translate experiment name to (schedule file, output directory) paths'''
-    path = "%s/%s" % (os.getcwd(), exp)
+    path = os.path.abspath(exp)
    out_dir = "%s/%s" % (out_base_dir, os.path.split(exp.strip('/'))[1])
    if not os.path.exists(path):
@@ -408,10 +415,35 @@ def setup_email(target):
    return None
+def tries_file(exp):
+    return "%s/%s" % (os.path.split(exp.sched_file)[0], TRIES_FNAME)
+def get_tries(exp):
+    if not os.path.exists(tries_file(exp)):
+        return 0
+    with open(tries_file(exp), 'r') as f:
+        return int(pickle.load(f))
+def set_tries(exp, val):
+    if not val:
+        if os.path.exists(tries_file(exp)):
+            os.remove(tries_file(exp))
+    else:
+        with open(tries_file(exp), 'w') as f:
+            pickle.dump(str(val), f)
+    os.system('sync')
 def run_exps(exps, opts):
    jabber = setup_jabber(opts.jabber) if opts.jabber else None
-    exps_remaining = list(enumerate(exps))
+    # Give each experiment a unique id
+    exps_remaining = enumerate(exps)
+    # But run experiments which have failed the most last
+    exps_remaining = sorted(exps_remaining, key=lambda x: get_tries(x[1]))
    while exps_remaining:
        i, exp = exps_remaining.pop(0)
@@ -419,17 +451,26 @@ def run_exps(exps, opts):
        start_message = "%s experiment %d of %d." % (verb, i+1, len(exps))
        try:
+            set_tries(exp, get_tries(exp) + 1)
+            if get_tries(exp) > MAX_RETRY:
+                raise Exception("Hit maximum retries of %d" % MAX_RETRY)
            run_experiment(exp, start_message, opts.ignore, jabber)
+            set_tries(exp, 0)
            exp.state = ExpState.Succeeded
        except KeyboardInterrupt:
            sys.stderr.write("Keyboard interrupt, quitting\n")
+            set_tries(exp, get_tries(exp) - 1)
            break
        except ExperimentDone:
            sys.stderr.write("Experiment already completed at '%s'\n" % exp.out_dir)
+            set_tries(exp, 0)
            exp.state = ExpState.Done
        except (InvalidKernel, InvalidConfig) as e:
            sys.stderr.write("Invalid environment for experiment '%s'\n" % exp.name)
            sys.stderr.write("%s\n" % e)
+            set_tries(exp, get_tries(exp) - 1)
            exp.state = ExpState.Invalid
        except SystemCorrupted as e:
            sys.stderr.write("System is corrupted! Fix state before continuing.\n")
@@ -445,17 +486,19 @@ def run_exps(exps, opts):
            exp.state = ExpState.Failed
        if exp.state is ExpState.Failed and opts.retry:
-            if exp.retries < MAX_RETRY:
+            exps_remaining += [(i, exp)]
-                exps_remaining += [(i, exp)]
-                exp.retries += 1
-            else:
-                sys.stderr.write("Hit maximum retries of %d\n" % MAX_RETRY)
 def main():
    opts, args = parse_args()
+    if opts.kill:
+        cron.kill_boot_job()
+        sys.exit(1)
    email = setup_email(opts.email) if opts.email else None
+    # Create base output directory for run data
    out_base = os.path.abspath(opts.out_dir)
    created  = False
    if not os.path.exists(out_base):
@@ -464,7 +507,24 @@ def main():
    exps = get_exps(opts, args, out_base)
-    run_exps(exps, opts)
+    if opts.crontab:
+        # Resume script on startup
+        opts.retry = True
+        cron.install_boot_job(['f', '--forced'],
+                              "Stop with %s -k" % com.get_cmd())
+    if opts.force or not opts.retry:
+        cron.clean_output()
+        for e in exps:
+            set_tries(e, 0)
+    try:
+        run_exps(exps, opts)
+    finally:
+        # Remove persistent state
+        for e in exps:
+            set_tries(e, 0)
+        cron.remove_boot_job()
    def state_count(state):
        return len(filter(lambda x: x.state is state, exps))
author	Jonathan Herman <hermanjl@cs.unc.edu>	2013-05-01 15:48:01 -0400
committer	Jonathan Herman <hermanjl@cs.unc.edu>	2013-05-01 15:48:01 -0400
commit	cd9f1b026cc5c4526dfbd2f7b1c5f39edb6a7309 (patch)
tree	5b6221e55d7f50c88a574ed4f57ff7efd9b7103d
parent	94cc65997d237ddeab24d396f06bb93bc0644a9d (diff)

diff --git a/common.py b/common.py index 7abf0f2..9ea2915 100644 --- a/common.py +++ b/common.py
@@ -182,7 +182,7 @@ def ft_freq():
182		182
183		183
184	def kernel():	184	def kernel():
185	return subprocess.check_output(["uname", "-r"])	185	return subprocess.check_output(["uname", "-r"]).strip("\n")
186		186
187	def is_executable(fname):	187	def is_executable(fname):
188	'''Return whether the file passed in is executable'''	188	'''Return whether the file passed in is executable'''
@@ -210,3 +210,6 @@ def log_once(id, msg = None, indent = True):
210	if indent:	210	if indent:
211	msg = ' ' + msg.strip('\t').replace('\n', '\n\t')	211	msg = ' ' + msg.strip('\t').replace('\n', '\n\t')
212	sys.stderr.write('\n' + msg.strip('\n') + '\n')	212	sys.stderr.write('\n' + msg.strip('\n') + '\n')
		213
		214	def get_cmd():
		215	return os.path.split(sys.argv[0])[1]


diff --git a/run/crontab.py b/run/crontab.py new file mode 100644 index 0000000..87d71b1 --- /dev/null +++ b/run/crontab.py
@@ -0,0 +1,151 @@
		1	from __future__ import print_function
		2
		3	import common
		4	import os
		5	import re
		6	import sys
		7
		8	from subprocess import Popen, PIPE, check_output
		9
		10	PANIC_DUR = 10
		11	DELAY = 30
		12	DELAY_INTERVAL = 10
		13
		14	def get_cron_data():
		15	try:
		16	return check_output(['crontab', '-l'])
		17	except:
		18	return ""
		19
		20	def wall(message):
		21	'''A wall command with no header'''
		22	return "echo '%s' \| wall -n" % message
		23
		24	def sanitize(args, ignored):
		25	ret_args = []
		26	for a in args:
		27	if a in ignored:
		28	continue
		29	if '-' == a[0] and '--' != a[0:2]:
		30	for i in ignored:
		31	a = a.replace(i, '')
		32	ret_args += [a]
		33	return ret_args
		34
		35	def get_outfname():
		36	return "cron-%s.txt" % common.get_cmd()
		37
		38	def get_boot_cron(ignored_params, extra=""):
		39	'''Turn current python script into a crontab reboot entry'''
		40	job_args = sanitize(sys.argv, ignored_params)
		41	job = " ".join(job_args)
		42	out_fname = get_outfname()
		43
		44	short_job = " ".join([common.get_cmd()] + job_args[1:])
		45	msg = "Job '%s' will write output to '%s'" % (short_job, out_fname)
		46
		47	sys.stderr.write("%s %d seconds after reboot.\n" % (msg, DELAY))
		48
		49	# Create sleep and wall commands which will countdown DELAY seconds
		50	# before executing the job
		51	cmds = ["sleep %d" % DELAY_INTERVAL]
		52	delay_rem = DELAY - DELAY_INTERVAL
		53	while delay_rem > 0:
		54	wmsg = "Restarting experiments in %d seconds. %s" % (delay_rem, extra)
		55	cmds += [wall(wmsg)]
		56	cmds += ["sleep %d" % min(DELAY_INTERVAL, delay_rem)]
		57	delay_rem -= DELAY_INTERVAL
		58	delay_cmd = ";".join(cmds)
		59
		60	# Create command which will only execute if the same kernel is running
		61	kern = common.kernel()
		62	fail_wall = wall("Need matching kernel '%s' to run!" % kern)
		63	run_cmd = "echo '%s' \| grep -q `uname -r` && %s && %s && %s >> %s 2>>%s \|\| %s" %\
		64	(kern, wall(msg), wall("Starting..."), job, out_fname, out_fname, fail_wall)
		65
		66	return "@reboot cd %s; %s; %s;" % (os.getcwd(), delay_cmd, run_cmd)
		67
		68	def set_panic_restart(bool_val):
		69	'''Enable / disable restart on panics'''
		70	if bool_val:
		71	sys.stderr.write("Kernel will reboot after panic.\n")
		72	dur = PANIC_DUR
		73	else:
		74	sys.stderr.write("Kernel will no longer reboot after panic.\n")
		75	dur = 0
		76
		77	check_output(['sysctl', '-w', "kernel.panic=%d" % dur,
		78	"kernel.panic_on_oops=%d" % dur])
		79
		80	def write_cron_data(data):
		81	'''Write new crontab entry. No blank lines are written'''
		82
		83	# I don't know why "^\s*$" doesn't match, hence this ugly regex
		84	data = re.sub(r"\n\s*\n", "\n", data, re.M)
		85
		86	sp = Popen(["crontab", "-"], stdin=PIPE)
		87	stdout, stderr = sp.communicate(input=data)
		88
		89	def install_path():
		90	'''Place the current path in the crontab entry'''
		91	data = get_cron_data()
		92	curr_line = re.findall(r"PATH=.*", data)
		93
		94	if curr_line:
		95	curr_paths = re.findall(r"((?:\/\w+)+)", curr_line[0])
		96	data = re.sub(curr_line[0], "", data)
		97	else:
		98	curr_paths = []
		99	curr_paths = set(curr_paths)
		100
		101	for path in os.environ["PATH"].split(os.pathsep):
		102	curr_paths.add(path)
		103
		104	data = "PATH=" + os.pathsep.join(curr_paths) + "\n" + data
		105
		106	write_cron_data(data)
		107
		108	def install_boot_job(ignored_params, reboot_message):
		109	'''Re-run the current python script on system reboot using crontab'''
		110	remove_boot_job()
		111
		112	data = get_cron_data()
		113	job = get_boot_cron(ignored_params, reboot_message)
		114
		115	set_panic_restart(True)
		116
		117	write_cron_data(data + job + "\n")
		118
		119	if job not in get_cron_data():
		120	raise IOError("Failed to write %s into cron!" % job)
		121	else:
		122	install_path()
		123
		124	def clean_output():
		125	fname = get_outfname()
		126	if os.path.exists(fname):
		127	os.remove(fname)
		128
		129	def kill_boot_job():
		130	remove_boot_job()
		131
		132	cmd = common.get_cmd()
		133
		134	procs = check_output("ps -eo pid,args".split(" "))
		135	pairs = re.findall("(\d+) (.*)", procs)
		136
		137	for pid, args in pairs:
		138	if re.search(r"/bin/sh -c.*%s"%cmd, args):
		139	sys.stderr.write("Killing job %s\n" % pid)
		140	check_output(("kill -9 %s" % pid).split(" "))
		141
		142	def remove_boot_job():
		143	'''Remove installed reboot job from crontab'''
		144	data = get_cron_data()
		145	regex = re.compile(r".%s." % re.escape(common.get_cmd()), re.M)
		146
		147	if regex.search(data):
		148	new_cron = regex.sub("", data)
		149	write_cron_data(new_cron)
		150
		151	set_panic_restart(False)


diff --git a/run/experiment.py b/run/experiment.py index 9a70414..da0e32e 100644 --- a/run/experiment.py +++ b/run/experiment.py
@@ -35,6 +35,9 @@ class Experiment(object):
35	self.exec_err = None	35	self.exec_err = None
36	self.tracer_types = tracer_types	36	self.tracer_types = tracer_types
37		37
		38	self.regular_tracers = []
		39	self.exact_tracers = []
		40
38	def __setup_tracers(self):	41	def __setup_tracers(self):
39	tracers = [ t(self.working_dir) for t in self.tracer_types ]	42	tracers = [ t(self.working_dir) for t in self.tracer_types ]
40		43
@@ -55,8 +58,13 @@ class Experiment(object):
55	Experiment.INTERRUPTED_DIR)	58	Experiment.INTERRUPTED_DIR)
56	interrupted = "%s/%s" % (os.path.split(self.working_dir)[0],	59	interrupted = "%s/%s" % (os.path.split(self.working_dir)[0],
57	Experiment.INTERRUPTED_DIR)	60	Experiment.INTERRUPTED_DIR)
		61	old_int = "%s/%s" % (self.working_dir, Experiment.INTERRUPTED_DIR)
		62
58	if os.path.exists(interrupted):	63	if os.path.exists(interrupted):
59	sh.rmtree(interrupted)	64	sh.rmtree(interrupted)
		65	if os.path.exists(old_int):
		66	sh.rmtree(old_int)
		67
60	os.rename(self.working_dir, interrupted)	68	os.rename(self.working_dir, interrupted)
61		69
62	os.mkdir(self.working_dir)	70	os.mkdir(self.working_dir)


diff --git a/run_exps.py b/run_exps.py index 1bad2a3..21666a9 100755 --- a/run_exps.py +++ b/run_exps.py
@@ -3,10 +3,12 @@ from __future__ import print_function
3		3
4	import common as com	4	import common as com
5	import os	5	import os
		6	import pickle
6	import pprint	7	import pprint
7	import re	8	import re
8	import shutil	9	import shutil
9	import sys	10	import sys
		11	import run.crontab as cron
10	import run.tracer as trace	12	import run.tracer as trace
11		13
12	from config.config import PARAMS,DEFAULTS,FILES	14	from config.config import PARAMS,DEFAULTS,FILES
@@ -17,9 +19,6 @@ from run.executable.executable import Executable
17	from run.experiment import Experiment,ExperimentDone,SystemCorrupted	19	from run.experiment import Experiment,ExperimentDone,SystemCorrupted
18	from run.proc_entry import ProcEntry	20	from run.proc_entry import ProcEntry
19		21
20	'''Maximum times an experiment will be retried'''
21	MAX_RETRY = 5
22
23	'''Customizable experiment parameters'''	22	'''Customizable experiment parameters'''
24	ExpParams = namedtuple('ExpParams', ['scheduler', 'duration', 'tracers',	23	ExpParams = namedtuple('ExpParams', ['scheduler', 'duration', 'tracers',
25	'kernel', 'config_options', 'file_params',	24	'kernel', 'config_options', 'file_params',
@@ -31,6 +30,11 @@ ExpData = com.recordtype('ExpData', ['name', 'params', 'sched_file', 'out_dir',
31	'''Comparison of requested versus actual kernel compile parameter value'''	30	'''Comparison of requested versus actual kernel compile parameter value'''
32	ConfigResult = namedtuple('ConfigResult', ['param', 'wanted', 'actual'])	31	ConfigResult = namedtuple('ConfigResult', ['param', 'wanted', 'actual'])
33		32
		33	'''Maximum times an experiment will be retried'''
		34	MAX_RETRY = 5
		35	'''Location experiment retry count is stored'''
		36	TRIES_FNAME = ".tries.pkl"
		37
34		38
35	class InvalidKernel(Exception):	39	class InvalidKernel(Exception):
36	def __init__(self, kernel):	40	def __init__(self, kernel):
@@ -88,6 +92,9 @@ def parse_args():
88	action='store_true', default=False,	92	action='store_true', default=False,
89	help='use crontab to resume interrupted script after '	93	help='use crontab to resume interrupted script after '
90	'system restarts. implies --retry')	94	'system restarts. implies --retry')
		95	group.add_option('-k', '--kill-crontab', dest='kill',
		96	action='store_true', default=False,
		97	help='kill existing script crontabs and exit')
91	parser.add_option_group(group)	98	parser.add_option_group(group)
92		99
93	return parser.parse_args()	100	return parser.parse_args()
@@ -314,7 +321,7 @@ def run_experiment(data, start_message, ignore, jabber):
314		321
315	def make_paths(exp, opts, out_base_dir):	322	def make_paths(exp, opts, out_base_dir):
316	'''Translate experiment name to (schedule file, output directory) paths'''	323	'''Translate experiment name to (schedule file, output directory) paths'''
317	path = "%s/%s" % (os.getcwd(), exp)	324	path = os.path.abspath(exp)
318	out_dir = "%s/%s" % (out_base_dir, os.path.split(exp.strip('/'))[1])	325	out_dir = "%s/%s" % (out_base_dir, os.path.split(exp.strip('/'))[1])
319		326
320	if not os.path.exists(path):	327	if not os.path.exists(path):
@@ -408,10 +415,35 @@ def setup_email(target):
408	return None	415	return None
409		416
410		417
		418	def tries_file(exp):
		419	return "%s/%s" % (os.path.split(exp.sched_file)[0], TRIES_FNAME)
		420
		421
		422	def get_tries(exp):
		423	if not os.path.exists(tries_file(exp)):
		424	return 0
		425	with open(tries_file(exp), 'r') as f:
		426	return int(pickle.load(f))
		427
		428
		429	def set_tries(exp, val):
		430	if not val:
		431	if os.path.exists(tries_file(exp)):
		432	os.remove(tries_file(exp))
		433	else:
		434	with open(tries_file(exp), 'w') as f:
		435	pickle.dump(str(val), f)
		436	os.system('sync')
		437
		438
411	def run_exps(exps, opts):	439	def run_exps(exps, opts):
412	jabber = setup_jabber(opts.jabber) if opts.jabber else None	440	jabber = setup_jabber(opts.jabber) if opts.jabber else None
413		441
414	exps_remaining = list(enumerate(exps))	442	# Give each experiment a unique id
		443	exps_remaining = enumerate(exps)
		444	# But run experiments which have failed the most last
		445	exps_remaining = sorted(exps_remaining, key=lambda x: get_tries(x[1]))
		446
415	while exps_remaining:	447	while exps_remaining:
416	i, exp = exps_remaining.pop(0)	448	i, exp = exps_remaining.pop(0)
417		449
@@ -419,17 +451,26 @@ def run_exps(exps, opts):
419	start_message = "%s experiment %d of %d." % (verb, i+1, len(exps))	451	start_message = "%s experiment %d of %d." % (verb, i+1, len(exps))
420		452
421	try:	453	try:
		454	set_tries(exp, get_tries(exp) + 1)
		455	if get_tries(exp) > MAX_RETRY:
		456	raise Exception("Hit maximum retries of %d" % MAX_RETRY)
		457
422	run_experiment(exp, start_message, opts.ignore, jabber)	458	run_experiment(exp, start_message, opts.ignore, jabber)
		459
		460	set_tries(exp, 0)
423	exp.state = ExpState.Succeeded	461	exp.state = ExpState.Succeeded
424	except KeyboardInterrupt:	462	except KeyboardInterrupt:
425	sys.stderr.write("Keyboard interrupt, quitting\n")	463	sys.stderr.write("Keyboard interrupt, quitting\n")
		464	set_tries(exp, get_tries(exp) - 1)
426	break	465	break
427	except ExperimentDone:	466	except ExperimentDone:
428	sys.stderr.write("Experiment already completed at '%s'\n" % exp.out_dir)	467	sys.stderr.write("Experiment already completed at '%s'\n" % exp.out_dir)
		468	set_tries(exp, 0)
429	exp.state = ExpState.Done	469	exp.state = ExpState.Done
430	except (InvalidKernel, InvalidConfig) as e:	470	except (InvalidKernel, InvalidConfig) as e:
431	sys.stderr.write("Invalid environment for experiment '%s'\n" % exp.name)	471	sys.stderr.write("Invalid environment for experiment '%s'\n" % exp.name)
432	sys.stderr.write("%s\n" % e)	472	sys.stderr.write("%s\n" % e)
		473	set_tries(exp, get_tries(exp) - 1)
433	exp.state = ExpState.Invalid	474	exp.state = ExpState.Invalid
434	except SystemCorrupted as e:	475	except SystemCorrupted as e:
435	sys.stderr.write("System is corrupted! Fix state before continuing.\n")	476	sys.stderr.write("System is corrupted! Fix state before continuing.\n")
@@ -445,17 +486,19 @@ def run_exps(exps, opts):
445	exp.state = ExpState.Failed	486	exp.state = ExpState.Failed
446		487
447	if exp.state is ExpState.Failed and opts.retry:	488	if exp.state is ExpState.Failed and opts.retry:
448	if exp.retries < MAX_RETRY:	489	exps_remaining += [(i, exp)]
449	exps_remaining += [(i, exp)]	490
450	exp.retries += 1
451	else:
452	sys.stderr.write("Hit maximum retries of %d\n" % MAX_RETRY)
453		491
454	def main():	492	def main():
455	opts, args = parse_args()	493	opts, args = parse_args()
456		494
		495	if opts.kill:
		496	cron.kill_boot_job()
		497	sys.exit(1)
		498
457	email = setup_email(opts.email) if opts.email else None	499	email = setup_email(opts.email) if opts.email else None
458		500
		501	# Create base output directory for run data
459	out_base = os.path.abspath(opts.out_dir)	502	out_base = os.path.abspath(opts.out_dir)
460	created = False	503	created = False
461	if not os.path.exists(out_base):	504	if not os.path.exists(out_base):
@@ -464,7 +507,24 @@ def main():
464		507
465	exps = get_exps(opts, args, out_base)	508	exps = get_exps(opts, args, out_base)
466		509
467	run_exps(exps, opts)	510	if opts.crontab:
		511	# Resume script on startup
		512	opts.retry = True
		513	cron.install_boot_job(['f', '--forced'],
		514	"Stop with %s -k" % com.get_cmd())
		515
		516	if opts.force or not opts.retry:
		517	cron.clean_output()
		518	for e in exps:
		519	set_tries(e, 0)
		520
		521	try:
		522	run_exps(exps, opts)
		523	finally:
		524	# Remove persistent state
		525	for e in exps:
		526	set_tries(e, 0)
		527	cron.remove_boot_job()
468		528
469	def state_count(state):	529	def state_count(state):
470	return len(filter(lambda x: x.state is state, exps))	530	return len(filter(lambda x: x.state is state, exps))