12 files changed, 474 insertions, 155 deletions
diff --git a/README.md b/README.md
index b074aa5..1f38978 100644
--- a/README.md
+++ b/README.md
@@ -156,16 +156,14 @@ You can specify your own spin programs to run as well instead of rtspin by putti
 $ echo "colorspin -f color1.csv 10 20" > test.sched
 ```
-You can specify parameters for an experiment in a file instead of on the command line using params.py (the `-p` option lets you choose the name of this file if `params.py` is not for you):
+You can specify parameters for an experiment in a file instead of on the command line using params.py:
 ```bash
 $ echo "{'scheduler':'GSN-EDF', 'duration':10}" > params.py
 $ run_exps.py test.sched
 ```
-You can also run multiple experiments with a single command, provided a directory with a schedule file exists for each. By default, the program will look for sched.py for the schedule file and params.py for the parameter file, but this behavior can be changed using the `-p` and `-c` options.
+You can also run multiple experiments with a single command, provided a directory with a schedule file exists for each. You can include non-relevant parameters which `run_exps.py` does not understand in `params.py`. These parameters will be saved with the data output by `run_exps.py`. This is useful for tracking variations in system parameters versus experimental results. In the following example, multiple experiments are demonstrated and an extra parameter `test-param` is included:
-You can include non-relevant parameters which `run_exps.py` does not understand in `params.py`. These parameters will be saved with the data output by `run_exps.py`. This is useful for tracking variations in system parameters versus experimental results. In the following example, multiple experiments are demonstrated and an extra parameter `test-param` is included:
 ```bash
 $ mkdir test1
diff --git a/common.py b/common.py
index ff0f986..fd1a983 100644
--- a/common.py
+++ b/common.py
@@ -182,7 +182,7 @@ def ft_freq():
 def kernel():
-    return subprocess.check_output(["uname", "-r"])
+    return subprocess.check_output(["uname", "-r"]).strip("\n")
 def is_executable(fname):
    '''Return whether the file passed in is executable'''
@@ -212,4 +212,7 @@ def log_once(id, msg = None):
        __logged += [id]
        if indent:
            msg = '   ' + msg.strip('\t').replace('\n', '\n\t')
-        sys.stderr.write('\n' + msg + '\n')
+        sys.stderr.write('\n' + msg.strip('\n') + '\n')
+def get_cmd():
+    '''Return the file name of the running script, without its directory.'''
+    script_path = sys.argv[0]
+    return os.path.basename(script_path)
diff --git a/config/config.py b/config/config.py
index 28e78c9..27cb2dd 100644
--- a/config/config.py
+++ b/config/config.py
@@ -14,12 +14,14 @@ BINS = {'rtspin'    : get_executable_hint('rtspin', 'liblitmus'),
        # Optional, as sched_trace is not a publically supported repository
        'st_show'   : get_executable_hint('st_show', 'sched_trace', True)}
-'''Names of output files.'''
+'''Names of data files.'''
-FILES = {'ft_data'    : 'ft.bin',
+FILES = {'params_file' : 'params.py',
-         'ft_matches' : r'(ft.*\.bin$)|(.*\.ft)',
+         'sched_file'  : 'sched.py',
-         'linux_data' : 'trace.dat',
+         'ft_data'     : 'ft.bin',
-         'sched_data' : 'st-{}.bin',
+         'ft_matches'  : r'(ft.*\.bin$)|(.*\.ft)',
-         'log_data'   : 'trace.slog'}
+         'linux_data'  : 'trace.dat',
+         'sched_data'  : 'st-{}.bin',
+         'log_data'    : 'trace.slog'}
 '''Default parameter names in params.py.'''
 PARAMS = {'sched'   : 'scheduler',       # Scheduler used by run_exps
@@ -35,9 +37,7 @@ PARAMS = {'sched'   : 'scheduler',       # Scheduler used by run_exps
          }
 '''Default values for program options.'''
-DEFAULTS = {'params_file' : 'params.py',
+DEFAULTS = {'duration'    : 10,
-            'sched_file'  : 'sched.py',
-            'duration'    : 10,
            'prog'        : 'rtspin',
            'out-gen'     : 'exps',
            'out-run'     : 'run-data',
diff --git a/gen/edf_generators.py b/gen/edf_generators.py
index a722c21..8e4b8df 100644
--- a/gen/edf_generators.py
+++ b/gen/edf_generators.py
@@ -28,6 +28,7 @@ class EdfGenerator(gen.Generator):
        pdist = self._create_dist('period',
                                  exp_params['periods'],
                                  gen.NAMED_PERIODS)
        udist = self._create_dist('utilization',
                                  exp_params['utils'],
                                  gen.NAMED_UTILIZATIONS)
diff --git a/gen/generator.py b/gen/generator.py
index bc86cfe..40a0243 100644
--- a/gen/generator.py
+++ b/gen/generator.py
@@ -6,7 +6,7 @@ import shutil as sh
 from Cheetah.Template import Template
 from common import get_config_option,num_cpus,recordtype,log_once
-from config.config import DEFAULTS,PARAMS
+from config.config import FILES,PARAMS
 from gen.dp import DesignPointGenerator
 from parse.col_map import ColMapBuilder
@@ -129,7 +129,7 @@ class Generator(object):
    def _write_schedule(self, params):
        '''Write schedule file using current template for @params.'''
-        sched_file = self.out_dir + "/" + DEFAULTS['sched_file']
+        sched_file = self.out_dir + "/" + FILES['sched_file']
        with open(sched_file, 'wa') as f:
            f.write(str(Template(self.template, searchList=[params])))
@@ -143,7 +143,7 @@ class Generator(object):
        else:
            tasks = 0
-        exp_params_file = self.out_dir + "/" + DEFAULTS['params_file']
+        exp_params_file = self.out_dir + "/" + FILES['params_file']
        with open(exp_params_file, 'wa') as f:
            params['scheduler'] = self.scheduler
            pprint.pprint(params, f)
diff --git a/gen_exps.py b/gen_exps.py
index 65f50d8..e888f5f 100755
--- a/gen_exps.py
+++ b/gen_exps.py
@@ -43,6 +43,14 @@ def load_file(fname):
    except:
           raise IOError("Invalid generation file: %s" % fname)
+def print_descriptions(described):
+    '''Print the help text of every generator named in @described.
+    @described is a comma-separated string of generator names. Names with
+    no matching generator are reported on stderr and skipped.'''
+    for name in described.split(','):
+        if name in gen.get_generators():
+            print("Generator '%s', " % name)
+            gen.get_generators()[name]().print_help()
+        else:
+            sys.stderr.write("No generator '%s'\n" % name)
 def main():
    opts, args = parse_args()
@@ -50,12 +58,7 @@ def main():
    if opts.list_gens:
        print(", ".join(gen.get_generators()))
    if opts.described != None:
-        for generator in opts.described.split(','):
+        print_descriptions(opts.described)
-            if generator not in gen.get_generators():
-                sys.stderr.write("No generator '%s'\n" % generator)
-            else:
-                print("Generator '%s', " % generator)
-                gen.get_generators()[generator]().print_help()
    if opts.list_gens or opts.described:
        return 0
diff --git a/parse/col_map.py b/parse/col_map.py
index ceb8867..59484e8 100644
--- a/parse/col_map.py
+++ b/parse/col_map.py
@@ -22,7 +22,7 @@ class ColMapBuilder(object):
 class ColMap(object):
    def __init__(self, col_list, values = None):
-        self.col_list = col_list
+        self.col_list = sorted(col_list)
        self.rev_map = {}
        self.values = values
@@ -50,7 +50,7 @@ class ColMap(object):
            if col not in kv:
                key += (None,)
            else:
-                key += (kv[col],)
+                key += (str(kv[col]),)
        return key
diff --git a/parse/sched.py b/parse/sched.py
index 6e1fbe6..524f1ed 100644
--- a/parse/sched.py
+++ b/parse/sched.py
@@ -98,15 +98,38 @@ record_map = {}
 RECORD_SIZE   = 24
 NSEC_PER_MSEC = 1000000
+def bits_to_bytes(bits):
+    '''Return the number of whole bytes needed to hold @bits bits.
+    A partially filled final byte counts as a full byte (padding).'''
+    # Floor division keeps the result an int even under true division
+    return (bits + 7) // 8
+def field_bytes(fields):
+    '''Return the size in bytes described by a ctypes-style @fields list.
+    Each entry is (name, type) or, for a bitfield, (name, type, bits).'''
+    fbytes = 0
+    fbits  = 0
+    for f in fields:
+        flist = list(f)
+        if len(flist) > 2:
+            # Specified a bitfield
+            fbits += flist[2]
+        else:
+            # Only specified a type, use types size
+            fbytes += sizeof(flist[1])
+            # Bitfields followed by a byte will cause any incomplete
+            # bytes to be turned into full bytes
+            fbytes += bits_to_bytes(fbits)
+            fbits   = 0
+    # Trailing bitfields are rounded up to whole bytes. The bit count must
+    # not be added a second time, or the size is overestimated
+    fbytes += bits_to_bytes(fbits)
+    return fbytes
 def register_record(id, clazz):
    fields = clazz.FIELDS
+    diff = RECORD_SIZE - field_bytes(SchedRecord.FIELDS) - field_bytes(fields)
-    fsize = lambda fields : sum([sizeof(list(f)[1]) for f in fields])
-    diff  = RECORD_SIZE - fsize(SchedRecord.FIELDS) - fsize(fields)
    # Create extra padding fields to make record the proper size
    # Creating one big field of c_uint64 and giving it a size of 8*diff
-    # _shoud_ work, but doesn't. This is an uglier way of accomplishing
+    # _should_ work, but doesn't. This is an uglier way of accomplishing
    # the same goal
    for d in range(diff):
        fields += [("extra%d" % d, c_char)]
diff --git a/parse_exps.py b/parse_exps.py
index 98f95df..37667aa 100755
--- a/parse_exps.py
+++ b/parse_exps.py
@@ -14,7 +14,7 @@ import sys
 import traceback
 from collections import namedtuple
-from config.config import DEFAULTS,PARAMS
+from config.config import FILES,DEFAULTS,PARAMS
 from optparse import OptionParser
 from parse.point import ExpPoint
 from parse.tuple_table import TupleTable
@@ -94,7 +94,7 @@ def parse_exp(exp_force_base):
 def get_exp_params(data_dir, cm_builder):
-    param_file = "%s/%s" % (data_dir, DEFAULTS['params_file'])
+    param_file = "%s/%s" % (data_dir, FILES['params_file'])
    if os.path.isfile(param_file):
        params = com.load_params(param_file)
diff --git a/run/crontab.py b/run/crontab.py
new file mode 100644
index 0000000..87d71b1
--- /dev/null
+++ b/run/crontab.py
@@ -0,0 +1,151 @@
+from __future__ import print_function
+import common
+import os
+import re
+import sys
+from subprocess import Popen, PIPE, check_output
+# Value written to the kernel.panic and kernel.panic_on_oops sysctls when
+# reboot-on-panic is enabled (0 is written when it is disabled)
+PANIC_DUR = 10
+# Seconds the reboot job waits before it re-runs the script
+DELAY = 30
+# Seconds slept between the countdown messages sent while waiting
+DELAY_INTERVAL = 10
+def get_cron_data():
+    '''Return the output of 'crontab -l', or "" if it cannot be read.'''
+    # Imported here as this module only imports Popen, PIPE and check_output
+    from subprocess import CalledProcessError
+    try:
+        return check_output(['crontab', '-l'])
+    except (CalledProcessError, OSError):
+        # A non-zero exit (typically: no crontab installed yet) or a missing
+        # crontab binary is treated as an empty crontab. KeyboardInterrupt
+        # and programming errors are no longer swallowed
+        return ""
+def wall(message):
+    '''Return a shell command which broadcasts @message with no wall header.
+    Single quotes inside @message are escaped so that the whole message
+    stays a single shell word and keeps its quotes when displayed.'''
+    # The shell has no escape character inside '...', so close the quote,
+    # emit an escaped quote and reopen the quote: ' becomes '\''
+    quoted = message.replace("'", "'\\''")
+    return "echo '%s' | wall -n" % quoted
+def sanitize(args, ignored):
+    '''Return a copy of @args without the options listed in @ignored.
+    Arguments equal to an entry of @ignored are dropped. Entries of @ignored
+    are also deleted from clustered short options: with 'f' ignored, '-fr'
+    becomes '-r' and '-f' is dropped entirely.'''
+    ret_args = []
+    for a in args:
+        if a in ignored:
+            continue
+        # startswith() is safe for empty arguments, unlike a[0]
+        if a.startswith('-') and not a.startswith('--'):
+            stripped = a
+            for i in ignored:
+                stripped = stripped.replace(i, '')
+            if stripped == '-' and a != '-':
+                # Every flag of the cluster was ignored. A leftover bare '-'
+                # would be read as a positional argument by the re-run job
+                continue
+            a = stripped
+        ret_args += [a]
+    return ret_args
+def get_outfname():
+    '''Return the name of the file which collects the cron job's output.'''
+    cmd = common.get_cmd()
+    return "cron-%s.txt" % cmd
+def get_boot_cron(ignored_params, extra=""):
+    '''Build the @reboot crontab line which re-runs the current script.
+    Options in @ignored_params are removed from the re-run command line and
+    @extra is appended to each countdown message.'''
+    argv = sanitize(sys.argv, ignored_params)
+    job = " ".join(argv)
+    out_fname = get_outfname()
+    short_job = " ".join([common.get_cmd()] + argv[1:])
+    msg = "Job '%s' will write output to '%s'" % (short_job, out_fname)
+    sys.stderr.write("%s %d seconds after reboot.\n" % (msg, DELAY))
+    # Sleep through DELAY in pieces of DELAY_INTERVAL seconds, announcing
+    # the time left before every piece except the first
+    steps = ["sleep %d" % DELAY_INTERVAL]
+    for remaining in range(DELAY - DELAY_INTERVAL, 0, -DELAY_INTERVAL):
+        countdown = "Restarting experiments in %d seconds. %s" % (remaining, extra)
+        steps.append(wall(countdown))
+        steps.append("sleep %d" % min(DELAY_INTERVAL, remaining))
+    delay_cmd = ";".join(steps)
+    # The job is only started if the kernel running right now was booted
+    kern = common.kernel()
+    fail_wall = wall("Need matching kernel '%s' to run!" % kern)
+    run_template = "echo '%s' | grep -q `uname -r` && %s && %s && %s >> %s 2>>%s || %s"
+    run_cmd = run_template % (kern, wall(msg), wall("Starting..."), job,
+                              out_fname, out_fname, fail_wall)
+    return "@reboot cd %s; %s; %s;" % (os.getcwd(), delay_cmd, run_cmd)
+def set_panic_restart(bool_val):
+    '''Turn rebooting after a kernel panic on (@bool_val true) or off.
+    Writes the kernel.panic and kernel.panic_on_oops sysctls.'''
+    if bool_val:
+        dur, note = PANIC_DUR, "Kernel will reboot after panic.\n"
+    else:
+        dur, note = 0, "Kernel will no longer reboot after panic.\n"
+    sys.stderr.write(note)
+    settings = ["kernel.panic=%d" % dur, "kernel.panic_on_oops=%d" % dur]
+    check_output(['sysctl', '-w'] + settings)
+def write_cron_data(data):
+    '''Install @data as the new crontab. No blank lines are written'''
+    # \s also matches newlines, so one substitution collapses a whole run of
+    # blank lines. The pattern has no anchors and therefore needs no flags.
+    # (re.M used to be passed positionally, where re.sub() expects count,
+    # which silently capped the number of substitutions at 8.)
+    data = re.sub(r"\n\s*\n", "\n", data)
+    sp = Popen(["crontab", "-"], stdin=PIPE)
+    sp.communicate(input=data)
+def install_path():
+    '''Place the current path in the crontab entry.
+    The crontab's PATH line is replaced by one holding the directories of
+    the current environment's PATH followed by any further directories the
+    crontab already listed. Order is kept and duplicates are dropped.'''
+    data = get_cron_data()
+    match = re.search(r"^PATH=(.*)$", data, re.M)
+    if match:
+        # Split on the separator rather than matching \w+ so directories
+        # containing '-' or '.' are kept whole
+        old_paths = match.group(1).split(os.pathsep)
+        # Cut the old line out by position. Using it as a regex pattern
+        # would misread characters such as '.' or '+' in directory names
+        data = data[:match.start()] + data[match.end():]
+    else:
+        old_paths = []
+    # Lookup order matters in PATH, so build an ordered list, not a set
+    merged = []
+    for path in os.environ["PATH"].split(os.pathsep) + old_paths:
+        if path and path not in merged:
+            merged.append(path)
+    data = "PATH=" + os.pathsep.join(merged) + "\n" + data
+    write_cron_data(data)
+def install_boot_job(ignored_params, reboot_message):
+    '''Schedule the current python script to be re-run on system reboot.
+    Any previously installed job is replaced. Raises IOError if the new
+    entry is not found in the crontab after writing it.'''
+    remove_boot_job()
+    existing = get_cron_data()
+    entry = get_boot_cron(ignored_params, reboot_message)
+    set_panic_restart(True)
+    write_cron_data(existing + entry + "\n")
+    # Read the crontab back to check that the entry was really installed
+    if entry in get_cron_data():
+        install_path()
+    else:
+        raise IOError("Failed to write %s into cron!" % entry)
+def clean_output():
+    '''Delete the cron job's output file if it exists.'''
+    fname = get_outfname()
+    if not os.path.exists(fname):
+        return
+    os.remove(fname)
+def kill_boot_job():
+    '''Remove the reboot job from the crontab and kill running copies.
+    A process is killed if its command line, as listed by ps, contains
+    '/bin/sh -c' followed by the name of this script.'''
+    remove_boot_job()
+    # Escape the command name so that characters such as the '.' in
+    # 'run_exps.py' only match themselves, as in remove_boot_job()
+    job_regex = re.compile(r"/bin/sh -c.*%s" % re.escape(common.get_cmd()))
+    procs = check_output(["ps", "-eo", "pid,args"])
+    for pid, args in re.findall(r"(\d+) (.*)", procs):
+        if job_regex.search(args):
+            sys.stderr.write("Killing job %s\n" % pid)
+            check_output(["kill", "-9", pid])
+def remove_boot_job():
+    '''Remove installed reboot job from crontab.
+    Reboot-on-panic is switched off again if a job was removed.'''
+    data  = get_cron_data()
+    # Only @reboot entries are candidates: get_boot_cron() always creates
+    # one, and any other crontab line which merely mentions this script
+    # belongs to the user and must be left alone
+    regex = re.compile(r"^@reboot\b.*%s.*$" % re.escape(common.get_cmd()), re.M)
+    if regex.search(data):
+        new_cron = regex.sub("", data)
+        write_cron_data(new_cron)
+        set_panic_restart(False)
diff --git a/run/experiment.py b/run/experiment.py
index 4667cb1..5f18bea 100644
--- a/run/experiment.py
+++ b/run/experiment.py
@@ -43,6 +43,9 @@ class Experiment(object):
        self.exec_err = None
        self.tracer_types = tracer_types
+        self.regular_tracers = []
+        self.exact_tracers = []
    def __setup_tracers(self):
        tracers = [ t(self.working_dir) for t in self.tracer_types ]
@@ -63,8 +66,13 @@ class Experiment(object):
                     Experiment.INTERRUPTED_DIR)
            interrupted = "%s/%s" % (os.path.split(self.working_dir)[0],
                                     Experiment.INTERRUPTED_DIR)
+            old_int = "%s/%s" % (self.working_dir, Experiment.INTERRUPTED_DIR)
            if os.path.exists(interrupted):
                sh.rmtree(interrupted)
+            if os.path.exists(old_int):
+                sh.rmtree(old_int)
            os.rename(self.working_dir, interrupted)
        os.mkdir(self.working_dir)
@@ -78,21 +86,24 @@ class Experiment(object):
            executable.cwd = self.working_dir
        map(assign_cwd, self.executables)
-    def __kill_all(self):
+    def __try_kill_all(self):
-        if lu.waiting_tasks():
+        try:
-            released = lu.release_tasks()
+            if lu.waiting_tasks():
-            self.log("Re-released %d tasks" % released)
+                released = lu.release_tasks()
+                self.log("Re-released %d tasks" % released)
-            time.sleep(1)
+                time.sleep(1)
-        self.log("Killing all tasks")
+            self.log("Killing all tasks")
-        for e in self.executables:
+            for e in self.executables:
-            try:
+                try:
-                e.kill()
+                    e.kill()
-            except:
+                except:
-                pass
+                    pass
-        time.sleep(1)
+            time.sleep(1)
+        except:
+            self.log("Failed to kill all tasks.")
    def __strip_path(self, path):
        '''Shorten path to something more readable.'''
@@ -194,6 +205,7 @@ class Experiment(object):
        sched = lu.scheduler()
        if sched != "Linux":
+            self.log("Switching back to Linux scheduler")
            try:
                lu.switch_scheduler("Linux")
            except:
@@ -303,6 +315,7 @@ class Experiment(object):
        self.__to_linux()
        succ = False
+        exception = None
        try:
            self.__setup()
@@ -311,20 +324,21 @@ class Experiment(object):
                self.log("Saving results in %s" % self.finished_dir)
                succ = True
            except Exception as e:
+                exception = e
                # Give time for whatever failed to finish failing
                time.sleep(2)
-                self.__kill_all()
-                raise e
+                self.__try_kill_all()
-            finally:
-                self.__teardown()
        finally:
-            self.log("Switching back to Linux scheduler")
            try:
+                self.__teardown()
                self.__to_linux()
            except Exception as e:
-                print(e)
+                exception = exception or e
-                
+            finally:
+                if exception: raise exception
        if succ:
            self.__save_results()
            self.log("Experiment done!")
diff --git a/run_exps.py b/run_exps.py
index afabca8..21666a9 100755
--- a/run_exps.py
+++ b/run_exps.py
@@ -3,14 +3,18 @@ from __future__ import print_function
 import common as com
 import os
+import pickle
+import pprint
 import re
 import shutil
 import sys
+import run.crontab as cron
 import run.tracer as trace
-from config.config import PARAMS,DEFAULTS
+from config.config import PARAMS,DEFAULTS,FILES
 from collections import namedtuple
-from optparse import OptionParser
+from optparse import OptionParser,OptionGroup
+from parse.enum import Enum
 from run.executable.executable import Executable
 from run.experiment import Experiment,ExperimentDone,SystemCorrupted
 from run.proc_entry import ProcEntry
@@ -19,9 +23,19 @@ from run.proc_entry import ProcEntry
 ExpParams = namedtuple('ExpParams', ['scheduler', 'duration', 'tracers',
                                     'kernel', 'config_options', 'file_params',
                                     'pre_script', 'post_script'])
+'''Tracked with each experiment'''
+ExpState = Enum(['Failed', 'Succeeded', 'Invalid', 'Done', 'None'])
+ExpData  = com.recordtype('ExpData', ['name', 'params', 'sched_file', 'out_dir',
+                                      'retries', 'state'])
 '''Comparison of requested versus actual kernel compile parameter value'''
 ConfigResult = namedtuple('ConfigResult', ['param', 'wanted', 'actual'])
+'''Maximum times an experiment will be retried'''
+MAX_RETRY = 5
+'''Location experiment retry count is stored'''
+TRIES_FNAME = ".tries.pkl"
 class InvalidKernel(Exception):
    def __init__(self, kernel):
        self.kernel = kernel
@@ -51,27 +65,37 @@ def parse_args():
    parser.add_option('-s', '--scheduler', dest='scheduler',
                      help='scheduler for all experiments')
+    parser.add_option('-d', '--duration', dest='duration', type='int',
+                      help='duration (seconds) of tasks')
    parser.add_option('-i', '--ignore-environment', dest='ignore',
                      action='store_true', default=False,
                      help='run experiments even in invalid environments ')
-    parser.add_option('-d', '--duration', dest='duration', type='int',
+    parser.add_option('-f', '--force', action='store_true', default=False,
-                      help='duration (seconds) of tasks')
+                      dest='force', help='overwrite existing data')
    parser.add_option('-o', '--out-dir', dest='out_dir',
                      help='directory for data output',
                      default=DEFAULTS['out-run'])
-    parser.add_option('-p', '--params', dest='param_file',
-                      help='file with experiment parameters')
+    group = OptionGroup(parser, "Communication Options")
-    parser.add_option('-c', '--schedule-file', dest='sched_file',
+    group.add_option('-j', '--jabber', metavar='username@domain',
-                      help='name of schedule files within directories',
+                     dest='jabber', default=None,
-                      default=DEFAULTS['sched_file'])
+                     help='send a jabber message when an experiment completes')
-    parser.add_option('-f', '--force', action='store_true', default=False,
+    group.add_option('-e', '--email', metavar='username@server',
-                      dest='force', help='overwrite existing data')
+                     dest='email', default=None,
-    parser.add_option('-j', '--jabber', metavar='username@domain',
+                     help='send an email when all experiments complete')
-                      dest='jabber', default=None,
+    parser.add_option_group(group)
-                      help='send a jabber message when an experiment completes')
-    parser.add_option('-e', '--email', metavar='username@server',
+    group = OptionGroup(parser, "Persistence Options")
-                      dest='email', default=None,
+    group.add_option('-r', '--retry', dest='retry', action='store_true',
-                      help='send an email when all experiments complete')
+                     default=False, help='retry failed experiments')
+    group.add_option('-c', '--crontab', dest='crontab',
+                     action='store_true', default=False,
+                     help='use crontab to resume interrupted script after '
+                     'system restarts. implies --retry')
+    group.add_option('-k', '--kill-crontab', dest='kill',
+                     action='store_true', default=False,
+                     help='kill existing script crontabs and exit')
+    parser.add_option_group(group)
    return parser.parse_args()
@@ -207,12 +231,12 @@ def run_script(script_params, exp, exp_dir, out_dir):
    out.close()
-def make_exp_params(cmd_scheduler, cmd_duration, sched_dir, param_file):
+def make_exp_params(cmd_scheduler, cmd_duration, sched_dir):
    '''Return ExpParam with configured values of all hardcoded params.'''
    kernel = copts = ""
    # Load parameter file
-    param_file = param_file or "%s/%s" % (sched_dir, DEFAULTS['params_file'])
+    param_file = "%s/%s" % (sched_dir, FILES['params_file'])
    if os.path.isfile(param_file):
        fparams = com.load_params(param_file)
    else:
@@ -252,65 +276,118 @@ def make_exp_params(cmd_scheduler, cmd_duration, sched_dir, param_file):
                     config_options=copts, tracers=tracers, file_params=fparams,
                     pre_script=pre_script, post_script=post_script)
-def run_experiment(name, sched_file, exp_params, out_dir,
+def run_experiment(data, start_message, ignore, jabber):
-                   start_message, ignore, jabber):
    '''Load and parse data from files and run result.'''
-    if not os.path.isfile(sched_file):
+    if not os.path.isfile(data.sched_file):
-        raise IOError("Cannot find schedule file: %s" % sched_file)
+        raise IOError("Cannot find schedule file: %s" % data.sched_file)
-    dir_name, fname = os.path.split(sched_file)
+    dir_name, fname = os.path.split(data.sched_file)
    work_dir = "%s/tmp" % dir_name
-    procs, execs = load_schedule(name, sched_file, exp_params.duration)
+    procs, execs = load_schedule(data.name, data.sched_file, data.params.duration)
-    exp = Experiment(name, exp_params.scheduler, work_dir, out_dir,
+    exp = Experiment(data.name, data.params.scheduler, work_dir,
-                     procs, execs, exp_params.tracers)
+                     data.out_dir, procs, execs, data.params.tracers)
    exp.log(start_message)
    if not ignore:
-        verify_environment(exp_params)
+        verify_environment(data.params)
-    run_script(exp_params.pre_script, exp, dir_name, work_dir)
+    run_script(data.params.pre_script, exp, dir_name, work_dir)
    exp.run_exp()
-    run_script(exp_params.post_script, exp, dir_name, out_dir)
+    run_script(data.params.post_script, exp, dir_name, data.out_dir)
    if jabber:
-        jabber.send("Completed '%s'" % name)
+        jabber.send("Completed '%s'" % data.name)
-    # Save parameters used to run experiment in out_dir
+    # Save parameters used to run experiment in out_dir
-    out_params = dict(exp_params.file_params.items() +
+    out_params = dict([(PARAMS['sched'],  data.params.scheduler),
-                      [(PARAMS['sched'],  exp_params.scheduler),
                       (PARAMS['tasks'],  len(execs)),
-                       (PARAMS['dur'],    exp_params.duration)])
+                       (PARAMS['dur'],    data.params.duration)] +
+                       data.params.file_params.items())
    # Feather-trace clock frequency saved for accurate overhead parsing
    ft_freq = com.ft_freq()
    if ft_freq:
        out_params[PARAMS['cycles']] = ft_freq
-    with open("%s/%s" % (out_dir, DEFAULTS['params_file']), 'w') as f:
+    out_param_f = "%s/%s" % (data.out_dir, FILES['params_file'])
-        f.write(str(out_params))
+    with open(out_param_f, 'w') as f:
+        pprint.pprint(out_params, f)
+def make_paths(exp, opts, out_base_dir):
+    '''Translate experiment name to (schedule file, output directory) paths'''
+    path = os.path.abspath(exp)
+    out_dir = "%s/%s" % (out_base_dir, os.path.split(exp.strip('/'))[1])
+    if not os.path.exists(path):
+        raise IOError("Invalid experiment: %s" % path)
-def get_exps(opts, args):
+    if opts.force and os.path.exists(out_dir):
-    '''Return list of experiment files or directories'''
+        shutil.rmtree(out_dir)
-    if args:
-        return args
-    # Default to sched_file > generated dirs
+    if os.path.isdir(path):
-    if os.path.exists(opts.sched_file):
+        sched_file = "%s/%s" % (path, FILES['sched_file'])
-        sys.stderr.write("Reading schedule from %s.\n" % opts.sched_file)
-        return [opts.sched_file]
-    elif os.path.exists(DEFAULTS['out-gen']):
-        sys.stderr.write("Reading schedules from %s/*.\n" % DEFAULTS['out-gen'])
-        sched_dirs = os.listdir(DEFAULTS['out-gen'])
-        return ['%s/%s' % (DEFAULTS['out-gen'], d) for d in sched_dirs]
    else:
-        sys.stderr.write("Run with -h to view options.\n");
+        sched_file = path
-        sys.exit(1)
+    return sched_file, out_dir
+def get_common_header(args):
+    '''Return the leading characters shared by every path in @args.
+    A single path has no header, so that it keeps its full name.'''
+    if len(args) == 1:
+        return ""
+    # Character-wise common prefix. Unlike a hand-written scan this cannot
+    # index past the end of a path which is itself a prefix of all the
+    # other paths (for example when the same path is given twice)
+    return os.path.commonprefix(args)
+def get_exps(opts, args, out_base_dir):
+    '''Return a list with one ExpData for every experiment to run.
+    Without @args, the schedule file in the current directory is used if it
+    exists, otherwise every entry of the default gen_exps output directory.
+    If neither exists, a hint is printed and the program exits with 1.'''
+    if not args:
+        sched_name = FILES['sched_file']
+        if os.path.exists(sched_name):
+            # Default to sched_file in current directory
+            sys.stderr.write("Reading schedule from %s.\n" % sched_name)
+            args = [sched_name]
+        else:
+            gen_dir = DEFAULTS['out-gen']
+            if os.path.exists(gen_dir):
+                # Then try experiments created by gen_exps
+                sys.stderr.write("Reading schedules from %s/*.\n" % gen_dir)
+                args = ['%s/%s' % (gen_dir, d) for d in os.listdir(gen_dir)]
+            else:
+                sys.stderr.write("Run with -h to view options.\n")
+                sys.exit(1)
+    # Experiments are named by the part of their path which differs
+    header_len = len(get_common_header(args))
+    exps = []
+    for path in args:
+        sched_file, out_dir = make_paths(path, opts, out_base_dir)
+        sched_dir  = os.path.dirname(sched_file)
+        exp_params = make_exp_params(opts.scheduler, opts.duration, sched_dir)
+        exps.append(ExpData(path[header_len:], exp_params, sched_file,
+                            out_dir, 0, ExpState.None))
+    return exps
 def setup_jabber(target):
@@ -338,93 +415,142 @@ def setup_email(target):
    return None
-def make_paths(exp, out_base_dir, opts):
+def tries_file(exp):
-    '''Translate experiment name to (schedule file, output directory) paths'''
+    return "%s/%s" % (os.path.split(exp.sched_file)[0], TRIES_FNAME)
-    path = "%s/%s" % (os.getcwd(), exp)
-    out_dir = "%s/%s" % (out_base_dir, os.path.split(exp.strip('/'))[1])
-    if not os.path.exists(path):
-        raise IOError("Invalid experiment: %s" % path)
-    if opts.force and os.path.exists(out_dir):
+def get_tries(exp):
-        shutil.rmtree(out_dir)
+    if not os.path.exists(tries_file(exp)):
+        return 0
+    with open(tries_file(exp), 'r') as f:
+        return int(pickle.load(f))
-    if os.path.isdir(path):
-        sched_file = "%s/%s" % (path, opts.sched_file)
-    else:
-        sched_file = path
-    return sched_file, out_dir
+def set_tries(exp, val):
+    '''Store @val as the number of tries recorded for @exp.
+    A false @val (0 or None) deletes the record instead. 'sync' is run
+    afterwards in both cases.'''
+    fname = tries_file(exp)
+    if val:
+        with open(fname, 'w') as f:
+            pickle.dump(str(val), f)
+    elif os.path.exists(fname):
+        os.remove(fname)
+    # Presumably so the count survives a crash or reboot -- TODO confirm
+    os.system('sync')
-def main():
-    opts, args = parse_args()
-    exps = get_exps(opts, args)
+def run_exps(exps, opts):
    jabber = setup_jabber(opts.jabber) if opts.jabber else None
-    email  = setup_email(opts.email)   if opts.email  else None
-    out_base = os.path.abspath(opts.out_dir)
+    # Give each experiment a unique id
-    created  = False
+    exps_remaining = enumerate(exps)
-    if not os.path.exists(out_base):
+    # But run experiments which have failed the most last
-        created = True
+    exps_remaining = sorted(exps_remaining, key=lambda x: get_tries(x[1]))
-        os.mkdir(out_base)
-    ran = done = succ = failed = invalid = 0
+    while exps_remaining:
+        i, exp = exps_remaining.pop(0)
-    for i, exp in enumerate(exps):
+        verb = "Loading" if exp.state == ExpState.None else "Re-running failed"
-        sched_file, out_dir = make_paths(exp, out_base, opts)
+        start_message = "%s experiment %d of %d." % (verb, i+1, len(exps))
-        sched_dir = os.path.split(sched_file)[0]
        try:
-            start_message = "Loading experiment %d of %d." % (i+1, len(exps))
+            set_tries(exp, get_tries(exp) + 1)
-            exp_params = make_exp_params(opts.scheduler, opts.duration,
+            if get_tries(exp) > MAX_RETRY:
-                                         sched_dir, opts.param_file)
+                raise Exception("Hit maximum retries of %d" % MAX_RETRY)
-            run_experiment(exp, sched_file, exp_params, out_dir,
+            run_experiment(exp, start_message, opts.ignore, jabber)
-                           start_message, opts.ignore, jabber)
-            succ += 1
+            set_tries(exp, 0)
-        except ExperimentDone:
+            exp.state = ExpState.Succeeded
-            sys.stderr.write("Experiment '%s' already completed " % exp +
-                             "at '%s'\n" % out_base)
-            done += 1
-        except (InvalidKernel, InvalidConfig) as e:
-            sys.stderr.write("Invalid environment for experiment '%s'\n" % exp)
-            sys.stderr.write("%s\n" % e)
-            invalid += 1
        except KeyboardInterrupt:
            sys.stderr.write("Keyboard interrupt, quitting\n")
+            set_tries(exp, get_tries(exp) - 1)
            break
+        except ExperimentDone:
+            sys.stderr.write("Experiment already completed at '%s'\n" % exp.out_dir)
+            set_tries(exp, 0)
+            exp.state = ExpState.Done
+        except (InvalidKernel, InvalidConfig) as e:
+            sys.stderr.write("Invalid environment for experiment '%s'\n" % exp.name)
+            sys.stderr.write("%s\n" % e)
+            set_tries(exp, get_tries(exp) - 1)
+            exp.state = ExpState.Invalid
        except SystemCorrupted as e:
            sys.stderr.write("System is corrupted! Fix state before continuing.\n")
            sys.stderr.write("%s\n" % e)
-            break
+            exp.state = ExpState.Failed
+            if not opts.retry:
+                break
+            else:
+                sys.stderr.write("Remaining experiments may fail\n")
        except Exception as e:
-            sys.stderr.write("Failed experiment %s\n" % exp)
+            sys.stderr.write("Failed experiment %s\n" % exp.name)
            sys.stderr.write("%s\n" % e)
-            failed += 1
+            exp.state = ExpState.Failed
-        ran += 1
+        if exp.state is ExpState.Failed and opts.retry:
+            exps_remaining += [(i, exp)]
-    # Clean out directory if it failed immediately
-    if not os.listdir(out_base) and created and not succ:
+def main():
-        os.rmdir(out_base)
+    '''Run the experiments named on the command line and print a summary.
+    With --kill-crontab, only the installed reboot job is removed.'''
+    opts, args = parse_args()
+    if opts.kill:
+        cron.kill_boot_job()
+        sys.exit(1)
+    email = setup_email(opts.email) if opts.email else None
+    # Create base output directory for run data
+    out_base = os.path.abspath(opts.out_dir)
+    created  = False
+    if not os.path.exists(out_base):
+        created = True
+        os.mkdir(out_base)
+    exps = get_exps(opts, args, out_base)
+    if opts.crontab:
+        # Resume script on startup
+        opts.retry = True
+        # The job re-run after a reboot must not inherit -f / --force, or it
+        # would delete the data collected before the reboot
+        cron.install_boot_job(['f', '--force'],
+                              "Stop with %s -k" % com.get_cmd())
+    if opts.force or not opts.retry:
+        cron.clean_output()
+        for e in exps:
+            set_tries(e, 0)
+    try:
+        run_exps(exps, opts)
+    finally:
+        # Remove persistent state
+        for e in exps:
+            set_tries(e, 0)
+        cron.remove_boot_job()
+    def state_count(state):
+        return len(filter(lambda x: x.state is state, exps))
+    ran  = len(filter(lambda x: x.state is not ExpState.None, exps))
+    succ = state_count(ExpState.Succeeded)
    message = "Experiments ran:\t%d of %d" % (ran, len(exps)) +\
      "\n  Successful:\t\t%d" % succ +\
-      "\n  Failed:\t\t%d" % failed +\
+      "\n  Failed:\t\t%d" % state_count(ExpState.Failed) +\
-      "\n  Already Done:\t\t%d" % done +\
+      "\n  Already Done:\t\t%d" % state_count(ExpState.Done) +\
-      "\n  Invalid Environment:\t%d" % invalid
+      "\n  Invalid Environment:\t%d" % state_count(ExpState.Invalid)
    print(message)
+    if email:
+        email.send(message)
+        email.close()
    if succ:
        sys.stderr.write("Successful experiment data saved in %s.\n" %
                         opts.out_dir)
+    elif not os.listdir(out_base) and created:
+        # Remove directory if no data was put into it
+        os.rmdir(out_base)
-    if email:
-        email.send(message)
-        email.close()
 if __name__ == '__main__':
    main()