diff options
| author | Jonathan Herman <hermanjl@cs.unc.edu> | 2012-09-17 11:28:55 -0400 |
|---|---|---|
| committer | Jonathan Herman <hermanjl@cs.unc.edu> | 2012-09-17 11:28:55 -0400 |
| commit | 48e1c8de6f8fca6770b669f8dae1ddc52917bead (patch) | |
| tree | 751bb3a9fde03e249e91824c433a9f9736ae4d92 | |
| parent | bdb33621ac67b2cd9fadf3f3b006419ebb16a713 (diff) | |
Handle interrupted experiments (by renaming) and failed experiments (cleanup).
| -rw-r--r-- | experiment/experiment.py | 42 | ||||
| -rwxr-xr-x | run_exps.py | 20 |
2 files changed, 42 insertions, 20 deletions
diff --git a/experiment/experiment.py b/experiment/experiment.py
index 29e6bd7..b21397d 100644
--- a/experiment/experiment.py
+++ b/experiment/experiment.py
| @@ -9,26 +9,27 @@ class ExperimentException(Exception): | |||
| 9 | def __init__(self, name): | 9 | def __init__(self, name): |
| 10 | self.name = name | 10 | self.name = name |
| 11 | 11 | ||
| 12 | 12 | ||
| 13 | class ExperimentDone(ExperimentException): | 13 | class ExperimentDone(ExperimentException): |
| 14 | """Raised when an experiment looks like it's been run already.""" | 14 | """Raised when an experiment looks like it's been run already.""" |
| 15 | def __str__(self): | 15 | def __str__(self): |
| 16 | return "Experiment finished already: %d" % self.name | 16 | return "Experiment finished already: %d" % self.name |
| 17 | 17 | ||
| 18 | 18 | ||
| 19 | class ExperimentInterrupted(ExperimentException): | 19 | class ExperimentInterrupted(ExperimentException): |
| 20 | """Raised when an experiment appears to be interrupted (partial results).""" | 20 | """Raised when an experiment appears to be interrupted (partial results).""" |
| 21 | def __str__(self): | 21 | def __str__(self): |
| 22 | return "Experiment was interrupted in progress: %d" % self.name | 22 | return "Experiment was interrupted in progress: %d" % self.name |
| 23 | 23 | ||
| 24 | 24 | ||
| 25 | class ExperimentFailed(ExperimentException): | 25 | class ExperimentFailed(ExperimentException): |
| 26 | def __str__(self): | 26 | def __str__(self): |
| 27 | return "Experiment failed during execution: %d" % self.name | 27 | return "Experiment failed during execution: %d" % self.name |
| 28 | 28 | ||
| 29 | 29 | ||
| 30 | class Experiment(object): | 30 | class Experiment(object): |
| 31 | """Execute one task-set and save the results. Experiments have unique IDs.""" | 31 | """Execute one task-set and save the results. Experiments have unique IDs.""" |
| 32 | INTERRUPTED_DIR = ".interrupted" | ||
| 32 | 33 | ||
| 33 | def __init__(self, name, scheduler, working_dir, finished_dir, proc_entries, executables): | 34 | def __init__(self, name, scheduler, working_dir, finished_dir, proc_entries, executables): |
| 34 | """Run an experiment, optionally wrapped in tracing.""" | 35 | """Run an experiment, optionally wrapped in tracing.""" |
| @@ -45,28 +46,44 @@ class Experiment(object): | |||
| 45 | 46 | ||
| 46 | self.tracers = [] | 47 | self.tracers = [] |
| 47 | if SchedTracer.enabled(): | 48 | if SchedTracer.enabled(): |
| 49 | self.log("Enabling sched_trace") | ||
| 48 | self.tracers.append( SchedTracer(working_dir) ) | 50 | self.tracers.append( SchedTracer(working_dir) ) |
| 49 | if LinuxTracer.enabled(): | 51 | if LinuxTracer.enabled(): |
| 52 | self.log("Enabling trace-cmd / ftrace / kernelshark") | ||
| 50 | self.tracers.append( LinuxTracer(working_dir) ) | 53 | self.tracers.append( LinuxTracer(working_dir) ) |
| 51 | if LogTracer.enabled(): | 54 | if LogTracer.enabled(): |
| 55 | self.log("Enabling logging") | ||
| 52 | self.tracers.append( LogTracer(working_dir) ) | 56 | self.tracers.append( LogTracer(working_dir) ) |
| 53 | if PerfTracer.enabled(): | 57 | if PerfTracer.enabled(): |
| 58 | self.log("Tracking CPU performance counters") | ||
| 54 | self.tracers.append( PerfTracer(working_dir) ) | 59 | self.tracers.append( PerfTracer(working_dir) ) |
| 55 | 60 | ||
| 56 | # Overhead trace must be handled seperately, see __run_tasks | 61 | # Overhead trace must be handled seperately, see __run_tasks |
| 57 | if OverheadTracer.enabled(): | 62 | if OverheadTracer.enabled(): |
| 63 | self.log("Enabling overhead tracing") | ||
| 58 | self.overhead_trace = OverheadTracer(working_dir) | 64 | self.overhead_trace = OverheadTracer(working_dir) |
| 59 | else: | 65 | else: |
| 60 | self.overhead_trace = None | 66 | self.overhead_trace = None |
| 61 | 67 | ||
| 62 | def __make_dirs(self): | 68 | def __make_dirs(self): |
| 69 | interrupted = None | ||
| 70 | |||
| 63 | if os.path.exists(self.finished_dir): | 71 | if os.path.exists(self.finished_dir): |
| 64 | raise ExperimentDone(self.name) | 72 | raise ExperimentDone(self.name) |
| 73 | |||
| 65 | if os.path.exists(self.working_dir): | 74 | if os.path.exists(self.working_dir): |
| 66 | raise ExperimentInterrupted(self.name) | 75 | self.log("Found interrupted experiment, saving in %s" % |
| 67 | 76 | Experiment.INTERRUPTED_DIR) | |
| 77 | interrupted = "%s/%s" % (os.path.split(self.working_dir)[0], | ||
| 78 | Experiment.INTERRUPTED_DIR) | ||
| 79 | os.rename(self.working_dir, interrupted) | ||
| 80 | |||
| 68 | os.mkdir(self.working_dir) | 81 | os.mkdir(self.working_dir) |
| 69 | 82 | ||
| 83 | if interrupted: | ||
| 84 | os.rename(interrupted, "%s/%s" % (self.working_dir, | ||
| 85 | os.path.split(interrupted)[1])) | ||
| 86 | |||
| 70 | def __assign_executable_cwds(self): | 87 | def __assign_executable_cwds(self): |
| 71 | def assign_cwd(executable): | 88 | def assign_cwd(executable): |
| 72 | executable.cwd = self.working_dir | 89 | executable.cwd = self.working_dir |
| @@ -90,13 +107,16 @@ class Experiment(object): | |||
| 90 | self.log("Starting overhead trace") | 107 | self.log("Starting overhead trace") |
| 91 | self.overhead_trace.start_tracing() | 108 | self.overhead_trace.start_tracing() |
| 92 | 109 | ||
| 110 | self.log("Releasing %d tasks" % len(self.executables)) | ||
| 93 | released = litmus_util.release_tasks() | 111 | released = litmus_util.release_tasks() |
| 94 | 112 | ||
| 95 | ret = True | 113 | ret = True |
| 96 | if released != len(self.executables): | 114 | if released != len(self.executables): |
| 115 | # Some tasks failed to release, kill all tasks and fail | ||
| 116 | # Need to re-release non-released tasks before we can kill them though | ||
| 97 | self.log("Failed to release %d tasks! Re-releasing and killing".format( | 117 | self.log("Failed to release %d tasks! Re-releasing and killing".format( |
| 98 | len(self.experiments) - released)) | 118 | len(self.experiments) - released)) |
| 99 | 119 | ||
| 100 | time.sleep(10) | 120 | time.sleep(10) |
| 101 | litmus_util.release_tasks() | 121 | litmus_util.release_tasks() |
| 102 | 122 | ||
| @@ -124,8 +144,10 @@ class Experiment(object): | |||
| 124 | 144 | ||
| 125 | def run_exp(self): | 145 | def run_exp(self): |
| 126 | self.setup() | 146 | self.setup() |
| 127 | self.__run_tasks() | 147 | try: |
| 128 | self.teardown() | 148 | self.__run_tasks() |
| 149 | finally: | ||
| 150 | self.teardown() | ||
| 129 | 151 | ||
| 130 | def setup(self): | 152 | def setup(self): |
| 131 | self.log("Switching to %s" % self.scheduler) | 153 | self.log("Switching to %s" % self.scheduler) |
| @@ -133,7 +155,7 @@ class Experiment(object): | |||
| 133 | 155 | ||
| 134 | self.log("Writing %d proc entries" % len(self.proc_entries)) | 156 | self.log("Writing %d proc entries" % len(self.proc_entries)) |
| 135 | map(methodcaller('write_proc'), self.proc_entries) | 157 | map(methodcaller('write_proc'), self.proc_entries) |
| 136 | 158 | ||
| 137 | self.log("Starting %d tracers" % len(self.tracers)) | 159 | self.log("Starting %d tracers" % len(self.tracers)) |
| 138 | map(methodcaller('start_tracing'), self.tracers) | 160 | map(methodcaller('start_tracing'), self.tracers) |
| 139 | 161 | ||
diff --git a/run_exps.py b/run_exps.py
index c589f51..bc15b98 100755
--- a/run_exps.py
+++ b/run_exps.py
| @@ -55,8 +55,8 @@ def convert_data(data): | |||
| 55 | else: | 55 | else: |
| 56 | prog = match.group("TYPE") or "rtspin" | 56 | prog = match.group("TYPE") or "rtspin" |
| 57 | spin = (prog, match.group("ARGS")) | 57 | spin = (prog, match.group("ARGS")) |
| 58 | spins.append(spin) | 58 | spins.append(spin) |
| 59 | 59 | ||
| 60 | return {'proc' : procs, 'spin' : spins} | 60 | return {'proc' : procs, 'spin' : spins} |
| 61 | 61 | ||
| 62 | 62 | ||
| @@ -80,7 +80,7 @@ def load_experiment(sched_file, scheduler, duration, param_file, out_base): | |||
| 80 | raise IOError("Cannot find schedule file: %s" % sched_file) | 80 | raise IOError("Cannot find schedule file: %s" % sched_file) |
| 81 | 81 | ||
| 82 | dirname = os.path.split(sched_file)[0] | 82 | dirname = os.path.split(sched_file)[0] |
| 83 | 83 | ||
| 84 | if not scheduler or not duration: | 84 | if not scheduler or not duration: |
| 85 | param_file = param_file or \ | 85 | param_file = param_file or \ |
| 86 | "%s/%s" % (dirname, conf.DEFAULTS['params_file']) | 86 | "%s/%s" % (dirname, conf.DEFAULTS['params_file']) |
| @@ -89,7 +89,7 @@ def load_experiment(sched_file, scheduler, duration, param_file, out_base): | |||
| 89 | params = load_params(param_file) | 89 | params = load_params(param_file) |
| 90 | scheduler = scheduler or params[conf.PARAMS['sched']] | 90 | scheduler = scheduler or params[conf.PARAMS['sched']] |
| 91 | duration = duration or params[conf.PARAMS['dur']] | 91 | duration = duration or params[conf.PARAMS['dur']] |
| 92 | 92 | ||
| 93 | duration = duration or conf.DEFAULTS['duration'] | 93 | duration = duration or conf.DEFAULTS['duration'] |
| 94 | 94 | ||
| 95 | if not scheduler: | 95 | if not scheduler: |
| @@ -133,10 +133,10 @@ def run_exp(name, schedule, scheduler, duration, work_dir, out_dir): | |||
| 133 | for entry_conf in schedule['proc']: | 133 | for entry_conf in schedule['proc']: |
| 134 | path = entry_conf[0] | 134 | path = entry_conf[0] |
| 135 | data = entry_conf[1] | 135 | data = entry_conf[1] |
| 136 | 136 | ||
| 137 | if not os.path.exists(path): | 137 | if not os.path.exists(path): |
| 138 | raise IOError("Invalid proc path %s: %s" % (path, name)) | 138 | raise IOError("Invalid proc path %s: %s" % (path, name)) |
| 139 | 139 | ||
| 140 | proc_entries += [ProcEntry(path, data)] | 140 | proc_entries += [ProcEntry(path, data)] |
| 141 | 141 | ||
| 142 | # Parse spinners | 142 | # Parse spinners |
| @@ -158,14 +158,14 @@ def run_exp(name, schedule, scheduler, duration, work_dir, out_dir): | |||
| 158 | 158 | ||
| 159 | if not lu.is_executable(real_spin): | 159 | if not lu.is_executable(real_spin): |
| 160 | raise OSError("Cannot run spin %s: %s" % (real_spin, name)) | 160 | raise OSError("Cannot run spin %s: %s" % (real_spin, name)) |
| 161 | 161 | ||
| 162 | executables += [Executable(real_spin, real_args)] | 162 | executables += [Executable(real_spin, real_args)] |
| 163 | 163 | ||
| 164 | exp = Experiment(name, scheduler, work_dir, out_dir, | 164 | exp = Experiment(name, scheduler, work_dir, out_dir, |
| 165 | proc_entries, executables) | 165 | proc_entries, executables) |
| 166 | exp.run_exp() | 166 | exp.run_exp() |
| 167 | 167 | ||
| 168 | 168 | ||
| 169 | def main(): | 169 | def main(): |
| 170 | opts, args = parse_args() | 170 | opts, args = parse_args() |
| 171 | 171 | ||
| @@ -181,12 +181,12 @@ def main(): | |||
| 181 | 181 | ||
| 182 | if not os.path.exists(path): | 182 | if not os.path.exists(path): |
| 183 | raise IOError("Invalid experiment: %s" % path) | 183 | raise IOError("Invalid experiment: %s" % path) |
| 184 | 184 | ||
| 185 | if os.path.isdir(exp): | 185 | if os.path.isdir(exp): |
| 186 | path = "%s%s" % (path, opts.sched_file) | 186 | path = "%s%s" % (path, opts.sched_file) |
| 187 | 187 | ||
| 188 | load_experiment(path, scheduler, duration, param_file, out_base) | 188 | load_experiment(path, scheduler, duration, param_file, out_base) |
| 189 | 189 | ||
| 190 | 190 | ||
| 191 | if __name__ == '__main__': | 191 | if __name__ == '__main__': |
| 192 | main() | 192 | main() |
