diff options
| author | Jonathan Herman <hermanjl@cs.unc.edu> | 2012-09-17 11:28:55 -0400 |
|---|---|---|
| committer | Jonathan Herman <hermanjl@cs.unc.edu> | 2012-09-17 11:28:55 -0400 |
| commit | 48e1c8de6f8fca6770b669f8dae1ddc52917bead (patch) | |
| tree | 751bb3a9fde03e249e91824c433a9f9736ae4d92 /experiment | |
| parent | bdb33621ac67b2cd9fadf3f3b006419ebb16a713 (diff) | |
Handle interrupted experiments (by renaming) and failed experiments (cleanup).
Diffstat (limited to 'experiment')
| -rw-r--r-- | experiment/experiment.py | 42 |
1 files changed, 32 insertions, 10 deletions
diff --git a/experiment/experiment.py b/experiment/experiment.py index 29e6bd7..b21397d 100644 --- a/experiment/experiment.py +++ b/experiment/experiment.py | |||
| @@ -9,26 +9,27 @@ class ExperimentException(Exception): | |||
| 9 | def __init__(self, name): | 9 | def __init__(self, name): |
| 10 | self.name = name | 10 | self.name = name |
| 11 | 11 | ||
| 12 | 12 | ||
| 13 | class ExperimentDone(ExperimentException): | 13 | class ExperimentDone(ExperimentException): |
| 14 | """Raised when an experiment looks like it's been run already.""" | 14 | """Raised when an experiment looks like it's been run already.""" |
| 15 | def __str__(self): | 15 | def __str__(self): |
| 16 | return "Experiment finished already: %d" % self.name | 16 | return "Experiment finished already: %d" % self.name |
| 17 | 17 | ||
| 18 | 18 | ||
| 19 | class ExperimentInterrupted(ExperimentException): | 19 | class ExperimentInterrupted(ExperimentException): |
| 20 | """Raised when an experiment appears to be interrupted (partial results).""" | 20 | """Raised when an experiment appears to be interrupted (partial results).""" |
| 21 | def __str__(self): | 21 | def __str__(self): |
| 22 | return "Experiment was interrupted in progress: %d" % self.name | 22 | return "Experiment was interrupted in progress: %d" % self.name |
| 23 | 23 | ||
| 24 | 24 | ||
| 25 | class ExperimentFailed(ExperimentException): | 25 | class ExperimentFailed(ExperimentException): |
| 26 | def __str__(self): | 26 | def __str__(self): |
| 27 | return "Experiment failed during execution: %d" % self.name | 27 | return "Experiment failed during execution: %d" % self.name |
| 28 | 28 | ||
| 29 | 29 | ||
| 30 | class Experiment(object): | 30 | class Experiment(object): |
| 31 | """Execute one task-set and save the results. Experiments have unique IDs.""" | 31 | """Execute one task-set and save the results. Experiments have unique IDs.""" |
| 32 | INTERRUPTED_DIR = ".interrupted" | ||
| 32 | 33 | ||
| 33 | def __init__(self, name, scheduler, working_dir, finished_dir, proc_entries, executables): | 34 | def __init__(self, name, scheduler, working_dir, finished_dir, proc_entries, executables): |
| 34 | """Run an experiment, optionally wrapped in tracing.""" | 35 | """Run an experiment, optionally wrapped in tracing.""" |
| @@ -45,28 +46,44 @@ class Experiment(object): | |||
| 45 | 46 | ||
| 46 | self.tracers = [] | 47 | self.tracers = [] |
| 47 | if SchedTracer.enabled(): | 48 | if SchedTracer.enabled(): |
| 49 | self.log("Enabling sched_trace") | ||
| 48 | self.tracers.append( SchedTracer(working_dir) ) | 50 | self.tracers.append( SchedTracer(working_dir) ) |
| 49 | if LinuxTracer.enabled(): | 51 | if LinuxTracer.enabled(): |
| 52 | self.log("Enabling trace-cmd / ftrace / kernelshark") | ||
| 50 | self.tracers.append( LinuxTracer(working_dir) ) | 53 | self.tracers.append( LinuxTracer(working_dir) ) |
| 51 | if LogTracer.enabled(): | 54 | if LogTracer.enabled(): |
| 55 | self.log("Enabling logging") | ||
| 52 | self.tracers.append( LogTracer(working_dir) ) | 56 | self.tracers.append( LogTracer(working_dir) ) |
| 53 | if PerfTracer.enabled(): | 57 | if PerfTracer.enabled(): |
| 58 | self.log("Tracking CPU performance counters") | ||
| 54 | self.tracers.append( PerfTracer(working_dir) ) | 59 | self.tracers.append( PerfTracer(working_dir) ) |
| 55 | 60 | ||
| 56 | # Overhead trace must be handled seperately, see __run_tasks | 61 | # Overhead trace must be handled seperately, see __run_tasks |
| 57 | if OverheadTracer.enabled(): | 62 | if OverheadTracer.enabled(): |
| 63 | self.log("Enabling overhead tracing") | ||
| 58 | self.overhead_trace = OverheadTracer(working_dir) | 64 | self.overhead_trace = OverheadTracer(working_dir) |
| 59 | else: | 65 | else: |
| 60 | self.overhead_trace = None | 66 | self.overhead_trace = None |
| 61 | 67 | ||
| 62 | def __make_dirs(self): | 68 | def __make_dirs(self): |
| 69 | interrupted = None | ||
| 70 | |||
| 63 | if os.path.exists(self.finished_dir): | 71 | if os.path.exists(self.finished_dir): |
| 64 | raise ExperimentDone(self.name) | 72 | raise ExperimentDone(self.name) |
| 73 | |||
| 65 | if os.path.exists(self.working_dir): | 74 | if os.path.exists(self.working_dir): |
| 66 | raise ExperimentInterrupted(self.name) | 75 | self.log("Found interrupted experiment, saving in %s" % |
| 67 | 76 | Experiment.INTERRUPTED_DIR) | |
| 77 | interrupted = "%s/%s" % (os.path.split(self.working_dir)[0], | ||
| 78 | Experiment.INTERRUPTED_DIR) | ||
| 79 | os.rename(self.working_dir, interrupted) | ||
| 80 | |||
| 68 | os.mkdir(self.working_dir) | 81 | os.mkdir(self.working_dir) |
| 69 | 82 | ||
| 83 | if interrupted: | ||
| 84 | os.rename(interrupted, "%s/%s" % (self.working_dir, | ||
| 85 | os.path.split(interrupted)[1])) | ||
| 86 | |||
| 70 | def __assign_executable_cwds(self): | 87 | def __assign_executable_cwds(self): |
| 71 | def assign_cwd(executable): | 88 | def assign_cwd(executable): |
| 72 | executable.cwd = self.working_dir | 89 | executable.cwd = self.working_dir |
| @@ -90,13 +107,16 @@ class Experiment(object): | |||
| 90 | self.log("Starting overhead trace") | 107 | self.log("Starting overhead trace") |
| 91 | self.overhead_trace.start_tracing() | 108 | self.overhead_trace.start_tracing() |
| 92 | 109 | ||
| 110 | self.log("Releasing %d tasks" % len(self.executables)) | ||
| 93 | released = litmus_util.release_tasks() | 111 | released = litmus_util.release_tasks() |
| 94 | 112 | ||
| 95 | ret = True | 113 | ret = True |
| 96 | if released != len(self.executables): | 114 | if released != len(self.executables): |
| 115 | # Some tasks failed to release, kill all tasks and fail | ||
| 116 | # Need to re-release non-released tasks before we can kill them though | ||
| 97 | self.log("Failed to release %d tasks! Re-releasing and killing".format( | 117 | self.log("Failed to release %d tasks! Re-releasing and killing".format( |
| 98 | len(self.experiments) - released)) | 118 | len(self.experiments) - released)) |
| 99 | 119 | ||
| 100 | time.sleep(10) | 120 | time.sleep(10) |
| 101 | litmus_util.release_tasks() | 121 | litmus_util.release_tasks() |
| 102 | 122 | ||
| @@ -124,8 +144,10 @@ class Experiment(object): | |||
| 124 | 144 | ||
| 125 | def run_exp(self): | 145 | def run_exp(self): |
| 126 | self.setup() | 146 | self.setup() |
| 127 | self.__run_tasks() | 147 | try: |
| 128 | self.teardown() | 148 | self.__run_tasks() |
| 149 | finally: | ||
| 150 | self.teardown() | ||
| 129 | 151 | ||
| 130 | def setup(self): | 152 | def setup(self): |
| 131 | self.log("Switching to %s" % self.scheduler) | 153 | self.log("Switching to %s" % self.scheduler) |
| @@ -133,7 +155,7 @@ class Experiment(object): | |||
| 133 | 155 | ||
| 134 | self.log("Writing %d proc entries" % len(self.proc_entries)) | 156 | self.log("Writing %d proc entries" % len(self.proc_entries)) |
| 135 | map(methodcaller('write_proc'), self.proc_entries) | 157 | map(methodcaller('write_proc'), self.proc_entries) |
| 136 | 158 | ||
| 137 | self.log("Starting %d tracers" % len(self.tracers)) | 159 | self.log("Starting %d tracers" % len(self.tracers)) |
| 138 | map(methodcaller('start_tracing'), self.tracers) | 160 | map(methodcaller('start_tracing'), self.tracers) |
| 139 | 161 | ||
