diff options
author | Jonathan Herman <hermanjl@cs.unc.edu> | 2012-09-17 11:28:55 -0400 |
---|---|---|
committer | Jonathan Herman <hermanjl@cs.unc.edu> | 2012-09-17 11:28:55 -0400 |
commit | 48e1c8de6f8fca6770b669f8dae1ddc52917bead (patch) | |
tree | 751bb3a9fde03e249e91824c433a9f9736ae4d92 /experiment/experiment.py | |
parent | bdb33621ac67b2cd9fadf3f3b006419ebb16a713 (diff) |
Handle interrupted experiments (by renaming) and failed experiments (cleanup).
Diffstat (limited to 'experiment/experiment.py')
-rw-r--r-- | experiment/experiment.py | 42 |
1 files changed, 32 insertions, 10 deletions
diff --git a/experiment/experiment.py b/experiment/experiment.py index 29e6bd7..b21397d 100644 --- a/experiment/experiment.py +++ b/experiment/experiment.py | |||
@@ -9,26 +9,27 @@ class ExperimentException(Exception): | |||
9 | def __init__(self, name): | 9 | def __init__(self, name): |
10 | self.name = name | 10 | self.name = name |
11 | 11 | ||
12 | 12 | ||
13 | class ExperimentDone(ExperimentException): | 13 | class ExperimentDone(ExperimentException): |
14 | """Raised when an experiment looks like it's been run already.""" | 14 | """Raised when an experiment looks like it's been run already.""" |
15 | def __str__(self): | 15 | def __str__(self): |
16 | return "Experiment finished already: %d" % self.name | 16 | return "Experiment finished already: %d" % self.name |
17 | 17 | ||
18 | 18 | ||
19 | class ExperimentInterrupted(ExperimentException): | 19 | class ExperimentInterrupted(ExperimentException): |
20 | """Raised when an experiment appears to be interrupted (partial results).""" | 20 | """Raised when an experiment appears to be interrupted (partial results).""" |
21 | def __str__(self): | 21 | def __str__(self): |
22 | return "Experiment was interrupted in progress: %d" % self.name | 22 | return "Experiment was interrupted in progress: %d" % self.name |
23 | 23 | ||
24 | 24 | ||
25 | class ExperimentFailed(ExperimentException): | 25 | class ExperimentFailed(ExperimentException): |
26 | def __str__(self): | 26 | def __str__(self): |
27 | return "Experiment failed during execution: %d" % self.name | 27 | return "Experiment failed during execution: %d" % self.name |
28 | 28 | ||
29 | 29 | ||
30 | class Experiment(object): | 30 | class Experiment(object): |
31 | """Execute one task-set and save the results. Experiments have unique IDs.""" | 31 | """Execute one task-set and save the results. Experiments have unique IDs.""" |
32 | INTERRUPTED_DIR = ".interrupted" | ||
32 | 33 | ||
33 | def __init__(self, name, scheduler, working_dir, finished_dir, proc_entries, executables): | 34 | def __init__(self, name, scheduler, working_dir, finished_dir, proc_entries, executables): |
34 | """Run an experiment, optionally wrapped in tracing.""" | 35 | """Run an experiment, optionally wrapped in tracing.""" |
@@ -45,28 +46,44 @@ class Experiment(object): | |||
45 | 46 | ||
46 | self.tracers = [] | 47 | self.tracers = [] |
47 | if SchedTracer.enabled(): | 48 | if SchedTracer.enabled(): |
49 | self.log("Enabling sched_trace") | ||
48 | self.tracers.append( SchedTracer(working_dir) ) | 50 | self.tracers.append( SchedTracer(working_dir) ) |
49 | if LinuxTracer.enabled(): | 51 | if LinuxTracer.enabled(): |
52 | self.log("Enabling trace-cmd / ftrace / kernelshark") | ||
50 | self.tracers.append( LinuxTracer(working_dir) ) | 53 | self.tracers.append( LinuxTracer(working_dir) ) |
51 | if LogTracer.enabled(): | 54 | if LogTracer.enabled(): |
55 | self.log("Enabling logging") | ||
52 | self.tracers.append( LogTracer(working_dir) ) | 56 | self.tracers.append( LogTracer(working_dir) ) |
53 | if PerfTracer.enabled(): | 57 | if PerfTracer.enabled(): |
58 | self.log("Tracking CPU performance counters") | ||
54 | self.tracers.append( PerfTracer(working_dir) ) | 59 | self.tracers.append( PerfTracer(working_dir) ) |
55 | 60 | ||
56 | # Overhead trace must be handled separately, see __run_tasks | 61 | # Overhead trace must be handled separately, see __run_tasks |
57 | if OverheadTracer.enabled(): | 62 | if OverheadTracer.enabled(): |
63 | self.log("Enabling overhead tracing") | ||
58 | self.overhead_trace = OverheadTracer(working_dir) | 64 | self.overhead_trace = OverheadTracer(working_dir) |
59 | else: | 65 | else: |
60 | self.overhead_trace = None | 66 | self.overhead_trace = None |
61 | 67 | ||
62 | def __make_dirs(self): | 68 | def __make_dirs(self): |
69 | interrupted = None | ||
70 | |||
63 | if os.path.exists(self.finished_dir): | 71 | if os.path.exists(self.finished_dir): |
64 | raise ExperimentDone(self.name) | 72 | raise ExperimentDone(self.name) |
73 | |||
65 | if os.path.exists(self.working_dir): | 74 | if os.path.exists(self.working_dir): |
66 | raise ExperimentInterrupted(self.name) | 75 | self.log("Found interrupted experiment, saving in %s" % |
67 | 76 | Experiment.INTERRUPTED_DIR) | |
77 | interrupted = "%s/%s" % (os.path.split(self.working_dir)[0], | ||
78 | Experiment.INTERRUPTED_DIR) | ||
79 | os.rename(self.working_dir, interrupted) | ||
80 | |||
68 | os.mkdir(self.working_dir) | 81 | os.mkdir(self.working_dir) |
69 | 82 | ||
83 | if interrupted: | ||
84 | os.rename(interrupted, "%s/%s" % (self.working_dir, | ||
85 | os.path.split(interrupted)[1])) | ||
86 | |||
70 | def __assign_executable_cwds(self): | 87 | def __assign_executable_cwds(self): |
71 | def assign_cwd(executable): | 88 | def assign_cwd(executable): |
72 | executable.cwd = self.working_dir | 89 | executable.cwd = self.working_dir |
@@ -90,13 +107,16 @@ class Experiment(object): | |||
90 | self.log("Starting overhead trace") | 107 | self.log("Starting overhead trace") |
91 | self.overhead_trace.start_tracing() | 108 | self.overhead_trace.start_tracing() |
92 | 109 | ||
110 | self.log("Releasing %d tasks" % len(self.executables)) | ||
93 | released = litmus_util.release_tasks() | 111 | released = litmus_util.release_tasks() |
94 | 112 | ||
95 | ret = True | 113 | ret = True |
96 | if released != len(self.executables): | 114 | if released != len(self.executables): |
115 | # Some tasks failed to release, kill all tasks and fail | ||
116 | # Need to re-release non-released tasks before we can kill them though | ||
97 | self.log("Failed to release %d tasks! Re-releasing and killing".format( | 117 | self.log("Failed to release %d tasks! Re-releasing and killing".format( |
98 | len(self.experiments) - released)) | 118 | len(self.experiments) - released)) |
99 | 119 | ||
100 | time.sleep(10) | 120 | time.sleep(10) |
101 | litmus_util.release_tasks() | 121 | litmus_util.release_tasks() |
102 | 122 | ||
@@ -124,8 +144,10 @@ class Experiment(object): | |||
124 | 144 | ||
125 | def run_exp(self): | 145 | def run_exp(self): |
126 | self.setup() | 146 | self.setup() |
127 | self.__run_tasks() | 147 | try: |
128 | self.teardown() | 148 | self.__run_tasks() |
149 | finally: | ||
150 | self.teardown() | ||
129 | 151 | ||
130 | def setup(self): | 152 | def setup(self): |
131 | self.log("Switching to %s" % self.scheduler) | 153 | self.log("Switching to %s" % self.scheduler) |
@@ -133,7 +155,7 @@ class Experiment(object): | |||
133 | 155 | ||
134 | self.log("Writing %d proc entries" % len(self.proc_entries)) | 156 | self.log("Writing %d proc entries" % len(self.proc_entries)) |
135 | map(methodcaller('write_proc'), self.proc_entries) | 157 | map(methodcaller('write_proc'), self.proc_entries) |
136 | 158 | ||
137 | self.log("Starting %d tracers" % len(self.tracers)) | 159 | self.log("Starting %d tracers" % len(self.tracers)) |
138 | map(methodcaller('start_tracing'), self.tracers) | 160 | map(methodcaller('start_tracing'), self.tracers) |
139 | 161 | ||