diff options
-rw-r--r-- | common.py | 6 | ||||
-rw-r--r-- | parse/sched.py | 1 | ||||
-rw-r--r-- | run/executable/executable.py | 6 | ||||
-rw-r--r-- | run/experiment.py | 128 | ||||
-rw-r--r-- | run/litmus_util.py | 23 | ||||
-rwxr-xr-x | run_exps.py | 18 |
6 files changed, 114 insertions, 68 deletions
@@ -26,15 +26,17 @@ def get_executable_hint(prog, hint, optional=False): | |||
26 | '''Search for @prog in system PATH. Print @hint if no binary is found. | 26 | '''Search for @prog in system PATH. Print @hint if no binary is found. |
27 | Die if not @optional.''' | 27 | Die if not @optional.''' |
28 | try: | 28 | try: |
29 | prog = get_executable(prog) | 29 | full_path = get_executable(prog) |
30 | except IOError: | 30 | except IOError: |
31 | if not optional: | 31 | if not optional: |
32 | sys.stderr.write(("Cannot find executable '%s' in PATH. This is " +\ | 32 | sys.stderr.write(("Cannot find executable '%s' in PATH. This is " +\ |
33 | "a part of '%s' which should be added to PATH.\n")\ | 33 | "a part of '%s' which should be added to PATH.\n")\ |
34 | % (prog, hint)) | 34 | % (prog, hint)) |
35 | sys.exit(1) | 35 | sys.exit(1) |
36 | else: | ||
37 | full_path = None | ||
36 | 38 | ||
37 | return prog | 39 | return full_path |
38 | 40 | ||
39 | def get_config_option(option): | 41 | def get_config_option(option): |
40 | '''Search for @option in installed kernel config (if present). | 42 | '''Search for @option in installed kernel config (if present). |
diff --git a/parse/sched.py b/parse/sched.py index 2da0149..147a2e5 100644 --- a/parse/sched.py +++ b/parse/sched.py | |||
@@ -161,6 +161,7 @@ def extract_sched_data(result, data_dir, work_dir): | |||
161 | cmd_arr = [conf.BINS['st_show']] | 161 | cmd_arr = [conf.BINS['st_show']] |
162 | cmd_arr.extend(bins) | 162 | cmd_arr.extend(bins) |
163 | with open(output_file, "w") as f: | 163 | with open(output_file, "w") as f: |
164 | print("calling %s" % cmd_arr) | ||
164 | subprocess.call(cmd_arr, cwd=data_dir, stdout=f) | 165 | subprocess.call(cmd_arr, cwd=data_dir, stdout=f) |
165 | 166 | ||
166 | task_dict = defaultdict(lambda : | 167 | task_dict = defaultdict(lambda : |
diff --git a/run/executable/executable.py b/run/executable/executable.py index 02e35ae..263e305 100644 --- a/run/executable/executable.py +++ b/run/executable/executable.py | |||
@@ -1,13 +1,13 @@ | |||
1 | import sys | 1 | import sys |
2 | import subprocess | 2 | import subprocess |
3 | import signal | 3 | import signal |
4 | from common import is_executable | 4 | from common import get_executable |
5 | 5 | ||
6 | class Executable(object): | 6 | class Executable(object): |
7 | '''Parent object that represents an executable for use in task-sets.''' | 7 | '''Parent object that represents an executable for use in task-sets.''' |
8 | 8 | ||
9 | def __init__(self, exec_file, extra_args=None, stdout_file = None, stderr_file = None): | 9 | def __init__(self, exec_file, extra_args=None, stdout_file = None, stderr_file = None): |
10 | self.exec_file = exec_file | 10 | self.exec_file = get_executable(exec_file) |
11 | self.cwd = None | 11 | self.cwd = None |
12 | self.stdout_file = stdout_file | 12 | self.stdout_file = stdout_file |
13 | self.stderr_file = stderr_file | 13 | self.stderr_file = stderr_file |
@@ -18,7 +18,7 @@ class Executable(object): | |||
18 | else: | 18 | else: |
19 | self.extra_args = [str(a) for a in list(extra_args)] # make a duplicate | 19 | self.extra_args = [str(a) for a in list(extra_args)] # make a duplicate |
20 | 20 | ||
21 | if not is_executable(self.exec_file): | 21 | if not self.exec_file: |
22 | raise Exception("Not executable ? : %s" % self.exec_file) | 22 | raise Exception("Not executable ? : %s" % self.exec_file) |
23 | 23 | ||
24 | def __del__(self): | 24 | def __del__(self): |
diff --git a/run/experiment.py b/run/experiment.py index e5811f8..ff0e9f3 100644 --- a/run/experiment.py +++ b/run/experiment.py | |||
@@ -1,7 +1,9 @@ | |||
1 | import common as com | ||
1 | import os | 2 | import os |
2 | import time | 3 | import time |
3 | import run.litmus_util as lu | 4 | import run.litmus_util as lu |
4 | import shutil as sh | 5 | import shutil as sh |
6 | import traceback | ||
5 | from operator import methodcaller | 7 | from operator import methodcaller |
6 | 8 | ||
7 | class ExperimentException(Exception): | 9 | class ExperimentException(Exception): |
@@ -15,17 +17,12 @@ class ExperimentDone(ExperimentException): | |||
15 | def __str__(self): | 17 | def __str__(self): |
16 | return "Experiment finished already: %d" % self.name | 18 | return "Experiment finished already: %d" % self.name |
17 | 19 | ||
18 | |||
19 | class ExperimentInterrupted(ExperimentException): | ||
20 | '''Raised when an experiment appears to be interrupted (partial results).''' | ||
21 | def __str__(self): | ||
22 | return "Experiment was interrupted in progress: %d" % self.name | ||
23 | |||
24 | |||
25 | class ExperimentFailed(ExperimentException): | 20 | class ExperimentFailed(ExperimentException): |
26 | def __str__(self): | 21 | def __str__(self): |
27 | return "Experiment failed during execution: %d" % self.name | 22 | return "Experiment failed during execution: %d" % self.name |
28 | 23 | ||
24 | class SystemCorrupted(Exception): | ||
25 | pass | ||
29 | 26 | ||
30 | class Experiment(object): | 27 | class Experiment(object): |
31 | '''Execute one task-set and save the results. Experiments have unique IDs.''' | 28 | '''Execute one task-set and save the results. Experiments have unique IDs.''' |
@@ -44,6 +41,8 @@ class Experiment(object): | |||
44 | self.exec_out = None | 41 | self.exec_out = None |
45 | self.exec_err = None | 42 | self.exec_err = None |
46 | 43 | ||
44 | self.task_batch = com.num_cpus() | ||
45 | |||
47 | self.__make_dirs() | 46 | self.__make_dirs() |
48 | self.__assign_executable_cwds() | 47 | self.__assign_executable_cwds() |
49 | self.__setup_tracers(tracer_types) | 48 | self.__setup_tracers(tracer_types) |
@@ -84,65 +83,67 @@ class Experiment(object): | |||
84 | executable.cwd = self.working_dir | 83 | executable.cwd = self.working_dir |
85 | map(assign_cwd, self.executables) | 84 | map(assign_cwd, self.executables) |
86 | 85 | ||
87 | def __run_tasks(self): | 86 | def __kill_all(self): |
88 | already_waiting = lu.waiting_tasks() | 87 | # Give time for whatever failed to finish failing |
88 | time.sleep(2) | ||
89 | |||
90 | released = lu.release_tasks() | ||
91 | self.log("Re-released %d tasks" % released) | ||
89 | 92 | ||
90 | if already_waiting: | 93 | time.sleep(5) |
91 | self.log("Already %d tasks waiting for release!") | ||
92 | self.log("Experiment will fail if any of these tasks are released.") | ||
93 | 94 | ||
94 | self.log("Starting the programs") | 95 | self.log("Killing all tasks") |
95 | for e in self.executables: | 96 | for e in self.executables: |
96 | try: | 97 | try: |
98 | e.kill() | ||
99 | except: | ||
100 | pass | ||
101 | |||
102 | time.sleep(2) | ||
103 | |||
104 | def __run_tasks(self): | ||
105 | self.log("Starting %d tasks" % len(self.executables)) | ||
106 | |||
107 | for i,e in enumerate(self.executables): | ||
108 | try: | ||
97 | e.execute() | 109 | e.execute() |
98 | except: | 110 | except: |
99 | raise Exception("Executable failed: %s" % e) | 111 | raise Exception("Executable failed: %s" % e) |
100 | 112 | ||
101 | self.log("Sleeping until tasks are ready for release...") | 113 | self.log("Sleeping until tasks are ready for release...") |
102 | start = time.clock() | 114 | start = time.time() |
103 | while (lu.waiting_tasks() - already_waiting) < len(self.executables): | 115 | while lu.waiting_tasks() < len(self.executables): |
104 | if time.clock() - start > 30.0: | 116 | if time.time() - start > 30.0: |
105 | raise Exception("Too much time has passed waiting for tasks!") | 117 | s = "waiting: %d, submitted: %d" %\ |
118 | (lu.waiting_tasks(), len(self.executables)) | ||
119 | raise Exception("Too much time spent waiting for tasks! %s" % s) | ||
106 | time.sleep(1) | 120 | time.sleep(1) |
107 | 121 | ||
108 | # Exact tracers (like overheads) must be started right after release or | 122 | # Exact tracers (like overheads) must be started right after release or |
109 | # measurements will be full of irrelevant records | 123 | # measurements will be full of irrelevant records |
110 | self.log("Starting %d released tracers" % len(self.exact_tracers)) | 124 | self.log("Starting %d released tracers" % len(self.exact_tracers)) |
111 | map(methodcaller('start_tracing'), self.exact_tracers) | 125 | map(methodcaller('start_tracing'), self.exact_tracers) |
126 | time.sleep(1) | ||
112 | 127 | ||
113 | self.log("Releasing %d tasks" % len(self.executables)) | 128 | try: |
114 | released = lu.release_tasks() | 129 | self.log("Releasing %d tasks" % len(self.executables)) |
115 | |||
116 | ret = True | ||
117 | if released != len(self.executables): | ||
118 | # Some tasks failed to release, kill all tasks and fail | ||
119 | # Need to re-release non-released tasks before we can kill them though | ||
120 | self.log("Failed to release {} tasks! Re-releasing and killing".format( | ||
121 | len(self.executables) - released, len(self.executables))) | ||
122 | time.sleep(5) | ||
123 | |||
124 | released = lu.release_tasks() | 130 | released = lu.release_tasks() |
125 | 131 | ||
126 | self.log("Re-released %d tasks" % released) | 132 | if released != len(self.executables): |
127 | 133 | # Some tasks failed to release, kill all tasks and fail | |
128 | time.sleep(5) | 134 | # Need to release non-released tasks before they can be killed |
129 | 135 | raise Exception("Released %s tasks, expected %s tasks" % | |
130 | self.log("Killing all tasks") | 136 | (released, len(self.executables))) |
131 | map(methodcaller('kill'), self.executables) | ||
132 | 137 | ||
133 | ret = False | 138 | self.log("Waiting for program to finish...") |
139 | for e in self.executables: | ||
140 | if not e.wait(): | ||
141 | raise Exception("Executable %s failed to complete!" % e) | ||
134 | 142 | ||
135 | self.log("Waiting for program to finish...") | 143 | finally: |
136 | for e in self.executables: | 144 | # And these must be stopped here for the same reason |
137 | if not e.wait(): | 145 | self.log("Stopping exact tracers") |
138 | ret = False | 146 | map(methodcaller('stop_tracing'), self.exact_tracers) |
139 | |||
140 | # And these must be stopped here for the same reason | ||
141 | self.log("Stopping exact tracers") | ||
142 | map(methodcaller('stop_tracing'), self.exact_tracers) | ||
143 | |||
144 | if not ret: | ||
145 | raise ExperimentFailed(self.name) | ||
146 | 147 | ||
147 | def __save_results(self): | 148 | def __save_results(self): |
148 | os.rename(self.working_dir, self.finished_dir) | 149 | os.rename(self.working_dir, self.finished_dir) |
@@ -151,29 +152,48 @@ class Experiment(object): | |||
151 | print("[Exp %s]: %s" % (self.name, msg)) | 152 | print("[Exp %s]: %s" % (self.name, msg)) |
152 | 153 | ||
153 | def run_exp(self): | 154 | def run_exp(self): |
155 | self.__check_system() | ||
156 | |||
154 | succ = False | 157 | succ = False |
155 | try: | 158 | try: |
156 | self.setup() | 159 | self.__setup() |
157 | 160 | ||
158 | try: | 161 | try: |
159 | self.__run_tasks() | 162 | self.__run_tasks() |
160 | self.log("Saving results in %s" % self.finished_dir) | 163 | self.log("Saving results in %s" % self.finished_dir) |
161 | succ = True | 164 | succ = True |
165 | except: | ||
166 | traceback.print_exc() | ||
167 | self.__kill_all() | ||
168 | raise ExperimentFailed(self.name) | ||
162 | finally: | 169 | finally: |
163 | self.teardown() | 170 | self.__teardown() |
164 | finally: | 171 | finally: |
165 | self.log("Switching to Linux scheduler") | 172 | self.log("Switching to Linux scheduler") |
166 | try: | 173 | |
167 | lu.switch_scheduler("Linux") | 174 | # Give the tasks 10 seconds to finish before bailing |
168 | except: | 175 | start = time.time() |
169 | self.log("Failed to switch back to Linux.") | 176 | while lu.all_tasks() > 0: |
177 | if time.time() - start < 10.0: | ||
178 | raise SystemCorrupted("%d tasks still running!" % | ||
179 | lu.all_tasks()) | ||
180 | |||
181 | lu.switch_scheduler("Linux") | ||
170 | 182 | ||
171 | if succ: | 183 | if succ: |
172 | self.__save_results() | 184 | self.__save_results() |
173 | self.log("Experiment done!") | 185 | self.log("Experiment done!") |
174 | 186 | ||
187 | def __check_system(self): | ||
188 | running = lu.all_tasks() | ||
189 | if running: | ||
190 | raise SystemCorrupted("%d tasks already running!" % running) | ||
191 | |||
192 | sched = lu.scheduler() | ||
193 | if sched != "Linux": | ||
194 | raise SystemCorrupted("Scheduler is %s, should be Linux" % sched) | ||
175 | 195 | ||
176 | def setup(self): | 196 | def __setup(self): |
177 | self.log("Writing %d proc entries" % len(self.proc_entries)) | 197 | self.log("Writing %d proc entries" % len(self.proc_entries)) |
178 | map(methodcaller('write_proc'), self.proc_entries) | 198 | map(methodcaller('write_proc'), self.proc_entries) |
179 | 199 | ||
@@ -190,7 +210,7 @@ class Experiment(object): | |||
190 | executable.stderr_file = self.exec_err | 210 | executable.stderr_file = self.exec_err |
191 | map(set_out, self.executables) | 211 | map(set_out, self.executables) |
192 | 212 | ||
193 | def teardown(self): | 213 | def __teardown(self): |
194 | self.exec_out and self.exec_out.close() | 214 | self.exec_out and self.exec_out.close() |
195 | self.exec_err and self.exec_err.close() | 215 | self.exec_err and self.exec_err.close() |
196 | 216 | ||
diff --git a/run/litmus_util.py b/run/litmus_util.py index 4709b66..70da262 100644 --- a/run/litmus_util.py +++ b/run/litmus_util.py | |||
@@ -3,6 +3,11 @@ import time | |||
3 | import subprocess | 3 | import subprocess |
4 | import config.config as conf | 4 | import config.config as conf |
5 | 5 | ||
6 | def scheduler(): | ||
7 | with open('/proc/litmus/active_plugin', 'r') as active_plugin: | ||
8 | cur_plugin = active_plugin.read().strip() | ||
9 | return cur_plugin | ||
10 | |||
6 | def switch_scheduler(switch_to_in): | 11 | def switch_scheduler(switch_to_in): |
7 | '''Switch the scheduler to whatever is passed in. | 12 | '''Switch the scheduler to whatever is passed in. |
8 | 13 | ||
@@ -18,11 +23,10 @@ def switch_scheduler(switch_to_in): | |||
18 | # it takes a bit to do the switch, sleep an arbitrary amount of time | 23 | # it takes a bit to do the switch, sleep an arbitrary amount of time |
19 | time.sleep(2) | 24 | time.sleep(2) |
20 | 25 | ||
21 | with open('/proc/litmus/active_plugin', 'r') as active_plugin: | 26 | cur_plugin = scheduler() |
22 | cur_plugin = active_plugin.read().strip() | ||
23 | |||
24 | if switch_to != cur_plugin: | 27 | if switch_to != cur_plugin: |
25 | raise Exception("Could not switch to '%s' (check dmesg)" % switch_to) | 28 | raise Exception("Could not switch to '%s' (check dmesg), current: %s" %\ |
29 | (switch_to, cur_plugin)) | ||
26 | 30 | ||
27 | def waiting_tasks(): | 31 | def waiting_tasks(): |
28 | reg = re.compile(r'^ready.*?(?P<READY>\d+)$', re.M) | 32 | reg = re.compile(r'^ready.*?(?P<READY>\d+)$', re.M) |
@@ -35,6 +39,17 @@ def waiting_tasks(): | |||
35 | 39 | ||
36 | return 0 if not ready else int(ready) | 40 | return 0 if not ready else int(ready) |
37 | 41 | ||
42 | def all_tasks(): | ||
43 | reg = re.compile(r'^real-time.*?(?P<TASKS>\d+)$', re.M) | ||
44 | with open('/proc/litmus/stats', 'r') as f: | ||
45 | data = f.read() | ||
46 | |||
47 | # Ignore if no tasks are waiting for release | ||
48 | match = re.search(reg, data) | ||
49 | ready = match.group("TASKS") | ||
50 | |||
51 | return 0 if not ready else int(ready) | ||
52 | |||
38 | def release_tasks(): | 53 | def release_tasks(): |
39 | try: | 54 | try: |
40 | data = subprocess.check_output([conf.BINS['release']]) | 55 | data = subprocess.check_output([conf.BINS['release']]) |
diff --git a/run_exps.py b/run_exps.py index b9cf88e..6531415 100755 --- a/run_exps.py +++ b/run_exps.py | |||
@@ -8,12 +8,11 @@ import re | |||
8 | import shutil | 8 | import shutil |
9 | import sys | 9 | import sys |
10 | import run.tracer as trace | 10 | import run.tracer as trace |
11 | import traceback | ||
12 | 11 | ||
13 | from collections import namedtuple | 12 | from collections import namedtuple |
14 | from optparse import OptionParser | 13 | from optparse import OptionParser |
15 | from run.executable.executable import Executable | 14 | from run.executable.executable import Executable |
16 | from run.experiment import Experiment,ExperimentDone | 15 | from run.experiment import Experiment,ExperimentDone,ExperimentFailed,SystemCorrupted |
17 | from run.proc_entry import ProcEntry | 16 | from run.proc_entry import ProcEntry |
18 | 17 | ||
19 | '''Customizable experiment parameters''' | 18 | '''Customizable experiment parameters''' |
@@ -330,6 +329,7 @@ def main(): | |||
330 | created = True | 329 | created = True |
331 | os.mkdir(out_base) | 330 | os.mkdir(out_base) |
332 | 331 | ||
332 | ran = 0 | ||
333 | done = 0 | 333 | done = 0 |
334 | succ = 0 | 334 | succ = 0 |
335 | failed = 0 | 335 | failed = 0 |
@@ -362,15 +362,23 @@ def main(): | |||
362 | invalid += 1 | 362 | invalid += 1 |
363 | print("Invalid environment for experiment '%s'" % exp) | 363 | print("Invalid environment for experiment '%s'" % exp) |
364 | print(e) | 364 | print(e) |
365 | except: | 365 | except KeyboardInterrupt: |
366 | print("Keyboard interrupt, quitting") | ||
367 | break | ||
368 | except SystemCorrupted as e: | ||
369 | print("System is corrupted! Fix state before continuing.") | ||
370 | print(e) | ||
371 | break | ||
372 | except ExperimentFailed: | ||
366 | print("Failed experiment %s" % exp) | 373 | print("Failed experiment %s" % exp) |
367 | traceback.print_exc() | ||
368 | failed += 1 | 374 | failed += 1 |
369 | 375 | ||
376 | ran += 1 | ||
377 | |||
370 | if not os.listdir(out_base) and created and not succ: | 378 | if not os.listdir(out_base) and created and not succ: |
371 | os.rmdir(out_base) | 379 | os.rmdir(out_base) |
372 | 380 | ||
373 | message = "Experiments run:\t%d" % len(args) +\ | 381 | message = "Experiments ran:\t%d of %d" % (ran, len(args)) +\ |
374 | "\n Successful:\t\t%d" % succ +\ | 382 | "\n Successful:\t\t%d" % succ +\ |
375 | "\n Failed:\t\t%d" % failed +\ | 383 | "\n Failed:\t\t%d" % failed +\ |
376 | "\n Already Done:\t\t%d" % done +\ | 384 | "\n Already Done:\t\t%d" % done +\ |