diff options
| author | Jonathan Herman <hermanjl@cs.unc.edu> | 2013-04-12 11:30:27 -0400 |
|---|---|---|
| committer | Jonathan Herman <hermanjl@cs.unc.edu> | 2013-04-12 11:30:27 -0400 |
| commit | 09bc409657606a37346d82ab1e4c44a165bd3541 (patch) | |
| tree | 72c569f69f37acafdc89fde4724bde7b373ef8f9 | |
| parent | 384e322f974534c1c734db144633e3c3e002b1f8 (diff) | |
Improved error handling in run_exps.py.
| -rw-r--r-- | common.py | 6 | ||||
| -rw-r--r-- | parse/sched.py | 1 | ||||
| -rw-r--r-- | run/executable/executable.py | 6 | ||||
| -rw-r--r-- | run/experiment.py | 128 | ||||
| -rw-r--r-- | run/litmus_util.py | 23 | ||||
| -rwxr-xr-x | run_exps.py | 18 |
6 files changed, 114 insertions, 68 deletions
| @@ -26,15 +26,17 @@ def get_executable_hint(prog, hint, optional=False): | |||
| 26 | '''Search for @prog in system PATH. Print @hint if no binary is found. | 26 | '''Search for @prog in system PATH. Print @hint if no binary is found. |
| 27 | Die if not @optional.''' | 27 | Die if not @optional.''' |
| 28 | try: | 28 | try: |
| 29 | prog = get_executable(prog) | 29 | full_path = get_executable(prog) |
| 30 | except IOError: | 30 | except IOError: |
| 31 | if not optional: | 31 | if not optional: |
| 32 | sys.stderr.write(("Cannot find executable '%s' in PATH. This is " +\ | 32 | sys.stderr.write(("Cannot find executable '%s' in PATH. This is " +\ |
| 33 | "a part of '%s' which should be added to PATH.\n")\ | 33 | "a part of '%s' which should be added to PATH.\n")\ |
| 34 | % (prog, hint)) | 34 | % (prog, hint)) |
| 35 | sys.exit(1) | 35 | sys.exit(1) |
| 36 | else: | ||
| 37 | full_path = None | ||
| 36 | 38 | ||
| 37 | return prog | 39 | return full_path |
| 38 | 40 | ||
| 39 | def get_config_option(option): | 41 | def get_config_option(option): |
| 40 | '''Search for @option in installed kernel config (if present). | 42 | '''Search for @option in installed kernel config (if present). |
diff --git a/parse/sched.py b/parse/sched.py index 2da0149..147a2e5 100644 --- a/parse/sched.py +++ b/parse/sched.py | |||
| @@ -161,6 +161,7 @@ def extract_sched_data(result, data_dir, work_dir): | |||
| 161 | cmd_arr = [conf.BINS['st_show']] | 161 | cmd_arr = [conf.BINS['st_show']] |
| 162 | cmd_arr.extend(bins) | 162 | cmd_arr.extend(bins) |
| 163 | with open(output_file, "w") as f: | 163 | with open(output_file, "w") as f: |
| 164 | print("calling %s" % cmd_arr) | ||
| 164 | subprocess.call(cmd_arr, cwd=data_dir, stdout=f) | 165 | subprocess.call(cmd_arr, cwd=data_dir, stdout=f) |
| 165 | 166 | ||
| 166 | task_dict = defaultdict(lambda : | 167 | task_dict = defaultdict(lambda : |
diff --git a/run/executable/executable.py b/run/executable/executable.py index 02e35ae..263e305 100644 --- a/run/executable/executable.py +++ b/run/executable/executable.py | |||
| @@ -1,13 +1,13 @@ | |||
| 1 | import sys | 1 | import sys |
| 2 | import subprocess | 2 | import subprocess |
| 3 | import signal | 3 | import signal |
| 4 | from common import is_executable | 4 | from common import get_executable |
| 5 | 5 | ||
| 6 | class Executable(object): | 6 | class Executable(object): |
| 7 | '''Parent object that represents an executable for use in task-sets.''' | 7 | '''Parent object that represents an executable for use in task-sets.''' |
| 8 | 8 | ||
| 9 | def __init__(self, exec_file, extra_args=None, stdout_file = None, stderr_file = None): | 9 | def __init__(self, exec_file, extra_args=None, stdout_file = None, stderr_file = None): |
| 10 | self.exec_file = exec_file | 10 | self.exec_file = get_executable(exec_file) |
| 11 | self.cwd = None | 11 | self.cwd = None |
| 12 | self.stdout_file = stdout_file | 12 | self.stdout_file = stdout_file |
| 13 | self.stderr_file = stderr_file | 13 | self.stderr_file = stderr_file |
| @@ -18,7 +18,7 @@ class Executable(object): | |||
| 18 | else: | 18 | else: |
| 19 | self.extra_args = [str(a) for a in list(extra_args)] # make a duplicate | 19 | self.extra_args = [str(a) for a in list(extra_args)] # make a duplicate |
| 20 | 20 | ||
| 21 | if not is_executable(self.exec_file): | 21 | if not self.exec_file: |
| 22 | raise Exception("Not executable ? : %s" % self.exec_file) | 22 | raise Exception("Not executable ? : %s" % self.exec_file) |
| 23 | 23 | ||
| 24 | def __del__(self): | 24 | def __del__(self): |
diff --git a/run/experiment.py b/run/experiment.py index e5811f8..ff0e9f3 100644 --- a/run/experiment.py +++ b/run/experiment.py | |||
| @@ -1,7 +1,9 @@ | |||
| 1 | import common as com | ||
| 1 | import os | 2 | import os |
| 2 | import time | 3 | import time |
| 3 | import run.litmus_util as lu | 4 | import run.litmus_util as lu |
| 4 | import shutil as sh | 5 | import shutil as sh |
| 6 | import traceback | ||
| 5 | from operator import methodcaller | 7 | from operator import methodcaller |
| 6 | 8 | ||
| 7 | class ExperimentException(Exception): | 9 | class ExperimentException(Exception): |
| @@ -15,17 +17,12 @@ class ExperimentDone(ExperimentException): | |||
| 15 | def __str__(self): | 17 | def __str__(self): |
| 16 | return "Experiment finished already: %d" % self.name | 18 | return "Experiment finished already: %d" % self.name |
| 17 | 19 | ||
| 18 | |||
| 19 | class ExperimentInterrupted(ExperimentException): | ||
| 20 | '''Raised when an experiment appears to be interrupted (partial results).''' | ||
| 21 | def __str__(self): | ||
| 22 | return "Experiment was interrupted in progress: %d" % self.name | ||
| 23 | |||
| 24 | |||
| 25 | class ExperimentFailed(ExperimentException): | 20 | class ExperimentFailed(ExperimentException): |
| 26 | def __str__(self): | 21 | def __str__(self): |
| 27 | return "Experiment failed during execution: %d" % self.name | 22 | return "Experiment failed during execution: %d" % self.name |
| 28 | 23 | ||
| 24 | class SystemCorrupted(Exception): | ||
| 25 | pass | ||
| 29 | 26 | ||
| 30 | class Experiment(object): | 27 | class Experiment(object): |
| 31 | '''Execute one task-set and save the results. Experiments have unique IDs.''' | 28 | '''Execute one task-set and save the results. Experiments have unique IDs.''' |
| @@ -44,6 +41,8 @@ class Experiment(object): | |||
| 44 | self.exec_out = None | 41 | self.exec_out = None |
| 45 | self.exec_err = None | 42 | self.exec_err = None |
| 46 | 43 | ||
| 44 | self.task_batch = com.num_cpus() | ||
| 45 | |||
| 47 | self.__make_dirs() | 46 | self.__make_dirs() |
| 48 | self.__assign_executable_cwds() | 47 | self.__assign_executable_cwds() |
| 49 | self.__setup_tracers(tracer_types) | 48 | self.__setup_tracers(tracer_types) |
| @@ -84,65 +83,67 @@ class Experiment(object): | |||
| 84 | executable.cwd = self.working_dir | 83 | executable.cwd = self.working_dir |
| 85 | map(assign_cwd, self.executables) | 84 | map(assign_cwd, self.executables) |
| 86 | 85 | ||
| 87 | def __run_tasks(self): | 86 | def __kill_all(self): |
| 88 | already_waiting = lu.waiting_tasks() | 87 | # Give time for whatever failed to finish failing |
| 88 | time.sleep(2) | ||
| 89 | |||
| 90 | released = lu.release_tasks() | ||
| 91 | self.log("Re-released %d tasks" % released) | ||
| 89 | 92 | ||
| 90 | if already_waiting: | 93 | time.sleep(5) |
| 91 | self.log("Already %d tasks waiting for release!") | ||
| 92 | self.log("Experiment will fail if any of these tasks are released.") | ||
| 93 | 94 | ||
| 94 | self.log("Starting the programs") | 95 | self.log("Killing all tasks") |
| 95 | for e in self.executables: | 96 | for e in self.executables: |
| 96 | try: | 97 | try: |
| 98 | e.kill() | ||
| 99 | except: | ||
| 100 | pass | ||
| 101 | |||
| 102 | time.sleep(2) | ||
| 103 | |||
| 104 | def __run_tasks(self): | ||
| 105 | self.log("Starting %d tasks" % len(self.executables)) | ||
| 106 | |||
| 107 | for i,e in enumerate(self.executables): | ||
| 108 | try: | ||
| 97 | e.execute() | 109 | e.execute() |
| 98 | except: | 110 | except: |
| 99 | raise Exception("Executable failed: %s" % e) | 111 | raise Exception("Executable failed: %s" % e) |
| 100 | 112 | ||
| 101 | self.log("Sleeping until tasks are ready for release...") | 113 | self.log("Sleeping until tasks are ready for release...") |
| 102 | start = time.clock() | 114 | start = time.time() |
| 103 | while (lu.waiting_tasks() - already_waiting) < len(self.executables): | 115 | while lu.waiting_tasks() < len(self.executables): |
| 104 | if time.clock() - start > 30.0: | 116 | if time.time() - start > 30.0: |
| 105 | raise Exception("Too much time has passed waiting for tasks!") | 117 | s = "waiting: %d, submitted: %d" %\ |
| 118 | (lu.waiting_tasks(), len(self.executables)) | ||
| 119 | raise Exception("Too much time spent waiting for tasks! %s" % s) | ||
| 106 | time.sleep(1) | 120 | time.sleep(1) |
| 107 | 121 | ||
| 108 | # Exact tracers (like overheads) must be started right after release or | 122 | # Exact tracers (like overheads) must be started right after release or |
| 109 | # measurements will be full of irrelevant records | 123 | # measurements will be full of irrelevant records |
| 110 | self.log("Starting %d released tracers" % len(self.exact_tracers)) | 124 | self.log("Starting %d released tracers" % len(self.exact_tracers)) |
| 111 | map(methodcaller('start_tracing'), self.exact_tracers) | 125 | map(methodcaller('start_tracing'), self.exact_tracers) |
| 126 | time.sleep(1) | ||
| 112 | 127 | ||
| 113 | self.log("Releasing %d tasks" % len(self.executables)) | 128 | try: |
| 114 | released = lu.release_tasks() | 129 | self.log("Releasing %d tasks" % len(self.executables)) |
| 115 | |||
| 116 | ret = True | ||
| 117 | if released != len(self.executables): | ||
| 118 | # Some tasks failed to release, kill all tasks and fail | ||
| 119 | # Need to re-release non-released tasks before we can kill them though | ||
| 120 | self.log("Failed to release {} tasks! Re-releasing and killing".format( | ||
| 121 | len(self.executables) - released, len(self.executables))) | ||
| 122 | time.sleep(5) | ||
| 123 | |||
| 124 | released = lu.release_tasks() | 130 | released = lu.release_tasks() |
| 125 | 131 | ||
| 126 | self.log("Re-released %d tasks" % released) | 132 | if released != len(self.executables): |
| 127 | 133 | # Some tasks failed to release, kill all tasks and fail | |
| 128 | time.sleep(5) | 134 | # Need to release non-released tasks before they can be killed |
| 129 | 135 | raise Exception("Released %s tasks, expected %s tasks" % | |
| 130 | self.log("Killing all tasks") | 136 | (released, len(self.executables))) |
| 131 | map(methodcaller('kill'), self.executables) | ||
| 132 | 137 | ||
| 133 | ret = False | 138 | self.log("Waiting for program to finish...") |
| 139 | for e in self.executables: | ||
| 140 | if not e.wait(): | ||
| 141 | raise Exception("Executable %s failed to complete!" % e) | ||
| 134 | 142 | ||
| 135 | self.log("Waiting for program to finish...") | 143 | finally: |
| 136 | for e in self.executables: | 144 | # And these must be stopped here for the same reason |
| 137 | if not e.wait(): | 145 | self.log("Stopping exact tracers") |
| 138 | ret = False | 146 | map(methodcaller('stop_tracing'), self.exact_tracers) |
| 139 | |||
| 140 | # And these must be stopped here for the same reason | ||
| 141 | self.log("Stopping exact tracers") | ||
| 142 | map(methodcaller('stop_tracing'), self.exact_tracers) | ||
| 143 | |||
| 144 | if not ret: | ||
| 145 | raise ExperimentFailed(self.name) | ||
| 146 | 147 | ||
| 147 | def __save_results(self): | 148 | def __save_results(self): |
| 148 | os.rename(self.working_dir, self.finished_dir) | 149 | os.rename(self.working_dir, self.finished_dir) |
| @@ -151,29 +152,48 @@ class Experiment(object): | |||
| 151 | print("[Exp %s]: %s" % (self.name, msg)) | 152 | print("[Exp %s]: %s" % (self.name, msg)) |
| 152 | 153 | ||
| 153 | def run_exp(self): | 154 | def run_exp(self): |
| 155 | self.__check_system() | ||
| 156 | |||
| 154 | succ = False | 157 | succ = False |
| 155 | try: | 158 | try: |
| 156 | self.setup() | 159 | self.__setup() |
| 157 | 160 | ||
| 158 | try: | 161 | try: |
| 159 | self.__run_tasks() | 162 | self.__run_tasks() |
| 160 | self.log("Saving results in %s" % self.finished_dir) | 163 | self.log("Saving results in %s" % self.finished_dir) |
| 161 | succ = True | 164 | succ = True |
| 165 | except: | ||
| 166 | traceback.print_exc() | ||
| 167 | self.__kill_all() | ||
| 168 | raise ExperimentFailed(self.name) | ||
| 162 | finally: | 169 | finally: |
| 163 | self.teardown() | 170 | self.__teardown() |
| 164 | finally: | 171 | finally: |
| 165 | self.log("Switching to Linux scheduler") | 172 | self.log("Switching to Linux scheduler") |
| 166 | try: | 173 | |
| 167 | lu.switch_scheduler("Linux") | 174 | # Give the tasks 10 seconds to finish before bailing |
| 168 | except: | 175 | start = time.time() |
| 169 | self.log("Failed to switch back to Linux.") | 176 | while lu.all_tasks() > 0: |
| 177 | if time.time() - start < 10.0: | ||
| 178 | raise SystemCorrupted("%d tasks still running!" % | ||
| 179 | lu.all_tasks()) | ||
| 180 | |||
| 181 | lu.switch_scheduler("Linux") | ||
| 170 | 182 | ||
| 171 | if succ: | 183 | if succ: |
| 172 | self.__save_results() | 184 | self.__save_results() |
| 173 | self.log("Experiment done!") | 185 | self.log("Experiment done!") |
| 174 | 186 | ||
| 187 | def __check_system(self): | ||
| 188 | running = lu.all_tasks() | ||
| 189 | if running: | ||
| 190 | raise SystemCorrupted("%d tasks already running!" % running) | ||
| 191 | |||
| 192 | sched = lu.scheduler() | ||
| 193 | if sched != "Linux": | ||
| 194 | raise SystemCorrupted("Scheduler is %s, should be Linux" % sched) | ||
| 175 | 195 | ||
| 176 | def setup(self): | 196 | def __setup(self): |
| 177 | self.log("Writing %d proc entries" % len(self.proc_entries)) | 197 | self.log("Writing %d proc entries" % len(self.proc_entries)) |
| 178 | map(methodcaller('write_proc'), self.proc_entries) | 198 | map(methodcaller('write_proc'), self.proc_entries) |
| 179 | 199 | ||
| @@ -190,7 +210,7 @@ class Experiment(object): | |||
| 190 | executable.stderr_file = self.exec_err | 210 | executable.stderr_file = self.exec_err |
| 191 | map(set_out, self.executables) | 211 | map(set_out, self.executables) |
| 192 | 212 | ||
| 193 | def teardown(self): | 213 | def __teardown(self): |
| 194 | self.exec_out and self.exec_out.close() | 214 | self.exec_out and self.exec_out.close() |
| 195 | self.exec_err and self.exec_err.close() | 215 | self.exec_err and self.exec_err.close() |
| 196 | 216 | ||
diff --git a/run/litmus_util.py b/run/litmus_util.py index 4709b66..70da262 100644 --- a/run/litmus_util.py +++ b/run/litmus_util.py | |||
| @@ -3,6 +3,11 @@ import time | |||
| 3 | import subprocess | 3 | import subprocess |
| 4 | import config.config as conf | 4 | import config.config as conf |
| 5 | 5 | ||
| 6 | def scheduler(): | ||
| 7 | with open('/proc/litmus/active_plugin', 'r') as active_plugin: | ||
| 8 | cur_plugin = active_plugin.read().strip() | ||
| 9 | return cur_plugin | ||
| 10 | |||
| 6 | def switch_scheduler(switch_to_in): | 11 | def switch_scheduler(switch_to_in): |
| 7 | '''Switch the scheduler to whatever is passed in. | 12 | '''Switch the scheduler to whatever is passed in. |
| 8 | 13 | ||
| @@ -18,11 +23,10 @@ def switch_scheduler(switch_to_in): | |||
| 18 | # it takes a bit to do the switch, sleep an arbitrary amount of time | 23 | # it takes a bit to do the switch, sleep an arbitrary amount of time |
| 19 | time.sleep(2) | 24 | time.sleep(2) |
| 20 | 25 | ||
| 21 | with open('/proc/litmus/active_plugin', 'r') as active_plugin: | 26 | cur_plugin = scheduler() |
| 22 | cur_plugin = active_plugin.read().strip() | ||
| 23 | |||
| 24 | if switch_to != cur_plugin: | 27 | if switch_to != cur_plugin: |
| 25 | raise Exception("Could not switch to '%s' (check dmesg)" % switch_to) | 28 | raise Exception("Could not switch to '%s' (check dmesg), current: %s" %\ |
| 29 | (switch_to, cur_plugin)) | ||
| 26 | 30 | ||
| 27 | def waiting_tasks(): | 31 | def waiting_tasks(): |
| 28 | reg = re.compile(r'^ready.*?(?P<READY>\d+)$', re.M) | 32 | reg = re.compile(r'^ready.*?(?P<READY>\d+)$', re.M) |
| @@ -35,6 +39,17 @@ def waiting_tasks(): | |||
| 35 | 39 | ||
| 36 | return 0 if not ready else int(ready) | 40 | return 0 if not ready else int(ready) |
| 37 | 41 | ||
| 42 | def all_tasks(): | ||
| 43 | reg = re.compile(r'^real-time.*?(?P<TASKS>\d+)$', re.M) | ||
| 44 | with open('/proc/litmus/stats', 'r') as f: | ||
| 45 | data = f.read() | ||
| 46 | |||
| 47 | # Ignore if no tasks are waiting for release | ||
| 48 | match = re.search(reg, data) | ||
| 49 | ready = match.group("TASKS") | ||
| 50 | |||
| 51 | return 0 if not ready else int(ready) | ||
| 52 | |||
| 38 | def release_tasks(): | 53 | def release_tasks(): |
| 39 | try: | 54 | try: |
| 40 | data = subprocess.check_output([conf.BINS['release']]) | 55 | data = subprocess.check_output([conf.BINS['release']]) |
diff --git a/run_exps.py b/run_exps.py index b9cf88e..6531415 100755 --- a/run_exps.py +++ b/run_exps.py | |||
| @@ -8,12 +8,11 @@ import re | |||
| 8 | import shutil | 8 | import shutil |
| 9 | import sys | 9 | import sys |
| 10 | import run.tracer as trace | 10 | import run.tracer as trace |
| 11 | import traceback | ||
| 12 | 11 | ||
| 13 | from collections import namedtuple | 12 | from collections import namedtuple |
| 14 | from optparse import OptionParser | 13 | from optparse import OptionParser |
| 15 | from run.executable.executable import Executable | 14 | from run.executable.executable import Executable |
| 16 | from run.experiment import Experiment,ExperimentDone | 15 | from run.experiment import Experiment,ExperimentDone,ExperimentFailed,SystemCorrupted |
| 17 | from run.proc_entry import ProcEntry | 16 | from run.proc_entry import ProcEntry |
| 18 | 17 | ||
| 19 | '''Customizable experiment parameters''' | 18 | '''Customizable experiment parameters''' |
| @@ -330,6 +329,7 @@ def main(): | |||
| 330 | created = True | 329 | created = True |
| 331 | os.mkdir(out_base) | 330 | os.mkdir(out_base) |
| 332 | 331 | ||
| 332 | ran = 0 | ||
| 333 | done = 0 | 333 | done = 0 |
| 334 | succ = 0 | 334 | succ = 0 |
| 335 | failed = 0 | 335 | failed = 0 |
| @@ -362,15 +362,23 @@ def main(): | |||
| 362 | invalid += 1 | 362 | invalid += 1 |
| 363 | print("Invalid environment for experiment '%s'" % exp) | 363 | print("Invalid environment for experiment '%s'" % exp) |
| 364 | print(e) | 364 | print(e) |
| 365 | except: | 365 | except KeyboardInterrupt: |
| 366 | print("Keyboard interrupt, quitting") | ||
| 367 | break | ||
| 368 | except SystemCorrupted as e: | ||
| 369 | print("System is corrupted! Fix state before continuing.") | ||
| 370 | print(e) | ||
| 371 | break | ||
| 372 | except ExperimentFailed: | ||
| 366 | print("Failed experiment %s" % exp) | 373 | print("Failed experiment %s" % exp) |
| 367 | traceback.print_exc() | ||
| 368 | failed += 1 | 374 | failed += 1 |
| 369 | 375 | ||
| 376 | ran += 1 | ||
| 377 | |||
| 370 | if not os.listdir(out_base) and created and not succ: | 378 | if not os.listdir(out_base) and created and not succ: |
| 371 | os.rmdir(out_base) | 379 | os.rmdir(out_base) |
| 372 | 380 | ||
| 373 | message = "Experiments run:\t%d" % len(args) +\ | 381 | message = "Experiments ran:\t%d of %d" % (ran, len(args)) +\ |
| 374 | "\n Successful:\t\t%d" % succ +\ | 382 | "\n Successful:\t\t%d" % succ +\ |
| 375 | "\n Failed:\t\t%d" % failed +\ | 383 | "\n Failed:\t\t%d" % failed +\ |
| 376 | "\n Already Done:\t\t%d" % done +\ | 384 | "\n Already Done:\t\t%d" % done +\ |
