diff options
| author | Jonathan Herman <hermanjl@cs.unc.edu> | 2013-04-29 16:50:23 -0400 |
|---|---|---|
| committer | Jonathan Herman <hermanjl@cs.unc.edu> | 2013-04-29 16:50:23 -0400 |
| commit | 7e32c3915e7ea27d2533d99a22fa53ef923198f5 (patch) | |
| tree | 5bd8d8a3ed6861e039a683f47a953d2f7a22d8b1 | |
| parent | 7545402506aa76261e18d85af585ff0ac1cf05c1 (diff) | |
Added a --retry option to run_exps.py for re-running failed experiments.
If the retry flag is specified, failed experiments will be re-run after all
other experiments have run. They can be re-run at most 5 times.
This commit required a refactoring of run_exps.py to clean up the main
experiment running loop.
| -rw-r--r-- | gen/edf_generators.py | 1 | ||||
| -rwxr-xr-x | gen_exps.py | 15 | ||||
| -rw-r--r-- | run/experiment.py | 50 | ||||
| -rwxr-xr-x | run_exps.py | 246 |
4 files changed, 194 insertions, 118 deletions
diff --git a/gen/edf_generators.py b/gen/edf_generators.py index a722c21..8e4b8df 100644 --- a/gen/edf_generators.py +++ b/gen/edf_generators.py | |||
| @@ -28,6 +28,7 @@ class EdfGenerator(gen.Generator): | |||
| 28 | pdist = self._create_dist('period', | 28 | pdist = self._create_dist('period', |
| 29 | exp_params['periods'], | 29 | exp_params['periods'], |
| 30 | gen.NAMED_PERIODS) | 30 | gen.NAMED_PERIODS) |
| 31 | |||
| 31 | udist = self._create_dist('utilization', | 32 | udist = self._create_dist('utilization', |
| 32 | exp_params['utils'], | 33 | exp_params['utils'], |
| 33 | gen.NAMED_UTILIZATIONS) | 34 | gen.NAMED_UTILIZATIONS) |
diff --git a/gen_exps.py b/gen_exps.py index b847661..00ce27b 100755 --- a/gen_exps.py +++ b/gen_exps.py | |||
| @@ -43,6 +43,14 @@ def load_file(fname): | |||
| 43 | except: | 43 | except: |
| 44 | raise IOError("Invalid generation file: %s" % fname) | 44 | raise IOError("Invalid generation file: %s" % fname) |
| 45 | 45 | ||
| 46 | def print_descriptions(described): | ||
| 47 | for generator in described.split(','): | ||
| 48 | if generator not in gen.get_generators(): | ||
| 49 | sys.stderr.write("No generator '%s'\n" % generator) | ||
| 50 | else: | ||
| 51 | print("Generator '%s', " % generator) | ||
| 52 | gen.get_generators()[generator]().print_help() | ||
| 53 | |||
| 46 | def main(): | 54 | def main(): |
| 47 | opts, args = parse_args() | 55 | opts, args = parse_args() |
| 48 | 56 | ||
| @@ -50,12 +58,7 @@ def main(): | |||
| 50 | if opts.list_gens: | 58 | if opts.list_gens: |
| 51 | print(", ".join(gen.get_generators())) | 59 | print(", ".join(gen.get_generators())) |
| 52 | if opts.described != None: | 60 | if opts.described != None: |
| 53 | for generator in opts.described.split(','): | 61 | print_descriptions(opts.described) |
| 54 | if generator not in gen.get_generators(): | ||
| 55 | sys.stderr.write("No generator '%s'\n" % generator) | ||
| 56 | else: | ||
| 57 | print("Generator '%s', " % generator) | ||
| 58 | gen.get_generators()[generator]().print_help() | ||
| 59 | if opts.list_gens or opts.described: | 62 | if opts.list_gens or opts.described: |
| 60 | return 0 | 63 | return 0 |
| 61 | 64 | ||
diff --git a/run/experiment.py b/run/experiment.py index b0e46b6..9a70414 100644 --- a/run/experiment.py +++ b/run/experiment.py | |||
| @@ -2,6 +2,7 @@ import os | |||
| 2 | import time | 2 | import time |
| 3 | import run.litmus_util as lu | 3 | import run.litmus_util as lu |
| 4 | import shutil as sh | 4 | import shutil as sh |
| 5 | |||
| 5 | from operator import methodcaller | 6 | from operator import methodcaller |
| 6 | 7 | ||
| 7 | class ExperimentException(Exception): | 8 | class ExperimentException(Exception): |
| @@ -69,21 +70,24 @@ class Experiment(object): | |||
| 69 | executable.cwd = self.working_dir | 70 | executable.cwd = self.working_dir |
| 70 | map(assign_cwd, self.executables) | 71 | map(assign_cwd, self.executables) |
| 71 | 72 | ||
| 72 | def __kill_all(self): | 73 | def __try_kill_all(self): |
| 73 | if lu.waiting_tasks(): | 74 | try: |
| 74 | released = lu.release_tasks() | 75 | if lu.waiting_tasks(): |
| 75 | self.log("Re-released %d tasks" % released) | 76 | released = lu.release_tasks() |
| 77 | self.log("Re-released %d tasks" % released) | ||
| 76 | 78 | ||
| 77 | time.sleep(1) | 79 | time.sleep(1) |
| 78 | 80 | ||
| 79 | self.log("Killing all tasks") | 81 | self.log("Killing all tasks") |
| 80 | for e in self.executables: | 82 | for e in self.executables: |
| 81 | try: | 83 | try: |
| 82 | e.kill() | 84 | e.kill() |
| 83 | except: | 85 | except: |
| 84 | pass | 86 | pass |
| 85 | 87 | ||
| 86 | time.sleep(1) | 88 | time.sleep(1) |
| 89 | except: | ||
| 90 | self.log("Failed to kill all tasks.") | ||
| 87 | 91 | ||
| 88 | def __strip_path(self, path): | 92 | def __strip_path(self, path): |
| 89 | '''Shorten path to something more readable.''' | 93 | '''Shorten path to something more readable.''' |
| @@ -138,7 +142,7 @@ class Experiment(object): | |||
| 138 | now_ready = lu.waiting_tasks() | 142 | now_ready = lu.waiting_tasks() |
| 139 | if now_ready != num_ready: | 143 | if now_ready != num_ready: |
| 140 | wait_start = time.time() | 144 | wait_start = time.time() |
| 141 | num_ready = lu.now_ready | 145 | num_ready = now_ready |
| 142 | 146 | ||
| 143 | def __run_tasks(self): | 147 | def __run_tasks(self): |
| 144 | self.log("Starting %d tasks" % len(self.executables)) | 148 | self.log("Starting %d tasks" % len(self.executables)) |
| @@ -185,6 +189,7 @@ class Experiment(object): | |||
| 185 | 189 | ||
| 186 | sched = lu.scheduler() | 190 | sched = lu.scheduler() |
| 187 | if sched != "Linux": | 191 | if sched != "Linux": |
| 192 | self.log("Switching back to Linux scheduler") | ||
| 188 | try: | 193 | try: |
| 189 | lu.switch_scheduler("Linux") | 194 | lu.switch_scheduler("Linux") |
| 190 | except: | 195 | except: |
| @@ -229,6 +234,8 @@ class Experiment(object): | |||
| 229 | self.log("Stopping regular tracers") | 234 | self.log("Stopping regular tracers") |
| 230 | map(methodcaller('stop_tracing'), self.regular_tracers) | 235 | map(methodcaller('stop_tracing'), self.regular_tracers) |
| 231 | 236 | ||
| 237 | os.system('sync') | ||
| 238 | |||
| 232 | def log(self, msg): | 239 | def log(self, msg): |
| 233 | print("[Exp %s]: %s" % (self.name, msg)) | 240 | print("[Exp %s]: %s" % (self.name, msg)) |
| 234 | 241 | ||
| @@ -236,6 +243,7 @@ class Experiment(object): | |||
| 236 | self.__to_linux() | 243 | self.__to_linux() |
| 237 | 244 | ||
| 238 | succ = False | 245 | succ = False |
| 246 | exception = None | ||
| 239 | try: | 247 | try: |
| 240 | self.__setup() | 248 | self.__setup() |
| 241 | 249 | ||
| @@ -244,16 +252,20 @@ class Experiment(object): | |||
| 244 | self.log("Saving results in %s" % self.finished_dir) | 252 | self.log("Saving results in %s" % self.finished_dir) |
| 245 | succ = True | 253 | succ = True |
| 246 | except Exception as e: | 254 | except Exception as e: |
| 255 | exception = e | ||
| 256 | |||
| 247 | # Give time for whatever failed to finish failing | 257 | # Give time for whatever failed to finish failing |
| 248 | time.sleep(2) | 258 | time.sleep(2) |
| 249 | self.__kill_all() | ||
| 250 | 259 | ||
| 251 | raise e | 260 | self.__try_kill_all() |
| 252 | finally: | ||
| 253 | self.__teardown() | ||
| 254 | finally: | 261 | finally: |
| 255 | self.log("Switching back to Linux scheduler") | 262 | try: |
| 256 | self.__to_linux() | 263 | self.__teardown() |
| 264 | self.__to_linux() | ||
| 265 | except Exception as e: | ||
| 266 | exception = exception or e | ||
| 267 | finally: | ||
| 268 | if exception: raise exception | ||
| 257 | 269 | ||
| 258 | if succ: | 270 | if succ: |
| 259 | self.__save_results() | 271 | self.__save_results() |
diff --git a/run_exps.py b/run_exps.py index a15018d..1d46b45 100755 --- a/run_exps.py +++ b/run_exps.py | |||
| @@ -3,6 +3,7 @@ from __future__ import print_function | |||
| 3 | 3 | ||
| 4 | import common as com | 4 | import common as com |
| 5 | import os | 5 | import os |
| 6 | import pprint | ||
| 6 | import re | 7 | import re |
| 7 | import shutil | 8 | import shutil |
| 8 | import sys | 9 | import sys |
| @@ -11,17 +12,26 @@ import run.tracer as trace | |||
| 11 | from config.config import PARAMS,DEFAULTS | 12 | from config.config import PARAMS,DEFAULTS |
| 12 | from collections import namedtuple | 13 | from collections import namedtuple |
| 13 | from optparse import OptionParser | 14 | from optparse import OptionParser |
| 15 | from parse.enum import Enum | ||
| 14 | from run.executable.executable import Executable | 16 | from run.executable.executable import Executable |
| 15 | from run.experiment import Experiment,ExperimentDone,SystemCorrupted | 17 | from run.experiment import Experiment,ExperimentDone,SystemCorrupted |
| 16 | from run.proc_entry import ProcEntry | 18 | from run.proc_entry import ProcEntry |
| 17 | 19 | ||
| 20 | '''Maximum times an experiment will be retried''' | ||
| 21 | MAX_RETRY = 5 | ||
| 22 | |||
| 18 | '''Customizable experiment parameters''' | 23 | '''Customizable experiment parameters''' |
| 19 | ExpParams = namedtuple('ExpParams', ['scheduler', 'duration', 'tracers', | 24 | ExpParams = namedtuple('ExpParams', ['scheduler', 'duration', 'tracers', |
| 20 | 'kernel', 'config_options', 'file_params', | 25 | 'kernel', 'config_options', 'file_params', |
| 21 | 'pre_script', 'post_script']) | 26 | 'pre_script', 'post_script']) |
| 27 | '''Tracked with each experiment''' | ||
| 28 | ExpState = Enum(['Failed', 'Succeeded', 'Invalid', 'Done', 'None']) | ||
| 29 | ExpData = com.recordtype('ExpData', ['name', 'params', 'sched_file', 'out_dir', | ||
| 30 | 'retries', 'state']) | ||
| 22 | '''Comparison of requested versus actual kernel compile parameter value''' | 31 | '''Comparison of requested versus actual kernel compile parameter value''' |
| 23 | ConfigResult = namedtuple('ConfigResult', ['param', 'wanted', 'actual']) | 32 | ConfigResult = namedtuple('ConfigResult', ['param', 'wanted', 'actual']) |
| 24 | 33 | ||
| 34 | |||
| 25 | class InvalidKernel(Exception): | 35 | class InvalidKernel(Exception): |
| 26 | def __init__(self, kernel): | 36 | def __init__(self, kernel): |
| 27 | self.kernel = kernel | 37 | self.kernel = kernel |
| @@ -72,6 +82,9 @@ def parse_args(): | |||
| 72 | parser.add_option('-e', '--email', metavar='username@server', | 82 | parser.add_option('-e', '--email', metavar='username@server', |
| 73 | dest='email', default=None, | 83 | dest='email', default=None, |
| 74 | help='send an email when all experiments complete') | 84 | help='send an email when all experiments complete') |
| 85 | parser.add_option('-r', '--retry', dest='retry', | ||
| 86 | action='store_true', default=False, | ||
| 87 | help='retry failed experiments') | ||
| 75 | 88 | ||
| 76 | return parser.parse_args() | 89 | return parser.parse_args() |
| 77 | 90 | ||
| @@ -252,65 +265,118 @@ def make_exp_params(cmd_scheduler, cmd_duration, sched_dir, param_file): | |||
| 252 | config_options=copts, tracers=tracers, file_params=fparams, | 265 | config_options=copts, tracers=tracers, file_params=fparams, |
| 253 | pre_script=pre_script, post_script=post_script) | 266 | pre_script=pre_script, post_script=post_script) |
| 254 | 267 | ||
| 255 | def run_experiment(name, sched_file, exp_params, out_dir, | 268 | def run_experiment(data, start_message, ignore, jabber): |
| 256 | start_message, ignore, jabber): | ||
| 257 | '''Load and parse data from files and run result.''' | 269 | '''Load and parse data from files and run result.''' |
| 258 | if not os.path.isfile(sched_file): | 270 | if not os.path.isfile(data.sched_file): |
| 259 | raise IOError("Cannot find schedule file: %s" % sched_file) | 271 | raise IOError("Cannot find schedule file: %s" % data.sched_file) |
| 260 | 272 | ||
| 261 | dir_name, fname = os.path.split(sched_file) | 273 | dir_name, fname = os.path.split(data.sched_file) |
| 262 | work_dir = "%s/tmp" % dir_name | 274 | work_dir = "%s/tmp" % dir_name |
| 263 | 275 | ||
| 264 | procs, execs = load_schedule(name, sched_file, exp_params.duration) | 276 | procs, execs = load_schedule(data.name, data.sched_file, data.params.duration) |
| 265 | 277 | ||
| 266 | exp = Experiment(name, exp_params.scheduler, work_dir, out_dir, | 278 | exp = Experiment(data.name, data.params.scheduler, work_dir, |
| 267 | procs, execs, exp_params.tracers) | 279 | data.out_dir, procs, execs, data.params.tracers) |
| 268 | 280 | ||
| 269 | exp.log(start_message) | 281 | exp.log(start_message) |
| 270 | 282 | ||
| 271 | if not ignore: | 283 | if not ignore: |
| 272 | verify_environment(exp_params) | 284 | verify_environment(data.params) |
| 273 | 285 | ||
| 274 | run_script(exp_params.pre_script, exp, dir_name, work_dir) | 286 | run_script(data.params.pre_script, exp, dir_name, work_dir) |
| 275 | 287 | ||
| 276 | exp.run_exp() | 288 | exp.run_exp() |
| 277 | 289 | ||
| 278 | run_script(exp_params.post_script, exp, dir_name, out_dir) | 290 | run_script(data.params.post_script, exp, dir_name, data.out_dir) |
| 279 | 291 | ||
| 280 | if jabber: | 292 | if jabber: |
| 281 | jabber.send("Completed '%s'" % name) | 293 | jabber.send("Completed '%s'" % data.name) |
| 282 | 294 | ||
| 283 | # Save parameters used to run experiment in out_dir | 295 | # Save parameters used to run dataeriment in out_dir |
| 284 | out_params = dict(exp_params.file_params.items() + | 296 | out_params = dict([(PARAMS['sched'], data.params.scheduler), |
| 285 | [(PARAMS['sched'], exp_params.scheduler), | ||
| 286 | (PARAMS['tasks'], len(execs)), | 297 | (PARAMS['tasks'], len(execs)), |
| 287 | (PARAMS['dur'], exp_params.duration)]) | 298 | (PARAMS['dur'], data.params.duration)] + |
| 299 | data.params.file_params.items()) | ||
| 288 | 300 | ||
| 289 | # Feather-trace clock frequency saved for accurate overhead parsing | 301 | # Feather-trace clock frequency saved for accurate overhead parsing |
| 290 | ft_freq = com.ft_freq() | 302 | ft_freq = com.ft_freq() |
| 291 | if ft_freq: | 303 | if ft_freq: |
| 292 | out_params[PARAMS['cycles']] = ft_freq | 304 | out_params[PARAMS['cycles']] = ft_freq |
| 293 | 305 | ||
| 294 | with open("%s/%s" % (out_dir, DEFAULTS['params_file']), 'w') as f: | 306 | out_param_f = "%s/%s" % (data.out_dir, DEFAULTS['params_file']) |
| 295 | f.write(str(out_params)) | 307 | with open(out_param_f, 'w') as f: |
| 308 | pprint.pprint(out_params, f) | ||
| 296 | 309 | ||
| 297 | 310 | ||
| 298 | def get_exps(opts, args): | 311 | def make_paths(exp, opts, out_base_dir): |
| 299 | '''Return list of experiment files or directories''' | 312 | '''Translate experiment name to (schedule file, output directory) paths''' |
| 300 | if args: | 313 | path = "%s/%s" % (os.getcwd(), exp) |
| 301 | return args | 314 | out_dir = "%s/%s" % (out_base_dir, os.path.split(exp.strip('/'))[1]) |
| 315 | |||
| 316 | if not os.path.exists(path): | ||
| 317 | raise IOError("Invalid experiment: %s" % path) | ||
| 318 | |||
| 319 | if opts.force and os.path.exists(out_dir): | ||
| 320 | shutil.rmtree(out_dir) | ||
| 302 | 321 | ||
| 303 | # Default to sched_file > generated dirs | 322 | if os.path.isdir(path): |
| 304 | if os.path.exists(opts.sched_file): | 323 | sched_file = "%s/%s" % (path, opts.sched_file) |
| 305 | sys.stderr.write("Reading schedule from %s.\n" % opts.sched_file) | ||
| 306 | return [opts.sched_file] | ||
| 307 | elif os.path.exists(DEFAULTS['out-gen']): | ||
| 308 | sys.stderr.write("Reading schedules from %s/*.\n" % DEFAULTS['out-gen']) | ||
| 309 | sched_dirs = os.listdir(DEFAULTS['out-gen']) | ||
| 310 | return ['%s/%s' % (DEFAULTS['out-gen'], d) for d in sched_dirs] | ||
| 311 | else: | 324 | else: |
| 312 | sys.stderr.write("Run with -h to view options.\n"); | 325 | sched_file = path |
| 313 | sys.exit(1) | 326 | |
| 327 | return sched_file, out_dir | ||
| 328 | |||
| 329 | |||
| 330 | def get_common_header(args): | ||
| 331 | common = "" | ||
| 332 | done = False | ||
| 333 | |||
| 334 | if len(args) == 1: | ||
| 335 | return common | ||
| 336 | |||
| 337 | while not done: | ||
| 338 | common += args[0][len(common)] | ||
| 339 | for path in args: | ||
| 340 | if path.find(common, 0, len(common)): | ||
| 341 | done = True | ||
| 342 | break | ||
| 343 | |||
| 344 | return common[:len(common)-1] | ||
| 345 | |||
| 346 | |||
| 347 | def get_exps(opts, args, out_base_dir): | ||
| 348 | '''Return list of ExpDatas''' | ||
| 349 | |||
| 350 | if not args: | ||
| 351 | if os.path.exists(opts.sched_file): | ||
| 352 | # Default to sched_file in current directory | ||
| 353 | sys.stderr.write("Reading schedule from %s.\n" % opts.sched_file) | ||
| 354 | args = [opts.sched_file] | ||
| 355 | elif os.path.exists(DEFAULTS['out-gen']): | ||
| 356 | # Then try experiments created by gen_exps | ||
| 357 | sys.stderr.write("Reading schedules from %s/*.\n" % DEFAULTS['out-gen']) | ||
| 358 | sched_dirs = os.listdir(DEFAULTS['out-gen']) | ||
| 359 | args = ['%s/%s' % (DEFAULTS['out-gen'], d) for d in sched_dirs] | ||
| 360 | else: | ||
| 361 | sys.stderr.write("Run with -h to view options.\n"); | ||
| 362 | sys.exit(1) | ||
| 363 | |||
| 364 | # Part of arg paths which is identical for each arg | ||
| 365 | common = get_common_header(args) | ||
| 366 | |||
| 367 | exps = [] | ||
| 368 | for path in args: | ||
| 369 | sched_file, out_dir = make_paths(path, opts, out_base_dir) | ||
| 370 | name = path[len(common):] | ||
| 371 | |||
| 372 | sched_dir = os.path.split(sched_file)[0] | ||
| 373 | exp_params = make_exp_params(opts.scheduler, opts.duration, | ||
| 374 | sched_dir, opts.param_file) | ||
| 375 | |||
| 376 | exps += [ExpData(name, exp_params, sched_file, out_dir, | ||
| 377 | 0, ExpState.None)] | ||
| 378 | |||
| 379 | return exps | ||
| 314 | 380 | ||
| 315 | 381 | ||
| 316 | def setup_jabber(target): | 382 | def setup_jabber(target): |
| @@ -338,32 +404,53 @@ def setup_email(target): | |||
| 338 | return None | 404 | return None |
| 339 | 405 | ||
| 340 | 406 | ||
| 341 | def make_paths(exp, out_base_dir, opts): | 407 | def run_exps(exps, opts): |
| 342 | '''Translate experiment name to (schedule file, output directory) paths''' | 408 | jabber = setup_jabber(opts.jabber) if opts.jabber else None |
| 343 | path = "%s/%s" % (os.getcwd(), exp) | ||
| 344 | out_dir = "%s/%s" % (out_base_dir, os.path.split(exp.strip('/'))[1]) | ||
| 345 | |||
| 346 | if not os.path.exists(path): | ||
| 347 | raise IOError("Invalid experiment: %s" % path) | ||
| 348 | 409 | ||
| 349 | if opts.force and os.path.exists(out_dir): | 410 | exps_remaining = list(enumerate(exps)) |
| 350 | shutil.rmtree(out_dir) | 411 | while exps_remaining: |
| 412 | i, exp = exps_remaining.pop(0) | ||
| 351 | 413 | ||
| 352 | if os.path.isdir(path): | 414 | verb = "Loading" if exp.state == ExpState.None else "Re-running failed" |
| 353 | sched_file = "%s/%s" % (path, opts.sched_file) | 415 | start_message = "%s experiment %d of %d." % (verb, i+1, len(exps)) |
| 354 | else: | ||
| 355 | sched_file = path | ||
| 356 | 416 | ||
| 357 | return sched_file, out_dir | 417 | try: |
| 418 | run_experiment(exp, start_message, opts.ignore, jabber) | ||
| 419 | exp.state = ExpState.Succeeded | ||
| 420 | except KeyboardInterrupt: | ||
| 421 | sys.stderr.write("Keyboard interrupt, quitting\n") | ||
| 422 | break | ||
| 423 | except ExperimentDone: | ||
| 424 | sys.stderr.write("Experiment already completed at '%s'\n" % exp.out_dir) | ||
| 425 | exp.state = ExpState.Done | ||
| 426 | except (InvalidKernel, InvalidConfig) as e: | ||
| 427 | sys.stderr.write("Invalid environment for experiment '%s'\n" % exp.name) | ||
| 428 | sys.stderr.write("%s\n" % e) | ||
| 429 | exp.state = ExpState.Invalid | ||
| 430 | except SystemCorrupted as e: | ||
| 431 | sys.stderr.write("System is corrupted! Fix state before continuing.\n") | ||
| 432 | sys.stderr.write("%s\n" % e) | ||
| 433 | exp.state = ExpState.Failed | ||
| 434 | if not opts.retry: | ||
| 435 | break | ||
| 436 | else: | ||
| 437 | sys.stderr.write("Remaining experiments may fail\n") | ||
| 438 | except Exception as e: | ||
| 439 | sys.stderr.write("Failed experiment %s\n" % exp.name) | ||
| 440 | sys.stderr.write("%s\n" % e) | ||
| 441 | exp.state = ExpState.Failed | ||
| 358 | 442 | ||
| 443 | if exp.state is ExpState.Failed and opts.retry: | ||
| 444 | if exp.retries < MAX_RETRY: | ||
| 445 | exps_remaining += [(i, exp)] | ||
| 446 | exp.retries += 1 | ||
| 447 | else: | ||
| 448 | sys.stderr.write("Hit maximum retries of %d\n" % MAX_RETRY) | ||
| 359 | 449 | ||
| 360 | def main(): | 450 | def main(): |
| 361 | opts, args = parse_args() | 451 | opts, args = parse_args() |
| 362 | 452 | ||
| 363 | exps = get_exps(opts, args) | 453 | email = setup_email(opts.email) if opts.email else None |
| 364 | |||
| 365 | jabber = setup_jabber(opts.jabber) if opts.jabber else None | ||
| 366 | email = setup_email(opts.email) if opts.email else None | ||
| 367 | 454 | ||
| 368 | out_base = os.path.abspath(opts.out_dir) | 455 | out_base = os.path.abspath(opts.out_dir) |
| 369 | created = False | 456 | created = False |
| @@ -371,62 +458,35 @@ def main(): | |||
| 371 | created = True | 458 | created = True |
| 372 | os.mkdir(out_base) | 459 | os.mkdir(out_base) |
| 373 | 460 | ||
| 374 | ran = done = succ = failed = invalid = 0 | 461 | exps = get_exps(opts, args, out_base) |
| 375 | 462 | ||
| 376 | for i, exp in enumerate(exps): | 463 | run_exps(exps, opts) |
| 377 | sched_file, out_dir = make_paths(exp, out_base, opts) | ||
| 378 | sched_dir = os.path.split(sched_file)[0] | ||
| 379 | 464 | ||
| 380 | try: | 465 | def state_count(state): |
| 381 | start_message = "Loading experiment %d of %d." % (i+1, len(exps)) | 466 | return len(filter(lambda x: x.state is state, exps)) |
| 382 | exp_params = make_exp_params(opts.scheduler, opts.duration, | ||
| 383 | sched_dir, opts.param_file) | ||
| 384 | 467 | ||
| 385 | run_experiment(exp, sched_file, exp_params, out_dir, | 468 | ran = len(filter(lambda x: x.state is not ExpState.None, exps)) |
| 386 | start_message, opts.ignore, jabber) | 469 | succ = state_count(ExpState.Succeeded) |
| 387 | |||
| 388 | succ += 1 | ||
| 389 | except ExperimentDone: | ||
| 390 | sys.stderr.write("Experiment '%s' already completed " % exp + | ||
| 391 | "at '%s'\n" % out_base) | ||
| 392 | done += 1 | ||
| 393 | except (InvalidKernel, InvalidConfig) as e: | ||
| 394 | sys.stderr.write("Invalid environment for experiment '%s'\n" % exp) | ||
| 395 | sys.stderr.write("%s\n" % e) | ||
| 396 | invalid += 1 | ||
| 397 | except KeyboardInterrupt: | ||
| 398 | sys.stderr.write("Keyboard interrupt, quitting\n") | ||
| 399 | break | ||
| 400 | except SystemCorrupted as e: | ||
| 401 | sys.stderr.write("System is corrupted! Fix state before continuing.\n") | ||
| 402 | sys.stderr.write("%s\n" % e) | ||
| 403 | break | ||
| 404 | except Exception as e: | ||
| 405 | sys.stderr.write("Failed experiment %s\n" % exp) | ||
| 406 | sys.stderr.write("%s\n" % e) | ||
| 407 | failed += 1 | ||
| 408 | |||
| 409 | ran += 1 | ||
| 410 | |||
| 411 | # Clean out directory if it failed immediately | ||
| 412 | if not os.listdir(out_base) and created and not succ: | ||
| 413 | os.rmdir(out_base) | ||
| 414 | 470 | ||
| 415 | message = "Experiments ran:\t%d of %d" % (ran, len(exps)) +\ | 471 | message = "Experiments ran:\t%d of %d" % (ran, len(exps)) +\ |
| 416 | "\n Successful:\t\t%d" % succ +\ | 472 | "\n Successful:\t\t%d" % succ +\ |
| 417 | "\n Failed:\t\t%d" % failed +\ | 473 | "\n Failed:\t\t%d" % state_count(ExpState.Failed) +\ |
| 418 | "\n Already Done:\t\t%d" % done +\ | 474 | "\n Already Done:\t\t%d" % state_count(ExpState.Done) +\ |
| 419 | "\n Invalid Environment:\t%d" % invalid | 475 | "\n Invalid Environment:\t%d" % state_count(ExpState.Invalid) |
| 420 | 476 | ||
| 421 | print(message) | 477 | print(message) |
| 422 | 478 | ||
| 479 | if email: | ||
| 480 | email.send(message) | ||
| 481 | email.close() | ||
| 482 | |||
| 423 | if succ: | 483 | if succ: |
| 424 | sys.stderr.write("Successful experiment data saved in %s.\n" % | 484 | sys.stderr.write("Successful experiment data saved in %s.\n" % |
| 425 | opts.out_dir) | 485 | opts.out_dir) |
| 486 | elif not os.listdir(out_base) and created: | ||
| 487 | # Remove directory if no data was put into it | ||
| 488 | os.rmdir(out_base) | ||
| 426 | 489 | ||
| 427 | if email: | ||
| 428 | email.send(message) | ||
| 429 | email.close() | ||
| 430 | 490 | ||
| 431 | if __name__ == '__main__': | 491 | if __name__ == '__main__': |
| 432 | main() | 492 | main() |
