about | summary | refs | log | tree | commit | diff | stats
path: root/run_exps.py
diff options
context:
space:
mode:
author: Jonathan Herman <hermanjl@cs.unc.edu> 2013-05-01 15:48:01 -0400
committer: Jonathan Herman <hermanjl@cs.unc.edu> 2013-05-01 15:48:01 -0400
commit: cd9f1b026cc5c4526dfbd2f7b1c5f39edb6a7309 (patch)
tree: 5b6221e55d7f50c88a574ed4f57ff7efd9b7103d /run_exps.py
parent: 94cc65997d237ddeab24d396f06bb93bc0644a9d (diff)
Added --crontab option to run_exps.py
This will use crontab to automatically restart the machine and resume the script when the machine crashes. An additional option, -k, is provided to cancel this operation.
Diffstat (limited to 'run_exps.py')
-rwxr-xr-x run_exps.py 82
1 files changed, 71 insertions, 11 deletions
diff --git a/run_exps.py b/run_exps.py
index 1bad2a3..21666a9 100755
--- a/run_exps.py
+++ b/run_exps.py
@@ -3,10 +3,12 @@ from __future__ import print_function
3 3
4import common as com 4import common as com
5import os 5import os
6import pickle
6import pprint 7import pprint
7import re 8import re
8import shutil 9import shutil
9import sys 10import sys
11import run.crontab as cron
10import run.tracer as trace 12import run.tracer as trace
11 13
12from config.config import PARAMS,DEFAULTS,FILES 14from config.config import PARAMS,DEFAULTS,FILES
@@ -17,9 +19,6 @@ from run.executable.executable import Executable
17from run.experiment import Experiment,ExperimentDone,SystemCorrupted 19from run.experiment import Experiment,ExperimentDone,SystemCorrupted
18from run.proc_entry import ProcEntry 20from run.proc_entry import ProcEntry
19 21
20'''Maximum times an experiment will be retried'''
21MAX_RETRY = 5
22
23'''Customizable experiment parameters''' 22'''Customizable experiment parameters'''
24ExpParams = namedtuple('ExpParams', ['scheduler', 'duration', 'tracers', 23ExpParams = namedtuple('ExpParams', ['scheduler', 'duration', 'tracers',
25 'kernel', 'config_options', 'file_params', 24 'kernel', 'config_options', 'file_params',
@@ -31,6 +30,11 @@ ExpData = com.recordtype('ExpData', ['name', 'params', 'sched_file', 'out_dir',
31'''Comparison of requested versus actual kernel compile parameter value''' 30'''Comparison of requested versus actual kernel compile parameter value'''
32ConfigResult = namedtuple('ConfigResult', ['param', 'wanted', 'actual']) 31ConfigResult = namedtuple('ConfigResult', ['param', 'wanted', 'actual'])
33 32
33'''Maximum times an experiment will be retried'''
34MAX_RETRY = 5
35'''Location experiment retry count is stored'''
36TRIES_FNAME = ".tries.pkl"
37
34 38
35class InvalidKernel(Exception): 39class InvalidKernel(Exception):
36 def __init__(self, kernel): 40 def __init__(self, kernel):
@@ -88,6 +92,9 @@ def parse_args():
88 action='store_true', default=False, 92 action='store_true', default=False,
89 help='use crontab to resume interrupted script after ' 93 help='use crontab to resume interrupted script after '
90 'system restarts. implies --retry') 94 'system restarts. implies --retry')
95 group.add_option('-k', '--kill-crontab', dest='kill',
96 action='store_true', default=False,
97 help='kill existing script crontabs and exit')
91 parser.add_option_group(group) 98 parser.add_option_group(group)
92 99
93 return parser.parse_args() 100 return parser.parse_args()
@@ -314,7 +321,7 @@ def run_experiment(data, start_message, ignore, jabber):
314 321
315def make_paths(exp, opts, out_base_dir): 322def make_paths(exp, opts, out_base_dir):
316 '''Translate experiment name to (schedule file, output directory) paths''' 323 '''Translate experiment name to (schedule file, output directory) paths'''
317 path = "%s/%s" % (os.getcwd(), exp) 324 path = os.path.abspath(exp)
318 out_dir = "%s/%s" % (out_base_dir, os.path.split(exp.strip('/'))[1]) 325 out_dir = "%s/%s" % (out_base_dir, os.path.split(exp.strip('/'))[1])
319 326
320 if not os.path.exists(path): 327 if not os.path.exists(path):
@@ -408,10 +415,35 @@ def setup_email(target):
408 return None 415 return None
409 416
410 417
418def tries_file(exp):
419 return "%s/%s" % (os.path.split(exp.sched_file)[0], TRIES_FNAME)
420
421
422def get_tries(exp):
423 if not os.path.exists(tries_file(exp)):
424 return 0
425 with open(tries_file(exp), 'r') as f:
426 return int(pickle.load(f))
427
428
429def set_tries(exp, val):
430 if not val:
431 if os.path.exists(tries_file(exp)):
432 os.remove(tries_file(exp))
433 else:
434 with open(tries_file(exp), 'w') as f:
435 pickle.dump(str(val), f)
436 os.system('sync')
437
438
411def run_exps(exps, opts): 439def run_exps(exps, opts):
412 jabber = setup_jabber(opts.jabber) if opts.jabber else None 440 jabber = setup_jabber(opts.jabber) if opts.jabber else None
413 441
414 exps_remaining = list(enumerate(exps)) 442 # Give each experiment a unique id
443 exps_remaining = enumerate(exps)
444 # But run experiments which have failed the most last
445 exps_remaining = sorted(exps_remaining, key=lambda x: get_tries(x[1]))
446
415 while exps_remaining: 447 while exps_remaining:
416 i, exp = exps_remaining.pop(0) 448 i, exp = exps_remaining.pop(0)
417 449
@@ -419,17 +451,26 @@ def run_exps(exps, opts):
419 start_message = "%s experiment %d of %d." % (verb, i+1, len(exps)) 451 start_message = "%s experiment %d of %d." % (verb, i+1, len(exps))
420 452
421 try: 453 try:
454 set_tries(exp, get_tries(exp) + 1)
455 if get_tries(exp) > MAX_RETRY:
456 raise Exception("Hit maximum retries of %d" % MAX_RETRY)
457
422 run_experiment(exp, start_message, opts.ignore, jabber) 458 run_experiment(exp, start_message, opts.ignore, jabber)
459
460 set_tries(exp, 0)
423 exp.state = ExpState.Succeeded 461 exp.state = ExpState.Succeeded
424 except KeyboardInterrupt: 462 except KeyboardInterrupt:
425 sys.stderr.write("Keyboard interrupt, quitting\n") 463 sys.stderr.write("Keyboard interrupt, quitting\n")
464 set_tries(exp, get_tries(exp) - 1)
426 break 465 break
427 except ExperimentDone: 466 except ExperimentDone:
428 sys.stderr.write("Experiment already completed at '%s'\n" % exp.out_dir) 467 sys.stderr.write("Experiment already completed at '%s'\n" % exp.out_dir)
468 set_tries(exp, 0)
429 exp.state = ExpState.Done 469 exp.state = ExpState.Done
430 except (InvalidKernel, InvalidConfig) as e: 470 except (InvalidKernel, InvalidConfig) as e:
431 sys.stderr.write("Invalid environment for experiment '%s'\n" % exp.name) 471 sys.stderr.write("Invalid environment for experiment '%s'\n" % exp.name)
432 sys.stderr.write("%s\n" % e) 472 sys.stderr.write("%s\n" % e)
473 set_tries(exp, get_tries(exp) - 1)
433 exp.state = ExpState.Invalid 474 exp.state = ExpState.Invalid
434 except SystemCorrupted as e: 475 except SystemCorrupted as e:
435 sys.stderr.write("System is corrupted! Fix state before continuing.\n") 476 sys.stderr.write("System is corrupted! Fix state before continuing.\n")
@@ -445,17 +486,19 @@ def run_exps(exps, opts):
445 exp.state = ExpState.Failed 486 exp.state = ExpState.Failed
446 487
447 if exp.state is ExpState.Failed and opts.retry: 488 if exp.state is ExpState.Failed and opts.retry:
448 if exp.retries < MAX_RETRY: 489 exps_remaining += [(i, exp)]
449 exps_remaining += [(i, exp)] 490
450 exp.retries += 1
451 else:
452 sys.stderr.write("Hit maximum retries of %d\n" % MAX_RETRY)
453 491
454def main(): 492def main():
455 opts, args = parse_args() 493 opts, args = parse_args()
456 494
495 if opts.kill:
496 cron.kill_boot_job()
497 sys.exit(1)
498
457 email = setup_email(opts.email) if opts.email else None 499 email = setup_email(opts.email) if opts.email else None
458 500
501 # Create base output directory for run data
459 out_base = os.path.abspath(opts.out_dir) 502 out_base = os.path.abspath(opts.out_dir)
460 created = False 503 created = False
461 if not os.path.exists(out_base): 504 if not os.path.exists(out_base):
@@ -464,7 +507,24 @@ def main():
464 507
465 exps = get_exps(opts, args, out_base) 508 exps = get_exps(opts, args, out_base)
466 509
467 run_exps(exps, opts) 510 if opts.crontab:
511 # Resume script on startup
512 opts.retry = True
513 cron.install_boot_job(['f', '--forced'],
514 "Stop with %s -k" % com.get_cmd())
515
516 if opts.force or not opts.retry:
517 cron.clean_output()
518 for e in exps:
519 set_tries(e, 0)
520
521 try:
522 run_exps(exps, opts)
523 finally:
524 # Remove persistent state
525 for e in exps:
526 set_tries(e, 0)
527 cron.remove_boot_job()
468 528
469 def state_count(state): 529 def state_count(state):
470 return len(filter(lambda x: x.state is state, exps)) 530 return len(filter(lambda x: x.state is state, exps))