aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--README.md6
-rw-r--r--common.py7
-rw-r--r--config/config.py18
-rw-r--r--gen/edf_generators.py1
-rw-r--r--gen/generator.py6
-rwxr-xr-xgen_exps.py15
-rw-r--r--parse/col_map.py4
-rw-r--r--parse/sched.py31
-rwxr-xr-xparse_exps.py4
-rw-r--r--run/crontab.py151
-rw-r--r--run/experiment.py52
-rwxr-xr-xrun_exps.py334
12 files changed, 474 insertions, 155 deletions
diff --git a/README.md b/README.md
index b074aa5..1f38978 100644
--- a/README.md
+++ b/README.md
@@ -156,16 +156,14 @@ You can specify your own spin programs to run as well instead of rtspin by putti
156$ echo "colorspin -f color1.csv 10 20" > test.sched 156$ echo "colorspin -f color1.csv 10 20" > test.sched
157``` 157```
158 158
159You can specify parameters for an experiment in a file instead of on the command line using params.py (the `-p` option lets you choose the name of this file if `params.py` is not for you): 159You can specify parameters for an experiment in a file instead of on the command line using params.py:
160 160
161```bash 161```bash
162$ echo "{'scheduler':'GSN-EDF', 'duration':10}" > params.py 162$ echo "{'scheduler':'GSN-EDF', 'duration':10}" > params.py
163$ run_exps.py test.sched 163$ run_exps.py test.sched
164``` 164```
165 165
166You can also run multiple experiments with a single command, provided a directory with a schedule file exists for each. By default, the program will look for sched.py for the schedule file and params.py for the parameter file, but this behavior can be changed using the `-p` and `-c` options. 166You can also run multiple experiments with a single command, provided a directory with a schedule file exists for each. You can include non-relevant parameters which `run_exps.py` does not understand in `params.py`. These parameters will be saved with the data output by `run_exps.py`. This is useful for tracking variations in system parameters versus experimental results. In the following example, multiple experiments are demonstrated and an extra parameter `test-param` is included:
167
168You can include non-relevant parameters which `run_exps.py` does not understand in `params.py`. These parameters will be saved with the data output by `run_exps.py`. This is useful for tracking variations in system parameters versus experimental results. In the following example, multiple experiments are demonstrated and an extra parameter `test-param` is included:
169 167
170```bash 168```bash
171$ mkdir test1 169$ mkdir test1
diff --git a/common.py b/common.py
index ff0f986..fd1a983 100644
--- a/common.py
+++ b/common.py
@@ -182,7 +182,7 @@ def ft_freq():
182 182
183 183
184def kernel(): 184def kernel():
185 return subprocess.check_output(["uname", "-r"]) 185 return subprocess.check_output(["uname", "-r"]).strip("\n")
186 186
187def is_executable(fname): 187def is_executable(fname):
188 '''Return whether the file passed in is executable''' 188 '''Return whether the file passed in is executable'''
@@ -212,4 +212,7 @@ def log_once(id, msg = None):
212 __logged += [id] 212 __logged += [id]
213 if indent: 213 if indent:
214 msg = ' ' + msg.strip('\t').replace('\n', '\n\t') 214 msg = ' ' + msg.strip('\t').replace('\n', '\n\t')
215 sys.stderr.write('\n' + msg + '\n') 215 sys.stderr.write('\n' + msg.strip('\n') + '\n')
216
217def get_cmd():
218 return os.path.split(sys.argv[0])[1]
diff --git a/config/config.py b/config/config.py
index 28e78c9..27cb2dd 100644
--- a/config/config.py
+++ b/config/config.py
@@ -14,12 +14,14 @@ BINS = {'rtspin' : get_executable_hint('rtspin', 'liblitmus'),
14 # Optional, as sched_trace is not a publicly supported repository 14 # Optional, as sched_trace is not a publicly supported repository
15 'st_show' : get_executable_hint('st_show', 'sched_trace', True)} 15 'st_show' : get_executable_hint('st_show', 'sched_trace', True)}
16 16
17'''Names of output files.''' 17'''Names of data files.'''
18FILES = {'ft_data' : 'ft.bin', 18FILES = {'params_file' : 'params.py',
19 'ft_matches' : r'(ft.*\.bin$)|(.*\.ft)', 19 'sched_file' : 'sched.py',
20 'linux_data' : 'trace.dat', 20 'ft_data' : 'ft.bin',
21 'sched_data' : 'st-{}.bin', 21 'ft_matches' : r'(ft.*\.bin$)|(.*\.ft)',
22 'log_data' : 'trace.slog'} 22 'linux_data' : 'trace.dat',
23 'sched_data' : 'st-{}.bin',
24 'log_data' : 'trace.slog'}
23 25
24'''Default parameter names in params.py.''' 26'''Default parameter names in params.py.'''
25PARAMS = {'sched' : 'scheduler', # Scheduler used by run_exps 27PARAMS = {'sched' : 'scheduler', # Scheduler used by run_exps
@@ -35,9 +37,7 @@ PARAMS = {'sched' : 'scheduler', # Scheduler used by run_exps
35 } 37 }
36 38
37'''Default values for program options.''' 39'''Default values for program options.'''
38DEFAULTS = {'params_file' : 'params.py', 40DEFAULTS = {'duration' : 10,
39 'sched_file' : 'sched.py',
40 'duration' : 10,
41 'prog' : 'rtspin', 41 'prog' : 'rtspin',
42 'out-gen' : 'exps', 42 'out-gen' : 'exps',
43 'out-run' : 'run-data', 43 'out-run' : 'run-data',
diff --git a/gen/edf_generators.py b/gen/edf_generators.py
index a722c21..8e4b8df 100644
--- a/gen/edf_generators.py
+++ b/gen/edf_generators.py
@@ -28,6 +28,7 @@ class EdfGenerator(gen.Generator):
28 pdist = self._create_dist('period', 28 pdist = self._create_dist('period',
29 exp_params['periods'], 29 exp_params['periods'],
30 gen.NAMED_PERIODS) 30 gen.NAMED_PERIODS)
31
31 udist = self._create_dist('utilization', 32 udist = self._create_dist('utilization',
32 exp_params['utils'], 33 exp_params['utils'],
33 gen.NAMED_UTILIZATIONS) 34 gen.NAMED_UTILIZATIONS)
diff --git a/gen/generator.py b/gen/generator.py
index bc86cfe..40a0243 100644
--- a/gen/generator.py
+++ b/gen/generator.py
@@ -6,7 +6,7 @@ import shutil as sh
6 6
7from Cheetah.Template import Template 7from Cheetah.Template import Template
8from common import get_config_option,num_cpus,recordtype,log_once 8from common import get_config_option,num_cpus,recordtype,log_once
9from config.config import DEFAULTS,PARAMS 9from config.config import FILES,PARAMS
10from gen.dp import DesignPointGenerator 10from gen.dp import DesignPointGenerator
11from parse.col_map import ColMapBuilder 11from parse.col_map import ColMapBuilder
12 12
@@ -129,7 +129,7 @@ class Generator(object):
129 129
130 def _write_schedule(self, params): 130 def _write_schedule(self, params):
131 '''Write schedule file using current template for @params.''' 131 '''Write schedule file using current template for @params.'''
132 sched_file = self.out_dir + "/" + DEFAULTS['sched_file'] 132 sched_file = self.out_dir + "/" + FILES['sched_file']
133 with open(sched_file, 'wa') as f: 133 with open(sched_file, 'wa') as f:
134 f.write(str(Template(self.template, searchList=[params]))) 134 f.write(str(Template(self.template, searchList=[params])))
135 135
@@ -143,7 +143,7 @@ class Generator(object):
143 else: 143 else:
144 tasks = 0 144 tasks = 0
145 145
146 exp_params_file = self.out_dir + "/" + DEFAULTS['params_file'] 146 exp_params_file = self.out_dir + "/" + FILES['params_file']
147 with open(exp_params_file, 'wa') as f: 147 with open(exp_params_file, 'wa') as f:
148 params['scheduler'] = self.scheduler 148 params['scheduler'] = self.scheduler
149 pprint.pprint(params, f) 149 pprint.pprint(params, f)
diff --git a/gen_exps.py b/gen_exps.py
index 65f50d8..e888f5f 100755
--- a/gen_exps.py
+++ b/gen_exps.py
@@ -43,6 +43,14 @@ def load_file(fname):
43 except: 43 except:
44 raise IOError("Invalid generation file: %s" % fname) 44 raise IOError("Invalid generation file: %s" % fname)
45 45
def print_descriptions(described):
    '''Print the help of every generator named in @described.

    @described is a comma-separated string of generator names. Names which
    are not among gen.get_generators() are reported on stderr instead.
    '''
    for name in described.split(','):
        if name in gen.get_generators():
            print("Generator '%s', " % name)
            make_generator = gen.get_generators()[name]
            make_generator().print_help()
            continue
        sys.stderr.write("No generator '%s'\n" % name)
53
46def main(): 54def main():
47 opts, args = parse_args() 55 opts, args = parse_args()
48 56
@@ -50,12 +58,7 @@ def main():
50 if opts.list_gens: 58 if opts.list_gens:
51 print(", ".join(gen.get_generators())) 59 print(", ".join(gen.get_generators()))
52 if opts.described != None: 60 if opts.described != None:
53 for generator in opts.described.split(','): 61 print_descriptions(opts.described)
54 if generator not in gen.get_generators():
55 sys.stderr.write("No generator '%s'\n" % generator)
56 else:
57 print("Generator '%s', " % generator)
58 gen.get_generators()[generator]().print_help()
59 if opts.list_gens or opts.described: 62 if opts.list_gens or opts.described:
60 return 0 63 return 0
61 64
diff --git a/parse/col_map.py b/parse/col_map.py
index ceb8867..59484e8 100644
--- a/parse/col_map.py
+++ b/parse/col_map.py
@@ -22,7 +22,7 @@ class ColMapBuilder(object):
22 22
23class ColMap(object): 23class ColMap(object):
24 def __init__(self, col_list, values = None): 24 def __init__(self, col_list, values = None):
25 self.col_list = col_list 25 self.col_list = sorted(col_list)
26 self.rev_map = {} 26 self.rev_map = {}
27 self.values = values 27 self.values = values
28 28
@@ -50,7 +50,7 @@ class ColMap(object):
50 if col not in kv: 50 if col not in kv:
51 key += (None,) 51 key += (None,)
52 else: 52 else:
53 key += (kv[col],) 53 key += (str(kv[col]),)
54 54
55 return key 55 return key
56 56
diff --git a/parse/sched.py b/parse/sched.py
index 6e1fbe6..524f1ed 100644
--- a/parse/sched.py
+++ b/parse/sched.py
@@ -98,15 +98,38 @@ record_map = {}
98RECORD_SIZE = 24 98RECORD_SIZE = 24
99NSEC_PER_MSEC = 1000000 99NSEC_PER_MSEC = 1000000
100 100
def bits_to_bytes(bits):
    '''Return the number of whole bytes needed to hold @bits bits.

    A partially used last byte counts as a full byte (padding).
    '''
    # Floor division keeps the result an int under true division as well
    return bits // 8 + (1 if bits % 8 else 0)

def field_bytes(fields):
    '''Return the number of bytes taken up by the field list @fields.

    Each field is either a (name, type) pair, which contributes the size of
    its type, or a (name, type, bits) triple describing a bitfield, which
    contributes its bit width. Consecutive bitfields share bytes; a run of
    bitfields is rounded up to whole bytes once it ends.
    '''
    fbytes = 0
    fbits = 0
    for f in fields:
        flist = list(f)

        if len(flist) > 2:
            # Specified a bitfield
            fbits += flist[2]
        else:
            # Bitfields followed by a byte will cause any incomplete
            # bytes to be turned into full bytes
            fbytes += bits_to_bytes(fbits)
            fbits = 0

            # Only specified a type, use types size
            fbytes += sizeof(flist[1])

    # Trailing bitfields are rounded up exactly once. Adding the raw bit
    # count on top of the rounded bytes counted them twice.
    return fbytes + bits_to_bytes(fbits)
125
101def register_record(id, clazz): 126def register_record(id, clazz):
102 fields = clazz.FIELDS 127 fields = clazz.FIELDS
103 128 diff = RECORD_SIZE - field_bytes(SchedRecord.FIELDS) - field_bytes(fields)
104 fsize = lambda fields : sum([sizeof(list(f)[1]) for f in fields])
105 diff = RECORD_SIZE - fsize(SchedRecord.FIELDS) - fsize(fields)
106 129
107 # Create extra padding fields to make record the proper size 130 # Create extra padding fields to make record the proper size
108 # Creating one big field of c_uint64 and giving it a size of 8*diff 131 # Creating one big field of c_uint64 and giving it a size of 8*diff
109 # _shoud_ work, but doesn't. This is an uglier way of accomplishing 132 # _should_ work, but doesn't. This is an uglier way of accomplishing
110 # the same goal 133 # the same goal
111 for d in range(diff): 134 for d in range(diff):
112 fields += [("extra%d" % d, c_char)] 135 fields += [("extra%d" % d, c_char)]
diff --git a/parse_exps.py b/parse_exps.py
index 98f95df..37667aa 100755
--- a/parse_exps.py
+++ b/parse_exps.py
@@ -14,7 +14,7 @@ import sys
14import traceback 14import traceback
15 15
16from collections import namedtuple 16from collections import namedtuple
17from config.config import DEFAULTS,PARAMS 17from config.config import FILES,DEFAULTS,PARAMS
18from optparse import OptionParser 18from optparse import OptionParser
19from parse.point import ExpPoint 19from parse.point import ExpPoint
20from parse.tuple_table import TupleTable 20from parse.tuple_table import TupleTable
@@ -94,7 +94,7 @@ def parse_exp(exp_force_base):
94 94
95 95
96def get_exp_params(data_dir, cm_builder): 96def get_exp_params(data_dir, cm_builder):
97 param_file = "%s/%s" % (data_dir, DEFAULTS['params_file']) 97 param_file = "%s/%s" % (data_dir, FILES['params_file'])
98 if os.path.isfile(param_file): 98 if os.path.isfile(param_file):
99 params = com.load_params(param_file) 99 params = com.load_params(param_file)
100 100
diff --git a/run/crontab.py b/run/crontab.py
new file mode 100644
index 0000000..87d71b1
--- /dev/null
+++ b/run/crontab.py
@@ -0,0 +1,151 @@
1from __future__ import print_function
2
3import common
4import os
5import re
6import sys
7
8from subprocess import Popen, PIPE, check_output
9
# Value written to the kernel.panic and kernel.panic_on_oops sysctls by
# set_panic_restart() when reboot-on-panic is enabled.
PANIC_DUR = 10
# Total seconds to wait after a reboot before the job is run again.
DELAY = 30
# Length in seconds of each sleep between countdown wall messages.
DELAY_INTERVAL = 10
13
def get_cron_data():
    '''Return the output of `crontab -l`, or "" when it cannot be read.

    A non-zero exit of crontab (CalledProcessError) and a failure to start
    it (OSError) are both treated as an empty crontab. Anything else, such
    as a KeyboardInterrupt, is no longer swallowed.
    '''
    from subprocess import CalledProcessError

    try:
        return check_output(['crontab', '-l'])
    except (CalledProcessError, OSError):
        return ""
19
def wall(message):
    '''Return a shell command which broadcasts @message with no wall header.

    Single quotes inside @message are escaped so that they cannot end the
    quoted argument of echo early.
    '''
    # ' -> '\'' : close the quote, emit a literal quote, reopen the quote
    escaped = message.replace("'", "'\\''")
    return "echo '%s' | wall -n" % escaped
23
def sanitize(args, ignored):
    '''Return a copy of @args with the entries of @ignored removed.

    Arguments found in @ignored are dropped entirely. Every ignored string
    is also deleted from inside arguments which start with a single dash
    (eg grouped short options like '-fc'). Arguments starting with '--' and
    all other arguments are passed through unchanged.
    '''
    kept = []
    for arg in args:
        if arg in ignored:
            continue
        # startswith() also copes with an empty argument, where indexing
        # the first character raised an IndexError
        if arg.startswith('-') and not arg.startswith('--'):
            for ign in ignored:
                arg = arg.replace(ign, '')
        kept.append(arg)
    return kept
34
def get_outfname():
    '''Return the name of the file which collects the output of the job.'''
    cmd_name = common.get_cmd()
    return "cron-%s.txt" % cmd_name
37
def get_boot_cron(ignored_params, extra=""):
    '''Build the @reboot crontab line which runs this script again.

    @ignored_params are stripped from the command line of the job and @extra
    is appended to each countdown message. A notice describing the job is
    written to stderr as a side effect.
    '''
    argv = sanitize(sys.argv, ignored_params)
    job = " ".join(argv)
    out_fname = get_outfname()

    short_job = " ".join([common.get_cmd()] + argv[1:])
    msg = "Job '%s' will write output to '%s'" % (short_job, out_fname)
    sys.stderr.write("%s %d seconds after reboot.\n" % (msg, DELAY))

    # Countdown: one initial sleep, then a wall announcement in front of
    # each of the remaining sleeps
    steps = ["sleep %d" % DELAY_INTERVAL]
    for remaining in range(DELAY - DELAY_INTERVAL, 0, -DELAY_INTERVAL):
        countdown = "Restarting experiments in %d seconds. %s" % (remaining, extra)
        steps.append(wall(countdown))
        steps.append("sleep %d" % min(DELAY_INTERVAL, remaining))
    delay_cmd = ";".join(steps)

    # The job is only started when `uname -r` after the reboot matches the
    # kernel running now; otherwise the mismatch is announced with wall
    kern = common.kernel()
    fail_wall = wall("Need matching kernel '%s' to run!" % kern)
    run_parts = (kern, wall(msg), wall("Starting..."), job,
                 out_fname, out_fname, fail_wall)
    run_cmd = "echo '%s' | grep -q `uname -r` && %s && %s && %s >> %s 2>>%s || %s" % run_parts

    return "@reboot cd %s; %s; %s;" % (os.getcwd(), delay_cmd, run_cmd)
67
def set_panic_restart(bool_val):
    '''Turn rebooting after a kernel panic on (@bool_val true) or off.

    Announces the change on stderr, then writes the kernel.panic and
    kernel.panic_on_oops sysctls: PANIC_DUR when enabling, 0 when disabling.
    '''
    if bool_val:
        notice, dur = "Kernel will reboot after panic.\n", PANIC_DUR
    else:
        notice, dur = "Kernel will no longer reboot after panic.\n", 0
    sys.stderr.write(notice)

    sysctl_cmd = ['sysctl', '-w',
                  "kernel.panic=%d" % dur,
                  "kernel.panic_on_oops=%d" % dur]
    check_output(sysctl_cmd)
79
def write_cron_data(data):
    '''Write new crontab entry. No blank lines are written

    @data is piped to `crontab -` on stdin, so it replaces the whole
    crontab rather than being appended to it.
    '''
    # Drop every whitespace-only line. The flags have to be passed by
    # keyword: the fourth positional argument of re.sub() is count, so a
    # positional re.M was read as "replace at most 8 matches" and never
    # enabled multi-line matching of ^
    data = re.sub(r"^\s*\n", "", data, flags=re.M)

    sp = Popen(["crontab", "-"], stdin=PIPE)
    sp.communicate(input=data)
88
def install_path():
    '''Place the current path in the crontab entry

    Directories of a PATH line already in the crontab are kept, in their
    existing order, and the directories of this process' PATH are appended
    when missing. The merged PATH line is written as the first line of the
    crontab.
    '''
    data = get_cron_data()
    # Anchored to whole lines so that eg a MANPATH= line is left alone
    curr_line = re.findall(r"^PATH=.*$", data, re.M)

    if curr_line:
        old_value = curr_line[0][len("PATH="):].strip().strip("\"'")
        # Split on the separator: matching directories with \w+ cut names
        # containing '-' or '.' into pieces
        old_paths = old_value.split(os.pathsep)
        # Literal replace: the line may contain regex metacharacters
        data = data.replace(curr_line[0], "")
    else:
        old_paths = []

    # De-duplicate with a list rather than a set, the order of PATH
    # entries decides which executable is found first
    paths = []
    for path in old_paths + os.environ.get("PATH", "").split(os.pathsep):
        if path and path not in paths:
            paths.append(path)

    data = "PATH=" + os.pathsep.join(paths) + "\n" + data

    write_cron_data(data)
107
def install_boot_job(ignored_params, reboot_message):
    '''Install a crontab entry which re-runs this script after a reboot.

    Any previously installed job is removed first and reboot-on-panic is
    switched on. Raises IOError when the new entry cannot be read back from
    the crontab; otherwise the current PATH is installed as well.
    '''
    remove_boot_job()

    existing = get_cron_data()
    job = get_boot_cron(ignored_params, reboot_message)

    set_panic_restart(True)

    write_cron_data(existing + job + "\n")

    # Read the crontab back to make sure the entry was accepted
    if job not in get_cron_data():
        raise IOError("Failed to write %s into cron!" % job)

    install_path()
123
def clean_output():
    '''Delete the output file of the cron job if it exists.'''
    out_fname = get_outfname()
    if not os.path.exists(out_fname):
        return
    os.remove(out_fname)
128
def kill_boot_job():
    '''Remove the reboot job from the crontab and kill running instances.

    Every process listed by ps whose command line looks like a
    `/bin/sh -c ... <this command>` invocation is sent SIGKILL.
    '''
    import signal

    remove_boot_job()

    # Escape the command name, as remove_boot_job() does, so that eg the
    # '.' of a script name only matches a literal dot
    job_re = re.compile(r"/bin/sh -c.*%s" % re.escape(common.get_cmd()))

    procs = check_output(["ps", "-eo", "pid,args"])

    for pid, args in re.findall(r"(\d+) (.*)", procs):
        if not job_re.search(args):
            continue

        sys.stderr.write("Killing job %s\n" % pid)
        try:
            os.kill(int(pid), signal.SIGKILL)
        except OSError as e:
            # Most likely the process exited on its own; report it and
            # carry on so that the remaining jobs are still killed
            sys.stderr.write("Could not kill job %s: %s\n" % (pid, e))
141
def remove_boot_job():
    '''Delete the lines of the crontab which mention this command.

    The crontab is only rewritten when such a line exists. Reboot-on-panic
    is switched off in either case.
    '''
    cron_text = get_cron_data()
    cmd_pattern = r".*%s.*" % re.escape(common.get_cmd())
    job_line = re.compile(cmd_pattern, re.M)

    if job_line.search(cron_text) is not None:
        write_cron_data(job_line.sub("", cron_text))

    set_panic_restart(False)
diff --git a/run/experiment.py b/run/experiment.py
index 4667cb1..5f18bea 100644
--- a/run/experiment.py
+++ b/run/experiment.py
@@ -43,6 +43,9 @@ class Experiment(object):
43 self.exec_err = None 43 self.exec_err = None
44 self.tracer_types = tracer_types 44 self.tracer_types = tracer_types
45 45
46 self.regular_tracers = []
47 self.exact_tracers = []
48
46 def __setup_tracers(self): 49 def __setup_tracers(self):
47 tracers = [ t(self.working_dir) for t in self.tracer_types ] 50 tracers = [ t(self.working_dir) for t in self.tracer_types ]
48 51
@@ -63,8 +66,13 @@ class Experiment(object):
63 Experiment.INTERRUPTED_DIR) 66 Experiment.INTERRUPTED_DIR)
64 interrupted = "%s/%s" % (os.path.split(self.working_dir)[0], 67 interrupted = "%s/%s" % (os.path.split(self.working_dir)[0],
65 Experiment.INTERRUPTED_DIR) 68 Experiment.INTERRUPTED_DIR)
69 old_int = "%s/%s" % (self.working_dir, Experiment.INTERRUPTED_DIR)
70
66 if os.path.exists(interrupted): 71 if os.path.exists(interrupted):
67 sh.rmtree(interrupted) 72 sh.rmtree(interrupted)
73 if os.path.exists(old_int):
74 sh.rmtree(old_int)
75
68 os.rename(self.working_dir, interrupted) 76 os.rename(self.working_dir, interrupted)
69 77
70 os.mkdir(self.working_dir) 78 os.mkdir(self.working_dir)
@@ -78,21 +86,24 @@ class Experiment(object):
78 executable.cwd = self.working_dir 86 executable.cwd = self.working_dir
79 map(assign_cwd, self.executables) 87 map(assign_cwd, self.executables)
80 88
81 def __kill_all(self): 89 def __try_kill_all(self):
82 if lu.waiting_tasks(): 90 try:
83 released = lu.release_tasks() 91 if lu.waiting_tasks():
84 self.log("Re-released %d tasks" % released) 92 released = lu.release_tasks()
93 self.log("Re-released %d tasks" % released)
85 94
86 time.sleep(1) 95 time.sleep(1)
87 96
88 self.log("Killing all tasks") 97 self.log("Killing all tasks")
89 for e in self.executables: 98 for e in self.executables:
90 try: 99 try:
91 e.kill() 100 e.kill()
92 except: 101 except:
93 pass 102 pass
94 103
95 time.sleep(1) 104 time.sleep(1)
105 except:
106 self.log("Failed to kill all tasks.")
96 107
97 def __strip_path(self, path): 108 def __strip_path(self, path):
98 '''Shorten path to something more readable.''' 109 '''Shorten path to something more readable.'''
@@ -194,6 +205,7 @@ class Experiment(object):
194 205
195 sched = lu.scheduler() 206 sched = lu.scheduler()
196 if sched != "Linux": 207 if sched != "Linux":
208 self.log("Switching back to Linux scheduler")
197 try: 209 try:
198 lu.switch_scheduler("Linux") 210 lu.switch_scheduler("Linux")
199 except: 211 except:
@@ -303,6 +315,7 @@ class Experiment(object):
303 self.__to_linux() 315 self.__to_linux()
304 316
305 succ = False 317 succ = False
318 exception = None
306 try: 319 try:
307 self.__setup() 320 self.__setup()
308 321
@@ -311,20 +324,21 @@ class Experiment(object):
311 self.log("Saving results in %s" % self.finished_dir) 324 self.log("Saving results in %s" % self.finished_dir)
312 succ = True 325 succ = True
313 except Exception as e: 326 except Exception as e:
327 exception = e
328
314 # Give time for whatever failed to finish failing 329 # Give time for whatever failed to finish failing
315 time.sleep(2) 330 time.sleep(2)
316 self.__kill_all()
317 331
318 raise e 332 self.__try_kill_all()
319 finally:
320 self.__teardown()
321 finally: 333 finally:
322 self.log("Switching back to Linux scheduler")
323 try: 334 try:
335 self.__teardown()
324 self.__to_linux() 336 self.__to_linux()
325 except Exception as e: 337 except Exception as e:
326 print(e) 338 exception = exception or e
327 339 finally:
340 if exception: raise exception
341
328 if succ: 342 if succ:
329 self.__save_results() 343 self.__save_results()
330 self.log("Experiment done!") 344 self.log("Experiment done!")
diff --git a/run_exps.py b/run_exps.py
index afabca8..21666a9 100755
--- a/run_exps.py
+++ b/run_exps.py
@@ -3,14 +3,18 @@ from __future__ import print_function
3 3
4import common as com 4import common as com
5import os 5import os
6import pickle
7import pprint
6import re 8import re
7import shutil 9import shutil
8import sys 10import sys
11import run.crontab as cron
9import run.tracer as trace 12import run.tracer as trace
10 13
11from config.config import PARAMS,DEFAULTS 14from config.config import PARAMS,DEFAULTS,FILES
12from collections import namedtuple 15from collections import namedtuple
13from optparse import OptionParser 16from optparse import OptionParser,OptionGroup
17from parse.enum import Enum
14from run.executable.executable import Executable 18from run.executable.executable import Executable
15from run.experiment import Experiment,ExperimentDone,SystemCorrupted 19from run.experiment import Experiment,ExperimentDone,SystemCorrupted
16from run.proc_entry import ProcEntry 20from run.proc_entry import ProcEntry
@@ -19,9 +23,19 @@ from run.proc_entry import ProcEntry
19ExpParams = namedtuple('ExpParams', ['scheduler', 'duration', 'tracers', 23ExpParams = namedtuple('ExpParams', ['scheduler', 'duration', 'tracers',
20 'kernel', 'config_options', 'file_params', 24 'kernel', 'config_options', 'file_params',
21 'pre_script', 'post_script']) 25 'pre_script', 'post_script'])
26'''Tracked with each experiment'''
27ExpState = Enum(['Failed', 'Succeeded', 'Invalid', 'Done', 'None'])
28ExpData = com.recordtype('ExpData', ['name', 'params', 'sched_file', 'out_dir',
29 'retries', 'state'])
22'''Comparison of requested versus actual kernel compile parameter value''' 30'''Comparison of requested versus actual kernel compile parameter value'''
23ConfigResult = namedtuple('ConfigResult', ['param', 'wanted', 'actual']) 31ConfigResult = namedtuple('ConfigResult', ['param', 'wanted', 'actual'])
24 32
33'''Maximum times an experiment will be retried'''
34MAX_RETRY = 5
35'''Location experiment retry count is stored'''
36TRIES_FNAME = ".tries.pkl"
37
38
25class InvalidKernel(Exception): 39class InvalidKernel(Exception):
26 def __init__(self, kernel): 40 def __init__(self, kernel):
27 self.kernel = kernel 41 self.kernel = kernel
@@ -51,27 +65,37 @@ def parse_args():
51 65
52 parser.add_option('-s', '--scheduler', dest='scheduler', 66 parser.add_option('-s', '--scheduler', dest='scheduler',
53 help='scheduler for all experiments') 67 help='scheduler for all experiments')
68 parser.add_option('-d', '--duration', dest='duration', type='int',
69 help='duration (seconds) of tasks')
54 parser.add_option('-i', '--ignore-environment', dest='ignore', 70 parser.add_option('-i', '--ignore-environment', dest='ignore',
55 action='store_true', default=False, 71 action='store_true', default=False,
56 help='run experiments even in invalid environments ') 72 help='run experiments even in invalid environments ')
57 parser.add_option('-d', '--duration', dest='duration', type='int', 73 parser.add_option('-f', '--force', action='store_true', default=False,
58 help='duration (seconds) of tasks') 74 dest='force', help='overwrite existing data')
59 parser.add_option('-o', '--out-dir', dest='out_dir', 75 parser.add_option('-o', '--out-dir', dest='out_dir',
60 help='directory for data output', 76 help='directory for data output',
61 default=DEFAULTS['out-run']) 77 default=DEFAULTS['out-run'])
62 parser.add_option('-p', '--params', dest='param_file', 78
63 help='file with experiment parameters') 79 group = OptionGroup(parser, "Communication Options")
64 parser.add_option('-c', '--schedule-file', dest='sched_file', 80 group.add_option('-j', '--jabber', metavar='username@domain',
65 help='name of schedule files within directories', 81 dest='jabber', default=None,
66 default=DEFAULTS['sched_file']) 82 help='send a jabber message when an experiment completes')
67 parser.add_option('-f', '--force', action='store_true', default=False, 83 group.add_option('-e', '--email', metavar='username@server',
68 dest='force', help='overwrite existing data') 84 dest='email', default=None,
69 parser.add_option('-j', '--jabber', metavar='username@domain', 85 help='send an email when all experiments complete')
70 dest='jabber', default=None, 86 parser.add_option_group(group)
71 help='send a jabber message when an experiment completes') 87
72 parser.add_option('-e', '--email', metavar='username@server', 88 group = OptionGroup(parser, "Persistence Options")
73 dest='email', default=None, 89 group.add_option('-r', '--retry', dest='retry', action='store_true',
74 help='send an email when all experiments complete') 90 default=False, help='retry failed experiments')
91 group.add_option('-c', '--crontab', dest='crontab',
92 action='store_true', default=False,
93 help='use crontab to resume interrupted script after '
94 'system restarts. implies --retry')
95 group.add_option('-k', '--kill-crontab', dest='kill',
96 action='store_true', default=False,
97 help='kill existing script crontabs and exit')
98 parser.add_option_group(group)
75 99
76 return parser.parse_args() 100 return parser.parse_args()
77 101
@@ -207,12 +231,12 @@ def run_script(script_params, exp, exp_dir, out_dir):
207 out.close() 231 out.close()
208 232
209 233
210def make_exp_params(cmd_scheduler, cmd_duration, sched_dir, param_file): 234def make_exp_params(cmd_scheduler, cmd_duration, sched_dir):
211 '''Return ExpParam with configured values of all hardcoded params.''' 235 '''Return ExpParam with configured values of all hardcoded params.'''
212 kernel = copts = "" 236 kernel = copts = ""
213 237
214 # Load parameter file 238 # Load parameter file
215 param_file = param_file or "%s/%s" % (sched_dir, DEFAULTS['params_file']) 239 param_file = "%s/%s" % (sched_dir, FILES['params_file'])
216 if os.path.isfile(param_file): 240 if os.path.isfile(param_file):
217 fparams = com.load_params(param_file) 241 fparams = com.load_params(param_file)
218 else: 242 else:
@@ -252,65 +276,118 @@ def make_exp_params(cmd_scheduler, cmd_duration, sched_dir, param_file):
252 config_options=copts, tracers=tracers, file_params=fparams, 276 config_options=copts, tracers=tracers, file_params=fparams,
253 pre_script=pre_script, post_script=post_script) 277 pre_script=pre_script, post_script=post_script)
254 278
255def run_experiment(name, sched_file, exp_params, out_dir, 279def run_experiment(data, start_message, ignore, jabber):
256 start_message, ignore, jabber):
257 '''Load and parse data from files and run result.''' 280 '''Load and parse data from files and run result.'''
258 if not os.path.isfile(sched_file): 281 if not os.path.isfile(data.sched_file):
259 raise IOError("Cannot find schedule file: %s" % sched_file) 282 raise IOError("Cannot find schedule file: %s" % data.sched_file)
260 283
261 dir_name, fname = os.path.split(sched_file) 284 dir_name, fname = os.path.split(data.sched_file)
262 work_dir = "%s/tmp" % dir_name 285 work_dir = "%s/tmp" % dir_name
263 286
264 procs, execs = load_schedule(name, sched_file, exp_params.duration) 287 procs, execs = load_schedule(data.name, data.sched_file, data.params.duration)
265 288
266 exp = Experiment(name, exp_params.scheduler, work_dir, out_dir, 289 exp = Experiment(data.name, data.params.scheduler, work_dir,
267 procs, execs, exp_params.tracers) 290 data.out_dir, procs, execs, data.params.tracers)
268 291
269 exp.log(start_message) 292 exp.log(start_message)
270 293
271 if not ignore: 294 if not ignore:
272 verify_environment(exp_params) 295 verify_environment(data.params)
273 296
274 run_script(exp_params.pre_script, exp, dir_name, work_dir) 297 run_script(data.params.pre_script, exp, dir_name, work_dir)
275 298
276 exp.run_exp() 299 exp.run_exp()
277 300
278 run_script(exp_params.post_script, exp, dir_name, out_dir) 301 run_script(data.params.post_script, exp, dir_name, data.out_dir)
279 302
280 if jabber: 303 if jabber:
281 jabber.send("Completed '%s'" % name) 304 jabber.send("Completed '%s'" % data.name)
282 305
283 # Save parameters used to run experiment in out_dir 306 # Save parameters used to run experiment in out_dir
284 out_params = dict(exp_params.file_params.items() + 307 out_params = dict([(PARAMS['sched'], data.params.scheduler),
285 [(PARAMS['sched'], exp_params.scheduler),
286 (PARAMS['tasks'], len(execs)), 308 (PARAMS['tasks'], len(execs)),
287 (PARAMS['dur'], exp_params.duration)]) 309 (PARAMS['dur'], data.params.duration)] +
310 data.params.file_params.items())
288 311
289 # Feather-trace clock frequency saved for accurate overhead parsing 312 # Feather-trace clock frequency saved for accurate overhead parsing
290 ft_freq = com.ft_freq() 313 ft_freq = com.ft_freq()
291 if ft_freq: 314 if ft_freq:
292 out_params[PARAMS['cycles']] = ft_freq 315 out_params[PARAMS['cycles']] = ft_freq
293 316
294 with open("%s/%s" % (out_dir, DEFAULTS['params_file']), 'w') as f: 317 out_param_f = "%s/%s" % (data.out_dir, FILES['params_file'])
295 f.write(str(out_params)) 318 with open(out_param_f, 'w') as f:
319 pprint.pprint(out_params, f)
320
296 321
322def make_paths(exp, opts, out_base_dir):
323 '''Translate experiment name to (schedule file, output directory) paths'''
324 path = os.path.abspath(exp)
325 out_dir = "%s/%s" % (out_base_dir, os.path.split(exp.strip('/'))[1])
326
327 if not os.path.exists(path):
328 raise IOError("Invalid experiment: %s" % path)
297 329
298def get_exps(opts, args): 330 if opts.force and os.path.exists(out_dir):
299 '''Return list of experiment files or directories''' 331 shutil.rmtree(out_dir)
300 if args:
301 return args
302 332
303 # Default to sched_file > generated dirs 333 if os.path.isdir(path):
304 if os.path.exists(opts.sched_file): 334 sched_file = "%s/%s" % (path, FILES['sched_file'])
305 sys.stderr.write("Reading schedule from %s.\n" % opts.sched_file)
306 return [opts.sched_file]
307 elif os.path.exists(DEFAULTS['out-gen']):
308 sys.stderr.write("Reading schedules from %s/*.\n" % DEFAULTS['out-gen'])
309 sched_dirs = os.listdir(DEFAULTS['out-gen'])
310 return ['%s/%s' % (DEFAULTS['out-gen'], d) for d in sched_dirs]
311 else: 335 else:
312 sys.stderr.write("Run with -h to view options.\n"); 336 sched_file = path
313 sys.exit(1) 337
338 return sched_file, out_dir
339
340
def get_common_header(args):
    '''Return the leading characters shared by every path in @args.

    The comparison is made character by character, not per path component.
    With fewer than two paths there is nothing to compare and "" is
    returned.
    '''
    if len(args) < 2:
        return ""

    # The hand-written scan read one character past the end of args[0], and
    # raised an IndexError, whenever the first path was a prefix of (or
    # equal to) all of the others
    return os.path.commonprefix(args)
356
357
def get_exps(opts, args, out_base_dir):
    '''Return a list with one ExpData for each experiment in @args.

    Without @args the schedule file in the current directory is used, or
    else every entry of the default gen_exps output directory. When neither
    exists a hint is printed and the program exits with status 1.
    '''
    if not args:
        sched_name = FILES['sched_file']
        gen_dir = DEFAULTS['out-gen']

        if os.path.exists(sched_name):
            # Default to sched_file in current directory
            sys.stderr.write("Reading schedule from %s.\n" % sched_name)
            args = [sched_name]
        elif os.path.exists(gen_dir):
            # Then try experiments created by gen_exps
            sys.stderr.write("Reading schedules from %s/*.\n" % gen_dir)
            args = ['%s/%s' % (gen_dir, d) for d in os.listdir(gen_dir)]
        else:
            sys.stderr.write("Run with -h to view options.\n")
            sys.exit(1)

    # Part of arg paths which is identical for each arg
    prefix_len = len(get_common_header(args))

    # Same member as ExpState.None, fetched by name because None cannot be
    # written as an attribute name on Python 3
    initial_state = getattr(ExpState, 'None')

    exps = []
    for path in args:
        sched_file, out_dir = make_paths(path, opts, out_base_dir)
        sched_dir = os.path.dirname(sched_file)
        exp_params = make_exp_params(opts.scheduler, opts.duration, sched_dir)

        exps.append(ExpData(path[prefix_len:], exp_params, sched_file,
                            out_dir, 0, initial_state))

    return exps
314 391
315 392
316def setup_jabber(target): 393def setup_jabber(target):
@@ -338,93 +415,142 @@ def setup_email(target):
338 return None 415 return None
339 416
340 417
def tries_file(exp):
    '''Return the path of the file holding exp's attempt count.

    The file lives in the same directory as the experiment's schedule file.
    '''
    return "%s/%s" % (os.path.split(exp.sched_file)[0], TRIES_FNAME)


def get_tries(exp):
    '''Return the attempt count recorded for exp, or 0 if none is recorded.'''
    path = tries_file(exp)
    if not os.path.exists(path):
        return 0
    # Pickle data is binary; text mode only happens to work for protocol 0
    # on POSIX under Python 2.
    with open(path, 'rb') as f:
        return int(pickle.load(f))


def set_tries(exp, val):
    '''Record val as the attempt count for exp.

    A false val (0 or None) deletes the record instead of storing it.
    '''
    path = tries_file(exp)
    if not val:
        if os.path.exists(path):
            os.remove(path)
    else:
        with open(path, 'wb') as f:
            # Stored as a string; get_tries converts it back with int()
            pickle.dump(str(val), f)
        # Flush to disk right away -- presumably so the count survives a
        # crash of the machine during the experiment (TODO confirm).
        os.system('sync')
358 437
def run_exps(exps, opts):
    '''Run every experiment in exps, recording the outcome in exp.state.

    Experiments with the fewest recorded attempts are run first. When
    opts.retry is set, failed experiments are put back at the end of the
    queue until they have been attempted more than MAX_RETRY times.
    A keyboard interrupt, or a corrupted system without opts.retry, stops
    the remaining experiments from running.
    '''
    jabber = setup_jabber(opts.jabber) if opts.jabber else None

    # 'None' is a keyword in Python 3, so the member is looked up by name.
    state_none = getattr(ExpState, 'None')

    # Give each experiment a unique id
    exps_remaining = enumerate(exps)
    # But run experiments which have failed the most last
    exps_remaining = sorted(exps_remaining, key=lambda x: get_tries(x[1]))

    while exps_remaining:
        i, exp = exps_remaining.pop(0)

        verb = "Loading" if exp.state == state_none else "Re-running failed"
        start_message = "%s experiment %d of %d." % (verb, i+1, len(exps))

        # Set once the retry budget is spent so the experiment is reported
        # as failed one last time but never queued again.
        exhausted = False

        try:
            tries = get_tries(exp) + 1
            set_tries(exp, tries)
            if tries > MAX_RETRY:
                exhausted = True
                raise Exception("Hit maximum retries of %d" % MAX_RETRY)

            run_experiment(exp, start_message, opts.ignore, jabber)

            set_tries(exp, 0)
            exp.state = ExpState.Succeeded
        except KeyboardInterrupt:
            sys.stderr.write("Keyboard interrupt, quitting\n")
            # The interrupted attempt does not count against the experiment
            set_tries(exp, get_tries(exp) - 1)
            break
        except ExperimentDone:
            sys.stderr.write("Experiment already completed at '%s'\n" % exp.out_dir)
            set_tries(exp, 0)
            exp.state = ExpState.Done
        except (InvalidKernel, InvalidConfig) as e:
            sys.stderr.write("Invalid environment for experiment '%s'\n" % exp.name)
            sys.stderr.write("%s\n" % e)
            # An invalid environment does not count as an attempt either
            set_tries(exp, get_tries(exp) - 1)
            exp.state = ExpState.Invalid
        except SystemCorrupted as e:
            sys.stderr.write("System is corrupted! Fix state before continuing.\n")
            sys.stderr.write("%s\n" % e)
            exp.state = ExpState.Failed
            if not opts.retry:
                break
            else:
                sys.stderr.write("Remaining experiments may fail\n")
        except Exception as e:
            sys.stderr.write("Failed experiment %s\n" % exp.name)
            sys.stderr.write("%s\n" % e)
            exp.state = ExpState.Failed

        # Without the exhausted check, an experiment past MAX_RETRY would be
        # re-queued on every pass and this loop would never terminate.
        if exp.state is ExpState.Failed and opts.retry and not exhausted:
            exps_remaining += [(i, exp)]
408 490
def main():
    '''Parse the command line, run the selected experiments, and report.

    A summary of experiment outcomes is printed and, if an email target
    was given, sent by email. The base output directory is removed again
    if this run created it, no experiment succeeded, and it is still empty.
    '''
    opts, args = parse_args()

    if opts.kill:
        cron.kill_boot_job()
        sys.exit(1)

    email = setup_email(opts.email) if opts.email else None

    # Create base output directory for run data
    out_base = os.path.abspath(opts.out_dir)
    created = False
    if not os.path.exists(out_base):
        created = True
        os.mkdir(out_base)

    exps = get_exps(opts, args, out_base)

    if opts.crontab:
        # Resume script on startup
        opts.retry = True
        cron.install_boot_job(['f', '--forced'],
                              "Stop with %s -k" % com.get_cmd())

    if opts.force or not opts.retry:
        cron.clean_output()
        for e in exps:
            set_tries(e, 0)

    try:
        run_exps(exps, opts)
    finally:
        # Remove persistent state
        for e in exps:
            set_tries(e, 0)
        cron.remove_boot_job()

    # 'None' is a keyword in Python 3, so the member is looked up by name.
    state_none = getattr(ExpState, 'None')

    def state_count(state):
        # Counting with a generator works whether or not filter() returns
        # a list, unlike len(filter(...)).
        return sum(1 for x in exps if x.state is state)

    # Every experiment which left its initial state was at least started
    ran = sum(1 for x in exps if x.state is not state_none)
    succ = state_count(ExpState.Succeeded)

    message = ("Experiments ran:\t%d of %d" % (ran, len(exps)) +
               "\n Successful:\t\t%d" % succ +
               "\n Failed:\t\t%d" % state_count(ExpState.Failed) +
               "\n Already Done:\t\t%d" % state_count(ExpState.Done) +
               "\n Invalid Environment:\t%d" % state_count(ExpState.Invalid))

    print(message)

    if email:
        email.send(message)
        email.close()

    if succ:
        sys.stderr.write("Successful experiment data saved in %s.\n" %
                         opts.out_dir)
    elif not os.listdir(out_base) and created:
        # Remove directory if no data was put into it
        os.rmdir(out_base)
428 554
# Run the experiments only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == '__main__':
    main()