1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
|
import common as com
import os
import time
import run.litmus_util as lu
import shutil as sh
import traceback
from operator import methodcaller
class ExperimentException(Exception):
'''Used to indicate when there are problems with an experiment.'''
def __init__(self, name):
self.name = name
class ExperimentDone(ExperimentException):
'''Raised when an experiment looks like it's been run already.'''
def __str__(self):
return "Experiment finished already: %d" % self.name
class ExperimentFailed(ExperimentException):
def __str__(self):
return "Experiment failed during execution: %d" % self.name
class SystemCorrupted(Exception):
pass
class Experiment(object):
'''Execute one task-set and save the results. Experiments have unique IDs.'''
INTERRUPTED_DIR = ".interrupted"
def __init__(self, name, scheduler, working_dir, finished_dir,
proc_entries, executables, tracer_types):
'''Run an experiment, optionally wrapped in tracing.'''
self.name = name
self.scheduler = scheduler
self.working_dir = working_dir
self.finished_dir = finished_dir
self.proc_entries = proc_entries
self.executables = executables
self.exec_out = None
self.exec_err = None
self.task_batch = com.num_cpus()
self.__make_dirs()
self.__assign_executable_cwds()
self.__setup_tracers(tracer_types)
def __setup_tracers(self, tracer_types):
tracers = [ t(self.working_dir) for t in tracer_types ]
self.regular_tracers = [t for t in tracers if not t.is_exact()]
self.exact_tracers = [t for t in tracers if t.is_exact()]
for t in tracers:
self.log("Enabling %s" % t.get_name())
def __make_dirs(self):
interrupted = None
if os.path.exists(self.finished_dir):
raise ExperimentDone(self.name)
if os.path.exists(self.working_dir):
self.log("Found interrupted experiment, saving in %s" %
Experiment.INTERRUPTED_DIR)
interrupted = "%s/%s" % (os.path.split(self.working_dir)[0],
Experiment.INTERRUPTED_DIR)
if os.path.exists(interrupted):
sh.rmtree(interrupted)
os.rename(self.working_dir, interrupted)
os.mkdir(self.working_dir)
if interrupted:
os.rename(interrupted, "%s/%s" % (self.working_dir,
os.path.split(interrupted)[1]))
def __assign_executable_cwds(self):
def assign_cwd(executable):
executable.cwd = self.working_dir
map(assign_cwd, self.executables)
def __kill_all(self):
# Give time for whatever failed to finish failing
time.sleep(2)
released = lu.release_tasks()
self.log("Re-released %d tasks" % released)
time.sleep(5)
self.log("Killing all tasks")
for e in self.executables:
try:
e.kill()
except:
pass
time.sleep(2)
def __run_tasks(self):
self.log("Starting %d tasks" % len(self.executables))
for i,e in enumerate(self.executables):
try:
e.execute()
except:
raise Exception("Executable failed: %s" % e)
self.log("Sleeping until tasks are ready for release...")
start = time.time()
while lu.waiting_tasks() < len(self.executables):
if time.time() - start > 30.0:
s = "waiting: %d, submitted: %d" %\
(lu.waiting_tasks(), len(self.executables))
raise Exception("Too much time spent waiting for tasks! %s" % s)
time.sleep(1)
# Exact tracers (like overheads) must be started right after release or
# measurements will be full of irrelevant records
self.log("Starting %d released tracers" % len(self.exact_tracers))
map(methodcaller('start_tracing'), self.exact_tracers)
time.sleep(1)
try:
self.log("Releasing %d tasks" % len(self.executables))
released = lu.release_tasks()
if released != len(self.executables):
# Some tasks failed to release, kill all tasks and fail
# Need to release non-released tasks before they can be killed
raise Exception("Released %s tasks, expected %s tasks" %
(released, len(self.executables)))
self.log("Waiting for program to finish...")
for e in self.executables:
if not e.wait():
raise Exception("Executable %s failed to complete!" % e)
finally:
# And these must be stopped here for the same reason
self.log("Stopping exact tracers")
map(methodcaller('stop_tracing'), self.exact_tracers)
def __save_results(self):
os.rename(self.working_dir, self.finished_dir)
def log(self, msg):
print("[Exp %s]: %s" % (self.name, msg))
def run_exp(self):
self.__check_system()
succ = False
try:
self.__setup()
try:
self.__run_tasks()
self.log("Saving results in %s" % self.finished_dir)
succ = True
except:
traceback.print_exc()
self.__kill_all()
raise ExperimentFailed(self.name)
finally:
self.__teardown()
finally:
self.log("Switching to Linux scheduler")
# Give the tasks 10 seconds to finish before bailing
start = time.time()
while lu.all_tasks() > 0:
if time.time() - start < 10.0:
raise SystemCorrupted("%d tasks still running!" %
lu.all_tasks())
lu.switch_scheduler("Linux")
if succ:
self.__save_results()
self.log("Experiment done!")
def __check_system(self):
running = lu.all_tasks()
if running:
raise SystemCorrupted("%d tasks already running!" % running)
sched = lu.scheduler()
if sched != "Linux":
raise SystemCorrupted("Scheduler is %s, should be Linux" % sched)
def __setup(self):
self.log("Writing %d proc entries" % len(self.proc_entries))
map(methodcaller('write_proc'), self.proc_entries)
self.log("Switching to %s" % self.scheduler)
lu.switch_scheduler(self.scheduler)
self.log("Starting %d regular tracers" % len(self.regular_tracers))
map(methodcaller('start_tracing'), self.regular_tracers)
self.exec_out = open('%s/exec-out.txt' % self.working_dir, 'w')
self.exec_err = open('%s/exec-err.txt' % self.working_dir, 'w')
def set_out(executable):
executable.stdout_file = self.exec_out
executable.stderr_file = self.exec_err
map(set_out, self.executables)
def __teardown(self):
self.exec_out and self.exec_out.close()
self.exec_err and self.exec_err.close()
self.log("Stopping regular tracers")
map(methodcaller('stop_tracing'), self.regular_tracers)
|