author | Glenn Elliott <gelliott@cs.unc.edu> | 2013-05-02 18:02:10 -0400 |
---|---|---|
committer | Glenn Elliott <gelliott@cs.unc.edu> | 2013-05-02 18:02:10 -0400 |
commit | 0f89bddde73d448511004a60b98b8be042f6ffd6 (patch) | |
tree | 0fd80ef6fddc61698eb189b815291bc18c7c10c4 | |
parent | e3935c7f68ce428e394eb53ea29ebef5509bcd7f (diff) |
randomize job execution time w/ normal distribution
-rw-r--r-- | Makefile | 4 | ||||
-rw-r--r-- | gpu/budget.cpp | 32 | ||||
-rw-r--r-- | gpu/gpuspin.cu | 564 | ||||
-rw-r--r-- | src/migration.c | 4 |
4 files changed, 356 insertions, 248 deletions
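
For context, the behavioral change behind the commit message: gpuspin can now draw each job's execution time from a normal distribution centered on the requested WCET, with the standard deviation given as a fraction of that mean via the new `-W` option. Below is a minimal, stand-alone sketch of that sampling, assuming Blitz++'s ranlib generator (which the new `<random/normal.h>` include and `-lblitz` link flag point to); the values, clamp, and loop are illustrative and not lifted from the commit.

```cpp
// Minimal sketch (not part of the commit): sample per-job execution times
// from a normal distribution, as gpuspin.cu now does with Blitz++'s ranlib.
// wcet_ms, stdpct, and the clamp below are illustrative choices.
#include <algorithm>
#include <cstdio>
#include <ctime>

#include <random/normal.h>   // Blitz++ ranlib, hence the new -lblitz flag

using namespace ranlib;

int main()
{
	double wcet_ms = 10.0;  // desired mean execution time
	double stdpct  = 0.25;  // std. deviation as a fraction of the mean (-W)

	Normal<double> wcet_dist_ms(wcet_ms, wcet_ms * stdpct);
	wcet_dist_ms.seed((unsigned int)time(0));

	for (int job = 0; job < 5; ++job) {
		// never spin for a negative time if a sample comes out below zero
		double this_job_ms = std::max(0.0, wcet_dist_ms.random());
		printf("job %d: %.3f ms\n", job, this_job_ms);
	}
	return 0;
}
```

With `-W 0.25`, for example, roughly a third of the sampled jobs would land more than 25% away from the nominal execution time.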
@@ -30,7 +30,7 @@ flags-api = -D_XOPEN_SOURCE=600 -D_GNU_SOURCE | |||
30 | flags-misc = -fasynchronous-unwind-tables -fnon-call-exceptions | 30 | flags-misc = -fasynchronous-unwind-tables -fnon-call-exceptions |
31 | 31 | ||
32 | flags-cu-debug = -g -G -Xcompiler -Wall -Xcompiler -Werror | 32 | flags-cu-debug = -g -G -Xcompiler -Wall -Xcompiler -Werror |
33 | flags-cu-optim = -O3 -Xcompiler -march=native | 33 | flags-cu-optim = -O2 -Xcompiler -march=native |
34 | flags-cu-nvcc = --use_fast_math -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 | 34 | flags-cu-nvcc = --use_fast_math -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 |
35 | flags-cu-misc = -Xcompiler -fasynchronous-unwind-tables -Xcompiler -fnon-call-exceptions -Xcompiler -malign-double -Xcompiler -pthread | 35 | flags-cu-misc = -Xcompiler -fasynchronous-unwind-tables -Xcompiler -fnon-call-exceptions -Xcompiler -malign-double -Xcompiler -pthread |
36 | flags-cu-x86_64 = -m64 | 36 | flags-cu-x86_64 = -m64 |
@@ -299,7 +299,7 @@ lib-budget = -lrt -lm -pthread | |||
299 | vpath %.cu gpu/ | 299 | vpath %.cu gpu/ |
300 | 300 | ||
301 | objcu-gpuspin = gpuspin.o common.o | 301 | objcu-gpuspin = gpuspin.o common.o |
302 | lib-gpuspin = -lrt -lm -lpthread | 302 | lib-gpuspin = -lblitz -lrt -lm -lpthread |
303 | 303 | ||
304 | # ############################################################################## | 304 | # ############################################################################## |
305 | # Build everything that depends on liblitmus. | 305 | # Build everything that depends on liblitmus. |
diff --git a/gpu/budget.cpp b/gpu/budget.cpp
index eebb14e..e08daf7 100644
--- a/gpu/budget.cpp
+++ b/gpu/budget.cpp
@@ -134,7 +134,7 @@ int job(lt_t exec_ns, lt_t budget_ns) | |||
134 | for(int i = 0; i < NUM_LOCKS; ++i) | 134 | for(int i = 0; i < NUM_LOCKS; ++i) |
135 | litmus_lock(LOCKS[i]); | 135 | litmus_lock(LOCKS[i]); |
136 | } | 136 | } |
137 | 137 | ||
138 | // intentionally overrun via suspension | 138 | // intentionally overrun via suspension |
139 | if (OVERRUN_BY_SLEEP) | 139 | if (OVERRUN_BY_SLEEP) |
140 | lt_sleep(approx_remaining + overrun_extra); | 140 | lt_sleep(approx_remaining + overrun_extra); |
@@ -146,11 +146,11 @@ int job(lt_t exec_ns, lt_t budget_ns) | |||
146 | litmus_dgl_unlock(LOCKS, NUM_LOCKS); | 146 | litmus_dgl_unlock(LOCKS, NUM_LOCKS); |
147 | else | 147 | else |
148 | for(int i = NUM_LOCKS-1; i >= 0; --i) | 148 | for(int i = NUM_LOCKS-1; i >= 0; --i) |
149 | litmus_unlock(LOCKS[i]); | 149 | litmus_unlock(LOCKS[i]); |
150 | if (NEST_IN_IKGLP) | 150 | if (NEST_IN_IKGLP) |
151 | litmus_unlock(IKGLP_LOCK); | 151 | litmus_unlock(IKGLP_LOCK); |
152 | } | 152 | } |
153 | 153 | ||
154 | if (SIGNALS && BLOCK_SIGNALS_ON_SLEEP) | 154 | if (SIGNALS && BLOCK_SIGNALS_ON_SLEEP) |
155 | unblock_litmus_signals(SIG_BUDGET); | 155 | unblock_litmus_signals(SIG_BUDGET); |
156 | } | 156 | } |
@@ -165,7 +165,7 @@ int job(lt_t exec_ns, lt_t budget_ns) | |||
165 | return 1; | 165 | return 1; |
166 | } | 166 | } |
167 | 167 | ||
168 | #define OPTSTR "SbosOvzalwqixdn:r:" | 168 | #define OPTSTR "SbosOvzalwqixdn:r:p:" |
169 | 169 | ||
170 | int main(int argc, char** argv) | 170 | int main(int argc, char** argv) |
171 | { | 171 | { |
@@ -185,9 +185,16 @@ int main(int argc, char** argv) | |||
185 | int compute_overrun_rate = 0; | 185 | int compute_overrun_rate = 0; |
186 | int once = 1; | 186 | int once = 1; |
187 | 187 | ||
188 | bool migrate = false; | ||
189 | int partition = 0; | ||
190 | int partition_sz = 1; | ||
188 | 191 | ||
189 | while ((opt = getopt(argc, argv, OPTSTR)) != -1) { | 192 | while ((opt = getopt(argc, argv, OPTSTR)) != -1) { |
190 | switch(opt) { | 193 | switch(opt) { |
194 | case 'p': | ||
195 | migrate = true; | ||
196 | partition = atoi(optarg); | ||
197 | break; | ||
191 | case 'S': | 198 | case 'S': |
192 | SIGNALS = 1; | 199 | SIGNALS = 1; |
193 | break; | 200 | break; |
@@ -261,7 +268,7 @@ int main(int argc, char** argv) | |||
261 | assert(NUM_LOCKS > 0); | 268 | assert(NUM_LOCKS > 0); |
262 | if (LOCK_TYPE == IKGLP || NEST_IN_IKGLP) | 269 | if (LOCK_TYPE == IKGLP || NEST_IN_IKGLP) |
263 | assert(NUM_REPLICAS >= 1); | 270 | assert(NUM_REPLICAS >= 1); |
264 | 271 | ||
265 | LOCKS = new int[NUM_LOCKS]; | 272 | LOCKS = new int[NUM_LOCKS]; |
266 | 273 | ||
267 | if (compute_overrun_rate) { | 274 | if (compute_overrun_rate) { |
@@ -281,7 +288,14 @@ int main(int argc, char** argv) | |||
281 | param.budget_policy = PRECISE_ENFORCEMENT; | 288 | param.budget_policy = PRECISE_ENFORCEMENT; |
282 | else | 289 | else |
283 | param.budget_signal_policy = PRECISE_SIGNALS; | 290 | param.budget_signal_policy = PRECISE_SIGNALS; |
291 | if (migrate) | ||
292 | param.cpu = cluster_to_first_cpu(partition, partition_sz); | ||
284 | 293 | ||
294 | // set up affinity and init litmus | ||
295 | if (migrate) { | ||
296 | ret = be_migrate_to_cluster(partition, partition_sz); | ||
297 | assert(!ret); | ||
298 | } | ||
285 | init_litmus(); | 299 | init_litmus(); |
286 | 300 | ||
287 | ret = set_rt_task_param(gettid(), ¶m); | 301 | ret = set_rt_task_param(gettid(), ¶m); |
@@ -309,7 +323,7 @@ int main(int argc, char** argv) | |||
309 | } | 323 | } |
310 | LOCKS[i] = lock; | 324 | LOCKS[i] = lock; |
311 | } | 325 | } |
312 | 326 | ||
313 | if (NEST_IN_IKGLP) { | 327 | if (NEST_IN_IKGLP) { |
314 | IKGLP_LOCK = open_ikglp_sem(NAMESPACE, i, NUM_REPLICAS); | 328 | IKGLP_LOCK = open_ikglp_sem(NAMESPACE, i, NUM_REPLICAS); |
315 | if (IKGLP_LOCK < 0) { | 329 | if (IKGLP_LOCK < 0) { |
@@ -318,13 +332,13 @@ int main(int argc, char** argv) | |||
318 | } | 332 | } |
319 | } | 333 | } |
320 | } | 334 | } |
321 | 335 | ||
322 | if (WAIT) { | 336 | if (WAIT) { |
323 | ret = wait_for_ts_release(); | 337 | ret = wait_for_ts_release(); |
324 | if (ret < 0) | 338 | if (ret < 0) |
325 | perror("wait_for_ts_release"); | 339 | perror("wait_for_ts_release"); |
326 | } | 340 | } |
327 | 341 | ||
328 | ret = task_mode(LITMUS_RT_TASK); | 342 | ret = task_mode(LITMUS_RT_TASK); |
329 | assert(ret == 0); | 343 | assert(ret == 0); |
330 | 344 | ||
@@ -360,6 +374,6 @@ int main(int argc, char** argv) | |||
360 | printf("# Overruns: %d\n", NUM_OVERRUNS); | 374 | printf("# Overruns: %d\n", NUM_OVERRUNS); |
361 | 375 | ||
362 | delete[] LOCKS; | 376 | delete[] LOCKS; |
363 | 377 | ||
364 | return 0; | 378 | return 0; |
365 | } | 379 | } |
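
The budget.cpp changes above add a `-p <cluster>` option so the budget test can be pinned to a partition/cluster before entering real-time mode. A condensed sketch of that flow, using the liblitmus calls visible in the diff (`cluster_to_first_cpu`, `be_migrate_to_cluster`, `set_rt_task_param`); the helper name and the fixed cluster size of 1 are only illustrative:

```cpp
// Sketch of the new -p handling in budget.cpp (hypothetical helper name).
// All calls are liblitmus APIs that appear in the diff above.
#include <assert.h>
#include "litmus.h"

static void run_on_cluster(struct rt_task *param, int partition)
{
	const int partition_sz = 1;  // budget.cpp currently hard-codes size 1
	int ret;

	// record the cluster's first CPU in the task parameters
	param->cpu = cluster_to_first_cpu(partition, partition_sz);

	// move this process onto the cluster before initializing liblitmus
	ret = be_migrate_to_cluster(partition, partition_sz);
	assert(!ret);

	init_litmus();

	ret = set_rt_task_param(gettid(), param);
	assert(ret == 0);
}
```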
diff --git a/gpu/gpuspin.cu b/gpu/gpuspin.cu
index 970d6f2..21134f6 100644
--- a/gpu/gpuspin.cu
+++ b/gpu/gpuspin.cu
@@ -11,6 +11,8 @@ | |||
11 | #include <boost/interprocess/managed_shared_memory.hpp> | 11 | #include <boost/interprocess/managed_shared_memory.hpp> |
12 | #include <boost/interprocess/sync/interprocess_mutex.hpp> | 12 | #include <boost/interprocess/sync/interprocess_mutex.hpp> |
13 | 13 | ||
14 | #include <random/normal.h> | ||
15 | |||
14 | #include <cuda_runtime.h> | 16 | #include <cuda_runtime.h> |
15 | 17 | ||
16 | #include "litmus.h" | 18 | #include "litmus.h" |
@@ -18,6 +20,9 @@ | |||
18 | 20 | ||
19 | using namespace std; | 21 | using namespace std; |
20 | using namespace boost::interprocess; | 22 | using namespace boost::interprocess; |
23 | using namespace ranlib; | ||
24 | |||
25 | #define ms2s(ms) ((ms)*0.001) | ||
21 | 26 | ||
22 | const char *lock_namespace = "./.gpuspin-locks"; | 27 | const char *lock_namespace = "./.gpuspin-locks"; |
23 | 28 | ||
@@ -143,10 +148,10 @@ struct ce_lock_state | |||
143 | size_t num_locks; | 148 | size_t num_locks; |
144 | size_t budget_remaining; | 149 | size_t budget_remaining; |
145 | bool locked; | 150 | bool locked; |
146 | 151 | ||
147 | ce_lock_state(int device_a, enum cudaMemcpyKind kind, size_t size, int device_b = -1, bool migration = false) { | 152 | ce_lock_state(int device_a, enum cudaMemcpyKind kind, size_t size, int device_b = -1, bool migration = false) { |
148 | num_locks = (device_a != -1) + (device_b != -1); | 153 | num_locks = (device_a != -1) + (device_b != -1); |
149 | 154 | ||
150 | if(device_a != -1) { | 155 | if(device_a != -1) { |
151 | if (!migration) | 156 | if (!migration) |
152 | locks[0] = (kind == cudaMemcpyHostToDevice || (kind == cudaMemcpyDeviceToDevice && device_b == -1)) ? | 157 | locks[0] = (kind == cudaMemcpyHostToDevice || (kind == cudaMemcpyDeviceToDevice && device_b == -1)) ? |
@@ -155,15 +160,15 @@ struct ce_lock_state | |||
155 | locks[0] = (kind == cudaMemcpyHostToDevice || (kind == cudaMemcpyDeviceToDevice && device_b == -1)) ? | 160 | locks[0] = (kind == cudaMemcpyHostToDevice || (kind == cudaMemcpyDeviceToDevice && device_b == -1)) ? |
156 | CE_MIGR_SEND_LOCKS[device_a] : CE_MIGR_RECV_LOCKS[device_a]; | 161 | CE_MIGR_SEND_LOCKS[device_a] : CE_MIGR_RECV_LOCKS[device_a]; |
157 | } | 162 | } |
158 | 163 | ||
159 | if(device_b != -1) { | 164 | if(device_b != -1) { |
160 | assert(kind == cudaMemcpyDeviceToDevice); | 165 | assert(kind == cudaMemcpyDeviceToDevice); |
161 | 166 | ||
162 | if (!migration) | 167 | if (!migration) |
163 | locks[1] = CE_RECV_LOCKS[device_b]; | 168 | locks[1] = CE_RECV_LOCKS[device_b]; |
164 | else | 169 | else |
165 | locks[1] = CE_MIGR_RECV_LOCKS[device_b]; | 170 | locks[1] = CE_MIGR_RECV_LOCKS[device_b]; |
166 | 171 | ||
167 | if(locks[1] < locks[0]) { | 172 | if(locks[1] < locks[0]) { |
168 | // enforce total order on locking | 173 | // enforce total order on locking |
169 | int temp = locks[1]; | 174 | int temp = locks[1]; |
@@ -174,35 +179,35 @@ struct ce_lock_state | |||
174 | else { | 179 | else { |
175 | locks[1] = -1; | 180 | locks[1] = -1; |
176 | } | 181 | } |
177 | 182 | ||
178 | if(!ENABLE_CHUNKING) | 183 | if(!ENABLE_CHUNKING) |
179 | budget_remaining = size; | 184 | budget_remaining = size; |
180 | else | 185 | else |
181 | budget_remaining = CHUNK_SIZE; | 186 | budget_remaining = CHUNK_SIZE; |
182 | } | 187 | } |
183 | 188 | ||
184 | void crash(void) { | 189 | void crash(void) { |
185 | void *array[50]; | 190 | void *array[50]; |
186 | int size, i; | 191 | int size, i; |
187 | char **messages; | 192 | char **messages; |
188 | 193 | ||
189 | size = backtrace(array, 50); | 194 | size = backtrace(array, 50); |
190 | messages = backtrace_symbols(array, size); | 195 | messages = backtrace_symbols(array, size); |
191 | 196 | ||
192 | fprintf(stderr, "%d: TRIED TO GRAB SAME LOCK TWICE! Lock = %d\n", getpid(), locks[0]); | 197 | fprintf(stderr, "%d: TRIED TO GRAB SAME LOCK TWICE! Lock = %d\n", getpid(), locks[0]); |
193 | for (i = 1; i < size && messages != NULL; ++i) | 198 | for (i = 1; i < size && messages != NULL; ++i) |
194 | { | 199 | { |
195 | fprintf(stderr, "%d: [bt]: (%d) %s\n", getpid(), i, messages[i]); | 200 | fprintf(stderr, "%d: [bt]: (%d) %s\n", getpid(), i, messages[i]); |
196 | } | 201 | } |
197 | free(messages); | 202 | free(messages); |
198 | 203 | ||
199 | assert(false); | 204 | assert(false); |
200 | } | 205 | } |
201 | 206 | ||
202 | 207 | ||
203 | void lock() { | 208 | void lock() { |
204 | if(locks[0] == locks[1]) crash(); | 209 | if(locks[0] == locks[1]) crash(); |
205 | 210 | ||
206 | if(USE_DYNAMIC_GROUP_LOCKS) { | 211 | if(USE_DYNAMIC_GROUP_LOCKS) { |
207 | litmus_dgl_lock(locks, num_locks); | 212 | litmus_dgl_lock(locks, num_locks); |
208 | } | 213 | } |
@@ -215,10 +220,10 @@ struct ce_lock_state | |||
215 | } | 220 | } |
216 | locked = true; | 221 | locked = true; |
217 | } | 222 | } |
218 | 223 | ||
219 | void unlock() { | 224 | void unlock() { |
220 | if(locks[0] == locks[1]) crash(); | 225 | if(locks[0] == locks[1]) crash(); |
221 | 226 | ||
222 | if(USE_DYNAMIC_GROUP_LOCKS) { | 227 | if(USE_DYNAMIC_GROUP_LOCKS) { |
223 | litmus_dgl_unlock(locks, num_locks); | 228 | litmus_dgl_unlock(locks, num_locks); |
224 | } | 229 | } |
@@ -232,15 +237,15 @@ struct ce_lock_state | |||
232 | } | 237 | } |
233 | locked = false; | 238 | locked = false; |
234 | } | 239 | } |
235 | 240 | ||
236 | void refresh() { | 241 | void refresh() { |
237 | budget_remaining = CHUNK_SIZE; | 242 | budget_remaining = CHUNK_SIZE; |
238 | } | 243 | } |
239 | 244 | ||
240 | bool budgetIsAvailable(size_t tosend) { | 245 | bool budgetIsAvailable(size_t tosend) { |
241 | return(tosend >= budget_remaining); | 246 | return(tosend >= budget_remaining); |
242 | } | 247 | } |
243 | 248 | ||
244 | void decreaseBudget(size_t spent) { | 249 | void decreaseBudget(size_t spent) { |
245 | budget_remaining -= spent; | 250 | budget_remaining -= spent; |
246 | } | 251 | } |
@@ -253,28 +258,28 @@ static cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count, | |||
253 | { | 258 | { |
254 | cudaError_t ret = cudaSuccess; | 259 | cudaError_t ret = cudaSuccess; |
255 | int remaining = count; | 260 | int remaining = count; |
256 | 261 | ||
257 | char* dst = (char*)a_dst; | 262 | char* dst = (char*)a_dst; |
258 | const char* src = (const char*)a_src; | 263 | const char* src = (const char*)a_src; |
259 | 264 | ||
260 | // disable chunking, if needed, by setting chunk_size equal to the | 265 | // disable chunking, if needed, by setting chunk_size equal to the |
261 | // amount of data to be copied. | 266 | // amount of data to be copied. |
262 | int chunk_size = (ENABLE_CHUNKING) ? CHUNK_SIZE : count; | 267 | int chunk_size = (ENABLE_CHUNKING) ? CHUNK_SIZE : count; |
263 | int i = 0; | 268 | int i = 0; |
264 | 269 | ||
265 | while(remaining != 0) | 270 | while(remaining != 0) |
266 | { | 271 | { |
267 | int bytesToCopy = std::min(remaining, chunk_size); | 272 | int bytesToCopy = std::min(remaining, chunk_size); |
268 | 273 | ||
269 | if(state && state->budgetIsAvailable(bytesToCopy) && state->locked) { | 274 | if(state && state->budgetIsAvailable(bytesToCopy) && state->locked) { |
270 | cudaStreamSynchronize(STREAMS[CUR_DEVICE]); | 275 | cudaStreamSynchronize(STREAMS[CUR_DEVICE]); |
271 | ret = cudaGetLastError(); | 276 | ret = cudaGetLastError(); |
272 | 277 | ||
273 | if(ret != cudaSuccess) | 278 | if(ret != cudaSuccess) |
274 | { | 279 | { |
275 | break; | 280 | break; |
276 | } | 281 | } |
277 | 282 | ||
278 | state->unlock(); | 283 | state->unlock(); |
279 | state->refresh(); // replentish. | 284 | state->refresh(); // replentish. |
280 | // we can only run out of | 285 | // we can only run out of |
@@ -283,14 +288,14 @@ static cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count, | |||
283 | // be set to cover entire memcpy | 288 | // be set to cover entire memcpy |
284 | // if chunking were disabled. | 289 | // if chunking were disabled. |
285 | } | 290 | } |
286 | 291 | ||
287 | if(state && !state->locked) { | 292 | if(state && !state->locked) { |
288 | state->lock(); | 293 | state->lock(); |
289 | } | 294 | } |
290 | 295 | ||
291 | //ret = cudaMemcpy(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind); | 296 | //ret = cudaMemcpy(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind); |
292 | cudaMemcpyAsync(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind, STREAMS[CUR_DEVICE]); | 297 | cudaMemcpyAsync(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind, STREAMS[CUR_DEVICE]); |
293 | 298 | ||
294 | if(state) { | 299 | if(state) { |
295 | state->decreaseBudget(bytesToCopy); | 300 | state->decreaseBudget(bytesToCopy); |
296 | } | 301 | } |
@@ -332,9 +337,9 @@ void allocate_locks_litmus(void) | |||
332 | { | 337 | { |
333 | // allocate k-FMLP lock | 338 | // allocate k-FMLP lock |
334 | int fd = open(lock_namespace, O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR); | 339 | int fd = open(lock_namespace, O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR); |
335 | 340 | ||
336 | int base_name = GPU_PARTITION * 1000; | 341 | int base_name = GPU_PARTITION * 1000; |
337 | 342 | ||
338 | if (GPU_SYNC_MODE == IKGLP_MODE) { | 343 | if (GPU_SYNC_MODE == IKGLP_MODE) { |
339 | /* Standard (optimal) IKGLP */ | 344 | /* Standard (optimal) IKGLP */ |
340 | TOKEN_LOCK = open_gpusync_token_lock(fd, | 345 | TOKEN_LOCK = open_gpusync_token_lock(fd, |
@@ -390,15 +395,15 @@ void allocate_locks_litmus(void) | |||
390 | perror("Invalid GPUSync mode specified\n"); | 395 | perror("Invalid GPUSync mode specified\n"); |
391 | TOKEN_LOCK = -1; | 396 | TOKEN_LOCK = -1; |
392 | } | 397 | } |
393 | 398 | ||
394 | if(TOKEN_LOCK < 0) | 399 | if(TOKEN_LOCK < 0) |
395 | perror("open_token_sem"); | 400 | perror("open_token_sem"); |
396 | 401 | ||
397 | if(USE_ENGINE_LOCKS) | 402 | if(USE_ENGINE_LOCKS) |
398 | { | 403 | { |
399 | assert(NUM_COPY_ENGINES == 1 || NUM_COPY_ENGINES == 2); | 404 | assert(NUM_COPY_ENGINES == 1 || NUM_COPY_ENGINES == 2); |
400 | assert((NUM_COPY_ENGINES == 1 && !RESERVED_MIGR_COPY_ENGINE) || NUM_COPY_ENGINES == 2); | 405 | assert((NUM_COPY_ENGINES == 1 && !RESERVED_MIGR_COPY_ENGINE) || NUM_COPY_ENGINES == 2); |
401 | 406 | ||
402 | // allocate the engine locks. | 407 | // allocate the engine locks. |
403 | for (int i = 0; i < GPU_PARTITION_SIZE; ++i) | 408 | for (int i = 0; i < GPU_PARTITION_SIZE; ++i) |
404 | { | 409 | { |
@@ -407,27 +412,27 @@ void allocate_locks_litmus(void) | |||
407 | int ce_0_name = (i+1)*10 + base_name + 1; | 412 | int ce_0_name = (i+1)*10 + base_name + 1; |
408 | int ce_1_name = (i+1)*10 + base_name + 2; | 413 | int ce_1_name = (i+1)*10 + base_name + 2; |
409 | int ee_lock = -1, ce_0_lock = -1, ce_1_lock = -1; | 414 | int ee_lock = -1, ce_0_lock = -1, ce_1_lock = -1; |
410 | 415 | ||
411 | open_sem_t openEngineLock = (ENGINE_LOCK_TYPE == FIFO) ? | 416 | open_sem_t openEngineLock = (ENGINE_LOCK_TYPE == FIFO) ? |
412 | open_fifo_sem : open_prioq_sem; | 417 | open_fifo_sem : open_prioq_sem; |
413 | 418 | ||
414 | ee_lock = openEngineLock(fd, ee_name); | 419 | ee_lock = openEngineLock(fd, ee_name); |
415 | if (ee_lock < 0) | 420 | if (ee_lock < 0) |
416 | perror("open_*_sem (engine lock)"); | 421 | perror("open_*_sem (engine lock)"); |
417 | 422 | ||
418 | ce_0_lock = openEngineLock(fd, ce_0_name); | 423 | ce_0_lock = openEngineLock(fd, ce_0_name); |
419 | if (ce_0_lock < 0) | 424 | if (ce_0_lock < 0) |
420 | perror("open_*_sem (engine lock)"); | 425 | perror("open_*_sem (engine lock)"); |
421 | 426 | ||
422 | if (NUM_COPY_ENGINES == 2) | 427 | if (NUM_COPY_ENGINES == 2) |
423 | { | 428 | { |
424 | ce_1_lock = openEngineLock(fd, ce_1_name); | 429 | ce_1_lock = openEngineLock(fd, ce_1_name); |
425 | if (ce_1_lock < 0) | 430 | if (ce_1_lock < 0) |
426 | perror("open_*_sem (engine lock)"); | 431 | perror("open_*_sem (engine lock)"); |
427 | } | 432 | } |
428 | 433 | ||
429 | EE_LOCKS[idx] = ee_lock; | 434 | EE_LOCKS[idx] = ee_lock; |
430 | 435 | ||
431 | if (NUM_COPY_ENGINES == 1) | 436 | if (NUM_COPY_ENGINES == 1) |
432 | { | 437 | { |
433 | // share locks | 438 | // share locks |
@@ -439,7 +444,7 @@ void allocate_locks_litmus(void) | |||
439 | else | 444 | else |
440 | { | 445 | { |
441 | assert(NUM_COPY_ENGINES == 2); | 446 | assert(NUM_COPY_ENGINES == 2); |
442 | 447 | ||
443 | if (RESERVED_MIGR_COPY_ENGINE) { | 448 | if (RESERVED_MIGR_COPY_ENGINE) { |
444 | // copy engine deadicated to migration operations | 449 | // copy engine deadicated to migration operations |
445 | CE_SEND_LOCKS[idx] = ce_0_lock; | 450 | CE_SEND_LOCKS[idx] = ce_0_lock; |
@@ -469,15 +474,18 @@ public: | |||
469 | { | 474 | { |
470 | memset(&pool[0], 0, sizeof(pool[0])*poolSize); | 475 | memset(&pool[0], 0, sizeof(pool[0])*poolSize); |
471 | } | 476 | } |
472 | 477 | ||
473 | int get(pthread_mutex_t* tex, int preference = -1) | 478 | int get(pthread_mutex_t* tex, int preference = -1) |
474 | { | 479 | { |
475 | int which = -1; | 480 | int which = -1; |
476 | int last = (preference >= 0) ? preference : 0; | 481 | // int last = (preference >= 0) ? preference : 0; |
482 | int last = (ENABLE_AFFINITY) ? | ||
483 | (preference >= 0) ? preference : 0 : | ||
484 | rand()%poolSize; | ||
477 | int minIdx = last; | 485 | int minIdx = last; |
478 | 486 | ||
479 | pthread_mutex_lock(tex); | 487 | pthread_mutex_lock(tex); |
480 | 488 | ||
481 | int min = pool[last]; | 489 | int min = pool[last]; |
482 | for(int i = (minIdx+1)%poolSize; i != last; i = (i+1)%poolSize) | 490 | for(int i = (minIdx+1)%poolSize; i != last; i = (i+1)%poolSize) |
483 | { | 491 | { |
@@ -485,21 +493,21 @@ public: | |||
485 | minIdx = i; | 493 | minIdx = i; |
486 | } | 494 | } |
487 | ++pool[minIdx]; | 495 | ++pool[minIdx]; |
488 | 496 | ||
489 | pthread_mutex_unlock(tex); | 497 | pthread_mutex_unlock(tex); |
490 | 498 | ||
491 | which = minIdx; | 499 | which = minIdx; |
492 | 500 | ||
493 | return which; | 501 | return which; |
494 | } | 502 | } |
495 | 503 | ||
496 | void put(pthread_mutex_t* tex, int which) | 504 | void put(pthread_mutex_t* tex, int which) |
497 | { | 505 | { |
498 | pthread_mutex_lock(tex); | 506 | pthread_mutex_lock(tex); |
499 | --pool[which]; | 507 | --pool[which]; |
500 | pthread_mutex_unlock(tex); | 508 | pthread_mutex_unlock(tex); |
501 | } | 509 | } |
502 | 510 | ||
503 | private: | 511 | private: |
504 | int poolSize; | 512 | int poolSize; |
505 | int pool[NR_GPUS]; // >= gpu_part_size | 513 | int pool[NR_GPUS]; // >= gpu_part_size |
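
The `gpu_pool::get` change above only alters where the scan starts: with `ENABLE_AFFINITY` the search still begins at the caller's preferred GPU, but otherwise it now starts at a random slot so Linux-mode tasks do not all converge on slot 0. A stand-alone sketch of the selection loop, with the comparison inside the loop reconstructed (the hunk only shows its tail) and the mutex omitted for brevity:

```cpp
// Sketch of gpu_pool's least-loaded slot selection (locking omitted).
// The "pool[i] < min" comparison is reconstructed; only its tail is
// visible in the hunk above. Names mirror gpuspin.cu.
#include <cstdlib>

const int poolSize = 4;
int pool[poolSize] = {0};          // outstanding users per GPU slot
bool ENABLE_AFFINITY = false;

int get_slot(int preference /* -1 == no preference */)
{
	// start at the preferred slot under affinity, else at a random one
	int last = ENABLE_AFFINITY
		? ((preference >= 0) ? preference : 0)
		: rand() % poolSize;
	int minIdx = last;
	int min = pool[last];

	// one pass around the ring, tracking the least-loaded slot
	for (int i = (minIdx + 1) % poolSize; i != last; i = (i + 1) % poolSize) {
		if (pool[i] < min) {
			min = pool[i];
			minIdx = i;
		}
	}
	++pool[minIdx];   // claim the slot; put() decrements it later
	return minIdx;
}
```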
@@ -508,19 +516,19 @@ private: | |||
508 | static gpu_pool* GPU_LINUX_SEM_POOL = NULL; | 516 | static gpu_pool* GPU_LINUX_SEM_POOL = NULL; |
509 | static pthread_mutex_t* GPU_LINUX_MUTEX_POOL = NULL; | 517 | static pthread_mutex_t* GPU_LINUX_MUTEX_POOL = NULL; |
510 | 518 | ||
511 | static void allocate_locks_linux(int num_gpu_users) | 519 | static void allocate_locks_linux(const int num_gpu_users) |
512 | { | 520 | { |
513 | managed_shared_memory *segment_pool_ptr = NULL; | 521 | managed_shared_memory *segment_pool_ptr = NULL; |
514 | managed_shared_memory *segment_mutex_ptr = NULL; | 522 | managed_shared_memory *segment_mutex_ptr = NULL; |
515 | 523 | ||
516 | int numGpuPartitions = NR_GPUS/GPU_PARTITION_SIZE; | 524 | int numGpuPartitions = NR_GPUS/GPU_PARTITION_SIZE; |
517 | 525 | ||
518 | if(num_gpu_users != 0) | 526 | if(num_gpu_users > 0) |
519 | { | 527 | { |
520 | printf("%d creating shared memory for linux semaphores; num pools = %d, pool size = %d\n", getpid(), numGpuPartitions, GPU_PARTITION_SIZE); | 528 | printf("%d creating shared memory for linux semaphores; num pools = %d, pool size = %d\n", getpid(), numGpuPartitions, GPU_PARTITION_SIZE); |
521 | shared_memory_object::remove("linux_mutex_memory"); | 529 | shared_memory_object::remove("linux_mutex_memory"); |
522 | shared_memory_object::remove("linux_sem_memory"); | 530 | shared_memory_object::remove("linux_sem_memory"); |
523 | 531 | ||
524 | segment_mutex_ptr = new managed_shared_memory(create_only, "linux_mutex_memory", 4*1024); | 532 | segment_mutex_ptr = new managed_shared_memory(create_only, "linux_mutex_memory", 4*1024); |
525 | GPU_LINUX_MUTEX_POOL = segment_mutex_ptr->construct<pthread_mutex_t>("pthread_mutex_t linux_m")[numGpuPartitions](); | 533 | GPU_LINUX_MUTEX_POOL = segment_mutex_ptr->construct<pthread_mutex_t>("pthread_mutex_t linux_m")[numGpuPartitions](); |
526 | for(int i = 0; i < numGpuPartitions; ++i) | 534 | for(int i = 0; i < numGpuPartitions; ++i) |
@@ -531,7 +539,7 @@ static void allocate_locks_linux(int num_gpu_users) | |||
531 | pthread_mutex_init(&(GPU_LINUX_MUTEX_POOL[i]), &attr); | 539 | pthread_mutex_init(&(GPU_LINUX_MUTEX_POOL[i]), &attr); |
532 | pthread_mutexattr_destroy(&attr); | 540 | pthread_mutexattr_destroy(&attr); |
533 | } | 541 | } |
534 | 542 | ||
535 | segment_pool_ptr = new managed_shared_memory(create_only, "linux_sem_memory", 4*1024); | 543 | segment_pool_ptr = new managed_shared_memory(create_only, "linux_sem_memory", 4*1024); |
536 | GPU_LINUX_SEM_POOL = segment_pool_ptr->construct<gpu_pool>("gpu_pool linux_p")[numGpuPartitions](GPU_PARTITION_SIZE); | 544 | GPU_LINUX_SEM_POOL = segment_pool_ptr->construct<gpu_pool>("gpu_pool linux_p")[numGpuPartitions](GPU_PARTITION_SIZE); |
537 | } | 545 | } |
@@ -548,7 +556,7 @@ static void allocate_locks_linux(int num_gpu_users) | |||
548 | sleep(1); | 556 | sleep(1); |
549 | } | 557 | } |
550 | }while(segment_pool_ptr == NULL); | 558 | }while(segment_pool_ptr == NULL); |
551 | 559 | ||
552 | do | 560 | do |
553 | { | 561 | { |
554 | try | 562 | try |
@@ -560,7 +568,7 @@ static void allocate_locks_linux(int num_gpu_users) | |||
560 | sleep(1); | 568 | sleep(1); |
561 | } | 569 | } |
562 | }while(segment_mutex_ptr == NULL); | 570 | }while(segment_mutex_ptr == NULL); |
563 | 571 | ||
564 | GPU_LINUX_SEM_POOL = segment_pool_ptr->find<gpu_pool>("gpu_pool linux_p").first; | 572 | GPU_LINUX_SEM_POOL = segment_pool_ptr->find<gpu_pool>("gpu_pool linux_p").first; |
565 | GPU_LINUX_MUTEX_POOL = segment_mutex_ptr->find<pthread_mutex_t>("pthread_mutex_t linux_m").first; | 573 | GPU_LINUX_MUTEX_POOL = segment_mutex_ptr->find<pthread_mutex_t>("pthread_mutex_t linux_m").first; |
566 | } | 574 | } |
@@ -569,7 +577,7 @@ static void allocate_locks_linux(int num_gpu_users) | |||
569 | 577 | ||
570 | 578 | ||
571 | 579 | ||
572 | static void allocate_locks(int num_gpu_users, bool linux_mode) | 580 | static void allocate_locks(const int num_gpu_users, bool linux_mode) |
573 | { | 581 | { |
574 | if(!linux_mode) | 582 | if(!linux_mode) |
575 | allocate_locks_litmus(); | 583 | allocate_locks_litmus(); |
@@ -593,14 +601,14 @@ static pthread_barrier_t *gpu_barrier = NULL; | |||
593 | static interprocess_mutex *gpu_mgmt_mutexes = NULL; | 601 | static interprocess_mutex *gpu_mgmt_mutexes = NULL; |
594 | static managed_shared_memory *segment_ptr = NULL; | 602 | static managed_shared_memory *segment_ptr = NULL; |
595 | 603 | ||
596 | void coordinate_gpu_tasks(int num_gpu_users) | 604 | void coordinate_gpu_tasks(const int num_gpu_users) |
597 | { | 605 | { |
598 | if(num_gpu_users != 0) | 606 | if(num_gpu_users > 0) |
599 | { | 607 | { |
600 | printf("%d creating shared memory\n", getpid()); | 608 | printf("%d creating shared memory\n", getpid()); |
601 | shared_memory_object::remove("gpu_barrier_memory"); | 609 | shared_memory_object::remove("gpu_barrier_memory"); |
602 | segment_ptr = new managed_shared_memory(create_only, "gpu_barrier_memory", 4*1024); | 610 | segment_ptr = new managed_shared_memory(create_only, "gpu_barrier_memory", 4*1024); |
603 | 611 | ||
604 | printf("%d creating a barrier for %d users\n", getpid(), num_gpu_users); | 612 | printf("%d creating a barrier for %d users\n", getpid(), num_gpu_users); |
605 | gpu_barrier = segment_ptr->construct<pthread_barrier_t>("pthread_barrier_t gpu_barrier")(); | 613 | gpu_barrier = segment_ptr->construct<pthread_barrier_t>("pthread_barrier_t gpu_barrier")(); |
606 | pthread_barrierattr_t battr; | 614 | pthread_barrierattr_t battr; |
@@ -624,7 +632,7 @@ void coordinate_gpu_tasks(int num_gpu_users) | |||
624 | sleep(1); | 632 | sleep(1); |
625 | } | 633 | } |
626 | }while(segment_ptr == NULL); | 634 | }while(segment_ptr == NULL); |
627 | 635 | ||
628 | gpu_barrier = segment_ptr->find<pthread_barrier_t>("pthread_barrier_t gpu_barrier").first; | 636 | gpu_barrier = segment_ptr->find<pthread_barrier_t>("pthread_barrier_t gpu_barrier").first; |
629 | gpu_mgmt_mutexes = segment_ptr->find<interprocess_mutex>("interprocess_mutex m").first; | 637 | gpu_mgmt_mutexes = segment_ptr->find<interprocess_mutex>("interprocess_mutex m").first; |
630 | } | 638 | } |
@@ -647,15 +655,16 @@ char *h_state_data = 0; | |||
647 | 655 | ||
648 | unsigned int *h_iteration_count[NR_GPUS] = {0}; | 656 | unsigned int *h_iteration_count[NR_GPUS] = {0}; |
649 | 657 | ||
650 | static void init_cuda(int num_gpu_users) | 658 | static void init_cuda(const int num_gpu_users) |
651 | { | 659 | { |
652 | const int PAGE_SIZE = 4*1024; | 660 | const int PAGE_SIZE = 4*1024; |
653 | size_t send_alloc_bytes = SEND_SIZE + (SEND_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; | 661 | size_t send_alloc_bytes = SEND_SIZE + (SEND_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; |
654 | size_t recv_alloc_bytes = RECV_SIZE + (RECV_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; | 662 | size_t recv_alloc_bytes = RECV_SIZE + (RECV_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; |
655 | size_t state_alloc_bytes = STATE_SIZE + (STATE_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; | 663 | size_t state_alloc_bytes = STATE_SIZE + (STATE_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; |
656 | 664 | ||
657 | coordinate_gpu_tasks(num_gpu_users); | 665 | coordinate_gpu_tasks(num_gpu_users); |
658 | 666 | ||
667 | #if 1 | ||
659 | switch (CUDA_SYNC_MODE) | 668 | switch (CUDA_SYNC_MODE) |
660 | { | 669 | { |
661 | case BLOCKING: | 670 | case BLOCKING: |
@@ -665,72 +674,85 @@ static void init_cuda(int num_gpu_users) | |||
665 | cudaSetDeviceFlags(cudaDeviceScheduleSpin); | 674 | cudaSetDeviceFlags(cudaDeviceScheduleSpin); |
666 | break; | 675 | break; |
667 | } | 676 | } |
668 | 677 | #else | |
678 | cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); | ||
679 | #endif | ||
680 | |||
669 | for(int i = 0; i < GPU_PARTITION_SIZE; ++i) | 681 | for(int i = 0; i < GPU_PARTITION_SIZE; ++i) |
670 | { | 682 | { |
671 | cudaDeviceProp prop; | 683 | cudaDeviceProp prop; |
672 | int which = GPU_PARTITION*GPU_PARTITION_SIZE + i; | 684 | int which = GPU_PARTITION*GPU_PARTITION_SIZE + i; |
673 | 685 | ||
674 | gpu_mgmt_mutexes[which].lock(); | 686 | gpu_mgmt_mutexes[which].lock(); |
675 | 687 | try | |
676 | set_cur_gpu(which); | ||
677 | cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0); | ||
678 | cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0); | ||
679 | |||
680 | cudaGetDeviceProperties(&prop, which); | ||
681 | GPU_HZ[which] = prop.clockRate * 1000; /* khz -> hz */ | ||
682 | NUM_SM[which] = prop.multiProcessorCount; | ||
683 | WARP_SIZE[which] = prop.warpSize; | ||
684 | |||
685 | // enough to fill the L2 cache exactly. | ||
686 | ELEM_PER_THREAD[which] = (prop.l2CacheSize/(NUM_SM[which]*WARP_SIZE[which]*sizeof(spindata_t))); | ||
687 | |||
688 | |||
689 | if (!MIGRATE_VIA_SYSMEM && prop.unifiedAddressing) | ||
690 | { | 688 | { |
691 | for(int j = 0; j < GPU_PARTITION_SIZE; ++j) | 689 | set_cur_gpu(which); |
690 | cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0); | ||
691 | cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0); | ||
692 | |||
693 | cudaGetDeviceProperties(&prop, which); | ||
694 | GPU_HZ[which] = prop.clockRate * 1000; /* khz -> hz */ | ||
695 | NUM_SM[which] = prop.multiProcessorCount; | ||
696 | WARP_SIZE[which] = prop.warpSize; | ||
697 | |||
698 | // enough to fill the L2 cache exactly. | ||
699 | ELEM_PER_THREAD[which] = (prop.l2CacheSize/(NUM_SM[which]*WARP_SIZE[which]*sizeof(spindata_t))); | ||
700 | |||
701 | |||
702 | if (!MIGRATE_VIA_SYSMEM && prop.unifiedAddressing) | ||
692 | { | 703 | { |
693 | if (i != j) | 704 | for(int j = 0; j < GPU_PARTITION_SIZE; ++j) |
694 | { | 705 | { |
695 | int other = GPU_PARTITION*GPU_PARTITION_SIZE + j; | 706 | if (i != j) |
696 | int canAccess = 0; | ||
697 | cudaDeviceCanAccessPeer(&canAccess, which, other); | ||
698 | if(canAccess) | ||
699 | { | 707 | { |
700 | cudaDeviceEnablePeerAccess(other, 0); | 708 | int other = GPU_PARTITION*GPU_PARTITION_SIZE + j; |
701 | p2pMigration[which][other] = true; | 709 | int canAccess = 0; |
710 | cudaDeviceCanAccessPeer(&canAccess, which, other); | ||
711 | if(canAccess) | ||
712 | { | ||
713 | cudaDeviceEnablePeerAccess(other, 0); | ||
714 | p2pMigration[which][other] = true; | ||
715 | } | ||
702 | } | 716 | } |
703 | } | 717 | } |
704 | } | 718 | } |
719 | |||
720 | cudaStreamCreate(&STREAMS[CUR_DEVICE]); | ||
721 | |||
722 | cudaMalloc(&d_spin_data[which], prop.l2CacheSize); | ||
723 | cudaMemset(&d_spin_data[which], 0, prop.l2CacheSize); | ||
724 | // cudaMalloc(&d_iteration_count[which], NUM_SM[which]*WARP_SIZE[which]*sizeof(unsigned int)); | ||
725 | // cudaHostAlloc(&h_iteration_count[which], NUM_SM[which]*WARP_SIZE[which]*sizeof(unsigned int), cudaHostAllocPortable | cudaHostAllocMapped); | ||
726 | |||
727 | if (send_alloc_bytes) { | ||
728 | cudaMalloc(&d_send_data[which], send_alloc_bytes); | ||
729 | cudaHostAlloc(&h_send_data, send_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped); | ||
730 | } | ||
731 | |||
732 | if (h_recv_data) { | ||
733 | cudaMalloc(&d_recv_data[which], recv_alloc_bytes); | ||
734 | cudaHostAlloc(&h_recv_data, recv_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped); | ||
735 | } | ||
736 | |||
737 | if (h_state_data) { | ||
738 | cudaMalloc(&d_state_data[which], state_alloc_bytes); | ||
739 | |||
740 | if (MIGRATE_VIA_SYSMEM) | ||
741 | cudaHostAlloc(&h_state_data, state_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped | cudaHostAllocWriteCombined); | ||
742 | } | ||
705 | } | 743 | } |
706 | 744 | catch(std::exception &e) | |
707 | cudaStreamCreate(&STREAMS[CUR_DEVICE]); | 745 | { |
708 | 746 | printf("caught an exception during initializiation!: %s\n", e.what()); | |
709 | cudaMalloc(&d_spin_data[which], prop.l2CacheSize); | ||
710 | cudaMemset(&d_spin_data[which], 0, prop.l2CacheSize); | ||
711 | // cudaMalloc(&d_iteration_count[which], NUM_SM[which]*WARP_SIZE[which]*sizeof(unsigned int)); | ||
712 | // cudaHostAlloc(&h_iteration_count[which], NUM_SM[which]*WARP_SIZE[which]*sizeof(unsigned int), cudaHostAllocPortable | cudaHostAllocMapped); | ||
713 | |||
714 | if (send_alloc_bytes) { | ||
715 | cudaMalloc(&d_send_data[which], send_alloc_bytes); | ||
716 | cudaHostAlloc(&h_send_data, send_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped); | ||
717 | } | ||
718 | |||
719 | if (h_recv_data) { | ||
720 | cudaMalloc(&d_recv_data[which], recv_alloc_bytes); | ||
721 | cudaHostAlloc(&h_recv_data, recv_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped); | ||
722 | } | 747 | } |
723 | 748 | catch(...) | |
724 | if (h_state_data) { | 749 | { |
725 | cudaMalloc(&d_state_data[which], state_alloc_bytes); | 750 | printf("caught unknown exception.\n"); |
726 | |||
727 | if (MIGRATE_VIA_SYSMEM) | ||
728 | cudaHostAlloc(&h_state_data, state_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped | cudaHostAllocWriteCombined); | ||
729 | } | 751 | } |
730 | 752 | ||
731 | gpu_mgmt_mutexes[which].unlock(); | 753 | gpu_mgmt_mutexes[which].unlock(); |
732 | } | 754 | } |
733 | 755 | ||
734 | // roll back to first GPU | 756 | // roll back to first GPU |
735 | set_cur_gpu(GPU_PARTITION*GPU_PARTITION_SIZE); | 757 | set_cur_gpu(GPU_PARTITION*GPU_PARTITION_SIZE); |
736 | } | 758 | } |
@@ -772,26 +794,26 @@ static bool MigrateToGPU_SysMem(int from, int to) | |||
772 | // you should be using speculative migrations. | 794 | // you should be using speculative migrations. |
773 | // Use PushState() and PullState(). | 795 | // Use PushState() and PullState(). |
774 | assert(false); // for now | 796 | assert(false); // for now |
775 | 797 | ||
776 | bool success = true; | 798 | bool success = true; |
777 | 799 | ||
778 | set_cur_gpu(from); | 800 | set_cur_gpu(from); |
779 | chunkMemcpy(h_state_data, this_gpu(d_state_data), | 801 | chunkMemcpy(h_state_data, this_gpu(d_state_data), |
780 | STATE_SIZE, cudaMemcpyDeviceToHost, | 802 | STATE_SIZE, cudaMemcpyDeviceToHost, |
781 | from, useEngineLocks(), -1, true); | 803 | from, useEngineLocks(), -1, true); |
782 | 804 | ||
783 | set_cur_gpu(to); | 805 | set_cur_gpu(to); |
784 | chunkMemcpy(this_gpu(d_state_data), h_state_data, | 806 | chunkMemcpy(this_gpu(d_state_data), h_state_data, |
785 | STATE_SIZE, cudaMemcpyHostToDevice, | 807 | STATE_SIZE, cudaMemcpyHostToDevice, |
786 | to, useEngineLocks(), -1, true); | 808 | to, useEngineLocks(), -1, true); |
787 | 809 | ||
788 | return success; | 810 | return success; |
789 | } | 811 | } |
790 | 812 | ||
791 | static bool MigrateToGPU(int from, int to) | 813 | static bool MigrateToGPU(int from, int to) |
792 | { | 814 | { |
793 | bool success = false; | 815 | bool success = false; |
794 | 816 | ||
795 | if (from != to) | 817 | if (from != to) |
796 | { | 818 | { |
797 | if(!MIGRATE_VIA_SYSMEM && p2pMigration[to][from]) | 819 | if(!MIGRATE_VIA_SYSMEM && p2pMigration[to][from]) |
@@ -804,7 +826,7 @@ static bool MigrateToGPU(int from, int to) | |||
804 | set_cur_gpu(to); | 826 | set_cur_gpu(to); |
805 | success = true; | 827 | success = true; |
806 | } | 828 | } |
807 | 829 | ||
808 | return success; | 830 | return success; |
809 | } | 831 | } |
810 | 832 | ||
@@ -851,9 +873,9 @@ static void catch_exit(int catch_exit) | |||
851 | { | 873 | { |
852 | int which = GPU_PARTITION*GPU_PARTITION_SIZE + i; | 874 | int which = GPU_PARTITION*GPU_PARTITION_SIZE + i; |
853 | set_cur_gpu(which); | 875 | set_cur_gpu(which); |
854 | 876 | ||
855 | // cudaDeviceReset(); | 877 | // cudaDeviceReset(); |
856 | 878 | ||
857 | // try to unlock everything. litmus will prevent bogus calls. | 879 | // try to unlock everything. litmus will prevent bogus calls. |
858 | if(USE_ENGINE_LOCKS) | 880 | if(USE_ENGINE_LOCKS) |
859 | { | 881 | { |
@@ -883,15 +905,15 @@ static int gpucount = 0; | |||
883 | 905 | ||
884 | __global__ void docudaspin(float* data, /*unsigned int* iterations,*/ unsigned int num_elem, unsigned int cycles) | 906 | __global__ void docudaspin(float* data, /*unsigned int* iterations,*/ unsigned int num_elem, unsigned int cycles) |
885 | { | 907 | { |
886 | long long int now = clock64(); | 908 | long long int now = clock64(); |
887 | long long unsigned int elapsed = 0; | 909 | long long unsigned int elapsed = 0; |
888 | long long int last; | 910 | long long int last; |
889 | 911 | ||
890 | // unsigned int iter = 0; | 912 | // unsigned int iter = 0; |
891 | unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; | 913 | unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; |
892 | unsigned int j = 0; | 914 | unsigned int j = 0; |
893 | bool toggle = true; | 915 | bool toggle = true; |
894 | 916 | ||
895 | // iterations[i] = 0; | 917 | // iterations[i] = 0; |
896 | do | 918 | do |
897 | { | 919 | { |
@@ -899,7 +921,7 @@ __global__ void docudaspin(float* data, /*unsigned int* iterations,*/ unsigned i | |||
899 | j = (j + 1 != num_elem) ? j + 1 : 0; | 921 | j = (j + 1 != num_elem) ? j + 1 : 0; |
900 | toggle = !toggle; | 922 | toggle = !toggle; |
901 | // iter++; | 923 | // iter++; |
902 | 924 | ||
903 | last = now; | 925 | last = now; |
904 | now = clock64(); | 926 | now = clock64(); |
905 | 927 | ||
@@ -909,13 +931,13 @@ __global__ void docudaspin(float* data, /*unsigned int* iterations,*/ unsigned i | |||
909 | // elapsed += (diff > 0) ? | 931 | // elapsed += (diff > 0) ? |
910 | // diff : | 932 | // diff : |
911 | // now + ((~((long long int)0)<<1)>>1) - last; | 933 | // now + ((~((long long int)0)<<1)>>1) - last; |
912 | 934 | ||
913 | // don't count iterations with clock roll-over | 935 | // don't count iterations with clock roll-over |
914 | elapsed += max(0ll, now - last); | 936 | elapsed += max(0ll, now - last); |
915 | }while(elapsed < cycles); | 937 | }while(elapsed < cycles); |
916 | 938 | ||
917 | // iterations[i] = iter; | 939 | // iterations[i] = iter; |
918 | 940 | ||
919 | return; | 941 | return; |
920 | } | 942 | } |
921 | 943 | ||
@@ -923,9 +945,11 @@ static void gpu_loop_for(double gpu_sec_time, unsigned int num_kernels, double e | |||
923 | { | 945 | { |
924 | int next_gpu; | 946 | int next_gpu; |
925 | 947 | ||
948 | if (gpu_sec_time <= 0.0) | ||
949 | goto out; | ||
926 | if (emergency_exit && wctime() > emergency_exit) | 950 | if (emergency_exit && wctime() > emergency_exit) |
927 | goto out; | 951 | goto out; |
928 | 952 | ||
929 | next_gpu = litmus_lock(TOKEN_LOCK); | 953 | next_gpu = litmus_lock(TOKEN_LOCK); |
930 | { | 954 | { |
931 | MigrateIfNeeded(next_gpu); | 955 | MigrateIfNeeded(next_gpu); |
@@ -934,7 +958,7 @@ static void gpu_loop_for(double gpu_sec_time, unsigned int num_kernels, double e | |||
934 | if(SEND_SIZE > 0) | 958 | if(SEND_SIZE > 0) |
935 | chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE, | 959 | chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE, |
936 | cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks()); | 960 | cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks()); |
937 | 961 | ||
938 | for(unsigned int i = 0; i < num_kernels; ++i) | 962 | for(unsigned int i = 0; i < num_kernels; ++i) |
939 | { | 963 | { |
940 | if(useEngineLocks()) litmus_lock(cur_ee()); | 964 | if(useEngineLocks()) litmus_lock(cur_ee()); |
@@ -943,18 +967,18 @@ static void gpu_loop_for(double gpu_sec_time, unsigned int num_kernels, double e | |||
943 | cudaStreamSynchronize(cur_stream()); | 967 | cudaStreamSynchronize(cur_stream()); |
944 | if(useEngineLocks()) litmus_unlock(cur_ee()); | 968 | if(useEngineLocks()) litmus_unlock(cur_ee()); |
945 | } | 969 | } |
946 | 970 | ||
947 | if(RECV_SIZE > 0) | 971 | if(RECV_SIZE > 0) |
948 | chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE, | 972 | chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE, |
949 | cudaMemcpyDeviceToHost, CUR_DEVICE, useEngineLocks()); | 973 | cudaMemcpyDeviceToHost, CUR_DEVICE, useEngineLocks()); |
950 | 974 | ||
951 | if (MIGRATE_VIA_SYSMEM) | 975 | if (MIGRATE_VIA_SYSMEM) |
952 | PullState(); | 976 | PullState(); |
953 | } | 977 | } |
954 | litmus_unlock(TOKEN_LOCK); | 978 | litmus_unlock(TOKEN_LOCK); |
955 | 979 | ||
956 | last_gpu() = cur_gpu(); | 980 | last_gpu() = cur_gpu(); |
957 | 981 | ||
958 | out: | 982 | out: |
959 | return; | 983 | return; |
960 | } | 984 | } |
@@ -964,7 +988,14 @@ static void gpu_loop_for_linux(double gpu_sec_time, unsigned int num_kernels, do | |||
964 | static int GPU_OFFSET = GPU_PARTITION * GPU_PARTITION_SIZE; | 988 | static int GPU_OFFSET = GPU_PARTITION * GPU_PARTITION_SIZE; |
965 | static gpu_pool *pool = &GPU_LINUX_SEM_POOL[GPU_PARTITION]; | 989 | static gpu_pool *pool = &GPU_LINUX_SEM_POOL[GPU_PARTITION]; |
966 | static pthread_mutex_t *mutex = &GPU_LINUX_MUTEX_POOL[GPU_PARTITION]; | 990 | static pthread_mutex_t *mutex = &GPU_LINUX_MUTEX_POOL[GPU_PARTITION]; |
967 | 991 | ||
992 | int next_gpu; | ||
993 | |||
994 | if (gpu_sec_time <= 0.0) | ||
995 | goto out; | ||
996 | if (emergency_exit && wctime() > emergency_exit) | ||
997 | goto out; | ||
998 | |||
968 | #ifdef VANILLA_LINUX | 999 | #ifdef VANILLA_LINUX |
969 | static bool once = false; | 1000 | static bool once = false; |
970 | static cudaEvent_t start, end; | 1001 | static cudaEvent_t start, end; |
@@ -977,21 +1008,16 @@ static void gpu_loop_for_linux(double gpu_sec_time, unsigned int num_kernels, do | |||
977 | } | 1008 | } |
978 | #endif | 1009 | #endif |
979 | 1010 | ||
980 | int next_gpu; | ||
981 | |||
982 | if (emergency_exit && wctime() > emergency_exit) | ||
983 | goto out; | ||
984 | |||
985 | next_gpu = pool->get(mutex, cur_gpu() - GPU_OFFSET) + GPU_OFFSET; | 1011 | next_gpu = pool->get(mutex, cur_gpu() - GPU_OFFSET) + GPU_OFFSET; |
986 | { | 1012 | { |
987 | MigrateIfNeeded(next_gpu); | 1013 | MigrateIfNeeded(next_gpu); |
988 | 1014 | ||
989 | unsigned int numcycles = ((unsigned int)(cur_hz() * gpu_sec_time))/num_kernels; | 1015 | unsigned int numcycles = ((unsigned int)(cur_hz() * gpu_sec_time))/num_kernels; |
990 | 1016 | ||
991 | if(SEND_SIZE > 0) | 1017 | if(SEND_SIZE > 0) |
992 | chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE, | 1018 | chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE, |
993 | cudaMemcpyHostToDevice, cur_gpu(), useEngineLocks()); | 1019 | cudaMemcpyHostToDevice, cur_gpu(), useEngineLocks()); |
994 | 1020 | ||
995 | for(unsigned int i = 0; i < num_kernels; ++i) | 1021 | for(unsigned int i = 0; i < num_kernels; ++i) |
996 | { | 1022 | { |
997 | /* one block per sm, one warp per block */ | 1023 | /* one block per sm, one warp per block */ |
@@ -1004,7 +1030,7 @@ static void gpu_loop_for_linux(double gpu_sec_time, unsigned int num_kernels, do | |||
1004 | cudaEventSynchronize(end); | 1030 | cudaEventSynchronize(end); |
1005 | #endif | 1031 | #endif |
1006 | cudaStreamSynchronize(cur_stream()); | 1032 | cudaStreamSynchronize(cur_stream()); |
1007 | 1033 | ||
1008 | #ifdef VANILLA_LINUX | 1034 | #ifdef VANILLA_LINUX |
1009 | cudaEventElapsedTime(&ms, start, end); | 1035 | cudaEventElapsedTime(&ms, start, end); |
1010 | ms_sum += ms; | 1036 | ms_sum += ms; |
@@ -1013,18 +1039,18 @@ static void gpu_loop_for_linux(double gpu_sec_time, unsigned int num_kernels, do | |||
1013 | #ifdef VANILLA_LINUX | 1039 | #ifdef VANILLA_LINUX |
1014 | ++gpucount; | 1040 | ++gpucount; |
1015 | #endif | 1041 | #endif |
1016 | 1042 | ||
1017 | if(RECV_SIZE > 0) | 1043 | if(RECV_SIZE > 0) |
1018 | chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE, | 1044 | chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE, |
1019 | cudaMemcpyDeviceToHost, cur_gpu(), useEngineLocks()); | 1045 | cudaMemcpyDeviceToHost, cur_gpu(), useEngineLocks()); |
1020 | 1046 | ||
1021 | if (MIGRATE_VIA_SYSMEM) | 1047 | if (MIGRATE_VIA_SYSMEM) |
1022 | PullState(); | 1048 | PullState(); |
1023 | } | 1049 | } |
1024 | pool->put(mutex, cur_gpu() - GPU_OFFSET); | 1050 | pool->put(mutex, cur_gpu() - GPU_OFFSET); |
1025 | 1051 | ||
1026 | last_gpu() = cur_gpu(); | 1052 | last_gpu() = cur_gpu(); |
1027 | 1053 | ||
1028 | out: | 1054 | out: |
1029 | return; | 1055 | return; |
1030 | } | 1056 | } |
@@ -1131,15 +1157,20 @@ static int loop_once(void) | |||
1131 | 1157 | ||
1132 | static int loop_for(double exec_time, double emergency_exit) | 1158 | static int loop_for(double exec_time, double emergency_exit) |
1133 | { | 1159 | { |
1134 | double last_loop = 0, loop_start; | ||
1135 | int tmp = 0; | 1160 | int tmp = 0; |
1161 | double last_loop, loop_start; | ||
1162 | double start, now; | ||
1163 | |||
1164 | if (exec_time <= 0.0) | ||
1165 | goto out; | ||
1136 | 1166 | ||
1137 | double start = cputime(); | 1167 | start = cputime(); |
1138 | double now = cputime(); | 1168 | now = cputime(); |
1139 | 1169 | ||
1140 | if (emergency_exit && wctime() > emergency_exit) | 1170 | if (emergency_exit && wctime() > emergency_exit) |
1141 | goto out; | 1171 | goto out; |
1142 | 1172 | ||
1173 | last_loop = 0; | ||
1143 | while (now + last_loop < start + exec_time) { | 1174 | while (now + last_loop < start + exec_time) { |
1144 | loop_start = now; | 1175 | loop_start = now; |
1145 | tmp += loop_once(); | 1176 | tmp += loop_once(); |
@@ -1177,36 +1208,39 @@ static void debug_delay_loop(void) | |||
1177 | } | 1208 | } |
1178 | } | 1209 | } |
1179 | 1210 | ||
1180 | static int gpu_job(double exec_time, double gpu_exec_time, unsigned int num_kernels, double program_end) | 1211 | typedef bool (*gpu_job_t)(double exec_time, double gpu_exec_time, unsigned int num_kernels, double program_end); |
1212 | typedef bool (*cpu_job_t)(double exec_time, double program_end); | ||
1213 | |||
1214 | static bool gpu_job(double exec_time, double gpu_exec_time, unsigned int num_kernels, double program_end) | ||
1181 | { | 1215 | { |
1182 | double chunk1, chunk2; | 1216 | double chunk1, chunk2; |
1183 | 1217 | ||
1184 | if (wctime() > program_end) { | 1218 | if (wctime() > program_end) { |
1185 | return 0; | 1219 | return false; |
1186 | } | 1220 | } |
1187 | else { | 1221 | else { |
1188 | chunk1 = exec_time * drand48(); | 1222 | chunk1 = exec_time * drand48(); |
1189 | chunk2 = exec_time - chunk1; | 1223 | chunk2 = exec_time - chunk1; |
1190 | 1224 | ||
1191 | loop_for(chunk1, program_end + 1); | 1225 | loop_for(chunk1, program_end + 1); |
1192 | gpu_loop_for(gpu_exec_time, num_kernels, program_end + 1); | 1226 | gpu_loop_for(gpu_exec_time, num_kernels, program_end + 1); |
1193 | loop_for(chunk2, program_end + 1); | 1227 | loop_for(chunk2, program_end + 1); |
1194 | 1228 | ||
1195 | sleep_next_period(); | 1229 | sleep_next_period(); |
1196 | } | 1230 | } |
1197 | return 1; | 1231 | return true; |
1198 | } | 1232 | } |
1199 | 1233 | ||
1200 | static int job(double exec_time, double program_end) | 1234 | static bool job(double exec_time, double program_end) |
1201 | { | 1235 | { |
1202 | if (wctime() > program_end) { | 1236 | if (wctime() > program_end) { |
1203 | return 0; | 1237 | return false; |
1204 | } | 1238 | } |
1205 | else { | 1239 | else { |
1206 | loop_for(exec_time, program_end + 1); | 1240 | loop_for(exec_time, program_end + 1); |
1207 | sleep_next_period(); | 1241 | sleep_next_period(); |
1208 | } | 1242 | } |
1209 | return 1; | 1243 | return true; |
1210 | } | 1244 | } |
1211 | 1245 | ||
1212 | /*****************************/ | 1246 | /*****************************/ |
@@ -1254,12 +1288,12 @@ static void init_linux() | |||
1254 | mlockall(MCL_CURRENT | MCL_FUTURE); | 1288 | mlockall(MCL_CURRENT | MCL_FUTURE); |
1255 | } | 1289 | } |
1256 | 1290 | ||
1257 | static int gpu_job_linux(double exec_time, double gpu_exec_time, unsigned int num_kernels, double program_end) | 1291 | static bool gpu_job_linux(double exec_time, double gpu_exec_time, unsigned int num_kernels, double program_end) |
1258 | { | 1292 | { |
1259 | double chunk1, chunk2; | 1293 | double chunk1, chunk2; |
1260 | 1294 | ||
1261 | if (wctime() > program_end) { | 1295 | if (wctime() > program_end) { |
1262 | return 0; | 1296 | return false; |
1263 | } | 1297 | } |
1264 | else { | 1298 | else { |
1265 | chunk1 = exec_time * drand48(); | 1299 | chunk1 = exec_time * drand48(); |
@@ -1268,22 +1302,22 @@ static int gpu_job_linux(double exec_time, double gpu_exec_time, unsigned int nu | |||
1268 | loop_for(chunk1, program_end + 1); | 1302 | loop_for(chunk1, program_end + 1); |
1269 | gpu_loop_for_linux(gpu_exec_time, num_kernels, program_end + 1); | 1303 | gpu_loop_for_linux(gpu_exec_time, num_kernels, program_end + 1); |
1270 | loop_for(chunk2, program_end + 1); | 1304 | loop_for(chunk2, program_end + 1); |
1271 | 1305 | ||
1272 | sleep_next_period_linux(); | 1306 | sleep_next_period_linux(); |
1273 | } | 1307 | } |
1274 | return 1; | 1308 | return true; |
1275 | } | 1309 | } |
1276 | 1310 | ||
1277 | static int job_linux(double exec_time, double program_end) | 1311 | static bool job_linux(double exec_time, double program_end) |
1278 | { | 1312 | { |
1279 | if (wctime() > program_end) { | 1313 | if (wctime() > program_end) { |
1280 | return 0; | 1314 | return false; |
1281 | } | 1315 | } |
1282 | else { | 1316 | else { |
1283 | loop_for(exec_time, program_end + 1); | 1317 | loop_for(exec_time, program_end + 1); |
1284 | sleep_next_period_linux(); | 1318 | sleep_next_period_linux(); |
1285 | } | 1319 | } |
1286 | return 1; | 1320 | return true; |
1287 | } | 1321 | } |
1288 | 1322 | ||
1289 | /*****************************/ | 1323 | /*****************************/ |
@@ -1296,7 +1330,7 @@ enum eScheduler | |||
1296 | }; | 1330 | }; |
1297 | 1331 | ||
1298 | #define CPU_OPTIONS "p:z:c:wlveio:f:s:q:X:L:Q:d:" | 1332 | #define CPU_OPTIONS "p:z:c:wlveio:f:s:q:X:L:Q:d:" |
1299 | #define GPU_OPTIONS "g:y:r:C:E:DG:xS:R:T:Z:aFm:b:MNIk:V" | 1333 | #define GPU_OPTIONS "g:y:r:C:E:DG:xS:R:T:Z:aFm:b:MNIk:VW:" |
1300 | 1334 | ||
1301 | // concat the option strings | 1335 | // concat the option strings |
1302 | #define OPTSTR CPU_OPTIONS GPU_OPTIONS | 1336 | #define OPTSTR CPU_OPTIONS GPU_OPTIONS |
@@ -1304,37 +1338,52 @@ enum eScheduler | |||
1304 | int main(int argc, char** argv) | 1338 | int main(int argc, char** argv) |
1305 | { | 1339 | { |
1306 | int ret; | 1340 | int ret; |
1341 | |||
1342 | struct rt_task param; | ||
1343 | |||
1307 | lt_t wcet; | 1344 | lt_t wcet; |
1308 | lt_t period; | 1345 | lt_t period; |
1309 | double wcet_ms = -1, gpu_wcet_ms = -1, period_ms = -1; | 1346 | lt_t budget; |
1347 | double wcet_ms = -1.0; | ||
1348 | double gpu_wcet_ms = 0.0; | ||
1349 | double period_ms = -1.0; | ||
1350 | double budget_ms = -1.0; | ||
1351 | |||
1352 | unsigned int num_kernels = 1; | ||
1353 | |||
1354 | budget_drain_policy_t drain = DRAIN_SIMPLE; | ||
1355 | bool want_enforcement = false; | ||
1356 | bool want_signals = false; | ||
1357 | |||
1310 | unsigned int priority = LITMUS_LOWEST_PRIORITY; | 1358 | unsigned int priority = LITMUS_LOWEST_PRIORITY; |
1359 | |||
1360 | task_class_t cls = RT_CLASS_SOFT; | ||
1361 | |||
1362 | eScheduler scheduler = LITMUS; | ||
1363 | int num_gpu_users = 0; | ||
1311 | int migrate = 0; | 1364 | int migrate = 0; |
1312 | int cluster = 0; | 1365 | int cluster = 0; |
1313 | int cluster_size = 1; | 1366 | int cluster_size = 1; |
1314 | int opt; | 1367 | |
1368 | Normal<double> *wcet_dist_ms = NULL; | ||
1369 | float stdpct = 0.0; | ||
1370 | |||
1371 | cpu_job_t cjobfn = NULL; | ||
1372 | gpu_job_t gjobfn = NULL; | ||
1373 | |||
1315 | int wait = 0; | 1374 | int wait = 0; |
1375 | double scale = 1.0; | ||
1316 | int test_loop = 0; | 1376 | int test_loop = 0; |
1317 | int column = 1; | 1377 | |
1318 | const char *file = NULL; | ||
1319 | int want_enforcement = 0; | ||
1320 | int want_signals = 0; | ||
1321 | double duration = 0, start = 0; | 1378 | double duration = 0, start = 0; |
1322 | double *exec_times = NULL; | ||
1323 | double scale = 1.0; | ||
1324 | task_class_t cls = RT_CLASS_SOFT; | ||
1325 | int cur_job = 0, num_jobs = 0; | 1379 | int cur_job = 0, num_jobs = 0; |
1326 | struct rt_task param; | 1380 | int column = 1; |
1327 | 1381 | ||
1328 | double budget_ms = -1.0; | 1382 | int opt; |
1329 | lt_t budget; | 1383 | |
1330 | 1384 | double *exec_times = NULL; | |
1331 | int num_gpu_users = 0; | 1385 | const char *file = NULL; |
1332 | unsigned int num_kernels = 1; | ||
1333 | 1386 | ||
1334 | budget_drain_policy_t drain = DRAIN_SIMPLE; | ||
1335 | |||
1336 | eScheduler scheduler = LITMUS; | ||
1337 | |||
1338 | /* locking */ | 1387 | /* locking */ |
1339 | // int lock_od = -1; | 1388 | // int lock_od = -1; |
1340 | // int resource_id = 0; | 1389 | // int resource_id = 0; |
@@ -1414,7 +1463,7 @@ int main(int argc, char** argv) | |||
1414 | MIGRATE_VIA_SYSMEM = true; | 1463 | MIGRATE_VIA_SYSMEM = true; |
1415 | break; | 1464 | break; |
1416 | case 'm': | 1465 | case 'm': |
1417 | num_gpu_users = atoi(optarg); | 1466 | num_gpu_users = (int)atoi(optarg); |
1418 | assert(num_gpu_users > 0); | 1467 | assert(num_gpu_users > 0); |
1419 | break; | 1468 | break; |
1420 | case 'k': | 1469 | case 'k': |
@@ -1423,6 +1472,9 @@ int main(int argc, char** argv) | |||
1423 | case 'b': | 1472 | case 'b': |
1424 | budget_ms = atoi(optarg); | 1473 | budget_ms = atoi(optarg); |
1425 | break; | 1474 | break; |
1475 | case 'W': | ||
1476 | stdpct = atof(optarg); | ||
1477 | break; | ||
1426 | case 'N': | 1478 | case 'N': |
1427 | scheduler = LINUX; | 1479 | scheduler = LINUX; |
1428 | break; | 1480 | break; |
@@ -1438,10 +1490,10 @@ int main(int argc, char** argv) | |||
1438 | usage("Unknown task class."); | 1490 | usage("Unknown task class."); |
1439 | break; | 1491 | break; |
1440 | case 'e': | 1492 | case 'e': |
1441 | want_enforcement = 1; | 1493 | want_enforcement = true; |
1442 | break; | 1494 | break; |
1443 | case 'i': | 1495 | case 'i': |
1444 | want_signals = 1; | 1496 | want_signals = true; |
1445 | break; | 1497 | break; |
1446 | case 'd': | 1498 | case 'd': |
1447 | drain = (budget_drain_policy_t)atoi(optarg); | 1499 | drain = (budget_drain_policy_t)atoi(optarg); |
@@ -1489,27 +1541,34 @@ int main(int argc, char** argv) | |||
1489 | assert(scheduler != LITMUS); | 1541 | assert(scheduler != LITMUS); |
1490 | assert(!wait); | 1542 | assert(!wait); |
1491 | #endif | 1543 | #endif |
1492 | 1544 | ||
1545 | assert(stdpct >= 0.0); | ||
1546 | |||
1547 | if (MIGRATE_VIA_SYSMEM) | ||
1548 | assert(GPU_PARTITION_SIZE != 1); | ||
1549 | |||
1493 | // turn off some features to be safe | 1550 | // turn off some features to be safe |
1494 | if (scheduler != LITMUS) | 1551 | if (scheduler != LITMUS) |
1495 | { | 1552 | { |
1496 | RHO = 0; | 1553 | RHO = 0; |
1497 | USE_ENGINE_LOCKS = false; | 1554 | USE_ENGINE_LOCKS = false; |
1498 | USE_DYNAMIC_GROUP_LOCKS = false; | 1555 | USE_DYNAMIC_GROUP_LOCKS = false; |
1499 | ENABLE_AFFINITY = false; | ||
1500 | RELAX_FIFO_MAX_LEN = false; | 1556 | RELAX_FIFO_MAX_LEN = false; |
1501 | ENABLE_RT_AUX_THREADS = false; | 1557 | ENABLE_RT_AUX_THREADS = false; |
1502 | budget_ms = -1.0; | 1558 | budget_ms = -1.0; |
1503 | want_enforcement = 0; | 1559 | want_enforcement = false; |
1504 | want_signals = 0; | 1560 | want_signals = false; |
1505 | 1561 | ||
1562 | cjobfn = job_linux; | ||
1563 | gjobfn = gpu_job_linux; | ||
1564 | |||
1506 | if (scheduler == RT_LINUX) | 1565 | if (scheduler == RT_LINUX) |
1507 | { | 1566 | { |
1508 | struct sched_param fifoparams; | 1567 | struct sched_param fifoparams; |
1509 | 1568 | ||
1510 | assert(priority >= sched_get_priority_min(SCHED_FIFO) && | 1569 | assert(priority >= sched_get_priority_min(SCHED_FIFO) && |
1511 | priority <= sched_get_priority_max(SCHED_FIFO)); | 1570 | priority <= sched_get_priority_max(SCHED_FIFO)); |
1512 | 1571 | ||
1513 | memset(&fifoparams, 0, sizeof(fifoparams)); | 1572 | memset(&fifoparams, 0, sizeof(fifoparams)); |
1514 | fifoparams.sched_priority = priority; | 1573 | fifoparams.sched_priority = priority; |
1515 | assert(0 == sched_setscheduler(getpid(), SCHED_FIFO, &fifoparams)); | 1574 | assert(0 == sched_setscheduler(getpid(), SCHED_FIFO, &fifoparams)); |
@@ -1517,16 +1576,19 @@ int main(int argc, char** argv) | |||
1517 | } | 1576 | } |
1518 | else | 1577 | else |
1519 | { | 1578 | { |
1579 | cjobfn = job; | ||
1580 | gjobfn = gpu_job; | ||
1581 | |||
1520 | if (!litmus_is_valid_fixed_prio(priority)) | 1582 | if (!litmus_is_valid_fixed_prio(priority)) |
1521 | usage("Invalid priority."); | 1583 | usage("Invalid priority."); |
1522 | } | 1584 | } |
1523 | 1585 | ||
1524 | if (test_loop) { | 1586 | if (test_loop) { |
1525 | debug_delay_loop(); | 1587 | debug_delay_loop(); |
1526 | return 0; | 1588 | return 0; |
1527 | } | 1589 | } |
1528 | 1590 | ||
1529 | srand(getpid()); | 1591 | srand(time(0)); |
1530 | 1592 | ||
1531 | if (file) { | 1593 | if (file) { |
1532 | get_exec_times(file, column, &num_jobs, &exec_times); | 1594 | get_exec_times(file, column, &num_jobs, &exec_times); |
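With these hunks the Linux and LITMUS paths select their per-job routines once, through the cjobfn/gjobfn function pointers, instead of branching on the scheduler inside the main loop. The pointer declarations are not part of this diff; judging from the call sites further down, they presumably look roughly like the following sketch (the typedef names and the bool return type are assumptions, not the patch's actual declarations):

    // presumed shape of the dispatch pointers, inferred from the call sites below
    typedef bool (*cpu_job_fn)(double exec_s, double program_end);
    typedef bool (*gpu_job_fn)(double exec_s, double gpu_exec_s,
                               unsigned int num_kernels, double program_end);
    static cpu_job_fn cjobfn = NULL;   // set to job (LITMUS) or job_linux
    static gpu_job_fn gjobfn = NULL;   // set to gpu_job (LITMUS) or gpu_job_linux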
@@ -1548,7 +1610,7 @@ int main(int argc, char** argv) | |||
1548 | } | 1610 | } |
1549 | 1611 | ||
1550 | if (argc - optind == 3) { | 1612 | if (argc - optind == 3) { |
1551 | assert(!GPU_USING); | 1613 | assert(!GPU_USING); |
1552 | wcet_ms = atof(argv[optind + 0]); | 1614 | wcet_ms = atof(argv[optind + 0]); |
1553 | period_ms = atof(argv[optind + 1]); | 1615 | period_ms = atof(argv[optind + 1]); |
1554 | duration = atof(argv[optind + 2]); | 1616 | duration = atof(argv[optind + 2]); |
@@ -1560,7 +1622,7 @@ int main(int argc, char** argv) | |||
1560 | period_ms = atof(argv[optind + 2]); | 1622 | period_ms = atof(argv[optind + 2]); |
1561 | duration = atof(argv[optind + 3]); | 1623 | duration = atof(argv[optind + 3]); |
1562 | } | 1624 | } |
1563 | 1625 | ||
1564 | wcet = ms2ns(wcet_ms); | 1626 | wcet = ms2ns(wcet_ms); |
1565 | period = ms2ns(period_ms); | 1627 | period = ms2ns(period_ms); |
1566 | if (wcet <= 0) | 1628 | if (wcet <= 0) |
@@ -1579,7 +1641,29 @@ int main(int argc, char** argv) | |||
1579 | budget = ms2ns(budget_ms); | 1641 | budget = ms2ns(budget_ms); |
1580 | else | 1642 | else |
1581 | budget = wcet; | 1643 | budget = wcet; |
1582 | 1644 | ||
1645 | #if 0 | ||
1646 | // use upscale to determine breakdown utilization | ||
1647 | // only scaling up CPU time for now. | ||
1648 | double upscale = (double)period/(double)budget - 1.0; | ||
1649 | upscale = std::min(std::max(0.0, upscale), 0.6); // at most 60% | ||
1650 | wcet = wcet + wcet*upscale; | ||
1651 | budget = budget + wcet*upscale; | ||
1652 | wcet_ms = wcet_ms + wcet_ms*upscale; | ||
1653 | |||
1654 | // guard against floating-point rounding error | ||
1655 | if (budget < wcet) | ||
1656 | budget = wcet; | ||
1657 | if (budget > period) | ||
1658 | budget = period; | ||
1659 | #endif | ||
1660 | |||
1661 | // randomize execution time according to a normal distribution | ||
1662 | // centered around the desired execution time. | ||
1663 | // standard deviation is a percentage of this average | ||
1664 | wcet_dist_ms = new Normal<double>(wcet_ms + gpu_wcet_ms, (wcet_ms + gpu_wcet_ms) * stdpct); | ||
1665 | wcet_dist_ms->seed((unsigned int)time(0)); | ||
1666 | |||
1583 | if (file && num_jobs > 1) | 1667 | if (file && num_jobs > 1) |
1584 | duration += period_ms * 0.001 * (num_jobs - 1); | 1668 | duration += period_ms * 0.001 * (num_jobs - 1); |
1585 | 1669 | ||
@@ -1588,7 +1672,7 @@ int main(int argc, char** argv) | |||
1588 | if (ret < 0) | 1672 | if (ret < 0) |
1589 | bail_out("could not migrate to target partition or cluster."); | 1673 | bail_out("could not migrate to target partition or cluster."); |
1590 | } | 1674 | } |
1591 | 1675 | ||
1592 | if (scheduler != LITMUS) | 1676 | if (scheduler != LITMUS) |
1593 | { | 1677 | { |
1594 | // set some variables needed by linux modes | 1678 | // set some variables needed by linux modes |
@@ -1612,17 +1696,19 @@ int main(int argc, char** argv) | |||
1612 | PRECISE_SIGNALS : NO_SIGNALS; | 1696 | PRECISE_SIGNALS : NO_SIGNALS; |
1613 | param.drain_policy = drain; | 1697 | param.drain_policy = drain; |
1614 | param.release_policy = PERIODIC; | 1698 | param.release_policy = PERIODIC; |
1615 | 1699 | ||
1616 | if (migrate) | 1700 | if (migrate) |
1617 | param.cpu = cluster_to_first_cpu(cluster, cluster_size); | 1701 | param.cpu = cluster_to_first_cpu(cluster, cluster_size); |
1618 | ret = set_rt_task_param(gettid(), ¶m); | 1702 | ret = set_rt_task_param(gettid(), ¶m); |
1619 | if (ret < 0) | 1703 | if (ret < 0) |
1620 | bail_out("could not setup rt task params"); | 1704 | bail_out("could not setup rt task params"); |
1621 | 1705 | ||
1622 | if (scheduler == LITMUS) | 1706 | if (scheduler == LITMUS) { |
1623 | init_litmus(); | 1707 | init_litmus(); |
1624 | else | 1708 | } |
1709 | else { | ||
1625 | init_linux(); | 1710 | init_linux(); |
1711 | } | ||
1626 | 1712 | ||
1627 | if (want_signals) { | 1713 | if (want_signals) { |
1628 | /* bind default longjmp signal handler to SIG_BUDGET. */ | 1714 | /* bind default longjmp signal handler to SIG_BUDGET. */ |
@@ -1640,16 +1726,16 @@ int main(int argc, char** argv) | |||
1640 | 1726 | ||
1641 | if (GPU_USING) { | 1727 | if (GPU_USING) { |
1642 | allocate_locks(num_gpu_users, scheduler != LITMUS); | 1728 | allocate_locks(num_gpu_users, scheduler != LITMUS); |
1643 | 1729 | ||
1644 | signal(SIGABRT, catch_exit); | 1730 | signal(SIGABRT, catch_exit); |
1645 | signal(SIGTERM, catch_exit); | 1731 | signal(SIGTERM, catch_exit); |
1646 | signal(SIGQUIT, catch_exit); | 1732 | signal(SIGQUIT, catch_exit); |
1647 | signal(SIGSEGV, catch_exit); | 1733 | signal(SIGSEGV, catch_exit); |
1648 | 1734 | ||
1649 | init_cuda(num_gpu_users); | 1735 | init_cuda(num_gpu_users); |
1650 | safetynet = true; | 1736 | safetynet = true; |
1651 | } | 1737 | } |
1652 | 1738 | ||
1653 | if (scheduler == LITMUS) | 1739 | if (scheduler == LITMUS) |
1654 | { | 1740 | { |
1655 | ret = task_mode(LITMUS_RT_TASK); | 1741 | ret = task_mode(LITMUS_RT_TASK); |
@@ -1666,7 +1752,7 @@ int main(int argc, char** argv) | |||
1666 | ret = wait_for_ts_release2(&releaseTime); | 1752 | ret = wait_for_ts_release2(&releaseTime); |
1667 | if (ret != 0) | 1753 | if (ret != 0) |
1668 | bail_out("wait_for_ts_release2()"); | 1754 | bail_out("wait_for_ts_release2()"); |
1669 | 1755 | ||
1670 | if (scheduler != LITMUS) | 1756 | if (scheduler != LITMUS) |
1671 | log_release(); | 1757 | log_release(); |
1672 | } | 1758 | } |
@@ -1683,35 +1769,38 @@ int main(int argc, char** argv) | |||
1683 | 1769 | ||
1684 | start = wctime(); | 1770 | start = wctime(); |
1685 | 1771 | ||
1686 | if (scheduler == LITMUS) | 1772 | if (!GPU_USING) { |
1687 | { | 1773 | bool keepgoing; |
1688 | if (!GPU_USING) { | 1774 | do |
1689 | while (job(wcet_ms * 0.001 * scale, start + duration)); | 1775 | { |
1690 | } | 1776 | double job_ms = wcet_dist_ms->random(); |
1691 | else { | 1777 | if (job_ms < 0.0) |
1692 | while (gpu_job(wcet_ms * 0.001 * scale, | 1778 | job_ms = 0.0; |
1693 | gpu_wcet_ms * 0.001 * scale, | 1779 | keepgoing = cjobfn(ms2s(job_ms * scale), start + duration); |
1694 | num_kernels, | 1780 | }while(keepgoing); |
1695 | start + duration)); | ||
1696 | } | ||
1697 | } | 1781 | } |
1698 | else | 1782 | else { |
1699 | { | 1783 | bool keepgoing; |
1700 | if (!GPU_USING) { | 1784 | do |
1701 | while (job_linux(wcet_ms * 0.001 * scale, start + duration)); | 1785 | { |
1702 | } | 1786 | double job_ms = wcet_dist_ms->random(); |
1703 | else { | 1787 | if (job_ms < 0.0) |
1704 | while (gpu_job_linux(wcet_ms * 0.001 * scale, | 1788 | job_ms = 0.0; |
1705 | gpu_wcet_ms * 0.001 * scale, | 1789 | |
1706 | num_kernels, | 1790 | double cpu_job_ms = (job_ms/(wcet_ms + gpu_wcet_ms))*wcet_ms; |
1707 | start + duration)); | 1791 | double gpu_job_ms = (job_ms/(wcet_ms + gpu_wcet_ms))*gpu_wcet_ms; |
1708 | } | 1792 | keepgoing = gjobfn( |
1793 | ms2s(cpu_job_ms * scale), | ||
1794 | ms2s(gpu_job_ms * scale), | ||
1795 | num_kernels, | ||
1796 | start + duration); | ||
1797 | }while(keepgoing); | ||
1709 | } | 1798 | } |
1710 | 1799 | ||
1711 | if (GPU_USING && ENABLE_RT_AUX_THREADS) | 1800 | if (GPU_USING && ENABLE_RT_AUX_THREADS) |
1712 | if (disable_aux_rt_tasks(AUX_CURRENT | AUX_FUTURE) != 0) | 1801 | if (disable_aux_rt_tasks(AUX_CURRENT | AUX_FUTURE) != 0) |
1713 | bail_out("disable_aux_rt_tasks() failed"); | 1802 | bail_out("disable_aux_rt_tasks() failed"); |
1714 | 1803 | ||
1715 | // if (file) { | 1804 | // if (file) { |
1716 | // /* use times read from the CSV file */ | 1805 | // /* use times read from the CSV file */ |
1717 | // for (cur_job = 0; cur_job < num_jobs; ++cur_job) { | 1806 | // for (cur_job = 0; cur_job < num_jobs; ++cur_job) { |
@@ -1740,7 +1829,10 @@ int main(int argc, char** argv) | |||
1740 | 1829 | ||
1741 | // printf("avg: %f\n", ms_sum/gpucount); | 1830 | // printf("avg: %f\n", ms_sum/gpucount); |
1742 | } | 1831 | } |
1743 | 1832 | ||
1833 | if (wcet_dist_ms) | ||
1834 | delete wcet_dist_ms; | ||
1835 | |||
1744 | if (file) | 1836 | if (file) |
1745 | free(exec_times); | 1837 | free(exec_times); |
1746 | 1838 | ||
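In the rewritten main loop each iteration draws job_ms from the distribution, clamps negative samples to zero, and, for GPU-using tasks, splits the draw between the CPU and GPU phases in proportion to their configured shares, so the configured CPU/GPU ratio is preserved. A worked example with illustrative numbers (not taken from the patch):

    // with wcet_ms = 8.0, gpu_wcet_ms = 2.0 and a sample of job_ms = 11.0:
    double cpu_job_ms = (11.0 / (8.0 + 2.0)) * 8.0;   // 8.8 ms of CPU work
    double gpu_job_ms = (11.0 / (8.0 + 2.0)) * 2.0;   // 2.2 ms of GPU work
    // cpu_job_ms + gpu_job_ms equals the draw, and the 4:1 ratio is kept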
diff --git a/src/migration.c b/src/migration.c index 7ac320e..084b68c 100644 --- a/src/migration.c +++ b/src/migration.c | |||
@@ -66,6 +66,7 @@ int cluster_to_first_cpu(int cluster, int cluster_sz) | |||
66 | static int setup_numa(pid_t tid, int sz, const cpu_set_t *cpus) | 66 | static int setup_numa(pid_t tid, int sz, const cpu_set_t *cpus) |
67 | { | 67 | { |
68 | int nr_nodes; | 68 | int nr_nodes; |
69 | int nr_cpus = num_online_cpus(); | ||
69 | struct bitmask* new_nodes; | 70 | struct bitmask* new_nodes; |
70 | struct bitmask* old_nodes; | 71 | struct bitmask* old_nodes; |
71 | int i; | 72 | int i; |
@@ -78,7 +79,7 @@ static int setup_numa(pid_t tid, int sz, const cpu_set_t *cpus) | |||
78 | new_nodes = numa_bitmask_alloc(nr_nodes); | 79 | new_nodes = numa_bitmask_alloc(nr_nodes); |
79 | old_nodes = numa_bitmask_alloc(nr_nodes); | 80 | old_nodes = numa_bitmask_alloc(nr_nodes); |
80 | /* map the cpu mask to a numa mask */ | 81 | /* map the cpu mask to a numa mask */ |
81 | for (i = 0; i < sz; ++i) { | 82 | for (i = 0; i < nr_cpus; ++i) { |
82 | if(CPU_ISSET_S(i, sz, cpus)) { | 83 | if(CPU_ISSET_S(i, sz, cpus)) { |
83 | numa_bitmask_setbit(new_nodes, numa_node_of_cpu(i)); | 84 | numa_bitmask_setbit(new_nodes, numa_node_of_cpu(i)); |
84 | } | 85 | } |
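The one-line change in setup_numa() fixes the loop bound: sz is the byte size of the dynamically allocated cpu_set_t (as returned by CPU_ALLOC_SIZE()), not a CPU count, so iterating i &lt; sz could skip higher-numbered CPUs on larger machines. The corrected mapping, sketched here with sysconf() standing in for the num_online_cpus() helper the patch relies on (the helper itself is not shown in this hunk):

    // sketch: map every online CPU in 'cpus' to its NUMA node
    // assumes <sched.h>, <numa.h>, <unistd.h>
    int nr_cpus = (int)sysconf(_SC_NPROCESSORS_ONLN);  // stand-in for num_online_cpus()
    for (int i = 0; i < nr_cpus; ++i) {                // every online CPU, not 'sz' bytes
        if (CPU_ISSET_S(i, sz, cpus))                  // 'sz' is still the set's byte size
            numa_bitmask_setbit(new_nodes, numa_node_of_cpu(i));
    }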
@@ -124,6 +125,7 @@ int be_migrate_thread_to_cpu(pid_t tid, int target_cpu) | |||
124 | 125 | ||
125 | cpu_set = CPU_ALLOC(num_cpus); | 126 | cpu_set = CPU_ALLOC(num_cpus); |
126 | sz = CPU_ALLOC_SIZE(num_cpus); | 127 | sz = CPU_ALLOC_SIZE(num_cpus); |
128 | |||
127 | CPU_ZERO_S(sz, cpu_set); | 129 | CPU_ZERO_S(sz, cpu_set); |
128 | CPU_SET_S(target_cpu, sz, cpu_set); | 130 | CPU_SET_S(target_cpu, sz, cpu_set); |
129 | 131 | ||