author     Glenn Elliott <gelliott@cs.unc.edu>    2013-03-13 15:33:57 -0400
committer  Glenn Elliott <gelliott@cs.unc.edu>    2013-03-13 15:33:57 -0400
commit     944a78c21028da69fb53c0aec3e9dfdb048d47e4 (patch)
tree       bdbc77b3c0ff1337670a7e5d0f9d438388c1a866 /gpu
parent     f338b34ea0fb6136ea3895a07161ece030c4b998 (diff)
parent     1ff4fc699f01f0ad1359fad48b00c9d3be1b28b4 (diff)
Merge branch 'gh/staging' into temp

Conflicts:
	Makefile
	bin/rt_launch.c
	bin/rtspin.c
	src/task.c
Diffstat (limited to 'gpu')
-rw-r--r--  gpu/aux_threads.c        |  14
-rw-r--r--  gpu/dgl.c                |  20
-rw-r--r--  gpu/ikglptest.c          |  78
-rw-r--r--  gpu/locktest.c           |   2
-rw-r--r--  gpu/nested.c             |   4
-rw-r--r--  gpu/rtspin_fake_cuda.cpp | 206
6 files changed, 162 insertions(+), 162 deletions(-)
diff --git a/gpu/aux_threads.c b/gpu/aux_threads.c
index 1e168c6..1711c40 100644
--- a/gpu/aux_threads.c
+++ b/gpu/aux_threads.c
@@ -1,4 +1,4 @@
 /* based_mt_task.c -- A basic multi-threaded real-time task skeleton.
  *
  * This (by itself useless) task demos how to setup a multi-threaded LITMUS^RT
  * real-time task. Familiarity with the single threaded example (base_task.c)
@@ -48,7 +48,7 @@ struct thread_context {
 void* rt_thread(void *tcontext);
 void* aux_thread(void *tcontext);

 /* Declare the periodically invoked job.
  * Returns 1 -> task should exit.
  *         0 -> task should continue.
  */
@@ -112,7 +112,7 @@ int main(int argc, char** argv)

 	ctx = calloc(NUM_AUX_THREADS, sizeof(struct thread_context));
 	task = calloc(NUM_AUX_THREADS, sizeof(pthread_t));

 	//lt_t delay = ms2lt(1000);

 	/*****
@@ -199,9 +199,9 @@ int main(int argc, char** argv)
 			printf("child %d: %fs\n", i, time);
 		}
 	}


 	/*****
 	 * 6) Clean up, maybe print results and stats, and exit.
 	 */
 	return 0;
@@ -271,7 +271,7 @@ void* rt_thread(void *tcontext)

 	wait_for_ts_release();

 	/* The task is now executing as a real-time task if the call didn't fail.
 	 */


@@ -304,7 +304,7 @@ void* rt_thread(void *tcontext)
 	return ctx;
 }

 int job(void)
 {
 	/* Do real-time calculation. */

diff --git a/gpu/dgl.c b/gpu/dgl.c
@@ -177,7 +177,7 @@ void* rt_thread(void* _ctx)
 		xfprintf(stdout, "ikglp od = %d\n", ctx->ikglp);
 	}


 	for (i = 0; i < NUM_SEMS; i++) {
 		if(!USE_PRIOQ) {
 			ctx->od[i] = open_fifo_sem(ctx->fd, i+1);
@@ -208,29 +208,29 @@ void* rt_thread(void* _ctx)
 		int last = (first + NEST_DEPTH - 1 >= NUM_SEMS) ? NUM_SEMS - 1 : first + NEST_DEPTH - 1;
 		int dgl_size = last - first + 1;
 		int dgl[dgl_size];

 		// construct the DGL
 		for(i = first; i <= last; ++i) {
 			dgl[i-first] = ctx->od[i];
 		}


 		if(NUM_REPLICAS) {
 			replica = litmus_lock(ctx->ikglp);
 			xfprintf(stdout, "[%d] got ikglp replica %d.\n", ctx->id, replica);
 		}


 		litmus_dgl_lock(dgl, dgl_size);
 		xfprintf(stdout, "[%d] acquired dgl.\n", ctx->id);

 		do_exit = job(ctx);


 		xfprintf(stdout, "[%d] unlocking dgl.\n", ctx->id);
 		litmus_dgl_unlock(dgl, dgl_size);

 		if(NUM_REPLICAS) {
 			xfprintf(stdout, "[%d]: freeing ikglp replica %d.\n", ctx->id, replica);
 			litmus_unlock(ctx->ikglp);
 		}
@@ -249,7 +249,7 @@ void* rt_thread(void* _ctx)
 	return NULL;
 }

 void dirty_kb(int kb)
 {
 	int32_t one_kb[256];
 	int32_t sum = 0;
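
A note on the locking pattern above: gpu/dgl.c exercises LITMUS^RT dynamic group locks (DGLs), where a task assembles an array of lock object descriptors and acquires them all in a single request instead of nesting individual lock calls. A minimal sketch of that pattern, using only the calls that appear in the diff (the surrounding real-time task setup from the skeleton above is omitted, and fd/NEST_DEPTH are assumed to be set up as in the test):

	/* Sketch only: the DGL pattern from gpu/dgl.c, condensed. */
	int i, dgl[NEST_DEPTH];

	/* open one object descriptor per FIFO semaphore in the group */
	for (i = 0; i < NEST_DEPTH; i++)
		dgl[i] = open_fifo_sem(fd, i + 1);

	litmus_dgl_lock(dgl, NEST_DEPTH);    /* acquire all resources atomically */
	/* ... critical section spanning every resource in dgl[] ... */
	litmus_dgl_unlock(dgl, NEST_DEPTH);  /* release the whole group again */
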
diff --git a/gpu/ikglptest.c b/gpu/ikglptest.c
index f802801..30623b7 100644
--- a/gpu/ikglptest.c
+++ b/gpu/ikglptest.c
@@ -172,7 +172,7 @@ struct avg_info feedback(int _a, int _b)
 	}

 	stdev = sqrtf(devsum/(NUM_SAMPLES-1));

 	ret.avg = avg;
 	ret.stdev = stdev;

@@ -189,10 +189,10 @@ struct avg_info feedback(int _a, int _b)
 int main(int argc, char** argv)
 {
 	int i;
-	struct thread_context* ctx;
-	struct thread_context* aux_ctx;
-	pthread_t* task;
-	pthread_t* aux_task;
+	struct thread_context* ctx = NULL;
+	struct thread_context* aux_ctx = NULL;
+	pthread_t* task = NULL;
+	pthread_t* aux_task = NULL;
 	int fd;

 	int opt;
@@ -291,7 +291,7 @@ int main(int argc, char** argv)
 			}
 		}
 	}

 	printf("Best:\ta = %d\tb = %d\t(b-a) = %d\tavg = %6.2f\tstdev = %6.2f\n", best_a, best_b, best_b - best_a, best.avg, best.stdev);
 	printf("2nd:\ta = %d\tb = %d\t(b-a) = %d\tavg = %6.2f\tstdev = %6.2f\n", second_best_a, second_best_b, second_best_b - second_best_a, second_best.avg, second_best.stdev);

@@ -308,7 +308,7 @@ int main(int argc, char** argv)
 	}

 	printf("Aaron:\tavg = %6.2f\tstd = %6.2f\n", avg_accum/TRIALS, std_accum/TRIALS);




@@ -385,7 +385,7 @@ int affinity_distance(struct thread_context* ctx, int a, int b)
 {
 	int i;
 	int dist;

 	if(a >= 0 && b >= 0) {
 		for(i = 0; i <= 3; ++i) {
 			if(a>>i == b>>i) {
@@ -397,25 +397,25 @@ int affinity_distance(struct thread_context* ctx, int a, int b)
 	}
 	else {
 		dist = 0;
 	}

 out:
 	//printf("[%d]: distance: %d -> %d = %d\n", ctx->id, a, b, dist);

 	++(ctx->mig_count[dist]);

 	return dist;

 //	int groups[] = {2, 4, 8};
 //	int i;
 //
 //	if(a < 0 || b < 0)
 //		return (sizeof(groups)/sizeof(groups[0]));  // worst affinity
 //
 //	// no migration
 //	if(a == b)
 //		return 0;
 //
 //	for(i = 0; i < sizeof(groups)/sizeof(groups[0]); ++i) {
 //		if(a/groups[i] == b/groups[i])
 //			return (i+1);
@@ -441,7 +441,7 @@ void* rt_thread(void* _ctx)
 {
 	int i;
 	int do_exit = 0;
 	int last_replica = -1;

 	struct thread_context *ctx = (struct thread_context*)_ctx;

@@ -472,13 +472,13 @@ void* rt_thread(void* _ctx)
 						IKGLP_OPTIMAL_FIFO_LEN :
 						IKGLP_UNLIMITED_FIFO_LEN,
 					ENABLE_AFFINITY
 					);
 	}
 	if(ctx->kexclu < 0)
 		perror("open_kexclu_sem");
 	else
 		printf("kexclu od = %d\n", ctx->kexclu);

 	for (i = 0; i < NUM_SEMS; ++i) {
 		if(!USE_PRIOQ) {
 			ctx->od[i] = open_fifo_sem(ctx->fd, i + ctx->kexclu + 2);
@@ -508,21 +508,21 @@ void* rt_thread(void* _ctx)
 		int dgl_size = last - first + 1;
 		int replica = -1;
 		int distance;

 		int dgl[dgl_size];

 		// construct the DGL
 		for(i = first; i <= last; ++i) {
 			dgl[i-first] = ctx->od[i];
 		}

 		replica = litmus_lock(ctx->kexclu);

 		//printf("[%d] got kexclu replica %d.\n", ctx->id, replica);
 		//fflush(stdout);

 		distance = affinity_distance(ctx, replica, last_replica);

 		if(USE_DYNAMIC_GROUP_LOCKS) {
 			litmus_dgl_lock(dgl, dgl_size);
 		}
@@ -531,24 +531,24 @@ void* rt_thread(void* _ctx)
 				litmus_lock(dgl[i]);
 			}
 		}

 		//do_exit = nested_job(ctx, &count, &first, affinity_cost[distance]);
 		do_exit = job(ctx, affinity_cost[distance]);

 		if(USE_DYNAMIC_GROUP_LOCKS) {
 			litmus_dgl_unlock(dgl, dgl_size);
 		}
 		else {
 			for(i = dgl_size - 1; i >= 0; --i) {
 				litmus_unlock(dgl[i]);
 			}
 		}

 		//printf("[%d]: freeing kexclu replica %d.\n", ctx->id, replica);
 		//fflush(stdout);

 		litmus_unlock(ctx->kexclu);

 		last_replica = replica;

 		if(SLEEP_BETWEEN_JOBS && !do_exit) {
@@ -567,7 +567,7 @@ void* rt_thread(void* _ctx)
 	 */
 	TH_CALL( task_mode(BACKGROUND_TASK) );

 	for(i = 0; i < sizeof(ctx->mig_count)/sizeof(ctx->mig_count[0]); ++i)
 	{
 		printf("[%d]: mig_count[%d] = %d\n", ctx->id, i, ctx->mig_count[i]);
 	}
@@ -608,15 +608,15 @@ void* rt_thread(void* _ctx)
 //}


 void dirty_kb(int kb)
 {
 	int32_t one_kb[256];
 	int32_t sum = 0;
 	int32_t i;

 	if(!kb)
 		return;

 	for (i = 0; i < 256; i++)
 		sum += one_kb[i];
 	kb--;
@@ -630,9 +630,9 @@ void dirty_kb(int kb)
 int job(struct thread_context* ctx, int runfactor)
 {
 	//struct timespec tosleep = {0, 100000}; // 0.1 ms

 	//printf("[%d]: runfactor = %d\n", ctx->id, runfactor);

 	//dirty_kb(8 * runfactor);
 	dirty_kb(1 * runfactor);
 	//nanosleep(&tosleep, NULL);
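
For reference, the per-job pattern that the ikglptest.c hunks above exercise can be condensed as follows (sketch only; ctx, dgl, dgl_size, last_replica, and affinity_cost[] are as set up earlier in rt_thread, and ctx->kexclu is the k-exclusion GPU lock opened at the top of the function):

	/* Sketch only: one job iteration from gpu/ikglptest.c, condensed. */
	replica = litmus_lock(ctx->kexclu);                        /* acquire one of the k GPU replicas */
	distance = affinity_distance(ctx, replica, last_replica);  /* how far did we migrate? */

	if(USE_DYNAMIC_GROUP_LOCKS)
		litmus_dgl_lock(dgl, dgl_size);                        /* nested resources in one request */
	else
		for(i = 0; i < dgl_size; ++i)
			litmus_lock(dgl[i]);                               /* or individually, in order */

	do_exit = job(ctx, affinity_cost[distance]);               /* job length scaled by migration cost */

	if(USE_DYNAMIC_GROUP_LOCKS)
		litmus_dgl_unlock(dgl, dgl_size);
	else
		for(i = dgl_size - 1; i >= 0; --i)
			litmus_unlock(dgl[i]);                             /* release in reverse order */

	litmus_unlock(ctx->kexclu);                                /* hand the GPU replica back */
	last_replica = replica;                                    /* remembered for the next affinity check */
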
diff --git a/gpu/locktest.c b/gpu/locktest.c
index bc4fc54..6a1219a 100644
--- a/gpu/locktest.c
+++ b/gpu/locktest.c
@@ -177,7 +177,7 @@ void* rt_thread(void* _ctx)
 	return NULL;
 }

 void dirty_kb(int kb)
 {
 	int32_t one_kb[256];
 	int32_t sum = 0;
diff --git a/gpu/nested.c b/gpu/nested.c
index 8c39152..edec46b 100644
--- a/gpu/nested.c
+++ b/gpu/nested.c
@@ -180,7 +180,7 @@ void* rt_thread(void* _ctx)
 		int first = (int)(NUM_SEMS * (rand_r(&(ctx->rand)) / (RAND_MAX + 1.0)));
 		int count = NEST_DEPTH;
 		do_exit = nested_job(ctx, &count, &first);

 		if(SLEEP_BETWEEN_JOBS && !do_exit) {
 			sleep_next_period();
 		}
@@ -226,7 +226,7 @@ int nested_job(struct thread_context* ctx, int *count, int *next)



 void dirty_kb(int kb)
 {
 	int32_t one_kb[256];
 	int32_t sum = 0;
diff --git a/gpu/rtspin_fake_cuda.cpp b/gpu/rtspin_fake_cuda.cpp
index 78e4f60..247a74c 100644
--- a/gpu/rtspin_fake_cuda.cpp
+++ b/gpu/rtspin_fake_cuda.cpp
@@ -119,7 +119,7 @@ char *h_state_data = 0;
 	mmap(NULL, s ,                                  \
 		PROT_READ | PROT_WRITE,                     \
 		MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED,   \
 		-1, 0)
 #else
 #define c_malloc(s) malloc(s)
 #endif
@@ -144,38 +144,38 @@ cudaError_t cudaGetLastError()
 ////////////////////////////////////////////////////////////////////////

 struct ce_lock_state
 {
 	int locks[2];
 	size_t num_locks;
 	size_t budget_remaining;
 	bool locked;

 	ce_lock_state(int device_a, enum cudaMemcpyKind kind, size_t size, int device_b = -1) {
 		num_locks = (device_a != -1) + (device_b != -1);

 		if(device_a != -1) {
 			locks[0] = (kind == cudaMemcpyHostToDevice) ?
 				CE_SEND_LOCKS[device_a] : CE_RECV_LOCKS[device_a];
 		}

 		if(device_b != -1) {
 			assert(kind == cudaMemcpyDeviceToDevice);

 			locks[1] = CE_RECV_LOCKS[device_b];

 			if(locks[1] < locks[0]) {
 				int temp = locks[1];
 				locks[1] = locks[0];
 				locks[0] = temp;
 			}
 		}

 		if(!ENABLE_CHUNKING)
 			budget_remaining = size;
 		else
 			budget_remaining = CHUNK_SIZE;
 	}

 	void lock() {
 		if(USE_DYNAMIC_GROUP_LOCKS) {
 			litmus_dgl_lock(locks, num_locks);
@@ -189,7 +189,7 @@ struct ce_lock_state
 		}
 		locked = true;
 	}

 	void unlock() {
 		if(USE_DYNAMIC_GROUP_LOCKS) {
 			litmus_dgl_unlock(locks, num_locks);
@@ -204,15 +204,15 @@ struct ce_lock_state
 		}
 		locked = false;
 	}

 	void refresh() {
 		budget_remaining = CHUNK_SIZE;
 	}

 	bool budgetIsAvailable(size_t tosend) {
 		return(tosend >= budget_remaining);
 	}

 	void decreaseBudget(size_t spent) {
 		budget_remaining -= spent;
 	}
@@ -225,53 +225,53 @@ cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count,
 {
 	cudaError_t ret = cudaSuccess;
 	int remaining = count;

 	char* dst = (char*)a_dst;
 	const char* src = (const char*)a_src;

 	// disable chunking, if needed, by setting chunk_size equal to the
 	// amount of data to be copied.
 	int chunk_size = (ENABLE_CHUNKING) ? CHUNK_SIZE : count;
 	int i = 0;

 	while(remaining != 0)
 	{
 		int bytesToCopy = std::min(remaining, chunk_size);

 		if(state && state->budgetIsAvailable(bytesToCopy) && state->locked) {
 			//cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) );
 			ret = cudaGetLastError();

 			if(ret != cudaSuccess)
 			{
 				break;
 			}

 			state->unlock();
 			state->refresh(); // replentish.
 							  // we can only run out of
 							  // budget if chunking is enabled.
 							  // we presume that init budget would
 							  // be set to cover entire memcpy
 							  // if chunking were disabled.
 		}

 		if(state && !state->locked) {
 			state->lock();
 		}

 		//ret = cudaMemcpy(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind);
 		//cudaMemcpyAsync(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind, streams[CUR_DEVICE]);

 		if(state) {
 			state->decreaseBudget(bytesToCopy);
 		}

 //		if(ret != cudaSuccess)
 //		{
 //			break;
 //		}

 		++i;
 		remaining -= bytesToCopy;
 	}
@@ -281,7 +281,7 @@ cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count,
 cudaError_t chunkMemcpy(void* a_dst, const void* a_src, size_t count,
 			enum cudaMemcpyKind kind,
 			int device_a = -1,  // device_a == -1 disables locking
 			bool do_locking = true,
 			int device_b = -1)
 {
 	cudaError_t ret;
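
chunkMemcpy() above is the wrapper the rest of this file uses for copies; the call sites further down show the intended usage. A short sketch (argument values are illustrative, modeled on the calls that appear later in the diff):

	// Sketch only: chunkMemcpy() usage as seen at the call sites below.
	// Device -> host copy with copy-engine locking for the current GPU:
	chunkMemcpy(h_state_data, d_state_data[CUR_DEVICE], STATE_SIZE,
				cudaMemcpyDeviceToHost, CUR_DEVICE, useEngineLocks());

	// Leaving device_a at its default of -1 skips the ce_lock_state path
	// entirely, so the copy proceeds without any engine locking:
	chunkMemcpy(h_recv_data, d_recv_data[CUR_DEVICE], RECV_SIZE,
				cudaMemcpyDeviceToHost);
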
@@ -317,7 +317,7 @@ inline uint64_t timespec_to_ns(const struct timespec& t)
 inline struct timespec ns_to_timespec(const uint64_t& ns)
 {
 	struct timespec temp = {ns/1e9, ns - ns/1e9};
 	return(temp);
 }

 inline uint64_t clock_gettime_ns(clockid_t clk_id)
@@ -366,9 +366,9 @@ static void allocate_locks()
 {
 	// allocate k-FMLP lock
 	int fd = open("semaphores", O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);

 	int base_name = GPU_PARTITION * 1000;

 	if(USE_KFMLP) {
 		KEXCLU_LOCK = open_kfmlp_gpu_sem(fd,
 			base_name,  /* name */
@@ -397,7 +397,7 @@ static void allocate_locks()
 //			NUM_SIMULT_USERS,
 //			ENABLE_AFFINITY,
 //			RELAX_FIFO_MAX_LEN
 //			);
 	}
 	if(KEXCLU_LOCK < 0)
 		perror("open_kexclu_sem");
@@ -406,31 +406,31 @@ static void allocate_locks()
 	{
 		open_sem_t opensem = (!USE_PRIOQ) ? open_fifo_sem : open_prioq_sem;
 		const char* opensem_label = (!USE_PRIOQ) ? "open_fifo_sem" : "open_prioq_sem";

 		// allocate the engine locks.
 		for (int i = 0; i < MAX_GPUS; ++i)
 		{
 			EE_LOCKS[i] = opensem(fd, (i+1)*10 + base_name);
 			if(EE_LOCKS[i] < 0)
 				perror(opensem_label);

 			CE_SEND_LOCKS[i] = opensem(fd, (i+1)*10 + base_name + 1);
 			if(CE_SEND_LOCKS[i] < 0)
 				perror(opensem_label);

 			if(NUM_SIMULT_USERS == 3)
 			{
 				// allocate a separate lock for the second copy engine
 				CE_RECV_LOCKS[i] = opensem(fd, (i+1)*10 + base_name + 2);
 				if(CE_RECV_LOCKS[i] < 0)
 					perror(opensem_label);
 			}
 			else
 			{
 				// share a single lock for the single copy engine
 				CE_RECV_LOCKS[i] = CE_SEND_LOCKS[i];
 			}
 		}
 	}
 }

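
The arithmetic in allocate_locks() above gives every GPU partition a disjoint block of semaphore names. A small worked example (GPU_PARTITION = 1 is an assumed, illustrative value):

	// Sketch only: the lock-name layout produced by allocate_locks().
	// With GPU_PARTITION = 1, base_name = 1 * 1000 = 1000, so for GPU i:
	//   EE_LOCKS[i]      -> (i+1)*10 + 1000      (1010, 1020, ...)  execution engine
	//   CE_SEND_LOCKS[i] -> (i+1)*10 + 1000 + 1  (1011, 1021, ...)  copy engine, send direction
	//   CE_RECV_LOCKS[i] -> (i+1)*10 + 1000 + 2  (1012, 1022, ...)  only when NUM_SIMULT_USERS == 3;
	//                                            otherwise it aliases CE_SEND_LOCKS[i].
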
@@ -449,22 +449,22 @@ static void allocate_host_memory()
 //		h_send_data = (char *)c_malloc(send_alloc_bytes);
 //		memset(h_send_data, 0x55, send_alloc_bytes);  // write some random value
 //		// this will open a connection to GPU 0 if there is no active context, so
 //		// expect long stalls.  LAME.
 //		cutilSafeCall( cudaHostRegister(h_send_data, send_alloc_bytes, cudaHostRegisterPortable) );
 //	}
 //
 //	if(recv_alloc_bytes > 0)
 //	{
 //		h_recv_data = (char *)c_malloc(recv_alloc_bytes);
 //		memset(h_recv_data, 0xAA, recv_alloc_bytes);
 //		cutilSafeCall( cudaHostRegister(h_recv_data, recv_alloc_bytes, cudaHostRegisterPortable) );
 //	}
 //
 //	if(state_alloc_bytes > 0)
 //	{
 //		h_state_data = (char *)c_malloc(state_alloc_bytes);
 //		memset(h_state_data, 0xCC, state_alloc_bytes);  // write some random value
 //		cutilSafeCall( cudaHostRegister(h_state_data, state_alloc_bytes, cudaHostRegisterPortable) );
 //	}

 	printf("Host memory allocated.\n");
@@ -477,28 +477,28 @@ static void allocate_device_memory()
 //	for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
 //	{
 //		int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
 //
 //		if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].lock();
 //
 //		cutilSafeCall( cudaSetDevice(which_device) );
 //		cutilSafeCall( cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0) );
 //		cutilSafeCall( cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0) );
 //
 //		cutilSafeCall( cudaStreamCreate(&streams[which_device]) );
 //
 //		/* pre-allocate memory, pray there's enough to go around */
 //		if(SEND_SIZE > 0) {
 //			cutilSafeCall( cudaMalloc((void**)&d_send_data[which_device], SEND_SIZE) );
 //		}
 //		if(RECV_SIZE > 0) {
 //			cutilSafeCall( cudaMalloc((void**)&h_recv_data[which_device], RECV_SIZE) );
 //		}
 //		if(STATE_SIZE > 0) {
 //			cutilSafeCall( cudaMalloc((void**)&h_state_data[which_device], STATE_SIZE) );
 //		}
 //
 //		if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].unlock();
 //	}
 	printf("Device memory allocated.\n");
 }

@@ -508,39 +508,39 @@ static void configure_gpus()

 //	// SUSPEND WHEN BLOCKED!!
 //	cutilSafeCall( cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync) );
 //
 //	// establish a connection to each GPU.
 //	for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
 //	{
 //		int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
 //
 //		if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].lock();
 //
 //		cutilSafeCall( cudaSetDevice(which_device) );
 //		cutilSafeCall( cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0) );
 //		cutilSafeCall( cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0) );
 //
 //		cutilSafeCall( cudaStreamCreate(&streams[which_device]) );
 //
 //		// enable P2P migrations.
 //		// we assume all GPUs are on the same I/O hub.
 //		for(int j = 0; j < GPU_PARTITION_SIZE; ++j)
 //		{
 //			int other_device = GPU_PARTITION*GPU_PARTITION_SIZE + j;
 //
 //			if(which_device != other_device)
 //			{
 //				cutilSafeCall( cudaDeviceEnablePeerAccess(other_device, 0) );
 //			}
 //		}
 //
 //		if(i == 0)
 //		{
 //			struct cudaDeviceProp pi;
 //			cudaGetDeviceProperties(&pi, i);
 //			gpuCyclesPerSecond = pi.clockRate * 1000;  /* khz -> hz */
 //		}
 //
 //		if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].unlock();
 //	}

@@ -580,7 +580,7 @@ static void catchExit(void)
 			for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
 			{
 				int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;

 				litmus_unlock(EE_LOCKS[which_device]);
 				litmus_unlock(CE_SEND_LOCKS[which_device]);
 				if(NUM_SIMULT_USERS == 2) {
@@ -588,11 +588,11 @@ static void catchExit(void)
 				}
 			}
 		}

 		if(CUR_DEVICE >= 0) {
 			unregister_nv_device(CUR_DEVICE);
 		}

 		litmus_unlock(KEXCLU_LOCK);
 	}
 }
@@ -604,18 +604,18 @@ static void migrateToGPU(int destination)
 		if(MIGRATE_VIA_SYSMEM)
 		{
 			chunkMemcpy(h_state_data, d_state_data[LAST_DEVICE], STATE_SIZE,
 						cudaMemcpyDeviceToHost, LAST_DEVICE, useEngineLocks());
 		}
 	}

 //	cutilSafeCall( cudaSetDevice(destination) );

 	if(!BROADCAST_STATE && STATE_SIZE > 0)
 	{
 		if(MIGRATE_VIA_SYSMEM)
 		{
 			chunkMemcpy(d_state_data[CUR_DEVICE], h_state_data, STATE_SIZE,
 						cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks());
 		}
 		else
 		{
@@ -633,15 +633,15 @@ static void migrateToGPU(int destination)
 static void broadcastState(int from)
 {
 	if(STATE_SIZE > 0)
 	{
 		assert(CUR_DEVICE == from);

 		if(MIGRATE_VIA_SYSMEM)
 		{
 			chunkMemcpy(h_state_data, d_state_data[from], STATE_SIZE,
 						cudaMemcpyDeviceToHost, from, useEngineLocks());
 		}

 		for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
 		{
 			int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
@@ -652,7 +652,7 @@ static void broadcastState(int from)
 //				cutilSafeCall( cudaSetDevice(which_device) );
 				CUR_DEVICE = which_device;  // temporary
 				chunkMemcpy(d_state_data[which_device], h_state_data, STATE_SIZE,
 							cudaMemcpyHostToDevice, which_device, useEngineLocks());
 			}
 			else
 			{
@@ -662,11 +662,11 @@ static void broadcastState(int from)
 							cudaMemcpyDeviceToDevice,
 							from,
 							useEngineLocks(),
 							which_device);
 			}
 		}
 	}

 	if(MIGRATE_VIA_SYSMEM && CUR_DEVICE != from)
 	{
 //		cutilSafeCall( cudaSetDevice(from) );
@@ -714,18 +714,18 @@ static void gpu_loop_for(double gpu_sec_time, double emergency_exit)
 	}

 	if(useEngineLocks()) litmus_lock(EE_LOCKS[CUR_DEVICE]);

 //	docudaspin <<<numblocks,blocksz, 0, streams[CUR_DEVICE]>>> (numcycles);
 //	cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) );

 	if(useEngineLocks()) litmus_unlock(EE_LOCKS[CUR_DEVICE]);

 	if(RECV_SIZE > 0)
 	{
 		chunkMemcpy(h_recv_data, d_recv_data[CUR_DEVICE], RECV_SIZE,
 					cudaMemcpyDeviceToHost, CUR_DEVICE, useEngineLocks());
 	}

 	if(BROADCAST_STATE)
 	{
 		broadcastState(CUR_DEVICE);
@@ -802,7 +802,7 @@ int main(int argc, char** argv)
 	int num_tasks = 0;

 	double gpu_sec_ms = 0;

 	while ((opt = getopt(argc, argv, OPTSTR)) != -1) {
 //		printf("opt = %c optarg = %s\n", opt, optarg);
 		switch (opt) {
@@ -858,7 +858,7 @@ int main(int argc, char** argv)
 			break;
 		case 'r':
 			RELAX_FIFO_MAX_LEN = true;
 			break;
 		case 'L':
 			USE_KFMLP = true;
 			break;
@@ -949,13 +949,13 @@ int main(int argc, char** argv)
 		{
 			printf("%d creating release shared memory\n", getpid());
 			shared_memory_object::remove("release_barrier_memory");
 			release_segment_ptr = new managed_shared_memory(create_only, "release_barrier_memory", 4*1024);

 			printf("%d creating release barrier for %d users\n", getpid(), num_tasks);
 			release_barrier = release_segment_ptr->construct<barrier>("barrier release_barrier")(num_tasks);

 			init_release_time = release_segment_ptr->construct<uint64_t>("uint64_t instance")();
 			*init_release_time = 0;
 		}
 		else
 		{
@@ -972,13 +972,13 @@ int main(int argc, char** argv)
 					sleep(1);
 				}
 			}while(segment_ptr == NULL);

 			release_barrier = segment_ptr->find<barrier>("barrier release_barrier").first;
 			init_release_time = segment_ptr->find<uint64_t>("uint64_t instance").first;
 		}
 	}


 	if(GPU_TASK)
 	{
 		if(ENABLE_WAIT)
@@ -1019,7 +1019,7 @@ int main(int argc, char** argv)
 		SEND_SIZE *= scale;
 		RECV_SIZE *= scale;
 		STATE_SIZE *= scale;

 		init_cuda();
 	}

@@ -1036,16 +1036,16 @@ int main(int argc, char** argv)
 	if (ret != 0)
 		bail_out("could not become RT task");



 	uint64_t jobCount = 0;
 	blitz::Array<uint64_t, 1> responseTimeLog(num_jobs+1);

 	struct timespec spec;
 	uint64_t release;
 	uint64_t finish;


 	if (ENABLE_WAIT) {
 		printf("Waiting for release.\n");
 		ret = wait_for_ts_release();
@@ -1056,14 +1056,14 @@ int main(int argc, char** argv)
 	{
 		sleep_next_period();
 	}

 	clock_gettime(CLOCK_MONOTONIC, &spec);
 	release = timespec_to_ns(spec);
 	if (!__sync_bool_compare_and_swap(init_release_time, 0, release))
 	{
 		release = *init_release_time;
 	}

 	releaseTime = wctime();
 	double failsafeEnd = releaseTime + duration;

@@ -1087,7 +1087,7 @@ int main(int argc, char** argv)
 		clock_gettime(CLOCK_MONOTONIC, &spec);
 		finish = timespec_to_ns(spec);

 		responseTimeLog(min(num_jobs,jobCount++)) = finish - release;

 		// this is an estimated upper-bound on release time. it may be off by several microseconds.
 #ifdef RESET_RELEASE_ON_MISS
@@ -1097,11 +1097,11 @@ int main(int argc, char** argv)
 #else
 		release = release + period;  // allow things to get progressively later.
 #endif

 		sleep_next_period();
 		clock_gettime(CLOCK_MONOTONIC, &spec);
 		release = min(timespec_to_ns(spec), release);

 	} while(keepGoing);
 	}

@@ -1147,13 +1147,13 @@ int main(int argc, char** argv)
 		}
 	}


 	if (ENABLE_WAIT)
 	{
 		printf("%d waiting at exit barrier\n", getpid());
 		release_barrier->wait();
 	}


 	char gpu_using_str[] = "GPU\n";
 	char cpu_only_str[] = "CPU\n";
@@ -1166,7 +1166,7 @@ int main(int argc, char** argv)
 		// average
 		blitz::mean(USED(responseTimeLog)),
 		// average pct of period
 		100.0*(blitz::mean(USED(responseTimeLog))/period),
 		// min
 		blitz::min(USED(responseTimeLog)),
 		// max
@@ -1182,6 +1182,6 @@ int main(int argc, char** argv)
 		// flag gpu-using tasks
 		((GPU_TASK) ? gpu_using_str : cpu_only_str)
 		);

 	return 0;
 }