path: root/gpu
author    Glenn Elliott <gelliott@cs.unc.edu>  2013-03-13 15:33:57 -0400
committer Glenn Elliott <gelliott@cs.unc.edu>  2013-03-13 15:33:57 -0400
commit    944a78c21028da69fb53c0aec3e9dfdb048d47e4 (patch)
tree      bdbc77b3c0ff1337670a7e5d0f9d438388c1a866 /gpu
parent    f338b34ea0fb6136ea3895a07161ece030c4b998 (diff)
parent    1ff4fc699f01f0ad1359fad48b00c9d3be1b28b4 (diff)
Merge branch 'gh/staging' into temp
Conflicts:
	Makefile
	bin/rt_launch.c
	bin/rtspin.c
	src/task.c
Diffstat (limited to 'gpu')
-rw-r--r--  gpu/aux_threads.c        |  14
-rw-r--r--  gpu/dgl.c                |  20
-rw-r--r--  gpu/ikglptest.c          |  78
-rw-r--r--  gpu/locktest.c           |   2
-rw-r--r--  gpu/nested.c             |   4
-rw-r--r--  gpu/rtspin_fake_cuda.cpp | 206
6 files changed, 162 insertions(+), 162 deletions(-)
diff --git a/gpu/aux_threads.c b/gpu/aux_threads.c
index 1e168c6..1711c40 100644
--- a/gpu/aux_threads.c
+++ b/gpu/aux_threads.c
@@ -1,4 +1,4 @@
 /* based_mt_task.c -- A basic multi-threaded real-time task skeleton.
  *
  * This (by itself useless) task demos how to setup a multi-threaded LITMUS^RT
  * real-time task. Familiarity with the single threaded example (base_task.c)
@@ -48,7 +48,7 @@ struct thread_context {
 void* rt_thread(void *tcontext);
 void* aux_thread(void *tcontext);

 /* Declare the periodically invoked job.
  * Returns 1 -> task should exit.
  *         0 -> task should continue.
  */
@@ -112,7 +112,7 @@ int main(int argc, char** argv)

 	ctx = calloc(NUM_AUX_THREADS, sizeof(struct thread_context));
 	task = calloc(NUM_AUX_THREADS, sizeof(pthread_t));

 	//lt_t delay = ms2lt(1000);

 	/*****
@@ -199,9 +199,9 @@ int main(int argc, char** argv)
 			printf("child %d: %fs\n", i, time);
 		}
 	}


 	/*****
 	 * 6) Clean up, maybe print results and stats, and exit.
 	 */
 	return 0;
@@ -271,7 +271,7 @@ void* rt_thread(void *tcontext)

 	wait_for_ts_release();

 	/* The task is now executing as a real-time task if the call didn't fail.
 	 */


@@ -304,7 +304,7 @@ void* rt_thread(void *tcontext)
 	return ctx;
 }

 int job(void)
 {
 	/* Do real-time calculation. */

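For context, the aux_threads.c hunks above all touch the standard multi-threaded LITMUS^RT skeleton. Below is a minimal sketch (not part of the diff) of the per-thread pattern those hunks exercise. wait_for_ts_release(), sleep_next_period(), TH_CALL() and task_mode(BACKGROUND_TASK) appear verbatim in this diff; task_mode(LITMUS_RT_TASK) is assumed from the base_mt_task.c skeleton the file says it is based on, and job() is a stand-in stub.

/* Minimal sketch of the per-thread skeleton the hunks above belong to.
 * LITMUS_RT_TASK is an assumption taken from the standard skeleton;
 * job() is a stand-in stub, not the real workload. */
#include <litmus.h>

static int jobs_done;

static int job(void)
{
	/* Do real-time calculation here; ask to exit after 100 jobs. */
	return (++jobs_done >= 100);
}

void* rt_thread(void *tcontext)
{
	int do_exit;

	TH_CALL( task_mode(LITMUS_RT_TASK) );   /* enter real-time mode (assumed) */
	wait_for_ts_release();                  /* block until the task set is released */

	/* The task is now executing as a real-time task if the call didn't fail. */
	do {
		sleep_next_period();            /* suspend until the next period */
		do_exit = job();                /* one job per period */
	} while (!do_exit);

	TH_CALL( task_mode(BACKGROUND_TASK) );  /* leave real-time mode */
	return tcontext;
}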
diff --git a/gpu/dgl.c b/gpu/dgl.c
index dc68ead..42a3ae2 100644
--- a/gpu/dgl.c
+++ b/gpu/dgl.c
@@ -177,7 +177,7 @@ void* rt_thread(void* _ctx)
 		xfprintf(stdout, "ikglp od = %d\n", ctx->ikglp);
 	}


 	for (i = 0; i < NUM_SEMS; i++) {
 		if(!USE_PRIOQ) {
 			ctx->od[i] = open_fifo_sem(ctx->fd, i+1);
@@ -208,29 +208,29 @@ void* rt_thread(void* _ctx)
 		int last = (first + NEST_DEPTH - 1 >= NUM_SEMS) ? NUM_SEMS - 1 : first + NEST_DEPTH - 1;
 		int dgl_size = last - first + 1;
 		int dgl[dgl_size];

 		// construct the DGL
 		for(i = first; i <= last; ++i) {
 			dgl[i-first] = ctx->od[i];
 		}


 		if(NUM_REPLICAS) {
 			replica = litmus_lock(ctx->ikglp);
 			xfprintf(stdout, "[%d] got ikglp replica %d.\n", ctx->id, replica);
 		}


 		litmus_dgl_lock(dgl, dgl_size);
 		xfprintf(stdout, "[%d] acquired dgl.\n", ctx->id);

 		do_exit = job(ctx);


 		xfprintf(stdout, "[%d] unlocking dgl.\n", ctx->id);
 		litmus_dgl_unlock(dgl, dgl_size);

 		if(NUM_REPLICAS) {
 			xfprintf(stdout, "[%d]: freeing ikglp replica %d.\n", ctx->id, replica);
 			litmus_unlock(ctx->ikglp);
 		}
@@ -249,7 +249,7 @@ void* rt_thread(void* _ctx)
 	return NULL;
 }

 void dirty_kb(int kb)
 {
 	int32_t one_kb[256];
 	int32_t sum = 0;
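The dgl.c hunks above center on liblitmus dynamic group locks (DGLs). Below is a condensed, hedged sketch of the idiom shown in the diff — build an array of lock object descriptors, then acquire and release the whole group atomically — using only calls that appear above (open_fifo_sem, litmus_dgl_lock, litmus_dgl_unlock); NUM_SEMS, NEST_DEPTH and the critical section are illustrative, not taken from the real configuration.

/* Condensed sketch of the DGL idiom from gpu/dgl.c above.  The od[] values
 * are object descriptors previously returned by open_fifo_sem(), as in the
 * diff; constants and the critical section are illustrative. */
#include <litmus.h>

#define NUM_SEMS   10
#define NEST_DEPTH 5

static void lock_group(int od[NUM_SEMS], int first)
{
	int i;
	int last = (first + NEST_DEPTH - 1 >= NUM_SEMS) ?
			NUM_SEMS - 1 : first + NEST_DEPTH - 1;
	int dgl_size = last - first + 1;
	int dgl[dgl_size];

	/* construct the DGL: one object descriptor per group member */
	for (i = first; i <= last; ++i)
		dgl[i - first] = od[i];

	litmus_dgl_lock(dgl, dgl_size);    /* acquire all members atomically */
	/* ... critical section spanning all locked resources ... */
	litmus_dgl_unlock(dgl, dgl_size);  /* release the whole group */
}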
diff --git a/gpu/ikglptest.c b/gpu/ikglptest.c
index f802801..30623b7 100644
--- a/gpu/ikglptest.c
+++ b/gpu/ikglptest.c
@@ -172,7 +172,7 @@ struct avg_info feedback(int _a, int _b)
 	}

 	stdev = sqrtf(devsum/(NUM_SAMPLES-1));

 	ret.avg = avg;
 	ret.stdev = stdev;

@@ -189,10 +189,10 @@ struct avg_info feedback(int _a, int _b)
 int main(int argc, char** argv)
 {
 	int i;
-	struct thread_context* ctx;
-	struct thread_context* aux_ctx;
-	pthread_t* task;
-	pthread_t* aux_task;
+	struct thread_context* ctx = NULL;
+	struct thread_context* aux_ctx = NULL;
+	pthread_t* task = NULL;
+	pthread_t* aux_task = NULL;
 	int fd;

 	int opt;
@@ -291,7 +291,7 @@ int main(int argc, char** argv)
 			}
 		}
 	}

 	printf("Best:\ta = %d\tb = %d\t(b-a) = %d\tavg = %6.2f\tstdev = %6.2f\n", best_a, best_b, best_b - best_a, best.avg, best.stdev);
 	printf("2nd:\ta = %d\tb = %d\t(b-a) = %d\tavg = %6.2f\tstdev = %6.2f\n", second_best_a, second_best_b, second_best_b - second_best_a, second_best.avg, second_best.stdev);

@@ -308,7 +308,7 @@ int main(int argc, char** argv)
 	}

 	printf("Aaron:\tavg = %6.2f\tstd = %6.2f\n", avg_accum/TRIALS, std_accum/TRIALS);




@@ -385,7 +385,7 @@ int affinity_distance(struct thread_context* ctx, int a, int b)
 {
 	int i;
 	int dist;

 	if(a >= 0 && b >= 0) {
 		for(i = 0; i <= 3; ++i) {
 			if(a>>i == b>>i) {
@@ -397,25 +397,25 @@ int affinity_distance(struct thread_context* ctx, int a, int b)
 		}
 	}
 	else {
 		dist = 0;
 	}

 out:
 	//printf("[%d]: distance: %d -> %d = %d\n", ctx->id, a, b, dist);

 	++(ctx->mig_count[dist]);

 	return dist;

 //	int groups[] = {2, 4, 8};
 //	int i;
 //
 //	if(a < 0 || b < 0)
 //		return (sizeof(groups)/sizeof(groups[0]));  // worst affinity
 //
 //	// no migration
 //	if(a == b)
 //		return 0;
 //
 //	for(i = 0; i < sizeof(groups)/sizeof(groups[0]); ++i) {
 //		if(a/groups[i] == b/groups[i])
 //			return (i+1);
@@ -441,7 +441,7 @@ void* rt_thread(void* _ctx)
 {
 	int i;
 	int do_exit = 0;
 	int last_replica = -1;

 	struct thread_context *ctx = (struct thread_context*)_ctx;

@@ -472,13 +472,13 @@ void* rt_thread(void* _ctx)
 					IKGLP_OPTIMAL_FIFO_LEN :
 					IKGLP_UNLIMITED_FIFO_LEN,
 				ENABLE_AFFINITY
 				);
 	}
 	if(ctx->kexclu < 0)
 		perror("open_kexclu_sem");
 	else
 		printf("kexclu od = %d\n", ctx->kexclu);

 	for (i = 0; i < NUM_SEMS; ++i) {
 		if(!USE_PRIOQ) {
 			ctx->od[i] = open_fifo_sem(ctx->fd, i + ctx->kexclu + 2);
@@ -508,21 +508,21 @@ void* rt_thread(void* _ctx)
 		int dgl_size = last - first + 1;
 		int replica = -1;
 		int distance;

 		int dgl[dgl_size];

 		// construct the DGL
 		for(i = first; i <= last; ++i) {
 			dgl[i-first] = ctx->od[i];
 		}

 		replica = litmus_lock(ctx->kexclu);

 		//printf("[%d] got kexclu replica %d.\n", ctx->id, replica);
 		//fflush(stdout);

 		distance = affinity_distance(ctx, replica, last_replica);

 		if(USE_DYNAMIC_GROUP_LOCKS) {
 			litmus_dgl_lock(dgl, dgl_size);
 		}
@@ -531,24 +531,24 @@ void* rt_thread(void* _ctx)
 				litmus_lock(dgl[i]);
 			}
 		}

 		//do_exit = nested_job(ctx, &count, &first, affinity_cost[distance]);
 		do_exit = job(ctx, affinity_cost[distance]);

 		if(USE_DYNAMIC_GROUP_LOCKS) {
 			litmus_dgl_unlock(dgl, dgl_size);
 		}
 		else {
 			for(i = dgl_size - 1; i >= 0; --i) {
 				litmus_unlock(dgl[i]);
 			}
 		}

 		//printf("[%d]: freeing kexclu replica %d.\n", ctx->id, replica);
 		//fflush(stdout);

 		litmus_unlock(ctx->kexclu);

 		last_replica = replica;

 		if(SLEEP_BETWEEN_JOBS && !do_exit) {
@@ -567,7 +567,7 @@ void* rt_thread(void* _ctx)
 	 */
 	TH_CALL( task_mode(BACKGROUND_TASK) );

 	for(i = 0; i < sizeof(ctx->mig_count)/sizeof(ctx->mig_count[0]); ++i)
 	{
 		printf("[%d]: mig_count[%d] = %d\n", ctx->id, i, ctx->mig_count[i]);
 	}
@@ -608,15 +608,15 @@ void* rt_thread(void* _ctx)
 //}


 void dirty_kb(int kb)
 {
 	int32_t one_kb[256];
 	int32_t sum = 0;
 	int32_t i;

 	if(!kb)
 		return;

 	for (i = 0; i < 256; i++)
 		sum += one_kb[i];
 	kb--;
@@ -630,9 +630,9 @@ void dirty_kb(int kb)
 int job(struct thread_context* ctx, int runfactor)
 {
 	//struct timespec tosleep = {0, 100000};  // 0.1 ms

 	//printf("[%d]: runfactor = %d\n", ctx->id, runfactor);

 	//dirty_kb(8 * runfactor);
 	dirty_kb(1 * runfactor);
 	//nanosleep(&tosleep, NULL);
diff --git a/gpu/locktest.c b/gpu/locktest.c
index bc4fc54..6a1219a 100644
--- a/gpu/locktest.c
+++ b/gpu/locktest.c
@@ -177,7 +177,7 @@ void* rt_thread(void* _ctx)
 	return NULL;
 }

 void dirty_kb(int kb)
 {
 	int32_t one_kb[256];
 	int32_t sum = 0;
diff --git a/gpu/nested.c b/gpu/nested.c
index 8c39152..edec46b 100644
--- a/gpu/nested.c
+++ b/gpu/nested.c
@@ -180,7 +180,7 @@ void* rt_thread(void* _ctx)
 		int first = (int)(NUM_SEMS * (rand_r(&(ctx->rand)) / (RAND_MAX + 1.0)));
 		int count = NEST_DEPTH;
 		do_exit = nested_job(ctx, &count, &first);

 		if(SLEEP_BETWEEN_JOBS && !do_exit) {
 			sleep_next_period();
 		}
@@ -226,7 +226,7 @@ int nested_job(struct thread_context* ctx, int *count, int *next)



 void dirty_kb(int kb)
 {
 	int32_t one_kb[256];
 	int32_t sum = 0;
diff --git a/gpu/rtspin_fake_cuda.cpp b/gpu/rtspin_fake_cuda.cpp
index 78e4f60..247a74c 100644
--- a/gpu/rtspin_fake_cuda.cpp
+++ b/gpu/rtspin_fake_cuda.cpp
@@ -119,7 +119,7 @@ char *h_state_data = 0;
 		mmap(NULL, s , \
 			PROT_READ | PROT_WRITE, \
 			MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, \
 			-1, 0)
 #else
 #define c_malloc(s) malloc(s)
 #endif
@@ -144,38 +144,38 @@ cudaError_t cudaGetLastError()
 ////////////////////////////////////////////////////////////////////////

 struct ce_lock_state
 {
 	int locks[2];
 	size_t num_locks;
 	size_t budget_remaining;
 	bool locked;

 	ce_lock_state(int device_a, enum cudaMemcpyKind kind, size_t size, int device_b = -1) {
 		num_locks = (device_a != -1) + (device_b != -1);

 		if(device_a != -1) {
 			locks[0] = (kind == cudaMemcpyHostToDevice) ?
 				CE_SEND_LOCKS[device_a] : CE_RECV_LOCKS[device_a];
 		}

 		if(device_b != -1) {
 			assert(kind == cudaMemcpyDeviceToDevice);

 			locks[1] = CE_RECV_LOCKS[device_b];

 			if(locks[1] < locks[0]) {
 				int temp = locks[1];
 				locks[1] = locks[0];
 				locks[0] = temp;
 			}
 		}

 		if(!ENABLE_CHUNKING)
 			budget_remaining = size;
 		else
 			budget_remaining = CHUNK_SIZE;
 	}

 	void lock() {
 		if(USE_DYNAMIC_GROUP_LOCKS) {
 			litmus_dgl_lock(locks, num_locks);
@@ -189,7 +189,7 @@ struct ce_lock_state
 		}
 		locked = true;
 	}

 	void unlock() {
 		if(USE_DYNAMIC_GROUP_LOCKS) {
 			litmus_dgl_unlock(locks, num_locks);
@@ -204,15 +204,15 @@ struct ce_lock_state
 		}
 		locked = false;
 	}

 	void refresh() {
 		budget_remaining = CHUNK_SIZE;
 	}

 	bool budgetIsAvailable(size_t tosend) {
 		return(tosend >= budget_remaining);
 	}

 	void decreaseBudget(size_t spent) {
 		budget_remaining -= spent;
 	}
@@ -225,53 +225,53 @@ cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count,
 {
 	cudaError_t ret = cudaSuccess;
 	int remaining = count;

 	char* dst = (char*)a_dst;
 	const char* src = (const char*)a_src;

 	// disable chunking, if needed, by setting chunk_size equal to the
 	// amount of data to be copied.
 	int chunk_size = (ENABLE_CHUNKING) ? CHUNK_SIZE : count;
 	int i = 0;

 	while(remaining != 0)
 	{
 		int bytesToCopy = std::min(remaining, chunk_size);

 		if(state && state->budgetIsAvailable(bytesToCopy) && state->locked) {
 			//cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) );
 			ret = cudaGetLastError();

 			if(ret != cudaSuccess)
 			{
 				break;
 			}

 			state->unlock();
 			state->refresh(); // replentish.
 					  // we can only run out of
 					  // budget if chunking is enabled.
 					  // we presume that init budget would
 					  // be set to cover entire memcpy
 					  // if chunking were disabled.
 		}

 		if(state && !state->locked) {
 			state->lock();
 		}

 		//ret = cudaMemcpy(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind);
 		//cudaMemcpyAsync(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind, streams[CUR_DEVICE]);

 		if(state) {
 			state->decreaseBudget(bytesToCopy);
 		}

 //		if(ret != cudaSuccess)
 //		{
 //			break;
 //		}

 		++i;
 		remaining -= bytesToCopy;
 	}
@@ -281,7 +281,7 @@ cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count,
 cudaError_t chunkMemcpy(void* a_dst, const void* a_src, size_t count,
 				enum cudaMemcpyKind kind,
 				int device_a = -1,  // device_a == -1 disables locking
 				bool do_locking = true,
 				int device_b = -1)
 {
 	cudaError_t ret;
@@ -317,7 +317,7 @@ inline uint64_t timespec_to_ns(const struct timespec& t)
 inline struct timespec ns_to_timespec(const uint64_t& ns)
 {
 	struct timespec temp = {ns/1e9, ns - ns/1e9};
 	return(temp);
 }

 inline uint64_t clock_gettime_ns(clockid_t clk_id)
@@ -366,9 +366,9 @@ static void allocate_locks()
 {
 	// allocate k-FMLP lock
 	int fd = open("semaphores", O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);

 	int base_name = GPU_PARTITION * 1000;

 	if(USE_KFMLP) {
 		KEXCLU_LOCK = open_kfmlp_gpu_sem(fd,
 						base_name,  /* name */
@@ -397,7 +397,7 @@ static void allocate_locks()
 //						NUM_SIMULT_USERS,
 //						ENABLE_AFFINITY,
 //						RELAX_FIFO_MAX_LEN
 //						);
 	}
 	if(KEXCLU_LOCK < 0)
 		perror("open_kexclu_sem");
@@ -406,31 +406,31 @@ static void allocate_locks()
 	{
 		open_sem_t opensem = (!USE_PRIOQ) ? open_fifo_sem : open_prioq_sem;
 		const char* opensem_label = (!USE_PRIOQ) ? "open_fifo_sem" : "open_prioq_sem";

 		// allocate the engine locks.
 		for (int i = 0; i < MAX_GPUS; ++i)
 		{
 			EE_LOCKS[i] = opensem(fd, (i+1)*10 + base_name);
 			if(EE_LOCKS[i] < 0)
 				perror(opensem_label);

 			CE_SEND_LOCKS[i] = opensem(fd, (i+1)*10 + base_name + 1);
 			if(CE_SEND_LOCKS[i] < 0)
 				perror(opensem_label);

 			if(NUM_SIMULT_USERS == 3)
 			{
 				// allocate a separate lock for the second copy engine
 				CE_RECV_LOCKS[i] = opensem(fd, (i+1)*10 + base_name + 2);
 				if(CE_RECV_LOCKS[i] < 0)
 					perror(opensem_label);
 			}
 			else
 			{
 				// share a single lock for the single copy engine
 				CE_RECV_LOCKS[i] = CE_SEND_LOCKS[i];
 			}
 		}
 	}
 }

@@ -449,22 +449,22 @@ static void allocate_host_memory()
 //		h_send_data = (char *)c_malloc(send_alloc_bytes);
 //		memset(h_send_data, 0x55, send_alloc_bytes);  // write some random value
 //		// this will open a connection to GPU 0 if there is no active context, so
 //		// expect long stalls.  LAME.
 //		cutilSafeCall( cudaHostRegister(h_send_data, send_alloc_bytes, cudaHostRegisterPortable) );
 //	}
 //
 //	if(recv_alloc_bytes > 0)
 //	{
 //		h_recv_data = (char *)c_malloc(recv_alloc_bytes);
 //		memset(h_recv_data, 0xAA, recv_alloc_bytes);
 //		cutilSafeCall( cudaHostRegister(h_recv_data, recv_alloc_bytes, cudaHostRegisterPortable) );
 //	}
 //
 //	if(state_alloc_bytes > 0)
 //	{
 //		h_state_data = (char *)c_malloc(state_alloc_bytes);
 //		memset(h_state_data, 0xCC, state_alloc_bytes);  // write some random value
 //		cutilSafeCall( cudaHostRegister(h_state_data, state_alloc_bytes, cudaHostRegisterPortable) );
 //	}

 	printf("Host memory allocated.\n");
@@ -477,28 +477,28 @@ static void allocate_device_memory()
 //	for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
 //	{
 //		int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
 //
 //		if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].lock();
 //
 //		cutilSafeCall( cudaSetDevice(which_device) );
 //		cutilSafeCall( cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0) );
 //		cutilSafeCall( cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0) );
 //
 //		cutilSafeCall( cudaStreamCreate(&streams[which_device]) );
 //
 //		/* pre-allocate memory, pray there's enough to go around */
 //		if(SEND_SIZE > 0) {
 //			cutilSafeCall( cudaMalloc((void**)&d_send_data[which_device], SEND_SIZE) );
 //		}
 //		if(RECV_SIZE > 0) {
 //			cutilSafeCall( cudaMalloc((void**)&h_recv_data[which_device], RECV_SIZE) );
 //		}
 //		if(STATE_SIZE > 0) {
 //			cutilSafeCall( cudaMalloc((void**)&h_state_data[which_device], STATE_SIZE) );
 //		}
 //
 //		if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].unlock();
 //	}
 	printf("Device memory allocated.\n");
 }

@@ -508,39 +508,39 @@ static void configure_gpus()

 //	// SUSPEND WHEN BLOCKED!!
 //	cutilSafeCall( cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync) );
 //
 //	// establish a connection to each GPU.
 //	for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
 //	{
 //		int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
 //
 //		if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].lock();
 //
 //		cutilSafeCall( cudaSetDevice(which_device) );
 //		cutilSafeCall( cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0) );
 //		cutilSafeCall( cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0) );
 //
 //		cutilSafeCall( cudaStreamCreate(&streams[which_device]) );
 //
 //		// enable P2P migrations.
 //		// we assume all GPUs are on the same I/O hub.
 //		for(int j = 0; j < GPU_PARTITION_SIZE; ++j)
 //		{
 //			int other_device = GPU_PARTITION*GPU_PARTITION_SIZE + j;
 //
 //			if(which_device != other_device)
 //			{
 //				cutilSafeCall( cudaDeviceEnablePeerAccess(other_device, 0) );
 //			}
 //		}
 //
 //		if(i == 0)
 //		{
 //			struct cudaDeviceProp pi;
 //			cudaGetDeviceProperties(&pi, i);
 //			gpuCyclesPerSecond = pi.clockRate * 1000;  /* khz -> hz */
 //		}
 //
 //		if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].unlock();
 //	}

@@ -580,7 +580,7 @@ static void catchExit(void)
 		for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
 		{
 			int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;

 			litmus_unlock(EE_LOCKS[which_device]);
 			litmus_unlock(CE_SEND_LOCKS[which_device]);
 			if(NUM_SIMULT_USERS == 2) {
@@ -588,11 +588,11 @@ static void catchExit(void)
 			}
 		}
 	}

 	if(CUR_DEVICE >= 0) {
 		unregister_nv_device(CUR_DEVICE);
 	}

 	litmus_unlock(KEXCLU_LOCK);
 	}
 }
@@ -604,18 +604,18 @@ static void migrateToGPU(int destination)
 		if(MIGRATE_VIA_SYSMEM)
 		{
 			chunkMemcpy(h_state_data, d_state_data[LAST_DEVICE], STATE_SIZE,
 						cudaMemcpyDeviceToHost, LAST_DEVICE, useEngineLocks());
 		}
 	}

 //	cutilSafeCall( cudaSetDevice(destination) );

 	if(!BROADCAST_STATE && STATE_SIZE > 0)
 	{
 		if(MIGRATE_VIA_SYSMEM)
 		{
 			chunkMemcpy(d_state_data[CUR_DEVICE], h_state_data, STATE_SIZE,
 						cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks());
 		}
 		else
 		{
@@ -633,15 +633,15 @@ static void migrateToGPU(int destination)
 static void broadcastState(int from)
 {
 	if(STATE_SIZE > 0)
 	{
 		assert(CUR_DEVICE == from);

 		if(MIGRATE_VIA_SYSMEM)
 		{
 			chunkMemcpy(h_state_data, d_state_data[from], STATE_SIZE,
 						cudaMemcpyDeviceToHost, from, useEngineLocks());
 		}

 		for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
 		{
 			int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
@@ -652,7 +652,7 @@ static void broadcastState(int from)
 //				cutilSafeCall( cudaSetDevice(which_device) );
 				CUR_DEVICE = which_device;  // temporary
 				chunkMemcpy(d_state_data[which_device], h_state_data, STATE_SIZE,
 							cudaMemcpyHostToDevice, which_device, useEngineLocks());
 			}
 			else
 			{
@@ -662,11 +662,11 @@ static void broadcastState(int from)
 							cudaMemcpyDeviceToDevice,
 							from,
 							useEngineLocks(),
 							which_device);
 			}
 		}
 	}

 	if(MIGRATE_VIA_SYSMEM && CUR_DEVICE != from)
 	{
 //		cutilSafeCall( cudaSetDevice(from) );
@@ -714,18 +714,18 @@ static void gpu_loop_for(double gpu_sec_time, double emergency_exit)
 	}

 	if(useEngineLocks()) litmus_lock(EE_LOCKS[CUR_DEVICE]);

 //	docudaspin <<<numblocks,blocksz, 0, streams[CUR_DEVICE]>>> (numcycles);
 //	cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) );

 	if(useEngineLocks()) litmus_unlock(EE_LOCKS[CUR_DEVICE]);

 	if(RECV_SIZE > 0)
 	{
 		chunkMemcpy(h_recv_data, d_recv_data[CUR_DEVICE], RECV_SIZE,
 					cudaMemcpyDeviceToHost, CUR_DEVICE, useEngineLocks());
 	}

 	if(BROADCAST_STATE)
 	{
 		broadcastState(CUR_DEVICE);
@@ -802,7 +802,7 @@ int main(int argc, char** argv)
 	int num_tasks = 0;

 	double gpu_sec_ms = 0;

 	while ((opt = getopt(argc, argv, OPTSTR)) != -1) {
 //		printf("opt = %c optarg = %s\n", opt, optarg);
 		switch (opt) {
@@ -858,7 +858,7 @@ int main(int argc, char** argv)
 			break;
 		case 'r':
 			RELAX_FIFO_MAX_LEN = true;
 			break;
 		case 'L':
 			USE_KFMLP = true;
 			break;
@@ -949,13 +949,13 @@ int main(int argc, char** argv)
 		{
 			printf("%d creating release shared memory\n", getpid());
 			shared_memory_object::remove("release_barrier_memory");
 			release_segment_ptr = new managed_shared_memory(create_only, "release_barrier_memory", 4*1024);

 			printf("%d creating release barrier for %d users\n", getpid(), num_tasks);
 			release_barrier = release_segment_ptr->construct<barrier>("barrier release_barrier")(num_tasks);

 			init_release_time = release_segment_ptr->construct<uint64_t>("uint64_t instance")();
 			*init_release_time = 0;
 		}
 		else
 		{
@@ -972,13 +972,13 @@ int main(int argc, char** argv)
 					sleep(1);
 				}
 			}while(segment_ptr == NULL);

 			release_barrier = segment_ptr->find<barrier>("barrier release_barrier").first;
 			init_release_time = segment_ptr->find<uint64_t>("uint64_t instance").first;
 		}
 	}


 	if(GPU_TASK)
 	{
 		if(ENABLE_WAIT)
@@ -1019,7 +1019,7 @@ int main(int argc, char** argv)
 		SEND_SIZE *= scale;
 		RECV_SIZE *= scale;
 		STATE_SIZE *= scale;

 		init_cuda();
 	}

@@ -1036,16 +1036,16 @@ int main(int argc, char** argv)
 	if (ret != 0)
 		bail_out("could not become RT task");



 	uint64_t jobCount = 0;
 	blitz::Array<uint64_t, 1> responseTimeLog(num_jobs+1);

 	struct timespec spec;
 	uint64_t release;
 	uint64_t finish;


 	if (ENABLE_WAIT) {
 		printf("Waiting for release.\n");
 		ret = wait_for_ts_release();
@@ -1056,14 +1056,14 @@ int main(int argc, char** argv)
 		{
 			sleep_next_period();
 		}

 		clock_gettime(CLOCK_MONOTONIC, &spec);
 		release = timespec_to_ns(spec);
 		if (!__sync_bool_compare_and_swap(init_release_time, 0, release))
 		{
 			release = *init_release_time;
 		}

 		releaseTime = wctime();
 		double failsafeEnd = releaseTime + duration;

@@ -1087,7 +1087,7 @@ int main(int argc, char** argv)
 			clock_gettime(CLOCK_MONOTONIC, &spec);
 			finish = timespec_to_ns(spec);

 			responseTimeLog(min(num_jobs,jobCount++)) = finish - release;

 			// this is an estimated upper-bound on release time.  it may be off by several microseconds.
 #ifdef RESET_RELEASE_ON_MISS
@@ -1097,11 +1097,11 @@ int main(int argc, char** argv)
 #else
 			release = release + period;  // allow things to get progressively later.
 #endif

 			sleep_next_period();
 			clock_gettime(CLOCK_MONOTONIC, &spec);
 			release = min(timespec_to_ns(spec), release);

 		} while(keepGoing);
 	}

@@ -1147,13 +1147,13 @@ int main(int argc, char** argv)
 		}
 	}


 	if (ENABLE_WAIT)
 	{
 		printf("%d waiting at exit barrier\n", getpid());
 		release_barrier->wait();
 	}


 	char gpu_using_str[] = "GPU\n";
 	char cpu_only_str[] = "CPU\n";
@@ -1166,7 +1166,7 @@ int main(int argc, char** argv)
 		// average
 		blitz::mean(USED(responseTimeLog)),
 		// average pct of period
 		100.0*(blitz::mean(USED(responseTimeLog))/period),
 		// min
 		blitz::min(USED(responseTimeLog)),
 		// max
@@ -1182,6 +1182,6 @@ int main(int argc, char** argv)
 		// flag gpu-using tasks
 		((GPU_TASK) ? gpu_using_str : cpu_only_str)
 		);

 	return 0;
 }