author     Glenn Elliott <gelliott@cs.unc.edu>    2013-05-06 18:57:37 -0400
committer  Glenn Elliott <gelliott@cs.unc.edu>    2013-05-06 18:58:59 -0400
commit     95e840f68892d46289120d1042ee36f9eaf41de7 (patch)
tree       1335167a07621094518c4389f60ef0f3ed77eea4
parent     0f89bddde73d448511004a60b98b8be042f6ffd6 (diff)
Several new *important* features:
1) gpusync daemon mode; 2) engine yield logic; 3) fixed chunking (chunked memcpys previously failed when a copy was split into more than 2 chunks).
-rw-r--r--   Makefile               8
-rw-r--r--   gpu/dgl.c              1
-rw-r--r--   gpu/gpuspin.cu         1907
-rw-r--r--   include/litmus.h       2
-rw-r--r--   src/kernel_iface.c     2
-rw-r--r--   src/syscalls.c         10
6 files changed, 1301 insertions, 629 deletions
diff --git a/Makefile b/Makefile
index 831c16b..e877ca4 100644
--- a/Makefile
+++ b/Makefile
@@ -25,12 +25,15 @@ NUMA_SUPPORT = dummyval
 
 # compiler flags
 flags-debug = -O2 -Wall -Werror -g -Wdeclaration-after-statement
+#flags-debug = -Wall -Werror -g -Wdeclaration-after-statement
 flags-debug-cpp = -O2 -Wall -Werror -g
+#flags-debug-cpp = -Wall -Werror -g
 flags-api = -D_XOPEN_SOURCE=600 -D_GNU_SOURCE
 flags-misc = -fasynchronous-unwind-tables -fnon-call-exceptions
 
 flags-cu-debug = -g -G -Xcompiler -Wall -Xcompiler -Werror
 flags-cu-optim = -O2 -Xcompiler -march=native
+#flags-cu-optim = -Xcompiler -march=native
 flags-cu-nvcc = --use_fast_math -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30
 flags-cu-misc = -Xcompiler -fasynchronous-unwind-tables -Xcompiler -fnon-call-exceptions -Xcompiler -malign-double -Xcompiler -pthread
 flags-cu-x86_64 = -m64
@@ -63,7 +66,6 @@ headers = -I${LIBLITMUS}/include -I${LIBLITMUS}/arch/${include-${ARCH}}/include
 # combine options
 CPPFLAGS = ${flags-api} ${flags-debug-cpp} ${flags-misc} ${flags-${ARCH}} -DARCH=${ARCH} ${headers}
 CUFLAGS = ${flags-api} ${flags-cu-debug} ${flags-cu-optim} ${flags-cu-nvcc} ${flags-cu-misc} -DARCH=${ARCH} ${headers}
-#CUFLAGS = ${flags-api} ${flags-cu-optim} ${flags-cu-nvcc} ${flags-cu-misc} -DARCH=${ARCH} ${headers}
 CFLAGS = ${flags-debug} ${flags-misc}
 LDFLAGS = ${flags-${ARCH}}
 
@@ -82,7 +84,7 @@ endif
 # how to link cuda
 cuda-flags-i386 = -L/usr/local/cuda/lib
 cuda-flags-x86_64 = -L/usr/local/cuda/lib64
-cuda-flags = ${cuda-flags-${ARCH}} -lcudart
+cuda-flags = ${cuda-flags-${ARCH}} -lcudart -lcuda
 
 # Force gcc instead of cc, but let the user specify a more specific version if
 # desired.
@@ -299,7 +301,7 @@ lib-budget = -lrt -lm -pthread
 vpath %.cu gpu/
 
 objcu-gpuspin = gpuspin.o common.o
-lib-gpuspin = -lblitz -lrt -lm -lpthread
+lib-gpuspin = -lblitz -lrt -lm -lpthread -lboost_filesystem -lboost_system
 
 # ##############################################################################
 # Build everything that depends on liblitmus.
diff --git a/gpu/dgl.c b/gpu/dgl.c
index 3029255..c40fec6 100644
--- a/gpu/dgl.c
+++ b/gpu/dgl.c
@@ -229,6 +229,7 @@ void* rt_thread(void* _ctx)
 
 		do_exit = job(ctx);
 
+		fprintf(stdout, "[%d] should yield dgl: %d.\n", ctx->id, litmus_dgl_should_yield_lock(dgl, dgl_size));
 
 		xfprintf(stdout, "[%d] unlocking dgl.\n", ctx->id);
 		litmus_dgl_unlock(dgl, dgl_size);
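
The litmus_dgl_should_yield_lock() call exercised above is the dynamic-group-lock flavor of the new yield query (its declaration is presumably the two-line addition to include/litmus.h in this commit): it tells the holder whether giving up the group lock early would actually benefit another task. A minimal usage sketch for reference, not from the patch itself; run_jobs_with_dgl() is an illustrative helper and dgl[] holds lock descriptors already opened as in rt_thread() above:

#include "litmus.h"

/* Hold a dynamic group lock across consecutive jobs and only drop it
 * when another task is actually waiting on it. */
static void run_jobs_with_dgl(int *dgl, int dgl_size, int num_jobs)
{
	int i, held = 0;
	for (i = 0; i < num_jobs; ++i) {
		if (!held) {
			litmus_dgl_lock(dgl, dgl_size);
			held = 1;
		}
		/* ... critical section: use the resources guarded by dgl ... */
		if (litmus_dgl_should_yield_lock(dgl, dgl_size)) {
			litmus_dgl_unlock(dgl, dgl_size);
			held = 0;
		}
		sleep_next_period();
	}
	if (held)
		litmus_dgl_unlock(dgl, dgl_size);
}
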
diff --git a/gpu/gpuspin.cu b/gpu/gpuspin.cu
index 21134f6..f361b86 100644
--- a/gpu/gpuspin.cu
+++ b/gpu/gpuspin.cu
@@ -10,9 +10,11 @@
10 10
11#include <boost/interprocess/managed_shared_memory.hpp> 11#include <boost/interprocess/managed_shared_memory.hpp>
12#include <boost/interprocess/sync/interprocess_mutex.hpp> 12#include <boost/interprocess/sync/interprocess_mutex.hpp>
13#include <boost/filesystem.hpp>
13 14
14#include <random/normal.h> 15#include <random/normal.h>
15 16
17#include <cuda.h>
16#include <cuda_runtime.h> 18#include <cuda_runtime.h>
17 19
18#include "litmus.h" 20#include "litmus.h"
@@ -24,7 +26,21 @@ using namespace ranlib;
24 26
25#define ms2s(ms) ((ms)*0.001) 27#define ms2s(ms) ((ms)*0.001)
26 28
29bool SILENT = true;
30inline int xprintf(const char *format, ...)
31{
32 int ret = 0;
33 if (!SILENT) {
34 va_list args;
35 va_start(args, format);
36 ret = vprintf(format, args);
37 va_end(args);
38 }
39 return ret;
40}
41
27const char *lock_namespace = "./.gpuspin-locks"; 42const char *lock_namespace = "./.gpuspin-locks";
43const size_t PAGE_SIZE = sysconf(_SC_PAGESIZE);
28 44
29const int NR_GPUS = 8; 45const int NR_GPUS = 8;
30 46
@@ -34,6 +50,8 @@ bool RELAX_FIFO_MAX_LEN = false;
34bool ENABLE_CHUNKING = false; 50bool ENABLE_CHUNKING = false;
35bool MIGRATE_VIA_SYSMEM = false; 51bool MIGRATE_VIA_SYSMEM = false;
36 52
53bool YIELD_LOCKS = false;
54
37enum eEngineLockTypes 55enum eEngineLockTypes
38{ 56{
39 FIFO, 57 FIFO,
@@ -97,15 +115,82 @@ int CUR_DEVICE = -1;
97int LAST_DEVICE = -1; 115int LAST_DEVICE = -1;
98 116
99cudaStream_t STREAMS[NR_GPUS]; 117cudaStream_t STREAMS[NR_GPUS];
118cudaEvent_t EVENTS[NR_GPUS];
100int GPU_HZ[NR_GPUS]; 119int GPU_HZ[NR_GPUS];
101int NUM_SM[NR_GPUS]; 120int NUM_SM[NR_GPUS];
102int WARP_SIZE[NR_GPUS]; 121int WARP_SIZE[NR_GPUS];
103int ELEM_PER_THREAD[NR_GPUS]; 122int ELEM_PER_THREAD[NR_GPUS];
104 123
124enum eScheduler
125{
126 LITMUS,
127 LINUX,
128 RT_LINUX
129};
130
131struct Args
132{
133 bool wait;
134 bool migrate;
135 int cluster;
136 int cluster_size;
137 bool gpu_using;
138 int gpu_partition;
139 int gpu_partition_size;
140 int rho;
141 int num_ce;
142 bool reserve_migr_ce;
143 bool use_engine_locks;
144 eEngineLockTypes engine_lock_type;
145 bool yield_locks;
146 bool use_dgls;
147 eGpuSyncMode gpusync_mode;
148 bool enable_affinity;
149 int relax_fifo_len;
150 eCudaSyncMode sync_mode;
151 size_t send_size;
152 size_t recv_size;
153 size_t state_size;
154 bool enable_chunking;
155 size_t chunk_size;
156 bool use_sysmem_migration;
157 int num_kernels;
158
159 double wcet_ms;
160 double gpu_wcet_ms;
161 double period_ms;
162
163 double budget_ms;
164
165 double stddev;
166
167 eScheduler scheduler;
168
169 unsigned int priority;
170
171 task_class_t cls;
172
173 bool want_enforcement;
174 bool want_signals;
175 budget_drain_policy_t drain_policy;
176
177 int column;
178
179 int num_gpu_tasks;
180 int num_tasks;
181
182 double scale;
183
184 double duration;
185};
186
187
188
105#define DEFINE_PER_GPU(type, var) type var[NR_GPUS] 189#define DEFINE_PER_GPU(type, var) type var[NR_GPUS]
106#define per_gpu(var, idx) (var[(idx)]) 190#define per_gpu(var, idx) (var[(idx)])
107#define this_gpu(var) (var[(CUR_DEVICE)]) 191#define this_gpu(var) (var[(CUR_DEVICE)])
108#define cur_stream() (this_gpu(STREAMS)) 192#define cur_stream() (this_gpu(STREAMS))
193#define cur_event() (this_gpu(EVENTS))
109#define cur_gpu() (CUR_DEVICE) 194#define cur_gpu() (CUR_DEVICE)
110#define last_gpu() (LAST_DEVICE) 195#define last_gpu() (LAST_DEVICE)
111#define cur_ee() (EE_LOCKS[CUR_DEVICE]) 196#define cur_ee() (EE_LOCKS[CUR_DEVICE])
@@ -208,9 +293,10 @@ struct ce_lock_state
 	void lock() {
 		if(locks[0] == locks[1]) crash();
 
-		if(USE_DYNAMIC_GROUP_LOCKS) {
+		if (num_locks == 1)
+			litmus_lock(locks[0]);
+		else if(USE_DYNAMIC_GROUP_LOCKS)
 			litmus_dgl_lock(locks, num_locks);
-		}
 		else
 		{
 			for(int l = 0; l < num_locks; ++l)
@@ -224,9 +310,10 @@ struct ce_lock_state
 	void unlock() {
 		if(locks[0] == locks[1]) crash();
 
-		if(USE_DYNAMIC_GROUP_LOCKS) {
+		if (num_locks == 1)
+			litmus_unlock(locks[0]);
+		else if(USE_DYNAMIC_GROUP_LOCKS)
 			litmus_dgl_unlock(locks, num_locks);
-		}
 		else
 		{
 			// reverse order
@@ -238,6 +325,21 @@ struct ce_lock_state
 		locked = false;
 	}
 
+	bool should_yield() {
+		int yield = 1; // assume we should yield
+		if (YIELD_LOCKS) {
+			if(locks[0] == locks[1]) crash();
+			if (num_locks == 1)
+				yield = litmus_should_yield_lock(locks[0]);
+			else if(USE_DYNAMIC_GROUP_LOCKS)
+				yield = litmus_dgl_should_yield_lock(locks, num_locks);
+			else
+				for(int l = num_locks - 1; l >= 0; --l) // reverse order
+					yield = litmus_should_yield_lock(locks[l]);
+		}
+		return (yield);
+	}
+
 	void refresh() {
 		budget_remaining = CHUNK_SIZE;
 	}
@@ -271,34 +373,37 @@ static cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count,
 	{
 		int bytesToCopy = std::min(remaining, chunk_size);
 
-		if(state && state->budgetIsAvailable(bytesToCopy) && state->locked) {
-			cudaStreamSynchronize(STREAMS[CUR_DEVICE]);
-			ret = cudaGetLastError();
-
-			if(ret != cudaSuccess)
-			{
-				break;
+		if (state && state->locked) {
+			// we have to unlock/re-lock the copy engine to refresh our budget unless
+			// we still have budget available.
+			if (!state->budgetIsAvailable(bytesToCopy)) {
+				// optimization - don't unlock if no one else needs the engine
+				if (state->should_yield()) {
+					//cudaStreamSynchronize(STREAMS[CUR_DEVICE]);
+					cudaEventSynchronize(EVENTS[CUR_DEVICE]);
+					ret = cudaGetLastError();
+					state->unlock();
+					if(ret != cudaSuccess)
+						break;
+				}
+				// we can only run out of
+				// budget if chunking is enabled.
+				// we presume that init budget would
+				// be set to cover entire memcpy
+				// if chunking were disabled.
+				state->refresh();
 			}
-
-			state->unlock();
-			state->refresh(); // replentish.
-			// we can only run out of
-			// budget if chunking is enabled.
-			// we presume that init budget would
-			// be set to cover entire memcpy
-			// if chunking were disabled.
 		}
 
-		if(state && !state->locked) {
+		if(state && !state->locked)
 			state->lock();
-		}
 
 		//ret = cudaMemcpy(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind);
 		cudaMemcpyAsync(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind, STREAMS[CUR_DEVICE]);
+		cudaEventRecord(EVENTS[CUR_DEVICE], STREAMS[CUR_DEVICE]);
 
-		if(state) {
+		if(state)
 			state->decreaseBudget(bytesToCopy);
-		}
 
 		++i;
 		remaining -= bytesToCopy;
@@ -316,7 +421,8 @@ static cudaError_t chunkMemcpy(void* a_dst, const void* a_src, size_t count,
316 cudaError_t ret; 421 cudaError_t ret;
317 if(!do_locking || device_a == -1) { 422 if(!do_locking || device_a == -1) {
318 ret = __chunkMemcpy(a_dst, a_src, count, kind, NULL); 423 ret = __chunkMemcpy(a_dst, a_src, count, kind, NULL);
319 cudaStreamSynchronize(cur_stream()); 424 cudaEventSynchronize(cur_event());
425// cudaStreamSynchronize(cur_stream());
320 if(ret == cudaSuccess) 426 if(ret == cudaSuccess)
321 ret = cudaGetLastError(); 427 ret = cudaGetLastError();
322 } 428 }
@@ -324,7 +430,8 @@ static cudaError_t chunkMemcpy(void* a_dst, const void* a_src, size_t count,
324 ce_lock_state state(device_a, kind, count, device_b, migration); 430 ce_lock_state state(device_a, kind, count, device_b, migration);
325 state.lock(); 431 state.lock();
326 ret = __chunkMemcpy(a_dst, a_src, count, kind, &state); 432 ret = __chunkMemcpy(a_dst, a_src, count, kind, &state);
327 cudaStreamSynchronize(cur_stream()); 433 cudaEventSynchronize(cur_event());
434 // cudaStreamSynchronize(cur_stream());
328 if(ret == cudaSuccess) 435 if(ret == cudaSuccess)
329 ret = cudaGetLastError(); 436 ret = cudaGetLastError();
330 state.unlock(); 437 state.unlock();
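
The repaired chunking path above reduces to a simple pattern: issue one cudaMemcpyAsync() per chunk on the engine's stream, record an event after each chunk, and block on the event only when completion actually matters. A reduced sketch of that pattern for reference, not from the patch itself; chunked_copy() and its parameters are illustrative and error handling is trimmed:

#include <cuda_runtime.h>
#include <algorithm>
#include <cstddef>

static cudaError_t chunked_copy(void *dst, const void *src, size_t count,
                                size_t chunk_size, cudaMemcpyKind kind,
                                cudaStream_t stream, cudaEvent_t done)
{
	size_t remaining = count;
	size_t offset = 0;
	while (remaining > 0) {
		size_t bytes = std::min(remaining, chunk_size);
		// each chunk is an independent asynchronous copy on the same stream
		cudaMemcpyAsync((char*)dst + offset, (const char*)src + offset,
		                bytes, kind, stream);
		// mark completion of the most recent chunk
		cudaEventRecord(done, stream);
		offset += bytes;
		remaining -= bytes;
	}
	// waiting on the last recorded event covers all earlier chunks,
	// since work on a single stream executes in order
	cudaEventSynchronize(done);
	return cudaGetLastError();
}
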
@@ -332,17 +439,26 @@ static cudaError_t chunkMemcpy(void* a_dst, const void* a_src, size_t count,
332 return ret; 439 return ret;
333} 440}
334 441
442int LITMUS_LOCK_FD = 0;
443
444int EXP_OFFSET = 0;
335 445
336void allocate_locks_litmus(void) 446void allocate_locks_litmus(void)
337{ 447{
448 stringstream ss;
449 ss<<lock_namespace<<"-"<<EXP_OFFSET;
450
338 // allocate k-FMLP lock 451 // allocate k-FMLP lock
339 int fd = open(lock_namespace, O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR); 452 //LITMUS_LOCK_FD = open(lock_namespace, O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
453 LITMUS_LOCK_FD = open(ss.str().c_str(), O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
454 int *fd = &LITMUS_LOCK_FD;
340 455
341 int base_name = GPU_PARTITION * 1000; 456 int base_name = GPU_PARTITION * 100 + EXP_OFFSET * 200;
457 ++EXP_OFFSET;
342 458
343 if (GPU_SYNC_MODE == IKGLP_MODE) { 459 if (GPU_SYNC_MODE == IKGLP_MODE) {
344 /* Standard (optimal) IKGLP */ 460 /* Standard (optimal) IKGLP */
345 TOKEN_LOCK = open_gpusync_token_lock(fd, 461 TOKEN_LOCK = open_gpusync_token_lock(*fd,
346 base_name, /* name */ 462 base_name, /* name */
347 GPU_PARTITION_SIZE, 463 GPU_PARTITION_SIZE,
348 GPU_PARTITION*GPU_PARTITION_SIZE, 464 GPU_PARTITION*GPU_PARTITION_SIZE,
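
With daemon mode, allocate_locks_litmus() can now run more than once in the same process, so each invocation opens a distinct lock-namespace file and shifts its lock names by a per-run offset (EXP_OFFSET above) rather than reusing names from an earlier run. A minimal sketch of that naming scheme for reference, not from the patch itself; open_lock_namespace() and the constants are illustrative:

#include <fcntl.h>
#include <sys/stat.h>
#include <sstream>
#include <string>

static int open_lock_namespace(const std::string &base, int run, int *name_base)
{
	std::stringstream ss;
	ss << base << "-" << run;        // e.g. "./.gpuspin-locks-0"
	*name_base = run * 200;          // keep lock names of successive runs disjoint
	return open(ss.str().c_str(), O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
}
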
@@ -355,7 +471,7 @@ void allocate_locks_litmus(void)
355 } 471 }
356 else if (GPU_SYNC_MODE == KFMLP_MODE) { 472 else if (GPU_SYNC_MODE == KFMLP_MODE) {
357 /* KFMLP. FIFO queues only for tokens. */ 473 /* KFMLP. FIFO queues only for tokens. */
358 TOKEN_LOCK = open_gpusync_token_lock(fd, 474 TOKEN_LOCK = open_gpusync_token_lock(*fd,
359 base_name, /* name */ 475 base_name, /* name */
360 GPU_PARTITION_SIZE, 476 GPU_PARTITION_SIZE,
361 GPU_PARTITION*GPU_PARTITION_SIZE, 477 GPU_PARTITION*GPU_PARTITION_SIZE,
@@ -366,7 +482,7 @@ void allocate_locks_litmus(void)
366 } 482 }
367 else if (GPU_SYNC_MODE == RGEM_MODE) { 483 else if (GPU_SYNC_MODE == RGEM_MODE) {
368 /* RGEM-like token allocation. Shared priority queue for all tokens. */ 484 /* RGEM-like token allocation. Shared priority queue for all tokens. */
369 TOKEN_LOCK = open_gpusync_token_lock(fd, 485 TOKEN_LOCK = open_gpusync_token_lock(*fd,
370 base_name, /* name */ 486 base_name, /* name */
371 GPU_PARTITION_SIZE, 487 GPU_PARTITION_SIZE,
372 GPU_PARTITION*GPU_PARTITION_SIZE, 488 GPU_PARTITION*GPU_PARTITION_SIZE,
@@ -380,7 +496,7 @@ void allocate_locks_litmus(void)
380 * token requests. */ 496 * token requests. */
381 int max_simult_run = std::max(CPU_PARTITION_SIZE, RHO*GPU_PARTITION_SIZE); 497 int max_simult_run = std::max(CPU_PARTITION_SIZE, RHO*GPU_PARTITION_SIZE);
382 int max_fifo_len = (int)ceil((float)max_simult_run / (RHO*GPU_PARTITION_SIZE)); 498 int max_fifo_len = (int)ceil((float)max_simult_run / (RHO*GPU_PARTITION_SIZE));
383 TOKEN_LOCK = open_gpusync_token_lock(fd, 499 TOKEN_LOCK = open_gpusync_token_lock(*fd,
384 base_name, /* name */ 500 base_name, /* name */
385 GPU_PARTITION_SIZE, 501 GPU_PARTITION_SIZE,
386 GPU_PARTITION*GPU_PARTITION_SIZE, 502 GPU_PARTITION*GPU_PARTITION_SIZE,
@@ -416,17 +532,17 @@ void allocate_locks_litmus(void)
416 open_sem_t openEngineLock = (ENGINE_LOCK_TYPE == FIFO) ? 532 open_sem_t openEngineLock = (ENGINE_LOCK_TYPE == FIFO) ?
417 open_fifo_sem : open_prioq_sem; 533 open_fifo_sem : open_prioq_sem;
418 534
419 ee_lock = openEngineLock(fd, ee_name); 535 ee_lock = openEngineLock(*fd, ee_name);
420 if (ee_lock < 0) 536 if (ee_lock < 0)
421 perror("open_*_sem (engine lock)"); 537 perror("open_*_sem (engine lock)");
422 538
423 ce_0_lock = openEngineLock(fd, ce_0_name); 539 ce_0_lock = openEngineLock(*fd, ce_0_name);
424 if (ce_0_lock < 0) 540 if (ce_0_lock < 0)
425 perror("open_*_sem (engine lock)"); 541 perror("open_*_sem (engine lock)");
426 542
427 if (NUM_COPY_ENGINES == 2) 543 if (NUM_COPY_ENGINES == 2)
428 { 544 {
429 ce_1_lock = openEngineLock(fd, ce_1_name); 545 ce_1_lock = openEngineLock(*fd, ce_1_name);
430 if (ce_1_lock < 0) 546 if (ce_1_lock < 0)
431 perror("open_*_sem (engine lock)"); 547 perror("open_*_sem (engine lock)");
432 } 548 }
@@ -464,7 +580,41 @@ void allocate_locks_litmus(void)
464 } 580 }
465} 581}
466 582
583void deallocate_locks_litmus(void)
584{
585 for (int i = 0; i < GPU_PARTITION_SIZE; ++i)
586 {
587 int idx = GPU_PARTITION*GPU_PARTITION_SIZE + i;
467 588
589 od_close(EE_LOCKS[idx]);
590 if (NUM_COPY_ENGINES == 1)
591 {
592 od_close(CE_SEND_LOCKS[idx]);
593 }
594 else
595 {
596 if (RESERVED_MIGR_COPY_ENGINE) {
597 od_close(CE_SEND_LOCKS[idx]);
598 od_close(CE_MIGR_SEND_LOCKS[idx]);
599 }
600 else {
601 od_close(CE_SEND_LOCKS[idx]);
602 od_close(CE_RECV_LOCKS[idx]);
603 }
604 }
605 }
606
607 od_close(TOKEN_LOCK);
608
609 close(LITMUS_LOCK_FD);
610
611 memset(&CE_SEND_LOCKS[0], 0, sizeof(CE_SEND_LOCKS));
612 memset(&CE_RECV_LOCKS[0], 0, sizeof(CE_RECV_LOCKS));
613 memset(&CE_MIGR_SEND_LOCKS[0], 0, sizeof(CE_MIGR_SEND_LOCKS));
614 memset(&CE_MIGR_RECV_LOCKS[0], 0, sizeof(CE_MIGR_RECV_LOCKS));
615 TOKEN_LOCK = -1;
616 LITMUS_LOCK_FD = 0;
617}
468 618
469 619
470class gpu_pool 620class gpu_pool
@@ -478,10 +628,9 @@ public:
478 int get(pthread_mutex_t* tex, int preference = -1) 628 int get(pthread_mutex_t* tex, int preference = -1)
479 { 629 {
480 int which = -1; 630 int which = -1;
481 // int last = (preference >= 0) ? preference : 0;
482 int last = (ENABLE_AFFINITY) ? 631 int last = (ENABLE_AFFINITY) ?
483 (preference >= 0) ? preference : 0 : 632 ((preference >= 0) ? preference : 0) :
484 rand()%poolSize; 633 (rand()%poolSize);
485 int minIdx = last; 634 int minIdx = last;
486 635
487 pthread_mutex_lock(tex); 636 pthread_mutex_lock(tex);
@@ -513,24 +662,22 @@ private:
513 int pool[NR_GPUS]; // >= gpu_part_size 662 int pool[NR_GPUS]; // >= gpu_part_size
514}; 663};
515 664
665
666static managed_shared_memory *linux_lock_segment_ptr = NULL;
516static gpu_pool* GPU_LINUX_SEM_POOL = NULL; 667static gpu_pool* GPU_LINUX_SEM_POOL = NULL;
517static pthread_mutex_t* GPU_LINUX_MUTEX_POOL = NULL; 668static pthread_mutex_t* GPU_LINUX_MUTEX_POOL = NULL;
518 669
519static void allocate_locks_linux(const int num_gpu_users) 670static void allocate_locks_linux(const int num_gpu_users)
520{ 671{
521 managed_shared_memory *segment_pool_ptr = NULL;
522 managed_shared_memory *segment_mutex_ptr = NULL;
523
524 int numGpuPartitions = NR_GPUS/GPU_PARTITION_SIZE; 672 int numGpuPartitions = NR_GPUS/GPU_PARTITION_SIZE;
525 673
526 if(num_gpu_users > 0) 674 if(num_gpu_users > 0)
527 { 675 {
528 printf("%d creating shared memory for linux semaphores; num pools = %d, pool size = %d\n", getpid(), numGpuPartitions, GPU_PARTITION_SIZE); 676 xprintf("%d: creating linux locks\n", getpid());
529 shared_memory_object::remove("linux_mutex_memory"); 677 shared_memory_object::remove("linux_lock_memory");
530 shared_memory_object::remove("linux_sem_memory");
531 678
532 segment_mutex_ptr = new managed_shared_memory(create_only, "linux_mutex_memory", 4*1024); 679 linux_lock_segment_ptr = new managed_shared_memory(create_only, "linux_lock_memory", 30*PAGE_SIZE);
533 GPU_LINUX_MUTEX_POOL = segment_mutex_ptr->construct<pthread_mutex_t>("pthread_mutex_t linux_m")[numGpuPartitions](); 680 GPU_LINUX_MUTEX_POOL = linux_lock_segment_ptr->construct<pthread_mutex_t>("pthread_mutex_t linux_m")[numGpuPartitions]();
534 for(int i = 0; i < numGpuPartitions; ++i) 681 for(int i = 0; i < numGpuPartitions; ++i)
535 { 682 {
536 pthread_mutexattr_t attr; 683 pthread_mutexattr_t attr;
@@ -539,41 +686,41 @@ static void allocate_locks_linux(const int num_gpu_users)
539 pthread_mutex_init(&(GPU_LINUX_MUTEX_POOL[i]), &attr); 686 pthread_mutex_init(&(GPU_LINUX_MUTEX_POOL[i]), &attr);
540 pthread_mutexattr_destroy(&attr); 687 pthread_mutexattr_destroy(&attr);
541 } 688 }
542 689 GPU_LINUX_SEM_POOL = linux_lock_segment_ptr->construct<gpu_pool>("gpu_pool linux_p")[numGpuPartitions](GPU_PARTITION_SIZE);
543 segment_pool_ptr = new managed_shared_memory(create_only, "linux_sem_memory", 4*1024);
544 GPU_LINUX_SEM_POOL = segment_pool_ptr->construct<gpu_pool>("gpu_pool linux_p")[numGpuPartitions](GPU_PARTITION_SIZE);
545 } 690 }
546 else 691 else
547 { 692 {
693 sleep(5);
548 do 694 do
549 { 695 {
550 try 696 try
551 { 697 {
552 if (!segment_pool_ptr) segment_pool_ptr = new managed_shared_memory(open_only, "linux_sem_memory"); 698 if (!linux_lock_segment_ptr)
699 linux_lock_segment_ptr = new managed_shared_memory(open_only, "linux_lock_memory");
553 } 700 }
554 catch(...) 701 catch(...)
555 { 702 {
556 sleep(1); 703 sleep(1);
557 } 704 }
558 }while(segment_pool_ptr == NULL); 705 }while(linux_lock_segment_ptr == NULL);
559 706
560 do 707 GPU_LINUX_MUTEX_POOL = linux_lock_segment_ptr->find<pthread_mutex_t>("pthread_mutex_t linux_m").first;
561 { 708 GPU_LINUX_SEM_POOL = linux_lock_segment_ptr->find<gpu_pool>("gpu_pool linux_p").first;
562 try
563 {
564 if (!segment_mutex_ptr) segment_mutex_ptr = new managed_shared_memory(open_only, "linux_mutex_memory");
565 }
566 catch(...)
567 {
568 sleep(1);
569 }
570 }while(segment_mutex_ptr == NULL);
571
572 GPU_LINUX_SEM_POOL = segment_pool_ptr->find<gpu_pool>("gpu_pool linux_p").first;
573 GPU_LINUX_MUTEX_POOL = segment_mutex_ptr->find<pthread_mutex_t>("pthread_mutex_t linux_m").first;
574 } 709 }
575} 710}
576 711
712static void deallocate_locks_linux(const int num_gpu_users)
713{
714 GPU_LINUX_MUTEX_POOL = NULL;
715 GPU_LINUX_SEM_POOL = NULL;
716
717 delete linux_lock_segment_ptr;
718 linux_lock_segment_ptr = NULL;
719
720 if(num_gpu_users > 0)
721 shared_memory_object::remove("linux_lock_memory");
722}
723
577 724
578 725
579 726
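
The consolidated Linux-mode locking above relies on Boost.Interprocess: the first task (num_gpu_users > 0) creates one named shared-memory segment and constructs the mutex and gpu_pool arrays inside it, while every other task retries open_only until the segment exists and then looks the objects up by name. A reduced sketch of that creator/opener handshake for reference, not from the patch itself; the segment and object names are illustrative:

#include <boost/interprocess/managed_shared_memory.hpp>
#include <unistd.h>

using namespace boost::interprocess;

static int *attach_shared_counter(bool creator)
{
	managed_shared_memory *seg = NULL;
	if (creator) {
		shared_memory_object::remove("example_memory");   // clear a stale segment
		seg = new managed_shared_memory(create_only, "example_memory", 65536);
		// construct<T>(name)(ctor-args) places a named object in the segment
		return seg->construct<int>("int example_counter")(0);
	}
	// openers retry until the creator has finished constructing the segment
	while (seg == NULL) {
		try { seg = new managed_shared_memory(open_only, "example_memory"); }
		catch (...) { sleep(1); }
	}
	return seg->find<int>("int example_counter").first;
}
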
@@ -585,6 +732,14 @@ static void allocate_locks(const int num_gpu_users, bool linux_mode)
585 allocate_locks_linux(num_gpu_users); 732 allocate_locks_linux(num_gpu_users);
586} 733}
587 734
735static void deallocate_locks(const int num_gpu_users, bool linux_mode)
736{
737 if(!linux_mode)
738 deallocate_locks_litmus();
739 else
740 deallocate_locks_linux(num_gpu_users);
741}
742
588static void set_cur_gpu(int gpu) 743static void set_cur_gpu(int gpu)
589{ 744{
590 if (TRACE_MIGRATIONS) { 745 if (TRACE_MIGRATIONS) {
@@ -597,47 +752,52 @@ static void set_cur_gpu(int gpu)
597} 752}
598 753
599 754
600static pthread_barrier_t *gpu_barrier = NULL; 755//static pthread_barrier_t *gpu_barrier = NULL;
601static interprocess_mutex *gpu_mgmt_mutexes = NULL; 756static interprocess_mutex *gpu_mgmt_mutexes = NULL;
602static managed_shared_memory *segment_ptr = NULL; 757static managed_shared_memory *gpu_mutex_segment_ptr = NULL;
603 758
604void coordinate_gpu_tasks(const int num_gpu_users) 759void coordinate_gpu_tasks(const int num_gpu_users)
605{ 760{
606 if(num_gpu_users > 0) 761 if(num_gpu_users > 0)
607 { 762 {
608 printf("%d creating shared memory\n", getpid()); 763 xprintf("%d creating shared memory\n", getpid());
609 shared_memory_object::remove("gpu_barrier_memory"); 764 shared_memory_object::remove("gpu_mutex_memory");
610 segment_ptr = new managed_shared_memory(create_only, "gpu_barrier_memory", 4*1024); 765 gpu_mutex_segment_ptr = new managed_shared_memory(create_only, "gpu_mutex_memory", PAGE_SIZE);
611 766
612 printf("%d creating a barrier for %d users\n", getpid(), num_gpu_users); 767// printf("%d creating a barrier for %d users\n", getpid(), num_gpu_users);
613 gpu_barrier = segment_ptr->construct<pthread_barrier_t>("pthread_barrier_t gpu_barrier")(); 768// gpu_barrier = segment_ptr->construct<pthread_barrier_t>("pthread_barrier_t gpu_barrier")();
614 pthread_barrierattr_t battr; 769// pthread_barrierattr_t battr;
615 pthread_barrierattr_init(&battr); 770// pthread_barrierattr_init(&battr);
616 pthread_barrierattr_setpshared(&battr, PTHREAD_PROCESS_SHARED); 771// pthread_barrierattr_setpshared(&battr, PTHREAD_PROCESS_SHARED);
617 pthread_barrier_init(gpu_barrier, &battr, num_gpu_users); 772// pthread_barrier_init(gpu_barrier, &battr, num_gpu_users);
618 pthread_barrierattr_destroy(&battr); 773// pthread_barrierattr_destroy(&battr);
619 printf("%d creating gpu mgmt mutexes for %d devices\n", getpid(), NR_GPUS); 774// printf("%d creating gpu mgmt mutexes for %d devices\n", getpid(), NR_GPUS);
620 gpu_mgmt_mutexes = segment_ptr->construct<interprocess_mutex>("interprocess_mutex m")[NR_GPUS](); 775 gpu_mgmt_mutexes = gpu_mutex_segment_ptr->construct<interprocess_mutex>("interprocess_mutex m")[NR_GPUS]();
621 } 776 }
622 else 777 else
623 { 778 {
779 sleep(5);
624 do 780 do
625 { 781 {
626 try 782 try
627 { 783 {
628 segment_ptr = new managed_shared_memory(open_only, "gpu_barrier_memory"); 784 gpu_mutex_segment_ptr = new managed_shared_memory(open_only, "gpu_mutex_memory");
629 } 785 }
630 catch(...) 786 catch(...)
631 { 787 {
632 sleep(1); 788 sleep(1);
633 } 789 }
634 }while(segment_ptr == NULL); 790 }while(gpu_mutex_segment_ptr == NULL);
635 791
636 gpu_barrier = segment_ptr->find<pthread_barrier_t>("pthread_barrier_t gpu_barrier").first; 792// gpu_barrier = segment_ptr->find<pthread_barrier_t>("pthread_barrier_t gpu_barrier").first;
637 gpu_mgmt_mutexes = segment_ptr->find<interprocess_mutex>("interprocess_mutex m").first; 793 gpu_mgmt_mutexes = gpu_mutex_segment_ptr->find<interprocess_mutex>("interprocess_mutex m").first;
638 } 794 }
639} 795}
640 796
797const size_t SEND_ALLOC_SIZE = 12*1024;
798const size_t RECV_ALLOC_SIZE = 12*1024;
799const size_t STATE_ALLOC_SIZE = 16*1024;
800
641typedef float spindata_t; 801typedef float spindata_t;
642 802
643char *d_send_data[NR_GPUS] = {0}; 803char *d_send_data[NR_GPUS] = {0};
@@ -653,18 +813,48 @@ char *h_send_data = 0;
653char *h_recv_data = 0; 813char *h_recv_data = 0;
654char *h_state_data = 0; 814char *h_state_data = 0;
655 815
656unsigned int *h_iteration_count[NR_GPUS] = {0}; 816static void destroy_events()
817{
818 for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
819 {
820 int which = GPU_PARTITION*GPU_PARTITION_SIZE + i;
821 gpu_mgmt_mutexes[which].lock();
822 set_cur_gpu(which);
823 cudaEventDestroy(EVENTS[which]);
824 gpu_mgmt_mutexes[which].unlock();
825 }
826}
827
828static void init_events()
829{
830 xprintf("creating %s events\n", (CUDA_SYNC_MODE == BLOCKING) ? "blocking" : "spinning");
831 for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
832 {
833 int which = GPU_PARTITION*GPU_PARTITION_SIZE + i;
834 gpu_mgmt_mutexes[which].lock();
835 set_cur_gpu(which);
836 if (CUDA_SYNC_MODE == BLOCKING)
837 cudaEventCreateWithFlags(&EVENTS[which], cudaEventBlockingSync | cudaEventDisableTiming);
838 else
839 cudaEventCreateWithFlags(&EVENTS[which], cudaEventDefault | cudaEventDisableTiming);
840 gpu_mgmt_mutexes[which].unlock();
841 }
842}
657 843
658static void init_cuda(const int num_gpu_users) 844static void init_cuda(const int num_gpu_users)
659{ 845{
660 const int PAGE_SIZE = 4*1024; 846 size_t send_alloc_bytes = SEND_ALLOC_SIZE + (SEND_ALLOC_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
661 size_t send_alloc_bytes = SEND_SIZE + (SEND_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; 847 size_t recv_alloc_bytes = RECV_ALLOC_SIZE + (RECV_ALLOC_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
662 size_t recv_alloc_bytes = RECV_SIZE + (RECV_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; 848 size_t state_alloc_bytes = STATE_ALLOC_SIZE + (STATE_ALLOC_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
663 size_t state_alloc_bytes = STATE_SIZE + (STATE_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
664 849
665 coordinate_gpu_tasks(num_gpu_users); 850 static bool first_time = true;
666 851
667#if 1 852 if (first_time) {
853 coordinate_gpu_tasks(num_gpu_users);
854 first_time = false;
855 }
856
857#if 0
668 switch (CUDA_SYNC_MODE) 858 switch (CUDA_SYNC_MODE)
669 { 859 {
670 case BLOCKING: 860 case BLOCKING:
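
init_events() above replaces per-stream synchronization with per-GPU events, and the event flags decide whether waiting on them blocks the calling thread or busy-waits. A reduced sketch of that choice for reference, not from the patch itself; make_sync_event() is an illustrative helper:

#include <cuda_runtime.h>

static cudaError_t make_sync_event(cudaEvent_t *ev, bool blocking)
{
	// cudaEventBlockingSync makes cudaEventSynchronize() sleep the calling
	// thread until the event completes; the default behavior spins.
	// Timing is disabled because the events only track completion.
	unsigned int flags = cudaEventDisableTiming |
	                     (blocking ? cudaEventBlockingSync : cudaEventDefault);
	return cudaEventCreateWithFlags(ev, flags);
}
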
@@ -674,8 +864,6 @@ static void init_cuda(const int num_gpu_users)
674 cudaSetDeviceFlags(cudaDeviceScheduleSpin); 864 cudaSetDeviceFlags(cudaDeviceScheduleSpin);
675 break; 865 break;
676 } 866 }
677#else
678 cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
679#endif 867#endif
680 868
681 for(int i = 0; i < GPU_PARTITION_SIZE; ++i) 869 for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
@@ -687,6 +875,9 @@ static void init_cuda(const int num_gpu_users)
687 try 875 try
688 { 876 {
689 set_cur_gpu(which); 877 set_cur_gpu(which);
878
879 xprintf("setting up GPU %d\n", which);
880
690 cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0); 881 cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0);
691 cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0); 882 cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0);
692 883
@@ -698,8 +889,8 @@ static void init_cuda(const int num_gpu_users)
698 // enough to fill the L2 cache exactly. 889 // enough to fill the L2 cache exactly.
699 ELEM_PER_THREAD[which] = (prop.l2CacheSize/(NUM_SM[which]*WARP_SIZE[which]*sizeof(spindata_t))); 890 ELEM_PER_THREAD[which] = (prop.l2CacheSize/(NUM_SM[which]*WARP_SIZE[which]*sizeof(spindata_t)));
700 891
701 892// if (!MIGRATE_VIA_SYSMEM && prop.unifiedAddressing)
702 if (!MIGRATE_VIA_SYSMEM && prop.unifiedAddressing) 893 if (prop.unifiedAddressing)
703 { 894 {
704 for(int j = 0; j < GPU_PARTITION_SIZE; ++j) 895 for(int j = 0; j < GPU_PARTITION_SIZE; ++j)
705 { 896 {
@@ -717,29 +908,23 @@ static void init_cuda(const int num_gpu_users)
717 } 908 }
718 } 909 }
719 910
720 cudaStreamCreate(&STREAMS[CUR_DEVICE]); 911 cudaStreamCreate(&STREAMS[which]);
721 912
913 // gpu working set
722 cudaMalloc(&d_spin_data[which], prop.l2CacheSize); 914 cudaMalloc(&d_spin_data[which], prop.l2CacheSize);
723 cudaMemset(&d_spin_data[which], 0, prop.l2CacheSize); 915 cudaMemset(&d_spin_data[which], 0, prop.l2CacheSize);
724// cudaMalloc(&d_iteration_count[which], NUM_SM[which]*WARP_SIZE[which]*sizeof(unsigned int));
725// cudaHostAlloc(&h_iteration_count[which], NUM_SM[which]*WARP_SIZE[which]*sizeof(unsigned int), cudaHostAllocPortable | cudaHostAllocMapped);
726
727 if (send_alloc_bytes) {
728 cudaMalloc(&d_send_data[which], send_alloc_bytes);
729 cudaHostAlloc(&h_send_data, send_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped);
730 }
731 916
732 if (h_recv_data) { 917 // send data
733 cudaMalloc(&d_recv_data[which], recv_alloc_bytes); 918 cudaMalloc(&d_send_data[which], send_alloc_bytes);
734 cudaHostAlloc(&h_recv_data, recv_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped); 919 cudaHostAlloc(&h_send_data, send_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped);
735 }
736 920
737 if (h_state_data) { 921 // recv data
738 cudaMalloc(&d_state_data[which], state_alloc_bytes); 922 cudaMalloc(&d_recv_data[which], recv_alloc_bytes);
923 cudaHostAlloc(&h_recv_data, recv_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped);
739 924
740 if (MIGRATE_VIA_SYSMEM) 925 // state data
741 cudaHostAlloc(&h_state_data, state_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped | cudaHostAllocWriteCombined); 926 cudaMalloc(&d_state_data[which], state_alloc_bytes);
742 } 927 cudaHostAlloc(&h_state_data, state_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped | cudaHostAllocWriteCombined);
743 } 928 }
744 catch(std::exception &e) 929 catch(std::exception &e)
745 { 930 {
@@ -793,6 +978,8 @@ static bool MigrateToGPU_SysMem(int from, int to)
793 // THIS IS ON-DEMAND SYS_MEM MIGRATION. GPUSync says 978 // THIS IS ON-DEMAND SYS_MEM MIGRATION. GPUSync says
794 // you should be using speculative migrations. 979 // you should be using speculative migrations.
795 // Use PushState() and PullState(). 980 // Use PushState() and PullState().
981 fprintf(stderr, "Tried to sysmem migrate from %d to %d\n",
982 from, to);
796 assert(false); // for now 983 assert(false); // for now
797 984
798 bool success = true; 985 bool success = true;
@@ -846,12 +1033,31 @@ static void MigrateIfNeeded(int next_gpu)
846 PushState(); 1033 PushState();
847 } 1034 }
848 } 1035 }
1036 else if(cur_gpu() == -1) {
1037 set_cur_gpu(next_gpu);
1038 }
849} 1039}
850 1040
851
852
853static void exit_cuda() 1041static void exit_cuda()
854{ 1042{
1043#if 0
1044 for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
1045 {
1046 int which = GPU_PARTITION*GPU_PARTITION_SIZE + i;
1047 gpu_mgmt_mutexes[which].lock();
1048 set_cur_gpu(which);
1049 cudaFree(d_send_data[which]);
1050 cudaFree(d_recv_data[which]);
1051 cudaFree(d_state_data[which]);
1052 cudaFree(d_spin_data[which]);
1053 gpu_mgmt_mutexes[which].unlock();
1054 }
1055#endif
1056
1057 cudaFreeHost(h_send_data);
1058 cudaFreeHost(h_recv_data);
1059 cudaFreeHost(h_state_data);
1060
855 for(int i = 0; i < GPU_PARTITION_SIZE; ++i) 1061 for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
856 { 1062 {
857 int which = GPU_PARTITION*GPU_PARTITION_SIZE + i; 1063 int which = GPU_PARTITION*GPU_PARTITION_SIZE + i;
@@ -860,6 +1066,14 @@ static void exit_cuda()
860 cudaDeviceReset(); 1066 cudaDeviceReset();
861 gpu_mgmt_mutexes[which].unlock(); 1067 gpu_mgmt_mutexes[which].unlock();
862 } 1068 }
1069
1070 memset(d_send_data, 0, sizeof(d_send_data));
1071 memset(d_recv_data, 0, sizeof(d_recv_data));
1072 memset(d_state_data, 0, sizeof(d_state_data));
1073 memset(d_spin_data, 0, sizeof(d_spin_data));
1074 h_send_data = NULL;
1075 h_recv_data = NULL;
1076 h_state_data = NULL;
863} 1077}
864 1078
865bool safetynet = false; 1079bool safetynet = false;
@@ -895,14 +1109,6 @@ static void catch_exit(int catch_exit)
895} 1109}
896 1110
897 1111
898
899
900
901#ifdef VANILLA_LINUX
902static float ms_sum;
903static int gpucount = 0;
904#endif
905
906__global__ void docudaspin(float* data, /*unsigned int* iterations,*/ unsigned int num_elem, unsigned int cycles) 1112__global__ void docudaspin(float* data, /*unsigned int* iterations,*/ unsigned int num_elem, unsigned int cycles)
907{ 1113{
908 long long int now = clock64(); 1114 long long int now = clock64();
@@ -959,13 +1165,30 @@ static void gpu_loop_for(double gpu_sec_time, unsigned int num_kernels, double e
959 chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE, 1165 chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE,
960 cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks()); 1166 cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks());
961 1167
1168 bool locked = false;
962 for(unsigned int i = 0; i < num_kernels; ++i) 1169 for(unsigned int i = 0; i < num_kernels; ++i)
963 { 1170 {
964 if(useEngineLocks()) litmus_lock(cur_ee()); 1171 if(useEngineLocks() && !locked) {
1172 litmus_lock(cur_ee());
1173 locked = true;
1174 }
1175
965 /* one block per sm, one warp per block */ 1176 /* one block per sm, one warp per block */
966 docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles); 1177 docudaspin <<<cur_sms(), cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles);
967 cudaStreamSynchronize(cur_stream()); 1178
968 if(useEngineLocks()) litmus_unlock(cur_ee()); 1179 if(useEngineLocks() && (!YIELD_LOCKS || (YIELD_LOCKS && litmus_should_yield_lock(cur_ee())))) {
1180// cudaStreamSynchronize(cur_stream());
1181 cudaEventRecord(cur_event(), cur_stream());
1182 cudaEventSynchronize(cur_event());
1183 litmus_unlock(cur_ee());
1184 locked = false;
1185 }
1186 }
1187 if (locked) {
1188 cudaEventRecord(cur_event(), cur_stream());
1189 cudaEventSynchronize(cur_event());
1190 litmus_unlock(cur_ee());
1191 locked = false;
969 } 1192 }
970 1193
971 if(RECV_SIZE > 0) 1194 if(RECV_SIZE > 0)
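
The engine-yield logic for kernel launches above keeps the execution-engine lock held across back-to-back kernels and only releases it when litmus_should_yield_lock() reports that a waiter would benefit, draining outstanding work through the completion event first. A reduced sketch for reference, not from the patch itself; spin_kernel() and run_kernels() are illustrative stand-ins for docudaspin() and the loop in gpu_loop_for():

#include <cuda_runtime.h>
#include "litmus.h"

__global__ void spin_kernel(void) { /* placeholder workload */ }

static void run_kernels(int ee_od, unsigned int n, cudaStream_t stream, cudaEvent_t ev)
{
	bool locked = false;
	for (unsigned int i = 0; i < n; ++i) {
		if (!locked) {
			litmus_lock(ee_od);          // acquire the execution engine
			locked = true;
		}
		spin_kernel<<<1, 32, 0, stream>>>();
		if (litmus_should_yield_lock(ee_od)) {
			cudaEventRecord(ev, stream); // mark the last launch
			cudaEventSynchronize(ev);    // drain before handing off the engine
			litmus_unlock(ee_od);
			locked = false;
		}
	}
	if (locked) {                            // final drain and release
		cudaEventRecord(ev, stream);
		cudaEventSynchronize(ev);
		litmus_unlock(ee_od);
	}
}
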
@@ -985,9 +1208,9 @@ out:
985 1208
986static void gpu_loop_for_linux(double gpu_sec_time, unsigned int num_kernels, double emergency_exit) 1209static void gpu_loop_for_linux(double gpu_sec_time, unsigned int num_kernels, double emergency_exit)
987{ 1210{
988 static int GPU_OFFSET = GPU_PARTITION * GPU_PARTITION_SIZE; 1211 int GPU_OFFSET = GPU_PARTITION * GPU_PARTITION_SIZE;
989 static gpu_pool *pool = &GPU_LINUX_SEM_POOL[GPU_PARTITION]; 1212 gpu_pool *pool = &GPU_LINUX_SEM_POOL[GPU_PARTITION];
990 static pthread_mutex_t *mutex = &GPU_LINUX_MUTEX_POOL[GPU_PARTITION]; 1213 pthread_mutex_t *mutex = &GPU_LINUX_MUTEX_POOL[GPU_PARTITION];
991 1214
992 int next_gpu; 1215 int next_gpu;
993 1216
@@ -996,19 +1219,10 @@ static void gpu_loop_for_linux(double gpu_sec_time, unsigned int num_kernels, do
996 if (emergency_exit && wctime() > emergency_exit) 1219 if (emergency_exit && wctime() > emergency_exit)
997 goto out; 1220 goto out;
998 1221
999#ifdef VANILLA_LINUX 1222 next_gpu = pool->get(mutex, ((cur_gpu() != -1) ?
1000 static bool once = false; 1223 cur_gpu() - GPU_OFFSET :
1001 static cudaEvent_t start, end; 1224 -1))
1002 float ms; 1225 + GPU_OFFSET;
1003 if (!once)
1004 {
1005 once = true;
1006 cudaEventCreate(&start);
1007 cudaEventCreate(&end);
1008 }
1009#endif
1010
1011 next_gpu = pool->get(mutex, cur_gpu() - GPU_OFFSET) + GPU_OFFSET;
1012 { 1226 {
1013 MigrateIfNeeded(next_gpu); 1227 MigrateIfNeeded(next_gpu);
1014 1228
@@ -1021,24 +1235,11 @@ static void gpu_loop_for_linux(double gpu_sec_time, unsigned int num_kernels, do
1021 for(unsigned int i = 0; i < num_kernels; ++i) 1235 for(unsigned int i = 0; i < num_kernels; ++i)
1022 { 1236 {
1023 /* one block per sm, one warp per block */ 1237 /* one block per sm, one warp per block */
1024#ifdef VANILLA_LINUX
1025 cudaEventRecord(start, cur_stream());
1026#endif
1027 docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles); 1238 docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles);
1028#ifdef VANILLA_LINUX 1239 cudaEventRecord(cur_event(), cur_stream());
1029 cudaEventRecord(end, cur_stream()); 1240 cudaEventSynchronize(cur_event());
1030 cudaEventSynchronize(end); 1241// cudaStreamSynchronize(cur_stream());
1031#endif
1032 cudaStreamSynchronize(cur_stream());
1033
1034#ifdef VANILLA_LINUX
1035 cudaEventElapsedTime(&ms, start, end);
1036 ms_sum += ms;
1037#endif
1038 } 1242 }
1039#ifdef VANILLA_LINUX
1040 ++gpucount;
1041#endif
1042 1243
1043 if(RECV_SIZE > 0) 1244 if(RECV_SIZE > 0)
1044 chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE, 1245 chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE,
@@ -1075,73 +1276,73 @@ static void usage(char *error) {
1075 exit(EXIT_FAILURE); 1276 exit(EXIT_FAILURE);
1076} 1277}
1077 1278
1078/* 1279///*
1079 * returns the character that made processing stop, newline or EOF 1280// * returns the character that made processing stop, newline or EOF
1080 */ 1281// */
1081static int skip_to_next_line(FILE *fstream) 1282//static int skip_to_next_line(FILE *fstream)
1082{ 1283//{
1083 int ch; 1284// int ch;
1084 for (ch = fgetc(fstream); ch != EOF && ch != '\n'; ch = fgetc(fstream)); 1285// for (ch = fgetc(fstream); ch != EOF && ch != '\n'; ch = fgetc(fstream));
1085 return ch; 1286// return ch;
1086} 1287//}
1087 1288//
1088static void skip_comments(FILE *fstream) 1289//static void skip_comments(FILE *fstream)
1089{ 1290//{
1090 int ch; 1291// int ch;
1091 for (ch = fgetc(fstream); ch == '#'; ch = fgetc(fstream)) 1292// for (ch = fgetc(fstream); ch == '#'; ch = fgetc(fstream))
1092 skip_to_next_line(fstream); 1293// skip_to_next_line(fstream);
1093 ungetc(ch, fstream); 1294// ungetc(ch, fstream);
1094} 1295//}
1095 1296//
1096static void get_exec_times(const char *file, const int column, 1297//static void get_exec_times(const char *file, const int column,
1097 int *num_jobs, double **exec_times) 1298// int *num_jobs, double **exec_times)
1098{ 1299//{
1099 FILE *fstream; 1300// FILE *fstream;
1100 int cur_job, cur_col, ch; 1301// int cur_job, cur_col, ch;
1101 *num_jobs = 0; 1302// *num_jobs = 0;
1102 1303//
1103 fstream = fopen(file, "r"); 1304// fstream = fopen(file, "r");
1104 if (!fstream) 1305// if (!fstream)
1105 bail_out("could not open execution time file"); 1306// bail_out("could not open execution time file");
1106 1307//
1107 /* figure out the number of jobs */ 1308// /* figure out the number of jobs */
1108 do { 1309// do {
1109 skip_comments(fstream); 1310// skip_comments(fstream);
1110 ch = skip_to_next_line(fstream); 1311// ch = skip_to_next_line(fstream);
1111 if (ch != EOF) 1312// if (ch != EOF)
1112 ++(*num_jobs); 1313// ++(*num_jobs);
1113 } while (ch != EOF); 1314// } while (ch != EOF);
1114 1315//
1115 if (-1 == fseek(fstream, 0L, SEEK_SET)) 1316// if (-1 == fseek(fstream, 0L, SEEK_SET))
1116 bail_out("rewinding file failed"); 1317// bail_out("rewinding file failed");
1117 1318//
1118 /* allocate space for exec times */ 1319// /* allocate space for exec times */
1119 *exec_times = (double*)calloc(*num_jobs, sizeof(*exec_times)); 1320// *exec_times = (double*)calloc(*num_jobs, sizeof(*exec_times));
1120 if (!*exec_times) 1321// if (!*exec_times)
1121 bail_out("couldn't allocate memory"); 1322// bail_out("couldn't allocate memory");
1122 1323//
1123 for (cur_job = 0; cur_job < *num_jobs && !feof(fstream); ++cur_job) { 1324// for (cur_job = 0; cur_job < *num_jobs && !feof(fstream); ++cur_job) {
1124 1325//
1125 skip_comments(fstream); 1326// skip_comments(fstream);
1126 1327//
1127 for (cur_col = 1; cur_col < column; ++cur_col) { 1328// for (cur_col = 1; cur_col < column; ++cur_col) {
1128 /* discard input until we get to the column we want */ 1329// /* discard input until we get to the column we want */
1129 int unused __attribute__ ((unused)) = fscanf(fstream, "%*s,"); 1330// int unused __attribute__ ((unused)) = fscanf(fstream, "%*s,");
1130 } 1331// }
1131 1332//
1132 /* get the desired exec. time */ 1333// /* get the desired exec. time */
1133 if (1 != fscanf(fstream, "%lf", (*exec_times)+cur_job)) { 1334// if (1 != fscanf(fstream, "%lf", (*exec_times)+cur_job)) {
1134 fprintf(stderr, "invalid execution time near line %d\n", 1335// fprintf(stderr, "invalid execution time near line %d\n",
1135 cur_job); 1336// cur_job);
1136 exit(EXIT_FAILURE); 1337// exit(EXIT_FAILURE);
1137 } 1338// }
1138 1339//
1139 skip_to_next_line(fstream); 1340// skip_to_next_line(fstream);
1140 } 1341// }
1141 1342//
1142 assert(cur_job == *num_jobs); 1343// assert(cur_job == *num_jobs);
1143 fclose(fstream); 1344// fclose(fstream);
1144} 1345//}
1145 1346
1146#define NUMS 4096 1347#define NUMS 4096
1147static int num[NUMS]; 1348static int num[NUMS];
@@ -1190,23 +1391,23 @@ out:
1190} 1391}
1191 1392
1192 1393
1193static void debug_delay_loop(void) 1394//static void debug_delay_loop(void)
1194{ 1395//{
1195 double start, end, delay; 1396// double start, end, delay;
1196 1397//
1197 while (1) { 1398// while (1) {
1198 for (delay = 0.5; delay > 0.01; delay -= 0.01) { 1399// for (delay = 0.5; delay > 0.01; delay -= 0.01) {
1199 start = wctime(); 1400// start = wctime();
1200 loop_for(delay, 0); 1401// loop_for(delay, 0);
1201 end = wctime(); 1402// end = wctime();
1202 printf("%6.4fs: looped for %10.8fs, delta=%11.8fs, error=%7.4f%%\n", 1403// printf("%6.4fs: looped for %10.8fs, delta=%11.8fs, error=%7.4f%%\n",
1203 delay, 1404// delay,
1204 end - start, 1405// end - start,
1205 end - start - delay, 1406// end - start - delay,
1206 100 * (end - start - delay) / delay); 1407// 100 * (end - start - delay) / delay);
1207 } 1408// }
1208 } 1409// }
1209} 1410//}
1210 1411
1211typedef bool (*gpu_job_t)(double exec_time, double gpu_exec_time, unsigned int num_kernels, double program_end); 1412typedef bool (*gpu_job_t)(double exec_time, double gpu_exec_time, unsigned int num_kernels, double program_end);
1212typedef bool (*cpu_job_t)(double exec_time, double program_end); 1413typedef bool (*cpu_job_t)(double exec_time, double program_end);
@@ -1288,6 +1489,108 @@ static void init_linux()
1288 mlockall(MCL_CURRENT | MCL_FUTURE); 1489 mlockall(MCL_CURRENT | MCL_FUTURE);
1289} 1490}
1290 1491
1492static int enable_aux_rt_tasks_linux(pid_t tid)
1493{
1494 /* pre: caller must already be real time */
1495 int ret = 0;
1496 struct sched_param param;
1497 stringstream pidstr;
1498 boost::filesystem::directory_iterator theEnd;
1499 boost::filesystem::path proc_dir;
1500
1501 int policy = sched_getscheduler(tid);
1502 if (policy == -1 || policy != SCHED_FIFO) {
1503 ret = -1;
1504 goto out;
1505 }
1506
1507 ret = sched_getparam(tid, &param);
1508 if (ret < 0)
1509 goto out;
1510
1511
1512 pidstr<<getpid();
1513 proc_dir = boost::filesystem::path("/proc");
1514 proc_dir /= pidstr.str();
1515 proc_dir /= "task";
1516
1517 for(boost::filesystem::directory_iterator iter(proc_dir); iter != theEnd; ++iter)
1518 {
1519 stringstream taskstr(iter->path().leaf().c_str());
1520 int child = 0;
1521 taskstr>>child;
1522 if (child != tid && child != 0)
1523 {
1524 /* mirror tid's params to others */
1525 ret = sched_setscheduler(child, policy, &param);
1526 if (ret != 0)
1527 goto out;
1528 }
1529 }
1530
1531out:
1532 return ret;
1533}
1534
1535static int disable_aux_rt_tasks_linux(pid_t tid)
1536{
1537 int ret = 0;
1538 struct sched_param param;
1539 stringstream pidstr;
1540 boost::filesystem::directory_iterator theEnd;
1541 boost::filesystem::path proc_dir;
1542
1543 memset(&param, 0, sizeof(param));
1544
1545 pidstr<<getpid();
1546 proc_dir = boost::filesystem::path("/proc");
1547 proc_dir /= pidstr.str();
1548 proc_dir /= "task";
1549
1550 for(boost::filesystem::directory_iterator iter(proc_dir); iter != theEnd; ++iter)
1551 {
1552 stringstream taskstr(iter->path().leaf().c_str());
1553 int child = 0;
1554 taskstr>>child;
1555 if (child != tid && child != 0)
1556 {
1557 /* make all other threads sched_normal */
1558 ret = sched_setscheduler(child, SCHED_OTHER, &param);
1559 if (ret != 0)
1560 goto out;
1561 }
1562 }
1563
1564out:
1565 return ret;
1566}
1567
1568static int be_migrate_all_to_cluster(int cluster, int cluster_size)
1569{
1570 int ret = 0;
1571 stringstream pidstr;
1572
1573 pidstr<<getpid();
1574 boost::filesystem::path proc_dir("/proc");
1575 proc_dir /= pidstr.str();
1576 proc_dir /= "task";
1577 boost::filesystem::directory_iterator theEnd;
1578 for(boost::filesystem::directory_iterator iter(proc_dir); iter != theEnd; ++iter)
1579 {
1580 stringstream taskstr(iter->path().leaf().c_str());
1581 int task = 0;
1582 taskstr>>task;
1583 if (task != 0) {
1584 ret = be_migrate_to_cluster(cluster, cluster_size);
1585 if (ret != 0)
1586 goto out;
1587 }
1588 }
1589
1590out:
1591 return ret;
1592}
1593
1291static bool gpu_job_linux(double exec_time, double gpu_exec_time, unsigned int num_kernels, double program_end) 1594static bool gpu_job_linux(double exec_time, double gpu_exec_time, unsigned int num_kernels, double program_end)
1292{ 1595{
1293 double chunk1, chunk2; 1596 double chunk1, chunk2;
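
The three helpers above (enable_aux_rt_tasks_linux(), disable_aux_rt_tasks_linux(), be_migrate_all_to_cluster()) share one mechanism: enumerate the sibling threads of the current process by walking /proc/<pid>/task with Boost.Filesystem (hence the new -lboost_filesystem dependency in the Makefile) and apply a scheduling or migration call to each thread id found. A reduced sketch of that enumeration for reference, not from the patch itself, assuming Boost.Filesystem v3; sibling_threads() is an illustrative helper:

#include <boost/filesystem.hpp>
#include <sstream>
#include <vector>
#include <unistd.h>

static std::vector<pid_t> sibling_threads(void)
{
	std::vector<pid_t> tids;
	std::stringstream pidstr;
	pidstr << getpid();

	// every entry in /proc/<pid>/task is named after one thread id
	boost::filesystem::path task_dir =
		boost::filesystem::path("/proc") / pidstr.str() / "task";

	boost::filesystem::directory_iterator end;
	for (boost::filesystem::directory_iterator it(task_dir); it != end; ++it) {
		std::stringstream name(it->path().filename().string());
		pid_t tid = 0;
		name >> tid;
		if (tid != 0)
			tids.push_back(tid);
	}
	return tids;
}
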
@@ -1322,195 +1625,820 @@ static bool job_linux(double exec_time, double program_end)
1322 1625
1323/*****************************/ 1626/*****************************/
1324 1627
1325enum eScheduler 1628
1629
1630
1631
1632enum eRunMode
1326{ 1633{
1327 LITMUS, 1634 NORMAL,
1328 LINUX, 1635 PROXY,
1329 RT_LINUX 1636 DAEMON,
1330}; 1637};
1331 1638
1332#define CPU_OPTIONS "p:z:c:wlveio:f:s:q:X:L:Q:d:" 1639void set_defaults(struct Args* args)
1333#define GPU_OPTIONS "g:y:r:C:E:DG:xS:R:T:Z:aFm:b:MNIk:VW:" 1640{
1334 1641 memset(args, 0, sizeof(*args));
1335// concat the option strings 1642 args->wcet_ms = -1.0;
1336#define OPTSTR CPU_OPTIONS GPU_OPTIONS 1643 args->gpu_wcet_ms = 0.0;
1644 args->period_ms = -1.0;
1645 args->budget_ms = -1.0;
1646 args->gpusync_mode = IKGLP_MODE;
1647 args->sync_mode = BLOCKING;
1648 args->gpu_using = false;
1649 args->enable_affinity = false;
1650 args->enable_chunking = false;
1651 args->relax_fifo_len = false;
1652 args->use_sysmem_migration = false;
1653 args->rho = 2;
1654 args->num_ce = 2;
1655 args->reserve_migr_ce = false;
1656 args->num_kernels = 1;
1657 args->engine_lock_type = FIFO;
1658 args->yield_locks = false;
1659 args->drain_policy = DRAIN_SIMPLE;
1660 args->want_enforcement = false;
1661 args->want_signals = false;
1662 args->priority = LITMUS_LOWEST_PRIORITY;
1663 args->cls = RT_CLASS_SOFT;
1664 args->scheduler = LITMUS;
1665 args->migrate = false;
1666 args->cluster = 0;
1667 args->cluster_size = 1;
1668 args->stddev = 0.0;
1669 args->wait = false;
1670 args->scale = 1.0;
1671 args->duration = 0.0;
1672}
1337 1673
1338int main(int argc, char** argv) 1674void apply_args(struct Args* args)
1339{ 1675{
1340 int ret; 1676 // set all the globals
1677 CPU_PARTITION_SIZE = args->cluster_size;
1678 GPU_USING = args->gpu_using;
1679 GPU_PARTITION = args->gpu_partition;
1680 GPU_PARTITION_SIZE = args->gpu_partition_size;
1681 RHO = args->rho;
1682 NUM_COPY_ENGINES = args->num_ce;
1683 RESERVED_MIGR_COPY_ENGINE = args->reserve_migr_ce;
1684 USE_ENGINE_LOCKS = args->use_engine_locks;
1685 ENGINE_LOCK_TYPE = args->engine_lock_type;
1686 YIELD_LOCKS = args->yield_locks;
1687 USE_DYNAMIC_GROUP_LOCKS = args->use_dgls;
1688 GPU_SYNC_MODE = args->gpusync_mode;
1689 ENABLE_AFFINITY = args->enable_affinity;
1690 RELAX_FIFO_MAX_LEN = args->relax_fifo_len;
1691 CUDA_SYNC_MODE = args->sync_mode;
1692 SEND_SIZE = args->send_size;
1693 RECV_SIZE = args->recv_size;
1694 STATE_SIZE = args->state_size;
1695 ENABLE_CHUNKING = args->enable_chunking;
1696 CHUNK_SIZE = args->chunk_size;
1697 MIGRATE_VIA_SYSMEM = args->use_sysmem_migration;
1698
1699 // roll back other globals to an initial state
1700 CUR_DEVICE = -1;
1701 LAST_DEVICE = -1;
1702}
1341 1703
1704int __do_normal(struct Args* args)
1705{
1706 int ret = 0;
1342 struct rt_task param; 1707 struct rt_task param;
1343 1708
1344 lt_t wcet; 1709 lt_t wcet;
1345 lt_t period; 1710 lt_t period;
1346 lt_t budget; 1711 lt_t budget;
1347 double wcet_ms = -1.0;
1348 double gpu_wcet_ms = 0.0;
1349 double period_ms = -1.0;
1350 double budget_ms = -1.0;
1351 1712
1352 unsigned int num_kernels = 1; 1713 Normal<double> *wcet_dist_ms = NULL;
1353 1714
1354 budget_drain_policy_t drain = DRAIN_SIMPLE; 1715 cpu_job_t cjobfn = NULL;
1355 bool want_enforcement = false; 1716 gpu_job_t gjobfn = NULL;
1356 bool want_signals = false;
1357 1717
1358 unsigned int priority = LITMUS_LOWEST_PRIORITY; 1718 double start = 0;
1359 1719
1360 task_class_t cls = RT_CLASS_SOFT; 1720 if (MIGRATE_VIA_SYSMEM && GPU_PARTITION_SIZE == 1)
1721 return -1;
1361 1722
1362 eScheduler scheduler = LITMUS; 1723 // turn off some features to be safe
1363 int num_gpu_users = 0; 1724 if (args->scheduler != LITMUS)
1364 int migrate = 0; 1725 {
1365 int cluster = 0; 1726 RHO = 0;
1366 int cluster_size = 1; 1727 USE_ENGINE_LOCKS = false;
1728 USE_DYNAMIC_GROUP_LOCKS = false;
1729 RELAX_FIFO_MAX_LEN = false;
1730 ENABLE_RT_AUX_THREADS = false;
1731 args->budget_ms = -1.0;
1732 args->want_enforcement = false;
1733 args->want_signals = false;
1367 1734
1368 Normal<double> *wcet_dist_ms = NULL; 1735 cjobfn = job_linux;
1369 float stdpct = 0.0; 1736 gjobfn = gpu_job_linux;
1737 }
1738 else
1739 {
1740 cjobfn = job;
1741 gjobfn = gpu_job;
1742 }
1370 1743
1371 cpu_job_t cjobfn = NULL; 1744 wcet = ms2ns(args->wcet_ms);
1372 gpu_job_t gjobfn = NULL; 1745 period = ms2ns(args->period_ms);
1746 if (wcet <= 0) {
1747 printf("The worst-case execution time must be a positive number.\n");
1748 ret = -1;
1749 goto out;
1750 }
1751 if (period <= 0) {
1752 printf("The period must be a positive number.\n");
1753 ret = -1;
1754 goto out;
1755 }
1756 if (wcet > period) {
1757 printf("The worst-case execution time must not exceed the period.\n");
1758 ret = -1;
1759 goto out;
1760 }
1761 if (args->gpu_using && args->gpu_wcet_ms <= 0) {
1762 printf("The worst-case gpu execution time must be a positive number.\n");
1763 ret = -1;
1764 goto out;
1765 }
1373 1766
1374 int wait = 0; 1767 if (args->budget_ms > 0.0)
1375 double scale = 1.0; 1768 budget = ms2ns(args->budget_ms);
1376 int test_loop = 0; 1769 else
1770 budget = args->wcet_ms;
1377 1771
1378 double duration = 0, start = 0; 1772 // randomize execution time according to a normal distribution
1379 int cur_job = 0, num_jobs = 0; 1773 // centered around the desired execution time.
1380 int column = 1; 1774 // standard deviation is a percentage of this average
1775 wcet_dist_ms = new Normal<double>(args->wcet_ms + args->gpu_wcet_ms, (args->wcet_ms + args->gpu_wcet_ms) * args->stddev);
1776 wcet_dist_ms->seed((unsigned int)time(0));
1381 1777
1382 int opt; 1778 ret = be_migrate_all_to_cluster(args->cluster, args->cluster_size);
1779 if (ret < 0) {
1780 printf("could not migrate to target partition or cluster.\n");
1781 goto out;
1782 }
1783
1784 if (args->scheduler != LITMUS)
1785 {
1786 // set some variables needed by linux modes
1787 if (args->gpu_using)
1788 TRACE_MIGRATIONS = true;
1789 periodTime.tv_sec = period / s2ns(1);
1790 periodTime.tv_nsec = period - periodTime.tv_sec * s2ns(1);
1791 period_ns = period;
1792 job_no = 0;
1793 }
1794
1795 init_rt_task_param(&param);
1796 param.exec_cost = budget;
1797 param.period = period;
1798 param.priority = args->priority;
1799 param.cls = args->cls;
1800 param.budget_policy = (args->want_enforcement) ?
1801 PRECISE_ENFORCEMENT : NO_ENFORCEMENT;
1802 param.budget_signal_policy = (args->want_enforcement && args->want_signals) ?
1803 PRECISE_SIGNALS : NO_SIGNALS;
1804 param.drain_policy = args->drain_policy;
1805 param.release_policy = PERIODIC;
1806 param.cpu = cluster_to_first_cpu(args->cluster, args->cluster_size);
1807
1808 ret = set_rt_task_param(gettid(), &param);
1809 if (ret < 0) {
1810 bail_out("could not setup rt task params\n");
1811 goto out;
1812 }
1813
1814 if (args->want_signals)
1815 /* bind default longjmp signal handler to SIG_BUDGET. */
1816 activate_litmus_signals(SIG_BUDGET_MASK, longjmp_on_litmus_signal);
1817 else
1818 ignore_litmus_signals(SIG_BUDGET_MASK);
1819
1820 if (args->gpu_using)
1821 allocate_locks(args->num_gpu_tasks, args->scheduler != LITMUS);
1822
1823 if (args->scheduler == LITMUS)
1824 {
1825 ret = task_mode(LITMUS_RT_TASK);
1826 if (ret < 0) {
1827 printf("could not become RT task\n");
1828 goto out;
1829 }
1830 }
1831 else
1832 {
1833 if (args->scheduler == RT_LINUX)
1834 {
1835 struct sched_param fifoparams;
1836 memset(&fifoparams, 0, sizeof(fifoparams));
1837 fifoparams.sched_priority = args->priority;
1838 ret = sched_setscheduler(getpid(), SCHED_FIFO, &fifoparams);
1839 if (ret < 0) {
1840 printf("could not become sched_fifo task\n");
1841 goto out;
1842 }
1843 }
1844 trace_name();
1845 trace_param();
1846 }
1847
1848 if (args->wait) {
1849 xprintf("%d: waiting for release.\n", getpid());
1850 ret = wait_for_ts_release2(&releaseTime);
1851 if (ret != 0) {
1852 printf("wait_for_ts_release2()\n");
1853 goto out;
1854 }
1855
1856 if (args->scheduler != LITMUS)
1857 log_release();
1858 }
1859 else if (args->scheduler != LITMUS)
1860 {
1861 clock_gettime(CLOCK_MONOTONIC, &releaseTime);
1862 sleep_next_period_linux();
1863 }
1864
1865 if (args->gpu_using && ENABLE_RT_AUX_THREADS) {
1866 if (args->scheduler == LITMUS) {
1867 ret = enable_aux_rt_tasks(AUX_CURRENT | AUX_FUTURE);
1868 if (ret != 0) {
1869 printf("enable_aux_rt_tasks() failed\n");
1870 goto out;
1871 }
1872 }
1873 else if (args->scheduler == RT_LINUX) {
1874 ret = enable_aux_rt_tasks_linux(gettid());
1875 if (ret != 0) {
1876 printf("enable_aux_rt_tasks_linux() failed\n");
1877 goto out;
1878 }
1879 }
1880 }
1383 1881
1384 double *exec_times = NULL; 1882 start = wctime();
1385 const char *file = NULL;
1386 1883
1387 /* locking */ 1884 if (!args->gpu_using) {
1388// int lock_od = -1; 1885 bool keepgoing;
1389// int resource_id = 0; 1886 do
1390// int protocol = -1; 1887 {
1391// double cs_length = 1; /* millisecond */ 1888 double job_ms = wcet_dist_ms->random();
1889 if (job_ms < 0.0)
1890 job_ms = 0.0;
1891 keepgoing = cjobfn(ms2s(job_ms * args->scale), start + args->duration);
1892 }while(keepgoing);
1893 }
1894 else {
1895 bool keepgoing;
1896 do
1897 {
1898 double job_ms = wcet_dist_ms->random();
1899 if (job_ms < 0.0)
1900 job_ms = 0.0;
1901
1902 double cpu_job_ms = (job_ms/(args->wcet_ms + args->gpu_wcet_ms))*args->wcet_ms;
1903 double gpu_job_ms = (job_ms/(args->wcet_ms + args->gpu_wcet_ms))*args->gpu_wcet_ms;
1904 keepgoing = gjobfn(
1905 ms2s(cpu_job_ms * args->scale),
1906 ms2s(gpu_job_ms * args->scale),
1907 args->num_kernels,
1908 start + args->duration);
1909 }while(keepgoing);
1910 }
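The GPU branch above splits each sampled job length between CPU and GPU time in proportion to the configured worst-case costs, so the per-job CPU/GPU ratio stays fixed while the total length follows the normal distribution drawn from wcet_dist_ms. A quick worked illustration with made-up numbers (not taken from the commit):

    /* Illustrative values only: wcet_ms = 10, gpu_wcet_ms = 5, sampled job_ms = 12. */
    double wcet_ms = 10.0, gpu_wcet_ms = 5.0, job_ms = 12.0;
    double cpu_job_ms = (job_ms / (wcet_ms + gpu_wcet_ms)) * wcet_ms;     /* = 8.0 ms */
    double gpu_job_ms = (job_ms / (wcet_ms + gpu_wcet_ms)) * gpu_wcet_ms; /* = 4.0 ms */

Each job therefore keeps the 2:1 CPU-to-GPU proportion of the declared budgets; only the overall job length varies from job to job.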
1911
1912 if (args->gpu_using && ENABLE_RT_AUX_THREADS) {
1913 if (args->scheduler == LITMUS) {
1914 ret = disable_aux_rt_tasks(AUX_CURRENT | AUX_FUTURE);
1915 if (ret != 0) {
1916 printf("disable_aux_rt_tasks() failed\n");
1917 goto out;
1918 }
1919 }
1920 else if(args->scheduler == RT_LINUX) {
1921 ret = disable_aux_rt_tasks_linux(gettid());
1922 if (ret != 0) {
1923 printf("disable_aux_rt_tasks_linux() failed\n");
1924 goto out;
1925 }
1926 }
1927 }
1928
1929 if (args->gpu_using)
1930 deallocate_locks(args->num_gpu_tasks, args->scheduler != LITMUS);
1931
1932 if (args->scheduler == LITMUS)
1933 {
1934 ret = task_mode(BACKGROUND_TASK);
1935 if (ret != 0) {
1936 printf("could not become regular task (huh?)\n");
1937 goto out;
1938 }
1939 }
1940
1941 {
1942 // become a normal task just in case.
1943 struct sched_param normalparams;
1944 memset(&normalparams, 0, sizeof(normalparams));
1945 ret = sched_setscheduler(getpid(), SCHED_OTHER, &normalparams);
1946 if (ret < 0) {
1947 printf("could not become sched_normal task\n");
1948 goto out;
1949 }
1950 }
1951
1952out:
1953 if (wcet_dist_ms)
1954 delete wcet_dist_ms;
1955
1956 return ret;
1957}
1958
1959int do_normal(struct Args* args)
1960{
1961 int ret = 0;
1962
1963 apply_args(args);
1964
1965 if (args->scheduler == LITMUS)
1966 init_litmus();
1967 else
1968 init_linux();
1969
1970 if (args->gpu_using) {
1971 signal(SIGABRT, catch_exit);
1972 signal(SIGTERM, catch_exit);
1973 signal(SIGQUIT, catch_exit);
1974 signal(SIGSEGV, catch_exit);
1975
1976 cudaSetDeviceFlags(cudaDeviceScheduleSpin);
1977 init_cuda(args->num_gpu_tasks);
1978 init_events();
1979 safetynet = true;
1980 }
1981
1982 ret = __do_normal(args);
1983
1984 if (args->gpu_using) {
1985 safetynet = false;
1986 exit_cuda();
1987 }
1988
1989 return ret;
1990}
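Both do_normal() above and do_daemon() below call cudaSetDeviceFlags(cudaDeviceScheduleSpin) before touching the device. For a real-time workload this is a deliberate trade-off: the host thread busy-waits on GPU synchronization instead of sleeping, which consumes CPU budget but keeps synchronization latency low and predictable. A small hedged fragment showing the choice (the helper name is hypothetical; the flag values are standard CUDA runtime flags):

    #include <cuda_runtime.h>

    /* Select the host-side wait policy before the CUDA context is created.
     * cudaDeviceScheduleSpin: spin-wait for low, predictable sync latency.
     * cudaDeviceScheduleBlockingSync: sleep until the device signals, saving CPU. */
    static int set_wait_policy(int spin)
    {
        cudaError_t err = cudaSetDeviceFlags(spin ? cudaDeviceScheduleSpin
                                                  : cudaDeviceScheduleBlockingSync);
        return (err == cudaSuccess) ? 0 : -1;
    }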
1991
1992typedef struct run_entry
1993{
1994 struct Args args;
1995 int used;
1996 int ret;
1997} run_entry_t;
1998
1999
2000
2001static int *num_run_entries = NULL;
2002static run_entry_t *run_entries = NULL;
2003static pthread_barrier_t *daemon_barrier = NULL;
2004static pthread_mutex_t *daemon_mutex = NULL;
2005
2006static run_entry_t *my_run_entry = NULL;
2007static managed_shared_memory *daemon_segment_ptr = NULL;
2008
2009int init_daemon(struct Args* args, int num_total_users, bool is_daemon)
2010{
2011 if (num_total_users)
2012 {
2013 shared_memory_object::remove("gpuspin_daemon_memory");
2014
2015 daemon_segment_ptr = new managed_shared_memory(create_only, "gpuspin_daemon_memory", 30*PAGE_SIZE);
2016 num_run_entries = daemon_segment_ptr->construct<int>("int num_run_entries")();
2017 *num_run_entries = num_total_users;
2018
2019 run_entries = daemon_segment_ptr->construct<struct run_entry>("run_entry_t run_entries")[num_total_users]();
2020 memset(run_entries, 0, sizeof(run_entry_t)*num_total_users);
2021
2022 daemon_mutex = daemon_segment_ptr->construct<pthread_mutex_t>("pthread_mutex_t daemon_mutex")();
2023 pthread_mutexattr_t attr;
2024 pthread_mutexattr_init(&attr);
2025 pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
2026 pthread_mutex_init(daemon_mutex, &attr);
2027 pthread_mutexattr_destroy(&attr);
2028
2029 daemon_barrier = daemon_segment_ptr->construct<pthread_barrier_t>("pthread_barrier_t daemon_barrier")();
2030 pthread_barrierattr_t battr;
2031 pthread_barrierattr_init(&battr);
2032 pthread_barrierattr_setpshared(&battr, PTHREAD_PROCESS_SHARED);
2033 pthread_barrier_init(daemon_barrier, &battr, args->num_tasks*2);
2034 pthread_barrierattr_destroy(&battr);
2035 }
2036 else
2037 {
2038 do
2039 {
2040 try
2041 {
2042 if (!daemon_segment_ptr) daemon_segment_ptr = new managed_shared_memory(open_only, "gpuspin_daemon_memory");
2043 }
2044 catch(...)
2045 {
2046 sleep(1);
2047 }
2048 }while(daemon_segment_ptr == NULL);
2049
2050 num_run_entries = daemon_segment_ptr->find<int>("int num_run_entries").first;
2051 run_entries = daemon_segment_ptr->find<struct run_entry>("run_entry_t run_entries").first;
2052 daemon_mutex = daemon_segment_ptr->find<pthread_mutex_t>("pthread_mutex_t daemon_mutex").first;
2053 daemon_barrier = daemon_segment_ptr->find<pthread_barrier_t>("pthread_barrier_t daemon_barrier").first;
2054 }
2055
2056 if (is_daemon)
2057 {
2058 // find and claim an entry
2059 pthread_mutex_lock(daemon_mutex);
2060 for(int i = 0; i < *num_run_entries; ++i)
2061 {
2062 if(!run_entries[i].used)
2063 {
2064 my_run_entry = &run_entries[i];
2065 my_run_entry->used = 1;
2066 break;
2067 }
2068 }
2069 pthread_mutex_unlock(daemon_mutex);
2070
2071 assert(my_run_entry);
2072 my_run_entry->args = *args;
2073 my_run_entry->ret = 0;
2074 }
2075 else
2076 {
2077 // find my entry
2078 pthread_mutex_lock(daemon_mutex);
2079 for(int i = 0; i < *num_run_entries; ++i)
2080 {
2081 if (run_entries[i].args.wcet_ms == args->wcet_ms &&
2082 run_entries[i].args.gpu_wcet_ms == args->gpu_wcet_ms &&
2083 run_entries[i].args.period_ms == args->period_ms)
2084 {
2085 my_run_entry = &run_entries[i];
2086 break;
2087 }
2088 }
2089 pthread_mutex_unlock(daemon_mutex);
2090 }
2091
2092 if (!my_run_entry)
2093 return -1;
2094 return 0;
2095}
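init_daemon() keeps all of its coordination state (the run-entry table, mutex, and barrier) in a Boost.Interprocess managed shared-memory segment so that the daemon and its proxies can find the same objects by name. A minimal, self-contained sketch of that pattern under assumed includes and names (none of these identifiers come from the commit):

    #include <boost/interprocess/managed_shared_memory.hpp>
    #include <boost/interprocess/shared_memory_object.hpp>
    #include <pthread.h>
    using namespace boost::interprocess;

    struct entry { int used; int ret; };        /* plain-old-data, safe to place in shared memory */

    int demo_create(int nusers)
    {
        shared_memory_object::remove("demo_memory");                  /* start from a clean segment */
        managed_shared_memory seg(create_only, "demo_memory", 65536);

        seg.construct<entry>("entries")[nusers]();                    /* named array of entries */

        pthread_mutex_t *mtx = seg.construct<pthread_mutex_t>("mutex")();
        pthread_mutexattr_t attr;
        pthread_mutexattr_init(&attr);
        pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);  /* usable across processes */
        pthread_mutex_init(mtx, &attr);
        pthread_mutexattr_destroy(&attr);
        return 0;
    }

    int demo_open(void)
    {
        managed_shared_memory seg(open_only, "demo_memory");          /* attach to the existing segment */
        entry *tab = seg.find<entry>("entries").first;                /* look objects up by name */
        pthread_mutex_t *mtx = seg.find<pthread_mutex_t>("mutex").first;
        return (tab && mtx) ? 0 : -1;
    }

The PTHREAD_PROCESS_SHARED attribute is what makes the pthread mutex (and, in the commit, the barrier) usable by a process other than the one that initialized it.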
2096
2097int put_next_run(struct Args* args)
2098{
2099 assert(my_run_entry);
2100
2101 pthread_mutex_lock(daemon_mutex);
2102 my_run_entry->args = *args;
2103 pthread_mutex_unlock(daemon_mutex);
2104
2105 pthread_barrier_wait(daemon_barrier);
2106
2107 return 0;
2108}
2109
2110int get_next_run(struct Args* args)
2111{
2112 assert(my_run_entry);
2113
2114 pthread_barrier_wait(daemon_barrier);
2115
2116 pthread_mutex_lock(daemon_mutex);
2117 *args = my_run_entry->args;
2118 my_run_entry->ret = 0;
2119 pthread_mutex_unlock(daemon_mutex);
2120
2121 return 0;
2122}
2123
2124int complete_run(int ret)
2125{
2126 assert(my_run_entry);
2127
2128 pthread_mutex_lock(daemon_mutex);
2129 my_run_entry->ret = ret;
2130 pthread_mutex_unlock(daemon_mutex);
2131
2132 pthread_barrier_wait(daemon_barrier);
2133
2134 return 0;
2135}
2136
2137int wait_completion()
2138{
2139 int ret = 0;
2140
2141 assert(my_run_entry);
2142
2143 pthread_barrier_wait(daemon_barrier);
2144
2145 pthread_mutex_lock(daemon_mutex);
2146 ret = my_run_entry->ret;
2147 pthread_mutex_unlock(daemon_mutex);
2148
2149 return ret;
2150}
2151
2152
2153
2154
2155int do_proxy(struct Args* args)
2156{
2157 int ret = 0;
2158 ret = init_daemon(args, 0, false);
2159 if (ret < 0)
2160 goto out;
2161 put_next_run(args);
2162 ret = wait_completion();
2163
2164out:
2165 return ret;
2166}
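do_proxy() above is the client half of the new gpusync daemon mode: it attaches to the daemon's shared segment, publishes one set of task parameters, and blocks until the daemon reports the run's outcome. The server half is the get_next_run()/__do_normal()/complete_run() loop inside do_daemon() below. A hedged sketch of proxy-side usage built only from the helpers defined above (the wrapper name is hypothetical):

    /* Hand one parameter set to an already-running gpuspin daemon and collect
     * its result. Each call crosses the process-shared barrier twice: once when
     * the work is handed over, and once when the daemon has finished the run. */
    static int run_once_via_daemon(struct Args *args)
    {
        if (init_daemon(args, 0, false) < 0)   /* attach to "gpuspin_daemon_memory" */
            return -1;
        put_next_run(args);                    /* publish args, then rendezvous with the daemon */
        return wait_completion();              /* rendezvous again and read the daemon's return code */
    }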
2167
2168static bool is_daemon = false;
2169static bool running = false;
2170static void catch_exit2(int signal)
2171{
2172 if (is_daemon && running)
2173 complete_run(-signal);
2174 catch_exit(signal);
2175}
2176
2177int do_daemon(struct Args* args)
2178{
2179 is_daemon = true;
2180
2181 int ret = 0;
2182 struct Args nextargs;
2183
2184 signal(SIGFPE, catch_exit2);
2185 signal(SIGABRT, catch_exit2);
2186 signal(SIGTERM, catch_exit2);
2187 signal(SIGQUIT, catch_exit2);
2188 signal(SIGSEGV, catch_exit2);
2189
2190 init_daemon(args, args->num_tasks, true);
2191
2192 apply_args(args);
2193 init_litmus(); /* does everything init_linux() does, plus litmus stuff */
2194
2195 if (args->gpu_using) {
2196 cudaSetDeviceFlags(cudaDeviceScheduleSpin);
2197 init_cuda(args->num_gpu_tasks);
2198 init_events();
2199 safetynet = true;
2200 }
2201
2202 do {
2203 bool sync_change = false;
2204 bool gpu_part_change = false;
2205 bool gpu_part_size_change = false;
2206
2207 xprintf("%d: waiting for work\n", getpid());
2208
2209 get_next_run(&nextargs);
2210
2211 if (nextargs.gpu_using) {
2212 xprintf("%d: gpu using! gpu partition = %d, gwcet = %f, send = %lu\n",
2213 getpid(),
2214 nextargs.gpu_partition,
2215 nextargs.gpu_wcet_ms,
2216 nextargs.send_size);
2217 }
2218
2219 running = true;
2220 sync_change = args->gpu_using && (CUDA_SYNC_MODE != nextargs.sync_mode);
2221 gpu_part_change = args->gpu_using && (GPU_PARTITION != nextargs.gpu_partition);
2222 gpu_part_size_change = args->gpu_using && (GPU_PARTITION_SIZE != nextargs.gpu_partition_size);
2223
2224 if (sync_change || gpu_part_change || gpu_part_size_change) {
2225 destroy_events();
2226 if (gpu_part_change || gpu_part_size_change)
2227 exit_cuda();
2228 }
2229 apply_args(&nextargs);
2230 if (sync_change || gpu_part_change || gpu_part_size_change) {
2231 if (gpu_part_change || gpu_part_size_change) {
2232 xprintf("%d: changing device configuration\n", getpid());
2233 init_cuda(nextargs.num_gpu_tasks);
2234 CUR_DEVICE = -1;
2235 LAST_DEVICE = -1;
2236 }
2237 init_events();
2238 }
2239
2240 xprintf("%d: starting run\n", getpid());
2241
2242 ret = __do_normal(&nextargs);
2243 complete_run(ret);
2244 running = false;
2245 }while(ret == 0);
2246
2247 if (args->gpu_using) {
2248 safetynet = false;
2249 exit_cuda();
2250 }
2251
2252 if (args->num_gpu_tasks)
2253 shared_memory_object::remove("gpu_mutex_memory");
2254
2255 if (args->num_tasks)
2256 shared_memory_object::remove("gpuspin_daemon_memory");
2257
2258 return ret;
2259}
2260
2261#define CPU_OPTIONS "p:z:c:wlveio:f:s:q:X:L:Q:d:"
2262#define GPU_OPTIONS "g:y:r:C:E:DG:xS:R:T:Z:aFm:b:MNIk:VW:u"
2263#define PROXY_OPTIONS "B:PA"
2264
2265// concat the option strings
2266#define OPTSTR CPU_OPTIONS GPU_OPTIONS PROXY_OPTIONS
2267
2268int main(int argc, char** argv)
2269{
2270 struct Args myArgs;
2271 set_defaults(&myArgs);
2272
2273 eRunMode run_mode = NORMAL;
2274
2275 int opt;
1392 2276
1393 progname = argv[0]; 2277 progname = argv[0];
1394 2278
1395 while ((opt = getopt(argc, argv, OPTSTR)) != -1) { 2279 while ((opt = getopt(argc, argv, OPTSTR)) != -1) {
1396 switch (opt) { 2280 switch (opt) {
2281 case 'B':
2282 myArgs.num_tasks = atoi(optarg);
2283 break;
2284 case 'P':
2285 run_mode = PROXY;
2286 break;
2287 case 'A':
2288 run_mode = DAEMON;
2289 break;
2290
2291
1397 case 'w': 2292 case 'w':
1398 wait = 1; 2293 myArgs.wait = true;
1399 break; 2294 break;
1400 case 'p': 2295 case 'p':
1401 cluster = atoi(optarg); 2296 myArgs.cluster = atoi(optarg);
1402 migrate = 1; 2297 myArgs.migrate = true;
1403 break; 2298 break;
1404 case 'z': 2299 case 'z':
1405 cluster_size = atoi(optarg); 2300// CPU_PARTITION_SIZE = cluster_size;
1406 CPU_PARTITION_SIZE = cluster_size; 2301 myArgs.cluster_size = atoi(optarg);
1407 break; 2302 break;
1408 case 'g': 2303 case 'g':
1409 GPU_USING = true; 2304// GPU_USING = true;
1410 GPU_PARTITION = atoi(optarg); 2305// GPU_PARTITION = atoi(optarg);
1411 assert(GPU_PARTITION >= 0 && GPU_PARTITION < NR_GPUS); 2306 myArgs.gpu_using = true;
2307 myArgs.gpu_partition = atoi(optarg);
2308// assert(GPU_PARTITION >= 0 && GPU_PARTITION < NR_GPUS);
1412 break; 2309 break;
1413 case 'y': 2310 case 'y':
1414 GPU_PARTITION_SIZE = atoi(optarg); 2311// GPU_PARTITION_SIZE = atoi(optarg);
1415 assert(GPU_PARTITION_SIZE > 0); 2312 myArgs.gpu_partition_size = atoi(optarg);
2313// assert(GPU_PARTITION_SIZE > 0);
1416 break; 2314 break;
1417 case 'r': 2315 case 'r':
1418 RHO = atoi(optarg); 2316// RHO = atoi(optarg);
1419 assert(RHO > 0); 2317 myArgs.rho = atoi(optarg);
2318// assert(RHO > 0);
1420 break; 2319 break;
1421 case 'C': 2320 case 'C':
1422 NUM_COPY_ENGINES = atoi(optarg); 2321// NUM_COPY_ENGINES = atoi(optarg);
1423 assert(NUM_COPY_ENGINES == 1 || NUM_COPY_ENGINES == 2); 2322 myArgs.num_ce = atoi(optarg);
2323// assert(NUM_COPY_ENGINES == 1 || NUM_COPY_ENGINES == 2);
1424 break; 2324 break;
1425 case 'V': 2325 case 'V':
1426 RESERVED_MIGR_COPY_ENGINE = true; 2326// RESERVED_MIGR_COPY_ENGINE = true;
2327 myArgs.reserve_migr_ce = true;
1427 break; 2328 break;
1428 case 'E': 2329 case 'E':
1429 USE_ENGINE_LOCKS = true; 2330// USE_ENGINE_LOCKS = true;
1430 ENGINE_LOCK_TYPE = (eEngineLockTypes)atoi(optarg); 2331// ENGINE_LOCK_TYPE = (eEngineLockTypes)atoi(optarg);
1431 assert(ENGINE_LOCK_TYPE == FIFO || ENGINE_LOCK_TYPE == PRIOQ); 2332 myArgs.use_engine_locks = true;
2333 myArgs.engine_lock_type = (eEngineLockTypes)atoi(optarg);
2334// assert(ENGINE_LOCK_TYPE == FIFO || ENGINE_LOCK_TYPE == PRIOQ);
2335 break;
2336 case 'u':
2337 myArgs.yield_locks = true;
1432 break; 2338 break;
1433 case 'D': 2339 case 'D':
1434 USE_DYNAMIC_GROUP_LOCKS = true; 2340// USE_DYNAMIC_GROUP_LOCKS = true;
2341 myArgs.use_dgls = true;
1435 break; 2342 break;
1436 case 'G': 2343 case 'G':
1437 GPU_SYNC_MODE = (eGpuSyncMode)atoi(optarg); 2344// GPU_SYNC_MODE = (eGpuSyncMode)atoi(optarg);
1438 assert(GPU_SYNC_MODE >= IKGLP_MODE && GPU_SYNC_MODE <= RGEM_MODE); 2345 myArgs.gpusync_mode = (eGpuSyncMode)atoi(optarg);
2346// assert(GPU_SYNC_MODE >= IKGLP_MODE && GPU_SYNC_MODE <= RGEM_MODE);
1439 break; 2347 break;
1440 case 'a': 2348 case 'a':
1441 ENABLE_AFFINITY = true; 2349// ENABLE_AFFINITY = true;
2350 myArgs.enable_affinity = true;
1442 break; 2351 break;
1443 case 'F': 2352 case 'F':
1444 RELAX_FIFO_MAX_LEN = true; 2353// RELAX_FIFO_MAX_LEN = true;
2354 myArgs.relax_fifo_len = true;
1445 break; 2355 break;
1446 case 'x': 2356 case 'x':
1447 CUDA_SYNC_MODE = SPIN; 2357// CUDA_SYNC_MODE = SPIN;
2358 myArgs.sync_mode = SPIN;
1448 break; 2359 break;
1449 case 'S': 2360 case 'S':
1450 SEND_SIZE = kbToB((size_t)atoi(optarg)); 2361// SEND_SIZE = kbToB((size_t)atoi(optarg));
2362 myArgs.send_size = kbToB((size_t)atoi(optarg));
1451 break; 2363 break;
1452 case 'R': 2364 case 'R':
1453 RECV_SIZE = kbToB((size_t)atoi(optarg)); 2365// RECV_SIZE = kbToB((size_t)atoi(optarg));
2366 myArgs.recv_size = kbToB((size_t)atoi(optarg));
1454 break; 2367 break;
1455 case 'T': 2368 case 'T':
1456 STATE_SIZE = kbToB((size_t)atoi(optarg)); 2369// STATE_SIZE = kbToB((size_t)atoi(optarg));
2370 myArgs.state_size = kbToB((size_t)atoi(optarg));
1457 break; 2371 break;
1458 case 'Z': 2372 case 'Z':
1459 ENABLE_CHUNKING = true; 2373// ENABLE_CHUNKING = true;
1460 CHUNK_SIZE = kbToB((size_t)atoi(optarg)); 2374// CHUNK_SIZE = kbToB((size_t)atoi(optarg));
2375 myArgs.enable_chunking = true;
2376 myArgs.chunk_size = kbToB((size_t)atoi(optarg));
1461 break; 2377 break;
1462 case 'M': 2378 case 'M':
1463 MIGRATE_VIA_SYSMEM = true; 2379// MIGRATE_VIA_SYSMEM = true;
2380 myArgs.use_sysmem_migration = true;
1464 break; 2381 break;
1465 case 'm': 2382 case 'm':
1466 num_gpu_users = (int)atoi(optarg); 2383// num_gpu_users = (int)atoi(optarg);
1467 assert(num_gpu_users > 0); 2384 myArgs.num_gpu_tasks = (int)atoi(optarg);
2385// assert(num_gpu_users > 0);
1468 break; 2386 break;
1469 case 'k': 2387 case 'k':
1470 num_kernels = (unsigned int)atoi(optarg); 2388// num_kernels = (unsigned int)atoi(optarg);
2389 myArgs.num_kernels = (unsigned int)atoi(optarg);
1471 break; 2390 break;
1472 case 'b': 2391 case 'b':
1473 budget_ms = atoi(optarg); 2392// budget_ms = atoi(optarg);
2393 myArgs.budget_ms = atoi(optarg);
1474 break; 2394 break;
1475 case 'W': 2395 case 'W':
1476 stdpct = atof(optarg); 2396// stdpct = (double)atof(optarg);
2397 myArgs.stddev = (double)atof(optarg);
1477 break; 2398 break;
1478 case 'N': 2399 case 'N':
1479 scheduler = LINUX; 2400// scheduler = LINUX;
2401 myArgs.scheduler = LINUX;
1480 break; 2402 break;
1481 case 'I': 2403 case 'I':
1482 scheduler = RT_LINUX; 2404// scheduler = RT_LINUX;
2405 myArgs.scheduler = RT_LINUX;
1483 break; 2406 break;
1484 case 'q': 2407 case 'q':
1485 priority = atoi(optarg); 2408// priority = atoi(optarg);
2409 myArgs.priority = atoi(optarg);
1486 break; 2410 break;
1487 case 'c': 2411 case 'c':
1488 cls = str2class(optarg); 2412// cls = str2class(optarg);
1489 if (cls == -1) 2413 myArgs.cls = str2class(optarg);
1490 usage("Unknown task class.");
1491 break; 2414 break;
1492 case 'e': 2415 case 'e':
1493 want_enforcement = true; 2416// want_enforcement = true;
2417 myArgs.want_enforcement = true;
1494 break; 2418 break;
1495 case 'i': 2419 case 'i':
1496 want_signals = true; 2420// want_signals = true;
2421 myArgs.want_signals = true;
1497 break; 2422 break;
1498 case 'd': 2423 case 'd':
1499 drain = (budget_drain_policy_t)atoi(optarg); 2424// drain = (budget_drain_policy_t)atoi(optarg);
1500 assert(drain >= DRAIN_SIMPLE && drain <= DRAIN_SOBLIV); 2425 myArgs.drain_policy = (budget_drain_policy_t)atoi(optarg);
1501 assert(drain != DRAIN_SAWARE); // unsupported 2426// assert(drain >= DRAIN_SIMPLE && drain <= DRAIN_SOBLIV);
1502 break; 2427// assert(drain != DRAIN_SAWARE); // unsupported
1503 case 'l':
1504 test_loop = 1;
1505 break;
1506 case 'o':
1507 column = atoi(optarg);
1508 break; 2428 break;
2429// case 'l':
2430// test_loop = 1;
2431// break;
2432// case 'o':
2433//// column = atoi(optarg);
2434// myArgs.column = atoi(optarg);
2435// break;
1509// case 'f': 2436// case 'f':
1510// file = optarg; 2437// file = optarg;
1511// break; 2438// break;
1512 case 's': 2439 case 's':
1513 scale = atof(optarg); 2440// scale = (double)atof(optarg);
2441 myArgs.scale = (double)atof(optarg);
1514 break; 2442 break;
1515// case 'X': 2443// case 'X':
1516// protocol = lock_protocol_for_name(optarg); 2444// protocol = lock_protocol_for_name(optarg);
@@ -1537,304 +2465,33 @@ int main(int argc, char** argv)
1537 } 2465 }
1538 } 2466 }
1539 2467
1540#ifdef VANILLA_LINUX
1541 assert(scheduler != LITMUS);
1542 assert(!wait);
1543#endif
1544
1545 assert(stdpct >= 0.0);
1546
1547 if (MIGRATE_VIA_SYSMEM)
1548 assert(GPU_PARTITION_SIZE != 1);
1549
1550 // turn off some features to be safe
1551 if (scheduler != LITMUS)
1552 {
1553 RHO = 0;
1554 USE_ENGINE_LOCKS = false;
1555 USE_DYNAMIC_GROUP_LOCKS = false;
1556 RELAX_FIFO_MAX_LEN = false;
1557 ENABLE_RT_AUX_THREADS = false;
1558 budget_ms = -1.0;
1559 want_enforcement = false;
1560 want_signals = false;
1561
1562 cjobfn = job_linux;
1563 gjobfn = gpu_job_linux;
1564
1565 if (scheduler == RT_LINUX)
1566 {
1567 struct sched_param fifoparams;
1568
1569 assert(priority >= sched_get_priority_min(SCHED_FIFO) &&
1570 priority <= sched_get_priority_max(SCHED_FIFO));
1571
1572 memset(&fifoparams, 0, sizeof(fifoparams));
1573 fifoparams.sched_priority = priority;
1574 assert(0 == sched_setscheduler(getpid(), SCHED_FIFO, &fifoparams));
1575 }
1576 }
1577 else
1578 {
1579 cjobfn = job;
1580 gjobfn = gpu_job;
1581
1582 if (!litmus_is_valid_fixed_prio(priority))
1583 usage("Invalid priority.");
1584 }
1585
1586 if (test_loop) {
1587 debug_delay_loop();
1588 return 0;
1589 }
1590 2468
1591 srand(time(0)); 2469 srand(time(0));
1592 2470
1593 if (file) {
1594 get_exec_times(file, column, &num_jobs, &exec_times);
1595
1596 if (argc - optind < 2)
1597 usage("Arguments missing.");
1598
1599 for (cur_job = 0; cur_job < num_jobs; ++cur_job) {
1600 /* convert the execution time to seconds */
1601 duration += exec_times[cur_job] * 0.001;
1602 }
1603 } else {
1604 /*
1605 * if we're not reading from the CSV file, then we need
1606 * three parameters
1607 */
1608 if (argc - optind < 3)
1609 usage("Arguments missing.");
1610 }
1611
1612 if (argc - optind == 3) { 2471 if (argc - optind == 3) {
1613 assert(!GPU_USING); 2472 myArgs.wcet_ms = atof(argv[optind + 0]);
1614 wcet_ms = atof(argv[optind + 0]); 2473 myArgs.period_ms = atof(argv[optind + 1]);
1615 period_ms = atof(argv[optind + 1]); 2474 myArgs.duration = atof(argv[optind + 2]);
1616 duration = atof(argv[optind + 2]);
1617 } 2475 }
1618 else if (argc - optind == 4) { 2476 else if (argc - optind == 4) {
1619 assert(GPU_USING); 2477 myArgs.wcet_ms = atof(argv[optind + 0]);
1620 wcet_ms = atof(argv[optind + 0]); 2478 myArgs.gpu_wcet_ms = atof(argv[optind + 1]);
1621 gpu_wcet_ms = atof(argv[optind + 1]); 2479 myArgs.period_ms = atof(argv[optind + 2]);
1622 period_ms = atof(argv[optind + 2]); 2480 myArgs.duration = atof(argv[optind + 3]);
1623 duration = atof(argv[optind + 3]);
1624 }
1625
1626 wcet = ms2ns(wcet_ms);
1627 period = ms2ns(period_ms);
1628 if (wcet <= 0)
1629 usage("The worst-case execution time must be a "
1630 "positive number.");
1631 if (period <= 0)
1632 usage("The period must be a positive number.");
1633 if (!file && wcet > period) {
1634 usage("The worst-case execution time must not "
1635 "exceed the period.");
1636 }
1637 if (GPU_USING && gpu_wcet_ms <= 0)
1638 usage("The worst-case gpu execution time must be a positive number.");
1639
1640 if (budget_ms > 0.0)
1641 budget = ms2ns(budget_ms);
1642 else
1643 budget = wcet;
1644
1645#if 0
1646 // use upscale to determine breakdown utilization
1647 // only scaling up CPU time for now.
1648 double upscale = (double)period/(double)budget - 1.0;
1649 upscale = std::min(std::max(0.0, upscale), 0.6); // at most 30%
1650 wcet = wcet + wcet*upscale;
1651 budget = budget + wcet*upscale;
1652 wcet_ms = wcet_ms + wcet_ms*upscale;
1653
1654 // fucking floating point
1655 if (budget < wcet)
1656 budget = wcet;
1657 if (budget > period)
1658 budget = period;
1659#endif
1660
1661 // randomize execution time according to a normal distribution
1662 // centered around the desired execution time.
1663 // standard deviation is a percentage of this average
1664 wcet_dist_ms = new Normal<double>(wcet_ms + gpu_wcet_ms, (wcet_ms + gpu_wcet_ms) * stdpct);
1665 wcet_dist_ms->seed((unsigned int)time(0));
1666
1667 if (file && num_jobs > 1)
1668 duration += period_ms * 0.001 * (num_jobs - 1);
1669
1670 if (migrate) {
1671 ret = be_migrate_to_cluster(cluster, cluster_size);
1672 if (ret < 0)
1673 bail_out("could not migrate to target partition or cluster.");
1674 }
1675
1676 if (scheduler != LITMUS)
1677 {
1678 // set some variables needed by linux modes
1679 if (GPU_USING)
1680 {
1681 TRACE_MIGRATIONS = true;
1682 }
1683 periodTime.tv_sec = period / s2ns(1);
1684 periodTime.tv_nsec = period - periodTime.tv_sec * s2ns(1);
1685 period_ns = period;
1686 }
1687
1688 init_rt_task_param(&param);
1689 param.exec_cost = budget;
1690 param.period = period;
1691 param.priority = priority;
1692 param.cls = cls;
1693 param.budget_policy = (want_enforcement) ?
1694 PRECISE_ENFORCEMENT : NO_ENFORCEMENT;
1695 param.budget_signal_policy = (want_enforcement && want_signals) ?
1696 PRECISE_SIGNALS : NO_SIGNALS;
1697 param.drain_policy = drain;
1698 param.release_policy = PERIODIC;
1699
1700 if (migrate)
1701 param.cpu = cluster_to_first_cpu(cluster, cluster_size);
1702 ret = set_rt_task_param(gettid(), &param);
1703 if (ret < 0)
1704 bail_out("could not setup rt task params");
1705
1706 if (scheduler == LITMUS) {
1707 init_litmus();
1708 }
1709 else {
1710 init_linux();
1711 }
1712
1713 if (want_signals) {
1714 /* bind default longjmp signal handler to SIG_BUDGET. */
1715 activate_litmus_signals(SIG_BUDGET_MASK, longjmp_on_litmus_signal);
1716 }
1717
1718// if (protocol >= 0) {
1719// /* open reference to semaphore */
1720// lock_od = litmus_open_lock(protocol, resource_id, lock_namespace, &cluster);
1721// if (lock_od < 0) {
1722// perror("litmus_open_lock");
1723// usage("Could not open lock.");
1724// }
1725// }
1726
1727 if (GPU_USING) {
1728 allocate_locks(num_gpu_users, scheduler != LITMUS);
1729
1730 signal(SIGABRT, catch_exit);
1731 signal(SIGTERM, catch_exit);
1732 signal(SIGQUIT, catch_exit);
1733 signal(SIGSEGV, catch_exit);
1734
1735 init_cuda(num_gpu_users);
1736 safetynet = true;
1737 }
1738
1739 if (scheduler == LITMUS)
1740 {
1741 ret = task_mode(LITMUS_RT_TASK);
1742 if (ret != 0)
1743 bail_out("could not become RT task");
1744 }
1745 else
1746 {
1747 trace_name();
1748 trace_param();
1749 }
1750
1751 if (wait) {
1752 ret = wait_for_ts_release2(&releaseTime);
1753 if (ret != 0)
1754 bail_out("wait_for_ts_release2()");
1755
1756 if (scheduler != LITMUS)
1757 log_release();
1758 }
1759 else if (scheduler != LITMUS)
1760 {
1761 clock_gettime(CLOCK_MONOTONIC, &releaseTime);
1762 sleep_next_period_linux();
1763 } 2481 }
1764 2482
1765 if (scheduler == LITMUS && GPU_USING && ENABLE_RT_AUX_THREADS) { 2483 if (myArgs.num_tasks == 0 || myArgs.num_gpu_tasks == 0) {
1766 if (enable_aux_rt_tasks(AUX_CURRENT | AUX_FUTURE) != 0) 2484 // safety w.r.t. shared mem.
1767 bail_out("enable_aux_rt_tasks() failed"); 2485 sleep(2);
1768 } 2486 }
1769 2487
1770 start = wctime(); 2488 if (run_mode == NORMAL) {
1771 2489 return do_normal(&myArgs);
1772 if (!GPU_USING) {
1773 bool keepgoing;
1774 do
1775 {
1776 double job_ms = wcet_dist_ms->random();
1777 if (job_ms < 0.0)
1778 job_ms = 0.0;
1779 keepgoing = cjobfn(ms2s(job_ms * scale), start + duration);
1780 }while(keepgoing);
1781 }
1782 else {
1783 bool keepgoing;
1784 do
1785 {
1786 double job_ms = wcet_dist_ms->random();
1787 if (job_ms < 0.0)
1788 job_ms = 0.0;
1789
1790 double cpu_job_ms = (job_ms/(wcet_ms + gpu_wcet_ms))*wcet_ms;
1791 double gpu_job_ms = (job_ms/(wcet_ms + gpu_wcet_ms))*gpu_wcet_ms;
1792 keepgoing = gjobfn(
1793 ms2s(cpu_job_ms * scale),
1794 ms2s(gpu_job_ms * scale),
1795 num_kernels,
1796 start + duration);
1797 }while(keepgoing);
1798 } 2490 }
1799 2491 else if (run_mode == PROXY) {
1800 if (GPU_USING && ENABLE_RT_AUX_THREADS) 2492 return do_proxy(&myArgs);
1801 if (disable_aux_rt_tasks(AUX_CURRENT | AUX_FUTURE) != 0)
1802 bail_out("disable_aux_rt_tasks() failed");
1803
1804// if (file) {
1805// /* use times read from the CSV file */
1806// for (cur_job = 0; cur_job < num_jobs; ++cur_job) {
1807// /* convert job's length to seconds */
1808// job(exec_times[cur_job] * 0.001 * scale,
1809// start + duration,
1810// lock_od, cs_length * 0.001);
1811// }
1812// } else {
1813// /* convert to seconds and scale */
1814// while (job(wcet_ms * 0.001 * scale, start + duration,
1815// lock_od, cs_length * 0.001));
1816// }
1817
1818 if (scheduler == LITMUS)
1819 {
1820 ret = task_mode(BACKGROUND_TASK);
1821 if (ret != 0)
1822 bail_out("could not become regular task (huh?)");
1823 } 2493 }
1824 2494 else if (run_mode == DAEMON) {
1825 if (GPU_USING) { 2495 return do_daemon(&myArgs);
1826 safetynet = false;
1827 exit_cuda();
1828
1829
1830// printf("avg: %f\n", ms_sum/gpucount);
1831 } 2496 }
1832
1833 if (wcet_dist_ms)
1834 delete wcet_dist_ms;
1835
1836 if (file)
1837 free(exec_times);
1838
1839 return 0;
1840} 2497}
diff --git a/include/litmus.h b/include/litmus.h
index d3b89cf..a6c2b13 100644
--- a/include/litmus.h
+++ b/include/litmus.h
@@ -89,6 +89,7 @@ int litmus_open_lock(
89/* real-time locking protocol support */ 89/* real-time locking protocol support */
90int litmus_lock(int od); 90int litmus_lock(int od);
91int litmus_unlock(int od); 91int litmus_unlock(int od);
92int litmus_should_yield_lock(int od);
92 93
93/* Dynamic group lock support. ods arrays MUST BE PARTIALLY ORDERED!!!!!! 94/* Dynamic group lock support. ods arrays MUST BE PARTIALLY ORDERED!!!!!!
94 * Use the same ordering for lock and unlock. 95 * Use the same ordering for lock and unlock.
@@ -99,6 +100,7 @@ int litmus_unlock(int od);
99 */ 100 */
100int litmus_dgl_lock(int* ods, int dgl_size); 101int litmus_dgl_lock(int* ods, int dgl_size);
101int litmus_dgl_unlock(int* ods, int dgl_size); 102int litmus_dgl_unlock(int* ods, int dgl_size);
103int litmus_dgl_should_yield_lock(int* ods, int dgl_size);
102 104
103/* nvidia graphics cards */ 105/* nvidia graphics cards */
104int register_nv_device(int nv_device_id); 106int register_nv_device(int nv_device_id);
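The new litmus_should_yield_lock() entry point lets a lock holder ask the kernel whether it should give up the lock, which is how this commit's engine-yield logic lets a long GPU operation make way for a waiting task between chunks. A minimal sketch of one possible usage pattern (do_chunk() is a hypothetical unit of engine work, and the sketch assumes a positive return value means "yield now"; the exact return convention is not shown in this diff):

    /* Sketch only: poll the yield hint between chunks while holding engine lock od. */
    int run_chunks_with_yield(int od, int nchunks)
    {
        int ret = litmus_lock(od);
        if (ret < 0)
            return ret;
        for (int i = 0; i < nchunks; ++i) {
            do_chunk(i);                                   /* hypothetical copy/kernel chunk */
            if (i + 1 < nchunks && litmus_should_yield_lock(od) > 0) {
                litmus_unlock(od);                         /* let a waiting task use the engine */
                ret = litmus_lock(od);                     /* re-queue for the remaining chunks */
                if (ret < 0)
                    return ret;
            }
        }
        return litmus_unlock(od);
    }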
diff --git a/src/kernel_iface.c b/src/kernel_iface.c
index e446102..73d398f 100644
--- a/src/kernel_iface.c
+++ b/src/kernel_iface.c
@@ -80,7 +80,7 @@ int get_nr_ts_release_waiters(void)
80} 80}
81 81
82/* thread-local pointer to control page */ 82/* thread-local pointer to control page */
83static __thread struct control_page *ctrl_page; 83static __thread struct control_page *ctrl_page = NULL;
84 84
85int init_kernel_iface(void) 85int init_kernel_iface(void)
86{ 86{
diff --git a/src/syscalls.c b/src/syscalls.c
index d3ca5d8..ff02b7d 100644
--- a/src/syscalls.c
+++ b/src/syscalls.c
@@ -58,6 +58,11 @@ int litmus_unlock(int od)
58 return syscall(__NR_litmus_unlock, od); 58 return syscall(__NR_litmus_unlock, od);
59} 59}
60 60
61int litmus_should_yield_lock(int od)
62{
63 return syscall(__NR_litmus_should_yield_lock, od);
64}
65
61int litmus_dgl_lock(int *ods, int dgl_size) 66int litmus_dgl_lock(int *ods, int dgl_size)
62{ 67{
63 return syscall(__NR_litmus_dgl_lock, ods, dgl_size); 68 return syscall(__NR_litmus_dgl_lock, ods, dgl_size);
@@ -68,6 +73,11 @@ int litmus_dgl_unlock(int *ods, int dgl_size)
68 return syscall(__NR_litmus_dgl_unlock, ods, dgl_size); 73 return syscall(__NR_litmus_dgl_unlock, ods, dgl_size);
69} 74}
70 75
76int litmus_dgl_should_yield_lock(int *ods, int dgl_size)
77{
78 return syscall(__NR_litmus_dgl_should_yield_lock, ods, dgl_size);
79}
80
71int get_job_no(unsigned int *job_no) 81int get_job_no(unsigned int *job_no)
72{ 82{
73 return syscall(__NR_query_job_no, job_no); 83 return syscall(__NR_query_job_no, job_no);
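litmus_dgl_should_yield_lock() is the dynamic-group counterpart of the single-lock call added above: the holder passes the same partially ordered od array it locked with, and can drop and re-acquire the whole group if the kernel advises a yield. A sketch along the same lines as the single-lock example (again, do_chunk() is hypothetical and the positive-return convention is an assumption):

    /* Sketch only: yield an entire dynamic group lock between chunks. */
    int run_chunks_with_dgl_yield(int *ods, int dgl_size, int nchunks)
    {
        int ret = litmus_dgl_lock(ods, dgl_size);
        if (ret < 0)
            return ret;
        for (int i = 0; i < nchunks; ++i) {
            do_chunk(i);
            if (i + 1 < nchunks && litmus_dgl_should_yield_lock(ods, dgl_size) > 0) {
                litmus_dgl_unlock(ods, dgl_size);          /* same ordering as the lock call */
                ret = litmus_dgl_lock(ods, dgl_size);
                if (ret < 0)
                    return ret;
            }
        }
        return litmus_dgl_unlock(ods, dgl_size);
    }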