path: root/gpu/gpuspin.cu
author    Glenn Elliott <gelliott@cs.unc.edu>  2013-04-14 15:06:43 -0400
committer Glenn Elliott <gelliott@cs.unc.edu>  2013-04-14 15:06:43 -0400
commit    37b4a24ba84f1dffd680fd550a3d8cad2ac5e3a8 (patch)
tree      5dc5e56a7a4f424e75f59f7705263bdb43b86fb3 /gpu/gpuspin.cu
parent    209f1961ea2d5863d6f2d2e9d2323446ee5e53c4 (diff)
Implemented gpusync rtspin.
Diffstat (limited to 'gpu/gpuspin.cu')
-rw-r--r--  gpu/gpuspin.cu  1720
1 files changed, 1720 insertions, 0 deletions
diff --git a/gpu/gpuspin.cu b/gpu/gpuspin.cu
new file mode 100644
index 0000000..aff6cd1
--- /dev/null
+++ b/gpu/gpuspin.cu
@@ -0,0 +1,1720 @@
1#include <sys/time.h>
2
3#include <stdio.h>
4#include <stdlib.h>
5#include <unistd.h>
6#include <time.h>
7#include <string.h>
8#include <assert.h>
9#include <execinfo.h>
10
11#include <boost/interprocess/managed_shared_memory.hpp>
12#include <boost/interprocess/sync/interprocess_mutex.hpp>
13
14#include <cuda_runtime.h>
15
16#include "litmus.h"
17#include "common.h"
18
19using namespace std;
20using namespace boost::interprocess;
21
22const char *lock_namespace = "./.gpuspin-locks";
23
24const int NR_GPUS = 8;
25
26bool GPU_USING = false;
27bool ENABLE_AFFINITY = false;
28bool RELAX_FIFO_MAX_LEN = false;
29bool ENABLE_CHUNKING = false;
30bool MIGRATE_VIA_SYSMEM = false;
31
32enum eEngineLockTypes
33{
34 FIFO,
35 PRIOQ
36};
37
38eEngineLockTypes ENGINE_LOCK_TYPE = FIFO;
39
40int GPU_PARTITION = 0;
41int GPU_PARTITION_SIZE = 0;
42int CPU_PARTITION_SIZE = 0;
43
44int RHO = 2;
45
46int NUM_COPY_ENGINES = 2;
47
48
49__attribute__((unused)) static size_t kbToB(size_t kb) { return kb * 1024; }
50__attribute__((unused)) static size_t mbToB(size_t mb) { return kbToB(mb * 1024); }
51
52/* in bytes */
53size_t SEND_SIZE = 0;
54size_t RECV_SIZE = 0;
55size_t STATE_SIZE = 0;
56size_t CHUNK_SIZE = 0;
57
58int TOKEN_LOCK = -1;
59
60bool USE_ENGINE_LOCKS = true;
61bool USE_DYNAMIC_GROUP_LOCKS = false;
62int EE_LOCKS[NR_GPUS];
63int CE_SEND_LOCKS[NR_GPUS];
64int CE_RECV_LOCKS[NR_GPUS];
65int CE_MIGR_SEND_LOCKS[NR_GPUS];
66int CE_MIGR_RECV_LOCKS[NR_GPUS];
67bool RESERVED_MIGR_COPY_ENGINE = false; // only checked if NUM_COPY_ENGINES == 2
68
69bool ENABLE_RT_AUX_THREADS = true;
70
71enum eGpuSyncMode
72{
73 IKGLP_MODE,
74 IKGLP_WC_MODE, /* work-conserving IKGLP. no GPU is left idle, but breaks optimality */
75 KFMLP_MODE,
76 RGEM_MODE,
77};
78
79eGpuSyncMode GPU_SYNC_MODE = IKGLP_MODE;
80
81enum eCudaSyncMode
82{
83 BLOCKING,
84 SPIN
85};
86
87eCudaSyncMode CUDA_SYNC_MODE = BLOCKING;
88
89
90int CUR_DEVICE = -1;
91int LAST_DEVICE = -1;
92
93cudaStream_t STREAMS[NR_GPUS];
94int GPU_HZ[NR_GPUS];
95int NUM_SM[NR_GPUS];
96int WARP_SIZE[NR_GPUS];
97int ELEM_PER_THREAD[NR_GPUS];
98
99#define DEFINE_PER_GPU(type, var) type var[NR_GPUS]
100#define per_gpu(var, idx) (var[(idx)])
101#define this_gpu(var) (var[(CUR_DEVICE)])
102#define cur_stream() (this_gpu(STREAMS))
103#define cur_gpu() (CUR_DEVICE)
104#define last_gpu() (LAST_DEVICE)
105#define cur_ee() (EE_LOCKS[CUR_DEVICE])
106#define cur_send() (CE_SEND_LOCKS[CUR_DEVICE])
107#define cur_recv() (CE_RECV_LOCKS[CUR_DEVICE])
108#define cur_migr_send() (CE_MIGR_SEND_LOCKS[CUR_DEVICE])
109#define cur_migr_recv() (CE_MIGR_RECV_LOCKS[CUR_DEVICE])
110#define cur_hz() (GPU_HZ[CUR_DEVICE])
111#define cur_sms() (NUM_SM[CUR_DEVICE])
112#define cur_warp_size() (WARP_SIZE[CUR_DEVICE])
113#define cur_elem_per_thread() (ELEM_PER_THREAD[CUR_DEVICE])
114#define num_online_gpus() (NUM_GPUS)
115
116static bool useEngineLocks()
117{
118 return(USE_ENGINE_LOCKS);
119}
120
121#define VANILLA_LINUX /* when defined, the trace_*() hooks below are no-ops and set_rt_task_param() is stubbed out */
122
123bool TRACE_MIGRATIONS = false;
124#ifndef VANILLA_LINUX
125#define trace_migration(to, from) do { inject_gpu_migration((to), (from)); } while(0)
126#define trace_release(arrival, deadline, jobno) do { inject_release((arrival), (deadline), (jobno)); } while(0)
127#define trace_completion(jobno) do { inject_completion((jobno)); } while(0)
128#define trace_name() do { inject_name(); } while(0)
129#define trace_param() do { inject_param(); } while(0)
130#else
131#define set_rt_task_param(x, y) (0)
132#define trace_migration(to, from)
133#define trace_release(arrival, deadline, jobno)
134#define trace_completion(jobno)
135#define trace_name()
136#define trace_param()
137#endif
138
139struct ce_lock_state
140{
141 int locks[2];
142 size_t num_locks;
143 size_t budget_remaining;
144 bool locked;
145
146 ce_lock_state(int device_a, enum cudaMemcpyKind kind, size_t size, int device_b = -1, bool migration = false) {
147 num_locks = (device_a != -1) + (device_b != -1);
148
149 if(device_a != -1) {
150 if (!migration)
151 locks[0] = (kind == cudaMemcpyHostToDevice || (kind == cudaMemcpyDeviceToDevice && device_b == -1)) ?
152 CE_SEND_LOCKS[device_a] : CE_RECV_LOCKS[device_a];
153 else
154 locks[0] = (kind == cudaMemcpyHostToDevice || (kind == cudaMemcpyDeviceToDevice && device_b == -1)) ?
155 CE_MIGR_SEND_LOCKS[device_a] : CE_MIGR_RECV_LOCKS[device_a];
156 }
157
158 if(device_b != -1) {
159 assert(kind == cudaMemcpyDeviceToDevice);
160
161 if (!migration)
162 locks[1] = CE_RECV_LOCKS[device_b];
163 else
164 locks[1] = CE_MIGR_RECV_LOCKS[device_b];
165
166 if(locks[1] < locks[0]) {
167 // enforce total order on locking
168 int temp = locks[1];
169 locks[1] = locks[0];
170 locks[0] = temp;
171 }
172 }
173 else {
174 locks[1] = -1;
175 }
176
177 if(!ENABLE_CHUNKING)
178 budget_remaining = size;
179 else
180 budget_remaining = CHUNK_SIZE;
181 }
182
183 void crash(void) {
184 void *array[50];
185 int size, i;
186 char **messages;
187
188 size = backtrace(array, 50);
189 messages = backtrace_symbols(array, size);
190
191 fprintf(stderr, "%d: TRIED TO GRAB SAME LOCK TWICE! Lock = %d\n", getpid(), locks[0]);
192 for (i = 1; i < size && messages != NULL; ++i)
193 {
194 fprintf(stderr, "%d: [bt]: (%d) %s\n", getpid(), i, messages[i]);
195 }
196 free(messages);
197
198 assert(false);
199 }
200
201
202 void lock() {
203 if(locks[0] == locks[1]) crash();
204
205 if(USE_DYNAMIC_GROUP_LOCKS) {
206 litmus_dgl_lock(locks, num_locks);
207 }
208 else
209 {
210 for(int l = 0; l < num_locks; ++l)
211 {
212 litmus_lock(locks[l]);
213 }
214 }
215 locked = true;
216 }
217
218 void unlock() {
219 if(locks[0] == locks[1]) crash();
220
221 if(USE_DYNAMIC_GROUP_LOCKS) {
222 litmus_dgl_unlock(locks, num_locks);
223 }
224 else
225 {
226 // reverse order
227 for(int l = num_locks - 1; l >= 0; --l)
228 {
229 litmus_unlock(locks[l]);
230 }
231 }
232 locked = false;
233 }
234
235 void refresh() {
236 budget_remaining = CHUNK_SIZE;
237 }
238
239 bool budgetIsAvailable(size_t tosend) {
240 return(tosend >= budget_remaining);
241 }
242
243 void decreaseBudget(size_t spent) {
244 budget_remaining -= spent;
245 }
246};
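A worked example of the lock ordering enforced by the constructor above (descriptor values hypothetical): if a device-to-device copy needs CE_RECV_LOCKS[device_a] = 14 and CE_RECV_LOCKS[device_b] = 9, the pair is stored as locks = {9, 14}, so every task acquires the lower descriptor first. That shared total order makes the nested acquisition in lock() deadlock-free; with USE_DYNAMIC_GROUP_LOCKS, litmus_dgl_lock() instead requests both descriptors in a single call, and the non-DGL unlock() path releases them in reverse acquisition order.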
247
248// precondition: if do_locking == true, locks in state are held.
249static cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count,
250 enum cudaMemcpyKind kind,
251 ce_lock_state* state)
252{
253 cudaError_t ret = cudaSuccess;
254 int remaining = count;
255
256 char* dst = (char*)a_dst;
257 const char* src = (const char*)a_src;
258
259 // disable chunking, if needed, by setting chunk_size equal to the
260 // amount of data to be copied.
261 int chunk_size = (ENABLE_CHUNKING) ? CHUNK_SIZE : count;
262 int i = 0;
263
264 while(remaining != 0)
265 {
266 int bytesToCopy = std::min(remaining, chunk_size);
267
268 if(state && state->budgetIsAvailable(bytesToCopy) && state->locked) {
269 cudaStreamSynchronize(STREAMS[CUR_DEVICE]);
270 ret = cudaGetLastError();
271
272 if(ret != cudaSuccess)
273 {
274 break;
275 }
276
277 state->unlock();
278 state->refresh(); // replenish.
279 // we can only run out of
280 // budget if chunking is enabled.
281 // we presume that init budget would
282 // be set to cover entire memcpy
283 // if chunking were disabled.
284 }
285
286 if(state && !state->locked) {
287 state->lock();
288 }
289
290 //ret = cudaMemcpy(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind);
291 cudaMemcpyAsync(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind, STREAMS[CUR_DEVICE]);
292
293 if(state) {
294 state->decreaseBudget(bytesToCopy);
295 }
296
297 ++i;
298 remaining -= bytesToCopy;
299 }
300 return ret;
301}
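To make the budget handling above concrete (sizes illustrative only): with ENABLE_CHUNKING set and CHUNK_SIZE = 64 KB, a 150 KB copy is issued as chunks of 64 KB, 64 KB, and 22 KB. Whenever the next chunk would meet or exceed budget_remaining, the loop synchronizes the stream, releases the copy-engine lock(s), resets the budget to CHUNK_SIZE, and re-acquires the lock(s) before issuing that chunk, so competing tasks can interleave their copies at chunk granularity instead of waiting out the full 150 KB transfer.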
302
303static cudaError_t chunkMemcpy(void* a_dst, const void* a_src, size_t count,
304 enum cudaMemcpyKind kind,
305 int device_a = -1, // device_a == -1 disables locking
306 bool do_locking = true,
307 int device_b = -1,
308 bool migration = false)
309{
310 cudaError_t ret;
311 if(!do_locking || device_a == -1) {
312 ret = __chunkMemcpy(a_dst, a_src, count, kind, NULL);
313 cudaStreamSynchronize(cur_stream());
314 if(ret == cudaSuccess)
315 ret = cudaGetLastError();
316 }
317 else {
318 ce_lock_state state(device_a, kind, count, device_b, migration);
319 state.lock();
320 ret = __chunkMemcpy(a_dst, a_src, count, kind, &state);
321 cudaStreamSynchronize(cur_stream());
322 if(ret == cudaSuccess)
323 ret = cudaGetLastError();
324 state.unlock();
325 }
326 return ret;
327}
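For reference, a minimal sketch of how chunkMemcpy() is intended to be called; these mirror the real call sites later in this file, with the buffer choices serving only as examples:

	/* locked host-to-device copy on the current device's copy engine */
	chunkMemcpy(this_gpu(d_send_data), h_send_data, SEND_SIZE,
	            cudaMemcpyHostToDevice, cur_gpu(), useEngineLocks());

	/* unlocked copy: device_a == -1 (or do_locking == false) bypasses ce_lock_state */
	chunkMemcpy(h_recv_data, this_gpu(d_recv_data), RECV_SIZE,
	            cudaMemcpyDeviceToHost, -1, false);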
328
329
330void allocate_locks_litmus(void)
331{
332 // open the shared lock-namespace file and allocate the GPU token lock (IKGLP/KFMLP/RGEM variants below)
333 int fd = open(lock_namespace, O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
334
335 int base_name = GPU_PARTITION * 1000;
336
337 if (GPU_SYNC_MODE == IKGLP_MODE) {
338 /* Standard (optimal) IKGLP */
339 TOKEN_LOCK = open_gpusync_token_lock(fd,
340 base_name, /* name */
341 GPU_PARTITION_SIZE,
342 GPU_PARTITION*GPU_PARTITION_SIZE,
343 RHO,
344 IKGLP_M_IN_FIFOS,
345 (!RELAX_FIFO_MAX_LEN) ?
346 IKGLP_OPTIMAL_FIFO_LEN :
347 IKGLP_UNLIMITED_FIFO_LEN,
348 ENABLE_AFFINITY);
349 }
350 else if (GPU_SYNC_MODE == KFMLP_MODE) {
351 /* KFMLP. FIFO queues only for tokens. */
352 TOKEN_LOCK = open_gpusync_token_lock(fd,
353 base_name, /* name */
354 GPU_PARTITION_SIZE,
355 GPU_PARTITION*GPU_PARTITION_SIZE,
356 RHO,
357 IKGLP_UNLIMITED_IN_FIFOS,
358 IKGLP_UNLIMITED_FIFO_LEN,
359 ENABLE_AFFINITY);
360 }
361 else if (GPU_SYNC_MODE == RGEM_MODE) {
362 /* RGEM-like token allocation. Shared priority queue for all tokens. */
363 TOKEN_LOCK = open_gpusync_token_lock(fd,
364 base_name, /* name */
365 GPU_PARTITION_SIZE,
366 GPU_PARTITION*GPU_PARTITION_SIZE,
367 RHO,
368 RHO*GPU_PARTITION_SIZE,
369 1,
370 ENABLE_AFFINITY);
371 }
372 else if (GPU_SYNC_MODE == IKGLP_WC_MODE) {
373 /* Non-optimal IKGLP that never lets a replica idle if there are pending
374 * token requests. */
375 int max_simult_run = std::max(CPU_PARTITION_SIZE, RHO*GPU_PARTITION_SIZE);
376 int max_fifo_len = (int)ceil((float)max_simult_run / (RHO*GPU_PARTITION_SIZE));
377 TOKEN_LOCK = open_gpusync_token_lock(fd,
378 base_name, /* name */
379 GPU_PARTITION_SIZE,
380 GPU_PARTITION*GPU_PARTITION_SIZE,
381 RHO,
382 max_simult_run,
383 (!RELAX_FIFO_MAX_LEN) ?
384 max_fifo_len :
385 IKGLP_UNLIMITED_FIFO_LEN,
386 ENABLE_AFFINITY);
387 }
388 else {
389 perror("Invalid GPUSync mode specified\n");
390 TOKEN_LOCK = -1;
391 }
392
393 if(TOKEN_LOCK < 0)
394 perror("open_token_sem");
395
396 if(USE_ENGINE_LOCKS)
397 {
398 assert(NUM_COPY_ENGINES == 1 || NUM_COPY_ENGINES == 2);
399 assert((NUM_COPY_ENGINES == 1 && !RESERVED_MIGR_COPY_ENGINE) || NUM_COPY_ENGINES == 2);
400
401 // allocate the engine locks.
402 for (int i = 0; i < GPU_PARTITION_SIZE; ++i)
403 {
404 int idx = GPU_PARTITION*GPU_PARTITION_SIZE + i;
405 int ee_name = (i+1)*10 + base_name;
406 int ce_0_name = (i+1)*10 + base_name + 1;
407 int ce_1_name = (i+1)*10 + base_name + 2;
408 int ee_lock = -1, ce_0_lock = -1, ce_1_lock = -1;
409
410 open_sem_t openEngineLock = (ENGINE_LOCK_TYPE == FIFO) ?
411 open_fifo_sem : open_prioq_sem;
412
413 ee_lock = openEngineLock(fd, ee_name);
414 if (ee_lock < 0)
415 perror("open_*_sem (engine lock)");
416
417 ce_0_lock = openEngineLock(fd, ce_0_name);
418 if (ce_0_lock < 0)
419 perror("open_*_sem (engine lock)");
420
421 if (NUM_COPY_ENGINES == 2)
422 {
423 ce_1_lock = openEngineLock(fd, ce_1_name);
424 if (ce_1_lock < 0)
425 perror("open_*_sem (engine lock)");
426 }
427
428 EE_LOCKS[idx] = ee_lock;
429
430 if (NUM_COPY_ENGINES == 1)
431 {
432 // share locks
433 CE_SEND_LOCKS[idx] = ce_0_lock;
434 CE_RECV_LOCKS[idx] = ce_0_lock;
435 CE_MIGR_SEND_LOCKS[idx] = ce_0_lock;
436 CE_MIGR_RECV_LOCKS[idx] = ce_0_lock;
437 }
438 else
439 {
440 assert(NUM_COPY_ENGINES == 2);
441
442 if (RESERVED_MIGR_COPY_ENGINE) {
443 // copy engine dedicated to migration operations
444 CE_SEND_LOCKS[idx] = ce_0_lock;
445 CE_RECV_LOCKS[idx] = ce_0_lock;
446 CE_MIGR_SEND_LOCKS[idx] = ce_1_lock;
447 CE_MIGR_RECV_LOCKS[idx] = ce_1_lock;
448 }
449 else {
450 // migration transmissions treated as regular data
451 CE_SEND_LOCKS[idx] = ce_0_lock;
452 CE_RECV_LOCKS[idx] = ce_1_lock;
453 CE_MIGR_SEND_LOCKS[idx] = ce_0_lock;
454 CE_MIGR_RECV_LOCKS[idx] = ce_1_lock;
455 }
456 }
457 }
458 }
459}
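The name arithmetic above, worked through for a hypothetical GPU_PARTITION = 1 with GPU_PARTITION_SIZE = 2: base_name = 1000, so the token lock is opened under name 1000; GPU i = 0 gets ee_name = 1010, ce_0_name = 1011, ce_1_name = 1012, and i = 1 gets 1020, 1021, and 1022. Because each partition's names are offset by a multiple of 1000, partitions sharing the ./.gpuspin-locks namespace file cannot collide (for partition sizes under 100 GPUs).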
460
461
462
463
464class gpu_pool
465{
466public:
467 gpu_pool(int pSz): poolSize(pSz)
468 {
469 memset(&pool[0], 0, sizeof(pool[0])*poolSize);
470 }
471
472 int get(pthread_mutex_t* tex, int preference = -1)
473 {
474 int which = -1;
475 int last = (preference >= 0) ? preference : 0;
476 int minIdx = last;
477
478 pthread_mutex_lock(tex);
479
480 int min = pool[last];
481 for(int i = (minIdx+1)%poolSize; i != last; i = (i+1)%poolSize)
482 {
483 if(min > pool[i]) {
484 min = pool[i]; minIdx = i; }
485 }
486 ++pool[minIdx];
487
488 pthread_mutex_unlock(tex);
489
490 which = minIdx;
491
492 return which;
493 }
494
495 void put(pthread_mutex_t* tex, int which)
496 {
497 pthread_mutex_lock(tex);
498 --pool[which];
499 pthread_mutex_unlock(tex);
500 }
501
502private:
503 int poolSize;
504 int pool[NR_GPUS]; // >= gpu_part_size
505};
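A minimal sketch of the intended gpu_pool protocol (the local pool/mutex pair below is hypothetical; gpu_loop_for_linux() uses the shared-memory instances created in allocate_locks_linux()):

	gpu_pool pool(GPU_PARTITION_SIZE);             // one usage counter per GPU in the partition
	pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;

	int local = pool.get(&m, 0);                   // least-used GPU index, preferring 0 on ties
	int device = GPU_PARTITION*GPU_PARTITION_SIZE + local;
	/* ... run the GPU critical section on 'device' ... */
	pool.put(&m, local);                           // drop the usage count again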
506
507static gpu_pool* GPU_LINUX_SEM_POOL = NULL;
508static pthread_mutex_t* GPU_LINUX_MUTEX_POOL = NULL;
509
510static void allocate_locks_linux(int num_gpu_users)
511{
512 managed_shared_memory *segment_pool_ptr = NULL;
513 managed_shared_memory *segment_mutex_ptr = NULL;
514
515 int numGpuPartitions = NR_GPUS/GPU_PARTITION_SIZE;
516
517 if(num_gpu_users != 0)
518 {
519 printf("%d creating shared memory for linux semaphores; num pools = %d, pool size = %d\n", getpid(), numGpuPartitions, GPU_PARTITION_SIZE);
520 shared_memory_object::remove("linux_mutex_memory");
521 shared_memory_object::remove("linux_sem_memory");
522
523 segment_mutex_ptr = new managed_shared_memory(create_only, "linux_mutex_memory", 4*1024);
524 GPU_LINUX_MUTEX_POOL = segment_mutex_ptr->construct<pthread_mutex_t>("pthread_mutex_t linux_m")[numGpuPartitions]();
525 for(int i = 0; i < numGpuPartitions; ++i)
526 {
527 pthread_mutexattr_t attr;
528 pthread_mutexattr_init(&attr);
529 pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
530 pthread_mutex_init(&(GPU_LINUX_MUTEX_POOL[i]), &attr);
531 pthread_mutexattr_destroy(&attr);
532 }
533
534 segment_pool_ptr = new managed_shared_memory(create_only, "linux_sem_memory", 4*1024);
535 GPU_LINUX_SEM_POOL = segment_pool_ptr->construct<gpu_pool>("gpu_pool linux_p")[numGpuPartitions](GPU_PARTITION_SIZE);
536 }
537 else
538 {
539 do
540 {
541 try
542 {
543 if (!segment_pool_ptr) segment_pool_ptr = new managed_shared_memory(open_only, "linux_sem_memory");
544 }
545 catch(...)
546 {
547 sleep(1);
548 }
549 }while(segment_pool_ptr == NULL);
550
551 do
552 {
553 try
554 {
555 if (!segment_mutex_ptr) segment_mutex_ptr = new managed_shared_memory(open_only, "linux_mutex_memory");
556 }
557 catch(...)
558 {
559 sleep(1);
560 }
561 }while(segment_mutex_ptr == NULL);
562
563 GPU_LINUX_SEM_POOL = segment_pool_ptr->find<gpu_pool>("gpu_pool linux_p").first;
564 GPU_LINUX_MUTEX_POOL = segment_mutex_ptr->find<pthread_mutex_t>("pthread_mutex_t linux_m").first;
565 }
566}
567
568
569
570
571static void allocate_locks(int num_gpu_users, bool linux_mode)
572{
573 if(!linux_mode)
574 allocate_locks_litmus();
575 else
576 allocate_locks_linux(num_gpu_users);
577}
578
579static void set_cur_gpu(int gpu)
580{
581 if (TRACE_MIGRATIONS) {
582 trace_migration(gpu, CUR_DEVICE);
583 }
584 if(gpu != CUR_DEVICE) {
585 cudaSetDevice(gpu);
586 CUR_DEVICE = gpu;
587 }
588}
589
590
591static pthread_barrier_t *gpu_barrier = NULL;
592static interprocess_mutex *gpu_mgmt_mutexes = NULL;
593static managed_shared_memory *segment_ptr = NULL;
594
595void coordinate_gpu_tasks(int num_gpu_users)
596{
597 if(num_gpu_users != 0)
598 {
599 printf("%d creating shared memory\n", getpid());
600 shared_memory_object::remove("gpu_barrier_memory");
601 segment_ptr = new managed_shared_memory(create_only, "gpu_barrier_memory", 4*1024);
602
603 printf("%d creating a barrier for %d users\n", getpid(), num_gpu_users);
604 gpu_barrier = segment_ptr->construct<pthread_barrier_t>("pthread_barrier_t gpu_barrier")();
605 pthread_barrierattr_t battr;
606 pthread_barrierattr_init(&battr);
607 pthread_barrierattr_setpshared(&battr, PTHREAD_PROCESS_SHARED);
608 pthread_barrier_init(gpu_barrier, &battr, num_gpu_users);
609 pthread_barrierattr_destroy(&battr);
610 printf("%d creating gpu mgmt mutexes for %d devices\n", getpid(), NR_GPUS);
611 gpu_mgmt_mutexes = segment_ptr->construct<interprocess_mutex>("interprocess_mutex m")[NR_GPUS]();
612 }
613 else
614 {
615 do
616 {
617 try
618 {
619 segment_ptr = new managed_shared_memory(open_only, "gpu_barrier_memory");
620 }
621 catch(...)
622 {
623 sleep(1);
624 }
625 }while(segment_ptr == NULL);
626
627 gpu_barrier = segment_ptr->find<pthread_barrier_t>("pthread_barrier_t gpu_barrier").first;
628 gpu_mgmt_mutexes = segment_ptr->find<interprocess_mutex>("interprocess_mutex m").first;
629 }
630}
631
632typedef float spindata_t;
633
634char *d_send_data[NR_GPUS] = {0};
635char *d_recv_data[NR_GPUS] = {0};
636char *d_state_data[NR_GPUS] = {0};
637spindata_t *d_spin_data[NR_GPUS] = {0};
638//unsigned int *d_iteration_count[NR_GPUS] = {0};
639
640
641bool p2pMigration[NR_GPUS][NR_GPUS] = {0};
642
643char *h_send_data = 0;
644char *h_recv_data = 0;
645char *h_state_data = 0;
646
647unsigned int *h_iteration_count[NR_GPUS] = {0};
648
649static void init_cuda(int num_gpu_users)
650{
651 const int PAGE_SIZE = 4*1024;
652 size_t send_alloc_bytes = SEND_SIZE + (SEND_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
653 size_t recv_alloc_bytes = RECV_SIZE + (RECV_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
654 size_t state_alloc_bytes = STATE_SIZE + (STATE_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
655
656 coordinate_gpu_tasks(num_gpu_users);
657
658 switch (CUDA_SYNC_MODE)
659 {
660 case BLOCKING:
661 cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
662 break;
663 case SPIN:
664 cudaSetDeviceFlags(cudaDeviceScheduleSpin);
665 break;
666 }
667
668 for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
669 {
670 cudaDeviceProp prop;
671 int which = GPU_PARTITION*GPU_PARTITION_SIZE + i;
672
673 gpu_mgmt_mutexes[which].lock();
674
675 set_cur_gpu(which);
676 cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0);
677 cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0);
678
679 cudaGetDeviceProperties(&prop, which);
680 GPU_HZ[which] = prop.clockRate * 1000; /* khz -> hz */
681 NUM_SM[which] = prop.multiProcessorCount;
682 WARP_SIZE[which] = prop.warpSize;
683
684 // enough to fill the L2 cache exactly.
685 ELEM_PER_THREAD[which] = (prop.l2CacheSize/(NUM_SM[which]*WARP_SIZE[which]*sizeof(spindata_t)));
686
687
688 if (!MIGRATE_VIA_SYSMEM && prop.unifiedAddressing)
689 {
690 for(int j = 0; j < GPU_PARTITION_SIZE; ++j)
691 {
692 if (i != j)
693 {
694 int canAccess = 0;
695 cudaDeviceCanAccessPeer(&canAccess, which, GPU_PARTITION*GPU_PARTITION_SIZE + j);
696 if(canAccess)
697 {
698 cudaDeviceEnablePeerAccess(GPU_PARTITION*GPU_PARTITION_SIZE + j, 0);
699 p2pMigration[which][GPU_PARTITION*GPU_PARTITION_SIZE + j] = true;
700 }
701 }
702 }
703 }
704
705 cudaStreamCreate(&STREAMS[CUR_DEVICE]);
706
707 cudaMalloc(&d_spin_data[which], prop.l2CacheSize);
708 cudaMemset(d_spin_data[which], 0, prop.l2CacheSize);
709// cudaMalloc(&d_iteration_count[which], NUM_SM[which]*WARP_SIZE[which]*sizeof(unsigned int));
710// cudaHostAlloc(&h_iteration_count[which], NUM_SM[which]*WARP_SIZE[which]*sizeof(unsigned int), cudaHostAllocPortable | cudaHostAllocMapped);
711
712 if (send_alloc_bytes) {
713 cudaMalloc(&d_send_data[which], send_alloc_bytes);
714 cudaHostAlloc(&h_send_data, send_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped);
715 }
716
717 if (recv_alloc_bytes) {
718 cudaMalloc(&d_recv_data[which], recv_alloc_bytes);
719 cudaHostAlloc(&h_recv_data, recv_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped);
720 }
721
722 if (state_alloc_bytes) {
723 cudaMalloc(&d_state_data[which], state_alloc_bytes);
724
725 if (MIGRATE_VIA_SYSMEM)
726 cudaHostAlloc(&h_state_data, state_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped | cudaHostAllocWriteCombined);
727 }
728
729 gpu_mgmt_mutexes[which].unlock();
730 }
731
732 // roll back to first GPU
733 set_cur_gpu(GPU_PARTITION*GPU_PARTITION_SIZE);
734}
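Two of the computations in init_cuda(), worked with illustrative numbers (the device parameters are examples, not measurements): a SEND_SIZE of 10240 bytes is not a multiple of the 4096-byte PAGE_SIZE, so send_alloc_bytes = 10240 + 4096 = 14336, i.e. one extra page is added whenever the requested size is not page-aligned. For ELEM_PER_THREAD, a device with a 1.5 MB L2 cache, 14 SMs, and 32-thread warps gives 1572864 / (14 * 32 * sizeof(spindata_t)) = 1572864 / 1792 = 877 elements per thread, so the spin kernel's working set approximately fills the L2 cache.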
735
736
737
738static bool MigrateToGPU_P2P(int from, int to)
739{
740 bool success = true;
741 set_cur_gpu(to);
742 chunkMemcpy(this_gpu(d_state_data), per_gpu(d_state_data, from),
743 STATE_SIZE, cudaMemcpyDeviceToDevice, to,
744 useEngineLocks(), from, true);
745 return success;
746}
747
748
749static bool PullState(void)
750{
751 bool success = true;
752 chunkMemcpy(h_state_data, this_gpu(d_state_data),
753 STATE_SIZE, cudaMemcpyDeviceToHost,
754 cur_gpu(), useEngineLocks(), -1, true);
755 return success;
756}
757
758static bool PushState(void)
759{
760 bool success = true;
761 chunkMemcpy(this_gpu(d_state_data), h_state_data,
762 STATE_SIZE, cudaMemcpyHostToDevice,
763 cur_gpu(), useEngineLocks(), -1, true);
764 return success;
765}
766
767static bool MigrateToGPU_SysMem(int from, int to)
768{
769 // THIS IS ON-DEMAND SYS_MEM MIGRATION. GPUSync says
770 // you should be using speculative migrations.
771 // Use PushState() and PullState().
772 assert(false); // for now
773
774 bool success = true;
775
776 set_cur_gpu(from);
777 chunkMemcpy(h_state_data, this_gpu(d_state_data),
778 STATE_SIZE, cudaMemcpyDeviceToHost,
779 from, useEngineLocks(), -1, true);
780
781 set_cur_gpu(to);
782 chunkMemcpy(this_gpu(d_state_data), h_state_data,
783 STATE_SIZE, cudaMemcpyHostToDevice,
784 to, useEngineLocks(), -1, true);
785
786 return success;
787}
788
789static bool MigrateToGPU(int from, int to)
790{
791 bool success = false;
792
793 if (from != to)
794 {
795 if(!MIGRATE_VIA_SYSMEM && p2pMigration[to][from])
796 success = MigrateToGPU_P2P(from, to);
797 else
798 success = MigrateToGPU_SysMem(from, to);
799 }
800 else
801 {
802 set_cur_gpu(to);
803 success = true;
804 }
805
806 return success;
807}
808
809static bool MigrateToGPU_Implicit(int to)
810{
811 return( MigrateToGPU(cur_gpu(), to) );
812}
813
814static void MigrateIfNeeded(int next_gpu)
815{
816 if(next_gpu != cur_gpu() && cur_gpu() != -1)
817 {
818 if (!MIGRATE_VIA_SYSMEM)
819 MigrateToGPU_Implicit(next_gpu);
820 else {
821 set_cur_gpu(next_gpu);
822 PushState();
823 }
824 }
825}
826
827
828
829static void exit_cuda()
830{
831 for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
832 {
833 int which = GPU_PARTITION*GPU_PARTITION_SIZE + i;
834 gpu_mgmt_mutexes[which].lock();
835 set_cur_gpu(which);
836 cudaDeviceReset();
837 gpu_mgmt_mutexes[which].unlock();
838 }
839}
840
841bool safetynet = false;
842
843static void catch_exit(int catch_exit)
844{
845 if(GPU_USING && USE_ENGINE_LOCKS && safetynet)
846 {
847 safetynet = false;
848 for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
849 {
850 int which = GPU_PARTITION*GPU_PARTITION_SIZE + i;
851 set_cur_gpu(which);
852
853// cudaDeviceReset();
854
855 // try to unlock everything. litmus will prevent bogus calls.
856 if(USE_ENGINE_LOCKS)
857 {
858 litmus_unlock(EE_LOCKS[which]);
859 litmus_unlock(CE_SEND_LOCKS[which]);
860 if (NUM_COPY_ENGINES == 2)
861 {
862 if (RESERVED_MIGR_COPY_ENGINE)
863 litmus_unlock(CE_MIGR_SEND_LOCKS[which]);
864 else
865 litmus_unlock(CE_MIGR_RECV_LOCKS[which]);
866 }
867 }
868 }
869 litmus_unlock(TOKEN_LOCK);
870 }
871}
872
873
874
875
876
877static float ms_sum;
878static int gpucount = 0;
879
880__global__ void docudaspin(float* data, /*unsigned int* iterations,*/ unsigned int num_elem, unsigned int cycles)
881{
882 long long int now = clock64();
883 long long unsigned int elapsed = 0;
884 long long int last;
885
886// unsigned int iter = 0;
887 unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
888 unsigned int j = 0;
889 bool toggle = true;
890
891// iterations[i] = 0;
892 do
893 {
894 data[i*num_elem+j] += (toggle) ? M_PI : -M_PI;
895 j = (j + 1 != num_elem) ? j + 1 : 0;
896 toggle = !toggle;
897// iter++;
898
899 last = now;
900 now = clock64();
901
902// // exact calculation takes more cycles than a second
903// // loop iteration when code is compiled optimized
904// long long int diff = now - last;
905// elapsed += (diff > 0) ?
906// diff :
907// now + ((~((long long int)0)<<1)>>1) - last;
908
909 // don't count iterations with clock roll-over
910 elapsed += max(0ll, now - last);
911 }while(elapsed < cycles);
912
913// iterations[i] = iter;
914
915 return;
916}
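The launch geometry and spin length chosen by the callers below can be checked with example numbers (hypothetical device, not a measurement): docudaspin is launched with cur_sms() blocks of cur_warp_size() threads, e.g. 14 blocks of 32 threads on a 14-SM device, so exactly one warp occupies each SM. With a 1.5 GHz shader clock (GPU_HZ = 1.5e9) and gpu_sec_time = 0.002 s, numcycles = 3,000,000; each thread then walks its slice of d_spin_data until clock64() has advanced by that many cycles, discarding negative deltas caused by counter roll-over.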
917
918static void gpu_loop_for(double gpu_sec_time, double emergency_exit)
919{
920 int next_gpu;
921
922 if (emergency_exit && wctime() > emergency_exit)
923 goto out;
924
925 next_gpu = litmus_lock(TOKEN_LOCK);
926 {
927 MigrateIfNeeded(next_gpu);
928
929 unsigned int numcycles = (unsigned int)(cur_hz() * gpu_sec_time);
930
931 if(SEND_SIZE > 0)
932 chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE,
933 cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks());
934
935 if(useEngineLocks()) litmus_lock(cur_ee());
936 /* one block per sm, one warp per block */
937 docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles);
938// docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], d_iteration_count[cur_gpu()], cur_elem_per_thread(), numcycles);
939 cudaStreamSynchronize(cur_stream());
940 if(useEngineLocks()) litmus_unlock(cur_ee());
941
942 if(RECV_SIZE > 0)
943 chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE,
944 cudaMemcpyDeviceToHost, CUR_DEVICE, useEngineLocks());
945
946 if (MIGRATE_VIA_SYSMEM)
947 PullState();
948 }
949 litmus_unlock(TOKEN_LOCK);
950
951 last_gpu() = cur_gpu();
952
953out:
954 return;
955}
956
957static void gpu_loop_for_linux(double gpu_sec_time, double emergency_exit)
958{
959 static int GPU_OFFSET = GPU_PARTITION * GPU_PARTITION_SIZE;
960 static gpu_pool *pool = &GPU_LINUX_SEM_POOL[GPU_PARTITION];
961 static pthread_mutex_t *mutex = &GPU_LINUX_MUTEX_POOL[GPU_PARTITION];
962
963 static bool once = false;
964 static cudaEvent_t start, end;
965 float ms;
966 if (!once)
967 {
968 once = true;
969 cudaEventCreate(&start);
970 cudaEventCreate(&end);
971 }
972
973 int next_gpu;
974
975 if (emergency_exit && wctime() > emergency_exit)
976 goto out;
977
978 next_gpu = pool->get(mutex, cur_gpu() - GPU_OFFSET) + GPU_OFFSET;
979 {
980 MigrateIfNeeded(next_gpu);
981
982 unsigned int numcycles = (unsigned int)(cur_hz() * gpu_sec_time);
983
984 if(SEND_SIZE > 0)
985 chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE,
986 cudaMemcpyHostToDevice, cur_gpu(), useEngineLocks());
987
988 /* one block per sm, one warp per block */
989 cudaEventRecord(start, cur_stream());
990 docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles);
991// docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], d_iteration_count[cur_gpu()], cur_elem_per_thread(), numcycles);
992 cudaEventRecord(end, cur_stream());
993 cudaEventSynchronize(end);
994 cudaStreamSynchronize(cur_stream());
995
996// chunkMemcpy(this_gpu(h_iteration_count), this_gpu(d_iteration_count), sizeof(unsigned int),
997// cudaMemcpyDeviceToHost, cur_gpu(), useEngineLocks());
998//
999 cudaEventElapsedTime(&ms, start, end);
1000 ms_sum += ms;
1001 ++gpucount;
1002// printf("%f\n", ms);
1003// printf("%f: %u\n", ms, this_gpu(h_iteration_count)[0]);
1004
1005
1006 if(RECV_SIZE > 0)
1007 chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE,
1008 cudaMemcpyDeviceToHost, cur_gpu(), useEngineLocks());
1009
1010 if (MIGRATE_VIA_SYSMEM)
1011 PullState();
1012 }
1013 pool->put(mutex, cur_gpu() - GPU_OFFSET);
1014
1015 last_gpu() = cur_gpu();
1016
1017out:
1018 return;
1019}
1020
1021
1022
1023
1024static void usage(char *error) {
1025 fprintf(stderr, "Error: %s\n", error);
1026 fprintf(stderr,
1027 "Usage:\n"
1028 " rt_spin [COMMON-OPTS] WCET PERIOD DURATION\n"
1029 " rt_spin [COMMON-OPTS] -f FILE [-o COLUMN] WCET PERIOD\n"
1030 " rt_spin -l\n"
1031 "\n"
1032 "COMMON-OPTS = [-w] [-s SCALE]\n"
1033 " [-p PARTITION/CLUSTER [-z CLUSTER SIZE]] [-c CLASS]\n"
1034 " [-X LOCKING-PROTOCOL] [-L CRITICAL SECTION LENGTH] [-Q RESOURCE-ID]"
1035 "\n"
1036 "WCET and PERIOD are milliseconds, DURATION is seconds.\n"
1037 "CRITICAL SECTION LENGTH is in milliseconds.\n");
1038 exit(EXIT_FAILURE);
1039}
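The usage text above is carried over from rtspin and does not list the GPU options parsed in main() below. Two illustrative invocations (binary name, option values, and task parameters are all hypothetical): a CPU-only task with a 10 ms WCET, 100 ms period, and 30 s duration on cluster 0 of size 6,

	gpuspin -p 0 -z 6 10 100 30

and a GPU-using task in GPU partition 0 of size 2 (rho = 2, two copy engines with FIFO engine locks, 4 GPU tasks total) with a 10 ms CPU WCET, 5 ms GPU WCET, 100 ms period, and 30 s duration,

	gpuspin -p 0 -z 6 -g 0 -y 2 -r 2 -C 2 -E 0 -m 4 10 5 100 30

Passing -m both sets the number of GPU-using tasks and marks this process as the one that creates the shared-memory segments in coordinate_gpu_tasks() (and, in the Linux modes, allocate_locks_linux()); the other tasks omit -m and attach to the existing segments.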
1040
1041/*
1042 * returns the character that made processing stop, newline or EOF
1043 */
1044static int skip_to_next_line(FILE *fstream)
1045{
1046 int ch;
1047 for (ch = fgetc(fstream); ch != EOF && ch != '\n'; ch = fgetc(fstream));
1048 return ch;
1049}
1050
1051static void skip_comments(FILE *fstream)
1052{
1053 int ch;
1054 for (ch = fgetc(fstream); ch == '#'; ch = fgetc(fstream))
1055 skip_to_next_line(fstream);
1056 ungetc(ch, fstream);
1057}
1058
1059static void get_exec_times(const char *file, const int column,
1060 int *num_jobs, double **exec_times)
1061{
1062 FILE *fstream;
1063 int cur_job, cur_col, ch;
1064 *num_jobs = 0;
1065
1066 fstream = fopen(file, "r");
1067 if (!fstream)
1068 bail_out("could not open execution time file");
1069
1070 /* figure out the number of jobs */
1071 do {
1072 skip_comments(fstream);
1073 ch = skip_to_next_line(fstream);
1074 if (ch != EOF)
1075 ++(*num_jobs);
1076 } while (ch != EOF);
1077
1078 if (-1 == fseek(fstream, 0L, SEEK_SET))
1079 bail_out("rewinding file failed");
1080
1081 /* allocate space for exec times */
1082 *exec_times = (double*)calloc(*num_jobs, sizeof(**exec_times));
1083 if (!*exec_times)
1084 bail_out("couldn't allocate memory");
1085
1086 for (cur_job = 0; cur_job < *num_jobs && !feof(fstream); ++cur_job) {
1087
1088 skip_comments(fstream);
1089
1090 for (cur_col = 1; cur_col < column; ++cur_col) {
1091 /* discard input until we get to the column we want */
1092 int unused __attribute__ ((unused)) = fscanf(fstream, "%*s,");
1093 }
1094
1095 /* get the desired exec. time */
1096 if (1 != fscanf(fstream, "%lf", (*exec_times)+cur_job)) {
1097 fprintf(stderr, "invalid execution time near line %d\n",
1098 cur_job);
1099 exit(EXIT_FAILURE);
1100 }
1101
1102 skip_to_next_line(fstream);
1103 }
1104
1105 assert(cur_job == *num_jobs);
1106 fclose(fstream);
1107}
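get_exec_times() reads one job per line, skips '#' comment lines, and takes the value from the requested column; a hypothetical input for column 1, with times in milliseconds (main() converts them with * 0.001), would be:

	# per-job execution times (ms)
	12.5
	9.75
	14.0

Note that the -f option handler in main() is commented out in this version, so this path is not reachable as committed.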
1108
1109#define NUMS 4096
1110static int num[NUMS];
1111__attribute__((unused)) static char* progname;
1112
1113static int loop_once(void)
1114{
1115 int i, j = 0;
1116 for (i = 0; i < NUMS; i++)
1117 j += num[i]++;
1118 return j;
1119}
1120
1121static int loop_for(double exec_time, double emergency_exit)
1122{
1123 double last_loop = 0, loop_start;
1124 int tmp = 0;
1125
1126 double start = cputime();
1127 double now = cputime();
1128
1129 if (emergency_exit && wctime() > emergency_exit)
1130 goto out;
1131
1132 while (now + last_loop < start + exec_time) {
1133 loop_start = now;
1134 tmp += loop_once();
1135 now = cputime();
1136 last_loop = now - loop_start;
1137 if (emergency_exit && wctime() > emergency_exit) {
1138 /* Oops --- this should only be possible if the execution time tracking
1139 * is broken in the LITMUS^RT kernel. */
1140 fprintf(stderr, "!!! gpuspin/%d emergency exit!\n", getpid());
1141 fprintf(stderr, "Something is seriously wrong! Do not ignore this.\n");
1142 break;
1143 }
1144 }
1145
1146out:
1147 return tmp;
1148}
1149
1150
1151static void debug_delay_loop(void)
1152{
1153 double start, end, delay;
1154
1155 while (1) {
1156 for (delay = 0.5; delay > 0.01; delay -= 0.01) {
1157 start = wctime();
1158 loop_for(delay, 0);
1159 end = wctime();
1160 printf("%6.4fs: looped for %10.8fs, delta=%11.8fs, error=%7.4f%%\n",
1161 delay,
1162 end - start,
1163 end - start - delay,
1164 100 * (end - start - delay) / delay);
1165 }
1166 }
1167}
1168
1169static int gpu_job(double exec_time, double gpu_exec_time, double program_end)
1170{
1171 double chunk1, chunk2;
1172
1173 if (wctime() > program_end) {
1174 return 0;
1175 }
1176 else {
1177 chunk1 = exec_time * drand48();
1178 chunk2 = exec_time - chunk1;
1179
1180 loop_for(chunk1, program_end + 1);
1181 gpu_loop_for(gpu_exec_time, program_end + 1);
1182 loop_for(chunk2, program_end + 1);
1183
1184 sleep_next_period();
1185 }
1186 return 1;
1187}
1188
1189static int job(double exec_time, double program_end)
1190{
1191 if (wctime() > program_end) {
1192 return 0;
1193 }
1194 else {
1195 loop_for(exec_time, program_end + 1);
1196 sleep_next_period();
1197 }
1198 return 1;
1199}
1200
1201/*****************************/
1202/* only used for linux modes */
1203
1204static struct timespec periodTime;
1205static struct timespec releaseTime;
1206static unsigned int job_no = 0;
1207
1208static lt_t period_ns;
1209
1210static void log_release()
1211{
1212 __attribute__ ((unused)) lt_t rel = releaseTime.tv_sec * s2ns(1) + releaseTime.tv_nsec;
1213 __attribute__ ((unused)) lt_t dead = rel + period_ns;
1214 trace_release(rel, dead, job_no);
1215}
1216
1217static void log_completion()
1218{
1219 trace_completion(job_no);
1220 ++job_no;
1221}
1222
1223static void setup_next_period_linux(struct timespec* spec, struct timespec* period)
1224{
1225 spec->tv_sec += period->tv_sec;
1226 spec->tv_nsec += period->tv_nsec;
1227 if (spec->tv_nsec >= s2ns(1)) {
1228 ++(spec->tv_sec);
1229 spec->tv_nsec -= s2ns(1);
1230 }
1231}
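A quick check of the carry logic above: a release time of tv_sec = 10, tv_nsec = 900,000,000 advanced by a 250 ms period (tv_sec = 0, tv_nsec = 250,000,000) first gives tv_nsec = 1,150,000,000, which is >= s2ns(1), so the normalization leaves tv_sec = 11 and tv_nsec = 150,000,000.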
1232
1233static void sleep_next_period_linux()
1234{
1235 log_completion();
1236 setup_next_period_linux(&releaseTime, &periodTime);
1237 clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &releaseTime, NULL);
1238 log_release();
1239}
1240
1241static void init_linux()
1242{
1243 mlockall(MCL_CURRENT | MCL_FUTURE);
1244}
1245
1246static int gpu_job_linux(double exec_time, double gpu_exec_time, double program_end)
1247{
1248 double chunk1, chunk2;
1249
1250 if (wctime() > program_end) {
1251 return 0;
1252 }
1253 else {
1254 chunk1 = exec_time * drand48();
1255 chunk2 = exec_time - chunk1;
1256
1257 loop_for(chunk1, program_end + 1);
1258 gpu_loop_for_linux(gpu_exec_time, program_end + 1);
1259 loop_for(chunk2, program_end + 1);
1260
1261 sleep_next_period_linux();
1262 }
1263 return 1;
1264}
1265
1266static int job_linux(double exec_time, double program_end)
1267{
1268 if (wctime() > program_end) {
1269 return 0;
1270 }
1271 else {
1272 loop_for(exec_time, program_end + 1);
1273 sleep_next_period_linux();
1274 }
1275 return 1;
1276}
1277
1278/*****************************/
1279
1280enum eScheduler
1281{
1282 LITMUS,
1283 LINUX,
1284 RT_LINUX
1285};
1286
1287#define CPU_OPTIONS "p:z:c:wlveio:f:s:q:X:L:Q:"
1288#define GPU_OPTIONS "g:y:r:C:E:dG:xS:R:T:Z:aFm:b:MNI"
1289
1290// concat the option strings
1291#define OPTSTR CPU_OPTIONS GPU_OPTIONS
1292
1293int main(int argc, char** argv)
1294{
1295 int ret;
1296 lt_t wcet;
1297 lt_t period;
1298 double wcet_ms = -1, gpu_wcet_ms = -1, period_ms = -1;
1299 unsigned int priority = LITMUS_LOWEST_PRIORITY;
1300 int migrate = 0;
1301 int cluster = 0;
1302 int cluster_size = 1;
1303 int opt;
1304 int wait = 0;
1305 int test_loop = 0;
1306 int column = 1;
1307 const char *file = NULL;
1308 int want_enforcement = 0;
1309 int want_signals = 0;
1310 double duration = 0, start = 0;
1311 double *exec_times = NULL;
1312 double scale = 1.0;
1313 task_class_t cls = RT_CLASS_HARD;
1314 int cur_job = 0, num_jobs = 0;
1315 struct rt_task param;
1316
1317 double budget_ms = -1.0;
1318 lt_t budget;
1319
1320 int num_gpu_users = 0;
1321
1322
1323 eScheduler scheduler = LITMUS;
1324
1325 /* locking */
1326// int lock_od = -1;
1327// int resource_id = 0;
1328// int protocol = -1;
1329// double cs_length = 1; /* millisecond */
1330
1331 progname = argv[0];
1332
1333 while ((opt = getopt(argc, argv, OPTSTR)) != -1) {
1334 switch (opt) {
1335 case 'w':
1336 wait = 1;
1337 break;
1338 case 'p':
1339 cluster = atoi(optarg);
1340 migrate = 1;
1341 break;
1342 case 'z':
1343 cluster_size = atoi(optarg);
1344 CPU_PARTITION_SIZE = cluster_size;
1345 break;
1346 case 'g':
1347 GPU_USING = true;
1348 GPU_PARTITION = atoi(optarg);
1349 assert(GPU_PARTITION >= 0 && GPU_PARTITION < NR_GPUS);
1350 break;
1351 case 'y':
1352 GPU_PARTITION_SIZE = atoi(optarg);
1353 assert(GPU_PARTITION_SIZE > 0);
1354 break;
1355 case 'r':
1356 RHO = atoi(optarg);
1357 assert(RHO > 0);
1358 break;
1359 case 'C':
1360 NUM_COPY_ENGINES = atoi(optarg);
1361 assert(NUM_COPY_ENGINES == 1 || NUM_COPY_ENGINES == 2);
1362 break;
1363 case 'E':
1364 USE_ENGINE_LOCKS = true;
1365 ENGINE_LOCK_TYPE = (eEngineLockTypes)atoi(optarg);
1366 assert(ENGINE_LOCK_TYPE == FIFO || ENGINE_LOCK_TYPE == PRIOQ);
1367 break;
1368 case 'd':
1369 USE_DYNAMIC_GROUP_LOCKS = true;
1370 break;
1371 case 'G':
1372 GPU_SYNC_MODE = (eGpuSyncMode)atoi(optarg);
1373 assert(GPU_SYNC_MODE >= IKGLP_MODE && GPU_SYNC_MODE <= RGEM_MODE);
1374 break;
1375 case 'a':
1376 ENABLE_AFFINITY = true;
1377 break;
1378 case 'F':
1379 RELAX_FIFO_MAX_LEN = true;
1380 break;
1381 case 'x':
1382 CUDA_SYNC_MODE = SPIN;
1383 break;
1384 case 'S':
1385 SEND_SIZE = kbToB((size_t)atoi(optarg));
1386 break;
1387 case 'R':
1388 RECV_SIZE = kbToB((size_t)atoi(optarg));
1389 break;
1390 case 'T':
1391 STATE_SIZE = kbToB((size_t)atoi(optarg));
1392 break;
1393 case 'Z':
1394 ENABLE_CHUNKING = true;
1395 CHUNK_SIZE = kbToB((size_t)atoi(optarg));
1396 break;
1397 case 'M':
1398 MIGRATE_VIA_SYSMEM = true;
1399 break;
1400 case 'm':
1401 num_gpu_users = atoi(optarg);
1402 assert(num_gpu_users > 0);
1403 break;
1404 case 'b':
1405 budget_ms = atof(optarg);
1406 break;
1407 case 'N':
1408 scheduler = LINUX;
1409 break;
1410 case 'I':
1411 scheduler = RT_LINUX;
1412 break;
1413 case 'q':
1414 priority = atoi(optarg);
1415 break;
1416 case 'c':
1417 cls = str2class(optarg);
1418 if (cls == -1)
1419 usage("Unknown task class.");
1420 break;
1421 case 'e':
1422 want_enforcement = 1;
1423 break;
1424 case 'i':
1425 want_signals = 1;
1426 break;
1427 case 'l':
1428 test_loop = 1;
1429 break;
1430 case 'o':
1431 column = atoi(optarg);
1432 break;
1433// case 'f':
1434// file = optarg;
1435// break;
1436 case 's':
1437 scale = atof(optarg);
1438 break;
1439// case 'X':
1440// protocol = lock_protocol_for_name(optarg);
1441// if (protocol < 0)
1442// usage("Unknown locking protocol specified.");
1443// break;
1444// case 'L':
1445// cs_length = atof(optarg);
1446// if (cs_length <= 0)
1447// usage("Invalid critical section length.");
1448// break;
1449// case 'Q':
1450// resource_id = atoi(optarg);
1451// if (resource_id <= 0 && strcmp(optarg, "0"))
1452// usage("Invalid resource ID.");
1453// break;
1454 case ':':
1455 usage("Argument missing.");
1456 break;
1457 case '?':
1458 default:
1459 usage("Bad argument.");
1460 break;
1461 }
1462 }
1463
1464#ifdef VANILLA_LINUX
1465 assert(scheduler != LITMUS);
1466 assert(!wait);
1467#endif
1468
1469 // turn off some features to be safe
1470 if (scheduler != LITMUS)
1471 {
1472 RHO = 0;
1473 USE_ENGINE_LOCKS = false;
1474 USE_DYNAMIC_GROUP_LOCKS = false;
1475 ENABLE_AFFINITY = false;
1476 RELAX_FIFO_MAX_LEN = false;
1477 ENABLE_RT_AUX_THREADS = false;
1478 budget_ms = -1;
1479 want_enforcement = 0;
1480 want_signals = 0;
1481
1482 if (scheduler == RT_LINUX)
1483 {
1484 struct sched_param fifoparams;
1485
1486 assert(priority >= sched_get_priority_min(SCHED_FIFO) &&
1487 priority <= sched_get_priority_max(SCHED_FIFO));
1488
1489 memset(&fifoparams, 0, sizeof(fifoparams));
1490 fifoparams.sched_priority = priority;
1491 if (0 != sched_setscheduler(getpid(), SCHED_FIFO, &fifoparams)) bail_out("sched_setscheduler()");
1492 }
1493 }
1494 else
1495 {
1496 if (!litmus_is_valid_fixed_prio(priority))
1497 usage("Invalid priority.");
1498 }
1499
1500 if (test_loop) {
1501 debug_delay_loop();
1502 return 0;
1503 }
1504
1505 srand(getpid());
1506
1507 if (file) {
1508 get_exec_times(file, column, &num_jobs, &exec_times);
1509
1510 if (argc - optind < 2)
1511 usage("Arguments missing.");
1512
1513 for (cur_job = 0; cur_job < num_jobs; ++cur_job) {
1514 /* convert the execution time to seconds */
1515 duration += exec_times[cur_job] * 0.001;
1516 }
1517 } else {
1518 /*
1519 * if we're not reading from the CSV file, then we need
1520 * three parameters
1521 */
1522 if (argc - optind < 3)
1523 usage("Arguments missing.");
1524 }
1525
1526 if (argc - optind == 3) {
1527 assert(!GPU_USING);
1528 wcet_ms = atof(argv[optind + 0]);
1529 period_ms = atof(argv[optind + 1]);
1530 duration = atof(argv[optind + 2]);
1531 }
1532 else if (argc - optind == 4) {
1533 assert(GPU_USING);
1534 wcet_ms = atof(argv[optind + 0]);
1535 gpu_wcet_ms = atof(argv[optind + 1]);
1536 period_ms = atof(argv[optind + 2]);
1537 duration = atof(argv[optind + 3]);
1538 }
1539
1540 wcet = ms2ns(wcet_ms);
1541 period = ms2ns(period_ms);
1542 if (wcet <= 0)
1543 usage("The worst-case execution time must be a "
1544 "positive number.");
1545 if (period <= 0)
1546 usage("The period must be a positive number.");
1547 if (!file && wcet > period) {
1548 usage("The worst-case execution time must not "
1549 "exceed the period.");
1550 }
1551 if (GPU_USING && gpu_wcet_ms <= 0)
1552 usage("The worst-case gpu execution time must be a positive number.");
1553
1554 if (budget_ms > 0)
1555 budget = ms2ns(budget_ms);
1556 else
1557 budget = wcet;
1558
1559 if (file && num_jobs > 1)
1560 duration += period_ms * 0.001 * (num_jobs - 1);
1561
1562 if (migrate) {
1563 ret = be_migrate_to_cluster(cluster, cluster_size);
1564 if (ret < 0)
1565 bail_out("could not migrate to target partition or cluster.");
1566 }
1567
1568 if (scheduler != LITMUS)
1569 {
1570 // set some variables needed by linux modes
1571 if (GPU_USING)
1572 {
1573 TRACE_MIGRATIONS = true;
1574 }
1575 periodTime.tv_sec = period / s2ns(1);
1576 periodTime.tv_nsec = period - periodTime.tv_sec * s2ns(1);
1577 period_ns = period;
1578 }
1579
1580 init_rt_task_param(&param);
1581 param.exec_cost = budget;
1582 param.period = period;
1583 param.priority = priority;
1584 param.cls = cls;
1585 param.budget_policy = (want_enforcement) ?
1586 PRECISE_ENFORCEMENT : NO_ENFORCEMENT;
1587 param.budget_signal_policy = (want_enforcement && want_signals) ?
1588 PRECISE_SIGNALS : NO_SIGNALS;
1589 param.release_policy = PERIODIC;
1590
1591 if (migrate)
1592 param.cpu = cluster_to_first_cpu(cluster, cluster_size);
1593 ret = set_rt_task_param(gettid(), &param);
1594 if (ret < 0)
1595 bail_out("could not setup rt task params");
1596
1597 if (scheduler == LITMUS)
1598 init_litmus();
1599 else
1600 init_linux();
1601
1602 if (want_signals) {
1603 /* bind default longjmp signal handler to SIG_BUDGET. */
1604 activate_litmus_signals(SIG_BUDGET_MASK, longjmp_on_litmus_signal);
1605 }
1606
1607 if (scheduler == LITMUS)
1608 {
1609 ret = task_mode(LITMUS_RT_TASK);
1610 if (ret != 0)
1611 bail_out("could not become RT task");
1612 }
1613 else
1614 {
1615 trace_name();
1616 trace_param();
1617 }
1618
1619// if (protocol >= 0) {
1620// /* open reference to semaphore */
1621// lock_od = litmus_open_lock(protocol, resource_id, lock_namespace, &cluster);
1622// if (lock_od < 0) {
1623// perror("litmus_open_lock");
1624// usage("Could not open lock.");
1625// }
1626// }
1627
1628 if (GPU_USING) {
1629 allocate_locks(num_gpu_users, scheduler != LITMUS);
1630
1631 signal(SIGABRT, catch_exit);
1632 signal(SIGTERM, catch_exit);
1633 signal(SIGQUIT, catch_exit);
1634 signal(SIGSEGV, catch_exit);
1635
1636 init_cuda(num_gpu_users);
1637 safetynet = true;
1638
1639 if (ENABLE_RT_AUX_THREADS)
1640 if (enable_aux_rt_tasks(AUX_CURRENT | AUX_FUTURE) != 0)
1641 bail_out("enable_aux_rt_tasks() failed");
1642 }
1643
1644 if (wait) {
1645 ret = wait_for_ts_release2(&releaseTime);
1646 if (ret != 0)
1647 bail_out("wait_for_ts_release2()");
1648
1649 if (scheduler != LITMUS)
1650 log_release();
1651 }
1652 else if (scheduler != LITMUS)
1653 {
1654 clock_gettime(CLOCK_MONOTONIC, &releaseTime);
1655 sleep_next_period_linux();
1656 }
1657
1658 start = wctime();
1659
1660 if (scheduler == LITMUS)
1661 {
1662 if (!GPU_USING) {
1663 while (job(wcet_ms * 0.001 * scale, start + duration));
1664 }
1665 else {
1666 while (gpu_job(wcet_ms * 0.001 * scale,
1667 gpu_wcet_ms * 0.001 * scale,
1668 start + duration));
1669 }
1670 }
1671 else
1672 {
1673 if (!GPU_USING) {
1674 while (job_linux(wcet_ms * 0.001 * scale, start + duration));
1675 }
1676 else {
1677 while (gpu_job_linux(wcet_ms * 0.001 * scale,
1678 gpu_wcet_ms * 0.001 * scale,
1679 start + duration));
1680 }
1681 }
1682
1683 if (GPU_USING && ENABLE_RT_AUX_THREADS)
1684 if (disable_aux_rt_tasks(AUX_CURRENT | AUX_FUTURE) != 0)
1685 bail_out("disable_aux_rt_tasks() failed");
1686
1687// if (file) {
1688// /* use times read from the CSV file */
1689// for (cur_job = 0; cur_job < num_jobs; ++cur_job) {
1690// /* convert job's length to seconds */
1691// job(exec_times[cur_job] * 0.001 * scale,
1692// start + duration,
1693// lock_od, cs_length * 0.001);
1694// }
1695// } else {
1696// /* convert to seconds and scale */
1697// while (job(wcet_ms * 0.001 * scale, start + duration,
1698// lock_od, cs_length * 0.001));
1699// }
1700
1701 if (scheduler == LITMUS)
1702 {
1703 ret = task_mode(BACKGROUND_TASK);
1704 if (ret != 0)
1705 bail_out("could not become regular task (huh?)");
1706 }
1707
1708 if (GPU_USING) {
1709 safetynet = false;
1710 exit_cuda();
1711
1712
1713 if (gpucount > 0) printf("avg: %f\n", ms_sum/gpucount);
1714 }
1715
1716 if (file)
1717 free(exec_times);
1718
1719 return 0;
1720}