author		Glenn Elliott <gelliott@cs.unc.edu>	2013-05-06 18:57:37 -0400
committer	Glenn Elliott <gelliott@cs.unc.edu>	2013-05-06 18:58:59 -0400
commit		95e840f68892d46289120d1042ee36f9eaf41de7
tree		1335167a07621094518c4389f60ef0f3ed77eea4
parent		0f89bddde73d448511004a60b98b8be042f6ffd6
Several new *important* features:

1) GPUSync daemon mode.
2) Engine yield logic.
3) Fixed chunking (memcpys spanning more than two chunks did not work).
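The engine yield logic is the heart of this patch: instead of unconditionally releasing an engine lock after every kernel, a task now holds the lock across back-to-back kernels and gives it up only when the new should-yield queries report a waiter (the liblitmus wrappers land in include/litmus.h and src/syscalls.c below). A minimal sketch of the pattern, distilled from the gpu_loop_for() changes in gpu/gpuspin.cu; do_engine_work() and wait_for_engine() are hypothetical stand-ins for the kernel launch and the cudaEventRecord()/cudaEventSynchronize() pair:

	/* Sketch only -- error handling omitted. */
	static void run_kernels_with_yield(int ee_lock, unsigned int num_kernels)
	{
		bool locked = false;
		for (unsigned int i = 0; i < num_kernels; ++i) {
			if (!locked) {
				litmus_lock(ee_lock);
				locked = true;
			}
			do_engine_work();	/* asynchronous work on the engine */
			/* nonzero return: someone else is waiting on this lock */
			if (litmus_should_yield_lock(ee_lock)) {
				wait_for_engine();	/* drain the engine before yielding */
				litmus_unlock(ee_lock);
				locked = false;
			}
		}
		if (locked) {	/* no one ever asked for the engine */
			wait_for_engine();
			litmus_unlock(ee_lock);
		}
	}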
-rw-r--r--  Makefile           |    8
-rw-r--r--  gpu/dgl.c          |    1
-rw-r--r--  gpu/gpuspin.cu     | 1907
-rw-r--r--  include/litmus.h   |    2
-rw-r--r--  src/kernel_iface.c |    2
-rw-r--r--  src/syscalls.c     |   10
6 files changed, 1301 insertions(+), 629 deletions(-)
diff --git a/Makefile b/Makefile
--- a/Makefile
+++ b/Makefile
@@ -25,12 +25,15 @@ NUMA_SUPPORT = dummyval
 
 # compiler flags
 flags-debug = -O2 -Wall -Werror -g -Wdeclaration-after-statement
+#flags-debug = -Wall -Werror -g -Wdeclaration-after-statement
 flags-debug-cpp = -O2 -Wall -Werror -g
+#flags-debug-cpp = -Wall -Werror -g
 flags-api = -D_XOPEN_SOURCE=600 -D_GNU_SOURCE
 flags-misc = -fasynchronous-unwind-tables -fnon-call-exceptions
 
 flags-cu-debug = -g -G -Xcompiler -Wall -Xcompiler -Werror
 flags-cu-optim = -O2 -Xcompiler -march=native
+#flags-cu-optim = -Xcompiler -march=native
 flags-cu-nvcc = --use_fast_math -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30
 flags-cu-misc = -Xcompiler -fasynchronous-unwind-tables -Xcompiler -fnon-call-exceptions -Xcompiler -malign-double -Xcompiler -pthread
 flags-cu-x86_64 = -m64
@@ -63,7 +66,6 @@ headers = -I${LIBLITMUS}/include -I${LIBLITMUS}/arch/${include-${ARCH}}/include
 # combine options
 CPPFLAGS = ${flags-api} ${flags-debug-cpp} ${flags-misc} ${flags-${ARCH}} -DARCH=${ARCH} ${headers}
 CUFLAGS = ${flags-api} ${flags-cu-debug} ${flags-cu-optim} ${flags-cu-nvcc} ${flags-cu-misc} -DARCH=${ARCH} ${headers}
-#CUFLAGS = ${flags-api} ${flags-cu-optim} ${flags-cu-nvcc} ${flags-cu-misc} -DARCH=${ARCH} ${headers}
 CFLAGS = ${flags-debug} ${flags-misc}
 LDFLAGS = ${flags-${ARCH}}
 
@@ -82,7 +84,7 @@ endif
 # how to link cuda
 cuda-flags-i386 = -L/usr/local/cuda/lib
 cuda-flags-x86_64 = -L/usr/local/cuda/lib64
-cuda-flags = ${cuda-flags-${ARCH}} -lcudart
+cuda-flags = ${cuda-flags-${ARCH}} -lcudart -lcuda
 
 # Force gcc instead of cc, but let the user specify a more specific version if
 # desired.
@@ -299,7 +301,7 @@ lib-budget = -lrt -lm -pthread
 vpath %.cu gpu/
 
 objcu-gpuspin = gpuspin.o common.o
-lib-gpuspin = -lblitz -lrt -lm -lpthread
+lib-gpuspin = -lblitz -lrt -lm -lpthread -lboost_filesystem -lboost_system
 
 # ##############################################################################
 # Build everything that depends on liblitmus.
diff --git a/gpu/dgl.c b/gpu/dgl.c
--- a/gpu/dgl.c
+++ b/gpu/dgl.c
@@ -229,6 +229,7 @@ void* rt_thread(void* _ctx)
 
 	do_exit = job(ctx);
 
+	fprintf(stdout, "[%d] should yield dgl: %d.\n", ctx->id, litmus_dgl_should_yield_lock(dgl, dgl_size));
 
 	xfprintf(stdout, "[%d] unlocking dgl.\n", ctx->id);
 	litmus_dgl_unlock(dgl, dgl_size);
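gpu/dgl.c exercises the dynamic-group-lock flavor of the same query before releasing the group. A short sketch of the assumed semantics (lock_a and lock_b are placeholder lock descriptors, not names from this diff):

	int dgl[2] = { lock_a, lock_b };
	litmus_dgl_lock(dgl, 2);
	/* ... critical section ... */
	/* nonzero if at least one group member has a waiter */
	if (litmus_dgl_should_yield_lock(dgl, 2))
		printf("should yield the group\n");
	litmus_dgl_unlock(dgl, 2);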
diff --git a/gpu/gpuspin.cu b/gpu/gpuspin.cu
index 21134f6..f361b86 100644
--- a/gpu/gpuspin.cu
+++ b/gpu/gpuspin.cu
@@ -10,9 +10,11 @@
 
 #include <boost/interprocess/managed_shared_memory.hpp>
 #include <boost/interprocess/sync/interprocess_mutex.hpp>
+#include <boost/filesystem.hpp>
 
 #include <random/normal.h>
 
+#include <cuda.h>
 #include <cuda_runtime.h>
 
 #include "litmus.h"
@@ -24,7 +26,21 @@ using namespace ranlib;
 
 #define ms2s(ms) ((ms)*0.001)
 
+bool SILENT = true;
+inline int xprintf(const char *format, ...)
+{
+	int ret = 0;
+	if (!SILENT) {
+		va_list args;
+		va_start(args, format);
+		ret = vprintf(format, args);
+		va_end(args);
+	}
+	return ret;
+}
+
 const char *lock_namespace = "./.gpuspin-locks";
+const size_t PAGE_SIZE = sysconf(_SC_PAGESIZE);
 
 const int NR_GPUS = 8;
 
@@ -34,6 +50,8 @@ bool RELAX_FIFO_MAX_LEN = false;
 bool ENABLE_CHUNKING = false;
 bool MIGRATE_VIA_SYSMEM = false;
 
+bool YIELD_LOCKS = false;
+
 enum eEngineLockTypes
 {
 	FIFO,
@@ -97,15 +115,82 @@ int CUR_DEVICE = -1;
 int LAST_DEVICE = -1;
 
 cudaStream_t STREAMS[NR_GPUS];
+cudaEvent_t EVENTS[NR_GPUS];
 int GPU_HZ[NR_GPUS];
 int NUM_SM[NR_GPUS];
 int WARP_SIZE[NR_GPUS];
 int ELEM_PER_THREAD[NR_GPUS];
 
+enum eScheduler
+{
+	LITMUS,
+	LINUX,
+	RT_LINUX
+};
+
+struct Args
+{
+	bool wait;
+	bool migrate;
+	int cluster;
+	int cluster_size;
+	bool gpu_using;
+	int gpu_partition;
+	int gpu_partition_size;
+	int rho;
+	int num_ce;
+	bool reserve_migr_ce;
+	bool use_engine_locks;
+	eEngineLockTypes engine_lock_type;
+	bool yield_locks;
+	bool use_dgls;
+	eGpuSyncMode gpusync_mode;
+	bool enable_affinity;
+	int relax_fifo_len;
+	eCudaSyncMode sync_mode;
+	size_t send_size;
+	size_t recv_size;
+	size_t state_size;
+	bool enable_chunking;
+	size_t chunk_size;
+	bool use_sysmem_migration;
+	int num_kernels;
+
+	double wcet_ms;
+	double gpu_wcet_ms;
+	double period_ms;
+
+	double budget_ms;
+
+	double stddev;
+
+	eScheduler scheduler;
+
+	unsigned int priority;
+
+	task_class_t cls;
+
+	bool want_enforcement;
+	bool want_signals;
+	budget_drain_policy_t drain_policy;
+
+	int column;
+
+	int num_gpu_tasks;
+	int num_tasks;
+
+	double scale;
+
+	double duration;
+};
+
+
+
 #define DEFINE_PER_GPU(type, var) type var[NR_GPUS]
 #define per_gpu(var, idx) (var[(idx)])
 #define this_gpu(var) (var[(CUR_DEVICE)])
 #define cur_stream() (this_gpu(STREAMS))
+#define cur_event() (this_gpu(EVENTS))
 #define cur_gpu() (CUR_DEVICE)
 #define last_gpu() (LAST_DEVICE)
 #define cur_ee() (EE_LOCKS[CUR_DEVICE])
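The new Args struct gathers every per-run parameter into a single value so that a long-lived process can be reconfigured between runs; this is what the "GPUSync daemon mode" of the commit message appears to build on (note the eRunMode { NORMAL, PROXY, DAEMON } enum added further down). The daemon dispatch loop itself is outside this excerpt; a plausible, purely illustrative shape using set_defaults(), apply_args(), and __do_normal() from later in this diff:

	/* Hypothetical daemon-side handler -- not part of this commit. */
	static int serve_one_request(struct Args *args)
	{
		apply_args(args);		/* copy the request's Args into the globals */
		return __do_normal(args);	/* run one task set with that configuration */
	}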
@@ -208,9 +293,10 @@ struct ce_lock_state
 	void lock() {
 		if(locks[0] == locks[1]) crash();
 
-		if(USE_DYNAMIC_GROUP_LOCKS) {
+		if (num_locks == 1)
+			litmus_lock(locks[0]);
+		else if(USE_DYNAMIC_GROUP_LOCKS)
 			litmus_dgl_lock(locks, num_locks);
-		}
 		else
 		{
 			for(int l = 0; l < num_locks; ++l)
@@ -224,9 +310,10 @@ struct ce_lock_state
 	void unlock() {
 		if(locks[0] == locks[1]) crash();
 
-		if(USE_DYNAMIC_GROUP_LOCKS) {
+		if (num_locks == 1)
+			litmus_unlock(locks[0]);
+		else if(USE_DYNAMIC_GROUP_LOCKS)
 			litmus_dgl_unlock(locks, num_locks);
-		}
 		else
 		{
 			// reverse order
@@ -238,6 +325,21 @@ struct ce_lock_state
 		locked = false;
 	}
 
+	bool should_yield() {
+		int yield = 1; // assume we should yield
+		if (YIELD_LOCKS) {
+			if(locks[0] == locks[1]) crash();
+			if (num_locks == 1)
+				yield = litmus_should_yield_lock(locks[0]);
+			else if(USE_DYNAMIC_GROUP_LOCKS)
+				yield = litmus_dgl_should_yield_lock(locks, num_locks);
+			else
+				for(int l = num_locks - 1; l >= 0; --l) // reverse order
+					yield = litmus_should_yield_lock(locks[l]);
+		}
+		return (yield);
+	}
+
 	void refresh() {
 		budget_remaining = CHUNK_SIZE;
 	}
@@ -271,34 +373,37 @@ static cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count,
 	{
 		int bytesToCopy = std::min(remaining, chunk_size);
 
-		if(state && state->budgetIsAvailable(bytesToCopy) && state->locked) {
-			cudaStreamSynchronize(STREAMS[CUR_DEVICE]);
-			ret = cudaGetLastError();
-
-			if(ret != cudaSuccess)
-			{
-				break;
-			}
-
-			state->unlock();
-			state->refresh(); // replentish.
-			// we can only run out of
-			// budget if chunking is enabled.
-			// we presume that init budget would
-			// be set to cover entire memcpy
-			// if chunking were disabled.
+		if (state && state->locked) {
+			// we have to unlock/re-lock the copy engine to refresh our budget unless
+			// we still have budget available.
+			if (!state->budgetIsAvailable(bytesToCopy)) {
+				// optimization - don't unlock if no one else needs the engine
+				if (state->should_yield()) {
+					//cudaStreamSynchronize(STREAMS[CUR_DEVICE]);
+					cudaEventSynchronize(EVENTS[CUR_DEVICE]);
+					ret = cudaGetLastError();
+					state->unlock();
+					if(ret != cudaSuccess)
+						break;
+				}
+				// we can only run out of
+				// budget if chunking is enabled.
+				// we presume that init budget would
+				// be set to cover entire memcpy
+				// if chunking were disabled.
+				state->refresh();
+			}
 		}
 
-		if(state && !state->locked) {
+		if(state && !state->locked)
 			state->lock();
-		}
 
 		//ret = cudaMemcpy(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind);
 		cudaMemcpyAsync(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind, STREAMS[CUR_DEVICE]);
+		cudaEventRecord(EVENTS[CUR_DEVICE], STREAMS[CUR_DEVICE]);
 
-		if(state) {
+		if(state)
 			state->decreaseBudget(bytesToCopy);
-		}
 
 		++i;
 		remaining -= bytesToCopy;
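With the restructured loop, the copy-engine lock is given up only when the per-chunk budget is exhausted *and* should_yield() reports a waiter; the old code's test fired while budget was still available and unconditionally unlocked and refreshed, which is consistent with the "fixed chunking" item in the commit message. A worked example of the budget arithmetic, assuming CHUNK_SIZE = 1 MiB and a 3.5 MiB copy:

	/* chunk 0: engine already locked by caller; budget = 1 MiB;
	 *          copy 1 MiB -> budget reaches 0
	 * chunk 1: budget exhausted -> sync the recorded event, unlock only
	 *          if a waiter exists, refresh budget to 1 MiB, re-lock if
	 *          needed, copy 1 MiB
	 * chunk 2: same as chunk 1
	 * chunk 3: same again; copies the final 0.5 MiB remainder
	 */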
@@ -316,7 +421,8 @@ static cudaError_t chunkMemcpy(void* a_dst, const void* a_src, size_t count,
 	cudaError_t ret;
 	if(!do_locking || device_a == -1) {
 		ret = __chunkMemcpy(a_dst, a_src, count, kind, NULL);
-		cudaStreamSynchronize(cur_stream());
+		cudaEventSynchronize(cur_event());
+//		cudaStreamSynchronize(cur_stream());
 		if(ret == cudaSuccess)
 			ret = cudaGetLastError();
 	}
@@ -324,7 +430,8 @@ static cudaError_t chunkMemcpy(void* a_dst, const void* a_src, size_t count,
 		ce_lock_state state(device_a, kind, count, device_b, migration);
 		state.lock();
 		ret = __chunkMemcpy(a_dst, a_src, count, kind, &state);
-		cudaStreamSynchronize(cur_stream());
+		cudaEventSynchronize(cur_event());
+//		cudaStreamSynchronize(cur_stream());
 		if(ret == cudaSuccess)
 			ret = cudaGetLastError();
 		state.unlock();
@@ -332,17 +439,26 @@ static cudaError_t chunkMemcpy(void* a_dst, const void* a_src, size_t count,
 	return ret;
 }
 
+int LITMUS_LOCK_FD = 0;
+
+int EXP_OFFSET = 0;
 
 void allocate_locks_litmus(void)
 {
+	stringstream ss;
+	ss<<lock_namespace<<"-"<<EXP_OFFSET;
+
 	// allocate k-FMLP lock
-	int fd = open(lock_namespace, O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
+	//LITMUS_LOCK_FD = open(lock_namespace, O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
+	LITMUS_LOCK_FD = open(ss.str().c_str(), O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
+	int *fd = &LITMUS_LOCK_FD;
 
-	int base_name = GPU_PARTITION * 1000;
+	int base_name = GPU_PARTITION * 100 + EXP_OFFSET * 200;
+	++EXP_OFFSET;
 
 	if (GPU_SYNC_MODE == IKGLP_MODE) {
 		/* Standard (optimal) IKGLP */
-		TOKEN_LOCK = open_gpusync_token_lock(fd,
+		TOKEN_LOCK = open_gpusync_token_lock(*fd,
 			base_name, /* name */
 			GPU_PARTITION_SIZE,
 			GPU_PARTITION*GPU_PARTITION_SIZE,
@@ -355,7 +471,7 @@ void allocate_locks_litmus(void)
 	}
 	else if (GPU_SYNC_MODE == KFMLP_MODE) {
 		/* KFMLP. FIFO queues only for tokens. */
-		TOKEN_LOCK = open_gpusync_token_lock(fd,
+		TOKEN_LOCK = open_gpusync_token_lock(*fd,
 			base_name, /* name */
 			GPU_PARTITION_SIZE,
 			GPU_PARTITION*GPU_PARTITION_SIZE,
@@ -366,7 +482,7 @@ void allocate_locks_litmus(void)
 	}
 	else if (GPU_SYNC_MODE == RGEM_MODE) {
 		/* RGEM-like token allocation. Shared priority queue for all tokens. */
-		TOKEN_LOCK = open_gpusync_token_lock(fd,
+		TOKEN_LOCK = open_gpusync_token_lock(*fd,
 			base_name, /* name */
 			GPU_PARTITION_SIZE,
 			GPU_PARTITION*GPU_PARTITION_SIZE,
@@ -380,7 +496,7 @@ void allocate_locks_litmus(void)
 		 * token requests. */
 		int max_simult_run = std::max(CPU_PARTITION_SIZE, RHO*GPU_PARTITION_SIZE);
 		int max_fifo_len = (int)ceil((float)max_simult_run / (RHO*GPU_PARTITION_SIZE));
-		TOKEN_LOCK = open_gpusync_token_lock(fd,
+		TOKEN_LOCK = open_gpusync_token_lock(*fd,
 			base_name, /* name */
 			GPU_PARTITION_SIZE,
 			GPU_PARTITION*GPU_PARTITION_SIZE,
@@ -416,17 +532,17 @@ void allocate_locks_litmus(void)
 		open_sem_t openEngineLock = (ENGINE_LOCK_TYPE == FIFO) ?
 						open_fifo_sem : open_prioq_sem;
 
-		ee_lock = openEngineLock(fd, ee_name);
+		ee_lock = openEngineLock(*fd, ee_name);
 		if (ee_lock < 0)
 			perror("open_*_sem (engine lock)");
 
-		ce_0_lock = openEngineLock(fd, ce_0_name);
+		ce_0_lock = openEngineLock(*fd, ce_0_name);
 		if (ce_0_lock < 0)
 			perror("open_*_sem (engine lock)");
 
 		if (NUM_COPY_ENGINES == 2)
 		{
-			ce_1_lock = openEngineLock(fd, ce_1_name);
+			ce_1_lock = openEngineLock(*fd, ce_1_name);
 			if (ce_1_lock < 0)
 				perror("open_*_sem (engine lock)");
 		}
@@ -464,7 +580,41 @@ void allocate_locks_litmus(void)
 	}
 }
 
+void deallocate_locks_litmus(void)
+{
+	for (int i = 0; i < GPU_PARTITION_SIZE; ++i)
+	{
+		int idx = GPU_PARTITION*GPU_PARTITION_SIZE + i;
+
+		od_close(EE_LOCKS[idx]);
+		if (NUM_COPY_ENGINES == 1)
+		{
+			od_close(CE_SEND_LOCKS[idx]);
+		}
+		else
+		{
+			if (RESERVED_MIGR_COPY_ENGINE) {
+				od_close(CE_SEND_LOCKS[idx]);
+				od_close(CE_MIGR_SEND_LOCKS[idx]);
+			}
+			else {
+				od_close(CE_SEND_LOCKS[idx]);
+				od_close(CE_RECV_LOCKS[idx]);
+			}
+		}
+	}
+
+	od_close(TOKEN_LOCK);
+
+	close(LITMUS_LOCK_FD);
+
+	memset(&CE_SEND_LOCKS[0], 0, sizeof(CE_SEND_LOCKS));
+	memset(&CE_RECV_LOCKS[0], 0, sizeof(CE_RECV_LOCKS));
+	memset(&CE_MIGR_SEND_LOCKS[0], 0, sizeof(CE_MIGR_SEND_LOCKS));
+	memset(&CE_MIGR_RECV_LOCKS[0], 0, sizeof(CE_MIGR_RECV_LOCKS));
+	TOKEN_LOCK = -1;
+	LITMUS_LOCK_FD = 0;
+}
 
 
 class gpu_pool
@@ -478,10 +628,9 @@
 	int get(pthread_mutex_t* tex, int preference = -1)
 	{
 		int which = -1;
-//		int last = (preference >= 0) ? preference : 0;
 		int last = (ENABLE_AFFINITY) ?
-			(preference >= 0) ? preference : 0 :
-			rand()%poolSize;
+			((preference >= 0) ? preference : 0) :
+			(rand()%poolSize);
 		int minIdx = last;
 
 		pthread_mutex_lock(tex);
@@ -513,24 +662,22 @@
 	int pool[NR_GPUS]; // >= gpu_part_size
 };
 
+
+static managed_shared_memory *linux_lock_segment_ptr = NULL;
 static gpu_pool* GPU_LINUX_SEM_POOL = NULL;
 static pthread_mutex_t* GPU_LINUX_MUTEX_POOL = NULL;
 
 static void allocate_locks_linux(const int num_gpu_users)
 {
-	managed_shared_memory *segment_pool_ptr = NULL;
-	managed_shared_memory *segment_mutex_ptr = NULL;
-
 	int numGpuPartitions = NR_GPUS/GPU_PARTITION_SIZE;
 
 	if(num_gpu_users > 0)
 	{
-		printf("%d creating shared memory for linux semaphores; num pools = %d, pool size = %d\n", getpid(), numGpuPartitions, GPU_PARTITION_SIZE);
-		shared_memory_object::remove("linux_mutex_memory");
-		shared_memory_object::remove("linux_sem_memory");
+		xprintf("%d: creating linux locks\n", getpid());
+		shared_memory_object::remove("linux_lock_memory");
 
-		segment_mutex_ptr = new managed_shared_memory(create_only, "linux_mutex_memory", 4*1024);
-		GPU_LINUX_MUTEX_POOL = segment_mutex_ptr->construct<pthread_mutex_t>("pthread_mutex_t linux_m")[numGpuPartitions]();
+		linux_lock_segment_ptr = new managed_shared_memory(create_only, "linux_lock_memory", 30*PAGE_SIZE);
+		GPU_LINUX_MUTEX_POOL = linux_lock_segment_ptr->construct<pthread_mutex_t>("pthread_mutex_t linux_m")[numGpuPartitions]();
 		for(int i = 0; i < numGpuPartitions; ++i)
 		{
 			pthread_mutexattr_t attr;
@@ -539,41 +686,41 @@ static void allocate_locks_linux(const int num_gpu_users)
 			pthread_mutex_init(&(GPU_LINUX_MUTEX_POOL[i]), &attr);
 			pthread_mutexattr_destroy(&attr);
 		}
-
-		segment_pool_ptr = new managed_shared_memory(create_only, "linux_sem_memory", 4*1024);
-		GPU_LINUX_SEM_POOL = segment_pool_ptr->construct<gpu_pool>("gpu_pool linux_p")[numGpuPartitions](GPU_PARTITION_SIZE);
+		GPU_LINUX_SEM_POOL = linux_lock_segment_ptr->construct<gpu_pool>("gpu_pool linux_p")[numGpuPartitions](GPU_PARTITION_SIZE);
 	}
 	else
 	{
+		sleep(5);
 		do
 		{
 			try
 			{
-				if (!segment_pool_ptr) segment_pool_ptr = new managed_shared_memory(open_only, "linux_sem_memory");
+				if (!linux_lock_segment_ptr)
+					linux_lock_segment_ptr = new managed_shared_memory(open_only, "linux_lock_memory");
 			}
 			catch(...)
 			{
 				sleep(1);
 			}
-		}while(segment_pool_ptr == NULL);
+		}while(linux_lock_segment_ptr == NULL);
 
-		do
-		{
-			try
-			{
-				if (!segment_mutex_ptr) segment_mutex_ptr = new managed_shared_memory(open_only, "linux_mutex_memory");
-			}
-			catch(...)
-			{
-				sleep(1);
-			}
-		}while(segment_mutex_ptr == NULL);
-
-		GPU_LINUX_SEM_POOL = segment_pool_ptr->find<gpu_pool>("gpu_pool linux_p").first;
-		GPU_LINUX_MUTEX_POOL = segment_mutex_ptr->find<pthread_mutex_t>("pthread_mutex_t linux_m").first;
+		GPU_LINUX_MUTEX_POOL = linux_lock_segment_ptr->find<pthread_mutex_t>("pthread_mutex_t linux_m").first;
+		GPU_LINUX_SEM_POOL = linux_lock_segment_ptr->find<gpu_pool>("gpu_pool linux_p").first;
 	}
 }
 
+static void deallocate_locks_linux(const int num_gpu_users)
+{
+	GPU_LINUX_MUTEX_POOL = NULL;
+	GPU_LINUX_SEM_POOL = NULL;
+
+	delete linux_lock_segment_ptr;
+	linux_lock_segment_ptr = NULL;
+
+	if(num_gpu_users > 0)
+		shared_memory_object::remove("linux_lock_memory");
+}
+
 
 
 
@@ -585,6 +732,14 @@ static void allocate_locks(const int num_gpu_users, bool linux_mode)
 		allocate_locks_linux(num_gpu_users);
 }
 
+static void deallocate_locks(const int num_gpu_users, bool linux_mode)
+{
+	if(!linux_mode)
+		deallocate_locks_litmus();
+	else
+		deallocate_locks_linux(num_gpu_users);
+}
+
 static void set_cur_gpu(int gpu)
 {
 	if (TRACE_MIGRATIONS) {
@@ -597,47 +752,52 @@ static void set_cur_gpu(int gpu)
 }
 
 
-static pthread_barrier_t *gpu_barrier = NULL;
+//static pthread_barrier_t *gpu_barrier = NULL;
 static interprocess_mutex *gpu_mgmt_mutexes = NULL;
-static managed_shared_memory *segment_ptr = NULL;
+static managed_shared_memory *gpu_mutex_segment_ptr = NULL;
 
 void coordinate_gpu_tasks(const int num_gpu_users)
 {
 	if(num_gpu_users > 0)
 	{
-		printf("%d creating shared memory\n", getpid());
-		shared_memory_object::remove("gpu_barrier_memory");
-		segment_ptr = new managed_shared_memory(create_only, "gpu_barrier_memory", 4*1024);
+		xprintf("%d creating shared memory\n", getpid());
+		shared_memory_object::remove("gpu_mutex_memory");
+		gpu_mutex_segment_ptr = new managed_shared_memory(create_only, "gpu_mutex_memory", PAGE_SIZE);
 
-		printf("%d creating a barrier for %d users\n", getpid(), num_gpu_users);
-		gpu_barrier = segment_ptr->construct<pthread_barrier_t>("pthread_barrier_t gpu_barrier")();
-		pthread_barrierattr_t battr;
-		pthread_barrierattr_init(&battr);
-		pthread_barrierattr_setpshared(&battr, PTHREAD_PROCESS_SHARED);
-		pthread_barrier_init(gpu_barrier, &battr, num_gpu_users);
-		pthread_barrierattr_destroy(&battr);
-		printf("%d creating gpu mgmt mutexes for %d devices\n", getpid(), NR_GPUS);
-		gpu_mgmt_mutexes = segment_ptr->construct<interprocess_mutex>("interprocess_mutex m")[NR_GPUS]();
+//		printf("%d creating a barrier for %d users\n", getpid(), num_gpu_users);
+//		gpu_barrier = segment_ptr->construct<pthread_barrier_t>("pthread_barrier_t gpu_barrier")();
+//		pthread_barrierattr_t battr;
+//		pthread_barrierattr_init(&battr);
+//		pthread_barrierattr_setpshared(&battr, PTHREAD_PROCESS_SHARED);
+//		pthread_barrier_init(gpu_barrier, &battr, num_gpu_users);
+//		pthread_barrierattr_destroy(&battr);
+//		printf("%d creating gpu mgmt mutexes for %d devices\n", getpid(), NR_GPUS);
+		gpu_mgmt_mutexes = gpu_mutex_segment_ptr->construct<interprocess_mutex>("interprocess_mutex m")[NR_GPUS]();
 	}
 	else
 	{
+		sleep(5);
 		do
 		{
 			try
 			{
-				segment_ptr = new managed_shared_memory(open_only, "gpu_barrier_memory");
+				gpu_mutex_segment_ptr = new managed_shared_memory(open_only, "gpu_mutex_memory");
 			}
 			catch(...)
 			{
 				sleep(1);
 			}
-		}while(segment_ptr == NULL);
+		}while(gpu_mutex_segment_ptr == NULL);
 
-		gpu_barrier = segment_ptr->find<pthread_barrier_t>("pthread_barrier_t gpu_barrier").first;
-		gpu_mgmt_mutexes = segment_ptr->find<interprocess_mutex>("interprocess_mutex m").first;
+//		gpu_barrier = segment_ptr->find<pthread_barrier_t>("pthread_barrier_t gpu_barrier").first;
+		gpu_mgmt_mutexes = gpu_mutex_segment_ptr->find<interprocess_mutex>("interprocess_mutex m").first;
 	}
 }
 
+const size_t SEND_ALLOC_SIZE = 12*1024;
+const size_t RECV_ALLOC_SIZE = 12*1024;
+const size_t STATE_ALLOC_SIZE = 16*1024;
+
 typedef float spindata_t;
 
 char *d_send_data[NR_GPUS] = {0};
@@ -653,18 +813,48 @@ char *h_send_data = 0;
 char *h_recv_data = 0;
 char *h_state_data = 0;
 
-unsigned int *h_iteration_count[NR_GPUS] = {0};
+static void destroy_events()
+{
+	for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
+	{
+		int which = GPU_PARTITION*GPU_PARTITION_SIZE + i;
+		gpu_mgmt_mutexes[which].lock();
+		set_cur_gpu(which);
+		cudaEventDestroy(EVENTS[which]);
+		gpu_mgmt_mutexes[which].unlock();
+	}
+}
+
+static void init_events()
+{
+	xprintf("creating %s events\n", (CUDA_SYNC_MODE == BLOCKING) ? "blocking" : "spinning");
+	for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
+	{
+		int which = GPU_PARTITION*GPU_PARTITION_SIZE + i;
+		gpu_mgmt_mutexes[which].lock();
+		set_cur_gpu(which);
+		if (CUDA_SYNC_MODE == BLOCKING)
+			cudaEventCreateWithFlags(&EVENTS[which], cudaEventBlockingSync | cudaEventDisableTiming);
+		else
+			cudaEventCreateWithFlags(&EVENTS[which], cudaEventDefault | cudaEventDisableTiming);
+		gpu_mgmt_mutexes[which].unlock();
+	}
+}
 
 static void init_cuda(const int num_gpu_users)
 {
-	const int PAGE_SIZE = 4*1024;
-	size_t send_alloc_bytes = SEND_SIZE + (SEND_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
-	size_t recv_alloc_bytes = RECV_SIZE + (RECV_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
-	size_t state_alloc_bytes = STATE_SIZE + (STATE_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
+	size_t send_alloc_bytes = SEND_ALLOC_SIZE + (SEND_ALLOC_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
+	size_t recv_alloc_bytes = RECV_ALLOC_SIZE + (RECV_ALLOC_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
+	size_t state_alloc_bytes = STATE_ALLOC_SIZE + (STATE_ALLOC_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
 
-	coordinate_gpu_tasks(num_gpu_users);
+	static bool first_time = true;
 
-#if 1
+	if (first_time) {
+		coordinate_gpu_tasks(num_gpu_users);
+		first_time = false;
+	}
+
+#if 0
 	switch (CUDA_SYNC_MODE)
 	{
 		case BLOCKING:
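init_events() stands in for the per-device scheduling flags that the surrounding hunks compile out (#if 0 above) and delete (the cudaDeviceScheduleBlockingSync call just below): with cudaEventBlockingSync, cudaEventSynchronize() puts the calling thread to sleep instead of busy-waiting, and cudaEventDisableTiming drops timestamp bookkeeping the benchmark does not need. A minimal sketch of the record/synchronize pairing used throughout this diff (dst, src, nbytes, and stream are assumed to exist):

	cudaEvent_t ev;
	cudaEventCreateWithFlags(&ev, cudaEventBlockingSync | cudaEventDisableTiming);
	cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyHostToDevice, stream);
	cudaEventRecord(ev, stream);	/* marks completion of the async copy */
	cudaEventSynchronize(ev);	/* sleeps until the event fires */
	cudaEventDestroy(ev);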
@@ -674,8 +864,6 @@ static void init_cuda(const int num_gpu_users)
 		cudaSetDeviceFlags(cudaDeviceScheduleSpin);
 		break;
 	}
-#else
-	cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 #endif
 
 	for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
@@ -687,6 +875,9 @@ static void init_cuda(const int num_gpu_users)
 		try
 		{
 			set_cur_gpu(which);
+
+			xprintf("setting up GPU %d\n", which);
+
 			cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0);
 			cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0);
 
@@ -698,8 +889,8 @@ static void init_cuda(const int num_gpu_users)
 			// enough to fill the L2 cache exactly.
 			ELEM_PER_THREAD[which] = (prop.l2CacheSize/(NUM_SM[which]*WARP_SIZE[which]*sizeof(spindata_t)));
 
-
-			if (!MIGRATE_VIA_SYSMEM && prop.unifiedAddressing)
+//			if (!MIGRATE_VIA_SYSMEM && prop.unifiedAddressing)
+			if (prop.unifiedAddressing)
 			{
 				for(int j = 0; j < GPU_PARTITION_SIZE; ++j)
 				{
@@ -717,29 +908,23 @@ static void init_cuda(const int num_gpu_users)
 				}
 			}
 
-			cudaStreamCreate(&STREAMS[CUR_DEVICE]);
+			cudaStreamCreate(&STREAMS[which]);
 
+			// gpu working set
 			cudaMalloc(&d_spin_data[which], prop.l2CacheSize);
 			cudaMemset(&d_spin_data[which], 0, prop.l2CacheSize);
-//			cudaMalloc(&d_iteration_count[which], NUM_SM[which]*WARP_SIZE[which]*sizeof(unsigned int));
-//			cudaHostAlloc(&h_iteration_count[which], NUM_SM[which]*WARP_SIZE[which]*sizeof(unsigned int), cudaHostAllocPortable | cudaHostAllocMapped);
-
-			if (send_alloc_bytes) {
-				cudaMalloc(&d_send_data[which], send_alloc_bytes);
-				cudaHostAlloc(&h_send_data, send_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped);
-			}
 
-			if (h_recv_data) {
-				cudaMalloc(&d_recv_data[which], recv_alloc_bytes);
-				cudaHostAlloc(&h_recv_data, recv_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped);
-			}
+			// send data
+			cudaMalloc(&d_send_data[which], send_alloc_bytes);
+			cudaHostAlloc(&h_send_data, send_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped);
 
-			if (h_state_data) {
-				cudaMalloc(&d_state_data[which], state_alloc_bytes);
+			// recv data
+			cudaMalloc(&d_recv_data[which], recv_alloc_bytes);
+			cudaHostAlloc(&h_recv_data, recv_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped);
 
-				if (MIGRATE_VIA_SYSMEM)
-					cudaHostAlloc(&h_state_data, state_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped | cudaHostAllocWriteCombined);
-			}
+			// state data
+			cudaMalloc(&d_state_data[which], state_alloc_bytes);
+			cudaHostAlloc(&h_state_data, state_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped | cudaHostAllocWriteCombined);
 		}
 		catch(std::exception &e)
 		{
@@ -793,6 +978,8 @@ static bool MigrateToGPU_SysMem(int from, int to)
 	// THIS IS ON-DEMAND SYS_MEM MIGRATION. GPUSync says
 	// you should be using speculative migrations.
 	// Use PushState() and PullState().
+	fprintf(stderr, "Tried to sysmem migrate from %d to %d\n",
+					from, to);
 	assert(false); // for now
 
 	bool success = true;
@@ -846,12 +1033,31 @@ static void MigrateIfNeeded(int next_gpu)
 			PushState();
 		}
 	}
+	else if(cur_gpu() == -1) {
+		set_cur_gpu(next_gpu);
+	}
 }
 
-
-
 static void exit_cuda()
 {
+#if 0
+	for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
+	{
+		int which = GPU_PARTITION*GPU_PARTITION_SIZE + i;
+		gpu_mgmt_mutexes[which].lock();
+		set_cur_gpu(which);
+		cudaFree(d_send_data[which]);
+		cudaFree(d_recv_data[which]);
+		cudaFree(d_state_data[which]);
+		cudaFree(d_spin_data[which]);
+		gpu_mgmt_mutexes[which].unlock();
+	}
+#endif
+
+	cudaFreeHost(h_send_data);
+	cudaFreeHost(h_recv_data);
+	cudaFreeHost(h_state_data);
+
 	for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
 	{
 		int which = GPU_PARTITION*GPU_PARTITION_SIZE + i;
@@ -860,6 +1066,14 @@ static void exit_cuda()
 		cudaDeviceReset();
 		gpu_mgmt_mutexes[which].unlock();
 	}
+
+	memset(d_send_data, 0, sizeof(d_send_data));
+	memset(d_recv_data, 0, sizeof(d_recv_data));
+	memset(d_state_data, 0, sizeof(d_state_data));
+	memset(d_spin_data, 0, sizeof(d_spin_data));
+	h_send_data = NULL;
+	h_recv_data = NULL;
+	h_state_data = NULL;
 }
 
 bool safetynet = false;
@@ -895,14 +1109,6 @@ static void catch_exit(int catch_exit)
 }
 
 
-
-
-
-#ifdef VANILLA_LINUX
-static float ms_sum;
-static int gpucount = 0;
-#endif
-
 __global__ void docudaspin(float* data, /*unsigned int* iterations,*/ unsigned int num_elem, unsigned int cycles)
 {
 	long long int now = clock64();
@@ -959,13 +1165,30 @@ static void gpu_loop_for(double gpu_sec_time, unsigned int num_kernels, double e
 	chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE,
 				cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks());
 
+	bool locked = false;
 	for(unsigned int i = 0; i < num_kernels; ++i)
 	{
-		if(useEngineLocks()) litmus_lock(cur_ee());
+		if(useEngineLocks() && !locked) {
+			litmus_lock(cur_ee());
+			locked = true;
+		}
+
 		/* one block per sm, one warp per block */
-		docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles);
-		cudaStreamSynchronize(cur_stream());
-		if(useEngineLocks()) litmus_unlock(cur_ee());
+		docudaspin <<<cur_sms(), cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles);
+
+		if(useEngineLocks() && (!YIELD_LOCKS || (YIELD_LOCKS && litmus_should_yield_lock(cur_ee())))) {
+//			cudaStreamSynchronize(cur_stream());
+			cudaEventRecord(cur_event(), cur_stream());
+			cudaEventSynchronize(cur_event());
+			litmus_unlock(cur_ee());
+			locked = false;
+		}
+	}
+	if (locked) {
+		cudaEventRecord(cur_event(), cur_stream());
+		cudaEventSynchronize(cur_event());
+		litmus_unlock(cur_ee());
+		locked = false;
 	}
 
 	if(RECV_SIZE > 0)
@@ -985,9 +1208,9 @@ out:
 static void gpu_loop_for_linux(double gpu_sec_time, unsigned int num_kernels, double emergency_exit)
 {
-	static int GPU_OFFSET = GPU_PARTITION * GPU_PARTITION_SIZE;
-	static gpu_pool *pool = &GPU_LINUX_SEM_POOL[GPU_PARTITION];
-	static pthread_mutex_t *mutex = &GPU_LINUX_MUTEX_POOL[GPU_PARTITION];
+	int GPU_OFFSET = GPU_PARTITION * GPU_PARTITION_SIZE;
+	gpu_pool *pool = &GPU_LINUX_SEM_POOL[GPU_PARTITION];
+	pthread_mutex_t *mutex = &GPU_LINUX_MUTEX_POOL[GPU_PARTITION];
 
 	int next_gpu;
 
@@ -996,19 +1219,10 @@ static void gpu_loop_for_linux(double gpu_sec_time, unsigned int num_kernels, do
 		if (emergency_exit && wctime() > emergency_exit)
 			goto out;
 
-#ifdef VANILLA_LINUX
-		static bool once = false;
-		static cudaEvent_t start, end;
-		float ms;
-		if (!once)
-		{
-			once = true;
-			cudaEventCreate(&start);
-			cudaEventCreate(&end);
-		}
-#endif
-
-		next_gpu = pool->get(mutex, cur_gpu() - GPU_OFFSET) + GPU_OFFSET;
+		next_gpu = pool->get(mutex, ((cur_gpu() != -1) ?
+						cur_gpu() - GPU_OFFSET :
+						-1))
+					+ GPU_OFFSET;
 		{
 			MigrateIfNeeded(next_gpu);
 
1021 | for(unsigned int i = 0; i < num_kernels; ++i) | 1235 | for(unsigned int i = 0; i < num_kernels; ++i) |
1022 | { | 1236 | { |
1023 | /* one block per sm, one warp per block */ | 1237 | /* one block per sm, one warp per block */ |
1024 | #ifdef VANILLA_LINUX | ||
1025 | cudaEventRecord(start, cur_stream()); | ||
1026 | #endif | ||
1027 | docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles); | 1238 | docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles); |
1028 | #ifdef VANILLA_LINUX | 1239 | cudaEventRecord(cur_event(), cur_stream()); |
1029 | cudaEventRecord(end, cur_stream()); | 1240 | cudaEventSynchronize(cur_event()); |
1030 | cudaEventSynchronize(end); | 1241 | // cudaStreamSynchronize(cur_stream()); |
1031 | #endif | ||
1032 | cudaStreamSynchronize(cur_stream()); | ||
1033 | |||
1034 | #ifdef VANILLA_LINUX | ||
1035 | cudaEventElapsedTime(&ms, start, end); | ||
1036 | ms_sum += ms; | ||
1037 | #endif | ||
1038 | } | 1242 | } |
1039 | #ifdef VANILLA_LINUX | ||
1040 | ++gpucount; | ||
1041 | #endif | ||
1042 | 1243 | ||
1043 | if(RECV_SIZE > 0) | 1244 | if(RECV_SIZE > 0) |
1044 | chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE, | 1245 | chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE, |
@@ -1075,73 +1276,73 @@ static void usage(char *error) {
 	exit(EXIT_FAILURE);
 }
 
-/*
- * returns the character that made processing stop, newline or EOF
- */
-static int skip_to_next_line(FILE *fstream)
-{
-	int ch;
-	for (ch = fgetc(fstream); ch != EOF && ch != '\n'; ch = fgetc(fstream));
-	return ch;
-}
-
-static void skip_comments(FILE *fstream)
-{
-	int ch;
-	for (ch = fgetc(fstream); ch == '#'; ch = fgetc(fstream))
-		skip_to_next_line(fstream);
-	ungetc(ch, fstream);
-}
-
-static void get_exec_times(const char *file, const int column,
-			   int *num_jobs, double **exec_times)
-{
-	FILE *fstream;
-	int cur_job, cur_col, ch;
-	*num_jobs = 0;
-
-	fstream = fopen(file, "r");
-	if (!fstream)
-		bail_out("could not open execution time file");
-
-	/* figure out the number of jobs */
-	do {
-		skip_comments(fstream);
-		ch = skip_to_next_line(fstream);
-		if (ch != EOF)
-			++(*num_jobs);
-	} while (ch != EOF);
-
-	if (-1 == fseek(fstream, 0L, SEEK_SET))
-		bail_out("rewinding file failed");
-
-	/* allocate space for exec times */
-	*exec_times = (double*)calloc(*num_jobs, sizeof(*exec_times));
-	if (!*exec_times)
-		bail_out("couldn't allocate memory");
-
-	for (cur_job = 0; cur_job < *num_jobs && !feof(fstream); ++cur_job) {
-
-		skip_comments(fstream);
-
-		for (cur_col = 1; cur_col < column; ++cur_col) {
-			/* discard input until we get to the column we want */
-			int unused __attribute__ ((unused)) = fscanf(fstream, "%*s,");
-		}
-
-		/* get the desired exec. time */
-		if (1 != fscanf(fstream, "%lf", (*exec_times)+cur_job)) {
-			fprintf(stderr, "invalid execution time near line %d\n",
-				cur_job);
-			exit(EXIT_FAILURE);
-		}
-
-		skip_to_next_line(fstream);
-	}
-
-	assert(cur_job == *num_jobs);
-	fclose(fstream);
-}
+///*
+// * returns the character that made processing stop, newline or EOF
+// */
+//static int skip_to_next_line(FILE *fstream)
+//{
+//	int ch;
+//	for (ch = fgetc(fstream); ch != EOF && ch != '\n'; ch = fgetc(fstream));
+//	return ch;
+//}
+//
+//static void skip_comments(FILE *fstream)
+//{
+//	int ch;
+//	for (ch = fgetc(fstream); ch == '#'; ch = fgetc(fstream))
+//		skip_to_next_line(fstream);
+//	ungetc(ch, fstream);
+//}
+//
+//static void get_exec_times(const char *file, const int column,
+//			   int *num_jobs, double **exec_times)
+//{
+//	FILE *fstream;
+//	int cur_job, cur_col, ch;
+//	*num_jobs = 0;
+//
+//	fstream = fopen(file, "r");
+//	if (!fstream)
+//		bail_out("could not open execution time file");
+//
+//	/* figure out the number of jobs */
+//	do {
+//		skip_comments(fstream);
+//		ch = skip_to_next_line(fstream);
+//		if (ch != EOF)
+//			++(*num_jobs);
+//	} while (ch != EOF);
+//
+//	if (-1 == fseek(fstream, 0L, SEEK_SET))
+//		bail_out("rewinding file failed");
+//
+//	/* allocate space for exec times */
+//	*exec_times = (double*)calloc(*num_jobs, sizeof(*exec_times));
+//	if (!*exec_times)
+//		bail_out("couldn't allocate memory");
+//
+//	for (cur_job = 0; cur_job < *num_jobs && !feof(fstream); ++cur_job) {
+//
+//		skip_comments(fstream);
+//
+//		for (cur_col = 1; cur_col < column; ++cur_col) {
+//			/* discard input until we get to the column we want */
+//			int unused __attribute__ ((unused)) = fscanf(fstream, "%*s,");
+//		}
+//
+//		/* get the desired exec. time */
+//		if (1 != fscanf(fstream, "%lf", (*exec_times)+cur_job)) {
+//			fprintf(stderr, "invalid execution time near line %d\n",
+//				cur_job);
+//			exit(EXIT_FAILURE);
+//		}
+//
+//		skip_to_next_line(fstream);
+//	}
+//
+//	assert(cur_job == *num_jobs);
+//	fclose(fstream);
+//}
 
 #define NUMS 4096
 static int num[NUMS];
@@ -1190,23 +1391,23 @@ out:
 }
 
 
-static void debug_delay_loop(void)
-{
-	double start, end, delay;
-
-	while (1) {
-		for (delay = 0.5; delay > 0.01; delay -= 0.01) {
-			start = wctime();
-			loop_for(delay, 0);
-			end = wctime();
-			printf("%6.4fs: looped for %10.8fs, delta=%11.8fs, error=%7.4f%%\n",
-				delay,
-				end - start,
-				end - start - delay,
-				100 * (end - start - delay) / delay);
-		}
-	}
-}
+//static void debug_delay_loop(void)
+//{
+//	double start, end, delay;
+//
+//	while (1) {
+//		for (delay = 0.5; delay > 0.01; delay -= 0.01) {
+//			start = wctime();
+//			loop_for(delay, 0);
+//			end = wctime();
+//			printf("%6.4fs: looped for %10.8fs, delta=%11.8fs, error=%7.4f%%\n",
+//				delay,
+//				end - start,
+//				end - start - delay,
+//				100 * (end - start - delay) / delay);
+//		}
+//	}
+//}
 
 typedef bool (*gpu_job_t)(double exec_time, double gpu_exec_time, unsigned int num_kernels, double program_end);
 typedef bool (*cpu_job_t)(double exec_time, double program_end);
@@ -1288,6 +1489,108 @@ static void init_linux()
 	mlockall(MCL_CURRENT | MCL_FUTURE);
 }
 
+static int enable_aux_rt_tasks_linux(pid_t tid)
+{
+	/* pre: caller must already be real time */
+	int ret = 0;
+	struct sched_param param;
+	stringstream pidstr;
+	boost::filesystem::directory_iterator theEnd;
+	boost::filesystem::path proc_dir;
+
+	int policy = sched_getscheduler(tid);
+	if (policy == -1 || policy != SCHED_FIFO) {
+		ret = -1;
+		goto out;
+	}
+
+	ret = sched_getparam(tid, &param);
+	if (ret < 0)
+		goto out;
+
+
+	pidstr<<getpid();
+	proc_dir = boost::filesystem::path("/proc");
+	proc_dir /= pidstr.str();
+	proc_dir /= "task";
+
+	for(boost::filesystem::directory_iterator iter(proc_dir); iter != theEnd; ++iter)
+	{
+		stringstream taskstr(iter->path().leaf().c_str());
+		int child = 0;
+		taskstr>>child;
+		if (child != tid && child != 0)
+		{
+			/* mirror tid's params to others */
+			ret = sched_setscheduler(child, policy, &param);
+			if (ret != 0)
+				goto out;
+		}
+	}
+
+out:
+	return ret;
+}
+
+static int disable_aux_rt_tasks_linux(pid_t tid)
+{
+	int ret = 0;
+	struct sched_param param;
+	stringstream pidstr;
+	boost::filesystem::directory_iterator theEnd;
+	boost::filesystem::path proc_dir;
+
+	memset(&param, 0, sizeof(param));
+
+	pidstr<<getpid();
+	proc_dir = boost::filesystem::path("/proc");
+	proc_dir /= pidstr.str();
+	proc_dir /= "task";
+
+	for(boost::filesystem::directory_iterator iter(proc_dir); iter != theEnd; ++iter)
+	{
+		stringstream taskstr(iter->path().leaf().c_str());
+		int child = 0;
+		taskstr>>child;
+		if (child != tid && child != 0)
+		{
+			/* make all other threads sched_normal */
+			ret = sched_setscheduler(child, SCHED_OTHER, &param);
+			if (ret != 0)
+				goto out;
+		}
+	}
+
+out:
+	return ret;
+}
+
+static int be_migrate_all_to_cluster(int cluster, int cluster_size)
+{
+	int ret = 0;
+	stringstream pidstr;
+
+	pidstr<<getpid();
+	boost::filesystem::path proc_dir("/proc");
+	proc_dir /= pidstr.str();
+	proc_dir /= "task";
+	boost::filesystem::directory_iterator theEnd;
+	for(boost::filesystem::directory_iterator iter(proc_dir); iter != theEnd; ++iter)
+	{
+		stringstream taskstr(iter->path().leaf().c_str());
+		int task = 0;
+		taskstr>>task;
+		if (task != 0) {
+			ret = be_migrate_to_cluster(cluster, cluster_size);
+			if (ret != 0)
+				goto out;
+		}
+	}
+
+out:
+	return ret;
+}
+
 static bool gpu_job_linux(double exec_time, double gpu_exec_time, unsigned int num_kernels, double program_end)
 {
 	double chunk1, chunk2;
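These helpers matter under plain Linux scheduling, presumably because the CUDA runtime spawns hidden helper threads: enable_aux_rt_tasks_linux() walks /proc/<pid>/task with boost::filesystem and mirrors the calling thread's SCHED_FIFO parameters onto every sibling thread, disable_aux_rt_tasks_linux() reverts them to SCHED_OTHER, and be_migrate_all_to_cluster() repeats the cluster migration for each of them. A hedged usage sketch (the caller must already be SCHED_FIFO, per the precondition comment; gettid() is obtained via syscall on the glibc of this era):

	pid_t me = (pid_t)syscall(SYS_gettid);	/* needs <sys/syscall.h>, <unistd.h> */
	if (enable_aux_rt_tasks_linux(me) != 0)
		perror("enable_aux_rt_tasks_linux");
	/* ... real-time GPU section ... */
	if (disable_aux_rt_tasks_linux(me) != 0)
		perror("disable_aux_rt_tasks_linux");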
@@ -1322,195 +1625,820 @@ static bool job_linux(double exec_time, double program_end)
 
 /*****************************/
 
-enum eScheduler
+
+
+
+
+enum eRunMode
 {
-	LITMUS,
-	LINUX,
-	RT_LINUX
+	NORMAL,
+	PROXY,
+	DAEMON,
 };
 
-#define CPU_OPTIONS "p:z:c:wlveio:f:s:q:X:L:Q:d:"
-#define GPU_OPTIONS "g:y:r:C:E:DG:xS:R:T:Z:aFm:b:MNIk:VW:"
-
-// concat the option strings
-#define OPTSTR CPU_OPTIONS GPU_OPTIONS
+void set_defaults(struct Args* args)
+{
+	memset(args, 0, sizeof(*args));
+	args->wcet_ms = -1.0;
+	args->gpu_wcet_ms = 0.0;
+	args->period_ms = -1.0;
+	args->budget_ms = -1.0;
+	args->gpusync_mode = IKGLP_MODE;
+	args->sync_mode = BLOCKING;
+	args->gpu_using = false;
+	args->enable_affinity = false;
+	args->enable_chunking = false;
+	args->relax_fifo_len = false;
+	args->use_sysmem_migration = false;
+	args->rho = 2;
+	args->num_ce = 2;
+	args->reserve_migr_ce = false;
+	args->num_kernels = 1;
+	args->engine_lock_type = FIFO;
+	args->yield_locks = false;
+	args->drain_policy = DRAIN_SIMPLE;
+	args->want_enforcement = false;
+	args->want_signals = false;
+	args->priority = LITMUS_LOWEST_PRIORITY;
+	args->cls = RT_CLASS_SOFT;
+	args->scheduler = LITMUS;
+	args->migrate = false;
+	args->cluster = 0;
+	args->cluster_size = 1;
+	args->stddev = 0.0;
+	args->wait = false;
+	args->scale = 1.0;
+	args->duration = 0.0;
+}
 
-int main(int argc, char** argv)
+void apply_args(struct Args* args)
 {
-	int ret;
+	// set all the globals
+	CPU_PARTITION_SIZE = args->cluster_size;
+	GPU_USING = args->gpu_using;
+	GPU_PARTITION = args->gpu_partition;
+	GPU_PARTITION_SIZE = args->gpu_partition_size;
+	RHO = args->rho;
+	NUM_COPY_ENGINES = args->num_ce;
+	RESERVED_MIGR_COPY_ENGINE = args->reserve_migr_ce;
+	USE_ENGINE_LOCKS = args->use_engine_locks;
+	ENGINE_LOCK_TYPE = args->engine_lock_type;
+	YIELD_LOCKS = args->yield_locks;
+	USE_DYNAMIC_GROUP_LOCKS = args->use_dgls;
+	GPU_SYNC_MODE = args->gpusync_mode;
+	ENABLE_AFFINITY = args->enable_affinity;
+	RELAX_FIFO_MAX_LEN = args->relax_fifo_len;
+	CUDA_SYNC_MODE = args->sync_mode;
+	SEND_SIZE = args->send_size;
+	RECV_SIZE = args->recv_size;
+	STATE_SIZE = args->state_size;
+	ENABLE_CHUNKING = args->enable_chunking;
+	CHUNK_SIZE = args->chunk_size;
+	MIGRATE_VIA_SYSMEM = args->use_sysmem_migration;
+
+	// roll back other globals to an initial state
+	CUR_DEVICE = -1;
+	LAST_DEVICE = -1;
+}
 
+int __do_normal(struct Args* args)
+{
+	int ret = 0;
 	struct rt_task param;
 
 	lt_t wcet;
 	lt_t period;
 	lt_t budget;
-	double wcet_ms = -1.0;
-	double gpu_wcet_ms = 0.0;
-	double period_ms = -1.0;
-	double budget_ms = -1.0;
 
-	unsigned int num_kernels = 1;
+	Normal<double> *wcet_dist_ms = NULL;
 
-	budget_drain_policy_t drain = DRAIN_SIMPLE;
-	bool want_enforcement = false;
-	bool want_signals = false;
+	cpu_job_t cjobfn = NULL;
+	gpu_job_t gjobfn = NULL;
 
-	unsigned int priority = LITMUS_LOWEST_PRIORITY;
+	double start = 0;
 
-	task_class_t cls = RT_CLASS_SOFT;
+	if (MIGRATE_VIA_SYSMEM && GPU_PARTITION_SIZE == 1)
+		return -1;
 
-	eScheduler scheduler = LITMUS;
-	int num_gpu_users = 0;
-	int migrate = 0;
-	int cluster = 0;
-	int cluster_size = 1;
+	// turn off some features to be safe
+	if (args->scheduler != LITMUS)
+	{
+		RHO = 0;
+		USE_ENGINE_LOCKS = false;
+		USE_DYNAMIC_GROUP_LOCKS = false;
+		RELAX_FIFO_MAX_LEN = false;
+		ENABLE_RT_AUX_THREADS = false;
+		args->budget_ms = -1.0;
+		args->want_enforcement = false;
+		args->want_signals = false;
 
-	Normal<double> *wcet_dist_ms = NULL;
-	float stdpct = 0.0;
+		cjobfn = job_linux;
+		gjobfn = gpu_job_linux;
+	}
+	else
+	{
+		cjobfn = job;
+		gjobfn = gpu_job;
+	}
 
-	cpu_job_t cjobfn = NULL;
+	wcet = ms2ns(args->wcet_ms);
1372 | gpu_job_t gjobfn = NULL; | 1745 | period = ms2ns(args->period_ms); |
1746 | if (wcet <= 0) { | ||
1747 | printf("The worst-case execution time must be a positive number.\n"); | ||
1748 | ret = -1; | ||
1749 | goto out; | ||
1750 | } | ||
1751 | if (period <= 0) { | ||
1752 | printf("The period must be a positive number.\n"); | ||
1753 | ret = -1; | ||
1754 | goto out; | ||
1755 | } | ||
1756 | if (wcet > period) { | ||
1757 | printf("The worst-case execution time must not exceed the period.\n"); | ||
1758 | ret = -1; | ||
1759 | goto out; | ||
1760 | } | ||
1761 | if (args->gpu_using && args->gpu_wcet_ms <= 0) { | ||
1762 | printf("The worst-case gpu execution time must be a positive number.\n"); | ||
1763 | ret = -1; | ||
1764 | goto out; | ||
1765 | } | ||
1373 | 1766 | ||
1374 | int wait = 0; | 1767 | if (args->budget_ms > 0.0) |
1375 | double scale = 1.0; | 1768 | budget = ms2ns(args->budget_ms); |
1376 | int test_loop = 0; | 1769 | else |
1770 | budget = wcet; | ||
1377 | 1771 | ||
1378 | double duration = 0, start = 0; | 1772 | // randomize execution time according to a normal distribution |
1379 | int cur_job = 0, num_jobs = 0; | 1773 | // centered around the desired execution time. |
1380 | int column = 1; | 1774 | // standard deviation is a percentage of this average |
1775 | wcet_dist_ms = new Normal<double>(args->wcet_ms + args->gpu_wcet_ms, (args->wcet_ms + args->gpu_wcet_ms) * args->stddev); | ||
1776 | wcet_dist_ms->seed((unsigned int)time(0)); | ||
1381 | 1777 | ||
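
The comment above describes the sampling model: each job's length is drawn from a normal distribution whose mean is the combined CPU+GPU execution time and whose standard deviation is a fraction of that mean (the -W option). A self-contained equivalent sketch using std::normal_distribution in place of the repository's Normal<double> class (input values are illustrative):

    #include <algorithm>
    #include <ctime>
    #include <random>

    int main()
    {
        const double wcet_ms = 8.0, gpu_wcet_ms = 2.0, stddev = 0.1; /* example inputs */
        const double mean = wcet_ms + gpu_wcet_ms;
        std::mt19937 gen((unsigned int)time(0));
        std::normal_distribution<double> dist(mean, mean * stddev);
        double job_ms = std::max(0.0, dist(gen)); /* clamp negatives, as the job loop does */
        return job_ms >= 0.0 ? 0 : 1;
    }
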
1382 | int opt; | 1778 | ret = be_migrate_all_to_cluster(args->cluster, args->cluster_size); |
1779 | if (ret < 0) { | ||
1780 | printf("could not migrate to target partition or cluster.\n"); | ||
1781 | goto out; | ||
1782 | } | ||
1783 | |||
1784 | if (args->scheduler != LITMUS) | ||
1785 | { | ||
1786 | // set some variables needed by linux modes | ||
1787 | if (args->gpu_using) | ||
1788 | TRACE_MIGRATIONS = true; | ||
1789 | periodTime.tv_sec = period / s2ns(1); | ||
1790 | periodTime.tv_nsec = period - periodTime.tv_sec * s2ns(1); | ||
1791 | period_ns = period; | ||
1792 | job_no = 0; | ||
1793 | } | ||
1794 | |||
1795 | init_rt_task_param(&param); | ||
1796 | param.exec_cost = budget; | ||
1797 | param.period = period; | ||
1798 | param.priority = args->priority; | ||
1799 | param.cls = args->cls; | ||
1800 | param.budget_policy = (args->want_enforcement) ? | ||
1801 | PRECISE_ENFORCEMENT : NO_ENFORCEMENT; | ||
1802 | param.budget_signal_policy = (args->want_enforcement && args->want_signals) ? | ||
1803 | PRECISE_SIGNALS : NO_SIGNALS; | ||
1804 | param.drain_policy = args->drain_policy; | ||
1805 | param.release_policy = PERIODIC; | ||
1806 | param.cpu = cluster_to_first_cpu(args->cluster, args->cluster_size); | ||
1807 | |||
1808 | ret = set_rt_task_param(gettid(), &param); | ||
1809 | if (ret < 0) { | ||
1810 | bail_out("could not setup rt task params\n"); | ||
1811 | goto out; | ||
1812 | } | ||
1813 | |||
1814 | if (args->want_signals) | ||
1815 | /* bind default longjmp signal handler to SIG_BUDGET. */ | ||
1816 | activate_litmus_signals(SIG_BUDGET_MASK, longjmp_on_litmus_signal); | ||
1817 | else | ||
1818 | ignore_litmus_signals(SIG_BUDGET_MASK); | ||
1819 | |||
1820 | if (args->gpu_using) | ||
1821 | allocate_locks(args->num_gpu_tasks, args->scheduler != LITMUS); | ||
1822 | |||
1823 | if (args->scheduler == LITMUS) | ||
1824 | { | ||
1825 | ret = task_mode(LITMUS_RT_TASK); | ||
1826 | if (ret < 0) { | ||
1827 | printf("could not become RT task\n"); | ||
1828 | goto out; | ||
1829 | } | ||
1830 | } | ||
1831 | else | ||
1832 | { | ||
1833 | if (args->scheduler == RT_LINUX) | ||
1834 | { | ||
1835 | struct sched_param fifoparams; | ||
1836 | memset(&fifoparams, 0, sizeof(fifoparams)); | ||
1837 | fifoparams.sched_priority = args->priority; | ||
1838 | ret = sched_setscheduler(getpid(), SCHED_FIFO, &fifoparams); | ||
1839 | if (ret < 0) { | ||
1840 | printf("could not become sched_fifo task\n"); | ||
1841 | goto out; | ||
1842 | } | ||
1843 | } | ||
1844 | trace_name(); | ||
1845 | trace_param(); | ||
1846 | } | ||
1847 | |||
1848 | if (args->wait) { | ||
1849 | xprintf("%d: waiting for release.\n", getpid()); | ||
1850 | ret = wait_for_ts_release2(&releaseTime); | ||
1851 | if (ret != 0) { | ||
1852 | printf("wait_for_ts_release2()\n"); | ||
1853 | goto out; | ||
1854 | } | ||
1855 | |||
1856 | if (args->scheduler != LITMUS) | ||
1857 | log_release(); | ||
1858 | } | ||
1859 | else if (args->scheduler != LITMUS) | ||
1860 | { | ||
1861 | clock_gettime(CLOCK_MONOTONIC, &releaseTime); | ||
1862 | sleep_next_period_linux(); | ||
1863 | } | ||
1864 | |||
1865 | if (args->gpu_using && ENABLE_RT_AUX_THREADS) { | ||
1866 | if (args->scheduler == LITMUS) { | ||
1867 | ret = enable_aux_rt_tasks(AUX_CURRENT | AUX_FUTURE); | ||
1868 | if (ret != 0) { | ||
1869 | printf("enable_aux_rt_tasks() failed\n"); | ||
1870 | goto out; | ||
1871 | } | ||
1872 | } | ||
1873 | else if (args->scheduler == RT_LINUX) { | ||
1874 | ret = enable_aux_rt_tasks_linux(gettid()); | ||
1875 | if (ret != 0) { | ||
1876 | printf("enable_aux_rt_tasks_linux() failed\n"); | ||
1877 | goto out; | ||
1878 | } | ||
1879 | } | ||
1880 | } | ||
1383 | 1881 | ||
1384 | double *exec_times = NULL; | 1882 | start = wctime(); |
1385 | const char *file = NULL; | ||
1386 | 1883 | ||
1387 | /* locking */ | 1884 | if (!args->gpu_using) { |
1388 | // int lock_od = -1; | 1885 | bool keepgoing; |
1389 | // int resource_id = 0; | 1886 | do |
1390 | // int protocol = -1; | 1887 | { |
1391 | // double cs_length = 1; /* millisecond */ | 1888 | double job_ms = wcet_dist_ms->random(); |
1889 | if (job_ms < 0.0) | ||
1890 | job_ms = 0.0; | ||
1891 | keepgoing = cjobfn(ms2s(job_ms * args->scale), start + args->duration); | ||
1892 | }while(keepgoing); | ||
1893 | } | ||
1894 | else { | ||
1895 | bool keepgoing; | ||
1896 | do | ||
1897 | { | ||
1898 | double job_ms = wcet_dist_ms->random(); | ||
1899 | if (job_ms < 0.0) | ||
1900 | job_ms = 0.0; | ||
1901 | |||
1902 | double cpu_job_ms = (job_ms/(args->wcet_ms + args->gpu_wcet_ms))*args->wcet_ms; | ||
1903 | double gpu_job_ms = (job_ms/(args->wcet_ms + args->gpu_wcet_ms))*args->gpu_wcet_ms; | ||
1904 | keepgoing = gjobfn( | ||
1905 | ms2s(cpu_job_ms * args->scale), | ||
1906 | ms2s(gpu_job_ms * args->scale), | ||
1907 | args->num_kernels, | ||
1908 | start + args->duration); | ||
1909 | }while(keepgoing); | ||
1910 | } | ||
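
For concreteness: the two lines above split each sampled job proportionally, so with wcet_ms = 8, gpu_wcet_ms = 2, and a sampled job_ms = 12, the job spins for 12 * (8/10) = 9.6 ms on the CPU and 12 * (2/10) = 2.4 ms on the GPU, preserving the configured CPU:GPU ratio.
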
1911 | |||
1912 | if (args->gpu_using && ENABLE_RT_AUX_THREADS) { | ||
1913 | if (args->scheduler == LITMUS) { | ||
1914 | ret = disable_aux_rt_tasks(AUX_CURRENT | AUX_FUTURE); | ||
1915 | if (ret != 0) { | ||
1916 | printf("disable_aux_rt_tasks() failed\n"); | ||
1917 | goto out; | ||
1918 | } | ||
1919 | } | ||
1920 | else if(args->scheduler == RT_LINUX) { | ||
1921 | ret = disable_aux_rt_tasks_linux(gettid()); | ||
1922 | if (ret != 0) { | ||
1923 | printf("disable_aux_rt_tasks_linux() failed\n"); | ||
1924 | goto out; | ||
1925 | } | ||
1926 | } | ||
1927 | } | ||
1928 | |||
1929 | if (args->gpu_using) | ||
1930 | deallocate_locks(args->num_gpu_tasks, args->scheduler != LITMUS); | ||
1931 | |||
1932 | if (args->scheduler == LITMUS) | ||
1933 | { | ||
1934 | ret = task_mode(BACKGROUND_TASK); | ||
1935 | if (ret != 0) { | ||
1936 | printf("could not become regular task (huh?)\n"); | ||
1937 | goto out; | ||
1938 | } | ||
1939 | } | ||
1940 | |||
1941 | { | ||
1942 | // become a normal task just in case. | ||
1943 | struct sched_param normalparams; | ||
1944 | memset(&normalparams, 0, sizeof(normalparams)); | ||
1945 | ret = sched_setscheduler(getpid(), SCHED_OTHER, &normalparams); | ||
1946 | if (ret < 0) { | ||
1947 | printf("could not become sched_normal task\n"); | ||
1948 | goto out; | ||
1949 | } | ||
1950 | } | ||
1951 | |||
1952 | out: | ||
1953 | if (wcet_dist_ms) | ||
1954 | delete wcet_dist_ms; | ||
1955 | |||
1956 | return ret; | ||
1957 | } | ||
1958 | |||
1959 | int do_normal(struct Args* args) | ||
1960 | { | ||
1961 | int ret = 0; | ||
1962 | |||
1963 | apply_args(args); | ||
1964 | |||
1965 | if (args->scheduler == LITMUS) | ||
1966 | init_litmus(); | ||
1967 | else | ||
1968 | init_linux(); | ||
1969 | |||
1970 | if (args->gpu_using) { | ||
1971 | signal(SIGABRT, catch_exit); | ||
1972 | signal(SIGTERM, catch_exit); | ||
1973 | signal(SIGQUIT, catch_exit); | ||
1974 | signal(SIGSEGV, catch_exit); | ||
1975 | |||
1976 | cudaSetDeviceFlags(cudaDeviceScheduleSpin); | ||
1977 | init_cuda(args->num_gpu_tasks); | ||
1978 | init_events(); | ||
1979 | safetynet = true; | ||
1980 | } | ||
1981 | |||
1982 | ret = __do_normal(args); | ||
1983 | |||
1984 | if (args->gpu_using) { | ||
1985 | safetynet = false; | ||
1986 | exit_cuda(); | ||
1987 | } | ||
1988 | |||
1989 | return ret; | ||
1990 | } | ||
1991 | |||
1992 | typedef struct run_entry | ||
1993 | { | ||
1994 | struct Args args; | ||
1995 | int used; | ||
1996 | int ret; | ||
1997 | } run_entry_t; | ||
1998 | |||
1999 | |||
2000 | |||
2001 | static int *num_run_entries = NULL; | ||
2002 | static run_entry_t *run_entries = NULL; | ||
2003 | static pthread_barrier_t *daemon_barrier = NULL; | ||
2004 | static pthread_mutex_t *daemon_mutex = NULL; | ||
2005 | |||
2006 | static run_entry_t *my_run_entry = NULL; | ||
2007 | static managed_shared_memory *daemon_segment_ptr = NULL; | ||
2008 | |||
2009 | int init_daemon(struct Args* args, int num_total_users, bool is_daemon) | ||
2010 | { | ||
2011 | if (num_total_users) | ||
2012 | { | ||
2013 | shared_memory_object::remove("gpuspin_daemon_memory"); | ||
2014 | |||
2015 | daemon_segment_ptr = new managed_shared_memory(create_only, "gpuspin_daemon_memory", 30*PAGE_SIZE); | ||
2016 | num_run_entries = daemon_segment_ptr->construct<int>("int num_run_entries")(); | ||
2017 | *num_run_entries = num_total_users; | ||
2018 | |||
2019 | run_entries = daemon_segment_ptr->construct<struct run_entry>("run_entry_t run_entries")[num_total_users](); | ||
2020 | memset(run_entries, 0, sizeof(run_entry_t)*num_total_users); | ||
2021 | |||
2022 | daemon_mutex = daemon_segment_ptr->construct<pthread_mutex_t>("pthread_mutex_t daemon_mutex")(); | ||
2023 | pthread_mutexattr_t attr; | ||
2024 | pthread_mutexattr_init(&attr); | ||
2025 | pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED); | ||
2026 | pthread_mutex_init(daemon_mutex, &attr); | ||
2027 | pthread_mutexattr_destroy(&attr); | ||
2028 | |||
2029 | daemon_barrier = daemon_segment_ptr->construct<pthread_barrier_t>("pthread_barrier_t daemon_barrier")(); | ||
2030 | pthread_barrierattr_t battr; | ||
2031 | pthread_barrierattr_init(&battr); | ||
2032 | pthread_barrierattr_setpshared(&battr, PTHREAD_PROCESS_SHARED); | ||
2033 | pthread_barrier_init(daemon_barrier, &battr, args->num_tasks*2); | ||
2034 | pthread_barrierattr_destroy(&battr); | ||
2035 | } | ||
2036 | else | ||
2037 | { | ||
2038 | do | ||
2039 | { | ||
2040 | try | ||
2041 | { | ||
2042 | if (!daemon_segment_ptr) daemon_segment_ptr = new managed_shared_memory(open_only, "gpuspin_daemon_memory"); | ||
2043 | } | ||
2044 | catch(...) | ||
2045 | { | ||
2046 | sleep(1); | ||
2047 | } | ||
2048 | }while(daemon_segment_ptr == NULL); | ||
2049 | |||
2050 | num_run_entries = daemon_segment_ptr->find<int>("int num_run_entries").first; | ||
2051 | run_entries = daemon_segment_ptr->find<struct run_entry>("run_entry_t run_entries").first; | ||
2052 | daemon_mutex = daemon_segment_ptr->find<pthread_mutex_t>("pthread_mutex_t daemon_mutex").first; | ||
2053 | daemon_barrier = daemon_segment_ptr->find<pthread_barrier_t>("pthread_barrier_t daemon_barrier").first; | ||
2054 | } | ||
2055 | |||
2056 | if (is_daemon) | ||
2057 | { | ||
2058 | // find and claim an entry | ||
2059 | pthread_mutex_lock(daemon_mutex); | ||
2060 | for(int i = 0; i < *num_run_entries; ++i) | ||
2061 | { | ||
2062 | if(!run_entries[i].used) | ||
2063 | { | ||
2064 | my_run_entry = &run_entries[i]; | ||
2065 | my_run_entry->used = 1; | ||
2066 | break; | ||
2067 | } | ||
2068 | } | ||
2069 | pthread_mutex_unlock(daemon_mutex); | ||
2070 | |||
2071 | assert(my_run_entry); | ||
2072 | my_run_entry->args = *args; | ||
2073 | my_run_entry->ret = 0; | ||
2074 | } | ||
2075 | else | ||
2076 | { | ||
2077 | // find my entry | ||
2078 | pthread_mutex_lock(daemon_mutex); | ||
2079 | for(int i = 0; i < *num_run_entries; ++i) | ||
2080 | { | ||
2081 | if (run_entries[i].args.wcet_ms == args->wcet_ms && | ||
2082 | run_entries[i].args.gpu_wcet_ms == args->gpu_wcet_ms && | ||
2083 | run_entries[i].args.period_ms == args->period_ms) | ||
2084 | { | ||
2085 | my_run_entry = &run_entries[i]; | ||
2086 | break; | ||
2087 | } | ||
2088 | } | ||
2089 | pthread_mutex_unlock(daemon_mutex); | ||
2090 | } | ||
2091 | |||
2092 | if (!my_run_entry) | ||
2093 | return -1; | ||
2094 | return 0; | ||
2095 | } | ||
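
init_daemon() follows the standard boost::interprocess pattern: the creating side constructs named objects inside a managed_shared_memory segment, and every other process looks them up by the same names. A stripped-down sketch of that pattern (segment and object names here are illustrative, not the ones used above):

    #include <boost/interprocess/managed_shared_memory.hpp>
    using namespace boost::interprocess;

    int creator_side()
    {
        shared_memory_object::remove("demo_memory"); /* drop any stale segment */
        managed_shared_memory seg(create_only, "demo_memory", 65536);
        int *counter = seg.construct<int>("counter")(0); /* named construction */
        return *counter;
    }

    int opener_side()
    {
        managed_shared_memory seg(open_only, "demo_memory");
        int *counter = seg.find<int>("counter").first; /* lookup by name */
        return counter ? *counter : -1;
    }
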
2096 | |||
2097 | int put_next_run(struct Args* args) | ||
2098 | { | ||
2099 | assert(my_run_entry); | ||
2100 | |||
2101 | pthread_mutex_lock(daemon_mutex); | ||
2102 | my_run_entry->args = *args; | ||
2103 | pthread_mutex_unlock(daemon_mutex); | ||
2104 | |||
2105 | pthread_barrier_wait(daemon_barrier); | ||
2106 | |||
2107 | return 0; | ||
2108 | } | ||
2109 | |||
2110 | int get_next_run(struct Args* args) | ||
2111 | { | ||
2112 | assert(my_run_entry); | ||
2113 | |||
2114 | pthread_barrier_wait(daemon_barrier); | ||
2115 | |||
2116 | pthread_mutex_lock(daemon_mutex); | ||
2117 | *args = my_run_entry->args; | ||
2118 | my_run_entry->ret = 0; | ||
2119 | pthread_mutex_unlock(daemon_mutex); | ||
2120 | |||
2121 | return 0; | ||
2122 | } | ||
2123 | |||
2124 | int complete_run(int ret) | ||
2125 | { | ||
2126 | assert(my_run_entry); | ||
2127 | |||
2128 | pthread_mutex_lock(daemon_mutex); | ||
2129 | my_run_entry->ret = ret; | ||
2130 | pthread_mutex_unlock(daemon_mutex); | ||
2131 | |||
2132 | pthread_barrier_wait(daemon_barrier); | ||
2133 | |||
2134 | return 0; | ||
2135 | } | ||
2136 | |||
2137 | int wait_completion() | ||
2138 | { | ||
2139 | int ret = 0; | ||
2140 | |||
2141 | assert(my_run_entry); | ||
2142 | |||
2143 | pthread_barrier_wait(daemon_barrier); | ||
2144 | |||
2145 | pthread_mutex_lock(daemon_mutex); | ||
2146 | ret = my_run_entry->ret; | ||
2147 | pthread_mutex_unlock(daemon_mutex); | ||
2148 | |||
2149 | return ret; | ||
2150 | } | ||
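
Taken together, put_next_run()/get_next_run() and complete_run()/wait_completion() bracket each run with two rendezvous on the shared barrier (which init_daemon() sizes at num_tasks*2, so all proxy/daemon pairs meet at once): one to hand the arguments over, one to hand the result back. Schematically:

    proxy (put_next_run)                 daemon (get_next_run)
      write args under mutex
      barrier_wait  <--- rendezvous --->   barrier_wait
                                           copy args under mutex
    proxy (wait_completion)              daemon runs __do_normal(), then complete_run():
                                           write ret under mutex
      barrier_wait  <--- rendezvous --->   barrier_wait
      read ret under mutex
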
2151 | |||
2152 | |||
2153 | |||
2154 | |||
2155 | int do_proxy(struct Args* args) | ||
2156 | { | ||
2157 | int ret = 0; | ||
2158 | ret = init_daemon(args, 0, false); | ||
2159 | if (ret < 0) | ||
2160 | goto out; | ||
2161 | put_next_run(args); | ||
2162 | ret = wait_completion(); | ||
2163 | |||
2164 | out: | ||
2165 | return ret; | ||
2166 | } | ||
2167 | |||
2168 | static bool is_daemon = false; | ||
2169 | static bool running = false; | ||
2170 | static void catch_exit2(int signal) | ||
2171 | { | ||
2172 | if (is_daemon && running) | ||
2173 | complete_run(-signal); | ||
2174 | catch_exit(signal); | ||
2175 | } | ||
2176 | |||
2177 | int do_daemon(struct Args* args) | ||
2178 | { | ||
2179 | is_daemon = true; | ||
2180 | |||
2181 | int ret = 0; | ||
2182 | struct Args nextargs; | ||
2183 | |||
2184 | signal(SIGFPE, catch_exit2); | ||
2185 | signal(SIGABRT, catch_exit2); | ||
2186 | signal(SIGTERM, catch_exit2); | ||
2187 | signal(SIGQUIT, catch_exit2); | ||
2188 | signal(SIGSEGV, catch_exit2); | ||
2189 | |||
2190 | init_daemon(args, args->num_tasks, true); | ||
2191 | |||
2192 | apply_args(args); | ||
2193 | init_litmus(); /* does everything init_linux() does, plus litmus stuff */ | ||
2194 | |||
2195 | if (args->gpu_using) { | ||
2196 | cudaSetDeviceFlags(cudaDeviceScheduleSpin); | ||
2197 | init_cuda(args->num_gpu_tasks); | ||
2198 | init_events(); | ||
2199 | safetynet = true; | ||
2200 | } | ||
2201 | |||
2202 | do { | ||
2203 | bool sync_change = false; | ||
2204 | bool gpu_part_change = false; | ||
2205 | bool gpu_part_size_change = false; | ||
2206 | |||
2207 | xprintf("%d: waiting for work\n", getpid()); | ||
2208 | |||
2209 | get_next_run(&nextargs); | ||
2210 | |||
2211 | if (nextargs.gpu_using) { | ||
2212 | xprintf("%d: gpu using! gpu partition = %d, gwcet = %f, send = %lu\n", | ||
2213 | getpid(), | ||
2214 | nextargs.gpu_partition, | ||
2215 | nextargs.gpu_wcet_ms, | ||
2216 | nextargs.send_size); | ||
2217 | } | ||
2218 | |||
2219 | running = true; | ||
2220 | sync_change = args->gpu_using && (CUDA_SYNC_MODE != nextargs.sync_mode); | ||
2221 | gpu_part_change = args->gpu_using && (GPU_PARTITION != nextargs.gpu_partition); | ||
2222 | gpu_part_size_change = args->gpu_using && (GPU_PARTITION_SIZE != nextargs.gpu_partition_size); | ||
2223 | |||
2224 | if (sync_change || gpu_part_change || gpu_part_size_change) { | ||
2225 | destroy_events(); | ||
2226 | if (gpu_part_change || gpu_part_size_change) | ||
2227 | exit_cuda(); | ||
2228 | } | ||
2229 | apply_args(&nextargs); | ||
2230 | if (sync_change || gpu_part_change || gpu_part_size_change) { | ||
2231 | if (gpu_part_change || gpu_part_size_change) { | ||
2232 | xprintf("%d: changing device configuration\n", getpid()); | ||
2233 | init_cuda(nextargs.num_gpu_tasks); | ||
2234 | CUR_DEVICE = -1; | ||
2235 | LAST_DEVICE = -1; | ||
2236 | } | ||
2237 | init_events(); | ||
2238 | } | ||
2239 | |||
2240 | xprintf("%d: starting run\n", getpid()); | ||
2241 | |||
2242 | ret = __do_normal(&nextargs); | ||
2243 | complete_run(ret); | ||
2244 | running = false; | ||
2245 | }while(ret == 0); | ||
2246 | |||
2247 | if (args->gpu_using) { | ||
2248 | safetynet = false; | ||
2249 | exit_cuda(); | ||
2250 | } | ||
2251 | |||
2252 | if (args->num_gpu_tasks) | ||
2253 | shared_memory_object::remove("gpu_mutex_memory"); | ||
2254 | |||
2255 | if (args->num_tasks) | ||
2256 | shared_memory_object::remove("gpuspin_daemon_memory"); | ||
2257 | |||
2258 | return ret; | ||
2259 | } | ||
2260 | |||
2261 | #define CPU_OPTIONS "p:z:c:wlveio:f:s:q:X:L:Q:d:" | ||
2262 | #define GPU_OPTIONS "g:y:r:C:E:DG:xS:R:T:Z:aFm:b:MNIk:VW:u" | ||
2263 | #define PROXY_OPTIONS "B:PA" | ||
2264 | |||
2265 | // concat the option strings | ||
2266 | #define OPTSTR CPU_OPTIONS GPU_OPTIONS PROXY_OPTIONS | ||
2267 | |||
2268 | int main(int argc, char** argv) | ||
2269 | { | ||
2270 | struct Args myArgs; | ||
2271 | set_defaults(&myArgs); | ||
2272 | |||
2273 | eRunMode run_mode = NORMAL; | ||
2274 | |||
2275 | int opt; | ||
1392 | 2276 | ||
1393 | progname = argv[0]; | 2277 | progname = argv[0]; |
1394 | 2278 | ||
1395 | while ((opt = getopt(argc, argv, OPTSTR)) != -1) { | 2279 | while ((opt = getopt(argc, argv, OPTSTR)) != -1) { |
1396 | switch (opt) { | 2280 | switch (opt) { |
2281 | case 'B': | ||
2282 | myArgs.num_tasks = atoi(optarg); | ||
2283 | break; | ||
2284 | case 'P': | ||
2285 | run_mode = PROXY; | ||
2286 | break; | ||
2287 | case 'A': | ||
2288 | run_mode = DAEMON; | ||
2289 | break; | ||
2290 | |||
2291 | |||
1397 | case 'w': | 2292 | case 'w': |
1398 | wait = 1; | 2293 | myArgs.wait = true; |
1399 | break; | 2294 | break; |
1400 | case 'p': | 2295 | case 'p': |
1401 | cluster = atoi(optarg); | 2296 | myArgs.cluster = atoi(optarg); |
1402 | migrate = 1; | 2297 | myArgs.migrate = true; |
1403 | break; | 2298 | break; |
1404 | case 'z': | 2299 | case 'z': |
1405 | cluster_size = atoi(optarg); | 2300 | // CPU_PARTITION_SIZE = cluster_size; |
1406 | CPU_PARTITION_SIZE = cluster_size; | 2301 | myArgs.cluster_size = atoi(optarg); |
1407 | break; | 2302 | break; |
1408 | case 'g': | 2303 | case 'g': |
1409 | GPU_USING = true; | 2304 | // GPU_USING = true; |
1410 | GPU_PARTITION = atoi(optarg); | 2305 | // GPU_PARTITION = atoi(optarg); |
1411 | assert(GPU_PARTITION >= 0 && GPU_PARTITION < NR_GPUS); | 2306 | myArgs.gpu_using = true; |
2307 | myArgs.gpu_partition = atoi(optarg); | ||
2308 | // assert(GPU_PARTITION >= 0 && GPU_PARTITION < NR_GPUS); | ||
1412 | break; | 2309 | break; |
1413 | case 'y': | 2310 | case 'y': |
1414 | GPU_PARTITION_SIZE = atoi(optarg); | 2311 | // GPU_PARTITION_SIZE = atoi(optarg); |
1415 | assert(GPU_PARTITION_SIZE > 0); | 2312 | myArgs.gpu_partition_size = atoi(optarg); |
2313 | // assert(GPU_PARTITION_SIZE > 0); | ||
1416 | break; | 2314 | break; |
1417 | case 'r': | 2315 | case 'r': |
1418 | RHO = atoi(optarg); | 2316 | // RHO = atoi(optarg); |
1419 | assert(RHO > 0); | 2317 | myArgs.rho = atoi(optarg); |
2318 | // assert(RHO > 0); | ||
1420 | break; | 2319 | break; |
1421 | case 'C': | 2320 | case 'C': |
1422 | NUM_COPY_ENGINES = atoi(optarg); | 2321 | // NUM_COPY_ENGINES = atoi(optarg); |
1423 | assert(NUM_COPY_ENGINES == 1 || NUM_COPY_ENGINES == 2); | 2322 | myArgs.num_ce = atoi(optarg); |
2323 | // assert(NUM_COPY_ENGINES == 1 || NUM_COPY_ENGINES == 2); | ||
1424 | break; | 2324 | break; |
1425 | case 'V': | 2325 | case 'V': |
1426 | RESERVED_MIGR_COPY_ENGINE = true; | 2326 | // RESERVED_MIGR_COPY_ENGINE = true; |
2327 | myArgs.reserve_migr_ce = true; | ||
1427 | break; | 2328 | break; |
1428 | case 'E': | 2329 | case 'E': |
1429 | USE_ENGINE_LOCKS = true; | 2330 | // USE_ENGINE_LOCKS = true; |
1430 | ENGINE_LOCK_TYPE = (eEngineLockTypes)atoi(optarg); | 2331 | // ENGINE_LOCK_TYPE = (eEngineLockTypes)atoi(optarg); |
1431 | assert(ENGINE_LOCK_TYPE == FIFO || ENGINE_LOCK_TYPE == PRIOQ); | 2332 | myArgs.use_engine_locks = true; |
2333 | myArgs.engine_lock_type = (eEngineLockTypes)atoi(optarg); | ||
2334 | // assert(ENGINE_LOCK_TYPE == FIFO || ENGINE_LOCK_TYPE == PRIOQ); | ||
2335 | break; | ||
2336 | case 'u': | ||
2337 | myArgs.yield_locks = true; | ||
1432 | break; | 2338 | break; |
1433 | case 'D': | 2339 | case 'D': |
1434 | USE_DYNAMIC_GROUP_LOCKS = true; | 2340 | // USE_DYNAMIC_GROUP_LOCKS = true; |
2341 | myArgs.use_dgls = true; | ||
1435 | break; | 2342 | break; |
1436 | case 'G': | 2343 | case 'G': |
1437 | GPU_SYNC_MODE = (eGpuSyncMode)atoi(optarg); | 2344 | // GPU_SYNC_MODE = (eGpuSyncMode)atoi(optarg); |
1438 | assert(GPU_SYNC_MODE >= IKGLP_MODE && GPU_SYNC_MODE <= RGEM_MODE); | 2345 | myArgs.gpusync_mode = (eGpuSyncMode)atoi(optarg); |
2346 | // assert(GPU_SYNC_MODE >= IKGLP_MODE && GPU_SYNC_MODE <= RGEM_MODE); | ||
1439 | break; | 2347 | break; |
1440 | case 'a': | 2348 | case 'a': |
1441 | ENABLE_AFFINITY = true; | 2349 | // ENABLE_AFFINITY = true; |
2350 | myArgs.enable_affinity = true; | ||
1442 | break; | 2351 | break; |
1443 | case 'F': | 2352 | case 'F': |
1444 | RELAX_FIFO_MAX_LEN = true; | 2353 | // RELAX_FIFO_MAX_LEN = true; |
2354 | myArgs.relax_fifo_len = true; | ||
1445 | break; | 2355 | break; |
1446 | case 'x': | 2356 | case 'x': |
1447 | CUDA_SYNC_MODE = SPIN; | 2357 | // CUDA_SYNC_MODE = SPIN; |
2358 | myArgs.sync_mode = SPIN; | ||
1448 | break; | 2359 | break; |
1449 | case 'S': | 2360 | case 'S': |
1450 | SEND_SIZE = kbToB((size_t)atoi(optarg)); | 2361 | // SEND_SIZE = kbToB((size_t)atoi(optarg)); |
2362 | myArgs.send_size = kbToB((size_t)atoi(optarg)); | ||
1451 | break; | 2363 | break; |
1452 | case 'R': | 2364 | case 'R': |
1453 | RECV_SIZE = kbToB((size_t)atoi(optarg)); | 2365 | // RECV_SIZE = kbToB((size_t)atoi(optarg)); |
2366 | myArgs.recv_size = kbToB((size_t)atoi(optarg)); | ||
1454 | break; | 2367 | break; |
1455 | case 'T': | 2368 | case 'T': |
1456 | STATE_SIZE = kbToB((size_t)atoi(optarg)); | 2369 | // STATE_SIZE = kbToB((size_t)atoi(optarg)); |
2370 | myArgs.state_size = kbToB((size_t)atoi(optarg)); | ||
1457 | break; | 2371 | break; |
1458 | case 'Z': | 2372 | case 'Z': |
1459 | ENABLE_CHUNKING = true; | 2373 | // ENABLE_CHUNKING = true; |
1460 | CHUNK_SIZE = kbToB((size_t)atoi(optarg)); | 2374 | // CHUNK_SIZE = kbToB((size_t)atoi(optarg)); |
2375 | myArgs.enable_chunking = true; | ||
2376 | myArgs.chunk_size = kbToB((size_t)atoi(optarg)); | ||
1461 | break; | 2377 | break; |
1462 | case 'M': | 2378 | case 'M': |
1463 | MIGRATE_VIA_SYSMEM = true; | 2379 | // MIGRATE_VIA_SYSMEM = true; |
2380 | myArgs.use_sysmem_migration = true; | ||
1464 | break; | 2381 | break; |
1465 | case 'm': | 2382 | case 'm': |
1466 | num_gpu_users = (int)atoi(optarg); | 2383 | // num_gpu_users = (int)atoi(optarg); |
1467 | assert(num_gpu_users > 0); | 2384 | myArgs.num_gpu_tasks = (int)atoi(optarg); |
2385 | // assert(num_gpu_users > 0); | ||
1468 | break; | 2386 | break; |
1469 | case 'k': | 2387 | case 'k': |
1470 | num_kernels = (unsigned int)atoi(optarg); | 2388 | // num_kernels = (unsigned int)atoi(optarg); |
2389 | myArgs.num_kernels = (unsigned int)atoi(optarg); | ||
1471 | break; | 2390 | break; |
1472 | case 'b': | 2391 | case 'b': |
1473 | budget_ms = atoi(optarg); | 2392 | // budget_ms = atoi(optarg); |
2393 | myArgs.budget_ms = atoi(optarg); | ||
1474 | break; | 2394 | break; |
1475 | case 'W': | 2395 | case 'W': |
1476 | stdpct = atof(optarg); | 2396 | // stdpct = (double)atof(optarg); |
2397 | myArgs.stddev = (double)atof(optarg); | ||
1477 | break; | 2398 | break; |
1478 | case 'N': | 2399 | case 'N': |
1479 | scheduler = LINUX; | 2400 | // scheduler = LINUX; |
2401 | myArgs.scheduler = LINUX; | ||
1480 | break; | 2402 | break; |
1481 | case 'I': | 2403 | case 'I': |
1482 | scheduler = RT_LINUX; | 2404 | // scheduler = RT_LINUX; |
2405 | myArgs.scheduler = RT_LINUX; | ||
1483 | break; | 2406 | break; |
1484 | case 'q': | 2407 | case 'q': |
1485 | priority = atoi(optarg); | 2408 | // priority = atoi(optarg); |
2409 | myArgs.priority = atoi(optarg); | ||
1486 | break; | 2410 | break; |
1487 | case 'c': | 2411 | case 'c': |
1488 | cls = str2class(optarg); | 2412 | // cls = str2class(optarg); |
1489 | if (cls == -1) | 2413 | myArgs.cls = str2class(optarg); |
1490 | usage("Unknown task class."); | ||
1491 | break; | 2414 | break; |
1492 | case 'e': | 2415 | case 'e': |
1493 | want_enforcement = true; | 2416 | // want_enforcement = true; |
2417 | myArgs.want_enforcement = true; | ||
1494 | break; | 2418 | break; |
1495 | case 'i': | 2419 | case 'i': |
1496 | want_signals = true; | 2420 | // want_signals = true; |
2421 | myArgs.want_signals = true; | ||
1497 | break; | 2422 | break; |
1498 | case 'd': | 2423 | case 'd': |
1499 | drain = (budget_drain_policy_t)atoi(optarg); | 2424 | // drain = (budget_drain_policy_t)atoi(optarg); |
1500 | assert(drain >= DRAIN_SIMPLE && drain <= DRAIN_SOBLIV); | 2425 | myArgs.drain_policy = (budget_drain_policy_t)atoi(optarg); |
1501 | assert(drain != DRAIN_SAWARE); // unsupported | 2426 | // assert(drain >= DRAIN_SIMPLE && drain <= DRAIN_SOBLIV); |
1502 | break; | 2427 | // assert(drain != DRAIN_SAWARE); // unsupported |
1503 | case 'l': | ||
1504 | test_loop = 1; | ||
1505 | break; | ||
1506 | case 'o': | ||
1507 | column = atoi(optarg); | ||
1508 | break; | 2428 | break; |
2429 | // case 'l': | ||
2430 | // test_loop = 1; | ||
2431 | // break; | ||
2432 | // case 'o': | ||
2433 | //// column = atoi(optarg); | ||
2434 | // myArgs.column = atoi(optarg); | ||
2435 | // break; | ||
1509 | // case 'f': | 2436 | // case 'f': |
1510 | // file = optarg; | 2437 | // file = optarg; |
1511 | // break; | 2438 | // break; |
1512 | case 's': | 2439 | case 's': |
1513 | scale = atof(optarg); | 2440 | // scale = (double)atof(optarg); |
2441 | myArgs.scale = (double)atof(optarg); | ||
1514 | break; | 2442 | break; |
1515 | // case 'X': | 2443 | // case 'X': |
1516 | // protocol = lock_protocol_for_name(optarg); | 2444 | // protocol = lock_protocol_for_name(optarg); |
@@ -1537,304 +2465,33 @@ int main(int argc, char** argv) | |||
1537 | } | 2465 | } |
1538 | } | 2466 | } |
1539 | 2467 | ||
1540 | #ifdef VANILLA_LINUX | ||
1541 | assert(scheduler != LITMUS); | ||
1542 | assert(!wait); | ||
1543 | #endif | ||
1544 | |||
1545 | assert(stdpct >= 0.0); | ||
1546 | |||
1547 | if (MIGRATE_VIA_SYSMEM) | ||
1548 | assert(GPU_PARTITION_SIZE != 1); | ||
1549 | |||
1550 | // turn off some features to be safe | ||
1551 | if (scheduler != LITMUS) | ||
1552 | { | ||
1553 | RHO = 0; | ||
1554 | USE_ENGINE_LOCKS = false; | ||
1555 | USE_DYNAMIC_GROUP_LOCKS = false; | ||
1556 | RELAX_FIFO_MAX_LEN = false; | ||
1557 | ENABLE_RT_AUX_THREADS = false; | ||
1558 | budget_ms = -1.0; | ||
1559 | want_enforcement = false; | ||
1560 | want_signals = false; | ||
1561 | |||
1562 | cjobfn = job_linux; | ||
1563 | gjobfn = gpu_job_linux; | ||
1564 | |||
1565 | if (scheduler == RT_LINUX) | ||
1566 | { | ||
1567 | struct sched_param fifoparams; | ||
1568 | |||
1569 | assert(priority >= sched_get_priority_min(SCHED_FIFO) && | ||
1570 | priority <= sched_get_priority_max(SCHED_FIFO)); | ||
1571 | |||
1572 | memset(&fifoparams, 0, sizeof(fifoparams)); | ||
1573 | fifoparams.sched_priority = priority; | ||
1574 | assert(0 == sched_setscheduler(getpid(), SCHED_FIFO, &fifoparams)); | ||
1575 | } | ||
1576 | } | ||
1577 | else | ||
1578 | { | ||
1579 | cjobfn = job; | ||
1580 | gjobfn = gpu_job; | ||
1581 | |||
1582 | if (!litmus_is_valid_fixed_prio(priority)) | ||
1583 | usage("Invalid priority."); | ||
1584 | } | ||
1585 | |||
1586 | if (test_loop) { | ||
1587 | debug_delay_loop(); | ||
1588 | return 0; | ||
1589 | } | ||
1590 | 2468 | ||
1591 | srand(time(0)); | 2469 | srand(time(0)); |
1592 | 2470 | ||
1593 | if (file) { | ||
1594 | get_exec_times(file, column, &num_jobs, &exec_times); | ||
1595 | |||
1596 | if (argc - optind < 2) | ||
1597 | usage("Arguments missing."); | ||
1598 | |||
1599 | for (cur_job = 0; cur_job < num_jobs; ++cur_job) { | ||
1600 | /* convert the execution time to seconds */ | ||
1601 | duration += exec_times[cur_job] * 0.001; | ||
1602 | } | ||
1603 | } else { | ||
1604 | /* | ||
1605 | * if we're not reading from the CSV file, then we need | ||
1606 | * three parameters | ||
1607 | */ | ||
1608 | if (argc - optind < 3) | ||
1609 | usage("Arguments missing."); | ||
1610 | } | ||
1611 | |||
1612 | if (argc - optind == 3) { | 2471 | if (argc - optind == 3) { |
1613 | assert(!GPU_USING); | 2472 | myArgs.wcet_ms = atof(argv[optind + 0]); |
1614 | wcet_ms = atof(argv[optind + 0]); | 2473 | myArgs.period_ms = atof(argv[optind + 1]); |
1615 | period_ms = atof(argv[optind + 1]); | 2474 | myArgs.duration = atof(argv[optind + 2]); |
1616 | duration = atof(argv[optind + 2]); | ||
1617 | } | 2475 | } |
1618 | else if (argc - optind == 4) { | 2476 | else if (argc - optind == 4) { |
1619 | assert(GPU_USING); | 2477 | myArgs.wcet_ms = atof(argv[optind + 0]); |
1620 | wcet_ms = atof(argv[optind + 0]); | 2478 | myArgs.gpu_wcet_ms = atof(argv[optind + 1]); |
1621 | gpu_wcet_ms = atof(argv[optind + 1]); | 2479 | myArgs.period_ms = atof(argv[optind + 2]); |
1622 | period_ms = atof(argv[optind + 2]); | 2480 | myArgs.duration = atof(argv[optind + 3]); |
1623 | duration = atof(argv[optind + 3]); | ||
1624 | } | ||
1625 | |||
1626 | wcet = ms2ns(wcet_ms); | ||
1627 | period = ms2ns(period_ms); | ||
1628 | if (wcet <= 0) | ||
1629 | usage("The worst-case execution time must be a " | ||
1630 | "positive number."); | ||
1631 | if (period <= 0) | ||
1632 | usage("The period must be a positive number."); | ||
1633 | if (!file && wcet > period) { | ||
1634 | usage("The worst-case execution time must not " | ||
1635 | "exceed the period."); | ||
1636 | } | ||
1637 | if (GPU_USING && gpu_wcet_ms <= 0) | ||
1638 | usage("The worst-case gpu execution time must be a positive number."); | ||
1639 | |||
1640 | if (budget_ms > 0.0) | ||
1641 | budget = ms2ns(budget_ms); | ||
1642 | else | ||
1643 | budget = wcet; | ||
1644 | |||
1645 | #if 0 | ||
1646 | // use upscale to determine breakdown utilization | ||
1647 | // only scaling up CPU time for now. | ||
1648 | double upscale = (double)period/(double)budget - 1.0; | ||
1649 | upscale = std::min(std::max(0.0, upscale), 0.6); // at most 30% | ||
1650 | wcet = wcet + wcet*upscale; | ||
1651 | budget = budget + wcet*upscale; | ||
1652 | wcet_ms = wcet_ms + wcet_ms*upscale; | ||
1653 | |||
1654 | // guard against floating-point rounding | ||
1655 | if (budget < wcet) | ||
1656 | budget = wcet; | ||
1657 | if (budget > period) | ||
1658 | budget = period; | ||
1659 | #endif | ||
1660 | |||
1661 | // randomize execution time according to a normal distribution | ||
1662 | // centered around the desired execution time. | ||
1663 | // standard deviation is a percentage of this average | ||
1664 | wcet_dist_ms = new Normal<double>(wcet_ms + gpu_wcet_ms, (wcet_ms + gpu_wcet_ms) * stdpct); | ||
1665 | wcet_dist_ms->seed((unsigned int)time(0)); | ||
1666 | |||
1667 | if (file && num_jobs > 1) | ||
1668 | duration += period_ms * 0.001 * (num_jobs - 1); | ||
1669 | |||
1670 | if (migrate) { | ||
1671 | ret = be_migrate_to_cluster(cluster, cluster_size); | ||
1672 | if (ret < 0) | ||
1673 | bail_out("could not migrate to target partition or cluster."); | ||
1674 | } | ||
1675 | |||
1676 | if (scheduler != LITMUS) | ||
1677 | { | ||
1678 | // set some variables needed by linux modes | ||
1679 | if (GPU_USING) | ||
1680 | { | ||
1681 | TRACE_MIGRATIONS = true; | ||
1682 | } | ||
1683 | periodTime.tv_sec = period / s2ns(1); | ||
1684 | periodTime.tv_nsec = period - periodTime.tv_sec * s2ns(1); | ||
1685 | period_ns = period; | ||
1686 | } | ||
1687 | |||
1688 | init_rt_task_param(&param); | ||
1689 | param.exec_cost = budget; | ||
1690 | param.period = period; | ||
1691 | param.priority = priority; | ||
1692 | param.cls = cls; | ||
1693 | param.budget_policy = (want_enforcement) ? | ||
1694 | PRECISE_ENFORCEMENT : NO_ENFORCEMENT; | ||
1695 | param.budget_signal_policy = (want_enforcement && want_signals) ? | ||
1696 | PRECISE_SIGNALS : NO_SIGNALS; | ||
1697 | param.drain_policy = drain; | ||
1698 | param.release_policy = PERIODIC; | ||
1699 | |||
1700 | if (migrate) | ||
1701 | param.cpu = cluster_to_first_cpu(cluster, cluster_size); | ||
1702 | ret = set_rt_task_param(gettid(), &param); | ||
1703 | if (ret < 0) | ||
1704 | bail_out("could not setup rt task params"); | ||
1705 | |||
1706 | if (scheduler == LITMUS) { | ||
1707 | init_litmus(); | ||
1708 | } | ||
1709 | else { | ||
1710 | init_linux(); | ||
1711 | } | ||
1712 | |||
1713 | if (want_signals) { | ||
1714 | /* bind default longjmp signal handler to SIG_BUDGET. */ | ||
1715 | activate_litmus_signals(SIG_BUDGET_MASK, longjmp_on_litmus_signal); | ||
1716 | } | ||
1717 | |||
1718 | // if (protocol >= 0) { | ||
1719 | // /* open reference to semaphore */ | ||
1720 | // lock_od = litmus_open_lock(protocol, resource_id, lock_namespace, &cluster); | ||
1721 | // if (lock_od < 0) { | ||
1722 | // perror("litmus_open_lock"); | ||
1723 | // usage("Could not open lock."); | ||
1724 | // } | ||
1725 | // } | ||
1726 | |||
1727 | if (GPU_USING) { | ||
1728 | allocate_locks(num_gpu_users, scheduler != LITMUS); | ||
1729 | |||
1730 | signal(SIGABRT, catch_exit); | ||
1731 | signal(SIGTERM, catch_exit); | ||
1732 | signal(SIGQUIT, catch_exit); | ||
1733 | signal(SIGSEGV, catch_exit); | ||
1734 | |||
1735 | init_cuda(num_gpu_users); | ||
1736 | safetynet = true; | ||
1737 | } | ||
1738 | |||
1739 | if (scheduler == LITMUS) | ||
1740 | { | ||
1741 | ret = task_mode(LITMUS_RT_TASK); | ||
1742 | if (ret != 0) | ||
1743 | bail_out("could not become RT task"); | ||
1744 | } | ||
1745 | else | ||
1746 | { | ||
1747 | trace_name(); | ||
1748 | trace_param(); | ||
1749 | } | ||
1750 | |||
1751 | if (wait) { | ||
1752 | ret = wait_for_ts_release2(&releaseTime); | ||
1753 | if (ret != 0) | ||
1754 | bail_out("wait_for_ts_release2()"); | ||
1755 | |||
1756 | if (scheduler != LITMUS) | ||
1757 | log_release(); | ||
1758 | } | ||
1759 | else if (scheduler != LITMUS) | ||
1760 | { | ||
1761 | clock_gettime(CLOCK_MONOTONIC, &releaseTime); | ||
1762 | sleep_next_period_linux(); | ||
1763 | } | 2481 | } |
1764 | 2482 | ||
1765 | if (scheduler == LITMUS && GPU_USING && ENABLE_RT_AUX_THREADS) { | 2483 | if (myArgs.num_tasks == 0 || myArgs.num_gpu_tasks == 0) { |
1766 | if (enable_aux_rt_tasks(AUX_CURRENT | AUX_FUTURE) != 0) | 2484 | // safety w.r.t. shared mem. |
1767 | bail_out("enable_aux_rt_tasks() failed"); | 2485 | sleep(2); |
1768 | } | 2486 | } |
1769 | 2487 | ||
1770 | start = wctime(); | 2488 | if (run_mode == NORMAL) { |
1771 | 2489 | return do_normal(&myArgs); | |
1772 | if (!GPU_USING) { | ||
1773 | bool keepgoing; | ||
1774 | do | ||
1775 | { | ||
1776 | double job_ms = wcet_dist_ms->random(); | ||
1777 | if (job_ms < 0.0) | ||
1778 | job_ms = 0.0; | ||
1779 | keepgoing = cjobfn(ms2s(job_ms * scale), start + duration); | ||
1780 | }while(keepgoing); | ||
1781 | } | ||
1782 | else { | ||
1783 | bool keepgoing; | ||
1784 | do | ||
1785 | { | ||
1786 | double job_ms = wcet_dist_ms->random(); | ||
1787 | if (job_ms < 0.0) | ||
1788 | job_ms = 0.0; | ||
1789 | |||
1790 | double cpu_job_ms = (job_ms/(wcet_ms + gpu_wcet_ms))*wcet_ms; | ||
1791 | double gpu_job_ms = (job_ms/(wcet_ms + gpu_wcet_ms))*gpu_wcet_ms; | ||
1792 | keepgoing = gjobfn( | ||
1793 | ms2s(cpu_job_ms * scale), | ||
1794 | ms2s(gpu_job_ms * scale), | ||
1795 | num_kernels, | ||
1796 | start + duration); | ||
1797 | }while(keepgoing); | ||
1798 | } | 2490 | } |
1799 | 2491 | else if (run_mode == PROXY) { | |
1800 | if (GPU_USING && ENABLE_RT_AUX_THREADS) | 2492 | return do_proxy(&myArgs); |
1801 | if (disable_aux_rt_tasks(AUX_CURRENT | AUX_FUTURE) != 0) | ||
1802 | bail_out("disable_aux_rt_tasks() failed"); | ||
1803 | |||
1804 | // if (file) { | ||
1805 | // /* use times read from the CSV file */ | ||
1806 | // for (cur_job = 0; cur_job < num_jobs; ++cur_job) { | ||
1807 | // /* convert job's length to seconds */ | ||
1808 | // job(exec_times[cur_job] * 0.001 * scale, | ||
1809 | // start + duration, | ||
1810 | // lock_od, cs_length * 0.001); | ||
1811 | // } | ||
1812 | // } else { | ||
1813 | // /* convert to seconds and scale */ | ||
1814 | // while (job(wcet_ms * 0.001 * scale, start + duration, | ||
1815 | // lock_od, cs_length * 0.001)); | ||
1816 | // } | ||
1817 | |||
1818 | if (scheduler == LITMUS) | ||
1819 | { | ||
1820 | ret = task_mode(BACKGROUND_TASK); | ||
1821 | if (ret != 0) | ||
1822 | bail_out("could not become regular task (huh?)"); | ||
1823 | } | 2493 | } |
1824 | 2494 | else if (run_mode == DAEMON) { | |
1825 | if (GPU_USING) { | 2495 | return do_daemon(&myArgs); |
1826 | safetynet = false; | ||
1827 | exit_cuda(); | ||
1828 | |||
1829 | |||
1830 | // printf("avg: %f\n", ms_sum/gpucount); | ||
1831 | } | 2496 | } |
1832 | |||
1833 | if (wcet_dist_ms) | ||
1834 | delete wcet_dist_ms; | ||
1835 | |||
1836 | if (file) | ||
1837 | free(exec_times); | ||
1838 | |||
1839 | return 0; | ||
1840 | } | 2497 | } |
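
From the option parser above: -A starts a resident daemon instance, -B gives the number of task slots to create in the shared segment, and -P runs as a proxy that hands its parameter set to the daemon whose entry matches on WCET, GPU WCET, and period. An illustrative session: start a daemon with `gpuspin -A -B 1 10 40 60 &` (10 ms WCET, 40 ms period, 60 s duration), then feed it a fresh run with `gpuspin -P 10 40 60`.
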
diff --git a/include/litmus.h b/include/litmus.h
index d3b89cf..a6c2b13 100644
--- a/include/litmus.h
+++ b/include/litmus.h
@@ -89,6 +89,7 @@ int litmus_open_lock( | |||
89 | /* real-time locking protocol support */ | 89 | /* real-time locking protocol support */ |
90 | int litmus_lock(int od); | 90 | int litmus_lock(int od); |
91 | int litmus_unlock(int od); | 91 | int litmus_unlock(int od); |
92 | int litmus_should_yield_lock(int od); | ||
92 | 93 | ||
93 | /* Dynamic group lock support. ods arrays MUST BE PARTIALLY ORDERED!!!!!! | 94 | /* Dynamic group lock support. ods arrays MUST BE PARTIALLY ORDERED!!!!!! |
94 | * Use the same ordering for lock and unlock. | 95 | * Use the same ordering for lock and unlock. |
@@ -99,6 +100,7 @@ int litmus_unlock(int od); | |||
99 | */ | 100 | */ |
100 | int litmus_dgl_lock(int* ods, int dgl_size); | 101 | int litmus_dgl_lock(int* ods, int dgl_size); |
101 | int litmus_dgl_unlock(int* ods, int dgl_size); | 102 | int litmus_dgl_unlock(int* ods, int dgl_size); |
103 | int litmus_dgl_should_yield_lock(int* ods, int dgl_size); | ||
102 | 104 | ||
103 | /* nvidia graphics cards */ | 105 | /* nvidia graphics cards */ |
104 | int register_nv_device(int nv_device_id); | 106 | int register_nv_device(int nv_device_id); |
diff --git a/src/kernel_iface.c b/src/kernel_iface.c
index e446102..73d398f 100644
--- a/src/kernel_iface.c
+++ b/src/kernel_iface.c
@@ -80,7 +80,7 @@ int get_nr_ts_release_waiters(void) | |||
80 | } | 80 | } |
81 | 81 | ||
82 | /* thread-local pointer to control page */ | 82 | /* thread-local pointer to control page */ |
83 | static __thread struct control_page *ctrl_page; | 83 | static __thread struct control_page *ctrl_page = NULL; |
84 | 84 | ||
85 | int init_kernel_iface(void) | 85 | int init_kernel_iface(void) |
86 | { | 86 | { |
diff --git a/src/syscalls.c b/src/syscalls.c
index d3ca5d8..ff02b7d 100644
--- a/src/syscalls.c
+++ b/src/syscalls.c
@@ -58,6 +58,11 @@ int litmus_unlock(int od) | |||
58 | return syscall(__NR_litmus_unlock, od); | 58 | return syscall(__NR_litmus_unlock, od); |
59 | } | 59 | } |
60 | 60 | ||
61 | int litmus_should_yield_lock(int od) | ||
62 | { | ||
63 | return syscall(__NR_litmus_should_yield_lock, od); | ||
64 | } | ||
65 | |||
61 | int litmus_dgl_lock(int *ods, int dgl_size) | 66 | int litmus_dgl_lock(int *ods, int dgl_size) |
62 | { | 67 | { |
63 | return syscall(__NR_litmus_dgl_lock, ods, dgl_size); | 68 | return syscall(__NR_litmus_dgl_lock, ods, dgl_size); |
@@ -68,6 +73,11 @@ int litmus_dgl_unlock(int *ods, int dgl_size) | |||
68 | return syscall(__NR_litmus_dgl_unlock, ods, dgl_size); | 73 | return syscall(__NR_litmus_dgl_unlock, ods, dgl_size); |
69 | } | 74 | } |
70 | 75 | ||
76 | int litmus_dgl_should_yield_lock(int *ods, int dgl_size) | ||
77 | { | ||
78 | return syscall(__NR_litmus_dgl_should_yield_lock, ods, dgl_size); | ||
79 | } | ||
80 | |||
71 | int get_job_no(unsigned int *job_no) | 81 | int get_job_no(unsigned int *job_no) |
72 | { | 82 | { |
73 | return syscall(__NR_query_job_no, job_no); | 83 | return syscall(__NR_query_job_no, job_no); |
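
The new litmus_should_yield_lock()/litmus_dgl_should_yield_lock() wrappers let a lock holder ask the kernel whether it should hand a contended lock over before continuing. A hypothetical usage loop for a chunked operation, assuming a positive return value means "yield" (this sketch is not code from this commit, and do_chunk() is a placeholder):

    #include "litmus.h"

    /* Hold a lock across several bounded work chunks, but release and
     * re-acquire it between chunks whenever the kernel asks us to. */
    void chunked_work_under_lock(int od, int nchunks)
    {
        litmus_lock(od);
        for (int i = 0; i < nchunks; ++i) {
            /* do_chunk(i); -- hypothetical bounded piece of work */
            if (i + 1 < nchunks && litmus_should_yield_lock(od) > 0) {
                litmus_unlock(od); /* let a waiter in */
                litmus_lock(od);   /* then queue up again */
            }
        }
        litmus_unlock(od);
    }
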