author    Glenn Elliott <gelliott@cs.unc.edu>    2013-05-02 18:02:10 -0400
committer Glenn Elliott <gelliott@cs.unc.edu>    2013-05-02 18:02:10 -0400
commit    0f89bddde73d448511004a60b98b8be042f6ffd6 (patch)
tree      0fd80ef6fddc61698eb189b815291bc18c7c10c4
parent    e3935c7f68ce428e394eb53ea29ebef5509bcd7f (diff)
randomize job execution time w/ normal distribution
-rw-r--r--  Makefile         |   4
-rw-r--r--  gpu/budget.cpp   |  32
-rw-r--r--  gpu/gpuspin.cu   | 564
-rw-r--r--  src/migration.c  |   4
4 files changed, 356 insertions(+), 248 deletions(-)
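
The core of this change: rather than running every job for exactly the configured execution time, gpuspin now draws each job's execution time from a normal distribution centered on the requested (CPU + GPU) execution time, with the standard deviation given as a fraction of that mean via the new -W option. A minimal sketch of the sampling pattern, assuming the Blitz++ ranlib Normal class that the diff includes and links against (-lblitz); the helper name and literal values here are illustrative, not part of the patch:

    #include <ctime>
    #include <algorithm>
    #include <random/normal.h>   // Blitz++ ranlib, as included by gpuspin.cu below

    using namespace ranlib;

    // Draw one job's execution time (ms); negative samples are clamped to zero,
    // mirroring the clamp in gpuspin.cu's main loop.
    static double draw_job_ms(Normal<double> &dist)
    {
        return std::max(dist.random(), 0.0);
    }

    int main()
    {
        double wcet_ms = 10.0, gpu_wcet_ms = 5.0, stdpct = 0.1;  // illustrative values
        Normal<double> wcet_dist_ms(wcet_ms + gpu_wcet_ms,
                                    (wcet_ms + gpu_wcet_ms) * stdpct);
        wcet_dist_ms.seed((unsigned int)time(0));
        double job_ms = draw_job_ms(wcet_dist_ms);   // one sample per job release
        (void)job_ms;
        return 0;
    }
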
diff --git a/Makefile b/Makefile
index b91dec5..831c16b 100644
--- a/Makefile
+++ b/Makefile
@@ -30,7 +30,7 @@ flags-api = -D_XOPEN_SOURCE=600 -D_GNU_SOURCE
 flags-misc = -fasynchronous-unwind-tables -fnon-call-exceptions
 
 flags-cu-debug = -g -G -Xcompiler -Wall -Xcompiler -Werror
-flags-cu-optim = -O3 -Xcompiler -march=native
+flags-cu-optim = -O2 -Xcompiler -march=native
 flags-cu-nvcc = --use_fast_math -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30
 flags-cu-misc = -Xcompiler -fasynchronous-unwind-tables -Xcompiler -fnon-call-exceptions -Xcompiler -malign-double -Xcompiler -pthread
 flags-cu-x86_64 = -m64
@@ -299,7 +299,7 @@ lib-budget = -lrt -lm -pthread
 vpath %.cu gpu/
 
 objcu-gpuspin = gpuspin.o common.o
-lib-gpuspin = -lrt -lm -lpthread
+lib-gpuspin = -lblitz -lrt -lm -lpthread
 
 # ##############################################################################
 # Build everything that depends on liblitmus.
diff --git a/gpu/budget.cpp b/gpu/budget.cpp
index eebb14e..e08daf7 100644
--- a/gpu/budget.cpp
+++ b/gpu/budget.cpp
@@ -134,7 +134,7 @@ int job(lt_t exec_ns, lt_t budget_ns)
         for(int i = 0; i < NUM_LOCKS; ++i)
             litmus_lock(LOCKS[i]);
     }
 
     // intentionally overrun via suspension
     if (OVERRUN_BY_SLEEP)
         lt_sleep(approx_remaining + overrun_extra);
@@ -146,11 +146,11 @@ int job(lt_t exec_ns, lt_t budget_ns)
             litmus_dgl_unlock(LOCKS, NUM_LOCKS);
         else
             for(int i = NUM_LOCKS-1; i >= 0; --i)
                 litmus_unlock(LOCKS[i]);
         if (NEST_IN_IKGLP)
             litmus_unlock(IKGLP_LOCK);
     }
 
     if (SIGNALS && BLOCK_SIGNALS_ON_SLEEP)
         unblock_litmus_signals(SIG_BUDGET);
 }
@@ -165,7 +165,7 @@ int job(lt_t exec_ns, lt_t budget_ns)
     return 1;
 }
 
-#define OPTSTR "SbosOvzalwqixdn:r:"
+#define OPTSTR "SbosOvzalwqixdn:r:p:"
 
 int main(int argc, char** argv)
 {
@@ -185,9 +185,16 @@ int main(int argc, char** argv)
     int compute_overrun_rate = 0;
     int once = 1;
 
+    bool migrate = false;
+    int partition = 0;
+    int partition_sz = 1;
 
     while ((opt = getopt(argc, argv, OPTSTR)) != -1) {
         switch(opt) {
+        case 'p':
+            migrate = true;
+            partition = atoi(optarg);
+            break;
         case 'S':
             SIGNALS = 1;
             break;
@@ -261,7 +268,7 @@ int main(int argc, char** argv)
     assert(NUM_LOCKS > 0);
     if (LOCK_TYPE == IKGLP || NEST_IN_IKGLP)
         assert(NUM_REPLICAS >= 1);
 
     LOCKS = new int[NUM_LOCKS];
 
     if (compute_overrun_rate) {
@@ -281,7 +288,14 @@ int main(int argc, char** argv)
         param.budget_policy = PRECISE_ENFORCEMENT;
     else
         param.budget_signal_policy = PRECISE_SIGNALS;
+    if (migrate)
+        param.cpu = cluster_to_first_cpu(partition, partition_sz);
 
+    // set up affinity and init litmus
+    if (migrate) {
+        ret = be_migrate_to_cluster(partition, partition_sz);
+        assert(!ret);
+    }
     init_litmus();
 
     ret = set_rt_task_param(gettid(), &param);
@@ -309,7 +323,7 @@ int main(int argc, char** argv)
         }
         LOCKS[i] = lock;
     }
 
     if (NEST_IN_IKGLP) {
         IKGLP_LOCK = open_ikglp_sem(NAMESPACE, i, NUM_REPLICAS);
         if (IKGLP_LOCK < 0) {
@@ -318,13 +332,13 @@ int main(int argc, char** argv)
             }
         }
     }
 
     if (WAIT) {
         ret = wait_for_ts_release();
         if (ret < 0)
             perror("wait_for_ts_release");
     }
 
     ret = task_mode(LITMUS_RT_TASK);
     assert(ret == 0);
 
@@ -360,6 +374,6 @@ int main(int argc, char** argv)
     printf("# Overruns: %d\n", NUM_OVERRUNS);
 
     delete[] LOCKS;
 
     return 0;
 }
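
The budget test gains a -p <partition> option: before init_litmus() the task migrates to the chosen cluster (cluster size fixed at 1 here) and its rt_task parameter is pointed at that cluster's first CPU. A condensed, hedged restatement of the order of operations the hunks above add (liblitmus calls exactly as used in the diff; error handling trimmed):

    /* '-p <partition>' sets migrate = true and partition = atoi(optarg) */
    param.cpu = cluster_to_first_cpu(partition, partition_sz);

    /* pin the thread to the cluster before initializing LITMUS^RT */
    if (migrate) {
        ret = be_migrate_to_cluster(partition, partition_sz);
        assert(!ret);
    }
    init_litmus();

    ret = set_rt_task_param(gettid(), &param);
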
diff --git a/gpu/gpuspin.cu b/gpu/gpuspin.cu
index 970d6f2..21134f6 100644
--- a/gpu/gpuspin.cu
+++ b/gpu/gpuspin.cu
@@ -11,6 +11,8 @@
 #include <boost/interprocess/managed_shared_memory.hpp>
 #include <boost/interprocess/sync/interprocess_mutex.hpp>
 
+#include <random/normal.h>
+
 #include <cuda_runtime.h>
 
 #include "litmus.h"
@@ -18,6 +20,9 @@
 
 using namespace std;
 using namespace boost::interprocess;
+using namespace ranlib;
+
+#define ms2s(ms) ((ms)*0.001)
 
 const char *lock_namespace = "./.gpuspin-locks";
 
@@ -143,10 +148,10 @@ struct ce_lock_state
143 size_t num_locks; 148 size_t num_locks;
144 size_t budget_remaining; 149 size_t budget_remaining;
145 bool locked; 150 bool locked;
146 151
147 ce_lock_state(int device_a, enum cudaMemcpyKind kind, size_t size, int device_b = -1, bool migration = false) { 152 ce_lock_state(int device_a, enum cudaMemcpyKind kind, size_t size, int device_b = -1, bool migration = false) {
148 num_locks = (device_a != -1) + (device_b != -1); 153 num_locks = (device_a != -1) + (device_b != -1);
149 154
150 if(device_a != -1) { 155 if(device_a != -1) {
151 if (!migration) 156 if (!migration)
152 locks[0] = (kind == cudaMemcpyHostToDevice || (kind == cudaMemcpyDeviceToDevice && device_b == -1)) ? 157 locks[0] = (kind == cudaMemcpyHostToDevice || (kind == cudaMemcpyDeviceToDevice && device_b == -1)) ?
@@ -155,15 +160,15 @@ struct ce_lock_state
155 locks[0] = (kind == cudaMemcpyHostToDevice || (kind == cudaMemcpyDeviceToDevice && device_b == -1)) ? 160 locks[0] = (kind == cudaMemcpyHostToDevice || (kind == cudaMemcpyDeviceToDevice && device_b == -1)) ?
156 CE_MIGR_SEND_LOCKS[device_a] : CE_MIGR_RECV_LOCKS[device_a]; 161 CE_MIGR_SEND_LOCKS[device_a] : CE_MIGR_RECV_LOCKS[device_a];
157 } 162 }
158 163
159 if(device_b != -1) { 164 if(device_b != -1) {
160 assert(kind == cudaMemcpyDeviceToDevice); 165 assert(kind == cudaMemcpyDeviceToDevice);
161 166
162 if (!migration) 167 if (!migration)
163 locks[1] = CE_RECV_LOCKS[device_b]; 168 locks[1] = CE_RECV_LOCKS[device_b];
164 else 169 else
165 locks[1] = CE_MIGR_RECV_LOCKS[device_b]; 170 locks[1] = CE_MIGR_RECV_LOCKS[device_b];
166 171
167 if(locks[1] < locks[0]) { 172 if(locks[1] < locks[0]) {
168 // enforce total order on locking 173 // enforce total order on locking
169 int temp = locks[1]; 174 int temp = locks[1];
@@ -174,35 +179,35 @@ struct ce_lock_state
174 else { 179 else {
175 locks[1] = -1; 180 locks[1] = -1;
176 } 181 }
177 182
178 if(!ENABLE_CHUNKING) 183 if(!ENABLE_CHUNKING)
179 budget_remaining = size; 184 budget_remaining = size;
180 else 185 else
181 budget_remaining = CHUNK_SIZE; 186 budget_remaining = CHUNK_SIZE;
182 } 187 }
183 188
184 void crash(void) { 189 void crash(void) {
185 void *array[50]; 190 void *array[50];
186 int size, i; 191 int size, i;
187 char **messages; 192 char **messages;
188 193
189 size = backtrace(array, 50); 194 size = backtrace(array, 50);
190 messages = backtrace_symbols(array, size); 195 messages = backtrace_symbols(array, size);
191 196
192 fprintf(stderr, "%d: TRIED TO GRAB SAME LOCK TWICE! Lock = %d\n", getpid(), locks[0]); 197 fprintf(stderr, "%d: TRIED TO GRAB SAME LOCK TWICE! Lock = %d\n", getpid(), locks[0]);
193 for (i = 1; i < size && messages != NULL; ++i) 198 for (i = 1; i < size && messages != NULL; ++i)
194 { 199 {
195 fprintf(stderr, "%d: [bt]: (%d) %s\n", getpid(), i, messages[i]); 200 fprintf(stderr, "%d: [bt]: (%d) %s\n", getpid(), i, messages[i]);
196 } 201 }
197 free(messages); 202 free(messages);
198 203
199 assert(false); 204 assert(false);
200 } 205 }
201 206
202 207
203 void lock() { 208 void lock() {
204 if(locks[0] == locks[1]) crash(); 209 if(locks[0] == locks[1]) crash();
205 210
206 if(USE_DYNAMIC_GROUP_LOCKS) { 211 if(USE_DYNAMIC_GROUP_LOCKS) {
207 litmus_dgl_lock(locks, num_locks); 212 litmus_dgl_lock(locks, num_locks);
208 } 213 }
@@ -215,10 +220,10 @@ struct ce_lock_state
215 } 220 }
216 locked = true; 221 locked = true;
217 } 222 }
218 223
219 void unlock() { 224 void unlock() {
220 if(locks[0] == locks[1]) crash(); 225 if(locks[0] == locks[1]) crash();
221 226
222 if(USE_DYNAMIC_GROUP_LOCKS) { 227 if(USE_DYNAMIC_GROUP_LOCKS) {
223 litmus_dgl_unlock(locks, num_locks); 228 litmus_dgl_unlock(locks, num_locks);
224 } 229 }
@@ -232,15 +237,15 @@ struct ce_lock_state
232 } 237 }
233 locked = false; 238 locked = false;
234 } 239 }
235 240
236 void refresh() { 241 void refresh() {
237 budget_remaining = CHUNK_SIZE; 242 budget_remaining = CHUNK_SIZE;
238 } 243 }
239 244
240 bool budgetIsAvailable(size_t tosend) { 245 bool budgetIsAvailable(size_t tosend) {
241 return(tosend >= budget_remaining); 246 return(tosend >= budget_remaining);
242 } 247 }
243 248
244 void decreaseBudget(size_t spent) { 249 void decreaseBudget(size_t spent) {
245 budget_remaining -= spent; 250 budget_remaining -= spent;
246 } 251 }
@@ -253,28 +258,28 @@ static cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count,
253{ 258{
254 cudaError_t ret = cudaSuccess; 259 cudaError_t ret = cudaSuccess;
255 int remaining = count; 260 int remaining = count;
256 261
257 char* dst = (char*)a_dst; 262 char* dst = (char*)a_dst;
258 const char* src = (const char*)a_src; 263 const char* src = (const char*)a_src;
259 264
260 // disable chunking, if needed, by setting chunk_size equal to the 265 // disable chunking, if needed, by setting chunk_size equal to the
261 // amount of data to be copied. 266 // amount of data to be copied.
262 int chunk_size = (ENABLE_CHUNKING) ? CHUNK_SIZE : count; 267 int chunk_size = (ENABLE_CHUNKING) ? CHUNK_SIZE : count;
263 int i = 0; 268 int i = 0;
264 269
265 while(remaining != 0) 270 while(remaining != 0)
266 { 271 {
267 int bytesToCopy = std::min(remaining, chunk_size); 272 int bytesToCopy = std::min(remaining, chunk_size);
268 273
269 if(state && state->budgetIsAvailable(bytesToCopy) && state->locked) { 274 if(state && state->budgetIsAvailable(bytesToCopy) && state->locked) {
270 cudaStreamSynchronize(STREAMS[CUR_DEVICE]); 275 cudaStreamSynchronize(STREAMS[CUR_DEVICE]);
271 ret = cudaGetLastError(); 276 ret = cudaGetLastError();
272 277
273 if(ret != cudaSuccess) 278 if(ret != cudaSuccess)
274 { 279 {
275 break; 280 break;
276 } 281 }
277 282
278 state->unlock(); 283 state->unlock();
279 state->refresh(); // replentish. 284 state->refresh(); // replentish.
280 // we can only run out of 285 // we can only run out of
@@ -283,14 +288,14 @@ static cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count,
283 // be set to cover entire memcpy 288 // be set to cover entire memcpy
284 // if chunking were disabled. 289 // if chunking were disabled.
285 } 290 }
286 291
287 if(state && !state->locked) { 292 if(state && !state->locked) {
288 state->lock(); 293 state->lock();
289 } 294 }
290 295
291 //ret = cudaMemcpy(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind); 296 //ret = cudaMemcpy(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind);
292 cudaMemcpyAsync(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind, STREAMS[CUR_DEVICE]); 297 cudaMemcpyAsync(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind, STREAMS[CUR_DEVICE]);
293 298
294 if(state) { 299 if(state) {
295 state->decreaseBudget(bytesToCopy); 300 state->decreaseBudget(bytesToCopy);
296 } 301 }
@@ -332,9 +337,9 @@ void allocate_locks_litmus(void)
332{ 337{
333 // allocate k-FMLP lock 338 // allocate k-FMLP lock
334 int fd = open(lock_namespace, O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR); 339 int fd = open(lock_namespace, O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
335 340
336 int base_name = GPU_PARTITION * 1000; 341 int base_name = GPU_PARTITION * 1000;
337 342
338 if (GPU_SYNC_MODE == IKGLP_MODE) { 343 if (GPU_SYNC_MODE == IKGLP_MODE) {
339 /* Standard (optimal) IKGLP */ 344 /* Standard (optimal) IKGLP */
340 TOKEN_LOCK = open_gpusync_token_lock(fd, 345 TOKEN_LOCK = open_gpusync_token_lock(fd,
@@ -390,15 +395,15 @@ void allocate_locks_litmus(void)
390 perror("Invalid GPUSync mode specified\n"); 395 perror("Invalid GPUSync mode specified\n");
391 TOKEN_LOCK = -1; 396 TOKEN_LOCK = -1;
392 } 397 }
393 398
394 if(TOKEN_LOCK < 0) 399 if(TOKEN_LOCK < 0)
395 perror("open_token_sem"); 400 perror("open_token_sem");
396 401
397 if(USE_ENGINE_LOCKS) 402 if(USE_ENGINE_LOCKS)
398 { 403 {
399 assert(NUM_COPY_ENGINES == 1 || NUM_COPY_ENGINES == 2); 404 assert(NUM_COPY_ENGINES == 1 || NUM_COPY_ENGINES == 2);
400 assert((NUM_COPY_ENGINES == 1 && !RESERVED_MIGR_COPY_ENGINE) || NUM_COPY_ENGINES == 2); 405 assert((NUM_COPY_ENGINES == 1 && !RESERVED_MIGR_COPY_ENGINE) || NUM_COPY_ENGINES == 2);
401 406
402 // allocate the engine locks. 407 // allocate the engine locks.
403 for (int i = 0; i < GPU_PARTITION_SIZE; ++i) 408 for (int i = 0; i < GPU_PARTITION_SIZE; ++i)
404 { 409 {
@@ -407,27 +412,27 @@ void allocate_locks_litmus(void)
407 int ce_0_name = (i+1)*10 + base_name + 1; 412 int ce_0_name = (i+1)*10 + base_name + 1;
408 int ce_1_name = (i+1)*10 + base_name + 2; 413 int ce_1_name = (i+1)*10 + base_name + 2;
409 int ee_lock = -1, ce_0_lock = -1, ce_1_lock = -1; 414 int ee_lock = -1, ce_0_lock = -1, ce_1_lock = -1;
410 415
411 open_sem_t openEngineLock = (ENGINE_LOCK_TYPE == FIFO) ? 416 open_sem_t openEngineLock = (ENGINE_LOCK_TYPE == FIFO) ?
412 open_fifo_sem : open_prioq_sem; 417 open_fifo_sem : open_prioq_sem;
413 418
414 ee_lock = openEngineLock(fd, ee_name); 419 ee_lock = openEngineLock(fd, ee_name);
415 if (ee_lock < 0) 420 if (ee_lock < 0)
416 perror("open_*_sem (engine lock)"); 421 perror("open_*_sem (engine lock)");
417 422
418 ce_0_lock = openEngineLock(fd, ce_0_name); 423 ce_0_lock = openEngineLock(fd, ce_0_name);
419 if (ce_0_lock < 0) 424 if (ce_0_lock < 0)
420 perror("open_*_sem (engine lock)"); 425 perror("open_*_sem (engine lock)");
421 426
422 if (NUM_COPY_ENGINES == 2) 427 if (NUM_COPY_ENGINES == 2)
423 { 428 {
424 ce_1_lock = openEngineLock(fd, ce_1_name); 429 ce_1_lock = openEngineLock(fd, ce_1_name);
425 if (ce_1_lock < 0) 430 if (ce_1_lock < 0)
426 perror("open_*_sem (engine lock)"); 431 perror("open_*_sem (engine lock)");
427 } 432 }
428 433
429 EE_LOCKS[idx] = ee_lock; 434 EE_LOCKS[idx] = ee_lock;
430 435
431 if (NUM_COPY_ENGINES == 1) 436 if (NUM_COPY_ENGINES == 1)
432 { 437 {
433 // share locks 438 // share locks
@@ -439,7 +444,7 @@ void allocate_locks_litmus(void)
439 else 444 else
440 { 445 {
441 assert(NUM_COPY_ENGINES == 2); 446 assert(NUM_COPY_ENGINES == 2);
442 447
443 if (RESERVED_MIGR_COPY_ENGINE) { 448 if (RESERVED_MIGR_COPY_ENGINE) {
444 // copy engine deadicated to migration operations 449 // copy engine deadicated to migration operations
445 CE_SEND_LOCKS[idx] = ce_0_lock; 450 CE_SEND_LOCKS[idx] = ce_0_lock;
@@ -469,15 +474,18 @@ public:
469 { 474 {
470 memset(&pool[0], 0, sizeof(pool[0])*poolSize); 475 memset(&pool[0], 0, sizeof(pool[0])*poolSize);
471 } 476 }
472 477
473 int get(pthread_mutex_t* tex, int preference = -1) 478 int get(pthread_mutex_t* tex, int preference = -1)
474 { 479 {
475 int which = -1; 480 int which = -1;
476 int last = (preference >= 0) ? preference : 0; 481 // int last = (preference >= 0) ? preference : 0;
482 int last = (ENABLE_AFFINITY) ?
483 (preference >= 0) ? preference : 0 :
484 rand()%poolSize;
477 int minIdx = last; 485 int minIdx = last;
478 486
479 pthread_mutex_lock(tex); 487 pthread_mutex_lock(tex);
480 488
481 int min = pool[last]; 489 int min = pool[last];
482 for(int i = (minIdx+1)%poolSize; i != last; i = (i+1)%poolSize) 490 for(int i = (minIdx+1)%poolSize; i != last; i = (i+1)%poolSize)
483 { 491 {
@@ -485,21 +493,21 @@ public:
485 minIdx = i; 493 minIdx = i;
486 } 494 }
487 ++pool[minIdx]; 495 ++pool[minIdx];
488 496
489 pthread_mutex_unlock(tex); 497 pthread_mutex_unlock(tex);
490 498
491 which = minIdx; 499 which = minIdx;
492 500
493 return which; 501 return which;
494 } 502 }
495 503
496 void put(pthread_mutex_t* tex, int which) 504 void put(pthread_mutex_t* tex, int which)
497 { 505 {
498 pthread_mutex_lock(tex); 506 pthread_mutex_lock(tex);
499 --pool[which]; 507 --pool[which];
500 pthread_mutex_unlock(tex); 508 pthread_mutex_unlock(tex);
501 } 509 }
502 510
503private: 511private:
504 int poolSize; 512 int poolSize;
505 int pool[NR_GPUS]; // >= gpu_part_size 513 int pool[NR_GPUS]; // >= gpu_part_size
@@ -508,19 +516,19 @@ private:
508static gpu_pool* GPU_LINUX_SEM_POOL = NULL; 516static gpu_pool* GPU_LINUX_SEM_POOL = NULL;
509static pthread_mutex_t* GPU_LINUX_MUTEX_POOL = NULL; 517static pthread_mutex_t* GPU_LINUX_MUTEX_POOL = NULL;
510 518
511static void allocate_locks_linux(int num_gpu_users) 519static void allocate_locks_linux(const int num_gpu_users)
512{ 520{
513 managed_shared_memory *segment_pool_ptr = NULL; 521 managed_shared_memory *segment_pool_ptr = NULL;
514 managed_shared_memory *segment_mutex_ptr = NULL; 522 managed_shared_memory *segment_mutex_ptr = NULL;
515 523
516 int numGpuPartitions = NR_GPUS/GPU_PARTITION_SIZE; 524 int numGpuPartitions = NR_GPUS/GPU_PARTITION_SIZE;
517 525
518 if(num_gpu_users != 0) 526 if(num_gpu_users > 0)
519 { 527 {
520 printf("%d creating shared memory for linux semaphores; num pools = %d, pool size = %d\n", getpid(), numGpuPartitions, GPU_PARTITION_SIZE); 528 printf("%d creating shared memory for linux semaphores; num pools = %d, pool size = %d\n", getpid(), numGpuPartitions, GPU_PARTITION_SIZE);
521 shared_memory_object::remove("linux_mutex_memory"); 529 shared_memory_object::remove("linux_mutex_memory");
522 shared_memory_object::remove("linux_sem_memory"); 530 shared_memory_object::remove("linux_sem_memory");
523 531
524 segment_mutex_ptr = new managed_shared_memory(create_only, "linux_mutex_memory", 4*1024); 532 segment_mutex_ptr = new managed_shared_memory(create_only, "linux_mutex_memory", 4*1024);
525 GPU_LINUX_MUTEX_POOL = segment_mutex_ptr->construct<pthread_mutex_t>("pthread_mutex_t linux_m")[numGpuPartitions](); 533 GPU_LINUX_MUTEX_POOL = segment_mutex_ptr->construct<pthread_mutex_t>("pthread_mutex_t linux_m")[numGpuPartitions]();
526 for(int i = 0; i < numGpuPartitions; ++i) 534 for(int i = 0; i < numGpuPartitions; ++i)
@@ -531,7 +539,7 @@ static void allocate_locks_linux(int num_gpu_users)
531 pthread_mutex_init(&(GPU_LINUX_MUTEX_POOL[i]), &attr); 539 pthread_mutex_init(&(GPU_LINUX_MUTEX_POOL[i]), &attr);
532 pthread_mutexattr_destroy(&attr); 540 pthread_mutexattr_destroy(&attr);
533 } 541 }
534 542
535 segment_pool_ptr = new managed_shared_memory(create_only, "linux_sem_memory", 4*1024); 543 segment_pool_ptr = new managed_shared_memory(create_only, "linux_sem_memory", 4*1024);
536 GPU_LINUX_SEM_POOL = segment_pool_ptr->construct<gpu_pool>("gpu_pool linux_p")[numGpuPartitions](GPU_PARTITION_SIZE); 544 GPU_LINUX_SEM_POOL = segment_pool_ptr->construct<gpu_pool>("gpu_pool linux_p")[numGpuPartitions](GPU_PARTITION_SIZE);
537 } 545 }
@@ -548,7 +556,7 @@ static void allocate_locks_linux(int num_gpu_users)
548 sleep(1); 556 sleep(1);
549 } 557 }
550 }while(segment_pool_ptr == NULL); 558 }while(segment_pool_ptr == NULL);
551 559
552 do 560 do
553 { 561 {
554 try 562 try
@@ -560,7 +568,7 @@ static void allocate_locks_linux(int num_gpu_users)
560 sleep(1); 568 sleep(1);
561 } 569 }
562 }while(segment_mutex_ptr == NULL); 570 }while(segment_mutex_ptr == NULL);
563 571
564 GPU_LINUX_SEM_POOL = segment_pool_ptr->find<gpu_pool>("gpu_pool linux_p").first; 572 GPU_LINUX_SEM_POOL = segment_pool_ptr->find<gpu_pool>("gpu_pool linux_p").first;
565 GPU_LINUX_MUTEX_POOL = segment_mutex_ptr->find<pthread_mutex_t>("pthread_mutex_t linux_m").first; 573 GPU_LINUX_MUTEX_POOL = segment_mutex_ptr->find<pthread_mutex_t>("pthread_mutex_t linux_m").first;
566 } 574 }
@@ -569,7 +577,7 @@ static void allocate_locks_linux(int num_gpu_users)
569 577
570 578
571 579
572static void allocate_locks(int num_gpu_users, bool linux_mode) 580static void allocate_locks(const int num_gpu_users, bool linux_mode)
573{ 581{
574 if(!linux_mode) 582 if(!linux_mode)
575 allocate_locks_litmus(); 583 allocate_locks_litmus();
@@ -593,14 +601,14 @@ static pthread_barrier_t *gpu_barrier = NULL;
593static interprocess_mutex *gpu_mgmt_mutexes = NULL; 601static interprocess_mutex *gpu_mgmt_mutexes = NULL;
594static managed_shared_memory *segment_ptr = NULL; 602static managed_shared_memory *segment_ptr = NULL;
595 603
596void coordinate_gpu_tasks(int num_gpu_users) 604void coordinate_gpu_tasks(const int num_gpu_users)
597{ 605{
598 if(num_gpu_users != 0) 606 if(num_gpu_users > 0)
599 { 607 {
600 printf("%d creating shared memory\n", getpid()); 608 printf("%d creating shared memory\n", getpid());
601 shared_memory_object::remove("gpu_barrier_memory"); 609 shared_memory_object::remove("gpu_barrier_memory");
602 segment_ptr = new managed_shared_memory(create_only, "gpu_barrier_memory", 4*1024); 610 segment_ptr = new managed_shared_memory(create_only, "gpu_barrier_memory", 4*1024);
603 611
604 printf("%d creating a barrier for %d users\n", getpid(), num_gpu_users); 612 printf("%d creating a barrier for %d users\n", getpid(), num_gpu_users);
605 gpu_barrier = segment_ptr->construct<pthread_barrier_t>("pthread_barrier_t gpu_barrier")(); 613 gpu_barrier = segment_ptr->construct<pthread_barrier_t>("pthread_barrier_t gpu_barrier")();
606 pthread_barrierattr_t battr; 614 pthread_barrierattr_t battr;
@@ -624,7 +632,7 @@ void coordinate_gpu_tasks(int num_gpu_users)
624 sleep(1); 632 sleep(1);
625 } 633 }
626 }while(segment_ptr == NULL); 634 }while(segment_ptr == NULL);
627 635
628 gpu_barrier = segment_ptr->find<pthread_barrier_t>("pthread_barrier_t gpu_barrier").first; 636 gpu_barrier = segment_ptr->find<pthread_barrier_t>("pthread_barrier_t gpu_barrier").first;
629 gpu_mgmt_mutexes = segment_ptr->find<interprocess_mutex>("interprocess_mutex m").first; 637 gpu_mgmt_mutexes = segment_ptr->find<interprocess_mutex>("interprocess_mutex m").first;
630 } 638 }
@@ -647,15 +655,16 @@ char *h_state_data = 0;
647 655
648unsigned int *h_iteration_count[NR_GPUS] = {0}; 656unsigned int *h_iteration_count[NR_GPUS] = {0};
649 657
650static void init_cuda(int num_gpu_users) 658static void init_cuda(const int num_gpu_users)
651{ 659{
652 const int PAGE_SIZE = 4*1024; 660 const int PAGE_SIZE = 4*1024;
653 size_t send_alloc_bytes = SEND_SIZE + (SEND_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; 661 size_t send_alloc_bytes = SEND_SIZE + (SEND_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
654 size_t recv_alloc_bytes = RECV_SIZE + (RECV_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; 662 size_t recv_alloc_bytes = RECV_SIZE + (RECV_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
655 size_t state_alloc_bytes = STATE_SIZE + (STATE_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; 663 size_t state_alloc_bytes = STATE_SIZE + (STATE_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
656 664
657 coordinate_gpu_tasks(num_gpu_users); 665 coordinate_gpu_tasks(num_gpu_users);
658 666
667#if 1
659 switch (CUDA_SYNC_MODE) 668 switch (CUDA_SYNC_MODE)
660 { 669 {
661 case BLOCKING: 670 case BLOCKING:
@@ -665,72 +674,85 @@ static void init_cuda(int num_gpu_users)
         cudaSetDeviceFlags(cudaDeviceScheduleSpin);
         break;
     }
+#else
+    cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+#endif
 
     for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
     {
         cudaDeviceProp prop;
         int which = GPU_PARTITION*GPU_PARTITION_SIZE + i;
 
         gpu_mgmt_mutexes[which].lock();
-
-        set_cur_gpu(which);
-        cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0);
-        cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0);
-
-        cudaGetDeviceProperties(&prop, which);
-        GPU_HZ[which] = prop.clockRate * 1000; /* khz -> hz */
-        NUM_SM[which] = prop.multiProcessorCount;
-        WARP_SIZE[which] = prop.warpSize;
-
-        // enough to fill the L2 cache exactly.
-        ELEM_PER_THREAD[which] = (prop.l2CacheSize/(NUM_SM[which]*WARP_SIZE[which]*sizeof(spindata_t)));
-
-
-        if (!MIGRATE_VIA_SYSMEM && prop.unifiedAddressing)
+        try
         {
-            for(int j = 0; j < GPU_PARTITION_SIZE; ++j)
+            set_cur_gpu(which);
+            cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0);
+            cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0);
+
+            cudaGetDeviceProperties(&prop, which);
+            GPU_HZ[which] = prop.clockRate * 1000; /* khz -> hz */
+            NUM_SM[which] = prop.multiProcessorCount;
+            WARP_SIZE[which] = prop.warpSize;
+
+            // enough to fill the L2 cache exactly.
+            ELEM_PER_THREAD[which] = (prop.l2CacheSize/(NUM_SM[which]*WARP_SIZE[which]*sizeof(spindata_t)));
+
+
+            if (!MIGRATE_VIA_SYSMEM && prop.unifiedAddressing)
             {
-                if (i != j)
+                for(int j = 0; j < GPU_PARTITION_SIZE; ++j)
                 {
-                    int other = GPU_PARTITION*GPU_PARTITION_SIZE + j;
-                    int canAccess = 0;
-                    cudaDeviceCanAccessPeer(&canAccess, which, other);
-                    if(canAccess)
+                    if (i != j)
                     {
-                        cudaDeviceEnablePeerAccess(other, 0);
-                        p2pMigration[which][other] = true;
+                        int other = GPU_PARTITION*GPU_PARTITION_SIZE + j;
+                        int canAccess = 0;
+                        cudaDeviceCanAccessPeer(&canAccess, which, other);
+                        if(canAccess)
+                        {
+                            cudaDeviceEnablePeerAccess(other, 0);
+                            p2pMigration[which][other] = true;
+                        }
                     }
                 }
             }
+
+            cudaStreamCreate(&STREAMS[CUR_DEVICE]);
+
+            cudaMalloc(&d_spin_data[which], prop.l2CacheSize);
+            cudaMemset(&d_spin_data[which], 0, prop.l2CacheSize);
+//            cudaMalloc(&d_iteration_count[which], NUM_SM[which]*WARP_SIZE[which]*sizeof(unsigned int));
+//            cudaHostAlloc(&h_iteration_count[which], NUM_SM[which]*WARP_SIZE[which]*sizeof(unsigned int), cudaHostAllocPortable | cudaHostAllocMapped);
+
+            if (send_alloc_bytes) {
+                cudaMalloc(&d_send_data[which], send_alloc_bytes);
+                cudaHostAlloc(&h_send_data, send_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped);
+            }
+
+            if (h_recv_data) {
+                cudaMalloc(&d_recv_data[which], recv_alloc_bytes);
+                cudaHostAlloc(&h_recv_data, recv_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped);
+            }
+
+            if (h_state_data) {
+                cudaMalloc(&d_state_data[which], state_alloc_bytes);
+
+                if (MIGRATE_VIA_SYSMEM)
+                    cudaHostAlloc(&h_state_data, state_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped | cudaHostAllocWriteCombined);
+            }
         }
-
-        cudaStreamCreate(&STREAMS[CUR_DEVICE]);
-
-        cudaMalloc(&d_spin_data[which], prop.l2CacheSize);
-        cudaMemset(&d_spin_data[which], 0, prop.l2CacheSize);
-//        cudaMalloc(&d_iteration_count[which], NUM_SM[which]*WARP_SIZE[which]*sizeof(unsigned int));
-//        cudaHostAlloc(&h_iteration_count[which], NUM_SM[which]*WARP_SIZE[which]*sizeof(unsigned int), cudaHostAllocPortable | cudaHostAllocMapped);
-
-        if (send_alloc_bytes) {
-            cudaMalloc(&d_send_data[which], send_alloc_bytes);
-            cudaHostAlloc(&h_send_data, send_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped);
-        }
-
-        if (h_recv_data) {
-            cudaMalloc(&d_recv_data[which], recv_alloc_bytes);
-            cudaHostAlloc(&h_recv_data, recv_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped);
+        catch(std::exception &e)
+        {
+            printf("caught an exception during initializiation!: %s\n", e.what());
         }
-
-        if (h_state_data) {
-            cudaMalloc(&d_state_data[which], state_alloc_bytes);
-
-            if (MIGRATE_VIA_SYSMEM)
-                cudaHostAlloc(&h_state_data, state_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped | cudaHostAllocWriteCombined);
+        catch(...)
+        {
+            printf("caught unknown exception.\n");
         }
 
         gpu_mgmt_mutexes[which].unlock();
     }
 
     // roll back to first GPU
     set_cur_gpu(GPU_PARTITION*GPU_PARTITION_SIZE);
 }
@@ -772,26 +794,26 @@ static bool MigrateToGPU_SysMem(int from, int to)
772 // you should be using speculative migrations. 794 // you should be using speculative migrations.
773 // Use PushState() and PullState(). 795 // Use PushState() and PullState().
774 assert(false); // for now 796 assert(false); // for now
775 797
776 bool success = true; 798 bool success = true;
777 799
778 set_cur_gpu(from); 800 set_cur_gpu(from);
779 chunkMemcpy(h_state_data, this_gpu(d_state_data), 801 chunkMemcpy(h_state_data, this_gpu(d_state_data),
780 STATE_SIZE, cudaMemcpyDeviceToHost, 802 STATE_SIZE, cudaMemcpyDeviceToHost,
781 from, useEngineLocks(), -1, true); 803 from, useEngineLocks(), -1, true);
782 804
783 set_cur_gpu(to); 805 set_cur_gpu(to);
784 chunkMemcpy(this_gpu(d_state_data), h_state_data, 806 chunkMemcpy(this_gpu(d_state_data), h_state_data,
785 STATE_SIZE, cudaMemcpyHostToDevice, 807 STATE_SIZE, cudaMemcpyHostToDevice,
786 to, useEngineLocks(), -1, true); 808 to, useEngineLocks(), -1, true);
787 809
788 return success; 810 return success;
789} 811}
790 812
791static bool MigrateToGPU(int from, int to) 813static bool MigrateToGPU(int from, int to)
792{ 814{
793 bool success = false; 815 bool success = false;
794 816
795 if (from != to) 817 if (from != to)
796 { 818 {
797 if(!MIGRATE_VIA_SYSMEM && p2pMigration[to][from]) 819 if(!MIGRATE_VIA_SYSMEM && p2pMigration[to][from])
@@ -804,7 +826,7 @@ static bool MigrateToGPU(int from, int to)
804 set_cur_gpu(to); 826 set_cur_gpu(to);
805 success = true; 827 success = true;
806 } 828 }
807 829
808 return success; 830 return success;
809} 831}
810 832
@@ -851,9 +873,9 @@ static void catch_exit(int catch_exit)
851 { 873 {
852 int which = GPU_PARTITION*GPU_PARTITION_SIZE + i; 874 int which = GPU_PARTITION*GPU_PARTITION_SIZE + i;
853 set_cur_gpu(which); 875 set_cur_gpu(which);
854 876
855// cudaDeviceReset(); 877// cudaDeviceReset();
856 878
857 // try to unlock everything. litmus will prevent bogus calls. 879 // try to unlock everything. litmus will prevent bogus calls.
858 if(USE_ENGINE_LOCKS) 880 if(USE_ENGINE_LOCKS)
859 { 881 {
@@ -883,15 +905,15 @@ static int gpucount = 0;
883 905
884__global__ void docudaspin(float* data, /*unsigned int* iterations,*/ unsigned int num_elem, unsigned int cycles) 906__global__ void docudaspin(float* data, /*unsigned int* iterations,*/ unsigned int num_elem, unsigned int cycles)
885{ 907{
886 long long int now = clock64(); 908 long long int now = clock64();
887 long long unsigned int elapsed = 0; 909 long long unsigned int elapsed = 0;
888 long long int last; 910 long long int last;
889 911
890// unsigned int iter = 0; 912// unsigned int iter = 0;
891 unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 913 unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
892 unsigned int j = 0; 914 unsigned int j = 0;
893 bool toggle = true; 915 bool toggle = true;
894 916
895// iterations[i] = 0; 917// iterations[i] = 0;
896 do 918 do
897 { 919 {
@@ -899,7 +921,7 @@ __global__ void docudaspin(float* data, /*unsigned int* iterations,*/ unsigned i
899 j = (j + 1 != num_elem) ? j + 1 : 0; 921 j = (j + 1 != num_elem) ? j + 1 : 0;
900 toggle = !toggle; 922 toggle = !toggle;
901// iter++; 923// iter++;
902 924
903 last = now; 925 last = now;
904 now = clock64(); 926 now = clock64();
905 927
@@ -909,13 +931,13 @@ __global__ void docudaspin(float* data, /*unsigned int* iterations,*/ unsigned i
909// elapsed += (diff > 0) ? 931// elapsed += (diff > 0) ?
910// diff : 932// diff :
911// now + ((~((long long int)0)<<1)>>1) - last; 933// now + ((~((long long int)0)<<1)>>1) - last;
912 934
913 // don't count iterations with clock roll-over 935 // don't count iterations with clock roll-over
914 elapsed += max(0ll, now - last); 936 elapsed += max(0ll, now - last);
915 }while(elapsed < cycles); 937 }while(elapsed < cycles);
916 938
917// iterations[i] = iter; 939// iterations[i] = iter;
918 940
919 return; 941 return;
920} 942}
921 943
@@ -923,9 +945,11 @@ static void gpu_loop_for(double gpu_sec_time, unsigned int num_kernels, double e
923{ 945{
924 int next_gpu; 946 int next_gpu;
925 947
948 if (gpu_sec_time <= 0.0)
949 goto out;
926 if (emergency_exit && wctime() > emergency_exit) 950 if (emergency_exit && wctime() > emergency_exit)
927 goto out; 951 goto out;
928 952
929 next_gpu = litmus_lock(TOKEN_LOCK); 953 next_gpu = litmus_lock(TOKEN_LOCK);
930 { 954 {
931 MigrateIfNeeded(next_gpu); 955 MigrateIfNeeded(next_gpu);
@@ -934,7 +958,7 @@ static void gpu_loop_for(double gpu_sec_time, unsigned int num_kernels, double e
934 if(SEND_SIZE > 0) 958 if(SEND_SIZE > 0)
935 chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE, 959 chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE,
936 cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks()); 960 cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks());
937 961
938 for(unsigned int i = 0; i < num_kernels; ++i) 962 for(unsigned int i = 0; i < num_kernels; ++i)
939 { 963 {
940 if(useEngineLocks()) litmus_lock(cur_ee()); 964 if(useEngineLocks()) litmus_lock(cur_ee());
@@ -943,18 +967,18 @@ static void gpu_loop_for(double gpu_sec_time, unsigned int num_kernels, double e
943 cudaStreamSynchronize(cur_stream()); 967 cudaStreamSynchronize(cur_stream());
944 if(useEngineLocks()) litmus_unlock(cur_ee()); 968 if(useEngineLocks()) litmus_unlock(cur_ee());
945 } 969 }
946 970
947 if(RECV_SIZE > 0) 971 if(RECV_SIZE > 0)
948 chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE, 972 chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE,
949 cudaMemcpyDeviceToHost, CUR_DEVICE, useEngineLocks()); 973 cudaMemcpyDeviceToHost, CUR_DEVICE, useEngineLocks());
950 974
951 if (MIGRATE_VIA_SYSMEM) 975 if (MIGRATE_VIA_SYSMEM)
952 PullState(); 976 PullState();
953 } 977 }
954 litmus_unlock(TOKEN_LOCK); 978 litmus_unlock(TOKEN_LOCK);
955 979
956 last_gpu() = cur_gpu(); 980 last_gpu() = cur_gpu();
957 981
958out: 982out:
959 return; 983 return;
960} 984}
@@ -964,7 +988,14 @@ static void gpu_loop_for_linux(double gpu_sec_time, unsigned int num_kernels, do
964 static int GPU_OFFSET = GPU_PARTITION * GPU_PARTITION_SIZE; 988 static int GPU_OFFSET = GPU_PARTITION * GPU_PARTITION_SIZE;
965 static gpu_pool *pool = &GPU_LINUX_SEM_POOL[GPU_PARTITION]; 989 static gpu_pool *pool = &GPU_LINUX_SEM_POOL[GPU_PARTITION];
966 static pthread_mutex_t *mutex = &GPU_LINUX_MUTEX_POOL[GPU_PARTITION]; 990 static pthread_mutex_t *mutex = &GPU_LINUX_MUTEX_POOL[GPU_PARTITION];
967 991
992 int next_gpu;
993
994 if (gpu_sec_time <= 0.0)
995 goto out;
996 if (emergency_exit && wctime() > emergency_exit)
997 goto out;
998
968#ifdef VANILLA_LINUX 999#ifdef VANILLA_LINUX
969 static bool once = false; 1000 static bool once = false;
970 static cudaEvent_t start, end; 1001 static cudaEvent_t start, end;
@@ -977,21 +1008,16 @@ static void gpu_loop_for_linux(double gpu_sec_time, unsigned int num_kernels, do
977 } 1008 }
978#endif 1009#endif
979 1010
980 int next_gpu;
981
982 if (emergency_exit && wctime() > emergency_exit)
983 goto out;
984
985 next_gpu = pool->get(mutex, cur_gpu() - GPU_OFFSET) + GPU_OFFSET; 1011 next_gpu = pool->get(mutex, cur_gpu() - GPU_OFFSET) + GPU_OFFSET;
986 { 1012 {
987 MigrateIfNeeded(next_gpu); 1013 MigrateIfNeeded(next_gpu);
988 1014
989 unsigned int numcycles = ((unsigned int)(cur_hz() * gpu_sec_time))/num_kernels; 1015 unsigned int numcycles = ((unsigned int)(cur_hz() * gpu_sec_time))/num_kernels;
990 1016
991 if(SEND_SIZE > 0) 1017 if(SEND_SIZE > 0)
992 chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE, 1018 chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE,
993 cudaMemcpyHostToDevice, cur_gpu(), useEngineLocks()); 1019 cudaMemcpyHostToDevice, cur_gpu(), useEngineLocks());
994 1020
995 for(unsigned int i = 0; i < num_kernels; ++i) 1021 for(unsigned int i = 0; i < num_kernels; ++i)
996 { 1022 {
997 /* one block per sm, one warp per block */ 1023 /* one block per sm, one warp per block */
@@ -1004,7 +1030,7 @@ static void gpu_loop_for_linux(double gpu_sec_time, unsigned int num_kernels, do
1004 cudaEventSynchronize(end); 1030 cudaEventSynchronize(end);
1005#endif 1031#endif
1006 cudaStreamSynchronize(cur_stream()); 1032 cudaStreamSynchronize(cur_stream());
1007 1033
1008#ifdef VANILLA_LINUX 1034#ifdef VANILLA_LINUX
1009 cudaEventElapsedTime(&ms, start, end); 1035 cudaEventElapsedTime(&ms, start, end);
1010 ms_sum += ms; 1036 ms_sum += ms;
@@ -1013,18 +1039,18 @@ static void gpu_loop_for_linux(double gpu_sec_time, unsigned int num_kernels, do
1013#ifdef VANILLA_LINUX 1039#ifdef VANILLA_LINUX
1014 ++gpucount; 1040 ++gpucount;
1015#endif 1041#endif
1016 1042
1017 if(RECV_SIZE > 0) 1043 if(RECV_SIZE > 0)
1018 chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE, 1044 chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE,
1019 cudaMemcpyDeviceToHost, cur_gpu(), useEngineLocks()); 1045 cudaMemcpyDeviceToHost, cur_gpu(), useEngineLocks());
1020 1046
1021 if (MIGRATE_VIA_SYSMEM) 1047 if (MIGRATE_VIA_SYSMEM)
1022 PullState(); 1048 PullState();
1023 } 1049 }
1024 pool->put(mutex, cur_gpu() - GPU_OFFSET); 1050 pool->put(mutex, cur_gpu() - GPU_OFFSET);
1025 1051
1026 last_gpu() = cur_gpu(); 1052 last_gpu() = cur_gpu();
1027 1053
1028out: 1054out:
1029 return; 1055 return;
1030} 1056}
@@ -1131,15 +1157,20 @@ static int loop_once(void)
 
 static int loop_for(double exec_time, double emergency_exit)
 {
-    double last_loop = 0, loop_start;
     int tmp = 0;
+    double last_loop, loop_start;
+    double start, now;
+
+    if (exec_time <= 0.0)
+        goto out;
 
-    double start = cputime();
-    double now = cputime();
+    start = cputime();
+    now = cputime();
 
     if (emergency_exit && wctime() > emergency_exit)
         goto out;
 
+    last_loop = 0;
     while (now + last_loop < start + exec_time) {
         loop_start = now;
         tmp += loop_once();
@@ -1177,36 +1208,39 @@ static void debug_delay_loop(void)
1177 } 1208 }
1178} 1209}
1179 1210
1180static int gpu_job(double exec_time, double gpu_exec_time, unsigned int num_kernels, double program_end) 1211typedef bool (*gpu_job_t)(double exec_time, double gpu_exec_time, unsigned int num_kernels, double program_end);
1212typedef bool (*cpu_job_t)(double exec_time, double program_end);
1213
1214static bool gpu_job(double exec_time, double gpu_exec_time, unsigned int num_kernels, double program_end)
1181{ 1215{
1182 double chunk1, chunk2; 1216 double chunk1, chunk2;
1183 1217
1184 if (wctime() > program_end) { 1218 if (wctime() > program_end) {
1185 return 0; 1219 return false;
1186 } 1220 }
1187 else { 1221 else {
1188 chunk1 = exec_time * drand48(); 1222 chunk1 = exec_time * drand48();
1189 chunk2 = exec_time - chunk1; 1223 chunk2 = exec_time - chunk1;
1190 1224
1191 loop_for(chunk1, program_end + 1); 1225 loop_for(chunk1, program_end + 1);
1192 gpu_loop_for(gpu_exec_time, num_kernels, program_end + 1); 1226 gpu_loop_for(gpu_exec_time, num_kernels, program_end + 1);
1193 loop_for(chunk2, program_end + 1); 1227 loop_for(chunk2, program_end + 1);
1194 1228
1195 sleep_next_period(); 1229 sleep_next_period();
1196 } 1230 }
1197 return 1; 1231 return true;
1198} 1232}
1199 1233
1200static int job(double exec_time, double program_end) 1234static bool job(double exec_time, double program_end)
1201{ 1235{
1202 if (wctime() > program_end) { 1236 if (wctime() > program_end) {
1203 return 0; 1237 return false;
1204 } 1238 }
1205 else { 1239 else {
1206 loop_for(exec_time, program_end + 1); 1240 loop_for(exec_time, program_end + 1);
1207 sleep_next_period(); 1241 sleep_next_period();
1208 } 1242 }
1209 return 1; 1243 return true;
1210} 1244}
1211 1245
1212/*****************************/ 1246/*****************************/
@@ -1254,12 +1288,12 @@ static void init_linux()
1254 mlockall(MCL_CURRENT | MCL_FUTURE); 1288 mlockall(MCL_CURRENT | MCL_FUTURE);
1255} 1289}
1256 1290
1257static int gpu_job_linux(double exec_time, double gpu_exec_time, unsigned int num_kernels, double program_end) 1291static bool gpu_job_linux(double exec_time, double gpu_exec_time, unsigned int num_kernels, double program_end)
1258{ 1292{
1259 double chunk1, chunk2; 1293 double chunk1, chunk2;
1260 1294
1261 if (wctime() > program_end) { 1295 if (wctime() > program_end) {
1262 return 0; 1296 return false;
1263 } 1297 }
1264 else { 1298 else {
1265 chunk1 = exec_time * drand48(); 1299 chunk1 = exec_time * drand48();
@@ -1268,22 +1302,22 @@ static int gpu_job_linux(double exec_time, double gpu_exec_time, unsigned int nu
1268 loop_for(chunk1, program_end + 1); 1302 loop_for(chunk1, program_end + 1);
1269 gpu_loop_for_linux(gpu_exec_time, num_kernels, program_end + 1); 1303 gpu_loop_for_linux(gpu_exec_time, num_kernels, program_end + 1);
1270 loop_for(chunk2, program_end + 1); 1304 loop_for(chunk2, program_end + 1);
1271 1305
1272 sleep_next_period_linux(); 1306 sleep_next_period_linux();
1273 } 1307 }
1274 return 1; 1308 return true;
1275} 1309}
1276 1310
1277static int job_linux(double exec_time, double program_end) 1311static bool job_linux(double exec_time, double program_end)
1278{ 1312{
1279 if (wctime() > program_end) { 1313 if (wctime() > program_end) {
1280 return 0; 1314 return false;
1281 } 1315 }
1282 else { 1316 else {
1283 loop_for(exec_time, program_end + 1); 1317 loop_for(exec_time, program_end + 1);
1284 sleep_next_period_linux(); 1318 sleep_next_period_linux();
1285 } 1319 }
1286 return 1; 1320 return true;
1287} 1321}
1288 1322
1289/*****************************/ 1323/*****************************/
@@ -1296,7 +1330,7 @@ enum eScheduler
1296}; 1330};
1297 1331
1298#define CPU_OPTIONS "p:z:c:wlveio:f:s:q:X:L:Q:d:" 1332#define CPU_OPTIONS "p:z:c:wlveio:f:s:q:X:L:Q:d:"
1299#define GPU_OPTIONS "g:y:r:C:E:DG:xS:R:T:Z:aFm:b:MNIk:V" 1333#define GPU_OPTIONS "g:y:r:C:E:DG:xS:R:T:Z:aFm:b:MNIk:VW:"
1300 1334
1301// concat the option strings 1335// concat the option strings
1302#define OPTSTR CPU_OPTIONS GPU_OPTIONS 1336#define OPTSTR CPU_OPTIONS GPU_OPTIONS
@@ -1304,37 +1338,52 @@ int main(int argc, char** argv)
 int main(int argc, char** argv)
 {
     int ret;
+
+    struct rt_task param;
+
     lt_t wcet;
     lt_t period;
-    double wcet_ms = -1, gpu_wcet_ms = -1, period_ms = -1;
+    lt_t budget;
+    double wcet_ms = -1.0;
+    double gpu_wcet_ms = 0.0;
+    double period_ms = -1.0;
+    double budget_ms = -1.0;
+
+    unsigned int num_kernels = 1;
+
+    budget_drain_policy_t drain = DRAIN_SIMPLE;
+    bool want_enforcement = false;
+    bool want_signals = false;
+
     unsigned int priority = LITMUS_LOWEST_PRIORITY;
+
+    task_class_t cls = RT_CLASS_SOFT;
+
+    eScheduler scheduler = LITMUS;
+    int num_gpu_users = 0;
     int migrate = 0;
     int cluster = 0;
     int cluster_size = 1;
-    int opt;
+
+    Normal<double> *wcet_dist_ms = NULL;
+    float stdpct = 0.0;
+
+    cpu_job_t cjobfn = NULL;
+    gpu_job_t gjobfn = NULL;
+
     int wait = 0;
+    double scale = 1.0;
     int test_loop = 0;
-    int column = 1;
-    const char *file = NULL;
-    int want_enforcement = 0;
-    int want_signals = 0;
+
     double duration = 0, start = 0;
-    double *exec_times = NULL;
-    double scale = 1.0;
-    task_class_t cls = RT_CLASS_SOFT;
     int cur_job = 0, num_jobs = 0;
-    struct rt_task param;
+    int column = 1;
 
-    double budget_ms = -1.0;
-    lt_t budget;
-
-    int num_gpu_users = 0;
-    unsigned int num_kernels = 1;
+    int opt;
+
+    double *exec_times = NULL;
+    const char *file = NULL;
 
-    budget_drain_policy_t drain = DRAIN_SIMPLE;
-
-    eScheduler scheduler = LITMUS;
-
     /* locking */
//     int lock_od = -1;
//     int resource_id = 0;
@@ -1414,7 +1463,7 @@ int main(int argc, char** argv)
1414 MIGRATE_VIA_SYSMEM = true; 1463 MIGRATE_VIA_SYSMEM = true;
1415 break; 1464 break;
1416 case 'm': 1465 case 'm':
1417 num_gpu_users = atoi(optarg); 1466 num_gpu_users = (int)atoi(optarg);
1418 assert(num_gpu_users > 0); 1467 assert(num_gpu_users > 0);
1419 break; 1468 break;
1420 case 'k': 1469 case 'k':
@@ -1423,6 +1472,9 @@ int main(int argc, char** argv)
1423 case 'b': 1472 case 'b':
1424 budget_ms = atoi(optarg); 1473 budget_ms = atoi(optarg);
1425 break; 1474 break;
1475 case 'W':
1476 stdpct = atof(optarg);
1477 break;
1426 case 'N': 1478 case 'N':
1427 scheduler = LINUX; 1479 scheduler = LINUX;
1428 break; 1480 break;
@@ -1438,10 +1490,10 @@ int main(int argc, char** argv)
1438 usage("Unknown task class."); 1490 usage("Unknown task class.");
1439 break; 1491 break;
1440 case 'e': 1492 case 'e':
1441 want_enforcement = 1; 1493 want_enforcement = true;
1442 break; 1494 break;
1443 case 'i': 1495 case 'i':
1444 want_signals = 1; 1496 want_signals = true;
1445 break; 1497 break;
1446 case 'd': 1498 case 'd':
1447 drain = (budget_drain_policy_t)atoi(optarg); 1499 drain = (budget_drain_policy_t)atoi(optarg);
@@ -1489,27 +1541,34 @@ int main(int argc, char** argv)
1489 assert(scheduler != LITMUS); 1541 assert(scheduler != LITMUS);
1490 assert(!wait); 1542 assert(!wait);
1491#endif 1543#endif
1492 1544
1545 assert(stdpct >= 0.0);
1546
1547 if (MIGRATE_VIA_SYSMEM)
1548 assert(GPU_PARTITION_SIZE != 1);
1549
1493 // turn off some features to be safe 1550 // turn off some features to be safe
1494 if (scheduler != LITMUS) 1551 if (scheduler != LITMUS)
1495 { 1552 {
1496 RHO = 0; 1553 RHO = 0;
1497 USE_ENGINE_LOCKS = false; 1554 USE_ENGINE_LOCKS = false;
1498 USE_DYNAMIC_GROUP_LOCKS = false; 1555 USE_DYNAMIC_GROUP_LOCKS = false;
1499 ENABLE_AFFINITY = false;
1500 RELAX_FIFO_MAX_LEN = false; 1556 RELAX_FIFO_MAX_LEN = false;
1501 ENABLE_RT_AUX_THREADS = false; 1557 ENABLE_RT_AUX_THREADS = false;
1502 budget_ms = -1.0; 1558 budget_ms = -1.0;
1503 want_enforcement = 0; 1559 want_enforcement = false;
1504 want_signals = 0; 1560 want_signals = false;
1505 1561
1562 cjobfn = job_linux;
1563 gjobfn = gpu_job_linux;
1564
1506 if (scheduler == RT_LINUX) 1565 if (scheduler == RT_LINUX)
1507 { 1566 {
1508 struct sched_param fifoparams; 1567 struct sched_param fifoparams;
1509 1568
1510 assert(priority >= sched_get_priority_min(SCHED_FIFO) && 1569 assert(priority >= sched_get_priority_min(SCHED_FIFO) &&
1511 priority <= sched_get_priority_max(SCHED_FIFO)); 1570 priority <= sched_get_priority_max(SCHED_FIFO));
1512 1571
1513 memset(&fifoparams, 0, sizeof(fifoparams)); 1572 memset(&fifoparams, 0, sizeof(fifoparams));
1514 fifoparams.sched_priority = priority; 1573 fifoparams.sched_priority = priority;
1515 assert(0 == sched_setscheduler(getpid(), SCHED_FIFO, &fifoparams)); 1574 assert(0 == sched_setscheduler(getpid(), SCHED_FIFO, &fifoparams));
@@ -1517,16 +1576,19 @@ int main(int argc, char** argv)
1517 } 1576 }
1518 else 1577 else
1519 { 1578 {
1579 cjobfn = job;
1580 gjobfn = gpu_job;
1581
1520 if (!litmus_is_valid_fixed_prio(priority)) 1582 if (!litmus_is_valid_fixed_prio(priority))
1521 usage("Invalid priority."); 1583 usage("Invalid priority.");
1522 } 1584 }
1523 1585
1524 if (test_loop) { 1586 if (test_loop) {
1525 debug_delay_loop(); 1587 debug_delay_loop();
1526 return 0; 1588 return 0;
1527 } 1589 }
1528 1590
1529 srand(getpid()); 1591 srand(time(0));
1530 1592
1531 if (file) { 1593 if (file) {
1532 get_exec_times(file, column, &num_jobs, &exec_times); 1594 get_exec_times(file, column, &num_jobs, &exec_times);
@@ -1548,7 +1610,7 @@ int main(int argc, char** argv)
1548 } 1610 }
1549 1611
1550 if (argc - optind == 3) { 1612 if (argc - optind == 3) {
1551 assert(!GPU_USING); 1613 assert(!GPU_USING);
1552 wcet_ms = atof(argv[optind + 0]); 1614 wcet_ms = atof(argv[optind + 0]);
1553 period_ms = atof(argv[optind + 1]); 1615 period_ms = atof(argv[optind + 1]);
1554 duration = atof(argv[optind + 2]); 1616 duration = atof(argv[optind + 2]);
@@ -1560,7 +1622,7 @@ int main(int argc, char** argv)
1560 period_ms = atof(argv[optind + 2]); 1622 period_ms = atof(argv[optind + 2]);
1561 duration = atof(argv[optind + 3]); 1623 duration = atof(argv[optind + 3]);
1562 } 1624 }
1563 1625
1564 wcet = ms2ns(wcet_ms); 1626 wcet = ms2ns(wcet_ms);
1565 period = ms2ns(period_ms); 1627 period = ms2ns(period_ms);
1566 if (wcet <= 0) 1628 if (wcet <= 0)
@@ -1579,7 +1641,29 @@ int main(int argc, char** argv)
1579 budget = ms2ns(budget_ms); 1641 budget = ms2ns(budget_ms);
1580 else 1642 else
1581 budget = wcet; 1643 budget = wcet;
1582 1644
1645#if 0
1646 // use upscale to determine breakdown utilization
1647 // only scaling up CPU time for now.
1648 double upscale = (double)period/(double)budget - 1.0;
1649 upscale = std::min(std::max(0.0, upscale), 0.6); // at most 30%
1650 wcet = wcet + wcet*upscale;
1651 budget = budget + wcet*upscale;
1652 wcet_ms = wcet_ms + wcet_ms*upscale;
1653
1654 // fucking floating point
1655 if (budget < wcet)
1656 budget = wcet;
1657 if (budget > period)
1658 budget = period;
1659#endif
1660
1661 // randomize execution time according to a normal distribution
1662 // centered around the desired execution time.
1663 // standard deviation is a percentage of this average
1664 wcet_dist_ms = new Normal<double>(wcet_ms + gpu_wcet_ms, (wcet_ms + gpu_wcet_ms) * stdpct);
1665 wcet_dist_ms->seed((unsigned int)time(0));
1666
1583 if (file && num_jobs > 1) 1667 if (file && num_jobs > 1)
1584 duration += period_ms * 0.001 * (num_jobs - 1); 1668 duration += period_ms * 0.001 * (num_jobs - 1);
1585 1669
@@ -1588,7 +1672,7 @@ int main(int argc, char** argv)
1588 if (ret < 0) 1672 if (ret < 0)
1589 bail_out("could not migrate to target partition or cluster."); 1673 bail_out("could not migrate to target partition or cluster.");
1590 } 1674 }
1591 1675
1592 if (scheduler != LITMUS) 1676 if (scheduler != LITMUS)
1593 { 1677 {
1594 // set some variables needed by linux modes 1678 // set some variables needed by linux modes
@@ -1612,17 +1696,19 @@ int main(int argc, char** argv)
1612 PRECISE_SIGNALS : NO_SIGNALS; 1696 PRECISE_SIGNALS : NO_SIGNALS;
1613 param.drain_policy = drain; 1697 param.drain_policy = drain;
1614 param.release_policy = PERIODIC; 1698 param.release_policy = PERIODIC;
1615 1699
1616 if (migrate) 1700 if (migrate)
1617 param.cpu = cluster_to_first_cpu(cluster, cluster_size); 1701 param.cpu = cluster_to_first_cpu(cluster, cluster_size);
1618 ret = set_rt_task_param(gettid(), &param); 1702 ret = set_rt_task_param(gettid(), &param);
1619 if (ret < 0) 1703 if (ret < 0)
1620 bail_out("could not setup rt task params"); 1704 bail_out("could not setup rt task params");
1621 1705
1622 if (scheduler == LITMUS) 1706 if (scheduler == LITMUS) {
1623 init_litmus(); 1707 init_litmus();
1624 else 1708 }
1709 else {
1625 init_linux(); 1710 init_linux();
1711 }
1626 1712
1627 if (want_signals) { 1713 if (want_signals) {
1628 /* bind default longjmp signal handler to SIG_BUDGET. */ 1714 /* bind default longjmp signal handler to SIG_BUDGET. */
@@ -1640,16 +1726,16 @@ int main(int argc, char** argv)
1640 1726
1641 if (GPU_USING) { 1727 if (GPU_USING) {
1642 allocate_locks(num_gpu_users, scheduler != LITMUS); 1728 allocate_locks(num_gpu_users, scheduler != LITMUS);
1643 1729
1644 signal(SIGABRT, catch_exit); 1730 signal(SIGABRT, catch_exit);
1645 signal(SIGTERM, catch_exit); 1731 signal(SIGTERM, catch_exit);
1646 signal(SIGQUIT, catch_exit); 1732 signal(SIGQUIT, catch_exit);
1647 signal(SIGSEGV, catch_exit); 1733 signal(SIGSEGV, catch_exit);
1648 1734
1649 init_cuda(num_gpu_users); 1735 init_cuda(num_gpu_users);
1650 safetynet = true; 1736 safetynet = true;
1651 } 1737 }
1652 1738
1653 if (scheduler == LITMUS) 1739 if (scheduler == LITMUS)
1654 { 1740 {
1655 ret = task_mode(LITMUS_RT_TASK); 1741 ret = task_mode(LITMUS_RT_TASK);
@@ -1666,7 +1752,7 @@ int main(int argc, char** argv)
1666 ret = wait_for_ts_release2(&releaseTime); 1752 ret = wait_for_ts_release2(&releaseTime);
1667 if (ret != 0) 1753 if (ret != 0)
1668 bail_out("wait_for_ts_release2()"); 1754 bail_out("wait_for_ts_release2()");
1669 1755
1670 if (scheduler != LITMUS) 1756 if (scheduler != LITMUS)
1671 log_release(); 1757 log_release();
1672 } 1758 }
@@ -1683,35 +1769,38 @@ int main(int argc, char** argv)
 
     start = wctime();
 
-    if (scheduler == LITMUS)
-    {
-        if (!GPU_USING) {
-            while (job(wcet_ms * 0.001 * scale, start + duration));
-        }
-        else {
-            while (gpu_job(wcet_ms * 0.001 * scale,
-                        gpu_wcet_ms * 0.001 * scale,
-                        num_kernels,
-                        start + duration));
-        }
+    if (!GPU_USING) {
+        bool keepgoing;
+        do
+        {
+            double job_ms = wcet_dist_ms->random();
+            if (job_ms < 0.0)
+                job_ms = 0.0;
+            keepgoing = cjobfn(ms2s(job_ms * scale), start + duration);
+        }while(keepgoing);
     }
-    else
-    {
-        if (!GPU_USING) {
-            while (job_linux(wcet_ms * 0.001 * scale, start + duration));
-        }
-        else {
-            while (gpu_job_linux(wcet_ms * 0.001 * scale,
-                        gpu_wcet_ms * 0.001 * scale,
-                        num_kernels,
-                        start + duration));
-        }
+    else {
+        bool keepgoing;
+        do
+        {
+            double job_ms = wcet_dist_ms->random();
+            if (job_ms < 0.0)
+                job_ms = 0.0;
+
+            double cpu_job_ms = (job_ms/(wcet_ms + gpu_wcet_ms))*wcet_ms;
+            double gpu_job_ms = (job_ms/(wcet_ms + gpu_wcet_ms))*gpu_wcet_ms;
+            keepgoing = gjobfn(
+                            ms2s(cpu_job_ms * scale),
+                            ms2s(gpu_job_ms * scale),
+                            num_kernels,
+                            start + duration);
+        }while(keepgoing);
     }
 
     if (GPU_USING && ENABLE_RT_AUX_THREADS)
         if (disable_aux_rt_tasks(AUX_CURRENT | AUX_FUTURE) != 0)
            bail_out("disable_aux_rt_tasks() failed");
 
//     if (file) {
//         /* use times read from the CSV file */
//         for (cur_job = 0; cur_job < num_jobs; ++cur_job) {
@@ -1740,7 +1829,10 @@ int main(int argc, char** argv)
 
//     printf("avg: %f\n", ms_sum/gpucount);
     }
 
+    if (wcet_dist_ms)
+        delete wcet_dist_ms;
+
     if (file)
         free(exec_times);
 
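
Note how the sampled time is used for GPU tasks: one draw from wcet_dist_ms covers the whole job, and it is split between the CPU and GPU phases in the same ratio as the configured wcet_ms : gpu_wcet_ms, so the two shares always add back up to the sample. A tiny self-contained check of that arithmetic (values are made up):

    #include <cassert>
    #include <cmath>

    int main()
    {
        double wcet_ms = 10.0, gpu_wcet_ms = 5.0;   // configured means (illustrative)
        double job_ms = 18.0;                       // one sample from wcet_dist_ms
        double cpu_job_ms = (job_ms/(wcet_ms + gpu_wcet_ms))*wcet_ms;       // 12 ms
        double gpu_job_ms = (job_ms/(wcet_ms + gpu_wcet_ms))*gpu_wcet_ms;   //  6 ms
        assert(std::fabs(cpu_job_ms + gpu_job_ms - job_ms) < 1e-9);
        return 0;
    }
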
diff --git a/src/migration.c b/src/migration.c
index 7ac320e..084b68c 100644
--- a/src/migration.c
+++ b/src/migration.c
@@ -66,6 +66,7 @@ int cluster_to_first_cpu(int cluster, int cluster_sz)
 static int setup_numa(pid_t tid, int sz, const cpu_set_t *cpus)
 {
     int nr_nodes;
+    int nr_cpus = num_online_cpus();
     struct bitmask* new_nodes;
     struct bitmask* old_nodes;
     int i;
@@ -78,7 +79,7 @@ static int setup_numa(pid_t tid, int sz, const cpu_set_t *cpus)
     new_nodes = numa_bitmask_alloc(nr_nodes);
     old_nodes = numa_bitmask_alloc(nr_nodes);
     /* map the cpu mask to a numa mask */
-    for (i = 0; i < sz; ++i) {
+    for (i = 0; i < nr_cpus; ++i) {
         if(CPU_ISSET_S(i, sz, cpus)) {
             numa_bitmask_setbit(new_nodes, numa_node_of_cpu(i));
         }
@@ -124,6 +125,7 @@ int be_migrate_thread_to_cpu(pid_t tid, int target_cpu)
 
     cpu_set = CPU_ALLOC(num_cpus);
     sz = CPU_ALLOC_SIZE(num_cpus);
+
     CPU_ZERO_S(sz, cpu_set);
     CPU_SET_S(target_cpu, sz, cpu_set);
 
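
The migration.c fix addresses the loop that translates a CPU affinity mask into a NUMA node mask: sz is the CPU_ALLOC_SIZE() byte count of the dynamically allocated cpu_set_t, not a CPU count, so using it as the loop bound scans the wrong range of CPU indices; the loop now runs over num_online_cpus(). A short sketch of the corrected mapping under that reading (libnuma calls as in the diff; the node-count expression is an assumption, the real code reuses its existing nr_nodes):

    /* cpus: cpu_set_t built with CPU_ALLOC(); sz: CPU_ALLOC_SIZE() in bytes */
    int nr_cpus = num_online_cpus();
    struct bitmask *new_nodes = numa_bitmask_alloc(numa_max_node() + 1);

    for (int i = 0; i < nr_cpus; ++i) {
        /* sz is only ever passed as the set-size argument, never as a bound */
        if (CPU_ISSET_S(i, sz, cpus))
            numa_bitmask_setbit(new_nodes, numa_node_of_cpu(i));
    }
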