From 95e840f68892d46289120d1042ee36f9eaf41de7 Mon Sep 17 00:00:00 2001 From: Glenn Elliott <gelliott@cs.unc.edu> Date: Mon, 6 May 2013 18:57:37 -0400 Subject: several new *important* features 1) gpusync daemon mode. 2) engine yield logic 3) fixed chunking (did not work on memcpys > 2 chunks) --- Makefile | 8 +- gpu/dgl.c | 1 + gpu/gpuspin.cu | 1919 +++++++++++++++++++++++++++++++++++----------------- include/litmus.h | 2 + src/kernel_iface.c | 2 +- src/syscalls.c | 10 + 6 files changed, 1307 insertions(+), 635 deletions(-) diff --git a/Makefile b/Makefile index 831c16b..e877ca4 100644 --- a/Makefile +++ b/Makefile @@ -25,12 +25,15 @@ NUMA_SUPPORT = dummyval # compiler flags flags-debug = -O2 -Wall -Werror -g -Wdeclaration-after-statement +#flags-debug = -Wall -Werror -g -Wdeclaration-after-statement flags-debug-cpp = -O2 -Wall -Werror -g +#flags-debug-cpp = -Wall -Werror -g flags-api = -D_XOPEN_SOURCE=600 -D_GNU_SOURCE flags-misc = -fasynchronous-unwind-tables -fnon-call-exceptions flags-cu-debug = -g -G -Xcompiler -Wall -Xcompiler -Werror flags-cu-optim = -O2 -Xcompiler -march=native +#flags-cu-optim = -Xcompiler -march=native flags-cu-nvcc = --use_fast_math -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 flags-cu-misc = -Xcompiler -fasynchronous-unwind-tables -Xcompiler -fnon-call-exceptions -Xcompiler -malign-double -Xcompiler -pthread flags-cu-x86_64 = -m64 @@ -63,7 +66,6 @@ headers = -I${LIBLITMUS}/include -I${LIBLITMUS}/arch/${include-${ARCH}}/include # combine options CPPFLAGS = ${flags-api} ${flags-debug-cpp} ${flags-misc} ${flags-${ARCH}} -DARCH=${ARCH} ${headers} CUFLAGS = ${flags-api} ${flags-cu-debug} ${flags-cu-optim} ${flags-cu-nvcc} ${flags-cu-misc} -DARCH=${ARCH} ${headers} -#CUFLAGS = ${flags-api} ${flags-cu-optim} ${flags-cu-nvcc} ${flags-cu-misc} -DARCH=${ARCH} ${headers} CFLAGS = ${flags-debug} ${flags-misc} LDFLAGS = ${flags-${ARCH}} @@ -82,7 +84,7 @@ endif # how to link cuda cuda-flags-i386 = -L/usr/local/cuda/lib cuda-flags-x86_64 = -L/usr/local/cuda/lib64 -cuda-flags = ${cuda-flags-${ARCH}} -lcudart +cuda-flags = ${cuda-flags-${ARCH}} -lcudart -lcuda # Force gcc instead of cc, but let the user specify a more specific version if # desired. @@ -299,7 +301,7 @@ lib-budget = -lrt -lm -pthread vpath %.cu gpu/ objcu-gpuspin = gpuspin.o common.o -lib-gpuspin = -lblitz -lrt -lm -lpthread +lib-gpuspin = -lblitz -lrt -lm -lpthread -lboost_filesystem -lboost_system # ############################################################################## # Build everything that depends on liblitmus. diff --git a/gpu/dgl.c b/gpu/dgl.c index 3029255..c40fec6 100644 --- a/gpu/dgl.c +++ b/gpu/dgl.c @@ -229,6 +229,7 @@ void* rt_thread(void* _ctx) do_exit = job(ctx); + fprintf(stdout, "[%d] should yield dgl: %d.\n", ctx->id, litmus_dgl_should_yield_lock(dgl, dgl_size)); xfprintf(stdout, "[%d] unlocking dgl.\n", ctx->id); litmus_dgl_unlock(dgl, dgl_size); diff --git a/gpu/gpuspin.cu b/gpu/gpuspin.cu index 21134f6..f361b86 100644 --- a/gpu/gpuspin.cu +++ b/gpu/gpuspin.cu @@ -10,9 +10,11 @@ #include <boost/interprocess/managed_shared_memory.hpp> #include <boost/interprocess/sync/interprocess_mutex.hpp> +#include <boost/filesystem.hpp> #include <random/normal.h> +#include <cuda.h> #include <cuda_runtime.h> #include "litmus.h" @@ -24,7 +26,21 @@ using namespace ranlib; #define ms2s(ms) ((ms)*0.001) +bool SILENT = true; +inline int xprintf(const char *format, ...) 
+{ + int ret = 0; + if (!SILENT) { + va_list args; + va_start(args, format); + ret = vprintf(format, args); + va_end(args); + } + return ret; +} + const char *lock_namespace = "./.gpuspin-locks"; +const size_t PAGE_SIZE = sysconf(_SC_PAGESIZE); const int NR_GPUS = 8; @@ -34,6 +50,8 @@ bool RELAX_FIFO_MAX_LEN = false; bool ENABLE_CHUNKING = false; bool MIGRATE_VIA_SYSMEM = false; +bool YIELD_LOCKS = false; + enum eEngineLockTypes { FIFO, @@ -97,15 +115,82 @@ int CUR_DEVICE = -1; int LAST_DEVICE = -1; cudaStream_t STREAMS[NR_GPUS]; +cudaEvent_t EVENTS[NR_GPUS]; int GPU_HZ[NR_GPUS]; int NUM_SM[NR_GPUS]; int WARP_SIZE[NR_GPUS]; int ELEM_PER_THREAD[NR_GPUS]; +enum eScheduler +{ + LITMUS, + LINUX, + RT_LINUX +}; + +struct Args +{ + bool wait; + bool migrate; + int cluster; + int cluster_size; + bool gpu_using; + int gpu_partition; + int gpu_partition_size; + int rho; + int num_ce; + bool reserve_migr_ce; + bool use_engine_locks; + eEngineLockTypes engine_lock_type; + bool yield_locks; + bool use_dgls; + eGpuSyncMode gpusync_mode; + bool enable_affinity; + int relax_fifo_len; + eCudaSyncMode sync_mode; + size_t send_size; + size_t recv_size; + size_t state_size; + bool enable_chunking; + size_t chunk_size; + bool use_sysmem_migration; + int num_kernels; + + double wcet_ms; + double gpu_wcet_ms; + double period_ms; + + double budget_ms; + + double stddev; + + eScheduler scheduler; + + unsigned int priority; + + task_class_t cls; + + bool want_enforcement; + bool want_signals; + budget_drain_policy_t drain_policy; + + int column; + + int num_gpu_tasks; + int num_tasks; + + double scale; + + double duration; +}; + + + #define DEFINE_PER_GPU(type, var) type var[NR_GPUS] #define per_gpu(var, idx) (var[(idx)]) #define this_gpu(var) (var[(CUR_DEVICE)]) #define cur_stream() (this_gpu(STREAMS)) +#define cur_event() (this_gpu(EVENTS)) #define cur_gpu() (CUR_DEVICE) #define last_gpu() (LAST_DEVICE) #define cur_ee() (EE_LOCKS[CUR_DEVICE]) @@ -208,9 +293,10 @@ struct ce_lock_state void lock() { if(locks[0] == locks[1]) crash(); - if(USE_DYNAMIC_GROUP_LOCKS) { + if (num_locks == 1) + litmus_lock(locks[0]); + else if(USE_DYNAMIC_GROUP_LOCKS) litmus_dgl_lock(locks, num_locks); - } else { for(int l = 0; l < num_locks; ++l) @@ -224,9 +310,10 @@ struct ce_lock_state void unlock() { if(locks[0] == locks[1]) crash(); - if(USE_DYNAMIC_GROUP_LOCKS) { + if (num_locks == 1) + litmus_unlock(locks[0]); + else if(USE_DYNAMIC_GROUP_LOCKS) litmus_dgl_unlock(locks, num_locks); - } else { // reverse order @@ -238,6 +325,21 @@ struct ce_lock_state locked = false; } + bool should_yield() { + int yield = 1; // assume we should yield + if (YIELD_LOCKS) { + if(locks[0] == locks[1]) crash(); + if (num_locks == 1) + yield = litmus_should_yield_lock(locks[0]); + else if(USE_DYNAMIC_GROUP_LOCKS) + yield = litmus_dgl_should_yield_lock(locks, num_locks); + else + for(int l = num_locks - 1; l >= 0; --l) // reverse order + yield = litmus_should_yield_lock(locks[l]); + } + return (yield); + } + void refresh() { budget_remaining = CHUNK_SIZE; } @@ -271,34 +373,37 @@ static cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count, { int bytesToCopy = std::min(remaining, chunk_size); - if(state && state->budgetIsAvailable(bytesToCopy) && state->locked) { - cudaStreamSynchronize(STREAMS[CUR_DEVICE]); - ret = cudaGetLastError(); - - if(ret != cudaSuccess) - { - break; + if (state && state->locked) { + // we have to unlock/re-lock the copy engine to refresh our budget unless + // we still have budget available. 
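+			// When YIELD_LOCKS (-u) is set, should_yield() asks the kernel
+			// (litmus_should_yield_lock()) whether anyone else is waiting on
+			// the engine, and we only pay for the event sync and the
+			// unlock/re-lock when someone is; without -u, should_yield()
+			// defaults to true and the engine is released at every chunk
+			// boundary once the budget runs out.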
+ if (!state->budgetIsAvailable(bytesToCopy)) { + // optimization - don't unlock if no one else needs the engine + if (state->should_yield()) { + //cudaStreamSynchronize(STREAMS[CUR_DEVICE]); + cudaEventSynchronize(EVENTS[CUR_DEVICE]); + ret = cudaGetLastError(); + state->unlock(); + if(ret != cudaSuccess) + break; + } + // we can only run out of + // budget if chunking is enabled. + // we presume that init budget would + // be set to cover entire memcpy + // if chunking were disabled. + state->refresh(); } - - state->unlock(); - state->refresh(); // replentish. - // we can only run out of - // budget if chunking is enabled. - // we presume that init budget would - // be set to cover entire memcpy - // if chunking were disabled. } - if(state && !state->locked) { + if(state && !state->locked) state->lock(); - } //ret = cudaMemcpy(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind); cudaMemcpyAsync(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind, STREAMS[CUR_DEVICE]); + cudaEventRecord(EVENTS[CUR_DEVICE], STREAMS[CUR_DEVICE]); - if(state) { + if(state) state->decreaseBudget(bytesToCopy); - } ++i; remaining -= bytesToCopy; @@ -316,7 +421,8 @@ static cudaError_t chunkMemcpy(void* a_dst, const void* a_src, size_t count, cudaError_t ret; if(!do_locking || device_a == -1) { ret = __chunkMemcpy(a_dst, a_src, count, kind, NULL); - cudaStreamSynchronize(cur_stream()); + cudaEventSynchronize(cur_event()); +// cudaStreamSynchronize(cur_stream()); if(ret == cudaSuccess) ret = cudaGetLastError(); } @@ -324,7 +430,8 @@ static cudaError_t chunkMemcpy(void* a_dst, const void* a_src, size_t count, ce_lock_state state(device_a, kind, count, device_b, migration); state.lock(); ret = __chunkMemcpy(a_dst, a_src, count, kind, &state); - cudaStreamSynchronize(cur_stream()); + cudaEventSynchronize(cur_event()); + // cudaStreamSynchronize(cur_stream()); if(ret == cudaSuccess) ret = cudaGetLastError(); state.unlock(); @@ -332,17 +439,26 @@ static cudaError_t chunkMemcpy(void* a_dst, const void* a_src, size_t count, return ret; } +int LITMUS_LOCK_FD = 0; + +int EXP_OFFSET = 0; void allocate_locks_litmus(void) { + stringstream ss; + ss<<lock_namespace<<"-"<<EXP_OFFSET; + // allocate k-FMLP lock - int fd = open(lock_namespace, O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR); + //LITMUS_LOCK_FD = open(lock_namespace, O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR); + LITMUS_LOCK_FD = open(ss.str().c_str(), O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR); + int *fd = &LITMUS_LOCK_FD; - int base_name = GPU_PARTITION * 1000; + int base_name = GPU_PARTITION * 100 + EXP_OFFSET * 200; + ++EXP_OFFSET; if (GPU_SYNC_MODE == IKGLP_MODE) { /* Standard (optimal) IKGLP */ - TOKEN_LOCK = open_gpusync_token_lock(fd, + TOKEN_LOCK = open_gpusync_token_lock(*fd, base_name, /* name */ GPU_PARTITION_SIZE, GPU_PARTITION*GPU_PARTITION_SIZE, @@ -355,7 +471,7 @@ void allocate_locks_litmus(void) } else if (GPU_SYNC_MODE == KFMLP_MODE) { /* KFMLP. FIFO queues only for tokens. */ - TOKEN_LOCK = open_gpusync_token_lock(fd, + TOKEN_LOCK = open_gpusync_token_lock(*fd, base_name, /* name */ GPU_PARTITION_SIZE, GPU_PARTITION*GPU_PARTITION_SIZE, @@ -366,7 +482,7 @@ void allocate_locks_litmus(void) } else if (GPU_SYNC_MODE == RGEM_MODE) { /* RGEM-like token allocation. Shared priority queue for all tokens. */ - TOKEN_LOCK = open_gpusync_token_lock(fd, + TOKEN_LOCK = open_gpusync_token_lock(*fd, base_name, /* name */ GPU_PARTITION_SIZE, GPU_PARTITION*GPU_PARTITION_SIZE, @@ -380,7 +496,7 @@ void allocate_locks_litmus(void) * token requests. 
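 * (Illustrative numbers: with CPU_PARTITION_SIZE = 12, RHO = 2, and
 * GPU_PARTITION_SIZE = 2, max_simult_run = max(12, 2*2) = 12 and
 * max_fifo_len = ceil(12/4) = 3.)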
*/ int max_simult_run = std::max(CPU_PARTITION_SIZE, RHO*GPU_PARTITION_SIZE); int max_fifo_len = (int)ceil((float)max_simult_run / (RHO*GPU_PARTITION_SIZE)); - TOKEN_LOCK = open_gpusync_token_lock(fd, + TOKEN_LOCK = open_gpusync_token_lock(*fd, base_name, /* name */ GPU_PARTITION_SIZE, GPU_PARTITION*GPU_PARTITION_SIZE, @@ -416,17 +532,17 @@ void allocate_locks_litmus(void) open_sem_t openEngineLock = (ENGINE_LOCK_TYPE == FIFO) ? open_fifo_sem : open_prioq_sem; - ee_lock = openEngineLock(fd, ee_name); + ee_lock = openEngineLock(*fd, ee_name); if (ee_lock < 0) perror("open_*_sem (engine lock)"); - ce_0_lock = openEngineLock(fd, ce_0_name); + ce_0_lock = openEngineLock(*fd, ce_0_name); if (ce_0_lock < 0) perror("open_*_sem (engine lock)"); if (NUM_COPY_ENGINES == 2) { - ce_1_lock = openEngineLock(fd, ce_1_name); + ce_1_lock = openEngineLock(*fd, ce_1_name); if (ce_1_lock < 0) perror("open_*_sem (engine lock)"); } @@ -464,7 +580,41 @@ void allocate_locks_litmus(void) } } +void deallocate_locks_litmus(void) +{ + for (int i = 0; i < GPU_PARTITION_SIZE; ++i) + { + int idx = GPU_PARTITION*GPU_PARTITION_SIZE + i; + + od_close(EE_LOCKS[idx]); + if (NUM_COPY_ENGINES == 1) + { + od_close(CE_SEND_LOCKS[idx]); + } + else + { + if (RESERVED_MIGR_COPY_ENGINE) { + od_close(CE_SEND_LOCKS[idx]); + od_close(CE_MIGR_SEND_LOCKS[idx]); + } + else { + od_close(CE_SEND_LOCKS[idx]); + od_close(CE_RECV_LOCKS[idx]); + } + } + } + + od_close(TOKEN_LOCK); + close(LITMUS_LOCK_FD); + + memset(&CE_SEND_LOCKS[0], 0, sizeof(CE_SEND_LOCKS)); + memset(&CE_RECV_LOCKS[0], 0, sizeof(CE_RECV_LOCKS)); + memset(&CE_MIGR_SEND_LOCKS[0], 0, sizeof(CE_MIGR_SEND_LOCKS)); + memset(&CE_MIGR_RECV_LOCKS[0], 0, sizeof(CE_MIGR_RECV_LOCKS)); + TOKEN_LOCK = -1; + LITMUS_LOCK_FD = 0; +} class gpu_pool @@ -478,10 +628,9 @@ public: int get(pthread_mutex_t* tex, int preference = -1) { int which = -1; - // int last = (preference >= 0) ? preference : 0; int last = (ENABLE_AFFINITY) ? - (preference >= 0) ? preference : 0 : - rand()%poolSize; + ((preference >= 0) ? 
preference : 0) : + (rand()%poolSize); int minIdx = last; pthread_mutex_lock(tex); @@ -513,24 +662,22 @@ private: int pool[NR_GPUS]; // >= gpu_part_size }; + +static managed_shared_memory *linux_lock_segment_ptr = NULL; static gpu_pool* GPU_LINUX_SEM_POOL = NULL; static pthread_mutex_t* GPU_LINUX_MUTEX_POOL = NULL; static void allocate_locks_linux(const int num_gpu_users) { - managed_shared_memory *segment_pool_ptr = NULL; - managed_shared_memory *segment_mutex_ptr = NULL; - int numGpuPartitions = NR_GPUS/GPU_PARTITION_SIZE; if(num_gpu_users > 0) { - printf("%d creating shared memory for linux semaphores; num pools = %d, pool size = %d\n", getpid(), numGpuPartitions, GPU_PARTITION_SIZE); - shared_memory_object::remove("linux_mutex_memory"); - shared_memory_object::remove("linux_sem_memory"); + xprintf("%d: creating linux locks\n", getpid()); + shared_memory_object::remove("linux_lock_memory"); - segment_mutex_ptr = new managed_shared_memory(create_only, "linux_mutex_memory", 4*1024); - GPU_LINUX_MUTEX_POOL = segment_mutex_ptr->construct<pthread_mutex_t>("pthread_mutex_t linux_m")[numGpuPartitions](); + linux_lock_segment_ptr = new managed_shared_memory(create_only, "linux_lock_memory", 30*PAGE_SIZE); + GPU_LINUX_MUTEX_POOL = linux_lock_segment_ptr->construct<pthread_mutex_t>("pthread_mutex_t linux_m")[numGpuPartitions](); for(int i = 0; i < numGpuPartitions; ++i) { pthread_mutexattr_t attr; @@ -539,41 +686,41 @@ static void allocate_locks_linux(const int num_gpu_users) pthread_mutex_init(&(GPU_LINUX_MUTEX_POOL[i]), &attr); pthread_mutexattr_destroy(&attr); } - - segment_pool_ptr = new managed_shared_memory(create_only, "linux_sem_memory", 4*1024); - GPU_LINUX_SEM_POOL = segment_pool_ptr->construct<gpu_pool>("gpu_pool linux_p")[numGpuPartitions](GPU_PARTITION_SIZE); + GPU_LINUX_SEM_POOL = linux_lock_segment_ptr->construct<gpu_pool>("gpu_pool linux_p")[numGpuPartitions](GPU_PARTITION_SIZE); } else { + sleep(5); do { try { - if (!segment_pool_ptr) segment_pool_ptr = new managed_shared_memory(open_only, "linux_sem_memory"); - } - catch(...) - { - sleep(1); - } - }while(segment_pool_ptr == NULL); - - do - { - try - { - if (!segment_mutex_ptr) segment_mutex_ptr = new managed_shared_memory(open_only, "linux_mutex_memory"); + if (!linux_lock_segment_ptr) + linux_lock_segment_ptr = new managed_shared_memory(open_only, "linux_lock_memory"); } catch(...) 
{ sleep(1); } - }while(segment_mutex_ptr == NULL); + }while(linux_lock_segment_ptr == NULL); - GPU_LINUX_SEM_POOL = segment_pool_ptr->find<gpu_pool>("gpu_pool linux_p").first; - GPU_LINUX_MUTEX_POOL = segment_mutex_ptr->find<pthread_mutex_t>("pthread_mutex_t linux_m").first; + GPU_LINUX_MUTEX_POOL = linux_lock_segment_ptr->find<pthread_mutex_t>("pthread_mutex_t linux_m").first; + GPU_LINUX_SEM_POOL = linux_lock_segment_ptr->find<gpu_pool>("gpu_pool linux_p").first; } } +static void deallocate_locks_linux(const int num_gpu_users) +{ + GPU_LINUX_MUTEX_POOL = NULL; + GPU_LINUX_SEM_POOL = NULL; + + delete linux_lock_segment_ptr; + linux_lock_segment_ptr = NULL; + + if(num_gpu_users > 0) + shared_memory_object::remove("linux_lock_memory"); +} + @@ -585,6 +732,14 @@ static void allocate_locks(const int num_gpu_users, bool linux_mode) allocate_locks_linux(num_gpu_users); } +static void deallocate_locks(const int num_gpu_users, bool linux_mode) +{ + if(!linux_mode) + deallocate_locks_litmus(); + else + deallocate_locks_linux(num_gpu_users); +} + static void set_cur_gpu(int gpu) { if (TRACE_MIGRATIONS) { @@ -597,47 +752,52 @@ static void set_cur_gpu(int gpu) } -static pthread_barrier_t *gpu_barrier = NULL; +//static pthread_barrier_t *gpu_barrier = NULL; static interprocess_mutex *gpu_mgmt_mutexes = NULL; -static managed_shared_memory *segment_ptr = NULL; +static managed_shared_memory *gpu_mutex_segment_ptr = NULL; void coordinate_gpu_tasks(const int num_gpu_users) { if(num_gpu_users > 0) { - printf("%d creating shared memory\n", getpid()); - shared_memory_object::remove("gpu_barrier_memory"); - segment_ptr = new managed_shared_memory(create_only, "gpu_barrier_memory", 4*1024); - - printf("%d creating a barrier for %d users\n", getpid(), num_gpu_users); - gpu_barrier = segment_ptr->construct<pthread_barrier_t>("pthread_barrier_t gpu_barrier")(); - pthread_barrierattr_t battr; - pthread_barrierattr_init(&battr); - pthread_barrierattr_setpshared(&battr, PTHREAD_PROCESS_SHARED); - pthread_barrier_init(gpu_barrier, &battr, num_gpu_users); - pthread_barrierattr_destroy(&battr); - printf("%d creating gpu mgmt mutexes for %d devices\n", getpid(), NR_GPUS); - gpu_mgmt_mutexes = segment_ptr->construct<interprocess_mutex>("interprocess_mutex m")[NR_GPUS](); + xprintf("%d creating shared memory\n", getpid()); + shared_memory_object::remove("gpu_mutex_memory"); + gpu_mutex_segment_ptr = new managed_shared_memory(create_only, "gpu_mutex_memory", PAGE_SIZE); + +// printf("%d creating a barrier for %d users\n", getpid(), num_gpu_users); +// gpu_barrier = segment_ptr->construct<pthread_barrier_t>("pthread_barrier_t gpu_barrier")(); +// pthread_barrierattr_t battr; +// pthread_barrierattr_init(&battr); +// pthread_barrierattr_setpshared(&battr, PTHREAD_PROCESS_SHARED); +// pthread_barrier_init(gpu_barrier, &battr, num_gpu_users); +// pthread_barrierattr_destroy(&battr); +// printf("%d creating gpu mgmt mutexes for %d devices\n", getpid(), NR_GPUS); + gpu_mgmt_mutexes = gpu_mutex_segment_ptr->construct<interprocess_mutex>("interprocess_mutex m")[NR_GPUS](); } else { + sleep(5); do { try { - segment_ptr = new managed_shared_memory(open_only, "gpu_barrier_memory"); + gpu_mutex_segment_ptr = new managed_shared_memory(open_only, "gpu_mutex_memory"); } catch(...) 
{ sleep(1); } - }while(segment_ptr == NULL); + }while(gpu_mutex_segment_ptr == NULL); - gpu_barrier = segment_ptr->find<pthread_barrier_t>("pthread_barrier_t gpu_barrier").first; - gpu_mgmt_mutexes = segment_ptr->find<interprocess_mutex>("interprocess_mutex m").first; +// gpu_barrier = segment_ptr->find<pthread_barrier_t>("pthread_barrier_t gpu_barrier").first; + gpu_mgmt_mutexes = gpu_mutex_segment_ptr->find<interprocess_mutex>("interprocess_mutex m").first; } } +const size_t SEND_ALLOC_SIZE = 12*1024; +const size_t RECV_ALLOC_SIZE = 12*1024; +const size_t STATE_ALLOC_SIZE = 16*1024; + typedef float spindata_t; char *d_send_data[NR_GPUS] = {0}; @@ -653,18 +813,48 @@ char *h_send_data = 0; char *h_recv_data = 0; char *h_state_data = 0; -unsigned int *h_iteration_count[NR_GPUS] = {0}; +static void destroy_events() +{ + for(int i = 0; i < GPU_PARTITION_SIZE; ++i) + { + int which = GPU_PARTITION*GPU_PARTITION_SIZE + i; + gpu_mgmt_mutexes[which].lock(); + set_cur_gpu(which); + cudaEventDestroy(EVENTS[which]); + gpu_mgmt_mutexes[which].unlock(); + } +} + +static void init_events() +{ + xprintf("creating %s events\n", (CUDA_SYNC_MODE == BLOCKING) ? "blocking" : "spinning"); + for(int i = 0; i < GPU_PARTITION_SIZE; ++i) + { + int which = GPU_PARTITION*GPU_PARTITION_SIZE + i; + gpu_mgmt_mutexes[which].lock(); + set_cur_gpu(which); + if (CUDA_SYNC_MODE == BLOCKING) + cudaEventCreateWithFlags(&EVENTS[which], cudaEventBlockingSync | cudaEventDisableTiming); + else + cudaEventCreateWithFlags(&EVENTS[which], cudaEventDefault | cudaEventDisableTiming); + gpu_mgmt_mutexes[which].unlock(); + } +} static void init_cuda(const int num_gpu_users) { - const int PAGE_SIZE = 4*1024; - size_t send_alloc_bytes = SEND_SIZE + (SEND_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; - size_t recv_alloc_bytes = RECV_SIZE + (RECV_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; - size_t state_alloc_bytes = STATE_SIZE + (STATE_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; + size_t send_alloc_bytes = SEND_ALLOC_SIZE + (SEND_ALLOC_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; + size_t recv_alloc_bytes = RECV_ALLOC_SIZE + (RECV_ALLOC_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; + size_t state_alloc_bytes = STATE_ALLOC_SIZE + (STATE_ALLOC_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; - coordinate_gpu_tasks(num_gpu_users); + static bool first_time = true; + + if (first_time) { + coordinate_gpu_tasks(num_gpu_users); + first_time = false; + } -#if 1 +#if 0 switch (CUDA_SYNC_MODE) { case BLOCKING: @@ -674,8 +864,6 @@ static void init_cuda(const int num_gpu_users) cudaSetDeviceFlags(cudaDeviceScheduleSpin); break; } -#else - cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); #endif for(int i = 0; i < GPU_PARTITION_SIZE; ++i) @@ -687,6 +875,9 @@ static void init_cuda(const int num_gpu_users) try { set_cur_gpu(which); + + xprintf("setting up GPU %d\n", which); + cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0); cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0); @@ -698,8 +889,8 @@ static void init_cuda(const int num_gpu_users) // enough to fill the L2 cache exactly. 
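			// (Hypothetical example: a device with a 512 KB L2, 8 SMs, and
			// 32-thread warps works out to 524288/(8*32*sizeof(float)) = 512
			// elements per spinning thread.)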
ELEM_PER_THREAD[which] = (prop.l2CacheSize/(NUM_SM[which]*WARP_SIZE[which]*sizeof(spindata_t))); - - if (!MIGRATE_VIA_SYSMEM && prop.unifiedAddressing) +// if (!MIGRATE_VIA_SYSMEM && prop.unifiedAddressing) + if (prop.unifiedAddressing) { for(int j = 0; j < GPU_PARTITION_SIZE; ++j) { @@ -717,29 +908,23 @@ static void init_cuda(const int num_gpu_users) } } - cudaStreamCreate(&STREAMS[CUR_DEVICE]); + cudaStreamCreate(&STREAMS[which]); + // gpu working set cudaMalloc(&d_spin_data[which], prop.l2CacheSize); cudaMemset(&d_spin_data[which], 0, prop.l2CacheSize); -// cudaMalloc(&d_iteration_count[which], NUM_SM[which]*WARP_SIZE[which]*sizeof(unsigned int)); -// cudaHostAlloc(&h_iteration_count[which], NUM_SM[which]*WARP_SIZE[which]*sizeof(unsigned int), cudaHostAllocPortable | cudaHostAllocMapped); - - if (send_alloc_bytes) { - cudaMalloc(&d_send_data[which], send_alloc_bytes); - cudaHostAlloc(&h_send_data, send_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped); - } - if (h_recv_data) { - cudaMalloc(&d_recv_data[which], recv_alloc_bytes); - cudaHostAlloc(&h_recv_data, recv_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped); - } + // send data + cudaMalloc(&d_send_data[which], send_alloc_bytes); + cudaHostAlloc(&h_send_data, send_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped); - if (h_state_data) { - cudaMalloc(&d_state_data[which], state_alloc_bytes); + // recv data + cudaMalloc(&d_recv_data[which], recv_alloc_bytes); + cudaHostAlloc(&h_recv_data, recv_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped); - if (MIGRATE_VIA_SYSMEM) - cudaHostAlloc(&h_state_data, state_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped | cudaHostAllocWriteCombined); - } + // state data + cudaMalloc(&d_state_data[which], state_alloc_bytes); + cudaHostAlloc(&h_state_data, state_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped | cudaHostAllocWriteCombined); } catch(std::exception &e) { @@ -793,6 +978,8 @@ static bool MigrateToGPU_SysMem(int from, int to) // THIS IS ON-DEMAND SYS_MEM MIGRATION. GPUSync says // you should be using speculative migrations. // Use PushState() and PullState(). 
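+	// With speculative migration, state should already sit in pinned host
+	// memory (pushed by PushState()) by the time a migration is observed;
+	// reaching this on-demand path therefore suggests a missed PushState(),
+	// so complain and abort.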
+ fprintf(stderr, "Tried to sysmem migrate from %d to %d\n", + from, to); assert(false); // for now bool success = true; @@ -846,12 +1033,31 @@ static void MigrateIfNeeded(int next_gpu) PushState(); } } + else if(cur_gpu() == -1) { + set_cur_gpu(next_gpu); + } } - - static void exit_cuda() { +#if 0 + for(int i = 0; i < GPU_PARTITION_SIZE; ++i) + { + int which = GPU_PARTITION*GPU_PARTITION_SIZE + i; + gpu_mgmt_mutexes[which].lock(); + set_cur_gpu(which); + cudaFree(d_send_data[which]); + cudaFree(d_recv_data[which]); + cudaFree(d_state_data[which]); + cudaFree(d_spin_data[which]); + gpu_mgmt_mutexes[which].unlock(); + } +#endif + + cudaFreeHost(h_send_data); + cudaFreeHost(h_recv_data); + cudaFreeHost(h_state_data); + for(int i = 0; i < GPU_PARTITION_SIZE; ++i) { int which = GPU_PARTITION*GPU_PARTITION_SIZE + i; @@ -860,6 +1066,14 @@ static void exit_cuda() cudaDeviceReset(); gpu_mgmt_mutexes[which].unlock(); } + + memset(d_send_data, 0, sizeof(d_send_data)); + memset(d_recv_data, 0, sizeof(d_recv_data)); + memset(d_state_data, 0, sizeof(d_state_data)); + memset(d_spin_data, 0, sizeof(d_spin_data)); + h_send_data = NULL; + h_recv_data = NULL; + h_state_data = NULL; } bool safetynet = false; @@ -895,14 +1109,6 @@ static void catch_exit(int catch_exit) } - - - -#ifdef VANILLA_LINUX -static float ms_sum; -static int gpucount = 0; -#endif - __global__ void docudaspin(float* data, /*unsigned int* iterations,*/ unsigned int num_elem, unsigned int cycles) { long long int now = clock64(); @@ -959,13 +1165,30 @@ static void gpu_loop_for(double gpu_sec_time, unsigned int num_kernels, double e chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE, cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks()); + bool locked = false; for(unsigned int i = 0; i < num_kernels; ++i) { - if(useEngineLocks()) litmus_lock(cur_ee()); + if(useEngineLocks() && !locked) { + litmus_lock(cur_ee()); + locked = true; + } + /* one block per sm, one warp per block */ - docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles); - cudaStreamSynchronize(cur_stream()); - if(useEngineLocks()) litmus_unlock(cur_ee()); + docudaspin <<<cur_sms(), cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles); + + if(useEngineLocks() && (!YIELD_LOCKS || (YIELD_LOCKS && litmus_should_yield_lock(cur_ee())))) { +// cudaStreamSynchronize(cur_stream()); + cudaEventRecord(cur_event(), cur_stream()); + cudaEventSynchronize(cur_event()); + litmus_unlock(cur_ee()); + locked = false; + } + } + if (locked) { + cudaEventRecord(cur_event(), cur_stream()); + cudaEventSynchronize(cur_event()); + litmus_unlock(cur_ee()); + locked = false; } if(RECV_SIZE > 0) @@ -985,9 +1208,9 @@ out: static void gpu_loop_for_linux(double gpu_sec_time, unsigned int num_kernels, double emergency_exit) { - static int GPU_OFFSET = GPU_PARTITION * GPU_PARTITION_SIZE; - static gpu_pool *pool = &GPU_LINUX_SEM_POOL[GPU_PARTITION]; - static pthread_mutex_t *mutex = &GPU_LINUX_MUTEX_POOL[GPU_PARTITION]; + int GPU_OFFSET = GPU_PARTITION * GPU_PARTITION_SIZE; + gpu_pool *pool = &GPU_LINUX_SEM_POOL[GPU_PARTITION]; + pthread_mutex_t *mutex = &GPU_LINUX_MUTEX_POOL[GPU_PARTITION]; int next_gpu; @@ -996,19 +1219,10 @@ static void gpu_loop_for_linux(double gpu_sec_time, unsigned int num_kernels, do if (emergency_exit && wctime() > emergency_exit) goto out; -#ifdef VANILLA_LINUX - static bool once = false; - static cudaEvent_t start, end; - float ms; - if (!once) - { - once = true; - 
cudaEventCreate(&start); - cudaEventCreate(&end); - } -#endif - - next_gpu = pool->get(mutex, cur_gpu() - GPU_OFFSET) + GPU_OFFSET; + next_gpu = pool->get(mutex, ((cur_gpu() != -1) ? + cur_gpu() - GPU_OFFSET : + -1)) + + GPU_OFFSET; { MigrateIfNeeded(next_gpu); @@ -1021,24 +1235,11 @@ static void gpu_loop_for_linux(double gpu_sec_time, unsigned int num_kernels, do for(unsigned int i = 0; i < num_kernels; ++i) { /* one block per sm, one warp per block */ -#ifdef VANILLA_LINUX - cudaEventRecord(start, cur_stream()); -#endif docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles); -#ifdef VANILLA_LINUX - cudaEventRecord(end, cur_stream()); - cudaEventSynchronize(end); -#endif - cudaStreamSynchronize(cur_stream()); - -#ifdef VANILLA_LINUX - cudaEventElapsedTime(&ms, start, end); - ms_sum += ms; -#endif + cudaEventRecord(cur_event(), cur_stream()); + cudaEventSynchronize(cur_event()); +// cudaStreamSynchronize(cur_stream()); } -#ifdef VANILLA_LINUX - ++gpucount; -#endif if(RECV_SIZE > 0) chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE, @@ -1075,73 +1276,73 @@ static void usage(char *error) { exit(EXIT_FAILURE); } -/* - * returns the character that made processing stop, newline or EOF - */ -static int skip_to_next_line(FILE *fstream) -{ - int ch; - for (ch = fgetc(fstream); ch != EOF && ch != '\n'; ch = fgetc(fstream)); - return ch; -} - -static void skip_comments(FILE *fstream) -{ - int ch; - for (ch = fgetc(fstream); ch == '#'; ch = fgetc(fstream)) - skip_to_next_line(fstream); - ungetc(ch, fstream); -} - -static void get_exec_times(const char *file, const int column, - int *num_jobs, double **exec_times) -{ - FILE *fstream; - int cur_job, cur_col, ch; - *num_jobs = 0; - - fstream = fopen(file, "r"); - if (!fstream) - bail_out("could not open execution time file"); - - /* figure out the number of jobs */ - do { - skip_comments(fstream); - ch = skip_to_next_line(fstream); - if (ch != EOF) - ++(*num_jobs); - } while (ch != EOF); - - if (-1 == fseek(fstream, 0L, SEEK_SET)) - bail_out("rewinding file failed"); - - /* allocate space for exec times */ - *exec_times = (double*)calloc(*num_jobs, sizeof(*exec_times)); - if (!*exec_times) - bail_out("couldn't allocate memory"); - - for (cur_job = 0; cur_job < *num_jobs && !feof(fstream); ++cur_job) { - - skip_comments(fstream); - - for (cur_col = 1; cur_col < column; ++cur_col) { - /* discard input until we get to the column we want */ - int unused __attribute__ ((unused)) = fscanf(fstream, "%*s,"); - } - - /* get the desired exec. 
time */ - if (1 != fscanf(fstream, "%lf", (*exec_times)+cur_job)) { - fprintf(stderr, "invalid execution time near line %d\n", - cur_job); - exit(EXIT_FAILURE); - } - - skip_to_next_line(fstream); - } - - assert(cur_job == *num_jobs); - fclose(fstream); -} +///* +// * returns the character that made processing stop, newline or EOF +// */ +//static int skip_to_next_line(FILE *fstream) +//{ +// int ch; +// for (ch = fgetc(fstream); ch != EOF && ch != '\n'; ch = fgetc(fstream)); +// return ch; +//} +// +//static void skip_comments(FILE *fstream) +//{ +// int ch; +// for (ch = fgetc(fstream); ch == '#'; ch = fgetc(fstream)) +// skip_to_next_line(fstream); +// ungetc(ch, fstream); +//} +// +//static void get_exec_times(const char *file, const int column, +// int *num_jobs, double **exec_times) +//{ +// FILE *fstream; +// int cur_job, cur_col, ch; +// *num_jobs = 0; +// +// fstream = fopen(file, "r"); +// if (!fstream) +// bail_out("could not open execution time file"); +// +// /* figure out the number of jobs */ +// do { +// skip_comments(fstream); +// ch = skip_to_next_line(fstream); +// if (ch != EOF) +// ++(*num_jobs); +// } while (ch != EOF); +// +// if (-1 == fseek(fstream, 0L, SEEK_SET)) +// bail_out("rewinding file failed"); +// +// /* allocate space for exec times */ +// *exec_times = (double*)calloc(*num_jobs, sizeof(*exec_times)); +// if (!*exec_times) +// bail_out("couldn't allocate memory"); +// +// for (cur_job = 0; cur_job < *num_jobs && !feof(fstream); ++cur_job) { +// +// skip_comments(fstream); +// +// for (cur_col = 1; cur_col < column; ++cur_col) { +// /* discard input until we get to the column we want */ +// int unused __attribute__ ((unused)) = fscanf(fstream, "%*s,"); +// } +// +// /* get the desired exec. time */ +// if (1 != fscanf(fstream, "%lf", (*exec_times)+cur_job)) { +// fprintf(stderr, "invalid execution time near line %d\n", +// cur_job); +// exit(EXIT_FAILURE); +// } +// +// skip_to_next_line(fstream); +// } +// +// assert(cur_job == *num_jobs); +// fclose(fstream); +//} #define NUMS 4096 static int num[NUMS]; @@ -1190,23 +1391,23 @@ out: } -static void debug_delay_loop(void) -{ - double start, end, delay; - - while (1) { - for (delay = 0.5; delay > 0.01; delay -= 0.01) { - start = wctime(); - loop_for(delay, 0); - end = wctime(); - printf("%6.4fs: looped for %10.8fs, delta=%11.8fs, error=%7.4f%%\n", - delay, - end - start, - end - start - delay, - 100 * (end - start - delay) / delay); - } - } -} +//static void debug_delay_loop(void) +//{ +// double start, end, delay; +// +// while (1) { +// for (delay = 0.5; delay > 0.01; delay -= 0.01) { +// start = wctime(); +// loop_for(delay, 0); +// end = wctime(); +// printf("%6.4fs: looped for %10.8fs, delta=%11.8fs, error=%7.4f%%\n", +// delay, +// end - start, +// end - start - delay, +// 100 * (end - start - delay) / delay); +// } +// } +//} typedef bool (*gpu_job_t)(double exec_time, double gpu_exec_time, unsigned int num_kernels, double program_end); typedef bool (*cpu_job_t)(double exec_time, double program_end); @@ -1288,6 +1489,108 @@ static void init_linux() mlockall(MCL_CURRENT | MCL_FUTURE); } +static int enable_aux_rt_tasks_linux(pid_t tid) +{ + /* pre: caller must already be real time */ + int ret = 0; + struct sched_param param; + stringstream pidstr; + boost::filesystem::directory_iterator theEnd; + boost::filesystem::path proc_dir; + + int policy = sched_getscheduler(tid); + if (policy == -1 || policy != SCHED_FIFO) { + ret = -1; + goto out; + } + + ret = sched_getparam(tid, ¶m); + if (ret < 0) + goto 
out; + + + pidstr<<getpid(); + proc_dir = boost::filesystem::path("/proc"); + proc_dir /= pidstr.str(); + proc_dir /= "task"; + + for(boost::filesystem::directory_iterator iter(proc_dir); iter != theEnd; ++iter) + { + stringstream taskstr(iter->path().leaf().c_str()); + int child = 0; + taskstr>>child; + if (child != tid && child != 0) + { + /* mirror tid's params to others */ + ret = sched_setscheduler(child, policy, ¶m); + if (ret != 0) + goto out; + } + } + +out: + return ret; +} + +static int disable_aux_rt_tasks_linux(pid_t tid) +{ + int ret = 0; + struct sched_param param; + stringstream pidstr; + boost::filesystem::directory_iterator theEnd; + boost::filesystem::path proc_dir; + + memset(¶m, 0, sizeof(param)); + + pidstr<<getpid(); + proc_dir = boost::filesystem::path("/proc"); + proc_dir /= pidstr.str(); + proc_dir /= "task"; + + for(boost::filesystem::directory_iterator iter(proc_dir); iter != theEnd; ++iter) + { + stringstream taskstr(iter->path().leaf().c_str()); + int child = 0; + taskstr>>child; + if (child != tid && child != 0) + { + /* make all other threads sched_normal */ + ret = sched_setscheduler(child, SCHED_OTHER, ¶m); + if (ret != 0) + goto out; + } + } + +out: + return ret; +} + +static int be_migrate_all_to_cluster(int cluster, int cluster_size) +{ + int ret = 0; + stringstream pidstr; + + pidstr<<getpid(); + boost::filesystem::path proc_dir("/proc"); + proc_dir /= pidstr.str(); + proc_dir /= "task"; + boost::filesystem::directory_iterator theEnd; + for(boost::filesystem::directory_iterator iter(proc_dir); iter != theEnd; ++iter) + { + stringstream taskstr(iter->path().leaf().c_str()); + int task = 0; + taskstr>>task; + if (task != 0) { + ret = be_migrate_to_cluster(cluster, cluster_size); + if (ret != 0) + goto out; + } + } + +out: + return ret; +} + static bool gpu_job_linux(double exec_time, double gpu_exec_time, unsigned int num_kernels, double program_end) { double chunk1, chunk2; @@ -1322,195 +1625,820 @@ static bool job_linux(double exec_time, double program_end) /*****************************/ -enum eScheduler + + + + +enum eRunMode { - LITMUS, - LINUX, - RT_LINUX + NORMAL, + PROXY, + DAEMON, }; -#define CPU_OPTIONS "p:z:c:wlveio:f:s:q:X:L:Q:d:" -#define GPU_OPTIONS "g:y:r:C:E:DG:xS:R:T:Z:aFm:b:MNIk:VW:" - -// concat the option strings -#define OPTSTR CPU_OPTIONS GPU_OPTIONS +void set_defaults(struct Args* args) +{ + memset(args, 0, sizeof(*args)); + args->wcet_ms = -1.0; + args->gpu_wcet_ms = 0.0; + args->period_ms = -1.0; + args->budget_ms = -1.0; + args->gpusync_mode = IKGLP_MODE; + args->sync_mode = BLOCKING; + args->gpu_using = false; + args->enable_affinity = false; + args->enable_chunking = false; + args->relax_fifo_len = false; + args->use_sysmem_migration = false; + args->rho = 2; + args->num_ce = 2; + args->reserve_migr_ce = false; + args->num_kernels = 1; + args->engine_lock_type = FIFO; + args->yield_locks = false; + args->drain_policy = DRAIN_SIMPLE; + args->want_enforcement = false; + args->want_signals = false; + args->priority = LITMUS_LOWEST_PRIORITY; + args->cls = RT_CLASS_SOFT; + args->scheduler = LITMUS; + args->migrate = false; + args->cluster = 0; + args->cluster_size = 1; + args->stddev = 0.0; + args->wait = false; + args->scale = 1.0; + args->duration = 0.0; +} -int main(int argc, char** argv) +void apply_args(struct Args* args) { - int ret; + // set all the globals + CPU_PARTITION_SIZE = args->cluster_size; + GPU_USING = args->gpu_using; + GPU_PARTITION = args->gpu_partition; + GPU_PARTITION_SIZE = args->gpu_partition_size; + RHO = 
args->rho; + NUM_COPY_ENGINES = args->num_ce; + RESERVED_MIGR_COPY_ENGINE = args->reserve_migr_ce; + USE_ENGINE_LOCKS = args->use_engine_locks; + ENGINE_LOCK_TYPE = args->engine_lock_type; + YIELD_LOCKS = args->yield_locks; + USE_DYNAMIC_GROUP_LOCKS = args->use_dgls; + GPU_SYNC_MODE = args->gpusync_mode; + ENABLE_AFFINITY = args->enable_affinity; + RELAX_FIFO_MAX_LEN = args->relax_fifo_len; + CUDA_SYNC_MODE = args->sync_mode; + SEND_SIZE = args->send_size; + RECV_SIZE = args->recv_size; + STATE_SIZE = args->state_size; + ENABLE_CHUNKING = args->enable_chunking; + CHUNK_SIZE = args->chunk_size; + MIGRATE_VIA_SYSMEM = args->use_sysmem_migration; + + // roll back other globals to an initial state + CUR_DEVICE = -1; + LAST_DEVICE = -1; +} +int __do_normal(struct Args* args) +{ + int ret = 0; struct rt_task param; lt_t wcet; lt_t period; lt_t budget; - double wcet_ms = -1.0; - double gpu_wcet_ms = 0.0; - double period_ms = -1.0; - double budget_ms = -1.0; - - unsigned int num_kernels = 1; - - budget_drain_policy_t drain = DRAIN_SIMPLE; - bool want_enforcement = false; - bool want_signals = false; - - unsigned int priority = LITMUS_LOWEST_PRIORITY; - - task_class_t cls = RT_CLASS_SOFT; - - eScheduler scheduler = LITMUS; - int num_gpu_users = 0; - int migrate = 0; - int cluster = 0; - int cluster_size = 1; Normal<double> *wcet_dist_ms = NULL; - float stdpct = 0.0; cpu_job_t cjobfn = NULL; gpu_job_t gjobfn = NULL; - int wait = 0; - double scale = 1.0; - int test_loop = 0; + double start = 0; - double duration = 0, start = 0; - int cur_job = 0, num_jobs = 0; - int column = 1; + if (MIGRATE_VIA_SYSMEM && GPU_PARTITION_SIZE == 1) + return -1; - int opt; - - double *exec_times = NULL; - const char *file = NULL; - - /* locking */ -// int lock_od = -1; -// int resource_id = 0; -// int protocol = -1; -// double cs_length = 1; /* millisecond */ + // turn off some features to be safe + if (args->scheduler != LITMUS) + { + RHO = 0; + USE_ENGINE_LOCKS = false; + USE_DYNAMIC_GROUP_LOCKS = false; + RELAX_FIFO_MAX_LEN = false; + ENABLE_RT_AUX_THREADS = false; + args->budget_ms = -1.0; + args->want_enforcement = false; + args->want_signals = false; - progname = argv[0]; + cjobfn = job_linux; + gjobfn = gpu_job_linux; + } + else + { + cjobfn = job; + gjobfn = gpu_job; + } - while ((opt = getopt(argc, argv, OPTSTR)) != -1) { + wcet = ms2ns(args->wcet_ms); + period = ms2ns(args->period_ms); + if (wcet <= 0) { + printf("The worst-case execution time must be a positive number.\n"); + ret = -1; + goto out; + } + if (period <= 0) { + printf("The period must be a positive number.\n"); + ret = -1; + goto out; + } + if (wcet > period) { + printf("The worst-case execution time must not exceed the period.\n"); + ret = -1; + goto out; + } + if (args->gpu_using && args->gpu_wcet_ms <= 0) { + printf("The worst-case gpu execution time must be a positive number.\n"); + ret = -1; + goto out; + } + + if (args->budget_ms > 0.0) + budget = ms2ns(args->budget_ms); + else + budget = args->wcet_ms; + + // randomize execution time according to a normal distribution + // centered around the desired execution time. 
+ // standard deviation is a percentage of this average + wcet_dist_ms = new Normal<double>(args->wcet_ms + args->gpu_wcet_ms, (args->wcet_ms + args->gpu_wcet_ms) * args->stddev); + wcet_dist_ms->seed((unsigned int)time(0)); + + ret = be_migrate_all_to_cluster(args->cluster, args->cluster_size); + if (ret < 0) { + printf("could not migrate to target partition or cluster.\n"); + goto out; + } + + if (args->scheduler != LITMUS) + { + // set some variables needed by linux modes + if (args->gpu_using) + TRACE_MIGRATIONS = true; + periodTime.tv_sec = period / s2ns(1); + periodTime.tv_nsec = period - periodTime.tv_sec * s2ns(1); + period_ns = period; + job_no = 0; + } + + init_rt_task_param(¶m); + param.exec_cost = budget; + param.period = period; + param.priority = args->priority; + param.cls = args->cls; + param.budget_policy = (args->want_enforcement) ? + PRECISE_ENFORCEMENT : NO_ENFORCEMENT; + param.budget_signal_policy = (args->want_enforcement && args->want_signals) ? + PRECISE_SIGNALS : NO_SIGNALS; + param.drain_policy = args->drain_policy; + param.release_policy = PERIODIC; + param.cpu = cluster_to_first_cpu(args->cluster, args->cluster_size); + + ret = set_rt_task_param(gettid(), ¶m); + if (ret < 0) { + bail_out("could not setup rt task params\n"); + goto out; + } + + if (args->want_signals) + /* bind default longjmp signal handler to SIG_BUDGET. */ + activate_litmus_signals(SIG_BUDGET_MASK, longjmp_on_litmus_signal); + else + ignore_litmus_signals(SIG_BUDGET_MASK); + + if (args->gpu_using) + allocate_locks(args->num_gpu_tasks, args->scheduler != LITMUS); + + if (args->scheduler == LITMUS) + { + ret = task_mode(LITMUS_RT_TASK); + if (ret < 0) { + printf("could not become RT task\n"); + goto out; + } + } + else + { + if (args->scheduler == RT_LINUX) + { + struct sched_param fifoparams; + memset(&fifoparams, 0, sizeof(fifoparams)); + fifoparams.sched_priority = args->priority; + ret = sched_setscheduler(getpid(), SCHED_FIFO, &fifoparams); + if (ret < 0) { + printf("could not become sched_fifo task\n"); + goto out; + } + } + trace_name(); + trace_param(); + } + + if (args->wait) { + xprintf("%d: waiting for release.\n", getpid()); + ret = wait_for_ts_release2(&releaseTime); + if (ret != 0) { + printf("wait_for_ts_release2()\n"); + goto out; + } + + if (args->scheduler != LITMUS) + log_release(); + } + else if (args->scheduler != LITMUS) + { + clock_gettime(CLOCK_MONOTONIC, &releaseTime); + sleep_next_period_linux(); + } + + if (args->gpu_using && ENABLE_RT_AUX_THREADS) { + if (args->scheduler == LITMUS) { + ret = enable_aux_rt_tasks(AUX_CURRENT | AUX_FUTURE); + if (ret != 0) { + printf("enable_aux_rt_tasks() failed\n"); + goto out; + } + } + else if (args->scheduler == RT_LINUX) { + ret = enable_aux_rt_tasks_linux(gettid()); + if (ret != 0) { + printf("enable_aux_rt_tasks_linux() failed\n"); + goto out; + } + } + } + + start = wctime(); + + if (!args->gpu_using) { + bool keepgoing; + do + { + double job_ms = wcet_dist_ms->random(); + if (job_ms < 0.0) + job_ms = 0.0; + keepgoing = cjobfn(ms2s(job_ms * args->scale), start + args->duration); + }while(keepgoing); + } + else { + bool keepgoing; + do + { + double job_ms = wcet_dist_ms->random(); + if (job_ms < 0.0) + job_ms = 0.0; + + double cpu_job_ms = (job_ms/(args->wcet_ms + args->gpu_wcet_ms))*args->wcet_ms; + double gpu_job_ms = (job_ms/(args->wcet_ms + args->gpu_wcet_ms))*args->gpu_wcet_ms; + keepgoing = gjobfn( + ms2s(cpu_job_ms * args->scale), + ms2s(gpu_job_ms * args->scale), + args->num_kernels, + start + args->duration); + 
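+			// (e.g., wcet_ms = 8, gpu_wcet_ms = 2, stddev = 0.1: job_ms is
+			// drawn from a normal distribution with mean 10 ms and sigma 1 ms,
+			// then split 80/20 between the CPU and GPU phases, preserving the
+			// configured CPU:GPU ratio)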
}while(keepgoing); + } + + if (args->gpu_using && ENABLE_RT_AUX_THREADS) { + if (args->scheduler == LITMUS) { + ret = disable_aux_rt_tasks(AUX_CURRENT | AUX_FUTURE); + if (ret != 0) { + printf("disable_aux_rt_tasks() failed\n"); + goto out; + } + } + else if(args->scheduler == RT_LINUX) { + ret = disable_aux_rt_tasks_linux(gettid()); + if (ret != 0) { + printf("disable_aux_rt_tasks_linux() failed\n"); + goto out; + } + } + } + + if (args->gpu_using) + deallocate_locks(args->num_gpu_tasks, args->scheduler != LITMUS); + + if (args->scheduler == LITMUS) + { + ret = task_mode(BACKGROUND_TASK); + if (ret != 0) { + printf("could not become regular task (huh?)\n"); + goto out; + } + } + + { + // become a normal task just in case. + struct sched_param normalparams; + memset(&normalparams, 0, sizeof(normalparams)); + ret = sched_setscheduler(getpid(), SCHED_OTHER, &normalparams); + if (ret < 0) { + printf("could not become sched_normal task\n"); + goto out; + } + } + +out: + if (wcet_dist_ms) + delete wcet_dist_ms; + + return ret; +} + +int do_normal(struct Args* args) +{ + int ret = 0; + + apply_args(args); + + if (args->scheduler == LITMUS) + init_litmus(); + else + init_linux(); + + if (args->gpu_using) { + signal(SIGABRT, catch_exit); + signal(SIGTERM, catch_exit); + signal(SIGQUIT, catch_exit); + signal(SIGSEGV, catch_exit); + + cudaSetDeviceFlags(cudaDeviceScheduleSpin); + init_cuda(args->num_gpu_tasks); + init_events(); + safetynet = true; + } + + ret = __do_normal(args); + + if (args->gpu_using) { + safetynet = false; + exit_cuda(); + } + + return ret; +} + +typedef struct run_entry +{ + struct Args args; + int used; + int ret; +} run_entry_t; + + + +static int *num_run_entries = NULL; +static run_entry_t *run_entries = NULL; +static pthread_barrier_t *daemon_barrier = NULL; +static pthread_mutex_t *daemon_mutex = NULL; + +static run_entry_t *my_run_entry = NULL; +static managed_shared_memory *daemon_segment_ptr = NULL; + +int init_daemon(struct Args* args, int num_total_users, bool is_daemon) +{ + if (num_total_users) + { + shared_memory_object::remove("gpuspin_daemon_memory"); + + daemon_segment_ptr = new managed_shared_memory(create_only, "gpuspin_daemon_memory", 30*PAGE_SIZE); + num_run_entries = daemon_segment_ptr->construct<int>("int num_run_entries")(); + *num_run_entries = num_total_users; + + run_entries = daemon_segment_ptr->construct<struct run_entry>("run_entry_t run_entries")[num_total_users](); + memset(run_entries, 0, sizeof(run_entry_t)*num_total_users); + + daemon_mutex = daemon_segment_ptr->construct<pthread_mutex_t>("pthread_mutex_t daemon_mutex")(); + pthread_mutexattr_t attr; + pthread_mutexattr_init(&attr); + pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED); + pthread_mutex_init(daemon_mutex, &attr); + pthread_mutexattr_destroy(&attr); + + daemon_barrier = daemon_segment_ptr->construct<pthread_barrier_t>("pthread_barrier_t daemon_barrier")(); + pthread_barrierattr_t battr; + pthread_barrierattr_init(&battr); + pthread_barrierattr_setpshared(&battr, PTHREAD_PROCESS_SHARED); + pthread_barrier_init(daemon_barrier, &battr, args->num_tasks*2); + pthread_barrierattr_destroy(&battr); + } + else + { + do + { + try + { + if (!daemon_segment_ptr) daemon_segment_ptr = new managed_shared_memory(open_only, "gpuspin_daemon_memory"); + } + catch(...) 
+ { + sleep(1); + } + }while(daemon_segment_ptr == NULL); + + num_run_entries = daemon_segment_ptr->find<int>("int num_run_entries").first; + run_entries = daemon_segment_ptr->find<struct run_entry>("run_entry_t run_entries").first; + daemon_mutex = daemon_segment_ptr->find<pthread_mutex_t>("pthread_mutex_t daemon_mutex").first; + daemon_barrier = daemon_segment_ptr->find<pthread_barrier_t>("pthread_barrier_t daemon_barrier").first; + } + + if (is_daemon) + { + // find and claim an entry + pthread_mutex_lock(daemon_mutex); + for(int i = 0; i < *num_run_entries; ++i) + { + if(!run_entries[i].used) + { + my_run_entry = &run_entries[i]; + my_run_entry->used = 1; + break; + } + } + pthread_mutex_unlock(daemon_mutex); + + assert(my_run_entry); + my_run_entry->args = *args; + my_run_entry->ret = 0; + } + else + { + // find my entry + pthread_mutex_lock(daemon_mutex); + for(int i = 0; i < *num_run_entries; ++i) + { + if (run_entries[i].args.wcet_ms == args->wcet_ms && + run_entries[i].args.gpu_wcet_ms == args->gpu_wcet_ms && + run_entries[i].args.period_ms == args->period_ms) + { + my_run_entry = &run_entries[i]; + break; + } + } + pthread_mutex_unlock(daemon_mutex); + } + + if (!my_run_entry) + return -1; + return 0; +} + +int put_next_run(struct Args* args) +{ + assert(my_run_entry); + + pthread_mutex_lock(daemon_mutex); + my_run_entry->args = *args; + pthread_mutex_unlock(daemon_mutex); + + pthread_barrier_wait(daemon_barrier); + + return 0; +} + +int get_next_run(struct Args* args) +{ + assert(my_run_entry); + + pthread_barrier_wait(daemon_barrier); + + pthread_mutex_lock(daemon_mutex); + *args = my_run_entry->args; + my_run_entry->ret = 0; + pthread_mutex_unlock(daemon_mutex); + + return 0; +} + +int complete_run(int ret) +{ + assert(my_run_entry); + + pthread_mutex_lock(daemon_mutex); + my_run_entry->ret = ret; + pthread_mutex_unlock(daemon_mutex); + + pthread_barrier_wait(daemon_barrier); + + return 0; +} + +int wait_completion() +{ + int ret = 0; + + assert(my_run_entry); + + pthread_barrier_wait(daemon_barrier); + + pthread_mutex_lock(daemon_mutex); + ret = my_run_entry->ret; + pthread_mutex_unlock(daemon_mutex); + + return ret; +} + + + + +int do_proxy(struct Args* args) +{ + int ret = 0; + ret = init_daemon(args, 0, false); + if (ret < 0) + goto out; + put_next_run(args); + ret = wait_completion(); + +out: + return ret; +} + +static bool is_daemon = false; +static bool running = false; +static void catch_exit2(int signal) +{ + if (is_daemon && running) + complete_run(-signal); + catch_exit(signal); +} + +int do_daemon(struct Args* args) +{ + is_daemon = true; + + int ret = 0; + struct Args nextargs; + + signal(SIGFPE, catch_exit2); + signal(SIGABRT, catch_exit2); + signal(SIGTERM, catch_exit2); + signal(SIGQUIT, catch_exit2); + signal(SIGSEGV, catch_exit2); + + init_daemon(args, args->num_tasks, true); + + apply_args(args); + init_litmus(); /* does everything init_linux() does, plus litmus stuff */ + + if (args->gpu_using) { + cudaSetDeviceFlags(cudaDeviceScheduleSpin); + init_cuda(args->num_gpu_tasks); + init_events(); + safetynet = true; + } + + do { + bool sync_change = false; + bool gpu_part_change = false; + bool gpu_part_size_change = false; + + xprintf("%d: waiting for work\n", getpid()); + + get_next_run(&nextargs); + + if (nextargs.gpu_using) { + xprintf("%d: gpu using! 
gpu partition = %d, gwcet = %f, send = %lu\n", + getpid(), + nextargs.gpu_partition, + nextargs.gpu_wcet_ms, + nextargs.send_size); + } + + running = true; + sync_change = args->gpu_using && (CUDA_SYNC_MODE != nextargs.sync_mode); + gpu_part_change = args->gpu_using && (GPU_PARTITION != nextargs.gpu_partition); + gpu_part_size_change = args->gpu_using && (GPU_PARTITION_SIZE != nextargs.gpu_partition_size); + + if (sync_change || gpu_part_change || gpu_part_size_change) { + destroy_events(); + if (gpu_part_change || gpu_part_size_change) + exit_cuda(); + } + apply_args(&nextargs); + if (sync_change || gpu_part_change || gpu_part_size_change) { + if (gpu_part_change || gpu_part_size_change) { + xprintf("%d: changing device configuration\n", getpid()); + init_cuda(nextargs.num_gpu_tasks); + CUR_DEVICE = -1; + LAST_DEVICE = -1; + } + init_events(); + } + + xprintf("%d: starting run\n", getpid()); + + ret = __do_normal(&nextargs); + complete_run(ret); + running = false; + }while(ret == 0); + + if (args->gpu_using) { + safetynet = false; + exit_cuda(); + } + + if (args->num_gpu_tasks) + shared_memory_object::remove("gpu_mutex_memory"); + + if (args->num_tasks) + shared_memory_object::remove("gpuspin_daemon_memory"); + + return ret; +} + +#define CPU_OPTIONS "p:z:c:wlveio:f:s:q:X:L:Q:d:" +#define GPU_OPTIONS "g:y:r:C:E:DG:xS:R:T:Z:aFm:b:MNIk:VW:u" +#define PROXY_OPTIONS "B:PA" + +// concat the option strings +#define OPTSTR CPU_OPTIONS GPU_OPTIONS PROXY_OPTIONS + +int main(int argc, char** argv) +{ + struct Args myArgs; + set_defaults(&myArgs); + + eRunMode run_mode = NORMAL; + + int opt; + + progname = argv[0]; + + while ((opt = getopt(argc, argv, OPTSTR)) != -1) { switch (opt) { + case 'B': + myArgs.num_tasks = atoi(optarg); + break; + case 'P': + run_mode = PROXY; + break; + case 'A': + run_mode = DAEMON; + break; + + case 'w': - wait = 1; + myArgs.wait = true; break; case 'p': - cluster = atoi(optarg); - migrate = 1; + myArgs.cluster = atoi(optarg); + myArgs.migrate = true; break; case 'z': - cluster_size = atoi(optarg); - CPU_PARTITION_SIZE = cluster_size; +// CPU_PARTITION_SIZE = cluster_size; + myArgs.cluster_size = atoi(optarg); break; case 'g': - GPU_USING = true; - GPU_PARTITION = atoi(optarg); - assert(GPU_PARTITION >= 0 && GPU_PARTITION < NR_GPUS); +// GPU_USING = true; +// GPU_PARTITION = atoi(optarg); + myArgs.gpu_using = true; + myArgs.gpu_partition = atoi(optarg); +// assert(GPU_PARTITION >= 0 && GPU_PARTITION < NR_GPUS); break; case 'y': - GPU_PARTITION_SIZE = atoi(optarg); - assert(GPU_PARTITION_SIZE > 0); +// GPU_PARTITION_SIZE = atoi(optarg); + myArgs.gpu_partition_size = atoi(optarg); +// assert(GPU_PARTITION_SIZE > 0); break; case 'r': - RHO = atoi(optarg); - assert(RHO > 0); +// RHO = atoi(optarg); + myArgs.rho = atoi(optarg); +// assert(RHO > 0); break; case 'C': - NUM_COPY_ENGINES = atoi(optarg); - assert(NUM_COPY_ENGINES == 1 || NUM_COPY_ENGINES == 2); +// NUM_COPY_ENGINES = atoi(optarg); + myArgs.num_ce = atoi(optarg); +// assert(NUM_COPY_ENGINES == 1 || NUM_COPY_ENGINES == 2); break; case 'V': - RESERVED_MIGR_COPY_ENGINE = true; +// RESERVED_MIGR_COPY_ENGINE = true; + myArgs.reserve_migr_ce = true; break; case 'E': - USE_ENGINE_LOCKS = true; - ENGINE_LOCK_TYPE = (eEngineLockTypes)atoi(optarg); - assert(ENGINE_LOCK_TYPE == FIFO || ENGINE_LOCK_TYPE == PRIOQ); +// USE_ENGINE_LOCKS = true; +// ENGINE_LOCK_TYPE = (eEngineLockTypes)atoi(optarg); + myArgs.use_engine_locks = true; + myArgs.engine_lock_type = (eEngineLockTypes)atoi(optarg); +// assert(ENGINE_LOCK_TYPE == FIFO 
|| ENGINE_LOCK_TYPE == PRIOQ); + break; + case 'u': + myArgs.yield_locks = true; break; case 'D': - USE_DYNAMIC_GROUP_LOCKS = true; +// USE_DYNAMIC_GROUP_LOCKS = true; + myArgs.use_dgls = true; break; case 'G': - GPU_SYNC_MODE = (eGpuSyncMode)atoi(optarg); - assert(GPU_SYNC_MODE >= IKGLP_MODE && GPU_SYNC_MODE <= RGEM_MODE); +// GPU_SYNC_MODE = (eGpuSyncMode)atoi(optarg); + myArgs.gpusync_mode = (eGpuSyncMode)atoi(optarg); +// assert(GPU_SYNC_MODE >= IKGLP_MODE && GPU_SYNC_MODE <= RGEM_MODE); break; case 'a': - ENABLE_AFFINITY = true; +// ENABLE_AFFINITY = true; + myArgs.enable_affinity = true; break; case 'F': - RELAX_FIFO_MAX_LEN = true; +// RELAX_FIFO_MAX_LEN = true; + myArgs.relax_fifo_len = true; break; case 'x': - CUDA_SYNC_MODE = SPIN; +// CUDA_SYNC_MODE = SPIN; + myArgs.sync_mode = SPIN; break; case 'S': - SEND_SIZE = kbToB((size_t)atoi(optarg)); +// SEND_SIZE = kbToB((size_t)atoi(optarg)); + myArgs.send_size = kbToB((size_t)atoi(optarg)); break; case 'R': - RECV_SIZE = kbToB((size_t)atoi(optarg)); +// RECV_SIZE = kbToB((size_t)atoi(optarg)); + myArgs.recv_size = kbToB((size_t)atoi(optarg)); break; case 'T': - STATE_SIZE = kbToB((size_t)atoi(optarg)); +// STATE_SIZE = kbToB((size_t)atoi(optarg)); + myArgs.state_size = kbToB((size_t)atoi(optarg)); break; case 'Z': - ENABLE_CHUNKING = true; - CHUNK_SIZE = kbToB((size_t)atoi(optarg)); +// ENABLE_CHUNKING = true; +// CHUNK_SIZE = kbToB((size_t)atoi(optarg)); + myArgs.enable_chunking = true; + myArgs.chunk_size = kbToB((size_t)atoi(optarg)); break; case 'M': - MIGRATE_VIA_SYSMEM = true; +// MIGRATE_VIA_SYSMEM = true; + myArgs.use_sysmem_migration = true; break; case 'm': - num_gpu_users = (int)atoi(optarg); - assert(num_gpu_users > 0); +// num_gpu_users = (int)atoi(optarg); + myArgs.num_gpu_tasks = (int)atoi(optarg); +// assert(num_gpu_users > 0); break; case 'k': - num_kernels = (unsigned int)atoi(optarg); +// num_kernels = (unsigned int)atoi(optarg); + myArgs.num_kernels = (unsigned int)atoi(optarg); break; case 'b': - budget_ms = atoi(optarg); +// budget_ms = atoi(optarg); + myArgs.budget_ms = atoi(optarg); break; case 'W': - stdpct = atof(optarg); +// stdpct = (double)atof(optarg); + myArgs.stddev = (double)atof(optarg); break; case 'N': - scheduler = LINUX; +// scheduler = LINUX; + myArgs.scheduler = LINUX; break; case 'I': - scheduler = RT_LINUX; +// scheduler = RT_LINUX; + myArgs.scheduler = RT_LINUX; break; case 'q': - priority = atoi(optarg); +// priority = atoi(optarg); + myArgs.priority = atoi(optarg); break; case 'c': - cls = str2class(optarg); - if (cls == -1) - usage("Unknown task class."); +// cls = str2class(optarg); + myArgs.cls = str2class(optarg); break; case 'e': - want_enforcement = true; +// want_enforcement = true; + myArgs.want_enforcement = true; break; case 'i': - want_signals = true; +// want_signals = true; + myArgs.want_signals = true; break; case 'd': - drain = (budget_drain_policy_t)atoi(optarg); - assert(drain >= DRAIN_SIMPLE && drain <= DRAIN_SOBLIV); - assert(drain != DRAIN_SAWARE); // unsupported - break; - case 'l': - test_loop = 1; - break; - case 'o': - column = atoi(optarg); +// drain = (budget_drain_policy_t)atoi(optarg); + myArgs.drain_policy = (budget_drain_policy_t)atoi(optarg); +// assert(drain >= DRAIN_SIMPLE && drain <= DRAIN_SOBLIV); +// assert(drain != DRAIN_SAWARE); // unsupported break; +// case 'l': +// test_loop = 1; +// break; +// case 'o': +//// column = atoi(optarg); +// myArgs.column = atoi(optarg); +// break; // case 'f': // file = optarg; // break; case 's': - scale = 
atof(optarg); +// scale = (double)atof(optarg); + myArgs.scale = (double)atof(optarg); break; // case 'X': // protocol = lock_protocol_for_name(optarg); @@ -1537,304 +2465,33 @@ int main(int argc, char** argv) } } -#ifdef VANILLA_LINUX - assert(scheduler != LITMUS); - assert(!wait); -#endif - - assert(stdpct >= 0.0); - - if (MIGRATE_VIA_SYSMEM) - assert(GPU_PARTITION_SIZE != 1); - - // turn off some features to be safe - if (scheduler != LITMUS) - { - RHO = 0; - USE_ENGINE_LOCKS = false; - USE_DYNAMIC_GROUP_LOCKS = false; - RELAX_FIFO_MAX_LEN = false; - ENABLE_RT_AUX_THREADS = false; - budget_ms = -1.0; - want_enforcement = false; - want_signals = false; - - cjobfn = job_linux; - gjobfn = gpu_job_linux; - - if (scheduler == RT_LINUX) - { - struct sched_param fifoparams; - - assert(priority >= sched_get_priority_min(SCHED_FIFO) && - priority <= sched_get_priority_max(SCHED_FIFO)); - - memset(&fifoparams, 0, sizeof(fifoparams)); - fifoparams.sched_priority = priority; - assert(0 == sched_setscheduler(getpid(), SCHED_FIFO, &fifoparams)); - } - } - else - { - cjobfn = job; - gjobfn = gpu_job; - - if (!litmus_is_valid_fixed_prio(priority)) - usage("Invalid priority."); - } - - if (test_loop) { - debug_delay_loop(); - return 0; - } srand(time(0)); - if (file) { - get_exec_times(file, column, &num_jobs, &exec_times); - - if (argc - optind < 2) - usage("Arguments missing."); - - for (cur_job = 0; cur_job < num_jobs; ++cur_job) { - /* convert the execution time to seconds */ - duration += exec_times[cur_job] * 0.001; - } - } else { - /* - * if we're not reading from the CSV file, then we need - * three parameters - */ - if (argc - optind < 3) - usage("Arguments missing."); - } - if (argc - optind == 3) { - assert(!GPU_USING); - wcet_ms = atof(argv[optind + 0]); - period_ms = atof(argv[optind + 1]); - duration = atof(argv[optind + 2]); + myArgs.wcet_ms = atof(argv[optind + 0]); + myArgs.period_ms = atof(argv[optind + 1]); + myArgs.duration = atof(argv[optind + 2]); } else if (argc - optind == 4) { - assert(GPU_USING); - wcet_ms = atof(argv[optind + 0]); - gpu_wcet_ms = atof(argv[optind + 1]); - period_ms = atof(argv[optind + 2]); - duration = atof(argv[optind + 3]); - } - - wcet = ms2ns(wcet_ms); - period = ms2ns(period_ms); - if (wcet <= 0) - usage("The worst-case execution time must be a " - "positive number."); - if (period <= 0) - usage("The period must be a positive number."); - if (!file && wcet > period) { - usage("The worst-case execution time must not " - "exceed the period."); - } - if (GPU_USING && gpu_wcet_ms <= 0) - usage("The worst-case gpu execution time must be a positive number."); - - if (budget_ms > 0.0) - budget = ms2ns(budget_ms); - else - budget = wcet; - -#if 0 - // use upscale to determine breakdown utilization - // only scaling up CPU time for now. - double upscale = (double)period/(double)budget - 1.0; - upscale = std::min(std::max(0.0, upscale), 0.6); // at most 30% - wcet = wcet + wcet*upscale; - budget = budget + wcet*upscale; - wcet_ms = wcet_ms + wcet_ms*upscale; - - // fucking floating point - if (budget < wcet) - budget = wcet; - if (budget > period) - budget = period; -#endif - - // randomize execution time according to a normal distribution - // centered around the desired execution time. 
- // standard deviation is a percentage of this average - wcet_dist_ms = new Normal<double>(wcet_ms + gpu_wcet_ms, (wcet_ms + gpu_wcet_ms) * stdpct); - wcet_dist_ms->seed((unsigned int)time(0)); - - if (file && num_jobs > 1) - duration += period_ms * 0.001 * (num_jobs - 1); - - if (migrate) { - ret = be_migrate_to_cluster(cluster, cluster_size); - if (ret < 0) - bail_out("could not migrate to target partition or cluster."); - } - - if (scheduler != LITMUS) - { - // set some variables needed by linux modes - if (GPU_USING) - { - TRACE_MIGRATIONS = true; - } - periodTime.tv_sec = period / s2ns(1); - periodTime.tv_nsec = period - periodTime.tv_sec * s2ns(1); - period_ns = period; - } - - init_rt_task_param(¶m); - param.exec_cost = budget; - param.period = period; - param.priority = priority; - param.cls = cls; - param.budget_policy = (want_enforcement) ? - PRECISE_ENFORCEMENT : NO_ENFORCEMENT; - param.budget_signal_policy = (want_enforcement && want_signals) ? - PRECISE_SIGNALS : NO_SIGNALS; - param.drain_policy = drain; - param.release_policy = PERIODIC; - - if (migrate) - param.cpu = cluster_to_first_cpu(cluster, cluster_size); - ret = set_rt_task_param(gettid(), ¶m); - if (ret < 0) - bail_out("could not setup rt task params"); - - if (scheduler == LITMUS) { - init_litmus(); - } - else { - init_linux(); + myArgs.wcet_ms = atof(argv[optind + 0]); + myArgs.gpu_wcet_ms = atof(argv[optind + 1]); + myArgs.period_ms = atof(argv[optind + 2]); + myArgs.duration = atof(argv[optind + 3]); } - if (want_signals) { - /* bind default longjmp signal handler to SIG_BUDGET. */ - activate_litmus_signals(SIG_BUDGET_MASK, longjmp_on_litmus_signal); + if (myArgs.num_tasks == 0 || myArgs.num_gpu_tasks == 0) { + // safety w.r.t. shared mem. + sleep(2); } -// if (protocol >= 0) { -// /* open reference to semaphore */ -// lock_od = litmus_open_lock(protocol, resource_id, lock_namespace, &cluster); -// if (lock_od < 0) { -// perror("litmus_open_lock"); -// usage("Could not open lock."); -// } -// } - - if (GPU_USING) { - allocate_locks(num_gpu_users, scheduler != LITMUS); - - signal(SIGABRT, catch_exit); - signal(SIGTERM, catch_exit); - signal(SIGQUIT, catch_exit); - signal(SIGSEGV, catch_exit); - - init_cuda(num_gpu_users); - safetynet = true; + if (run_mode == NORMAL) { + return do_normal(&myArgs); } - - if (scheduler == LITMUS) - { - ret = task_mode(LITMUS_RT_TASK); - if (ret != 0) - bail_out("could not become RT task"); + else if (run_mode == PROXY) { + return do_proxy(&myArgs); } - else - { - trace_name(); - trace_param(); + else if (run_mode == DAEMON) { + return do_daemon(&myArgs); } - - if (wait) { - ret = wait_for_ts_release2(&releaseTime); - if (ret != 0) - bail_out("wait_for_ts_release2()"); - - if (scheduler != LITMUS) - log_release(); - } - else if (scheduler != LITMUS) - { - clock_gettime(CLOCK_MONOTONIC, &releaseTime); - sleep_next_period_linux(); - } - - if (scheduler == LITMUS && GPU_USING && ENABLE_RT_AUX_THREADS) { - if (enable_aux_rt_tasks(AUX_CURRENT | AUX_FUTURE) != 0) - bail_out("enable_aux_rt_tasks() failed"); - } - - start = wctime(); - - if (!GPU_USING) { - bool keepgoing; - do - { - double job_ms = wcet_dist_ms->random(); - if (job_ms < 0.0) - job_ms = 0.0; - keepgoing = cjobfn(ms2s(job_ms * scale), start + duration); - }while(keepgoing); - } - else { - bool keepgoing; - do - { - double job_ms = wcet_dist_ms->random(); - if (job_ms < 0.0) - job_ms = 0.0; - - double cpu_job_ms = (job_ms/(wcet_ms + gpu_wcet_ms))*wcet_ms; - double gpu_job_ms = (job_ms/(wcet_ms + gpu_wcet_ms))*gpu_wcet_ms; - 
keepgoing = gjobfn( - ms2s(cpu_job_ms * scale), - ms2s(gpu_job_ms * scale), - num_kernels, - start + duration); - }while(keepgoing); - } - - if (GPU_USING && ENABLE_RT_AUX_THREADS) - if (disable_aux_rt_tasks(AUX_CURRENT | AUX_FUTURE) != 0) - bail_out("disable_aux_rt_tasks() failed"); - -// if (file) { -// /* use times read from the CSV file */ -// for (cur_job = 0; cur_job < num_jobs; ++cur_job) { -// /* convert job's length to seconds */ -// job(exec_times[cur_job] * 0.001 * scale, -// start + duration, -// lock_od, cs_length * 0.001); -// } -// } else { -// /* convert to seconds and scale */ -// while (job(wcet_ms * 0.001 * scale, start + duration, -// lock_od, cs_length * 0.001)); -// } - - if (scheduler == LITMUS) - { - ret = task_mode(BACKGROUND_TASK); - if (ret != 0) - bail_out("could not become regular task (huh?)"); - } - - if (GPU_USING) { - safetynet = false; - exit_cuda(); - - -// printf("avg: %f\n", ms_sum/gpucount); - } - - if (wcet_dist_ms) - delete wcet_dist_ms; - - if (file) - free(exec_times); - - return 0; } diff --git a/include/litmus.h b/include/litmus.h index d3b89cf..a6c2b13 100644 --- a/include/litmus.h +++ b/include/litmus.h @@ -89,6 +89,7 @@ int litmus_open_lock( /* real-time locking protocol support */ int litmus_lock(int od); int litmus_unlock(int od); +int litmus_should_yield_lock(int od); /* Dynamic group lock support. ods arrays MUST BE PARTIALLY ORDERED!!!!!! * Use the same ordering for lock and unlock. @@ -99,6 +100,7 @@ int litmus_unlock(int od); */ int litmus_dgl_lock(int* ods, int dgl_size); int litmus_dgl_unlock(int* ods, int dgl_size); +int litmus_dgl_should_yield_lock(int* ods, int dgl_size); /* nvidia graphics cards */ int register_nv_device(int nv_device_id); diff --git a/src/kernel_iface.c b/src/kernel_iface.c index e446102..73d398f 100644 --- a/src/kernel_iface.c +++ b/src/kernel_iface.c @@ -80,7 +80,7 @@ int get_nr_ts_release_waiters(void) } /* thread-local pointer to control page */ -static __thread struct control_page *ctrl_page; +static __thread struct control_page *ctrl_page = NULL; int init_kernel_iface(void) { diff --git a/src/syscalls.c b/src/syscalls.c index d3ca5d8..ff02b7d 100644 --- a/src/syscalls.c +++ b/src/syscalls.c @@ -58,6 +58,11 @@ int litmus_unlock(int od) return syscall(__NR_litmus_unlock, od); } +int litmus_should_yield_lock(int od) +{ + return syscall(__NR_litmus_should_yield_lock, od); +} + int litmus_dgl_lock(int *ods, int dgl_size) { return syscall(__NR_litmus_dgl_lock, ods, dgl_size); @@ -68,6 +73,11 @@ int litmus_dgl_unlock(int *ods, int dgl_size) return syscall(__NR_litmus_dgl_unlock, ods, dgl_size); } +int litmus_dgl_should_yield_lock(int *ods, int dgl_size) +{ + return syscall(__NR_litmus_dgl_should_yield_lock, ods, dgl_size); +} + int get_job_no(unsigned int *job_no) { return syscall(__NR_query_job_no, job_no); -- cgit v1.2.2
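
Note on the daemon loop above: CUDA state is torn down and rebuilt only when the next job's sync mode or GPU partition differs from the current configuration, so back-to-back runs with identical settings stay cheap. Below is a minimal compilable sketch of that reconfigure-only-on-change pattern; the struct and helper names are hypothetical stand-ins for gpuspin's globals and its destroy_events()/init_events()/exit_cuda()/init_cuda() routines.

#include <cstdio>

struct gpu_config { int partition; int partition_size; int sync_mode; };

static void teardown_events(void) { std::puts("events destroyed"); }
static void setup_events(void)    { std::puts("events created"); }
static void teardown_device(void) { std::puts("device released"); }
static void setup_device(void)    { std::puts("device initialized"); }

static void reconfigure(gpu_config *cur, const gpu_config *next)
{
	bool dev_change  = cur->partition != next->partition ||
	                   cur->partition_size != next->partition_size;
	bool sync_change = cur->sync_mode != next->sync_mode;

	if (dev_change || sync_change) {
		teardown_events();         /* events depend on the sync mode */
		if (dev_change)
			teardown_device(); /* full teardown only if the GPU set changes */
	}
	*cur = *next;
	if (dev_change)
		setup_device();
	if (dev_change || sync_change)
		setup_events();
}

int main(void)
{
	gpu_config cur = {0, 1, 0}, next = {1, 1, 0};
	reconfigure(&cur, &next);  /* partition changed: full device re-init */
	return 0;
}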
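
On shutdown, the daemon removes its boost::interprocess segments ("gpu_mutex_memory", "gpuspin_daemon_memory") by name. The sketch below shows the named-segment lifecycle those shared_memory_object::remove() calls rely on; the "task_count" payload and the segment size are illustrative assumptions, not gpuspin's actual layout.

#include <boost/interprocess/managed_shared_memory.hpp>
#include <boost/interprocess/shared_memory_object.hpp>
#include <cstdio>

using namespace boost::interprocess;

int main(void)
{
	const char *name = "gpuspin_daemon_memory";

	shared_memory_object::remove(name);  /* clean up any stale segment */
	{
		managed_shared_memory seg(create_only, name, 4096);
		int *count = seg.construct<int>("task_count")(0);
		++*count;
		std::printf("tasks registered: %d\n", *count);
	}
	shared_memory_object::remove(name);  /* daemon-side teardown */
	return 0;
}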
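
The per-job logic removed from main() above (presumably relocated into the new __do_normal() path) samples each job's length from a normal distribution centered on wcet_ms + gpu_wcet_ms, clamps negative samples to zero, and splits the result between CPU and GPU phases in proportion to their budgets. A self-contained sketch of that computation, substituting std::normal_distribution for the ranlib Normal<double> that gpuspin links against:

#include <algorithm>
#include <cstdio>
#include <random>

int main(void)
{
	const double wcet_ms = 8.0, gpu_wcet_ms = 2.0, stddev_pct = 0.25;
	const double total_ms = wcet_ms + gpu_wcet_ms;

	std::mt19937 gen(std::random_device{}());
	std::normal_distribution<double> wcet_dist(total_ms, total_ms * stddev_pct);

	double job_ms = std::max(0.0, wcet_dist(gen)); /* clamp negative samples */
	/* split the sampled time in proportion to the CPU and GPU budgets */
	double cpu_job_ms = (job_ms / total_ms) * wcet_ms;
	double gpu_job_ms = (job_ms / total_ms) * gpu_wcet_ms;

	std::printf("job: %.3f ms = %.3f cpu + %.3f gpu\n",
	            job_ms, cpu_job_ms, gpu_job_ms);
	return 0;
}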
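
litmus_should_yield_lock() and litmus_dgl_should_yield_lock(), added to include/litmus.h and src/syscalls.c above, are thin syscall wrappers that let a lock holder ask the kernel whether releasing early would benefit a queued waiter; this is the engine-yield logic that gpuspin's new -u option (yield_locks) switches on. A hedged usage sketch follows: the return-value semantics assumed here (positive means a yield is advisable, negative is an error) are not spelled out in this patch, do_one_chunk() is a hypothetical unit of preemptible work, and the litmus_open_lock() setup that produces lock_od is elided.

#include "litmus.h"

/* Placeholder for one preemptible unit of work (e.g., one memcpy chunk). */
static void do_one_chunk(int i) { (void)i; }

void chunked_critical_section(int lock_od, int nchunks)
{
	int i;
	litmus_lock(lock_od);
	for (i = 0; i < nchunks; i++) {
		do_one_chunk(i);
		/* Assumed semantics: > 0 means a waiter is queued and an
		 * early release is advisable. */
		if (i + 1 < nchunks && litmus_should_yield_lock(lock_od) > 0) {
			litmus_unlock(lock_od); /* let the waiter in ... */
			litmus_lock(lock_od);   /* ... then re-queue for the rest */
		}
	}
	litmus_unlock(lock_od);
}

For locks acquired together with litmus_dgl_lock(), the DGL variant plays the same role over the whole partially ordered ods array.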