From 992ce8df6eae19c6826018d62cb337fbc632de75 Mon Sep 17 00:00:00 2001 From: Glenn Elliott <gelliott@cs.unc.edu> Date: Wed, 15 May 2013 02:20:14 -0400 Subject: signal handling in gpuspin --- gpu/gpuspin.cu | 192 +++++++++++++++++++++++++++++++++++++++++++++------------ src/signal.c | 2 + 2 files changed, 156 insertions(+), 38 deletions(-) diff --git a/gpu/gpuspin.cu b/gpu/gpuspin.cu index 304d937..8a9b717 100644 --- a/gpu/gpuspin.cu +++ b/gpu/gpuspin.cu @@ -8,6 +8,8 @@ #include <assert.h> #include <execinfo.h> +#include <exception> + #include <boost/interprocess/managed_shared_memory.hpp> #include <boost/interprocess/sync/interprocess_mutex.hpp> #include <boost/filesystem.hpp> @@ -39,6 +41,7 @@ const unsigned int CE_RECV_START = 400; const unsigned int CE_RECV_END = 401; bool SILENT = true; +//bool SILENT = false; inline int xprintf(const char *format, ...) { int ret = 0; @@ -56,6 +59,19 @@ const size_t PAGE_SIZE = sysconf(_SC_PAGESIZE); const int NR_GPUS = 8; +bool WANT_SIGNALS = false; +inline void gpuspin_block_litmus_signals(unsigned long mask) +{ + if (WANT_SIGNALS) + block_litmus_signals(mask); +} + +inline void gpuspin_unblock_litmus_signals(unsigned long mask) +{ + if (WANT_SIGNALS) + unblock_litmus_signals(mask); +} + bool GPU_USING = false; bool ENABLE_AFFINITY = false; bool RELAX_FIFO_MAX_LEN = false; @@ -305,16 +321,24 @@ struct ce_lock_state void lock() { if(locks[0] == locks[1]) crash(); - if (num_locks == 1) + if (num_locks == 1) { + gpuspin_block_litmus_signals(ALL_LITMUS_SIG_MASKS); litmus_lock(locks[0]); - else if(USE_DYNAMIC_GROUP_LOCKS) + gpuspin_unblock_litmus_signals(ALL_LITMUS_SIG_MASKS); + } + else if(USE_DYNAMIC_GROUP_LOCKS) { + gpuspin_block_litmus_signals(ALL_LITMUS_SIG_MASKS); litmus_dgl_lock(locks, num_locks); + gpuspin_unblock_litmus_signals(ALL_LITMUS_SIG_MASKS); + } else { + gpuspin_block_litmus_signals(ALL_LITMUS_SIG_MASKS); for(int l = 0; l < num_locks; ++l) { litmus_lock(locks[l]); } + gpuspin_unblock_litmus_signals(ALL_LITMUS_SIG_MASKS); } locked = true; } @@ -322,17 +346,25 @@ struct ce_lock_state void unlock() { if(locks[0] == locks[1]) crash(); - if (num_locks == 1) + if (num_locks == 1) { + gpuspin_block_litmus_signals(ALL_LITMUS_SIG_MASKS); litmus_unlock(locks[0]); - else if(USE_DYNAMIC_GROUP_LOCKS) + gpuspin_unblock_litmus_signals(ALL_LITMUS_SIG_MASKS); + } + else if(USE_DYNAMIC_GROUP_LOCKS) { + gpuspin_block_litmus_signals(ALL_LITMUS_SIG_MASKS); litmus_dgl_unlock(locks, num_locks); + gpuspin_unblock_litmus_signals(ALL_LITMUS_SIG_MASKS); + } else { + gpuspin_block_litmus_signals(ALL_LITMUS_SIG_MASKS); // reverse order for(int l = num_locks - 1; l >= 0; --l) { litmus_unlock(locks[l]); } + gpuspin_unblock_litmus_signals(ALL_LITMUS_SIG_MASKS); } locked = false; } @@ -391,14 +423,14 @@ static cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count, if (!state->budgetIsAvailable(bytesToCopy)) { // optimization - don't unlock if no one else needs the engine if (state->should_yield()) { - //cudaStreamSynchronize(STREAMS[CUR_DEVICE]); + gpuspin_block_litmus_signals(ALL_LITMUS_SIG_MASKS); cudaEventSynchronize(EVENTS[CUR_DEVICE]); ret = cudaGetLastError(); - if (kind == cudaMemcpyDeviceToHost || kind == cudaMemcpyDeviceToDevice) inject_action(CE_RECV_END); if (kind == cudaMemcpyHostToDevice) inject_action(CE_SEND_END); + gpuspin_unblock_litmus_signals(ALL_LITMUS_SIG_MASKS); state->unlock(); if(ret != cudaSuccess) @@ -422,8 +454,10 @@ static cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count, } //ret = cudaMemcpy(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind); + gpuspin_block_litmus_signals(ALL_LITMUS_SIG_MASKS); cudaMemcpyAsync(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind, STREAMS[CUR_DEVICE]); cudaEventRecord(EVENTS[CUR_DEVICE], STREAMS[CUR_DEVICE]); + gpuspin_unblock_litmus_signals(ALL_LITMUS_SIG_MASKS); if(state) state->decreaseBudget(bytesToCopy); @@ -444,10 +478,11 @@ static cudaError_t chunkMemcpy(void* a_dst, const void* a_src, size_t count, cudaError_t ret; if(!do_locking || device_a == -1) { ret = __chunkMemcpy(a_dst, a_src, count, kind, NULL); + gpuspin_block_litmus_signals(ALL_LITMUS_SIG_MASKS); cudaEventSynchronize(cur_event()); -// cudaStreamSynchronize(cur_stream()); if(ret == cudaSuccess) ret = cudaGetLastError(); + gpuspin_unblock_litmus_signals(ALL_LITMUS_SIG_MASKS); } else { ce_lock_state state(device_a, kind, count, device_b, migration); @@ -459,6 +494,7 @@ static cudaError_t chunkMemcpy(void* a_dst, const void* a_src, size_t count, inject_action(CE_SEND_START); ret = __chunkMemcpy(a_dst, a_src, count, kind, &state); + gpuspin_block_litmus_signals(ALL_LITMUS_SIG_MASKS); cudaEventSynchronize(cur_event()); // cudaStreamSynchronize(cur_stream()); if(ret == cudaSuccess) @@ -468,6 +504,7 @@ static cudaError_t chunkMemcpy(void* a_dst, const void* a_src, size_t count, inject_action(CE_RECV_END); if (kind == cudaMemcpyHostToDevice) inject_action(CE_SEND_END); + gpuspin_unblock_litmus_signals(ALL_LITMUS_SIG_MASKS); state.unlock(); } @@ -963,11 +1000,11 @@ static void init_cuda(const int num_gpu_users) } catch(std::exception &e) { - printf("caught an exception during initializiation!: %s\n", e.what()); + fprintf(stderr, "caught an exception during initializiation!: %s\n", e.what()); } catch(...) { - printf("caught unknown exception.\n"); + fprintf(stderr, "caught unknown exception.\n"); } gpu_mgmt_mutexes[which].unlock(); @@ -1185,63 +1222,109 @@ __global__ void docudaspin(float* data, /*unsigned int* iterations,*/ unsigned i static void gpu_loop_for(double gpu_sec_time, unsigned int num_kernels, double emergency_exit) { int next_gpu; + bool ee_locked = false; + bool early_exit = false; if (gpu_sec_time <= 0.0) goto out; if (emergency_exit && wctime() > emergency_exit) goto out; + gpuspin_block_litmus_signals(ALL_LITMUS_SIG_MASKS); next_gpu = litmus_lock(TOKEN_LOCK); inject_action(TOKEN_START); + gpuspin_unblock_litmus_signals(ALL_LITMUS_SIG_MASKS); + + LITMUS_TRY { + gpuspin_block_litmus_signals(ALL_LITMUS_SIG_MASKS); MigrateIfNeeded(next_gpu); + gpuspin_unblock_litmus_signals(ALL_LITMUS_SIG_MASKS); + unsigned int numcycles = ((unsigned int)(cur_hz() * gpu_sec_time))/num_kernels; if(SEND_SIZE > 0) chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE, cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks()); - bool locked = false; for(unsigned int i = 0; i < num_kernels; ++i) { - if(useEngineLocks() && !locked) { + gpuspin_block_litmus_signals(ALL_LITMUS_SIG_MASKS); + + if(useEngineLocks() && !ee_locked) { litmus_lock(cur_ee()); inject_action(EE_START); - locked = true; + ee_locked = true; } - /* one block per sm, one warp per block */ docudaspin <<<cur_sms(), cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles); - if(useEngineLocks() && (!YIELD_LOCKS || (YIELD_LOCKS && litmus_should_yield_lock(cur_ee())))) { // cudaStreamSynchronize(cur_stream()); cudaEventRecord(cur_event(), cur_stream()); cudaEventSynchronize(cur_event()); inject_action(EE_END); litmus_unlock(cur_ee()); - locked = false; + ee_locked = false; } + gpuspin_unblock_litmus_signals(ALL_LITMUS_SIG_MASKS); } - if (locked) { + + if (ee_locked) { + gpuspin_block_litmus_signals(ALL_LITMUS_SIG_MASKS); + cudaEventRecord(cur_event(), cur_stream()); cudaEventSynchronize(cur_event()); inject_action(EE_END); litmus_unlock(cur_ee()); - locked = false; + + gpuspin_unblock_litmus_signals(ALL_LITMUS_SIG_MASKS); + ee_locked = false; } if(RECV_SIZE > 0) chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE, cudaMemcpyDeviceToHost, CUR_DEVICE, useEngineLocks()); - if (MIGRATE_VIA_SYSMEM) + if (MIGRATE_VIA_SYSMEM) { + gpuspin_block_litmus_signals(ALL_LITMUS_SIG_MASKS); PullState(); + gpuspin_unblock_litmus_signals(ALL_LITMUS_SIG_MASKS); + } + } + LITMUS_CATCH(SIG_BUDGET) + { + cudaEventRecord(cur_event(), cur_stream()); + cudaEventSynchronize(cur_event()); + + if (useEngineLocks()) { + /* unlock all engine locks. will fail safely if not held */ + litmus_unlock(cur_ee()); + if (NUM_COPY_ENGINES == 1) { + litmus_unlock(cur_send()); + } + else if (RESERVED_MIGR_COPY_ENGINE) { + litmus_unlock(cur_send()); + litmus_unlock(cur_migr_send()); + } + else { + litmus_unlock(cur_send()); + litmus_unlock(cur_recv()); + } + } + early_exit = true; } + END_LITMUS_TRY + + gpuspin_block_litmus_signals(ALL_LITMUS_SIG_MASKS); inject_action(TOKEN_END); litmus_unlock(TOKEN_LOCK); + gpuspin_unblock_litmus_signals(ALL_LITMUS_SIG_MASKS); last_gpu() = cur_gpu(); + if (early_exit) + throw std::exception(); + out: return; } @@ -1463,9 +1546,24 @@ static bool gpu_job(double exec_time, double gpu_exec_time, unsigned int num_ker chunk1 = exec_time * drand48(); chunk2 = exec_time - chunk1; - loop_for(chunk1, program_end + 1); - gpu_loop_for(gpu_exec_time, num_kernels, program_end + 1); - loop_for(chunk2, program_end + 1); + LITMUS_TRY + { + try + { + loop_for(chunk1, program_end + 1); + gpu_loop_for(gpu_exec_time, num_kernels, program_end + 1); + loop_for(chunk2, program_end + 1); + } + catch(std::exception& e) + { + xprintf("%d: ran out of time while using GPU\n", gettid()); + } + } + LITMUS_CATCH(SIG_BUDGET) + { + xprintf("%d: ran out of time\n", gettid()); + } + END_LITMUS_TRY sleep_next_period(); } @@ -1478,7 +1576,15 @@ static bool job(double exec_time, double program_end) return false; } else { - loop_for(exec_time, program_end + 1); + LITMUS_TRY + { + loop_for(exec_time, program_end + 1); + } + LITMUS_CATCH(SIG_BUDGET) + { + xprintf("%d: ran out of time\n", gettid()); + } + END_LITMUS_TRY sleep_next_period(); } return true; @@ -1741,6 +1847,8 @@ void apply_args(struct Args* args) else if (args->scheduler == LITMUS) TRACE_MIGRATIONS = false; + WANT_SIGNALS = args->want_signals; + // roll back other globals to an initial state CUR_DEVICE = -1; LAST_DEVICE = -1; @@ -1773,7 +1881,6 @@ int __do_normal(struct Args* args) USE_DYNAMIC_GROUP_LOCKS = false; RELAX_FIFO_MAX_LEN = false; ENABLE_RT_AUX_THREADS = false; - args->budget_ms = -1.0; args->want_enforcement = false; args->want_signals = false; @@ -1788,23 +1895,24 @@ int __do_normal(struct Args* args) wcet = ms2ns(args->wcet_ms); period = ms2ns(args->period_ms); + if (wcet <= 0) { - printf("The worst-case execution time must be a positive number.\n"); + fprintf(stderr, "The worst-case execution time must be a positive number.\n"); ret = -1; goto out; } if (period <= 0) { - printf("The period must be a positive number.\n"); + fprintf(stderr, "The period must be a positive number.\n"); ret = -1; goto out; } if (wcet > period) { - printf("The worst-case execution time must not exceed the period.\n"); + fprintf(stderr, "The worst-case execution time must not exceed the period.\n"); ret = -1; goto out; } if (args->gpu_using && args->gpu_wcet_ms <= 0) { - printf("The worst-case gpu execution time must be a positive number.\n"); + fprintf(stderr, "The worst-case gpu execution time must be a positive number.\n"); ret = -1; goto out; } @@ -1812,7 +1920,7 @@ int __do_normal(struct Args* args) if (args->budget_ms > 0.0) budget = ms2ns(args->budget_ms); else - budget = args->wcet_ms; + budget = wcet; // randomize execution time according to a normal distribution // centered around the desired execution time. @@ -1822,7 +1930,7 @@ int __do_normal(struct Args* args) ret = be_migrate_all_to_cluster(args->cluster, args->cluster_size); if (ret < 0) { - printf("could not migrate to target partition or cluster.\n"); + fprintf(stderr, "could not migrate to target partition or cluster.\n"); goto out; } @@ -1844,9 +1952,10 @@ int __do_normal(struct Args* args) param.cls = args->cls; param.budget_policy = (args->want_enforcement) ? PRECISE_ENFORCEMENT : NO_ENFORCEMENT; - param.budget_signal_policy = (args->want_enforcement && args->want_signals) ? + param.budget_signal_policy = (args->want_signals) ? PRECISE_SIGNALS : NO_SIGNALS; param.drain_policy = args->drain_policy; + param.drain_policy = args->drain_policy; param.release_policy = PERIODIC; param.cpu = cluster_to_first_cpu(args->cluster, args->cluster_size); @@ -1869,7 +1978,7 @@ int __do_normal(struct Args* args) { ret = task_mode(LITMUS_RT_TASK); if (ret < 0) { - printf("could not become RT task\n"); + fprintf(stderr, "could not become RT task\n"); goto out; } } @@ -1882,7 +1991,7 @@ int __do_normal(struct Args* args) fifoparams.sched_priority = args->priority; ret = sched_setscheduler(getpid(), SCHED_FIFO, &fifoparams); if (ret < 0) { - printf("could not become sched_fifo task\n"); + fprintf(stderr, "could not become sched_fifo task\n"); goto out; } } @@ -1911,14 +2020,14 @@ int __do_normal(struct Args* args) if (args->scheduler == LITMUS) { ret = enable_aux_rt_tasks(AUX_CURRENT | AUX_FUTURE); if (ret != 0) { - printf("enable_aux_rt_tasks() failed\n"); + fprintf(stderr, "enable_aux_rt_tasks() failed\n"); goto out; } } else if (args->scheduler == RT_LINUX) { ret = enable_aux_rt_tasks_linux(gettid()); if (ret != 0) { - printf("enable_aux_rt_tasks_linux() failed\n"); + fprintf(stderr, "enable_aux_rt_tasks_linux() failed\n"); goto out; } } @@ -1954,18 +2063,22 @@ int __do_normal(struct Args* args) }while(keepgoing); } + if (args->want_signals) + ignore_litmus_signals(SIG_BUDGET_MASK); + + if (args->gpu_using && ENABLE_RT_AUX_THREADS) { if (args->scheduler == LITMUS) { ret = disable_aux_rt_tasks(AUX_CURRENT | AUX_FUTURE); if (ret != 0) { - printf("disable_aux_rt_tasks() failed\n"); + fprintf(stderr, "disable_aux_rt_tasks() failed\n"); goto out; } } else if(args->scheduler == RT_LINUX) { ret = disable_aux_rt_tasks_linux(gettid()); if (ret != 0) { - printf("disable_aux_rt_tasks_linux() failed\n"); + fprintf(stderr, "disable_aux_rt_tasks_linux() failed\n"); goto out; } } @@ -1978,7 +2091,7 @@ int __do_normal(struct Args* args) { ret = task_mode(BACKGROUND_TASK); if (ret != 0) { - printf("could not become regular task (huh?)\n"); + fprintf(stderr, "could not become regular task (huh?)\n"); goto out; } } @@ -1989,7 +2102,7 @@ int __do_normal(struct Args* args) memset(&normalparams, 0, sizeof(normalparams)); ret = sched_setscheduler(getpid(), SCHED_OTHER, &normalparams); if (ret < 0) { - printf("could not become sched_normal task\n"); + fprintf(stderr, "could not become sched_normal task\n"); goto out; } } @@ -2532,6 +2645,9 @@ int main(int argc, char** argv) sleep(2); } + /* make sure children don't take sigmasks */ + ignore_litmus_signals(ALL_LITMUS_SIG_MASKS); + if (run_mode == NORMAL) { return do_normal(&myArgs); } diff --git a/src/signal.c b/src/signal.c index 397a797..1bd0f62 100644 --- a/src/signal.c +++ b/src/signal.c @@ -99,9 +99,11 @@ void longjmp_on_litmus_signal(int signum) lit_env = pop_sigjmp(); if (lit_env) { /* What you say?! */ + //printf("%d: we get signal = %d!\n", gettid(), signum); siglongjmp(lit_env->env, signum); /* restores signal mask */ } else { /* silently ignore the signal */ + //printf("%d: silently ignoring signal.\n", gettid()); } } -- cgit v1.2.2