author    Glenn Elliott <gelliott@cs.unc.edu>    2013-04-14 15:06:43 -0400
committer Glenn Elliott <gelliott@cs.unc.edu>    2013-04-14 15:06:43 -0400
commit    37b4a24ba84f1dffd680fd550a3d8cad2ac5e3a8 (patch)
tree      5dc5e56a7a4f424e75f59f7705263bdb43b86fb3
parent    209f1961ea2d5863d6f2d2e9d2323446ee5e53c4 (diff)

Implemented gpusync rtspin.

-rw-r--r--  Makefile                  |   51
-rw-r--r--  gpu/budget.cpp            |  143
-rw-r--r--  gpu/gpuspin.cu            | 1720
-rw-r--r--  gpu/rtspin_fake_cuda.cpp  | 1187
-rw-r--r--  include/common.h          |    7
5 files changed, 1909 insertions, 1199 deletions
diff --git a/Makefile b/Makefile
index a8e528e..720a585 100644
--- a/Makefile
+++ b/Makefile
@@ -24,6 +24,12 @@ flags-debug-cpp = -O2 -Wall -Werror -g
24flags-api = -D_XOPEN_SOURCE=600 -D_GNU_SOURCE 24flags-api = -D_XOPEN_SOURCE=600 -D_GNU_SOURCE
25flags-misc = -fasynchronous-unwind-tables -fnon-call-exceptions 25flags-misc = -fasynchronous-unwind-tables -fnon-call-exceptions
26 26
27flags-cu-debug = -g -G -Xcompiler -Wall -Xcompiler -Werror
28flags-cu-optim = -O3 -Xcompiler -march=native
29flags-cu-nvcc = --use_fast_math -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30
30flags-cu-misc = -Xcompiler -fasynchronous-unwind-tables -Xcompiler -fnon-call-exceptions -Xcompiler -malign-double -Xcompiler -pthread
31flags-cu-x86_64 = -m64
32
27# architecture-specific flags 33# architecture-specific flags
28flags-i386 = -m32 34flags-i386 = -m32
29flags-x86_64 = -m64 35flags-x86_64 = -m64
@@ -51,12 +57,19 @@ headers = -I${LIBLITMUS}/include -I${LIBLITMUS}/arch/${include-${ARCH}}/include
51 57
52# combine options 58# combine options
53CPPFLAGS = ${flags-api} ${flags-debug-cpp} ${flags-misc} ${flags-${ARCH}} -DARCH=${ARCH} ${headers} 59CPPFLAGS = ${flags-api} ${flags-debug-cpp} ${flags-misc} ${flags-${ARCH}} -DARCH=${ARCH} ${headers}
60#CUFLAGS = ${flags-api} ${flags-cu-debug} ${flags-cu-optim} ${flags-cu-nvcc} ${flags-cu-misc} -DARCH=${ARCH} ${headers}
61CUFLAGS = ${flags-api} ${flags-cu-optim} ${flags-cu-nvcc} ${flags-cu-misc} -DARCH=${ARCH} ${headers}
54CFLAGS = ${flags-debug} ${flags-misc} 62CFLAGS = ${flags-debug} ${flags-misc}
55LDFLAGS = ${flags-${ARCH}} 63LDFLAGS = ${flags-${ARCH}}
56 64
57# how to link against liblitmus 65# how to link against liblitmus
58liblitmus-flags = -L${LIBLITMUS} -llitmus 66liblitmus-flags = -L${LIBLITMUS} -llitmus
59 67
68# how to link cuda
69cuda-flags-i386 = -L/usr/local/cuda/lib
70cuda-flags-x86_64 = -L/usr/local/cuda/lib64
71cuda-flags = ${cuda-flags-${ARCH}} -lcudart
72
60# Force gcc instead of cc, but let the user specify a more specific version if 73# Force gcc instead of cc, but let the user specify a more specific version if
61# desired. 74# desired.
62ifeq (${CC},cc) 75ifeq (${CC},cc)
@@ -67,20 +80,24 @@ endif
67CPP = g++ 80CPP = g++
68#endif 81#endif
69 82
83CU = nvcc
84
70# incorporate cross-compiler (if any) 85# incorporate cross-compiler (if any)
71CC := ${CROSS_COMPILE}${CC} 86CC := ${CROSS_COMPILE}${CC}
72CPP := ${CROSS_COMPILE}${CPP} 87CPP := ${CROSS_COMPILE}${CPP}
73LD := ${CROSS_COMPILE}${LD} 88LD := ${CROSS_COMPILE}${LD}
74AR := ${CROSS_COMPILE}${AR} 89AR := ${CROSS_COMPILE}${AR}
90CU := ${CROSS_COMPILE}${CU}
75 91
76# ############################################################################## 92# ##############################################################################
77# Targets 93# Targets
78 94
79all = lib ${rt-apps} ${rt-cppapps} 95all = lib ${rt-apps} ${rt-cppapps} ${rt-cuapps}
80rt-apps = cycles base_task rt_launch rtspin release_ts measure_syscall \ 96rt-apps = cycles base_task rt_launch rtspin release_ts measure_syscall \
81 base_mt_task uncache runtests \ 97 base_mt_task uncache runtests \
82 nested locktest ikglptest dgl aux_threads normal_task 98 nested locktest ikglptest dgl aux_threads normal_task
83rt-cppapps = budget 99rt-cppapps = budget
100rt-cuapps = gpuspin
84 101
85.PHONY: all lib clean dump-config TAGS tags cscope help 102.PHONY: all lib clean dump-config TAGS tags cscope help
86 103
@@ -95,10 +112,14 @@ inc/config.makefile: Makefile
95 @printf "%-15s= %-20s\n" \ 112 @printf "%-15s= %-20s\n" \
96 ARCH ${ARCH} \ 113 ARCH ${ARCH} \
97 CFLAGS '${CFLAGS}' \ 114 CFLAGS '${CFLAGS}' \
115 CPPFLAGS '${CPPFLAGS}' \
116 CUFLAGS '${CUFLAGS}' \
98 LDFLAGS '${LDFLAGS}' \ 117 LDFLAGS '${LDFLAGS}' \
99 LDLIBS '${liblitmus-flags}' \ 118 LDLIBS '${liblitmus-flags}' \
100 CPPFLAGS '${CPPFLAGS}' \ 119 CPPFLAGS '${CPPFLAGS}' \
101 CC '${shell which ${CC}}' \ 120 CC '${shell which ${CC}}' \
121 CPP '${shell which ${CPP}}' \
122 CU '${shell which ${CU}}' \
102 LD '${shell which ${LD}}' \ 123 LD '${shell which ${LD}}' \
103 AR '${shell which ${AR}}' \ 124 AR '${shell which ${AR}}' \
104 > $@ 125 > $@
@@ -112,10 +133,12 @@ dump-config:
112 headers "${headers}" \ 133 headers "${headers}" \
113 "kernel headers" "${imported-headers}" \ 134 "kernel headers" "${imported-headers}" \
114 CFLAGS "${CFLAGS}" \ 135 CFLAGS "${CFLAGS}" \
115 LDFLAGS "${LDFLAGS}" \
116 CPPFLAGS "${CPPFLAGS}" \ 136 CPPFLAGS "${CPPFLAGS}" \
137 CUFLAGS "${CUFLAGS}" \
138 LDFLAGS "${LDFLAGS}" \
117 CC "${CC}" \ 139 CC "${CC}" \
118 CPP "${CPP}" \ 140 CPP "${CPP}" \
141 CU "${CU}" \
119 LD "${LD}" \ 142 LD "${LD}" \
120 AR "${AR}" \ 143 AR "${AR}" \
121 obj-all "${obj-all}" 144 obj-all "${obj-all}"
@@ -124,8 +147,7 @@ help:
124 @cat INSTALL 147 @cat INSTALL
125 148
126clean: 149clean:
127 rm -f ${rt-apps} 150 rm -f ${rt-apps} ${rt-cppapps} ${rt-cuapps}
128 rm -f ${rt-cppapps}
129 rm -f *.o *.d *.a test_catalog.inc 151 rm -f *.o *.d *.a test_catalog.inc
130 rm -f ${imported-headers} 152 rm -f ${imported-headers}
131 rm -f inc/config.makefile 153 rm -f inc/config.makefile
@@ -259,6 +281,12 @@ vpath %.cpp gpu/
259objcpp-budget = budget.o common.o 281objcpp-budget = budget.o common.o
260lib-budget = -lrt -lm -pthread 282lib-budget = -lrt -lm -pthread
261 283
284
285vpath %.cu gpu/
286
287objcu-gpuspin = gpuspin.o common.o
288lib-gpuspin = -lrt -lm -lpthread
289
262# ############################################################################## 290# ##############################################################################
263# Build everything that depends on liblitmus. 291# Build everything that depends on liblitmus.
264 292
@@ -269,14 +297,19 @@ ${rt-apps}: $${obj-$$@} liblitmus.a
269${rt-cppapps}: $${objcpp-$$@} liblitmus.a 297${rt-cppapps}: $${objcpp-$$@} liblitmus.a
270 $(CPP) -o $@ $(LDFLAGS) ${ldf-$@} $(filter-out liblitmus.a,$+) $(LOADLIBS) $(LDLIBS) ${liblitmus-flags} ${lib-$@} 298 $(CPP) -o $@ $(LDFLAGS) ${ldf-$@} $(filter-out liblitmus.a,$+) $(LOADLIBS) $(LDLIBS) ${liblitmus-flags} ${lib-$@}
271 299
300${rt-cuapps}: $${objcu-$$@} liblitmus.a
301 $(CPP) -o $@ $(LDFLAGS) ${ldf-$@} $(filter-out liblitmus.a,$+) $(LOADLIBS) $(LDLIBS) ${liblitmus-flags} ${cuda-flags} ${lib-$@}
302
272# ############################################################################## 303# ##############################################################################
273# Dependency resolution. 304# Dependency resolution.
274 305
275vpath %.c bin/ src/ gpu/ tests/ 306vpath %.c bin/ src/ gpu/ tests/
276vpath %.cpp gpu/ 307vpath %.cpp gpu/
308vpath %.cu gpu/
277 309
278obj-all = ${sort ${foreach target,${all},${obj-${target}}}} 310obj-all = ${sort ${foreach target,${all},${obj-${target}}}}
279obj-all += ${sort ${foreach target,${all},${objcpp-${target}}}} 311obj-all += ${sort ${foreach target,${all},${objcpp-${target}}}}
312obj-all += ${sort ${foreach target,${all},${objcu-${target}}}}
280 313
281# rule to generate dependency files 314# rule to generate dependency files
282%.d: %.c ${imported-headers} 315%.d: %.c ${imported-headers}
@@ -291,6 +324,16 @@ obj-all += ${sort ${foreach target,${all},${objcpp-${target}}}}
291 sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \ 324 sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \
292 rm -f $@.$$$$ 325 rm -f $@.$$$$
293 326
327%.d: %.cu ${imported-headers}
328 @set -e; rm -f $@; \
329 $(CU) --generate-dependencies $(CUFLAGS) $< > $@.$$$$; \
330 sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \
331 rm -f $@.$$$$
332
333# teach make how to compile .cu files
334%.o: %.cu
335 $(CU) --compile $(CUFLAGS) $(OUTPUT_OPTION) $<
336
294ifeq ($(MAKECMDGOALS),) 337ifeq ($(MAKECMDGOALS),)
295MAKECMDGOALS += all 338MAKECMDGOALS += all
296endif 339endif
diff --git a/gpu/budget.cpp b/gpu/budget.cpp
index f62c515..8a2546a 100644
--- a/gpu/budget.cpp
+++ b/gpu/budget.cpp
@@ -80,6 +80,28 @@ int SIGNALS = 0;
80int BLOCK_SIGNALS_ON_SLEEP = 0; 80int BLOCK_SIGNALS_ON_SLEEP = 0;
81int OVERRUN_RATE = 1; /* default: every job overruns */ 81int OVERRUN_RATE = 1; /* default: every job overruns */
82 82
83int CXS_OVERRUN = 0;
84int NUM_LOCKS = 1;
85int NUM_REPLICAS = 1;
86int NAMESPACE = 0;
87int *LOCKS = NULL;
88int IKGLP_LOCK = 0;
89int USE_DGLS = 0;
90int NEST_IN_IKGLP = 0;
91
92int WAIT = 0;
93
94enum eLockType
95{
96 FIFO,
97 PRIOQ,
98 IKGLP
99};
100
101eLockType LOCK_TYPE = FIFO;
102
103int OVERRUN_BY_SLEEP = 0;
104
83int NUM_JOBS = 0; 105int NUM_JOBS = 0;
84int NUM_COMPLETED_JOBS = 0; 106int NUM_COMPLETED_JOBS = 0;
85int NUM_OVERRUNS = 0; 107int NUM_OVERRUNS = 0;
@@ -103,9 +125,32 @@ int job(lt_t exec_ns, lt_t budget_ns)
103 if (SIGNALS && BLOCK_SIGNALS_ON_SLEEP) 125 if (SIGNALS && BLOCK_SIGNALS_ON_SLEEP)
104 block_litmus_signals(SIG_BUDGET); 126 block_litmus_signals(SIG_BUDGET);
105 127
128 if(CXS_OVERRUN) {
129 if (NEST_IN_IKGLP)
130 litmus_lock(IKGLP_LOCK);
131 if (USE_DGLS)
132 litmus_dgl_lock(LOCKS, NUM_LOCKS);
133 else
134 for(int i = 0; i < NUM_LOCKS; ++i)
135 litmus_lock(LOCKS[i]);
136 }
137
106 // intentionally overrun via suspension 138 // intentionally overrun via suspension
107 lt_sleep(approx_remaining + overrun_extra); 139 if (OVERRUN_BY_SLEEP)
108 140 lt_sleep(approx_remaining + overrun_extra);
141 else
142 loop_for((approx_remaining + overrun_extra) * 0.9);
143
144 if(CXS_OVERRUN) {
145 if (USE_DGLS)
146 litmus_dgl_unlock(LOCKS, NUM_LOCKS);
147 else
148 for(int i = NUM_LOCKS-1; i >= 0; --i)
149 litmus_unlock(LOCKS[i]);
150 if (NEST_IN_IKGLP)
151 litmus_unlock(IKGLP_LOCK);
152 }
153
109 if (SIGNALS && BLOCK_SIGNALS_ON_SLEEP) 154 if (SIGNALS && BLOCK_SIGNALS_ON_SLEEP)
110 unblock_litmus_signals(SIG_BUDGET); 155 unblock_litmus_signals(SIG_BUDGET);
111 } 156 }
@@ -120,15 +165,18 @@ int job(lt_t exec_ns, lt_t budget_ns)
120 return 1; 165 return 1;
121} 166}
122 167
123#define OPTSTR "sboOva" 168#define OPTSTR "SbosOvalwqixdn:r:"
124 169
125int main(int argc, char** argv) 170int main(int argc, char** argv)
126{ 171{
127 int ret; 172 int ret;
128 lt_t e_ns = ms2ns(10); 173
129 lt_t p_ns = ms2ns(100); 174 srand(getpid());
175
176 lt_t e_ns = ms2ns(2);
177 lt_t p_ns = ms2ns(50) + rand()%200;
130 lt_t budget_ns = p_ns/2; 178 lt_t budget_ns = p_ns/2;
131 lt_t duration = s2ns(10); 179 lt_t duration = s2ns(60);
132 lt_t terminate_time; 180 lt_t terminate_time;
133 unsigned int first_job, last_job; 181 unsigned int first_job, last_job;
134 int opt; 182 int opt;
@@ -140,12 +188,15 @@ int main(int argc, char** argv)
140 188
141 while ((opt = getopt(argc, argv, OPTSTR)) != -1) { 189 while ((opt = getopt(argc, argv, OPTSTR)) != -1) {
142 switch(opt) { 190 switch(opt) {
143 case 's': 191 case 'S':
144 SIGNALS = 1; 192 SIGNALS = 1;
145 break; 193 break;
146 case 'b': 194 case 'b':
147 BLOCK_SIGNALS_ON_SLEEP = 1; 195 BLOCK_SIGNALS_ON_SLEEP = 1;
148 break; 196 break;
197 case 's':
198 OVERRUN_BY_SLEEP = 1;
199 break;
149 case 'o': 200 case 'o':
150 OVERRUN = 1; 201 OVERRUN = 1;
151 overrun_extra = budget_ns/2; 202 overrun_extra = budget_ns/2;
@@ -164,6 +215,31 @@ int main(int argc, char** argv)
164 case 'v': 215 case 'v':
165 drain_policy = DRAIN_SOBLIV; 216 drain_policy = DRAIN_SOBLIV;
166 break; 217 break;
218 case 'l':
219 CXS_OVERRUN = 1;
220 NAMESPACE = open("semaphores", O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
221 break;
222 case 'q':
223 LOCK_TYPE = PRIOQ;
224 break;
225 case 'i':
226 LOCK_TYPE = IKGLP;
227 break;
228 case 'x':
229 NEST_IN_IKGLP = 1;
230 break;
231 case 'w':
232 WAIT = 1;
233 break;
234 case 'd':
235 USE_DGLS = 1;
236 break;
237 case 'n':
238 NUM_LOCKS = atoi(optarg);
239 break;
240 case 'r':
241 NUM_REPLICAS = atoi(optarg);
242 break;
167 case ':': 243 case ':':
168 printf("missing argument\n"); 244 printf("missing argument\n");
169 assert(false); 245 assert(false);
@@ -176,10 +252,21 @@ int main(int argc, char** argv)
176 } 252 }
177 253
178 assert(!BLOCK_SIGNALS_ON_SLEEP || (BLOCK_SIGNALS_ON_SLEEP && SIGNALS)); 254 assert(!BLOCK_SIGNALS_ON_SLEEP || (BLOCK_SIGNALS_ON_SLEEP && SIGNALS));
255 assert(!CXS_OVERRUN || (CXS_OVERRUN && WAIT));
256 assert(LOCK_TYPE != IKGLP || NUM_LOCKS == 1);
257 assert(LOCK_TYPE != IKGLP || (LOCK_TYPE == IKGLP && !NEST_IN_IKGLP));
258 assert(NUM_LOCKS > 0);
259 if (LOCK_TYPE == IKGLP || NEST_IN_IKGLP)
260 assert(NUM_REPLICAS >= 1);
261
262 LOCKS = new int[NUM_LOCKS];
179 263
180 if (compute_overrun_rate) { 264 if (compute_overrun_rate) {
181 int backlog = (int)ceil((overrun_extra + budget_ns)/(double)budget_ns); 265 int backlog = (int)ceil((overrun_extra + budget_ns)/(double)budget_ns);
182 OVERRUN_RATE = backlog + 2; /* some padding */ 266 if (!CXS_OVERRUN)
267 OVERRUN_RATE = backlog + 2; /* some padding */
268 else
269 OVERRUN_RATE = 2*backlog + 2; /* overrun less frequently for testing */
183 } 270 }
184 271
185 init_rt_task_param(&param); 272 init_rt_task_param(&param);
@@ -197,6 +284,44 @@ int main(int argc, char** argv)
197 ret = set_rt_task_param(gettid(), &param); 284 ret = set_rt_task_param(gettid(), &param);
198 assert(ret == 0); 285 assert(ret == 0);
199 286
287 if (CXS_OVERRUN) {
288 int i;
289 for(i = 0; i < NUM_LOCKS; ++i) {
290 int lock = -1;
291 switch(LOCK_TYPE)
292 {
293 case FIFO:
294 lock = open_fifo_sem(NAMESPACE, i);
295 break;
296 case PRIOQ:
297 lock = open_prioq_sem(NAMESPACE, i);
298 break;
299 case IKGLP:
300 lock = open_ikglp_sem(NAMESPACE, i, NUM_REPLICAS);
301 break;
302 }
303 if (lock < 0) {
304 perror("open_sem");
305 exit(-1);
306 }
307 LOCKS[i] = lock;
308 }
309
310 if (NEST_IN_IKGLP) {
311 IKGLP_LOCK = open_ikglp_sem(NAMESPACE, i, NUM_REPLICAS);
312 if (IKGLP_LOCK < 0) {
313 perror("open_sem");
314 exit(-1);
315 }
316 }
317 }
318
319 if (WAIT) {
320 ret = wait_for_ts_release();
321 if (ret < 0)
322 perror("wait_for_ts_release");
323 }
324
200 ret = task_mode(LITMUS_RT_TASK); 325 ret = task_mode(LITMUS_RT_TASK);
201 assert(ret == 0); 326 assert(ret == 0);
202 327
@@ -231,5 +356,7 @@ int main(int argc, char** argv)
231 printf("# User Jobs Completed: %d\n", NUM_COMPLETED_JOBS); 356 printf("# User Jobs Completed: %d\n", NUM_COMPLETED_JOBS);
232 printf("# Overruns: %d\n", NUM_OVERRUNS); 357 printf("# Overruns: %d\n", NUM_OVERRUNS);
233 358
359 delete[] LOCKS;
360
234 return 0; 361 return 0;
235} 362}
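
The new -l (lock contention) path in job() above overruns the budget while locks are held, so budget-drain policies can be exercised inside critical sections. A minimal consolidated sketch of the acquire/overrun/release ordering it uses, with the globals and liblitmus calls named in the diff (illustrative only, not part of the commit):

/* Illustrative sketch -- mirrors the ordering used in job() above.
 * LOCKS, NUM_LOCKS, IKGLP_LOCK, USE_DGLS and NEST_IN_IKGLP are the globals
 * added by this commit; the liblitmus calls are assumed to behave as in the diff. */
static void overrun_inside_critical_section(lt_t overrun_ns)
{
	if (NEST_IN_IKGLP)
		litmus_lock(IKGLP_LOCK);                 /* outer replica lock */

	if (USE_DGLS)
		litmus_dgl_lock(LOCKS, NUM_LOCKS);       /* acquire the whole group atomically */
	else
		for (int i = 0; i < NUM_LOCKS; ++i)      /* fixed ascending order */
			litmus_lock(LOCKS[i]);

	lt_sleep(overrun_ns);                            /* overrun while holding the locks */

	if (USE_DGLS)
		litmus_dgl_unlock(LOCKS, NUM_LOCKS);
	else
		for (int i = NUM_LOCKS - 1; i >= 0; --i) /* release in reverse order */
			litmus_unlock(LOCKS[i]);

	if (NEST_IN_IKGLP)
		litmus_unlock(IKGLP_LOCK);
}
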
diff --git a/gpu/gpuspin.cu b/gpu/gpuspin.cu
new file mode 100644
index 0000000..aff6cd1
--- /dev/null
+++ b/gpu/gpuspin.cu
@@ -0,0 +1,1720 @@
1#include <sys/time.h>
2
3#include <stdio.h>
4#include <stdlib.h>
5#include <unistd.h>
6#include <time.h>
7#include <string.h>
8#include <assert.h>
9#include <execinfo.h>
10
11#include <boost/interprocess/managed_shared_memory.hpp>
12#include <boost/interprocess/sync/interprocess_mutex.hpp>
13
14#include <cuda_runtime.h>
15
16#include "litmus.h"
17#include "common.h"
18
19using namespace std;
20using namespace boost::interprocess;
21
22const char *lock_namespace = "./.gpuspin-locks";
23
24const int NR_GPUS = 8;
25
26bool GPU_USING = false;
27bool ENABLE_AFFINITY = false;
28bool RELAX_FIFO_MAX_LEN = false;
29bool ENABLE_CHUNKING = false;
30bool MIGRATE_VIA_SYSMEM = false;
31
32enum eEngineLockTypes
33{
34 FIFO,
35 PRIOQ
36};
37
38eEngineLockTypes ENGINE_LOCK_TYPE = FIFO;
39
40int GPU_PARTITION = 0;
41int GPU_PARTITION_SIZE = 0;
42int CPU_PARTITION_SIZE = 0;
43
44int RHO = 2;
45
46int NUM_COPY_ENGINES = 2;
47
48
49__attribute__((unused)) static size_t kbToB(size_t kb) { return kb * 1024; }
50__attribute__((unused)) static size_t mbToB(size_t mb) { return kbToB(mb * 1024); }
51
52/* in bytes */
53size_t SEND_SIZE = 0;
54size_t RECV_SIZE = 0;
55size_t STATE_SIZE = 0;
56size_t CHUNK_SIZE = 0;
57
58int TOKEN_LOCK = -1;
59
60bool USE_ENGINE_LOCKS = true;
61bool USE_DYNAMIC_GROUP_LOCKS = false;
62int EE_LOCKS[NR_GPUS];
63int CE_SEND_LOCKS[NR_GPUS];
64int CE_RECV_LOCKS[NR_GPUS];
65int CE_MIGR_SEND_LOCKS[NR_GPUS];
66int CE_MIGR_RECV_LOCKS[NR_GPUS];
67bool RESERVED_MIGR_COPY_ENGINE = false; // only checked if NUM_COPY_ENGINES == 2
68
69bool ENABLE_RT_AUX_THREADS = true;
70
71enum eGpuSyncMode
72{
73 IKGLP_MODE,
74 IKGLP_WC_MODE, /* work-conserving IKGLP. no GPU is left idle, but breaks optimality */
75 KFMLP_MODE,
76 RGEM_MODE,
77};
78
79eGpuSyncMode GPU_SYNC_MODE = IKGLP_MODE;
80
81enum eCudaSyncMode
82{
83 BLOCKING,
84 SPIN
85};
86
87eCudaSyncMode CUDA_SYNC_MODE = BLOCKING;
88
89
90int CUR_DEVICE = -1;
91int LAST_DEVICE = -1;
92
93cudaStream_t STREAMS[NR_GPUS];
94int GPU_HZ[NR_GPUS];
95int NUM_SM[NR_GPUS];
96int WARP_SIZE[NR_GPUS];
97int ELEM_PER_THREAD[NR_GPUS];
98
99#define DEFINE_PER_GPU(type, var) type var[NR_GPUS]
100#define per_gpu(var, idx) (var[(idx)])
101#define this_gpu(var) (var[(CUR_DEVICE)])
102#define cur_stream() (this_gpu(STREAMS))
103#define cur_gpu() (CUR_DEVICE)
104#define last_gpu() (LAST_DEVICE)
105#define cur_ee() (EE_LOCKS[CUR_DEVICE])
106#define cur_send() (CE_SEND_LOCKS[CUR_DEVICE])
107#define cur_recv() (CE_RECV_LOCKS[CUR_DEVICE])
108#define cur_migr_send() (CE_MIGR_SEND_LOCKS[CUR_DEVICE])
109#define cur_migr_recv() (CE_MIGR_RECV_LOCKS[CUR_DEVICE])
110#define cur_hz() (GPU_HZ[CUR_DEVICE])
111#define cur_sms() (NUM_SM[CUR_DEVICE])
112#define cur_warp_size() (WARP_SIZE[CUR_DEVICE])
113#define cur_elem_per_thread() (ELEM_PER_THREAD[CUR_DEVICE])
114#define num_online_gpus() (NUM_GPUS)
115
116static bool useEngineLocks()
117{
118 return(USE_ENGINE_LOCKS);
119}
120
121#define VANILLA_LINUX
122
123bool TRACE_MIGRATIONS = false;
124#ifndef VANILLA_LINUX
125#define trace_migration(to, from) do { inject_gpu_migration((to), (from)); } while(0)
126#define trace_release(arrival, deadline, jobno) do { inject_release((arrival), (deadline), (jobno)); } while(0)
127#define trace_completion(jobno) do { inject_completion((jobno)); } while(0)
128#define trace_name() do { inject_name(); } while(0)
129#define trace_param() do { inject_param(); } while(0)
130#else
131#define set_rt_task_param(x, y) (0)
132#define trace_migration(to, from)
133#define trace_release(arrival, deadline, jobno)
134#define trace_completion(jobno)
135#define trace_name()
136#define trace_param()
137#endif
138
139struct ce_lock_state
140{
141 int locks[2];
142 size_t num_locks;
143 size_t budget_remaining;
144 bool locked;
145
146 ce_lock_state(int device_a, enum cudaMemcpyKind kind, size_t size, int device_b = -1, bool migration = false) {
147 num_locks = (device_a != -1) + (device_b != -1);
148
149 if(device_a != -1) {
150 if (!migration)
151 locks[0] = (kind == cudaMemcpyHostToDevice || (kind == cudaMemcpyDeviceToDevice && device_b == -1)) ?
152 CE_SEND_LOCKS[device_a] : CE_RECV_LOCKS[device_a];
153 else
154 locks[0] = (kind == cudaMemcpyHostToDevice || (kind == cudaMemcpyDeviceToDevice && device_b == -1)) ?
155 CE_MIGR_SEND_LOCKS[device_a] : CE_MIGR_RECV_LOCKS[device_a];
156 }
157
158 if(device_b != -1) {
159 assert(kind == cudaMemcpyDeviceToDevice);
160
161 if (!migration)
162 locks[1] = CE_RECV_LOCKS[device_b];
163 else
164 locks[1] = CE_MIGR_RECV_LOCKS[device_b];
165
166 if(locks[1] < locks[0]) {
167 // enforce total order on locking
168 int temp = locks[1];
169 locks[1] = locks[0];
170 locks[0] = temp;
171 }
172 }
173 else {
174 locks[1] = -1;
175 }
176
177 if(!ENABLE_CHUNKING)
178 budget_remaining = size;
179 else
180 budget_remaining = CHUNK_SIZE;
181 }
182
183 void crash(void) {
184 void *array[50];
185 int size, i;
186 char **messages;
187
188 size = backtrace(array, 50);
189 messages = backtrace_symbols(array, size);
190
191 fprintf(stderr, "%d: TRIED TO GRAB SAME LOCK TWICE! Lock = %d\n", getpid(), locks[0]);
192 for (i = 1; i < size && messages != NULL; ++i)
193 {
194 fprintf(stderr, "%d: [bt]: (%d) %s\n", getpid(), i, messages[i]);
195 }
196 free(messages);
197
198 assert(false);
199 }
200
201
202 void lock() {
203 if(locks[0] == locks[1]) crash();
204
205 if(USE_DYNAMIC_GROUP_LOCKS) {
206 litmus_dgl_lock(locks, num_locks);
207 }
208 else
209 {
210 for(int l = 0; l < num_locks; ++l)
211 {
212 litmus_lock(locks[l]);
213 }
214 }
215 locked = true;
216 }
217
218 void unlock() {
219 if(locks[0] == locks[1]) crash();
220
221 if(USE_DYNAMIC_GROUP_LOCKS) {
222 litmus_dgl_unlock(locks, num_locks);
223 }
224 else
225 {
226 // reverse order
227 for(int l = num_locks - 1; l >= 0; --l)
228 {
229 litmus_unlock(locks[l]);
230 }
231 }
232 locked = false;
233 }
234
235 void refresh() {
236 budget_remaining = CHUNK_SIZE;
237 }
238
239 bool budgetIsAvailable(size_t tosend) {
240 return(tosend >= budget_remaining);
241 }
242
243 void decreaseBudget(size_t spent) {
244 budget_remaining -= spent;
245 }
246};
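
When a copy involves two devices, the constructor above stores both copy-engine locks and sorts them so every task acquires them in the same ascending descriptor order; that total order is what prevents deadlock when dynamic group locks are disabled. A compact sketch of the same rule (the descriptor values in the comment are made up for illustration):

/* Illustrative only: the total-order rule ce_lock_state applies in its constructor. */
int locks[2] = { CE_MIGR_RECV_LOCKS[2], CE_MIGR_SEND_LOCKS[0] };  /* e.g. descriptors 14 and 9 */
if (locks[1] < locks[0]) {
	int tmp  = locks[1];                    /* always take the lower descriptor first */
	locks[1] = locks[0];
	locks[0] = tmp;
}
litmus_lock(locks[0]);
litmus_lock(locks[1]);
/* ... issue cudaMemcpyAsync() chunks on the held engines ... */
litmus_unlock(locks[1]);                        /* release in reverse order */
litmus_unlock(locks[0]);
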
247
248// precondition: if do_locking == true, locks in state are held.
249static cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count,
250 enum cudaMemcpyKind kind,
251 ce_lock_state* state)
252{
253 cudaError_t ret = cudaSuccess;
254 int remaining = count;
255
256 char* dst = (char*)a_dst;
257 const char* src = (const char*)a_src;
258
259 // disable chunking, if needed, by setting chunk_size equal to the
260 // amount of data to be copied.
261 int chunk_size = (ENABLE_CHUNKING) ? CHUNK_SIZE : count;
262 int i = 0;
263
264 while(remaining != 0)
265 {
266 int bytesToCopy = std::min(remaining, chunk_size);
267
268 if(state && state->budgetIsAvailable(bytesToCopy) && state->locked) {
269 cudaStreamSynchronize(STREAMS[CUR_DEVICE]);
270 ret = cudaGetLastError();
271
272 if(ret != cudaSuccess)
273 {
274 break;
275 }
276
277 state->unlock();
 278 state->refresh(); // replenish.
279 // we can only run out of
280 // budget if chunking is enabled.
281 // we presume that init budget would
282 // be set to cover entire memcpy
283 // if chunking were disabled.
284 }
285
286 if(state && !state->locked) {
287 state->lock();
288 }
289
290 //ret = cudaMemcpy(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind);
291 cudaMemcpyAsync(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind, STREAMS[CUR_DEVICE]);
292
293 if(state) {
294 state->decreaseBudget(bytesToCopy);
295 }
296
297 ++i;
298 remaining -= bytesToCopy;
299 }
300 return ret;
301}
302
303static cudaError_t chunkMemcpy(void* a_dst, const void* a_src, size_t count,
304 enum cudaMemcpyKind kind,
305 int device_a = -1, // device_a == -1 disables locking
306 bool do_locking = true,
307 int device_b = -1,
308 bool migration = false)
309{
310 cudaError_t ret;
311 if(!do_locking || device_a == -1) {
312 ret = __chunkMemcpy(a_dst, a_src, count, kind, NULL);
313 cudaStreamSynchronize(cur_stream());
314 if(ret == cudaSuccess)
315 ret = cudaGetLastError();
316 }
317 else {
318 ce_lock_state state(device_a, kind, count, device_b, migration);
319 state.lock();
320 ret = __chunkMemcpy(a_dst, a_src, count, kind, &state);
321 cudaStreamSynchronize(cur_stream());
322 if(ret == cudaSuccess)
323 ret = cudaGetLastError();
324 state.unlock();
325 }
326 return ret;
327}
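
A hedged usage sketch of the wrapper above: copy the send buffer to the current GPU with copy-engine locking enabled (the buffer and size globals are the ones defined earlier in this file; error handling is kept minimal):

/* Illustrative only: send SEND_SIZE bytes to the current GPU.
 * Passing cur_gpu() as device_a enables copy-engine locking inside chunkMemcpy();
 * device_b stays -1 because this is not a device-to-device migration. */
cudaError_t err = chunkMemcpy(this_gpu(d_send_data), h_send_data, SEND_SIZE,
                              cudaMemcpyHostToDevice,
                              cur_gpu(),           /* device_a */
                              useEngineLocks());   /* do_locking */
if (err != cudaSuccess)
	fprintf(stderr, "chunkMemcpy: %s\n", cudaGetErrorString(err));
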
328
329
330void allocate_locks_litmus(void)
331{
332 // allocate k-FMLP lock
333 int fd = open(lock_namespace, O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
334
335 int base_name = GPU_PARTITION * 1000;
336
337 if (GPU_SYNC_MODE == IKGLP_MODE) {
338 /* Standard (optimal) IKGLP */
339 TOKEN_LOCK = open_gpusync_token_lock(fd,
340 base_name, /* name */
341 GPU_PARTITION_SIZE,
342 GPU_PARTITION*GPU_PARTITION_SIZE,
343 RHO,
344 IKGLP_M_IN_FIFOS,
345 (!RELAX_FIFO_MAX_LEN) ?
346 IKGLP_OPTIMAL_FIFO_LEN :
347 IKGLP_UNLIMITED_FIFO_LEN,
348 ENABLE_AFFINITY);
349 }
350 else if (GPU_SYNC_MODE == KFMLP_MODE) {
351 /* KFMLP. FIFO queues only for tokens. */
352 TOKEN_LOCK = open_gpusync_token_lock(fd,
353 base_name, /* name */
354 GPU_PARTITION_SIZE,
355 GPU_PARTITION*GPU_PARTITION_SIZE,
356 RHO,
357 IKGLP_UNLIMITED_IN_FIFOS,
358 IKGLP_UNLIMITED_FIFO_LEN,
359 ENABLE_AFFINITY);
360 }
361 else if (GPU_SYNC_MODE == RGEM_MODE) {
362 /* RGEM-like token allocation. Shared priority queue for all tokens. */
363 TOKEN_LOCK = open_gpusync_token_lock(fd,
364 base_name, /* name */
365 GPU_PARTITION_SIZE,
366 GPU_PARTITION*GPU_PARTITION_SIZE,
367 RHO,
368 RHO*GPU_PARTITION_SIZE,
369 1,
370 ENABLE_AFFINITY);
371 }
372 else if (GPU_SYNC_MODE == IKGLP_WC_MODE) {
373 /* Non-optimal IKGLP that never lets a replica idle if there are pending
374 * token requests. */
375 int max_simult_run = std::max(CPU_PARTITION_SIZE, RHO*GPU_PARTITION_SIZE);
376 int max_fifo_len = (int)ceil((float)max_simult_run / (RHO*GPU_PARTITION_SIZE));
377 TOKEN_LOCK = open_gpusync_token_lock(fd,
378 base_name, /* name */
379 GPU_PARTITION_SIZE,
380 GPU_PARTITION*GPU_PARTITION_SIZE,
381 RHO,
382 max_simult_run,
383 (!RELAX_FIFO_MAX_LEN) ?
384 max_fifo_len :
385 IKGLP_UNLIMITED_FIFO_LEN,
386 ENABLE_AFFINITY);
387 }
388 else {
389 perror("Invalid GPUSync mode specified\n");
390 TOKEN_LOCK = -1;
391 }
392
393 if(TOKEN_LOCK < 0)
394 perror("open_token_sem");
395
396 if(USE_ENGINE_LOCKS)
397 {
398 assert(NUM_COPY_ENGINES == 1 || NUM_COPY_ENGINES == 2);
399 assert((NUM_COPY_ENGINES == 1 && !RESERVED_MIGR_COPY_ENGINE) || NUM_COPY_ENGINES == 2);
400
401 // allocate the engine locks.
402 for (int i = 0; i < GPU_PARTITION_SIZE; ++i)
403 {
404 int idx = GPU_PARTITION*GPU_PARTITION_SIZE + i;
405 int ee_name = (i+1)*10 + base_name;
406 int ce_0_name = (i+1)*10 + base_name + 1;
407 int ce_1_name = (i+1)*10 + base_name + 2;
408 int ee_lock = -1, ce_0_lock = -1, ce_1_lock = -1;
409
410 open_sem_t openEngineLock = (ENGINE_LOCK_TYPE == FIFO) ?
411 open_fifo_sem : open_prioq_sem;
412
413 ee_lock = openEngineLock(fd, ee_name);
414 if (ee_lock < 0)
415 perror("open_*_sem (engine lock)");
416
417 ce_0_lock = openEngineLock(fd, ce_0_name);
418 if (ce_0_lock < 0)
419 perror("open_*_sem (engine lock)");
420
421 if (NUM_COPY_ENGINES == 2)
422 {
423 ce_1_lock = openEngineLock(fd, ce_1_name);
424 if (ce_1_lock < 0)
425 perror("open_*_sem (engine lock)");
426 }
427
428 EE_LOCKS[idx] = ee_lock;
429
430 if (NUM_COPY_ENGINES == 1)
431 {
432 // share locks
433 CE_SEND_LOCKS[idx] = ce_0_lock;
434 CE_RECV_LOCKS[idx] = ce_0_lock;
435 CE_MIGR_SEND_LOCKS[idx] = ce_0_lock;
436 CE_MIGR_RECV_LOCKS[idx] = ce_0_lock;
437 }
438 else
439 {
440 assert(NUM_COPY_ENGINES == 2);
441
442 if (RESERVED_MIGR_COPY_ENGINE) {
 443 // copy engine dedicated to migration operations
444 CE_SEND_LOCKS[idx] = ce_0_lock;
445 CE_RECV_LOCKS[idx] = ce_0_lock;
446 CE_MIGR_SEND_LOCKS[idx] = ce_1_lock;
447 CE_MIGR_RECV_LOCKS[idx] = ce_1_lock;
448 }
449 else {
450 // migration transmissions treated as regular data
451 CE_SEND_LOCKS[idx] = ce_0_lock;
452 CE_RECV_LOCKS[idx] = ce_1_lock;
453 CE_MIGR_SEND_LOCKS[idx] = ce_0_lock;
454 CE_MIGR_RECV_LOCKS[idx] = ce_1_lock;
455 }
456 }
457 }
458 }
459}
460
461
462
463
464class gpu_pool
465{
466public:
467 gpu_pool(int pSz): poolSize(pSz)
468 {
469 memset(&pool[0], 0, sizeof(pool[0])*poolSize);
470 }
471
472 int get(pthread_mutex_t* tex, int preference = -1)
473 {
474 int which = -1;
475 int last = (preference >= 0) ? preference : 0;
476 int minIdx = last;
477
478 pthread_mutex_lock(tex);
479
480 int min = pool[last];
481 for(int i = (minIdx+1)%poolSize; i != last; i = (i+1)%poolSize)
482 {
483 if(min > pool[i])
484 minIdx = i;
485 }
486 ++pool[minIdx];
487
488 pthread_mutex_unlock(tex);
489
490 which = minIdx;
491
492 return which;
493 }
494
495 void put(pthread_mutex_t* tex, int which)
496 {
497 pthread_mutex_lock(tex);
498 --pool[which];
499 pthread_mutex_unlock(tex);
500 }
501
502private:
503 int poolSize;
504 int pool[NR_GPUS]; // >= gpu_part_size
505};
506
507static gpu_pool* GPU_LINUX_SEM_POOL = NULL;
508static pthread_mutex_t* GPU_LINUX_MUTEX_POOL = NULL;
509
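
A short sketch of how the Linux (non-LITMUS) mode uses the pool declared above: get() picks the least-loaded GPU in the partition, biased toward a preferred device, and put() releases it afterwards. The offset arithmetic mirrors gpu_loop_for_linux() further below (illustrative only):

/* Illustrative only: pick a GPU from this partition's pool, run, then release it. */
gpu_pool *pool         = &GPU_LINUX_SEM_POOL[GPU_PARTITION];
pthread_mutex_t *mutex = &GPU_LINUX_MUTEX_POOL[GPU_PARTITION];
int offset = GPU_PARTITION * GPU_PARTITION_SIZE;

int which = pool->get(mutex, cur_gpu() - offset) + offset;  /* prefer the last-used GPU */
set_cur_gpu(which);
/* ... do the GPU section of the job on 'which' ... */
pool->put(mutex, which - offset);
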
510static void allocate_locks_linux(int num_gpu_users)
511{
512 managed_shared_memory *segment_pool_ptr = NULL;
513 managed_shared_memory *segment_mutex_ptr = NULL;
514
515 int numGpuPartitions = NR_GPUS/GPU_PARTITION_SIZE;
516
517 if(num_gpu_users != 0)
518 {
519 printf("%d creating shared memory for linux semaphores; num pools = %d, pool size = %d\n", getpid(), numGpuPartitions, GPU_PARTITION_SIZE);
520 shared_memory_object::remove("linux_mutex_memory");
521 shared_memory_object::remove("linux_sem_memory");
522
523 segment_mutex_ptr = new managed_shared_memory(create_only, "linux_mutex_memory", 4*1024);
524 GPU_LINUX_MUTEX_POOL = segment_mutex_ptr->construct<pthread_mutex_t>("pthread_mutex_t linux_m")[numGpuPartitions]();
525 for(int i = 0; i < numGpuPartitions; ++i)
526 {
527 pthread_mutexattr_t attr;
528 pthread_mutexattr_init(&attr);
529 pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
530 pthread_mutex_init(&(GPU_LINUX_MUTEX_POOL[i]), &attr);
531 pthread_mutexattr_destroy(&attr);
532 }
533
534 segment_pool_ptr = new managed_shared_memory(create_only, "linux_sem_memory", 4*1024);
535 GPU_LINUX_SEM_POOL = segment_pool_ptr->construct<gpu_pool>("gpu_pool linux_p")[numGpuPartitions](GPU_PARTITION_SIZE);
536 }
537 else
538 {
539 do
540 {
541 try
542 {
543 if (!segment_pool_ptr) segment_pool_ptr = new managed_shared_memory(open_only, "linux_sem_memory");
544 }
545 catch(...)
546 {
547 sleep(1);
548 }
549 }while(segment_pool_ptr == NULL);
550
551 do
552 {
553 try
554 {
555 if (!segment_mutex_ptr) segment_mutex_ptr = new managed_shared_memory(open_only, "linux_mutex_memory");
556 }
557 catch(...)
558 {
559 sleep(1);
560 }
561 }while(segment_mutex_ptr == NULL);
562
563 GPU_LINUX_SEM_POOL = segment_pool_ptr->find<gpu_pool>("gpu_pool linux_p").first;
564 GPU_LINUX_MUTEX_POOL = segment_mutex_ptr->find<pthread_mutex_t>("pthread_mutex_t linux_m").first;
565 }
566}
567
568
569
570
571static void allocate_locks(int num_gpu_users, bool linux_mode)
572{
573 if(!linux_mode)
574 allocate_locks_litmus();
575 else
576 allocate_locks_linux(num_gpu_users);
577}
578
579static void set_cur_gpu(int gpu)
580{
581 if (TRACE_MIGRATIONS) {
582 trace_migration(gpu, CUR_DEVICE);
583 }
584 if(gpu != CUR_DEVICE) {
585 cudaSetDevice(gpu);
586 CUR_DEVICE = gpu;
587 }
588}
589
590
591static pthread_barrier_t *gpu_barrier = NULL;
592static interprocess_mutex *gpu_mgmt_mutexes = NULL;
593static managed_shared_memory *segment_ptr = NULL;
594
595void coordinate_gpu_tasks(int num_gpu_users)
596{
597 if(num_gpu_users != 0)
598 {
599 printf("%d creating shared memory\n", getpid());
600 shared_memory_object::remove("gpu_barrier_memory");
601 segment_ptr = new managed_shared_memory(create_only, "gpu_barrier_memory", 4*1024);
602
603 printf("%d creating a barrier for %d users\n", getpid(), num_gpu_users);
604 gpu_barrier = segment_ptr->construct<pthread_barrier_t>("pthread_barrier_t gpu_barrier")();
605 pthread_barrierattr_t battr;
606 pthread_barrierattr_init(&battr);
607 pthread_barrierattr_setpshared(&battr, PTHREAD_PROCESS_SHARED);
608 pthread_barrier_init(gpu_barrier, &battr, num_gpu_users);
609 pthread_barrierattr_destroy(&battr);
610 printf("%d creating gpu mgmt mutexes for %d devices\n", getpid(), NR_GPUS);
611 gpu_mgmt_mutexes = segment_ptr->construct<interprocess_mutex>("interprocess_mutex m")[NR_GPUS]();
612 }
613 else
614 {
615 do
616 {
617 try
618 {
619 segment_ptr = new managed_shared_memory(open_only, "gpu_barrier_memory");
620 }
621 catch(...)
622 {
623 sleep(1);
624 }
625 }while(segment_ptr == NULL);
626
627 gpu_barrier = segment_ptr->find<pthread_barrier_t>("pthread_barrier_t gpu_barrier").first;
628 gpu_mgmt_mutexes = segment_ptr->find<interprocess_mutex>("interprocess_mutex m").first;
629 }
630}
631
632typedef float spindata_t;
633
634char *d_send_data[NR_GPUS] = {0};
635char *d_recv_data[NR_GPUS] = {0};
636char *d_state_data[NR_GPUS] = {0};
637spindata_t *d_spin_data[NR_GPUS] = {0};
638//unsigned int *d_iteration_count[NR_GPUS] = {0};
639
640
641bool p2pMigration[NR_GPUS][NR_GPUS] = {0};
642
643char *h_send_data = 0;
644char *h_recv_data = 0;
645char *h_state_data = 0;
646
647unsigned int *h_iteration_count[NR_GPUS] = {0};
648
649static void init_cuda(int num_gpu_users)
650{
651 const int PAGE_SIZE = 4*1024;
652 size_t send_alloc_bytes = SEND_SIZE + (SEND_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
653 size_t recv_alloc_bytes = RECV_SIZE + (RECV_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
654 size_t state_alloc_bytes = STATE_SIZE + (STATE_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
655
656 coordinate_gpu_tasks(num_gpu_users);
657
658 switch (CUDA_SYNC_MODE)
659 {
660 case BLOCKING:
661 cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
662 break;
663 case SPIN:
664 cudaSetDeviceFlags(cudaDeviceScheduleSpin);
665 break;
666 }
667
668 for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
669 {
670 cudaDeviceProp prop;
671 int which = GPU_PARTITION*GPU_PARTITION_SIZE + i;
672
673 gpu_mgmt_mutexes[which].lock();
674
675 set_cur_gpu(which);
676 cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0);
677 cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0);
678
679 cudaGetDeviceProperties(&prop, which);
680 GPU_HZ[which] = prop.clockRate * 1000; /* khz -> hz */
681 NUM_SM[which] = prop.multiProcessorCount;
682 WARP_SIZE[which] = prop.warpSize;
683
684 // enough to fill the L2 cache exactly.
685 ELEM_PER_THREAD[which] = (prop.l2CacheSize/(NUM_SM[which]*WARP_SIZE[which]*sizeof(spindata_t)));
686
687
688 if (!MIGRATE_VIA_SYSMEM && prop.unifiedAddressing)
689 {
690 for(int j = 0; j < GPU_PARTITION_SIZE; ++j)
691 {
692 if (i != j)
693 {
694 int canAccess = 0;
695 cudaDeviceCanAccessPeer(&canAccess, i, j);
696 if(canAccess)
697 {
698 cudaDeviceEnablePeerAccess(j, 0);
699 p2pMigration[i][j] = true;
700 }
701 }
702 }
703 }
704
705 cudaStreamCreate(&STREAMS[CUR_DEVICE]);
706
707 cudaMalloc(&d_spin_data[which], prop.l2CacheSize);
708 cudaMemset(&d_spin_data[which], 0, prop.l2CacheSize);
709// cudaMalloc(&d_iteration_count[which], NUM_SM[which]*WARP_SIZE[which]*sizeof(unsigned int));
710// cudaHostAlloc(&h_iteration_count[which], NUM_SM[which]*WARP_SIZE[which]*sizeof(unsigned int), cudaHostAllocPortable | cudaHostAllocMapped);
711
712 if (send_alloc_bytes) {
713 cudaMalloc(&d_send_data[which], send_alloc_bytes);
714 cudaHostAlloc(&h_send_data, send_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped);
715 }
716
717 if (h_recv_data) {
718 cudaMalloc(&d_recv_data[which], recv_alloc_bytes);
719 cudaHostAlloc(&h_recv_data, recv_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped);
720 }
721
722 if (h_state_data) {
723 cudaMalloc(&d_state_data[which], state_alloc_bytes);
724
725 if (MIGRATE_VIA_SYSMEM)
726 cudaHostAlloc(&h_state_data, state_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped | cudaHostAllocWriteCombined);
727 }
728
729 gpu_mgmt_mutexes[which].unlock();
730 }
731
732 // roll back to first GPU
733 set_cur_gpu(GPU_PARTITION*GPU_PARTITION_SIZE);
734}
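
init_cuda() sizes the spin buffer so that one warp per SM touches exactly enough floats to fill the device's L2 cache. A quick worked sketch of that sizing (the cache size and SM count are illustrative, not taken from any particular GPU):

/* Illustrative only: how ELEM_PER_THREAD is derived for one device.
 * Suppose prop.l2CacheSize = 524288 bytes (512 KB), 8 SMs, warp size 32. */
int spinning_threads = 8 * 32;                                       /* one warp per SM -> 256 threads */
int elem_per_thread  = 524288 / (spinning_threads * sizeof(float));  /* 524288 / 1024 = 512 floats each */
/* docudaspin() then walks data[i*num_elem + j] over those 512 elements per thread,
 * so the working set of the whole grid is exactly one L2 cache. */
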
735
736
737
738static bool MigrateToGPU_P2P(int from, int to)
739{
740 bool success = true;
741 set_cur_gpu(to);
742 chunkMemcpy(this_gpu(d_state_data), per_gpu(d_state_data, from),
743 STATE_SIZE, cudaMemcpyDeviceToDevice, to,
744 useEngineLocks(), from, true);
745 return success;
746}
747
748
749static bool PullState(void)
750{
751 bool success = true;
752 chunkMemcpy(h_state_data, this_gpu(d_state_data),
753 STATE_SIZE, cudaMemcpyDeviceToHost,
754 cur_gpu(), useEngineLocks(), -1, true);
755 return success;
756}
757
758static bool PushState(void)
759{
760 bool success = true;
761 chunkMemcpy(this_gpu(d_state_data), h_state_data,
762 STATE_SIZE, cudaMemcpyHostToDevice,
763 cur_gpu(), useEngineLocks(), -1, true);
764 return success;
765}
766
767static bool MigrateToGPU_SysMem(int from, int to)
768{
769 // THIS IS ON-DEMAND SYS_MEM MIGRATION. GPUSync says
770 // you should be using speculative migrations.
771 // Use PushState() and PullState().
772 assert(false); // for now
773
774 bool success = true;
775
776 set_cur_gpu(from);
777 chunkMemcpy(h_state_data, this_gpu(d_state_data),
778 STATE_SIZE, cudaMemcpyDeviceToHost,
779 from, useEngineLocks(), -1, true);
780
781 set_cur_gpu(to);
782 chunkMemcpy(this_gpu(d_state_data), h_state_data,
783 STATE_SIZE, cudaMemcpyHostToDevice,
784 to, useEngineLocks(), -1, true);
785
786 return success;
787}
788
789static bool MigrateToGPU(int from, int to)
790{
791 bool success = false;
792
793 if (from != to)
794 {
795 if(!MIGRATE_VIA_SYSMEM && p2pMigration[to][from])
796 success = MigrateToGPU_P2P(from, to);
797 else
798 success = MigrateToGPU_SysMem(from, to);
799 }
800 else
801 {
802 set_cur_gpu(to);
803 success = true;
804 }
805
806 return success;
807}
808
809static bool MigrateToGPU_Implicit(int to)
810{
811 return( MigrateToGPU(cur_gpu(), to) );
812}
813
814static void MigrateIfNeeded(int next_gpu)
815{
816 if(next_gpu != cur_gpu() && cur_gpu() != -1)
817 {
818 if (!MIGRATE_VIA_SYSMEM)
819 MigrateToGPU_Implicit(next_gpu);
820 else {
821 set_cur_gpu(next_gpu);
822 PushState();
823 }
824 }
825}
826
827
828
829static void exit_cuda()
830{
831 for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
832 {
833 int which = GPU_PARTITION*GPU_PARTITION_SIZE + i;
834 gpu_mgmt_mutexes[which].lock();
835 set_cur_gpu(which);
836 cudaDeviceReset();
837 gpu_mgmt_mutexes[which].unlock();
838 }
839}
840
841bool safetynet = false;
842
843static void catch_exit(int catch_exit)
844{
845 if(GPU_USING && USE_ENGINE_LOCKS && safetynet)
846 {
847 safetynet = false;
848 for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
849 {
850 int which = GPU_PARTITION*GPU_PARTITION_SIZE + i;
851 set_cur_gpu(which);
852
853// cudaDeviceReset();
854
855 // try to unlock everything. litmus will prevent bogus calls.
856 if(USE_ENGINE_LOCKS)
857 {
858 litmus_unlock(EE_LOCKS[which]);
859 litmus_unlock(CE_SEND_LOCKS[which]);
860 if (NUM_COPY_ENGINES == 2)
861 {
862 if (RESERVED_MIGR_COPY_ENGINE)
863 litmus_unlock(CE_MIGR_SEND_LOCKS[which]);
864 else
865 litmus_unlock(CE_MIGR_RECV_LOCKS[which]);
866 }
867 }
868 }
869 litmus_unlock(TOKEN_LOCK);
870 }
871}
872
873
874
875
876
877static float ms_sum;
878static int gpucount = 0;
879
880__global__ void docudaspin(float* data, /*unsigned int* iterations,*/ unsigned int num_elem, unsigned int cycles)
881{
882 long long int now = clock64();
883 long long unsigned int elapsed = 0;
884 long long int last;
885
886// unsigned int iter = 0;
887 unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
888 unsigned int j = 0;
889 bool toggle = true;
890
891// iterations[i] = 0;
892 do
893 {
894 data[i*num_elem+j] += (toggle) ? M_PI : -M_PI;
895 j = (j + 1 != num_elem) ? j + 1 : 0;
896 toggle = !toggle;
897// iter++;
898
899 last = now;
900 now = clock64();
901
902// // exact calculation takes more cycles than a second
903// // loop iteration when code is compiled optimized
904// long long int diff = now - last;
905// elapsed += (diff > 0) ?
906// diff :
907// now + ((~((long long int)0)<<1)>>1) - last;
908
909 // don't count iterations with clock roll-over
910 elapsed += max(0ll, now - last);
911 }while(elapsed < cycles);
912
913// iterations[i] = iter;
914
915 return;
916}
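
The kernel above busy-waits on clock64() until the requested number of GPU cycles has elapsed; the caller converts a wall-clock time into a cycle budget using the clock rate recorded in init_cuda(). A small worked sketch of that conversion and launch (the 1.5 GHz clock is illustrative):

/* Illustrative only: spin the current GPU for ~2 ms of wall-clock time.
 * cur_hz() was filled in from cudaDeviceProp::clockRate (kHz -> Hz) in init_cuda(). */
double gpu_sec_time = 0.002;                                        /* 2 ms */
unsigned int numcycles = (unsigned int)(cur_hz() * gpu_sec_time);   /* e.g. 1.5e9 * 0.002 = 3,000,000 cycles */

/* one block per SM, one warp per block, on this GPU's stream */
docudaspin<<<cur_sms(), cur_warp_size(), 0, cur_stream()>>>(
        d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles);
cudaStreamSynchronize(cur_stream());
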
917
918static void gpu_loop_for(double gpu_sec_time, double emergency_exit)
919{
920 int next_gpu;
921
922 if (emergency_exit && wctime() > emergency_exit)
923 goto out;
924
925 next_gpu = litmus_lock(TOKEN_LOCK);
926 {
927 MigrateIfNeeded(next_gpu);
928
929 unsigned int numcycles = (unsigned int)(cur_hz() * gpu_sec_time);
930
931 if(SEND_SIZE > 0)
932 chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE,
933 cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks());
934
935 if(useEngineLocks()) litmus_lock(cur_ee());
936 /* one block per sm, one warp per block */
937 docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles);
938// docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], d_iteration_count[cur_gpu()], cur_elem_per_thread(), numcycles);
939 cudaStreamSynchronize(cur_stream());
940 if(useEngineLocks()) litmus_unlock(cur_ee());
941
942 if(RECV_SIZE > 0)
943 chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE,
944 cudaMemcpyDeviceToHost, CUR_DEVICE, useEngineLocks());
945
946 if (MIGRATE_VIA_SYSMEM)
947 PullState();
948 }
949 litmus_unlock(TOKEN_LOCK);
950
951 last_gpu() = cur_gpu();
952
953out:
954 return;
955}
956
957static void gpu_loop_for_linux(double gpu_sec_time, double emergency_exit)
958{
959 static int GPU_OFFSET = GPU_PARTITION * GPU_PARTITION_SIZE;
960 static gpu_pool *pool = &GPU_LINUX_SEM_POOL[GPU_PARTITION];
961 static pthread_mutex_t *mutex = &GPU_LINUX_MUTEX_POOL[GPU_PARTITION];
962
963 static bool once = false;
964 static cudaEvent_t start, end;
965 float ms;
966 if (!once)
967 {
968 once = true;
969 cudaEventCreate(&start);
970 cudaEventCreate(&end);
971 }
972
973 int next_gpu;
974
975 if (emergency_exit && wctime() > emergency_exit)
976 goto out;
977
978 next_gpu = pool->get(mutex, cur_gpu() - GPU_OFFSET) + GPU_OFFSET;
979 {
980 MigrateIfNeeded(next_gpu);
981
982 unsigned int numcycles = (unsigned int)(cur_hz() * gpu_sec_time);
983
984 if(SEND_SIZE > 0)
985 chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE,
986 cudaMemcpyHostToDevice, cur_gpu(), useEngineLocks());
987
988 /* one block per sm, one warp per block */
989 cudaEventRecord(start, cur_stream());
990 docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles);
991// docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], d_iteration_count[cur_gpu()], cur_elem_per_thread(), numcycles);
992 cudaEventRecord(end, cur_stream());
993 cudaEventSynchronize(end);
994 cudaStreamSynchronize(cur_stream());
995
996// chunkMemcpy(this_gpu(h_iteration_count), this_gpu(d_iteration_count), sizeof(unsigned int),
997// cudaMemcpyDeviceToHost, cur_gpu(), useEngineLocks());
998//
999 cudaEventElapsedTime(&ms, start, end);
1000 ms_sum += ms;
1001 ++gpucount;
1002// printf("%f\n", ms);
1003// printf("%f: %u\n", ms, this_gpu(h_iteration_count)[0]);
1004
1005
1006 if(RECV_SIZE > 0)
1007 chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE,
1008 cudaMemcpyDeviceToHost, cur_gpu(), useEngineLocks());
1009
1010 if (MIGRATE_VIA_SYSMEM)
1011 PullState();
1012 }
1013 pool->put(mutex, cur_gpu() - GPU_OFFSET);
1014
1015 last_gpu() = cur_gpu();
1016
1017out:
1018 return;
1019}
1020
1021
1022
1023
1024static void usage(char *error) {
1025 fprintf(stderr, "Error: %s\n", error);
1026 fprintf(stderr,
1027 "Usage:\n"
1028 " rt_spin [COMMON-OPTS] WCET PERIOD DURATION\n"
1029 " rt_spin [COMMON-OPTS] -f FILE [-o COLUMN] WCET PERIOD\n"
1030 " rt_spin -l\n"
1031 "\n"
1032 "COMMON-OPTS = [-w] [-s SCALE]\n"
1033 " [-p PARTITION/CLUSTER [-z CLUSTER SIZE]] [-c CLASS]\n"
1034 " [-X LOCKING-PROTOCOL] [-L CRITICAL SECTION LENGTH] [-Q RESOURCE-ID]"
1035 "\n"
1036 "WCET and PERIOD are milliseconds, DURATION is seconds.\n"
1037 "CRITICAL SECTION LENGTH is in milliseconds.\n");
1038 exit(EXIT_FAILURE);
1039}
1040
1041/*
1042 * returns the character that made processing stop, newline or EOF
1043 */
1044static int skip_to_next_line(FILE *fstream)
1045{
1046 int ch;
1047 for (ch = fgetc(fstream); ch != EOF && ch != '\n'; ch = fgetc(fstream));
1048 return ch;
1049}
1050
1051static void skip_comments(FILE *fstream)
1052{
1053 int ch;
1054 for (ch = fgetc(fstream); ch == '#'; ch = fgetc(fstream))
1055 skip_to_next_line(fstream);
1056 ungetc(ch, fstream);
1057}
1058
1059static void get_exec_times(const char *file, const int column,
1060 int *num_jobs, double **exec_times)
1061{
1062 FILE *fstream;
1063 int cur_job, cur_col, ch;
1064 *num_jobs = 0;
1065
1066 fstream = fopen(file, "r");
1067 if (!fstream)
1068 bail_out("could not open execution time file");
1069
1070 /* figure out the number of jobs */
1071 do {
1072 skip_comments(fstream);
1073 ch = skip_to_next_line(fstream);
1074 if (ch != EOF)
1075 ++(*num_jobs);
1076 } while (ch != EOF);
1077
1078 if (-1 == fseek(fstream, 0L, SEEK_SET))
1079 bail_out("rewinding file failed");
1080
1081 /* allocate space for exec times */
1082 *exec_times = (double*)calloc(*num_jobs, sizeof(*exec_times));
1083 if (!*exec_times)
1084 bail_out("couldn't allocate memory");
1085
1086 for (cur_job = 0; cur_job < *num_jobs && !feof(fstream); ++cur_job) {
1087
1088 skip_comments(fstream);
1089
1090 for (cur_col = 1; cur_col < column; ++cur_col) {
1091 /* discard input until we get to the column we want */
1092 int unused __attribute__ ((unused)) = fscanf(fstream, "%*s,");
1093 }
1094
1095 /* get the desired exec. time */
1096 if (1 != fscanf(fstream, "%lf", (*exec_times)+cur_job)) {
1097 fprintf(stderr, "invalid execution time near line %d\n",
1098 cur_job);
1099 exit(EXIT_FAILURE);
1100 }
1101
1102 skip_to_next_line(fstream);
1103 }
1104
1105 assert(cur_job == *num_jobs);
1106 fclose(fstream);
1107}
1108
1109#define NUMS 4096
1110static int num[NUMS];
1111__attribute__((unused)) static char* progname;
1112
1113static int loop_once(void)
1114{
1115 int i, j = 0;
1116 for (i = 0; i < NUMS; i++)
1117 j += num[i]++;
1118 return j;
1119}
1120
1121static int loop_for(double exec_time, double emergency_exit)
1122{
1123 double last_loop = 0, loop_start;
1124 int tmp = 0;
1125
1126 double start = cputime();
1127 double now = cputime();
1128
1129 if (emergency_exit && wctime() > emergency_exit)
1130 goto out;
1131
1132 while (now + last_loop < start + exec_time) {
1133 loop_start = now;
1134 tmp += loop_once();
1135 now = cputime();
1136 last_loop = now - loop_start;
1137 if (emergency_exit && wctime() > emergency_exit) {
1138 /* Oops --- this should only be possible if the execution time tracking
1139 * is broken in the LITMUS^RT kernel. */
1140 fprintf(stderr, "!!! gpuspin/%d emergency exit!\n", getpid());
1141 fprintf(stderr, "Something is seriously wrong! Do not ignore this.\n");
1142 break;
1143 }
1144 }
1145
1146out:
1147 return tmp;
1148}
1149
1150
1151static void debug_delay_loop(void)
1152{
1153 double start, end, delay;
1154
1155 while (1) {
1156 for (delay = 0.5; delay > 0.01; delay -= 0.01) {
1157 start = wctime();
1158 loop_for(delay, 0);
1159 end = wctime();
1160 printf("%6.4fs: looped for %10.8fs, delta=%11.8fs, error=%7.4f%%\n",
1161 delay,
1162 end - start,
1163 end - start - delay,
1164 100 * (end - start - delay) / delay);
1165 }
1166 }
1167}
1168
1169static int gpu_job(double exec_time, double gpu_exec_time, double program_end)
1170{
1171 double chunk1, chunk2;
1172
1173 if (wctime() > program_end) {
1174 return 0;
1175 }
1176 else {
1177 chunk1 = exec_time * drand48();
1178 chunk2 = exec_time - chunk1;
1179
1180 loop_for(chunk1, program_end + 1);
1181 gpu_loop_for(gpu_exec_time, program_end + 1);
1182 loop_for(chunk2, program_end + 1);
1183
1184 sleep_next_period();
1185 }
1186 return 1;
1187}
1188
1189static int job(double exec_time, double program_end)
1190{
1191 if (wctime() > program_end) {
1192 return 0;
1193 }
1194 else {
1195 loop_for(exec_time, program_end + 1);
1196 sleep_next_period();
1197 }
1198 return 1;
1199}
1200
1201/*****************************/
1202/* only used for linux modes */
1203
1204static struct timespec periodTime;
1205static struct timespec releaseTime;
1206static unsigned int job_no = 0;
1207
1208static lt_t period_ns;
1209
1210static void log_release()
1211{
1212 __attribute__ ((unused)) lt_t rel = releaseTime.tv_sec * s2ns(1) + releaseTime.tv_nsec;
1213 __attribute__ ((unused)) lt_t dead = rel + period_ns;
1214 trace_release(rel, dead, job_no);
1215}
1216
1217static void log_completion()
1218{
1219 trace_completion(job_no);
1220 ++job_no;
1221}
1222
1223static void setup_next_period_linux(struct timespec* spec, struct timespec* period)
1224{
1225 spec->tv_sec += period->tv_sec;
1226 spec->tv_nsec += period->tv_nsec;
1227 if (spec->tv_nsec >= s2ns(1)) {
1228 ++(spec->tv_sec);
1229 spec->tv_nsec -= s2ns(1);
1230 }
1231}
1232
1233static void sleep_next_period_linux()
1234{
1235 log_completion();
1236 setup_next_period_linux(&releaseTime, &periodTime);
1237 clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &releaseTime, NULL);
1238 log_release();
1239}
1240
1241static void init_linux()
1242{
1243 mlockall(MCL_CURRENT | MCL_FUTURE);
1244}
1245
1246static int gpu_job_linux(double exec_time, double gpu_exec_time, double program_end)
1247{
1248 double chunk1, chunk2;
1249
1250 if (wctime() > program_end) {
1251 return 0;
1252 }
1253 else {
1254 chunk1 = exec_time * drand48();
1255 chunk2 = exec_time - chunk1;
1256
1257 loop_for(chunk1, program_end + 1);
1258 gpu_loop_for_linux(gpu_exec_time, program_end + 1);
1259 loop_for(chunk2, program_end + 1);
1260
1261 sleep_next_period_linux();
1262 }
1263 return 1;
1264}
1265
1266static int job_linux(double exec_time, double program_end)
1267{
1268 if (wctime() > program_end) {
1269 return 0;
1270 }
1271 else {
1272 loop_for(exec_time, program_end + 1);
1273 sleep_next_period_linux();
1274 }
1275 return 1;
1276}
1277
1278/*****************************/
1279
1280enum eScheduler
1281{
1282 LITMUS,
1283 LINUX,
1284 RT_LINUX
1285};
1286
1287#define CPU_OPTIONS "p:z:c:wlveio:f:s:q:X:L:Q:"
1288#define GPU_OPTIONS "g:y:r:C:E:dG:xS:R:T:Z:aFm:b:MNI"
1289
1290// concat the option strings
1291#define OPTSTR CPU_OPTIONS GPU_OPTIONS
1292
1293int main(int argc, char** argv)
1294{
1295 int ret;
1296 lt_t wcet;
1297 lt_t period;
1298 double wcet_ms = -1, gpu_wcet_ms = -1, period_ms = -1;
1299 unsigned int priority = LITMUS_LOWEST_PRIORITY;
1300 int migrate = 0;
1301 int cluster = 0;
1302 int cluster_size = 1;
1303 int opt;
1304 int wait = 0;
1305 int test_loop = 0;
1306 int column = 1;
1307 const char *file = NULL;
1308 int want_enforcement = 0;
1309 int want_signals = 0;
1310 double duration = 0, start = 0;
1311 double *exec_times = NULL;
1312 double scale = 1.0;
1313 task_class_t cls = RT_CLASS_HARD;
1314 int cur_job = 0, num_jobs = 0;
1315 struct rt_task param;
1316
1317 double budget_ms = -1.0;
1318 lt_t budget;
1319
1320 int num_gpu_users = 0;
1321
1322
1323 eScheduler scheduler = LITMUS;
1324
1325 /* locking */
1326// int lock_od = -1;
1327// int resource_id = 0;
1328// int protocol = -1;
1329// double cs_length = 1; /* millisecond */
1330
1331 progname = argv[0];
1332
1333 while ((opt = getopt(argc, argv, OPTSTR)) != -1) {
1334 switch (opt) {
1335 case 'w':
1336 wait = 1;
1337 break;
1338 case 'p':
1339 cluster = atoi(optarg);
1340 migrate = 1;
1341 break;
1342 case 'z':
1343 cluster_size = atoi(optarg);
1344 CPU_PARTITION_SIZE = cluster_size;
1345 break;
1346 case 'g':
1347 GPU_USING = true;
1348 GPU_PARTITION = atoi(optarg);
1349 assert(GPU_PARTITION >= 0 && GPU_PARTITION < NR_GPUS);
1350 break;
1351 case 'y':
1352 GPU_PARTITION_SIZE = atoi(optarg);
1353 assert(GPU_PARTITION_SIZE > 0);
1354 break;
1355 case 'r':
1356 RHO = atoi(optarg);
1357 assert(RHO > 0);
1358 break;
1359 case 'C':
1360 NUM_COPY_ENGINES = atoi(optarg);
1361 assert(NUM_COPY_ENGINES == 1 || NUM_COPY_ENGINES == 2);
1362 break;
1363 case 'E':
1364 USE_ENGINE_LOCKS = true;
1365 ENGINE_LOCK_TYPE = (eEngineLockTypes)atoi(optarg);
1366 assert(ENGINE_LOCK_TYPE == FIFO || ENGINE_LOCK_TYPE == PRIOQ);
1367 break;
1368 case 'd':
1369 USE_DYNAMIC_GROUP_LOCKS = true;
1370 break;
1371 case 'G':
1372 GPU_SYNC_MODE = (eGpuSyncMode)atoi(optarg);
1373 assert(GPU_SYNC_MODE >= IKGLP_MODE && GPU_SYNC_MODE <= RGEM_MODE);
1374 break;
1375 case 'a':
1376 ENABLE_AFFINITY = true;
1377 break;
1378 case 'F':
1379 RELAX_FIFO_MAX_LEN = true;
1380 break;
1381 case 'x':
1382 CUDA_SYNC_MODE = SPIN;
1383 break;
1384 case 'S':
1385 SEND_SIZE = kbToB((size_t)atoi(optarg));
1386 break;
1387 case 'R':
1388 RECV_SIZE = kbToB((size_t)atoi(optarg));
1389 break;
1390 case 'T':
1391 STATE_SIZE = kbToB((size_t)atoi(optarg));
1392 break;
1393 case 'Z':
1394 ENABLE_CHUNKING = true;
1395 CHUNK_SIZE = kbToB((size_t)atoi(optarg));
1396 break;
1397 case 'M':
1398 MIGRATE_VIA_SYSMEM = true;
1399 break;
1400 case 'm':
1401 num_gpu_users = atoi(optarg);
1402 assert(num_gpu_users > 0);
1403 break;
1404 case 'b':
1405 budget_ms = atoi(optarg);
1406 break;
1407 case 'N':
1408 scheduler = LINUX;
1409 break;
1410 case 'I':
1411 scheduler = RT_LINUX;
1412 break;
1413 case 'q':
1414 priority = atoi(optarg);
1415 break;
1416 case 'c':
1417 cls = str2class(optarg);
1418 if (cls == -1)
1419 usage("Unknown task class.");
1420 break;
1421 case 'e':
1422 want_enforcement = 1;
1423 break;
1424 case 'i':
1425 want_signals = 1;
1426 break;
1427 case 'l':
1428 test_loop = 1;
1429 break;
1430 case 'o':
1431 column = atoi(optarg);
1432 break;
1433// case 'f':
1434// file = optarg;
1435// break;
1436 case 's':
1437 scale = atof(optarg);
1438 break;
1439// case 'X':
1440// protocol = lock_protocol_for_name(optarg);
1441// if (protocol < 0)
1442// usage("Unknown locking protocol specified.");
1443// break;
1444// case 'L':
1445// cs_length = atof(optarg);
1446// if (cs_length <= 0)
1447// usage("Invalid critical section length.");
1448// break;
1449// case 'Q':
1450// resource_id = atoi(optarg);
1451// if (resource_id <= 0 && strcmp(optarg, "0"))
1452// usage("Invalid resource ID.");
1453// break;
1454 case ':':
1455 usage("Argument missing.");
1456 break;
1457 case '?':
1458 default:
1459 usage("Bad argument.");
1460 break;
1461 }
1462 }
1463
1464#ifdef VANILLA_LINUX
1465 assert(scheduler != LITMUS);
1466 assert(!wait);
1467#endif
1468
1469 // turn off some features to be safe
1470 if (scheduler != LITMUS)
1471 {
1472 RHO = 0;
1473 USE_ENGINE_LOCKS = false;
1474 USE_DYNAMIC_GROUP_LOCKS = false;
1475 ENABLE_AFFINITY = false;
1476 RELAX_FIFO_MAX_LEN = false;
1477 ENABLE_RT_AUX_THREADS = false;
1478 budget_ms = -1;
1479 want_enforcement = 0;
1480 want_signals = 0;
1481
1482 if (scheduler == RT_LINUX)
1483 {
1484 struct sched_param fifoparams;
1485
1486 assert(priority >= sched_get_priority_min(SCHED_FIFO) &&
1487 priority <= sched_get_priority_max(SCHED_FIFO));
1488
1489 memset(&fifoparams, 0, sizeof(fifoparams));
1490 fifoparams.sched_priority = priority;
1491 assert(0 == sched_setscheduler(getpid(), SCHED_FIFO, &fifoparams));
1492 }
1493 }
1494 else
1495 {
1496 if (!litmus_is_valid_fixed_prio(priority))
1497 usage("Invalid priority.");
1498 }
1499
1500 if (test_loop) {
1501 debug_delay_loop();
1502 return 0;
1503 }
1504
1505 srand(getpid());
1506
1507 if (file) {
1508 get_exec_times(file, column, &num_jobs, &exec_times);
1509
1510 if (argc - optind < 2)
1511 usage("Arguments missing.");
1512
1513 for (cur_job = 0; cur_job < num_jobs; ++cur_job) {
1514 /* convert the execution time to seconds */
1515 duration += exec_times[cur_job] * 0.001;
1516 }
1517 } else {
1518 /*
1519 * if we're not reading from the CSV file, then we need
1520 * three parameters
1521 */
1522 if (argc - optind < 3)
1523 usage("Arguments missing.");
1524 }
1525
1526 if (argc - optind == 3) {
1527 assert(!GPU_USING);
1528 wcet_ms = atof(argv[optind + 0]);
1529 period_ms = atof(argv[optind + 1]);
1530 duration = atof(argv[optind + 2]);
1531 }
1532 else if (argc - optind == 4) {
1533 assert(GPU_USING);
1534 wcet_ms = atof(argv[optind + 0]);
1535 gpu_wcet_ms = atof(argv[optind + 1]);
1536 period_ms = atof(argv[optind + 2]);
1537 duration = atof(argv[optind + 3]);
1538 }
1539
1540 wcet = ms2ns(wcet_ms);
1541 period = ms2ns(period_ms);
1542 if (wcet <= 0)
1543 usage("The worst-case execution time must be a "
1544 "positive number.");
1545 if (period <= 0)
1546 usage("The period must be a positive number.");
1547 if (!file && wcet > period) {
1548 usage("The worst-case execution time must not "
1549 "exceed the period.");
1550 }
1551 if (GPU_USING && gpu_wcet_ms <= 0)
1552 usage("The worst-case gpu execution time must be a positive number.");
1553
1554 if (budget_ms > 0)
1555 budget = ms2ns(budget_ms);
1556 else
1557 budget = wcet;
1558
1559 if (file && num_jobs > 1)
1560 duration += period_ms * 0.001 * (num_jobs - 1);
1561
1562 if (migrate) {
1563 ret = be_migrate_to_cluster(cluster, cluster_size);
1564 if (ret < 0)
1565 bail_out("could not migrate to target partition or cluster.");
1566 }
1567
1568 if (scheduler != LITMUS)
1569 {
1570 // set some variables needed by linux modes
1571 if (GPU_USING)
1572 {
1573 TRACE_MIGRATIONS = true;
1574 }
1575 periodTime.tv_sec = period / s2ns(1);
1576 periodTime.tv_nsec = period - periodTime.tv_sec * s2ns(1);
1577 period_ns = period;
1578 }
1579
1580 init_rt_task_param(&param);
1581 param.exec_cost = budget;
1582 param.period = period;
1583 param.priority = priority;
1584 param.cls = cls;
1585 param.budget_policy = (want_enforcement) ?
1586 PRECISE_ENFORCEMENT : NO_ENFORCEMENT;
1587 param.budget_signal_policy = (want_enforcement && want_signals) ?
1588 PRECISE_SIGNALS : NO_SIGNALS;
1589 param.release_policy = PERIODIC;
1590
1591 if (migrate)
1592 param.cpu = cluster_to_first_cpu(cluster, cluster_size);
1593 ret = set_rt_task_param(gettid(), &param);
1594 if (ret < 0)
1595 bail_out("could not setup rt task params");
1596
1597 if (scheduler == LITMUS)
1598 init_litmus();
1599 else
1600 init_linux();
1601
1602 if (want_signals) {
1603 /* bind default longjmp signal handler to SIG_BUDGET. */
1604 activate_litmus_signals(SIG_BUDGET_MASK, longjmp_on_litmus_signal);
1605 }
1606
1607 if (scheduler == LITMUS)
1608 {
1609 ret = task_mode(LITMUS_RT_TASK);
1610 if (ret != 0)
1611 bail_out("could not become RT task");
1612 }
1613 else
1614 {
1615 trace_name();
1616 trace_param();
1617 }
1618
1619// if (protocol >= 0) {
1620// /* open reference to semaphore */
1621// lock_od = litmus_open_lock(protocol, resource_id, lock_namespace, &cluster);
1622// if (lock_od < 0) {
1623// perror("litmus_open_lock");
1624// usage("Could not open lock.");
1625// }
1626// }
1627
1628 if (GPU_USING) {
1629 allocate_locks(num_gpu_users, scheduler != LITMUS);
1630
1631 signal(SIGABRT, catch_exit);
1632 signal(SIGTERM, catch_exit);
1633 signal(SIGQUIT, catch_exit);
1634 signal(SIGSEGV, catch_exit);
1635
1636 init_cuda(num_gpu_users);
1637 safetynet = true;
1638
1639 if (ENABLE_RT_AUX_THREADS)
1640 if (enable_aux_rt_tasks(AUX_CURRENT | AUX_FUTURE) != 0)
1641 bail_out("enable_aux_rt_tasks() failed");
1642 }
1643
1644 if (wait) {
1645 ret = wait_for_ts_release2(&releaseTime);
1646 if (ret != 0)
1647 bail_out("wait_for_ts_release2()");
1648
1649 if (scheduler != LITMUS)
1650 log_release();
1651 }
1652 else if (scheduler != LITMUS)
1653 {
1654 clock_gettime(CLOCK_MONOTONIC, &releaseTime);
1655 sleep_next_period_linux();
1656 }
1657
1658 start = wctime();
1659
1660 if (scheduler == LITMUS)
1661 {
1662 if (!GPU_USING) {
1663 while (job(wcet_ms * 0.001 * scale, start + duration));
1664 }
1665 else {
1666 while (gpu_job(wcet_ms * 0.001 * scale,
1667 gpu_wcet_ms * 0.001 * scale,
1668 start + duration));
1669 }
1670 }
1671 else
1672 {
1673 if (!GPU_USING) {
1674 while (job_linux(wcet_ms * 0.001 * scale, start + duration));
1675 }
1676 else {
1677 while (gpu_job_linux(wcet_ms * 0.001 * scale,
1678 gpu_wcet_ms * 0.001 * scale,
1679 start + duration));
1680 }
1681 }
1682
1683 if (GPU_USING && ENABLE_RT_AUX_THREADS)
1684 if (disable_aux_rt_tasks(AUX_CURRENT | AUX_FUTURE) != 0)
1685 bail_out("disable_aux_rt_tasks() failed");
1686
1687// if (file) {
1688// /* use times read from the CSV file */
1689// for (cur_job = 0; cur_job < num_jobs; ++cur_job) {
1690// /* convert job's length to seconds */
1691// job(exec_times[cur_job] * 0.001 * scale,
1692// start + duration,
1693// lock_od, cs_length * 0.001);
1694// }
1695// } else {
1696// /* convert to seconds and scale */
1697// while (job(wcet_ms * 0.001 * scale, start + duration,
1698// lock_od, cs_length * 0.001));
1699// }
1700
1701 if (scheduler == LITMUS)
1702 {
1703 ret = task_mode(BACKGROUND_TASK);
1704 if (ret != 0)
1705 bail_out("could not become regular task (huh?)");
1706 }
1707
1708 if (GPU_USING) {
1709 safetynet = false;
1710 exit_cuda();
1711
1712
1713 printf("avg: %f\n", ms_sum/gpucount);
1714 }
1715
1716 if (file)
1717 free(exec_times);
1718
1719 return 0;
1720}
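
For reference, the liblitmus call sequence that the new gpuspin main() follows (init_rt_task_param / set_rt_task_param, then init_litmus and task_mode, with sleep_next_period between jobs) is summarized in the minimal sketch below. This sketch is not part of the commit; the function name, the 10 ms budget, the 100 ms period, and the fixed 100-job loop are placeholders, and error handling is reduced to early returns.

#include "litmus.h"

static int run_minimal_periodic_task(void)
{
	struct rt_task param;
	int ret;

	init_rt_task_param(&param);
	param.exec_cost = ms2ns(10);	/* placeholder budget */
	param.period = ms2ns(100);	/* placeholder period */
	param.cls = RT_CLASS_SOFT;
	param.budget_policy = NO_ENFORCEMENT;
	param.release_policy = PERIODIC;

	ret = set_rt_task_param(gettid(), &param);
	if (ret < 0)
		return ret;

	init_litmus();

	ret = task_mode(LITMUS_RT_TASK);
	if (ret != 0)
		return ret;

	for (int job = 0; job < 100; ++job) {
		/* one job's worth of work goes here */
		sleep_next_period();
	}

	return task_mode(BACKGROUND_TASK);
}
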
diff --git a/gpu/rtspin_fake_cuda.cpp b/gpu/rtspin_fake_cuda.cpp
deleted file mode 100644
index 247a74c..0000000
--- a/gpu/rtspin_fake_cuda.cpp
+++ /dev/null
@@ -1,1187 +0,0 @@
1#include <sys/time.h>
2
3#include <stdint.h>
4#include <stdio.h>
5#include <stdlib.h>
6#include <unistd.h>
7#include <time.h>
8#include <assert.h>
9#include <fcntl.h>
10#include <errno.h>
11
12#include <blitz/array.h>
13
14#include <boost/interprocess/managed_shared_memory.hpp>
15#include <boost/interprocess/sync/interprocess_barrier.hpp>
16#include <boost/interprocess/sync/interprocess_mutex.hpp>
17
18#include "litmus.h"
19
20using namespace blitz;
21using namespace std;
22using namespace boost::interprocess;
23
24#define RESET_RELEASE_ON_MISS
25
26
27void bail_out(const char* msg)
28{
29 perror(msg);
30 exit(-1 * errno);
31}
32
33
34static void usage(char *error) {
35 fprintf(stderr, "Error: %s\n", error);
36 fprintf(stderr,
37 "Usage:\n"
38 " rt_spin [COMMON-OPTS] WCET PERIOD DURATION\n"
39 " rt_spin [COMMON-OPTS] -f FILE [-o COLUMN] WCET PERIOD\n"
40 " rt_spin -l\n"
41 "\n"
42 "COMMON-OPTS = [-w] [-p PARTITION] [-c CLASS] [-s SCALE]\n"
43 "\n"
44 "WCET and PERIOD are milliseconds, DURATION is seconds.\n");
45 exit(EXIT_FAILURE);
46}
47
48#define NUMS 4096
49static int num[NUMS];
50
51#define PAGE_SIZE (1024*4)
52
53bool ENABLE_WAIT = true;
54bool GPU_TASK = false;
55bool ENABLE_AFFINITY = false;
56bool USE_KFMLP = false;
57bool RELAX_FIFO_MAX_LEN = false;
58bool USE_DYNAMIC_GROUP_LOCKS = false;
59bool BROADCAST_STATE = false;
60bool ENABLE_CHUNKING = false;
61bool MIGRATE_VIA_SYSMEM = false;
62bool USE_PRIOQ = false;
63
64int GPU_PARTITION = 0;
65int GPU_PARTITION_SIZE = 0;
66int NUM_SIMULT_USERS = 1;
67size_t SEND_SIZE = 0;
68size_t RECV_SIZE = 0;
69size_t STATE_SIZE = 0;
70size_t CHUNK_SIZE = PAGE_SIZE;
71
72
73#define MAX_GPUS 8
74
75int KEXCLU_LOCK;
76int EE_LOCKS[MAX_GPUS];
77int CE_SEND_LOCKS[MAX_GPUS];
78int CE_RECV_LOCKS[MAX_GPUS];
79
80int CUR_DEVICE = -1;
81int LAST_DEVICE = -1;
82
83bool useEngineLocks()
84{
85 return(NUM_SIMULT_USERS != 1);
86}
87
88int gpuCyclesPerSecond = 0;
89
90uint64_t *init_release_time = NULL;
91barrier *release_barrier = NULL;
92barrier *gpu_barrier = NULL;
93interprocess_mutex *gpu_mgmt_mutexes = NULL;
94managed_shared_memory *segment_ptr = NULL;
95managed_shared_memory *release_segment_ptr = NULL;
96
 97// observed average per-byte copy rate when four GPUs on the same node are in use from page-locked memory.
 98// about 1/3 to 1/4 of this when there is no bus contention.
99//const double msPerByte = 4.22e-07;
100//const double transOverhead = 0.01008; // also observed.
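//   (for scale, using the commented-out observed constants above: a 4 MiB
//    transfer would be estimated at 4,194,304 B * 4.22e-07 ms/B + 0.01008 ms
//    ~= 1.78 ms under contention.)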
101
102
103
104char *d_send_data[MAX_GPUS] = {0};
105char *d_recv_data[MAX_GPUS] = {0};
106char *d_state_data[MAX_GPUS] = {0};
107
108//cudaStream_t streams[MAX_GPUS];
109
110char *h_send_data = 0;
111char *h_recv_data = 0;
112char *h_state_data = 0;
113
114
115#include <sys/mman.h>
116#define USE_PAGE_LOCKED_MEMORY
117#ifdef USE_PAGE_LOCKED_MEMORY
118#define c_malloc(s) \
119 mmap(NULL, s , \
120 PROT_READ | PROT_WRITE, \
121 MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, \
122 -1, 0)
123#else
124#define c_malloc(s) malloc(s)
125#endif
126
127typedef int cudaError_t;
128#define cudaSuccess 0
129
130enum cudaMemcpyKind {
131cudaMemcpyHostToDevice = 0,
132cudaMemcpyDeviceToHost = 1,
133cudaMemcpyDeviceToDevice = 2,
134};
135
136cudaError_t cudaGetLastError()
137{
138 return cudaSuccess;
139}
140
141////////////////////////////////////////////////////////////////////////
142////////////////////////////////////////////////////////////////////////
143////////////////////////////////////////////////////////////////////////
144////////////////////////////////////////////////////////////////////////
145
146struct ce_lock_state
147{
148 int locks[2];
149 size_t num_locks;
150 size_t budget_remaining;
151 bool locked;
152
153 ce_lock_state(int device_a, enum cudaMemcpyKind kind, size_t size, int device_b = -1) {
154 num_locks = (device_a != -1) + (device_b != -1);
155
156 if(device_a != -1) {
157 locks[0] = (kind == cudaMemcpyHostToDevice) ?
158 CE_SEND_LOCKS[device_a] : CE_RECV_LOCKS[device_a];
159 }
160
161 if(device_b != -1) {
162 assert(kind == cudaMemcpyDeviceToDevice);
163
164 locks[1] = CE_RECV_LOCKS[device_b];
165
166 if(locks[1] < locks[0]) {
167 int temp = locks[1];
168 locks[1] = locks[0];
169 locks[0] = temp;
170 }
171 }
172
173 if(!ENABLE_CHUNKING)
174 budget_remaining = size;
175 else
176 budget_remaining = CHUNK_SIZE;
177 }
178
179 void lock() {
180 if(USE_DYNAMIC_GROUP_LOCKS) {
181 litmus_dgl_lock(locks, num_locks);
182 }
183 else
184 {
185 for(int l = 0; l < num_locks; ++l)
186 {
187 litmus_lock(locks[l]);
188 }
189 }
190 locked = true;
191 }
192
193 void unlock() {
194 if(USE_DYNAMIC_GROUP_LOCKS) {
195 litmus_dgl_unlock(locks, num_locks);
196 }
197 else
198 {
199 // reverse order
200 for(int l = num_locks - 1; l >= 0; --l)
201 {
202 litmus_unlock(locks[l]);
203 }
204 }
205 locked = false;
206 }
207
208 void refresh() {
209 budget_remaining = CHUNK_SIZE;
210 }
211
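	// note: despite the name, this returns true when the pending transfer
	// would meet or exceed the remaining budget, i.e. when the engine
	// locks should be released and the budget replenished before the
	// next chunk is copied.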
212 bool budgetIsAvailable(size_t tosend) {
213 return(tosend >= budget_remaining);
214 }
215
216 void decreaseBudget(size_t spent) {
217 budget_remaining -= spent;
218 }
219};
220
221// precondition: if do_locking == true, locks in state are held.
222cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count,
223 enum cudaMemcpyKind kind,
224 ce_lock_state* state)
225{
226 cudaError_t ret = cudaSuccess;
227 int remaining = count;
228
229 char* dst = (char*)a_dst;
230 const char* src = (const char*)a_src;
231
232 // disable chunking, if needed, by setting chunk_size equal to the
233 // amount of data to be copied.
234 int chunk_size = (ENABLE_CHUNKING) ? CHUNK_SIZE : count;
235 int i = 0;
236
237 while(remaining != 0)
238 {
239 int bytesToCopy = std::min(remaining, chunk_size);
240
241 if(state && state->budgetIsAvailable(bytesToCopy) && state->locked) {
242 //cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) );
243 ret = cudaGetLastError();
244
245 if(ret != cudaSuccess)
246 {
247 break;
248 }
249
250 state->unlock();
 251 state->refresh(); // replenish.
 252 // we can only run out of
 253 // budget if chunking is enabled.
 254 // we presume that the initial budget
 255 // would be set to cover the entire
 256 // memcpy if chunking were disabled.
257 }
258
259 if(state && !state->locked) {
260 state->lock();
261 }
262
263 //ret = cudaMemcpy(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind);
264 //cudaMemcpyAsync(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind, streams[CUR_DEVICE]);
265
266 if(state) {
267 state->decreaseBudget(bytesToCopy);
268 }
269
270// if(ret != cudaSuccess)
271// {
272// break;
273// }
274
275 ++i;
276 remaining -= bytesToCopy;
277 }
278 return ret;
279}
280
281cudaError_t chunkMemcpy(void* a_dst, const void* a_src, size_t count,
282 enum cudaMemcpyKind kind,
283 int device_a = -1, // device_a == -1 disables locking
284 bool do_locking = true,
285 int device_b = -1)
286{
287 cudaError_t ret;
288 if(!do_locking || device_a == -1) {
289 ret = __chunkMemcpy(a_dst, a_src, count, kind, NULL);
290 //cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) );
291 if(ret == cudaSuccess)
292 ret = cudaGetLastError();
293 }
294 else {
295 ce_lock_state state(device_a, kind, count, device_b);
296 state.lock();
297 ret = __chunkMemcpy(a_dst, a_src, count, kind, &state);
298 //cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) );
299 if(ret == cudaSuccess)
300 ret = cudaGetLastError();
301 state.unlock();
302 }
303 return ret;
304}
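
The two functions above implement the patch's chunked-copy idea: split a large transfer into CHUNK_SIZE pieces and release the copy-engine lock(s) between pieces so that other tasks can interleave their own transfers. A minimal host-only sketch of the same idea is shown below, with std::mutex standing in for the LITMUS^RT engine lock and memcpy standing in for cudaMemcpy; the name chunked_copy and its parameters are illustrative, not part of the patch.

#include <algorithm>
#include <cstddef>
#include <cstring>
#include <mutex>

/* Copy 'count' bytes in 'chunk'-sized pieces, holding 'lock' only while one
 * piece is in flight, so another task may acquire the lock between pieces. */
static void chunked_copy(void* dst, const void* src, std::size_t count,
                         std::size_t chunk, std::mutex& lock)
{
	char* d = static_cast<char*>(dst);
	const char* s = static_cast<const char*>(src);
	std::size_t done = 0;

	while (done < count) {
		std::size_t n = std::min(count - done, chunk);
		lock.lock();
		std::memcpy(d + done, s + done, n);	/* stands in for cudaMemcpy() */
		lock.unlock();
		done += n;
	}
}
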
305
306
307////////////////////////////////////////////////////////////////////////
308////////////////////////////////////////////////////////////////////////
309////////////////////////////////////////////////////////////////////////
310
311
312inline uint64_t timespec_to_ns(const struct timespec& t)
313{
314 return(t.tv_sec*1e9 + t.tv_nsec);
315}
316
317inline struct timespec ns_to_timespec(const uint64_t& ns)
318{
 319 struct timespec temp = {(time_t)(ns / 1000000000ULL), (long)(ns % 1000000000ULL)};
320 return(temp);
321}
322
323inline uint64_t clock_gettime_ns(clockid_t clk_id)
324{
325 struct timespec temp;
326 clock_gettime(clk_id, &temp);
327 return timespec_to_ns(temp);
328}
329
330
331
332static int loop_once(void)
333{
334 int i, j = 0;
335 for (i = 0; i < NUMS; i++)
336 j += num[i]++;
337 return j;
338}
339
340static int loop_for(double exec_time, double emergency_exit)
341{
342 double last_loop = 0, loop_start;
343 int tmp = 0;
344
345 double start = cputime();
346 double now = cputime();
347
348 while (now + last_loop < start + exec_time) {
349 loop_start = now;
350 tmp += loop_once();
351 now = cputime();
352 last_loop = now - loop_start;
353 if (emergency_exit && wctime() > emergency_exit) {
354 /* Oops --- this should only be possible if the execution time tracking
355 * is broken in the LITMUS^RT kernel. */
356 fprintf(stderr, "!!! rtspin/%d emergency exit!\n", getpid());
357 fprintf(stderr, "Something is seriously wrong! Do not ignore this.\n");
358 break;
359 }
360 }
361
362 return tmp;
363}
364
365static void allocate_locks()
366{
367 // allocate k-FMLP lock
368 int fd = open("semaphores", O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
369
370 int base_name = GPU_PARTITION * 1000;
371
372 if(USE_KFMLP) {
373 KEXCLU_LOCK = open_kfmlp_gpu_sem(fd,
374 base_name, /* name */
375 GPU_PARTITION_SIZE,
376 GPU_PARTITION*GPU_PARTITION_SIZE,
377 NUM_SIMULT_USERS,
378 ENABLE_AFFINITY
379 );
380 }
381 else {
382 KEXCLU_LOCK = open_gpusync_token_lock(fd,
383 base_name, /* name */
384 GPU_PARTITION_SIZE,
385 GPU_PARTITION*GPU_PARTITION_SIZE,
386 NUM_SIMULT_USERS,
387 IKGLP_M_IN_FIFOS,
388 (!RELAX_FIFO_MAX_LEN) ?
389 IKGLP_OPTIMAL_FIFO_LEN :
390 IKGLP_UNLIMITED_FIFO_LEN,
391 ENABLE_AFFINITY
392 );
393// KEXCLU_LOCK = open_ikglp_gpu_sem(fd,
394// base_name, /* name */
395// GPU_PARTITION_SIZE,
396// GPU_PARTITION*GPU_PARTITION_SIZE,
397// NUM_SIMULT_USERS,
398// ENABLE_AFFINITY,
399// RELAX_FIFO_MAX_LEN
400// );
401 }
402 if(KEXCLU_LOCK < 0)
403 perror("open_kexclu_sem");
404
405 if(NUM_SIMULT_USERS > 1)
406 {
407 open_sem_t opensem = (!USE_PRIOQ) ? open_fifo_sem : open_prioq_sem;
408 const char* opensem_label = (!USE_PRIOQ) ? "open_fifo_sem" : "open_prioq_sem";
409
410 // allocate the engine locks.
411 for (int i = 0; i < MAX_GPUS; ++i)
412 {
413 EE_LOCKS[i] = opensem(fd, (i+1)*10 + base_name);
414 if(EE_LOCKS[i] < 0)
415 perror(opensem_label);
416
417 CE_SEND_LOCKS[i] = opensem(fd, (i+1)*10 + base_name + 1);
418 if(CE_SEND_LOCKS[i] < 0)
419 perror(opensem_label);
420
421 if(NUM_SIMULT_USERS == 3)
422 {
423 // allocate a separate lock for the second copy engine
424 CE_RECV_LOCKS[i] = opensem(fd, (i+1)*10 + base_name + 2);
425 if(CE_RECV_LOCKS[i] < 0)
426 perror(opensem_label);
427 }
428 else
429 {
430 // share a single lock for the single copy engine
431 CE_RECV_LOCKS[i] = CE_SEND_LOCKS[i];
432 }
433 }
434 }
435}
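// (name arithmetic, for reference: with GPU_PARTITION = 1 and device i = 2,
//  base_name = 1000, so the engine locks are opened under names 1030 (EE),
//  1031 (CE send) and 1032 (CE recv); the token lock itself uses name 1000.)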
436
437static void allocate_host_memory()
438{
439 // round up to page boundaries
440 size_t send_alloc_bytes = SEND_SIZE + (SEND_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
441 size_t recv_alloc_bytes = RECV_SIZE + (RECV_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
442 size_t state_alloc_bytes = STATE_SIZE + (STATE_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
443
444 printf("Allocating host memory. send = %dB, recv = %dB, state = %dB\n",
445 send_alloc_bytes, recv_alloc_bytes, state_alloc_bytes);
446
447// if(send_alloc_bytes > 0)
448// {
449// h_send_data = (char *)c_malloc(send_alloc_bytes);
450// memset(h_send_data, 0x55, send_alloc_bytes); // write some random value
451// // this will open a connection to GPU 0 if there is no active context, so
452// // expect long stalls. LAME.
453// cutilSafeCall( cudaHostRegister(h_send_data, send_alloc_bytes, cudaHostRegisterPortable) );
454// }
455//
456// if(recv_alloc_bytes > 0)
457// {
458// h_recv_data = (char *)c_malloc(recv_alloc_bytes);
459// memset(h_recv_data, 0xAA, recv_alloc_bytes);
460// cutilSafeCall( cudaHostRegister(h_recv_data, recv_alloc_bytes, cudaHostRegisterPortable) );
461// }
462//
463// if(state_alloc_bytes > 0)
464// {
465// h_state_data = (char *)c_malloc(state_alloc_bytes);
466// memset(h_state_data, 0xCC, state_alloc_bytes); // write some random value
467// cutilSafeCall( cudaHostRegister(h_state_data, state_alloc_bytes, cudaHostRegisterPortable) );
468// }
469
470 printf("Host memory allocated.\n");
471}
472
473static void allocate_device_memory()
474{
475 printf("Allocating device memory.\n");
476 // establish a connection to each GPU.
477// for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
478// {
479// int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
480//
481// if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].lock();
482//
483// cutilSafeCall( cudaSetDevice(which_device) );
484// cutilSafeCall( cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0) );
485// cutilSafeCall( cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0) );
486//
487// cutilSafeCall( cudaStreamCreate(&streams[which_device]) );
488//
489// /* pre-allocate memory, pray there's enough to go around */
490// if(SEND_SIZE > 0) {
491// cutilSafeCall( cudaMalloc((void**)&d_send_data[which_device], SEND_SIZE) );
492// }
493// if(RECV_SIZE > 0) {
 494// cutilSafeCall( cudaMalloc((void**)&d_recv_data[which_device], RECV_SIZE) );
495// }
496// if(STATE_SIZE > 0) {
 497// cutilSafeCall( cudaMalloc((void**)&d_state_data[which_device], STATE_SIZE) );
498// }
499//
500// if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].unlock();
501// }
502 printf("Device memory allocated.\n");
503}
504
505static void configure_gpus()
506{
507 printf("Configuring GPU\n");
508
509// // SUSPEND WHEN BLOCKED!!
510// cutilSafeCall( cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync) );
511//
512// // establish a connection to each GPU.
513// for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
514// {
515// int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
516//
517// if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].lock();
518//
519// cutilSafeCall( cudaSetDevice(which_device) );
520// cutilSafeCall( cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0) );
521// cutilSafeCall( cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0) );
522//
523// cutilSafeCall( cudaStreamCreate(&streams[which_device]) );
524//
525// // enable P2P migrations.
526// // we assume all GPUs are on the same I/O hub.
527// for(int j = 0; j < GPU_PARTITION_SIZE; ++j)
528// {
529// int other_device = GPU_PARTITION*GPU_PARTITION_SIZE + j;
530//
531// if(which_device != other_device)
532// {
533// cutilSafeCall( cudaDeviceEnablePeerAccess(other_device, 0) );
534// }
535// }
536//
537// if(i == 0)
538// {
539// struct cudaDeviceProp pi;
540// cudaGetDeviceProperties(&pi, i);
541// gpuCyclesPerSecond = pi.clockRate * 1000; /* khz -> hz */
542// }
543//
544// if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].unlock();
545// }
546
547 printf("GPUs have been configured.\n");
548}
549
550static void init_cuda()
551{
552 configure_gpus();
553 allocate_host_memory();
554 allocate_device_memory();
555 allocate_locks();
556}
557
558static void exit_cuda()
559{
560 for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
561 {
562 int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
563
564 if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].lock();
565
566// cutilSafeCall( cudaSetDevice(which_device) );
567// cutilSafeCall( cudaDeviceReset() );
568
569 if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].unlock();
570 }
571}
572
573static void catchExit(void)
574{
575 if(GPU_TASK)
576 {
577 // try to unlock everything. litmus will prevent bogus calls.
578 if(NUM_SIMULT_USERS > 1)
579 {
580 for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
581 {
582 int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
583
584 litmus_unlock(EE_LOCKS[which_device]);
585 litmus_unlock(CE_SEND_LOCKS[which_device]);
 586 if(NUM_SIMULT_USERS == 3) {
587 litmus_unlock(CE_RECV_LOCKS[which_device]);
588 }
589 }
590 }
591
592 if(CUR_DEVICE >= 0) {
593 unregister_nv_device(CUR_DEVICE);
594 }
595
596 litmus_unlock(KEXCLU_LOCK);
597 }
598}
599
600static void migrateToGPU(int destination)
601{
602 if(!BROADCAST_STATE && STATE_SIZE > 0)
603 {
604 if(MIGRATE_VIA_SYSMEM)
605 {
606 chunkMemcpy(h_state_data, d_state_data[LAST_DEVICE], STATE_SIZE,
607 cudaMemcpyDeviceToHost, LAST_DEVICE, useEngineLocks());
608 }
609 }
610
611// cutilSafeCall( cudaSetDevice(destination) );
612
613 if(!BROADCAST_STATE && STATE_SIZE > 0)
614 {
615 if(MIGRATE_VIA_SYSMEM)
616 {
617 chunkMemcpy(d_state_data[CUR_DEVICE], h_state_data, STATE_SIZE,
618 cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks());
619 }
620 else
621 {
622 chunkMemcpy(d_state_data[destination],
623 d_state_data[LAST_DEVICE],
624 STATE_SIZE,
625 cudaMemcpyDeviceToDevice,
626 CUR_DEVICE,
627 useEngineLocks(),
628 destination);
629 }
630 }
631}
632
633static void broadcastState(int from)
634{
635 if(STATE_SIZE > 0)
636 {
637 assert(CUR_DEVICE == from);
638
639 if(MIGRATE_VIA_SYSMEM)
640 {
641 chunkMemcpy(h_state_data, d_state_data[from], STATE_SIZE,
642 cudaMemcpyDeviceToHost, from, useEngineLocks());
643 }
644
645 for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
646 {
647 int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
648 if(which_device != from)
649 {
650 if(MIGRATE_VIA_SYSMEM)
651 {
652// cutilSafeCall( cudaSetDevice(which_device) );
653 CUR_DEVICE = which_device; // temporary
654 chunkMemcpy(d_state_data[which_device], h_state_data, STATE_SIZE,
655 cudaMemcpyHostToDevice, which_device, useEngineLocks());
656 }
657 else
658 {
659 chunkMemcpy(d_state_data[which_device],
660 d_state_data[from],
661 STATE_SIZE,
662 cudaMemcpyDeviceToDevice,
663 from,
664 useEngineLocks(),
665 which_device);
666 }
667 }
668 }
669
670 if(MIGRATE_VIA_SYSMEM && CUR_DEVICE != from)
671 {
672// cutilSafeCall( cudaSetDevice(from) );
673 CUR_DEVICE = from;
674 }
675 }
676}
677
678//// Executes on graphics card.
679//__global__ void docudaspin(unsigned int cycles)
680//{
681// long long unsigned int elapsed = 0;
682// long long int now = clock64();
683// long long int last;
684// do
685// {
686// last = now;
687// now = clock64();
688// elapsed += max(0ll, (long long int)(now - last)); // don't count iterations with clock roll-over
689// }while(elapsed < cycles);
690//
691// return;
692//}
693
694
695
696static void gpu_loop_for(double gpu_sec_time, double emergency_exit)
697{
698 unsigned int numcycles = (unsigned int)(gpuCyclesPerSecond * gpu_sec_time);
699 int numblocks = 1;
700 int blocksz = 1;
701
702 CUR_DEVICE = litmus_lock(KEXCLU_LOCK);
703 {
704 if(CUR_DEVICE != LAST_DEVICE && LAST_DEVICE != -1)
705 {
706 migrateToGPU(CUR_DEVICE);
707 }
708
709 if(SEND_SIZE > 0)
710 {
711 // handles chunking and locking, as appropriate.
712 chunkMemcpy(d_send_data[CUR_DEVICE], h_send_data, SEND_SIZE,
713 cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks());
714 }
715
716 if(useEngineLocks()) litmus_lock(EE_LOCKS[CUR_DEVICE]);
717
718// docudaspin <<<numblocks,blocksz, 0, streams[CUR_DEVICE]>>> (numcycles);
719// cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) );
720
721 if(useEngineLocks()) litmus_unlock(EE_LOCKS[CUR_DEVICE]);
722
723 if(RECV_SIZE > 0)
724 {
725 chunkMemcpy(h_recv_data, d_recv_data[CUR_DEVICE], RECV_SIZE,
726 cudaMemcpyDeviceToHost, CUR_DEVICE, useEngineLocks());
727 }
728
729 if(BROADCAST_STATE)
730 {
731 broadcastState(CUR_DEVICE);
732 }
733 }
734 litmus_unlock(KEXCLU_LOCK);
735
736 LAST_DEVICE = CUR_DEVICE;
737 CUR_DEVICE = -1;
738}
739
740
741static void debug_delay_loop(void)
742{
743 double start, end, delay;
744
745 while (1) {
746 for (delay = 0.5; delay > 0.01; delay -= 0.01) {
747 start = wctime();
748 loop_for(delay, 0);
749 end = wctime();
750 printf("%6.4fs: looped for %10.8fs, delta=%11.8fs, error=%7.4f%%\n",
751 delay,
752 end - start,
753 end - start - delay,
754 100 * (end - start - delay) / delay);
755 }
756 }
757}
758
759static int job(double exec_time, double gpu_sec_time, double program_end)
760{
761 if (wctime() > program_end)
762 return 0;
763 else if (!GPU_TASK)
764 {
765 loop_for(exec_time, program_end + 1);
766 }
767 else
768 {
769 double cpu_bookend = (exec_time)/2.0;
770
771 loop_for(cpu_bookend, program_end + 1);
772 gpu_loop_for(gpu_sec_time, program_end + 1);
773 loop_for(cpu_bookend, program_end + 1);
774 }
775 return 1;
776}
777
778#define OPTSTR "p:ls:e:g:G:W:N:S:R:T:BMaLyC:rz:q"
779
780int main(int argc, char** argv)
781{
782 atexit(catchExit);
783
784 int ret;
785 lt_t wcet;
786 lt_t period;
787 double wcet_ms, period_ms;
788 int migrate = 0;
789 int cpu = 0;
790 int opt;
791 int test_loop = 0;
792// int column = 1;
793 const char *file = NULL;
794 int want_enforcement = 0;
795 double duration = 0, releaseTime = 0;
796 double *exec_times = NULL;
797 double scale = 1.0;
798 uint64_t cur_job;
799 uint64_t num_jobs;
800
801 int create_shm = -1;
802 int num_tasks = 0;
803
804 double gpu_sec_ms = 0;
805
806 while ((opt = getopt(argc, argv, OPTSTR)) != -1) {
807// printf("opt = %c optarg = %s\n", opt, optarg);
808 switch (opt) {
809// case 'w':
810// ENABLE_WAIT = 1;
811// break;
812 case 'p':
813 cpu = atoi(optarg);
814 migrate = 1;
815 break;
816 case 'l':
817 test_loop = 1;
818 break;
819 case 's':
820 scale = atof(optarg);
821 break;
822 case 'e':
823 gpu_sec_ms = atof(optarg);
824 break;
825// case 'x':
826// trans_sec_ms = atof(optarg);
827// break;
828 case 'z':
829 NUM_SIMULT_USERS = atoi(optarg);
830 break;
831 case 'q':
832 USE_PRIOQ = true;
833 break;
834 case 'g':
835 GPU_TASK = 1;
836 GPU_PARTITION_SIZE = atoi(optarg);
837 break;
838 case 'G':
839 GPU_PARTITION = atoi(optarg);
840 break;
841 case 'S':
842 SEND_SIZE = (size_t)(atof(optarg)*1024);
843 break;
844 case 'R':
845 RECV_SIZE = (size_t)(atof(optarg)*1024);
846 break;
847 case 'T':
848 STATE_SIZE = (size_t)(atof(optarg)*1024);
849 break;
850 case 'B':
851 BROADCAST_STATE = true;
852 break;
853 case 'M':
854 MIGRATE_VIA_SYSMEM = true;
855 break;
856 case 'a':
857 ENABLE_AFFINITY = true;
858 break;
859 case 'r':
860 RELAX_FIFO_MAX_LEN = true;
861 break;
862 case 'L':
863 USE_KFMLP = true;
864 break;
865 case 'y':
866 USE_DYNAMIC_GROUP_LOCKS = true;
867 break;
868 case 'C':
869 ENABLE_CHUNKING = true;
870 CHUNK_SIZE = (size_t)(atof(optarg)*1024);
871 break;
872 case 'W':
873 create_shm = atoi(optarg);
874 break;
875 case 'N':
876 num_tasks = atoi(optarg);
877 break;
878 case ':':
879 usage("Argument missing.");
880 break;
881 case '?':
882 default:
883 usage("Bad argument.");
884 break;
885 }
886 }
887
888 if (test_loop) {
889 debug_delay_loop();
890 return 0;
891 }
892
893// if (file) {
894// int num_jobs_tmp;
895// get_exec_times(file, column, &num_jobs_tmp, &exec_times);
896// num_jobs = num_jobs_tmp;
897//
898// if (argc - optind < 2)
899// usage("Arguments missing.");
900//
901// for (cur_job = 0; cur_job < num_jobs; ++cur_job) {
902// /* convert the execution time to seconds */
903// duration += exec_times[cur_job] * 0.001;
904// }
905// } else {
906 /*
907 * if we're not reading from the CSV file, then we need
908 * three parameters
909 */
910 if (argc - optind < 3)
911 usage("Arguments missing.");
912// }
913
914 wcet_ms = atof(argv[optind + 0]);
915 period_ms = atof(argv[optind + 1]);
916
917 wcet = wcet_ms * __NS_PER_MS;
918 period = period_ms * __NS_PER_MS;
919 if (wcet <= 0)
920 usage("The worst-case execution time must be a "
921 "positive number.");
922 if (period <= 0)
923 usage("The period must be a positive number.");
924 if (!file && wcet > period) {
925 usage("The worst-case execution time must not "
926 "exceed the period.");
927 }
928
929 if (!file)
930 {
931 duration = atof(argv[optind + 2]);
932 num_jobs = ((double)duration*1e3)/period_ms;
933 ++num_jobs; // padding
934 }
935 else if (file && num_jobs > 1)
936 {
937 duration += period_ms * 0.001 * (num_jobs - 1);
938 }
939
940 if (migrate) {
941 ret = be_migrate_to(cpu);
942 if (ret < 0)
943 bail_out("could not migrate to target partition");
944 }
945
946 if(ENABLE_WAIT)
947 {
948 if(num_tasks > 0)
949 {
950 printf("%d creating release shared memory\n", getpid());
951 shared_memory_object::remove("release_barrier_memory");
952 release_segment_ptr = new managed_shared_memory(create_only, "release_barrier_memory", 4*1024);
953
954 printf("%d creating release barrier for %d users\n", getpid(), num_tasks);
955 release_barrier = release_segment_ptr->construct<barrier>("barrier release_barrier")(num_tasks);
956
957 init_release_time = release_segment_ptr->construct<uint64_t>("uint64_t instance")();
958 *init_release_time = 0;
959 }
960 else
961 {
962 do
963 {
964 try
965 {
966 printf("%d opening release shared memory\n", getpid());
967 segment_ptr = new managed_shared_memory(open_only, "release_barrier_memory");
968 }
969 catch(...)
970 {
971 printf("%d shared memory not ready. sleeping\n", getpid());
972 sleep(1);
973 }
974 }while(segment_ptr == NULL);
975
976 release_barrier = segment_ptr->find<barrier>("barrier release_barrier").first;
977 init_release_time = segment_ptr->find<uint64_t>("uint64_t instance").first;
978 }
979 }
980
981
982 if(GPU_TASK)
983 {
984 if(ENABLE_WAIT)
985 {
986 if(create_shm > -1)
987 {
988 printf("%d creating shared memory\n", getpid());
989 shared_memory_object::remove("gpu_barrier_memory");
990 segment_ptr = new managed_shared_memory(create_only, "gpu_barrier_memory", 4*1024);
991
992 printf("%d creating a barrier for %d users\n", getpid(), create_shm);
993 gpu_barrier = segment_ptr->construct<barrier>("barrier instance")(create_shm);
994 printf("%d creating gpu mgmt mutexes for 8 devices\n", getpid());
995 gpu_mgmt_mutexes = segment_ptr->construct<interprocess_mutex>("interprocess_mutex m")[8]();
996 }
997 else
998 {
999 do
1000 {
1001 try
1002 {
1003 printf("%d opening shared memory\n", getpid());
1004 segment_ptr = new managed_shared_memory(open_only, "gpu_barrier_memory");
1005 }
1006 catch(...)
1007 {
1008 printf("%d shared memory not ready. sleeping\n", getpid());
1009 sleep(1);
1010 }
1011 }while(segment_ptr == NULL);
1012
1013 gpu_barrier = segment_ptr->find<barrier>("barrier instance").first;
1014 gpu_mgmt_mutexes = segment_ptr->find<interprocess_mutex>("interprocess_mutex m").first;
1015 }
1016 }
1017
1018 // scale data transmission too??
1019 SEND_SIZE *= scale;
1020 RECV_SIZE *= scale;
1021 STATE_SIZE *= scale;
1022
1023 init_cuda();
1024 }
1025
1026 ret = sporadic_task_ns(wcet, period, 0, cpu, RT_CLASS_SOFT,
1027 want_enforcement ? PRECISE_ENFORCEMENT
1028 : NO_ENFORCEMENT,
1029 migrate);
1030 if (ret < 0)
1031 bail_out("could not setup rt task params");
1032
1033 init_litmus();
1034
1035 ret = task_mode(LITMUS_RT_TASK);
1036 if (ret != 0)
1037 bail_out("could not become RT task");
1038
1039
1040
1041 uint64_t jobCount = 0;
1042 blitz::Array<uint64_t, 1> responseTimeLog(num_jobs+1);
1043
1044 struct timespec spec;
1045 uint64_t release;
1046 uint64_t finish;
1047
1048
1049 if (ENABLE_WAIT) {
1050 printf("Waiting for release.\n");
1051 ret = wait_for_ts_release();
1052 if (ret != 0)
1053 bail_out("wait_for_ts_release()");
1054 }
1055 else
1056 {
1057 sleep_next_period();
1058 }
1059
1060 clock_gettime(CLOCK_MONOTONIC, &spec);
1061 release = timespec_to_ns(spec);
1062 if (!__sync_bool_compare_and_swap(init_release_time, 0, release))
1063 {
1064 release = *init_release_time;
1065 }
1066
1067 releaseTime = wctime();
1068 double failsafeEnd = releaseTime + duration;
1069
1070
1071 if (file) {
1072 /* use times read from the CSV file */
1073 for (cur_job = 0; cur_job < num_jobs; ++cur_job) {
1074 /* convert job's length to seconds */
1075 job(exec_times[cur_job] * 0.001 * scale,
1076 gpu_sec_ms * 0.001 * scale,
1077 failsafeEnd);
1078 }
1079 } else {
1080 /* convert to seconds and scale */
1081 int keepGoing;
1082 do
1083 {
1084 keepGoing = job(wcet_ms * 0.001 * scale, gpu_sec_ms * 0.001 * scale, failsafeEnd);
1085
1086
1087 clock_gettime(CLOCK_MONOTONIC, &spec);
1088 finish = timespec_to_ns(spec);
1089
1090 responseTimeLog(min(num_jobs,jobCount++)) = finish - release;
1091
1092 // this is an estimated upper-bound on release time. it may be off by several microseconds.
1093#ifdef RESET_RELEASE_ON_MISS
1094 release = (release + period < finish) ?
1095 finish : /* missed deadline. adopt next release as current time. */
1096 release + period; /* some time in the future. */
1097#else
1098 release = release + period; // allow things to get progressively later.
1099#endif
1100
1101 sleep_next_period();
1102 clock_gettime(CLOCK_MONOTONIC, &spec);
1103 release = min(timespec_to_ns(spec), release);
1104
1105 } while(keepGoing);
1106 }
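	// (example of the accounting above: with period = 10 ms, release = 100 ms
	//  and finish = 115 ms, the job missed its deadline at 110 ms; under
	//  RESET_RELEASE_ON_MISS the next release is taken to be 115 ms rather
	//  than 110 ms, so later response times are measured against the late
	//  release instead of drifting ever further behind.)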
1107
1108 if(GPU_TASK && ENABLE_WAIT)
1109 {
1110 printf("%d waiting at barrier\n", getpid());
1111 gpu_barrier->wait();
1112 }
1113
1114 ret = task_mode(BACKGROUND_TASK);
1115 if (ret != 0)
1116 bail_out("could not become regular task (huh?)");
1117
1118 if (file)
1119 free(exec_times);
1120
1121 if(GPU_TASK)
1122 {
1123 /*
1124 if(ENABLE_WAIT)
1125 {
1126 // wait for all GPU using tasks ext RT mode.
1127 printf("%d waiting at barrier\n", getpid());
1128 gpu_barrier->wait();
1129 }
1130 */
1131
1132 exit_cuda();
1133
1134 if(ENABLE_WAIT)
1135 {
1136 /* wait before we clean up memory */
1137 printf("%d waiting for all to shutdown GPUs\n", getpid());
1138 gpu_barrier->wait();
1139
1140/*
1141 if(create_shm > -1)
1142 {
1143 printf("%d removing shared memory\n", getpid());
1144 shared_memory_object::remove("gpu_barrier_memory");
1145 }
1146*/
1147 }
1148 }
1149
1150
1151 if (ENABLE_WAIT)
1152 {
1153 printf("%d waiting at exit barrier\n", getpid());
1154 release_barrier->wait();
1155 }
1156
1157
1158 char gpu_using_str[] = "GPU\n";
1159 char cpu_only_str[] = "CPU\n";
1160 #define USED(arr) (arr)(Range(fromStart,min(num_jobs-1,jobCount-1)))
1161 // cpu, pid, period (ms), avg-rt, avg-rt (% of period), min-rt, max-rt, avg-slack, num-jobs, num-misses, num-misses (>2x period), task type
1162 printf("DONE,%d,%d,%f,%f,%f,%lu,%lu,%f,%lu,%d,%d,%s",
1163 cpu,
1164 getpid(),
1165 period_ms,
1166 // average
1167 blitz::mean(USED(responseTimeLog)),
1168 // average pct of period
1169 100.0*(blitz::mean(USED(responseTimeLog))/period),
1170 // min
1171 blitz::min(USED(responseTimeLog)),
1172 // max
1173 blitz::max(USED(responseTimeLog)),
1174 // average slack
1175 blitz::mean((uint64_t)period - USED(responseTimeLog)),
1176 // num jobs
1177 min(num_jobs-1,jobCount-1),
1178 // num misses
1179 blitz::count(USED(responseTimeLog) > (uint64_t)period),
1180 // num misses w/ unbounded
1181 blitz::count(USED(responseTimeLog) > (uint64_t)(2*period)),
1182 // flag gpu-using tasks
1183 ((GPU_TASK) ? gpu_using_str : cpu_only_str)
1184 );
1185
1186 return 0;
1187}
diff --git a/include/common.h b/include/common.h
index d1234ba..faf2c07 100644
--- a/include/common.h
+++ b/include/common.h
@@ -1,7 +1,14 @@
1#ifndef COMMON_H 1#ifndef COMMON_H
2#define COMMON_H 2#define COMMON_H
3 3
4#ifdef __cplusplus
5extern "C" {
6#endif
4 7
5void bail_out(const char* msg); 8void bail_out(const char* msg);
6 9
10#ifdef __cplusplus
11}
12#endif
13
7#endif 14#endif
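
The extern "C" guards added to include/common.h matter because the header is now included from C++ translation units (gpuspin.cu, budget.cpp) as well as the existing C tools, while bail_out() is presumably still defined with C linkage elsewhere in the tree. Without the guards, a C++ caller would reference the mangled symbol (_Z8bail_outPKc) and fail to link against the C object file. A minimal illustration follows; use.cpp is a hypothetical file, not part of the patch.

/* use.cpp -- compiled with g++ or nvcc; links against a bail_out() that was
 * compiled as C, because common.h now wraps the declaration in extern "C". */
#include "common.h"

int main()
{
	bail_out("gpuspin: fatal error");	/* resolves to the unmangled C symbol */
	return 0;
}
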