author	Glenn Elliott <gelliott@cs.unc.edu>	2013-01-10 17:48:39 -0500
committer	Glenn Elliott <gelliott@cs.unc.edu>	2013-01-10 17:48:39 -0500
commit	629486d62ae22c33251d3c367af3febff5fe1e28 (patch)
tree	ef78fc8235c61f8ba37d109ea04266b6ce49b804 /gpu
parent	1bf0f0094cd9671adfc07cf840bde67cd4cc0c38 (diff)
Clean up GPU test code placement.
Diffstat (limited to 'gpu')
-rw-r--r--	gpu/aux_threads.c	313
-rw-r--r--	gpu/dgl.c	251
-rw-r--r--	gpu/ikglptest.c	633
-rw-r--r--	gpu/locktest.c	206
-rw-r--r--	gpu/nested.c	245
-rw-r--r--	gpu/normal_task.c	84
-rw-r--r--	gpu/rtspin_fake_cuda.cpp	1169
7 files changed, 2901 insertions, 0 deletions
diff --git a/gpu/aux_threads.c b/gpu/aux_threads.c
new file mode 100644
index 0000000..6636f36
--- /dev/null
+++ b/gpu/aux_threads.c
@@ -0,0 +1,313 @@
1/* based_mt_task.c -- A basic multi-threaded real-time task skeleton.
2 *
3 * This (by itself useless) task demonstrates how to set up a multi-threaded LITMUS^RT
4 * real-time task. Familiarity with the single threaded example (base_task.c)
5 * is assumed.
6 *
7 * Currently, liblitmus still lacks automated support for real-time
8 * tasks, but internally it is thread-safe, and thus can be used together
9 * with pthreads.
10 */
11
12#include <stdio.h>
13#include <stdlib.h>
14#include <unistd.h>
15
16#include <fcntl.h>
17#include <sys/stat.h>
18#include <sys/time.h>
19#include <sys/resource.h>
20#include <time.h> /* for time(), used below */
21/* Include gettid() */
22#include <sys/types.h>
23
24/* Include threading support. */
25#include <pthread.h>
26
27/* Include the LITMUS^RT API.*/
28#include "litmus.h"
29
30//#define PERIOD 500
31#define PERIOD 10
32//#define EXEC_COST 10
33#define EXEC_COST 1
34
35int NUM_AUX_THREADS = 2;
36
37#define LITMUS_STATS_FILE "/proc/litmus/stats"
38
39/* The information passed to each thread. Could be anything. */
40struct thread_context {
41 int id;
42 struct timeval total_time;
43};
44
45/* The real-time thread program. Doesn't have to be the same for
46 * all threads. Here, we only have one that will invoke job().
47 */
48void* rt_thread(void *tcontext);
49void* aux_thread(void *tcontext);
50
51/* Declare the periodically invoked job.
52 * Returns 1 -> task should exit.
53 * 0 -> task should continue.
54 */
55int job(void);
56
57
58/* Catch errors.
59 */
60#define CALL( exp ) do { \
61 int ret; \
62 ret = exp; \
63 if (ret != 0) \
64 fprintf(stderr, "%s failed: %m\n", #exp);\
65 else \
66 fprintf(stderr, "%s ok.\n", #exp); \
67 } while (0)
68
69volatile int gRun = 1; /* cleared by main to stop aux threads */
70
71pthread_mutex_t gMutex = PTHREAD_MUTEX_INITIALIZER;
72pthread_barrier_t gBar;
73
74#define OPTSTR "t:fcb"
75
76int main(int argc, char** argv)
77{
78 int i;
79 struct thread_context *ctx;
80 pthread_t *task;
81
82 int opt;
83 int before = 0;
84 int aux_flags = 0;
85 int do_future = 0;
86
87 while ((opt = getopt(argc, argv, OPTSTR)) != -1) {
88 switch(opt)
89 {
90 case 't':
91 NUM_AUX_THREADS = atoi(optarg);
92 printf("%d aux threads\n", NUM_AUX_THREADS);
93 break;
94 case 'f':
95 aux_flags |= AUX_FUTURE;
96 do_future = 1;
97 break;
98 case 'c':
99 aux_flags |= AUX_CURRENT;
100 break;
101 case 'b':
102 before = 1;
103 printf("Will become real-time before spawning aux threads.\n");
104 break;
105 }
106 }
107
108 if (aux_flags == 0) {
109 printf("Must specify -c (AUX_CURRENT) and/or -f (AUX_FUTURE) for aux tasks.\n");
110 return -1;
111 }
112
113 ctx = calloc(NUM_AUX_THREADS, sizeof(struct thread_context));
114 task = calloc(NUM_AUX_THREADS, sizeof(pthread_t));
115
116 //lt_t delay = ms2lt(1000);
117
118 /*****
119 * 3) Initialize LITMUS^RT.
120 * Task parameters will be specified per thread.
121 */
122 init_litmus();
123
124 {
125 pthread_barrierattr_t battr;
126 pthread_barrierattr_init(&battr);
127 pthread_barrier_init(&gBar, &battr, (NUM_AUX_THREADS)+1);
128 }
129
130 if(before)
131 {
132 CALL( init_rt_thread() );
133 CALL( sporadic_global(EXEC_COST, PERIOD) );
134 CALL( task_mode(LITMUS_RT_TASK) );
135 }
136
137
138 if(do_future && before)
139 {
140 CALL( enable_aux_rt_tasks(aux_flags) );
141 }
142
143// printf("Red Leader is now real-time!\n");
144
145 for (i = 0; i < NUM_AUX_THREADS; i++) {
146 ctx[i].id = i;
147 pthread_create(task + i, NULL, aux_thread, (void *) (ctx + i));
148 }
149
150// pthread_barrier_wait(&gBar);
151
152// sleep(1);
153
154 if(!before)
155 {
156 CALL( init_rt_thread() );
157 CALL( sporadic_global(EXEC_COST, PERIOD) );
158 CALL( task_mode(LITMUS_RT_TASK) );
159 }
160
161 // secondary call *should* be harmless
162 CALL( enable_aux_rt_tasks(aux_flags) );
163
164 {
165 int last = time(0);
166// struct timespec sleeptime = {0, 1000}; // 1 microsecond
167// for(i = 0; i < 24000; ++i) {
168 for(i = 0; i < 2000; ++i) {
169 sleep_next_period();
170// printf("RED LEADER!\n");
171
172// nanosleep(&sleeptime, NULL);
173
174 pthread_mutex_lock(&gMutex);
175
176 if((i%(10000/PERIOD)) == 0) {
177 int now = time(0);
178				printf("heartbeat %d: %d\n", i, now - last);
179 last = now;
180 }
181
182 pthread_mutex_unlock(&gMutex);
183 }
184 }
185
186 CALL( disable_aux_rt_tasks(aux_flags) );
187 gRun = 0;
188
189 CALL( task_mode(BACKGROUND_TASK) );
190
191 /*****
192 * 5) Wait for RT threads to terminate.
193 */
194 for (i = 0; i < NUM_AUX_THREADS; i++) {
195 if (task[i] != 0) {
196 float time;
197 pthread_join(task[i], NULL);
198 time = ctx[i].total_time.tv_sec + ctx[i].total_time.tv_usec / (float)(1e6);
199 printf("child %d: %fs\n", i, time);
200 }
201 }
202
203
204 /*****
205 * 6) Clean up, maybe print results and stats, and exit.
206 */
207 return 0;
208}
209
210
211
212/* A real-time thread is very similar to the main function of a single-threaded
213 * real-time app. Note that init_rt_thread() is called to initialize the
214 * per-thread data structures of the LITMUS^RT user-space library.
215 */
216void* aux_thread(void *tcontext)
217{
218 struct thread_context *ctx = (struct thread_context *) tcontext;
219 int count = 0;
220
221// pthread_barrier_wait(&gBar);
222
223 while(gRun)
224 {
225 if(count++ % 100000 == 0) {
226 pthread_mutex_lock(&gMutex);
227 pthread_mutex_unlock(&gMutex);
228 }
229 }
230
231 {
232 struct rusage use;
233 long int sec;
234
235 getrusage(RUSAGE_THREAD, &use);
236
237 ctx->total_time.tv_usec = use.ru_utime.tv_usec + use.ru_stime.tv_usec;
238 sec = ctx->total_time.tv_usec / (long int)(1e6);
239 ctx->total_time.tv_usec = ctx->total_time.tv_usec % (long int)(1e6);
240 ctx->total_time.tv_sec = use.ru_utime.tv_sec + use.ru_stime.tv_sec + sec;
241 }
242
243 return ctx;
244}
245
246
247/* A real-time thread is very similar to the main function of a single-threaded
248 * real-time app. Note that init_rt_thread() is called to initialize the
249 * per-thread data structures of the LITMUS^RT user-space library.
250 */
251void* rt_thread(void *tcontext)
252{
253 struct thread_context *ctx = (struct thread_context *) tcontext;
254
255 /* Make presence visible. */
256 printf("RT Thread %d active.\n", ctx->id);
257
258 /*****
259 * 1) Initialize real-time settings.
260 */
261 CALL( init_rt_thread() );
262 CALL( sporadic_global(EXEC_COST, PERIOD + ctx->id * 10) );
263
264
265 /*****
266 * 2) Transition to real-time mode.
267 */
268 CALL( task_mode(LITMUS_RT_TASK) );
269
270
271
272 wait_for_ts_release();
273
274 /* The task is now executing as a real-time task if the call didn't fail.
275 */
276
277
278
279 /*****
280 * 3) Invoke real-time jobs.
281 */
282 while(gRun) {
283 /* Wait until the next job is released. */
284 sleep_next_period();
285 printf("%d: task.\n", ctx->id);
286 }
287
288 /*****
289 * 4) Transition to background mode.
290 */
291 CALL( task_mode(BACKGROUND_TASK) );
292
293 {
294 struct rusage use;
295 long int sec;
296
297 getrusage(RUSAGE_THREAD, &use);
298 ctx->total_time.tv_usec = use.ru_utime.tv_usec + use.ru_stime.tv_usec;
299 sec = ctx->total_time.tv_usec / (long int)(1e6);
300 ctx->total_time.tv_usec = ctx->total_time.tv_usec % (long int)(1e6);
301 ctx->total_time.tv_sec = use.ru_utime.tv_sec + use.ru_stime.tv_sec + sec;
302 }
303
304 return ctx;
305}
306
307int job(void)
308{
309 /* Do real-time calculation. */
310
311 /* Don't exit. */
312 return 0;
313}
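
For readers without base_task.c at hand (the header comment assumes familiarity with it), here is a minimal single-threaded sketch of the same lifecycle, restricted to liblitmus calls that already appear above; the cost and period values are illustrative:

/* base_task-style sketch; EXEC_COST/PERIOD values are illustrative */
#include <stdio.h>
#include "litmus.h"

#define EXEC_COST 1
#define PERIOD 10

int main(void)
{
	int i;
	init_litmus();                      /* process-wide setup */
	init_rt_thread();                   /* per-thread setup */
	sporadic_global(EXEC_COST, PERIOD); /* declare sporadic task parameters */
	task_mode(LITMUS_RT_TASK);          /* enter real-time mode */
	for (i = 0; i < 10; i++)
		sleep_next_period();        /* one job per period */
	task_mode(BACKGROUND_TASK);         /* return to best-effort scheduling */
	return 0;
}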
diff --git a/gpu/dgl.c b/gpu/dgl.c
new file mode 100644
index 0000000..a045879
--- /dev/null
+++ b/gpu/dgl.c
@@ -0,0 +1,251 @@
1#include <stdio.h>
2#include <stdlib.h>
3#include <stdint.h>
4#include <unistd.h>
5#include <assert.h>
6#include <errno.h>
7#include <sys/types.h>
8#include <sys/stat.h>
9#include <fcntl.h>
10
11/* Include gettid() */
12#include <sys/types.h>
13
14/* Include threading support. */
15#include <pthread.h>
16
17/* Include the LITMUS^RT API.*/
18#include "litmus.h"
19
20/* Catch errors.
21 */
22#define CALL( exp ) do { \
23 int ret; \
24 ret = exp; \
25 if (ret != 0) \
26 fprintf(stderr, "%s failed: %m\n", #exp);\
27 else \
28 fprintf(stderr, "%s ok.\n", #exp); \
29 } while (0)
30
31#define TH_CALL( exp ) do { \
32 int ret; \
33 ret = exp; \
34 if (ret != 0) \
35 fprintf(stderr, "[%d] %s failed: %m\n", ctx->id, #exp); \
36 else \
37 fprintf(stderr, "[%d] %s ok.\n", ctx->id, #exp); \
38 } while (0)
39
40#define TH_SAFE_CALL( exp ) do { \
41 int ret; \
42 fprintf(stderr, "[%d] calling %s...\n", ctx->id, #exp); \
43 ret = exp; \
44 if (ret != 0) \
45 fprintf(stderr, "\t...[%d] %s failed: %m\n", ctx->id, #exp); \
46 else \
47 fprintf(stderr, "\t...[%d] %s ok.\n", ctx->id, #exp); \
48 } while (0)
49
50
51/* these are only default values */
52int NUM_THREADS=3;
53int NUM_SEMS=1;
54int NUM_REPLICAS=1;
55int NEST_DEPTH=1;
56
57int SLEEP_BETWEEN_JOBS = 1;
58
59#define MAX_SEMS 1000
60#define MAX_NEST_DEPTH 10
61
62
63// 1000 = 1us
64#define EXEC_COST 1000*1
65#define PERIOD 1000*10
66
67/* The information passed to each thread. Could be anything. */
68struct thread_context {
69 int id;
70 int fd;
71 int ikglp;
72 int od[MAX_SEMS];
73 int count;
74 unsigned int rand;
75};
76
77void* rt_thread(void* _ctx);
78int nested_job(struct thread_context* ctx, int *count, int *next);
79int job(struct thread_context*);
80
81#define OPTSTR "t:k:s:d:f"
82
83int main(int argc, char** argv)
84{
85 int i;
86 struct thread_context* ctx;
87 pthread_t* task;
88 int fd;
89
90 int opt;
91 while((opt = getopt(argc, argv, OPTSTR)) != -1) {
92 switch(opt) {
93 case 't':
94 NUM_THREADS = atoi(optarg);
95 break;
96 case 'k':
97 NUM_REPLICAS = atoi(optarg);
98 assert(NUM_REPLICAS > 0);
99 break;
100 case 's':
101 NUM_SEMS = atoi(optarg);
102 assert(NUM_SEMS >= 0 && NUM_SEMS <= MAX_SEMS);
103 break;
104 case 'd':
105 NEST_DEPTH = atoi(optarg);
106 assert(NEST_DEPTH >= 1 && NEST_DEPTH <= MAX_NEST_DEPTH);
107 break;
108 case 'f':
109 SLEEP_BETWEEN_JOBS = 0;
110 break;
111 default:
112 fprintf(stderr, "Unknown option: %c\n", opt);
113 exit(-1);
114 break;
115 }
116 }
117
118 ctx = (struct thread_context*) calloc(NUM_THREADS, sizeof(struct thread_context));
119 task = (pthread_t*) calloc(NUM_THREADS, sizeof(pthread_t));
120
121 srand(0); /* something repeatable for now */
122
123 fd = open("semaphores", O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
124
125 CALL( init_litmus() );
126
127 for (i = 0; i < NUM_THREADS; i++) {
128 ctx[i].id = i;
129 ctx[i].fd = fd;
130 ctx[i].rand = rand();
131 CALL( pthread_create(task + i, NULL, rt_thread, ctx + i) );
132 }
133
134
135 for (i = 0; i < NUM_THREADS; i++)
136 pthread_join(task[i], NULL);
137
138
139 return 0;
140}
141
142void* rt_thread(void* _ctx)
143{
144 int i;
145 int do_exit = 0;
146
147 struct thread_context *ctx = (struct thread_context*)_ctx;
148
149 TH_CALL( init_rt_thread() );
150
151 /* Vary period a little bit. */
152 TH_CALL( sporadic_task_ns(EXEC_COST, PERIOD + 10*ctx->id, 0, 0, LITMUS_LOWEST_PRIORITY,
153 RT_CLASS_SOFT, NO_ENFORCEMENT, NO_SIGNALS, 0) );
154
155 ctx->ikglp = open_ikglp_sem(ctx->fd, 0, (void*)&NUM_REPLICAS);
156 if(ctx->ikglp < 0)
157 perror("open_ikglp_sem");
158 else
159 printf("ikglp od = %d\n", ctx->ikglp);
160
161 for (i = 0; i < NUM_SEMS; i++) {
162 ctx->od[i] = open_rsm_sem(ctx->fd, i+1);
163 if(ctx->od[i] < 0)
164 perror("open_rsm_sem");
165 else
166 printf("rsm[%d] od = %d\n", i, ctx->od[i]);
167 }
168
169 TH_CALL( task_mode(LITMUS_RT_TASK) );
170
171
172	printf("[%d] Waiting for TS release.\n", ctx->id);
173 wait_for_ts_release();
174 ctx->count = 0;
175
176 do {
177 int replica = -1;
178 int first = (int)(NUM_SEMS * (rand_r(&(ctx->rand)) / (RAND_MAX + 1.0)));
179 int last = (first + NEST_DEPTH - 1 >= NUM_SEMS) ? NUM_SEMS - 1 : first + NEST_DEPTH - 1;
180 int dgl_size = last - first + 1;
181 int dgl[dgl_size];
182
183 // construct the DGL
184 for(i = first; i <= last; ++i) {
185 dgl[i-first] = ctx->od[i];
186 }
187
188
189 replica = litmus_lock(ctx->ikglp);
190 printf("[%d] got ikglp replica %d.\n", ctx->id, replica);
191 fflush(stdout);
192
193
194 litmus_dgl_lock(dgl, dgl_size);
195 printf("[%d] acquired dgl.\n", ctx->id);
196 fflush(stdout);
197
198
199 do_exit = job(ctx);
200
201
202 printf("[%d] unlocking dgl.\n", ctx->id);
203 fflush(stdout);
204 litmus_dgl_unlock(dgl, dgl_size);
205
206
207 printf("[%d]: freeing ikglp replica %d.\n", ctx->id, replica);
208 fflush(stdout);
209 litmus_unlock(ctx->ikglp);
210
211 if(SLEEP_BETWEEN_JOBS && !do_exit) {
212 sleep_next_period();
213 }
214 } while(!do_exit);
215
216 /*****
217 * 4) Transition to background mode.
218 */
219 TH_CALL( task_mode(BACKGROUND_TASK) );
220
221
222 return NULL;
223}
224
225void dirty_kb(int kb)
226{
227 int32_t one_kb[256];
228 int32_t sum = 0;
229 int32_t i;
230
231 for (i = 0; i < 256; i++)
232 sum += one_kb[i];
233 kb--;
234 /* prevent tail recursion */
235 if (kb)
236 dirty_kb(kb);
237 for (i = 0; i < 256; i++)
238 sum += one_kb[i];
239}
240
241int job(struct thread_context* ctx)
242{
243 /* Do real-time calculation. */
244 dirty_kb(8);
245
246 /* Don't exit. */
247 //return ctx->count++ > 100;
248 //return ctx->count++ > 12000;
249 //return ctx->count++ > 120000;
250 return ctx->count++ > 50000; // controls number of jobs per task
251}
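
The locking protocol in rt_thread() above, reduced to its essential shape. A sketch only: ikglp_od, od_a, and od_b stand for descriptors obtained from open_ikglp_sem()/open_rsm_sem() as in the code above.

/* sketch: IKGLP replica + dynamic group lock, as used by rt_thread() */
static void locked_section(int ikglp_od, int od_a, int od_b)
{
	int dgl[2] = { od_a, od_b };
	int replica = litmus_lock(ikglp_od); /* k-exclusion: grants one replica */

	litmus_dgl_lock(dgl, 2);             /* acquire the whole group atomically */
	/* ... critical section using the replica and both resources ... */
	litmus_dgl_unlock(dgl, 2);

	litmus_unlock(ikglp_od);             /* return the replica */
	(void)replica; /* the replica index selects the resource instance */
}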
diff --git a/gpu/ikglptest.c b/gpu/ikglptest.c
new file mode 100644
index 0000000..5f566d5
--- /dev/null
+++ b/gpu/ikglptest.c
@@ -0,0 +1,633 @@
1#include <stdio.h>
2#include <stdlib.h>
3#include <string.h>
4#include <stdint.h>
5#include <unistd.h>
6#include <assert.h>
7#include <errno.h>
8#include <sys/types.h>
9#include <sys/stat.h>
10#include <fcntl.h>
11#include <time.h>
12#include <math.h>
13
14/* Include gettid() */
15#include <sys/types.h>
16
17/* Include threading support. */
18#include <pthread.h>
19
20/* Include the LITMUS^RT API.*/
21#include "litmus.h"
22
23/* Catch errors.
24 */
25#if 1
26#define CALL( exp ) do { \
27 int ret; \
28 ret = exp; \
29 if (ret != 0) \
30 fprintf(stderr, "%s failed: %m\n", #exp);\
31 else \
32 fprintf(stderr, "%s ok.\n", #exp); \
33 } while (0)
34
35#define TH_CALL( exp ) do { \
36 int ret; \
37 ret = exp; \
38 if (ret != 0) \
39 fprintf(stderr, "[%d] %s failed: %m\n", ctx->id, #exp); \
40 else \
41 fprintf(stderr, "[%d] %s ok.\n", ctx->id, #exp); \
42 } while (0)
43
44#define TH_SAFE_CALL( exp ) do { \
45 int ret; \
46 fprintf(stderr, "[%d] calling %s...\n", ctx->id, #exp); \
47 ret = exp; \
48 if (ret != 0) \
49 fprintf(stderr, "\t...[%d] %s failed: %m\n", ctx->id, #exp); \
50 else \
51 fprintf(stderr, "\t...[%d] %s ok.\n", ctx->id, #exp); \
52 } while (0)
53#else
54#define CALL( exp )
55#define TH_CALL( exp )
56#define TH_SAFE_CALL( exp )
57#endif
58
59/* these are only default values */
60int NUM_THREADS=3;
61int NUM_AUX_THREADS=0;
62int NUM_SEMS=1;
63int NUM_GPUS=1;
64int GPU_OFFSET=0;
65int NUM_SIMULT_USERS = 1;
66int ENABLE_AFFINITY = 0;
67int NEST_DEPTH=1;
68int USE_KFMLP = 0;
69int RELAX_FIFO_MAX_LEN = 0;
70int USE_DYNAMIC_GROUP_LOCKS = 0;
71
72int SLEEP_BETWEEN_JOBS = 1;
73
74int gAuxRun = 1;
75pthread_mutex_t gMutex = PTHREAD_MUTEX_INITIALIZER;
76
77#define MAX_SEMS 1000
78
79// 1000 = 1us
80#define EXEC_COST 1000*1
81#define PERIOD 2*1000*100
82
83/* The information passed to each thread. Could be anything. */
84struct thread_context {
85 int id;
86 int fd;
87 int kexclu;
88 int od[MAX_SEMS];
89 int count;
90 unsigned int rand;
91 int mig_count[5];
92};
93
94void* rt_thread(void* _ctx);
95void* aux_thread(void* _ctx);
96int nested_job(struct thread_context* ctx, int *count, int *next, int runfactor);
97int job(struct thread_context* ctx, int runfactor);
98
99
100struct avg_info
101{
102 float avg;
103 float stdev;
104};
105
106struct avg_info feedback(int _a, int _b)
107{
108 fp_t a = _frac(_a, 10000);
109 fp_t b = _frac(_b, 10000);
110 int i;
111
112 fp_t actual_fp;
113
114 fp_t _est, _err;
115
116 int base = 1000000;
117 //int range = 40;
118
119 fp_t est = _integer_to_fp(base);
120 fp_t err = _fp(base/2);
121
122#define NUM_SAMPLES 10000
123
124 float samples[NUM_SAMPLES] = {0.0};
125 float accu_abs, accu;
126 float avg;
127 float devsum;
128 float stdev;
129 struct avg_info ret;
130
131 for(i = 0; i < NUM_SAMPLES; ++i) {
132 int num = ((rand()%40)*(rand()%2 ? -1 : 1)/100.0)*base + base;
133 float rel_err;
134
135 actual_fp = _integer_to_fp(num);
136
137// printf("Before: est = %d\terr = %d\n", (int)_fp_to_integer(est), (int)_fp_to_integer(err));
138
139 _err = _sub(actual_fp, est);
140 _est = _add(_mul(a, _err), _mul(b, err));
141
142 rel_err = _fp_to_integer(_mul(_div(_err, est), _integer_to_fp(10000)))/10000.0;
143 rel_err *= 100.0;
144 //printf("%6.2f\n", rel_err);
145 samples[i] = rel_err;
146
147 est = _est;
148 err = _add(err, _err);
149
150 if((int)_fp_to_integer(est) <= 0) {
151 est = actual_fp;
152 err = _div(actual_fp, _integer_to_fp(2));
153 }
154
155 //printf("After: est = %d\terr = %d\n", (int)_fp_to_integer(est), (int)_fp_to_integer(err));
156 }
157
158 accu_abs = 0.0;
159 accu = 0.0;
160 for(i = 0; i < NUM_SAMPLES; ++i) {
161 accu += samples[i];
162		accu_abs += fabsf(samples[i]); /* fabsf: plain abs() truncates the float to int */
163 }
164
165 avg = accu_abs/NUM_SAMPLES;
166 devsum = 0;
167 for(i = 0; i < NUM_SAMPLES; ++i) {
168 float dev = samples[i] - avg;
169 dev *= dev;
170 devsum += dev;
171 }
172
173 stdev = sqrtf(devsum/(NUM_SAMPLES-1));
174
175 ret.avg = avg;
176 ret.stdev = stdev;
177
178 //printf("AVG: %6.2f\tw/ neg: %6.2f\n", accu_abs/NUM_SAMPLES, accu/NUM_SAMPLES);
179
180 //return (accu_abs/NUM_SAMPLES);
181 return(ret);
182}
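/* For reference, feedback() above simulates the fixed-point estimator
 *
 *     err_k     = actual_k - est_k
 *     est_{k+1} = a * err_k + b * acc_k
 *     acc_{k+1} = acc_k + err_k
 *
 * on samples drawn within roughly +/-40% of `base`, and reports the mean
 * and standard deviation of the relative estimation error (in percent).
 */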
183
184
185
186#define OPTSTR "t:k:o:z:s:d:lfaryA:"
187
188int main(int argc, char** argv)
189{
190 int i;
191 struct thread_context* ctx;
192 struct thread_context* aux_ctx;
193 pthread_t* task;
194 pthread_t* aux_task;
195 int fd;
196
197 int opt;
198 while((opt = getopt(argc, argv, OPTSTR)) != -1) {
199 switch(opt) {
200 case 't':
201 NUM_THREADS = atoi(optarg);
202 break;
203 case 'A':
204 NUM_AUX_THREADS = atoi(optarg);
205 break;
206 case 'k':
207 NUM_GPUS = atoi(optarg);
208 assert(NUM_GPUS > 0);
209 break;
210 case 'z':
211 NUM_SIMULT_USERS = atoi(optarg);
212 assert(NUM_SIMULT_USERS > 0);
213 break;
214 case 'o':
215 GPU_OFFSET = atoi(optarg);
216 assert(GPU_OFFSET >= 0);
217 break;
218 case 's':
219 NUM_SEMS = atoi(optarg);
220 assert(NUM_SEMS >= 0 && NUM_SEMS < MAX_SEMS);
221 break;
222 case 'd':
223 NEST_DEPTH = atoi(optarg);
224 assert(NEST_DEPTH >= 0);
225 break;
226 case 'f':
227 SLEEP_BETWEEN_JOBS = 0;
228 break;
229 case 'a':
230 ENABLE_AFFINITY = 1;
231 break;
232 case 'l':
233 USE_KFMLP = 1;
234 break;
235 case 'y':
236 USE_DYNAMIC_GROUP_LOCKS = 1;
237 break;
238 case 'r':
239 RELAX_FIFO_MAX_LEN = 1;
240 break;
241 default:
242 fprintf(stderr, "Unknown option: %c\n", opt);
243 exit(-1);
244 break;
245 }
246 }
247
248#if 0
249 int best_a = 0, best_b = 0;
250 int first = 1;
251 int TRIALS = 15;
252
253 int a, b, t;
254
255 struct avg_info best = {0.0,0.0}, second_best;
256
257 int second_best_a, second_best_b;
258
259 srand(time(0));
260
261 int step = 50;
262
263 for(b = 2000; b < 5000; b += step) {
264 for(a = 1500; a < b; a += (step/4)) {
265 float std_accum = 0;
266 float avg_accum = 0;
267 for(t = 0; t < TRIALS; ++t) {
268 struct avg_info temp;
269 temp = feedback(a, b);
270 std_accum += temp.stdev;
271 avg_accum += temp.avg;
272 }
273
274 float avg_std = std_accum / TRIALS;
275
276 if(first || avg_std < best.stdev) {
277 second_best_a = best_a;
278 second_best_b = best_b;
279 second_best = best;
280
281 best.stdev = avg_std;
282 best.avg = avg_accum / TRIALS;
283 best_a = a;
284 best_b = b;
285
286 first = 0;
287 }
288 }
289 }
290
291 printf("Best:\ta = %d\tb = %d\t(b-a) = %d\tavg = %6.2f\tstdev = %6.2f\n", best_a, best_b, best_b - best_a, best.avg, best.stdev);
292 printf("2nd:\ta = %d\tb = %d\t(b-a) = %d\tavg = %6.2f\tstdev = %6.2f\n", second_best_a, second_best_b, second_best_b - second_best_a, second_best.avg, second_best.stdev);
293
294
295 a = 14008;
296 b = 16024;
297 float std_accum = 0;
298 float avg_accum = 0;
299 for(t = 0; t < TRIALS; ++t) {
300 struct avg_info temp;
301 temp = feedback(a, b);
302 std_accum += temp.stdev;
303 avg_accum += temp.avg;
304 }
305
306 printf("Aaron:\tavg = %6.2f\tstd = %6.2f\n", avg_accum/TRIALS, std_accum/TRIALS);
307
308
309
310
311 return 0;
312#endif
313
314
315
316
317 ctx = (struct thread_context*) calloc(NUM_THREADS, sizeof(struct thread_context));
318 task = (pthread_t*) calloc(NUM_THREADS, sizeof(pthread_t));
319
320 if (NUM_AUX_THREADS) {
321 aux_ctx = (struct thread_context*) calloc(NUM_AUX_THREADS, sizeof(struct thread_context));
322 aux_task = (pthread_t*) calloc(NUM_AUX_THREADS, sizeof(pthread_t));
323 }
324
325 srand(0); /* something repeatable for now */
326
327 fd = open("semaphores", O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
328
329 CALL( init_litmus() );
330
331 for (i = 0; i < NUM_AUX_THREADS; i++) {
332 aux_ctx[i].id = i;
333		CALL( pthread_create(aux_task + i, NULL, aux_thread, aux_ctx + i) ); /* aux_ctx, not ctx */
334 }
335
336 for (i = 0; i < NUM_THREADS; i++) {
337 ctx[i].id = i;
338 ctx[i].fd = fd;
339 ctx[i].rand = rand();
340 memset(&ctx[i].mig_count, 0, sizeof(ctx[i].mig_count));
341 CALL( pthread_create(task + i, NULL, rt_thread, ctx + i) );
342 }
343
344 if (NUM_AUX_THREADS) {
345 TH_CALL( init_rt_thread() );
346 TH_CALL( sporadic_task_ns(EXEC_COST, PERIOD + 10*NUM_THREADS+1, 0, 0,
347 LITMUS_LOWEST_PRIORITY, RT_CLASS_SOFT, NO_ENFORCEMENT, NO_SIGNALS, 1) );
348 TH_CALL( task_mode(LITMUS_RT_TASK) );
349
350		printf("[MASTER] Waiting for TS release.\n");
351 wait_for_ts_release();
352
353 CALL( enable_aux_rt_tasks(AUX_CURRENT) );
354
355 for(i = 0; i < 25000; ++i) {
356 sleep_next_period();
357 pthread_mutex_lock(&gMutex);
358 pthread_mutex_unlock(&gMutex);
359 }
360
361 CALL( disable_aux_rt_tasks(AUX_CURRENT) );
362 __sync_synchronize();
363 gAuxRun = 0;
364 __sync_synchronize();
365
366 for (i = 0; i < NUM_AUX_THREADS; i++)
367 pthread_join(aux_task[i], NULL);
368
369 TH_CALL( task_mode(BACKGROUND_TASK) );
370 }
371
372 for (i = 0; i < NUM_THREADS; i++)
373 pthread_join(task[i], NULL);
374
375 return 0;
376}
377
378int affinity_cost[] = {1, 4, 8, 16};
379
380int affinity_distance(struct thread_context* ctx, int a, int b)
381{
382 int i;
383 int dist;
384
385 if(a >= 0 && b >= 0) {
386 for(i = 0; i <= 3; ++i) {
387 if(a>>i == b>>i) {
388 dist = i;
389 goto out;
390 }
391 }
392 dist = 0; // hopefully never reached.
393 }
394 else {
395 dist = 0;
396 }
397
398out:
399 //printf("[%d]: distance: %d -> %d = %d\n", ctx->id, a, b, dist);
400
401 ++(ctx->mig_count[dist]);
402
403 return dist;
404
405// int groups[] = {2, 4, 8};
406// int i;
407//
408// if(a < 0 || b < 0)
409// return (sizeof(groups)/sizeof(groups[0])); // worst affinity
410//
411// // no migration
412// if(a == b)
413// return 0;
414//
415// for(i = 0; i < sizeof(groups)/sizeof(groups[0]); ++i) {
416// if(a/groups[i] == b/groups[i])
417// return (i+1);
418// }
419// assert(0);
420// return -1;
421}
422
423
424void* aux_thread(void* _ctx)
425{
426 struct thread_context *ctx = (struct thread_context*)_ctx;
427
428 while (gAuxRun) {
429 pthread_mutex_lock(&gMutex);
430 pthread_mutex_unlock(&gMutex);
431 }
432
433 return ctx;
434}
435
436void* rt_thread(void* _ctx)
437{
438 int i;
439 int do_exit = 0;
440 int last_replica = -1;
441
442 struct thread_context *ctx = (struct thread_context*)_ctx;
443
444 TH_CALL( init_rt_thread() );
445
446 /* Vary period a little bit. */
447 TH_CALL( sporadic_task_ns(EXEC_COST, PERIOD + 10*ctx->id, 0, 0,
448 LITMUS_LOWEST_PRIORITY, RT_CLASS_SOFT, NO_ENFORCEMENT, NO_SIGNALS, 1) );
449
450 if(USE_KFMLP) {
451 ctx->kexclu = open_kfmlp_gpu_sem(ctx->fd,
452 0, /* name */
453 NUM_GPUS,
454 GPU_OFFSET,
455 NUM_SIMULT_USERS,
456 ENABLE_AFFINITY
457 );
458 }
459 else {
460// ctx->kexclu = open_ikglp_sem(ctx->fd, 0, &NUM_GPUS);
461 ctx->kexclu = open_ikglp_gpu_sem(ctx->fd,
462 0, /* name */
463 NUM_GPUS,
464 GPU_OFFSET,
465 NUM_SIMULT_USERS,
466 ENABLE_AFFINITY,
467 RELAX_FIFO_MAX_LEN
468 );
469 }
470 if(ctx->kexclu < 0)
471 perror("open_kexclu_sem");
472 else
473 printf("kexclu od = %d\n", ctx->kexclu);
474
475 for (i = 0; i < NUM_SEMS; ++i) {
476 ctx->od[i] = open_rsm_sem(ctx->fd, i + ctx->kexclu + 2);
477 if(ctx->od[i] < 0)
478 perror("open_rsm_sem");
479 else
480 printf("rsm[%d] od = %d\n", i, ctx->od[i]);
481 }
482
483 TH_CALL( task_mode(LITMUS_RT_TASK) );
484
485	printf("[%d] Waiting for TS release.\n", ctx->id);
486 wait_for_ts_release();
487 ctx->count = 0;
488
489// if (ctx->id == 0 && NUM_AUX_THREADS) {
490// CALL( enable_aux_rt_tasks() );
491// }
492
493 do {
494 int first = (int)(NUM_SEMS * (rand_r(&(ctx->rand)) / (RAND_MAX + 1.0)));
495 int last = (first + NEST_DEPTH - 1 >= NUM_SEMS) ? NUM_SEMS - 1 : first + NEST_DEPTH - 1;
496 int dgl_size = last - first + 1;
497 int replica = -1;
498 int distance;
499
500 int dgl[dgl_size];
501
502 // construct the DGL
503 for(i = first; i <= last; ++i) {
504 dgl[i-first] = ctx->od[i];
505 }
506
507 replica = litmus_lock(ctx->kexclu);
508
509 //printf("[%d] got kexclu replica %d.\n", ctx->id, replica);
510 //fflush(stdout);
511
512 distance = affinity_distance(ctx, replica, last_replica);
513
514 if(USE_DYNAMIC_GROUP_LOCKS) {
515 litmus_dgl_lock(dgl, dgl_size);
516 }
517 else {
518 for(i = 0; i < dgl_size; ++i) {
519 litmus_lock(dgl[i]);
520 }
521 }
522
523 //do_exit = nested_job(ctx, &count, &first, affinity_cost[distance]);
524 do_exit = job(ctx, affinity_cost[distance]);
525
526 if(USE_DYNAMIC_GROUP_LOCKS) {
527 litmus_dgl_unlock(dgl, dgl_size);
528 }
529 else {
530 for(i = dgl_size - 1; i >= 0; --i) {
531 litmus_unlock(dgl[i]);
532 }
533 }
534
535 //printf("[%d]: freeing kexclu replica %d.\n", ctx->id, replica);
536 //fflush(stdout);
537
538 litmus_unlock(ctx->kexclu);
539
540 last_replica = replica;
541
542 if(SLEEP_BETWEEN_JOBS && !do_exit) {
543 sleep_next_period();
544 }
545 } while(!do_exit);
546
547// if (ctx->id == 0 && NUM_AUX_THREADS) {
548// gAuxRun = 0;
549// __sync_synchronize();
550// CALL( disable_aux_rt_tasks() );
551// }
552
553 /*****
554 * 4) Transition to background mode.
555 */
556 TH_CALL( task_mode(BACKGROUND_TASK) );
557
558 for(i = 0; i < sizeof(ctx->mig_count)/sizeof(ctx->mig_count[0]); ++i)
559 {
560 printf("[%d]: mig_count[%d] = %d\n", ctx->id, i, ctx->mig_count[i]);
561 }
562
563 return NULL;
564}
565
566//int nested_job(struct thread_context* ctx, int *count, int *next, int runfactor)
567//{
568// int ret;
569//
570// if(*count == 0 || *next == NUM_SEMS)
571// {
572// ret = job(ctx, runfactor);
573// }
574// else
575// {
576// int which_sem = *next;
577// int rsm_od = ctx->od[which_sem];
578//
579// ++(*next);
580// --(*count);
581//
582// //printf("[%d]: trying to get semaphore %d.\n", ctx->id, which_sem);
583// //fflush(stdout);
584// litmus_lock(rsm_od);
585//
586// //printf("[%d] got semaphore %d.\n", ctx->id, which_sem);
587// //fflush(stdout);
588// ret = nested_job(ctx, count, next, runfactor);
589//
590// //printf("[%d]: freeing semaphore %d.\n", ctx->id, which_sem);
591// //fflush(stdout);
592// litmus_unlock(rsm_od);
593// }
594//
595//return(ret);
596//}
597
598
599void dirty_kb(int kb)
600{
601 int32_t one_kb[256];
602 int32_t sum = 0;
603 int32_t i;
604
605 if(!kb)
606 return;
607
608 for (i = 0; i < 256; i++)
609 sum += one_kb[i];
610 kb--;
611 /* prevent tail recursion */
612 if (kb)
613 dirty_kb(kb);
614 for (i = 0; i < 256; i++)
615 sum += one_kb[i];
616}
617
618int job(struct thread_context* ctx, int runfactor)
619{
620 //struct timespec tosleep = {0, 100000}; // 0.1 ms
621
622 //printf("[%d]: runfactor = %d\n", ctx->id, runfactor);
623
624 //dirty_kb(8 * runfactor);
625 dirty_kb(1 * runfactor);
626 //nanosleep(&tosleep, NULL);
627
628 /* Don't exit. */
629 //return ctx->count++ > 100;
630 //return ctx->count++ > 12000;
631 //return ctx->count++ > 120000;
632 return ctx->count++ > 25000; // controls number of jobs per task
633}
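
A small self-contained check of the distance rule used by affinity_distance() above: the length of the shared high-order-bit prefix of two GPU indices determines the migration cost, and smaller is closer. The values below are illustrative.

#include <assert.h>

/* same comparison rule as affinity_distance(), minus the bookkeeping */
static int distance(int a, int b)
{
	int i;
	for (i = 0; i <= 3; ++i)
		if (a >> i == b >> i)
			return i;
	return 0;
}

int main(void)
{
	assert(distance(2, 2) == 0); /* same GPU: no migration */
	assert(distance(2, 3) == 1); /* neighbors within a pair */
	assert(distance(1, 3) == 2); /* within the same group of four */
	assert(distance(3, 4) == 3); /* within the same group of eight */
	return 0;
}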
diff --git a/gpu/locktest.c b/gpu/locktest.c
new file mode 100644
index 0000000..bc4fc54
--- /dev/null
+++ b/gpu/locktest.c
@@ -0,0 +1,206 @@
1#include <stdio.h>
2#include <stdlib.h>
3#include <stdint.h>
4#include <unistd.h>
5#include <assert.h>
6#include <errno.h>
7#include <sys/types.h>
8#include <sys/stat.h>
9#include <fcntl.h>
10
11/* Include gettid() */
12#include <sys/types.h>
13
14/* Include threading support. */
15#include <pthread.h>
16
17/* Include the LITMUS^RT API.*/
18#include "litmus.h"
19
20/* Catch errors.
21 */
22#define CALL( exp ) do { \
23 int ret; \
24 ret = exp; \
25 if (ret != 0) \
26 fprintf(stderr, "%s failed: %m\n", #exp);\
27 else \
28 fprintf(stderr, "%s ok.\n", #exp); \
29 } while (0)
30
31#define TH_CALL( exp ) do { \
32 int ret; \
33 ret = exp; \
34 if (ret != 0) \
35 fprintf(stderr, "[%d] %s failed: %m\n", ctx->id, #exp); \
36 else \
37 fprintf(stderr, "[%d] %s ok.\n", ctx->id, #exp); \
38 } while (0)
39
40#define TH_SAFE_CALL( exp ) do { \
41 int ret; \
42 fprintf(stderr, "[%d] calling %s...\n", ctx->id, #exp); \
43 ret = exp; \
44 if (ret != 0) \
45 fprintf(stderr, "\t...[%d] %s failed: %m\n", ctx->id, #exp); \
46 else \
47 fprintf(stderr, "\t...[%d] %s ok.\n", ctx->id, #exp); \
48 } while (0)
49
50
51/* these are only default values */
52int NUM_THREADS=3;
53int NUM_SEMS=10;
54
55#define MAX_SEMS 1000
56
57#define EXEC_COST 10
58#define PERIOD 100
59
60/* The information passed to each thread. Could be anything. */
61struct thread_context {
62 int id;
63 int fd;
64 int od[MAX_SEMS];
65 int count;
66 unsigned int rand;
67};
68
69void* rt_thread(void* _ctx);
70int nested_job(struct thread_context* ctx, int *count, int *next);
71int job(struct thread_context*);
72
73#define OPTSTR "t:s:"
74
75int main(int argc, char** argv)
76{
77 int i;
78 struct thread_context* ctx;
79 pthread_t* task;
80 int fd;
81
82 int opt;
83 while((opt = getopt(argc, argv, OPTSTR)) != -1) {
84 switch(opt) {
85 case 't':
86 NUM_THREADS = atoi(optarg);
87 break;
88 case 's':
89 NUM_SEMS = atoi(optarg);
90 assert(NUM_SEMS <= MAX_SEMS);
91 break;
92 default:
93 fprintf(stderr, "Unknown option: %c\n", opt);
94 exit(-1);
95 break;
96 }
97 }
98
99 ctx = (struct thread_context*) calloc(NUM_THREADS, sizeof(struct thread_context));
100 task = (pthread_t*) calloc(NUM_THREADS, sizeof(pthread_t));
101
102 srand(0); /* something repeatable for now */
103
104 fd = open("semaphores", O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
105
106 CALL( init_litmus() );
107
108 for (i = 0; i < NUM_THREADS; i++) {
109 ctx[i].id = i;
110 ctx[i].fd = fd;
111 ctx[i].rand = rand();
112 CALL( pthread_create(task + i, NULL, rt_thread, ctx + i) );
113 }
114
115
116 for (i = 0; i < NUM_THREADS; i++)
117 pthread_join(task[i], NULL);
118
119
120 return 0;
121}
122
123void* rt_thread(void* _ctx)
124{
125 int i;
126 int do_exit = 0;
127
128 struct thread_context *ctx = (struct thread_context*)_ctx;
129
130 TH_CALL( init_rt_thread() );
131
132 /* Vary period a little bit. */
133 TH_CALL( sporadic_global(EXEC_COST, PERIOD + 10*ctx->id) );
134
135 for (i = 0; i < NUM_SEMS; i++) {
136 ctx->od[i] = open_fmlp_sem(ctx->fd, i);
137 if(ctx->od[i] < 0)
138 perror("open_fmlp_sem");
139 }
140
141 TH_CALL( task_mode(LITMUS_RT_TASK) );
142
143
144	printf("[%d] Waiting for TS release.\n", ctx->id);
145 wait_for_ts_release();
146 ctx->count = 0;
147
148 do {
149 int which_sem = (int)(NUM_SEMS * (rand_r(&(ctx->rand)) / (RAND_MAX + 1.0)));
150
151 printf("[%d]: trying to get semaphore %d.\n", ctx->id, which_sem);
152 fflush(stdout);
153
154		TH_SAFE_CALL ( litmus_lock(ctx->od[which_sem]) ); /* lock the od, not the raw index */
155
156 printf("[%d] got semaphore %d.\n", ctx->id, which_sem);
157 fflush(stdout);
158
159 do_exit = job(ctx);
160
161 printf("[%d]: freeing semaphore %d.\n", ctx->id, which_sem);
162 fflush(stdout);
163
164		TH_SAFE_CALL ( litmus_unlock(ctx->od[which_sem]) );
165
166 if(!do_exit) {
167 sleep_next_period();
168 }
169 } while(!do_exit);
170
171 /*****
172 * 4) Transition to background mode.
173 */
174 TH_CALL( task_mode(BACKGROUND_TASK) );
175
176
177 return NULL;
178}
179
180void dirty_kb(int kb)
181{
182 int32_t one_kb[256];
183 int32_t sum = 0;
184 int32_t i;
185
186 for (i = 0; i < 256; i++)
187 sum += one_kb[i];
188 kb--;
189 /* prevent tail recursion */
190 if (kb)
191 dirty_kb(kb);
192 for (i = 0; i < 256; i++)
193 sum += one_kb[i];
194}
195
196int job(struct thread_context* ctx)
197{
198 /* Do real-time calculation. */
199 dirty_kb(8);
200
201 /* Don't exit. */
202 //return ctx->count++ > 100;
203 //return ctx->count++ > 12000;
204 //return ctx->count++ > 120000;
205 return ctx->count++ > 30000; // controls number of jobs per task
206}
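
FMLP semaphores are addressed by the object descriptor (od) returned from open_fmlp_sem(), not by the index used to name them. A condensed sketch of the pattern, with fd and name as in rt_thread() above:

static void fmlp_critical_section(int fd, int name)
{
	int od = open_fmlp_sem(fd, name); /* od: kernel object descriptor */
	if (od < 0) {
		perror("open_fmlp_sem");
		return;
	}
	litmus_lock(od);   /* may suspend until the semaphore is granted */
	/* ... critical section ... */
	litmus_unlock(od);
}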
diff --git a/gpu/nested.c b/gpu/nested.c
new file mode 100644
index 0000000..07e237b
--- /dev/null
+++ b/gpu/nested.c
@@ -0,0 +1,245 @@
1#include <stdio.h>
2#include <stdlib.h>
3#include <stdint.h>
4#include <unistd.h>
5#include <assert.h>
6#include <errno.h>
7#include <sys/types.h>
8#include <sys/stat.h>
9#include <fcntl.h>
10
11/* Include gettid() */
12#include <sys/types.h>
13
14/* Include threading support. */
15#include <pthread.h>
16
17/* Include the LITMUS^RT API.*/
18#include "litmus.h"
19
20/* Catch errors.
21 */
22#define CALL( exp ) do { \
23 int ret; \
24 ret = exp; \
25 if (ret != 0) \
26 fprintf(stderr, "%s failed: %m\n", #exp);\
27 else \
28 fprintf(stderr, "%s ok.\n", #exp); \
29 } while (0)
30
31#define TH_CALL( exp ) do { \
32 int ret; \
33 ret = exp; \
34 if (ret != 0) \
35 fprintf(stderr, "[%d] %s failed: %m\n", ctx->id, #exp); \
36 else \
37 fprintf(stderr, "[%d] %s ok.\n", ctx->id, #exp); \
38 } while (0)
39
40#define TH_SAFE_CALL( exp ) do { \
41 int ret; \
42 fprintf(stderr, "[%d] calling %s...\n", ctx->id, #exp); \
43 ret = exp; \
44 if (ret != 0) \
45 fprintf(stderr, "\t...[%d] %s failed: %m\n", ctx->id, #exp); \
46 else \
47 fprintf(stderr, "\t...[%d] %s ok.\n", ctx->id, #exp); \
48 } while (0)
49
50
51#define NUM_CPUS 4
52
53//#define NUM_THREADS 3
54int NUM_THREADS=3;
55
56/* NEST_DEPTH may not be greater than NUM_SEMS. */
57//#define NUM_SEMS 10
58int NUM_SEMS=10;
59
60int SLEEP_BETWEEN_JOBS = 1;
61
62#define MAX_SEMS 1000
63
64//#define NEST_DEPTH 5
65int NEST_DEPTH=5;
66
67#define EXEC_COST 1000*1
68#define PERIOD 1000*10
69
70/* The information passed to each thread. Could be anything. */
71struct thread_context {
72 int id;
73 int fd;
74 int od[MAX_SEMS];
75 int count;
76 unsigned int rand;
77};
78
79void* rt_thread(void* _ctx);
80int nested_job(struct thread_context* ctx, int *count, int *next);
81int job(struct thread_context*);
82
83#define OPTSTR "t:s:d:f"
84
85int main(int argc, char** argv)
86{
87 int i;
88 struct thread_context* ctx; //[NUM_THREADS];
89 pthread_t* task; //[NUM_THREADS];
90 int fd;
91
92 int opt;
93 while((opt = getopt(argc, argv, OPTSTR)) != -1) {
94 switch(opt) {
95 case 't':
96 NUM_THREADS = atoi(optarg);
97 break;
98 case 's':
99 NUM_SEMS = atoi(optarg);
100 assert(NUM_SEMS <= MAX_SEMS);
101 break;
102 case 'd':
103 NEST_DEPTH = atoi(optarg);
104 break;
105 case 'f':
106 SLEEP_BETWEEN_JOBS = 0;
107 break;
108 default:
109 fprintf(stderr, "Unknown option: %c\n", opt);
110 exit(-1);
111 break;
112 }
113 }
114
115 ctx = (struct thread_context*) calloc(NUM_THREADS, sizeof(struct thread_context));
116 task = (pthread_t*) calloc(NUM_THREADS, sizeof(pthread_t));
117
118 srand(0); /* something repeatable for now */
119
120 fd = open("semaphores", O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
121
122 CALL( init_litmus() );
123
124 for (i = 0; i < NUM_THREADS; i++) {
125 ctx[i].id = i;
126 ctx[i].fd = fd;
127 ctx[i].rand = rand();
128 CALL( pthread_create(task + i, NULL, rt_thread, ctx + i) );
129 }
130
131
132 for (i = 0; i < NUM_THREADS; i++)
133 pthread_join(task[i], NULL);
134
135
136 return 0;
137}
138
139void* rt_thread(void* _ctx)
140{
141 int i;
142 int do_exit = 0;
143
144 struct thread_context *ctx = (struct thread_context*)_ctx;
145
146 /* Make presence visible. */
147 //printf("RT Thread %d active.\n", ctx->id);
148
149 TH_CALL( init_rt_thread() );
150 TH_CALL( sporadic_task_ns(EXEC_COST, PERIOD + 10*ctx->id, 0, 0,
151 LITMUS_LOWEST_PRIORITY, RT_CLASS_SOFT, NO_ENFORCEMENT, NO_SIGNALS, 0) );
152
153 for (i = 0; i < NUM_SEMS; i++) {
154 ctx->od[i] = open_rsm_sem(ctx->fd, i);
155 if(ctx->od[i] < 0)
156 perror("open_rsm_sem");
157 //printf("[%d] ctx->od[%d]: %d\n", ctx->id, i, ctx->od[i]);
158 }
159
160 TH_CALL( task_mode(LITMUS_RT_TASK) );
161
162
163	printf("[%d] Waiting for TS release.\n", ctx->id);
164 wait_for_ts_release();
165 ctx->count = 0;
166
167 do {
168 int first = (int)(NUM_SEMS * (rand_r(&(ctx->rand)) / (RAND_MAX + 1.0)));
169 int count = NEST_DEPTH;
170 do_exit = nested_job(ctx, &count, &first);
171
172 if(SLEEP_BETWEEN_JOBS && !do_exit) {
173 sleep_next_period();
174 }
175 } while(!do_exit);
176
177 /*****
178 * 4) Transition to background mode.
179 */
180 TH_CALL( task_mode(BACKGROUND_TASK) );
181
182
183 return NULL;
184}
185
186
187int nested_job(struct thread_context* ctx, int *count, int *next)
188{
189 int ret;
190
191 if(*count == 0 || *next == NUM_SEMS) /* base case */
192 {
193 ret = job(ctx);
194 }
195 else
196 {
197 int which_sem = ctx->od[*next];
198
199 ++(*next);
200 --(*count);
201
202 printf("[%d]: trying to get semaphore %d.\n", ctx->id, which_sem);
203 fflush(stdout);
204 TH_SAFE_CALL ( litmus_lock(which_sem) );
205 printf("[%d] got semaphore %d.\n", ctx->id, which_sem);
206 fflush(stdout);
207 ret = nested_job(ctx, count, next);
208 TH_SAFE_CALL ( litmus_unlock(which_sem) );
209 fflush(stdout);
210 }
211
212 return(ret);
213}
214
215
216
217void dirty_kb(int kb)
218{
219 int32_t one_kb[256];
220 int32_t sum = 0;
221 int32_t i;
222
223 for (i = 0; i < 256; i++)
224 sum += one_kb[i];
225 kb--;
226 /* prevent tail recursion */
227 if (kb)
228 dirty_kb(kb);
229 for (i = 0; i < 256; i++)
230 sum += one_kb[i];
231}
232
233
234
235int job(struct thread_context* ctx)
236{
237 /* Do real-time calculation. */
238 dirty_kb(8);
239
240 /* Don't exit. */
241 //return ctx->count++ > 100;
242 //return ctx->count++ > 12000;
243 //return ctx->count++ > 120000;
244 return ctx->count++ > 30000;
245}
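
nested_job() always acquires semaphores in ascending index order and releases them as the recursion unwinds, which is what keeps the nesting deadlock-free. An equivalent iterative sketch, using the definitions from nested.c above:

static int nested_job_iter(struct thread_context* ctx, int count, int next)
{
	int ret;
	int taken = 0;

	while (count > 0 && next + taken < NUM_SEMS) { /* ascending index order */
		litmus_lock(ctx->od[next + taken]);
		++taken;
		--count;
	}

	ret = job(ctx); /* innermost "critical section" */

	while (taken > 0) /* release in reverse order, like the unwinding recursion */
		litmus_unlock(ctx->od[next + --taken]);

	return ret;
}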
diff --git a/gpu/normal_task.c b/gpu/normal_task.c
new file mode 100644
index 0000000..ffc95b1
--- /dev/null
+++ b/gpu/normal_task.c
@@ -0,0 +1,84 @@
1#include <stdio.h>
2#include <stdlib.h>
3#include <string.h>
4#include <stdint.h>
5#include <unistd.h>
6#include <assert.h>
7#include <errno.h>
8#include <sys/types.h>
9#include <sys/stat.h>
10#include <fcntl.h>
11#include <time.h>
12#include <math.h>
13
14/* Include gettid() */
15#include <sys/types.h>
16
17/* Include threading support. */
18#include <pthread.h>
19
20/* Include the LITMUS^RT API.*/
21#include "litmus.h"
22
23/* Catch errors.
24 */
25#if 1
26#define CALL( exp ) do { \
27 int ret; \
28 ret = exp; \
29 if (ret != 0) \
30 fprintf(stderr, "%s failed: %m\n", #exp);\
31 else \
32 fprintf(stderr, "%s ok.\n", #exp); \
33 } while (0)
34
35#define TH_CALL( exp ) do { \
36 int ret; \
37 ret = exp; \
38 if (ret != 0) \
39 fprintf(stderr, "[%d] %s failed: %m\n", ctx->id, #exp); \
40 else \
41 fprintf(stderr, "[%d] %s ok.\n", ctx->id, #exp); \
42 } while (0)
43
44#define TH_SAFE_CALL( exp ) do { \
45 int ret; \
46 fprintf(stderr, "[%d] calling %s...\n", ctx->id, #exp); \
47 ret = exp; \
48 if (ret != 0) \
49 fprintf(stderr, "\t...[%d] %s failed: %m\n", ctx->id, #exp); \
50 else \
51 fprintf(stderr, "\t...[%d] %s ok.\n", ctx->id, #exp); \
52 } while (0)
53#else
54#define CALL( exp )
55#define TH_CALL( exp )
56#define TH_SAFE_CALL( exp )
57#endif
58
59/* these are only default values */
60// 1000 = 1us
61#define EXEC_COST 1000*1
62#define PERIOD 2*1000*100
63
64
65int main(int argc, char** argv)
66{
67 CALL( init_litmus() );
68
69 CALL( init_rt_thread() );
70 CALL( sporadic_task_ns(EXEC_COST, PERIOD, 0, 0,
71 LITMUS_LOWEST_PRIORITY, RT_CLASS_SOFT, NO_ENFORCEMENT, NO_SIGNALS, 1) );
72 //CALL( task_mode(LITMUS_RT_TASK) );
73
74	fprintf(stdout, "Waiting for TS release.\n");
75 wait_for_ts_release();
76
77 fprintf(stdout, "Released!\n");
78
79 //sleep_next_period();
80 //CALL( task_mode(BACKGROUND_TASK) );
81
82 return 0;
83}
84
diff --git a/gpu/rtspin_fake_cuda.cpp b/gpu/rtspin_fake_cuda.cpp
new file mode 100644
index 0000000..667c675
--- /dev/null
+++ b/gpu/rtspin_fake_cuda.cpp
@@ -0,0 +1,1169 @@
1#include <sys/time.h>
2
3#include <stdint.h>
4#include <stdio.h>
5#include <stdlib.h>
6#include <unistd.h>
7#include <time.h>
8#include <assert.h>
9#include <fcntl.h>
10#include <errno.h>
11
12#include <blitz/array.h>
13
14#include <boost/interprocess/managed_shared_memory.hpp>
15#include <boost/interprocess/sync/interprocess_barrier.hpp>
16#include <boost/interprocess/sync/interprocess_mutex.hpp>
17
18#include "litmus.h"
19
20using namespace blitz;
21using namespace std;
22using namespace boost::interprocess;
23
24#define RESET_RELEASE_ON_MISS
25
26
27void bail_out(const char* msg)
28{
29 perror(msg);
30 exit(-1 * errno);
31}
32
33
34static void usage(char *error) {
35 fprintf(stderr, "Error: %s\n", error);
36 fprintf(stderr,
37 "Usage:\n"
38 " rt_spin [COMMON-OPTS] WCET PERIOD DURATION\n"
39 " rt_spin [COMMON-OPTS] -f FILE [-o COLUMN] WCET PERIOD\n"
40 " rt_spin -l\n"
41 "\n"
42 "COMMON-OPTS = [-w] [-p PARTITION] [-c CLASS] [-s SCALE]\n"
43 "\n"
44 "WCET and PERIOD are milliseconds, DURATION is seconds.\n");
45 exit(EXIT_FAILURE);
46}
47
48#define NUMS 4096
49static int num[NUMS];
50
51#define PAGE_SIZE (1024*4)
52
53bool ENABLE_WAIT = true;
54bool GPU_TASK = false;
55bool ENABLE_AFFINITY = false;
56bool USE_KFMLP = false;
57bool RELAX_FIFO_MAX_LEN = false;
58bool USE_DYNAMIC_GROUP_LOCKS = false;
59bool BROADCAST_STATE = false;
60bool ENABLE_CHUNKING = false;
61bool MIGRATE_VIA_SYSMEM = false;
62
63int GPU_PARTITION = 0;
64int GPU_PARTITION_SIZE = 0;
65int NUM_SIMULT_USERS = 1;
66size_t SEND_SIZE = 0;
67size_t RECV_SIZE = 0;
68size_t STATE_SIZE = 0;
69size_t CHUNK_SIZE = PAGE_SIZE;
70
71
72#define MAX_GPUS 8
73
74int KEXCLU_LOCK;
75int EE_LOCKS[MAX_GPUS];
76int CE_SEND_LOCKS[MAX_GPUS];
77int CE_RECV_LOCKS[MAX_GPUS];
78
79int CUR_DEVICE = -1;
80int LAST_DEVICE = -1;
81
82bool useEngineLocks()
83{
84 return(NUM_SIMULT_USERS != 1);
85}
86
87int gpuCyclesPerSecond = 0;
88
89uint64_t *init_release_time = NULL;
90barrier *release_barrier = NULL;
91barrier *gpu_barrier = NULL;
92interprocess_mutex *gpu_mgmt_mutexes = NULL;
93managed_shared_memory *segment_ptr = NULL;
94managed_shared_memory *release_segment_ptr = NULL;
95
96// observed average rate when four GPUs on same node in use from pagelocked memory.
97// about 1/3 to 1/4 this when there is no bus contention.
98//const double msPerByte = 4.22e-07;
99//const double transOverhead = 0.01008; // also observed.
100
101
102
103char *d_send_data[MAX_GPUS] = {0};
104char *d_recv_data[MAX_GPUS] = {0};
105char *d_state_data[MAX_GPUS] = {0};
106
107//cudaStream_t streams[MAX_GPUS];
108
109char *h_send_data = 0;
110char *h_recv_data = 0;
111char *h_state_data = 0;
112
113
114#include <sys/mman.h>
115#define USE_PAGE_LOCKED_MEMORY
116#ifdef USE_PAGE_LOCKED_MEMORY
117#define c_malloc(s) \
118 mmap(NULL, s , \
119 PROT_READ | PROT_WRITE, \
120 MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, \
121 -1, 0)
122#else
123#define c_malloc(s) malloc(s)
124#endif
125
126typedef int cudaError_t;
127#define cudaSuccess 0
128
129enum cudaMemcpyKind {
130cudaMemcpyHostToDevice = 0,
131cudaMemcpyDeviceToHost = 1,
132cudaMemcpyDeviceToDevice = 2,
133};
134
135cudaError_t cudaGetLastError()
136{
137 return cudaSuccess;
138}
139
140////////////////////////////////////////////////////////////////////////
141////////////////////////////////////////////////////////////////////////
142////////////////////////////////////////////////////////////////////////
143////////////////////////////////////////////////////////////////////////
144
145struct ce_lock_state
146{
147 int locks[2];
148 size_t num_locks;
149 size_t budget_remaining;
150 bool locked;
151
152 ce_lock_state(int device_a, enum cudaMemcpyKind kind, size_t size, int device_b = -1) {
153 num_locks = (device_a != -1) + (device_b != -1);
154
155 if(device_a != -1) {
156 locks[0] = (kind == cudaMemcpyHostToDevice) ?
157 CE_SEND_LOCKS[device_a] : CE_RECV_LOCKS[device_a];
158 }
159
160 if(device_b != -1) {
161 assert(kind == cudaMemcpyDeviceToDevice);
162
163 locks[1] = CE_RECV_LOCKS[device_b];
164
165 if(locks[1] < locks[0]) {
166 int temp = locks[1];
167 locks[1] = locks[0];
168 locks[0] = temp;
169 }
170 }
171
172 if(!ENABLE_CHUNKING)
173 budget_remaining = size;
174 else
175 budget_remaining = CHUNK_SIZE;
176 }
177
178 void lock() {
179 if(USE_DYNAMIC_GROUP_LOCKS) {
180 litmus_dgl_lock(locks, num_locks);
181 }
182 else
183 {
184 for(int l = 0; l < num_locks; ++l)
185 {
186 litmus_lock(locks[l]);
187 }
188 }
189 locked = true;
190 }
191
192 void unlock() {
193 if(USE_DYNAMIC_GROUP_LOCKS) {
194 litmus_dgl_unlock(locks, num_locks);
195 }
196 else
197 {
198 // reverse order
199 for(int l = num_locks - 1; l >= 0; --l)
200 {
201 litmus_unlock(locks[l]);
202 }
203 }
204 locked = false;
205 }
206
207 void refresh() {
208 budget_remaining = CHUNK_SIZE;
209 }
210
211	bool budgetIsExhausted(size_t tosend) { /* renamed from budgetIsAvailable: true means a refresh is needed */
212 return(tosend >= budget_remaining);
213 }
214
215 void decreaseBudget(size_t spent) {
216 budget_remaining -= spent;
217 }
218};
219
220// precondition: if do_locking == true, locks in state are held.
221cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count,
222 enum cudaMemcpyKind kind,
223 ce_lock_state* state)
224{
225 cudaError_t ret = cudaSuccess;
226 int remaining = count;
227
228 char* dst = (char*)a_dst;
229 const char* src = (const char*)a_src;
230
231 // disable chunking, if needed, by setting chunk_size equal to the
232 // amount of data to be copied.
233 int chunk_size = (ENABLE_CHUNKING) ? CHUNK_SIZE : count;
234 int i = 0;
235
236 while(remaining != 0)
237 {
238 int bytesToCopy = std::min(remaining, chunk_size);
239
240		if(state && state->budgetIsExhausted(bytesToCopy) && state->locked) {
241 //cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) );
242 ret = cudaGetLastError();
243
244 if(ret != cudaSuccess)
245 {
246 break;
247 }
248
249 state->unlock();
250			state->refresh(); // replenish.
251 // we can only run out of
252 // budget if chunking is enabled.
253 // we presume that init budget would
254 // be set to cover entire memcpy
255 // if chunking were disabled.
256 }
257
258 if(state && !state->locked) {
259 state->lock();
260 }
261
262 //ret = cudaMemcpy(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind);
263 //cudaMemcpyAsync(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind, streams[CUR_DEVICE]);
264
265 if(state) {
266 state->decreaseBudget(bytesToCopy);
267 }
268
269// if(ret != cudaSuccess)
270// {
271// break;
272// }
273
274 ++i;
275 remaining -= bytesToCopy;
276 }
277 return ret;
278}
279
280cudaError_t chunkMemcpy(void* a_dst, const void* a_src, size_t count,
281 enum cudaMemcpyKind kind,
282 int device_a = -1, // device_a == -1 disables locking
283 bool do_locking = true,
284 int device_b = -1)
285{
286 cudaError_t ret;
287 if(!do_locking || device_a == -1) {
288 ret = __chunkMemcpy(a_dst, a_src, count, kind, NULL);
289 //cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) );
290 if(ret == cudaSuccess)
291 ret = cudaGetLastError();
292 }
293 else {
294 ce_lock_state state(device_a, kind, count, device_b);
295 state.lock();
296 ret = __chunkMemcpy(a_dst, a_src, count, kind, &state);
297 //cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) );
298 if(ret == cudaSuccess)
299 ret = cudaGetLastError();
300 state.unlock();
301 }
302 return ret;
303}
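
// Illustrative sketch (the names d_dst/h_src are assumptions, not from this
// file): a typical host-to-device send through chunkMemcpy(). Chunk-sized
// lock refreshing is handled internally via ce_lock_state, per the logic above.
static void example_send_to_current_gpu(char* d_dst, const char* h_src, size_t n)
{
	cudaError_t err = chunkMemcpy(d_dst, h_src, n,
				      cudaMemcpyHostToDevice,
				      CUR_DEVICE,        // device_a: lock this GPU's copy engine
				      useEngineLocks()); // lock only when engines are shared
	if (err != cudaSuccess)
		bail_out("chunkMemcpy");
}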
304
305
306////////////////////////////////////////////////////////////////////////
307////////////////////////////////////////////////////////////////////////
308////////////////////////////////////////////////////////////////////////
309
310
311inline uint64_t timespec_to_ns(const struct timespec& t)
312{
313	return((uint64_t)t.tv_sec * 1000000000ull + t.tv_nsec); /* integer math avoids double rounding */
314}
315
316inline struct timespec ns_to_timespec(const uint64_t& ns)
317{
318	struct timespec temp = {(time_t)(ns / 1000000000ull), (long)(ns % 1000000000ull)}; /* seconds, ns remainder */
319 return(temp);
320}
321
322inline uint64_t clock_gettime_ns(clockid_t clk_id)
323{
324 struct timespec temp;
325 clock_gettime(clk_id, &temp);
326 return timespec_to_ns(temp);
327}
328
329
330
331static int loop_once(void)
332{
333 int i, j = 0;
334 for (i = 0; i < NUMS; i++)
335 j += num[i]++;
336 return j;
337}
338
339static int loop_for(double exec_time, double emergency_exit)
340{
341 double last_loop = 0, loop_start;
342 int tmp = 0;
343
344 double start = cputime();
345 double now = cputime();
346
347 while (now + last_loop < start + exec_time) {
348 loop_start = now;
349 tmp += loop_once();
350 now = cputime();
351 last_loop = now - loop_start;
352 if (emergency_exit && wctime() > emergency_exit) {
353 /* Oops --- this should only be possible if the execution time tracking
354 * is broken in the LITMUS^RT kernel. */
355 fprintf(stderr, "!!! rtspin/%d emergency exit!\n", getpid());
356 fprintf(stderr, "Something is seriously wrong! Do not ignore this.\n");
357 break;
358 }
359 }
360
361 return tmp;
362}
363
364static void allocate_locks()
365{
366 // allocate k-FMLP lock
367 int fd = open("semaphores", O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
368
369 int base_name = GPU_PARTITION * 1000;
370
371 if(USE_KFMLP) {
372 KEXCLU_LOCK = open_kfmlp_gpu_sem(fd,
373 base_name, /* name */
374 GPU_PARTITION_SIZE,
375 GPU_PARTITION*GPU_PARTITION_SIZE,
376 NUM_SIMULT_USERS,
377 ENABLE_AFFINITY
378 );
379 }
380 else {
381 KEXCLU_LOCK = open_ikglp_gpu_sem(fd,
382 base_name, /* name */
383 GPU_PARTITION_SIZE,
384 GPU_PARTITION*GPU_PARTITION_SIZE,
385 NUM_SIMULT_USERS,
386 ENABLE_AFFINITY,
387 RELAX_FIFO_MAX_LEN
388 );
389 }
390 if(KEXCLU_LOCK < 0)
391 perror("open_kexclu_sem");
392
393 if(NUM_SIMULT_USERS > 1)
394 {
395 // allocate the engine locks.
396 for (int i = 0; i < MAX_GPUS; ++i)
397 {
398 EE_LOCKS[i] = open_rsm_sem(fd, (i+1)*10 + base_name);
399 if(EE_LOCKS[i] < 0)
400 perror("open_rsm_sem");
401
402 CE_SEND_LOCKS[i] = open_rsm_sem(fd, (i+1)*10 + base_name + 1);
403 if(CE_SEND_LOCKS[i] < 0)
404 perror("open_rsm_sem");
405
406 if(NUM_SIMULT_USERS == 3)
407 {
408 // allocate a separate lock for the second copy engine
409 CE_RECV_LOCKS[i] = open_rsm_sem(fd, (i+1)*10 + base_name + 2);
410 if(CE_RECV_LOCKS[i] < 0)
411 perror("open_rsm_sem");
412 }
413 else
414 {
415 // share a single lock for the single copy engine
416 CE_RECV_LOCKS[i] = CE_SEND_LOCKS[i];
417 }
418 }
419 }
420}
421
422static void allocate_host_memory()
423{
424 // round up to page boundaries
425 size_t send_alloc_bytes = SEND_SIZE + (SEND_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
426 size_t recv_alloc_bytes = RECV_SIZE + (RECV_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
427 size_t state_alloc_bytes = STATE_SIZE + (STATE_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
428
429	printf("Allocating host memory. send = %zuB, recv = %zuB, state = %zuB\n",
430 send_alloc_bytes, recv_alloc_bytes, state_alloc_bytes);
431
432// if(send_alloc_bytes > 0)
433// {
434// h_send_data = (char *)c_malloc(send_alloc_bytes);
435// memset(h_send_data, 0x55, send_alloc_bytes); // write some random value
436// // this will open a connection to GPU 0 if there is no active context, so
437// // expect long stalls. LAME.
438// cutilSafeCall( cudaHostRegister(h_send_data, send_alloc_bytes, cudaHostRegisterPortable) );
439// }
440//
441// if(recv_alloc_bytes > 0)
442// {
443// h_recv_data = (char *)c_malloc(recv_alloc_bytes);
444// memset(h_recv_data, 0xAA, recv_alloc_bytes);
445// cutilSafeCall( cudaHostRegister(h_recv_data, recv_alloc_bytes, cudaHostRegisterPortable) );
446// }
447//
448// if(state_alloc_bytes > 0)
449// {
450// h_state_data = (char *)c_malloc(state_alloc_bytes);
451// memset(h_state_data, 0xCC, state_alloc_bytes); // write some random value
452// cutilSafeCall( cudaHostRegister(h_state_data, state_alloc_bytes, cudaHostRegisterPortable) );
453// }
454
455 printf("Host memory allocated.\n");
456}
457
458static void allocate_device_memory()
459{
460 printf("Allocating device memory.\n");
461 // establish a connection to each GPU.
462// for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
463// {
464// int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
465//
466// if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].lock();
467//
468// cutilSafeCall( cudaSetDevice(which_device) );
469// cutilSafeCall( cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0) );
470// cutilSafeCall( cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0) );
471//
472// cutilSafeCall( cudaStreamCreate(&streams[which_device]) );
473//
474// /* pre-allocate memory, pray there's enough to go around */
475// if(SEND_SIZE > 0) {
476// cutilSafeCall( cudaMalloc((void**)&d_send_data[which_device], SEND_SIZE) );
477// }
478// if(RECV_SIZE > 0) {
479// cutilSafeCall( cudaMalloc((void**)&h_recv_data[which_device], RECV_SIZE) );
480// }
481// if(STATE_SIZE > 0) {
482// cutilSafeCall( cudaMalloc((void**)&h_state_data[which_device], STATE_SIZE) );
483// }
484//
485// if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].unlock();
486// }
487 printf("Device memory allocated.\n");
488}
489
490static void configure_gpus()
491{
492 printf("Configuring GPU\n");
493
494// // SUSPEND WHEN BLOCKED!!
495// cutilSafeCall( cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync) );
496//
497// // establish a connection to each GPU.
498// for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
499// {
500// int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
501//
502// if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].lock();
503//
504// cutilSafeCall( cudaSetDevice(which_device) );
505// cutilSafeCall( cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0) );
506// cutilSafeCall( cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0) );
507//
508// cutilSafeCall( cudaStreamCreate(&streams[which_device]) );
509//
510// // enable P2P migrations.
511// // we assume all GPUs are on the same I/O hub.
512// for(int j = 0; j < GPU_PARTITION_SIZE; ++j)
513// {
514// int other_device = GPU_PARTITION*GPU_PARTITION_SIZE + j;
515//
516// if(which_device != other_device)
517// {
518// cutilSafeCall( cudaDeviceEnablePeerAccess(other_device, 0) );
519// }
520// }
521//
522// if(i == 0)
523// {
524// struct cudaDeviceProp pi;
525// cudaGetDeviceProperties(&pi, i);
526// gpuCyclesPerSecond = pi.clockRate * 1000; /* khz -> hz */
527// }
528//
529// if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].unlock();
530// }
531
532 printf("GPUs have been configured.\n");
533}
534
535static void init_cuda()
536{
537 configure_gpus();
538 allocate_host_memory();
539 allocate_device_memory();
540 allocate_locks();
541}
542
543static void exit_cuda()
544{
545 for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
546 {
547 int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
548
549 if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].lock();
550
551// cutilSafeCall( cudaSetDevice(which_device) );
552// cutilSafeCall( cudaDeviceReset() );
553
554 if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].unlock();
555 }
556}
557
558static void catchExit(void)
559{
560 if(GPU_TASK)
561 {
562 // try to unlock everything. litmus will prevent bogus calls.
563 if(NUM_SIMULT_USERS > 1)
564 {
565 for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
566 {
567 int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
568
569 litmus_unlock(EE_LOCKS[which_device]);
570 litmus_unlock(CE_SEND_LOCKS[which_device]);
571					if(NUM_SIMULT_USERS == 3) { /* a separate recv lock exists only with 3 users */
572 litmus_unlock(CE_RECV_LOCKS[which_device]);
573 }
574 }
575 }
576
577 if(CUR_DEVICE >= 0) {
578 unregister_nv_device(CUR_DEVICE);
579 }
580
581 litmus_unlock(KEXCLU_LOCK);
582 }
583}
584
585static void migrateToGPU(int destination)
586{
587 if(!BROADCAST_STATE && STATE_SIZE > 0)
588 {
589 if(MIGRATE_VIA_SYSMEM)
590 {
591 chunkMemcpy(h_state_data, d_state_data[LAST_DEVICE], STATE_SIZE,
592 cudaMemcpyDeviceToHost, LAST_DEVICE, useEngineLocks());
593 }
594 }
595
596// cutilSafeCall( cudaSetDevice(destination) );
597
598 if(!BROADCAST_STATE && STATE_SIZE > 0)
599 {
600 if(MIGRATE_VIA_SYSMEM)
601 {
602 chunkMemcpy(d_state_data[CUR_DEVICE], h_state_data, STATE_SIZE,
603 cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks());
604 }
605 else
606 {
607 chunkMemcpy(d_state_data[destination],
608 d_state_data[LAST_DEVICE],
609 STATE_SIZE,
610 cudaMemcpyDeviceToDevice,
611 CUR_DEVICE,
612 useEngineLocks(),
613 destination);
614 }
615 }
616}
617
618static void broadcastState(int from)
619{
620 if(STATE_SIZE > 0)
621 {
622 assert(CUR_DEVICE == from);
623
624 if(MIGRATE_VIA_SYSMEM)
625 {
626 chunkMemcpy(h_state_data, d_state_data[from], STATE_SIZE,
627 cudaMemcpyDeviceToHost, from, useEngineLocks());
628 }
629
630 for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
631 {
632 int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
633 if(which_device != from)
634 {
635 if(MIGRATE_VIA_SYSMEM)
636 {
637// cutilSafeCall( cudaSetDevice(which_device) );
638 CUR_DEVICE = which_device; // temporary
639 chunkMemcpy(d_state_data[which_device], h_state_data, STATE_SIZE,
640 cudaMemcpyHostToDevice, which_device, useEngineLocks());
641 }
642 else
643 {
644 chunkMemcpy(d_state_data[which_device],
645 d_state_data[from],
646 STATE_SIZE,
647 cudaMemcpyDeviceToDevice,
648 from,
649 useEngineLocks(),
650 which_device);
651 }
652 }
653 }
654
655 if(MIGRATE_VIA_SYSMEM && CUR_DEVICE != from)
656 {
657// cutilSafeCall( cudaSetDevice(from) );
658 CUR_DEVICE = from;
659 }
660 }
661}
662
663//// Executes on the graphics card: busy-waits for the given number of GPU clock cycles.
664//__global__ void docudaspin(unsigned int cycles)
665//{
666// long long unsigned int elapsed = 0;
667// long long int now = clock64();
668// long long int last;
669// do
670// {
671// last = now;
672// now = clock64();
673// elapsed += max(0ll, (long long int)(now - last)); // don't count iterations with clock roll-over
674// }while(elapsed < cycles);
675//
676// return;
677//}
678
679
680
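/* Simulate one GPU access: litmus_lock() on the k-exclusion lock returns
 * the device we were assigned; migrate state if that differs from the
 * last device used, copy inputs in, spin on the GPU for the requested
 * time (engine-locked if configured), copy results out, and optionally
 * broadcast state, all before releasing the k-exclusion lock. */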
681static void gpu_loop_for(double gpu_sec_time, double emergency_exit)
682{
683 unsigned int numcycles = (unsigned int)(gpuCyclesPerSecond * gpu_sec_time);
684 int numblocks = 1;
685 int blocksz = 1;
686
687 CUR_DEVICE = litmus_lock(KEXCLU_LOCK);
688 {
689 if(CUR_DEVICE != LAST_DEVICE && LAST_DEVICE != -1)
690 {
691 migrateToGPU(CUR_DEVICE);
692 }
693
694 if(SEND_SIZE > 0)
695 {
696 // handles chunking and locking, as appropriate.
697 chunkMemcpy(d_send_data[CUR_DEVICE], h_send_data, SEND_SIZE,
698 cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks());
699 }
700
701 if(useEngineLocks()) litmus_lock(EE_LOCKS[CUR_DEVICE]);
702
703// docudaspin <<<numblocks,blocksz, 0, streams[CUR_DEVICE]>>> (numcycles);
704// cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) );
705
706 if(useEngineLocks()) litmus_unlock(EE_LOCKS[CUR_DEVICE]);
707
708 if(RECV_SIZE > 0)
709 {
710 chunkMemcpy(h_recv_data, d_recv_data[CUR_DEVICE], RECV_SIZE,
711 cudaMemcpyDeviceToHost, CUR_DEVICE, useEngineLocks());
712 }
713
714 if(BROADCAST_STATE)
715 {
716 broadcastState(CUR_DEVICE);
717 }
718 }
719 litmus_unlock(KEXCLU_LOCK);
720
721 LAST_DEVICE = CUR_DEVICE;
722 CUR_DEVICE = -1;
723}
724
725
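/* Calibration aid (-l): repeatedly spin for known wall-clock delays and
 * report loop_for()'s measured error against wctime(). */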
726static void debug_delay_loop(void)
727{
728 double start, end, delay;
729
730 while (1) {
731 for (delay = 0.5; delay > 0.01; delay -= 0.01) {
732 start = wctime();
733 loop_for(delay, 0);
734 end = wctime();
735 printf("%6.4fs: looped for %10.8fs, delta=%11.8fs, error=%7.4f%%\n",
736 delay,
737 end - start,
738 end - start - delay,
739 100 * (end - start - delay) / delay);
740 }
741 }
742}
743
744static int job(double exec_time, double gpu_sec_time, double program_end)
745{
746 if (wctime() > program_end)
747 return 0;
748 else if (!GPU_TASK)
749 {
750 loop_for(exec_time, program_end + 1);
751 }
752 else
753 {
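		/* split the CPU demand in half and bookend the GPU access with it */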
754		double cpu_bookend = exec_time / 2.0;
755
756 loop_for(cpu_bookend, program_end + 1);
757 gpu_loop_for(gpu_sec_time, program_end + 1);
758 loop_for(cpu_bookend, program_end + 1);
759 }
760 return 1;
761}
762
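/* Option summary (sizes taken in KB):
 *  -p cpu partition   -l calibration loop   -s scale factor
 *  -e GPU time per job (ms)   -z simultaneous engine users
 *  -g GPU task + partition size   -G partition index
 *  -S/-R/-T send/recv/state buffer sizes   -B broadcast state
 *  -M migrate via sysmem   -a affinity   -r relax FIFO max length
 *  -L use k-FMLP   -y dynamic group locks   -C chunked copies (chunk KB)
 *  -W create shared memory for n users   -N tasks in synchronized release
 */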
763#define OPTSTR ":p:ls:e:g:G:W:N:S:R:T:BMaLyC:rz:" /* leading ':' makes getopt() return ':' on a missing argument */
764
765int main(int argc, char** argv)
766{
767 atexit(catchExit);
768
769 int ret;
770 lt_t wcet;
771 lt_t period;
772 double wcet_ms, period_ms;
773 int migrate = 0;
774 int cpu = 0;
775 int opt;
776 int test_loop = 0;
777// int column = 1;
778 const char *file = NULL;
779 int want_enforcement = 0;
780 double duration = 0, releaseTime = 0;
781 double *exec_times = NULL;
782 double scale = 1.0;
783 uint64_t cur_job;
784 uint64_t num_jobs;
785
786 int create_shm = -1;
787 int num_tasks = 0;
788
789 double gpu_sec_ms = 0;
790
791 while ((opt = getopt(argc, argv, OPTSTR)) != -1) {
792// printf("opt = %c optarg = %s\n", opt, optarg);
793 switch (opt) {
794// case 'w':
795// ENABLE_WAIT = 1;
796// break;
797 case 'p':
798 cpu = atoi(optarg);
799 migrate = 1;
800 break;
801 case 'l':
802 test_loop = 1;
803 break;
804 case 's':
805 scale = atof(optarg);
806 break;
807 case 'e':
808 gpu_sec_ms = atof(optarg);
809 break;
810// case 'x':
811// trans_sec_ms = atof(optarg);
812// break;
813 case 'z':
814 NUM_SIMULT_USERS = atoi(optarg);
815 break;
816 case 'g':
817 GPU_TASK = 1;
818 GPU_PARTITION_SIZE = atoi(optarg);
819 break;
820 case 'G':
821 GPU_PARTITION = atoi(optarg);
822 break;
823 case 'S':
824 SEND_SIZE = (size_t)(atof(optarg)*1024);
825 break;
826 case 'R':
827 RECV_SIZE = (size_t)(atof(optarg)*1024);
828 break;
829 case 'T':
830 STATE_SIZE = (size_t)(atof(optarg)*1024);
831 break;
832 case 'B':
833 BROADCAST_STATE = true;
834 break;
835 case 'M':
836 MIGRATE_VIA_SYSMEM = true;
837 break;
838 case 'a':
839 ENABLE_AFFINITY = true;
840 break;
841 case 'r':
842 RELAX_FIFO_MAX_LEN = true;
843 break;
844 case 'L':
845 USE_KFMLP = true;
846 break;
847 case 'y':
848 USE_DYNAMIC_GROUP_LOCKS = true;
849 break;
850 case 'C':
851 ENABLE_CHUNKING = true;
852 CHUNK_SIZE = (size_t)(atof(optarg)*1024);
853 break;
854 case 'W':
855 create_shm = atoi(optarg);
856 break;
857 case 'N':
858 num_tasks = atoi(optarg);
859 break;
860 case ':':
861 usage("Argument missing.");
862 break;
863 case '?':
864 default:
865 usage("Bad argument.");
866 break;
867 }
868 }
869
870 if (test_loop) {
871 debug_delay_loop();
872 return 0;
873 }
874
875// if (file) {
876// int num_jobs_tmp;
877// get_exec_times(file, column, &num_jobs_tmp, &exec_times);
878// num_jobs = num_jobs_tmp;
879//
880// if (argc - optind < 2)
881// usage("Arguments missing.");
882//
883// for (cur_job = 0; cur_job < num_jobs; ++cur_job) {
884// /* convert the execution time to seconds */
885// duration += exec_times[cur_job] * 0.001;
886// }
887// } else {
888 /*
889 * if we're not reading from the CSV file, then we need
890 * three parameters
891 */
892 if (argc - optind < 3)
893 usage("Arguments missing.");
894// }
895
896 wcet_ms = atof(argv[optind + 0]);
897 period_ms = atof(argv[optind + 1]);
898
899 wcet = wcet_ms * __NS_PER_MS;
900 period = period_ms * __NS_PER_MS;
901 if (wcet <= 0)
902 usage("The worst-case execution time must be a "
903 "positive number.");
904 if (period <= 0)
905 usage("The period must be a positive number.");
906 if (!file && wcet > period) {
907 usage("The worst-case execution time must not "
908 "exceed the period.");
909 }
910
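	/* without a CSV trace the job count follows from duration/period
	 * (plus one job of padding); with one, the duration instead grows
	 * to cover all recorded jobs */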
911 if (!file)
912 {
913 duration = atof(argv[optind + 2]);
914 num_jobs = ((double)duration*1e3)/period_ms;
915 ++num_jobs; // padding
916 }
917 else if (file && num_jobs > 1)
918 {
919 duration += period_ms * 0.001 * (num_jobs - 1);
920 }
921
922 if (migrate) {
923 ret = be_migrate_to(cpu);
924 if (ret < 0)
925 bail_out("could not migrate to target partition");
926 }
927
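	/* Synchronized release: the task given -N creates the shared-memory
	 * segment holding the release barrier and a shared release timestamp;
	 * all other tasks poll until they can open it. */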
928 if(ENABLE_WAIT)
929 {
930 if(num_tasks > 0)
931 {
932 printf("%d creating release shared memory\n", getpid());
933 shared_memory_object::remove("release_barrier_memory");
934 release_segment_ptr = new managed_shared_memory(create_only, "release_barrier_memory", 4*1024);
935
936 printf("%d creating release barrier for %d users\n", getpid(), num_tasks);
937 release_barrier = release_segment_ptr->construct<barrier>("barrier release_barrier")(num_tasks);
938
939 init_release_time = release_segment_ptr->construct<uint64_t>("uint64_t instance")();
940 *init_release_time = 0;
941 }
942 else
943 {
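			/* the creator may not have constructed the segment yet;
			 * poll until it can be opened */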
944 do
945 {
946 try
947 {
948 printf("%d opening release shared memory\n", getpid());
949					release_segment_ptr = new managed_shared_memory(open_only, "release_barrier_memory");
950 }
951 catch(...)
952 {
953 printf("%d shared memory not ready. sleeping\n", getpid());
954 sleep(1);
955 }
956			}while(release_segment_ptr == NULL);
957
958			release_barrier = release_segment_ptr->find<barrier>("barrier release_barrier").first;
959			init_release_time = release_segment_ptr->find<uint64_t>("uint64_t instance").first;
960 }
961 }
962
963
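	/* GPU tasks additionally share a barrier plus one management mutex
	 * per device; -W marks the creator and gives the user count. */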
964 if(GPU_TASK)
965 {
966 if(ENABLE_WAIT)
967 {
968 if(create_shm > -1)
969 {
970 printf("%d creating shared memory\n", getpid());
971 shared_memory_object::remove("gpu_barrier_memory");
972 segment_ptr = new managed_shared_memory(create_only, "gpu_barrier_memory", 4*1024);
973
974 printf("%d creating a barrier for %d users\n", getpid(), create_shm);
975 gpu_barrier = segment_ptr->construct<barrier>("barrier instance")(create_shm);
976 printf("%d creating gpu mgmt mutexes for 8 devices\n", getpid());
977 gpu_mgmt_mutexes = segment_ptr->construct<interprocess_mutex>("interprocess_mutex m")[8]();
978 }
979 else
980 {
981 do
982 {
983 try
984 {
985 printf("%d opening shared memory\n", getpid());
986 segment_ptr = new managed_shared_memory(open_only, "gpu_barrier_memory");
987 }
988 catch(...)
989 {
990 printf("%d shared memory not ready. sleeping\n", getpid());
991 sleep(1);
992 }
993 }while(segment_ptr == NULL);
994
995 gpu_barrier = segment_ptr->find<barrier>("barrier instance").first;
996 gpu_mgmt_mutexes = segment_ptr->find<interprocess_mutex>("interprocess_mutex m").first;
997 }
998 }
999
1000		// scale the data transmission sizes along with execution time
1001 SEND_SIZE *= scale;
1002 RECV_SIZE *= scale;
1003 STATE_SIZE *= scale;
1004
1005 init_cuda();
1006 }
1007
1008 ret = sporadic_task_ns(wcet, period, 0, cpu, RT_CLASS_SOFT,
1009 want_enforcement ? PRECISE_ENFORCEMENT
1010 : NO_ENFORCEMENT,
1011 migrate);
1012 if (ret < 0)
1013 bail_out("could not setup rt task params");
1014
1015 init_litmus();
1016
1017 ret = task_mode(LITMUS_RT_TASK);
1018 if (ret != 0)
1019 bail_out("could not become RT task");
1020
1021
1022
1023 uint64_t jobCount = 0;
1024 blitz::Array<uint64_t, 1> responseTimeLog(num_jobs+1);
1025
1026 struct timespec spec;
1027 uint64_t release;
1028 uint64_t finish;
1029
1030
1031 if (ENABLE_WAIT) {
1032 printf("Waiting for release.\n");
1033 ret = wait_for_ts_release();
1034 if (ret != 0)
1035 bail_out("wait_for_ts_release()");
1036 }
1037 else
1038 {
1039 sleep_next_period();
1040 }
1041
1042 clock_gettime(CLOCK_MONOTONIC, &spec);
1043 release = timespec_to_ns(spec);
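	/* the first task to reach this point publishes its release time via
	 * compare-and-swap; later tasks adopt it as a common baseline */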
1044	if (init_release_time && !__sync_bool_compare_and_swap(init_release_time, 0, release))
1045 {
1046 release = *init_release_time;
1047 }
1048
1049 releaseTime = wctime();
1050 double failsafeEnd = releaseTime + duration;
1051
1052
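	/* main loop: run jobs until the failsafe end time, logging each job's
	 * response time against its estimated release */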
1053 if (file) {
1054 /* use times read from the CSV file */
1055 for (cur_job = 0; cur_job < num_jobs; ++cur_job) {
1056 /* convert job's length to seconds */
1057 job(exec_times[cur_job] * 0.001 * scale,
1058 gpu_sec_ms * 0.001 * scale,
1059 failsafeEnd);
1060 }
1061 } else {
1062 /* convert to seconds and scale */
1063 int keepGoing;
1064 do
1065 {
1066 keepGoing = job(wcet_ms * 0.001 * scale, gpu_sec_ms * 0.001 * scale, failsafeEnd);
1067
1068
1069 clock_gettime(CLOCK_MONOTONIC, &spec);
1070 finish = timespec_to_ns(spec);
1071
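			/* clamp the index: overruns beyond num_jobs reuse the spare
			 * last slot rather than writing out of bounds */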
1072 responseTimeLog(min(num_jobs,jobCount++)) = finish - release;
1073
1074 // this is an estimated upper-bound on release time. it may be off by several microseconds.
1075#ifdef RESET_RELEASE_ON_MISS
1076 release = (release + period < finish) ?
1077 finish : /* missed deadline. adopt next release as current time. */
1078 release + period; /* some time in the future. */
1079#else
1080 release = release + period; // allow things to get progressively later.
1081#endif
1082
1083 sleep_next_period();
1084 clock_gettime(CLOCK_MONOTONIC, &spec);
1085 release = min(timespec_to_ns(spec), release);
1086
1087 } while(keepGoing);
1088 }
1089
1090 if(GPU_TASK && ENABLE_WAIT)
1091 {
1092 printf("%d waiting at barrier\n", getpid());
1093 gpu_barrier->wait();
1094 }
1095
1096 ret = task_mode(BACKGROUND_TASK);
1097 if (ret != 0)
1098 bail_out("could not become regular task (huh?)");
1099
1100 if (file)
1101 free(exec_times);
1102
1103 if(GPU_TASK)
1104 {
1105 /*
1106 if(ENABLE_WAIT)
1107 {
1108			// wait for all GPU-using tasks to exit RT mode.
1109 printf("%d waiting at barrier\n", getpid());
1110 gpu_barrier->wait();
1111 }
1112 */
1113
1114 exit_cuda();
1115
1116 if(ENABLE_WAIT)
1117 {
1118 /* wait before we clean up memory */
1119 printf("%d waiting for all to shutdown GPUs\n", getpid());
1120 gpu_barrier->wait();
1121
1122/*
1123 if(create_shm > -1)
1124 {
1125 printf("%d removing shared memory\n", getpid());
1126 shared_memory_object::remove("gpu_barrier_memory");
1127 }
1128*/
1129 }
1130 }
1131
1132
1133 if (ENABLE_WAIT)
1134 {
1135 printf("%d waiting at exit barrier\n", getpid());
1136 release_barrier->wait();
1137 }
1138
1139
1140 char gpu_using_str[] = "GPU\n";
1141 char cpu_only_str[] = "CPU\n";
1142 #define USED(arr) (arr)(Range(fromStart,min(num_jobs-1,jobCount-1)))
1143	// cpu, pid, period (ms), avg-rt, avg-rt as %-of-period, min-rt, max-rt, avg-slack, num jobs, num misses, num misses (>2x period), task type
1144 printf("DONE,%d,%d,%f,%f,%f,%lu,%lu,%f,%lu,%d,%d,%s",
1145 cpu,
1146 getpid(),
1147 period_ms,
1148 // average
1149 blitz::mean(USED(responseTimeLog)),
1150 // average pct of period
1151 100.0*(blitz::mean(USED(responseTimeLog))/period),
1152 // min
1153 blitz::min(USED(responseTimeLog)),
1154 // max
1155 blitz::max(USED(responseTimeLog)),
1156		// average slack (unsigned arithmetic: a tardy job wraps around)
1157 blitz::mean((uint64_t)period - USED(responseTimeLog)),
1158 // num jobs
1159 min(num_jobs-1,jobCount-1),
1160 // num misses
1161 blitz::count(USED(responseTimeLog) > (uint64_t)period),
1162 // num misses w/ unbounded
1163 blitz::count(USED(responseTimeLog) > (uint64_t)(2*period)),
1164 // flag gpu-using tasks
1165 ((GPU_TASK) ? gpu_using_str : cpu_only_str)
1166 );
1167
1168 return 0;
1169}