path: root/gpu
author    Glenn Elliott <gelliott@cs.unc.edu>  2013-03-13 15:33:57 -0400
committer Glenn Elliott <gelliott@cs.unc.edu>  2013-03-13 15:33:57 -0400
commit    944a78c21028da69fb53c0aec3e9dfdb048d47e4 (patch)
tree      bdbc77b3c0ff1337670a7e5d0f9d438388c1a866 /gpu
parent    f338b34ea0fb6136ea3895a07161ece030c4b998 (diff)
parent    1ff4fc699f01f0ad1359fad48b00c9d3be1b28b4 (diff)
Merge branch 'gh/staging' into temp
Conflicts:
	Makefile
	bin/rt_launch.c
	bin/rtspin.c
	src/task.c
Diffstat (limited to 'gpu')
-rw-r--r--  gpu/aux_threads.c        |  14
-rw-r--r--  gpu/dgl.c                |  20
-rw-r--r--  gpu/ikglptest.c          |  78
-rw-r--r--  gpu/locktest.c           |   2
-rw-r--r--  gpu/nested.c             |   4
-rw-r--r--  gpu/rtspin_fake_cuda.cpp | 206
6 files changed, 162 insertions(+), 162 deletions(-)
diff --git a/gpu/aux_threads.c b/gpu/aux_threads.c
index 1e168c6..1711c40 100644
--- a/gpu/aux_threads.c
+++ b/gpu/aux_threads.c
@@ -1,4 +1,4 @@
 /* based_mt_task.c -- A basic multi-threaded real-time task skeleton.
  *
  * This (by itself useless) task demos how to setup a multi-threaded LITMUS^RT
  * real-time task. Familiarity with the single threaded example (base_task.c)
@@ -48,7 +48,7 @@ struct thread_context {
 void* rt_thread(void *tcontext);
 void* aux_thread(void *tcontext);

 /* Declare the periodically invoked job.
  * Returns 1 -> task should exit.
  *         0 -> task should continue.
  */
@@ -112,7 +112,7 @@ int main(int argc, char** argv)

 	ctx = calloc(NUM_AUX_THREADS, sizeof(struct thread_context));
 	task = calloc(NUM_AUX_THREADS, sizeof(pthread_t));

 	//lt_t delay = ms2lt(1000);

 	/*****
@@ -199,9 +199,9 @@ int main(int argc, char** argv)
 			printf("child %d: %fs\n", i, time);
 		}
 	}


 	/*****
 	 * 6) Clean up, maybe print results and stats, and exit.
 	 */
 	return 0;
@@ -271,7 +271,7 @@ void* rt_thread(void *tcontext)

 	wait_for_ts_release();

 	/* The task is now executing as a real-time task if the call didn't fail.
 	 */


@@ -304,7 +304,7 @@ void* rt_thread(void *tcontext)
 	return ctx;
 }

 int job(void)
 {
 	/* Do real-time calculation. */

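For context, the aux_threads.c hunks above all touch the standard multi-threaded LITMUS^RT skeleton. Below is a minimal sketch (not part of the diff) of the per-thread pattern those hunks exercise. wait_for_ts_release(), sleep_next_period(), TH_CALL() and task_mode(BACKGROUND_TASK) appear verbatim in this diff; task_mode(LITMUS_RT_TASK) is assumed from the base_mt_task.c skeleton the file says it is based on, and job() is a stand-in stub.

/* Minimal sketch of the per-thread skeleton the hunks above belong to.
 * LITMUS_RT_TASK is an assumption taken from the standard skeleton;
 * job() is a stand-in stub, not the real workload. */
#include <litmus.h>

static int jobs_done;

static int job(void)
{
	/* Do real-time calculation here; ask to exit after 100 jobs. */
	return (++jobs_done >= 100);
}

void* rt_thread(void *tcontext)
{
	int do_exit;

	TH_CALL( task_mode(LITMUS_RT_TASK) );   /* enter real-time mode (assumed) */
	wait_for_ts_release();                  /* block until the task set is released */

	/* The task is now executing as a real-time task if the call didn't fail. */
	do {
		sleep_next_period();            /* suspend until the next period */
		do_exit = job();                /* one job per period */
	} while (!do_exit);

	TH_CALL( task_mode(BACKGROUND_TASK) );  /* leave real-time mode */
	return tcontext;
}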
diff --git a/gpu/dgl.c b/gpu/dgl.c
index dc68ead..42a3ae2 100644
--- a/gpu/dgl.c
+++ b/gpu/dgl.c
@@ -177,7 +177,7 @@ void* rt_thread(void* _ctx)
 		xfprintf(stdout, "ikglp od = %d\n", ctx->ikglp);
 	}


 	for (i = 0; i < NUM_SEMS; i++) {
 		if(!USE_PRIOQ) {
 			ctx->od[i] = open_fifo_sem(ctx->fd, i+1);
@@ -208,29 +208,29 @@ void* rt_thread(void* _ctx)
 		int last = (first + NEST_DEPTH - 1 >= NUM_SEMS) ? NUM_SEMS - 1 : first + NEST_DEPTH - 1;
 		int dgl_size = last - first + 1;
 		int dgl[dgl_size];

 		// construct the DGL
 		for(i = first; i <= last; ++i) {
 			dgl[i-first] = ctx->od[i];
 		}


 		if(NUM_REPLICAS) {
 			replica = litmus_lock(ctx->ikglp);
 			xfprintf(stdout, "[%d] got ikglp replica %d.\n", ctx->id, replica);
 		}


 		litmus_dgl_lock(dgl, dgl_size);
 		xfprintf(stdout, "[%d] acquired dgl.\n", ctx->id);

 		do_exit = job(ctx);


 		xfprintf(stdout, "[%d] unlocking dgl.\n", ctx->id);
 		litmus_dgl_unlock(dgl, dgl_size);

 		if(NUM_REPLICAS) {
 			xfprintf(stdout, "[%d]: freeing ikglp replica %d.\n", ctx->id, replica);
 			litmus_unlock(ctx->ikglp);
 		}
@@ -249,7 +249,7 @@ void* rt_thread(void* _ctx)
 	return NULL;
 }

 void dirty_kb(int kb)
 {
 	int32_t one_kb[256];
 	int32_t sum = 0;
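The dgl.c hunks above center on liblitmus dynamic group locks (DGLs). Below is a condensed, hedged sketch of the idiom shown in the diff — build an array of lock object descriptors, then acquire and release the whole group atomically — using only calls that appear above (open_fifo_sem, litmus_dgl_lock, litmus_dgl_unlock); NUM_SEMS, NEST_DEPTH and the critical section are illustrative, not taken from the real configuration.

/* Condensed sketch of the DGL idiom from gpu/dgl.c above.  The od[] values
 * are object descriptors previously returned by open_fifo_sem(), as in the
 * diff; constants and the critical section are illustrative. */
#include <litmus.h>

#define NUM_SEMS   10
#define NEST_DEPTH 5

static void lock_group(int od[NUM_SEMS], int first)
{
	int i;
	int last = (first + NEST_DEPTH - 1 >= NUM_SEMS) ?
			NUM_SEMS - 1 : first + NEST_DEPTH - 1;
	int dgl_size = last - first + 1;
	int dgl[dgl_size];

	/* construct the DGL: one object descriptor per group member */
	for (i = first; i <= last; ++i)
		dgl[i - first] = od[i];

	litmus_dgl_lock(dgl, dgl_size);    /* acquire all members atomically */
	/* ... critical section spanning all locked resources ... */
	litmus_dgl_unlock(dgl, dgl_size);  /* release the whole group */
}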
diff --git a/gpu/ikglptest.c b/gpu/ikglptest.c
index f802801..30623b7 100644
--- a/gpu/ikglptest.c
+++ b/gpu/ikglptest.c
@@ -172,7 +172,7 @@ struct avg_info feedback(int _a, int _b)
 	}

 	stdev = sqrtf(devsum/(NUM_SAMPLES-1));

 	ret.avg = avg;
 	ret.stdev = stdev;

@@ -189,10 +189,10 @@ struct avg_info feedback(int _a, int _b)
 int main(int argc, char** argv)
 {
 	int i;
-	struct thread_context* ctx;
-	struct thread_context* aux_ctx;
-	pthread_t* task;
-	pthread_t* aux_task;
+	struct thread_context* ctx = NULL;
+	struct thread_context* aux_ctx = NULL;
+	pthread_t* task = NULL;
+	pthread_t* aux_task = NULL;
 	int fd;

 	int opt;
@@ -291,7 +291,7 @@ int main(int argc, char** argv)
 			}
 		}
 	}

 	printf("Best:\ta = %d\tb = %d\t(b-a) = %d\tavg = %6.2f\tstdev = %6.2f\n", best_a, best_b, best_b - best_a, best.avg, best.stdev);
 	printf("2nd:\ta = %d\tb = %d\t(b-a) = %d\tavg = %6.2f\tstdev = %6.2f\n", second_best_a, second_best_b, second_best_b - second_best_a, second_best.avg, second_best.stdev);

@@ -308,7 +308,7 @@ int main(int argc, char** argv)
 	}

 	printf("Aaron:\tavg = %6.2f\tstd = %6.2f\n", avg_accum/TRIALS, std_accum/TRIALS);




@@ -385,7 +385,7 @@ int affinity_distance(struct thread_context* ctx, int a, int b)
 {
 	int i;
 	int dist;

 	if(a >= 0 && b >= 0) {
 		for(i = 0; i <= 3; ++i) {
 			if(a>>i == b>>i) {
@@ -397,25 +397,25 @@ int affinity_distance(struct thread_context* ctx, int a, int b)
 		}
 	}
 	else {
 		dist = 0;
 	}

 out:
 	//printf("[%d]: distance: %d -> %d = %d\n", ctx->id, a, b, dist);

 	++(ctx->mig_count[dist]);

 	return dist;

 //	int groups[] = {2, 4, 8};
 //	int i;
 //
 //	if(a < 0 || b < 0)
 //		return (sizeof(groups)/sizeof(groups[0]));  // worst affinity
 //
 //	// no migration
 //	if(a == b)
 //		return 0;
 //
 //	for(i = 0; i < sizeof(groups)/sizeof(groups[0]); ++i) {
 //		if(a/groups[i] == b/groups[i])
 //			return (i+1);
@@ -441,7 +441,7 @@ void* rt_thread(void* _ctx)
 {
 	int i;
 	int do_exit = 0;
 	int last_replica = -1;

 	struct thread_context *ctx = (struct thread_context*)_ctx;

@@ -472,13 +472,13 @@ void* rt_thread(void* _ctx)
 					IKGLP_OPTIMAL_FIFO_LEN :
 					IKGLP_UNLIMITED_FIFO_LEN,
 				ENABLE_AFFINITY
 				);
 	}
 	if(ctx->kexclu < 0)
 		perror("open_kexclu_sem");
 	else
 		printf("kexclu od = %d\n", ctx->kexclu);

 	for (i = 0; i < NUM_SEMS; ++i) {
 		if(!USE_PRIOQ) {
 			ctx->od[i] = open_fifo_sem(ctx->fd, i + ctx->kexclu + 2);
@@ -508,21 +508,21 @@ void* rt_thread(void* _ctx)
 		int dgl_size = last - first + 1;
 		int replica = -1;
 		int distance;

 		int dgl[dgl_size];

 		// construct the DGL
 		for(i = first; i <= last; ++i) {
 			dgl[i-first] = ctx->od[i];
 		}

 		replica = litmus_lock(ctx->kexclu);

 		//printf("[%d] got kexclu replica %d.\n", ctx->id, replica);
 		//fflush(stdout);

 		distance = affinity_distance(ctx, replica, last_replica);

 		if(USE_DYNAMIC_GROUP_LOCKS) {
 			litmus_dgl_lock(dgl, dgl_size);
 		}
@@ -531,24 +531,24 @@ void* rt_thread(void* _ctx)
 				litmus_lock(dgl[i]);
 			}
 		}

 		//do_exit = nested_job(ctx, &count, &first, affinity_cost[distance]);
 		do_exit = job(ctx, affinity_cost[distance]);

 		if(USE_DYNAMIC_GROUP_LOCKS) {
 			litmus_dgl_unlock(dgl, dgl_size);
 		}
 		else {
 			for(i = dgl_size - 1; i >= 0; --i) {
 				litmus_unlock(dgl[i]);
 			}
 		}

 		//printf("[%d]: freeing kexclu replica %d.\n", ctx->id, replica);
 		//fflush(stdout);

 		litmus_unlock(ctx->kexclu);

 		last_replica = replica;

 		if(SLEEP_BETWEEN_JOBS && !do_exit) {
@@ -567,7 +567,7 @@ void* rt_thread(void* _ctx)
 	 */
 	TH_CALL( task_mode(BACKGROUND_TASK) );

 	for(i = 0; i < sizeof(ctx->mig_count)/sizeof(ctx->mig_count[0]); ++i)
 	{
 		printf("[%d]: mig_count[%d] = %d\n", ctx->id, i, ctx->mig_count[i]);
 	}
@@ -608,15 +608,15 @@ void* rt_thread(void* _ctx)
 //}


 void dirty_kb(int kb)
 {
 	int32_t one_kb[256];
 	int32_t sum = 0;
 	int32_t i;

 	if(!kb)
 		return;

 	for (i = 0; i < 256; i++)
 		sum += one_kb[i];
 	kb--;
@@ -630,9 +630,9 @@ void dirty_kb(int kb)
 int job(struct thread_context* ctx, int runfactor)
 {
 	//struct timespec tosleep = {0, 100000};  // 0.1 ms

 	//printf("[%d]: runfactor = %d\n", ctx->id, runfactor);

 	//dirty_kb(8 * runfactor);
 	dirty_kb(1 * runfactor);
 	//nanosleep(&tosleep, NULL);
diff --git a/gpu/locktest.c b/gpu/locktest.c
index bc4fc54..6a1219a 100644
--- a/gpu/locktest.c
+++ b/gpu/locktest.c
@@ -177,7 +177,7 @@ void* rt_thread(void* _ctx)
 	return NULL;
 }

 void dirty_kb(int kb)
 {
 	int32_t one_kb[256];
 	int32_t sum = 0;
diff --git a/gpu/nested.c b/gpu/nested.c
index 8c39152..edec46b 100644
--- a/gpu/nested.c
+++ b/gpu/nested.c
@@ -180,7 +180,7 @@ void* rt_thread(void* _ctx)
 		int first = (int)(NUM_SEMS * (rand_r(&(ctx->rand)) / (RAND_MAX + 1.0)));
 		int count = NEST_DEPTH;
 		do_exit = nested_job(ctx, &count, &first);

 		if(SLEEP_BETWEEN_JOBS && !do_exit) {
 			sleep_next_period();
 		}
@@ -226,7 +226,7 @@ int nested_job(struct thread_context* ctx, int *count, int *next)



 void dirty_kb(int kb)
 {
 	int32_t one_kb[256];
 	int32_t sum = 0;
diff --git a/gpu/rtspin_fake_cuda.cpp b/gpu/rtspin_fake_cuda.cpp
index 78e4f60..247a74c 100644
--- a/gpu/rtspin_fake_cuda.cpp
+++ b/gpu/rtspin_fake_cuda.cpp
@@ -119,7 +119,7 @@ char *h_state_data = 0;
 		mmap(NULL, s , \
 			PROT_READ | PROT_WRITE, \
 			MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, \
 			-1, 0)
 #else
 #define c_malloc(s) malloc(s)
 #endif
@@ -144,38 +144,38 @@ cudaError_t cudaGetLastError()
 ////////////////////////////////////////////////////////////////////////

 struct ce_lock_state
 {
 	int locks[2];
 	size_t num_locks;
 	size_t budget_remaining;
 	bool locked;

 	ce_lock_state(int device_a, enum cudaMemcpyKind kind, size_t size, int device_b = -1) {
 		num_locks = (device_a != -1) + (device_b != -1);

 		if(device_a != -1) {
 			locks[0] = (kind == cudaMemcpyHostToDevice) ?
 				CE_SEND_LOCKS[device_a] : CE_RECV_LOCKS[device_a];
 		}

 		if(device_b != -1) {
 			assert(kind == cudaMemcpyDeviceToDevice);

 			locks[1] = CE_RECV_LOCKS[device_b];

 			if(locks[1] < locks[0]) {
 				int temp = locks[1];
 				locks[1] = locks[0];
 				locks[0] = temp;
 			}
 		}

 		if(!ENABLE_CHUNKING)
 			budget_remaining = size;
 		else
 			budget_remaining = CHUNK_SIZE;
 	}

 	void lock() {
 		if(USE_DYNAMIC_GROUP_LOCKS) {
 			litmus_dgl_lock(locks, num_locks);
@@ -189,7 +189,7 @@ struct ce_lock_state
 		}
 		locked = true;
 	}

 	void unlock() {
 		if(USE_DYNAMIC_GROUP_LOCKS) {
 			litmus_dgl_unlock(locks, num_locks);
@@ -204,15 +204,15 @@ struct ce_lock_state
 		}
 		locked = false;
 	}

 	void refresh() {
 		budget_remaining = CHUNK_SIZE;
 	}

 	bool budgetIsAvailable(size_t tosend) {
 		return(tosend >= budget_remaining);
 	}

 	void decreaseBudget(size_t spent) {
 		budget_remaining -= spent;
 	}
@@ -225,53 +225,53 @@ cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count,
 {
 	cudaError_t ret = cudaSuccess;
 	int remaining = count;

 	char* dst = (char*)a_dst;
 	const char* src = (const char*)a_src;

 	// disable chunking, if needed, by setting chunk_size equal to the
 	// amount of data to be copied.
 	int chunk_size = (ENABLE_CHUNKING) ? CHUNK_SIZE : count;
 	int i = 0;

 	while(remaining != 0)
 	{
 		int bytesToCopy = std::min(remaining, chunk_size);

 		if(state && state->budgetIsAvailable(bytesToCopy) && state->locked) {
 			//cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) );
 			ret = cudaGetLastError();

 			if(ret != cudaSuccess)
 			{
 				break;
 			}

 			state->unlock();
 			state->refresh(); // replentish.
 					  // we can only run out of
 					  // budget if chunking is enabled.
 					  // we presume that init budget would
 					  // be set to cover entire memcpy
 					  // if chunking were disabled.
 		}

 		if(state && !state->locked) {
 			state->lock();
 		}

 		//ret = cudaMemcpy(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind);
 		//cudaMemcpyAsync(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind, streams[CUR_DEVICE]);

 		if(state) {
 			state->decreaseBudget(bytesToCopy);
 		}

 //		if(ret != cudaSuccess)
 //		{
 //			break;
 //		}

 		++i;
 		remaining -= bytesToCopy;
 	}
@@ -281,7 +281,7 @@ cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count,
 cudaError_t chunkMemcpy(void* a_dst, const void* a_src, size_t count,
 				enum cudaMemcpyKind kind,
 				int device_a = -1,  // device_a == -1 disables locking
 				bool do_locking = true,
 				int device_b = -1)
 {
 	cudaError_t ret;
@@ -317,7 +317,7 @@ inline uint64_t timespec_to_ns(const struct timespec& t)
 inline struct timespec ns_to_timespec(const uint64_t& ns)
 {
 	struct timespec temp = {ns/1e9, ns - ns/1e9};
 	return(temp);
 }

 inline uint64_t clock_gettime_ns(clockid_t clk_id)
@@ -366,9 +366,9 @@ static void allocate_locks()
 {
 	// allocate k-FMLP lock
 	int fd = open("semaphores", O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);

 	int base_name = GPU_PARTITION * 1000;

 	if(USE_KFMLP) {
 		KEXCLU_LOCK = open_kfmlp_gpu_sem(fd,
 						base_name,  /* name */
@@ -397,7 +397,7 @@ static void allocate_locks()
 //						NUM_SIMULT_USERS,
 //						ENABLE_AFFINITY,
 //						RELAX_FIFO_MAX_LEN
 //						);
 	}
 	if(KEXCLU_LOCK < 0)
 		perror("open_kexclu_sem");
@@ -406,31 +406,31 @@ static void allocate_locks()
 	{
 		open_sem_t opensem = (!USE_PRIOQ) ? open_fifo_sem : open_prioq_sem;
 		const char* opensem_label = (!USE_PRIOQ) ? "open_fifo_sem" : "open_prioq_sem";

 		// allocate the engine locks.
 		for (int i = 0; i < MAX_GPUS; ++i)
 		{
 			EE_LOCKS[i] = opensem(fd, (i+1)*10 + base_name);
 			if(EE_LOCKS[i] < 0)
 				perror(opensem_label);

 			CE_SEND_LOCKS[i] = opensem(fd, (i+1)*10 + base_name + 1);
 			if(CE_SEND_LOCKS[i] < 0)
 				perror(opensem_label);

 			if(NUM_SIMULT_USERS == 3)
 			{
 				// allocate a separate lock for the second copy engine
 				CE_RECV_LOCKS[i] = opensem(fd, (i+1)*10 + base_name + 2);
 				if(CE_RECV_LOCKS[i] < 0)
 					perror(opensem_label);
 			}
 			else
 			{
 				// share a single lock for the single copy engine
 				CE_RECV_LOCKS[i] = CE_SEND_LOCKS[i];
 			}
 		}
 	}
 }

@@ -449,22 +449,22 @@ static void allocate_host_memory()
 //		h_send_data = (char *)c_malloc(send_alloc_bytes);
 //		memset(h_send_data, 0x55, send_alloc_bytes);  // write some random value
 //		// this will open a connection to GPU 0 if there is no active context, so
 //		// expect long stalls.  LAME.
 //		cutilSafeCall( cudaHostRegister(h_send_data, send_alloc_bytes, cudaHostRegisterPortable) );
 //	}
 //
 //	if(recv_alloc_bytes > 0)
 //	{
 //		h_recv_data = (char *)c_malloc(recv_alloc_bytes);
 //		memset(h_recv_data, 0xAA, recv_alloc_bytes);
 //		cutilSafeCall( cudaHostRegister(h_recv_data, recv_alloc_bytes, cudaHostRegisterPortable) );
 //	}
 //
 //	if(state_alloc_bytes > 0)
 //	{
 //		h_state_data = (char *)c_malloc(state_alloc_bytes);
 //		memset(h_state_data, 0xCC, state_alloc_bytes);  // write some random value
 //		cutilSafeCall( cudaHostRegister(h_state_data, state_alloc_bytes, cudaHostRegisterPortable) );
 //	}

 	printf("Host memory allocated.\n");
@@ -477,28 +477,28 @@ static void allocate_device_memory()
 //	for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
 //	{
 //		int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
 //
 //		if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].lock();
 //
 //		cutilSafeCall( cudaSetDevice(which_device) );
 //		cutilSafeCall( cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0) );
 //		cutilSafeCall( cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0) );
 //
 //		cutilSafeCall( cudaStreamCreate(&streams[which_device]) );
 //
 //		/* pre-allocate memory, pray there's enough to go around */
 //		if(SEND_SIZE > 0) {
 //			cutilSafeCall( cudaMalloc((void**)&d_send_data[which_device], SEND_SIZE) );
 //		}
 //		if(RECV_SIZE > 0) {
 //			cutilSafeCall( cudaMalloc((void**)&h_recv_data[which_device], RECV_SIZE) );
 //		}
 //		if(STATE_SIZE > 0) {
 //			cutilSafeCall( cudaMalloc((void**)&h_state_data[which_device], STATE_SIZE) );
 //		}
 //
 //		if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].unlock();
 //	}
 	printf("Device memory allocated.\n");
 }

@@ -508,39 +508,39 @@ static void configure_gpus()

 //	// SUSPEND WHEN BLOCKED!!
 //	cutilSafeCall( cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync) );
 //
 //	// establish a connection to each GPU.
 //	for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
 //	{
 //		int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
 //
 //		if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].lock();
 //
 //		cutilSafeCall( cudaSetDevice(which_device) );
 //		cutilSafeCall( cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0) );
 //		cutilSafeCall( cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0) );
 //
 //		cutilSafeCall( cudaStreamCreate(&streams[which_device]) );
 //
 //		// enable P2P migrations.
 //		// we assume all GPUs are on the same I/O hub.
 //		for(int j = 0; j < GPU_PARTITION_SIZE; ++j)
 //		{
 //			int other_device = GPU_PARTITION*GPU_PARTITION_SIZE + j;
 //
 //			if(which_device != other_device)
 //			{
 //				cutilSafeCall( cudaDeviceEnablePeerAccess(other_device, 0) );
 //			}
 //		}
 //
 //		if(i == 0)
 //		{
 //			struct cudaDeviceProp pi;
 //			cudaGetDeviceProperties(&pi, i);
 //			gpuCyclesPerSecond = pi.clockRate * 1000;  /* khz -> hz */
 //		}
 //
 //		if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].unlock();
 //	}

@@ -580,7 +580,7 @@ static void catchExit(void)
 		for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
 		{
 			int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;

 			litmus_unlock(EE_LOCKS[which_device]);
 			litmus_unlock(CE_SEND_LOCKS[which_device]);
 			if(NUM_SIMULT_USERS == 2) {
@@ -588,11 +588,11 @@ static void catchExit(void)
 			}
 		}
 	}

 	if(CUR_DEVICE >= 0) {
 		unregister_nv_device(CUR_DEVICE);
 	}

 	litmus_unlock(KEXCLU_LOCK);
 	}
 }
@@ -604,18 +604,18 @@ static void migrateToGPU(int destination)
 		if(MIGRATE_VIA_SYSMEM)
 		{
 			chunkMemcpy(h_state_data, d_state_data[LAST_DEVICE], STATE_SIZE,
 						cudaMemcpyDeviceToHost, LAST_DEVICE, useEngineLocks());
 		}
 	}

 //	cutilSafeCall( cudaSetDevice(destination) );

 	if(!BROADCAST_STATE && STATE_SIZE > 0)
 	{
 		if(MIGRATE_VIA_SYSMEM)
 		{
 			chunkMemcpy(d_state_data[CUR_DEVICE], h_state_data, STATE_SIZE,
 						cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks());
 		}
 		else
 		{
@@ -633,15 +633,15 @@ static void migrateToGPU(int destination)
 static void broadcastState(int from)
 {
 	if(STATE_SIZE > 0)
 	{
 		assert(CUR_DEVICE == from);

 		if(MIGRATE_VIA_SYSMEM)
 		{
 			chunkMemcpy(h_state_data, d_state_data[from], STATE_SIZE,
 						cudaMemcpyDeviceToHost, from, useEngineLocks());
 		}

 		for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
 		{
 			int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
@@ -652,7 +652,7 @@ static void broadcastState(int from)
 //				cutilSafeCall( cudaSetDevice(which_device) );
 				CUR_DEVICE = which_device;  // temporary
 				chunkMemcpy(d_state_data[which_device], h_state_data, STATE_SIZE,
 							cudaMemcpyHostToDevice, which_device, useEngineLocks());
 			}
 			else
 			{
@@ -662,11 +662,11 @@ static void broadcastState(int from)
 							cudaMemcpyDeviceToDevice,
 							from,
 							useEngineLocks(),
 							which_device);
 			}
 		}
 	}

 	if(MIGRATE_VIA_SYSMEM && CUR_DEVICE != from)
 	{
 //		cutilSafeCall( cudaSetDevice(from) );
@@ -714,18 +714,18 @@ static void gpu_loop_for(double gpu_sec_time, double emergency_exit)
 	}

 	if(useEngineLocks()) litmus_lock(EE_LOCKS[CUR_DEVICE]);

 //	docudaspin <<<numblocks,blocksz, 0, streams[CUR_DEVICE]>>> (numcycles);
 //	cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) );

 	if(useEngineLocks()) litmus_unlock(EE_LOCKS[CUR_DEVICE]);

 	if(RECV_SIZE > 0)
 	{
 		chunkMemcpy(h_recv_data, d_recv_data[CUR_DEVICE], RECV_SIZE,
 					cudaMemcpyDeviceToHost, CUR_DEVICE, useEngineLocks());
 	}

 	if(BROADCAST_STATE)
 	{
 		broadcastState(CUR_DEVICE);
@@ -802,7 +802,7 @@ int main(int argc, char** argv)
 	int num_tasks = 0;

 	double gpu_sec_ms = 0;

 	while ((opt = getopt(argc, argv, OPTSTR)) != -1) {
 //		printf("opt = %c optarg = %s\n", opt, optarg);
 		switch (opt) {
@@ -858,7 +858,7 @@ int main(int argc, char** argv)
 			break;
 		case 'r':
 			RELAX_FIFO_MAX_LEN = true;
 			break;
 		case 'L':
 			USE_KFMLP = true;
 			break;
@@ -949,13 +949,13 @@ int main(int argc, char** argv)
 		{
 			printf("%d creating release shared memory\n", getpid());
 			shared_memory_object::remove("release_barrier_memory");
 			release_segment_ptr = new managed_shared_memory(create_only, "release_barrier_memory", 4*1024);

 			printf("%d creating release barrier for %d users\n", getpid(), num_tasks);
 			release_barrier = release_segment_ptr->construct<barrier>("barrier release_barrier")(num_tasks);

 			init_release_time = release_segment_ptr->construct<uint64_t>("uint64_t instance")();
 			*init_release_time = 0;
 		}
 		else
 		{
@@ -972,13 +972,13 @@ int main(int argc, char** argv)
 					sleep(1);
 				}
 			}while(segment_ptr == NULL);

 			release_barrier = segment_ptr->find<barrier>("barrier release_barrier").first;
 			init_release_time = segment_ptr->find<uint64_t>("uint64_t instance").first;
 		}
 	}


 	if(GPU_TASK)
 	{
 		if(ENABLE_WAIT)
@@ -1019,7 +1019,7 @@ int main(int argc, char** argv)
 		SEND_SIZE *= scale;
 		RECV_SIZE *= scale;
 		STATE_SIZE *= scale;

 		init_cuda();
 	}

@@ -1036,16 +1036,16 @@ int main(int argc, char** argv)
 	if (ret != 0)
 		bail_out("could not become RT task");



 	uint64_t jobCount = 0;
 	blitz::Array<uint64_t, 1> responseTimeLog(num_jobs+1);

 	struct timespec spec;
 	uint64_t release;
 	uint64_t finish;


 	if (ENABLE_WAIT) {
 		printf("Waiting for release.\n");
 		ret = wait_for_ts_release();
@@ -1056,14 +1056,14 @@ int main(int argc, char** argv)
 		{
 			sleep_next_period();
 		}

 		clock_gettime(CLOCK_MONOTONIC, &spec);
 		release = timespec_to_ns(spec);
 		if (!__sync_bool_compare_and_swap(init_release_time, 0, release))
 		{
 			release = *init_release_time;
 		}

 		releaseTime = wctime();
 		double failsafeEnd = releaseTime + duration;

@@ -1087,7 +1087,7 @@ int main(int argc, char** argv)
 			clock_gettime(CLOCK_MONOTONIC, &spec);
 			finish = timespec_to_ns(spec);

 			responseTimeLog(min(num_jobs,jobCount++)) = finish - release;

 			// this is an estimated upper-bound on release time.  it may be off by several microseconds.
 #ifdef RESET_RELEASE_ON_MISS
@@ -1097,11 +1097,11 @@ int main(int argc, char** argv)
 #else
 			release = release + period;  // allow things to get progressively later.
 #endif

 			sleep_next_period();
 			clock_gettime(CLOCK_MONOTONIC, &spec);
 			release = min(timespec_to_ns(spec), release);

 		} while(keepGoing);
 	}

@@ -1147,13 +1147,13 @@ int main(int argc, char** argv)
 		}
 	}


 	if (ENABLE_WAIT)
 	{
 		printf("%d waiting at exit barrier\n", getpid());
 		release_barrier->wait();
 	}


 	char gpu_using_str[] = "GPU\n";
 	char cpu_only_str[] = "CPU\n";
@@ -1166,7 +1166,7 @@ int main(int argc, char** argv)
 		// average
 		blitz::mean(USED(responseTimeLog)),
 		// average pct of period
 		100.0*(blitz::mean(USED(responseTimeLog))/period),
 		// min
 		blitz::min(USED(responseTimeLog)),
 		// max
@@ -1182,6 +1182,6 @@ int main(int argc, char** argv)
 		// flag gpu-using tasks
 		((GPU_TASK) ? gpu_using_str : cpu_only_str)
 		);

 	return 0;
 }