updates for further litmus development

author: Glenn Elliott <gelliott@cs.unc.edu> 2013-04-16 15:08:44 -0400
committer: Glenn Elliott <gelliott@cs.unc.edu> 2013-04-16 15:08:44 -0400
commit: 76b0d79069973bd58cda6028c65a9edaa6d2ea73 (patch)
tree: ae1a121c81d23d95e9c7d5478f2a13ec33eb465c
parent: 346efb91841609a8c1f2ecf40efd692ced0565cb (diff)
4 files changed, 69 insertions, 43 deletions
diff --git a/Makefile b/Makefile
index 720a585..f50af0f 100644
--- a/Makefile
+++ b/Makefile
@@ -57,8 +57,8 @@ headers = -I${LIBLITMUS}/include -I${LIBLITMUS}/arch/${include-${ARCH}}/include
 # combine options
 CPPFLAGS = ${flags-api} ${flags-debug-cpp} ${flags-misc} ${flags-${ARCH}} -DARCH=${ARCH} ${headers}
-#CUFLAGS  = ${flags-api} ${flags-cu-debug} ${flags-cu-optim} ${flags-cu-nvcc} ${flags-cu-misc} -DARCH=${ARCH} ${headers}
+CUFLAGS  = ${flags-api} ${flags-cu-debug} ${flags-cu-optim} ${flags-cu-nvcc} ${flags-cu-misc} -DARCH=${ARCH} ${headers}
-CUFLAGS  = ${flags-api} ${flags-cu-optim} ${flags-cu-nvcc} ${flags-cu-misc} -DARCH=${ARCH} ${headers}
+#CUFLAGS  = ${flags-api} ${flags-cu-optim} ${flags-cu-nvcc} ${flags-cu-misc} -DARCH=${ARCH} ${headers}
 CFLAGS   = ${flags-debug} ${flags-misc}
 LDFLAGS  = ${flags-${ARCH}}
diff --git a/gpu/budget.cpp b/gpu/budget.cpp
index 8a2546a..eebb14e 100644
--- a/gpu/budget.cpp
+++ b/gpu/budget.cpp
@@ -165,7 +165,7 @@ int job(lt_t exec_ns, lt_t budget_ns)
        return 1;
 }
-#define OPTSTR "SbosOvalwqixdn:r:"
+#define OPTSTR "SbosOvzalwqixdn:r:"
 int main(int argc, char** argv)
 {
@@ -215,6 +215,9 @@ int main(int argc, char** argv)
                case 'v':
                        drain_policy = DRAIN_SOBLIV;
                        break;
+                case 'z':
+                        drain_policy = DRAIN_SIMPLE_IO;
+                        break;
                case 'l':
                        CXS_OVERRUN = 1;
                        NAMESPACE = open("semaphores", O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
diff --git a/gpu/gpuspin.cu b/gpu/gpuspin.cu
index 414e074..b096c82 100644
--- a/gpu/gpuspin.cu
+++ b/gpu/gpuspin.cu
@@ -66,6 +66,7 @@ int CE_MIGR_SEND_LOCKS[NR_GPUS];
 int CE_MIGR_RECV_LOCKS[NR_GPUS];
 bool RESERVED_MIGR_COPY_ENGINE = false;  // only checked if NUM_COPY_ENGINES == 2
+//bool ENABLE_RT_AUX_THREADS = false;
 bool ENABLE_RT_AUX_THREADS = true;
 enum eGpuSyncMode
@@ -874,8 +875,10 @@ static void catch_exit(int catch_exit)
+#ifdef VANILLA_LINUX
 static float ms_sum;
 static int gpucount = 0;
+#endif
 __global__ void docudaspin(float* data, /*unsigned int* iterations,*/ unsigned int num_elem, unsigned int cycles)
 {
@@ -915,29 +918,30 @@ __global__ void docudaspin(float* data, /*unsigned int* iterations,*/ unsigned i
        return;
 }
-static void gpu_loop_for(double gpu_sec_time, double emergency_exit)
+static void gpu_loop_for(double gpu_sec_time, unsigned int num_kernels, double emergency_exit)
 {
        int next_gpu;
-        
        if (emergency_exit && wctime() > emergency_exit)
                goto out;
        
        next_gpu = litmus_lock(TOKEN_LOCK);
        {
                MigrateIfNeeded(next_gpu);
-                
+                unsigned int numcycles = ((unsigned int)(cur_hz() * gpu_sec_time))/num_kernels;
-                unsigned int numcycles = (unsigned int)(cur_hz() * gpu_sec_time);
-                
                if(SEND_SIZE > 0)
                        chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE,
                                                cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks());
                
-                if(useEngineLocks()) litmus_lock(cur_ee());
+                for(unsigned int i = 0; i < num_kernels; ++i)
-                /* one block per sm, one warp per block */
+                {
-                docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles);
+                        if(useEngineLocks()) litmus_lock(cur_ee());
-//              docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], d_iteration_count[cur_gpu()], cur_elem_per_thread(), numcycles);           
+                        /* one block per sm, one warp per block */
-                cudaStreamSynchronize(cur_stream());
+                        docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles);
-                if(useEngineLocks()) litmus_unlock(cur_ee());
+                        cudaStreamSynchronize(cur_stream());
+                        if(useEngineLocks()) litmus_unlock(cur_ee());
+                }
                
                if(RECV_SIZE > 0)
                        chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE,
@@ -954,12 +958,13 @@ out:
        return;
 }
-static void gpu_loop_for_linux(double gpu_sec_time, double emergency_exit)
+static void gpu_loop_for_linux(double gpu_sec_time, unsigned int num_kernels, double emergency_exit)
 {
        static int GPU_OFFSET = GPU_PARTITION * GPU_PARTITION_SIZE;
        static gpu_pool *pool = &GPU_LINUX_SEM_POOL[GPU_PARTITION];
        static pthread_mutex_t *mutex = &GPU_LINUX_MUTEX_POOL[GPU_PARTITION];
        
+#ifdef VANILLA_LINUX
        static bool once = false;
        static cudaEvent_t start, end;
        float ms;
@@ -969,6 +974,7 @@ static void gpu_loop_for_linux(double gpu_sec_time, double emergency_exit)
                cudaEventCreate(&start);
                cudaEventCreate(&end);
        }
+#endif
        int next_gpu;
        
@@ -979,29 +985,33 @@ static void gpu_loop_for_linux(double gpu_sec_time, double emergency_exit)
        {
                MigrateIfNeeded(next_gpu);
                
-                unsigned int numcycles = (unsigned int)(cur_hz() * gpu_sec_time);
+                unsigned int numcycles = ((unsigned int)(cur_hz() * gpu_sec_time))/num_kernels;
                
                if(SEND_SIZE > 0)
                        chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE,
                                                cudaMemcpyHostToDevice, cur_gpu(), useEngineLocks());
                
-                /* one block per sm, one warp per block */
+                for(unsigned int i = 0; i < num_kernels; ++i)
-                cudaEventRecord(start, cur_stream());
+                {
-                docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles);
+                        /* one block per sm, one warp per block */
-//              docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], d_iteration_count[cur_gpu()], cur_elem_per_thread(), numcycles);           
+#ifdef VANILLA_LINUX
-                cudaEventRecord(end, cur_stream());
+                        cudaEventRecord(start, cur_stream());
-                cudaEventSynchronize(end);
+#endif
-                cudaStreamSynchronize(cur_stream());
+                        docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles);
+#ifdef VANILLA_LINUX
+                        cudaEventRecord(end, cur_stream());
+                        cudaEventSynchronize(end);
+#endif
+                        cudaStreamSynchronize(cur_stream());
                
-//              chunkMemcpy(this_gpu(h_iteration_count), this_gpu(d_iteration_count), sizeof(unsigned int),
+#ifdef VANILLA_LINUX
-//                                      cudaMemcpyDeviceToHost, cur_gpu(), useEngineLocks());
+                        cudaEventElapsedTime(&ms, start, end);
-//              
+                        ms_sum += ms;
-                cudaEventElapsedTime(&ms, start, end);
+#endif
-                ms_sum += ms;
+                }
+#ifdef VANILLA_LINUX
                ++gpucount;
-//              printf("%f\n", ms);
+#endif
-//              printf("%f: %u\n", ms, this_gpu(h_iteration_count)[0]);
                
                if(RECV_SIZE > 0)
                        chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE,
@@ -1166,7 +1176,7 @@ static void debug_delay_loop(void)
        }
 }
-static int gpu_job(double exec_time, double gpu_exec_time, double program_end)
+static int gpu_job(double exec_time, double gpu_exec_time, unsigned int num_kernels, double program_end)
 {
        double chunk1, chunk2;
@@ -1178,7 +1188,7 @@ static int gpu_job(double exec_time, double gpu_exec_time, double program_end)
                chunk2 = exec_time - chunk1;
                
                loop_for(chunk1, program_end + 1);
-                gpu_loop_for(gpu_exec_time, program_end + 1);
+                gpu_loop_for(gpu_exec_time, num_kernels, program_end + 1);
                loop_for(chunk2, program_end + 1);
                
                sleep_next_period();
@@ -1243,7 +1253,7 @@ static void init_linux()
        mlockall(MCL_CURRENT | MCL_FUTURE);
 }
-static int gpu_job_linux(double exec_time, double gpu_exec_time, double program_end)
+static int gpu_job_linux(double exec_time, double gpu_exec_time, unsigned int num_kernels, double program_end)
 {
        double chunk1, chunk2;
        
@@ -1255,7 +1265,7 @@ static int gpu_job_linux(double exec_time, double gpu_exec_time, double program_
                chunk2 = exec_time - chunk1;
                loop_for(chunk1, program_end + 1);
-                gpu_loop_for_linux(gpu_exec_time, program_end + 1);
+                gpu_loop_for_linux(gpu_exec_time, num_kernels, program_end + 1);
                loop_for(chunk2, program_end + 1);
                
                sleep_next_period_linux();
@@ -1284,8 +1294,8 @@ enum eScheduler
        RT_LINUX
 };
-#define CPU_OPTIONS "p:z:c:wlveio:f:s:q:X:L:Q:"
+#define CPU_OPTIONS "p:z:c:wlveio:f:s:q:X:L:Q:d"
-#define GPU_OPTIONS "g:y:r:C:E:dG:xS:R:T:Z:aFm:b:MNI"
+#define GPU_OPTIONS "g:y:r:C:E:DG:xS:R:T:Z:aFm:b:MNIk:"
 // concat the option strings
 #define OPTSTR CPU_OPTIONS GPU_OPTIONS
@@ -1310,7 +1320,7 @@ int main(int argc, char** argv)
        double duration = 0, start = 0;
        double *exec_times = NULL;
        double scale = 1.0;
-        task_class_t cls = RT_CLASS_HARD;
+        task_class_t cls = RT_CLASS_SOFT;
        int cur_job = 0, num_jobs = 0;
        struct rt_task param;
@@ -1318,7 +1328,9 @@ int main(int argc, char** argv)
        lt_t budget;
        
        int num_gpu_users = 0;
-        
+        unsigned int num_kernels = 1;
+        budget_drain_policy_t drain = DRAIN_SIMPLE;
        
        eScheduler scheduler = LITMUS;
        
@@ -1365,7 +1377,7 @@ int main(int argc, char** argv)
                        ENGINE_LOCK_TYPE = (eEngineLockTypes)atoi(optarg);
                        assert(ENGINE_LOCK_TYPE == FIFO || ENGINE_LOCK_TYPE == PRIOQ);
                        break;
-                case 'd':
+                case 'D':
                        USE_DYNAMIC_GROUP_LOCKS = true;
                        break;
                case 'G':
@@ -1401,6 +1413,9 @@ int main(int argc, char** argv)
                        num_gpu_users = atoi(optarg);
                        assert(num_gpu_users > 0);
                        break;
+                case 'k':
+                        num_kernels = (unsigned int)atoi(optarg);
+                        break;
                case 'b':
                        budget_ms = atoi(optarg);
                        break;
@@ -1424,6 +1439,9 @@ int main(int argc, char** argv)
                case 'i':
                        want_signals = 1;
                        break;
+                case 'd':
+                        drain = DRAIN_SOBLIV;
+                        break;
                case 'l':
                        test_loop = 1;
                        break;
@@ -1475,7 +1493,7 @@ int main(int argc, char** argv)
                ENABLE_AFFINITY = false;
                RELAX_FIFO_MAX_LEN = false;
                ENABLE_RT_AUX_THREADS = false;          
-                budget_ms = -1;
+                budget_ms = -1.0;
                want_enforcement = 0;
                want_signals = 0;
                
@@ -1551,7 +1569,7 @@ int main(int argc, char** argv)
        if (GPU_USING && gpu_wcet_ms <= 0)
                usage("The worst-case gpu execution time must be a positive number.");
-        if (budget_ms > 0)
+        if (budget_ms > 0.0)
                budget = ms2ns(budget_ms);
        else
                budget = wcet;
@@ -1586,6 +1604,7 @@ int main(int argc, char** argv)
                        PRECISE_ENFORCEMENT : NO_ENFORCEMENT;
        param.budget_signal_policy = (want_enforcement && want_signals) ?
                        PRECISE_SIGNALS : NO_SIGNALS;
+        param.drain_policy = drain;
        param.release_policy = PERIODIC;
                                
        if (migrate)
@@ -1665,6 +1684,7 @@ int main(int argc, char** argv)
                else {
                        while (gpu_job(wcet_ms * 0.001 * scale,
                                                   gpu_wcet_ms * 0.001 * scale,
+                                                   num_kernels,
                                                   start + duration));
                }
        }
@@ -1676,6 +1696,7 @@ int main(int argc, char** argv)
                else {
                        while (gpu_job_linux(wcet_ms * 0.001 * scale,
                                                   gpu_wcet_ms * 0.001 * scale,
+                                                   num_kernels,
                                                   start + duration));
                }
        }
@@ -1710,7 +1731,7 @@ int main(int argc, char** argv)
                exit_cuda();
-                printf("avg: %f\n", ms_sum/gpucount);
+//              printf("avg: %f\n", ms_sum/gpucount);
        }
        
        if (file)
diff --git a/src/litmus.c b/src/litmus.c
index 213ac3f..70f7fb6 100644
--- a/src/litmus.c
+++ b/src/litmus.c
@@ -113,6 +113,8 @@ void init_rt_task_param(struct rt_task* tp)
        tp->cls = RT_CLASS_SOFT;
        tp->priority = LITMUS_LOWEST_PRIORITY;
        tp->budget_policy = NO_ENFORCEMENT;
+        tp->drain_policy = DRAIN_SIMPLE;
+        tp->budget_signal_policy = NO_SIGNALS;
        tp->release_policy = SPORADIC;
 }
author	Glenn Elliott <gelliott@cs.unc.edu>	2013-04-16 15:08:44 -0400
committer	Glenn Elliott <gelliott@cs.unc.edu>	2013-04-16 15:08:44 -0400
commit	76b0d79069973bd58cda6028c65a9edaa6d2ea73 (patch)
tree	ae1a121c81d23d95e9c7d5478f2a13ec33eb465c
parent	346efb91841609a8c1f2ecf40efd692ced0565cb (diff)

diff --git a/Makefile b/Makefile index 720a585..f50af0f 100644 --- a/Makefile +++ b/Makefile
@@ -57,8 +57,8 @@ headers = -I${LIBLITMUS}/include -I${LIBLITMUS}/arch/${include-${ARCH}}/include
57		57
58	# combine options	58	# combine options
59	CPPFLAGS = ${flags-api} ${flags-debug-cpp} ${flags-misc} ${flags-${ARCH}} -DARCH=${ARCH} ${headers}	59	CPPFLAGS = ${flags-api} ${flags-debug-cpp} ${flags-misc} ${flags-${ARCH}} -DARCH=${ARCH} ${headers}
60	#CUFLAGS = ${flags-api} ${flags-cu-debug} ${flags-cu-optim} ${flags-cu-nvcc} ${flags-cu-misc} -DARCH=${ARCH} ${headers}	60	CUFLAGS = ${flags-api} ${flags-cu-debug} ${flags-cu-optim} ${flags-cu-nvcc} ${flags-cu-misc} -DARCH=${ARCH} ${headers}
61	CUFLAGS = ${flags-api} ${flags-cu-optim} ${flags-cu-nvcc} ${flags-cu-misc} -DARCH=${ARCH} ${headers}	61	#CUFLAGS = ${flags-api} ${flags-cu-optim} ${flags-cu-nvcc} ${flags-cu-misc} -DARCH=${ARCH} ${headers}
62	CFLAGS = ${flags-debug} ${flags-misc}	62	CFLAGS = ${flags-debug} ${flags-misc}
63	LDFLAGS = ${flags-${ARCH}}	63	LDFLAGS = ${flags-${ARCH}}
64		64


diff --git a/gpu/budget.cpp b/gpu/budget.cpp index 8a2546a..eebb14e 100644 --- a/gpu/budget.cpp +++ b/gpu/budget.cpp
@@ -165,7 +165,7 @@ int job(lt_t exec_ns, lt_t budget_ns)
165	return 1;	165	return 1;
166	}	166	}
167		167
168	#define OPTSTR "SbosOvalwqixdn:r:"	168	#define OPTSTR "SbosOvzalwqixdn:r:"
169		169
170	int main(int argc, char** argv)	170	int main(int argc, char** argv)
171	{	171	{
@@ -215,6 +215,9 @@ int main(int argc, char** argv)
215	case 'v':	215	case 'v':
216	drain_policy = DRAIN_SOBLIV;	216	drain_policy = DRAIN_SOBLIV;
217	break;	217	break;
		218	case 'z':
		219	drain_policy = DRAIN_SIMPLE_IO;
		220	break;
218	case 'l':	221	case 'l':
219	CXS_OVERRUN = 1;	222	CXS_OVERRUN = 1;
220	NAMESPACE = open("semaphores", O_RDONLY \| O_CREAT, S_IRUSR \| S_IWUSR);	223	NAMESPACE = open("semaphores", O_RDONLY \| O_CREAT, S_IRUSR \| S_IWUSR);


diff --git a/gpu/gpuspin.cu b/gpu/gpuspin.cu index 414e074..b096c82 100644 --- a/gpu/gpuspin.cu +++ b/gpu/gpuspin.cu
@@ -66,6 +66,7 @@ int CE_MIGR_SEND_LOCKS[NR_GPUS];
66	int CE_MIGR_RECV_LOCKS[NR_GPUS];	66	int CE_MIGR_RECV_LOCKS[NR_GPUS];
67	bool RESERVED_MIGR_COPY_ENGINE = false; // only checked if NUM_COPY_ENGINES == 2	67	bool RESERVED_MIGR_COPY_ENGINE = false; // only checked if NUM_COPY_ENGINES == 2
68		68
		69	//bool ENABLE_RT_AUX_THREADS = false;
69	bool ENABLE_RT_AUX_THREADS = true;	70	bool ENABLE_RT_AUX_THREADS = true;
70		71
71	enum eGpuSyncMode	72	enum eGpuSyncMode
@@ -874,8 +875,10 @@ static void catch_exit(int catch_exit)
874		875
875		876
876		877
		878	#ifdef VANILLA_LINUX
877	static float ms_sum;	879	static float ms_sum;
878	static int gpucount = 0;	880	static int gpucount = 0;
		881	#endif
879		882
880	__global__ void docudaspin(float* data, /unsigned int iterations,*/ unsigned int num_elem, unsigned int cycles)	883	__global__ void docudaspin(float* data, /unsigned int iterations,*/ unsigned int num_elem, unsigned int cycles)
881	{	884	{
@@ -915,29 +918,30 @@ __global__ void docudaspin(float* data, /unsigned int iterations,*/ unsigned i
915	return;	918	return;
916	}	919	}
917		920
918	static void gpu_loop_for(double gpu_sec_time, double emergency_exit)	921	static void gpu_loop_for(double gpu_sec_time, unsigned int num_kernels, double emergency_exit)
919	{	922	{
920	int next_gpu;	923	int next_gpu;
921		924
922	if (emergency_exit && wctime() > emergency_exit)	925	if (emergency_exit && wctime() > emergency_exit)
923	goto out;	926	goto out;
924		927
925	next_gpu = litmus_lock(TOKEN_LOCK);	928	next_gpu = litmus_lock(TOKEN_LOCK);
926	{	929	{
927	MigrateIfNeeded(next_gpu);	930	MigrateIfNeeded(next_gpu);
928		931	unsigned int numcycles = ((unsigned int)(cur_hz() * gpu_sec_time))/num_kernels;
929	unsigned int numcycles = (unsigned int)(cur_hz() * gpu_sec_time);	932
930
931	if(SEND_SIZE > 0)	933	if(SEND_SIZE > 0)
932	chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE,	934	chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE,
933	cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks());	935	cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks());
934		936
935	if(useEngineLocks()) litmus_lock(cur_ee());	937	for(unsigned int i = 0; i < num_kernels; ++i)
936	/* one block per sm, one warp per block */	938	{
937	docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles);	939	if(useEngineLocks()) litmus_lock(cur_ee());
938	// docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], d_iteration_count[cur_gpu()], cur_elem_per_thread(), numcycles);	940	/* one block per sm, one warp per block */
939	cudaStreamSynchronize(cur_stream());	941	docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles);
940	if(useEngineLocks()) litmus_unlock(cur_ee());	942	cudaStreamSynchronize(cur_stream());
		943	if(useEngineLocks()) litmus_unlock(cur_ee());
		944	}
941		945
942	if(RECV_SIZE > 0)	946	if(RECV_SIZE > 0)
943	chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE,	947	chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE,
@@ -954,12 +958,13 @@ out:
954	return;	958	return;
955	}	959	}
956		960
957	static void gpu_loop_for_linux(double gpu_sec_time, double emergency_exit)	961	static void gpu_loop_for_linux(double gpu_sec_time, unsigned int num_kernels, double emergency_exit)
958	{	962	{
959	static int GPU_OFFSET = GPU_PARTITION * GPU_PARTITION_SIZE;	963	static int GPU_OFFSET = GPU_PARTITION * GPU_PARTITION_SIZE;
960	static gpu_pool *pool = &GPU_LINUX_SEM_POOL[GPU_PARTITION];	964	static gpu_pool *pool = &GPU_LINUX_SEM_POOL[GPU_PARTITION];
961	static pthread_mutex_t *mutex = &GPU_LINUX_MUTEX_POOL[GPU_PARTITION];	965	static pthread_mutex_t *mutex = &GPU_LINUX_MUTEX_POOL[GPU_PARTITION];
962		966
		967	#ifdef VANILLA_LINUX
963	static bool once = false;	968	static bool once = false;
964	static cudaEvent_t start, end;	969	static cudaEvent_t start, end;
965	float ms;	970	float ms;
@@ -969,6 +974,7 @@ static void gpu_loop_for_linux(double gpu_sec_time, double emergency_exit)
969	cudaEventCreate(&start);	974	cudaEventCreate(&start);
970	cudaEventCreate(&end);	975	cudaEventCreate(&end);
971	}	976	}
		977	#endif
972		978
973	int next_gpu;	979	int next_gpu;
974		980
@@ -979,29 +985,33 @@ static void gpu_loop_for_linux(double gpu_sec_time, double emergency_exit)
979	{	985	{
980	MigrateIfNeeded(next_gpu);	986	MigrateIfNeeded(next_gpu);
981		987
982	unsigned int numcycles = (unsigned int)(cur_hz() * gpu_sec_time);	988	unsigned int numcycles = ((unsigned int)(cur_hz() * gpu_sec_time))/num_kernels;
983		989
984	if(SEND_SIZE > 0)	990	if(SEND_SIZE > 0)
985	chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE,	991	chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE,
986	cudaMemcpyHostToDevice, cur_gpu(), useEngineLocks());	992	cudaMemcpyHostToDevice, cur_gpu(), useEngineLocks());
987		993
988	/* one block per sm, one warp per block */	994	for(unsigned int i = 0; i < num_kernels; ++i)
989	cudaEventRecord(start, cur_stream());	995	{
990	docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles);	996	/* one block per sm, one warp per block */
991	// docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], d_iteration_count[cur_gpu()], cur_elem_per_thread(), numcycles);	997	#ifdef VANILLA_LINUX
992	cudaEventRecord(end, cur_stream());	998	cudaEventRecord(start, cur_stream());
993	cudaEventSynchronize(end);	999	#endif
994	cudaStreamSynchronize(cur_stream());	1000	docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles);
		1001	#ifdef VANILLA_LINUX
		1002	cudaEventRecord(end, cur_stream());
		1003	cudaEventSynchronize(end);
		1004	#endif
		1005	cudaStreamSynchronize(cur_stream());
995		1006
996	// chunkMemcpy(this_gpu(h_iteration_count), this_gpu(d_iteration_count), sizeof(unsigned int),	1007	#ifdef VANILLA_LINUX
997	// cudaMemcpyDeviceToHost, cur_gpu(), useEngineLocks());	1008	cudaEventElapsedTime(&ms, start, end);
998	//	1009	ms_sum += ms;
999	cudaEventElapsedTime(&ms, start, end);	1010	#endif
1000	ms_sum += ms;	1011	}
		1012	#ifdef VANILLA_LINUX
1001	++gpucount;	1013	++gpucount;
1002	// printf("%f\n", ms);	1014	#endif
1003	// printf("%f: %u\n", ms, this_gpu(h_iteration_count)[0]);
1004
1005		1015
1006	if(RECV_SIZE > 0)	1016	if(RECV_SIZE > 0)
1007	chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE,	1017	chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE,
@@ -1166,7 +1176,7 @@ static void debug_delay_loop(void)
1166	}	1176	}
1167	}	1177	}
1168		1178
1169	static int gpu_job(double exec_time, double gpu_exec_time, double program_end)	1179	static int gpu_job(double exec_time, double gpu_exec_time, unsigned int num_kernels, double program_end)
1170	{	1180	{
1171	double chunk1, chunk2;	1181	double chunk1, chunk2;
1172		1182
@@ -1178,7 +1188,7 @@ static int gpu_job(double exec_time, double gpu_exec_time, double program_end)
1178	chunk2 = exec_time - chunk1;	1188	chunk2 = exec_time - chunk1;
1179		1189
1180	loop_for(chunk1, program_end + 1);	1190	loop_for(chunk1, program_end + 1);
1181	gpu_loop_for(gpu_exec_time, program_end + 1);	1191	gpu_loop_for(gpu_exec_time, num_kernels, program_end + 1);
1182	loop_for(chunk2, program_end + 1);	1192	loop_for(chunk2, program_end + 1);
1183		1193
1184	sleep_next_period();	1194	sleep_next_period();
@@ -1243,7 +1253,7 @@ static void init_linux()
1243	mlockall(MCL_CURRENT \| MCL_FUTURE);	1253	mlockall(MCL_CURRENT \| MCL_FUTURE);
1244	}	1254	}
1245		1255
1246	static int gpu_job_linux(double exec_time, double gpu_exec_time, double program_end)	1256	static int gpu_job_linux(double exec_time, double gpu_exec_time, unsigned int num_kernels, double program_end)
1247	{	1257	{
1248	double chunk1, chunk2;	1258	double chunk1, chunk2;
1249		1259
@@ -1255,7 +1265,7 @@ static int gpu_job_linux(double exec_time, double gpu_exec_time, double program_
1255	chunk2 = exec_time - chunk1;	1265	chunk2 = exec_time - chunk1;
1256		1266
1257	loop_for(chunk1, program_end + 1);	1267	loop_for(chunk1, program_end + 1);
1258	gpu_loop_for_linux(gpu_exec_time, program_end + 1);	1268	gpu_loop_for_linux(gpu_exec_time, num_kernels, program_end + 1);
1259	loop_for(chunk2, program_end + 1);	1269	loop_for(chunk2, program_end + 1);
1260		1270
1261	sleep_next_period_linux();	1271	sleep_next_period_linux();
@@ -1284,8 +1294,8 @@ enum eScheduler
1284	RT_LINUX	1294	RT_LINUX
1285	};	1295	};
1286		1296
1287	#define CPU_OPTIONS "p:z:c:wlveio:f:s:q:X:L:Q:"	1297	#define CPU_OPTIONS "p:z:c:wlveio:f:s:q:X:L:Q:d"
1288	#define GPU_OPTIONS "g:y:r:C:E:dG:xS:R:T:Z:aFm:b:MNI"	1298	#define GPU_OPTIONS "g:y:r:C:E:DG:xS:R:T:Z:aFm:b:MNIk:"
1289		1299
1290	// concat the option strings	1300	// concat the option strings
1291	#define OPTSTR CPU_OPTIONS GPU_OPTIONS	1301	#define OPTSTR CPU_OPTIONS GPU_OPTIONS
@@ -1310,7 +1320,7 @@ int main(int argc, char** argv)
1310	double duration = 0, start = 0;	1320	double duration = 0, start = 0;
1311	double *exec_times = NULL;	1321	double *exec_times = NULL;
1312	double scale = 1.0;	1322	double scale = 1.0;
1313	task_class_t cls = RT_CLASS_HARD;	1323	task_class_t cls = RT_CLASS_SOFT;
1314	int cur_job = 0, num_jobs = 0;	1324	int cur_job = 0, num_jobs = 0;
1315	struct rt_task param;	1325	struct rt_task param;
1316		1326
@@ -1318,7 +1328,9 @@ int main(int argc, char** argv)
1318	lt_t budget;	1328	lt_t budget;
1319		1329
1320	int num_gpu_users = 0;	1330	int num_gpu_users = 0;
1321		1331	unsigned int num_kernels = 1;
		1332
		1333	budget_drain_policy_t drain = DRAIN_SIMPLE;
1322		1334
1323	eScheduler scheduler = LITMUS;	1335	eScheduler scheduler = LITMUS;
1324		1336
@@ -1365,7 +1377,7 @@ int main(int argc, char** argv)
1365	ENGINE_LOCK_TYPE = (eEngineLockTypes)atoi(optarg);	1377	ENGINE_LOCK_TYPE = (eEngineLockTypes)atoi(optarg);
1366	assert(ENGINE_LOCK_TYPE == FIFO \|\| ENGINE_LOCK_TYPE == PRIOQ);	1378	assert(ENGINE_LOCK_TYPE == FIFO \|\| ENGINE_LOCK_TYPE == PRIOQ);
1367	break;	1379	break;
1368	case 'd':	1380	case 'D':
1369	USE_DYNAMIC_GROUP_LOCKS = true;	1381	USE_DYNAMIC_GROUP_LOCKS = true;
1370	break;	1382	break;
1371	case 'G':	1383	case 'G':
@@ -1401,6 +1413,9 @@ int main(int argc, char** argv)
1401	num_gpu_users = atoi(optarg);	1413	num_gpu_users = atoi(optarg);
1402	assert(num_gpu_users > 0);	1414	assert(num_gpu_users > 0);
1403	break;	1415	break;
		1416	case 'k':
		1417	num_kernels = (unsigned int)atoi(optarg);
		1418	break;
1404	case 'b':	1419	case 'b':
1405	budget_ms = atoi(optarg);	1420	budget_ms = atoi(optarg);
1406	break;	1421	break;
@@ -1424,6 +1439,9 @@ int main(int argc, char** argv)
1424	case 'i':	1439	case 'i':
1425	want_signals = 1;	1440	want_signals = 1;
1426	break;	1441	break;
		1442	case 'd':
		1443	drain = DRAIN_SOBLIV;
		1444	break;
1427	case 'l':	1445	case 'l':
1428	test_loop = 1;	1446	test_loop = 1;
1429	break;	1447	break;
@@ -1475,7 +1493,7 @@ int main(int argc, char** argv)
1475	ENABLE_AFFINITY = false;	1493	ENABLE_AFFINITY = false;
1476	RELAX_FIFO_MAX_LEN = false;	1494	RELAX_FIFO_MAX_LEN = false;
1477	ENABLE_RT_AUX_THREADS = false;	1495	ENABLE_RT_AUX_THREADS = false;
1478	budget_ms = -1;	1496	budget_ms = -1.0;
1479	want_enforcement = 0;	1497	want_enforcement = 0;
1480	want_signals = 0;	1498	want_signals = 0;
1481		1499
@@ -1551,7 +1569,7 @@ int main(int argc, char** argv)
1551	if (GPU_USING && gpu_wcet_ms <= 0)	1569	if (GPU_USING && gpu_wcet_ms <= 0)
1552	usage("The worst-case gpu execution time must be a positive number.");	1570	usage("The worst-case gpu execution time must be a positive number.");
1553		1571
1554	if (budget_ms > 0)	1572	if (budget_ms > 0.0)
1555	budget = ms2ns(budget_ms);	1573	budget = ms2ns(budget_ms);
1556	else	1574	else
1557	budget = wcet;	1575	budget = wcet;
@@ -1586,6 +1604,7 @@ int main(int argc, char** argv)
1586	PRECISE_ENFORCEMENT : NO_ENFORCEMENT;	1604	PRECISE_ENFORCEMENT : NO_ENFORCEMENT;
1587	param.budget_signal_policy = (want_enforcement && want_signals) ?	1605	param.budget_signal_policy = (want_enforcement && want_signals) ?
1588	PRECISE_SIGNALS : NO_SIGNALS;	1606	PRECISE_SIGNALS : NO_SIGNALS;
		1607	param.drain_policy = drain;
1589	param.release_policy = PERIODIC;	1608	param.release_policy = PERIODIC;
1590		1609
1591	if (migrate)	1610	if (migrate)
@@ -1665,6 +1684,7 @@ int main(int argc, char** argv)
1665	else {	1684	else {
1666	while (gpu_job(wcet_ms * 0.001 * scale,	1685	while (gpu_job(wcet_ms * 0.001 * scale,
1667	gpu_wcet_ms * 0.001 * scale,	1686	gpu_wcet_ms * 0.001 * scale,
		1687	num_kernels,
1668	start + duration));	1688	start + duration));
1669	}	1689	}
1670	}	1690	}
@@ -1676,6 +1696,7 @@ int main(int argc, char** argv)
1676	else {	1696	else {
1677	while (gpu_job_linux(wcet_ms * 0.001 * scale,	1697	while (gpu_job_linux(wcet_ms * 0.001 * scale,
1678	gpu_wcet_ms * 0.001 * scale,	1698	gpu_wcet_ms * 0.001 * scale,
		1699	num_kernels,
1679	start + duration));	1700	start + duration));
1680	}	1701	}
1681	}	1702	}
@@ -1710,7 +1731,7 @@ int main(int argc, char** argv)
1710	exit_cuda();	1731	exit_cuda();
1711		1732
1712		1733
1713	printf("avg: %f\n", ms_sum/gpucount);	1734	// printf("avg: %f\n", ms_sum/gpucount);
1714	}	1735	}
1715		1736
1716	if (file)	1737	if (file)


diff --git a/src/litmus.c b/src/litmus.c index 213ac3f..70f7fb6 100644 --- a/src/litmus.c +++ b/src/litmus.c
@@ -113,6 +113,8 @@ void init_rt_task_param(struct rt_task* tp)
113	tp->cls = RT_CLASS_SOFT;	113	tp->cls = RT_CLASS_SOFT;
114	tp->priority = LITMUS_LOWEST_PRIORITY;	114	tp->priority = LITMUS_LOWEST_PRIORITY;
115	tp->budget_policy = NO_ENFORCEMENT;	115	tp->budget_policy = NO_ENFORCEMENT;
		116	tp->drain_policy = DRAIN_SIMPLE;
		117	tp->budget_signal_policy = NO_SIGNALS;
116	tp->release_policy = SPORADIC;	118	tp->release_policy = SPORADIC;
117	}	119	}
118		120