author    Glenn Elliott <gelliott@cs.unc.edu>    2013-04-14 15:06:43 -0400
committer Glenn Elliott <gelliott@cs.unc.edu>    2013-04-14 15:06:43 -0400
commit    37b4a24ba84f1dffd680fd550a3d8cad2ac5e3a8 (patch)
tree      5dc5e56a7a4f424e75f59f7705263bdb43b86fb3
parent    209f1961ea2d5863d6f2d2e9d2323446ee5e53c4 (diff)

Implemented gpusync rtspin.

-rw-r--r--  Makefile                  |   51
-rw-r--r--  gpu/budget.cpp            |  143
-rw-r--r--  gpu/gpuspin.cu            | 1720
-rw-r--r--  gpu/rtspin_fake_cuda.cpp  | 1187
-rw-r--r--  include/common.h          |    7
5 files changed, 1909 insertions, 1199 deletions
diff --git a/Makefile b/Makefile
index a8e528e..720a585 100644
--- a/Makefile
+++ b/Makefile
@@ -24,6 +24,12 @@ flags-debug-cpp = -O2 -Wall -Werror -g
24flags-api = -D_XOPEN_SOURCE=600 -D_GNU_SOURCE 24flags-api = -D_XOPEN_SOURCE=600 -D_GNU_SOURCE
25flags-misc = -fasynchronous-unwind-tables -fnon-call-exceptions 25flags-misc = -fasynchronous-unwind-tables -fnon-call-exceptions
26 26
27flags-cu-debug = -g -G -Xcompiler -Wall -Xcompiler -Werror
28flags-cu-optim = -O3 -Xcompiler -march=native
29flags-cu-nvcc = --use_fast_math -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30
30flags-cu-misc = -Xcompiler -fasynchronous-unwind-tables -Xcompiler -fnon-call-exceptions -Xcompiler -malign-double -Xcompiler -pthread
31flags-cu-x86_64 = -m64
32
27# architecture-specific flags 33# architecture-specific flags
28flags-i386 = -m32 34flags-i386 = -m32
29flags-x86_64 = -m64 35flags-x86_64 = -m64
@@ -51,12 +57,19 @@ headers = -I${LIBLITMUS}/include -I${LIBLITMUS}/arch/${include-${ARCH}}/include
51 57
52# combine options 58# combine options
53CPPFLAGS = ${flags-api} ${flags-debug-cpp} ${flags-misc} ${flags-${ARCH}} -DARCH=${ARCH} ${headers} 59CPPFLAGS = ${flags-api} ${flags-debug-cpp} ${flags-misc} ${flags-${ARCH}} -DARCH=${ARCH} ${headers}
60#CUFLAGS = ${flags-api} ${flags-cu-debug} ${flags-cu-optim} ${flags-cu-nvcc} ${flags-cu-misc} -DARCH=${ARCH} ${headers}
61CUFLAGS = ${flags-api} ${flags-cu-optim} ${flags-cu-nvcc} ${flags-cu-misc} -DARCH=${ARCH} ${headers}
54CFLAGS = ${flags-debug} ${flags-misc} 62CFLAGS = ${flags-debug} ${flags-misc}
55LDFLAGS = ${flags-${ARCH}} 63LDFLAGS = ${flags-${ARCH}}
56 64
57# how to link against liblitmus 65# how to link against liblitmus
58liblitmus-flags = -L${LIBLITMUS} -llitmus 66liblitmus-flags = -L${LIBLITMUS} -llitmus
59 67
68# how to link cuda
69cuda-flags-i386 = -L/usr/local/cuda/lib
70cuda-flags-x86_64 = -L/usr/local/cuda/lib64
71cuda-flags = ${cuda-flags-${ARCH}} -lcudart
72
60# Force gcc instead of cc, but let the user specify a more specific version if 73# Force gcc instead of cc, but let the user specify a more specific version if
61# desired. 74# desired.
62ifeq (${CC},cc) 75ifeq (${CC},cc)
@@ -67,20 +80,24 @@ endif
67CPP = g++ 80CPP = g++
68#endif 81#endif
69 82
83CU = nvcc
84
70# incorporate cross-compiler (if any) 85# incorporate cross-compiler (if any)
71CC := ${CROSS_COMPILE}${CC} 86CC := ${CROSS_COMPILE}${CC}
72CPP := ${CROSS_COMPILE}${CPP} 87CPP := ${CROSS_COMPILE}${CPP}
73LD := ${CROSS_COMPILE}${LD} 88LD := ${CROSS_COMPILE}${LD}
74AR := ${CROSS_COMPILE}${AR} 89AR := ${CROSS_COMPILE}${AR}
90CU := ${CROSS_COMPILE}${CU}
75 91
76# ############################################################################## 92# ##############################################################################
77# Targets 93# Targets
78 94
79all = lib ${rt-apps} ${rt-cppapps} 95all = lib ${rt-apps} ${rt-cppapps} ${rt-cuapps}
80rt-apps = cycles base_task rt_launch rtspin release_ts measure_syscall \ 96rt-apps = cycles base_task rt_launch rtspin release_ts measure_syscall \
81 base_mt_task uncache runtests \ 97 base_mt_task uncache runtests \
82 nested locktest ikglptest dgl aux_threads normal_task 98 nested locktest ikglptest dgl aux_threads normal_task
83rt-cppapps = budget 99rt-cppapps = budget
100rt-cuapps = gpuspin
84 101
85.PHONY: all lib clean dump-config TAGS tags cscope help 102.PHONY: all lib clean dump-config TAGS tags cscope help
86 103
@@ -95,10 +112,14 @@ inc/config.makefile: Makefile
95 @printf "%-15s= %-20s\n" \ 112 @printf "%-15s= %-20s\n" \
96 ARCH ${ARCH} \ 113 ARCH ${ARCH} \
97 CFLAGS '${CFLAGS}' \ 114 CFLAGS '${CFLAGS}' \
115 CPPFLAGS '${CPPFLAGS}' \
116 CUFLAGS '${CUFLAGS}' \
98 LDFLAGS '${LDFLAGS}' \ 117 LDFLAGS '${LDFLAGS}' \
99 LDLIBS '${liblitmus-flags}' \ 118 LDLIBS '${liblitmus-flags}' \
100 CPPFLAGS '${CPPFLAGS}' \ 119 CPPFLAGS '${CPPFLAGS}' \
101 CC '${shell which ${CC}}' \ 120 CC '${shell which ${CC}}' \
121 CPP '${shell which ${CPP}}' \
122 CU '${shell which ${CU}}' \
102 LD '${shell which ${LD}}' \ 123 LD '${shell which ${LD}}' \
103 AR '${shell which ${AR}}' \ 124 AR '${shell which ${AR}}' \
104 > $@ 125 > $@
@@ -112,10 +133,12 @@ dump-config:
112 headers "${headers}" \ 133 headers "${headers}" \
113 "kernel headers" "${imported-headers}" \ 134 "kernel headers" "${imported-headers}" \
114 CFLAGS "${CFLAGS}" \ 135 CFLAGS "${CFLAGS}" \
115 LDFLAGS "${LDFLAGS}" \
116 CPPFLAGS "${CPPFLAGS}" \ 136 CPPFLAGS "${CPPFLAGS}" \
137 CUFLAGS "${CUFLAGS}" \
138 LDFLAGS "${LDFLAGS}" \
117 CC "${CC}" \ 139 CC "${CC}" \
118 CPP "${CPP}" \ 140 CPP "${CPP}" \
141 CU "${CU}" \
119 LD "${LD}" \ 142 LD "${LD}" \
120 AR "${AR}" \ 143 AR "${AR}" \
121 obj-all "${obj-all}" 144 obj-all "${obj-all}"
@@ -124,8 +147,7 @@ help:
124 @cat INSTALL 147 @cat INSTALL
125 148
126clean: 149clean:
127 rm -f ${rt-apps} 150 rm -f ${rt-apps} ${rt-cppapps} ${rt-cuapps}
128 rm -f ${rt-cppapps}
129 rm -f *.o *.d *.a test_catalog.inc 151 rm -f *.o *.d *.a test_catalog.inc
130 rm -f ${imported-headers} 152 rm -f ${imported-headers}
131 rm -f inc/config.makefile 153 rm -f inc/config.makefile
@@ -259,6 +281,12 @@ vpath %.cpp gpu/
259objcpp-budget = budget.o common.o 281objcpp-budget = budget.o common.o
260lib-budget = -lrt -lm -pthread 282lib-budget = -lrt -lm -pthread
261 283
284
285vpath %.cu gpu/
286
287objcu-gpuspin = gpuspin.o common.o
288lib-gpuspin = -lrt -lm -lpthread
289
262# ############################################################################## 290# ##############################################################################
263# Build everything that depends on liblitmus. 291# Build everything that depends on liblitmus.
264 292
@@ -269,14 +297,19 @@ ${rt-apps}: $${obj-$$@} liblitmus.a
269${rt-cppapps}: $${objcpp-$$@} liblitmus.a 297${rt-cppapps}: $${objcpp-$$@} liblitmus.a
270 $(CPP) -o $@ $(LDFLAGS) ${ldf-$@} $(filter-out liblitmus.a,$+) $(LOADLIBS) $(LDLIBS) ${liblitmus-flags} ${lib-$@} 298 $(CPP) -o $@ $(LDFLAGS) ${ldf-$@} $(filter-out liblitmus.a,$+) $(LOADLIBS) $(LDLIBS) ${liblitmus-flags} ${lib-$@}
271 299
300${rt-cuapps}: $${objcu-$$@} liblitmus.a
301 $(CPP) -o $@ $(LDFLAGS) ${ldf-$@} $(filter-out liblitmus.a,$+) $(LOADLIBS) $(LDLIBS) ${liblitmus-flags} ${cuda-flags} ${lib-$@}
302
272# ############################################################################## 303# ##############################################################################
273# Dependency resolution. 304# Dependency resolution.
274 305
275vpath %.c bin/ src/ gpu/ tests/ 306vpath %.c bin/ src/ gpu/ tests/
276vpath %.cpp gpu/ 307vpath %.cpp gpu/
308vpath %.cu gpu/
277 309
278obj-all = ${sort ${foreach target,${all},${obj-${target}}}} 310obj-all = ${sort ${foreach target,${all},${obj-${target}}}}
279obj-all += ${sort ${foreach target,${all},${objcpp-${target}}}} 311obj-all += ${sort ${foreach target,${all},${objcpp-${target}}}}
312obj-all += ${sort ${foreach target,${all},${objcu-${target}}}}
280 313
281# rule to generate dependency files 314# rule to generate dependency files
282%.d: %.c ${imported-headers} 315%.d: %.c ${imported-headers}
@@ -291,6 +324,16 @@ obj-all += ${sort ${foreach target,${all},${objcpp-${target}}}}
291 sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \ 324 sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \
292 rm -f $@.$$$$ 325 rm -f $@.$$$$
293 326
327%.d: %.cu ${imported-headers}
328 @set -e; rm -f $@; \
329 $(CU) --generate-dependencies $(CUFLAGS) $< > $@.$$$$; \
330 sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \
331 rm -f $@.$$$$
332
333# teach make how to compile .cu files
334%.o: %.cu
335 $(CU) --compile $(CUFLAGS) $(OUTPUT_OPTION) $<
336
294ifeq ($(MAKECMDGOALS),) 337ifeq ($(MAKECMDGOALS),)
295MAKECMDGOALS += all 338MAKECMDGOALS += all
296endif 339endif
diff --git a/gpu/budget.cpp b/gpu/budget.cpp
index f62c515..8a2546a 100644
--- a/gpu/budget.cpp
+++ b/gpu/budget.cpp
@@ -80,6 +80,28 @@ int SIGNALS = 0;
80int BLOCK_SIGNALS_ON_SLEEP = 0; 80int BLOCK_SIGNALS_ON_SLEEP = 0;
81int OVERRUN_RATE = 1; /* default: every job overruns */ 81int OVERRUN_RATE = 1; /* default: every job overruns */
82 82
83int CXS_OVERRUN = 0;
84int NUM_LOCKS = 1;
85int NUM_REPLICAS = 1;
86int NAMESPACE = 0;
87int *LOCKS = NULL;
88int IKGLP_LOCK = 0;
89int USE_DGLS = 0;
90int NEST_IN_IKGLP = 0;
91
92int WAIT = 0;
93
94enum eLockType
95{
96 FIFO,
97 PRIOQ,
98 IKGLP
99};
100
101eLockType LOCK_TYPE = FIFO;
102
103int OVERRUN_BY_SLEEP = 0;
104
83int NUM_JOBS = 0; 105int NUM_JOBS = 0;
84int NUM_COMPLETED_JOBS = 0; 106int NUM_COMPLETED_JOBS = 0;
85int NUM_OVERRUNS = 0; 107int NUM_OVERRUNS = 0;
@@ -103,9 +125,32 @@ int job(lt_t exec_ns, lt_t budget_ns)
103 if (SIGNALS && BLOCK_SIGNALS_ON_SLEEP) 125 if (SIGNALS && BLOCK_SIGNALS_ON_SLEEP)
104 block_litmus_signals(SIG_BUDGET); 126 block_litmus_signals(SIG_BUDGET);
105 127
128 if(CXS_OVERRUN) {
129 if (NEST_IN_IKGLP)
130 litmus_lock(IKGLP_LOCK);
131 if (USE_DGLS)
132 litmus_dgl_lock(LOCKS, NUM_LOCKS);
133 else
134 for(int i = 0; i < NUM_LOCKS; ++i)
135 litmus_lock(LOCKS[i]);
136 }
137
106 // intentionally overrun via suspension 138 // intentionally overrun via suspension
107 lt_sleep(approx_remaining + overrun_extra); 139 if (OVERRUN_BY_SLEEP)
108 140 lt_sleep(approx_remaining + overrun_extra);
141 else
142 loop_for((approx_remaining + overrun_extra) * 0.9);
143
144 if(CXS_OVERRUN) {
145 if (USE_DGLS)
146 litmus_dgl_unlock(LOCKS, NUM_LOCKS);
147 else
148 for(int i = NUM_LOCKS-1; i >= 0; --i)
149 litmus_unlock(LOCKS[i]);
150 if (NEST_IN_IKGLP)
151 litmus_unlock(IKGLP_LOCK);
152 }
153
109 if (SIGNALS && BLOCK_SIGNALS_ON_SLEEP) 154 if (SIGNALS && BLOCK_SIGNALS_ON_SLEEP)
110 unblock_litmus_signals(SIG_BUDGET); 155 unblock_litmus_signals(SIG_BUDGET);
111 } 156 }
@@ -120,15 +165,18 @@ int job(lt_t exec_ns, lt_t budget_ns)
120 return 1; 165 return 1;
121} 166}
122 167
123#define OPTSTR "sboOva" 168#define OPTSTR "SbosOvalwqixdn:r:"
124 169
125int main(int argc, char** argv) 170int main(int argc, char** argv)
126{ 171{
127 int ret; 172 int ret;
128 lt_t e_ns = ms2ns(10); 173
129 lt_t p_ns = ms2ns(100); 174 srand(getpid());
175
176 lt_t e_ns = ms2ns(2);
177 lt_t p_ns = ms2ns(50) + rand()%200;
130 lt_t budget_ns = p_ns/2; 178 lt_t budget_ns = p_ns/2;
131 lt_t duration = s2ns(10); 179 lt_t duration = s2ns(60);
132 lt_t terminate_time; 180 lt_t terminate_time;
133 unsigned int first_job, last_job; 181 unsigned int first_job, last_job;
134 int opt; 182 int opt;
@@ -140,12 +188,15 @@ int main(int argc, char** argv)
140 188
141 while ((opt = getopt(argc, argv, OPTSTR)) != -1) { 189 while ((opt = getopt(argc, argv, OPTSTR)) != -1) {
142 switch(opt) { 190 switch(opt) {
143 case 's': 191 case 'S':
144 SIGNALS = 1; 192 SIGNALS = 1;
145 break; 193 break;
146 case 'b': 194 case 'b':
147 BLOCK_SIGNALS_ON_SLEEP = 1; 195 BLOCK_SIGNALS_ON_SLEEP = 1;
148 break; 196 break;
197 case 's':
198 OVERRUN_BY_SLEEP = 1;
199 break;
149 case 'o': 200 case 'o':
150 OVERRUN = 1; 201 OVERRUN = 1;
151 overrun_extra = budget_ns/2; 202 overrun_extra = budget_ns/2;
@@ -164,6 +215,31 @@ int main(int argc, char** argv)
164 case 'v': 215 case 'v':
165 drain_policy = DRAIN_SOBLIV; 216 drain_policy = DRAIN_SOBLIV;
166 break; 217 break;
218 case 'l':
219 CXS_OVERRUN = 1;
220 NAMESPACE = open("semaphores", O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
221 break;
222 case 'q':
223 LOCK_TYPE = PRIOQ;
224 break;
225 case 'i':
226 LOCK_TYPE = IKGLP;
227 break;
228 case 'x':
229 NEST_IN_IKGLP = 1;
230 break;
231 case 'w':
232 WAIT = 1;
233 break;
234 case 'd':
235 USE_DGLS = 1;
236 break;
237 case 'n':
238 NUM_LOCKS = atoi(optarg);
239 break;
240 case 'r':
241 NUM_REPLICAS = atoi(optarg);
242 break;
167 case ':': 243 case ':':
168 printf("missing argument\n"); 244 printf("missing argument\n");
169 assert(false); 245 assert(false);
@@ -176,10 +252,21 @@ int main(int argc, char** argv)
176 } 252 }
177 253
178 assert(!BLOCK_SIGNALS_ON_SLEEP || (BLOCK_SIGNALS_ON_SLEEP && SIGNALS)); 254 assert(!BLOCK_SIGNALS_ON_SLEEP || (BLOCK_SIGNALS_ON_SLEEP && SIGNALS));
255 assert(!CXS_OVERRUN || (CXS_OVERRUN && WAIT));
256 assert(LOCK_TYPE != IKGLP || NUM_LOCKS == 1);
257 assert(LOCK_TYPE != IKGLP || (LOCK_TYPE == IKGLP && !NEST_IN_IKGLP));
258 assert(NUM_LOCKS > 0);
259 if (LOCK_TYPE == IKGLP || NEST_IN_IKGLP)
260 assert(NUM_REPLICAS >= 1);
261
262 LOCKS = new int[NUM_LOCKS];
179 263
180 if (compute_overrun_rate) { 264 if (compute_overrun_rate) {
181 int backlog = (int)ceil((overrun_extra + budget_ns)/(double)budget_ns); 265 int backlog = (int)ceil((overrun_extra + budget_ns)/(double)budget_ns);
182 OVERRUN_RATE = backlog + 2; /* some padding */ 266 if (!CXS_OVERRUN)
267 OVERRUN_RATE = backlog + 2; /* some padding */
268 else
269 OVERRUN_RATE = 2*backlog + 2; /* overrun less frequently for testing */
183 } 270 }
184 271
185 init_rt_task_param(&param); 272 init_rt_task_param(&param);
@@ -197,6 +284,44 @@ int main(int argc, char** argv)
197 ret = set_rt_task_param(gettid(), &param); 284 ret = set_rt_task_param(gettid(), &param);
198 assert(ret == 0); 285 assert(ret == 0);
199 286
287 if (CXS_OVERRUN) {
288 int i;
289 for(i = 0; i < NUM_LOCKS; ++i) {
290 int lock = -1;
291 switch(LOCK_TYPE)
292 {
293 case FIFO:
294 lock = open_fifo_sem(NAMESPACE, i);
295 break;
296 case PRIOQ:
297 lock = open_prioq_sem(NAMESPACE, i);
298 break;
299 case IKGLP:
300 lock = open_ikglp_sem(NAMESPACE, i, NUM_REPLICAS);
301 break;
302 }
303 if (lock < 0) {
304 perror("open_sem");
305 exit(-1);
306 }
307 LOCKS[i] = lock;
308 }
309
310 if (NEST_IN_IKGLP) {
311 IKGLP_LOCK = open_ikglp_sem(NAMESPACE, i, NUM_REPLICAS);
312 if (IKGLP_LOCK < 0) {
313 perror("open_sem");
314 exit(-1);
315 }
316 }
317 }
318
319 if (WAIT) {
320 ret = wait_for_ts_release();
321 if (ret < 0)
322 perror("wait_for_ts_release");
323 }
324
200 ret = task_mode(LITMUS_RT_TASK); 325 ret = task_mode(LITMUS_RT_TASK);
201 assert(ret == 0); 326 assert(ret == 0);
202 327
@@ -231,5 +356,7 @@ int main(int argc, char** argv)
231 printf("# User Jobs Completed: %d\n", NUM_COMPLETED_JOBS); 356 printf("# User Jobs Completed: %d\n", NUM_COMPLETED_JOBS);
232 printf("# Overruns: %d\n", NUM_OVERRUNS); 357 printf("# Overruns: %d\n", NUM_OVERRUNS);
233 358
359 delete[] LOCKS;
360
234 return 0; 361 return 0;
235} 362}
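
The new -l (lock contention) path in job() above overruns the budget while locks are held, so budget-drain policies can be exercised inside critical sections. A minimal consolidated sketch of the acquire/overrun/release ordering it uses, with the globals and liblitmus calls named in the diff (illustrative only, not part of the commit):

/* Illustrative sketch -- mirrors the ordering used in job() above.
 * LOCKS, NUM_LOCKS, IKGLP_LOCK, USE_DGLS and NEST_IN_IKGLP are the globals
 * added by this commit; the liblitmus calls are assumed to behave as in the diff. */
static void overrun_inside_critical_section(lt_t overrun_ns)
{
	if (NEST_IN_IKGLP)
		litmus_lock(IKGLP_LOCK);                 /* outer replica lock */

	if (USE_DGLS)
		litmus_dgl_lock(LOCKS, NUM_LOCKS);       /* acquire the whole group atomically */
	else
		for (int i = 0; i < NUM_LOCKS; ++i)      /* fixed ascending order */
			litmus_lock(LOCKS[i]);

	lt_sleep(overrun_ns);                            /* overrun while holding the locks */

	if (USE_DGLS)
		litmus_dgl_unlock(LOCKS, NUM_LOCKS);
	else
		for (int i = NUM_LOCKS - 1; i >= 0; --i) /* release in reverse order */
			litmus_unlock(LOCKS[i]);

	if (NEST_IN_IKGLP)
		litmus_unlock(IKGLP_LOCK);
}
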
diff --git a/gpu/gpuspin.cu b/gpu/gpuspin.cu
new file mode 100644
index 0000000..aff6cd1
--- /dev/null
+++ b/gpu/gpuspin.cu
@@ -0,0 +1,1720 @@
1#include <sys/time.h>
2
3#include <stdio.h>
4#include <stdlib.h>
5#include <unistd.h>
6#include <time.h>
7#include <string.h>
8#include <assert.h>
9#include <execinfo.h>
10
11#include <boost/interprocess/managed_shared_memory.hpp>
12#include <boost/interprocess/sync/interprocess_mutex.hpp>
13
14#include <cuda_runtime.h>
15
16#include "litmus.h"
17#include "common.h"
18
19using namespace std;
20using namespace boost::interprocess;
21
22const char *lock_namespace = "./.gpuspin-locks";
23
24const int NR_GPUS = 8;
25
26bool GPU_USING = false;
27bool ENABLE_AFFINITY = false;
28bool RELAX_FIFO_MAX_LEN = false;
29bool ENABLE_CHUNKING = false;
30bool MIGRATE_VIA_SYSMEM = false;
31
32enum eEngineLockTypes
33{
34 FIFO,
35 PRIOQ
36};
37
38eEngineLockTypes ENGINE_LOCK_TYPE = FIFO;
39
40int GPU_PARTITION = 0;
41int GPU_PARTITION_SIZE = 0;
42int CPU_PARTITION_SIZE = 0;
43
44int RHO = 2;
45
46int NUM_COPY_ENGINES = 2;
47
48
49__attribute__((unused)) static size_t kbToB(size_t kb) { return kb * 1024; }
50__attribute__((unused)) static size_t mbToB(size_t mb) { return kbToB(mb * 1024); }
51
52/* in bytes */
53size_t SEND_SIZE = 0;
54size_t RECV_SIZE = 0;
55size_t STATE_SIZE = 0;
56size_t CHUNK_SIZE = 0;
57
58int TOKEN_LOCK = -1;
59
60bool USE_ENGINE_LOCKS = true;
61bool USE_DYNAMIC_GROUP_LOCKS = false;
62int EE_LOCKS[NR_GPUS];
63int CE_SEND_LOCKS[NR_GPUS];
64int CE_RECV_LOCKS[NR_GPUS];
65int CE_MIGR_SEND_LOCKS[NR_GPUS];
66int CE_MIGR_RECV_LOCKS[NR_GPUS];
67bool RESERVED_MIGR_COPY_ENGINE = false; // only checked if NUM_COPY_ENGINES == 2
68
69bool ENABLE_RT_AUX_THREADS = true;
70
71enum eGpuSyncMode
72{
73 IKGLP_MODE,
74 IKGLP_WC_MODE, /* work-conserving IKGLP. no GPU is left idle, but breaks optimality */
75 KFMLP_MODE,
76 RGEM_MODE,
77};
78
79eGpuSyncMode GPU_SYNC_MODE = IKGLP_MODE;
80
81enum eCudaSyncMode
82{
83 BLOCKING,
84 SPIN
85};
86
87eCudaSyncMode CUDA_SYNC_MODE = BLOCKING;
88
89
90int CUR_DEVICE = -1;
91int LAST_DEVICE = -1;
92
93cudaStream_t STREAMS[NR_GPUS];
94int GPU_HZ[NR_GPUS];
95int NUM_SM[NR_GPUS];
96int WARP_SIZE[NR_GPUS];
97int ELEM_PER_THREAD[NR_GPUS];
98
99#define DEFINE_PER_GPU(type, var) type var[NR_GPUS]
100#define per_gpu(var, idx) (var[(idx)])
101#define this_gpu(var) (var[(CUR_DEVICE)])
102#define cur_stream() (this_gpu(STREAMS))
103#define cur_gpu() (CUR_DEVICE)
104#define last_gpu() (LAST_DEVICE)
105#define cur_ee() (EE_LOCKS[CUR_DEVICE])
106#define cur_send() (CE_SEND_LOCKS[CUR_DEVICE])
107#define cur_recv() (CE_RECV_LOCKS[CUR_DEVICE])
108#define cur_migr_send() (CE_MIGR_SEND_LOCKS[CUR_DEVICE])
109#define cur_migr_recv() (CE_MIGR_RECV_LOCKS[CUR_DEVICE])
110#define cur_hz() (GPU_HZ[CUR_DEVICE])
111#define cur_sms() (NUM_SM[CUR_DEVICE])
112#define cur_warp_size() (WARP_SIZE[CUR_DEVICE])
113#define cur_elem_per_thread() (ELEM_PER_THREAD[CUR_DEVICE])
114#define num_online_gpus() (NUM_GPUS)
115
116static bool useEngineLocks()
117{
118 return(USE_ENGINE_LOCKS);
119}
120
121#define VANILLA_LINUX
122
123bool TRACE_MIGRATIONS = false;
124#ifndef VANILLA_LINUX
125#define trace_migration(to, from) do { inject_gpu_migration((to), (from)); } while(0)
126#define trace_release(arrival, deadline, jobno) do { inject_release((arrival), (deadline), (jobno)); } while(0)
127#define trace_completion(jobno) do { inject_completion((jobno)); } while(0)
128#define trace_name() do { inject_name(); } while(0)
129#define trace_param() do { inject_param(); } while(0)
130#else
131#define set_rt_task_param(x, y) (0)
132#define trace_migration(to, from)
133#define trace_release(arrival, deadline, jobno)
134#define trace_completion(jobno)
135#define trace_name()
136#define trace_param()
137#endif
138
139struct ce_lock_state
140{
141 int locks[2];
142 size_t num_locks;
143 size_t budget_remaining;
144 bool locked;
145
146 ce_lock_state(int device_a, enum cudaMemcpyKind kind, size_t size, int device_b = -1, bool migration = false) {
147 num_locks = (device_a != -1) + (device_b != -1);
148
149 if(device_a != -1) {
150 if (!migration)
151 locks[0] = (kind == cudaMemcpyHostToDevice || (kind == cudaMemcpyDeviceToDevice && device_b == -1)) ?
152 CE_SEND_LOCKS[device_a] : CE_RECV_LOCKS[device_a];
153 else
154 locks[0] = (kind == cudaMemcpyHostToDevice || (kind == cudaMemcpyDeviceToDevice && device_b == -1)) ?
155 CE_MIGR_SEND_LOCKS[device_a] : CE_MIGR_RECV_LOCKS[device_a];
156 }
157
158 if(device_b != -1) {
159 assert(kind == cudaMemcpyDeviceToDevice);
160
161 if (!migration)
162 locks[1] = CE_RECV_LOCKS[device_b];
163 else
164 locks[1] = CE_MIGR_RECV_LOCKS[device_b];
165
166 if(locks[1] < locks[0]) {
167 // enforce total order on locking
168 int temp = locks[1];
169 locks[1] = locks[0];
170 locks[0] = temp;
171 }
172 }
173 else {
174 locks[1] = -1;
175 }
176
177 if(!ENABLE_CHUNKING)
178 budget_remaining = size;
179 else
180 budget_remaining = CHUNK_SIZE;
181 }
182
183 void crash(void) {
184 void *array[50];
185 int size, i;
186 char **messages;
187
188 size = backtrace(array, 50);
189 messages = backtrace_symbols(array, size);
190
191 fprintf(stderr, "%d: TRIED TO GRAB SAME LOCK TWICE! Lock = %d\n", getpid(), locks[0]);
192 for (i = 1; i < size && messages != NULL; ++i)
193 {
194 fprintf(stderr, "%d: [bt]: (%d) %s\n", getpid(), i, messages[i]);
195 }
196 free(messages);
197
198 assert(false);
199 }
200
201
202 void lock() {
203 if(locks[0] == locks[1]) crash();
204
205 if(USE_DYNAMIC_GROUP_LOCKS) {
206 litmus_dgl_lock(locks, num_locks);
207 }
208 else
209 {
210 for(int l = 0; l < num_locks; ++l)
211 {
212 litmus_lock(locks[l]);
213 }
214 }
215 locked = true;
216 }
217
218 void unlock() {
219 if(locks[0] == locks[1]) crash();
220
221 if(USE_DYNAMIC_GROUP_LOCKS) {
222 litmus_dgl_unlock(locks, num_locks);
223 }
224 else
225 {
226 // reverse order
227 for(int l = num_locks - 1; l >= 0; --l)
228 {
229 litmus_unlock(locks[l]);
230 }
231 }
232 locked = false;
233 }
234
235 void refresh() {
236 budget_remaining = CHUNK_SIZE;
237 }
238
239 bool budgetIsAvailable(size_t tosend) {
240 return(tosend >= budget_remaining);
241 }
242
243 void decreaseBudget(size_t spent) {
244 budget_remaining -= spent;
245 }
246};
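
When a copy involves two devices, the constructor above stores both copy-engine locks and sorts them so every task acquires them in the same ascending descriptor order; that total order is what prevents deadlock when dynamic group locks are disabled. A compact sketch of the same rule (the descriptor values in the comment are made up for illustration):

/* Illustrative only: the total-order rule ce_lock_state applies in its constructor. */
int locks[2] = { CE_MIGR_RECV_LOCKS[2], CE_MIGR_SEND_LOCKS[0] };  /* e.g. descriptors 14 and 9 */
if (locks[1] < locks[0]) {
	int tmp  = locks[1];                    /* always take the lower descriptor first */
	locks[1] = locks[0];
	locks[0] = tmp;
}
litmus_lock(locks[0]);
litmus_lock(locks[1]);
/* ... issue cudaMemcpyAsync() chunks on the held engines ... */
litmus_unlock(locks[1]);                        /* release in reverse order */
litmus_unlock(locks[0]);
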
247
248// precondition: if do_locking == true, locks in state are held.
249static cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count,
250 enum cudaMemcpyKind kind,
251 ce_lock_state* state)
252{
253 cudaError_t ret = cudaSuccess;
254 int remaining = count;
255
256 char* dst = (char*)a_dst;
257 const char* src = (const char*)a_src;
258
259 // disable chunking, if needed, by setting chunk_size equal to the
260 // amount of data to be copied.
261 int chunk_size = (ENABLE_CHUNKING) ? CHUNK_SIZE : count;
262 int i = 0;
263
264 while(remaining != 0)
265 {
266 int bytesToCopy = std::min(remaining, chunk_size);
267
268 if(state && state->budgetIsAvailable(bytesToCopy) && state->locked) {
269 cudaStreamSynchronize(STREAMS[CUR_DEVICE]);
270 ret = cudaGetLastError();
271
272 if(ret != cudaSuccess)
273 {
274 break;
275 }
276
277 state->unlock();
 278 state->refresh(); // replenish.
279 // we can only run out of
280 // budget if chunking is enabled.
281 // we presume that init budget would
282 // be set to cover entire memcpy
283 // if chunking were disabled.
284 }
285
286 if(state && !state->locked) {
287 state->lock();
288 }
289
290 //ret = cudaMemcpy(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind);
291 cudaMemcpyAsync(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind, STREAMS[CUR_DEVICE]);
292
293 if(state) {
294 state->decreaseBudget(bytesToCopy);
295 }
296
297 ++i;
298 remaining -= bytesToCopy;
299 }
300 return ret;
301}
302
303static cudaError_t chunkMemcpy(void* a_dst, const void* a_src, size_t count,
304 enum cudaMemcpyKind kind,
305 int device_a = -1, // device_a == -1 disables locking
306 bool do_locking = true,
307 int device_b = -1,
308 bool migration = false)
309{
310 cudaError_t ret;
311 if(!do_locking || device_a == -1) {
312 ret = __chunkMemcpy(a_dst, a_src, count, kind, NULL);
313 cudaStreamSynchronize(cur_stream());
314 if(ret == cudaSuccess)
315 ret = cudaGetLastError();
316 }
317 else {
318 ce_lock_state state(device_a, kind, count, device_b, migration);
319 state.lock();
320 ret = __chunkMemcpy(a_dst, a_src, count, kind, &state);
321 cudaStreamSynchronize(cur_stream());
322 if(ret == cudaSuccess)
323 ret = cudaGetLastError();
324 state.unlock();
325 }
326 return ret;
327}
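
A hedged usage sketch of the wrapper above: copy the send buffer to the current GPU with copy-engine locking enabled (the buffer and size globals are the ones defined earlier in this file; error handling is kept minimal):

/* Illustrative only: send SEND_SIZE bytes to the current GPU.
 * Passing cur_gpu() as device_a enables copy-engine locking inside chunkMemcpy();
 * device_b stays -1 because this is not a device-to-device migration. */
cudaError_t err = chunkMemcpy(this_gpu(d_send_data), h_send_data, SEND_SIZE,
                              cudaMemcpyHostToDevice,
                              cur_gpu(),           /* device_a */
                              useEngineLocks());   /* do_locking */
if (err != cudaSuccess)
	fprintf(stderr, "chunkMemcpy: %s\n", cudaGetErrorString(err));
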
328
329
330void allocate_locks_litmus(void)
331{
332 // allocate k-FMLP lock
333 int fd = open(lock_namespace, O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
334
335 int base_name = GPU_PARTITION * 1000;
336
337 if (GPU_SYNC_MODE == IKGLP_MODE) {
338 /* Standard (optimal) IKGLP */
339 TOKEN_LOCK = open_gpusync_token_lock(fd,
340 base_name, /* name */
341 GPU_PARTITION_SIZE,
342 GPU_PARTITION*GPU_PARTITION_SIZE,
343 RHO,
344 IKGLP_M_IN_FIFOS,
345 (!RELAX_FIFO_MAX_LEN) ?
346 IKGLP_OPTIMAL_FIFO_LEN :
347 IKGLP_UNLIMITED_FIFO_LEN,
348 ENABLE_AFFINITY);
349 }
350 else if (GPU_SYNC_MODE == KFMLP_MODE) {
351 /* KFMLP. FIFO queues only for tokens. */
352 TOKEN_LOCK = open_gpusync_token_lock(fd,
353 base_name, /* name */
354 GPU_PARTITION_SIZE,
355 GPU_PARTITION*GPU_PARTITION_SIZE,
356 RHO,
357 IKGLP_UNLIMITED_IN_FIFOS,
358 IKGLP_UNLIMITED_FIFO_LEN,
359 ENABLE_AFFINITY);
360 }
361 else if (GPU_SYNC_MODE == RGEM_MODE) {
362 /* RGEM-like token allocation. Shared priority queue for all tokens. */
363 TOKEN_LOCK = open_gpusync_token_lock(fd,
364 base_name, /* name */
365 GPU_PARTITION_SIZE,
366 GPU_PARTITION*GPU_PARTITION_SIZE,
367 RHO,
368 RHO*GPU_PARTITION_SIZE,
369 1,
370 ENABLE_AFFINITY);
371 }
372 else if (GPU_SYNC_MODE == IKGLP_WC_MODE) {
373 /* Non-optimal IKGLP that never lets a replica idle if there are pending
374 * token requests. */
375 int max_simult_run = std::max(CPU_PARTITION_SIZE, RHO*GPU_PARTITION_SIZE);
376 int max_fifo_len = (int)ceil((float)max_simult_run / (RHO*GPU_PARTITION_SIZE));
377 TOKEN_LOCK = open_gpusync_token_lock(fd,
378 base_name, /* name */
379 GPU_PARTITION_SIZE,
380 GPU_PARTITION*GPU_PARTITION_SIZE,
381 RHO,
382 max_simult_run,
383 (!RELAX_FIFO_MAX_LEN) ?
384 max_fifo_len :
385 IKGLP_UNLIMITED_FIFO_LEN,
386 ENABLE_AFFINITY);
387 }
388 else {
389 perror("Invalid GPUSync mode specified\n");
390 TOKEN_LOCK = -1;
391 }
392
393 if(TOKEN_LOCK < 0)
394 perror("open_token_sem");
395
396 if(USE_ENGINE_LOCKS)
397 {
398 assert(NUM_COPY_ENGINES == 1 || NUM_COPY_ENGINES == 2);
399 assert((NUM_COPY_ENGINES == 1 && !RESERVED_MIGR_COPY_ENGINE) || NUM_COPY_ENGINES == 2);
400
401 // allocate the engine locks.
402 for (int i = 0; i < GPU_PARTITION_SIZE; ++i)
403 {
404 int idx = GPU_PARTITION*GPU_PARTITION_SIZE + i;
405 int ee_name = (i+1)*10 + base_name;
406 int ce_0_name = (i+1)*10 + base_name + 1;
407 int ce_1_name = (i+1)*10 + base_name + 2;
408 int ee_lock = -1, ce_0_lock = -1, ce_1_lock = -1;
409
410 open_sem_t openEngineLock = (ENGINE_LOCK_TYPE == FIFO) ?
411 open_fifo_sem : open_prioq_sem;
412
413 ee_lock = openEngineLock(fd, ee_name);
414 if (ee_lock < 0)
415 perror("open_*_sem (engine lock)");
416
417 ce_0_lock = openEngineLock(fd, ce_0_name);
418 if (ce_0_lock < 0)
419 perror("open_*_sem (engine lock)");
420
421 if (NUM_COPY_ENGINES == 2)
422 {
423 ce_1_lock = openEngineLock(fd, ce_1_name);
424 if (ce_1_lock < 0)
425 perror("open_*_sem (engine lock)");
426 }
427
428 EE_LOCKS[idx] = ee_lock;
429
430 if (NUM_COPY_ENGINES == 1)
431 {
432 // share locks
433 CE_SEND_LOCKS[idx] = ce_0_lock;
434 CE_RECV_LOCKS[idx] = ce_0_lock;
435 CE_MIGR_SEND_LOCKS[idx] = ce_0_lock;
436 CE_MIGR_RECV_LOCKS[idx] = ce_0_lock;
437 }
438 else
439 {
440 assert(NUM_COPY_ENGINES == 2);
441
442 if (RESERVED_MIGR_COPY_ENGINE) {
 443 // copy engine dedicated to migration operations
444 CE_SEND_LOCKS[idx] = ce_0_lock;
445 CE_RECV_LOCKS[idx] = ce_0_lock;
446 CE_MIGR_SEND_LOCKS[idx] = ce_1_lock;
447 CE_MIGR_RECV_LOCKS[idx] = ce_1_lock;
448 }
449 else {
450 // migration transmissions treated as regular data
451 CE_SEND_LOCKS[idx] = ce_0_lock;
452 CE_RECV_LOCKS[idx] = ce_1_lock;
453 CE_MIGR_SEND_LOCKS[idx] = ce_0_lock;
454 CE_MIGR_RECV_LOCKS[idx] = ce_1_lock;
455 }
456 }
457 }
458 }
459}
460
461
462
463
464class gpu_pool
465{
466public:
467 gpu_pool(int pSz): poolSize(pSz)
468 {
469 memset(&pool[0], 0, sizeof(pool[0])*poolSize);
470 }
471
472 int get(pthread_mutex_t* tex, int preference = -1)
473 {
474 int which = -1;
475 int last = (preference >= 0) ? preference : 0;
476 int minIdx = last;
477
478 pthread_mutex_lock(tex);
479
480 int min = pool[last];
481 for(int i = (minIdx+1)%poolSize; i != last; i = (i+1)%poolSize)
482 {
483 if(min > pool[i])
484 minIdx = i;
485 }
486 ++pool[minIdx];
487
488 pthread_mutex_unlock(tex);
489
490 which = minIdx;
491
492 return which;
493 }
494
495 void put(pthread_mutex_t* tex, int which)
496 {
497 pthread_mutex_lock(tex);
498 --pool[which];
499 pthread_mutex_unlock(tex);
500 }
501
502private:
503 int poolSize;
504 int pool[NR_GPUS]; // >= gpu_part_size
505};
506
507static gpu_pool* GPU_LINUX_SEM_POOL = NULL;
508static pthread_mutex_t* GPU_LINUX_MUTEX_POOL = NULL;
509
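
A short sketch of how the Linux (non-LITMUS) mode uses the pool declared above: get() picks the least-loaded GPU in the partition, biased toward a preferred device, and put() releases it afterwards. The offset arithmetic mirrors gpu_loop_for_linux() further below (illustrative only):

/* Illustrative only: pick a GPU from this partition's pool, run, then release it. */
gpu_pool *pool         = &GPU_LINUX_SEM_POOL[GPU_PARTITION];
pthread_mutex_t *mutex = &GPU_LINUX_MUTEX_POOL[GPU_PARTITION];
int offset = GPU_PARTITION * GPU_PARTITION_SIZE;

int which = pool->get(mutex, cur_gpu() - offset) + offset;  /* prefer the last-used GPU */
set_cur_gpu(which);
/* ... do the GPU section of the job on 'which' ... */
pool->put(mutex, which - offset);
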
510static void allocate_locks_linux(int num_gpu_users)
511{
512 managed_shared_memory *segment_pool_ptr = NULL;
513 managed_shared_memory *segment_mutex_ptr = NULL;
514
515 int numGpuPartitions = NR_GPUS/GPU_PARTITION_SIZE;
516
517 if(num_gpu_users != 0)
518 {
519 printf("%d creating shared memory for linux semaphores; num pools = %d, pool size = %d\n", getpid(), numGpuPartitions, GPU_PARTITION_SIZE);
520 shared_memory_object::remove("linux_mutex_memory");
521 shared_memory_object::remove("linux_sem_memory");
522
523 segment_mutex_ptr = new managed_shared_memory(create_only, "linux_mutex_memory", 4*1024);
524 GPU_LINUX_MUTEX_POOL = segment_mutex_ptr->construct<pthread_mutex_t>("pthread_mutex_t linux_m")[numGpuPartitions]();
525 for(int i = 0; i < numGpuPartitions; ++i)
526 {
527 pthread_mutexattr_t attr;
528 pthread_mutexattr_init(&attr);
529 pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
530 pthread_mutex_init(&(GPU_LINUX_MUTEX_POOL[i]), &attr);
531 pthread_mutexattr_destroy(&attr);
532 }
533
534 segment_pool_ptr = new managed_shared_memory(create_only, "linux_sem_memory", 4*1024);
535 GPU_LINUX_SEM_POOL = segment_pool_ptr->construct<gpu_pool>("gpu_pool linux_p")[numGpuPartitions](GPU_PARTITION_SIZE);
536 }
537 else
538 {
539 do
540 {
541 try
542 {
543 if (!segment_pool_ptr) segment_pool_ptr = new managed_shared_memory(open_only, "linux_sem_memory");
544 }
545 catch(...)
546 {
547 sleep(1);
548 }
549 }while(segment_pool_ptr == NULL);
550
551 do
552 {
553 try
554 {
555 if (!segment_mutex_ptr) segment_mutex_ptr = new managed_shared_memory(open_only, "linux_mutex_memory");
556 }
557 catch(...)
558 {
559 sleep(1);
560 }
561 }while(segment_mutex_ptr == NULL);
562
563 GPU_LINUX_SEM_POOL = segment_pool_ptr->find<gpu_pool>("gpu_pool linux_p").first;
564 GPU_LINUX_MUTEX_POOL = segment_mutex_ptr->find<pthread_mutex_t>("pthread_mutex_t linux_m").first;
565 }
566}
567
568
569
570
571static void allocate_locks(int num_gpu_users, bool linux_mode)
572{
573 if(!linux_mode)
574 allocate_locks_litmus();
575 else
576 allocate_locks_linux(num_gpu_users);
577}
578
579static void set_cur_gpu(int gpu)
580{
581 if (TRACE_MIGRATIONS) {
582 trace_migration(gpu, CUR_DEVICE);
583 }
584 if(gpu != CUR_DEVICE) {
585 cudaSetDevice(gpu);
586 CUR_DEVICE = gpu;
587 }
588}
589
590
591static pthread_barrier_t *gpu_barrier = NULL;
592static interprocess_mutex *gpu_mgmt_mutexes = NULL;
593static managed_shared_memory *segment_ptr = NULL;
594
595void coordinate_gpu_tasks(int num_gpu_users)
596{
597 if(num_gpu_users != 0)
598 {
599 printf("%d creating shared memory\n", getpid());
600 shared_memory_object::remove("gpu_barrier_memory");
601 segment_ptr = new managed_shared_memory(create_only, "gpu_barrier_memory", 4*1024);
602
603 printf("%d creating a barrier for %d users\n", getpid(), num_gpu_users);
604 gpu_barrier = segment_ptr->construct<pthread_barrier_t>("pthread_barrier_t gpu_barrier")();
605 pthread_barrierattr_t battr;
606 pthread_barrierattr_init(&battr);
607 pthread_barrierattr_setpshared(&battr, PTHREAD_PROCESS_SHARED);
608 pthread_barrier_init(gpu_barrier, &battr, num_gpu_users);
609 pthread_barrierattr_destroy(&battr);
610 printf("%d creating gpu mgmt mutexes for %d devices\n", getpid(), NR_GPUS);
611 gpu_mgmt_mutexes = segment_ptr->construct<interprocess_mutex>("interprocess_mutex m")[NR_GPUS]();
612 }
613 else
614 {
615 do
616 {
617 try
618 {
619 segment_ptr = new managed_shared_memory(open_only, "gpu_barrier_memory");
620 }
621 catch(...)
622 {
623 sleep(1);
624 }
625 }while(segment_ptr == NULL);
626
627 gpu_barrier = segment_ptr->find<pthread_barrier_t>("pthread_barrier_t gpu_barrier").first;
628 gpu_mgmt_mutexes = segment_ptr->find<interprocess_mutex>("interprocess_mutex m").first;
629 }
630}
631
632typedef float spindata_t;
633
634char *d_send_data[NR_GPUS] = {0};
635char *d_recv_data[NR_GPUS] = {0};
636char *d_state_data[NR_GPUS] = {0};
637spindata_t *d_spin_data[NR_GPUS] = {0};
638//unsigned int *d_iteration_count[NR_GPUS] = {0};
639
640
641bool p2pMigration[NR_GPUS][NR_GPUS] = {0};
642
643char *h_send_data = 0;
644char *h_recv_data = 0;
645char *h_state_data = 0;
646
647unsigned int *h_iteration_count[NR_GPUS] = {0};
648
649static void init_cuda(int num_gpu_users)
650{
651 const int PAGE_SIZE = 4*1024;
652 size_t send_alloc_bytes = SEND_SIZE + (SEND_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
653 size_t recv_alloc_bytes = RECV_SIZE + (RECV_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
654 size_t state_alloc_bytes = STATE_SIZE + (STATE_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
655
656 coordinate_gpu_tasks(num_gpu_users);
657
658 switch (CUDA_SYNC_MODE)
659 {
660 case BLOCKING:
661 cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
662 break;
663 case SPIN:
664 cudaSetDeviceFlags(cudaDeviceScheduleSpin);
665 break;
666 }
667
668 for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
669 {
670 cudaDeviceProp prop;
671 int which = GPU_PARTITION*GPU_PARTITION_SIZE + i;
672
673 gpu_mgmt_mutexes[which].lock();
674
675 set_cur_gpu(which);
676 cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0);
677 cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0);
678
679 cudaGetDeviceProperties(&prop, which);
680 GPU_HZ[which] = prop.clockRate * 1000; /* khz -> hz */
681 NUM_SM[which] = prop.multiProcessorCount;
682 WARP_SIZE[which] = prop.warpSize;
683
684 // enough to fill the L2 cache exactly.
685 ELEM_PER_THREAD[which] = (prop.l2CacheSize/(NUM_SM[which]*WARP_SIZE[which]*sizeof(spindata_t)));
686
687
688 if (!MIGRATE_VIA_SYSMEM && prop.unifiedAddressing)
689 {
690 for(int j = 0; j < GPU_PARTITION_SIZE; ++j)
691 {
692 if (i != j)
693 {
694 int canAccess = 0;
695 cudaDeviceCanAccessPeer(&canAccess, i, j);
696 if(canAccess)
697 {
698 cudaDeviceEnablePeerAccess(j, 0);
699 p2pMigration[i][j] = true;
700 }
701 }
702 }
703 }
704
705 cudaStreamCreate(&STREAMS[CUR_DEVICE]);
706
707 cudaMalloc(&d_spin_data[which], prop.l2CacheSize);
708 cudaMemset(&d_spin_data[which], 0, prop.l2CacheSize);
709// cudaMalloc(&d_iteration_count[which], NUM_SM[which]*WARP_SIZE[which]*sizeof(unsigned int));
710// cudaHostAlloc(&h_iteration_count[which], NUM_SM[which]*WARP_SIZE[which]*sizeof(unsigned int), cudaHostAllocPortable | cudaHostAllocMapped);
711
712 if (send_alloc_bytes) {
713 cudaMalloc(&d_send_data[which], send_alloc_bytes);
714 cudaHostAlloc(&h_send_data, send_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped);
715 }
716
717 if (h_recv_data) {
718 cudaMalloc(&d_recv_data[which], recv_alloc_bytes);
719 cudaHostAlloc(&h_recv_data, recv_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped);
720 }
721
722 if (h_state_data) {
723 cudaMalloc(&d_state_data[which], state_alloc_bytes);
724
725 if (MIGRATE_VIA_SYSMEM)
726 cudaHostAlloc(&h_state_data, state_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped | cudaHostAllocWriteCombined);
727 }
728
729 gpu_mgmt_mutexes[which].unlock();
730 }
731
732 // roll back to first GPU
733 set_cur_gpu(GPU_PARTITION*GPU_PARTITION_SIZE);
734}
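
init_cuda() sizes the spin buffer so that one warp per SM touches exactly enough floats to fill the device's L2 cache. A quick worked sketch of that sizing (the cache size and SM count are illustrative, not taken from any particular GPU):

/* Illustrative only: how ELEM_PER_THREAD is derived for one device.
 * Suppose prop.l2CacheSize = 524288 bytes (512 KB), 8 SMs, warp size 32. */
int spinning_threads = 8 * 32;                                       /* one warp per SM -> 256 threads */
int elem_per_thread  = 524288 / (spinning_threads * sizeof(float));  /* 524288 / 1024 = 512 floats each */
/* docudaspin() then walks data[i*num_elem + j] over those 512 elements per thread,
 * so the working set of the whole grid is exactly one L2 cache. */
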
735
736
737
738static bool MigrateToGPU_P2P(int from, int to)
739{
740 bool success = true;
741 set_cur_gpu(to);
742 chunkMemcpy(this_gpu(d_state_data), per_gpu(d_state_data, from),
743 STATE_SIZE, cudaMemcpyDeviceToDevice, to,
744 useEngineLocks(), from, true);
745 return success;
746}
747
748
749static bool PullState(void)
750{
751 bool success = true;
752 chunkMemcpy(h_state_data, this_gpu(d_state_data),
753 STATE_SIZE, cudaMemcpyDeviceToHost,
754 cur_gpu(), useEngineLocks(), -1, true);
755 return success;
756}
757
758static bool PushState(void)
759{
760 bool success = true;
761 chunkMemcpy(this_gpu(d_state_data), h_state_data,
762 STATE_SIZE, cudaMemcpyHostToDevice,
763 cur_gpu(), useEngineLocks(), -1, true);
764 return success;
765}
766
767static bool MigrateToGPU_SysMem(int from, int to)
768{
769 // THIS IS ON-DEMAND SYS_MEM MIGRATION. GPUSync says
770 // you should be using speculative migrations.
771 // Use PushState() and PullState().
772 assert(false); // for now
773
774 bool success = true;
775
776 set_cur_gpu(from);
777 chunkMemcpy(h_state_data, this_gpu(d_state_data),
778 STATE_SIZE, cudaMemcpyDeviceToHost,
779 from, useEngineLocks(), -1, true);
780
781 set_cur_gpu(to);
782 chunkMemcpy(this_gpu(d_state_data), h_state_data,
783 STATE_SIZE, cudaMemcpyHostToDevice,
784 to, useEngineLocks(), -1, true);
785
786 return success;
787}
788
789static bool MigrateToGPU(int from, int to)
790{
791 bool success = false;
792
793 if (from != to)
794 {
795 if(!MIGRATE_VIA_SYSMEM && p2pMigration[to][from])
796 success = MigrateToGPU_P2P(from, to);
797 else
798 success = MigrateToGPU_SysMem(from, to);
799 }
800 else
801 {
802 set_cur_gpu(to);
803 success = true;
804 }
805
806 return success;
807}
808
809static bool MigrateToGPU_Implicit(int to)
810{
811 return( MigrateToGPU(cur_gpu(), to) );
812}
813
814static void MigrateIfNeeded(int next_gpu)
815{
816 if(next_gpu != cur_gpu() && cur_gpu() != -1)
817 {
818 if (!MIGRATE_VIA_SYSMEM)
819 MigrateToGPU_Implicit(next_gpu);
820 else {
821 set_cur_gpu(next_gpu);
822 PushState();
823 }
824 }
825}
826
827
828
829static void exit_cuda()
830{
831 for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
832 {
833 int which = GPU_PARTITION*GPU_PARTITION_SIZE + i;
834 gpu_mgmt_mutexes[which].lock();
835 set_cur_gpu(which);
836 cudaDeviceReset();
837 gpu_mgmt_mutexes[which].unlock();
838 }
839}
840
841bool safetynet = false;
842
843static void catch_exit(int catch_exit)
844{
845 if(GPU_USING && USE_ENGINE_LOCKS && safetynet)
846 {
847 safetynet = false;
848 for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
849 {
850 int which = GPU_PARTITION*GPU_PARTITION_SIZE + i;
851 set_cur_gpu(which);
852
853// cudaDeviceReset();
854
855 // try to unlock everything. litmus will prevent bogus calls.
856 if(USE_ENGINE_LOCKS)
857 {
858 litmus_unlock(EE_LOCKS[which]);
859 litmus_unlock(CE_SEND_LOCKS[which]);
860 if (NUM_COPY_ENGINES == 2)
861 {
862 if (RESERVED_MIGR_COPY_ENGINE)
863 litmus_unlock(CE_MIGR_SEND_LOCKS[which]);
864 else
865 litmus_unlock(CE_MIGR_RECV_LOCKS[which]);
866 }
867 }
868 }
869 litmus_unlock(TOKEN_LOCK);
870 }
871}
872
873
874
875
876
877static float ms_sum;
878static int gpucount = 0;
879
880__global__ void docudaspin(float* data, /*unsigned int* iterations,*/ unsigned int num_elem, unsigned int cycles)
881{
882 long long int now = clock64();
883 long long unsigned int elapsed = 0;
884 long long int last;
885
886// unsigned int iter = 0;
887 unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
888 unsigned int j = 0;
889 bool toggle = true;
890
891// iterations[i] = 0;
892 do
893 {
894 data[i*num_elem+j] += (toggle) ? M_PI : -M_PI;
895 j = (j + 1 != num_elem) ? j + 1 : 0;
896 toggle = !toggle;
897// iter++;
898
899 last = now;
900 now = clock64();
901
902// // exact calculation takes more cycles than a second
903// // loop iteration when code is compiled optimized
904// long long int diff = now - last;
905// elapsed += (diff > 0) ?
906// diff :
907// now + ((~((long long int)0)<<1)>>1) - last;
908
909 // don't count iterations with clock roll-over
910 elapsed += max(0ll, now - last);
911 }while(elapsed < cycles);
912
913// iterations[i] = iter;
914
915 return;
916}
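
The kernel above busy-waits on clock64() until the requested number of GPU cycles has elapsed; the caller converts a wall-clock time into a cycle budget using the clock rate recorded in init_cuda(). A small worked sketch of that conversion and launch (the 1.5 GHz clock is illustrative):

/* Illustrative only: spin the current GPU for ~2 ms of wall-clock time.
 * cur_hz() was filled in from cudaDeviceProp::clockRate (kHz -> Hz) in init_cuda(). */
double gpu_sec_time = 0.002;                                        /* 2 ms */
unsigned int numcycles = (unsigned int)(cur_hz() * gpu_sec_time);   /* e.g. 1.5e9 * 0.002 = 3,000,000 cycles */

/* one block per SM, one warp per block, on this GPU's stream */
docudaspin<<<cur_sms(), cur_warp_size(), 0, cur_stream()>>>(
        d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles);
cudaStreamSynchronize(cur_stream());
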
917
918static void gpu_loop_for(double gpu_sec_time, double emergency_exit)
919{
920 int next_gpu;
921
922 if (emergency_exit && wctime() > emergency_exit)
923 goto out;
924
925 next_gpu = litmus_lock(TOKEN_LOCK);
926 {
927 MigrateIfNeeded(next_gpu);
928
929 unsigned int numcycles = (unsigned int)(cur_hz() * gpu_sec_time);
930
931 if(SEND_SIZE > 0)
932 chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE,
933 cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks());
934
935 if(useEngineLocks()) litmus_lock(cur_ee());
936 /* one block per sm, one warp per block */
937 docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles);
938// docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], d_iteration_count[cur_gpu()], cur_elem_per_thread(), numcycles);
939 cudaStreamSynchronize(cur_stream());
940 if(useEngineLocks()) litmus_unlock(cur_ee());
941
942 if(RECV_SIZE > 0)
943 chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE,
944 cudaMemcpyDeviceToHost, CUR_DEVICE, useEngineLocks());
945
946 if (MIGRATE_VIA_SYSMEM)
947 PullState();
948 }
949 litmus_unlock(TOKEN_LOCK);
950
951 last_gpu() = cur_gpu();
952
953out:
954 return;
955}
956
957static void gpu_loop_for_linux(double gpu_sec_time, double emergency_exit)
958{
959 static int GPU_OFFSET = GPU_PARTITION * GPU_PARTITION_SIZE;
960 static gpu_pool *pool = &GPU_LINUX_SEM_POOL[GPU_PARTITION];
961 static pthread_mutex_t *mutex = &GPU_LINUX_MUTEX_POOL[GPU_PARTITION];
962
963 static bool once = false;
964 static cudaEvent_t start, end;
965 float ms;
966 if (!once)
967 {
968 once = true;
969 cudaEventCreate(&start);
970 cudaEventCreate(&end);
971 }
972
973 int next_gpu;
974
975 if (emergency_exit && wctime() > emergency_exit)
976 goto out;
977
978 next_gpu = pool->get(mutex, cur_gpu() - GPU_OFFSET) + GPU_OFFSET;
979 {
980 MigrateIfNeeded(next_gpu);
981
982 unsigned int numcycles = (unsigned int)(cur_hz() * gpu_sec_time);
983
984 if(SEND_SIZE > 0)
985 chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE,
986 cudaMemcpyHostToDevice, cur_gpu(), useEngineLocks());
987
988 /* one block per sm, one warp per block */
989 cudaEventRecord(start, cur_stream());
990 docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles);
991// docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], d_iteration_count[cur_gpu()], cur_elem_per_thread(), numcycles);
992 cudaEventRecord(end, cur_stream());
993 cudaEventSynchronize(end);
994 cudaStreamSynchronize(cur_stream());
995
996// chunkMemcpy(this_gpu(h_iteration_count), this_gpu(d_iteration_count), sizeof(unsigned int),
997// cudaMemcpyDeviceToHost, cur_gpu(), useEngineLocks());
998//
999 cudaEventElapsedTime(&ms, start, end);
1000 ms_sum += ms;
1001 ++gpucount;
1002// printf("%f\n", ms);
1003// printf("%f: %u\n", ms, this_gpu(h_iteration_count)[0]);
1004
1005
1006 if(RECV_SIZE > 0)
1007 chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE,
1008 cudaMemcpyDeviceToHost, cur_gpu(), useEngineLocks());
1009
1010 if (MIGRATE_VIA_SYSMEM)
1011 PullState();
1012 }
1013 pool->put(mutex, cur_gpu() - GPU_OFFSET);
1014
1015 last_gpu() = cur_gpu();
1016
1017out:
1018 return;
1019}
1020
1021
1022
1023
1024static void usage(char *error) {
1025 fprintf(stderr, "Error: %s\n", error);
1026 fprintf(stderr,
1027 "Usage:\n"
1028 " rt_spin [COMMON-OPTS] WCET PERIOD DURATION\n"
1029 " rt_spin [COMMON-OPTS] -f FILE [-o COLUMN] WCET PERIOD\n"
1030 " rt_spin -l\n"
1031 "\n"
1032 "COMMON-OPTS = [-w] [-s SCALE]\n"
1033 " [-p PARTITION/CLUSTER [-z CLUSTER SIZE]] [-c CLASS]\n"
1034 " [-X LOCKING-PROTOCOL] [-L CRITICAL SECTION LENGTH] [-Q RESOURCE-ID]"
1035 "\n"
1036 "WCET and PERIOD are milliseconds, DURATION is seconds.\n"
1037 "CRITICAL SECTION LENGTH is in milliseconds.\n");
1038 exit(EXIT_FAILURE);
1039}
1040
1041/*
1042 * returns the character that made processing stop, newline or EOF
1043 */
1044static int skip_to_next_line(FILE *fstream)
1045{
1046 int ch;
1047 for (ch = fgetc(fstream); ch != EOF && ch != '\n'; ch = fgetc(fstream));
1048 return ch;
1049}
1050
1051static void skip_comments(FILE *fstream)
1052{
1053 int ch;
1054 for (ch = fgetc(fstream); ch == '#'; ch = fgetc(fstream))
1055 skip_to_next_line(fstream);
1056 ungetc(ch, fstream);
1057}
1058
1059static void get_exec_times(const char *file, const int column,
1060 int *num_jobs, double **exec_times)
1061{
1062 FILE *fstream;
1063 int cur_job, cur_col, ch;
1064 *num_jobs = 0;
1065
1066 fstream = fopen(file, "r");
1067 if (!fstream)
1068 bail_out("could not open execution time file");
1069
1070 /* figure out the number of jobs */
1071 do {
1072 skip_comments(fstream);
1073 ch = skip_to_next_line(fstream);
1074 if (ch != EOF)
1075 ++(*num_jobs);
1076 } while (ch != EOF);
1077
1078 if (-1 == fseek(fstream, 0L, SEEK_SET))
1079 bail_out("rewinding file failed");
1080
1081 /* allocate space for exec times */
1082 *exec_times = (double*)calloc(*num_jobs, sizeof(*exec_times));
1083 if (!*exec_times)
1084 bail_out("couldn't allocate memory");
1085
1086 for (cur_job = 0; cur_job < *num_jobs && !feof(fstream); ++cur_job) {
1087
1088 skip_comments(fstream);
1089
1090 for (cur_col = 1; cur_col < column; ++cur_col) {
1091 /* discard input until we get to the column we want */
1092 int unused __attribute__ ((unused)) = fscanf(fstream, "%*s,");
1093 }
1094
1095 /* get the desired exec. time */
1096 if (1 != fscanf(fstream, "%lf", (*exec_times)+cur_job)) {
1097 fprintf(stderr, "invalid execution time near line %d\n",
1098 cur_job);
1099 exit(EXIT_FAILURE);
1100 }
1101
1102 skip_to_next_line(fstream);
1103 }
1104
1105 assert(cur_job == *num_jobs);
1106 fclose(fstream);
1107}
1108
1109#define NUMS 4096
1110static int num[NUMS];
1111__attribute__((unused)) static char* progname;
1112
1113static int loop_once(void)
1114{
1115 int i, j = 0;
1116 for (i = 0; i < NUMS; i++)
1117 j += num[i]++;
1118 return j;
1119}
1120
1121static int loop_for(double exec_time, double emergency_exit)
1122{
1123 double last_loop = 0, loop_start;
1124 int tmp = 0;
1125
1126 double start = cputime();
1127 double now = cputime();
1128
1129 if (emergency_exit && wctime() > emergency_exit)
1130 goto out;
1131
1132 while (now + last_loop < start + exec_time) {
1133 loop_start = now;
1134 tmp += loop_once();
1135 now = cputime();
1136 last_loop = now - loop_start;
1137 if (emergency_exit && wctime() > emergency_exit) {
1138 /* Oops --- this should only be possible if the execution time tracking
1139 * is broken in the LITMUS^RT kernel. */
1140 fprintf(stderr, "!!! gpuspin/%d emergency exit!\n", getpid());
1141 fprintf(stderr, "Something is seriously wrong! Do not ignore this.\n");
1142 break;
1143 }
1144 }
1145
1146out:
1147 return tmp;
1148}
1149
1150
1151static void debug_delay_loop(void)
1152{
1153 double start, end, delay;
1154
1155 while (1) {
1156 for (delay = 0.5; delay > 0.01; delay -= 0.01) {
1157 start = wctime();
1158 loop_for(delay, 0);
1159 end = wctime();
1160 printf("%6.4fs: looped for %10.8fs, delta=%11.8fs, error=%7.4f%%\n",
1161 delay,
1162 end - start,
1163 end - start - delay,
1164 100 * (end - start - delay) / delay);
1165 }
1166 }
1167}
1168
1169static int gpu_job(double exec_time, double gpu_exec_time, double program_end)
1170{
1171 double chunk1, chunk2;
1172
1173 if (wctime() > program_end) {
1174 return 0;
1175 }
1176 else {
1177 chunk1 = exec_time * drand48();
1178 chunk2 = exec_time - chunk1;
1179
1180 loop_for(chunk1, program_end + 1);
1181 gpu_loop_for(gpu_exec_time, program_end + 1);
1182 loop_for(chunk2, program_end + 1);
1183
1184 sleep_next_period();
1185 }
1186 return 1;
1187}
1188
1189static int job(double exec_time, double program_end)
1190{
1191 if (wctime() > program_end) {
1192 return 0;
1193 }
1194 else {
1195 loop_for(exec_time, program_end + 1);
1196 sleep_next_period();
1197 }
1198 return 1;
1199}
1200
1201/*****************************/
1202/* only used for linux modes */
1203
1204static struct timespec periodTime;
1205static struct timespec releaseTime;
1206static unsigned int job_no = 0;
1207
1208static lt_t period_ns;
1209
1210static void log_release()
1211{
1212 __attribute__ ((unused)) lt_t rel = releaseTime.tv_sec * s2ns(1) + releaseTime.tv_nsec;
1213 __attribute__ ((unused)) lt_t dead = rel + period_ns;
1214 trace_release(rel, dead, job_no);
1215}
1216
1217static void log_completion()
1218{
1219 trace_completion(job_no);
1220 ++job_no;
1221}
1222
1223static void setup_next_period_linux(struct timespec* spec, struct timespec* period)
1224{
1225 spec->tv_sec += period->tv_sec;
1226 spec->tv_nsec += period->tv_nsec;
1227 if (spec->tv_nsec >= s2ns(1)) {
1228 ++(spec->tv_sec);
1229 spec->tv_nsec -= s2ns(1);
1230 }
1231}
1232
1233static void sleep_next_period_linux()
1234{
1235 log_completion();
1236 setup_next_period_linux(&releaseTime, &periodTime);
1237 clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &releaseTime, NULL);
1238 log_release();
1239}
1240
1241static void init_linux()
1242{
1243 mlockall(MCL_CURRENT | MCL_FUTURE);
1244}
1245
1246static int gpu_job_linux(double exec_time, double gpu_exec_time, double program_end)
1247{
1248 double chunk1, chunk2;
1249
1250 if (wctime() > program_end) {
1251 return 0;
1252 }
1253 else {
1254 chunk1 = exec_time * drand48();
1255 chunk2 = exec_time - chunk1;
1256
1257 loop_for(chunk1, program_end + 1);
1258 gpu_loop_for_linux(gpu_exec_time, program_end + 1);
1259 loop_for(chunk2, program_end + 1);
1260
1261 sleep_next_period_linux();
1262 }
1263 return 1;
1264}
1265
1266static int job_linux(double exec_time, double program_end)
1267{
1268 if (wctime() > program_end) {
1269 return 0;
1270 }
1271 else {
1272 loop_for(exec_time, program_end + 1);
1273 sleep_next_period_linux();
1274 }
1275 return 1;
1276}
1277
1278/*****************************/
1279
1280enum eScheduler
1281{
1282 LITMUS,
1283 LINUX,
1284 RT_LINUX
1285};
1286
1287#define CPU_OPTIONS "p:z:c:wlveio:f:s:q:X:L:Q:"
1288#define GPU_OPTIONS "g:y:r:C:E:dG:xS:R:T:Z:aFm:b:MNI"
1289
1290// concat the option strings
1291#define OPTSTR CPU_OPTIONS GPU_OPTIONS
1292
1293int main(int argc, char** argv)
1294{
1295 int ret;
1296 lt_t wcet;
1297 lt_t period;
1298 double wcet_ms = -1, gpu_wcet_ms = -1, period_ms = -1;
1299 unsigned int priority = LITMUS_LOWEST_PRIORITY;
1300 int migrate = 0;
1301 int cluster = 0;
1302 int cluster_size = 1;
1303 int opt;
1304 int wait = 0;
1305 int test_loop = 0;
1306 int column = 1;
1307 const char *file = NULL;
1308 int want_enforcement = 0;
1309 int want_signals = 0;
1310 double duration = 0, start = 0;
1311 double *exec_times = NULL;
1312 double scale = 1.0;
1313 task_class_t cls = RT_CLASS_HARD;
1314 int cur_job = 0, num_jobs = 0;
1315 struct rt_task param;
1316
1317 double budget_ms = -1.0;
1318 lt_t budget;
1319
1320 int num_gpu_users = 0;
1321
1322
1323 eScheduler scheduler = LITMUS;
1324
1325 /* locking */
1326// int lock_od = -1;
1327// int resource_id = 0;
1328// int protocol = -1;
1329// double cs_length = 1; /* millisecond */
1330
1331 progname = argv[0];
1332
1333 while ((opt = getopt(argc, argv, OPTSTR)) != -1) {
1334 switch (opt) {
1335 case 'w':
1336 wait = 1;
1337 break;
1338 case 'p':
1339 cluster = atoi(optarg);
1340 migrate = 1;
1341 break;
1342 case 'z':
1343 cluster_size = atoi(optarg);
1344 CPU_PARTITION_SIZE = cluster_size;
1345 break;
1346 case 'g':
1347 GPU_USING = true;
1348 GPU_PARTITION = atoi(optarg);
1349 assert(GPU_PARTITION >= 0 && GPU_PARTITION < NR_GPUS);
1350 break;
1351 case 'y':
1352 GPU_PARTITION_SIZE = atoi(optarg);
1353 assert(GPU_PARTITION_SIZE > 0);
1354 break;
1355 case 'r':
1356 RHO = atoi(optarg);
1357 assert(RHO > 0);
1358 break;
1359 case 'C':
1360 NUM_COPY_ENGINES = atoi(optarg);
1361 assert(NUM_COPY_ENGINES == 1 || NUM_COPY_ENGINES == 2);
1362 break;
1363 case 'E':
1364 USE_ENGINE_LOCKS = true;
1365 ENGINE_LOCK_TYPE = (eEngineLockTypes)atoi(optarg);
1366 assert(ENGINE_LOCK_TYPE == FIFO || ENGINE_LOCK_TYPE == PRIOQ);
1367 break;
1368 case 'd':
1369 USE_DYNAMIC_GROUP_LOCKS = true;
1370 break;
1371 case 'G':
1372 GPU_SYNC_MODE = (eGpuSyncMode)atoi(optarg);
1373 assert(GPU_SYNC_MODE >= IKGLP_MODE && GPU_SYNC_MODE <= RGEM_MODE);
1374 break;
1375 case 'a':
1376 ENABLE_AFFINITY = true;
1377 break;
1378 case 'F':
1379 RELAX_FIFO_MAX_LEN = true;
1380 break;
1381 case 'x':
1382 CUDA_SYNC_MODE = SPIN;
1383 break;
1384 case 'S':
1385 SEND_SIZE = kbToB((size_t)atoi(optarg));
1386 break;
1387 case 'R':
1388 RECV_SIZE = kbToB((size_t)atoi(optarg));
1389 break;
1390 case 'T':
1391 STATE_SIZE = kbToB((size_t)atoi(optarg));
1392 break;
1393 case 'Z':
1394 ENABLE_CHUNKING = true;
1395 CHUNK_SIZE = kbToB((size_t)atoi(optarg));
1396 break;
1397 case 'M':
1398 MIGRATE_VIA_SYSMEM = true;
1399 break;
1400 case 'm':
1401 num_gpu_users = atoi(optarg);
1402 assert(num_gpu_users > 0);
1403 break;
1404 case 'b':
1405 budget_ms = atoi(optarg);
1406 break;
1407 case 'N':
1408 scheduler = LINUX;
1409 break;
1410 case 'I':
1411 scheduler = RT_LINUX;
1412 break;
1413 case 'q':
1414 priority = atoi(optarg);
1415 break;
1416 case 'c':
1417 cls = str2class(optarg);
1418 if (cls == -1)
1419 usage("Unknown task class.");
1420 break;
1421 case 'e':
1422 want_enforcement = 1;
1423 break;
1424 case 'i':
1425 want_signals = 1;
1426 break;
1427 case 'l':
1428 test_loop = 1;
1429 break;
1430 case 'o':
1431 column = atoi(optarg);
1432 break;
1433// case 'f':
1434// file = optarg;
1435// break;
1436 case 's':
1437 scale = atof(optarg);
1438 break;
1439// case 'X':
1440// protocol = lock_protocol_for_name(optarg);
1441// if (protocol < 0)
1442// usage("Unknown locking protocol specified.");
1443// break;
1444// case 'L':
1445// cs_length = atof(optarg);
1446// if (cs_length <= 0)
1447// usage("Invalid critical section length.");
1448// break;
1449// case 'Q':
1450// resource_id = atoi(optarg);
1451// if (resource_id <= 0 && strcmp(optarg, "0"))
1452// usage("Invalid resource ID.");
1453// break;
1454 case ':':
1455 usage("Argument missing.");
1456 break;
1457 case '?':
1458 default:
1459 usage("Bad argument.");
1460 break;
1461 }
1462 }
1463
1464#ifdef VANILLA_LINUX
1465 assert(scheduler != LITMUS);
1466 assert(!wait);
1467#endif
1468
1469 // turn off some features to be safe
1470 if (scheduler != LITMUS)
1471 {
1472 RHO = 0;
1473 USE_ENGINE_LOCKS = false;
1474 USE_DYNAMIC_GROUP_LOCKS = false;
1475 ENABLE_AFFINITY = false;
1476 RELAX_FIFO_MAX_LEN = false;
1477 ENABLE_RT_AUX_THREADS = false;
1478 budget_ms = -1;
1479 want_enforcement = 0;
1480 want_signals = 0;
1481
1482 if (scheduler == RT_LINUX)
1483 {
1484 struct sched_param fifoparams;
1485
1486 assert(priority >= sched_get_priority_min(SCHED_FIFO) &&
1487 priority <= sched_get_priority_max(SCHED_FIFO));
1488
1489 memset(&fifoparams, 0, sizeof(fifoparams));
1490 fifoparams.sched_priority = priority;
1491 assert(0 == sched_setscheduler(getpid(), SCHED_FIFO, &fifoparams));
1492 }
1493 }
1494 else
1495 {
1496 if (!litmus_is_valid_fixed_prio(priority))
1497 usage("Invalid priority.");
1498 }
1499
1500 if (test_loop) {
1501 debug_delay_loop();
1502 return 0;
1503 }
1504
1505 srand(getpid());
1506
1507 if (file) {
1508 get_exec_times(file, column, &num_jobs, &exec_times);
1509
1510 if (argc - optind < 2)
1511 usage("Arguments missing.");
1512
1513 for (cur_job = 0; cur_job < num_jobs; ++cur_job) {
1514 /* convert the execution time to seconds */
1515 duration += exec_times[cur_job] * 0.001;
1516 }
1517 } else {
1518 /*
1519 * if we're not reading from the CSV file, then we need
1520 * three parameters
1521 */
1522 if (argc - optind < 3)
1523 usage("Arguments missing.");
1524 }
1525
1526 if (argc - optind == 3) {
1527 assert(!GPU_USING);
1528 wcet_ms = atof(argv[optind + 0]);
1529 period_ms = atof(argv[optind + 1]);
1530 duration = atof(argv[optind + 2]);
1531 }
1532 else if (argc - optind == 4) {
1533 assert(GPU_USING);
1534 wcet_ms = atof(argv[optind + 0]);
1535 gpu_wcet_ms = atof(argv[optind + 1]);
1536 period_ms = atof(argv[optind + 2]);
1537 duration = atof(argv[optind + 3]);
1538 }
1539
1540 wcet = ms2ns(wcet_ms);
1541 period = ms2ns(period_ms);
1542 if (wcet <= 0)
1543 usage("The worst-case execution time must be a "
1544 "positive number.");
1545 if (period <= 0)
1546 usage("The period must be a positive number.");
1547 if (!file && wcet > period) {
1548 usage("The worst-case execution time must not "
1549 "exceed the period.");
1550 }
1551 if (GPU_USING && gpu_wcet_ms <= 0)
1552 usage("The worst-case gpu execution time must be a positive number.");
1553
1554 if (budget_ms > 0)
1555 budget = ms2ns(budget_ms);
1556 else
1557 budget = wcet;
1558
1559 if (file && num_jobs > 1)
1560 duration += period_ms * 0.001 * (num_jobs - 1);
1561
1562 if (migrate) {
1563 ret = be_migrate_to_cluster(cluster, cluster_size);
1564 if (ret < 0)
1565 bail_out("could not migrate to target partition or cluster.");
1566 }
1567
1568 if (scheduler != LITMUS)
1569 {
1570 // set some variables needed by linux modes
1571 if (GPU_USING)
1572 {
1573 TRACE_MIGRATIONS = true;
1574 }
1575 periodTime.tv_sec = period / s2ns(1);
1576 periodTime.tv_nsec = period - periodTime.tv_sec * s2ns(1);
1577 period_ns = period;
1578 }
1579
1580 init_rt_task_param(&param);
1581 param.exec_cost = budget;
1582 param.period = period;
1583 param.priority = priority;
1584 param.cls = cls;
1585 param.budget_policy = (want_enforcement) ?
1586 PRECISE_ENFORCEMENT : NO_ENFORCEMENT;
1587 param.budget_signal_policy = (want_enforcement && want_signals) ?
1588 PRECISE_SIGNALS : NO_SIGNALS;
1589 param.release_policy = PERIODIC;
1590
1591 if (migrate)
1592 param.cpu = cluster_to_first_cpu(cluster, cluster_size);
1593 ret = set_rt_task_param(gettid(), &param);
1594 if (ret < 0)
1595 bail_out("could not setup rt task params");
1596
1597 if (scheduler == LITMUS)
1598 init_litmus();
1599 else
1600 init_linux();
1601
1602 if (want_signals) {
1603 /* bind default longjmp signal handler to SIG_BUDGET. */
1604 activate_litmus_signals(SIG_BUDGET_MASK, longjmp_on_litmus_signal);
1605 }
1606
1607 if (scheduler == LITMUS)
1608 {
1609 ret = task_mode(LITMUS_RT_TASK);
1610 if (ret != 0)
1611 bail_out("could not become RT task");
1612 }
1613 else
1614 {
1615 trace_name();
1616 trace_param();
1617 }
1618
1619// if (protocol >= 0) {
1620// /* open reference to semaphore */
1621// lock_od = litmus_open_lock(protocol, resource_id, lock_namespace, &cluster);
1622// if (lock_od < 0) {
1623// perror("litmus_open_lock");
1624// usage("Could not open lock.");
1625// }
1626// }
1627
1628 if (GPU_USING) {
1629 allocate_locks(num_gpu_users, scheduler != LITMUS);
1630
1631 signal(SIGABRT, catch_exit);
1632 signal(SIGTERM, catch_exit);
1633 signal(SIGQUIT, catch_exit);
1634 signal(SIGSEGV, catch_exit);
1635
1636 init_cuda(num_gpu_users);
1637 safetynet = true;
1638
1639 if (ENABLE_RT_AUX_THREADS)
1640 if (enable_aux_rt_tasks(AUX_CURRENT | AUX_FUTURE) != 0)
1641 bail_out("enable_aux_rt_tasks() failed");
1642 }
1643
1644 if (wait) {
1645 ret = wait_for_ts_release2(&releaseTime);
1646 if (ret != 0)
1647 bail_out("wait_for_ts_release2()");
1648
1649 if (scheduler != LITMUS)
1650 log_release();
1651 }
1652 else if (scheduler != LITMUS)
1653 {
1654 clock_gettime(CLOCK_MONOTONIC, &releaseTime);
1655 sleep_next_period_linux();
1656 }
1657
1658 start = wctime();
1659
1660 if (scheduler == LITMUS)
1661 {
1662 if (!GPU_USING) {
1663 while (job(wcet_ms * 0.001 * scale, start + duration));
1664 }
1665 else {
1666 while (gpu_job(wcet_ms * 0.001 * scale,
1667 gpu_wcet_ms * 0.001 * scale,
1668 start + duration));
1669 }
1670 }
1671 else
1672 {
1673 if (!GPU_USING) {
1674 while (job_linux(wcet_ms * 0.001 * scale, start + duration));
1675 }
1676 else {
1677 while (gpu_job_linux(wcet_ms * 0.001 * scale,
1678 gpu_wcet_ms * 0.001 * scale,
1679 start + duration));
1680 }
1681 }
1682
1683 if (GPU_USING && ENABLE_RT_AUX_THREADS)
1684 if (disable_aux_rt_tasks(AUX_CURRENT | AUX_FUTURE) != 0)
1685 bail_out("disable_aux_rt_tasks() failed");
1686
1687// if (file) {
1688// /* use times read from the CSV file */
1689// for (cur_job = 0; cur_job < num_jobs; ++cur_job) {
1690// /* convert job's length to seconds */
1691// job(exec_times[cur_job] * 0.001 * scale,
1692// start + duration,
1693// lock_od, cs_length * 0.001);
1694// }
1695// } else {
1696// /* convert to seconds and scale */
1697// while (job(wcet_ms * 0.001 * scale, start + duration,
1698// lock_od, cs_length * 0.001));
1699// }
1700
1701 if (scheduler == LITMUS)
1702 {
1703 ret = task_mode(BACKGROUND_TASK);
1704 if (ret != 0)
1705 bail_out("could not become regular task (huh?)");
1706 }
1707
1708 if (GPU_USING) {
1709 safetynet = false;
1710 exit_cuda();
1711
1712
1713 printf("avg: %f\n", ms_sum/gpucount);
1714 }
1715
1716 if (file)
1717 free(exec_times);
1718
1719 return 0;
1720}
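
For reference, the liblitmus call sequence that the new gpuspin main() follows (init_rt_task_param / set_rt_task_param, then init_litmus and task_mode, with sleep_next_period between jobs) is summarized in the minimal sketch below. This sketch is not part of the commit; the function name, the 10 ms budget, the 100 ms period, and the fixed 100-job loop are placeholders, and error handling is reduced to early returns.

#include "litmus.h"

static int run_minimal_periodic_task(void)
{
	struct rt_task param;
	int ret;

	init_rt_task_param(&param);
	param.exec_cost = ms2ns(10);	/* placeholder budget */
	param.period = ms2ns(100);	/* placeholder period */
	param.cls = RT_CLASS_SOFT;
	param.budget_policy = NO_ENFORCEMENT;
	param.release_policy = PERIODIC;

	ret = set_rt_task_param(gettid(), &param);
	if (ret < 0)
		return ret;

	init_litmus();

	ret = task_mode(LITMUS_RT_TASK);
	if (ret != 0)
		return ret;

	for (int job = 0; job < 100; ++job) {
		/* one job's worth of work goes here */
		sleep_next_period();
	}

	return task_mode(BACKGROUND_TASK);
}
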
diff --git a/gpu/rtspin_fake_cuda.cpp b/gpu/rtspin_fake_cuda.cpp
deleted file mode 100644
index 247a74c..0000000
--- a/gpu/rtspin_fake_cuda.cpp
+++ /dev/null
@@ -1,1187 +0,0 @@
1#include <sys/time.h>
2
3#include <stdint.h>
4#include <stdio.h>
5#include <stdlib.h>
6#include <unistd.h>
7#include <time.h>
8#include <assert.h>
9#include <fcntl.h>
10#include <errno.h>
11
12#include <blitz/array.h>
13
14#include <boost/interprocess/managed_shared_memory.hpp>
15#include <boost/interprocess/sync/interprocess_barrier.hpp>
16#include <boost/interprocess/sync/interprocess_mutex.hpp>
17
18#include "litmus.h"
19
20using namespace blitz;
21using namespace std;
22using namespace boost::interprocess;
23
24#define RESET_RELEASE_ON_MISS
25
26
27void bail_out(const char* msg)
28{
29 perror(msg);
30 exit(-1 * errno);
31}
32
33
34static void usage(char *error) {
35 fprintf(stderr, "Error: %s\n", error);
36 fprintf(stderr,
37 "Usage:\n"
38 " rt_spin [COMMON-OPTS] WCET PERIOD DURATION\n"
39 " rt_spin [COMMON-OPTS] -f FILE [-o COLUMN] WCET PERIOD\n"
40 " rt_spin -l\n"
41 "\n"
42 "COMMON-OPTS = [-w] [-p PARTITION] [-c CLASS] [-s SCALE]\n"
43 "\n"
44 "WCET and PERIOD are milliseconds, DURATION is seconds.\n");
45 exit(EXIT_FAILURE);
46}
47
48#define NUMS 4096
49static int num[NUMS];
50
51#define PAGE_SIZE (1024*4)
52
53bool ENABLE_WAIT = true;
54bool GPU_TASK = false;
55bool ENABLE_AFFINITY = false;
56bool USE_KFMLP = false;
57bool RELAX_FIFO_MAX_LEN = false;
58bool USE_DYNAMIC_GROUP_LOCKS = false;
59bool BROADCAST_STATE = false;
60bool ENABLE_CHUNKING = false;
61bool MIGRATE_VIA_SYSMEM = false;
62bool USE_PRIOQ = false;
63
64int GPU_PARTITION = 0;
65int GPU_PARTITION_SIZE = 0;
66int NUM_SIMULT_USERS = 1;
67size_t SEND_SIZE = 0;
68size_t RECV_SIZE = 0;
69size_t STATE_SIZE = 0;
70size_t CHUNK_SIZE = PAGE_SIZE;
71
72
73#define MAX_GPUS 8
74
75int KEXCLU_LOCK;
76int EE_LOCKS[MAX_GPUS];
77int CE_SEND_LOCKS[MAX_GPUS];
78int CE_RECV_LOCKS[MAX_GPUS];
79
80int CUR_DEVICE = -1;
81int LAST_DEVICE = -1;
82
83bool useEngineLocks()
84{
85 return(NUM_SIMULT_USERS != 1);
86}
87
88int gpuCyclesPerSecond = 0;
89
90uint64_t *init_release_time = NULL;
91barrier *release_barrier = NULL;
92barrier *gpu_barrier = NULL;
93interprocess_mutex *gpu_mgmt_mutexes = NULL;
94managed_shared_memory *segment_ptr = NULL;
95managed_shared_memory *release_segment_ptr = NULL;
96
 97// observed average per-byte copy rate when four GPUs on the same node are in use from page-locked memory.
 98// about 1/3 to 1/4 of this when there is no bus contention.
99//const double msPerByte = 4.22e-07;
100//const double transOverhead = 0.01008; // also observed.
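//   (for scale, using the commented-out observed constants above: a 4 MiB
//    transfer would be estimated at 4,194,304 B * 4.22e-07 ms/B + 0.01008 ms
//    ~= 1.78 ms under contention.)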
101
102
103
104char *d_send_data[MAX_GPUS] = {0};
105char *d_recv_data[MAX_GPUS] = {0};
106char *d_state_data[MAX_GPUS] = {0};
107
108//cudaStream_t streams[MAX_GPUS];
109
110char *h_send_data = 0;
111char *h_recv_data = 0;
112char *h_state_data = 0;
113
114
115#include <sys/mman.h>
116#define USE_PAGE_LOCKED_MEMORY
117#ifdef USE_PAGE_LOCKED_MEMORY
118#define c_malloc(s) \
119 mmap(NULL, s , \
120 PROT_READ | PROT_WRITE, \
121 MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, \
122 -1, 0)
123#else
124#define c_malloc(s) malloc(s)
125#endif
126
127typedef int cudaError_t;
128#define cudaSuccess 0
129
130enum cudaMemcpyKind {
131cudaMemcpyHostToDevice = 0,
132cudaMemcpyDeviceToHost = 1,
133cudaMemcpyDeviceToDevice = 2,
134};
135
136cudaError_t cudaGetLastError()
137{
138 return cudaSuccess;
139}
140
141////////////////////////////////////////////////////////////////////////
142////////////////////////////////////////////////////////////////////////
143////////////////////////////////////////////////////////////////////////
144////////////////////////////////////////////////////////////////////////
145
146struct ce_lock_state
147{
148 int locks[2];
149 size_t num_locks;
150 size_t budget_remaining;
151 bool locked;
152
153 ce_lock_state(int device_a, enum cudaMemcpyKind kind, size_t size, int device_b = -1) {
154 num_locks = (device_a != -1) + (device_b != -1);
155
156 if(device_a != -1) {
157 locks[0] = (kind == cudaMemcpyHostToDevice) ?
158 CE_SEND_LOCKS[device_a] : CE_RECV_LOCKS[device_a];
159 }
160
161 if(device_b != -1) {
162 assert(kind == cudaMemcpyDeviceToDevice);
163
164 locks[1] = CE_RECV_LOCKS[device_b];
165
166 if(locks[1] < locks[0]) {
167 int temp = locks[1];
168 locks[1] = locks[0];
169 locks[0] = temp;
170 }
171 }
172
173 if(!ENABLE_CHUNKING)
174 budget_remaining = size;
175 else
176 budget_remaining = CHUNK_SIZE;
177 }
178
179 void lock() {
180 if(USE_DYNAMIC_GROUP_LOCKS) {
181 litmus_dgl_lock(locks, num_locks);
182 }
183 else
184 {
185 for(int l = 0; l < num_locks; ++l)
186 {
187 litmus_lock(locks[l]);
188 }
189 }
190 locked = true;
191 }
192
193 void unlock() {
194 if(USE_DYNAMIC_GROUP_LOCKS) {
195 litmus_dgl_unlock(locks, num_locks);
196 }
197 else
198 {
199 // reverse order
200 for(int l = num_locks - 1; l >= 0; --l)
201 {
202 litmus_unlock(locks[l]);
203 }
204 }
205 locked = false;
206 }
207
208 void refresh() {
209 budget_remaining = CHUNK_SIZE;
210 }
211
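	// note: despite the name, this returns true when the pending transfer
	// would meet or exceed the remaining budget, i.e. when the engine
	// locks should be released and the budget replenished before the
	// next chunk is copied.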
212 bool budgetIsAvailable(size_t tosend) {
213 return(tosend >= budget_remaining);
214 }
215
216 void decreaseBudget(size_t spent) {
217 budget_remaining -= spent;
218 }
219};
220
221// precondition: if do_locking == true, locks in state are held.
222cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count,
223 enum cudaMemcpyKind kind,
224 ce_lock_state* state)
225{
226 cudaError_t ret = cudaSuccess;
227 int remaining = count;
228
229 char* dst = (char*)a_dst;
230 const char* src = (const char*)a_src;
231
232 // disable chunking, if needed, by setting chunk_size equal to the
233 // amount of data to be copied.
234 int chunk_size = (ENABLE_CHUNKING) ? CHUNK_SIZE : count;
235 int i = 0;
236
237 while(remaining != 0)
238 {
239 int bytesToCopy = std::min(remaining, chunk_size);
240
241 if(state && state->budgetIsAvailable(bytesToCopy) && state->locked) {
242 //cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) );
243 ret = cudaGetLastError();
244
245 if(ret != cudaSuccess)
246 {
247 break;
248 }
249
250 state->unlock();
 251 state->refresh(); // replenish.
 252 // we can only run out of
 253 // budget if chunking is enabled.
 254 // we presume that the initial budget
 255 // would be set to cover the entire
 256 // memcpy if chunking were disabled.
257 }
258
259 if(state && !state->locked) {
260 state->lock();
261 }
262
263 //ret = cudaMemcpy(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind);
264 //cudaMemcpyAsync(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind, streams[CUR_DEVICE]);
265
266 if(state) {
267 state->decreaseBudget(bytesToCopy);
268 }
269
270// if(ret != cudaSuccess)
271// {
272// break;
273// }
274
275 ++i;
276 remaining -= bytesToCopy;
277 }
278 return ret;
279}
280
281cudaError_t chunkMemcpy(void* a_dst, const void* a_src, size_t count,
282 enum cudaMemcpyKind kind,
283 int device_a = -1, // device_a == -1 disables locking
284 bool do_locking = true,
285 int device_b = -1)
286{
287 cudaError_t ret;
288 if(!do_locking || device_a == -1) {
289 ret = __chunkMemcpy(a_dst, a_src, count, kind, NULL);
290 //cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) );
291 if(ret == cudaSuccess)
292 ret = cudaGetLastError();
293 }
294 else {
295 ce_lock_state state(device_a, kind, count, device_b);
296 state.lock();
297 ret = __chunkMemcpy(a_dst, a_src, count, kind, &state);
298 //cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) );
299 if(ret == cudaSuccess)
300 ret = cudaGetLastError();
301 state.unlock();
302 }
303 return ret;
304}
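
The two functions above implement the patch's chunked-copy idea: split a large transfer into CHUNK_SIZE pieces and release the copy-engine lock(s) between pieces so that other tasks can interleave their own transfers. A minimal host-only sketch of the same idea is shown below, with std::mutex standing in for the LITMUS^RT engine lock and memcpy standing in for cudaMemcpy; the name chunked_copy and its parameters are illustrative, not part of the patch.

#include <algorithm>
#include <cstddef>
#include <cstring>
#include <mutex>

/* Copy 'count' bytes in 'chunk'-sized pieces, holding 'lock' only while one
 * piece is in flight, so another task may acquire the lock between pieces. */
static void chunked_copy(void* dst, const void* src, std::size_t count,
                         std::size_t chunk, std::mutex& lock)
{
	char* d = static_cast<char*>(dst);
	const char* s = static_cast<const char*>(src);
	std::size_t done = 0;

	while (done < count) {
		std::size_t n = std::min(count - done, chunk);
		lock.lock();
		std::memcpy(d + done, s + done, n);	/* stands in for cudaMemcpy() */
		lock.unlock();
		done += n;
	}
}
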
305
306
307////////////////////////////////////////////////////////////////////////
308////////////////////////////////////////////////////////////////////////
309////////////////////////////////////////////////////////////////////////
310
311
312inline uint64_t timespec_to_ns(const struct timespec& t)
313{
314 return(t.tv_sec*1e9 + t.tv_nsec);
315}
316
317inline struct timespec ns_to_timespec(const uint64_t& ns)
318{
 319 struct timespec temp = {(time_t)(ns / 1000000000ULL), (long)(ns % 1000000000ULL)};
320 return(temp);
321}
322
323inline uint64_t clock_gettime_ns(clockid_t clk_id)
324{
325 struct timespec temp;
326 clock_gettime(clk_id, &temp);
327 return timespec_to_ns(temp);
328}
329
330
331
332static int loop_once(void)
333{
334 int i, j = 0;
335 for (i = 0; i < NUMS; i++)
336 j += num[i]++;
337 return j;
338}
339
340static int loop_for(double exec_time, double emergency_exit)
341{
342 double last_loop = 0, loop_start;
343 int tmp = 0;
344
345 double start = cputime();
346 double now = cputime();
347
348 while (now + last_loop < start + exec_time) {
349 loop_start = now;
350 tmp += loop_once();
351 now = cputime();
352 last_loop = now - loop_start;
353 if (emergency_exit && wctime() > emergency_exit) {
354 /* Oops --- this should only be possible if the execution time tracking
355 * is broken in the LITMUS^RT kernel. */
356 fprintf(stderr, "!!! rtspin/%d emergency exit!\n", getpid());
357 fprintf(stderr, "Something is seriously wrong! Do not ignore this.\n");
358 break;
359 }
360 }
361
362 return tmp;
363}
364
365static void allocate_locks()
366{
367 // allocate k-FMLP lock
368 int fd = open("semaphores", O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
369
370 int base_name = GPU_PARTITION * 1000;
371
372 if(USE_KFMLP) {
373 KEXCLU_LOCK = open_kfmlp_gpu_sem(fd,
374 base_name, /* name */
375 GPU_PARTITION_SIZE,
376 GPU_PARTITION*GPU_PARTITION_SIZE,
377 NUM_SIMULT_USERS,
378 ENABLE_AFFINITY
379 );
380 }
381 else {
382 KEXCLU_LOCK = open_gpusync_token_lock(fd,
383 base_name, /* name */
384 GPU_PARTITION_SIZE,
385 GPU_PARTITION*GPU_PARTITION_SIZE,
386 NUM_SIMULT_USERS,
387 IKGLP_M_IN_FIFOS,
388 (!RELAX_FIFO_MAX_LEN) ?
389 IKGLP_OPTIMAL_FIFO_LEN :
390 IKGLP_UNLIMITED_FIFO_LEN,
391 ENABLE_AFFINITY
392 );
393// KEXCLU_LOCK = open_ikglp_gpu_sem(fd,
394// base_name, /* name */
395// GPU_PARTITION_SIZE,
396// GPU_PARTITION*GPU_PARTITION_SIZE,
397// NUM_SIMULT_USERS,
398// ENABLE_AFFINITY,
399// RELAX_FIFO_MAX_LEN
400// );
401 }
402 if(KEXCLU_LOCK < 0)
403 perror("open_kexclu_sem");
404
405 if(NUM_SIMULT_USERS > 1)
406 {
407 open_sem_t opensem = (!USE_PRIOQ) ? open_fifo_sem : open_prioq_sem;
408 const char* opensem_label = (!USE_PRIOQ) ? "open_fifo_sem" : "open_prioq_sem";
409
410 // allocate the engine locks.
411 for (int i = 0; i < MAX_GPUS; ++i)
412 {
413 EE_LOCKS[i] = opensem(fd, (i+1)*10 + base_name);
414 if(EE_LOCKS[i] < 0)
415 perror(opensem_label);
416
417 CE_SEND_LOCKS[i] = opensem(fd, (i+1)*10 + base_name + 1);
418 if(CE_SEND_LOCKS[i] < 0)
419 perror(opensem_label);
420
421 if(NUM_SIMULT_USERS == 3)
422 {
423 // allocate a separate lock for the second copy engine
424 CE_RECV_LOCKS[i] = opensem(fd, (i+1)*10 + base_name + 2);
425 if(CE_RECV_LOCKS[i] < 0)
426 perror(opensem_label);
427 }
428 else
429 {
430 // share a single lock for the single copy engine
431 CE_RECV_LOCKS[i] = CE_SEND_LOCKS[i];
432 }
433 }
434 }
435}
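// (name arithmetic, for reference: with GPU_PARTITION = 1 and device i = 2,
//  base_name = 1000, so the engine locks are opened under names 1030 (EE),
//  1031 (CE send) and 1032 (CE recv); the token lock itself uses name 1000.)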
436
437static void allocate_host_memory()
438{
439 // round up to page boundaries
440 size_t send_alloc_bytes = SEND_SIZE + (SEND_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
441 size_t recv_alloc_bytes = RECV_SIZE + (RECV_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
442 size_t state_alloc_bytes = STATE_SIZE + (STATE_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
443
444 printf("Allocating host memory. send = %dB, recv = %dB, state = %dB\n",
445 send_alloc_bytes, recv_alloc_bytes, state_alloc_bytes);
446
447// if(send_alloc_bytes > 0)
448// {
449// h_send_data = (char *)c_malloc(send_alloc_bytes);
450// memset(h_send_data, 0x55, send_alloc_bytes); // write some random value
451// // this will open a connection to GPU 0 if there is no active context, so
452// // expect long stalls. LAME.
453// cutilSafeCall( cudaHostRegister(h_send_data, send_alloc_bytes, cudaHostRegisterPortable) );
454// }
455//
456// if(recv_alloc_bytes > 0)
457// {
458// h_recv_data = (char *)c_malloc(recv_alloc_bytes);
459// memset(h_recv_data, 0xAA, recv_alloc_bytes);
460// cutilSafeCall( cudaHostRegister(h_recv_data, recv_alloc_bytes, cudaHostRegisterPortable) );
461// }
462//
463// if(state_alloc_bytes > 0)
464// {
465// h_state_data = (char *)c_malloc(state_alloc_bytes);
466// memset(h_state_data, 0xCC, state_alloc_bytes); // write some random value
467// cutilSafeCall( cudaHostRegister(h_state_data, state_alloc_bytes, cudaHostRegisterPortable) );
468// }
469
470 printf("Host memory allocated.\n");
471}
472
473static void allocate_device_memory()
474{
475 printf("Allocating device memory.\n");
476 // establish a connection to each GPU.
477// for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
478// {
479// int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
480//
481// if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].lock();
482//
483// cutilSafeCall( cudaSetDevice(which_device) );
484// cutilSafeCall( cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0) );
485// cutilSafeCall( cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0) );
486//
487// cutilSafeCall( cudaStreamCreate(&streams[which_device]) );
488//
489// /* pre-allocate memory, pray there's enough to go around */
490// if(SEND_SIZE > 0) {
491// cutilSafeCall( cudaMalloc((void**)&d_send_data[which_device], SEND_SIZE) );
492// }
493// if(RECV_SIZE > 0) {
 494// cutilSafeCall( cudaMalloc((void**)&d_recv_data[which_device], RECV_SIZE) );
495// }
496// if(STATE_SIZE > 0) {
 497// cutilSafeCall( cudaMalloc((void**)&d_state_data[which_device], STATE_SIZE) );
498// }
499//
500// if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].unlock();
501// }
502 printf("Device memory allocated.\n");
503}
504
505static void configure_gpus()
506{
507 printf("Configuring GPU\n");
508
509// // SUSPEND WHEN BLOCKED!!
510// cutilSafeCall( cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync) );
511//
512// // establish a connection to each GPU.
513// for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
514// {
515// int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
516//
517// if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].lock();
518//
519// cutilSafeCall( cudaSetDevice(which_device) );
520// cutilSafeCall( cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0) );
521// cutilSafeCall( cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0) );
522//
523// cutilSafeCall( cudaStreamCreate(&streams[which_device]) );
524//
525// // enable P2P migrations.
526// // we assume all GPUs are on the same I/O hub.
527// for(int j = 0; j < GPU_PARTITION_SIZE; ++j)
528// {
529// int other_device = GPU_PARTITION*GPU_PARTITION_SIZE + j;
530//
531// if(which_device != other_device)
532// {
533// cutilSafeCall( cudaDeviceEnablePeerAccess(other_device, 0) );
534// }
535// }
536//
537// if(i == 0)
538// {
539// struct cudaDeviceProp pi;
540// cudaGetDeviceProperties(&pi, i);
541// gpuCyclesPerSecond = pi.clockRate * 1000; /* khz -> hz */
542// }
543//
544// if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].unlock();
545// }
546
547 printf("GPUs have been configured.\n");
548}
549
550static void init_cuda()
551{
552 configure_gpus();
553 allocate_host_memory();
554 allocate_device_memory();
555 allocate_locks();
556}
557
558static void exit_cuda()
559{
560 for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
561 {
562 int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
563
564 if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].lock();
565
566// cutilSafeCall( cudaSetDevice(which_device) );
567// cutilSafeCall( cudaDeviceReset() );
568
569 if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].unlock();
570 }
571}
572
573static void catchExit(void)
574{
575 if(GPU_TASK)
576 {
577 // try to unlock everything. litmus will prevent bogus calls.
578 if(NUM_SIMULT_USERS > 1)
579 {
580 for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
581 {
582 int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
583
584 litmus_unlock(EE_LOCKS[which_device]);
585 litmus_unlock(CE_SEND_LOCKS[which_device]);
 586 if(NUM_SIMULT_USERS == 3) {
587 litmus_unlock(CE_RECV_LOCKS[which_device]);
588 }
589 }
590 }
591
592 if(CUR_DEVICE >= 0) {
593 unregister_nv_device(CUR_DEVICE);
594 }
595
596 litmus_unlock(KEXCLU_LOCK);
597 }
598}
599
600static void migrateToGPU(int destination)
601{
602 if(!BROADCAST_STATE && STATE_SIZE > 0)
603 {
604 if(MIGRATE_VIA_SYSMEM)
605 {
606 chunkMemcpy(h_state_data, d_state_data[LAST_DEVICE], STATE_SIZE,
607 cudaMemcpyDeviceToHost, LAST_DEVICE, useEngineLocks());
608 }
609 }
610
611// cutilSafeCall( cudaSetDevice(destination) );
612
613 if(!BROADCAST_STATE && STATE_SIZE > 0)
614 {
615 if(MIGRATE_VIA_SYSMEM)
616 {
617 chunkMemcpy(d_state_data[CUR_DEVICE], h_state_data, STATE_SIZE,
618 cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks());
619 }
620 else
621 {
622 chunkMemcpy(d_state_data[destination],
623 d_state_data[LAST_DEVICE],
624 STATE_SIZE,
625 cudaMemcpyDeviceToDevice,
626 CUR_DEVICE,
627 useEngineLocks(),
628 destination);
629 }
630 }
631}
632
633static void broadcastState(int from)
634{
635 if(STATE_SIZE > 0)
636 {
637 assert(CUR_DEVICE == from);
638
639 if(MIGRATE_VIA_SYSMEM)
640 {
641 chunkMemcpy(h_state_data, d_state_data[from], STATE_SIZE,
642 cudaMemcpyDeviceToHost, from, useEngineLocks());
643 }
644
645 for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
646 {
647 int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
648 if(which_device != from)
649 {
650 if(MIGRATE_VIA_SYSMEM)
651 {
652// cutilSafeCall( cudaSetDevice(which_device) );
653 CUR_DEVICE = which_device; // temporary
654 chunkMemcpy(d_state_data[which_device], h_state_data, STATE_SIZE,
655 cudaMemcpyHostToDevice, which_device, useEngineLocks());
656 }
657 else
658 {
659 chunkMemcpy(d_state_data[which_device],
660 d_state_data[from],
661 STATE_SIZE,
662 cudaMemcpyDeviceToDevice,
663 from,
664 useEngineLocks(),
665 which_device);
666 }
667 }
668 }
669
670 if(MIGRATE_VIA_SYSMEM && CUR_DEVICE != from)
671 {
672// cutilSafeCall( cudaSetDevice(from) );
673 CUR_DEVICE = from;
674 }
675 }
676}
677
678//// Executes on graphics card.
679//__global__ void docudaspin(unsigned int cycles)
680//{
681// long long unsigned int elapsed = 0;
682// long long int now = clock64();
683// long long int last;
684// do
685// {
686// last = now;
687// now = clock64();
688// elapsed += max(0ll, (long long int)(now - last)); // don't count iterations with clock roll-over
689// }while(elapsed < cycles);
690//
691// return;
692//}
693
694
695
696static void gpu_loop_for(double gpu_sec_time, double emergency_exit)
697{
698 unsigned int numcycles = (unsigned int)(gpuCyclesPerSecond * gpu_sec_time);
699 int numblocks = 1;
700 int blocksz = 1;
701
702 CUR_DEVICE = litmus_lock(KEXCLU_LOCK);
703 {
704 if(CUR_DEVICE != LAST_DEVICE && LAST_DEVICE != -1)
705 {
706 migrateToGPU(CUR_DEVICE);
707 }
708
709 if(SEND_SIZE > 0)
710 {
711 // handles chunking and locking, as appropriate.
712 chunkMemcpy(d_send_data[CUR_DEVICE], h_send_data, SEND_SIZE,
713 cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks());
714 }
715
716 if(useEngineLocks()) litmus_lock(EE_LOCKS[CUR_DEVICE]);
717
718// docudaspin <<<numblocks,blocksz, 0, streams[CUR_DEVICE]>>> (numcycles);
719// cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) );
720
721 if(useEngineLocks()) litmus_unlock(EE_LOCKS[CUR_DEVICE]);
722
723 if(RECV_SIZE > 0)
724 {
725 chunkMemcpy(h_recv_data, d_recv_data[CUR_DEVICE], RECV_SIZE,
726 cudaMemcpyDeviceToHost, CUR_DEVICE, useEngineLocks());
727 }
728
729 if(BROADCAST_STATE)
730 {
731 broadcastState(CUR_DEVICE);
732 }
733 }
734 litmus_unlock(KEXCLU_LOCK);
735
736 LAST_DEVICE = CUR_DEVICE;
737 CUR_DEVICE = -1;
738}
739
740
741static void debug_delay_loop(void)
742{
743 double start, end, delay;
744
745 while (1) {
746 for (delay = 0.5; delay > 0.01; delay -= 0.01) {
747 start = wctime();
748 loop_for(delay, 0);
749 end = wctime();
750 printf("%6.4fs: looped for %10.8fs, delta=%11.8fs, error=%7.4f%%\n",
751 delay,
752 end - start,
753 end - start - delay,
754 100 * (end - start - delay) / delay);
755 }
756 }
757}
758
759static int job(double exec_time, double gpu_sec_time, double program_end)
760{
761 if (wctime() > program_end)
762 return 0;
763 else if (!GPU_TASK)
764 {
765 loop_for(exec_time, program_end + 1);
766 }
767 else
768 {
769 double cpu_bookend = (exec_time)/2.0;
770
771 loop_for(cpu_bookend, program_end + 1);
772 gpu_loop_for(gpu_sec_time, program_end + 1);
773 loop_for(cpu_bookend, program_end + 1);
774 }
775 return 1;
776}
777
778#define OPTSTR "p:ls:e:g:G:W:N:S:R:T:BMaLyC:rz:q"
779
780int main(int argc, char** argv)
781{
782 atexit(catchExit);
783
784 int ret;
785 lt_t wcet;
786 lt_t period;
787 double wcet_ms, period_ms;
788 int migrate = 0;
789 int cpu = 0;
790 int opt;
791 int test_loop = 0;
792// int column = 1;
793 const char *file = NULL;
794 int want_enforcement = 0;
795 double duration = 0, releaseTime = 0;
796 double *exec_times = NULL;
797 double scale = 1.0;
798 uint64_t cur_job;
799 uint64_t num_jobs;
800
801 int create_shm = -1;
802 int num_tasks = 0;
803
804 double gpu_sec_ms = 0;
805
806 while ((opt = getopt(argc, argv, OPTSTR)) != -1) {
807// printf("opt = %c optarg = %s\n", opt, optarg);
808 switch (opt) {
809// case 'w':
810// ENABLE_WAIT = 1;
811// break;
812 case 'p':
813 cpu = atoi(optarg);
814 migrate = 1;
815 break;
816 case 'l':
817 test_loop = 1;
818 break;
819 case 's':
820 scale = atof(optarg);
821 break;
822 case 'e':
823 gpu_sec_ms = atof(optarg);
824 break;
825// case 'x':
826// trans_sec_ms = atof(optarg);
827// break;
828 case 'z':
829 NUM_SIMULT_USERS = atoi(optarg);
830 break;
831 case 'q':
832 USE_PRIOQ = true;
833 break;
834 case 'g':
835 GPU_TASK = 1;
836 GPU_PARTITION_SIZE = atoi(optarg);
837 break;
838 case 'G':
839 GPU_PARTITION = atoi(optarg);
840 break;
841 case 'S':
842 SEND_SIZE = (size_t)(atof(optarg)*1024);
843 break;
844 case 'R':
845 RECV_SIZE = (size_t)(atof(optarg)*1024);
846 break;
847 case 'T':
848 STATE_SIZE = (size_t)(atof(optarg)*1024);
849 break;
850 case 'B':
851 BROADCAST_STATE = true;
852 break;
853 case 'M':
854 MIGRATE_VIA_SYSMEM = true;
855 break;
856 case 'a':
857 ENABLE_AFFINITY = true;
858 break;
859 case 'r':
860 RELAX_FIFO_MAX_LEN = true;
861 break;
862 case 'L':
863 USE_KFMLP = true;
864 break;
865 case 'y':
866 USE_DYNAMIC_GROUP_LOCKS = true;
867 break;
868 case 'C':
869 ENABLE_CHUNKING = true;
870 CHUNK_SIZE = (size_t)(atof(optarg)*1024);
871 break;
872 case 'W':
873 create_shm = atoi(optarg);
874 break;
875 case 'N':
876 num_tasks = atoi(optarg);
877 break;
878 case ':':
879 usage("Argument missing.");
880 break;
881 case '?':
882 default:
883 usage("Bad argument.");
884 break;
885 }
886 }
887
888 if (test_loop) {
889 debug_delay_loop();
890 return 0;
891 }
892
893// if (file) {
894// int num_jobs_tmp;
895// get_exec_times(file, column, &num_jobs_tmp, &exec_times);
896// num_jobs = num_jobs_tmp;
897//
898// if (argc - optind < 2)
899// usage("Arguments missing.");
900//
901// for (cur_job = 0; cur_job < num_jobs; ++cur_job) {
902// /* convert the execution time to seconds */
903// duration += exec_times[cur_job] * 0.001;
904// }
905// } else {
906 /*
907 * if we're not reading from the CSV file, then we need
908 * three parameters
909 */
910 if (argc - optind < 3)
911 usage("Arguments missing.");
912// }
913
914 wcet_ms = atof(argv[optind + 0]);
915 period_ms = atof(argv[optind + 1]);
916
917 wcet = wcet_ms * __NS_PER_MS;
918 period = period_ms * __NS_PER_MS;
919 if (wcet <= 0)
920 usage("The worst-case execution time must be a "
921 "positive number.");
922 if (period <= 0)
923 usage("The period must be a positive number.");
924 if (!file && wcet > period) {
925 usage("The worst-case execution time must not "
926 "exceed the period.");
927 }
928
929 if (!file)
930 {
931 duration = atof(argv[optind + 2]);
932 num_jobs = ((double)duration*1e3)/period_ms;
933 ++num_jobs; // padding
934 }
935 else if (file && num_jobs > 1)
936 {
937 duration += period_ms * 0.001 * (num_jobs - 1);
938 }
939
940 if (migrate) {
941 ret = be_migrate_to(cpu);
942 if (ret < 0)
943 bail_out("could not migrate to target partition");
944 }
945
946 if(ENABLE_WAIT)
947 {
948 if(num_tasks > 0)
949 {
950 printf("%d creating release shared memory\n", getpid());
951 shared_memory_object::remove("release_barrier_memory");
952 release_segment_ptr = new managed_shared_memory(create_only, "release_barrier_memory", 4*1024);
953
954 printf("%d creating release barrier for %d users\n", getpid(), num_tasks);
955 release_barrier = release_segment_ptr->construct<barrier>("barrier release_barrier")(num_tasks);
956
957 init_release_time = release_segment_ptr->construct<uint64_t>("uint64_t instance")();
958 *init_release_time = 0;
959 }
960 else
961 {
962 do
963 {
964 try
965 {
966 printf("%d opening release shared memory\n", getpid());
967 segment_ptr = new managed_shared_memory(open_only, "release_barrier_memory");
968 }
969 catch(...)
970 {
971 printf("%d shared memory not ready. sleeping\n", getpid());
972 sleep(1);
973 }
974 }while(segment_ptr == NULL);
975
976 release_barrier = segment_ptr->find<barrier>("barrier release_barrier").first;
977 init_release_time = segment_ptr->find<uint64_t>("uint64_t instance").first;
978 }
979 }
980
981
982 if(GPU_TASK)
983 {
984 if(ENABLE_WAIT)
985 {
986 if(create_shm > -1)
987 {
988 printf("%d creating shared memory\n", getpid());
989 shared_memory_object::remove("gpu_barrier_memory");
990 segment_ptr = new managed_shared_memory(create_only, "gpu_barrier_memory", 4*1024);
991
992 printf("%d creating a barrier for %d users\n", getpid(), create_shm);
993 gpu_barrier = segment_ptr->construct<barrier>("barrier instance")(create_shm);
994 printf("%d creating gpu mgmt mutexes for 8 devices\n", getpid());
995 gpu_mgmt_mutexes = segment_ptr->construct<interprocess_mutex>("interprocess_mutex m")[8]();
996 }
997 else
998 {
999 do
1000 {
1001 try
1002 {
1003 printf("%d opening shared memory\n", getpid());
1004 segment_ptr = new managed_shared_memory(open_only, "gpu_barrier_memory");
1005 }
1006 catch(...)
1007 {
1008 printf("%d shared memory not ready. sleeping\n", getpid());
1009 sleep(1);
1010 }
1011 }while(segment_ptr == NULL);
1012
1013 gpu_barrier = segment_ptr->find<barrier>("barrier instance").first;
1014 gpu_mgmt_mutexes = segment_ptr->find<interprocess_mutex>("interprocess_mutex m").first;
1015 }
1016 }
1017
1018 // scale data transmission too??
1019 SEND_SIZE *= scale;
1020 RECV_SIZE *= scale;
1021 STATE_SIZE *= scale;
1022
1023 init_cuda();
1024 }
1025
1026 ret = sporadic_task_ns(wcet, period, 0, cpu, RT_CLASS_SOFT,
1027 want_enforcement ? PRECISE_ENFORCEMENT
1028 : NO_ENFORCEMENT,
1029 migrate);
1030 if (ret < 0)
1031 bail_out("could not setup rt task params");
1032
1033 init_litmus();
1034
1035 ret = task_mode(LITMUS_RT_TASK);
1036 if (ret != 0)
1037 bail_out("could not become RT task");
1038
1039
1040
1041 uint64_t jobCount = 0;
1042 blitz::Array<uint64_t, 1> responseTimeLog(num_jobs+1);
1043
1044 struct timespec spec;
1045 uint64_t release;
1046 uint64_t finish;
1047
1048
1049 if (ENABLE_WAIT) {
1050 printf("Waiting for release.\n");
1051 ret = wait_for_ts_release();
1052 if (ret != 0)
1053 bail_out("wait_for_ts_release()");
1054 }
1055 else
1056 {
1057 sleep_next_period();
1058 }
1059
1060 clock_gettime(CLOCK_MONOTONIC, &spec);
1061 release = timespec_to_ns(spec);
1062 if (!__sync_bool_compare_and_swap(init_release_time, 0, release))
1063 {
1064 release = *init_release_time;
1065 }
1066
1067 releaseTime = wctime();
1068 double failsafeEnd = releaseTime + duration;
1069
1070
1071 if (file) {
1072 /* use times read from the CSV file */
1073 for (cur_job = 0; cur_job < num_jobs; ++cur_job) {
1074 /* convert job's length to seconds */
1075 job(exec_times[cur_job] * 0.001 * scale,
1076 gpu_sec_ms * 0.001 * scale,
1077 failsafeEnd);
1078 }
1079 } else {
1080 /* convert to seconds and scale */
1081 int keepGoing;
1082 do
1083 {
1084 keepGoing = job(wcet_ms * 0.001 * scale, gpu_sec_ms * 0.001 * scale, failsafeEnd);
1085
1086
1087 clock_gettime(CLOCK_MONOTONIC, &spec);
1088 finish = timespec_to_ns(spec);
1089
1090 responseTimeLog(min(num_jobs,jobCount++)) = finish - release;
1091
1092 // this is an estimated upper-bound on release time. it may be off by several microseconds.
1093#ifdef RESET_RELEASE_ON_MISS
1094 release = (release + period < finish) ?
1095 finish : /* missed deadline. adopt next release as current time. */
1096 release + period; /* some time in the future. */
1097#else
1098 release = release + period; // allow things to get progressively later.
1099#endif
1100
1101 sleep_next_period();
1102 clock_gettime(CLOCK_MONOTONIC, &spec);
1103 release = min(timespec_to_ns(spec), release);
1104
1105 } while(keepGoing);
1106 }
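	// (example of the accounting above: with period = 10 ms, release = 100 ms
	//  and finish = 115 ms, the job missed its deadline at 110 ms; under
	//  RESET_RELEASE_ON_MISS the next release is taken to be 115 ms rather
	//  than 110 ms, so later response times are measured against the late
	//  release instead of drifting ever further behind.)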
1107
1108 if(GPU_TASK && ENABLE_WAIT)
1109 {
1110 printf("%d waiting at barrier\n", getpid());
1111 gpu_barrier->wait();
1112 }
1113
1114 ret = task_mode(BACKGROUND_TASK);
1115 if (ret != 0)
1116 bail_out("could not become regular task (huh?)");
1117
1118 if (file)
1119 free(exec_times);
1120
1121 if(GPU_TASK)
1122 {
1123 /*
1124 if(ENABLE_WAIT)
1125 {
1126 // wait for all GPU using tasks ext RT mode.
1127 printf("%d waiting at barrier\n", getpid());
1128 gpu_barrier->wait();
1129 }
1130 */
1131
1132 exit_cuda();
1133
1134 if(ENABLE_WAIT)
1135 {
1136 /* wait before we clean up memory */
1137 printf("%d waiting for all to shutdown GPUs\n", getpid());
1138 gpu_barrier->wait();
1139
1140/*
1141 if(create_shm > -1)
1142 {
1143 printf("%d removing shared memory\n", getpid());
1144 shared_memory_object::remove("gpu_barrier_memory");
1145 }
1146*/
1147 }
1148 }
1149
1150
1151 if (ENABLE_WAIT)
1152 {
1153 printf("%d waiting at exit barrier\n", getpid());
1154 release_barrier->wait();
1155 }
1156
1157
1158 char gpu_using_str[] = "GPU\n";
1159 char cpu_only_str[] = "CPU\n";
1160 #define USED(arr) (arr)(Range(fromStart,min(num_jobs-1,jobCount-1)))
1161 // cpu, pid, period (ms), avg-rt, avg-rt (% of period), min-rt, max-rt, avg-slack, num-jobs, num-misses, num-misses (>2x period), task type
1162 printf("DONE,%d,%d,%f,%f,%f,%lu,%lu,%f,%lu,%d,%d,%s",
1163 cpu,
1164 getpid(),
1165 period_ms,
1166 // average
1167 blitz::mean(USED(responseTimeLog)),
1168 // average pct of period
1169 100.0*(blitz::mean(USED(responseTimeLog))/period),
1170 // min
1171 blitz::min(USED(responseTimeLog)),
1172 // max
1173 blitz::max(USED(responseTimeLog)),
1174 // average slack
1175 blitz::mean((uint64_t)period - USED(responseTimeLog)),
1176 // num jobs
1177 min(num_jobs-1,jobCount-1),
1178 // num misses
1179 blitz::count(USED(responseTimeLog) > (uint64_t)period),
1180 // num misses w/ unbounded
1181 blitz::count(USED(responseTimeLog) > (uint64_t)(2*period)),
1182 // flag gpu-using tasks
1183 ((GPU_TASK) ? gpu_using_str : cpu_only_str)
1184 );
1185
1186 return 0;
1187}
diff --git a/include/common.h b/include/common.h
index d1234ba..faf2c07 100644
--- a/include/common.h
+++ b/include/common.h
@@ -1,7 +1,14 @@
1#ifndef COMMON_H 1#ifndef COMMON_H
2#define COMMON_H 2#define COMMON_H
3 3
4#ifdef __cplusplus
5extern "C" {
6#endif
4 7
5void bail_out(const char* msg); 8void bail_out(const char* msg);
6 9
10#ifdef __cplusplus
11}
12#endif
13
7#endif 14#endif
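
The extern "C" guards added to include/common.h matter because the header is now included from C++ translation units (gpuspin.cu, budget.cpp) as well as the existing C tools, while bail_out() is presumably still defined with C linkage elsewhere in the tree. Without the guards, a C++ caller would reference the mangled symbol (_Z8bail_outPKc) and fail to link against the C object file. A minimal illustration follows; use.cpp is a hypothetical file, not part of the patch.

/* use.cpp -- compiled with g++ or nvcc; links against a bail_out() that was
 * compiled as C, because common.h now wraps the declaration in extern "C". */
#include "common.h"

int main()
{
	bail_out("gpuspin: fatal error");	/* resolves to the unmangled C symbol */
	return 0;
}
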