author | Glenn Elliott <gelliott@cs.unc.edu> | 2013-04-14 15:06:43 -0400 |
---|---|---|
committer | Glenn Elliott <gelliott@cs.unc.edu> | 2013-04-14 15:06:43 -0400 |
commit | 37b4a24ba84f1dffd680fd550a3d8cad2ac5e3a8 (patch) | |
tree | 5dc5e56a7a4f424e75f59f7705263bdb43b86fb3 | |
parent | 209f1961ea2d5863d6f2d2e9d2323446ee5e53c4 (diff) |
Implemented gpusync rtspin.
-rw-r--r-- | Makefile | 51 | ||||
-rw-r--r-- | gpu/budget.cpp | 143 | ||||
-rw-r--r-- | gpu/gpuspin.cu | 1720 | ||||
-rw-r--r-- | gpu/rtspin_fake_cuda.cpp | 1187 | ||||
-rw-r--r-- | include/common.h | 7 |
5 files changed, 1909 insertions, 1199 deletions
@@ -24,6 +24,12 @@ flags-debug-cpp = -O2 -Wall -Werror -g | |||
24 | flags-api = -D_XOPEN_SOURCE=600 -D_GNU_SOURCE | 24 | flags-api = -D_XOPEN_SOURCE=600 -D_GNU_SOURCE |
25 | flags-misc = -fasynchronous-unwind-tables -fnon-call-exceptions | 25 | flags-misc = -fasynchronous-unwind-tables -fnon-call-exceptions |
26 | 26 | ||
27 | flags-cu-debug = -g -G -Xcompiler -Wall -Xcompiler -Werror | ||
28 | flags-cu-optim = -O3 -Xcompiler -march=native | ||
29 | flags-cu-nvcc = --use_fast_math -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 | ||
30 | flags-cu-misc = -Xcompiler -fasynchronous-unwind-tables -Xcompiler -fnon-call-exceptions -Xcompiler -malign-double -Xcompiler -pthread | ||
31 | flags-cu-x86_64 = -m64 | ||
32 | |||
27 | # architecture-specific flags | 33 | # architecture-specific flags |
28 | flags-i386 = -m32 | 34 | flags-i386 = -m32 |
29 | flags-x86_64 = -m64 | 35 | flags-x86_64 = -m64 |
@@ -51,12 +57,19 @@ headers = -I${LIBLITMUS}/include -I${LIBLITMUS}/arch/${include-${ARCH}}/include | |||
51 | 57 | ||
52 | # combine options | 58 | # combine options |
53 | CPPFLAGS = ${flags-api} ${flags-debug-cpp} ${flags-misc} ${flags-${ARCH}} -DARCH=${ARCH} ${headers} | 59 | CPPFLAGS = ${flags-api} ${flags-debug-cpp} ${flags-misc} ${flags-${ARCH}} -DARCH=${ARCH} ${headers} |
60 | #CUFLAGS = ${flags-api} ${flags-cu-debug} ${flags-cu-optim} ${flags-cu-nvcc} ${flags-cu-misc} -DARCH=${ARCH} ${headers} | ||
61 | CUFLAGS = ${flags-api} ${flags-cu-optim} ${flags-cu-nvcc} ${flags-cu-misc} -DARCH=${ARCH} ${headers} | ||
54 | CFLAGS = ${flags-debug} ${flags-misc} | 62 | CFLAGS = ${flags-debug} ${flags-misc} |
55 | LDFLAGS = ${flags-${ARCH}} | 63 | LDFLAGS = ${flags-${ARCH}} |
56 | 64 | ||
57 | # how to link against liblitmus | 65 | # how to link against liblitmus |
58 | liblitmus-flags = -L${LIBLITMUS} -llitmus | 66 | liblitmus-flags = -L${LIBLITMUS} -llitmus |
59 | 67 | ||
68 | # how to link cuda | ||
69 | cuda-flags-i386 = -L/usr/local/cuda/lib | ||
70 | cuda-flags-x86_64 = -L/usr/local/cuda/lib64 | ||
71 | cuda-flags = ${cuda-flags-${ARCH}} -lcudart | ||
72 | |||
60 | # Force gcc instead of cc, but let the user specify a more specific version if | 73 | # Force gcc instead of cc, but let the user specify a more specific version if |
61 | # desired. | 74 | # desired. |
62 | ifeq (${CC},cc) | 75 | ifeq (${CC},cc) |
@@ -67,20 +80,24 @@ endif | |||
67 | CPP = g++ | 80 | CPP = g++ |
68 | #endif | 81 | #endif |
69 | 82 | ||
83 | CU = nvcc | ||
84 | |||
70 | # incorporate cross-compiler (if any) | 85 | # incorporate cross-compiler (if any) |
71 | CC := ${CROSS_COMPILE}${CC} | 86 | CC := ${CROSS_COMPILE}${CC} |
72 | CPP := ${CROSS_COMPILE}${CPP} | 87 | CPP := ${CROSS_COMPILE}${CPP} |
73 | LD := ${CROSS_COMPILE}${LD} | 88 | LD := ${CROSS_COMPILE}${LD} |
74 | AR := ${CROSS_COMPILE}${AR} | 89 | AR := ${CROSS_COMPILE}${AR} |
90 | CU := ${CROSS_COMPILE}${CU} | ||
75 | 91 | ||
76 | # ############################################################################## | 92 | # ############################################################################## |
77 | # Targets | 93 | # Targets |
78 | 94 | ||
79 | all = lib ${rt-apps} ${rt-cppapps} | 95 | all = lib ${rt-apps} ${rt-cppapps} ${rt-cuapps} |
80 | rt-apps = cycles base_task rt_launch rtspin release_ts measure_syscall \ | 96 | rt-apps = cycles base_task rt_launch rtspin release_ts measure_syscall \ |
81 | base_mt_task uncache runtests \ | 97 | base_mt_task uncache runtests \ |
82 | nested locktest ikglptest dgl aux_threads normal_task | 98 | nested locktest ikglptest dgl aux_threads normal_task |
83 | rt-cppapps = budget | 99 | rt-cppapps = budget |
100 | rt-cuapps = gpuspin | ||
84 | 101 | ||
85 | .PHONY: all lib clean dump-config TAGS tags cscope help | 102 | .PHONY: all lib clean dump-config TAGS tags cscope help |
86 | 103 | ||
@@ -95,10 +112,14 @@ inc/config.makefile: Makefile | |||
95 | @printf "%-15s= %-20s\n" \ | 112 | @printf "%-15s= %-20s\n" \ |
96 | ARCH ${ARCH} \ | 113 | ARCH ${ARCH} \ |
97 | CFLAGS '${CFLAGS}' \ | 114 | CFLAGS '${CFLAGS}' \ |
115 | CPPFLAGS '${CPPFLAGS}' \ | ||
116 | CUFLAGS '${CUFLAGS}' \ | ||
98 | LDFLAGS '${LDFLAGS}' \ | 117 | LDFLAGS '${LDFLAGS}' \ |
99 | LDLIBS '${liblitmus-flags}' \ | 118 | LDLIBS '${liblitmus-flags}' \ |
100 | CPPFLAGS '${CPPFLAGS}' \ | 119 | CPPFLAGS '${CPPFLAGS}' \ |
101 | CC '${shell which ${CC}}' \ | 120 | CC '${shell which ${CC}}' \ |
121 | CPP '${shell which ${CPP}}' \ | ||
122 | CU '${shell which ${CU}}' \ | ||
102 | LD '${shell which ${LD}}' \ | 123 | LD '${shell which ${LD}}' \ |
103 | AR '${shell which ${AR}}' \ | 124 | AR '${shell which ${AR}}' \ |
104 | > $@ | 125 | > $@ |
@@ -112,10 +133,12 @@ dump-config: | |||
112 | headers "${headers}" \ | 133 | headers "${headers}" \ |
113 | "kernel headers" "${imported-headers}" \ | 134 | "kernel headers" "${imported-headers}" \ |
114 | CFLAGS "${CFLAGS}" \ | 135 | CFLAGS "${CFLAGS}" \ |
115 | LDFLAGS "${LDFLAGS}" \ | ||
116 | CPPFLAGS "${CPPFLAGS}" \ | 136 | CPPFLAGS "${CPPFLAGS}" \ |
137 | CUFLAGS "${CUFLAGS}" \ | ||
138 | LDFLAGS "${LDFLAGS}" \ | ||
117 | CC "${CC}" \ | 139 | CC "${CC}" \ |
118 | CPP "${CPP}" \ | 140 | CPP "${CPP}" \ |
141 | CU "${CU}" \ | ||
119 | LD "${LD}" \ | 142 | LD "${LD}" \ |
120 | AR "${AR}" \ | 143 | AR "${AR}" \ |
121 | obj-all "${obj-all}" | 144 | obj-all "${obj-all}" |
@@ -124,8 +147,7 @@ help: | |||
124 | @cat INSTALL | 147 | @cat INSTALL |
125 | 148 | ||
126 | clean: | 149 | clean: |
127 | rm -f ${rt-apps} | 150 | rm -f ${rt-apps} ${rt-cppapps} ${rt-cuapps} |
128 | rm -f ${rt-cppapps} | ||
129 | rm -f *.o *.d *.a test_catalog.inc | 151 | rm -f *.o *.d *.a test_catalog.inc |
130 | rm -f ${imported-headers} | 152 | rm -f ${imported-headers} |
131 | rm -f inc/config.makefile | 153 | rm -f inc/config.makefile |
@@ -259,6 +281,12 @@ vpath %.cpp gpu/ | |||
259 | objcpp-budget = budget.o common.o | 281 | objcpp-budget = budget.o common.o |
260 | lib-budget = -lrt -lm -pthread | 282 | lib-budget = -lrt -lm -pthread |
261 | 283 | ||
284 | |||
285 | vpath %.cu gpu/ | ||
286 | |||
287 | objcu-gpuspin = gpuspin.o common.o | ||
288 | lib-gpuspin = -lrt -lm -lpthread | ||
289 | |||
262 | # ############################################################################## | 290 | # ############################################################################## |
263 | # Build everything that depends on liblitmus. | 291 | # Build everything that depends on liblitmus. |
264 | 292 | ||
@@ -269,14 +297,19 @@ ${rt-apps}: $${obj-$$@} liblitmus.a | |||
269 | ${rt-cppapps}: $${objcpp-$$@} liblitmus.a | 297 | ${rt-cppapps}: $${objcpp-$$@} liblitmus.a |
270 | $(CPP) -o $@ $(LDFLAGS) ${ldf-$@} $(filter-out liblitmus.a,$+) $(LOADLIBS) $(LDLIBS) ${liblitmus-flags} ${lib-$@} | 298 | $(CPP) -o $@ $(LDFLAGS) ${ldf-$@} $(filter-out liblitmus.a,$+) $(LOADLIBS) $(LDLIBS) ${liblitmus-flags} ${lib-$@} |
271 | 299 | ||
300 | ${rt-cuapps}: $${objcu-$$@} liblitmus.a | ||
301 | $(CPP) -o $@ $(LDFLAGS) ${ldf-$@} $(filter-out liblitmus.a,$+) $(LOADLIBS) $(LDLIBS) ${liblitmus-flags} ${cuda-flags} ${lib-$@} | ||
302 | |||
272 | # ############################################################################## | 303 | # ############################################################################## |
273 | # Dependency resolution. | 304 | # Dependency resolution. |
274 | 305 | ||
275 | vpath %.c bin/ src/ gpu/ tests/ | 306 | vpath %.c bin/ src/ gpu/ tests/ |
276 | vpath %.cpp gpu/ | 307 | vpath %.cpp gpu/ |
308 | vpath %.cu gpu/ | ||
277 | 309 | ||
278 | obj-all = ${sort ${foreach target,${all},${obj-${target}}}} | 310 | obj-all = ${sort ${foreach target,${all},${obj-${target}}}} |
279 | obj-all += ${sort ${foreach target,${all},${objcpp-${target}}}} | 311 | obj-all += ${sort ${foreach target,${all},${objcpp-${target}}}} |
312 | obj-all += ${sort ${foreach target,${all},${objcu-${target}}}} | ||
280 | 313 | ||
281 | # rule to generate dependency files | 314 | # rule to generate dependency files |
282 | %.d: %.c ${imported-headers} | 315 | %.d: %.c ${imported-headers} |
@@ -291,6 +324,16 @@ obj-all += ${sort ${foreach target,${all},${objcpp-${target}}}} | |||
291 | sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \ | 324 | sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \ |
292 | rm -f $@.$$$$ | 325 | rm -f $@.$$$$ |
293 | 326 | ||
327 | %.d: %.cu ${imported-headers} | ||
328 | @set -e; rm -f $@; \ | ||
329 | $(CU) --generate-dependencies $(CUFLAGS) $< > $@.$$$$; \ | ||
330 | sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \ | ||
331 | rm -f $@.$$$$ | ||
332 | |||
333 | # teach make how to compile .cu files | ||
334 | %.o: %.cu | ||
335 | $(CU) --compile $(CUFLAGS) $(OUTPUT_OPTION) $< | ||
336 | |||
294 | ifeq ($(MAKECMDGOALS),) | 337 | ifeq ($(MAKECMDGOALS),) |
295 | MAKECMDGOALS += all | 338 | MAKECMDGOALS += all |
296 | endif | 339 | endif |
diff --git a/gpu/budget.cpp b/gpu/budget.cpp
index f62c515..8a2546a 100644
--- a/gpu/budget.cpp
+++ b/gpu/budget.cpp
@@ -80,6 +80,28 @@ int SIGNALS = 0; | |||
80 | int BLOCK_SIGNALS_ON_SLEEP = 0; | 80 | int BLOCK_SIGNALS_ON_SLEEP = 0; |
81 | int OVERRUN_RATE = 1; /* default: every job overruns */ | 81 | int OVERRUN_RATE = 1; /* default: every job overruns */ |
82 | 82 | ||
83 | int CXS_OVERRUN = 0; | ||
84 | int NUM_LOCKS = 1; | ||
85 | int NUM_REPLICAS = 1; | ||
86 | int NAMESPACE = 0; | ||
87 | int *LOCKS = NULL; | ||
88 | int IKGLP_LOCK = 0; | ||
89 | int USE_DGLS = 0; | ||
90 | int NEST_IN_IKGLP = 0; | ||
91 | |||
92 | int WAIT = 0; | ||
93 | |||
94 | enum eLockType | ||
95 | { | ||
96 | FIFO, | ||
97 | PRIOQ, | ||
98 | IKGLP | ||
99 | }; | ||
100 | |||
101 | eLockType LOCK_TYPE = FIFO; | ||
102 | |||
103 | int OVERRUN_BY_SLEEP = 0; | ||
104 | |||
83 | int NUM_JOBS = 0; | 105 | int NUM_JOBS = 0; |
84 | int NUM_COMPLETED_JOBS = 0; | 106 | int NUM_COMPLETED_JOBS = 0; |
85 | int NUM_OVERRUNS = 0; | 107 | int NUM_OVERRUNS = 0; |
@@ -103,9 +125,32 @@ int job(lt_t exec_ns, lt_t budget_ns) | |||
103 | if (SIGNALS && BLOCK_SIGNALS_ON_SLEEP) | 125 | if (SIGNALS && BLOCK_SIGNALS_ON_SLEEP) |
104 | block_litmus_signals(SIG_BUDGET); | 126 | block_litmus_signals(SIG_BUDGET); |
105 | 127 | ||
128 | if(CXS_OVERRUN) { | ||
129 | if (NEST_IN_IKGLP) | ||
130 | litmus_lock(IKGLP_LOCK); | ||
131 | if (USE_DGLS) | ||
132 | litmus_dgl_lock(LOCKS, NUM_LOCKS); | ||
133 | else | ||
134 | for(int i = 0; i < NUM_LOCKS; ++i) | ||
135 | litmus_lock(LOCKS[i]); | ||
136 | } | ||
137 | |||
106 | // intentionally overrun via suspension | 138 | // intentionally overrun via suspension |
107 | lt_sleep(approx_remaining + overrun_extra); | 139 | if (OVERRUN_BY_SLEEP) |
108 | 140 | lt_sleep(approx_remaining + overrun_extra); | |
141 | else | ||
142 | loop_for((approx_remaining + overrun_extra) * 0.9); | ||
143 | |||
144 | if(CXS_OVERRUN) { | ||
145 | if (USE_DGLS) | ||
146 | litmus_dgl_unlock(LOCKS, NUM_LOCKS); | ||
147 | else | ||
148 | for(int i = NUM_LOCKS-1; i >= 0; --i) | ||
149 | litmus_unlock(LOCKS[i]); | ||
150 | if (NEST_IN_IKGLP) | ||
151 | litmus_unlock(IKGLP_LOCK); | ||
152 | } | ||
153 | |||
109 | if (SIGNALS && BLOCK_SIGNALS_ON_SLEEP) | 154 | if (SIGNALS && BLOCK_SIGNALS_ON_SLEEP) |
110 | unblock_litmus_signals(SIG_BUDGET); | 155 | unblock_litmus_signals(SIG_BUDGET); |
111 | } | 156 | } |
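
The nesting discipline used by the new overrun path above is easier to read as a condensed sketch. This is an illustration only, not part of the commit; it reuses the globals (LOCKS, NUM_LOCKS, IKGLP_LOCK, USE_DGLS, NEST_IN_IKGLP), the liblitmus calls already shown in this hunk, and the one-argument loop_for() spin helper budget.cpp calls above.

```cpp
/* Sketch (not part of the commit): overrun the budget while holding locks.
 * Acquire order: optional outer IKGLP token, then all FIFO/PRIOQ locks,
 * either atomically as a dynamic group lock (DGL) or one at a time in
 * ascending index order; release in the exact reverse order. */
static void overrun_in_critical_section(lt_t spin_ns)
{
	if (NEST_IN_IKGLP)
		litmus_lock(IKGLP_LOCK);               /* outermost */

	if (USE_DGLS)
		litmus_dgl_lock(LOCKS, NUM_LOCKS);     /* all-or-nothing acquire */
	else
		for (int i = 0; i < NUM_LOCKS; ++i)
			litmus_lock(LOCKS[i]);         /* ascending order */

	loop_for(spin_ns);                             /* burn budget inside the CS */

	if (USE_DGLS)
		litmus_dgl_unlock(LOCKS, NUM_LOCKS);
	else
		for (int i = NUM_LOCKS - 1; i >= 0; --i)
			litmus_unlock(LOCKS[i]);       /* descending order */

	if (NEST_IN_IKGLP)
		litmus_unlock(IKGLP_LOCK);             /* outermost released last */
}
```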
@@ -120,15 +165,18 @@ int job(lt_t exec_ns, lt_t budget_ns) | |||
120 | return 1; | 165 | return 1; |
121 | } | 166 | } |
122 | 167 | ||
123 | #define OPTSTR "sboOva" | 168 | #define OPTSTR "SbosOvalwqixdn:r:" |
124 | 169 | ||
125 | int main(int argc, char** argv) | 170 | int main(int argc, char** argv) |
126 | { | 171 | { |
127 | int ret; | 172 | int ret; |
128 | lt_t e_ns = ms2ns(10); | 173 | |
129 | lt_t p_ns = ms2ns(100); | 174 | srand(getpid()); |
175 | |||
176 | lt_t e_ns = ms2ns(2); | ||
177 | lt_t p_ns = ms2ns(50) + rand()%200; | ||
130 | lt_t budget_ns = p_ns/2; | 178 | lt_t budget_ns = p_ns/2; |
131 | lt_t duration = s2ns(10); | 179 | lt_t duration = s2ns(60); |
132 | lt_t terminate_time; | 180 | lt_t terminate_time; |
133 | unsigned int first_job, last_job; | 181 | unsigned int first_job, last_job; |
134 | int opt; | 182 | int opt; |
@@ -140,12 +188,15 @@ int main(int argc, char** argv) | |||
140 | 188 | ||
141 | while ((opt = getopt(argc, argv, OPTSTR)) != -1) { | 189 | while ((opt = getopt(argc, argv, OPTSTR)) != -1) { |
142 | switch(opt) { | 190 | switch(opt) { |
143 | case 's': | 191 | case 'S': |
144 | SIGNALS = 1; | 192 | SIGNALS = 1; |
145 | break; | 193 | break; |
146 | case 'b': | 194 | case 'b': |
147 | BLOCK_SIGNALS_ON_SLEEP = 1; | 195 | BLOCK_SIGNALS_ON_SLEEP = 1; |
148 | break; | 196 | break; |
197 | case 's': | ||
198 | OVERRUN_BY_SLEEP = 1; | ||
199 | break; | ||
149 | case 'o': | 200 | case 'o': |
150 | OVERRUN = 1; | 201 | OVERRUN = 1; |
151 | overrun_extra = budget_ns/2; | 202 | overrun_extra = budget_ns/2; |
@@ -164,6 +215,31 @@ int main(int argc, char** argv) | |||
164 | case 'v': | 215 | case 'v': |
165 | drain_policy = DRAIN_SOBLIV; | 216 | drain_policy = DRAIN_SOBLIV; |
166 | break; | 217 | break; |
218 | case 'l': | ||
219 | CXS_OVERRUN = 1; | ||
220 | NAMESPACE = open("semaphores", O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR); | ||
221 | break; | ||
222 | case 'q': | ||
223 | LOCK_TYPE = PRIOQ; | ||
224 | break; | ||
225 | case 'i': | ||
226 | LOCK_TYPE = IKGLP; | ||
227 | break; | ||
228 | case 'x': | ||
229 | NEST_IN_IKGLP = 1; | ||
230 | break; | ||
231 | case 'w': | ||
232 | WAIT = 1; | ||
233 | break; | ||
234 | case 'd': | ||
235 | USE_DGLS = 1; | ||
236 | break; | ||
237 | case 'n': | ||
238 | NUM_LOCKS = atoi(optarg); | ||
239 | break; | ||
240 | case 'r': | ||
241 | NUM_REPLICAS = atoi(optarg); | ||
242 | break; | ||
167 | case ':': | 243 | case ':': |
168 | printf("missing argument\n"); | 244 | printf("missing argument\n"); |
169 | assert(false); | 245 | assert(false); |
@@ -176,10 +252,21 @@ int main(int argc, char** argv) | |||
176 | } | 252 | } |
177 | 253 | ||
178 | assert(!BLOCK_SIGNALS_ON_SLEEP || (BLOCK_SIGNALS_ON_SLEEP && SIGNALS)); | 254 | assert(!BLOCK_SIGNALS_ON_SLEEP || (BLOCK_SIGNALS_ON_SLEEP && SIGNALS)); |
255 | assert(!CXS_OVERRUN || (CXS_OVERRUN && WAIT)); | ||
256 | assert(LOCK_TYPE != IKGLP || NUM_LOCKS == 1); | ||
257 | assert(LOCK_TYPE != IKGLP || (LOCK_TYPE == IKGLP && !NEST_IN_IKGLP)); | ||
258 | assert(NUM_LOCKS > 0); | ||
259 | if (LOCK_TYPE == IKGLP || NEST_IN_IKGLP) | ||
260 | assert(NUM_REPLICAS >= 1); | ||
261 | |||
262 | LOCKS = new int[NUM_LOCKS]; | ||
179 | 263 | ||
180 | if (compute_overrun_rate) { | 264 | if (compute_overrun_rate) { |
181 | int backlog = (int)ceil((overrun_extra + budget_ns)/(double)budget_ns); | 265 | int backlog = (int)ceil((overrun_extra + budget_ns)/(double)budget_ns); |
182 | OVERRUN_RATE = backlog + 2; /* some padding */ | 266 | if (!CXS_OVERRUN) |
267 | OVERRUN_RATE = backlog + 2; /* some padding */ | ||
268 | else | ||
269 | OVERRUN_RATE = 2*backlog + 2; /* overrun less frequently for testing */ | ||
183 | } | 270 | } |
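
A quick worked example of this rate computation, using the defaults set earlier in this diff (budget_ns = p_ns/2, and -o sets overrun_extra = budget_ns/2); the numbers are illustrative only:

```cpp
/* Worked example (illustrative, using the diff's defaults):
 *   p_ns ~= 50 ms  =>  budget_ns ~= 25 ms,  overrun_extra ~= 12.5 ms
 *   backlog      = ceil((12.5 + 25) / 25) = ceil(1.5) = 2
 *   OVERRUN_RATE = backlog + 2   = 4      (plain overruns)
 *   OVERRUN_RATE = 2*backlog + 2 = 6      (-l: overrun while holding locks)
 * i.e. only every 4th (or 6th) job overruns, leaving slack for the
 * backlogged jobs to complete in between. */
```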
184 | 271 | ||
185 | init_rt_task_param(¶m); | 272 | init_rt_task_param(¶m); |
@@ -197,6 +284,44 @@ int main(int argc, char** argv) | |||
197 | ret = set_rt_task_param(gettid(), ¶m); | 284 | ret = set_rt_task_param(gettid(), ¶m); |
198 | assert(ret == 0); | 285 | assert(ret == 0); |
199 | 286 | ||
287 | if (CXS_OVERRUN) { | ||
288 | int i; | ||
289 | for(i = 0; i < NUM_LOCKS; ++i) { | ||
290 | int lock = -1; | ||
291 | switch(LOCK_TYPE) | ||
292 | { | ||
293 | case FIFO: | ||
294 | lock = open_fifo_sem(NAMESPACE, i); | ||
295 | break; | ||
296 | case PRIOQ: | ||
297 | lock = open_prioq_sem(NAMESPACE, i); | ||
298 | break; | ||
299 | case IKGLP: | ||
300 | lock = open_ikglp_sem(NAMESPACE, i, NUM_REPLICAS); | ||
301 | break; | ||
302 | } | ||
303 | if (lock < 0) { | ||
304 | perror("open_sem"); | ||
305 | exit(-1); | ||
306 | } | ||
307 | LOCKS[i] = lock; | ||
308 | } | ||
309 | |||
310 | if (NEST_IN_IKGLP) { | ||
311 | IKGLP_LOCK = open_ikglp_sem(NAMESPACE, i, NUM_REPLICAS); | ||
312 | if (IKGLP_LOCK < 0) { | ||
313 | perror("open_sem"); | ||
314 | exit(-1); | ||
315 | } | ||
316 | } | ||
317 | } | ||
318 | |||
319 | if (WAIT) { | ||
320 | ret = wait_for_ts_release(); | ||
321 | if (ret < 0) | ||
322 | perror("wait_for_ts_release"); | ||
323 | } | ||
324 | |||
200 | ret = task_mode(LITMUS_RT_TASK); | 325 | ret = task_mode(LITMUS_RT_TASK); |
201 | assert(ret == 0); | 326 | assert(ret == 0); |
202 | 327 | ||
@@ -231,5 +356,7 @@ int main(int argc, char** argv) | |||
231 | printf("# User Jobs Completed: %d\n", NUM_COMPLETED_JOBS); | 356 | printf("# User Jobs Completed: %d\n", NUM_COMPLETED_JOBS); |
232 | printf("# Overruns: %d\n", NUM_OVERRUNS); | 357 | printf("# Overruns: %d\n", NUM_OVERRUNS); |
233 | 358 | ||
359 | delete[] LOCKS; | ||
360 | |||
234 | return 0; | 361 | return 0; |
235 | } | 362 | } |
diff --git a/gpu/gpuspin.cu b/gpu/gpuspin.cu
new file mode 100644
index 0000000..aff6cd1
--- /dev/null
+++ b/gpu/gpuspin.cu
@@ -0,0 +1,1720 @@ | |||
1 | #include <sys/time.h> | ||
2 | |||
3 | #include <stdio.h> | ||
4 | #include <stdlib.h> | ||
5 | #include <unistd.h> | ||
6 | #include <time.h> | ||
7 | #include <string.h> | ||
8 | #include <assert.h> | ||
9 | #include <execinfo.h> | ||
10 | |||
11 | #include <boost/interprocess/managed_shared_memory.hpp> | ||
12 | #include <boost/interprocess/sync/interprocess_mutex.hpp> | ||
13 | |||
14 | #include <cuda_runtime.h> | ||
15 | |||
16 | #include "litmus.h" | ||
17 | #include "common.h" | ||
18 | |||
19 | using namespace std; | ||
20 | using namespace boost::interprocess; | ||
21 | |||
22 | const char *lock_namespace = "./.gpuspin-locks"; | ||
23 | |||
24 | const int NR_GPUS = 8; | ||
25 | |||
26 | bool GPU_USING = false; | ||
27 | bool ENABLE_AFFINITY = false; | ||
28 | bool RELAX_FIFO_MAX_LEN = false; | ||
29 | bool ENABLE_CHUNKING = false; | ||
30 | bool MIGRATE_VIA_SYSMEM = false; | ||
31 | |||
32 | enum eEngineLockTypes | ||
33 | { | ||
34 | FIFO, | ||
35 | PRIOQ | ||
36 | }; | ||
37 | |||
38 | eEngineLockTypes ENGINE_LOCK_TYPE = FIFO; | ||
39 | |||
40 | int GPU_PARTITION = 0; | ||
41 | int GPU_PARTITION_SIZE = 0; | ||
42 | int CPU_PARTITION_SIZE = 0; | ||
43 | |||
44 | int RHO = 2; | ||
45 | |||
46 | int NUM_COPY_ENGINES = 2; | ||
47 | |||
48 | |||
49 | __attribute__((unused)) static size_t kbToB(size_t kb) { return kb * 1024; } | ||
50 | __attribute__((unused)) static size_t mbToB(size_t mb) { return kbToB(mb * 1024); } | ||
51 | |||
52 | /* in bytes */ | ||
53 | size_t SEND_SIZE = 0; | ||
54 | size_t RECV_SIZE = 0; | ||
55 | size_t STATE_SIZE = 0; | ||
56 | size_t CHUNK_SIZE = 0; | ||
57 | |||
58 | int TOKEN_LOCK = -1; | ||
59 | |||
60 | bool USE_ENGINE_LOCKS = true; | ||
61 | bool USE_DYNAMIC_GROUP_LOCKS = false; | ||
62 | int EE_LOCKS[NR_GPUS]; | ||
63 | int CE_SEND_LOCKS[NR_GPUS]; | ||
64 | int CE_RECV_LOCKS[NR_GPUS]; | ||
65 | int CE_MIGR_SEND_LOCKS[NR_GPUS]; | ||
66 | int CE_MIGR_RECV_LOCKS[NR_GPUS]; | ||
67 | bool RESERVED_MIGR_COPY_ENGINE = false; // only checked if NUM_COPY_ENGINES == 2 | ||
68 | |||
69 | bool ENABLE_RT_AUX_THREADS = true; | ||
70 | |||
71 | enum eGpuSyncMode | ||
72 | { | ||
73 | IKGLP_MODE, | ||
74 | IKGLP_WC_MODE, /* work-conserving IKGLP. no GPU is left idle, but breaks optimality */ | ||
75 | KFMLP_MODE, | ||
76 | RGEM_MODE, | ||
77 | }; | ||
78 | |||
79 | eGpuSyncMode GPU_SYNC_MODE = IKGLP_MODE; | ||
80 | |||
81 | enum eCudaSyncMode | ||
82 | { | ||
83 | BLOCKING, | ||
84 | SPIN | ||
85 | }; | ||
86 | |||
87 | eCudaSyncMode CUDA_SYNC_MODE = BLOCKING; | ||
88 | |||
89 | |||
90 | int CUR_DEVICE = -1; | ||
91 | int LAST_DEVICE = -1; | ||
92 | |||
93 | cudaStream_t STREAMS[NR_GPUS]; | ||
94 | int GPU_HZ[NR_GPUS]; | ||
95 | int NUM_SM[NR_GPUS]; | ||
96 | int WARP_SIZE[NR_GPUS]; | ||
97 | int ELEM_PER_THREAD[NR_GPUS]; | ||
98 | |||
99 | #define DEFINE_PER_GPU(type, var) type var[NR_GPUS] | ||
100 | #define per_gpu(var, idx) (var[(idx)]) | ||
101 | #define this_gpu(var) (var[(CUR_DEVICE)]) | ||
102 | #define cur_stream() (this_gpu(STREAMS)) | ||
103 | #define cur_gpu() (CUR_DEVICE) | ||
104 | #define last_gpu() (LAST_DEVICE) | ||
105 | #define cur_ee() (EE_LOCKS[CUR_DEVICE]) | ||
106 | #define cur_send() (CE_SEND_LOCKS[CUR_DEVICE]) | ||
107 | #define cur_recv() (CE_RECV_LOCKS[CUR_DEVICE]) | ||
108 | #define cur_migr_send() (CE_MIGR_SEND_LOCKS[CUR_DEVICE]) | ||
109 | #define cur_migr_recv() (CE_MIGR_RECV_LOCKS[CUR_DEVICE]) | ||
110 | #define cur_hz() (GPU_HZ[CUR_DEVICE]) | ||
111 | #define cur_sms() (NUM_SM[CUR_DEVICE]) | ||
112 | #define cur_warp_size() (WARP_SIZE[CUR_DEVICE]) | ||
113 | #define cur_elem_per_thread() (ELEM_PER_THREAD[CUR_DEVICE]) | ||
114 | #define num_online_gpus() (NUM_GPUS) | ||
115 | |||
116 | static bool useEngineLocks() | ||
117 | { | ||
118 | return(USE_ENGINE_LOCKS); | ||
119 | } | ||
120 | |||
121 | #define VANILLA_LINUX | ||
122 | |||
123 | bool TRACE_MIGRATIONS = false; | ||
124 | #ifndef VANILLA_LINUX | ||
125 | #define trace_migration(to, from) do { inject_gpu_migration((to), (from)); } while(0) | ||
126 | #define trace_release(arrival, deadline, jobno) do { inject_release((arrival), (deadline), (jobno)); } while(0) | ||
127 | #define trace_completion(jobno) do { inject_completion((jobno)); } while(0) | ||
128 | #define trace_name() do { inject_name(); } while(0) | ||
129 | #define trace_param() do { inject_param(); } while(0) | ||
130 | #else | ||
131 | #define set_rt_task_param(x, y) (0) | ||
132 | #define trace_migration(to, from) | ||
133 | #define trace_release(arrival, deadline, jobno) | ||
134 | #define trace_completion(jobno) | ||
135 | #define trace_name() | ||
136 | #define trace_param() | ||
137 | #endif | ||
138 | |||
139 | struct ce_lock_state | ||
140 | { | ||
141 | int locks[2]; | ||
142 | size_t num_locks; | ||
143 | size_t budget_remaining; | ||
144 | bool locked; | ||
145 | |||
146 | ce_lock_state(int device_a, enum cudaMemcpyKind kind, size_t size, int device_b = -1, bool migration = false) { | ||
147 | num_locks = (device_a != -1) + (device_b != -1); | ||
148 | |||
149 | if(device_a != -1) { | ||
150 | if (!migration) | ||
151 | locks[0] = (kind == cudaMemcpyHostToDevice || (kind == cudaMemcpyDeviceToDevice && device_b == -1)) ? | ||
152 | CE_SEND_LOCKS[device_a] : CE_RECV_LOCKS[device_a]; | ||
153 | else | ||
154 | locks[0] = (kind == cudaMemcpyHostToDevice || (kind == cudaMemcpyDeviceToDevice && device_b == -1)) ? | ||
155 | CE_MIGR_SEND_LOCKS[device_a] : CE_MIGR_RECV_LOCKS[device_a]; | ||
156 | } | ||
157 | |||
158 | if(device_b != -1) { | ||
159 | assert(kind == cudaMemcpyDeviceToDevice); | ||
160 | |||
161 | if (!migration) | ||
162 | locks[1] = CE_RECV_LOCKS[device_b]; | ||
163 | else | ||
164 | locks[1] = CE_MIGR_RECV_LOCKS[device_b]; | ||
165 | |||
166 | if(locks[1] < locks[0]) { | ||
167 | // enforce total order on locking | ||
168 | int temp = locks[1]; | ||
169 | locks[1] = locks[0]; | ||
170 | locks[0] = temp; | ||
171 | } | ||
172 | } | ||
173 | else { | ||
174 | locks[1] = -1; | ||
175 | } | ||
176 | |||
177 | if(!ENABLE_CHUNKING) | ||
178 | budget_remaining = size; | ||
179 | else | ||
180 | budget_remaining = CHUNK_SIZE; | ||
181 | } | ||
182 | |||
183 | void crash(void) { | ||
184 | void *array[50]; | ||
185 | int size, i; | ||
186 | char **messages; | ||
187 | |||
188 | size = backtrace(array, 50); | ||
189 | messages = backtrace_symbols(array, size); | ||
190 | |||
191 | fprintf(stderr, "%d: TRIED TO GRAB SAME LOCK TWICE! Lock = %d\n", getpid(), locks[0]); | ||
192 | for (i = 1; i < size && messages != NULL; ++i) | ||
193 | { | ||
194 | fprintf(stderr, "%d: [bt]: (%d) %s\n", getpid(), i, messages[i]); | ||
195 | } | ||
196 | free(messages); | ||
197 | |||
198 | assert(false); | ||
199 | } | ||
200 | |||
201 | |||
202 | void lock() { | ||
203 | if(locks[0] == locks[1]) crash(); | ||
204 | |||
205 | if(USE_DYNAMIC_GROUP_LOCKS) { | ||
206 | litmus_dgl_lock(locks, num_locks); | ||
207 | } | ||
208 | else | ||
209 | { | ||
210 | for(int l = 0; l < num_locks; ++l) | ||
211 | { | ||
212 | litmus_lock(locks[l]); | ||
213 | } | ||
214 | } | ||
215 | locked = true; | ||
216 | } | ||
217 | |||
218 | void unlock() { | ||
219 | if(locks[0] == locks[1]) crash(); | ||
220 | |||
221 | if(USE_DYNAMIC_GROUP_LOCKS) { | ||
222 | litmus_dgl_unlock(locks, num_locks); | ||
223 | } | ||
224 | else | ||
225 | { | ||
226 | // reverse order | ||
227 | for(int l = num_locks - 1; l >= 0; --l) | ||
228 | { | ||
229 | litmus_unlock(locks[l]); | ||
230 | } | ||
231 | } | ||
232 | locked = false; | ||
233 | } | ||
234 | |||
235 | void refresh() { | ||
236 | budget_remaining = CHUNK_SIZE; | ||
237 | } | ||
238 | |||
239 | bool budgetIsAvailable(size_t tosend) { | ||
240 | return(tosend >= budget_remaining); | ||
241 | } | ||
242 | |||
243 | void decreaseBudget(size_t spent) { | ||
244 | budget_remaining -= spent; | ||
245 | } | ||
246 | }; | ||
247 | |||
248 | // precondition: if do_locking == true, locks in state are held. | ||
249 | static cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count, | ||
250 | enum cudaMemcpyKind kind, | ||
251 | ce_lock_state* state) | ||
252 | { | ||
253 | cudaError_t ret = cudaSuccess; | ||
254 | int remaining = count; | ||
255 | |||
256 | char* dst = (char*)a_dst; | ||
257 | const char* src = (const char*)a_src; | ||
258 | |||
259 | // disable chunking, if needed, by setting chunk_size equal to the | ||
260 | // amount of data to be copied. | ||
261 | int chunk_size = (ENABLE_CHUNKING) ? CHUNK_SIZE : count; | ||
262 | int i = 0; | ||
263 | |||
264 | while(remaining != 0) | ||
265 | { | ||
266 | int bytesToCopy = std::min(remaining, chunk_size); | ||
267 | |||
268 | if(state && state->budgetIsAvailable(bytesToCopy) && state->locked) { | ||
269 | cudaStreamSynchronize(STREAMS[CUR_DEVICE]); | ||
270 | ret = cudaGetLastError(); | ||
271 | |||
272 | if(ret != cudaSuccess) | ||
273 | { | ||
274 | break; | ||
275 | } | ||
276 | |||
277 | state->unlock(); | ||
278 | state->refresh(); // replenish. | ||
279 | // we can only run out of | ||
280 | // budget if chunking is enabled. | ||
281 | // we presume that init budget would | ||
282 | // be set to cover entire memcpy | ||
283 | // if chunking were disabled. | ||
284 | } | ||
285 | |||
286 | if(state && !state->locked) { | ||
287 | state->lock(); | ||
288 | } | ||
289 | |||
290 | //ret = cudaMemcpy(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind); | ||
291 | cudaMemcpyAsync(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind, STREAMS[CUR_DEVICE]); | ||
292 | |||
293 | if(state) { | ||
294 | state->decreaseBudget(bytesToCopy); | ||
295 | } | ||
296 | |||
297 | ++i; | ||
298 | remaining -= bytesToCopy; | ||
299 | } | ||
300 | return ret; | ||
301 | } | ||
302 | |||
303 | static cudaError_t chunkMemcpy(void* a_dst, const void* a_src, size_t count, | ||
304 | enum cudaMemcpyKind kind, | ||
305 | int device_a = -1, // device_a == -1 disables locking | ||
306 | bool do_locking = true, | ||
307 | int device_b = -1, | ||
308 | bool migration = false) | ||
309 | { | ||
310 | cudaError_t ret; | ||
311 | if(!do_locking || device_a == -1) { | ||
312 | ret = __chunkMemcpy(a_dst, a_src, count, kind, NULL); | ||
313 | cudaStreamSynchronize(cur_stream()); | ||
314 | if(ret == cudaSuccess) | ||
315 | ret = cudaGetLastError(); | ||
316 | } | ||
317 | else { | ||
318 | ce_lock_state state(device_a, kind, count, device_b, migration); | ||
319 | state.lock(); | ||
320 | ret = __chunkMemcpy(a_dst, a_src, count, kind, &state); | ||
321 | cudaStreamSynchronize(cur_stream()); | ||
322 | if(ret == cudaSuccess) | ||
323 | ret = cudaGetLastError(); | ||
324 | state.unlock(); | ||
325 | } | ||
326 | return ret; | ||
327 | } | ||
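
For orientation, here is how this wrapper ends up being called later in the file (a usage sketch, not additional commit content; `from` and `to` stand for GPU indices). Passing device_a == -1, or do_locking == false, degenerates to a plain unlocked chunked copy; naming both devices routes the copy through the migration copy-engine locks.

```cpp
/* Usage sketches mirroring calls made later in this file: */

/* engine-locked host-to-device copy on the currently held GPU */
chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE,
            cudaMemcpyHostToDevice, cur_gpu(), useEngineLocks());

/* device_a == -1 (or do_locking == false): plain chunked copy, no locks */
chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE,
            cudaMemcpyDeviceToHost, -1);

/* peer-to-peer migration: both GPUs named, migration engine locks used,
 * and ce_lock_state takes the two locks in canonical (sorted) order */
chunkMemcpy(per_gpu(d_state_data, to), per_gpu(d_state_data, from),
            STATE_SIZE, cudaMemcpyDeviceToDevice,
            to, useEngineLocks(), from, true);
```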
328 | |||
329 | |||
330 | void allocate_locks_litmus(void) | ||
331 | { | ||
332 | // allocate k-FMLP lock | ||
333 | int fd = open(lock_namespace, O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR); | ||
334 | |||
335 | int base_name = GPU_PARTITION * 1000; | ||
336 | |||
337 | if (GPU_SYNC_MODE == IKGLP_MODE) { | ||
338 | /* Standard (optimal) IKGLP */ | ||
339 | TOKEN_LOCK = open_gpusync_token_lock(fd, | ||
340 | base_name, /* name */ | ||
341 | GPU_PARTITION_SIZE, | ||
342 | GPU_PARTITION*GPU_PARTITION_SIZE, | ||
343 | RHO, | ||
344 | IKGLP_M_IN_FIFOS, | ||
345 | (!RELAX_FIFO_MAX_LEN) ? | ||
346 | IKGLP_OPTIMAL_FIFO_LEN : | ||
347 | IKGLP_UNLIMITED_FIFO_LEN, | ||
348 | ENABLE_AFFINITY); | ||
349 | } | ||
350 | else if (GPU_SYNC_MODE == KFMLP_MODE) { | ||
351 | /* KFMLP. FIFO queues only for tokens. */ | ||
352 | TOKEN_LOCK = open_gpusync_token_lock(fd, | ||
353 | base_name, /* name */ | ||
354 | GPU_PARTITION_SIZE, | ||
355 | GPU_PARTITION*GPU_PARTITION_SIZE, | ||
356 | RHO, | ||
357 | IKGLP_UNLIMITED_IN_FIFOS, | ||
358 | IKGLP_UNLIMITED_FIFO_LEN, | ||
359 | ENABLE_AFFINITY); | ||
360 | } | ||
361 | else if (GPU_SYNC_MODE == RGEM_MODE) { | ||
362 | /* RGEM-like token allocation. Shared priority queue for all tokens. */ | ||
363 | TOKEN_LOCK = open_gpusync_token_lock(fd, | ||
364 | base_name, /* name */ | ||
365 | GPU_PARTITION_SIZE, | ||
366 | GPU_PARTITION*GPU_PARTITION_SIZE, | ||
367 | RHO, | ||
368 | RHO*GPU_PARTITION_SIZE, | ||
369 | 1, | ||
370 | ENABLE_AFFINITY); | ||
371 | } | ||
372 | else if (GPU_SYNC_MODE == IKGLP_WC_MODE) { | ||
373 | /* Non-optimal IKGLP that never lets a replica idle if there are pending | ||
374 | * token requests. */ | ||
375 | int max_simult_run = std::max(CPU_PARTITION_SIZE, RHO*GPU_PARTITION_SIZE); | ||
376 | int max_fifo_len = (int)ceil((float)max_simult_run / (RHO*GPU_PARTITION_SIZE)); | ||
377 | TOKEN_LOCK = open_gpusync_token_lock(fd, | ||
378 | base_name, /* name */ | ||
379 | GPU_PARTITION_SIZE, | ||
380 | GPU_PARTITION*GPU_PARTITION_SIZE, | ||
381 | RHO, | ||
382 | max_simult_run, | ||
383 | (!RELAX_FIFO_MAX_LEN) ? | ||
384 | max_fifo_len : | ||
385 | IKGLP_UNLIMITED_FIFO_LEN, | ||
386 | ENABLE_AFFINITY); | ||
387 | } | ||
388 | else { | ||
389 | perror("Invalid GPUSync mode specified\n"); | ||
390 | TOKEN_LOCK = -1; | ||
391 | } | ||
392 | |||
393 | if(TOKEN_LOCK < 0) | ||
394 | perror("open_token_sem"); | ||
395 | |||
396 | if(USE_ENGINE_LOCKS) | ||
397 | { | ||
398 | assert(NUM_COPY_ENGINES == 1 || NUM_COPY_ENGINES == 2); | ||
399 | assert((NUM_COPY_ENGINES == 1 && !RESERVED_MIGR_COPY_ENGINE) || NUM_COPY_ENGINES == 2); | ||
400 | |||
401 | // allocate the engine locks. | ||
402 | for (int i = 0; i < GPU_PARTITION_SIZE; ++i) | ||
403 | { | ||
404 | int idx = GPU_PARTITION*GPU_PARTITION_SIZE + i; | ||
405 | int ee_name = (i+1)*10 + base_name; | ||
406 | int ce_0_name = (i+1)*10 + base_name + 1; | ||
407 | int ce_1_name = (i+1)*10 + base_name + 2; | ||
408 | int ee_lock = -1, ce_0_lock = -1, ce_1_lock = -1; | ||
409 | |||
410 | open_sem_t openEngineLock = (ENGINE_LOCK_TYPE == FIFO) ? | ||
411 | open_fifo_sem : open_prioq_sem; | ||
412 | |||
413 | ee_lock = openEngineLock(fd, ee_name); | ||
414 | if (ee_lock < 0) | ||
415 | perror("open_*_sem (engine lock)"); | ||
416 | |||
417 | ce_0_lock = openEngineLock(fd, ce_0_name); | ||
418 | if (ce_0_lock < 0) | ||
419 | perror("open_*_sem (engine lock)"); | ||
420 | |||
421 | if (NUM_COPY_ENGINES == 2) | ||
422 | { | ||
423 | ce_1_lock = openEngineLock(fd, ce_1_name); | ||
424 | if (ce_1_lock < 0) | ||
425 | perror("open_*_sem (engine lock)"); | ||
426 | } | ||
427 | |||
428 | EE_LOCKS[idx] = ee_lock; | ||
429 | |||
430 | if (NUM_COPY_ENGINES == 1) | ||
431 | { | ||
432 | // share locks | ||
433 | CE_SEND_LOCKS[idx] = ce_0_lock; | ||
434 | CE_RECV_LOCKS[idx] = ce_0_lock; | ||
435 | CE_MIGR_SEND_LOCKS[idx] = ce_0_lock; | ||
436 | CE_MIGR_RECV_LOCKS[idx] = ce_0_lock; | ||
437 | } | ||
438 | else | ||
439 | { | ||
440 | assert(NUM_COPY_ENGINES == 2); | ||
441 | |||
442 | if (RESERVED_MIGR_COPY_ENGINE) { | ||
443 | // copy engine dedicated to migration operations | ||
444 | CE_SEND_LOCKS[idx] = ce_0_lock; | ||
445 | CE_RECV_LOCKS[idx] = ce_0_lock; | ||
446 | CE_MIGR_SEND_LOCKS[idx] = ce_1_lock; | ||
447 | CE_MIGR_RECV_LOCKS[idx] = ce_1_lock; | ||
448 | } | ||
449 | else { | ||
450 | // migration transmissions treated as regular data | ||
451 | CE_SEND_LOCKS[idx] = ce_0_lock; | ||
452 | CE_RECV_LOCKS[idx] = ce_1_lock; | ||
453 | CE_MIGR_SEND_LOCKS[idx] = ce_0_lock; | ||
454 | CE_MIGR_RECV_LOCKS[idx] = ce_1_lock; | ||
455 | } | ||
456 | } | ||
457 | } | ||
458 | } | ||
459 | } | ||
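
The lock names handed to open_gpusync_token_lock() and the open_*_sem() calls are purely arithmetic, so each GPU partition carves out a disjoint range of names in the shared ./.gpuspin-locks namespace. A worked example with hypothetical partition numbers:

```cpp
/* Worked example of the naming scheme above (hypothetical values):
 * GPU_PARTITION = 1, GPU_PARTITION_SIZE = 2
 *   base_name (token lock)   = 1 * 1000        = 1000
 *   GPU i = 0:  ee_name      = (0+1)*10 + 1000 = 1010
 *               ce_0_name    = 1010 + 1        = 1011
 *               ce_1_name    = 1010 + 2        = 1012
 *   GPU i = 1:  ee_name      = (1+1)*10 + 1000 = 1020
 *               ce_0 / ce_1  = 1021 / 1022
 * Partition 0 would use 0 and 10..22, partition 2 would use 2000 and
 * 2010..2022, and so on. */
```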
460 | |||
461 | |||
462 | |||
463 | |||
464 | class gpu_pool | ||
465 | { | ||
466 | public: | ||
467 | gpu_pool(int pSz): poolSize(pSz) | ||
468 | { | ||
469 | memset(&pool[0], 0, sizeof(pool[0])*poolSize); | ||
470 | } | ||
471 | |||
472 | int get(pthread_mutex_t* tex, int preference = -1) | ||
473 | { | ||
474 | int which = -1; | ||
475 | int last = (preference >= 0) ? preference : 0; | ||
476 | int minIdx = last; | ||
477 | |||
478 | pthread_mutex_lock(tex); | ||
479 | |||
480 | int min = pool[last]; | ||
481 | for(int i = (minIdx+1)%poolSize; i != last; i = (i+1)%poolSize) | ||
482 | { | ||
483 | if(min > pool[i]) | ||
484 | minIdx = i; | ||
485 | } | ||
486 | ++pool[minIdx]; | ||
487 | |||
488 | pthread_mutex_unlock(tex); | ||
489 | |||
490 | which = minIdx; | ||
491 | |||
492 | return which; | ||
493 | } | ||
494 | |||
495 | void put(pthread_mutex_t* tex, int which) | ||
496 | { | ||
497 | pthread_mutex_lock(tex); | ||
498 | --pool[which]; | ||
499 | pthread_mutex_unlock(tex); | ||
500 | } | ||
501 | |||
502 | private: | ||
503 | int poolSize; | ||
504 | int pool[NR_GPUS]; // >= gpu_part_size | ||
505 | }; | ||
506 | |||
507 | static gpu_pool* GPU_LINUX_SEM_POOL = NULL; | ||
508 | static pthread_mutex_t* GPU_LINUX_MUTEX_POOL = NULL; | ||
509 | |||
510 | static void allocate_locks_linux(int num_gpu_users) | ||
511 | { | ||
512 | managed_shared_memory *segment_pool_ptr = NULL; | ||
513 | managed_shared_memory *segment_mutex_ptr = NULL; | ||
514 | |||
515 | int numGpuPartitions = NR_GPUS/GPU_PARTITION_SIZE; | ||
516 | |||
517 | if(num_gpu_users != 0) | ||
518 | { | ||
519 | printf("%d creating shared memory for linux semaphores; num pools = %d, pool size = %d\n", getpid(), numGpuPartitions, GPU_PARTITION_SIZE); | ||
520 | shared_memory_object::remove("linux_mutex_memory"); | ||
521 | shared_memory_object::remove("linux_sem_memory"); | ||
522 | |||
523 | segment_mutex_ptr = new managed_shared_memory(create_only, "linux_mutex_memory", 4*1024); | ||
524 | GPU_LINUX_MUTEX_POOL = segment_mutex_ptr->construct<pthread_mutex_t>("pthread_mutex_t linux_m")[numGpuPartitions](); | ||
525 | for(int i = 0; i < numGpuPartitions; ++i) | ||
526 | { | ||
527 | pthread_mutexattr_t attr; | ||
528 | pthread_mutexattr_init(&attr); | ||
529 | pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED); | ||
530 | pthread_mutex_init(&(GPU_LINUX_MUTEX_POOL[i]), &attr); | ||
531 | pthread_mutexattr_destroy(&attr); | ||
532 | } | ||
533 | |||
534 | segment_pool_ptr = new managed_shared_memory(create_only, "linux_sem_memory", 4*1024); | ||
535 | GPU_LINUX_SEM_POOL = segment_pool_ptr->construct<gpu_pool>("gpu_pool linux_p")[numGpuPartitions](GPU_PARTITION_SIZE); | ||
536 | } | ||
537 | else | ||
538 | { | ||
539 | do | ||
540 | { | ||
541 | try | ||
542 | { | ||
543 | if (!segment_pool_ptr) segment_pool_ptr = new managed_shared_memory(open_only, "linux_sem_memory"); | ||
544 | } | ||
545 | catch(...) | ||
546 | { | ||
547 | sleep(1); | ||
548 | } | ||
549 | }while(segment_pool_ptr == NULL); | ||
550 | |||
551 | do | ||
552 | { | ||
553 | try | ||
554 | { | ||
555 | if (!segment_mutex_ptr) segment_mutex_ptr = new managed_shared_memory(open_only, "linux_mutex_memory"); | ||
556 | } | ||
557 | catch(...) | ||
558 | { | ||
559 | sleep(1); | ||
560 | } | ||
561 | }while(segment_mutex_ptr == NULL); | ||
562 | |||
563 | GPU_LINUX_SEM_POOL = segment_pool_ptr->find<gpu_pool>("gpu_pool linux_p").first; | ||
564 | GPU_LINUX_MUTEX_POOL = segment_mutex_ptr->find<pthread_mutex_t>("pthread_mutex_t linux_m").first; | ||
565 | } | ||
566 | } | ||
567 | |||
568 | |||
569 | |||
570 | |||
571 | static void allocate_locks(int num_gpu_users, bool linux_mode) | ||
572 | { | ||
573 | if(!linux_mode) | ||
574 | allocate_locks_litmus(); | ||
575 | else | ||
576 | allocate_locks_linux(num_gpu_users); | ||
577 | } | ||
578 | |||
579 | static void set_cur_gpu(int gpu) | ||
580 | { | ||
581 | if (TRACE_MIGRATIONS) { | ||
582 | trace_migration(gpu, CUR_DEVICE); | ||
583 | } | ||
584 | if(gpu != CUR_DEVICE) { | ||
585 | cudaSetDevice(gpu); | ||
586 | CUR_DEVICE = gpu; | ||
587 | } | ||
588 | } | ||
589 | |||
590 | |||
591 | static pthread_barrier_t *gpu_barrier = NULL; | ||
592 | static interprocess_mutex *gpu_mgmt_mutexes = NULL; | ||
593 | static managed_shared_memory *segment_ptr = NULL; | ||
594 | |||
595 | void coordinate_gpu_tasks(int num_gpu_users) | ||
596 | { | ||
597 | if(num_gpu_users != 0) | ||
598 | { | ||
599 | printf("%d creating shared memory\n", getpid()); | ||
600 | shared_memory_object::remove("gpu_barrier_memory"); | ||
601 | segment_ptr = new managed_shared_memory(create_only, "gpu_barrier_memory", 4*1024); | ||
602 | |||
603 | printf("%d creating a barrier for %d users\n", getpid(), num_gpu_users); | ||
604 | gpu_barrier = segment_ptr->construct<pthread_barrier_t>("pthread_barrier_t gpu_barrier")(); | ||
605 | pthread_barrierattr_t battr; | ||
606 | pthread_barrierattr_init(&battr); | ||
607 | pthread_barrierattr_setpshared(&battr, PTHREAD_PROCESS_SHARED); | ||
608 | pthread_barrier_init(gpu_barrier, &battr, num_gpu_users); | ||
609 | pthread_barrierattr_destroy(&battr); | ||
610 | printf("%d creating gpu mgmt mutexes for %d devices\n", getpid(), NR_GPUS); | ||
611 | gpu_mgmt_mutexes = segment_ptr->construct<interprocess_mutex>("interprocess_mutex m")[NR_GPUS](); | ||
612 | } | ||
613 | else | ||
614 | { | ||
615 | do | ||
616 | { | ||
617 | try | ||
618 | { | ||
619 | segment_ptr = new managed_shared_memory(open_only, "gpu_barrier_memory"); | ||
620 | } | ||
621 | catch(...) | ||
622 | { | ||
623 | sleep(1); | ||
624 | } | ||
625 | }while(segment_ptr == NULL); | ||
626 | |||
627 | gpu_barrier = segment_ptr->find<pthread_barrier_t>("pthread_barrier_t gpu_barrier").first; | ||
628 | gpu_mgmt_mutexes = segment_ptr->find<interprocess_mutex>("interprocess_mutex m").first; | ||
629 | } | ||
630 | } | ||
631 | |||
632 | typedef float spindata_t; | ||
633 | |||
634 | char *d_send_data[NR_GPUS] = {0}; | ||
635 | char *d_recv_data[NR_GPUS] = {0}; | ||
636 | char *d_state_data[NR_GPUS] = {0}; | ||
637 | spindata_t *d_spin_data[NR_GPUS] = {0}; | ||
638 | //unsigned int *d_iteration_count[NR_GPUS] = {0}; | ||
639 | |||
640 | |||
641 | bool p2pMigration[NR_GPUS][NR_GPUS] = {0}; | ||
642 | |||
643 | char *h_send_data = 0; | ||
644 | char *h_recv_data = 0; | ||
645 | char *h_state_data = 0; | ||
646 | |||
647 | unsigned int *h_iteration_count[NR_GPUS] = {0}; | ||
648 | |||
649 | static void init_cuda(int num_gpu_users) | ||
650 | { | ||
651 | const int PAGE_SIZE = 4*1024; | ||
652 | size_t send_alloc_bytes = SEND_SIZE + (SEND_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; | ||
653 | size_t recv_alloc_bytes = RECV_SIZE + (RECV_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; | ||
654 | size_t state_alloc_bytes = STATE_SIZE + (STATE_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; | ||
655 | |||
656 | coordinate_gpu_tasks(num_gpu_users); | ||
657 | |||
658 | switch (CUDA_SYNC_MODE) | ||
659 | { | ||
660 | case BLOCKING: | ||
661 | cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); | ||
662 | break; | ||
663 | case SPIN: | ||
664 | cudaSetDeviceFlags(cudaDeviceScheduleSpin); | ||
665 | break; | ||
666 | } | ||
667 | |||
668 | for(int i = 0; i < GPU_PARTITION_SIZE; ++i) | ||
669 | { | ||
670 | cudaDeviceProp prop; | ||
671 | int which = GPU_PARTITION*GPU_PARTITION_SIZE + i; | ||
672 | |||
673 | gpu_mgmt_mutexes[which].lock(); | ||
674 | |||
675 | set_cur_gpu(which); | ||
676 | cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0); | ||
677 | cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0); | ||
678 | |||
679 | cudaGetDeviceProperties(&prop, which); | ||
680 | GPU_HZ[which] = prop.clockRate * 1000; /* khz -> hz */ | ||
681 | NUM_SM[which] = prop.multiProcessorCount; | ||
682 | WARP_SIZE[which] = prop.warpSize; | ||
683 | |||
684 | // enough to fill the L2 cache exactly. | ||
685 | ELEM_PER_THREAD[which] = (prop.l2CacheSize/(NUM_SM[which]*WARP_SIZE[which]*sizeof(spindata_t))); | ||
686 | |||
687 | |||
688 | if (!MIGRATE_VIA_SYSMEM && prop.unifiedAddressing) | ||
689 | { | ||
690 | for(int j = 0; j < GPU_PARTITION_SIZE; ++j) | ||
691 | { | ||
692 | if (i != j) | ||
693 | { | ||
694 | int canAccess = 0; | ||
695 | cudaDeviceCanAccessPeer(&canAccess, i, j); | ||
696 | if(canAccess) | ||
697 | { | ||
698 | cudaDeviceEnablePeerAccess(j, 0); | ||
699 | p2pMigration[i][j] = true; | ||
700 | } | ||
701 | } | ||
702 | } | ||
703 | } | ||
704 | |||
705 | cudaStreamCreate(&STREAMS[CUR_DEVICE]); | ||
706 | |||
707 | cudaMalloc(&d_spin_data[which], prop.l2CacheSize); | ||
708 | cudaMemset(&d_spin_data[which], 0, prop.l2CacheSize); | ||
709 | // cudaMalloc(&d_iteration_count[which], NUM_SM[which]*WARP_SIZE[which]*sizeof(unsigned int)); | ||
710 | // cudaHostAlloc(&h_iteration_count[which], NUM_SM[which]*WARP_SIZE[which]*sizeof(unsigned int), cudaHostAllocPortable | cudaHostAllocMapped); | ||
711 | |||
712 | if (send_alloc_bytes) { | ||
713 | cudaMalloc(&d_send_data[which], send_alloc_bytes); | ||
714 | cudaHostAlloc(&h_send_data, send_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped); | ||
715 | } | ||
716 | |||
717 | if (h_recv_data) { | ||
718 | cudaMalloc(&d_recv_data[which], recv_alloc_bytes); | ||
719 | cudaHostAlloc(&h_recv_data, recv_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped); | ||
720 | } | ||
721 | |||
722 | if (h_state_data) { | ||
723 | cudaMalloc(&d_state_data[which], state_alloc_bytes); | ||
724 | |||
725 | if (MIGRATE_VIA_SYSMEM) | ||
726 | cudaHostAlloc(&h_state_data, state_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped | cudaHostAllocWriteCombined); | ||
727 | } | ||
728 | |||
729 | gpu_mgmt_mutexes[which].unlock(); | ||
730 | } | ||
731 | |||
732 | // roll back to first GPU | ||
733 | set_cur_gpu(GPU_PARTITION*GPU_PARTITION_SIZE); | ||
734 | } | ||
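
The per-thread element count above is sized so that the one-warp-per-SM spin kernel touches exactly one L2 cache's worth of data. A worked example with assumed device properties (hypothetical numbers, not queried from real hardware):

```cpp
/* Worked example (hypothetical Fermi-class device):
 *   prop.l2CacheSize         = 768 * 1024 = 786432 bytes
 *   prop.multiProcessorCount = 16, prop.warpSize = 32
 *   sizeof(spindata_t)       = 4 (float)
 *   ELEM_PER_THREAD = 786432 / (16 * 32 * 4) = 384
 * With one block per SM and one warp per block (as launched below),
 * 16 * 32 threads * 384 floats * 4 bytes = 786432 bytes = the whole L2. */
```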
735 | |||
736 | |||
737 | |||
738 | static bool MigrateToGPU_P2P(int from, int to) | ||
739 | { | ||
740 | bool success = true; | ||
741 | set_cur_gpu(to); | ||
742 | chunkMemcpy(this_gpu(d_state_data), per_gpu(d_state_data, from), | ||
743 | STATE_SIZE, cudaMemcpyDeviceToDevice, to, | ||
744 | useEngineLocks(), from, true); | ||
745 | return success; | ||
746 | } | ||
747 | |||
748 | |||
749 | static bool PullState(void) | ||
750 | { | ||
751 | bool success = true; | ||
752 | chunkMemcpy(h_state_data, this_gpu(d_state_data), | ||
753 | STATE_SIZE, cudaMemcpyDeviceToHost, | ||
754 | cur_gpu(), useEngineLocks(), -1, true); | ||
755 | return success; | ||
756 | } | ||
757 | |||
758 | static bool PushState(void) | ||
759 | { | ||
760 | bool success = true; | ||
761 | chunkMemcpy(this_gpu(d_state_data), h_state_data, | ||
762 | STATE_SIZE, cudaMemcpyHostToDevice, | ||
763 | cur_gpu(), useEngineLocks(), -1, true); | ||
764 | return success; | ||
765 | } | ||
766 | |||
767 | static bool MigrateToGPU_SysMem(int from, int to) | ||
768 | { | ||
769 | // THIS IS ON-DEMAND SYS_MEM MIGRATION. GPUSync says | ||
770 | // you should be using speculative migrations. | ||
771 | // Use PushState() and PullState(). | ||
772 | assert(false); // for now | ||
773 | |||
774 | bool success = true; | ||
775 | |||
776 | set_cur_gpu(from); | ||
777 | chunkMemcpy(h_state_data, this_gpu(d_state_data), | ||
778 | STATE_SIZE, cudaMemcpyDeviceToHost, | ||
779 | from, useEngineLocks(), -1, true); | ||
780 | |||
781 | set_cur_gpu(to); | ||
782 | chunkMemcpy(this_gpu(d_state_data), h_state_data, | ||
783 | STATE_SIZE, cudaMemcpyHostToDevice, | ||
784 | to, useEngineLocks(), -1, true); | ||
785 | |||
786 | return success; | ||
787 | } | ||
788 | |||
789 | static bool MigrateToGPU(int from, int to) | ||
790 | { | ||
791 | bool success = false; | ||
792 | |||
793 | if (from != to) | ||
794 | { | ||
795 | if(!MIGRATE_VIA_SYSMEM && p2pMigration[to][from]) | ||
796 | success = MigrateToGPU_P2P(from, to); | ||
797 | else | ||
798 | success = MigrateToGPU_SysMem(from, to); | ||
799 | } | ||
800 | else | ||
801 | { | ||
802 | set_cur_gpu(to); | ||
803 | success = true; | ||
804 | } | ||
805 | |||
806 | return success; | ||
807 | } | ||
808 | |||
809 | static bool MigrateToGPU_Implicit(int to) | ||
810 | { | ||
811 | return( MigrateToGPU(cur_gpu(), to) ); | ||
812 | } | ||
813 | |||
814 | static void MigrateIfNeeded(int next_gpu) | ||
815 | { | ||
816 | if(next_gpu != cur_gpu() && cur_gpu() != -1) | ||
817 | { | ||
818 | if (!MIGRATE_VIA_SYSMEM) | ||
819 | MigrateToGPU_Implicit(next_gpu); | ||
820 | else { | ||
821 | set_cur_gpu(next_gpu); | ||
822 | PushState(); | ||
823 | } | ||
824 | } | ||
825 | } | ||
826 | |||
827 | |||
828 | |||
829 | static void exit_cuda() | ||
830 | { | ||
831 | for(int i = 0; i < GPU_PARTITION_SIZE; ++i) | ||
832 | { | ||
833 | int which = GPU_PARTITION*GPU_PARTITION_SIZE + i; | ||
834 | gpu_mgmt_mutexes[which].lock(); | ||
835 | set_cur_gpu(which); | ||
836 | cudaDeviceReset(); | ||
837 | gpu_mgmt_mutexes[which].unlock(); | ||
838 | } | ||
839 | } | ||
840 | |||
841 | bool safetynet = false; | ||
842 | |||
843 | static void catch_exit(int catch_exit) | ||
844 | { | ||
845 | if(GPU_USING && USE_ENGINE_LOCKS && safetynet) | ||
846 | { | ||
847 | safetynet = false; | ||
848 | for(int i = 0; i < GPU_PARTITION_SIZE; ++i) | ||
849 | { | ||
850 | int which = GPU_PARTITION*GPU_PARTITION_SIZE + i; | ||
851 | set_cur_gpu(which); | ||
852 | |||
853 | // cudaDeviceReset(); | ||
854 | |||
855 | // try to unlock everything. litmus will prevent bogus calls. | ||
856 | if(USE_ENGINE_LOCKS) | ||
857 | { | ||
858 | litmus_unlock(EE_LOCKS[which]); | ||
859 | litmus_unlock(CE_SEND_LOCKS[which]); | ||
860 | if (NUM_COPY_ENGINES == 2) | ||
861 | { | ||
862 | if (RESERVED_MIGR_COPY_ENGINE) | ||
863 | litmus_unlock(CE_MIGR_SEND_LOCKS[which]); | ||
864 | else | ||
865 | litmus_unlock(CE_MIGR_RECV_LOCKS[which]); | ||
866 | } | ||
867 | } | ||
868 | } | ||
869 | litmus_unlock(TOKEN_LOCK); | ||
870 | } | ||
871 | } | ||
872 | |||
873 | |||
874 | |||
875 | |||
876 | |||
877 | static float ms_sum; | ||
878 | static int gpucount = 0; | ||
879 | |||
880 | __global__ void docudaspin(float* data, /*unsigned int* iterations,*/ unsigned int num_elem, unsigned int cycles) | ||
881 | { | ||
882 | long long int now = clock64(); | ||
883 | long long unsigned int elapsed = 0; | ||
884 | long long int last; | ||
885 | |||
886 | // unsigned int iter = 0; | ||
887 | unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; | ||
888 | unsigned int j = 0; | ||
889 | bool toggle = true; | ||
890 | |||
891 | // iterations[i] = 0; | ||
892 | do | ||
893 | { | ||
894 | data[i*num_elem+j] += (toggle) ? M_PI : -M_PI; | ||
895 | j = (j + 1 != num_elem) ? j + 1 : 0; | ||
896 | toggle = !toggle; | ||
897 | // iter++; | ||
898 | |||
899 | last = now; | ||
900 | now = clock64(); | ||
901 | |||
902 | // // exact calculation takes more cycles than a second | ||
903 | // // loop iteration when code is compiled optimized | ||
904 | // long long int diff = now - last; | ||
905 | // elapsed += (diff > 0) ? | ||
906 | // diff : | ||
907 | // now + ((~((long long int)0)<<1)>>1) - last; | ||
908 | |||
909 | // don't count iterations with clock roll-over | ||
910 | elapsed += max(0ll, now - last); | ||
911 | }while(elapsed < cycles); | ||
912 | |||
913 | // iterations[i] = iter; | ||
914 | |||
915 | return; | ||
916 | } | ||
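
The host computes the kernel's cycle budget as numcycles = cur_hz() * gpu_sec_time before launching docudaspin (see gpu_loop_for below). A small worked example, with an assumed clock rate:

```cpp
/* Worked example (assumed 1.5 GHz shader clock, for illustration only):
 *   gpu_sec_time = 0.0005 s  =>  numcycles = 1.5e9 * 0.0005 = 750000
 * Each thread then spins until clock64() has advanced by >= 750000 ticks,
 * discarding negative deltas caused by counter roll-over. Since numcycles
 * is a 32-bit unsigned int, a single spin tops out around
 * 2^32 / 1.5e9 ~= 2.8 s at this clock rate. */
```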
917 | |||
918 | static void gpu_loop_for(double gpu_sec_time, double emergency_exit) | ||
919 | { | ||
920 | int next_gpu; | ||
921 | |||
922 | if (emergency_exit && wctime() > emergency_exit) | ||
923 | goto out; | ||
924 | |||
925 | next_gpu = litmus_lock(TOKEN_LOCK); | ||
926 | { | ||
927 | MigrateIfNeeded(next_gpu); | ||
928 | |||
929 | unsigned int numcycles = (unsigned int)(cur_hz() * gpu_sec_time); | ||
930 | |||
931 | if(SEND_SIZE > 0) | ||
932 | chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE, | ||
933 | cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks()); | ||
934 | |||
935 | if(useEngineLocks()) litmus_lock(cur_ee()); | ||
936 | /* one block per sm, one warp per block */ | ||
937 | docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles); | ||
938 | // docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], d_iteration_count[cur_gpu()], cur_elem_per_thread(), numcycles); | ||
939 | cudaStreamSynchronize(cur_stream()); | ||
940 | if(useEngineLocks()) litmus_unlock(cur_ee()); | ||
941 | |||
942 | if(RECV_SIZE > 0) | ||
943 | chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE, | ||
944 | cudaMemcpyDeviceToHost, CUR_DEVICE, useEngineLocks()); | ||
945 | |||
946 | if (MIGRATE_VIA_SYSMEM) | ||
947 | PullState(); | ||
948 | } | ||
949 | litmus_unlock(TOKEN_LOCK); | ||
950 | |||
951 | last_gpu() = cur_gpu(); | ||
952 | |||
953 | out: | ||
954 | return; | ||
955 | } | ||
956 | |||
957 | static void gpu_loop_for_linux(double gpu_sec_time, double emergency_exit) | ||
958 | { | ||
959 | static int GPU_OFFSET = GPU_PARTITION * GPU_PARTITION_SIZE; | ||
960 | static gpu_pool *pool = &GPU_LINUX_SEM_POOL[GPU_PARTITION]; | ||
961 | static pthread_mutex_t *mutex = &GPU_LINUX_MUTEX_POOL[GPU_PARTITION]; | ||
962 | |||
963 | static bool once = false; | ||
964 | static cudaEvent_t start, end; | ||
965 | float ms; | ||
966 | if (!once) | ||
967 | { | ||
968 | once = true; | ||
969 | cudaEventCreate(&start); | ||
970 | cudaEventCreate(&end); | ||
971 | } | ||
972 | |||
973 | int next_gpu; | ||
974 | |||
975 | if (emergency_exit && wctime() > emergency_exit) | ||
976 | goto out; | ||
977 | |||
978 | next_gpu = pool->get(mutex, cur_gpu() - GPU_OFFSET) + GPU_OFFSET; | ||
979 | { | ||
980 | MigrateIfNeeded(next_gpu); | ||
981 | |||
982 | unsigned int numcycles = (unsigned int)(cur_hz() * gpu_sec_time); | ||
983 | |||
984 | if(SEND_SIZE > 0) | ||
985 | chunkMemcpy(this_gpu(d_state_data), h_send_data, SEND_SIZE, | ||
986 | cudaMemcpyHostToDevice, cur_gpu(), useEngineLocks()); | ||
987 | |||
988 | /* one block per sm, one warp per block */ | ||
989 | cudaEventRecord(start, cur_stream()); | ||
990 | docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles); | ||
991 | // docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], d_iteration_count[cur_gpu()], cur_elem_per_thread(), numcycles); | ||
992 | cudaEventRecord(end, cur_stream()); | ||
993 | cudaEventSynchronize(end); | ||
994 | cudaStreamSynchronize(cur_stream()); | ||
995 | |||
996 | // chunkMemcpy(this_gpu(h_iteration_count), this_gpu(d_iteration_count), sizeof(unsigned int), | ||
997 | // cudaMemcpyDeviceToHost, cur_gpu(), useEngineLocks()); | ||
998 | // | ||
999 | cudaEventElapsedTime(&ms, start, end); | ||
1000 | ms_sum += ms; | ||
1001 | ++gpucount; | ||
1002 | // printf("%f\n", ms); | ||
1003 | // printf("%f: %u\n", ms, this_gpu(h_iteration_count)[0]); | ||
1004 | |||
1005 | |||
1006 | if(RECV_SIZE > 0) | ||
1007 | chunkMemcpy(h_recv_data, this_gpu(d_state_data), RECV_SIZE, | ||
1008 | cudaMemcpyDeviceToHost, cur_gpu(), useEngineLocks()); | ||
1009 | |||
1010 | if (MIGRATE_VIA_SYSMEM) | ||
1011 | PullState(); | ||
1012 | } | ||
1013 | pool->put(mutex, cur_gpu() - GPU_OFFSET); | ||
1014 | |||
1015 | last_gpu() = cur_gpu(); | ||
1016 | |||
1017 | out: | ||
1018 | return; | ||
1019 | } | ||
1020 | |||
1021 | |||
1022 | |||
1023 | |||
1024 | static void usage(char *error) { | ||
1025 | fprintf(stderr, "Error: %s\n", error); | ||
1026 | fprintf(stderr, | ||
1027 | "Usage:\n" | ||
1028 | " rt_spin [COMMON-OPTS] WCET PERIOD DURATION\n" | ||
1029 | " rt_spin [COMMON-OPTS] -f FILE [-o COLUMN] WCET PERIOD\n" | ||
1030 | " rt_spin -l\n" | ||
1031 | "\n" | ||
1032 | "COMMON-OPTS = [-w] [-s SCALE]\n" | ||
1033 | " [-p PARTITION/CLUSTER [-z CLUSTER SIZE]] [-c CLASS]\n" | ||
1034 | " [-X LOCKING-PROTOCOL] [-L CRITICAL SECTION LENGTH] [-Q RESOURCE-ID]" | ||
1035 | "\n" | ||
1036 | "WCET and PERIOD are milliseconds, DURATION is seconds.\n" | ||
1037 | "CRITICAL SECTION LENGTH is in milliseconds.\n"); | ||
1038 | exit(EXIT_FAILURE); | ||
1039 | } | ||
1040 | |||
1041 | /* | ||
1042 | * returns the character that made processing stop, newline or EOF | ||
1043 | */ | ||
1044 | static int skip_to_next_line(FILE *fstream) | ||
1045 | { | ||
1046 | int ch; | ||
1047 | for (ch = fgetc(fstream); ch != EOF && ch != '\n'; ch = fgetc(fstream)); | ||
1048 | return ch; | ||
1049 | } | ||
1050 | |||
1051 | static void skip_comments(FILE *fstream) | ||
1052 | { | ||
1053 | int ch; | ||
1054 | for (ch = fgetc(fstream); ch == '#'; ch = fgetc(fstream)) | ||
1055 | skip_to_next_line(fstream); | ||
1056 | ungetc(ch, fstream); | ||
1057 | } | ||
1058 | |||
1059 | static void get_exec_times(const char *file, const int column, | ||
1060 | int *num_jobs, double **exec_times) | ||
1061 | { | ||
1062 | FILE *fstream; | ||
1063 | int cur_job, cur_col, ch; | ||
1064 | *num_jobs = 0; | ||
1065 | |||
1066 | fstream = fopen(file, "r"); | ||
1067 | if (!fstream) | ||
1068 | bail_out("could not open execution time file"); | ||
1069 | |||
1070 | /* figure out the number of jobs */ | ||
1071 | do { | ||
1072 | skip_comments(fstream); | ||
1073 | ch = skip_to_next_line(fstream); | ||
1074 | if (ch != EOF) | ||
1075 | ++(*num_jobs); | ||
1076 | } while (ch != EOF); | ||
1077 | |||
1078 | if (-1 == fseek(fstream, 0L, SEEK_SET)) | ||
1079 | bail_out("rewinding file failed"); | ||
1080 | |||
1081 | /* allocate space for exec times */ | ||
1082 | *exec_times = (double*)calloc(*num_jobs, sizeof(*exec_times)); | ||
1083 | if (!*exec_times) | ||
1084 | bail_out("couldn't allocate memory"); | ||
1085 | |||
1086 | for (cur_job = 0; cur_job < *num_jobs && !feof(fstream); ++cur_job) { | ||
1087 | |||
1088 | skip_comments(fstream); | ||
1089 | |||
1090 | for (cur_col = 1; cur_col < column; ++cur_col) { | ||
1091 | /* discard input until we get to the column we want */ | ||
1092 | int unused __attribute__ ((unused)) = fscanf(fstream, "%*s,"); | ||
1093 | } | ||
1094 | |||
1095 | /* get the desired exec. time */ | ||
1096 | if (1 != fscanf(fstream, "%lf", (*exec_times)+cur_job)) { | ||
1097 | fprintf(stderr, "invalid execution time near line %d\n", | ||
1098 | cur_job); | ||
1099 | exit(EXIT_FAILURE); | ||
1100 | } | ||
1101 | |||
1102 | skip_to_next_line(fstream); | ||
1103 | } | ||
1104 | |||
1105 | assert(cur_job == *num_jobs); | ||
1106 | fclose(fstream); | ||
1107 | } | ||
1108 | |||
1109 | #define NUMS 4096 | ||
1110 | static int num[NUMS]; | ||
1111 | __attribute__((unused)) static char* progname; | ||
1112 | |||
1113 | static int loop_once(void) | ||
1114 | { | ||
1115 | int i, j = 0; | ||
1116 | for (i = 0; i < NUMS; i++) | ||
1117 | j += num[i]++; | ||
1118 | return j; | ||
1119 | } | ||
1120 | |||
1121 | static int loop_for(double exec_time, double emergency_exit) | ||
1122 | { | ||
1123 | double last_loop = 0, loop_start; | ||
1124 | int tmp = 0; | ||
1125 | |||
1126 | double start = cputime(); | ||
1127 | double now = cputime(); | ||
1128 | |||
1129 | if (emergency_exit && wctime() > emergency_exit) | ||
1130 | goto out; | ||
1131 | |||
1132 | while (now + last_loop < start + exec_time) { | ||
1133 | loop_start = now; | ||
1134 | tmp += loop_once(); | ||
1135 | now = cputime(); | ||
1136 | last_loop = now - loop_start; | ||
1137 | if (emergency_exit && wctime() > emergency_exit) { | ||
1138 | /* Oops --- this should only be possible if the execution time tracking | ||
1139 | * is broken in the LITMUS^RT kernel. */ | ||
1140 | fprintf(stderr, "!!! gpuspin/%d emergency exit!\n", getpid()); | ||
1141 | fprintf(stderr, "Something is seriously wrong! Do not ignore this.\n"); | ||
1142 | break; | ||
1143 | } | ||
1144 | } | ||
1145 | |||
1146 | out: | ||
1147 | return tmp; | ||
1148 | } | ||
1149 | |||
1150 | |||
1151 | static void debug_delay_loop(void) | ||
1152 | { | ||
1153 | double start, end, delay; | ||
1154 | |||
1155 | while (1) { | ||
1156 | for (delay = 0.5; delay > 0.01; delay -= 0.01) { | ||
1157 | start = wctime(); | ||
1158 | loop_for(delay, 0); | ||
1159 | end = wctime(); | ||
1160 | printf("%6.4fs: looped for %10.8fs, delta=%11.8fs, error=%7.4f%%\n", | ||
1161 | delay, | ||
1162 | end - start, | ||
1163 | end - start - delay, | ||
1164 | 100 * (end - start - delay) / delay); | ||
1165 | } | ||
1166 | } | ||
1167 | } | ||
1168 | |||
1169 | static int gpu_job(double exec_time, double gpu_exec_time, double program_end) | ||
1170 | { | ||
1171 | double chunk1, chunk2; | ||
1172 | |||
1173 | if (wctime() > program_end) { | ||
1174 | return 0; | ||
1175 | } | ||
1176 | else { | ||
1177 | chunk1 = exec_time * drand48(); | ||
1178 | chunk2 = exec_time - chunk1; | ||
1179 | |||
1180 | loop_for(chunk1, program_end + 1); | ||
1181 | gpu_loop_for(gpu_exec_time, program_end + 1); | ||
1182 | loop_for(chunk2, program_end + 1); | ||
1183 | |||
1184 | sleep_next_period(); | ||
1185 | } | ||
1186 | return 1; | ||
1187 | } | ||
1188 | |||
1189 | static int job(double exec_time, double program_end) | ||
1190 | { | ||
1191 | if (wctime() > program_end) { | ||
1192 | return 0; | ||
1193 | } | ||
1194 | else { | ||
1195 | loop_for(exec_time, program_end + 1); | ||
1196 | sleep_next_period(); | ||
1197 | } | ||
1198 | return 1; | ||
1199 | } | ||
1200 | |||
1201 | /*****************************/ | ||
1202 | /* only used for linux modes */ | ||
1203 | |||
1204 | static struct timespec periodTime; | ||
1205 | static struct timespec releaseTime; | ||
1206 | static unsigned int job_no = 0; | ||
1207 | |||
1208 | static lt_t period_ns; | ||
1209 | |||
1210 | static void log_release() | ||
1211 | { | ||
1212 | __attribute__ ((unused)) lt_t rel = releaseTime.tv_sec * s2ns(1) + releaseTime.tv_nsec; | ||
1213 | __attribute__ ((unused)) lt_t dead = rel + period_ns; | ||
1214 | trace_release(rel, dead, job_no); | ||
1215 | } | ||
1216 | |||
1217 | static void log_completion() | ||
1218 | { | ||
1219 | trace_completion(job_no); | ||
1220 | ++job_no; | ||
1221 | } | ||
1222 | |||
1223 | static void setup_next_period_linux(struct timespec* spec, struct timespec* period) | ||
1224 | { | ||
1225 | spec->tv_sec += period->tv_sec; | ||
1226 | spec->tv_nsec += period->tv_nsec; | ||
1227 | if (spec->tv_nsec >= s2ns(1)) { | ||
1228 | ++(spec->tv_sec); | ||
1229 | spec->tv_nsec -= s2ns(1); | ||
1230 | } | ||
1231 | } | ||
1232 | |||
1233 | static void sleep_next_period_linux() | ||
1234 | { | ||
1235 | log_completion(); | ||
1236 | setup_next_period_linux(&releaseTime, &periodTime); | ||
1237 | clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &releaseTime, NULL); | ||
1238 | log_release(); | ||
1239 | } | ||
1240 | |||
1241 | static void init_linux() | ||
1242 | { | ||
1243 | mlockall(MCL_CURRENT | MCL_FUTURE); | ||
1244 | } | ||
1245 | |||
1246 | static int gpu_job_linux(double exec_time, double gpu_exec_time, double program_end) | ||
1247 | { | ||
1248 | double chunk1, chunk2; | ||
1249 | |||
1250 | if (wctime() > program_end) { | ||
1251 | return 0; | ||
1252 | } | ||
1253 | else { | ||
1254 | chunk1 = exec_time * drand48(); | ||
1255 | chunk2 = exec_time - chunk1; | ||
1256 | |||
1257 | loop_for(chunk1, program_end + 1); | ||
1258 | gpu_loop_for_linux(gpu_exec_time, program_end + 1); | ||
1259 | loop_for(chunk2, program_end + 1); | ||
1260 | |||
1261 | sleep_next_period_linux(); | ||
1262 | } | ||
1263 | return 1; | ||
1264 | } | ||
1265 | |||
1266 | static int job_linux(double exec_time, double program_end) | ||
1267 | { | ||
1268 | if (wctime() > program_end) { | ||
1269 | return 0; | ||
1270 | } | ||
1271 | else { | ||
1272 | loop_for(exec_time, program_end + 1); | ||
1273 | sleep_next_period_linux(); | ||
1274 | } | ||
1275 | return 1; | ||
1276 | } | ||
1277 | |||
1278 | /*****************************/ | ||
1279 | |||
1280 | enum eScheduler | ||
1281 | { | ||
1282 | LITMUS, | ||
1283 | LINUX, | ||
1284 | RT_LINUX | ||
1285 | }; | ||
1286 | |||
1287 | #define CPU_OPTIONS "p:z:c:wlveio:f:s:q:X:L:Q:" | ||
1288 | #define GPU_OPTIONS "g:y:r:C:E:dG:xS:R:T:Z:aFm:b:MNI" | ||
1289 | |||
1290 | // concat the option strings | ||
1291 | #define OPTSTR CPU_OPTIONS GPU_OPTIONS | ||
1292 | |||
1293 | int main(int argc, char** argv) | ||
1294 | { | ||
1295 | int ret; | ||
1296 | lt_t wcet; | ||
1297 | lt_t period; | ||
1298 | double wcet_ms = -1, gpu_wcet_ms = -1, period_ms = -1; | ||
1299 | unsigned int priority = LITMUS_LOWEST_PRIORITY; | ||
1300 | int migrate = 0; | ||
1301 | int cluster = 0; | ||
1302 | int cluster_size = 1; | ||
1303 | int opt; | ||
1304 | int wait = 0; | ||
1305 | int test_loop = 0; | ||
1306 | int column = 1; | ||
1307 | const char *file = NULL; | ||
1308 | int want_enforcement = 0; | ||
1309 | int want_signals = 0; | ||
1310 | double duration = 0, start = 0; | ||
1311 | double *exec_times = NULL; | ||
1312 | double scale = 1.0; | ||
1313 | task_class_t cls = RT_CLASS_HARD; | ||
1314 | int cur_job = 0, num_jobs = 0; | ||
1315 | struct rt_task param; | ||
1316 | |||
1317 | double budget_ms = -1.0; | ||
1318 | lt_t budget; | ||
1319 | |||
1320 | int num_gpu_users = 0; | ||
1321 | |||
1322 | |||
1323 | eScheduler scheduler = LITMUS; | ||
1324 | |||
1325 | /* locking */ | ||
1326 | // int lock_od = -1; | ||
1327 | // int resource_id = 0; | ||
1328 | // int protocol = -1; | ||
1329 | // double cs_length = 1; /* millisecond */ | ||
1330 | |||
1331 | progname = argv[0]; | ||
1332 | |||
1333 | while ((opt = getopt(argc, argv, OPTSTR)) != -1) { | ||
1334 | switch (opt) { | ||
1335 | case 'w': | ||
1336 | wait = 1; | ||
1337 | break; | ||
1338 | case 'p': | ||
1339 | cluster = atoi(optarg); | ||
1340 | migrate = 1; | ||
1341 | break; | ||
1342 | case 'z': | ||
1343 | cluster_size = atoi(optarg); | ||
1344 | CPU_PARTITION_SIZE = cluster_size; | ||
1345 | break; | ||
1346 | case 'g': | ||
1347 | GPU_USING = true; | ||
1348 | GPU_PARTITION = atoi(optarg); | ||
1349 | assert(GPU_PARTITION >= 0 && GPU_PARTITION < NR_GPUS); | ||
1350 | break; | ||
1351 | case 'y': | ||
1352 | GPU_PARTITION_SIZE = atoi(optarg); | ||
1353 | assert(GPU_PARTITION_SIZE > 0); | ||
1354 | break; | ||
1355 | case 'r': | ||
1356 | RHO = atoi(optarg); | ||
1357 | assert(RHO > 0); | ||
1358 | break; | ||
1359 | case 'C': | ||
1360 | NUM_COPY_ENGINES = atoi(optarg); | ||
1361 | assert(NUM_COPY_ENGINES == 1 || NUM_COPY_ENGINES == 2); | ||
1362 | break; | ||
1363 | case 'E': | ||
1364 | USE_ENGINE_LOCKS = true; | ||
1365 | ENGINE_LOCK_TYPE = (eEngineLockTypes)atoi(optarg); | ||
1366 | assert(ENGINE_LOCK_TYPE == FIFO || ENGINE_LOCK_TYPE == PRIOQ); | ||
1367 | break; | ||
1368 | case 'd': | ||
1369 | USE_DYNAMIC_GROUP_LOCKS = true; | ||
1370 | break; | ||
1371 | case 'G': | ||
1372 | GPU_SYNC_MODE = (eGpuSyncMode)atoi(optarg); | ||
1373 | assert(GPU_SYNC_MODE >= IKGLP_MODE && GPU_SYNC_MODE <= RGEM_MODE); | ||
1374 | break; | ||
1375 | case 'a': | ||
1376 | ENABLE_AFFINITY = true; | ||
1377 | break; | ||
1378 | case 'F': | ||
1379 | RELAX_FIFO_MAX_LEN = true; | ||
1380 | break; | ||
1381 | case 'x': | ||
1382 | CUDA_SYNC_MODE = SPIN; | ||
1383 | break; | ||
1384 | case 'S': | ||
1385 | SEND_SIZE = kbToB((size_t)atoi(optarg)); | ||
1386 | break; | ||
1387 | case 'R': | ||
1388 | RECV_SIZE = kbToB((size_t)atoi(optarg)); | ||
1389 | break; | ||
1390 | case 'T': | ||
1391 | STATE_SIZE = kbToB((size_t)atoi(optarg)); | ||
1392 | break; | ||
1393 | case 'Z': | ||
1394 | ENABLE_CHUNKING = true; | ||
1395 | CHUNK_SIZE = kbToB((size_t)atoi(optarg)); | ||
1396 | break; | ||
1397 | case 'M': | ||
1398 | MIGRATE_VIA_SYSMEM = true; | ||
1399 | break; | ||
1400 | case 'm': | ||
1401 | num_gpu_users = atoi(optarg); | ||
1402 | assert(num_gpu_users > 0); | ||
1403 | break; | ||
1404 | case 'b': | ||
1405 | budget_ms = atof(optarg); | ||
1406 | break; | ||
1407 | case 'N': | ||
1408 | scheduler = LINUX; | ||
1409 | break; | ||
1410 | case 'I': | ||
1411 | scheduler = RT_LINUX; | ||
1412 | break; | ||
1413 | case 'q': | ||
1414 | priority = atoi(optarg); | ||
1415 | break; | ||
1416 | case 'c': | ||
1417 | cls = str2class(optarg); | ||
1418 | if (cls == -1) | ||
1419 | usage("Unknown task class."); | ||
1420 | break; | ||
1421 | case 'e': | ||
1422 | want_enforcement = 1; | ||
1423 | break; | ||
1424 | case 'i': | ||
1425 | want_signals = 1; | ||
1426 | break; | ||
1427 | case 'l': | ||
1428 | test_loop = 1; | ||
1429 | break; | ||
1430 | case 'o': | ||
1431 | column = atoi(optarg); | ||
1432 | break; | ||
1433 | // case 'f': | ||
1434 | // file = optarg; | ||
1435 | // break; | ||
1436 | case 's': | ||
1437 | scale = atof(optarg); | ||
1438 | break; | ||
1439 | // case 'X': | ||
1440 | // protocol = lock_protocol_for_name(optarg); | ||
1441 | // if (protocol < 0) | ||
1442 | // usage("Unknown locking protocol specified."); | ||
1443 | // break; | ||
1444 | // case 'L': | ||
1445 | // cs_length = atof(optarg); | ||
1446 | // if (cs_length <= 0) | ||
1447 | // usage("Invalid critical section length."); | ||
1448 | // break; | ||
1449 | // case 'Q': | ||
1450 | // resource_id = atoi(optarg); | ||
1451 | // if (resource_id <= 0 && strcmp(optarg, "0")) | ||
1452 | // usage("Invalid resource ID."); | ||
1453 | // break; | ||
1454 | case ':': | ||
1455 | usage("Argument missing."); | ||
1456 | break; | ||
1457 | case '?': | ||
1458 | default: | ||
1459 | usage("Bad argument."); | ||
1460 | break; | ||
1461 | } | ||
1462 | } | ||
1463 | |||
1464 | #ifdef VANILLA_LINUX | ||
1465 | assert(scheduler != LITMUS); | ||
1466 | assert(!wait); | ||
1467 | #endif | ||
1468 | |||
1469 | // turn off some features to be safe | ||
1470 | if (scheduler != LITMUS) | ||
1471 | { | ||
1472 | RHO = 0; | ||
1473 | USE_ENGINE_LOCKS = false; | ||
1474 | USE_DYNAMIC_GROUP_LOCKS = false; | ||
1475 | ENABLE_AFFINITY = false; | ||
1476 | RELAX_FIFO_MAX_LEN = false; | ||
1477 | ENABLE_RT_AUX_THREADS = false; | ||
1478 | budget_ms = -1; | ||
1479 | want_enforcement = 0; | ||
1480 | want_signals = 0; | ||
1481 | |||
1482 | if (scheduler == RT_LINUX) | ||
1483 | { | ||
1484 | struct sched_param fifoparams; | ||
1485 | |||
1486 | assert(priority >= sched_get_priority_min(SCHED_FIFO) && | ||
1487 | priority <= sched_get_priority_max(SCHED_FIFO)); | ||
1488 | |||
1489 | memset(&fifoparams, 0, sizeof(fifoparams)); | ||
1490 | fifoparams.sched_priority = priority; | ||
1491 | assert(0 == sched_setscheduler(getpid(), SCHED_FIFO, &fifoparams)); | ||
1492 | } | ||
1493 | } | ||
1494 | else | ||
1495 | { | ||
1496 | if (!litmus_is_valid_fixed_prio(priority)) | ||
1497 | usage("Invalid priority."); | ||
1498 | } | ||
1499 | |||
1500 | if (test_loop) { | ||
1501 | debug_delay_loop(); | ||
1502 | return 0; | ||
1503 | } | ||
1504 | |||
1505 | srand(getpid()); | ||
1506 | |||
1507 | if (file) { | ||
1508 | get_exec_times(file, column, &num_jobs, &exec_times); | ||
1509 | |||
1510 | if (argc - optind < 2) | ||
1511 | usage("Arguments missing."); | ||
1512 | |||
1513 | for (cur_job = 0; cur_job < num_jobs; ++cur_job) { | ||
1514 | /* convert the execution time to seconds */ | ||
1515 | duration += exec_times[cur_job] * 0.001; | ||
1516 | } | ||
1517 | } else { | ||
1518 | /* | ||
1519 | * if we're not reading from the CSV file, then we need | ||
1520 | * three parameters | ||
1521 | */ | ||
1522 | if (argc - optind < 3) | ||
1523 | usage("Arguments missing."); | ||
1524 | } | ||
1525 | |||
1526 | if (argc - optind == 3) { | ||
1527 | assert(!GPU_USING); | ||
1528 | wcet_ms = atof(argv[optind + 0]); | ||
1529 | period_ms = atof(argv[optind + 1]); | ||
1530 | duration = atof(argv[optind + 2]); | ||
1531 | } | ||
1532 | else if (argc - optind == 4) { | ||
1533 | assert(GPU_USING); | ||
1534 | wcet_ms = atof(argv[optind + 0]); | ||
1535 | gpu_wcet_ms = atof(argv[optind + 1]); | ||
1536 | period_ms = atof(argv[optind + 2]); | ||
1537 | duration = atof(argv[optind + 3]); | ||
1538 | } | ||
1539 | |||
1540 | wcet = ms2ns(wcet_ms); | ||
1541 | period = ms2ns(period_ms); | ||
1542 | if (wcet <= 0) | ||
1543 | usage("The worst-case execution time must be a " | ||
1544 | "positive number."); | ||
1545 | if (period <= 0) | ||
1546 | usage("The period must be a positive number."); | ||
1547 | if (!file && wcet > period) { | ||
1548 | usage("The worst-case execution time must not " | ||
1549 | "exceed the period."); | ||
1550 | } | ||
1551 | if (GPU_USING && gpu_wcet_ms <= 0) | ||
1552 | usage("The worst-case gpu execution time must be a positive number."); | ||
1553 | |||
1554 | if (budget_ms > 0) | ||
1555 | budget = ms2ns(budget_ms); | ||
1556 | else | ||
1557 | budget = wcet; | ||
1558 | |||
1559 | if (file && num_jobs > 1) | ||
1560 | duration += period_ms * 0.001 * (num_jobs - 1); | ||
1561 | |||
1562 | if (migrate) { | ||
1563 | ret = be_migrate_to_cluster(cluster, cluster_size); | ||
1564 | if (ret < 0) | ||
1565 | bail_out("could not migrate to target partition or cluster."); | ||
1566 | } | ||
1567 | |||
1568 | if (scheduler != LITMUS) | ||
1569 | { | ||
1570 | // set some variables needed by linux modes | ||
1571 | if (GPU_USING) | ||
1572 | { | ||
1573 | TRACE_MIGRATIONS = true; | ||
1574 | } | ||
1575 | periodTime.tv_sec = period / s2ns(1); | ||
1576 | periodTime.tv_nsec = period - periodTime.tv_sec * s2ns(1); | ||
1577 | period_ns = period; | ||
1578 | } | ||
1579 | |||
1580 | init_rt_task_param(¶m); | ||
1581 | param.exec_cost = budget; | ||
1582 | param.period = period; | ||
1583 | param.priority = priority; | ||
1584 | param.cls = cls; | ||
1585 | param.budget_policy = (want_enforcement) ? | ||
1586 | PRECISE_ENFORCEMENT : NO_ENFORCEMENT; | ||
1587 | param.budget_signal_policy = (want_enforcement && want_signals) ? | ||
1588 | PRECISE_SIGNALS : NO_SIGNALS; | ||
1589 | param.release_policy = PERIODIC; | ||
1590 | |||
1591 | if (migrate) | ||
1592 | param.cpu = cluster_to_first_cpu(cluster, cluster_size); | ||
1593 | ret = set_rt_task_param(gettid(), ¶m); | ||
1594 | if (ret < 0) | ||
1595 | bail_out("could not setup rt task params"); | ||
1596 | |||
1597 | if (scheduler == LITMUS) | ||
1598 | init_litmus(); | ||
1599 | else | ||
1600 | init_linux(); | ||
1601 | |||
1602 | if (want_signals) { | ||
1603 | /* bind default longjmp signal handler to SIG_BUDGET. */ | ||
1604 | activate_litmus_signals(SIG_BUDGET_MASK, longjmp_on_litmus_signal); | ||
1605 | } | ||
1606 | |||
1607 | if (scheduler == LITMUS) | ||
1608 | { | ||
1609 | ret = task_mode(LITMUS_RT_TASK); | ||
1610 | if (ret != 0) | ||
1611 | bail_out("could not become RT task"); | ||
1612 | } | ||
1613 | else | ||
1614 | { | ||
1615 | trace_name(); | ||
1616 | trace_param(); | ||
1617 | } | ||
1618 | |||
1619 | // if (protocol >= 0) { | ||
1620 | // /* open reference to semaphore */ | ||
1621 | // lock_od = litmus_open_lock(protocol, resource_id, lock_namespace, &cluster); | ||
1622 | // if (lock_od < 0) { | ||
1623 | // perror("litmus_open_lock"); | ||
1624 | // usage("Could not open lock."); | ||
1625 | // } | ||
1626 | // } | ||
1627 | |||
1628 | if (GPU_USING) { | ||
1629 | allocate_locks(num_gpu_users, scheduler != LITMUS); | ||
1630 | |||
1631 | signal(SIGABRT, catch_exit); | ||
1632 | signal(SIGTERM, catch_exit); | ||
1633 | signal(SIGQUIT, catch_exit); | ||
1634 | signal(SIGSEGV, catch_exit); | ||
1635 | |||
1636 | init_cuda(num_gpu_users); | ||
1637 | safetynet = true; | ||
1638 | |||
1639 | if (ENABLE_RT_AUX_THREADS) | ||
1640 | if (enable_aux_rt_tasks(AUX_CURRENT | AUX_FUTURE) != 0) | ||
1641 | bail_out("enable_aux_rt_tasks() failed"); | ||
1642 | } | ||
1643 | |||
1644 | if (wait) { | ||
1645 | ret = wait_for_ts_release2(&releaseTime); | ||
1646 | if (ret != 0) | ||
1647 | bail_out("wait_for_ts_release2()"); | ||
1648 | |||
1649 | if (scheduler != LITMUS) | ||
1650 | log_release(); | ||
1651 | } | ||
1652 | else if (scheduler != LITMUS) | ||
1653 | { | ||
1654 | clock_gettime(CLOCK_MONOTONIC, &releaseTime); | ||
1655 | sleep_next_period_linux(); | ||
1656 | } | ||
1657 | |||
1658 | start = wctime(); | ||
1659 | |||
1660 | if (scheduler == LITMUS) | ||
1661 | { | ||
1662 | if (!GPU_USING) { | ||
1663 | while (job(wcet_ms * 0.001 * scale, start + duration)); | ||
1664 | } | ||
1665 | else { | ||
1666 | while (gpu_job(wcet_ms * 0.001 * scale, | ||
1667 | gpu_wcet_ms * 0.001 * scale, | ||
1668 | start + duration)); | ||
1669 | } | ||
1670 | } | ||
1671 | else | ||
1672 | { | ||
1673 | if (!GPU_USING) { | ||
1674 | while (job_linux(wcet_ms * 0.001 * scale, start + duration)); | ||
1675 | } | ||
1676 | else { | ||
1677 | while (gpu_job_linux(wcet_ms * 0.001 * scale, | ||
1678 | gpu_wcet_ms * 0.001 * scale, | ||
1679 | start + duration)); | ||
1680 | } | ||
1681 | } | ||
1682 | |||
1683 | if (GPU_USING && ENABLE_RT_AUX_THREADS) | ||
1684 | if (disable_aux_rt_tasks(AUX_CURRENT | AUX_FUTURE) != 0) | ||
1685 | bail_out("disable_aux_rt_tasks() failed"); | ||
1686 | |||
1687 | // if (file) { | ||
1688 | // /* use times read from the CSV file */ | ||
1689 | // for (cur_job = 0; cur_job < num_jobs; ++cur_job) { | ||
1690 | // /* convert job's length to seconds */ | ||
1691 | // job(exec_times[cur_job] * 0.001 * scale, | ||
1692 | // start + duration, | ||
1693 | // lock_od, cs_length * 0.001); | ||
1694 | // } | ||
1695 | // } else { | ||
1696 | // /* convert to seconds and scale */ | ||
1697 | // while (job(wcet_ms * 0.001 * scale, start + duration, | ||
1698 | // lock_od, cs_length * 0.001)); | ||
1699 | // } | ||
1700 | |||
1701 | if (scheduler == LITMUS) | ||
1702 | { | ||
1703 | ret = task_mode(BACKGROUND_TASK); | ||
1704 | if (ret != 0) | ||
1705 | bail_out("could not become regular task (huh?)"); | ||
1706 | } | ||
1707 | |||
1708 | if (GPU_USING) { | ||
1709 | safetynet = false; | ||
1710 | exit_cuda(); | ||
1711 | |||
1712 | |||
1713 | printf("avg: %f\n", ms_sum/gpucount); | ||
1714 | } | ||
1715 | |||
1716 | if (file) | ||
1717 | free(exec_times); | ||
1718 | |||
1719 | return 0; | ||
1720 | } | ||
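The Linux fallback above (scheduler == LINUX or RT_LINUX) drives a periodic task without LITMUS^RT by sleeping to an absolute release time: setup_next_period_linux() advances the release timestamp and sleep_next_period_linux() blocks on clock_nanosleep(TIMER_ABSTIME). A minimal sketch of that pattern in isolation follows; the 10 ms period and fixed job count are illustrative assumptions, not values taken from the patch.

/* Sketch of the absolute-time periodic loop used by the Linux modes.
 * Period and job count are illustrative only. */
#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000L

static void add_period(struct timespec *t, const struct timespec *period)
{
	t->tv_sec += period->tv_sec;
	t->tv_nsec += period->tv_nsec;
	if (t->tv_nsec >= NSEC_PER_SEC) {      /* carry nanosecond overflow */
		++t->tv_sec;
		t->tv_nsec -= NSEC_PER_SEC;
	}
}

int main(void)
{
	struct timespec release, period = { 0, 10 * 1000 * 1000 };  /* 10 ms */
	int job;

	clock_gettime(CLOCK_MONOTONIC, &release);
	for (job = 0; job < 5; ++job) {
		/* ... one job's worth of work goes here ... */
		add_period(&release, &period);
		/* sleep to the absolute release of the next job; absolute sleeps
		 * keep the schedule from drifting by accumulated wake-up latency */
		clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &release, NULL);
		printf("job %d released\n", job + 1);
	}
	return 0;
}

Sleeping to an absolute time is what lets the code above carry (or reset) the release point from job to job without drift.
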
diff --git a/gpu/rtspin_fake_cuda.cpp b/gpu/rtspin_fake_cuda.cpp
deleted file mode 100644
index 247a74c..0000000
--- a/gpu/rtspin_fake_cuda.cpp
+++ /dev/null
@@ -1,1187 +0,0 @@ | |||
1 | #include <sys/time.h> | ||
2 | |||
3 | #include <stdint.h> | ||
4 | #include <stdio.h> | ||
5 | #include <stdlib.h> | ||
6 | #include <unistd.h> | ||
7 | #include <time.h> | ||
8 | #include <assert.h> | ||
9 | #include <fcntl.h> | ||
10 | #include <errno.h> | ||
11 | |||
12 | #include <blitz/array.h> | ||
13 | |||
14 | #include <boost/interprocess/managed_shared_memory.hpp> | ||
15 | #include <boost/interprocess/sync/interprocess_barrier.hpp> | ||
16 | #include <boost/interprocess/sync/interprocess_mutex.hpp> | ||
17 | |||
18 | #include "litmus.h" | ||
19 | |||
20 | using namespace blitz; | ||
21 | using namespace std; | ||
22 | using namespace boost::interprocess; | ||
23 | |||
24 | #define RESET_RELEASE_ON_MISS | ||
25 | |||
26 | |||
27 | void bail_out(const char* msg) | ||
28 | { | ||
29 | perror(msg); | ||
30 | exit(-1 * errno); | ||
31 | } | ||
32 | |||
33 | |||
34 | static void usage(char *error) { | ||
35 | fprintf(stderr, "Error: %s\n", error); | ||
36 | fprintf(stderr, | ||
37 | "Usage:\n" | ||
38 | " rt_spin [COMMON-OPTS] WCET PERIOD DURATION\n" | ||
39 | " rt_spin [COMMON-OPTS] -f FILE [-o COLUMN] WCET PERIOD\n" | ||
40 | " rt_spin -l\n" | ||
41 | "\n" | ||
42 | "COMMON-OPTS = [-w] [-p PARTITION] [-c CLASS] [-s SCALE]\n" | ||
43 | "\n" | ||
44 | "WCET and PERIOD are milliseconds, DURATION is seconds.\n"); | ||
45 | exit(EXIT_FAILURE); | ||
46 | } | ||
47 | |||
48 | #define NUMS 4096 | ||
49 | static int num[NUMS]; | ||
50 | |||
51 | #define PAGE_SIZE (1024*4) | ||
52 | |||
53 | bool ENABLE_WAIT = true; | ||
54 | bool GPU_TASK = false; | ||
55 | bool ENABLE_AFFINITY = false; | ||
56 | bool USE_KFMLP = false; | ||
57 | bool RELAX_FIFO_MAX_LEN = false; | ||
58 | bool USE_DYNAMIC_GROUP_LOCKS = false; | ||
59 | bool BROADCAST_STATE = false; | ||
60 | bool ENABLE_CHUNKING = false; | ||
61 | bool MIGRATE_VIA_SYSMEM = false; | ||
62 | bool USE_PRIOQ = false; | ||
63 | |||
64 | int GPU_PARTITION = 0; | ||
65 | int GPU_PARTITION_SIZE = 0; | ||
66 | int NUM_SIMULT_USERS = 1; | ||
67 | size_t SEND_SIZE = 0; | ||
68 | size_t RECV_SIZE = 0; | ||
69 | size_t STATE_SIZE = 0; | ||
70 | size_t CHUNK_SIZE = PAGE_SIZE; | ||
71 | |||
72 | |||
73 | #define MAX_GPUS 8 | ||
74 | |||
75 | int KEXCLU_LOCK; | ||
76 | int EE_LOCKS[MAX_GPUS]; | ||
77 | int CE_SEND_LOCKS[MAX_GPUS]; | ||
78 | int CE_RECV_LOCKS[MAX_GPUS]; | ||
79 | |||
80 | int CUR_DEVICE = -1; | ||
81 | int LAST_DEVICE = -1; | ||
82 | |||
83 | bool useEngineLocks() | ||
84 | { | ||
85 | return(NUM_SIMULT_USERS != 1); | ||
86 | } | ||
87 | |||
88 | int gpuCyclesPerSecond = 0; | ||
89 | |||
90 | uint64_t *init_release_time = NULL; | ||
91 | barrier *release_barrier = NULL; | ||
92 | barrier *gpu_barrier = NULL; | ||
93 | interprocess_mutex *gpu_mgmt_mutexes = NULL; | ||
94 | managed_shared_memory *segment_ptr = NULL; | ||
95 | managed_shared_memory *release_segment_ptr = NULL; | ||
96 | |||
97 | // observed average rate when four GPUs on same node in use from pagelocked memory. | ||
98 | // about 1/3 to 1/4 this when there is no bus contention. | ||
99 | //const double msPerByte = 4.22e-07; | ||
100 | //const double transOverhead = 0.01008; // also observed. | ||
101 | |||
102 | |||
103 | |||
104 | char *d_send_data[MAX_GPUS] = {0}; | ||
105 | char *d_recv_data[MAX_GPUS] = {0}; | ||
106 | char *d_state_data[MAX_GPUS] = {0}; | ||
107 | |||
108 | //cudaStream_t streams[MAX_GPUS]; | ||
109 | |||
110 | char *h_send_data = 0; | ||
111 | char *h_recv_data = 0; | ||
112 | char *h_state_data = 0; | ||
113 | |||
114 | |||
115 | #include <sys/mman.h> | ||
116 | #define USE_PAGE_LOCKED_MEMORY | ||
117 | #ifdef USE_PAGE_LOCKED_MEMORY | ||
118 | #define c_malloc(s) \ | ||
119 | mmap(NULL, s , \ | ||
120 | PROT_READ | PROT_WRITE, \ | ||
121 | MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, \ | ||
122 | -1, 0) | ||
123 | #else | ||
124 | #define c_malloc(s) malloc(s) | ||
125 | #endif | ||
126 | |||
127 | typedef int cudaError_t; | ||
128 | #define cudaSuccess 0 | ||
129 | |||
130 | enum cudaMemcpyKind { | ||
131 | cudaMemcpyHostToDevice = 0, | ||
132 | cudaMemcpyDeviceToHost = 1, | ||
133 | cudaMemcpyDeviceToDevice = 2, | ||
134 | }; | ||
135 | |||
136 | cudaError_t cudaGetLastError() | ||
137 | { | ||
138 | return cudaSuccess; | ||
139 | } | ||
140 | |||
141 | //////////////////////////////////////////////////////////////////////// | ||
142 | //////////////////////////////////////////////////////////////////////// | ||
143 | //////////////////////////////////////////////////////////////////////// | ||
144 | //////////////////////////////////////////////////////////////////////// | ||
145 | |||
146 | struct ce_lock_state | ||
147 | { | ||
148 | int locks[2]; | ||
149 | size_t num_locks; | ||
150 | size_t budget_remaining; | ||
151 | bool locked; | ||
152 | |||
153 | ce_lock_state(int device_a, enum cudaMemcpyKind kind, size_t size, int device_b = -1) { | ||
154 | num_locks = (device_a != -1) + (device_b != -1); | ||
155 | |||
156 | if(device_a != -1) { | ||
157 | locks[0] = (kind == cudaMemcpyHostToDevice) ? | ||
158 | CE_SEND_LOCKS[device_a] : CE_RECV_LOCKS[device_a]; | ||
159 | } | ||
160 | |||
161 | if(device_b != -1) { | ||
162 | assert(kind == cudaMemcpyDeviceToDevice); | ||
163 | |||
164 | locks[1] = CE_RECV_LOCKS[device_b]; | ||
165 | |||
166 | if(locks[1] < locks[0]) { | ||
167 | int temp = locks[1]; | ||
168 | locks[1] = locks[0]; | ||
169 | locks[0] = temp; | ||
170 | } | ||
171 | } | ||
172 | |||
173 | if(!ENABLE_CHUNKING) | ||
174 | budget_remaining = size; | ||
175 | else | ||
176 | budget_remaining = CHUNK_SIZE; | ||
177 | } | ||
178 | |||
179 | void lock() { | ||
180 | if(USE_DYNAMIC_GROUP_LOCKS) { | ||
181 | litmus_dgl_lock(locks, num_locks); | ||
182 | } | ||
183 | else | ||
184 | { | ||
185 | for(int l = 0; l < num_locks; ++l) | ||
186 | { | ||
187 | litmus_lock(locks[l]); | ||
188 | } | ||
189 | } | ||
190 | locked = true; | ||
191 | } | ||
192 | |||
193 | void unlock() { | ||
194 | if(USE_DYNAMIC_GROUP_LOCKS) { | ||
195 | litmus_dgl_unlock(locks, num_locks); | ||
196 | } | ||
197 | else | ||
198 | { | ||
199 | // reverse order | ||
200 | for(int l = num_locks - 1; l >= 0; --l) | ||
201 | { | ||
202 | litmus_unlock(locks[l]); | ||
203 | } | ||
204 | } | ||
205 | locked = false; | ||
206 | } | ||
207 | |||
208 | void refresh() { | ||
209 | budget_remaining = CHUNK_SIZE; | ||
210 | } | ||
211 | |||
212 | bool budgetIsAvailable(size_t tosend) { | ||
213 | return(tosend >= budget_remaining); | ||
214 | } | ||
215 | |||
216 | void decreaseBudget(size_t spent) { | ||
217 | budget_remaining -= spent; | ||
218 | } | ||
219 | }; | ||
220 | |||
221 | // precondition: if do_locking == true, locks in state are held. | ||
222 | cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count, | ||
223 | enum cudaMemcpyKind kind, | ||
224 | ce_lock_state* state) | ||
225 | { | ||
226 | cudaError_t ret = cudaSuccess; | ||
227 | int remaining = count; | ||
228 | |||
229 | char* dst = (char*)a_dst; | ||
230 | const char* src = (const char*)a_src; | ||
231 | |||
232 | // disable chunking, if needed, by setting chunk_size equal to the | ||
233 | // amount of data to be copied. | ||
234 | int chunk_size = (ENABLE_CHUNKING) ? CHUNK_SIZE : count; | ||
235 | int i = 0; | ||
236 | |||
237 | while(remaining != 0) | ||
238 | { | ||
239 | int bytesToCopy = std::min(remaining, chunk_size); | ||
240 | |||
241 | if(state && state->budgetIsAvailable(bytesToCopy) && state->locked) { | ||
242 | //cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) ); | ||
243 | ret = cudaGetLastError(); | ||
244 | |||
245 | if(ret != cudaSuccess) | ||
246 | { | ||
247 | break; | ||
248 | } | ||
249 | |||
250 | state->unlock(); | ||
251 | state->refresh(); // replenish. | ||
252 | // we can only run out of | ||
253 | // budget if chunking is enabled. | ||
254 | // we presume that init budget would | ||
255 | // be set to cover entire memcpy | ||
256 | // if chunking were disabled. | ||
257 | } | ||
258 | |||
259 | if(state && !state->locked) { | ||
260 | state->lock(); | ||
261 | } | ||
262 | |||
263 | //ret = cudaMemcpy(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind); | ||
264 | //cudaMemcpyAsync(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind, streams[CUR_DEVICE]); | ||
265 | |||
266 | if(state) { | ||
267 | state->decreaseBudget(bytesToCopy); | ||
268 | } | ||
269 | |||
270 | // if(ret != cudaSuccess) | ||
271 | // { | ||
272 | // break; | ||
273 | // } | ||
274 | |||
275 | ++i; | ||
276 | remaining -= bytesToCopy; | ||
277 | } | ||
278 | return ret; | ||
279 | } | ||
280 | |||
281 | cudaError_t chunkMemcpy(void* a_dst, const void* a_src, size_t count, | ||
282 | enum cudaMemcpyKind kind, | ||
283 | int device_a = -1, // device_a == -1 disables locking | ||
284 | bool do_locking = true, | ||
285 | int device_b = -1) | ||
286 | { | ||
287 | cudaError_t ret; | ||
288 | if(!do_locking || device_a == -1) { | ||
289 | ret = __chunkMemcpy(a_dst, a_src, count, kind, NULL); | ||
290 | //cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) ); | ||
291 | if(ret == cudaSuccess) | ||
292 | ret = cudaGetLastError(); | ||
293 | } | ||
294 | else { | ||
295 | ce_lock_state state(device_a, kind, count, device_b); | ||
296 | state.lock(); | ||
297 | ret = __chunkMemcpy(a_dst, a_src, count, kind, &state); | ||
298 | //cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) ); | ||
299 | if(ret == cudaSuccess) | ||
300 | ret = cudaGetLastError(); | ||
301 | state.unlock(); | ||
302 | } | ||
303 | return ret; | ||
304 | } | ||
305 | |||
306 | |||
307 | //////////////////////////////////////////////////////////////////////// | ||
308 | //////////////////////////////////////////////////////////////////////// | ||
309 | //////////////////////////////////////////////////////////////////////// | ||
310 | |||
311 | |||
312 | inline uint64_t timespec_to_ns(const struct timespec& t) | ||
313 | { | ||
314 | return(t.tv_sec*1e9 + t.tv_nsec); | ||
315 | } | ||
316 | |||
317 | inline struct timespec ns_to_timespec(const uint64_t& ns) | ||
318 | { | ||
319 | struct timespec temp = {ns/1e9, ns - ns/1e9}; | ||
320 | return(temp); | ||
321 | } | ||
322 | |||
323 | inline uint64_t clock_gettime_ns(clockid_t clk_id) | ||
324 | { | ||
325 | struct timespec temp; | ||
326 | clock_gettime(clk_id, &temp); | ||
327 | return timespec_to_ns(temp); | ||
328 | } | ||
329 | |||
330 | |||
331 | |||
332 | static int loop_once(void) | ||
333 | { | ||
334 | int i, j = 0; | ||
335 | for (i = 0; i < NUMS; i++) | ||
336 | j += num[i]++; | ||
337 | return j; | ||
338 | } | ||
339 | |||
340 | static int loop_for(double exec_time, double emergency_exit) | ||
341 | { | ||
342 | double last_loop = 0, loop_start; | ||
343 | int tmp = 0; | ||
344 | |||
345 | double start = cputime(); | ||
346 | double now = cputime(); | ||
347 | |||
348 | while (now + last_loop < start + exec_time) { | ||
349 | loop_start = now; | ||
350 | tmp += loop_once(); | ||
351 | now = cputime(); | ||
352 | last_loop = now - loop_start; | ||
353 | if (emergency_exit && wctime() > emergency_exit) { | ||
354 | /* Oops --- this should only be possible if the execution time tracking | ||
355 | * is broken in the LITMUS^RT kernel. */ | ||
356 | fprintf(stderr, "!!! rtspin/%d emergency exit!\n", getpid()); | ||
357 | fprintf(stderr, "Something is seriously wrong! Do not ignore this.\n"); | ||
358 | break; | ||
359 | } | ||
360 | } | ||
361 | |||
362 | return tmp; | ||
363 | } | ||
364 | |||
365 | static void allocate_locks() | ||
366 | { | ||
367 | // allocate k-FMLP lock | ||
368 | int fd = open("semaphores", O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR); | ||
369 | |||
370 | int base_name = GPU_PARTITION * 1000; | ||
371 | |||
372 | if(USE_KFMLP) { | ||
373 | KEXCLU_LOCK = open_kfmlp_gpu_sem(fd, | ||
374 | base_name, /* name */ | ||
375 | GPU_PARTITION_SIZE, | ||
376 | GPU_PARTITION*GPU_PARTITION_SIZE, | ||
377 | NUM_SIMULT_USERS, | ||
378 | ENABLE_AFFINITY | ||
379 | ); | ||
380 | } | ||
381 | else { | ||
382 | KEXCLU_LOCK = open_gpusync_token_lock(fd, | ||
383 | base_name, /* name */ | ||
384 | GPU_PARTITION_SIZE, | ||
385 | GPU_PARTITION*GPU_PARTITION_SIZE, | ||
386 | NUM_SIMULT_USERS, | ||
387 | IKGLP_M_IN_FIFOS, | ||
388 | (!RELAX_FIFO_MAX_LEN) ? | ||
389 | IKGLP_OPTIMAL_FIFO_LEN : | ||
390 | IKGLP_UNLIMITED_FIFO_LEN, | ||
391 | ENABLE_AFFINITY | ||
392 | ); | ||
393 | // KEXCLU_LOCK = open_ikglp_gpu_sem(fd, | ||
394 | // base_name, /* name */ | ||
395 | // GPU_PARTITION_SIZE, | ||
396 | // GPU_PARTITION*GPU_PARTITION_SIZE, | ||
397 | // NUM_SIMULT_USERS, | ||
398 | // ENABLE_AFFINITY, | ||
399 | // RELAX_FIFO_MAX_LEN | ||
400 | // ); | ||
401 | } | ||
402 | if(KEXCLU_LOCK < 0) | ||
403 | perror("open_kexclu_sem"); | ||
404 | |||
405 | if(NUM_SIMULT_USERS > 1) | ||
406 | { | ||
407 | open_sem_t opensem = (!USE_PRIOQ) ? open_fifo_sem : open_prioq_sem; | ||
408 | const char* opensem_label = (!USE_PRIOQ) ? "open_fifo_sem" : "open_prioq_sem"; | ||
409 | |||
410 | // allocate the engine locks. | ||
411 | for (int i = 0; i < MAX_GPUS; ++i) | ||
412 | { | ||
413 | EE_LOCKS[i] = opensem(fd, (i+1)*10 + base_name); | ||
414 | if(EE_LOCKS[i] < 0) | ||
415 | perror(opensem_label); | ||
416 | |||
417 | CE_SEND_LOCKS[i] = opensem(fd, (i+1)*10 + base_name + 1); | ||
418 | if(CE_SEND_LOCKS[i] < 0) | ||
419 | perror(opensem_label); | ||
420 | |||
421 | if(NUM_SIMULT_USERS == 3) | ||
422 | { | ||
423 | // allocate a separate lock for the second copy engine | ||
424 | CE_RECV_LOCKS[i] = opensem(fd, (i+1)*10 + base_name + 2); | ||
425 | if(CE_RECV_LOCKS[i] < 0) | ||
426 | perror(opensem_label); | ||
427 | } | ||
428 | else | ||
429 | { | ||
430 | // share a single lock for the single copy engine | ||
431 | CE_RECV_LOCKS[i] = CE_SEND_LOCKS[i]; | ||
432 | } | ||
433 | } | ||
434 | } | ||
435 | } | ||
436 | |||
437 | static void allocate_host_memory() | ||
438 | { | ||
439 | // round up to page boundaries | ||
440 | size_t send_alloc_bytes = SEND_SIZE + (SEND_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; | ||
441 | size_t recv_alloc_bytes = RECV_SIZE + (RECV_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; | ||
442 | size_t state_alloc_bytes = STATE_SIZE + (STATE_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; | ||
443 | |||
444 | printf("Allocating host memory. send = %dB, recv = %dB, state = %dB\n", | ||
445 | send_alloc_bytes, recv_alloc_bytes, state_alloc_bytes); | ||
446 | |||
447 | // if(send_alloc_bytes > 0) | ||
448 | // { | ||
449 | // h_send_data = (char *)c_malloc(send_alloc_bytes); | ||
450 | // memset(h_send_data, 0x55, send_alloc_bytes); // write some random value | ||
451 | // // this will open a connection to GPU 0 if there is no active context, so | ||
452 | // // expect long stalls. LAME. | ||
453 | // cutilSafeCall( cudaHostRegister(h_send_data, send_alloc_bytes, cudaHostRegisterPortable) ); | ||
454 | // } | ||
455 | // | ||
456 | // if(recv_alloc_bytes > 0) | ||
457 | // { | ||
458 | // h_recv_data = (char *)c_malloc(recv_alloc_bytes); | ||
459 | // memset(h_recv_data, 0xAA, recv_alloc_bytes); | ||
460 | // cutilSafeCall( cudaHostRegister(h_recv_data, recv_alloc_bytes, cudaHostRegisterPortable) ); | ||
461 | // } | ||
462 | // | ||
463 | // if(state_alloc_bytes > 0) | ||
464 | // { | ||
465 | // h_state_data = (char *)c_malloc(state_alloc_bytes); | ||
466 | // memset(h_state_data, 0xCC, state_alloc_bytes); // write some random value | ||
467 | // cutilSafeCall( cudaHostRegister(h_state_data, state_alloc_bytes, cudaHostRegisterPortable) ); | ||
468 | // } | ||
469 | |||
470 | printf("Host memory allocated.\n"); | ||
471 | } | ||
472 | |||
473 | static void allocate_device_memory() | ||
474 | { | ||
475 | printf("Allocating device memory.\n"); | ||
476 | // establish a connection to each GPU. | ||
477 | // for(int i = 0; i < GPU_PARTITION_SIZE; ++i) | ||
478 | // { | ||
479 | // int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i; | ||
480 | // | ||
481 | // if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].lock(); | ||
482 | // | ||
483 | // cutilSafeCall( cudaSetDevice(which_device) ); | ||
484 | // cutilSafeCall( cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0) ); | ||
485 | // cutilSafeCall( cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0) ); | ||
486 | // | ||
487 | // cutilSafeCall( cudaStreamCreate(&streams[which_device]) ); | ||
488 | // | ||
489 | // /* pre-allocate memory, pray there's enough to go around */ | ||
490 | // if(SEND_SIZE > 0) { | ||
491 | // cutilSafeCall( cudaMalloc((void**)&d_send_data[which_device], SEND_SIZE) ); | ||
492 | // } | ||
493 | // if(RECV_SIZE > 0) { | ||
494 | // cutilSafeCall( cudaMalloc((void**)&h_recv_data[which_device], RECV_SIZE) ); | ||
495 | // } | ||
496 | // if(STATE_SIZE > 0) { | ||
497 | // cutilSafeCall( cudaMalloc((void**)&h_state_data[which_device], STATE_SIZE) ); | ||
498 | // } | ||
499 | // | ||
500 | // if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].unlock(); | ||
501 | // } | ||
502 | printf("Device memory allocated.\n"); | ||
503 | } | ||
504 | |||
505 | static void configure_gpus() | ||
506 | { | ||
507 | printf("Configuring GPU\n"); | ||
508 | |||
509 | // // SUSPEND WHEN BLOCKED!! | ||
510 | // cutilSafeCall( cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync) ); | ||
511 | // | ||
512 | // // establish a connection to each GPU. | ||
513 | // for(int i = 0; i < GPU_PARTITION_SIZE; ++i) | ||
514 | // { | ||
515 | // int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i; | ||
516 | // | ||
517 | // if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].lock(); | ||
518 | // | ||
519 | // cutilSafeCall( cudaSetDevice(which_device) ); | ||
520 | // cutilSafeCall( cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0) ); | ||
521 | // cutilSafeCall( cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0) ); | ||
522 | // | ||
523 | // cutilSafeCall( cudaStreamCreate(&streams[which_device]) ); | ||
524 | // | ||
525 | // // enable P2P migrations. | ||
526 | // // we assume all GPUs are on the same I/O hub. | ||
527 | // for(int j = 0; j < GPU_PARTITION_SIZE; ++j) | ||
528 | // { | ||
529 | // int other_device = GPU_PARTITION*GPU_PARTITION_SIZE + j; | ||
530 | // | ||
531 | // if(which_device != other_device) | ||
532 | // { | ||
533 | // cutilSafeCall( cudaDeviceEnablePeerAccess(other_device, 0) ); | ||
534 | // } | ||
535 | // } | ||
536 | // | ||
537 | // if(i == 0) | ||
538 | // { | ||
539 | // struct cudaDeviceProp pi; | ||
540 | // cudaGetDeviceProperties(&pi, i); | ||
541 | // gpuCyclesPerSecond = pi.clockRate * 1000; /* khz -> hz */ | ||
542 | // } | ||
543 | // | ||
544 | // if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].unlock(); | ||
545 | // } | ||
546 | |||
547 | printf("GPUs have been configured.\n"); | ||
548 | } | ||
549 | |||
550 | static void init_cuda() | ||
551 | { | ||
552 | configure_gpus(); | ||
553 | allocate_host_memory(); | ||
554 | allocate_device_memory(); | ||
555 | allocate_locks(); | ||
556 | } | ||
557 | |||
558 | static void exit_cuda() | ||
559 | { | ||
560 | for(int i = 0; i < GPU_PARTITION_SIZE; ++i) | ||
561 | { | ||
562 | int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i; | ||
563 | |||
564 | if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].lock(); | ||
565 | |||
566 | // cutilSafeCall( cudaSetDevice(which_device) ); | ||
567 | // cutilSafeCall( cudaDeviceReset() ); | ||
568 | |||
569 | if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].unlock(); | ||
570 | } | ||
571 | } | ||
572 | |||
573 | static void catchExit(void) | ||
574 | { | ||
575 | if(GPU_TASK) | ||
576 | { | ||
577 | // try to unlock everything. litmus will prevent bogus calls. | ||
578 | if(NUM_SIMULT_USERS > 1) | ||
579 | { | ||
580 | for(int i = 0; i < GPU_PARTITION_SIZE; ++i) | ||
581 | { | ||
582 | int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i; | ||
583 | |||
584 | litmus_unlock(EE_LOCKS[which_device]); | ||
585 | litmus_unlock(CE_SEND_LOCKS[which_device]); | ||
586 | if(NUM_SIMULT_USERS == 2) { | ||
587 | litmus_unlock(CE_RECV_LOCKS[which_device]); | ||
588 | } | ||
589 | } | ||
590 | } | ||
591 | |||
592 | if(CUR_DEVICE >= 0) { | ||
593 | unregister_nv_device(CUR_DEVICE); | ||
594 | } | ||
595 | |||
596 | litmus_unlock(KEXCLU_LOCK); | ||
597 | } | ||
598 | } | ||
599 | |||
600 | static void migrateToGPU(int destination) | ||
601 | { | ||
602 | if(!BROADCAST_STATE && STATE_SIZE > 0) | ||
603 | { | ||
604 | if(MIGRATE_VIA_SYSMEM) | ||
605 | { | ||
606 | chunkMemcpy(h_state_data, d_state_data[LAST_DEVICE], STATE_SIZE, | ||
607 | cudaMemcpyDeviceToHost, LAST_DEVICE, useEngineLocks()); | ||
608 | } | ||
609 | } | ||
610 | |||
611 | // cutilSafeCall( cudaSetDevice(destination) ); | ||
612 | |||
613 | if(!BROADCAST_STATE && STATE_SIZE > 0) | ||
614 | { | ||
615 | if(MIGRATE_VIA_SYSMEM) | ||
616 | { | ||
617 | chunkMemcpy(d_state_data[CUR_DEVICE], h_state_data, STATE_SIZE, | ||
618 | cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks()); | ||
619 | } | ||
620 | else | ||
621 | { | ||
622 | chunkMemcpy(d_state_data[destination], | ||
623 | d_state_data[LAST_DEVICE], | ||
624 | STATE_SIZE, | ||
625 | cudaMemcpyDeviceToDevice, | ||
626 | CUR_DEVICE, | ||
627 | useEngineLocks(), | ||
628 | destination); | ||
629 | } | ||
630 | } | ||
631 | } | ||
632 | |||
633 | static void broadcastState(int from) | ||
634 | { | ||
635 | if(STATE_SIZE > 0) | ||
636 | { | ||
637 | assert(CUR_DEVICE == from); | ||
638 | |||
639 | if(MIGRATE_VIA_SYSMEM) | ||
640 | { | ||
641 | chunkMemcpy(h_state_data, d_state_data[from], STATE_SIZE, | ||
642 | cudaMemcpyDeviceToHost, from, useEngineLocks()); | ||
643 | } | ||
644 | |||
645 | for(int i = 0; i < GPU_PARTITION_SIZE; ++i) | ||
646 | { | ||
647 | int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i; | ||
648 | if(which_device != from) | ||
649 | { | ||
650 | if(MIGRATE_VIA_SYSMEM) | ||
651 | { | ||
652 | // cutilSafeCall( cudaSetDevice(which_device) ); | ||
653 | CUR_DEVICE = which_device; // temporary | ||
654 | chunkMemcpy(d_state_data[which_device], h_state_data, STATE_SIZE, | ||
655 | cudaMemcpyHostToDevice, which_device, useEngineLocks()); | ||
656 | } | ||
657 | else | ||
658 | { | ||
659 | chunkMemcpy(d_state_data[which_device], | ||
660 | d_state_data[from], | ||
661 | STATE_SIZE, | ||
662 | cudaMemcpyDeviceToDevice, | ||
663 | from, | ||
664 | useEngineLocks(), | ||
665 | which_device); | ||
666 | } | ||
667 | } | ||
668 | } | ||
669 | |||
670 | if(MIGRATE_VIA_SYSMEM && CUR_DEVICE != from) | ||
671 | { | ||
672 | // cutilSafeCall( cudaSetDevice(from) ); | ||
673 | CUR_DEVICE = from; | ||
674 | } | ||
675 | } | ||
676 | } | ||
677 | |||
678 | //// Executes on graphics card. | ||
679 | //__global__ void docudaspin(unsigned int cycles) | ||
680 | //{ | ||
681 | // long long unsigned int elapsed = 0; | ||
682 | // long long int now = clock64(); | ||
683 | // long long int last; | ||
684 | // do | ||
685 | // { | ||
686 | // last = now; | ||
687 | // now = clock64(); | ||
688 | // elapsed += max(0ll, (long long int)(now - last)); // don't count iterations with clock roll-over | ||
689 | // }while(elapsed < cycles); | ||
690 | // | ||
691 | // return; | ||
692 | //} | ||
693 | |||
694 | |||
695 | |||
696 | static void gpu_loop_for(double gpu_sec_time, double emergency_exit) | ||
697 | { | ||
698 | unsigned int numcycles = (unsigned int)(gpuCyclesPerSecond * gpu_sec_time); | ||
699 | int numblocks = 1; | ||
700 | int blocksz = 1; | ||
701 | |||
702 | CUR_DEVICE = litmus_lock(KEXCLU_LOCK); | ||
703 | { | ||
704 | if(CUR_DEVICE != LAST_DEVICE && LAST_DEVICE != -1) | ||
705 | { | ||
706 | migrateToGPU(CUR_DEVICE); | ||
707 | } | ||
708 | |||
709 | if(SEND_SIZE > 0) | ||
710 | { | ||
711 | // handles chunking and locking, as appropriate. | ||
712 | chunkMemcpy(d_send_data[CUR_DEVICE], h_send_data, SEND_SIZE, | ||
713 | cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks()); | ||
714 | } | ||
715 | |||
716 | if(useEngineLocks()) litmus_lock(EE_LOCKS[CUR_DEVICE]); | ||
717 | |||
718 | // docudaspin <<<numblocks,blocksz, 0, streams[CUR_DEVICE]>>> (numcycles); | ||
719 | // cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) ); | ||
720 | |||
721 | if(useEngineLocks()) litmus_unlock(EE_LOCKS[CUR_DEVICE]); | ||
722 | |||
723 | if(RECV_SIZE > 0) | ||
724 | { | ||
725 | chunkMemcpy(h_recv_data, d_recv_data[CUR_DEVICE], RECV_SIZE, | ||
726 | cudaMemcpyDeviceToHost, CUR_DEVICE, useEngineLocks()); | ||
727 | } | ||
728 | |||
729 | if(BROADCAST_STATE) | ||
730 | { | ||
731 | broadcastState(CUR_DEVICE); | ||
732 | } | ||
733 | } | ||
734 | litmus_unlock(KEXCLU_LOCK); | ||
735 | |||
736 | LAST_DEVICE = CUR_DEVICE; | ||
737 | CUR_DEVICE = -1; | ||
738 | } | ||
739 | |||
740 | |||
741 | static void debug_delay_loop(void) | ||
742 | { | ||
743 | double start, end, delay; | ||
744 | |||
745 | while (1) { | ||
746 | for (delay = 0.5; delay > 0.01; delay -= 0.01) { | ||
747 | start = wctime(); | ||
748 | loop_for(delay, 0); | ||
749 | end = wctime(); | ||
750 | printf("%6.4fs: looped for %10.8fs, delta=%11.8fs, error=%7.4f%%\n", | ||
751 | delay, | ||
752 | end - start, | ||
753 | end - start - delay, | ||
754 | 100 * (end - start - delay) / delay); | ||
755 | } | ||
756 | } | ||
757 | } | ||
758 | |||
759 | static int job(double exec_time, double gpu_sec_time, double program_end) | ||
760 | { | ||
761 | if (wctime() > program_end) | ||
762 | return 0; | ||
763 | else if (!GPU_TASK) | ||
764 | { | ||
765 | loop_for(exec_time, program_end + 1); | ||
766 | } | ||
767 | else | ||
768 | { | ||
769 | double cpu_bookend = (exec_time)/2.0; | ||
770 | |||
771 | loop_for(cpu_bookend, program_end + 1); | ||
772 | gpu_loop_for(gpu_sec_time, program_end + 1); | ||
773 | loop_for(cpu_bookend, program_end + 1); | ||
774 | } | ||
775 | return 1; | ||
776 | } | ||
777 | |||
778 | #define OPTSTR "p:ls:e:g:G:W:N:S:R:T:BMaLyC:rz:q" | ||
779 | |||
780 | int main(int argc, char** argv) | ||
781 | { | ||
782 | atexit(catchExit); | ||
783 | |||
784 | int ret; | ||
785 | lt_t wcet; | ||
786 | lt_t period; | ||
787 | double wcet_ms, period_ms; | ||
788 | int migrate = 0; | ||
789 | int cpu = 0; | ||
790 | int opt; | ||
791 | int test_loop = 0; | ||
792 | // int column = 1; | ||
793 | const char *file = NULL; | ||
794 | int want_enforcement = 0; | ||
795 | double duration = 0, releaseTime = 0; | ||
796 | double *exec_times = NULL; | ||
797 | double scale = 1.0; | ||
798 | uint64_t cur_job; | ||
799 | uint64_t num_jobs; | ||
800 | |||
801 | int create_shm = -1; | ||
802 | int num_tasks = 0; | ||
803 | |||
804 | double gpu_sec_ms = 0; | ||
805 | |||
806 | while ((opt = getopt(argc, argv, OPTSTR)) != -1) { | ||
807 | // printf("opt = %c optarg = %s\n", opt, optarg); | ||
808 | switch (opt) { | ||
809 | // case 'w': | ||
810 | // ENABLE_WAIT = 1; | ||
811 | // break; | ||
812 | case 'p': | ||
813 | cpu = atoi(optarg); | ||
814 | migrate = 1; | ||
815 | break; | ||
816 | case 'l': | ||
817 | test_loop = 1; | ||
818 | break; | ||
819 | case 's': | ||
820 | scale = atof(optarg); | ||
821 | break; | ||
822 | case 'e': | ||
823 | gpu_sec_ms = atof(optarg); | ||
824 | break; | ||
825 | // case 'x': | ||
826 | // trans_sec_ms = atof(optarg); | ||
827 | // break; | ||
828 | case 'z': | ||
829 | NUM_SIMULT_USERS = atoi(optarg); | ||
830 | break; | ||
831 | case 'q': | ||
832 | USE_PRIOQ = true; | ||
833 | break; | ||
834 | case 'g': | ||
835 | GPU_TASK = 1; | ||
836 | GPU_PARTITION_SIZE = atoi(optarg); | ||
837 | break; | ||
838 | case 'G': | ||
839 | GPU_PARTITION = atoi(optarg); | ||
840 | break; | ||
841 | case 'S': | ||
842 | SEND_SIZE = (size_t)(atof(optarg)*1024); | ||
843 | break; | ||
844 | case 'R': | ||
845 | RECV_SIZE = (size_t)(atof(optarg)*1024); | ||
846 | break; | ||
847 | case 'T': | ||
848 | STATE_SIZE = (size_t)(atof(optarg)*1024); | ||
849 | break; | ||
850 | case 'B': | ||
851 | BROADCAST_STATE = true; | ||
852 | break; | ||
853 | case 'M': | ||
854 | MIGRATE_VIA_SYSMEM = true; | ||
855 | break; | ||
856 | case 'a': | ||
857 | ENABLE_AFFINITY = true; | ||
858 | break; | ||
859 | case 'r': | ||
860 | RELAX_FIFO_MAX_LEN = true; | ||
861 | break; | ||
862 | case 'L': | ||
863 | USE_KFMLP = true; | ||
864 | break; | ||
865 | case 'y': | ||
866 | USE_DYNAMIC_GROUP_LOCKS = true; | ||
867 | break; | ||
868 | case 'C': | ||
869 | ENABLE_CHUNKING = true; | ||
870 | CHUNK_SIZE = (size_t)(atof(optarg)*1024); | ||
871 | break; | ||
872 | case 'W': | ||
873 | create_shm = atoi(optarg); | ||
874 | break; | ||
875 | case 'N': | ||
876 | num_tasks = atoi(optarg); | ||
877 | break; | ||
878 | case ':': | ||
879 | usage("Argument missing."); | ||
880 | break; | ||
881 | case '?': | ||
882 | default: | ||
883 | usage("Bad argument."); | ||
884 | break; | ||
885 | } | ||
886 | } | ||
887 | |||
888 | if (test_loop) { | ||
889 | debug_delay_loop(); | ||
890 | return 0; | ||
891 | } | ||
892 | |||
893 | // if (file) { | ||
894 | // int num_jobs_tmp; | ||
895 | // get_exec_times(file, column, &num_jobs_tmp, &exec_times); | ||
896 | // num_jobs = num_jobs_tmp; | ||
897 | // | ||
898 | // if (argc - optind < 2) | ||
899 | // usage("Arguments missing."); | ||
900 | // | ||
901 | // for (cur_job = 0; cur_job < num_jobs; ++cur_job) { | ||
902 | // /* convert the execution time to seconds */ | ||
903 | // duration += exec_times[cur_job] * 0.001; | ||
904 | // } | ||
905 | // } else { | ||
906 | /* | ||
907 | * if we're not reading from the CSV file, then we need | ||
908 | * three parameters | ||
909 | */ | ||
910 | if (argc - optind < 3) | ||
911 | usage("Arguments missing."); | ||
912 | // } | ||
913 | |||
914 | wcet_ms = atof(argv[optind + 0]); | ||
915 | period_ms = atof(argv[optind + 1]); | ||
916 | |||
917 | wcet = wcet_ms * __NS_PER_MS; | ||
918 | period = period_ms * __NS_PER_MS; | ||
919 | if (wcet <= 0) | ||
920 | usage("The worst-case execution time must be a " | ||
921 | "positive number."); | ||
922 | if (period <= 0) | ||
923 | usage("The period must be a positive number."); | ||
924 | if (!file && wcet > period) { | ||
925 | usage("The worst-case execution time must not " | ||
926 | "exceed the period."); | ||
927 | } | ||
928 | |||
929 | if (!file) | ||
930 | { | ||
931 | duration = atof(argv[optind + 2]); | ||
932 | num_jobs = ((double)duration*1e3)/period_ms; | ||
933 | ++num_jobs; // padding | ||
934 | } | ||
935 | else if (file && num_jobs > 1) | ||
936 | { | ||
937 | duration += period_ms * 0.001 * (num_jobs - 1); | ||
938 | } | ||
939 | |||
940 | if (migrate) { | ||
941 | ret = be_migrate_to(cpu); | ||
942 | if (ret < 0) | ||
943 | bail_out("could not migrate to target partition"); | ||
944 | } | ||
945 | |||
946 | if(ENABLE_WAIT) | ||
947 | { | ||
948 | if(num_tasks > 0) | ||
949 | { | ||
950 | printf("%d creating release shared memory\n", getpid()); | ||
951 | shared_memory_object::remove("release_barrier_memory"); | ||
952 | release_segment_ptr = new managed_shared_memory(create_only, "release_barrier_memory", 4*1024); | ||
953 | |||
954 | printf("%d creating release barrier for %d users\n", getpid(), num_tasks); | ||
955 | release_barrier = release_segment_ptr->construct<barrier>("barrier release_barrier")(num_tasks); | ||
956 | |||
957 | init_release_time = release_segment_ptr->construct<uint64_t>("uint64_t instance")(); | ||
958 | *init_release_time = 0; | ||
959 | } | ||
960 | else | ||
961 | { | ||
962 | do | ||
963 | { | ||
964 | try | ||
965 | { | ||
966 | printf("%d opening release shared memory\n", getpid()); | ||
967 | segment_ptr = new managed_shared_memory(open_only, "release_barrier_memory"); | ||
968 | } | ||
969 | catch(...) | ||
970 | { | ||
971 | printf("%d shared memory not ready. sleeping\n", getpid()); | ||
972 | sleep(1); | ||
973 | } | ||
974 | }while(segment_ptr == NULL); | ||
975 | |||
976 | release_barrier = segment_ptr->find<barrier>("barrier release_barrier").first; | ||
977 | init_release_time = segment_ptr->find<uint64_t>("uint64_t instance").first; | ||
978 | } | ||
979 | } | ||
980 | |||
981 | |||
982 | if(GPU_TASK) | ||
983 | { | ||
984 | if(ENABLE_WAIT) | ||
985 | { | ||
986 | if(create_shm > -1) | ||
987 | { | ||
988 | printf("%d creating shared memory\n", getpid()); | ||
989 | shared_memory_object::remove("gpu_barrier_memory"); | ||
990 | segment_ptr = new managed_shared_memory(create_only, "gpu_barrier_memory", 4*1024); | ||
991 | |||
992 | printf("%d creating a barrier for %d users\n", getpid(), create_shm); | ||
993 | gpu_barrier = segment_ptr->construct<barrier>("barrier instance")(create_shm); | ||
994 | printf("%d creating gpu mgmt mutexes for 8 devices\n", getpid()); | ||
995 | gpu_mgmt_mutexes = segment_ptr->construct<interprocess_mutex>("interprocess_mutex m")[8](); | ||
996 | } | ||
997 | else | ||
998 | { | ||
999 | do | ||
1000 | { | ||
1001 | try | ||
1002 | { | ||
1003 | printf("%d opening shared memory\n", getpid()); | ||
1004 | segment_ptr = new managed_shared_memory(open_only, "gpu_barrier_memory"); | ||
1005 | } | ||
1006 | catch(...) | ||
1007 | { | ||
1008 | printf("%d shared memory not ready. sleeping\n", getpid()); | ||
1009 | sleep(1); | ||
1010 | } | ||
1011 | }while(segment_ptr == NULL); | ||
1012 | |||
1013 | gpu_barrier = segment_ptr->find<barrier>("barrier instance").first; | ||
1014 | gpu_mgmt_mutexes = segment_ptr->find<interprocess_mutex>("interprocess_mutex m").first; | ||
1015 | } | ||
1016 | } | ||
1017 | |||
1018 | // scale data transmission too?? | ||
1019 | SEND_SIZE *= scale; | ||
1020 | RECV_SIZE *= scale; | ||
1021 | STATE_SIZE *= scale; | ||
1022 | |||
1023 | init_cuda(); | ||
1024 | } | ||
1025 | |||
1026 | ret = sporadic_task_ns(wcet, period, 0, cpu, RT_CLASS_SOFT, | ||
1027 | want_enforcement ? PRECISE_ENFORCEMENT | ||
1028 | : NO_ENFORCEMENT, | ||
1029 | migrate); | ||
1030 | if (ret < 0) | ||
1031 | bail_out("could not setup rt task params"); | ||
1032 | |||
1033 | init_litmus(); | ||
1034 | |||
1035 | ret = task_mode(LITMUS_RT_TASK); | ||
1036 | if (ret != 0) | ||
1037 | bail_out("could not become RT task"); | ||
1038 | |||
1039 | |||
1040 | |||
1041 | uint64_t jobCount = 0; | ||
1042 | blitz::Array<uint64_t, 1> responseTimeLog(num_jobs+1); | ||
1043 | |||
1044 | struct timespec spec; | ||
1045 | uint64_t release; | ||
1046 | uint64_t finish; | ||
1047 | |||
1048 | |||
1049 | if (ENABLE_WAIT) { | ||
1050 | printf("Waiting for release.\n"); | ||
1051 | ret = wait_for_ts_release(); | ||
1052 | if (ret != 0) | ||
1053 | bail_out("wait_for_ts_release()"); | ||
1054 | } | ||
1055 | else | ||
1056 | { | ||
1057 | sleep_next_period(); | ||
1058 | } | ||
1059 | |||
1060 | clock_gettime(CLOCK_MONOTONIC, &spec); | ||
1061 | release = timespec_to_ns(spec); | ||
1062 | if (!__sync_bool_compare_and_swap(init_release_time, 0, release)) | ||
1063 | { | ||
1064 | release = *init_release_time; | ||
1065 | } | ||
1066 | |||
1067 | releaseTime = wctime(); | ||
1068 | double failsafeEnd = releaseTime + duration; | ||
1069 | |||
1070 | |||
1071 | if (file) { | ||
1072 | /* use times read from the CSV file */ | ||
1073 | for (cur_job = 0; cur_job < num_jobs; ++cur_job) { | ||
1074 | /* convert job's length to seconds */ | ||
1075 | job(exec_times[cur_job] * 0.001 * scale, | ||
1076 | gpu_sec_ms * 0.001 * scale, | ||
1077 | failsafeEnd); | ||
1078 | } | ||
1079 | } else { | ||
1080 | /* convert to seconds and scale */ | ||
1081 | int keepGoing; | ||
1082 | do | ||
1083 | { | ||
1084 | keepGoing = job(wcet_ms * 0.001 * scale, gpu_sec_ms * 0.001 * scale, failsafeEnd); | ||
1085 | |||
1086 | |||
1087 | clock_gettime(CLOCK_MONOTONIC, &spec); | ||
1088 | finish = timespec_to_ns(spec); | ||
1089 | |||
1090 | responseTimeLog(min(num_jobs,jobCount++)) = finish - release; | ||
1091 | |||
1092 | // this is an estimated upper-bound on release time. it may be off by several microseconds. | ||
1093 | #ifdef RESET_RELEASE_ON_MISS | ||
1094 | release = (release + period < finish) ? | ||
1095 | finish : /* missed deadline. adopt next release as current time. */ | ||
1096 | release + period; /* some time in the future. */ | ||
1097 | #else | ||
1098 | release = release + period; // allow things to get progressively later. | ||
1099 | #endif | ||
1100 | |||
1101 | sleep_next_period(); | ||
1102 | clock_gettime(CLOCK_MONOTONIC, &spec); | ||
1103 | release = min(timespec_to_ns(spec), release); | ||
1104 | |||
1105 | } while(keepGoing); | ||
1106 | } | ||
1107 | |||
1108 | if(GPU_TASK && ENABLE_WAIT) | ||
1109 | { | ||
1110 | printf("%d waiting at barrier\n", getpid()); | ||
1111 | gpu_barrier->wait(); | ||
1112 | } | ||
1113 | |||
1114 | ret = task_mode(BACKGROUND_TASK); | ||
1115 | if (ret != 0) | ||
1116 | bail_out("could not become regular task (huh?)"); | ||
1117 | |||
1118 | if (file) | ||
1119 | free(exec_times); | ||
1120 | |||
1121 | if(GPU_TASK) | ||
1122 | { | ||
1123 | /* | ||
1124 | if(ENABLE_WAIT) | ||
1125 | { | ||
1126 | // wait for all GPU using tasks ext RT mode. | ||
1127 | printf("%d waiting at barrier\n", getpid()); | ||
1128 | gpu_barrier->wait(); | ||
1129 | } | ||
1130 | */ | ||
1131 | |||
1132 | exit_cuda(); | ||
1133 | |||
1134 | if(ENABLE_WAIT) | ||
1135 | { | ||
1136 | /* wait before we clean up memory */ | ||
1137 | printf("%d waiting for all to shutdown GPUs\n", getpid()); | ||
1138 | gpu_barrier->wait(); | ||
1139 | |||
1140 | /* | ||
1141 | if(create_shm > -1) | ||
1142 | { | ||
1143 | printf("%d removing shared memory\n", getpid()); | ||
1144 | shared_memory_object::remove("gpu_barrier_memory"); | ||
1145 | } | ||
1146 | */ | ||
1147 | } | ||
1148 | } | ||
1149 | |||
1150 | |||
1151 | if (ENABLE_WAIT) | ||
1152 | { | ||
1153 | printf("%d waiting at exit barrier\n", getpid()); | ||
1154 | release_barrier->wait(); | ||
1155 | } | ||
1156 | |||
1157 | |||
1158 | char gpu_using_str[] = "GPU\n"; | ||
1159 | char cpu_only_str[] = "CPU\n"; | ||
1160 | #define USED(arr) (arr)(Range(fromStart,min(num_jobs-1,jobCount-1))) | ||
1161 | // period (ms), avg-rt, min-rt, max-rt, avg-slack, numMisses | ||
1162 | printf("DONE,%d,%d,%f,%f,%f,%lu,%lu,%f,%lu,%d,%d,%s", | ||
1163 | cpu, | ||
1164 | getpid(), | ||
1165 | period_ms, | ||
1166 | // average | ||
1167 | blitz::mean(USED(responseTimeLog)), | ||
1168 | // average pct of period | ||
1169 | 100.0*(blitz::mean(USED(responseTimeLog))/period), | ||
1170 | // min | ||
1171 | blitz::min(USED(responseTimeLog)), | ||
1172 | // max | ||
1173 | blitz::max(USED(responseTimeLog)), | ||
1174 | // average slack | ||
1175 | blitz::mean((uint64_t)period - USED(responseTimeLog)), | ||
1176 | // num jobs | ||
1177 | min(num_jobs-1,jobCount-1), | ||
1178 | // num misses | ||
1179 | blitz::count(USED(responseTimeLog) > (uint64_t)period), | ||
1180 | // num misses w/ unbounded | ||
1181 | blitz::count(USED(responseTimeLog) > (uint64_t)(2*period)), | ||
1182 | // flag gpu-using tasks | ||
1183 | ((GPU_TASK) ? gpu_using_str : cpu_only_str) | ||
1184 | ); | ||
1185 | |||
1186 | return 0; | ||
1187 | } | ||
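The chunked copy in the removed file (__chunkMemcpy()/chunkMemcpy(), with a similar chunking scheme surviving behind the -Z/CHUNK_SIZE option in gpuspin.cu above) bounds how long a copy-engine lock is held by moving data CHUNK_SIZE bytes at a time and dropping the lock between chunks. The sketch below shows just that structure, with a pthread mutex and plain memcpy standing in for the LITMUS engine lock and the CUDA copy; both stand-ins are assumptions for illustration.

/* Sketch of the chunked, lock-bounded copy; pthread_mutex_t stands in for the
 * copy-engine lock and memcpy for the CUDA transfer. */
#include <pthread.h>
#include <string.h>

#define CHUNK 4096   /* illustrative chunk size, analogous to CHUNK_SIZE */

static pthread_mutex_t ce_lock = PTHREAD_MUTEX_INITIALIZER;

static void chunked_copy(char *dst, const char *src, size_t count)
{
	size_t done = 0;
	while (done < count) {
		size_t n = (count - done < CHUNK) ? (count - done) : CHUNK;
		pthread_mutex_lock(&ce_lock);     /* hold the lock for one chunk only */
		memcpy(dst + done, src + done, n);
		pthread_mutex_unlock(&ce_lock);   /* a waiting higher-priority copy can run now */
		done += n;
	}
}

In the original, a budget_remaining counter decides when the current hold must be dropped and refresh()ed; with chunking enabled that reduces to exactly this per-chunk lock/unlock pattern.
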
diff --git a/include/common.h b/include/common.h
index d1234ba..faf2c07 100644
--- a/include/common.h
+++ b/include/common.h
@@ -1,7 +1,14 @@ | |||
1 | #ifndef COMMON_H | 1 | #ifndef COMMON_H |
2 | #define COMMON_H | 2 | #define COMMON_H |
3 | 3 | ||
4 | #ifdef __cplusplus | ||
5 | extern "C" { | ||
6 | #endif | ||
4 | 7 | ||
5 | void bail_out(const char* msg); | 8 | void bail_out(const char* msg); |
6 | 9 | ||
10 | #ifdef __cplusplus | ||
11 | } | ||
12 | #endif | ||
13 | |||
7 | #endif | 14 | #endif |
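
The extern "C" guard added to include/common.h matters because bail_out() is built as C while the GPU spinner above is C++; without C linkage, a C++ includer would reference a mangled symbol that the C-compiled definition does not provide, and linking would fail. A small caller sketch follows; the file name and input path are illustrative and not part of the patch.

/* example.cpp -- illustrative; compile as C++ and link against the
 * C-compiled object that defines bail_out(). */
#include <stdio.h>
#include "common.h"   /* now declares bail_out() with C linkage for C++ includers */

int main(void)
{
	FILE *f = fopen("exec_times.csv", "r");   /* hypothetical input file */
	if (!f)
		bail_out("could not open execution time file");
	fclose(f);
	return 0;
}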