| author | Glenn Elliott <gelliott@cs.unc.edu> | 2013-04-14 15:06:43 -0400 |
|---|---|---|
| committer | Glenn Elliott <gelliott@cs.unc.edu> | 2013-04-14 15:06:43 -0400 |
| commit | 37b4a24ba84f1dffd680fd550a3d8cad2ac5e3a8 (patch) | |
| tree | 5dc5e56a7a4f424e75f59f7705263bdb43b86fb3 | |
| parent | 209f1961ea2d5863d6f2d2e9d2323446ee5e53c4 (diff) | |
Implemented gpusync rtspin.
| -rw-r--r-- | Makefile | 51 |
| -rw-r--r-- | gpu/budget.cpp | 143 |
| -rw-r--r-- | gpu/gpuspin.cu | 1720 |
| -rw-r--r-- | gpu/rtspin_fake_cuda.cpp | 1187 |
| -rw-r--r-- | include/common.h | 7 |
5 files changed, 1909 insertions(+), 1199 deletions(-)

diff --git a/Makefile b/Makefile
| @@ -24,6 +24,12 @@ flags-debug-cpp = -O2 -Wall -Werror -g | |||
| 24 | flags-api = -D_XOPEN_SOURCE=600 -D_GNU_SOURCE | 24 | flags-api = -D_XOPEN_SOURCE=600 -D_GNU_SOURCE |
| 25 | flags-misc = -fasynchronous-unwind-tables -fnon-call-exceptions | 25 | flags-misc = -fasynchronous-unwind-tables -fnon-call-exceptions |
| 26 | 26 | ||
| 27 | flags-cu-debug = -g -G -Xcompiler -Wall -Xcompiler -Werror | ||
| 28 | flags-cu-optim = -O3 -Xcompiler -march=native | ||
| 29 | flags-cu-nvcc = --use_fast_math -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 | ||
| 30 | flags-cu-misc = -Xcompiler -fasynchronous-unwind-tables -Xcompiler -fnon-call-exceptions -Xcompiler -malign-double -Xcompiler -pthread | ||
| 31 | flags-cu-x86_64 = -m64 | ||
| 32 | |||
| 27 | # architecture-specific flags | 33 | # architecture-specific flags |
| 28 | flags-i386 = -m32 | 34 | flags-i386 = -m32 |
| 29 | flags-x86_64 = -m64 | 35 | flags-x86_64 = -m64 |
| @@ -51,12 +57,19 @@ headers = -I${LIBLITMUS}/include -I${LIBLITMUS}/arch/${include-${ARCH}}/include | |||
| 51 | 57 | ||
| 52 | # combine options | 58 | # combine options |
| 53 | CPPFLAGS = ${flags-api} ${flags-debug-cpp} ${flags-misc} ${flags-${ARCH}} -DARCH=${ARCH} ${headers} | 59 | CPPFLAGS = ${flags-api} ${flags-debug-cpp} ${flags-misc} ${flags-${ARCH}} -DARCH=${ARCH} ${headers} |
| 60 | #CUFLAGS = ${flags-api} ${flags-cu-debug} ${flags-cu-optim} ${flags-cu-nvcc} ${flags-cu-misc} -DARCH=${ARCH} ${headers} | ||
| 61 | CUFLAGS = ${flags-api} ${flags-cu-optim} ${flags-cu-nvcc} ${flags-cu-misc} -DARCH=${ARCH} ${headers} | ||
| 54 | CFLAGS = ${flags-debug} ${flags-misc} | 62 | CFLAGS = ${flags-debug} ${flags-misc} |
| 55 | LDFLAGS = ${flags-${ARCH}} | 63 | LDFLAGS = ${flags-${ARCH}} |
| 56 | 64 | ||
| 57 | # how to link against liblitmus | 65 | # how to link against liblitmus |
| 58 | liblitmus-flags = -L${LIBLITMUS} -llitmus | 66 | liblitmus-flags = -L${LIBLITMUS} -llitmus |
| 59 | 67 | ||
| 68 | # how to link cuda | ||
| 69 | cuda-flags-i386 = -L/usr/local/cuda/lib | ||
| 70 | cuda-flags-x86_64 = -L/usr/local/cuda/lib64 | ||
| 71 | cuda-flags = ${cuda-flags-${ARCH}} -lcudart | ||
| 72 | |||
| 60 | # Force gcc instead of cc, but let the user specify a more specific version if | 73 | # Force gcc instead of cc, but let the user specify a more specific version if |
| 61 | # desired. | 74 | # desired. |
| 62 | ifeq (${CC},cc) | 75 | ifeq (${CC},cc) |
| @@ -67,20 +80,24 @@ endif | |||
| 67 | CPP = g++ | 80 | CPP = g++ |
| 68 | #endif | 81 | #endif |
| 69 | 82 | ||
| 83 | CU = nvcc | ||
| 84 | |||
| 70 | # incorporate cross-compiler (if any) | 85 | # incorporate cross-compiler (if any) |
| 71 | CC := ${CROSS_COMPILE}${CC} | 86 | CC := ${CROSS_COMPILE}${CC} |
| 72 | CPP := ${CROSS_COMPILE}${CPP} | 87 | CPP := ${CROSS_COMPILE}${CPP} |
| 73 | LD := ${CROSS_COMPILE}${LD} | 88 | LD := ${CROSS_COMPILE}${LD} |
| 74 | AR := ${CROSS_COMPILE}${AR} | 89 | AR := ${CROSS_COMPILE}${AR} |
| 90 | CU := ${CROSS_COMPILE}${CU} | ||
| 75 | 91 | ||
| 76 | # ############################################################################## | 92 | # ############################################################################## |
| 77 | # Targets | 93 | # Targets |
| 78 | 94 | ||
| 79 | all = lib ${rt-apps} ${rt-cppapps} | 95 | all = lib ${rt-apps} ${rt-cppapps} ${rt-cuapps} |
| 80 | rt-apps = cycles base_task rt_launch rtspin release_ts measure_syscall \ | 96 | rt-apps = cycles base_task rt_launch rtspin release_ts measure_syscall \ |
| 81 | base_mt_task uncache runtests \ | 97 | base_mt_task uncache runtests \ |
| 82 | nested locktest ikglptest dgl aux_threads normal_task | 98 | nested locktest ikglptest dgl aux_threads normal_task |
| 83 | rt-cppapps = budget | 99 | rt-cppapps = budget |
| 100 | rt-cuapps = gpuspin | ||
| 84 | 101 | ||
| 85 | .PHONY: all lib clean dump-config TAGS tags cscope help | 102 | .PHONY: all lib clean dump-config TAGS tags cscope help |
| 86 | 103 | ||
| @@ -95,10 +112,14 @@ inc/config.makefile: Makefile | |||
| 95 | @printf "%-15s= %-20s\n" \ | 112 | @printf "%-15s= %-20s\n" \ |
| 96 | ARCH ${ARCH} \ | 113 | ARCH ${ARCH} \ |
| 97 | CFLAGS '${CFLAGS}' \ | 114 | CFLAGS '${CFLAGS}' \ |
| 115 | CPPFLAGS '${CPPFLAGS}' \ | ||
| 116 | CUFLAGS '${CUFLAGS}' \ | ||
| 98 | LDFLAGS '${LDFLAGS}' \ | 117 | LDFLAGS '${LDFLAGS}' \ |
| 99 | LDLIBS '${liblitmus-flags}' \ | 118 | LDLIBS '${liblitmus-flags}' \ |
| 100 | CPPFLAGS '${CPPFLAGS}' \ | 119 | CPPFLAGS '${CPPFLAGS}' \ |
| 101 | CC '${shell which ${CC}}' \ | 120 | CC '${shell which ${CC}}' \ |
| 121 | CPP '${shell which ${CPP}}' \ | ||
| 122 | CU '${shell which ${CU}}' \ | ||
| 102 | LD '${shell which ${LD}}' \ | 123 | LD '${shell which ${LD}}' \ |
| 103 | AR '${shell which ${AR}}' \ | 124 | AR '${shell which ${AR}}' \ |
| 104 | > $@ | 125 | > $@ |
| @@ -112,10 +133,12 @@ dump-config: | |||
| 112 | headers "${headers}" \ | 133 | headers "${headers}" \ |
| 113 | "kernel headers" "${imported-headers}" \ | 134 | "kernel headers" "${imported-headers}" \ |
| 114 | CFLAGS "${CFLAGS}" \ | 135 | CFLAGS "${CFLAGS}" \ |
| 115 | LDFLAGS "${LDFLAGS}" \ | ||
| 116 | CPPFLAGS "${CPPFLAGS}" \ | 136 | CPPFLAGS "${CPPFLAGS}" \ |
| 137 | CUFLAGS "${CUFLAGS}" \ | ||
| 138 | LDFLAGS "${LDFLAGS}" \ | ||
| 117 | CC "${CC}" \ | 139 | CC "${CC}" \ |
| 118 | CPP "${CPP}" \ | 140 | CPP "${CPP}" \ |
| 141 | CU "${CU}" \ | ||
| 119 | LD "${LD}" \ | 142 | LD "${LD}" \ |
| 120 | AR "${AR}" \ | 143 | AR "${AR}" \ |
| 121 | obj-all "${obj-all}" | 144 | obj-all "${obj-all}" |
| @@ -124,8 +147,7 @@ help: | |||
| 124 | @cat INSTALL | 147 | @cat INSTALL |
| 125 | 148 | ||
| 126 | clean: | 149 | clean: |
| 127 | rm -f ${rt-apps} | 150 | rm -f ${rt-apps} ${rt-cppapps} ${rt-cuapps} |
| 128 | rm -f ${rt-cppapps} | ||
| 129 | rm -f *.o *.d *.a test_catalog.inc | 151 | rm -f *.o *.d *.a test_catalog.inc |
| 130 | rm -f ${imported-headers} | 152 | rm -f ${imported-headers} |
| 131 | rm -f inc/config.makefile | 153 | rm -f inc/config.makefile |
| @@ -259,6 +281,12 @@ vpath %.cpp gpu/ | |||
| 259 | objcpp-budget = budget.o common.o | 281 | objcpp-budget = budget.o common.o |
| 260 | lib-budget = -lrt -lm -pthread | 282 | lib-budget = -lrt -lm -pthread |
| 261 | 283 | ||
| 284 | |||
| 285 | vpath %.cu gpu/ | ||
| 286 | |||
| 287 | objcu-gpuspin = gpuspin.o common.o | ||
| 288 | lib-gpuspin = -lrt -lm -lpthread | ||
| 289 | |||
| 262 | # ############################################################################## | 290 | # ############################################################################## |
| 263 | # Build everything that depends on liblitmus. | 291 | # Build everything that depends on liblitmus. |
| 264 | 292 | ||
| @@ -269,14 +297,19 @@ ${rt-apps}: $${obj-$$@} liblitmus.a | |||
| 269 | ${rt-cppapps}: $${objcpp-$$@} liblitmus.a | 297 | ${rt-cppapps}: $${objcpp-$$@} liblitmus.a |
| 270 | $(CPP) -o $@ $(LDFLAGS) ${ldf-$@} $(filter-out liblitmus.a,$+) $(LOADLIBS) $(LDLIBS) ${liblitmus-flags} ${lib-$@} | 298 | $(CPP) -o $@ $(LDFLAGS) ${ldf-$@} $(filter-out liblitmus.a,$+) $(LOADLIBS) $(LDLIBS) ${liblitmus-flags} ${lib-$@} |
| 271 | 299 | ||
| 300 | ${rt-cuapps}: $${objcu-$$@} liblitmus.a | ||
| 301 | $(CPP) -o $@ $(LDFLAGS) ${ldf-$@} $(filter-out liblitmus.a,$+) $(LOADLIBS) $(LDLIBS) ${liblitmus-flags} ${cuda-flags} ${lib-$@} | ||
| 302 | |||
| 272 | # ############################################################################## | 303 | # ############################################################################## |
| 273 | # Dependency resolution. | 304 | # Dependency resolution. |
| 274 | 305 | ||
| 275 | vpath %.c bin/ src/ gpu/ tests/ | 306 | vpath %.c bin/ src/ gpu/ tests/ |
| 276 | vpath %.cpp gpu/ | 307 | vpath %.cpp gpu/ |
| 308 | vpath %.cu gpu/ | ||
| 277 | 309 | ||
| 278 | obj-all = ${sort ${foreach target,${all},${obj-${target}}}} | 310 | obj-all = ${sort ${foreach target,${all},${obj-${target}}}} |
| 279 | obj-all += ${sort ${foreach target,${all},${objcpp-${target}}}} | 311 | obj-all += ${sort ${foreach target,${all},${objcpp-${target}}}} |
| 312 | obj-all += ${sort ${foreach target,${all},${objcu-${target}}}} | ||
| 280 | 313 | ||
| 281 | # rule to generate dependency files | 314 | # rule to generate dependency files |
| 282 | %.d: %.c ${imported-headers} | 315 | %.d: %.c ${imported-headers} |
| @@ -291,6 +324,16 @@ obj-all += ${sort ${foreach target,${all},${objcpp-${target}}}} | |||
| 291 | sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \ | 324 | sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \ |
| 292 | rm -f $@.$$$$ | 325 | rm -f $@.$$$$ |
| 293 | 326 | ||
| 327 | %.d: %.cu ${imported-headers} | ||
| 328 | @set -e; rm -f $@; \ | ||
| 329 | $(CU) --generate-dependencies $(CUFLAGS) $< > $@.$$$$; \ | ||
| 330 | sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \ | ||
| 331 | rm -f $@.$$$$ | ||
| 332 | |||
| 333 | # teach make how to compile .cu files | ||
| 334 | %.o: %.cu | ||
| 335 | $(CU) --compile $(CUFLAGS) $(OUTPUT_OPTION) $< | ||
| 336 | |||
| 294 | ifeq ($(MAKECMDGOALS),) | 337 | ifeq ($(MAKECMDGOALS),) |
| 295 | MAKECMDGOALS += all | 338 | MAKECMDGOALS += all |
| 296 | endif | 339 | endif |
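The two pattern rules added at the end of this hunk are what let make treat `.cu` sources like the existing `.c`/`.cpp` ones: nvcc's `--generate-dependencies` emits a makefile fragment of header dependencies, and `--compile` produces the object file. A minimal standalone sketch of the same machinery (the `demo` target, the flag values, and the CUDA install path are assumptions for illustration, not part of this commit):

```make
# Minimal sketch: build one CUDA object with auto-generated dependencies.
CU      := nvcc
CUFLAGS := -O3 --use_fast_math -gencode arch=compute_30,code=sm_30

# nvcc writes a "demo.o: demo.cu header.h ..." fragment; the sed pass also
# makes the .d file itself depend on the same prerequisites (as above).
%.d: %.cu
	@set -e; rm -f $@; \
	$(CU) --generate-dependencies $(CUFLAGS) $< > $@.$$$$; \
	sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \
	rm -f $@.$$$$

# OUTPUT_OPTION is GNU make's built-in "-o $@".
%.o: %.cu
	$(CU) --compile $(CUFLAGS) $(OUTPUT_OPTION) $<

demo: demo.o
	$(CXX) -o $@ $^ -L/usr/local/cuda/lib64 -lcudart

-include demo.d
```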
diff --git a/gpu/budget.cpp b/gpu/budget.cpp
index f62c515..8a2546a 100644
--- a/gpu/budget.cpp
+++ b/gpu/budget.cpp
| @@ -80,6 +80,28 @@ int SIGNALS = 0; | |||
| 80 | int BLOCK_SIGNALS_ON_SLEEP = 0; | 80 | int BLOCK_SIGNALS_ON_SLEEP = 0; |
| 81 | int OVERRUN_RATE = 1; /* default: every job overruns */ | 81 | int OVERRUN_RATE = 1; /* default: every job overruns */ |
| 82 | 82 | ||
| 83 | int CXS_OVERRUN = 0; | ||
| 84 | int NUM_LOCKS = 1; | ||
| 85 | int NUM_REPLICAS = 1; | ||
| 86 | int NAMESPACE = 0; | ||
| 87 | int *LOCKS = NULL; | ||
| 88 | int IKGLP_LOCK = 0; | ||
| 89 | int USE_DGLS = 0; | ||
| 90 | int NEST_IN_IKGLP = 0; | ||
| 91 | |||
| 92 | int WAIT = 0; | ||
| 93 | |||
| 94 | enum eLockType | ||
| 95 | { | ||
| 96 | FIFO, | ||
| 97 | PRIOQ, | ||
| 98 | IKGLP | ||
| 99 | }; | ||
| 100 | |||
| 101 | eLockType LOCK_TYPE = FIFO; | ||
| 102 | |||
| 103 | int OVERRUN_BY_SLEEP = 0; | ||
| 104 | |||
| 83 | int NUM_JOBS = 0; | 105 | int NUM_JOBS = 0; |
| 84 | int NUM_COMPLETED_JOBS = 0; | 106 | int NUM_COMPLETED_JOBS = 0; |
| 85 | int NUM_OVERRUNS = 0; | 107 | int NUM_OVERRUNS = 0; |
| @@ -103,9 +125,32 @@ int job(lt_t exec_ns, lt_t budget_ns) | |||
| 103 | if (SIGNALS && BLOCK_SIGNALS_ON_SLEEP) | 125 | if (SIGNALS && BLOCK_SIGNALS_ON_SLEEP) |
| 104 | block_litmus_signals(SIG_BUDGET); | 126 | block_litmus_signals(SIG_BUDGET); |
| 105 | 127 | ||
| 128 | if(CXS_OVERRUN) { | ||
| 129 | if (NEST_IN_IKGLP) | ||
| 130 | litmus_lock(IKGLP_LOCK); | ||
| 131 | if (USE_DGLS) | ||
| 132 | litmus_dgl_lock(LOCKS, NUM_LOCKS); | ||
| 133 | else | ||
| 134 | for(int i = 0; i < NUM_LOCKS; ++i) | ||
| 135 | litmus_lock(LOCKS[i]); | ||
| 136 | } | ||
| 137 | |||
| 106 | // intentionally overrun via suspension | 138 | // intentionally overrun via suspension |
| 107 | lt_sleep(approx_remaining + overrun_extra); | 139 | if (OVERRUN_BY_SLEEP) |
| 108 | 140 | lt_sleep(approx_remaining + overrun_extra); | |
| 141 | else | ||
| 142 | loop_for((approx_remaining + overrun_extra) * 0.9); | ||
| 143 | |||
| 144 | if(CXS_OVERRUN) { | ||
| 145 | if (USE_DGLS) | ||
| 146 | litmus_dgl_unlock(LOCKS, NUM_LOCKS); | ||
| 147 | else | ||
| 148 | for(int i = NUM_LOCKS-1; i >= 0; --i) | ||
| 149 | litmus_unlock(LOCKS[i]); | ||
| 150 | if (NEST_IN_IKGLP) | ||
| 151 | litmus_unlock(IKGLP_LOCK); | ||
| 152 | } | ||
| 153 | |||
| 109 | if (SIGNALS && BLOCK_SIGNALS_ON_SLEEP) | 154 | if (SIGNALS && BLOCK_SIGNALS_ON_SLEEP) |
| 110 | unblock_litmus_signals(SIG_BUDGET); | 155 | unblock_litmus_signals(SIG_BUDGET); |
| 111 | } | 156 | } |
| @@ -120,15 +165,18 @@ int job(lt_t exec_ns, lt_t budget_ns) | |||
| 120 | return 1; | 165 | return 1; |
| 121 | } | 166 | } |
| 122 | 167 | ||
| 123 | #define OPTSTR "sboOva" | 168 | #define OPTSTR "SbosOvalwqixdn:r:" |
| 124 | 169 | ||
| 125 | int main(int argc, char** argv) | 170 | int main(int argc, char** argv) |
| 126 | { | 171 | { |
| 127 | int ret; | 172 | int ret; |
| 128 | lt_t e_ns = ms2ns(10); | 173 | |
| 129 | lt_t p_ns = ms2ns(100); | 174 | srand(getpid()); |
| 175 | |||
| 176 | lt_t e_ns = ms2ns(2); | ||
| 177 | lt_t p_ns = ms2ns(50) + rand()%200; | ||
| 130 | lt_t budget_ns = p_ns/2; | 178 | lt_t budget_ns = p_ns/2; |
| 131 | lt_t duration = s2ns(10); | 179 | lt_t duration = s2ns(60); |
| 132 | lt_t terminate_time; | 180 | lt_t terminate_time; |
| 133 | unsigned int first_job, last_job; | 181 | unsigned int first_job, last_job; |
| 134 | int opt; | 182 | int opt; |
| @@ -140,12 +188,15 @@ int main(int argc, char** argv) | |||
| 140 | 188 | ||
| 141 | while ((opt = getopt(argc, argv, OPTSTR)) != -1) { | 189 | while ((opt = getopt(argc, argv, OPTSTR)) != -1) { |
| 142 | switch(opt) { | 190 | switch(opt) { |
| 143 | case 's': | 191 | case 'S': |
| 144 | SIGNALS = 1; | 192 | SIGNALS = 1; |
| 145 | break; | 193 | break; |
| 146 | case 'b': | 194 | case 'b': |
| 147 | BLOCK_SIGNALS_ON_SLEEP = 1; | 195 | BLOCK_SIGNALS_ON_SLEEP = 1; |
| 148 | break; | 196 | break; |
| 197 | case 's': | ||
| 198 | OVERRUN_BY_SLEEP = 1; | ||
| 199 | break; | ||
| 149 | case 'o': | 200 | case 'o': |
| 150 | OVERRUN = 1; | 201 | OVERRUN = 1; |
| 151 | overrun_extra = budget_ns/2; | 202 | overrun_extra = budget_ns/2; |
| @@ -164,6 +215,31 @@ int main(int argc, char** argv) | |||
| 164 | case 'v': | 215 | case 'v': |
| 165 | drain_policy = DRAIN_SOBLIV; | 216 | drain_policy = DRAIN_SOBLIV; |
| 166 | break; | 217 | break; |
| 218 | case 'l': | ||
| 219 | CXS_OVERRUN = 1; | ||
| 220 | NAMESPACE = open("semaphores", O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR); | ||
| 221 | break; | ||
| 222 | case 'q': | ||
| 223 | LOCK_TYPE = PRIOQ; | ||
| 224 | break; | ||
| 225 | case 'i': | ||
| 226 | LOCK_TYPE = IKGLP; | ||
| 227 | break; | ||
| 228 | case 'x': | ||
| 229 | NEST_IN_IKGLP = 1; | ||
| 230 | break; | ||
| 231 | case 'w': | ||
| 232 | WAIT = 1; | ||
| 233 | break; | ||
| 234 | case 'd': | ||
| 235 | USE_DGLS = 1; | ||
| 236 | break; | ||
| 237 | case 'n': | ||
| 238 | NUM_LOCKS = atoi(optarg); | ||
| 239 | break; | ||
| 240 | case 'r': | ||
| 241 | NUM_REPLICAS = atoi(optarg); | ||
| 242 | break; | ||
| 167 | case ':': | 243 | case ':': |
| 168 | printf("missing argument\n"); | 244 | printf("missing argument\n"); |
| 169 | assert(false); | 245 | assert(false); |
| @@ -176,10 +252,21 @@ int main(int argc, char** argv) | |||
| 176 | } | 252 | } |
| 177 | 253 | ||
| 178 | assert(!BLOCK_SIGNALS_ON_SLEEP || (BLOCK_SIGNALS_ON_SLEEP && SIGNALS)); | 254 | assert(!BLOCK_SIGNALS_ON_SLEEP || (BLOCK_SIGNALS_ON_SLEEP && SIGNALS)); |
| 255 | assert(!CXS_OVERRUN || (CXS_OVERRUN && WAIT)); | ||
| 256 | assert(LOCK_TYPE != IKGLP || NUM_LOCKS == 1); | ||
| 257 | assert(LOCK_TYPE != IKGLP || (LOCK_TYPE == IKGLP && !NEST_IN_IKGLP)); | ||
| 258 | assert(NUM_LOCKS > 0); | ||
| 259 | if (LOCK_TYPE == IKGLP || NEST_IN_IKGLP) | ||
| 260 | assert(NUM_REPLICAS >= 1); | ||
| 261 | |||
| 262 | LOCKS = new int[NUM_LOCKS]; | ||
| 179 | 263 | ||
| 180 | if (compute_overrun_rate) { | 264 | if (compute_overrun_rate) { |
| 181 | int backlog = (int)ceil((overrun_extra + budget_ns)/(double)budget_ns); | 265 | int backlog = (int)ceil((overrun_extra + budget_ns)/(double)budget_ns); |
| 182 | OVERRUN_RATE = backlog + 2; /* some padding */ | 266 | if (!CXS_OVERRUN) |
| 267 | OVERRUN_RATE = backlog + 2; /* some padding */ | ||
| 268 | else | ||
| 269 | OVERRUN_RATE = 2*backlog + 2; /* overrun less frequently for testing */ | ||
| 183 | } | 270 | } |
| 184 | 271 | ||
| 185 | init_rt_task_param(¶m); | 272 | init_rt_task_param(¶m); |
| @@ -197,6 +284,44 @@ int main(int argc, char** argv) | |||
| 197 | ret = set_rt_task_param(gettid(), ¶m); | 284 | ret = set_rt_task_param(gettid(), ¶m); |
| 198 | assert(ret == 0); | 285 | assert(ret == 0); |
| 199 | 286 | ||
| 287 | if (CXS_OVERRUN) { | ||
| 288 | int i; | ||
| 289 | for(i = 0; i < NUM_LOCKS; ++i) { | ||
| 290 | int lock = -1; | ||
| 291 | switch(LOCK_TYPE) | ||
| 292 | { | ||
| 293 | case FIFO: | ||
| 294 | lock = open_fifo_sem(NAMESPACE, i); | ||
| 295 | break; | ||
| 296 | case PRIOQ: | ||
| 297 | lock = open_prioq_sem(NAMESPACE, i); | ||
| 298 | break; | ||
| 299 | case IKGLP: | ||
| 300 | lock = open_ikglp_sem(NAMESPACE, i, NUM_REPLICAS); | ||
| 301 | break; | ||
| 302 | } | ||
| 303 | if (lock < 0) { | ||
| 304 | perror("open_sem"); | ||
| 305 | exit(-1); | ||
| 306 | } | ||
| 307 | LOCKS[i] = lock; | ||
| 308 | } | ||
| 309 | |||
| 310 | if (NEST_IN_IKGLP) { | ||
| 311 | IKGLP_LOCK = open_ikglp_sem(NAMESPACE, i, NUM_REPLICAS); /* i == NUM_LOCKS here, a fresh name */ | ||
| 312 | if (IKGLP_LOCK < 0) { | ||
| 313 | perror("open_sem"); | ||
| 314 | exit(-1); | ||
| 315 | } | ||
| 316 | } | ||
| 317 | } | ||
| 318 | |||
| 319 | if (WAIT) { | ||
| 320 | ret = wait_for_ts_release(); | ||
| 321 | if (ret < 0) | ||
| 322 | perror("wait_for_ts_release"); | ||
| 323 | } | ||
| 324 | |||
| 200 | ret = task_mode(LITMUS_RT_TASK); | 325 | ret = task_mode(LITMUS_RT_TASK); |
| 201 | assert(ret == 0); | 326 | assert(ret == 0); |
| 202 | 327 | ||
| @@ -231,5 +356,7 @@ int main(int argc, char** argv) | |||
| 231 | printf("# User Jobs Completed: %d\n", NUM_COMPLETED_JOBS); | 356 | printf("# User Jobs Completed: %d\n", NUM_COMPLETED_JOBS); |
| 232 | printf("# Overruns: %d\n", NUM_OVERRUNS); | 357 | printf("# Overruns: %d\n", NUM_OVERRUNS); |
| 233 | 358 | ||
| 359 | delete[] LOCKS; | ||
| 360 | |||
| 234 | return 0; | 361 | return 0; |
| 235 | } | 362 | } |
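For the new `-l` (critical-section overrun) mode, the ordering discipline is the load-bearing detail: every task acquires `LOCKS[0..NUM_LOCKS-1]` in ascending index order (or atomically, via a dynamic group lock) and releases in reverse, so no two tasks ever hold locks in conflicting orders. A condensed sketch of just that pattern, assuming the liblitmus calls used in the diff:

```cpp
// Sketch of the nested-overrun discipline from job() above; the busy
// work performed while holding the locks is elided.
void overrun_in_critical_section(int *locks, int nlocks, bool use_dgls)
{
	if (use_dgls)
		litmus_dgl_lock(locks, nlocks);       // acquire all-or-nothing
	else
		for (int i = 0; i < nlocks; ++i)      // one global order...
			litmus_lock(locks[i]);

	/* ... overrun the budget while the locks are held ... */

	if (use_dgls)
		litmus_dgl_unlock(locks, nlocks);
	else
		for (int i = nlocks - 1; i >= 0; --i) // ...released in reverse
			litmus_unlock(locks[i]);
}
```

With `-x`, the whole sequence is additionally wrapped in an outer IKGLP token, mirroring gpusync's token-then-engine-lock nesting.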
diff --git a/gpu/gpuspin.cu b/gpu/gpuspin.cu
new file mode 100644
index 0000000..aff6cd1
--- /dev/null
+++ b/gpu/gpuspin.cu
| @@ -0,0 +1,1720 @@ | |||
| 1 | #include <sys/time.h> | ||
| 2 | |||
| 3 | #include <stdio.h> | ||
| 4 | #include <stdlib.h> | ||
| 5 | #include <unistd.h> | ||
| 6 | #include <time.h> | ||
| 7 | #include <string.h> | ||
| 8 | #include <assert.h> | ||
| 9 | #include <execinfo.h> | ||
| 10 | |||
| 11 | #include <boost/interprocess/managed_shared_memory.hpp> | ||
| 12 | #include <boost/interprocess/sync/interprocess_mutex.hpp> | ||
| 13 | |||
| 14 | #include <cuda_runtime.h> | ||
| 15 | |||
| 16 | #include "litmus.h" | ||
| 17 | #include "common.h" | ||
| 18 | |||
| 19 | using namespace std; | ||
| 20 | using namespace boost::interprocess; | ||
| 21 | |||
| 22 | const char *lock_namespace = "./.gpuspin-locks"; | ||
| 23 | |||
| 24 | const int NR_GPUS = 8; | ||
| 25 | |||
| 26 | bool GPU_USING = false; | ||
| 27 | bool ENABLE_AFFINITY = false; | ||
| 28 | bool RELAX_FIFO_MAX_LEN = false; | ||
| 29 | bool ENABLE_CHUNKING = false; | ||
| 30 | bool MIGRATE_VIA_SYSMEM = false; | ||
| 31 | |||
| 32 | enum eEngineLockTypes | ||
| 33 | { | ||
| 34 | FIFO, | ||
| 35 | PRIOQ | ||
| 36 | }; | ||
| 37 | |||
| 38 | eEngineLockTypes ENGINE_LOCK_TYPE = FIFO; | ||
| 39 | |||
| 40 | int GPU_PARTITION = 0; | ||
| 41 | int GPU_PARTITION_SIZE = 0; | ||
| 42 | int CPU_PARTITION_SIZE = 0; | ||
| 43 | |||
| 44 | int RHO = 2; | ||
| 45 | |||
| 46 | int NUM_COPY_ENGINES = 2; | ||
| 47 | |||
| 48 | |||
| 49 | __attribute__((unused)) static size_t kbToB(size_t kb) { return kb * 1024; } | ||
| 50 | __attribute__((unused)) static size_t mbToB(size_t mb) { return kbToB(mb * 1024); } | ||
| 51 | |||
| 52 | /* in bytes */ | ||
| 53 | size_t SEND_SIZE = 0; | ||
| 54 | size_t RECV_SIZE = 0; | ||
| 55 | size_t STATE_SIZE = 0; | ||
| 56 | size_t CHUNK_SIZE = 0; | ||
| 57 | |||
| 58 | int TOKEN_LOCK = -1; | ||
| 59 | |||
| 60 | bool USE_ENGINE_LOCKS = true; | ||
| 61 | bool USE_DYNAMIC_GROUP_LOCKS = false; | ||
| 62 | int EE_LOCKS[NR_GPUS]; | ||
| 63 | int CE_SEND_LOCKS[NR_GPUS]; | ||
| 64 | int CE_RECV_LOCKS[NR_GPUS]; | ||
| 65 | int CE_MIGR_SEND_LOCKS[NR_GPUS]; | ||
| 66 | int CE_MIGR_RECV_LOCKS[NR_GPUS]; | ||
| 67 | bool RESERVED_MIGR_COPY_ENGINE = false; // only checked if NUM_COPY_ENGINES == 2 | ||
| 68 | |||
| 69 | bool ENABLE_RT_AUX_THREADS = true; | ||
| 70 | |||
| 71 | enum eGpuSyncMode | ||
| 72 | { | ||
| 73 | IKGLP_MODE, | ||
| 74 | IKGLP_WC_MODE, /* work-conserving IKGLP. no GPU is left idle, but breaks optimality */ | ||
| 75 | KFMLP_MODE, | ||
| 76 | RGEM_MODE, | ||
| 77 | }; | ||
| 78 | |||
| 79 | eGpuSyncMode GPU_SYNC_MODE = IKGLP_MODE; | ||
| 80 | |||
| 81 | enum eCudaSyncMode | ||
| 82 | { | ||
| 83 | BLOCKING, | ||
| 84 | SPIN | ||
| 85 | }; | ||
| 86 | |||
| 87 | eCudaSyncMode CUDA_SYNC_MODE = BLOCKING; | ||
| 88 | |||
| 89 | |||
| 90 | int CUR_DEVICE = -1; | ||
| 91 | int LAST_DEVICE = -1; | ||
| 92 | |||
| 93 | cudaStream_t STREAMS[NR_GPUS]; | ||
| 94 | int GPU_HZ[NR_GPUS]; | ||
| 95 | int NUM_SM[NR_GPUS]; | ||
| 96 | int WARP_SIZE[NR_GPUS]; | ||
| 97 | int ELEM_PER_THREAD[NR_GPUS]; | ||
| 98 | |||
| 99 | #define DEFINE_PER_GPU(type, var) type var[NR_GPUS] | ||
| 100 | #define per_gpu(var, idx) (var[(idx)]) | ||
| 101 | #define this_gpu(var) (var[(CUR_DEVICE)]) | ||
| 102 | #define cur_stream() (this_gpu(STREAMS)) | ||
| 103 | #define cur_gpu() (CUR_DEVICE) | ||
| 104 | #define last_gpu() (LAST_DEVICE) | ||
| 105 | #define cur_ee() (EE_LOCKS[CUR_DEVICE]) | ||
| 106 | #define cur_send() (CE_SEND_LOCKS[CUR_DEVICE]) | ||
| 107 | #define cur_recv() (CE_RECV_LOCKS[CUR_DEVICE]) | ||
| 108 | #define cur_migr_send() (CE_MIGR_SEND_LOCKS[CUR_DEVICE]) | ||
| 109 | #define cur_migr_recv() (CE_MIGR_RECV_LOCKS[CUR_DEVICE]) | ||
| 110 | #define cur_hz() (GPU_HZ[CUR_DEVICE]) | ||
| 111 | #define cur_sms() (NUM_SM[CUR_DEVICE]) | ||
| 112 | #define cur_warp_size() (WARP_SIZE[CUR_DEVICE]) | ||
| 113 | #define cur_elem_per_thread() (ELEM_PER_THREAD[CUR_DEVICE]) | ||
| 114 | #define num_online_gpus() (NUM_GPUS) | ||
| 115 | |||
| 116 | static bool useEngineLocks() | ||
| 117 | { | ||
| 118 | return(USE_ENGINE_LOCKS); | ||
| 119 | } | ||
| 120 | |||
| 121 | #define VANILLA_LINUX | ||
| 122 | |||
| 123 | bool TRACE_MIGRATIONS = false; | ||
| 124 | #ifndef VANILLA_LINUX | ||
| 125 | #define trace_migration(to, from) do { inject_gpu_migration((to), (from)); } while(0) | ||
| 126 | #define trace_release(arrival, deadline, jobno) do { inject_release((arrival), (deadline), (jobno)); } while(0) | ||
| 127 | #define trace_completion(jobno) do { inject_completion((jobno)); } while(0) | ||
| 128 | #define trace_name() do { inject_name(); } while(0) | ||
| 129 | #define trace_param() do { inject_param(); } while(0) | ||
| 130 | #else | ||
| 131 | #define set_rt_task_param(x, y) (0) | ||
| 132 | #define trace_migration(to, from) | ||
| 133 | #define trace_release(arrival, deadline, jobno) | ||
| 134 | #define trace_completion(jobno) | ||
| 135 | #define trace_name() | ||
| 136 | #define trace_param() | ||
| 137 | #endif | ||
| 138 | |||
| 139 | struct ce_lock_state | ||
| 140 | { | ||
| 141 | int locks[2]; | ||
| 142 | size_t num_locks; | ||
| 143 | size_t budget_remaining; | ||
| 144 | bool locked; | ||
| 145 | |||
| 146 | ce_lock_state(int device_a, enum cudaMemcpyKind kind, size_t size, int device_b = -1, bool migration = false) { | ||
| 147 | num_locks = (device_a != -1) + (device_b != -1); | ||
| 148 | |||
| 149 | if(device_a != -1) { | ||
| 150 | if (!migration) | ||
| 151 | locks[0] = (kind == cudaMemcpyHostToDevice || (kind == cudaMemcpyDeviceToDevice && device_b == -1)) ? | ||
| 152 | CE_SEND_LOCKS[device_a] : CE_RECV_LOCKS[device_a]; | ||
| 153 | else | ||
| 154 | locks[0] = (kind == cudaMemcpyHostToDevice || (kind == cudaMemcpyDeviceToDevice && device_b == -1)) ? | ||
| 155 | CE_MIGR_SEND_LOCKS[device_a] : CE_MIGR_RECV_LOCKS[device_a]; | ||
| 156 | } | ||
| 157 | |||
| 158 | if(device_b != -1) { | ||
| 159 | assert(kind == cudaMemcpyDeviceToDevice); | ||
| 160 | |||
| 161 | if (!migration) | ||
| 162 | locks[1] = CE_RECV_LOCKS[device_b]; | ||
| 163 | else | ||
| 164 | locks[1] = CE_MIGR_RECV_LOCKS[device_b]; | ||
| 165 | |||
| 166 | if(locks[1] < locks[0]) { | ||
| 167 | // enforce total order on locking | ||
| 168 | int temp = locks[1]; | ||
| 169 | locks[1] = locks[0]; | ||
| 170 | locks[0] = temp; | ||
| 171 | } | ||
| 172 | } | ||
| 173 | else { | ||
| 174 | locks[1] = -1; | ||
| 175 | } | ||
| 176 | |||
| 177 | if(!ENABLE_CHUNKING) | ||
| 178 | budget_remaining = size; | ||
| 179 | else | ||
| 180 | budget_remaining = CHUNK_SIZE; | ||
| 181 | } | ||
| 182 | |||
| 183 | void crash(void) { | ||
| 184 | void *array[50]; | ||
| 185 | int size, i; | ||
| 186 | char **messages; | ||
| 187 | |||
| 188 | size = backtrace(array, 50); | ||
| 189 | messages = backtrace_symbols(array, size); | ||
| 190 | |||
| 191 | fprintf(stderr, "%d: TRIED TO GRAB SAME LOCK TWICE! Lock = %d\n", getpid(), locks[0]); | ||
| 192 | for (i = 1; i < size && messages != NULL; ++i) | ||
| 193 | { | ||
| 194 | fprintf(stderr, "%d: [bt]: (%d) %s\n", getpid(), i, messages[i]); | ||
| 195 | } | ||
| 196 | free(messages); | ||
| 197 | |||
| 198 | assert(false); | ||
| 199 | } | ||
| 200 | |||
| 201 | |||
| 202 | void lock() { | ||
| 203 | if(locks[0] == locks[1]) crash(); | ||
| 204 | |||
| 205 | if(USE_DYNAMIC_GROUP_LOCKS) { | ||
| 206 | litmus_dgl_lock(locks, num_locks); | ||
| 207 | } | ||
| 208 | else | ||
| 209 | { | ||
| 210 | for(int l = 0; l < num_locks; ++l) | ||
| 211 | { | ||
| 212 | litmus_lock(locks[l]); | ||
| 213 | } | ||
| 214 | } | ||
| 215 | locked = true; | ||
| 216 | } | ||
| 217 | |||
| 218 | void unlock() { | ||
| 219 | if(locks[0] == locks[1]) crash(); | ||
| 220 | |||
| 221 | if(USE_DYNAMIC_GROUP_LOCKS) { | ||
| 222 | litmus_dgl_unlock(locks, num_locks); | ||
| 223 | } | ||
| 224 | else | ||
| 225 | { | ||
| 226 | // reverse order | ||
| 227 | for(int l = num_locks - 1; l >= 0; --l) | ||
| 228 | { | ||
| 229 | litmus_unlock(locks[l]); | ||
| 230 | } | ||
| 231 | } | ||
| 232 | locked = false; | ||
| 233 | } | ||
| 234 | |||
| 235 | void refresh() { | ||
| 236 | budget_remaining = CHUNK_SIZE; | ||
| 237 | } | ||
| 238 | |||
| 239 | // note: despite the name, this returns true when 'tosend' meets or | ||
| 239 | // exceeds the remaining budget, i.e., when it is time to unlock/refresh. | ||
| 239 | bool budgetIsAvailable(size_t tosend) { | ||
| 240 | return(tosend >= budget_remaining); | ||
| 241 | } | ||
| 242 | |||
| 243 | void decreaseBudget(size_t spent) { | ||
| 244 | budget_remaining -= spent; | ||
| 245 | } | ||
| 246 | }; | ||
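`ce_lock_state` sorts its two engine locks by descriptor value before acquiring them. This is classic deadlock avoidance: if one copy took (send=3, recv=7) while another took (recv=7, send=3) unsorted, each could end up holding one lock and waiting on the other. A tiny illustration of the rule; `lock_pair_in_order` is a hypothetical helper, not anything in this file:

```cpp
#include <algorithm>  // std::swap; litmus_lock/unlock come from litmus.h

void lock_pair_in_order(int a, int b)
{
	if (a > b) std::swap(a, b);   // one global acquisition order
	litmus_lock(a);
	litmus_lock(b);
	/* ... use both copy engines ... */
	litmus_unlock(b);             // reverse order on release
	litmus_unlock(a);
}
```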
| 247 | |||
| 248 | // precondition: if do_locking == true, locks in state are held. | ||
| 249 | static cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count, | ||
| 250 | enum cudaMemcpyKind kind, | ||
| 251 | ce_lock_state* state) | ||
| 252 | { | ||
| 253 | cudaError_t ret = cudaSuccess; | ||
| 254 | int remaining = count; | ||
| 255 | |||
| 256 | char* dst = (char*)a_dst; | ||
| 257 | const char* src = (const char*)a_src; | ||
| 258 | |||
| 259 | // disable chunking, if needed, by setting chunk_size equal to the | ||
| 260 | // amount of data to be copied. | ||
| 261 | int chunk_size = (ENABLE_CHUNKING) ? CHUNK_SIZE : count; | ||
| 262 | int i = 0; | ||
| 263 | |||
| 264 | while(remaining != 0) | ||
| 265 | { | ||
| 266 | int bytesToCopy = std::min(remaining, chunk_size); | ||
| 267 | |||
| 268 | if(state && state->budgetIsAvailable(bytesToCopy) && state->locked) { | ||
| 269 | cudaStreamSynchronize(STREAMS[CUR_DEVICE]); | ||
| 270 | ret = cudaGetLastError(); | ||
| 271 | |||
| 272 | if(ret != cudaSuccess) | ||
| 273 | { | ||
| 274 | break; | ||
| 275 | } | ||
| 276 | |||
| 277 | state->unlock(); | ||
| 278 | state->refresh(); // replenish. | ||
| 279 | // we can only run out of | ||
| 280 | // budget if chunking is enabled. | ||
| 281 | // we presume that init budget would | ||
| 282 | // be set to cover entire memcpy | ||
| 283 | // if chunking were disabled. | ||
| 284 | } | ||
| 285 | |||
| 286 | if(state && !state->locked) { | ||
| 287 | state->lock(); | ||
| 288 | } | ||
| 289 | |||
| 290 | //ret = cudaMemcpy(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind); | ||
| 291 | cudaMemcpyAsync(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind, STREAMS[CUR_DEVICE]); | ||
| 292 | |||
| 293 | if(state) { | ||
| 294 | state->decreaseBudget(bytesToCopy); | ||
| 295 | } | ||
| 296 | |||
| 297 | ++i; | ||
| 298 | remaining -= bytesToCopy; | ||
| 299 | } | ||
| 300 | return ret; | ||
| 301 | } | ||
| 302 | |||
| 303 | static cudaError_t chunkMemcpy(void* a_dst, const void* a_src, size_t count, | ||
| 304 | enum cudaMemcpyKind kind, | ||
| 305 | int device_a = -1, // device_a == -1 disables locking | ||
| 306 | bool do_locking = true, | ||
| 307 | int device_b = -1, | ||
| 308 | bool migration = false) | ||
| 309 | { | ||
| 310 | cudaError_t ret; | ||
| 311 | if(!do_locking || device_a == -1) { | ||
| 312 | ret = __chunkMemcpy(a_dst, a_src, count, kind, NULL); | ||
| 313 | cudaStreamSynchronize(cur_stream()); | ||
| 314 | if(ret == cudaSuccess) | ||
| 315 | ret = cudaGetLastError(); | ||
| 316 | } | ||
| 317 | else { | ||
| 318 | ce_lock_state state(device_a, kind, count, device_b, migration); | ||
| 319 | state.lock(); | ||
| 320 | ret = __chunkMemcpy(a_dst, a_src, count, kind, &state); | ||
| 321 | cudaStreamSynchronize(cur_stream()); | ||
| 322 | if(ret == cudaSuccess) | ||
| 323 | ret = cudaGetLastError(); | ||
| 324 | state.unlock(); | ||
| 325 | } | ||
| 326 | return ret; | ||
| 327 | } | ||
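`chunkMemcpy()` is the workhorse for every transfer in this file, and its defaults keep the common cases terse. The calls below are hypothetical (the buffer names `h_buf`/`d_buf`/`d_src`/`d_dst` and the sizes are made up) and just illustrate the three shapes used later in the file:

```cpp
// 1) Unlocked copy (device_a == -1 disables engine locking entirely):
chunkMemcpy(h_buf, d_buf, nbytes, cudaMemcpyDeviceToHost);

// 2) Locked send to the current GPU; with chunking enabled the engine
//    lock is dropped and re-acquired every CHUNK_SIZE bytes, bounding
//    how long one task can monopolize the copy engine:
chunkMemcpy(d_buf, h_buf, nbytes, cudaMemcpyHostToDevice,
            cur_gpu(), useEngineLocks());

// 3) Device-to-device migration; both GPUs' migration copy-engine locks
//    are taken, in the sorted order ce_lock_state enforces:
chunkMemcpy(d_dst, d_src, nbytes, cudaMemcpyDeviceToDevice,
            to_gpu, useEngineLocks(), from_gpu, true /* migration */);
```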
| 328 | |||
| 329 | |||
| 330 | void allocate_locks_litmus(void) | ||
| 331 | { | ||
| 332 | // allocate k-FMLP lock | ||
| 333 | int fd = open(lock_namespace, O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR); | ||
| 334 | |||
| 335 | int base_name = GPU_PARTITION * 1000; | ||
| 336 | |||
| 337 | if (GPU_SYNC_MODE == IKGLP_MODE) { | ||
| 338 | /* Standard (optimal) IKGLP */ | ||
| 339 | TOKEN_LOCK = open_gpusync_token_lock(fd, | ||
| 340 | base_name, /* name */ | ||
| 341 | GPU_PARTITION_SIZE, | ||
| 342 | GPU_PARTITION*GPU_PARTITION_SIZE, | ||
| 343 | RHO, | ||
| 344 | IKGLP_M_IN_FIFOS, | ||
| 345 | (!RELAX_FIFO_MAX_LEN) ? | ||
| 346 | IKGLP_OPTIMAL_FIFO_LEN : | ||
| 347 | IKGLP_UNLIMITED_FIFO_LEN, | ||
| 348 | ENABLE_AFFINITY); | ||
| 349 | } | ||
| 350 | else if (GPU_SYNC_MODE == KFMLP_MODE) { | ||
| 351 | /* KFMLP. FIFO queues only for tokens. */ | ||
| 352 | TOKEN_LOCK = open_gpusync_token_lock(fd, | ||
| 353 | base_name, /* name */ | ||
| 354 | GPU_PARTITION_SIZE, | ||
| 355 | GPU_PARTITION*GPU_PARTITION_SIZE, | ||
| 356 | RHO, | ||
| 357 | IKGLP_UNLIMITED_IN_FIFOS, | ||
| 358 | IKGLP_UNLIMITED_FIFO_LEN, | ||
| 359 | ENABLE_AFFINITY); | ||
| 360 | } | ||
| 361 | else if (GPU_SYNC_MODE == RGEM_MODE) { | ||
| 362 | /* RGEM-like token allocation. Shared priority queue for all tokens. */ | ||
| 363 | TOKEN_LOCK = open_gpusync_token_lock(fd, | ||
| 364 | base_name, /* name */ | ||
| 365 | GPU_PARTITION_SIZE, | ||
| 366 | GPU_PARTITION*GPU_PARTITION_SIZE, | ||
| 367 | RHO, | ||
| 368 | RHO*GPU_PARTITION_SIZE, | ||
| 369 | 1, | ||
| 370 | ENABLE_AFFINITY); | ||
| 371 | } | ||
| 372 | else if (GPU_SYNC_MODE == IKGLP_WC_MODE) { | ||
| 373 | /* Non-optimal IKGLP that never lets a replica idle if there are pending | ||
| 374 | * token requests. */ | ||
| 375 | int max_simult_run = std::max(CPU_PARTITION_SIZE, RHO*GPU_PARTITION_SIZE); | ||
| 376 | int max_fifo_len = (int)ceil((float)max_simult_run / (RHO*GPU_PARTITION_SIZE)); | ||
| 377 | TOKEN_LOCK = open_gpusync_token_lock(fd, | ||
| 378 | base_name, /* name */ | ||
| 379 | GPU_PARTITION_SIZE, | ||
| 380 | GPU_PARTITION*GPU_PARTITION_SIZE, | ||
| 381 | RHO, | ||
| 382 | max_simult_run, | ||
| 383 | (!RELAX_FIFO_MAX_LEN) ? | ||
| 384 | max_fifo_len : | ||
| 385 | IKGLP_UNLIMITED_FIFO_LEN, | ||
| 386 | ENABLE_AFFINITY); | ||
| 387 | } | ||
| 388 | else { | ||
| 389 | perror("Invalid GPUSync mode specified\n"); | ||
| 390 | TOKEN_LOCK = -1; | ||
| 391 | } | ||
| 392 | |||
| 393 | if(TOKEN_LOCK < 0) | ||
| 394 | perror("open_token_sem"); | ||
| 395 | |||
| 396 | if(USE_ENGINE_LOCKS) | ||
| 397 | { | ||
| 398 | assert(NUM_COPY_ENGINES == 1 || NUM_COPY_ENGINES == 2); | ||
| 399 | assert((NUM_COPY_ENGINES == 1 && !RESERVED_MIGR_COPY_ENGINE) || NUM_COPY_ENGINES == 2); | ||
| 400 | |||
| 401 | // allocate the engine locks. | ||
| 402 | for (int i = 0; i < GPU_PARTITION_SIZE; ++i) | ||
| 403 | { | ||
| 404 | int idx = GPU_PARTITION*GPU_PARTITION_SIZE + i; | ||
| 405 | int ee_name = (i+1)*10 + base_name; | ||
| 406 | int ce_0_name = (i+1)*10 + base_name + 1; | ||
| 407 | int ce_1_name = (i+1)*10 + base_name + 2; | ||
| 408 | int ee_lock = -1, ce_0_lock = -1, ce_1_lock = -1; | ||
| 409 | |||
| 410 | open_sem_t openEngineLock = (ENGINE_LOCK_TYPE == FIFO) ? | ||
| 411 | open_fifo_sem : open_prioq_sem; | ||
| 412 | |||
| 413 | ee_lock = openEngineLock(fd, ee_name); | ||
| 414 | if (ee_lock < 0) | ||
| 415 | perror("open_*_sem (engine lock)"); | ||
| 416 | |||
| 417 | ce_0_lock = openEngineLock(fd, ce_0_name); | ||
| 418 | if (ce_0_lock < 0) | ||
| 419 | perror("open_*_sem (engine lock)"); | ||
| 420 | |||
| 421 | if (NUM_COPY_ENGINES == 2) | ||
| 422 | { | ||
| 423 | ce_1_lock = openEngineLock(fd, ce_1_name); | ||
| 424 | if (ce_1_lock < 0) | ||
| 425 | perror("open_*_sem (engine lock)"); | ||
| 426 | } | ||
| 427 | |||
| 428 | EE_LOCKS[idx] = ee_lock; | ||
| 429 | |||
| 430 | if (NUM_COPY_ENGINES == 1) | ||
| 431 | { | ||
| 432 | // share locks | ||
| 433 | CE_SEND_LOCKS[idx] = ce_0_lock; | ||
| 434 | CE_RECV_LOCKS[idx] = ce_0_lock; | ||
| 435 | CE_MIGR_SEND_LOCKS[idx] = ce_0_lock; | ||
| 436 | CE_MIGR_RECV_LOCKS[idx] = ce_0_lock; | ||
| 437 | } | ||
| 438 | else | ||
| 439 | { | ||
| 440 | assert(NUM_COPY_ENGINES == 2); | ||
| 441 | |||
| 442 | if (RESERVED_MIGR_COPY_ENGINE) { | ||
| 443 | // copy engine dedicated to migration operations | ||
| 444 | CE_SEND_LOCKS[idx] = ce_0_lock; | ||
| 445 | CE_RECV_LOCKS[idx] = ce_0_lock; | ||
| 446 | CE_MIGR_SEND_LOCKS[idx] = ce_1_lock; | ||
| 447 | CE_MIGR_RECV_LOCKS[idx] = ce_1_lock; | ||
| 448 | } | ||
| 449 | else { | ||
| 450 | // migration transmissions treated as regular data | ||
| 451 | CE_SEND_LOCKS[idx] = ce_0_lock; | ||
| 452 | CE_RECV_LOCKS[idx] = ce_1_lock; | ||
| 453 | CE_MIGR_SEND_LOCKS[idx] = ce_0_lock; | ||
| 454 | CE_MIGR_RECV_LOCKS[idx] = ce_1_lock; | ||
| 455 | } | ||
| 456 | } | ||
| 457 | } | ||
| 458 | } | ||
| 459 | } | ||
| 460 | |||
| 461 | |||
| 462 | |||
| 463 | |||
| 464 | class gpu_pool | ||
| 465 | { | ||
| 466 | public: | ||
| 467 | gpu_pool(int pSz): poolSize(pSz) | ||
| 468 | { | ||
| 469 | memset(&pool[0], 0, sizeof(pool[0])*poolSize); | ||
| 470 | } | ||
| 471 | |||
| 472 | int get(pthread_mutex_t* tex, int preference = -1) | ||
| 473 | { | ||
| 474 | int which = -1; | ||
| 475 | int last = (preference >= 0) ? preference : 0; | ||
| 476 | int minIdx = last; | ||
| 477 | |||
| 478 | pthread_mutex_lock(tex); | ||
| 479 | |||
| 480 | int min = pool[last]; | ||
| 481 | for(int i = (minIdx+1)%poolSize; i != last; i = (i+1)%poolSize) | ||
| 482 | { | ||
| 483 | if(min > pool[i]) { min = pool[i]; /* track the running minimum */ | ||
| 484 | minIdx = i; } | ||
| 485 | } | ||
| 486 | ++pool[minIdx]; | ||
| 487 | |||
| 488 | pthread_mutex_unlock(tex); | ||
| 489 | |||
| 490 | which = minIdx; | ||
| 491 | |||
| 492 | return which; | ||
| 493 | } | ||
| 494 | |||
| 495 | void put(pthread_mutex_t* tex, int which) | ||
| 496 | { | ||
| 497 | pthread_mutex_lock(tex); | ||
| 498 | --pool[which]; | ||
| 499 | pthread_mutex_unlock(tex); | ||
| 500 | } | ||
| 501 | |||
| 502 | private: | ||
| 503 | int poolSize; | ||
| 504 | int pool[NR_GPUS]; // >= gpu_part_size | ||
| 505 | }; | ||
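`gpu_pool` is the best-effort stand-in for the real-time token lock: it only counts how many tasks are on each GPU (up to RHO may share one), and `get()` returns the least-loaded GPU in the partition, biased toward `preference` for affinity. A hypothetical use, mirroring `gpu_loop_for_linux()` further below (`last_local_gpu` is an assumed variable, -1 when there is no history):

```cpp
gpu_pool *pool         = &GPU_LINUX_SEM_POOL[GPU_PARTITION];
pthread_mutex_t *mutex = &GPU_LINUX_MUTEX_POOL[GPU_PARTITION];
int offset             = GPU_PARTITION * GPU_PARTITION_SIZE;

// prefer the GPU used last time to avoid needless state migration
int gpu = pool->get(mutex, last_local_gpu) + offset;
/* ... run the GPU phase on 'gpu' ... */
pool->put(mutex, gpu - offset);   // drop our count on that GPU
```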
| 506 | |||
| 507 | static gpu_pool* GPU_LINUX_SEM_POOL = NULL; | ||
| 508 | static pthread_mutex_t* GPU_LINUX_MUTEX_POOL = NULL; | ||
| 509 | |||
| 510 | static void allocate_locks_linux(int num_gpu_users) | ||
| 511 | { | ||
| 512 | managed_shared_memory *segment_pool_ptr = NULL; | ||
| 513 | managed_shared_memory *segment_mutex_ptr = NULL; | ||
| 514 | |||
| 515 | int numGpuPartitions = NR_GPUS/GPU_PARTITION_SIZE; | ||
| 516 | |||
| 517 | if(num_gpu_users != 0) | ||
| 518 | { | ||
| 519 | printf("%d creating shared memory for linux semaphores; num pools = %d, pool size = %d\n", getpid(), numGpuPartitions, GPU_PARTITION_SIZE); | ||
| 520 | shared_memory_object::remove("linux_mutex_memory"); | ||
| 521 | shared_memory_object::remove("linux_sem_memory"); | ||
| 522 | |||
| 523 | segment_mutex_ptr = new managed_shared_memory(create_only, "linux_mutex_memory", 4*1024); | ||
| 524 | GPU_LINUX_MUTEX_POOL = segment_mutex_ptr->construct<pthread_mutex_t>("pthread_mutex_t linux_m")[numGpuPartitions](); | ||
| 525 | for(int i = 0; i < numGpuPartitions; ++i) | ||
| 526 | { | ||
| 527 | pthread_mutexattr_t attr; | ||
| 528 | pthread_mutexattr_init(&attr); | ||
| 529 | pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED); | ||
| 530 | pthread_mutex_init(&(GPU_LINUX_MUTEX_POOL[i]), &attr); | ||
| 531 | pthread_mutexattr_destroy(&attr); | ||
| 532 | } | ||
| 533 | |||
| 534 | segment_pool_ptr = new managed_shared_memory(create_only, "linux_sem_memory", 4*1024); | ||
| 535 | GPU_LINUX_SEM_POOL = segment_pool_ptr->construct<gpu_pool>("gpu_pool linux_p")[numGpuPartitions](GPU_PARTITION_SIZE); | ||
| 536 | } | ||
| 537 | else | ||
| 538 | { | ||
| 539 | do | ||
| 540 | { | ||
| 541 | try | ||
| 542 | { | ||
| 543 | if (!segment_pool_ptr) segment_pool_ptr = new managed_shared_memory(open_only, "linux_sem_memory"); | ||
| 544 | } | ||
| 545 | catch(...) | ||
| 546 | { | ||
| 547 | sleep(1); | ||
| 548 | } | ||
| 549 | }while(segment_pool_ptr == NULL); | ||
| 550 | |||
| 551 | do | ||
| 552 | { | ||
| 553 | try | ||
| 554 | { | ||
| 555 | if (!segment_mutex_ptr) segment_mutex_ptr = new managed_shared_memory(open_only, "linux_mutex_memory"); | ||
| 556 | } | ||
| 557 | catch(...) | ||
| 558 | { | ||
| 559 | sleep(1); | ||
| 560 | } | ||
| 561 | }while(segment_mutex_ptr == NULL); | ||
| 562 | |||
| 563 | GPU_LINUX_SEM_POOL = segment_pool_ptr->find<gpu_pool>("gpu_pool linux_p").first; | ||
| 564 | GPU_LINUX_MUTEX_POOL = segment_mutex_ptr->find<pthread_mutex_t>("pthread_mutex_t linux_m").first; | ||
| 565 | } | ||
| 566 | } | ||
| 567 | |||
| 568 | |||
| 569 | |||
| 570 | |||
| 571 | static void allocate_locks(int num_gpu_users, bool linux_mode) | ||
| 572 | { | ||
| 573 | if(!linux_mode) | ||
| 574 | allocate_locks_litmus(); | ||
| 575 | else | ||
| 576 | allocate_locks_linux(num_gpu_users); | ||
| 577 | } | ||
| 578 | |||
| 579 | static void set_cur_gpu(int gpu) | ||
| 580 | { | ||
| 581 | if (TRACE_MIGRATIONS) { | ||
| 582 | trace_migration(gpu, CUR_DEVICE); | ||
| 583 | } | ||
| 584 | if(gpu != CUR_DEVICE) { | ||
| 585 | cudaSetDevice(gpu); | ||
| 586 | CUR_DEVICE = gpu; | ||
| 587 | } | ||
| 588 | } | ||
| 589 | |||
| 590 | |||
| 591 | static pthread_barrier_t *gpu_barrier = NULL; | ||
| 592 | static interprocess_mutex *gpu_mgmt_mutexes = NULL; | ||
| 593 | static managed_shared_memory *segment_ptr = NULL; | ||
| 594 | |||
| 595 | void coordinate_gpu_tasks(int num_gpu_users) | ||
| 596 | { | ||
| 597 | if(num_gpu_users != 0) | ||
| 598 | { | ||
| 599 | printf("%d creating shared memory\n", getpid()); | ||
| 600 | shared_memory_object::remove("gpu_barrier_memory"); | ||
| 601 | segment_ptr = new managed_shared_memory(create_only, "gpu_barrier_memory", 4*1024); | ||
| 602 | |||
| 603 | printf("%d creating a barrier for %d users\n", getpid(), num_gpu_users); | ||
| 604 | gpu_barrier = segment_ptr->construct<pthread_barrier_t>("pthread_barrier_t gpu_barrier")(); | ||
| 605 | pthread_barrierattr_t battr; | ||
| 606 | pthread_barrierattr_init(&battr); | ||
| 607 | pthread_barrierattr_setpshared(&battr, PTHREAD_PROCESS_SHARED); | ||
| 608 | pthread_barrier_init(gpu_barrier, &battr, num_gpu_users); | ||
| 609 | pthread_barrierattr_destroy(&battr); | ||
| 610 | printf("%d creating gpu mgmt mutexes for %d devices\n", getpid(), NR_GPUS); | ||
| 611 | gpu_mgmt_mutexes = segment_ptr->construct<interprocess_mutex>("interprocess_mutex m")[NR_GPUS](); | ||
| 612 | } | ||
| 613 | else | ||
| 614 | { | ||
| 615 | do | ||
| 616 | { | ||
| 617 | try | ||
| 618 | { | ||
| 619 | segment_ptr = new managed_shared_memory(open_only, "gpu_barrier_memory"); | ||
| 620 | } | ||
| 621 | catch(...) | ||
| 622 | { | ||
| 623 | sleep(1); | ||
| 624 | } | ||
| 625 | }while(segment_ptr == NULL); | ||
| 626 | |||
| 627 | gpu_barrier = segment_ptr->find<pthread_barrier_t>("pthread_barrier_t gpu_barrier").first; | ||
| 628 | gpu_mgmt_mutexes = segment_ptr->find<interprocess_mutex>("interprocess_mutex m").first; | ||
| 629 | } | ||
| 630 | } | ||
| 631 | |||
| 632 | typedef float spindata_t; | ||
| 633 | |||
| 634 | char *d_send_data[NR_GPUS] = {0}; | ||
| 635 | char *d_recv_data[NR_GPUS] = {0}; | ||
| 636 | char *d_state_data[NR_GPUS] = {0}; | ||
| 637 | spindata_t *d_spin_data[NR_GPUS] = {0}; | ||
| 638 | //unsigned int *d_iteration_count[NR_GPUS] = {0}; | ||
| 639 | |||
| 640 | |||
| 641 | bool p2pMigration[NR_GPUS][NR_GPUS] = {0}; | ||
| 642 | |||
| 643 | char *h_send_data = 0; | ||
| 644 | char *h_recv_data = 0; | ||
| 645 | char *h_state_data = 0; | ||
| 646 | |||
| 647 | unsigned int *h_iteration_count[NR_GPUS] = {0}; | ||
| 648 | |||
| 649 | static void init_cuda(int num_gpu_users) | ||
| 650 | { | ||
| 651 | const int PAGE_SIZE = 4*1024; | ||
| 652 | size_t send_alloc_bytes = SEND_SIZE + (SEND_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; | ||
| 653 | size_t recv_alloc_bytes = RECV_SIZE + (RECV_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; | ||
| 654 | size_t state_alloc_bytes = STATE_SIZE + (STATE_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; | ||
| 655 | |||
| 656 | coordinate_gpu_tasks(num_gpu_users); | ||
| 657 | |||
| 658 | switch (CUDA_SYNC_MODE) | ||
| 659 | { | ||
| 660 | case BLOCKING: | ||
| 661 | cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); | ||
| 662 | break; | ||
| 663 | case SPIN: | ||
| 664 | cudaSetDeviceFlags(cudaDeviceScheduleSpin); | ||
| 665 | break; | ||
| 666 | } | ||
| 667 | |||
| 668 | for(int i = 0; i < GPU_PARTITION_SIZE; ++i) | ||
| 669 | { | ||
| 670 | cudaDeviceProp prop; | ||
| 671 | int which = GPU_PARTITION*GPU_PARTITION_SIZE + i; | ||
| 672 | |||
| 673 | gpu_mgmt_mutexes[which].lock(); | ||
| 674 | |||
| 675 | set_cur_gpu(which); | ||
| 676 | cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0); | ||
| 677 | cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0); | ||
| 678 | |||
| 679 | cudaGetDeviceProperties(&prop, which); | ||
| 680 | GPU_HZ[which] = prop.clockRate * 1000; /* khz -> hz */ | ||
| 681 | NUM_SM[which] = prop.multiProcessorCount; | ||
| 682 | WARP_SIZE[which] = prop.warpSize; | ||
| 683 | |||
| 684 | // enough to fill the L2 cache exactly. | ||
| 685 | ELEM_PER_THREAD[which] = (prop.l2CacheSize/(NUM_SM[which]*WARP_SIZE[which]*sizeof(spindata_t))); | ||
| 686 | |||
| 687 | |||
| 688 | if (!MIGRATE_VIA_SYSMEM && prop.unifiedAddressing) | ||
| 689 | { | ||
| 690 | for(int j = 0; j < GPU_PARTITION_SIZE; ++j) | ||
| 691 | { | ||
| 692 | if (i != j) | ||
| 693 | { | ||
| 694 | int canAccess = 0; | ||
| 695 | int other = GPU_PARTITION*GPU_PARTITION_SIZE + j; /* global device id, like 'which' */ | ||
| 695 | cudaDeviceCanAccessPeer(&canAccess, which, other); | ||
| 696 | if(canAccess) | ||
| 697 | { | ||
| 698 | cudaDeviceEnablePeerAccess(other, 0); | ||
| 699 | p2pMigration[which][other] = true; /* indexed by global ids, as read in MigrateToGPU() */ | ||
| 700 | } | ||
| 701 | } | ||
| 702 | } | ||
| 703 | } | ||
| 704 | |||
| 705 | cudaStreamCreate(&STREAMS[CUR_DEVICE]); | ||
| 706 | |||
| 707 | cudaMalloc(&d_spin_data[which], prop.l2CacheSize); | ||
| 708 | cudaMemset(d_spin_data[which], 0, prop.l2CacheSize); /* the device pointer itself, not its host address */ | ||
| 709 | // cudaMalloc(&d_iteration_count[which], NUM_SM[which]*WARP_SIZE[which]*sizeof(unsigned int)); | ||
| 710 | // cudaHostAlloc(&h_iteration_count[which], NUM_SM[which]*WARP_SIZE[which]*sizeof(unsigned int), cudaHostAllocPortable | cudaHostAllocMapped); | ||
| 711 | |||
| 712 | if (send_alloc_bytes) { | ||
| 713 | cudaMalloc(&d_send_data[which], send_alloc_bytes); | ||
| 714 | cudaHostAlloc(&h_send_data, send_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped); | ||
| 715 | } | ||
| 716 | |||
| 717 | if (recv_alloc_bytes) { /* h_recv_data is still NULL here; test the size, as the send branch does */ | ||
| 718 | cudaMalloc(&d_recv_data[which], recv_alloc_bytes); | ||
| 719 | cudaHostAlloc(&h_recv_data, recv_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped); | ||
| 720 | } | ||
| 721 | |||
| 722 | if (state_alloc_bytes) { /* likewise, h_state_data has not been allocated yet */ | ||
| 723 | cudaMalloc(&d_state_data[which], state_alloc_bytes); | ||
| 724 | |||
| 725 | if (MIGRATE_VIA_SYSMEM) | ||
| 726 | cudaHostAlloc(&h_state_data, state_alloc_bytes, cudaHostAllocPortable | cudaHostAllocMapped | cudaHostAllocWriteCombined); | ||
| 727 | } | ||
| 728 | |||
| 729 | gpu_mgmt_mutexes[which].unlock(); | ||
| 730 | } | ||
| 731 | |||
| 732 | // roll back to first GPU | ||
| 733 | set_cur_gpu(GPU_PARTITION*GPU_PARTITION_SIZE); | ||
| 734 | } | ||
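One small note on the allocation sizes at the top of `init_cuda()`: `SIZE + (SIZE%PAGE_SIZE != 0)*PAGE_SIZE` pads any unaligned size by a whole extra page rather than rounding up to the next page multiple (5000 bytes becomes 9096, not 8192). The over-allocation is harmless here; the conventional round-up, for comparison, would be:

```cpp
// Round n up to the next multiple of page (assumes page > 0).
static size_t round_up(size_t n, size_t page)
{
	return ((n + page - 1) / page) * page;  // round_up(5000, 4096) == 8192
}
```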
| 735 | |||
| 736 | |||
| 737 | |||
| 738 | static bool MigrateToGPU_P2P(int from, int to) | ||
| 739 | { | ||
| 740 | bool success = true; | ||
| 741 | set_cur_gpu(to); | ||
| 742 | chunkMemcpy(this_gpu(d_state_data), per_gpu(d_state_data, from), | ||
| 743 | STATE_SIZE, cudaMemcpyDeviceToDevice, to, | ||
| 744 | useEngineLocks(), from, true); | ||
| 745 | return success; | ||
| 746 | } | ||
| 747 | |||
| 748 | |||
| 749 | static bool PullState(void) | ||
| 750 | { | ||
| 751 | bool success = true; | ||
| 752 | chunkMemcpy(h_state_data, this_gpu(d_state_data), | ||
| 753 | STATE_SIZE, cudaMemcpyDeviceToHost, | ||
| 754 | cur_gpu(), useEngineLocks(), -1, true); | ||
| 755 | return success; | ||
| 756 | } | ||
| 757 | |||
| 758 | static bool PushState(void) | ||
| 759 | { | ||
| 760 | bool success = true; | ||
| 761 | chunkMemcpy(this_gpu(d_state_data), h_state_data, | ||
| 762 | STATE_SIZE, cudaMemcpyHostToDevice, | ||
| 763 | cur_gpu(), useEngineLocks(), -1, true); | ||
| 764 | return success; | ||
| 765 | } | ||
| 766 | |||
| 767 | static bool MigrateToGPU_SysMem(int from, int to) | ||
| 768 | { | ||
| 769 | // THIS IS ON-DEMAND SYS_MEM MIGRATION. GPUSync says | ||
| 770 | // you should be using speculative migrations. | ||
| 771 | // Use PushState() and PullState(). | ||
| 772 | assert(false); // for now | ||
| 773 | |||
| 774 | bool success = true; | ||
| 775 | |||
| 776 | set_cur_gpu(from); | ||
| 777 | chunkMemcpy(h_state_data, this_gpu(d_state_data), | ||
| 778 | STATE_SIZE, cudaMemcpyDeviceToHost, | ||
| 779 | from, useEngineLocks(), -1, true); | ||
| 780 | |||
| 781 | set_cur_gpu(to); | ||
| 782 | chunkMemcpy(this_gpu(d_state_data), h_state_data, | ||
| 783 | STATE_SIZE, cudaMemcpyHostToDevice, | ||
| 784 | to, useEngineLocks(), -1, true); | ||
| 785 | |||
| 786 | return success; | ||
| 787 | } | ||
| 788 | |||
| 789 | static bool MigrateToGPU(int from, int to) | ||
| 790 | { | ||
| 791 | bool success = false; | ||
| 792 | |||
| 793 | if (from != to) | ||
| 794 | { | ||
| 795 | if(!MIGRATE_VIA_SYSMEM && p2pMigration[to][from]) | ||
| 796 | success = MigrateToGPU_P2P(from, to); | ||
| 797 | else | ||
| 798 | success = MigrateToGPU_SysMem(from, to); | ||
| 799 | } | ||
| 800 | else | ||
| 801 | { | ||
| 802 | set_cur_gpu(to); | ||
| 803 | success = true; | ||
| 804 | } | ||
| 805 | |||
| 806 | return success; | ||
| 807 | } | ||
| 808 | |||
| 809 | static bool MigrateToGPU_Implicit(int to) | ||
| 810 | { | ||
| 811 | return( MigrateToGPU(cur_gpu(), to) ); | ||
| 812 | } | ||
| 813 | |||
| 814 | static void MigrateIfNeeded(int next_gpu) | ||
| 815 | { | ||
| 816 | if(next_gpu != cur_gpu() && cur_gpu() != -1) | ||
| 817 | { | ||
| 818 | if (!MIGRATE_VIA_SYSMEM) | ||
| 819 | MigrateToGPU_Implicit(next_gpu); | ||
| 820 | else { | ||
| 821 | set_cur_gpu(next_gpu); | ||
| 822 | PushState(); | ||
| 823 | } | ||
| 824 | } | ||
| 825 | } | ||
| 826 | |||
| 827 | |||
| 828 | |||
| 829 | static void exit_cuda() | ||
| 830 | { | ||
| 831 | for(int i = 0; i < GPU_PARTITION_SIZE; ++i) | ||
| 832 | { | ||
| 833 | int which = GPU_PARTITION*GPU_PARTITION_SIZE + i; | ||
| 834 | gpu_mgmt_mutexes[which].lock(); | ||
| 835 | set_cur_gpu(which); | ||
| 836 | cudaDeviceReset(); | ||
| 837 | gpu_mgmt_mutexes[which].unlock(); | ||
| 838 | } | ||
| 839 | } | ||
| 840 | |||
| 841 | bool safetynet = false; | ||
| 842 | |||
| 843 | static void catch_exit(int sig) /* don't shadow the function name */ | ||
| 844 | { | ||
| 845 | if(GPU_USING && USE_ENGINE_LOCKS && safetynet) | ||
| 846 | { | ||
| 847 | safetynet = false; | ||
| 848 | for(int i = 0; i < GPU_PARTITION_SIZE; ++i) | ||
| 849 | { | ||
| 850 | int which = GPU_PARTITION*GPU_PARTITION_SIZE + i; | ||
| 851 | set_cur_gpu(which); | ||
| 852 | |||
| 853 | // cudaDeviceReset(); | ||
| 854 | |||
| 855 | // try to unlock everything. litmus will prevent bogus calls. | ||
| 856 | if(USE_ENGINE_LOCKS) | ||
| 857 | { | ||
| 858 | litmus_unlock(EE_LOCKS[which]); | ||
| 859 | litmus_unlock(CE_SEND_LOCKS[which]); | ||
| 860 | if (NUM_COPY_ENGINES == 2) | ||
| 861 | { | ||
| 862 | if (RESERVED_MIGR_COPY_ENGINE) | ||
| 863 | litmus_unlock(CE_MIGR_SEND_LOCKS[which]); | ||
| 864 | else | ||
| 865 | litmus_unlock(CE_MIGR_RECV_LOCKS[which]); | ||
| 866 | } | ||
| 867 | } | ||
| 868 | } | ||
| 869 | litmus_unlock(TOKEN_LOCK); | ||
| 870 | } | ||
| 871 | } | ||
| 872 | |||
| 873 | |||
| 874 | |||
| 875 | |||
| 876 | |||
| 877 | static float ms_sum; | ||
| 878 | static int gpucount = 0; | ||
| 879 | |||
| 880 | __global__ void docudaspin(float* data, /*unsigned int* iterations,*/ unsigned int num_elem, unsigned int cycles) | ||
| 881 | { | ||
| 882 | long long int now = clock64(); | ||
| 883 | long long unsigned int elapsed = 0; | ||
| 884 | long long int last; | ||
| 885 | |||
| 886 | // unsigned int iter = 0; | ||
| 887 | unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; | ||
| 888 | unsigned int j = 0; | ||
| 889 | bool toggle = true; | ||
| 890 | |||
| 891 | // iterations[i] = 0; | ||
| 892 | do | ||
| 893 | { | ||
| 894 | data[i*num_elem+j] += (toggle) ? M_PI : -M_PI; | ||
| 895 | j = (j + 1 != num_elem) ? j + 1 : 0; | ||
| 896 | toggle = !toggle; | ||
| 897 | // iter++; | ||
| 898 | |||
| 899 | last = now; | ||
| 900 | now = clock64(); | ||
| 901 | |||
| 902 | // // exact calculation takes more cycles than a second | ||
| 903 | // // loop iteration when code is compiled optimized | ||
| 904 | // long long int diff = now - last; | ||
| 905 | // elapsed += (diff > 0) ? | ||
| 906 | // diff : | ||
| 907 | // now + ((~((long long int)0)<<1)>>1) - last; | ||
| 908 | |||
| 909 | // don't count iterations with clock roll-over | ||
| 910 | elapsed += max(0ll, now - last); | ||
| 911 | }while(elapsed < cycles); | ||
| 912 | |||
| 913 | // iterations[i] = iter; | ||
| 914 | |||
| 915 | return; | ||
| 916 | } | ||
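The kernel spins by re-reading `clock64()` until a cycle budget elapses, so the host converts wall time to cycles using the per-GPU clock rate cached in `init_cuda()`. A hypothetical standalone launch, following the `gpu_loop_for()` call below (one block per SM, one warp per block); note that the 32-bit cycle count caps a single spin at roughly 4 s on a 1 GHz GPU:

```cpp
double gpu_sec = 0.0025;  // spin for 2.5 ms of GPU time
unsigned int cycles = (unsigned int)(cur_hz() * gpu_sec); // 1 GHz -> 2,500,000

docudaspin<<<cur_sms(), cur_warp_size(), 0, cur_stream()>>>(
        d_spin_data[cur_gpu()], cur_elem_per_thread(), cycles);
cudaStreamSynchronize(cur_stream());
```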
| 917 | |||
| 918 | static void gpu_loop_for(double gpu_sec_time, double emergency_exit) | ||
| 919 | { | ||
| 920 | int next_gpu; | ||
| 921 | |||
| 922 | if (emergency_exit && wctime() > emergency_exit) | ||
| 923 | goto out; | ||
| 924 | |||
| 925 | next_gpu = litmus_lock(TOKEN_LOCK); | ||
| 926 | { | ||
| 927 | MigrateIfNeeded(next_gpu); | ||
| 928 | |||
| 929 | unsigned int numcycles = (unsigned int)(cur_hz() * gpu_sec_time); | ||
| 930 | |||
| 931 | if(SEND_SIZE > 0) | ||
| 932 | chunkMemcpy(this_gpu(d_send_data), h_send_data, SEND_SIZE, /* send buffer, not state */ | ||
| 933 | cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks()); | ||
| 934 | |||
| 935 | if(useEngineLocks()) litmus_lock(cur_ee()); | ||
| 936 | /* one block per sm, one warp per block */ | ||
| 937 | docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles); | ||
| 938 | // docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], d_iteration_count[cur_gpu()], cur_elem_per_thread(), numcycles); | ||
| 939 | cudaStreamSynchronize(cur_stream()); | ||
| 940 | if(useEngineLocks()) litmus_unlock(cur_ee()); | ||
| 941 | |||
| 942 | if(RECV_SIZE > 0) | ||
| 943 | chunkMemcpy(h_recv_data, this_gpu(d_recv_data), RECV_SIZE, /* recv buffer, not state */ | ||
| 944 | cudaMemcpyDeviceToHost, CUR_DEVICE, useEngineLocks()); | ||
| 945 | |||
| 946 | if (MIGRATE_VIA_SYSMEM) | ||
| 947 | PullState(); | ||
| 948 | } | ||
| 949 | litmus_unlock(TOKEN_LOCK); | ||
| 950 | |||
| 951 | last_gpu() = cur_gpu(); | ||
| 952 | |||
| 953 | out: | ||
| 954 | return; | ||
| 955 | } | ||
| 956 | |||
| 957 | static void gpu_loop_for_linux(double gpu_sec_time, double emergency_exit) | ||
| 958 | { | ||
| 959 | static int GPU_OFFSET = GPU_PARTITION * GPU_PARTITION_SIZE; | ||
| 960 | static gpu_pool *pool = &GPU_LINUX_SEM_POOL[GPU_PARTITION]; | ||
| 961 | static pthread_mutex_t *mutex = &GPU_LINUX_MUTEX_POOL[GPU_PARTITION]; | ||
| 962 | |||
| 963 | static bool once = false; | ||
| 964 | static cudaEvent_t start, end; | ||
| 965 | float ms; | ||
| 966 | if (!once) | ||
| 967 | { | ||
| 968 | once = true; | ||
| 969 | cudaEventCreate(&start); | ||
| 970 | cudaEventCreate(&end); | ||
| 971 | } | ||
| 972 | |||
| 973 | int next_gpu; | ||
| 974 | |||
| 975 | if (emergency_exit && wctime() > emergency_exit) | ||
| 976 | goto out; | ||
| 977 | |||
| 978 | next_gpu = pool->get(mutex, cur_gpu() - GPU_OFFSET) + GPU_OFFSET; | ||
| 979 | { | ||
| 980 | MigrateIfNeeded(next_gpu); | ||
| 981 | |||
| 982 | unsigned int numcycles = (unsigned int)(cur_hz() * gpu_sec_time); | ||
| 983 | |||
| 984 | if(SEND_SIZE > 0) | ||
| 985 | chunkMemcpy(this_gpu(d_send_data), h_send_data, SEND_SIZE, /* send buffer, not state */ | ||
| 986 | cudaMemcpyHostToDevice, cur_gpu(), useEngineLocks()); | ||
| 987 | |||
| 988 | /* one block per sm, one warp per block */ | ||
| 989 | cudaEventRecord(start, cur_stream()); | ||
| 990 | docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], cur_elem_per_thread(), numcycles); | ||
| 991 | // docudaspin <<<cur_sms(),cur_warp_size(), 0, cur_stream()>>> (d_spin_data[cur_gpu()], d_iteration_count[cur_gpu()], cur_elem_per_thread(), numcycles); | ||
| 992 | cudaEventRecord(end, cur_stream()); | ||
| 993 | cudaEventSynchronize(end); | ||
| 994 | cudaStreamSynchronize(cur_stream()); | ||
| 995 | |||
| 996 | // chunkMemcpy(this_gpu(h_iteration_count), this_gpu(d_iteration_count), sizeof(unsigned int), | ||
| 997 | // cudaMemcpyDeviceToHost, cur_gpu(), useEngineLocks()); | ||
| 998 | // | ||
| 999 | cudaEventElapsedTime(&ms, start, end); | ||
| 1000 | ms_sum += ms; | ||
| 1001 | ++gpucount; | ||
| 1002 | // printf("%f\n", ms); | ||
| 1003 | // printf("%f: %u\n", ms, this_gpu(h_iteration_count)[0]); | ||
| 1004 | |||
| 1005 | |||
| 1006 | if(RECV_SIZE > 0) | ||
| 1007 | chunkMemcpy(h_recv_data, this_gpu(d_recv_data), RECV_SIZE, /* recv buffer, not state */ | ||
| 1008 | cudaMemcpyDeviceToHost, cur_gpu(), useEngineLocks()); | ||
| 1009 | |||
| 1010 | if (MIGRATE_VIA_SYSMEM) | ||
| 1011 | PullState(); | ||
| 1012 | } | ||
| 1013 | pool->put(mutex, cur_gpu() - GPU_OFFSET); | ||
| 1014 | |||
| 1015 | last_gpu() = cur_gpu(); | ||
| 1016 | |||
| 1017 | out: | ||
| 1018 | return; | ||
| 1019 | } | ||
| 1020 | |||
| 1021 | |||
| 1022 | |||
| 1023 | |||
| 1024 | static void usage(char *error) { | ||
| 1025 | fprintf(stderr, "Error: %s\n", error); | ||
| 1026 | fprintf(stderr, | ||
| 1027 | "Usage:\n" | ||
| 1028 | " rt_spin [COMMON-OPTS] WCET PERIOD DURATION\n" | ||
| 1029 | " rt_spin [COMMON-OPTS] -f FILE [-o COLUMN] WCET PERIOD\n" | ||
| 1030 | " rt_spin -l\n" | ||
| 1031 | "\n" | ||
| 1032 | "COMMON-OPTS = [-w] [-s SCALE]\n" | ||
| 1033 | " [-p PARTITION/CLUSTER [-z CLUSTER SIZE]] [-c CLASS]\n" | ||
| 1034 | " [-X LOCKING-PROTOCOL] [-L CRITICAL SECTION LENGTH] [-Q RESOURCE-ID]" | ||
| 1035 | "\n" | ||
| 1036 | "WCET and PERIOD are milliseconds, DURATION is seconds.\n" | ||
| 1037 | "CRITICAL SECTION LENGTH is in milliseconds.\n"); | ||
| 1038 | exit(EXIT_FAILURE); | ||
| 1039 | } | ||
| 1040 | |||
| 1041 | /* | ||
| 1042 | * returns the character that made processing stop, newline or EOF | ||
| 1043 | */ | ||
| 1044 | static int skip_to_next_line(FILE *fstream) | ||
| 1045 | { | ||
| 1046 | int ch; | ||
| 1047 | for (ch = fgetc(fstream); ch != EOF && ch != '\n'; ch = fgetc(fstream)); | ||
| 1048 | return ch; | ||
| 1049 | } | ||
| 1050 | |||
| 1051 | static void skip_comments(FILE *fstream) | ||
| 1052 | { | ||
| 1053 | int ch; | ||
| 1054 | for (ch = fgetc(fstream); ch == '#'; ch = fgetc(fstream)) | ||
| 1055 | skip_to_next_line(fstream); | ||
| 1056 | ungetc(ch, fstream); | ||
| 1057 | } | ||
| 1058 | |||
| 1059 | static void get_exec_times(const char *file, const int column, | ||
| 1060 | int *num_jobs, double **exec_times) | ||
| 1061 | { | ||
| 1062 | FILE *fstream; | ||
| 1063 | int cur_job, cur_col, ch; | ||
| 1064 | *num_jobs = 0; | ||
| 1065 | |||
| 1066 | fstream = fopen(file, "r"); | ||
| 1067 | if (!fstream) | ||
| 1068 | bail_out("could not open execution time file"); | ||
| 1069 | |||
| 1070 | /* figure out the number of jobs */ | ||
| 1071 | do { | ||
| 1072 | skip_comments(fstream); | ||
| 1073 | ch = skip_to_next_line(fstream); | ||
| 1074 | if (ch != EOF) | ||
| 1075 | ++(*num_jobs); | ||
| 1076 | } while (ch != EOF); | ||
| 1077 | |||
| 1078 | if (-1 == fseek(fstream, 0L, SEEK_SET)) | ||
| 1079 | bail_out("rewinding file failed"); | ||
| 1080 | |||
| 1081 | /* allocate space for exec times */ | ||
| 1082 | *exec_times = (double*)calloc(*num_jobs, sizeof(**exec_times)); | ||
| 1083 | if (!*exec_times) | ||
| 1084 | bail_out("couldn't allocate memory"); | ||
| 1085 | |||
| 1086 | for (cur_job = 0; cur_job < *num_jobs && !feof(fstream); ++cur_job) { | ||
| 1087 | |||
| 1088 | skip_comments(fstream); | ||
| 1089 | |||
| 1090 | for (cur_col = 1; cur_col < column; ++cur_col) { | ||
| 1091 | /* discard input until we get to the column we want */ | ||
| 1092 | int unused __attribute__ ((unused)) = fscanf(fstream, "%*[^,],"); | ||
| 1093 | } | ||
| 1094 | |||
| 1095 | /* get the desired exec. time */ | ||
| 1096 | if (1 != fscanf(fstream, "%lf", (*exec_times)+cur_job)) { | ||
| 1097 | fprintf(stderr, "invalid execution time for job %d\n", | ||
| 1098 | cur_job); | ||
| 1099 | exit(EXIT_FAILURE); | ||
| 1100 | } | ||
| 1101 | |||
| 1102 | skip_to_next_line(fstream); | ||
| 1103 | } | ||
| 1104 | |||
| 1105 | assert(cur_job == *num_jobs); | ||
| 1106 | fclose(fstream); | ||
| 1107 | } | ||
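For reference, `get_exec_times()` treats `#` lines as comments, counts one job per remaining line, skips `column - 1` comma-separated fields, and reads a `double` from the requested column. A hypothetical input file for `-o 2` might look like this (labels and values are made up):

```
# per-job execution times in milliseconds; the second column is read with -o 2
job0,12.5
job1,14.0
job2,11.75
```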
| 1108 | |||
| 1109 | #define NUMS 4096 | ||
| 1110 | static int num[NUMS]; | ||
| 1111 | __attribute__((unused)) static char* progname; | ||
| 1112 | |||
| 1113 | static int loop_once(void) | ||
| 1114 | { | ||
| 1115 | int i, j = 0; | ||
| 1116 | for (i = 0; i < NUMS; i++) | ||
| 1117 | j += num[i]++; | ||
| 1118 | return j; | ||
| 1119 | } | ||
| 1120 | |||
| 1121 | static int loop_for(double exec_time, double emergency_exit) | ||
| 1122 | { | ||
| 1123 | double last_loop = 0, loop_start; | ||
| 1124 | int tmp = 0; | ||
| 1125 | |||
| 1126 | double start = cputime(); | ||
| 1127 | double now = cputime(); | ||
| 1128 | |||
| 1129 | if (emergency_exit && wctime() > emergency_exit) | ||
| 1130 | goto out; | ||
| 1131 | |||
| 1132 | while (now + last_loop < start + exec_time) { | ||
| 1133 | loop_start = now; | ||
| 1134 | tmp += loop_once(); | ||
| 1135 | now = cputime(); | ||
| 1136 | last_loop = now - loop_start; | ||
| 1137 | if (emergency_exit && wctime() > emergency_exit) { | ||
| 1138 | /* Oops --- this should only be possible if the execution time tracking | ||
| 1139 | * is broken in the LITMUS^RT kernel. */ | ||
| 1140 | fprintf(stderr, "!!! gpuspin/%d emergency exit!\n", getpid()); | ||
| 1141 | fprintf(stderr, "Something is seriously wrong! Do not ignore this.\n"); | ||
| 1142 | break; | ||
| 1143 | } | ||
| 1144 | } | ||
| 1145 | |||
| 1146 | out: | ||
| 1147 | return tmp; | ||
| 1148 | } | ||
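`loop_for()` burns the requested amount of CPU time (via `cputime()`), using the measured cost of the previous iteration to avoid overshooting, while `emergency_exit` is a wall-clock failsafe in case budget tracking in the kernel misbehaves. A hypothetical call site:

```cpp
/* Burn 5 ms of CPU time, but give up once the wall clock passes the
 * failsafe deadline (here: 60 s from now; both values are illustrative). */
double failsafe = wctime() + 60.0;
int sink = loop_for(0.005, failsafe);
(void)sink;  /* the return value exists only to defeat dead-code elimination */
```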
| 1149 | |||
| 1150 | |||
| 1151 | static void debug_delay_loop(void) | ||
| 1152 | { | ||
| 1153 | double start, end, delay; | ||
| 1154 | |||
| 1155 | while (1) { | ||
| 1156 | for (delay = 0.5; delay > 0.01; delay -= 0.01) { | ||
| 1157 | start = wctime(); | ||
| 1158 | loop_for(delay, 0); | ||
| 1159 | end = wctime(); | ||
| 1160 | printf("%6.4fs: looped for %10.8fs, delta=%11.8fs, error=%7.4f%%\n", | ||
| 1161 | delay, | ||
| 1162 | end - start, | ||
| 1163 | end - start - delay, | ||
| 1164 | 100 * (end - start - delay) / delay); | ||
| 1165 | } | ||
| 1166 | } | ||
| 1167 | } | ||
| 1168 | |||
| 1169 | static int gpu_job(double exec_time, double gpu_exec_time, double program_end) | ||
| 1170 | { | ||
| 1171 | double chunk1, chunk2; | ||
| 1172 | |||
| 1173 | if (wctime() > program_end) { | ||
| 1174 | return 0; | ||
| 1175 | } | ||
| 1176 | else { | ||
| 1177 | chunk1 = exec_time * drand48(); | ||
| 1178 | chunk2 = exec_time - chunk1; | ||
| 1179 | |||
| 1180 | loop_for(chunk1, program_end + 1); | ||
| 1181 | gpu_loop_for(gpu_exec_time, program_end + 1); | ||
| 1182 | loop_for(chunk2, program_end + 1); | ||
| 1183 | |||
| 1184 | sleep_next_period(); | ||
| 1185 | } | ||
| 1186 | return 1; | ||
| 1187 | } | ||
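`gpu_job()` splits the CPU demand at a point drawn uniformly at random, so the GPU request does not always fall at the same offset within a job. For example, with a 10 ms CPU demand:

```cpp
/* Example split for exec_time = 10 ms when drand48() happens to return 0.3:
 * 3 ms of spinning before the GPU section, 7 ms after it. */
double exec_time = 0.010;
double r = 0.3;                       /* stands in for one drand48() draw */
double chunk1 = exec_time * r;        /* 0.003 s */
double chunk2 = exec_time - chunk1;   /* 0.007 s */
```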
| 1188 | |||
| 1189 | static int job(double exec_time, double program_end) | ||
| 1190 | { | ||
| 1191 | if (wctime() > program_end) { | ||
| 1192 | return 0; | ||
| 1193 | } | ||
| 1194 | else { | ||
| 1195 | loop_for(exec_time, program_end + 1); | ||
| 1196 | sleep_next_period(); | ||
| 1197 | } | ||
| 1198 | return 1; | ||
| 1199 | } | ||
| 1200 | |||
| 1201 | /*****************************/ | ||
| 1202 | /* only used for linux modes */ | ||
| 1203 | |||
| 1204 | static struct timespec periodTime; | ||
| 1205 | static struct timespec releaseTime; | ||
| 1206 | static unsigned int job_no = 0; | ||
| 1207 | |||
| 1208 | static lt_t period_ns; | ||
| 1209 | |||
| 1210 | static void log_release() | ||
| 1211 | { | ||
| 1212 | __attribute__ ((unused)) lt_t rel = releaseTime.tv_sec * s2ns(1) + releaseTime.tv_nsec; | ||
| 1213 | __attribute__ ((unused)) lt_t dead = rel + period_ns; | ||
| 1214 | trace_release(rel, dead, job_no); | ||
| 1215 | } | ||
| 1216 | |||
| 1217 | static void log_completion() | ||
| 1218 | { | ||
| 1219 | trace_completion(job_no); | ||
| 1220 | ++job_no; | ||
| 1221 | } | ||
| 1222 | |||
| 1223 | static void setup_next_period_linux(struct timespec* spec, struct timespec* period) | ||
| 1224 | { | ||
| 1225 | spec->tv_sec += period->tv_sec; | ||
| 1226 | spec->tv_nsec += period->tv_nsec; | ||
| 1227 | if (spec->tv_nsec >= s2ns(1)) { | ||
| 1228 | ++(spec->tv_sec); | ||
| 1229 | spec->tv_nsec -= s2ns(1); | ||
| 1230 | } | ||
| 1231 | } | ||
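`setup_next_period_linux()` advances the release time by one period and carries overflow from the nanosecond field into the seconds field; a single carry suffices because both inputs are normalized. A worked example:

```cpp
/* {1 s, 900 ms} + {0 s, 600 ms} must normalize to {2 s, 500 ms}. */
struct timespec release = { 1, 900000000L };
struct timespec period  = { 0, 600000000L };
setup_next_period_linux(&release, &period);
/* release is now { .tv_sec = 2, .tv_nsec = 500000000L } */
```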
| 1232 | |||
| 1233 | static void sleep_next_period_linux() | ||
| 1234 | { | ||
| 1235 | log_completion(); | ||
| 1236 | setup_next_period_linux(&releaseTime, &periodTime); | ||
| 1237 | clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &releaseTime, NULL); | ||
| 1238 | log_release(); | ||
| 1239 | } | ||
| 1240 | |||
| 1241 | static void init_linux() | ||
| 1242 | { | ||
| 1243 | mlockall(MCL_CURRENT | MCL_FUTURE); | ||
| 1244 | } | ||
| 1245 | |||
| 1246 | static int gpu_job_linux(double exec_time, double gpu_exec_time, double program_end) | ||
| 1247 | { | ||
| 1248 | double chunk1, chunk2; | ||
| 1249 | |||
| 1250 | if (wctime() > program_end) { | ||
| 1251 | return 0; | ||
| 1252 | } | ||
| 1253 | else { | ||
| 1254 | chunk1 = exec_time * drand48(); | ||
| 1255 | chunk2 = exec_time - chunk1; | ||
| 1256 | |||
| 1257 | loop_for(chunk1, program_end + 1); | ||
| 1258 | gpu_loop_for_linux(gpu_exec_time, program_end + 1); | ||
| 1259 | loop_for(chunk2, program_end + 1); | ||
| 1260 | |||
| 1261 | sleep_next_period_linux(); | ||
| 1262 | } | ||
| 1263 | return 1; | ||
| 1264 | } | ||
| 1265 | |||
| 1266 | static int job_linux(double exec_time, double program_end) | ||
| 1267 | { | ||
| 1268 | if (wctime() > program_end) { | ||
| 1269 | return 0; | ||
| 1270 | } | ||
| 1271 | else { | ||
| 1272 | loop_for(exec_time, program_end + 1); | ||
| 1273 | sleep_next_period_linux(); | ||
| 1274 | } | ||
| 1275 | return 1; | ||
| 1276 | } | ||
| 1277 | |||
| 1278 | /*****************************/ | ||
| 1279 | |||
| 1280 | enum eScheduler | ||
| 1281 | { | ||
| 1282 | LITMUS, | ||
| 1283 | LINUX, | ||
| 1284 | RT_LINUX | ||
| 1285 | }; | ||
| 1286 | |||
| 1287 | #define CPU_OPTIONS "p:z:c:wlveio:f:s:q:X:L:Q:" | ||
| 1288 | #define GPU_OPTIONS "g:y:r:C:E:dG:xS:R:T:Z:aFm:b:MNI" | ||
| 1289 | |||
| 1290 | // concat the option strings | ||
| 1291 | #define OPTSTR CPU_OPTIONS GPU_OPTIONS | ||
| 1292 | |||
| 1293 | int main(int argc, char** argv) | ||
| 1294 | { | ||
| 1295 | int ret; | ||
| 1296 | lt_t wcet; | ||
| 1297 | lt_t period; | ||
| 1298 | double wcet_ms = -1, gpu_wcet_ms = -1, period_ms = -1; | ||
| 1299 | unsigned int priority = LITMUS_LOWEST_PRIORITY; | ||
| 1300 | int migrate = 0; | ||
| 1301 | int cluster = 0; | ||
| 1302 | int cluster_size = 1; | ||
| 1303 | int opt; | ||
| 1304 | int wait = 0; | ||
| 1305 | int test_loop = 0; | ||
| 1306 | int column = 1; | ||
| 1307 | const char *file = NULL; | ||
| 1308 | int want_enforcement = 0; | ||
| 1309 | int want_signals = 0; | ||
| 1310 | double duration = 0, start = 0; | ||
| 1311 | double *exec_times = NULL; | ||
| 1312 | double scale = 1.0; | ||
| 1313 | task_class_t cls = RT_CLASS_HARD; | ||
| 1314 | int cur_job = 0, num_jobs = 0; | ||
| 1315 | struct rt_task param; | ||
| 1316 | |||
| 1317 | double budget_ms = -1.0; | ||
| 1318 | lt_t budget; | ||
| 1319 | |||
| 1320 | int num_gpu_users = 0; | ||
| 1321 | |||
| 1322 | |||
| 1323 | eScheduler scheduler = LITMUS; | ||
| 1324 | |||
| 1325 | /* locking */ | ||
| 1326 | // int lock_od = -1; | ||
| 1327 | // int resource_id = 0; | ||
| 1328 | // int protocol = -1; | ||
| 1329 | // double cs_length = 1; /* millisecond */ | ||
| 1330 | |||
| 1331 | progname = argv[0]; | ||
| 1332 | |||
| 1333 | while ((opt = getopt(argc, argv, OPTSTR)) != -1) { | ||
| 1334 | switch (opt) { | ||
| 1335 | case 'w': | ||
| 1336 | wait = 1; | ||
| 1337 | break; | ||
| 1338 | case 'p': | ||
| 1339 | cluster = atoi(optarg); | ||
| 1340 | migrate = 1; | ||
| 1341 | break; | ||
| 1342 | case 'z': | ||
| 1343 | cluster_size = atoi(optarg); | ||
| 1344 | CPU_PARTITION_SIZE = cluster_size; | ||
| 1345 | break; | ||
| 1346 | case 'g': | ||
| 1347 | GPU_USING = true; | ||
| 1348 | GPU_PARTITION = atoi(optarg); | ||
| 1349 | assert(GPU_PARTITION >= 0 && GPU_PARTITION < NR_GPUS); | ||
| 1350 | break; | ||
| 1351 | case 'y': | ||
| 1352 | GPU_PARTITION_SIZE = atoi(optarg); | ||
| 1353 | assert(GPU_PARTITION_SIZE > 0); | ||
| 1354 | break; | ||
| 1355 | case 'r': | ||
| 1356 | RHO = atoi(optarg); | ||
| 1357 | assert(RHO > 0); | ||
| 1358 | break; | ||
| 1359 | case 'C': | ||
| 1360 | NUM_COPY_ENGINES = atoi(optarg); | ||
| 1361 | assert(NUM_COPY_ENGINES == 1 || NUM_COPY_ENGINES == 2); | ||
| 1362 | break; | ||
| 1363 | case 'E': | ||
| 1364 | USE_ENGINE_LOCKS = true; | ||
| 1365 | ENGINE_LOCK_TYPE = (eEngineLockTypes)atoi(optarg); | ||
| 1366 | assert(ENGINE_LOCK_TYPE == FIFO || ENGINE_LOCK_TYPE == PRIOQ); | ||
| 1367 | break; | ||
| 1368 | case 'd': | ||
| 1369 | USE_DYNAMIC_GROUP_LOCKS = true; | ||
| 1370 | break; | ||
| 1371 | case 'G': | ||
| 1372 | GPU_SYNC_MODE = (eGpuSyncMode)atoi(optarg); | ||
| 1373 | assert(GPU_SYNC_MODE >= IKGLP_MODE && GPU_SYNC_MODE <= RGEM_MODE); | ||
| 1374 | break; | ||
| 1375 | case 'a': | ||
| 1376 | ENABLE_AFFINITY = true; | ||
| 1377 | break; | ||
| 1378 | case 'F': | ||
| 1379 | RELAX_FIFO_MAX_LEN = true; | ||
| 1380 | break; | ||
| 1381 | case 'x': | ||
| 1382 | CUDA_SYNC_MODE = SPIN; | ||
| 1383 | break; | ||
| 1384 | case 'S': | ||
| 1385 | SEND_SIZE = kbToB((size_t)atoi(optarg)); | ||
| 1386 | break; | ||
| 1387 | case 'R': | ||
| 1388 | RECV_SIZE = kbToB((size_t)atoi(optarg)); | ||
| 1389 | break; | ||
| 1390 | case 'T': | ||
| 1391 | STATE_SIZE = kbToB((size_t)atoi(optarg)); | ||
| 1392 | break; | ||
| 1393 | case 'Z': | ||
| 1394 | ENABLE_CHUNKING = true; | ||
| 1395 | CHUNK_SIZE = kbToB((size_t)atoi(optarg)); | ||
| 1396 | break; | ||
| 1397 | case 'M': | ||
| 1398 | MIGRATE_VIA_SYSMEM = true; | ||
| 1399 | break; | ||
| 1400 | case 'm': | ||
| 1401 | num_gpu_users = atoi(optarg); | ||
| 1402 | assert(num_gpu_users > 0); | ||
| 1403 | break; | ||
| 1404 | case 'b': | ||
| 1405 | budget_ms = atoi(optarg); | ||
| 1406 | break; | ||
| 1407 | case 'N': | ||
| 1408 | scheduler = LINUX; | ||
| 1409 | break; | ||
| 1410 | case 'I': | ||
| 1411 | scheduler = RT_LINUX; | ||
| 1412 | break; | ||
| 1413 | case 'q': | ||
| 1414 | priority = atoi(optarg); | ||
| 1415 | break; | ||
| 1416 | case 'c': | ||
| 1417 | cls = str2class(optarg); | ||
| 1418 | if (cls == -1) | ||
| 1419 | usage("Unknown task class."); | ||
| 1420 | break; | ||
| 1421 | case 'e': | ||
| 1422 | want_enforcement = 1; | ||
| 1423 | break; | ||
| 1424 | case 'i': | ||
| 1425 | want_signals = 1; | ||
| 1426 | break; | ||
| 1427 | case 'l': | ||
| 1428 | test_loop = 1; | ||
| 1429 | break; | ||
| 1430 | case 'o': | ||
| 1431 | column = atoi(optarg); | ||
| 1432 | break; | ||
| 1433 | // case 'f': | ||
| 1434 | // file = optarg; | ||
| 1435 | // break; | ||
| 1436 | case 's': | ||
| 1437 | scale = atof(optarg); | ||
| 1438 | break; | ||
| 1439 | // case 'X': | ||
| 1440 | // protocol = lock_protocol_for_name(optarg); | ||
| 1441 | // if (protocol < 0) | ||
| 1442 | // usage("Unknown locking protocol specified."); | ||
| 1443 | // break; | ||
| 1444 | // case 'L': | ||
| 1445 | // cs_length = atof(optarg); | ||
| 1446 | // if (cs_length <= 0) | ||
| 1447 | // usage("Invalid critical section length."); | ||
| 1448 | // break; | ||
| 1449 | // case 'Q': | ||
| 1450 | // resource_id = atoi(optarg); | ||
| 1451 | // if (resource_id <= 0 && strcmp(optarg, "0")) | ||
| 1452 | // usage("Invalid resource ID."); | ||
| 1453 | // break; | ||
| 1454 | case ':': | ||
| 1455 | usage("Argument missing."); | ||
| 1456 | break; | ||
| 1457 | case '?': | ||
| 1458 | default: | ||
| 1459 | usage("Bad argument."); | ||
| 1460 | break; | ||
| 1461 | } | ||
| 1462 | } | ||
| 1463 | |||
| 1464 | #ifdef VANILLA_LINUX | ||
| 1465 | assert(scheduler != LITMUS); | ||
| 1466 | assert(!wait); | ||
| 1467 | #endif | ||
| 1468 | |||
| 1469 | // turn off some features to be safe | ||
| 1470 | if (scheduler != LITMUS) | ||
| 1471 | { | ||
| 1472 | RHO = 0; | ||
| 1473 | USE_ENGINE_LOCKS = false; | ||
| 1474 | USE_DYNAMIC_GROUP_LOCKS = false; | ||
| 1475 | ENABLE_AFFINITY = false; | ||
| 1476 | RELAX_FIFO_MAX_LEN = false; | ||
| 1477 | ENABLE_RT_AUX_THREADS = false; | ||
| 1478 | budget_ms = -1; | ||
| 1479 | want_enforcement = 0; | ||
| 1480 | want_signals = 0; | ||
| 1481 | |||
| 1482 | if (scheduler == RT_LINUX) | ||
| 1483 | { | ||
| 1484 | struct sched_param fifoparams; | ||
| 1485 | |||
| 1486 | assert(priority >= sched_get_priority_min(SCHED_FIFO) && | ||
| 1487 | priority <= sched_get_priority_max(SCHED_FIFO)); | ||
| 1488 | |||
| 1489 | memset(&fifoparams, 0, sizeof(fifoparams)); | ||
| 1490 | fifoparams.sched_priority = priority; | ||
| 1491 | if (0 != sched_setscheduler(getpid(), SCHED_FIFO, &fifoparams)) bail_out("sched_setscheduler()"); | ||
| 1492 | } | ||
| 1493 | } | ||
| 1494 | else | ||
| 1495 | { | ||
| 1496 | if (!litmus_is_valid_fixed_prio(priority)) | ||
| 1497 | usage("Invalid priority."); | ||
| 1498 | } | ||
| 1499 | |||
| 1500 | if (test_loop) { | ||
| 1501 | debug_delay_loop(); | ||
| 1502 | return 0; | ||
| 1503 | } | ||
| 1504 | |||
| 1505 | srand48(getpid()); | ||
| 1506 | |||
| 1507 | if (file) { | ||
| 1508 | get_exec_times(file, column, &num_jobs, &exec_times); | ||
| 1509 | |||
| 1510 | if (argc - optind < 2) | ||
| 1511 | usage("Arguments missing."); | ||
| 1512 | |||
| 1513 | for (cur_job = 0; cur_job < num_jobs; ++cur_job) { | ||
| 1514 | /* convert the execution time to seconds */ | ||
| 1515 | duration += exec_times[cur_job] * 0.001; | ||
| 1516 | } | ||
| 1517 | } else { | ||
| 1518 | /* | ||
| 1519 | * if we're not reading from the CSV file, then we need | ||
| 1520 | * three parameters | ||
| 1521 | */ | ||
| 1522 | if (argc - optind < 3) | ||
| 1523 | usage("Arguments missing."); | ||
| 1524 | } | ||
| 1525 | |||
| 1526 | if (argc - optind == 3) { | ||
| 1527 | assert(!GPU_USING); | ||
| 1528 | wcet_ms = atof(argv[optind + 0]); | ||
| 1529 | period_ms = atof(argv[optind + 1]); | ||
| 1530 | duration = atof(argv[optind + 2]); | ||
| 1531 | } | ||
| 1532 | else if (argc - optind == 4) { | ||
| 1533 | assert(GPU_USING); | ||
| 1534 | wcet_ms = atof(argv[optind + 0]); | ||
| 1535 | gpu_wcet_ms = atof(argv[optind + 1]); | ||
| 1536 | period_ms = atof(argv[optind + 2]); | ||
| 1537 | duration = atof(argv[optind + 3]); | ||
| 1538 | } | ||
| 1539 | |||
| 1540 | wcet = ms2ns(wcet_ms); | ||
| 1541 | period = ms2ns(period_ms); | ||
| 1542 | if (wcet <= 0) | ||
| 1543 | usage("The worst-case execution time must be a " | ||
| 1544 | "positive number."); | ||
| 1545 | if (period <= 0) | ||
| 1546 | usage("The period must be a positive number."); | ||
| 1547 | if (!file && wcet > period) { | ||
| 1548 | usage("The worst-case execution time must not " | ||
| 1549 | "exceed the period."); | ||
| 1550 | } | ||
| 1551 | if (GPU_USING && gpu_wcet_ms <= 0) | ||
| 1552 | usage("The worst-case gpu execution time must be a positive number."); | ||
| 1553 | |||
| 1554 | if (budget_ms > 0) | ||
| 1555 | budget = ms2ns(budget_ms); | ||
| 1556 | else | ||
| 1557 | budget = wcet; | ||
| 1558 | |||
| 1559 | if (file && num_jobs > 1) | ||
| 1560 | duration += period_ms * 0.001 * (num_jobs - 1); | ||
| 1561 | |||
| 1562 | if (migrate) { | ||
| 1563 | ret = be_migrate_to_cluster(cluster, cluster_size); | ||
| 1564 | if (ret < 0) | ||
| 1565 | bail_out("could not migrate to target partition or cluster."); | ||
| 1566 | } | ||
| 1567 | |||
| 1568 | if (scheduler != LITMUS) | ||
| 1569 | { | ||
| 1570 | // set some variables needed by linux modes | ||
| 1571 | if (GPU_USING) | ||
| 1572 | { | ||
| 1573 | TRACE_MIGRATIONS = true; | ||
| 1574 | } | ||
| 1575 | periodTime.tv_sec = period / s2ns(1); | ||
| 1576 | periodTime.tv_nsec = period - periodTime.tv_sec * s2ns(1); | ||
| 1577 | period_ns = period; | ||
| 1578 | } | ||
| 1579 | |||
| 1580 | init_rt_task_param(¶m); | ||
| 1581 | param.exec_cost = budget; | ||
| 1582 | param.period = period; | ||
| 1583 | param.priority = priority; | ||
| 1584 | param.cls = cls; | ||
| 1585 | param.budget_policy = (want_enforcement) ? | ||
| 1586 | PRECISE_ENFORCEMENT : NO_ENFORCEMENT; | ||
| 1587 | param.budget_signal_policy = (want_enforcement && want_signals) ? | ||
| 1588 | PRECISE_SIGNALS : NO_SIGNALS; | ||
| 1589 | param.release_policy = PERIODIC; | ||
| 1590 | |||
| 1591 | if (migrate) | ||
| 1592 | param.cpu = cluster_to_first_cpu(cluster, cluster_size); | ||
| 1593 | ret = set_rt_task_param(gettid(), ¶m); | ||
| 1594 | if (ret < 0) | ||
| 1595 | bail_out("could not setup rt task params"); | ||
| 1596 | |||
| 1597 | if (scheduler == LITMUS) | ||
| 1598 | init_litmus(); | ||
| 1599 | else | ||
| 1600 | init_linux(); | ||
| 1601 | |||
| 1602 | if (want_signals) { | ||
| 1603 | /* bind default longjmp signal handler to SIG_BUDGET. */ | ||
| 1604 | activate_litmus_signals(SIG_BUDGET_MASK, longjmp_on_litmus_signal); | ||
| 1605 | } | ||
| 1606 | |||
| 1607 | if (scheduler == LITMUS) | ||
| 1608 | { | ||
| 1609 | ret = task_mode(LITMUS_RT_TASK); | ||
| 1610 | if (ret != 0) | ||
| 1611 | bail_out("could not become RT task"); | ||
| 1612 | } | ||
| 1613 | else | ||
| 1614 | { | ||
| 1615 | trace_name(); | ||
| 1616 | trace_param(); | ||
| 1617 | } | ||
| 1618 | |||
| 1619 | // if (protocol >= 0) { | ||
| 1620 | // /* open reference to semaphore */ | ||
| 1621 | // lock_od = litmus_open_lock(protocol, resource_id, lock_namespace, &cluster); | ||
| 1622 | // if (lock_od < 0) { | ||
| 1623 | // perror("litmus_open_lock"); | ||
| 1624 | // usage("Could not open lock."); | ||
| 1625 | // } | ||
| 1626 | // } | ||
| 1627 | |||
| 1628 | if (GPU_USING) { | ||
| 1629 | allocate_locks(num_gpu_users, scheduler != LITMUS); | ||
| 1630 | |||
| 1631 | signal(SIGABRT, catch_exit); | ||
| 1632 | signal(SIGTERM, catch_exit); | ||
| 1633 | signal(SIGQUIT, catch_exit); | ||
| 1634 | signal(SIGSEGV, catch_exit); | ||
| 1635 | |||
| 1636 | init_cuda(num_gpu_users); | ||
| 1637 | safetynet = true; | ||
| 1638 | |||
| 1639 | if (ENABLE_RT_AUX_THREADS) | ||
| 1640 | if (enable_aux_rt_tasks(AUX_CURRENT | AUX_FUTURE) != 0) | ||
| 1641 | bail_out("enable_aux_rt_tasks() failed"); | ||
| 1642 | } | ||
| 1643 | |||
| 1644 | if (wait) { | ||
| 1645 | ret = wait_for_ts_release2(&releaseTime); | ||
| 1646 | if (ret != 0) | ||
| 1647 | bail_out("wait_for_ts_release2()"); | ||
| 1648 | |||
| 1649 | if (scheduler != LITMUS) | ||
| 1650 | log_release(); | ||
| 1651 | } | ||
| 1652 | else if (scheduler != LITMUS) | ||
| 1653 | { | ||
| 1654 | clock_gettime(CLOCK_MONOTONIC, &releaseTime); | ||
| 1655 | sleep_next_period_linux(); | ||
| 1656 | } | ||
| 1657 | |||
| 1658 | start = wctime(); | ||
| 1659 | |||
| 1660 | if (scheduler == LITMUS) | ||
| 1661 | { | ||
| 1662 | if (!GPU_USING) { | ||
| 1663 | while (job(wcet_ms * 0.001 * scale, start + duration)); | ||
| 1664 | } | ||
| 1665 | else { | ||
| 1666 | while (gpu_job(wcet_ms * 0.001 * scale, | ||
| 1667 | gpu_wcet_ms * 0.001 * scale, | ||
| 1668 | start + duration)); | ||
| 1669 | } | ||
| 1670 | } | ||
| 1671 | else | ||
| 1672 | { | ||
| 1673 | if (!GPU_USING) { | ||
| 1674 | while (job_linux(wcet_ms * 0.001 * scale, start + duration)); | ||
| 1675 | } | ||
| 1676 | else { | ||
| 1677 | while (gpu_job_linux(wcet_ms * 0.001 * scale, | ||
| 1678 | gpu_wcet_ms * 0.001 * scale, | ||
| 1679 | start + duration)); | ||
| 1680 | } | ||
| 1681 | } | ||
| 1682 | |||
| 1683 | if (GPU_USING && ENABLE_RT_AUX_THREADS) | ||
| 1684 | if (disable_aux_rt_tasks(AUX_CURRENT | AUX_FUTURE) != 0) | ||
| 1685 | bail_out("disable_aux_rt_tasks() failed"); | ||
| 1686 | |||
| 1687 | // if (file) { | ||
| 1688 | // /* use times read from the CSV file */ | ||
| 1689 | // for (cur_job = 0; cur_job < num_jobs; ++cur_job) { | ||
| 1690 | // /* convert job's length to seconds */ | ||
| 1691 | // job(exec_times[cur_job] * 0.001 * scale, | ||
| 1692 | // start + duration, | ||
| 1693 | // lock_od, cs_length * 0.001); | ||
| 1694 | // } | ||
| 1695 | // } else { | ||
| 1696 | // /* convert to seconds and scale */ | ||
| 1697 | // while (job(wcet_ms * 0.001 * scale, start + duration, | ||
| 1698 | // lock_od, cs_length * 0.001)); | ||
| 1699 | // } | ||
| 1700 | |||
| 1701 | if (scheduler == LITMUS) | ||
| 1702 | { | ||
| 1703 | ret = task_mode(BACKGROUND_TASK); | ||
| 1704 | if (ret != 0) | ||
| 1705 | bail_out("could not become regular task (huh?)"); | ||
| 1706 | } | ||
| 1707 | |||
| 1708 | if (GPU_USING) { | ||
| 1709 | safetynet = false; | ||
| 1710 | exit_cuda(); | ||
| 1711 | |||
| 1712 | |||
| 1713 | printf("avg: %f\n", ms_sum/gpucount); | ||
| 1714 | } | ||
| 1715 | |||
| 1716 | if (file) | ||
| 1717 | free(exec_times); | ||
| 1718 | |||
| 1719 | return 0; | ||
| 1720 | } | ||
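Putting the options together, here are two hypothetical invocations (the binary name and all parameter values are illustrative). A CPU-only task takes three positional arguments; a GPU-using task (`-g`) takes four, with the GPU execution time inserted between WCET and period:

```
# CPU-only: 10 ms WCET, 100 ms period, run for 30 s, on partition 0 of size 1
./gpuspin -p 0 -z 1 10 100 30

# GPU-using: GPU partition 0 of size 2, rho = 1, 2 GPU users;
# 10 ms CPU + 5 ms GPU per job, 100 ms period, run for 30 s
./gpuspin -g 0 -y 2 -r 1 -m 2 10 5 100 30
```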
diff --git a/gpu/rtspin_fake_cuda.cpp b/gpu/rtspin_fake_cuda.cpp
deleted file mode 100644
index 247a74c..0000000
--- a/gpu/rtspin_fake_cuda.cpp
+++ /dev/null
| @@ -1,1187 +0,0 @@ | |||
| 1 | #include <sys/time.h> | ||
| 2 | |||
| 3 | #include <stdint.h> | ||
| 4 | #include <stdio.h> | ||
| 5 | #include <stdlib.h> | ||
| 6 | #include <unistd.h> | ||
| 7 | #include <time.h> | ||
| 8 | #include <assert.h> | ||
| 9 | #include <fcntl.h> | ||
| 10 | #include <errno.h> | ||
| 11 | |||
| 12 | #include <blitz/array.h> | ||
| 13 | |||
| 14 | #include <boost/interprocess/managed_shared_memory.hpp> | ||
| 15 | #include <boost/interprocess/sync/interprocess_barrier.hpp> | ||
| 16 | #include <boost/interprocess/sync/interprocess_mutex.hpp> | ||
| 17 | |||
| 18 | #include "litmus.h" | ||
| 19 | |||
| 20 | using namespace blitz; | ||
| 21 | using namespace std; | ||
| 22 | using namespace boost::interprocess; | ||
| 23 | |||
| 24 | #define RESET_RELEASE_ON_MISS | ||
| 25 | |||
| 26 | |||
| 27 | void bail_out(const char* msg) | ||
| 28 | { | ||
| 29 | perror(msg); | ||
| 30 | exit(-1 * errno); | ||
| 31 | } | ||
| 32 | |||
| 33 | |||
| 34 | static void usage(char *error) { | ||
| 35 | fprintf(stderr, "Error: %s\n", error); | ||
| 36 | fprintf(stderr, | ||
| 37 | "Usage:\n" | ||
| 38 | " rt_spin [COMMON-OPTS] WCET PERIOD DURATION\n" | ||
| 39 | " rt_spin [COMMON-OPTS] -f FILE [-o COLUMN] WCET PERIOD\n" | ||
| 40 | " rt_spin -l\n" | ||
| 41 | "\n" | ||
| 42 | "COMMON-OPTS = [-w] [-p PARTITION] [-c CLASS] [-s SCALE]\n" | ||
| 43 | "\n" | ||
| 44 | "WCET and PERIOD are milliseconds, DURATION is seconds.\n"); | ||
| 45 | exit(EXIT_FAILURE); | ||
| 46 | } | ||
| 47 | |||
| 48 | #define NUMS 4096 | ||
| 49 | static int num[NUMS]; | ||
| 50 | |||
| 51 | #define PAGE_SIZE (1024*4) | ||
| 52 | |||
| 53 | bool ENABLE_WAIT = true; | ||
| 54 | bool GPU_TASK = false; | ||
| 55 | bool ENABLE_AFFINITY = false; | ||
| 56 | bool USE_KFMLP = false; | ||
| 57 | bool RELAX_FIFO_MAX_LEN = false; | ||
| 58 | bool USE_DYNAMIC_GROUP_LOCKS = false; | ||
| 59 | bool BROADCAST_STATE = false; | ||
| 60 | bool ENABLE_CHUNKING = false; | ||
| 61 | bool MIGRATE_VIA_SYSMEM = false; | ||
| 62 | bool USE_PRIOQ = false; | ||
| 63 | |||
| 64 | int GPU_PARTITION = 0; | ||
| 65 | int GPU_PARTITION_SIZE = 0; | ||
| 66 | int NUM_SIMULT_USERS = 1; | ||
| 67 | size_t SEND_SIZE = 0; | ||
| 68 | size_t RECV_SIZE = 0; | ||
| 69 | size_t STATE_SIZE = 0; | ||
| 70 | size_t CHUNK_SIZE = PAGE_SIZE; | ||
| 71 | |||
| 72 | |||
| 73 | #define MAX_GPUS 8 | ||
| 74 | |||
| 75 | int KEXCLU_LOCK; | ||
| 76 | int EE_LOCKS[MAX_GPUS]; | ||
| 77 | int CE_SEND_LOCKS[MAX_GPUS]; | ||
| 78 | int CE_RECV_LOCKS[MAX_GPUS]; | ||
| 79 | |||
| 80 | int CUR_DEVICE = -1; | ||
| 81 | int LAST_DEVICE = -1; | ||
| 82 | |||
| 83 | bool useEngineLocks() | ||
| 84 | { | ||
| 85 | return(NUM_SIMULT_USERS != 1); | ||
| 86 | } | ||
| 87 | |||
| 88 | int gpuCyclesPerSecond = 0; | ||
| 89 | |||
| 90 | uint64_t *init_release_time = NULL; | ||
| 91 | barrier *release_barrier = NULL; | ||
| 92 | barrier *gpu_barrier = NULL; | ||
| 93 | interprocess_mutex *gpu_mgmt_mutexes = NULL; | ||
| 94 | managed_shared_memory *segment_ptr = NULL; | ||
| 95 | managed_shared_memory *release_segment_ptr = NULL; | ||
| 96 | |||
| 97 | // observed average copy rate when four GPUs on the same node are in use, copying from page-locked memory. | ||
| 98 | // roughly 1/3 to 1/4 of this rate when there is no bus contention. | ||
| 99 | //const double msPerByte = 4.22e-07; | ||
| 100 | //const double transOverhead = 0.01008; // also observed. | ||
| 101 | |||
| 102 | |||
| 103 | |||
| 104 | char *d_send_data[MAX_GPUS] = {0}; | ||
| 105 | char *d_recv_data[MAX_GPUS] = {0}; | ||
| 106 | char *d_state_data[MAX_GPUS] = {0}; | ||
| 107 | |||
| 108 | //cudaStream_t streams[MAX_GPUS]; | ||
| 109 | |||
| 110 | char *h_send_data = 0; | ||
| 111 | char *h_recv_data = 0; | ||
| 112 | char *h_state_data = 0; | ||
| 113 | |||
| 114 | |||
| 115 | #include <sys/mman.h> | ||
| 116 | #define USE_PAGE_LOCKED_MEMORY | ||
| 117 | #ifdef USE_PAGE_LOCKED_MEMORY | ||
| 118 | #define c_malloc(s) \ | ||
| 119 | mmap(NULL, s , \ | ||
| 120 | PROT_READ | PROT_WRITE, \ | ||
| 121 | MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, \ | ||
| 122 | -1, 0) | ||
| 123 | #else | ||
| 124 | #define c_malloc(s) malloc(s) | ||
| 125 | #endif | ||
| 126 | |||
| 127 | typedef int cudaError_t; | ||
| 128 | #define cudaSuccess 0 | ||
| 129 | |||
| 130 | enum cudaMemcpyKind { | ||
| 131 | cudaMemcpyHostToDevice = 0, | ||
| 132 | cudaMemcpyDeviceToHost = 1, | ||
| 133 | cudaMemcpyDeviceToDevice = 2, | ||
| 134 | }; | ||
| 135 | |||
| 136 | cudaError_t cudaGetLastError() | ||
| 137 | { | ||
| 138 | return cudaSuccess; | ||
| 139 | } | ||
| 140 | |||
| 141 | //////////////////////////////////////////////////////////////////////// | ||
| 142 | //////////////////////////////////////////////////////////////////////// | ||
| 143 | //////////////////////////////////////////////////////////////////////// | ||
| 144 | //////////////////////////////////////////////////////////////////////// | ||
| 145 | |||
| 146 | struct ce_lock_state | ||
| 147 | { | ||
| 148 | int locks[2]; | ||
| 149 | size_t num_locks; | ||
| 150 | size_t budget_remaining; | ||
| 151 | bool locked; | ||
| 152 | |||
| 153 | ce_lock_state(int device_a, enum cudaMemcpyKind kind, size_t size, int device_b = -1) { | ||
| 154 | num_locks = (device_a != -1) + (device_b != -1); | ||
| 155 | |||
| 156 | if(device_a != -1) { | ||
| 157 | locks[0] = (kind == cudaMemcpyHostToDevice) ? | ||
| 158 | CE_SEND_LOCKS[device_a] : CE_RECV_LOCKS[device_a]; | ||
| 159 | } | ||
| 160 | |||
| 161 | if(device_b != -1) { | ||
| 162 | assert(kind == cudaMemcpyDeviceToDevice); | ||
| 163 | |||
| 164 | locks[1] = CE_RECV_LOCKS[device_b]; | ||
| 165 | |||
| 166 | if(locks[1] < locks[0]) { | ||
| 167 | int temp = locks[1]; | ||
| 168 | locks[1] = locks[0]; | ||
| 169 | locks[0] = temp; | ||
| 170 | } | ||
| 171 | } | ||
| 172 | |||
| 173 | if(!ENABLE_CHUNKING) | ||
| 174 | budget_remaining = size; | ||
| 175 | else | ||
| 176 | budget_remaining = CHUNK_SIZE; | ||
| 177 | } | ||
| 178 | |||
| 179 | void lock() { | ||
| 180 | if(USE_DYNAMIC_GROUP_LOCKS) { | ||
| 181 | litmus_dgl_lock(locks, num_locks); | ||
| 182 | } | ||
| 183 | else | ||
| 184 | { | ||
| 185 | for(int l = 0; l < num_locks; ++l) | ||
| 186 | { | ||
| 187 | litmus_lock(locks[l]); | ||
| 188 | } | ||
| 189 | } | ||
| 190 | locked = true; | ||
| 191 | } | ||
| 192 | |||
| 193 | void unlock() { | ||
| 194 | if(USE_DYNAMIC_GROUP_LOCKS) { | ||
| 195 | litmus_dgl_unlock(locks, num_locks); | ||
| 196 | } | ||
| 197 | else | ||
| 198 | { | ||
| 199 | // reverse order | ||
| 200 | for(int l = num_locks - 1; l >= 0; --l) | ||
| 201 | { | ||
| 202 | litmus_unlock(locks[l]); | ||
| 203 | } | ||
| 204 | } | ||
| 205 | locked = false; | ||
| 206 | } | ||
| 207 | |||
| 208 | void refresh() { | ||
| 209 | budget_remaining = CHUNK_SIZE; | ||
| 210 | } | ||
| 211 | |||
| 212 | bool budgetIsExhausted(size_t tosend) { | ||
| 213 | return(tosend >= budget_remaining); | ||
| 214 | } | ||
| 215 | |||
| 216 | void decreaseBudget(size_t spent) { | ||
| 217 | budget_remaining -= spent; | ||
| 218 | } | ||
| 219 | }; | ||
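Note how the constructor sorts the two lock descriptors for device-to-device copies: every task then acquires the pair in ascending order, the standard total-order approach to deadlock avoidance when dynamic group locks are not in use. A minimal sketch of the idea (the function name is hypothetical; `litmus_lock()` comes from litmus.h):

```cpp
#include <algorithm>

/* Acquire two LITMUS^RT locks in a globally consistent (ascending) order,
 * so that no two tasks can each hold one lock while waiting for the other. */
static void lock_pair_ordered(int a, int b)
{
	if (b < a)
		std::swap(a, b);
	litmus_lock(a);
	litmus_lock(b);
}
```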
| 220 | |||
| 221 | // precondition: if do_locking == true, locks in state are held. | ||
| 222 | cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count, | ||
| 223 | enum cudaMemcpyKind kind, | ||
| 224 | ce_lock_state* state) | ||
| 225 | { | ||
| 226 | cudaError_t ret = cudaSuccess; | ||
| 227 | int remaining = count; | ||
| 228 | |||
| 229 | char* dst = (char*)a_dst; | ||
| 230 | const char* src = (const char*)a_src; | ||
| 231 | |||
| 232 | // disable chunking, if needed, by setting chunk_size equal to the | ||
| 233 | // amount of data to be copied. | ||
| 234 | int chunk_size = (ENABLE_CHUNKING) ? CHUNK_SIZE : count; | ||
| 235 | int i = 0; | ||
| 236 | |||
| 237 | while(remaining != 0) | ||
| 238 | { | ||
| 239 | int bytesToCopy = std::min(remaining, chunk_size); | ||
| 240 | |||
| 241 | if(state && state->budgetIsExhausted(bytesToCopy) && state->locked) { | ||
| 242 | //cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) ); | ||
| 243 | ret = cudaGetLastError(); | ||
| 244 | |||
| 245 | if(ret != cudaSuccess) | ||
| 246 | { | ||
| 247 | break; | ||
| 248 | } | ||
| 249 | |||
| 250 | state->unlock(); | ||
| 251 | state->refresh(); // replenish. | ||
| 252 | // we can only run out of | ||
| 253 | // budget if chunking is enabled. | ||
| 254 | // we presume that init budget would | ||
| 255 | // be set to cover entire memcpy | ||
| 256 | // if chunking were disabled. | ||
| 257 | } | ||
| 258 | |||
| 259 | if(state && !state->locked) { | ||
| 260 | state->lock(); | ||
| 261 | } | ||
| 262 | |||
| 263 | //ret = cudaMemcpy(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind); | ||
| 264 | //cudaMemcpyAsync(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind, streams[CUR_DEVICE]); | ||
| 265 | |||
| 266 | if(state) { | ||
| 267 | state->decreaseBudget(bytesToCopy); | ||
| 268 | } | ||
| 269 | |||
| 270 | // if(ret != cudaSuccess) | ||
| 271 | // { | ||
| 272 | // break; | ||
| 273 | // } | ||
| 274 | |||
| 275 | ++i; | ||
| 276 | remaining -= bytesToCopy; | ||
| 277 | } | ||
| 278 | return ret; | ||
| 279 | } | ||
| 280 | |||
| 281 | cudaError_t chunkMemcpy(void* a_dst, const void* a_src, size_t count, | ||
| 282 | enum cudaMemcpyKind kind, | ||
| 283 | int device_a = -1, // device_a == -1 disables locking | ||
| 284 | bool do_locking = true, | ||
| 285 | int device_b = -1) | ||
| 286 | { | ||
| 287 | cudaError_t ret; | ||
| 288 | if(!do_locking || device_a == -1) { | ||
| 289 | ret = __chunkMemcpy(a_dst, a_src, count, kind, NULL); | ||
| 290 | //cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) ); | ||
| 291 | if(ret == cudaSuccess) | ||
| 292 | ret = cudaGetLastError(); | ||
| 293 | } | ||
| 294 | else { | ||
| 295 | ce_lock_state state(device_a, kind, count, device_b); | ||
| 296 | state.lock(); | ||
| 297 | ret = __chunkMemcpy(a_dst, a_src, count, kind, &state); | ||
| 298 | //cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) ); | ||
| 299 | if(ret == cudaSuccess) | ||
| 300 | ret = cudaGetLastError(); | ||
| 301 | state.unlock(); | ||
| 302 | } | ||
| 303 | return ret; | ||
| 304 | } | ||
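A hypothetical call, mirroring the send path in `gpu_loop_for()` below: copy `SEND_SIZE` bytes to device 0 in `CHUNK_SIZE` pieces, cycling the copy-engine lock between chunks whenever engine locks are in use.

```cpp
cudaError_t err = chunkMemcpy(d_send_data[0], h_send_data, SEND_SIZE,
                              cudaMemcpyHostToDevice,
                              0 /* device_a; -1 would disable locking */,
                              useEngineLocks());
if (err != cudaSuccess)
	bail_out("chunkMemcpy failed");
```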
| 305 | |||
| 306 | |||
| 307 | //////////////////////////////////////////////////////////////////////// | ||
| 308 | //////////////////////////////////////////////////////////////////////// | ||
| 309 | //////////////////////////////////////////////////////////////////////// | ||
| 310 | |||
| 311 | |||
| 312 | inline uint64_t timespec_to_ns(const struct timespec& t) | ||
| 313 | { | ||
| 314 | return(t.tv_sec * 1000000000ull + t.tv_nsec); | ||
| 315 | } | ||
| 316 | |||
| 317 | inline struct timespec ns_to_timespec(const uint64_t& ns) | ||
| 318 | { | ||
| 319 | struct timespec temp = {(time_t)(ns / 1000000000ull), (long)(ns % 1000000000ull)}; | ||
| 320 | return(temp); | ||
| 321 | } | ||
| 322 | |||
| 323 | inline uint64_t clock_gettime_ns(clockid_t clk_id) | ||
| 324 | { | ||
| 325 | struct timespec temp; | ||
| 326 | clock_gettime(clk_id, &temp); | ||
| 327 | return timespec_to_ns(temp); | ||
| 328 | } | ||
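With integer arithmetic in the conversions above, nanosecond timestamps round-trip exactly through `struct timespec`:

```cpp
#include <assert.h>

uint64_t ns = 1500000000ULL;               /* 1.5 s */
struct timespec ts = ns_to_timespec(ns);   /* {1, 500000000} */
assert(timespec_to_ns(ts) == ns);          /* lossless round trip */
```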
| 329 | |||
| 330 | |||
| 331 | |||
| 332 | static int loop_once(void) | ||
| 333 | { | ||
| 334 | int i, j = 0; | ||
| 335 | for (i = 0; i < NUMS; i++) | ||
| 336 | j += num[i]++; | ||
| 337 | return j; | ||
| 338 | } | ||
| 339 | |||
| 340 | static int loop_for(double exec_time, double emergency_exit) | ||
| 341 | { | ||
| 342 | double last_loop = 0, loop_start; | ||
| 343 | int tmp = 0; | ||
| 344 | |||
| 345 | double start = cputime(); | ||
| 346 | double now = cputime(); | ||
| 347 | |||
| 348 | while (now + last_loop < start + exec_time) { | ||
| 349 | loop_start = now; | ||
| 350 | tmp += loop_once(); | ||
| 351 | now = cputime(); | ||
| 352 | last_loop = now - loop_start; | ||
| 353 | if (emergency_exit && wctime() > emergency_exit) { | ||
| 354 | /* Oops --- this should only be possible if the execution time tracking | ||
| 355 | * is broken in the LITMUS^RT kernel. */ | ||
| 356 | fprintf(stderr, "!!! rtspin/%d emergency exit!\n", getpid()); | ||
| 357 | fprintf(stderr, "Something is seriously wrong! Do not ignore this.\n"); | ||
| 358 | break; | ||
| 359 | } | ||
| 360 | } | ||
| 361 | |||
| 362 | return tmp; | ||
| 363 | } | ||
| 364 | |||
| 365 | static void allocate_locks() | ||
| 366 | { | ||
| 367 | // allocate k-FMLP lock | ||
| 368 | int fd = open("semaphores", O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR); | ||
| 369 | |||
| 370 | int base_name = GPU_PARTITION * 1000; | ||
| 371 | |||
| 372 | if(USE_KFMLP) { | ||
| 373 | KEXCLU_LOCK = open_kfmlp_gpu_sem(fd, | ||
| 374 | base_name, /* name */ | ||
| 375 | GPU_PARTITION_SIZE, | ||
| 376 | GPU_PARTITION*GPU_PARTITION_SIZE, | ||
| 377 | NUM_SIMULT_USERS, | ||
| 378 | ENABLE_AFFINITY | ||
| 379 | ); | ||
| 380 | } | ||
| 381 | else { | ||
| 382 | KEXCLU_LOCK = open_gpusync_token_lock(fd, | ||
| 383 | base_name, /* name */ | ||
| 384 | GPU_PARTITION_SIZE, | ||
| 385 | GPU_PARTITION*GPU_PARTITION_SIZE, | ||
| 386 | NUM_SIMULT_USERS, | ||
| 387 | IKGLP_M_IN_FIFOS, | ||
| 388 | (!RELAX_FIFO_MAX_LEN) ? | ||
| 389 | IKGLP_OPTIMAL_FIFO_LEN : | ||
| 390 | IKGLP_UNLIMITED_FIFO_LEN, | ||
| 391 | ENABLE_AFFINITY | ||
| 392 | ); | ||
| 393 | // KEXCLU_LOCK = open_ikglp_gpu_sem(fd, | ||
| 394 | // base_name, /* name */ | ||
| 395 | // GPU_PARTITION_SIZE, | ||
| 396 | // GPU_PARTITION*GPU_PARTITION_SIZE, | ||
| 397 | // NUM_SIMULT_USERS, | ||
| 398 | // ENABLE_AFFINITY, | ||
| 399 | // RELAX_FIFO_MAX_LEN | ||
| 400 | // ); | ||
| 401 | } | ||
| 402 | if(KEXCLU_LOCK < 0) | ||
| 403 | perror("open_kexclu_sem"); | ||
| 404 | |||
| 405 | if(NUM_SIMULT_USERS > 1) | ||
| 406 | { | ||
| 407 | open_sem_t opensem = (!USE_PRIOQ) ? open_fifo_sem : open_prioq_sem; | ||
| 408 | const char* opensem_label = (!USE_PRIOQ) ? "open_fifo_sem" : "open_prioq_sem"; | ||
| 409 | |||
| 410 | // allocate the engine locks. | ||
| 411 | for (int i = 0; i < MAX_GPUS; ++i) | ||
| 412 | { | ||
| 413 | EE_LOCKS[i] = opensem(fd, (i+1)*10 + base_name); | ||
| 414 | if(EE_LOCKS[i] < 0) | ||
| 415 | perror(opensem_label); | ||
| 416 | |||
| 417 | CE_SEND_LOCKS[i] = opensem(fd, (i+1)*10 + base_name + 1); | ||
| 418 | if(CE_SEND_LOCKS[i] < 0) | ||
| 419 | perror(opensem_label); | ||
| 420 | |||
| 421 | if(NUM_SIMULT_USERS == 3) | ||
| 422 | { | ||
| 423 | // allocate a separate lock for the second copy engine | ||
| 424 | CE_RECV_LOCKS[i] = opensem(fd, (i+1)*10 + base_name + 2); | ||
| 425 | if(CE_RECV_LOCKS[i] < 0) | ||
| 426 | perror(opensem_label); | ||
| 427 | } | ||
| 428 | else | ||
| 429 | { | ||
| 430 | // share a single lock for the single copy engine | ||
| 431 | CE_RECV_LOCKS[i] = CE_SEND_LOCKS[i]; | ||
| 432 | } | ||
| 433 | } | ||
| 434 | } | ||
| 435 | } | ||
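The lock names form a simple per-partition namespace: `base_name = GPU_PARTITION * 1000`, then for device `i` the execution-engine lock sits at `base_name + (i+1)*10`, the send copy-engine lock one above it, and (with three simultaneous users) the receive lock one above that. A self-contained sketch printing the layout for `GPU_PARTITION = 1`:

```cpp
#include <cstdio>

/* Name layout for GPU_PARTITION = 1, mirroring allocate_locks() above:
 * device i: EE = base + (i+1)*10, CE send = EE + 1, CE recv = EE + 2
 * (the recv name is only allocated with three simultaneous users). */
int main(void)
{
	const int base_name = 1 * 1000;
	for (int i = 0; i < 8; ++i)  /* 8 == MAX_GPUS */
		printf("dev %d: EE=%d send=%d recv=%d\n", i,
		       (i+1)*10 + base_name,
		       (i+1)*10 + base_name + 1,
		       (i+1)*10 + base_name + 2);
	return 0;
}
```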
| 436 | |||
| 437 | static void allocate_host_memory() | ||
| 438 | { | ||
| 439 | // round up to page boundaries | ||
| 440 | size_t send_alloc_bytes = SEND_SIZE + (SEND_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; | ||
| 441 | size_t recv_alloc_bytes = RECV_SIZE + (RECV_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; | ||
| 442 | size_t state_alloc_bytes = STATE_SIZE + (STATE_SIZE%PAGE_SIZE != 0)*PAGE_SIZE; | ||
| 443 | |||
| 444 | printf("Allocating host memory. send = %dB, recv = %dB, state = %dB\n", | ||
| 445 | send_alloc_bytes, recv_alloc_bytes, state_alloc_bytes); | ||
| 446 | |||
| 447 | // if(send_alloc_bytes > 0) | ||
| 448 | // { | ||
| 449 | // h_send_data = (char *)c_malloc(send_alloc_bytes); | ||
| 450 | // memset(h_send_data, 0x55, send_alloc_bytes); // write some random value | ||
| 451 | // // this will open a connection to GPU 0 if there is no active context, so | ||
| 452 | // // expect long stalls. LAME. | ||
| 453 | // cutilSafeCall( cudaHostRegister(h_send_data, send_alloc_bytes, cudaHostRegisterPortable) ); | ||
| 454 | // } | ||
| 455 | // | ||
| 456 | // if(recv_alloc_bytes > 0) | ||
| 457 | // { | ||
| 458 | // h_recv_data = (char *)c_malloc(recv_alloc_bytes); | ||
| 459 | // memset(h_recv_data, 0xAA, recv_alloc_bytes); | ||
| 460 | // cutilSafeCall( cudaHostRegister(h_recv_data, recv_alloc_bytes, cudaHostRegisterPortable) ); | ||
| 461 | // } | ||
| 462 | // | ||
| 463 | // if(state_alloc_bytes > 0) | ||
| 464 | // { | ||
| 465 | // h_state_data = (char *)c_malloc(state_alloc_bytes); | ||
| 466 | // memset(h_state_data, 0xCC, state_alloc_bytes); // write some random value | ||
| 467 | // cutilSafeCall( cudaHostRegister(h_state_data, state_alloc_bytes, cudaHostRegisterPortable) ); | ||
| 468 | // } | ||
| 469 | |||
| 470 | printf("Host memory allocated.\n"); | ||
| 471 | } | ||
| 472 | |||
| 473 | static void allocate_device_memory() | ||
| 474 | { | ||
| 475 | printf("Allocating device memory.\n"); | ||
| 476 | // establish a connection to each GPU. | ||
| 477 | // for(int i = 0; i < GPU_PARTITION_SIZE; ++i) | ||
| 478 | // { | ||
| 479 | // int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i; | ||
| 480 | // | ||
| 481 | // if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].lock(); | ||
| 482 | // | ||
| 483 | // cutilSafeCall( cudaSetDevice(which_device) ); | ||
| 484 | // cutilSafeCall( cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0) ); | ||
| 485 | // cutilSafeCall( cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0) ); | ||
| 486 | // | ||
| 487 | // cutilSafeCall( cudaStreamCreate(&streams[which_device]) ); | ||
| 488 | // | ||
| 489 | // /* pre-allocate memory, pray there's enough to go around */ | ||
| 490 | // if(SEND_SIZE > 0) { | ||
| 491 | // cutilSafeCall( cudaMalloc((void**)&d_send_data[which_device], SEND_SIZE) ); | ||
| 492 | // } | ||
| 493 | // if(RECV_SIZE > 0) { | ||
| 494 | // cutilSafeCall( cudaMalloc((void**)&d_recv_data[which_device], RECV_SIZE) ); | ||
| 495 | // } | ||
| 496 | // if(STATE_SIZE > 0) { | ||
| 497 | // cutilSafeCall( cudaMalloc((void**)&d_state_data[which_device], STATE_SIZE) ); | ||
| 498 | // } | ||
| 499 | // | ||
| 500 | // if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].unlock(); | ||
| 501 | // } | ||
| 502 | printf("Device memory allocated.\n"); | ||
| 503 | } | ||
| 504 | |||
| 505 | static void configure_gpus() | ||
| 506 | { | ||
| 507 | printf("Configuring GPU\n"); | ||
| 508 | |||
| 509 | // // SUSPEND WHEN BLOCKED!! | ||
| 510 | // cutilSafeCall( cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync) ); | ||
| 511 | // | ||
| 512 | // // establish a connection to each GPU. | ||
| 513 | // for(int i = 0; i < GPU_PARTITION_SIZE; ++i) | ||
| 514 | // { | ||
| 515 | // int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i; | ||
| 516 | // | ||
| 517 | // if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].lock(); | ||
| 518 | // | ||
| 519 | // cutilSafeCall( cudaSetDevice(which_device) ); | ||
| 520 | // cutilSafeCall( cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0) ); | ||
| 521 | // cutilSafeCall( cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0) ); | ||
| 522 | // | ||
| 523 | // cutilSafeCall( cudaStreamCreate(&streams[which_device]) ); | ||
| 524 | // | ||
| 525 | // // enable P2P migrations. | ||
| 526 | // // we assume all GPUs are on the same I/O hub. | ||
| 527 | // for(int j = 0; j < GPU_PARTITION_SIZE; ++j) | ||
| 528 | // { | ||
| 529 | // int other_device = GPU_PARTITION*GPU_PARTITION_SIZE + j; | ||
| 530 | // | ||
| 531 | // if(which_device != other_device) | ||
| 532 | // { | ||
| 533 | // cutilSafeCall( cudaDeviceEnablePeerAccess(other_device, 0) ); | ||
| 534 | // } | ||
| 535 | // } | ||
| 536 | // | ||
| 537 | // if(i == 0) | ||
| 538 | // { | ||
| 539 | // struct cudaDeviceProp pi; | ||
| 540 | // cudaGetDeviceProperties(&pi, i); | ||
| 541 | // gpuCyclesPerSecond = pi.clockRate * 1000; /* khz -> hz */ | ||
| 542 | // } | ||
| 543 | // | ||
| 544 | // if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].unlock(); | ||
| 545 | // } | ||
| 546 | |||
| 547 | printf("GPUs have been configured.\n"); | ||
| 548 | } | ||
| 549 | |||
| 550 | static void init_cuda() | ||
| 551 | { | ||
| 552 | configure_gpus(); | ||
| 553 | allocate_host_memory(); | ||
| 554 | allocate_device_memory(); | ||
| 555 | allocate_locks(); | ||
| 556 | } | ||
| 557 | |||
| 558 | static void exit_cuda() | ||
| 559 | { | ||
| 560 | for(int i = 0; i < GPU_PARTITION_SIZE; ++i) | ||
| 561 | { | ||
| 562 | int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i; | ||
| 563 | |||
| 564 | if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].lock(); | ||
| 565 | |||
| 566 | // cutilSafeCall( cudaSetDevice(which_device) ); | ||
| 567 | // cutilSafeCall( cudaDeviceReset() ); | ||
| 568 | |||
| 569 | if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].unlock(); | ||
| 570 | } | ||
| 571 | } | ||
| 572 | |||
| 573 | static void catchExit(void) | ||
| 574 | { | ||
| 575 | if(GPU_TASK) | ||
| 576 | { | ||
| 577 | // try to unlock everything. litmus will prevent bogus calls. | ||
| 578 | if(NUM_SIMULT_USERS > 1) | ||
| 579 | { | ||
| 580 | for(int i = 0; i < GPU_PARTITION_SIZE; ++i) | ||
| 581 | { | ||
| 582 | int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i; | ||
| 583 | |||
| 584 | litmus_unlock(EE_LOCKS[which_device]); | ||
| 585 | litmus_unlock(CE_SEND_LOCKS[which_device]); | ||
| 586 | if(NUM_SIMULT_USERS == 3) { // a separate CE_RECV lock exists only with 3 users | ||
| 587 | litmus_unlock(CE_RECV_LOCKS[which_device]); | ||
| 588 | } | ||
| 589 | } | ||
| 590 | } | ||
| 591 | |||
| 592 | if(CUR_DEVICE >= 0) { | ||
| 593 | unregister_nv_device(CUR_DEVICE); | ||
| 594 | } | ||
| 595 | |||
| 596 | litmus_unlock(KEXCLU_LOCK); | ||
| 597 | } | ||
| 598 | } | ||
| 599 | |||
| 600 | static void migrateToGPU(int destination) | ||
| 601 | { | ||
| 602 | if(!BROADCAST_STATE && STATE_SIZE > 0) | ||
| 603 | { | ||
| 604 | if(MIGRATE_VIA_SYSMEM) | ||
| 605 | { | ||
| 606 | chunkMemcpy(h_state_data, d_state_data[LAST_DEVICE], STATE_SIZE, | ||
| 607 | cudaMemcpyDeviceToHost, LAST_DEVICE, useEngineLocks()); | ||
| 608 | } | ||
| 609 | } | ||
| 610 | |||
| 611 | // cutilSafeCall( cudaSetDevice(destination) ); | ||
| 612 | |||
| 613 | if(!BROADCAST_STATE && STATE_SIZE > 0) | ||
| 614 | { | ||
| 615 | if(MIGRATE_VIA_SYSMEM) | ||
| 616 | { | ||
| 617 | chunkMemcpy(d_state_data[CUR_DEVICE], h_state_data, STATE_SIZE, | ||
| 618 | cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks()); | ||
| 619 | } | ||
| 620 | else | ||
| 621 | { | ||
| 622 | chunkMemcpy(d_state_data[destination], | ||
| 623 | d_state_data[LAST_DEVICE], | ||
| 624 | STATE_SIZE, | ||
| 625 | cudaMemcpyDeviceToDevice, | ||
| 626 | CUR_DEVICE, | ||
| 627 | useEngineLocks(), | ||
| 628 | destination); | ||
| 629 | } | ||
| 630 | } | ||
| 631 | } | ||
| 632 | |||
| 633 | static void broadcastState(int from) | ||
| 634 | { | ||
| 635 | if(STATE_SIZE > 0) | ||
| 636 | { | ||
| 637 | assert(CUR_DEVICE == from); | ||
| 638 | |||
| 639 | if(MIGRATE_VIA_SYSMEM) | ||
| 640 | { | ||
| 641 | chunkMemcpy(h_state_data, d_state_data[from], STATE_SIZE, | ||
| 642 | cudaMemcpyDeviceToHost, from, useEngineLocks()); | ||
| 643 | } | ||
| 644 | |||
| 645 | for(int i = 0; i < GPU_PARTITION_SIZE; ++i) | ||
| 646 | { | ||
| 647 | int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i; | ||
| 648 | if(which_device != from) | ||
| 649 | { | ||
| 650 | if(MIGRATE_VIA_SYSMEM) | ||
| 651 | { | ||
| 652 | // cutilSafeCall( cudaSetDevice(which_device) ); | ||
| 653 | CUR_DEVICE = which_device; // temporary | ||
| 654 | chunkMemcpy(d_state_data[which_device], h_state_data, STATE_SIZE, | ||
| 655 | cudaMemcpyHostToDevice, which_device, useEngineLocks()); | ||
| 656 | } | ||
| 657 | else | ||
| 658 | { | ||
| 659 | chunkMemcpy(d_state_data[which_device], | ||
| 660 | d_state_data[from], | ||
| 661 | STATE_SIZE, | ||
| 662 | cudaMemcpyDeviceToDevice, | ||
| 663 | from, | ||
| 664 | useEngineLocks(), | ||
| 665 | which_device); | ||
| 666 | } | ||
| 667 | } | ||
| 668 | } | ||
| 669 | |||
| 670 | if(MIGRATE_VIA_SYSMEM && CUR_DEVICE != from) | ||
| 671 | { | ||
| 672 | // cutilSafeCall( cudaSetDevice(from) ); | ||
| 673 | CUR_DEVICE = from; | ||
| 674 | } | ||
| 675 | } | ||
| 676 | } | ||
| 677 | |||
| 678 | //// Executes on graphics card. | ||
| 679 | //__global__ void docudaspin(unsigned int cycles) | ||
| 680 | //{ | ||
| 681 | // long long unsigned int elapsed = 0; | ||
| 682 | // long long int now = clock64(); | ||
| 683 | // long long int last; | ||
| 684 | // do | ||
| 685 | // { | ||
| 686 | // last = now; | ||
| 687 | // now = clock64(); | ||
| 688 | // elapsed += max(0ll, (long long int)(now - last)); // don't count iterations with clock roll-over | ||
| 689 | // }while(elapsed < cycles); | ||
| 690 | // | ||
| 691 | // return; | ||
| 692 | //} | ||
| 693 | |||
| 694 | |||
| 695 | |||
| 696 | static void gpu_loop_for(double gpu_sec_time, double emergency_exit) | ||
| 697 | { | ||
| 698 | unsigned int numcycles = (unsigned int)(gpuCyclesPerSecond * gpu_sec_time); | ||
| 699 | int numblocks = 1; | ||
| 700 | int blocksz = 1; | ||
| 701 | |||
| 702 | CUR_DEVICE = litmus_lock(KEXCLU_LOCK); | ||
| 703 | { | ||
| 704 | if(CUR_DEVICE != LAST_DEVICE && LAST_DEVICE != -1) | ||
| 705 | { | ||
| 706 | migrateToGPU(CUR_DEVICE); | ||
| 707 | } | ||
| 708 | |||
| 709 | if(SEND_SIZE > 0) | ||
| 710 | { | ||
| 711 | // handles chunking and locking, as appropriate. | ||
| 712 | chunkMemcpy(d_send_data[CUR_DEVICE], h_send_data, SEND_SIZE, | ||
| 713 | cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks()); | ||
| 714 | } | ||
| 715 | |||
| 716 | if(useEngineLocks()) litmus_lock(EE_LOCKS[CUR_DEVICE]); | ||
| 717 | |||
| 718 | // docudaspin <<<numblocks,blocksz, 0, streams[CUR_DEVICE]>>> (numcycles); | ||
| 719 | // cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) ); | ||
| 720 | |||
| 721 | if(useEngineLocks()) litmus_unlock(EE_LOCKS[CUR_DEVICE]); | ||
| 722 | |||
| 723 | if(RECV_SIZE > 0) | ||
| 724 | { | ||
| 725 | chunkMemcpy(h_recv_data, d_recv_data[CUR_DEVICE], RECV_SIZE, | ||
| 726 | cudaMemcpyDeviceToHost, CUR_DEVICE, useEngineLocks()); | ||
| 727 | } | ||
| 728 | |||
| 729 | if(BROADCAST_STATE) | ||
| 730 | { | ||
| 731 | broadcastState(CUR_DEVICE); | ||
| 732 | } | ||
| 733 | } | ||
| 734 | litmus_unlock(KEXCLU_LOCK); | ||
| 735 | |||
| 736 | LAST_DEVICE = CUR_DEVICE; | ||
| 737 | CUR_DEVICE = -1; | ||
| 738 | } | ||
| 739 | |||
| 740 | |||
| 741 | static void debug_delay_loop(void) | ||
| 742 | { | ||
| 743 | double start, end, delay; | ||
| 744 | |||
| 745 | while (1) { | ||
| 746 | for (delay = 0.5; delay > 0.01; delay -= 0.01) { | ||
| 747 | start = wctime(); | ||
| 748 | loop_for(delay, 0); | ||
| 749 | end = wctime(); | ||
| 750 | printf("%6.4fs: looped for %10.8fs, delta=%11.8fs, error=%7.4f%%\n", | ||
| 751 | delay, | ||
| 752 | end - start, | ||
| 753 | end - start - delay, | ||
| 754 | 100 * (end - start - delay) / delay); | ||
| 755 | } | ||
| 756 | } | ||
| 757 | } | ||
| 758 | |||
| 759 | static int job(double exec_time, double gpu_sec_time, double program_end) | ||
| 760 | { | ||
| 761 | if (wctime() > program_end) | ||
| 762 | return 0; | ||
| 763 | else if (!GPU_TASK) | ||
| 764 | { | ||
| 765 | loop_for(exec_time, program_end + 1); | ||
| 766 | } | ||
| 767 | else | ||
| 768 | { | ||
| 769 | double cpu_bookend = (exec_time)/2.0; | ||
| 770 | |||
| 771 | loop_for(cpu_bookend, program_end + 1); | ||
| 772 | gpu_loop_for(gpu_sec_time, program_end + 1); | ||
| 773 | loop_for(cpu_bookend, program_end + 1); | ||
| 774 | } | ||
| 775 | return 1; | ||
| 776 | } | ||
| 777 | |||
| 778 | #define OPTSTR "p:ls:e:g:G:W:N:S:R:T:BMaLyC:rz:q" | ||
| 779 | |||
| 780 | int main(int argc, char** argv) | ||
| 781 | { | ||
| 782 | atexit(catchExit); | ||
| 783 | |||
| 784 | int ret; | ||
| 785 | lt_t wcet; | ||
| 786 | lt_t period; | ||
| 787 | double wcet_ms, period_ms; | ||
| 788 | int migrate = 0; | ||
| 789 | int cpu = 0; | ||
| 790 | int opt; | ||
| 791 | int test_loop = 0; | ||
| 792 | // int column = 1; | ||
| 793 | const char *file = NULL; | ||
| 794 | int want_enforcement = 0; | ||
| 795 | double duration = 0, releaseTime = 0; | ||
| 796 | double *exec_times = NULL; | ||
| 797 | double scale = 1.0; | ||
| 798 | uint64_t cur_job; | ||
| 799 | uint64_t num_jobs; | ||
| 800 | |||
| 801 | int create_shm = -1; | ||
| 802 | int num_tasks = 0; | ||
| 803 | |||
| 804 | double gpu_sec_ms = 0; | ||
| 805 | |||
| 806 | while ((opt = getopt(argc, argv, OPTSTR)) != -1) { | ||
| 807 | // printf("opt = %c optarg = %s\n", opt, optarg); | ||
| 808 | switch (opt) { | ||
| 809 | // case 'w': | ||
| 810 | // ENABLE_WAIT = 1; | ||
| 811 | // break; | ||
| 812 | case 'p': | ||
| 813 | cpu = atoi(optarg); | ||
| 814 | migrate = 1; | ||
| 815 | break; | ||
| 816 | case 'l': | ||
| 817 | test_loop = 1; | ||
| 818 | break; | ||
| 819 | case 's': | ||
| 820 | scale = atof(optarg); | ||
| 821 | break; | ||
| 822 | case 'e': | ||
| 823 | gpu_sec_ms = atof(optarg); | ||
| 824 | break; | ||
| 825 | // case 'x': | ||
| 826 | // trans_sec_ms = atof(optarg); | ||
| 827 | // break; | ||
| 828 | case 'z': | ||
| 829 | NUM_SIMULT_USERS = atoi(optarg); | ||
| 830 | break; | ||
| 831 | case 'q': | ||
| 832 | USE_PRIOQ = true; | ||
| 833 | break; | ||
| 834 | case 'g': | ||
| 835 | GPU_TASK = 1; | ||
| 836 | GPU_PARTITION_SIZE = atoi(optarg); | ||
| 837 | break; | ||
| 838 | case 'G': | ||
| 839 | GPU_PARTITION = atoi(optarg); | ||
| 840 | break; | ||
| 841 | case 'S': | ||
| 842 | SEND_SIZE = (size_t)(atof(optarg)*1024); | ||
| 843 | break; | ||
| 844 | case 'R': | ||
| 845 | RECV_SIZE = (size_t)(atof(optarg)*1024); | ||
| 846 | break; | ||
| 847 | case 'T': | ||
| 848 | STATE_SIZE = (size_t)(atof(optarg)*1024); | ||
| 849 | break; | ||
| 850 | case 'B': | ||
| 851 | BROADCAST_STATE = true; | ||
| 852 | break; | ||
| 853 | case 'M': | ||
| 854 | MIGRATE_VIA_SYSMEM = true; | ||
| 855 | break; | ||
| 856 | case 'a': | ||
| 857 | ENABLE_AFFINITY = true; | ||
| 858 | break; | ||
| 859 | case 'r': | ||
| 860 | RELAX_FIFO_MAX_LEN = true; | ||
| 861 | break; | ||
| 862 | case 'L': | ||
| 863 | USE_KFMLP = true; | ||
| 864 | break; | ||
| 865 | case 'y': | ||
| 866 | USE_DYNAMIC_GROUP_LOCKS = true; | ||
| 867 | break; | ||
| 868 | case 'C': | ||
| 869 | ENABLE_CHUNKING = true; | ||
| 870 | CHUNK_SIZE = (size_t)(atof(optarg)*1024); | ||
| 871 | break; | ||
| 872 | case 'W': | ||
| 873 | create_shm = atoi(optarg); | ||
| 874 | break; | ||
| 875 | case 'N': | ||
| 876 | num_tasks = atoi(optarg); | ||
| 877 | break; | ||
| 878 | case ':': | ||
| 879 | usage("Argument missing."); | ||
| 880 | break; | ||
| 881 | case '?': | ||
| 882 | default: | ||
| 883 | usage("Bad argument."); | ||
| 884 | break; | ||
| 885 | } | ||
| 886 | } | ||
| 887 | |||
| 888 | if (test_loop) { | ||
| 889 | debug_delay_loop(); | ||
| 890 | return 0; | ||
| 891 | } | ||
| 892 | |||
| 893 | // if (file) { | ||
| 894 | // int num_jobs_tmp; | ||
| 895 | // get_exec_times(file, column, &num_jobs_tmp, &exec_times); | ||
| 896 | // num_jobs = num_jobs_tmp; | ||
| 897 | // | ||
| 898 | // if (argc - optind < 2) | ||
| 899 | // usage("Arguments missing."); | ||
| 900 | // | ||
| 901 | // for (cur_job = 0; cur_job < num_jobs; ++cur_job) { | ||
| 902 | // /* convert the execution time to seconds */ | ||
| 903 | // duration += exec_times[cur_job] * 0.001; | ||
| 904 | // } | ||
| 905 | // } else { | ||
| 906 | /* | ||
| 907 | * if we're not reading from the CSV file, then we need | ||
| 908 | * three parameters | ||
| 909 | */ | ||
| 910 | if (argc - optind < 3) | ||
| 911 | usage("Arguments missing."); | ||
| 912 | // } | ||
| 913 | |||
| 914 | wcet_ms = atof(argv[optind + 0]); | ||
| 915 | period_ms = atof(argv[optind + 1]); | ||
| 916 | |||
| 917 | wcet = wcet_ms * __NS_PER_MS; | ||
| 918 | period = period_ms * __NS_PER_MS; | ||
| 919 | if (wcet <= 0) | ||
| 920 | usage("The worst-case execution time must be a " | ||
| 921 | "positive number."); | ||
| 922 | if (period <= 0) | ||
| 923 | usage("The period must be a positive number."); | ||
| 924 | if (!file && wcet > period) { | ||
| 925 | usage("The worst-case execution time must not " | ||
| 926 | "exceed the period."); | ||
| 927 | } | ||
| 928 | |||
| 929 | if (!file) | ||
| 930 | { | ||
| 931 | duration = atof(argv[optind + 2]); | ||
| 932 | num_jobs = ((double)duration*1e3)/period_ms; | ||
| 933 | ++num_jobs; // padding | ||
| 934 | } | ||
| 935 | else if (file && num_jobs > 1) | ||
| 936 | { | ||
| 937 | duration += period_ms * 0.001 * (num_jobs - 1); | ||
| 938 | } | ||
| 939 | |||
| 940 | if (migrate) { | ||
| 941 | ret = be_migrate_to(cpu); | ||
| 942 | if (ret < 0) | ||
| 943 | bail_out("could not migrate to target partition"); | ||
| 944 | } | ||
| 945 | |||
| 946 | if(ENABLE_WAIT) | ||
| 947 | { | ||
| 948 | if(num_tasks > 0) | ||
| 949 | { | ||
| 950 | printf("%d creating release shared memory\n", getpid()); | ||
| 951 | shared_memory_object::remove("release_barrier_memory"); | ||
| 952 | release_segment_ptr = new managed_shared_memory(create_only, "release_barrier_memory", 4*1024); | ||
| 953 | |||
| 954 | printf("%d creating release barrier for %d users\n", getpid(), num_tasks); | ||
| 955 | release_barrier = release_segment_ptr->construct<barrier>("barrier release_barrier")(num_tasks); | ||
| 956 | |||
| 957 | init_release_time = release_segment_ptr->construct<uint64_t>("uint64_t instance")(); | ||
| 958 | *init_release_time = 0; | ||
| 959 | } | ||
| 960 | else | ||
| 961 | { | ||
| 962 | do | ||
| 963 | { | ||
| 964 | try | ||
| 965 | { | ||
| 966 | printf("%d opening release shared memory\n", getpid()); | ||
| 967 | segment_ptr = new managed_shared_memory(open_only, "release_barrier_memory"); | ||
| 968 | } | ||
| 969 | catch(...) | ||
| 970 | { | ||
| 971 | printf("%d shared memory not ready. sleeping\n", getpid()); | ||
| 972 | sleep(1); | ||
| 973 | } | ||
| 974 | }while(segment_ptr == NULL); | ||
| 975 | |||
| 976 | release_barrier = segment_ptr->find<barrier>("barrier release_barrier").first; | ||
| 977 | init_release_time = segment_ptr->find<uint64_t>("uint64_t instance").first; | ||
| 978 | } | ||
| 979 | } | ||
| 980 | |||
| 981 | |||
| 982 | if(GPU_TASK) | ||
| 983 | { | ||
| 984 | if(ENABLE_WAIT) | ||
| 985 | { | ||
| 986 | if(create_shm > -1) | ||
| 987 | { | ||
| 988 | printf("%d creating shared memory\n", getpid()); | ||
| 989 | shared_memory_object::remove("gpu_barrier_memory"); | ||
| 990 | segment_ptr = new managed_shared_memory(create_only, "gpu_barrier_memory", 4*1024); | ||
| 991 | |||
| 992 | printf("%d creating a barrier for %d users\n", getpid(), create_shm); | ||
| 993 | gpu_barrier = segment_ptr->construct<barrier>("barrier instance")(create_shm); | ||
| 994 | printf("%d creating gpu mgmt mutexes for 8 devices\n", getpid()); | ||
| 995 | gpu_mgmt_mutexes = segment_ptr->construct<interprocess_mutex>("interprocess_mutex m")[8](); | ||
| 996 | } | ||
| 997 | else | ||
| 998 | { | ||
| 999 | do | ||
| 1000 | { | ||
| 1001 | try | ||
| 1002 | { | ||
| 1003 | printf("%d opening shared memory\n", getpid()); | ||
| 1004 | segment_ptr = new managed_shared_memory(open_only, "gpu_barrier_memory"); | ||
| 1005 | } | ||
| 1006 | catch(...) | ||
| 1007 | { | ||
| 1008 | printf("%d shared memory not ready. sleeping\n", getpid()); | ||
| 1009 | sleep(1); | ||
| 1010 | } | ||
| 1011 | }while(segment_ptr == NULL); | ||
| 1012 | |||
| 1013 | gpu_barrier = segment_ptr->find<barrier>("barrier instance").first; | ||
| 1014 | gpu_mgmt_mutexes = segment_ptr->find<interprocess_mutex>("interprocess_mutex m").first; | ||
| 1015 | } | ||
| 1016 | } | ||
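Two details above are worth noting. First, construct<interprocess_mutex>("interprocess_mutex m")[8]() uses Boost.Interprocess's array-construction form: it default-constructs eight mutexes inside the segment, and find<>() later returns a pointer to the first element. Second, the open loop reuses segment_ptr, which may already be non-NULL from the release-barrier path above; if the open throws, the loop can exit with a stale pointer, so resetting segment_ptr = NULL before the loop would be safer. A sketch of the array idiom (names illustrative):

    // Boost.Interprocess array construction, sketched (names illustrative)
    #include <boost/interprocess/managed_shared_memory.hpp>
    #include <boost/interprocess/sync/interprocess_mutex.hpp>

    using namespace boost::interprocess;

    void demo(managed_shared_memory& seg)
    {
        // default-construct an array of 8 mutexes inside the segment
        interprocess_mutex* locks =
            seg.construct<interprocess_mutex>("demo locks")[8]();

        // other processes locate the same array by name
        interprocess_mutex* found =
            seg.find<interprocess_mutex>("demo locks").first;

        found[0].lock();   // usable across all processes mapping the segment
        found[0].unlock();
        (void)locks;
    }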
| 1017 | |||
| 1018 | // scale the data transmission sizes along with execution time | ||
| 1019 | SEND_SIZE *= scale; | ||
| 1020 | RECV_SIZE *= scale; | ||
| 1021 | STATE_SIZE *= scale; | ||
| 1022 | |||
| 1023 | init_cuda(); | ||
| 1024 | } | ||
| 1025 | |||
| 1026 | ret = sporadic_task_ns(wcet, period, 0, cpu, RT_CLASS_SOFT, | ||
| 1027 | want_enforcement ? PRECISE_ENFORCEMENT | ||
| 1028 | : NO_ENFORCEMENT, | ||
| 1029 | migrate); | ||
| 1030 | if (ret < 0) | ||
| 1031 | bail_out("could not setup rt task params"); | ||
| 1032 | |||
| 1033 | init_litmus(); | ||
| 1034 | |||
| 1035 | ret = task_mode(LITMUS_RT_TASK); | ||
| 1036 | if (ret != 0) | ||
| 1037 | bail_out("could not become RT task"); | ||
| 1038 | |||
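The three calls above form the usual liblitmus admission sequence: set the task parameters, initialize the library, then switch the thread into real-time mode. Condensed into one hedged helper (same calls and signatures as used in this file; the parameter values are illustrative):

    // liblitmus admission sequence as used above, condensed; sketch only
    #include <litmus.h>
    #include "common.h"

    static void become_rt(lt_t wcet_ns, lt_t period_ns, int cpu, int migrate)
    {
        if (sporadic_task_ns(wcet_ns, period_ns, 0, cpu, RT_CLASS_SOFT,
                             NO_ENFORCEMENT, migrate) < 0)
            bail_out("could not setup rt task params");

        init_litmus();                        // per-process library setup

        if (task_mode(LITMUS_RT_TASK) != 0)   // enter real-time mode
            bail_out("could not become RT task");
    }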
| 1039 | |||
| 1040 | |||
| 1041 | uint64_t jobCount = 0; | ||
| 1042 | blitz::Array<uint64_t, 1> responseTimeLog(num_jobs+1); | ||
| 1043 | |||
| 1044 | struct timespec spec; | ||
| 1045 | uint64_t release; | ||
| 1046 | uint64_t finish; | ||
| 1047 | |||
| 1048 | |||
| 1049 | if (ENABLE_WAIT) { | ||
| 1050 | printf("Waiting for release.\n"); | ||
| 1051 | ret = wait_for_ts_release(); | ||
| 1052 | if (ret != 0) | ||
| 1053 | bail_out("wait_for_ts_release()"); | ||
| 1054 | } | ||
| 1055 | else | ||
| 1056 | { | ||
| 1057 | sleep_next_period(); | ||
| 1058 | } | ||
| 1059 | |||
| 1060 | clock_gettime(CLOCK_MONOTONIC, &spec); | ||
| 1061 | release = timespec_to_ns(spec); | ||
| 1062 | if (init_release_time && !__sync_bool_compare_and_swap(init_release_time, 0, release)) | ||
| 1063 | { | ||
| 1064 | release = *init_release_time; | ||
| 1065 | } | ||
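The compare-and-swap above lets the first task to reach this point publish its timestamp as the task set's common release time; every later task loses the CAS and adopts the stored value. (init_release_time is only constructed under ENABLE_WAIT, hence the null guard.) The idiom in isolation, assuming a pointer into the shared segment:

    // first-writer-wins agreement on a shared timestamp; sketch only
    #include <stdint.h>

    uint64_t agree_on_release(uint64_t* shared_t0, uint64_t now_ns)
    {
        // GCC builtin: atomically set *shared_t0 = now_ns iff it is still 0
        if (__sync_bool_compare_and_swap(shared_t0, 0, now_ns))
            return now_ns;     // we won: our clock reading is the release time
        return *shared_t0;     // someone else won: adopt their reading
    }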
| 1066 | |||
| 1067 | releaseTime = wctime(); | ||
| 1068 | double failsafeEnd = releaseTime + duration; | ||
| 1069 | |||
| 1070 | |||
| 1071 | if (file) { | ||
| 1072 | /* use times read from the CSV file */ | ||
| 1073 | for (cur_job = 0; cur_job < num_jobs; ++cur_job) { | ||
| 1074 | /* convert job's length to seconds */ | ||
| 1075 | job(exec_times[cur_job] * 0.001 * scale, | ||
| 1076 | gpu_sec_ms * 0.001 * scale, | ||
| 1077 | failsafeEnd); | ||
| 1078 | } | ||
| 1079 | } else { | ||
| 1080 | /* convert to seconds and scale */ | ||
| 1081 | int keepGoing; | ||
| 1082 | do | ||
| 1083 | { | ||
| 1084 | keepGoing = job(wcet_ms * 0.001 * scale, gpu_sec_ms * 0.001 * scale, failsafeEnd); | ||
| 1085 | |||
| 1086 | |||
| 1087 | clock_gettime(CLOCK_MONOTONIC, &spec); | ||
| 1088 | finish = timespec_to_ns(spec); | ||
| 1089 | |||
| 1090 | responseTimeLog(min(num_jobs,jobCount++)) = finish - release; | ||
| 1091 | |||
| 1092 | // this is an estimated upper bound on the release time; it may be off by several microseconds. | ||
| 1093 | #ifdef RESET_RELEASE_ON_MISS | ||
| 1094 | release = (release + period < finish) ? | ||
| 1095 | finish : /* missed deadline. adopt next release as current time. */ | ||
| 1096 | release + period; /* some time in the future. */ | ||
| 1097 | #else | ||
| 1098 | release = release + period; // allow things to get progressively later. | ||
| 1099 | #endif | ||
| 1100 | |||
| 1101 | sleep_next_period(); | ||
| 1102 | clock_gettime(CLOCK_MONOTONIC, &spec); | ||
| 1103 | release = min(timespec_to_ns(spec), release); | ||
| 1104 | |||
| 1105 | } while(keepGoing); | ||
| 1106 | } | ||
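The release bookkeeping inside that loop is the subtle part: each job's response time is measured against an estimated release, which normally advances by one period, but with RESET_RELEASE_ON_MISS a missed deadline re-anchors it at the finish time so a single overrun does not make every subsequent job look tardy. Distilled (same logic as above):

    // advance the estimated release time for the next job; sketch of the
    // logic in the loop above
    #include <stdint.h>

    uint64_t next_release(uint64_t release, uint64_t period, uint64_t finish)
    {
    #ifdef RESET_RELEASE_ON_MISS
        // deadline missed: re-anchor at the finish time instead of drifting
        return (release + period < finish) ? finish : release + period;
    #else
        return release + period;  // releases may drift progressively later
    #endif
    }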
| 1107 | |||
| 1108 | if(GPU_TASK && ENABLE_WAIT) | ||
| 1109 | { | ||
| 1110 | printf("%d waiting at barrier\n", getpid()); | ||
| 1111 | gpu_barrier->wait(); | ||
| 1112 | } | ||
| 1113 | |||
| 1114 | ret = task_mode(BACKGROUND_TASK); | ||
| 1115 | if (ret != 0) | ||
| 1116 | bail_out("could not become regular task (huh?)"); | ||
| 1117 | |||
| 1118 | if (file) | ||
| 1119 | free(exec_times); | ||
| 1120 | |||
| 1121 | if(GPU_TASK) | ||
| 1122 | { | ||
| 1123 | /* | ||
| 1124 | if(ENABLE_WAIT) | ||
| 1125 | { | ||
| 1126 | // wait for all GPU using tasks ext RT mode. | ||
| 1127 | printf("%d waiting at barrier\n", getpid()); | ||
| 1128 | gpu_barrier->wait(); | ||
| 1129 | } | ||
| 1130 | */ | ||
| 1131 | |||
| 1132 | exit_cuda(); | ||
| 1133 | |||
| 1134 | if(ENABLE_WAIT) | ||
| 1135 | { | ||
| 1136 | /* wait before we clean up memory */ | ||
| 1137 | printf("%d waiting for all to shutdown GPUs\n", getpid()); | ||
| 1138 | gpu_barrier->wait(); | ||
| 1139 | |||
| 1140 | /* | ||
| 1141 | if(create_shm > -1) | ||
| 1142 | { | ||
| 1143 | printf("%d removing shared memory\n", getpid()); | ||
| 1144 | shared_memory_object::remove("gpu_barrier_memory"); | ||
| 1145 | } | ||
| 1146 | */ | ||
| 1147 | } | ||
| 1148 | } | ||
| 1149 | |||
| 1150 | |||
| 1151 | if (ENABLE_WAIT) | ||
| 1152 | { | ||
| 1153 | printf("%d waiting at exit barrier\n", getpid()); | ||
| 1154 | release_barrier->wait(); | ||
| 1155 | } | ||
| 1156 | |||
| 1157 | |||
| 1158 | char gpu_using_str[] = "GPU\n"; | ||
| 1159 | char cpu_only_str[] = "CPU\n"; | ||
| 1160 | #define USED(arr) (arr)(Range(fromStart,min(num_jobs-1,jobCount-1))) | ||
| 1161 | // fields: cpu, pid, period (ms), avg-rt (ns), avg-rt as %-of-period, min-rt, max-rt, avg-slack, num-jobs, num-misses, num-misses (>2x period), GPU/CPU flag | ||
| 1162 | printf("DONE,%d,%d,%f,%f,%f,%lu,%lu,%f,%lu,%d,%d,%s", | ||
| 1163 | cpu, | ||
| 1164 | getpid(), | ||
| 1165 | period_ms, | ||
| 1166 | // average | ||
| 1167 | blitz::mean(USED(responseTimeLog)), | ||
| 1168 | // average pct of period | ||
| 1169 | 100.0*(blitz::mean(USED(responseTimeLog))/period), | ||
| 1170 | // min | ||
| 1171 | blitz::min(USED(responseTimeLog)), | ||
| 1172 | // max | ||
| 1173 | blitz::max(USED(responseTimeLog)), | ||
| 1174 | // average slack | ||
| 1175 | blitz::mean((uint64_t)period - USED(responseTimeLog)), | ||
| 1176 | // num jobs | ||
| 1177 | min(num_jobs-1,jobCount-1), | ||
| 1178 | // num misses | ||
| 1179 | blitz::count(USED(responseTimeLog) > (uint64_t)period), | ||
| 1180 | // num misses w/ unbounded | ||
| 1181 | blitz::count(USED(responseTimeLog) > (uint64_t)(2*period)), | ||
| 1182 | // flag gpu-using tasks | ||
| 1183 | ((GPU_TASK) ? gpu_using_str : cpu_only_str) | ||
| 1184 | ); | ||
| 1185 | |||
| 1186 | return 0; | ||
| 1187 | } | ||
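The summary line relies on Blitz++ whole-array reductions over the response-time log; note that the avg-slack term uses unsigned arithmetic, so it underflows for any job whose response time exceeded the period. A self-contained sketch of the reductions (the data is made up):

    // Blitz++ reductions over a response-time log; illustrative data only
    #include <blitz/array.h>
    #include <stdint.h>
    #include <cstdio>

    int main()
    {
        blitz::Array<uint64_t, 1> rt(4);
        rt = 8, 12, 9, 15;                 // response times (ms, made up)
        uint64_t period = 10;

        printf("avg=%f min=%lu max=%lu misses=%d\n",
               (double)blitz::mean(rt),            // average response time
               (unsigned long)blitz::min(rt),      // best case
               (unsigned long)blitz::max(rt),      // worst case
               (int)blitz::count(rt > period));    // jobs over the period
        return 0;
    }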
diff --git a/include/common.h b/include/common.h
index d1234ba..faf2c07 100644
--- a/include/common.h
+++ b/include/common.h
| @@ -1,7 +1,14 @@ | |||
| 1 | #ifndef COMMON_H | 1 | #ifndef COMMON_H |
| 2 | #define COMMON_H | 2 | #define COMMON_H |
| 3 | 3 | ||
| 4 | #ifdef __cplusplus | ||
| 5 | extern "C" { | ||
| 6 | #endif | ||
| 4 | 7 | ||
| 5 | void bail_out(const char* msg); | 8 | void bail_out(const char* msg); |
| 6 | 9 | ||
| 10 | #ifdef __cplusplus | ||
| 11 | } | ||
| 12 | #endif | ||
| 13 | |||
| 7 | #endif | 14 | #endif |
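The guard added to common.h matters because this patch compiles the header into both C and C++/CUDA translation units: without extern "C", g++ and nvcc would name-mangle the bail_out declaration and the link against the C definition would fail. A minimal illustration (the file names are hypothetical):

    /* common.c -- compiled with gcc; provides the unmangled C symbol */
    #include "common.h"
    #include <stdio.h>
    #include <stdlib.h>

    void bail_out(const char* msg)
    {
        perror(msg);
        exit(1);
    }

    /* user.cpp -- compiled with g++; the extern "C" guard in common.h makes
       this call resolve to the C symbol above instead of a mangled name */
    #include "common.h"

    int main()
    {
        bail_out("demo failure");
        return 0;
    }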
