author	Glenn Elliott <gelliott@cs.unc.edu>	2013-01-10 17:48:39 -0500
committer	Glenn Elliott <gelliott@cs.unc.edu>	2013-01-10 17:48:39 -0500
commit	629486d62ae22c33251d3c367af3febff5fe1e28 (patch)
tree	ef78fc8235c61f8ba37d109ea04266b6ce49b804 /gpu
parent	1bf0f0094cd9671adfc07cf840bde67cd4cc0c38 (diff)
Clean up GPU test code placement.
Diffstat (limited to 'gpu')
-rw-r--r--	gpu/aux_threads.c	313
-rw-r--r--	gpu/dgl.c	251
-rw-r--r--	gpu/ikglptest.c	633
-rw-r--r--	gpu/locktest.c	206
-rw-r--r--	gpu/nested.c	245
-rw-r--r--	gpu/normal_task.c	84
-rw-r--r--	gpu/rtspin_fake_cuda.cpp	1169
7 files changed, 2901 insertions, 0 deletions
diff --git a/gpu/aux_threads.c b/gpu/aux_threads.c
new file mode 100644
index 0000000..6636f36
--- /dev/null
+++ b/gpu/aux_threads.c
@@ -0,0 +1,313 @@
1/* based_mt_task.c -- A basic multi-threaded real-time task skeleton.
2 *
3 * This (by itself useless) task demonstrates how to set up a multi-threaded LITMUS^RT
4 * real-time task. Familiarity with the single threaded example (base_task.c)
5 * is assumed.
6 *
7 * Currently, liblitmus still lacks automated support for real-time
8 * tasks, but internally it is thread-safe, and thus can be used together
9 * with pthreads.
10 */
11
12#include <stdio.h>
13#include <stdlib.h>
14#include <unistd.h>
15
16#include <fcntl.h>
17#include <sys/stat.h>
18#include <sys/time.h>
19#include <sys/resource.h>
20#include <time.h> /* for time(), used below */
21/* Include gettid() */
22#include <sys/types.h>
23
24/* Include threading support. */
25#include <pthread.h>
26
27/* Include the LITMUS^RT API.*/
28#include "litmus.h"
29
30//#define PERIOD 500
31#define PERIOD 10
32//#define EXEC_COST 10
33#define EXEC_COST 1
34
35int NUM_AUX_THREADS = 2;
36
37#define LITMUS_STATS_FILE "/proc/litmus/stats"
38
39/* The information passed to each thread. Could be anything. */
40struct thread_context {
41 int id;
42 struct timeval total_time;
43};
44
45/* The real-time thread program. Doesn't have to be the same for
46 * all threads. Here, we only have one that will invoke job().
47 */
48void* rt_thread(void *tcontext);
49void* aux_thread(void *tcontext);
50
51/* Declare the periodically invoked job.
52 * Returns 1 -> task should exit.
53 * 0 -> task should continue.
54 */
55int job(void);
56
57
58/* Catch errors.
59 */
60#define CALL( exp ) do { \
61 int ret; \
62 ret = exp; \
63 if (ret != 0) \
64 fprintf(stderr, "%s failed: %m\n", #exp);\
65 else \
66 fprintf(stderr, "%s ok.\n", #exp); \
67 } while (0)
68
69volatile int gRun = 1; /* cleared by main to stop aux threads */
70
71pthread_mutex_t gMutex = PTHREAD_MUTEX_INITIALIZER;
72pthread_barrier_t gBar;
73
74#define OPTSTR "t:fcb"
75
76int main(int argc, char** argv)
77{
78 int i;
79 struct thread_context *ctx;
80 pthread_t *task;
81
82 int opt;
83 int before = 0;
84 int aux_flags = 0;
85 int do_future = 0;
86
87 while ((opt = getopt(argc, argv, OPTSTR)) != -1) {
88 switch(opt)
89 {
90 case 't':
91 NUM_AUX_THREADS = atoi(optarg);
92 printf("%d aux threads\n", NUM_AUX_THREADS);
93 break;
94 case 'f':
95 aux_flags |= AUX_FUTURE;
96 do_future = 1;
97 break;
98 case 'c':
99 aux_flags |= AUX_CURRENT;
100 break;
101 case 'b':
102 before = 1;
103 printf("Will become real-time before spawning aux threads.\n");
104 break;
105 }
106 }
107
108 if (aux_flags == 0) {
109 printf("Must specify -c (AUX_CURRENT) and/or -f (AUX_FUTURE) for aux tasks.\n");
110 return -1;
111 }
112
113 ctx = calloc(NUM_AUX_THREADS, sizeof(struct thread_context));
114 task = calloc(NUM_AUX_THREADS, sizeof(pthread_t));
115
116 //lt_t delay = ms2lt(1000);
117
118 /*****
119 * 3) Initialize LITMUS^RT.
120 * Task parameters will be specified per thread.
121 */
122 init_litmus();
123
124 {
125 pthread_barrierattr_t battr;
126 pthread_barrierattr_init(&battr);
127 pthread_barrier_init(&gBar, &battr, (NUM_AUX_THREADS)+1);
128 }
129
130 if(before)
131 {
132 CALL( init_rt_thread() );
133 CALL( sporadic_global(EXEC_COST, PERIOD) );
134 CALL( task_mode(LITMUS_RT_TASK) );
135 }
136
137
138 if(do_future && before)
139 {
140 CALL( enable_aux_rt_tasks(aux_flags) );
141 }
142
143// printf("Red Leader is now real-time!\n");
144
145 for (i = 0; i < NUM_AUX_THREADS; i++) {
146 ctx[i].id = i;
147 pthread_create(task + i, NULL, aux_thread, (void *) (ctx + i));
148 }
149
150// pthread_barrier_wait(&gBar);
151
152// sleep(1);
153
154 if(!before)
155 {
156 CALL( init_rt_thread() );
157 CALL( sporadic_global(EXEC_COST, PERIOD) );
158 CALL( task_mode(LITMUS_RT_TASK) );
159 }
160
161 // secondary call *should* be harmless
162 CALL( enable_aux_rt_tasks(aux_flags) );
163
164 {
165 int last = time(0);
166// struct timespec sleeptime = {0, 1000}; // 1 microsecond
167// for(i = 0; i < 24000; ++i) {
168 for(i = 0; i < 2000; ++i) {
169 sleep_next_period();
170// printf("RED LEADER!\n");
171
172// nanosleep(&sleeptime, NULL);
173
174 pthread_mutex_lock(&gMutex);
175
176 if((i%(10000/PERIOD)) == 0) {
177 int now = time(0);
178				printf("heartbeat %d: %d\n", i, now - last);
179 last = now;
180 }
181
182 pthread_mutex_unlock(&gMutex);
183 }
184 }
185
186 CALL( disable_aux_rt_tasks(aux_flags) );
187 gRun = 0;
188
189 CALL( task_mode(BACKGROUND_TASK) );
190
191 /*****
192 * 5) Wait for RT threads to terminate.
193 */
194 for (i = 0; i < NUM_AUX_THREADS; i++) {
195 if (task[i] != 0) {
196 float time;
197 pthread_join(task[i], NULL);
198 time = ctx[i].total_time.tv_sec + ctx[i].total_time.tv_usec / (float)(1e6);
199 printf("child %d: %fs\n", i, time);
200 }
201 }
202
203
204 /*****
205 * 6) Clean up, maybe print results and stats, and exit.
206 */
207 return 0;
208}
209
210
211
212/* A real-time thread is very similar to the main function of a single-threaded
213 * real-time app. Note that init_rt_thread() is called to initialize the
214 * per-thread data structures of the LITMUS^RT user-space library.
215 */
216void* aux_thread(void *tcontext)
217{
218 struct thread_context *ctx = (struct thread_context *) tcontext;
219 int count = 0;
220
221// pthread_barrier_wait(&gBar);
222
223 while(gRun)
224 {
225 if(count++ % 100000 == 0) {
226 pthread_mutex_lock(&gMutex);
227 pthread_mutex_unlock(&gMutex);
228 }
229 }
230
231 {
232 struct rusage use;
233 long int sec;
234
235 getrusage(RUSAGE_THREAD, &use);
236
237 ctx->total_time.tv_usec = use.ru_utime.tv_usec + use.ru_stime.tv_usec;
238 sec = ctx->total_time.tv_usec / (long int)(1e6);
239 ctx->total_time.tv_usec = ctx->total_time.tv_usec % (long int)(1e6);
240 ctx->total_time.tv_sec = use.ru_utime.tv_sec + use.ru_stime.tv_sec + sec;
241 }
242
243 return ctx;
244}
245
246
247/* A real-time thread is very similar to the main function of a single-threaded
248 * real-time app. Note that init_rt_thread() is called to initialize the
249 * per-thread data structures of the LITMUS^RT user-space library.
250 */
251void* rt_thread(void *tcontext)
252{
253 struct thread_context *ctx = (struct thread_context *) tcontext;
254
255 /* Make presence visible. */
256 printf("RT Thread %d active.\n", ctx->id);
257
258 /*****
259 * 1) Initialize real-time settings.
260 */
261 CALL( init_rt_thread() );
262 CALL( sporadic_global(EXEC_COST, PERIOD + ctx->id * 10) );
263
264
265 /*****
266 * 2) Transition to real-time mode.
267 */
268 CALL( task_mode(LITMUS_RT_TASK) );
269
270
271
272 wait_for_ts_release();
273
274 /* The task is now executing as a real-time task if the call didn't fail.
275 */
276
277
278
279 /*****
280 * 3) Invoke real-time jobs.
281 */
282 while(gRun) {
283 /* Wait until the next job is released. */
284 sleep_next_period();
285 printf("%d: task.\n", ctx->id);
286 }
287
288 /*****
289 * 4) Transition to background mode.
290 */
291 CALL( task_mode(BACKGROUND_TASK) );
292
293 {
294 struct rusage use;
295 long int sec;
296
297 getrusage(RUSAGE_THREAD, &use);
298 ctx->total_time.tv_usec = use.ru_utime.tv_usec + use.ru_stime.tv_usec;
299 sec = ctx->total_time.tv_usec / (long int)(1e6);
300 ctx->total_time.tv_usec = ctx->total_time.tv_usec % (long int)(1e6);
301 ctx->total_time.tv_sec = use.ru_utime.tv_sec + use.ru_stime.tv_sec + sec;
302 }
303
304 return ctx;
305}
306
307int job(void)
308{
309 /* Do real-time calculation. */
310
311 /* Don't exit. */
312 return 0;
313}
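
For readers without base_task.c at hand (the header comment assumes familiarity with it), here is a minimal single-threaded sketch of the same lifecycle, restricted to liblitmus calls that already appear above; the cost and period values are illustrative:

/* base_task-style sketch; EXEC_COST/PERIOD values are illustrative */
#include <stdio.h>
#include "litmus.h"

#define EXEC_COST 1
#define PERIOD 10

int main(void)
{
	int i;
	init_litmus();                      /* process-wide setup */
	init_rt_thread();                   /* per-thread setup */
	sporadic_global(EXEC_COST, PERIOD); /* declare sporadic task parameters */
	task_mode(LITMUS_RT_TASK);          /* enter real-time mode */
	for (i = 0; i < 10; i++)
		sleep_next_period();        /* one job per period */
	task_mode(BACKGROUND_TASK);         /* return to best-effort scheduling */
	return 0;
}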
diff --git a/gpu/dgl.c b/gpu/dgl.c
new file mode 100644
index 0000000..a045879
--- /dev/null
+++ b/gpu/dgl.c
@@ -0,0 +1,251 @@
1#include <stdio.h>
2#include <stdlib.h>
3#include <stdint.h>
4#include <unistd.h>
5#include <assert.h>
6#include <errno.h>
7#include <sys/types.h>
8#include <sys/stat.h>
9#include <fcntl.h>
10
11/* Include gettid() */
12#include <sys/types.h>
13
14/* Include threading support. */
15#include <pthread.h>
16
17/* Include the LITMUS^RT API.*/
18#include "litmus.h"
19
20/* Catch errors.
21 */
22#define CALL( exp ) do { \
23 int ret; \
24 ret = exp; \
25 if (ret != 0) \
26 fprintf(stderr, "%s failed: %m\n", #exp);\
27 else \
28 fprintf(stderr, "%s ok.\n", #exp); \
29 } while (0)
30
31#define TH_CALL( exp ) do { \
32 int ret; \
33 ret = exp; \
34 if (ret != 0) \
35 fprintf(stderr, "[%d] %s failed: %m\n", ctx->id, #exp); \
36 else \
37 fprintf(stderr, "[%d] %s ok.\n", ctx->id, #exp); \
38 } while (0)
39
40#define TH_SAFE_CALL( exp ) do { \
41 int ret; \
42 fprintf(stderr, "[%d] calling %s...\n", ctx->id, #exp); \
43 ret = exp; \
44 if (ret != 0) \
45 fprintf(stderr, "\t...[%d] %s failed: %m\n", ctx->id, #exp); \
46 else \
47 fprintf(stderr, "\t...[%d] %s ok.\n", ctx->id, #exp); \
48 } while (0)
49
50
51/* these are only default values */
52int NUM_THREADS=3;
53int NUM_SEMS=1;
54int NUM_REPLICAS=1;
55int NEST_DEPTH=1;
56
57int SLEEP_BETWEEN_JOBS = 1;
58
59#define MAX_SEMS 1000
60#define MAX_NEST_DEPTH 10
61
62
63// 1000 = 1us
64#define EXEC_COST 1000*1
65#define PERIOD 1000*10
66
67/* The information passed to each thread. Could be anything. */
68struct thread_context {
69 int id;
70 int fd;
71 int ikglp;
72 int od[MAX_SEMS];
73 int count;
74 unsigned int rand;
75};
76
77void* rt_thread(void* _ctx);
78int nested_job(struct thread_context* ctx, int *count, int *next);
79int job(struct thread_context*);
80
81#define OPTSTR "t:k:s:d:f"
82
83int main(int argc, char** argv)
84{
85 int i;
86 struct thread_context* ctx;
87 pthread_t* task;
88 int fd;
89
90 int opt;
91 while((opt = getopt(argc, argv, OPTSTR)) != -1) {
92 switch(opt) {
93 case 't':
94 NUM_THREADS = atoi(optarg);
95 break;
96 case 'k':
97 NUM_REPLICAS = atoi(optarg);
98 assert(NUM_REPLICAS > 0);
99 break;
100 case 's':
101 NUM_SEMS = atoi(optarg);
102 assert(NUM_SEMS >= 0 && NUM_SEMS <= MAX_SEMS);
103 break;
104 case 'd':
105 NEST_DEPTH = atoi(optarg);
106 assert(NEST_DEPTH >= 1 && NEST_DEPTH <= MAX_NEST_DEPTH);
107 break;
108 case 'f':
109 SLEEP_BETWEEN_JOBS = 0;
110 break;
111 default:
112 fprintf(stderr, "Unknown option: %c\n", opt);
113 exit(-1);
114 break;
115 }
116 }
117
118 ctx = (struct thread_context*) calloc(NUM_THREADS, sizeof(struct thread_context));
119 task = (pthread_t*) calloc(NUM_THREADS, sizeof(pthread_t));
120
121 srand(0); /* something repeatable for now */
122
123 fd = open("semaphores", O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
124
125 CALL( init_litmus() );
126
127 for (i = 0; i < NUM_THREADS; i++) {
128 ctx[i].id = i;
129 ctx[i].fd = fd;
130 ctx[i].rand = rand();
131 CALL( pthread_create(task + i, NULL, rt_thread, ctx + i) );
132 }
133
134
135 for (i = 0; i < NUM_THREADS; i++)
136 pthread_join(task[i], NULL);
137
138
139 return 0;
140}
141
142void* rt_thread(void* _ctx)
143{
144 int i;
145 int do_exit = 0;
146
147 struct thread_context *ctx = (struct thread_context*)_ctx;
148
149 TH_CALL( init_rt_thread() );
150
151 /* Vary period a little bit. */
152 TH_CALL( sporadic_task_ns(EXEC_COST, PERIOD + 10*ctx->id, 0, 0, LITMUS_LOWEST_PRIORITY,
153 RT_CLASS_SOFT, NO_ENFORCEMENT, NO_SIGNALS, 0) );
154
155 ctx->ikglp = open_ikglp_sem(ctx->fd, 0, (void*)&NUM_REPLICAS);
156 if(ctx->ikglp < 0)
157 perror("open_ikglp_sem");
158 else
159 printf("ikglp od = %d\n", ctx->ikglp);
160
161 for (i = 0; i < NUM_SEMS; i++) {
162 ctx->od[i] = open_rsm_sem(ctx->fd, i+1);
163 if(ctx->od[i] < 0)
164 perror("open_rsm_sem");
165 else
166 printf("rsm[%d] od = %d\n", i, ctx->od[i]);
167 }
168
169 TH_CALL( task_mode(LITMUS_RT_TASK) );
170
171
172	printf("[%d] Waiting for TS release.\n", ctx->id);
173 wait_for_ts_release();
174 ctx->count = 0;
175
176 do {
177 int replica = -1;
178 int first = (int)(NUM_SEMS * (rand_r(&(ctx->rand)) / (RAND_MAX + 1.0)));
179 int last = (first + NEST_DEPTH - 1 >= NUM_SEMS) ? NUM_SEMS - 1 : first + NEST_DEPTH - 1;
180 int dgl_size = last - first + 1;
181 int dgl[dgl_size];
182
183 // construct the DGL
184 for(i = first; i <= last; ++i) {
185 dgl[i-first] = ctx->od[i];
186 }
187
188
189 replica = litmus_lock(ctx->ikglp);
190 printf("[%d] got ikglp replica %d.\n", ctx->id, replica);
191 fflush(stdout);
192
193
194 litmus_dgl_lock(dgl, dgl_size);
195 printf("[%d] acquired dgl.\n", ctx->id);
196 fflush(stdout);
197
198
199 do_exit = job(ctx);
200
201
202 printf("[%d] unlocking dgl.\n", ctx->id);
203 fflush(stdout);
204 litmus_dgl_unlock(dgl, dgl_size);
205
206
207 printf("[%d]: freeing ikglp replica %d.\n", ctx->id, replica);
208 fflush(stdout);
209 litmus_unlock(ctx->ikglp);
210
211 if(SLEEP_BETWEEN_JOBS && !do_exit) {
212 sleep_next_period();
213 }
214 } while(!do_exit);
215
216 /*****
217 * 4) Transition to background mode.
218 */
219 TH_CALL( task_mode(BACKGROUND_TASK) );
220
221
222 return NULL;
223}
224
225void dirty_kb(int kb)
226{
227 int32_t one_kb[256];
228 int32_t sum = 0;
229 int32_t i;
230
231 for (i = 0; i < 256; i++)
232 sum += one_kb[i];
233 kb--;
234 /* prevent tail recursion */
235 if (kb)
236 dirty_kb(kb);
237 for (i = 0; i < 256; i++)
238 sum += one_kb[i];
239}
240
241int job(struct thread_context* ctx)
242{
243 /* Do real-time calculation. */
244 dirty_kb(8);
245
246 /* Don't exit. */
247 //return ctx->count++ > 100;
248 //return ctx->count++ > 12000;
249 //return ctx->count++ > 120000;
250 return ctx->count++ > 50000; // controls number of jobs per task
251}
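
The locking protocol in rt_thread() above, reduced to its essential shape. A sketch only: ikglp_od, od_a, and od_b stand for descriptors obtained from open_ikglp_sem()/open_rsm_sem() as in the code above.

/* sketch: IKGLP replica + dynamic group lock, as used by rt_thread() */
static void locked_section(int ikglp_od, int od_a, int od_b)
{
	int dgl[2] = { od_a, od_b };
	int replica = litmus_lock(ikglp_od); /* k-exclusion: grants one replica */

	litmus_dgl_lock(dgl, 2);             /* acquire the whole group atomically */
	/* ... critical section using the replica and both resources ... */
	litmus_dgl_unlock(dgl, 2);

	litmus_unlock(ikglp_od);             /* return the replica */
	(void)replica; /* the replica index selects the resource instance */
}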
diff --git a/gpu/ikglptest.c b/gpu/ikglptest.c
new file mode 100644
index 0000000..5f566d5
--- /dev/null
+++ b/gpu/ikglptest.c
@@ -0,0 +1,633 @@
1#include <stdio.h>
2#include <stdlib.h>
3#include <string.h>
4#include <stdint.h>
5#include <unistd.h>
6#include <assert.h>
7#include <errno.h>
8#include <sys/types.h>
9#include <sys/stat.h>
10#include <fcntl.h>
11#include <time.h>
12#include <math.h>
13
14/* Include gettid() */
15#include <sys/types.h>
16
17/* Include threading support. */
18#include <pthread.h>
19
20/* Include the LITMUS^RT API.*/
21#include "litmus.h"
22
23/* Catch errors.
24 */
25#if 1
26#define CALL( exp ) do { \
27 int ret; \
28 ret = exp; \
29 if (ret != 0) \
30 fprintf(stderr, "%s failed: %m\n", #exp);\
31 else \
32 fprintf(stderr, "%s ok.\n", #exp); \
33 } while (0)
34
35#define TH_CALL( exp ) do { \
36 int ret; \
37 ret = exp; \
38 if (ret != 0) \
39 fprintf(stderr, "[%d] %s failed: %m\n", ctx->id, #exp); \
40 else \
41 fprintf(stderr, "[%d] %s ok.\n", ctx->id, #exp); \
42 } while (0)
43
44#define TH_SAFE_CALL( exp ) do { \
45 int ret; \
46 fprintf(stderr, "[%d] calling %s...\n", ctx->id, #exp); \
47 ret = exp; \
48 if (ret != 0) \
49 fprintf(stderr, "\t...[%d] %s failed: %m\n", ctx->id, #exp); \
50 else \
51 fprintf(stderr, "\t...[%d] %s ok.\n", ctx->id, #exp); \
52 } while (0)
53#else
54#define CALL( exp )
55#define TH_CALL( exp )
56#define TH_SAFE_CALL( exp )
57#endif
58
59/* these are only default values */
60int NUM_THREADS=3;
61int NUM_AUX_THREADS=0;
62int NUM_SEMS=1;
63int NUM_GPUS=1;
64int GPU_OFFSET=0;
65int NUM_SIMULT_USERS = 1;
66int ENABLE_AFFINITY = 0;
67int NEST_DEPTH=1;
68int USE_KFMLP = 0;
69int RELAX_FIFO_MAX_LEN = 0;
70int USE_DYNAMIC_GROUP_LOCKS = 0;
71
72int SLEEP_BETWEEN_JOBS = 1;
73
74int gAuxRun = 1;
75pthread_mutex_t gMutex = PTHREAD_MUTEX_INITIALIZER;
76
77#define MAX_SEMS 1000
78
79// 1000 = 1us
80#define EXEC_COST 1000*1
81#define PERIOD 2*1000*100
82
83/* The information passed to each thread. Could be anything. */
84struct thread_context {
85 int id;
86 int fd;
87 int kexclu;
88 int od[MAX_SEMS];
89 int count;
90 unsigned int rand;
91 int mig_count[5];
92};
93
94void* rt_thread(void* _ctx);
95void* aux_thread(void* _ctx);
96int nested_job(struct thread_context* ctx, int *count, int *next, int runfactor);
97int job(struct thread_context* ctx, int runfactor);
98
99
100struct avg_info
101{
102 float avg;
103 float stdev;
104};
105
106struct avg_info feedback(int _a, int _b)
107{
108 fp_t a = _frac(_a, 10000);
109 fp_t b = _frac(_b, 10000);
110 int i;
111
112 fp_t actual_fp;
113
114 fp_t _est, _err;
115
116 int base = 1000000;
117 //int range = 40;
118
119 fp_t est = _integer_to_fp(base);
120 fp_t err = _fp(base/2);
121
122#define NUM_SAMPLES 10000
123
124 float samples[NUM_SAMPLES] = {0.0};
125 float accu_abs, accu;
126 float avg;
127 float devsum;
128 float stdev;
129 struct avg_info ret;
130
131 for(i = 0; i < NUM_SAMPLES; ++i) {
132 int num = ((rand()%40)*(rand()%2 ? -1 : 1)/100.0)*base + base;
133 float rel_err;
134
135 actual_fp = _integer_to_fp(num);
136
137// printf("Before: est = %d\terr = %d\n", (int)_fp_to_integer(est), (int)_fp_to_integer(err));
138
139 _err = _sub(actual_fp, est);
140 _est = _add(_mul(a, _err), _mul(b, err));
141
142 rel_err = _fp_to_integer(_mul(_div(_err, est), _integer_to_fp(10000)))/10000.0;
143 rel_err *= 100.0;
144 //printf("%6.2f\n", rel_err);
145 samples[i] = rel_err;
146
147 est = _est;
148 err = _add(err, _err);
149
150 if((int)_fp_to_integer(est) <= 0) {
151 est = actual_fp;
152 err = _div(actual_fp, _integer_to_fp(2));
153 }
154
155 //printf("After: est = %d\terr = %d\n", (int)_fp_to_integer(est), (int)_fp_to_integer(err));
156 }
157
158 accu_abs = 0.0;
159 accu = 0.0;
160 for(i = 0; i < NUM_SAMPLES; ++i) {
161 accu += samples[i];
162		accu_abs += fabsf(samples[i]); /* fabsf: plain abs() truncates the float to int */
163 }
164
165 avg = accu_abs/NUM_SAMPLES;
166 devsum = 0;
167 for(i = 0; i < NUM_SAMPLES; ++i) {
168 float dev = samples[i] - avg;
169 dev *= dev;
170 devsum += dev;
171 }
172
173 stdev = sqrtf(devsum/(NUM_SAMPLES-1));
174
175 ret.avg = avg;
176 ret.stdev = stdev;
177
178 //printf("AVG: %6.2f\tw/ neg: %6.2f\n", accu_abs/NUM_SAMPLES, accu/NUM_SAMPLES);
179
180 //return (accu_abs/NUM_SAMPLES);
181 return(ret);
182}
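/* For reference, feedback() above simulates the fixed-point estimator
 *
 *     err_k     = actual_k - est_k
 *     est_{k+1} = a * err_k + b * acc_k
 *     acc_{k+1} = acc_k + err_k
 *
 * on samples drawn within roughly +/-40% of `base`, and reports the mean
 * and standard deviation of the relative estimation error (in percent).
 */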
183
184
185
186#define OPTSTR "t:k:o:z:s:d:lfaryA:"
187
188int main(int argc, char** argv)
189{
190 int i;
191 struct thread_context* ctx;
192 struct thread_context* aux_ctx;
193 pthread_t* task;
194 pthread_t* aux_task;
195 int fd;
196
197 int opt;
198 while((opt = getopt(argc, argv, OPTSTR)) != -1) {
199 switch(opt) {
200 case 't':
201 NUM_THREADS = atoi(optarg);
202 break;
203 case 'A':
204 NUM_AUX_THREADS = atoi(optarg);
205 break;
206 case 'k':
207 NUM_GPUS = atoi(optarg);
208 assert(NUM_GPUS > 0);
209 break;
210 case 'z':
211 NUM_SIMULT_USERS = atoi(optarg);
212 assert(NUM_SIMULT_USERS > 0);
213 break;
214 case 'o':
215 GPU_OFFSET = atoi(optarg);
216 assert(GPU_OFFSET >= 0);
217 break;
218 case 's':
219 NUM_SEMS = atoi(optarg);
220 assert(NUM_SEMS >= 0 && NUM_SEMS < MAX_SEMS);
221 break;
222 case 'd':
223 NEST_DEPTH = atoi(optarg);
224 assert(NEST_DEPTH >= 0);
225 break;
226 case 'f':
227 SLEEP_BETWEEN_JOBS = 0;
228 break;
229 case 'a':
230 ENABLE_AFFINITY = 1;
231 break;
232 case 'l':
233 USE_KFMLP = 1;
234 break;
235 case 'y':
236 USE_DYNAMIC_GROUP_LOCKS = 1;
237 break;
238 case 'r':
239 RELAX_FIFO_MAX_LEN = 1;
240 break;
241 default:
242 fprintf(stderr, "Unknown option: %c\n", opt);
243 exit(-1);
244 break;
245 }
246 }
247
248#if 0
249 int best_a = 0, best_b = 0;
250 int first = 1;
251 int TRIALS = 15;
252
253 int a, b, t;
254
255 struct avg_info best = {0.0,0.0}, second_best;
256
257 int second_best_a, second_best_b;
258
259 srand(time(0));
260
261 int step = 50;
262
263 for(b = 2000; b < 5000; b += step) {
264 for(a = 1500; a < b; a += (step/4)) {
265 float std_accum = 0;
266 float avg_accum = 0;
267 for(t = 0; t < TRIALS; ++t) {
268 struct avg_info temp;
269 temp = feedback(a, b);
270 std_accum += temp.stdev;
271 avg_accum += temp.avg;
272 }
273
274 float avg_std = std_accum / TRIALS;
275
276 if(first || avg_std < best.stdev) {
277 second_best_a = best_a;
278 second_best_b = best_b;
279 second_best = best;
280
281 best.stdev = avg_std;
282 best.avg = avg_accum / TRIALS;
283 best_a = a;
284 best_b = b;
285
286 first = 0;
287 }
288 }
289 }
290
291 printf("Best:\ta = %d\tb = %d\t(b-a) = %d\tavg = %6.2f\tstdev = %6.2f\n", best_a, best_b, best_b - best_a, best.avg, best.stdev);
292 printf("2nd:\ta = %d\tb = %d\t(b-a) = %d\tavg = %6.2f\tstdev = %6.2f\n", second_best_a, second_best_b, second_best_b - second_best_a, second_best.avg, second_best.stdev);
293
294
295 a = 14008;
296 b = 16024;
297 float std_accum = 0;
298 float avg_accum = 0;
299 for(t = 0; t < TRIALS; ++t) {
300 struct avg_info temp;
301 temp = feedback(a, b);
302 std_accum += temp.stdev;
303 avg_accum += temp.avg;
304 }
305
306 printf("Aaron:\tavg = %6.2f\tstd = %6.2f\n", avg_accum/TRIALS, std_accum/TRIALS);
307
308
309
310
311 return 0;
312#endif
313
314
315
316
317 ctx = (struct thread_context*) calloc(NUM_THREADS, sizeof(struct thread_context));
318 task = (pthread_t*) calloc(NUM_THREADS, sizeof(pthread_t));
319
320 if (NUM_AUX_THREADS) {
321 aux_ctx = (struct thread_context*) calloc(NUM_AUX_THREADS, sizeof(struct thread_context));
322 aux_task = (pthread_t*) calloc(NUM_AUX_THREADS, sizeof(pthread_t));
323 }
324
325 srand(0); /* something repeatable for now */
326
327 fd = open("semaphores", O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
328
329 CALL( init_litmus() );
330
331 for (i = 0; i < NUM_AUX_THREADS; i++) {
332 aux_ctx[i].id = i;
333		CALL( pthread_create(aux_task + i, NULL, aux_thread, aux_ctx + i) ); /* aux_ctx, not ctx */
334 }
335
336 for (i = 0; i < NUM_THREADS; i++) {
337 ctx[i].id = i;
338 ctx[i].fd = fd;
339 ctx[i].rand = rand();
340 memset(&ctx[i].mig_count, 0, sizeof(ctx[i].mig_count));
341 CALL( pthread_create(task + i, NULL, rt_thread, ctx + i) );
342 }
343
344 if (NUM_AUX_THREADS) {
345 TH_CALL( init_rt_thread() );
346 TH_CALL( sporadic_task_ns(EXEC_COST, PERIOD + 10*NUM_THREADS+1, 0, 0,
347 LITMUS_LOWEST_PRIORITY, RT_CLASS_SOFT, NO_ENFORCEMENT, NO_SIGNALS, 1) );
348 TH_CALL( task_mode(LITMUS_RT_TASK) );
349
350		printf("[MASTER] Waiting for TS release.\n");
351 wait_for_ts_release();
352
353 CALL( enable_aux_rt_tasks(AUX_CURRENT) );
354
355 for(i = 0; i < 25000; ++i) {
356 sleep_next_period();
357 pthread_mutex_lock(&gMutex);
358 pthread_mutex_unlock(&gMutex);
359 }
360
361 CALL( disable_aux_rt_tasks(AUX_CURRENT) );
362 __sync_synchronize();
363 gAuxRun = 0;
364 __sync_synchronize();
365
366 for (i = 0; i < NUM_AUX_THREADS; i++)
367 pthread_join(aux_task[i], NULL);
368
369 TH_CALL( task_mode(BACKGROUND_TASK) );
370 }
371
372 for (i = 0; i < NUM_THREADS; i++)
373 pthread_join(task[i], NULL);
374
375 return 0;
376}
377
378int affinity_cost[] = {1, 4, 8, 16};
379
380int affinity_distance(struct thread_context* ctx, int a, int b)
381{
382 int i;
383 int dist;
384
385 if(a >= 0 && b >= 0) {
386 for(i = 0; i <= 3; ++i) {
387 if(a>>i == b>>i) {
388 dist = i;
389 goto out;
390 }
391 }
392 dist = 0; // hopefully never reached.
393 }
394 else {
395 dist = 0;
396 }
397
398out:
399 //printf("[%d]: distance: %d -> %d = %d\n", ctx->id, a, b, dist);
400
401 ++(ctx->mig_count[dist]);
402
403 return dist;
404
405// int groups[] = {2, 4, 8};
406// int i;
407//
408// if(a < 0 || b < 0)
409// return (sizeof(groups)/sizeof(groups[0])); // worst affinity
410//
411// // no migration
412// if(a == b)
413// return 0;
414//
415// for(i = 0; i < sizeof(groups)/sizeof(groups[0]); ++i) {
416// if(a/groups[i] == b/groups[i])
417// return (i+1);
418// }
419// assert(0);
420// return -1;
421}
422
423
424void* aux_thread(void* _ctx)
425{
426 struct thread_context *ctx = (struct thread_context*)_ctx;
427
428 while (gAuxRun) {
429 pthread_mutex_lock(&gMutex);
430 pthread_mutex_unlock(&gMutex);
431 }
432
433 return ctx;
434}
435
436void* rt_thread(void* _ctx)
437{
438 int i;
439 int do_exit = 0;
440 int last_replica = -1;
441
442 struct thread_context *ctx = (struct thread_context*)_ctx;
443
444 TH_CALL( init_rt_thread() );
445
446 /* Vary period a little bit. */
447 TH_CALL( sporadic_task_ns(EXEC_COST, PERIOD + 10*ctx->id, 0, 0,
448 LITMUS_LOWEST_PRIORITY, RT_CLASS_SOFT, NO_ENFORCEMENT, NO_SIGNALS, 1) );
449
450 if(USE_KFMLP) {
451 ctx->kexclu = open_kfmlp_gpu_sem(ctx->fd,
452 0, /* name */
453 NUM_GPUS,
454 GPU_OFFSET,
455 NUM_SIMULT_USERS,
456 ENABLE_AFFINITY
457 );
458 }
459 else {
460// ctx->kexclu = open_ikglp_sem(ctx->fd, 0, &NUM_GPUS);
461 ctx->kexclu = open_ikglp_gpu_sem(ctx->fd,
462 0, /* name */
463 NUM_GPUS,
464 GPU_OFFSET,
465 NUM_SIMULT_USERS,
466 ENABLE_AFFINITY,
467 RELAX_FIFO_MAX_LEN
468 );
469 }
470 if(ctx->kexclu < 0)
471 perror("open_kexclu_sem");
472 else
473 printf("kexclu od = %d\n", ctx->kexclu);
474
475 for (i = 0; i < NUM_SEMS; ++i) {
476 ctx->od[i] = open_rsm_sem(ctx->fd, i + ctx->kexclu + 2);
477 if(ctx->od[i] < 0)
478 perror("open_rsm_sem");
479 else
480 printf("rsm[%d] od = %d\n", i, ctx->od[i]);
481 }
482
483 TH_CALL( task_mode(LITMUS_RT_TASK) );
484
485	printf("[%d] Waiting for TS release.\n", ctx->id);
486 wait_for_ts_release();
487 ctx->count = 0;
488
489// if (ctx->id == 0 && NUM_AUX_THREADS) {
490// CALL( enable_aux_rt_tasks() );
491// }
492
493 do {
494 int first = (int)(NUM_SEMS * (rand_r(&(ctx->rand)) / (RAND_MAX + 1.0)));
495 int last = (first + NEST_DEPTH - 1 >= NUM_SEMS) ? NUM_SEMS - 1 : first + NEST_DEPTH - 1;
496 int dgl_size = last - first + 1;
497 int replica = -1;
498 int distance;
499
500 int dgl[dgl_size];
501
502 // construct the DGL
503 for(i = first; i <= last; ++i) {
504 dgl[i-first] = ctx->od[i];
505 }
506
507 replica = litmus_lock(ctx->kexclu);
508
509 //printf("[%d] got kexclu replica %d.\n", ctx->id, replica);
510 //fflush(stdout);
511
512 distance = affinity_distance(ctx, replica, last_replica);
513
514 if(USE_DYNAMIC_GROUP_LOCKS) {
515 litmus_dgl_lock(dgl, dgl_size);
516 }
517 else {
518 for(i = 0; i < dgl_size; ++i) {
519 litmus_lock(dgl[i]);
520 }
521 }
522
523 //do_exit = nested_job(ctx, &count, &first, affinity_cost[distance]);
524 do_exit = job(ctx, affinity_cost[distance]);
525
526 if(USE_DYNAMIC_GROUP_LOCKS) {
527 litmus_dgl_unlock(dgl, dgl_size);
528 }
529 else {
530 for(i = dgl_size - 1; i >= 0; --i) {
531 litmus_unlock(dgl[i]);
532 }
533 }
534
535 //printf("[%d]: freeing kexclu replica %d.\n", ctx->id, replica);
536 //fflush(stdout);
537
538 litmus_unlock(ctx->kexclu);
539
540 last_replica = replica;
541
542 if(SLEEP_BETWEEN_JOBS && !do_exit) {
543 sleep_next_period();
544 }
545 } while(!do_exit);
546
547// if (ctx->id == 0 && NUM_AUX_THREADS) {
548// gAuxRun = 0;
549// __sync_synchronize();
550// CALL( disable_aux_rt_tasks() );
551// }
552
553 /*****
554 * 4) Transition to background mode.
555 */
556 TH_CALL( task_mode(BACKGROUND_TASK) );
557
558 for(i = 0; i < sizeof(ctx->mig_count)/sizeof(ctx->mig_count[0]); ++i)
559 {
560 printf("[%d]: mig_count[%d] = %d\n", ctx->id, i, ctx->mig_count[i]);
561 }
562
563 return NULL;
564}
565
566//int nested_job(struct thread_context* ctx, int *count, int *next, int runfactor)
567//{
568// int ret;
569//
570// if(*count == 0 || *next == NUM_SEMS)
571// {
572// ret = job(ctx, runfactor);
573// }
574// else
575// {
576// int which_sem = *next;
577// int rsm_od = ctx->od[which_sem];
578//
579// ++(*next);
580// --(*count);
581//
582// //printf("[%d]: trying to get semaphore %d.\n", ctx->id, which_sem);
583// //fflush(stdout);
584// litmus_lock(rsm_od);
585//
586// //printf("[%d] got semaphore %d.\n", ctx->id, which_sem);
587// //fflush(stdout);
588// ret = nested_job(ctx, count, next, runfactor);
589//
590// //printf("[%d]: freeing semaphore %d.\n", ctx->id, which_sem);
591// //fflush(stdout);
592// litmus_unlock(rsm_od);
593// }
594//
595//return(ret);
596//}
597
598
599void dirty_kb(int kb)
600{
601 int32_t one_kb[256];
602 int32_t sum = 0;
603 int32_t i;
604
605 if(!kb)
606 return;
607
608 for (i = 0; i < 256; i++)
609 sum += one_kb[i];
610 kb--;
611 /* prevent tail recursion */
612 if (kb)
613 dirty_kb(kb);
614 for (i = 0; i < 256; i++)
615 sum += one_kb[i];
616}
617
618int job(struct thread_context* ctx, int runfactor)
619{
620 //struct timespec tosleep = {0, 100000}; // 0.1 ms
621
622 //printf("[%d]: runfactor = %d\n", ctx->id, runfactor);
623
624 //dirty_kb(8 * runfactor);
625 dirty_kb(1 * runfactor);
626 //nanosleep(&tosleep, NULL);
627
628 /* Don't exit. */
629 //return ctx->count++ > 100;
630 //return ctx->count++ > 12000;
631 //return ctx->count++ > 120000;
632 return ctx->count++ > 25000; // controls number of jobs per task
633}
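
A small self-contained check of the distance rule used by affinity_distance() above: the length of the shared high-order-bit prefix of two GPU indices determines the migration cost, and smaller is closer. The values below are illustrative.

#include <assert.h>

/* same comparison rule as affinity_distance(), minus the bookkeeping */
static int distance(int a, int b)
{
	int i;
	for (i = 0; i <= 3; ++i)
		if (a >> i == b >> i)
			return i;
	return 0;
}

int main(void)
{
	assert(distance(2, 2) == 0); /* same GPU: no migration */
	assert(distance(2, 3) == 1); /* neighbors within a pair */
	assert(distance(1, 3) == 2); /* within the same group of four */
	assert(distance(3, 4) == 3); /* within the same group of eight */
	return 0;
}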
diff --git a/gpu/locktest.c b/gpu/locktest.c
new file mode 100644
index 0000000..bc4fc54
--- /dev/null
+++ b/gpu/locktest.c
@@ -0,0 +1,206 @@
1#include <stdio.h>
2#include <stdlib.h>
3#include <stdint.h>
4#include <unistd.h>
5#include <assert.h>
6#include <errno.h>
7#include <sys/types.h>
8#include <sys/stat.h>
9#include <fcntl.h>
10
11/* Include gettid() */
12#include <sys/types.h>
13
14/* Include threading support. */
15#include <pthread.h>
16
17/* Include the LITMUS^RT API.*/
18#include "litmus.h"
19
20/* Catch errors.
21 */
22#define CALL( exp ) do { \
23 int ret; \
24 ret = exp; \
25 if (ret != 0) \
26 fprintf(stderr, "%s failed: %m\n", #exp);\
27 else \
28 fprintf(stderr, "%s ok.\n", #exp); \
29 } while (0)
30
31#define TH_CALL( exp ) do { \
32 int ret; \
33 ret = exp; \
34 if (ret != 0) \
35 fprintf(stderr, "[%d] %s failed: %m\n", ctx->id, #exp); \
36 else \
37 fprintf(stderr, "[%d] %s ok.\n", ctx->id, #exp); \
38 } while (0)
39
40#define TH_SAFE_CALL( exp ) do { \
41 int ret; \
42 fprintf(stderr, "[%d] calling %s...\n", ctx->id, #exp); \
43 ret = exp; \
44 if (ret != 0) \
45 fprintf(stderr, "\t...[%d] %s failed: %m\n", ctx->id, #exp); \
46 else \
47 fprintf(stderr, "\t...[%d] %s ok.\n", ctx->id, #exp); \
48 } while (0)
49
50
51/* these are only default values */
52int NUM_THREADS=3;
53int NUM_SEMS=10;
54
55#define MAX_SEMS 1000
56
57#define EXEC_COST 10
58#define PERIOD 100
59
60/* The information passed to each thread. Could be anything. */
61struct thread_context {
62 int id;
63 int fd;
64 int od[MAX_SEMS];
65 int count;
66 unsigned int rand;
67};
68
69void* rt_thread(void* _ctx);
70int nested_job(struct thread_context* ctx, int *count, int *next);
71int job(struct thread_context*);
72
73#define OPTSTR "t:s:"
74
75int main(int argc, char** argv)
76{
77 int i;
78 struct thread_context* ctx;
79 pthread_t* task;
80 int fd;
81
82 int opt;
83 while((opt = getopt(argc, argv, OPTSTR)) != -1) {
84 switch(opt) {
85 case 't':
86 NUM_THREADS = atoi(optarg);
87 break;
88 case 's':
89 NUM_SEMS = atoi(optarg);
90 assert(NUM_SEMS <= MAX_SEMS);
91 break;
92 default:
93 fprintf(stderr, "Unknown option: %c\n", opt);
94 exit(-1);
95 break;
96 }
97 }
98
99 ctx = (struct thread_context*) calloc(NUM_THREADS, sizeof(struct thread_context));
100 task = (pthread_t*) calloc(NUM_THREADS, sizeof(pthread_t));
101
102 srand(0); /* something repeatable for now */
103
104 fd = open("semaphores", O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
105
106 CALL( init_litmus() );
107
108 for (i = 0; i < NUM_THREADS; i++) {
109 ctx[i].id = i;
110 ctx[i].fd = fd;
111 ctx[i].rand = rand();
112 CALL( pthread_create(task + i, NULL, rt_thread, ctx + i) );
113 }
114
115
116 for (i = 0; i < NUM_THREADS; i++)
117 pthread_join(task[i], NULL);
118
119
120 return 0;
121}
122
123void* rt_thread(void* _ctx)
124{
125 int i;
126 int do_exit = 0;
127
128 struct thread_context *ctx = (struct thread_context*)_ctx;
129
130 TH_CALL( init_rt_thread() );
131
132 /* Vary period a little bit. */
133 TH_CALL( sporadic_global(EXEC_COST, PERIOD + 10*ctx->id) );
134
135 for (i = 0; i < NUM_SEMS; i++) {
136 ctx->od[i] = open_fmlp_sem(ctx->fd, i);
137 if(ctx->od[i] < 0)
138 perror("open_fmlp_sem");
139 }
140
141 TH_CALL( task_mode(LITMUS_RT_TASK) );
142
143
144	printf("[%d] Waiting for TS release.\n", ctx->id);
145 wait_for_ts_release();
146 ctx->count = 0;
147
148 do {
149 int which_sem = (int)(NUM_SEMS * (rand_r(&(ctx->rand)) / (RAND_MAX + 1.0)));
150
151 printf("[%d]: trying to get semaphore %d.\n", ctx->id, which_sem);
152 fflush(stdout);
153
154		TH_SAFE_CALL ( litmus_lock(ctx->od[which_sem]) ); /* lock the od, not the raw index */
155
156 printf("[%d] got semaphore %d.\n", ctx->id, which_sem);
157 fflush(stdout);
158
159 do_exit = job(ctx);
160
161 printf("[%d]: freeing semaphore %d.\n", ctx->id, which_sem);
162 fflush(stdout);
163
164		TH_SAFE_CALL ( litmus_unlock(ctx->od[which_sem]) );
165
166 if(!do_exit) {
167 sleep_next_period();
168 }
169 } while(!do_exit);
170
171 /*****
172 * 4) Transition to background mode.
173 */
174 TH_CALL( task_mode(BACKGROUND_TASK) );
175
176
177 return NULL;
178}
179
180void dirty_kb(int kb)
181{
182 int32_t one_kb[256];
183 int32_t sum = 0;
184 int32_t i;
185
186 for (i = 0; i < 256; i++)
187 sum += one_kb[i];
188 kb--;
189 /* prevent tail recursion */
190 if (kb)
191 dirty_kb(kb);
192 for (i = 0; i < 256; i++)
193 sum += one_kb[i];
194}
195
196int job(struct thread_context* ctx)
197{
198 /* Do real-time calculation. */
199 dirty_kb(8);
200
201 /* Don't exit. */
202 //return ctx->count++ > 100;
203 //return ctx->count++ > 12000;
204 //return ctx->count++ > 120000;
205 return ctx->count++ > 30000; // controls number of jobs per task
206}
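
FMLP semaphores are addressed by the object descriptor (od) returned from open_fmlp_sem(), not by the index used to name them. A condensed sketch of the pattern, with fd and name as in rt_thread() above:

static void fmlp_critical_section(int fd, int name)
{
	int od = open_fmlp_sem(fd, name); /* od: kernel object descriptor */
	if (od < 0) {
		perror("open_fmlp_sem");
		return;
	}
	litmus_lock(od);   /* may suspend until the semaphore is granted */
	/* ... critical section ... */
	litmus_unlock(od);
}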
diff --git a/gpu/nested.c b/gpu/nested.c
new file mode 100644
index 0000000..07e237b
--- /dev/null
+++ b/gpu/nested.c
@@ -0,0 +1,245 @@
1#include <stdio.h>
2#include <stdlib.h>
3#include <stdint.h>
4#include <unistd.h>
5#include <assert.h>
6#include <errno.h>
7#include <sys/types.h>
8#include <sys/stat.h>
9#include <fcntl.h>
10
11/* Include gettid() */
12#include <sys/types.h>
13
14/* Include threading support. */
15#include <pthread.h>
16
17/* Include the LITMUS^RT API.*/
18#include "litmus.h"
19
20/* Catch errors.
21 */
22#define CALL( exp ) do { \
23 int ret; \
24 ret = exp; \
25 if (ret != 0) \
26 fprintf(stderr, "%s failed: %m\n", #exp);\
27 else \
28 fprintf(stderr, "%s ok.\n", #exp); \
29 } while (0)
30
31#define TH_CALL( exp ) do { \
32 int ret; \
33 ret = exp; \
34 if (ret != 0) \
35 fprintf(stderr, "[%d] %s failed: %m\n", ctx->id, #exp); \
36 else \
37 fprintf(stderr, "[%d] %s ok.\n", ctx->id, #exp); \
38 } while (0)
39
40#define TH_SAFE_CALL( exp ) do { \
41 int ret; \
42 fprintf(stderr, "[%d] calling %s...\n", ctx->id, #exp); \
43 ret = exp; \
44 if (ret != 0) \
45 fprintf(stderr, "\t...[%d] %s failed: %m\n", ctx->id, #exp); \
46 else \
47 fprintf(stderr, "\t...[%d] %s ok.\n", ctx->id, #exp); \
48 } while (0)
49
50
51#define NUM_CPUS 4
52
53//#define NUM_THREADS 3
54int NUM_THREADS=3;
55
56/* NEST_DEPTH may not be greater than NUM_SEMS. */
57//#define NUM_SEMS 10
58int NUM_SEMS=10;
59
60int SLEEP_BETWEEN_JOBS = 1;
61
62#define MAX_SEMS 1000
63
64//#define NEST_DEPTH 5
65int NEST_DEPTH=5;
66
67#define EXEC_COST 1000*1
68#define PERIOD 1000*10
69
70/* The information passed to each thread. Could be anything. */
71struct thread_context {
72 int id;
73 int fd;
74 int od[MAX_SEMS];
75 int count;
76 unsigned int rand;
77};
78
79void* rt_thread(void* _ctx);
80int nested_job(struct thread_context* ctx, int *count, int *next);
81int job(struct thread_context*);
82
83#define OPTSTR "t:s:d:f"
84
85int main(int argc, char** argv)
86{
87 int i;
88 struct thread_context* ctx; //[NUM_THREADS];
89 pthread_t* task; //[NUM_THREADS];
90 int fd;
91
92 int opt;
93 while((opt = getopt(argc, argv, OPTSTR)) != -1) {
94 switch(opt) {
95 case 't':
96 NUM_THREADS = atoi(optarg);
97 break;
98 case 's':
99 NUM_SEMS = atoi(optarg);
100 assert(NUM_SEMS <= MAX_SEMS);
101 break;
102 case 'd':
103 NEST_DEPTH = atoi(optarg);
104 break;
105 case 'f':
106 SLEEP_BETWEEN_JOBS = 0;
107 break;
108 default:
109 fprintf(stderr, "Unknown option: %c\n", opt);
110 exit(-1);
111 break;
112 }
113 }
114
115 ctx = (struct thread_context*) calloc(NUM_THREADS, sizeof(struct thread_context));
116 task = (pthread_t*) calloc(NUM_THREADS, sizeof(pthread_t));
117
118 srand(0); /* something repeatable for now */
119
120 fd = open("semaphores", O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
121
122 CALL( init_litmus() );
123
124 for (i = 0; i < NUM_THREADS; i++) {
125 ctx[i].id = i;
126 ctx[i].fd = fd;
127 ctx[i].rand = rand();
128 CALL( pthread_create(task + i, NULL, rt_thread, ctx + i) );
129 }
130
131
132 for (i = 0; i < NUM_THREADS; i++)
133 pthread_join(task[i], NULL);
134
135
136 return 0;
137}
138
139void* rt_thread(void* _ctx)
140{
141 int i;
142 int do_exit = 0;
143
144 struct thread_context *ctx = (struct thread_context*)_ctx;
145
146 /* Make presence visible. */
147 //printf("RT Thread %d active.\n", ctx->id);
148
149 TH_CALL( init_rt_thread() );
150 TH_CALL( sporadic_task_ns(EXEC_COST, PERIOD + 10*ctx->id, 0, 0,
151 LITMUS_LOWEST_PRIORITY, RT_CLASS_SOFT, NO_ENFORCEMENT, NO_SIGNALS, 0) );
152
153 for (i = 0; i < NUM_SEMS; i++) {
154 ctx->od[i] = open_rsm_sem(ctx->fd, i);
155 if(ctx->od[i] < 0)
156 perror("open_rsm_sem");
157 //printf("[%d] ctx->od[%d]: %d\n", ctx->id, i, ctx->od[i]);
158 }
159
160 TH_CALL( task_mode(LITMUS_RT_TASK) );
161
162
163	printf("[%d] Waiting for TS release.\n", ctx->id);
164 wait_for_ts_release();
165 ctx->count = 0;
166
167 do {
168 int first = (int)(NUM_SEMS * (rand_r(&(ctx->rand)) / (RAND_MAX + 1.0)));
169 int count = NEST_DEPTH;
170 do_exit = nested_job(ctx, &count, &first);
171
172 if(SLEEP_BETWEEN_JOBS && !do_exit) {
173 sleep_next_period();
174 }
175 } while(!do_exit);
176
177 /*****
178 * 4) Transition to background mode.
179 */
180 TH_CALL( task_mode(BACKGROUND_TASK) );
181
182
183 return NULL;
184}
185
186
187int nested_job(struct thread_context* ctx, int *count, int *next)
188{
189 int ret;
190
191 if(*count == 0 || *next == NUM_SEMS) /* base case */
192 {
193 ret = job(ctx);
194 }
195 else
196 {
197 int which_sem = ctx->od[*next];
198
199 ++(*next);
200 --(*count);
201
202 printf("[%d]: trying to get semaphore %d.\n", ctx->id, which_sem);
203 fflush(stdout);
204 TH_SAFE_CALL ( litmus_lock(which_sem) );
205 printf("[%d] got semaphore %d.\n", ctx->id, which_sem);
206 fflush(stdout);
207 ret = nested_job(ctx, count, next);
208 TH_SAFE_CALL ( litmus_unlock(which_sem) );
209 fflush(stdout);
210 }
211
212 return(ret);
213}
214
215
216
217void dirty_kb(int kb)
218{
219 int32_t one_kb[256];
220 int32_t sum = 0;
221 int32_t i;
222
223 for (i = 0; i < 256; i++)
224 sum += one_kb[i];
225 kb--;
226 /* prevent tail recursion */
227 if (kb)
228 dirty_kb(kb);
229 for (i = 0; i < 256; i++)
230 sum += one_kb[i];
231}
232
233
234
235int job(struct thread_context* ctx)
236{
237 /* Do real-time calculation. */
238 dirty_kb(8);
239
240 /* Don't exit. */
241 //return ctx->count++ > 100;
242 //return ctx->count++ > 12000;
243 //return ctx->count++ > 120000;
244 return ctx->count++ > 30000;
245}
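
nested_job() always acquires semaphores in ascending index order and releases them as the recursion unwinds, which is what keeps the nesting deadlock-free. An equivalent iterative sketch, using the definitions from nested.c above:

static int nested_job_iter(struct thread_context* ctx, int count, int next)
{
	int ret;
	int taken = 0;

	while (count > 0 && next + taken < NUM_SEMS) { /* ascending index order */
		litmus_lock(ctx->od[next + taken]);
		++taken;
		--count;
	}

	ret = job(ctx); /* innermost "critical section" */

	while (taken > 0) /* release in reverse order, like the unwinding recursion */
		litmus_unlock(ctx->od[next + --taken]);

	return ret;
}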
diff --git a/gpu/normal_task.c b/gpu/normal_task.c
new file mode 100644
index 0000000..ffc95b1
--- /dev/null
+++ b/gpu/normal_task.c
@@ -0,0 +1,84 @@
1#include <stdio.h>
2#include <stdlib.h>
3#include <string.h>
4#include <stdint.h>
5#include <unistd.h>
6#include <assert.h>
7#include <errno.h>
8#include <sys/types.h>
9#include <sys/stat.h>
10#include <fcntl.h>
11#include <time.h>
12#include <math.h>
13
14/* Include gettid() */
15#include <sys/types.h>
16
17/* Include threading support. */
18#include <pthread.h>
19
20/* Include the LITMUS^RT API.*/
21#include "litmus.h"
22
23/* Catch errors.
24 */
25#if 1
26#define CALL( exp ) do { \
27 int ret; \
28 ret = exp; \
29 if (ret != 0) \
30 fprintf(stderr, "%s failed: %m\n", #exp);\
31 else \
32 fprintf(stderr, "%s ok.\n", #exp); \
33 } while (0)
34
35#define TH_CALL( exp ) do { \
36 int ret; \
37 ret = exp; \
38 if (ret != 0) \
39 fprintf(stderr, "[%d] %s failed: %m\n", ctx->id, #exp); \
40 else \
41 fprintf(stderr, "[%d] %s ok.\n", ctx->id, #exp); \
42 } while (0)
43
44#define TH_SAFE_CALL( exp ) do { \
45 int ret; \
46 fprintf(stderr, "[%d] calling %s...\n", ctx->id, #exp); \
47 ret = exp; \
48 if (ret != 0) \
49 fprintf(stderr, "\t...[%d] %s failed: %m\n", ctx->id, #exp); \
50 else \
51 fprintf(stderr, "\t...[%d] %s ok.\n", ctx->id, #exp); \
52 } while (0)
53#else
54#define CALL( exp )
55#define TH_CALL( exp )
56#define TH_SAFE_CALL( exp )
57#endif
58
59/* these are only default values */
60// 1000 = 1us
61#define EXEC_COST 1000*1
62#define PERIOD 2*1000*100
63
64
65int main(int argc, char** argv)
66{
67 CALL( init_litmus() );
68
69 CALL( init_rt_thread() );
70 CALL( sporadic_task_ns(EXEC_COST, PERIOD, 0, 0,
71 LITMUS_LOWEST_PRIORITY, RT_CLASS_SOFT, NO_ENFORCEMENT, NO_SIGNALS, 1) );
72 //CALL( task_mode(LITMUS_RT_TASK) );
73
74	fprintf(stdout, "Waiting for TS release.\n");
75 wait_for_ts_release();
76
77 fprintf(stdout, "Released!\n");
78
79 //sleep_next_period();
80 //CALL( task_mode(BACKGROUND_TASK) );
81
82 return 0;
83}
84
diff --git a/gpu/rtspin_fake_cuda.cpp b/gpu/rtspin_fake_cuda.cpp
new file mode 100644
index 0000000..667c675
--- /dev/null
+++ b/gpu/rtspin_fake_cuda.cpp
@@ -0,0 +1,1169 @@
1#include <sys/time.h>
2
3#include <stdint.h>
4#include <stdio.h>
5#include <stdlib.h>
6#include <unistd.h>
7#include <time.h>
8#include <assert.h>
9#include <fcntl.h>
10#include <errno.h>
11
12#include <blitz/array.h>
13
14#include <boost/interprocess/managed_shared_memory.hpp>
15#include <boost/interprocess/sync/interprocess_barrier.hpp>
16#include <boost/interprocess/sync/interprocess_mutex.hpp>
17
18#include "litmus.h"
19
20using namespace blitz;
21using namespace std;
22using namespace boost::interprocess;
23
24#define RESET_RELEASE_ON_MISS
25
26
27void bail_out(const char* msg)
28{
29 perror(msg);
30 exit(-1 * errno);
31}
32
33
34static void usage(char *error) {
35 fprintf(stderr, "Error: %s\n", error);
36 fprintf(stderr,
37 "Usage:\n"
38 " rt_spin [COMMON-OPTS] WCET PERIOD DURATION\n"
39 " rt_spin [COMMON-OPTS] -f FILE [-o COLUMN] WCET PERIOD\n"
40 " rt_spin -l\n"
41 "\n"
42 "COMMON-OPTS = [-w] [-p PARTITION] [-c CLASS] [-s SCALE]\n"
43 "\n"
44 "WCET and PERIOD are milliseconds, DURATION is seconds.\n");
45 exit(EXIT_FAILURE);
46}
47
48#define NUMS 4096
49static int num[NUMS];
50
51#define PAGE_SIZE (1024*4)
52
53bool ENABLE_WAIT = true;
54bool GPU_TASK = false;
55bool ENABLE_AFFINITY = false;
56bool USE_KFMLP = false;
57bool RELAX_FIFO_MAX_LEN = false;
58bool USE_DYNAMIC_GROUP_LOCKS = false;
59bool BROADCAST_STATE = false;
60bool ENABLE_CHUNKING = false;
61bool MIGRATE_VIA_SYSMEM = false;
62
63int GPU_PARTITION = 0;
64int GPU_PARTITION_SIZE = 0;
65int NUM_SIMULT_USERS = 1;
66size_t SEND_SIZE = 0;
67size_t RECV_SIZE = 0;
68size_t STATE_SIZE = 0;
69size_t CHUNK_SIZE = PAGE_SIZE;
70
71
72#define MAX_GPUS 8
73
74int KEXCLU_LOCK;
75int EE_LOCKS[MAX_GPUS];
76int CE_SEND_LOCKS[MAX_GPUS];
77int CE_RECV_LOCKS[MAX_GPUS];
78
79int CUR_DEVICE = -1;
80int LAST_DEVICE = -1;
81
82bool useEngineLocks()
83{
84 return(NUM_SIMULT_USERS != 1);
85}
86
87int gpuCyclesPerSecond = 0;
88
89uint64_t *init_release_time = NULL;
90barrier *release_barrier = NULL;
91barrier *gpu_barrier = NULL;
92interprocess_mutex *gpu_mgmt_mutexes = NULL;
93managed_shared_memory *segment_ptr = NULL;
94managed_shared_memory *release_segment_ptr = NULL;
95
96// observed average rate when four GPUs on same node in use from pagelocked memory.
97// about 1/3 to 1/4 this when there is no bus contention.
98//const double msPerByte = 4.22e-07;
99//const double transOverhead = 0.01008; // also observed.
100
101
102
103char *d_send_data[MAX_GPUS] = {0};
104char *d_recv_data[MAX_GPUS] = {0};
105char *d_state_data[MAX_GPUS] = {0};
106
107//cudaStream_t streams[MAX_GPUS];
108
109char *h_send_data = 0;
110char *h_recv_data = 0;
111char *h_state_data = 0;
112
113
114#include <sys/mman.h>
115#define USE_PAGE_LOCKED_MEMORY
116#ifdef USE_PAGE_LOCKED_MEMORY
117#define c_malloc(s) \
118 mmap(NULL, s , \
119 PROT_READ | PROT_WRITE, \
120 MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, \
121 -1, 0)
122#else
123#define c_malloc(s) malloc(s)
124#endif
125
126typedef int cudaError_t;
127#define cudaSuccess 0
128
129enum cudaMemcpyKind {
130cudaMemcpyHostToDevice = 0,
131cudaMemcpyDeviceToHost = 1,
132cudaMemcpyDeviceToDevice = 2,
133};
134
135cudaError_t cudaGetLastError()
136{
137 return cudaSuccess;
138}
139
140////////////////////////////////////////////////////////////////////////
141////////////////////////////////////////////////////////////////////////
142////////////////////////////////////////////////////////////////////////
143////////////////////////////////////////////////////////////////////////
144
145struct ce_lock_state
146{
147 int locks[2];
148 size_t num_locks;
149 size_t budget_remaining;
150 bool locked;
151
152 ce_lock_state(int device_a, enum cudaMemcpyKind kind, size_t size, int device_b = -1) {
153 num_locks = (device_a != -1) + (device_b != -1);
154
155 if(device_a != -1) {
156 locks[0] = (kind == cudaMemcpyHostToDevice) ?
157 CE_SEND_LOCKS[device_a] : CE_RECV_LOCKS[device_a];
158 }
159
160 if(device_b != -1) {
161 assert(kind == cudaMemcpyDeviceToDevice);
162
163 locks[1] = CE_RECV_LOCKS[device_b];
164
165 if(locks[1] < locks[0]) {
166 int temp = locks[1];
167 locks[1] = locks[0];
168 locks[0] = temp;
169 }
170 }
171
172 if(!ENABLE_CHUNKING)
173 budget_remaining = size;
174 else
175 budget_remaining = CHUNK_SIZE;
176 }
177
178 void lock() {
179 if(USE_DYNAMIC_GROUP_LOCKS) {
180 litmus_dgl_lock(locks, num_locks);
181 }
182 else
183 {
184 for(int l = 0; l < num_locks; ++l)
185 {
186 litmus_lock(locks[l]);
187 }
188 }
189 locked = true;
190 }
191
192 void unlock() {
193 if(USE_DYNAMIC_GROUP_LOCKS) {
194 litmus_dgl_unlock(locks, num_locks);
195 }
196 else
197 {
198 // reverse order
199 for(int l = num_locks - 1; l >= 0; --l)
200 {
201 litmus_unlock(locks[l]);
202 }
203 }
204 locked = false;
205 }
206
207 void refresh() {
208 budget_remaining = CHUNK_SIZE;
209 }
210
211	bool budgetIsExhausted(size_t tosend) { /* renamed from budgetIsAvailable: true means a refresh is needed */
212 return(tosend >= budget_remaining);
213 }
214
215 void decreaseBudget(size_t spent) {
216 budget_remaining -= spent;
217 }
218};
219
220// precondition: if do_locking == true, locks in state are held.
221cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count,
222 enum cudaMemcpyKind kind,
223 ce_lock_state* state)
224{
225 cudaError_t ret = cudaSuccess;
226 int remaining = count;
227
228 char* dst = (char*)a_dst;
229 const char* src = (const char*)a_src;
230
231 // disable chunking, if needed, by setting chunk_size equal to the
232 // amount of data to be copied.
233 int chunk_size = (ENABLE_CHUNKING) ? CHUNK_SIZE : count;
234 int i = 0;
235
236 while(remaining != 0)
237 {
238 int bytesToCopy = std::min(remaining, chunk_size);
239
240		if(state && state->budgetIsExhausted(bytesToCopy) && state->locked) {
241 //cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) );
242 ret = cudaGetLastError();
243
244 if(ret != cudaSuccess)
245 {
246 break;
247 }
248
249 state->unlock();
250			state->refresh(); // replenish.
251 // we can only run out of
252 // budget if chunking is enabled.
253 // we presume that init budget would
254 // be set to cover entire memcpy
255 // if chunking were disabled.
256 }
257
258 if(state && !state->locked) {
259 state->lock();
260 }
261
262 //ret = cudaMemcpy(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind);
263 //cudaMemcpyAsync(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind, streams[CUR_DEVICE]);
264
265 if(state) {
266 state->decreaseBudget(bytesToCopy);
267 }
268
269// if(ret != cudaSuccess)
270// {
271// break;
272// }
273
274 ++i;
275 remaining -= bytesToCopy;
276 }
277 return ret;
278}
279
280cudaError_t chunkMemcpy(void* a_dst, const void* a_src, size_t count,
281 enum cudaMemcpyKind kind,
282 int device_a = -1, // device_a == -1 disables locking
283 bool do_locking = true,
284 int device_b = -1)
285{
286 cudaError_t ret;
287 if(!do_locking || device_a == -1) {
288 ret = __chunkMemcpy(a_dst, a_src, count, kind, NULL);
289 //cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) );
290 if(ret == cudaSuccess)
291 ret = cudaGetLastError();
292 }
293 else {
294 ce_lock_state state(device_a, kind, count, device_b);
295 state.lock();
296 ret = __chunkMemcpy(a_dst, a_src, count, kind, &state);
297 //cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) );
298 if(ret == cudaSuccess)
299 ret = cudaGetLastError();
300 state.unlock();
301 }
302 return ret;
303}
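
// Illustrative sketch (the names d_dst/h_src are assumptions, not from this
// file): a typical host-to-device send through chunkMemcpy(). Chunk-sized
// lock refreshing is handled internally via ce_lock_state, per the logic above.
static void example_send_to_current_gpu(char* d_dst, const char* h_src, size_t n)
{
	cudaError_t err = chunkMemcpy(d_dst, h_src, n,
				      cudaMemcpyHostToDevice,
				      CUR_DEVICE,        // device_a: lock this GPU's copy engine
				      useEngineLocks()); // lock only when engines are shared
	if (err != cudaSuccess)
		bail_out("chunkMemcpy");
}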
304
305
306////////////////////////////////////////////////////////////////////////
307////////////////////////////////////////////////////////////////////////
308////////////////////////////////////////////////////////////////////////
309
310
311inline uint64_t timespec_to_ns(const struct timespec& t)
312{
313	return((uint64_t)t.tv_sec * 1000000000ull + t.tv_nsec); /* integer math avoids double rounding */
314}
315
316inline struct timespec ns_to_timespec(const uint64_t& ns)
317{
318	struct timespec temp = {(time_t)(ns / 1000000000ull), (long)(ns % 1000000000ull)}; /* seconds, ns remainder */
319 return(temp);
320}
321
322inline uint64_t clock_gettime_ns(clockid_t clk_id)
323{
324 struct timespec temp;
325 clock_gettime(clk_id, &temp);
326 return timespec_to_ns(temp);
327}
328
329
330
331static int loop_once(void)
332{
333 int i, j = 0;
334 for (i = 0; i < NUMS; i++)
335 j += num[i]++;
336 return j;
337}
338
339static int loop_for(double exec_time, double emergency_exit)
340{
341 double last_loop = 0, loop_start;
342 int tmp = 0;
343
344 double start = cputime();
345 double now = cputime();
346
347 while (now + last_loop < start + exec_time) {
348 loop_start = now;
349 tmp += loop_once();
350 now = cputime();
351 last_loop = now - loop_start;
352 if (emergency_exit && wctime() > emergency_exit) {
353 /* Oops --- this should only be possible if the execution time tracking
354 * is broken in the LITMUS^RT kernel. */
355 fprintf(stderr, "!!! rtspin/%d emergency exit!\n", getpid());
356 fprintf(stderr, "Something is seriously wrong! Do not ignore this.\n");
357 break;
358 }
359 }
360
361 return tmp;
362}
363
364static void allocate_locks()
365{
366 // allocate k-FMLP lock
367 int fd = open("semaphores", O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
368
369 int base_name = GPU_PARTITION * 1000;
370
371 if(USE_KFMLP) {
372 KEXCLU_LOCK = open_kfmlp_gpu_sem(fd,
373 base_name, /* name */
374 GPU_PARTITION_SIZE,
375 GPU_PARTITION*GPU_PARTITION_SIZE,
376 NUM_SIMULT_USERS,
377 ENABLE_AFFINITY
378 );
379 }
380 else {
381 KEXCLU_LOCK = open_ikglp_gpu_sem(fd,
382 base_name, /* name */
383 GPU_PARTITION_SIZE,
384 GPU_PARTITION*GPU_PARTITION_SIZE,
385 NUM_SIMULT_USERS,
386 ENABLE_AFFINITY,
387 RELAX_FIFO_MAX_LEN
388 );
389 }
390 if(KEXCLU_LOCK < 0)
391 perror("open_kexclu_sem");
392
393 if(NUM_SIMULT_USERS > 1)
394 {
395 // allocate the engine locks.
396 for (int i = 0; i < MAX_GPUS; ++i)
397 {
398 EE_LOCKS[i] = open_rsm_sem(fd, (i+1)*10 + base_name);
399 if(EE_LOCKS[i] < 0)
400 perror("open_rsm_sem");
401
402 CE_SEND_LOCKS[i] = open_rsm_sem(fd, (i+1)*10 + base_name + 1);
403 if(CE_SEND_LOCKS[i] < 0)
404 perror("open_rsm_sem");
405
406 if(NUM_SIMULT_USERS == 3)
407 {
408 // allocate a separate lock for the second copy engine
409 CE_RECV_LOCKS[i] = open_rsm_sem(fd, (i+1)*10 + base_name + 2);
410 if(CE_RECV_LOCKS[i] < 0)
411 perror("open_rsm_sem");
412 }
413 else
414 {
415 // share a single lock for the single copy engine
416 CE_RECV_LOCKS[i] = CE_SEND_LOCKS[i];
417 }
418 }
419 }
420}
421
422static void allocate_host_memory()
423{
424 // round up to page boundaries
425 size_t send_alloc_bytes = SEND_SIZE + (SEND_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
426 size_t recv_alloc_bytes = RECV_SIZE + (RECV_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
427 size_t state_alloc_bytes = STATE_SIZE + (STATE_SIZE%PAGE_SIZE != 0)*PAGE_SIZE;
428
429	printf("Allocating host memory. send = %zuB, recv = %zuB, state = %zuB\n",
430 send_alloc_bytes, recv_alloc_bytes, state_alloc_bytes);
431
432// if(send_alloc_bytes > 0)
433// {
434// h_send_data = (char *)c_malloc(send_alloc_bytes);
435// memset(h_send_data, 0x55, send_alloc_bytes); // write some random value
436// // this will open a connection to GPU 0 if there is no active context, so
437// // expect long stalls. LAME.
438// cutilSafeCall( cudaHostRegister(h_send_data, send_alloc_bytes, cudaHostRegisterPortable) );
439// }
440//
441// if(recv_alloc_bytes > 0)
442// {
443// h_recv_data = (char *)c_malloc(recv_alloc_bytes);
444// memset(h_recv_data, 0xAA, recv_alloc_bytes);
445// cutilSafeCall( cudaHostRegister(h_recv_data, recv_alloc_bytes, cudaHostRegisterPortable) );
446// }
447//
448// if(state_alloc_bytes > 0)
449// {
450// h_state_data = (char *)c_malloc(state_alloc_bytes);
451// memset(h_state_data, 0xCC, state_alloc_bytes); // write some random value
452// cutilSafeCall( cudaHostRegister(h_state_data, state_alloc_bytes, cudaHostRegisterPortable) );
453// }
454
455 printf("Host memory allocated.\n");
456}
457
458static void allocate_device_memory()
459{
460 printf("Allocating device memory.\n");
461 // establish a connection to each GPU.
462// for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
463// {
464// int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
465//
466// if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].lock();
467//
468// cutilSafeCall( cudaSetDevice(which_device) );
469// cutilSafeCall( cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0) );
470// cutilSafeCall( cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0) );
471//
472// cutilSafeCall( cudaStreamCreate(&streams[which_device]) );
473//
474// /* pre-allocate memory, pray there's enough to go around */
475// if(SEND_SIZE > 0) {
476// cutilSafeCall( cudaMalloc((void**)&d_send_data[which_device], SEND_SIZE) );
477// }
478// if(RECV_SIZE > 0) {
479// cutilSafeCall( cudaMalloc((void**)&h_recv_data[which_device], RECV_SIZE) );
480// }
481// if(STATE_SIZE > 0) {
482// cutilSafeCall( cudaMalloc((void**)&h_state_data[which_device], STATE_SIZE) );
483// }
484//
485// if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].unlock();
486// }
487 printf("Device memory allocated.\n");
488}
489
490static void configure_gpus()
491{
492 printf("Configuring GPU\n");
493
494// // SUSPEND WHEN BLOCKED!!
495// cutilSafeCall( cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync) );
496//
497// // establish a connection to each GPU.
498// for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
499// {
500// int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
501//
502// if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].lock();
503//
504// cutilSafeCall( cudaSetDevice(which_device) );
505// cutilSafeCall( cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 0) );
506// cutilSafeCall( cudaDeviceSetLimit(cudaLimitMallocHeapSize, 0) );
507//
508// cutilSafeCall( cudaStreamCreate(&streams[which_device]) );
509//
510// // enable P2P migrations.
511// // we assume all GPUs are on the same I/O hub.
512// for(int j = 0; j < GPU_PARTITION_SIZE; ++j)
513// {
514// int other_device = GPU_PARTITION*GPU_PARTITION_SIZE + j;
515//
516// if(which_device != other_device)
517// {
518// cutilSafeCall( cudaDeviceEnablePeerAccess(other_device, 0) );
519// }
520// }
521//
522// if(i == 0)
523// {
524// struct cudaDeviceProp pi;
525// cudaGetDeviceProperties(&pi, i);
526// gpuCyclesPerSecond = pi.clockRate * 1000; /* khz -> hz */
527// }
528//
529// if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].unlock();
530// }
531
532 printf("GPUs have been configured.\n");
533}
534
535static void init_cuda()
536{
537 configure_gpus();
538 allocate_host_memory();
539 allocate_device_memory();
540 allocate_locks();
541}
542
543static void exit_cuda()
544{
545 for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
546 {
547 int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
548
549 if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].lock();
550
551// cutilSafeCall( cudaSetDevice(which_device) );
552// cutilSafeCall( cudaDeviceReset() );
553
554 if(ENABLE_WAIT) gpu_mgmt_mutexes[which_device].unlock();
555 }
556}
557
558static void catchExit(void)
559{
560 if(GPU_TASK)
561 {
562 // try to unlock everything. litmus will prevent bogus calls.
563 if(NUM_SIMULT_USERS > 1)
564 {
565 for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
566 {
567 int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
568
569 litmus_unlock(EE_LOCKS[which_device]);
570 litmus_unlock(CE_SEND_LOCKS[which_device]);
571					if(NUM_SIMULT_USERS == 3) { /* a separate recv lock exists only with 3 users */
572 litmus_unlock(CE_RECV_LOCKS[which_device]);
573 }
574 }
575 }
576
577 if(CUR_DEVICE >= 0) {
578 unregister_nv_device(CUR_DEVICE);
579 }
580
581 litmus_unlock(KEXCLU_LOCK);
582 }
583}
584
585static void migrateToGPU(int destination)
586{
587 if(!BROADCAST_STATE && STATE_SIZE > 0)
588 {
589 if(MIGRATE_VIA_SYSMEM)
590 {
591 chunkMemcpy(h_state_data, d_state_data[LAST_DEVICE], STATE_SIZE,
592 cudaMemcpyDeviceToHost, LAST_DEVICE, useEngineLocks());
593 }
594 }
595
596// cutilSafeCall( cudaSetDevice(destination) );
597
598 if(!BROADCAST_STATE && STATE_SIZE > 0)
599 {
600 if(MIGRATE_VIA_SYSMEM)
601 {
602 chunkMemcpy(d_state_data[CUR_DEVICE], h_state_data, STATE_SIZE,
603 cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks());
604 }
605 else
606 {
607 chunkMemcpy(d_state_data[destination],
608 d_state_data[LAST_DEVICE],
609 STATE_SIZE,
610 cudaMemcpyDeviceToDevice,
611 CUR_DEVICE,
612 useEngineLocks(),
613 destination);
614 }
615 }
616}
617
618static void broadcastState(int from)
619{
620 if(STATE_SIZE > 0)
621 {
622 assert(CUR_DEVICE == from);
623
624 if(MIGRATE_VIA_SYSMEM)
625 {
626 chunkMemcpy(h_state_data, d_state_data[from], STATE_SIZE,
627 cudaMemcpyDeviceToHost, from, useEngineLocks());
628 }
629
630 for(int i = 0; i < GPU_PARTITION_SIZE; ++i)
631 {
632 int which_device = GPU_PARTITION*GPU_PARTITION_SIZE + i;
633 if(which_device != from)
634 {
635 if(MIGRATE_VIA_SYSMEM)
636 {
637// cutilSafeCall( cudaSetDevice(which_device) );
638 CUR_DEVICE = which_device; // temporary
639 chunkMemcpy(d_state_data[which_device], h_state_data, STATE_SIZE,
640 cudaMemcpyHostToDevice, which_device, useEngineLocks());
641 }
642 else
643 {
644 chunkMemcpy(d_state_data[which_device],
645 d_state_data[from],
646 STATE_SIZE,
647 cudaMemcpyDeviceToDevice,
648 from,
649 useEngineLocks(),
650 which_device);
651 }
652 }
653 }
654
655 if(MIGRATE_VIA_SYSMEM && CUR_DEVICE != from)
656 {
657// cutilSafeCall( cudaSetDevice(from) );
658 CUR_DEVICE = from;
659 }
660 }
661}
662
663//// Executes on the graphics card: busy-waits for the given number of GPU clock cycles.
664//__global__ void docudaspin(unsigned int cycles)
665//{
666// long long unsigned int elapsed = 0;
667// long long int now = clock64();
668// long long int last;
669// do
670// {
671// last = now;
672// now = clock64();
673// elapsed += max(0ll, (long long int)(now - last)); // don't count iterations with clock roll-over
674// }while(elapsed < cycles);
675//
676// return;
677//}
678
679
680
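/* Simulate one GPU access: litmus_lock() on the k-exclusion lock returns
 * the device we were assigned; migrate state if that differs from the
 * last device used, copy inputs in, spin on the GPU for the requested
 * time (engine-locked if configured), copy results out, and optionally
 * broadcast state, all before releasing the k-exclusion lock. */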
681static void gpu_loop_for(double gpu_sec_time, double emergency_exit)
682{
683 unsigned int numcycles = (unsigned int)(gpuCyclesPerSecond * gpu_sec_time);
684 int numblocks = 1;
685 int blocksz = 1;
686
687 CUR_DEVICE = litmus_lock(KEXCLU_LOCK);
688 {
689 if(CUR_DEVICE != LAST_DEVICE && LAST_DEVICE != -1)
690 {
691 migrateToGPU(CUR_DEVICE);
692 }
693
694 if(SEND_SIZE > 0)
695 {
696 // handles chunking and locking, as appropriate.
697 chunkMemcpy(d_send_data[CUR_DEVICE], h_send_data, SEND_SIZE,
698 cudaMemcpyHostToDevice, CUR_DEVICE, useEngineLocks());
699 }
700
701 if(useEngineLocks()) litmus_lock(EE_LOCKS[CUR_DEVICE]);
702
703// docudaspin <<<numblocks,blocksz, 0, streams[CUR_DEVICE]>>> (numcycles);
704// cutilSafeCall( cudaStreamSynchronize(streams[CUR_DEVICE]) );
705
706 if(useEngineLocks()) litmus_unlock(EE_LOCKS[CUR_DEVICE]);
707
708 if(RECV_SIZE > 0)
709 {
710 chunkMemcpy(h_recv_data, d_recv_data[CUR_DEVICE], RECV_SIZE,
711 cudaMemcpyDeviceToHost, CUR_DEVICE, useEngineLocks());
712 }
713
714 if(BROADCAST_STATE)
715 {
716 broadcastState(CUR_DEVICE);
717 }
718 }
719 litmus_unlock(KEXCLU_LOCK);
720
721 LAST_DEVICE = CUR_DEVICE;
722 CUR_DEVICE = -1;
723}
724
725
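/* Calibration aid (-l): repeatedly spin for known wall-clock delays and
 * report loop_for()'s measured error against wctime(). */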
726static void debug_delay_loop(void)
727{
728 double start, end, delay;
729
730 while (1) {
731 for (delay = 0.5; delay > 0.01; delay -= 0.01) {
732 start = wctime();
733 loop_for(delay, 0);
734 end = wctime();
735 printf("%6.4fs: looped for %10.8fs, delta=%11.8fs, error=%7.4f%%\n",
736 delay,
737 end - start,
738 end - start - delay,
739 100 * (end - start - delay) / delay);
740 }
741 }
742}
743
744static int job(double exec_time, double gpu_sec_time, double program_end)
745{
746 if (wctime() > program_end)
747 return 0;
748 else if (!GPU_TASK)
749 {
750 loop_for(exec_time, program_end + 1);
751 }
752 else
753 {
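		/* split the CPU demand in half and bookend the GPU access with it */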
754		double cpu_bookend = exec_time / 2.0;
755
756 loop_for(cpu_bookend, program_end + 1);
757 gpu_loop_for(gpu_sec_time, program_end + 1);
758 loop_for(cpu_bookend, program_end + 1);
759 }
760 return 1;
761}
762
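/* Option summary (sizes taken in KB):
 *  -p cpu partition   -l calibration loop   -s scale factor
 *  -e GPU time per job (ms)   -z simultaneous engine users
 *  -g GPU task + partition size   -G partition index
 *  -S/-R/-T send/recv/state buffer sizes   -B broadcast state
 *  -M migrate via sysmem   -a affinity   -r relax FIFO max length
 *  -L use k-FMLP   -y dynamic group locks   -C chunked copies (chunk KB)
 *  -W create shared memory for n users   -N tasks in synchronized release
 */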
763#define OPTSTR ":p:ls:e:g:G:W:N:S:R:T:BMaLyC:rz:" /* leading ':' makes getopt() return ':' on a missing argument */
764
765int main(int argc, char** argv)
766{
767 atexit(catchExit);
768
769 int ret;
770 lt_t wcet;
771 lt_t period;
772 double wcet_ms, period_ms;
773 int migrate = 0;
774 int cpu = 0;
775 int opt;
776 int test_loop = 0;
777// int column = 1;
778 const char *file = NULL;
779 int want_enforcement = 0;
780 double duration = 0, releaseTime = 0;
781 double *exec_times = NULL;
782 double scale = 1.0;
783 uint64_t cur_job;
784 uint64_t num_jobs;
785
786 int create_shm = -1;
787 int num_tasks = 0;
788
789 double gpu_sec_ms = 0;
790
791 while ((opt = getopt(argc, argv, OPTSTR)) != -1) {
792// printf("opt = %c optarg = %s\n", opt, optarg);
793 switch (opt) {
794// case 'w':
795// ENABLE_WAIT = 1;
796// break;
797 case 'p':
798 cpu = atoi(optarg);
799 migrate = 1;
800 break;
801 case 'l':
802 test_loop = 1;
803 break;
804 case 's':
805 scale = atof(optarg);
806 break;
807 case 'e':
808 gpu_sec_ms = atof(optarg);
809 break;
810// case 'x':
811// trans_sec_ms = atof(optarg);
812// break;
813 case 'z':
814 NUM_SIMULT_USERS = atoi(optarg);
815 break;
816 case 'g':
817 GPU_TASK = 1;
818 GPU_PARTITION_SIZE = atoi(optarg);
819 break;
820 case 'G':
821 GPU_PARTITION = atoi(optarg);
822 break;
823 case 'S':
824 SEND_SIZE = (size_t)(atof(optarg)*1024);
825 break;
826 case 'R':
827 RECV_SIZE = (size_t)(atof(optarg)*1024);
828 break;
829 case 'T':
830 STATE_SIZE = (size_t)(atof(optarg)*1024);
831 break;
832 case 'B':
833 BROADCAST_STATE = true;
834 break;
835 case 'M':
836 MIGRATE_VIA_SYSMEM = true;
837 break;
838 case 'a':
839 ENABLE_AFFINITY = true;
840 break;
841 case 'r':
842 RELAX_FIFO_MAX_LEN = true;
843 break;
844 case 'L':
845 USE_KFMLP = true;
846 break;
847 case 'y':
848 USE_DYNAMIC_GROUP_LOCKS = true;
849 break;
850 case 'C':
851 ENABLE_CHUNKING = true;
852 CHUNK_SIZE = (size_t)(atof(optarg)*1024);
853 break;
854 case 'W':
855 create_shm = atoi(optarg);
856 break;
857 case 'N':
858 num_tasks = atoi(optarg);
859 break;
860 case ':':
861 usage("Argument missing.");
862 break;
863 case '?':
864 default:
865 usage("Bad argument.");
866 break;
867 }
868 }
869
870 if (test_loop) {
871 debug_delay_loop();
872 return 0;
873 }
874
875// if (file) {
876// int num_jobs_tmp;
877// get_exec_times(file, column, &num_jobs_tmp, &exec_times);
878// num_jobs = num_jobs_tmp;
879//
880// if (argc - optind < 2)
881// usage("Arguments missing.");
882//
883// for (cur_job = 0; cur_job < num_jobs; ++cur_job) {
884// /* convert the execution time to seconds */
885// duration += exec_times[cur_job] * 0.001;
886// }
887// } else {
888 /*
889 * if we're not reading from the CSV file, then we need
890 * three parameters
891 */
892 if (argc - optind < 3)
893 usage("Arguments missing.");
894// }
895
896 wcet_ms = atof(argv[optind + 0]);
897 period_ms = atof(argv[optind + 1]);
898
899 wcet = wcet_ms * __NS_PER_MS;
900 period = period_ms * __NS_PER_MS;
901 if (wcet <= 0)
902 usage("The worst-case execution time must be a "
903 "positive number.");
904 if (period <= 0)
905 usage("The period must be a positive number.");
906 if (!file && wcet > period) {
907 usage("The worst-case execution time must not "
908 "exceed the period.");
909 }
910
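	/* without a CSV trace the job count follows from duration/period
	 * (plus one job of padding); with one, the duration instead grows
	 * to cover all recorded jobs */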
911 if (!file)
912 {
913 duration = atof(argv[optind + 2]);
914 num_jobs = ((double)duration*1e3)/period_ms;
915 ++num_jobs; // padding
916 }
917 else if (file && num_jobs > 1)
918 {
919 duration += period_ms * 0.001 * (num_jobs - 1);
920 }
921
922 if (migrate) {
923 ret = be_migrate_to(cpu);
924 if (ret < 0)
925 bail_out("could not migrate to target partition");
926 }
927
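	/* Synchronized release: the task given -N creates the shared-memory
	 * segment holding the release barrier and a shared release timestamp;
	 * all other tasks poll until they can open it. */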
928 if(ENABLE_WAIT)
929 {
930 if(num_tasks > 0)
931 {
932 printf("%d creating release shared memory\n", getpid());
933 shared_memory_object::remove("release_barrier_memory");
934 release_segment_ptr = new managed_shared_memory(create_only, "release_barrier_memory", 4*1024);
935
936 printf("%d creating release barrier for %d users\n", getpid(), num_tasks);
937 release_barrier = release_segment_ptr->construct<barrier>("barrier release_barrier")(num_tasks);
938
939 init_release_time = release_segment_ptr->construct<uint64_t>("uint64_t instance")();
940 *init_release_time = 0;
941 }
942 else
943 {
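			/* the creator may not have constructed the segment yet;
			 * poll until it can be opened */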
944 do
945 {
946 try
947 {
948 printf("%d opening release shared memory\n", getpid());
949					release_segment_ptr = new managed_shared_memory(open_only, "release_barrier_memory");
950 }
951 catch(...)
952 {
953 printf("%d shared memory not ready. sleeping\n", getpid());
954 sleep(1);
955 }
956			}while(release_segment_ptr == NULL);
957
958			release_barrier = release_segment_ptr->find<barrier>("barrier release_barrier").first;
959			init_release_time = release_segment_ptr->find<uint64_t>("uint64_t instance").first;
960 }
961 }
962
963
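	/* GPU tasks additionally share a barrier plus one management mutex
	 * per device; -W marks the creator and gives the user count. */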
964 if(GPU_TASK)
965 {
966 if(ENABLE_WAIT)
967 {
968 if(create_shm > -1)
969 {
970 printf("%d creating shared memory\n", getpid());
971 shared_memory_object::remove("gpu_barrier_memory");
972 segment_ptr = new managed_shared_memory(create_only, "gpu_barrier_memory", 4*1024);
973
974 printf("%d creating a barrier for %d users\n", getpid(), create_shm);
975 gpu_barrier = segment_ptr->construct<barrier>("barrier instance")(create_shm);
976 printf("%d creating gpu mgmt mutexes for 8 devices\n", getpid());
977 gpu_mgmt_mutexes = segment_ptr->construct<interprocess_mutex>("interprocess_mutex m")[8]();
978 }
979 else
980 {
981 do
982 {
983 try
984 {
985 printf("%d opening shared memory\n", getpid());
986 segment_ptr = new managed_shared_memory(open_only, "gpu_barrier_memory");
987 }
988 catch(...)
989 {
990 printf("%d shared memory not ready. sleeping\n", getpid());
991 sleep(1);
992 }
993 }while(segment_ptr == NULL);
994
995 gpu_barrier = segment_ptr->find<barrier>("barrier instance").first;
996 gpu_mgmt_mutexes = segment_ptr->find<interprocess_mutex>("interprocess_mutex m").first;
997 }
998 }
999
1000		// scale the data transmission sizes along with execution time
1001 SEND_SIZE *= scale;
1002 RECV_SIZE *= scale;
1003 STATE_SIZE *= scale;
1004
1005 init_cuda();
1006 }
1007
1008 ret = sporadic_task_ns(wcet, period, 0, cpu, RT_CLASS_SOFT,
1009 want_enforcement ? PRECISE_ENFORCEMENT
1010 : NO_ENFORCEMENT,
1011 migrate);
1012 if (ret < 0)
1013 bail_out("could not setup rt task params");
1014
1015 init_litmus();
1016
1017 ret = task_mode(LITMUS_RT_TASK);
1018 if (ret != 0)
1019 bail_out("could not become RT task");
1020
1021
1022
1023 uint64_t jobCount = 0;
1024 blitz::Array<uint64_t, 1> responseTimeLog(num_jobs+1);
1025
1026 struct timespec spec;
1027 uint64_t release;
1028 uint64_t finish;
1029
1030
1031 if (ENABLE_WAIT) {
1032 printf("Waiting for release.\n");
1033 ret = wait_for_ts_release();
1034 if (ret != 0)
1035 bail_out("wait_for_ts_release()");
1036 }
1037 else
1038 {
1039 sleep_next_period();
1040 }
1041
1042 clock_gettime(CLOCK_MONOTONIC, &spec);
1043 release = timespec_to_ns(spec);
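	/* the first task to reach this point publishes its release time via
	 * compare-and-swap; later tasks adopt it as a common baseline */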
1044	if (init_release_time && !__sync_bool_compare_and_swap(init_release_time, 0, release))
1045 {
1046 release = *init_release_time;
1047 }
1048
1049 releaseTime = wctime();
1050 double failsafeEnd = releaseTime + duration;
1051
1052
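	/* main loop: run jobs until the failsafe end time, logging each job's
	 * response time against its estimated release */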
1053 if (file) {
1054 /* use times read from the CSV file */
1055 for (cur_job = 0; cur_job < num_jobs; ++cur_job) {
1056 /* convert job's length to seconds */
1057 job(exec_times[cur_job] * 0.001 * scale,
1058 gpu_sec_ms * 0.001 * scale,
1059 failsafeEnd);
1060 }
1061 } else {
1062 /* convert to seconds and scale */
1063 int keepGoing;
1064 do
1065 {
1066 keepGoing = job(wcet_ms * 0.001 * scale, gpu_sec_ms * 0.001 * scale, failsafeEnd);
1067
1068
1069 clock_gettime(CLOCK_MONOTONIC, &spec);
1070 finish = timespec_to_ns(spec);
1071
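			/* clamp the index: overruns beyond num_jobs reuse the spare
			 * last slot rather than writing out of bounds */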
1072 responseTimeLog(min(num_jobs,jobCount++)) = finish - release;
1073
1074 // this is an estimated upper-bound on release time. it may be off by several microseconds.
1075#ifdef RESET_RELEASE_ON_MISS
1076 release = (release + period < finish) ?
1077 finish : /* missed deadline. adopt next release as current time. */
1078 release + period; /* some time in the future. */
1079#else
1080 release = release + period; // allow things to get progressively later.
1081#endif
1082
1083 sleep_next_period();
1084 clock_gettime(CLOCK_MONOTONIC, &spec);
1085 release = min(timespec_to_ns(spec), release);
1086
1087 } while(keepGoing);
1088 }
1089
1090 if(GPU_TASK && ENABLE_WAIT)
1091 {
1092 printf("%d waiting at barrier\n", getpid());
1093 gpu_barrier->wait();
1094 }
1095
1096 ret = task_mode(BACKGROUND_TASK);
1097 if (ret != 0)
1098 bail_out("could not become regular task (huh?)");
1099
1100 if (file)
1101 free(exec_times);
1102
1103 if(GPU_TASK)
1104 {
1105 /*
1106 if(ENABLE_WAIT)
1107 {
1108			// wait for all GPU-using tasks to exit RT mode.
1109 printf("%d waiting at barrier\n", getpid());
1110 gpu_barrier->wait();
1111 }
1112 */
1113
1114 exit_cuda();
1115
1116 if(ENABLE_WAIT)
1117 {
1118 /* wait before we clean up memory */
1119 printf("%d waiting for all to shutdown GPUs\n", getpid());
1120 gpu_barrier->wait();
1121
1122/*
1123 if(create_shm > -1)
1124 {
1125 printf("%d removing shared memory\n", getpid());
1126 shared_memory_object::remove("gpu_barrier_memory");
1127 }
1128*/
1129 }
1130 }
1131
1132
1133 if (ENABLE_WAIT)
1134 {
1135 printf("%d waiting at exit barrier\n", getpid());
1136 release_barrier->wait();
1137 }
1138
1139
1140 char gpu_using_str[] = "GPU\n";
1141 char cpu_only_str[] = "CPU\n";
1142 #define USED(arr) (arr)(Range(fromStart,min(num_jobs-1,jobCount-1)))
1143	// cpu, pid, period (ms), avg-rt, avg-rt as %-of-period, min-rt, max-rt, avg-slack, num jobs, num misses, num misses (>2x period), task type
1144 printf("DONE,%d,%d,%f,%f,%f,%lu,%lu,%f,%lu,%d,%d,%s",
1145 cpu,
1146 getpid(),
1147 period_ms,
1148 // average
1149 blitz::mean(USED(responseTimeLog)),
1150 // average pct of period
1151 100.0*(blitz::mean(USED(responseTimeLog))/period),
1152 // min
1153 blitz::min(USED(responseTimeLog)),
1154 // max
1155 blitz::max(USED(responseTimeLog)),
1156		// average slack (unsigned arithmetic: a tardy job wraps around)
1157 blitz::mean((uint64_t)period - USED(responseTimeLog)),
1158 // num jobs
1159 min(num_jobs-1,jobCount-1),
1160 // num misses
1161 blitz::count(USED(responseTimeLog) > (uint64_t)period),
1162 // num misses w/ unbounded
1163 blitz::count(USED(responseTimeLog) > (uint64_t)(2*period)),
1164 // flag gpu-using tasks
1165 ((GPU_TASK) ? gpu_using_str : cpu_only_str)
1166 );
1167
1168 return 0;
1169}