author     Joshua Bakita <bakitajoshua@gmail.com>  2020-10-15 20:44:33 -0400
committer  Joshua Bakita <bakitajoshua@gmail.com>  2020-10-15 20:44:33 -0400
commit     9e82e2c7cca65a8eb60d5bd99da66241c01a2991 (patch)
tree       f5ac2263d40995c09a3ae656f81d860eac3f658c
Import GPU scheduler code from 2019 GM deliverable
Code provided by Don Smith via tar file. `.gitignore` added by me.
-rw-r--r--   .gitignore                      1
-rw-r--r--   Makefile                       13
-rw-r--r--   MinFitMinIntfR2.h             120
-rw-r--r--   SoftwareDocumentation.docx    bin 0 -> 105301 bytes
-rw-r--r--   libcudart_wrapper.c          3343
-rw-r--r--   schedAPI.h                     10
-rw-r--r--   schedLib.c                    629
7 files changed, 4116 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..be747da
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
libcudart_wrapper.so
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..15fe29a
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,13 @@
CC := gcc
CCFLAGS := -Wall -fPIC -shared
LDFLAGS := -ldl -lpthread -lrt
INCLUDES := -I/usr/local/cuda/include/

################################################################################

all: build

build: libcudart_wrapper.so

libcudart_wrapper.so: schedLib.c libcudart_wrapper.c
	$(CC) $(CCFLAGS) $(INCLUDES) -o libcudart_wrapper.so schedLib.c libcudart_wrapper.c $(LDFLAGS)
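
The Makefile produces a single preload library, libcudart_wrapper.so, linked against libdl, libpthread, and librt. Per the header comment in libcudart_wrapper.c below, the library is meant to be interposed ahead of the real CUDA runtime at program start, e.g. `LD_PRELOAD=./libcudart_wrapper.so ./my_cuda_app` (the application name here is only illustrative).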
diff --git a/MinFitMinIntfR2.h b/MinFitMinIntfR2.h
new file mode 100644
index 0000000..819b2b4
--- /dev/null
+++ b/MinFitMinIntfR2.h
@@ -0,0 +1,120 @@
/* Scheduling policy function that implements a "min thread use, min interference" policy,
 * i.e., find the ready-to-launch kernel that will occupy the smallest number of available
 * GPU threads AND does not fail a test for interference effects. The test for
 * interference effects requires that the ratio between the number of threads in the
 * kernel under consideration and any kernel already scheduled does not exceed a
 * threshold (in this implementation, 2.0). This test is motivated by empirical
 * measurements that have shown interference effects of 500% or higher for
 * large thread ratios between concurrently executing kernels. This is thought
 * to be an artifact of the undocumented warp scheduling algorithm in the NVIDIA SMs.
 */

//put any global (static) declarations here:


#define MAX_THREAD_RATIO 2.0 // Threshold ratio between scheduled and new kernel


int find_best_kernel(void) {
    int i;
    int this_one = -1; //default return value indicating no kernel to launch
    int need_threads, available_threads, left_over;
    int k;
    int occupied_threads[MAX_STREAMS]; //GPU threads allocated to scheduled kernels

    //Must be called with sched_lock held


    //record the allocated GPU threads in the kernel scheduled for each stream
    for (i = 0; i < stream_count; i++)
        occupied_threads[i] = GPU.stream_threads[i];

    //GPU threads available for allocation
    available_threads = (MAX_GPU_THREADS - GPU.threads_occupied);
    left_over = -1; //the number of threads left available if a kernel is scheduled

    for (i = 0; i < stream_count; i++) { //examine all streams
        if (Stream[i].state == READY_LAUNCH) { //only threads/streams ready to launch are considered

            //determine how many threads would be allocated for this kernel (see
            //allocate_gpu_threads() for a description)
            need_threads = min(MAX_GPU_THREADS, Stream[i].blocks * Stream[i].block_threads);
            if (need_threads > available_threads) //can't be scheduled
                continue;

            //find the kernel with the smallest thread allocation that does not create thread imbalance
            //?? should there be a starvation-prevention part of this policy ??

            if ((available_threads - need_threads) > left_over) {
                //found kernel with smallest thread allocation so far
                //compute and test the ratios of threads between it and all kernels scheduled

                for (k = 0; k < stream_count; k++) { //examine all streams
                    if (occupied_threads[k] == 0) //stream has no kernel scheduled
                        continue;
                    //if the test fails for any already scheduled kernel, this stream can't launch
                    if ((float)occupied_threads[k] / (float)need_threads > MAX_THREAD_RATIO)
                        break;
                    if ((float)need_threads / (float)occupied_threads[k] > MAX_THREAD_RATIO)
                        break;
                }
                //if the test is passed for all scheduled kernels, this stream's kernel can launch
                if (k == stream_count) {
                    this_one = i; //the final value of this_one is the stream index to schedule (or -1)
                    left_over = available_threads - need_threads; //largest left-over so far, i.e., smallest allocation
                }
            } //end test for smaller thread allocation
        } //end test for stream ready to launch
    } //end outer for loop

    if (TRACE_ON) {
        show_gpu_state();
        show_stream_state(this_one);
    }
    return this_one; //the scheduling decision (stream index)
}

// Utility function to trace GPU state used in scheduling policy decisions
void show_gpu_state(void) {

    //Must be called with sched_lock held

    int i;
    if (trc_idx >= MAX_SCHED_TRACE)
        return;

    for (i = 0; i < MAX_STREAMS; i++) {
        SchedTrace[trc_idx].stream[i] = GPU.streams[i];
        SchedTrace[trc_idx].stream_threads[i] = GPU.stream_threads[i];
        SchedTrace[trc_idx].next = 0;
        strcpy(SchedTrace[trc_idx].type, "GPU");
    }
    trc_idx++;
}

// Utility function to trace stream state used in scheduling policy decisions
void show_stream_state(int this_one) {

    //Must be called with sched_lock held

    int i;
    int need_threads;
    if (trc_idx >= MAX_SCHED_TRACE)
        return;

    for (i = 0; i < MAX_STREAMS; i++) {
        need_threads = min(MAX_GPU_THREADS, Stream[i].blocks * Stream[i].block_threads);
        if ((Stream[i].state != READY_LAUNCH) &&
            (Stream[i].state != LAUNCHED))
            need_threads = -need_threads; //encode unschedulable state with a minus sign
        SchedTrace[trc_idx].stream[i] = Stream[i].thread;
        SchedTrace[trc_idx].stream_threads[i] = need_threads;
        if (this_one == -1)
            SchedTrace[trc_idx].next = this_one;
        else
            SchedTrace[trc_idx].next = Stream[this_one].thread;
        strcpy(SchedTrace[trc_idx].type, "STR");
    }
    trc_idx++;
}

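To make the interference test in find_best_kernel() above concrete, here is a small standalone sketch (not part of the delivered code; all values are hypothetical). With MAX_THREAD_RATIO = 2.0, a candidate kernel needing 1024 threads is compatible with already-scheduled kernels occupying anywhere from 512 to 2048 threads, and is rejected when any scheduled kernel falls outside that range:

/* Standalone sketch of the thread-ratio interference test used by
 * find_best_kernel(); hypothetical values, illustration only. */
#include <stdio.h>

#define MAX_THREAD_RATIO 2.0

/* Returns 1 if a kernel needing need_threads may run alongside a kernel
 * already occupying 'occupied' threads, 0 if the ratio test fails. */
static int ratio_ok(int need_threads, int occupied)
{
    if ((float)occupied / (float)need_threads > MAX_THREAD_RATIO)
        return 0;
    if ((float)need_threads / (float)occupied > MAX_THREAD_RATIO)
        return 0;
    return 1;
}

int main(void)
{
    int need = 1024;
    int scheduled[] = { 512, 2048, 256, 4096 };
    for (int k = 0; k < 4; k++)
        printf("need=%d vs scheduled=%d -> %s\n", need, scheduled[k],
               ratio_ok(need, scheduled[k]) ? "compatible" : "rejected");
    return 0;  /* prints compatible, compatible, rejected, rejected */
}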
diff --git a/SoftwareDocumentation.docx b/SoftwareDocumentation.docx
new file mode 100644
index 0000000..903706f
--- /dev/null
+++ b/SoftwareDocumentation.docx
Binary files differ
diff --git a/libcudart_wrapper.c b/libcudart_wrapper.c
new file mode 100644
index 0000000..8e4f005
--- /dev/null
+++ b/libcudart_wrapper.c
@@ -0,0 +1,3343 @@
/* Wrapper functions to implement transparent extension of the CUDA runtime library
 * (libcudart) by dynamically linking this set of function interfaces ahead of the
 * "real" library using LD_PRELOAD. Calls to library extension functions
 * can be embedded in these wrapper functions. The specific calls used in the
 * scheduling "middleware" are defined in the include file schedAPI.h below.
 *
 * WARNING - Do not change this file unless you are totally sure you know what you
 * are doing!
 *
 */

#define _GNU_SOURCE

#include <stdio.h>
#include <dlfcn.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include "cuda_runtime_api.h"
#include "schedAPI.h" //function prototypes for calls to library extensions
21
22static __host__ cudaError_t CUDARTAPI (*orig_cudaDeviceReset)(void) = NULL;
23
24__host__ cudaError_t CUDARTAPI cudaDeviceReset(void) {
25 cudaError_t ret;
26 // Write your own custom c code in the cudaDeviceReset.c file
27 tracePrint();
28 ret = orig_cudaDeviceReset();
29 return ret;
30
31}
32
33static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaDeviceSynchronize)(void) = NULL;
34
35__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceSynchronize(void) {
36 cudaError_t ret;
37 // Write your own custom c code in the cudaDeviceSynchronize.c file
38 ret = orig_cudaDeviceSynchronize();
39 return ret;
40
41}
42
43static __host__ cudaError_t CUDARTAPI (*orig_cudaDeviceSetLimit)(enum cudaLimit limit, size_t value) = NULL;
44
45__host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit, size_t value) {
46 cudaError_t ret;
47 // Write your own custom c code in the cudaDeviceSetLimit.c file
48 ret = orig_cudaDeviceSetLimit(limit, value);
49 return ret;
50
51}
52
53static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaDeviceGetLimit)(size_t *pValue, enum cudaLimit limit) = NULL;
54
55__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit) {
56 cudaError_t ret;
57 // Write your own custom c code in the cudaDeviceGetLimit.c file
58 ret = orig_cudaDeviceGetLimit(pValue, limit);
59 return ret;
60
61}
62
63static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaDeviceGetCacheConfig)(enum cudaFuncCache *pCacheConfig) = NULL;
64
65__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig) {
66 cudaError_t ret;
67 // Write your own custom c code in the cudaDeviceGetCacheConfig.c file
68 ret = orig_cudaDeviceGetCacheConfig(pCacheConfig);
69 return ret;
70
71}
72
73static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaDeviceGetStreamPriorityRange)(int *leastPriority, int *greatestPriority) = NULL;
74
75__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetStreamPriorityRange(int *leastPriority, int *greatestPriority) {
76 cudaError_t ret;
77 // Write your own custom c code in the cudaDeviceGetStreamPriorityRange.c file
78 ret = orig_cudaDeviceGetStreamPriorityRange(leastPriority, greatestPriority);
79 return ret;
80
81}
82
83static __host__ cudaError_t CUDARTAPI (*orig_cudaDeviceSetCacheConfig)(enum cudaFuncCache cacheConfig) = NULL;
84
85__host__ cudaError_t CUDARTAPI cudaDeviceSetCacheConfig(enum cudaFuncCache cacheConfig) {
86 cudaError_t ret;
87 // Write your own custom c code in the cudaDeviceSetCacheConfig.c file
88 ret = orig_cudaDeviceSetCacheConfig(cacheConfig);
89 return ret;
90
91}
92
93static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaDeviceGetSharedMemConfig)(enum cudaSharedMemConfig *pConfig) = NULL;
94
95__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig) {
96 cudaError_t ret;
97 // Write your own custom c code in the cudaDeviceGetSharedMemConfig.c file
98 ret = orig_cudaDeviceGetSharedMemConfig(pConfig);
99 return ret;
100
101}
102
103static __host__ cudaError_t CUDARTAPI (*orig_cudaDeviceSetSharedMemConfig)(enum cudaSharedMemConfig config) = NULL;
104
105__host__ cudaError_t CUDARTAPI cudaDeviceSetSharedMemConfig(enum cudaSharedMemConfig config) {
106 cudaError_t ret;
107 // Write your own custom c code in the cudaDeviceSetSharedMemConfig.c file
108 ret = orig_cudaDeviceSetSharedMemConfig(config);
109 return ret;
110
111}
112
113static __host__ cudaError_t CUDARTAPI (*orig_cudaDeviceGetByPCIBusId)(int *device, const char *pciBusId) = NULL;
114
115__host__ cudaError_t CUDARTAPI cudaDeviceGetByPCIBusId(int *device, const char *pciBusId) {
116 cudaError_t ret;
117 // Write your own custom c code in the cudaDeviceGetByPCIBusId.c file
118 ret = orig_cudaDeviceGetByPCIBusId(device, pciBusId);
119 return ret;
120
121}
122
123static __host__ cudaError_t CUDARTAPI (*orig_cudaDeviceGetPCIBusId)(char *pciBusId, int len, int device) = NULL;
124
125__host__ cudaError_t CUDARTAPI cudaDeviceGetPCIBusId(char *pciBusId, int len, int device) {
126 cudaError_t ret;
127 // Write your own custom c code in the cudaDeviceGetPCIBusId.c file
128 ret = orig_cudaDeviceGetPCIBusId(pciBusId, len, device);
129 return ret;
130
131}
132
133static __host__ cudaError_t CUDARTAPI (*orig_cudaIpcGetEventHandle)(cudaIpcEventHandle_t *handle, cudaEvent_t event) = NULL;
134
135__host__ cudaError_t CUDARTAPI cudaIpcGetEventHandle(cudaIpcEventHandle_t *handle, cudaEvent_t event) {
136 cudaError_t ret;
137 // Write your own custom c code in the cudaIpcGetEventHandle.c file
138 ret = orig_cudaIpcGetEventHandle(handle, event);
139 return ret;
140
141}
142
143static __host__ cudaError_t CUDARTAPI (*orig_cudaIpcOpenEventHandle)(cudaEvent_t *event, cudaIpcEventHandle_t handle) = NULL;
144
145__host__ cudaError_t CUDARTAPI cudaIpcOpenEventHandle(cudaEvent_t *event, cudaIpcEventHandle_t handle) {
146 cudaError_t ret;
147 // Write your own custom c code in the cudaIpcOpenEventHandle.c file
148 ret = orig_cudaIpcOpenEventHandle(event, handle);
149 return ret;
150
151}
152
153static __host__ cudaError_t CUDARTAPI (*orig_cudaIpcGetMemHandle)(cudaIpcMemHandle_t *handle, void *devPtr) = NULL;
154
155__host__ cudaError_t CUDARTAPI cudaIpcGetMemHandle(cudaIpcMemHandle_t *handle, void *devPtr) {
156 cudaError_t ret;
157 // Write your own custom c code in the cudaIpcGetMemHandle.c file
158 ret = orig_cudaIpcGetMemHandle(handle, devPtr);
159 return ret;
160
161}
162
163static __host__ cudaError_t CUDARTAPI (*orig_cudaIpcOpenMemHandle)(void **devPtr, cudaIpcMemHandle_t handle, unsigned int flags) = NULL;
164
165__host__ cudaError_t CUDARTAPI cudaIpcOpenMemHandle(void **devPtr, cudaIpcMemHandle_t handle, unsigned int flags) {
166 cudaError_t ret;
167 // Write your own custom c code in the cudaIpcOpenMemHandle.c file
168 ret = orig_cudaIpcOpenMemHandle(devPtr, handle, flags);
169 return ret;
170
171}
172
173static __host__ cudaError_t CUDARTAPI (*orig_cudaIpcCloseMemHandle)(void *devPtr) = NULL;
174
175__host__ cudaError_t CUDARTAPI cudaIpcCloseMemHandle(void *devPtr) {
176 cudaError_t ret;
177 // Write your own custom c code in the cudaIpcCloseMemHandle.c file
178 ret = orig_cudaIpcCloseMemHandle(devPtr);
179 return ret;
180
181}
182
183static __host__ cudaError_t CUDARTAPI (*orig_cudaThreadExit)(void) = NULL;
184
185__host__ cudaError_t CUDARTAPI cudaThreadExit(void) {
186 cudaError_t ret;
187 // Write your own custom c code in the cudaThreadExit.c file
188 ret = orig_cudaThreadExit();
189 return ret;
190
191}
192
193static __host__ cudaError_t CUDARTAPI (*orig_cudaThreadSynchronize)(void) = NULL;
194
195__host__ cudaError_t CUDARTAPI cudaThreadSynchronize(void) {
196 cudaError_t ret;
197 // Write your own custom c code in the cudaThreadSynchronize.c file
198 ret = orig_cudaThreadSynchronize();
199 return ret;
200
201}
202
203static __host__ cudaError_t CUDARTAPI (*orig_cudaThreadSetLimit)(enum cudaLimit limit, size_t value) = NULL;
204
205__host__ cudaError_t CUDARTAPI cudaThreadSetLimit(enum cudaLimit limit, size_t value) {
206 cudaError_t ret;
207 // Write your own custom c code in the cudaThreadSetLimit.c file
208 ret = orig_cudaThreadSetLimit(limit, value);
209 return ret;
210
211}
212
213static __host__ cudaError_t CUDARTAPI (*orig_cudaThreadGetLimit)(size_t *pValue, enum cudaLimit limit) = NULL;
214
215__host__ cudaError_t CUDARTAPI cudaThreadGetLimit(size_t *pValue, enum cudaLimit limit) {
216 cudaError_t ret;
217 // Write your own custom c code in the cudaThreadGetLimit.c file
218 ret = orig_cudaThreadGetLimit(pValue, limit);
219 return ret;
220
221}
222
223static __host__ cudaError_t CUDARTAPI (*orig_cudaThreadGetCacheConfig)(enum cudaFuncCache *pCacheConfig) = NULL;
224
225__host__ cudaError_t CUDARTAPI cudaThreadGetCacheConfig(enum cudaFuncCache *pCacheConfig) {
226 cudaError_t ret;
227 // Write your own custom c code in the cudaThreadGetCacheConfig.c file
228 ret = orig_cudaThreadGetCacheConfig(pCacheConfig);
229 return ret;
230
231}
232
233static __host__ cudaError_t CUDARTAPI (*orig_cudaThreadSetCacheConfig)(enum cudaFuncCache cacheConfig) = NULL;
234
235__host__ cudaError_t CUDARTAPI cudaThreadSetCacheConfig(enum cudaFuncCache cacheConfig) {
236 cudaError_t ret;
237 // Write your own custom c code in the cudaThreadSetCacheConfig.c file
238 ret = orig_cudaThreadSetCacheConfig(cacheConfig);
239 return ret;
240
241}
242
243static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaGetLastError)(void) = NULL;
244
245__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void) {
246 cudaError_t ret;
247 // Write your own custom c code in the cudaGetLastError.c file
248 ret = orig_cudaGetLastError();
249 return ret;
250
251}
252
253static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaPeekAtLastError)(void) = NULL;
254
255__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void) {
256 cudaError_t ret;
257 // Write your own custom c code in the cudaPeekAtLastError.c file
258 ret = orig_cudaPeekAtLastError();
259 return ret;
260
261}
262
263static __host__ __cudart_builtin__ const char* CUDARTAPI (*orig_cudaGetErrorName)(cudaError_t error) = NULL;
264
265__host__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error) {
266 const char* ret;
267 // Write your own custom c code in the cudaGetErrorName.c file
268 ret = orig_cudaGetErrorName(error);
269 return ret;
270
271}
272
273static __host__ __cudart_builtin__ const char* CUDARTAPI (*orig_cudaGetErrorString)(cudaError_t error) = NULL;
274
275__host__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error) {
276 const char* ret;
277 // Write your own custom c code in the cudaGetErrorString.c file
278 ret = orig_cudaGetErrorString(error);
279 return ret;
280
281}
282
283static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaGetDeviceCount)(int *count) = NULL;
284
285__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count) {
286 cudaError_t ret;
287 // Write your own custom c code in the cudaGetDeviceCount.c file
288 ret = orig_cudaGetDeviceCount(count);
289 return ret;
290
291}
292
293static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaGetDeviceProperties)(struct cudaDeviceProp *prop, int device) = NULL;
294
295__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device) {
296 cudaError_t ret;
297 // Write your own custom c code in the cudaGetDeviceProperties.c file
298 ret = orig_cudaGetDeviceProperties(prop, device);
299 return ret;
300
301}
302
303static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaDeviceGetAttribute)(int *value, enum cudaDeviceAttr attr, int device) = NULL;
304
305__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device) {
306 cudaError_t ret;
307 // Write your own custom c code in the cudaDeviceGetAttribute.c file
308 ret = orig_cudaDeviceGetAttribute(value, attr, device);
309 return ret;
310
311}
312
313static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaDeviceGetP2PAttribute)(int *value, enum cudaDeviceP2PAttr attr, int srcDevice, int dstDevice) = NULL;
314
315__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetP2PAttribute(int *value, enum cudaDeviceP2PAttr attr, int srcDevice, int dstDevice) {
316 cudaError_t ret;
317 // Write your own custom c code in the cudaDeviceGetP2PAttribute.c file
318 ret = orig_cudaDeviceGetP2PAttribute(value, attr, srcDevice, dstDevice);
319 return ret;
320
321}
322
323static __host__ cudaError_t CUDARTAPI (*orig_cudaChooseDevice)(int *device, const struct cudaDeviceProp *prop) = NULL;
324
325__host__ cudaError_t CUDARTAPI cudaChooseDevice(int *device, const struct cudaDeviceProp *prop) {
326 cudaError_t ret;
327 // Write your own custom c code in the cudaChooseDevice.c file
328 ret = orig_cudaChooseDevice(device, prop);
329 return ret;
330
331}
332
333static __host__ cudaError_t CUDARTAPI (*orig_cudaSetDevice)(int device) = NULL;
334
335__host__ cudaError_t CUDARTAPI cudaSetDevice(int device) {
336 cudaError_t ret;
337 // Write your own custom c code in the cudaSetDevice.c file
338 ret = orig_cudaSetDevice(device);
339 return ret;
340
341}
342
343static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaGetDevice)(int *device) = NULL;
344
345__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device) {
346 cudaError_t ret;
347 // Write your own custom c code in the cudaGetDevice.c file
348 ret = orig_cudaGetDevice(device);
349 return ret;
350
351}
352
353static __host__ cudaError_t CUDARTAPI (*orig_cudaSetValidDevices)(int *device_arr, int len) = NULL;
354
355__host__ cudaError_t CUDARTAPI cudaSetValidDevices(int *device_arr, int len) {
356 cudaError_t ret;
357 // Write your own custom c code in the cudaSetValidDevices.c file
358 ret = orig_cudaSetValidDevices(device_arr, len);
359 return ret;
360
361}
362
363static __host__ cudaError_t CUDARTAPI (*orig_cudaSetDeviceFlags)( unsigned int flags ) = NULL;
364
365__host__ cudaError_t CUDARTAPI cudaSetDeviceFlags( unsigned int flags ) {
366 cudaError_t ret;
367 // Write your own custom c code in the cudaSetDeviceFlags.c file
368 ret = orig_cudaSetDeviceFlags(flags);
369 return ret;
370
371}
372
373static __host__ cudaError_t CUDARTAPI (*orig_cudaGetDeviceFlags)( unsigned int *flags ) = NULL;
374
375__host__ cudaError_t CUDARTAPI cudaGetDeviceFlags( unsigned int *flags ) {
376 cudaError_t ret;
377 // Write your own custom c code in the cudaGetDeviceFlags.c file
378 ret = orig_cudaGetDeviceFlags(flags);
379 return ret;
380
381}
382
383static __host__ cudaError_t CUDARTAPI (*orig_cudaStreamCreate)(cudaStream_t *pStream) = NULL;
384
385__host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream) {
386 cudaError_t ret;
387 pid_t my_tid = syscall(SYS_gettid);
388 streamInit(my_tid, 0);
389
390 // Write your own custom c code in the cudaStreamCreate.c file
391 ret = orig_cudaStreamCreate(pStream);
392 return ret;
393
394}
395
396static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaStreamCreateWithFlags)(cudaStream_t *pStream, unsigned int flags) = NULL;
397
398__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags) {
399 cudaError_t ret;
400 pid_t my_tid = syscall(SYS_gettid);
401 streamInit(my_tid, 0);
402
403 // Write your own custom c code in the cudaStreamCreateWithFlags.c file
404 ret = orig_cudaStreamCreateWithFlags(pStream, flags);
405 return ret;
406
407}
408
409static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaStreamCreateWithPriority)(cudaStream_t *pStream, unsigned int flags, int priority) = NULL;
410
411__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags, int priority) {
412 cudaError_t ret;
413 pid_t my_tid = syscall(SYS_gettid);
414 streamInit(my_tid, priority);
415
416 // Write your own custom c code in the cudaStreamCreateWithPriority.c file
417 ret = orig_cudaStreamCreateWithPriority(pStream, flags, priority);
418 return ret;
419
420}
421
422static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaStreamGetPriority)(cudaStream_t hStream, int *priority) = NULL;
423
424__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetPriority(cudaStream_t hStream, int *priority) {
425 cudaError_t ret;
426 // Write your own custom c code in the cudaStreamGetPriority.c file
427 ret = orig_cudaStreamGetPriority(hStream, priority);
428 return ret;
429
430}
431
432static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaStreamGetFlags)(cudaStream_t hStream, unsigned int *flags) = NULL;
433
434__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags) {
435 cudaError_t ret;
436 // Write your own custom c code in the cudaStreamGetFlags.c file
437 ret = orig_cudaStreamGetFlags(hStream, flags);
438 return ret;
439
440}
441
442static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaStreamDestroy)(cudaStream_t stream) = NULL;
443
444__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream) {
445 cudaError_t ret;
446 // Write your own custom c code in the cudaStreamDestroy.c file
447 ret = orig_cudaStreamDestroy(stream);
448 return ret;
449
450}
451
452static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaStreamWaitEvent)(cudaStream_t stream, cudaEvent_t event, unsigned int flags) = NULL;
453
454__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags) {
455 cudaError_t ret;
456 // Write your own custom c code in the cudaStreamWaitEvent.c file
457 ret = orig_cudaStreamWaitEvent(stream, event, flags);
458 return ret;
459
460}
461
462static __host__ cudaError_t CUDARTAPI (*orig_cudaStreamAddCallback)(cudaStream_t stream, cudaStreamCallback_t callback, void *userData, unsigned int flags) = NULL;
463
464__host__ cudaError_t CUDARTAPI cudaStreamAddCallback(cudaStream_t stream, cudaStreamCallback_t callback, void *userData, unsigned int flags) {
465 cudaError_t ret;
466 // Write your own custom c code in the cudaStreamAddCallback.c file
467 ret = orig_cudaStreamAddCallback(stream, callback, userData, flags);
468 return ret;
469
470}
471
472static __host__ cudaError_t CUDARTAPI (*orig_cudaStreamSynchronize)(cudaStream_t stream) = NULL;
473
474__host__ cudaError_t CUDARTAPI cudaStreamSynchronize(cudaStream_t stream) {
475 cudaError_t ret;
476 pid_t my_tid = syscall(SYS_gettid);
477
478 // Write your own custom c code in the cudaStreamSynchronize.c file
479 //printf("cudaStreamSynchronize stream %p\n", (void *)stream);
480
481 ret = orig_cudaStreamSynchronize(stream);
482 schedSync(my_tid, (void *)stream);
483 return ret;
484
485}
486
487static __host__ cudaError_t CUDARTAPI (*orig_cudaStreamQuery)(cudaStream_t stream) = NULL;
488
489__host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream) {
490 cudaError_t ret;
491 // Write your own custom c code in the cudaStreamQuery.c file
492 ret = orig_cudaStreamQuery(stream);
493 return ret;
494
495}
496
497static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaStreamAttachMemAsync)(cudaStream_t stream, void *devPtr, size_t length , unsigned int flags ) = NULL;
498
499__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length , unsigned int flags ) {
500 cudaError_t ret;
501 // Write your own custom c code in the cudaStreamAttachMemAsync.c file
502 ret = orig_cudaStreamAttachMemAsync(stream, devPtr, length, flags);
503 return ret;
504
505}
506
507static __host__ cudaError_t CUDARTAPI (*orig_cudaEventCreate)(cudaEvent_t *event) = NULL;
508
509__host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event) {
510 cudaError_t ret;
511 // Write your own custom c code in the cudaEventCreate.c file
512 ret = orig_cudaEventCreate(event);
513 return ret;
514
515}
516
517static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaEventCreateWithFlags)(cudaEvent_t *event, unsigned int flags) = NULL;
518
519__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags) {
520 cudaError_t ret;
521 // Write your own custom c code in the cudaEventCreateWithFlags.c file
522 ret = orig_cudaEventCreateWithFlags(event, flags);
523 return ret;
524
525}
526
527static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaEventRecord)(cudaEvent_t event, cudaStream_t stream ) = NULL;
528
529__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream ) {
530 cudaError_t ret;
531 // Write your own custom c code in the cudaEventRecord.c file
532 ret = orig_cudaEventRecord(event, stream);
533 return ret;
534
535}
536
537static __host__ cudaError_t CUDARTAPI (*orig_cudaEventQuery)(cudaEvent_t event) = NULL;
538
539__host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event) {
540 cudaError_t ret;
541 // Write your own custom c code in the cudaEventQuery.c file
542 ret = orig_cudaEventQuery(event);
543 return ret;
544
545}
546
547static __host__ cudaError_t CUDARTAPI (*orig_cudaEventSynchronize)(cudaEvent_t event) = NULL;
548
549__host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event) {
550 cudaError_t ret;
551 // Write your own custom c code in the cudaEventSynchronize.c file
552 ret = orig_cudaEventSynchronize(event);
553 return ret;
554
555}
556
557static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaEventDestroy)(cudaEvent_t event) = NULL;
558
559__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event) {
560 cudaError_t ret;
561 // Write your own custom c code in the cudaEventDestroy.c file
562 ret = orig_cudaEventDestroy(event);
563 return ret;
564
565}
566
567static __host__ cudaError_t CUDARTAPI (*orig_cudaEventElapsedTime)(float *ms, cudaEvent_t start, cudaEvent_t end) = NULL;
568
569__host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms, cudaEvent_t start, cudaEvent_t end) {
570 cudaError_t ret;
571 // Write your own custom c code in the cudaEventElapsedTime.c file
572 ret = orig_cudaEventElapsedTime(ms, start, end);
573 return ret;
574
575}
576
577static __host__ cudaError_t CUDARTAPI (*orig_cudaLaunchKernel)(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) = NULL;
578
579__host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) {
580 cudaError_t ret;
581 // Write your own custom c code in the cudaLaunchKernel.c file
582 ret = orig_cudaLaunchKernel(func, gridDim, blockDim, args, sharedMem, stream);
583 return ret;
584
585}
586
587static __host__ cudaError_t CUDARTAPI (*orig_cudaLaunchCooperativeKernel)(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) = NULL;
588
589__host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) {
590 cudaError_t ret;
591 // Write your own custom c code in the cudaLaunchCooperativeKernel.c file
592 ret = orig_cudaLaunchCooperativeKernel(func, gridDim, blockDim, args, sharedMem, stream);
593 return ret;
594
595}
596
597static __host__ cudaError_t CUDARTAPI (*orig_cudaLaunchCooperativeKernelMultiDevice)(struct cudaLaunchParams *launchParamsList, unsigned int numDevices, unsigned int flags ) = NULL;
598
599__host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *launchParamsList, unsigned int numDevices, unsigned int flags ) {
600 cudaError_t ret;
601 // Write your own custom c code in the cudaLaunchCooperativeKernelMultiDevice.c file
602 ret = orig_cudaLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags);
603 return ret;
604
605}
606
607static __host__ cudaError_t CUDARTAPI (*orig_cudaFuncSetCacheConfig)(const void *func, enum cudaFuncCache cacheConfig) = NULL;
608
609__host__ cudaError_t CUDARTAPI cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig) {
610 cudaError_t ret;
611 // Write your own custom c code in the cudaFuncSetCacheConfig.c file
612 ret = orig_cudaFuncSetCacheConfig(func, cacheConfig);
613 return ret;
614
615}
616
617static __host__ cudaError_t CUDARTAPI (*orig_cudaFuncSetSharedMemConfig)(const void *func, enum cudaSharedMemConfig config) = NULL;
618
619__host__ cudaError_t CUDARTAPI cudaFuncSetSharedMemConfig(const void *func, enum cudaSharedMemConfig config) {
620 cudaError_t ret;
621 // Write your own custom c code in the cudaFuncSetSharedMemConfig.c file
622 ret = orig_cudaFuncSetSharedMemConfig(func, config);
623 return ret;
624
625}
626
627static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaFuncGetAttributes)(struct cudaFuncAttributes *attr, const void *func) = NULL;
628
629__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func) {
630 cudaError_t ret;
631 // Write your own custom c code in the cudaFuncGetAttributes.c file
632 ret = orig_cudaFuncGetAttributes(attr, func);
633 return ret;
634
635}
636
637static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaFuncSetAttribute)(const void *func, enum cudaFuncAttribute attr, int value) = NULL;
638
639__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncSetAttribute(const void *func, enum cudaFuncAttribute attr, int value) {
640 cudaError_t ret;
641 // Write your own custom c code in the cudaFuncSetAttribute.c file
642 ret = orig_cudaFuncSetAttribute(func, attr, value);
643 return ret;
644
645}
646
647static __host__ cudaError_t CUDARTAPI (*orig_cudaSetDoubleForDevice)(double *d) = NULL;
648
649__host__ cudaError_t CUDARTAPI cudaSetDoubleForDevice(double *d) {
650 cudaError_t ret;
651 // Write your own custom c code in the cudaSetDoubleForDevice.c file
652 ret = orig_cudaSetDoubleForDevice(d);
653 return ret;
654
655}
656
657static __host__ cudaError_t CUDARTAPI (*orig_cudaSetDoubleForHost)(double *d) = NULL;
658
659__host__ cudaError_t CUDARTAPI cudaSetDoubleForHost(double *d) {
660 cudaError_t ret;
661 // Write your own custom c code in the cudaSetDoubleForHost.c file
662 ret = orig_cudaSetDoubleForHost(d);
663 return ret;
664
665}
666
667static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaOccupancyMaxActiveBlocksPerMultiprocessor)(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize) = NULL;
668
669__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize) {
670 cudaError_t ret;
671 // Write your own custom c code in the cudaOccupancyMaxActiveBlocksPerMultiprocessor.c file
672 ret = orig_cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func, blockSize, dynamicSMemSize);
673 return ret;
674
675}
676
677static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags)(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize, unsigned int flags) = NULL;
678
679__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize, unsigned int flags) {
680 cudaError_t ret;
681 // Write your own custom c code in the cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.c file
682 ret = orig_cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, func, blockSize, dynamicSMemSize, flags);
683 return ret;
684
685}
686
687static __host__ cudaError_t CUDARTAPI (*orig_cudaConfigureCall)(dim3 gridDim, dim3 blockDim, size_t sharedMem , cudaStream_t stream ) = NULL;
688
689__host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem , cudaStream_t stream ) {
690 cudaError_t ret;
691 pid_t my_tid = syscall(SYS_gettid);
692 // Write your own custom c code in the cudaConfigureCall.c file
693 // printf("cudaConfigureCall TID %d stream %p blocks %d threads %d\n",
694 schedConfCall(my_tid, (void *)stream, gridDim.x * gridDim.y, blockDim.x * blockDim.y);
695 ret = orig_cudaConfigureCall(gridDim, blockDim, sharedMem, stream);
696 return ret;
697
698}
699
700static __host__ cudaError_t CUDARTAPI (*orig_cudaSetupArgument)(const void *arg, size_t size, size_t offset) = NULL;
701
702__host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg, size_t size, size_t offset) {
703 cudaError_t ret;
704 //pid_t my_tid = syscall(SYS_gettid);
705 // Write your own custom c code in the cudaSetupArgument.c file
706 // printf("cudaSetupArugment TID %d size %lu offset %lu\n", my_tid, size, offset);
707 ret = orig_cudaSetupArgument(arg, size, offset);
708 return ret;
709
710}
711
712static __host__ cudaError_t CUDARTAPI (*orig_cudaLaunch)(const void *func) = NULL;
713
714__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) {
715 cudaError_t ret;
716 pid_t my_tid = syscall(SYS_gettid);
717 //printf("cudaLaunch TID %d\n", my_tid);
718 schedLaunch(my_tid);
719 ret = orig_cudaLaunch(func);
720 return ret;
721
722}
723
724static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaMallocManaged)(void **devPtr, size_t size, unsigned int flags ) = NULL;
725
726__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMallocManaged(void **devPtr, size_t size, unsigned int flags ) {
727 cudaError_t ret;
728 // Write your own custom c code in the cudaMallocManaged.c file
729 ret = orig_cudaMallocManaged(devPtr, size, flags);
730 return ret;
731
732}
733
734static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaMalloc)(void **devPtr, size_t size) = NULL;
735
736__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size) {
737 cudaError_t ret;
738 // Write your own custom c code in the cudaMalloc.c file
739 ret = orig_cudaMalloc(devPtr, size);
740 return ret;
741
742}
743
744static __host__ cudaError_t CUDARTAPI (*orig_cudaMallocHost)(void **ptr, size_t size) = NULL;
745
746__host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t size) {
747 cudaError_t ret;
748 // Write your own custom c code in the cudaMallocHost.c file
749 ret = orig_cudaMallocHost(ptr, size);
750 return ret;
751
752}
753
754static __host__ cudaError_t CUDARTAPI (*orig_cudaMallocPitch)(void **devPtr, size_t *pitch, size_t width, size_t height) = NULL;
755
756__host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height) {
757 cudaError_t ret;
758 // Write your own custom c code in the cudaMallocPitch.c file
759 ret = orig_cudaMallocPitch(devPtr, pitch, width, height);
760 return ret;
761
762}
763
764static __host__ cudaError_t CUDARTAPI (*orig_cudaMallocArray)(cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height , unsigned int flags ) = NULL;
765
766__host__ cudaError_t CUDARTAPI cudaMallocArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height , unsigned int flags ) {
767 cudaError_t ret;
768 // Write your own custom c code in the cudaMallocArray.c file
769 ret = orig_cudaMallocArray(array, desc, width, height, flags);
770 return ret;
771
772}
773
774static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaFree)(void *devPtr) = NULL;
775
776__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr) {
777 cudaError_t ret;
778 // Write your own custom c code in the cudaFree.c file
779 ret = orig_cudaFree(devPtr);
780 return ret;
781
782}
783
784static __host__ cudaError_t CUDARTAPI (*orig_cudaFreeHost)(void *ptr) = NULL;
785
786__host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr) {
787 cudaError_t ret;
788 // Write your own custom c code in the cudaFreeHost.c file
789 ret = orig_cudaFreeHost(ptr);
790 return ret;
791
792}
793
794static __host__ cudaError_t CUDARTAPI (*orig_cudaFreeArray)(cudaArray_t array) = NULL;
795
796__host__ cudaError_t CUDARTAPI cudaFreeArray(cudaArray_t array) {
797 cudaError_t ret;
798 // Write your own custom c code in the cudaFreeArray.c file
799 ret = orig_cudaFreeArray(array);
800 return ret;
801
802}
803
804static __host__ cudaError_t CUDARTAPI (*orig_cudaFreeMipmappedArray)(cudaMipmappedArray_t mipmappedArray) = NULL;
805
806__host__ cudaError_t CUDARTAPI cudaFreeMipmappedArray(cudaMipmappedArray_t mipmappedArray) {
807 cudaError_t ret;
808 // Write your own custom c code in the cudaFreeMipmappedArray.c file
809 ret = orig_cudaFreeMipmappedArray(mipmappedArray);
810 return ret;
811
812}
813
814static __host__ cudaError_t CUDARTAPI (*orig_cudaHostAlloc)(void **pHost, size_t size, unsigned int flags) = NULL;
815
816__host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t size, unsigned int flags) {
817 cudaError_t ret;
818 // Write your own custom c code in the cudaHostAlloc.c file
819 ret = orig_cudaHostAlloc(pHost, size, flags);
820 return ret;
821
822}
823
824static __host__ cudaError_t CUDARTAPI (*orig_cudaHostRegister)(void *ptr, size_t size, unsigned int flags) = NULL;
825
826__host__ cudaError_t CUDARTAPI cudaHostRegister(void *ptr, size_t size, unsigned int flags) {
827 cudaError_t ret;
828 // Write your own custom c code in the cudaHostRegister.c file
829 ret = orig_cudaHostRegister(ptr, size, flags);
830 return ret;
831
832}
833
834static __host__ cudaError_t CUDARTAPI (*orig_cudaHostUnregister)(void *ptr) = NULL;
835
836__host__ cudaError_t CUDARTAPI cudaHostUnregister(void *ptr) {
837 cudaError_t ret;
838 // Write your own custom c code in the cudaHostUnregister.c file
839 ret = orig_cudaHostUnregister(ptr);
840 return ret;
841
842}
843
844static __host__ cudaError_t CUDARTAPI (*orig_cudaHostGetDevicePointer)(void **pDevice, void *pHost, unsigned int flags) = NULL;
845
846__host__ cudaError_t CUDARTAPI cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags) {
847 cudaError_t ret;
848 // Write your own custom c code in the cudaHostGetDevicePointer.c file
849 ret = orig_cudaHostGetDevicePointer(pDevice, pHost, flags);
850 return ret;
851
852}
853
854static __host__ cudaError_t CUDARTAPI (*orig_cudaHostGetFlags)(unsigned int *pFlags, void *pHost) = NULL;
855
856__host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags, void *pHost) {
857 cudaError_t ret;
858 // Write your own custom c code in the cudaHostGetFlags.c file
859 ret = orig_cudaHostGetFlags(pFlags, pHost);
860 return ret;
861
862}
863
864static __host__ cudaError_t CUDARTAPI (*orig_cudaMalloc3D)(struct cudaPitchedPtr* pitchedDevPtr, struct cudaExtent extent) = NULL;
865
866__host__ cudaError_t CUDARTAPI cudaMalloc3D(struct cudaPitchedPtr* pitchedDevPtr, struct cudaExtent extent) {
867 cudaError_t ret;
868 // Write your own custom c code in the cudaMalloc3D.c file
869 ret = orig_cudaMalloc3D(pitchedDevPtr, extent);
870 return ret;
871
872}
873
874static __host__ cudaError_t CUDARTAPI (*orig_cudaMalloc3DArray)(cudaArray_t *array, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int flags ) = NULL;
875
876__host__ cudaError_t CUDARTAPI cudaMalloc3DArray(cudaArray_t *array, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int flags ) {
877 cudaError_t ret;
878 // Write your own custom c code in the cudaMalloc3DArray.c file
879 ret = orig_cudaMalloc3DArray(array, desc, extent, flags);
880 return ret;
881
882}
883
884static __host__ cudaError_t CUDARTAPI (*orig_cudaMallocMipmappedArray)(cudaMipmappedArray_t *mipmappedArray, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int numLevels, unsigned int flags ) = NULL;
885
886__host__ cudaError_t CUDARTAPI cudaMallocMipmappedArray(cudaMipmappedArray_t *mipmappedArray, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int numLevels, unsigned int flags ) {
887 cudaError_t ret;
888 // Write your own custom c code in the cudaMallocMipmappedArray.c file
889 ret = orig_cudaMallocMipmappedArray(mipmappedArray, desc, extent, numLevels, flags);
890 return ret;
891
892}
893
894static __host__ cudaError_t CUDARTAPI (*orig_cudaGetMipmappedArrayLevel)(cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray, unsigned int level) = NULL;
895
896__host__ cudaError_t CUDARTAPI cudaGetMipmappedArrayLevel(cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray, unsigned int level) {
897 cudaError_t ret;
898 // Write your own custom c code in the cudaGetMipmappedArrayLevel.c file
899 ret = orig_cudaGetMipmappedArrayLevel(levelArray, mipmappedArray, level);
900 return ret;
901
902}
903
904static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy3D)(const struct cudaMemcpy3DParms *p) = NULL;
905
906__host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3DParms *p) {
907 cudaError_t ret;
908 // Write your own custom c code in the cudaMemcpy3D.c file
909 ret = orig_cudaMemcpy3D(p);
910 return ret;
911
912}
913
914static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy3DPeer)(const struct cudaMemcpy3DPeerParms *p) = NULL;
915
916__host__ cudaError_t CUDARTAPI cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p) {
917 cudaError_t ret;
918 // Write your own custom c code in the cudaMemcpy3DPeer.c file
919 ret = orig_cudaMemcpy3DPeer(p);
920 return ret;
921
922}
923
924static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaMemcpy3DAsync)(const struct cudaMemcpy3DParms *p, cudaStream_t stream ) = NULL;
925
926__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream ) {
927 cudaError_t ret;
928 // Write your own custom c code in the cudaMemcpy3DAsync.c file
929 ret = orig_cudaMemcpy3DAsync(p, stream);
930 return ret;
931
932}
933
934static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy3DPeerAsync)(const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream ) = NULL;
935
936__host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync(const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream ) {
937 cudaError_t ret;
938 // Write your own custom c code in the cudaMemcpy3DPeerAsync.c file
939 ret = orig_cudaMemcpy3DPeerAsync(p, stream);
940 return ret;
941
942}
943
944static __host__ cudaError_t CUDARTAPI (*orig_cudaMemGetInfo)(size_t *free, size_t *total) = NULL;
945
946__host__ cudaError_t CUDARTAPI cudaMemGetInfo(size_t *free, size_t *total) {
947 cudaError_t ret;
948 // Write your own custom c code in the cudaMemGetInfo.c file
949 ret = orig_cudaMemGetInfo(free, total);
950 return ret;
951
952}
953
954static __host__ cudaError_t CUDARTAPI (*orig_cudaArrayGetInfo)(struct cudaChannelFormatDesc *desc, struct cudaExtent *extent, unsigned int *flags, cudaArray_t array) = NULL;
955
956__host__ cudaError_t CUDARTAPI cudaArrayGetInfo(struct cudaChannelFormatDesc *desc, struct cudaExtent *extent, unsigned int *flags, cudaArray_t array) {
957 cudaError_t ret;
958 // Write your own custom c code in the cudaArrayGetInfo.c file
959 ret = orig_cudaArrayGetInfo(desc, extent, flags, array);
960 return ret;
961
962}
963
964static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy)(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) = NULL;
965
966__host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) {
967 cudaError_t ret;
968 // Write your own custom c code in the cudaMemcpy.c file
969 printf("cudaMemcpy\n");
970 ret = orig_cudaMemcpy(dst, src, count, kind);
971 return ret;
972
973}
974
975static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyPeer)(void *dst, int dstDevice, const void *src, int srcDevice, size_t count) = NULL;
976
977__host__ cudaError_t CUDARTAPI cudaMemcpyPeer(void *dst, int dstDevice, const void *src, int srcDevice, size_t count) {
978 cudaError_t ret;
979 // Write your own custom c code in the cudaMemcpyPeer.c file
980 ret = orig_cudaMemcpyPeer(dst, dstDevice, src, srcDevice, count);
981 return ret;
982
983}
984
985static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyToArray)(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind) = NULL;
986
987__host__ cudaError_t CUDARTAPI cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind) {
988 cudaError_t ret;
989 // Write your own custom c code in the cudaMemcpyToArray.c file
990 ret = orig_cudaMemcpyToArray(dst, wOffset, hOffset, src, count, kind);
991 return ret;
992
993}
994
995static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyFromArray)(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind) = NULL;
996
997__host__ cudaError_t CUDARTAPI cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind) {
998 cudaError_t ret;
999 // Write your own custom c code in the cudaMemcpyFromArray.c file
1000 ret = orig_cudaMemcpyFromArray(dst, src, wOffset, hOffset, count, kind);
1001 return ret;
1002
1003}
1004
1005static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyArrayToArray)(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind ) = NULL;
1006
1007__host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind ) {
1008 cudaError_t ret;
1009 // Write your own custom c code in the cudaMemcpyArrayToArray.c file
1010 ret = orig_cudaMemcpyArrayToArray(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, count, kind);
1011 return ret;
1012
1013}
1014
1015static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy2D)(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind) = NULL;
1016
1017__host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind) {
1018 cudaError_t ret;
1019 // Write your own custom c code in the cudaMemcpy2D.c file
1020 ret = orig_cudaMemcpy2D(dst, dpitch, src, spitch, width, height, kind);
1021 return ret;
1022
1023}
1024
1025static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy2DToArray)(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind) = NULL;
1026
1027__host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind) {
1028 cudaError_t ret;
1029 // Write your own custom c code in the cudaMemcpy2DToArray.c file
1030 ret = orig_cudaMemcpy2DToArray(dst, wOffset, hOffset, src, spitch, width, height, kind);
1031 return ret;
1032
1033}
1034
1035static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy2DFromArray)(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind) = NULL;
1036
1037__host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind) {
1038 cudaError_t ret;
1039 // Write your own custom c code in the cudaMemcpy2DFromArray.c file
1040 ret = orig_cudaMemcpy2DFromArray(dst, dpitch, src, wOffset, hOffset, width, height, kind);
1041 return ret;
1042
1043}
1044
1045static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy2DArrayToArray)(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind ) = NULL;
1046
1047__host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind ) {
1048 cudaError_t ret;
1049 // Write your own custom c code in the cudaMemcpy2DArrayToArray.c file
1050 ret = orig_cudaMemcpy2DArrayToArray(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width, height, kind);
1051 return ret;
1052
1053}
1054
1055static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyToSymbol)(const void *symbol, const void *src, size_t count, size_t offset , enum cudaMemcpyKind kind ) = NULL;
1056
1057__host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(const void *symbol, const void *src, size_t count, size_t offset , enum cudaMemcpyKind kind ) {
1058 cudaError_t ret;
1059 // Write your own custom c code in the cudaMemcpyToSymbol.c file
1060 ret = orig_cudaMemcpyToSymbol(symbol, src, count, offset, kind);
1061 return ret;
1062
1063}
1064
1065static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyFromSymbol)(void *dst, const void *symbol, size_t count, size_t offset , enum cudaMemcpyKind kind ) = NULL;
1066
1067__host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(void *dst, const void *symbol, size_t count, size_t offset , enum cudaMemcpyKind kind ) {
1068 cudaError_t ret;
1069 // Write your own custom c code in the cudaMemcpyFromSymbol.c file
1070 ret = orig_cudaMemcpyFromSymbol(dst, symbol, count, offset, kind);
1071 return ret;
1072
1073}
1074
1075static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaMemcpyAsync)(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream ) = NULL;
1076
1077__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream ) {
1078 cudaError_t ret;
1079 // Write your own custom c code in the cudaMemcpyAsync.c file
1080 printf("cudaMemcpyAsync stream %p\n", (void *)stream);
1081 ret = orig_cudaMemcpyAsync(dst, src, count, kind, stream);
1082 return ret;
1083
1084}
1085
1086static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyPeerAsync)(void *dst, int dstDevice, const void *src, int srcDevice, size_t count, cudaStream_t stream ) = NULL;
1087
1088__host__ cudaError_t CUDARTAPI cudaMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice, size_t count, cudaStream_t stream ) {
1089 cudaError_t ret;
1090 // Write your own custom c code in the cudaMemcpyPeerAsync.c file
1091 ret = orig_cudaMemcpyPeerAsync(dst, dstDevice, src, srcDevice, count, stream);
1092 return ret;
1093
1094}
1095
1096static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyToArrayAsync)(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream ) = NULL;
1097
1098__host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream ) {
1099 cudaError_t ret;
1100 // Write your own custom c code in the cudaMemcpyToArrayAsync.c file
1101 ret = orig_cudaMemcpyToArrayAsync(dst, wOffset, hOffset, src, count, kind, stream);
1102 return ret;
1103
1104}
1105
1106static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyFromArrayAsync)(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream ) = NULL;
1107
1108__host__ cudaError_t CUDARTAPI cudaMemcpyFromArrayAsync(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream ) {
1109 cudaError_t ret;
1110 // Write your own custom c code in the cudaMemcpyFromArrayAsync.c file
1111 ret = orig_cudaMemcpyFromArrayAsync(dst, src, wOffset, hOffset, count, kind, stream);
1112 return ret;
1113
1114}
1115
1116static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaMemcpy2DAsync)(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream ) = NULL;
1117
1118__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream ) {
1119 cudaError_t ret;
1120 // Write your own custom c code in the cudaMemcpy2DAsync.c file
1121 ret = orig_cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream);
1122 return ret;
1123
1124}
1125
1126static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy2DToArrayAsync)(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream ) = NULL;
1127
1128__host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream ) {
1129 cudaError_t ret;
1130 // Write your own custom c code in the cudaMemcpy2DToArrayAsync.c file
1131 ret = orig_cudaMemcpy2DToArrayAsync(dst, wOffset, hOffset, src, spitch, width, height, kind, stream);
1132 return ret;
1133
1134}
1135
1136static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy2DFromArrayAsync)(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream ) = NULL;
1137
1138__host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream ) {
1139 cudaError_t ret;
1140 // Write your own custom c code in the cudaMemcpy2DFromArrayAsync.c file
1141 ret = orig_cudaMemcpy2DFromArrayAsync(dst, dpitch, src, wOffset, hOffset, width, height, kind, stream);
1142 return ret;
1143
1144}
1145
1146static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyToSymbolAsync)(const void *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream ) = NULL;
1147
1148__host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(const void *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream ) {
1149 cudaError_t ret;
1150 // Write your own custom c code in the cudaMemcpyToSymbolAsync.c file
1151 ret = orig_cudaMemcpyToSymbolAsync(symbol, src, count, offset, kind, stream);
1152 return ret;
1153
1154}
1155
1156static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyFromSymbolAsync)(void *dst, const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream ) = NULL;
1157
1158__host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(void *dst, const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream ) {
1159 cudaError_t ret;
1160 // Write your own custom c code in the cudaMemcpyFromSymbolAsync.c file
1161 ret = orig_cudaMemcpyFromSymbolAsync(dst, symbol, count, offset, kind, stream);
1162 return ret;
1163
1164}
1165
1166static __host__ cudaError_t CUDARTAPI (*orig_cudaMemset)(void *devPtr, int value, size_t count) = NULL;
1167
1168__host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value, size_t count) {
1169 cudaError_t ret;
1170 // Write your own custom c code in the cudaMemset.c file
1171 ret = orig_cudaMemset(devPtr, value, count);
1172 return ret;
1173
1174}
1175
1176static __host__ cudaError_t CUDARTAPI (*orig_cudaMemset2D)(void *devPtr, size_t pitch, int value, size_t width, size_t height) = NULL;
1177
1178__host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch, int value, size_t width, size_t height) {
1179 cudaError_t ret;
1180 // Write your own custom c code in the cudaMemset2D.c file
1181 ret = orig_cudaMemset2D(devPtr, pitch, value, width, height);
1182 return ret;
1183
1184}
1185
1186static __host__ cudaError_t CUDARTAPI (*orig_cudaMemset3D)(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent) = NULL;
1187
1188__host__ cudaError_t CUDARTAPI cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent) {
1189 cudaError_t ret;
1190 // Write your own custom c code in the cudaMemset3D.c file
1191 ret = orig_cudaMemset3D(pitchedDevPtr, value, extent);
1192 return ret;
1193
1194}
1195
1196static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaMemsetAsync)(void *devPtr, int value, size_t count, cudaStream_t stream ) = NULL;
1197
1198__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream ) {
1199 cudaError_t ret;
1200 // Write your own custom c code in the cudaMemsetAsync.c file
1201 ret = orig_cudaMemsetAsync(devPtr, value, count, stream);
1202 return ret;
1203
1204}
1205
1206static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaMemset2DAsync)(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream ) = NULL;
1207
1208__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream ) {
1209 cudaError_t ret;
1210 // Write your own custom c code in the cudaMemset2DAsync.c file
1211 ret = orig_cudaMemset2DAsync(devPtr, pitch, value, width, height, stream);
1212 return ret;
1213
1214}
1215
1216static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaMemset3DAsync)(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream ) = NULL;
1217
1218__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream ) {
1219 cudaError_t ret;
1220 // Write your own custom c code in the cudaMemset3DAsync.c file
1221 ret = orig_cudaMemset3DAsync(pitchedDevPtr, value, extent, stream);
1222 return ret;
1223
1224}
1225
1226static __host__ cudaError_t CUDARTAPI (*orig_cudaGetSymbolAddress)(void **devPtr, const void *symbol) = NULL;
1227
1228__host__ cudaError_t CUDARTAPI cudaGetSymbolAddress(void **devPtr, const void *symbol) {
1229 cudaError_t ret;
1230 // Write your own custom c code in the cudaGetSymbolAddress.c file
1231 ret = orig_cudaGetSymbolAddress(devPtr, symbol);
1232 return ret;
1233
1234}
1235
1236static __host__ cudaError_t CUDARTAPI (*orig_cudaGetSymbolSize)(size_t *size, const void *symbol) = NULL;
1237
1238__host__ cudaError_t CUDARTAPI cudaGetSymbolSize(size_t *size, const void *symbol) {
1239 cudaError_t ret;
1240 // Write your own custom c code in the cudaGetSymbolSize.c file
1241 ret = orig_cudaGetSymbolSize(size, symbol);
1242 return ret;
1243
1244}
1245
1246static __host__ cudaError_t CUDARTAPI (*orig_cudaMemPrefetchAsync)(const void *devPtr, size_t count, int dstDevice, cudaStream_t stream ) = NULL;
1247
1248__host__ cudaError_t CUDARTAPI cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice, cudaStream_t stream ) {
1249 cudaError_t ret;
1250 // Write your own custom c code in the cudaMemPrefetchAsync.c file
1251 ret = orig_cudaMemPrefetchAsync(devPtr, count, dstDevice, stream);
1252 return ret;
1253
1254}
1255
1256static __host__ cudaError_t CUDARTAPI (*orig_cudaMemAdvise)(const void *devPtr, size_t count, enum cudaMemoryAdvise advice, int device) = NULL;
1257
1258__host__ cudaError_t CUDARTAPI cudaMemAdvise(const void *devPtr, size_t count, enum cudaMemoryAdvise advice, int device) {
1259 cudaError_t ret;
1260 // Write your own custom c code in the cudaMemAdvise.c file
1261 ret = orig_cudaMemAdvise(devPtr, count, advice, device);
1262 return ret;
1263
1264}
1265
1266static __host__ cudaError_t CUDARTAPI (*orig_cudaMemRangeGetAttribute)(void *data, size_t dataSize, enum cudaMemRangeAttribute attribute, const void *devPtr, size_t count) = NULL;
1267
1268__host__ cudaError_t CUDARTAPI cudaMemRangeGetAttribute(void *data, size_t dataSize, enum cudaMemRangeAttribute attribute, const void *devPtr, size_t count) {
1269 cudaError_t ret;
1270 // Write your own custom c code in the cudaMemRangeGetAttribute.c file
1271 ret = orig_cudaMemRangeGetAttribute(data, dataSize, attribute, devPtr, count);
1272 return ret;
1273
1274}
1275
1276static __host__ cudaError_t CUDARTAPI (*orig_cudaMemRangeGetAttributes)(void **data, size_t *dataSizes, enum cudaMemRangeAttribute *attributes, size_t numAttributes, const void *devPtr, size_t count) = NULL;
1277
1278__host__ cudaError_t CUDARTAPI cudaMemRangeGetAttributes(void **data, size_t *dataSizes, enum cudaMemRangeAttribute *attributes, size_t numAttributes, const void *devPtr, size_t count) {
1279 cudaError_t ret;
1280 // Write your own custom c code in the cudaMemRangeGetAttributes.c file
1281 ret = orig_cudaMemRangeGetAttributes(data, dataSizes, attributes, numAttributes, devPtr, count);
1282 return ret;
1283
1284}
1285
1286static __host__ cudaError_t CUDARTAPI (*orig_cudaPointerGetAttributes)(struct cudaPointerAttributes *attributes, const void *ptr) = NULL;
1287
1288__host__ cudaError_t CUDARTAPI cudaPointerGetAttributes(struct cudaPointerAttributes *attributes, const void *ptr) {
1289 cudaError_t ret;
1290 // Write your own custom c code in the cudaPointerGetAttributes.c file
1291 ret = orig_cudaPointerGetAttributes(attributes, ptr);
1292 return ret;
1293
1294}
1295
1296static __host__ cudaError_t CUDARTAPI (*orig_cudaDeviceCanAccessPeer)(int *canAccessPeer, int device, int peerDevice) = NULL;
1297
1298__host__ cudaError_t CUDARTAPI cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice) {
1299 cudaError_t ret;
1300 // Write your own custom c code in the cudaDeviceCanAccessPeer.c file
1301 ret = orig_cudaDeviceCanAccessPeer(canAccessPeer, device, peerDevice);
1302 return ret;
1303
1304}
1305
1306static __host__ cudaError_t CUDARTAPI (*orig_cudaDeviceEnablePeerAccess)(int peerDevice, unsigned int flags) = NULL;
1307
1308__host__ cudaError_t CUDARTAPI cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags) {
1309 cudaError_t ret;
1310 // Write your own custom c code in the cudaDeviceEnablePeerAccess.c file
1311 ret = orig_cudaDeviceEnablePeerAccess(peerDevice, flags);
1312 return ret;
1313
1314}
1315
1316static __host__ cudaError_t CUDARTAPI (*orig_cudaDeviceDisablePeerAccess)(int peerDevice) = NULL;
1317
1318__host__ cudaError_t CUDARTAPI cudaDeviceDisablePeerAccess(int peerDevice) {
1319 cudaError_t ret;
1320 // Write your own custom c code in the cudaDeviceDisablePeerAccess.c file
1321 ret = orig_cudaDeviceDisablePeerAccess(peerDevice);
1322 return ret;
1323
1324}
1325
1326static __host__ cudaError_t CUDARTAPI (*orig_cudaGraphicsUnregisterResource)(cudaGraphicsResource_t resource) = NULL;
1327
1328__host__ cudaError_t CUDARTAPI cudaGraphicsUnregisterResource(cudaGraphicsResource_t resource) {
1329 cudaError_t ret;
1330 // Write your own custom c code in the cudaGraphicsUnregisterResource.c file
1331 ret = orig_cudaGraphicsUnregisterResource(resource);
1332 return ret;
1333
1334}
1335
1336static __host__ cudaError_t CUDARTAPI (*orig_cudaGraphicsResourceSetMapFlags)(cudaGraphicsResource_t resource, unsigned int flags) = NULL;
1337
1338__host__ cudaError_t CUDARTAPI cudaGraphicsResourceSetMapFlags(cudaGraphicsResource_t resource, unsigned int flags) {
1339 cudaError_t ret;
1340 // Write your own custom c code in the cudaGraphicsResourceSetMapFlags.c file
1341 ret = orig_cudaGraphicsResourceSetMapFlags(resource, flags);
1342 return ret;
1343
1344}
1345
1346static __host__ cudaError_t CUDARTAPI (*orig_cudaGraphicsMapResources)(int count, cudaGraphicsResource_t *resources, cudaStream_t stream ) = NULL;
1347
1348__host__ cudaError_t CUDARTAPI cudaGraphicsMapResources(int count, cudaGraphicsResource_t *resources, cudaStream_t stream ) {
1349 cudaError_t ret;
1350 // Write your own custom c code in the cudaGraphicsMapResources.c file
1351 ret = orig_cudaGraphicsMapResources(count, resources, stream);
1352 return ret;
1353
1354}
1355
1356static __host__ cudaError_t CUDARTAPI (*orig_cudaGraphicsUnmapResources)(int count, cudaGraphicsResource_t *resources, cudaStream_t stream ) = NULL;
1357
1358__host__ cudaError_t CUDARTAPI cudaGraphicsUnmapResources(int count, cudaGraphicsResource_t *resources, cudaStream_t stream ) {
1359 cudaError_t ret;
1360 // Write your own custom c code in the cudaGraphicsUnmapResources.c file
1361 ret = orig_cudaGraphicsUnmapResources(count, resources, stream);
1362 return ret;
1363
1364}
1365
1366static __host__ cudaError_t CUDARTAPI (*orig_cudaGraphicsResourceGetMappedPointer)(void **devPtr, size_t *size, cudaGraphicsResource_t resource) = NULL;
1367
1368__host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedPointer(void **devPtr, size_t *size, cudaGraphicsResource_t resource) {
1369 cudaError_t ret;
1370 // Write your own custom c code in the cudaGraphicsResourceGetMappedPointer.c file
1371 ret = orig_cudaGraphicsResourceGetMappedPointer(devPtr, size, resource);
1372 return ret;
1373
1374}
1375
1376static __host__ cudaError_t CUDARTAPI (*orig_cudaGraphicsSubResourceGetMappedArray)(cudaArray_t *array, cudaGraphicsResource_t resource, unsigned int arrayIndex, unsigned int mipLevel) = NULL;
1377
1378__host__ cudaError_t CUDARTAPI cudaGraphicsSubResourceGetMappedArray(cudaArray_t *array, cudaGraphicsResource_t resource, unsigned int arrayIndex, unsigned int mipLevel) {
1379 cudaError_t ret;
1380 // Write your own custom c code in the cudaGraphicsSubResourceGetMappedArray.c file
1381 ret = orig_cudaGraphicsSubResourceGetMappedArray(array, resource, arrayIndex, mipLevel);
1382 return ret;
1383
1384}
1385
1386static __host__ cudaError_t CUDARTAPI (*orig_cudaGraphicsResourceGetMappedMipmappedArray)(cudaMipmappedArray_t *mipmappedArray, cudaGraphicsResource_t resource) = NULL;
1387
1388__host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedMipmappedArray(cudaMipmappedArray_t *mipmappedArray, cudaGraphicsResource_t resource) {
1389 cudaError_t ret;
1390 // Write your own custom c code in the cudaGraphicsResourceGetMappedMipmappedArray.c file
1391 ret = orig_cudaGraphicsResourceGetMappedMipmappedArray(mipmappedArray, resource);
1392 return ret;
1393
1394}
1395
1396static __host__ cudaError_t CUDARTAPI (*orig_cudaGetChannelDesc)(struct cudaChannelFormatDesc *desc, cudaArray_const_t array) = NULL;
1397
1398__host__ cudaError_t CUDARTAPI cudaGetChannelDesc(struct cudaChannelFormatDesc *desc, cudaArray_const_t array) {
1399 cudaError_t ret;
1400 // Write your own custom c code in the cudaGetChannelDesc.c file
1401 ret = orig_cudaGetChannelDesc(desc, array);
1402 return ret;
1403
1404}
1405
1406static __host__ struct cudaChannelFormatDesc CUDARTAPI (*orig_cudaCreateChannelDesc)(int x, int y, int z, int w, enum cudaChannelFormatKind f) = NULL;
1407
1408__host__ struct cudaChannelFormatDesc CUDARTAPI cudaCreateChannelDesc(int x, int y, int z, int w, enum cudaChannelFormatKind f) {
1409 struct cudaChannelFormatDesc ret;
1410 // Write your own custom c code in the cudaCreateChannelDesc.c file
1411 ret = orig_cudaCreateChannelDesc(x, y, z, w, f);
1412 return ret;
1413
1414}
1415
1416static __host__ cudaError_t CUDARTAPI (*orig_cudaBindTexture)(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t size ) = NULL;
1417
1418__host__ cudaError_t CUDARTAPI cudaBindTexture(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t size ) {
1419 cudaError_t ret;
1420 // Write your own custom c code in the cudaBindTexture.c file
1421 ret = orig_cudaBindTexture(offset, texref, devPtr, desc, size);
1422 return ret;
1423
1424}
1425
1426static __host__ cudaError_t CUDARTAPI (*orig_cudaBindTexture2D)(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t width, size_t height, size_t pitch) = NULL;
1427
1428__host__ cudaError_t CUDARTAPI cudaBindTexture2D(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t width, size_t height, size_t pitch) {
1429 cudaError_t ret;
1430 // Write your own custom c code in the cudaBindTexture2D.c file
1431 ret = orig_cudaBindTexture2D(offset, texref, devPtr, desc, width, height, pitch);
1432 return ret;
1433
1434}
1435
1436static __host__ cudaError_t CUDARTAPI (*orig_cudaBindTextureToArray)(const struct textureReference *texref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc) = NULL;
1437
1438__host__ cudaError_t CUDARTAPI cudaBindTextureToArray(const struct textureReference *texref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc) {
1439 cudaError_t ret;
1440 // Write your own custom c code in the cudaBindTextureToArray.c file
1441 ret = orig_cudaBindTextureToArray(texref, array, desc);
1442 return ret;
1443
1444}
1445
1446static __host__ cudaError_t CUDARTAPI (*orig_cudaBindTextureToMipmappedArray)(const struct textureReference *texref, cudaMipmappedArray_const_t mipmappedArray, const struct cudaChannelFormatDesc *desc) = NULL;
1447
1448__host__ cudaError_t CUDARTAPI cudaBindTextureToMipmappedArray(const struct textureReference *texref, cudaMipmappedArray_const_t mipmappedArray, const struct cudaChannelFormatDesc *desc) {
1449 cudaError_t ret;
1450 // Write your own custom c code in the cudaBindTextureToMipmappedArray.c file
1451 ret = orig_cudaBindTextureToMipmappedArray(texref, mipmappedArray, desc);
1452 return ret;
1453
1454}
1455
1456static __host__ cudaError_t CUDARTAPI (*orig_cudaUnbindTexture)(const struct textureReference *texref) = NULL;
1457
1458__host__ cudaError_t CUDARTAPI cudaUnbindTexture(const struct textureReference *texref) {
1459 cudaError_t ret;
1460 // Write your own custom c code in the cudaUnbindTexture.c file
1461 ret = orig_cudaUnbindTexture(texref);
1462 return ret;
1463
1464}
1465
1466static __host__ cudaError_t CUDARTAPI (*orig_cudaGetTextureAlignmentOffset)(size_t *offset, const struct textureReference *texref) = NULL;
1467
1468__host__ cudaError_t CUDARTAPI cudaGetTextureAlignmentOffset(size_t *offset, const struct textureReference *texref) {
1469 cudaError_t ret;
1470 // Write your own custom c code in the cudaGetTextureAlignmentOffset.c file
1471 ret = orig_cudaGetTextureAlignmentOffset(offset, texref);
1472 return ret;
1473
1474}
1475
1476static __host__ cudaError_t CUDARTAPI (*orig_cudaGetTextureReference)(const struct textureReference **texref, const void *symbol) = NULL;
1477
1478__host__ cudaError_t CUDARTAPI cudaGetTextureReference(const struct textureReference **texref, const void *symbol) {
1479 cudaError_t ret;
1480 // Write your own custom c code in the cudaGetTextureReference.c file
1481 ret = orig_cudaGetTextureReference(texref, symbol);
1482 return ret;
1483
1484}
1485
1486static __host__ cudaError_t CUDARTAPI (*orig_cudaBindSurfaceToArray)(const struct surfaceReference *surfref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc) = NULL;
1487
1488__host__ cudaError_t CUDARTAPI cudaBindSurfaceToArray(const struct surfaceReference *surfref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc) {
1489 cudaError_t ret;
1490 // Write your own custom c code in the cudaBindSurfaceToArray.c file
1491 ret = orig_cudaBindSurfaceToArray(surfref, array, desc);
1492 return ret;
1493
1494}
1495
1496static __host__ cudaError_t CUDARTAPI (*orig_cudaGetSurfaceReference)(const struct surfaceReference **surfref, const void *symbol) = NULL;
1497
1498__host__ cudaError_t CUDARTAPI cudaGetSurfaceReference(const struct surfaceReference **surfref, const void *symbol) {
1499 cudaError_t ret;
1500 // Write your own custom c code in the cudaGetSurfaceReference.c file
1501 ret = orig_cudaGetSurfaceReference(surfref, symbol);
1502 return ret;
1503
1504}
1505
1506static __host__ cudaError_t CUDARTAPI (*orig_cudaCreateTextureObject)(cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc, const struct cudaTextureDesc *pTexDesc, const struct cudaResourceViewDesc *pResViewDesc) = NULL;
1507
1508__host__ cudaError_t CUDARTAPI cudaCreateTextureObject(cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc, const struct cudaTextureDesc *pTexDesc, const struct cudaResourceViewDesc *pResViewDesc) {
1509 cudaError_t ret;
1510 // Write your own custom c code in the cudaCreateTextureObject.c file
1511 ret = orig_cudaCreateTextureObject(pTexObject, pResDesc, pTexDesc, pResViewDesc);
1512 return ret;
1513
1514}
1515
1516static __host__ cudaError_t CUDARTAPI (*orig_cudaDestroyTextureObject)(cudaTextureObject_t texObject) = NULL;
1517
1518__host__ cudaError_t CUDARTAPI cudaDestroyTextureObject(cudaTextureObject_t texObject) {
1519 cudaError_t ret;
1520 // Write your own custom c code in the cudaDestroyTextureObject.c file
1521 ret = orig_cudaDestroyTextureObject(texObject);
1522 return ret;
1523
1524}
1525
1526static __host__ cudaError_t CUDARTAPI (*orig_cudaGetTextureObjectResourceDesc)(struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject) = NULL;
1527
1528__host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceDesc(struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject) {
1529 cudaError_t ret;
1530 // Write your own custom c code in the cudaGetTextureObjectResourceDesc.c file
1531 ret = orig_cudaGetTextureObjectResourceDesc(pResDesc, texObject);
1532 return ret;
1533
1534}
1535
1536static __host__ cudaError_t CUDARTAPI (*orig_cudaGetTextureObjectTextureDesc)(struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject) = NULL;
1537
1538__host__ cudaError_t CUDARTAPI cudaGetTextureObjectTextureDesc(struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject) {
1539 cudaError_t ret;
1540 // Write your own custom c code in the cudaGetTextureObjectTextureDesc.c file
1541 ret = orig_cudaGetTextureObjectTextureDesc(pTexDesc, texObject);
1542 return ret;
1543
1544}
1545
1546static __host__ cudaError_t CUDARTAPI (*orig_cudaGetTextureObjectResourceViewDesc)(struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject) = NULL;
1547
1548__host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceViewDesc(struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject) {
1549 cudaError_t ret;
1550 // Write your own custom c code in the cudaGetTextureObjectResourceViewDesc.c file
1551 ret = orig_cudaGetTextureObjectResourceViewDesc(pResViewDesc, texObject);
1552 return ret;
1553
1554}
1555
1556static __host__ cudaError_t CUDARTAPI (*orig_cudaCreateSurfaceObject)(cudaSurfaceObject_t *pSurfObject, const struct cudaResourceDesc *pResDesc) = NULL;
1557
1558__host__ cudaError_t CUDARTAPI cudaCreateSurfaceObject(cudaSurfaceObject_t *pSurfObject, const struct cudaResourceDesc *pResDesc) {
1559 cudaError_t ret;
1560 // Write your own custom c code in the cudaCreateSurfaceObject.c file
1561 ret = orig_cudaCreateSurfaceObject(pSurfObject, pResDesc);
1562 return ret;
1563
1564}
1565
1566static __host__ cudaError_t CUDARTAPI (*orig_cudaDestroySurfaceObject)(cudaSurfaceObject_t surfObject) = NULL;
1567
1568__host__ cudaError_t CUDARTAPI cudaDestroySurfaceObject(cudaSurfaceObject_t surfObject) {
1569 cudaError_t ret;
1570 // Write your own custom c code in the cudaDestroySurfaceObject.c file
1571 ret = orig_cudaDestroySurfaceObject(surfObject);
1572 return ret;
1573
1574}
1575
1576static __host__ cudaError_t CUDARTAPI (*orig_cudaGetSurfaceObjectResourceDesc)(struct cudaResourceDesc *pResDesc, cudaSurfaceObject_t surfObject) = NULL;
1577
1578__host__ cudaError_t CUDARTAPI cudaGetSurfaceObjectResourceDesc(struct cudaResourceDesc *pResDesc, cudaSurfaceObject_t surfObject) {
1579 cudaError_t ret;
1580 // Write your own custom c code in the cudaGetSurfaceObjectResourceDesc.c file
1581 ret = orig_cudaGetSurfaceObjectResourceDesc(pResDesc, surfObject);
1582 return ret;
1583
1584}
1585
1586static __host__ cudaError_t CUDARTAPI (*orig_cudaDriverGetVersion)(int *driverVersion) = NULL;
1587
1588__host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion) {
1589 cudaError_t ret;
1590 // Write your own custom c code in the cudaDriverGetVersion.c file
1591 ret = orig_cudaDriverGetVersion(driverVersion);
1592 return ret;
1593
1594}
1595
1596static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaRuntimeGetVersion)(int *runtimeVersion) = NULL;
1597
1598__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion) {
1599 cudaError_t ret;
1600 // Write your own custom c code in the cudaRuntimeGetVersion.c file
1601 ret = orig_cudaRuntimeGetVersion(runtimeVersion);
1602 return ret;
1603
1604}
1605
1606static __host__ cudaError_t CUDARTAPI (*orig_cudaGetExportTable)(const void **ppExportTable, const cudaUUID_t *pExportTableId) = NULL;
1607
1608__host__ cudaError_t CUDARTAPI cudaGetExportTable(const void **ppExportTable, const cudaUUID_t *pExportTableId) {
1609 cudaError_t ret;
1610 // Write your own custom c code in the cudaGetExportTable.c file
1611 ret = orig_cudaGetExportTable(ppExportTable, pExportTableId);
1612 return ret;
1613
1614}
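// Library constructor: runs automatically when this shared object is loaded
// and binds every orig_* pointer declared above to the real CUDA runtime
// implementation. Because the wrappers call their orig_* pointers
// unconditionally, any symbol that fails to resolve here will surface as a
// crash the first time that API is used rather than as an error return.
// A typical invocation (an illustrative assumption, not part of this code)
// preloads the wrapper ahead of the real runtime, e.g.:
//   LD_PRELOAD=./libcudart_wrapper.so ./my_cuda_application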
1615__attribute__((constructor)) static void init() {
1616 char *dl_error;
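	// Each lookup below follows the same sequence: clear any stale dlerror()
	// state, resolve the symbol with dlsym(RTLD_NEXT, ...) so that the next
	// definition in library search order (the real libcudart) is found, and
	// report any lookup error to stderr without aborting.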
1617 // clear dl error
1618 dlerror();
1619 if (orig_cudaDeviceReset == NULL) {
1620 orig_cudaDeviceReset = dlsym(RTLD_NEXT, "cudaDeviceReset");
1621 }
1622 if ((dl_error = dlerror()) != NULL)
1623 {
1624 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1625 }
1626
1627
1628 // clear dl error
1629 dlerror();
1630 if (orig_cudaDeviceSynchronize == NULL) {
1631 orig_cudaDeviceSynchronize = dlsym(RTLD_NEXT, "cudaDeviceSynchronize");
1632 }
1633 if ((dl_error = dlerror()) != NULL)
1634 {
1635 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1636 }
1637
1638
1639 // clear dl error
1640 dlerror();
1641 if (orig_cudaDeviceSetLimit == NULL) {
1642 orig_cudaDeviceSetLimit = dlsym(RTLD_NEXT, "cudaDeviceSetLimit");
1643 }
1644 if ((dl_error = dlerror()) != NULL)
1645 {
1646 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1647 }
1648
1649
1650 // clear dl error
1651 dlerror();
1652 if (orig_cudaDeviceGetLimit == NULL) {
1653 orig_cudaDeviceGetLimit = dlsym(RTLD_NEXT, "cudaDeviceGetLimit");
1654 }
1655 if ((dl_error = dlerror()) != NULL)
1656 {
1657 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1658 }
1659
1660
1661 // clear dl error
1662 dlerror();
1663 if (orig_cudaDeviceGetCacheConfig == NULL) {
1664 orig_cudaDeviceGetCacheConfig = dlsym(RTLD_NEXT, "cudaDeviceGetCacheConfig");
1665 }
1666 if ((dl_error = dlerror()) != NULL)
1667 {
1668 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1669 }
1670
1671
1672 // clear dl error
1673 dlerror();
1674 if (orig_cudaDeviceGetStreamPriorityRange == NULL) {
1675 orig_cudaDeviceGetStreamPriorityRange = dlsym(RTLD_NEXT, "cudaDeviceGetStreamPriorityRange");
1676 }
1677 if ((dl_error = dlerror()) != NULL)
1678 {
1679 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1680 }
1681
1682
1683 // clear dl error
1684 dlerror();
1685 if (orig_cudaDeviceSetCacheConfig == NULL) {
1686 orig_cudaDeviceSetCacheConfig = dlsym(RTLD_NEXT, "cudaDeviceSetCacheConfig");
1687 }
1688 if ((dl_error = dlerror()) != NULL)
1689 {
1690 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1691 }
1692
1693
1694 // clear dl error
1695 dlerror();
1696 if (orig_cudaDeviceGetSharedMemConfig == NULL) {
1697 orig_cudaDeviceGetSharedMemConfig = dlsym(RTLD_NEXT, "cudaDeviceGetSharedMemConfig");
1698 }
1699 if ((dl_error = dlerror()) != NULL)
1700 {
1701 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1702 }
1703
1704
1705 // clear dl error
1706 dlerror();
1707 if (orig_cudaDeviceSetSharedMemConfig == NULL) {
1708 orig_cudaDeviceSetSharedMemConfig = dlsym(RTLD_NEXT, "cudaDeviceSetSharedMemConfig");
1709 }
1710 if ((dl_error = dlerror()) != NULL)
1711 {
1712 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1713 }
1714
1715
1716 // clear dl error
1717 dlerror();
1718 if (orig_cudaDeviceGetByPCIBusId == NULL) {
1719 orig_cudaDeviceGetByPCIBusId = dlsym(RTLD_NEXT, "cudaDeviceGetByPCIBusId");
1720 }
1721 if ((dl_error = dlerror()) != NULL)
1722 {
1723 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1724 }
1725
1726
1727 // clear dl error
1728 dlerror();
1729 if (orig_cudaDeviceGetPCIBusId == NULL) {
1730 orig_cudaDeviceGetPCIBusId = dlsym(RTLD_NEXT, "cudaDeviceGetPCIBusId");
1731 }
1732 if ((dl_error = dlerror()) != NULL)
1733 {
1734 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1735 }
1736
1737
1738 // clear dl error
1739 dlerror();
1740 if (orig_cudaIpcGetEventHandle == NULL) {
1741 orig_cudaIpcGetEventHandle = dlsym(RTLD_NEXT, "cudaIpcGetEventHandle");
1742 }
1743 if ((dl_error = dlerror()) != NULL)
1744 {
1745 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1746 }
1747
1748
1749 // clear dl error
1750 dlerror();
1751 if (orig_cudaIpcOpenEventHandle == NULL) {
1752 orig_cudaIpcOpenEventHandle = dlsym(RTLD_NEXT, "cudaIpcOpenEventHandle");
1753 }
1754 if ((dl_error = dlerror()) != NULL)
1755 {
1756 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1757 }
1758
1759
1760 // clear dl error
1761 dlerror();
1762 if (orig_cudaIpcGetMemHandle == NULL) {
1763 orig_cudaIpcGetMemHandle = dlsym(RTLD_NEXT, "cudaIpcGetMemHandle");
1764 }
1765 if ((dl_error = dlerror()) != NULL)
1766 {
1767 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1768 }
1769
1770
1771 // clear dl error
1772 dlerror();
1773 if (orig_cudaIpcOpenMemHandle == NULL) {
1774 orig_cudaIpcOpenMemHandle = dlsym(RTLD_NEXT, "cudaIpcOpenMemHandle");
1775 }
1776 if ((dl_error = dlerror()) != NULL)
1777 {
1778 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1779 }
1780
1781
1782 // clear dl error
1783 dlerror();
1784 if (orig_cudaIpcCloseMemHandle == NULL) {
1785 orig_cudaIpcCloseMemHandle = dlsym(RTLD_NEXT, "cudaIpcCloseMemHandle");
1786 }
1787 if ((dl_error = dlerror()) != NULL)
1788 {
1789 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1790 }
1791
1792
1793 // clear dl error
1794 dlerror();
1795 if (orig_cudaThreadExit == NULL) {
1796 orig_cudaThreadExit = dlsym(RTLD_NEXT, "cudaThreadExit");
1797 }
1798 if ((dl_error = dlerror()) != NULL)
1799 {
1800 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1801 }
1802
1803
1804 // clear dl error
1805 dlerror();
1806 if (orig_cudaThreadSynchronize == NULL) {
1807 orig_cudaThreadSynchronize = dlsym(RTLD_NEXT, "cudaThreadSynchronize");
1808 }
1809 if ((dl_error = dlerror()) != NULL)
1810 {
1811 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1812 }
1813
1814
1815 // clear dl error
1816 dlerror();
1817 if (orig_cudaThreadSetLimit == NULL) {
1818 orig_cudaThreadSetLimit = dlsym(RTLD_NEXT, "cudaThreadSetLimit");
1819 }
1820 if ((dl_error = dlerror()) != NULL)
1821 {
1822 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1823 }
1824
1825
1826 // clear dl error
1827 dlerror();
1828 if (orig_cudaThreadGetLimit == NULL) {
1829 orig_cudaThreadGetLimit = dlsym(RTLD_NEXT, "cudaThreadGetLimit");
1830 }
1831 if ((dl_error = dlerror()) != NULL)
1832 {
1833 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1834 }
1835
1836
1837 // clear dl error
1838 dlerror();
1839 if (orig_cudaThreadGetCacheConfig == NULL) {
1840 orig_cudaThreadGetCacheConfig = dlsym(RTLD_NEXT, "cudaThreadGetCacheConfig");
1841 }
1842 if ((dl_error = dlerror()) != NULL)
1843 {
1844 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1845 }
1846
1847
1848 // clear dl error
1849 dlerror();
1850 if (orig_cudaThreadSetCacheConfig == NULL) {
1851 orig_cudaThreadSetCacheConfig = dlsym(RTLD_NEXT, "cudaThreadSetCacheConfig");
1852 }
1853 if ((dl_error = dlerror()) != NULL)
1854 {
1855 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1856 }
1857
1858
1859 // clear dl error
1860 dlerror();
1861 if (orig_cudaGetLastError == NULL) {
1862 orig_cudaGetLastError = dlsym(RTLD_NEXT, "cudaGetLastError");
1863 }
1864 if ((dl_error = dlerror()) != NULL)
1865 {
1866 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1867 }
1868
1869
1870 // clear dl error
1871 dlerror();
1872 if (orig_cudaPeekAtLastError == NULL) {
1873 orig_cudaPeekAtLastError = dlsym(RTLD_NEXT, "cudaPeekAtLastError");
1874 }
1875 if ((dl_error = dlerror()) != NULL)
1876 {
1877 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1878 }
1879
1880
1881 // clear dl error
1882 dlerror();
1883 if (orig_cudaGetErrorName == NULL) {
1884 orig_cudaGetErrorName = dlsym(RTLD_NEXT, "cudaGetErrorName");
1885 }
1886 if ((dl_error = dlerror()) != NULL)
1887 {
1888 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1889 }
1890
1891
1892 // clear dl error
1893 dlerror();
1894 if (orig_cudaGetErrorString == NULL) {
1895 orig_cudaGetErrorString = dlsym(RTLD_NEXT, "cudaGetErrorString");
1896 }
1897 if ((dl_error = dlerror()) != NULL)
1898 {
1899 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1900 }
1901
1902
1903 // clear dl error
1904 dlerror();
1905 if (orig_cudaGetDeviceCount == NULL) {
1906 orig_cudaGetDeviceCount = dlsym(RTLD_NEXT, "cudaGetDeviceCount");
1907 }
1908 if ((dl_error = dlerror()) != NULL)
1909 {
1910 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1911 }
1912
1913
1914 // clear dl error
1915 dlerror();
1916 if (orig_cudaGetDeviceProperties == NULL) {
1917 orig_cudaGetDeviceProperties = dlsym(RTLD_NEXT, "cudaGetDeviceProperties");
1918 }
1919 if ((dl_error = dlerror()) != NULL)
1920 {
1921 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1922 }
1923
1924
1925 // clear dl error
1926 dlerror();
1927 if (orig_cudaDeviceGetAttribute == NULL) {
1928 orig_cudaDeviceGetAttribute = dlsym(RTLD_NEXT, "cudaDeviceGetAttribute");
1929 }
1930 if ((dl_error = dlerror()) != NULL)
1931 {
1932 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1933 }
1934
1935
1936 // clear dl error
1937 dlerror();
1938 if (orig_cudaDeviceGetP2PAttribute == NULL) {
1939 orig_cudaDeviceGetP2PAttribute = dlsym(RTLD_NEXT, "cudaDeviceGetP2PAttribute");
1940 }
1941 if ((dl_error = dlerror()) != NULL)
1942 {
1943 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1944 }
1945
1946
1947 // clear dl error
1948 dlerror();
1949 if (orig_cudaChooseDevice == NULL) {
1950 orig_cudaChooseDevice = dlsym(RTLD_NEXT, "cudaChooseDevice");
1951 }
1952 if ((dl_error = dlerror()) != NULL)
1953 {
1954 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1955 }
1956
1957
1958 // clear dl error
1959 dlerror();
1960 if (orig_cudaSetDevice == NULL) {
1961 orig_cudaSetDevice = dlsym(RTLD_NEXT, "cudaSetDevice");
1962 }
1963 if ((dl_error = dlerror()) != NULL)
1964 {
1965 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1966 }
1967
1968
1969 // clear dl error
1970 dlerror();
1971 if (orig_cudaGetDevice == NULL) {
1972 orig_cudaGetDevice = dlsym(RTLD_NEXT, "cudaGetDevice");
1973 }
1974 if ((dl_error = dlerror()) != NULL)
1975 {
1976 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1977 }
1978
1979
1980 // clear dl error
1981 dlerror();
1982 if (orig_cudaSetValidDevices == NULL) {
1983 orig_cudaSetValidDevices = dlsym(RTLD_NEXT, "cudaSetValidDevices");
1984 }
1985 if ((dl_error = dlerror()) != NULL)
1986 {
1987 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1988 }
1989
1990
1991 // clear dl error
1992 dlerror();
1993 if (orig_cudaSetDeviceFlags == NULL) {
1994 orig_cudaSetDeviceFlags = dlsym(RTLD_NEXT, "cudaSetDeviceFlags");
1995 }
1996 if ((dl_error = dlerror()) != NULL)
1997 {
1998 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1999 }
2000
2001
2002 // clear dl error
2003 dlerror();
2004 if (orig_cudaGetDeviceFlags == NULL) {
2005 orig_cudaGetDeviceFlags = dlsym(RTLD_NEXT, "cudaGetDeviceFlags");
2006 }
2007 if ((dl_error = dlerror()) != NULL)
2008 {
2009 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2010 }
2011
2012
2013 // clear dl error
2014 dlerror();
2015 if (orig_cudaStreamCreate == NULL) {
2016 orig_cudaStreamCreate = dlsym(RTLD_NEXT, "cudaStreamCreate");
2017 }
2018 if ((dl_error = dlerror()) != NULL)
2019 {
2020 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2021 }
2022
2023
2024 // clear dl error
2025 dlerror();
2026 if (orig_cudaStreamCreateWithFlags == NULL) {
2027 orig_cudaStreamCreateWithFlags = dlsym(RTLD_NEXT, "cudaStreamCreateWithFlags");
2028 }
2029 if ((dl_error = dlerror()) != NULL)
2030 {
2031 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2032 }
2033
2034
2035 // clear dl error
2036 dlerror();
2037 if (orig_cudaStreamCreateWithPriority == NULL) {
2038 orig_cudaStreamCreateWithPriority = dlsym(RTLD_NEXT, "cudaStreamCreateWithPriority");
2039 }
2040 if ((dl_error = dlerror()) != NULL)
2041 {
2042 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2043 }
2044
2045
2046 // clear dl error
2047 dlerror();
2048 if (orig_cudaStreamGetPriority == NULL) {
2049 orig_cudaStreamGetPriority = dlsym(RTLD_NEXT, "cudaStreamGetPriority");
2050 }
2051 if ((dl_error = dlerror()) != NULL)
2052 {
2053 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2054 }
2055
2056
2057 // clear dl error
2058 dlerror();
2059 if (orig_cudaStreamGetFlags == NULL) {
2060 orig_cudaStreamGetFlags = dlsym(RTLD_NEXT, "cudaStreamGetFlags");
2061 }
2062 if ((dl_error = dlerror()) != NULL)
2063 {
2064 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2065 }
2066
2067
2068 // clear dl error
2069 dlerror();
2070 if (orig_cudaStreamDestroy == NULL) {
2071 orig_cudaStreamDestroy = dlsym(RTLD_NEXT, "cudaStreamDestroy");
2072 }
2073 if ((dl_error = dlerror()) != NULL)
2074 {
2075 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2076 }
2077
2078
2079 // clear dl error
2080 dlerror();
2081 if (orig_cudaStreamWaitEvent == NULL) {
2082 orig_cudaStreamWaitEvent = dlsym(RTLD_NEXT, "cudaStreamWaitEvent");
2083 }
2084 if ((dl_error = dlerror()) != NULL)
2085 {
2086 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2087 }
2088
2089
2090 // clear dl error
2091 dlerror();
2092 if (orig_cudaStreamAddCallback == NULL) {
2093 orig_cudaStreamAddCallback = dlsym(RTLD_NEXT, "cudaStreamAddCallback");
2094 }
2095 if ((dl_error = dlerror()) != NULL)
2096 {
2097 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2098 }
2099
2100
2101 // clear dl error
2102 dlerror();
2103 if (orig_cudaStreamSynchronize == NULL) {
2104 orig_cudaStreamSynchronize = dlsym(RTLD_NEXT, "cudaStreamSynchronize");
2105 }
2106 if ((dl_error = dlerror()) != NULL)
2107 {
2108 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2109 }
2110
2111
2112 // clear dl error
2113 dlerror();
2114 if (orig_cudaStreamQuery == NULL) {
2115 orig_cudaStreamQuery = dlsym(RTLD_NEXT, "cudaStreamQuery");
2116 }
2117 if ((dl_error = dlerror()) != NULL)
2118 {
2119 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2120 }
2121
2122
2123 // clear dl error
2124 dlerror();
2125 if (orig_cudaStreamAttachMemAsync == NULL) {
2126 orig_cudaStreamAttachMemAsync = dlsym(RTLD_NEXT, "cudaStreamAttachMemAsync");
2127 }
2128 if ((dl_error = dlerror()) != NULL)
2129 {
2130 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2131 }
2132
2133
2134 // clear dl error
2135 dlerror();
2136 if (orig_cudaEventCreate == NULL) {
2137 orig_cudaEventCreate = dlsym(RTLD_NEXT, "cudaEventCreate");
2138 }
2139 if ((dl_error = dlerror()) != NULL)
2140 {
2141 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2142 }
2143
2144
2145 // clear dl error
2146 dlerror();
2147 if (orig_cudaEventCreateWithFlags == NULL) {
2148 orig_cudaEventCreateWithFlags = dlsym(RTLD_NEXT, "cudaEventCreateWithFlags");
2149 }
2150 if ((dl_error = dlerror()) != NULL)
2151 {
2152 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2153 }
2154
2155
2156 // clear dl error
2157 dlerror();
2158 if (orig_cudaEventRecord == NULL) {
2159 orig_cudaEventRecord = dlsym(RTLD_NEXT, "cudaEventRecord");
2160 }
2161 if ((dl_error = dlerror()) != NULL)
2162 {
2163 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2164 }
2165
2166
2167 // clear dl error
2168 dlerror();
2169 if (orig_cudaEventQuery == NULL) {
2170 orig_cudaEventQuery = dlsym(RTLD_NEXT, "cudaEventQuery");
2171 }
2172 if ((dl_error = dlerror()) != NULL)
2173 {
2174 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2175 }
2176
2177
2178 // clear dl error
2179 dlerror();
2180 if (orig_cudaEventSynchronize == NULL) {
2181 orig_cudaEventSynchronize = dlsym(RTLD_NEXT, "cudaEventSynchronize");
2182 }
2183 if ((dl_error = dlerror()) != NULL)
2184 {
2185 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2186 }
2187
2188
2189 // clear dl error
2190 dlerror();
2191 if (orig_cudaEventDestroy == NULL) {
2192 orig_cudaEventDestroy = dlsym(RTLD_NEXT, "cudaEventDestroy");
2193 }
2194 if ((dl_error = dlerror()) != NULL)
2195 {
2196 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2197 }
2198
2199
2200 // clear dl error
2201 dlerror();
2202 if (orig_cudaEventElapsedTime == NULL) {
2203 orig_cudaEventElapsedTime = dlsym(RTLD_NEXT, "cudaEventElapsedTime");
2204 }
2205 if ((dl_error = dlerror()) != NULL)
2206 {
2207 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2208 }
2209
2210
2211 // clear dl error
2212 dlerror();
2213 if (orig_cudaLaunchKernel == NULL) {
2214 orig_cudaLaunchKernel = dlsym(RTLD_NEXT, "cudaLaunchKernel");
2215 }
2216 if ((dl_error = dlerror()) != NULL)
2217 {
2218 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2219 }
2220
2221
2222 // clear dl error
2223 dlerror();
2224 if (orig_cudaLaunchCooperativeKernel == NULL) {
2225 orig_cudaLaunchCooperativeKernel = dlsym(RTLD_NEXT, "cudaLaunchCooperativeKernel");
2226 }
2227 if ((dl_error = dlerror()) != NULL)
2228 {
2229 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2230 }
2231
2232
2233 // clear dl error
2234 dlerror();
2235 if (orig_cudaLaunchCooperativeKernelMultiDevice == NULL) {
2236 orig_cudaLaunchCooperativeKernelMultiDevice = dlsym(RTLD_NEXT, "cudaLaunchCooperativeKernelMultiDevice");
2237 }
2238 if ((dl_error = dlerror()) != NULL)
2239 {
2240 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2241 }
2242
2243
2244 // clear dl error
2245 dlerror();
2246 if (orig_cudaFuncSetCacheConfig == NULL) {
2247 orig_cudaFuncSetCacheConfig = dlsym(RTLD_NEXT, "cudaFuncSetCacheConfig");
2248 }
2249 if ((dl_error = dlerror()) != NULL)
2250 {
2251 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2252 }
2253
2254
2255 // clear dl error
2256 dlerror();
2257 if (orig_cudaFuncSetSharedMemConfig == NULL) {
2258 orig_cudaFuncSetSharedMemConfig = dlsym(RTLD_NEXT, "cudaFuncSetSharedMemConfig");
2259 }
2260 if ((dl_error = dlerror()) != NULL)
2261 {
2262 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2263 }
2264
2265
2266 // clear dl error
2267 dlerror();
2268 if (orig_cudaFuncGetAttributes == NULL) {
2269 orig_cudaFuncGetAttributes = dlsym(RTLD_NEXT, "cudaFuncGetAttributes");
2270 }
2271 if ((dl_error = dlerror()) != NULL)
2272 {
2273 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2274 }
2275
2276
2277 // clear dl error
2278 dlerror();
2279 if (orig_cudaFuncSetAttribute == NULL) {
2280 orig_cudaFuncSetAttribute = dlsym(RTLD_NEXT, "cudaFuncSetAttribute");
2281 }
2282 if ((dl_error = dlerror()) != NULL)
2283 {
2284 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2285 }
2286
2287
2288 // clear dl error
2289 dlerror();
2290 if (orig_cudaSetDoubleForDevice == NULL) {
2291 orig_cudaSetDoubleForDevice = dlsym(RTLD_NEXT, "cudaSetDoubleForDevice");
2292 }
2293 if ((dl_error = dlerror()) != NULL)
2294 {
2295 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2296 }
2297
2298
2299 // clear dl error
2300 dlerror();
2301 if (orig_cudaSetDoubleForHost == NULL) {
2302 orig_cudaSetDoubleForHost = dlsym(RTLD_NEXT, "cudaSetDoubleForHost");
2303 }
2304 if ((dl_error = dlerror()) != NULL)
2305 {
2306 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2307 }
2308
2309
2310 // clear dl error
2311 dlerror();
2312 if (orig_cudaOccupancyMaxActiveBlocksPerMultiprocessor == NULL) {
2313 orig_cudaOccupancyMaxActiveBlocksPerMultiprocessor = dlsym(RTLD_NEXT, "cudaOccupancyMaxActiveBlocksPerMultiprocessor");
2314 }
2315 if ((dl_error = dlerror()) != NULL)
2316 {
2317 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2318 }
2319
2320
2321 // clear dl error
2322 dlerror();
2323 if (orig_cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags == NULL) {
2324 orig_cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags = dlsym(RTLD_NEXT, "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags");
2325 }
2326 if ((dl_error = dlerror()) != NULL)
2327 {
2328 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2329 }
2330
2331
2332 // clear dl error
2333 dlerror();
2334 if (orig_cudaConfigureCall == NULL) {
2335 orig_cudaConfigureCall = dlsym(RTLD_NEXT, "cudaConfigureCall");
2336 }
2337 if ((dl_error = dlerror()) != NULL)
2338 {
2339 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2340 }
2341
2342
2343 // clear dl error
2344 dlerror();
2345 if (orig_cudaSetupArgument == NULL) {
2346 orig_cudaSetupArgument = dlsym(RTLD_NEXT, "cudaSetupArgument");
2347 }
2348 if ((dl_error = dlerror()) != NULL)
2349 {
2350 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2351 }
2352
2353
2354 // clear dl error
2355 dlerror();
2356 if (orig_cudaLaunch == NULL) {
2357 orig_cudaLaunch = dlsym(RTLD_NEXT, "cudaLaunch");
2358 }
2359 if ((dl_error = dlerror()) != NULL)
2360 {
2361 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2362 }
2363
2364
2365 // clear dl error
2366 dlerror();
2367 if (orig_cudaMallocManaged == NULL) {
2368 orig_cudaMallocManaged = dlsym(RTLD_NEXT, "cudaMallocManaged");
2369 }
2370 if ((dl_error = dlerror()) != NULL)
2371 {
2372 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2373 }
2374
2375
2376 // clear dl error
2377 dlerror();
2378 if (orig_cudaMalloc == NULL) {
2379 orig_cudaMalloc = dlsym(RTLD_NEXT, "cudaMalloc");
2380 }
2381 if ((dl_error = dlerror()) != NULL)
2382 {
2383 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2384 }
2385
2386
2387 // clear dl error
2388 dlerror();
2389 if (orig_cudaMallocHost == NULL) {
2390 orig_cudaMallocHost = dlsym(RTLD_NEXT, "cudaMallocHost");
2391 }
2392 if ((dl_error = dlerror()) != NULL)
2393 {
2394 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2395 }
2396
2397
2398 // clear dl error
2399 dlerror();
2400 if (orig_cudaMallocPitch == NULL) {
2401 orig_cudaMallocPitch = dlsym(RTLD_NEXT, "cudaMallocPitch");
2402 }
2403 if ((dl_error = dlerror()) != NULL)
2404 {
2405 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2406 }
2407
2408
2409 // clear dl error
2410 dlerror();
2411 if (orig_cudaMallocArray == NULL) {
2412 orig_cudaMallocArray = dlsym(RTLD_NEXT, "cudaMallocArray");
2413 }
2414 if ((dl_error = dlerror()) != NULL)
2415 {
2416 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2417 }
2418
2419
2420 // clear dl error
2421 dlerror();
2422 if (orig_cudaFree == NULL) {
2423 orig_cudaFree = dlsym(RTLD_NEXT, "cudaFree");
2424 }
2425 if ((dl_error = dlerror()) != NULL)
2426 {
2427 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2428 }
2429
2430
2431 // clear dl error
2432 dlerror();
2433 if (orig_cudaFreeHost == NULL) {
2434 orig_cudaFreeHost = dlsym(RTLD_NEXT, "cudaFreeHost");
2435 }
2436 if ((dl_error = dlerror()) != NULL)
2437 {
2438 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2439 }
2440
2441
2442 // clear dl error
2443 dlerror();
2444 if (orig_cudaFreeArray == NULL) {
2445 orig_cudaFreeArray = dlsym(RTLD_NEXT, "cudaFreeArray");
2446 }
2447 if ((dl_error = dlerror()) != NULL)
2448 {
2449 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2450 }
2451
2452
2453 // clear dl error
2454 dlerror();
2455 if (orig_cudaFreeMipmappedArray == NULL) {
2456 orig_cudaFreeMipmappedArray = dlsym(RTLD_NEXT, "cudaFreeMipmappedArray");
2457 }
2458 if ((dl_error = dlerror()) != NULL)
2459 {
2460 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2461 }
2462
2463
2464 // clear dl error
2465 dlerror();
2466 if (orig_cudaHostAlloc == NULL) {
2467 orig_cudaHostAlloc = dlsym(RTLD_NEXT, "cudaHostAlloc");
2468 }
2469 if ((dl_error = dlerror()) != NULL)
2470 {
2471 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2472 }
2473
2474
2475 // clear dl error
2476 dlerror();
2477 if (orig_cudaHostRegister == NULL) {
2478 orig_cudaHostRegister = dlsym(RTLD_NEXT, "cudaHostRegister");
2479 }
2480 if ((dl_error = dlerror()) != NULL)
2481 {
2482 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2483 }
2484
2485
2486 // clear dl error
2487 dlerror();
2488 if (orig_cudaHostUnregister == NULL) {
2489 orig_cudaHostUnregister = dlsym(RTLD_NEXT, "cudaHostUnregister");
2490 }
2491 if ((dl_error = dlerror()) != NULL)
2492 {
2493 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2494 }
2495
2496
2497 // clear dl error
2498 dlerror();
2499 if (orig_cudaHostGetDevicePointer == NULL) {
2500 orig_cudaHostGetDevicePointer = dlsym(RTLD_NEXT, "cudaHostGetDevicePointer");
2501 }
2502 if ((dl_error = dlerror()) != NULL)
2503 {
2504 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2505 }
2506
2507
2508 // clear dl error
2509 dlerror();
2510 if (orig_cudaHostGetFlags == NULL) {
2511 orig_cudaHostGetFlags = dlsym(RTLD_NEXT, "cudaHostGetFlags");
2512 }
2513 if ((dl_error = dlerror()) != NULL)
2514 {
2515 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2516 }
2517
2518
2519 // clear dl error
2520 dlerror();
2521 if (orig_cudaMalloc3D == NULL) {
2522 orig_cudaMalloc3D = dlsym(RTLD_NEXT, "cudaMalloc3D");
2523 }
2524 if ((dl_error = dlerror()) != NULL)
2525 {
2526 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2527 }
2528
2529
2530 // clear dl error
2531 dlerror();
2532 if (orig_cudaMalloc3DArray == NULL) {
2533 orig_cudaMalloc3DArray = dlsym(RTLD_NEXT, "cudaMalloc3DArray");
2534 }
2535 if ((dl_error = dlerror()) != NULL)
2536 {
2537 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2538 }
2539
2540
2541 // clear dl error
2542 dlerror();
2543 if (orig_cudaMallocMipmappedArray == NULL) {
2544 orig_cudaMallocMipmappedArray = dlsym(RTLD_NEXT, "cudaMallocMipmappedArray");
2545 }
2546 if ((dl_error = dlerror()) != NULL)
2547 {
2548 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2549 }
2550
2551
2552 // clear dl error
2553 dlerror();
2554 if (orig_cudaGetMipmappedArrayLevel == NULL) {
2555 orig_cudaGetMipmappedArrayLevel = dlsym(RTLD_NEXT, "cudaGetMipmappedArrayLevel");
2556 }
2557 if ((dl_error = dlerror()) != NULL)
2558 {
2559 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2560 }
2561
2562
2563 // clear dl error
2564 dlerror();
2565 if (orig_cudaMemcpy3D == NULL) {
2566 orig_cudaMemcpy3D = dlsym(RTLD_NEXT, "cudaMemcpy3D");
2567 }
2568 if ((dl_error = dlerror()) != NULL)
2569 {
2570 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2571 }
2572
2573
2574 // clear dl error
2575 dlerror();
2576 if (orig_cudaMemcpy3DPeer == NULL) {
2577 orig_cudaMemcpy3DPeer = dlsym(RTLD_NEXT, "cudaMemcpy3DPeer");
2578 }
2579 if ((dl_error = dlerror()) != NULL)
2580 {
2581 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2582 }
2583
2584
2585 // clear dl error
2586 dlerror();
2587 if (orig_cudaMemcpy3DAsync == NULL) {
2588 orig_cudaMemcpy3DAsync = dlsym(RTLD_NEXT, "cudaMemcpy3DAsync");
2589 }
2590 if ((dl_error = dlerror()) != NULL)
2591 {
2592 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2593 }
2594
2595
2596 // clear dl error
2597 dlerror();
2598 if (orig_cudaMemcpy3DPeerAsync == NULL) {
2599 orig_cudaMemcpy3DPeerAsync = dlsym(RTLD_NEXT, "cudaMemcpy3DPeerAsync");
2600 }
2601 if ((dl_error = dlerror()) != NULL)
2602 {
2603 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2604 }
2605
2606
2607 // clear dl error
2608 dlerror();
2609 if (orig_cudaMemGetInfo == NULL) {
2610 orig_cudaMemGetInfo = dlsym(RTLD_NEXT, "cudaMemGetInfo");
2611 }
2612 if ((dl_error = dlerror()) != NULL)
2613 {
2614 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2615 }
2616
2617
2618 // clear dl error
2619 dlerror();
2620 if (orig_cudaArrayGetInfo == NULL) {
2621 orig_cudaArrayGetInfo = dlsym(RTLD_NEXT, "cudaArrayGetInfo");
2622 }
2623 if ((dl_error = dlerror()) != NULL)
2624 {
2625 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2626 }
2627
2628
2629 // clear dl error
2630 dlerror();
2631 if (orig_cudaMemcpy == NULL) {
2632 orig_cudaMemcpy = dlsym(RTLD_NEXT, "cudaMemcpy");
2633 }
2634 if ((dl_error = dlerror()) != NULL)
2635 {
2636 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2637 }
2638
2639
2640 // clear dl error
2641 dlerror();
2642 if (orig_cudaMemcpyPeer == NULL) {
2643 orig_cudaMemcpyPeer = dlsym(RTLD_NEXT, "cudaMemcpyPeer");
2644 }
2645 if ((dl_error = dlerror()) != NULL)
2646 {
2647 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2648 }
2649
2650
2651 // clear dl error
2652 dlerror();
2653 if (orig_cudaMemcpyToArray == NULL) {
2654 orig_cudaMemcpyToArray = dlsym(RTLD_NEXT, "cudaMemcpyToArray");
2655 }
2656 if ((dl_error = dlerror()) != NULL)
2657 {
2658 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2659 }
2660
2661
2662 // clear dl error
2663 dlerror();
2664 if (orig_cudaMemcpyFromArray == NULL) {
2665 orig_cudaMemcpyFromArray = dlsym(RTLD_NEXT, "cudaMemcpyFromArray");
2666 }
2667 if ((dl_error = dlerror()) != NULL)
2668 {
2669 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2670 }
2671
2672
2673 // clear dl error
2674 dlerror();
2675 if (orig_cudaMemcpyArrayToArray == NULL) {
2676 orig_cudaMemcpyArrayToArray = dlsym(RTLD_NEXT, "cudaMemcpyArrayToArray");
2677 }
2678 if ((dl_error = dlerror()) != NULL)
2679 {
2680 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2681 }
2682
2683
2684 // clear dl error
2685 dlerror();
2686 if (orig_cudaMemcpy2D == NULL) {
2687 orig_cudaMemcpy2D = dlsym(RTLD_NEXT, "cudaMemcpy2D");
2688 }
2689 if ((dl_error = dlerror()) != NULL)
2690 {
2691 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2692 }
2693
2694
2695 // clear dl error
2696 dlerror();
2697 if (orig_cudaMemcpy2DToArray == NULL) {
2698 orig_cudaMemcpy2DToArray = dlsym(RTLD_NEXT, "cudaMemcpy2DToArray");
2699 }
2700 if ((dl_error = dlerror()) != NULL)
2701 {
2702 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2703 }
2704
2705
2706 // clear dl error
2707 dlerror();
2708 if (orig_cudaMemcpy2DFromArray == NULL) {
2709 orig_cudaMemcpy2DFromArray = dlsym(RTLD_NEXT, "cudaMemcpy2DFromArray");
2710 }
2711 if ((dl_error = dlerror()) != NULL)
2712 {
2713 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2714 }
2715
2716
2717 // clear dl error
2718 dlerror();
2719 if (orig_cudaMemcpy2DArrayToArray == NULL) {
2720 orig_cudaMemcpy2DArrayToArray = dlsym(RTLD_NEXT, "cudaMemcpy2DArrayToArray");
2721 }
2722 if ((dl_error = dlerror()) != NULL)
2723 {
2724 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2725 }
2726
2727
2728 // clear dl error
2729 dlerror();
2730 if (orig_cudaMemcpyToSymbol == NULL) {
2731 orig_cudaMemcpyToSymbol = dlsym(RTLD_NEXT, "cudaMemcpyToSymbol");
2732 }
2733 if ((dl_error = dlerror()) != NULL)
2734 {
2735 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2736 }
2737
2738
2739 // clear dl error
2740 dlerror();
2741 if (orig_cudaMemcpyFromSymbol == NULL) {
2742 orig_cudaMemcpyFromSymbol = dlsym(RTLD_NEXT, "cudaMemcpyFromSymbol");
2743 }
2744 if ((dl_error = dlerror()) != NULL)
2745 {
2746 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2747 }
2748
2749
2750 // clear dl error
2751 dlerror();
2752 if (orig_cudaMemcpyAsync == NULL) {
2753 orig_cudaMemcpyAsync = dlsym(RTLD_NEXT, "cudaMemcpyAsync");
2754 }
2755 if ((dl_error = dlerror()) != NULL)
2756 {
2757 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2758 }
2759
2760
2761 // clear dl error
2762 dlerror();
2763 if (orig_cudaMemcpyPeerAsync == NULL) {
2764 orig_cudaMemcpyPeerAsync = dlsym(RTLD_NEXT, "cudaMemcpyPeerAsync");
2765 }
2766 if ((dl_error = dlerror()) != NULL)
2767 {
2768 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2769 }
2770
2771
2772 // clear dl error
2773 dlerror();
2774 if (orig_cudaMemcpyToArrayAsync == NULL) {
2775 orig_cudaMemcpyToArrayAsync = dlsym(RTLD_NEXT, "cudaMemcpyToArrayAsync");
2776 }
2777 if ((dl_error = dlerror()) != NULL)
2778 {
2779 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2780 }
2781
2782
2783 // clear dl error
2784 dlerror();
2785 if (orig_cudaMemcpyFromArrayAsync == NULL) {
2786 orig_cudaMemcpyFromArrayAsync = dlsym(RTLD_NEXT, "cudaMemcpyFromArrayAsync");
2787 }
2788 if ((dl_error = dlerror()) != NULL)
2789 {
2790 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2791 }
2792
2793
2794 // clear dl error
2795 dlerror();
2796 if (orig_cudaMemcpy2DAsync == NULL) {
2797 orig_cudaMemcpy2DAsync = dlsym(RTLD_NEXT, "cudaMemcpy2DAsync");
2798 }
2799 if ((dl_error = dlerror()) != NULL)
2800 {
2801 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2802 }
2803
2804
2805 // clear dl error
2806 dlerror();
2807 if (orig_cudaMemcpy2DToArrayAsync == NULL) {
2808 orig_cudaMemcpy2DToArrayAsync = dlsym(RTLD_NEXT, "cudaMemcpy2DToArrayAsync");
2809 }
2810 if ((dl_error = dlerror()) != NULL)
2811 {
2812 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2813 }
2814
2815
2816 // clear dl error
2817 dlerror();
2818 if (orig_cudaMemcpy2DFromArrayAsync == NULL) {
2819 orig_cudaMemcpy2DFromArrayAsync = dlsym(RTLD_NEXT, "cudaMemcpy2DFromArrayAsync");
2820 }
2821 if ((dl_error = dlerror()) != NULL)
2822 {
2823 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2824 }
2825
2826
2827 // clear dl error
2828 dlerror();
2829 if (orig_cudaMemcpyToSymbolAsync == NULL) {
2830 orig_cudaMemcpyToSymbolAsync = dlsym(RTLD_NEXT, "cudaMemcpyToSymbolAsync");
2831 }
2832 if ((dl_error = dlerror()) != NULL)
2833 {
2834 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2835 }
2836
2837
2838 // clear dl error
2839 dlerror();
2840 if (orig_cudaMemcpyFromSymbolAsync == NULL) {
2841 orig_cudaMemcpyFromSymbolAsync = dlsym(RTLD_NEXT, "cudaMemcpyFromSymbolAsync");
2842 }
2843 if ((dl_error = dlerror()) != NULL)
2844 {
2845 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2846 }
2847
2848
2849 // clear dl error
2850 dlerror();
2851 if (orig_cudaMemset == NULL) {
2852 orig_cudaMemset = dlsym(RTLD_NEXT, "cudaMemset");
2853 }
2854 if ((dl_error = dlerror()) != NULL)
2855 {
2856 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2857 }
2858
2859
2860 // clear dl error
2861 dlerror();
2862 if (orig_cudaMemset2D == NULL) {
2863 orig_cudaMemset2D = dlsym(RTLD_NEXT, "cudaMemset2D");
2864 }
2865 if ((dl_error = dlerror()) != NULL)
2866 {
2867 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2868 }
2869
2870
2871 // clear dl error
2872 dlerror();
2873 if (orig_cudaMemset3D == NULL) {
2874 orig_cudaMemset3D = dlsym(RTLD_NEXT, "cudaMemset3D");
2875 }
2876 if ((dl_error = dlerror()) != NULL)
2877 {
2878 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2879 }
2880
2881
2882 // clear dl error
2883 dlerror();
2884 if (orig_cudaMemsetAsync == NULL) {
2885 orig_cudaMemsetAsync = dlsym(RTLD_NEXT, "cudaMemsetAsync");
2886 }
2887 if ((dl_error = dlerror()) != NULL)
2888 {
2889 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2890 }
2891
2892
2893 // clear dl error
2894 dlerror();
2895 if (orig_cudaMemset2DAsync == NULL) {
2896 orig_cudaMemset2DAsync = dlsym(RTLD_NEXT, "cudaMemset2DAsync");
2897 }
2898 if ((dl_error = dlerror()) != NULL)
2899 {
2900 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2901 }
2902
2903
2904 // clear dl error
2905 dlerror();
2906 if (orig_cudaMemset3DAsync == NULL) {
2907 orig_cudaMemset3DAsync = dlsym(RTLD_NEXT, "cudaMemset3DAsync");
2908 }
2909 if ((dl_error = dlerror()) != NULL)
2910 {
2911 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2912 }
2913
2914
2915 // clear dl error
2916 dlerror();
2917 if (orig_cudaGetSymbolAddress == NULL) {
2918 orig_cudaGetSymbolAddress = dlsym(RTLD_NEXT, "cudaGetSymbolAddress");
2919 }
2920 if ((dl_error = dlerror()) != NULL)
2921 {
2922 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2923 }
2924
2925
2926 // clear dl error
2927 dlerror();
2928 if (orig_cudaGetSymbolSize == NULL) {
2929 orig_cudaGetSymbolSize = dlsym(RTLD_NEXT, "cudaGetSymbolSize");
2930 }
2931 if ((dl_error = dlerror()) != NULL)
2932 {
2933 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2934 }
2935
2936
2937 // clear dl error
2938 dlerror();
2939 if (orig_cudaMemPrefetchAsync == NULL) {
2940 orig_cudaMemPrefetchAsync = dlsym(RTLD_NEXT, "cudaMemPrefetchAsync");
2941 }
2942 if ((dl_error = dlerror()) != NULL)
2943 {
2944 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2945 }
2946
2947
2948 // clear dl error
2949 dlerror();
2950 if (orig_cudaMemAdvise == NULL) {
2951 orig_cudaMemAdvise = dlsym(RTLD_NEXT, "cudaMemAdvise");
2952 }
2953 if ((dl_error = dlerror()) != NULL)
2954 {
2955 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2956 }
2957
2958
2959 // clear dl error
2960 dlerror();
2961 if (orig_cudaMemRangeGetAttribute == NULL) {
2962 orig_cudaMemRangeGetAttribute = dlsym(RTLD_NEXT, "cudaMemRangeGetAttribute");
2963 }
2964 if ((dl_error = dlerror()) != NULL)
2965 {
2966 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2967 }
2968
2969
2970 // clear dl error
2971 dlerror();
2972 if (orig_cudaMemRangeGetAttributes == NULL) {
2973 orig_cudaMemRangeGetAttributes = dlsym(RTLD_NEXT, "cudaMemRangeGetAttributes");
2974 }
2975 if ((dl_error = dlerror()) != NULL)
2976 {
2977 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2978 }
2979
2980
2981 // clear dl error
2982 dlerror();
2983 if (orig_cudaPointerGetAttributes == NULL) {
2984 orig_cudaPointerGetAttributes = dlsym(RTLD_NEXT, "cudaPointerGetAttributes");
2985 }
2986 if ((dl_error = dlerror()) != NULL)
2987 {
2988 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2989 }
2990
2991
2992 // clear dl error
2993 dlerror();
2994 if (orig_cudaDeviceCanAccessPeer == NULL) {
2995 orig_cudaDeviceCanAccessPeer = dlsym(RTLD_NEXT, "cudaDeviceCanAccessPeer");
2996 }
2997 if ((dl_error = dlerror()) != NULL)
2998 {
2999 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3000 }
3001
3002
3003 // clear dl error
3004 dlerror();
3005 if (orig_cudaDeviceEnablePeerAccess == NULL) {
3006 orig_cudaDeviceEnablePeerAccess = dlsym(RTLD_NEXT, "cudaDeviceEnablePeerAccess");
3007 }
3008 if ((dl_error = dlerror()) != NULL)
3009 {
3010 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3011 }
3012
3013
3014 // clear dl error
3015 dlerror();
3016 if (orig_cudaDeviceDisablePeerAccess == NULL) {
3017 orig_cudaDeviceDisablePeerAccess = dlsym(RTLD_NEXT, "cudaDeviceDisablePeerAccess");
3018 }
3019 if ((dl_error = dlerror()) != NULL)
3020 {
3021 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3022 }
3023
3024
3025 // clear dl error
3026 dlerror();
3027 if (orig_cudaGraphicsUnregisterResource == NULL) {
3028 orig_cudaGraphicsUnregisterResource = dlsym(RTLD_NEXT, "cudaGraphicsUnregisterResource");
3029 }
3030 if ((dl_error = dlerror()) != NULL)
3031 {
3032 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3033 }
3034
3035
3036 // clear dl error
3037 dlerror();
3038 if (orig_cudaGraphicsResourceSetMapFlags == NULL) {
3039 orig_cudaGraphicsResourceSetMapFlags = dlsym(RTLD_NEXT, "cudaGraphicsResourceSetMapFlags");
3040 }
3041 if ((dl_error = dlerror()) != NULL)
3042 {
3043 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3044 }
3045
3046
3047 // clear dl error
3048 dlerror();
3049 if (orig_cudaGraphicsMapResources == NULL) {
3050 orig_cudaGraphicsMapResources = dlsym(RTLD_NEXT, "cudaGraphicsMapResources");
3051 }
3052 if ((dl_error = dlerror()) != NULL)
3053 {
3054 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3055 }
3056
3057
3058 // clear dl error
3059 dlerror();
3060 if (orig_cudaGraphicsUnmapResources == NULL) {
3061 orig_cudaGraphicsUnmapResources = dlsym(RTLD_NEXT, "cudaGraphicsUnmapResources");
3062 }
3063 if ((dl_error = dlerror()) != NULL)
3064 {
3065 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3066 }
3067
3068
3069 // clear dl error
3070 dlerror();
3071 if (orig_cudaGraphicsResourceGetMappedPointer == NULL) {
3072 orig_cudaGraphicsResourceGetMappedPointer = dlsym(RTLD_NEXT, "cudaGraphicsResourceGetMappedPointer");
3073 }
3074 if ((dl_error = dlerror()) != NULL)
3075 {
3076 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3077 }
3078
3079
3080 // clear dl error
3081 dlerror();
3082 if (orig_cudaGraphicsSubResourceGetMappedArray == NULL) {
3083 orig_cudaGraphicsSubResourceGetMappedArray = dlsym(RTLD_NEXT, "cudaGraphicsSubResourceGetMappedArray");
3084 }
3085 if ((dl_error = dlerror()) != NULL)
3086 {
3087 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3088 }
3089
3090
3091 // clear dl error
3092 dlerror();
3093 if (orig_cudaGraphicsResourceGetMappedMipmappedArray == NULL) {
3094 orig_cudaGraphicsResourceGetMappedMipmappedArray = dlsym(RTLD_NEXT, "cudaGraphicsResourceGetMappedMipmappedArray");
3095 }
3096 if ((dl_error = dlerror()) != NULL)
3097 {
3098 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3099 }
3100
3101
3102 // clear dl error
3103 dlerror();
3104 if (orig_cudaGetChannelDesc == NULL) {
3105 orig_cudaGetChannelDesc = dlsym(RTLD_NEXT, "cudaGetChannelDesc");
3106 }
3107 if ((dl_error = dlerror()) != NULL)
3108 {
3109 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3110 }
3111
3112
3113 // clear dl error
3114 dlerror();
3115 if (orig_cudaCreateChannelDesc == NULL) {
3116 orig_cudaCreateChannelDesc = dlsym(RTLD_NEXT, "cudaCreateChannelDesc");
3117 }
3118 if ((dl_error = dlerror()) != NULL)
3119 {
3120 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3121 }
3122
3123
3124 // clear dl error
3125 dlerror();
3126 if (orig_cudaBindTexture == NULL) {
3127 orig_cudaBindTexture = dlsym(RTLD_NEXT, "cudaBindTexture");
3128 }
3129 if ((dl_error = dlerror()) != NULL)
3130 {
3131 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3132 }
3133
3134
3135 // clear dl error
3136 dlerror();
3137 if (orig_cudaBindTexture2D == NULL) {
3138 orig_cudaBindTexture2D = dlsym(RTLD_NEXT, "cudaBindTexture2D");
3139 }
3140 if ((dl_error = dlerror()) != NULL)
3141 {
3142 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3143 }
3144
3145
3146 // clear dl error
3147 dlerror();
3148 if (orig_cudaBindTextureToArray == NULL) {
3149 orig_cudaBindTextureToArray = dlsym(RTLD_NEXT, "cudaBindTextureToArray");
3150 }
3151 if ((dl_error = dlerror()) != NULL)
3152 {
3153 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3154 }
3155
3156
3157 // clear dl error
3158 dlerror();
3159 if (orig_cudaBindTextureToMipmappedArray == NULL) {
3160 orig_cudaBindTextureToMipmappedArray = dlsym(RTLD_NEXT, "cudaBindTextureToMipmappedArray");
3161 }
3162 if ((dl_error = dlerror()) != NULL)
3163 {
3164 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3165 }
3166
3167
3168 // clear dl error
3169 dlerror();
3170 if (orig_cudaUnbindTexture == NULL) {
3171 orig_cudaUnbindTexture = dlsym(RTLD_NEXT, "cudaUnbindTexture");
3172 }
3173 if ((dl_error = dlerror()) != NULL)
3174 {
3175 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3176 }
3177
3178
3179 // clear dl error
3180 dlerror();
3181 if (orig_cudaGetTextureAlignmentOffset == NULL) {
3182 orig_cudaGetTextureAlignmentOffset = dlsym(RTLD_NEXT, "cudaGetTextureAlignmentOffset");
3183 }
3184 if ((dl_error = dlerror()) != NULL)
3185 {
3186 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3187 }
3188
3189
3190 // clear dl error
3191 dlerror();
3192 if (orig_cudaGetTextureReference == NULL) {
3193 orig_cudaGetTextureReference = dlsym(RTLD_NEXT, "cudaGetTextureReference");
3194 }
3195 if ((dl_error = dlerror()) != NULL)
3196 {
3197 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3198 }
3199
3200
3201 // clear dl error
3202 dlerror();
3203 if (orig_cudaBindSurfaceToArray == NULL) {
3204 orig_cudaBindSurfaceToArray = dlsym(RTLD_NEXT, "cudaBindSurfaceToArray");
3205 }
3206 if ((dl_error = dlerror()) != NULL)
3207 {
3208 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3209 }
3210
3211
3212 // clear dl error
3213 dlerror();
3214 if (orig_cudaGetSurfaceReference == NULL) {
3215 orig_cudaGetSurfaceReference = dlsym(RTLD_NEXT, "cudaGetSurfaceReference");
3216 }
3217 if ((dl_error = dlerror()) != NULL)
3218 {
3219 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3220 }
3221
3222
3223 // clear dl error
3224 dlerror();
3225 if (orig_cudaCreateTextureObject == NULL) {
3226 orig_cudaCreateTextureObject = dlsym(RTLD_NEXT, "cudaCreateTextureObject");
3227 }
3228 if ((dl_error = dlerror()) != NULL)
3229 {
3230 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3231 }
3232
3233
3234 // clear dl error
3235 dlerror();
3236 if (orig_cudaDestroyTextureObject == NULL) {
3237 orig_cudaDestroyTextureObject = dlsym(RTLD_NEXT, "cudaDestroyTextureObject");
3238 }
3239 if ((dl_error = dlerror()) != NULL)
3240 {
3241 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3242 }
3243
3244
3245 // clear dl error
3246 dlerror();
3247 if (orig_cudaGetTextureObjectResourceDesc == NULL) {
3248 orig_cudaGetTextureObjectResourceDesc = dlsym(RTLD_NEXT, "cudaGetTextureObjectResourceDesc");
3249 }
3250 if ((dl_error = dlerror()) != NULL)
3251 {
3252 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3253 }
3254
3255
3256 // clear dl error
3257 dlerror();
3258 if (orig_cudaGetTextureObjectTextureDesc == NULL) {
3259 orig_cudaGetTextureObjectTextureDesc = dlsym(RTLD_NEXT, "cudaGetTextureObjectTextureDesc");
3260 }
3261 if ((dl_error = dlerror()) != NULL)
3262 {
3263 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3264 }
3265
3266
3267 // clear dl error
3268 dlerror();
3269 if (orig_cudaGetTextureObjectResourceViewDesc == NULL) {
3270 orig_cudaGetTextureObjectResourceViewDesc = dlsym(RTLD_NEXT, "cudaGetTextureObjectResourceViewDesc");
3271 }
3272 if ((dl_error = dlerror()) != NULL)
3273 {
3274 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3275 }
3276
3277
3278 // clear dl error
3279 dlerror();
3280 if (orig_cudaCreateSurfaceObject == NULL) {
3281 orig_cudaCreateSurfaceObject = dlsym(RTLD_NEXT, "cudaCreateSurfaceObject");
3282 }
3283 if ((dl_error = dlerror()) != NULL)
3284 {
3285 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3286 }
3287
3288
3289 // clear dl error
3290 dlerror();
3291 if (orig_cudaDestroySurfaceObject == NULL) {
3292 orig_cudaDestroySurfaceObject = dlsym(RTLD_NEXT, "cudaDestroySurfaceObject");
3293 }
3294 if ((dl_error = dlerror()) != NULL)
3295 {
3296 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3297 }
3298
3299
3300 // clear dl error
3301 dlerror();
3302 if (orig_cudaGetSurfaceObjectResourceDesc == NULL) {
3303 orig_cudaGetSurfaceObjectResourceDesc = dlsym(RTLD_NEXT, "cudaGetSurfaceObjectResourceDesc");
3304 }
3305 if ((dl_error = dlerror()) != NULL)
3306 {
3307 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3308 }
3309
3310
3311 // clear dl error
3312 dlerror();
3313 if (orig_cudaDriverGetVersion == NULL) {
3314 orig_cudaDriverGetVersion = dlsym(RTLD_NEXT, "cudaDriverGetVersion");
3315 }
3316 if ((dl_error = dlerror()) != NULL)
3317 {
3318 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3319 }
3320
3321
3322 // clear dl error
3323 dlerror();
3324 if (orig_cudaRuntimeGetVersion == NULL) {
3325 orig_cudaRuntimeGetVersion = dlsym(RTLD_NEXT, "cudaRuntimeGetVersion");
3326 }
3327 if ((dl_error = dlerror()) != NULL)
3328 {
3329 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3330 }
3331
3332
3333 // clear dl error
3334 dlerror();
3335 if (orig_cudaGetExportTable == NULL) {
3336 orig_cudaGetExportTable = dlsym(RTLD_NEXT, "cudaGetExportTable");
3337 }
3338 if ((dl_error = dlerror()) != NULL)
3339 {
3340 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3341 }
3342
3343}
diff --git a/schedAPI.h b/schedAPI.h
new file mode 100644
index 0000000..00105fc
--- /dev/null
+++ b/schedAPI.h
@@ -0,0 +1,10 @@
1/*
2 * This include file provides function prototypes for the
3 * GPU scheduler functions implemented by schedLib.c and used
4 * in the Cuda wrapper functions.
5 */
6void streamInit(pid_t my_tid, int priority);
7void schedConfCall(pid_t my_tid, void *stream, int blocks, int threads);
8void schedLaunch(pid_t my_tid);
9void schedSync(pid_t my_tid, void *stream);
10void tracePrint(void);
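/*
 * Illustrative sketch (an assumption, not part of the delivered code): a
 * libcudart wrapper built against these prototypes calls the scheduler
 * before or after the "real" CUDA entry point, roughly as follows. The
 * gettid() helper and the orig_cudaStreamSynchronize pointer are hypothetical
 * names standing in for whatever the wrapper file actually uses.
 *
 *   cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
 *       cudaError_t err = orig_cudaStreamSynchronize(stream); //real sync first
 *       schedSync(gettid(), stream);  //then let the scheduler launch any waiters
 *       return err;
 *   }
 */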
diff --git a/schedLib.c b/schedLib.c
new file mode 100644
index 0000000..d16a066
--- /dev/null
+++ b/schedLib.c
@@ -0,0 +1,629 @@
1/*
2 * This library implements a transparent extension of the NVIDIA runtime
3 * API (libcudart) that is dynamically linked with CUDA programs. This
4 * extension provides a "middleware" scheduling infrastructure that controls
5 * CUDA kernel launch requests. It is designed to control kernel scheduling
6 * for CUDA programs with the following characteristics commonly used for
7 * concurrent GPU sharing:
8 * - A main process that creates multiple threads (pthreads) sharing a
9 * single process address space (i.e., the conditions under which
10 * kernels can run concurrently on a GPU).
11 * - Each thread creates one user-defined CUDA stream (FIFO queue)
12 * that it manages and uses for invoking GPU operations. There is a
13 * one-to-one relationship between threads and streams.
14 * - The program is written to launch kernels using the angle-brackets
15 * syntax (<<<.....>>>) and synchronizes the CPU and GPU with at least
16 * one call to cudaStreamSynchronize() between successive instances of
17 * kernel launches in a given stream.
18 * - The CUDA program is dynamically linked with the CUDA library libcudart
19 *
20 * In the case of a CUDA program with multiple user-defined streams, the NVIDIA
21 * scheduling rules for choosing among multiple streams with kernels at the
22 * top of their FIFO queues are not documented. This middleware attempts to
23 * implement and control some of the scheduling choices that can be made.
24 *
25 * The library functions are transparently invoked by "wrapping" calls
26 * to certain of the original CUDA API functions (described below) and
27 * performing scheduling choices before or after invoking the "real" CUDA
28 * code. Control over which kernel launch requests can be presented to
29 * the NVIDIA software and hardware scheduling mechanisms is achieved
30 * by blocking and signaling operations on the program threads.
31 *
32 * The new library functions were designed following the fundamental
33 * principle of separation between mechanism and policy. Most of the library
34 * is for implementing the mechanisms that are required for any policy.
35 * Many scheduling policies are possible given adequate mechanisms for
36 * carrying out a given policy. The separation of mechanism and policy
37 * makes it easy to try out and evaluate different policies. In the library
38 * code, all aspects of policy are implemented in a single function,
 39 * find_best_kernel(), which returns either an identifier for a stream
40 * to launch a kernel or -1 to indicate that no new launch is allowed.
41 * The policy functions are intended to be implemented as instances
 42 * of the find_best_kernel() function each contained in a .h file named
43 * in a #include statement.
44 *
45 * For a given thread/stream, the basic sequence of actions is:
46 * - The program creates a user-defined stream using one of the calls
47 * cudaStreamCreate(), cudaStreamCreateWithFlags(), or
48 * cudaStreamCreateWithPriority(). These calls first invoke the
49 * new library function streamInit() that initializes state about the
50 * stream and, on the first call, initializes state about the GPU.
51 * The "real" cuda runtime code for creating a stream is then executed.
52 *
53 * - A CUDA kernel launch on a created stream using the angle-brackets syntax
54 * (<<<.....>>>) is compiled with nvcc by generating two API calls which
55 * result in calls to the new library functions:
56 * - cudaConfigureCall() first calls the new library function
57 * schedConfCall() which records configuration information (e.g.,
58 * number of blocks and threads) about the kernel about to be launched
59 * and then executes the "real" library code for configuring a launch.
60 * - cudaLaunch() first calls the new library function schedLaunch().
61 * This function implements a scheduling decision that determines
62 * which, if any, threads, including the one attempting a launch, should
63 * not be blocked so the actual launch can be allowed to happen. For those
64 * that are already blocked, the corresponding condition is signaled.
65 * For the current launching thread, the thread is not blocked and it
66 * executes the "real" CUDA launch. If the current launching thread
67 * should be blocked for later scheduling, the corresponding pthread
68 * condition wait is executed.
69 *
70 * - The CUDA program synchronizes the CPU thread with a launched kernel
71 * using a cudaStreamSynchronize() call. The "wrapper" for this function
72 * has a different sequence of operations. It first executes the "real"
73 * synchronization function which may result in the CPU thread being
74 * blocked until the kernel completes on the GPU. When the "real" call
75 * returns (kernel completed), the new library function schedSync() is called.
76 * It implements a scheduling decision that determines whether any blocked
77 * kernel launches can now be executed and, if so, signals the conditions
78 * that are blocking the threads attempting to launch a kernel. The kernel
79 * launch is then handled by the "real" NVIDIA scheduling functions.
80 *
81 * The limitation inherent in this design is that the underlying Linux scheduler
82 * actually determines the order in which blocked threads run when unblocked.
83 * In the case that multiple blocked threads are signalled, the actual order of
84 * launches depends on how the Linux scheduler orders the thread dispatching.
85 * Only in the case where the scheduling algorithm allows only one thread to launch
86 * a kernel at a time (essentially eliminating any concurrency) can the launch
 87 * order be made completely deterministic. In all other cases, the scheduler
88 * can only control the set of kernels that are allowed to run concurrently,
89 * not the specific order in which they start executing.
90 *
91 * The new scheduling "middleware" is implemented as a library that is compiled
92 * and linked with the cuda wrappers as a dynamic-linked load module (see the Makefile).
93 * A process has one copy of this library program and all threads created by the
94 * process share the global state for the library.
95 *
96 * IMPORTANT: Assumes that a process will create threads only with the POSIX
97 * API call pthread_create() and not use a system call like clone() directly.
98 * Also assumes that there is a one-to-one relationship between threads and
99 * streams and that the Linux thread TID is sufficient to identify a stream.
100 *
101 * Note that all new library calls have a void return. If a call returns, it
102 * can be assumed that the call completed without error.
103 * If any error is identified, the process is terminated.
104 *
105 * Written by Don Smith, Department of Computer Science,
106 * University of North Carolina at Chapel Hill.
107 * 2019.
108 */
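/* Illustrative usage sketch (an assumption about client code, not part of this
 * library): a CUDA program of the kind described above drives the wrapped calls
 * roughly as follows, one pthread per user-defined stream. The names worker and
 * myKernel are hypothetical.
 *
 *   void *worker(void *arg) {
 *       cudaStream_t s;
 *       cudaStreamCreate(&s);                     //wrapper calls streamInit()
 *       myKernel<<<blocks, threads, 0, s>>>(...); //wrapper calls schedConfCall(), then schedLaunch()
 *       cudaStreamSynchronize(s);                 //wrapper calls schedSync() after the real sync
 *       return NULL;
 *   }
 */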
109
110#define _GNU_SOURCE //feature-test macro; must precede all system headers
111#include <stdio.h>
112#include <stdlib.h>
113#include <string.h>
114#include <fcntl.h>
115#include <time.h>
116#include <errno.h>
117#include <unistd.h>
118#include <sys/syscall.h>
119#include <sys/types.h>
120
121#include <pthread.h>
122
123#define TRACE_ON 1 //set to 1 to produce a trace of launch decisions, 0 to disable
124#define MAX_SCHED_TRACE 100000
125
126#define MAX_STREAMS 4 // One per basic ARM core on TX2
127#define MAX_GPU_BLOCKS 64 //Max blocks on 2 SM TX2
128#define MAX_GPU_THREADS 4096 //Max threads on 2 SM TX2
129
130#define min(a,b) ((a) <= (b) ? (a) : (b))
131#define max(a,b) ((a) >= (b) ? (a) : (b))
132
133int trc_idx = 0;
134struct schedTrace *tr_ptr;
135struct schedTrace {
136 int stream[MAX_STREAMS];
137 int stream_threads[MAX_STREAMS];
138 int next;
139 char type[4];
140}SchedTrace[MAX_SCHED_TRACE];
141
142
143int Initialized = 0; //Only initialize GPU once -- set to 1 the first time
144enum st_states {INIT, IDLE, PENDING, READY_LAUNCH, LAUNCHED}; //stream states
145enum gpu_states {FREE, BUSY}; //gpu states
146
147// this mutex is required to protect shared stream and GPU states
148pthread_mutex_t sched_lock = PTHREAD_MUTEX_INITIALIZER;
149
150struct stream {
151 pid_t thread; //the tid (Linux thread id) of the thread "owning" the stream
152 void *stream; //the CUDA runtime pointer of the stream (not currently used).
153 int priority; //stream priority from cudaCreateStreamPriority() or 0.
154 enum st_states state; //current stream state
155 int blocks; //number of blocks in kernel ready to launch
156 int block_threads; //number of threads per block
157 int look_count; //for use in policy algorithms concerned with starvation
158 pthread_mutex_t st_lock; //required for using condition wait/signal on stream
159 pthread_cond_t st_cond; //condition variable for thread/stream block/signal
160} Stream[MAX_STREAMS]; //a thread/stream is identified by an index (str_idx) in the array.
161
162int stream_count = 0; //number of streams that have been created
163int next = 0; //index of stream that can launch the next kernel
164
165struct gpu {
166 enum gpu_states GPU_state; //current GPU state
167 int threads_occupied; //total threads allocated over both SMs
168 int kernels_dispatched; //number of kernels currently dispatched to SMs
169 int streams[MAX_STREAMS]; //for kernel executing, its thread/stream tid, else 0
170 int stream_threads[MAX_STREAMS]; //for kernel executing, its allocated threads, else 0
171};
172
173struct gpu GPU;
174
175void free_gpu_threads(pid_t my_tid, int str_idx);
176void gpu_exit(pid_t my_tid, int str_idx);
177void alloc_gpu_threads(pid_t my_tid, int str_idx);
178void gpu_run(pid_t my_tid, int str_idx);
179int find_best_kernel(void);
180void ready_launch(pid_t my_tid);
181void schedule_next(pid_t my_tid);
182void dispatch_next(int this_one, pid_t my_tid);
183int get_stream(pid_t my_tid);
184void show_gpu_state(void);
185void show_stream_state(int this_one);
186
187
188//Include here the .h file containing the scheduling policy implementation
189//in function: int find_best_kernel(void)
190#include "MinFitMinIntfR2.h"
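/* For illustration only (an assumption, not one of the delivered policies):
 * the simplest conceivable instance of find_best_kernel() is a first-fit
 * policy that launches the first READY_LAUNCH stream whose kernel fits in
 * the GPU threads still available. Like any policy instance, it must be
 * called with sched_lock held and return a stream index or -1.
 *
 *   int find_best_kernel(void) {
 *       int i, need;
 *       int available = MAX_GPU_THREADS - GPU.threads_occupied;
 *       for (i = 0; i < stream_count; i++) {
 *           if (Stream[i].state != READY_LAUNCH)
 *               continue;
 *           need = min(MAX_GPU_THREADS, Stream[i].blocks * Stream[i].block_threads);
 *           if (need <= available)
 *               return i;   //first ready kernel that fits
 *       }
 *       return -1;          //no launch allowed now
 *   }
 */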
191
192/* Function called from library wrapper of cudaDeviceReset().
193 * If any entries have been made in a trace of scheduling decisions
194 * made by find_best_kernel(), they are formatted and written to
195 * stdout.
196 */
197void tracePrint(void)
198{
199 int i, j;
200 for (i = 0; i < trc_idx; i++) {
201 fprintf(stderr, "%d %s %d ", i, SchedTrace[i].type, SchedTrace[i].next);
202 for (j = 0; j < MAX_STREAMS; j++) {
203 fprintf(stderr, "[%d, %d] ", SchedTrace[i].stream[j], SchedTrace[i].stream_threads[j]);
204 }
205 fprintf(stderr, "\n");
206 }
207}
208
209/* Function called from library wrapper of cudaStreamCreateXXXX().
210 * The stream structure at the current index into the Stream structure
211 * array is initialized. The GPU state for the created stream is
212 * also initialized. Each invocation creates a new index
213 * by incrementing stream_count. On the first invocation, the part of
214 * the GPU structure not specific to a stream is also initialized.
215 */
216void streamInit(pid_t my_tid, int priority)
217//my_tid is the thread creating a user-defined stream
218{
219 //WARNING: any flags are ignored.
220
221 //printf("cudaStreamCreate TID %d\n", my_tid);
222 //fflush(stdout);
223
224 pthread_mutex_lock(&sched_lock);
225
226 Stream[stream_count].thread = my_tid; //stream identified by tid of creating thread
227 Stream[stream_count].priority = priority;
228 Stream[stream_count].state = INIT;
229 Stream[stream_count].look_count = 0;
230
231 //stream mutex and condition variable initialized to free
232 pthread_mutex_init(&Stream[stream_count].st_lock, NULL);
233 pthread_cond_init(&Stream[stream_count].st_cond, NULL);
234
235 //initialize GPU state for this newly created stream
236 GPU.streams[stream_count] = 0; //no kernel from stream running
237 GPU.stream_threads[stream_count] = 0; //no threads allocated
238
239 stream_count += 1; //increment stream index
240
241 if (Initialized == 0) { //initialize GPU state on first stream create
242 GPU.threads_occupied = 0;
243 GPU.kernels_dispatched = 0;
244 GPU.GPU_state = FREE;
245
246 //tr_ptr = (struct schedTrace *)mem_ptr;
247 Initialized = 1;
248 }
249
250 pthread_mutex_unlock(&sched_lock);
251}
252
253/* Function called from the library wrapper of cudaConfigureCall()
254 * generated from the <<<.....>>> kernel launch statement in the
255 * CUDA program. The stream state for the stream is initialized
256 * with the block and threads/block counts for the kernel.
257 */
258void schedConfCall(pid_t my_tid, void *stream, int blocks, int threads)
259//my_tid is the thread/stream attempting to launch
260{
261 int str_idx;
262
263 pthread_mutex_lock(&sched_lock);
264
265 //printf("cudaConfigureCall TID %d stream %p blocks %d threads %d\n",
266 // my_tid, stream, blocks, threads);
267 //fflush(stdout);
268
269 // get the stream array index for the thread that "owns" this stream
270 str_idx = get_stream(my_tid);
271
272 //initialize state for the kernel that the thread is launching
273 Stream[str_idx].state = PENDING; //call configured but not launched
274 Stream[str_idx].blocks = blocks; //total blocks in the kernel
275 Stream[str_idx].block_threads = threads; //total threads per block
276
277 pthread_mutex_unlock(&sched_lock);
278}
279
280/* Function called from the library wrapper of cudaLaunch()
281 * generated from the <<<.....>>> kernel launch statement in the
282 * CUDA program. The stream state for the stream is changed to
283 * show that the kernel is ready for launching. The utility
284 * function ready_launch() is called. On return from ready_launch()
285 * this function returns to the wrapper which then invokes the
286 * "real" CUDA launch. The return from ready_launch() is
287 * immediate in the case the scheduler determines that this kernel
288 * can be launched. The call may instead result in blocking
289 * the thread if the scheduler determines that the launch should
290 * be deferred. When the blocking condition is signalled by the
291 * scheduler, ready_launch() then returns to this function.
292 */
293void schedLaunch(pid_t my_tid) {//my_tid is the thread/stream attempting to launch
294 int str_idx;
295
296 pthread_mutex_lock(&sched_lock);
297
298 //printf("cudaLaunch TID %d\n", my_tid);
299 //fflush(stdout);
300
301 // get the stream array index for the thread that "owns" this stream
302 str_idx = get_stream(my_tid);
303
304 Stream[str_idx].state = READY_LAUNCH; //kernel can be considered for scheduling
305
306 //printf("TID %d Ready, Blocks %d Threads %d\n", my_tid,
307 // Stream[str_idx].blocks, Stream[str_idx].block_threads);
308 //fflush(stdout);
309
310 // ready_launch() is called with the sched_lock still held. The function will
311 // either block (and the thread will run when signaled) or will return
312 // immediately. In either case, (a) the lock will have been unlocked, and
313 // (b) the kernel will be launched by the "real" CUDA launch.
314
315 ready_launch(my_tid); // thread/stream will launch on return or after blocking
316}
317
318
319/* Utility function called from schedLaunch(). It invokes the scheduling policy
320 * function, find_best_kernel(), one or more times to determine which, if any,
321 * streams have a kernel that is ready to launch and should be launched. For
322 * streams, other than the one that invoked schedLaunch(), having kernels that
323 * should launch, the utility function dispatch_next() is called to unblock
324 * their owning threads. If the stream owned by the calling thread has a kernel
325 * to launch, the sched_lock is released and the function just returns. If the
326 * kernel in the stream of the calling thread is to be deferred, sched_lock is
327 * released and the thread blocks with a pthread_cond_wait on its stream condition.
328 */
329void ready_launch(pid_t my_tid) {
330//my_tid is the thread/stream attempting to launch
331 int str_idx, rc;
332 int this_one;
333 int will_block;
334
335 //Must be called with sched_lock held. It must release the lock before
336 //returning or blocking and then returning.
337
338 // get the stream array index for the thread that "owns" this stream
339 str_idx = get_stream(my_tid);
340
341 will_block = 1; // assume this thread will block; cleared below if its own kernel is scheduled
342
343 do {
344 /*
345 if (TRACE_ON) {
346 printf("TID %d find new kernel on Launch\n", my_tid);
347 fflush(stdout);
348 }
349 */
350 //call the scheduling policy function. It returns a stream index for
351 //a stream in the READY_LAUNCH state with a kernel to be launched now
352 //(returns -1 if none found)
353 this_one = find_best_kernel();
354
355 if (this_one == str_idx) {//kernel from calling thread can launch
356 will_block = 0; // no block, just return
357 alloc_gpu_threads(my_tid, str_idx); //set up GPU state to launch
358 gpu_run(my_tid, str_idx);
359 Stream[str_idx].state = LAUNCHED; //kernel has been scheduled
360 }
361 else {//kernel from a different thread/stream should be launched
362 if (this_one >= 0)
363 dispatch_next(this_one, my_tid); //set state and signal
364 }
365 } while (this_one >= 0); // -1 indicates no more kernel launches now
366
367 //Must unlock so calling thread can return or block
368 pthread_mutex_unlock(&sched_lock);
369
370 if (will_block == 0)
371 return; //allows launch from calling thread to take place
372
373 // thread/stream must block until scheduler indicates its kernel can launch
374 rc = pthread_mutex_lock(&Stream[str_idx].st_lock);
375 if (rc != 0) {
376 fprintf(stderr, "TID %d Failed - Locking Stream Mutex\n", my_tid);
377 exit (-1);
378 }
379 rc = pthread_cond_wait(&Stream[str_idx].st_cond, &Stream[str_idx].st_lock);
380 if (rc != 0) {
381 fprintf(stderr, "TID %d Failed - Waiting Stream Condition\n", my_tid);
382 exit (-1);
383 }
384 rc = pthread_mutex_unlock(&Stream[str_idx].st_lock);
385 if (rc != 0) {
386 fprintf(stderr, "TID %d Failed - Unlocking Stream Mutex\n", my_tid);
387 exit (-1);
388 }
389}
390
391/* Utility function called from ready_launch() and schedule_next() to set
392 * state and signal a blocked thread/stream so it can execute the "real"
393 * CUDA launch.
394 */
395void dispatch_next(int this_one, pid_t my_tid) {//my_tid is calling thread/stream
396 //this_one is the stream index of the stream to launch a kernel
397 pid_t new_tid;
398 int rc;
399
400 //Must be called with sched_lock held; will be unlocked by caller
401
402 //new_tid is the thread/stream that has been scheduled for kernel launch
403 new_tid = Stream[this_one].thread;
404
405 alloc_gpu_threads(new_tid, this_one); //set up GPU state for launch
406 gpu_run(new_tid, this_one);
407 Stream[this_one].state = LAUNCHED; //kernel has been scheduled
408
409 //signal the blocked thread/stream so it can execute the "real" launch
410 rc = pthread_mutex_lock(&Stream[this_one].st_lock);
411 if (rc != 0) {
412 fprintf(stderr, "TID %d Failed - Locking Stream Mutex\n", my_tid);
413 exit (-1);
414 }
415 rc = pthread_cond_signal(&Stream[this_one].st_cond);
416 if (rc != 0) {
417 fprintf(stderr, "TID %d Failed - Signaling Stream Condition\n", my_tid);
418 exit (-1);
419 }
420 rc = pthread_mutex_unlock(&Stream[this_one].st_lock);
421 if (rc != 0) {
422 fprintf(stderr, "TID %d Failed - Unlocking Stream Mutex\n", my_tid);
423 exit (-1);
424 }
425}
426
427/* Function called from library wrapper of cudaStreamSynchronize().
428 * This CUDA function provides an essential notification that a kernel has
429 * completed execution on the GPU. The CUDA program must be written so
430 * that it synchronizes the CPU and GPU with at least one call to
431 * cudaStreamSynchronize() between successive instances of kernel
432 * launches on a given stream.
433 *
434 * The function sets the stream and GPU state to reflect the kernel's
435 * completion which frees GPU resources for use to execute a new
436 * kernel. It then calls the utility function, schedule_next() to
437 * schedule launches of any kernels the scheduling policy determines should
438 * be eligible to run now.
439 */
440void schedSync(pid_t my_tid, void *stream) {
441//my_tid is the thread/stream synchronizing the CPU with a GPU kernel completion
442 int str_idx;
443
444 pthread_mutex_lock(&sched_lock);
445
446 //printf("cudaStreamSynchronize TID %d stream %p\n", my_tid, stream);
447 //fflush(stdout);
448
449 // get the stream array index for the thread that "owns" this stream
450 str_idx = get_stream(my_tid);
451
452 // if the stream is idle (does not have a kernel being executed), the
453 // call is not related to kernel execution (e.g., is for an asynchronous
454 // cudaMemcpy). It can be ignored.
455 if (Stream[str_idx].state == IDLE) {
456 // unlock for return
457 pthread_mutex_unlock(&sched_lock);
458 return;
459 }
460 // still holding sched_lock here
461 free_gpu_threads(my_tid, str_idx); //set up GPU state for kernel completion
462 gpu_exit(my_tid, str_idx);
463
464 Stream[str_idx].state = IDLE; //set up stream state for kernel completion
465 Stream[str_idx].blocks = 0;
466 Stream[str_idx].block_threads = 0;
467
468 // schedule_next is called with sched_lock held. The function must
469 // release it before returning.
470 schedule_next(my_tid); // which, if any, thread/stream should launch now?
471}
472
473/* Utility function called from schedSync(). It invokes the scheduling policy
474 * function, find_best_kernel(), one or more times to determine which, if any,
475 * streams have a kernel that is ready to launch and should be launched. For
476 * streams having kernels that should launch, the utility function dispatch_next()
477 * is called to unblock their owning threads. Note that the calling thread/stream
478 * cannot have a kernel to schedule until it executes another launch.
479 */
480void schedule_next(pid_t my_tid) {
481//my_tid is the thread/stream synchronizing the CPU with a GPU kernel completion
482
483 int this_one;
484
485 //Must be called with sched_lock held. It must release the lock before
486 //returning.
487
488 do {
489 /*
490 if (TRACE_ON) {
491 printf("TID %d find new kernel on Sync\n", my_tid);
492 fflush(stdout);
493 }
494 */
495
496 //call the scheduling policy function. It returns a stream index for
497 //a stream in the READY_LAUNCH state with a kernel to be launched now
498 //(returns -1 if none found)
499 this_one = find_best_kernel();
500
501 if (this_one >= 0)
502 dispatch_next(this_one, my_tid);
503 } while (this_one >= 0); // -1 indicates no more kernel launches now
504
505 pthread_mutex_unlock(&sched_lock);
506}
507
508/* Utility function called from schedSync() to free GPU threads for a
509 * completed kernel.
510 */
511void free_gpu_threads(pid_t my_tid, int str_idx) {
512 //str_idx is the stream index of the stream with a completed kernel
513 int alloc_threads;
514
515 //Must be called with sched_lock held
516
517 //see alloc_gpu_threads() for a description of thread allocations
518 alloc_threads = min(MAX_GPU_THREADS, Stream[str_idx].blocks * Stream[str_idx].block_threads);
519 GPU.threads_occupied -= alloc_threads;
520 if (GPU.threads_occupied < 0) {
521 fprintf(stderr, "TID %d Failed - GPU Threads < 0\n", my_tid);
522 exit (-1);
523 }
524}
525
526/* Utility function called from schedSync() to set GPU stream state for
527 * a completed kernel.
528 */
529void gpu_exit(pid_t my_tid, int str_idx) {
530 //str_idx is the stream index of the stream with a completed kernel
531
532 //Must be called with sched_lock held
533
534 //printf("GPU Kernel End %d Threads\n",GPU.stream_threads[str_idx]);
535
536 GPU.streams[str_idx] = 0;
537 GPU.stream_threads[str_idx] = 0;
538
539 GPU.kernels_dispatched -= 1;
540 if (GPU.kernels_dispatched < 0) {
541 fprintf(stderr, "TID %d Failed - GPU Kernels < 0\n", my_tid);
542 exit (-1);
543 }
544
545}
546
547/* Utility function called from ready_launch() and dispatch_next() to
548 * allocate GPU threads for a kernel scheduled for launching. The total
549 * number of threads required by the kernel is computed as the number
550 * of blocks in the kernel * the number of threads per block. If the
551 * total thread count is >= MAX_GPU_THREADS, the number of allocated threads
552 * on the GPU is set to MAX_GPU_THREADS so all GPU threads are occupied
553 * until the kernel completes.
554 * NOTE: Once a kernel is launched that occupies all the GPU threads,
555 * no additional kernels can launch until that kernel completes.
556 * This prevents the GPU from concurrently executing the last blocks of
557 * a current kernel with the first blocks of a newly dispatched kernel.
558 */
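/* Worked example (illustrative, using the TX2 limits defined above): a kernel
 * of 32 blocks x 256 threads/block nominally needs 8192 threads, so its
 * allocation is capped at MAX_GPU_THREADS (4096) and it occupies the whole GPU
 * until it completes; a 4 x 128 kernel would occupy only 512 of the 4096.
 */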
559void alloc_gpu_threads(pid_t my_tid, int str_idx) {
560 //str_idx is the stream index of the stream with a scheduled kernel
561
562 int alloc_threads;
563
564 //Must be called with sched_lock held
565
566 alloc_threads = min(MAX_GPU_THREADS, Stream[str_idx].blocks * Stream[str_idx].block_threads);
567 GPU.threads_occupied += alloc_threads;
568 if (GPU.threads_occupied > MAX_GPU_THREADS) {
569 fprintf(stderr, "TID %d Failed - GPU Threads Exceeded\n", my_tid);
570 exit (-1);
571 }
572}
573
574/* Utility function called from ready_launch() and dispatch_next() to set
575 * stream-related and kernel-dispatch state on the GPU for a kernel
576 * scheduled for launching.
577 */
578void gpu_run(pid_t my_tid, int str_idx) {
579 //str_idx is the stream index of the stream with a scheduled kernel
580
581
582 //Must be called with sched_lock held
583
584 GPU.streams[str_idx] = my_tid;
585
586 //see alloc_gpu_threads for a description of thread allocations
587 GPU.stream_threads[str_idx] = min(MAX_GPU_THREADS,
588 Stream[str_idx].blocks * Stream[str_idx].block_threads);
589 /*
590 int i;
591
592 if (TRACE_ON) {
593 printf("GPU Thread Blocks [ ");
594 for (i = 0; i < stream_count; i++) {
595 if (GPU.stream_threads[i] != 0)
596 printf("%d ", GPU.stream_threads[i]);
597 }
598 printf("]\n");
599 }
600 */
601
602 GPU.kernels_dispatched += 1;
603 if (GPU.kernels_dispatched > stream_count) {
604 fprintf(stderr, "TID %d Failed - GPU Kernels > streams\n", my_tid);
605 exit (-1);
606 }
607}
608
609/* Utility function used in multiple functions to find the index in the
610 * stream array for the stream owned by the thread with TID of my_tid.
611 * The stream must have been previously created (cudaStreamCreate()).
612 */
613int get_stream(pid_t my_tid)
614{
615 //Must be called with sched_lock held
616
617 int i;
618
619 for (i = 0; i < MAX_STREAMS; i++) {
620 if (Stream[i].thread == my_tid)
621 break;
622 }
623 if (i == MAX_STREAMS) {
624 fprintf(stderr, "TID %d Failed - get_stream()\n", my_tid);
625 exit (-1);
626 }
627 return i;
628}
629