diff options
author | Joshua Bakita <bakitajoshua@gmail.com> | 2020-10-15 20:44:33 -0400 |
---|---|---|
committer | Joshua Bakita <bakitajoshua@gmail.com> | 2020-10-15 20:44:33 -0400 |
commit | 9e82e2c7cca65a8eb60d5bd99da66241c01a2991 (patch) | |
tree | f5ac2263d40995c09a3ae656f81d860eac3f658c |
Import GPU scheduler code from 2019 GM deliverable
Code provided by Don Smith via tar file. `.gitignore` added by me.
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | Makefile | 13 | ||||
-rw-r--r-- | MinFitMinIntfR2.h | 120 | ||||
-rw-r--r-- | SoftwareDocumentation.docx | bin | 0 -> 105301 bytes | |||
-rw-r--r-- | libcudart_wrapper.c | 3343 | ||||
-rw-r--r-- | schedAPI.h | 10 | ||||
-rw-r--r-- | schedLib.c | 629 |
7 files changed, 4116 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..be747da --- /dev/null +++ b/.gitignore | |||
@@ -0,0 +1 @@ | |||
libcudart_wrapper.so | |||
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..15fe29a --- /dev/null +++ b/Makefile | |||
@@ -0,0 +1,13 @@ | |||
# Build the LD_PRELOAD scheduling shim as a single shared object.
CC := gcc
CCFLAGS := -Wall -fPIC -shared
LDFLAGS := -ldl -lpthread -lrt
INCLUDES := -I/usr/local/cuda/include/

################################################################################

# 'all' and 'build' name no real files; mark them phony so a stray file
# named 'all' or 'build' cannot short-circuit the build.
.PHONY: all build clean

all: build

build: libcudart_wrapper.so

# Rebuild when the project headers change, not only the .c files.
libcudart_wrapper.so: schedLib.c libcudart_wrapper.c schedAPI.h MinFitMinIntfR2.h
	$(CC) $(CCFLAGS) $(INCLUDES) -o $@ schedLib.c libcudart_wrapper.c $(LDFLAGS)

clean:
	rm -f libcudart_wrapper.so
diff --git a/MinFitMinIntfR2.h b/MinFitMinIntfR2.h new file mode 100644 index 0000000..819b2b4 --- /dev/null +++ b/MinFitMinIntfR2.h | |||
@@ -0,0 +1,120 @@ | |||
1 | /* Scheduling policy function that implements a "min thread use, min interference" policy, | ||
2 | * i.e., find the ready-to-launch kernel that will occupy the smallest number of available | ||
3 | * GPU threads AND does not fail a test for interference effects. The test for | ||
4 | * interference effects requires that ratio between the number of threads in the | ||
5 | * kernel under consideration and any kernel already scheduled does not exceed a | ||
6 | * threshold (in this implementation, 2.0). This test is motivated by empirical | ||
7 | * measurements that have shown interference effects such as 500% or higher for | ||
8 | * large thread ratios between concurrently executing kernels. This is thought | ||
9 | * to be an artifact of the un-documented warp scheduling algorithm in the NVIDIA SMs. | ||
10 | */ | ||
11 | |||
12 | //put any global (static) declarations here: | ||
13 | |||
14 | |||
15 | #define MAX_THREAD_RATIO 2.0 // Threshold ratio between scheduled and new kernel | ||
16 | |||
17 | |||
18 | int find_best_kernel(void) { | ||
19 | int i; | ||
20 | int this_one = -1; //default return value indicating no kernel to launch | ||
21 | int need_threads, available_threads, left_over; | ||
22 | int k; | ||
23 | int occupied_threads[MAX_STREAMS]; //GPU threads allocated to scheduled kernels | ||
24 | |||
25 | //Must be called with sched_lock held | ||
26 | |||
27 | |||
28 | //record the allocated GPU threads in the kernel scheduled for each stream | ||
29 | for (i = 0; i < stream_count; i++) | ||
30 | occupied_threads[i] = GPU.stream_threads[i]; | ||
31 | |||
32 | //GPU threads available for allocation | ||
33 | available_threads = (MAX_GPU_THREADS - GPU.threads_occupied); | ||
34 | left_over = -1; //the number of threads left available if a kernel is scheduled | ||
35 | |||
36 | for (i = 0; i < stream_count; i++) { //examine all streams | ||
37 | if (Stream[i].state == READY_LAUNCH) { //only threads/streams ready to launch are considered | ||
38 | |||
39 | //determine how many threads would be allocated for this kernel (see | ||
40 | //allocate_gpu_threads() for a description) | ||
41 | need_threads = min(MAX_GPU_THREADS, Stream[i].blocks * Stream[i].block_threads); | ||
42 | if (need_threads > available_threads) //can't be scheduled | ||
43 | continue; | ||
44 | |||
45 | // find kernel with smalled thread allocation that does not create thread imbalance | ||
46 | //?? should there be a starvation-prevention part of this policy ?? | ||
47 | |||
48 | if ((available_threads - need_threads) > left_over) { | ||
49 | //found kernel with smallest thread allocation so far | ||
50 | //compute and test the ratios of threads between it and all kernels scheduled | ||
51 | |||
52 | for (k = 0; k < stream_count; k++) {//examine all streams | ||
53 | if (occupied_threads[k] == 0) //stream has no kernel scheduled | ||
54 | continue; | ||
55 | //if test fails for any already scheduled kernel, this stream can't launch | ||
56 | if ((float)(occupied_threads[k] / (float)need_threads) > MAX_THREAD_RATIO) | ||
57 | break; | ||
58 | if ((float)(need_threads / (float)occupied_threads[k]) > MAX_THREAD_RATIO) | ||
59 | break; | ||
60 | } | ||
61 | //if the test is passed for all scheduled kernels, this stream's kernel can launch | ||
62 | if (k == stream_count) { | ||
63 | this_one = i; //the final value of this_one is the stream index to schedule (or -1) | ||
64 | left_over = available_threads - need_threads; //current smallest thread allocation | ||
65 | } | ||
66 | } //end test for smaller thread allocation | ||
67 | } //end test for stream ready to launch | ||
68 | } //end outer for loop | ||
69 | |||
70 | if (TRACE_ON) { | ||
71 | show_gpu_state(); | ||
72 | show_stream_state(this_one); | ||
73 | } | ||
74 | return this_one; //the scheduling decision (stream index) | ||
75 | } | ||
76 | |||
77 | // Utility function to trace GPU state used in scheduling policy decisions | ||
78 | void show_gpu_state(void) { | ||
79 | |||
80 | //Must be called with sched_lock held | ||
81 | |||
82 | int i; | ||
83 | if (trc_idx >= MAX_SCHED_TRACE) | ||
84 | return; | ||
85 | |||
86 | for (i = 0; i < MAX_STREAMS; i++) { | ||
87 | SchedTrace[trc_idx].stream[i] = GPU.streams[i]; | ||
88 | SchedTrace[trc_idx].stream_threads[i] = GPU.stream_threads[i]; | ||
89 | SchedTrace[trc_idx].next = 0; | ||
90 | strcpy(SchedTrace[trc_idx].type, "GPU"); | ||
91 | } | ||
92 | trc_idx++; | ||
93 | } | ||
94 | |||
95 | // Utility function to trace stream state used in scheduling policy decisions | ||
96 | void show_stream_state(int this_one) { | ||
97 | |||
98 | //Must be called with sched_lock held | ||
99 | |||
100 | int i; | ||
101 | int need_threads; | ||
102 | if (trc_idx >= MAX_SCHED_TRACE) | ||
103 | return; | ||
104 | |||
105 | for (i = 0; i < MAX_STREAMS; i++) { | ||
106 | need_threads = min(MAX_GPU_THREADS, Stream[i].blocks * Stream[i].block_threads); | ||
107 | if ((Stream[i].state != READY_LAUNCH) && | ||
108 | (Stream[i].state != LAUNCHED)) | ||
109 | need_threads = -need_threads; //encode unschedulable state in threads with minus | ||
110 | SchedTrace[trc_idx].stream[i] = Stream[i].thread; | ||
111 | SchedTrace[trc_idx].stream_threads[i] = need_threads; | ||
112 | if (this_one == -1) | ||
113 | SchedTrace[trc_idx].next = this_one; | ||
114 | else | ||
115 | SchedTrace[trc_idx].next = Stream[this_one].thread; | ||
116 | strcpy(SchedTrace[trc_idx].type, "STR"); | ||
117 | } | ||
118 | trc_idx++; | ||
119 | } | ||
120 | |||
diff --git a/SoftwareDocumentation.docx b/SoftwareDocumentation.docx new file mode 100644 index 0000000..903706f --- /dev/null +++ b/SoftwareDocumentation.docx | |||
Binary files differ | |||
diff --git a/libcudart_wrapper.c b/libcudart_wrapper.c new file mode 100644 index 0000000..8e4f005 --- /dev/null +++ b/libcudart_wrapper.c | |||
@@ -0,0 +1,3343 @@ | |||
1 | /* Wrapper functions to implement transparent extension of the CUDA runtime library | ||
2 | * (libcudart) by dynamic linking this set of function interfaces ahead of the | ||
3 | * "real" library using LD_PRELOAD. Calls to library extension functions | ||
4 | * can be embedded in these wrapper functions. The specific calls used in the | ||
5 | * scheduling "middleware" are defined in the include file schedAPI.h below. | ||
6 | * | ||
7 | * WARNING - Do not change this file unless you are totally sure you know what you | ||
8 | * are doing! | ||
9 | * | ||
10 | */ | ||
11 | |||
12 | #define _GNU_SOURCE | ||
13 | |||
14 | #include <stdio.h> | ||
15 | #include <dlfcn.h> | ||
16 | #include <unistd.h> | ||
17 | #include <sys/types.h> | ||
18 | #include <sys/syscall.h> | ||
19 | #include "cuda_runtime_api.h" | ||
20 | #include "schedAPI.h" //function prototypes for calls to library extensions | ||
21 | |||
22 | static __host__ cudaError_t CUDARTAPI (*orig_cudaDeviceReset)(void) = NULL; | ||
23 | |||
24 | __host__ cudaError_t CUDARTAPI cudaDeviceReset(void) { | ||
25 | cudaError_t ret; | ||
26 | // Write your own custom c code in the cudaDeviceReset.c file | ||
27 | tracePrint(); | ||
28 | ret = orig_cudaDeviceReset(); | ||
29 | return ret; | ||
30 | |||
31 | } | ||
32 | |||
33 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaDeviceSynchronize)(void) = NULL; | ||
34 | |||
35 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceSynchronize(void) { | ||
36 | cudaError_t ret; | ||
37 | // Write your own custom c code in the cudaDeviceSynchronize.c file | ||
38 | ret = orig_cudaDeviceSynchronize(); | ||
39 | return ret; | ||
40 | |||
41 | } | ||
42 | |||
43 | static __host__ cudaError_t CUDARTAPI (*orig_cudaDeviceSetLimit)(enum cudaLimit limit, size_t value) = NULL; | ||
44 | |||
45 | __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit, size_t value) { | ||
46 | cudaError_t ret; | ||
47 | // Write your own custom c code in the cudaDeviceSetLimit.c file | ||
48 | ret = orig_cudaDeviceSetLimit(limit, value); | ||
49 | return ret; | ||
50 | |||
51 | } | ||
52 | |||
53 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaDeviceGetLimit)(size_t *pValue, enum cudaLimit limit) = NULL; | ||
54 | |||
55 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit) { | ||
56 | cudaError_t ret; | ||
57 | // Write your own custom c code in the cudaDeviceGetLimit.c file | ||
58 | ret = orig_cudaDeviceGetLimit(pValue, limit); | ||
59 | return ret; | ||
60 | |||
61 | } | ||
62 | |||
63 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaDeviceGetCacheConfig)(enum cudaFuncCache *pCacheConfig) = NULL; | ||
64 | |||
65 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig) { | ||
66 | cudaError_t ret; | ||
67 | // Write your own custom c code in the cudaDeviceGetCacheConfig.c file | ||
68 | ret = orig_cudaDeviceGetCacheConfig(pCacheConfig); | ||
69 | return ret; | ||
70 | |||
71 | } | ||
72 | |||
73 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaDeviceGetStreamPriorityRange)(int *leastPriority, int *greatestPriority) = NULL; | ||
74 | |||
75 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetStreamPriorityRange(int *leastPriority, int *greatestPriority) { | ||
76 | cudaError_t ret; | ||
77 | // Write your own custom c code in the cudaDeviceGetStreamPriorityRange.c file | ||
78 | ret = orig_cudaDeviceGetStreamPriorityRange(leastPriority, greatestPriority); | ||
79 | return ret; | ||
80 | |||
81 | } | ||
82 | |||
83 | static __host__ cudaError_t CUDARTAPI (*orig_cudaDeviceSetCacheConfig)(enum cudaFuncCache cacheConfig) = NULL; | ||
84 | |||
85 | __host__ cudaError_t CUDARTAPI cudaDeviceSetCacheConfig(enum cudaFuncCache cacheConfig) { | ||
86 | cudaError_t ret; | ||
87 | // Write your own custom c code in the cudaDeviceSetCacheConfig.c file | ||
88 | ret = orig_cudaDeviceSetCacheConfig(cacheConfig); | ||
89 | return ret; | ||
90 | |||
91 | } | ||
92 | |||
93 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaDeviceGetSharedMemConfig)(enum cudaSharedMemConfig *pConfig) = NULL; | ||
94 | |||
95 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig) { | ||
96 | cudaError_t ret; | ||
97 | // Write your own custom c code in the cudaDeviceGetSharedMemConfig.c file | ||
98 | ret = orig_cudaDeviceGetSharedMemConfig(pConfig); | ||
99 | return ret; | ||
100 | |||
101 | } | ||
102 | |||
103 | static __host__ cudaError_t CUDARTAPI (*orig_cudaDeviceSetSharedMemConfig)(enum cudaSharedMemConfig config) = NULL; | ||
104 | |||
105 | __host__ cudaError_t CUDARTAPI cudaDeviceSetSharedMemConfig(enum cudaSharedMemConfig config) { | ||
106 | cudaError_t ret; | ||
107 | // Write your own custom c code in the cudaDeviceSetSharedMemConfig.c file | ||
108 | ret = orig_cudaDeviceSetSharedMemConfig(config); | ||
109 | return ret; | ||
110 | |||
111 | } | ||
112 | |||
113 | static __host__ cudaError_t CUDARTAPI (*orig_cudaDeviceGetByPCIBusId)(int *device, const char *pciBusId) = NULL; | ||
114 | |||
115 | __host__ cudaError_t CUDARTAPI cudaDeviceGetByPCIBusId(int *device, const char *pciBusId) { | ||
116 | cudaError_t ret; | ||
117 | // Write your own custom c code in the cudaDeviceGetByPCIBusId.c file | ||
118 | ret = orig_cudaDeviceGetByPCIBusId(device, pciBusId); | ||
119 | return ret; | ||
120 | |||
121 | } | ||
122 | |||
123 | static __host__ cudaError_t CUDARTAPI (*orig_cudaDeviceGetPCIBusId)(char *pciBusId, int len, int device) = NULL; | ||
124 | |||
125 | __host__ cudaError_t CUDARTAPI cudaDeviceGetPCIBusId(char *pciBusId, int len, int device) { | ||
126 | cudaError_t ret; | ||
127 | // Write your own custom c code in the cudaDeviceGetPCIBusId.c file | ||
128 | ret = orig_cudaDeviceGetPCIBusId(pciBusId, len, device); | ||
129 | return ret; | ||
130 | |||
131 | } | ||
132 | |||
133 | static __host__ cudaError_t CUDARTAPI (*orig_cudaIpcGetEventHandle)(cudaIpcEventHandle_t *handle, cudaEvent_t event) = NULL; | ||
134 | |||
135 | __host__ cudaError_t CUDARTAPI cudaIpcGetEventHandle(cudaIpcEventHandle_t *handle, cudaEvent_t event) { | ||
136 | cudaError_t ret; | ||
137 | // Write your own custom c code in the cudaIpcGetEventHandle.c file | ||
138 | ret = orig_cudaIpcGetEventHandle(handle, event); | ||
139 | return ret; | ||
140 | |||
141 | } | ||
142 | |||
143 | static __host__ cudaError_t CUDARTAPI (*orig_cudaIpcOpenEventHandle)(cudaEvent_t *event, cudaIpcEventHandle_t handle) = NULL; | ||
144 | |||
145 | __host__ cudaError_t CUDARTAPI cudaIpcOpenEventHandle(cudaEvent_t *event, cudaIpcEventHandle_t handle) { | ||
146 | cudaError_t ret; | ||
147 | // Write your own custom c code in the cudaIpcOpenEventHandle.c file | ||
148 | ret = orig_cudaIpcOpenEventHandle(event, handle); | ||
149 | return ret; | ||
150 | |||
151 | } | ||
152 | |||
153 | static __host__ cudaError_t CUDARTAPI (*orig_cudaIpcGetMemHandle)(cudaIpcMemHandle_t *handle, void *devPtr) = NULL; | ||
154 | |||
155 | __host__ cudaError_t CUDARTAPI cudaIpcGetMemHandle(cudaIpcMemHandle_t *handle, void *devPtr) { | ||
156 | cudaError_t ret; | ||
157 | // Write your own custom c code in the cudaIpcGetMemHandle.c file | ||
158 | ret = orig_cudaIpcGetMemHandle(handle, devPtr); | ||
159 | return ret; | ||
160 | |||
161 | } | ||
162 | |||
163 | static __host__ cudaError_t CUDARTAPI (*orig_cudaIpcOpenMemHandle)(void **devPtr, cudaIpcMemHandle_t handle, unsigned int flags) = NULL; | ||
164 | |||
165 | __host__ cudaError_t CUDARTAPI cudaIpcOpenMemHandle(void **devPtr, cudaIpcMemHandle_t handle, unsigned int flags) { | ||
166 | cudaError_t ret; | ||
167 | // Write your own custom c code in the cudaIpcOpenMemHandle.c file | ||
168 | ret = orig_cudaIpcOpenMemHandle(devPtr, handle, flags); | ||
169 | return ret; | ||
170 | |||
171 | } | ||
172 | |||
173 | static __host__ cudaError_t CUDARTAPI (*orig_cudaIpcCloseMemHandle)(void *devPtr) = NULL; | ||
174 | |||
175 | __host__ cudaError_t CUDARTAPI cudaIpcCloseMemHandle(void *devPtr) { | ||
176 | cudaError_t ret; | ||
177 | // Write your own custom c code in the cudaIpcCloseMemHandle.c file | ||
178 | ret = orig_cudaIpcCloseMemHandle(devPtr); | ||
179 | return ret; | ||
180 | |||
181 | } | ||
182 | |||
183 | static __host__ cudaError_t CUDARTAPI (*orig_cudaThreadExit)(void) = NULL; | ||
184 | |||
185 | __host__ cudaError_t CUDARTAPI cudaThreadExit(void) { | ||
186 | cudaError_t ret; | ||
187 | // Write your own custom c code in the cudaThreadExit.c file | ||
188 | ret = orig_cudaThreadExit(); | ||
189 | return ret; | ||
190 | |||
191 | } | ||
192 | |||
193 | static __host__ cudaError_t CUDARTAPI (*orig_cudaThreadSynchronize)(void) = NULL; | ||
194 | |||
195 | __host__ cudaError_t CUDARTAPI cudaThreadSynchronize(void) { | ||
196 | cudaError_t ret; | ||
197 | // Write your own custom c code in the cudaThreadSynchronize.c file | ||
198 | ret = orig_cudaThreadSynchronize(); | ||
199 | return ret; | ||
200 | |||
201 | } | ||
202 | |||
203 | static __host__ cudaError_t CUDARTAPI (*orig_cudaThreadSetLimit)(enum cudaLimit limit, size_t value) = NULL; | ||
204 | |||
205 | __host__ cudaError_t CUDARTAPI cudaThreadSetLimit(enum cudaLimit limit, size_t value) { | ||
206 | cudaError_t ret; | ||
207 | // Write your own custom c code in the cudaThreadSetLimit.c file | ||
208 | ret = orig_cudaThreadSetLimit(limit, value); | ||
209 | return ret; | ||
210 | |||
211 | } | ||
212 | |||
213 | static __host__ cudaError_t CUDARTAPI (*orig_cudaThreadGetLimit)(size_t *pValue, enum cudaLimit limit) = NULL; | ||
214 | |||
215 | __host__ cudaError_t CUDARTAPI cudaThreadGetLimit(size_t *pValue, enum cudaLimit limit) { | ||
216 | cudaError_t ret; | ||
217 | // Write your own custom c code in the cudaThreadGetLimit.c file | ||
218 | ret = orig_cudaThreadGetLimit(pValue, limit); | ||
219 | return ret; | ||
220 | |||
221 | } | ||
222 | |||
223 | static __host__ cudaError_t CUDARTAPI (*orig_cudaThreadGetCacheConfig)(enum cudaFuncCache *pCacheConfig) = NULL; | ||
224 | |||
225 | __host__ cudaError_t CUDARTAPI cudaThreadGetCacheConfig(enum cudaFuncCache *pCacheConfig) { | ||
226 | cudaError_t ret; | ||
227 | // Write your own custom c code in the cudaThreadGetCacheConfig.c file | ||
228 | ret = orig_cudaThreadGetCacheConfig(pCacheConfig); | ||
229 | return ret; | ||
230 | |||
231 | } | ||
232 | |||
233 | static __host__ cudaError_t CUDARTAPI (*orig_cudaThreadSetCacheConfig)(enum cudaFuncCache cacheConfig) = NULL; | ||
234 | |||
235 | __host__ cudaError_t CUDARTAPI cudaThreadSetCacheConfig(enum cudaFuncCache cacheConfig) { | ||
236 | cudaError_t ret; | ||
237 | // Write your own custom c code in the cudaThreadSetCacheConfig.c file | ||
238 | ret = orig_cudaThreadSetCacheConfig(cacheConfig); | ||
239 | return ret; | ||
240 | |||
241 | } | ||
242 | |||
243 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaGetLastError)(void) = NULL; | ||
244 | |||
245 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void) { | ||
246 | cudaError_t ret; | ||
247 | // Write your own custom c code in the cudaGetLastError.c file | ||
248 | ret = orig_cudaGetLastError(); | ||
249 | return ret; | ||
250 | |||
251 | } | ||
252 | |||
253 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaPeekAtLastError)(void) = NULL; | ||
254 | |||
255 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void) { | ||
256 | cudaError_t ret; | ||
257 | // Write your own custom c code in the cudaPeekAtLastError.c file | ||
258 | ret = orig_cudaPeekAtLastError(); | ||
259 | return ret; | ||
260 | |||
261 | } | ||
262 | |||
263 | static __host__ __cudart_builtin__ const char* CUDARTAPI (*orig_cudaGetErrorName)(cudaError_t error) = NULL; | ||
264 | |||
265 | __host__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error) { | ||
266 | const char* ret; | ||
267 | // Write your own custom c code in the cudaGetErrorName.c file | ||
268 | ret = orig_cudaGetErrorName(error); | ||
269 | return ret; | ||
270 | |||
271 | } | ||
272 | |||
273 | static __host__ __cudart_builtin__ const char* CUDARTAPI (*orig_cudaGetErrorString)(cudaError_t error) = NULL; | ||
274 | |||
275 | __host__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error) { | ||
276 | const char* ret; | ||
277 | // Write your own custom c code in the cudaGetErrorString.c file | ||
278 | ret = orig_cudaGetErrorString(error); | ||
279 | return ret; | ||
280 | |||
281 | } | ||
282 | |||
283 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaGetDeviceCount)(int *count) = NULL; | ||
284 | |||
285 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count) { | ||
286 | cudaError_t ret; | ||
287 | // Write your own custom c code in the cudaGetDeviceCount.c file | ||
288 | ret = orig_cudaGetDeviceCount(count); | ||
289 | return ret; | ||
290 | |||
291 | } | ||
292 | |||
293 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaGetDeviceProperties)(struct cudaDeviceProp *prop, int device) = NULL; | ||
294 | |||
295 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device) { | ||
296 | cudaError_t ret; | ||
297 | // Write your own custom c code in the cudaGetDeviceProperties.c file | ||
298 | ret = orig_cudaGetDeviceProperties(prop, device); | ||
299 | return ret; | ||
300 | |||
301 | } | ||
302 | |||
303 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaDeviceGetAttribute)(int *value, enum cudaDeviceAttr attr, int device) = NULL; | ||
304 | |||
305 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device) { | ||
306 | cudaError_t ret; | ||
307 | // Write your own custom c code in the cudaDeviceGetAttribute.c file | ||
308 | ret = orig_cudaDeviceGetAttribute(value, attr, device); | ||
309 | return ret; | ||
310 | |||
311 | } | ||
312 | |||
313 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaDeviceGetP2PAttribute)(int *value, enum cudaDeviceP2PAttr attr, int srcDevice, int dstDevice) = NULL; | ||
314 | |||
315 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetP2PAttribute(int *value, enum cudaDeviceP2PAttr attr, int srcDevice, int dstDevice) { | ||
316 | cudaError_t ret; | ||
317 | // Write your own custom c code in the cudaDeviceGetP2PAttribute.c file | ||
318 | ret = orig_cudaDeviceGetP2PAttribute(value, attr, srcDevice, dstDevice); | ||
319 | return ret; | ||
320 | |||
321 | } | ||
322 | |||
323 | static __host__ cudaError_t CUDARTAPI (*orig_cudaChooseDevice)(int *device, const struct cudaDeviceProp *prop) = NULL; | ||
324 | |||
325 | __host__ cudaError_t CUDARTAPI cudaChooseDevice(int *device, const struct cudaDeviceProp *prop) { | ||
326 | cudaError_t ret; | ||
327 | // Write your own custom c code in the cudaChooseDevice.c file | ||
328 | ret = orig_cudaChooseDevice(device, prop); | ||
329 | return ret; | ||
330 | |||
331 | } | ||
332 | |||
333 | static __host__ cudaError_t CUDARTAPI (*orig_cudaSetDevice)(int device) = NULL; | ||
334 | |||
335 | __host__ cudaError_t CUDARTAPI cudaSetDevice(int device) { | ||
336 | cudaError_t ret; | ||
337 | // Write your own custom c code in the cudaSetDevice.c file | ||
338 | ret = orig_cudaSetDevice(device); | ||
339 | return ret; | ||
340 | |||
341 | } | ||
342 | |||
343 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaGetDevice)(int *device) = NULL; | ||
344 | |||
345 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device) { | ||
346 | cudaError_t ret; | ||
347 | // Write your own custom c code in the cudaGetDevice.c file | ||
348 | ret = orig_cudaGetDevice(device); | ||
349 | return ret; | ||
350 | |||
351 | } | ||
352 | |||
353 | static __host__ cudaError_t CUDARTAPI (*orig_cudaSetValidDevices)(int *device_arr, int len) = NULL; | ||
354 | |||
355 | __host__ cudaError_t CUDARTAPI cudaSetValidDevices(int *device_arr, int len) { | ||
356 | cudaError_t ret; | ||
357 | // Write your own custom c code in the cudaSetValidDevices.c file | ||
358 | ret = orig_cudaSetValidDevices(device_arr, len); | ||
359 | return ret; | ||
360 | |||
361 | } | ||
362 | |||
363 | static __host__ cudaError_t CUDARTAPI (*orig_cudaSetDeviceFlags)( unsigned int flags ) = NULL; | ||
364 | |||
365 | __host__ cudaError_t CUDARTAPI cudaSetDeviceFlags( unsigned int flags ) { | ||
366 | cudaError_t ret; | ||
367 | // Write your own custom c code in the cudaSetDeviceFlags.c file | ||
368 | ret = orig_cudaSetDeviceFlags(flags); | ||
369 | return ret; | ||
370 | |||
371 | } | ||
372 | |||
373 | static __host__ cudaError_t CUDARTAPI (*orig_cudaGetDeviceFlags)( unsigned int *flags ) = NULL; | ||
374 | |||
375 | __host__ cudaError_t CUDARTAPI cudaGetDeviceFlags( unsigned int *flags ) { | ||
376 | cudaError_t ret; | ||
377 | // Write your own custom c code in the cudaGetDeviceFlags.c file | ||
378 | ret = orig_cudaGetDeviceFlags(flags); | ||
379 | return ret; | ||
380 | |||
381 | } | ||
382 | |||
383 | static __host__ cudaError_t CUDARTAPI (*orig_cudaStreamCreate)(cudaStream_t *pStream) = NULL; | ||
384 | |||
385 | __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream) { | ||
386 | cudaError_t ret; | ||
387 | pid_t my_tid = syscall(SYS_gettid); | ||
388 | streamInit(my_tid, 0); | ||
389 | |||
390 | // Write your own custom c code in the cudaStreamCreate.c file | ||
391 | ret = orig_cudaStreamCreate(pStream); | ||
392 | return ret; | ||
393 | |||
394 | } | ||
395 | |||
396 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaStreamCreateWithFlags)(cudaStream_t *pStream, unsigned int flags) = NULL; | ||
397 | |||
398 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags) { | ||
399 | cudaError_t ret; | ||
400 | pid_t my_tid = syscall(SYS_gettid); | ||
401 | streamInit(my_tid, 0); | ||
402 | |||
403 | // Write your own custom c code in the cudaStreamCreateWithFlags.c file | ||
404 | ret = orig_cudaStreamCreateWithFlags(pStream, flags); | ||
405 | return ret; | ||
406 | |||
407 | } | ||
408 | |||
409 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaStreamCreateWithPriority)(cudaStream_t *pStream, unsigned int flags, int priority) = NULL; | ||
410 | |||
411 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags, int priority) { | ||
412 | cudaError_t ret; | ||
413 | pid_t my_tid = syscall(SYS_gettid); | ||
414 | streamInit(my_tid, priority); | ||
415 | |||
416 | // Write your own custom c code in the cudaStreamCreateWithPriority.c file | ||
417 | ret = orig_cudaStreamCreateWithPriority(pStream, flags, priority); | ||
418 | return ret; | ||
419 | |||
420 | } | ||
421 | |||
422 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaStreamGetPriority)(cudaStream_t hStream, int *priority) = NULL; | ||
423 | |||
424 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetPriority(cudaStream_t hStream, int *priority) { | ||
425 | cudaError_t ret; | ||
426 | // Write your own custom c code in the cudaStreamGetPriority.c file | ||
427 | ret = orig_cudaStreamGetPriority(hStream, priority); | ||
428 | return ret; | ||
429 | |||
430 | } | ||
431 | |||
432 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaStreamGetFlags)(cudaStream_t hStream, unsigned int *flags) = NULL; | ||
433 | |||
434 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags) { | ||
435 | cudaError_t ret; | ||
436 | // Write your own custom c code in the cudaStreamGetFlags.c file | ||
437 | ret = orig_cudaStreamGetFlags(hStream, flags); | ||
438 | return ret; | ||
439 | |||
440 | } | ||
441 | |||
442 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaStreamDestroy)(cudaStream_t stream) = NULL; | ||
443 | |||
444 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream) { | ||
445 | cudaError_t ret; | ||
446 | // Write your own custom c code in the cudaStreamDestroy.c file | ||
447 | ret = orig_cudaStreamDestroy(stream); | ||
448 | return ret; | ||
449 | |||
450 | } | ||
451 | |||
452 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaStreamWaitEvent)(cudaStream_t stream, cudaEvent_t event, unsigned int flags) = NULL; | ||
453 | |||
454 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags) { | ||
455 | cudaError_t ret; | ||
456 | // Write your own custom c code in the cudaStreamWaitEvent.c file | ||
457 | ret = orig_cudaStreamWaitEvent(stream, event, flags); | ||
458 | return ret; | ||
459 | |||
460 | } | ||
461 | |||
462 | static __host__ cudaError_t CUDARTAPI (*orig_cudaStreamAddCallback)(cudaStream_t stream, cudaStreamCallback_t callback, void *userData, unsigned int flags) = NULL; | ||
463 | |||
464 | __host__ cudaError_t CUDARTAPI cudaStreamAddCallback(cudaStream_t stream, cudaStreamCallback_t callback, void *userData, unsigned int flags) { | ||
465 | cudaError_t ret; | ||
466 | // Write your own custom c code in the cudaStreamAddCallback.c file | ||
467 | ret = orig_cudaStreamAddCallback(stream, callback, userData, flags); | ||
468 | return ret; | ||
469 | |||
470 | } | ||
471 | |||
472 | static __host__ cudaError_t CUDARTAPI (*orig_cudaStreamSynchronize)(cudaStream_t stream) = NULL; | ||
473 | |||
474 | __host__ cudaError_t CUDARTAPI cudaStreamSynchronize(cudaStream_t stream) { | ||
475 | cudaError_t ret; | ||
476 | pid_t my_tid = syscall(SYS_gettid); | ||
477 | |||
478 | // Write your own custom c code in the cudaStreamSynchronize.c file | ||
479 | //printf("cudaStreamSynchronize stream %p\n", (void *)stream); | ||
480 | |||
481 | ret = orig_cudaStreamSynchronize(stream); | ||
482 | schedSync(my_tid, (void *)stream); | ||
483 | return ret; | ||
484 | |||
485 | } | ||
486 | |||
487 | static __host__ cudaError_t CUDARTAPI (*orig_cudaStreamQuery)(cudaStream_t stream) = NULL; | ||
488 | |||
489 | __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream) { | ||
490 | cudaError_t ret; | ||
491 | // Write your own custom c code in the cudaStreamQuery.c file | ||
492 | ret = orig_cudaStreamQuery(stream); | ||
493 | return ret; | ||
494 | |||
495 | } | ||
496 | |||
497 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaStreamAttachMemAsync)(cudaStream_t stream, void *devPtr, size_t length , unsigned int flags ) = NULL; | ||
498 | |||
499 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length , unsigned int flags ) { | ||
500 | cudaError_t ret; | ||
501 | // Write your own custom c code in the cudaStreamAttachMemAsync.c file | ||
502 | ret = orig_cudaStreamAttachMemAsync(stream, devPtr, length, flags); | ||
503 | return ret; | ||
504 | |||
505 | } | ||
506 | |||
507 | static __host__ cudaError_t CUDARTAPI (*orig_cudaEventCreate)(cudaEvent_t *event) = NULL; | ||
508 | |||
509 | __host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event) { | ||
510 | cudaError_t ret; | ||
511 | // Write your own custom c code in the cudaEventCreate.c file | ||
512 | ret = orig_cudaEventCreate(event); | ||
513 | return ret; | ||
514 | |||
515 | } | ||
516 | |||
517 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaEventCreateWithFlags)(cudaEvent_t *event, unsigned int flags) = NULL; | ||
518 | |||
519 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags) { | ||
520 | cudaError_t ret; | ||
521 | // Write your own custom c code in the cudaEventCreateWithFlags.c file | ||
522 | ret = orig_cudaEventCreateWithFlags(event, flags); | ||
523 | return ret; | ||
524 | |||
525 | } | ||
526 | |||
527 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaEventRecord)(cudaEvent_t event, cudaStream_t stream ) = NULL; | ||
528 | |||
529 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream ) { | ||
530 | cudaError_t ret; | ||
531 | // Write your own custom c code in the cudaEventRecord.c file | ||
532 | ret = orig_cudaEventRecord(event, stream); | ||
533 | return ret; | ||
534 | |||
535 | } | ||
536 | |||
537 | static __host__ cudaError_t CUDARTAPI (*orig_cudaEventQuery)(cudaEvent_t event) = NULL; | ||
538 | |||
539 | __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event) { | ||
540 | cudaError_t ret; | ||
541 | // Write your own custom c code in the cudaEventQuery.c file | ||
542 | ret = orig_cudaEventQuery(event); | ||
543 | return ret; | ||
544 | |||
545 | } | ||
546 | |||
547 | static __host__ cudaError_t CUDARTAPI (*orig_cudaEventSynchronize)(cudaEvent_t event) = NULL; | ||
548 | |||
549 | __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event) { | ||
550 | cudaError_t ret; | ||
551 | // Write your own custom c code in the cudaEventSynchronize.c file | ||
552 | ret = orig_cudaEventSynchronize(event); | ||
553 | return ret; | ||
554 | |||
555 | } | ||
556 | |||
557 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaEventDestroy)(cudaEvent_t event) = NULL; | ||
558 | |||
559 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event) { | ||
560 | cudaError_t ret; | ||
561 | // Write your own custom c code in the cudaEventDestroy.c file | ||
562 | ret = orig_cudaEventDestroy(event); | ||
563 | return ret; | ||
564 | |||
565 | } | ||
566 | |||
567 | static __host__ cudaError_t CUDARTAPI (*orig_cudaEventElapsedTime)(float *ms, cudaEvent_t start, cudaEvent_t end) = NULL; | ||
568 | |||
569 | __host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms, cudaEvent_t start, cudaEvent_t end) { | ||
570 | cudaError_t ret; | ||
571 | // Write your own custom c code in the cudaEventElapsedTime.c file | ||
572 | ret = orig_cudaEventElapsedTime(ms, start, end); | ||
573 | return ret; | ||
574 | |||
575 | } | ||
576 | |||
577 | static __host__ cudaError_t CUDARTAPI (*orig_cudaLaunchKernel)(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) = NULL; | ||
578 | |||
579 | __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) { | ||
580 | cudaError_t ret; | ||
581 | // Write your own custom c code in the cudaLaunchKernel.c file | ||
582 | ret = orig_cudaLaunchKernel(func, gridDim, blockDim, args, sharedMem, stream); | ||
583 | return ret; | ||
584 | |||
585 | } | ||
586 | |||
587 | static __host__ cudaError_t CUDARTAPI (*orig_cudaLaunchCooperativeKernel)(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) = NULL; | ||
588 | |||
589 | __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) { | ||
590 | cudaError_t ret; | ||
591 | // Write your own custom c code in the cudaLaunchCooperativeKernel.c file | ||
592 | ret = orig_cudaLaunchCooperativeKernel(func, gridDim, blockDim, args, sharedMem, stream); | ||
593 | return ret; | ||
594 | |||
595 | } | ||
596 | |||
597 | static __host__ cudaError_t CUDARTAPI (*orig_cudaLaunchCooperativeKernelMultiDevice)(struct cudaLaunchParams *launchParamsList, unsigned int numDevices, unsigned int flags ) = NULL; | ||
598 | |||
599 | __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *launchParamsList, unsigned int numDevices, unsigned int flags ) { | ||
600 | cudaError_t ret; | ||
601 | // Write your own custom c code in the cudaLaunchCooperativeKernelMultiDevice.c file | ||
602 | ret = orig_cudaLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags); | ||
603 | return ret; | ||
604 | |||
605 | } | ||
606 | |||
607 | static __host__ cudaError_t CUDARTAPI (*orig_cudaFuncSetCacheConfig)(const void *func, enum cudaFuncCache cacheConfig) = NULL; | ||
608 | |||
609 | __host__ cudaError_t CUDARTAPI cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig) { | ||
610 | cudaError_t ret; | ||
611 | // Write your own custom c code in the cudaFuncSetCacheConfig.c file | ||
612 | ret = orig_cudaFuncSetCacheConfig(func, cacheConfig); | ||
613 | return ret; | ||
614 | |||
615 | } | ||
616 | |||
617 | static __host__ cudaError_t CUDARTAPI (*orig_cudaFuncSetSharedMemConfig)(const void *func, enum cudaSharedMemConfig config) = NULL; | ||
618 | |||
619 | __host__ cudaError_t CUDARTAPI cudaFuncSetSharedMemConfig(const void *func, enum cudaSharedMemConfig config) { | ||
620 | cudaError_t ret; | ||
621 | // Write your own custom c code in the cudaFuncSetSharedMemConfig.c file | ||
622 | ret = orig_cudaFuncSetSharedMemConfig(func, config); | ||
623 | return ret; | ||
624 | |||
625 | } | ||
626 | |||
627 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaFuncGetAttributes)(struct cudaFuncAttributes *attr, const void *func) = NULL; | ||
628 | |||
629 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func) { | ||
630 | cudaError_t ret; | ||
631 | // Write your own custom c code in the cudaFuncGetAttributes.c file | ||
632 | ret = orig_cudaFuncGetAttributes(attr, func); | ||
633 | return ret; | ||
634 | |||
635 | } | ||
636 | |||
637 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaFuncSetAttribute)(const void *func, enum cudaFuncAttribute attr, int value) = NULL; | ||
638 | |||
639 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncSetAttribute(const void *func, enum cudaFuncAttribute attr, int value) { | ||
640 | cudaError_t ret; | ||
641 | // Write your own custom c code in the cudaFuncSetAttribute.c file | ||
642 | ret = orig_cudaFuncSetAttribute(func, attr, value); | ||
643 | return ret; | ||
644 | |||
645 | } | ||
646 | |||
647 | static __host__ cudaError_t CUDARTAPI (*orig_cudaSetDoubleForDevice)(double *d) = NULL; | ||
648 | |||
649 | __host__ cudaError_t CUDARTAPI cudaSetDoubleForDevice(double *d) { | ||
650 | cudaError_t ret; | ||
651 | // Write your own custom c code in the cudaSetDoubleForDevice.c file | ||
652 | ret = orig_cudaSetDoubleForDevice(d); | ||
653 | return ret; | ||
654 | |||
655 | } | ||
656 | |||
657 | static __host__ cudaError_t CUDARTAPI (*orig_cudaSetDoubleForHost)(double *d) = NULL; | ||
658 | |||
659 | __host__ cudaError_t CUDARTAPI cudaSetDoubleForHost(double *d) { | ||
660 | cudaError_t ret; | ||
661 | // Write your own custom c code in the cudaSetDoubleForHost.c file | ||
662 | ret = orig_cudaSetDoubleForHost(d); | ||
663 | return ret; | ||
664 | |||
665 | } | ||
666 | |||
667 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaOccupancyMaxActiveBlocksPerMultiprocessor)(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize) = NULL; | ||
668 | |||
669 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize) { | ||
670 | cudaError_t ret; | ||
671 | // Write your own custom c code in the cudaOccupancyMaxActiveBlocksPerMultiprocessor.c file | ||
672 | ret = orig_cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func, blockSize, dynamicSMemSize); | ||
673 | return ret; | ||
674 | |||
675 | } | ||
676 | |||
677 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags)(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize, unsigned int flags) = NULL; | ||
678 | |||
679 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize, unsigned int flags) { | ||
680 | cudaError_t ret; | ||
681 | // Write your own custom c code in the cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.c file | ||
682 | ret = orig_cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, func, blockSize, dynamicSMemSize, flags); | ||
683 | return ret; | ||
684 | |||
685 | } | ||
686 | |||
687 | static __host__ cudaError_t CUDARTAPI (*orig_cudaConfigureCall)(dim3 gridDim, dim3 blockDim, size_t sharedMem , cudaStream_t stream ) = NULL; | ||
688 | |||
689 | __host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem , cudaStream_t stream ) { | ||
690 | cudaError_t ret; | ||
691 | pid_t my_tid = syscall(SYS_gettid); | ||
692 | // Write your own custom c code in the cudaConfigureCall.c file | ||
693 | // printf("cudaConfigureCall TID %d stream %p blocks %d threads %d\n", | ||
694 | schedConfCall(my_tid, (void *)stream, gridDim.x * gridDim.y, blockDim.x * blockDim.y); | ||
695 | ret = orig_cudaConfigureCall(gridDim, blockDim, sharedMem, stream); | ||
696 | return ret; | ||
697 | |||
698 | } | ||
699 | |||
700 | static __host__ cudaError_t CUDARTAPI (*orig_cudaSetupArgument)(const void *arg, size_t size, size_t offset) = NULL; | ||
701 | |||
702 | __host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg, size_t size, size_t offset) { | ||
703 | cudaError_t ret; | ||
704 | //pid_t my_tid = syscall(SYS_gettid); | ||
705 | // Write your own custom c code in the cudaSetupArgument.c file | ||
706 | // printf("cudaSetupArugment TID %d size %lu offset %lu\n", my_tid, size, offset); | ||
707 | ret = orig_cudaSetupArgument(arg, size, offset); | ||
708 | return ret; | ||
709 | |||
710 | } | ||
711 | |||
712 | static __host__ cudaError_t CUDARTAPI (*orig_cudaLaunch)(const void *func) = NULL; | ||
713 | |||
714 | __host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) { | ||
715 | cudaError_t ret; | ||
716 | pid_t my_tid = syscall(SYS_gettid); | ||
717 | //printf("cudaLaunch TID %d\n", my_tid); | ||
718 | schedLaunch(my_tid); | ||
719 | ret = orig_cudaLaunch(func); | ||
720 | return ret; | ||
721 | |||
722 | } | ||
723 | |||
724 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaMallocManaged)(void **devPtr, size_t size, unsigned int flags ) = NULL; | ||
725 | |||
726 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMallocManaged(void **devPtr, size_t size, unsigned int flags ) { | ||
727 | cudaError_t ret; | ||
728 | // Write your own custom c code in the cudaMallocManaged.c file | ||
729 | ret = orig_cudaMallocManaged(devPtr, size, flags); | ||
730 | return ret; | ||
731 | |||
732 | } | ||
733 | |||
734 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaMalloc)(void **devPtr, size_t size) = NULL; | ||
735 | |||
736 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size) { | ||
737 | cudaError_t ret; | ||
738 | // Write your own custom c code in the cudaMalloc.c file | ||
739 | ret = orig_cudaMalloc(devPtr, size); | ||
740 | return ret; | ||
741 | |||
742 | } | ||
743 | |||
744 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMallocHost)(void **ptr, size_t size) = NULL; | ||
745 | |||
746 | __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t size) { | ||
747 | cudaError_t ret; | ||
748 | // Write your own custom c code in the cudaMallocHost.c file | ||
749 | ret = orig_cudaMallocHost(ptr, size); | ||
750 | return ret; | ||
751 | |||
752 | } | ||
753 | |||
754 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMallocPitch)(void **devPtr, size_t *pitch, size_t width, size_t height) = NULL; | ||
755 | |||
756 | __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height) { | ||
757 | cudaError_t ret; | ||
758 | // Write your own custom c code in the cudaMallocPitch.c file | ||
759 | ret = orig_cudaMallocPitch(devPtr, pitch, width, height); | ||
760 | return ret; | ||
761 | |||
762 | } | ||
763 | |||
764 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMallocArray)(cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height , unsigned int flags ) = NULL; | ||
765 | |||
766 | __host__ cudaError_t CUDARTAPI cudaMallocArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height , unsigned int flags ) { | ||
767 | cudaError_t ret; | ||
768 | // Write your own custom c code in the cudaMallocArray.c file | ||
769 | ret = orig_cudaMallocArray(array, desc, width, height, flags); | ||
770 | return ret; | ||
771 | |||
772 | } | ||
773 | |||
774 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaFree)(void *devPtr) = NULL; | ||
775 | |||
776 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr) { | ||
777 | cudaError_t ret; | ||
778 | // Write your own custom c code in the cudaFree.c file | ||
779 | ret = orig_cudaFree(devPtr); | ||
780 | return ret; | ||
781 | |||
782 | } | ||
783 | |||
784 | static __host__ cudaError_t CUDARTAPI (*orig_cudaFreeHost)(void *ptr) = NULL; | ||
785 | |||
786 | __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr) { | ||
787 | cudaError_t ret; | ||
788 | // Write your own custom c code in the cudaFreeHost.c file | ||
789 | ret = orig_cudaFreeHost(ptr); | ||
790 | return ret; | ||
791 | |||
792 | } | ||
793 | |||
794 | static __host__ cudaError_t CUDARTAPI (*orig_cudaFreeArray)(cudaArray_t array) = NULL; | ||
795 | |||
796 | __host__ cudaError_t CUDARTAPI cudaFreeArray(cudaArray_t array) { | ||
797 | cudaError_t ret; | ||
798 | // Write your own custom c code in the cudaFreeArray.c file | ||
799 | ret = orig_cudaFreeArray(array); | ||
800 | return ret; | ||
801 | |||
802 | } | ||
803 | |||
804 | static __host__ cudaError_t CUDARTAPI (*orig_cudaFreeMipmappedArray)(cudaMipmappedArray_t mipmappedArray) = NULL; | ||
805 | |||
806 | __host__ cudaError_t CUDARTAPI cudaFreeMipmappedArray(cudaMipmappedArray_t mipmappedArray) { | ||
807 | cudaError_t ret; | ||
808 | // Write your own custom c code in the cudaFreeMipmappedArray.c file | ||
809 | ret = orig_cudaFreeMipmappedArray(mipmappedArray); | ||
810 | return ret; | ||
811 | |||
812 | } | ||
813 | |||
814 | static __host__ cudaError_t CUDARTAPI (*orig_cudaHostAlloc)(void **pHost, size_t size, unsigned int flags) = NULL; | ||
815 | |||
816 | __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t size, unsigned int flags) { | ||
817 | cudaError_t ret; | ||
818 | // Write your own custom c code in the cudaHostAlloc.c file | ||
819 | ret = orig_cudaHostAlloc(pHost, size, flags); | ||
820 | return ret; | ||
821 | |||
822 | } | ||
823 | |||
824 | static __host__ cudaError_t CUDARTAPI (*orig_cudaHostRegister)(void *ptr, size_t size, unsigned int flags) = NULL; | ||
825 | |||
826 | __host__ cudaError_t CUDARTAPI cudaHostRegister(void *ptr, size_t size, unsigned int flags) { | ||
827 | cudaError_t ret; | ||
828 | // Write your own custom c code in the cudaHostRegister.c file | ||
829 | ret = orig_cudaHostRegister(ptr, size, flags); | ||
830 | return ret; | ||
831 | |||
832 | } | ||
833 | |||
834 | static __host__ cudaError_t CUDARTAPI (*orig_cudaHostUnregister)(void *ptr) = NULL; | ||
835 | |||
836 | __host__ cudaError_t CUDARTAPI cudaHostUnregister(void *ptr) { | ||
837 | cudaError_t ret; | ||
838 | // Write your own custom c code in the cudaHostUnregister.c file | ||
839 | ret = orig_cudaHostUnregister(ptr); | ||
840 | return ret; | ||
841 | |||
842 | } | ||
843 | |||
844 | static __host__ cudaError_t CUDARTAPI (*orig_cudaHostGetDevicePointer)(void **pDevice, void *pHost, unsigned int flags) = NULL; | ||
845 | |||
846 | __host__ cudaError_t CUDARTAPI cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags) { | ||
847 | cudaError_t ret; | ||
848 | // Write your own custom c code in the cudaHostGetDevicePointer.c file | ||
849 | ret = orig_cudaHostGetDevicePointer(pDevice, pHost, flags); | ||
850 | return ret; | ||
851 | |||
852 | } | ||
853 | |||
854 | static __host__ cudaError_t CUDARTAPI (*orig_cudaHostGetFlags)(unsigned int *pFlags, void *pHost) = NULL; | ||
855 | |||
856 | __host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags, void *pHost) { | ||
857 | cudaError_t ret; | ||
858 | // Write your own custom c code in the cudaHostGetFlags.c file | ||
859 | ret = orig_cudaHostGetFlags(pFlags, pHost); | ||
860 | return ret; | ||
861 | |||
862 | } | ||
863 | |||
864 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMalloc3D)(struct cudaPitchedPtr* pitchedDevPtr, struct cudaExtent extent) = NULL; | ||
865 | |||
866 | __host__ cudaError_t CUDARTAPI cudaMalloc3D(struct cudaPitchedPtr* pitchedDevPtr, struct cudaExtent extent) { | ||
867 | cudaError_t ret; | ||
868 | // Write your own custom c code in the cudaMalloc3D.c file | ||
869 | ret = orig_cudaMalloc3D(pitchedDevPtr, extent); | ||
870 | return ret; | ||
871 | |||
872 | } | ||
873 | |||
874 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMalloc3DArray)(cudaArray_t *array, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int flags ) = NULL; | ||
875 | |||
876 | __host__ cudaError_t CUDARTAPI cudaMalloc3DArray(cudaArray_t *array, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int flags ) { | ||
877 | cudaError_t ret; | ||
878 | // Write your own custom c code in the cudaMalloc3DArray.c file | ||
879 | ret = orig_cudaMalloc3DArray(array, desc, extent, flags); | ||
880 | return ret; | ||
881 | |||
882 | } | ||
883 | |||
884 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMallocMipmappedArray)(cudaMipmappedArray_t *mipmappedArray, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int numLevels, unsigned int flags ) = NULL; | ||
885 | |||
886 | __host__ cudaError_t CUDARTAPI cudaMallocMipmappedArray(cudaMipmappedArray_t *mipmappedArray, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int numLevels, unsigned int flags ) { | ||
887 | cudaError_t ret; | ||
888 | // Write your own custom c code in the cudaMallocMipmappedArray.c file | ||
889 | ret = orig_cudaMallocMipmappedArray(mipmappedArray, desc, extent, numLevels, flags); | ||
890 | return ret; | ||
891 | |||
892 | } | ||
893 | |||
894 | static __host__ cudaError_t CUDARTAPI (*orig_cudaGetMipmappedArrayLevel)(cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray, unsigned int level) = NULL; | ||
895 | |||
896 | __host__ cudaError_t CUDARTAPI cudaGetMipmappedArrayLevel(cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray, unsigned int level) { | ||
897 | cudaError_t ret; | ||
898 | // Write your own custom c code in the cudaGetMipmappedArrayLevel.c file | ||
899 | ret = orig_cudaGetMipmappedArrayLevel(levelArray, mipmappedArray, level); | ||
900 | return ret; | ||
901 | |||
902 | } | ||
903 | |||
904 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy3D)(const struct cudaMemcpy3DParms *p) = NULL; | ||
905 | |||
906 | __host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3DParms *p) { | ||
907 | cudaError_t ret; | ||
908 | // Write your own custom c code in the cudaMemcpy3D.c file | ||
909 | ret = orig_cudaMemcpy3D(p); | ||
910 | return ret; | ||
911 | |||
912 | } | ||
913 | |||
914 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy3DPeer)(const struct cudaMemcpy3DPeerParms *p) = NULL; | ||
915 | |||
916 | __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p) { | ||
917 | cudaError_t ret; | ||
918 | // Write your own custom c code in the cudaMemcpy3DPeer.c file | ||
919 | ret = orig_cudaMemcpy3DPeer(p); | ||
920 | return ret; | ||
921 | |||
922 | } | ||
923 | |||
924 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaMemcpy3DAsync)(const struct cudaMemcpy3DParms *p, cudaStream_t stream ) = NULL; | ||
925 | |||
926 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream ) { | ||
927 | cudaError_t ret; | ||
928 | // Write your own custom c code in the cudaMemcpy3DAsync.c file | ||
929 | ret = orig_cudaMemcpy3DAsync(p, stream); | ||
930 | return ret; | ||
931 | |||
932 | } | ||
933 | |||
934 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy3DPeerAsync)(const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream ) = NULL; | ||
935 | |||
936 | __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync(const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream ) { | ||
937 | cudaError_t ret; | ||
938 | // Write your own custom c code in the cudaMemcpy3DPeerAsync.c file | ||
939 | ret = orig_cudaMemcpy3DPeerAsync(p, stream); | ||
940 | return ret; | ||
941 | |||
942 | } | ||
943 | |||
944 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemGetInfo)(size_t *free, size_t *total) = NULL; | ||
945 | |||
946 | __host__ cudaError_t CUDARTAPI cudaMemGetInfo(size_t *free, size_t *total) { | ||
947 | cudaError_t ret; | ||
948 | // Write your own custom c code in the cudaMemGetInfo.c file | ||
949 | ret = orig_cudaMemGetInfo(free, total); | ||
950 | return ret; | ||
951 | |||
952 | } | ||
953 | |||
954 | static __host__ cudaError_t CUDARTAPI (*orig_cudaArrayGetInfo)(struct cudaChannelFormatDesc *desc, struct cudaExtent *extent, unsigned int *flags, cudaArray_t array) = NULL; | ||
955 | |||
956 | __host__ cudaError_t CUDARTAPI cudaArrayGetInfo(struct cudaChannelFormatDesc *desc, struct cudaExtent *extent, unsigned int *flags, cudaArray_t array) { | ||
957 | cudaError_t ret; | ||
958 | // Write your own custom c code in the cudaArrayGetInfo.c file | ||
959 | ret = orig_cudaArrayGetInfo(desc, extent, flags, array); | ||
960 | return ret; | ||
961 | |||
962 | } | ||
963 | |||
964 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy)(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) = NULL; | ||
965 | |||
966 | __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) { | ||
967 | cudaError_t ret; | ||
968 | // Write your own custom c code in the cudaMemcpy.c file | ||
969 | printf("cudaMemcpy\n"); | ||
970 | ret = orig_cudaMemcpy(dst, src, count, kind); | ||
971 | return ret; | ||
972 | |||
973 | } | ||
974 | |||
975 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyPeer)(void *dst, int dstDevice, const void *src, int srcDevice, size_t count) = NULL; | ||
976 | |||
977 | __host__ cudaError_t CUDARTAPI cudaMemcpyPeer(void *dst, int dstDevice, const void *src, int srcDevice, size_t count) { | ||
978 | cudaError_t ret; | ||
979 | // Write your own custom c code in the cudaMemcpyPeer.c file | ||
980 | ret = orig_cudaMemcpyPeer(dst, dstDevice, src, srcDevice, count); | ||
981 | return ret; | ||
982 | |||
983 | } | ||
984 | |||
985 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyToArray)(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind) = NULL; | ||
986 | |||
987 | __host__ cudaError_t CUDARTAPI cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind) { | ||
988 | cudaError_t ret; | ||
989 | // Write your own custom c code in the cudaMemcpyToArray.c file | ||
990 | ret = orig_cudaMemcpyToArray(dst, wOffset, hOffset, src, count, kind); | ||
991 | return ret; | ||
992 | |||
993 | } | ||
994 | |||
995 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyFromArray)(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind) = NULL; | ||
996 | |||
997 | __host__ cudaError_t CUDARTAPI cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind) { | ||
998 | cudaError_t ret; | ||
999 | // Write your own custom c code in the cudaMemcpyFromArray.c file | ||
1000 | ret = orig_cudaMemcpyFromArray(dst, src, wOffset, hOffset, count, kind); | ||
1001 | return ret; | ||
1002 | |||
1003 | } | ||
1004 | |||
1005 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyArrayToArray)(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind ) = NULL; | ||
1006 | |||
1007 | __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind ) { | ||
1008 | cudaError_t ret; | ||
1009 | // Write your own custom c code in the cudaMemcpyArrayToArray.c file | ||
1010 | ret = orig_cudaMemcpyArrayToArray(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, count, kind); | ||
1011 | return ret; | ||
1012 | |||
1013 | } | ||
1014 | |||
1015 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy2D)(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind) = NULL; | ||
1016 | |||
1017 | __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind) { | ||
1018 | cudaError_t ret; | ||
1019 | // Write your own custom c code in the cudaMemcpy2D.c file | ||
1020 | ret = orig_cudaMemcpy2D(dst, dpitch, src, spitch, width, height, kind); | ||
1021 | return ret; | ||
1022 | |||
1023 | } | ||
1024 | |||
1025 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy2DToArray)(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind) = NULL; | ||
1026 | |||
1027 | __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind) { | ||
1028 | cudaError_t ret; | ||
1029 | // Write your own custom c code in the cudaMemcpy2DToArray.c file | ||
1030 | ret = orig_cudaMemcpy2DToArray(dst, wOffset, hOffset, src, spitch, width, height, kind); | ||
1031 | return ret; | ||
1032 | |||
1033 | } | ||
1034 | |||
1035 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy2DFromArray)(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind) = NULL; | ||
1036 | |||
1037 | __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind) { | ||
1038 | cudaError_t ret; | ||
1039 | // Write your own custom c code in the cudaMemcpy2DFromArray.c file | ||
1040 | ret = orig_cudaMemcpy2DFromArray(dst, dpitch, src, wOffset, hOffset, width, height, kind); | ||
1041 | return ret; | ||
1042 | |||
1043 | } | ||
1044 | |||
1045 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy2DArrayToArray)(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind ) = NULL; | ||
1046 | |||
1047 | __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind ) { | ||
1048 | cudaError_t ret; | ||
1049 | // Write your own custom c code in the cudaMemcpy2DArrayToArray.c file | ||
1050 | ret = orig_cudaMemcpy2DArrayToArray(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width, height, kind); | ||
1051 | return ret; | ||
1052 | |||
1053 | } | ||
1054 | |||
1055 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyToSymbol)(const void *symbol, const void *src, size_t count, size_t offset , enum cudaMemcpyKind kind ) = NULL; | ||
1056 | |||
1057 | __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(const void *symbol, const void *src, size_t count, size_t offset , enum cudaMemcpyKind kind ) { | ||
1058 | cudaError_t ret; | ||
1059 | // Write your own custom c code in the cudaMemcpyToSymbol.c file | ||
1060 | ret = orig_cudaMemcpyToSymbol(symbol, src, count, offset, kind); | ||
1061 | return ret; | ||
1062 | |||
1063 | } | ||
1064 | |||
1065 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyFromSymbol)(void *dst, const void *symbol, size_t count, size_t offset , enum cudaMemcpyKind kind ) = NULL; | ||
1066 | |||
1067 | __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(void *dst, const void *symbol, size_t count, size_t offset , enum cudaMemcpyKind kind ) { | ||
1068 | cudaError_t ret; | ||
1069 | // Write your own custom c code in the cudaMemcpyFromSymbol.c file | ||
1070 | ret = orig_cudaMemcpyFromSymbol(dst, symbol, count, offset, kind); | ||
1071 | return ret; | ||
1072 | |||
1073 | } | ||
1074 | |||
1075 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaMemcpyAsync)(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream ) = NULL; | ||
1076 | |||
1077 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream ) { | ||
1078 | cudaError_t ret; | ||
1079 | // Write your own custom c code in the cudaMemcpyAsync.c file | ||
1080 | printf("cudaMemcpyAsync stream %p\n", (void *)stream); | ||
1081 | ret = orig_cudaMemcpyAsync(dst, src, count, kind, stream); | ||
1082 | return ret; | ||
1083 | |||
1084 | } | ||
1085 | |||
1086 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyPeerAsync)(void *dst, int dstDevice, const void *src, int srcDevice, size_t count, cudaStream_t stream ) = NULL; | ||
1087 | |||
1088 | __host__ cudaError_t CUDARTAPI cudaMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice, size_t count, cudaStream_t stream ) { | ||
1089 | cudaError_t ret; | ||
1090 | // Write your own custom c code in the cudaMemcpyPeerAsync.c file | ||
1091 | ret = orig_cudaMemcpyPeerAsync(dst, dstDevice, src, srcDevice, count, stream); | ||
1092 | return ret; | ||
1093 | |||
1094 | } | ||
1095 | |||
1096 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyToArrayAsync)(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream ) = NULL; | ||
1097 | |||
1098 | __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream ) { | ||
1099 | cudaError_t ret; | ||
1100 | // Write your own custom c code in the cudaMemcpyToArrayAsync.c file | ||
1101 | ret = orig_cudaMemcpyToArrayAsync(dst, wOffset, hOffset, src, count, kind, stream); | ||
1102 | return ret; | ||
1103 | |||
1104 | } | ||
1105 | |||
1106 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyFromArrayAsync)(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream ) = NULL; | ||
1107 | |||
1108 | __host__ cudaError_t CUDARTAPI cudaMemcpyFromArrayAsync(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream ) { | ||
1109 | cudaError_t ret; | ||
1110 | // Write your own custom c code in the cudaMemcpyFromArrayAsync.c file | ||
1111 | ret = orig_cudaMemcpyFromArrayAsync(dst, src, wOffset, hOffset, count, kind, stream); | ||
1112 | return ret; | ||
1113 | |||
1114 | } | ||
1115 | |||
1116 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaMemcpy2DAsync)(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream ) = NULL; | ||
1117 | |||
1118 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream ) { | ||
1119 | cudaError_t ret; | ||
1120 | // Write your own custom c code in the cudaMemcpy2DAsync.c file | ||
1121 | ret = orig_cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream); | ||
1122 | return ret; | ||
1123 | |||
1124 | } | ||
1125 | |||
1126 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy2DToArrayAsync)(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream ) = NULL; | ||
1127 | |||
1128 | __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream ) { | ||
1129 | cudaError_t ret; | ||
1130 | // Write your own custom c code in the cudaMemcpy2DToArrayAsync.c file | ||
1131 | ret = orig_cudaMemcpy2DToArrayAsync(dst, wOffset, hOffset, src, spitch, width, height, kind, stream); | ||
1132 | return ret; | ||
1133 | |||
1134 | } | ||
1135 | |||
1136 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy2DFromArrayAsync)(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream ) = NULL; | ||
1137 | |||
1138 | __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream ) { | ||
1139 | cudaError_t ret; | ||
1140 | // Write your own custom c code in the cudaMemcpy2DFromArrayAsync.c file | ||
1141 | ret = orig_cudaMemcpy2DFromArrayAsync(dst, dpitch, src, wOffset, hOffset, width, height, kind, stream); | ||
1142 | return ret; | ||
1143 | |||
1144 | } | ||
1145 | |||
1146 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyToSymbolAsync)(const void *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream ) = NULL; | ||
1147 | |||
1148 | __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(const void *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream ) { | ||
1149 | cudaError_t ret; | ||
1150 | // Write your own custom c code in the cudaMemcpyToSymbolAsync.c file | ||
1151 | ret = orig_cudaMemcpyToSymbolAsync(symbol, src, count, offset, kind, stream); | ||
1152 | return ret; | ||
1153 | |||
1154 | } | ||
1155 | |||
1156 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyFromSymbolAsync)(void *dst, const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream ) = NULL; | ||
1157 | |||
1158 | __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(void *dst, const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream ) { | ||
1159 | cudaError_t ret; | ||
1160 | // Write your own custom c code in the cudaMemcpyFromSymbolAsync.c file | ||
1161 | ret = orig_cudaMemcpyFromSymbolAsync(dst, symbol, count, offset, kind, stream); | ||
1162 | return ret; | ||
1163 | |||
1164 | } | ||
1165 | |||
1166 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemset)(void *devPtr, int value, size_t count) = NULL; | ||
1167 | |||
1168 | __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value, size_t count) { | ||
1169 | cudaError_t ret; | ||
1170 | // Write your own custom c code in the cudaMemset.c file | ||
1171 | ret = orig_cudaMemset(devPtr, value, count); | ||
1172 | return ret; | ||
1173 | |||
1174 | } | ||
1175 | |||
1176 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemset2D)(void *devPtr, size_t pitch, int value, size_t width, size_t height) = NULL; | ||
1177 | |||
1178 | __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch, int value, size_t width, size_t height) { | ||
1179 | cudaError_t ret; | ||
1180 | // Write your own custom c code in the cudaMemset2D.c file | ||
1181 | ret = orig_cudaMemset2D(devPtr, pitch, value, width, height); | ||
1182 | return ret; | ||
1183 | |||
1184 | } | ||
1185 | |||
1186 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemset3D)(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent) = NULL; | ||
1187 | |||
1188 | __host__ cudaError_t CUDARTAPI cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent) { | ||
1189 | cudaError_t ret; | ||
1190 | // Write your own custom c code in the cudaMemset3D.c file | ||
1191 | ret = orig_cudaMemset3D(pitchedDevPtr, value, extent); | ||
1192 | return ret; | ||
1193 | |||
1194 | } | ||
1195 | |||
1196 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaMemsetAsync)(void *devPtr, int value, size_t count, cudaStream_t stream ) = NULL; | ||
1197 | |||
1198 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream ) { | ||
1199 | cudaError_t ret; | ||
1200 | // Write your own custom c code in the cudaMemsetAsync.c file | ||
1201 | ret = orig_cudaMemsetAsync(devPtr, value, count, stream); | ||
1202 | return ret; | ||
1203 | |||
1204 | } | ||
1205 | |||
1206 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaMemset2DAsync)(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream ) = NULL; | ||
1207 | |||
1208 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream ) { | ||
1209 | cudaError_t ret; | ||
1210 | // Write your own custom c code in the cudaMemset2DAsync.c file | ||
1211 | ret = orig_cudaMemset2DAsync(devPtr, pitch, value, width, height, stream); | ||
1212 | return ret; | ||
1213 | |||
1214 | } | ||
1215 | |||
1216 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaMemset3DAsync)(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream ) = NULL; | ||
1217 | |||
1218 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream ) { | ||
1219 | cudaError_t ret; | ||
1220 | // Write your own custom c code in the cudaMemset3DAsync.c file | ||
1221 | ret = orig_cudaMemset3DAsync(pitchedDevPtr, value, extent, stream); | ||
1222 | return ret; | ||
1223 | |||
1224 | } | ||
1225 | |||
1226 | static __host__ cudaError_t CUDARTAPI (*orig_cudaGetSymbolAddress)(void **devPtr, const void *symbol) = NULL; | ||
1227 | |||
1228 | __host__ cudaError_t CUDARTAPI cudaGetSymbolAddress(void **devPtr, const void *symbol) { | ||
1229 | cudaError_t ret; | ||
1230 | // Write your own custom c code in the cudaGetSymbolAddress.c file | ||
1231 | ret = orig_cudaGetSymbolAddress(devPtr, symbol); | ||
1232 | return ret; | ||
1233 | |||
1234 | } | ||
1235 | |||
1236 | static __host__ cudaError_t CUDARTAPI (*orig_cudaGetSymbolSize)(size_t *size, const void *symbol) = NULL; | ||
1237 | |||
1238 | __host__ cudaError_t CUDARTAPI cudaGetSymbolSize(size_t *size, const void *symbol) { | ||
1239 | cudaError_t ret; | ||
1240 | // Write your own custom c code in the cudaGetSymbolSize.c file | ||
1241 | ret = orig_cudaGetSymbolSize(size, symbol); | ||
1242 | return ret; | ||
1243 | |||
1244 | } | ||
1245 | |||
1246 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemPrefetchAsync)(const void *devPtr, size_t count, int dstDevice, cudaStream_t stream ) = NULL; | ||
1247 | |||
1248 | __host__ cudaError_t CUDARTAPI cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice, cudaStream_t stream ) { | ||
1249 | cudaError_t ret; | ||
1250 | // Write your own custom c code in the cudaMemPrefetchAsync.c file | ||
1251 | ret = orig_cudaMemPrefetchAsync(devPtr, count, dstDevice, stream); | ||
1252 | return ret; | ||
1253 | |||
1254 | } | ||
1255 | |||
1256 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemAdvise)(const void *devPtr, size_t count, enum cudaMemoryAdvise advice, int device) = NULL; | ||
1257 | |||
1258 | __host__ cudaError_t CUDARTAPI cudaMemAdvise(const void *devPtr, size_t count, enum cudaMemoryAdvise advice, int device) { | ||
1259 | cudaError_t ret; | ||
1260 | // Write your own custom c code in the cudaMemAdvise.c file | ||
1261 | ret = orig_cudaMemAdvise(devPtr, count, advice, device); | ||
1262 | return ret; | ||
1263 | |||
1264 | } | ||
1265 | |||
1266 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemRangeGetAttribute)(void *data, size_t dataSize, enum cudaMemRangeAttribute attribute, const void *devPtr, size_t count) = NULL; | ||
1267 | |||
1268 | __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttribute(void *data, size_t dataSize, enum cudaMemRangeAttribute attribute, const void *devPtr, size_t count) { | ||
1269 | cudaError_t ret; | ||
1270 | // Write your own custom c code in the cudaMemRangeGetAttribute.c file | ||
1271 | ret = orig_cudaMemRangeGetAttribute(data, dataSize, attribute, devPtr, count); | ||
1272 | return ret; | ||
1273 | |||
1274 | } | ||
1275 | |||
1276 | static __host__ cudaError_t CUDARTAPI (*orig_cudaMemRangeGetAttributes)(void **data, size_t *dataSizes, enum cudaMemRangeAttribute *attributes, size_t numAttributes, const void *devPtr, size_t count) = NULL; | ||
1277 | |||
1278 | __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttributes(void **data, size_t *dataSizes, enum cudaMemRangeAttribute *attributes, size_t numAttributes, const void *devPtr, size_t count) { | ||
1279 | cudaError_t ret; | ||
1280 | // Write your own custom c code in the cudaMemRangeGetAttributes.c file | ||
1281 | ret = orig_cudaMemRangeGetAttributes(data, dataSizes, attributes, numAttributes, devPtr, count); | ||
1282 | return ret; | ||
1283 | |||
1284 | } | ||
1285 | |||
1286 | static __host__ cudaError_t CUDARTAPI (*orig_cudaPointerGetAttributes)(struct cudaPointerAttributes *attributes, const void *ptr) = NULL; | ||
1287 | |||
1288 | __host__ cudaError_t CUDARTAPI cudaPointerGetAttributes(struct cudaPointerAttributes *attributes, const void *ptr) { | ||
1289 | cudaError_t ret; | ||
1290 | // Write your own custom c code in the cudaPointerGetAttributes.c file | ||
1291 | ret = orig_cudaPointerGetAttributes(attributes, ptr); | ||
1292 | return ret; | ||
1293 | |||
1294 | } | ||
1295 | |||
1296 | static __host__ cudaError_t CUDARTAPI (*orig_cudaDeviceCanAccessPeer)(int *canAccessPeer, int device, int peerDevice) = NULL; | ||
1297 | |||
1298 | __host__ cudaError_t CUDARTAPI cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice) { | ||
1299 | cudaError_t ret; | ||
1300 | // Write your own custom c code in the cudaDeviceCanAccessPeer.c file | ||
1301 | ret = orig_cudaDeviceCanAccessPeer(canAccessPeer, device, peerDevice); | ||
1302 | return ret; | ||
1303 | |||
1304 | } | ||
1305 | |||
1306 | static __host__ cudaError_t CUDARTAPI (*orig_cudaDeviceEnablePeerAccess)(int peerDevice, unsigned int flags) = NULL; | ||
1307 | |||
1308 | __host__ cudaError_t CUDARTAPI cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags) { | ||
1309 | cudaError_t ret; | ||
1310 | // Write your own custom c code in the cudaDeviceEnablePeerAccess.c file | ||
1311 | ret = orig_cudaDeviceEnablePeerAccess(peerDevice, flags); | ||
1312 | return ret; | ||
1313 | |||
1314 | } | ||
1315 | |||
1316 | static __host__ cudaError_t CUDARTAPI (*orig_cudaDeviceDisablePeerAccess)(int peerDevice) = NULL; | ||
1317 | |||
1318 | __host__ cudaError_t CUDARTAPI cudaDeviceDisablePeerAccess(int peerDevice) { | ||
1319 | cudaError_t ret; | ||
1320 | // Write your own custom c code in the cudaDeviceDisablePeerAccess.c file | ||
1321 | ret = orig_cudaDeviceDisablePeerAccess(peerDevice); | ||
1322 | return ret; | ||
1323 | |||
1324 | } | ||
1325 | |||
1326 | static __host__ cudaError_t CUDARTAPI (*orig_cudaGraphicsUnregisterResource)(cudaGraphicsResource_t resource) = NULL; | ||
1327 | |||
1328 | __host__ cudaError_t CUDARTAPI cudaGraphicsUnregisterResource(cudaGraphicsResource_t resource) { | ||
1329 | cudaError_t ret; | ||
1330 | // Write your own custom c code in the cudaGraphicsUnregisterResource.c file | ||
1331 | ret = orig_cudaGraphicsUnregisterResource(resource); | ||
1332 | return ret; | ||
1333 | |||
1334 | } | ||
1335 | |||
1336 | static __host__ cudaError_t CUDARTAPI (*orig_cudaGraphicsResourceSetMapFlags)(cudaGraphicsResource_t resource, unsigned int flags) = NULL; | ||
1337 | |||
1338 | __host__ cudaError_t CUDARTAPI cudaGraphicsResourceSetMapFlags(cudaGraphicsResource_t resource, unsigned int flags) { | ||
1339 | cudaError_t ret; | ||
1340 | // Write your own custom c code in the cudaGraphicsResourceSetMapFlags.c file | ||
1341 | ret = orig_cudaGraphicsResourceSetMapFlags(resource, flags); | ||
1342 | return ret; | ||
1343 | |||
1344 | } | ||
1345 | |||
1346 | static __host__ cudaError_t CUDARTAPI (*orig_cudaGraphicsMapResources)(int count, cudaGraphicsResource_t *resources, cudaStream_t stream ) = NULL; | ||
1347 | |||
1348 | __host__ cudaError_t CUDARTAPI cudaGraphicsMapResources(int count, cudaGraphicsResource_t *resources, cudaStream_t stream ) { | ||
1349 | cudaError_t ret; | ||
1350 | // Write your own custom c code in the cudaGraphicsMapResources.c file | ||
1351 | ret = orig_cudaGraphicsMapResources(count, resources, stream); | ||
1352 | return ret; | ||
1353 | |||
1354 | } | ||
1355 | |||
1356 | static __host__ cudaError_t CUDARTAPI (*orig_cudaGraphicsUnmapResources)(int count, cudaGraphicsResource_t *resources, cudaStream_t stream ) = NULL; | ||
1357 | |||
1358 | __host__ cudaError_t CUDARTAPI cudaGraphicsUnmapResources(int count, cudaGraphicsResource_t *resources, cudaStream_t stream ) { | ||
1359 | cudaError_t ret; | ||
1360 | // Write your own custom c code in the cudaGraphicsUnmapResources.c file | ||
1361 | ret = orig_cudaGraphicsUnmapResources(count, resources, stream); | ||
1362 | return ret; | ||
1363 | |||
1364 | } | ||
1365 | |||
1366 | static __host__ cudaError_t CUDARTAPI (*orig_cudaGraphicsResourceGetMappedPointer)(void **devPtr, size_t *size, cudaGraphicsResource_t resource) = NULL; | ||
1367 | |||
1368 | __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedPointer(void **devPtr, size_t *size, cudaGraphicsResource_t resource) { | ||
1369 | cudaError_t ret; | ||
1370 | // Write your own custom c code in the cudaGraphicsResourceGetMappedPointer.c file | ||
1371 | ret = orig_cudaGraphicsResourceGetMappedPointer(devPtr, size, resource); | ||
1372 | return ret; | ||
1373 | |||
1374 | } | ||
1375 | |||
1376 | static __host__ cudaError_t CUDARTAPI (*orig_cudaGraphicsSubResourceGetMappedArray)(cudaArray_t *array, cudaGraphicsResource_t resource, unsigned int arrayIndex, unsigned int mipLevel) = NULL; | ||
1377 | |||
1378 | __host__ cudaError_t CUDARTAPI cudaGraphicsSubResourceGetMappedArray(cudaArray_t *array, cudaGraphicsResource_t resource, unsigned int arrayIndex, unsigned int mipLevel) { | ||
1379 | cudaError_t ret; | ||
1380 | // Write your own custom c code in the cudaGraphicsSubResourceGetMappedArray.c file | ||
1381 | ret = orig_cudaGraphicsSubResourceGetMappedArray(array, resource, arrayIndex, mipLevel); | ||
1382 | return ret; | ||
1383 | |||
1384 | } | ||
1385 | |||
1386 | static __host__ cudaError_t CUDARTAPI (*orig_cudaGraphicsResourceGetMappedMipmappedArray)(cudaMipmappedArray_t *mipmappedArray, cudaGraphicsResource_t resource) = NULL; | ||
1387 | |||
1388 | __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedMipmappedArray(cudaMipmappedArray_t *mipmappedArray, cudaGraphicsResource_t resource) { | ||
1389 | cudaError_t ret; | ||
1390 | // Write your own custom c code in the cudaGraphicsResourceGetMappedMipmappedArray.c file | ||
1391 | ret = orig_cudaGraphicsResourceGetMappedMipmappedArray(mipmappedArray, resource); | ||
1392 | return ret; | ||
1393 | |||
1394 | } | ||
1395 | |||
1396 | static __host__ cudaError_t CUDARTAPI (*orig_cudaGetChannelDesc)(struct cudaChannelFormatDesc *desc, cudaArray_const_t array) = NULL; | ||
1397 | |||
1398 | __host__ cudaError_t CUDARTAPI cudaGetChannelDesc(struct cudaChannelFormatDesc *desc, cudaArray_const_t array) { | ||
1399 | cudaError_t ret; | ||
1400 | // Write your own custom c code in the cudaGetChannelDesc.c file | ||
1401 | ret = orig_cudaGetChannelDesc(desc, array); | ||
1402 | return ret; | ||
1403 | |||
1404 | } | ||
1405 | |||
1406 | static __host__ struct cudaChannelFormatDesc CUDARTAPI (*orig_cudaCreateChannelDesc)(int x, int y, int z, int w, enum cudaChannelFormatKind f) = NULL; | ||
1407 | |||
1408 | __host__ struct cudaChannelFormatDesc CUDARTAPI cudaCreateChannelDesc(int x, int y, int z, int w, enum cudaChannelFormatKind f) { | ||
1409 | struct cudaChannelFormatDesc ret; | ||
1410 | // Write your own custom c code in the cudaCreateChannelDesc.c file | ||
1411 | ret = orig_cudaCreateChannelDesc(x, y, z, w, f); | ||
1412 | return ret; | ||
1413 | |||
1414 | } | ||
1415 | |||
1416 | static __host__ cudaError_t CUDARTAPI (*orig_cudaBindTexture)(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t size ) = NULL; | ||
1417 | |||
1418 | __host__ cudaError_t CUDARTAPI cudaBindTexture(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t size ) { | ||
1419 | cudaError_t ret; | ||
1420 | // Write your own custom c code in the cudaBindTexture.c file | ||
1421 | ret = orig_cudaBindTexture(offset, texref, devPtr, desc, size); | ||
1422 | return ret; | ||
1423 | |||
1424 | } | ||
1425 | |||
1426 | static __host__ cudaError_t CUDARTAPI (*orig_cudaBindTexture2D)(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t width, size_t height, size_t pitch) = NULL; | ||
1427 | |||
1428 | __host__ cudaError_t CUDARTAPI cudaBindTexture2D(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t width, size_t height, size_t pitch) { | ||
1429 | cudaError_t ret; | ||
1430 | // Write your own custom c code in the cudaBindTexture2D.c file | ||
1431 | ret = orig_cudaBindTexture2D(offset, texref, devPtr, desc, width, height, pitch); | ||
1432 | return ret; | ||
1433 | |||
1434 | } | ||
1435 | |||
1436 | static __host__ cudaError_t CUDARTAPI (*orig_cudaBindTextureToArray)(const struct textureReference *texref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc) = NULL; | ||
1437 | |||
1438 | __host__ cudaError_t CUDARTAPI cudaBindTextureToArray(const struct textureReference *texref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc) { | ||
1439 | cudaError_t ret; | ||
1440 | // Write your own custom c code in the cudaBindTextureToArray.c file | ||
1441 | ret = orig_cudaBindTextureToArray(texref, array, desc); | ||
1442 | return ret; | ||
1443 | |||
1444 | } | ||
1445 | |||
1446 | static __host__ cudaError_t CUDARTAPI (*orig_cudaBindTextureToMipmappedArray)(const struct textureReference *texref, cudaMipmappedArray_const_t mipmappedArray, const struct cudaChannelFormatDesc *desc) = NULL; | ||
1447 | |||
1448 | __host__ cudaError_t CUDARTAPI cudaBindTextureToMipmappedArray(const struct textureReference *texref, cudaMipmappedArray_const_t mipmappedArray, const struct cudaChannelFormatDesc *desc) { | ||
1449 | cudaError_t ret; | ||
1450 | // Write your own custom c code in the cudaBindTextureToMipmappedArray.c file | ||
1451 | ret = orig_cudaBindTextureToMipmappedArray(texref, mipmappedArray, desc); | ||
1452 | return ret; | ||
1453 | |||
1454 | } | ||
1455 | |||
1456 | static __host__ cudaError_t CUDARTAPI (*orig_cudaUnbindTexture)(const struct textureReference *texref) = NULL; | ||
1457 | |||
1458 | __host__ cudaError_t CUDARTAPI cudaUnbindTexture(const struct textureReference *texref) { | ||
1459 | cudaError_t ret; | ||
1460 | // Write your own custom c code in the cudaUnbindTexture.c file | ||
1461 | ret = orig_cudaUnbindTexture(texref); | ||
1462 | return ret; | ||
1463 | |||
1464 | } | ||
1465 | |||
1466 | static __host__ cudaError_t CUDARTAPI (*orig_cudaGetTextureAlignmentOffset)(size_t *offset, const struct textureReference *texref) = NULL; | ||
1467 | |||
1468 | __host__ cudaError_t CUDARTAPI cudaGetTextureAlignmentOffset(size_t *offset, const struct textureReference *texref) { | ||
1469 | cudaError_t ret; | ||
1470 | // Write your own custom c code in the cudaGetTextureAlignmentOffset.c file | ||
1471 | ret = orig_cudaGetTextureAlignmentOffset(offset, texref); | ||
1472 | return ret; | ||
1473 | |||
1474 | } | ||
1475 | |||
1476 | static __host__ cudaError_t CUDARTAPI (*orig_cudaGetTextureReference)(const struct textureReference **texref, const void *symbol) = NULL; | ||
1477 | |||
1478 | __host__ cudaError_t CUDARTAPI cudaGetTextureReference(const struct textureReference **texref, const void *symbol) { | ||
1479 | cudaError_t ret; | ||
1480 | // Write your own custom c code in the cudaGetTextureReference.c file | ||
1481 | ret = orig_cudaGetTextureReference(texref, symbol); | ||
1482 | return ret; | ||
1483 | |||
1484 | } | ||
1485 | |||
1486 | static __host__ cudaError_t CUDARTAPI (*orig_cudaBindSurfaceToArray)(const struct surfaceReference *surfref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc) = NULL; | ||
1487 | |||
1488 | __host__ cudaError_t CUDARTAPI cudaBindSurfaceToArray(const struct surfaceReference *surfref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc) { | ||
1489 | cudaError_t ret; | ||
1490 | // Write your own custom c code in the cudaBindSurfaceToArray.c file | ||
1491 | ret = orig_cudaBindSurfaceToArray(surfref, array, desc); | ||
1492 | return ret; | ||
1493 | |||
1494 | } | ||
1495 | |||
1496 | static __host__ cudaError_t CUDARTAPI (*orig_cudaGetSurfaceReference)(const struct surfaceReference **surfref, const void *symbol) = NULL; | ||
1497 | |||
1498 | __host__ cudaError_t CUDARTAPI cudaGetSurfaceReference(const struct surfaceReference **surfref, const void *symbol) { | ||
1499 | cudaError_t ret; | ||
1500 | // Write your own custom c code in the cudaGetSurfaceReference.c file | ||
1501 | ret = orig_cudaGetSurfaceReference(surfref, symbol); | ||
1502 | return ret; | ||
1503 | |||
1504 | } | ||
1505 | |||
1506 | static __host__ cudaError_t CUDARTAPI (*orig_cudaCreateTextureObject)(cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc, const struct cudaTextureDesc *pTexDesc, const struct cudaResourceViewDesc *pResViewDesc) = NULL; | ||
1507 | |||
1508 | __host__ cudaError_t CUDARTAPI cudaCreateTextureObject(cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc, const struct cudaTextureDesc *pTexDesc, const struct cudaResourceViewDesc *pResViewDesc) { | ||
1509 | cudaError_t ret; | ||
1510 | // Write your own custom c code in the cudaCreateTextureObject.c file | ||
1511 | ret = orig_cudaCreateTextureObject(pTexObject, pResDesc, pTexDesc, pResViewDesc); | ||
1512 | return ret; | ||
1513 | |||
1514 | } | ||
1515 | |||
1516 | static __host__ cudaError_t CUDARTAPI (*orig_cudaDestroyTextureObject)(cudaTextureObject_t texObject) = NULL; | ||
1517 | |||
1518 | __host__ cudaError_t CUDARTAPI cudaDestroyTextureObject(cudaTextureObject_t texObject) { | ||
1519 | cudaError_t ret; | ||
1520 | // Write your own custom c code in the cudaDestroyTextureObject.c file | ||
1521 | ret = orig_cudaDestroyTextureObject(texObject); | ||
1522 | return ret; | ||
1523 | |||
1524 | } | ||
1525 | |||
1526 | static __host__ cudaError_t CUDARTAPI (*orig_cudaGetTextureObjectResourceDesc)(struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject) = NULL; | ||
1527 | |||
1528 | __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceDesc(struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject) { | ||
1529 | cudaError_t ret; | ||
1530 | // Write your own custom c code in the cudaGetTextureObjectResourceDesc.c file | ||
1531 | ret = orig_cudaGetTextureObjectResourceDesc(pResDesc, texObject); | ||
1532 | return ret; | ||
1533 | |||
1534 | } | ||
1535 | |||
1536 | static __host__ cudaError_t CUDARTAPI (*orig_cudaGetTextureObjectTextureDesc)(struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject) = NULL; | ||
1537 | |||
1538 | __host__ cudaError_t CUDARTAPI cudaGetTextureObjectTextureDesc(struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject) { | ||
1539 | cudaError_t ret; | ||
1540 | // Write your own custom c code in the cudaGetTextureObjectTextureDesc.c file | ||
1541 | ret = orig_cudaGetTextureObjectTextureDesc(pTexDesc, texObject); | ||
1542 | return ret; | ||
1543 | |||
1544 | } | ||
1545 | |||
1546 | static __host__ cudaError_t CUDARTAPI (*orig_cudaGetTextureObjectResourceViewDesc)(struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject) = NULL; | ||
1547 | |||
1548 | __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceViewDesc(struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject) { | ||
1549 | cudaError_t ret; | ||
1550 | // Write your own custom c code in the cudaGetTextureObjectResourceViewDesc.c file | ||
1551 | ret = orig_cudaGetTextureObjectResourceViewDesc(pResViewDesc, texObject); | ||
1552 | return ret; | ||
1553 | |||
1554 | } | ||
1555 | |||
1556 | static __host__ cudaError_t CUDARTAPI (*orig_cudaCreateSurfaceObject)(cudaSurfaceObject_t *pSurfObject, const struct cudaResourceDesc *pResDesc) = NULL; | ||
1557 | |||
1558 | __host__ cudaError_t CUDARTAPI cudaCreateSurfaceObject(cudaSurfaceObject_t *pSurfObject, const struct cudaResourceDesc *pResDesc) { | ||
1559 | cudaError_t ret; | ||
1560 | // Write your own custom c code in the cudaCreateSurfaceObject.c file | ||
1561 | ret = orig_cudaCreateSurfaceObject(pSurfObject, pResDesc); | ||
1562 | return ret; | ||
1563 | |||
1564 | } | ||
1565 | |||
1566 | static __host__ cudaError_t CUDARTAPI (*orig_cudaDestroySurfaceObject)(cudaSurfaceObject_t surfObject) = NULL; | ||
1567 | |||
1568 | __host__ cudaError_t CUDARTAPI cudaDestroySurfaceObject(cudaSurfaceObject_t surfObject) { | ||
1569 | cudaError_t ret; | ||
1570 | // Write your own custom c code in the cudaDestroySurfaceObject.c file | ||
1571 | ret = orig_cudaDestroySurfaceObject(surfObject); | ||
1572 | return ret; | ||
1573 | |||
1574 | } | ||
1575 | |||
1576 | static __host__ cudaError_t CUDARTAPI (*orig_cudaGetSurfaceObjectResourceDesc)(struct cudaResourceDesc *pResDesc, cudaSurfaceObject_t surfObject) = NULL; | ||
1577 | |||
1578 | __host__ cudaError_t CUDARTAPI cudaGetSurfaceObjectResourceDesc(struct cudaResourceDesc *pResDesc, cudaSurfaceObject_t surfObject) { | ||
1579 | cudaError_t ret; | ||
1580 | // Write your own custom c code in the cudaGetSurfaceObjectResourceDesc.c file | ||
1581 | ret = orig_cudaGetSurfaceObjectResourceDesc(pResDesc, surfObject); | ||
1582 | return ret; | ||
1583 | |||
1584 | } | ||
1585 | |||
1586 | static __host__ cudaError_t CUDARTAPI (*orig_cudaDriverGetVersion)(int *driverVersion) = NULL; | ||
1587 | |||
1588 | __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion) { | ||
1589 | cudaError_t ret; | ||
1590 | // Write your own custom c code in the cudaDriverGetVersion.c file | ||
1591 | ret = orig_cudaDriverGetVersion(driverVersion); | ||
1592 | return ret; | ||
1593 | |||
1594 | } | ||
1595 | |||
1596 | static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaRuntimeGetVersion)(int *runtimeVersion) = NULL; | ||
1597 | |||
1598 | __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion) { | ||
1599 | cudaError_t ret; | ||
1600 | // Write your own custom c code in the cudaRuntimeGetVersion.c file | ||
1601 | ret = orig_cudaRuntimeGetVersion(runtimeVersion); | ||
1602 | return ret; | ||
1603 | |||
1604 | } | ||
1605 | |||
1606 | static __host__ cudaError_t CUDARTAPI (*orig_cudaGetExportTable)(const void **ppExportTable, const cudaUUID_t *pExportTableId) = NULL; | ||
1607 | |||
1608 | __host__ cudaError_t CUDARTAPI cudaGetExportTable(const void **ppExportTable, const cudaUUID_t *pExportTableId) { | ||
1609 | cudaError_t ret; | ||
1610 | // Write your own custom c code in the cudaGetExportTable.c file | ||
1611 | ret = orig_cudaGetExportTable(ppExportTable, pExportTableId); | ||
1612 | return ret; | ||
1613 | |||
1614 | } | ||
1615 | __attribute__((constructor)) static void init() { | ||
1616 | char *dl_error; | ||
1617 | // clear dl error | ||
1618 | dlerror(); | ||
1619 | if (orig_cudaDeviceReset == NULL) { | ||
1620 | orig_cudaDeviceReset = dlsym(RTLD_NEXT, "cudaDeviceReset"); | ||
1621 | } | ||
1622 | if ((dl_error = dlerror()) != NULL) | ||
1623 | { | ||
1624 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1625 | } | ||
1626 | |||
1627 | |||
1628 | // clear dl error | ||
1629 | dlerror(); | ||
1630 | if (orig_cudaDeviceSynchronize == NULL) { | ||
1631 | orig_cudaDeviceSynchronize = dlsym(RTLD_NEXT, "cudaDeviceSynchronize"); | ||
1632 | } | ||
1633 | if ((dl_error = dlerror()) != NULL) | ||
1634 | { | ||
1635 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1636 | } | ||
1637 | |||
1638 | |||
1639 | // clear dl error | ||
1640 | dlerror(); | ||
1641 | if (orig_cudaDeviceSetLimit == NULL) { | ||
1642 | orig_cudaDeviceSetLimit = dlsym(RTLD_NEXT, "cudaDeviceSetLimit"); | ||
1643 | } | ||
1644 | if ((dl_error = dlerror()) != NULL) | ||
1645 | { | ||
1646 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1647 | } | ||
1648 | |||
1649 | |||
1650 | // clear dl error | ||
1651 | dlerror(); | ||
1652 | if (orig_cudaDeviceGetLimit == NULL) { | ||
1653 | orig_cudaDeviceGetLimit = dlsym(RTLD_NEXT, "cudaDeviceGetLimit"); | ||
1654 | } | ||
1655 | if ((dl_error = dlerror()) != NULL) | ||
1656 | { | ||
1657 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1658 | } | ||
1659 | |||
1660 | |||
1661 | // clear dl error | ||
1662 | dlerror(); | ||
1663 | if (orig_cudaDeviceGetCacheConfig == NULL) { | ||
1664 | orig_cudaDeviceGetCacheConfig = dlsym(RTLD_NEXT, "cudaDeviceGetCacheConfig"); | ||
1665 | } | ||
1666 | if ((dl_error = dlerror()) != NULL) | ||
1667 | { | ||
1668 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1669 | } | ||
1670 | |||
1671 | |||
1672 | // clear dl error | ||
1673 | dlerror(); | ||
1674 | if (orig_cudaDeviceGetStreamPriorityRange == NULL) { | ||
1675 | orig_cudaDeviceGetStreamPriorityRange = dlsym(RTLD_NEXT, "cudaDeviceGetStreamPriorityRange"); | ||
1676 | } | ||
1677 | if ((dl_error = dlerror()) != NULL) | ||
1678 | { | ||
1679 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1680 | } | ||
1681 | |||
1682 | |||
1683 | // clear dl error | ||
1684 | dlerror(); | ||
1685 | if (orig_cudaDeviceSetCacheConfig == NULL) { | ||
1686 | orig_cudaDeviceSetCacheConfig = dlsym(RTLD_NEXT, "cudaDeviceSetCacheConfig"); | ||
1687 | } | ||
1688 | if ((dl_error = dlerror()) != NULL) | ||
1689 | { | ||
1690 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1691 | } | ||
1692 | |||
1693 | |||
1694 | // clear dl error | ||
1695 | dlerror(); | ||
1696 | if (orig_cudaDeviceGetSharedMemConfig == NULL) { | ||
1697 | orig_cudaDeviceGetSharedMemConfig = dlsym(RTLD_NEXT, "cudaDeviceGetSharedMemConfig"); | ||
1698 | } | ||
1699 | if ((dl_error = dlerror()) != NULL) | ||
1700 | { | ||
1701 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1702 | } | ||
1703 | |||
1704 | |||
1705 | // clear dl error | ||
1706 | dlerror(); | ||
1707 | if (orig_cudaDeviceSetSharedMemConfig == NULL) { | ||
1708 | orig_cudaDeviceSetSharedMemConfig = dlsym(RTLD_NEXT, "cudaDeviceSetSharedMemConfig"); | ||
1709 | } | ||
1710 | if ((dl_error = dlerror()) != NULL) | ||
1711 | { | ||
1712 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1713 | } | ||
1714 | |||
1715 | |||
1716 | // clear dl error | ||
1717 | dlerror(); | ||
1718 | if (orig_cudaDeviceGetByPCIBusId == NULL) { | ||
1719 | orig_cudaDeviceGetByPCIBusId = dlsym(RTLD_NEXT, "cudaDeviceGetByPCIBusId"); | ||
1720 | } | ||
1721 | if ((dl_error = dlerror()) != NULL) | ||
1722 | { | ||
1723 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1724 | } | ||
1725 | |||
1726 | |||
1727 | // clear dl error | ||
1728 | dlerror(); | ||
1729 | if (orig_cudaDeviceGetPCIBusId == NULL) { | ||
1730 | orig_cudaDeviceGetPCIBusId = dlsym(RTLD_NEXT, "cudaDeviceGetPCIBusId"); | ||
1731 | } | ||
1732 | if ((dl_error = dlerror()) != NULL) | ||
1733 | { | ||
1734 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1735 | } | ||
1736 | |||
1737 | |||
1738 | // clear dl error | ||
1739 | dlerror(); | ||
1740 | if (orig_cudaIpcGetEventHandle == NULL) { | ||
1741 | orig_cudaIpcGetEventHandle = dlsym(RTLD_NEXT, "cudaIpcGetEventHandle"); | ||
1742 | } | ||
1743 | if ((dl_error = dlerror()) != NULL) | ||
1744 | { | ||
1745 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1746 | } | ||
1747 | |||
1748 | |||
1749 | // clear dl error | ||
1750 | dlerror(); | ||
1751 | if (orig_cudaIpcOpenEventHandle == NULL) { | ||
1752 | orig_cudaIpcOpenEventHandle = dlsym(RTLD_NEXT, "cudaIpcOpenEventHandle"); | ||
1753 | } | ||
1754 | if ((dl_error = dlerror()) != NULL) | ||
1755 | { | ||
1756 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1757 | } | ||
1758 | |||
1759 | |||
1760 | // clear dl error | ||
1761 | dlerror(); | ||
1762 | if (orig_cudaIpcGetMemHandle == NULL) { | ||
1763 | orig_cudaIpcGetMemHandle = dlsym(RTLD_NEXT, "cudaIpcGetMemHandle"); | ||
1764 | } | ||
1765 | if ((dl_error = dlerror()) != NULL) | ||
1766 | { | ||
1767 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1768 | } | ||
1769 | |||
1770 | |||
1771 | // clear dl error | ||
1772 | dlerror(); | ||
1773 | if (orig_cudaIpcOpenMemHandle == NULL) { | ||
1774 | orig_cudaIpcOpenMemHandle = dlsym(RTLD_NEXT, "cudaIpcOpenMemHandle"); | ||
1775 | } | ||
1776 | if ((dl_error = dlerror()) != NULL) | ||
1777 | { | ||
1778 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1779 | } | ||
1780 | |||
1781 | |||
1782 | // clear dl error | ||
1783 | dlerror(); | ||
1784 | if (orig_cudaIpcCloseMemHandle == NULL) { | ||
1785 | orig_cudaIpcCloseMemHandle = dlsym(RTLD_NEXT, "cudaIpcCloseMemHandle"); | ||
1786 | } | ||
1787 | if ((dl_error = dlerror()) != NULL) | ||
1788 | { | ||
1789 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1790 | } | ||
1791 | |||
1792 | |||
1793 | // clear dl error | ||
1794 | dlerror(); | ||
1795 | if (orig_cudaThreadExit == NULL) { | ||
1796 | orig_cudaThreadExit = dlsym(RTLD_NEXT, "cudaThreadExit"); | ||
1797 | } | ||
1798 | if ((dl_error = dlerror()) != NULL) | ||
1799 | { | ||
1800 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1801 | } | ||
1802 | |||
1803 | |||
1804 | // clear dl error | ||
1805 | dlerror(); | ||
1806 | if (orig_cudaThreadSynchronize == NULL) { | ||
1807 | orig_cudaThreadSynchronize = dlsym(RTLD_NEXT, "cudaThreadSynchronize"); | ||
1808 | } | ||
1809 | if ((dl_error = dlerror()) != NULL) | ||
1810 | { | ||
1811 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1812 | } | ||
1813 | |||
1814 | |||
1815 | // clear dl error | ||
1816 | dlerror(); | ||
1817 | if (orig_cudaThreadSetLimit == NULL) { | ||
1818 | orig_cudaThreadSetLimit = dlsym(RTLD_NEXT, "cudaThreadSetLimit"); | ||
1819 | } | ||
1820 | if ((dl_error = dlerror()) != NULL) | ||
1821 | { | ||
1822 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1823 | } | ||
1824 | |||
1825 | |||
1826 | // clear dl error | ||
1827 | dlerror(); | ||
1828 | if (orig_cudaThreadGetLimit == NULL) { | ||
1829 | orig_cudaThreadGetLimit = dlsym(RTLD_NEXT, "cudaThreadGetLimit"); | ||
1830 | } | ||
1831 | if ((dl_error = dlerror()) != NULL) | ||
1832 | { | ||
1833 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1834 | } | ||
1835 | |||
1836 | |||
1837 | // clear dl error | ||
1838 | dlerror(); | ||
1839 | if (orig_cudaThreadGetCacheConfig == NULL) { | ||
1840 | orig_cudaThreadGetCacheConfig = dlsym(RTLD_NEXT, "cudaThreadGetCacheConfig"); | ||
1841 | } | ||
1842 | if ((dl_error = dlerror()) != NULL) | ||
1843 | { | ||
1844 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1845 | } | ||
1846 | |||
1847 | |||
1848 | // clear dl error | ||
1849 | dlerror(); | ||
1850 | if (orig_cudaThreadSetCacheConfig == NULL) { | ||
1851 | orig_cudaThreadSetCacheConfig = dlsym(RTLD_NEXT, "cudaThreadSetCacheConfig"); | ||
1852 | } | ||
1853 | if ((dl_error = dlerror()) != NULL) | ||
1854 | { | ||
1855 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1856 | } | ||
1857 | |||
1858 | |||
1859 | // clear dl error | ||
1860 | dlerror(); | ||
1861 | if (orig_cudaGetLastError == NULL) { | ||
1862 | orig_cudaGetLastError = dlsym(RTLD_NEXT, "cudaGetLastError"); | ||
1863 | } | ||
1864 | if ((dl_error = dlerror()) != NULL) | ||
1865 | { | ||
1866 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1867 | } | ||
1868 | |||
1869 | |||
1870 | // clear dl error | ||
1871 | dlerror(); | ||
1872 | if (orig_cudaPeekAtLastError == NULL) { | ||
1873 | orig_cudaPeekAtLastError = dlsym(RTLD_NEXT, "cudaPeekAtLastError"); | ||
1874 | } | ||
1875 | if ((dl_error = dlerror()) != NULL) | ||
1876 | { | ||
1877 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1878 | } | ||
1879 | |||
1880 | |||
1881 | // clear dl error | ||
1882 | dlerror(); | ||
1883 | if (orig_cudaGetErrorName == NULL) { | ||
1884 | orig_cudaGetErrorName = dlsym(RTLD_NEXT, "cudaGetErrorName"); | ||
1885 | } | ||
1886 | if ((dl_error = dlerror()) != NULL) | ||
1887 | { | ||
1888 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1889 | } | ||
1890 | |||
1891 | |||
1892 | // clear dl error | ||
1893 | dlerror(); | ||
1894 | if (orig_cudaGetErrorString == NULL) { | ||
1895 | orig_cudaGetErrorString = dlsym(RTLD_NEXT, "cudaGetErrorString"); | ||
1896 | } | ||
1897 | if ((dl_error = dlerror()) != NULL) | ||
1898 | { | ||
1899 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1900 | } | ||
1901 | |||
1902 | |||
1903 | // clear dl error | ||
1904 | dlerror(); | ||
1905 | if (orig_cudaGetDeviceCount == NULL) { | ||
1906 | orig_cudaGetDeviceCount = dlsym(RTLD_NEXT, "cudaGetDeviceCount"); | ||
1907 | } | ||
1908 | if ((dl_error = dlerror()) != NULL) | ||
1909 | { | ||
1910 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1911 | } | ||
1912 | |||
1913 | |||
1914 | // clear dl error | ||
1915 | dlerror(); | ||
1916 | if (orig_cudaGetDeviceProperties == NULL) { | ||
1917 | orig_cudaGetDeviceProperties = dlsym(RTLD_NEXT, "cudaGetDeviceProperties"); | ||
1918 | } | ||
1919 | if ((dl_error = dlerror()) != NULL) | ||
1920 | { | ||
1921 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1922 | } | ||
1923 | |||
1924 | |||
1925 | // clear dl error | ||
1926 | dlerror(); | ||
1927 | if (orig_cudaDeviceGetAttribute == NULL) { | ||
1928 | orig_cudaDeviceGetAttribute = dlsym(RTLD_NEXT, "cudaDeviceGetAttribute"); | ||
1929 | } | ||
1930 | if ((dl_error = dlerror()) != NULL) | ||
1931 | { | ||
1932 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1933 | } | ||
1934 | |||
1935 | |||
1936 | // clear dl error | ||
1937 | dlerror(); | ||
1938 | if (orig_cudaDeviceGetP2PAttribute == NULL) { | ||
1939 | orig_cudaDeviceGetP2PAttribute = dlsym(RTLD_NEXT, "cudaDeviceGetP2PAttribute"); | ||
1940 | } | ||
1941 | if ((dl_error = dlerror()) != NULL) | ||
1942 | { | ||
1943 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1944 | } | ||
1945 | |||
1946 | |||
1947 | // clear dl error | ||
1948 | dlerror(); | ||
1949 | if (orig_cudaChooseDevice == NULL) { | ||
1950 | orig_cudaChooseDevice = dlsym(RTLD_NEXT, "cudaChooseDevice"); | ||
1951 | } | ||
1952 | if ((dl_error = dlerror()) != NULL) | ||
1953 | { | ||
1954 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1955 | } | ||
1956 | |||
1957 | |||
1958 | // clear dl error | ||
1959 | dlerror(); | ||
1960 | if (orig_cudaSetDevice == NULL) { | ||
1961 | orig_cudaSetDevice = dlsym(RTLD_NEXT, "cudaSetDevice"); | ||
1962 | } | ||
1963 | if ((dl_error = dlerror()) != NULL) | ||
1964 | { | ||
1965 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1966 | } | ||
1967 | |||
1968 | |||
1969 | // clear dl error | ||
1970 | dlerror(); | ||
1971 | if (orig_cudaGetDevice == NULL) { | ||
1972 | orig_cudaGetDevice = dlsym(RTLD_NEXT, "cudaGetDevice"); | ||
1973 | } | ||
1974 | if ((dl_error = dlerror()) != NULL) | ||
1975 | { | ||
1976 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1977 | } | ||
1978 | |||
1979 | |||
1980 | // clear dl error | ||
1981 | dlerror(); | ||
1982 | if (orig_cudaSetValidDevices == NULL) { | ||
1983 | orig_cudaSetValidDevices = dlsym(RTLD_NEXT, "cudaSetValidDevices"); | ||
1984 | } | ||
1985 | if ((dl_error = dlerror()) != NULL) | ||
1986 | { | ||
1987 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1988 | } | ||
1989 | |||
1990 | |||
1991 | // clear dl error | ||
1992 | dlerror(); | ||
1993 | if (orig_cudaSetDeviceFlags == NULL) { | ||
1994 | orig_cudaSetDeviceFlags = dlsym(RTLD_NEXT, "cudaSetDeviceFlags"); | ||
1995 | } | ||
1996 | if ((dl_error = dlerror()) != NULL) | ||
1997 | { | ||
1998 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
1999 | } | ||
2000 | |||
2001 | |||
2002 | // clear dl error | ||
2003 | dlerror(); | ||
2004 | if (orig_cudaGetDeviceFlags == NULL) { | ||
2005 | orig_cudaGetDeviceFlags = dlsym(RTLD_NEXT, "cudaGetDeviceFlags"); | ||
2006 | } | ||
2007 | if ((dl_error = dlerror()) != NULL) | ||
2008 | { | ||
2009 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2010 | } | ||
2011 | |||
2012 | |||
2013 | // clear dl error | ||
2014 | dlerror(); | ||
2015 | if (orig_cudaStreamCreate == NULL) { | ||
2016 | orig_cudaStreamCreate = dlsym(RTLD_NEXT, "cudaStreamCreate"); | ||
2017 | } | ||
2018 | if ((dl_error = dlerror()) != NULL) | ||
2019 | { | ||
2020 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2021 | } | ||
2022 | |||
2023 | |||
2024 | // clear dl error | ||
2025 | dlerror(); | ||
2026 | if (orig_cudaStreamCreateWithFlags == NULL) { | ||
2027 | orig_cudaStreamCreateWithFlags = dlsym(RTLD_NEXT, "cudaStreamCreateWithFlags"); | ||
2028 | } | ||
2029 | if ((dl_error = dlerror()) != NULL) | ||
2030 | { | ||
2031 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2032 | } | ||
2033 | |||
2034 | |||
2035 | // clear dl error | ||
2036 | dlerror(); | ||
2037 | if (orig_cudaStreamCreateWithPriority == NULL) { | ||
2038 | orig_cudaStreamCreateWithPriority = dlsym(RTLD_NEXT, "cudaStreamCreateWithPriority"); | ||
2039 | } | ||
2040 | if ((dl_error = dlerror()) != NULL) | ||
2041 | { | ||
2042 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2043 | } | ||
2044 | |||
2045 | |||
2046 | // clear dl error | ||
2047 | dlerror(); | ||
2048 | if (orig_cudaStreamGetPriority == NULL) { | ||
2049 | orig_cudaStreamGetPriority = dlsym(RTLD_NEXT, "cudaStreamGetPriority"); | ||
2050 | } | ||
2051 | if ((dl_error = dlerror()) != NULL) | ||
2052 | { | ||
2053 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2054 | } | ||
2055 | |||
2056 | |||
2057 | // clear dl error | ||
2058 | dlerror(); | ||
2059 | if (orig_cudaStreamGetFlags == NULL) { | ||
2060 | orig_cudaStreamGetFlags = dlsym(RTLD_NEXT, "cudaStreamGetFlags"); | ||
2061 | } | ||
2062 | if ((dl_error = dlerror()) != NULL) | ||
2063 | { | ||
2064 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2065 | } | ||
2066 | |||
2067 | |||
2068 | // clear dl error | ||
2069 | dlerror(); | ||
2070 | if (orig_cudaStreamDestroy == NULL) { | ||
2071 | orig_cudaStreamDestroy = dlsym(RTLD_NEXT, "cudaStreamDestroy"); | ||
2072 | } | ||
2073 | if ((dl_error = dlerror()) != NULL) | ||
2074 | { | ||
2075 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2076 | } | ||
2077 | |||
2078 | |||
2079 | // clear dl error | ||
2080 | dlerror(); | ||
2081 | if (orig_cudaStreamWaitEvent == NULL) { | ||
2082 | orig_cudaStreamWaitEvent = dlsym(RTLD_NEXT, "cudaStreamWaitEvent"); | ||
2083 | } | ||
2084 | if ((dl_error = dlerror()) != NULL) | ||
2085 | { | ||
2086 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2087 | } | ||
2088 | |||
2089 | |||
2090 | // clear dl error | ||
2091 | dlerror(); | ||
2092 | if (orig_cudaStreamAddCallback == NULL) { | ||
2093 | orig_cudaStreamAddCallback = dlsym(RTLD_NEXT, "cudaStreamAddCallback"); | ||
2094 | } | ||
2095 | if ((dl_error = dlerror()) != NULL) | ||
2096 | { | ||
2097 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2098 | } | ||
2099 | |||
2100 | |||
2101 | // clear dl error | ||
2102 | dlerror(); | ||
2103 | if (orig_cudaStreamSynchronize == NULL) { | ||
2104 | orig_cudaStreamSynchronize = dlsym(RTLD_NEXT, "cudaStreamSynchronize"); | ||
2105 | } | ||
2106 | if ((dl_error = dlerror()) != NULL) | ||
2107 | { | ||
2108 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2109 | } | ||
2110 | |||
2111 | |||
2112 | // clear dl error | ||
2113 | dlerror(); | ||
2114 | if (orig_cudaStreamQuery == NULL) { | ||
2115 | orig_cudaStreamQuery = dlsym(RTLD_NEXT, "cudaStreamQuery"); | ||
2116 | } | ||
2117 | if ((dl_error = dlerror()) != NULL) | ||
2118 | { | ||
2119 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2120 | } | ||
2121 | |||
2122 | |||
2123 | // clear dl error | ||
2124 | dlerror(); | ||
2125 | if (orig_cudaStreamAttachMemAsync == NULL) { | ||
2126 | orig_cudaStreamAttachMemAsync = dlsym(RTLD_NEXT, "cudaStreamAttachMemAsync"); | ||
2127 | } | ||
2128 | if ((dl_error = dlerror()) != NULL) | ||
2129 | { | ||
2130 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2131 | } | ||
2132 | |||
2133 | |||
2134 | // clear dl error | ||
2135 | dlerror(); | ||
2136 | if (orig_cudaEventCreate == NULL) { | ||
2137 | orig_cudaEventCreate = dlsym(RTLD_NEXT, "cudaEventCreate"); | ||
2138 | } | ||
2139 | if ((dl_error = dlerror()) != NULL) | ||
2140 | { | ||
2141 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2142 | } | ||
2143 | |||
2144 | |||
2145 | // clear dl error | ||
2146 | dlerror(); | ||
2147 | if (orig_cudaEventCreateWithFlags == NULL) { | ||
2148 | orig_cudaEventCreateWithFlags = dlsym(RTLD_NEXT, "cudaEventCreateWithFlags"); | ||
2149 | } | ||
2150 | if ((dl_error = dlerror()) != NULL) | ||
2151 | { | ||
2152 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2153 | } | ||
2154 | |||
2155 | |||
2156 | // clear dl error | ||
2157 | dlerror(); | ||
2158 | if (orig_cudaEventRecord == NULL) { | ||
2159 | orig_cudaEventRecord = dlsym(RTLD_NEXT, "cudaEventRecord"); | ||
2160 | } | ||
2161 | if ((dl_error = dlerror()) != NULL) | ||
2162 | { | ||
2163 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2164 | } | ||
2165 | |||
2166 | |||
2167 | // clear dl error | ||
2168 | dlerror(); | ||
2169 | if (orig_cudaEventQuery == NULL) { | ||
2170 | orig_cudaEventQuery = dlsym(RTLD_NEXT, "cudaEventQuery"); | ||
2171 | } | ||
2172 | if ((dl_error = dlerror()) != NULL) | ||
2173 | { | ||
2174 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2175 | } | ||
2176 | |||
2177 | |||
2178 | // clear dl error | ||
2179 | dlerror(); | ||
2180 | if (orig_cudaEventSynchronize == NULL) { | ||
2181 | orig_cudaEventSynchronize = dlsym(RTLD_NEXT, "cudaEventSynchronize"); | ||
2182 | } | ||
2183 | if ((dl_error = dlerror()) != NULL) | ||
2184 | { | ||
2185 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2186 | } | ||
2187 | |||
2188 | |||
2189 | // clear dl error | ||
2190 | dlerror(); | ||
2191 | if (orig_cudaEventDestroy == NULL) { | ||
2192 | orig_cudaEventDestroy = dlsym(RTLD_NEXT, "cudaEventDestroy"); | ||
2193 | } | ||
2194 | if ((dl_error = dlerror()) != NULL) | ||
2195 | { | ||
2196 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2197 | } | ||
2198 | |||
2199 | |||
2200 | // clear dl error | ||
2201 | dlerror(); | ||
2202 | if (orig_cudaEventElapsedTime == NULL) { | ||
2203 | orig_cudaEventElapsedTime = dlsym(RTLD_NEXT, "cudaEventElapsedTime"); | ||
2204 | } | ||
2205 | if ((dl_error = dlerror()) != NULL) | ||
2206 | { | ||
2207 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2208 | } | ||
2209 | |||
2210 | |||
2211 | // clear dl error | ||
2212 | dlerror(); | ||
2213 | if (orig_cudaLaunchKernel == NULL) { | ||
2214 | orig_cudaLaunchKernel = dlsym(RTLD_NEXT, "cudaLaunchKernel"); | ||
2215 | } | ||
2216 | if ((dl_error = dlerror()) != NULL) | ||
2217 | { | ||
2218 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2219 | } | ||
2220 | |||
2221 | |||
2222 | // clear dl error | ||
2223 | dlerror(); | ||
2224 | if (orig_cudaLaunchCooperativeKernel == NULL) { | ||
2225 | orig_cudaLaunchCooperativeKernel = dlsym(RTLD_NEXT, "cudaLaunchCooperativeKernel"); | ||
2226 | } | ||
2227 | if ((dl_error = dlerror()) != NULL) | ||
2228 | { | ||
2229 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2230 | } | ||
2231 | |||
2232 | |||
2233 | // clear dl error | ||
2234 | dlerror(); | ||
2235 | if (orig_cudaLaunchCooperativeKernelMultiDevice == NULL) { | ||
2236 | orig_cudaLaunchCooperativeKernelMultiDevice = dlsym(RTLD_NEXT, "cudaLaunchCooperativeKernelMultiDevice"); | ||
2237 | } | ||
2238 | if ((dl_error = dlerror()) != NULL) | ||
2239 | { | ||
2240 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2241 | } | ||
2242 | |||
2243 | |||
2244 | // clear dl error | ||
2245 | dlerror(); | ||
2246 | if (orig_cudaFuncSetCacheConfig == NULL) { | ||
2247 | orig_cudaFuncSetCacheConfig = dlsym(RTLD_NEXT, "cudaFuncSetCacheConfig"); | ||
2248 | } | ||
2249 | if ((dl_error = dlerror()) != NULL) | ||
2250 | { | ||
2251 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2252 | } | ||
2253 | |||
2254 | |||
2255 | // clear dl error | ||
2256 | dlerror(); | ||
2257 | if (orig_cudaFuncSetSharedMemConfig == NULL) { | ||
2258 | orig_cudaFuncSetSharedMemConfig = dlsym(RTLD_NEXT, "cudaFuncSetSharedMemConfig"); | ||
2259 | } | ||
2260 | if ((dl_error = dlerror()) != NULL) | ||
2261 | { | ||
2262 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2263 | } | ||
2264 | |||
2265 | |||
2266 | // clear dl error | ||
2267 | dlerror(); | ||
2268 | if (orig_cudaFuncGetAttributes == NULL) { | ||
2269 | orig_cudaFuncGetAttributes = dlsym(RTLD_NEXT, "cudaFuncGetAttributes"); | ||
2270 | } | ||
2271 | if ((dl_error = dlerror()) != NULL) | ||
2272 | { | ||
2273 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2274 | } | ||
2275 | |||
2276 | |||
2277 | // clear dl error | ||
2278 | dlerror(); | ||
2279 | if (orig_cudaFuncSetAttribute == NULL) { | ||
2280 | orig_cudaFuncSetAttribute = dlsym(RTLD_NEXT, "cudaFuncSetAttribute"); | ||
2281 | } | ||
2282 | if ((dl_error = dlerror()) != NULL) | ||
2283 | { | ||
2284 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2285 | } | ||
2286 | |||
2287 | |||
2288 | // clear dl error | ||
2289 | dlerror(); | ||
2290 | if (orig_cudaSetDoubleForDevice == NULL) { | ||
2291 | orig_cudaSetDoubleForDevice = dlsym(RTLD_NEXT, "cudaSetDoubleForDevice"); | ||
2292 | } | ||
2293 | if ((dl_error = dlerror()) != NULL) | ||
2294 | { | ||
2295 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2296 | } | ||
2297 | |||
2298 | |||
2299 | // clear dl error | ||
2300 | dlerror(); | ||
2301 | if (orig_cudaSetDoubleForHost == NULL) { | ||
2302 | orig_cudaSetDoubleForHost = dlsym(RTLD_NEXT, "cudaSetDoubleForHost"); | ||
2303 | } | ||
2304 | if ((dl_error = dlerror()) != NULL) | ||
2305 | { | ||
2306 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2307 | } | ||
2308 | |||
2309 | |||
2310 | // clear dl error | ||
2311 | dlerror(); | ||
2312 | if (orig_cudaOccupancyMaxActiveBlocksPerMultiprocessor == NULL) { | ||
2313 | orig_cudaOccupancyMaxActiveBlocksPerMultiprocessor = dlsym(RTLD_NEXT, "cudaOccupancyMaxActiveBlocksPerMultiprocessor"); | ||
2314 | } | ||
2315 | if ((dl_error = dlerror()) != NULL) | ||
2316 | { | ||
2317 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2318 | } | ||
2319 | |||
2320 | |||
2321 | // clear dl error | ||
2322 | dlerror(); | ||
2323 | if (orig_cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags == NULL) { | ||
2324 | orig_cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags = dlsym(RTLD_NEXT, "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags"); | ||
2325 | } | ||
2326 | if ((dl_error = dlerror()) != NULL) | ||
2327 | { | ||
2328 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2329 | } | ||
2330 | |||
2331 | |||
2332 | // clear dl error | ||
2333 | dlerror(); | ||
2334 | if (orig_cudaConfigureCall == NULL) { | ||
2335 | orig_cudaConfigureCall = dlsym(RTLD_NEXT, "cudaConfigureCall"); | ||
2336 | } | ||
2337 | if ((dl_error = dlerror()) != NULL) | ||
2338 | { | ||
2339 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2340 | } | ||
2341 | |||
2342 | |||
2343 | // clear dl error | ||
2344 | dlerror(); | ||
2345 | if (orig_cudaSetupArgument == NULL) { | ||
2346 | orig_cudaSetupArgument = dlsym(RTLD_NEXT, "cudaSetupArgument"); | ||
2347 | } | ||
2348 | if ((dl_error = dlerror()) != NULL) | ||
2349 | { | ||
2350 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2351 | } | ||
2352 | |||
2353 | |||
2354 | // clear dl error | ||
2355 | dlerror(); | ||
2356 | if (orig_cudaLaunch == NULL) { | ||
2357 | orig_cudaLaunch = dlsym(RTLD_NEXT, "cudaLaunch"); | ||
2358 | } | ||
2359 | if ((dl_error = dlerror()) != NULL) | ||
2360 | { | ||
2361 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2362 | } | ||
2363 | |||
2364 | |||
2365 | // clear dl error | ||
2366 | dlerror(); | ||
2367 | if (orig_cudaMallocManaged == NULL) { | ||
2368 | orig_cudaMallocManaged = dlsym(RTLD_NEXT, "cudaMallocManaged"); | ||
2369 | } | ||
2370 | if ((dl_error = dlerror()) != NULL) | ||
2371 | { | ||
2372 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2373 | } | ||
2374 | |||
2375 | |||
2376 | // clear dl error | ||
2377 | dlerror(); | ||
2378 | if (orig_cudaMalloc == NULL) { | ||
2379 | orig_cudaMalloc = dlsym(RTLD_NEXT, "cudaMalloc"); | ||
2380 | } | ||
2381 | if ((dl_error = dlerror()) != NULL) | ||
2382 | { | ||
2383 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2384 | } | ||
2385 | |||
2386 | |||
2387 | // clear dl error | ||
2388 | dlerror(); | ||
2389 | if (orig_cudaMallocHost == NULL) { | ||
2390 | orig_cudaMallocHost = dlsym(RTLD_NEXT, "cudaMallocHost"); | ||
2391 | } | ||
2392 | if ((dl_error = dlerror()) != NULL) | ||
2393 | { | ||
2394 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2395 | } | ||
2396 | |||
2397 | |||
2398 | // clear dl error | ||
2399 | dlerror(); | ||
2400 | if (orig_cudaMallocPitch == NULL) { | ||
2401 | orig_cudaMallocPitch = dlsym(RTLD_NEXT, "cudaMallocPitch"); | ||
2402 | } | ||
2403 | if ((dl_error = dlerror()) != NULL) | ||
2404 | { | ||
2405 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2406 | } | ||
2407 | |||
2408 | |||
2409 | // clear dl error | ||
2410 | dlerror(); | ||
2411 | if (orig_cudaMallocArray == NULL) { | ||
2412 | orig_cudaMallocArray = dlsym(RTLD_NEXT, "cudaMallocArray"); | ||
2413 | } | ||
2414 | if ((dl_error = dlerror()) != NULL) | ||
2415 | { | ||
2416 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2417 | } | ||
2418 | |||
2419 | |||
2420 | // clear dl error | ||
2421 | dlerror(); | ||
2422 | if (orig_cudaFree == NULL) { | ||
2423 | orig_cudaFree = dlsym(RTLD_NEXT, "cudaFree"); | ||
2424 | } | ||
2425 | if ((dl_error = dlerror()) != NULL) | ||
2426 | { | ||
2427 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2428 | } | ||
2429 | |||
2430 | |||
2431 | // clear dl error | ||
2432 | dlerror(); | ||
2433 | if (orig_cudaFreeHost == NULL) { | ||
2434 | orig_cudaFreeHost = dlsym(RTLD_NEXT, "cudaFreeHost"); | ||
2435 | } | ||
2436 | if ((dl_error = dlerror()) != NULL) | ||
2437 | { | ||
2438 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2439 | } | ||
2440 | |||
2441 | |||
2442 | // clear dl error | ||
2443 | dlerror(); | ||
2444 | if (orig_cudaFreeArray == NULL) { | ||
2445 | orig_cudaFreeArray = dlsym(RTLD_NEXT, "cudaFreeArray"); | ||
2446 | } | ||
2447 | if ((dl_error = dlerror()) != NULL) | ||
2448 | { | ||
2449 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2450 | } | ||
2451 | |||
2452 | |||
2453 | // clear dl error | ||
2454 | dlerror(); | ||
2455 | if (orig_cudaFreeMipmappedArray == NULL) { | ||
2456 | orig_cudaFreeMipmappedArray = dlsym(RTLD_NEXT, "cudaFreeMipmappedArray"); | ||
2457 | } | ||
2458 | if ((dl_error = dlerror()) != NULL) | ||
2459 | { | ||
2460 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2461 | } | ||
2462 | |||
2463 | |||
2464 | // clear dl error | ||
2465 | dlerror(); | ||
2466 | if (orig_cudaHostAlloc == NULL) { | ||
2467 | orig_cudaHostAlloc = dlsym(RTLD_NEXT, "cudaHostAlloc"); | ||
2468 | } | ||
2469 | if ((dl_error = dlerror()) != NULL) | ||
2470 | { | ||
2471 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2472 | } | ||
2473 | |||
2474 | |||
2475 | // clear dl error | ||
2476 | dlerror(); | ||
2477 | if (orig_cudaHostRegister == NULL) { | ||
2478 | orig_cudaHostRegister = dlsym(RTLD_NEXT, "cudaHostRegister"); | ||
2479 | } | ||
2480 | if ((dl_error = dlerror()) != NULL) | ||
2481 | { | ||
2482 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2483 | } | ||
2484 | |||
2485 | |||
2486 | // clear dl error | ||
2487 | dlerror(); | ||
2488 | if (orig_cudaHostUnregister == NULL) { | ||
2489 | orig_cudaHostUnregister = dlsym(RTLD_NEXT, "cudaHostUnregister"); | ||
2490 | } | ||
2491 | if ((dl_error = dlerror()) != NULL) | ||
2492 | { | ||
2493 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2494 | } | ||
2495 | |||
2496 | |||
2497 | // clear dl error | ||
2498 | dlerror(); | ||
2499 | if (orig_cudaHostGetDevicePointer == NULL) { | ||
2500 | orig_cudaHostGetDevicePointer = dlsym(RTLD_NEXT, "cudaHostGetDevicePointer"); | ||
2501 | } | ||
2502 | if ((dl_error = dlerror()) != NULL) | ||
2503 | { | ||
2504 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2505 | } | ||
2506 | |||
2507 | |||
2508 | // clear dl error | ||
2509 | dlerror(); | ||
2510 | if (orig_cudaHostGetFlags == NULL) { | ||
2511 | orig_cudaHostGetFlags = dlsym(RTLD_NEXT, "cudaHostGetFlags"); | ||
2512 | } | ||
2513 | if ((dl_error = dlerror()) != NULL) | ||
2514 | { | ||
2515 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2516 | } | ||
2517 | |||
2518 | |||
2519 | // clear dl error | ||
2520 | dlerror(); | ||
2521 | if (orig_cudaMalloc3D == NULL) { | ||
2522 | orig_cudaMalloc3D = dlsym(RTLD_NEXT, "cudaMalloc3D"); | ||
2523 | } | ||
2524 | if ((dl_error = dlerror()) != NULL) | ||
2525 | { | ||
2526 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2527 | } | ||
2528 | |||
2529 | |||
2530 | // clear dl error | ||
2531 | dlerror(); | ||
2532 | if (orig_cudaMalloc3DArray == NULL) { | ||
2533 | orig_cudaMalloc3DArray = dlsym(RTLD_NEXT, "cudaMalloc3DArray"); | ||
2534 | } | ||
2535 | if ((dl_error = dlerror()) != NULL) | ||
2536 | { | ||
2537 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2538 | } | ||
2539 | |||
2540 | |||
2541 | // clear dl error | ||
2542 | dlerror(); | ||
2543 | if (orig_cudaMallocMipmappedArray == NULL) { | ||
2544 | orig_cudaMallocMipmappedArray = dlsym(RTLD_NEXT, "cudaMallocMipmappedArray"); | ||
2545 | } | ||
2546 | if ((dl_error = dlerror()) != NULL) | ||
2547 | { | ||
2548 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2549 | } | ||
2550 | |||
2551 | |||
2552 | // clear dl error | ||
2553 | dlerror(); | ||
2554 | if (orig_cudaGetMipmappedArrayLevel == NULL) { | ||
2555 | orig_cudaGetMipmappedArrayLevel = dlsym(RTLD_NEXT, "cudaGetMipmappedArrayLevel"); | ||
2556 | } | ||
2557 | if ((dl_error = dlerror()) != NULL) | ||
2558 | { | ||
2559 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2560 | } | ||
2561 | |||
2562 | |||
2563 | // clear dl error | ||
2564 | dlerror(); | ||
2565 | if (orig_cudaMemcpy3D == NULL) { | ||
2566 | orig_cudaMemcpy3D = dlsym(RTLD_NEXT, "cudaMemcpy3D"); | ||
2567 | } | ||
2568 | if ((dl_error = dlerror()) != NULL) | ||
2569 | { | ||
2570 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2571 | } | ||
2572 | |||
2573 | |||
2574 | // clear dl error | ||
2575 | dlerror(); | ||
2576 | if (orig_cudaMemcpy3DPeer == NULL) { | ||
2577 | orig_cudaMemcpy3DPeer = dlsym(RTLD_NEXT, "cudaMemcpy3DPeer"); | ||
2578 | } | ||
2579 | if ((dl_error = dlerror()) != NULL) | ||
2580 | { | ||
2581 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2582 | } | ||
2583 | |||
2584 | |||
2585 | // clear dl error | ||
2586 | dlerror(); | ||
2587 | if (orig_cudaMemcpy3DAsync == NULL) { | ||
2588 | orig_cudaMemcpy3DAsync = dlsym(RTLD_NEXT, "cudaMemcpy3DAsync"); | ||
2589 | } | ||
2590 | if ((dl_error = dlerror()) != NULL) | ||
2591 | { | ||
2592 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2593 | } | ||
2594 | |||
2595 | |||
2596 | // clear dl error | ||
2597 | dlerror(); | ||
2598 | if (orig_cudaMemcpy3DPeerAsync == NULL) { | ||
2599 | orig_cudaMemcpy3DPeerAsync = dlsym(RTLD_NEXT, "cudaMemcpy3DPeerAsync"); | ||
2600 | } | ||
2601 | if ((dl_error = dlerror()) != NULL) | ||
2602 | { | ||
2603 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2604 | } | ||
2605 | |||
2606 | |||
2607 | // clear dl error | ||
2608 | dlerror(); | ||
2609 | if (orig_cudaMemGetInfo == NULL) { | ||
2610 | orig_cudaMemGetInfo = dlsym(RTLD_NEXT, "cudaMemGetInfo"); | ||
2611 | } | ||
2612 | if ((dl_error = dlerror()) != NULL) | ||
2613 | { | ||
2614 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2615 | } | ||
2616 | |||
2617 | |||
2618 | // clear dl error | ||
2619 | dlerror(); | ||
2620 | if (orig_cudaArrayGetInfo == NULL) { | ||
2621 | orig_cudaArrayGetInfo = dlsym(RTLD_NEXT, "cudaArrayGetInfo"); | ||
2622 | } | ||
2623 | if ((dl_error = dlerror()) != NULL) | ||
2624 | { | ||
2625 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2626 | } | ||
2627 | |||
2628 | |||
2629 | // clear dl error | ||
2630 | dlerror(); | ||
2631 | if (orig_cudaMemcpy == NULL) { | ||
2632 | orig_cudaMemcpy = dlsym(RTLD_NEXT, "cudaMemcpy"); | ||
2633 | } | ||
2634 | if ((dl_error = dlerror()) != NULL) | ||
2635 | { | ||
2636 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2637 | } | ||
2638 | |||
2639 | |||
2640 | // clear dl error | ||
2641 | dlerror(); | ||
2642 | if (orig_cudaMemcpyPeer == NULL) { | ||
2643 | orig_cudaMemcpyPeer = dlsym(RTLD_NEXT, "cudaMemcpyPeer"); | ||
2644 | } | ||
2645 | if ((dl_error = dlerror()) != NULL) | ||
2646 | { | ||
2647 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2648 | } | ||
2649 | |||
2650 | |||
2651 | // clear dl error | ||
2652 | dlerror(); | ||
2653 | if (orig_cudaMemcpyToArray == NULL) { | ||
2654 | orig_cudaMemcpyToArray = dlsym(RTLD_NEXT, "cudaMemcpyToArray"); | ||
2655 | } | ||
2656 | if ((dl_error = dlerror()) != NULL) | ||
2657 | { | ||
2658 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2659 | } | ||
2660 | |||
2661 | |||
2662 | // clear dl error | ||
2663 | dlerror(); | ||
2664 | if (orig_cudaMemcpyFromArray == NULL) { | ||
2665 | orig_cudaMemcpyFromArray = dlsym(RTLD_NEXT, "cudaMemcpyFromArray"); | ||
2666 | } | ||
2667 | if ((dl_error = dlerror()) != NULL) | ||
2668 | { | ||
2669 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2670 | } | ||
2671 | |||
2672 | |||
2673 | // clear dl error | ||
2674 | dlerror(); | ||
2675 | if (orig_cudaMemcpyArrayToArray == NULL) { | ||
2676 | orig_cudaMemcpyArrayToArray = dlsym(RTLD_NEXT, "cudaMemcpyArrayToArray"); | ||
2677 | } | ||
2678 | if ((dl_error = dlerror()) != NULL) | ||
2679 | { | ||
2680 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2681 | } | ||
2682 | |||
2683 | |||
2684 | // clear dl error | ||
2685 | dlerror(); | ||
2686 | if (orig_cudaMemcpy2D == NULL) { | ||
2687 | orig_cudaMemcpy2D = dlsym(RTLD_NEXT, "cudaMemcpy2D"); | ||
2688 | } | ||
2689 | if ((dl_error = dlerror()) != NULL) | ||
2690 | { | ||
2691 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2692 | } | ||
2693 | |||
2694 | |||
2695 | // clear dl error | ||
2696 | dlerror(); | ||
2697 | if (orig_cudaMemcpy2DToArray == NULL) { | ||
2698 | orig_cudaMemcpy2DToArray = dlsym(RTLD_NEXT, "cudaMemcpy2DToArray"); | ||
2699 | } | ||
2700 | if ((dl_error = dlerror()) != NULL) | ||
2701 | { | ||
2702 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2703 | } | ||
2704 | |||
2705 | |||
2706 | // clear dl error | ||
2707 | dlerror(); | ||
2708 | if (orig_cudaMemcpy2DFromArray == NULL) { | ||
2709 | orig_cudaMemcpy2DFromArray = dlsym(RTLD_NEXT, "cudaMemcpy2DFromArray"); | ||
2710 | } | ||
2711 | if ((dl_error = dlerror()) != NULL) | ||
2712 | { | ||
2713 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2714 | } | ||
2715 | |||
2716 | |||
2717 | // clear dl error | ||
2718 | dlerror(); | ||
2719 | if (orig_cudaMemcpy2DArrayToArray == NULL) { | ||
2720 | orig_cudaMemcpy2DArrayToArray = dlsym(RTLD_NEXT, "cudaMemcpy2DArrayToArray"); | ||
2721 | } | ||
2722 | if ((dl_error = dlerror()) != NULL) | ||
2723 | { | ||
2724 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2725 | } | ||
2726 | |||
2727 | |||
2728 | // clear dl error | ||
2729 | dlerror(); | ||
2730 | if (orig_cudaMemcpyToSymbol == NULL) { | ||
2731 | orig_cudaMemcpyToSymbol = dlsym(RTLD_NEXT, "cudaMemcpyToSymbol"); | ||
2732 | } | ||
2733 | if ((dl_error = dlerror()) != NULL) | ||
2734 | { | ||
2735 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2736 | } | ||
2737 | |||
2738 | |||
2739 | // clear dl error | ||
2740 | dlerror(); | ||
2741 | if (orig_cudaMemcpyFromSymbol == NULL) { | ||
2742 | orig_cudaMemcpyFromSymbol = dlsym(RTLD_NEXT, "cudaMemcpyFromSymbol"); | ||
2743 | } | ||
2744 | if ((dl_error = dlerror()) != NULL) | ||
2745 | { | ||
2746 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2747 | } | ||
2748 | |||
2749 | |||
2750 | // clear dl error | ||
2751 | dlerror(); | ||
2752 | if (orig_cudaMemcpyAsync == NULL) { | ||
2753 | orig_cudaMemcpyAsync = dlsym(RTLD_NEXT, "cudaMemcpyAsync"); | ||
2754 | } | ||
2755 | if ((dl_error = dlerror()) != NULL) | ||
2756 | { | ||
2757 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2758 | } | ||
2759 | |||
2760 | |||
2761 | // clear dl error | ||
2762 | dlerror(); | ||
2763 | if (orig_cudaMemcpyPeerAsync == NULL) { | ||
2764 | orig_cudaMemcpyPeerAsync = dlsym(RTLD_NEXT, "cudaMemcpyPeerAsync"); | ||
2765 | } | ||
2766 | if ((dl_error = dlerror()) != NULL) | ||
2767 | { | ||
2768 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2769 | } | ||
2770 | |||
2771 | |||
2772 | // clear dl error | ||
2773 | dlerror(); | ||
2774 | if (orig_cudaMemcpyToArrayAsync == NULL) { | ||
2775 | orig_cudaMemcpyToArrayAsync = dlsym(RTLD_NEXT, "cudaMemcpyToArrayAsync"); | ||
2776 | } | ||
2777 | if ((dl_error = dlerror()) != NULL) | ||
2778 | { | ||
2779 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2780 | } | ||
2781 | |||
2782 | |||
2783 | // clear dl error | ||
2784 | dlerror(); | ||
2785 | if (orig_cudaMemcpyFromArrayAsync == NULL) { | ||
2786 | orig_cudaMemcpyFromArrayAsync = dlsym(RTLD_NEXT, "cudaMemcpyFromArrayAsync"); | ||
2787 | } | ||
2788 | if ((dl_error = dlerror()) != NULL) | ||
2789 | { | ||
2790 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2791 | } | ||
2792 | |||
2793 | |||
2794 | // clear dl error | ||
2795 | dlerror(); | ||
2796 | if (orig_cudaMemcpy2DAsync == NULL) { | ||
2797 | orig_cudaMemcpy2DAsync = dlsym(RTLD_NEXT, "cudaMemcpy2DAsync"); | ||
2798 | } | ||
2799 | if ((dl_error = dlerror()) != NULL) | ||
2800 | { | ||
2801 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2802 | } | ||
2803 | |||
2804 | |||
2805 | // clear dl error | ||
2806 | dlerror(); | ||
2807 | if (orig_cudaMemcpy2DToArrayAsync == NULL) { | ||
2808 | orig_cudaMemcpy2DToArrayAsync = dlsym(RTLD_NEXT, "cudaMemcpy2DToArrayAsync"); | ||
2809 | } | ||
2810 | if ((dl_error = dlerror()) != NULL) | ||
2811 | { | ||
2812 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2813 | } | ||
2814 | |||
2815 | |||
2816 | // clear dl error | ||
2817 | dlerror(); | ||
2818 | if (orig_cudaMemcpy2DFromArrayAsync == NULL) { | ||
2819 | orig_cudaMemcpy2DFromArrayAsync = dlsym(RTLD_NEXT, "cudaMemcpy2DFromArrayAsync"); | ||
2820 | } | ||
2821 | if ((dl_error = dlerror()) != NULL) | ||
2822 | { | ||
2823 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2824 | } | ||
2825 | |||
2826 | |||
2827 | // clear dl error | ||
2828 | dlerror(); | ||
2829 | if (orig_cudaMemcpyToSymbolAsync == NULL) { | ||
2830 | orig_cudaMemcpyToSymbolAsync = dlsym(RTLD_NEXT, "cudaMemcpyToSymbolAsync"); | ||
2831 | } | ||
2832 | if ((dl_error = dlerror()) != NULL) | ||
2833 | { | ||
2834 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2835 | } | ||
2836 | |||
2837 | |||
2838 | // clear dl error | ||
2839 | dlerror(); | ||
2840 | if (orig_cudaMemcpyFromSymbolAsync == NULL) { | ||
2841 | orig_cudaMemcpyFromSymbolAsync = dlsym(RTLD_NEXT, "cudaMemcpyFromSymbolAsync"); | ||
2842 | } | ||
2843 | if ((dl_error = dlerror()) != NULL) | ||
2844 | { | ||
2845 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2846 | } | ||
2847 | |||
2848 | |||
2849 | // clear dl error | ||
2850 | dlerror(); | ||
2851 | if (orig_cudaMemset == NULL) { | ||
2852 | orig_cudaMemset = dlsym(RTLD_NEXT, "cudaMemset"); | ||
2853 | } | ||
2854 | if ((dl_error = dlerror()) != NULL) | ||
2855 | { | ||
2856 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2857 | } | ||
2858 | |||
2859 | |||
2860 | // clear dl error | ||
2861 | dlerror(); | ||
2862 | if (orig_cudaMemset2D == NULL) { | ||
2863 | orig_cudaMemset2D = dlsym(RTLD_NEXT, "cudaMemset2D"); | ||
2864 | } | ||
2865 | if ((dl_error = dlerror()) != NULL) | ||
2866 | { | ||
2867 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2868 | } | ||
2869 | |||
2870 | |||
2871 | // clear dl error | ||
2872 | dlerror(); | ||
2873 | if (orig_cudaMemset3D == NULL) { | ||
2874 | orig_cudaMemset3D = dlsym(RTLD_NEXT, "cudaMemset3D"); | ||
2875 | } | ||
2876 | if ((dl_error = dlerror()) != NULL) | ||
2877 | { | ||
2878 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2879 | } | ||
2880 | |||
2881 | |||
2882 | // clear dl error | ||
2883 | dlerror(); | ||
2884 | if (orig_cudaMemsetAsync == NULL) { | ||
2885 | orig_cudaMemsetAsync = dlsym(RTLD_NEXT, "cudaMemsetAsync"); | ||
2886 | } | ||
2887 | if ((dl_error = dlerror()) != NULL) | ||
2888 | { | ||
2889 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2890 | } | ||
2891 | |||
2892 | |||
2893 | // clear dl error | ||
2894 | dlerror(); | ||
2895 | if (orig_cudaMemset2DAsync == NULL) { | ||
2896 | orig_cudaMemset2DAsync = dlsym(RTLD_NEXT, "cudaMemset2DAsync"); | ||
2897 | } | ||
2898 | if ((dl_error = dlerror()) != NULL) | ||
2899 | { | ||
2900 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2901 | } | ||
2902 | |||
2903 | |||
2904 | // clear dl error | ||
2905 | dlerror(); | ||
2906 | if (orig_cudaMemset3DAsync == NULL) { | ||
2907 | orig_cudaMemset3DAsync = dlsym(RTLD_NEXT, "cudaMemset3DAsync"); | ||
2908 | } | ||
2909 | if ((dl_error = dlerror()) != NULL) | ||
2910 | { | ||
2911 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2912 | } | ||
2913 | |||
2914 | |||
2915 | // clear dl error | ||
2916 | dlerror(); | ||
2917 | if (orig_cudaGetSymbolAddress == NULL) { | ||
2918 | orig_cudaGetSymbolAddress = dlsym(RTLD_NEXT, "cudaGetSymbolAddress"); | ||
2919 | } | ||
2920 | if ((dl_error = dlerror()) != NULL) | ||
2921 | { | ||
2922 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2923 | } | ||
2924 | |||
2925 | |||
2926 | // clear dl error | ||
2927 | dlerror(); | ||
2928 | if (orig_cudaGetSymbolSize == NULL) { | ||
2929 | orig_cudaGetSymbolSize = dlsym(RTLD_NEXT, "cudaGetSymbolSize"); | ||
2930 | } | ||
2931 | if ((dl_error = dlerror()) != NULL) | ||
2932 | { | ||
2933 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2934 | } | ||
2935 | |||
2936 | |||
2937 | // clear dl error | ||
2938 | dlerror(); | ||
2939 | if (orig_cudaMemPrefetchAsync == NULL) { | ||
2940 | orig_cudaMemPrefetchAsync = dlsym(RTLD_NEXT, "cudaMemPrefetchAsync"); | ||
2941 | } | ||
2942 | if ((dl_error = dlerror()) != NULL) | ||
2943 | { | ||
2944 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2945 | } | ||
2946 | |||
2947 | |||
2948 | // clear dl error | ||
2949 | dlerror(); | ||
2950 | if (orig_cudaMemAdvise == NULL) { | ||
2951 | orig_cudaMemAdvise = dlsym(RTLD_NEXT, "cudaMemAdvise"); | ||
2952 | } | ||
2953 | if ((dl_error = dlerror()) != NULL) | ||
2954 | { | ||
2955 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2956 | } | ||
2957 | |||
2958 | |||
2959 | // clear dl error | ||
2960 | dlerror(); | ||
2961 | if (orig_cudaMemRangeGetAttribute == NULL) { | ||
2962 | orig_cudaMemRangeGetAttribute = dlsym(RTLD_NEXT, "cudaMemRangeGetAttribute"); | ||
2963 | } | ||
2964 | if ((dl_error = dlerror()) != NULL) | ||
2965 | { | ||
2966 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2967 | } | ||
2968 | |||
2969 | |||
2970 | // clear dl error | ||
2971 | dlerror(); | ||
2972 | if (orig_cudaMemRangeGetAttributes == NULL) { | ||
2973 | orig_cudaMemRangeGetAttributes = dlsym(RTLD_NEXT, "cudaMemRangeGetAttributes"); | ||
2974 | } | ||
2975 | if ((dl_error = dlerror()) != NULL) | ||
2976 | { | ||
2977 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2978 | } | ||
2979 | |||
2980 | |||
2981 | // clear dl error | ||
2982 | dlerror(); | ||
2983 | if (orig_cudaPointerGetAttributes == NULL) { | ||
2984 | orig_cudaPointerGetAttributes = dlsym(RTLD_NEXT, "cudaPointerGetAttributes"); | ||
2985 | } | ||
2986 | if ((dl_error = dlerror()) != NULL) | ||
2987 | { | ||
2988 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
2989 | } | ||
2990 | |||
2991 | |||
2992 | // clear dl error | ||
2993 | dlerror(); | ||
2994 | if (orig_cudaDeviceCanAccessPeer == NULL) { | ||
2995 | orig_cudaDeviceCanAccessPeer = dlsym(RTLD_NEXT, "cudaDeviceCanAccessPeer"); | ||
2996 | } | ||
2997 | if ((dl_error = dlerror()) != NULL) | ||
2998 | { | ||
2999 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3000 | } | ||
3001 | |||
3002 | |||
3003 | // clear dl error | ||
3004 | dlerror(); | ||
3005 | if (orig_cudaDeviceEnablePeerAccess == NULL) { | ||
3006 | orig_cudaDeviceEnablePeerAccess = dlsym(RTLD_NEXT, "cudaDeviceEnablePeerAccess"); | ||
3007 | } | ||
3008 | if ((dl_error = dlerror()) != NULL) | ||
3009 | { | ||
3010 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3011 | } | ||
3012 | |||
3013 | |||
3014 | // clear dl error | ||
3015 | dlerror(); | ||
3016 | if (orig_cudaDeviceDisablePeerAccess == NULL) { | ||
3017 | orig_cudaDeviceDisablePeerAccess = dlsym(RTLD_NEXT, "cudaDeviceDisablePeerAccess"); | ||
3018 | } | ||
3019 | if ((dl_error = dlerror()) != NULL) | ||
3020 | { | ||
3021 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3022 | } | ||
3023 | |||
3024 | |||
3025 | // clear dl error | ||
3026 | dlerror(); | ||
3027 | if (orig_cudaGraphicsUnregisterResource == NULL) { | ||
3028 | orig_cudaGraphicsUnregisterResource = dlsym(RTLD_NEXT, "cudaGraphicsUnregisterResource"); | ||
3029 | } | ||
3030 | if ((dl_error = dlerror()) != NULL) | ||
3031 | { | ||
3032 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3033 | } | ||
3034 | |||
3035 | |||
3036 | // clear dl error | ||
3037 | dlerror(); | ||
3038 | if (orig_cudaGraphicsResourceSetMapFlags == NULL) { | ||
3039 | orig_cudaGraphicsResourceSetMapFlags = dlsym(RTLD_NEXT, "cudaGraphicsResourceSetMapFlags"); | ||
3040 | } | ||
3041 | if ((dl_error = dlerror()) != NULL) | ||
3042 | { | ||
3043 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3044 | } | ||
3045 | |||
3046 | |||
3047 | // clear dl error | ||
3048 | dlerror(); | ||
3049 | if (orig_cudaGraphicsMapResources == NULL) { | ||
3050 | orig_cudaGraphicsMapResources = dlsym(RTLD_NEXT, "cudaGraphicsMapResources"); | ||
3051 | } | ||
3052 | if ((dl_error = dlerror()) != NULL) | ||
3053 | { | ||
3054 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3055 | } | ||
3056 | |||
3057 | |||
3058 | // clear dl error | ||
3059 | dlerror(); | ||
3060 | if (orig_cudaGraphicsUnmapResources == NULL) { | ||
3061 | orig_cudaGraphicsUnmapResources = dlsym(RTLD_NEXT, "cudaGraphicsUnmapResources"); | ||
3062 | } | ||
3063 | if ((dl_error = dlerror()) != NULL) | ||
3064 | { | ||
3065 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3066 | } | ||
3067 | |||
3068 | |||
3069 | // clear dl error | ||
3070 | dlerror(); | ||
3071 | if (orig_cudaGraphicsResourceGetMappedPointer == NULL) { | ||
3072 | orig_cudaGraphicsResourceGetMappedPointer = dlsym(RTLD_NEXT, "cudaGraphicsResourceGetMappedPointer"); | ||
3073 | } | ||
3074 | if ((dl_error = dlerror()) != NULL) | ||
3075 | { | ||
3076 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3077 | } | ||
3078 | |||
3079 | |||
3080 | // clear dl error | ||
3081 | dlerror(); | ||
3082 | if (orig_cudaGraphicsSubResourceGetMappedArray == NULL) { | ||
3083 | orig_cudaGraphicsSubResourceGetMappedArray = dlsym(RTLD_NEXT, "cudaGraphicsSubResourceGetMappedArray"); | ||
3084 | } | ||
3085 | if ((dl_error = dlerror()) != NULL) | ||
3086 | { | ||
3087 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3088 | } | ||
3089 | |||
3090 | |||
3091 | // clear dl error | ||
3092 | dlerror(); | ||
3093 | if (orig_cudaGraphicsResourceGetMappedMipmappedArray == NULL) { | ||
3094 | orig_cudaGraphicsResourceGetMappedMipmappedArray = dlsym(RTLD_NEXT, "cudaGraphicsResourceGetMappedMipmappedArray"); | ||
3095 | } | ||
3096 | if ((dl_error = dlerror()) != NULL) | ||
3097 | { | ||
3098 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3099 | } | ||
3100 | |||
3101 | |||
3102 | // clear dl error | ||
3103 | dlerror(); | ||
3104 | if (orig_cudaGetChannelDesc == NULL) { | ||
3105 | orig_cudaGetChannelDesc = dlsym(RTLD_NEXT, "cudaGetChannelDesc"); | ||
3106 | } | ||
3107 | if ((dl_error = dlerror()) != NULL) | ||
3108 | { | ||
3109 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3110 | } | ||
3111 | |||
3112 | |||
3113 | // clear dl error | ||
3114 | dlerror(); | ||
3115 | if (orig_cudaCreateChannelDesc == NULL) { | ||
3116 | orig_cudaCreateChannelDesc = dlsym(RTLD_NEXT, "cudaCreateChannelDesc"); | ||
3117 | } | ||
3118 | if ((dl_error = dlerror()) != NULL) | ||
3119 | { | ||
3120 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3121 | } | ||
3122 | |||
3123 | |||
3124 | // clear dl error | ||
3125 | dlerror(); | ||
3126 | if (orig_cudaBindTexture == NULL) { | ||
3127 | orig_cudaBindTexture = dlsym(RTLD_NEXT, "cudaBindTexture"); | ||
3128 | } | ||
3129 | if ((dl_error = dlerror()) != NULL) | ||
3130 | { | ||
3131 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3132 | } | ||
3133 | |||
3134 | |||
3135 | // clear dl error | ||
3136 | dlerror(); | ||
3137 | if (orig_cudaBindTexture2D == NULL) { | ||
3138 | orig_cudaBindTexture2D = dlsym(RTLD_NEXT, "cudaBindTexture2D"); | ||
3139 | } | ||
3140 | if ((dl_error = dlerror()) != NULL) | ||
3141 | { | ||
3142 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3143 | } | ||
3144 | |||
3145 | |||
3146 | // clear dl error | ||
3147 | dlerror(); | ||
3148 | if (orig_cudaBindTextureToArray == NULL) { | ||
3149 | orig_cudaBindTextureToArray = dlsym(RTLD_NEXT, "cudaBindTextureToArray"); | ||
3150 | } | ||
3151 | if ((dl_error = dlerror()) != NULL) | ||
3152 | { | ||
3153 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3154 | } | ||
3155 | |||
3156 | |||
3157 | // clear dl error | ||
3158 | dlerror(); | ||
3159 | if (orig_cudaBindTextureToMipmappedArray == NULL) { | ||
3160 | orig_cudaBindTextureToMipmappedArray = dlsym(RTLD_NEXT, "cudaBindTextureToMipmappedArray"); | ||
3161 | } | ||
3162 | if ((dl_error = dlerror()) != NULL) | ||
3163 | { | ||
3164 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3165 | } | ||
3166 | |||
3167 | |||
3168 | // clear dl error | ||
3169 | dlerror(); | ||
3170 | if (orig_cudaUnbindTexture == NULL) { | ||
3171 | orig_cudaUnbindTexture = dlsym(RTLD_NEXT, "cudaUnbindTexture"); | ||
3172 | } | ||
3173 | if ((dl_error = dlerror()) != NULL) | ||
3174 | { | ||
3175 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3176 | } | ||
3177 | |||
3178 | |||
3179 | // clear dl error | ||
3180 | dlerror(); | ||
3181 | if (orig_cudaGetTextureAlignmentOffset == NULL) { | ||
3182 | orig_cudaGetTextureAlignmentOffset = dlsym(RTLD_NEXT, "cudaGetTextureAlignmentOffset"); | ||
3183 | } | ||
3184 | if ((dl_error = dlerror()) != NULL) | ||
3185 | { | ||
3186 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3187 | } | ||
3188 | |||
3189 | |||
3190 | // clear dl error | ||
3191 | dlerror(); | ||
3192 | if (orig_cudaGetTextureReference == NULL) { | ||
3193 | orig_cudaGetTextureReference = dlsym(RTLD_NEXT, "cudaGetTextureReference"); | ||
3194 | } | ||
3195 | if ((dl_error = dlerror()) != NULL) | ||
3196 | { | ||
3197 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3198 | } | ||
3199 | |||
3200 | |||
3201 | // clear dl error | ||
3202 | dlerror(); | ||
3203 | if (orig_cudaBindSurfaceToArray == NULL) { | ||
3204 | orig_cudaBindSurfaceToArray = dlsym(RTLD_NEXT, "cudaBindSurfaceToArray"); | ||
3205 | } | ||
3206 | if ((dl_error = dlerror()) != NULL) | ||
3207 | { | ||
3208 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3209 | } | ||
3210 | |||
3211 | |||
3212 | // clear dl error | ||
3213 | dlerror(); | ||
3214 | if (orig_cudaGetSurfaceReference == NULL) { | ||
3215 | orig_cudaGetSurfaceReference = dlsym(RTLD_NEXT, "cudaGetSurfaceReference"); | ||
3216 | } | ||
3217 | if ((dl_error = dlerror()) != NULL) | ||
3218 | { | ||
3219 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3220 | } | ||
3221 | |||
3222 | |||
3223 | // clear dl error | ||
3224 | dlerror(); | ||
3225 | if (orig_cudaCreateTextureObject == NULL) { | ||
3226 | orig_cudaCreateTextureObject = dlsym(RTLD_NEXT, "cudaCreateTextureObject"); | ||
3227 | } | ||
3228 | if ((dl_error = dlerror()) != NULL) | ||
3229 | { | ||
3230 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3231 | } | ||
3232 | |||
3233 | |||
3234 | // clear dl error | ||
3235 | dlerror(); | ||
3236 | if (orig_cudaDestroyTextureObject == NULL) { | ||
3237 | orig_cudaDestroyTextureObject = dlsym(RTLD_NEXT, "cudaDestroyTextureObject"); | ||
3238 | } | ||
3239 | if ((dl_error = dlerror()) != NULL) | ||
3240 | { | ||
3241 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3242 | } | ||
3243 | |||
3244 | |||
3245 | // clear dl error | ||
3246 | dlerror(); | ||
3247 | if (orig_cudaGetTextureObjectResourceDesc == NULL) { | ||
3248 | orig_cudaGetTextureObjectResourceDesc = dlsym(RTLD_NEXT, "cudaGetTextureObjectResourceDesc"); | ||
3249 | } | ||
3250 | if ((dl_error = dlerror()) != NULL) | ||
3251 | { | ||
3252 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3253 | } | ||
3254 | |||
3255 | |||
3256 | // clear dl error | ||
3257 | dlerror(); | ||
3258 | if (orig_cudaGetTextureObjectTextureDesc == NULL) { | ||
3259 | orig_cudaGetTextureObjectTextureDesc = dlsym(RTLD_NEXT, "cudaGetTextureObjectTextureDesc"); | ||
3260 | } | ||
3261 | if ((dl_error = dlerror()) != NULL) | ||
3262 | { | ||
3263 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3264 | } | ||
3265 | |||
3266 | |||
3267 | // clear dl error | ||
3268 | dlerror(); | ||
3269 | if (orig_cudaGetTextureObjectResourceViewDesc == NULL) { | ||
3270 | orig_cudaGetTextureObjectResourceViewDesc = dlsym(RTLD_NEXT, "cudaGetTextureObjectResourceViewDesc"); | ||
3271 | } | ||
3272 | if ((dl_error = dlerror()) != NULL) | ||
3273 | { | ||
3274 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3275 | } | ||
3276 | |||
3277 | |||
3278 | // clear dl error | ||
3279 | dlerror(); | ||
3280 | if (orig_cudaCreateSurfaceObject == NULL) { | ||
3281 | orig_cudaCreateSurfaceObject = dlsym(RTLD_NEXT, "cudaCreateSurfaceObject"); | ||
3282 | } | ||
3283 | if ((dl_error = dlerror()) != NULL) | ||
3284 | { | ||
3285 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3286 | } | ||
3287 | |||
3288 | |||
3289 | // clear dl error | ||
3290 | dlerror(); | ||
3291 | if (orig_cudaDestroySurfaceObject == NULL) { | ||
3292 | orig_cudaDestroySurfaceObject = dlsym(RTLD_NEXT, "cudaDestroySurfaceObject"); | ||
3293 | } | ||
3294 | if ((dl_error = dlerror()) != NULL) | ||
3295 | { | ||
3296 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3297 | } | ||
3298 | |||
3299 | |||
3300 | // clear dl error | ||
3301 | dlerror(); | ||
3302 | if (orig_cudaGetSurfaceObjectResourceDesc == NULL) { | ||
3303 | orig_cudaGetSurfaceObjectResourceDesc = dlsym(RTLD_NEXT, "cudaGetSurfaceObjectResourceDesc"); | ||
3304 | } | ||
3305 | if ((dl_error = dlerror()) != NULL) | ||
3306 | { | ||
3307 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3308 | } | ||
3309 | |||
3310 | |||
3311 | // clear dl error | ||
3312 | dlerror(); | ||
3313 | if (orig_cudaDriverGetVersion == NULL) { | ||
3314 | orig_cudaDriverGetVersion = dlsym(RTLD_NEXT, "cudaDriverGetVersion"); | ||
3315 | } | ||
3316 | if ((dl_error = dlerror()) != NULL) | ||
3317 | { | ||
3318 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3319 | } | ||
3320 | |||
3321 | |||
3322 | // clear dl error | ||
3323 | dlerror(); | ||
3324 | if (orig_cudaRuntimeGetVersion == NULL) { | ||
3325 | orig_cudaRuntimeGetVersion = dlsym(RTLD_NEXT, "cudaRuntimeGetVersion"); | ||
3326 | } | ||
3327 | if ((dl_error = dlerror()) != NULL) | ||
3328 | { | ||
3329 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3330 | } | ||
3331 | |||
3332 | |||
3333 | // clear dl error | ||
3334 | dlerror(); | ||
3335 | if (orig_cudaGetExportTable == NULL) { | ||
3336 | orig_cudaGetExportTable = dlsym(RTLD_NEXT, "cudaGetExportTable"); | ||
3337 | } | ||
3338 | if ((dl_error = dlerror()) != NULL) | ||
3339 | { | ||
3340 | fprintf(stderr, ">>>>>>> %s\n", dl_error); | ||
3341 | } | ||
3342 | |||
3343 | } | ||
diff --git a/schedAPI.h b/schedAPI.h new file mode 100644 index 0000000..00105fc --- /dev/null +++ b/schedAPI.h | |||
@@ -0,0 +1,10 @@ | |||
/*
 * This include file provides function prototypes for the
 * GPU scheduler functions implemented by schedLib.c and used
 * in the Cuda wrapper functions.
 */
//Called from cudaStreamCreateXXXX() wrappers: records a new stream owned by thread my_tid.
void streamInit(pid_t my_tid, int priority);
//Called from cudaConfigureCall() wrapper: records the pending kernel's launch geometry.
void schedConfCall(pid_t my_tid, void *stream, int blocks, int threads);
//Called from cudaLaunch() wrapper: scheduling decision point before the real launch.
void schedLaunch(pid_t my_tid);
//Called after the real cudaStreamSynchronize(): may unblock waiting launches.
//NOTE(review): parameter named my_pid here but my_tid elsewhere -- same TID value; confirm against schedLib.c.
void schedSync(pid_t my_pid, void *stream);
//Dumps the recorded scheduling-decision trace (called from cudaDeviceReset() wrapper).
void tracePrint(void);
diff --git a/schedLib.c b/schedLib.c new file mode 100644 index 0000000..d16a066 --- /dev/null +++ b/schedLib.c | |||
@@ -0,0 +1,629 @@ | |||
1 | /* | ||
2 | * This library implements a transparent extension of the NVIDIA runtime | ||
3 | * API (libcudart) that is dynamically linked with CUDA programs. This | ||
4 | * extension provides a "middleware" scheduling infrastructure that controls | ||
5 | * CUDA kernel launch requests. It is designed to control kernel scheduling | ||
6 | * for CUDA programs with the following characteristics commonly used for | ||
7 | * concurrent GPU sharing: | ||
8 | * - A main process that creates multiple threads (pthreads) sharing a | ||
9 | * single process address space (i.e., the conditions under which | ||
10 | * kernels can run concurrently on a GPU). | ||
11 | * - Each thread creates one user-defined CUDA stream (FIFO queue) | ||
12 | * that it manages and uses for invoking GPU operations. There is a | ||
13 | * one-to-one relationship between threads and streams. | ||
14 | * - The program is written to launch kernels using the angle-brackets | ||
15 | * syntax (<<<.....>>>) and synchronizes the CPU and GPU with at least | ||
16 | * one call to cudaStreamSynchronize() between successive instances of | ||
17 | * kernel launches in a given stream. | ||
18 | * - The CUDA program is dynamically linked with the CUDA library libcudart | ||
19 | * | ||
20 | * In the case of a CUDA program with multiple user-defined streams, the NVIDIA | ||
21 | * scheduling rules for choosing among multiple streams with kernels at the | ||
22 | * top of their FIFO queues are not documented. This middleware attempts to | ||
23 | * implement and control some of the scheduling choices that can be made. | ||
24 | * | ||
25 | * The library functions are transparently invoked by "wrapping" calls | ||
26 | * to certain of the original CUDA API functions (described below) and | ||
27 | * performing scheduling choices before or after invoking the "real" CUDA | ||
28 | * code. Control over which kernel launch requests can be presented to | ||
29 | * the NVIDIA software and hardware scheduling mechanisms is achieved | ||
30 | * by blocking and signaling operations on the program threads. | ||
31 | * | ||
32 | * The new library functions were designed following the fundamental | ||
33 | * principle of separation between mechanism and policy. Most of the library | ||
34 | * is for implementing the mechanisms that are required for any policy. | ||
35 | * Many scheduling policies are possible given adequate mechanisms for | ||
36 | * carrying out a given policy. The separation of mechanism and policy | ||
37 | * makes it easy to try out and evaluate different policies. In the library | ||
38 | * code, all aspects of policy are implemented in a single function, | ||
39 | * find_next_kernel(), which returns either an identifier for a stream | ||
40 | * to launch a kernel or -1 to indicate that no new launch is allowed. | ||
41 | * The policy functions are intended to be implemented as instances | ||
42 | * of the find_next_kernel() function each contained in a .h file named | ||
43 | * in a #include statement. | ||
44 | * | ||
45 | * For a given thread/stream, the basic sequence of actions is: | ||
46 | * - The program creates a user-defined stream using one of the calls | ||
47 | * cudaStreamCreate(), cudaStreamCreateWithFlags(), or | ||
48 | * cudaStreamCreateWithPriority(). These calls first invoke the | ||
49 | * new library function streamInit() that initializes state about the | ||
50 | * stream and, on the first call, initializes state about the GPU. | ||
51 | * The "real" cuda runtime code for creating a stream is then executed. | ||
52 | * | ||
53 | * - A CUDA kernel launch on a created stream using the angle-brackets syntax | ||
54 | * (<<<.....>>>) is compiled with nvcc by generating two API calls which | ||
55 | * result in calls to the new library functions: | ||
56 | * - cudaConfigureCall() first calls the new library function | ||
57 | * schedConfCall() which records configuration information (e.g., | ||
58 | * number of blocks and threads) about the kernel about to be launched | ||
59 | * and then executes the "real" library code for configuring a launch. | ||
60 | * - cudaLaunch() first calls the new library function schedLaunch(). | ||
61 | * This function implements a scheduling decision that determines | ||
62 | * which, if any, threads, including the one attempting a launch, should | ||
63 | * not be blocked so the actual launch can be allowed to happen. For those | ||
64 | * that are already blocked, the corresponding condition is signaled. | ||
65 | * For the current launching thread, the thread is not blocked and it | ||
66 | * executes the "real" CUDA launch. If the current launching thread | ||
67 | * should be blocked for later scheduling, the corresponding pthread | ||
68 | * condition wait is executed. | ||
69 | * | ||
70 | * - The CUDA program synchronizes the CPU thread with a launched kernel | ||
71 | * using a cudaStreamSynchronize() call. The "wrapper" for this function | ||
72 | * has a different sequence of operations. It first executes the "real" | ||
73 | * synchronization function which may result in the CPU thread being | ||
74 | * blocked until the kernel completes on the GPU. When the "real" call | ||
75 | * returns (kernel completed), the new library function schedSync() is called. | ||
76 | * It implements a scheduling decision that determines whether any blocked | ||
77 | * kernel launches can now be executed and, if so, signals the conditions | ||
78 | * that are blocking the threads attempting to launch a kernel. The kernel | ||
79 | * launch is then handled by the "real" NVIDIA scheduling functions. | ||
80 | * | ||
81 | * The limitation inherent in this design is that the underlying Linux scheduler | ||
82 | * actually determines the order in which blocked threads run when unblocked. | ||
83 | * In the case that multiple blocked threads are signalled, the actual order of | ||
84 | * launches depends on how the Linux scheduler orders the thread dispatching. | ||
85 | * Only in the case where the scheduling algorithm allows only one thread to launch | ||
86 | * a kernel at a time (essentially eliminating any concurrency) can the launch | ||
 * order be made completely deterministic. In all other cases, the scheduler
88 | * can only control the set of kernels that are allowed to run concurrently, | ||
89 | * not the specific order in which they start executing. | ||
90 | * | ||
91 | * The new scheduling "middleware" is implemented as a library that is compiled | ||
92 | * and linked with the cuda wrappers as a dynamic-linked load module (see the Makefile). | ||
93 | * A process has one copy of this library program and all threads created by the | ||
94 | * process share the global state for the library. | ||
95 | * | ||
96 | * IMPORTANT: Assumes that a process will create threads only with the POSIX | ||
97 | * API call pthread_create() and not use a system call like clone() directly. | ||
98 | * Also assumes that there is a one-to-one relationship between threads and | ||
99 | * streams and that the Linux thread TID is sufficient to identify a stream. | ||
100 | * | ||
101 | * Note that all new library calls have a void return. If a call returns, it | ||
102 | * can be assumed that the call completed without encountering potential errors. | ||
103 | * If any error is identified, the process is terminated. | ||
104 | * | ||
105 | * Written by Don Smith, Department of Computer Science, | ||
106 | * University of North Carolina at Chapel Hill. | ||
107 | * 2019. | ||
108 | */ | ||
109 | |||
/* Feature-test macros must be defined before the FIRST system header is
 * included; otherwise glibc's feature-selection machinery has already run
 * and the macro silently has no effect. The original code defined
 * _GNU_SOURCE after <stdio.h> et al., which disabled the GNU extensions
 * it was meant to enable (e.g. for syscall()/gettid-style usage). */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <time.h>
#include <errno.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>

#include <pthread.h>
122 | |||
/* Tracing configuration: when TRACE_ON is 1, each scheduling decision is
 * snapshotted into SchedTrace[] and can later be dumped via tracePrint(). */
#define TRACE_ON 1 //set to 1 for producing trace of launch decisions, 0 for not
#define MAX_SCHED_TRACE 100000 //maximum number of trace entries retained

/* Platform limits -- sized for the NVIDIA Jetson TX2 (2 SMs, 4 ARM cores). */
#define MAX_STREAMS 4 // One per basic ARM core on TX2
#define MAX_GPU_BLOCKS 64 //Max blocks on 2 SM TX2
#define MAX_GPU_THREADS 4096 //Max threads on 2 SM TX2

#define min(a,b) ((a) <= (b) ? (a) : (b))
#define max(a,b) ((a) >= (b) ? (a) : (b))

int trc_idx = 0; //number of valid entries in SchedTrace[] (next free slot)
struct schedTrace *tr_ptr; //NOTE(review): cursor into SchedTrace[] -- presumably set by trace-recording code elsewhere in this file; not used in this excerpt
/* One snapshot of scheduler state per scheduling decision. */
struct schedTrace {
	int stream[MAX_STREAMS]; //per-slot stream identifier at decision time
	int stream_threads[MAX_STREAMS]; //per-slot allocated GPU threads at decision time
	int next; //stream index chosen to launch, or -1 when no launch was allowed (see find_best_kernel)
	char type[4]; //short NUL-terminated tag for the decision point (printed by tracePrint)
}SchedTrace[MAX_SCHED_TRACE];
141 | |||
142 | |||
int Initialized = 0; //Only initialize GPU once -- set to 1 the first time
enum st_states {INIT, IDLE, PENDING, READY_LAUNCH, LAUNCHED}; //stream states
enum gpu_states {FREE, BUSY}; //gpu states

// this mutex is required to protect shared stream and GPU states
// (all threads of the process share these globals)
pthread_mutex_t sched_lock = PTHREAD_MUTEX_INITIALIZER;

/* Per-stream bookkeeping. One entry per user-defined CUDA stream; there is
 * a one-to-one relationship between program threads and streams, so the
 * owning thread's Linux TID identifies the stream. */
struct stream {
	pid_t thread; //the tid (Linux thread id) of the thread "owning" the stream
	void *stream; //the CUDA runtime pointer of the stream (not currently used).
	int priority; //stream priority from cudaCreateStreamPriority() or 0.
	enum st_states state; //current stream state
	int blocks; //number of blocks in kernel ready to launch
	int block_threads; //number of threads per block
	int look_count; //for use in policy algorithms concerned with starvation
	pthread_mutex_t st_lock; //required for using condition wait/signal on stream
	pthread_cond_t st_cond; //condition variable for thread/stream block/signal
} Stream[MAX_STREAMS]; //a thread/stream is identified by an index (str_idx) in the array.

int stream_count = 0; //number of streams that have been created
int next = 0; //index of stream that can launch the next kernel

/* Aggregate GPU occupancy state shared by all threads. */
struct gpu {
	enum gpu_states GPU_state; //current GPU state
	int threads_occupied; //total threads allocated over both SMs
	int kernels_dispatched; //number of kernels currently dispatched to SMs
	int streams[MAX_STREAMS]; //for kernel executing, its thread/stream tid, else 0
	int stream_threads[MAX_STREAMS]; //for kernel executing, its allocated threads, else 0
};

struct gpu GPU;

/* Forward declarations for the scheduling mechanism (and the policy entry
 * point find_best_kernel(), implemented by the #included policy header). */
void free_gpu_threads(pid_t my_tid, int str_idx);
void gpu_exit(pid_t my_tid, int str_idx);
void alloc_gpu_threads(pid_t my_tid, int str_idx);
void gpu_run(pid_t my_tid, int str_idx);
int find_best_kernel(void);
void ready_launch(pid_t my_tid);
void schedule_next(pid_t my_tid);
void dispatch_next(int this_one, pid_t my_tid);
int get_stream(pid_t my_tid);
void show_gpu_state(void);
void show_stream_state(int this_one);
186 | |||
187 | |||
188 | //Include here the .h file containing the scheduling policy implementation | ||
//in function: int find_best_kernel(void)
190 | #include "MinFitMinIntfR2.h" | ||
191 | |||
/* Function called from library wrapper of cudaDeviceReset().
 * If any entries have been made in a trace of scheduling decisions
 * made by find_best_kernel(), they are formatted and written to
 * stderr (the original comment said stdout, but the code has always
 * used stderr). One line per decision: entry index, type tag, chosen
 * stream index, then a [stream, threads] pair for each stream slot.
 */
void tracePrint(void)
{
	int i, j;
	for (i = 0; i < trc_idx; i++) {
		fprintf(stderr, "%d %s %d ", i, SchedTrace[i].type, SchedTrace[i].next);
		for (j = 0; j < MAX_STREAMS; j++) {
			fprintf(stderr, "[%d, %d] ", SchedTrace[i].stream[j], SchedTrace[i].stream_threads[j]);
		}
		fprintf(stderr, "\n");
	}
}
208 | |||
209 | /* Function called from library wrapper of cudaStreamCreateXXXX(). | ||
210 | * The stream structure at the current index into the Stream structure | ||
211 | * array is initialized. The GPU state for the created stream is | ||
212 | * also initialized. Each invocation creates a new index | ||
213 | * by incrementing stream_count. On the first invocation, the part of | ||
214 | * the GPU structure not specific to a stream is also initialized. | ||
215 | */ | ||
216 | void streamInit(pid_t my_tid, int priority) | ||
217 | //my_tid is the thread creating a user-defined stream | ||
218 | { | ||
219 | //WARNING: any flags are ignored. | ||
220 | |||
221 | //printf("cudaStreamCreate TID %d\n", my_tid); | ||
222 | //fflush(stdout); | ||
223 | |||
224 | pthread_mutex_lock(&sched_lock); | ||
225 | |||
226 | Stream[stream_count].thread = my_tid; //stream identified by tid of creating thread | ||
227 | Stream[stream_count].priority = priority; | ||
228 | Stream[stream_count].state = INIT; | ||
229 | Stream[stream_count].look_count = 0; | ||
230 | |||
231 | //stream mutex and condition variable initialized to free | ||
232 | pthread_mutex_init(&Stream[stream_count].st_lock, NULL); | ||
233 | pthread_cond_init(&Stream[stream_count].st_cond, NULL); | ||
234 | |||
235 | //initialize GPU state for this newly created stream | ||
236 | GPU.streams[stream_count] = 0; //no kernel from stream running | ||
237 | GPU.stream_threads[stream_count] = 0; //no threads allocated | ||
238 | |||
239 | stream_count += 1; //increment stream index | ||
240 | |||
241 | if (Initialized == 0) { //initialize GPU state on first stream create | ||
242 | GPU.threads_occupied = 0; | ||
243 | GPU.kernels_dispatched = 0; | ||
244 | GPU.GPU_state = FREE; | ||
245 | |||
246 | //tr_ptr = (struct schedTrace *)mem_ptr; | ||
247 | Initialized = 1; | ||
248 | } | ||
249 | |||
250 | pthread_mutex_unlock(&sched_lock); | ||
251 | } | ||
252 | |||
253 | /* Function called from the library wrapper of cudaConfigureCall() | ||
254 | * generated from the <<<.....>>> kernel launch statement in the | ||
255 | * CUDA program. The stream state for the stream is initialized | ||
256 | * with the block and threads/block counts for the kernel. | ||
257 | */ | ||
258 | void schedConfCall(pid_t my_tid, void *stream, int blocks, int threads) | ||
259 | //my_tid is the thread/stream attempting to launch | ||
260 | { | ||
261 | int str_idx; | ||
262 | |||
263 | pthread_mutex_lock(&sched_lock); | ||
264 | |||
265 | //printf("cudaConfigureCall TID %d stream %p blocks %d threads %d\n", | ||
266 | // my_tid, stream, blocks, threads); | ||
267 | //fflush(stdout); | ||
268 | |||
269 | // get the stream array index for the thread that "owns" this stream | ||
270 | str_idx = get_stream(my_tid); | ||
271 | |||
272 | //initialize state for the kernel that the thread is launching | ||
273 | Stream[str_idx].state = PENDING; //call configured but not launched | ||
274 | Stream[str_idx].blocks = blocks; //total blocks in the kernel | ||
275 | Stream[str_idx].block_threads = threads; //total threads per block | ||
276 | |||
277 | pthread_mutex_unlock(&sched_lock); | ||
278 | } | ||
279 | |||
280 | /* Function called from the library wrapper of cudaLaunch() | ||
281 | * generated from the <<<.....>>> kernel launch statement in the | ||
282 | * CUDA program. The stream state for the stream is changed to | ||
283 | * show that the kernel is ready for launching. The utility | ||
284 | * function ready_launch() is called. On return from ready_launch() | ||
285 | * this function returns to the wrapper which then invokes the | ||
286 | * "real" CUDA launch. The return from ready_launch() is | ||
287 | * immediate in the case the scheduler determines that this kernel | ||
288 | * can be launched. The call may instead result in blocking | ||
289 | * the thread if the scheduler determines that the launch should | ||
290 | * be deferred. When the blocking condition is signalled by the | ||
291 | * scheduler, ready_launch() then returns to this function. | ||
292 | */ | ||
293 | void schedLaunch(pid_t my_tid) {//my_tid is the thread/stream attempting to launch | ||
294 | int str_idx; | ||
295 | |||
296 | pthread_mutex_lock(&sched_lock); | ||
297 | |||
298 | //printf("cudaLaunch TID %d\n", my_tid); | ||
299 | //fflush(stdout); | ||
300 | |||
301 | // get the stream array index for the thread that "owns" this stream | ||
302 | str_idx = get_stream(my_tid); | ||
303 | |||
304 | Stream[str_idx].state = READY_LAUNCH; //kernel can be considered for scheduling | ||
305 | |||
306 | //printf("TID %d Ready, Blocks %d Threads %d\n", my_tid, | ||
307 | // Stream[str_idx].blocks, Stream[str_idx].block_threads); | ||
308 | //fflush(stdout); | ||
309 | |||
310 | // ready_launch() is called with the sched_lock still held. The function will | ||
311 | // either block (and the thread will run when signaled) or will return | ||
312 | // immediately. In either case, (a) the lock will have been unlocked, and | ||
313 | // (b) the kernal will be launched by the "real" CUDA launch. | ||
314 | |||
315 | ready_launch(my_tid); // thread/stream will launch on return or after blocking | ||
316 | } | ||
317 | |||
318 | |||
319 | /* Utility function called from schedLaunch(). It invokes the scheduling policy | ||
320 | * function, find_best_kernel(), one or more times to determine which, if any, | ||
321 | * streams have a kernel that is ready to launch and should be launched. For | ||
322 | * streams, other than the one that invoked schedLaunch(), having kernels that | ||
323 | * should launch, the utility function dispatch_next() is called to unblock | ||
324 | * their owning threads. If the stream owned by the calling thread has a kernel | ||
325 | * to launch, the sched_lock is released and the function just returns. If the | ||
326 | * kernel in the stream of the calling thread is to be deferred, sched_lock is | ||
327 | * released and the thread blocks with a pthread_cond_wait on its stream condition. | ||
328 | */ | ||
329 | void ready_launch(pid_t my_tid) { | ||
330 | //my_tid is the thread/stream attempting to launch | ||
331 | int str_idx, rc; | ||
332 | int this_one; | ||
333 | int will_block; | ||
334 | |||
335 | //Must be called with sched_lock held. It must release the lock before | ||
336 | //returning or blocking and then returning. | ||
337 | |||
338 | // get the stream array index for the thread that "owns" this stream | ||
339 | str_idx = get_stream(my_tid); | ||
340 | |||
341 | will_block = 1; // will not block if a kernel scheduled on this stream | ||
342 | |||
343 | do { | ||
344 | /* | ||
345 | if (TRACE_ON) { | ||
346 | printf("TID %d find new kernel on Launch\n", my_tid); | ||
347 | fflush(stdout); | ||
348 | } | ||
349 | */ | ||
350 | //call the scheduling policy function. It returns a stream index for | ||
351 | //a stream in the READY_LAUNCH state with a kernel to be launched now | ||
352 | //(returns -1 if none found) | ||
353 | this_one = find_best_kernel(); | ||
354 | |||
355 | if (this_one == str_idx) {//kernel from calling thread can launch | ||
356 | will_block = 0; // no block, just return | ||
357 | alloc_gpu_threads(my_tid, str_idx); //set up GPU state to launch | ||
358 | gpu_run(my_tid, str_idx); | ||
359 | Stream[str_idx].state = LAUNCHED; //kernel has been scheduled | ||
360 | } | ||
361 | else {//kernel from a different thread/stream should be launched | ||
362 | if (this_one >= 0) | ||
363 | dispatch_next(this_one, my_tid); //set state and signal | ||
364 | } | ||
365 | } while (this_one >= 0); // -1 indicates no more kernel launches now | ||
366 | |||
367 | //Must unlock so calling thread can return or block | ||
368 | pthread_mutex_unlock(&sched_lock); | ||
369 | |||
370 | if (will_block == 0) | ||
371 | return; //allows launch from calling thread to take place | ||
372 | |||
373 | // thread/stream must block until scheduler indicates its kernel can launch | ||
374 | rc = pthread_mutex_lock(&Stream[str_idx].st_lock); | ||
375 | if (rc != 0) { | ||
376 | fprintf(stderr, "TID %d Failed - Locking Stream Mutex\n", my_tid); | ||
377 | exit (-1); | ||
378 | } | ||
379 | rc = pthread_cond_wait(&Stream[str_idx].st_cond, &Stream[str_idx].st_lock); | ||
380 | if (rc != 0) { | ||
381 | fprintf(stderr, "TID %d Failed - Waiting Stream Condition\n", my_tid); | ||
382 | exit (-1); | ||
383 | } | ||
384 | rc = pthread_mutex_unlock(&Stream[str_idx].st_lock); | ||
385 | if (rc != 0) { | ||
386 | fprintf(stderr, "TID %d Failed - Unlocking Stream Mutex\n", my_tid); | ||
387 | exit (-1); | ||
388 | } | ||
389 | } | ||
390 | |||
391 | /* Utility function called from ready_launch() and schedule_next() to set | ||
392 | * state and signal a blocked thread/stream so it can execute the "real" | ||
393 | * CUDA launch. | ||
394 | */ | ||
395 | void dispatch_next(int this_one, pid_t my_tid) {//my_tid is calling thread/stream | ||
396 | //this_one is the stream index of the stream to launch a kernel | ||
397 | pid_t new_tid; | ||
398 | int rc; | ||
399 | |||
400 | //Must be called with sched_lock held; will be unlocked by caller | ||
401 | |||
402 | //new_tid is the thread/stream that has been scheduled for kernel launch | ||
403 | new_tid = Stream[this_one].thread; | ||
404 | |||
405 | alloc_gpu_threads(new_tid, this_one); //set up GPU state for launch | ||
406 | gpu_run(new_tid, this_one); | ||
407 | Stream[this_one].state = LAUNCHED; //kernel has been scheduled | ||
408 | |||
409 | //signal the blocked thread/stream so it can execute the "real" launch | ||
410 | rc = pthread_mutex_lock(&Stream[this_one].st_lock); | ||
411 | if (rc != 0) { | ||
412 | fprintf(stderr, "TID %d Failed - Locking Stream Mutex\n", my_tid); | ||
413 | exit (-1); | ||
414 | } | ||
415 | rc = pthread_cond_signal(&Stream[this_one].st_cond); | ||
416 | if (rc != 0) { | ||
417 | fprintf(stderr, "TID %d Failed - Signaling Stream Condition\n", my_tid); | ||
418 | exit (-1); | ||
419 | } | ||
420 | rc = pthread_mutex_unlock(&Stream[this_one].st_lock); | ||
421 | if (rc != 0) { | ||
422 | fprintf(stderr, "TID %d Failed - Unlocking Stream Mutex\n", my_tid); | ||
423 | exit (-1); | ||
424 | } | ||
425 | } | ||
426 | |||
/* Function called from library wrapper of cudaStreamSynchronize().
 * This CUDA function provides an essential notification that a kernel has
 * completed execution on the GPU. The CUDA program must be written so
 * that it synchronizes the CPU and GPU with at least one call to
 * cudaStreamSynchronize() between successive instances of kernel
 * launches on a given stream.
 *
 * The function sets the stream and GPU state to reflect the kernel's
 * completion which frees GPU resources for use to execute a new
 * kernel. It then calls the utility function, schedule_next() to
 * schedule launches of any kernels the scheduling policy determines should
 * be eligible to run now.
 *
 * my_tid is the thread/stream synchronizing the CPU with a GPU kernel
 * completion; stream is accepted for wrapper compatibility only.
 */
void schedSync(pid_t my_tid, void *stream) {
    int str_idx;

    pthread_mutex_lock(&sched_lock);

    //printf("cudaStreamSynchronize TID %d stream %p\n", my_tid, stream);
    //fflush(stdout);

    // get the stream array index for the thread that "owns" this stream
    str_idx = get_stream(my_tid);

    // if the stream is idle (does not have a kernel being executed), the
    // call is not related to kernel execution (e.g., is for an asynchronous
    // cudaMemcpy). It can be ignored.
    if (Stream[str_idx].state == IDLE) {
        // unlock for return
        pthread_mutex_unlock(&sched_lock);
        return;
    }
    // still holding sched_lock here
    // ORDER MATTERS: free_gpu_threads() computes the thread count from
    // Stream[str_idx].blocks/block_threads, so it must run BEFORE those
    // fields are zeroed below.
    free_gpu_threads(my_tid, str_idx); //set up GPU state for kernel completion
    gpu_exit(my_tid, str_idx);

    Stream[str_idx].state = IDLE; //set up stream state for kernel completion
    Stream[str_idx].blocks = 0;
    Stream[str_idx].block_threads = 0;

    // schedule_next is called with sched_lock held. The function must
    // release it before returning.
    schedule_next(my_tid); // which, if any, thread/stream should launch now?
}
472 | |||
473 | /* Utility function called from schedSynch(). It invokes the scheduling policy | ||
474 | * function, find_best_kernel(), one or more times to determine which, if any, | ||
475 | * streams have a kernel that is ready to launch and should be launched. For | ||
476 | * streams having kernels that should launch, the utility function dispatch_next() | ||
477 | * is called to unblock their owning threads. Note that the calling thread/stream | ||
478 | * cannot have a kernel to schedule until it executes another launch. | ||
479 | */ | ||
480 | void schedule_next(pid_t my_tid) { | ||
481 | //my_tid is the thread/stream synchronizing the CPU with a GPU kernel completion | ||
482 | |||
483 | int this_one; | ||
484 | |||
485 | //Must be called with sched_lock held. It must release the lock before | ||
486 | //returning. | ||
487 | |||
488 | do { | ||
489 | /* | ||
490 | if (TRACE_ON) { | ||
491 | printf("TID %d find new kernel on Sync\n", my_tid); | ||
492 | fflush(stdout); | ||
493 | } | ||
494 | */ | ||
495 | |||
496 | //call the scheduling policy function. It returns a stream index for | ||
497 | //a stream in the READY_LAUNCH state with a kernel to be launched now | ||
498 | //(returns -1 if none found) | ||
499 | this_one = find_best_kernel(); | ||
500 | |||
501 | if (this_one >= 0) | ||
502 | dispatch_next(this_one, my_tid); | ||
503 | } while (this_one >= 0); // -1 indicates no more kernel launches now | ||
504 | |||
505 | pthread_mutex_unlock(&sched_lock); | ||
506 | } | ||
507 | |||
508 | /* Utility function called from schedSync() to free GPU threads for a | ||
509 | * completed kernel. | ||
510 | */ | ||
511 | void free_gpu_threads(pid_t my_tid, int str_idx) { | ||
512 | //str_idx is the stream index of the stream with a completed kernel | ||
513 | int alloc_threads; | ||
514 | |||
515 | //Must be called with sched_lock held | ||
516 | |||
517 | //see alloc_gpu_threads() for a description of thread allocations | ||
518 | alloc_threads = min(MAX_GPU_THREADS, Stream[str_idx].blocks * Stream[str_idx].block_threads); | ||
519 | GPU.threads_occupied -= alloc_threads; | ||
520 | if (GPU.threads_occupied < 0) { | ||
521 | fprintf(stderr, "TID %d Failed - GPU Threads < 0\n", my_tid); | ||
522 | exit (-1); | ||
523 | } | ||
524 | } | ||
525 | |||
526 | /* Utility function called from schedSync() to set GPU stream state for | ||
527 | * a completed kernel. | ||
528 | */ | ||
529 | void gpu_exit(pid_t my_tid, int str_idx) { | ||
530 | //str_idx is the stream index of the stream with a completed kernel | ||
531 | |||
532 | //Must be called with sched_lock held | ||
533 | |||
534 | //printf("GPU Kernel End %d Threads\n",GPU.stream_threads[str_idx]); | ||
535 | |||
536 | GPU.streams[str_idx] = 0; | ||
537 | GPU.stream_threads[str_idx] = 0; | ||
538 | |||
539 | GPU.kernels_dispatched -= 1; | ||
540 | if (GPU.kernels_dispatched < 0) { | ||
541 | fprintf(stderr, "TID %d Failed - GPU Kernels < 0\n", my_tid); | ||
542 | exit (-1); | ||
543 | } | ||
544 | |||
545 | } | ||
546 | |||
547 | /* Utility function called from ready_launch() and dispatch_next() to | ||
548 | * allocate GPU threads for a kernel scheduled for launching. The total | ||
549 | * number of threads required by the kernel is computed as the number | ||
550 | * of blocks in the kernel * the number of threads per block. If the | ||
551 | * total threads is >= MAX_GPU_THREADS, the number of allocated threads | ||
552 | * on the GPU is set to MAX_GPU_THREADS so all GPU threads are occupied | ||
553 | * until the kernel completes. | ||
554 | * NOTE: Once a kernel is launched that occupies all the GPU threads, | ||
555 | * no additional kernels can launch until that kernel completes. | ||
556 | * This prevents the GPU from concurrently executing the last blocks of | ||
557 | * a current kernel with the first blocks of a newly dispatched kernel. | ||
558 | */ | ||
559 | void alloc_gpu_threads(pid_t my_tid, int str_idx) { | ||
560 | //str_idx is the stream index of the stream with a scheduled kernel | ||
561 | |||
562 | int alloc_threads; | ||
563 | |||
564 | //Must be called with sched_lock held | ||
565 | |||
566 | alloc_threads = min(MAX_GPU_THREADS, Stream[str_idx].blocks * Stream[str_idx].block_threads); | ||
567 | GPU.threads_occupied += alloc_threads; | ||
568 | if (GPU.threads_occupied > MAX_GPU_THREADS) { | ||
569 | fprintf(stderr, "TID %d Failed - GPU Threads Exceeded\n", my_tid); | ||
570 | exit (-1); | ||
571 | } | ||
572 | } | ||
573 | |||
574 | /* Utility function called from ready_launch() and dispatch_next() to set | ||
575 | * stream-related and kernel-dispatch state on the GPU for a kernel | ||
576 | * scheduled for launching. | ||
577 | */ | ||
578 | void gpu_run(pid_t my_tid, int str_idx) { | ||
579 | //str_idx is the stream index of the stream with a scheduled kernel | ||
580 | |||
581 | |||
582 | //Must be called with sched_lock held | ||
583 | |||
584 | GPU.streams[str_idx] = my_tid; | ||
585 | |||
586 | //see alloc_gpu_threads for a description of thread allocations | ||
587 | GPU.stream_threads[str_idx] = min(MAX_GPU_THREADS, | ||
588 | Stream[str_idx].blocks * Stream[str_idx].block_threads); | ||
589 | /* | ||
590 | int i; | ||
591 | |||
592 | if (TRACE_ON) { | ||
593 | printf("GPU Thread Blocks [ "); | ||
594 | for (i = 0; i < stream_count; i++) { | ||
595 | if (GPU.stream_threads[i] != 0) | ||
596 | printf("%d ", GPU.stream_threads[i]); | ||
597 | } | ||
598 | printf("]\n"); | ||
599 | } | ||
600 | */ | ||
601 | |||
602 | GPU.kernels_dispatched += 1; | ||
603 | if (GPU.kernels_dispatched > stream_count) { | ||
604 | fprintf(stderr, "TID %d Failed - GPU Kernels > streams\n", my_tid); | ||
605 | exit (-1); | ||
606 | } | ||
607 | } | ||
608 | |||
609 | /* Utility function used in multiple functions to find the index in the | ||
610 | * stream array for the stream owned by the thread with TID of my_tid. | ||
611 | * The stream must have been previously created (cudaStreamCreate()). | ||
612 | */ | ||
613 | int get_stream(pid_t my_tid) | ||
614 | { | ||
615 | //Must be called with sched_lock held | ||
616 | |||
617 | int i; | ||
618 | |||
619 | for (i = 0; i < MAX_STREAMS; i++) { | ||
620 | if (Stream[i].thread == my_tid) | ||
621 | break; | ||
622 | } | ||
623 | if (i == MAX_STREAMS) { | ||
624 | fprintf(stderr, "TID %d Failed - get_stream()", my_tid); | ||
625 | exit (-1); | ||
626 | } | ||
627 | return i; | ||
628 | } | ||
629 | |||