author     Joshua Bakita <bakitajoshua@gmail.com>  2020-10-15 20:44:33 -0400
committer  Joshua Bakita <bakitajoshua@gmail.com>  2020-10-15 20:44:33 -0400
commit     9e82e2c7cca65a8eb60d5bd99da66241c01a2991 (patch)
tree       f5ac2263d40995c09a3ae656f81d860eac3f658c
Import GPU scheduler code from 2019 GM deliverable
Code provided by Don Smith via tar file. `.gitignore` added by me.
-rw-r--r--   .gitignore                      1
-rw-r--r--   Makefile                       13
-rw-r--r--   MinFitMinIntfR2.h             120
-rw-r--r--   SoftwareDocumentation.docx    bin 0 -> 105301 bytes
-rw-r--r--   libcudart_wrapper.c          3343
-rw-r--r--   schedAPI.h                     10
-rw-r--r--   schedLib.c                    629
7 files changed, 4116 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..be747da
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
libcudart_wrapper.so
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..15fe29a
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,13 @@
CC := gcc
CCFLAGS := -Wall -fPIC -shared
LDFLAGS := -ldl -lpthread -lrt
INCLUDES := -I/usr/local/cuda/include/

################################################################################

all: build

build: libcudart_wrapper.so

libcudart_wrapper.so: schedLib.c libcudart_wrapper.c
	$(CC) $(CCFLAGS) $(INCLUDES) -o libcudart_wrapper.so schedLib.c libcudart_wrapper.c $(LDFLAGS)
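
The Makefile produces a single preload library, libcudart_wrapper.so, linked against libdl, libpthread, and librt. Per the header comment in libcudart_wrapper.c below, the library is meant to be interposed ahead of the real CUDA runtime at program start, e.g. `LD_PRELOAD=./libcudart_wrapper.so ./my_cuda_app` (the application name here is only illustrative).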
diff --git a/MinFitMinIntfR2.h b/MinFitMinIntfR2.h
new file mode 100644
index 0000000..819b2b4
--- /dev/null
+++ b/MinFitMinIntfR2.h
@@ -0,0 +1,120 @@
/* Scheduling policy function that implements a "min thread use, min interference" policy,
 * i.e., find the ready-to-launch kernel that will occupy the smallest number of available
 * GPU threads AND does not fail a test for interference effects. The test for
 * interference effects requires that the ratio between the number of threads in the
 * kernel under consideration and any kernel already scheduled does not exceed a
 * threshold (in this implementation, 2.0). This test is motivated by empirical
 * measurements that have shown interference effects of 500% or higher for
 * large thread ratios between concurrently executing kernels. This is thought
 * to be an artifact of the undocumented warp scheduling algorithm in the NVIDIA SMs.
 */

//put any global (static) declarations here:


#define MAX_THREAD_RATIO 2.0 // Threshold ratio between scheduled and new kernel


int find_best_kernel(void) {
    int i;
    int this_one = -1; //default return value indicating no kernel to launch
    int need_threads, available_threads, left_over;
    int k;
    int occupied_threads[MAX_STREAMS]; //GPU threads allocated to scheduled kernels

    //Must be called with sched_lock held


    //record the allocated GPU threads in the kernel scheduled for each stream
    for (i = 0; i < stream_count; i++)
        occupied_threads[i] = GPU.stream_threads[i];

    //GPU threads available for allocation
    available_threads = (MAX_GPU_THREADS - GPU.threads_occupied);
    left_over = -1; //the number of threads left available if a kernel is scheduled

    for (i = 0; i < stream_count; i++) { //examine all streams
        if (Stream[i].state == READY_LAUNCH) { //only threads/streams ready to launch are considered

            //determine how many threads would be allocated for this kernel (see
            //allocate_gpu_threads() for a description)
            need_threads = min(MAX_GPU_THREADS, Stream[i].blocks * Stream[i].block_threads);
            if (need_threads > available_threads) //can't be scheduled
                continue;

            //find the kernel with the smallest thread allocation that does not create thread imbalance
            //?? should there be a starvation-prevention part of this policy ??

            if ((available_threads - need_threads) > left_over) {
                //found kernel with smallest thread allocation so far
                //compute and test the ratios of threads between it and all kernels scheduled

                for (k = 0; k < stream_count; k++) { //examine all streams
                    if (occupied_threads[k] == 0) //stream has no kernel scheduled
                        continue;
                    //if the test fails for any already scheduled kernel, this stream can't launch
                    if ((float)occupied_threads[k] / (float)need_threads > MAX_THREAD_RATIO)
                        break;
                    if ((float)need_threads / (float)occupied_threads[k] > MAX_THREAD_RATIO)
                        break;
                }
                //if the test is passed for all scheduled kernels, this stream's kernel can launch
                if (k == stream_count) {
                    this_one = i; //the final value of this_one is the stream index to schedule (or -1)
                    left_over = available_threads - need_threads; //largest left-over so far, i.e., smallest allocation
                }
            } //end test for smaller thread allocation
        } //end test for stream ready to launch
    } //end outer for loop

    if (TRACE_ON) {
        show_gpu_state();
        show_stream_state(this_one);
    }
    return this_one; //the scheduling decision (stream index)
}

// Utility function to trace GPU state used in scheduling policy decisions
void show_gpu_state(void) {

    //Must be called with sched_lock held

    int i;
    if (trc_idx >= MAX_SCHED_TRACE)
        return;

    for (i = 0; i < MAX_STREAMS; i++) {
        SchedTrace[trc_idx].stream[i] = GPU.streams[i];
        SchedTrace[trc_idx].stream_threads[i] = GPU.stream_threads[i];
        SchedTrace[trc_idx].next = 0;
        strcpy(SchedTrace[trc_idx].type, "GPU");
    }
    trc_idx++;
}

// Utility function to trace stream state used in scheduling policy decisions
void show_stream_state(int this_one) {

    //Must be called with sched_lock held

    int i;
    int need_threads;
    if (trc_idx >= MAX_SCHED_TRACE)
        return;

    for (i = 0; i < MAX_STREAMS; i++) {
        need_threads = min(MAX_GPU_THREADS, Stream[i].blocks * Stream[i].block_threads);
        if ((Stream[i].state != READY_LAUNCH) &&
            (Stream[i].state != LAUNCHED))
            need_threads = -need_threads; //encode unschedulable state with a minus sign
        SchedTrace[trc_idx].stream[i] = Stream[i].thread;
        SchedTrace[trc_idx].stream_threads[i] = need_threads;
        if (this_one == -1)
            SchedTrace[trc_idx].next = this_one;
        else
            SchedTrace[trc_idx].next = Stream[this_one].thread;
        strcpy(SchedTrace[trc_idx].type, "STR");
    }
    trc_idx++;
}

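To make the interference test in find_best_kernel() above concrete, here is a small standalone sketch (not part of the delivered code; all values are hypothetical). With MAX_THREAD_RATIO = 2.0, a candidate kernel needing 1024 threads is compatible with already-scheduled kernels occupying anywhere from 512 to 2048 threads, and is rejected when any scheduled kernel falls outside that range:

/* Standalone sketch of the thread-ratio interference test used by
 * find_best_kernel(); hypothetical values, illustration only. */
#include <stdio.h>

#define MAX_THREAD_RATIO 2.0

/* Returns 1 if a kernel needing need_threads may run alongside a kernel
 * already occupying 'occupied' threads, 0 if the ratio test fails. */
static int ratio_ok(int need_threads, int occupied)
{
    if ((float)occupied / (float)need_threads > MAX_THREAD_RATIO)
        return 0;
    if ((float)need_threads / (float)occupied > MAX_THREAD_RATIO)
        return 0;
    return 1;
}

int main(void)
{
    int need = 1024;
    int scheduled[] = { 512, 2048, 256, 4096 };
    for (int k = 0; k < 4; k++)
        printf("need=%d vs scheduled=%d -> %s\n", need, scheduled[k],
               ratio_ok(need, scheduled[k]) ? "compatible" : "rejected");
    return 0;  /* prints compatible, compatible, rejected, rejected */
}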
diff --git a/SoftwareDocumentation.docx b/SoftwareDocumentation.docx
new file mode 100644
index 0000000..903706f
--- /dev/null
+++ b/SoftwareDocumentation.docx
Binary files differ
diff --git a/libcudart_wrapper.c b/libcudart_wrapper.c
new file mode 100644
index 0000000..8e4f005
--- /dev/null
+++ b/libcudart_wrapper.c
@@ -0,0 +1,3343 @@
/* Wrapper functions to implement transparent extension of the CUDA runtime library
 * (libcudart) by dynamically linking this set of function interfaces ahead of the
 * "real" library using LD_PRELOAD. Calls to library extension functions
 * can be embedded in these wrapper functions. The specific calls used in the
 * scheduling "middleware" are defined in the include file schedAPI.h below.
 *
 * WARNING - Do not change this file unless you are totally sure you know what you
 * are doing!
 *
 */

#define _GNU_SOURCE

#include <stdio.h>
#include <dlfcn.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include "cuda_runtime_api.h"
#include "schedAPI.h" //function prototypes for calls to library extensions
21
22static __host__ cudaError_t CUDARTAPI (*orig_cudaDeviceReset)(void) = NULL;
23
24__host__ cudaError_t CUDARTAPI cudaDeviceReset(void) {
25 cudaError_t ret;
26 // Write your own custom c code in the cudaDeviceReset.c file
27 tracePrint();
28 ret = orig_cudaDeviceReset();
29 return ret;
30
31}
32
33static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaDeviceSynchronize)(void) = NULL;
34
35__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceSynchronize(void) {
36 cudaError_t ret;
37 // Write your own custom c code in the cudaDeviceSynchronize.c file
38 ret = orig_cudaDeviceSynchronize();
39 return ret;
40
41}
42
43static __host__ cudaError_t CUDARTAPI (*orig_cudaDeviceSetLimit)(enum cudaLimit limit, size_t value) = NULL;
44
45__host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit, size_t value) {
46 cudaError_t ret;
47 // Write your own custom c code in the cudaDeviceSetLimit.c file
48 ret = orig_cudaDeviceSetLimit(limit, value);
49 return ret;
50
51}
52
53static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaDeviceGetLimit)(size_t *pValue, enum cudaLimit limit) = NULL;
54
55__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit) {
56 cudaError_t ret;
57 // Write your own custom c code in the cudaDeviceGetLimit.c file
58 ret = orig_cudaDeviceGetLimit(pValue, limit);
59 return ret;
60
61}
62
63static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaDeviceGetCacheConfig)(enum cudaFuncCache *pCacheConfig) = NULL;
64
65__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig) {
66 cudaError_t ret;
67 // Write your own custom c code in the cudaDeviceGetCacheConfig.c file
68 ret = orig_cudaDeviceGetCacheConfig(pCacheConfig);
69 return ret;
70
71}
72
73static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaDeviceGetStreamPriorityRange)(int *leastPriority, int *greatestPriority) = NULL;
74
75__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetStreamPriorityRange(int *leastPriority, int *greatestPriority) {
76 cudaError_t ret;
77 // Write your own custom c code in the cudaDeviceGetStreamPriorityRange.c file
78 ret = orig_cudaDeviceGetStreamPriorityRange(leastPriority, greatestPriority);
79 return ret;
80
81}
82
83static __host__ cudaError_t CUDARTAPI (*orig_cudaDeviceSetCacheConfig)(enum cudaFuncCache cacheConfig) = NULL;
84
85__host__ cudaError_t CUDARTAPI cudaDeviceSetCacheConfig(enum cudaFuncCache cacheConfig) {
86 cudaError_t ret;
87 // Write your own custom c code in the cudaDeviceSetCacheConfig.c file
88 ret = orig_cudaDeviceSetCacheConfig(cacheConfig);
89 return ret;
90
91}
92
93static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaDeviceGetSharedMemConfig)(enum cudaSharedMemConfig *pConfig) = NULL;
94
95__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig) {
96 cudaError_t ret;
97 // Write your own custom c code in the cudaDeviceGetSharedMemConfig.c file
98 ret = orig_cudaDeviceGetSharedMemConfig(pConfig);
99 return ret;
100
101}
102
103static __host__ cudaError_t CUDARTAPI (*orig_cudaDeviceSetSharedMemConfig)(enum cudaSharedMemConfig config) = NULL;
104
105__host__ cudaError_t CUDARTAPI cudaDeviceSetSharedMemConfig(enum cudaSharedMemConfig config) {
106 cudaError_t ret;
107 // Write your own custom c code in the cudaDeviceSetSharedMemConfig.c file
108 ret = orig_cudaDeviceSetSharedMemConfig(config);
109 return ret;
110
111}
112
113static __host__ cudaError_t CUDARTAPI (*orig_cudaDeviceGetByPCIBusId)(int *device, const char *pciBusId) = NULL;
114
115__host__ cudaError_t CUDARTAPI cudaDeviceGetByPCIBusId(int *device, const char *pciBusId) {
116 cudaError_t ret;
117 // Write your own custom c code in the cudaDeviceGetByPCIBusId.c file
118 ret = orig_cudaDeviceGetByPCIBusId(device, pciBusId);
119 return ret;
120
121}
122
123static __host__ cudaError_t CUDARTAPI (*orig_cudaDeviceGetPCIBusId)(char *pciBusId, int len, int device) = NULL;
124
125__host__ cudaError_t CUDARTAPI cudaDeviceGetPCIBusId(char *pciBusId, int len, int device) {
126 cudaError_t ret;
127 // Write your own custom c code in the cudaDeviceGetPCIBusId.c file
128 ret = orig_cudaDeviceGetPCIBusId(pciBusId, len, device);
129 return ret;
130
131}
132
133static __host__ cudaError_t CUDARTAPI (*orig_cudaIpcGetEventHandle)(cudaIpcEventHandle_t *handle, cudaEvent_t event) = NULL;
134
135__host__ cudaError_t CUDARTAPI cudaIpcGetEventHandle(cudaIpcEventHandle_t *handle, cudaEvent_t event) {
136 cudaError_t ret;
137 // Write your own custom c code in the cudaIpcGetEventHandle.c file
138 ret = orig_cudaIpcGetEventHandle(handle, event);
139 return ret;
140
141}
142
143static __host__ cudaError_t CUDARTAPI (*orig_cudaIpcOpenEventHandle)(cudaEvent_t *event, cudaIpcEventHandle_t handle) = NULL;
144
145__host__ cudaError_t CUDARTAPI cudaIpcOpenEventHandle(cudaEvent_t *event, cudaIpcEventHandle_t handle) {
146 cudaError_t ret;
147 // Write your own custom c code in the cudaIpcOpenEventHandle.c file
148 ret = orig_cudaIpcOpenEventHandle(event, handle);
149 return ret;
150
151}
152
153static __host__ cudaError_t CUDARTAPI (*orig_cudaIpcGetMemHandle)(cudaIpcMemHandle_t *handle, void *devPtr) = NULL;
154
155__host__ cudaError_t CUDARTAPI cudaIpcGetMemHandle(cudaIpcMemHandle_t *handle, void *devPtr) {
156 cudaError_t ret;
157 // Write your own custom c code in the cudaIpcGetMemHandle.c file
158 ret = orig_cudaIpcGetMemHandle(handle, devPtr);
159 return ret;
160
161}
162
163static __host__ cudaError_t CUDARTAPI (*orig_cudaIpcOpenMemHandle)(void **devPtr, cudaIpcMemHandle_t handle, unsigned int flags) = NULL;
164
165__host__ cudaError_t CUDARTAPI cudaIpcOpenMemHandle(void **devPtr, cudaIpcMemHandle_t handle, unsigned int flags) {
166 cudaError_t ret;
167 // Write your own custom c code in the cudaIpcOpenMemHandle.c file
168 ret = orig_cudaIpcOpenMemHandle(devPtr, handle, flags);
169 return ret;
170
171}
172
173static __host__ cudaError_t CUDARTAPI (*orig_cudaIpcCloseMemHandle)(void *devPtr) = NULL;
174
175__host__ cudaError_t CUDARTAPI cudaIpcCloseMemHandle(void *devPtr) {
176 cudaError_t ret;
177 // Write your own custom c code in the cudaIpcCloseMemHandle.c file
178 ret = orig_cudaIpcCloseMemHandle(devPtr);
179 return ret;
180
181}
182
183static __host__ cudaError_t CUDARTAPI (*orig_cudaThreadExit)(void) = NULL;
184
185__host__ cudaError_t CUDARTAPI cudaThreadExit(void) {
186 cudaError_t ret;
187 // Write your own custom c code in the cudaThreadExit.c file
188 ret = orig_cudaThreadExit();
189 return ret;
190
191}
192
193static __host__ cudaError_t CUDARTAPI (*orig_cudaThreadSynchronize)(void) = NULL;
194
195__host__ cudaError_t CUDARTAPI cudaThreadSynchronize(void) {
196 cudaError_t ret;
197 // Write your own custom c code in the cudaThreadSynchronize.c file
198 ret = orig_cudaThreadSynchronize();
199 return ret;
200
201}
202
203static __host__ cudaError_t CUDARTAPI (*orig_cudaThreadSetLimit)(enum cudaLimit limit, size_t value) = NULL;
204
205__host__ cudaError_t CUDARTAPI cudaThreadSetLimit(enum cudaLimit limit, size_t value) {
206 cudaError_t ret;
207 // Write your own custom c code in the cudaThreadSetLimit.c file
208 ret = orig_cudaThreadSetLimit(limit, value);
209 return ret;
210
211}
212
213static __host__ cudaError_t CUDARTAPI (*orig_cudaThreadGetLimit)(size_t *pValue, enum cudaLimit limit) = NULL;
214
215__host__ cudaError_t CUDARTAPI cudaThreadGetLimit(size_t *pValue, enum cudaLimit limit) {
216 cudaError_t ret;
217 // Write your own custom c code in the cudaThreadGetLimit.c file
218 ret = orig_cudaThreadGetLimit(pValue, limit);
219 return ret;
220
221}
222
223static __host__ cudaError_t CUDARTAPI (*orig_cudaThreadGetCacheConfig)(enum cudaFuncCache *pCacheConfig) = NULL;
224
225__host__ cudaError_t CUDARTAPI cudaThreadGetCacheConfig(enum cudaFuncCache *pCacheConfig) {
226 cudaError_t ret;
227 // Write your own custom c code in the cudaThreadGetCacheConfig.c file
228 ret = orig_cudaThreadGetCacheConfig(pCacheConfig);
229 return ret;
230
231}
232
233static __host__ cudaError_t CUDARTAPI (*orig_cudaThreadSetCacheConfig)(enum cudaFuncCache cacheConfig) = NULL;
234
235__host__ cudaError_t CUDARTAPI cudaThreadSetCacheConfig(enum cudaFuncCache cacheConfig) {
236 cudaError_t ret;
237 // Write your own custom c code in the cudaThreadSetCacheConfig.c file
238 ret = orig_cudaThreadSetCacheConfig(cacheConfig);
239 return ret;
240
241}
242
243static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaGetLastError)(void) = NULL;
244
245__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void) {
246 cudaError_t ret;
247 // Write your own custom c code in the cudaGetLastError.c file
248 ret = orig_cudaGetLastError();
249 return ret;
250
251}
252
253static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaPeekAtLastError)(void) = NULL;
254
255__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void) {
256 cudaError_t ret;
257 // Write your own custom c code in the cudaPeekAtLastError.c file
258 ret = orig_cudaPeekAtLastError();
259 return ret;
260
261}
262
263static __host__ __cudart_builtin__ const char* CUDARTAPI (*orig_cudaGetErrorName)(cudaError_t error) = NULL;
264
265__host__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error) {
266 const char* ret;
267 // Write your own custom c code in the cudaGetErrorName.c file
268 ret = orig_cudaGetErrorName(error);
269 return ret;
270
271}
272
273static __host__ __cudart_builtin__ const char* CUDARTAPI (*orig_cudaGetErrorString)(cudaError_t error) = NULL;
274
275__host__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error) {
276 const char* ret;
277 // Write your own custom c code in the cudaGetErrorString.c file
278 ret = orig_cudaGetErrorString(error);
279 return ret;
280
281}
282
283static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaGetDeviceCount)(int *count) = NULL;
284
285__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count) {
286 cudaError_t ret;
287 // Write your own custom c code in the cudaGetDeviceCount.c file
288 ret = orig_cudaGetDeviceCount(count);
289 return ret;
290
291}
292
293static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaGetDeviceProperties)(struct cudaDeviceProp *prop, int device) = NULL;
294
295__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device) {
296 cudaError_t ret;
297 // Write your own custom c code in the cudaGetDeviceProperties.c file
298 ret = orig_cudaGetDeviceProperties(prop, device);
299 return ret;
300
301}
302
303static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaDeviceGetAttribute)(int *value, enum cudaDeviceAttr attr, int device) = NULL;
304
305__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device) {
306 cudaError_t ret;
307 // Write your own custom c code in the cudaDeviceGetAttribute.c file
308 ret = orig_cudaDeviceGetAttribute(value, attr, device);
309 return ret;
310
311}
312
313static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaDeviceGetP2PAttribute)(int *value, enum cudaDeviceP2PAttr attr, int srcDevice, int dstDevice) = NULL;
314
315__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetP2PAttribute(int *value, enum cudaDeviceP2PAttr attr, int srcDevice, int dstDevice) {
316 cudaError_t ret;
317 // Write your own custom c code in the cudaDeviceGetP2PAttribute.c file
318 ret = orig_cudaDeviceGetP2PAttribute(value, attr, srcDevice, dstDevice);
319 return ret;
320
321}
322
323static __host__ cudaError_t CUDARTAPI (*orig_cudaChooseDevice)(int *device, const struct cudaDeviceProp *prop) = NULL;
324
325__host__ cudaError_t CUDARTAPI cudaChooseDevice(int *device, const struct cudaDeviceProp *prop) {
326 cudaError_t ret;
327 // Write your own custom c code in the cudaChooseDevice.c file
328 ret = orig_cudaChooseDevice(device, prop);
329 return ret;
330
331}
332
333static __host__ cudaError_t CUDARTAPI (*orig_cudaSetDevice)(int device) = NULL;
334
335__host__ cudaError_t CUDARTAPI cudaSetDevice(int device) {
336 cudaError_t ret;
337 // Write your own custom c code in the cudaSetDevice.c file
338 ret = orig_cudaSetDevice(device);
339 return ret;
340
341}
342
343static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaGetDevice)(int *device) = NULL;
344
345__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device) {
346 cudaError_t ret;
347 // Write your own custom c code in the cudaGetDevice.c file
348 ret = orig_cudaGetDevice(device);
349 return ret;
350
351}
352
353static __host__ cudaError_t CUDARTAPI (*orig_cudaSetValidDevices)(int *device_arr, int len) = NULL;
354
355__host__ cudaError_t CUDARTAPI cudaSetValidDevices(int *device_arr, int len) {
356 cudaError_t ret;
357 // Write your own custom c code in the cudaSetValidDevices.c file
358 ret = orig_cudaSetValidDevices(device_arr, len);
359 return ret;
360
361}
362
363static __host__ cudaError_t CUDARTAPI (*orig_cudaSetDeviceFlags)( unsigned int flags ) = NULL;
364
365__host__ cudaError_t CUDARTAPI cudaSetDeviceFlags( unsigned int flags ) {
366 cudaError_t ret;
367 // Write your own custom c code in the cudaSetDeviceFlags.c file
368 ret = orig_cudaSetDeviceFlags(flags);
369 return ret;
370
371}
372
373static __host__ cudaError_t CUDARTAPI (*orig_cudaGetDeviceFlags)( unsigned int *flags ) = NULL;
374
375__host__ cudaError_t CUDARTAPI cudaGetDeviceFlags( unsigned int *flags ) {
376 cudaError_t ret;
377 // Write your own custom c code in the cudaGetDeviceFlags.c file
378 ret = orig_cudaGetDeviceFlags(flags);
379 return ret;
380
381}
382
383static __host__ cudaError_t CUDARTAPI (*orig_cudaStreamCreate)(cudaStream_t *pStream) = NULL;
384
385__host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream) {
386 cudaError_t ret;
387 pid_t my_tid = syscall(SYS_gettid);
388 streamInit(my_tid, 0);
389
390 // Write your own custom c code in the cudaStreamCreate.c file
391 ret = orig_cudaStreamCreate(pStream);
392 return ret;
393
394}
395
396static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaStreamCreateWithFlags)(cudaStream_t *pStream, unsigned int flags) = NULL;
397
398__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags) {
399 cudaError_t ret;
400 pid_t my_tid = syscall(SYS_gettid);
401 streamInit(my_tid, 0);
402
403 // Write your own custom c code in the cudaStreamCreateWithFlags.c file
404 ret = orig_cudaStreamCreateWithFlags(pStream, flags);
405 return ret;
406
407}
408
409static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaStreamCreateWithPriority)(cudaStream_t *pStream, unsigned int flags, int priority) = NULL;
410
411__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags, int priority) {
412 cudaError_t ret;
413 pid_t my_tid = syscall(SYS_gettid);
414 streamInit(my_tid, priority);
415
416 // Write your own custom c code in the cudaStreamCreateWithPriority.c file
417 ret = orig_cudaStreamCreateWithPriority(pStream, flags, priority);
418 return ret;
419
420}
421
422static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaStreamGetPriority)(cudaStream_t hStream, int *priority) = NULL;
423
424__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetPriority(cudaStream_t hStream, int *priority) {
425 cudaError_t ret;
426 // Write your own custom c code in the cudaStreamGetPriority.c file
427 ret = orig_cudaStreamGetPriority(hStream, priority);
428 return ret;
429
430}
431
432static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaStreamGetFlags)(cudaStream_t hStream, unsigned int *flags) = NULL;
433
434__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags) {
435 cudaError_t ret;
436 // Write your own custom c code in the cudaStreamGetFlags.c file
437 ret = orig_cudaStreamGetFlags(hStream, flags);
438 return ret;
439
440}
441
442static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaStreamDestroy)(cudaStream_t stream) = NULL;
443
444__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream) {
445 cudaError_t ret;
446 // Write your own custom c code in the cudaStreamDestroy.c file
447 ret = orig_cudaStreamDestroy(stream);
448 return ret;
449
450}
451
452static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaStreamWaitEvent)(cudaStream_t stream, cudaEvent_t event, unsigned int flags) = NULL;
453
454__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags) {
455 cudaError_t ret;
456 // Write your own custom c code in the cudaStreamWaitEvent.c file
457 ret = orig_cudaStreamWaitEvent(stream, event, flags);
458 return ret;
459
460}
461
462static __host__ cudaError_t CUDARTAPI (*orig_cudaStreamAddCallback)(cudaStream_t stream, cudaStreamCallback_t callback, void *userData, unsigned int flags) = NULL;
463
464__host__ cudaError_t CUDARTAPI cudaStreamAddCallback(cudaStream_t stream, cudaStreamCallback_t callback, void *userData, unsigned int flags) {
465 cudaError_t ret;
466 // Write your own custom c code in the cudaStreamAddCallback.c file
467 ret = orig_cudaStreamAddCallback(stream, callback, userData, flags);
468 return ret;
469
470}
471
472static __host__ cudaError_t CUDARTAPI (*orig_cudaStreamSynchronize)(cudaStream_t stream) = NULL;
473
474__host__ cudaError_t CUDARTAPI cudaStreamSynchronize(cudaStream_t stream) {
475 cudaError_t ret;
476 pid_t my_tid = syscall(SYS_gettid);
477
478 // Write your own custom c code in the cudaStreamSynchronize.c file
479 //printf("cudaStreamSynchronize stream %p\n", (void *)stream);
480
481 ret = orig_cudaStreamSynchronize(stream);
482 schedSync(my_tid, (void *)stream);
483 return ret;
484
485}
486
487static __host__ cudaError_t CUDARTAPI (*orig_cudaStreamQuery)(cudaStream_t stream) = NULL;
488
489__host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream) {
490 cudaError_t ret;
491 // Write your own custom c code in the cudaStreamQuery.c file
492 ret = orig_cudaStreamQuery(stream);
493 return ret;
494
495}
496
497static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaStreamAttachMemAsync)(cudaStream_t stream, void *devPtr, size_t length , unsigned int flags ) = NULL;
498
499__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length , unsigned int flags ) {
500 cudaError_t ret;
501 // Write your own custom c code in the cudaStreamAttachMemAsync.c file
502 ret = orig_cudaStreamAttachMemAsync(stream, devPtr, length, flags);
503 return ret;
504
505}
506
507static __host__ cudaError_t CUDARTAPI (*orig_cudaEventCreate)(cudaEvent_t *event) = NULL;
508
509__host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event) {
510 cudaError_t ret;
511 // Write your own custom c code in the cudaEventCreate.c file
512 ret = orig_cudaEventCreate(event);
513 return ret;
514
515}
516
517static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaEventCreateWithFlags)(cudaEvent_t *event, unsigned int flags) = NULL;
518
519__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags) {
520 cudaError_t ret;
521 // Write your own custom c code in the cudaEventCreateWithFlags.c file
522 ret = orig_cudaEventCreateWithFlags(event, flags);
523 return ret;
524
525}
526
527static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaEventRecord)(cudaEvent_t event, cudaStream_t stream ) = NULL;
528
529__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream ) {
530 cudaError_t ret;
531 // Write your own custom c code in the cudaEventRecord.c file
532 ret = orig_cudaEventRecord(event, stream);
533 return ret;
534
535}
536
537static __host__ cudaError_t CUDARTAPI (*orig_cudaEventQuery)(cudaEvent_t event) = NULL;
538
539__host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event) {
540 cudaError_t ret;
541 // Write your own custom c code in the cudaEventQuery.c file
542 ret = orig_cudaEventQuery(event);
543 return ret;
544
545}
546
547static __host__ cudaError_t CUDARTAPI (*orig_cudaEventSynchronize)(cudaEvent_t event) = NULL;
548
549__host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event) {
550 cudaError_t ret;
551 // Write your own custom c code in the cudaEventSynchronize.c file
552 ret = orig_cudaEventSynchronize(event);
553 return ret;
554
555}
556
557static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaEventDestroy)(cudaEvent_t event) = NULL;
558
559__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event) {
560 cudaError_t ret;
561 // Write your own custom c code in the cudaEventDestroy.c file
562 ret = orig_cudaEventDestroy(event);
563 return ret;
564
565}
566
567static __host__ cudaError_t CUDARTAPI (*orig_cudaEventElapsedTime)(float *ms, cudaEvent_t start, cudaEvent_t end) = NULL;
568
569__host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms, cudaEvent_t start, cudaEvent_t end) {
570 cudaError_t ret;
571 // Write your own custom c code in the cudaEventElapsedTime.c file
572 ret = orig_cudaEventElapsedTime(ms, start, end);
573 return ret;
574
575}
576
577static __host__ cudaError_t CUDARTAPI (*orig_cudaLaunchKernel)(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) = NULL;
578
579__host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) {
580 cudaError_t ret;
581 // Write your own custom c code in the cudaLaunchKernel.c file
582 ret = orig_cudaLaunchKernel(func, gridDim, blockDim, args, sharedMem, stream);
583 return ret;
584
585}
586
587static __host__ cudaError_t CUDARTAPI (*orig_cudaLaunchCooperativeKernel)(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) = NULL;
588
589__host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) {
590 cudaError_t ret;
591 // Write your own custom c code in the cudaLaunchCooperativeKernel.c file
592 ret = orig_cudaLaunchCooperativeKernel(func, gridDim, blockDim, args, sharedMem, stream);
593 return ret;
594
595}
596
597static __host__ cudaError_t CUDARTAPI (*orig_cudaLaunchCooperativeKernelMultiDevice)(struct cudaLaunchParams *launchParamsList, unsigned int numDevices, unsigned int flags ) = NULL;
598
599__host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *launchParamsList, unsigned int numDevices, unsigned int flags ) {
600 cudaError_t ret;
601 // Write your own custom c code in the cudaLaunchCooperativeKernelMultiDevice.c file
602 ret = orig_cudaLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags);
603 return ret;
604
605}
606
607static __host__ cudaError_t CUDARTAPI (*orig_cudaFuncSetCacheConfig)(const void *func, enum cudaFuncCache cacheConfig) = NULL;
608
609__host__ cudaError_t CUDARTAPI cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig) {
610 cudaError_t ret;
611 // Write your own custom c code in the cudaFuncSetCacheConfig.c file
612 ret = orig_cudaFuncSetCacheConfig(func, cacheConfig);
613 return ret;
614
615}
616
617static __host__ cudaError_t CUDARTAPI (*orig_cudaFuncSetSharedMemConfig)(const void *func, enum cudaSharedMemConfig config) = NULL;
618
619__host__ cudaError_t CUDARTAPI cudaFuncSetSharedMemConfig(const void *func, enum cudaSharedMemConfig config) {
620 cudaError_t ret;
621 // Write your own custom c code in the cudaFuncSetSharedMemConfig.c file
622 ret = orig_cudaFuncSetSharedMemConfig(func, config);
623 return ret;
624
625}
626
627static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaFuncGetAttributes)(struct cudaFuncAttributes *attr, const void *func) = NULL;
628
629__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func) {
630 cudaError_t ret;
631 // Write your own custom c code in the cudaFuncGetAttributes.c file
632 ret = orig_cudaFuncGetAttributes(attr, func);
633 return ret;
634
635}
636
637static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaFuncSetAttribute)(const void *func, enum cudaFuncAttribute attr, int value) = NULL;
638
639__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncSetAttribute(const void *func, enum cudaFuncAttribute attr, int value) {
640 cudaError_t ret;
641 // Write your own custom c code in the cudaFuncSetAttribute.c file
642 ret = orig_cudaFuncSetAttribute(func, attr, value);
643 return ret;
644
645}
646
647static __host__ cudaError_t CUDARTAPI (*orig_cudaSetDoubleForDevice)(double *d) = NULL;
648
649__host__ cudaError_t CUDARTAPI cudaSetDoubleForDevice(double *d) {
650 cudaError_t ret;
651 // Write your own custom c code in the cudaSetDoubleForDevice.c file
652 ret = orig_cudaSetDoubleForDevice(d);
653 return ret;
654
655}
656
657static __host__ cudaError_t CUDARTAPI (*orig_cudaSetDoubleForHost)(double *d) = NULL;
658
659__host__ cudaError_t CUDARTAPI cudaSetDoubleForHost(double *d) {
660 cudaError_t ret;
661 // Write your own custom c code in the cudaSetDoubleForHost.c file
662 ret = orig_cudaSetDoubleForHost(d);
663 return ret;
664
665}
666
667static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaOccupancyMaxActiveBlocksPerMultiprocessor)(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize) = NULL;
668
669__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize) {
670 cudaError_t ret;
671 // Write your own custom c code in the cudaOccupancyMaxActiveBlocksPerMultiprocessor.c file
672 ret = orig_cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func, blockSize, dynamicSMemSize);
673 return ret;
674
675}
676
677static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags)(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize, unsigned int flags) = NULL;
678
679__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize, unsigned int flags) {
680 cudaError_t ret;
681 // Write your own custom c code in the cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.c file
682 ret = orig_cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, func, blockSize, dynamicSMemSize, flags);
683 return ret;
684
685}
686
687static __host__ cudaError_t CUDARTAPI (*orig_cudaConfigureCall)(dim3 gridDim, dim3 blockDim, size_t sharedMem , cudaStream_t stream ) = NULL;
688
689__host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem , cudaStream_t stream ) {
690 cudaError_t ret;
691 pid_t my_tid = syscall(SYS_gettid);
692 // Write your own custom c code in the cudaConfigureCall.c file
693 // printf("cudaConfigureCall TID %d stream %p blocks %d threads %d\n",
694 schedConfCall(my_tid, (void *)stream, gridDim.x * gridDim.y, blockDim.x * blockDim.y);
695 ret = orig_cudaConfigureCall(gridDim, blockDim, sharedMem, stream);
696 return ret;
697
698}
699
700static __host__ cudaError_t CUDARTAPI (*orig_cudaSetupArgument)(const void *arg, size_t size, size_t offset) = NULL;
701
702__host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg, size_t size, size_t offset) {
703 cudaError_t ret;
704 //pid_t my_tid = syscall(SYS_gettid);
705 // Write your own custom c code in the cudaSetupArgument.c file
706 // printf("cudaSetupArugment TID %d size %lu offset %lu\n", my_tid, size, offset);
707 ret = orig_cudaSetupArgument(arg, size, offset);
708 return ret;
709
710}
711
712static __host__ cudaError_t CUDARTAPI (*orig_cudaLaunch)(const void *func) = NULL;
713
714__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) {
715 cudaError_t ret;
716 pid_t my_tid = syscall(SYS_gettid);
717 //printf("cudaLaunch TID %d\n", my_tid);
718 schedLaunch(my_tid);
719 ret = orig_cudaLaunch(func);
720 return ret;
721
722}
723
724static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaMallocManaged)(void **devPtr, size_t size, unsigned int flags ) = NULL;
725
726__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMallocManaged(void **devPtr, size_t size, unsigned int flags ) {
727 cudaError_t ret;
728 // Write your own custom c code in the cudaMallocManaged.c file
729 ret = orig_cudaMallocManaged(devPtr, size, flags);
730 return ret;
731
732}
733
734static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaMalloc)(void **devPtr, size_t size) = NULL;
735
736__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size) {
737 cudaError_t ret;
738 // Write your own custom c code in the cudaMalloc.c file
739 ret = orig_cudaMalloc(devPtr, size);
740 return ret;
741
742}
743
744static __host__ cudaError_t CUDARTAPI (*orig_cudaMallocHost)(void **ptr, size_t size) = NULL;
745
746__host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t size) {
747 cudaError_t ret;
748 // Write your own custom c code in the cudaMallocHost.c file
749 ret = orig_cudaMallocHost(ptr, size);
750 return ret;
751
752}
753
754static __host__ cudaError_t CUDARTAPI (*orig_cudaMallocPitch)(void **devPtr, size_t *pitch, size_t width, size_t height) = NULL;
755
756__host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height) {
757 cudaError_t ret;
758 // Write your own custom c code in the cudaMallocPitch.c file
759 ret = orig_cudaMallocPitch(devPtr, pitch, width, height);
760 return ret;
761
762}
763
764static __host__ cudaError_t CUDARTAPI (*orig_cudaMallocArray)(cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height , unsigned int flags ) = NULL;
765
766__host__ cudaError_t CUDARTAPI cudaMallocArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height , unsigned int flags ) {
767 cudaError_t ret;
768 // Write your own custom c code in the cudaMallocArray.c file
769 ret = orig_cudaMallocArray(array, desc, width, height, flags);
770 return ret;
771
772}
773
774static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaFree)(void *devPtr) = NULL;
775
776__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr) {
777 cudaError_t ret;
778 // Write your own custom c code in the cudaFree.c file
779 ret = orig_cudaFree(devPtr);
780 return ret;
781
782}
783
784static __host__ cudaError_t CUDARTAPI (*orig_cudaFreeHost)(void *ptr) = NULL;
785
786__host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr) {
787 cudaError_t ret;
788 // Write your own custom c code in the cudaFreeHost.c file
789 ret = orig_cudaFreeHost(ptr);
790 return ret;
791
792}
793
794static __host__ cudaError_t CUDARTAPI (*orig_cudaFreeArray)(cudaArray_t array) = NULL;
795
796__host__ cudaError_t CUDARTAPI cudaFreeArray(cudaArray_t array) {
797 cudaError_t ret;
798 // Write your own custom c code in the cudaFreeArray.c file
799 ret = orig_cudaFreeArray(array);
800 return ret;
801
802}
803
804static __host__ cudaError_t CUDARTAPI (*orig_cudaFreeMipmappedArray)(cudaMipmappedArray_t mipmappedArray) = NULL;
805
806__host__ cudaError_t CUDARTAPI cudaFreeMipmappedArray(cudaMipmappedArray_t mipmappedArray) {
807 cudaError_t ret;
808 // Write your own custom c code in the cudaFreeMipmappedArray.c file
809 ret = orig_cudaFreeMipmappedArray(mipmappedArray);
810 return ret;
811
812}
813
814static __host__ cudaError_t CUDARTAPI (*orig_cudaHostAlloc)(void **pHost, size_t size, unsigned int flags) = NULL;
815
816__host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t size, unsigned int flags) {
817 cudaError_t ret;
818 // Write your own custom c code in the cudaHostAlloc.c file
819 ret = orig_cudaHostAlloc(pHost, size, flags);
820 return ret;
821
822}
823
824static __host__ cudaError_t CUDARTAPI (*orig_cudaHostRegister)(void *ptr, size_t size, unsigned int flags) = NULL;
825
826__host__ cudaError_t CUDARTAPI cudaHostRegister(void *ptr, size_t size, unsigned int flags) {
827 cudaError_t ret;
828 // Write your own custom c code in the cudaHostRegister.c file
829 ret = orig_cudaHostRegister(ptr, size, flags);
830 return ret;
831
832}
833
834static __host__ cudaError_t CUDARTAPI (*orig_cudaHostUnregister)(void *ptr) = NULL;
835
836__host__ cudaError_t CUDARTAPI cudaHostUnregister(void *ptr) {
837 cudaError_t ret;
838 // Write your own custom c code in the cudaHostUnregister.c file
839 ret = orig_cudaHostUnregister(ptr);
840 return ret;
841
842}
843
844static __host__ cudaError_t CUDARTAPI (*orig_cudaHostGetDevicePointer)(void **pDevice, void *pHost, unsigned int flags) = NULL;
845
846__host__ cudaError_t CUDARTAPI cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags) {
847 cudaError_t ret;
848 // Write your own custom c code in the cudaHostGetDevicePointer.c file
849 ret = orig_cudaHostGetDevicePointer(pDevice, pHost, flags);
850 return ret;
851
852}
853
854static __host__ cudaError_t CUDARTAPI (*orig_cudaHostGetFlags)(unsigned int *pFlags, void *pHost) = NULL;
855
856__host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags, void *pHost) {
857 cudaError_t ret;
858 // Write your own custom c code in the cudaHostGetFlags.c file
859 ret = orig_cudaHostGetFlags(pFlags, pHost);
860 return ret;
861
862}
863
864static __host__ cudaError_t CUDARTAPI (*orig_cudaMalloc3D)(struct cudaPitchedPtr* pitchedDevPtr, struct cudaExtent extent) = NULL;
865
866__host__ cudaError_t CUDARTAPI cudaMalloc3D(struct cudaPitchedPtr* pitchedDevPtr, struct cudaExtent extent) {
867 cudaError_t ret;
868 // Write your own custom c code in the cudaMalloc3D.c file
869 ret = orig_cudaMalloc3D(pitchedDevPtr, extent);
870 return ret;
871
872}
873
874static __host__ cudaError_t CUDARTAPI (*orig_cudaMalloc3DArray)(cudaArray_t *array, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int flags ) = NULL;
875
876__host__ cudaError_t CUDARTAPI cudaMalloc3DArray(cudaArray_t *array, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int flags ) {
877 cudaError_t ret;
878 // Write your own custom c code in the cudaMalloc3DArray.c file
879 ret = orig_cudaMalloc3DArray(array, desc, extent, flags);
880 return ret;
881
882}
883
884static __host__ cudaError_t CUDARTAPI (*orig_cudaMallocMipmappedArray)(cudaMipmappedArray_t *mipmappedArray, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int numLevels, unsigned int flags ) = NULL;
885
886__host__ cudaError_t CUDARTAPI cudaMallocMipmappedArray(cudaMipmappedArray_t *mipmappedArray, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int numLevels, unsigned int flags ) {
887 cudaError_t ret;
888 // Write your own custom c code in the cudaMallocMipmappedArray.c file
889 ret = orig_cudaMallocMipmappedArray(mipmappedArray, desc, extent, numLevels, flags);
890 return ret;
891
892}
893
894static __host__ cudaError_t CUDARTAPI (*orig_cudaGetMipmappedArrayLevel)(cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray, unsigned int level) = NULL;
895
896__host__ cudaError_t CUDARTAPI cudaGetMipmappedArrayLevel(cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray, unsigned int level) {
897 cudaError_t ret;
898 // Write your own custom c code in the cudaGetMipmappedArrayLevel.c file
899 ret = orig_cudaGetMipmappedArrayLevel(levelArray, mipmappedArray, level);
900 return ret;
901
902}
903
904static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy3D)(const struct cudaMemcpy3DParms *p) = NULL;
905
906__host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3DParms *p) {
907 cudaError_t ret;
908 // Write your own custom c code in the cudaMemcpy3D.c file
909 ret = orig_cudaMemcpy3D(p);
910 return ret;
911
912}
913
914static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy3DPeer)(const struct cudaMemcpy3DPeerParms *p) = NULL;
915
916__host__ cudaError_t CUDARTAPI cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p) {
917 cudaError_t ret;
918 // Write your own custom c code in the cudaMemcpy3DPeer.c file
919 ret = orig_cudaMemcpy3DPeer(p);
920 return ret;
921
922}
923
924static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaMemcpy3DAsync)(const struct cudaMemcpy3DParms *p, cudaStream_t stream ) = NULL;
925
926__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream ) {
927 cudaError_t ret;
928 // Write your own custom c code in the cudaMemcpy3DAsync.c file
929 ret = orig_cudaMemcpy3DAsync(p, stream);
930 return ret;
931
932}
933
934static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy3DPeerAsync)(const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream ) = NULL;
935
936__host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync(const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream ) {
937 cudaError_t ret;
938 // Write your own custom c code in the cudaMemcpy3DPeerAsync.c file
939 ret = orig_cudaMemcpy3DPeerAsync(p, stream);
940 return ret;
941
942}
943
944static __host__ cudaError_t CUDARTAPI (*orig_cudaMemGetInfo)(size_t *free, size_t *total) = NULL;
945
946__host__ cudaError_t CUDARTAPI cudaMemGetInfo(size_t *free, size_t *total) {
947 cudaError_t ret;
948 // Write your own custom c code in the cudaMemGetInfo.c file
949 ret = orig_cudaMemGetInfo(free, total);
950 return ret;
951
952}
953
954static __host__ cudaError_t CUDARTAPI (*orig_cudaArrayGetInfo)(struct cudaChannelFormatDesc *desc, struct cudaExtent *extent, unsigned int *flags, cudaArray_t array) = NULL;
955
956__host__ cudaError_t CUDARTAPI cudaArrayGetInfo(struct cudaChannelFormatDesc *desc, struct cudaExtent *extent, unsigned int *flags, cudaArray_t array) {
957 cudaError_t ret;
958 // Write your own custom c code in the cudaArrayGetInfo.c file
959 ret = orig_cudaArrayGetInfo(desc, extent, flags, array);
960 return ret;
961
962}
963
964static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy)(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) = NULL;
965
966__host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) {
967 cudaError_t ret;
968 // Write your own custom c code in the cudaMemcpy.c file
969 printf("cudaMemcpy\n");
970 ret = orig_cudaMemcpy(dst, src, count, kind);
971 return ret;
972
973}
974
975static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyPeer)(void *dst, int dstDevice, const void *src, int srcDevice, size_t count) = NULL;
976
977__host__ cudaError_t CUDARTAPI cudaMemcpyPeer(void *dst, int dstDevice, const void *src, int srcDevice, size_t count) {
978 cudaError_t ret;
979 // Write your own custom c code in the cudaMemcpyPeer.c file
980 ret = orig_cudaMemcpyPeer(dst, dstDevice, src, srcDevice, count);
981 return ret;
982
983}
984
985static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyToArray)(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind) = NULL;
986
987__host__ cudaError_t CUDARTAPI cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind) {
988 cudaError_t ret;
989 // Write your own custom c code in the cudaMemcpyToArray.c file
990 ret = orig_cudaMemcpyToArray(dst, wOffset, hOffset, src, count, kind);
991 return ret;
992
993}
994
995static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyFromArray)(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind) = NULL;
996
997__host__ cudaError_t CUDARTAPI cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind) {
998 cudaError_t ret;
999 // Write your own custom c code in the cudaMemcpyFromArray.c file
1000 ret = orig_cudaMemcpyFromArray(dst, src, wOffset, hOffset, count, kind);
1001 return ret;
1002
1003}
1004
1005static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyArrayToArray)(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind ) = NULL;
1006
1007__host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind ) {
1008 cudaError_t ret;
1009 // Write your own custom c code in the cudaMemcpyArrayToArray.c file
1010 ret = orig_cudaMemcpyArrayToArray(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, count, kind);
1011 return ret;
1012
1013}
1014
1015static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy2D)(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind) = NULL;
1016
1017__host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind) {
1018 cudaError_t ret;
1019 // Write your own custom c code in the cudaMemcpy2D.c file
1020 ret = orig_cudaMemcpy2D(dst, dpitch, src, spitch, width, height, kind);
1021 return ret;
1022
1023}
1024
1025static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy2DToArray)(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind) = NULL;
1026
1027__host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind) {
1028 cudaError_t ret;
1029 // Write your own custom c code in the cudaMemcpy2DToArray.c file
1030 ret = orig_cudaMemcpy2DToArray(dst, wOffset, hOffset, src, spitch, width, height, kind);
1031 return ret;
1032
1033}
1034
1035static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy2DFromArray)(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind) = NULL;
1036
1037__host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind) {
1038 cudaError_t ret;
1039 // Write your own custom c code in the cudaMemcpy2DFromArray.c file
1040 ret = orig_cudaMemcpy2DFromArray(dst, dpitch, src, wOffset, hOffset, width, height, kind);
1041 return ret;
1042
1043}
1044
1045static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy2DArrayToArray)(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind ) = NULL;
1046
1047__host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind ) {
1048 cudaError_t ret;
1049 // Write your own custom c code in the cudaMemcpy2DArrayToArray.c file
1050 ret = orig_cudaMemcpy2DArrayToArray(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width, height, kind);
1051 return ret;
1052
1053}
1054
1055static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyToSymbol)(const void *symbol, const void *src, size_t count, size_t offset , enum cudaMemcpyKind kind ) = NULL;
1056
1057__host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(const void *symbol, const void *src, size_t count, size_t offset , enum cudaMemcpyKind kind ) {
1058 cudaError_t ret;
1059 // Write your own custom c code in the cudaMemcpyToSymbol.c file
1060 ret = orig_cudaMemcpyToSymbol(symbol, src, count, offset, kind);
1061 return ret;
1062
1063}
1064
1065static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyFromSymbol)(void *dst, const void *symbol, size_t count, size_t offset , enum cudaMemcpyKind kind ) = NULL;
1066
1067__host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(void *dst, const void *symbol, size_t count, size_t offset , enum cudaMemcpyKind kind ) {
1068 cudaError_t ret;
1069 // Write your own custom c code in the cudaMemcpyFromSymbol.c file
1070 ret = orig_cudaMemcpyFromSymbol(dst, symbol, count, offset, kind);
1071 return ret;
1072
1073}
1074
1075static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaMemcpyAsync)(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream ) = NULL;
1076
1077__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream ) {
1078 cudaError_t ret;
1079 // Write your own custom c code in the cudaMemcpyAsync.c file
1080 printf("cudaMemcpyAsync stream %p\n", (void *)stream);
1081 ret = orig_cudaMemcpyAsync(dst, src, count, kind, stream);
1082 return ret;
1083
1084}
1085
1086static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyPeerAsync)(void *dst, int dstDevice, const void *src, int srcDevice, size_t count, cudaStream_t stream ) = NULL;
1087
1088__host__ cudaError_t CUDARTAPI cudaMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice, size_t count, cudaStream_t stream ) {
1089 cudaError_t ret;
1090 // Write your own custom c code in the cudaMemcpyPeerAsync.c file
1091 ret = orig_cudaMemcpyPeerAsync(dst, dstDevice, src, srcDevice, count, stream);
1092 return ret;
1093
1094}
1095
1096static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyToArrayAsync)(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream ) = NULL;
1097
1098__host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream ) {
1099 cudaError_t ret;
1100 // Write your own custom c code in the cudaMemcpyToArrayAsync.c file
1101 ret = orig_cudaMemcpyToArrayAsync(dst, wOffset, hOffset, src, count, kind, stream);
1102 return ret;
1103
1104}
1105
1106static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyFromArrayAsync)(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream ) = NULL;
1107
1108__host__ cudaError_t CUDARTAPI cudaMemcpyFromArrayAsync(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream ) {
1109 cudaError_t ret;
1110 // Write your own custom c code in the cudaMemcpyFromArrayAsync.c file
1111 ret = orig_cudaMemcpyFromArrayAsync(dst, src, wOffset, hOffset, count, kind, stream);
1112 return ret;
1113
1114}
1115
1116static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaMemcpy2DAsync)(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream ) = NULL;
1117
1118__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream ) {
1119 cudaError_t ret;
1120 // Write your own custom c code in the cudaMemcpy2DAsync.c file
1121 ret = orig_cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream);
1122 return ret;
1123
1124}
1125
1126static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy2DToArrayAsync)(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream ) = NULL;
1127
1128__host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream ) {
1129 cudaError_t ret;
1130 // Write your own custom c code in the cudaMemcpy2DToArrayAsync.c file
1131 ret = orig_cudaMemcpy2DToArrayAsync(dst, wOffset, hOffset, src, spitch, width, height, kind, stream);
1132 return ret;
1133
1134}
1135
1136static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpy2DFromArrayAsync)(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream ) = NULL;
1137
1138__host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream ) {
1139 cudaError_t ret;
1140 // Write your own custom c code in the cudaMemcpy2DFromArrayAsync.c file
1141 ret = orig_cudaMemcpy2DFromArrayAsync(dst, dpitch, src, wOffset, hOffset, width, height, kind, stream);
1142 return ret;
1143
1144}
1145
1146static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyToSymbolAsync)(const void *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream ) = NULL;
1147
1148__host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(const void *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream ) {
1149 cudaError_t ret;
1150 // Write your own custom c code in the cudaMemcpyToSymbolAsync.c file
1151 ret = orig_cudaMemcpyToSymbolAsync(symbol, src, count, offset, kind, stream);
1152 return ret;
1153
1154}
1155
1156static __host__ cudaError_t CUDARTAPI (*orig_cudaMemcpyFromSymbolAsync)(void *dst, const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream ) = NULL;
1157
1158__host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(void *dst, const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream ) {
1159 cudaError_t ret;
1160 // Write your own custom c code in the cudaMemcpyFromSymbolAsync.c file
1161 ret = orig_cudaMemcpyFromSymbolAsync(dst, symbol, count, offset, kind, stream);
1162 return ret;
1163
1164}
1165
1166static __host__ cudaError_t CUDARTAPI (*orig_cudaMemset)(void *devPtr, int value, size_t count) = NULL;
1167
1168__host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value, size_t count) {
1169 cudaError_t ret;
1170 // Write your own custom c code in the cudaMemset.c file
1171 ret = orig_cudaMemset(devPtr, value, count);
1172 return ret;
1173
1174}
1175
1176static __host__ cudaError_t CUDARTAPI (*orig_cudaMemset2D)(void *devPtr, size_t pitch, int value, size_t width, size_t height) = NULL;
1177
1178__host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch, int value, size_t width, size_t height) {
1179 cudaError_t ret;
1180 // Write your own custom c code in the cudaMemset2D.c file
1181 ret = orig_cudaMemset2D(devPtr, pitch, value, width, height);
1182 return ret;
1183
1184}
1185
1186static __host__ cudaError_t CUDARTAPI (*orig_cudaMemset3D)(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent) = NULL;
1187
1188__host__ cudaError_t CUDARTAPI cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent) {
1189 cudaError_t ret;
1190 // Write your own custom c code in the cudaMemset3D.c file
1191 ret = orig_cudaMemset3D(pitchedDevPtr, value, extent);
1192 return ret;
1193
1194}
1195
1196static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaMemsetAsync)(void *devPtr, int value, size_t count, cudaStream_t stream ) = NULL;
1197
1198__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream ) {
1199 cudaError_t ret;
1200 // Write your own custom c code in the cudaMemsetAsync.c file
1201 ret = orig_cudaMemsetAsync(devPtr, value, count, stream);
1202 return ret;
1203
1204}
1205
1206static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaMemset2DAsync)(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream ) = NULL;
1207
1208__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream ) {
1209 cudaError_t ret;
1210 // Write your own custom c code in the cudaMemset2DAsync.c file
1211 ret = orig_cudaMemset2DAsync(devPtr, pitch, value, width, height, stream);
1212 return ret;
1213
1214}
1215
1216static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaMemset3DAsync)(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream ) = NULL;
1217
1218__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream ) {
1219 cudaError_t ret;
1220 // Write your own custom c code in the cudaMemset3DAsync.c file
1221 ret = orig_cudaMemset3DAsync(pitchedDevPtr, value, extent, stream);
1222 return ret;
1223
1224}
1225
1226static __host__ cudaError_t CUDARTAPI (*orig_cudaGetSymbolAddress)(void **devPtr, const void *symbol) = NULL;
1227
1228__host__ cudaError_t CUDARTAPI cudaGetSymbolAddress(void **devPtr, const void *symbol) {
1229 cudaError_t ret;
1230 // Write your own custom c code in the cudaGetSymbolAddress.c file
1231 ret = orig_cudaGetSymbolAddress(devPtr, symbol);
1232 return ret;
1233
1234}
1235
1236static __host__ cudaError_t CUDARTAPI (*orig_cudaGetSymbolSize)(size_t *size, const void *symbol) = NULL;
1237
1238__host__ cudaError_t CUDARTAPI cudaGetSymbolSize(size_t *size, const void *symbol) {
1239 cudaError_t ret;
1240 // Write your own custom c code in the cudaGetSymbolSize.c file
1241 ret = orig_cudaGetSymbolSize(size, symbol);
1242 return ret;
1243
1244}
1245
1246static __host__ cudaError_t CUDARTAPI (*orig_cudaMemPrefetchAsync)(const void *devPtr, size_t count, int dstDevice, cudaStream_t stream ) = NULL;
1247
1248__host__ cudaError_t CUDARTAPI cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice, cudaStream_t stream ) {
1249 cudaError_t ret;
1250 // Write your own custom c code in the cudaMemPrefetchAsync.c file
1251 ret = orig_cudaMemPrefetchAsync(devPtr, count, dstDevice, stream);
1252 return ret;
1253
1254}
1255
1256static __host__ cudaError_t CUDARTAPI (*orig_cudaMemAdvise)(const void *devPtr, size_t count, enum cudaMemoryAdvise advice, int device) = NULL;
1257
1258__host__ cudaError_t CUDARTAPI cudaMemAdvise(const void *devPtr, size_t count, enum cudaMemoryAdvise advice, int device) {
1259 cudaError_t ret;
1260 // Write your own custom c code in the cudaMemAdvise.c file
1261 ret = orig_cudaMemAdvise(devPtr, count, advice, device);
1262 return ret;
1263
1264}
1265
1266static __host__ cudaError_t CUDARTAPI (*orig_cudaMemRangeGetAttribute)(void *data, size_t dataSize, enum cudaMemRangeAttribute attribute, const void *devPtr, size_t count) = NULL;
1267
1268__host__ cudaError_t CUDARTAPI cudaMemRangeGetAttribute(void *data, size_t dataSize, enum cudaMemRangeAttribute attribute, const void *devPtr, size_t count) {
1269 cudaError_t ret;
1270 // Write your own custom c code in the cudaMemRangeGetAttribute.c file
1271 ret = orig_cudaMemRangeGetAttribute(data, dataSize, attribute, devPtr, count);
1272 return ret;
1273
1274}
1275
1276static __host__ cudaError_t CUDARTAPI (*orig_cudaMemRangeGetAttributes)(void **data, size_t *dataSizes, enum cudaMemRangeAttribute *attributes, size_t numAttributes, const void *devPtr, size_t count) = NULL;
1277
1278__host__ cudaError_t CUDARTAPI cudaMemRangeGetAttributes(void **data, size_t *dataSizes, enum cudaMemRangeAttribute *attributes, size_t numAttributes, const void *devPtr, size_t count) {
1279 cudaError_t ret;
1280 // Write your own custom c code in the cudaMemRangeGetAttributes.c file
1281 ret = orig_cudaMemRangeGetAttributes(data, dataSizes, attributes, numAttributes, devPtr, count);
1282 return ret;
1283
1284}
1285
1286static __host__ cudaError_t CUDARTAPI (*orig_cudaPointerGetAttributes)(struct cudaPointerAttributes *attributes, const void *ptr) = NULL;
1287
1288__host__ cudaError_t CUDARTAPI cudaPointerGetAttributes(struct cudaPointerAttributes *attributes, const void *ptr) {
1289 cudaError_t ret;
1290 // Write your own custom c code in the cudaPointerGetAttributes.c file
1291 ret = orig_cudaPointerGetAttributes(attributes, ptr);
1292 return ret;
1293
1294}
1295
1296static __host__ cudaError_t CUDARTAPI (*orig_cudaDeviceCanAccessPeer)(int *canAccessPeer, int device, int peerDevice) = NULL;
1297
1298__host__ cudaError_t CUDARTAPI cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice) {
1299 cudaError_t ret;
1300 // Write your own custom c code in the cudaDeviceCanAccessPeer.c file
1301 ret = orig_cudaDeviceCanAccessPeer(canAccessPeer, device, peerDevice);
1302 return ret;
1303
1304}
1305
1306static __host__ cudaError_t CUDARTAPI (*orig_cudaDeviceEnablePeerAccess)(int peerDevice, unsigned int flags) = NULL;
1307
1308__host__ cudaError_t CUDARTAPI cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags) {
1309 cudaError_t ret;
1310 // Write your own custom c code in the cudaDeviceEnablePeerAccess.c file
1311 ret = orig_cudaDeviceEnablePeerAccess(peerDevice, flags);
1312 return ret;
1313
1314}
1315
1316static __host__ cudaError_t CUDARTAPI (*orig_cudaDeviceDisablePeerAccess)(int peerDevice) = NULL;
1317
1318__host__ cudaError_t CUDARTAPI cudaDeviceDisablePeerAccess(int peerDevice) {
1319 cudaError_t ret;
1320 // Write your own custom c code in the cudaDeviceDisablePeerAccess.c file
1321 ret = orig_cudaDeviceDisablePeerAccess(peerDevice);
1322 return ret;
1323
1324}
1325
1326static __host__ cudaError_t CUDARTAPI (*orig_cudaGraphicsUnregisterResource)(cudaGraphicsResource_t resource) = NULL;
1327
1328__host__ cudaError_t CUDARTAPI cudaGraphicsUnregisterResource(cudaGraphicsResource_t resource) {
1329 cudaError_t ret;
1330 // Write your own custom c code in the cudaGraphicsUnregisterResource.c file
1331 ret = orig_cudaGraphicsUnregisterResource(resource);
1332 return ret;
1333
1334}
1335
1336static __host__ cudaError_t CUDARTAPI (*orig_cudaGraphicsResourceSetMapFlags)(cudaGraphicsResource_t resource, unsigned int flags) = NULL;
1337
1338__host__ cudaError_t CUDARTAPI cudaGraphicsResourceSetMapFlags(cudaGraphicsResource_t resource, unsigned int flags) {
1339 cudaError_t ret;
1340 // Write your own custom c code in the cudaGraphicsResourceSetMapFlags.c file
1341 ret = orig_cudaGraphicsResourceSetMapFlags(resource, flags);
1342 return ret;
1343
1344}
1345
1346static __host__ cudaError_t CUDARTAPI (*orig_cudaGraphicsMapResources)(int count, cudaGraphicsResource_t *resources, cudaStream_t stream ) = NULL;
1347
1348__host__ cudaError_t CUDARTAPI cudaGraphicsMapResources(int count, cudaGraphicsResource_t *resources, cudaStream_t stream ) {
1349 cudaError_t ret;
1350 // Write your own custom c code in the cudaGraphicsMapResources.c file
1351 ret = orig_cudaGraphicsMapResources(count, resources, stream);
1352 return ret;
1353
1354}
1355
1356static __host__ cudaError_t CUDARTAPI (*orig_cudaGraphicsUnmapResources)(int count, cudaGraphicsResource_t *resources, cudaStream_t stream ) = NULL;
1357
1358__host__ cudaError_t CUDARTAPI cudaGraphicsUnmapResources(int count, cudaGraphicsResource_t *resources, cudaStream_t stream ) {
1359 cudaError_t ret;
1360 // Write your own custom c code in the cudaGraphicsUnmapResources.c file
1361 ret = orig_cudaGraphicsUnmapResources(count, resources, stream);
1362 return ret;
1363
1364}
1365
1366static __host__ cudaError_t CUDARTAPI (*orig_cudaGraphicsResourceGetMappedPointer)(void **devPtr, size_t *size, cudaGraphicsResource_t resource) = NULL;
1367
1368__host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedPointer(void **devPtr, size_t *size, cudaGraphicsResource_t resource) {
1369 cudaError_t ret;
1370 // Write your own custom c code in the cudaGraphicsResourceGetMappedPointer.c file
1371 ret = orig_cudaGraphicsResourceGetMappedPointer(devPtr, size, resource);
1372 return ret;
1373
1374}
1375
1376static __host__ cudaError_t CUDARTAPI (*orig_cudaGraphicsSubResourceGetMappedArray)(cudaArray_t *array, cudaGraphicsResource_t resource, unsigned int arrayIndex, unsigned int mipLevel) = NULL;
1377
1378__host__ cudaError_t CUDARTAPI cudaGraphicsSubResourceGetMappedArray(cudaArray_t *array, cudaGraphicsResource_t resource, unsigned int arrayIndex, unsigned int mipLevel) {
1379 cudaError_t ret;
1380 // Write your own custom c code in the cudaGraphicsSubResourceGetMappedArray.c file
1381 ret = orig_cudaGraphicsSubResourceGetMappedArray(array, resource, arrayIndex, mipLevel);
1382 return ret;
1383
1384}
1385
1386static __host__ cudaError_t CUDARTAPI (*orig_cudaGraphicsResourceGetMappedMipmappedArray)(cudaMipmappedArray_t *mipmappedArray, cudaGraphicsResource_t resource) = NULL;
1387
1388__host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedMipmappedArray(cudaMipmappedArray_t *mipmappedArray, cudaGraphicsResource_t resource) {
1389 cudaError_t ret;
1390 // Write your own custom c code in the cudaGraphicsResourceGetMappedMipmappedArray.c file
1391 ret = orig_cudaGraphicsResourceGetMappedMipmappedArray(mipmappedArray, resource);
1392 return ret;
1393
1394}
1395
1396static __host__ cudaError_t CUDARTAPI (*orig_cudaGetChannelDesc)(struct cudaChannelFormatDesc *desc, cudaArray_const_t array) = NULL;
1397
1398__host__ cudaError_t CUDARTAPI cudaGetChannelDesc(struct cudaChannelFormatDesc *desc, cudaArray_const_t array) {
1399 cudaError_t ret;
1400 // Write your own custom c code in the cudaGetChannelDesc.c file
1401 ret = orig_cudaGetChannelDesc(desc, array);
1402 return ret;
1403
1404}
1405
1406static __host__ struct cudaChannelFormatDesc CUDARTAPI (*orig_cudaCreateChannelDesc)(int x, int y, int z, int w, enum cudaChannelFormatKind f) = NULL;
1407
1408__host__ struct cudaChannelFormatDesc CUDARTAPI cudaCreateChannelDesc(int x, int y, int z, int w, enum cudaChannelFormatKind f) {
1409 struct cudaChannelFormatDesc ret;
1410 // Write your own custom c code in the cudaCreateChannelDesc.c file
1411 ret = orig_cudaCreateChannelDesc(x, y, z, w, f);
1412 return ret;
1413
1414}
1415
1416static __host__ cudaError_t CUDARTAPI (*orig_cudaBindTexture)(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t size ) = NULL;
1417
1418__host__ cudaError_t CUDARTAPI cudaBindTexture(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t size ) {
1419 cudaError_t ret;
1420 // Write your own custom c code in the cudaBindTexture.c file
1421 ret = orig_cudaBindTexture(offset, texref, devPtr, desc, size);
1422 return ret;
1423
1424}
1425
1426static __host__ cudaError_t CUDARTAPI (*orig_cudaBindTexture2D)(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t width, size_t height, size_t pitch) = NULL;
1427
1428__host__ cudaError_t CUDARTAPI cudaBindTexture2D(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t width, size_t height, size_t pitch) {
1429 cudaError_t ret;
1430 // Write your own custom c code in the cudaBindTexture2D.c file
1431 ret = orig_cudaBindTexture2D(offset, texref, devPtr, desc, width, height, pitch);
1432 return ret;
1433
1434}
1435
1436static __host__ cudaError_t CUDARTAPI (*orig_cudaBindTextureToArray)(const struct textureReference *texref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc) = NULL;
1437
1438__host__ cudaError_t CUDARTAPI cudaBindTextureToArray(const struct textureReference *texref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc) {
1439 cudaError_t ret;
1440 // Write your own custom c code in the cudaBindTextureToArray.c file
1441 ret = orig_cudaBindTextureToArray(texref, array, desc);
1442 return ret;
1443
1444}
1445
1446static __host__ cudaError_t CUDARTAPI (*orig_cudaBindTextureToMipmappedArray)(const struct textureReference *texref, cudaMipmappedArray_const_t mipmappedArray, const struct cudaChannelFormatDesc *desc) = NULL;
1447
1448__host__ cudaError_t CUDARTAPI cudaBindTextureToMipmappedArray(const struct textureReference *texref, cudaMipmappedArray_const_t mipmappedArray, const struct cudaChannelFormatDesc *desc) {
1449 cudaError_t ret;
1450 // Write your own custom c code in the cudaBindTextureToMipmappedArray.c file
1451 ret = orig_cudaBindTextureToMipmappedArray(texref, mipmappedArray, desc);
1452 return ret;
1453
1454}
1455
1456static __host__ cudaError_t CUDARTAPI (*orig_cudaUnbindTexture)(const struct textureReference *texref) = NULL;
1457
1458__host__ cudaError_t CUDARTAPI cudaUnbindTexture(const struct textureReference *texref) {
1459 cudaError_t ret;
1460 // Write your own custom c code in the cudaUnbindTexture.c file
1461 ret = orig_cudaUnbindTexture(texref);
1462 return ret;
1463
1464}
1465
1466static __host__ cudaError_t CUDARTAPI (*orig_cudaGetTextureAlignmentOffset)(size_t *offset, const struct textureReference *texref) = NULL;
1467
1468__host__ cudaError_t CUDARTAPI cudaGetTextureAlignmentOffset(size_t *offset, const struct textureReference *texref) {
1469 cudaError_t ret;
1470 // Write your own custom c code in the cudaGetTextureAlignmentOffset.c file
1471 ret = orig_cudaGetTextureAlignmentOffset(offset, texref);
1472 return ret;
1473
1474}
1475
1476static __host__ cudaError_t CUDARTAPI (*orig_cudaGetTextureReference)(const struct textureReference **texref, const void *symbol) = NULL;
1477
1478__host__ cudaError_t CUDARTAPI cudaGetTextureReference(const struct textureReference **texref, const void *symbol) {
1479 cudaError_t ret;
1480 // Write your own custom c code in the cudaGetTextureReference.c file
1481 ret = orig_cudaGetTextureReference(texref, symbol);
1482 return ret;
1483
1484}
1485
1486static __host__ cudaError_t CUDARTAPI (*orig_cudaBindSurfaceToArray)(const struct surfaceReference *surfref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc) = NULL;
1487
1488__host__ cudaError_t CUDARTAPI cudaBindSurfaceToArray(const struct surfaceReference *surfref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc) {
1489 cudaError_t ret;
1490 // Write your own custom c code in the cudaBindSurfaceToArray.c file
1491 ret = orig_cudaBindSurfaceToArray(surfref, array, desc);
1492 return ret;
1493
1494}
1495
1496static __host__ cudaError_t CUDARTAPI (*orig_cudaGetSurfaceReference)(const struct surfaceReference **surfref, const void *symbol) = NULL;
1497
1498__host__ cudaError_t CUDARTAPI cudaGetSurfaceReference(const struct surfaceReference **surfref, const void *symbol) {
1499 cudaError_t ret;
1500 // Write your own custom c code in the cudaGetSurfaceReference.c file
1501 ret = orig_cudaGetSurfaceReference(surfref, symbol);
1502 return ret;
1503
1504}
1505
1506static __host__ cudaError_t CUDARTAPI (*orig_cudaCreateTextureObject)(cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc, const struct cudaTextureDesc *pTexDesc, const struct cudaResourceViewDesc *pResViewDesc) = NULL;
1507
1508__host__ cudaError_t CUDARTAPI cudaCreateTextureObject(cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc, const struct cudaTextureDesc *pTexDesc, const struct cudaResourceViewDesc *pResViewDesc) {
1509 cudaError_t ret;
1510 // Write your own custom c code in the cudaCreateTextureObject.c file
1511 ret = orig_cudaCreateTextureObject(pTexObject, pResDesc, pTexDesc, pResViewDesc);
1512 return ret;
1513
1514}
1515
1516static __host__ cudaError_t CUDARTAPI (*orig_cudaDestroyTextureObject)(cudaTextureObject_t texObject) = NULL;
1517
1518__host__ cudaError_t CUDARTAPI cudaDestroyTextureObject(cudaTextureObject_t texObject) {
1519 cudaError_t ret;
1520 // Write your own custom c code in the cudaDestroyTextureObject.c file
1521 ret = orig_cudaDestroyTextureObject(texObject);
1522 return ret;
1523
1524}
1525
1526static __host__ cudaError_t CUDARTAPI (*orig_cudaGetTextureObjectResourceDesc)(struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject) = NULL;
1527
1528__host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceDesc(struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject) {
1529 cudaError_t ret;
1530 // Write your own custom c code in the cudaGetTextureObjectResourceDesc.c file
1531 ret = orig_cudaGetTextureObjectResourceDesc(pResDesc, texObject);
1532 return ret;
1533
1534}
1535
1536static __host__ cudaError_t CUDARTAPI (*orig_cudaGetTextureObjectTextureDesc)(struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject) = NULL;
1537
1538__host__ cudaError_t CUDARTAPI cudaGetTextureObjectTextureDesc(struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject) {
1539 cudaError_t ret;
1540 // Write your own custom c code in the cudaGetTextureObjectTextureDesc.c file
1541 ret = orig_cudaGetTextureObjectTextureDesc(pTexDesc, texObject);
1542 return ret;
1543
1544}
1545
1546static __host__ cudaError_t CUDARTAPI (*orig_cudaGetTextureObjectResourceViewDesc)(struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject) = NULL;
1547
1548__host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceViewDesc(struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject) {
1549 cudaError_t ret;
1550 // Write your own custom c code in the cudaGetTextureObjectResourceViewDesc.c file
1551 ret = orig_cudaGetTextureObjectResourceViewDesc(pResViewDesc, texObject);
1552 return ret;
1553
1554}
1555
1556static __host__ cudaError_t CUDARTAPI (*orig_cudaCreateSurfaceObject)(cudaSurfaceObject_t *pSurfObject, const struct cudaResourceDesc *pResDesc) = NULL;
1557
1558__host__ cudaError_t CUDARTAPI cudaCreateSurfaceObject(cudaSurfaceObject_t *pSurfObject, const struct cudaResourceDesc *pResDesc) {
1559 cudaError_t ret;
1560 // Write your own custom c code in the cudaCreateSurfaceObject.c file
1561 ret = orig_cudaCreateSurfaceObject(pSurfObject, pResDesc);
1562 return ret;
1563
1564}
1565
1566static __host__ cudaError_t CUDARTAPI (*orig_cudaDestroySurfaceObject)(cudaSurfaceObject_t surfObject) = NULL;
1567
1568__host__ cudaError_t CUDARTAPI cudaDestroySurfaceObject(cudaSurfaceObject_t surfObject) {
1569 cudaError_t ret;
1570 // Write your own custom c code in the cudaDestroySurfaceObject.c file
1571 ret = orig_cudaDestroySurfaceObject(surfObject);
1572 return ret;
1573
1574}
1575
1576static __host__ cudaError_t CUDARTAPI (*orig_cudaGetSurfaceObjectResourceDesc)(struct cudaResourceDesc *pResDesc, cudaSurfaceObject_t surfObject) = NULL;
1577
1578__host__ cudaError_t CUDARTAPI cudaGetSurfaceObjectResourceDesc(struct cudaResourceDesc *pResDesc, cudaSurfaceObject_t surfObject) {
1579 cudaError_t ret;
1580 // Write your own custom c code in the cudaGetSurfaceObjectResourceDesc.c file
1581 ret = orig_cudaGetSurfaceObjectResourceDesc(pResDesc, surfObject);
1582 return ret;
1583
1584}
1585
1586static __host__ cudaError_t CUDARTAPI (*orig_cudaDriverGetVersion)(int *driverVersion) = NULL;
1587
1588__host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion) {
1589 cudaError_t ret;
1590 // Write your own custom c code in the cudaDriverGetVersion.c file
1591 ret = orig_cudaDriverGetVersion(driverVersion);
1592 return ret;
1593
1594}
1595
1596static __host__ __cudart_builtin__ cudaError_t CUDARTAPI (*orig_cudaRuntimeGetVersion)(int *runtimeVersion) = NULL;
1597
1598__host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion) {
1599 cudaError_t ret;
1600 // Write your own custom c code in the cudaRuntimeGetVersion.c file
1601 ret = orig_cudaRuntimeGetVersion(runtimeVersion);
1602 return ret;
1603
1604}
1605
1606static __host__ cudaError_t CUDARTAPI (*orig_cudaGetExportTable)(const void **ppExportTable, const cudaUUID_t *pExportTableId) = NULL;
1607
1608__host__ cudaError_t CUDARTAPI cudaGetExportTable(const void **ppExportTable, const cudaUUID_t *pExportTableId) {
1609 cudaError_t ret;
1610 // Write your own custom c code in the cudaGetExportTable.c file
1611 ret = orig_cudaGetExportTable(ppExportTable, pExportTableId);
1612 return ret;
1613
1614}
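// Library constructor: runs automatically when this shared object is loaded
// and binds every orig_* pointer declared above to the real CUDA runtime
// implementation. Because the wrappers call their orig_* pointers
// unconditionally, any symbol that fails to resolve here will surface as a
// crash the first time that API is used rather than as an error return.
// A typical invocation (an illustrative assumption, not part of this code)
// preloads the wrapper ahead of the real runtime, e.g.:
//   LD_PRELOAD=./libcudart_wrapper.so ./my_cuda_application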
1615__attribute__((constructor)) static void init() {
1616 char *dl_error;
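	// Each lookup below follows the same sequence: clear any stale dlerror()
	// state, resolve the symbol with dlsym(RTLD_NEXT, ...) so that the next
	// definition in library search order (the real libcudart) is found, and
	// report any lookup error to stderr without aborting.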
1617 // clear dl error
1618 dlerror();
1619 if (orig_cudaDeviceReset == NULL) {
1620 orig_cudaDeviceReset = dlsym(RTLD_NEXT, "cudaDeviceReset");
1621 }
1622 if ((dl_error = dlerror()) != NULL)
1623 {
1624 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1625 }
1626
1627
1628 // clear dl error
1629 dlerror();
1630 if (orig_cudaDeviceSynchronize == NULL) {
1631 orig_cudaDeviceSynchronize = dlsym(RTLD_NEXT, "cudaDeviceSynchronize");
1632 }
1633 if ((dl_error = dlerror()) != NULL)
1634 {
1635 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1636 }
1637
1638
1639 // clear dl error
1640 dlerror();
1641 if (orig_cudaDeviceSetLimit == NULL) {
1642 orig_cudaDeviceSetLimit = dlsym(RTLD_NEXT, "cudaDeviceSetLimit");
1643 }
1644 if ((dl_error = dlerror()) != NULL)
1645 {
1646 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1647 }
1648
1649
1650 // clear dl error
1651 dlerror();
1652 if (orig_cudaDeviceGetLimit == NULL) {
1653 orig_cudaDeviceGetLimit = dlsym(RTLD_NEXT, "cudaDeviceGetLimit");
1654 }
1655 if ((dl_error = dlerror()) != NULL)
1656 {
1657 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1658 }
1659
1660
1661 // clear dl error
1662 dlerror();
1663 if (orig_cudaDeviceGetCacheConfig == NULL) {
1664 orig_cudaDeviceGetCacheConfig = dlsym(RTLD_NEXT, "cudaDeviceGetCacheConfig");
1665 }
1666 if ((dl_error = dlerror()) != NULL)
1667 {
1668 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1669 }
1670
1671
1672 // clear dl error
1673 dlerror();
1674 if (orig_cudaDeviceGetStreamPriorityRange == NULL) {
1675 orig_cudaDeviceGetStreamPriorityRange = dlsym(RTLD_NEXT, "cudaDeviceGetStreamPriorityRange");
1676 }
1677 if ((dl_error = dlerror()) != NULL)
1678 {
1679 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1680 }
1681
1682
1683 // clear dl error
1684 dlerror();
1685 if (orig_cudaDeviceSetCacheConfig == NULL) {
1686 orig_cudaDeviceSetCacheConfig = dlsym(RTLD_NEXT, "cudaDeviceSetCacheConfig");
1687 }
1688 if ((dl_error = dlerror()) != NULL)
1689 {
1690 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1691 }
1692
1693
1694 // clear dl error
1695 dlerror();
1696 if (orig_cudaDeviceGetSharedMemConfig == NULL) {
1697 orig_cudaDeviceGetSharedMemConfig = dlsym(RTLD_NEXT, "cudaDeviceGetSharedMemConfig");
1698 }
1699 if ((dl_error = dlerror()) != NULL)
1700 {
1701 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1702 }
1703
1704
1705 // clear dl error
1706 dlerror();
1707 if (orig_cudaDeviceSetSharedMemConfig == NULL) {
1708 orig_cudaDeviceSetSharedMemConfig = dlsym(RTLD_NEXT, "cudaDeviceSetSharedMemConfig");
1709 }
1710 if ((dl_error = dlerror()) != NULL)
1711 {
1712 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1713 }
1714
1715
1716 // clear dl error
1717 dlerror();
1718 if (orig_cudaDeviceGetByPCIBusId == NULL) {
1719 orig_cudaDeviceGetByPCIBusId = dlsym(RTLD_NEXT, "cudaDeviceGetByPCIBusId");
1720 }
1721 if ((dl_error = dlerror()) != NULL)
1722 {
1723 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1724 }
1725
1726
1727 // clear dl error
1728 dlerror();
1729 if (orig_cudaDeviceGetPCIBusId == NULL) {
1730 orig_cudaDeviceGetPCIBusId = dlsym(RTLD_NEXT, "cudaDeviceGetPCIBusId");
1731 }
1732 if ((dl_error = dlerror()) != NULL)
1733 {
1734 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1735 }
1736
1737
1738 // clear dl error
1739 dlerror();
1740 if (orig_cudaIpcGetEventHandle == NULL) {
1741 orig_cudaIpcGetEventHandle = dlsym(RTLD_NEXT, "cudaIpcGetEventHandle");
1742 }
1743 if ((dl_error = dlerror()) != NULL)
1744 {
1745 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1746 }
1747
1748
1749 // clear dl error
1750 dlerror();
1751 if (orig_cudaIpcOpenEventHandle == NULL) {
1752 orig_cudaIpcOpenEventHandle = dlsym(RTLD_NEXT, "cudaIpcOpenEventHandle");
1753 }
1754 if ((dl_error = dlerror()) != NULL)
1755 {
1756 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1757 }
1758
1759
1760 // clear dl error
1761 dlerror();
1762 if (orig_cudaIpcGetMemHandle == NULL) {
1763 orig_cudaIpcGetMemHandle = dlsym(RTLD_NEXT, "cudaIpcGetMemHandle");
1764 }
1765 if ((dl_error = dlerror()) != NULL)
1766 {
1767 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1768 }
1769
1770
1771 // clear dl error
1772 dlerror();
1773 if (orig_cudaIpcOpenMemHandle == NULL) {
1774 orig_cudaIpcOpenMemHandle = dlsym(RTLD_NEXT, "cudaIpcOpenMemHandle");
1775 }
1776 if ((dl_error = dlerror()) != NULL)
1777 {
1778 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1779 }
1780
1781
1782 // clear dl error
1783 dlerror();
1784 if (orig_cudaIpcCloseMemHandle == NULL) {
1785 orig_cudaIpcCloseMemHandle = dlsym(RTLD_NEXT, "cudaIpcCloseMemHandle");
1786 }
1787 if ((dl_error = dlerror()) != NULL)
1788 {
1789 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1790 }
1791
1792
1793 // clear dl error
1794 dlerror();
1795 if (orig_cudaThreadExit == NULL) {
1796 orig_cudaThreadExit = dlsym(RTLD_NEXT, "cudaThreadExit");
1797 }
1798 if ((dl_error = dlerror()) != NULL)
1799 {
1800 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1801 }
1802
1803
1804 // clear dl error
1805 dlerror();
1806 if (orig_cudaThreadSynchronize == NULL) {
1807 orig_cudaThreadSynchronize = dlsym(RTLD_NEXT, "cudaThreadSynchronize");
1808 }
1809 if ((dl_error = dlerror()) != NULL)
1810 {
1811 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1812 }
1813
1814
1815 // clear dl error
1816 dlerror();
1817 if (orig_cudaThreadSetLimit == NULL) {
1818 orig_cudaThreadSetLimit = dlsym(RTLD_NEXT, "cudaThreadSetLimit");
1819 }
1820 if ((dl_error = dlerror()) != NULL)
1821 {
1822 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1823 }
1824
1825
1826 // clear dl error
1827 dlerror();
1828 if (orig_cudaThreadGetLimit == NULL) {
1829 orig_cudaThreadGetLimit = dlsym(RTLD_NEXT, "cudaThreadGetLimit");
1830 }
1831 if ((dl_error = dlerror()) != NULL)
1832 {
1833 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1834 }
1835
1836
1837 // clear dl error
1838 dlerror();
1839 if (orig_cudaThreadGetCacheConfig == NULL) {
1840 orig_cudaThreadGetCacheConfig = dlsym(RTLD_NEXT, "cudaThreadGetCacheConfig");
1841 }
1842 if ((dl_error = dlerror()) != NULL)
1843 {
1844 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1845 }
1846
1847
1848 // clear dl error
1849 dlerror();
1850 if (orig_cudaThreadSetCacheConfig == NULL) {
1851 orig_cudaThreadSetCacheConfig = dlsym(RTLD_NEXT, "cudaThreadSetCacheConfig");
1852 }
1853 if ((dl_error = dlerror()) != NULL)
1854 {
1855 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1856 }
1857
1858
1859 // clear dl error
1860 dlerror();
1861 if (orig_cudaGetLastError == NULL) {
1862 orig_cudaGetLastError = dlsym(RTLD_NEXT, "cudaGetLastError");
1863 }
1864 if ((dl_error = dlerror()) != NULL)
1865 {
1866 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1867 }
1868
1869
1870 // clear dl error
1871 dlerror();
1872 if (orig_cudaPeekAtLastError == NULL) {
1873 orig_cudaPeekAtLastError = dlsym(RTLD_NEXT, "cudaPeekAtLastError");
1874 }
1875 if ((dl_error = dlerror()) != NULL)
1876 {
1877 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1878 }
1879
1880
1881 // clear dl error
1882 dlerror();
1883 if (orig_cudaGetErrorName == NULL) {
1884 orig_cudaGetErrorName = dlsym(RTLD_NEXT, "cudaGetErrorName");
1885 }
1886 if ((dl_error = dlerror()) != NULL)
1887 {
1888 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1889 }
1890
1891
1892 // clear dl error
1893 dlerror();
1894 if (orig_cudaGetErrorString == NULL) {
1895 orig_cudaGetErrorString = dlsym(RTLD_NEXT, "cudaGetErrorString");
1896 }
1897 if ((dl_error = dlerror()) != NULL)
1898 {
1899 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1900 }
1901
1902
1903 // clear dl error
1904 dlerror();
1905 if (orig_cudaGetDeviceCount == NULL) {
1906 orig_cudaGetDeviceCount = dlsym(RTLD_NEXT, "cudaGetDeviceCount");
1907 }
1908 if ((dl_error = dlerror()) != NULL)
1909 {
1910 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1911 }
1912
1913
1914 // clear dl error
1915 dlerror();
1916 if (orig_cudaGetDeviceProperties == NULL) {
1917 orig_cudaGetDeviceProperties = dlsym(RTLD_NEXT, "cudaGetDeviceProperties");
1918 }
1919 if ((dl_error = dlerror()) != NULL)
1920 {
1921 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1922 }
1923
1924
1925 // clear dl error
1926 dlerror();
1927 if (orig_cudaDeviceGetAttribute == NULL) {
1928 orig_cudaDeviceGetAttribute = dlsym(RTLD_NEXT, "cudaDeviceGetAttribute");
1929 }
1930 if ((dl_error = dlerror()) != NULL)
1931 {
1932 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1933 }
1934
1935
1936 // clear dl error
1937 dlerror();
1938 if (orig_cudaDeviceGetP2PAttribute == NULL) {
1939 orig_cudaDeviceGetP2PAttribute = dlsym(RTLD_NEXT, "cudaDeviceGetP2PAttribute");
1940 }
1941 if ((dl_error = dlerror()) != NULL)
1942 {
1943 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1944 }
1945
1946
1947 // clear dl error
1948 dlerror();
1949 if (orig_cudaChooseDevice == NULL) {
1950 orig_cudaChooseDevice = dlsym(RTLD_NEXT, "cudaChooseDevice");
1951 }
1952 if ((dl_error = dlerror()) != NULL)
1953 {
1954 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1955 }
1956
1957
1958 // clear dl error
1959 dlerror();
1960 if (orig_cudaSetDevice == NULL) {
1961 orig_cudaSetDevice = dlsym(RTLD_NEXT, "cudaSetDevice");
1962 }
1963 if ((dl_error = dlerror()) != NULL)
1964 {
1965 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1966 }
1967
1968
1969 // clear dl error
1970 dlerror();
1971 if (orig_cudaGetDevice == NULL) {
1972 orig_cudaGetDevice = dlsym(RTLD_NEXT, "cudaGetDevice");
1973 }
1974 if ((dl_error = dlerror()) != NULL)
1975 {
1976 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1977 }
1978
1979
1980 // clear dl error
1981 dlerror();
1982 if (orig_cudaSetValidDevices == NULL) {
1983 orig_cudaSetValidDevices = dlsym(RTLD_NEXT, "cudaSetValidDevices");
1984 }
1985 if ((dl_error = dlerror()) != NULL)
1986 {
1987 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1988 }
1989
1990
1991 // clear dl error
1992 dlerror();
1993 if (orig_cudaSetDeviceFlags == NULL) {
1994 orig_cudaSetDeviceFlags = dlsym(RTLD_NEXT, "cudaSetDeviceFlags");
1995 }
1996 if ((dl_error = dlerror()) != NULL)
1997 {
1998 fprintf(stderr, ">>>>>>> %s\n", dl_error);
1999 }
2000
2001
2002 // clear dl error
2003 dlerror();
2004 if (orig_cudaGetDeviceFlags == NULL) {
2005 orig_cudaGetDeviceFlags = dlsym(RTLD_NEXT, "cudaGetDeviceFlags");
2006 }
2007 if ((dl_error = dlerror()) != NULL)
2008 {
2009 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2010 }
2011
2012
2013 // clear dl error
2014 dlerror();
2015 if (orig_cudaStreamCreate == NULL) {
2016 orig_cudaStreamCreate = dlsym(RTLD_NEXT, "cudaStreamCreate");
2017 }
2018 if ((dl_error = dlerror()) != NULL)
2019 {
2020 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2021 }
2022
2023
2024 // clear dl error
2025 dlerror();
2026 if (orig_cudaStreamCreateWithFlags == NULL) {
2027 orig_cudaStreamCreateWithFlags = dlsym(RTLD_NEXT, "cudaStreamCreateWithFlags");
2028 }
2029 if ((dl_error = dlerror()) != NULL)
2030 {
2031 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2032 }
2033
2034
2035 // clear dl error
2036 dlerror();
2037 if (orig_cudaStreamCreateWithPriority == NULL) {
2038 orig_cudaStreamCreateWithPriority = dlsym(RTLD_NEXT, "cudaStreamCreateWithPriority");
2039 }
2040 if ((dl_error = dlerror()) != NULL)
2041 {
2042 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2043 }
2044
2045
2046 // clear dl error
2047 dlerror();
2048 if (orig_cudaStreamGetPriority == NULL) {
2049 orig_cudaStreamGetPriority = dlsym(RTLD_NEXT, "cudaStreamGetPriority");
2050 }
2051 if ((dl_error = dlerror()) != NULL)
2052 {
2053 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2054 }
2055
2056
2057 // clear dl error
2058 dlerror();
2059 if (orig_cudaStreamGetFlags == NULL) {
2060 orig_cudaStreamGetFlags = dlsym(RTLD_NEXT, "cudaStreamGetFlags");
2061 }
2062 if ((dl_error = dlerror()) != NULL)
2063 {
2064 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2065 }
2066
2067
2068 // clear dl error
2069 dlerror();
2070 if (orig_cudaStreamDestroy == NULL) {
2071 orig_cudaStreamDestroy = dlsym(RTLD_NEXT, "cudaStreamDestroy");
2072 }
2073 if ((dl_error = dlerror()) != NULL)
2074 {
2075 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2076 }
2077
2078
2079 // clear dl error
2080 dlerror();
2081 if (orig_cudaStreamWaitEvent == NULL) {
2082 orig_cudaStreamWaitEvent = dlsym(RTLD_NEXT, "cudaStreamWaitEvent");
2083 }
2084 if ((dl_error = dlerror()) != NULL)
2085 {
2086 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2087 }
2088
2089
2090 // clear dl error
2091 dlerror();
2092 if (orig_cudaStreamAddCallback == NULL) {
2093 orig_cudaStreamAddCallback = dlsym(RTLD_NEXT, "cudaStreamAddCallback");
2094 }
2095 if ((dl_error = dlerror()) != NULL)
2096 {
2097 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2098 }
2099
2100
2101 // clear dl error
2102 dlerror();
2103 if (orig_cudaStreamSynchronize == NULL) {
2104 orig_cudaStreamSynchronize = dlsym(RTLD_NEXT, "cudaStreamSynchronize");
2105 }
2106 if ((dl_error = dlerror()) != NULL)
2107 {
2108 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2109 }
2110
2111
2112 // clear dl error
2113 dlerror();
2114 if (orig_cudaStreamQuery == NULL) {
2115 orig_cudaStreamQuery = dlsym(RTLD_NEXT, "cudaStreamQuery");
2116 }
2117 if ((dl_error = dlerror()) != NULL)
2118 {
2119 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2120 }
2121
2122
2123 // clear dl error
2124 dlerror();
2125 if (orig_cudaStreamAttachMemAsync == NULL) {
2126 orig_cudaStreamAttachMemAsync = dlsym(RTLD_NEXT, "cudaStreamAttachMemAsync");
2127 }
2128 if ((dl_error = dlerror()) != NULL)
2129 {
2130 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2131 }
2132
2133
2134 // clear dl error
2135 dlerror();
2136 if (orig_cudaEventCreate == NULL) {
2137 orig_cudaEventCreate = dlsym(RTLD_NEXT, "cudaEventCreate");
2138 }
2139 if ((dl_error = dlerror()) != NULL)
2140 {
2141 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2142 }
2143
2144
2145 // clear dl error
2146 dlerror();
2147 if (orig_cudaEventCreateWithFlags == NULL) {
2148 orig_cudaEventCreateWithFlags = dlsym(RTLD_NEXT, "cudaEventCreateWithFlags");
2149 }
2150 if ((dl_error = dlerror()) != NULL)
2151 {
2152 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2153 }
2154
2155
2156 // clear dl error
2157 dlerror();
2158 if (orig_cudaEventRecord == NULL) {
2159 orig_cudaEventRecord = dlsym(RTLD_NEXT, "cudaEventRecord");
2160 }
2161 if ((dl_error = dlerror()) != NULL)
2162 {
2163 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2164 }
2165
2166
2167 // clear dl error
2168 dlerror();
2169 if (orig_cudaEventQuery == NULL) {
2170 orig_cudaEventQuery = dlsym(RTLD_NEXT, "cudaEventQuery");
2171 }
2172 if ((dl_error = dlerror()) != NULL)
2173 {
2174 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2175 }
2176
2177
2178 // clear dl error
2179 dlerror();
2180 if (orig_cudaEventSynchronize == NULL) {
2181 orig_cudaEventSynchronize = dlsym(RTLD_NEXT, "cudaEventSynchronize");
2182 }
2183 if ((dl_error = dlerror()) != NULL)
2184 {
2185 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2186 }
2187
2188
2189 // clear dl error
2190 dlerror();
2191 if (orig_cudaEventDestroy == NULL) {
2192 orig_cudaEventDestroy = dlsym(RTLD_NEXT, "cudaEventDestroy");
2193 }
2194 if ((dl_error = dlerror()) != NULL)
2195 {
2196 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2197 }
2198
2199
2200 // clear dl error
2201 dlerror();
2202 if (orig_cudaEventElapsedTime == NULL) {
2203 orig_cudaEventElapsedTime = dlsym(RTLD_NEXT, "cudaEventElapsedTime");
2204 }
2205 if ((dl_error = dlerror()) != NULL)
2206 {
2207 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2208 }
2209
2210
2211 // clear dl error
2212 dlerror();
2213 if (orig_cudaLaunchKernel == NULL) {
2214 orig_cudaLaunchKernel = dlsym(RTLD_NEXT, "cudaLaunchKernel");
2215 }
2216 if ((dl_error = dlerror()) != NULL)
2217 {
2218 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2219 }
2220
2221
2222 // clear dl error
2223 dlerror();
2224 if (orig_cudaLaunchCooperativeKernel == NULL) {
2225 orig_cudaLaunchCooperativeKernel = dlsym(RTLD_NEXT, "cudaLaunchCooperativeKernel");
2226 }
2227 if ((dl_error = dlerror()) != NULL)
2228 {
2229 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2230 }
2231
2232
2233 // clear dl error
2234 dlerror();
2235 if (orig_cudaLaunchCooperativeKernelMultiDevice == NULL) {
2236 orig_cudaLaunchCooperativeKernelMultiDevice = dlsym(RTLD_NEXT, "cudaLaunchCooperativeKernelMultiDevice");
2237 }
2238 if ((dl_error = dlerror()) != NULL)
2239 {
2240 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2241 }
2242
2243
2244 // clear dl error
2245 dlerror();
2246 if (orig_cudaFuncSetCacheConfig == NULL) {
2247 orig_cudaFuncSetCacheConfig = dlsym(RTLD_NEXT, "cudaFuncSetCacheConfig");
2248 }
2249 if ((dl_error = dlerror()) != NULL)
2250 {
2251 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2252 }
2253
2254
2255 // clear dl error
2256 dlerror();
2257 if (orig_cudaFuncSetSharedMemConfig == NULL) {
2258 orig_cudaFuncSetSharedMemConfig = dlsym(RTLD_NEXT, "cudaFuncSetSharedMemConfig");
2259 }
2260 if ((dl_error = dlerror()) != NULL)
2261 {
2262 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2263 }
2264
2265
2266 // clear dl error
2267 dlerror();
2268 if (orig_cudaFuncGetAttributes == NULL) {
2269 orig_cudaFuncGetAttributes = dlsym(RTLD_NEXT, "cudaFuncGetAttributes");
2270 }
2271 if ((dl_error = dlerror()) != NULL)
2272 {
2273 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2274 }
2275
2276
2277 // clear dl error
2278 dlerror();
2279 if (orig_cudaFuncSetAttribute == NULL) {
2280 orig_cudaFuncSetAttribute = dlsym(RTLD_NEXT, "cudaFuncSetAttribute");
2281 }
2282 if ((dl_error = dlerror()) != NULL)
2283 {
2284 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2285 }
2286
2287
2288 // clear dl error
2289 dlerror();
2290 if (orig_cudaSetDoubleForDevice == NULL) {
2291 orig_cudaSetDoubleForDevice = dlsym(RTLD_NEXT, "cudaSetDoubleForDevice");
2292 }
2293 if ((dl_error = dlerror()) != NULL)
2294 {
2295 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2296 }
2297
2298
2299 // clear dl error
2300 dlerror();
2301 if (orig_cudaSetDoubleForHost == NULL) {
2302 orig_cudaSetDoubleForHost = dlsym(RTLD_NEXT, "cudaSetDoubleForHost");
2303 }
2304 if ((dl_error = dlerror()) != NULL)
2305 {
2306 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2307 }
2308
2309
2310 // clear dl error
2311 dlerror();
2312 if (orig_cudaOccupancyMaxActiveBlocksPerMultiprocessor == NULL) {
2313 orig_cudaOccupancyMaxActiveBlocksPerMultiprocessor = dlsym(RTLD_NEXT, "cudaOccupancyMaxActiveBlocksPerMultiprocessor");
2314 }
2315 if ((dl_error = dlerror()) != NULL)
2316 {
2317 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2318 }
2319
2320
2321 // clear dl error
2322 dlerror();
2323 if (orig_cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags == NULL) {
2324 orig_cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags = dlsym(RTLD_NEXT, "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags");
2325 }
2326 if ((dl_error = dlerror()) != NULL)
2327 {
2328 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2329 }
2330
2331
2332 // clear dl error
2333 dlerror();
2334 if (orig_cudaConfigureCall == NULL) {
2335 orig_cudaConfigureCall = dlsym(RTLD_NEXT, "cudaConfigureCall");
2336 }
2337 if ((dl_error = dlerror()) != NULL)
2338 {
2339 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2340 }
2341
2342
2343 // clear dl error
2344 dlerror();
2345 if (orig_cudaSetupArgument == NULL) {
2346 orig_cudaSetupArgument = dlsym(RTLD_NEXT, "cudaSetupArgument");
2347 }
2348 if ((dl_error = dlerror()) != NULL)
2349 {
2350 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2351 }
2352
2353
2354 // clear dl error
2355 dlerror();
2356 if (orig_cudaLaunch == NULL) {
2357 orig_cudaLaunch = dlsym(RTLD_NEXT, "cudaLaunch");
2358 }
2359 if ((dl_error = dlerror()) != NULL)
2360 {
2361 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2362 }
2363
2364
2365 // clear dl error
2366 dlerror();
2367 if (orig_cudaMallocManaged == NULL) {
2368 orig_cudaMallocManaged = dlsym(RTLD_NEXT, "cudaMallocManaged");
2369 }
2370 if ((dl_error = dlerror()) != NULL)
2371 {
2372 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2373 }
2374
2375
2376 // clear dl error
2377 dlerror();
2378 if (orig_cudaMalloc == NULL) {
2379 orig_cudaMalloc = dlsym(RTLD_NEXT, "cudaMalloc");
2380 }
2381 if ((dl_error = dlerror()) != NULL)
2382 {
2383 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2384 }
2385
2386
2387 // clear dl error
2388 dlerror();
2389 if (orig_cudaMallocHost == NULL) {
2390 orig_cudaMallocHost = dlsym(RTLD_NEXT, "cudaMallocHost");
2391 }
2392 if ((dl_error = dlerror()) != NULL)
2393 {
2394 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2395 }
2396
2397
2398 // clear dl error
2399 dlerror();
2400 if (orig_cudaMallocPitch == NULL) {
2401 orig_cudaMallocPitch = dlsym(RTLD_NEXT, "cudaMallocPitch");
2402 }
2403 if ((dl_error = dlerror()) != NULL)
2404 {
2405 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2406 }
2407
2408
2409 // clear dl error
2410 dlerror();
2411 if (orig_cudaMallocArray == NULL) {
2412 orig_cudaMallocArray = dlsym(RTLD_NEXT, "cudaMallocArray");
2413 }
2414 if ((dl_error = dlerror()) != NULL)
2415 {
2416 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2417 }
2418
2419
2420 // clear dl error
2421 dlerror();
2422 if (orig_cudaFree == NULL) {
2423 orig_cudaFree = dlsym(RTLD_NEXT, "cudaFree");
2424 }
2425 if ((dl_error = dlerror()) != NULL)
2426 {
2427 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2428 }
2429
2430
2431 // clear dl error
2432 dlerror();
2433 if (orig_cudaFreeHost == NULL) {
2434 orig_cudaFreeHost = dlsym(RTLD_NEXT, "cudaFreeHost");
2435 }
2436 if ((dl_error = dlerror()) != NULL)
2437 {
2438 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2439 }
2440
2441
2442 // clear dl error
2443 dlerror();
2444 if (orig_cudaFreeArray == NULL) {
2445 orig_cudaFreeArray = dlsym(RTLD_NEXT, "cudaFreeArray");
2446 }
2447 if ((dl_error = dlerror()) != NULL)
2448 {
2449 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2450 }
2451
2452
2453 // clear dl error
2454 dlerror();
2455 if (orig_cudaFreeMipmappedArray == NULL) {
2456 orig_cudaFreeMipmappedArray = dlsym(RTLD_NEXT, "cudaFreeMipmappedArray");
2457 }
2458 if ((dl_error = dlerror()) != NULL)
2459 {
2460 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2461 }
2462
2463
2464 // clear dl error
2465 dlerror();
2466 if (orig_cudaHostAlloc == NULL) {
2467 orig_cudaHostAlloc = dlsym(RTLD_NEXT, "cudaHostAlloc");
2468 }
2469 if ((dl_error = dlerror()) != NULL)
2470 {
2471 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2472 }
2473
2474
2475 // clear dl error
2476 dlerror();
2477 if (orig_cudaHostRegister == NULL) {
2478 orig_cudaHostRegister = dlsym(RTLD_NEXT, "cudaHostRegister");
2479 }
2480 if ((dl_error = dlerror()) != NULL)
2481 {
2482 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2483 }
2484
2485
2486 // clear dl error
2487 dlerror();
2488 if (orig_cudaHostUnregister == NULL) {
2489 orig_cudaHostUnregister = dlsym(RTLD_NEXT, "cudaHostUnregister");
2490 }
2491 if ((dl_error = dlerror()) != NULL)
2492 {
2493 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2494 }
2495
2496
2497 // clear dl error
2498 dlerror();
2499 if (orig_cudaHostGetDevicePointer == NULL) {
2500 orig_cudaHostGetDevicePointer = dlsym(RTLD_NEXT, "cudaHostGetDevicePointer");
2501 }
2502 if ((dl_error = dlerror()) != NULL)
2503 {
2504 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2505 }
2506
2507
2508 // clear dl error
2509 dlerror();
2510 if (orig_cudaHostGetFlags == NULL) {
2511 orig_cudaHostGetFlags = dlsym(RTLD_NEXT, "cudaHostGetFlags");
2512 }
2513 if ((dl_error = dlerror()) != NULL)
2514 {
2515 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2516 }
2517
2518
2519 // clear dl error
2520 dlerror();
2521 if (orig_cudaMalloc3D == NULL) {
2522 orig_cudaMalloc3D = dlsym(RTLD_NEXT, "cudaMalloc3D");
2523 }
2524 if ((dl_error = dlerror()) != NULL)
2525 {
2526 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2527 }
2528
2529
2530 // clear dl error
2531 dlerror();
2532 if (orig_cudaMalloc3DArray == NULL) {
2533 orig_cudaMalloc3DArray = dlsym(RTLD_NEXT, "cudaMalloc3DArray");
2534 }
2535 if ((dl_error = dlerror()) != NULL)
2536 {
2537 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2538 }
2539
2540
2541 // clear dl error
2542 dlerror();
2543 if (orig_cudaMallocMipmappedArray == NULL) {
2544 orig_cudaMallocMipmappedArray = dlsym(RTLD_NEXT, "cudaMallocMipmappedArray");
2545 }
2546 if ((dl_error = dlerror()) != NULL)
2547 {
2548 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2549 }
2550
2551
2552 // clear dl error
2553 dlerror();
2554 if (orig_cudaGetMipmappedArrayLevel == NULL) {
2555 orig_cudaGetMipmappedArrayLevel = dlsym(RTLD_NEXT, "cudaGetMipmappedArrayLevel");
2556 }
2557 if ((dl_error = dlerror()) != NULL)
2558 {
2559 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2560 }
2561
2562
2563 // clear dl error
2564 dlerror();
2565 if (orig_cudaMemcpy3D == NULL) {
2566 orig_cudaMemcpy3D = dlsym(RTLD_NEXT, "cudaMemcpy3D");
2567 }
2568 if ((dl_error = dlerror()) != NULL)
2569 {
2570 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2571 }
2572
2573
2574 // clear dl error
2575 dlerror();
2576 if (orig_cudaMemcpy3DPeer == NULL) {
2577 orig_cudaMemcpy3DPeer = dlsym(RTLD_NEXT, "cudaMemcpy3DPeer");
2578 }
2579 if ((dl_error = dlerror()) != NULL)
2580 {
2581 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2582 }
2583
2584
2585 // clear dl error
2586 dlerror();
2587 if (orig_cudaMemcpy3DAsync == NULL) {
2588 orig_cudaMemcpy3DAsync = dlsym(RTLD_NEXT, "cudaMemcpy3DAsync");
2589 }
2590 if ((dl_error = dlerror()) != NULL)
2591 {
2592 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2593 }
2594
2595
2596 // clear dl error
2597 dlerror();
2598 if (orig_cudaMemcpy3DPeerAsync == NULL) {
2599 orig_cudaMemcpy3DPeerAsync = dlsym(RTLD_NEXT, "cudaMemcpy3DPeerAsync");
2600 }
2601 if ((dl_error = dlerror()) != NULL)
2602 {
2603 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2604 }
2605
2606
2607 // clear dl error
2608 dlerror();
2609 if (orig_cudaMemGetInfo == NULL) {
2610 orig_cudaMemGetInfo = dlsym(RTLD_NEXT, "cudaMemGetInfo");
2611 }
2612 if ((dl_error = dlerror()) != NULL)
2613 {
2614 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2615 }
2616
2617
2618 // clear dl error
2619 dlerror();
2620 if (orig_cudaArrayGetInfo == NULL) {
2621 orig_cudaArrayGetInfo = dlsym(RTLD_NEXT, "cudaArrayGetInfo");
2622 }
2623 if ((dl_error = dlerror()) != NULL)
2624 {
2625 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2626 }
2627
2628
2629 // clear dl error
2630 dlerror();
2631 if (orig_cudaMemcpy == NULL) {
2632 orig_cudaMemcpy = dlsym(RTLD_NEXT, "cudaMemcpy");
2633 }
2634 if ((dl_error = dlerror()) != NULL)
2635 {
2636 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2637 }
2638
2639
2640 // clear dl error
2641 dlerror();
2642 if (orig_cudaMemcpyPeer == NULL) {
2643 orig_cudaMemcpyPeer = dlsym(RTLD_NEXT, "cudaMemcpyPeer");
2644 }
2645 if ((dl_error = dlerror()) != NULL)
2646 {
2647 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2648 }
2649
2650
2651 // clear dl error
2652 dlerror();
2653 if (orig_cudaMemcpyToArray == NULL) {
2654 orig_cudaMemcpyToArray = dlsym(RTLD_NEXT, "cudaMemcpyToArray");
2655 }
2656 if ((dl_error = dlerror()) != NULL)
2657 {
2658 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2659 }
2660
2661
2662 // clear dl error
2663 dlerror();
2664 if (orig_cudaMemcpyFromArray == NULL) {
2665 orig_cudaMemcpyFromArray = dlsym(RTLD_NEXT, "cudaMemcpyFromArray");
2666 }
2667 if ((dl_error = dlerror()) != NULL)
2668 {
2669 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2670 }
2671
2672
2673 // clear dl error
2674 dlerror();
2675 if (orig_cudaMemcpyArrayToArray == NULL) {
2676 orig_cudaMemcpyArrayToArray = dlsym(RTLD_NEXT, "cudaMemcpyArrayToArray");
2677 }
2678 if ((dl_error = dlerror()) != NULL)
2679 {
2680 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2681 }
2682
2683
2684 // clear dl error
2685 dlerror();
2686 if (orig_cudaMemcpy2D == NULL) {
2687 orig_cudaMemcpy2D = dlsym(RTLD_NEXT, "cudaMemcpy2D");
2688 }
2689 if ((dl_error = dlerror()) != NULL)
2690 {
2691 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2692 }
2693
2694
2695 // clear dl error
2696 dlerror();
2697 if (orig_cudaMemcpy2DToArray == NULL) {
2698 orig_cudaMemcpy2DToArray = dlsym(RTLD_NEXT, "cudaMemcpy2DToArray");
2699 }
2700 if ((dl_error = dlerror()) != NULL)
2701 {
2702 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2703 }
2704
2705
2706 // clear dl error
2707 dlerror();
2708 if (orig_cudaMemcpy2DFromArray == NULL) {
2709 orig_cudaMemcpy2DFromArray = dlsym(RTLD_NEXT, "cudaMemcpy2DFromArray");
2710 }
2711 if ((dl_error = dlerror()) != NULL)
2712 {
2713 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2714 }
2715
2716
2717 // clear dl error
2718 dlerror();
2719 if (orig_cudaMemcpy2DArrayToArray == NULL) {
2720 orig_cudaMemcpy2DArrayToArray = dlsym(RTLD_NEXT, "cudaMemcpy2DArrayToArray");
2721 }
2722 if ((dl_error = dlerror()) != NULL)
2723 {
2724 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2725 }
2726
2727
2728 // clear dl error
2729 dlerror();
2730 if (orig_cudaMemcpyToSymbol == NULL) {
2731 orig_cudaMemcpyToSymbol = dlsym(RTLD_NEXT, "cudaMemcpyToSymbol");
2732 }
2733 if ((dl_error = dlerror()) != NULL)
2734 {
2735 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2736 }
2737
2738
2739 // clear dl error
2740 dlerror();
2741 if (orig_cudaMemcpyFromSymbol == NULL) {
2742 orig_cudaMemcpyFromSymbol = dlsym(RTLD_NEXT, "cudaMemcpyFromSymbol");
2743 }
2744 if ((dl_error = dlerror()) != NULL)
2745 {
2746 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2747 }
2748
2749
2750 // clear dl error
2751 dlerror();
2752 if (orig_cudaMemcpyAsync == NULL) {
2753 orig_cudaMemcpyAsync = dlsym(RTLD_NEXT, "cudaMemcpyAsync");
2754 }
2755 if ((dl_error = dlerror()) != NULL)
2756 {
2757 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2758 }
2759
2760
2761 // clear dl error
2762 dlerror();
2763 if (orig_cudaMemcpyPeerAsync == NULL) {
2764 orig_cudaMemcpyPeerAsync = dlsym(RTLD_NEXT, "cudaMemcpyPeerAsync");
2765 }
2766 if ((dl_error = dlerror()) != NULL)
2767 {
2768 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2769 }
2770
2771
2772 // clear dl error
2773 dlerror();
2774 if (orig_cudaMemcpyToArrayAsync == NULL) {
2775 orig_cudaMemcpyToArrayAsync = dlsym(RTLD_NEXT, "cudaMemcpyToArrayAsync");
2776 }
2777 if ((dl_error = dlerror()) != NULL)
2778 {
2779 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2780 }
2781
2782
2783 // clear dl error
2784 dlerror();
2785 if (orig_cudaMemcpyFromArrayAsync == NULL) {
2786 orig_cudaMemcpyFromArrayAsync = dlsym(RTLD_NEXT, "cudaMemcpyFromArrayAsync");
2787 }
2788 if ((dl_error = dlerror()) != NULL)
2789 {
2790 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2791 }
2792
2793
2794 // clear dl error
2795 dlerror();
2796 if (orig_cudaMemcpy2DAsync == NULL) {
2797 orig_cudaMemcpy2DAsync = dlsym(RTLD_NEXT, "cudaMemcpy2DAsync");
2798 }
2799 if ((dl_error = dlerror()) != NULL)
2800 {
2801 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2802 }
2803
2804
2805 // clear dl error
2806 dlerror();
2807 if (orig_cudaMemcpy2DToArrayAsync == NULL) {
2808 orig_cudaMemcpy2DToArrayAsync = dlsym(RTLD_NEXT, "cudaMemcpy2DToArrayAsync");
2809 }
2810 if ((dl_error = dlerror()) != NULL)
2811 {
2812 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2813 }
2814
2815
2816 // clear dl error
2817 dlerror();
2818 if (orig_cudaMemcpy2DFromArrayAsync == NULL) {
2819 orig_cudaMemcpy2DFromArrayAsync = dlsym(RTLD_NEXT, "cudaMemcpy2DFromArrayAsync");
2820 }
2821 if ((dl_error = dlerror()) != NULL)
2822 {
2823 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2824 }
2825
2826
2827 // clear dl error
2828 dlerror();
2829 if (orig_cudaMemcpyToSymbolAsync == NULL) {
2830 orig_cudaMemcpyToSymbolAsync = dlsym(RTLD_NEXT, "cudaMemcpyToSymbolAsync");
2831 }
2832 if ((dl_error = dlerror()) != NULL)
2833 {
2834 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2835 }
2836
2837
2838 // clear dl error
2839 dlerror();
2840 if (orig_cudaMemcpyFromSymbolAsync == NULL) {
2841 orig_cudaMemcpyFromSymbolAsync = dlsym(RTLD_NEXT, "cudaMemcpyFromSymbolAsync");
2842 }
2843 if ((dl_error = dlerror()) != NULL)
2844 {
2845 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2846 }
2847
2848
2849 // clear dl error
2850 dlerror();
2851 if (orig_cudaMemset == NULL) {
2852 orig_cudaMemset = dlsym(RTLD_NEXT, "cudaMemset");
2853 }
2854 if ((dl_error = dlerror()) != NULL)
2855 {
2856 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2857 }
2858
2859
2860 // clear dl error
2861 dlerror();
2862 if (orig_cudaMemset2D == NULL) {
2863 orig_cudaMemset2D = dlsym(RTLD_NEXT, "cudaMemset2D");
2864 }
2865 if ((dl_error = dlerror()) != NULL)
2866 {
2867 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2868 }
2869
2870
2871 // clear dl error
2872 dlerror();
2873 if (orig_cudaMemset3D == NULL) {
2874 orig_cudaMemset3D = dlsym(RTLD_NEXT, "cudaMemset3D");
2875 }
2876 if ((dl_error = dlerror()) != NULL)
2877 {
2878 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2879 }
2880
2881
2882 // clear dl error
2883 dlerror();
2884 if (orig_cudaMemsetAsync == NULL) {
2885 orig_cudaMemsetAsync = dlsym(RTLD_NEXT, "cudaMemsetAsync");
2886 }
2887 if ((dl_error = dlerror()) != NULL)
2888 {
2889 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2890 }
2891
2892
2893 // clear dl error
2894 dlerror();
2895 if (orig_cudaMemset2DAsync == NULL) {
2896 orig_cudaMemset2DAsync = dlsym(RTLD_NEXT, "cudaMemset2DAsync");
2897 }
2898 if ((dl_error = dlerror()) != NULL)
2899 {
2900 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2901 }
2902
2903
2904 // clear dl error
2905 dlerror();
2906 if (orig_cudaMemset3DAsync == NULL) {
2907 orig_cudaMemset3DAsync = dlsym(RTLD_NEXT, "cudaMemset3DAsync");
2908 }
2909 if ((dl_error = dlerror()) != NULL)
2910 {
2911 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2912 }
2913
2914
2915 // clear dl error
2916 dlerror();
2917 if (orig_cudaGetSymbolAddress == NULL) {
2918 orig_cudaGetSymbolAddress = dlsym(RTLD_NEXT, "cudaGetSymbolAddress");
2919 }
2920 if ((dl_error = dlerror()) != NULL)
2921 {
2922 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2923 }
2924
2925
2926 // clear dl error
2927 dlerror();
2928 if (orig_cudaGetSymbolSize == NULL) {
2929 orig_cudaGetSymbolSize = dlsym(RTLD_NEXT, "cudaGetSymbolSize");
2930 }
2931 if ((dl_error = dlerror()) != NULL)
2932 {
2933 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2934 }
2935
2936
2937 // clear dl error
2938 dlerror();
2939 if (orig_cudaMemPrefetchAsync == NULL) {
2940 orig_cudaMemPrefetchAsync = dlsym(RTLD_NEXT, "cudaMemPrefetchAsync");
2941 }
2942 if ((dl_error = dlerror()) != NULL)
2943 {
2944 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2945 }
2946
2947
2948 // clear dl error
2949 dlerror();
2950 if (orig_cudaMemAdvise == NULL) {
2951 orig_cudaMemAdvise = dlsym(RTLD_NEXT, "cudaMemAdvise");
2952 }
2953 if ((dl_error = dlerror()) != NULL)
2954 {
2955 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2956 }
2957
2958
2959 // clear dl error
2960 dlerror();
2961 if (orig_cudaMemRangeGetAttribute == NULL) {
2962 orig_cudaMemRangeGetAttribute = dlsym(RTLD_NEXT, "cudaMemRangeGetAttribute");
2963 }
2964 if ((dl_error = dlerror()) != NULL)
2965 {
2966 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2967 }
2968
2969
2970 // clear dl error
2971 dlerror();
2972 if (orig_cudaMemRangeGetAttributes == NULL) {
2973 orig_cudaMemRangeGetAttributes = dlsym(RTLD_NEXT, "cudaMemRangeGetAttributes");
2974 }
2975 if ((dl_error = dlerror()) != NULL)
2976 {
2977 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2978 }
2979
2980
2981 // clear dl error
2982 dlerror();
2983 if (orig_cudaPointerGetAttributes == NULL) {
2984 orig_cudaPointerGetAttributes = dlsym(RTLD_NEXT, "cudaPointerGetAttributes");
2985 }
2986 if ((dl_error = dlerror()) != NULL)
2987 {
2988 fprintf(stderr, ">>>>>>> %s\n", dl_error);
2989 }
2990
2991
2992 // clear dl error
2993 dlerror();
2994 if (orig_cudaDeviceCanAccessPeer == NULL) {
2995 orig_cudaDeviceCanAccessPeer = dlsym(RTLD_NEXT, "cudaDeviceCanAccessPeer");
2996 }
2997 if ((dl_error = dlerror()) != NULL)
2998 {
2999 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3000 }
3001
3002
3003 // clear dl error
3004 dlerror();
3005 if (orig_cudaDeviceEnablePeerAccess == NULL) {
3006 orig_cudaDeviceEnablePeerAccess = dlsym(RTLD_NEXT, "cudaDeviceEnablePeerAccess");
3007 }
3008 if ((dl_error = dlerror()) != NULL)
3009 {
3010 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3011 }
3012
3013
3014 // clear dl error
3015 dlerror();
3016 if (orig_cudaDeviceDisablePeerAccess == NULL) {
3017 orig_cudaDeviceDisablePeerAccess = dlsym(RTLD_NEXT, "cudaDeviceDisablePeerAccess");
3018 }
3019 if ((dl_error = dlerror()) != NULL)
3020 {
3021 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3022 }
3023
3024
3025 // clear dl error
3026 dlerror();
3027 if (orig_cudaGraphicsUnregisterResource == NULL) {
3028 orig_cudaGraphicsUnregisterResource = dlsym(RTLD_NEXT, "cudaGraphicsUnregisterResource");
3029 }
3030 if ((dl_error = dlerror()) != NULL)
3031 {
3032 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3033 }
3034
3035
3036 // clear dl error
3037 dlerror();
3038 if (orig_cudaGraphicsResourceSetMapFlags == NULL) {
3039 orig_cudaGraphicsResourceSetMapFlags = dlsym(RTLD_NEXT, "cudaGraphicsResourceSetMapFlags");
3040 }
3041 if ((dl_error = dlerror()) != NULL)
3042 {
3043 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3044 }
3045
3046
3047 // clear dl error
3048 dlerror();
3049 if (orig_cudaGraphicsMapResources == NULL) {
3050 orig_cudaGraphicsMapResources = dlsym(RTLD_NEXT, "cudaGraphicsMapResources");
3051 }
3052 if ((dl_error = dlerror()) != NULL)
3053 {
3054 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3055 }
3056
3057
3058 // clear dl error
3059 dlerror();
3060 if (orig_cudaGraphicsUnmapResources == NULL) {
3061 orig_cudaGraphicsUnmapResources = dlsym(RTLD_NEXT, "cudaGraphicsUnmapResources");
3062 }
3063 if ((dl_error = dlerror()) != NULL)
3064 {
3065 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3066 }
3067
3068
3069 // clear dl error
3070 dlerror();
3071 if (orig_cudaGraphicsResourceGetMappedPointer == NULL) {
3072 orig_cudaGraphicsResourceGetMappedPointer = dlsym(RTLD_NEXT, "cudaGraphicsResourceGetMappedPointer");
3073 }
3074 if ((dl_error = dlerror()) != NULL)
3075 {
3076 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3077 }
3078
3079
3080 // clear dl error
3081 dlerror();
3082 if (orig_cudaGraphicsSubResourceGetMappedArray == NULL) {
3083 orig_cudaGraphicsSubResourceGetMappedArray = dlsym(RTLD_NEXT, "cudaGraphicsSubResourceGetMappedArray");
3084 }
3085 if ((dl_error = dlerror()) != NULL)
3086 {
3087 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3088 }
3089
3090
3091 // clear dl error
3092 dlerror();
3093 if (orig_cudaGraphicsResourceGetMappedMipmappedArray == NULL) {
3094 orig_cudaGraphicsResourceGetMappedMipmappedArray = dlsym(RTLD_NEXT, "cudaGraphicsResourceGetMappedMipmappedArray");
3095 }
3096 if ((dl_error = dlerror()) != NULL)
3097 {
3098 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3099 }
3100
3101
3102 // clear dl error
3103 dlerror();
3104 if (orig_cudaGetChannelDesc == NULL) {
3105 orig_cudaGetChannelDesc = dlsym(RTLD_NEXT, "cudaGetChannelDesc");
3106 }
3107 if ((dl_error = dlerror()) != NULL)
3108 {
3109 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3110 }
3111
3112
3113 // clear dl error
3114 dlerror();
3115 if (orig_cudaCreateChannelDesc == NULL) {
3116 orig_cudaCreateChannelDesc = dlsym(RTLD_NEXT, "cudaCreateChannelDesc");
3117 }
3118 if ((dl_error = dlerror()) != NULL)
3119 {
3120 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3121 }
3122
3123
3124 // clear dl error
3125 dlerror();
3126 if (orig_cudaBindTexture == NULL) {
3127 orig_cudaBindTexture = dlsym(RTLD_NEXT, "cudaBindTexture");
3128 }
3129 if ((dl_error = dlerror()) != NULL)
3130 {
3131 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3132 }
3133
3134
3135 // clear dl error
3136 dlerror();
3137 if (orig_cudaBindTexture2D == NULL) {
3138 orig_cudaBindTexture2D = dlsym(RTLD_NEXT, "cudaBindTexture2D");
3139 }
3140 if ((dl_error = dlerror()) != NULL)
3141 {
3142 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3143 }
3144
3145
3146 // clear dl error
3147 dlerror();
3148 if (orig_cudaBindTextureToArray == NULL) {
3149 orig_cudaBindTextureToArray = dlsym(RTLD_NEXT, "cudaBindTextureToArray");
3150 }
3151 if ((dl_error = dlerror()) != NULL)
3152 {
3153 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3154 }
3155
3156
3157 // clear dl error
3158 dlerror();
3159 if (orig_cudaBindTextureToMipmappedArray == NULL) {
3160 orig_cudaBindTextureToMipmappedArray = dlsym(RTLD_NEXT, "cudaBindTextureToMipmappedArray");
3161 }
3162 if ((dl_error = dlerror()) != NULL)
3163 {
3164 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3165 }
3166
3167
3168 // clear dl error
3169 dlerror();
3170 if (orig_cudaUnbindTexture == NULL) {
3171 orig_cudaUnbindTexture = dlsym(RTLD_NEXT, "cudaUnbindTexture");
3172 }
3173 if ((dl_error = dlerror()) != NULL)
3174 {
3175 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3176 }
3177
3178
3179 // clear dl error
3180 dlerror();
3181 if (orig_cudaGetTextureAlignmentOffset == NULL) {
3182 orig_cudaGetTextureAlignmentOffset = dlsym(RTLD_NEXT, "cudaGetTextureAlignmentOffset");
3183 }
3184 if ((dl_error = dlerror()) != NULL)
3185 {
3186 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3187 }
3188
3189
3190 // clear dl error
3191 dlerror();
3192 if (orig_cudaGetTextureReference == NULL) {
3193 orig_cudaGetTextureReference = dlsym(RTLD_NEXT, "cudaGetTextureReference");
3194 }
3195 if ((dl_error = dlerror()) != NULL)
3196 {
3197 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3198 }
3199
3200
3201 // clear dl error
3202 dlerror();
3203 if (orig_cudaBindSurfaceToArray == NULL) {
3204 orig_cudaBindSurfaceToArray = dlsym(RTLD_NEXT, "cudaBindSurfaceToArray");
3205 }
3206 if ((dl_error = dlerror()) != NULL)
3207 {
3208 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3209 }
3210
3211
3212 // clear dl error
3213 dlerror();
3214 if (orig_cudaGetSurfaceReference == NULL) {
3215 orig_cudaGetSurfaceReference = dlsym(RTLD_NEXT, "cudaGetSurfaceReference");
3216 }
3217 if ((dl_error = dlerror()) != NULL)
3218 {
3219 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3220 }
3221
3222
3223 // clear dl error
3224 dlerror();
3225 if (orig_cudaCreateTextureObject == NULL) {
3226 orig_cudaCreateTextureObject = dlsym(RTLD_NEXT, "cudaCreateTextureObject");
3227 }
3228 if ((dl_error = dlerror()) != NULL)
3229 {
3230 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3231 }
3232
3233
3234 // clear dl error
3235 dlerror();
3236 if (orig_cudaDestroyTextureObject == NULL) {
3237 orig_cudaDestroyTextureObject = dlsym(RTLD_NEXT, "cudaDestroyTextureObject");
3238 }
3239 if ((dl_error = dlerror()) != NULL)
3240 {
3241 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3242 }
3243
3244
3245 // clear dl error
3246 dlerror();
3247 if (orig_cudaGetTextureObjectResourceDesc == NULL) {
3248 orig_cudaGetTextureObjectResourceDesc = dlsym(RTLD_NEXT, "cudaGetTextureObjectResourceDesc");
3249 }
3250 if ((dl_error = dlerror()) != NULL)
3251 {
3252 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3253 }
3254
3255
3256 // clear dl error
3257 dlerror();
3258 if (orig_cudaGetTextureObjectTextureDesc == NULL) {
3259 orig_cudaGetTextureObjectTextureDesc = dlsym(RTLD_NEXT, "cudaGetTextureObjectTextureDesc");
3260 }
3261 if ((dl_error = dlerror()) != NULL)
3262 {
3263 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3264 }
3265
3266
3267 // clear dl error
3268 dlerror();
3269 if (orig_cudaGetTextureObjectResourceViewDesc == NULL) {
3270 orig_cudaGetTextureObjectResourceViewDesc = dlsym(RTLD_NEXT, "cudaGetTextureObjectResourceViewDesc");
3271 }
3272 if ((dl_error = dlerror()) != NULL)
3273 {
3274 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3275 }
3276
3277
3278 // clear dl error
3279 dlerror();
3280 if (orig_cudaCreateSurfaceObject == NULL) {
3281 orig_cudaCreateSurfaceObject = dlsym(RTLD_NEXT, "cudaCreateSurfaceObject");
3282 }
3283 if ((dl_error = dlerror()) != NULL)
3284 {
3285 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3286 }
3287
3288
3289 // clear dl error
3290 dlerror();
3291 if (orig_cudaDestroySurfaceObject == NULL) {
3292 orig_cudaDestroySurfaceObject = dlsym(RTLD_NEXT, "cudaDestroySurfaceObject");
3293 }
3294 if ((dl_error = dlerror()) != NULL)
3295 {
3296 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3297 }
3298
3299
3300 // clear dl error
3301 dlerror();
3302 if (orig_cudaGetSurfaceObjectResourceDesc == NULL) {
3303 orig_cudaGetSurfaceObjectResourceDesc = dlsym(RTLD_NEXT, "cudaGetSurfaceObjectResourceDesc");
3304 }
3305 if ((dl_error = dlerror()) != NULL)
3306 {
3307 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3308 }
3309
3310
3311 // clear dl error
3312 dlerror();
3313 if (orig_cudaDriverGetVersion == NULL) {
3314 orig_cudaDriverGetVersion = dlsym(RTLD_NEXT, "cudaDriverGetVersion");
3315 }
3316 if ((dl_error = dlerror()) != NULL)
3317 {
3318 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3319 }
3320
3321
3322 // clear dl error
3323 dlerror();
3324 if (orig_cudaRuntimeGetVersion == NULL) {
3325 orig_cudaRuntimeGetVersion = dlsym(RTLD_NEXT, "cudaRuntimeGetVersion");
3326 }
3327 if ((dl_error = dlerror()) != NULL)
3328 {
3329 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3330 }
3331
3332
3333 // clear dl error
3334 dlerror();
3335 if (orig_cudaGetExportTable == NULL) {
3336 orig_cudaGetExportTable = dlsym(RTLD_NEXT, "cudaGetExportTable");
3337 }
3338 if ((dl_error = dlerror()) != NULL)
3339 {
3340 fprintf(stderr, ">>>>>>> %s\n", dl_error);
3341 }
3342
3343}
diff --git a/schedAPI.h b/schedAPI.h
new file mode 100644
index 0000000..00105fc
--- /dev/null
+++ b/schedAPI.h
@@ -0,0 +1,10 @@
1/*
2 * This include file provides function prototypes for the
3 * GPU scheduler functions implemented by schedLib.c and used
4 * in the Cuda wrapper functions.
5 */
6void streamInit(pid_t my_tid, int priority);
7void schedConfCall(pid_t my_tid, void *stream, int blocks, int threads);
8void schedLaunch(pid_t my_tid);
9void schedSync(pid_t my_tid, void *stream);
10void tracePrint(void);
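/*
 * Illustrative sketch (an assumption, not part of the delivered code): a
 * libcudart wrapper built against these prototypes calls the scheduler
 * before or after the "real" CUDA entry point, roughly as follows. The
 * gettid() helper and the orig_cudaStreamSynchronize pointer are hypothetical
 * names standing in for whatever the wrapper file actually uses.
 *
 *   cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
 *       cudaError_t err = orig_cudaStreamSynchronize(stream); //real sync first
 *       schedSync(gettid(), stream);  //then let the scheduler launch any waiters
 *       return err;
 *   }
 */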
diff --git a/schedLib.c b/schedLib.c
new file mode 100644
index 0000000..d16a066
--- /dev/null
+++ b/schedLib.c
@@ -0,0 +1,629 @@
1/*
2 * This library implements a transparent extension of the NVIDIA runtime
3 * API (libcudart) that is dynamically linked with CUDA programs. This
4 * extension provides a "middleware" scheduling infrastructure that controls
5 * CUDA kernel launch requests. It is designed to control kernel scheduling
6 * for CUDA programs with the following characteristics commonly used for
7 * concurrent GPU sharing:
8 * - A main process that creates multiple threads (pthreads) sharing a
9 * single process address space (i.e., the conditions under which
10 * kernels can run concurrently on a GPU).
11 * - Each thread creates one user-defined CUDA stream (FIFO queue)
12 * that it manages and uses for invoking GPU operations. There is a
13 * one-to-one relationship between threads and streams.
14 * - The program is written to launch kernels using the angle-brackets
15 * syntax (<<<.....>>>) and synchronizes the CPU and GPU with at least
16 * one call to cudaStreamSynchronize() between successive instances of
17 * kernel launches in a given stream.
18 * - The CUDA program is dynamically linked with the CUDA library libcudart
19 *
20 * In the case of a CUDA program with multiple user-defined streams, the NVIDIA
21 * scheduling rules for choosing among multiple streams with kernels at the
22 * top of their FIFO queues are not documented. This middleware attempts to
23 * implement and control some of the scheduling choices that can be made.
24 *
25 * The library functions are transparently invoked by "wrapping" calls
26 * to certain of the original CUDA API functions (described below) and
27 * performing scheduling choices before or after invoking the "real" CUDA
28 * code. Control over which kernel launch requests can be presented to
29 * the NVIDIA software and hardware scheduling mechanisms is achieved
30 * by blocking and signaling operations on the program threads.
31 *
32 * The new library functions were designed following the fundamental
33 * principle of separation between mechanism and policy. Most of the library
34 * is for implementing the mechanisms that are required for any policy.
35 * Many scheduling policies are possible given adequate mechanisms for
36 * carrying out a given policy. The separation of mechanism and policy
37 * makes it easy to try out and evaluate different policies. In the library
38 * code, all aspects of policy are implemented in a single function,
 39 * find_best_kernel(), which returns either an identifier for a stream
40 * to launch a kernel or -1 to indicate that no new launch is allowed.
41 * The policy functions are intended to be implemented as instances
 42 * of the find_best_kernel() function each contained in a .h file named
43 * in a #include statement.
44 *
45 * For a given thread/stream, the basic sequence of actions is:
46 * - The program creates a user-defined stream using one of the calls
47 * cudaStreamCreate(), cudaStreamCreateWithFlags(), or
48 * cudaStreamCreateWithPriority(). These calls first invoke the
49 * new library function streamInit() that initializes state about the
50 * stream and, on the first call, initializes state about the GPU.
51 * The "real" cuda runtime code for creating a stream is then executed.
52 *
53 * - A CUDA kernel launch on a created stream using the angle-brackets syntax
54 * (<<<.....>>>) is compiled with nvcc by generating two API calls which
55 * result in calls to the new library functions:
56 * - cudaConfigureCall() first calls the new library function
57 * schedConfCall() which records configuration information (e.g.,
58 * number of blocks and threads) about the kernel about to be launched
59 * and then executes the "real" library code for configuring a launch.
60 * - cudaLaunch() first calls the new library function schedLaunch().
61 * This function implements a scheduling decision that determines
62 * which, if any, threads, including the one attempting a launch, should
63 * not be blocked so the actual launch can be allowed to happen. For those
64 * that are already blocked, the corresponding condition is signaled.
65 * For the current launching thread, the thread is not blocked and it
66 * executes the "real" CUDA launch. If the current launching thread
67 * should be blocked for later scheduling, the corresponding pthread
68 * condition wait is executed.
69 *
70 * - The CUDA program synchronizes the CPU thread with a launched kernel
71 * using a cudaStreamSynchronize() call. The "wrapper" for this function
72 * has a different sequence of operations. It first executes the "real"
73 * synchronization function which may result in the CPU thread being
74 * blocked until the kernel completes on the GPU. When the "real" call
75 * returns (kernel completed), the new library function schedSync() is called.
76 * It implements a scheduling decision that determines whether any blocked
77 * kernel launches can now be executed and, if so, signals the conditions
78 * that are blocking the threads attempting to launch a kernel. The kernel
79 * launch is then handled by the "real" NVIDIA scheduling functions.
80 *
81 * The limitation inherent in this design is that the underlying Linux scheduler
82 * actually determines the order in which blocked threads run when unblocked.
83 * In the case that multiple blocked threads are signalled, the actual order of
84 * launches depends on how the Linux scheduler orders the thread dispatching.
85 * Only in the case where the scheduling algorithm allows only one thread to launch
86 * a kernel at a time (essentially eliminating any concurrency) can the launch
 87 * order be made completely deterministic. In all other cases, the scheduler
88 * can only control the set of kernels that are allowed to run concurrently,
89 * not the specific order in which they start executing.
90 *
91 * The new scheduling "middleware" is implemented as a library that is compiled
92 * and linked with the cuda wrappers as a dynamic-linked load module (see the Makefile).
93 * A process has one copy of this library program and all threads created by the
94 * process share the global state for the library.
95 *
96 * IMPORTANT: Assumes that a process will create threads only with the POSIX
97 * API call pthread_create() and not use a system call like clone() directly.
98 * Also assumes that there is a one-to-one relationship between threads and
99 * streams and that the Linux thread TID is sufficient to identify a stream.
100 *
101 * Note that all new library calls have a void return. If a call returns, it
102 * can be assumed that the call completed without error.
103 * If any error is identified, the process is terminated.
104 *
105 * Written by Don Smith, Department of Computer Science,
106 * University of North Carolina at Chapel Hill.
107 * 2019.
108 */
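/* Illustrative usage sketch (an assumption about client code, not part of this
 * library): a CUDA program of the kind described above drives the wrapped calls
 * roughly as follows, one pthread per user-defined stream. The names worker and
 * myKernel are hypothetical.
 *
 *   void *worker(void *arg) {
 *       cudaStream_t s;
 *       cudaStreamCreate(&s);                     //wrapper calls streamInit()
 *       myKernel<<<blocks, threads, 0, s>>>(...); //wrapper calls schedConfCall(), then schedLaunch()
 *       cudaStreamSynchronize(s);                 //wrapper calls schedSync() after the real sync
 *       return NULL;
 *   }
 */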
109
110#define _GNU_SOURCE //feature-test macro; must precede all system headers
111#include <stdio.h>
112#include <stdlib.h>
113#include <string.h>
114#include <fcntl.h>
115#include <time.h>
116#include <errno.h>
117#include <unistd.h>
118#include <sys/syscall.h>
119#include <sys/types.h>
120
121#include <pthread.h>
122
123#define TRACE_ON 1 //set to 1 to produce a trace of launch decisions, 0 to disable
124#define MAX_SCHED_TRACE 100000
125
126#define MAX_STREAMS 4 // One per basic ARM core on TX2
127#define MAX_GPU_BLOCKS 64 //Max blocks on 2 SM TX2
128#define MAX_GPU_THREADS 4096 //Max threads on 2 SM TX2
129
130#define min(a,b) ((a) <= (b) ? (a) : (b))
131#define max(a,b) ((a) >= (b) ? (a) : (b))
132
133int trc_idx = 0;
134struct schedTrace *tr_ptr;
135struct schedTrace {
136 int stream[MAX_STREAMS];
137 int stream_threads[MAX_STREAMS];
138 int next;
139 char type[4];
140}SchedTrace[MAX_SCHED_TRACE];
141
142
143int Initialized = 0; //Only initialize GPU once -- set to 1 the first time
144enum st_states {INIT, IDLE, PENDING, READY_LAUNCH, LAUNCHED}; //stream states
145enum gpu_states {FREE, BUSY}; //gpu states
146
147// this mutex is required to protect shared stream and GPU states
148pthread_mutex_t sched_lock = PTHREAD_MUTEX_INITIALIZER;
149
150struct stream {
151 pid_t thread; //the tid (Linux thread id) of the thread "owning" the stream
152 void *stream; //the CUDA runtime pointer of the stream (not currently used).
153 int priority; //stream priority from cudaCreateStreamPriority() or 0.
154 enum st_states state; //current stream state
155 int blocks; //number of blocks in kernel ready to launch
156 int block_threads; //number of threads per block
157 int look_count; //for use in policy algorithms concerned with starvation
158 pthread_mutex_t st_lock; //required for using condition wait/signal on stream
159 pthread_cond_t st_cond; //condition variable for thread/stream block/signal
160} Stream[MAX_STREAMS]; //a thread/stream is identified by an index (str_idx) in the array.
161
162int stream_count = 0; //number of streams that have been created
163int next = 0; //index of stream that can launch the next kernel
164
165struct gpu {
166 enum gpu_states GPU_state; //current GPU state
167 int threads_occupied; //total threads allocated over both SMs
168 int kernels_dispatched; //number of kernels currently dispatched to SMs
169 int streams[MAX_STREAMS]; //for kernel executing, its thread/stream tid, else 0
170 int stream_threads[MAX_STREAMS]; //for kernel executing, its allocated threads, else 0
171};
172
173struct gpu GPU;
174
175void free_gpu_threads(pid_t my_tid, int str_idx);
176void gpu_exit(pid_t my_tid, int str_idx);
177void alloc_gpu_threads(pid_t my_tid, int str_idx);
178void gpu_run(pid_t my_tid, int str_idx);
179int find_best_kernel(void);
180void ready_launch(pid_t my_tid);
181void schedule_next(pid_t my_tid);
182void dispatch_next(int this_one, pid_t my_tid);
183int get_stream(pid_t my_tid);
184void show_gpu_state(void);
185void show_stream_state(int this_one);
186
187
188//Include here the .h file containing the scheduling policy implementation
189//in function: int find_best_kernel(void)
190#include "MinFitMinIntfR2.h"
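/* For illustration only (an assumption, not one of the delivered policies):
 * the simplest conceivable instance of find_best_kernel() is a first-fit
 * policy that launches the first READY_LAUNCH stream whose kernel fits in
 * the GPU threads still available. Like any policy instance, it must be
 * called with sched_lock held and return a stream index or -1.
 *
 *   int find_best_kernel(void) {
 *       int i, need;
 *       int available = MAX_GPU_THREADS - GPU.threads_occupied;
 *       for (i = 0; i < stream_count; i++) {
 *           if (Stream[i].state != READY_LAUNCH)
 *               continue;
 *           need = min(MAX_GPU_THREADS, Stream[i].blocks * Stream[i].block_threads);
 *           if (need <= available)
 *               return i;   //first ready kernel that fits
 *       }
 *       return -1;          //no launch allowed now
 *   }
 */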
191
192/* Function called from library wrapper of cudaDeviceReset().
193 * If any entries have been made in a trace of scheduling decisions
194 * made by find_best_kernel(), they are formatted and written to
195 * stdout.
196 */
197void tracePrint(void)
198{
199 int i, j;
200 for (i = 0; i < trc_idx; i++) {
201 fprintf(stderr, "%d %s %d ", i, SchedTrace[i].type, SchedTrace[i].next);
202 for (j = 0; j < MAX_STREAMS; j++) {
203 fprintf(stderr, "[%d, %d] ", SchedTrace[i].stream[j], SchedTrace[i].stream_threads[j]);
204 }
205 fprintf(stderr, "\n");
206 }
207}
208
209/* Function called from library wrapper of cudaStreamCreateXXXX().
210 * The stream structure at the current index into the Stream structure
211 * array is initialized. The GPU state for the created stream is
212 * also initialized. Each invocation creates a new index
213 * by incrementing stream_count. On the first invocation, the part of
214 * the GPU structure not specific to a stream is also initialized.
215 */
216void streamInit(pid_t my_tid, int priority)
217//my_tid is the thread creating a user-defined stream
218{
219 //WARNING: any flags are ignored.
220
221 //printf("cudaStreamCreate TID %d\n", my_tid);
222 //fflush(stdout);
223
224 pthread_mutex_lock(&sched_lock);
225
226 Stream[stream_count].thread = my_tid; //stream identified by tid of creating thread
227 Stream[stream_count].priority = priority;
228 Stream[stream_count].state = INIT;
229 Stream[stream_count].look_count = 0;
230
231 //stream mutex and condition variable initialized to free
232 pthread_mutex_init(&Stream[stream_count].st_lock, NULL);
233 pthread_cond_init(&Stream[stream_count].st_cond, NULL);
234
235 //initialize GPU state for this newly created stream
236 GPU.streams[stream_count] = 0; //no kernel from stream running
237 GPU.stream_threads[stream_count] = 0; //no threads allocated
238
239 stream_count += 1; //increment stream index
240
241 if (Initialized == 0) { //initialize GPU state on first stream create
242 GPU.threads_occupied = 0;
243 GPU.kernels_dispatched = 0;
244 GPU.GPU_state = FREE;
245
246 //tr_ptr = (struct schedTrace *)mem_ptr;
247 Initialized = 1;
248 }
249
250 pthread_mutex_unlock(&sched_lock);
251}
252
253/* Function called from the library wrapper of cudaConfigureCall()
254 * generated from the <<<.....>>> kernel launch statement in the
255 * CUDA program. The stream state for the stream is initialized
256 * with the block and threads/block counts for the kernel.
257 */
258void schedConfCall(pid_t my_tid, void *stream, int blocks, int threads)
259//my_tid is the thread/stream attempting to launch
260{
261 int str_idx;
262
263 pthread_mutex_lock(&sched_lock);
264
265 //printf("cudaConfigureCall TID %d stream %p blocks %d threads %d\n",
266 // my_tid, stream, blocks, threads);
267 //fflush(stdout);
268
269 // get the stream array index for the thread that "owns" this stream
270 str_idx = get_stream(my_tid);
271
272 //initialize state for the kernel that the thread is launching
273 Stream[str_idx].state = PENDING; //call configured but not launched
274 Stream[str_idx].blocks = blocks; //total blocks in the kernel
275 Stream[str_idx].block_threads = threads; //total threads per block
276
277 pthread_mutex_unlock(&sched_lock);
278}
279
280/* Function called from the library wrapper of cudaLaunch()
281 * generated from the <<<.....>>> kernel launch statement in the
282 * CUDA program. The stream state for the stream is changed to
283 * show that the kernel is ready for launching. The utility
284 * function ready_launch() is called. On return from ready_launch()
285 * this function returns to the wrapper which then invokes the
286 * "real" CUDA launch. The return from ready_launch() is
287 * immediate in the case the scheduler determines that this kernel
288 * can be launched. The call may instead result in blocking
289 * the thread if the scheduler determines that the launch should
290 * be deferred. When the blocking condition is signalled by the
291 * scheduler, ready_launch() then returns to this function.
292 */
293void schedLaunch(pid_t my_tid) {//my_tid is the thread/stream attempting to launch
294 int str_idx;
295
296 pthread_mutex_lock(&sched_lock);
297
298 //printf("cudaLaunch TID %d\n", my_tid);
299 //fflush(stdout);
300
301 // get the stream array index for the thread that "owns" this stream
302 str_idx = get_stream(my_tid);
303
304 Stream[str_idx].state = READY_LAUNCH; //kernel can be considered for scheduling
305
306 //printf("TID %d Ready, Blocks %d Threads %d\n", my_tid,
307 // Stream[str_idx].blocks, Stream[str_idx].block_threads);
308 //fflush(stdout);
309
310 // ready_launch() is called with the sched_lock still held. The function will
311 // either block (and the thread will run when signaled) or will return
312 // immediately. In either case, (a) the lock will have been unlocked, and
313 // (b) the kernel will be launched by the "real" CUDA launch.
314
315 ready_launch(my_tid); // thread/stream will launch on return or after blocking
316}
317
318
319/* Utility function called from schedLaunch(). It invokes the scheduling policy
320 * function, find_best_kernel(), one or more times to determine which, if any,
321 * streams have a kernel that is ready to launch and should be launched. For
322 * streams, other than the one that invoked schedLaunch(), having kernels that
323 * should launch, the utility function dispatch_next() is called to unblock
324 * their owning threads. If the stream owned by the calling thread has a kernel
325 * to launch, the sched_lock is released and the function just returns. If the
326 * kernel in the stream of the calling thread is to be deferred, sched_lock is
327 * released and the thread blocks with a pthread_cond_wait on its stream condition.
328 */
329void ready_launch(pid_t my_tid) {
330//my_tid is the thread/stream attempting to launch
331 int str_idx, rc;
332 int this_one;
333 int will_block;
334
335 //Must be called with sched_lock held. It must release the lock before
336 //returning or blocking and then returning.
337
338 // get the stream array index for the thread that "owns" this stream
339 str_idx = get_stream(my_tid);
340
341 will_block = 1; // assume this thread will block; cleared below if its own kernel is scheduled
342
343 do {
344 /*
345 if (TRACE_ON) {
346 printf("TID %d find new kernel on Launch\n", my_tid);
347 fflush(stdout);
348 }
349 */
350 //call the scheduling policy function. It returns a stream index for
351 //a stream in the READY_LAUNCH state with a kernel to be launched now
352 //(returns -1 if none found)
353 this_one = find_best_kernel();
354
355 if (this_one == str_idx) {//kernel from calling thread can launch
356 will_block = 0; // no block, just return
357 alloc_gpu_threads(my_tid, str_idx); //set up GPU state to launch
358 gpu_run(my_tid, str_idx);
359 Stream[str_idx].state = LAUNCHED; //kernel has been scheduled
360 }
361 else {//kernel from a different thread/stream should be launched
362 if (this_one >= 0)
363 dispatch_next(this_one, my_tid); //set state and signal
364 }
365 } while (this_one >= 0); // -1 indicates no more kernel launches now
366
367 //Must unlock so calling thread can return or block
368 pthread_mutex_unlock(&sched_lock);
369
370 if (will_block == 0)
371 return; //allows launch from calling thread to take place
372
373 // thread/stream must block until scheduler indicates its kernel can launch
374 rc = pthread_mutex_lock(&Stream[str_idx].st_lock);
375 if (rc != 0) {
376 fprintf(stderr, "TID %d Failed - Locking Stream Mutex\n", my_tid);
377 exit (-1);
378 }
379 rc = pthread_cond_wait(&Stream[str_idx].st_cond, &Stream[str_idx].st_lock);
380 if (rc != 0) {
381 fprintf(stderr, "TID %d Failed - Waiting Stream Condition\n", my_tid);
382 exit (-1);
383 }
384 rc = pthread_mutex_unlock(&Stream[str_idx].st_lock);
385 if (rc != 0) {
386 fprintf(stderr, "TID %d Failed - Unlocking Stream Mutex\n", my_tid);
387 exit (-1);
388 }
389}
390
391/* Utility function called from ready_launch() and schedule_next() to set
392 * state and signal a blocked thread/stream so it can execute the "real"
393 * CUDA launch.
394 */
395void dispatch_next(int this_one, pid_t my_tid) {//my_tid is calling thread/stream
396 //this_one is the stream index of the stream to launch a kernel
397 pid_t new_tid;
398 int rc;
399
400 //Must be called with sched_lock held; will be unlocked by caller
401
402 //new_tid is the thread/stream that has been scheduled for kernel launch
403 new_tid = Stream[this_one].thread;
404
405 alloc_gpu_threads(new_tid, this_one); //set up GPU state for launch
406 gpu_run(new_tid, this_one);
407 Stream[this_one].state = LAUNCHED; //kernel has been scheduled
408
409 //signal the blocked thread/stream so it can execute the "real" launch
410 rc = pthread_mutex_lock(&Stream[this_one].st_lock);
411 if (rc != 0) {
412 fprintf(stderr, "TID %d Failed - Locking Stream Mutex\n", my_tid);
413 exit (-1);
414 }
415 rc = pthread_cond_signal(&Stream[this_one].st_cond);
416 if (rc != 0) {
417 fprintf(stderr, "TID %d Failed - Signaling Stream Condition\n", my_tid);
418 exit (-1);
419 }
420 rc = pthread_mutex_unlock(&Stream[this_one].st_lock);
421 if (rc != 0) {
422 fprintf(stderr, "TID %d Failed - Unlocking Stream Mutex\n", my_tid);
423 exit (-1);
424 }
425}
426
427/* Function called from library wrapper of cudaStreamSynchronize().
428 * This CUDA function provides an essential notification that a kernel has
429 * completed execution on the GPU. The CUDA program must be written so
430 * that it synchronizes the CPU and GPU with at least one call to
431 * cudaStreamSynchronize() between successive instances of kernel
432 * launches on a given stream.
433 *
434 * The function sets the stream and GPU state to reflect the kernel's
435 * completion which frees GPU resources for use to execute a new
436 * kernel. It then calls the utility function, schedule_next() to
437 * schedule launches of any kernels the scheduling policy determines should
438 * be eligible to run now.
439 */
440void schedSync(pid_t my_tid, void *stream) {
441//my_tid is the thread/stream synchronizing the CPU with a GPU kernel completion
442 int str_idx;
443
444 pthread_mutex_lock(&sched_lock);
445
446 //printf("cudaStreamSynchronize TID %d stream %p\n", my_tid, stream);
447 //fflush(stdout);
448
449 // get the stream array index for the thread that "owns" this stream
450 str_idx = get_stream(my_tid);
451
452 // if the stream is idle (does not have a kernel being executed), the
453 // call is not related to kernel execution (e.g., is for an asynchronous
454 // cudaMemcpy). It can be ignored.
455 if (Stream[str_idx].state == IDLE) {
456 // unlock for return
457 pthread_mutex_unlock(&sched_lock);
458 return;
459 }
460 // still holding sched_lock here
461 free_gpu_threads(my_tid, str_idx); //set up GPU state for kernel completion
462 gpu_exit(my_tid, str_idx);
463
464 Stream[str_idx].state = IDLE; //set up stream state for kernel completion
465 Stream[str_idx].blocks = 0;
466 Stream[str_idx].block_threads = 0;
467
468 // schedule_next is called with sched_lock held. The function must
469 // release it before returning.
470 schedule_next(my_tid); // which, if any, thread/stream should launch now?
471}
472
473/* Utility function called from schedSync(). It invokes the scheduling policy
474 * function, find_best_kernel(), one or more times to determine which, if any,
475 * streams have a kernel that is ready to launch and should be launched. For
476 * streams having kernels that should launch, the utility function dispatch_next()
477 * is called to unblock their owning threads. Note that the calling thread/stream
478 * cannot have a kernel to schedule until it executes another launch.
479 */
480void schedule_next(pid_t my_tid) {
481//my_tid is the thread/stream synchronizing the CPU with a GPU kernel completion
482
483 int this_one;
484
485 //Must be called with sched_lock held. It must release the lock before
486 //returning.
487
488 do {
489 /*
490 if (TRACE_ON) {
491 printf("TID %d find new kernel on Sync\n", my_tid);
492 fflush(stdout);
493 }
494 */
495
496 //call the scheduling policy function. It returns a stream index for
497 //a stream in the READY_LAUNCH state with a kernel to be launched now
498 //(returns -1 if none found)
499 this_one = find_best_kernel();
500
501 if (this_one >= 0)
502 dispatch_next(this_one, my_tid);
503 } while (this_one >= 0); // -1 indicates no more kernel launches now
504
505 pthread_mutex_unlock(&sched_lock);
506}
507
508/* Utility function called from schedSync() to free GPU threads for a
509 * completed kernel.
510 */
511void free_gpu_threads(pid_t my_tid, int str_idx) {
512 //str_idx is the stream index of the stream with a completed kernel
513 int alloc_threads;
514
515 //Must be called with sched_lock held
516
517 //see alloc_gpu_threads() for a description of thread allocations
518 alloc_threads = min(MAX_GPU_THREADS, Stream[str_idx].blocks * Stream[str_idx].block_threads);
519 GPU.threads_occupied -= alloc_threads;
520 if (GPU.threads_occupied < 0) {
521 fprintf(stderr, "TID %d Failed - GPU Threads < 0\n", my_tid);
522 exit (-1);
523 }
524}
525
526/* Utility function called from schedSync() to set GPU stream state for
527 * a completed kernel.
528 */
529void gpu_exit(pid_t my_tid, int str_idx) {
530 //str_idx is the stream index of the stream with a completed kernel
531
532 //Must be called with sched_lock held
533
534 //printf("GPU Kernel End %d Threads\n",GPU.stream_threads[str_idx]);
535
536 GPU.streams[str_idx] = 0;
537 GPU.stream_threads[str_idx] = 0;
538
539 GPU.kernels_dispatched -= 1;
540 if (GPU.kernels_dispatched < 0) {
541 fprintf(stderr, "TID %d Failed - GPU Kernels < 0\n", my_tid);
542 exit (-1);
543 }
544
545}
546
547/* Utility function called from ready_launch() and dispatch_next() to
548 * allocate GPU threads for a kernel scheduled for launching. The total
549 * number of threads required by the kernel is computed as the number
550 * of blocks in the kernel * the number of threads per block. If the
551 * total thread count is >= MAX_GPU_THREADS, the number of allocated threads
552 * on the GPU is set to MAX_GPU_THREADS so all GPU threads are occupied
553 * until the kernel completes.
554 * NOTE: Once a kernel is launched that occupies all the GPU threads,
555 * no additional kernels can launch until that kernel completes.
556 * This prevents the GPU from concurrently executing the last blocks of
557 * a current kernel with the first blocks of a newly dispatched kernel.
558 */
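/* Worked example (illustrative, using the TX2 limits defined above): a kernel
 * of 32 blocks x 256 threads/block nominally needs 8192 threads, so its
 * allocation is capped at MAX_GPU_THREADS (4096) and it occupies the whole GPU
 * until it completes; a 4 x 128 kernel would occupy only 512 of the 4096.
 */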
559void alloc_gpu_threads(pid_t my_tid, int str_idx) {
560 //str_idx is the stream index of the stream with a scheduled kernel
561
562 int alloc_threads;
563
564 //Must be called with sched_lock held
565
566 alloc_threads = min(MAX_GPU_THREADS, Stream[str_idx].blocks * Stream[str_idx].block_threads);
567 GPU.threads_occupied += alloc_threads;
568 if (GPU.threads_occupied > MAX_GPU_THREADS) {
569 fprintf(stderr, "TID %d Failed - GPU Threads Exceeded\n", my_tid);
570 exit (-1);
571 }
572}
573
574/* Utility function called from ready_launch() and dispatch_next() to set
575 * stream-related and kernel-dispatch state on the GPU for a kernel
576 * scheduled for launching.
577 */
578void gpu_run(pid_t my_tid, int str_idx) {
579 //str_idx is the stream index of the stream with a scheduled kernel
580
581
582 //Must be called with sched_lock held
583
584 GPU.streams[str_idx] = my_tid;
585
586 //see alloc_gpu_threads for a description of thread allocations
587 GPU.stream_threads[str_idx] = min(MAX_GPU_THREADS,
588 Stream[str_idx].blocks * Stream[str_idx].block_threads);
589 /*
590 int i;
591
592 if (TRACE_ON) {
593 printf("GPU Thread Blocks [ ");
594 for (i = 0; i < stream_count; i++) {
595 if (GPU.stream_threads[i] != 0)
596 printf("%d ", GPU.stream_threads[i]);
597 }
598 printf("]\n");
599 }
600 */
601
602 GPU.kernels_dispatched += 1;
603 if (GPU.kernels_dispatched > stream_count) {
604 fprintf(stderr, "TID %d Failed - GPU Kernels > streams\n", my_tid);
605 exit (-1);
606 }
607}
608
609/* Utility function used in multiple functions to find the index in the
610 * stream array for the stream owned by the thread with TID of my_tid.
611 * The stream must have been previously created (cudaStreamCreate()).
612 */
613int get_stream(pid_t my_tid)
614{
615 //Must be called with sched_lock held
616
617 int i;
618
619 for (i = 0; i < MAX_STREAMS; i++) {
620 if (Stream[i].thread == my_tid)
621 break;
622 }
623 if (i == MAX_STREAMS) {
624 fprintf(stderr, "TID %d Failed - get_stream()\n", my_tid);
625 exit (-1);
626 }
627 return i;
628}
629