diff options
Diffstat (limited to 'libsmctrl.c')
-rw-r--r-- | libsmctrl.c | 26 |
1 files changed, 24 insertions, 2 deletions
diff --git a/libsmctrl.c b/libsmctrl.c index dfd71b8..f932b5f 100644 --- a/libsmctrl.c +++ b/libsmctrl.c | |||
@@ -2,6 +2,22 @@ | |||
2 | * Copyright 2023 Joshua Bakita | 2 | * Copyright 2023 Joshua Bakita |
3 | * Library to control SM masks on CUDA launches. Co-opts preexisting debug | 3 | * Library to control SM masks on CUDA launches. Co-opts preexisting debug |
4 | * logic in the CUDA driver library, and thus requires a build with -lcuda. | 4 | * logic in the CUDA driver library, and thus requires a build with -lcuda. |
5 | * | ||
6 | * This file implements partitioning via three different mechanisms: | ||
7 | * - Modifying the QMD/TMD immediately prior to upload | ||
8 | * - Changing a field in CUDA's global struct that CUDA applies to the QMD/TMD | ||
9 | * - Changing a field in CUDA's stream struct that CUDA applies to the QMD/TMD | ||
10 | * This table shows the mechanism used with each CUDA version: | ||
11 | * +-----------+---------------+---------------+--------------+ | ||
12 | * | Version | Global Mask | Stream Mask | Next Mask | | ||
13 | * +-----------+---------------+---------------+--------------+ | ||
14 | * | 11.0-12.2 | TMD/QMD Hook | stream struct | TMD/QMD Hook | | ||
15 | * | 10.2 | global struct | stream struct | N/A | | ||
16 | * | 8.0-10.1 | N/A | stream struct | N/A | | ||
17 | * +-----------+---------------+---------------+--------------+ | ||
18 | * "N/A" indicates that a mask type is unsupported on that CUDA version. | ||
19 | * Please contact the authors if support is needed for a particular feature on | ||
20 | * an older CUDA version. Support for those is unimplemented, not impossible. | ||
5 | */ | 21 | */ |
6 | #include <cuda.h> | 22 | #include <cuda.h> |
7 | 23 | ||
@@ -12,6 +28,8 @@ | |||
12 | #include <stdio.h> | 28 | #include <stdio.h> |
13 | #include <unistd.h> | 29 | #include <unistd.h> |
14 | 30 | ||
31 | #include <dlfcn.h> | ||
32 | |||
15 | // In functions that do not return an error code, we favor terminating with an | 33 | // In functions that do not return an error code, we favor terminating with an |
16 | // error rather than merely printing a warning and continuing. | 34 | // error rather than merely printing a warning and continuing. |
17 | #define abort(ret, errno, ...) error_at_line(ret, errno, __FILE__, __LINE__, \ | 35 | #define abort(ret, errno, ...) error_at_line(ret, errno, __FILE__, __LINE__, \ |
@@ -49,8 +67,12 @@ static void setup_g_sm_control_10() { | |||
49 | // `cudbgReportDriverApiErrorFlags` as our reference point. (This ends | 67 | // `cudbgReportDriverApiErrorFlags` as our reference point. (This ends |
50 | // up being the closest to an intermediate table we use as part of our | 68 | // up being the closest to an intermediate table we use as part of our |
51 | // lookup---process discussed below.) | 69 | // lookup---process discussed below.) |
52 | extern uint32_t cudbgReportDriverApiErrorFlags; | 70 | // |
53 | uint32_t* sym = &cudbgReportDriverApiErrorFlags; | 71 | // Unfortunately, the symbol we reference is erroneously omitted from the
72 | // libcuda.so stub used by nvcc starting around CUDA 11.8, so we have to | ||
73 | // use dlsym to avoid build-time issues. | ||
74 | void* hndl = dlopen(NULL, RTLD_LAZY); | ||
75 | uint32_t* sym = dlsym(hndl, "cudbgReportDriverApiErrorFlags"); | ||
54 | 76 | ||
55 | // == Deriving Location: | 77 | // == Deriving Location: |
56 | // The number of CUDA devices available is co-located in the same CUDA | 78 | // The number of CUDA devices available is co-located in the same CUDA |