diff options
| author | Joshua J Bakita <jbakita@rtsrv.cs.unc.edu> | 2023-11-08 14:41:47 -0500 |
|---|---|---|
| committer | Joshua Bakita <bakitajoshua@gmail.com> | 2023-11-08 15:01:24 -0500 |
| commit | 3aab3c220f3f0bcc3d3d58d0daf6fd6acf1819e2 (patch) | |
| tree | 71a0fef6595e65d42808e1f963cdd4957c2f28e6 | |
| parent | b9d8f6a83a8e5fec38e9e20a54ee13838936fa10 (diff) | |
Expand support for printing LCE<->PCE and GRCE->LCE configuration [rtas24-ae]
Tested working on Pascal, Volta, Volta Integrated, Turing, Ampere,
and Ada.
Also clean up minor spacing issues, an errantly added file
(nvdebug.mod), and fix some inconsistencies with upstream.
| -rw-r--r-- | device_info_procfs.c | 43 | ||||
| -rw-r--r-- | nvdebug.h | 21 | ||||
| -rw-r--r-- | nvdebug.mod | 2 | ||||
| -rw-r--r-- | nvdebug_entry.c | 111 |
4 files changed, 109 insertions, 68 deletions
diff --git a/device_info_procfs.c b/device_info_procfs.c index d5350c8..168905f 100644 --- a/device_info_procfs.c +++ b/device_info_procfs.c | |||
| @@ -9,7 +9,7 @@ | |||
| 9 | // @param off Requested offset. Updated by number of characters written. | 9 | // @param off Requested offset. Updated by number of characters written. |
| 10 | // @return -errno on error, otherwise number of bytes written to *buf | 10 | // @return -errno on error, otherwise number of bytes written to *buf |
| 11 | // Note: Parent `data` field MUST be the GPU index | 11 | // Note: Parent `data` field MUST be the GPU index |
| 12 | static ssize_t nvdebug_reg32_read(struct file *f, char __user *buf, size_t size, loff_t *off){ | 12 | static ssize_t nvdebug_reg32_read(struct file *f, char __user *buf, size_t size, loff_t *off) { |
| 13 | char out[16]; | 13 | char out[16]; |
| 14 | int chars_written; | 14 | int chars_written; |
| 15 | struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)]; | 15 | struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)]; |
| @@ -22,37 +22,42 @@ static ssize_t nvdebug_reg32_read(struct file *f, char __user *buf, size_t size, | |||
| 22 | *off += chars_written; | 22 | *off += chars_written; |
| 23 | return chars_written; | 23 | return chars_written; |
| 24 | } | 24 | } |
| 25 | static ssize_t nvdebug_read4_pascal(struct file *f, char __user *buf, size_t size, loff_t *off){ | 25 | |
| 26 | char out[16]; | 26 | static ssize_t nvdebug_reg_range_read(struct file *f, char __user *buf, size_t size, loff_t *off) { |
| 27 | char out[12]; | ||
| 27 | int chars_written; | 28 | int chars_written; |
| 29 | uint32_t read, mask; | ||
| 28 | struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)]; | 30 | struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)]; |
| 29 | void* data = PDE_DATA(file_inode(f)); | 31 | // See comment in nvdebug_entry.c to understand `union reg_range` |
| 30 | struct combo local_combo = *(struct combo*) &data; | 32 | union reg_range range; |
| 33 | range.raw = (uintptr_t)PDE_DATA(file_inode(f)); | ||
| 31 | 34 | ||
| 32 | // 32 bit register will always take less than 16 characters to print | 35 | // "0x" + up to 32-bit register as hex + "\n\0" is at most 12 characters |
| 33 | if (size < 16 || *off != 0) | 36 | if (size < 12 || *off != 0) |
| 34 | return 0; | 37 | return 0; |
| 35 | if (local_combo.index % 2 == 0) | 38 | |
| 36 | chars_written = scnprintf(out, 16, "%#0x\n", (nvdebug_readl(g, local_combo.offset) & 0x0f)); | 39 | // Print bits `start_bit` to `stop_bit` from 32 bits at address `offset` |
| 37 | else | 40 | if ((read = nvdebug_readl(g, range.offset)) == -1) |
| 38 | chars_written = scnprintf(out, 16, "%#0x\n", (nvdebug_readl(g, local_combo.offset) & 0xf0) >> 4); | 41 | return -EOPNOTSUPP; |
| 42 | // Setup `mask` used to throw out unused upper bits | ||
| 43 | mask = -1u >> (32 - range.stop_bit + range.start_bit); | ||
| 44 | // Throw out unused lower bits via a shift, apply the mask, and print | ||
| 45 | chars_written = scnprintf(out, 12, "%#0x\n", (read >> range.start_bit) & mask); | ||
| 39 | if (copy_to_user(buf, out, chars_written)) | 46 | if (copy_to_user(buf, out, chars_written)) |
| 40 | printk(KERN_WARNING "Unable to copy all data for %s\n", file_dentry(f)->d_name.name); | 47 | printk(KERN_WARNING "Unable to copy all data for %s\n", file_dentry(f)->d_name.name); |
| 41 | *off += chars_written; | 48 | *off += chars_written; |
| 42 | return chars_written; | 49 | return chars_written; |
| 43 | |||
| 44 | //(nvdebug_readl(g,NV_LCE_FOR_PCE_GP100(*(int*)PDE_DATA(file_inode(f)))) | ||
| 45 | |||
| 46 | |||
| 47 | |||
| 48 | } | 50 | } |
| 51 | |||
| 49 | struct file_operations nvdebug_read_reg32_file_ops = { | 52 | struct file_operations nvdebug_read_reg32_file_ops = { |
| 50 | .read = nvdebug_reg32_read, | 53 | .read = nvdebug_reg32_read, |
| 51 | .llseek = default_llseek, | 54 | .llseek = default_llseek, |
| 52 | }; | 55 | }; |
| 53 | // File operation for reading 4 bits in 32 bit register (used for Pascal copy engine offsets) | 56 | |
| 54 | struct file_operations nvdebug_read4_pascal_file_ops = { | 57 | // Generic mechanism used for printing a subset of bits from a register |
| 55 | .read = nvdebug_read4_pascal, | 58 | // Please store a `union reg_range` rather than a `uintptr_t` in the PDE_DATA |
| 59 | struct file_operations nvdebug_read_reg_range_file_ops = { | ||
| 60 | .read = nvdebug_reg_range_read, | ||
| 56 | .llseek = default_llseek, | 61 | .llseek = default_llseek, |
| 57 | }; | 62 | }; |
| 58 | 63 | ||
| @@ -391,8 +391,11 @@ typedef union { | |||
| 391 | #define NV_CHIP_ID_KEPLER 0x0E0 | 391 | #define NV_CHIP_ID_KEPLER 0x0E0 |
| 392 | #define NV_CHIP_ID_PASCAL 0x130 | 392 | #define NV_CHIP_ID_PASCAL 0x130 |
| 393 | #define NV_CHIP_ID_VOLTA 0x140 | 393 | #define NV_CHIP_ID_VOLTA 0x140 |
| 394 | #define NV_CHIP_ID_VOLTA_INTEGRATED 0x150 | ||
| 394 | #define NV_CHIP_ID_TURING 0x160 | 395 | #define NV_CHIP_ID_TURING 0x160 |
| 395 | #define NV_CHIP_ID_AMPERE 0x170 | 396 | #define NV_CHIP_ID_AMPERE 0x170 |
| 397 | #define NV_CHIP_ID_HOPPER 0x180 | ||
| 398 | #define NV_CHIP_ID_ADA 0x190 | ||
| 396 | 399 | ||
| 397 | inline static const char* ARCH2NAME(uint32_t arch) { | 400 | inline static const char* ARCH2NAME(uint32_t arch) { |
| 398 | switch (arch) { | 401 | switch (arch) { |
| @@ -692,16 +695,20 @@ typedef union { | |||
| 692 | // Defined number of GRCEs for a GPU | 695 | // Defined number of GRCEs for a GPU |
| 693 | # define NV_GRCE_NUM 2 | 696 | # define NV_GRCE_NUM 2 |
| 694 | // Defined GRCE->CE mapping offsets from nvgpu | 697 | // Defined GRCE->CE mapping offsets from nvgpu |
| 695 | #define NV_GRCE_FOR_CE(i) (0x00104034+(i)*4) | 698 | #define NV_GRCE_FOR_CE_GP100(i) (0x00104034+(i)*4) |
| 699 | #define NV_GRCE_FOR_CE_GA100(i) (0x001041c0+(i)*4) | ||
| 696 | // Defined LCE->PCE mapping offset from nvgpu (same as ce_pce2lce_config_r(i) in nvgpu) | 700 | // Defined LCE->PCE mapping offset from nvgpu (same as ce_pce2lce_config_r(i) in nvgpu) |
| 701 | #define NV_LCE_FOR_PCE_GP100(i) (0x0010402c+(i)/2) | ||
| 697 | #define NV_LCE_FOR_PCE_GV100(i) (0x00104040+(i)*4) | 702 | #define NV_LCE_FOR_PCE_GV100(i) (0x00104040+(i)*4) |
| 698 | #define NV_LCE_FOR_PCE_GA100(i) (0x00104100+(i)*4) | 703 | #define NV_LCE_FOR_PCE_GA100(i) (0x00104100+(i)*4) |
| 699 | #define NV_LCE_FOR_PCE_GP100(i) (0x0010402c+(i)/2) | 704 | // Struct for use with nvdebug_reg_range_read() |
| 700 | #define NV_LCE_FOR_PCE_TU104(i) (0x00104040+(i)*4) | 705 | union reg_range { |
| 701 | // Defined struct for storing PCE index and offset for proc_create | 706 | struct { |
| 702 | struct combo { | 707 | uint32_t offset; |
| 703 | uint32_t offset:32; | 708 | uint8_t start_bit; |
| 704 | uint32_t index:32; | 709 | uint8_t stop_bit; |
| 710 | }; | ||
| 711 | uint64_t raw; | ||
| 705 | }; | 712 | }; |
| 706 | 713 | ||
| 707 | /* Physical Copy Engine (PCE) information | 714 | /* Physical Copy Engine (PCE) information |
diff --git a/nvdebug.mod b/nvdebug.mod deleted file mode 100644 index 5ffaef7..0000000 --- a/nvdebug.mod +++ /dev/null | |||
| @@ -1,2 +0,0 @@ | |||
| 1 | /home/saman63/nvdebug/runlist_procfs.o /home/saman63/nvdebug/device_info_procfs.o /home/saman63/nvdebug/runlist.o /home/saman63/nvdebug/mmu.o /home/saman63/nvdebug/nvdebug_entry.o | ||
| 2 | |||
diff --git a/nvdebug_entry.c b/nvdebug_entry.c index 3815e06..78860e6 100644 --- a/nvdebug_entry.c +++ b/nvdebug_entry.c | |||
| @@ -28,7 +28,8 @@ extern struct file_operations enable_channel_file_ops; | |||
| 28 | extern struct file_operations switch_to_tsg_file_ops; | 28 | extern struct file_operations switch_to_tsg_file_ops; |
| 29 | extern struct file_operations device_info_file_ops; | 29 | extern struct file_operations device_info_file_ops; |
| 30 | extern struct file_operations nvdebug_read_reg32_file_ops; | 30 | extern struct file_operations nvdebug_read_reg32_file_ops; |
| 31 | extern struct file_operations nvdebug_read4_pascal_file_ops; | 31 | extern struct file_operations nvdebug_read_reg_range_file_ops; |
| 32 | |||
| 32 | // Bus types are global symbols in the kernel | 33 | // Bus types are global symbols in the kernel |
| 33 | extern struct bus_type platform_bus_type; | 34 | extern struct bus_type platform_bus_type; |
| 34 | struct nvdebug_state g_nvdebug_state[NVDEBUG_MAX_DEVICES]; | 35 | struct nvdebug_state g_nvdebug_state[NVDEBUG_MAX_DEVICES]; |
| @@ -261,62 +262,92 @@ int __init nvdebug_init(void) { | |||
| 261 | (void*)NV_FUSE_GPC); | 262 | (void*)NV_FUSE_GPC); |
| 262 | // In both nouveau and nvgpu, the PCE_MAP register is available on Pascal+ | 263 | // In both nouveau and nvgpu, the PCE_MAP register is available on Pascal+ |
| 263 | if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL){ | 264 | if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL){ |
| 264 | // Declare struct for storing pce index and offset | 265 | // Used for reading a subset of a register on pascal |
| 265 | struct combo local_combo; | 266 | union reg_range pascal_reg; |
| 266 | struct combo* local_combo_ptr = &local_combo; | ||
| 267 | // Create a pce mask for iteration | 267 | // Create a pce mask for iteration |
| 268 | u32 ce_pce_map = nvdebug_readl(&g_nvdebug_state[device_id], NV_CE_PCE_MAP); | 268 | u32 ce_pce_map = nvdebug_readl(&g_nvdebug_state[device_id], NV_CE_PCE_MAP); |
| 269 | char file_name[20]; | 269 | char file_name[21]; |
| 270 | int pce_id = 0; | 270 | int pce_id = 0; |
| 271 | int pce_num = 0; | ||
| 271 | int i; | 272 | int i; |
| 272 | for (i = 0; i < MAP_SIZE; i++){ | 273 | for (pce_id = 0; pce_id < MAP_SIZE; pce_id++) { |
| 273 | // If pce is enabled, create files and iterate pce_id; otherwise, do nothing | 274 | // If pce is enabled, create files and iterate pce_id; otherwise, do nothing |
| 274 | if ((1 << i) & ce_pce_map){ | 275 | if ((1 << pce_id) & ce_pce_map) { |
| 275 | snprintf(file_name, 20, "lce_for_pce%d",pce_id); | 276 | snprintf(file_name, 20, "lce_for_pce%d", pce_num); |
| 276 | // Depending on GPU architecture, fetch data for the LCE of particular PCE | 277 | // Depending on GPU architecture, fetch data for the LCE of particular PCE |
| 277 | switch (g_nvdebug_state[res].chip_id & 0xff0){ | 278 | switch (g_nvdebug_state[res].chip_id & 0xff0) { |
| 278 | |||
| 279 | case NV_CHIP_ID_PASCAL: | 279 | case NV_CHIP_ID_PASCAL: |
| 280 | local_combo.offset = NV_LCE_FOR_PCE_GP100(pce_id); | 280 | // On Pascal, two PCE configurations are packed per-byte. |
| 281 | local_combo.index = pce_id; | 281 | // Work around this by leveraging that we only run on 64-bit |
| 282 | // platforms (can assume that a void* is 64-bits), and that | ||
| 283 | // GPU register offsets are only 32-bits. Use the other 32 | ||
| 284 | // bits to store which bits to print. | ||
| 285 | pascal_reg.offset = NV_LCE_FOR_PCE_GP100(0); | ||
| 286 | pascal_reg.start_bit = pce_id * 4; | ||
| 287 | pascal_reg.stop_bit = pce_id * 4 + 4; | ||
| 282 | lce_for_pce_entry = proc_create_data( | 288 | lce_for_pce_entry = proc_create_data( |
| 283 | file_name, 0444, dir, compat_ops(&nvdebug_read4_pascal_file_ops), | 289 | file_name, 0444, dir, compat_ops(&nvdebug_read_reg_range_file_ops), |
| 284 | *(void**)local_combo_ptr); | 290 | (void*)pascal_reg.raw); |
| 285 | break; | 291 | break; |
| 286 | case NV_CHIP_ID_VOLTA: | 292 | case NV_CHIP_ID_VOLTA: |
| 287 | lce_for_pce_entry = proc_create_data( | 293 | case NV_CHIP_ID_VOLTA_INTEGRATED: |
| 288 | file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), | 294 | case NV_CHIP_ID_TURING: |
| 289 | (void*)(uintptr_t)NV_LCE_FOR_PCE_GV100(pce_id)); | 295 | lce_for_pce_entry = proc_create_data( |
| 296 | file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), | ||
| 297 | (void*)(uintptr_t)NV_LCE_FOR_PCE_GV100(pce_id)); | ||
| 290 | break; | 298 | break; |
| 291 | case NV_CHIP_ID_AMPERE: | 299 | case NV_CHIP_ID_AMPERE: |
| 300 | case NV_CHIP_ID_HOPPER: | ||
| 301 | case NV_CHIP_ID_ADA: | ||
| 292 | lce_for_pce_entry = proc_create_data( | 302 | lce_for_pce_entry = proc_create_data( |
| 293 | file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), | 303 | file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), |
| 294 | (void*)(uintptr_t)NV_LCE_FOR_PCE_GA100(pce_id)); | 304 | (void*)(uintptr_t)NV_LCE_FOR_PCE_GA100(pce_id)); |
| 295 | break; | 305 | break; |
| 296 | case NV_CHIP_ID_TURING: | ||
| 297 | lce_for_pce_entry = proc_create_data( | ||
| 298 | file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), | ||
| 299 | (void*)(uintptr_t)NV_LCE_FOR_PCE_TU104(pce_id)); | ||
| 300 | break; | ||
| 301 | |||
| 302 | } | ||
| 303 | // Make 2 files for 2 GRCEs | ||
| 304 | if (pce_id < NV_GRCE_NUM){ | ||
| 305 | local_combo.offset = NV_GRCE_FOR_CE(pce_id); | ||
| 306 | local_combo.index = 0; | ||
| 307 | snprintf(file_name, 20, "pce_for_grce%d",pce_id); | ||
| 308 | grce_for_pce_entry = proc_create_data( | ||
| 309 | file_name, 0444, dir, compat_ops(&nvdebug_read4_pascal_file_ops), | ||
| 310 | *(void**)local_combo_ptr); | ||
| 311 | } | 306 | } |
| 312 | if (!lce_for_pce_entry || !grce_for_pce_entry) | 307 | if (!lce_for_pce_entry) |
| 313 | return -ENOMEM; | 308 | return -ENOMEM; |
| 314 | pce_id++; | 309 | pce_num++; |
| 315 | 310 | } | |
| 316 | } | 311 | } |
| 317 | } | 312 | // We assume 2 GRCEs (reminder: GRCE0 and 1 are just LCE0 and 1) |
| 313 | for (i = 0; i < 2; i++) { | ||
| 314 | union reg_range grce_reg = {0}; | ||
| 315 | snprintf(file_name, 21, "shared_lce_for_grce%d", i); | ||
| 316 | // The offset used here is only documented for Turing | ||
| 317 | // Actually, Pascal through Turing | ||
| 318 | // On Pascal, it's only 3 bits, every 8 bits | ||
| 319 | // On Volta-Turing, it start at same offset, but it's lower 4 bits, every 32 bits | ||
| 320 | // On Ampere+ it starts at 0x001041c0, but is the same layout as Volta-Turing | ||
| 321 | switch (g_nvdebug_state[res].chip_id & 0xff0) { | ||
| 322 | case NV_CHIP_ID_PASCAL: | ||
| 323 | grce_reg.offset = NV_GRCE_FOR_CE_GP100(0); | ||
| 324 | grce_reg.start_bit = i * 8; | ||
| 325 | grce_reg.stop_bit = grce_reg.start_bit + 3; | ||
| 326 | break; | ||
| 327 | case NV_CHIP_ID_VOLTA: | ||
| 328 | case NV_CHIP_ID_VOLTA_INTEGRATED: | ||
| 329 | case NV_CHIP_ID_TURING: | ||
| 330 | grce_reg.offset = NV_GRCE_FOR_CE_GP100(i); | ||
| 331 | grce_reg.start_bit = 0; | ||
| 332 | grce_reg.stop_bit = grce_reg.start_bit + 4; | ||
| 333 | break; | ||
| 334 | case NV_CHIP_ID_AMPERE: | ||
| 335 | case NV_CHIP_ID_HOPPER: | ||
| 336 | case NV_CHIP_ID_ADA: | ||
| 337 | grce_reg.offset = NV_GRCE_FOR_CE_GA100(i); | ||
| 338 | grce_reg.start_bit = 0; | ||
| 339 | grce_reg.stop_bit = grce_reg.start_bit + 4; | ||
| 340 | break; | ||
| 341 | } | ||
| 342 | grce_for_pce_entry = proc_create_data( | ||
| 343 | file_name, 0444, dir, compat_ops(&nvdebug_read_reg_range_file_ops), | ||
| 344 | (void*)grce_reg.raw); | ||
| 345 | if (!grce_for_pce_entry) | ||
| 346 | return -ENOMEM; | ||
| 347 | } | ||
| 318 | 348 | ||
| 319 | // TODO: Redo to num_pces | 349 | // TODO: Redo to num_pces |
| 350 | // Create file `/proc/gpu#/pce_map`, world readable | ||
| 320 | num_gpcs_entry = proc_create_data( | 351 | num_gpcs_entry = proc_create_data( |
| 321 | "pce_map", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), | 352 | "pce_map", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), |
| 322 | (void*)NV_CE_PCE_MAP); | 353 | (void*)NV_CE_PCE_MAP); |
