From 3aab3c220f3f0bcc3d3d58d0daf6fd6acf1819e2 Mon Sep 17 00:00:00 2001 From: Joshua J Bakita Date: Wed, 8 Nov 2023 14:41:47 -0500 Subject: Expand support for printing LCE<->PCE and GRCE->LCE configuration Tested working on Pascal, Volta, Volta Integrated, Turing, Ampere, and Ada. Also clean up minor spacing issues, an errantly added file (nvdebug.mod), and fix some inconsistencies with upstream. --- device_info_procfs.c | 43 +++++++++++--------- nvdebug.h | 21 ++++++---- nvdebug.mod | 2 - nvdebug_entry.c | 111 ++++++++++++++++++++++++++++++++------------------- 4 files changed, 109 insertions(+), 68 deletions(-) delete mode 100644 nvdebug.mod diff --git a/device_info_procfs.c b/device_info_procfs.c index d5350c8..168905f 100644 --- a/device_info_procfs.c +++ b/device_info_procfs.c @@ -9,7 +9,7 @@ // @param off Requested offset. Updated by number of characters written. // @return -errno on error, otherwise number of bytes written to *buf // Note: Parent `data` field MUST be the GPU index -static ssize_t nvdebug_reg32_read(struct file *f, char __user *buf, size_t size, loff_t *off){ +static ssize_t nvdebug_reg32_read(struct file *f, char __user *buf, size_t size, loff_t *off) { char out[16]; int chars_written; struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)]; @@ -22,37 +22,42 @@ static ssize_t nvdebug_reg32_read(struct file *f, char __user *buf, size_t size, *off += chars_written; return chars_written; } -static ssize_t nvdebug_read4_pascal(struct file *f, char __user *buf, size_t size, loff_t *off){ - char out[16]; + +static ssize_t nvdebug_reg_range_read(struct file *f, char __user *buf, size_t size, loff_t *off) { + char out[12]; int chars_written; + uint32_t read, mask; struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)]; - void* data = PDE_DATA(file_inode(f)); - struct combo local_combo = *(struct combo*) &data; + // See comment in nvdebug_entry.c to understand `union reg_range` + union reg_range range; + range.raw = 
(uintptr_t)PDE_DATA(file_inode(f)); - // 32 bit register will always take less than 16 characters to print - if (size < 16 || *off != 0) + // "0x" + up to 32-bit register as hex + "\n\0" is at most 12 characters + if (size < 12 || *off != 0) return 0; - if (local_combo.index % 2 == 0) - chars_written = scnprintf(out, 16, "%#0x\n", (nvdebug_readl(g, local_combo.offset) & 0x0f)); - else - chars_written = scnprintf(out, 16, "%#0x\n", (nvdebug_readl(g, local_combo.offset) & 0xf0) >> 4); + + // Print bits `start_bit` to `stop_bit` from 32 bits at address `offset` + if ((read = nvdebug_readl(g, range.offset)) == -1) + return -EOPNOTSUPP; + // Setup `mask` used to throw out unused upper bits + mask = -1u >> (32 - range.stop_bit + range.start_bit); + // Throw out unused lower bits via a shift, apply the mask, and print + chars_written = scnprintf(out, 12, "%#0x\n", (read >> range.start_bit) & mask); if (copy_to_user(buf, out, chars_written)) printk(KERN_WARNING "Unable to copy all data for %s\n", file_dentry(f)->d_name.name); *off += chars_written; return chars_written; - -//(nvdebug_readl(g,NV_LCE_FOR_PCE_GP100(*(int*)PDE_DATA(file_inode(f)))) - - - } + struct file_operations nvdebug_read_reg32_file_ops = { .read = nvdebug_reg32_read, .llseek = default_llseek, }; -// File operation for reading 4 bits in 32 bit register (used for Pascal copy engine offsets) -struct file_operations nvdebug_read4_pascal_file_ops = { - .read = nvdebug_read4_pascal, + +// Generic mechanism used for printing a subset of bits from a register +// Please store a `union reg_range` rather than a `uintptr_t` in the PDE_DATA +struct file_operations nvdebug_read_reg_range_file_ops = { + .read = nvdebug_reg_range_read, .llseek = default_llseek, }; diff --git a/nvdebug.h b/nvdebug.h index b0e6bb8..a9366e0 100644 --- a/nvdebug.h +++ b/nvdebug.h @@ -391,8 +391,11 @@ typedef union { #define NV_CHIP_ID_KEPLER 0x0E0 #define NV_CHIP_ID_PASCAL 0x130 #define NV_CHIP_ID_VOLTA 0x140 +#define 
NV_CHIP_ID_VOLTA_INTEGRATED 0x150 #define NV_CHIP_ID_TURING 0x160 #define NV_CHIP_ID_AMPERE 0x170 +#define NV_CHIP_ID_HOPPER 0x180 +#define NV_CHIP_ID_ADA 0x190 inline static const char* ARCH2NAME(uint32_t arch) { switch (arch) { @@ -692,16 +695,20 @@ typedef union { // Defined number of GRCEs for a GPU # define NV_GRCE_NUM 2 // Defined GRCE->CE mapping offsets from nvgpu -#define NV_GRCE_FOR_CE(i) (0x00104034+(i)*4) +#define NV_GRCE_FOR_CE_GP100(i) (0x00104034+(i)*4) +#define NV_GRCE_FOR_CE_GA100(i) (0x001041c0+(i)*4) // Defined LCE->PCE mapping offset from nvgpu (same as ce_pce2lce_config_r(i) in nvgpu) +#define NV_LCE_FOR_PCE_GP100(i) (0x0010402c+(i)/2) #define NV_LCE_FOR_PCE_GV100(i) (0x00104040+(i)*4) #define NV_LCE_FOR_PCE_GA100(i) (0x00104100+(i)*4) -#define NV_LCE_FOR_PCE_GP100(i) (0x0010402c+(i)/2) -#define NV_LCE_FOR_PCE_TU104(i) (0x00104040+(i)*4) -// Defined struct for storing PCE index and offset for proc_create -struct combo { - uint32_t offset:32; - uint32_t index:32; +// Struct for use with nvdebug_reg_range_read() +union reg_range { + struct { + uint32_t offset; + uint8_t start_bit; + uint8_t stop_bit; + }; + uint64_t raw; }; /* Physical Copy Engine (PCE) information diff --git a/nvdebug.mod b/nvdebug.mod deleted file mode 100644 index 5ffaef7..0000000 --- a/nvdebug.mod +++ /dev/null @@ -1,2 +0,0 @@ -/home/saman63/nvdebug/runlist_procfs.o /home/saman63/nvdebug/device_info_procfs.o /home/saman63/nvdebug/runlist.o /home/saman63/nvdebug/mmu.o /home/saman63/nvdebug/nvdebug_entry.o - diff --git a/nvdebug_entry.c b/nvdebug_entry.c index 3815e06..78860e6 100644 --- a/nvdebug_entry.c +++ b/nvdebug_entry.c @@ -28,7 +28,8 @@ extern struct file_operations enable_channel_file_ops; extern struct file_operations switch_to_tsg_file_ops; extern struct file_operations device_info_file_ops; extern struct file_operations nvdebug_read_reg32_file_ops; -extern struct file_operations nvdebug_read4_pascal_file_ops; +extern struct file_operations 
nvdebug_read_reg_range_file_ops; + // Bus types are global symbols in the kernel extern struct bus_type platform_bus_type; struct nvdebug_state g_nvdebug_state[NVDEBUG_MAX_DEVICES]; @@ -261,62 +262,92 @@ int __init nvdebug_init(void) { (void*)NV_FUSE_GPC); // In both nouveau and nvgpu, the PCE_MAP register is available on Pascal+ if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL){ - // Declare struct for storing pce index and offset - struct combo local_combo; - struct combo* local_combo_ptr = &local_combo; + // Used for reading a subset of a register on pascal + union reg_range pascal_reg; // Create a pce mask for iteration u32 ce_pce_map = nvdebug_readl(&g_nvdebug_state[device_id], NV_CE_PCE_MAP); - char file_name[20]; + char file_name[21]; int pce_id = 0; + int pce_num = 0; int i; - for (i = 0; i < MAP_SIZE; i++){ + for (pce_id = 0; pce_id < MAP_SIZE; pce_id++) { // If pce is enabled, create files and iterate pce_id; otherwise, do nothing - if ((1 << i) & ce_pce_map){ - snprintf(file_name, 20, "lce_for_pce%d",pce_id); + if ((1 << pce_id) & ce_pce_map) { + snprintf(file_name, 20, "lce_for_pce%d", pce_num); // Depending on GPU architecture, fetch data for the LCE of particular PCE - switch (g_nvdebug_state[res].chip_id & 0xff0){ - + switch (g_nvdebug_state[res].chip_id & 0xff0) { case NV_CHIP_ID_PASCAL: - local_combo.offset = NV_LCE_FOR_PCE_GP100(pce_id); - local_combo.index = pce_id; + // On Pascal, two PCE configurations are packed per-byte. + // Work around this by leveraging that we only run on 64-bit + // platforms (can assume that a void* is 64-bits), and that + // GPU register offsets are only 32-bits. Use the other 32 + // bits to store which bits to print. 
+ pascal_reg.offset = NV_LCE_FOR_PCE_GP100(0); + pascal_reg.start_bit = pce_id * 4; + pascal_reg.stop_bit = pce_id * 4 + 4; lce_for_pce_entry = proc_create_data( - file_name, 0444, dir, compat_ops(&nvdebug_read4_pascal_file_ops), - *(void**)local_combo_ptr); + file_name, 0444, dir, compat_ops(&nvdebug_read_reg_range_file_ops), + (void*)pascal_reg.raw); break; case NV_CHIP_ID_VOLTA: - lce_for_pce_entry = proc_create_data( - file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), - (void*)(uintptr_t)NV_LCE_FOR_PCE_GV100(pce_id)); + case NV_CHIP_ID_VOLTA_INTEGRATED: + case NV_CHIP_ID_TURING: + lce_for_pce_entry = proc_create_data( + file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), + (void*)(uintptr_t)NV_LCE_FOR_PCE_GV100(pce_id)); break; case NV_CHIP_ID_AMPERE: + case NV_CHIP_ID_HOPPER: + case NV_CHIP_ID_ADA: lce_for_pce_entry = proc_create_data( - file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), - (void*)(uintptr_t)NV_LCE_FOR_PCE_GA100(pce_id)); + file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), + (void*)(uintptr_t)NV_LCE_FOR_PCE_GA100(pce_id)); break; - case NV_CHIP_ID_TURING: - lce_for_pce_entry = proc_create_data( - file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), - (void*)(uintptr_t)NV_LCE_FOR_PCE_TU104(pce_id)); - break; - - } - // Make 2 files for 2 GRCEs - if (pce_id < NV_GRCE_NUM){ - local_combo.offset = NV_GRCE_FOR_CE(pce_id); - local_combo.index = 0; - snprintf(file_name, 20, "pce_for_grce%d",pce_id); - grce_for_pce_entry = proc_create_data( - file_name, 0444, dir, compat_ops(&nvdebug_read4_pascal_file_ops), - *(void**)local_combo_ptr); } - if (!lce_for_pce_entry || !grce_for_pce_entry) - return -ENOMEM; - pce_id++; - - } - } + if (!lce_for_pce_entry) + return -ENOMEM; + pce_num++; + } + } + // We assume 2 GRCEs (reminder: GRCE0 and 1 are just LCE0 and 1) + for (i = 0; i < 2; i++) { + union reg_range grce_reg = {0}; + snprintf(file_name, 21, "shared_lce_for_grce%d", i); + // The offset used 
here is only documented for Turing + // Actually, Pascal through Turing + // On Pascal, it's only 3 bits, every 8 bits + // On Volta-Turing, it starts at the same offset, but it's the lower 4 bits, every 32 bits + // On Ampere+ it starts at 0x001041c0, but is the same layout as Volta-Turing + switch (g_nvdebug_state[res].chip_id & 0xff0) { + case NV_CHIP_ID_PASCAL: + grce_reg.offset = NV_GRCE_FOR_CE_GP100(0); + grce_reg.start_bit = i * 8; + grce_reg.stop_bit = grce_reg.start_bit + 3; + break; + case NV_CHIP_ID_VOLTA: + case NV_CHIP_ID_VOLTA_INTEGRATED: + case NV_CHIP_ID_TURING: + grce_reg.offset = NV_GRCE_FOR_CE_GP100(i); + grce_reg.start_bit = 0; + grce_reg.stop_bit = grce_reg.start_bit + 4; + break; + case NV_CHIP_ID_AMPERE: + case NV_CHIP_ID_HOPPER: + case NV_CHIP_ID_ADA: + grce_reg.offset = NV_GRCE_FOR_CE_GA100(i); + grce_reg.start_bit = 0; + grce_reg.stop_bit = grce_reg.start_bit + 4; + break; + } + grce_for_pce_entry = proc_create_data( + file_name, 0444, dir, compat_ops(&nvdebug_read_reg_range_file_ops), + (void*)grce_reg.raw); + if (!grce_for_pce_entry) + return -ENOMEM; + } // TODO: Redo to num_pces + // Create file `/proc/gpu#/pce_map`, world readable num_gpcs_entry = proc_create_data( "pce_map", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), (void*)NV_CE_PCE_MAP); -- cgit v1.2.2