summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Joshua J Bakita <jbakita@rtsrv.cs.unc.edu> 2023-11-08 14:41:47 -0500
committer Joshua Bakita <bakitajoshua@gmail.com> 2023-11-08 15:01:24 -0500
commit 3aab3c220f3f0bcc3d3d58d0daf6fd6acf1819e2 (patch)
tree 71a0fef6595e65d42808e1f963cdd4957c2f28e6
parent b9d8f6a83a8e5fec38e9e20a54ee13838936fa10 (diff)
Expand support for printing LCE<->PCE and GRCE->LCE configuration [rtas24-ae]
Tested and working on Pascal, Volta, Volta Integrated, Turing, Ampere, and Ada. Also clean up minor spacing issues, remove an errantly added file (nvdebug.mod), and fix some inconsistencies with upstream.
-rw-r--r-- device_info_procfs.c 43
-rw-r--r-- nvdebug.h 21
-rw-r--r-- nvdebug.mod 2
-rw-r--r-- nvdebug_entry.c 111
4 files changed, 109 insertions, 68 deletions
diff --git a/device_info_procfs.c b/device_info_procfs.c
index d5350c8..168905f 100644
--- a/device_info_procfs.c
+++ b/device_info_procfs.c
@@ -9,7 +9,7 @@
9// @param off Requested offset. Updated by number of characters written. 9// @param off Requested offset. Updated by number of characters written.
10// @return -errno on error, otherwise number of bytes written to *buf 10// @return -errno on error, otherwise number of bytes written to *buf
11// Note: Parent `data` field MUST be the GPU index 11// Note: Parent `data` field MUST be the GPU index
12static ssize_t nvdebug_reg32_read(struct file *f, char __user *buf, size_t size, loff_t *off){ 12static ssize_t nvdebug_reg32_read(struct file *f, char __user *buf, size_t size, loff_t *off) {
13 char out[16]; 13 char out[16];
14 int chars_written; 14 int chars_written;
15 struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)]; 15 struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)];
@@ -22,37 +22,42 @@ static ssize_t nvdebug_reg32_read(struct file *f, char __user *buf, size_t size,
22 *off += chars_written; 22 *off += chars_written;
23 return chars_written; 23 return chars_written;
24} 24}
25static ssize_t nvdebug_read4_pascal(struct file *f, char __user *buf, size_t size, loff_t *off){ 25
26 char out[16]; 26static ssize_t nvdebug_reg_range_read(struct file *f, char __user *buf, size_t size, loff_t *off) {
27 char out[12];
27 int chars_written; 28 int chars_written;
29 uint32_t read, mask;
28 struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)]; 30 struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)];
29 void* data = PDE_DATA(file_inode(f)); 31 // See comment in nvdebug_entry.c to understand `union reg_range`
30 struct combo local_combo = *(struct combo*) &data; 32 union reg_range range;
33 range.raw = (uintptr_t)PDE_DATA(file_inode(f));
31 34
32 // 32 bit register will always take less than 16 characters to print 35 // "0x" + up to 32-bit register as hex + "\n\0" is at most 12 characters
33 if (size < 16 || *off != 0) 36 if (size < 12 || *off != 0)
34 return 0; 37 return 0;
35 if (local_combo.index % 2 == 0) 38
36 chars_written = scnprintf(out, 16, "%#0x\n", (nvdebug_readl(g, local_combo.offset) & 0x0f)); 39 // Print bits `start_bit` to `stop_bit` from 32 bits at address `offset`
37 else 40 if ((read = nvdebug_readl(g, range.offset)) == -1)
38 chars_written = scnprintf(out, 16, "%#0x\n", (nvdebug_readl(g, local_combo.offset) & 0xf0) >> 4); 41 return -EOPNOTSUPP;
42 // Setup `mask` used to throw out unused upper bits
43 mask = -1u >> (32 - range.stop_bit + range.start_bit);
44 // Throw out unused lower bits via a shift, apply the mask, and print
45 chars_written = scnprintf(out, 12, "%#0x\n", (read >> range.start_bit) & mask);
39 if (copy_to_user(buf, out, chars_written)) 46 if (copy_to_user(buf, out, chars_written))
40 printk(KERN_WARNING "Unable to copy all data for %s\n", file_dentry(f)->d_name.name); 47 printk(KERN_WARNING "Unable to copy all data for %s\n", file_dentry(f)->d_name.name);
41 *off += chars_written; 48 *off += chars_written;
42 return chars_written; 49 return chars_written;
43
44//(nvdebug_readl(g,NV_LCE_FOR_PCE_GP100(*(int*)PDE_DATA(file_inode(f))))
45
46
47
48} 50}
51
49struct file_operations nvdebug_read_reg32_file_ops = { 52struct file_operations nvdebug_read_reg32_file_ops = {
50 .read = nvdebug_reg32_read, 53 .read = nvdebug_reg32_read,
51 .llseek = default_llseek, 54 .llseek = default_llseek,
52}; 55};
53// File operation for reading 4 bits in 32 bit register (used for Pascal copy engine offsets) 56
54struct file_operations nvdebug_read4_pascal_file_ops = { 57// Generic mechanism used for printing a subset of bits from a register
55 .read = nvdebug_read4_pascal, 58// Please store a `union reg_range` rather than a `uintptr_t` in the PDE_DATA
59struct file_operations nvdebug_read_reg_range_file_ops = {
60 .read = nvdebug_reg_range_read,
56 .llseek = default_llseek, 61 .llseek = default_llseek,
57}; 62};
58 63
diff --git a/nvdebug.h b/nvdebug.h
index b0e6bb8..a9366e0 100644
--- a/nvdebug.h
+++ b/nvdebug.h
@@ -391,8 +391,11 @@ typedef union {
391#define NV_CHIP_ID_KEPLER 0x0E0 391#define NV_CHIP_ID_KEPLER 0x0E0
392#define NV_CHIP_ID_PASCAL 0x130 392#define NV_CHIP_ID_PASCAL 0x130
393#define NV_CHIP_ID_VOLTA 0x140 393#define NV_CHIP_ID_VOLTA 0x140
394#define NV_CHIP_ID_VOLTA_INTEGRATED 0x150
394#define NV_CHIP_ID_TURING 0x160 395#define NV_CHIP_ID_TURING 0x160
395#define NV_CHIP_ID_AMPERE 0x170 396#define NV_CHIP_ID_AMPERE 0x170
397#define NV_CHIP_ID_HOPPER 0x180
398#define NV_CHIP_ID_ADA 0x190
396 399
397inline static const char* ARCH2NAME(uint32_t arch) { 400inline static const char* ARCH2NAME(uint32_t arch) {
398 switch (arch) { 401 switch (arch) {
@@ -692,16 +695,20 @@ typedef union {
692// Defined number of GRCEs for a GPU 695// Defined number of GRCEs for a GPU
693# define NV_GRCE_NUM 2 696# define NV_GRCE_NUM 2
694// Defined GRCE->CE mapping offsets from nvgpu 697// Defined GRCE->CE mapping offsets from nvgpu
695#define NV_GRCE_FOR_CE(i) (0x00104034+(i)*4) 698#define NV_GRCE_FOR_CE_GP100(i) (0x00104034+(i)*4)
699#define NV_GRCE_FOR_CE_GA100(i) (0x001041c0+(i)*4)
696// Defined LCE->PCE mapping offset from nvgpu (same as ce_pce2lce_config_r(i) in nvgpu) 700// Defined LCE->PCE mapping offset from nvgpu (same as ce_pce2lce_config_r(i) in nvgpu)
701#define NV_LCE_FOR_PCE_GP100(i) (0x0010402c+(i)/2)
697#define NV_LCE_FOR_PCE_GV100(i) (0x00104040+(i)*4) 702#define NV_LCE_FOR_PCE_GV100(i) (0x00104040+(i)*4)
698#define NV_LCE_FOR_PCE_GA100(i) (0x00104100+(i)*4) 703#define NV_LCE_FOR_PCE_GA100(i) (0x00104100+(i)*4)
699#define NV_LCE_FOR_PCE_GP100(i) (0x0010402c+(i)/2) 704// Struct for use with nvdebug_reg_range_read()
700#define NV_LCE_FOR_PCE_TU104(i) (0x00104040+(i)*4) 705union reg_range {
701// Defined struct for storing PCE index and offset for proc_create 706 struct {
702struct combo { 707 uint32_t offset;
703 uint32_t offset:32; 708 uint8_t start_bit;
704 uint32_t index:32; 709 uint8_t stop_bit;
710 };
711 uint64_t raw;
705}; 712};
706 713
707/* Physical Copy Engine (PCE) information 714/* Physical Copy Engine (PCE) information
diff --git a/nvdebug.mod b/nvdebug.mod
deleted file mode 100644
index 5ffaef7..0000000
--- a/nvdebug.mod
+++ /dev/null
@@ -1,2 +0,0 @@
1/home/saman63/nvdebug/runlist_procfs.o /home/saman63/nvdebug/device_info_procfs.o /home/saman63/nvdebug/runlist.o /home/saman63/nvdebug/mmu.o /home/saman63/nvdebug/nvdebug_entry.o
2
diff --git a/nvdebug_entry.c b/nvdebug_entry.c
index 3815e06..78860e6 100644
--- a/nvdebug_entry.c
+++ b/nvdebug_entry.c
@@ -28,7 +28,8 @@ extern struct file_operations enable_channel_file_ops;
28extern struct file_operations switch_to_tsg_file_ops; 28extern struct file_operations switch_to_tsg_file_ops;
29extern struct file_operations device_info_file_ops; 29extern struct file_operations device_info_file_ops;
30extern struct file_operations nvdebug_read_reg32_file_ops; 30extern struct file_operations nvdebug_read_reg32_file_ops;
31extern struct file_operations nvdebug_read4_pascal_file_ops; 31extern struct file_operations nvdebug_read_reg_range_file_ops;
32
32// Bus types are global symbols in the kernel 33// Bus types are global symbols in the kernel
33extern struct bus_type platform_bus_type; 34extern struct bus_type platform_bus_type;
34struct nvdebug_state g_nvdebug_state[NVDEBUG_MAX_DEVICES]; 35struct nvdebug_state g_nvdebug_state[NVDEBUG_MAX_DEVICES];
@@ -261,62 +262,92 @@ int __init nvdebug_init(void) {
261 (void*)NV_FUSE_GPC); 262 (void*)NV_FUSE_GPC);
262 // In both nouveau and nvgpu, the PCE_MAP register is available on Pascal+ 263 // In both nouveau and nvgpu, the PCE_MAP register is available on Pascal+
263 if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL){ 264 if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL){
264 // Declare struct for storing pce index and offset 265 // Used for reading a subset of a register on pascal
265 struct combo local_combo; 266 union reg_range pascal_reg;
266 struct combo* local_combo_ptr = &local_combo;
267 // Create a pce mask for iteration 267 // Create a pce mask for iteration
268 u32 ce_pce_map = nvdebug_readl(&g_nvdebug_state[device_id], NV_CE_PCE_MAP); 268 u32 ce_pce_map = nvdebug_readl(&g_nvdebug_state[device_id], NV_CE_PCE_MAP);
269 char file_name[20]; 269 char file_name[21];
270 int pce_id = 0; 270 int pce_id = 0;
271 int pce_num = 0;
271 int i; 272 int i;
272 for (i = 0; i < MAP_SIZE; i++){ 273 for (pce_id = 0; pce_id < MAP_SIZE; pce_id++) {
273 // If pce is enabled, create files and iterate pce_id; otherwise, do nothing 274 // If pce is enabled, create files and iterate pce_id; otherwise, do nothing
274 if ((1 << i) & ce_pce_map){ 275 if ((1 << pce_id) & ce_pce_map) {
275 snprintf(file_name, 20, "lce_for_pce%d",pce_id); 276 snprintf(file_name, 20, "lce_for_pce%d", pce_num);
276 // Depending on GPU architecture, fetch data for the LCE of particular PCE 277 // Depending on GPU architecture, fetch data for the LCE of particular PCE
277 switch (g_nvdebug_state[res].chip_id & 0xff0){ 278 switch (g_nvdebug_state[res].chip_id & 0xff0) {
278
279 case NV_CHIP_ID_PASCAL: 279 case NV_CHIP_ID_PASCAL:
280 local_combo.offset = NV_LCE_FOR_PCE_GP100(pce_id); 280 // On Pascal, two PCE configurations are packed per-byte.
281 local_combo.index = pce_id; 281 // Work around this by leveraging that we only run on 64-bit
282 // platforms (can assume that a void* is 64-bits), and that
283 // GPU register offsets are only 32-bits. Use the other 32
284 // bits to store which bits to print.
285 pascal_reg.offset = NV_LCE_FOR_PCE_GP100(0);
286 pascal_reg.start_bit = pce_id * 4;
287 pascal_reg.stop_bit = pce_id * 4 + 4;
282 lce_for_pce_entry = proc_create_data( 288 lce_for_pce_entry = proc_create_data(
283 file_name, 0444, dir, compat_ops(&nvdebug_read4_pascal_file_ops), 289 file_name, 0444, dir, compat_ops(&nvdebug_read_reg_range_file_ops),
284 *(void**)local_combo_ptr); 290 (void*)pascal_reg.raw);
285 break; 291 break;
286 case NV_CHIP_ID_VOLTA: 292 case NV_CHIP_ID_VOLTA:
287 lce_for_pce_entry = proc_create_data( 293 case NV_CHIP_ID_VOLTA_INTEGRATED:
288 file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), 294 case NV_CHIP_ID_TURING:
289 (void*)(uintptr_t)NV_LCE_FOR_PCE_GV100(pce_id)); 295 lce_for_pce_entry = proc_create_data(
296 file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
297 (void*)(uintptr_t)NV_LCE_FOR_PCE_GV100(pce_id));
290 break; 298 break;
291 case NV_CHIP_ID_AMPERE: 299 case NV_CHIP_ID_AMPERE:
300 case NV_CHIP_ID_HOPPER:
301 case NV_CHIP_ID_ADA:
292 lce_for_pce_entry = proc_create_data( 302 lce_for_pce_entry = proc_create_data(
293 file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), 303 file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
294 (void*)(uintptr_t)NV_LCE_FOR_PCE_GA100(pce_id)); 304 (void*)(uintptr_t)NV_LCE_FOR_PCE_GA100(pce_id));
295 break; 305 break;
296 case NV_CHIP_ID_TURING:
297 lce_for_pce_entry = proc_create_data(
298 file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
299 (void*)(uintptr_t)NV_LCE_FOR_PCE_TU104(pce_id));
300 break;
301
302 }
303 // Make 2 files for 2 GRCEs
304 if (pce_id < NV_GRCE_NUM){
305 local_combo.offset = NV_GRCE_FOR_CE(pce_id);
306 local_combo.index = 0;
307 snprintf(file_name, 20, "pce_for_grce%d",pce_id);
308 grce_for_pce_entry = proc_create_data(
309 file_name, 0444, dir, compat_ops(&nvdebug_read4_pascal_file_ops),
310 *(void**)local_combo_ptr);
311 } 306 }
312 if (!lce_for_pce_entry || !grce_for_pce_entry) 307 if (!lce_for_pce_entry)
313 return -ENOMEM; 308 return -ENOMEM;
314 pce_id++; 309 pce_num++;
315 310 }
316 } 311 }
317 } 312 // We assume 2 GRCEs (reminder: GRCE0 and 1 are just LCE0 and 1)
313 for (i = 0; i < 2; i++) {
314 union reg_range grce_reg = {0};
315 snprintf(file_name, 21, "shared_lce_for_grce%d", i);
316 // The offset used here is only documented for Turing
317 // Actually, Pascal through Turing
318 // On Pascal, it's only 3 bits, every 8 bits
319 // On Volta-Turing, it start at same offset, but it's lower 4 bits, every 32 bits
320 // On Ampere+ it starts at 0x001041c0, but is the same layout as Volta-Turing
321 switch (g_nvdebug_state[res].chip_id & 0xff0) {
322 case NV_CHIP_ID_PASCAL:
323 grce_reg.offset = NV_GRCE_FOR_CE_GP100(0);
324 grce_reg.start_bit = i * 8;
325 grce_reg.stop_bit = grce_reg.start_bit + 3;
326 break;
327 case NV_CHIP_ID_VOLTA:
328 case NV_CHIP_ID_VOLTA_INTEGRATED:
329 case NV_CHIP_ID_TURING:
330 grce_reg.offset = NV_GRCE_FOR_CE_GP100(i);
331 grce_reg.start_bit = 0;
332 grce_reg.stop_bit = grce_reg.start_bit + 4;
333 break;
334 case NV_CHIP_ID_AMPERE:
335 case NV_CHIP_ID_HOPPER:
336 case NV_CHIP_ID_ADA:
337 grce_reg.offset = NV_GRCE_FOR_CE_GA100(i);
338 grce_reg.start_bit = 0;
339 grce_reg.stop_bit = grce_reg.start_bit + 4;
340 break;
341 }
342 grce_for_pce_entry = proc_create_data(
343 file_name, 0444, dir, compat_ops(&nvdebug_read_reg_range_file_ops),
344 (void*)grce_reg.raw);
345 if (!grce_for_pce_entry)
346 return -ENOMEM;
347 }
318 348
319 // TODO: Redo to num_pces 349 // TODO: Redo to num_pces
350 // Create file `/proc/gpu#/pce_map`, world readable
320 num_gpcs_entry = proc_create_data( 351 num_gpcs_entry = proc_create_data(
321 "pce_map", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), 352 "pce_map", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
322 (void*)NV_CE_PCE_MAP); 353 (void*)NV_CE_PCE_MAP);