From ac60151ea0a4a1f3882fde3c486af870029b7977 Mon Sep 17 00:00:00 2001 From: Joshua Bakita Date: Mon, 8 Apr 2024 13:33:28 -0400 Subject: Rework LCE<->PCE and GRCE->LCE configuration printing API Rather than up to dozens of individual files exposing part of each copy engine's configuration, have one file which exposes a unified view of the full topology. Example new output on RTX 2080 Ti: $ cat /proc/gpu0/copy_topology GRCE0 -> LCE04 GRCE1 -> LCE03 LCE02 -> PCE02 LCE03 -> PCE03 LCE04 -> PCE01 Old output: $ tail -n 1 /proc/gpu0/lce_for_pce* ==> /proc/gpu0/lce_for_pce0 <== 0xf ==> /proc/gpu0/lce_for_pce1 <== 0x4 ==> /proc/gpu0/lce_for_pce2 <== 0x2 ==> /proc/gpu0/lce_for_pce3 <== 0x3 $ tail -n 1 /proc/gpu0/shared_lce_for_grce* ==> /proc/gpu0/shared_lce_for_grce0 <== 0x4 ==> /proc/gpu0/shared_lce_for_grce1 <== 0x3 Specifically: - Add `copy_topology` API - Remove `shared_lce_for_grce#` and `lce_for_pce#` APIs - Move logic from `nvdebug_entry.c` to `copy_topology_procfs.c` - Do not print PCE or Shared LCE configuration if flagged absent - Refer to LCE0 and LCE1 as GRCE0 and GRCE1 - Print by LCE ID, which is more helpful when attempting to trace how a given copy runlist maps to a physical copy engine. - Document two errata with CE registers Tested working on Pascal Integrated, Pascal, Volta Integrated, Volta, Turing, and Ampere Integrated on Linux 4.9 through 5.10. 
--- Makefile | 3 +- copy_topology_procfs.c | 206 +++++++++++++++++++++++++++++++++++++++++++++++++ nvdebug.h | 18 +++-- nvdebug_entry.c | 106 ++++--------------------- 4 files changed, 233 insertions(+), 100 deletions(-) create mode 100644 copy_topology_procfs.c diff --git a/Makefile b/Makefile index 2dc90c7..2e588fa 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,6 @@ obj-m += nvdebug.o -nvdebug-objs = runlist_procfs.o device_info_procfs.o runlist.o mmu.o nvdebug_entry.o +nvdebug-objs = runlist_procfs.o device_info_procfs.o runlist.o mmu.o \ + nvdebug_entry.o copy_topology_procfs.o KBUILD_CFLAGS += -DGIT_HASH=\"$(shell git --git-dir=$(PWD)/.git rev-parse --short HEAD)\" # -mfentry above if not building due to mcount missing diff --git a/copy_topology_procfs.c b/copy_topology_procfs.c new file mode 100644 index 0000000..cfedce7 --- /dev/null +++ b/copy_topology_procfs.c @@ -0,0 +1,206 @@ +// Copyright 2024 Joshua Bakita + +#include "nvdebug.h" + +// Maximum number of LCEs that we will print +#define MAX_LCES 32 + +/* Which Logical Copy Engine (LCE) maps to a given Physical Copy Engine (PCE)? + @param pce_id PCE index + @return LCE index if mapping, -ENODEV on no mapping, and -errno otherwise (-EOPNOTSUPP before Pascal, -EIO on a bogus register read) +*/ +int get_lce_for_pce(struct nvdebug_state *g, uint8_t pce_id) { + int res; + // LCEs only exist on Pascal+ + if (g->chip_id < NV_CHIP_ID_PASCAL) + return -EOPNOTSUPP; + + if (g->chip_id < NV_CHIP_ID_VOLTA) { + uint32_t config = nvdebug_readl(g, NV_LCE_FOR_PCE_GP100); + if (config == -1) + return -EIO; + // On Pascal, two PCE configurations are packed per-byte, so the one 32-bit register only covers PCE0-7; shifting further would be undefined. 
+ res = pce_id < 8 ? (config >> (pce_id * 4)) & 0xf : 0x7; + // 0x7 is the flag value for unconfigured on Pascal (also substituted for PCEs past the end of the register) + if (res == 0x7) + return -ENODEV; + } else if (g->chip_id < NV_CHIP_ID_AMPERE) { + res = nvdebug_readl(g, NV_LCE_FOR_PCE_GV100(pce_id)); + // On the Titan V (GV100), bogus 0xbadf3000 observed if the GPU has yet to be + // used since reset + if (res == -1 || res == 0xbadf3000) + return -EIO; + } else { + // Works through at least Ada + res = nvdebug_readl(g, NV_LCE_FOR_PCE_GA100(pce_id)); + if (res == -1) + return -EIO; + } + // At least on Volta through Ampere, 0xf is a flag value for unconfigured. + if (res == 0xf) + return -ENODEV; + return res; +} + +/* Which LCE does this GRaphics Copy Engine (GRCE) map to? + @param grce_id GRCE index + @return LCE index if mapping, -ENODEV on no mapping, and -errno otherwise (-EOPNOTSUPP before Pascal, -EIO on a bogus register read) +*/ +int get_shared_lce_for_grce(struct nvdebug_state *g, uint8_t grce_id) { + int res; + uint32_t config; + // LCEs only exist on Pascal+ + if (g->chip_id < NV_CHIP_ID_PASCAL) + return -EOPNOTSUPP; + + if (g->chip_id < NV_CHIP_ID_VOLTA) { + if ((config = nvdebug_readl(g, NV_GRCE_FOR_CE_GP100(0))) == -1) + return -EIO; + // One config per byte; the fourth bit (mask 0x8) flags if shared + if (((config >> (grce_id * 8)) & 0x8) == 0) + return -ENODEV; + // lower 3 bits contain the mapping + res = (config >> (grce_id * 8)) & 0x7; + } else if (g->chip_id < NV_CHIP_ID_AMPERE) { + if ((config = nvdebug_readl(g, NV_GRCE_FOR_CE_GP100(grce_id))) == -1) + return -EIO; + // Only the lower 4 bits contain the mapping + res = config & 0xf; + if (res == 0xf) + return -ENODEV; + } else { + // Works through at least Ada + if ((config = nvdebug_readl(g, NV_GRCE_FOR_CE_GA100(grce_id))) == -1) + return -EIO; + // Only the lower 4 bits contain the mapping + res = config & 0xf; + if (res == 0xf) + return -ENODEV; + } + return res; +} + +typedef struct { + enum {INVALID_CE, SHARED_LCE, PCE} type; + uint8_t ce; +} lce2pce_entry_t; + +/* Which PCE/LCE is each LCE mapped to? 
+ @param lce2pce Array of lce2pce_entry_t to store mappings in + @param lce2pce_len Number of array entries; at least 16 recommended + @return -errno on error (-ERANGE if an LCE index does not fit in lce2pce), 0 on success. +*/ +int get_pces_for_lces(struct nvdebug_state *g, lce2pce_entry_t *lce2pce, int lce2pce_len) { + uint32_t pce_id, grce_id, ce_pce_map; + memset(lce2pce, INVALID_CE, lce2pce_len * sizeof(lce2pce_entry_t)); + + if ((ce_pce_map = nvdebug_readl(g, NV_CE_PCE_MAP)) == -1) + return -EIO; + // Pull configuration for LCEs which directly map to a PCE + for (pce_id = 0; pce_id < NV_CE_PCE_MAP_SIZE; pce_id++) { + int lce; + // Skip reading configuration if PCE is disabled (unsigned 1 so that bit 31 can be tested without signed overflow) + if (((1u << pce_id) & ce_pce_map) == 0) + continue; + lce = get_lce_for_pce(g, pce_id); + if (lce == -ENODEV) + continue; + if (lce < 0) + return lce; + if (lce >= lce2pce_len) + return -ERANGE; + lce2pce[lce].type = PCE; + lce2pce[lce].ce = pce_id; + } + // Pull configuration for LCEs which share a PCE with another LCE + // GRCE0 is synonymous with LCE0 (GRCE1 and LCE1 likewise) + // Only aware of up to two GRCEs per GPU (and never index past the end of the caller's table) + for (grce_id = 0; grce_id < NV_GRCE_MAX && (int)grce_id < lce2pce_len; grce_id++) { + int shared_lce; + // GRCEs with a PCE already associated do not share with an LCE + if (lce2pce[grce_id].type != INVALID_CE) + continue; + shared_lce = get_shared_lce_for_grce(g, grce_id); + // Each GRCE should be associated with a PCE or shared LCE + if (shared_lce == -ENODEV) { + printk(KERN_WARNING "[nvdebug] GRCE%d unconfigured.\n", grce_id); + continue; + } + if (shared_lce < 0) + return shared_lce; + lce2pce[grce_id].type = SHARED_LCE; + lce2pce[grce_id].ce = shared_lce; + } + return 0; +} + +typedef struct { + int idx; // Index of LCE to print + lce2pce_entry_t lce2pce[MAX_LCES]; // MAX_LCES-length table from get_pces_for_lces() +} copy_topology_iter_t; + +// The *_seq_* functions in this file follow the patterns in +// device_info_procfs.c. See there for comments on implementation. 
+static void *copy_topology_file_seq_start(struct seq_file *s, loff_t *pos) { + struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(s->file)]; + static copy_topology_iter_t iter; + int err; + // iter is static (shared by every reader), so always resync idx to the seq_file position + iter.idx = *pos; + if ((err = get_pces_for_lces(g, iter.lce2pce, MAX_LCES)) < 0) + return ERR_PTR(err); + if (iter.idx >= MAX_LCES) + return NULL; + return &iter; +} + +static void* copy_topology_file_seq_next(struct seq_file *s, void *iter_raw, + loff_t *pos) { + copy_topology_iter_t *iter = (copy_topology_iter_t*)iter_raw; + (*pos)++; // Required by seq interface; keeps *pos in lock-step with idx + if (++iter->idx >= MAX_LCES) + return NULL; + return iter; +} + +static int copy_topology_file_seq_show(struct seq_file *s, void *iter_raw) { + copy_topology_iter_t *iter = (copy_topology_iter_t*)iter_raw; + lce2pce_entry_t entry = iter->lce2pce[iter->idx]; + if (entry.type == INVALID_CE) + return 0; + // First half: The LCE/GRCE in question + if (iter->idx >= NV_GRCE_MAX) + seq_printf(s, "LCE%02d -> ", iter->idx); + else + seq_printf(s, "GRCE%d -> ", iter->idx); + // Second half: The PCE/LCE/GRCE that the LCE/GRCE in question is mapped to + if (entry.type == PCE) + seq_printf(s, "PCE%02d\n", entry.ce); + else if (entry.ce >= NV_GRCE_MAX) // Shared LCE + seq_printf(s, "LCE%02d\n", entry.ce); + else // Shared GRCE + seq_printf(s, "GRCE%d\n", entry.ce); + return 0; +} + +static void copy_topology_file_seq_stop(struct seq_file *s, void *lce2pce) { + // No cleanup needed +} + +static const struct seq_operations copy_topology_file_seq_ops = { + .start = copy_topology_file_seq_start, + .next = copy_topology_file_seq_next, + .show = copy_topology_file_seq_show, + .stop = copy_topology_file_seq_stop, +}; + +static int copy_topology_file_open(struct inode *inode, struct file *f) { + return seq_open(f, &copy_topology_file_seq_ops); +} + +struct file_operations copy_topology_file_ops = { + .open = copy_topology_file_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git 
a/nvdebug.h b/nvdebug.h index a9366e0..ac254f0 100644 --- a/nvdebug.h +++ b/nvdebug.h @@ -1,4 +1,4 @@ -/* Copyright 2021 Joshua Bakita +/* Copyright 2024 Joshua Bakita * SPDX-License-Identifier: MIT * * File outline: @@ -688,17 +688,20 @@ typedef union { SCAL_NUM_CES : Number of externally accessible copy engines + Errata: Incorrectly reports "3" on Jetson TX1 and TX2. Should report "1" to be + consistent with PTOP data. + Support: Kepler through (at least) Blackwell Also see dev_ce.ref.txt of NVIDIA's open-gpu-doc for info. */ #define NV_PTOP_SCAL_NUM_CES 0x00022444 -// Defined number of GRCEs for a GPU -# define NV_GRCE_NUM 2 +// Defined max number of GRCEs for a GPU (TX2 has only one) +# define NV_GRCE_MAX 2 // Defined GRCE->CE mapping offsets from nvgpu #define NV_GRCE_FOR_CE_GP100(i) (0x00104034+(i)*4) #define NV_GRCE_FOR_CE_GA100(i) (0x001041c0+(i)*4) // Defined LCE->PCE mapping offset from nvgpu (same as ce_pce2lce_config_r(i) in nvgpu) -#define NV_LCE_FOR_PCE_GP100(i) (0x0010402c+(i)/2) +#define NV_LCE_FOR_PCE_GP100 0x0010402c #define NV_LCE_FOR_PCE_GV100(i) (0x00104040+(i)*4) #define NV_LCE_FOR_PCE_GA100(i) (0x00104100+(i)*4) // Struct for use with nvdebug_reg_range_read() @@ -717,13 +720,14 @@ union reg_range { CE_PCE_MAP : A bitmask, where a set bit indicates that the PCE for that index is enabled (not floorswept) on this GPU. Count the number of set - bits to get the number of PCEs. + bits to get the number of PCEs. Note that this may be bogus if + the GPU has not been used since reset. - Support: Kepler through (at least) Blackwell + Support: Pascal through (at least) Blackwell Also see dev_ce.ref.txt of NVIDIA's open-gpu-doc for info. */ #define NV_CE_PCE_MAP 0x00104028 -#define MAP_SIZE 32 +#define NV_CE_PCE_MAP_SIZE 32 /* Location of the 1Kb instance block with page tables for BAR1 and BAR2. 
diff --git a/nvdebug_entry.c b/nvdebug_entry.c index 78860e6..ed82e58 100644 --- a/nvdebug_entry.c +++ b/nvdebug_entry.c @@ -1,4 +1,4 @@ -/* Copyright 2021 Joshua Bakita +/* Copyright 2024 Joshua Bakita * SPDX-License-Identifier: MIT */ @@ -27,6 +27,7 @@ extern struct file_operations disable_channel_file_ops; extern struct file_operations enable_channel_file_ops; extern struct file_operations switch_to_tsg_file_ops; extern struct file_operations device_info_file_ops; +extern struct file_operations copy_topology_file_ops; extern struct file_operations nvdebug_read_reg32_file_ops; extern struct file_operations nvdebug_read_reg_range_file_ops; @@ -204,7 +205,7 @@ int create_tpc_mask_files(int device_id, struct proc_dir_entry *dir) { int __init nvdebug_init(void) { struct proc_dir_entry *dir, *preempt_entry, *disable_channel_entry, *enable_channel_entry, *switch_to_tsg_entry, *device_info_entry, - *num_gpcs_entry, *lce_for_pce_entry, *grce_for_pce_entry; + *num_gpcs_entry; int rl_create_err, tpc_masks_create_err; // Check that an NVIDIA GPU is present and initialize g_nvdebug_state int res = probe_and_cache_device(); @@ -260,97 +261,18 @@ int __init nvdebug_init(void) { num_gpcs_entry = proc_create_data( "gpc_mask", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), (void*)NV_FUSE_GPC); - // In both nouveau and nvgpu, the PCE_MAP register is available on Pascal+ - if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL){ - // Used for reading a subset of a register on pascal - union reg_range pascal_reg; - // Create a pce mask for iteration - u32 ce_pce_map = nvdebug_readl(&g_nvdebug_state[device_id], NV_CE_PCE_MAP); - char file_name[21]; - int pce_id = 0; - int pce_num = 0; - int i; - for (pce_id = 0; pce_id < MAP_SIZE; pce_id++) { - // If pce is enabled, create files and iterate pce_id; otherwise, do nothing - if ((1 << pce_id) & ce_pce_map) { - snprintf(file_name, 20, "lce_for_pce%d", pce_num); - // Depending on GPU architecture, fetch data for the LCE of particular 
PCE - switch (g_nvdebug_state[res].chip_id & 0xff0) { - case NV_CHIP_ID_PASCAL: - // On Pascal, two PCE configurations are packed per-byte. - // Work around this by leveraging that we only run on 64-bit - // platforms (can assume that a void* is 64-bits), and that - // GPU register offsets are only 32-bits. Use the other 32 - // bits to store which bits to print. - pascal_reg.offset = NV_LCE_FOR_PCE_GP100(0); - pascal_reg.start_bit = pce_id * 4; - pascal_reg.stop_bit = pce_id * 4 + 4; - lce_for_pce_entry = proc_create_data( - file_name, 0444, dir, compat_ops(&nvdebug_read_reg_range_file_ops), - (void*)pascal_reg.raw); - break; - case NV_CHIP_ID_VOLTA: - case NV_CHIP_ID_VOLTA_INTEGRATED: - case NV_CHIP_ID_TURING: - lce_for_pce_entry = proc_create_data( - file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), - (void*)(uintptr_t)NV_LCE_FOR_PCE_GV100(pce_id)); - break; - case NV_CHIP_ID_AMPERE: - case NV_CHIP_ID_HOPPER: - case NV_CHIP_ID_ADA: - lce_for_pce_entry = proc_create_data( - file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), - (void*)(uintptr_t)NV_LCE_FOR_PCE_GA100(pce_id)); - break; - } - if (!lce_for_pce_entry) - return -ENOMEM; - pce_num++; - } - } - // We assume 2 GRCEs (reminder: GRCE0 and 1 are just LCE0 and 1) - for (i = 0; i < 2; i++) { - union reg_range grce_reg = {0}; - snprintf(file_name, 21, "shared_lce_for_grce%d", i); - // The offset used here is only documented for Turing - // Actually, Pascal through Turing - // On Pascal, it's only 3 bits, every 8 bits - // On Volta-Turing, it start at same offset, but it's lower 4 bits, every 32 bits - // On Ampere+ it starts at 0x001041c0, but is the same layout as Volta-Turing - switch (g_nvdebug_state[res].chip_id & 0xff0) { - case NV_CHIP_ID_PASCAL: - grce_reg.offset = NV_GRCE_FOR_CE_GP100(0); - grce_reg.start_bit = i * 8; - grce_reg.stop_bit = grce_reg.start_bit + 3; - break; - case NV_CHIP_ID_VOLTA: - case NV_CHIP_ID_VOLTA_INTEGRATED: - case NV_CHIP_ID_TURING: - grce_reg.offset = 
NV_GRCE_FOR_CE_GP100(i); - grce_reg.start_bit = 0; - grce_reg.stop_bit = grce_reg.start_bit + 4; - break; - case NV_CHIP_ID_AMPERE: - case NV_CHIP_ID_HOPPER: - case NV_CHIP_ID_ADA: - grce_reg.offset = NV_GRCE_FOR_CE_GA100(i); - grce_reg.start_bit = 0; - grce_reg.stop_bit = grce_reg.start_bit + 4; - break; - } - grce_for_pce_entry = proc_create_data( - file_name, 0444, dir, compat_ops(&nvdebug_read_reg_range_file_ops), - (void*)grce_reg.raw); - if (!grce_for_pce_entry) - return -ENOMEM; - } - - // TODO: Redo to num_pces + // Create files exposing LCE and PCE configuration (Pascal+) + if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL) { + // Create file `/proc/gpu#/copy_topology`, world readable + if (!proc_create_data( + "copy_topology", 0444, dir, compat_ops(&copy_topology_file_ops), + (void*)0)) + goto out_nomem; // Create file `/proc/gpu#/pce_map`, world readable - num_gpcs_entry = proc_create_data( - "pce_map", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), - (void*)NV_CE_PCE_MAP); + if (!proc_create_data( + "pce_map", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops), + (void*)NV_CE_PCE_MAP)) + goto out_nomem; } // ProcFS entry creation only fails if out of memory if (rl_create_err || tpc_masks_create_err || !preempt_entry || -- cgit v1.2.2