From ac60151ea0a4a1f3882fde3c486af870029b7977 Mon Sep 17 00:00:00 2001
From: Joshua Bakita <bakitajoshua@gmail.com>
Date: Mon, 8 Apr 2024 13:33:28 -0400
Subject: Rework LCE<->PCE and GRCE->LCE configuration printing API

Rather than up to dozens of individual files exposing part of each
copy engine's configuration, have one file which exposes a unified
view of the full topology. Example new output on RTX 2080 Ti:

$ cat /proc/gpu0/copy_topology
GRCE0 -> LCE04
GRCE1 -> LCE03
LCE02 -> PCE02
LCE03 -> PCE03
LCE04 -> PCE01

Old output:
$ tail -n 1 /proc/gpu0/lce_for_pce*
==> /proc/gpu0/lce_for_pce0 <==
0xf
==> /proc/gpu0/lce_for_pce1 <==
0x4
==> /proc/gpu0/lce_for_pce2 <==
0x2
==> /proc/gpu0/lce_for_pce3 <==
0x3

$ tail -n 1 /proc/gpu0/shared_lce_for_grce*
==> /proc/gpu0/shared_lce_for_grce0 <==
0x4
==> /proc/gpu0/shared_lce_for_grce1 <==
0x3

Specifically:
- Add `copy_topology` API
- Remove `shared_lce_for_grce#` and `lce_for_pce#` APIs
- Move logic from `nvdebug_entry.c` to `copy_topology_procfs.c`
- Do not print PCE or Shared LCE configuration if flagged absent
- Refer to LCE0 and LCE1 as GRCE0 and GRCE1
- Print by LCE ID, which is more helpful when attempting to trace
  how a given copy runlist maps to a physical copy engine.
- Document two errata with CE registers

Tested working on Pascal Integrated, Pascal, Volta Integrated,
Volta, Turing, and Ampere Integrated on Linux 4.9 through 5.10.
---
 Makefile               |   3 +-
 copy_topology_procfs.c | 206 +++++++++++++++++++++++++++++++++++++++++++++++++
 nvdebug.h              |  18 +++--
 nvdebug_entry.c        | 106 ++++---------------------
 4 files changed, 233 insertions(+), 100 deletions(-)
 create mode 100644 copy_topology_procfs.c

diff --git a/Makefile b/Makefile
index 2dc90c7..2e588fa 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,6 @@
 obj-m += nvdebug.o
-nvdebug-objs = runlist_procfs.o device_info_procfs.o runlist.o mmu.o nvdebug_entry.o
+nvdebug-objs = runlist_procfs.o device_info_procfs.o runlist.o mmu.o \
+               nvdebug_entry.o copy_topology_procfs.o
 KBUILD_CFLAGS += -DGIT_HASH=\"$(shell git --git-dir=$(PWD)/.git rev-parse --short HEAD)\"
 # -mfentry above if not building due to mcount missing
 
diff --git a/copy_topology_procfs.c b/copy_topology_procfs.c
new file mode 100644
index 0000000..cfedce7
--- /dev/null
+++ b/copy_topology_procfs.c
@@ -0,0 +1,206 @@
+// Copyright 2024 Joshua Bakita
+
+#include "nvdebug.h"
+
+// Maximum number of LCEs that we will print
+#define MAX_LCES 32
+
+/* Which Logical Copy Engine (LCE) maps to a given Physical Copy Engine (PCE)?
+  @param pce_id PCE index (on GPU g)
+  @return LCE index if mapping, -ENODEV on no mapping, and -errno otherwise
+*/
+int get_lce_for_pce(struct nvdebug_state *g, uint8_t pce_id) {
+	int res;
+	// LCEs only exist on Pascal+
+	if (g->chip_id < NV_CHIP_ID_PASCAL)
+		return -EOPNOTSUPP;
+
+	if (g->chip_id < NV_CHIP_ID_VOLTA) {
+		uint32_t config = nvdebug_readl(g, NV_LCE_FOR_PCE_GP100);
+		if (config == -1)
+			return -EIO;
+		// On Pascal, two PCE configurations are packed per-byte, so the register
+		// only describes PCE0-7. Beyond that is unconfigured (and a shift >= 32 is UB).
+		res = pce_id < 8 ? (config >> (pce_id * 4)) & 0xf : 0x7;
+		// 0x7 is the flag value for unconfigured on Pascal
+		if (res == 0x7)
+			return -ENODEV;
+	} else if (g->chip_id < NV_CHIP_ID_AMPERE) {
+		res = nvdebug_readl(g, NV_LCE_FOR_PCE_GV100(pce_id));
+		// Titan V (GV100): bogus 0xbadf3000 observed if GPU unused since reset
+		if (res == -1 || res == 0xbadf3000)
+			return -EIO;
+	} else {
+		// Works through at least Ada
+		res = nvdebug_readl(g, NV_LCE_FOR_PCE_GA100(pce_id));
+		if (res == -1)
+			return -EIO;
+	}
+	// At least on Volta through Ampere, 0xf is a flag value for unconfigured.
+	if (res == 0xf)
+		return -ENODEV;
+	return res;
+}
+
+/* Which LCE does this GRaphics Copy Engine (GRCE) map to?
+  @param grce_id GRCE index
+  @return LCE index if mapping, -ENODEV on no mapping, and -errno otherwise
+*/
+int get_shared_lce_for_grce(struct nvdebug_state *g, uint8_t grce_id) {
+	uint32_t raw;
+	int mapping;
+	// LCEs only exist on Pascal+
+	if (g->chip_id < NV_CHIP_ID_PASCAL)
+		return -EOPNOTSUPP;
+
+	if (g->chip_id < NV_CHIP_ID_VOLTA) {
+		// Pascal: a single register holds one config per byte
+		uint32_t byte;
+		raw = nvdebug_readl(g, NV_GRCE_FOR_CE_GP100(0));
+		if (raw == -1)
+			return -EIO;
+		byte = raw >> (grce_id * 8);
+		// Mask 0x8 (the fourth bit of the byte) flags if shared
+		if (!(byte & 0x8))
+			return -ENODEV;
+		// lower 3 bits contain the mapping
+		return byte & 0x7;
+	}
+	// Volta+: one register per GRCE; Ampere+ base works through at least Ada
+	if (g->chip_id < NV_CHIP_ID_AMPERE)
+		raw = nvdebug_readl(g, NV_GRCE_FOR_CE_GP100(grce_id));
+	else
+		raw = nvdebug_readl(g, NV_GRCE_FOR_CE_GA100(grce_id));
+	if (raw == -1)
+		return -EIO;
+	// Only the lower 4 bits contain the mapping; all-ones flags no mapping
+	mapping = raw & 0xf;
+	if (mapping == 0xf)
+		return -ENODEV;
+	return mapping;
+}
+
+typedef struct { // One row of the LCE mapping table, indexed by LCE ID
+	enum {INVALID_CE, SHARED_LCE, PCE} type; // What `ce` indexes; INVALID_CE if unmapped
+	uint8_t ce; // Index of the PCE (type PCE) or of the shared LCE (type SHARED_LCE)
+} lce2pce_entry_t;
+
+/* Which PCE/LCE is each LCE mapped to?
+  @param lce2pce     Array of lce2pce_entry_t to store mappings in
+  @param lce2pce_len Number of array entries; at least 16 recommended
+  @return 0 on success, -ERANGE if an LCE index exceeds the array, else -errno
+*/
+int get_pces_for_lces(struct nvdebug_state *g, lce2pce_entry_t *lce2pce, int lce2pce_len) {
+	uint32_t pce_id, grce_id, ce_pce_map;
+	memset(lce2pce, INVALID_CE, lce2pce_len * sizeof(lce2pce_entry_t)); // INVALID_CE is 0
+
+	if ((ce_pce_map = nvdebug_readl(g, NV_CE_PCE_MAP)) == -1)
+		return -EIO;
+	// Pull configuration for LCEs which directly map to a PCE
+	for (pce_id = 0; pce_id < NV_CE_PCE_MAP_SIZE; pce_id++) {
+		int lce;
+		// Skip reading configuration if PCE is disabled (unsigned 1 so bit 31 is safe)
+		if (((1u << pce_id) & ce_pce_map) == 0)
+			continue;
+		lce = get_lce_for_pce(g, pce_id);
+		if (lce == -ENODEV)
+			continue;
+		if (lce < 0)
+			return lce;
+		if (lce >= lce2pce_len) // Valid indices are 0..lce2pce_len-1
+			return -ERANGE;
+		lce2pce[lce].type = PCE;
+		lce2pce[lce].ce = pce_id;
+	}
+	// Pull configuration for LCEs which share a PCE with another LCE
+	// GRCE0 is synonymous with LCE0 (GRCE1 and LCE1 likewise)
+	// Only aware of up to two GRCEs per GPU (and never index past a short array)
+	for (grce_id = 0; grce_id < NV_GRCE_MAX && (int)grce_id < lce2pce_len; grce_id++) {
+		int shared_lce;
+		// GRCEs with a PCE already associated do not share with an LCE
+		if (lce2pce[grce_id].type != INVALID_CE)
+			continue;
+		shared_lce = get_shared_lce_for_grce(g, grce_id);
+		// Each GRCE should be associated with a PCE or shared LCE
+		if (shared_lce == -ENODEV) {
+			printk(KERN_WARNING "[nvdebug] GRCE%u unconfigured.\n", grce_id);
+			continue;
+		}
+		if (shared_lce < 0)
+			return shared_lce;
+		lce2pce[grce_id].type = SHARED_LCE;
+		lce2pce[grce_id].ce = shared_lce;
+	}
+	return 0;
+}
+
+typedef struct { // Iterator state passed between the copy_topology seq_file callbacks
+	int idx; // Index of LCE to print
+	lce2pce_entry_t lce2pce[MAX_LCES]; // MAX_LCES-length table from get_pces_for_lces()
+} copy_topology_iter_t;
+
+// The *_seq_* functions in this file follow the patterns in
+// device_info_procfs.c. See there for comments on implementation.
+static void *copy_topology_file_seq_start(struct seq_file *s, loff_t *pos) {
+	struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(s->file)];
+	static copy_topology_iter_t iter; // NOTE: shared by every reader of this file
+	int err;
+	if (*pos < 0 || *pos >= MAX_LCES) // Past the last LCE; skip the register reads
+		return NULL;
+	// Resync with this reader's position; another reader may have moved idx
+	iter.idx = *pos;
+	if ((err = get_pces_for_lces(g, iter.lce2pce, MAX_LCES)) < 0)
+		return ERR_PTR(err);
+	return &iter;
+}
+
+static void* copy_topology_file_seq_next(struct seq_file *s, void *iter_raw,
+                                         loff_t *pos) {
+	copy_topology_iter_t *cursor = iter_raw;
+	*pos += 1; // Required by seq interface
+	cursor->idx += 1;
+	// NULL ends the iteration once every table entry has been visited
+	return cursor->idx < MAX_LCES ? cursor : NULL;
+}
+
+static int copy_topology_file_seq_show(struct seq_file *s, void *iter_raw) {
+	const copy_topology_iter_t *cursor = iter_raw;
+	const lce2pce_entry_t *map = &cursor->lce2pce[cursor->idx];
+	if (map->type == INVALID_CE) // Unmapped LCEs print nothing
+		return 0;
+	// Left-hand side: the LCE being described (the first NV_GRCE_MAX are GRCEs)
+	if (cursor->idx < NV_GRCE_MAX)
+		seq_printf(s, "GRCE%d -> ", cursor->idx);
+	else
+		seq_printf(s, "LCE%02d -> ", cursor->idx);
+	// Right-hand side: the PCE, or the LCE/GRCE whose engine is shared
+	if (map->type == PCE)
+		seq_printf(s, "PCE%02d\n", map->ce);
+	else if (map->ce < NV_GRCE_MAX) // Shared GRCE
+		seq_printf(s, "GRCE%d\n", map->ce);
+	else // Shared LCE
+		seq_printf(s, "LCE%02d\n", map->ce);
+	return 0;
+}
+
+static void copy_topology_file_seq_stop(struct seq_file *s, void *lce2pce) {
+	// No cleanup needed: the iterator is a static in seq_start, nothing was allocated
+}
+
+static const struct seq_operations copy_topology_file_seq_ops = {
+	.start = copy_topology_file_seq_start, // Builds the mapping table
+	.next = copy_topology_file_seq_next, // Advances to the next LCE index
+	.show = copy_topology_file_seq_show, // Prints one "X -> Y" line, if mapped
+	.stop = copy_topology_file_seq_stop, // No-op
+};
+
+static int copy_topology_file_open(struct inode *inode, struct file *f) {
+	return seq_open(f, &copy_topology_file_seq_ops); // Bind the seq callbacks to f
+}
+
+struct file_operations copy_topology_file_ops = { // Referenced via extern in nvdebug_entry.c
+	.open = copy_topology_file_open,
+	.read = seq_read, // Reading, seeking, and release use the stock seq_file helpers
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
diff --git a/nvdebug.h b/nvdebug.h
index a9366e0..ac254f0 100644
--- a/nvdebug.h
+++ b/nvdebug.h
@@ -1,4 +1,4 @@
-/* Copyright 2021 Joshua Bakita
+/* Copyright 2024 Joshua Bakita
  * SPDX-License-Identifier: MIT
  *
  * File outline:
@@ -688,17 +688,20 @@ typedef union {
 
   SCAL_NUM_CES : Number of externally accessible copy engines
 
+  Errata: Incorrectly reports "3" on Jetson TX1 and TX2. Should report "1" to be
+  consistent with PTOP data.
+
   Support: Kepler through (at least) Blackwell
   Also see dev_ce.ref.txt of NVIDIA's open-gpu-doc for info.
 */
 #define NV_PTOP_SCAL_NUM_CES 0x00022444
-// Defined number of GRCEs for a GPU
-# define NV_GRCE_NUM 2
+// Defined max number of GRCEs for a GPU (TX2 has only one)
+# define NV_GRCE_MAX 2
 // Defined GRCE->CE mapping offsets from nvgpu
 #define NV_GRCE_FOR_CE_GP100(i) (0x00104034+(i)*4)
 #define NV_GRCE_FOR_CE_GA100(i) (0x001041c0+(i)*4)
 // Defined LCE->PCE mapping offset from nvgpu (same as ce_pce2lce_config_r(i) in nvgpu)
-#define NV_LCE_FOR_PCE_GP100(i) (0x0010402c+(i)/2)
+#define NV_LCE_FOR_PCE_GP100 0x0010402c
 #define NV_LCE_FOR_PCE_GV100(i) (0x00104040+(i)*4)
 #define NV_LCE_FOR_PCE_GA100(i) (0x00104100+(i)*4)
 // Struct for use with nvdebug_reg_range_read()
@@ -717,13 +720,14 @@ union reg_range {
 
   CE_PCE_MAP : A bitmask, where a set bit indicates that the PCE for that index
                is enabled (not floorswept) on this GPU. Count the number of set
-               bits to get the number of PCEs.
+               bits to get the number of PCEs. Note that this may be bogus if
+               the GPU has not been used since reset.
 
-  Support: Kepler through (at least) Blackwell
+  Support: Pascal through (at least) Blackwell
   Also see dev_ce.ref.txt of NVIDIA's open-gpu-doc for info.
 */
 #define NV_CE_PCE_MAP 0x00104028
-#define MAP_SIZE 32
+#define NV_CE_PCE_MAP_SIZE 32
 
 
 /* Location of the 1Kb instance block with page tables for BAR1 and BAR2.
diff --git a/nvdebug_entry.c b/nvdebug_entry.c
index 78860e6..ed82e58 100644
--- a/nvdebug_entry.c
+++ b/nvdebug_entry.c
@@ -1,4 +1,4 @@
-/* Copyright 2021 Joshua Bakita
+/* Copyright 2024 Joshua Bakita
  * SPDX-License-Identifier: MIT
  */
 
@@ -27,6 +27,7 @@ extern struct file_operations disable_channel_file_ops;
 extern struct file_operations enable_channel_file_ops;
 extern struct file_operations switch_to_tsg_file_ops;
 extern struct file_operations device_info_file_ops;
+extern struct file_operations copy_topology_file_ops;
 extern struct file_operations nvdebug_read_reg32_file_ops;
 extern struct file_operations nvdebug_read_reg_range_file_ops;
 
@@ -204,7 +205,7 @@ int create_tpc_mask_files(int device_id, struct proc_dir_entry *dir) {
 int __init nvdebug_init(void) {
 	struct proc_dir_entry *dir, *preempt_entry, *disable_channel_entry,
 			      *enable_channel_entry, *switch_to_tsg_entry, *device_info_entry,
-			      *num_gpcs_entry, *lce_for_pce_entry, *grce_for_pce_entry;
+			      *num_gpcs_entry;
 	int rl_create_err, tpc_masks_create_err;
 	// Check that an NVIDIA GPU is present and initialize g_nvdebug_state
 	int res = probe_and_cache_device();
@@ -260,97 +261,18 @@ int __init nvdebug_init(void) {
 		num_gpcs_entry = proc_create_data(
 			"gpc_mask", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
 			(void*)NV_FUSE_GPC);
-		// In both nouveau and nvgpu, the PCE_MAP register is  available on Pascal+
-		if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL){
-			// Used for reading a subset of a register on pascal
-			union reg_range pascal_reg;
-			// Create a pce mask for iteration
-			u32 ce_pce_map = nvdebug_readl(&g_nvdebug_state[device_id], NV_CE_PCE_MAP);
-			char file_name[21];
-			int pce_id = 0;
-			int pce_num = 0;
-			int i;
-			for (pce_id = 0; pce_id < MAP_SIZE; pce_id++) {
-				// If pce is enabled, create files and iterate pce_id; otherwise, do nothing
-				if ((1 << pce_id) & ce_pce_map) {
-					snprintf(file_name, 20, "lce_for_pce%d", pce_num);
-					// Depending on GPU architecture, fetch data for the LCE of particular PCE
-					switch (g_nvdebug_state[res].chip_id & 0xff0) {
-						case NV_CHIP_ID_PASCAL:
-							// On Pascal, two PCE configurations are packed per-byte.
-							// Work around this by leveraging that we only run on 64-bit
-							// platforms (can assume that a void* is 64-bits), and that
-							// GPU register offsets are only 32-bits. Use the other 32
-							// bits to store which bits to print.
-							pascal_reg.offset = NV_LCE_FOR_PCE_GP100(0);
-							pascal_reg.start_bit = pce_id * 4;
-							pascal_reg.stop_bit = pce_id * 4 + 4;
-							lce_for_pce_entry = proc_create_data(
-											file_name, 0444, dir, compat_ops(&nvdebug_read_reg_range_file_ops),
-											(void*)pascal_reg.raw);
-							break;
-						case NV_CHIP_ID_VOLTA:
-						case NV_CHIP_ID_VOLTA_INTEGRATED:
-						case NV_CHIP_ID_TURING:
-							lce_for_pce_entry = proc_create_data(
-											file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
-											(void*)(uintptr_t)NV_LCE_FOR_PCE_GV100(pce_id));
-							break;
-						case NV_CHIP_ID_AMPERE:
-						case NV_CHIP_ID_HOPPER:
-						case NV_CHIP_ID_ADA:
-							 lce_for_pce_entry = proc_create_data(
-											file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
-											(void*)(uintptr_t)NV_LCE_FOR_PCE_GA100(pce_id));
-							break;
-					}
-					if (!lce_for_pce_entry)
-						return -ENOMEM;
-					pce_num++;
-				}
-			}
-			// We assume 2 GRCEs (reminder: GRCE0 and 1 are just LCE0 and 1)
-			for (i = 0; i < 2; i++) {
-				union reg_range grce_reg = {0};
-				snprintf(file_name, 21, "shared_lce_for_grce%d", i);
-				// The offset used here is only documented for Turing
-				// Actually, Pascal through Turing
-				// On Pascal, it's only 3 bits, every 8 bits
-				// On Volta-Turing, it start at same offset, but it's lower 4 bits, every 32 bits
-				// On Ampere+ it starts at 0x001041c0, but is the same layout as Volta-Turing
-				switch (g_nvdebug_state[res].chip_id & 0xff0) {
-					case NV_CHIP_ID_PASCAL:
-						grce_reg.offset = NV_GRCE_FOR_CE_GP100(0);
-						grce_reg.start_bit = i * 8;
-						grce_reg.stop_bit = grce_reg.start_bit + 3;
-						break;
-					case NV_CHIP_ID_VOLTA:
-					case NV_CHIP_ID_VOLTA_INTEGRATED:
-					case NV_CHIP_ID_TURING:
-						grce_reg.offset = NV_GRCE_FOR_CE_GP100(i);
-						grce_reg.start_bit = 0;
-						grce_reg.stop_bit = grce_reg.start_bit + 4;
-						break;
-					case NV_CHIP_ID_AMPERE:
-					case NV_CHIP_ID_HOPPER:
-					case NV_CHIP_ID_ADA:
-						grce_reg.offset = NV_GRCE_FOR_CE_GA100(i);
-						grce_reg.start_bit = 0;
-						grce_reg.stop_bit = grce_reg.start_bit + 4;
-						break;
-				}
-				grce_for_pce_entry = proc_create_data(
-								file_name, 0444, dir, compat_ops(&nvdebug_read_reg_range_file_ops),
-								(void*)grce_reg.raw);
-				if (!grce_for_pce_entry)
-					return -ENOMEM;
-			}
-
-			// TODO: Redo to num_pces
+		// Create files exposing LCE and PCE configuration (Pascal+)
+		if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL) {
+			// Create file `/proc/gpu#/copy_topology`, world readable
+			if (!proc_create_data(
+					"copy_topology", 0444, dir, compat_ops(&copy_topology_file_ops),
+					(void*)0))
+				goto out_nomem;
 			// Create file `/proc/gpu#/pce_map`, world readable
-			num_gpcs_entry = proc_create_data(
-				"pce_map", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
-				(void*)NV_CE_PCE_MAP);
+			if (!proc_create_data(
+					"pce_map", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
+					(void*)NV_CE_PCE_MAP))
+				goto out_nomem;
 		}
 		// ProcFS entry creation only fails if out of memory
 		if (rl_create_err || tpc_masks_create_err || !preempt_entry ||
-- 
cgit v1.2.2