From 4768fe31f114c5ad788012db5518ce8e37f79c7a Mon Sep 17 00:00:00 2001
From: Joshua Bakita <jbakita@cs.unc.edu>
Date: Tue, 9 Apr 2024 13:07:19 -0400
Subject: Correctly handle startup errors and fix gpc*_mask APIs

- Do not create gpc*_mask files on pre-Maxwell GPUs (tested and found
  unavailable on the K5000s)
- Use correct register offsets for gpc*_mask files on Ampere+ GPUs
- Document GPC and TPC count and fuse registers.
- Correctly handle errors for creation of all ProcFS files
- Remove unnecessary error-handling temp variables in nvdebug_entry
- Misc naming, comment, and layout cleanup
---
 nvdebug_entry.c | 150 ++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 92 insertions(+), 58 deletions(-)

(limited to 'nvdebug_entry.c')

diff --git a/nvdebug_entry.c b/nvdebug_entry.c
index 7593a3a..0cf5344 100644
--- a/nvdebug_entry.c
+++ b/nvdebug_entry.c
@@ -12,7 +12,8 @@
 #include "nvdebug.h"
 #include "stubs.h"
 
-// Enable to intercept and log GPU interrupts
+// Enable to intercept and log GPU interrupts. Historically used to benchmark
+// interrupt latency.
 #define INTERRUPT_DEBUG 0
 
 // MIT is GPL-compatible. We need to be GPL-compatible for symbols like
@@ -31,14 +32,16 @@ extern struct file_operations copy_topology_file_ops;
 extern struct file_operations nvdebug_read_reg32_file_ops;
 extern struct file_operations nvdebug_read_reg_range_file_ops;
 
-// Bus types are global symbols in the kernel
-extern struct bus_type platform_bus_type;
 struct nvdebug_state g_nvdebug_state[NVDEBUG_MAX_DEVICES];
 unsigned int g_nvdebug_devices = 0;
+// Bus types are global symbols in the kernel
+extern struct bus_type platform_bus_type;
 
-// Starting in Kernel 5.6, proc_ops is required instead of file_operations
+// Starting in Kernel 5.6, proc_ops is required instead of file_operations.
+// As file_operations is larger than proc_ops, we can overwrite the memory
+// backing the file_operations struct to follow the proc_ops layout, and then
+// cast on newer kernels.
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(5,6,0)
-// This rewrites the struct to the proc_ops layout on newer kernels
 const struct proc_ops* compat_ops(const struct file_operations* ops) {
 	struct proc_ops new_ops = {};
 	new_ops.proc_open = ops->open;
@@ -64,7 +67,7 @@ irqreturn_t nvdebug_irq_tap(int irq_num, void * dev) {
 
 // Find any and all NVIDIA GPUs in the system
 // Note: This function fails if any of them are in a bad state
-int probe_and_cache_device(void) {
+int probe_and_cache_devices(void) {
 	// platform bus (SoC) iterators
 	struct device *dev = NULL;
 	struct device *temp_dev;
@@ -143,13 +146,14 @@ int probe_and_cache_device(void) {
 #endif // INTERRUPT_DEBUG
 		i++;
 	}
-	// Return the number of devices we found
+	// Return the number of devices found
 	if (i > 0)
 		return i;
 	return -ENODEV;
 }
 
 // Create files `/proc/gpu#/runlist#`, world readable
+// Support: Fermi, Maxwell, Pascal, Volta, Turing
 int create_runlist_files(int device_id, struct proc_dir_entry *dir) {
 	ptop_device_info_gk104_t info;
 	struct proc_dir_entry *rl_entry;
@@ -179,16 +183,24 @@ int create_runlist_files(int device_id, struct proc_dir_entry *dir) {
 	return 0;
 }
 
-// Create files /proc/gpu#
-// TODO: Don't run this on unsupported GPUs
+// Create files `/proc/gpu#/gpc#_tpc_mask`, world readable
+// Support: Maxwell+
 int create_tpc_mask_files(int device_id, struct proc_dir_entry *dir) {
+	struct nvdebug_state* g = &g_nvdebug_state[device_id];
 	char file_name[20];
 	int i;
 	struct proc_dir_entry *gpc_tpc_mask_entry;
-	// Get a bitmask of which GPCs are disabled
-	uint32_t gpcs_mask = nvdebug_readl(&g_nvdebug_state[device_id], NV_FUSE_GPC);
 	// Get maximum number of enabled GPCs for this chip
 	uint32_t max_gpcs = nvdebug_readl(&g_nvdebug_state[device_id], NV_PTOP_SCAL_NUM_GPCS);
+	// Get a bitmask of which GPCs are disabled
+	uint32_t gpcs_mask;
+	if (g->chip_id < NV_CHIP_ID_AMPERE)
+		gpcs_mask = nvdebug_readl(g, NV_FUSE_GPC_GM107);
+	else
+		gpcs_mask = nvdebug_readl(g, NV_FUSE_GPC_GA100);
+	// Verify the reads succeeded
+	if (max_gpcs == -1 || gpcs_mask == -1)
+		return -EIO;
 	// For each enabled GPC, expose a mask of disabled TPCs
 	for (i = 0; i < max_gpcs; i++) {
 		// Do nothing if GPC is disabled
@@ -196,9 +208,14 @@ int create_tpc_mask_files(int device_id, struct proc_dir_entry *dir) {
 			continue;
 		// If GPC is enabled, create an entry to read disabled TPCs mask
 		snprintf(file_name, 20, "gpc%d_tpc_mask", i);
-		gpc_tpc_mask_entry = proc_create_data(
-			file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
-			(void*)(uintptr_t)NV_FUSE_TPC_FOR_GPC(i));
+		if (g->chip_id < NV_CHIP_ID_AMPERE)
+			gpc_tpc_mask_entry = proc_create_data(
+				file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
+				(void*)(uintptr_t)NV_FUSE_TPC_FOR_GPC_GM107(i));
+		else
+			gpc_tpc_mask_entry = proc_create_data(
+				file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
+				(void*)(uintptr_t)NV_FUSE_TPC_FOR_GPC_GA100(i));
 		if (!gpc_tpc_mask_entry)
 			return -ENOMEM;
 	}
@@ -206,64 +223,84 @@ int create_tpc_mask_files(int device_id, struct proc_dir_entry *dir) {
 }
 
 int __init nvdebug_init(void) {
-	struct proc_dir_entry *dir, *preempt_entry, *disable_channel_entry,
-			      *enable_channel_entry, *switch_to_tsg_entry, *device_info_entry,
-			      *num_gpcs_entry;
-	int rl_create_err, tpc_masks_create_err;
+	struct proc_dir_entry *dir;
+	int err, res;
 	// Check that an NVIDIA GPU is present and initialize g_nvdebug_state
-	int res = probe_and_cache_device();
-	if (res < 0)
+	if ((res = probe_and_cache_devices()) < 0)
 		return res;
 	g_nvdebug_devices = res;
 	// Create seperate ProcFS directories for each gpu
 	while (res--) {
 		char device_id_str[7];
-		uintptr_t device_id = res;  // This is uintptr as we abuse the *data field on proc_dir_entry to store the GPU id
+		// Create a wider copy of the GPU ID to allow us to abuse the *data
+		// field of proc_dir_entry to store the GPU ID.
+		uintptr_t device_id = res;
 		// Create directory /proc/gpu# where # is the GPU number
+		// ProcFS entry creation only fails when out of memory, so any failure
+		// to create a ProcFS file jumps straight to the out-of-memory handler.
 		snprintf(device_id_str, 7, "gpu%ld", device_id);
 		if (!(dir = proc_mkdir_data(device_id_str, 0555, NULL, (void*)device_id)))
 			goto out_nomem;
 		// Create files `/proc/gpu#/runlist#`, world readable
 		if (g_nvdebug_state[device_id].chip_id < NV_CHIP_ID_AMPERE)
-			create_runlist_files(device_id, dir);
-		// Create files `/proc/gpu#/gpc#_tpc_mask`, world readable
-		tpc_masks_create_err = create_tpc_mask_files(device_id, dir);
+			if ((err = create_runlist_files(device_id, dir)))
+				goto out_err;
 		// Create file `/proc/gpu#/preempt_tsg`, world writable
-		preempt_entry = proc_create_data(
-			"preempt_tsg", 0222, dir, compat_ops(&preempt_tsg_file_ops),
-			(void*)device_id);
+		if (!proc_create_data(
+				"preempt_tsg", 0222, dir, compat_ops(&preempt_tsg_file_ops),
+				(void*)device_id))
+			goto out_nomem;
 		// Create file `/proc/gpu#/disable_channel`, world writable
-		disable_channel_entry = proc_create_data(
-			"disable_channel", 0222, dir, compat_ops(&disable_channel_file_ops),
-			(void*)device_id);
+		if (!proc_create_data(
+				"disable_channel", 0222, dir, compat_ops(&disable_channel_file_ops),
+				(void*)device_id))
+			goto out_nomem;
 		// Create file `/proc/gpu#/enable_channel`, world writable
-		enable_channel_entry = proc_create_data(
-			"enable_channel", 0222, dir, compat_ops(&enable_channel_file_ops),
-			(void*)device_id);
+		if (!proc_create_data(
+				"enable_channel", 0222, dir, compat_ops(&enable_channel_file_ops),
+				(void*)device_id))
+			goto out_nomem;
 		// Create file `/proc/gpu#/switch_to_tsg`, world writable
-		switch_to_tsg_entry = proc_create_data(
-			"switch_to_tsg", 0222, dir, compat_ops(&switch_to_tsg_file_ops),
-			(void*)device_id);
+		if (!proc_create_data(
+				"switch_to_tsg", 0222, dir, compat_ops(&switch_to_tsg_file_ops),
+				(void*)device_id))
+			goto out_nomem;
 		// Create file `/proc/gpu#/device_info`, world readable
-		device_info_entry = proc_create_data(
-			"device_info", 0444, dir, compat_ops(&device_info_file_ops),
-			(void*)device_id);
+		if (!proc_create_data(
+				"device_info", 0444, dir, compat_ops(&device_info_file_ops),
+				(void*)device_id))
+			goto out_nomem;
 		// Create file `/proc/gpu#/num_gpcs`, world readable
-		num_gpcs_entry = proc_create_data(
-			"num_gpcs", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
-			(void*)NV_PTOP_SCAL_NUM_GPCS);
+		if (!proc_create_data(
+				"num_gpcs", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
+				(void*)NV_PTOP_SCAL_NUM_GPCS))
+			goto out_nomem;
 		// Create file `/proc/gpu#/num_tpc_per_gpc`, world readable
-		num_gpcs_entry = proc_create_data(
-			"num_tpc_per_gpc", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
-			(void*)NV_PTOP_SCAL_NUM_TPC_PER_GPC);
-		// Create file `/proc/gpu#/num_ces`, world readable
-		num_gpcs_entry = proc_create_data(
-			"num_ces", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
-			(void*)NV_PTOP_SCAL_NUM_CES);
+		if (!proc_create_data(
+				"num_tpc_per_gpc", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
+				(void*)NV_PTOP_SCAL_NUM_TPC_PER_GPC))
+			goto out_nomem;
 		// Create file `/proc/gpu#/num_ces`, world readable
-		num_gpcs_entry = proc_create_data(
-			"gpc_mask", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
-			(void*)NV_FUSE_GPC);
+		if (!proc_create_data(
+				"num_ces", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
+				(void*)NV_PTOP_SCAL_NUM_CES))
+			goto out_nomem;
+		// Create files `/proc/gpu#/gpc#_tpc_mask`, world readable (Maxwell+)
+		if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_MAXWELL)
+			if ((err = create_tpc_mask_files(device_id, dir)))
+				goto out_err;
+		// Create file `/proc/gpu#/gpc_mask`, world readable (Maxwell+)
+		if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_AMPERE) {
+			if (!proc_create_data(
+					"gpc_mask", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
+					(void*)NV_FUSE_GPC_GA100))
+				goto out_nomem;
+		} else if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_MAXWELL) {
+			if (!proc_create_data(
+					"gpc_mask", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
+					(void*)NV_FUSE_GPC_GM107))
+				goto out_nomem;
+		}
 		// Create files exposing LCE and PCE configuration (Pascal+)
 		if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL) {
 			// Create file `/proc/gpu#/copy_topology`, world readable
@@ -277,16 +314,13 @@ int __init nvdebug_init(void) {
 					(void*)NV_CE_PCE_MAP))
 				goto out_nomem;
 		}
-		// ProcFS entry creation only fails if out of memory
-		if (rl_create_err || tpc_masks_create_err || !preempt_entry ||
-		    !disable_channel_entry || !enable_channel_entry ||
-		    !switch_to_tsg_entry || !device_info_entry || !num_gpcs_entry)
-			goto out_nomem;
 	}
 	// (See Makefile if you want to know the origin of GIT_HASH.)
 	printk(KERN_INFO "[nvdebug] Module version "GIT_HASH" initialized\n");
 	return 0;
 out_nomem:
+	err = -ENOMEM;
+out_err:
 	// Make sure to clear all ProcFS directories on error
 	while (res < g_nvdebug_devices) {
 		char device_id_str[7];
@@ -294,7 +328,7 @@ out_nomem:
 		remove_proc_subtree(device_id_str, NULL);
 		res++;
 	}
-	return -ENOMEM;
+	return err;
 }
 
 static void __exit nvdebug_exit(void) {
-- 
cgit v1.2.2