From 4768fe31f114c5ad788012db5518ce8e37f79c7a Mon Sep 17 00:00:00 2001
From: Joshua Bakita <jbakita@cs.unc.edu>
Date: Tue, 9 Apr 2024 13:07:19 -0400
Subject: Correctly handle startup errors and fix gpc*_mask APIs

- Do not create gpc*_mask files on pre-Maxwell GPUs (tested and found
  unavailable on the K5000s)
- Use correct register offsets for gpc*_mask files on Ampere+ GPUs
- Document GPC and TPC count and fuse registers.
- Correctly handle errors for creation of all ProcFS files
- Remove unnecessary error-handling temp variables in nvdebug_entry
- Misc naming, comment, and layout cleanup
---
 nvdebug.h | 46 +++++++++++++++++++++++++++++++++-------------
 1 file changed, 33 insertions(+), 13 deletions(-)

(limited to 'nvdebug.h')

diff --git a/nvdebug.h b/nvdebug.h
index f0662cd..39b2e6e 100644
--- a/nvdebug.h
+++ b/nvdebug.h
@@ -436,7 +436,9 @@ typedef union {
 #define NV_MC_BOOT_0 0x00000000
 #define NV_CHIP_ID_GP106 0x136 // Discrete GeForce GTX 1060
 #define NV_CHIP_ID_GV11B 0x15B // Jetson Xavier embedded GPU
+
 #define NV_CHIP_ID_KEPLER 0x0E0
+#define NV_CHIP_ID_MAXWELL 0x120
 #define NV_CHIP_ID_PASCAL 0x130
 #define NV_CHIP_ID_VOLTA 0x140
 #define NV_CHIP_ID_VOLTA_INTEGRATED 0x150
@@ -700,29 +702,47 @@ typedef union {
 	uint32_t raw;
 } ptop_device_info_gk104_t;
 
-/* Graphics Processing Cluster (GPC) information
+/* Graphics Processing Cluster (GPC) on-chip information
   The GPU's Compute/Graphics engine is subdivided into Graphics Processing
   Clusters (also known as GPU Processing Clusters, starting with Ampere).
 
   Each GPC is subdivided into Texture Processing Clusters (TPCs) which contain
   Streaming Multiprocessors (SMs).
 
+  The number of these units etched onto the chip may vary from the number
+  enabled and software-visible. These registers expose the number of on-chip
+  GPCs and the number of on-chip TPCs per GPC.
 
+  Support: Fermi through (at least) Blackwell
 */
-// Support: Fermi through Blackwell
-// Get the number of GPCs **on die**
 #define NV_PTOP_SCAL_NUM_GPCS 0x00022430
-// Get the number of TPCs per GPC **on die**
 #define NV_PTOP_SCAL_NUM_TPC_PER_GPC 0x00022434
-// GPC and TPC masks
-// Support: Maxwell, Pascal, Volta, Turing
-// Bitmask of which GPC **are enabled** of the max on die
-#define NV_FUSE_GPC 0x00021c1c
-// Bitmask of which TPCs **are enabled** on each GPC
-#define NV_FUSE_TPC_FOR_GPC(i) (0x00021c38+(i)*4)
-// Support: Ampere, Ada, Hopper, Blackwell
-//#define NV_FUSE_GPC 0x00820c1c
-//#define NV_FUSE_TPC_FOR_GPC(i) (0x00820c38+(i)*4)
+
+/* Graphics Processing Cluster (GPC) enablement information
+  (See above for a description of GPCs and TPCs.)
+
+  The number of on-chip GPCs and TPCs enabled is driven by:
+  1) Manufacturing errors which make some units nonfunctional.
+  2) Commercialization decisions about how many units should be enabled for a
+     specific GPU model.
+
+  Generally, reason (1) drives disablement early in product manufacturing,
+  whereas, as the manufacturing process matures, (2) steps in to ensure
+  consistency between early-manufactured and late-manufactured products.
+
+  On-chip fuses are used to dictate which units are enabled and disabled. These
+  registers expose the fuse configuration for GPCs, and the TPCs in each GPC.
+
+  FUSE_GPC            : Bitmask of which GPCs are enabled
+  FUSE_TPC_FOR_GPC(i) : Bitmask of which TPCs are enabled for GPC i
+
+  Support: Maxwell through Blackwell
+           Note the registers were relocated starting with Ampere.
+*/
+#define NV_FUSE_GPC_GM107 0x00021c1c
+#define NV_FUSE_TPC_FOR_GPC_GM107(i) (0x00021c38+(i)*4)
+#define NV_FUSE_GPC_GA100 0x00820c1c
+#define NV_FUSE_TPC_FOR_GPC_GA100(i) (0x00820c38+(i)*4)
 
 /* Logical Copy Engine (LCE) Information
   Every GPU has some number of copy engines which can process transfers to,
-- 
cgit v1.2.2