Correctly handle startup errors and fix gpc*_mask APIs

- Do not create gpc*_mask files on pre-Maxwell GPUs (tested unavailable on the K5000s)
- Use correct register offsets for gpc*_mask files on Ampere+ GPUs
- Document GPC and TPC count and fuse registers.
- Correctly handle errors for creation of all ProcFS files
- Remove unnecessary error-handling temp variables in nvdebug_entry
- Misc naming, comment, and layout cleanup
author: Joshua Bakita <jbakita@cs.unc.edu> 2024-04-09 13:07:19 -0400
committer: Joshua Bakita <jbakita@cs.unc.edu> 2024-04-09 13:07:19 -0400
commit: 4768fe31f114c5ad788012db5518ce8e37f79c7a (patch)
tree: 03fe90108bf9341b8b9d299df3ba8a6245c509d0 /nvdebug.h
parent: 31964208e4dc0243b6b31b9967c77a791aeb995c (diff)
1 files changed, 33 insertions, 13 deletions
diff --git a/nvdebug.h b/nvdebug.h
index f0662cd..39b2e6e 100644
--- a/nvdebug.h
+++ b/nvdebug.h
@@ -436,7 +436,9 @@ typedef union {
 #define NV_MC_BOOT_0 0x00000000
 #define NV_CHIP_ID_GP106 0x136 // Discrete GeForce GTX 1060
 #define NV_CHIP_ID_GV11B 0x15B // Jetson Xavier embedded GPU
 #define NV_CHIP_ID_KEPLER 0x0E0
+#define NV_CHIP_ID_MAXWELL 0x120
 #define NV_CHIP_ID_PASCAL 0x130
 #define NV_CHIP_ID_VOLTA 0x140
 #define NV_CHIP_ID_VOLTA_INTEGRATED 0x150
@@ -700,29 +702,47 @@ typedef union {
        uint32_t raw;
 } ptop_device_info_gk104_t;
-/* Graphics Processing Cluster (GPC) information
+/* Graphics Processing Cluster (GPC) on-chip information
  The GPU's Compute/Graphics engine is subdivided into Graphics Processing
  Clusters (also known as GPU Processing Clusters, starting with Ampere).
  Each GPC is subdivided into Texture Processing Clusters (TPCs) which contain
  Streaming Multiprocessors (SMs).
+  The number of these units etched onto the chip may vary from the number
+  enabled and software-visible. These registers expose the number of on-chip
+  GPCs, the number of on-chip TPCs inside a GPC.
+  Support: Fermi through (at least) Blackwell
 */
-// Support: Fermi through Blackwell
-// Get the number of GPCs **on die**
 #define NV_PTOP_SCAL_NUM_GPCS 0x00022430
-// Get the number of TPCs per GPC **on die**
 #define NV_PTOP_SCAL_NUM_TPC_PER_GPC 0x00022434
-// GPC and TPC masks
-// Support: Maxwell, Pascal, Volta, Turing
+/* Graphics Processing Cluster (GPC) enablement information
-// Bitmask of which GPC **are enabled** of the max on die
+  (See above for a description of GPCs and TPCs.)
-#define NV_FUSE_GPC 0x00021c1c
-// Bitmask of which TPCs **are enabled** on each GPC
+  The number of on-chip GPCs and TPCs enabled is driven by:
-#define NV_FUSE_TPC_FOR_GPC(i) (0x00021c38+(i)*4)
+  1) Manufacturing errors which make some units nonfunctional.
-// Support: Ampere, Ada, Hopper, Blackwell
+  2) Commercialization decisions about how many units should be enabled for a
-//#define NV_FUSE_GPC 0x00820c1c
+     specific GPU model.
-//#define NV_FUSE_TPC_FOR_GPC(i) (0x00820c38+(i)*4)
+  Generally, reason (1) drives disablement early in product manufacturing,
+  whereas, as the manufacturing process matures, (2) steps in to ensure
+  consistency between early-manufactured and late-manufactured products.
+  On-chip fuses are used to dictate which units are enabled and disabled. These
+  registers expose the fuse configuration for GPCs, and the TPCs in each GPC.
+  FUSE_GPC            : Bitmask of which GPCs are enabled
+  FUSE_TPC_FOR_GPC(i) : Bitmask of which TPCs are enabled for GPC i
+  Support: Maxwell through Blackwell
+           Note the registers were relocated starting with Ampere.
+*/
+#define NV_FUSE_GPC_GM107 0x00021c1c
+#define NV_FUSE_TPC_FOR_GPC_GM107(i) (0x00021c38+(i)*4)
+#define NV_FUSE_GPC_GA100 0x00820c1c
+#define NV_FUSE_TPC_FOR_GPC_GA100(i) (0x00820c38+(i)*4)
 /* Logical Copy Engine (LCE) Information
  Every GPU has some number of copy engines which can process transfers to,
author	Joshua Bakita <jbakita@cs.unc.edu>	2024-04-09 13:07:19 -0400
committer	Joshua Bakita <jbakita@cs.unc.edu>	2024-04-09 13:07:19 -0400
commit	4768fe31f114c5ad788012db5518ce8e37f79c7a (patch)
tree	03fe90108bf9341b8b9d299df3ba8a6245c509d0 /nvdebug.h
parent	31964208e4dc0243b6b31b9967c77a791aeb995c (diff)

diff --git a/nvdebug.h b/nvdebug.h index f0662cd..39b2e6e 100644 --- a/nvdebug.h +++ b/nvdebug.h
@@ -436,7 +436,9 @@ typedef union {
436	#define NV_MC_BOOT_0 0x00000000	436	#define NV_MC_BOOT_0 0x00000000
437	#define NV_CHIP_ID_GP106 0x136 // Discrete GeForce GTX 1060	437	#define NV_CHIP_ID_GP106 0x136 // Discrete GeForce GTX 1060
438	#define NV_CHIP_ID_GV11B 0x15B // Jetson Xavier embedded GPU	438	#define NV_CHIP_ID_GV11B 0x15B // Jetson Xavier embedded GPU
		439
439	#define NV_CHIP_ID_KEPLER 0x0E0	440	#define NV_CHIP_ID_KEPLER 0x0E0
		441	#define NV_CHIP_ID_MAXWELL 0x120
440	#define NV_CHIP_ID_PASCAL 0x130	442	#define NV_CHIP_ID_PASCAL 0x130
441	#define NV_CHIP_ID_VOLTA 0x140	443	#define NV_CHIP_ID_VOLTA 0x140
442	#define NV_CHIP_ID_VOLTA_INTEGRATED 0x150	444	#define NV_CHIP_ID_VOLTA_INTEGRATED 0x150
@@ -700,29 +702,47 @@ typedef union {
700	uint32_t raw;	702	uint32_t raw;
701	} ptop_device_info_gk104_t;	703	} ptop_device_info_gk104_t;
702		704
703	/* Graphics Processing Cluster (GPC) information	705	/* Graphics Processing Cluster (GPC) on-chip information
704	The GPU's Compute/Graphics engine is subdivided into Graphics Processing	706	The GPU's Compute/Graphics engine is subdivided into Graphics Processing
705	Clusters (also known as GPU Processing Clusters, starting with Ampere).	707	Clusters (also known as GPU Processing Clusters, starting with Ampere).
706		708
707	Each GPC is subdivided into Texture Processing Clusters (TPCs) which contain	709	Each GPC is subdivided into Texture Processing Clusters (TPCs) which contain
708	Streaming Multiprocessors (SMs).	710	Streaming Multiprocessors (SMs).
709		711
		712	The number of these units etched onto the chip may vary from the number
		713	enabled and software-visible. These registers expose the number of on-chip
		714	GPCs, the number of on-chip TPCs inside a GPC.
710		715
		716	Support: Fermi through (at least) Blackwell
711	*/	717	*/
712	// Support: Fermi through Blackwell
713	// Get the number of GPCs on die
714	#define NV_PTOP_SCAL_NUM_GPCS 0x00022430	718	#define NV_PTOP_SCAL_NUM_GPCS 0x00022430
715	// Get the number of TPCs per GPC on die
716	#define NV_PTOP_SCAL_NUM_TPC_PER_GPC 0x00022434	719	#define NV_PTOP_SCAL_NUM_TPC_PER_GPC 0x00022434
717	// GPC and TPC masks	720
718	// Support: Maxwell, Pascal, Volta, Turing	721	/* Graphics Processing Cluster (GPC) enablement information
719	// Bitmask of which GPC are enabled of the max on die	722	(See above for a description of GPCs and TPCs.)
720	#define NV_FUSE_GPC 0x00021c1c	723
721	// Bitmask of which TPCs are enabled on each GPC	724	The number of on-chip GPCs and TPCs enabled is driven by:
722	#define NV_FUSE_TPC_FOR_GPC(i) (0x00021c38+(i)*4)	725	1) Manufacturing errors which make some units nonfunctional.
723	// Support: Ampere, Ada, Hopper, Blackwell	726	2) Commercialization decisions about how many units should be enabled for a
724	//#define NV_FUSE_GPC 0x00820c1c	727	specific GPU model.
725	//#define NV_FUSE_TPC_FOR_GPC(i) (0x00820c38+(i)*4)	728
		729	Generally, reason (1) drives disablement early in product manufacturing,
		730	whereas, as the manufacturing process matures, (2) steps in to ensure
		731	consistency between early-manufactured and late-manufactured products.
		732
		733	On-chip fuses are used to dictate which units are enabled and disabled. These
		734	registers expose the fuse configuration for GPCs, and the TPCs in each GPC.
		735
		736	FUSE_GPC : Bitmask of which GPCs are enabled
		737	FUSE_TPC_FOR_GPC(i) : Bitmask of which TPCs are enabled for GPC i
		738
		739	Support: Maxwell through Blackwell
		740	Note the registers were relocated starting with Ampere.
		741	*/
		742	#define NV_FUSE_GPC_GM107 0x00021c1c
		743	#define NV_FUSE_TPC_FOR_GPC_GM107(i) (0x00021c38+(i)*4)
		744	#define NV_FUSE_GPC_GA100 0x00820c1c
		745	#define NV_FUSE_TPC_FOR_GPC_GA100(i) (0x00820c38+(i)*4)
726		746
727	/* Logical Copy Engine (LCE) Information	747	/* Logical Copy Engine (LCE) Information
728	Every GPU has some number of copy engines which can process transfers to,	748	Every GPU has some number of copy engines which can process transfers to,