From 3653aee74ae8338b9da1f0304b0eaa1171dd640f Mon Sep 17 00:00:00 2001
From: Joshua Bakita <bakitajoshua@gmail.com>
Date: Thu, 19 Sep 2024 15:38:53 -0400
Subject: Correctly check for read errors in the nvdebug_read* functions

Follows how NVIDIA's open-source GPU driver checks for bad reads.
---
 nvdebug_linux.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

(limited to 'nvdebug_linux.c')

diff --git a/nvdebug_linux.c b/nvdebug_linux.c
index 1d76bc9..830ec6e 100644
--- a/nvdebug_linux.c
+++ b/nvdebug_linux.c
@@ -13,9 +13,17 @@ u32 nvdebug_readl(struct nvdebug_state *s, u32 r) {
 		return -1;
 	}
 	ret = readl(s->regs + r);
-	// It seems like the GPU returns this as a flag value for bad addresses
-	if (ret == 0xbadf5040) {
-		printk(KERN_ERR "[nvdebug] nvdebug_readl: Unable to read from register offset %#x; bad data\n", r);
+	// According to open-gpu-kernel-modules, the GPU "will return 0xbad in the
+	// upper 3 nibbles when there is a possible issue". Further code uses the
+	// middle three nibbles as an error code, and ignores the bottom two.
+	if ((ret & 0xfff00000) == 0xbad00000) {
+		printk(KERN_ERR "[nvdebug] nvdebug_readl: Unable to read from register offset %#x; bad data of %#10x\n", r, ret);
+		// It would be best to check that an INTR_0_PRI_* error is pending, to
+		// verify that this was actually a bad read. Possible future work...
+		// Generally a failure here in the context of nvdebug indicates that a
+		// register does not exist on this platform, but one can know for sure
+		// by checking which NV_PPRIV_SYS_PRI_ERROR_CODE_* define the bad read
+		// matches.
 		return -1;
 	}
 	return ret;
@@ -28,12 +36,12 @@ u64 nvdebug_readq(struct nvdebug_state *s, u32 r) {
 		printk(KERN_ERR "[nvdebug] nvdebug_readq: Unable to read; registers unavailable. Is GPU on?\n");
 		return -1;
 	}
-	// readq seems to always return the uppermost 32 bits as 0, so workaround with readl
+	// readq seems to always (?) return the uppermost 32 bits as 0, so work around this with readl
 	ret = readl(s->regs + r);
 	ret |= ((u64)readl(s->regs + r + 4)) << 32;
-	// It seems like the GPU returns this as a flag value for bad addresses
-	if ((ret & 0xffffffffull) == 0xbadf5040ull) {
-		printk(KERN_ERR "[nvdebug] nvdebug_readq: Unable to read from register offset %#x; bad data\n", r);
+	// See comment in nvdebug_readl() regarding error checking (only the low 32-bit word is checked here)
+	if ((ret & 0xfff00000ull) == 0xbad00000ull) {
+		printk(KERN_ERR "[nvdebug] nvdebug_readq: Unable to read from register offset %#x; bad data of %#18llx\n", r, ret);
 		return -1;
 	}
 	return ret;
@@ -50,7 +58,7 @@ void nvdebug_writel(struct nvdebug_state *s, u32 r, u32 v) {
 }
 
 // quadword version of nvdebug_writel()
-// XXX: This probably doesn't work XXX: Untested
+// XXX: Not clear this works on all platforms
 void nvdebug_writeq(struct nvdebug_state *s, u32 r, u64 v) {
 	if (unlikely(!s->regs || (s->g && !gk20a_regs(s->g)))) {
 		printk(KERN_ERR "[nvdebug] nvdebug_writeq: Unable to write; registers unavailable. Is GPU on?\n");
-- 
cgit v1.2.2