Correctly check for read errors in the nvdebug_read* functions

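This follows how NVIDIA's open-source GPU driver (open-gpu-kernel-modules) checks for bad reads: a value with 0xbad in its upper three nibbles is treated as an error, rather than only the single flag value 0xbadf5040.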
author: Joshua Bakita <bakitajoshua@gmail.com> 2024-09-19 15:38:53 -0400
committer: Joshua Bakita <bakitajoshua@gmail.com> 2024-09-19 15:38:53 -0400
commit: 3653aee74ae8338b9da1f0304b0eaa1171dd640f (patch)
tree: 554c12f901c209b962ca9dac48bde9d3c59f2281
parent: 48f9e45b9d9ebfca7d3c673597f7fbed9427a5af (diff)
1 file changed, 16 insertions, 8 deletions
diff --git a/nvdebug_linux.c b/nvdebug_linux.c
index 1d76bc9..830ec6e 100644
--- a/nvdebug_linux.c
+++ b/nvdebug_linux.c
@@ -13,9 +13,17 @@ u32 nvdebug_readl(struct nvdebug_state *s, u32 r) {
                return -1;
        }
        ret = readl(s->regs + r);
-        // It seems like the GPU returns this as a flag value for bad addresses
+        // According to open-gpu-kernel-modules, the GPU "will return 0xbad in the
-        if (ret == 0xbadf5040) {
+        // upper 3 nibbles when there is a possible issue". Further code uses the
-                printk(KERN_ERR "[nvdebug] nvdebug_readl: Unable to read from register offset %#x; bad data\n", r);
+        // middle three nibbles as an error code, and ignores the bottom two.
+        if ((ret & 0xfff00000) == 0xbad00000) {
+                printk(KERN_ERR "[nvdebug] nvdebug_readl: Unable to read from register offset %#x; bad data of %#10x\n", r, ret);
+                // It would be best to check INTR_0_PRI_* error is pending, to verify
+                // that this was actually a bad read. Possible future work...
+                // Generally a failure here in the context of nvdebug indicates that a
+                // register does not exist on this platform, but one can know for sure
+                // by checking which NV_PPRIV_SYS_PRI_ERROR_CODE_* define the bad read
+                // matches.
                return -1;
        }
        return ret;
@@ -28,12 +36,12 @@ u64 nvdebug_readq(struct nvdebug_state *s, u32 r) {
                printk(KERN_ERR "[nvdebug] nvdebug_readq: Unable to read; registers unavailable. Is GPU on?\n");
                return -1;
        }
-        // readq seems to always return the uppermost 32 bits as 0, so workaround with readl
+        // readq seems to always (?) return the uppermost 32 bits as 0, so workaround with readl
        ret = readl(s->regs + r);
        ret |= ((u64)readl(s->regs + r + 4)) << 32;
-        // It seems like the GPU returns this as a flag value for bad addresses
+        // See comment in nvdebug_readl() regarding error checking
-        if ((ret & 0xffffffffull) == 0xbadf5040ull) {
+        if ((ret & 0xfff00000ull) == 0xbad00000ull) {
-                printk(KERN_ERR "[nvdebug] nvdebug_readq: Unable to read from register offset %#x; bad data\n", r);
+                printk(KERN_ERR "[nvdebug] nvdebug_readq: Unable to read from register offset %#x; bad data of %#18llx\n", r, ret);
                return -1;
        }
        return ret;
@@ -50,7 +58,7 @@ void nvdebug_writel(struct nvdebug_state *s, u32 r, u32 v) {
 }
 // quadword version of nvdebug_writel()
-// XXX: This probably doesn't work XXX: Untested
+// XXX: Not clear this works on all platforms
 void nvdebug_writeq(struct nvdebug_state *s, u32 r, u64 v) {
        if (unlikely(!s->regs || (s->g && !gk20a_regs(s->g)))) {
                printk(KERN_ERR "[nvdebug] nvdebug_writeq: Unable to write; registers unavailable. Is GPU on?\n");
author	Joshua Bakita <bakitajoshua@gmail.com>	2024-09-19 15:38:53 -0400
committer	Joshua Bakita <bakitajoshua@gmail.com>	2024-09-19 15:38:53 -0400
commit	3653aee74ae8338b9da1f0304b0eaa1171dd640f (patch)
tree	554c12f901c209b962ca9dac48bde9d3c59f2281
parent	48f9e45b9d9ebfca7d3c673597f7fbed9427a5af (diff)