From 3653aee74ae8338b9da1f0304b0eaa1171dd640f Mon Sep 17 00:00:00 2001 From: Joshua Bakita Date: Thu, 19 Sep 2024 15:38:53 -0400 Subject: Correctly check for read errors in the nvdebug_read* functions Follows how NVIDIA's open-source GPU driver checks for bad reads. --- nvdebug_linux.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'nvdebug_linux.c') diff --git a/nvdebug_linux.c b/nvdebug_linux.c index 1d76bc9..830ec6e 100644 --- a/nvdebug_linux.c +++ b/nvdebug_linux.c @@ -13,9 +13,17 @@ u32 nvdebug_readl(struct nvdebug_state *s, u32 r) { return -1; } ret = readl(s->regs + r); - // It seems like the GPU returns this as a flag value for bad addresses - if (ret == 0xbadf5040) { - printk(KERN_ERR "[nvdebug] nvdebug_readl: Unable to read from register offset %#x; bad data\n", r); + // According to open-gpu-kernel-modules, the GPU "will return 0xbad in the + // upper 3 nibbles when there is a possible issue". Further code uses the + // middle three nibbles as an error code, and ignores the bottom two. + if ((ret & 0xfff00000) == 0xbad00000) { + printk(KERN_ERR "[nvdebug] nvdebug_readl: Unable to read from register offset %#x; bad data of %#10x\n", r, ret); + // It would be best to check INTR_0_PRI_* error is pending, to verify + // that this was actually a bad read. Possible future work... + // Generally a failure here in the context of nvdebug indicates that a + // register does not exist on this platform, but one can know for sure + // by checking which NV_PPRIV_SYS_PRI_ERROR_CODE_* define the bad read + // matches. return -1; } return ret; @@ -28,12 +36,12 @@ u64 nvdebug_readq(struct nvdebug_state *s, u32 r) { printk(KERN_ERR "[nvdebug] nvdebug_readq: Unable to read; registers unavailable. Is GPU on?\n"); return -1; } - // readq seems to always return the uppermost 32 bits as 0, so workaround with readl + // readq seems to always (?) return the uppermost 32 bits as 0, so workaround with readl ret = readl(s->regs + r); ret |= ((u64)readl(s->regs + r + 4)) << 32; - // It seems like the GPU returns this as a flag value for bad addresses - if ((ret & 0xffffffffull) == 0xbadf5040ull) { - printk(KERN_ERR "[nvdebug] nvdebug_readq: Unable to read from register offset %#x; bad data\n", r); + // See comment in nvdebug_readl() regarding error checking + if ((ret & 0xfff00000ull) == 0xbad00000ull) { + printk(KERN_ERR "[nvdebug] nvdebug_readq: Unable to read from register offset %#x; bad data of %#18llx\n", r, ret); return -1; } return ret; @@ -50,7 +58,7 @@ void nvdebug_writel(struct nvdebug_state *s, u32 r, u32 v) { } // quadword version of nvdebug_writel() -// XXX: This probably doesn't work XXX: Untested +// XXX: Not clear this works on all platforms void nvdebug_writeq(struct nvdebug_state *s, u32 r, u64 v) { if (unlikely(!s->regs || (s->g && !gk20a_regs(s->g)))) { printk(KERN_ERR "[nvdebug] nvdebug_writeq: Unable to write; registers unavailable. Is GPU on?\n"); -- cgit v1.2.2