Fix a race condition in nvdebug_{readl,readq,writel,writeq}HEAD master

When the GPU is powered off, attempts to read any of its registers (such as via nvdebug_readl()) result in a fatal interrupt. The pm_runtime_get() call included in nvdebug sent a request to nvgpu to turn the GPU back on. **However,** this call did not wait for the power-on command to take effect. This resulted in a race between nvdebug and the power management logic, meaning that the GPU may not have powered-on by the time that nvdebug attempted to read its registers. Use pm_runtime_get_sync() instead, which explicitly waits for the power-on command to complete (or fail) before returning. This eliminates the race condition. Thank you to Diego Alejandro Parra Guzman <diego.guzman@tttech-auto.com>, who brought this issue to my attention.
author: Joshua Bakita <jbakita@cs.unc.edu> 2025-04-04 10:29:54 -0400
committer: Joshua Bakita <jbakita@cs.unc.edu> 2025-04-04 10:29:54 -0400
commit: 494df296bf4abe9b2b484bde1a4fad28c989afec (patch)
tree: 123a4b696545e70a9953ada907ed00dfccb6038a
parent: 6143114460e5125621747cde2f712fed445b9a15 (diff)
1 files changed, 13 insertions, 6 deletions
diff --git a/nvdebug_linux.c b/nvdebug_linux.c
index 111d5aa..e673a8b 100644
--- a/nvdebug_linux.c
+++ b/nvdebug_linux.c
@@ -1,4 +1,4 @@
-/* Copyright 2024 Joshua Bakita
+/* Copyright 2024-2025 Joshua Bakita
 * Implementation of Kernel-specific function implementations
 */
 #include "nvdebug_linux.h"
@@ -9,10 +9,14 @@ u32 nvdebug_readl(struct nvdebug_state *s, u32 r) {
        u32 ret;
        // If this is an integrated ("platform") GPU, make sure that it's on first
        // (pm_runtime_enabled() will return false until nvgpu is started. Once
-        // nvgpu is started, pm_runtime_get() will attempt to resume the GPU.)
+        // nvgpu is started, pm_runtime_get_sync() will attempt to resume the GPU.
+        // This still increments the usage counter on failure, so we undo that with
+        // pm_runtime_put_noidle(). We avoid pm_runtime_resume_and_get() as it was
+        // not added until Linux 5.9.11)
        // This works to bring up the TX2, Xavier, and Orin, but not the TX1.
-        if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get(s->dev) < 0)) {
+        if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get_sync(s->dev) < 0)) {
                printk(KERN_ERR "[nvdebug] nvdebug_readl: Unable to read; registers unavailable. Is GPU on?\n");
+                pm_runtime_put_noidle(s->dev); // No-op if !pm_runtime_enabled()
                return -1;
        }
        ret = readl(s->regs + r);
@@ -39,8 +43,9 @@ u32 nvdebug_readl(struct nvdebug_state *s, u32 r) {
 u64 nvdebug_readq(struct nvdebug_state *s, u32 r) {
        u64 ret;
        // If this is an integrated ("platform") GPU, make sure that it's on first
-        if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get(s->dev) < 0)) {
+        if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get_sync(s->dev) < 0)) {
                printk(KERN_ERR "[nvdebug] nvdebug_readq: Unable to read; registers unavailable. Is GPU on?\n");
+                pm_runtime_put_noidle(s->dev); // No-op if !pm_runtime_enabled()
                return -1;
        }
        // readq seems to always (?) return the uppermost 32 bits as 0, so workaround with readl
@@ -59,8 +64,9 @@ u64 nvdebug_readq(struct nvdebug_state *s, u32 r) {
 void nvdebug_writel(struct nvdebug_state *s, u32 r, u32 v) {
        // If this is an integrated ("platform") GPU, make sure that it's on first
-        if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get(s->dev) < 0)) {
+        if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get_sync(s->dev) < 0)) {
                printk(KERN_ERR "[nvdebug] nvdebug_writel: Unable to write; registers unavailable. Is GPU on?\n");
+                pm_runtime_put_noidle(s->dev); // No-op if !pm_runtime_enabled()
                return;
        }
        writel_relaxed(v, s->regs + r);
@@ -74,8 +80,9 @@ void nvdebug_writel(struct nvdebug_state *s, u32 r, u32 v) {
 // XXX: Not clear this works on all platforms
 void nvdebug_writeq(struct nvdebug_state *s, u32 r, u64 v) {
        // If this is an integrated ("platform") GPU, make sure that it's on first
-        if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get(s->dev) < 0)) {
+        if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get_sync(s->dev) < 0)) {
                printk(KERN_ERR "[nvdebug] nvdebug_writeq: Unable to write; registers unavailable. Is GPU on?\n");
+                pm_runtime_put_noidle(s->dev); // No-op if !pm_runtime_enabled()
                return;
        }
        writeq_relaxed(v, s->regs + r);
author	Joshua Bakita <jbakita@cs.unc.edu>	2025-04-04 10:29:54 -0400
committer	Joshua Bakita <jbakita@cs.unc.edu>	2025-04-04 10:29:54 -0400
commit	494df296bf4abe9b2b484bde1a4fad28c989afec (patch)
tree	123a4b696545e70a9953ada907ed00dfccb6038a
parent	6143114460e5125621747cde2f712fed445b9a15 (diff)