aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJoshua Bakita <jbakita@cs.unc.edu>2025-04-04 10:29:54 -0400
committerJoshua Bakita <jbakita@cs.unc.edu>2025-04-04 10:29:54 -0400
commit494df296bf4abe9b2b484bde1a4fad28c989afec (patch)
tree123a4b696545e70a9953ada907ed00dfccb6038a
parent6143114460e5125621747cde2f712fed445b9a15 (diff)
Fix a race condition in nvdebug_{readl,readq,writel,writeq}HEADmaster
When the GPU is powered off, attempts to read any of its registers (such as via nvdebug_readl()) result in a fatal interrupt. The pm_runtime_get() call included in nvdebug sent a request to nvgpu to turn the GPU back on. **However,** this call did not wait for the power-on command to take effect. This resulted in a race between nvdebug and the power management logic, meaning that the GPU may not have powered-on by the time that nvdebug attempted to read its registers. Use pm_runtime_get_sync() instead, which explicitly waits for the power-on command to complete (or fail) before returning. This eliminates the race condition. Thank you to Diego Alejandro Parra Guzman <diego.guzman@tttech-auto.com>, who brought this issue to my attention.
-rw-r--r--nvdebug_linux.c19
1 files changed, 13 insertions, 6 deletions
diff --git a/nvdebug_linux.c b/nvdebug_linux.c
index 111d5aa..e673a8b 100644
--- a/nvdebug_linux.c
+++ b/nvdebug_linux.c
@@ -1,4 +1,4 @@
1/* Copyright 2024 Joshua Bakita 1/* Copyright 2024-2025 Joshua Bakita
2 * Implementation of Kernel-specific function implementations 2 * Implementation of Kernel-specific function implementations
3 */ 3 */
4#include "nvdebug_linux.h" 4#include "nvdebug_linux.h"
@@ -9,10 +9,14 @@ u32 nvdebug_readl(struct nvdebug_state *s, u32 r) {
9 u32 ret; 9 u32 ret;
10 // If this is an integrated ("platform") GPU, make sure that it's on first 10 // If this is an integrated ("platform") GPU, make sure that it's on first
11 // (pm_runtime_enabled() will return false until nvgpu is started. Once 11 // (pm_runtime_enabled() will return false until nvgpu is started. Once
12 // nvgpu is started, pm_runtime_get() will attempt to resume the GPU.) 12 // nvgpu is started, pm_runtime_get_sync() will attempt to resume the GPU.
13 // This still increments the usage counter on failure, so we undo that with
14 // pm_runtime_put_noidle(). We avoid pm_runtime_resume_and_get() as it was
15 // not added until Linux 5.9.11)
13 // This works to bring up the TX2, Xavier, and Orin, but not the TX1. 16 // This works to bring up the TX2, Xavier, and Orin, but not the TX1.
14 if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get(s->dev) < 0)) { 17 if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get_sync(s->dev) < 0)) {
15 printk(KERN_ERR "[nvdebug] nvdebug_readl: Unable to read; registers unavailable. Is GPU on?\n"); 18 printk(KERN_ERR "[nvdebug] nvdebug_readl: Unable to read; registers unavailable. Is GPU on?\n");
19 pm_runtime_put_noidle(s->dev); // No-op if !pm_runtime_enabled()
16 return -1; 20 return -1;
17 } 21 }
18 ret = readl(s->regs + r); 22 ret = readl(s->regs + r);
@@ -39,8 +43,9 @@ u32 nvdebug_readl(struct nvdebug_state *s, u32 r) {
39u64 nvdebug_readq(struct nvdebug_state *s, u32 r) { 43u64 nvdebug_readq(struct nvdebug_state *s, u32 r) {
40 u64 ret; 44 u64 ret;
41 // If this is an integrated ("platform") GPU, make sure that it's on first 45 // If this is an integrated ("platform") GPU, make sure that it's on first
42 if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get(s->dev) < 0)) { 46 if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get_sync(s->dev) < 0)) {
43 printk(KERN_ERR "[nvdebug] nvdebug_readq: Unable to read; registers unavailable. Is GPU on?\n"); 47 printk(KERN_ERR "[nvdebug] nvdebug_readq: Unable to read; registers unavailable. Is GPU on?\n");
48 pm_runtime_put_noidle(s->dev); // No-op if !pm_runtime_enabled()
44 return -1; 49 return -1;
45 } 50 }
46 // readq seems to always (?) return the uppermost 32 bits as 0, so workaround with readl 51 // readq seems to always (?) return the uppermost 32 bits as 0, so workaround with readl
@@ -59,8 +64,9 @@ u64 nvdebug_readq(struct nvdebug_state *s, u32 r) {
59 64
60void nvdebug_writel(struct nvdebug_state *s, u32 r, u32 v) { 65void nvdebug_writel(struct nvdebug_state *s, u32 r, u32 v) {
61 // If this is an integrated ("platform") GPU, make sure that it's on first 66 // If this is an integrated ("platform") GPU, make sure that it's on first
62 if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get(s->dev) < 0)) { 67 if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get_sync(s->dev) < 0)) {
63 printk(KERN_ERR "[nvdebug] nvdebug_writel: Unable to write; registers unavailable. Is GPU on?\n"); 68 printk(KERN_ERR "[nvdebug] nvdebug_writel: Unable to write; registers unavailable. Is GPU on?\n");
69 pm_runtime_put_noidle(s->dev); // No-op if !pm_runtime_enabled()
64 return; 70 return;
65 } 71 }
66 writel_relaxed(v, s->regs + r); 72 writel_relaxed(v, s->regs + r);
@@ -74,8 +80,9 @@ void nvdebug_writel(struct nvdebug_state *s, u32 r, u32 v) {
74// XXX: Not clear this works on all platforms 80// XXX: Not clear this works on all platforms
75void nvdebug_writeq(struct nvdebug_state *s, u32 r, u64 v) { 81void nvdebug_writeq(struct nvdebug_state *s, u32 r, u64 v) {
76 // If this is an integrated ("platform") GPU, make sure that it's on first 82 // If this is an integrated ("platform") GPU, make sure that it's on first
77 if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get(s->dev) < 0)) { 83 if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get_sync(s->dev) < 0)) {
78 printk(KERN_ERR "[nvdebug] nvdebug_writeq: Unable to write; registers unavailable. Is GPU on?\n"); 84 printk(KERN_ERR "[nvdebug] nvdebug_writeq: Unable to write; registers unavailable. Is GPU on?\n");
85 pm_runtime_put_noidle(s->dev); // No-op if !pm_runtime_enabled()
79 return; 86 return;
80 } 87 }
81 writeq_relaxed(v, s->regs + r); 88 writeq_relaxed(v, s->regs + r);