diff options
| author | Joshua Bakita <jbakita@cs.unc.edu> | 2025-04-04 10:29:54 -0400 |
|---|---|---|
| committer | Joshua Bakita <jbakita@cs.unc.edu> | 2025-04-04 10:29:54 -0400 |
| commit | 494df296bf4abe9b2b484bde1a4fad28c989afec (patch) | |
| tree | 123a4b696545e70a9953ada907ed00dfccb6038a | |
| parent | 6143114460e5125621747cde2f712fed445b9a15 (diff) | |
When the GPU is powered off, attempts to read any of its registers
(such as via nvdebug_readl()) result in a fatal interrupt. The
pm_runtime_get() call included in nvdebug sent a request to nvgpu
to turn the GPU back on. **However,** this call did not wait for
the power-on command to take effect. This resulted in a race between
nvdebug and the power management logic, meaning that the GPU may not
have powered on by the time that nvdebug attempted to read its
registers.
Use pm_runtime_get_sync() instead, which explicitly waits for the
power-on command to complete (or fail) before returning. This
eliminates the race condition.
Thank you to Diego Alejandro Parra Guzman
<diego.guzman@tttech-auto.com>, who brought this issue to my
attention.
| -rw-r--r-- | nvdebug_linux.c | 19 |
1 files changed, 13 insertions, 6 deletions
diff --git a/nvdebug_linux.c b/nvdebug_linux.c index 111d5aa..e673a8b 100644 --- a/nvdebug_linux.c +++ b/nvdebug_linux.c | |||
| @@ -1,4 +1,4 @@ | |||
| 1 | /* Copyright 2024 Joshua Bakita | 1 | /* Copyright 2024-2025 Joshua Bakita |
| 2 | * Implementation of Kernel-specific function implementations | 2 | * Implementation of Kernel-specific function implementations |
| 3 | */ | 3 | */ |
| 4 | #include "nvdebug_linux.h" | 4 | #include "nvdebug_linux.h" |
| @@ -9,10 +9,14 @@ u32 nvdebug_readl(struct nvdebug_state *s, u32 r) { | |||
| 9 | u32 ret; | 9 | u32 ret; |
| 10 | // If this is an integrated ("platform") GPU, make sure that it's on first | 10 | // If this is an integrated ("platform") GPU, make sure that it's on first |
| 11 | // (pm_runtime_enabled() will return false until nvgpu is started. Once | 11 | // (pm_runtime_enabled() will return false until nvgpu is started. Once |
| 12 | // nvgpu is started, pm_runtime_get() will attempt to resume the GPU.) | 12 | // nvgpu is started, pm_runtime_get_sync() will attempt to resume the GPU. |
| 13 | // This still increments the usage counter on failure, so we undo that with | ||
| 14 | // pm_runtime_put_noidle(). We avoid pm_runtime_resume_and_get() as it was | ||
| 15 | // not added until Linux 5.9.11) | ||
| 13 | // This works to bring up the TX2, Xavier, and Orin, but not the TX1. | 16 | // This works to bring up the TX2, Xavier, and Orin, but not the TX1. |
| 14 | if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get(s->dev) < 0)) { | 17 | if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get_sync(s->dev) < 0)) { |
| 15 | printk(KERN_ERR "[nvdebug] nvdebug_readl: Unable to read; registers unavailable. Is GPU on?\n"); | 18 | printk(KERN_ERR "[nvdebug] nvdebug_readl: Unable to read; registers unavailable. Is GPU on?\n"); |
| 19 | pm_runtime_put_noidle(s->dev); // No-op if !pm_runtime_enabled() | ||
| 16 | return -1; | 20 | return -1; |
| 17 | } | 21 | } |
| 18 | ret = readl(s->regs + r); | 22 | ret = readl(s->regs + r); |
| @@ -39,8 +43,9 @@ u32 nvdebug_readl(struct nvdebug_state *s, u32 r) { | |||
| 39 | u64 nvdebug_readq(struct nvdebug_state *s, u32 r) { | 43 | u64 nvdebug_readq(struct nvdebug_state *s, u32 r) { |
| 40 | u64 ret; | 44 | u64 ret; |
| 41 | // If this is an integrated ("platform") GPU, make sure that it's on first | 45 | // If this is an integrated ("platform") GPU, make sure that it's on first |
| 42 | if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get(s->dev) < 0)) { | 46 | if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get_sync(s->dev) < 0)) { |
| 43 | printk(KERN_ERR "[nvdebug] nvdebug_readq: Unable to read; registers unavailable. Is GPU on?\n"); | 47 | printk(KERN_ERR "[nvdebug] nvdebug_readq: Unable to read; registers unavailable. Is GPU on?\n"); |
| 48 | pm_runtime_put_noidle(s->dev); // No-op if !pm_runtime_enabled() | ||
| 44 | return -1; | 49 | return -1; |
| 45 | } | 50 | } |
| 46 | // readq seems to always (?) return the uppermost 32 bits as 0, so workaround with readl | 51 | // readq seems to always (?) return the uppermost 32 bits as 0, so workaround with readl |
| @@ -59,8 +64,9 @@ u64 nvdebug_readq(struct nvdebug_state *s, u32 r) { | |||
| 59 | 64 | ||
| 60 | void nvdebug_writel(struct nvdebug_state *s, u32 r, u32 v) { | 65 | void nvdebug_writel(struct nvdebug_state *s, u32 r, u32 v) { |
| 61 | // If this is an integrated ("platform") GPU, make sure that it's on first | 66 | // If this is an integrated ("platform") GPU, make sure that it's on first |
| 62 | if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get(s->dev) < 0)) { | 67 | if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get_sync(s->dev) < 0)) { |
| 63 | printk(KERN_ERR "[nvdebug] nvdebug_writel: Unable to write; registers unavailable. Is GPU on?\n"); | 68 | printk(KERN_ERR "[nvdebug] nvdebug_writel: Unable to write; registers unavailable. Is GPU on?\n"); |
| 69 | pm_runtime_put_noidle(s->dev); // No-op if !pm_runtime_enabled() | ||
| 64 | return; | 70 | return; |
| 65 | } | 71 | } |
| 66 | writel_relaxed(v, s->regs + r); | 72 | writel_relaxed(v, s->regs + r); |
| @@ -74,8 +80,9 @@ void nvdebug_writel(struct nvdebug_state *s, u32 r, u32 v) { | |||
| 74 | // XXX: Not clear this works on all platforms | 80 | // XXX: Not clear this works on all platforms |
| 75 | void nvdebug_writeq(struct nvdebug_state *s, u32 r, u64 v) { | 81 | void nvdebug_writeq(struct nvdebug_state *s, u32 r, u64 v) { |
| 76 | // If this is an integrated ("platform") GPU, make sure that it's on first | 82 | // If this is an integrated ("platform") GPU, make sure that it's on first |
| 77 | if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get(s->dev) < 0)) { | 83 | if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get_sync(s->dev) < 0)) { |
| 78 | printk(KERN_ERR "[nvdebug] nvdebug_writeq: Unable to write; registers unavailable. Is GPU on?\n"); | 84 | printk(KERN_ERR "[nvdebug] nvdebug_writeq: Unable to write; registers unavailable. Is GPU on?\n"); |
| 85 | pm_runtime_put_noidle(s->dev); // No-op if !pm_runtime_enabled() | ||
| 79 | return; | 86 | return; |
| 80 | } | 87 | } |
| 81 | writeq_relaxed(v, s->regs + r); | 88 | writeq_relaxed(v, s->regs + r); |
