From 494df296bf4abe9b2b484bde1a4fad28c989afec Mon Sep 17 00:00:00 2001 From: Joshua Bakita Date: Fri, 4 Apr 2025 10:29:54 -0400 Subject: Fix a race condition in nvdebug_{readl,readq,writel,writeq} When the GPU is powered off, attempts to read any of its registers (such as via nvdebug_readl()) result in a fatal interrupt. The pm_runtime_get() call included in nvdebug sent a request to nvgpu to turn the GPU back on. **However,** this call did not wait for the power-on command to take effect. This resulted in a race between nvdebug and the power management logic, meaning that the GPU may not have powered-on by the time that nvdebug attempted to read its registers. Use pm_runtime_get_sync() instead, which explicitly waits for the power-on command to complete (or fail) before returning. This eliminates the race condition. Thank you to Diego Alejandro Parra Guzman , who brought this issue to my attention. --- nvdebug_linux.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/nvdebug_linux.c b/nvdebug_linux.c index 111d5aa..e673a8b 100644 --- a/nvdebug_linux.c +++ b/nvdebug_linux.c @@ -1,4 +1,4 @@ -/* Copyright 2024 Joshua Bakita +/* Copyright 2024-2025 Joshua Bakita * Implementation of Kernel-specific function implementations */ #include "nvdebug_linux.h" @@ -9,10 +9,14 @@ u32 nvdebug_readl(struct nvdebug_state *s, u32 r) { u32 ret; // If this is an integrated ("platform") GPU, make sure that it's on first // (pm_runtime_enabled() will return false until nvgpu is started. Once - // nvgpu is started, pm_runtime_get() will attempt to resume the GPU.) + // nvgpu is started, pm_runtime_get_sync() will attempt to resume the GPU. + // This still increments the usage counter on failure, so we undo that with + // pm_runtime_put_noidle(). We avoid pm_runtime_resume_and_get() as it was + // not added until Linux 5.9.11) // This works to bring up the TX2, Xavier, and Orin, but not the TX1. - if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get(s->dev) < 0)) { + if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get_sync(s->dev) < 0)) { printk(KERN_ERR "[nvdebug] nvdebug_readl: Unable to read; registers unavailable. Is GPU on?\n"); + pm_runtime_put_noidle(s->dev); // No-op if !pm_runtime_enabled() return -1; } ret = readl(s->regs + r); @@ -39,8 +43,9 @@ u32 nvdebug_readl(struct nvdebug_state *s, u32 r) { u64 nvdebug_readq(struct nvdebug_state *s, u32 r) { u64 ret; // If this is an integrated ("platform") GPU, make sure that it's on first - if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get(s->dev) < 0)) { + if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get_sync(s->dev) < 0)) { printk(KERN_ERR "[nvdebug] nvdebug_readq: Unable to read; registers unavailable. Is GPU on?\n"); + pm_runtime_put_noidle(s->dev); // No-op if !pm_runtime_enabled() return -1; } // readq seems to always (?) return the uppermost 32 bits as 0, so workaround with readl @@ -59,8 +64,9 @@ u64 nvdebug_readq(struct nvdebug_state *s, u32 r) { void nvdebug_writel(struct nvdebug_state *s, u32 r, u32 v) { // If this is an integrated ("platform") GPU, make sure that it's on first - if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get(s->dev) < 0)) { + if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get_sync(s->dev) < 0)) { printk(KERN_ERR "[nvdebug] nvdebug_writel: Unable to write; registers unavailable. Is GPU on?\n"); + pm_runtime_put_noidle(s->dev); // No-op if !pm_runtime_enabled() return; } writel_relaxed(v, s->regs + r); @@ -74,8 +80,9 @@ void nvdebug_writel(struct nvdebug_state *s, u32 r, u32 v) { // XXX: Not clear this works on all platforms void nvdebug_writeq(struct nvdebug_state *s, u32 r, u64 v) { // If this is an integrated ("platform") GPU, make sure that it's on first - if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get(s->dev) < 0)) { + if (s->platd && (!pm_runtime_enabled(s->dev) || pm_runtime_get_sync(s->dev) < 0)) { printk(KERN_ERR "[nvdebug] nvdebug_writeq: Unable to write; registers unavailable. Is GPU on?\n"); + pm_runtime_put_noidle(s->dev); // No-op if !pm_runtime_enabled() return; } writeq_relaxed(v, s->regs + r); -- cgit v1.2.2