diff options
author | Deepak Nibade <dnibade@nvidia.com> | 2016-01-07 04:01:12 -0500 |
---|---|---|
committer | Terje Bergstrom <tbergstrom@nvidia.com> | 2016-01-11 12:06:31 -0500 |
commit | 0ce201e8de6a320b70f1f34d05202650b9b5a046 (patch) | |
tree | 81898989fe240bf10190d15887cbfc98fbc45fa3 /drivers/gpu | |
parent | 9713e3572a740216c6ecbc2257349be51c204a67 (diff) |
gpu: nvgpu: stop timer on failing channel
In gk20a_channel_timeout_handler(), the following deadlock scenario
is possible:
thread 1:
- take global lock g->ch_wdt_lock
- identify timed out channel (as ch1)
- check engine status which is stuck
- identify failing channel on engine as ch2
- we need to trigger recovery with ch2
- as part of recovery, call channel_abort() for ch2
- in channel_abort(), we wait to cancel the timer wq
- but timer wq for ch2 never completes due to thread 2
thread 2:
- ch2 has already timed out
- to process, we wait for global lock g->ch_wdt_lock
- this lock needs to be released by thread 1
To fix this, cancel the timer of ch2 (the failing channel on the
engine) by clearing its timeout.initialized flag under timeout.lock
before triggering recovery on that channel.
Bug 200164753
Change-Id: Idb42d01c8440a53f43cb5e87e41f1c283f7e8fcf
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: http://git-master/r/929924
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
Diffstat (limited to 'drivers/gpu')
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 7 |
1 files changed, 6 insertions, 1 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c index f0a700ac..2421307f 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c | |||
@@ -1700,9 +1700,14 @@ static void gk20a_channel_timeout_handler(struct work_struct *work) | |||
1700 | if (!failing_ch) | 1700 | if (!failing_ch) |
1701 | goto fail_enable_ctxsw; | 1701 | goto fail_enable_ctxsw; |
1702 | 1702 | ||
1703 | if (failing_ch->hw_chid != ch->hw_chid) | 1703 | if (failing_ch->hw_chid != ch->hw_chid) { |
1704 | gk20a_channel_timeout_start(ch, job); | 1704 | gk20a_channel_timeout_start(ch, job); |
1705 | 1705 | ||
1706 | mutex_lock(&failing_ch->timeout.lock); | ||
1707 | failing_ch->timeout.initialized = false; | ||
1708 | mutex_unlock(&failing_ch->timeout.lock); | ||
1709 | } | ||
1710 | |||
1706 | gk20a_fifo_recover(g, BIT(engine_id), | 1711 | gk20a_fifo_recover(g, BIT(engine_id), |
1707 | failing_ch->hw_chid, is_tsg, | 1712 | failing_ch->hw_chid, is_tsg, |
1708 | true, failing_ch->timeout_debug_dump); | 1713 | true, failing_ch->timeout_debug_dump); |