From 4252e00aa6f2a82ecf608f86d8057bac8cc97e15 Mon Sep 17 00:00:00 2001 From: Deepak Nibade Date: Wed, 30 May 2018 18:47:44 -0700 Subject: gpu: nvgpu: fix crash due to accessing incorrect TSG pointer In gk20a_gr_isr(), we handle various errors including GPC/TPC errors. Then, if BPT errors are pending, we call gk20a_gr_post_bpt_events() at the end and pass the channel pointer to it. gk20a_gr_post_bpt_events() extracts the TSG pointer based on ch->tsgid. But in some race conditions it is possible that we clear the error and trigger recovery, and as a result the channel is unbound from the TSG and closed by user space before gk20a_gr_post_bpt_events() is called. In that case the code above gets an incorrect TSG pointer and hence crashes as below: Unable to handle kernel paging request at virtual address ffffff8012000c08 ... [] el1_da+0x24/0xb4 [] gk20a_tsg_get_event_data_from_id+0x30/0xb0 [] gk20a_tsg_event_id_post_event+0x50/0xc8 [] gk20a_gr_isr+0x27c/0x12e0 To fix this, extract the TSG pointer before handling all the errors and pass this pointer to gk20a_gr_post_bpt_events(), which will post the events if they are enabled and if the TSG is still open. Bug 200404720 Change-Id: I4861c72e338a2cec96f31cb9488af665c5f2be39 Signed-off-by: Deepak Nibade Reviewed-on: https://git-master.nvidia.com/r/1735415 Reviewed-by: svc-mobile-coverity GVS: Gerrit_Virtual_Submit Reviewed-by: Vinod Gopalakrishnakurup Reviewed-by: Alex Waterman Reviewed-by: Terje Bergstrom Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/gk20a/gr_gk20a.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c index b69618ae..d4b31c86 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c @@ -5837,19 +5837,14 @@ static int gk20a_gr_handle_gpc_exception(struct gk20a *g, bool *post_event, return ret; } -static int 
gk20a_gr_post_bpt_events(struct gk20a *g, struct channel_gk20a *ch, +static int gk20a_gr_post_bpt_events(struct gk20a *g, struct tsg_gk20a *tsg, u32 global_esr) { - if (global_esr & gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f()) { - struct tsg_gk20a *tsg = &g->fifo.tsg[ch->tsgid]; - + if (global_esr & gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f()) g->ops.fifo.post_event_id(tsg, NVGPU_EVENT_ID_BPT_INT); - } - if (global_esr & gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f()) { - struct tsg_gk20a *tsg = &g->fifo.tsg[ch->tsgid]; + if (global_esr & gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f()) g->ops.fifo.post_event_id(tsg, NVGPU_EVENT_ID_BPT_PAUSE); - } return 0; } @@ -5864,6 +5859,7 @@ int gk20a_gr_isr(struct gk20a *g) struct channel_gk20a *ch = NULL; struct channel_gk20a *fault_ch = NULL; int tsgid = NVGPU_INVALID_TSG_ID; + struct tsg_gk20a *tsg = NULL; u32 gr_engine_id; u32 global_esr = 0; @@ -5903,6 +5899,9 @@ int gk20a_gr_isr(struct gk20a *g) nvgpu_err(g, "ch id is INVALID 0xffffffff"); } + if (ch && gk20a_is_channel_marked_as_tsg(ch)) + tsg = &g->fifo.tsg[ch->tsgid]; + nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "channel %d: addr 0x%08x, " "data 0x%08x 0x%08x," @@ -6126,8 +6125,8 @@ int gk20a_gr_isr(struct gk20a *g) "unhandled gr interrupt 0x%08x", gr_intr); /* Posting of BPT events should be the last thing in this function */ - if (global_esr && fault_ch) - gk20a_gr_post_bpt_events(g, fault_ch, global_esr); + if (global_esr && tsg) + gk20a_gr_post_bpt_events(g, tsg, global_esr); if (ch) gk20a_channel_put(ch); -- cgit v1.2.2