diff options
author | Deepak Nibade <dnibade@nvidia.com> | 2018-05-30 21:47:44 -0400 |
---|---|---|
committer | Tejal Kudav <tkudav@nvidia.com> | 2018-06-14 09:44:06 -0400 |
commit | 4252e00aa6f2a82ecf608f86d8057bac8cc97e15 (patch) | |
tree | 0952089c4517916edfd962d82ab034e993f3f5c8 /drivers/gpu/nvgpu | |
parent | 7aa928fa07066b1b9ac6ffb2edf0b473f10a2518 (diff) |
gpu: nvgpu: fix crash due to accessing incorrect TSG pointer
In gk20a_gr_isr(), we handle various errors including GPC/TPC errors.
Then, if BPT errors are pending, we call gk20a_gr_post_bpt_events() at the
end and pass the channel pointer to it.
gk20a_gr_post_bpt_events() extracts the TSG pointer based on ch->tsgid.
But in some race conditions it is possible that we clear the error and trigger
recovery, and as a result the channel is unbound from the TSG and closed by
user space before gk20a_gr_post_bpt_events() is called.
In that case the code above ends up with an incorrect TSG pointer and
hence crashes as shown below:
Unable to handle kernel paging request at virtual address ffffff8012000c08
...
[<ffffff8008081f84>] el1_da+0x24/0xb4
[<ffffff80086e72e0>] gk20a_tsg_get_event_data_from_id+0x30/0xb0
[<ffffff80086e7560>] gk20a_tsg_event_id_post_event+0x50/0xc8
[<ffffff800872922c>] gk20a_gr_isr+0x27c/0x12e0
To fix this, extract the TSG pointer before handling all the errors and pass
this pointer to gk20a_gr_post_bpt_events(), which will post the events if they
are enabled and if the TSG is still open.
Bug 200404720
Change-Id: I4861c72e338a2cec96f31cb9488af665c5f2be39
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1735415
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Vinod Gopalakrishnakurup <vinodg@nvidia.com>
Reviewed-by: Alex Waterman <alexw@nvidia.com>
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu')
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/gr_gk20a.c | 19 |
1 files changed, 9 insertions, 10 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c index b69618ae..d4b31c86 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c | |||
@@ -5837,19 +5837,14 @@ static int gk20a_gr_handle_gpc_exception(struct gk20a *g, bool *post_event, | |||
5837 | return ret; | 5837 | return ret; |
5838 | } | 5838 | } |
5839 | 5839 | ||
5840 | static int gk20a_gr_post_bpt_events(struct gk20a *g, struct channel_gk20a *ch, | 5840 | static int gk20a_gr_post_bpt_events(struct gk20a *g, struct tsg_gk20a *tsg, |
5841 | u32 global_esr) | 5841 | u32 global_esr) |
5842 | { | 5842 | { |
5843 | if (global_esr & gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f()) { | 5843 | if (global_esr & gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f()) |
5844 | struct tsg_gk20a *tsg = &g->fifo.tsg[ch->tsgid]; | ||
5845 | |||
5846 | g->ops.fifo.post_event_id(tsg, NVGPU_EVENT_ID_BPT_INT); | 5844 | g->ops.fifo.post_event_id(tsg, NVGPU_EVENT_ID_BPT_INT); |
5847 | } | ||
5848 | if (global_esr & gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f()) { | ||
5849 | struct tsg_gk20a *tsg = &g->fifo.tsg[ch->tsgid]; | ||
5850 | 5845 | ||
5846 | if (global_esr & gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f()) | ||
5851 | g->ops.fifo.post_event_id(tsg, NVGPU_EVENT_ID_BPT_PAUSE); | 5847 | g->ops.fifo.post_event_id(tsg, NVGPU_EVENT_ID_BPT_PAUSE); |
5852 | } | ||
5853 | 5848 | ||
5854 | return 0; | 5849 | return 0; |
5855 | } | 5850 | } |
@@ -5864,6 +5859,7 @@ int gk20a_gr_isr(struct gk20a *g) | |||
5864 | struct channel_gk20a *ch = NULL; | 5859 | struct channel_gk20a *ch = NULL; |
5865 | struct channel_gk20a *fault_ch = NULL; | 5860 | struct channel_gk20a *fault_ch = NULL; |
5866 | int tsgid = NVGPU_INVALID_TSG_ID; | 5861 | int tsgid = NVGPU_INVALID_TSG_ID; |
5862 | struct tsg_gk20a *tsg = NULL; | ||
5867 | u32 gr_engine_id; | 5863 | u32 gr_engine_id; |
5868 | u32 global_esr = 0; | 5864 | u32 global_esr = 0; |
5869 | 5865 | ||
@@ -5903,6 +5899,9 @@ int gk20a_gr_isr(struct gk20a *g) | |||
5903 | nvgpu_err(g, "ch id is INVALID 0xffffffff"); | 5899 | nvgpu_err(g, "ch id is INVALID 0xffffffff"); |
5904 | } | 5900 | } |
5905 | 5901 | ||
5902 | if (ch && gk20a_is_channel_marked_as_tsg(ch)) | ||
5903 | tsg = &g->fifo.tsg[ch->tsgid]; | ||
5904 | |||
5906 | nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, | 5905 | nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, |
5907 | "channel %d: addr 0x%08x, " | 5906 | "channel %d: addr 0x%08x, " |
5908 | "data 0x%08x 0x%08x," | 5907 | "data 0x%08x 0x%08x," |
@@ -6126,8 +6125,8 @@ int gk20a_gr_isr(struct gk20a *g) | |||
6126 | "unhandled gr interrupt 0x%08x", gr_intr); | 6125 | "unhandled gr interrupt 0x%08x", gr_intr); |
6127 | 6126 | ||
6128 | /* Posting of BPT events should be the last thing in this function */ | 6127 | /* Posting of BPT events should be the last thing in this function */ |
6129 | if (global_esr && fault_ch) | 6128 | if (global_esr && tsg) |
6130 | gk20a_gr_post_bpt_events(g, fault_ch, global_esr); | 6129 | gk20a_gr_post_bpt_events(g, tsg, global_esr); |
6131 | 6130 | ||
6132 | if (ch) | 6131 | if (ch) |
6133 | gk20a_channel_put(ch); | 6132 | gk20a_channel_put(ch); |