From 89e0745fa024891b988508c3baa20c453230a80b Mon Sep 17 00:00:00 2001
From: Deepak Nibade
Date: Mon, 2 Apr 2018 19:10:42 +0530
Subject: gpu: nvgpu: handle misaligned_addr SM exception

We right now do not handle misaligned_addr SM exception explicitly and hence
we incorrectly initiate CILP on this exception

Handle this exception explicitly in this sequence -
- set error notifier first
- clear the interrupt
- return error from gr_gv11b_handle_warp_esr_error_misaligned_addr() so that
  RC recovery is triggered by gk20a_gr_isr()

Ensure that the error value is propagated back to gk20a_gr_isr() correctly

Use nvgpu_set_error_notifier_if_empty() to set error notifier since this will
prevent overwriting of error notifier value in case gk20a_gr_isr() also tries
to write to some error notifier value

Bug 200388475
Jira NVGPU-554

Change-Id: I84c4d202a8068e738567ccd344e05d9d5f6ad2f0
Signed-off-by: Deepak Nibade
Reviewed-on: https://git-master.nvidia.com/r/1686781
Reviewed-by: mobile promotions
Tested-by: mobile promotions
---
 drivers/gpu/nvgpu/gk20a/gr_gk20a.c                 | 14 ++++----
 drivers/gpu/nvgpu/gv100/hal_gv100.c                |  2 +-
 drivers/gpu/nvgpu/gv11b/gr_gv11b.c                 | 40 ++++++++++++++++++++++
 drivers/gpu/nvgpu/gv11b/hal_gv11b.c                |  2 +-
 .../gpu/nvgpu/include/nvgpu/hw/gv100/hw_gr_gv100.h |  4 +++
 .../gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h |  4 +++
 6 files changed, 57 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index c6a58fec..680b1637 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -5740,7 +5740,7 @@ static int gk20a_gr_handle_tpc_exception(struct gk20a *g, u32 gpc, u32 tpc,
 				"GPC%d TPC%d: SM%d exception pending",
 				gpc, tpc, sm);
 
-			ret = g->ops.gr.handle_sm_exception(g,
+			ret |= g->ops.gr.handle_sm_exception(g,
 					gpc, tpc, sm, post_event, fault_ch,
 					hww_global_esr);
 			/* clear the hwws, also causes tpc and gpc
@@ -5759,11 +5759,11 @@ static int gk20a_gr_handle_tpc_exception(struct gk20a *g, u32 gpc, u32 tpc,
 			gr_gpc0_tpc0_tpccs_tpc_exception_tex_pending_v()) {
 		gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
 			"GPC%d TPC%d: TEX exception pending", gpc, tpc);
-		ret = g->ops.gr.handle_tex_exception(g, gpc, tpc, post_event);
+		ret |= g->ops.gr.handle_tex_exception(g, gpc, tpc, post_event);
 	}
 
 	if (g->ops.gr.handle_tpc_mpc_exception)
-		ret = g->ops.gr.handle_tpc_mpc_exception(g,
+		ret |= g->ops.gr.handle_tpc_mpc_exception(g,
 				gpc, tpc, post_event);
 
 	return ret;
@@ -5801,7 +5801,7 @@ static int gk20a_gr_handle_gpc_exception(struct gk20a *g, bool *post_event,
 				gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
 					"GPC%d: TPC%d exception pending", gpc, tpc);
 
-				ret = gk20a_gr_handle_tpc_exception(g, gpc, tpc,
+				ret |= gk20a_gr_handle_tpc_exception(g, gpc, tpc,
 					post_event, fault_ch, hww_global_esr);
 
 			}
@@ -5812,7 +5812,7 @@ static int gk20a_gr_handle_gpc_exception(struct gk20a *g, bool *post_event,
 			int gcc_ret = 0;
 			gcc_ret = g->ops.gr.handle_gcc_exception(g, gpc, tpc,
 					post_event, fault_ch, hww_global_esr);
-			ret = ret ? ret : gcc_ret;
+			ret |= ret ? ret : gcc_ret;
 		}
 
 		/* Handle GPCCS exceptions */
@@ -5820,7 +5820,7 @@ static int gk20a_gr_handle_gpc_exception(struct gk20a *g, bool *post_event,
 			int ret_ecc = 0;
 			ret_ecc = g->ops.gr.handle_gpc_gpccs_exception(g, gpc,
 					gpc_exception);
-			ret = ret ? ret : ret_ecc;
+			ret |= ret ? ret : ret_ecc;
 		}
 
 		/* Handle GPCMMU exceptions */
@@ -5829,7 +5829,7 @@ static int gk20a_gr_handle_gpc_exception(struct gk20a *g, bool *post_event,
 
 			ret_mmu = g->ops.gr.handle_gpc_gpcmmu_exception(g,
 					gpc, gpc_exception);
-			ret = ret ? ret : ret_mmu;
+			ret |= ret ? ret : ret_mmu;
 		}
 
 	}
diff --git a/drivers/gpu/nvgpu/gv100/hal_gv100.c b/drivers/gpu/nvgpu/gv100/hal_gv100.c
index f0187dab..b38260a5 100644
--- a/drivers/gpu/nvgpu/gv100/hal_gv100.c
+++ b/drivers/gpu/nvgpu/gv100/hal_gv100.c
@@ -517,7 +517,7 @@ static const struct gpu_ops gv100_ops = {
 		.check_ch_ctxsw_timeout = gk20a_fifo_check_ch_ctxsw_timeout,
 		.channel_suspend = gk20a_channel_suspend,
 		.channel_resume = gk20a_channel_resume,
-		.set_error_notifier = nvgpu_set_error_notifier,
+		.set_error_notifier = nvgpu_set_error_notifier_if_empty,
 		.setup_sw = gk20a_init_fifo_setup_sw,
 #ifdef CONFIG_TEGRA_GK20A_NVHOST
 		.alloc_syncpt_buf = gv11b_fifo_alloc_syncpt_buf,
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
index 7f6d1906..c43c6e83 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -30,6 +30,7 @@
 #include
 #include
 #include
+#include
 
 #include "gk20a/gk20a.h"
 #include "gk20a/gr_gk20a.h"
@@ -2090,6 +2091,41 @@ static int gr_gv11b_handle_warp_esr_error_mmu_nack(struct gk20a *g,
 	return 0;
 }
 
+static int gr_gv11b_handle_warp_esr_error_misaligned_addr(struct gk20a *g,
+	u32 gpc, u32 tpc, u32 sm,
+	u32 warp_esr,
+	struct channel_gk20a *fault_ch)
+{
+	struct tsg_gk20a *tsg;
+	u32 offset;
+	struct channel_gk20a *ch_tsg;
+
+	if (fault_ch) {
+		tsg = &g->fifo.tsg[fault_ch->tsgid];
+
+		nvgpu_rwsem_down_read(&tsg->ch_list_lock);
+		nvgpu_list_for_each_entry(ch_tsg, &tsg->ch_list,
+				channel_gk20a, ch_entry) {
+			if (gk20a_channel_get(ch_tsg)) {
+				g->ops.fifo.set_error_notifier(ch_tsg,
+					NVGPU_ERR_NOTIFIER_GR_EXCEPTION);
+				gk20a_channel_put(ch_tsg);
+			}
+		}
+		nvgpu_rwsem_up_read(&tsg->ch_list_lock);
+	}
+
+	/* clear interrupt */
+	offset = gk20a_gr_gpc_offset(g, gpc) +
+			gk20a_gr_tpc_offset(g, tpc) +
+			gv11b_gr_sm_offset(g, sm);
+	nvgpu_writel(g,
+		gr_gpc0_tpc0_sm0_hww_warp_esr_r() + offset, 0);
+
+	/* return error so that recovery is triggered by gk20a_gr_isr() */
+	return -EFAULT;
+}
+
 /* @brief pre-process work on the SM exceptions to determine if we clear them or not.
  *
  * On Pascal, if we are in CILP preemtion mode, preempt the channel and handle errors with special processing
@@ -2118,6 +2154,10 @@ int gr_gv11b_pre_process_sm_exception(struct gk20a *g,
 		return gr_gv11b_handle_warp_esr_error_mmu_nack(g, gpc, tpc, sm,
 				warp_esr, fault_ch);
 
+	if (warp_esr & gr_gpc0_tpc0_sm0_hww_warp_esr_error_misaligned_addr_f())
+		return gr_gv11b_handle_warp_esr_error_misaligned_addr(g, gpc, tpc, sm,
+				warp_esr, fault_ch);
+
 	if (fault_ch) {
 		tsg = tsg_gk20a_from_ch(fault_ch);
 		if (!tsg)
diff --git a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
index 2d6dc9b0..dd4bd55a 100644
--- a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
@@ -534,7 +534,7 @@ static const struct gpu_ops gv11b_ops = {
 		.check_ch_ctxsw_timeout = gk20a_fifo_check_ch_ctxsw_timeout,
 		.channel_suspend = gk20a_channel_suspend,
 		.channel_resume = gk20a_channel_resume,
-		.set_error_notifier = nvgpu_set_error_notifier,
+		.set_error_notifier = nvgpu_set_error_notifier_if_empty,
 		.setup_sw = gk20a_init_fifo_setup_sw,
 #ifdef CONFIG_TEGRA_GK20A_NVHOST
 		.alloc_syncpt_buf = gv11b_fifo_alloc_syncpt_buf,
diff --git a/drivers/gpu/nvgpu/include/nvgpu/hw/gv100/hw_gr_gv100.h b/drivers/gpu/nvgpu/include/nvgpu/hw/gv100/hw_gr_gv100.h
index 8e475895..f5f09cdf 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/hw/gv100/hw_gr_gv100.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/hw/gv100/hw_gr_gv100.h
@@ -3632,6 +3632,10 @@ static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_none_v(void)
 {
 	return 0x00000000U;
 }
+static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_misaligned_addr_f(void)
+{
+	return 0xfU;
+}
 static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_mmu_nack_f(void)
 {
 	return 0x20U;
diff --git a/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h b/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h
index 4458265d..f7968089 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h
@@ -4392,6 +4392,10 @@ static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_none_f(void)
 {
 	return 0x0U;
 }
+static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_misaligned_addr_f(void)
+{
+	return 0xfU;
+}
 static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_mmu_nack_f(void)
 {
 	return 0x20U;
-- 
cgit v1.2.2