From 7aded206bc3eb0f36422e9f6f3dab3e065e7e7e4 Mon Sep 17 00:00:00 2001 From: Vinod G Date: Fri, 25 May 2018 15:44:34 -0700 Subject: gpu: nvgpu: gv11b: Handle all SM errors Add the missing register bits to identify the SM errors. Except for mmu_nack error, all other errors are handled using a single function. That function sets the error notifier with GR_EXCEPTION, clears interrupt and triggers recovery process. bug 200402677 JIRA NVGPU-573 Change-Id: Icfaff1f20f1f35adb4cd35ce288ce694845aed3c Signed-off-by: Vinod G Reviewed-on: https://git-master.nvidia.com/r/1730963 Reviewed-by: Seshendra Gadagottu Reviewed-by: svc-mobile-coverity GVS: Gerrit_Virtual_Submit Reviewed-by: Deepak Nibade Reviewed-by: Terje Bergstrom Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/gv11b/gr_gv11b.c | 105 ++++++++++++++++++--- .../gpu/nvgpu/include/nvgpu/hw/gv100/hw_gr_gv100.h | 76 ++++++++++++++- .../gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h | 68 +++++++++++++ 3 files changed, 234 insertions(+), 15 deletions(-) (limited to 'drivers/gpu/nvgpu') diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c index 84699db7..378bdc13 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c @@ -2089,7 +2089,7 @@ void gr_gv11b_get_access_map(struct gk20a *g, static int gr_gv11b_handle_warp_esr_error_mmu_nack(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, - u32 warp_esr, + u32 warp_esr_error, struct channel_gk20a *fault_ch) { struct tsg_gk20a *tsg; @@ -2117,17 +2117,92 @@ static int gr_gv11b_handle_warp_esr_error_mmu_nack(struct gk20a *g, nvgpu_writel(g, gr_gpc0_tpc0_sm0_hww_warp_esr_r() + offset, 0); + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, + "ESR %s(0x%x)", + "MMU NACK ERROR", + warp_esr_error); return 0; } -static int gr_gv11b_handle_warp_esr_error_misaligned_addr(struct gk20a *g, - u32 gpc, u32 tpc, u32 sm, - u32 warp_esr, - struct channel_gk20a *fault_ch) +static bool gr_gv11b_check_warp_esr_error(struct gk20a *g, u32 warp_esr_error) +{ + u32 index = 0U; + u32 esr_err = gr_gpc0_tpc0_sm0_hww_warp_esr_error_none_f(); + + struct warp_esr_error_table_s { + u32 error_value; + const char *error_name; + }; + + struct warp_esr_error_table_s warp_esr_error_table[] = { + { gr_gpc0_tpc0_sm0_hww_warp_esr_error_stack_error_f(), + "STACK ERROR"}, + { gr_gpc0_tpc0_sm0_hww_warp_esr_error_api_stack_error_f(), + "API STACK ERROR"}, + { gr_gpc0_tpc0_sm0_hww_warp_esr_error_pc_wrap_f(), + "PC WRAP ERROR"}, + { gr_gpc0_tpc0_sm0_hww_warp_esr_error_misaligned_pc_f(), + "MISALIGNED PC ERROR"}, + { gr_gpc0_tpc0_sm0_hww_warp_esr_error_pc_overflow_f(), + "PC OVERFLOW ERROR"}, + { gr_gpc0_tpc0_sm0_hww_warp_esr_error_misaligned_reg_f(), + "MISALIGNED REG ERROR"}, + { gr_gpc0_tpc0_sm0_hww_warp_esr_error_illegal_instr_encoding_f(), + "ILLEGAL INSTRUCTION ENCODING ERROR"}, + { gr_gpc0_tpc0_sm0_hww_warp_esr_error_illegal_instr_param_f(), + "ILLEGAL INSTRUCTION PARAM ERROR"}, + { gr_gpc0_tpc0_sm0_hww_warp_esr_error_oor_reg_f(), + "OOR REG ERROR"}, + { gr_gpc0_tpc0_sm0_hww_warp_esr_error_oor_addr_f(), + "OOR ADDR ERROR"}, + { gr_gpc0_tpc0_sm0_hww_warp_esr_error_misaligned_addr_f(), + "MISALIGNED ADDR ERROR"}, + { gr_gpc0_tpc0_sm0_hww_warp_esr_error_invalid_addr_space_f(), + "INVALID ADDR SPACE ERROR"}, + { gr_gpc0_tpc0_sm0_hww_warp_esr_error_invalid_const_addr_ldc_f(), + "INVALID ADDR LDC ERROR"}, + { gr_gpc0_tpc0_sm0_hww_warp_esr_error_stack_overflow_f(), + "STACK OVERFLOW ERROR"}, + { gr_gpc0_tpc0_sm0_hww_warp_esr_error_mmu_fault_f(), + "MMU FAULT ERROR"}, + { gr_gpc0_tpc0_sm0_hww_warp_esr_error_tex_format_f(), + "TEX FORMAT ERROR"}, + { gr_gpc0_tpc0_sm0_hww_warp_esr_error_tex_layout_f(), + "TEX LAYOUT ERROR"}, + }; + + for (index = 0; index < ARRAY_SIZE(warp_esr_error_table); index++) { + if (warp_esr_error_table[index].error_value == warp_esr_error) { + esr_err = warp_esr_error_table[index].error_value; + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, + "ESR %s(0x%x)", + warp_esr_error_table[index].error_name, + esr_err); + break; + } + } + + return (esr_err == 0U) ? false : true; +} +static int gr_gv11b_handle_all_warp_esr_errors(struct gk20a *g, + u32 gpc, u32 tpc, u32 sm, + u32 warp_esr_error, + struct channel_gk20a *fault_ch) { struct tsg_gk20a *tsg; - u32 offset; struct channel_gk20a *ch_tsg; + u32 offset = 0U; + bool is_esr_error = false; + + /* + * Check for an esr error + */ + is_esr_error = gr_gv11b_check_warp_esr_error(g, warp_esr_error); + if (!is_esr_error) { + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, + "No ESR error, Skip RC recovery and Trigeer CILP"); + return 0; + } if (fault_ch) { tsg = &g->fifo.tsg[fault_ch->tsgid]; @@ -2170,8 +2245,10 @@ int gr_gv11b_pre_process_sm_exception(struct gk20a *g, u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc) + gv11b_gr_sm_offset(g, sm); + u32 warp_esr_error = gr_gpc0_tpc0_sm0_hww_warp_esr_error_v(warp_esr); struct tsg_gk20a *tsg; + *early_exit = false; *ignore_debugger = false; @@ -2179,13 +2256,19 @@ int gr_gv11b_pre_process_sm_exception(struct gk20a *g, * We don't need to trigger CILP in case of MMU_NACK * So just handle MMU_NACK and return */ - if (warp_esr & gr_gpc0_tpc0_sm0_hww_warp_esr_error_mmu_nack_f()) + if (warp_esr_error == gr_gpc0_tpc0_sm0_hww_warp_esr_error_mmu_nack_f()) return gr_gv11b_handle_warp_esr_error_mmu_nack(g, gpc, tpc, sm, - warp_esr, fault_ch); + warp_esr_error, fault_ch); - if (warp_esr & gr_gpc0_tpc0_sm0_hww_warp_esr_error_misaligned_addr_f()) - return gr_gv11b_handle_warp_esr_error_misaligned_addr(g, gpc, tpc, sm, - warp_esr, fault_ch); + /* + * Proceed to trigger CILP preemption if the return value + * from this function is zero, else proceed to recovery + */ + ret = gr_gv11b_handle_all_warp_esr_errors(g, gpc, tpc, sm, + warp_esr_error, fault_ch); + if (ret) { + return ret; + } if (fault_ch) { tsg = tsg_gk20a_from_ch(fault_ch); diff --git a/drivers/gpu/nvgpu/include/nvgpu/hw/gv100/hw_gr_gv100.h b/drivers/gpu/nvgpu/include/nvgpu/hw/gv100/hw_gr_gv100.h index 29fd9a6f..0f83d6ba 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/hw/gv100/hw_gr_gv100.h +++ b/drivers/gpu/nvgpu/include/nvgpu/hw/gv100/hw_gr_gv100.h @@ -3632,17 +3632,81 @@ static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_none_v(void) { return 0x00000000U; } +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_none_f(void) +{ + return 0x0U; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_stack_error_f(void) +{ + return 0x1U; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_api_stack_error_f(void) +{ + return 0x2U; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_pc_wrap_f(void) +{ + return 0x4U; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_misaligned_pc_f(void) +{ + return 0x5U; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_pc_overflow_f(void) +{ + return 0x6U; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_misaligned_reg_f(void) +{ + return 0x8U; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_illegal_instr_encoding_f(void) +{ + return 0x9U; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_illegal_instr_param_f(void) +{ + return 0xbU; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_oor_reg_f(void) +{ + return 0xdU; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_oor_addr_f(void) +{ + return 0xeU; +} static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_misaligned_addr_f(void) { return 0xfU; } -static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_mmu_nack_f(void) +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_invalid_addr_space_f(void) { - return 0x20U; + return 0x10U; } -static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_none_f(void) +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_invalid_const_addr_ldc_f(void) { - return 0x0U; + return 0x12U; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_stack_overflow_f(void) +{ + return 0x16U; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_mmu_fault_f(void) +{ + return 0x17U; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_tex_format_f(void) +{ + return 0x18U; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_tex_layout_f(void) +{ + return 0x19U; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_mmu_nack_f(void) +{ + return 0x20U; } static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_wrap_id_m(void) { @@ -3672,6 +3736,10 @@ static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_pc_r(void) { return 0x00504738U; } +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_pc_hi_r(void) +{ + return 0x0050473cU; +} static inline u32 gr_gpc0_tpc0_sm_halfctl_ctrl_r(void) { return 0x005043a0U; diff --git a/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h b/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h index 17c7e77d..5de691a2 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h +++ b/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h @@ -4392,10 +4392,74 @@ static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_none_f(void) { return 0x0U; } +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_stack_error_f(void) +{ + return 0x1U; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_api_stack_error_f(void) +{ + return 0x2U; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_pc_wrap_f(void) +{ + return 0x4U; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_misaligned_pc_f(void) +{ + return 0x5U; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_pc_overflow_f(void) +{ + return 0x6U; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_misaligned_reg_f(void) +{ + return 0x8U; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_illegal_instr_encoding_f(void) +{ + return 0x9U; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_illegal_instr_param_f(void) +{ + return 0xbU; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_oor_reg_f(void) +{ + return 0xdU; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_oor_addr_f(void) +{ + return 0xeU; +} static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_misaligned_addr_f(void) { return 0xfU; } +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_invalid_addr_space_f(void) +{ + return 0x10U; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_invalid_const_addr_ldc_f(void) +{ + return 0x12U; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_stack_overflow_f(void) +{ + return 0x16U; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_mmu_fault_f(void) +{ + return 0x17U; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_tex_format_f(void) +{ + return 0x18U; +} +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_tex_layout_f(void) +{ + return 0x19U; +} static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_mmu_nack_f(void) { return 0x20U; @@ -4428,6 +4492,10 @@ static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_pc_r(void) { return 0x00504738U; } +static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_pc_hi_r(void) +{ + return 0x0050473cU; +} static inline u32 gr_gpc0_tpc0_sm_halfctl_ctrl_r(void) { return 0x005043a0U; -- cgit v1.2.2