From 45ca7cb8c5774cfc15015973b1883faa1d93b9e6 Mon Sep 17 00:00:00 2001 From: Lakshmanan M Date: Fri, 19 May 2017 15:40:41 +0530 Subject: gpu: nvgpu: gv11b: Add GCC L1.5 parity support Add handling of GCC L1.5 parity exception. JIRA GPUT19X-86 Change-Id: Ie83fc306d3dff79b0ddaf2616dcf0ff71fccd4ca Signed-off-by: Lakshmanan M Reviewed-on: http://git-master/r/1485834 Reviewed-by: Terje Bergstrom Tested-by: Terje Bergstrom --- drivers/gpu/nvgpu/gv11b/gr_gv11b.c | 82 +++++++++++++++++++++- drivers/gpu/nvgpu/gv11b/gr_gv11b.h | 2 + drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c | 24 +++++++ .../gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h | 64 +++++++++++++++++ 4 files changed, 171 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/nvgpu') diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c index 0c0b4261..014ba537 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c @@ -556,6 +556,84 @@ static int gr_gv11b_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, return ret; } +static int gr_gv11b_handle_gcc_exception(struct gk20a *g, u32 gpc, u32 tpc, + bool *post_event, struct channel_gk20a *fault_ch, + u32 *hww_global_esr) +{ + u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); + u32 offset = gpc_stride * gpc; + u32 gcc_l15_ecc_status, gcc_l15_ecc_corrected_err_status = 0; + u32 gcc_l15_ecc_uncorrected_err_status = 0; + u32 gcc_l15_corrected_err_count_delta = 0; + u32 gcc_l15_uncorrected_err_count_delta = 0; + bool is_gcc_l15_ecc_corrected_total_err_overflow = 0; + bool is_gcc_l15_ecc_uncorrected_total_err_overflow = 0; + + /* Check for gcc l15 ECC errors. */ + gcc_l15_ecc_status = gk20a_readl(g, + gr_pri_gpc0_gcc_l15_ecc_status_r() + offset); + gcc_l15_ecc_corrected_err_status = gcc_l15_ecc_status & + (gr_pri_gpc0_gcc_l15_ecc_status_corrected_err_bank0_m() | + gr_pri_gpc0_gcc_l15_ecc_status_corrected_err_bank1_m()); + gcc_l15_ecc_uncorrected_err_status = gcc_l15_ecc_status & + (gr_pri_gpc0_gcc_l15_ecc_status_uncorrected_err_bank0_m() | + gr_pri_gpc0_gcc_l15_ecc_status_uncorrected_err_bank1_m()); + + if ((gcc_l15_ecc_corrected_err_status == 0) && (gcc_l15_ecc_uncorrected_err_status == 0)) + return 0; + + gcc_l15_corrected_err_count_delta = + gr_pri_gpc0_gcc_l15_ecc_corrected_err_count_total_v( + gk20a_readl(g, + gr_pri_gpc0_gcc_l15_ecc_corrected_err_count_r() + + offset)); + gcc_l15_uncorrected_err_count_delta = + gr_pri_gpc0_gcc_l15_ecc_uncorrected_err_count_total_v( + gk20a_readl(g, + gr_pri_gpc0_gcc_l15_ecc_uncorrected_err_count_r() + + offset)); + is_gcc_l15_ecc_corrected_total_err_overflow = + gr_pri_gpc0_gcc_l15_ecc_status_corrected_err_total_counter_overflow_v(gcc_l15_ecc_status); + is_gcc_l15_ecc_uncorrected_total_err_overflow = + gr_pri_gpc0_gcc_l15_ecc_status_uncorrected_err_total_counter_overflow_v(gcc_l15_ecc_status); + + if ((gcc_l15_corrected_err_count_delta > 0) || is_gcc_l15_ecc_corrected_total_err_overflow) { + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, + "corrected error (SBE) detected in GCC L1.5! err_mask [%08x] is_overf [%d]", + gcc_l15_ecc_corrected_err_status, is_gcc_l15_ecc_corrected_total_err_overflow); + + /* HW uses 16-bits counter */ + gcc_l15_corrected_err_count_delta += + (is_gcc_l15_ecc_corrected_total_err_overflow << + gr_pri_gpc0_gcc_l15_ecc_corrected_err_count_total_s()); + g->gr.t19x.ecc_stats.gcc_l15_corrected_err_count.counters[gpc] += + gcc_l15_corrected_err_count_delta; + gk20a_writel(g, + gr_pri_gpc0_gcc_l15_ecc_corrected_err_count_r() + offset, + 0); + } + if ((gcc_l15_uncorrected_err_count_delta > 0) || is_gcc_l15_ecc_uncorrected_total_err_overflow) { + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, + "Uncorrected error (DBE) detected in GCC L1.5! err_mask [%08x] is_overf [%d]", + gcc_l15_ecc_uncorrected_err_status, is_gcc_l15_ecc_uncorrected_total_err_overflow); + + /* HW uses 16-bits counter */ + gcc_l15_uncorrected_err_count_delta += + (is_gcc_l15_ecc_uncorrected_total_err_overflow << + gr_pri_gpc0_gcc_l15_ecc_uncorrected_err_count_total_s()); + g->gr.t19x.ecc_stats.gcc_l15_uncorrected_err_count.counters[gpc] += + gcc_l15_uncorrected_err_count_delta; + gk20a_writel(g, + gr_pri_gpc0_gcc_l15_ecc_uncorrected_err_count_r() + offset, + 0); + } + + gk20a_writel(g, gr_pri_gpc0_gcc_l15_ecc_status_r() + offset, + gr_pri_gpc0_gcc_l15_ecc_status_reset_task_f()); + + return 0; +} + static void gr_gv11b_enable_gpc_exceptions(struct gk20a *g) { struct gr_gk20a *gr = &g->gr; @@ -567,7 +645,8 @@ static void gr_gv11b_enable_gpc_exceptions(struct gk20a *g) tpc_mask = gr_gpcs_gpccs_gpc_exception_en_tpc_f((1 << gr->tpc_count) - 1); - gk20a_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(), tpc_mask); + gk20a_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(), + (tpc_mask | gr_gpcs_gpccs_gpc_exception_en_gcc_f(1))); } static int gr_gv11b_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc, @@ -2113,6 +2192,7 @@ void gv11b_init_gr(struct gpu_ops *gops) gops->gr.set_gpc_tpc_mask = gr_gv11b_set_gpc_tpc_mask; gops->gr.get_access_map = gr_gv11b_get_access_map; gops->gr.handle_sm_exception = gr_gv11b_handle_sm_exception; + gops->gr.handle_gcc_exception = gr_gv11b_handle_gcc_exception; gops->gr.handle_tex_exception = gr_gv11b_handle_tex_exception; gops->gr.enable_gpc_exceptions = gr_gv11b_enable_gpc_exceptions; gops->gr.mask_hww_warp_esr = gv11b_mask_hww_warp_esr; diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h index 5bcbe667..cf3842b6 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h @@ -45,6 +45,8 @@ struct gr_t19x { struct gr_gp10b_ecc_stat sm_l1_data_uncorrected_err_count; struct gr_gp10b_ecc_stat sm_icache_corrected_err_count; struct gr_gp10b_ecc_stat sm_icache_uncorrected_err_count; + struct gr_gp10b_ecc_stat gcc_l15_corrected_err_count; + struct gr_gp10b_ecc_stat gcc_l15_uncorrected_err_count; } ecc_stats; }; diff --git a/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c b/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c index 009e5716..39ae68eb 100644 --- a/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c +++ b/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c @@ -131,6 +131,8 @@ static struct device_attribute *dev_attr_sm_l1_data_ecc_corrected_err_count_arra static struct device_attribute *dev_attr_sm_l1_data_ecc_uncorrected_err_count_array; static struct device_attribute *dev_attr_sm_icache_ecc_corrected_err_count_array; static struct device_attribute *dev_attr_sm_icache_ecc_uncorrected_err_count_array; +static struct device_attribute *dev_attr_gcc_l15_ecc_corrected_err_count_array; +static struct device_attribute *dev_attr_gcc_l15_ecc_uncorrected_err_count_array; void gr_gv11b_create_sysfs(struct device *dev) { @@ -193,6 +195,18 @@ void gr_gv11b_create_sysfs(struct device *dev) &g->gr.t19x.ecc_stats.sm_icache_uncorrected_err_count, dev_attr_sm_icache_ecc_uncorrected_err_count_array); + error |= gr_gp10b_ecc_stat_create(dev, + 0, + "gcc_l15_ecc_corrected_err_count", + &g->gr.t19x.ecc_stats.gcc_l15_corrected_err_count, + dev_attr_gcc_l15_ecc_corrected_err_count_array); + + error |= gr_gp10b_ecc_stat_create(dev, + 0, + "gcc_l15_ecc_uncorrected_err_count", + &g->gr.t19x.ecc_stats.gcc_l15_uncorrected_err_count, + dev_attr_gcc_l15_ecc_uncorrected_err_count_array); + if (error) dev_err(dev, "Failed to create gv11b sysfs attributes!\n"); } @@ -241,4 +255,14 @@ static void gr_gv11b_remove_sysfs(struct device *dev) &g->gr.t19x.ecc_stats.sm_icache_uncorrected_err_count, dev_attr_sm_icache_ecc_uncorrected_err_count_array); + gr_gp10b_ecc_stat_remove(dev, + 0, + &g->gr.t19x.ecc_stats.gcc_l15_corrected_err_count, + dev_attr_gcc_l15_ecc_corrected_err_count_array); + + gr_gp10b_ecc_stat_remove(dev, + 0, + &g->gr.t19x.ecc_stats.gcc_l15_uncorrected_err_count, + dev_attr_gcc_l15_ecc_uncorrected_err_count_array); + } diff --git a/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h b/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h index 4ce69743..6f38cf5b 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h +++ b/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h @@ -3370,6 +3370,10 @@ static inline u32 gr_gpcs_gpccs_gpc_exception_en_r(void) { return 0x0041ac94; } +static inline u32 gr_gpcs_gpccs_gpc_exception_en_gcc_f(u32 v) +{ + return (v & 0x1) << 2; +} static inline u32 gr_gpcs_gpccs_gpc_exception_en_tpc_f(u32 v) { return (v & 0xff) << 16; @@ -3378,6 +3382,10 @@ static inline u32 gr_gpc0_gpccs_gpc_exception_r(void) { return 0x00502c90; } +static inline u32 gr_gpc0_gpccs_gpc_exception_gcc_v(u32 r) +{ + return (r >> 2) & 0x1; +} static inline u32 gr_gpc0_gpccs_gpc_exception_tpc_v(u32 r) { return (r >> 16) & 0xff; @@ -3386,6 +3394,62 @@ static inline u32 gr_gpc0_gpccs_gpc_exception_tpc_0_pending_v(void) { return 0x00000001; } +static inline u32 gr_pri_gpc0_gcc_l15_ecc_status_r(void) +{ + return 0x00501048; +} +static inline u32 gr_pri_gpc0_gcc_l15_ecc_status_corrected_err_bank0_m(void) +{ + return 0x1 << 0; +} +static inline u32 gr_pri_gpc0_gcc_l15_ecc_status_corrected_err_bank1_m(void) +{ + return 0x1 << 1; +} +static inline u32 gr_pri_gpc0_gcc_l15_ecc_status_uncorrected_err_bank0_m(void) +{ + return 0x1 << 4; +} +static inline u32 gr_pri_gpc0_gcc_l15_ecc_status_uncorrected_err_bank1_m(void) +{ + return 0x1 << 5; +} +static inline u32 gr_pri_gpc0_gcc_l15_ecc_status_corrected_err_total_counter_overflow_v(u32 r) +{ + return (r >> 8) & 0x1; +} +static inline u32 gr_pri_gpc0_gcc_l15_ecc_status_uncorrected_err_total_counter_overflow_v(u32 r) +{ + return (r >> 10) & 0x1; +} +static inline u32 gr_pri_gpc0_gcc_l15_ecc_status_reset_task_f(void) +{ + return 0x40000000; +} +static inline u32 gr_pri_gpc0_gcc_l15_ecc_corrected_err_count_r(void) +{ + return 0x0050104c; +} +static inline u32 gr_pri_gpc0_gcc_l15_ecc_corrected_err_count_total_s(void) +{ + return 16; +} +static inline u32 gr_pri_gpc0_gcc_l15_ecc_corrected_err_count_total_v(u32 r) +{ + return (r >> 0) & 0xffff; +} +static inline u32 gr_pri_gpc0_gcc_l15_ecc_uncorrected_err_count_r(void) +{ + return 0x00501054; +} +static inline u32 gr_pri_gpc0_gcc_l15_ecc_uncorrected_err_count_total_s(void) +{ + return 16; +} +static inline u32 gr_pri_gpc0_gcc_l15_ecc_uncorrected_err_count_total_v(u32 r) +{ + return (r >> 0) & 0xffff; +} static inline u32 gr_gpc0_tpc0_tpccs_tpc_exception_r(void) { return 0x00504508; -- cgit v1.2.2