From 345eaef6a76771da9c3e8a5e375fc9d659fb1b2b Mon Sep 17 00:00:00 2001 From: David Nieto Date: Fri, 26 May 2017 08:31:46 -0700 Subject: gpu: nvgpu: GPC MMU ECC support Adding support for GPC MMU ECC error handling JIRA: GPUT19X-112 Change-Id: I62083bf2f144ff628ecd8c0aefc8d227a233ff36 Signed-off-by: David Nieto Reviewed-on: http://git-master/r/1490772 Reviewed-by: svccoveritychecker GVS: Gerrit_Virtual_Submit Reviewed-by: Terje Bergstrom --- drivers/gpu/nvgpu/gv11b/ecc_gv11b.h | 2 + drivers/gpu/nvgpu/gv11b/gr_gv11b.c | 105 +++++++++- drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c | 25 +++ .../gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h | 216 +++++++++++++++++++-- 4 files changed, 332 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/nvgpu/gv11b/ecc_gv11b.h b/drivers/gpu/nvgpu/gv11b/ecc_gv11b.h index 4e1696f7..70b1bab8 100644 --- a/drivers/gpu/nvgpu/gv11b/ecc_gv11b.h +++ b/drivers/gpu/nvgpu/gv11b/ecc_gv11b.h @@ -31,6 +31,8 @@ struct ecc_gr_t19x { struct gk20a_ecc_stat fecs_uncorrected_err_count; struct gk20a_ecc_stat gpccs_corrected_err_count; struct gk20a_ecc_stat gpccs_uncorrected_err_count; + struct gk20a_ecc_stat mmu_l1tlb_corrected_err_count; + struct gk20a_ecc_stat mmu_l1tlb_uncorrected_err_count; }; struct ecc_ltc_t19x { diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c index 8176b807..701b840a 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c @@ -658,16 +658,101 @@ static int gr_gv11b_handle_gcc_exception(struct gk20a *g, u32 gpc, u32 tpc, return 0; } -static int gr_gv11b_handle_gpccs_ecc_exception(struct gk20a *g, u32 gpc, +static int gr_gv11b_handle_gpcmmu_ecc_exception(struct gk20a *g, u32 gpc, u32 exception) { int ret = 0; + u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); + u32 offset = gpc_stride * gpc; u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt; u32 corrected_delta, uncorrected_delta; u32 corrected_overflow, uncorrected_overflow; + int hww_esr; + + hww_esr = gk20a_readl(g, gr_gpc0_mmu_gpcmmu_global_esr_r() + offset); + + if (!(hww_esr & (gr_gpc0_mmu_gpcmmu_global_esr_ecc_corrected_m() | + gr_gpc0_mmu_gpcmmu_global_esr_ecc_uncorrected_m()))) + return ret; + + ecc_status = gk20a_readl(g, + gr_gpc0_mmu_l1tlb_ecc_status_r() + offset); + ecc_addr = gk20a_readl(g, + gr_gpc0_mmu_l1tlb_ecc_address_r() + offset); + corrected_cnt = gk20a_readl(g, + gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_r() + offset); + uncorrected_cnt = gk20a_readl(g, + gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_r() + offset); + + corrected_delta = gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_v( + corrected_cnt); + uncorrected_delta = gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_v( + uncorrected_cnt); + corrected_overflow = ecc_status & + gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_total_counter_overflow_m(); + + uncorrected_overflow = ecc_status & + gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_total_counter_overflow_m(); + + + /* clear the interrupt */ + if ((corrected_delta > 0) || corrected_overflow) + gk20a_writel(g, + gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_r() + + offset, 0); + if ((uncorrected_delta > 0) || uncorrected_overflow) + gk20a_writel(g, + gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_r() + + offset, 0); + + gk20a_writel(g, gr_gpc0_mmu_l1tlb_ecc_status_r() + offset, + gr_gpc0_mmu_l1tlb_ecc_status_reset_task_f()); + + /* Handle overflow */ + if (corrected_overflow) + corrected_delta += (0x1UL << gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_s()); + if (uncorrected_overflow) + uncorrected_delta += (0x1UL << gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_s()); + + g->ecc.gr.t19x.mmu_l1tlb_corrected_err_count.counters[gpc] += + corrected_delta; + g->ecc.gr.t19x.mmu_l1tlb_uncorrected_err_count.counters[gpc] += + uncorrected_delta; + nvgpu_log(g, gpu_dbg_intr, + "mmu l1tlb gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr); + + if (ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m()) + nvgpu_log(g, gpu_dbg_intr, "corrected ecc sa data error"); + if (ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) + nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error"); + if (ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m()) + nvgpu_log(g, gpu_dbg_intr, "corrected ecc fa data error"); + if (ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) + nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc fa data error"); + if (corrected_overflow || uncorrected_overflow) + nvgpu_info(g, "mmu l1tlb ecc counter overflow!"); + + nvgpu_log(g, gpu_dbg_intr, + "ecc error address: 0x%x", ecc_addr); + nvgpu_log(g, gpu_dbg_intr, + "ecc error count corrected: %d, uncorrected %d", + g->ecc.gr.t19x.mmu_l1tlb_corrected_err_count.counters[gpc], + g->ecc.gr.t19x.mmu_l1tlb_uncorrected_err_count.counters[gpc]); + + return ret; +} + +static int gr_gv11b_handle_gpccs_ecc_exception(struct gk20a *g, u32 gpc, + u32 exception) +{ + int ret = 0; + u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); + u32 offset = gpc_stride * gpc; + u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt; + u32 corrected_delta, uncorrected_delta; + u32 corrected_overflow, uncorrected_overflow; int hww_esr; - u32 offset = proj_gpc_stride_v() * gpc; hww_esr = gk20a_readl(g, gr_gpc0_gpccs_hww_esr_r() + offset); @@ -741,6 +826,15 @@ static int gr_gv11b_handle_gpccs_ecc_exception(struct gk20a *g, u32 gpc, return ret; } +static int gr_gv11b_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc, + u32 gpc_exception) +{ + if (gpc_exception & gr_gpc0_gpccs_gpc_exception_gpcmmu_m()) + return gr_gv11b_handle_gpcmmu_ecc_exception(g, gpc, + gpc_exception); + return 0; +} + static int gr_gv11b_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc, u32 gpc_exception) { @@ -764,7 +858,8 @@ static void gr_gv11b_enable_gpc_exceptions(struct gk20a *g) gk20a_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(), (tpc_mask | gr_gpcs_gpccs_gpc_exception_en_gcc_f(1) | - gr_gpcs_gpccs_gpc_exception_en_gpccs_f(1))); + gr_gpcs_gpccs_gpc_exception_en_gpccs_f(1) | + gr_gpcs_gpccs_gpc_exception_en_gpcmmu_f(1))); } static int gr_gv11b_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc, @@ -1810,7 +1905,7 @@ static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr) nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected"); if (corrected_overflow || uncorrected_overflow) - nvgpu_info(g, "gpccs ecc counter overflow!"); + nvgpu_info(g, "fecs ecc counter overflow!"); nvgpu_log(g, gpu_dbg_intr, "ecc error row address: 0x%x", @@ -2422,4 +2517,6 @@ void gv11b_init_gr(struct gpu_ops *gops) gops->gr.handle_gpc_gpccs_exception = gr_gv11b_handle_gpc_gpccs_exception; gops->gr.set_czf_bypass = NULL; + gops->gr.handle_gpc_gpcmmu_exception = + gr_gv11b_handle_gpc_gpcmmu_exception; } diff --git a/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c b/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c index 432af7c1..c69e1478 100644 --- a/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c +++ b/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c @@ -171,6 +171,8 @@ static struct device_attribute *dev_attr_sm_icache_ecc_corrected_err_count_array static struct device_attribute *dev_attr_sm_icache_ecc_uncorrected_err_count_array; static struct device_attribute *dev_attr_gcc_l15_ecc_corrected_err_count_array; static struct device_attribute *dev_attr_gcc_l15_ecc_uncorrected_err_count_array; +static struct device_attribute *dev_attr_mmu_l1tlb_ecc_corrected_err_count_array; +static struct device_attribute *dev_attr_mmu_l1tlb_ecc_uncorrected_err_count_array; static struct device_attribute *dev_attr_fecs_ecc_corrected_err_count_array; static struct device_attribute *dev_attr_fecs_ecc_uncorrected_err_count_array; @@ -295,6 +297,19 @@ void gr_gv11b_create_sysfs(struct device *dev) &g->ecc.gr.t19x.gpccs_corrected_err_count, dev_attr_gpccs_ecc_corrected_err_count_array); + error |= gp10b_ecc_stat_create(dev, + g->gr.gpc_count, + "gpc", + "mmu_l1tlb_ecc_uncorrected_err_count", + &g->ecc.gr.t19x.mmu_l1tlb_uncorrected_err_count, + dev_attr_mmu_l1tlb_ecc_uncorrected_err_count_array); + + error |= gp10b_ecc_stat_create(dev, + g->gr.gpc_count, + "gpc", + "mmu_l1tlb_ecc_corrected_err_count", + &g->ecc.gr.t19x.mmu_l1tlb_corrected_err_count, + dev_attr_mmu_l1tlb_ecc_corrected_err_count_array); if (error) dev_err(dev, "Failed to create gv11b sysfs attributes!\n"); } @@ -382,4 +397,14 @@ static void gr_gv11b_remove_sysfs(struct device *dev) g->gr.gpc_count, &g->ecc.gr.t19x.gpccs_corrected_err_count, dev_attr_gpccs_ecc_corrected_err_count_array); + + gp10b_ecc_stat_remove(dev, + g->gr.gpc_count, + &g->ecc.gr.t19x.mmu_l1tlb_uncorrected_err_count, + dev_attr_mmu_l1tlb_ecc_uncorrected_err_count_array); + + gp10b_ecc_stat_remove(dev, + g->gr.gpc_count, + &g->ecc.gr.t19x.mmu_l1tlb_corrected_err_count, + dev_attr_mmu_l1tlb_ecc_corrected_err_count_array); } diff --git a/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h b/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h index 2d5afb29..62307265 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h +++ b/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h @@ -3426,6 +3426,10 @@ static inline u32 gr_gpcs_gpccs_gpc_exception_en_gpccs_f(u32 v) { return (v & 0x1) << 14; } +static inline u32 gr_gpcs_gpccs_gpc_exception_en_gpcmmu_f(u32 v) +{ + return (v & 0x1) << 15; +} static inline u32 gr_gpc0_gpccs_gpc_exception_r(void) { return 0x00502c90; @@ -3442,6 +3446,30 @@ static inline u32 gr_gpc0_gpccs_gpc_exception_tpc_0_pending_v(void) { return 0x00000001; } +static inline u32 gr_gpc0_gpccs_gpc_exception_gpccs_f(u32 v) +{ + return (v & 0x1) << 14; +} +static inline u32 gr_gpc0_gpccs_gpc_exception_gpccs_m(void) +{ + return 0x1 << 14; +} +static inline u32 gr_gpc0_gpccs_gpc_exception_gpccs_pending_f(void) +{ + return 0x4000; +} +static inline u32 gr_gpc0_gpccs_gpc_exception_gpcmmu_f(u32 v) +{ + return (v & 0x1) << 15; +} +static inline u32 gr_gpc0_gpccs_gpc_exception_gpcmmu_m(void) +{ + return 0x1 << 15; +} +static inline u32 gr_gpc0_gpccs_gpc_exception_gpcmmu_pending_f(void) +{ + return 0x8000; +} static inline u32 gr_pri_gpc0_gcc_l15_ecc_status_r(void) { return 0x00501048; @@ -3498,18 +3526,6 @@ static inline u32 gr_pri_gpc0_gcc_l15_ecc_uncorrected_err_count_total_v(u32 r) { return (r >> 0) & 0xffff; } -static inline u32 gr_gpc0_gpccs_gpc_exception_gpccs_f(u32 v) -{ - return (v & 0x1) << 14; -} -static inline u32 gr_gpc0_gpccs_gpc_exception_gpccs_m(void) -{ - return 0x1 << 14; -} -static inline u32 gr_gpc0_gpccs_gpc_exception_gpccs_pending_f(void) -{ - return 0x4000; -} static inline u32 gr_gpc0_tpc0_tpccs_tpc_exception_r(void) { return 0x00504508; @@ -4014,6 +4030,182 @@ static inline u32 gr_gpcs_tc_debug0_limit_coalesce_buffer_size_m(void) { return 0x1ff << 0; } +static inline u32 gr_gpc0_mmu_gpcmmu_global_esr_r(void) +{ + return 0x00500324; +} +static inline u32 gr_gpc0_mmu_gpcmmu_global_esr_ecc_corrected_f(u32 v) +{ + return (v & 0x1) << 0; +} +static inline u32 gr_gpc0_mmu_gpcmmu_global_esr_ecc_corrected_m(void) +{ + return 0x1 << 0; +} +static inline u32 gr_gpc0_mmu_gpcmmu_global_esr_ecc_uncorrected_f(u32 v) +{ + return (v & 0x1) << 1; +} +static inline u32 gr_gpc0_mmu_gpcmmu_global_esr_ecc_uncorrected_m(void) +{ + return 0x1 << 1; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_r(void) +{ + return 0x00500314; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_f(u32 v) +{ + return (v & 0x1) << 0; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m(void) +{ + return 0x1 << 0; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_f(u32 v) +{ + return (v & 0x1) << 2; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m(void) +{ + return 0x1 << 2; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_f(u32 v) +{ + return (v & 0x1) << 1; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m(void) +{ + return 0x1 << 1; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_f(u32 v) +{ + return (v & 0x1) << 3; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m(void) +{ + return 0x1 << 3; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_total_counter_overflow_f(u32 v) +{ + return (v & 0x1) << 18; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_total_counter_overflow_m(void) +{ + return 0x1 << 18; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_total_counter_overflow_f(u32 v) +{ + return (v & 0x1) << 16; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_total_counter_overflow_m(void) +{ + return 0x1 << 16; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_unique_counter_overflow_f(u32 v) +{ + return (v & 0x1) << 19; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_unique_counter_overflow_m(void) +{ + return 0x1 << 19; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_unique_counter_overflow_f(u32 v) +{ + return (v & 0x1) << 17; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_unique_counter_overflow_m(void) +{ + return 0x1 << 17; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_reset_f(u32 v) +{ + return (v & 0x1) << 30; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_status_reset_task_f(void) +{ + return 0x40000000; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_address_r(void) +{ + return 0x00500320; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_address_index_f(u32 v) +{ + return (v & 0xffffffff) << 0; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_r(void) +{ + return 0x00500318; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_s(void) +{ + return 16; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_f(u32 v) +{ + return (v & 0xffff) << 0; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_m(void) +{ + return 0xffff << 0; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_v(u32 r) +{ + return (r >> 0) & 0xffff; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_unique_total_s(void) +{ + return 16; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_unique_total_f(u32 v) +{ + return (v & 0xffff) << 16; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_unique_total_m(void) +{ + return 0xffff << 16; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_unique_total_v(u32 r) +{ + return (r >> 16) & 0xffff; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_r(void) +{ + return 0x0050031c; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_s(void) +{ + return 16; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_f(u32 v) +{ + return (v & 0xffff) << 0; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_m(void) +{ + return 0xffff << 0; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_v(u32 r) +{ + return (r >> 0) & 0xffff; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_unique_total_s(void) +{ + return 16; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_unique_total_f(u32 v) +{ + return (v & 0xffff) << 16; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_unique_total_m(void) +{ + return 0xffff << 16; +} +static inline u32 gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_unique_total_v(u32 r) +{ + return (r >> 16) & 0xffff; +} static inline u32 gr_gpc0_gpccs_hww_esr_r(void) { return 0x00502c98; -- cgit v1.2.2