From 6bc36bded05ee497a474e5a718c49dc33eb235f1 Mon Sep 17 00:00:00 2001 From: David Nieto Date: Mon, 22 May 2017 16:38:49 -0700 Subject: gpu: nvgpu: L2 cache tag ECC support Adding support for L2 cache tag ECC error handling JIRA: GPUT19X-112 Change-Id: I9a8ebefe97814b341f57a024dfb126013adaac1c Signed-off-by: David Nieto Reviewed-on: http://git-master/r/1489029 Reviewed-by: svccoveritychecker GVS: Gerrit_Virtual_Submit Reviewed-by: Terje Bergstrom --- drivers/gpu/nvgpu/gv11b/ecc_gv11b.h | 5 + drivers/gpu/nvgpu/gv11b/ltc_gv11b.c | 107 ++++++++++++ drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c | 27 +++ .../nvgpu/include/nvgpu/hw/gv11b/hw_ltc_gv11b.h | 184 +++++++++++++++++++++ 4 files changed, 323 insertions(+) (limited to 'drivers/gpu') diff --git a/drivers/gpu/nvgpu/gv11b/ecc_gv11b.h b/drivers/gpu/nvgpu/gv11b/ecc_gv11b.h index 6b471655..4e1696f7 100644 --- a/drivers/gpu/nvgpu/gv11b/ecc_gv11b.h +++ b/drivers/gpu/nvgpu/gv11b/ecc_gv11b.h @@ -33,4 +33,9 @@ struct ecc_gr_t19x { struct gk20a_ecc_stat gpccs_uncorrected_err_count; }; +struct ecc_ltc_t19x { + struct gk20a_ecc_stat l2_cache_corrected_err_count; + struct gk20a_ecc_stat l2_cache_uncorrected_err_count; +}; + #endif diff --git a/drivers/gpu/nvgpu/gv11b/ltc_gv11b.c b/drivers/gpu/nvgpu/gv11b/ltc_gv11b.c index 23beca5d..b8a97ce3 100644 --- a/drivers/gpu/nvgpu/gv11b/ltc_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/ltc_gv11b.c @@ -20,6 +20,7 @@ #include "ltc_gv11b.h" #include +#include #include #include @@ -74,6 +75,111 @@ static void gv11b_ltc_init_fs_state(struct gk20a *g) ltc_intr); } +static void gv11b_ltc_isr(struct gk20a *g) +{ + u32 mc_intr, ltc_intr3; + unsigned int ltc, slice; + u32 ltc_stride = nvgpu_get_litter_value(g, GPU_LIT_LTC_STRIDE); + u32 lts_stride = nvgpu_get_litter_value(g, GPU_LIT_LTS_STRIDE); + u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt; + u32 corrected_delta, uncorrected_delta; + u32 corrected_overflow, uncorrected_overflow; + u32 ltc_corrected, ltc_uncorrected; + + mc_intr = gk20a_readl(g, mc_intr_ltc_r()); + for (ltc = 0; ltc < g->ltc_count; ltc++) { + if ((mc_intr & 1 << ltc) == 0) + continue; + ltc_corrected = ltc_uncorrected = 0; + + for (slice = 0; slice < g->gr.slices_per_ltc; slice++) { + u32 offset = ltc_stride * ltc + lts_stride * slice; + ltc_intr3 = gk20a_readl(g, ltc_ltc0_lts0_intr3_r() + + offset); + + /* Detect and handle ECC PARITY errors */ + + if (ltc_intr3 & + (ltc_ltcs_ltss_intr3_ecc_uncorrected_m() | + ltc_ltcs_ltss_intr3_ecc_corrected_m())) { + + ecc_status = gk20a_readl(g, + ltc_ltc0_lts0_l2_cache_ecc_status_r() + + offset); + ecc_addr = gk20a_readl(g, + ltc_ltc0_lts0_l2_cache_ecc_address_r() + + offset); + corrected_cnt = gk20a_readl(g, + ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_r() + offset); + uncorrected_cnt = gk20a_readl(g, + ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_r() + offset); + + corrected_delta = + ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_total_v(corrected_cnt); + uncorrected_delta = + ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_total_v(uncorrected_cnt); + corrected_overflow = ecc_status & + ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_total_counter_overflow_m(); + + uncorrected_overflow = ecc_status & + ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_total_counter_overflow_m(); + + /* clear the interrupt */ + if ((corrected_delta > 0) || corrected_overflow) { + gk20a_writel(g, ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_r() + offset, 0); + } + if ((uncorrected_delta > 0) || uncorrected_overflow) { + gk20a_writel(g, + ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_r() + offset, 0); + } + + gk20a_writel(g, ltc_ltc0_lts0_l2_cache_ecc_status_r() + offset, + ltc_ltc0_lts0_l2_cache_ecc_status_reset_task_f()); + + /* update counters per slice */ + if (corrected_overflow) + corrected_delta += (0x1UL << ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_total_s()); + if (uncorrected_overflow) + uncorrected_delta += (0x1UL << ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_total_s()); + + ltc_corrected += corrected_delta; + ltc_uncorrected += uncorrected_delta; + nvgpu_log(g, gpu_dbg_intr, + "ltc:%d lts: %d cache ecc interrupt intr: 0x%x", ltc, slice, ltc_intr3); + + if (ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m()) + nvgpu_log(g, gpu_dbg_intr, "rstg ecc error corrected"); + if (ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_rstg_m()) + nvgpu_log(g, gpu_dbg_intr, "rstg ecc error uncorrected"); + if (ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_m()) + nvgpu_log(g, gpu_dbg_intr, "tstg ecc error corrected"); + if (ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_tstg_m()) + nvgpu_log(g, gpu_dbg_intr, "tstg ecc error uncorrected"); + if (ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_dstg_m()) + nvgpu_log(g, gpu_dbg_intr, "dstg ecc error corrected"); + if (ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_dstg_m()) + nvgpu_log(g, gpu_dbg_intr, "dstg ecc error uncorrected"); + + if (corrected_overflow || uncorrected_overflow) + nvgpu_info(g, "ecc counter overflow!"); + + nvgpu_log(g, gpu_dbg_intr, + "ecc error address: 0x%x", ecc_addr); + + } + + } + g->ecc.ltc.t19x.l2_cache_corrected_err_count.counters[ltc] += + ltc_corrected; + g->ecc.ltc.t19x.l2_cache_uncorrected_err_count.counters[ltc] += + ltc_uncorrected; + + } + + /* fallback to other interrupts */ + gp10b_ltc_isr(g); +} + static u32 gv11b_ltc_cbc_fix_config(struct gk20a *g, int base) { u32 val = gk20a_readl(g, ltc_ltcs_ltss_cbc_num_active_ltcs_r()); @@ -93,4 +199,5 @@ void gv11b_init_ltc(struct gpu_ops *gops) gops->ltc.set_zbc_s_entry = gv11b_ltc_set_zbc_stencil_entry; gops->ltc.init_fs_state = gv11b_ltc_init_fs_state; gops->ltc.cbc_fix_config = gv11b_ltc_cbc_fix_config; + gops->ltc.isr = gv11b_ltc_isr; } diff --git a/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c b/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c index 8733cae9..432af7c1 100644 --- a/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c +++ b/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c @@ -177,6 +177,9 @@ static struct device_attribute *dev_attr_fecs_ecc_uncorrected_err_count_array; static struct device_attribute *dev_attr_gpccs_ecc_corrected_err_count_array; static struct device_attribute *dev_attr_gpccs_ecc_uncorrected_err_count_array; +static struct device_attribute *dev_attr_l2_cache_ecc_corrected_err_count_array; +static struct device_attribute *dev_attr_l2_cache_ecc_uncorrected_err_count_array; + void gr_gv11b_create_sysfs(struct device *dev) { struct gk20a *g = get_gk20a(dev); @@ -250,6 +253,20 @@ void gr_gv11b_create_sysfs(struct device *dev) &g->ecc.gr.t19x.gcc_l15_uncorrected_err_count, dev_attr_gcc_l15_ecc_uncorrected_err_count_array); + error |= gp10b_ecc_stat_create(dev, + g->ltc_count, + "ltc", + "l2_cache_uncorrected_err_count", + &g->ecc.ltc.t19x.l2_cache_uncorrected_err_count, + dev_attr_l2_cache_ecc_uncorrected_err_count_array); + + error |= gp10b_ecc_stat_create(dev, + g->ltc_count, + "ltc", + "l2_cache_corrected_err_count", + &g->ecc.ltc.t19x.l2_cache_corrected_err_count, + dev_attr_l2_cache_ecc_corrected_err_count_array); + error |= gp10b_ecc_stat_create(dev, 1, "gpc", @@ -336,6 +353,16 @@ static void gr_gv11b_remove_sysfs(struct device *dev) &g->ecc.gr.t19x.gcc_l15_uncorrected_err_count, dev_attr_gcc_l15_ecc_uncorrected_err_count_array); + gp10b_ecc_stat_remove(dev, + g->ltc_count, + &g->ecc.ltc.t19x.l2_cache_uncorrected_err_count, + dev_attr_l2_cache_ecc_uncorrected_err_count_array); + + gp10b_ecc_stat_remove(dev, + g->ltc_count, + &g->ecc.ltc.t19x.l2_cache_corrected_err_count, + dev_attr_l2_cache_ecc_corrected_err_count_array); + gp10b_ecc_stat_remove(dev, 1, &g->ecc.gr.t19x.fecs_uncorrected_err_count, diff --git a/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_ltc_gv11b.h b/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_ltc_gv11b.h index 45d3df07..1bcd1246 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_ltc_gv11b.h +++ b/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_ltc_gv11b.h @@ -374,6 +374,190 @@ static inline u32 ltc_ltc0_lts0_intr_r(void) { return 0x0014040c; } +static inline u32 ltc_ltcs_ltss_intr3_r(void) +{ + return 0x0017e388; +} +static inline u32 ltc_ltcs_ltss_intr3_ecc_corrected_m(void) +{ + return 0x1 << 7; +} +static inline u32 ltc_ltcs_ltss_intr3_ecc_uncorrected_m(void) +{ + return 0x1 << 8; +} +static inline u32 ltc_ltc0_lts0_intr3_r(void) +{ + return 0x00140588; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_status_r(void) +{ + return 0x001404f0; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_f(u32 v) +{ + return (v & 0x1) << 1; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m(void) +{ + return 0x1 << 1; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_f(u32 v) +{ + return (v & 0x1) << 3; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_m(void) +{ + return 0x1 << 3; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_dstg_f(u32 v) +{ + return (v & 0x1) << 5; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_dstg_m(void) +{ + return 0x1 << 5; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_rstg_f(u32 v) +{ + return (v & 0x1) << 0; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_rstg_m(void) +{ + return 0x1 << 0; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_tstg_f(u32 v) +{ + return (v & 0x1) << 2; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_tstg_m(void) +{ + return 0x1 << 2; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_dstg_f(u32 v) +{ + return (v & 0x1) << 4; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_dstg_m(void) +{ + return 0x1 << 4; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_total_counter_overflow_f(u32 v) +{ + return (v & 0x1) << 18; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_total_counter_overflow_m(void) +{ + return 0x1 << 18; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_total_counter_overflow_f(u32 v) +{ + return (v & 0x1) << 16; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_total_counter_overflow_m(void) +{ + return 0x1 << 16; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_unique_counter_overflow_f(u32 v) +{ + return (v & 0x1) << 19; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_unique_counter_overflow_m(void) +{ + return 0x1 << 19; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_unique_counter_overflow_f(u32 v) +{ + return (v & 0x1) << 17; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_unique_counter_overflow_m(void) +{ + return 0x1 << 17; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_status_reset_f(u32 v) +{ + return (v & 0x1) << 30; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_status_reset_task_f(void) +{ + return 0x40000000; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_address_r(void) +{ + return 0x001404fc; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_r(void) +{ + return 0x001404f4; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_total_s(void) +{ + return 16; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_total_f(u32 v) +{ + return (v & 0xffff) << 0; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_total_m(void) +{ + return 0xffff << 0; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_total_v(u32 r) +{ + return (r >> 0) & 0xffff; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_unique_total_s(void) +{ + return 16; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_unique_total_f(u32 v) +{ + return (v & 0xffff) << 16; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_unique_total_m(void) +{ + return 0xffff << 16; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_unique_total_v(u32 r) +{ + return (r >> 16) & 0xffff; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_r(void) +{ + return 0x001404f8; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_total_s(void) +{ + return 16; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_total_f(u32 v) +{ + return (v & 0xffff) << 0; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_total_m(void) +{ + return 0xffff << 0; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_total_v(u32 r) +{ + return (r >> 0) & 0xffff; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_unique_total_s(void) +{ + return 16; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_unique_total_f(u32 v) +{ + return (v & 0xffff) << 16; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_unique_total_m(void) +{ + return 0xffff << 16; +} +static inline u32 ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_unique_total_v(u32 r) +{ + return (r >> 16) & 0xffff; +} static inline u32 ltc_ltc0_lts0_dstg_ecc_report_r(void) { return 0x0014051c; -- cgit v1.2.2