From 5a08eafbe076fba98de62883636ee6b0751cf7e9 Mon Sep 17 00:00:00 2001 From: Lakshmanan M Date: Wed, 17 May 2017 11:42:24 +0530 Subject: gpu: nvgpu: gv11b: Add L1 DATA + iCACHE parity This CL covers the following parity support (uncorrected error), 1) SM's L1 DATA 2) SM's L0 && L1 icache Volta Resiliency Id - Volta-634 JIRA GPUT19X-113 JIRA GPUT19X-99 Bug 1807553 Change-Id: Iacbf492028983529dadc5753007e43510b8cb786 Signed-off-by: Lakshmanan M Reviewed-on: http://git-master/r/1483681 Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/gv11b/gr_gv11b.c | 170 +++++++++++++++++++++ drivers/gpu/nvgpu/gv11b/gr_gv11b.h | 4 + drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c | 48 ++++++ .../gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h | 128 ++++++++++++++++ 4 files changed, 350 insertions(+) (limited to 'drivers/gpu') diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c index d36aa6ec..0c0b4261 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c @@ -368,6 +368,170 @@ static int gr_gv11b_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc, } +static int gr_gv11b_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32 tpc, + bool *post_event, struct channel_gk20a *fault_ch, + u32 *hww_global_esr) +{ + u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); + u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); + u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc; + u32 l1_data_ecc_status, l1_data_ecc_corrected_err_status = 0; + u32 l1_data_ecc_uncorrected_err_status = 0; + u32 l1_data_corrected_err_count_delta = 0; + u32 l1_data_uncorrected_err_count_delta = 0; + bool is_l1_data_ecc_corrected_total_err_overflow = 0; + bool is_l1_data_ecc_uncorrected_total_err_overflow = 0; + + /* Check for L1 data ECC errors. */ + l1_data_ecc_status = gk20a_readl(g, + gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_r() + offset); + l1_data_ecc_corrected_err_status = l1_data_ecc_status & + (gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_corrected_err_el1_0_m() | + gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_corrected_err_el1_1_m()); + l1_data_ecc_uncorrected_err_status = l1_data_ecc_status & + (gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_uncorrected_err_el1_0_m() | + gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_uncorrected_err_el1_1_m()); + + if ((l1_data_ecc_corrected_err_status == 0) && (l1_data_ecc_uncorrected_err_status == 0)) + return 0; + + l1_data_corrected_err_count_delta = + gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_total_v( + gk20a_readl(g, + gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_r() + + offset)); + l1_data_uncorrected_err_count_delta = + gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_total_v( + gk20a_readl(g, + gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r() + + offset)); + is_l1_data_ecc_corrected_total_err_overflow = + gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_corrected_err_total_counter_overflow_v(l1_data_ecc_status); + is_l1_data_ecc_uncorrected_total_err_overflow = + gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_uncorrected_err_total_counter_overflow_v(l1_data_ecc_status); + + if ((l1_data_corrected_err_count_delta > 0) || is_l1_data_ecc_corrected_total_err_overflow) { + gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr, + "corrected error (SBE) detected in SM L1 data! err_mask [%08x] is_overf [%d]", + l1_data_ecc_corrected_err_status, is_l1_data_ecc_corrected_total_err_overflow); + + /* HW uses 16-bits counter */ + l1_data_corrected_err_count_delta += + (is_l1_data_ecc_corrected_total_err_overflow << + gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_total_s()); + g->gr.t19x.ecc_stats.sm_l1_data_corrected_err_count.counters[tpc] += + l1_data_corrected_err_count_delta; + gk20a_writel(g, + gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_r() + offset, + 0); + } + if ((l1_data_uncorrected_err_count_delta > 0) || is_l1_data_ecc_uncorrected_total_err_overflow) { + gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr, + "Uncorrected error (DBE) detected in SM L1 data! err_mask [%08x] is_overf [%d]", + l1_data_ecc_uncorrected_err_status, is_l1_data_ecc_uncorrected_total_err_overflow); + + /* HW uses 16-bits counter */ + l1_data_uncorrected_err_count_delta += + (is_l1_data_ecc_uncorrected_total_err_overflow << + gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_total_s()); + g->gr.t19x.ecc_stats.sm_l1_data_uncorrected_err_count.counters[tpc] += + l1_data_uncorrected_err_count_delta; + gk20a_writel(g, + gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r() + offset, + 0); + } + + gk20a_writel(g, gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_r() + offset, + gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_reset_task_f()); + + return 0; + +} + +static int gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc, + bool *post_event, struct channel_gk20a *fault_ch, + u32 *hww_global_esr) +{ + u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); + u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); + u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc; + u32 icache_ecc_status, icache_ecc_corrected_err_status = 0; + u32 icache_ecc_uncorrected_err_status = 0; + u32 icache_corrected_err_count_delta = 0; + u32 icache_uncorrected_err_count_delta = 0; + bool is_icache_ecc_corrected_total_err_overflow = 0; + bool is_icache_ecc_uncorrected_total_err_overflow = 0; + + /* Check for L0 && L1 icache ECC errors. */ + icache_ecc_status = gk20a_readl(g, + gr_pri_gpc0_tpc0_sm_icache_ecc_status_r() + offset); + icache_ecc_corrected_err_status = icache_ecc_status & + (gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_data_m() | + gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_predecode_m() | + gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_data_m() | + gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_predecode_m()); + icache_ecc_uncorrected_err_status = icache_ecc_status & + (gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_data_m() | + gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_predecode_m() | + gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_data_m() | + gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_predecode_m()); + + if ((icache_ecc_corrected_err_status == 0) && (icache_ecc_uncorrected_err_status == 0)) + return 0; + + icache_corrected_err_count_delta = + gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_total_v( + gk20a_readl(g, + gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_r() + + offset)); + icache_uncorrected_err_count_delta = + gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_total_v( + gk20a_readl(g, + gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_r() + + offset)); + is_icache_ecc_corrected_total_err_overflow = + gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_total_counter_overflow_v(icache_ecc_status); + is_icache_ecc_uncorrected_total_err_overflow = + gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_total_counter_overflow_v(icache_ecc_status); + + if ((icache_corrected_err_count_delta > 0) || is_icache_ecc_corrected_total_err_overflow) { + gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr, + "corrected error (SBE) detected in SM L0 && L1 icache! err_mask [%08x] is_overf [%d]", + icache_ecc_corrected_err_status, is_icache_ecc_corrected_total_err_overflow); + + /* HW uses 16-bits counter */ + icache_corrected_err_count_delta += + (is_icache_ecc_corrected_total_err_overflow << + gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_total_s()); + g->gr.t19x.ecc_stats.sm_icache_corrected_err_count.counters[tpc] += + icache_corrected_err_count_delta; + gk20a_writel(g, + gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_r() + offset, + 0); + } + if ((icache_uncorrected_err_count_delta > 0) || is_icache_ecc_uncorrected_total_err_overflow) { + gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr, + "Uncorrected error (DBE) detected in SM L0 && L1 icache! err_mask [%08x] is_overf [%d]", + icache_ecc_uncorrected_err_status, is_icache_ecc_uncorrected_total_err_overflow); + + /* HW uses 16-bits counter */ + icache_uncorrected_err_count_delta += + (is_icache_ecc_uncorrected_total_err_overflow << + gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_total_s()); + g->gr.t19x.ecc_stats.sm_icache_uncorrected_err_count.counters[tpc] += + icache_uncorrected_err_count_delta; + gk20a_writel(g, + gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_r() + offset, + 0); + } + + gk20a_writel(g, gr_pri_gpc0_tpc0_sm_icache_ecc_status_r() + offset, + gr_pri_gpc0_tpc0_sm_icache_ecc_status_reset_task_f()); + + return 0; + +} + static int gr_gv11b_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, bool *post_event, struct channel_gk20a *fault_ch, u32 *hww_global_esr) @@ -383,6 +547,12 @@ static int gr_gv11b_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, /* Check for CBU ECC errors. */ gr_gv11b_handle_cbu_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr); + /* Check for L1 data ECC errors. */ + gr_gv11b_handle_l1_data_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr); + + /* Check for L0 && L1 icache ECC errors. */ + gr_gv11b_handle_icache_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr); + return ret; } diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h index b350862c..5bcbe667 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h @@ -41,6 +41,10 @@ struct gr_t19x { struct gr_gp10b_ecc_stat sm_l1_tag_uncorrected_err_count; struct gr_gp10b_ecc_stat sm_cbu_corrected_err_count; struct gr_gp10b_ecc_stat sm_cbu_uncorrected_err_count; + struct gr_gp10b_ecc_stat sm_l1_data_corrected_err_count; + struct gr_gp10b_ecc_stat sm_l1_data_uncorrected_err_count; + struct gr_gp10b_ecc_stat sm_icache_corrected_err_count; + struct gr_gp10b_ecc_stat sm_icache_uncorrected_err_count; } ecc_stats; }; diff --git a/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c b/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c index d235b261..009e5716 100644 --- a/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c +++ b/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c @@ -127,6 +127,10 @@ static struct device_attribute *dev_attr_sm_l1_tag_ecc_corrected_err_count_array static struct device_attribute *dev_attr_sm_l1_tag_ecc_uncorrected_err_count_array; static struct device_attribute *dev_attr_sm_cbu_ecc_corrected_err_count_array; static struct device_attribute *dev_attr_sm_cbu_ecc_uncorrected_err_count_array; +static struct device_attribute *dev_attr_sm_l1_data_ecc_corrected_err_count_array; +static struct device_attribute *dev_attr_sm_l1_data_ecc_uncorrected_err_count_array; +static struct device_attribute *dev_attr_sm_icache_ecc_corrected_err_count_array; +static struct device_attribute *dev_attr_sm_icache_ecc_uncorrected_err_count_array; void gr_gv11b_create_sysfs(struct device *dev) { @@ -165,6 +169,30 @@ void gr_gv11b_create_sysfs(struct device *dev) &g->gr.t19x.ecc_stats.sm_cbu_uncorrected_err_count, dev_attr_sm_cbu_ecc_uncorrected_err_count_array); + error |= gr_gp10b_ecc_stat_create(dev, + 0, + "sm_l1_data_ecc_corrected_err_count", + &g->gr.t19x.ecc_stats.sm_l1_data_corrected_err_count, + dev_attr_sm_l1_data_ecc_corrected_err_count_array); + + error |= gr_gp10b_ecc_stat_create(dev, + 0, + "sm_l1_data_ecc_uncorrected_err_count", + &g->gr.t19x.ecc_stats.sm_l1_data_uncorrected_err_count, + dev_attr_sm_l1_data_ecc_uncorrected_err_count_array); + + error |= gr_gp10b_ecc_stat_create(dev, + 0, + "sm_icache_ecc_corrected_err_count", + &g->gr.t19x.ecc_stats.sm_icache_corrected_err_count, + dev_attr_sm_icache_ecc_corrected_err_count_array); + + error |= gr_gp10b_ecc_stat_create(dev, + 0, + "sm_icache_ecc_uncorrected_err_count", + &g->gr.t19x.ecc_stats.sm_icache_uncorrected_err_count, + dev_attr_sm_icache_ecc_uncorrected_err_count_array); + if (error) dev_err(dev, "Failed to create gv11b sysfs attributes!\n"); } @@ -193,4 +221,24 @@ static void gr_gv11b_remove_sysfs(struct device *dev) &g->gr.t19x.ecc_stats.sm_cbu_uncorrected_err_count, dev_attr_sm_cbu_ecc_uncorrected_err_count_array); + gr_gp10b_ecc_stat_remove(dev, + 0, + &g->gr.t19x.ecc_stats.sm_l1_data_corrected_err_count, + dev_attr_sm_l1_data_ecc_corrected_err_count_array); + + gr_gp10b_ecc_stat_remove(dev, + 0, + &g->gr.t19x.ecc_stats.sm_l1_data_uncorrected_err_count, + dev_attr_sm_l1_data_ecc_uncorrected_err_count_array); + + gr_gp10b_ecc_stat_remove(dev, + 0, + &g->gr.t19x.ecc_stats.sm_icache_corrected_err_count, + dev_attr_sm_icache_ecc_corrected_err_count_array); + + gr_gp10b_ecc_stat_remove(dev, + 0, + &g->gr.t19x.ecc_stats.sm_icache_uncorrected_err_count, + dev_attr_sm_icache_ecc_uncorrected_err_count_array); + } diff --git a/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h b/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h index 4b2e8c32..4ce69743 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h +++ b/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h @@ -582,6 +582,134 @@ static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_total_v(u32 { return (r >> 0) & 0xffff; } +static inline u32 gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_r(void) +{ + return 0x0050436c; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_corrected_err_el1_0_m(void) +{ + return 0x1 << 0; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_corrected_err_el1_1_m(void) +{ + return 0x1 << 1; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_uncorrected_err_el1_0_m(void) +{ + return 0x1 << 2; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_uncorrected_err_el1_1_m(void) +{ + return 0x1 << 3; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_corrected_err_total_counter_overflow_v(u32 r) +{ + return (r >> 8) & 0x1; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_uncorrected_err_total_counter_overflow_v(u32 r) +{ + return (r >> 10) & 0x1; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_data_ecc_status_reset_task_f(void) +{ + return 0x40000000; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_r(void) +{ + return 0x00504370; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_total_s(void) +{ + return 16; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_total_v(u32 r) +{ + return (r >> 0) & 0xffff; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r(void) +{ + return 0x00504374; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_total_s(void) +{ + return 16; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_total_v(u32 r) +{ + return (r >> 0) & 0xffff; +} +static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_status_r(void) +{ + return 0x0050464c; +} +static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_data_m(void) +{ + return 0x1 << 0; +} +static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l0_predecode_m(void) +{ + return 0x1 << 1; +} +static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_data_m(void) +{ + return 0x1 << 2; +} +static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_l1_predecode_m(void) +{ + return 0x1 << 3; +} +static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_data_m(void) +{ + return 0x1 << 4; +} +static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l0_predecode_m(void) +{ + return 0x1 << 5; +} +static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_data_m(void) +{ + return 0x1 << 6; +} +static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_l1_predecode_m(void) +{ + return 0x1 << 7; +} +static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_status_corrected_err_total_counter_overflow_v(u32 r) +{ + return (r >> 16) & 0x1; +} +static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_status_uncorrected_err_total_counter_overflow_v(u32 r) +{ + return (r >> 18) & 0x1; +} +static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_status_reset_task_f(void) +{ + return 0x40000000; +} +static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_r(void) +{ + return 0x00504650; +} +static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_total_s(void) +{ + return 16; +} +static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_total_v(u32 r) +{ + return (r >> 0) & 0xffff; +} +static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_r(void) +{ + return 0x00504654; +} +static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_total_s(void) +{ + return 16; +} +static inline u32 gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_total_v(u32 r) +{ + return (r >> 0) & 0xffff; +} static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_r(void) { return 0x00504624; -- cgit v1.2.2