From ffc37e50fa8e869e9a160b35f3cf414040e8a360 Mon Sep 17 00:00:00 2001 From: Lakshmanan M Date: Wed, 10 May 2017 12:38:08 +0530 Subject: gpu: nvgpu: gv11b: Add L1 tags parity support This CL covers the following parity support (corrected + uncorrected), 1) SM's L1 tags 2) SM's S2R's pixel PRF buffer 3) SM's L1 D-cache miss latency FIFOs Volta Resiliency Id - Volta-720, Volta-721, Volta-637 JIRA GPUT19X-85 JIRA GPUT19X-104 JIRA GPUT19X-100 JIRA GPUT19X-103 Bug 1825948 Bug 1825962 Bug 1775457 Change-Id: I53d7231a36b2c7c252395eca27b349eca80dec63 Signed-off-by: Lakshmanan M Reviewed-on: http://git-master/r/1478881 Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/gv11b/gr_gv11b.c | 87 +++++++++++++++++++++- drivers/gpu/nvgpu/gv11b/gr_gv11b.h | 8 ++ drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c | 59 +++++++++++++++ .../gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h | 72 ++++++++++++++++++ 4 files changed, 225 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c index 179c7d33..ad34233c 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c @@ -108,6 +108,89 @@ static bool gr_gv11b_is_valid_compute_class(struct gk20a *g, u32 class_num) return valid; } +static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc, + bool *post_event, struct channel_gk20a *fault_ch, + u32 *hww_global_esr) +{ + u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); + u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); + u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc; + u32 l1_tag_ecc_status, l1_tag_ecc_corrected_err_status = 0; + u32 l1_tag_ecc_uncorrected_err_status = 0; + u32 l1_tag_corrected_err_count_delta = 0; + u32 l1_tag_uncorrected_err_count_delta = 0; + bool is_l1_tag_ecc_corrected_total_err_overflow = 0; + bool is_l1_tag_ecc_uncorrected_total_err_overflow = 0; + + /* Check 
for L1 tag ECC errors. */ + l1_tag_ecc_status = gk20a_readl(g, + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_r() + offset); + l1_tag_ecc_corrected_err_status = l1_tag_ecc_status & + (gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m() | + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_1_m() | + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_pixrpf_m() | + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_miss_fifo_m()); + l1_tag_ecc_uncorrected_err_status = l1_tag_ecc_status & + (gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_0_m() | + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_1_m() | + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_pixrpf_m() | + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_miss_fifo_m()); + + if ((l1_tag_ecc_corrected_err_status == 0) && (l1_tag_ecc_uncorrected_err_status == 0)) + return 0; + + l1_tag_corrected_err_count_delta = + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_total_v( + gk20a_readl(g, + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r() + + offset)); + l1_tag_uncorrected_err_count_delta = + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_total_v( + gk20a_readl(g, + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r() + + offset)); + is_l1_tag_ecc_corrected_total_err_overflow = + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_total_counter_overflow_v(l1_tag_ecc_status); + is_l1_tag_ecc_uncorrected_total_err_overflow = + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_total_counter_overflow_v(l1_tag_ecc_status); + + if ((l1_tag_corrected_err_count_delta > 0) || is_l1_tag_ecc_corrected_total_err_overflow) { + gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr, + "corrected error (SBE) detected in SM L1 tag! 
err_mask [%08x] is_overf [%d]", + l1_tag_ecc_corrected_err_status, is_l1_tag_ecc_corrected_total_err_overflow); + + /* HW counter is 16 bits wide; add the overflow flag as bit 16 */ + l1_tag_corrected_err_count_delta += + (is_l1_tag_ecc_corrected_total_err_overflow << + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_total_s()); + g->gr.t19x.ecc_stats.sm_l1_tag_corrected_err_count.counters[tpc] += + l1_tag_corrected_err_count_delta; + gk20a_writel(g, + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r() + offset, + 0); + } + if ((l1_tag_uncorrected_err_count_delta > 0) || is_l1_tag_ecc_uncorrected_total_err_overflow) { + gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr, + "Uncorrected error (DBE) detected in SM L1 tag! err_mask [%08x] is_overf [%d]", + l1_tag_ecc_uncorrected_err_status, is_l1_tag_ecc_uncorrected_total_err_overflow); + + /* HW counter is 16 bits wide; add the overflow flag as bit 16 */ + l1_tag_uncorrected_err_count_delta += + (is_l1_tag_ecc_uncorrected_total_err_overflow << + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_total_s()); + g->gr.t19x.ecc_stats.sm_l1_tag_uncorrected_err_count.counters[tpc] += + l1_tag_uncorrected_err_count_delta; + gk20a_writel(g, + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r() + offset, + 0); + } + + gk20a_writel(g, gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_r() + offset, + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_reset_task_f()); + + return 0; + +} static int gr_gv11b_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, bool *post_event, struct channel_gk20a *fault_ch, @@ -118,7 +201,8 @@ static int gr_gv11b_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, proj_tpc_in_gpc_stride_v() * tpc; u32 lrf_ecc_status; - gr_gk20a_handle_sm_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr); + /* Check for L1 tag ECC errors. */ + gr_gv11b_handle_l1_tag_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr); /* Check for LRF ECC errors. 
*/ lrf_ecc_status = gk20a_readl(g, @@ -1692,6 +1776,7 @@ void gv11b_init_gr(struct gpu_ops *gops) gops->gr.pre_process_sm_exception = gr_gv11b_pre_process_sm_exception; gops->gr.handle_fecs_error = gr_gv11b_handle_fecs_error; + gops->gr.create_gr_sysfs = gr_gv11b_create_sysfs; gops->gr.setup_rop_mapping = gr_gv11b_setup_rop_mapping; gops->gr.init_sw_veid_bundle = gr_gv11b_init_sw_veid_bundle; gops->gr.program_zcull_mapping = gr_gv11b_program_zcull_mapping; diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h index 9d9f969d..2d6e3d1f 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h @@ -35,6 +35,13 @@ enum { VOLTA_DMA_COPY_A = 0xC3B5, }; +struct gr_t19x { + struct { + struct gr_gp10b_ecc_stat sm_l1_tag_corrected_err_count; + struct gr_gp10b_ecc_stat sm_l1_tag_uncorrected_err_count; + } ecc_stats; +}; + #define NVC397_SET_SHADER_EXCEPTIONS 0x1528 #define NVC397_SET_CIRCULAR_BUFFER_SIZE 0x1280 #define NVC397_SET_ALPHA_CIRCULAR_BUFFER_SIZE 0x02dc @@ -48,4 +55,5 @@ int gr_gv11b_alloc_buffer(struct vm_gk20a *vm, size_t size, /*zcull*/ void gr_gv11b_program_zcull_mapping(struct gk20a *g, u32 zcull_num_entries, u32 *zcull_map_tiles); +void gr_gv11b_create_sysfs(struct device *dev); #endif diff --git a/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c b/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c index 97845035..8ca9dd30 100644 --- a/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c +++ b/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c @@ -27,11 +27,13 @@ #include "tegra/linux/clk.h" #include "gp10b/platform_gp10b.h" +#include "tegra/linux/platform_gp10b_tegra.h" #include "tegra/linux/platform_gk20a_tegra.h" #include "gr_gv11b.h" #include "nvgpu_gpuid_t19x.h" +static void gr_gv11b_remove_sysfs(struct device *dev); static int gv11b_tegra_probe(struct device *dev) { @@ -57,6 +59,15 @@ static int gv11b_tegra_probe(struct device *dev) return 0; } +static int gv11b_tegra_remove(struct device *dev) +{ + 
gp10b_tegra_remove(dev); + + gr_gv11b_remove_sysfs(dev); + + return 0; +} + static bool gv11b_tegra_is_railgated(struct device *dev) { bool ret = false; @@ -89,6 +100,7 @@ struct gk20a_platform t19x_gpu_tegra_platform = { .ptimer_src_freq = 31250000, .probe = gv11b_tegra_probe, + .remove = gv11b_tegra_remove, /* power management callbacks */ .suspend = gv11b_tegra_suspend, @@ -110,3 +122,50 @@ struct gk20a_platform t19x_gpu_tegra_platform = { .reset_assert = gp10b_tegra_reset_assert, .reset_deassert = gp10b_tegra_reset_deassert, }; + +static struct device_attribute *dev_attr_sm_l1_tag_ecc_corrected_err_count_array; +static struct device_attribute *dev_attr_sm_l1_tag_ecc_uncorrected_err_count_array; + +void gr_gv11b_create_sysfs(struct device *dev) +{ + struct gk20a *g = get_gk20a(dev); + int error = 0; + /* This stat creation function is called on GR init. GR can get + initialized multiple times but we only need to create the ECC + stats once. Therefore, add the following check to avoid + creating duplicate stat sysfs nodes. 
*/ + if (g->gr.t19x.ecc_stats.sm_l1_tag_corrected_err_count.counters != NULL) + return; + + gr_gp10b_create_sysfs(dev); + + error |= gr_gp10b_ecc_stat_create(dev, + 0, + "sm_l1_tag_ecc_corrected_err_count", + &g->gr.t19x.ecc_stats.sm_l1_tag_corrected_err_count, + dev_attr_sm_l1_tag_ecc_corrected_err_count_array); + + error |= gr_gp10b_ecc_stat_create(dev, + 0, + "sm_l1_tag_ecc_uncorrected_err_count", + &g->gr.t19x.ecc_stats.sm_l1_tag_uncorrected_err_count, + dev_attr_sm_l1_tag_ecc_uncorrected_err_count_array); + + if (error) + dev_err(dev, "Failed to create gv11b sysfs attributes!\n"); +} + +static void gr_gv11b_remove_sysfs(struct device *dev) +{ + struct gk20a *g = get_gk20a(dev); + + gr_gp10b_ecc_stat_remove(dev, + 0, + &g->gr.t19x.ecc_stats.sm_l1_tag_corrected_err_count, + dev_attr_sm_l1_tag_ecc_corrected_err_count_array); + + gr_gp10b_ecc_stat_remove(dev, + 0, + &g->gr.t19x.ecc_stats.sm_l1_tag_uncorrected_err_count, + dev_attr_sm_l1_tag_ecc_uncorrected_err_count_array); +} diff --git a/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h b/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h index 592a7899..d45385a8 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h +++ b/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h @@ -482,6 +482,78 @@ static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r(void) { return 0x00504358; } +static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_r(void) +{ + return 0x00504624; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m(void) +{ + return 0x1 << 0; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_1_m(void) +{ + return 0x1 << 1; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_0_m(void) +{ + return 0x1 << 2; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_1_m(void) +{ + return 0x1 << 3; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_pixrpf_m(void) 
+{ + return 0x1 << 4; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_miss_fifo_m(void) +{ + return 0x1 << 5; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_pixrpf_m(void) +{ + return 0x1 << 6; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_miss_fifo_m(void) +{ + return 0x1 << 7; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_total_counter_overflow_v(u32 r) +{ + return (r >> 8) & 0x1; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_total_counter_overflow_v(u32 r) +{ + return (r >> 10) & 0x1; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_reset_task_f(void) +{ + return 0x40000000; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r(void) +{ + return 0x00504628; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_total_s(void) +{ + return 16; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_total_v(u32 r) +{ + return (r >> 0) & 0xffff; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r(void) +{ + return 0x0050462c; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_total_s(void) +{ + return 16; +} +static inline u32 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_total_v(u32 r) +{ + return (r >> 0) & 0xffff; +} static inline u32 gr_pri_gpc0_tpc0_tex_m_routing_r(void) { return 0x005042c4; -- cgit v1.2.2