From f17e0d822b47465cca23afa2054bfa1267b52b95 Mon Sep 17 00:00:00 2001 From: Adeel Raza Date: Thu, 18 Jun 2015 16:31:50 -0700 Subject: gpu: nvgpu: gp10b: add ECC support Add ECC exception handling support for SM, TEX, and LTC. Bug 1635727 Bug 1637486 Change-Id: I8862ead5784f48742355432ec07c71a82b1b6735 Signed-off-by: Adeel Raza Reviewed-on: http://git-master/r/935362 Reviewed-by: Terje Bergstrom GVS: Gerrit_Virtual_Submit --- drivers/gpu/nvgpu/gp10b/gr_gp10b.c | 102 ++++++++++++++++++++++++++++++ drivers/gpu/nvgpu/gp10b/hw_gr_gp10b.h | 110 ++++++++++++++++++++++++++++++++- drivers/gpu/nvgpu/gp10b/hw_ltc_gp10b.h | 16 +++++ drivers/gpu/nvgpu/gp10b/ltc_gp10b.c | 23 +++++++ 4 files changed, 250 insertions(+), 1 deletion(-) (limited to 'drivers/gpu') diff --git a/drivers/gpu/nvgpu/gp10b/gr_gp10b.c b/drivers/gpu/nvgpu/gp10b/gr_gp10b.c index 6bdb9a7c..f8c31bd3 100644 --- a/drivers/gpu/nvgpu/gp10b/gr_gp10b.c +++ b/drivers/gpu/nvgpu/gp10b/gr_gp10b.c @@ -55,6 +55,106 @@ static bool gr_gp10b_is_valid_class(struct gk20a *g, u32 class_num) return valid; } +static int gr_gp10b_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, + bool *post_event, struct channel_gk20a *fault_ch) +{ + int ret = 0; + u32 offset = proj_gpc_stride_v() * gpc + + proj_tpc_in_gpc_stride_v() * tpc; + u32 lrf_ecc_status, shm_ecc_status; + + gr_gk20a_handle_sm_exception(g, gpc, tpc, post_event, NULL); + + /* Check for LRF ECC errors. */ + lrf_ecc_status = gk20a_readl(g, + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset); + if ( (lrf_ecc_status & + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp0_pending_f()) || + (lrf_ecc_status & + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp1_pending_f()) || + (lrf_ecc_status & + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp2_pending_f()) || + (lrf_ecc_status & + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp3_pending_f()) ) { + + gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr, + "Single bit error detected in SM LRF!"); + } + if ( (lrf_ecc_status & + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp0_pending_f()) || + (lrf_ecc_status & + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp1_pending_f()) || + (lrf_ecc_status & + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp2_pending_f()) || + (lrf_ecc_status & + gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp3_pending_f()) ) { + + gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr, + "Double bit error detected in SM LRF!"); + } + gk20a_writel(g, gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset, + lrf_ecc_status); + + /* Check for SHM ECC errors. */ + shm_ecc_status = gk20a_readl(g, + gr_pri_gpc0_tpc0_sm_shm_ecc_status_r() + offset); + if ((shm_ecc_status & + gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_corrected_shm0_pending_f()) || + (shm_ecc_status & + gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_corrected_shm1_pending_f()) || + (shm_ecc_status & + gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_detected_shm0_pending_f()) || + (shm_ecc_status & + gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_detected_shm1_pending_f()) ) { + + gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr, + "Single bit error detected in SM SHM!"); + } + if ( (shm_ecc_status & + gr_pri_gpc0_tpc0_sm_shm_ecc_status_double_err_detected_shm0_pending_f()) || + (shm_ecc_status & + gr_pri_gpc0_tpc0_sm_shm_ecc_status_double_err_detected_shm1_pending_f()) ) { + + gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr, + "Double bit error detected in SM SHM!"); + } + gk20a_writel(g, gr_pri_gpc0_tpc0_sm_shm_ecc_status_r() + offset, + shm_ecc_status); + + + return ret; +} + +static int gr_gp10b_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc, + bool *post_event) +{ + int ret = 0; + u32 offset = proj_gpc_stride_v() * gpc + + proj_tpc_in_gpc_stride_v() * tpc; + u32 esr; + + gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, ""); + + esr = gk20a_readl(g, + gr_gpc0_tpc0_tex_m_hww_esr_r() + offset); + gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "0x%08x", esr); + + if (esr & gr_gpc0_tpc0_tex_m_hww_esr_ecc_sec_pending_f()) { + gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr, + "Single bit error detected in TEX!"); + } + if (esr & gr_gpc0_tpc0_tex_m_hww_esr_ecc_ded_pending_f()) { + gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr, + "Double bit error detected in TEX!"); + } + + gk20a_writel(g, + gr_gpc0_tpc0_tex_m_hww_esr_r() + offset, + esr); + + return ret; +} + static int gr_gp10b_commit_global_cb_manager(struct gk20a *g, struct channel_gk20a *c, bool patch) { @@ -1154,4 +1254,6 @@ void gp10b_init_gr(struct gpu_ops *gops) gops->gr.init_cyclestats = gr_gp10b_init_cyclestats; gops->gr.set_gpc_tpc_mask = gr_gp10b_set_gpc_tpc_mask; gops->gr.get_access_map = gr_gp10b_get_access_map; + gops->gr.handle_sm_exception = gr_gp10b_handle_sm_exception; + gops->gr.handle_tex_exception = gr_gp10b_handle_tex_exception; } diff --git a/drivers/gpu/nvgpu/gp10b/hw_gr_gp10b.h b/drivers/gpu/nvgpu/gp10b/hw_gr_gp10b.h index 347e530d..9569bb9c 100644 --- a/drivers/gpu/nvgpu/gp10b/hw_gr_gp10b.h +++ b/drivers/gpu/nvgpu/gp10b/hw_gr_gp10b.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014-2015, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2014-2016, NVIDIA CORPORATION. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -466,6 +466,70 @@ static inline u32 gr_pri_gpc0_tpc0_tex_m_tex_subunits_status_r(void) { return 0x00504238; } +static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r(void) +{ + return 0x005046b8; +} +static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp0_pending_f(void) +{ + return 0x10; +} +static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp1_pending_f(void) +{ + return 0x20; +} +static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp2_pending_f(void) +{ + return 0x40; +} +static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp3_pending_f(void) +{ + return 0x80; +} +static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp0_pending_f(void) +{ + return 0x100; +} +static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp1_pending_f(void) +{ + return 0x200; +} +static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp2_pending_f(void) +{ + return 0x400; +} +static inline u32 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp3_pending_f(void) +{ + return 0x800; +} +static inline u32 gr_pri_gpc0_tpc0_sm_shm_ecc_status_r(void) +{ + return 0x005044a0; +} +static inline u32 gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_corrected_shm0_pending_f(void) +{ + return 0x1; +} +static inline u32 gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_corrected_shm1_pending_f(void) +{ + return 0x2; +} +static inline u32 gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_detected_shm0_pending_f(void) +{ + return 0x10; +} +static inline u32 gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_detected_shm1_pending_f(void) +{ + return 0x20; +} +static inline u32 gr_pri_gpc0_tpc0_sm_shm_ecc_status_double_err_detected_shm0_pending_f(void) +{ + return 0x100; +} +static inline u32 gr_pri_gpc0_tpc0_sm_shm_ecc_status_double_err_detected_shm1_pending_f(void) +{ + return 0x200; +} static inline u32 gr_pri_be0_crop_status1_r(void) { return 0x00410134; @@ -3158,6 +3222,14 @@ static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_int_report_f(vo { return 0x10; } +static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_ecc_sec_error_report_f(void) +{ + return 0x20000000; +} +static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_ecc_ded_error_report_f(void) +{ + return 0x40000000; +} static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_pause_report_f(void) { return 0x20; @@ -3174,6 +3246,10 @@ static inline u32 gr_gpcs_tpcs_tpccs_tpc_exception_en_sm_enabled_f(void) { return 0x2; } +static inline u32 gr_gpcs_tpcs_tpccs_tpc_exception_en_tex_enabled_f(void) +{ + return 0x1; +} static inline u32 gr_gpc0_tpc0_tpccs_tpc_exception_en_r(void) { return 0x0050450c; @@ -3210,6 +3286,14 @@ static inline u32 gr_gpc0_tpc0_tpccs_tpc_exception_r(void) { return 0x00504508; } +static inline u32 gr_gpc0_tpc0_tpccs_tpc_exception_tex_v(u32 r) +{ + return (r >> 0) & 0x1; +} +static inline u32 gr_gpc0_tpc0_tpccs_tpc_exception_tex_pending_v(void) +{ + return 0x00000001; +} static inline u32 gr_gpc0_tpc0_tpccs_tpc_exception_sm_v(u32 r) { return (r >> 1) & 0x1; @@ -3322,6 +3406,14 @@ static inline u32 gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f(void) { return 0x10; } +static inline u32 gr_gpc0_tpc0_sm_hww_global_esr_ecc_sec_error_pending_f(void) +{ + return 0x20000000; +} +static inline u32 gr_gpc0_tpc0_sm_hww_global_esr_ecc_ded_error_pending_f(void) +{ + return 0x40000000; +} static inline u32 gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f(void) { return 0x20; @@ -3330,6 +3422,22 @@ static inline u32 gr_gpc0_tpc0_sm_hww_global_esr_single_step_complete_pending_f( { return 0x40; } +static inline u32 gr_gpc0_tpc0_tex_m_hww_esr_r(void) +{ + return 0x00504224; +} +static inline u32 gr_gpc0_tpc0_tex_m_hww_esr_intr_pending_f(void) +{ + return 0x1; +} +static inline u32 gr_gpc0_tpc0_tex_m_hww_esr_ecc_sec_pending_f(void) +{ + return 0x80; +} +static inline u32 gr_gpc0_tpc0_tex_m_hww_esr_ecc_ded_pending_f(void) +{ + return 0x100; +} static inline u32 gr_gpc0_tpc0_sm_hww_warp_esr_r(void) { return 0x00504648; diff --git a/drivers/gpu/nvgpu/gp10b/hw_ltc_gp10b.h b/drivers/gpu/nvgpu/gp10b/hw_ltc_gp10b.h index ea96a9aa..302c2243 100644 --- a/drivers/gpu/nvgpu/gp10b/hw_ltc_gp10b.h +++ b/drivers/gpu/nvgpu/gp10b/hw_ltc_gp10b.h @@ -286,6 +286,14 @@ static inline u32 ltc_ltcs_ltss_intr_r(void) { return 0x0017e20c; } +static inline u32 ltc_ltcs_ltss_intr_ecc_sec_error_pending_f(void) +{ + return 0x100; +} +static inline u32 ltc_ltcs_ltss_intr_ecc_ded_error_pending_f(void) +{ + return 0x200; +} static inline u32 ltc_ltcs_ltss_intr_en_evicted_cb_m(void) { return 0x1 << 20; @@ -294,6 +302,14 @@ static inline u32 ltc_ltcs_ltss_intr_en_illegal_compstat_access_m(void) { return 0x1 << 30; } +static inline u32 ltc_ltcs_ltss_intr_en_ecc_sec_error_enabled_f(void) +{ + return 0x1000000; +} +static inline u32 ltc_ltcs_ltss_intr_en_ecc_ded_error_enabled_f(void) +{ + return 0x2000000; +} static inline u32 ltc_ltc0_lts0_intr_r(void) { return 0x0014040c; diff --git a/drivers/gpu/nvgpu/gp10b/ltc_gp10b.c b/drivers/gpu/nvgpu/gp10b/ltc_gp10b.c index 47992988..d0be86a4 100644 --- a/drivers/gpu/nvgpu/gp10b/ltc_gp10b.c +++ b/drivers/gpu/nvgpu/gp10b/ltc_gp10b.c @@ -136,6 +136,20 @@ static void gp10b_ltc_isr(struct gk20a *g) ltc_intr = gk20a_readl(g, ltc_ltc0_lts0_intr_r() + proj_ltc_stride_v() * ltc + proj_lts_stride_v() * slice); + + /* Detect and handle ECC errors */ + if (ltc_intr & + ltc_ltcs_ltss_intr_ecc_sec_error_pending_f()) { + gk20a_err(dev_from_gk20a(g), + "Single bit error detected in GPU L2!"); + g->ops.mm.l2_flush(g, true); + } + if (ltc_intr & + ltc_ltcs_ltss_intr_ecc_ded_error_pending_f()) { + gk20a_err(dev_from_gk20a(g), + "Double bit error detected in GPU L2!"); + } + gk20a_err(dev_from_gk20a(g), "ltc%d, slice %d: %08x", ltc, slice, ltc_intr); gk20a_writel(g, ltc_ltc0_lts0_intr_r() + @@ -148,10 +162,19 @@ static void gp10b_ltc_isr(struct gk20a *g) static void gp10b_ltc_init_fs_state(struct gk20a *g) { + u32 ltc_intr; + gm20b_ltc_init_fs_state(g); gk20a_writel(g, ltc_ltca_g_axi_pctrl_r(), ltc_ltca_g_axi_pctrl_user_sid_f(TEGRA_SID_GPUB)); + + /* Enable ECC interrupts */ + ltc_intr = gk20a_readl(g, ltc_ltcs_ltss_intr_r()); + ltc_intr |= ltc_ltcs_ltss_intr_en_ecc_sec_error_enabled_f() | + ltc_ltcs_ltss_intr_en_ecc_ded_error_enabled_f(); + gk20a_writel(g, ltc_ltcs_ltss_intr_r(), + ltc_intr); } void gp10b_init_ltc(struct gpu_ops *gops) -- cgit v1.2.2