From 49be5d49292c9c853f5b6ad53c32d59f866322ec Mon Sep 17 00:00:00 2001 From: Deepak Goyal Date: Wed, 15 Nov 2017 11:40:54 +0530 Subject: gpu: nvgpu: gv11b: implement ecc scrubber Check the availability of ecc units by checking relevant ecc fuse and fuse overrides. During gpu boot, initialize ecc units by scrubbing individual ecc units available. ECC initialization should be done before gr initialization. Following ecc units are scrubbed: SM LRF SM L1 DATA SM L1 TAG SM CBU SM ICACHE Bug 200339497 Change-Id: I54bf8cc1fce639a9993bf80984dafc28dca0dba3 Signed-off-by: Deepak Goyal Signed-off-by: seshendra Gadagottu Reviewed-on: https://git-master.nvidia.com/r/1612734 Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/gv11b/gr_gv11b.c | 185 ++++++++++++++++++++++++++++++++++++ drivers/gpu/nvgpu/gv11b/gr_gv11b.h | 1 + drivers/gpu/nvgpu/gv11b/gv11b.c | 120 +++++++++++++++++++++++ drivers/gpu/nvgpu/gv11b/gv11b.h | 1 + drivers/gpu/nvgpu/gv11b/hal_gv11b.c | 1 + 5 files changed, 308 insertions(+) (limited to 'drivers/gpu/nvgpu/gv11b') diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c index 033d83d5..8514cc1e 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c @@ -44,6 +44,7 @@ #include "gv11b/gr_gv11b.h" #include "gv11b/mm_gv11b.h" #include "gv11b/subctx_gv11b.h" +#include "gv11b/gv11b.h" #include #include @@ -57,6 +58,10 @@ #define GFXP_WFI_TIMEOUT_COUNT_IN_USEC_DEFAULT 1000 +/* ecc scrubbing should be done in 1 pri read cycle, but for safety the scrub status is polled with a retry timeout of ECC_SCRUBBING_TIMEOUT_MAX / ECC_SCRUBBING_TIMEOUT_DEFAULT and a delay of ECC_SCRUBBING_TIMEOUT_DEFAULT between reads */ +#define ECC_SCRUBBING_TIMEOUT_MAX 1000 +#define ECC_SCRUBBING_TIMEOUT_DEFAULT 10 + bool gr_gv11b_is_valid_class(struct gk20a *g, u32 class_num) { bool valid = false; @@ -3674,3 +3679,183 @@ unsigned long gr_gv11b_get_max_gfxp_wfi_timeout_count(struct gk20a *g) /* 100 msec in usec count */ return (100 * 1000UL); } +/* Poll scrub_reg until (value & scrub_mask) == scrub_done. Returns 0 as soon as they match, or -ETIMEDOUT once the retry timeout has expired without a match. */ +static int gr_gv11b_ecc_scrub_is_done(struct gk20a *g, + u32 scrub_reg, u32 scrub_mask, u32 scrub_done) +{ + 
struct nvgpu_timeout timeout; + int status = 0; + u32 val; + + nvgpu_timeout_init(g, &timeout, + ECC_SCRUBBING_TIMEOUT_MAX / + ECC_SCRUBBING_TIMEOUT_DEFAULT, + NVGPU_TIMER_RETRY_TIMER); + do { + val = gk20a_readl(g, scrub_reg); + if ((val & scrub_mask) == scrub_done) + goto exit; + nvgpu_udelay(ECC_SCRUBBING_TIMEOUT_DEFAULT); + } while (!nvgpu_timeout_expired(&timeout)); + + if (nvgpu_timeout_peek_expired(&timeout)) + status = -ETIMEDOUT; +exit: + return status; + +} +/* Scrub SM LRF ECC. Logs and returns 0 if NVGPU_ECC_ENABLED_SM_LRF is not set; otherwise writes the qrfdp0..qrfdp7 scrub task bits to the gpcs_tpcs control register in one write and polls the gpc0_tpc0 control register until those bits read back as init. Returns 0 or -ETIMEDOUT. NOTE(review): completion is checked on gpc0_tpc0 only -- confirm that covers the other GPC/TPC instances. */ +static int gr_gv11b_ecc_scrub_sm_lrf(struct gk20a *g) +{ + u32 scrub_mask, scrub_done; + + if (!nvgpu_is_enabled(g, NVGPU_ECC_ENABLED_SM_LRF)) { + nvgpu_log_info(g, "ECC SM LRF is disabled"); + return 0; + } + + nvgpu_log_info(g, "gr_gv11b_ecc_scrub_sm_lrf"); + scrub_mask = + (gr_pri_gpcs_tpcs_sm_lrf_ecc_control_scrub_qrfdp0_task_f() | + gr_pri_gpcs_tpcs_sm_lrf_ecc_control_scrub_qrfdp1_task_f() | + gr_pri_gpcs_tpcs_sm_lrf_ecc_control_scrub_qrfdp2_task_f() | + gr_pri_gpcs_tpcs_sm_lrf_ecc_control_scrub_qrfdp3_task_f() | + gr_pri_gpcs_tpcs_sm_lrf_ecc_control_scrub_qrfdp4_task_f() | + gr_pri_gpcs_tpcs_sm_lrf_ecc_control_scrub_qrfdp5_task_f() | + gr_pri_gpcs_tpcs_sm_lrf_ecc_control_scrub_qrfdp6_task_f() | + gr_pri_gpcs_tpcs_sm_lrf_ecc_control_scrub_qrfdp7_task_f()); + + /* Issue scrub lrf regions with single write command */ + gk20a_writel(g, gr_pri_gpcs_tpcs_sm_lrf_ecc_control_r(), scrub_mask); + + scrub_done = + (gr_pri_gpc0_tpc0_sm_lrf_ecc_control_scrub_qrfdp0_init_f() | + gr_pri_gpc0_tpc0_sm_lrf_ecc_control_scrub_qrfdp1_init_f() | + gr_pri_gpc0_tpc0_sm_lrf_ecc_control_scrub_qrfdp2_init_f() | + gr_pri_gpc0_tpc0_sm_lrf_ecc_control_scrub_qrfdp3_init_f() | + gr_pri_gpc0_tpc0_sm_lrf_ecc_control_scrub_qrfdp4_init_f() | + gr_pri_gpc0_tpc0_sm_lrf_ecc_control_scrub_qrfdp5_init_f() | + gr_pri_gpc0_tpc0_sm_lrf_ecc_control_scrub_qrfdp6_init_f() | + gr_pri_gpc0_tpc0_sm_lrf_ecc_control_scrub_qrfdp7_init_f()); + + return gr_gv11b_ecc_scrub_is_done(g, + gr_pri_gpc0_tpc0_sm_lrf_ecc_control_r(), + scrub_mask, 
scrub_done); +} +/* Scrub SM L1 data ECC. Logs and returns 0 if NVGPU_ECC_ENABLED_SM_L1_DATA is not set; otherwise writes the el1_0/el1_1 scrub task bits to the gpcs_tpcs control register and polls the gpc0_tpc0 control register until those bits read back as init. Returns 0 or -ETIMEDOUT. */ +static int gr_gv11b_ecc_scrub_sm_l1_data(struct gk20a *g) +{ + u32 scrub_mask, scrub_done; + + if (!nvgpu_is_enabled(g, NVGPU_ECC_ENABLED_SM_L1_DATA)) { + nvgpu_log_info(g, "ECC L1DATA is disabled"); + return 0; + } + nvgpu_log_info(g, "gr_gv11b_ecc_scrub_sm_l1_data"); + scrub_mask = + (gr_pri_gpcs_tpcs_sm_l1_data_ecc_control_scrub_el1_0_task_f() | + gr_pri_gpcs_tpcs_sm_l1_data_ecc_control_scrub_el1_1_task_f()); + + gk20a_writel(g, gr_pri_gpcs_tpcs_sm_l1_data_ecc_control_r(), + scrub_mask); + + scrub_done = + (gr_pri_gpc0_tpc0_sm_l1_data_ecc_control_scrub_el1_0_init_f() | + gr_pri_gpc0_tpc0_sm_l1_data_ecc_control_scrub_el1_1_init_f()); + return gr_gv11b_ecc_scrub_is_done(g, + gr_pri_gpc0_tpc0_sm_l1_data_ecc_control_r(), + scrub_mask, scrub_done); +} +/* Scrub SM L1 tag ECC. Logs and returns 0 if NVGPU_ECC_ENABLED_SM_L1_TAG is not set; otherwise writes the el1_0/el1_1 scrub task bits to the gpcs_tpcs control register and polls the gpc0_tpc0 control register until those bits read back as init. Returns 0 or -ETIMEDOUT. */ +static int gr_gv11b_ecc_scrub_sm_l1_tag(struct gk20a *g) +{ + u32 scrub_mask, scrub_done; + + if (!nvgpu_is_enabled(g, NVGPU_ECC_ENABLED_SM_L1_TAG)) { + nvgpu_log_info(g, "ECC L1TAG is disabled"); + return 0; + } + nvgpu_log_info(g, "gr_gv11b_ecc_scrub_sm_l1_tag"); + scrub_mask = + (gr_pri_gpcs_tpcs_sm_l1_tag_ecc_control_scrub_el1_0_task_f() | + gr_pri_gpcs_tpcs_sm_l1_tag_ecc_control_scrub_el1_1_task_f()); + gk20a_writel(g, gr_pri_gpcs_tpcs_sm_l1_tag_ecc_control_r(), scrub_mask); + + scrub_done = + (gr_pri_gpc0_tpc0_sm_l1_tag_ecc_control_scrub_el1_0_init_f() | + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_control_scrub_el1_1_init_f()); + return gr_gv11b_ecc_scrub_is_done(g, + gr_pri_gpc0_tpc0_sm_l1_tag_ecc_control_r(), + scrub_mask, scrub_done); +} +/* Scrub SM CBU ECC. Logs and returns 0 if NVGPU_ECC_ENABLED_SM_CBU is not set; otherwise writes the warp and barrier scrub task bits for sm0 and sm1 to the gpcs_tpcs control register and polls the gpc0_tpc0 control register until those bits read back as init. Returns 0 or -ETIMEDOUT. */ +static int gr_gv11b_ecc_scrub_sm_cbu(struct gk20a *g) +{ + u32 scrub_mask, scrub_done; + + if (!nvgpu_is_enabled(g, NVGPU_ECC_ENABLED_SM_CBU)) { + nvgpu_log_info(g, "ECC CBU is disabled"); + return 0; + } + nvgpu_log_info(g, "gr_gv11b_ecc_scrub_sm_cbu"); + scrub_mask = + (gr_pri_gpcs_tpcs_sm_cbu_ecc_control_scrub_warp_sm0_task_f() | + gr_pri_gpcs_tpcs_sm_cbu_ecc_control_scrub_warp_sm1_task_f() | + gr_pri_gpcs_tpcs_sm_cbu_ecc_control_scrub_barrier_sm0_task_f() | + 
gr_pri_gpcs_tpcs_sm_cbu_ecc_control_scrub_barrier_sm1_task_f()); + gk20a_writel(g, gr_pri_gpcs_tpcs_sm_cbu_ecc_control_r(), scrub_mask); + + scrub_done = + (gr_pri_gpc0_tpc0_sm_cbu_ecc_control_scrub_warp_sm0_init_f() | + gr_pri_gpc0_tpc0_sm_cbu_ecc_control_scrub_warp_sm1_init_f() | + gr_pri_gpc0_tpc0_sm_cbu_ecc_control_scrub_barrier_sm0_init_f() | + gr_pri_gpc0_tpc0_sm_cbu_ecc_control_scrub_barrier_sm1_init_f()); + return gr_gv11b_ecc_scrub_is_done(g, + gr_pri_gpc0_tpc0_sm_cbu_ecc_control_r(), + scrub_mask, scrub_done); +} +/* Scrub SM ICACHE ECC. Logs and returns 0 if NVGPU_ECC_ENABLED_SM_ICACHE is not set; otherwise writes the l0/l1 data and predecode scrub task bits to the gpcs_tpcs control register and polls the gpc0_tpc0 control register until those bits read back as init. Returns 0 or -ETIMEDOUT. */ +static int gr_gv11b_ecc_scrub_sm_icahe(struct gk20a *g) +{ + u32 scrub_mask, scrub_done; + + if (!nvgpu_is_enabled(g, NVGPU_ECC_ENABLED_SM_ICACHE)) { + nvgpu_log_info(g, "ECC ICAHE is disabled"); + return 0; + } + nvgpu_log_info(g, "gr_gv11b_ecc_scrub_sm_icahe"); + scrub_mask = + (gr_pri_gpcs_tpcs_sm_icache_ecc_control_scrub_l0_data_task_f() | + gr_pri_gpcs_tpcs_sm_icache_ecc_control_scrub_l0_predecode_task_f() | + gr_pri_gpcs_tpcs_sm_icache_ecc_control_scrub_l1_data_task_f() | + gr_pri_gpcs_tpcs_sm_icache_ecc_control_scrub_l1_predecode_task_f()); + gk20a_writel(g, gr_pri_gpcs_tpcs_sm_icache_ecc_control_r(), scrub_mask); + + scrub_done = + (gr_pri_gpc0_tpc0_sm_icache_ecc_control_scrub_l0_data_init_f() | + gr_pri_gpc0_tpc0_sm_icache_ecc_control_scrub_l0_predecode_init_f() | + gr_pri_gpc0_tpc0_sm_icache_ecc_control_scrub_l1_data_init_f() | + gr_pri_gpc0_tpc0_sm_icache_ecc_control_scrub_l1_predecode_init_f()); + return gr_gv11b_ecc_scrub_is_done(g, + gr_pri_gpc0_tpc0_sm_icache_ecc_control_r(), + scrub_mask, scrub_done); +} +/* Entry point installed as the ecc_init_scrub_reg op: detects which ECC units are enabled, then scrubs SM LRF, L1 data, L1 tag, CBU and ICACHE in that order. A failed scrub only logs a warning; the remaining units are still scrubbed and nothing is returned to the caller. */ +void gr_gv11b_ecc_init_scrub_reg(struct gk20a *g) +{ + nvgpu_log_fn(g, "ecc srub start "); + + gv11b_detect_ecc_enabled_units(g); + + if (gr_gv11b_ecc_scrub_sm_lrf(g)) + nvgpu_warn(g, "ECC SCRUB SM LRF Failed"); + if (gr_gv11b_ecc_scrub_sm_l1_data(g)) + nvgpu_warn(g, "ECC SCRUB SM L1 DATA Failed"); + if (gr_gv11b_ecc_scrub_sm_l1_tag(g)) + nvgpu_warn(g, "ECC SCRUB SM L1 TAG Failed"); + if (gr_gv11b_ecc_scrub_sm_cbu(g)) + 
nvgpu_warn(g, "ECC SCRUB SM CBU Failed"); + if (gr_gv11b_ecc_scrub_sm_icahe(g)) + nvgpu_warn(g, "ECC SCRUB SM ICACHE Failed"); + +} diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h index 7c56f62d..39d12b3f 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h @@ -216,5 +216,6 @@ void gr_gv11b_init_gpc_mmu(struct gk20a *g); int gr_gv11b_init_preemption_state(struct gk20a *g); void gr_gv11b_init_gfxp_wfi_timeout_count(struct gk20a *g); unsigned long gr_gv11b_get_max_gfxp_wfi_timeout_count(struct gk20a *g); +void gr_gv11b_ecc_init_scrub_reg(struct gk20a *g); #endif diff --git a/drivers/gpu/nvgpu/gv11b/gv11b.c b/drivers/gpu/nvgpu/gv11b/gv11b.c index 211755e5..a62e49fb 100644 --- a/drivers/gpu/nvgpu/gv11b/gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/gv11b.c @@ -26,8 +26,128 @@ #include #include "gk20a/gk20a.h" +#include "gp10b/gp10b.h" #include "gv11b/gv11b.h" +#include +#include +/* Set the NVGPU_ECC_ENABLED_* flags from three register reads: fuse_opt_ecc_en, fuse_opt_feature_fuses_override_disable and gr_fecs_feature_override_ecc. If the override-disable fuse is nonzero, a nonzero opt_ecc_en enables every unit. Otherwise each unit is enabled by its override value when that unit's override field is set, and by opt_ecc_en when it is not. Flags are only ever set to true here, never cleared. NOTE(review): the SM ICACHE check applies the ..._ecc_1_* field accessors to the value read from gr_fecs_feature_override_ecc_r() -- confirm that is the intended register. */ +void gv11b_detect_ecc_enabled_units(struct gk20a *g) +{ + u32 opt_ecc_en = gk20a_readl(g, fuse_opt_ecc_en_r()); + u32 opt_feature_fuses_override_disable = + gk20a_readl(g, + fuse_opt_feature_fuses_override_disable_r()); + u32 fecs_feature_override_ecc = + gk20a_readl(g, + gr_fecs_feature_override_ecc_r()); + + if (opt_feature_fuses_override_disable) { + if (opt_ecc_en) { + __nvgpu_set_enabled(g, + NVGPU_ECC_ENABLED_SM_LRF, true); + __nvgpu_set_enabled(g, + NVGPU_ECC_ENABLED_SM_L1_DATA, true); + __nvgpu_set_enabled(g, + NVGPU_ECC_ENABLED_SM_L1_TAG, true); + __nvgpu_set_enabled(g, + NVGPU_ECC_ENABLED_SM_ICACHE, true); + __nvgpu_set_enabled(g, NVGPU_ECC_ENABLED_LTC, true); + __nvgpu_set_enabled(g, NVGPU_ECC_ENABLED_SM_CBU, true); + } + } else { + /* SM LRF */ + if (gr_fecs_feature_override_ecc_sm_lrf_override_v( + fecs_feature_override_ecc)) { + if (gr_fecs_feature_override_ecc_sm_lrf_v( + fecs_feature_override_ecc)) { + __nvgpu_set_enabled(g, + NVGPU_ECC_ENABLED_SM_LRF, true); + } + } else { + if (opt_ecc_en) { + 
__nvgpu_set_enabled(g, + NVGPU_ECC_ENABLED_SM_LRF, true); + } + } + /* SM L1 DATA*/ + if (gr_fecs_feature_override_ecc_sm_l1_data_override_v( + fecs_feature_override_ecc)) { + if (gr_fecs_feature_override_ecc_sm_l1_data_v( + fecs_feature_override_ecc)) { + __nvgpu_set_enabled(g, + NVGPU_ECC_ENABLED_SM_L1_DATA, true); + } + } else { + if (opt_ecc_en) { + __nvgpu_set_enabled(g, + NVGPU_ECC_ENABLED_SM_L1_DATA, true); + } + } + /* SM L1 TAG*/ + if (gr_fecs_feature_override_ecc_sm_l1_tag_override_v( + fecs_feature_override_ecc)) { + if (gr_fecs_feature_override_ecc_sm_l1_tag_v( + fecs_feature_override_ecc)) { + __nvgpu_set_enabled(g, + NVGPU_ECC_ENABLED_SM_L1_TAG, true); + } + } else { + if (opt_ecc_en) { + __nvgpu_set_enabled(g, + NVGPU_ECC_ENABLED_SM_L1_TAG, true); + } + } + /* SM ICACHE*/ + if (gr_fecs_feature_override_ecc_1_sm_l0_icache_override_v( + fecs_feature_override_ecc) && + gr_fecs_feature_override_ecc_1_sm_l1_icache_override_v( + fecs_feature_override_ecc)) { + if (gr_fecs_feature_override_ecc_1_sm_l0_icache_v( + fecs_feature_override_ecc) && + gr_fecs_feature_override_ecc_1_sm_l1_icache_v( + fecs_feature_override_ecc)) { + __nvgpu_set_enabled(g, + NVGPU_ECC_ENABLED_SM_ICACHE, true); + } + } else { + if (opt_ecc_en) { + __nvgpu_set_enabled(g, + NVGPU_ECC_ENABLED_SM_ICACHE, true); + } + } + /* LTC */ + if (gr_fecs_feature_override_ecc_ltc_override_v( + fecs_feature_override_ecc)) { + if (gr_fecs_feature_override_ecc_ltc_v( + fecs_feature_override_ecc)) { + __nvgpu_set_enabled(g, + NVGPU_ECC_ENABLED_LTC, true); + } + } else { + if (opt_ecc_en) { + __nvgpu_set_enabled(g, + NVGPU_ECC_ENABLED_LTC, true); + } + } + /* SM CBU */ + if (gr_fecs_feature_override_ecc_sm_cbu_override_v( + fecs_feature_override_ecc)) { + if (gr_fecs_feature_override_ecc_sm_cbu_v( + fecs_feature_override_ecc)) { + __nvgpu_set_enabled(g, + NVGPU_ECC_ENABLED_SM_CBU, true); + } + } else { + if (opt_ecc_en) { + __nvgpu_set_enabled(g, + NVGPU_ECC_ENABLED_SM_CBU, true); + } + } + } +} + + int 
gv11b_init_gpu_characteristics(struct gk20a *g) { diff --git a/drivers/gpu/nvgpu/gv11b/gv11b.h b/drivers/gpu/nvgpu/gv11b/gv11b.h index 3d5490e6..17dfa7aa 100644 --- a/drivers/gpu/nvgpu/gv11b/gv11b.h +++ b/drivers/gpu/nvgpu/gv11b/gv11b.h @@ -27,6 +27,7 @@ #include "gk20a/gk20a.h" +void gv11b_detect_ecc_enabled_units(struct gk20a *g); int gv11b_init_gpu_characteristics(struct gk20a *g); #endif /* GV11B_H */ diff --git a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c index f6bdf6e5..65cae8de 100644 --- a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c @@ -392,6 +392,7 @@ static const struct gpu_ops gv11b_ops = { gr_gv11b_init_gfxp_wfi_timeout_count, .get_max_gfxp_wfi_timeout_count = gr_gv11b_get_max_gfxp_wfi_timeout_count, + .ecc_init_scrub_reg = gr_gv11b_ecc_init_scrub_reg, }, .fb = { .reset = gv11b_fb_reset, -- cgit v1.2.2