summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
diff options
context:
space:
mode:
authorDebarshi Dutta <ddutta@nvidia.com>2021-05-17 04:38:25 -0400
committermobile promotions <svcmobile_promotions@nvidia.com>2021-05-28 15:10:24 -0400
commit34993e4f7b0d47620e88ba64a6d7c67330d97e35 (patch)
tree2136284f5bd4095780884885413bb268fd318a96 /drivers/gpu/nvgpu/gv11b/gr_gv11b.c
parent5f88598b9e7b2cfe0387733577ece138a7bc912b (diff)
gpu: nvgpu: Add ECC Support for GV11B in Linux
Implement nvgpu plumbing to allow reporting ECC errors (corrected and uncorrected) to an L1SS service (if one exists). This patch includes the following: 1) Adds code that submits ECC error reports directly from interrupt context to an L1SS service on Linux. 2) Adds support for enabling/disabling the error reports via L1SS's registration/deregistration API. Nvgpu simply invokes an empty function until the registration is successful. 3) Adds a spinlock to correctly handle concurrent access to the ops used for submitting requests. 4) Adds error reporting for a subset of interrupts that can be verified via external ECC injection logic. A subsequent patch will add the API for the rest of the interrupts. 5) In case of critical (uncorrected) errors, changes nvgpu's state to the quiesce state. Jira L4T-1187 Bug 200700400 Change-Id: Id31f70531fba355e94e72c4f9762593e7667a11c Signed-off-by: Debarshi Dutta <ddutta@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2530411 Tested-by: Bibek Basu <bbasu@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com> Reviewed-by: Bibek Basu <bbasu@nvidia.com> Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> GVS: Gerrit_Virtual_Submit
Diffstat (limited to 'drivers/gpu/nvgpu/gv11b/gr_gv11b.c')
-rw-r--r--drivers/gpu/nvgpu/gv11b/gr_gv11b.c43
1 files changed, 42 insertions, 1 deletions
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
index a7a804d2..110819a9 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * GV11b GPU GR 2 * GV11b GPU GR
3 * 3 *
4 * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. 4 * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
5 * 5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a 6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"), 7 * copy of this software and associated documentation files (the "Software"),
@@ -37,6 +37,7 @@
37#include <nvgpu/bitops.h> 37#include <nvgpu/bitops.h>
38#include <nvgpu/gk20a.h> 38#include <nvgpu/gk20a.h>
39#include <nvgpu/channel.h> 39#include <nvgpu/channel.h>
40#include <nvgpu/nvgpu_err.h>
40 41
41#include "gk20a/gr_gk20a.h" 42#include "gk20a/gr_gk20a.h"
42#include "gk20a/dbg_gpu_gk20a.h" 43#include "gk20a/dbg_gpu_gk20a.h"
@@ -61,6 +62,8 @@
61#include <nvgpu/hw/gv11b/hw_pbdma_gv11b.h> 62#include <nvgpu/hw/gv11b/hw_pbdma_gv11b.h>
62#include <nvgpu/hw/gv11b/hw_perf_gv11b.h> 63#include <nvgpu/hw/gv11b/hw_perf_gv11b.h>
63 64
65#define SHIFT_8_BITS 8U
66
64#define GFXP_WFI_TIMEOUT_COUNT_IN_USEC_DEFAULT 100 67#define GFXP_WFI_TIMEOUT_COUNT_IN_USEC_DEFAULT 100
65 68
66/* ecc scrubbing will done in 1 pri read cycle,but for safety used 10 retries */ 69/* ecc scrubbing will done in 1 pri read cycle,but for safety used 10 retries */
@@ -224,6 +227,12 @@ static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc,
224 } 227 }
225 g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter += 228 g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter +=
226 l1_tag_corrected_err_count_delta; 229 l1_tag_corrected_err_count_delta;
230
231 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
232 (gpc << SHIFT_8_BITS) | tpc,
233 GPU_SM_L1_TAG_ECC_CORRECTED, 0,
234 g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter);
235
227 gk20a_writel(g, 236 gk20a_writel(g,
228 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r() + offset, 237 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r() + offset,
229 0); 238 0);
@@ -240,6 +249,12 @@ static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc,
240 } 249 }
241 g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter += 250 g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter +=
242 l1_tag_uncorrected_err_count_delta; 251 l1_tag_uncorrected_err_count_delta;
252
253 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
254 (gpc << SHIFT_8_BITS) | tpc,
255 GPU_SM_L1_TAG_ECC_UNCORRECTED, 0,
256 g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter);
257
243 gk20a_writel(g, 258 gk20a_writel(g,
244 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r() + offset, 259 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r() + offset,
245 0); 260 0);
@@ -335,6 +350,10 @@ static int gr_gv11b_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc,
335 } 350 }
336 g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter += 351 g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter +=
337 lrf_uncorrected_err_count_delta; 352 lrf_uncorrected_err_count_delta;
353 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
354 (gpc << SHIFT_8_BITS) | tpc,
355 GPU_SM_LRF_ECC_UNCORRECTED, 0,
356 g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter);
338 gk20a_writel(g, 357 gk20a_writel(g,
339 gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r() + offset, 358 gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r() + offset,
340 0); 359 0);
@@ -497,6 +516,12 @@ static int gr_gv11b_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc,
497 } 516 }
498 g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter += 517 g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter +=
499 cbu_uncorrected_err_count_delta; 518 cbu_uncorrected_err_count_delta;
519
520 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
521 (gpc << SHIFT_8_BITS) | tpc,
522 GPU_SM_CBU_ECC_UNCORRECTED,
523 0, g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter);
524
500 gk20a_writel(g, 525 gk20a_writel(g,
501 gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r() + offset, 526 gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r() + offset,
502 0); 527 0);
@@ -580,6 +605,10 @@ static int gr_gv11b_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32 tpc,
580 } 605 }
581 g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter += 606 g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter +=
582 l1_data_uncorrected_err_count_delta; 607 l1_data_uncorrected_err_count_delta;
608 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
609 (gpc << SHIFT_8_BITS) | tpc,
610 GPU_SM_L1_DATA_ECC_UNCORRECTED,
611 0, g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter);
583 gk20a_writel(g, 612 gk20a_writel(g,
584 gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r() + offset, 613 gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r() + offset,
585 0); 614 0);
@@ -2537,10 +2566,18 @@ static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr)
2537 2566
2538 if (ecc_status & 2567 if (ecc_status &
2539 gr_fecs_falcon_ecc_status_corrected_err_imem_m()) { 2568 gr_fecs_falcon_ecc_status_corrected_err_imem_m()) {
2569 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0,
2570 GPU_FECS_FALCON_IMEM_ECC_CORRECTED,
2571 ecc_addr,
2572 g->ecc.gr.fecs_ecc_corrected_err_count[0].counter);
2540 nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected"); 2573 nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected");
2541 } 2574 }
2542 if (ecc_status & 2575 if (ecc_status &
2543 gr_fecs_falcon_ecc_status_uncorrected_err_imem_m()) { 2576 gr_fecs_falcon_ecc_status_uncorrected_err_imem_m()) {
2577 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0,
2578 GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED,
2579 ecc_addr,
2580 g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter);
2544 nvgpu_log(g, gpu_dbg_intr, 2581 nvgpu_log(g, gpu_dbg_intr,
2545 "imem ecc error uncorrected"); 2582 "imem ecc error uncorrected");
2546 } 2583 }
@@ -2550,6 +2587,10 @@ static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr)
2550 } 2587 }
2551 if (ecc_status & 2588 if (ecc_status &
2552 gr_fecs_falcon_ecc_status_uncorrected_err_dmem_m()) { 2589 gr_fecs_falcon_ecc_status_uncorrected_err_dmem_m()) {
2590 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0,
2591 GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED,
2592 ecc_addr,
2593 g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter);
2553 nvgpu_log(g, gpu_dbg_intr, 2594 nvgpu_log(g, gpu_dbg_intr,
2554 "dmem ecc error uncorrected"); 2595 "dmem ecc error uncorrected");
2555 } 2596 }