gpu: nvgpu: Add ECC Support for GV11B in Linux

Implement nvgpu plumbing to allow reporting ECC errors (corrected and uncorrected) to an L1SS service (if one exists). This patch includes the following: 1) Adds code that submits ECC error reports from interrupt context directly to an L1SS service on Linux. 2) Adds support for enabling/disabling the error reports via L1SS's registration/deregistration API. Nvgpu simply invokes an empty function until the registration is successful. 3) Adds a spinlock to correctly handle concurrent access to the ops used for submitting requests. 4) Adds error reporting for a subset of interrupts that can be verified via external ECC injection logic. A subsequent patch will add the API for the rest of the interrupts. 5) In case of critical (uncorrected) errors, changes nvgpu's state to the quiesce state. Jira L4T-1187 Bug 200700400 Change-Id: Id31f70531fba355e94e72c4f9762593e7667a11c Signed-off-by: Debarshi Dutta <ddutta@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2530411 Tested-by: Bibek Basu <bbasu@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com> Reviewed-by: Bibek Basu <bbasu@nvidia.com> Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> GVS: Gerrit_Virtual_Submit
author: Debarshi Dutta <ddutta@nvidia.com> 2021-05-17 04:38:25 -0400
committer: mobile promotions <svcmobile_promotions@nvidia.com> 2021-05-28 15:10:24 -0400
commit: 34993e4f7b0d47620e88ba64a6d7c67330d97e35 (patch)
tree: 2136284f5bd4095780884885413bb268fd318a96 /drivers/gpu/nvgpu/gv11b/gr_gv11b.c
parent: 5f88598b9e7b2cfe0387733577ece138a7bc912b (diff)
1 files changed, 42 insertions, 1 deletions
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
index a7a804d2..110819a9 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -1,7 +1,7 @@
 /*
 * GV11b GPU GR
 *
- * Copyright (c) 2016-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -37,6 +37,7 @@
 #include <nvgpu/bitops.h>
 #include <nvgpu/gk20a.h>
 #include <nvgpu/channel.h>
+#include <nvgpu/nvgpu_err.h>
 #include "gk20a/gr_gk20a.h"
 #include "gk20a/dbg_gpu_gk20a.h"
@@ -61,6 +62,8 @@
 #include <nvgpu/hw/gv11b/hw_pbdma_gv11b.h>
 #include <nvgpu/hw/gv11b/hw_perf_gv11b.h>
+#define SHIFT_8_BITS    8U
 #define GFXP_WFI_TIMEOUT_COUNT_IN_USEC_DEFAULT 100
 /* ecc scrubbing will done in 1 pri read cycle,but for safety used 10 retries */
@@ -224,6 +227,12 @@ static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc,
                }
                g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter +=
                                                        l1_tag_corrected_err_count_delta;
+                nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+                                (gpc << SHIFT_8_BITS) | tpc,
+                                GPU_SM_L1_TAG_ECC_CORRECTED, 0,
+                                g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter);
                gk20a_writel(g,
                        gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r() + offset,
                        0);
@@ -240,6 +249,12 @@ static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc,
                }
                g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter +=
                                                        l1_tag_uncorrected_err_count_delta;
+                nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+                        (gpc << SHIFT_8_BITS) | tpc,
+                        GPU_SM_L1_TAG_ECC_UNCORRECTED, 0,
+                        g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter);
                gk20a_writel(g,
                        gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r() + offset,
                        0);
@@ -335,6 +350,10 @@ static int gr_gv11b_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc,
                }
                g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter +=
                                                        lrf_uncorrected_err_count_delta;
+                nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+                        (gpc << SHIFT_8_BITS) | tpc,
+                        GPU_SM_LRF_ECC_UNCORRECTED, 0,
+                        g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter);
                gk20a_writel(g,
                        gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r() + offset,
                        0);
@@ -497,6 +516,12 @@ static int gr_gv11b_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc,
                }
                g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter +=
                                                        cbu_uncorrected_err_count_delta;
+                nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+                                (gpc << SHIFT_8_BITS) | tpc,
+                                GPU_SM_CBU_ECC_UNCORRECTED,
+                                0, g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter);
                gk20a_writel(g,
                        gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r() + offset,
                        0);
@@ -580,6 +605,10 @@ static int gr_gv11b_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32 tpc,
                }
                g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter +=
                                                        l1_data_uncorrected_err_count_delta;
+                nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+                                (gpc << SHIFT_8_BITS) | tpc,
+                                GPU_SM_L1_DATA_ECC_UNCORRECTED,
+                                0, g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter);
                gk20a_writel(g,
                        gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r() + offset,
                        0);
@@ -2537,10 +2566,18 @@ static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr)
                if (ecc_status &
                        gr_fecs_falcon_ecc_status_corrected_err_imem_m()) {
+                        nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0,
+                                GPU_FECS_FALCON_IMEM_ECC_CORRECTED,
+                                ecc_addr,
+                                g->ecc.gr.fecs_ecc_corrected_err_count[0].counter);
                        nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected");
                }
                if (ecc_status &
                        gr_fecs_falcon_ecc_status_uncorrected_err_imem_m()) {
+                        nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0,
+                                GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED,
+                                ecc_addr,
+                                g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter);
                        nvgpu_log(g, gpu_dbg_intr,
                                                "imem ecc error uncorrected");
                }
@@ -2550,6 +2587,10 @@ static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr)
                }
                if (ecc_status &
                        gr_fecs_falcon_ecc_status_uncorrected_err_dmem_m()) {
+                        nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0,
+                                GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED,
+                                ecc_addr,
+                                g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter);
                        nvgpu_log(g, gpu_dbg_intr,
                                                "dmem ecc error uncorrected");
                }
author	Debarshi Dutta <ddutta@nvidia.com>	2021-05-17 04:38:25 -0400
committer	mobile promotions <svcmobile_promotions@nvidia.com>	2021-05-28 15:10:24 -0400
commit	34993e4f7b0d47620e88ba64a6d7c67330d97e35 (patch)
tree	2136284f5bd4095780884885413bb268fd318a96 /drivers/gpu/nvgpu/gv11b/gr_gv11b.c
parent	5f88598b9e7b2cfe0387733577ece138a7bc912b (diff)