1 files changed, 42 insertions, 1 deletions
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
index a7a804d2..110819a9 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -1,7 +1,7 @@
 /*
 * GV11b GPU GR
 *
- * Copyright (c) 2016-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -37,6 +37,7 @@
 #include <nvgpu/bitops.h>
 #include <nvgpu/gk20a.h>
 #include <nvgpu/channel.h>
+#include <nvgpu/nvgpu_err.h>
 #include "gk20a/gr_gk20a.h"
 #include "gk20a/dbg_gpu_gk20a.h"
@@ -61,6 +62,8 @@
 #include <nvgpu/hw/gv11b/hw_pbdma_gv11b.h>
 #include <nvgpu/hw/gv11b/hw_perf_gv11b.h>
+#define SHIFT_8_BITS    8U
 #define GFXP_WFI_TIMEOUT_COUNT_IN_USEC_DEFAULT 100
 /* ecc scrubbing will done in 1 pri read cycle,but for safety used 10 retries */
@@ -224,6 +227,12 @@ static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc,
                }
                g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter +=
                                                        l1_tag_corrected_err_count_delta;
+                nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+                                (gpc << SHIFT_8_BITS) | tpc,
+                                GPU_SM_L1_TAG_ECC_CORRECTED, 0,
+                                g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter);
                gk20a_writel(g,
                        gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r() + offset,
                        0);
@@ -240,6 +249,12 @@ static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc,
                }
                g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter +=
                                                        l1_tag_uncorrected_err_count_delta;
+                nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+                        (gpc << SHIFT_8_BITS) | tpc,
+                        GPU_SM_L1_TAG_ECC_UNCORRECTED, 0,
+                        g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter);
                gk20a_writel(g,
                        gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r() + offset,
                        0);
@@ -335,6 +350,10 @@ static int gr_gv11b_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc,
                }
                g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter +=
                                                        lrf_uncorrected_err_count_delta;
+                nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+                        (gpc << SHIFT_8_BITS) | tpc,
+                        GPU_SM_LRF_ECC_UNCORRECTED, 0,
+                        g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter);
                gk20a_writel(g,
                        gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r() + offset,
                        0);
@@ -497,6 +516,12 @@ static int gr_gv11b_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc,
                }
                g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter +=
                                                        cbu_uncorrected_err_count_delta;
+                nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+                                (gpc << SHIFT_8_BITS) | tpc,
+                                GPU_SM_CBU_ECC_UNCORRECTED,
+                                0, g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter);
                gk20a_writel(g,
                        gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r() + offset,
                        0);
@@ -580,6 +605,10 @@ static int gr_gv11b_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32 tpc,
                }
                g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter +=
                                                        l1_data_uncorrected_err_count_delta;
+                nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+                                (gpc << SHIFT_8_BITS) | tpc,
+                                GPU_SM_L1_DATA_ECC_UNCORRECTED,
+                                0, g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter);
                gk20a_writel(g,
                        gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r() + offset,
                        0);
@@ -2537,10 +2566,18 @@ static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr)
                if (ecc_status &
                        gr_fecs_falcon_ecc_status_corrected_err_imem_m()) {
+                        nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0,
+                                GPU_FECS_FALCON_IMEM_ECC_CORRECTED,
+                                ecc_addr,
+                                g->ecc.gr.fecs_ecc_corrected_err_count[0].counter);
                        nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected");
                }
                if (ecc_status &
                        gr_fecs_falcon_ecc_status_uncorrected_err_imem_m()) {
+                        nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0,
+                                GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED,
+                                ecc_addr,
+                                g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter);
                        nvgpu_log(g, gpu_dbg_intr,
                                                "imem ecc error uncorrected");
                }
@@ -2550,6 +2587,10 @@ static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr)
                }
                if (ecc_status &
                        gr_fecs_falcon_ecc_status_uncorrected_err_dmem_m()) {
+                        nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0,
+                                GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED,
+                                ecc_addr,
+                                g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter);
                        nvgpu_log(g, gpu_dbg_intr,
                                                "dmem ecc error uncorrected");
                }