12 files changed, 867 insertions, 8 deletions
diff --git a/drivers/gpu/nvgpu/Kconfig b/drivers/gpu/nvgpu/Kconfig
index 7dba61a3..07331817 100644
--- a/drivers/gpu/nvgpu/Kconfig
+++ b/drivers/gpu/nvgpu/Kconfig
@@ -143,6 +143,13 @@ config NVGPU_SUPPORT_CDE
        help
          Enable support for extraction of comptags for CDE.
+config NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
+        bool "Support ECC error reporting for Linux"
+        depends on TEGRA_SAFETY
+        default y
+        help
+                Enable support for ECC error reporting for Linux.
 config NVGPU_USE_TEGRA_ALLOC_FD
        bool "Use tegra_alloc_fd() for allocating dma_buf fds for vidmem"
        depends on GK20A && GK20A_VIDMEM
diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile
index 472bf32c..d5ceecb6 100644
--- a/drivers/gpu/nvgpu/Makefile
+++ b/drivers/gpu/nvgpu/Makefile
@@ -98,6 +98,8 @@ nvgpu-y += \
        os/linux/ltc.o \
        os/linux/vpr.o
+nvgpu-$(CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING) += os/linux/sdl.o
 nvgpu-$(CONFIG_GK20A_VIDMEM) += \
        os/linux/dmabuf_vidmem.o
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c
index c3068b76..1a117169 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -1,7 +1,7 @@
 /*
 * GK20A Graphics
 *
- * Copyright (c) 2011-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -39,6 +39,7 @@
 #include <nvgpu/therm.h>
 #include <nvgpu/mc.h>
 #include <nvgpu/channel_sync.h>
+#include <nvgpu/nvgpu_err.h>
 #include <trace/events/gk20a.h>
@@ -525,6 +526,10 @@ static void gk20a_free_cb(struct nvgpu_ref *refcount)
        struct gk20a *g = container_of(refcount,
                struct gk20a, refcount);
+#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
+        nvgpu_deinit_ecc_reporting(g);
+#endif
        nvgpu_log(g, gpu_dbg_shutdown, "Freeing GK20A struct!");
        gk20a_ce_destroy(g);
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
index a7a804d2..110819a9 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -1,7 +1,7 @@
 /*
 * GV11b GPU GR
 *
- * Copyright (c) 2016-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -37,6 +37,7 @@
 #include <nvgpu/bitops.h>
 #include <nvgpu/gk20a.h>
 #include <nvgpu/channel.h>
+#include <nvgpu/nvgpu_err.h>
 #include "gk20a/gr_gk20a.h"
 #include "gk20a/dbg_gpu_gk20a.h"
@@ -61,6 +62,8 @@
 #include <nvgpu/hw/gv11b/hw_pbdma_gv11b.h>
 #include <nvgpu/hw/gv11b/hw_perf_gv11b.h>
+#define SHIFT_8_BITS    8U
 #define GFXP_WFI_TIMEOUT_COUNT_IN_USEC_DEFAULT 100
 /* ecc scrubbing will done in 1 pri read cycle,but for safety used 10 retries */
@@ -224,6 +227,12 @@ static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc,
                }
                g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter +=
                                                        l1_tag_corrected_err_count_delta;
+                nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+                                (gpc << SHIFT_8_BITS) | tpc,
+                                GPU_SM_L1_TAG_ECC_CORRECTED, 0,
+                                g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter);
                gk20a_writel(g,
                        gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r() + offset,
                        0);
@@ -240,6 +249,12 @@ static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc,
                }
                g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter +=
                                                        l1_tag_uncorrected_err_count_delta;
+                nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+                        (gpc << SHIFT_8_BITS) | tpc,
+                        GPU_SM_L1_TAG_ECC_UNCORRECTED, 0,
+                        g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter);
                gk20a_writel(g,
                        gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r() + offset,
                        0);
@@ -335,6 +350,10 @@ static int gr_gv11b_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc,
                }
                g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter +=
                                                        lrf_uncorrected_err_count_delta;
+                nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+                        (gpc << SHIFT_8_BITS) | tpc,
+                        GPU_SM_LRF_ECC_UNCORRECTED, 0,
+                        g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter);
                gk20a_writel(g,
                        gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r() + offset,
                        0);
@@ -497,6 +516,12 @@ static int gr_gv11b_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc,
                }
                g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter +=
                                                        cbu_uncorrected_err_count_delta;
+                nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+                                (gpc << SHIFT_8_BITS) | tpc,
+                                GPU_SM_CBU_ECC_UNCORRECTED,
+                                0, g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter);
                gk20a_writel(g,
                        gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r() + offset,
                        0);
@@ -580,6 +605,10 @@ static int gr_gv11b_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32 tpc,
                }
                g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter +=
                                                        l1_data_uncorrected_err_count_delta;
+                nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
+                                (gpc << SHIFT_8_BITS) | tpc,
+                                GPU_SM_L1_DATA_ECC_UNCORRECTED,
+                                0, g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter);
                gk20a_writel(g,
                        gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r() + offset,
                        0);
@@ -2537,10 +2566,18 @@ static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr)
                if (ecc_status &
                        gr_fecs_falcon_ecc_status_corrected_err_imem_m()) {
+                        nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0,
+                                GPU_FECS_FALCON_IMEM_ECC_CORRECTED,
+                                ecc_addr,
+                                g->ecc.gr.fecs_ecc_corrected_err_count[0].counter);
                        nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected");
                }
                if (ecc_status &
                        gr_fecs_falcon_ecc_status_uncorrected_err_imem_m()) {
+                        nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0,
+                                GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED,
+                                ecc_addr,
+                                g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter);
                        nvgpu_log(g, gpu_dbg_intr,
                                                "imem ecc error uncorrected");
                }
@@ -2550,6 +2587,10 @@ static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr)
                }
                if (ecc_status &
                        gr_fecs_falcon_ecc_status_uncorrected_err_dmem_m()) {
+                        nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0,
+                                GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED,
+                                ecc_addr,
+                                g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter);
                        nvgpu_log(g, gpu_dbg_intr,
                                                "dmem ecc error uncorrected");
                }
diff --git a/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c b/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c
index 5e586ec2..336258a7 100644
--- a/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c
@@ -1,7 +1,7 @@
 /*
 * GV11B PMU
 *
- * Copyright (c) 2016-2018, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -29,6 +29,7 @@
 #include <nvgpu/io.h>
 #include <nvgpu/utils.h>
 #include <nvgpu/gk20a.h>
+#include <nvgpu/nvgpu_err.h>
 #include "gk20a/pmu_gk20a.h"
 #include "gp10b/pmu_gp10b.h"
@@ -354,10 +355,18 @@ void gv11b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0)
                                "pmu ecc interrupt intr1: 0x%x", intr1);
                        if (ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) {
+                                nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0,
+                                        GPU_PMU_FALCON_IMEM_ECC_CORRECTED,
+                                        ecc_addr,
+                                        g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter);
                                nvgpu_log(g, gpu_dbg_intr,
                                        "imem ecc error corrected");
                        }
                        if (ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) {
+                                nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0,
+                                        GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED,
+                                        ecc_addr,
+                                        g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter);
                                nvgpu_log(g, gpu_dbg_intr,
                                        "imem ecc error uncorrected");
                        }
@@ -366,6 +375,10 @@ void gv11b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0)
                                        "dmem ecc error corrected");
                        }
                        if (ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) {
+                                nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0,
+                                        GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED,
+                                        ecc_addr,
+                                        g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter);
                                nvgpu_log(g, gpu_dbg_intr,
                                        "dmem ecc error uncorrected");
                        }
diff --git a/drivers/gpu/nvgpu/include/nvgpu/bug.h b/drivers/gpu/nvgpu/include/nvgpu/bug.h
index 3d139b75..82d641bd 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/bug.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/bug.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -24,6 +24,24 @@
 #ifdef __KERNEL__
 #include <linux/bug.h>
+/*
+ * Define an assert macro that code within nvgpu can use.
+ *
+ * The goal of this macro is for debugging but what that means varies from OS
+ * to OS. On Linux we don't want to BUG() for general driver misbehaving. BUG()
+ * is a very heavy handed tool - in fact there's probably nowhere within the
+ * nvgpu core code where it makes sense to use a BUG() when running under Linux.
+ *
+ * However, on QNX (and POSIX) BUG() will just kill the current process. This
+ * means we can use it for handling bugs in nvgpu.
+ *
+ * As a result this macro varies depending on platform.
+ */
+#define nvgpu_assert(cond)      ((void) WARN_ON(!(cond)))
+#define nvgpu_do_assert_print(g, fmt, arg...)                           \
+        do {                                                            \
+                nvgpu_err(g, fmt, ##arg);                               \
+        } while (false)
 #elif defined(__NVGPU_POSIX__)
 #include <nvgpu/posix/bug.h>
 #else
diff --git a/drivers/gpu/nvgpu/include/nvgpu/log.h b/drivers/gpu/nvgpu/include/nvgpu/log.h
index 70a16762..2bcca335 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/log.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/log.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -80,6 +80,7 @@ void __nvgpu_log_dbg(struct gk20a *g, u64 log_mask,
 #define gpu_dbg_vidmem          BIT(24) /* VIDMEM tracing. */
 #define gpu_dbg_nvlink          BIT(25) /* nvlink Operation tracing. */
 #define gpu_dbg_clk_arb         BIT(26) /* Clk arbiter debugging. */
+#define gpu_dbg_ecc             BIT(27) /* Print ECC Info Logs. */
 #define gpu_dbg_mem             BIT(31) /* memory accesses; very verbose. */
 /**
diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h
new file mode 100644
index 00000000..0595fafb
--- /dev/null
+++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h
@@ -0,0 +1,359 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#ifndef NVGPU_NVGPU_ERR_H
+#define NVGPU_NVGPU_ERR_H
+/**
+ * @file
+ *
+ * Define indices for HW units and errors. Define structures used to carry error
+ * information. Declare prototype for APIs that are used to report GPU HW errors
+ * to the Safety_Services framework.
+ */
+#include <nvgpu/types.h>
+#include <nvgpu/atomic.h>
+struct gk20a;
+/**
+ * @defgroup INDICES_FOR_GPU_HW_UNITS
+ * Macros used to assign unique index to GPU HW units.
+ * @{
+ */
+#define NVGPU_ERR_MODULE_SM                     (0U)
+#define NVGPU_ERR_MODULE_FECS           (1U)
+#define NVGPU_ERR_MODULE_PMU            (2U)
+/**
+ * @}
+ */
+/**
+ * @defgroup LIST_OF_ERRORS_REPORTED_FROM_SM
+ * Macros used to assign unique index to errors reported from the SM unit.
+ * @{
+ */
+#define GPU_SM_L1_TAG_ECC_CORRECTED                     (0U)
+#define GPU_SM_L1_TAG_ECC_UNCORRECTED                   (1U)
+#define GPU_SM_CBU_ECC_UNCORRECTED                      (3U)
+#define GPU_SM_LRF_ECC_UNCORRECTED                      (5U)
+#define GPU_SM_L1_DATA_ECC_UNCORRECTED                  (7U)
+#define GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED           (9U)
+#define GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED           (11U)
+#define GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED      (13U)
+#define GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED         (15U)
+#define GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED        (17U)
+#define GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED      (20U)
+/**
+ * @}
+ */
+/**
+ * @defgroup LIST_OF_ERRORS_REPORTED_FROM_FECS
+ * Macros used to assign unique index to errors reported from the FECS unit.
+ * @{
+ */
+#define GPU_FECS_FALCON_IMEM_ECC_CORRECTED      (0U)
+#define GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED    (1U)
+#define GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED    (3U)
+/**
+ * @}
+ */
+/**
+ * @defgroup LIST_OF_ERRORS_REPORTED_FROM_GPCCS
+ * Macros used to assign unique index to errors reported from the GPCCS unit.
+ * @{
+ */
+#define GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED     (0U)
+#define GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED   (1U)
+#define GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED   (3U)
+/**
+ * @}
+ */
+/**
+ * @defgroup LIST_OF_ERRORS_REPORTED_FROM_MMU
+ * Macros used to assign unique index to errors reported from the MMU unit.
+ * @{
+ */
+#define GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED   (1U)
+#define GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED   (3U)
+/**
+ * @}
+ */
+/**
+ * @defgroup LIST_OF_ERRORS_REPORTED_FROM_GCC
+ * Macros used to assign unique index to errors reported from the GCC unit.
+ * @{
+ */
+#define GPU_GCC_L15_ECC_UNCORRECTED             (1U)
+/**
+ * @}
+ */
+/**
+ * @defgroup LIST_OF_ERRORS_REPORTED_FROM_PMU
+ * Macros used to assign unique index to errors reported from the PMU unit.
+ * @{
+ */
+#define GPU_PMU_FALCON_IMEM_ECC_CORRECTED       (0U)
+#define GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED     (1U)
+#define GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED     (3U)
+/**
+ * @}
+ */
+/**
+ * @defgroup LIST_OF_ERRORS_REPORTED_FROM_LTC
+ * Macros used to assign unique index to errors reported from the LTC unit.
+ * @{
+ */
+#define GPU_LTC_CACHE_DSTG_ECC_CORRECTED        (0U)
+#define GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED      (1U)
+#define GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED      (3U)
+#define GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED   (7U)
+/**
+ * @}
+ */
+/**
+ * @defgroup LIST_OF_ERRORS_REPORTED_FROM_HUBMMU
+ * Macros used to assign unique index to errors reported from the HUBMMU unit.
+ * @{
+ */
+#define GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED        (1U)
+#define GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED          (3U)
+#define GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED             (5U)
+#define GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED            (7U)
+#define GPU_HUBMMU_PAGE_FAULT_ERROR                     (8U)
+#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
+/**
+ * @}
+ */
+/**
+ * nvgpu_err_desc structure holds fields which describe an error along with
+ * function callback which can be used to inject the error.
+ */
+struct nvgpu_err_desc {
+        /** String representation of error. */
+        const char *name;
+        /** Flag to classify an error as critical or non-critical. */
+        bool is_critical;
+        /**
+         * Error Threshold: once this threshold value is reached, then the
+         * corresponding error counter will be reset to 0 and the error will be
+         * propagated to Safety_Services.
+         */
+        int err_threshold;
+        /**
+         * Total number of times an error has occurred (since its last reset).
+         */
+        nvgpu_atomic_t err_count;
+        /** Error ID. */
+        u8 error_id;
+};
+/**
+ * gpu_err_header structure holds fields which are required to identify the
+ * version of header, sub-error type, sub-unit id, error address and time stamp.
+ */
+struct gpu_err_header {
+        /** Version of GPU error header. */
+        struct {
+                /** Major version number. */
+                u16 major;
+                /** Minor version number. */
+                u16 minor;
+        } version;
+        /** Sub error type corresponding to the error that is being reported. */
+        u32 sub_err_type;
+        /** ID of the sub-unit in a HW unit which encountered an error. */
+        u64 sub_unit_id;
+        /** Location of the error. */
+        u64 address;
+        /** Timestamp in nano seconds. */
+        u64 timestamp_ns;
+};
+struct gpu_ecc_error_info {
+        struct gpu_err_header header;
+        /** Number of ECC errors. */
+        u64 err_cnt;
+};
+/**
+ * nvgpu_err_hw_module structure holds fields which describe a HW module's
+ * error reporting capabilities.
+ */
+struct nvgpu_err_hw_module {
+        /** String representation of a given HW unit. */
+        const char *name;
+        /** HW unit ID. */
+        u32 hw_unit;
+        /** Total number of errors reported from a given HW unit. */
+        u32 num_errs;
+        u32 base_ecc_service_id; /**< NOTE(review): presumably the first NVGUARD service ID of this unit - confirm. */
+        /** Used to get error description from look-up table. */
+        struct nvgpu_err_desc *errs;
+};
+struct nvgpu_ecc_reporting_ops {
+        void (*report_ecc_err)(struct gk20a *g, u32 hw_unit, u32 inst,
+                u32 err_id, u64 err_addr, u64 err_count);
+};
+struct nvgpu_ecc_reporting {
+        struct nvgpu_spinlock lock;
+        /* This flag is protected by the above spinlock */
+        bool ecc_reporting_service_enabled;
+        const struct nvgpu_ecc_reporting_ops *ops;
+};
+ /**
+  * This macro is used to initialize the members of nvgpu_err_desc struct.
+  */
+#define GPU_ERR(err, critical, id, threshold, ecount) \
+{                                                                       \
+                .name = (err),                                          \
+                .is_critical = (critical),                              \
+                .error_id = (id),                                       \
+                .err_threshold = (threshold),                           \
+                .err_count = NVGPU_ATOMIC_INIT(ecount),                                 \
+}
+/**
+ * This macro is used to initialize critical errors.
+ */
+#define GPU_CRITERR(err, id, threshold, ecount) \
+        GPU_ERR(err, true, id, threshold, ecount)
+/**
+ * This macro is used to initialize non-critical errors.
+ */
+#define GPU_NONCRITERR(err, id, threshold, ecount) \
+        GPU_ERR(err, false, id, threshold, ecount)
+/**
+ * @brief GPU HW errors need to be reported to Safety_Services via SDL unit.
+ *        This function provides an interface to report ECC errors to SDL unit.
+ *
+ * @param g [in]                - The GPU driver struct.
+ * @param hw_unit [in]          - Index of HW unit.
+ *                                - List of valid HW unit IDs
+ *                                  - NVGPU_ERR_MODULE_SM
+ *                                  - NVGPU_ERR_MODULE_FECS
+ *                                  - NVGPU_ERR_MODULE_GPCCS
+ *                                  - NVGPU_ERR_MODULE_MMU
+ *                                  - NVGPU_ERR_MODULE_GCC
+ *                                  - NVGPU_ERR_MODULE_PMU
+ *                                  - NVGPU_ERR_MODULE_LTC
+ *                                  - NVGPU_ERR_MODULE_HUBMMU
+ * @param inst [in]             - Instance ID.
+ *                                - In case of multiple instances of the same HW
+ *                                  unit (e.g., there are multiple instances of
+ *                                  SM), it is used to identify the instance
+ *                                  that encountered a fault.
+ * @param err_id [in]           - Error index.
+ *                                - For SM:
+ *                                  - Min: GPU_SM_L1_TAG_ECC_CORRECTED
+ *                                  - Max: GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED
+ *                                - For FECS:
+ *                                  - Min: GPU_FECS_FALCON_IMEM_ECC_CORRECTED
+ *                                  - Max: GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED
+ *                                - For GPCCS:
+ *                                  - Min: GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED
+ *                                  - Max: GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED
+ *                                - For MMU:
+ *                                  - Min: GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED
+ *                                  - Max: GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED
+ *                                - For GCC:
+ *                                  - Min: GPU_GCC_L15_ECC_UNCORRECTED
+ *                                  - Max: GPU_GCC_L15_ECC_UNCORRECTED
+ *                                - For PMU:
+ *                                  - Min: GPU_PMU_FALCON_IMEM_ECC_CORRECTED
+ *                                  - Max: GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED
+ *                                - For LTC:
+ *                                  - Min: GPU_LTC_CACHE_DSTG_ECC_CORRECTED
+ *                                  - Max: GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED
+ *                                - For HUBMMU:
+ *                                  - Min: GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED
+ *                                  - Max: GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED
+ * @param err_addr [in]         - Error address.
+ *                                - This is the location at which correctable or
+ *                                  uncorrectable error has occurred.
+ * @param err_count [in]        - Error count.
+ *
+ * - Checks whether SDL is supported in the current GPU platform. If SDL is not
+ *   supported, it simply returns.
+ * - Validates both \a hw_unit and \a err_id indices. In case of a failure,
+ *   invokes #nvgpu_sdl_handle_report_failure() api.
+ * - Gets the current time of a clock. In case of a failure, invokes
+ *   #nvgpu_sdl_handle_report_failure() api.
+ * - Gets error description from internal look-up table using \a hw_unit and
+ *   \a err_id indices.
+ * - Forms error packet using details such as time-stamp, \a hw_unit, \a err_id,
+ *   criticality of the error, \a inst, \a err_addr, \a err_count, error
+ *   description, and size of the error packet.
+ * - Performs compile-time assert check to ensure that the size of the error
+ *   packet does not exceed the maximum allowable size specified in
+ *   #MAX_ERR_MSG_SIZE.
+ *
+ * @return      None
+ */
+void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
+                u32 err_id, u64 err_addr, u64 err_count);
+void nvgpu_init_ecc_reporting(struct gk20a *g);
+void nvgpu_enable_ecc_reporting(struct gk20a *g);
+void nvgpu_disable_ecc_reporting(struct gk20a *g);
+void nvgpu_deinit_ecc_reporting(struct gk20a *g);
+#else
+static inline void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
+                u32 err_id, u64 err_addr, u64 err_count) {
+}
+#endif /* CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING */
+#endif /* NVGPU_NVGPU_ERR_H */
+\ No newline at end of file
diff --git a/drivers/gpu/nvgpu/os/linux/ecc_linux.h b/drivers/gpu/nvgpu/os/linux/ecc_linux.h
new file mode 100644
index 00000000..7e0f650b
--- /dev/null
+++ b/drivers/gpu/nvgpu/os/linux/ecc_linux.h
@@ -0,0 +1,49 @@
+/*
+ *
+ * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#ifndef NVGPU_OS_ECC_LINUX_H
+#define NVGPU_OS_ECC_LINUX_H
+#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
+#include <linux/tegra_l1ss_kernel_interface.h>
+#include <linux/tegra_l1ss_ioctl.h>
+#include <linux/tegra_nv_guard_service_id.h>
+#include <linux/tegra_nv_guard_group_id.h>
+#include <nvgpu/nvgpu_err.h>
+struct nvgpu_ecc_reporting_linux {
+    struct nvgpu_ecc_reporting common;
+    client_param_t priv;
+};
+static inline struct nvgpu_ecc_reporting_linux *get_ecc_reporting_linux(
+    struct nvgpu_ecc_reporting *ecc_report)
+{
+        return container_of(ecc_report, struct nvgpu_ecc_reporting_linux, common);
+}
+#endif /* CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING */
+#endif
+\ No newline at end of file
diff --git a/drivers/gpu/nvgpu/os/linux/module.c b/drivers/gpu/nvgpu/os/linux/module.c
index 807df2ca..fdbab46d 100644
--- a/drivers/gpu/nvgpu/os/linux/module.c
+++ b/drivers/gpu/nvgpu/os/linux/module.c
@@ -1,7 +1,7 @@
 /*
 * GK20A Graphics
 *
- * Copyright (c) 2011-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
@@ -49,6 +49,7 @@
 #include <nvgpu/clk_arb.h>
 #include <nvgpu/timers.h>
 #include <nvgpu/channel.h>
+#include <nvgpu/nvgpu_err.h>
 #include "platform_gk20a.h"
 #include "sysfs.h"
@@ -355,6 +356,10 @@ int gk20a_pm_finalize_poweron(struct device *dev)
                gk20a_init_cde_support(l);
 #endif
+#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
+        nvgpu_enable_ecc_reporting(g);
+#endif
        err = gk20a_sched_ctrl_init(g);
        if (err) {
                nvgpu_err(g, "failed to init sched control");
@@ -364,9 +369,14 @@ int gk20a_pm_finalize_poweron(struct device *dev)
        g->sw_ready = true;
 done:
-        if (err)
+        if (err) {
                g->power_on = false;
+#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
+                nvgpu_disable_ecc_reporting(g);
+#endif
+        }
        nvgpu_mutex_release(&g->power_lock);
        return err;
 }
@@ -433,6 +443,10 @@ static int gk20a_pm_prepare_poweroff(struct device *dev)
        /* Stop CPU from accessing the GPU registers. */
        gk20a_lockout_registers(g);
+#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
+        nvgpu_disable_ecc_reporting(g);
+#endif
        nvgpu_hide_usermode_for_poweroff(g);
        nvgpu_mutex_release(&g->power_lock);
        return 0;
@@ -1382,6 +1396,10 @@ static int gk20a_probe(struct platform_device *dev)
                goto return_err;
        }
+#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
+        nvgpu_init_ecc_reporting(gk20a);
+#endif
        gk20a->nvgpu_reboot_nb.notifier_call =
                nvgpu_kernel_shutdown_notification;
        err = register_reboot_notifier(&gk20a->nvgpu_reboot_nb);
diff --git a/drivers/gpu/nvgpu/os/linux/os_linux.h b/drivers/gpu/nvgpu/os/linux/os_linux.h
index 25c6c03a..adcfdb2f 100644
--- a/drivers/gpu/nvgpu/os/linux/os_linux.h
+++ b/drivers/gpu/nvgpu/os/linux/os_linux.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
@@ -25,6 +25,7 @@
 #include "cde.h"
 #include "sched.h"
+#include "ecc_linux.h"
 struct nvgpu_os_linux_ops {
        struct {
@@ -134,6 +135,10 @@ struct nvgpu_os_linux {
        u64 regs_bus_addr;
+#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
+        struct nvgpu_ecc_reporting_linux ecc_reporting_linux;
+#endif
        struct nvgpu_os_linux_ops ops;
 #ifdef CONFIG_DEBUG_FS
diff --git a/drivers/gpu/nvgpu/os/linux/sdl.c b/drivers/gpu/nvgpu/os/linux/sdl.c
new file mode 100644
index 00000000..c4dccdc6
--- /dev/null
+++ b/drivers/gpu/nvgpu/os/linux/sdl.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright (c) 2021, NVIDIA Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <nvgpu/gk20a.h>
+#include <nvgpu/types.h>
+#include <nvgpu/nvgpu_err.h>
+#include <nvgpu/timers.h>
+#include <nvgpu/bug.h>
+#include "ecc_linux.h"
+#include "os_linux.h"
+#include "module.h"
+/* This look-up table initializes the list of hw units and their errors.
+ * It also specifies the error injection mechanism supported, for each error.
+ * In case of hw error injection support, this initialization will be overridden
+ * by the values provided from the hal layers of corresponding hw units.
+ *
+ * Layout contract relied upon by nvgpu_report_ecc_error_linux():
+ *  - the table is indexed directly by the hw_unit argument, so the order of
+ *    the entries must match the numeric values of NVGPU_ERR_MODULE_*;
+ *  - errs[] is indexed directly by err_id, and the reported service id is
+ *    base_ecc_service_id + err_id, so the order of the descriptors must
+ *    follow the order of the corresponding NVGUARD service ids;
+ *  - num_errs is the bound used to validate err_id and must equal the number
+ *    of descriptors listed in errs[].
+ */
+static struct nvgpu_err_hw_module gv11b_err_lut[] = {
+        {
+                .name = "sm",
+                .hw_unit = (u32)NVGPU_ERR_MODULE_SM,
+                .num_errs = 21U,
+                .base_ecc_service_id =
+                        NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_ECC_CORRECTED,
+                .errs = (struct nvgpu_err_desc[]) {
+                        GPU_NONCRITERR("l1_tag_ecc_corrected",
+                                        GPU_SM_L1_TAG_ECC_CORRECTED, 0, 0),
+                        GPU_CRITERR("l1_tag_ecc_uncorrected",
+                                        GPU_SM_L1_TAG_ECC_UNCORRECTED, 0, 0),
+                        GPU_NONCRITERR("cbu_ecc_corrected", 0, 0, 0),
+                        GPU_CRITERR("cbu_ecc_uncorrected",
+                                        GPU_SM_CBU_ECC_UNCORRECTED, 0, 0),
+                        GPU_NONCRITERR("lrf_ecc_corrected", 0, 0, 0),
+                        GPU_CRITERR("lrf_ecc_uncorrected",
+                                        GPU_SM_LRF_ECC_UNCORRECTED, 0, 0),
+                        GPU_NONCRITERR("l1_data_ecc_corrected", 0, 0, 0),
+                        GPU_CRITERR("l1_data_ecc_uncorrected",
+                                        GPU_SM_L1_DATA_ECC_UNCORRECTED, 0, 0),
+                        GPU_NONCRITERR("icache_l0_data_ecc_corrected", 0, 0, 0),
+                        GPU_CRITERR("icache_l0_data_ecc_uncorrected",
+                                        GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED, 0, 0),
+                        GPU_NONCRITERR("icache_l1_data_ecc_corrected", 0, 0, 0),
+                        GPU_CRITERR("icache_l1_data_ecc_uncorrected",
+                                        GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED, 0, 0),
+                        GPU_NONCRITERR("icache_l0_predecode_ecc_corrected", 0, 0, 0),
+                        GPU_CRITERR("icache_l0_predecode_ecc_uncorrected",
+                                        GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED, 0, 0),
+                        GPU_NONCRITERR("l1_tag_miss_fifo_ecc_corrected", 0, 0, 0),
+                        GPU_CRITERR("l1_tag_miss_fifo_ecc_uncorrected",
+                                        GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED, 0, 0),
+                        GPU_NONCRITERR("l1_tag_s2r_pixprf_ecc_corrected", 0, 0, 0),
+                        GPU_CRITERR("l1_tag_s2r_pixprf_ecc_uncorrected",
+                                        GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED, 0, 0),
+                        GPU_CRITERR("machine_check_error", 0, 0, 0),
+                        GPU_NONCRITERR("icache_l1_predecode_ecc_corrected", 0, 0, 0),
+                        GPU_CRITERR("icache_l1_predecode_ecc_uncorrected",
+                                        GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED, 0, 0),
+                },
+        },
+        {
+                .name = "fecs",
+                .hw_unit = (u32)NVGPU_ERR_MODULE_FECS,
+                .num_errs = 4U,
+                .base_ecc_service_id =
+                        NVGUARD_SERVICE_IGPU_FECS_SWERR_FALCON_IMEM_ECC_CORRECTED,
+                .errs = (struct nvgpu_err_desc[]) {
+                        GPU_NONCRITERR("falcon_imem_ecc_corrected",
+                                        GPU_FECS_FALCON_IMEM_ECC_CORRECTED, 0, 0),
+                        GPU_CRITERR("falcon_imem_ecc_uncorrected",
+                                        GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED, 0, 0),
+                        GPU_NONCRITERR("falcon_dmem_ecc_corrected", 0, 0, 0),
+                        GPU_CRITERR("falcon_dmem_ecc_uncorrected",
+                                        GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED, 0, 0),
+                },
+        },
+        {
+                .name = "pmu",
+                /* Cast kept consistent with the SM and FECS entries above. */
+                .hw_unit = (u32)NVGPU_ERR_MODULE_PMU,
+                .num_errs = 4U,
+                .base_ecc_service_id =
+                        NVGUARD_SERVICE_IGPU_PMU_SWERR_FALCON_IMEM_ECC_CORRECTED,
+                .errs = (struct nvgpu_err_desc[]) {
+                        GPU_NONCRITERR("falcon_imem_ecc_corrected",
+                                        GPU_PMU_FALCON_IMEM_ECC_CORRECTED, 0, 0),
+                        GPU_CRITERR("falcon_imem_ecc_uncorrected",
+                                        GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED, 0, 0),
+                        GPU_NONCRITERR("falcon_dmem_ecc_corrected", 0, 0, 0),
+                        GPU_CRITERR("falcon_dmem_ecc_uncorrected",
+                                        GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED, 0, 0),
+                },
+        },
+};
+/*
+ * Reset the common error-message header to its defaults: protocol version
+ * 1.0 and every per-report field (sub error type, sub unit id, address,
+ * timestamp) cleared. Only the fields listed here are written.
+ */
+static void nvgpu_init_err_msg_header(struct gpu_err_header *header)
+{
+        /* Per-report payload fields start out empty. */
+        header->timestamp_ns = 0UL;
+        header->address = 0UL;
+        header->sub_unit_id = 0UL;
+        header->sub_err_type = 0U;
+        /* Message format version: 1.0 */
+        header->version.minor = (u16)0U;
+        header->version.major = (u16)1U;
+}
+/*
+ * Prepare an ECC error packet for filling in: zero the error count and
+ * reset the embedded header via nvgpu_init_err_msg_header().
+ */
+static void nvgpu_init_ecc_err_msg(struct gpu_ecc_error_info *err_info)
+{
+        struct gpu_err_header *hdr = &err_info->header;
+        err_info->err_cnt = 0UL;
+        nvgpu_init_err_msg_header(hdr);
+}
+/*
+ * Active ECC reporting backend: forward one ECC error to the L1SS service.
+ *
+ * @g:         GPU driver context, used for logging and quiesce.
+ * @hw_unit:   index into gv11b_err_lut selecting the hw module.
+ * @inst:      sub-unit instance, copied into the packet header.
+ * @err_id:    index into the module's errs[]; also added to the module's
+ *             base_ecc_service_id to form the L1SS service id.
+ * @err_addr:  error address, copied into the packet header.
+ * @err_count: error count, copied into the packet.
+ *
+ * Invalid hw_unit/err_id values are logged and dropped. A report is only
+ * submitted once the descriptor's counter passes its threshold (see below).
+ * For critical errors nvgpu_quiesce() is called after the submit attempt,
+ * regardless of whether the submit succeeded.
+ */
+static void nvgpu_report_ecc_error_linux(struct gk20a *g, u32 hw_unit, u32 inst,
+                u32 err_id, u64 err_addr, u64 err_count)
+{
+        int err = 0;
+        u32 s_id = 0;
+        u8 err_status = 0;
+        u8 err_info_size = 0;
+        u64 timestamp = 0ULL;
+        int err_threshold_counter = 0;
+        struct gpu_ecc_error_info err_pkt;
+        struct nvgpu_err_desc *err_desc = NULL;
+        struct nvgpu_err_hw_module *hw_module = NULL;
+        nv_guard_request_t req;
+        memset(&req, 0, sizeof(req));
+        nvgpu_init_ecc_err_msg(&err_pkt);
+        if (hw_unit >= sizeof(gv11b_err_lut)/sizeof(gv11b_err_lut[0])) {
+                /* Log like the err_id check below instead of dropping silently. */
+                nvgpu_err(g, "invalid hw module (%u)", hw_unit);
+                return;
+        }
+        hw_module = &gv11b_err_lut[hw_unit];
+        if (err_id >= hw_module->num_errs) {
+                nvgpu_err(g, "invalid err_id (%u) for hw module (%u)",
+                        err_id, hw_module->hw_unit);
+                return;
+        }
+        /*
+         * The packet is copied verbatim into the request's error_info
+         * buffer; refuse to report rather than overrun that buffer.
+         */
+        if (sizeof(err_pkt) > sizeof(req.srv_status.error_info)) {
+                nvgpu_err(g, "ECC error packet does not fit in L1SS request");
+                return;
+        }
+        err_desc = &hw_module->errs[err_id];
+        timestamp = (u64)nvgpu_current_time_ns();
+        err_pkt.header.timestamp_ns = timestamp;
+        err_pkt.header.sub_unit_id = inst;
+        err_pkt.header.address = err_addr;
+        err_pkt.err_cnt = err_count;
+        err_info_size = sizeof(err_pkt);
+        s_id = hw_module->base_ecc_service_id + err_id;
+        if (err_desc->is_critical) {
+                err_status = NVGUARD_ERROR_DETECTED;
+        } else {
+                err_status = NVGUARD_NO_ERROR;
+        }
+        /*
+         * Threshold handling: bump the per-descriptor counter, then swap it
+         * back to 0 only if it has reached err_threshold + 1. Reporting
+         * proceeds only when the value returned by the cmpxchg equals
+         * err_threshold + 1 (presumably the pre-swap value - confirm against
+         * nvgpu_atomic_cmpxchg); otherwise the error is only counted.
+         */
+        nvgpu_atomic_inc(&err_desc->err_count);
+        err_threshold_counter = nvgpu_atomic_cmpxchg(&err_desc->err_count,
+                        err_desc->err_threshold + 1, 0);
+        if (unlikely(err_threshold_counter != err_desc->err_threshold + 1)) {
+                return;
+        }
+        nvgpu_log(g, gpu_dbg_ecc, "ECC reporting hw: %s, desc:%s, count:%llu",
+                hw_module->name, err_desc->name, err_count);
+        req.srv_id_cmd = NVGUARD_SERVICESTATUS_NOTIFICATION;
+        req.srv_status.srv_id = (nv_guard_service_id_t)s_id;
+        req.srv_status.status = err_status;
+        req.srv_status.timestamp = timestamp;
+        req.srv_status.error_info_size = err_info_size;
+        memcpy(req.srv_status.error_info, (u8*)&err_pkt, err_info_size);
+        /*
+         * l1ss_submit_rq may fail due to kmalloc failures but may pass in
+         * subsequent calls
+         */
+        err = l1ss_submit_rq(&req, true);
+        if (err != 0) {
+                nvgpu_err(g, "Error returned from L1SS submit %d", err);
+        }
+        if (err_desc->is_critical) {
+                nvgpu_quiesce(g);
+        }
+}
+/*
+ * Disabled ECC reporting backend: ignores the error details and only emits
+ * a debug log line. Installed through default_disabled_ecc_report_ops.
+ */
+static void nvgpu_report_ecc_error_empty(struct gk20a *g, u32 hw_unit, u32 inst,
+                u32 err_id, u64 err_addr, u64 err_count)
+{
+        nvgpu_log(g, gpu_dbg_ecc, "ECC reporting empty");
+}
+/*
+ * Ops table used while ECC reporting is disabled: report_ecc_err points at
+ * the no-op backend, which only logs.
+ */
+const struct nvgpu_ecc_reporting_ops default_disabled_ecc_report_ops = {
+        .report_ecc_err = nvgpu_report_ecc_error_empty,
+};
+/*
+ * Ops table used while ECC reporting is enabled: report_ecc_err points at
+ * the backend that submits errors to L1SS.
+ */
+const struct nvgpu_ecc_reporting_ops ecc_enable_report_ops = {
+        .report_ecc_err = nvgpu_report_ecc_error_linux,
+};
+/*
+ * L1SS client callback registered by nvgpu_init_ecc_reporting().
+ *
+ * @param: L1SS_READY switches reporting to the active backend,
+ *         L1SS_NOT_READY switches it back to the no-op backend.
+ * @data:  the struct gk20a pointer stored in priv.data at registration.
+ *
+ * Returns 0 on success, -ENODEV when no usable gk20a reference could be
+ * taken, -EINVAL for any other @param value. A reference on the gk20a is
+ * held for the duration of the state update.
+ */
+static int nvgpu_l1ss_callback(l1ss_cli_callback_param param, void *data)
+{
+        struct gk20a *g = (struct gk20a *)data;
+        struct nvgpu_ecc_reporting *reporting = NULL;
+        int ret = 0;
+        /* Ensure we have a valid gk20a struct before proceeding */
+        if (g == NULL) {
+                return -ENODEV;
+        }
+        if (gk20a_get(g) == NULL) {
+                return -ENODEV;
+        }
+        reporting = &nvgpu_os_linux_from_gk20a(g)->ecc_reporting_linux.common;
+        nvgpu_spinlock_acquire(&reporting->lock);
+        if (param == L1SS_READY) {
+                /* Only act on an actual disabled -> enabled transition. */
+                if (!reporting->ecc_reporting_service_enabled) {
+                        reporting->ecc_reporting_service_enabled = true;
+                        reporting->ops = &ecc_enable_report_ops;
+                        nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is enabled");
+                }
+        } else if (param == L1SS_NOT_READY) {
+                /* Only act on an actual enabled -> disabled transition. */
+                if (reporting->ecc_reporting_service_enabled) {
+                        reporting->ecc_reporting_service_enabled = false;
+                        reporting->ops = &default_disabled_ecc_report_ops;
+                        nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is disabled");
+                }
+        } else {
+                ret = -EINVAL;
+        }
+        nvgpu_spinlock_release(&reporting->lock);
+        gk20a_put(g);
+        return ret;
+}
+/*
+ * Set up ECC reporting state for @g and register with L1SS.
+ *
+ * Initializes the state lock, fills in the L1SS client descriptor (group
+ * id, callback, callback data), installs the no-op reporting backend and
+ * marks the service as not enabled, then registers the client. The active
+ * backend is installed later, by nvgpu_enable_ecc_reporting() or by the
+ * L1SS callback.
+ */
+void nvgpu_init_ecc_reporting(struct gk20a *g)
+{
+        struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
+        struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
+        int err = 0;
+        nvgpu_spinlock_init(&ecc_report_linux->common.lock);
+        ecc_report_linux->priv.id = (NVGUARD_GROUPID_IGPU & NVGUARD_GROUPINDEX_FIELDMASK);
+        ecc_report_linux->priv.cli_callback = nvgpu_l1ss_callback;
+        ecc_report_linux->priv.data = g;
+        ecc_report_linux->common.ops = &default_disabled_ecc_report_ops;
+        /*
+         * Put the flag into a known state before registering: once the
+         * client is registered nvgpu_l1ss_callback() may update it, and a
+         * write of "false" after registration could undo that update. This
+         * also leaves the flag well defined when registration fails.
+         */
+        ecc_report_linux->common.ecc_reporting_service_enabled = false;
+        nvgpu_log(g, gpu_dbg_ecc, "ECC reporting Init");
+        /*
+         * This will invoke the registration API.
+         * err == 0 indicates service is available but not active yet.
+         * err == 1 indicates service is available and active
+         * error for other cases.
+         */
+        err = l1ss_register_client(&ecc_report_linux->priv);
+        if (err == 0) {
+                nvgpu_log(g, gpu_dbg_ecc, "ECC reporting init success");
+        } else if (err == 1) {
+                /* Actual Ops will be replaced during nvgpu_enable_ecc_reporting
+                 * called as part of gk20a_busy()
+                 */
+                nvgpu_spinlock_acquire(&ecc_report_linux->common.lock);
+                ecc_report_linux->common.ecc_reporting_service_enabled = true;
+                nvgpu_spinlock_release(&ecc_report_linux->common.lock);
+        } else {
+                nvgpu_log(g, gpu_dbg_ecc, "ECC reporting init failure %d", err);
+        }
+}
+/*
+ * Tear down ECC reporting for @g.
+ *
+ * Does nothing unless the reporting service is currently marked enabled.
+ * Otherwise clears the flag, deregisters the L1SS client by its id and
+ * zeroes the whole Linux ECC reporting state.
+ *
+ * NOTE(review): the zeroing also clears common.ops and common.lock, so
+ * nvgpu_report_ecc_err() must not be called after this point - confirm
+ * callers guarantee that.
+ * NOTE(review): a client that registered while the service was not yet
+ * active (flag still false) is not deregistered here - confirm whether
+ * that is intended.
+ */
+void nvgpu_deinit_ecc_reporting(struct gk20a *g)
+{
+        struct nvgpu_ecc_reporting_linux *reporting =
+                &nvgpu_os_linux_from_gk20a(g)->ecc_reporting_linux;
+        if (!reporting->common.ecc_reporting_service_enabled) {
+                return;
+        }
+        reporting->common.ecc_reporting_service_enabled = false;
+        l1ss_deregister_client(reporting->priv.id);
+        memset(reporting, 0, sizeof(*reporting));
+        nvgpu_log(g, gpu_dbg_ecc, "ECC reporting de-init success");
+}
+/*
+ * Switch @g to the active ECC reporting backend, but only if the L1SS
+ * service is currently marked enabled; otherwise the installed backend is
+ * left as it is. The check and the switch happen under the state lock.
+ */
+void nvgpu_enable_ecc_reporting(struct gk20a *g)
+{
+        struct nvgpu_ecc_reporting *reporting =
+                &nvgpu_os_linux_from_gk20a(g)->ecc_reporting_linux.common;
+        nvgpu_spinlock_acquire(&reporting->lock);
+        if (reporting->ecc_reporting_service_enabled) {
+                reporting->ops = &ecc_enable_report_ops;
+                nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is enabled");
+        }
+        nvgpu_spinlock_release(&reporting->lock);
+}
+/*
+ * Switch @g back to the no-op ECC reporting backend. This is unconditional:
+ * the service-enabled flag is neither consulted nor changed, only the ops
+ * pointer is replaced, under the state lock.
+ */
+void nvgpu_disable_ecc_reporting(struct gk20a *g)
+{
+        struct nvgpu_ecc_reporting *reporting =
+                &nvgpu_os_linux_from_gk20a(g)->ecc_reporting_linux.common;
+        nvgpu_spinlock_acquire(&reporting->lock);
+        reporting->ops = &default_disabled_ecc_report_ops;
+        nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is disabled");
+        nvgpu_spinlock_release(&reporting->lock);
+}
+/*
+ * Public entry point for reporting an ECC error.
+ *
+ * Picks up the currently installed backend under the state lock and then
+ * calls it with the lock released, passing all arguments through
+ * unchanged. Which backend runs (no-op or L1SS submit) depends on the ops
+ * table installed at that moment.
+ *
+ * If no ops table is installed (ops is NULL, e.g. after
+ * nvgpu_deinit_ecc_reporting() has zeroed the state) the report is dropped
+ * instead of dereferencing a NULL pointer.
+ */
+void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
+                u32 err_id, u64 err_addr, u64 err_count)
+{
+        struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
+        struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
+        struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common;
+        void (*report_ecc_err_func)(struct gk20a *g, u32 hw_unit, u32 inst,
+                u32 err_id, u64 err_addr, u64 err_count) = NULL;
+        nvgpu_spinlock_acquire(&ecc_report_linux->common.lock);
+        if (error_reporting->ops != NULL) {
+                report_ecc_err_func = error_reporting->ops->report_ecc_err;
+        }
+        nvgpu_spinlock_release(&ecc_report_linux->common.lock);
+        /* Call outside the lock: the backend does its own logging/submit. */
+        if (report_ecc_err_func != NULL) {
+                report_ecc_err_func(g, hw_unit, inst, err_id, err_addr, err_count);
+        }
+}