1 files changed, 341 insertions, 0 deletions
diff --git a/drivers/gpu/nvgpu/os/linux/sdl.c b/drivers/gpu/nvgpu/os/linux/sdl.c
new file mode 100644
index 00000000..c4dccdc6
--- /dev/null
+++ b/drivers/gpu/nvgpu/os/linux/sdl.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright (c) 2021, NVIDIA Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <nvgpu/gk20a.h>
+#include <nvgpu/types.h>
+#include <nvgpu/nvgpu_err.h>
+#include <nvgpu/timers.h>
+#include <nvgpu/bug.h>
+#include "ecc_linux.h"
+#include "os_linux.h"
+#include "module.h"
+/* This look-up table initializes the list of hw units and their errors.
+ * It also specifies the error injection mechanism supported, for each error.
+ * In case of hw error injection support, this initialization will be overridden
+ * by the values provided from the hal layers of corresponding hw units.
+ */
+static struct nvgpu_err_hw_module gv11b_err_lut[] = {
+        /*
+         * Entries are looked up by position: the reporting path indexes this
+         * array with hw_unit and each .errs array with err_id, and derives
+         * the service id as base_ecc_service_id + err_id. Keep the order of
+         * the modules and of their descriptors stable, and keep .num_errs in
+         * sync with the number of descriptors listed.
+         */
+        {
+                .name = "sm",
+                .hw_unit = (u32)NVGPU_ERR_MODULE_SM,
+                .num_errs = 21U,
+                .base_ecc_service_id =
+                        NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_ECC_CORRECTED,
+                .errs = (struct nvgpu_err_desc[]) {
+                        GPU_NONCRITERR("l1_tag_ecc_corrected",
+                                        GPU_SM_L1_TAG_ECC_CORRECTED, 0, 0),
+                        GPU_CRITERR("l1_tag_ecc_uncorrected",
+                                        GPU_SM_L1_TAG_ECC_UNCORRECTED, 0, 0),
+                        GPU_NONCRITERR("cbu_ecc_corrected", 0, 0, 0),
+                        GPU_CRITERR("cbu_ecc_uncorrected",
+                                        GPU_SM_CBU_ECC_UNCORRECTED, 0, 0),
+                        GPU_NONCRITERR("lrf_ecc_corrected", 0, 0, 0),
+                        GPU_CRITERR("lrf_ecc_uncorrected",
+                                        GPU_SM_LRF_ECC_UNCORRECTED, 0, 0),
+                        GPU_NONCRITERR("l1_data_ecc_corrected", 0, 0, 0),
+                        GPU_CRITERR("l1_data_ecc_uncorrected",
+                                        GPU_SM_L1_DATA_ECC_UNCORRECTED, 0, 0),
+                        GPU_NONCRITERR("icache_l0_data_ecc_corrected", 0, 0, 0),
+                        GPU_CRITERR("icache_l0_data_ecc_uncorrected",
+                                        GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED, 0, 0),
+                        GPU_NONCRITERR("icache_l1_data_ecc_corrected", 0, 0, 0),
+                        GPU_CRITERR("icache_l1_data_ecc_uncorrected",
+                                        GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED, 0, 0),
+                        GPU_NONCRITERR("icache_l0_predecode_ecc_corrected", 0, 0, 0),
+                        GPU_CRITERR("icache_l0_predecode_ecc_uncorrected",
+                                        GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED, 0, 0),
+                        GPU_NONCRITERR("l1_tag_miss_fifo_ecc_corrected", 0, 0, 0),
+                        GPU_CRITERR("l1_tag_miss_fifo_ecc_uncorrected",
+                                        GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED, 0, 0),
+                        GPU_NONCRITERR("l1_tag_s2r_pixprf_ecc_corrected", 0, 0, 0),
+                        GPU_CRITERR("l1_tag_s2r_pixprf_ecc_uncorrected",
+                                        GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED, 0, 0),
+                        GPU_CRITERR("machine_check_error", 0, 0, 0),
+                        GPU_NONCRITERR("icache_l1_predecode_ecc_corrected", 0, 0, 0),
+                        GPU_CRITERR("icache_l1_predecode_ecc_uncorrected",
+                                        GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED, 0, 0),
+                },
+        },
+        {
+                .name = "fecs",
+                .hw_unit = (u32)NVGPU_ERR_MODULE_FECS,
+                .num_errs = 4U,
+                .base_ecc_service_id =
+                        NVGUARD_SERVICE_IGPU_FECS_SWERR_FALCON_IMEM_ECC_CORRECTED,
+                .errs = (struct nvgpu_err_desc[]) {
+                        GPU_NONCRITERR("falcon_imem_ecc_corrected",
+                                        GPU_FECS_FALCON_IMEM_ECC_CORRECTED, 0, 0),
+                        GPU_CRITERR("falcon_imem_ecc_uncorrected",
+                                        GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED, 0, 0),
+                        GPU_NONCRITERR("falcon_dmem_ecc_corrected", 0, 0, 0),
+                        GPU_CRITERR("falcon_dmem_ecc_uncorrected",
+                                        GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED, 0, 0),
+                },
+        },
+        {
+                .name = "pmu",
+                /* Cast added to match the SM and FECS entries above */
+                .hw_unit = (u32)NVGPU_ERR_MODULE_PMU,
+                .num_errs = 4U,
+                .base_ecc_service_id =
+                        NVGUARD_SERVICE_IGPU_PMU_SWERR_FALCON_IMEM_ECC_CORRECTED,
+                .errs = (struct nvgpu_err_desc[]) {
+                        GPU_NONCRITERR("falcon_imem_ecc_corrected",
+                                        GPU_PMU_FALCON_IMEM_ECC_CORRECTED, 0, 0),
+                        GPU_CRITERR("falcon_imem_ecc_uncorrected",
+                                        GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED, 0, 0),
+                        GPU_NONCRITERR("falcon_dmem_ecc_corrected", 0, 0, 0),
+                        GPU_CRITERR("falcon_dmem_ecc_uncorrected",
+                                        GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED, 0, 0),
+                },
+        },
+};
+/*
+ * Put an error message header into its initial state: version 1.0, with the
+ * sub-error type, sub-unit id, address and timestamp all cleared.
+ */
+static void nvgpu_init_err_msg_header(struct gpu_err_header *header)
+{
+        /* Payload-describing fields start out empty */
+        header->timestamp_ns = 0UL;
+        header->address = 0UL;
+        header->sub_unit_id = 0UL;
+        header->sub_err_type = 0U;
+        /* Message format version 1.0 */
+        header->version.minor = (u16)0U;
+        header->version.major = (u16)1U;
+}
+/*
+ * Initialize an ECC error message: clear the error count and reset the
+ * embedded header through nvgpu_init_err_msg_header().
+ */
+static void nvgpu_init_ecc_err_msg(struct gpu_ecc_error_info *err_info)
+{
+        err_info->err_cnt = 0UL;
+        nvgpu_init_err_msg_header(&err_info->header);
+}
+/*
+ * Report one ECC error to the L1SS service.
+ *
+ * g:         GPU context, used for logging and for quiescing on critical errors.
+ * hw_unit:   index into gv11b_err_lut selecting the hw module.
+ * inst:      stored in the packet header as the sub-unit id.
+ * err_id:    index into the selected module's error descriptor array; also
+ *            added to the module's base service id to form the service id.
+ * err_addr:  stored in the packet header as the address.
+ * err_count: stored in the packet as the error count.
+ *
+ * An out-of-range hw_unit or err_id is logged and the report is dropped.
+ * Each call bumps the descriptor's counter; the error is only forwarded when
+ * the counter reaches err_threshold + 1, at which point it is reset to 0.
+ * After a forwarded critical error the GPU is quiesced, whether or not the
+ * submission to L1SS succeeded.
+ */
+static void nvgpu_report_ecc_error_linux(struct gk20a *g, u32 hw_unit, u32 inst,
+                u32 err_id, u64 err_addr, u64 err_count)
+{
+        int err = 0;
+        u32 s_id = 0;
+        u8 err_status = 0;
+        u8 err_info_size = 0;
+        u64 timestamp = 0ULL;
+        int err_threshold_counter = 0;
+        struct gpu_ecc_error_info err_pkt;
+        struct nvgpu_err_desc *err_desc = NULL;
+        struct nvgpu_err_hw_module *hw_module = NULL;
+        nv_guard_request_t req;
+        memset(&req, 0, sizeof(req));
+        /*
+         * Zero the whole packet, padding included: it is copied byte-for-byte
+         * into the request below, so uninitialized stack must not leak out.
+         */
+        memset(&err_pkt, 0, sizeof(err_pkt));
+        nvgpu_init_ecc_err_msg(&err_pkt);
+        if (hw_unit >= sizeof(gv11b_err_lut)/sizeof(gv11b_err_lut[0])) {
+                nvgpu_err(g, "invalid hw module (%u)", hw_unit);
+                return;
+        }
+        hw_module = &gv11b_err_lut[hw_unit];
+        if (err_id >= hw_module->num_errs) {
+                nvgpu_err(g, "invalid err_id (%u) for hw module (%u)",
+                        err_id, hw_module->hw_unit);
+                return;
+        }
+        err_desc = &hw_module->errs[err_id];
+        timestamp = (u64)nvgpu_current_time_ns();
+        err_pkt.header.timestamp_ns = timestamp;
+        err_pkt.header.sub_unit_id = inst;
+        err_pkt.header.address = err_addr;
+        err_pkt.err_cnt = err_count;
+        /*
+         * NOTE(review): err_info_size is a u8; this assumes sizeof(err_pkt)
+         * fits in 8 bits and in req.srv_status.error_info - confirm against
+         * the L1SS headers.
+         */
+        err_info_size = sizeof(err_pkt);
+        s_id = hw_module->base_ecc_service_id + err_id;
+        if (err_desc->is_critical) {
+                err_status = NVGUARD_ERROR_DETECTED;
+        } else {
+                err_status = NVGUARD_NO_ERROR;
+        }
+        /*
+         * Count this occurrence, then atomically reset the counter to 0 if it
+         * has just reached threshold + 1. Only the caller that performs the
+         * reset goes on to report.
+         */
+        nvgpu_atomic_inc(&err_desc->err_count);
+        err_threshold_counter = nvgpu_atomic_cmpxchg(&err_desc->err_count,
+                        err_desc->err_threshold + 1, 0);
+        if (unlikely(err_threshold_counter != err_desc->err_threshold + 1)) {
+                return;
+        }
+        nvgpu_log(g, gpu_dbg_ecc, "ECC reporting hw: %s, desc:%s, count:%llu",
+                hw_module->name, err_desc->name, err_count);
+        req.srv_id_cmd = NVGUARD_SERVICESTATUS_NOTIFICATION;
+        req.srv_status.srv_id = (nv_guard_service_id_t)s_id;
+        req.srv_status.status = err_status;
+        req.srv_status.timestamp = timestamp;
+        req.srv_status.error_info_size = err_info_size;
+        memcpy(req.srv_status.error_info, (u8*)&err_pkt, err_info_size);
+        /*
+         * l1ss_submit_rq may fail due to kmalloc failures but may pass in
+         * subsequent calls
+         */
+        err = l1ss_submit_rq(&req, true);
+        if (err != 0) {
+                nvgpu_err(g, "Error returned from L1SS submit %d", err);
+        }
+        if (err_desc->is_critical) {
+                nvgpu_quiesce(g);
+        }
+}
+/*
+ * No-op reporter: ignores the error details and only emits a debug log.
+ * Installed through default_disabled_ecc_report_ops.
+ */
+static void nvgpu_report_ecc_error_empty(struct gk20a *g, u32 hw_unit, u32 inst,
+                u32 err_id, u64 err_addr, u64 err_count)
+{
+        nvgpu_log(g, gpu_dbg_ecc, "ECC reporting empty");
+}
+/* Ops used while ECC reporting is off: reports are logged and discarded. */
+const struct nvgpu_ecc_reporting_ops default_disabled_ecc_report_ops = {
+        .report_ecc_err = nvgpu_report_ecc_error_empty,
+};
+/* Ops used while ECC reporting is on: reports are forwarded to L1SS. */
+const struct nvgpu_ecc_reporting_ops ecc_enable_report_ops = {
+        .report_ecc_err = nvgpu_report_ecc_error_linux,
+};
+/*
+ * L1SS client callback: turns ECC reporting on when the service reports
+ * L1SS_READY and off when it reports L1SS_NOT_READY.
+ *
+ * data is expected to be the struct gk20a pointer. Returns 0 on success,
+ * -ENODEV when data is NULL or no reference on the GPU can be taken, and
+ * -EINVAL for any other param value.
+ */
+static int nvgpu_l1ss_callback(l1ss_cli_callback_param param, void *data)
+{
+        struct gk20a *g = (struct gk20a *)data;
+        struct nvgpu_ecc_reporting *reporting = NULL;
+        int ret = 0;
+        if (g == NULL) {
+                return -ENODEV;
+        }
+        /* Hold a reference on the GPU for the duration of the callback */
+        if (gk20a_get(g) == NULL) {
+                return -ENODEV;
+        }
+        reporting = &nvgpu_os_linux_from_gk20a(g)->ecc_reporting_linux.common;
+        nvgpu_spinlock_acquire(&reporting->lock);
+        if (param == L1SS_READY) {
+                /* Only act on an actual disabled -> enabled transition */
+                if (!reporting->ecc_reporting_service_enabled) {
+                        reporting->ecc_reporting_service_enabled = true;
+                        reporting->ops = &ecc_enable_report_ops;
+                        nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is enabled");
+                }
+        } else if (param == L1SS_NOT_READY) {
+                /* Only act on an actual enabled -> disabled transition */
+                if (reporting->ecc_reporting_service_enabled) {
+                        reporting->ecc_reporting_service_enabled = false;
+                        reporting->ops = &default_disabled_ecc_report_ops;
+                        nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is disabled");
+                }
+        } else {
+                ret = -EINVAL;
+        }
+        nvgpu_spinlock_release(&reporting->lock);
+        gk20a_put(g);
+        return ret;
+}
+/*
+ * Set up ECC reporting state for this GPU and register it as an L1SS client.
+ *
+ * Reporting starts out disabled with the no-op ops installed. The
+ * registration result decides whether the service is already marked enabled;
+ * the real ops are only installed later, by nvgpu_enable_ecc_reporting() or
+ * by the L1SS callback.
+ */
+void nvgpu_init_ecc_reporting(struct gk20a *g)
+{
+        struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
+        struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
+        int err = 0;
+        nvgpu_spinlock_init(&ecc_report_linux->common.lock);
+        ecc_report_linux->priv.id = (NVGUARD_GROUPID_IGPU & NVGUARD_GROUPINDEX_FIELDMASK);
+        ecc_report_linux->priv.cli_callback = nvgpu_l1ss_callback;
+        ecc_report_linux->priv.data = g;
+        /*
+         * Establish the disabled state before registering. The callback
+         * updates the flag and ops under the lock, so writing the flag after
+         * registration without the lock could overwrite its update.
+         * NOTE(review): assumes L1SS may invoke the callback as soon as
+         * registration completes - confirm.
+         */
+        ecc_report_linux->common.ecc_reporting_service_enabled = false;
+        ecc_report_linux->common.ops = &default_disabled_ecc_report_ops;
+        nvgpu_log(g, gpu_dbg_ecc, "ECC reporting Init");
+        /*
+         * This will invoke the registration API.
+         * err == 0 indicates service is available but not active yet.
+         * err == 1 indicates service is available and active
+         * error for other cases.
+         */
+        err = l1ss_register_client(&ecc_report_linux->priv);
+        if (err == 0) {
+                /* Flag stays as set above, or as updated by the callback */
+                nvgpu_log(g, gpu_dbg_ecc, "ECC reporting init success");
+        } else if (err == 1) {
+                nvgpu_spinlock_acquire(&ecc_report_linux->common.lock);
+                ecc_report_linux->common.ecc_reporting_service_enabled = true;
+                nvgpu_spinlock_release(&ecc_report_linux->common.lock);
+                /* Actual Ops will be replaced during nvgpu_enable_ecc_reporting
+                 * called as part of gk20a_busy()
+                 */
+        } else {
+                nvgpu_log(g, gpu_dbg_ecc, "ECC reporting init failure %d", err);
+        }
+}
+/*
+ * Tear down ECC reporting: if the service is marked enabled, switch back to
+ * the no-op ops, deregister from L1SS and clear the client registration data.
+ *
+ * Nothing is done when the service is not marked enabled.
+ */
+void nvgpu_deinit_ecc_reporting(struct gk20a *g)
+{
+        struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
+        struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
+        if (!ecc_report_linux->common.ecc_reporting_service_enabled) {
+                return;
+        }
+        /*
+         * Leave the common state usable instead of zeroing it: a NULL ops
+         * pointer or a wiped lock would break any nvgpu_report_ecc_err()
+         * call that arrives after de-init.
+         */
+        nvgpu_spinlock_acquire(&ecc_report_linux->common.lock);
+        ecc_report_linux->common.ecc_reporting_service_enabled = false;
+        ecc_report_linux->common.ops = &default_disabled_ecc_report_ops;
+        nvgpu_spinlock_release(&ecc_report_linux->common.lock);
+        l1ss_deregister_client(ecc_report_linux->priv.id);
+        /* Only the client registration data is cleared */
+        memset(&ecc_report_linux->priv, 0, sizeof(ecc_report_linux->priv));
+        nvgpu_log(g, gpu_dbg_ecc, "ECC reporting de-init success");
+}
+/*
+ * Install the real ECC reporting ops, but only when the service is marked
+ * enabled; otherwise the current ops are left as they are.
+ */
+void nvgpu_enable_ecc_reporting(struct gk20a *g)
+{
+        struct nvgpu_ecc_reporting *reporting =
+                &nvgpu_os_linux_from_gk20a(g)->ecc_reporting_linux.common;
+        nvgpu_spinlock_acquire(&reporting->lock);
+        if (reporting->ecc_reporting_service_enabled) {
+                reporting->ops = &ecc_enable_report_ops;
+                nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is enabled");
+        }
+        nvgpu_spinlock_release(&reporting->lock);
+}
+/*
+ * Unconditionally switch to the no-op ECC reporting ops. The service-enabled
+ * flag is not touched.
+ */
+void nvgpu_disable_ecc_reporting(struct gk20a *g)
+{
+        struct nvgpu_ecc_reporting *reporting =
+                &nvgpu_os_linux_from_gk20a(g)->ecc_reporting_linux.common;
+        nvgpu_spinlock_acquire(&reporting->lock);
+        reporting->ops = &default_disabled_ecc_report_ops;
+        nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is disabled");
+        nvgpu_spinlock_release(&reporting->lock);
+}
+/*
+ * Entry point for ECC error reports: forwards the report to whichever
+ * reporting op is currently installed.
+ *
+ * The op pointer is sampled under the lock and called after the lock is
+ * released, so the reporter itself does not run with the lock held. If no
+ * ops are installed the report is dropped.
+ */
+void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
+                u32 err_id, u64 err_addr, u64 err_count)
+{
+        struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
+        struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
+        struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common;
+        void (*report_ecc_err_func)(struct gk20a *g, u32 hw_unit, u32 inst,
+                u32 err_id, u64 err_addr, u64 err_count) = NULL;
+        nvgpu_spinlock_acquire(&ecc_report_linux->common.lock);
+        /* ops is NULL before init and after a de-init that zeroes the state */
+        if (error_reporting->ops != NULL) {
+                report_ecc_err_func = error_reporting->ops->report_ecc_err;
+        }
+        nvgpu_spinlock_release(&ecc_report_linux->common.lock);
+        if (report_ecc_err_func == NULL) {
+                return;
+        }
+        report_ecc_err_func(g, hw_unit, inst, err_id, err_addr, err_count);
+}

diff --git a/drivers/gpu/nvgpu/os/linux/sdl.c b/drivers/gpu/nvgpu/os/linux/sdl.c new file mode 100644 index 00000000..c4dccdc6 --- /dev/null +++ b/drivers/gpu/nvgpu/os/linux/sdl.c
@@ -0,0 +1,341 @@
	1	/*
	2	* Copyright (c) 2021, NVIDIA Corporation. All rights reserved.
	3	*
	4	* This program is free software; you can redistribute it and/or modify it
	5	* under the terms and conditions of the GNU General Public License,
	6	* version 2, as published by the Free Software Foundation.
	7	*
	8	* This program is distributed in the hope it will be useful, but WITHOUT
	9	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
	10	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
	11	* more details.
	12	*
	13	* You should have received a copy of the GNU General Public License
	14	* along with this program. If not, see <http://www.gnu.org/licenses/>.
	15	*/
	16
	17	#include <nvgpu/gk20a.h>
	18	#include <nvgpu/types.h>
	19	#include <nvgpu/nvgpu_err.h>
	20	#include <nvgpu/timers.h>
	21	#include <nvgpu/bug.h>
	22
	23	#include "ecc_linux.h"
	24	#include "os_linux.h"
	25	#include "module.h"
	26
	27	/* This look-up table initializes the list of hw units and their errors.
	28	* It also specifies the error injection mechanism supported, for each error.
	29	* In case of hw error injection support, this initialization will be overridden
	30	* by the values provided from the hal layers of corresponding hw units.
	31	*/
	32	static struct nvgpu_err_hw_module gv11b_err_lut[] = {
	33	{
	34	.name = "sm",
	35	.hw_unit = (u32)NVGPU_ERR_MODULE_SM,
	36	.num_errs = 21U,
	37	.base_ecc_service_id =
	38	NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_ECC_CORRECTED,
	39	.errs = (struct nvgpu_err_desc[]) {
	40	GPU_NONCRITERR("l1_tag_ecc_corrected",
	41	GPU_SM_L1_TAG_ECC_CORRECTED, 0, 0),
	42	GPU_CRITERR("l1_tag_ecc_uncorrected",
	43	GPU_SM_L1_TAG_ECC_UNCORRECTED, 0, 0),
	44	GPU_NONCRITERR("cbu_ecc_corrected", 0, 0, 0),
	45	GPU_CRITERR("cbu_ecc_uncorrected",
	46	GPU_SM_CBU_ECC_UNCORRECTED, 0, 0),
	47	GPU_NONCRITERR("lrf_ecc_corrected", 0, 0, 0),
	48	GPU_CRITERR("lrf_ecc_uncorrected",
	49	GPU_SM_LRF_ECC_UNCORRECTED, 0, 0),
	50	GPU_NONCRITERR("l1_data_ecc_corrected", 0, 0, 0),
	51	GPU_CRITERR("l1_data_ecc_uncorrected",
	52	GPU_SM_L1_DATA_ECC_UNCORRECTED, 0, 0),
	53	GPU_NONCRITERR("icache_l0_data_ecc_corrected", 0, 0, 0),
	54	GPU_CRITERR("icache_l0_data_ecc_uncorrected",
	55	GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED, 0, 0),
	56	GPU_NONCRITERR("icache_l1_data_ecc_corrected", 0, 0, 0),
	57	GPU_CRITERR("icache_l1_data_ecc_uncorrected",
	58	GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED, 0, 0),
	59	GPU_NONCRITERR("icache_l0_predecode_ecc_corrected", 0, 0, 0),
	60	GPU_CRITERR("icache_l0_predecode_ecc_uncorrected",
	61	GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED, 0, 0),
	62	GPU_NONCRITERR("l1_tag_miss_fifo_ecc_corrected", 0, 0, 0),
	63	GPU_CRITERR("l1_tag_miss_fifo_ecc_uncorrected",
	64	GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED, 0, 0),
	65	GPU_NONCRITERR("l1_tag_s2r_pixprf_ecc_corrected", 0, 0, 0),
	66	GPU_CRITERR("l1_tag_s2r_pixprf_ecc_uncorrected",
	67	GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED, 0, 0),
	68	GPU_CRITERR("machine_check_error", 0, 0, 0),
	69	GPU_NONCRITERR("icache_l1_predecode_ecc_corrected", 0, 0, 0),
	70	GPU_CRITERR("icache_l1_predecode_ecc_uncorrected",
	71	GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED, 0, 0),
	72	},
	73	},
	74	{
	75	.name = "fecs",
	76	.hw_unit = (u32)NVGPU_ERR_MODULE_FECS,
	77	.num_errs = 4U,
	78	.base_ecc_service_id =
	79	NVGUARD_SERVICE_IGPU_FECS_SWERR_FALCON_IMEM_ECC_CORRECTED,
	80	.errs = (struct nvgpu_err_desc[]) {
	81	GPU_NONCRITERR("falcon_imem_ecc_corrected",
	82	GPU_FECS_FALCON_IMEM_ECC_CORRECTED, 0, 0),
	83	GPU_CRITERR("falcon_imem_ecc_uncorrected",
	84	GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED, 0, 0),
	85	GPU_NONCRITERR("falcon_dmem_ecc_corrected", 0, 0, 0),
	86	GPU_CRITERR("falcon_dmem_ecc_uncorrected",
	87	GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED, 0, 0),
	88	},
	89	},
	90	{
	91	.name = "pmu",
	92	.hw_unit = NVGPU_ERR_MODULE_PMU,
	93	.num_errs = 4U,
	94	.base_ecc_service_id =
	95	NVGUARD_SERVICE_IGPU_PMU_SWERR_FALCON_IMEM_ECC_CORRECTED,
	96	.errs = (struct nvgpu_err_desc[]) {
	97	GPU_NONCRITERR("falcon_imem_ecc_corrected",
	98	GPU_PMU_FALCON_IMEM_ECC_CORRECTED, 0, 0),
	99	GPU_CRITERR("falcon_imem_ecc_uncorrected",
	100	GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED, 0, 0),
	101	GPU_NONCRITERR("falcon_dmem_ecc_corrected", 0, 0, 0),
	102	GPU_CRITERR("falcon_dmem_ecc_uncorrected",
	103	GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED, 0, 0),
	104	},
	105	},
	106	};
	107
	108	static void nvgpu_init_err_msg_header(struct gpu_err_header *header)
	109	{
	110	header->version.major = (u16)1U;
	111	header->version.minor = (u16)0U;
	112	header->sub_err_type = 0U;
	113	header->sub_unit_id = 0UL;
	114	header->address = 0UL;
	115	header->timestamp_ns = 0UL;
	116	}
	117
	118	static void nvgpu_init_ecc_err_msg(struct gpu_ecc_error_info *err_info)
	119	{
	120	nvgpu_init_err_msg_header(&err_info->header);
	121	err_info->err_cnt = 0UL;
	122	}
	123
	124	static void nvgpu_report_ecc_error_linux(struct gk20a *g, u32 hw_unit, u32 inst,
	125	u32 err_id, u64 err_addr, u64 err_count)
	126	{
	127	int err = 0;
	128	u32 s_id = 0;
	129	u8 err_status = 0;
	130	u8 err_info_size = 0;
	131	u64 timestamp = 0ULL;
	132	int err_threshold_counter = 0;
	133	struct gpu_ecc_error_info err_pkt;
	134	struct nvgpu_err_desc *err_desc = NULL;
	135	struct nvgpu_err_hw_module *hw_module = NULL;
	136	nv_guard_request_t req;
	137
	138	memset(&req, 0, sizeof(req));
	139	nvgpu_init_ecc_err_msg(&err_pkt);
	140	if (hw_unit >= sizeof(gv11b_err_lut)/sizeof(gv11b_err_lut[0])) {
	141	err = -EINVAL;
	142	goto done;
	143	}
	144
	145	hw_module = &gv11b_err_lut[hw_unit];
	146	if (err_id >= hw_module->num_errs) {
	147	nvgpu_err(g, "invalid err_id (%u) for hw module (%u)",
	148	err_id, hw_module->hw_unit);
	149	err = -EINVAL;
	150	goto done;
	151	}
	152	err_desc = &hw_module->errs[err_id];
	153	timestamp = (u64)nvgpu_current_time_ns();
	154
	155	err_pkt.header.timestamp_ns = timestamp;
	156	err_pkt.header.sub_unit_id = inst;
	157	err_pkt.header.address = err_addr;
	158	err_pkt.err_cnt = err_count;
	159	err_info_size = sizeof(err_pkt);
	160
	161	s_id = hw_module->base_ecc_service_id + err_id;
	162
	163	if (err_desc->is_critical) {
	164	err_status = NVGUARD_ERROR_DETECTED;
	165	} else {
	166	err_status = NVGUARD_NO_ERROR;
	167	}
	168
	169	nvgpu_atomic_inc(&err_desc->err_count);
	170	err_threshold_counter = nvgpu_atomic_cmpxchg(&err_desc->err_count,
	171	err_desc->err_threshold + 1, 0);
	172
	173	if (unlikely(err_threshold_counter != err_desc->err_threshold + 1)) {
	174	goto done;
	175	}
	176
	177	nvgpu_log(g, gpu_dbg_ecc, "ECC reporting hw: %s, desc:%s, count:%llu",
	178	hw_module->name, err_desc->name, err_count);
	179
	180	req.srv_id_cmd = NVGUARD_SERVICESTATUS_NOTIFICATION;
	181	req.srv_status.srv_id = (nv_guard_service_id_t)s_id;
	182	req.srv_status.status = err_status;
	183	req.srv_status.timestamp = timestamp;
	184	req.srv_status.error_info_size = err_info_size;
	185	memcpy(req.srv_status.error_info, (u8*)&err_pkt, err_info_size);
	186
	187	/*
	188	* l1ss_submit_rq may fail due to kmalloc failures but may pass in
	189	* subsequent calls
	190	*/
	191	err = l1ss_submit_rq(&req, true);
	192	if (err != 0) {
	193	nvgpu_err(g, "Error returned from L1SS submit %d", err);
	194	}
	195
	196	if (err_desc->is_critical) {
	197	nvgpu_quiesce(g);
	198	}
	199
	200	done:
	201	return;
	202	}
	203
	204	static void nvgpu_report_ecc_error_empty(struct gk20a *g, u32 hw_unit, u32 inst,
	205	u32 err_id, u64 err_addr, u64 err_count) {
	206	nvgpu_log(g, gpu_dbg_ecc, "ECC reporting empty");
	207	}
	208
	209	const struct nvgpu_ecc_reporting_ops default_disabled_ecc_report_ops = {
	210	.report_ecc_err = nvgpu_report_ecc_error_empty,
	211	};
	212
	213	const struct nvgpu_ecc_reporting_ops ecc_enable_report_ops = {
	214	.report_ecc_err = nvgpu_report_ecc_error_linux,
	215	};
	216
	217	static int nvgpu_l1ss_callback(l1ss_cli_callback_param param, void *data)
	218	{
	219	struct gk20a *g = (struct gk20a *)data;
	220	struct nvgpu_os_linux *l = NULL;
	221	struct nvgpu_ecc_reporting_linux *ecc_reporting_linux = NULL;
	222	int err = 0;
	223	/* Ensure we have a valid gk20a struct before proceeding */
	224	if ((g == NULL) || (gk20a_get(g) == NULL)) {
	225	return -ENODEV;
	226	}
	227
	228	l = nvgpu_os_linux_from_gk20a(g);
	229	ecc_reporting_linux = &l->ecc_reporting_linux;
	230
	231	nvgpu_spinlock_acquire(&ecc_reporting_linux->common.lock);
	232	if (param == L1SS_READY) {
	233	if (!ecc_reporting_linux->common.ecc_reporting_service_enabled) {
	234	ecc_reporting_linux->common.ecc_reporting_service_enabled = true;
	235	ecc_reporting_linux->common.ops = &ecc_enable_report_ops;
	236	nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is enabled");
	237	}
	238	} else if (param == L1SS_NOT_READY) {
	239	if (ecc_reporting_linux->common.ecc_reporting_service_enabled) {
	240	ecc_reporting_linux->common.ecc_reporting_service_enabled = false;
	241	ecc_reporting_linux->common.ops = &default_disabled_ecc_report_ops;
	242	nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is disabled");
	243	}
	244	} else {
	245	err = -EINVAL;
	246	}
	247	nvgpu_spinlock_release(&ecc_reporting_linux->common.lock);
	248
	249	gk20a_put(g);
	250
	251	return err;
	252	}
	253
	254	void nvgpu_init_ecc_reporting(struct gk20a *g)
	255	{
	256	struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
	257	struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
	258	int err = 0;
	259	/* This will invoke the registration API */
	260	nvgpu_spinlock_init(&ecc_report_linux->common.lock);
	261	ecc_report_linux->priv.id = (NVGUARD_GROUPID_IGPU & NVGUARD_GROUPINDEX_FIELDMASK);
	262	ecc_report_linux->priv.cli_callback = nvgpu_l1ss_callback;
	263	ecc_report_linux->priv.data = g;
	264	ecc_report_linux->common.ops = &default_disabled_ecc_report_ops;
	265
	266	nvgpu_log(g, gpu_dbg_ecc, "ECC reporting Init");
	267
	268	/*
	269	* err == 0 indicates service is available but not active yet.
	270	* err == 1 indicates service is available and active
	271	* error for other cases.
	272	*/
	273	err = l1ss_register_client(&ecc_report_linux->priv);
	274	if (err == 0) {
	275	ecc_report_linux->common.ecc_reporting_service_enabled = false;
	276	nvgpu_log(g, gpu_dbg_ecc, "ECC reporting init success");
	277	} else if (err == 1) {
	278	ecc_report_linux->common.ecc_reporting_service_enabled = true;
	279	/* Actual Ops will be replaced during nvgpu_enable_ecc_reporting
	280	* called as part of gk20a_busy()
	281	*/
	282	} else {
	283	nvgpu_log(g, gpu_dbg_ecc, "ECC reporting init failure %d", err);
	284	}
	285	}
	286
	287	void nvgpu_deinit_ecc_reporting(struct gk20a *g)
	288	{
	289	struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
	290	struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
	291
	292	if (ecc_report_linux->common.ecc_reporting_service_enabled) {
	293	ecc_report_linux->common.ecc_reporting_service_enabled = false;
	294	l1ss_deregister_client(ecc_report_linux->priv.id);
	295	memset(ecc_report_linux, 0, sizeof(*ecc_report_linux));
	296	nvgpu_log(g, gpu_dbg_ecc, "ECC reporting de-init success");
	297	}
	298
	299	}
	300
	301	void nvgpu_enable_ecc_reporting(struct gk20a *g)
	302	{
	303	struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
	304	struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
	305	struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common;
	306
	307	nvgpu_spinlock_acquire(&ecc_report_linux->common.lock);
	308	if (error_reporting->ecc_reporting_service_enabled) {
	309	error_reporting->ops = &ecc_enable_report_ops;
	310	nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is enabled");
	311	}
	312	nvgpu_spinlock_release(&ecc_report_linux->common.lock);
	313	}
	314
	315	void nvgpu_disable_ecc_reporting(struct gk20a *g)
	316	{
	317	struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
	318	struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
	319	struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common;
	320
	321	nvgpu_spinlock_acquire(&ecc_report_linux->common.lock);
	322	error_reporting->ops = &default_disabled_ecc_report_ops;
	323	nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is disabled");
	324	nvgpu_spinlock_release(&ecc_report_linux->common.lock);
	325	}
	326
	327	void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
	328	u32 err_id, u64 err_addr, u64 err_count)
	329	{
	330	struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
	331	struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
	332	struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common;
	333	void (*report_ecc_err_func)(struct gk20a *g, u32 hw_unit, u32 inst,
	334	u32 err_id, u64 err_addr, u64 err_count);
	335
	336	nvgpu_spinlock_acquire(&ecc_report_linux->common.lock);
	337	report_ecc_err_func = error_reporting->ops->report_ecc_err;
	338	nvgpu_spinlock_release(&ecc_report_linux->common.lock);
	339
	340	report_ecc_err_func(g, hw_unit, inst, err_id, err_addr, err_count);
	341	}