Update includes to L4T r32.7.4 and drop nvgpu/gk20a.h dependency

Also add instructions for updating `include/`. These files are now only needed to build on Linux 4.9-based Tegra platforms.
author: Joshua Bakita <bakitajoshua@gmail.com> 2023-10-29 13:07:40 -0400
committer: Joshua Bakita <bakitajoshua@gmail.com> 2023-10-29 13:10:52 -0400
commit: 2c5337a24f7f2d02989dfb733c55d6d8c7e90493 (patch)
tree: b9f1028cb443b03190b710c0d7ee640bf5958631 /include/nvgpu/nvgpu_err.h
parent: aa06f84f03cba7ad1aae5cd527355bb3d8c152a6 (diff)
1 files changed, 359 insertions, 0 deletions
diff --git a/include/nvgpu/nvgpu_err.h b/include/nvgpu/nvgpu_err.h
new file mode 100644
index 0000000..0595faf
--- /dev/null
+++ b/include/nvgpu/nvgpu_err.h
@@ -0,0 +1,359 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#ifndef NVGPU_NVGPU_ERR_H
+#define NVGPU_NVGPU_ERR_H
+/**
+ * @file
+ *
+ * Define indices for HW units and errors. Define structures used to carry error
+ * information. Declare prototype for APIs that are used to report GPU HW errors
+ * to the Safety_Services framework.
+ */
+#include <nvgpu/types.h>
+#include <nvgpu/atomic.h>
+struct gk20a;
+/**
+ * @defgroup INDICES_FOR_GPU_HW_UNITS
+ * Macros used to assign unique index to GPU HW units.
+ * @{
+ */
+/*
+ * NOTE(review): the nvgpu_report_ecc_err() documentation in this header also
+ * lists GPCCS, MMU, GCC, LTC and HUBMMU unit IDs, but only the three indices
+ * below are defined here.
+ */
+/** Index assigned to the SM unit. */
+#define NVGPU_ERR_MODULE_SM                     (0U)
+/** Index assigned to the FECS unit. */
+#define NVGPU_ERR_MODULE_FECS           (1U)
+/** Index assigned to the PMU unit. */
+#define NVGPU_ERR_MODULE_PMU            (2U)
+/**
+ * @}
+ */
+/**
+ * @defgroup LIST_OF_ERRORS_REPORTED_FROM_SM
+ * Macros used to assign unique index to errors reported from the SM unit.
+ * @{
+ */
+/*
+ * The SM error indices are sparse: only 0, 1, 3, 5, 7, 9, 11, 13, 15, 17 and
+ * 20 have a macro in this header; the values in between are not assigned here.
+ */
+#define GPU_SM_L1_TAG_ECC_CORRECTED                     (0U)
+#define GPU_SM_L1_TAG_ECC_UNCORRECTED                   (1U)
+#define GPU_SM_CBU_ECC_UNCORRECTED                      (3U)
+#define GPU_SM_LRF_ECC_UNCORRECTED                      (5U)
+#define GPU_SM_L1_DATA_ECC_UNCORRECTED                  (7U)
+#define GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED           (9U)
+#define GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED           (11U)
+#define GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED      (13U)
+#define GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED         (15U)
+#define GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED        (17U)
+#define GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED      (20U)
+/**
+ * @}
+ */
+/**
+ * @defgroup LIST_OF_ERRORS_REPORTED_FROM_FECS
+ * Macros used to assign unique index to errors reported from the FECS unit.
+ * @{
+ */
+/*
+ * FECS error indices defined by this header are 0, 1 and 3; index 2 has no
+ * macro here.
+ */
+#define GPU_FECS_FALCON_IMEM_ECC_CORRECTED      (0U)
+#define GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED    (1U)
+#define GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED    (3U)
+/**
+ * @}
+ */
+/**
+ * @defgroup LIST_OF_ERRORS_REPORTED_FROM_GPCCS
+ * Macros used to assign unique index to errors reported from the GPCCS unit.
+ * @{
+ */
+/*
+ * GPCCS error indices defined by this header are 0, 1 and 3; index 2 has no
+ * macro here. The numbering matches the FECS and PMU falcon error lists.
+ */
+#define GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED     (0U)
+#define GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED   (1U)
+#define GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED   (3U)
+/**
+ * @}
+ */
+/**
+ * @defgroup LIST_OF_ERRORS_REPORTED_FROM_MMU
+ * Macros used to assign unique index to errors reported from the MMU unit.
+ * @{
+ */
+/*
+ * Only uncorrected MMU errors are given an index here (1 and 3); indices 0
+ * and 2 have no macro in this header.
+ */
+#define GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED   (1U)
+#define GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED   (3U)
+/**
+ * @}
+ */
+/**
+ * @defgroup LIST_OF_ERRORS_REPORTED_FROM_GCC
+ * Macros used to assign unique index to errors reported from the GCC unit.
+ * @{
+ */
+/* The only GCC error index defined by this header; index 0 has no macro. */
+#define GPU_GCC_L15_ECC_UNCORRECTED             (1U)
+/**
+ * @}
+ */
+/**
+ * @defgroup LIST_OF_ERRORS_REPORTED_FROM_PMU
+ * Macros used to assign unique index to errors reported from the PMU unit.
+ * @{
+ */
+/*
+ * PMU error indices defined by this header are 0, 1 and 3; index 2 has no
+ * macro here. The numbering matches the FECS and GPCCS falcon error lists.
+ */
+#define GPU_PMU_FALCON_IMEM_ECC_CORRECTED       (0U)
+#define GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED     (1U)
+#define GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED     (3U)
+/**
+ * @}
+ */
+/**
+ * @defgroup LIST_OF_ERRORS_REPORTED_FROM_LTC
+ * Macros used to assign unique index to errors reported from the LTC unit.
+ * @{
+ */
+/*
+ * LTC error indices defined by this header are 0, 1, 3 and 7; the values in
+ * between have no macro here.
+ */
+#define GPU_LTC_CACHE_DSTG_ECC_CORRECTED        (0U)
+#define GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED      (1U)
+#define GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED      (3U)
+#define GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED   (7U)
+/**
+ * @}
+ */
+/**
+ * @defgroup LIST_OF_ERRORS_REPORTED_FROM_HUBMMU
+ * Macros used to assign unique index to errors reported from the HUBMMU unit.
+ * @{
+ */
+/*
+ * HUBMMU error indices defined by this header are 1, 3, 5, 7 and 8. The
+ * nvgpu_report_ecc_err() documentation gives 7 (PDE0) as the maximum ECC
+ * error index for HUBMMU, so GPU_HUBMMU_PAGE_FAULT_ERROR lies outside that
+ * documented range.
+ */
+#define GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED        (1U)
+#define GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED          (3U)
+#define GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED             (5U)
+#define GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED            (7U)
+#define GPU_HUBMMU_PAGE_FAULT_ERROR                     (8U)
+/**
+ * @}
+ */
+#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
+/**
+ * nvgpu_err_desc structure holds the fields which describe a single error:
+ * its name, criticality, reporting threshold, occurrence counter and ID.
+ *
+ * NOTE(review): earlier wording of this comment mentioned a function callback
+ * for injecting the error, but no such member is declared in this structure.
+ */
+struct nvgpu_err_desc {
+        /** String representation of error. */
+        const char *name;
+        /** Flag to classify an error as critical or non-critical. */
+        bool is_critical;
+        /**
+         * Error Threshold: once this threshold value is reached, then the
+         * corresponding error counter will be reset to 0 and the error will be
+         * propagated to Safety_Services.
+         */
+        int err_threshold;
+        /**
+         * Total number of times an error has occurred (since its last reset).
+         */
+        nvgpu_atomic_t err_count;
+        /** Error ID (stored in 8 bits). */
+        u8 error_id;
+};
+/**
+ * gpu_err_header structure holds the fields required to identify the version
+ * of the header, the sub-error type, the sub-unit ID, the error address and
+ * the time stamp of a reported error.
+ */
+struct gpu_err_header {
+        /** Version of GPU error header. */
+        struct {
+                /** Major version number. */
+                u16 major;
+                /** Minor version number. */
+                u16 minor;
+        } version;
+        /** Sub error type corresponding to the error that is being reported. */
+        u32 sub_err_type;
+        /** ID of the sub-unit in a HW unit which encountered an error. */
+        u64 sub_unit_id;
+        /** Location of the error. */
+        u64 address;
+        /** Timestamp in nanoseconds. */
+        u64 timestamp_ns;
+};
+/**
+ * gpu_ecc_error_info structure pairs the common GPU error header with the
+ * number of ECC errors being reported.
+ */
+struct gpu_ecc_error_info {
+        /** Common error header (version, sub-error type, address, timestamp). */
+        struct gpu_err_header header;
+        /** Number of ECC errors. */
+        u64 err_cnt;
+};
+/**
+ * nvgpu_err_hw_module structure holds fields which describe the error
+ * reporting capabilities of one HW module.
+ */
+struct nvgpu_err_hw_module {
+        /** String representation of a given HW unit. */
+        const char *name;
+        /** HW unit ID. */
+        u32 hw_unit;
+        /** Total number of errors reported from a given HW unit. */
+        u32 num_errs;
+        /**
+         * Base ECC service ID of this HW unit.
+         * NOTE(review): how this base is combined with an error ID is not
+         * visible in this header - confirm against the implementation.
+         */
+        u32 base_ecc_service_id;
+        /** Used to get error description from look-up table. */
+        struct nvgpu_err_desc *errs;
+};
+/**
+ * nvgpu_ecc_reporting_ops structure is a table of callbacks used for ECC
+ * error reporting.
+ */
+struct nvgpu_ecc_reporting_ops {
+        /**
+         * Callback that reports one ECC error. Its parameter list is the same
+         * as that of nvgpu_report_ecc_err().
+         */
+        void (*report_ecc_err)(struct gk20a *g, u32 hw_unit, u32 inst,
+                u32 err_id, u64 err_addr, u64 err_count);
+};
+/**
+ * nvgpu_ecc_reporting structure holds the state of the ECC reporting service:
+ * a lock, an enabled flag guarded by that lock, and the reporting callbacks.
+ *
+ * NOTE(review): struct nvgpu_spinlock is embedded by value, but its definition
+ * does not come from a header that is visibly included here (only
+ * nvgpu/types.h and nvgpu/atomic.h are) - confirm that it is reachable for
+ * every includer of this file.
+ */
+struct nvgpu_ecc_reporting {
+        /** Spinlock guarding ecc_reporting_service_enabled. */
+        struct nvgpu_spinlock lock;
+        /* This flag is protected by the above spinlock */
+        bool ecc_reporting_service_enabled;
+        /** Reporting callbacks; the table itself is read-only via this pointer. */
+        const struct nvgpu_ecc_reporting_ops *ops;
+};
+/**
+ * This macro expands to a brace-enclosed designated initializer for the
+ * members of the nvgpu_err_desc struct.
+ *
+ * @param err        Value stored in .name.
+ * @param critical   Value stored in .is_critical.
+ * @param id         Value stored in .error_id.
+ * @param threshold  Value stored in .err_threshold.
+ * @param ecount     Passed to NVGPU_ATOMIC_INIT() to initialize .err_count.
+ */
+#define GPU_ERR(err, critical, id, threshold, ecount) \
+{                                                                       \
+                .name = (err),                                          \
+                .is_critical = (critical),                              \
+                .error_id = (id),                                       \
+                .err_threshold = (threshold),                           \
+                .err_count = NVGPU_ATOMIC_INIT(ecount),                                 \
+}
+/**
+ * This macro is used to initialize critical errors: it forwards its arguments
+ * to GPU_ERR() with the criticality flag fixed to true.
+ */
+#define GPU_CRITERR(err, id, threshold, ecount) \
+        GPU_ERR(err, true, id, threshold, ecount)
+/**
+ * This macro is used to initialize non-critical errors: it forwards its
+ * arguments to GPU_ERR() with the criticality flag fixed to false.
+ */
+#define GPU_NONCRITERR(err, id, threshold, ecount) \
+        GPU_ERR(err, false, id, threshold, ecount)
+/**
+ * @brief GPU HW errors need to be reported to Safety_Services via SDL unit.
+ *        This function provides an interface to report ECC errors to SDL unit.
+ *
+ * @param g [in]                - The GPU driver struct.
+ * @param hw_unit [in]          - Index of HW unit.
+ *                                - List of valid HW unit IDs
+ *                                  - NVGPU_ERR_MODULE_SM
+ *                                  - NVGPU_ERR_MODULE_FECS
+ *                                  - NVGPU_ERR_MODULE_GPCCS
+ *                                  - NVGPU_ERR_MODULE_MMU
+ *                                  - NVGPU_ERR_MODULE_GCC
+ *                                  - NVGPU_ERR_MODULE_PMU
+ *                                  - NVGPU_ERR_MODULE_LTC
+ *                                  - NVGPU_ERR_MODULE_HUBMMU
+ * @param inst [in]             - Instance ID.
+ *                                - In case of multiple instances of the same HW
+ *                                  unit (e.g., there are multiple instances of
+ *                                  SM), it is used to identify the instance
+ *                                  that encountered a fault.
+ * @param err_id [in]           - Error index.
+ *                                - For SM:
+ *                                  - Min: GPU_SM_L1_TAG_ECC_CORRECTED
+ *                                  - Max: GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED
+ *                                - For FECS:
+ *                                  - Min: GPU_FECS_FALCON_IMEM_ECC_CORRECTED
+ *                                  - Max: GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED
+ *                                - For GPCCS:
+ *                                  - Min: GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED
+ *                                  - Max: GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED
+ *                                - For MMU:
+ *                                  - Min: GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED
+ *                                  - Max: GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED
+ *                                - For GCC:
+ *                                  - Min: GPU_GCC_L15_ECC_UNCORRECTED
+ *                                  - Max: GPU_GCC_L15_ECC_UNCORRECTED
+ *                                - For PMU:
+ *                                  - Min: GPU_PMU_FALCON_IMEM_ECC_CORRECTED
+ *                                  - Max: GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED
+ *                                - For LTC:
+ *                                  - Min: GPU_LTC_CACHE_DSTG_ECC_CORRECTED
+ *                                  - Max: GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED
+ *                                - For HUBMMU:
+ *                                  - Min: GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED
+ *                                  - Max: GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED
+ * @param err_addr [in]         - Error address.
+ *                                - This is the location at which correctable or
+ *                                  uncorrectable error has occurred.
+ * @param err_count [in]        - Error count.
+ *
+ * - Checks whether SDL is supported in the current GPU platform. If SDL is not
+ *   supported, it simply returns.
+ * - Validates both \a hw_unit and \a err_id indices. In case of a failure,
+ *   invokes #nvgpu_sdl_handle_report_failure() api.
+ * - Gets the current time of a clock. In case of a failure, invokes
+ *   #nvgpu_sdl_handle_report_failure() api.
+ * - Gets error description from internal look-up table using \a hw_unit and
+ *   \a err_id indices.
+ * - Forms error packet using details such as time-stamp, \a hw_unit, \a err_id,
+ *   criticality of the error, \a inst, \a err_addr, \a err_count, error
+ *   description, and size of the error packet.
+ * - Performs compile-time assert check to ensure that the size of the error
+ *   packet does not exceed the maximum allowable size specified in
+ *   #MAX_ERR_MSG_SIZE.
+ *
+ * @return      None
+ */
+void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
+                u32 err_id, u64 err_addr, u64 err_count);
+/*
+ * Remaining ECC reporting entry points; each takes the GPU driver struct.
+ * NOTE(review): their implementations are not part of this header. The names
+ * suggest set-up, enabling, disabling and tear-down of the reporting service -
+ * confirm against the definitions.
+ */
+void nvgpu_init_ecc_reporting(struct gk20a *g);
+void nvgpu_enable_ecc_reporting(struct gk20a *g);
+void nvgpu_disable_ecc_reporting(struct gk20a *g);
+void nvgpu_deinit_ecc_reporting(struct gk20a *g);
+#else
+/*
+ * No-op stand-in used when CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING is
+ * not defined: it takes the same arguments as the real reporting function
+ * and ignores all of them.
+ */
+static inline void nvgpu_report_ecc_err(struct gk20a *g,
+                u32 hw_unit, u32 inst, u32 err_id,
+                u64 err_addr, u64 err_count)
+{
+}
+#endif /* CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING */
+#endif /* NVGPU_NVGPU_ERR_H */
+\ No newline at end of file
author	Joshua Bakita <bakitajoshua@gmail.com>	2023-10-29 13:07:40 -0400
committer	Joshua Bakita <bakitajoshua@gmail.com>	2023-10-29 13:10:52 -0400
commit	2c5337a24f7f2d02989dfb733c55d6d8c7e90493 (patch)
tree	b9f1028cb443b03190b710c0d7ee640bf5958631 /include/nvgpu/nvgpu_err.h
parent	aa06f84f03cba7ad1aae5cd527355bb3d8c152a6 (diff)

diff --git a/include/nvgpu/nvgpu_err.h b/include/nvgpu/nvgpu_err.h new file mode 100644 index 0000000..0595faf --- /dev/null +++ b/include/nvgpu/nvgpu_err.h
@@ -0,0 +1,359 @@
	1	/*
	2	* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
	3	*
	4	* Permission is hereby granted, free of charge, to any person obtaining a
	5	* copy of this software and associated documentation files (the "Software"),
	6	* to deal in the Software without restriction, including without limitation
	7	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
	8	* and/or sell copies of the Software, and to permit persons to whom the
	9	* Software is furnished to do so, subject to the following conditions:
	10	*
	11	* The above copyright notice and this permission notice shall be included in
	12	* all copies or substantial portions of the Software.
	13	*
	14	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	15	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	16	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
	17	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	18	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
	19	* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
	20	* DEALINGS IN THE SOFTWARE.
	21	*/
	22
	23	#ifndef NVGPU_NVGPU_ERR_H
	24	#define NVGPU_NVGPU_ERR_H
	25
	26	/**
	27	* @file
	28	*
	29	* Define indices for HW units and errors. Define structures used to carry error
	30	* information. Declare prototype for APIs that are used to report GPU HW errors
	31	* to the Safety_Services framework.
	32	*/
	33
	34	#include <nvgpu/types.h>
	35	#include <nvgpu/atomic.h>
	36
	37	struct gk20a;
	38
	39	/**
	40	* @defgroup INDICES_FOR_GPU_HW_UNITS
	41	* Macros used to assign unique index to GPU HW units.
	42	* @{
	43	*/
	44	#define NVGPU_ERR_MODULE_SM (0U)
	45	#define NVGPU_ERR_MODULE_FECS (1U)
	46	#define NVGPU_ERR_MODULE_PMU (2U)
	47	/**
	48	* @}
	49	*/
	50
	51	/**
	52	* @defgroup LIST_OF_ERRORS_REPORTED_FROM_SM
	53	* Macros used to assign unique index to errors reported from the SM unit.
	54	* @{
	55	*/
	56	#define GPU_SM_L1_TAG_ECC_CORRECTED (0U)
	57	#define GPU_SM_L1_TAG_ECC_UNCORRECTED (1U)
	58	#define GPU_SM_CBU_ECC_UNCORRECTED (3U)
	59	#define GPU_SM_LRF_ECC_UNCORRECTED (5U)
	60	#define GPU_SM_L1_DATA_ECC_UNCORRECTED (7U)
	61	#define GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED (9U)
	62	#define GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED (11U)
	63	#define GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED (13U)
	64	#define GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED (15U)
	65	#define GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED (17U)
	66	#define GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED (20U)
	67	/**
	68	* @}
	69	*/
	70
	71	/**
	72	* @defgroup LIST_OF_ERRORS_REPORTED_FROM_FECS
	73	* Macros used to assign unique index to errors reported from the FECS unit.
	74	* @{
	75	*/
	76	#define GPU_FECS_FALCON_IMEM_ECC_CORRECTED (0U)
	77	#define GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED (1U)
	78	#define GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED (3U)
	79	/**
	80	* @}
	81	*/
	82
	83	/**
	84	* @defgroup LIST_OF_ERRORS_REPORTED_FROM_GPCCS
	85	* Macros used to assign unique index to errors reported from the GPCCS unit.
	86	* @{
	87	*/
	88	#define GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED (0U)
	89	#define GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED (1U)
	90	#define GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED (3U)
	91	/**
	92	* @}
	93	*/
	94
	95	/**
	96	* @defgroup LIST_OF_ERRORS_REPORTED_FROM_MMU
	97	* Macros used to assign unique index to errors reported from the MMU unit.
	98	* @{
	99	*/
	100	#define GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED (1U)
	101	#define GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED (3U)
	102	/**
	103	* @}
	104	*/
	105
	106	/**
	107	* @defgroup LIST_OF_ERRORS_REPORTED_FROM_GCC
	108	* Macros used to assign unique index to errors reported from the GCC unit.
	109	* @{
	110	*/
	111	#define GPU_GCC_L15_ECC_UNCORRECTED (1U)
	112	/**
	113	* @}
	114	*/
	115
	116
	117	/**
	118	* @defgroup LIST_OF_ERRORS_REPORTED_FROM_PMU
	119	* Macros used to assign unique index to errors reported from the PMU unit.
	120	* @{
	121	*/
	122	#define GPU_PMU_FALCON_IMEM_ECC_CORRECTED (0U)
	123	#define GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED (1U)
	124	#define GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED (3U)
	125	/**
	126	* @}
	127	*/
	128
	129	/**
	130	* @defgroup LIST_OF_ERRORS_REPORTED_FROM_LTC
	131	* Macros used to assign unique index to errors reported from the LTC unit.
	132	* @{
	133	*/
	134	#define GPU_LTC_CACHE_DSTG_ECC_CORRECTED (0U)
	135	#define GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED (1U)
	136	#define GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED (3U)
	137	#define GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED (7U)
	138	/**
	139	* @}
	140	*/
	141
	142	/**
	143	* @defgroup LIST_OF_ERRORS_REPORTED_FROM_HUBMMU
	144	* Macros used to assign unique index to errors reported from the HUBMMU unit.
	145	* @{
	146	*/
	147	#define GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED (1U)
	148	#define GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED (3U)
	149	#define GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED (5U)
	150	#define GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED (7U)
	151	#define GPU_HUBMMU_PAGE_FAULT_ERROR (8U)
	152
	153
	154	#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
	155	/**
	156	* @}
	157	*/
	158
	159	/**
	160	* nvgpu_err_desc structure holds fields which describe an error along with
	161	* function callback which can be used to inject the error.
	162	*/
	163	struct nvgpu_err_desc {
	164	/** String representation of error. */
	165	const char *name;
	166
	167	/** Flag to classify an error as critical or non-critical. */
	168	bool is_critical;
	169
	170	/**
	171	* Error Threshold: once this threshold value is reached, then the
	172	* corresponding error counter will be reset to 0 and the error will be
	173	* propagated to Safety_Services.
	174	*/
	175	int err_threshold;
	176
	177	/**
	178	* Total number of times an error has occurred (since its last reset).
	179	*/
	180	nvgpu_atomic_t err_count;
	181
	182	/** Error ID. */
	183	u8 error_id;
	184	};
	185
	186	/**
	187	* gpu_err_header structure holds fields which are required to identify the
	188	* version of header, sub-error type, sub-unit id, error address and time stamp.
	189	*/
	190	struct gpu_err_header {
	191	/** Version of GPU error header. */
	192	struct {
	193	/** Major version number. */
	194	u16 major;
	195	/** Minor version number. */
	196	u16 minor;
	197	} version;
	198
	199	/** Sub error type corresponding to the error that is being reported. */
	200	u32 sub_err_type;
	201
	202	/** ID of the sub-unit in a HW unit which encountered an error. */
	203	u64 sub_unit_id;
	204
	205	/** Location of the error. */
	206	u64 address;
	207
	208	/** Timestamp in nano seconds. */
	209	u64 timestamp_ns;
	210	};
	211
	212	struct gpu_ecc_error_info {
	213	struct gpu_err_header header;
	214
	215	/** Number of ECC errors. */
	216	u64 err_cnt;
	217	};
	218
	219	/**
	220	* nvgpu_err_hw_module structure holds fields which describe the h/w modules
	221	* error reporting capabilities.
	222	*/
	223	struct nvgpu_err_hw_module {
	224	/** String representation of a given HW unit. */
	225	const char *name;
	226
	227	/** HW unit ID. */
	228	u32 hw_unit;
	229
	230	/** Total number of errors reported from a given HW unit. */
	231	u32 num_errs;
	232
	233	u32 base_ecc_service_id;
	234
	235	/** Used to get error description from look-up table. */
	236	struct nvgpu_err_desc *errs;
	237	};
	238
	239	struct nvgpu_ecc_reporting_ops {
	240	void (*report_ecc_err)(struct gk20a *g, u32 hw_unit, u32 inst,
	241	u32 err_id, u64 err_addr, u64 err_count);
	242	};
	243
	244	struct nvgpu_ecc_reporting {
	245	struct nvgpu_spinlock lock;
	246	/* This flag is protected by the above spinlock */
	247	bool ecc_reporting_service_enabled;
	248	const struct nvgpu_ecc_reporting_ops *ops;
	249	};
	250
	251	/**
	252	* This macro is used to initialize the members of nvgpu_err_desc struct.
	253	*/
	254	#define GPU_ERR(err, critical, id, threshold, ecount) \
	255	{ \
	256	.name = (err), \
	257	.is_critical = (critical), \
	258	.error_id = (id), \
	259	.err_threshold = (threshold), \
	260	.err_count = NVGPU_ATOMIC_INIT(ecount), \
	261	}
	262
	263	/**
	264	* This macro is used to initialize critical errors.
	265	*/
	266	#define GPU_CRITERR(err, id, threshold, ecount) \
	267	GPU_ERR(err, true, id, threshold, ecount)
	268
	269	/**
	270	* This macro is used to initialize non-critical errors.
	271	*/
	272	#define GPU_NONCRITERR(err, id, threshold, ecount) \
	273	GPU_ERR(err, false, id, threshold, ecount)
	274
	275	/**
	276	* @brief GPU HW errors need to be reported to Safety_Services via SDL unit.
	277	* This function provides an interface to report ECC errors to SDL unit.
	278	*
	279	* @param g [in] - The GPU driver struct.
	280	* @param hw_unit [in] - Index of HW unit.
	281	* - List of valid HW unit IDs
	282	* - NVGPU_ERR_MODULE_SM
	283	* - NVGPU_ERR_MODULE_FECS
	284	* - NVGPU_ERR_MODULE_GPCCS
	285	* - NVGPU_ERR_MODULE_MMU
	286	* - NVGPU_ERR_MODULE_GCC
	287	* - NVGPU_ERR_MODULE_PMU
	288	* - NVGPU_ERR_MODULE_LTC
	289	* - NVGPU_ERR_MODULE_HUBMMU
	290	* @param inst [in] - Instance ID.
	291	* - In case of multiple instances of the same HW
	292	* unit (e.g., there are multiple instances of
	293	* SM), it is used to identify the instance
	294	* that encountered a fault.
	295	* @param err_id [in] - Error index.
	296	* - For SM:
	297	* - Min: GPU_SM_L1_TAG_ECC_CORRECTED
	298	* - Max: GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED
	299	* - For FECS:
	300	* - Min: GPU_FECS_FALCON_IMEM_ECC_CORRECTED
	301	* - Max: GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED
	302	* - For GPCCS:
	303	* - Min: GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED
	304	* - Max: GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED
	305	* - For MMU:
	306	* - Min: GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED
	307	* - Max: GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED
	308	* - For GCC:
	309	* - Min: GPU_GCC_L15_ECC_UNCORRECTED
	310	* - Max: GPU_GCC_L15_ECC_UNCORRECTED
	311	* - For PMU:
	312	* - Min: GPU_PMU_FALCON_IMEM_ECC_CORRECTED
	313	* - Max: GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED
	314	* - For LTC:
	315	* - Min: GPU_LTC_CACHE_DSTG_ECC_CORRECTED
	316	* - Max: GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED
	317	* - For HUBMMU:
	318	* - Min: GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED
	319	* - Max: GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED
	320	* @param err_addr [in] - Error address.
	321	* - This is the location at which correctable or
	322	* uncorrectable error has occurred.
	323	* @param err_count [in] - Error count.
	324	*
	325	* - Checks whether SDL is supported in the current GPU platform. If SDL is not
	326	* supported, it simply returns.
	327	* - Validates both \a hw_unit and \a err_id indices. In case of a failure,
	328	* invokes #nvgpu_sdl_handle_report_failure() api.
	329	* - Gets the current time of a clock. In case of a failure, invokes
	330	* #nvgpu_sdl_handle_report_failure() api.
	331	* - Gets error description from internal look-up table using \a hw_unit and
	332	* \a err_id indices.
	333	* - Forms error packet using details such as time-stamp, \a hw_unit, \a err_id,
	334	* criticality of the error, \a inst, \a err_addr, \a err_count, error
	335	* description, and size of the error packet.
	336	* - Performs compile-time assert check to ensure that the size of the error
	337	* packet does not exceed the maximum allowable size specified in
	338	* #MAX_ERR_MSG_SIZE.
	339	*
	340	* @return None
	341	*/
	342	void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
	343	u32 err_id, u64 err_addr, u64 err_count);
	344
	345	void nvgpu_init_ecc_reporting(struct gk20a *g);
	346	void nvgpu_enable_ecc_reporting(struct gk20a *g);
	347	void nvgpu_disable_ecc_reporting(struct gk20a *g);
	348	void nvgpu_deinit_ecc_reporting(struct gk20a *g);
	349
	350	#else
	351
	352	static inline void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
	353	u32 err_id, u64 err_addr, u64 err_count) {
	354
	355	}
	356
	357	#endif /* CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING */
	358
	359	#endif /* NVGPU_NVGPU_ERR_H */ \ No newline at end of file