summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h')
-rw-r--r--drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h359
1 files changed, 359 insertions, 0 deletions
diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h
new file mode 100644
index 00000000..0595fafb
--- /dev/null
+++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h
@@ -0,0 +1,359 @@
1/*
2 * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22
23#ifndef NVGPU_NVGPU_ERR_H
24#define NVGPU_NVGPU_ERR_H
25
26/**
27 * @file
28 *
29 * Define indices for HW units and errors. Define structures used to carry error
30 * information. Declare prototype for APIs that are used to report GPU HW errors
31 * to the Safety_Services framework.
32 */
33
34#include <nvgpu/types.h>
35#include <nvgpu/atomic.h>
36
37struct gk20a;
38
39/**
40 * @defgroup INDICES_FOR_GPU_HW_UNITS
41 * Unique index per GPU HW unit, passed as hw_unit to nvgpu_report_ecc_err().
42 * @{
43 */
44#define NVGPU_ERR_MODULE_SM (0U)
45#define NVGPU_ERR_MODULE_FECS (1U)
46#define NVGPU_ERR_MODULE_PMU (2U)
47/**
48 * @}
49 */
50
51/**
52 * @defgroup LIST_OF_ERRORS_REPORTED_FROM_SM
53 * Error indices for the SM unit, passed as err_id to nvgpu_report_ecc_err().
54 * @{
55 */
56#define GPU_SM_L1_TAG_ECC_CORRECTED (0U)
57#define GPU_SM_L1_TAG_ECC_UNCORRECTED (1U)
58#define GPU_SM_CBU_ECC_UNCORRECTED (3U)
59#define GPU_SM_LRF_ECC_UNCORRECTED (5U)
60#define GPU_SM_L1_DATA_ECC_UNCORRECTED (7U)
61#define GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED (9U)
62#define GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED (11U)
63#define GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED (13U)
64#define GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED (15U)
65#define GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED (17U)
66#define GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED (20U)
67/**
68 * @}
69 */
70
71/**
72 * @defgroup LIST_OF_ERRORS_REPORTED_FROM_FECS
73 * Error indices for the FECS unit, passed as err_id to nvgpu_report_ecc_err().
74 * @{
75 */
76#define GPU_FECS_FALCON_IMEM_ECC_CORRECTED (0U)
77#define GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED (1U)
78#define GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED (3U)
79/**
80 * @}
81 */
82
83/**
84 * @defgroup LIST_OF_ERRORS_REPORTED_FROM_GPCCS
85 * Error indices for the GPCCS unit, passed as err_id to nvgpu_report_ecc_err().
86 * @{
87 */
88#define GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED (0U)
89#define GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED (1U)
90#define GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED (3U)
91/**
92 * @}
93 */
94
95/**
96 * @defgroup LIST_OF_ERRORS_REPORTED_FROM_MMU
97 * Error indices for the MMU unit, passed as err_id to nvgpu_report_ecc_err().
98 * @{
99 */
100#define GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED (1U)
101#define GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED (3U)
102/**
103 * @}
104 */
105
106/**
107 * @defgroup LIST_OF_ERRORS_REPORTED_FROM_GCC
108 * Error indices for the GCC unit, passed as err_id to nvgpu_report_ecc_err().
109 * @{
110 */
111#define GPU_GCC_L15_ECC_UNCORRECTED (1U)
112/**
113 * @}
114 */
115
116
117/**
118 * @defgroup LIST_OF_ERRORS_REPORTED_FROM_PMU
119 * Error indices for the PMU unit, passed as err_id to nvgpu_report_ecc_err().
120 * @{
121 */
122#define GPU_PMU_FALCON_IMEM_ECC_CORRECTED (0U)
123#define GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED (1U)
124#define GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED (3U)
125/**
126 * @}
127 */
128
129/**
130 * @defgroup LIST_OF_ERRORS_REPORTED_FROM_LTC
131 * Error indices for the LTC unit, passed as err_id to nvgpu_report_ecc_err().
132 * @{
133 */
134#define GPU_LTC_CACHE_DSTG_ECC_CORRECTED (0U)
135#define GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED (1U)
136#define GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED (3U)
137#define GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED (7U)
138/**
139 * @}
140 */
141
142/**
143 * @defgroup LIST_OF_ERRORS_REPORTED_FROM_HUBMMU
144 * Error indices for the HUBMMU unit (see err_id of nvgpu_report_ecc_err()).
145 * @{
146 */
147#define GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED (1U)
148#define GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED (3U)
149#define GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED (5U)
150#define GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED (7U)
151#define GPU_HUBMMU_PAGE_FAULT_ERROR (8U)
152/**
153 * @}
154 */
155
156/* Everything up to the matching #else is compiled only when this macro is defined. */
157#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
158
159/**
160 * nvgpu_err_desc structure describes one reportable error: its name,
161 * criticality, reporting threshold, occurrence counter and error ID.
162 */
163struct nvgpu_err_desc {
164 /** String representation of error. */
165 const char *name;
166
167 /** Flag to classify an error as critical or non-critical. */
168 bool is_critical;
169
170 /**
171 * Error Threshold: once this threshold value is reached, then the
172 * corresponding error counter will be reset to 0 and the error will be
173 * propagated to Safety_Services.
174 */
175 int err_threshold;
176
177 /**
178 * Total number of times an error has occurred (since its last reset).
179 */
180 nvgpu_atomic_t err_count;
181
182 /** Error ID. */
183 u8 error_id;
184};
185
186/**
187 * gpu_err_header structure holds the fields that identify an error record:
188 * header version, sub-error type, sub-unit ID, error address and timestamp.
189 */
190struct gpu_err_header {
191 /** Version of GPU error header. */
192 struct {
193 /** Major version number. */
194 u16 major;
195 /** Minor version number. */
196 u16 minor;
197 } version;
198
199 /** Sub error type corresponding to the error that is being reported. */
200 u32 sub_err_type;
201
202 /** ID of the sub-unit in a HW unit which encountered an error. */
203 u64 sub_unit_id;
204
205 /** Location of the error. */
206 u64 address;
207
208 /** Timestamp in nanoseconds. */
209 u64 timestamp_ns;
210};
211/** ECC error record: the common gpu_err_header followed by an ECC error count. */
212struct gpu_ecc_error_info {
213 struct gpu_err_header header;
214
215 /** Number of ECC errors. */
216 u64 err_cnt;
217};
218
219/**
220 * nvgpu_err_hw_module structure describes the error reporting capabilities of
221 * one HW module, including a pointer to its error descriptions.
222 */
223struct nvgpu_err_hw_module {
224 /** String representation of a given HW unit. */
225 const char *name;
226
227 /** HW unit ID. */
228 u32 hw_unit;
229
230 /** Total number of errors reported from a given HW unit (presumably the length of errs; confirm). */
231 u32 num_errs;
232 /** NOTE(review): undocumented; presumably the unit's first ECC service ID. Confirm. */
233 u32 base_ecc_service_id;
234
235 /** Used to get error description from look-up table. */
236 struct nvgpu_err_desc *errs;
237};
238/** ECC reporting callbacks; report_ecc_err takes the same arguments as nvgpu_report_ecc_err(). */
239struct nvgpu_ecc_reporting_ops {
240 void (*report_ecc_err)(struct gk20a *g, u32 hw_unit, u32 inst,
241 u32 err_id, u64 err_addr, u64 err_count);
242};
243/** ECC reporting state: a spinlock, the enable flag it protects, and the ops table. */
244struct nvgpu_ecc_reporting {
245 struct nvgpu_spinlock lock; /* NOTE(review): type is not declared in this header; confirm an include provides it. */
246 /* This flag is protected by the above spinlock */
247 bool ecc_reporting_service_enabled;
248 const struct nvgpu_ecc_reporting_ops *ops;
249};
250
251 /**
252 * Expands to a designated initializer for one struct nvgpu_err_desc entry.
253 */
254#define GPU_ERR(err, critical, id, threshold, ecount) \
255{ \
256 .name = (err), \
257 .is_critical = (critical), \
258 .error_id = (id), \
259 .err_threshold = (threshold), \
260 .err_count = NVGPU_ATOMIC_INIT(ecount), \
261}
262
263/**
264 * GPU_ERR() wrapper for critical errors: passes true as is_critical.
265 */
266#define GPU_CRITERR(err, id, threshold, ecount) \
267 GPU_ERR(err, true, id, threshold, ecount)
268
269/**
270 * GPU_ERR() wrapper for non-critical errors: passes false as is_critical.
271 */
272#define GPU_NONCRITERR(err, id, threshold, ecount) \
273 GPU_ERR(err, false, id, threshold, ecount)
274
275/**
276 * @brief GPU HW errors need to be reported to Safety_Services via SDL unit.
277 * This function provides an interface to report ECC errors to SDL unit.
278 *
279 * @param g [in] - The GPU driver struct.
280 * @param hw_unit [in] - Index of HW unit.
281 * - List of valid HW unit IDs (NOTE(review): only SM, FECS and PMU are defined in this header)
282 * - NVGPU_ERR_MODULE_SM
283 * - NVGPU_ERR_MODULE_FECS
284 * - NVGPU_ERR_MODULE_GPCCS
285 * - NVGPU_ERR_MODULE_MMU
286 * - NVGPU_ERR_MODULE_GCC
287 * - NVGPU_ERR_MODULE_PMU
288 * - NVGPU_ERR_MODULE_LTC
289 * - NVGPU_ERR_MODULE_HUBMMU
290 * @param inst [in] - Instance ID.
291 * - In case of multiple instances of the same HW
292 * unit (e.g., there are multiple instances of
293 * SM), it is used to identify the instance
294 * that encountered a fault.
295 * @param err_id [in] - Error index.
296 * - For SM:
297 * - Min: GPU_SM_L1_TAG_ECC_CORRECTED
298 * - Max: GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED
299 * - For FECS:
300 * - Min: GPU_FECS_FALCON_IMEM_ECC_CORRECTED
301 * - Max: GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED
302 * - For GPCCS:
303 * - Min: GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED
304 * - Max: GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED
305 * - For MMU:
306 * - Min: GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED
307 * - Max: GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED
308 * - For GCC:
309 * - Min: GPU_GCC_L15_ECC_UNCORRECTED
310 * - Max: GPU_GCC_L15_ECC_UNCORRECTED
311 * - For PMU:
312 * - Min: GPU_PMU_FALCON_IMEM_ECC_CORRECTED
313 * - Max: GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED
314 * - For LTC:
315 * - Min: GPU_LTC_CACHE_DSTG_ECC_CORRECTED
316 * - Max: GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED
317 * - For HUBMMU:
318 * - Min: GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED
319 * - Max: GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED
320 * @param err_addr [in] - Error address.
321 * - This is the location at which correctable or
322 * uncorrectable error has occurred.
323 * @param err_count [in] - Error count.
324 *
325 * - Checks whether SDL is supported in the current GPU platform. If SDL is not
326 * supported, it simply returns.
327 * - Validates both \a hw_unit and \a err_id indices. In case of a failure,
328 * invokes #nvgpu_sdl_handle_report_failure() api.
329 * - Gets the current time of a clock. In case of a failure, invokes
330 * #nvgpu_sdl_handle_report_failure() api.
331 * - Gets error description from internal look-up table using \a hw_unit and
332 * \a err_id indices.
333 * - Forms error packet using details such as time-stamp, \a hw_unit, \a err_id,
334 * criticality of the error, \a inst, \a err_addr, \a err_count, error
335 * description, and size of the error packet.
336 * - Performs compile-time assert check to ensure that the size of the error
337 * packet does not exceed the maximum allowable size specified in
338 * #MAX_ERR_MSG_SIZE.
339 *
340 * @return None
341 */
342void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
343 u32 err_id, u64 err_addr, u64 err_count);
344/* Declarations only. Presumably the init/enable/disable/deinit lifecycle of ECC reporting; confirm. */
345void nvgpu_init_ecc_reporting(struct gk20a *g);
346void nvgpu_enable_ecc_reporting(struct gk20a *g);
347void nvgpu_disable_ecc_reporting(struct gk20a *g);
348void nvgpu_deinit_ecc_reporting(struct gk20a *g);
349/* NOTE(review): the #else branch provides a stub only for nvgpu_report_ecc_err(), not for these four. */
350#else
351/* No-op stand-in used when CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING is not defined. */
352static inline void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
353 u32 err_id, u64 err_addr, u64 err_count) {
354
355}
356
357#endif /* CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING */
358
359#endif /* NVGPU_NVGPU_ERR_H */ \ No newline at end of file