diff options
author | Joshua Bakita <bakitajoshua@gmail.com> | 2023-10-29 13:07:40 -0400 |
---|---|---|
committer | Joshua Bakita <bakitajoshua@gmail.com> | 2023-10-29 13:10:52 -0400 |
commit | 2c5337a24f7f2d02989dfb733c55d6d8c7e90493 (patch) | |
tree | b9f1028cb443b03190b710c0d7ee640bf5958631 /include/nvgpu/nvgpu_err.h | |
parent | aa06f84f03cba7ad1aae5cd527355bb3d8c152a6 (diff) |
Update includes to L4T r32.7.4 and drop nvgpu/gk20a.h dependency
Also add instructions for updating `include/`. These files are now
only needed to build on Linux 4.9-based Tegra platforms.
Diffstat (limited to 'include/nvgpu/nvgpu_err.h')
-rw-r--r-- | include/nvgpu/nvgpu_err.h | 359 |
1 files changed, 359 insertions, 0 deletions
diff --git a/include/nvgpu/nvgpu_err.h b/include/nvgpu/nvgpu_err.h new file mode 100644 index 0000000..0595faf --- /dev/null +++ b/include/nvgpu/nvgpu_err.h | |||
@@ -0,0 +1,359 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. | ||
3 | * | ||
4 | * Permission is hereby granted, free of charge, to any person obtaining a | ||
5 | * copy of this software and associated documentation files (the "Software"), | ||
6 | * to deal in the Software without restriction, including without limitation | ||
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
8 | * and/or sell copies of the Software, and to permit persons to whom the | ||
9 | * Software is furnished to do so, subject to the following conditions: | ||
10 | * | ||
11 | * The above copyright notice and this permission notice shall be included in | ||
12 | * all copies or substantial portions of the Software. | ||
13 | * | ||
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | ||
17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | ||
20 | * DEALINGS IN THE SOFTWARE. | ||
21 | */ | ||
22 | |||
23 | #ifndef NVGPU_NVGPU_ERR_H | ||
24 | #define NVGPU_NVGPU_ERR_H | ||
25 | |||
26 | /** | ||
27 | * @file | ||
28 | * | ||
29 | * Define indices for HW units and errors. Define structures used to carry error | ||
30 | * information. Declare prototype for APIs that are used to report GPU HW errors | ||
31 | * to the Safety_Services framework. | ||
32 | */ | ||
33 | |||
34 | #include <nvgpu/types.h> | ||
35 | #include <nvgpu/atomic.h> | ||
36 | |||
37 | struct gk20a; | ||
38 | |||
39 | /** | ||
40 | * @defgroup INDICES_FOR_GPU_HW_UNITS | ||
41 | * Macros used to assign unique index to GPU HW units. | ||
42 | * @{ | ||
43 | */ | ||
44 | #define NVGPU_ERR_MODULE_SM (0U) | ||
45 | #define NVGPU_ERR_MODULE_FECS (1U) | ||
46 | #define NVGPU_ERR_MODULE_PMU (2U) | ||
47 | /** | ||
48 | * @} | ||
49 | */ | ||
50 | |||
51 | /** | ||
52 | * @defgroup LIST_OF_ERRORS_REPORTED_FROM_SM | ||
53 | * Macros used to assign unique index to errors reported from the SM unit. | ||
54 | * @{ | ||
55 | */ | ||
56 | #define GPU_SM_L1_TAG_ECC_CORRECTED (0U) | ||
57 | #define GPU_SM_L1_TAG_ECC_UNCORRECTED (1U) | ||
58 | #define GPU_SM_CBU_ECC_UNCORRECTED (3U) | ||
59 | #define GPU_SM_LRF_ECC_UNCORRECTED (5U) | ||
60 | #define GPU_SM_L1_DATA_ECC_UNCORRECTED (7U) | ||
61 | #define GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED (9U) | ||
62 | #define GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED (11U) | ||
63 | #define GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED (13U) | ||
64 | #define GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED (15U) | ||
65 | #define GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED (17U) | ||
66 | #define GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED (20U) | ||
67 | /** | ||
68 | * @} | ||
69 | */ | ||
70 | |||
71 | /** | ||
72 | * @defgroup LIST_OF_ERRORS_REPORTED_FROM_FECS | ||
73 | * Macros used to assign unique index to errors reported from the FECS unit. | ||
74 | * @{ | ||
75 | */ | ||
76 | #define GPU_FECS_FALCON_IMEM_ECC_CORRECTED (0U) | ||
77 | #define GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED (1U) | ||
78 | #define GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED (3U) | ||
79 | /** | ||
80 | * @} | ||
81 | */ | ||
82 | |||
83 | /** | ||
84 | * @defgroup LIST_OF_ERRORS_REPORTED_FROM_GPCCS | ||
85 | * Macros used to assign unique index to errors reported from the GPCCS unit. | ||
86 | * @{ | ||
87 | */ | ||
88 | #define GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED (0U) | ||
89 | #define GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED (1U) | ||
90 | #define GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED (3U) | ||
91 | /** | ||
92 | * @} | ||
93 | */ | ||
94 | |||
95 | /** | ||
96 | * @defgroup LIST_OF_ERRORS_REPORTED_FROM_MMU | ||
97 | * Macros used to assign unique index to errors reported from the MMU unit. | ||
98 | * @{ | ||
99 | */ | ||
100 | #define GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED (1U) | ||
101 | #define GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED (3U) | ||
102 | /** | ||
103 | * @} | ||
104 | */ | ||
105 | |||
106 | /** | ||
107 | * @defgroup LIST_OF_ERRORS_REPORTED_FROM_GCC | ||
108 | * Macros used to assign unique index to errors reported from the GCC unit. | ||
109 | * @{ | ||
110 | */ | ||
111 | #define GPU_GCC_L15_ECC_UNCORRECTED (1U) | ||
112 | /** | ||
113 | * @} | ||
114 | */ | ||
115 | |||
116 | |||
117 | /** | ||
118 | * @defgroup LIST_OF_ERRORS_REPORTED_FROM_PMU | ||
119 | * Macros used to assign unique index to errors reported from the PMU unit. | ||
120 | * @{ | ||
121 | */ | ||
122 | #define GPU_PMU_FALCON_IMEM_ECC_CORRECTED (0U) | ||
123 | #define GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED (1U) | ||
124 | #define GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED (3U) | ||
125 | /** | ||
126 | * @} | ||
127 | */ | ||
128 | |||
129 | /** | ||
130 | * @defgroup LIST_OF_ERRORS_REPORTED_FROM_LTC | ||
131 | * Macros used to assign unique index to errors reported from the LTC unit. | ||
132 | * @{ | ||
133 | */ | ||
134 | #define GPU_LTC_CACHE_DSTG_ECC_CORRECTED (0U) | ||
135 | #define GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED (1U) | ||
136 | #define GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED (3U) | ||
137 | #define GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED (7U) | ||
138 | /** | ||
139 | * @} | ||
140 | */ | ||
141 | |||
142 | /** | ||
143 | * @defgroup LIST_OF_ERRORS_REPORTED_FROM_HUBMMU | ||
144 | * Macros used to assign unique index to errors reported from the HUBMMU unit. | ||
145 | * @{ | ||
146 | */ | ||
147 | #define GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED (1U) | ||
148 | #define GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED (3U) | ||
149 | #define GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED (5U) | ||
150 | #define GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED (7U) | ||
151 | #define GPU_HUBMMU_PAGE_FAULT_ERROR (8U) | ||
152 | |||
153 | |||
154 | #ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING | ||
155 | /** | ||
156 | * @} | ||
157 | */ | ||
158 | |||
159 | /** | ||
160 | * nvgpu_err_desc structure holds fields which describe an error: its name, | ||
161 | * criticality, reporting threshold, occurrence count and error ID. | ||
162 | */ | ||
163 | struct nvgpu_err_desc { | ||
164 | /** String representation of error. */ | ||
165 | const char *name; | ||
166 | |||
167 | /** Flag to classify an error as critical or non-critical. */ | ||
168 | bool is_critical; | ||
169 | |||
170 | /** | ||
171 | * Error Threshold: once this threshold value is reached, then the | ||
172 | * corresponding error counter will be reset to 0 and the error will be | ||
173 | * propagated to Safety_Services. | ||
174 | */ | ||
175 | int err_threshold; | ||
176 | |||
177 | /** | ||
178 | * Total number of times an error has occurred (since its last reset). | ||
179 | */ | ||
180 | nvgpu_atomic_t err_count; | ||
181 | |||
182 | /** Error ID. */ | ||
183 | u8 error_id; | ||
184 | }; | ||
185 | |||
186 | /** | ||
187 | * gpu_err_header structure holds fields which are required to identify the | ||
188 | * version of header, sub-error type, sub-unit id, error address and time stamp. | ||
189 | */ | ||
190 | struct gpu_err_header { | ||
191 | /** Version of GPU error header. */ | ||
192 | struct { | ||
193 | /** Major version number. */ | ||
194 | u16 major; | ||
195 | /** Minor version number. */ | ||
196 | u16 minor; | ||
197 | } version; | ||
198 | |||
199 | /** Sub error type corresponding to the error that is being reported. */ | ||
200 | u32 sub_err_type; | ||
201 | |||
202 | /** ID of the sub-unit in a HW unit which encountered an error. */ | ||
203 | u64 sub_unit_id; | ||
204 | |||
205 | /** Location of the error. */ | ||
206 | u64 address; | ||
207 | |||
208 | /** Timestamp in nanoseconds. */ | ||
209 | u64 timestamp_ns; | ||
210 | }; | ||
211 | |||
212 | struct gpu_ecc_error_info { | ||
213 | struct gpu_err_header header; | ||
214 | |||
215 | /** Number of ECC errors. */ | ||
216 | u64 err_cnt; | ||
217 | }; | ||
218 | |||
219 | /** | ||
220 | * nvgpu_err_hw_module structure holds fields which describe the h/w module's | ||
221 | * error reporting capabilities. | ||
222 | */ | ||
223 | struct nvgpu_err_hw_module { | ||
224 | /** String representation of a given HW unit. */ | ||
225 | const char *name; | ||
226 | |||
227 | /** HW unit ID. */ | ||
228 | u32 hw_unit; | ||
229 | |||
230 | /** Total number of errors reported from a given HW unit. */ | ||
231 | u32 num_errs; | ||
232 | |||
233 | u32 base_ecc_service_id; | ||
234 | |||
235 | /** Used to get error description from look-up table. */ | ||
236 | struct nvgpu_err_desc *errs; | ||
237 | }; | ||
238 | |||
239 | struct nvgpu_ecc_reporting_ops { | ||
240 | void (*report_ecc_err)(struct gk20a *g, u32 hw_unit, u32 inst, | ||
241 | u32 err_id, u64 err_addr, u64 err_count); | ||
242 | }; | ||
243 | |||
244 | struct nvgpu_ecc_reporting { | ||
245 | struct nvgpu_spinlock lock; | ||
246 | /* This flag is protected by the above spinlock */ | ||
247 | bool ecc_reporting_service_enabled; | ||
248 | const struct nvgpu_ecc_reporting_ops *ops; | ||
249 | }; | ||
250 | |||
251 | /** | ||
252 | * This macro is used to initialize the members of nvgpu_err_desc struct. | ||
253 | */ | ||
254 | #define GPU_ERR(err, critical, id, threshold, ecount) \ | ||
255 | { \ | ||
256 | .name = (err), \ | ||
257 | .is_critical = (critical), \ | ||
258 | .error_id = (id), \ | ||
259 | .err_threshold = (threshold), \ | ||
260 | .err_count = NVGPU_ATOMIC_INIT(ecount), \ | ||
261 | } | ||
262 | |||
263 | /** | ||
264 | * This macro is used to initialize critical errors. | ||
265 | */ | ||
266 | #define GPU_CRITERR(err, id, threshold, ecount) \ | ||
267 | GPU_ERR(err, true, id, threshold, ecount) | ||
268 | |||
269 | /** | ||
270 | * This macro is used to initialize non-critical errors. | ||
271 | */ | ||
272 | #define GPU_NONCRITERR(err, id, threshold, ecount) \ | ||
273 | GPU_ERR(err, false, id, threshold, ecount) | ||
274 | |||
275 | /** | ||
276 | * @brief GPU HW errors need to be reported to Safety_Services via SDL unit. | ||
277 | * This function provides an interface to report ECC errors to SDL unit. | ||
278 | * | ||
279 | * @param g [in] - The GPU driver struct. | ||
280 | * @param hw_unit [in] - Index of HW unit. | ||
281 | * - List of valid HW unit IDs | ||
282 | * - NVGPU_ERR_MODULE_SM | ||
283 | * - NVGPU_ERR_MODULE_FECS | ||
284 | * - NVGPU_ERR_MODULE_GPCCS | ||
285 | * - NVGPU_ERR_MODULE_MMU | ||
286 | * - NVGPU_ERR_MODULE_GCC | ||
287 | * - NVGPU_ERR_MODULE_PMU | ||
288 | * - NVGPU_ERR_MODULE_LTC | ||
289 | * - NVGPU_ERR_MODULE_HUBMMU | ||
290 | * @param inst [in] - Instance ID. | ||
291 | * - In case of multiple instances of the same HW | ||
292 | * unit (e.g., there are multiple instances of | ||
293 | * SM), it is used to identify the instance | ||
294 | * that encountered a fault. | ||
295 | * @param err_id [in] - Error index. | ||
296 | * - For SM: | ||
297 | * - Min: GPU_SM_L1_TAG_ECC_CORRECTED | ||
298 | * - Max: GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED | ||
299 | * - For FECS: | ||
300 | * - Min: GPU_FECS_FALCON_IMEM_ECC_CORRECTED | ||
301 | * - Max: GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED | ||
302 | * - For GPCCS: | ||
303 | * - Min: GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED | ||
304 | * - Max: GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED | ||
305 | * - For MMU: | ||
306 | * - Min: GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED | ||
307 | * - Max: GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED | ||
308 | * - For GCC: | ||
309 | * - Min: GPU_GCC_L15_ECC_UNCORRECTED | ||
310 | * - Max: GPU_GCC_L15_ECC_UNCORRECTED | ||
311 | * - For PMU: | ||
312 | * - Min: GPU_PMU_FALCON_IMEM_ECC_CORRECTED | ||
313 | * - Max: GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED | ||
314 | * - For LTC: | ||
315 | * - Min: GPU_LTC_CACHE_DSTG_ECC_CORRECTED | ||
316 | * - Max: GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED | ||
317 | * - For HUBMMU: | ||
318 | * - Min: GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED | ||
319 | * - Max: GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED | ||
320 | * @param err_addr [in] - Error address. | ||
321 | * - This is the location at which a correctable or | ||
322 | * uncorrectable error has occurred. | ||
323 | * @param err_count [in] - Error count. | ||
324 | * | ||
325 | * - Checks whether SDL is supported in the current GPU platform. If SDL is not | ||
326 | * supported, it simply returns. | ||
327 | * - Validates both \a hw_unit and \a err_id indices. In case of a failure, | ||
328 | * invokes #nvgpu_sdl_handle_report_failure() api. | ||
329 | * - Gets the current time of a clock. In case of a failure, invokes | ||
330 | * #nvgpu_sdl_handle_report_failure() api. | ||
331 | * - Gets error description from internal look-up table using \a hw_unit and | ||
332 | * \a err_id indices. | ||
333 | * - Forms error packet using details such as time-stamp, \a hw_unit, \a err_id, | ||
334 | * criticality of the error, \a inst, \a err_addr, \a err_count, error | ||
335 | * description, and size of the error packet. | ||
336 | * - Performs compile-time assert check to ensure that the size of the error | ||
337 | * packet does not exceed the maximum allowable size specified in | ||
338 | * #MAX_ERR_MSG_SIZE. | ||
339 | * | ||
340 | * @return None | ||
341 | */ | ||
342 | void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst, | ||
343 | u32 err_id, u64 err_addr, u64 err_count); | ||
344 | |||
345 | void nvgpu_init_ecc_reporting(struct gk20a *g); | ||
346 | void nvgpu_enable_ecc_reporting(struct gk20a *g); | ||
347 | void nvgpu_disable_ecc_reporting(struct gk20a *g); | ||
348 | void nvgpu_deinit_ecc_reporting(struct gk20a *g); | ||
349 | |||
350 | #else | ||
351 | |||
352 | static inline void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst, | ||
353 | u32 err_id, u64 err_addr, u64 err_count) { | ||
354 | |||
355 | } | ||
356 | |||
357 | #endif /* CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING */ | ||
358 | |||
359 | #endif /* NVGPU_NVGPU_ERR_H */ \ No newline at end of file | ||