diff options
author | Joshua Bakita <bakitajoshua@gmail.com> | 2023-10-29 13:07:40 -0400 |
---|---|---|
committer | Joshua Bakita <bakitajoshua@gmail.com> | 2023-10-29 13:10:52 -0400 |
commit | 2c5337a24f7f2d02989dfb733c55d6d8c7e90493 (patch) | |
tree | b9f1028cb443b03190b710c0d7ee640bf5958631 /include/os/linux/sdl.c | |
parent | aa06f84f03cba7ad1aae5cd527355bb3d8c152a6 (diff) |
Update includes to L4T r32.7.4 and drop nvgpu/gk20a.h dependency
Also add instructions for updating `include/`. These files are now
only needed to build on Linux 4.9-based Tegra platforms.
Diffstat (limited to 'include/os/linux/sdl.c')
-rw-r--r-- | include/os/linux/sdl.c | 341 |
1 files changed, 341 insertions, 0 deletions
diff --git a/include/os/linux/sdl.c b/include/os/linux/sdl.c new file mode 100644 index 0000000..c4dccdc --- /dev/null +++ b/include/os/linux/sdl.c | |||
@@ -0,0 +1,341 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2021, NVIDIA Corporation. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify it | ||
5 | * under the terms and conditions of the GNU General Public License, | ||
6 | * version 2, as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
9 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
10 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
11 | * more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public License | ||
14 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
15 | */ | ||
16 | |||
17 | #include <nvgpu/gk20a.h> | ||
18 | #include <nvgpu/types.h> | ||
19 | #include <nvgpu/nvgpu_err.h> | ||
20 | #include <nvgpu/timers.h> | ||
21 | #include <nvgpu/bug.h> | ||
22 | |||
23 | #include "ecc_linux.h" | ||
24 | #include "os_linux.h" | ||
25 | #include "module.h" | ||
26 | |||
27 | /* This look-up table initializes the list of hw units and their errors. | ||
28 | * It also specifies the error injection mechanism supported, for each error. | ||
29 | * In case of hw error injection support, this initialization will be overriden | ||
30 | * by the values provided from the hal layes of corresponding hw units. | ||
31 | */ | ||
32 | static struct nvgpu_err_hw_module gv11b_err_lut[] = { | ||
33 | { | ||
34 | .name = "sm", | ||
35 | .hw_unit = (u32)NVGPU_ERR_MODULE_SM, | ||
36 | .num_errs = 21U, | ||
37 | .base_ecc_service_id = | ||
38 | NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_ECC_CORRECTED, | ||
39 | .errs = (struct nvgpu_err_desc[]) { | ||
40 | GPU_NONCRITERR("l1_tag_ecc_corrected", | ||
41 | GPU_SM_L1_TAG_ECC_CORRECTED, 0, 0), | ||
42 | GPU_CRITERR("l1_tag_ecc_uncorrected", | ||
43 | GPU_SM_L1_TAG_ECC_UNCORRECTED, 0, 0), | ||
44 | GPU_NONCRITERR("cbu_ecc_corrected", 0, 0, 0), | ||
45 | GPU_CRITERR("cbu_ecc_uncorrected", | ||
46 | GPU_SM_CBU_ECC_UNCORRECTED, 0, 0), | ||
47 | GPU_NONCRITERR("lrf_ecc_corrected", 0, 0, 0), | ||
48 | GPU_CRITERR("lrf_ecc_uncorrected", | ||
49 | GPU_SM_LRF_ECC_UNCORRECTED, 0, 0), | ||
50 | GPU_NONCRITERR("l1_data_ecc_corrected", 0, 0, 0), | ||
51 | GPU_CRITERR("l1_data_ecc_uncorrected", | ||
52 | GPU_SM_L1_DATA_ECC_UNCORRECTED, 0, 0), | ||
53 | GPU_NONCRITERR("icache_l0_data_ecc_corrected", 0, 0, 0), | ||
54 | GPU_CRITERR("icache_l0_data_ecc_uncorrected", | ||
55 | GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED, 0, 0), | ||
56 | GPU_NONCRITERR("icache_l1_data_ecc_corrected", 0, 0, 0), | ||
57 | GPU_CRITERR("icache_l1_data_ecc_uncorrected", | ||
58 | GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED, 0, 0), | ||
59 | GPU_NONCRITERR("icache_l0_predecode_ecc_corrected", 0, 0, 0), | ||
60 | GPU_CRITERR("icache_l0_predecode_ecc_uncorrected", | ||
61 | GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED, 0, 0), | ||
62 | GPU_NONCRITERR("l1_tag_miss_fifo_ecc_corrected", 0, 0, 0), | ||
63 | GPU_CRITERR("l1_tag_miss_fifo_ecc_uncorrected", | ||
64 | GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED, 0, 0), | ||
65 | GPU_NONCRITERR("l1_tag_s2r_pixprf_ecc_corrected", 0, 0, 0), | ||
66 | GPU_CRITERR("l1_tag_s2r_pixprf_ecc_uncorrected", | ||
67 | GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED, 0, 0), | ||
68 | GPU_CRITERR("machine_check_error", 0, 0, 0), | ||
69 | GPU_NONCRITERR("icache_l1_predecode_ecc_corrected", 0, 0, 0), | ||
70 | GPU_CRITERR("icache_l1_predecode_ecc_uncorrected", | ||
71 | GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED, 0, 0), | ||
72 | }, | ||
73 | }, | ||
74 | { | ||
75 | .name = "fecs", | ||
76 | .hw_unit = (u32)NVGPU_ERR_MODULE_FECS, | ||
77 | .num_errs = 4U, | ||
78 | .base_ecc_service_id = | ||
79 | NVGUARD_SERVICE_IGPU_FECS_SWERR_FALCON_IMEM_ECC_CORRECTED, | ||
80 | .errs = (struct nvgpu_err_desc[]) { | ||
81 | GPU_NONCRITERR("falcon_imem_ecc_corrected", | ||
82 | GPU_FECS_FALCON_IMEM_ECC_CORRECTED, 0, 0), | ||
83 | GPU_CRITERR("falcon_imem_ecc_uncorrected", | ||
84 | GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED, 0, 0), | ||
85 | GPU_NONCRITERR("falcon_dmem_ecc_corrected", 0, 0, 0), | ||
86 | GPU_CRITERR("falcon_dmem_ecc_uncorrected", | ||
87 | GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED, 0, 0), | ||
88 | }, | ||
89 | }, | ||
90 | { | ||
91 | .name = "pmu", | ||
92 | .hw_unit = NVGPU_ERR_MODULE_PMU, | ||
93 | .num_errs = 4U, | ||
94 | .base_ecc_service_id = | ||
95 | NVGUARD_SERVICE_IGPU_PMU_SWERR_FALCON_IMEM_ECC_CORRECTED, | ||
96 | .errs = (struct nvgpu_err_desc[]) { | ||
97 | GPU_NONCRITERR("falcon_imem_ecc_corrected", | ||
98 | GPU_PMU_FALCON_IMEM_ECC_CORRECTED, 0, 0), | ||
99 | GPU_CRITERR("falcon_imem_ecc_uncorrected", | ||
100 | GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED, 0, 0), | ||
101 | GPU_NONCRITERR("falcon_dmem_ecc_corrected", 0, 0, 0), | ||
102 | GPU_CRITERR("falcon_dmem_ecc_uncorrected", | ||
103 | GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED, 0, 0), | ||
104 | }, | ||
105 | }, | ||
106 | }; | ||
107 | |||
108 | static void nvgpu_init_err_msg_header(struct gpu_err_header *header) | ||
109 | { | ||
110 | header->version.major = (u16)1U; | ||
111 | header->version.minor = (u16)0U; | ||
112 | header->sub_err_type = 0U; | ||
113 | header->sub_unit_id = 0UL; | ||
114 | header->address = 0UL; | ||
115 | header->timestamp_ns = 0UL; | ||
116 | } | ||
117 | |||
118 | static void nvgpu_init_ecc_err_msg(struct gpu_ecc_error_info *err_info) | ||
119 | { | ||
120 | nvgpu_init_err_msg_header(&err_info->header); | ||
121 | err_info->err_cnt = 0UL; | ||
122 | } | ||
123 | |||
124 | static void nvgpu_report_ecc_error_linux(struct gk20a *g, u32 hw_unit, u32 inst, | ||
125 | u32 err_id, u64 err_addr, u64 err_count) | ||
126 | { | ||
127 | int err = 0; | ||
128 | u32 s_id = 0; | ||
129 | u8 err_status = 0; | ||
130 | u8 err_info_size = 0; | ||
131 | u64 timestamp = 0ULL; | ||
132 | int err_threshold_counter = 0; | ||
133 | struct gpu_ecc_error_info err_pkt; | ||
134 | struct nvgpu_err_desc *err_desc = NULL; | ||
135 | struct nvgpu_err_hw_module *hw_module = NULL; | ||
136 | nv_guard_request_t req; | ||
137 | |||
138 | memset(&req, 0, sizeof(req)); | ||
139 | nvgpu_init_ecc_err_msg(&err_pkt); | ||
140 | if (hw_unit >= sizeof(gv11b_err_lut)/sizeof(gv11b_err_lut[0])) { | ||
141 | err = -EINVAL; | ||
142 | goto done; | ||
143 | } | ||
144 | |||
145 | hw_module = &gv11b_err_lut[hw_unit]; | ||
146 | if (err_id >= hw_module->num_errs) { | ||
147 | nvgpu_err(g, "invalid err_id (%u) for hw module (%u)", | ||
148 | err_id, hw_module->hw_unit); | ||
149 | err = -EINVAL; | ||
150 | goto done; | ||
151 | } | ||
152 | err_desc = &hw_module->errs[err_id]; | ||
153 | timestamp = (u64)nvgpu_current_time_ns(); | ||
154 | |||
155 | err_pkt.header.timestamp_ns = timestamp; | ||
156 | err_pkt.header.sub_unit_id = inst; | ||
157 | err_pkt.header.address = err_addr; | ||
158 | err_pkt.err_cnt = err_count; | ||
159 | err_info_size = sizeof(err_pkt); | ||
160 | |||
161 | s_id = hw_module->base_ecc_service_id + err_id; | ||
162 | |||
163 | if (err_desc->is_critical) { | ||
164 | err_status = NVGUARD_ERROR_DETECTED; | ||
165 | } else { | ||
166 | err_status = NVGUARD_NO_ERROR; | ||
167 | } | ||
168 | |||
169 | nvgpu_atomic_inc(&err_desc->err_count); | ||
170 | err_threshold_counter = nvgpu_atomic_cmpxchg(&err_desc->err_count, | ||
171 | err_desc->err_threshold + 1, 0); | ||
172 | |||
173 | if (unlikely(err_threshold_counter != err_desc->err_threshold + 1)) { | ||
174 | goto done; | ||
175 | } | ||
176 | |||
177 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting hw: %s, desc:%s, count:%llu", | ||
178 | hw_module->name, err_desc->name, err_count); | ||
179 | |||
180 | req.srv_id_cmd = NVGUARD_SERVICESTATUS_NOTIFICATION; | ||
181 | req.srv_status.srv_id = (nv_guard_service_id_t)s_id; | ||
182 | req.srv_status.status = err_status; | ||
183 | req.srv_status.timestamp = timestamp; | ||
184 | req.srv_status.error_info_size = err_info_size; | ||
185 | memcpy(req.srv_status.error_info, (u8*)&err_pkt, err_info_size); | ||
186 | |||
187 | /* | ||
188 | * l1ss_submit_rq may fail due to kmalloc failures but may pass in | ||
189 | * subsequent calls | ||
190 | */ | ||
191 | err = l1ss_submit_rq(&req, true); | ||
192 | if (err != 0) { | ||
193 | nvgpu_err(g, "Error returned from L1SS submit %d", err); | ||
194 | } | ||
195 | |||
196 | if (err_desc->is_critical) { | ||
197 | nvgpu_quiesce(g); | ||
198 | } | ||
199 | |||
200 | done: | ||
201 | return; | ||
202 | } | ||
203 | |||
204 | static void nvgpu_report_ecc_error_empty(struct gk20a *g, u32 hw_unit, u32 inst, | ||
205 | u32 err_id, u64 err_addr, u64 err_count) { | ||
206 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting empty"); | ||
207 | } | ||
208 | |||
209 | const struct nvgpu_ecc_reporting_ops default_disabled_ecc_report_ops = { | ||
210 | .report_ecc_err = nvgpu_report_ecc_error_empty, | ||
211 | }; | ||
212 | |||
213 | const struct nvgpu_ecc_reporting_ops ecc_enable_report_ops = { | ||
214 | .report_ecc_err = nvgpu_report_ecc_error_linux, | ||
215 | }; | ||
216 | |||
217 | static int nvgpu_l1ss_callback(l1ss_cli_callback_param param, void *data) | ||
218 | { | ||
219 | struct gk20a *g = (struct gk20a *)data; | ||
220 | struct nvgpu_os_linux *l = NULL; | ||
221 | struct nvgpu_ecc_reporting_linux *ecc_reporting_linux = NULL; | ||
222 | int err = 0; | ||
223 | /* Ensure we have a valid gk20a struct before proceeding */ | ||
224 | if ((g == NULL) || (gk20a_get(g) == NULL)) { | ||
225 | return -ENODEV; | ||
226 | } | ||
227 | |||
228 | l = nvgpu_os_linux_from_gk20a(g); | ||
229 | ecc_reporting_linux = &l->ecc_reporting_linux; | ||
230 | |||
231 | nvgpu_spinlock_acquire(&ecc_reporting_linux->common.lock); | ||
232 | if (param == L1SS_READY) { | ||
233 | if (!ecc_reporting_linux->common.ecc_reporting_service_enabled) { | ||
234 | ecc_reporting_linux->common.ecc_reporting_service_enabled = true; | ||
235 | ecc_reporting_linux->common.ops = &ecc_enable_report_ops; | ||
236 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is enabled"); | ||
237 | } | ||
238 | } else if (param == L1SS_NOT_READY) { | ||
239 | if (ecc_reporting_linux->common.ecc_reporting_service_enabled) { | ||
240 | ecc_reporting_linux->common.ecc_reporting_service_enabled = false; | ||
241 | ecc_reporting_linux->common.ops = &default_disabled_ecc_report_ops; | ||
242 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is disabled"); | ||
243 | } | ||
244 | } else { | ||
245 | err = -EINVAL; | ||
246 | } | ||
247 | nvgpu_spinlock_release(&ecc_reporting_linux->common.lock); | ||
248 | |||
249 | gk20a_put(g); | ||
250 | |||
251 | return err; | ||
252 | } | ||
253 | |||
254 | void nvgpu_init_ecc_reporting(struct gk20a *g) | ||
255 | { | ||
256 | struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); | ||
257 | struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux; | ||
258 | int err = 0; | ||
259 | /* This will invoke the registration API */ | ||
260 | nvgpu_spinlock_init(&ecc_report_linux->common.lock); | ||
261 | ecc_report_linux->priv.id = (NVGUARD_GROUPID_IGPU & NVGUARD_GROUPINDEX_FIELDMASK); | ||
262 | ecc_report_linux->priv.cli_callback = nvgpu_l1ss_callback; | ||
263 | ecc_report_linux->priv.data = g; | ||
264 | ecc_report_linux->common.ops = &default_disabled_ecc_report_ops; | ||
265 | |||
266 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting Init"); | ||
267 | |||
268 | /* | ||
269 | * err == 0 indicates service is available but not active yet. | ||
270 | * err == 1 indicates service is available and active | ||
271 | * error for other cases. | ||
272 | */ | ||
273 | err = l1ss_register_client(&ecc_report_linux->priv); | ||
274 | if (err == 0) { | ||
275 | ecc_report_linux->common.ecc_reporting_service_enabled = false; | ||
276 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting init success"); | ||
277 | } else if (err == 1) { | ||
278 | ecc_report_linux->common.ecc_reporting_service_enabled = true; | ||
279 | /* Actual Ops will be replaced during nvgpu_enable_ecc_reporting | ||
280 | * called as part of gk20a_busy() | ||
281 | */ | ||
282 | } else { | ||
283 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting init failure %d", err); | ||
284 | } | ||
285 | } | ||
286 | |||
287 | void nvgpu_deinit_ecc_reporting(struct gk20a *g) | ||
288 | { | ||
289 | struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); | ||
290 | struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux; | ||
291 | |||
292 | if (ecc_report_linux->common.ecc_reporting_service_enabled) { | ||
293 | ecc_report_linux->common.ecc_reporting_service_enabled = false; | ||
294 | l1ss_deregister_client(ecc_report_linux->priv.id); | ||
295 | memset(ecc_report_linux, 0, sizeof(*ecc_report_linux)); | ||
296 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting de-init success"); | ||
297 | } | ||
298 | |||
299 | } | ||
300 | |||
301 | void nvgpu_enable_ecc_reporting(struct gk20a *g) | ||
302 | { | ||
303 | struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); | ||
304 | struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux; | ||
305 | struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common; | ||
306 | |||
307 | nvgpu_spinlock_acquire(&ecc_report_linux->common.lock); | ||
308 | if (error_reporting->ecc_reporting_service_enabled) { | ||
309 | error_reporting->ops = &ecc_enable_report_ops; | ||
310 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is enabled"); | ||
311 | } | ||
312 | nvgpu_spinlock_release(&ecc_report_linux->common.lock); | ||
313 | } | ||
314 | |||
315 | void nvgpu_disable_ecc_reporting(struct gk20a *g) | ||
316 | { | ||
317 | struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); | ||
318 | struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux; | ||
319 | struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common; | ||
320 | |||
321 | nvgpu_spinlock_acquire(&ecc_report_linux->common.lock); | ||
322 | error_reporting->ops = &default_disabled_ecc_report_ops; | ||
323 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is disabled"); | ||
324 | nvgpu_spinlock_release(&ecc_report_linux->common.lock); | ||
325 | } | ||
326 | |||
327 | void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst, | ||
328 | u32 err_id, u64 err_addr, u64 err_count) | ||
329 | { | ||
330 | struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); | ||
331 | struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux; | ||
332 | struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common; | ||
333 | void (*report_ecc_err_func)(struct gk20a *g, u32 hw_unit, u32 inst, | ||
334 | u32 err_id, u64 err_addr, u64 err_count); | ||
335 | |||
336 | nvgpu_spinlock_acquire(&ecc_report_linux->common.lock); | ||
337 | report_ecc_err_func = error_reporting->ops->report_ecc_err; | ||
338 | nvgpu_spinlock_release(&ecc_report_linux->common.lock); | ||
339 | |||
340 | report_ecc_err_func(g, hw_unit, inst, err_id, err_addr, err_count); | ||
341 | } | ||