diff options
author | Debarshi Dutta <ddutta@nvidia.com> | 2021-05-17 04:38:25 -0400 |
---|---|---|
committer | mobile promotions <svcmobile_promotions@nvidia.com> | 2021-05-28 15:10:24 -0400 |
commit | 34993e4f7b0d47620e88ba64a6d7c67330d97e35 (patch) | |
tree | 2136284f5bd4095780884885413bb268fd318a96 /drivers/gpu/nvgpu/os/linux/sdl.c | |
parent | 5f88598b9e7b2cfe0387733577ece138a7bc912b (diff) |
gpu: nvgpu: Add ECC Support for GV11B in Linux
Implement nvgpu plumbing to allow reporting ECC errors (corrected
and uncorrected) to an L1SS service (if one exists).
This patch includes the following
1) Added code that submits ECC error reports via the Interrupt context
directly to a L1SS service in linux OS.
2) Added support for enabling/disabling the error reports via L1SS's
registration/deregistration API. Nvgpu simply invokes an empty function
until the registration is successful.
3) Added a spinlock to correctly handle concurrent access to the
ops used for submitting requests.
4) Adds error reporting for a subset of interrupts that can be verified
via external ECC injection logic. A subsequent patch will add the
API for rest of the interrupts.
5) In case of critical (uncorrected) errors, change nvgpu's state to
the quiesce state.
Jira L4T-1187
Bug 200700400
Change-Id: Id31f70531fba355e94e72c4f9762593e7667a11c
Signed-off-by: Debarshi Dutta <ddutta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2530411
Tested-by: Bibek Basu <bbasu@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: Bibek Basu <bbasu@nvidia.com>
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
Diffstat (limited to 'drivers/gpu/nvgpu/os/linux/sdl.c')
-rw-r--r-- | drivers/gpu/nvgpu/os/linux/sdl.c | 341 |
1 files changed, 341 insertions, 0 deletions
diff --git a/drivers/gpu/nvgpu/os/linux/sdl.c b/drivers/gpu/nvgpu/os/linux/sdl.c new file mode 100644 index 00000000..c4dccdc6 --- /dev/null +++ b/drivers/gpu/nvgpu/os/linux/sdl.c | |||
@@ -0,0 +1,341 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2021, NVIDIA Corporation. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify it | ||
5 | * under the terms and conditions of the GNU General Public License, | ||
6 | * version 2, as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
9 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
10 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
11 | * more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public License | ||
14 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
15 | */ | ||
16 | |||
17 | #include <nvgpu/gk20a.h> | ||
18 | #include <nvgpu/types.h> | ||
19 | #include <nvgpu/nvgpu_err.h> | ||
20 | #include <nvgpu/timers.h> | ||
21 | #include <nvgpu/bug.h> | ||
22 | |||
23 | #include "ecc_linux.h" | ||
24 | #include "os_linux.h" | ||
25 | #include "module.h" | ||
26 | |||
27 | /* This look-up table initializes the list of hw units and their errors. | ||
28 | * It also specifies the error injection mechanism supported, for each error. | ||
29 | * In case of hw error injection support, this initialization will be overriden | ||
30 | * by the values provided from the hal layes of corresponding hw units. | ||
31 | */ | ||
32 | static struct nvgpu_err_hw_module gv11b_err_lut[] = { | ||
33 | { | ||
34 | .name = "sm", | ||
35 | .hw_unit = (u32)NVGPU_ERR_MODULE_SM, | ||
36 | .num_errs = 21U, | ||
37 | .base_ecc_service_id = | ||
38 | NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_ECC_CORRECTED, | ||
39 | .errs = (struct nvgpu_err_desc[]) { | ||
40 | GPU_NONCRITERR("l1_tag_ecc_corrected", | ||
41 | GPU_SM_L1_TAG_ECC_CORRECTED, 0, 0), | ||
42 | GPU_CRITERR("l1_tag_ecc_uncorrected", | ||
43 | GPU_SM_L1_TAG_ECC_UNCORRECTED, 0, 0), | ||
44 | GPU_NONCRITERR("cbu_ecc_corrected", 0, 0, 0), | ||
45 | GPU_CRITERR("cbu_ecc_uncorrected", | ||
46 | GPU_SM_CBU_ECC_UNCORRECTED, 0, 0), | ||
47 | GPU_NONCRITERR("lrf_ecc_corrected", 0, 0, 0), | ||
48 | GPU_CRITERR("lrf_ecc_uncorrected", | ||
49 | GPU_SM_LRF_ECC_UNCORRECTED, 0, 0), | ||
50 | GPU_NONCRITERR("l1_data_ecc_corrected", 0, 0, 0), | ||
51 | GPU_CRITERR("l1_data_ecc_uncorrected", | ||
52 | GPU_SM_L1_DATA_ECC_UNCORRECTED, 0, 0), | ||
53 | GPU_NONCRITERR("icache_l0_data_ecc_corrected", 0, 0, 0), | ||
54 | GPU_CRITERR("icache_l0_data_ecc_uncorrected", | ||
55 | GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED, 0, 0), | ||
56 | GPU_NONCRITERR("icache_l1_data_ecc_corrected", 0, 0, 0), | ||
57 | GPU_CRITERR("icache_l1_data_ecc_uncorrected", | ||
58 | GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED, 0, 0), | ||
59 | GPU_NONCRITERR("icache_l0_predecode_ecc_corrected", 0, 0, 0), | ||
60 | GPU_CRITERR("icache_l0_predecode_ecc_uncorrected", | ||
61 | GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED, 0, 0), | ||
62 | GPU_NONCRITERR("l1_tag_miss_fifo_ecc_corrected", 0, 0, 0), | ||
63 | GPU_CRITERR("l1_tag_miss_fifo_ecc_uncorrected", | ||
64 | GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED, 0, 0), | ||
65 | GPU_NONCRITERR("l1_tag_s2r_pixprf_ecc_corrected", 0, 0, 0), | ||
66 | GPU_CRITERR("l1_tag_s2r_pixprf_ecc_uncorrected", | ||
67 | GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED, 0, 0), | ||
68 | GPU_CRITERR("machine_check_error", 0, 0, 0), | ||
69 | GPU_NONCRITERR("icache_l1_predecode_ecc_corrected", 0, 0, 0), | ||
70 | GPU_CRITERR("icache_l1_predecode_ecc_uncorrected", | ||
71 | GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED, 0, 0), | ||
72 | }, | ||
73 | }, | ||
74 | { | ||
75 | .name = "fecs", | ||
76 | .hw_unit = (u32)NVGPU_ERR_MODULE_FECS, | ||
77 | .num_errs = 4U, | ||
78 | .base_ecc_service_id = | ||
79 | NVGUARD_SERVICE_IGPU_FECS_SWERR_FALCON_IMEM_ECC_CORRECTED, | ||
80 | .errs = (struct nvgpu_err_desc[]) { | ||
81 | GPU_NONCRITERR("falcon_imem_ecc_corrected", | ||
82 | GPU_FECS_FALCON_IMEM_ECC_CORRECTED, 0, 0), | ||
83 | GPU_CRITERR("falcon_imem_ecc_uncorrected", | ||
84 | GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED, 0, 0), | ||
85 | GPU_NONCRITERR("falcon_dmem_ecc_corrected", 0, 0, 0), | ||
86 | GPU_CRITERR("falcon_dmem_ecc_uncorrected", | ||
87 | GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED, 0, 0), | ||
88 | }, | ||
89 | }, | ||
90 | { | ||
91 | .name = "pmu", | ||
92 | .hw_unit = NVGPU_ERR_MODULE_PMU, | ||
93 | .num_errs = 4U, | ||
94 | .base_ecc_service_id = | ||
95 | NVGUARD_SERVICE_IGPU_PMU_SWERR_FALCON_IMEM_ECC_CORRECTED, | ||
96 | .errs = (struct nvgpu_err_desc[]) { | ||
97 | GPU_NONCRITERR("falcon_imem_ecc_corrected", | ||
98 | GPU_PMU_FALCON_IMEM_ECC_CORRECTED, 0, 0), | ||
99 | GPU_CRITERR("falcon_imem_ecc_uncorrected", | ||
100 | GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED, 0, 0), | ||
101 | GPU_NONCRITERR("falcon_dmem_ecc_corrected", 0, 0, 0), | ||
102 | GPU_CRITERR("falcon_dmem_ecc_uncorrected", | ||
103 | GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED, 0, 0), | ||
104 | }, | ||
105 | }, | ||
106 | }; | ||
107 | |||
108 | static void nvgpu_init_err_msg_header(struct gpu_err_header *header) | ||
109 | { | ||
110 | header->version.major = (u16)1U; | ||
111 | header->version.minor = (u16)0U; | ||
112 | header->sub_err_type = 0U; | ||
113 | header->sub_unit_id = 0UL; | ||
114 | header->address = 0UL; | ||
115 | header->timestamp_ns = 0UL; | ||
116 | } | ||
117 | |||
118 | static void nvgpu_init_ecc_err_msg(struct gpu_ecc_error_info *err_info) | ||
119 | { | ||
120 | nvgpu_init_err_msg_header(&err_info->header); | ||
121 | err_info->err_cnt = 0UL; | ||
122 | } | ||
123 | |||
124 | static void nvgpu_report_ecc_error_linux(struct gk20a *g, u32 hw_unit, u32 inst, | ||
125 | u32 err_id, u64 err_addr, u64 err_count) | ||
126 | { | ||
127 | int err = 0; | ||
128 | u32 s_id = 0; | ||
129 | u8 err_status = 0; | ||
130 | u8 err_info_size = 0; | ||
131 | u64 timestamp = 0ULL; | ||
132 | int err_threshold_counter = 0; | ||
133 | struct gpu_ecc_error_info err_pkt; | ||
134 | struct nvgpu_err_desc *err_desc = NULL; | ||
135 | struct nvgpu_err_hw_module *hw_module = NULL; | ||
136 | nv_guard_request_t req; | ||
137 | |||
138 | memset(&req, 0, sizeof(req)); | ||
139 | nvgpu_init_ecc_err_msg(&err_pkt); | ||
140 | if (hw_unit >= sizeof(gv11b_err_lut)/sizeof(gv11b_err_lut[0])) { | ||
141 | err = -EINVAL; | ||
142 | goto done; | ||
143 | } | ||
144 | |||
145 | hw_module = &gv11b_err_lut[hw_unit]; | ||
146 | if (err_id >= hw_module->num_errs) { | ||
147 | nvgpu_err(g, "invalid err_id (%u) for hw module (%u)", | ||
148 | err_id, hw_module->hw_unit); | ||
149 | err = -EINVAL; | ||
150 | goto done; | ||
151 | } | ||
152 | err_desc = &hw_module->errs[err_id]; | ||
153 | timestamp = (u64)nvgpu_current_time_ns(); | ||
154 | |||
155 | err_pkt.header.timestamp_ns = timestamp; | ||
156 | err_pkt.header.sub_unit_id = inst; | ||
157 | err_pkt.header.address = err_addr; | ||
158 | err_pkt.err_cnt = err_count; | ||
159 | err_info_size = sizeof(err_pkt); | ||
160 | |||
161 | s_id = hw_module->base_ecc_service_id + err_id; | ||
162 | |||
163 | if (err_desc->is_critical) { | ||
164 | err_status = NVGUARD_ERROR_DETECTED; | ||
165 | } else { | ||
166 | err_status = NVGUARD_NO_ERROR; | ||
167 | } | ||
168 | |||
169 | nvgpu_atomic_inc(&err_desc->err_count); | ||
170 | err_threshold_counter = nvgpu_atomic_cmpxchg(&err_desc->err_count, | ||
171 | err_desc->err_threshold + 1, 0); | ||
172 | |||
173 | if (unlikely(err_threshold_counter != err_desc->err_threshold + 1)) { | ||
174 | goto done; | ||
175 | } | ||
176 | |||
177 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting hw: %s, desc:%s, count:%llu", | ||
178 | hw_module->name, err_desc->name, err_count); | ||
179 | |||
180 | req.srv_id_cmd = NVGUARD_SERVICESTATUS_NOTIFICATION; | ||
181 | req.srv_status.srv_id = (nv_guard_service_id_t)s_id; | ||
182 | req.srv_status.status = err_status; | ||
183 | req.srv_status.timestamp = timestamp; | ||
184 | req.srv_status.error_info_size = err_info_size; | ||
185 | memcpy(req.srv_status.error_info, (u8*)&err_pkt, err_info_size); | ||
186 | |||
187 | /* | ||
188 | * l1ss_submit_rq may fail due to kmalloc failures but may pass in | ||
189 | * subsequent calls | ||
190 | */ | ||
191 | err = l1ss_submit_rq(&req, true); | ||
192 | if (err != 0) { | ||
193 | nvgpu_err(g, "Error returned from L1SS submit %d", err); | ||
194 | } | ||
195 | |||
196 | if (err_desc->is_critical) { | ||
197 | nvgpu_quiesce(g); | ||
198 | } | ||
199 | |||
200 | done: | ||
201 | return; | ||
202 | } | ||
203 | |||
204 | static void nvgpu_report_ecc_error_empty(struct gk20a *g, u32 hw_unit, u32 inst, | ||
205 | u32 err_id, u64 err_addr, u64 err_count) { | ||
206 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting empty"); | ||
207 | } | ||
208 | |||
209 | const struct nvgpu_ecc_reporting_ops default_disabled_ecc_report_ops = { | ||
210 | .report_ecc_err = nvgpu_report_ecc_error_empty, | ||
211 | }; | ||
212 | |||
213 | const struct nvgpu_ecc_reporting_ops ecc_enable_report_ops = { | ||
214 | .report_ecc_err = nvgpu_report_ecc_error_linux, | ||
215 | }; | ||
216 | |||
217 | static int nvgpu_l1ss_callback(l1ss_cli_callback_param param, void *data) | ||
218 | { | ||
219 | struct gk20a *g = (struct gk20a *)data; | ||
220 | struct nvgpu_os_linux *l = NULL; | ||
221 | struct nvgpu_ecc_reporting_linux *ecc_reporting_linux = NULL; | ||
222 | int err = 0; | ||
223 | /* Ensure we have a valid gk20a struct before proceeding */ | ||
224 | if ((g == NULL) || (gk20a_get(g) == NULL)) { | ||
225 | return -ENODEV; | ||
226 | } | ||
227 | |||
228 | l = nvgpu_os_linux_from_gk20a(g); | ||
229 | ecc_reporting_linux = &l->ecc_reporting_linux; | ||
230 | |||
231 | nvgpu_spinlock_acquire(&ecc_reporting_linux->common.lock); | ||
232 | if (param == L1SS_READY) { | ||
233 | if (!ecc_reporting_linux->common.ecc_reporting_service_enabled) { | ||
234 | ecc_reporting_linux->common.ecc_reporting_service_enabled = true; | ||
235 | ecc_reporting_linux->common.ops = &ecc_enable_report_ops; | ||
236 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is enabled"); | ||
237 | } | ||
238 | } else if (param == L1SS_NOT_READY) { | ||
239 | if (ecc_reporting_linux->common.ecc_reporting_service_enabled) { | ||
240 | ecc_reporting_linux->common.ecc_reporting_service_enabled = false; | ||
241 | ecc_reporting_linux->common.ops = &default_disabled_ecc_report_ops; | ||
242 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is disabled"); | ||
243 | } | ||
244 | } else { | ||
245 | err = -EINVAL; | ||
246 | } | ||
247 | nvgpu_spinlock_release(&ecc_reporting_linux->common.lock); | ||
248 | |||
249 | gk20a_put(g); | ||
250 | |||
251 | return err; | ||
252 | } | ||
253 | |||
254 | void nvgpu_init_ecc_reporting(struct gk20a *g) | ||
255 | { | ||
256 | struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); | ||
257 | struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux; | ||
258 | int err = 0; | ||
259 | /* This will invoke the registration API */ | ||
260 | nvgpu_spinlock_init(&ecc_report_linux->common.lock); | ||
261 | ecc_report_linux->priv.id = (NVGUARD_GROUPID_IGPU & NVGUARD_GROUPINDEX_FIELDMASK); | ||
262 | ecc_report_linux->priv.cli_callback = nvgpu_l1ss_callback; | ||
263 | ecc_report_linux->priv.data = g; | ||
264 | ecc_report_linux->common.ops = &default_disabled_ecc_report_ops; | ||
265 | |||
266 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting Init"); | ||
267 | |||
268 | /* | ||
269 | * err == 0 indicates service is available but not active yet. | ||
270 | * err == 1 indicates service is available and active | ||
271 | * error for other cases. | ||
272 | */ | ||
273 | err = l1ss_register_client(&ecc_report_linux->priv); | ||
274 | if (err == 0) { | ||
275 | ecc_report_linux->common.ecc_reporting_service_enabled = false; | ||
276 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting init success"); | ||
277 | } else if (err == 1) { | ||
278 | ecc_report_linux->common.ecc_reporting_service_enabled = true; | ||
279 | /* Actual Ops will be replaced during nvgpu_enable_ecc_reporting | ||
280 | * called as part of gk20a_busy() | ||
281 | */ | ||
282 | } else { | ||
283 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting init failure %d", err); | ||
284 | } | ||
285 | } | ||
286 | |||
287 | void nvgpu_deinit_ecc_reporting(struct gk20a *g) | ||
288 | { | ||
289 | struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); | ||
290 | struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux; | ||
291 | |||
292 | if (ecc_report_linux->common.ecc_reporting_service_enabled) { | ||
293 | ecc_report_linux->common.ecc_reporting_service_enabled = false; | ||
294 | l1ss_deregister_client(ecc_report_linux->priv.id); | ||
295 | memset(ecc_report_linux, 0, sizeof(*ecc_report_linux)); | ||
296 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting de-init success"); | ||
297 | } | ||
298 | |||
299 | } | ||
300 | |||
301 | void nvgpu_enable_ecc_reporting(struct gk20a *g) | ||
302 | { | ||
303 | struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); | ||
304 | struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux; | ||
305 | struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common; | ||
306 | |||
307 | nvgpu_spinlock_acquire(&ecc_report_linux->common.lock); | ||
308 | if (error_reporting->ecc_reporting_service_enabled) { | ||
309 | error_reporting->ops = &ecc_enable_report_ops; | ||
310 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is enabled"); | ||
311 | } | ||
312 | nvgpu_spinlock_release(&ecc_report_linux->common.lock); | ||
313 | } | ||
314 | |||
315 | void nvgpu_disable_ecc_reporting(struct gk20a *g) | ||
316 | { | ||
317 | struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); | ||
318 | struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux; | ||
319 | struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common; | ||
320 | |||
321 | nvgpu_spinlock_acquire(&ecc_report_linux->common.lock); | ||
322 | error_reporting->ops = &default_disabled_ecc_report_ops; | ||
323 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is disabled"); | ||
324 | nvgpu_spinlock_release(&ecc_report_linux->common.lock); | ||
325 | } | ||
326 | |||
327 | void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst, | ||
328 | u32 err_id, u64 err_addr, u64 err_count) | ||
329 | { | ||
330 | struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); | ||
331 | struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux; | ||
332 | struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common; | ||
333 | void (*report_ecc_err_func)(struct gk20a *g, u32 hw_unit, u32 inst, | ||
334 | u32 err_id, u64 err_addr, u64 err_count); | ||
335 | |||
336 | nvgpu_spinlock_acquire(&ecc_report_linux->common.lock); | ||
337 | report_ecc_err_func = error_reporting->ops->report_ecc_err; | ||
338 | nvgpu_spinlock_release(&ecc_report_linux->common.lock); | ||
339 | |||
340 | report_ecc_err_func(g, hw_unit, inst, err_id, err_addr, err_count); | ||
341 | } | ||