summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/nvgpu/os/linux/sdl.c
diff options
context:
space:
mode:
authorDebarshi Dutta <ddutta@nvidia.com>2021-05-17 04:38:25 -0400
committermobile promotions <svcmobile_promotions@nvidia.com>2021-05-28 15:10:24 -0400
commit34993e4f7b0d47620e88ba64a6d7c67330d97e35 (patch)
tree2136284f5bd4095780884885413bb268fd318a96 /drivers/gpu/nvgpu/os/linux/sdl.c
parent5f88598b9e7b2cfe0387733577ece138a7bc912b (diff)
gpu: nvgpu: Add ECC Support for GV11B in Linux
Implement nvgpu plumbing to allow reporting ECC errors(corrected and uncorrected) to a L1SS service(if one exists). This patch includes the following 1) Added code that submits ECC error reports via the Interrupt context directly to a L1SS service in linux OS. 2) Added support for enabling/disabling the error reports via L1SS's registration/deregistration API. Nvgpu simply invokes an empty function until the registration is successful. 3) Added Spinlock to correctly handle concurrency for accessing the correct Ops for submitting requests. 4) Adds error reporting for a subset of interrupts that can be verified via external ECC injection logic. A subsequent patch will add the API for rest of the interrupts. 5) In case of critical(uncorrected errors), change nvgpu's state to quiesce state. Jira L4T-1187 Bug 200700400 Change-Id: Id31f70531fba355e94e72c4f9762593e7667a11c Signed-off-by: Debarshi Dutta <ddutta@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2530411 Tested-by: Bibek Basu <bbasu@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com> Reviewed-by: Bibek Basu <bbasu@nvidia.com> Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> GVS: Gerrit_Virtual_Submit
Diffstat (limited to 'drivers/gpu/nvgpu/os/linux/sdl.c')
-rw-r--r--drivers/gpu/nvgpu/os/linux/sdl.c341
1 files changed, 341 insertions, 0 deletions
diff --git a/drivers/gpu/nvgpu/os/linux/sdl.c b/drivers/gpu/nvgpu/os/linux/sdl.c
new file mode 100644
index 00000000..c4dccdc6
--- /dev/null
+++ b/drivers/gpu/nvgpu/os/linux/sdl.c
@@ -0,0 +1,341 @@
1/*
2 * Copyright (c) 2021, NVIDIA Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16
17#include <nvgpu/gk20a.h>
18#include <nvgpu/types.h>
19#include <nvgpu/nvgpu_err.h>
20#include <nvgpu/timers.h>
21#include <nvgpu/bug.h>
22
23#include "ecc_linux.h"
24#include "os_linux.h"
25#include "module.h"
26
27/* This look-up table initializes the list of hw units and their errors.
28 * It also specifies the error injection mechanism supported, for each error.
29 * In case of hw error injection support, this initialization will be overriden
30 * by the values provided from the hal layes of corresponding hw units.
31 */
32static struct nvgpu_err_hw_module gv11b_err_lut[] = {
33 {
34 .name = "sm",
35 .hw_unit = (u32)NVGPU_ERR_MODULE_SM,
36 .num_errs = 21U,
37 .base_ecc_service_id =
38 NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_ECC_CORRECTED,
39 .errs = (struct nvgpu_err_desc[]) {
40 GPU_NONCRITERR("l1_tag_ecc_corrected",
41 GPU_SM_L1_TAG_ECC_CORRECTED, 0, 0),
42 GPU_CRITERR("l1_tag_ecc_uncorrected",
43 GPU_SM_L1_TAG_ECC_UNCORRECTED, 0, 0),
44 GPU_NONCRITERR("cbu_ecc_corrected", 0, 0, 0),
45 GPU_CRITERR("cbu_ecc_uncorrected",
46 GPU_SM_CBU_ECC_UNCORRECTED, 0, 0),
47 GPU_NONCRITERR("lrf_ecc_corrected", 0, 0, 0),
48 GPU_CRITERR("lrf_ecc_uncorrected",
49 GPU_SM_LRF_ECC_UNCORRECTED, 0, 0),
50 GPU_NONCRITERR("l1_data_ecc_corrected", 0, 0, 0),
51 GPU_CRITERR("l1_data_ecc_uncorrected",
52 GPU_SM_L1_DATA_ECC_UNCORRECTED, 0, 0),
53 GPU_NONCRITERR("icache_l0_data_ecc_corrected", 0, 0, 0),
54 GPU_CRITERR("icache_l0_data_ecc_uncorrected",
55 GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED, 0, 0),
56 GPU_NONCRITERR("icache_l1_data_ecc_corrected", 0, 0, 0),
57 GPU_CRITERR("icache_l1_data_ecc_uncorrected",
58 GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED, 0, 0),
59 GPU_NONCRITERR("icache_l0_predecode_ecc_corrected", 0, 0, 0),
60 GPU_CRITERR("icache_l0_predecode_ecc_uncorrected",
61 GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED, 0, 0),
62 GPU_NONCRITERR("l1_tag_miss_fifo_ecc_corrected", 0, 0, 0),
63 GPU_CRITERR("l1_tag_miss_fifo_ecc_uncorrected",
64 GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED, 0, 0),
65 GPU_NONCRITERR("l1_tag_s2r_pixprf_ecc_corrected", 0, 0, 0),
66 GPU_CRITERR("l1_tag_s2r_pixprf_ecc_uncorrected",
67 GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED, 0, 0),
68 GPU_CRITERR("machine_check_error", 0, 0, 0),
69 GPU_NONCRITERR("icache_l1_predecode_ecc_corrected", 0, 0, 0),
70 GPU_CRITERR("icache_l1_predecode_ecc_uncorrected",
71 GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED, 0, 0),
72 },
73 },
74 {
75 .name = "fecs",
76 .hw_unit = (u32)NVGPU_ERR_MODULE_FECS,
77 .num_errs = 4U,
78 .base_ecc_service_id =
79 NVGUARD_SERVICE_IGPU_FECS_SWERR_FALCON_IMEM_ECC_CORRECTED,
80 .errs = (struct nvgpu_err_desc[]) {
81 GPU_NONCRITERR("falcon_imem_ecc_corrected",
82 GPU_FECS_FALCON_IMEM_ECC_CORRECTED, 0, 0),
83 GPU_CRITERR("falcon_imem_ecc_uncorrected",
84 GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED, 0, 0),
85 GPU_NONCRITERR("falcon_dmem_ecc_corrected", 0, 0, 0),
86 GPU_CRITERR("falcon_dmem_ecc_uncorrected",
87 GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED, 0, 0),
88 },
89 },
90 {
91 .name = "pmu",
92 .hw_unit = NVGPU_ERR_MODULE_PMU,
93 .num_errs = 4U,
94 .base_ecc_service_id =
95 NVGUARD_SERVICE_IGPU_PMU_SWERR_FALCON_IMEM_ECC_CORRECTED,
96 .errs = (struct nvgpu_err_desc[]) {
97 GPU_NONCRITERR("falcon_imem_ecc_corrected",
98 GPU_PMU_FALCON_IMEM_ECC_CORRECTED, 0, 0),
99 GPU_CRITERR("falcon_imem_ecc_uncorrected",
100 GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED, 0, 0),
101 GPU_NONCRITERR("falcon_dmem_ecc_corrected", 0, 0, 0),
102 GPU_CRITERR("falcon_dmem_ecc_uncorrected",
103 GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED, 0, 0),
104 },
105 },
106};
107
108static void nvgpu_init_err_msg_header(struct gpu_err_header *header)
109{
110 header->version.major = (u16)1U;
111 header->version.minor = (u16)0U;
112 header->sub_err_type = 0U;
113 header->sub_unit_id = 0UL;
114 header->address = 0UL;
115 header->timestamp_ns = 0UL;
116}
117
118static void nvgpu_init_ecc_err_msg(struct gpu_ecc_error_info *err_info)
119{
120 nvgpu_init_err_msg_header(&err_info->header);
121 err_info->err_cnt = 0UL;
122}
123
124static void nvgpu_report_ecc_error_linux(struct gk20a *g, u32 hw_unit, u32 inst,
125 u32 err_id, u64 err_addr, u64 err_count)
126{
127 int err = 0;
128 u32 s_id = 0;
129 u8 err_status = 0;
130 u8 err_info_size = 0;
131 u64 timestamp = 0ULL;
132 int err_threshold_counter = 0;
133 struct gpu_ecc_error_info err_pkt;
134 struct nvgpu_err_desc *err_desc = NULL;
135 struct nvgpu_err_hw_module *hw_module = NULL;
136 nv_guard_request_t req;
137
138 memset(&req, 0, sizeof(req));
139 nvgpu_init_ecc_err_msg(&err_pkt);
140 if (hw_unit >= sizeof(gv11b_err_lut)/sizeof(gv11b_err_lut[0])) {
141 err = -EINVAL;
142 goto done;
143 }
144
145 hw_module = &gv11b_err_lut[hw_unit];
146 if (err_id >= hw_module->num_errs) {
147 nvgpu_err(g, "invalid err_id (%u) for hw module (%u)",
148 err_id, hw_module->hw_unit);
149 err = -EINVAL;
150 goto done;
151 }
152 err_desc = &hw_module->errs[err_id];
153 timestamp = (u64)nvgpu_current_time_ns();
154
155 err_pkt.header.timestamp_ns = timestamp;
156 err_pkt.header.sub_unit_id = inst;
157 err_pkt.header.address = err_addr;
158 err_pkt.err_cnt = err_count;
159 err_info_size = sizeof(err_pkt);
160
161 s_id = hw_module->base_ecc_service_id + err_id;
162
163 if (err_desc->is_critical) {
164 err_status = NVGUARD_ERROR_DETECTED;
165 } else {
166 err_status = NVGUARD_NO_ERROR;
167 }
168
169 nvgpu_atomic_inc(&err_desc->err_count);
170 err_threshold_counter = nvgpu_atomic_cmpxchg(&err_desc->err_count,
171 err_desc->err_threshold + 1, 0);
172
173 if (unlikely(err_threshold_counter != err_desc->err_threshold + 1)) {
174 goto done;
175 }
176
177 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting hw: %s, desc:%s, count:%llu",
178 hw_module->name, err_desc->name, err_count);
179
180 req.srv_id_cmd = NVGUARD_SERVICESTATUS_NOTIFICATION;
181 req.srv_status.srv_id = (nv_guard_service_id_t)s_id;
182 req.srv_status.status = err_status;
183 req.srv_status.timestamp = timestamp;
184 req.srv_status.error_info_size = err_info_size;
185 memcpy(req.srv_status.error_info, (u8*)&err_pkt, err_info_size);
186
187 /*
188 * l1ss_submit_rq may fail due to kmalloc failures but may pass in
189 * subsequent calls
190 */
191 err = l1ss_submit_rq(&req, true);
192 if (err != 0) {
193 nvgpu_err(g, "Error returned from L1SS submit %d", err);
194 }
195
196 if (err_desc->is_critical) {
197 nvgpu_quiesce(g);
198 }
199
200done:
201 return;
202}
203
204static void nvgpu_report_ecc_error_empty(struct gk20a *g, u32 hw_unit, u32 inst,
205 u32 err_id, u64 err_addr, u64 err_count) {
206 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting empty");
207}
208
209const struct nvgpu_ecc_reporting_ops default_disabled_ecc_report_ops = {
210 .report_ecc_err = nvgpu_report_ecc_error_empty,
211};
212
213const struct nvgpu_ecc_reporting_ops ecc_enable_report_ops = {
214 .report_ecc_err = nvgpu_report_ecc_error_linux,
215};
216
217static int nvgpu_l1ss_callback(l1ss_cli_callback_param param, void *data)
218{
219 struct gk20a *g = (struct gk20a *)data;
220 struct nvgpu_os_linux *l = NULL;
221 struct nvgpu_ecc_reporting_linux *ecc_reporting_linux = NULL;
222 int err = 0;
223 /* Ensure we have a valid gk20a struct before proceeding */
224 if ((g == NULL) || (gk20a_get(g) == NULL)) {
225 return -ENODEV;
226 }
227
228 l = nvgpu_os_linux_from_gk20a(g);
229 ecc_reporting_linux = &l->ecc_reporting_linux;
230
231 nvgpu_spinlock_acquire(&ecc_reporting_linux->common.lock);
232 if (param == L1SS_READY) {
233 if (!ecc_reporting_linux->common.ecc_reporting_service_enabled) {
234 ecc_reporting_linux->common.ecc_reporting_service_enabled = true;
235 ecc_reporting_linux->common.ops = &ecc_enable_report_ops;
236 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is enabled");
237 }
238 } else if (param == L1SS_NOT_READY) {
239 if (ecc_reporting_linux->common.ecc_reporting_service_enabled) {
240 ecc_reporting_linux->common.ecc_reporting_service_enabled = false;
241 ecc_reporting_linux->common.ops = &default_disabled_ecc_report_ops;
242 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is disabled");
243 }
244 } else {
245 err = -EINVAL;
246 }
247 nvgpu_spinlock_release(&ecc_reporting_linux->common.lock);
248
249 gk20a_put(g);
250
251 return err;
252}
253
254void nvgpu_init_ecc_reporting(struct gk20a *g)
255{
256 struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
257 struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
258 int err = 0;
259 /* This will invoke the registration API */
260 nvgpu_spinlock_init(&ecc_report_linux->common.lock);
261 ecc_report_linux->priv.id = (NVGUARD_GROUPID_IGPU & NVGUARD_GROUPINDEX_FIELDMASK);
262 ecc_report_linux->priv.cli_callback = nvgpu_l1ss_callback;
263 ecc_report_linux->priv.data = g;
264 ecc_report_linux->common.ops = &default_disabled_ecc_report_ops;
265
266 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting Init");
267
268 /*
269 * err == 0 indicates service is available but not active yet.
270 * err == 1 indicates service is available and active
271 * error for other cases.
272 */
273 err = l1ss_register_client(&ecc_report_linux->priv);
274 if (err == 0) {
275 ecc_report_linux->common.ecc_reporting_service_enabled = false;
276 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting init success");
277 } else if (err == 1) {
278 ecc_report_linux->common.ecc_reporting_service_enabled = true;
279 /* Actual Ops will be replaced during nvgpu_enable_ecc_reporting
280 * called as part of gk20a_busy()
281 */
282 } else {
283 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting init failure %d", err);
284 }
285}
286
287void nvgpu_deinit_ecc_reporting(struct gk20a *g)
288{
289 struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
290 struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
291
292 if (ecc_report_linux->common.ecc_reporting_service_enabled) {
293 ecc_report_linux->common.ecc_reporting_service_enabled = false;
294 l1ss_deregister_client(ecc_report_linux->priv.id);
295 memset(ecc_report_linux, 0, sizeof(*ecc_report_linux));
296 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting de-init success");
297 }
298
299}
300
301void nvgpu_enable_ecc_reporting(struct gk20a *g)
302{
303 struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
304 struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
305 struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common;
306
307 nvgpu_spinlock_acquire(&ecc_report_linux->common.lock);
308 if (error_reporting->ecc_reporting_service_enabled) {
309 error_reporting->ops = &ecc_enable_report_ops;
310 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is enabled");
311 }
312 nvgpu_spinlock_release(&ecc_report_linux->common.lock);
313}
314
315void nvgpu_disable_ecc_reporting(struct gk20a *g)
316{
317 struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
318 struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
319 struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common;
320
321 nvgpu_spinlock_acquire(&ecc_report_linux->common.lock);
322 error_reporting->ops = &default_disabled_ecc_report_ops;
323 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is disabled");
324 nvgpu_spinlock_release(&ecc_report_linux->common.lock);
325}
326
327void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
328 u32 err_id, u64 err_addr, u64 err_count)
329{
330 struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
331 struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
332 struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common;
333 void (*report_ecc_err_func)(struct gk20a *g, u32 hw_unit, u32 inst,
334 u32 err_id, u64 err_addr, u64 err_count);
335
336 nvgpu_spinlock_acquire(&ecc_report_linux->common.lock);
337 report_ecc_err_func = error_reporting->ops->report_ecc_err;
338 nvgpu_spinlock_release(&ecc_report_linux->common.lock);
339
340 report_ecc_err_func(g, hw_unit, inst, err_id, err_addr, err_count);
341}