summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/nvgpu/os/linux/sdl.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/nvgpu/os/linux/sdl.c')
-rw-r--r--drivers/gpu/nvgpu/os/linux/sdl.c341
1 files changed, 341 insertions, 0 deletions
diff --git a/drivers/gpu/nvgpu/os/linux/sdl.c b/drivers/gpu/nvgpu/os/linux/sdl.c
new file mode 100644
index 00000000..c4dccdc6
--- /dev/null
+++ b/drivers/gpu/nvgpu/os/linux/sdl.c
@@ -0,0 +1,341 @@
1/*
2 * Copyright (c) 2021, NVIDIA Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16
17#include <nvgpu/gk20a.h>
18#include <nvgpu/types.h>
19#include <nvgpu/nvgpu_err.h>
20#include <nvgpu/timers.h>
21#include <nvgpu/bug.h>
22
23#include "ecc_linux.h"
24#include "os_linux.h"
25#include "module.h"
26
27/* This look-up table initializes the list of hw units and their errors.
28 * It also specifies the error injection mechanism supported, for each error.
29 * In case of hw error injection support, this initialization will be overriden
30 * by the values provided from the hal layes of corresponding hw units.
31 */
32static struct nvgpu_err_hw_module gv11b_err_lut[] = {
33 {
34 .name = "sm",
35 .hw_unit = (u32)NVGPU_ERR_MODULE_SM,
36 .num_errs = 21U,
37 .base_ecc_service_id =
38 NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_ECC_CORRECTED,
39 .errs = (struct nvgpu_err_desc[]) {
40 GPU_NONCRITERR("l1_tag_ecc_corrected",
41 GPU_SM_L1_TAG_ECC_CORRECTED, 0, 0),
42 GPU_CRITERR("l1_tag_ecc_uncorrected",
43 GPU_SM_L1_TAG_ECC_UNCORRECTED, 0, 0),
44 GPU_NONCRITERR("cbu_ecc_corrected", 0, 0, 0),
45 GPU_CRITERR("cbu_ecc_uncorrected",
46 GPU_SM_CBU_ECC_UNCORRECTED, 0, 0),
47 GPU_NONCRITERR("lrf_ecc_corrected", 0, 0, 0),
48 GPU_CRITERR("lrf_ecc_uncorrected",
49 GPU_SM_LRF_ECC_UNCORRECTED, 0, 0),
50 GPU_NONCRITERR("l1_data_ecc_corrected", 0, 0, 0),
51 GPU_CRITERR("l1_data_ecc_uncorrected",
52 GPU_SM_L1_DATA_ECC_UNCORRECTED, 0, 0),
53 GPU_NONCRITERR("icache_l0_data_ecc_corrected", 0, 0, 0),
54 GPU_CRITERR("icache_l0_data_ecc_uncorrected",
55 GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED, 0, 0),
56 GPU_NONCRITERR("icache_l1_data_ecc_corrected", 0, 0, 0),
57 GPU_CRITERR("icache_l1_data_ecc_uncorrected",
58 GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED, 0, 0),
59 GPU_NONCRITERR("icache_l0_predecode_ecc_corrected", 0, 0, 0),
60 GPU_CRITERR("icache_l0_predecode_ecc_uncorrected",
61 GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED, 0, 0),
62 GPU_NONCRITERR("l1_tag_miss_fifo_ecc_corrected", 0, 0, 0),
63 GPU_CRITERR("l1_tag_miss_fifo_ecc_uncorrected",
64 GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED, 0, 0),
65 GPU_NONCRITERR("l1_tag_s2r_pixprf_ecc_corrected", 0, 0, 0),
66 GPU_CRITERR("l1_tag_s2r_pixprf_ecc_uncorrected",
67 GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED, 0, 0),
68 GPU_CRITERR("machine_check_error", 0, 0, 0),
69 GPU_NONCRITERR("icache_l1_predecode_ecc_corrected", 0, 0, 0),
70 GPU_CRITERR("icache_l1_predecode_ecc_uncorrected",
71 GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED, 0, 0),
72 },
73 },
74 {
75 .name = "fecs",
76 .hw_unit = (u32)NVGPU_ERR_MODULE_FECS,
77 .num_errs = 4U,
78 .base_ecc_service_id =
79 NVGUARD_SERVICE_IGPU_FECS_SWERR_FALCON_IMEM_ECC_CORRECTED,
80 .errs = (struct nvgpu_err_desc[]) {
81 GPU_NONCRITERR("falcon_imem_ecc_corrected",
82 GPU_FECS_FALCON_IMEM_ECC_CORRECTED, 0, 0),
83 GPU_CRITERR("falcon_imem_ecc_uncorrected",
84 GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED, 0, 0),
85 GPU_NONCRITERR("falcon_dmem_ecc_corrected", 0, 0, 0),
86 GPU_CRITERR("falcon_dmem_ecc_uncorrected",
87 GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED, 0, 0),
88 },
89 },
90 {
91 .name = "pmu",
92 .hw_unit = NVGPU_ERR_MODULE_PMU,
93 .num_errs = 4U,
94 .base_ecc_service_id =
95 NVGUARD_SERVICE_IGPU_PMU_SWERR_FALCON_IMEM_ECC_CORRECTED,
96 .errs = (struct nvgpu_err_desc[]) {
97 GPU_NONCRITERR("falcon_imem_ecc_corrected",
98 GPU_PMU_FALCON_IMEM_ECC_CORRECTED, 0, 0),
99 GPU_CRITERR("falcon_imem_ecc_uncorrected",
100 GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED, 0, 0),
101 GPU_NONCRITERR("falcon_dmem_ecc_corrected", 0, 0, 0),
102 GPU_CRITERR("falcon_dmem_ecc_uncorrected",
103 GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED, 0, 0),
104 },
105 },
106};
107
108static void nvgpu_init_err_msg_header(struct gpu_err_header *header)
109{
110 header->version.major = (u16)1U;
111 header->version.minor = (u16)0U;
112 header->sub_err_type = 0U;
113 header->sub_unit_id = 0UL;
114 header->address = 0UL;
115 header->timestamp_ns = 0UL;
116}
117
118static void nvgpu_init_ecc_err_msg(struct gpu_ecc_error_info *err_info)
119{
120 nvgpu_init_err_msg_header(&err_info->header);
121 err_info->err_cnt = 0UL;
122}
123
124static void nvgpu_report_ecc_error_linux(struct gk20a *g, u32 hw_unit, u32 inst,
125 u32 err_id, u64 err_addr, u64 err_count)
126{
127 int err = 0;
128 u32 s_id = 0;
129 u8 err_status = 0;
130 u8 err_info_size = 0;
131 u64 timestamp = 0ULL;
132 int err_threshold_counter = 0;
133 struct gpu_ecc_error_info err_pkt;
134 struct nvgpu_err_desc *err_desc = NULL;
135 struct nvgpu_err_hw_module *hw_module = NULL;
136 nv_guard_request_t req;
137
138 memset(&req, 0, sizeof(req));
139 nvgpu_init_ecc_err_msg(&err_pkt);
140 if (hw_unit >= sizeof(gv11b_err_lut)/sizeof(gv11b_err_lut[0])) {
141 err = -EINVAL;
142 goto done;
143 }
144
145 hw_module = &gv11b_err_lut[hw_unit];
146 if (err_id >= hw_module->num_errs) {
147 nvgpu_err(g, "invalid err_id (%u) for hw module (%u)",
148 err_id, hw_module->hw_unit);
149 err = -EINVAL;
150 goto done;
151 }
152 err_desc = &hw_module->errs[err_id];
153 timestamp = (u64)nvgpu_current_time_ns();
154
155 err_pkt.header.timestamp_ns = timestamp;
156 err_pkt.header.sub_unit_id = inst;
157 err_pkt.header.address = err_addr;
158 err_pkt.err_cnt = err_count;
159 err_info_size = sizeof(err_pkt);
160
161 s_id = hw_module->base_ecc_service_id + err_id;
162
163 if (err_desc->is_critical) {
164 err_status = NVGUARD_ERROR_DETECTED;
165 } else {
166 err_status = NVGUARD_NO_ERROR;
167 }
168
169 nvgpu_atomic_inc(&err_desc->err_count);
170 err_threshold_counter = nvgpu_atomic_cmpxchg(&err_desc->err_count,
171 err_desc->err_threshold + 1, 0);
172
173 if (unlikely(err_threshold_counter != err_desc->err_threshold + 1)) {
174 goto done;
175 }
176
177 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting hw: %s, desc:%s, count:%llu",
178 hw_module->name, err_desc->name, err_count);
179
180 req.srv_id_cmd = NVGUARD_SERVICESTATUS_NOTIFICATION;
181 req.srv_status.srv_id = (nv_guard_service_id_t)s_id;
182 req.srv_status.status = err_status;
183 req.srv_status.timestamp = timestamp;
184 req.srv_status.error_info_size = err_info_size;
185 memcpy(req.srv_status.error_info, (u8*)&err_pkt, err_info_size);
186
187 /*
188 * l1ss_submit_rq may fail due to kmalloc failures but may pass in
189 * subsequent calls
190 */
191 err = l1ss_submit_rq(&req, true);
192 if (err != 0) {
193 nvgpu_err(g, "Error returned from L1SS submit %d", err);
194 }
195
196 if (err_desc->is_critical) {
197 nvgpu_quiesce(g);
198 }
199
200done:
201 return;
202}
203
204static void nvgpu_report_ecc_error_empty(struct gk20a *g, u32 hw_unit, u32 inst,
205 u32 err_id, u64 err_addr, u64 err_count) {
206 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting empty");
207}
208
209const struct nvgpu_ecc_reporting_ops default_disabled_ecc_report_ops = {
210 .report_ecc_err = nvgpu_report_ecc_error_empty,
211};
212
213const struct nvgpu_ecc_reporting_ops ecc_enable_report_ops = {
214 .report_ecc_err = nvgpu_report_ecc_error_linux,
215};
216
217static int nvgpu_l1ss_callback(l1ss_cli_callback_param param, void *data)
218{
219 struct gk20a *g = (struct gk20a *)data;
220 struct nvgpu_os_linux *l = NULL;
221 struct nvgpu_ecc_reporting_linux *ecc_reporting_linux = NULL;
222 int err = 0;
223 /* Ensure we have a valid gk20a struct before proceeding */
224 if ((g == NULL) || (gk20a_get(g) == NULL)) {
225 return -ENODEV;
226 }
227
228 l = nvgpu_os_linux_from_gk20a(g);
229 ecc_reporting_linux = &l->ecc_reporting_linux;
230
231 nvgpu_spinlock_acquire(&ecc_reporting_linux->common.lock);
232 if (param == L1SS_READY) {
233 if (!ecc_reporting_linux->common.ecc_reporting_service_enabled) {
234 ecc_reporting_linux->common.ecc_reporting_service_enabled = true;
235 ecc_reporting_linux->common.ops = &ecc_enable_report_ops;
236 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is enabled");
237 }
238 } else if (param == L1SS_NOT_READY) {
239 if (ecc_reporting_linux->common.ecc_reporting_service_enabled) {
240 ecc_reporting_linux->common.ecc_reporting_service_enabled = false;
241 ecc_reporting_linux->common.ops = &default_disabled_ecc_report_ops;
242 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is disabled");
243 }
244 } else {
245 err = -EINVAL;
246 }
247 nvgpu_spinlock_release(&ecc_reporting_linux->common.lock);
248
249 gk20a_put(g);
250
251 return err;
252}
253
254void nvgpu_init_ecc_reporting(struct gk20a *g)
255{
256 struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
257 struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
258 int err = 0;
259 /* This will invoke the registration API */
260 nvgpu_spinlock_init(&ecc_report_linux->common.lock);
261 ecc_report_linux->priv.id = (NVGUARD_GROUPID_IGPU & NVGUARD_GROUPINDEX_FIELDMASK);
262 ecc_report_linux->priv.cli_callback = nvgpu_l1ss_callback;
263 ecc_report_linux->priv.data = g;
264 ecc_report_linux->common.ops = &default_disabled_ecc_report_ops;
265
266 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting Init");
267
268 /*
269 * err == 0 indicates service is available but not active yet.
270 * err == 1 indicates service is available and active
271 * error for other cases.
272 */
273 err = l1ss_register_client(&ecc_report_linux->priv);
274 if (err == 0) {
275 ecc_report_linux->common.ecc_reporting_service_enabled = false;
276 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting init success");
277 } else if (err == 1) {
278 ecc_report_linux->common.ecc_reporting_service_enabled = true;
279 /* Actual Ops will be replaced during nvgpu_enable_ecc_reporting
280 * called as part of gk20a_busy()
281 */
282 } else {
283 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting init failure %d", err);
284 }
285}
286
287void nvgpu_deinit_ecc_reporting(struct gk20a *g)
288{
289 struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
290 struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
291
292 if (ecc_report_linux->common.ecc_reporting_service_enabled) {
293 ecc_report_linux->common.ecc_reporting_service_enabled = false;
294 l1ss_deregister_client(ecc_report_linux->priv.id);
295 memset(ecc_report_linux, 0, sizeof(*ecc_report_linux));
296 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting de-init success");
297 }
298
299}
300
301void nvgpu_enable_ecc_reporting(struct gk20a *g)
302{
303 struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
304 struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
305 struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common;
306
307 nvgpu_spinlock_acquire(&ecc_report_linux->common.lock);
308 if (error_reporting->ecc_reporting_service_enabled) {
309 error_reporting->ops = &ecc_enable_report_ops;
310 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is enabled");
311 }
312 nvgpu_spinlock_release(&ecc_report_linux->common.lock);
313}
314
315void nvgpu_disable_ecc_reporting(struct gk20a *g)
316{
317 struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
318 struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
319 struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common;
320
321 nvgpu_spinlock_acquire(&ecc_report_linux->common.lock);
322 error_reporting->ops = &default_disabled_ecc_report_ops;
323 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is disabled");
324 nvgpu_spinlock_release(&ecc_report_linux->common.lock);
325}
326
327void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
328 u32 err_id, u64 err_addr, u64 err_count)
329{
330 struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
331 struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
332 struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common;
333 void (*report_ecc_err_func)(struct gk20a *g, u32 hw_unit, u32 inst,
334 u32 err_id, u64 err_addr, u64 err_count);
335
336 nvgpu_spinlock_acquire(&ecc_report_linux->common.lock);
337 report_ecc_err_func = error_reporting->ops->report_ecc_err;
338 nvgpu_spinlock_release(&ecc_report_linux->common.lock);
339
340 report_ecc_err_func(g, hw_unit, inst, err_id, err_addr, err_count);
341}