summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/nvgpu/os
diff options
context:
space:
mode:
authorDebarshi Dutta <ddutta@nvidia.com>2021-05-17 04:38:25 -0400
committermobile promotions <svcmobile_promotions@nvidia.com>2021-05-28 15:10:24 -0400
commit34993e4f7b0d47620e88ba64a6d7c67330d97e35 (patch)
tree2136284f5bd4095780884885413bb268fd318a96 /drivers/gpu/nvgpu/os
parent5f88598b9e7b2cfe0387733577ece138a7bc912b (diff)
gpu: nvgpu: Add ECC Support for GV11B in Linux
Implement nvgpu plumbing to allow reporting ECC errors(corrected and uncorrected) to a L1SS service(if one exists). This patch includes the following 1) Added code that submits ECC error reports via the Interrupt context directly to a L1SS service in linux OS. 2) Added support for enabling/disabling the error reports via L1SS's registration/deregistration API. Nvgpu simply invokes an empty function until the registration is successful. 3) Added Spinlock to correctly handle concurrency for accessing the correct Ops for submitting requests. 4) Adds error reporting for a subset of interrupts that can be verified via external ECC injection logic. A subsequent patch will add the API for rest of the interrupts. 5) In case of critical(uncorrected errors), change nvgpu's state to quiesce state. Jira L4T-1187 Bug 200700400 Change-Id: Id31f70531fba355e94e72c4f9762593e7667a11c Signed-off-by: Debarshi Dutta <ddutta@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2530411 Tested-by: Bibek Basu <bbasu@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com> Reviewed-by: Bibek Basu <bbasu@nvidia.com> Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> GVS: Gerrit_Virtual_Submit
Diffstat (limited to 'drivers/gpu/nvgpu/os')
-rw-r--r--drivers/gpu/nvgpu/os/linux/ecc_linux.h49
-rw-r--r--drivers/gpu/nvgpu/os/linux/module.c22
-rw-r--r--drivers/gpu/nvgpu/os/linux/os_linux.h7
-rw-r--r--drivers/gpu/nvgpu/os/linux/sdl.c341
4 files changed, 416 insertions, 3 deletions
diff --git a/drivers/gpu/nvgpu/os/linux/ecc_linux.h b/drivers/gpu/nvgpu/os/linux/ecc_linux.h
new file mode 100644
index 00000000..7e0f650b
--- /dev/null
+++ b/drivers/gpu/nvgpu/os/linux/ecc_linux.h
@@ -0,0 +1,49 @@
1/*
2 *
3 * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24#ifndef NVGPU_OS_ECC_LINUX_H
25#define NVGPU_OS_ECC_LINUX_H
26
27#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
28
29#include <linux/tegra_l1ss_kernel_interface.h>
30#include <linux/tegra_l1ss_ioctl.h>
31#include <linux/tegra_nv_guard_service_id.h>
32#include <linux/tegra_nv_guard_group_id.h>
33
34#include <nvgpu/nvgpu_err.h>
35
36struct nvgpu_ecc_reporting_linux {
37 struct nvgpu_ecc_reporting common;
38 client_param_t priv;
39};
40
41static inline struct nvgpu_ecc_reporting_linux *get_ecc_reporting_linux(
42 struct nvgpu_ecc_reporting *ecc_report)
43{
44 return container_of(ecc_report, struct nvgpu_ecc_reporting_linux, common);
45}
46
47#endif /* CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING */
48
49#endif \ No newline at end of file
diff --git a/drivers/gpu/nvgpu/os/linux/module.c b/drivers/gpu/nvgpu/os/linux/module.c
index 807df2ca..fdbab46d 100644
--- a/drivers/gpu/nvgpu/os/linux/module.c
+++ b/drivers/gpu/nvgpu/os/linux/module.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * GK20A Graphics 2 * GK20A Graphics
3 * 3 *
4 * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. 4 * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License, 7 * under the terms and conditions of the GNU General Public License,
@@ -49,6 +49,7 @@
49#include <nvgpu/clk_arb.h> 49#include <nvgpu/clk_arb.h>
50#include <nvgpu/timers.h> 50#include <nvgpu/timers.h>
51#include <nvgpu/channel.h> 51#include <nvgpu/channel.h>
52#include <nvgpu/nvgpu_err.h>
52 53
53#include "platform_gk20a.h" 54#include "platform_gk20a.h"
54#include "sysfs.h" 55#include "sysfs.h"
@@ -355,6 +356,10 @@ int gk20a_pm_finalize_poweron(struct device *dev)
355 gk20a_init_cde_support(l); 356 gk20a_init_cde_support(l);
356#endif 357#endif
357 358
359#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
360 nvgpu_enable_ecc_reporting(g);
361#endif
362
358 err = gk20a_sched_ctrl_init(g); 363 err = gk20a_sched_ctrl_init(g);
359 if (err) { 364 if (err) {
360 nvgpu_err(g, "failed to init sched control"); 365 nvgpu_err(g, "failed to init sched control");
@@ -364,9 +369,14 @@ int gk20a_pm_finalize_poweron(struct device *dev)
364 g->sw_ready = true; 369 g->sw_ready = true;
365 370
366done: 371done:
367 if (err) 372 if (err) {
368 g->power_on = false; 373 g->power_on = false;
369 374
375#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
376 nvgpu_disable_ecc_reporting(g);
377#endif
378 }
379
370 nvgpu_mutex_release(&g->power_lock); 380 nvgpu_mutex_release(&g->power_lock);
371 return err; 381 return err;
372} 382}
@@ -433,6 +443,10 @@ static int gk20a_pm_prepare_poweroff(struct device *dev)
433 /* Stop CPU from accessing the GPU registers. */ 443 /* Stop CPU from accessing the GPU registers. */
434 gk20a_lockout_registers(g); 444 gk20a_lockout_registers(g);
435 445
446#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
447 nvgpu_disable_ecc_reporting(g);
448#endif
449
436 nvgpu_hide_usermode_for_poweroff(g); 450 nvgpu_hide_usermode_for_poweroff(g);
437 nvgpu_mutex_release(&g->power_lock); 451 nvgpu_mutex_release(&g->power_lock);
438 return 0; 452 return 0;
@@ -1382,6 +1396,10 @@ static int gk20a_probe(struct platform_device *dev)
1382 goto return_err; 1396 goto return_err;
1383 } 1397 }
1384 1398
1399#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
1400 nvgpu_init_ecc_reporting(gk20a);
1401#endif
1402
1385 gk20a->nvgpu_reboot_nb.notifier_call = 1403 gk20a->nvgpu_reboot_nb.notifier_call =
1386 nvgpu_kernel_shutdown_notification; 1404 nvgpu_kernel_shutdown_notification;
1387 err = register_reboot_notifier(&gk20a->nvgpu_reboot_nb); 1405 err = register_reboot_notifier(&gk20a->nvgpu_reboot_nb);
diff --git a/drivers/gpu/nvgpu/os/linux/os_linux.h b/drivers/gpu/nvgpu/os/linux/os_linux.h
index 25c6c03a..adcfdb2f 100644
--- a/drivers/gpu/nvgpu/os/linux/os_linux.h
+++ b/drivers/gpu/nvgpu/os/linux/os_linux.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 2 * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify it 4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License, 5 * under the terms and conditions of the GNU General Public License,
@@ -25,6 +25,7 @@
25 25
26#include "cde.h" 26#include "cde.h"
27#include "sched.h" 27#include "sched.h"
28#include "ecc_linux.h"
28 29
29struct nvgpu_os_linux_ops { 30struct nvgpu_os_linux_ops {
30 struct { 31 struct {
@@ -134,6 +135,10 @@ struct nvgpu_os_linux {
134 135
135 u64 regs_bus_addr; 136 u64 regs_bus_addr;
136 137
138#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
139 struct nvgpu_ecc_reporting_linux ecc_reporting_linux;
140#endif
141
137 struct nvgpu_os_linux_ops ops; 142 struct nvgpu_os_linux_ops ops;
138 143
139#ifdef CONFIG_DEBUG_FS 144#ifdef CONFIG_DEBUG_FS
diff --git a/drivers/gpu/nvgpu/os/linux/sdl.c b/drivers/gpu/nvgpu/os/linux/sdl.c
new file mode 100644
index 00000000..c4dccdc6
--- /dev/null
+++ b/drivers/gpu/nvgpu/os/linux/sdl.c
@@ -0,0 +1,341 @@
1/*
2 * Copyright (c) 2021, NVIDIA Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16
17#include <nvgpu/gk20a.h>
18#include <nvgpu/types.h>
19#include <nvgpu/nvgpu_err.h>
20#include <nvgpu/timers.h>
21#include <nvgpu/bug.h>
22
23#include "ecc_linux.h"
24#include "os_linux.h"
25#include "module.h"
26
27/* This look-up table initializes the list of hw units and their errors.
28 * It also specifies the error injection mechanism supported, for each error.
29 * In case of hw error injection support, this initialization will be overriden
30 * by the values provided from the hal layes of corresponding hw units.
31 */
32static struct nvgpu_err_hw_module gv11b_err_lut[] = {
33 {
34 .name = "sm",
35 .hw_unit = (u32)NVGPU_ERR_MODULE_SM,
36 .num_errs = 21U,
37 .base_ecc_service_id =
38 NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_ECC_CORRECTED,
39 .errs = (struct nvgpu_err_desc[]) {
40 GPU_NONCRITERR("l1_tag_ecc_corrected",
41 GPU_SM_L1_TAG_ECC_CORRECTED, 0, 0),
42 GPU_CRITERR("l1_tag_ecc_uncorrected",
43 GPU_SM_L1_TAG_ECC_UNCORRECTED, 0, 0),
44 GPU_NONCRITERR("cbu_ecc_corrected", 0, 0, 0),
45 GPU_CRITERR("cbu_ecc_uncorrected",
46 GPU_SM_CBU_ECC_UNCORRECTED, 0, 0),
47 GPU_NONCRITERR("lrf_ecc_corrected", 0, 0, 0),
48 GPU_CRITERR("lrf_ecc_uncorrected",
49 GPU_SM_LRF_ECC_UNCORRECTED, 0, 0),
50 GPU_NONCRITERR("l1_data_ecc_corrected", 0, 0, 0),
51 GPU_CRITERR("l1_data_ecc_uncorrected",
52 GPU_SM_L1_DATA_ECC_UNCORRECTED, 0, 0),
53 GPU_NONCRITERR("icache_l0_data_ecc_corrected", 0, 0, 0),
54 GPU_CRITERR("icache_l0_data_ecc_uncorrected",
55 GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED, 0, 0),
56 GPU_NONCRITERR("icache_l1_data_ecc_corrected", 0, 0, 0),
57 GPU_CRITERR("icache_l1_data_ecc_uncorrected",
58 GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED, 0, 0),
59 GPU_NONCRITERR("icache_l0_predecode_ecc_corrected", 0, 0, 0),
60 GPU_CRITERR("icache_l0_predecode_ecc_uncorrected",
61 GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED, 0, 0),
62 GPU_NONCRITERR("l1_tag_miss_fifo_ecc_corrected", 0, 0, 0),
63 GPU_CRITERR("l1_tag_miss_fifo_ecc_uncorrected",
64 GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED, 0, 0),
65 GPU_NONCRITERR("l1_tag_s2r_pixprf_ecc_corrected", 0, 0, 0),
66 GPU_CRITERR("l1_tag_s2r_pixprf_ecc_uncorrected",
67 GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED, 0, 0),
68 GPU_CRITERR("machine_check_error", 0, 0, 0),
69 GPU_NONCRITERR("icache_l1_predecode_ecc_corrected", 0, 0, 0),
70 GPU_CRITERR("icache_l1_predecode_ecc_uncorrected",
71 GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED, 0, 0),
72 },
73 },
74 {
75 .name = "fecs",
76 .hw_unit = (u32)NVGPU_ERR_MODULE_FECS,
77 .num_errs = 4U,
78 .base_ecc_service_id =
79 NVGUARD_SERVICE_IGPU_FECS_SWERR_FALCON_IMEM_ECC_CORRECTED,
80 .errs = (struct nvgpu_err_desc[]) {
81 GPU_NONCRITERR("falcon_imem_ecc_corrected",
82 GPU_FECS_FALCON_IMEM_ECC_CORRECTED, 0, 0),
83 GPU_CRITERR("falcon_imem_ecc_uncorrected",
84 GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED, 0, 0),
85 GPU_NONCRITERR("falcon_dmem_ecc_corrected", 0, 0, 0),
86 GPU_CRITERR("falcon_dmem_ecc_uncorrected",
87 GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED, 0, 0),
88 },
89 },
90 {
91 .name = "pmu",
92 .hw_unit = NVGPU_ERR_MODULE_PMU,
93 .num_errs = 4U,
94 .base_ecc_service_id =
95 NVGUARD_SERVICE_IGPU_PMU_SWERR_FALCON_IMEM_ECC_CORRECTED,
96 .errs = (struct nvgpu_err_desc[]) {
97 GPU_NONCRITERR("falcon_imem_ecc_corrected",
98 GPU_PMU_FALCON_IMEM_ECC_CORRECTED, 0, 0),
99 GPU_CRITERR("falcon_imem_ecc_uncorrected",
100 GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED, 0, 0),
101 GPU_NONCRITERR("falcon_dmem_ecc_corrected", 0, 0, 0),
102 GPU_CRITERR("falcon_dmem_ecc_uncorrected",
103 GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED, 0, 0),
104 },
105 },
106};
107
108static void nvgpu_init_err_msg_header(struct gpu_err_header *header)
109{
110 header->version.major = (u16)1U;
111 header->version.minor = (u16)0U;
112 header->sub_err_type = 0U;
113 header->sub_unit_id = 0UL;
114 header->address = 0UL;
115 header->timestamp_ns = 0UL;
116}
117
118static void nvgpu_init_ecc_err_msg(struct gpu_ecc_error_info *err_info)
119{
120 nvgpu_init_err_msg_header(&err_info->header);
121 err_info->err_cnt = 0UL;
122}
123
124static void nvgpu_report_ecc_error_linux(struct gk20a *g, u32 hw_unit, u32 inst,
125 u32 err_id, u64 err_addr, u64 err_count)
126{
127 int err = 0;
128 u32 s_id = 0;
129 u8 err_status = 0;
130 u8 err_info_size = 0;
131 u64 timestamp = 0ULL;
132 int err_threshold_counter = 0;
133 struct gpu_ecc_error_info err_pkt;
134 struct nvgpu_err_desc *err_desc = NULL;
135 struct nvgpu_err_hw_module *hw_module = NULL;
136 nv_guard_request_t req;
137
138 memset(&req, 0, sizeof(req));
139 nvgpu_init_ecc_err_msg(&err_pkt);
140 if (hw_unit >= sizeof(gv11b_err_lut)/sizeof(gv11b_err_lut[0])) {
141 err = -EINVAL;
142 goto done;
143 }
144
145 hw_module = &gv11b_err_lut[hw_unit];
146 if (err_id >= hw_module->num_errs) {
147 nvgpu_err(g, "invalid err_id (%u) for hw module (%u)",
148 err_id, hw_module->hw_unit);
149 err = -EINVAL;
150 goto done;
151 }
152 err_desc = &hw_module->errs[err_id];
153 timestamp = (u64)nvgpu_current_time_ns();
154
155 err_pkt.header.timestamp_ns = timestamp;
156 err_pkt.header.sub_unit_id = inst;
157 err_pkt.header.address = err_addr;
158 err_pkt.err_cnt = err_count;
159 err_info_size = sizeof(err_pkt);
160
161 s_id = hw_module->base_ecc_service_id + err_id;
162
163 if (err_desc->is_critical) {
164 err_status = NVGUARD_ERROR_DETECTED;
165 } else {
166 err_status = NVGUARD_NO_ERROR;
167 }
168
169 nvgpu_atomic_inc(&err_desc->err_count);
170 err_threshold_counter = nvgpu_atomic_cmpxchg(&err_desc->err_count,
171 err_desc->err_threshold + 1, 0);
172
173 if (unlikely(err_threshold_counter != err_desc->err_threshold + 1)) {
174 goto done;
175 }
176
177 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting hw: %s, desc:%s, count:%llu",
178 hw_module->name, err_desc->name, err_count);
179
180 req.srv_id_cmd = NVGUARD_SERVICESTATUS_NOTIFICATION;
181 req.srv_status.srv_id = (nv_guard_service_id_t)s_id;
182 req.srv_status.status = err_status;
183 req.srv_status.timestamp = timestamp;
184 req.srv_status.error_info_size = err_info_size;
185 memcpy(req.srv_status.error_info, (u8*)&err_pkt, err_info_size);
186
187 /*
188 * l1ss_submit_rq may fail due to kmalloc failures but may pass in
189 * subsequent calls
190 */
191 err = l1ss_submit_rq(&req, true);
192 if (err != 0) {
193 nvgpu_err(g, "Error returned from L1SS submit %d", err);
194 }
195
196 if (err_desc->is_critical) {
197 nvgpu_quiesce(g);
198 }
199
200done:
201 return;
202}
203
204static void nvgpu_report_ecc_error_empty(struct gk20a *g, u32 hw_unit, u32 inst,
205 u32 err_id, u64 err_addr, u64 err_count) {
206 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting empty");
207}
208
209const struct nvgpu_ecc_reporting_ops default_disabled_ecc_report_ops = {
210 .report_ecc_err = nvgpu_report_ecc_error_empty,
211};
212
213const struct nvgpu_ecc_reporting_ops ecc_enable_report_ops = {
214 .report_ecc_err = nvgpu_report_ecc_error_linux,
215};
216
217static int nvgpu_l1ss_callback(l1ss_cli_callback_param param, void *data)
218{
219 struct gk20a *g = (struct gk20a *)data;
220 struct nvgpu_os_linux *l = NULL;
221 struct nvgpu_ecc_reporting_linux *ecc_reporting_linux = NULL;
222 int err = 0;
223 /* Ensure we have a valid gk20a struct before proceeding */
224 if ((g == NULL) || (gk20a_get(g) == NULL)) {
225 return -ENODEV;
226 }
227
228 l = nvgpu_os_linux_from_gk20a(g);
229 ecc_reporting_linux = &l->ecc_reporting_linux;
230
231 nvgpu_spinlock_acquire(&ecc_reporting_linux->common.lock);
232 if (param == L1SS_READY) {
233 if (!ecc_reporting_linux->common.ecc_reporting_service_enabled) {
234 ecc_reporting_linux->common.ecc_reporting_service_enabled = true;
235 ecc_reporting_linux->common.ops = &ecc_enable_report_ops;
236 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is enabled");
237 }
238 } else if (param == L1SS_NOT_READY) {
239 if (ecc_reporting_linux->common.ecc_reporting_service_enabled) {
240 ecc_reporting_linux->common.ecc_reporting_service_enabled = false;
241 ecc_reporting_linux->common.ops = &default_disabled_ecc_report_ops;
242 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is disabled");
243 }
244 } else {
245 err = -EINVAL;
246 }
247 nvgpu_spinlock_release(&ecc_reporting_linux->common.lock);
248
249 gk20a_put(g);
250
251 return err;
252}
253
254void nvgpu_init_ecc_reporting(struct gk20a *g)
255{
256 struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
257 struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
258 int err = 0;
259 /* This will invoke the registration API */
260 nvgpu_spinlock_init(&ecc_report_linux->common.lock);
261 ecc_report_linux->priv.id = (NVGUARD_GROUPID_IGPU & NVGUARD_GROUPINDEX_FIELDMASK);
262 ecc_report_linux->priv.cli_callback = nvgpu_l1ss_callback;
263 ecc_report_linux->priv.data = g;
264 ecc_report_linux->common.ops = &default_disabled_ecc_report_ops;
265
266 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting Init");
267
268 /*
269 * err == 0 indicates service is available but not active yet.
270 * err == 1 indicates service is available and active
271 * error for other cases.
272 */
273 err = l1ss_register_client(&ecc_report_linux->priv);
274 if (err == 0) {
275 ecc_report_linux->common.ecc_reporting_service_enabled = false;
276 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting init success");
277 } else if (err == 1) {
278 ecc_report_linux->common.ecc_reporting_service_enabled = true;
279 /* Actual Ops will be replaced during nvgpu_enable_ecc_reporting
280 * called as part of gk20a_busy()
281 */
282 } else {
283 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting init failure %d", err);
284 }
285}
286
287void nvgpu_deinit_ecc_reporting(struct gk20a *g)
288{
289 struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
290 struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
291
292 if (ecc_report_linux->common.ecc_reporting_service_enabled) {
293 ecc_report_linux->common.ecc_reporting_service_enabled = false;
294 l1ss_deregister_client(ecc_report_linux->priv.id);
295 memset(ecc_report_linux, 0, sizeof(*ecc_report_linux));
296 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting de-init success");
297 }
298
299}
300
301void nvgpu_enable_ecc_reporting(struct gk20a *g)
302{
303 struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
304 struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
305 struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common;
306
307 nvgpu_spinlock_acquire(&ecc_report_linux->common.lock);
308 if (error_reporting->ecc_reporting_service_enabled) {
309 error_reporting->ops = &ecc_enable_report_ops;
310 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is enabled");
311 }
312 nvgpu_spinlock_release(&ecc_report_linux->common.lock);
313}
314
315void nvgpu_disable_ecc_reporting(struct gk20a *g)
316{
317 struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
318 struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
319 struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common;
320
321 nvgpu_spinlock_acquire(&ecc_report_linux->common.lock);
322 error_reporting->ops = &default_disabled_ecc_report_ops;
323 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is disabled");
324 nvgpu_spinlock_release(&ecc_report_linux->common.lock);
325}
326
327void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
328 u32 err_id, u64 err_addr, u64 err_count)
329{
330 struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
331 struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
332 struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common;
333 void (*report_ecc_err_func)(struct gk20a *g, u32 hw_unit, u32 inst,
334 u32 err_id, u64 err_addr, u64 err_count);
335
336 nvgpu_spinlock_acquire(&ecc_report_linux->common.lock);
337 report_ecc_err_func = error_reporting->ops->report_ecc_err;
338 nvgpu_spinlock_release(&ecc_report_linux->common.lock);
339
340 report_ecc_err_func(g, hw_unit, inst, err_id, err_addr, err_count);
341}