summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--drivers/gpu/nvgpu/Kconfig7
-rw-r--r--drivers/gpu/nvgpu/Makefile2
-rw-r--r--drivers/gpu/nvgpu/gk20a/gk20a.c7
-rw-r--r--drivers/gpu/nvgpu/gv11b/gr_gv11b.c43
-rw-r--r--drivers/gpu/nvgpu/gv11b/pmu_gv11b.c15
-rw-r--r--drivers/gpu/nvgpu/include/nvgpu/bug.h20
-rw-r--r--drivers/gpu/nvgpu/include/nvgpu/log.h3
-rw-r--r--drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h359
-rw-r--r--drivers/gpu/nvgpu/os/linux/ecc_linux.h49
-rw-r--r--drivers/gpu/nvgpu/os/linux/module.c22
-rw-r--r--drivers/gpu/nvgpu/os/linux/os_linux.h7
-rw-r--r--drivers/gpu/nvgpu/os/linux/sdl.c341
12 files changed, 867 insertions, 8 deletions
diff --git a/drivers/gpu/nvgpu/Kconfig b/drivers/gpu/nvgpu/Kconfig
index 7dba61a3..07331817 100644
--- a/drivers/gpu/nvgpu/Kconfig
+++ b/drivers/gpu/nvgpu/Kconfig
@@ -143,6 +143,13 @@ config NVGPU_SUPPORT_CDE
143 help 143 help
144 Enable support for extraction of comptags for CDE. 144 Enable support for extraction of comptags for CDE.
145 145
146config NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
147 bool "Support ECC error reporting for Linux"
148 depends on TEGRA_SAFETY
149 default y
150 help
151 Enable support for ECC error reporting for Linux.
152
146config NVGPU_USE_TEGRA_ALLOC_FD 153config NVGPU_USE_TEGRA_ALLOC_FD
147 bool "Use tegra_alloc_fd() for allocating dma_buf fds for vidmem" 154 bool "Use tegra_alloc_fd() for allocating dma_buf fds for vidmem"
148 depends on GK20A && GK20A_VIDMEM 155 depends on GK20A && GK20A_VIDMEM
diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile
index 472bf32c..d5ceecb6 100644
--- a/drivers/gpu/nvgpu/Makefile
+++ b/drivers/gpu/nvgpu/Makefile
@@ -98,6 +98,8 @@ nvgpu-y += \
98 os/linux/ltc.o \ 98 os/linux/ltc.o \
99 os/linux/vpr.o 99 os/linux/vpr.o
100 100
101nvgpu-$(CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING) += os/linux/sdl.o
102
101nvgpu-$(CONFIG_GK20A_VIDMEM) += \ 103nvgpu-$(CONFIG_GK20A_VIDMEM) += \
102 os/linux/dmabuf_vidmem.o 104 os/linux/dmabuf_vidmem.o
103 105
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c
index c3068b76..1a117169 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * GK20A Graphics 2 * GK20A Graphics
3 * 3 *
4 * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. 4 * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
5 * 5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a 6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"), 7 * copy of this software and associated documentation files (the "Software"),
@@ -39,6 +39,7 @@
39#include <nvgpu/therm.h> 39#include <nvgpu/therm.h>
40#include <nvgpu/mc.h> 40#include <nvgpu/mc.h>
41#include <nvgpu/channel_sync.h> 41#include <nvgpu/channel_sync.h>
42#include <nvgpu/nvgpu_err.h>
42 43
43#include <trace/events/gk20a.h> 44#include <trace/events/gk20a.h>
44 45
@@ -525,6 +526,10 @@ static void gk20a_free_cb(struct nvgpu_ref *refcount)
525 struct gk20a *g = container_of(refcount, 526 struct gk20a *g = container_of(refcount,
526 struct gk20a, refcount); 527 struct gk20a, refcount);
527 528
529#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
530 nvgpu_deinit_ecc_reporting(g);
531#endif
532
528 nvgpu_log(g, gpu_dbg_shutdown, "Freeing GK20A struct!"); 533 nvgpu_log(g, gpu_dbg_shutdown, "Freeing GK20A struct!");
529 534
530 gk20a_ce_destroy(g); 535 gk20a_ce_destroy(g);
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
index a7a804d2..110819a9 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * GV11b GPU GR 2 * GV11b GPU GR
3 * 3 *
4 * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. 4 * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
5 * 5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a 6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"), 7 * copy of this software and associated documentation files (the "Software"),
@@ -37,6 +37,7 @@
37#include <nvgpu/bitops.h> 37#include <nvgpu/bitops.h>
38#include <nvgpu/gk20a.h> 38#include <nvgpu/gk20a.h>
39#include <nvgpu/channel.h> 39#include <nvgpu/channel.h>
40#include <nvgpu/nvgpu_err.h>
40 41
41#include "gk20a/gr_gk20a.h" 42#include "gk20a/gr_gk20a.h"
42#include "gk20a/dbg_gpu_gk20a.h" 43#include "gk20a/dbg_gpu_gk20a.h"
@@ -61,6 +62,8 @@
61#include <nvgpu/hw/gv11b/hw_pbdma_gv11b.h> 62#include <nvgpu/hw/gv11b/hw_pbdma_gv11b.h>
62#include <nvgpu/hw/gv11b/hw_perf_gv11b.h> 63#include <nvgpu/hw/gv11b/hw_perf_gv11b.h>
63 64
65#define SHIFT_8_BITS 8U
66
64#define GFXP_WFI_TIMEOUT_COUNT_IN_USEC_DEFAULT 100 67#define GFXP_WFI_TIMEOUT_COUNT_IN_USEC_DEFAULT 100
65 68
66/* ecc scrubbing will done in 1 pri read cycle,but for safety used 10 retries */ 69/* ecc scrubbing will done in 1 pri read cycle,but for safety used 10 retries */
@@ -224,6 +227,12 @@ static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc,
224 } 227 }
225 g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter += 228 g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter +=
226 l1_tag_corrected_err_count_delta; 229 l1_tag_corrected_err_count_delta;
230
231 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
232 (gpc << SHIFT_8_BITS) | tpc,
233 GPU_SM_L1_TAG_ECC_CORRECTED, 0,
234 g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter);
235
227 gk20a_writel(g, 236 gk20a_writel(g,
228 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r() + offset, 237 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r() + offset,
229 0); 238 0);
@@ -240,6 +249,12 @@ static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc,
240 } 249 }
241 g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter += 250 g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter +=
242 l1_tag_uncorrected_err_count_delta; 251 l1_tag_uncorrected_err_count_delta;
252
253 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
254 (gpc << SHIFT_8_BITS) | tpc,
255 GPU_SM_L1_TAG_ECC_UNCORRECTED, 0,
256 g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter);
257
243 gk20a_writel(g, 258 gk20a_writel(g,
244 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r() + offset, 259 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r() + offset,
245 0); 260 0);
@@ -335,6 +350,10 @@ static int gr_gv11b_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc,
335 } 350 }
336 g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter += 351 g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter +=
337 lrf_uncorrected_err_count_delta; 352 lrf_uncorrected_err_count_delta;
353 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
354 (gpc << SHIFT_8_BITS) | tpc,
355 GPU_SM_LRF_ECC_UNCORRECTED, 0,
356 g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter);
338 gk20a_writel(g, 357 gk20a_writel(g,
339 gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r() + offset, 358 gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r() + offset,
340 0); 359 0);
@@ -497,6 +516,12 @@ static int gr_gv11b_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc,
497 } 516 }
498 g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter += 517 g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter +=
499 cbu_uncorrected_err_count_delta; 518 cbu_uncorrected_err_count_delta;
519
520 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
521 (gpc << SHIFT_8_BITS) | tpc,
522 GPU_SM_CBU_ECC_UNCORRECTED,
523 0, g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter);
524
500 gk20a_writel(g, 525 gk20a_writel(g,
501 gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r() + offset, 526 gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r() + offset,
502 0); 527 0);
@@ -580,6 +605,10 @@ static int gr_gv11b_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32 tpc,
580 } 605 }
581 g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter += 606 g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter +=
582 l1_data_uncorrected_err_count_delta; 607 l1_data_uncorrected_err_count_delta;
608 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
609 (gpc << SHIFT_8_BITS) | tpc,
610 GPU_SM_L1_DATA_ECC_UNCORRECTED,
611 0, g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter);
583 gk20a_writel(g, 612 gk20a_writel(g,
584 gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r() + offset, 613 gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r() + offset,
585 0); 614 0);
@@ -2537,10 +2566,18 @@ static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr)
2537 2566
2538 if (ecc_status & 2567 if (ecc_status &
2539 gr_fecs_falcon_ecc_status_corrected_err_imem_m()) { 2568 gr_fecs_falcon_ecc_status_corrected_err_imem_m()) {
2569 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0,
2570 GPU_FECS_FALCON_IMEM_ECC_CORRECTED,
2571 ecc_addr,
2572 g->ecc.gr.fecs_ecc_corrected_err_count[0].counter);
2540 nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected"); 2573 nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected");
2541 } 2574 }
2542 if (ecc_status & 2575 if (ecc_status &
2543 gr_fecs_falcon_ecc_status_uncorrected_err_imem_m()) { 2576 gr_fecs_falcon_ecc_status_uncorrected_err_imem_m()) {
2577 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0,
2578 GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED,
2579 ecc_addr,
2580 g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter);
2544 nvgpu_log(g, gpu_dbg_intr, 2581 nvgpu_log(g, gpu_dbg_intr,
2545 "imem ecc error uncorrected"); 2582 "imem ecc error uncorrected");
2546 } 2583 }
@@ -2550,6 +2587,10 @@ static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr)
2550 } 2587 }
2551 if (ecc_status & 2588 if (ecc_status &
2552 gr_fecs_falcon_ecc_status_uncorrected_err_dmem_m()) { 2589 gr_fecs_falcon_ecc_status_uncorrected_err_dmem_m()) {
2590 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0,
2591 GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED,
2592 ecc_addr,
2593 g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter);
2553 nvgpu_log(g, gpu_dbg_intr, 2594 nvgpu_log(g, gpu_dbg_intr,
2554 "dmem ecc error uncorrected"); 2595 "dmem ecc error uncorrected");
2555 } 2596 }
diff --git a/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c b/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c
index 5e586ec2..336258a7 100644
--- a/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * GV11B PMU 2 * GV11B PMU
3 * 3 *
4 * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. 4 * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
5 * 5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a 6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"), 7 * copy of this software and associated documentation files (the "Software"),
@@ -29,6 +29,7 @@
29#include <nvgpu/io.h> 29#include <nvgpu/io.h>
30#include <nvgpu/utils.h> 30#include <nvgpu/utils.h>
31#include <nvgpu/gk20a.h> 31#include <nvgpu/gk20a.h>
32#include <nvgpu/nvgpu_err.h>
32 33
33#include "gk20a/pmu_gk20a.h" 34#include "gk20a/pmu_gk20a.h"
34#include "gp10b/pmu_gp10b.h" 35#include "gp10b/pmu_gp10b.h"
@@ -354,10 +355,18 @@ void gv11b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0)
354 "pmu ecc interrupt intr1: 0x%x", intr1); 355 "pmu ecc interrupt intr1: 0x%x", intr1);
355 356
356 if (ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) { 357 if (ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) {
358 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0,
359 GPU_PMU_FALCON_IMEM_ECC_CORRECTED,
360 ecc_addr,
361 g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter);
357 nvgpu_log(g, gpu_dbg_intr, 362 nvgpu_log(g, gpu_dbg_intr,
358 "imem ecc error corrected"); 363 "imem ecc error corrected");
359 } 364 }
360 if (ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) { 365 if (ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) {
366 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0,
367 GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED,
368 ecc_addr,
369 g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter);
361 nvgpu_log(g, gpu_dbg_intr, 370 nvgpu_log(g, gpu_dbg_intr,
362 "imem ecc error uncorrected"); 371 "imem ecc error uncorrected");
363 } 372 }
@@ -366,6 +375,10 @@ void gv11b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0)
366 "dmem ecc error corrected"); 375 "dmem ecc error corrected");
367 } 376 }
368 if (ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) { 377 if (ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) {
378 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0,
379 GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED,
380 ecc_addr,
381 g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter);
369 nvgpu_log(g, gpu_dbg_intr, 382 nvgpu_log(g, gpu_dbg_intr,
370 "dmem ecc error uncorrected"); 383 "dmem ecc error uncorrected");
371 } 384 }
diff --git a/drivers/gpu/nvgpu/include/nvgpu/bug.h b/drivers/gpu/nvgpu/include/nvgpu/bug.h
index 3d139b75..82d641bd 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/bug.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/bug.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 2 * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
3 * 3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a 4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"), 5 * copy of this software and associated documentation files (the "Software"),
@@ -24,6 +24,24 @@
24 24
25#ifdef __KERNEL__ 25#ifdef __KERNEL__
26#include <linux/bug.h> 26#include <linux/bug.h>
27/*
28 * Define an assert macro that code within nvgpu can use.
29 *
30 * The goal of this macro is for debugging but what that means varies from OS
31 * to OS. On Linux wee don't want to BUG() for general driver misbehaving. BUG()
32 * is a very heavy handed tool - in fact there's probably no where within the
33 * nvgpu core code where it makes sense to use a BUG() when running under Linux.
34 *
35 * However, on QNX (and POSIX) BUG() will just kill the current process. This
36 * means we can use it for handling bugs in nvgpu.
37 *
38 * As a result this macro varies depending on platform.
39 */
40#define nvgpu_assert(cond) ((void) WARN_ON(!(cond)))
41#define nvgpu_do_assert_print(g, fmt, arg...) \
42 do { \
43 nvgpu_err(g, fmt, ##arg); \
44 } while (false)
27#elif defined(__NVGPU_POSIX__) 45#elif defined(__NVGPU_POSIX__)
28#include <nvgpu/posix/bug.h> 46#include <nvgpu/posix/bug.h>
29#else 47#else
diff --git a/drivers/gpu/nvgpu/include/nvgpu/log.h b/drivers/gpu/nvgpu/include/nvgpu/log.h
index 70a16762..2bcca335 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/log.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/log.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 2 * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
3 * 3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a 4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"), 5 * copy of this software and associated documentation files (the "Software"),
@@ -80,6 +80,7 @@ void __nvgpu_log_dbg(struct gk20a *g, u64 log_mask,
80#define gpu_dbg_vidmem BIT(24) /* VIDMEM tracing. */ 80#define gpu_dbg_vidmem BIT(24) /* VIDMEM tracing. */
81#define gpu_dbg_nvlink BIT(25) /* nvlink Operation tracing. */ 81#define gpu_dbg_nvlink BIT(25) /* nvlink Operation tracing. */
82#define gpu_dbg_clk_arb BIT(26) /* Clk arbiter debugging. */ 82#define gpu_dbg_clk_arb BIT(26) /* Clk arbiter debugging. */
83#define gpu_dbg_ecc BIT(27) /* Print ECC Info Logs. */
83#define gpu_dbg_mem BIT(31) /* memory accesses; very verbose. */ 84#define gpu_dbg_mem BIT(31) /* memory accesses; very verbose. */
84 85
85/** 86/**
diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h
new file mode 100644
index 00000000..0595fafb
--- /dev/null
+++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h
@@ -0,0 +1,359 @@
1/*
2 * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22
23#ifndef NVGPU_NVGPU_ERR_H
24#define NVGPU_NVGPU_ERR_H
25
26/**
27 * @file
28 *
29 * Define indices for HW units and errors. Define structures used to carry error
30 * information. Declare prototype for APIs that are used to report GPU HW errors
31 * to the Safety_Services framework.
32 */
33
34#include <nvgpu/types.h>
35#include <nvgpu/atomic.h>
36
37struct gk20a;
38
39/**
40 * @defgroup INDICES_FOR_GPU_HW_UNITS
41 * Macros used to assign unique index to GPU HW units.
42 * @{
43 */
44#define NVGPU_ERR_MODULE_SM (0U)
45#define NVGPU_ERR_MODULE_FECS (1U)
46#define NVGPU_ERR_MODULE_PMU (2U)
47/**
48 * @}
49 */
50
51/**
52 * @defgroup LIST_OF_ERRORS_REPORTED_FROM_SM
53 * Macros used to assign unique index to errors reported from the SM unit.
54 * @{
55 */
56#define GPU_SM_L1_TAG_ECC_CORRECTED (0U)
57#define GPU_SM_L1_TAG_ECC_UNCORRECTED (1U)
58#define GPU_SM_CBU_ECC_UNCORRECTED (3U)
59#define GPU_SM_LRF_ECC_UNCORRECTED (5U)
60#define GPU_SM_L1_DATA_ECC_UNCORRECTED (7U)
61#define GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED (9U)
62#define GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED (11U)
63#define GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED (13U)
64#define GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED (15U)
65#define GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED (17U)
66#define GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED (20U)
67/**
68 * @}
69 */
70
71/**
72 * @defgroup LIST_OF_ERRORS_REPORTED_FROM_FECS
73 * Macros used to assign unique index to errors reported from the FECS unit.
74 * @{
75 */
76#define GPU_FECS_FALCON_IMEM_ECC_CORRECTED (0U)
77#define GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED (1U)
78#define GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED (3U)
79/**
80 * @}
81 */
82
83/**
84 * @defgroup LIST_OF_ERRORS_REPORTED_FROM_GPCCS
85 * Macros used to assign unique index to errors reported from the GPCCS unit.
86 * @{
87 */
88#define GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED (0U)
89#define GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED (1U)
90#define GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED (3U)
91/**
92 * @}
93 */
94
95/**
96 * @defgroup LIST_OF_ERRORS_REPORTED_FROM_MMU
97 * Macros used to assign unique index to errors reported from the MMU unit.
98 * @{
99 */
100#define GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED (1U)
101#define GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED (3U)
102/**
103 * @}
104 */
105
106/**
107 * @defgroup LIST_OF_ERRORS_REPORTED_FROM_GCC
108 * Macros used to assign unique index to errors reported from the GCC unit.
109 * @{
110 */
111#define GPU_GCC_L15_ECC_UNCORRECTED (1U)
112/**
113 * @}
114 */
115
116
117/**
118 * @defgroup LIST_OF_ERRORS_REPORTED_FROM_PMU
119 * Macros used to assign unique index to errors reported from the PMU unit.
120 * @{
121 */
122#define GPU_PMU_FALCON_IMEM_ECC_CORRECTED (0U)
123#define GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED (1U)
124#define GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED (3U)
125/**
126 * @}
127 */
128
129/**
130 * @defgroup LIST_OF_ERRORS_REPORTED_FROM_LTC
131 * Macros used to assign unique index to errors reported from the LTC unit.
132 * @{
133 */
134#define GPU_LTC_CACHE_DSTG_ECC_CORRECTED (0U)
135#define GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED (1U)
136#define GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED (3U)
137#define GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED (7U)
138/**
139 * @}
140 */
141
142/**
143 * @defgroup LIST_OF_ERRORS_REPORTED_FROM_HUBMMU
144 * Macros used to assign unique index to errors reported from the HUBMMU unit.
145 * @{
146 */
147#define GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED (1U)
148#define GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED (3U)
149#define GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED (5U)
150#define GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED (7U)
151#define GPU_HUBMMU_PAGE_FAULT_ERROR (8U)
152
153
154#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
155/**
156 * @}
157 */
158
159/**
160 * nvgpu_err_desc structure holds fields which describe an error along with
161 * function callback which can be used to inject the error.
162 */
163struct nvgpu_err_desc {
164 /** String representation of error. */
165 const char *name;
166
167 /** Flag to classify an error as critical or non-critical. */
168 bool is_critical;
169
170 /**
171 * Error Threshold: once this threshold value is reached, then the
172 * corresponding error counter will be reset to 0 and the error will be
173 * propagated to Safety_Services.
174 */
175 int err_threshold;
176
177 /**
178 * Total number of times an error has occurred (since its last reset).
179 */
180 nvgpu_atomic_t err_count;
181
182 /** Error ID. */
183 u8 error_id;
184};
185
186/**
187 * gpu_err_header structure holds fields which are required to identify the
188 * version of header, sub-error type, sub-unit id, error address and time stamp.
189 */
190struct gpu_err_header {
191 /** Version of GPU error header. */
192 struct {
193 /** Major version number. */
194 u16 major;
195 /** Minor version number. */
196 u16 minor;
197 } version;
198
199 /** Sub error type corresponding to the error that is being reported. */
200 u32 sub_err_type;
201
202 /** ID of the sub-unit in a HW unit which encountered an error. */
203 u64 sub_unit_id;
204
205 /** Location of the error. */
206 u64 address;
207
208 /** Timestamp in nano seconds. */
209 u64 timestamp_ns;
210};
211
212struct gpu_ecc_error_info {
213 struct gpu_err_header header;
214
215 /** Number of ECC errors. */
216 u64 err_cnt;
217};
218
219/**
220 * nvgpu_err_hw_module structure holds fields which describe the h/w modules
221 * error reporting capabilities.
222 */
223struct nvgpu_err_hw_module {
224 /** String representation of a given HW unit. */
225 const char *name;
226
227 /** HW unit ID. */
228 u32 hw_unit;
229
230 /** Total number of errors reported from a given HW unit. */
231 u32 num_errs;
232
233 u32 base_ecc_service_id;
234
235 /** Used to get error description from look-up table. */
236 struct nvgpu_err_desc *errs;
237};
238
239struct nvgpu_ecc_reporting_ops {
240 void (*report_ecc_err)(struct gk20a *g, u32 hw_unit, u32 inst,
241 u32 err_id, u64 err_addr, u64 err_count);
242};
243
244struct nvgpu_ecc_reporting {
245 struct nvgpu_spinlock lock;
246 /* This flag is protected by the above spinlock */
247 bool ecc_reporting_service_enabled;
248 const struct nvgpu_ecc_reporting_ops *ops;
249};
250
251 /**
252 * This macro is used to initialize the members of nvgpu_err_desc struct.
253 */
254#define GPU_ERR(err, critical, id, threshold, ecount) \
255{ \
256 .name = (err), \
257 .is_critical = (critical), \
258 .error_id = (id), \
259 .err_threshold = (threshold), \
260 .err_count = NVGPU_ATOMIC_INIT(ecount), \
261}
262
263/**
264 * This macro is used to initialize critical errors.
265 */
266#define GPU_CRITERR(err, id, threshold, ecount) \
267 GPU_ERR(err, true, id, threshold, ecount)
268
269/**
270 * This macro is used to initialize non-critical errors.
271 */
272#define GPU_NONCRITERR(err, id, threshold, ecount) \
273 GPU_ERR(err, false, id, threshold, ecount)
274
275/**
276 * @brief GPU HW errors need to be reported to Safety_Services via SDL unit.
277 * This function provides an interface to report ECC erros to SDL unit.
278 *
279 * @param g [in] - The GPU driver struct.
280 * @param hw_unit [in] - Index of HW unit.
281 * - List of valid HW unit IDs
282 * - NVGPU_ERR_MODULE_SM
283 * - NVGPU_ERR_MODULE_FECS
284 * - NVGPU_ERR_MODULE_GPCCS
285 * - NVGPU_ERR_MODULE_MMU
286 * - NVGPU_ERR_MODULE_GCC
287 * - NVGPU_ERR_MODULE_PMU
288 * - NVGPU_ERR_MODULE_LTC
289 * - NVGPU_ERR_MODULE_HUBMMU
290 * @param inst [in] - Instance ID.
291 * - In case of multiple instances of the same HW
292 * unit (e.g., there are multiple instances of
293 * SM), it is used to identify the instance
294 * that encountered a fault.
295 * @param err_id [in] - Error index.
296 * - For SM:
297 * - Min: GPU_SM_L1_TAG_ECC_CORRECTED
298 * - Max: GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED
299 * - For FECS:
300 * - Min: GPU_FECS_FALCON_IMEM_ECC_CORRECTED
301 * - Max: GPU_FECS_INVALID_ERROR
302 * - For GPCCS:
303 * - Min: GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED
304 * - Max: GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED
305 * - For MMU:
306 * - Min: GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED
307 * - Max: GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED
308 * - For GCC:
309 * - Min: GPU_GCC_L15_ECC_UNCORRECTED
310 * - Max: GPU_GCC_L15_ECC_UNCORRECTED
311 * - For PMU:
312 * - Min: GPU_PMU_FALCON_IMEM_ECC_CORRECTED
313 * - Max: GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED
314 * - For LTC:
315 * - Min: GPU_LTC_CACHE_DSTG_ECC_CORRECTED
316 * - Max: GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED
317 * - For HUBMMU:
318 * - Min: GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED
319 * - Max: GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED
320 * @param err_addr [in] - Error address.
321 * - This is the location at which correctable or
322 * uncorrectable error has occurred.
323 * @param err_count [in] - Error count.
324 *
325 * - Checks whether SDL is supported in the current GPU platform. If SDL is not
326 * supported, it simply returns.
327 * - Validates both \a hw_unit and \a err_id indices. In case of a failure,
328 * invokes #nvgpu_sdl_handle_report_failure() api.
329 * - Gets the current time of a clock. In case of a failure, invokes
330 * #nvgpu_sdl_handle_report_failure() api.
331 * - Gets error description from internal look-up table using \a hw_unit and
332 * \a err_id indices.
333 * - Forms error packet using details such as time-stamp, \a hw_unit, \a err_id,
334 * criticality of the error, \a inst, \a err_addr, \a err_count, error
335 * description, and size of the error packet.
336 * - Performs compile-time assert check to ensure that the size of the error
337 * packet does not exceed the maximum allowable size specified in
338 * #MAX_ERR_MSG_SIZE.
339 *
340 * @return None
341 */
342void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
343 u32 err_id, u64 err_addr, u64 err_count);
344
345void nvgpu_init_ecc_reporting(struct gk20a *g);
346void nvgpu_enable_ecc_reporting(struct gk20a *g);
347void nvgpu_disable_ecc_reporting(struct gk20a *g);
348void nvgpu_deinit_ecc_reporting(struct gk20a *g);
349
350#else
351
352static inline void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
353 u32 err_id, u64 err_addr, u64 err_count) {
354
355}
356
357#endif /* CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING */
358
359#endif /* NVGPU_NVGPU_ERR_H */ \ No newline at end of file
diff --git a/drivers/gpu/nvgpu/os/linux/ecc_linux.h b/drivers/gpu/nvgpu/os/linux/ecc_linux.h
new file mode 100644
index 00000000..7e0f650b
--- /dev/null
+++ b/drivers/gpu/nvgpu/os/linux/ecc_linux.h
@@ -0,0 +1,49 @@
1/*
2 *
3 * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24#ifndef NVGPU_OS_ECC_LINUX_H
25#define NVGPU_OS_ECC_LINUX_H
26
27#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
28
29#include <linux/tegra_l1ss_kernel_interface.h>
30#include <linux/tegra_l1ss_ioctl.h>
31#include <linux/tegra_nv_guard_service_id.h>
32#include <linux/tegra_nv_guard_group_id.h>
33
34#include <nvgpu/nvgpu_err.h>
35
36struct nvgpu_ecc_reporting_linux {
37 struct nvgpu_ecc_reporting common;
38 client_param_t priv;
39};
40
41static inline struct nvgpu_ecc_reporting_linux *get_ecc_reporting_linux(
42 struct nvgpu_ecc_reporting *ecc_report)
43{
44 return container_of(ecc_report, struct nvgpu_ecc_reporting_linux, common);
45}
46
47#endif /* CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING */
48
49#endif \ No newline at end of file
diff --git a/drivers/gpu/nvgpu/os/linux/module.c b/drivers/gpu/nvgpu/os/linux/module.c
index 807df2ca..fdbab46d 100644
--- a/drivers/gpu/nvgpu/os/linux/module.c
+++ b/drivers/gpu/nvgpu/os/linux/module.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * GK20A Graphics 2 * GK20A Graphics
3 * 3 *
4 * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. 4 * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License, 7 * under the terms and conditions of the GNU General Public License,
@@ -49,6 +49,7 @@
49#include <nvgpu/clk_arb.h> 49#include <nvgpu/clk_arb.h>
50#include <nvgpu/timers.h> 50#include <nvgpu/timers.h>
51#include <nvgpu/channel.h> 51#include <nvgpu/channel.h>
52#include <nvgpu/nvgpu_err.h>
52 53
53#include "platform_gk20a.h" 54#include "platform_gk20a.h"
54#include "sysfs.h" 55#include "sysfs.h"
@@ -355,6 +356,10 @@ int gk20a_pm_finalize_poweron(struct device *dev)
355 gk20a_init_cde_support(l); 356 gk20a_init_cde_support(l);
356#endif 357#endif
357 358
359#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
360 nvgpu_enable_ecc_reporting(g);
361#endif
362
358 err = gk20a_sched_ctrl_init(g); 363 err = gk20a_sched_ctrl_init(g);
359 if (err) { 364 if (err) {
360 nvgpu_err(g, "failed to init sched control"); 365 nvgpu_err(g, "failed to init sched control");
@@ -364,9 +369,14 @@ int gk20a_pm_finalize_poweron(struct device *dev)
364 g->sw_ready = true; 369 g->sw_ready = true;
365 370
366done: 371done:
367 if (err) 372 if (err) {
368 g->power_on = false; 373 g->power_on = false;
369 374
375#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
376 nvgpu_disable_ecc_reporting(g);
377#endif
378 }
379
370 nvgpu_mutex_release(&g->power_lock); 380 nvgpu_mutex_release(&g->power_lock);
371 return err; 381 return err;
372} 382}
@@ -433,6 +443,10 @@ static int gk20a_pm_prepare_poweroff(struct device *dev)
433 /* Stop CPU from accessing the GPU registers. */ 443 /* Stop CPU from accessing the GPU registers. */
434 gk20a_lockout_registers(g); 444 gk20a_lockout_registers(g);
435 445
446#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
447 nvgpu_disable_ecc_reporting(g);
448#endif
449
436 nvgpu_hide_usermode_for_poweroff(g); 450 nvgpu_hide_usermode_for_poweroff(g);
437 nvgpu_mutex_release(&g->power_lock); 451 nvgpu_mutex_release(&g->power_lock);
438 return 0; 452 return 0;
@@ -1382,6 +1396,10 @@ static int gk20a_probe(struct platform_device *dev)
1382 goto return_err; 1396 goto return_err;
1383 } 1397 }
1384 1398
1399#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
1400 nvgpu_init_ecc_reporting(gk20a);
1401#endif
1402
1385 gk20a->nvgpu_reboot_nb.notifier_call = 1403 gk20a->nvgpu_reboot_nb.notifier_call =
1386 nvgpu_kernel_shutdown_notification; 1404 nvgpu_kernel_shutdown_notification;
1387 err = register_reboot_notifier(&gk20a->nvgpu_reboot_nb); 1405 err = register_reboot_notifier(&gk20a->nvgpu_reboot_nb);
diff --git a/drivers/gpu/nvgpu/os/linux/os_linux.h b/drivers/gpu/nvgpu/os/linux/os_linux.h
index 25c6c03a..adcfdb2f 100644
--- a/drivers/gpu/nvgpu/os/linux/os_linux.h
+++ b/drivers/gpu/nvgpu/os/linux/os_linux.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 2 * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify it 4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License, 5 * under the terms and conditions of the GNU General Public License,
@@ -25,6 +25,7 @@
25 25
26#include "cde.h" 26#include "cde.h"
27#include "sched.h" 27#include "sched.h"
28#include "ecc_linux.h"
28 29
29struct nvgpu_os_linux_ops { 30struct nvgpu_os_linux_ops {
30 struct { 31 struct {
@@ -134,6 +135,10 @@ struct nvgpu_os_linux {
134 135
135 u64 regs_bus_addr; 136 u64 regs_bus_addr;
136 137
138#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
139 struct nvgpu_ecc_reporting_linux ecc_reporting_linux;
140#endif
141
137 struct nvgpu_os_linux_ops ops; 142 struct nvgpu_os_linux_ops ops;
138 143
139#ifdef CONFIG_DEBUG_FS 144#ifdef CONFIG_DEBUG_FS
diff --git a/drivers/gpu/nvgpu/os/linux/sdl.c b/drivers/gpu/nvgpu/os/linux/sdl.c
new file mode 100644
index 00000000..c4dccdc6
--- /dev/null
+++ b/drivers/gpu/nvgpu/os/linux/sdl.c
@@ -0,0 +1,341 @@
1/*
2 * Copyright (c) 2021, NVIDIA Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16
17#include <nvgpu/gk20a.h>
18#include <nvgpu/types.h>
19#include <nvgpu/nvgpu_err.h>
20#include <nvgpu/timers.h>
21#include <nvgpu/bug.h>
22
23#include "ecc_linux.h"
24#include "os_linux.h"
25#include "module.h"
26
27/* This look-up table initializes the list of hw units and their errors.
28 * It also specifies the error injection mechanism supported, for each error.
29 * In case of hw error injection support, this initialization will be overriden
30 * by the values provided from the hal layes of corresponding hw units.
31 */
32static struct nvgpu_err_hw_module gv11b_err_lut[] = {
33 {
34 .name = "sm",
35 .hw_unit = (u32)NVGPU_ERR_MODULE_SM,
36 .num_errs = 21U,
37 .base_ecc_service_id =
38 NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_ECC_CORRECTED,
39 .errs = (struct nvgpu_err_desc[]) {
40 GPU_NONCRITERR("l1_tag_ecc_corrected",
41 GPU_SM_L1_TAG_ECC_CORRECTED, 0, 0),
42 GPU_CRITERR("l1_tag_ecc_uncorrected",
43 GPU_SM_L1_TAG_ECC_UNCORRECTED, 0, 0),
44 GPU_NONCRITERR("cbu_ecc_corrected", 0, 0, 0),
45 GPU_CRITERR("cbu_ecc_uncorrected",
46 GPU_SM_CBU_ECC_UNCORRECTED, 0, 0),
47 GPU_NONCRITERR("lrf_ecc_corrected", 0, 0, 0),
48 GPU_CRITERR("lrf_ecc_uncorrected",
49 GPU_SM_LRF_ECC_UNCORRECTED, 0, 0),
50 GPU_NONCRITERR("l1_data_ecc_corrected", 0, 0, 0),
51 GPU_CRITERR("l1_data_ecc_uncorrected",
52 GPU_SM_L1_DATA_ECC_UNCORRECTED, 0, 0),
53 GPU_NONCRITERR("icache_l0_data_ecc_corrected", 0, 0, 0),
54 GPU_CRITERR("icache_l0_data_ecc_uncorrected",
55 GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED, 0, 0),
56 GPU_NONCRITERR("icache_l1_data_ecc_corrected", 0, 0, 0),
57 GPU_CRITERR("icache_l1_data_ecc_uncorrected",
58 GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED, 0, 0),
59 GPU_NONCRITERR("icache_l0_predecode_ecc_corrected", 0, 0, 0),
60 GPU_CRITERR("icache_l0_predecode_ecc_uncorrected",
61 GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED, 0, 0),
62 GPU_NONCRITERR("l1_tag_miss_fifo_ecc_corrected", 0, 0, 0),
63 GPU_CRITERR("l1_tag_miss_fifo_ecc_uncorrected",
64 GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED, 0, 0),
65 GPU_NONCRITERR("l1_tag_s2r_pixprf_ecc_corrected", 0, 0, 0),
66 GPU_CRITERR("l1_tag_s2r_pixprf_ecc_uncorrected",
67 GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED, 0, 0),
68 GPU_CRITERR("machine_check_error", 0, 0, 0),
69 GPU_NONCRITERR("icache_l1_predecode_ecc_corrected", 0, 0, 0),
70 GPU_CRITERR("icache_l1_predecode_ecc_uncorrected",
71 GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED, 0, 0),
72 },
73 },
74 {
75 .name = "fecs",
76 .hw_unit = (u32)NVGPU_ERR_MODULE_FECS,
77 .num_errs = 4U,
78 .base_ecc_service_id =
79 NVGUARD_SERVICE_IGPU_FECS_SWERR_FALCON_IMEM_ECC_CORRECTED,
80 .errs = (struct nvgpu_err_desc[]) {
81 GPU_NONCRITERR("falcon_imem_ecc_corrected",
82 GPU_FECS_FALCON_IMEM_ECC_CORRECTED, 0, 0),
83 GPU_CRITERR("falcon_imem_ecc_uncorrected",
84 GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED, 0, 0),
85 GPU_NONCRITERR("falcon_dmem_ecc_corrected", 0, 0, 0),
86 GPU_CRITERR("falcon_dmem_ecc_uncorrected",
87 GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED, 0, 0),
88 },
89 },
90 {
91 .name = "pmu",
92 .hw_unit = NVGPU_ERR_MODULE_PMU,
93 .num_errs = 4U,
94 .base_ecc_service_id =
95 NVGUARD_SERVICE_IGPU_PMU_SWERR_FALCON_IMEM_ECC_CORRECTED,
96 .errs = (struct nvgpu_err_desc[]) {
97 GPU_NONCRITERR("falcon_imem_ecc_corrected",
98 GPU_PMU_FALCON_IMEM_ECC_CORRECTED, 0, 0),
99 GPU_CRITERR("falcon_imem_ecc_uncorrected",
100 GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED, 0, 0),
101 GPU_NONCRITERR("falcon_dmem_ecc_corrected", 0, 0, 0),
102 GPU_CRITERR("falcon_dmem_ecc_uncorrected",
103 GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED, 0, 0),
104 },
105 },
106};
107
108static void nvgpu_init_err_msg_header(struct gpu_err_header *header)
109{
110 header->version.major = (u16)1U;
111 header->version.minor = (u16)0U;
112 header->sub_err_type = 0U;
113 header->sub_unit_id = 0UL;
114 header->address = 0UL;
115 header->timestamp_ns = 0UL;
116}
117
118static void nvgpu_init_ecc_err_msg(struct gpu_ecc_error_info *err_info)
119{
120 nvgpu_init_err_msg_header(&err_info->header);
121 err_info->err_cnt = 0UL;
122}
123
124static void nvgpu_report_ecc_error_linux(struct gk20a *g, u32 hw_unit, u32 inst,
125 u32 err_id, u64 err_addr, u64 err_count)
126{
127 int err = 0;
128 u32 s_id = 0;
129 u8 err_status = 0;
130 u8 err_info_size = 0;
131 u64 timestamp = 0ULL;
132 int err_threshold_counter = 0;
133 struct gpu_ecc_error_info err_pkt;
134 struct nvgpu_err_desc *err_desc = NULL;
135 struct nvgpu_err_hw_module *hw_module = NULL;
136 nv_guard_request_t req;
137
138 memset(&req, 0, sizeof(req));
139 nvgpu_init_ecc_err_msg(&err_pkt);
140 if (hw_unit >= sizeof(gv11b_err_lut)/sizeof(gv11b_err_lut[0])) {
141 err = -EINVAL;
142 goto done;
143 }
144
145 hw_module = &gv11b_err_lut[hw_unit];
146 if (err_id >= hw_module->num_errs) {
147 nvgpu_err(g, "invalid err_id (%u) for hw module (%u)",
148 err_id, hw_module->hw_unit);
149 err = -EINVAL;
150 goto done;
151 }
152 err_desc = &hw_module->errs[err_id];
153 timestamp = (u64)nvgpu_current_time_ns();
154
155 err_pkt.header.timestamp_ns = timestamp;
156 err_pkt.header.sub_unit_id = inst;
157 err_pkt.header.address = err_addr;
158 err_pkt.err_cnt = err_count;
159 err_info_size = sizeof(err_pkt);
160
161 s_id = hw_module->base_ecc_service_id + err_id;
162
163 if (err_desc->is_critical) {
164 err_status = NVGUARD_ERROR_DETECTED;
165 } else {
166 err_status = NVGUARD_NO_ERROR;
167 }
168
169 nvgpu_atomic_inc(&err_desc->err_count);
170 err_threshold_counter = nvgpu_atomic_cmpxchg(&err_desc->err_count,
171 err_desc->err_threshold + 1, 0);
172
173 if (unlikely(err_threshold_counter != err_desc->err_threshold + 1)) {
174 goto done;
175 }
176
177 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting hw: %s, desc:%s, count:%llu",
178 hw_module->name, err_desc->name, err_count);
179
180 req.srv_id_cmd = NVGUARD_SERVICESTATUS_NOTIFICATION;
181 req.srv_status.srv_id = (nv_guard_service_id_t)s_id;
182 req.srv_status.status = err_status;
183 req.srv_status.timestamp = timestamp;
184 req.srv_status.error_info_size = err_info_size;
185 memcpy(req.srv_status.error_info, (u8*)&err_pkt, err_info_size);
186
187 /*
188 * l1ss_submit_rq may fail due to kmalloc failures but may pass in
189 * subsequent calls
190 */
191 err = l1ss_submit_rq(&req, true);
192 if (err != 0) {
193 nvgpu_err(g, "Error returned from L1SS submit %d", err);
194 }
195
196 if (err_desc->is_critical) {
197 nvgpu_quiesce(g);
198 }
199
200done:
201 return;
202}
203
204static void nvgpu_report_ecc_error_empty(struct gk20a *g, u32 hw_unit, u32 inst,
205 u32 err_id, u64 err_addr, u64 err_count) {
206 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting empty");
207}
208
209const struct nvgpu_ecc_reporting_ops default_disabled_ecc_report_ops = {
210 .report_ecc_err = nvgpu_report_ecc_error_empty,
211};
212
213const struct nvgpu_ecc_reporting_ops ecc_enable_report_ops = {
214 .report_ecc_err = nvgpu_report_ecc_error_linux,
215};
216
217static int nvgpu_l1ss_callback(l1ss_cli_callback_param param, void *data)
218{
219 struct gk20a *g = (struct gk20a *)data;
220 struct nvgpu_os_linux *l = NULL;
221 struct nvgpu_ecc_reporting_linux *ecc_reporting_linux = NULL;
222 int err = 0;
223 /* Ensure we have a valid gk20a struct before proceeding */
224 if ((g == NULL) || (gk20a_get(g) == NULL)) {
225 return -ENODEV;
226 }
227
228 l = nvgpu_os_linux_from_gk20a(g);
229 ecc_reporting_linux = &l->ecc_reporting_linux;
230
231 nvgpu_spinlock_acquire(&ecc_reporting_linux->common.lock);
232 if (param == L1SS_READY) {
233 if (!ecc_reporting_linux->common.ecc_reporting_service_enabled) {
234 ecc_reporting_linux->common.ecc_reporting_service_enabled = true;
235 ecc_reporting_linux->common.ops = &ecc_enable_report_ops;
236 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is enabled");
237 }
238 } else if (param == L1SS_NOT_READY) {
239 if (ecc_reporting_linux->common.ecc_reporting_service_enabled) {
240 ecc_reporting_linux->common.ecc_reporting_service_enabled = false;
241 ecc_reporting_linux->common.ops = &default_disabled_ecc_report_ops;
242 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is disabled");
243 }
244 } else {
245 err = -EINVAL;
246 }
247 nvgpu_spinlock_release(&ecc_reporting_linux->common.lock);
248
249 gk20a_put(g);
250
251 return err;
252}
253
254void nvgpu_init_ecc_reporting(struct gk20a *g)
255{
256 struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
257 struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
258 int err = 0;
259 /* This will invoke the registration API */
260 nvgpu_spinlock_init(&ecc_report_linux->common.lock);
261 ecc_report_linux->priv.id = (NVGUARD_GROUPID_IGPU & NVGUARD_GROUPINDEX_FIELDMASK);
262 ecc_report_linux->priv.cli_callback = nvgpu_l1ss_callback;
263 ecc_report_linux->priv.data = g;
264 ecc_report_linux->common.ops = &default_disabled_ecc_report_ops;
265
266 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting Init");
267
268 /*
269 * err == 0 indicates service is available but not active yet.
270 * err == 1 indicates service is available and active
271 * error for other cases.
272 */
273 err = l1ss_register_client(&ecc_report_linux->priv);
274 if (err == 0) {
275 ecc_report_linux->common.ecc_reporting_service_enabled = false;
276 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting init success");
277 } else if (err == 1) {
278 ecc_report_linux->common.ecc_reporting_service_enabled = true;
279 /* Actual Ops will be replaced during nvgpu_enable_ecc_reporting
280 * called as part of gk20a_busy()
281 */
282 } else {
283 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting init failure %d", err);
284 }
285}
286
287void nvgpu_deinit_ecc_reporting(struct gk20a *g)
288{
289 struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
290 struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
291
292 if (ecc_report_linux->common.ecc_reporting_service_enabled) {
293 ecc_report_linux->common.ecc_reporting_service_enabled = false;
294 l1ss_deregister_client(ecc_report_linux->priv.id);
295 memset(ecc_report_linux, 0, sizeof(*ecc_report_linux));
296 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting de-init success");
297 }
298
299}
300
301void nvgpu_enable_ecc_reporting(struct gk20a *g)
302{
303 struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
304 struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
305 struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common;
306
307 nvgpu_spinlock_acquire(&ecc_report_linux->common.lock);
308 if (error_reporting->ecc_reporting_service_enabled) {
309 error_reporting->ops = &ecc_enable_report_ops;
310 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is enabled");
311 }
312 nvgpu_spinlock_release(&ecc_report_linux->common.lock);
313}
314
315void nvgpu_disable_ecc_reporting(struct gk20a *g)
316{
317 struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
318 struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
319 struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common;
320
321 nvgpu_spinlock_acquire(&ecc_report_linux->common.lock);
322 error_reporting->ops = &default_disabled_ecc_report_ops;
323 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is disabled");
324 nvgpu_spinlock_release(&ecc_report_linux->common.lock);
325}
326
327void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
328 u32 err_id, u64 err_addr, u64 err_count)
329{
330 struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
331 struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
332 struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common;
333 void (*report_ecc_err_func)(struct gk20a *g, u32 hw_unit, u32 inst,
334 u32 err_id, u64 err_addr, u64 err_count);
335
336 nvgpu_spinlock_acquire(&ecc_report_linux->common.lock);
337 report_ecc_err_func = error_reporting->ops->report_ecc_err;
338 nvgpu_spinlock_release(&ecc_report_linux->common.lock);
339
340 report_ecc_err_func(g, hw_unit, inst, err_id, err_addr, err_count);
341}