summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDebarshi Dutta <ddutta@nvidia.com>2021-05-17 04:38:25 -0400
committermobile promotions <svcmobile_promotions@nvidia.com>2021-05-28 15:10:24 -0400
commit34993e4f7b0d47620e88ba64a6d7c67330d97e35 (patch)
tree2136284f5bd4095780884885413bb268fd318a96
parent5f88598b9e7b2cfe0387733577ece138a7bc912b (diff)
gpu: nvgpu: Add ECC Support for GV11B in Linux
Implement nvgpu plumbing to allow reporting ECC errors(corrected and uncorrected) to a L1SS service(if one exists). This patch includes the following 1) Added code that submits ECC error reports via the Interrupt context directly to a L1SS service in linux OS. 2) Added support for enabling/disabling the error reports via L1SS's registration/deregistration API. Nvgpu simply invokes an empty function until the registration is successful. 3) Added Spinlock to correctly handle concurrency for accessing the correct Ops for submitting requests. 4) Adds error reporting for a subset of interrupts that can be verified via external ECC injection logic. A subsequent patch will add the API for rest of the interrupts. 5) In case of critical(uncorrected errors), change nvgpu's state to quiesce state. Jira L4T-1187 Bug 200700400 Change-Id: Id31f70531fba355e94e72c4f9762593e7667a11c Signed-off-by: Debarshi Dutta <ddutta@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2530411 Tested-by: Bibek Basu <bbasu@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com> Reviewed-by: Bibek Basu <bbasu@nvidia.com> Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> GVS: Gerrit_Virtual_Submit
-rw-r--r--drivers/gpu/nvgpu/Kconfig7
-rw-r--r--drivers/gpu/nvgpu/Makefile2
-rw-r--r--drivers/gpu/nvgpu/gk20a/gk20a.c7
-rw-r--r--drivers/gpu/nvgpu/gv11b/gr_gv11b.c43
-rw-r--r--drivers/gpu/nvgpu/gv11b/pmu_gv11b.c15
-rw-r--r--drivers/gpu/nvgpu/include/nvgpu/bug.h20
-rw-r--r--drivers/gpu/nvgpu/include/nvgpu/log.h3
-rw-r--r--drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h359
-rw-r--r--drivers/gpu/nvgpu/os/linux/ecc_linux.h49
-rw-r--r--drivers/gpu/nvgpu/os/linux/module.c22
-rw-r--r--drivers/gpu/nvgpu/os/linux/os_linux.h7
-rw-r--r--drivers/gpu/nvgpu/os/linux/sdl.c341
12 files changed, 867 insertions, 8 deletions
diff --git a/drivers/gpu/nvgpu/Kconfig b/drivers/gpu/nvgpu/Kconfig
index 7dba61a3..07331817 100644
--- a/drivers/gpu/nvgpu/Kconfig
+++ b/drivers/gpu/nvgpu/Kconfig
@@ -143,6 +143,13 @@ config NVGPU_SUPPORT_CDE
143 help 143 help
144 Enable support for extraction of comptags for CDE. 144 Enable support for extraction of comptags for CDE.
145 145
146config NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
147 bool "Support ECC error reporting for Linux"
148 depends on TEGRA_SAFETY
149 default y
150 help
151 Enable support for ECC error reporting for Linux.
152
146config NVGPU_USE_TEGRA_ALLOC_FD 153config NVGPU_USE_TEGRA_ALLOC_FD
147 bool "Use tegra_alloc_fd() for allocating dma_buf fds for vidmem" 154 bool "Use tegra_alloc_fd() for allocating dma_buf fds for vidmem"
148 depends on GK20A && GK20A_VIDMEM 155 depends on GK20A && GK20A_VIDMEM
diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile
index 472bf32c..d5ceecb6 100644
--- a/drivers/gpu/nvgpu/Makefile
+++ b/drivers/gpu/nvgpu/Makefile
@@ -98,6 +98,8 @@ nvgpu-y += \
98 os/linux/ltc.o \ 98 os/linux/ltc.o \
99 os/linux/vpr.o 99 os/linux/vpr.o
100 100
101nvgpu-$(CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING) += os/linux/sdl.o
102
101nvgpu-$(CONFIG_GK20A_VIDMEM) += \ 103nvgpu-$(CONFIG_GK20A_VIDMEM) += \
102 os/linux/dmabuf_vidmem.o 104 os/linux/dmabuf_vidmem.o
103 105
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c
index c3068b76..1a117169 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * GK20A Graphics 2 * GK20A Graphics
3 * 3 *
4 * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. 4 * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
5 * 5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a 6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"), 7 * copy of this software and associated documentation files (the "Software"),
@@ -39,6 +39,7 @@
39#include <nvgpu/therm.h> 39#include <nvgpu/therm.h>
40#include <nvgpu/mc.h> 40#include <nvgpu/mc.h>
41#include <nvgpu/channel_sync.h> 41#include <nvgpu/channel_sync.h>
42#include <nvgpu/nvgpu_err.h>
42 43
43#include <trace/events/gk20a.h> 44#include <trace/events/gk20a.h>
44 45
@@ -525,6 +526,10 @@ static void gk20a_free_cb(struct nvgpu_ref *refcount)
525 struct gk20a *g = container_of(refcount, 526 struct gk20a *g = container_of(refcount,
526 struct gk20a, refcount); 527 struct gk20a, refcount);
527 528
529#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
530 nvgpu_deinit_ecc_reporting(g);
531#endif
532
528 nvgpu_log(g, gpu_dbg_shutdown, "Freeing GK20A struct!"); 533 nvgpu_log(g, gpu_dbg_shutdown, "Freeing GK20A struct!");
529 534
530 gk20a_ce_destroy(g); 535 gk20a_ce_destroy(g);
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
index a7a804d2..110819a9 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * GV11b GPU GR 2 * GV11b GPU GR
3 * 3 *
4 * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. 4 * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
5 * 5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a 6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"), 7 * copy of this software and associated documentation files (the "Software"),
@@ -37,6 +37,7 @@
37#include <nvgpu/bitops.h> 37#include <nvgpu/bitops.h>
38#include <nvgpu/gk20a.h> 38#include <nvgpu/gk20a.h>
39#include <nvgpu/channel.h> 39#include <nvgpu/channel.h>
40#include <nvgpu/nvgpu_err.h>
40 41
41#include "gk20a/gr_gk20a.h" 42#include "gk20a/gr_gk20a.h"
42#include "gk20a/dbg_gpu_gk20a.h" 43#include "gk20a/dbg_gpu_gk20a.h"
@@ -61,6 +62,8 @@
61#include <nvgpu/hw/gv11b/hw_pbdma_gv11b.h> 62#include <nvgpu/hw/gv11b/hw_pbdma_gv11b.h>
62#include <nvgpu/hw/gv11b/hw_perf_gv11b.h> 63#include <nvgpu/hw/gv11b/hw_perf_gv11b.h>
63 64
65#define SHIFT_8_BITS 8U
66
64#define GFXP_WFI_TIMEOUT_COUNT_IN_USEC_DEFAULT 100 67#define GFXP_WFI_TIMEOUT_COUNT_IN_USEC_DEFAULT 100
65 68
66/* ecc scrubbing will done in 1 pri read cycle,but for safety used 10 retries */ 69/* ecc scrubbing will done in 1 pri read cycle,but for safety used 10 retries */
@@ -224,6 +227,12 @@ static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc,
224 } 227 }
225 g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter += 228 g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter +=
226 l1_tag_corrected_err_count_delta; 229 l1_tag_corrected_err_count_delta;
230
231 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
232 (gpc << SHIFT_8_BITS) | tpc,
233 GPU_SM_L1_TAG_ECC_CORRECTED, 0,
234 g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter);
235
227 gk20a_writel(g, 236 gk20a_writel(g,
228 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r() + offset, 237 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r() + offset,
229 0); 238 0);
@@ -240,6 +249,12 @@ static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc,
240 } 249 }
241 g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter += 250 g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter +=
242 l1_tag_uncorrected_err_count_delta; 251 l1_tag_uncorrected_err_count_delta;
252
253 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
254 (gpc << SHIFT_8_BITS) | tpc,
255 GPU_SM_L1_TAG_ECC_UNCORRECTED, 0,
256 g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter);
257
243 gk20a_writel(g, 258 gk20a_writel(g,
244 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r() + offset, 259 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r() + offset,
245 0); 260 0);
@@ -335,6 +350,10 @@ static int gr_gv11b_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc,
335 } 350 }
336 g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter += 351 g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter +=
337 lrf_uncorrected_err_count_delta; 352 lrf_uncorrected_err_count_delta;
353 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
354 (gpc << SHIFT_8_BITS) | tpc,
355 GPU_SM_LRF_ECC_UNCORRECTED, 0,
356 g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter);
338 gk20a_writel(g, 357 gk20a_writel(g,
339 gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r() + offset, 358 gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r() + offset,
340 0); 359 0);
@@ -497,6 +516,12 @@ static int gr_gv11b_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc,
497 } 516 }
498 g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter += 517 g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter +=
499 cbu_uncorrected_err_count_delta; 518 cbu_uncorrected_err_count_delta;
519
520 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
521 (gpc << SHIFT_8_BITS) | tpc,
522 GPU_SM_CBU_ECC_UNCORRECTED,
523 0, g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter);
524
500 gk20a_writel(g, 525 gk20a_writel(g,
501 gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r() + offset, 526 gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r() + offset,
502 0); 527 0);
@@ -580,6 +605,10 @@ static int gr_gv11b_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32 tpc,
580 } 605 }
581 g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter += 606 g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter +=
582 l1_data_uncorrected_err_count_delta; 607 l1_data_uncorrected_err_count_delta;
608 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM,
609 (gpc << SHIFT_8_BITS) | tpc,
610 GPU_SM_L1_DATA_ECC_UNCORRECTED,
611 0, g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter);
583 gk20a_writel(g, 612 gk20a_writel(g,
584 gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r() + offset, 613 gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r() + offset,
585 0); 614 0);
@@ -2537,10 +2566,18 @@ static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr)
2537 2566
2538 if (ecc_status & 2567 if (ecc_status &
2539 gr_fecs_falcon_ecc_status_corrected_err_imem_m()) { 2568 gr_fecs_falcon_ecc_status_corrected_err_imem_m()) {
2569 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0,
2570 GPU_FECS_FALCON_IMEM_ECC_CORRECTED,
2571 ecc_addr,
2572 g->ecc.gr.fecs_ecc_corrected_err_count[0].counter);
2540 nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected"); 2573 nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected");
2541 } 2574 }
2542 if (ecc_status & 2575 if (ecc_status &
2543 gr_fecs_falcon_ecc_status_uncorrected_err_imem_m()) { 2576 gr_fecs_falcon_ecc_status_uncorrected_err_imem_m()) {
2577 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0,
2578 GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED,
2579 ecc_addr,
2580 g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter);
2544 nvgpu_log(g, gpu_dbg_intr, 2581 nvgpu_log(g, gpu_dbg_intr,
2545 "imem ecc error uncorrected"); 2582 "imem ecc error uncorrected");
2546 } 2583 }
@@ -2550,6 +2587,10 @@ static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr)
2550 } 2587 }
2551 if (ecc_status & 2588 if (ecc_status &
2552 gr_fecs_falcon_ecc_status_uncorrected_err_dmem_m()) { 2589 gr_fecs_falcon_ecc_status_uncorrected_err_dmem_m()) {
2590 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0,
2591 GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED,
2592 ecc_addr,
2593 g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter);
2553 nvgpu_log(g, gpu_dbg_intr, 2594 nvgpu_log(g, gpu_dbg_intr,
2554 "dmem ecc error uncorrected"); 2595 "dmem ecc error uncorrected");
2555 } 2596 }
diff --git a/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c b/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c
index 5e586ec2..336258a7 100644
--- a/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * GV11B PMU 2 * GV11B PMU
3 * 3 *
4 * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. 4 * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
5 * 5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a 6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"), 7 * copy of this software and associated documentation files (the "Software"),
@@ -29,6 +29,7 @@
29#include <nvgpu/io.h> 29#include <nvgpu/io.h>
30#include <nvgpu/utils.h> 30#include <nvgpu/utils.h>
31#include <nvgpu/gk20a.h> 31#include <nvgpu/gk20a.h>
32#include <nvgpu/nvgpu_err.h>
32 33
33#include "gk20a/pmu_gk20a.h" 34#include "gk20a/pmu_gk20a.h"
34#include "gp10b/pmu_gp10b.h" 35#include "gp10b/pmu_gp10b.h"
@@ -354,10 +355,18 @@ void gv11b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0)
354 "pmu ecc interrupt intr1: 0x%x", intr1); 355 "pmu ecc interrupt intr1: 0x%x", intr1);
355 356
356 if (ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) { 357 if (ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) {
358 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0,
359 GPU_PMU_FALCON_IMEM_ECC_CORRECTED,
360 ecc_addr,
361 g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter);
357 nvgpu_log(g, gpu_dbg_intr, 362 nvgpu_log(g, gpu_dbg_intr,
358 "imem ecc error corrected"); 363 "imem ecc error corrected");
359 } 364 }
360 if (ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) { 365 if (ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) {
366 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0,
367 GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED,
368 ecc_addr,
369 g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter);
361 nvgpu_log(g, gpu_dbg_intr, 370 nvgpu_log(g, gpu_dbg_intr,
362 "imem ecc error uncorrected"); 371 "imem ecc error uncorrected");
363 } 372 }
@@ -366,6 +375,10 @@ void gv11b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0)
366 "dmem ecc error corrected"); 375 "dmem ecc error corrected");
367 } 376 }
368 if (ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) { 377 if (ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) {
378 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0,
379 GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED,
380 ecc_addr,
381 g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter);
369 nvgpu_log(g, gpu_dbg_intr, 382 nvgpu_log(g, gpu_dbg_intr,
370 "dmem ecc error uncorrected"); 383 "dmem ecc error uncorrected");
371 } 384 }
diff --git a/drivers/gpu/nvgpu/include/nvgpu/bug.h b/drivers/gpu/nvgpu/include/nvgpu/bug.h
index 3d139b75..82d641bd 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/bug.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/bug.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 2 * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
3 * 3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a 4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"), 5 * copy of this software and associated documentation files (the "Software"),
@@ -24,6 +24,24 @@
24 24
25#ifdef __KERNEL__ 25#ifdef __KERNEL__
26#include <linux/bug.h> 26#include <linux/bug.h>
27/*
28 * Define an assert macro that code within nvgpu can use.
29 *
30 * The goal of this macro is for debugging but what that means varies from OS
31 * to OS. On Linux wee don't want to BUG() for general driver misbehaving. BUG()
32 * is a very heavy handed tool - in fact there's probably no where within the
33 * nvgpu core code where it makes sense to use a BUG() when running under Linux.
34 *
35 * However, on QNX (and POSIX) BUG() will just kill the current process. This
36 * means we can use it for handling bugs in nvgpu.
37 *
38 * As a result this macro varies depending on platform.
39 */
40#define nvgpu_assert(cond) ((void) WARN_ON(!(cond)))
41#define nvgpu_do_assert_print(g, fmt, arg...) \
42 do { \
43 nvgpu_err(g, fmt, ##arg); \
44 } while (false)
27#elif defined(__NVGPU_POSIX__) 45#elif defined(__NVGPU_POSIX__)
28#include <nvgpu/posix/bug.h> 46#include <nvgpu/posix/bug.h>
29#else 47#else
diff --git a/drivers/gpu/nvgpu/include/nvgpu/log.h b/drivers/gpu/nvgpu/include/nvgpu/log.h
index 70a16762..2bcca335 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/log.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/log.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 2 * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
3 * 3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a 4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"), 5 * copy of this software and associated documentation files (the "Software"),
@@ -80,6 +80,7 @@ void __nvgpu_log_dbg(struct gk20a *g, u64 log_mask,
80#define gpu_dbg_vidmem BIT(24) /* VIDMEM tracing. */ 80#define gpu_dbg_vidmem BIT(24) /* VIDMEM tracing. */
81#define gpu_dbg_nvlink BIT(25) /* nvlink Operation tracing. */ 81#define gpu_dbg_nvlink BIT(25) /* nvlink Operation tracing. */
82#define gpu_dbg_clk_arb BIT(26) /* Clk arbiter debugging. */ 82#define gpu_dbg_clk_arb BIT(26) /* Clk arbiter debugging. */
83#define gpu_dbg_ecc BIT(27) /* Print ECC Info Logs. */
83#define gpu_dbg_mem BIT(31) /* memory accesses; very verbose. */ 84#define gpu_dbg_mem BIT(31) /* memory accesses; very verbose. */
84 85
85/** 86/**
diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h
new file mode 100644
index 00000000..0595fafb
--- /dev/null
+++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h
@@ -0,0 +1,359 @@
1/*
2 * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22
23#ifndef NVGPU_NVGPU_ERR_H
24#define NVGPU_NVGPU_ERR_H
25
26/**
27 * @file
28 *
29 * Define indices for HW units and errors. Define structures used to carry error
30 * information. Declare prototype for APIs that are used to report GPU HW errors
31 * to the Safety_Services framework.
32 */
33
34#include <nvgpu/types.h>
35#include <nvgpu/atomic.h>
36
37struct gk20a;
38
39/**
40 * @defgroup INDICES_FOR_GPU_HW_UNITS
41 * Macros used to assign unique index to GPU HW units.
42 * @{
43 */
44#define NVGPU_ERR_MODULE_SM (0U)
45#define NVGPU_ERR_MODULE_FECS (1U)
46#define NVGPU_ERR_MODULE_PMU (2U)
47/**
48 * @}
49 */
50
51/**
52 * @defgroup LIST_OF_ERRORS_REPORTED_FROM_SM
53 * Macros used to assign unique index to errors reported from the SM unit.
54 * @{
55 */
56#define GPU_SM_L1_TAG_ECC_CORRECTED (0U)
57#define GPU_SM_L1_TAG_ECC_UNCORRECTED (1U)
58#define GPU_SM_CBU_ECC_UNCORRECTED (3U)
59#define GPU_SM_LRF_ECC_UNCORRECTED (5U)
60#define GPU_SM_L1_DATA_ECC_UNCORRECTED (7U)
61#define GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED (9U)
62#define GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED (11U)
63#define GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED (13U)
64#define GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED (15U)
65#define GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED (17U)
66#define GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED (20U)
67/**
68 * @}
69 */
70
71/**
72 * @defgroup LIST_OF_ERRORS_REPORTED_FROM_FECS
73 * Macros used to assign unique index to errors reported from the FECS unit.
74 * @{
75 */
76#define GPU_FECS_FALCON_IMEM_ECC_CORRECTED (0U)
77#define GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED (1U)
78#define GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED (3U)
79/**
80 * @}
81 */
82
83/**
84 * @defgroup LIST_OF_ERRORS_REPORTED_FROM_GPCCS
85 * Macros used to assign unique index to errors reported from the GPCCS unit.
86 * @{
87 */
88#define GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED (0U)
89#define GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED (1U)
90#define GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED (3U)
91/**
92 * @}
93 */
94
95/**
96 * @defgroup LIST_OF_ERRORS_REPORTED_FROM_MMU
97 * Macros used to assign unique index to errors reported from the MMU unit.
98 * @{
99 */
100#define GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED (1U)
101#define GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED (3U)
102/**
103 * @}
104 */
105
106/**
107 * @defgroup LIST_OF_ERRORS_REPORTED_FROM_GCC
108 * Macros used to assign unique index to errors reported from the GCC unit.
109 * @{
110 */
111#define GPU_GCC_L15_ECC_UNCORRECTED (1U)
112/**
113 * @}
114 */
115
116
117/**
118 * @defgroup LIST_OF_ERRORS_REPORTED_FROM_PMU
119 * Macros used to assign unique index to errors reported from the PMU unit.
120 * @{
121 */
122#define GPU_PMU_FALCON_IMEM_ECC_CORRECTED (0U)
123#define GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED (1U)
124#define GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED (3U)
125/**
126 * @}
127 */
128
129/**
130 * @defgroup LIST_OF_ERRORS_REPORTED_FROM_LTC
131 * Macros used to assign unique index to errors reported from the LTC unit.
132 * @{
133 */
134#define GPU_LTC_CACHE_DSTG_ECC_CORRECTED (0U)
135#define GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED (1U)
136#define GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED (3U)
137#define GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED (7U)
138/**
139 * @}
140 */
141
142/**
143 * @defgroup LIST_OF_ERRORS_REPORTED_FROM_HUBMMU
144 * Macros used to assign unique index to errors reported from the HUBMMU unit.
145 * @{
146 */
147#define GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED (1U)
148#define GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED (3U)
149#define GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED (5U)
150#define GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED (7U)
151#define GPU_HUBMMU_PAGE_FAULT_ERROR (8U)
152
153
154#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
155/**
156 * @}
157 */
158
159/**
160 * nvgpu_err_desc structure holds fields which describe an error along with
161 * function callback which can be used to inject the error.
162 */
163struct nvgpu_err_desc {
164 /** String representation of error. */
165 const char *name;
166
167 /** Flag to classify an error as critical or non-critical. */
168 bool is_critical;
169
170 /**
171 * Error Threshold: once this threshold value is reached, then the
172 * corresponding error counter will be reset to 0 and the error will be
173 * propagated to Safety_Services.
174 */
175 int err_threshold;
176
177 /**
178 * Total number of times an error has occurred (since its last reset).
179 */
180 nvgpu_atomic_t err_count;
181
182 /** Error ID. */
183 u8 error_id;
184};
185
186/**
187 * gpu_err_header structure holds fields which are required to identify the
188 * version of header, sub-error type, sub-unit id, error address and time stamp.
189 */
190struct gpu_err_header {
191 /** Version of GPU error header. */
192 struct {
193 /** Major version number. */
194 u16 major;
195 /** Minor version number. */
196 u16 minor;
197 } version;
198
199 /** Sub error type corresponding to the error that is being reported. */
200 u32 sub_err_type;
201
202 /** ID of the sub-unit in a HW unit which encountered an error. */
203 u64 sub_unit_id;
204
205 /** Location of the error. */
206 u64 address;
207
208 /** Timestamp in nano seconds. */
209 u64 timestamp_ns;
210};
211
212struct gpu_ecc_error_info {
213 struct gpu_err_header header;
214
215 /** Number of ECC errors. */
216 u64 err_cnt;
217};
218
219/**
220 * nvgpu_err_hw_module structure holds fields which describe the h/w modules
221 * error reporting capabilities.
222 */
223struct nvgpu_err_hw_module {
224 /** String representation of a given HW unit. */
225 const char *name;
226
227 /** HW unit ID. */
228 u32 hw_unit;
229
230 /** Total number of errors reported from a given HW unit. */
231 u32 num_errs;
232
233 u32 base_ecc_service_id;
234
235 /** Used to get error description from look-up table. */
236 struct nvgpu_err_desc *errs;
237};
238
239struct nvgpu_ecc_reporting_ops {
240 void (*report_ecc_err)(struct gk20a *g, u32 hw_unit, u32 inst,
241 u32 err_id, u64 err_addr, u64 err_count);
242};
243
244struct nvgpu_ecc_reporting {
245 struct nvgpu_spinlock lock;
246 /* This flag is protected by the above spinlock */
247 bool ecc_reporting_service_enabled;
248 const struct nvgpu_ecc_reporting_ops *ops;
249};
250
251 /**
252 * This macro is used to initialize the members of nvgpu_err_desc struct.
253 */
254#define GPU_ERR(err, critical, id, threshold, ecount) \
255{ \
256 .name = (err), \
257 .is_critical = (critical), \
258 .error_id = (id), \
259 .err_threshold = (threshold), \
260 .err_count = NVGPU_ATOMIC_INIT(ecount), \
261}
262
263/**
264 * This macro is used to initialize critical errors.
265 */
266#define GPU_CRITERR(err, id, threshold, ecount) \
267 GPU_ERR(err, true, id, threshold, ecount)
268
269/**
270 * This macro is used to initialize non-critical errors.
271 */
272#define GPU_NONCRITERR(err, id, threshold, ecount) \
273 GPU_ERR(err, false, id, threshold, ecount)
274
275/**
276 * @brief GPU HW errors need to be reported to Safety_Services via SDL unit.
277 * This function provides an interface to report ECC erros to SDL unit.
278 *
279 * @param g [in] - The GPU driver struct.
280 * @param hw_unit [in] - Index of HW unit.
281 * - List of valid HW unit IDs
282 * - NVGPU_ERR_MODULE_SM
283 * - NVGPU_ERR_MODULE_FECS
284 * - NVGPU_ERR_MODULE_GPCCS
285 * - NVGPU_ERR_MODULE_MMU
286 * - NVGPU_ERR_MODULE_GCC
287 * - NVGPU_ERR_MODULE_PMU
288 * - NVGPU_ERR_MODULE_LTC
289 * - NVGPU_ERR_MODULE_HUBMMU
290 * @param inst [in] - Instance ID.
291 * - In case of multiple instances of the same HW
292 * unit (e.g., there are multiple instances of
293 * SM), it is used to identify the instance
294 * that encountered a fault.
295 * @param err_id [in] - Error index.
296 * - For SM:
297 * - Min: GPU_SM_L1_TAG_ECC_CORRECTED
298 * - Max: GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED
299 * - For FECS:
300 * - Min: GPU_FECS_FALCON_IMEM_ECC_CORRECTED
301 * - Max: GPU_FECS_INVALID_ERROR
302 * - For GPCCS:
303 * - Min: GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED
304 * - Max: GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED
305 * - For MMU:
306 * - Min: GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED
307 * - Max: GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED
308 * - For GCC:
309 * - Min: GPU_GCC_L15_ECC_UNCORRECTED
310 * - Max: GPU_GCC_L15_ECC_UNCORRECTED
311 * - For PMU:
312 * - Min: GPU_PMU_FALCON_IMEM_ECC_CORRECTED
313 * - Max: GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED
314 * - For LTC:
315 * - Min: GPU_LTC_CACHE_DSTG_ECC_CORRECTED
316 * - Max: GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED
317 * - For HUBMMU:
318 * - Min: GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED
319 * - Max: GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED
320 * @param err_addr [in] - Error address.
321 * - This is the location at which correctable or
322 * uncorrectable error has occurred.
323 * @param err_count [in] - Error count.
324 *
325 * - Checks whether SDL is supported in the current GPU platform. If SDL is not
326 * supported, it simply returns.
327 * - Validates both \a hw_unit and \a err_id indices. In case of a failure,
328 * invokes #nvgpu_sdl_handle_report_failure() api.
329 * - Gets the current time of a clock. In case of a failure, invokes
330 * #nvgpu_sdl_handle_report_failure() api.
331 * - Gets error description from internal look-up table using \a hw_unit and
332 * \a err_id indices.
333 * - Forms error packet using details such as time-stamp, \a hw_unit, \a err_id,
334 * criticality of the error, \a inst, \a err_addr, \a err_count, error
335 * description, and size of the error packet.
336 * - Performs compile-time assert check to ensure that the size of the error
337 * packet does not exceed the maximum allowable size specified in
338 * #MAX_ERR_MSG_SIZE.
339 *
340 * @return None
341 */
342void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
343 u32 err_id, u64 err_addr, u64 err_count);
344
345void nvgpu_init_ecc_reporting(struct gk20a *g);
346void nvgpu_enable_ecc_reporting(struct gk20a *g);
347void nvgpu_disable_ecc_reporting(struct gk20a *g);
348void nvgpu_deinit_ecc_reporting(struct gk20a *g);
349
350#else
351
352static inline void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
353 u32 err_id, u64 err_addr, u64 err_count) {
354
355}
356
357#endif /* CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING */
358
359#endif /* NVGPU_NVGPU_ERR_H */ \ No newline at end of file
diff --git a/drivers/gpu/nvgpu/os/linux/ecc_linux.h b/drivers/gpu/nvgpu/os/linux/ecc_linux.h
new file mode 100644
index 00000000..7e0f650b
--- /dev/null
+++ b/drivers/gpu/nvgpu/os/linux/ecc_linux.h
@@ -0,0 +1,49 @@
1/*
2 *
3 * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24#ifndef NVGPU_OS_ECC_LINUX_H
25#define NVGPU_OS_ECC_LINUX_H
26
27#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
28
29#include <linux/tegra_l1ss_kernel_interface.h>
30#include <linux/tegra_l1ss_ioctl.h>
31#include <linux/tegra_nv_guard_service_id.h>
32#include <linux/tegra_nv_guard_group_id.h>
33
34#include <nvgpu/nvgpu_err.h>
35
36struct nvgpu_ecc_reporting_linux {
37 struct nvgpu_ecc_reporting common;
38 client_param_t priv;
39};
40
41static inline struct nvgpu_ecc_reporting_linux *get_ecc_reporting_linux(
42 struct nvgpu_ecc_reporting *ecc_report)
43{
44 return container_of(ecc_report, struct nvgpu_ecc_reporting_linux, common);
45}
46
47#endif /* CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING */
48
49#endif \ No newline at end of file
diff --git a/drivers/gpu/nvgpu/os/linux/module.c b/drivers/gpu/nvgpu/os/linux/module.c
index 807df2ca..fdbab46d 100644
--- a/drivers/gpu/nvgpu/os/linux/module.c
+++ b/drivers/gpu/nvgpu/os/linux/module.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * GK20A Graphics 2 * GK20A Graphics
3 * 3 *
4 * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. 4 * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License, 7 * under the terms and conditions of the GNU General Public License,
@@ -49,6 +49,7 @@
49#include <nvgpu/clk_arb.h> 49#include <nvgpu/clk_arb.h>
50#include <nvgpu/timers.h> 50#include <nvgpu/timers.h>
51#include <nvgpu/channel.h> 51#include <nvgpu/channel.h>
52#include <nvgpu/nvgpu_err.h>
52 53
53#include "platform_gk20a.h" 54#include "platform_gk20a.h"
54#include "sysfs.h" 55#include "sysfs.h"
@@ -355,6 +356,10 @@ int gk20a_pm_finalize_poweron(struct device *dev)
355 gk20a_init_cde_support(l); 356 gk20a_init_cde_support(l);
356#endif 357#endif
357 358
359#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
360 nvgpu_enable_ecc_reporting(g);
361#endif
362
358 err = gk20a_sched_ctrl_init(g); 363 err = gk20a_sched_ctrl_init(g);
359 if (err) { 364 if (err) {
360 nvgpu_err(g, "failed to init sched control"); 365 nvgpu_err(g, "failed to init sched control");
@@ -364,9 +369,14 @@ int gk20a_pm_finalize_poweron(struct device *dev)
364 g->sw_ready = true; 369 g->sw_ready = true;
365 370
366done: 371done:
367 if (err) 372 if (err) {
368 g->power_on = false; 373 g->power_on = false;
369 374
375#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
376 nvgpu_disable_ecc_reporting(g);
377#endif
378 }
379
370 nvgpu_mutex_release(&g->power_lock); 380 nvgpu_mutex_release(&g->power_lock);
371 return err; 381 return err;
372} 382}
@@ -433,6 +443,10 @@ static int gk20a_pm_prepare_poweroff(struct device *dev)
433 /* Stop CPU from accessing the GPU registers. */ 443 /* Stop CPU from accessing the GPU registers. */
434 gk20a_lockout_registers(g); 444 gk20a_lockout_registers(g);
435 445
446#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
447 nvgpu_disable_ecc_reporting(g);
448#endif
449
436 nvgpu_hide_usermode_for_poweroff(g); 450 nvgpu_hide_usermode_for_poweroff(g);
437 nvgpu_mutex_release(&g->power_lock); 451 nvgpu_mutex_release(&g->power_lock);
438 return 0; 452 return 0;
@@ -1382,6 +1396,10 @@ static int gk20a_probe(struct platform_device *dev)
1382 goto return_err; 1396 goto return_err;
1383 } 1397 }
1384 1398
1399#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
1400 nvgpu_init_ecc_reporting(gk20a);
1401#endif
1402
1385 gk20a->nvgpu_reboot_nb.notifier_call = 1403 gk20a->nvgpu_reboot_nb.notifier_call =
1386 nvgpu_kernel_shutdown_notification; 1404 nvgpu_kernel_shutdown_notification;
1387 err = register_reboot_notifier(&gk20a->nvgpu_reboot_nb); 1405 err = register_reboot_notifier(&gk20a->nvgpu_reboot_nb);
diff --git a/drivers/gpu/nvgpu/os/linux/os_linux.h b/drivers/gpu/nvgpu/os/linux/os_linux.h
index 25c6c03a..adcfdb2f 100644
--- a/drivers/gpu/nvgpu/os/linux/os_linux.h
+++ b/drivers/gpu/nvgpu/os/linux/os_linux.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 2 * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify it 4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License, 5 * under the terms and conditions of the GNU General Public License,
@@ -25,6 +25,7 @@
25 25
26#include "cde.h" 26#include "cde.h"
27#include "sched.h" 27#include "sched.h"
28#include "ecc_linux.h"
28 29
29struct nvgpu_os_linux_ops { 30struct nvgpu_os_linux_ops {
30 struct { 31 struct {
@@ -134,6 +135,10 @@ struct nvgpu_os_linux {
134 135
135 u64 regs_bus_addr; 136 u64 regs_bus_addr;
136 137
138#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
139 struct nvgpu_ecc_reporting_linux ecc_reporting_linux;
140#endif
141
137 struct nvgpu_os_linux_ops ops; 142 struct nvgpu_os_linux_ops ops;
138 143
139#ifdef CONFIG_DEBUG_FS 144#ifdef CONFIG_DEBUG_FS
diff --git a/drivers/gpu/nvgpu/os/linux/sdl.c b/drivers/gpu/nvgpu/os/linux/sdl.c
new file mode 100644
index 00000000..c4dccdc6
--- /dev/null
+++ b/drivers/gpu/nvgpu/os/linux/sdl.c
@@ -0,0 +1,341 @@
1/*
2 * Copyright (c) 2021, NVIDIA Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16
17#include <nvgpu/gk20a.h>
18#include <nvgpu/types.h>
19#include <nvgpu/nvgpu_err.h>
20#include <nvgpu/timers.h>
21#include <nvgpu/bug.h>
22
23#include "ecc_linux.h"
24#include "os_linux.h"
25#include "module.h"
26
27/* This look-up table initializes the list of hw units and their errors.
28 * It also specifies the error injection mechanism supported, for each error.
29 * In case of hw error injection support, this initialization will be overriden
30 * by the values provided from the hal layes of corresponding hw units.
31 */
32static struct nvgpu_err_hw_module gv11b_err_lut[] = {
33 {
34 .name = "sm",
35 .hw_unit = (u32)NVGPU_ERR_MODULE_SM,
36 .num_errs = 21U,
37 .base_ecc_service_id =
38 NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_ECC_CORRECTED,
39 .errs = (struct nvgpu_err_desc[]) {
40 GPU_NONCRITERR("l1_tag_ecc_corrected",
41 GPU_SM_L1_TAG_ECC_CORRECTED, 0, 0),
42 GPU_CRITERR("l1_tag_ecc_uncorrected",
43 GPU_SM_L1_TAG_ECC_UNCORRECTED, 0, 0),
44 GPU_NONCRITERR("cbu_ecc_corrected", 0, 0, 0),
45 GPU_CRITERR("cbu_ecc_uncorrected",
46 GPU_SM_CBU_ECC_UNCORRECTED, 0, 0),
47 GPU_NONCRITERR("lrf_ecc_corrected", 0, 0, 0),
48 GPU_CRITERR("lrf_ecc_uncorrected",
49 GPU_SM_LRF_ECC_UNCORRECTED, 0, 0),
50 GPU_NONCRITERR("l1_data_ecc_corrected", 0, 0, 0),
51 GPU_CRITERR("l1_data_ecc_uncorrected",
52 GPU_SM_L1_DATA_ECC_UNCORRECTED, 0, 0),
53 GPU_NONCRITERR("icache_l0_data_ecc_corrected", 0, 0, 0),
54 GPU_CRITERR("icache_l0_data_ecc_uncorrected",
55 GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED, 0, 0),
56 GPU_NONCRITERR("icache_l1_data_ecc_corrected", 0, 0, 0),
57 GPU_CRITERR("icache_l1_data_ecc_uncorrected",
58 GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED, 0, 0),
59 GPU_NONCRITERR("icache_l0_predecode_ecc_corrected", 0, 0, 0),
60 GPU_CRITERR("icache_l0_predecode_ecc_uncorrected",
61 GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED, 0, 0),
62 GPU_NONCRITERR("l1_tag_miss_fifo_ecc_corrected", 0, 0, 0),
63 GPU_CRITERR("l1_tag_miss_fifo_ecc_uncorrected",
64 GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED, 0, 0),
65 GPU_NONCRITERR("l1_tag_s2r_pixprf_ecc_corrected", 0, 0, 0),
66 GPU_CRITERR("l1_tag_s2r_pixprf_ecc_uncorrected",
67 GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED, 0, 0),
68 GPU_CRITERR("machine_check_error", 0, 0, 0),
69 GPU_NONCRITERR("icache_l1_predecode_ecc_corrected", 0, 0, 0),
70 GPU_CRITERR("icache_l1_predecode_ecc_uncorrected",
71 GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED, 0, 0),
72 },
73 },
74 {
75 .name = "fecs",
76 .hw_unit = (u32)NVGPU_ERR_MODULE_FECS,
77 .num_errs = 4U,
78 .base_ecc_service_id =
79 NVGUARD_SERVICE_IGPU_FECS_SWERR_FALCON_IMEM_ECC_CORRECTED,
80 .errs = (struct nvgpu_err_desc[]) {
81 GPU_NONCRITERR("falcon_imem_ecc_corrected",
82 GPU_FECS_FALCON_IMEM_ECC_CORRECTED, 0, 0),
83 GPU_CRITERR("falcon_imem_ecc_uncorrected",
84 GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED, 0, 0),
85 GPU_NONCRITERR("falcon_dmem_ecc_corrected", 0, 0, 0),
86 GPU_CRITERR("falcon_dmem_ecc_uncorrected",
87 GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED, 0, 0),
88 },
89 },
90 {
91 .name = "pmu",
92 .hw_unit = NVGPU_ERR_MODULE_PMU,
93 .num_errs = 4U,
94 .base_ecc_service_id =
95 NVGUARD_SERVICE_IGPU_PMU_SWERR_FALCON_IMEM_ECC_CORRECTED,
96 .errs = (struct nvgpu_err_desc[]) {
97 GPU_NONCRITERR("falcon_imem_ecc_corrected",
98 GPU_PMU_FALCON_IMEM_ECC_CORRECTED, 0, 0),
99 GPU_CRITERR("falcon_imem_ecc_uncorrected",
100 GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED, 0, 0),
101 GPU_NONCRITERR("falcon_dmem_ecc_corrected", 0, 0, 0),
102 GPU_CRITERR("falcon_dmem_ecc_uncorrected",
103 GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED, 0, 0),
104 },
105 },
106};
107
108static void nvgpu_init_err_msg_header(struct gpu_err_header *header)
109{
110 header->version.major = (u16)1U;
111 header->version.minor = (u16)0U;
112 header->sub_err_type = 0U;
113 header->sub_unit_id = 0UL;
114 header->address = 0UL;
115 header->timestamp_ns = 0UL;
116}
117
118static void nvgpu_init_ecc_err_msg(struct gpu_ecc_error_info *err_info)
119{
120 nvgpu_init_err_msg_header(&err_info->header);
121 err_info->err_cnt = 0UL;
122}
123
124static void nvgpu_report_ecc_error_linux(struct gk20a *g, u32 hw_unit, u32 inst,
125 u32 err_id, u64 err_addr, u64 err_count)
126{
127 int err = 0;
128 u32 s_id = 0;
129 u8 err_status = 0;
130 u8 err_info_size = 0;
131 u64 timestamp = 0ULL;
132 int err_threshold_counter = 0;
133 struct gpu_ecc_error_info err_pkt;
134 struct nvgpu_err_desc *err_desc = NULL;
135 struct nvgpu_err_hw_module *hw_module = NULL;
136 nv_guard_request_t req;
137
138 memset(&req, 0, sizeof(req));
139 nvgpu_init_ecc_err_msg(&err_pkt);
140 if (hw_unit >= sizeof(gv11b_err_lut)/sizeof(gv11b_err_lut[0])) {
141 err = -EINVAL;
142 goto done;
143 }
144
145 hw_module = &gv11b_err_lut[hw_unit];
146 if (err_id >= hw_module->num_errs) {
147 nvgpu_err(g, "invalid err_id (%u) for hw module (%u)",
148 err_id, hw_module->hw_unit);
149 err = -EINVAL;
150 goto done;
151 }
152 err_desc = &hw_module->errs[err_id];
153 timestamp = (u64)nvgpu_current_time_ns();
154
155 err_pkt.header.timestamp_ns = timestamp;
156 err_pkt.header.sub_unit_id = inst;
157 err_pkt.header.address = err_addr;
158 err_pkt.err_cnt = err_count;
159 err_info_size = sizeof(err_pkt);
160
161 s_id = hw_module->base_ecc_service_id + err_id;
162
163 if (err_desc->is_critical) {
164 err_status = NVGUARD_ERROR_DETECTED;
165 } else {
166 err_status = NVGUARD_NO_ERROR;
167 }
168
169 nvgpu_atomic_inc(&err_desc->err_count);
170 err_threshold_counter = nvgpu_atomic_cmpxchg(&err_desc->err_count,
171 err_desc->err_threshold + 1, 0);
172
173 if (unlikely(err_threshold_counter != err_desc->err_threshold + 1)) {
174 goto done;
175 }
176
177 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting hw: %s, desc:%s, count:%llu",
178 hw_module->name, err_desc->name, err_count);
179
180 req.srv_id_cmd = NVGUARD_SERVICESTATUS_NOTIFICATION;
181 req.srv_status.srv_id = (nv_guard_service_id_t)s_id;
182 req.srv_status.status = err_status;
183 req.srv_status.timestamp = timestamp;
184 req.srv_status.error_info_size = err_info_size;
185 memcpy(req.srv_status.error_info, (u8*)&err_pkt, err_info_size);
186
187 /*
188 * l1ss_submit_rq may fail due to kmalloc failures but may pass in
189 * subsequent calls
190 */
191 err = l1ss_submit_rq(&req, true);
192 if (err != 0) {
193 nvgpu_err(g, "Error returned from L1SS submit %d", err);
194 }
195
196 if (err_desc->is_critical) {
197 nvgpu_quiesce(g);
198 }
199
200done:
201 return;
202}
203
204static void nvgpu_report_ecc_error_empty(struct gk20a *g, u32 hw_unit, u32 inst,
205 u32 err_id, u64 err_addr, u64 err_count) {
206 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting empty");
207}
208
209const struct nvgpu_ecc_reporting_ops default_disabled_ecc_report_ops = {
210 .report_ecc_err = nvgpu_report_ecc_error_empty,
211};
212
213const struct nvgpu_ecc_reporting_ops ecc_enable_report_ops = {
214 .report_ecc_err = nvgpu_report_ecc_error_linux,
215};
216
217static int nvgpu_l1ss_callback(l1ss_cli_callback_param param, void *data)
218{
219 struct gk20a *g = (struct gk20a *)data;
220 struct nvgpu_os_linux *l = NULL;
221 struct nvgpu_ecc_reporting_linux *ecc_reporting_linux = NULL;
222 int err = 0;
223 /* Ensure we have a valid gk20a struct before proceeding */
224 if ((g == NULL) || (gk20a_get(g) == NULL)) {
225 return -ENODEV;
226 }
227
228 l = nvgpu_os_linux_from_gk20a(g);
229 ecc_reporting_linux = &l->ecc_reporting_linux;
230
231 nvgpu_spinlock_acquire(&ecc_reporting_linux->common.lock);
232 if (param == L1SS_READY) {
233 if (!ecc_reporting_linux->common.ecc_reporting_service_enabled) {
234 ecc_reporting_linux->common.ecc_reporting_service_enabled = true;
235 ecc_reporting_linux->common.ops = &ecc_enable_report_ops;
236 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is enabled");
237 }
238 } else if (param == L1SS_NOT_READY) {
239 if (ecc_reporting_linux->common.ecc_reporting_service_enabled) {
240 ecc_reporting_linux->common.ecc_reporting_service_enabled = false;
241 ecc_reporting_linux->common.ops = &default_disabled_ecc_report_ops;
242 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is disabled");
243 }
244 } else {
245 err = -EINVAL;
246 }
247 nvgpu_spinlock_release(&ecc_reporting_linux->common.lock);
248
249 gk20a_put(g);
250
251 return err;
252}
253
254void nvgpu_init_ecc_reporting(struct gk20a *g)
255{
256 struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
257 struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
258 int err = 0;
259 /* This will invoke the registration API */
260 nvgpu_spinlock_init(&ecc_report_linux->common.lock);
261 ecc_report_linux->priv.id = (NVGUARD_GROUPID_IGPU & NVGUARD_GROUPINDEX_FIELDMASK);
262 ecc_report_linux->priv.cli_callback = nvgpu_l1ss_callback;
263 ecc_report_linux->priv.data = g;
264 ecc_report_linux->common.ops = &default_disabled_ecc_report_ops;
265
266 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting Init");
267
268 /*
269 * err == 0 indicates service is available but not active yet.
270 * err == 1 indicates service is available and active
271 * error for other cases.
272 */
273 err = l1ss_register_client(&ecc_report_linux->priv);
274 if (err == 0) {
275 ecc_report_linux->common.ecc_reporting_service_enabled = false;
276 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting init success");
277 } else if (err == 1) {
278 ecc_report_linux->common.ecc_reporting_service_enabled = true;
279 /* Actual Ops will be replaced during nvgpu_enable_ecc_reporting
280 * called as part of gk20a_busy()
281 */
282 } else {
283 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting init failure %d", err);
284 }
285}
286
287void nvgpu_deinit_ecc_reporting(struct gk20a *g)
288{
289 struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
290 struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
291
292 if (ecc_report_linux->common.ecc_reporting_service_enabled) {
293 ecc_report_linux->common.ecc_reporting_service_enabled = false;
294 l1ss_deregister_client(ecc_report_linux->priv.id);
295 memset(ecc_report_linux, 0, sizeof(*ecc_report_linux));
296 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting de-init success");
297 }
298
299}
300
301void nvgpu_enable_ecc_reporting(struct gk20a *g)
302{
303 struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
304 struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
305 struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common;
306
307 nvgpu_spinlock_acquire(&ecc_report_linux->common.lock);
308 if (error_reporting->ecc_reporting_service_enabled) {
309 error_reporting->ops = &ecc_enable_report_ops;
310 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is enabled");
311 }
312 nvgpu_spinlock_release(&ecc_report_linux->common.lock);
313}
314
315void nvgpu_disable_ecc_reporting(struct gk20a *g)
316{
317 struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
318 struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
319 struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common;
320
321 nvgpu_spinlock_acquire(&ecc_report_linux->common.lock);
322 error_reporting->ops = &default_disabled_ecc_report_ops;
323 nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is disabled");
324 nvgpu_spinlock_release(&ecc_report_linux->common.lock);
325}
326
327void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
328 u32 err_id, u64 err_addr, u64 err_count)
329{
330 struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
331 struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux;
332 struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common;
333 void (*report_ecc_err_func)(struct gk20a *g, u32 hw_unit, u32 inst,
334 u32 err_id, u64 err_addr, u64 err_count);
335
336 nvgpu_spinlock_acquire(&ecc_report_linux->common.lock);
337 report_ecc_err_func = error_reporting->ops->report_ecc_err;
338 nvgpu_spinlock_release(&ecc_report_linux->common.lock);
339
340 report_ecc_err_func(g, hw_unit, inst, err_id, err_addr, err_count);
341}