diff options
-rw-r--r-- | drivers/gpu/nvgpu/Kconfig | 7 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/Makefile | 2 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/gk20a.c | 7 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gv11b/gr_gv11b.c | 43 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gv11b/pmu_gv11b.c | 15 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/include/nvgpu/bug.h | 20 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/include/nvgpu/log.h | 3 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h | 359 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/os/linux/ecc_linux.h | 49 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/os/linux/module.c | 22 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/os/linux/os_linux.h | 7 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/os/linux/sdl.c | 341 |
12 files changed, 867 insertions, 8 deletions
diff --git a/drivers/gpu/nvgpu/Kconfig b/drivers/gpu/nvgpu/Kconfig index 7dba61a3..07331817 100644 --- a/drivers/gpu/nvgpu/Kconfig +++ b/drivers/gpu/nvgpu/Kconfig | |||
@@ -143,6 +143,13 @@ config NVGPU_SUPPORT_CDE | |||
143 | help | 143 | help |
144 | Enable support for extraction of comptags for CDE. | 144 | Enable support for extraction of comptags for CDE. |
145 | 145 | ||
146 | config NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING | ||
147 | bool "Support ECC error reporting for Linux" | ||
148 | depends on TEGRA_SAFETY | ||
149 | default y | ||
150 | help | ||
151 | Enable support for ECC error reporting for Linux. | ||
152 | |||
146 | config NVGPU_USE_TEGRA_ALLOC_FD | 153 | config NVGPU_USE_TEGRA_ALLOC_FD |
147 | bool "Use tegra_alloc_fd() for allocating dma_buf fds for vidmem" | 154 | bool "Use tegra_alloc_fd() for allocating dma_buf fds for vidmem" |
148 | depends on GK20A && GK20A_VIDMEM | 155 | depends on GK20A && GK20A_VIDMEM |
diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile index 472bf32c..d5ceecb6 100644 --- a/drivers/gpu/nvgpu/Makefile +++ b/drivers/gpu/nvgpu/Makefile | |||
@@ -98,6 +98,8 @@ nvgpu-y += \ | |||
98 | os/linux/ltc.o \ | 98 | os/linux/ltc.o \ |
99 | os/linux/vpr.o | 99 | os/linux/vpr.o |
100 | 100 | ||
101 | nvgpu-$(CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING) += os/linux/sdl.o | ||
102 | |||
101 | nvgpu-$(CONFIG_GK20A_VIDMEM) += \ | 103 | nvgpu-$(CONFIG_GK20A_VIDMEM) += \ |
102 | os/linux/dmabuf_vidmem.o | 104 | os/linux/dmabuf_vidmem.o |
103 | 105 | ||
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c index c3068b76..1a117169 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gk20a.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * GK20A Graphics | 2 | * GK20A Graphics |
3 | * | 3 | * |
4 | * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. | 4 | * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. |
5 | * | 5 | * |
6 | * Permission is hereby granted, free of charge, to any person obtaining a | 6 | * Permission is hereby granted, free of charge, to any person obtaining a |
7 | * copy of this software and associated documentation files (the "Software"), | 7 | * copy of this software and associated documentation files (the "Software"), |
@@ -39,6 +39,7 @@ | |||
39 | #include <nvgpu/therm.h> | 39 | #include <nvgpu/therm.h> |
40 | #include <nvgpu/mc.h> | 40 | #include <nvgpu/mc.h> |
41 | #include <nvgpu/channel_sync.h> | 41 | #include <nvgpu/channel_sync.h> |
42 | #include <nvgpu/nvgpu_err.h> | ||
42 | 43 | ||
43 | #include <trace/events/gk20a.h> | 44 | #include <trace/events/gk20a.h> |
44 | 45 | ||
@@ -525,6 +526,10 @@ static void gk20a_free_cb(struct nvgpu_ref *refcount) | |||
525 | struct gk20a *g = container_of(refcount, | 526 | struct gk20a *g = container_of(refcount, |
526 | struct gk20a, refcount); | 527 | struct gk20a, refcount); |
527 | 528 | ||
529 | #ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING | ||
530 | nvgpu_deinit_ecc_reporting(g); | ||
531 | #endif | ||
532 | |||
528 | nvgpu_log(g, gpu_dbg_shutdown, "Freeing GK20A struct!"); | 533 | nvgpu_log(g, gpu_dbg_shutdown, "Freeing GK20A struct!"); |
529 | 534 | ||
530 | gk20a_ce_destroy(g); | 535 | gk20a_ce_destroy(g); |
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c index a7a804d2..110819a9 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * GV11b GPU GR | 2 | * GV11b GPU GR |
3 | * | 3 | * |
4 | * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. | 4 | * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. |
5 | * | 5 | * |
6 | * Permission is hereby granted, free of charge, to any person obtaining a | 6 | * Permission is hereby granted, free of charge, to any person obtaining a |
7 | * copy of this software and associated documentation files (the "Software"), | 7 | * copy of this software and associated documentation files (the "Software"), |
@@ -37,6 +37,7 @@ | |||
37 | #include <nvgpu/bitops.h> | 37 | #include <nvgpu/bitops.h> |
38 | #include <nvgpu/gk20a.h> | 38 | #include <nvgpu/gk20a.h> |
39 | #include <nvgpu/channel.h> | 39 | #include <nvgpu/channel.h> |
40 | #include <nvgpu/nvgpu_err.h> | ||
40 | 41 | ||
41 | #include "gk20a/gr_gk20a.h" | 42 | #include "gk20a/gr_gk20a.h" |
42 | #include "gk20a/dbg_gpu_gk20a.h" | 43 | #include "gk20a/dbg_gpu_gk20a.h" |
@@ -61,6 +62,8 @@ | |||
61 | #include <nvgpu/hw/gv11b/hw_pbdma_gv11b.h> | 62 | #include <nvgpu/hw/gv11b/hw_pbdma_gv11b.h> |
62 | #include <nvgpu/hw/gv11b/hw_perf_gv11b.h> | 63 | #include <nvgpu/hw/gv11b/hw_perf_gv11b.h> |
63 | 64 | ||
65 | #define SHIFT_8_BITS 8U | ||
66 | |||
64 | #define GFXP_WFI_TIMEOUT_COUNT_IN_USEC_DEFAULT 100 | 67 | #define GFXP_WFI_TIMEOUT_COUNT_IN_USEC_DEFAULT 100 |
65 | 68 | ||
66 | /* ecc scrubbing will done in 1 pri read cycle,but for safety used 10 retries */ | 69 | /* ecc scrubbing will done in 1 pri read cycle,but for safety used 10 retries */ |
@@ -224,6 +227,12 @@ static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc, | |||
224 | } | 227 | } |
225 | g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter += | 228 | g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter += |
226 | l1_tag_corrected_err_count_delta; | 229 | l1_tag_corrected_err_count_delta; |
230 | |||
231 | nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, | ||
232 | (gpc << SHIFT_8_BITS) | tpc, | ||
233 | GPU_SM_L1_TAG_ECC_CORRECTED, 0, | ||
234 | g->ecc.gr.sm_l1_tag_ecc_corrected_err_count[gpc][tpc].counter); | ||
235 | |||
227 | gk20a_writel(g, | 236 | gk20a_writel(g, |
228 | gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r() + offset, | 237 | gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r() + offset, |
229 | 0); | 238 | 0); |
@@ -240,6 +249,12 @@ static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc, | |||
240 | } | 249 | } |
241 | g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter += | 250 | g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter += |
242 | l1_tag_uncorrected_err_count_delta; | 251 | l1_tag_uncorrected_err_count_delta; |
252 | |||
253 | nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, | ||
254 | (gpc << SHIFT_8_BITS) | tpc, | ||
255 | GPU_SM_L1_TAG_ECC_UNCORRECTED, 0, | ||
256 | g->ecc.gr.sm_l1_tag_ecc_uncorrected_err_count[gpc][tpc].counter); | ||
257 | |||
243 | gk20a_writel(g, | 258 | gk20a_writel(g, |
244 | gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r() + offset, | 259 | gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r() + offset, |
245 | 0); | 260 | 0); |
@@ -335,6 +350,10 @@ static int gr_gv11b_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc, | |||
335 | } | 350 | } |
336 | g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter += | 351 | g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter += |
337 | lrf_uncorrected_err_count_delta; | 352 | lrf_uncorrected_err_count_delta; |
353 | nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, | ||
354 | (gpc << SHIFT_8_BITS) | tpc, | ||
355 | GPU_SM_LRF_ECC_UNCORRECTED, 0, | ||
356 | g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter); | ||
338 | gk20a_writel(g, | 357 | gk20a_writel(g, |
339 | gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r() + offset, | 358 | gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r() + offset, |
340 | 0); | 359 | 0); |
@@ -497,6 +516,12 @@ static int gr_gv11b_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc, | |||
497 | } | 516 | } |
498 | g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter += | 517 | g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter += |
499 | cbu_uncorrected_err_count_delta; | 518 | cbu_uncorrected_err_count_delta; |
519 | |||
520 | nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, | ||
521 | (gpc << SHIFT_8_BITS) | tpc, | ||
522 | GPU_SM_CBU_ECC_UNCORRECTED, | ||
523 | 0, g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter); | ||
524 | |||
500 | gk20a_writel(g, | 525 | gk20a_writel(g, |
501 | gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r() + offset, | 526 | gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r() + offset, |
502 | 0); | 527 | 0); |
@@ -580,6 +605,10 @@ static int gr_gv11b_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32 tpc, | |||
580 | } | 605 | } |
581 | g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter += | 606 | g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter += |
582 | l1_data_uncorrected_err_count_delta; | 607 | l1_data_uncorrected_err_count_delta; |
608 | nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_SM, | ||
609 | (gpc << SHIFT_8_BITS) | tpc, | ||
610 | GPU_SM_L1_DATA_ECC_UNCORRECTED, | ||
611 | 0, g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter); | ||
583 | gk20a_writel(g, | 612 | gk20a_writel(g, |
584 | gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r() + offset, | 613 | gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r() + offset, |
585 | 0); | 614 | 0); |
@@ -2537,10 +2566,18 @@ static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr) | |||
2537 | 2566 | ||
2538 | if (ecc_status & | 2567 | if (ecc_status & |
2539 | gr_fecs_falcon_ecc_status_corrected_err_imem_m()) { | 2568 | gr_fecs_falcon_ecc_status_corrected_err_imem_m()) { |
2569 | nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0, | ||
2570 | GPU_FECS_FALCON_IMEM_ECC_CORRECTED, | ||
2571 | ecc_addr, | ||
2572 | g->ecc.gr.fecs_ecc_corrected_err_count[0].counter); | ||
2540 | nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected"); | 2573 | nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected"); |
2541 | } | 2574 | } |
2542 | if (ecc_status & | 2575 | if (ecc_status & |
2543 | gr_fecs_falcon_ecc_status_uncorrected_err_imem_m()) { | 2576 | gr_fecs_falcon_ecc_status_uncorrected_err_imem_m()) { |
2577 | nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0, | ||
2578 | GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED, | ||
2579 | ecc_addr, | ||
2580 | g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter); | ||
2544 | nvgpu_log(g, gpu_dbg_intr, | 2581 | nvgpu_log(g, gpu_dbg_intr, |
2545 | "imem ecc error uncorrected"); | 2582 | "imem ecc error uncorrected"); |
2546 | } | 2583 | } |
@@ -2550,6 +2587,10 @@ static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr) | |||
2550 | } | 2587 | } |
2551 | if (ecc_status & | 2588 | if (ecc_status & |
2552 | gr_fecs_falcon_ecc_status_uncorrected_err_dmem_m()) { | 2589 | gr_fecs_falcon_ecc_status_uncorrected_err_dmem_m()) { |
2590 | nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_FECS, 0, | ||
2591 | GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED, | ||
2592 | ecc_addr, | ||
2593 | g->ecc.gr.fecs_ecc_uncorrected_err_count[0].counter); | ||
2553 | nvgpu_log(g, gpu_dbg_intr, | 2594 | nvgpu_log(g, gpu_dbg_intr, |
2554 | "dmem ecc error uncorrected"); | 2595 | "dmem ecc error uncorrected"); |
2555 | } | 2596 | } |
diff --git a/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c b/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c index 5e586ec2..336258a7 100644 --- a/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * GV11B PMU | 2 | * GV11B PMU |
3 | * | 3 | * |
4 | * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. | 4 | * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. |
5 | * | 5 | * |
6 | * Permission is hereby granted, free of charge, to any person obtaining a | 6 | * Permission is hereby granted, free of charge, to any person obtaining a |
7 | * copy of this software and associated documentation files (the "Software"), | 7 | * copy of this software and associated documentation files (the "Software"), |
@@ -29,6 +29,7 @@ | |||
29 | #include <nvgpu/io.h> | 29 | #include <nvgpu/io.h> |
30 | #include <nvgpu/utils.h> | 30 | #include <nvgpu/utils.h> |
31 | #include <nvgpu/gk20a.h> | 31 | #include <nvgpu/gk20a.h> |
32 | #include <nvgpu/nvgpu_err.h> | ||
32 | 33 | ||
33 | #include "gk20a/pmu_gk20a.h" | 34 | #include "gk20a/pmu_gk20a.h" |
34 | #include "gp10b/pmu_gp10b.h" | 35 | #include "gp10b/pmu_gp10b.h" |
@@ -354,10 +355,18 @@ void gv11b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0) | |||
354 | "pmu ecc interrupt intr1: 0x%x", intr1); | 355 | "pmu ecc interrupt intr1: 0x%x", intr1); |
355 | 356 | ||
356 | if (ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) { | 357 | if (ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) { |
358 | nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0, | ||
359 | GPU_PMU_FALCON_IMEM_ECC_CORRECTED, | ||
360 | ecc_addr, | ||
361 | g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter); | ||
357 | nvgpu_log(g, gpu_dbg_intr, | 362 | nvgpu_log(g, gpu_dbg_intr, |
358 | "imem ecc error corrected"); | 363 | "imem ecc error corrected"); |
359 | } | 364 | } |
360 | if (ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) { | 365 | if (ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) { |
366 | nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0, | ||
367 | GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED, | ||
368 | ecc_addr, | ||
369 | g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter); | ||
361 | nvgpu_log(g, gpu_dbg_intr, | 370 | nvgpu_log(g, gpu_dbg_intr, |
362 | "imem ecc error uncorrected"); | 371 | "imem ecc error uncorrected"); |
363 | } | 372 | } |
@@ -366,6 +375,10 @@ void gv11b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0) | |||
366 | "dmem ecc error corrected"); | 375 | "dmem ecc error corrected"); |
367 | } | 376 | } |
368 | if (ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) { | 377 | if (ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) { |
378 | nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0, | ||
379 | GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED, | ||
380 | ecc_addr, | ||
381 | g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter); | ||
369 | nvgpu_log(g, gpu_dbg_intr, | 382 | nvgpu_log(g, gpu_dbg_intr, |
370 | "dmem ecc error uncorrected"); | 383 | "dmem ecc error uncorrected"); |
371 | } | 384 | } |
diff --git a/drivers/gpu/nvgpu/include/nvgpu/bug.h b/drivers/gpu/nvgpu/include/nvgpu/bug.h index 3d139b75..82d641bd 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/bug.h +++ b/drivers/gpu/nvgpu/include/nvgpu/bug.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. | 2 | * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. |
3 | * | 3 | * |
4 | * Permission is hereby granted, free of charge, to any person obtaining a | 4 | * Permission is hereby granted, free of charge, to any person obtaining a |
5 | * copy of this software and associated documentation files (the "Software"), | 5 | * copy of this software and associated documentation files (the "Software"), |
@@ -24,6 +24,24 @@ | |||
24 | 24 | ||
25 | #ifdef __KERNEL__ | 25 | #ifdef __KERNEL__ |
26 | #include <linux/bug.h> | 26 | #include <linux/bug.h> |
27 | /* | ||
28 | * Define an assert macro that code within nvgpu can use. | ||
29 | * | ||
30 | * The goal of this macro is for debugging but what that means varies from OS | ||
31 | * to OS. On Linux we don't want to BUG() for general driver misbehaving. BUG() | ||
32 | * is a very heavy handed tool - in fact there's probably nowhere within the | ||
33 | * nvgpu core code where it makes sense to use a BUG() when running under Linux. | ||
34 | * | ||
35 | * However, on QNX (and POSIX) BUG() will just kill the current process. This | ||
36 | * means we can use it for handling bugs in nvgpu. | ||
37 | * | ||
38 | * As a result this macro varies depending on platform. | ||
39 | */ | ||
40 | #define nvgpu_assert(cond) ((void) WARN_ON(!(cond))) | ||
41 | #define nvgpu_do_assert_print(g, fmt, arg...) \ | ||
42 | do { \ | ||
43 | nvgpu_err(g, fmt, ##arg); \ | ||
44 | } while (false) | ||
27 | #elif defined(__NVGPU_POSIX__) | 45 | #elif defined(__NVGPU_POSIX__) |
28 | #include <nvgpu/posix/bug.h> | 46 | #include <nvgpu/posix/bug.h> |
29 | #else | 47 | #else |
diff --git a/drivers/gpu/nvgpu/include/nvgpu/log.h b/drivers/gpu/nvgpu/include/nvgpu/log.h index 70a16762..2bcca335 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/log.h +++ b/drivers/gpu/nvgpu/include/nvgpu/log.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. | 2 | * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. |
3 | * | 3 | * |
4 | * Permission is hereby granted, free of charge, to any person obtaining a | 4 | * Permission is hereby granted, free of charge, to any person obtaining a |
5 | * copy of this software and associated documentation files (the "Software"), | 5 | * copy of this software and associated documentation files (the "Software"), |
@@ -80,6 +80,7 @@ void __nvgpu_log_dbg(struct gk20a *g, u64 log_mask, | |||
80 | #define gpu_dbg_vidmem BIT(24) /* VIDMEM tracing. */ | 80 | #define gpu_dbg_vidmem BIT(24) /* VIDMEM tracing. */ |
81 | #define gpu_dbg_nvlink BIT(25) /* nvlink Operation tracing. */ | 81 | #define gpu_dbg_nvlink BIT(25) /* nvlink Operation tracing. */ |
82 | #define gpu_dbg_clk_arb BIT(26) /* Clk arbiter debugging. */ | 82 | #define gpu_dbg_clk_arb BIT(26) /* Clk arbiter debugging. */ |
83 | #define gpu_dbg_ecc BIT(27) /* Print ECC Info Logs. */ | ||
83 | #define gpu_dbg_mem BIT(31) /* memory accesses; very verbose. */ | 84 | #define gpu_dbg_mem BIT(31) /* memory accesses; very verbose. */ |
84 | 85 | ||
85 | /** | 86 | /** |
diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h new file mode 100644 index 00000000..0595fafb --- /dev/null +++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h | |||
@@ -0,0 +1,359 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. | ||
3 | * | ||
4 | * Permission is hereby granted, free of charge, to any person obtaining a | ||
5 | * copy of this software and associated documentation files (the "Software"), | ||
6 | * to deal in the Software without restriction, including without limitation | ||
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
8 | * and/or sell copies of the Software, and to permit persons to whom the | ||
9 | * Software is furnished to do so, subject to the following conditions: | ||
10 | * | ||
11 | * The above copyright notice and this permission notice shall be included in | ||
12 | * all copies or substantial portions of the Software. | ||
13 | * | ||
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | ||
17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | ||
20 | * DEALINGS IN THE SOFTWARE. | ||
21 | */ | ||
22 | |||
23 | #ifndef NVGPU_NVGPU_ERR_H | ||
24 | #define NVGPU_NVGPU_ERR_H | ||
25 | |||
26 | /** | ||
27 | * @file | ||
28 | * | ||
29 | * Define indices for HW units and errors. Define structures used to carry error | ||
30 | * information. Declare prototype for APIs that are used to report GPU HW errors | ||
31 | * to the Safety_Services framework. | ||
32 | */ | ||
33 | |||
34 | #include <nvgpu/types.h> | ||
35 | #include <nvgpu/atomic.h> | ||
36 | |||
37 | struct gk20a; | ||
38 | |||
39 | /** | ||
40 | * @defgroup INDICES_FOR_GPU_HW_UNITS | ||
41 | * Macros used to assign unique index to GPU HW units. | ||
42 | * @{ | ||
43 | */ | ||
44 | #define NVGPU_ERR_MODULE_SM (0U) | ||
45 | #define NVGPU_ERR_MODULE_FECS (1U) | ||
46 | #define NVGPU_ERR_MODULE_PMU (2U) | ||
47 | /** | ||
48 | * @} | ||
49 | */ | ||
50 | |||
51 | /** | ||
52 | * @defgroup LIST_OF_ERRORS_REPORTED_FROM_SM | ||
53 | * Macros used to assign unique index to errors reported from the SM unit. | ||
54 | * @{ | ||
55 | */ | ||
56 | #define GPU_SM_L1_TAG_ECC_CORRECTED (0U) | ||
57 | #define GPU_SM_L1_TAG_ECC_UNCORRECTED (1U) | ||
58 | #define GPU_SM_CBU_ECC_UNCORRECTED (3U) | ||
59 | #define GPU_SM_LRF_ECC_UNCORRECTED (5U) | ||
60 | #define GPU_SM_L1_DATA_ECC_UNCORRECTED (7U) | ||
61 | #define GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED (9U) | ||
62 | #define GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED (11U) | ||
63 | #define GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED (13U) | ||
64 | #define GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED (15U) | ||
65 | #define GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED (17U) | ||
66 | #define GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED (20U) | ||
67 | /** | ||
68 | * @} | ||
69 | */ | ||
70 | |||
71 | /** | ||
72 | * @defgroup LIST_OF_ERRORS_REPORTED_FROM_FECS | ||
73 | * Macros used to assign unique index to errors reported from the FECS unit. | ||
74 | * @{ | ||
75 | */ | ||
76 | #define GPU_FECS_FALCON_IMEM_ECC_CORRECTED (0U) | ||
77 | #define GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED (1U) | ||
78 | #define GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED (3U) | ||
79 | /** | ||
80 | * @} | ||
81 | */ | ||
82 | |||
83 | /** | ||
84 | * @defgroup LIST_OF_ERRORS_REPORTED_FROM_GPCCS | ||
85 | * Macros used to assign unique index to errors reported from the GPCCS unit. | ||
86 | * @{ | ||
87 | */ | ||
88 | #define GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED (0U) | ||
89 | #define GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED (1U) | ||
90 | #define GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED (3U) | ||
91 | /** | ||
92 | * @} | ||
93 | */ | ||
94 | |||
95 | /** | ||
96 | * @defgroup LIST_OF_ERRORS_REPORTED_FROM_MMU | ||
97 | * Macros used to assign unique index to errors reported from the MMU unit. | ||
98 | * @{ | ||
99 | */ | ||
100 | #define GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED (1U) | ||
101 | #define GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED (3U) | ||
102 | /** | ||
103 | * @} | ||
104 | */ | ||
105 | |||
106 | /** | ||
107 | * @defgroup LIST_OF_ERRORS_REPORTED_FROM_GCC | ||
108 | * Macros used to assign unique index to errors reported from the GCC unit. | ||
109 | * @{ | ||
110 | */ | ||
111 | #define GPU_GCC_L15_ECC_UNCORRECTED (1U) | ||
112 | /** | ||
113 | * @} | ||
114 | */ | ||
115 | |||
116 | |||
117 | /** | ||
118 | * @defgroup LIST_OF_ERRORS_REPORTED_FROM_PMU | ||
119 | * Macros used to assign unique index to errors reported from the PMU unit. | ||
120 | * @{ | ||
121 | */ | ||
122 | #define GPU_PMU_FALCON_IMEM_ECC_CORRECTED (0U) | ||
123 | #define GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED (1U) | ||
124 | #define GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED (3U) | ||
125 | /** | ||
126 | * @} | ||
127 | */ | ||
128 | |||
129 | /** | ||
130 | * @defgroup LIST_OF_ERRORS_REPORTED_FROM_LTC | ||
131 | * Macros used to assign unique index to errors reported from the LTC unit. | ||
132 | * @{ | ||
133 | */ | ||
134 | #define GPU_LTC_CACHE_DSTG_ECC_CORRECTED (0U) | ||
135 | #define GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED (1U) | ||
136 | #define GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED (3U) | ||
137 | #define GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED (7U) | ||
138 | /** | ||
139 | * @} | ||
140 | */ | ||
141 | |||
142 | /** | ||
143 | * @defgroup LIST_OF_ERRORS_REPORTED_FROM_HUBMMU | ||
144 | * Macros used to assign unique index to errors reported from the HUBMMU unit. | ||
145 | * @{ | ||
146 | */ | ||
147 | #define GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED (1U) | ||
148 | #define GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED (3U) | ||
149 | #define GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED (5U) | ||
150 | #define GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED (7U) | ||
151 | #define GPU_HUBMMU_PAGE_FAULT_ERROR (8U) | ||
152 | |||
153 | |||
154 | #ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING | ||
155 | /** | ||
156 | * @} | ||
157 | */ | ||
158 | |||
159 | /** | ||
160 | * nvgpu_err_desc structure holds fields which describe an error along with | ||
161 | * function callback which can be used to inject the error. | ||
162 | */ | ||
163 | struct nvgpu_err_desc { | ||
164 | /** String representation of error. */ | ||
165 | const char *name; | ||
166 | |||
167 | /** Flag to classify an error as critical or non-critical. */ | ||
168 | bool is_critical; | ||
169 | |||
170 | /** | ||
171 | * Error Threshold: once this threshold value is reached, then the | ||
172 | * corresponding error counter will be reset to 0 and the error will be | ||
173 | * propagated to Safety_Services. | ||
174 | */ | ||
175 | int err_threshold; | ||
176 | |||
177 | /** | ||
178 | * Total number of times an error has occurred (since its last reset). | ||
179 | */ | ||
180 | nvgpu_atomic_t err_count; | ||
181 | |||
182 | /** Error ID. */ | ||
183 | u8 error_id; | ||
184 | }; | ||
185 | |||
186 | /** | ||
187 | * gpu_err_header structure holds fields which are required to identify the | ||
188 | * version of header, sub-error type, sub-unit id, error address and time stamp. | ||
189 | */ | ||
190 | struct gpu_err_header { | ||
191 | /** Version of GPU error header. */ | ||
192 | struct { | ||
193 | /** Major version number. */ | ||
194 | u16 major; | ||
195 | /** Minor version number. */ | ||
196 | u16 minor; | ||
197 | } version; | ||
198 | |||
199 | /** Sub error type corresponding to the error that is being reported. */ | ||
200 | u32 sub_err_type; | ||
201 | |||
202 | /** ID of the sub-unit in a HW unit which encountered an error. */ | ||
203 | u64 sub_unit_id; | ||
204 | |||
205 | /** Location of the error. */ | ||
206 | u64 address; | ||
207 | |||
208 | /** Timestamp in nanoseconds. */ | ||
209 | u64 timestamp_ns; | ||
210 | }; | ||
211 | |||
212 | struct gpu_ecc_error_info { | ||
213 | struct gpu_err_header header; | ||
214 | |||
215 | /** Number of ECC errors. */ | ||
216 | u64 err_cnt; | ||
217 | }; | ||
218 | |||
219 | /** | ||
220 | * nvgpu_err_hw_module structure holds fields which describe the h/w modules | ||
221 | * error reporting capabilities. | ||
222 | */ | ||
223 | struct nvgpu_err_hw_module { | ||
224 | /** String representation of a given HW unit. */ | ||
225 | const char *name; | ||
226 | |||
227 | /** HW unit ID. */ | ||
228 | u32 hw_unit; | ||
229 | |||
230 | /** Total number of errors reported from a given HW unit. */ | ||
231 | u32 num_errs; | ||
232 | |||
233 | u32 base_ecc_service_id; | ||
234 | |||
235 | /** Used to get error description from look-up table. */ | ||
236 | struct nvgpu_err_desc *errs; | ||
237 | }; | ||
238 | |||
239 | struct nvgpu_ecc_reporting_ops { | ||
240 | void (*report_ecc_err)(struct gk20a *g, u32 hw_unit, u32 inst, | ||
241 | u32 err_id, u64 err_addr, u64 err_count); | ||
242 | }; | ||
243 | |||
244 | struct nvgpu_ecc_reporting { | ||
245 | struct nvgpu_spinlock lock; | ||
246 | /* This flag is protected by the above spinlock */ | ||
247 | bool ecc_reporting_service_enabled; | ||
248 | const struct nvgpu_ecc_reporting_ops *ops; | ||
249 | }; | ||
250 | |||
251 | /** | ||
252 | * This macro is used to initialize the members of nvgpu_err_desc struct. | ||
253 | */ | ||
254 | #define GPU_ERR(err, critical, id, threshold, ecount) \ | ||
255 | { \ | ||
256 | .name = (err), \ | ||
257 | .is_critical = (critical), \ | ||
258 | .error_id = (id), \ | ||
259 | .err_threshold = (threshold), \ | ||
260 | .err_count = NVGPU_ATOMIC_INIT(ecount), \ | ||
261 | } | ||
262 | |||
263 | /** | ||
264 | * This macro is used to initialize critical errors. | ||
265 | */ | ||
266 | #define GPU_CRITERR(err, id, threshold, ecount) \ | ||
267 | GPU_ERR(err, true, id, threshold, ecount) | ||
268 | |||
269 | /** | ||
270 | * This macro is used to initialize non-critical errors. | ||
271 | */ | ||
272 | #define GPU_NONCRITERR(err, id, threshold, ecount) \ | ||
273 | GPU_ERR(err, false, id, threshold, ecount) | ||
274 | |||
275 | /** | ||
276 | * @brief GPU HW errors need to be reported to Safety_Services via SDL unit. | ||
277 | * This function provides an interface to report ECC errors to SDL unit. | ||
278 | * | ||
279 | * @param g [in] - The GPU driver struct. | ||
280 | * @param hw_unit [in] - Index of HW unit. | ||
281 | * - List of valid HW unit IDs | ||
282 | * - NVGPU_ERR_MODULE_SM | ||
283 | * - NVGPU_ERR_MODULE_FECS | ||
284 | * - NVGPU_ERR_MODULE_GPCCS | ||
285 | * - NVGPU_ERR_MODULE_MMU | ||
286 | * - NVGPU_ERR_MODULE_GCC | ||
287 | * - NVGPU_ERR_MODULE_PMU | ||
288 | * - NVGPU_ERR_MODULE_LTC | ||
289 | * - NVGPU_ERR_MODULE_HUBMMU | ||
290 | * @param inst [in] - Instance ID. | ||
291 | * - In case of multiple instances of the same HW | ||
292 | * unit (e.g., there are multiple instances of | ||
293 | * SM), it is used to identify the instance | ||
294 | * that encountered a fault. | ||
295 | * @param err_id [in] - Error index. | ||
296 | * - For SM: | ||
297 | * - Min: GPU_SM_L1_TAG_ECC_CORRECTED | ||
298 | * - Max: GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED | ||
299 | * - For FECS: | ||
300 | * - Min: GPU_FECS_FALCON_IMEM_ECC_CORRECTED | ||
301 | * - Max: GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED | ||
302 | * - For GPCCS: | ||
303 | * - Min: GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED | ||
304 | * - Max: GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED | ||
305 | * - For MMU: | ||
306 | * - Min: GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED | ||
307 | * - Max: GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED | ||
308 | * - For GCC: | ||
309 | * - Min: GPU_GCC_L15_ECC_UNCORRECTED | ||
310 | * - Max: GPU_GCC_L15_ECC_UNCORRECTED | ||
311 | * - For PMU: | ||
312 | * - Min: GPU_PMU_FALCON_IMEM_ECC_CORRECTED | ||
313 | * - Max: GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED | ||
314 | * - For LTC: | ||
315 | * - Min: GPU_LTC_CACHE_DSTG_ECC_CORRECTED | ||
316 | * - Max: GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED | ||
317 | * - For HUBMMU: | ||
318 | * - Min: GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED | ||
319 | * - Max: GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED | ||
320 | * @param err_addr [in] - Error address. | ||
321 | * - This is the location at which correctable or | ||
322 | * uncorrectable error has occurred. | ||
323 | * @param err_count [in] - Error count. | ||
324 | * | ||
325 | * - Checks whether SDL is supported in the current GPU platform. If SDL is not | ||
326 | * supported, it simply returns. | ||
327 | * - Validates both \a hw_unit and \a err_id indices. In case of a failure, | ||
328 | * invokes #nvgpu_sdl_handle_report_failure() api. | ||
329 | * - Gets the current time of a clock. In case of a failure, invokes | ||
330 | * #nvgpu_sdl_handle_report_failure() api. | ||
331 | * - Gets error description from internal look-up table using \a hw_unit and | ||
332 | * \a err_id indices. | ||
333 | * - Forms error packet using details such as time-stamp, \a hw_unit, \a err_id, | ||
334 | * criticality of the error, \a inst, \a err_addr, \a err_count, error | ||
335 | * description, and size of the error packet. | ||
336 | * - Performs compile-time assert check to ensure that the size of the error | ||
337 | * packet does not exceed the maximum allowable size specified in | ||
338 | * #MAX_ERR_MSG_SIZE. | ||
339 | * | ||
340 | * @return None | ||
341 | */ | ||
342 | void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst, | ||
343 | u32 err_id, u64 err_addr, u64 err_count); | ||
344 | |||
345 | void nvgpu_init_ecc_reporting(struct gk20a *g); | ||
346 | void nvgpu_enable_ecc_reporting(struct gk20a *g); | ||
347 | void nvgpu_disable_ecc_reporting(struct gk20a *g); | ||
348 | void nvgpu_deinit_ecc_reporting(struct gk20a *g); | ||
349 | |||
350 | #else | ||
351 | |||
352 | static inline void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst, | ||
353 | u32 err_id, u64 err_addr, u64 err_count) { | ||
354 | |||
355 | } | ||
356 | |||
357 | #endif /* CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING */ | ||
358 | |||
359 | #endif /* NVGPU_NVGPU_ERR_H */ \ No newline at end of file | ||
diff --git a/drivers/gpu/nvgpu/os/linux/ecc_linux.h b/drivers/gpu/nvgpu/os/linux/ecc_linux.h new file mode 100644 index 00000000..7e0f650b --- /dev/null +++ b/drivers/gpu/nvgpu/os/linux/ecc_linux.h | |||
@@ -0,0 +1,49 @@ | |||
1 | /* | ||
2 | * | ||
3 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. | ||
4 | * | ||
5 | * Permission is hereby granted, free of charge, to any person obtaining a | ||
6 | * copy of this software and associated documentation files (the "Software"), | ||
7 | * to deal in the Software without restriction, including without limitation | ||
8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
9 | * and/or sell copies of the Software, and to permit persons to whom the | ||
10 | * Software is furnished to do so, subject to the following conditions: | ||
11 | * | ||
12 | * The above copyright notice and this permission notice shall be included in | ||
13 | * all copies or substantial portions of the Software. | ||
14 | * | ||
15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | ||
18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | ||
21 | * DEALINGS IN THE SOFTWARE. | ||
22 | */ | ||
23 | |||
24 | #ifndef NVGPU_OS_ECC_LINUX_H | ||
25 | #define NVGPU_OS_ECC_LINUX_H | ||
26 | |||
27 | #ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING | ||
28 | |||
29 | #include <linux/tegra_l1ss_kernel_interface.h> | ||
30 | #include <linux/tegra_l1ss_ioctl.h> | ||
31 | #include <linux/tegra_nv_guard_service_id.h> | ||
32 | #include <linux/tegra_nv_guard_group_id.h> | ||
33 | |||
34 | #include <nvgpu/nvgpu_err.h> | ||
35 | |||
36 | struct nvgpu_ecc_reporting_linux { | ||
37 | struct nvgpu_ecc_reporting common; | ||
38 | client_param_t priv; | ||
39 | }; | ||
40 | |||
41 | static inline struct nvgpu_ecc_reporting_linux *get_ecc_reporting_linux( | ||
42 | struct nvgpu_ecc_reporting *ecc_report) | ||
43 | { | ||
44 | return container_of(ecc_report, struct nvgpu_ecc_reporting_linux, common); | ||
45 | } | ||
46 | |||
47 | #endif /* CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING */ | ||
48 | |||
49 | #endif \ No newline at end of file | ||
diff --git a/drivers/gpu/nvgpu/os/linux/module.c b/drivers/gpu/nvgpu/os/linux/module.c index 807df2ca..fdbab46d 100644 --- a/drivers/gpu/nvgpu/os/linux/module.c +++ b/drivers/gpu/nvgpu/os/linux/module.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * GK20A Graphics | 2 | * GK20A Graphics |
3 | * | 3 | * |
4 | * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. | 4 | * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or modify it | 6 | * This program is free software; you can redistribute it and/or modify it |
7 | * under the terms and conditions of the GNU General Public License, | 7 | * under the terms and conditions of the GNU General Public License, |
@@ -49,6 +49,7 @@ | |||
49 | #include <nvgpu/clk_arb.h> | 49 | #include <nvgpu/clk_arb.h> |
50 | #include <nvgpu/timers.h> | 50 | #include <nvgpu/timers.h> |
51 | #include <nvgpu/channel.h> | 51 | #include <nvgpu/channel.h> |
52 | #include <nvgpu/nvgpu_err.h> | ||
52 | 53 | ||
53 | #include "platform_gk20a.h" | 54 | #include "platform_gk20a.h" |
54 | #include "sysfs.h" | 55 | #include "sysfs.h" |
@@ -355,6 +356,10 @@ int gk20a_pm_finalize_poweron(struct device *dev) | |||
355 | gk20a_init_cde_support(l); | 356 | gk20a_init_cde_support(l); |
356 | #endif | 357 | #endif |
357 | 358 | ||
359 | #ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING | ||
360 | nvgpu_enable_ecc_reporting(g); | ||
361 | #endif | ||
362 | |||
358 | err = gk20a_sched_ctrl_init(g); | 363 | err = gk20a_sched_ctrl_init(g); |
359 | if (err) { | 364 | if (err) { |
360 | nvgpu_err(g, "failed to init sched control"); | 365 | nvgpu_err(g, "failed to init sched control"); |
@@ -364,9 +369,14 @@ int gk20a_pm_finalize_poweron(struct device *dev) | |||
364 | g->sw_ready = true; | 369 | g->sw_ready = true; |
365 | 370 | ||
366 | done: | 371 | done: |
367 | if (err) | 372 | if (err) { |
368 | g->power_on = false; | 373 | g->power_on = false; |
369 | 374 | ||
375 | #ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING | ||
376 | nvgpu_disable_ecc_reporting(g); | ||
377 | #endif | ||
378 | } | ||
379 | |||
370 | nvgpu_mutex_release(&g->power_lock); | 380 | nvgpu_mutex_release(&g->power_lock); |
371 | return err; | 381 | return err; |
372 | } | 382 | } |
@@ -433,6 +443,10 @@ static int gk20a_pm_prepare_poweroff(struct device *dev) | |||
433 | /* Stop CPU from accessing the GPU registers. */ | 443 | /* Stop CPU from accessing the GPU registers. */ |
434 | gk20a_lockout_registers(g); | 444 | gk20a_lockout_registers(g); |
435 | 445 | ||
446 | #ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING | ||
447 | nvgpu_disable_ecc_reporting(g); | ||
448 | #endif | ||
449 | |||
436 | nvgpu_hide_usermode_for_poweroff(g); | 450 | nvgpu_hide_usermode_for_poweroff(g); |
437 | nvgpu_mutex_release(&g->power_lock); | 451 | nvgpu_mutex_release(&g->power_lock); |
438 | return 0; | 452 | return 0; |
@@ -1382,6 +1396,10 @@ static int gk20a_probe(struct platform_device *dev) | |||
1382 | goto return_err; | 1396 | goto return_err; |
1383 | } | 1397 | } |
1384 | 1398 | ||
1399 | #ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING | ||
1400 | nvgpu_init_ecc_reporting(gk20a); | ||
1401 | #endif | ||
1402 | |||
1385 | gk20a->nvgpu_reboot_nb.notifier_call = | 1403 | gk20a->nvgpu_reboot_nb.notifier_call = |
1386 | nvgpu_kernel_shutdown_notification; | 1404 | nvgpu_kernel_shutdown_notification; |
1387 | err = register_reboot_notifier(&gk20a->nvgpu_reboot_nb); | 1405 | err = register_reboot_notifier(&gk20a->nvgpu_reboot_nb); |
diff --git a/drivers/gpu/nvgpu/os/linux/os_linux.h b/drivers/gpu/nvgpu/os/linux/os_linux.h index 25c6c03a..adcfdb2f 100644 --- a/drivers/gpu/nvgpu/os/linux/os_linux.h +++ b/drivers/gpu/nvgpu/os/linux/os_linux.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. | 2 | * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. |
3 | * | 3 | * |
4 | * This program is free software; you can redistribute it and/or modify it | 4 | * This program is free software; you can redistribute it and/or modify it |
5 | * under the terms and conditions of the GNU General Public License, | 5 | * under the terms and conditions of the GNU General Public License, |
@@ -25,6 +25,7 @@ | |||
25 | 25 | ||
26 | #include "cde.h" | 26 | #include "cde.h" |
27 | #include "sched.h" | 27 | #include "sched.h" |
28 | #include "ecc_linux.h" | ||
28 | 29 | ||
29 | struct nvgpu_os_linux_ops { | 30 | struct nvgpu_os_linux_ops { |
30 | struct { | 31 | struct { |
@@ -134,6 +135,10 @@ struct nvgpu_os_linux { | |||
134 | 135 | ||
135 | u64 regs_bus_addr; | 136 | u64 regs_bus_addr; |
136 | 137 | ||
138 | #ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING | ||
139 | struct nvgpu_ecc_reporting_linux ecc_reporting_linux; | ||
140 | #endif | ||
141 | |||
137 | struct nvgpu_os_linux_ops ops; | 142 | struct nvgpu_os_linux_ops ops; |
138 | 143 | ||
139 | #ifdef CONFIG_DEBUG_FS | 144 | #ifdef CONFIG_DEBUG_FS |
diff --git a/drivers/gpu/nvgpu/os/linux/sdl.c b/drivers/gpu/nvgpu/os/linux/sdl.c new file mode 100644 index 00000000..c4dccdc6 --- /dev/null +++ b/drivers/gpu/nvgpu/os/linux/sdl.c | |||
@@ -0,0 +1,341 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2021, NVIDIA Corporation. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify it | ||
5 | * under the terms and conditions of the GNU General Public License, | ||
6 | * version 2, as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
9 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
10 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
11 | * more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public License | ||
14 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
15 | */ | ||
16 | |||
17 | #include <nvgpu/gk20a.h> | ||
18 | #include <nvgpu/types.h> | ||
19 | #include <nvgpu/nvgpu_err.h> | ||
20 | #include <nvgpu/timers.h> | ||
21 | #include <nvgpu/bug.h> | ||
22 | |||
23 | #include "ecc_linux.h" | ||
24 | #include "os_linux.h" | ||
25 | #include "module.h" | ||
26 | |||
27 | /* This look-up table initializes the list of hw units and their errors. | ||
28 | * It also specifies the error injection mechanism supported, for each error. | ||
29 | * In case of hw error injection support, this initialization will be overridden | ||
30 | * by the values provided from the HAL layers of corresponding hw units. | ||
31 | */ | ||
32 | static struct nvgpu_err_hw_module gv11b_err_lut[] = { | ||
33 | { | ||
34 | .name = "sm", | ||
35 | .hw_unit = (u32)NVGPU_ERR_MODULE_SM, | ||
36 | .num_errs = 21U, | ||
37 | .base_ecc_service_id = | ||
38 | NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_ECC_CORRECTED, | ||
39 | .errs = (struct nvgpu_err_desc[]) { | ||
40 | GPU_NONCRITERR("l1_tag_ecc_corrected", | ||
41 | GPU_SM_L1_TAG_ECC_CORRECTED, 0, 0), | ||
42 | GPU_CRITERR("l1_tag_ecc_uncorrected", | ||
43 | GPU_SM_L1_TAG_ECC_UNCORRECTED, 0, 0), | ||
44 | GPU_NONCRITERR("cbu_ecc_corrected", 0, 0, 0), | ||
45 | GPU_CRITERR("cbu_ecc_uncorrected", | ||
46 | GPU_SM_CBU_ECC_UNCORRECTED, 0, 0), | ||
47 | GPU_NONCRITERR("lrf_ecc_corrected", 0, 0, 0), | ||
48 | GPU_CRITERR("lrf_ecc_uncorrected", | ||
49 | GPU_SM_LRF_ECC_UNCORRECTED, 0, 0), | ||
50 | GPU_NONCRITERR("l1_data_ecc_corrected", 0, 0, 0), | ||
51 | GPU_CRITERR("l1_data_ecc_uncorrected", | ||
52 | GPU_SM_L1_DATA_ECC_UNCORRECTED, 0, 0), | ||
53 | GPU_NONCRITERR("icache_l0_data_ecc_corrected", 0, 0, 0), | ||
54 | GPU_CRITERR("icache_l0_data_ecc_uncorrected", | ||
55 | GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED, 0, 0), | ||
56 | GPU_NONCRITERR("icache_l1_data_ecc_corrected", 0, 0, 0), | ||
57 | GPU_CRITERR("icache_l1_data_ecc_uncorrected", | ||
58 | GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED, 0, 0), | ||
59 | GPU_NONCRITERR("icache_l0_predecode_ecc_corrected", 0, 0, 0), | ||
60 | GPU_CRITERR("icache_l0_predecode_ecc_uncorrected", | ||
61 | GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED, 0, 0), | ||
62 | GPU_NONCRITERR("l1_tag_miss_fifo_ecc_corrected", 0, 0, 0), | ||
63 | GPU_CRITERR("l1_tag_miss_fifo_ecc_uncorrected", | ||
64 | GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED, 0, 0), | ||
65 | GPU_NONCRITERR("l1_tag_s2r_pixprf_ecc_corrected", 0, 0, 0), | ||
66 | GPU_CRITERR("l1_tag_s2r_pixprf_ecc_uncorrected", | ||
67 | GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED, 0, 0), | ||
68 | GPU_CRITERR("machine_check_error", 0, 0, 0), | ||
69 | GPU_NONCRITERR("icache_l1_predecode_ecc_corrected", 0, 0, 0), | ||
70 | GPU_CRITERR("icache_l1_predecode_ecc_uncorrected", | ||
71 | GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED, 0, 0), | ||
72 | }, | ||
73 | }, | ||
74 | { | ||
75 | .name = "fecs", | ||
76 | .hw_unit = (u32)NVGPU_ERR_MODULE_FECS, | ||
77 | .num_errs = 4U, | ||
78 | .base_ecc_service_id = | ||
79 | NVGUARD_SERVICE_IGPU_FECS_SWERR_FALCON_IMEM_ECC_CORRECTED, | ||
80 | .errs = (struct nvgpu_err_desc[]) { | ||
81 | GPU_NONCRITERR("falcon_imem_ecc_corrected", | ||
82 | GPU_FECS_FALCON_IMEM_ECC_CORRECTED, 0, 0), | ||
83 | GPU_CRITERR("falcon_imem_ecc_uncorrected", | ||
84 | GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED, 0, 0), | ||
85 | GPU_NONCRITERR("falcon_dmem_ecc_corrected", 0, 0, 0), | ||
86 | GPU_CRITERR("falcon_dmem_ecc_uncorrected", | ||
87 | GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED, 0, 0), | ||
88 | }, | ||
89 | }, | ||
90 | { | ||
91 | .name = "pmu", | ||
92 | .hw_unit = NVGPU_ERR_MODULE_PMU, | ||
93 | .num_errs = 4U, | ||
94 | .base_ecc_service_id = | ||
95 | NVGUARD_SERVICE_IGPU_PMU_SWERR_FALCON_IMEM_ECC_CORRECTED, | ||
96 | .errs = (struct nvgpu_err_desc[]) { | ||
97 | GPU_NONCRITERR("falcon_imem_ecc_corrected", | ||
98 | GPU_PMU_FALCON_IMEM_ECC_CORRECTED, 0, 0), | ||
99 | GPU_CRITERR("falcon_imem_ecc_uncorrected", | ||
100 | GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED, 0, 0), | ||
101 | GPU_NONCRITERR("falcon_dmem_ecc_corrected", 0, 0, 0), | ||
102 | GPU_CRITERR("falcon_dmem_ecc_uncorrected", | ||
103 | GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED, 0, 0), | ||
104 | }, | ||
105 | }, | ||
106 | }; | ||
107 | |||
108 | static void nvgpu_init_err_msg_header(struct gpu_err_header *header) | ||
109 | { | ||
110 | header->version.major = (u16)1U; | ||
111 | header->version.minor = (u16)0U; | ||
112 | header->sub_err_type = 0U; | ||
113 | header->sub_unit_id = 0UL; | ||
114 | header->address = 0UL; | ||
115 | header->timestamp_ns = 0UL; | ||
116 | } | ||
117 | |||
118 | static void nvgpu_init_ecc_err_msg(struct gpu_ecc_error_info *err_info) | ||
119 | { | ||
120 | nvgpu_init_err_msg_header(&err_info->header); | ||
121 | err_info->err_cnt = 0UL; | ||
122 | } | ||
123 | |||
124 | static void nvgpu_report_ecc_error_linux(struct gk20a *g, u32 hw_unit, u32 inst, | ||
125 | u32 err_id, u64 err_addr, u64 err_count) | ||
126 | { | ||
127 | int err = 0; | ||
128 | u32 s_id = 0; | ||
129 | u8 err_status = 0; | ||
130 | u8 err_info_size = 0; | ||
131 | u64 timestamp = 0ULL; | ||
132 | int err_threshold_counter = 0; | ||
133 | struct gpu_ecc_error_info err_pkt; | ||
134 | struct nvgpu_err_desc *err_desc = NULL; | ||
135 | struct nvgpu_err_hw_module *hw_module = NULL; | ||
136 | nv_guard_request_t req; | ||
137 | |||
138 | memset(&req, 0, sizeof(req)); | ||
139 | nvgpu_init_ecc_err_msg(&err_pkt); | ||
140 | if (hw_unit >= sizeof(gv11b_err_lut)/sizeof(gv11b_err_lut[0])) { | ||
141 | err = -EINVAL; | ||
142 | goto done; | ||
143 | } | ||
144 | |||
145 | hw_module = &gv11b_err_lut[hw_unit]; | ||
146 | if (err_id >= hw_module->num_errs) { | ||
147 | nvgpu_err(g, "invalid err_id (%u) for hw module (%u)", | ||
148 | err_id, hw_module->hw_unit); | ||
149 | err = -EINVAL; | ||
150 | goto done; | ||
151 | } | ||
152 | err_desc = &hw_module->errs[err_id]; | ||
153 | timestamp = (u64)nvgpu_current_time_ns(); | ||
154 | |||
155 | err_pkt.header.timestamp_ns = timestamp; | ||
156 | err_pkt.header.sub_unit_id = inst; | ||
157 | err_pkt.header.address = err_addr; | ||
158 | err_pkt.err_cnt = err_count; | ||
159 | err_info_size = sizeof(err_pkt); | ||
160 | |||
161 | s_id = hw_module->base_ecc_service_id + err_id; | ||
162 | |||
163 | if (err_desc->is_critical) { | ||
164 | err_status = NVGUARD_ERROR_DETECTED; | ||
165 | } else { | ||
166 | err_status = NVGUARD_NO_ERROR; | ||
167 | } | ||
168 | |||
169 | nvgpu_atomic_inc(&err_desc->err_count); | ||
170 | err_threshold_counter = nvgpu_atomic_cmpxchg(&err_desc->err_count, | ||
171 | err_desc->err_threshold + 1, 0); | ||
172 | |||
173 | if (unlikely(err_threshold_counter != err_desc->err_threshold + 1)) { | ||
174 | goto done; | ||
175 | } | ||
176 | |||
177 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting hw: %s, desc:%s, count:%llu", | ||
178 | hw_module->name, err_desc->name, err_count); | ||
179 | |||
180 | req.srv_id_cmd = NVGUARD_SERVICESTATUS_NOTIFICATION; | ||
181 | req.srv_status.srv_id = (nv_guard_service_id_t)s_id; | ||
182 | req.srv_status.status = err_status; | ||
183 | req.srv_status.timestamp = timestamp; | ||
184 | req.srv_status.error_info_size = err_info_size; | ||
185 | memcpy(req.srv_status.error_info, (u8*)&err_pkt, err_info_size); | ||
186 | |||
187 | /* | ||
188 | * l1ss_submit_rq may fail due to kmalloc failures but may pass in | ||
189 | * subsequent calls | ||
190 | */ | ||
191 | err = l1ss_submit_rq(&req, true); | ||
192 | if (err != 0) { | ||
193 | nvgpu_err(g, "Error returned from L1SS submit %d", err); | ||
194 | } | ||
195 | |||
196 | if (err_desc->is_critical) { | ||
197 | nvgpu_quiesce(g); | ||
198 | } | ||
199 | |||
200 | done: | ||
201 | return; | ||
202 | } | ||
203 | |||
204 | static void nvgpu_report_ecc_error_empty(struct gk20a *g, u32 hw_unit, u32 inst, | ||
205 | u32 err_id, u64 err_addr, u64 err_count) { | ||
206 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting empty"); | ||
207 | } | ||
208 | |||
209 | const struct nvgpu_ecc_reporting_ops default_disabled_ecc_report_ops = { | ||
210 | .report_ecc_err = nvgpu_report_ecc_error_empty, | ||
211 | }; | ||
212 | |||
213 | const struct nvgpu_ecc_reporting_ops ecc_enable_report_ops = { | ||
214 | .report_ecc_err = nvgpu_report_ecc_error_linux, | ||
215 | }; | ||
216 | |||
217 | static int nvgpu_l1ss_callback(l1ss_cli_callback_param param, void *data) | ||
218 | { | ||
219 | struct gk20a *g = (struct gk20a *)data; | ||
220 | struct nvgpu_os_linux *l = NULL; | ||
221 | struct nvgpu_ecc_reporting_linux *ecc_reporting_linux = NULL; | ||
222 | int err = 0; | ||
223 | /* Ensure we have a valid gk20a struct before proceeding */ | ||
224 | if ((g == NULL) || (gk20a_get(g) == NULL)) { | ||
225 | return -ENODEV; | ||
226 | } | ||
227 | |||
228 | l = nvgpu_os_linux_from_gk20a(g); | ||
229 | ecc_reporting_linux = &l->ecc_reporting_linux; | ||
230 | |||
231 | nvgpu_spinlock_acquire(&ecc_reporting_linux->common.lock); | ||
232 | if (param == L1SS_READY) { | ||
233 | if (!ecc_reporting_linux->common.ecc_reporting_service_enabled) { | ||
234 | ecc_reporting_linux->common.ecc_reporting_service_enabled = true; | ||
235 | ecc_reporting_linux->common.ops = &ecc_enable_report_ops; | ||
236 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is enabled"); | ||
237 | } | ||
238 | } else if (param == L1SS_NOT_READY) { | ||
239 | if (ecc_reporting_linux->common.ecc_reporting_service_enabled) { | ||
240 | ecc_reporting_linux->common.ecc_reporting_service_enabled = false; | ||
241 | ecc_reporting_linux->common.ops = &default_disabled_ecc_report_ops; | ||
242 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is disabled"); | ||
243 | } | ||
244 | } else { | ||
245 | err = -EINVAL; | ||
246 | } | ||
247 | nvgpu_spinlock_release(&ecc_reporting_linux->common.lock); | ||
248 | |||
249 | gk20a_put(g); | ||
250 | |||
251 | return err; | ||
252 | } | ||
253 | |||
254 | void nvgpu_init_ecc_reporting(struct gk20a *g) | ||
255 | { | ||
256 | struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); | ||
257 | struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux; | ||
258 | int err = 0; | ||
259 | /* This will invoke the registration API */ | ||
260 | nvgpu_spinlock_init(&ecc_report_linux->common.lock); | ||
261 | ecc_report_linux->priv.id = (NVGUARD_GROUPID_IGPU & NVGUARD_GROUPINDEX_FIELDMASK); | ||
262 | ecc_report_linux->priv.cli_callback = nvgpu_l1ss_callback; | ||
263 | ecc_report_linux->priv.data = g; | ||
264 | ecc_report_linux->common.ops = &default_disabled_ecc_report_ops; | ||
265 | |||
266 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting Init"); | ||
267 | |||
268 | /* | ||
269 | * err == 0 indicates service is available but not active yet. | ||
270 | * err == 1 indicates service is available and active | ||
271 | * error for other cases. | ||
272 | */ | ||
273 | err = l1ss_register_client(&ecc_report_linux->priv); | ||
274 | if (err == 0) { | ||
275 | ecc_report_linux->common.ecc_reporting_service_enabled = false; | ||
276 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting init success"); | ||
277 | } else if (err == 1) { | ||
278 | ecc_report_linux->common.ecc_reporting_service_enabled = true; | ||
279 | /* Actual Ops will be replaced during nvgpu_enable_ecc_reporting | ||
280 | * called as part of gk20a_busy() | ||
281 | */ | ||
282 | } else { | ||
283 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting init failure %d", err); | ||
284 | } | ||
285 | } | ||
286 | |||
287 | void nvgpu_deinit_ecc_reporting(struct gk20a *g) | ||
288 | { | ||
289 | struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); | ||
290 | struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux; | ||
291 | |||
292 | if (ecc_report_linux->common.ecc_reporting_service_enabled) { | ||
293 | ecc_report_linux->common.ecc_reporting_service_enabled = false; | ||
294 | l1ss_deregister_client(ecc_report_linux->priv.id); | ||
295 | memset(ecc_report_linux, 0, sizeof(*ecc_report_linux)); | ||
296 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting de-init success"); | ||
297 | } | ||
298 | |||
299 | } | ||
300 | |||
301 | void nvgpu_enable_ecc_reporting(struct gk20a *g) | ||
302 | { | ||
303 | struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); | ||
304 | struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux; | ||
305 | struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common; | ||
306 | |||
307 | nvgpu_spinlock_acquire(&ecc_report_linux->common.lock); | ||
308 | if (error_reporting->ecc_reporting_service_enabled) { | ||
309 | error_reporting->ops = &ecc_enable_report_ops; | ||
310 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is enabled"); | ||
311 | } | ||
312 | nvgpu_spinlock_release(&ecc_report_linux->common.lock); | ||
313 | } | ||
314 | |||
315 | void nvgpu_disable_ecc_reporting(struct gk20a *g) | ||
316 | { | ||
317 | struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); | ||
318 | struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux; | ||
319 | struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common; | ||
320 | |||
321 | nvgpu_spinlock_acquire(&ecc_report_linux->common.lock); | ||
322 | error_reporting->ops = &default_disabled_ecc_report_ops; | ||
323 | nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is disabled"); | ||
324 | nvgpu_spinlock_release(&ecc_report_linux->common.lock); | ||
325 | } | ||
326 | |||
327 | void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst, | ||
328 | u32 err_id, u64 err_addr, u64 err_count) | ||
329 | { | ||
330 | struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); | ||
331 | struct nvgpu_ecc_reporting_linux *ecc_report_linux = &l->ecc_reporting_linux; | ||
332 | struct nvgpu_ecc_reporting *error_reporting = &ecc_report_linux->common; | ||
333 | void (*report_ecc_err_func)(struct gk20a *g, u32 hw_unit, u32 inst, | ||
334 | u32 err_id, u64 err_addr, u64 err_count); | ||
335 | |||
336 | nvgpu_spinlock_acquire(&ecc_report_linux->common.lock); | ||
337 | report_ecc_err_func = error_reporting->ops->report_ecc_err; | ||
338 | nvgpu_spinlock_release(&ecc_report_linux->common.lock); | ||
339 | |||
340 | report_ecc_err_func(g, hw_unit, inst, err_id, err_addr, err_count); | ||
341 | } | ||