summary | refs | log | tree | commit | diff | stats
path: root/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c
diff options
context:
space:
mode:
author: Debarshi Dutta <ddutta@nvidia.com>, 2021-05-17 04:38:25 -0400
committer: mobile promotions <svcmobile_promotions@nvidia.com>, 2021-05-28 15:10:24 -0400
commit: 34993e4f7b0d47620e88ba64a6d7c67330d97e35 (patch)
tree: 2136284f5bd4095780884885413bb268fd318a96 /drivers/gpu/nvgpu/gv11b/pmu_gv11b.c
parent: 5f88598b9e7b2cfe0387733577ece138a7bc912b (diff)
gpu: nvgpu: Add ECC Support for GV11B in Linux
Implement nvgpu plumbing to allow reporting ECC errors (corrected and uncorrected) to an L1SS service (if one exists). This patch includes the following:
1) Added code that submits ECC error reports directly from interrupt context to an L1SS service in Linux.
2) Added support for enabling/disabling the error reports via L1SS's registration/deregistration API. Nvgpu simply invokes an empty function until the registration is successful.
3) Added a spinlock to correctly handle concurrent access to the ops used for submitting requests.
4) Added error reporting for a subset of interrupts that can be verified via external ECC injection logic. A subsequent patch will add the API for the rest of the interrupts.
5) In case of critical (uncorrected) errors, change nvgpu's state to the quiesce state.

Jira L4T-1187
Bug 200700400
Change-Id: Id31f70531fba355e94e72c4f9762593e7667a11c
Signed-off-by: Debarshi Dutta <ddutta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2530411
Tested-by: Bibek Basu <bbasu@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: Bibek Basu <bbasu@nvidia.com>
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
Diffstat (limited to 'drivers/gpu/nvgpu/gv11b/pmu_gv11b.c')
-rw-r--r-- drivers/gpu/nvgpu/gv11b/pmu_gv11b.c | 15
1 files changed, 14 insertions, 1 deletions
diff --git a/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c b/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c
index 5e586ec2..336258a7 100644
--- a/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * GV11B PMU 2 * GV11B PMU
3 * 3 *
4 * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. 4 * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
5 * 5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a 6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"), 7 * copy of this software and associated documentation files (the "Software"),
@@ -29,6 +29,7 @@
29#include <nvgpu/io.h> 29#include <nvgpu/io.h>
30#include <nvgpu/utils.h> 30#include <nvgpu/utils.h>
31#include <nvgpu/gk20a.h> 31#include <nvgpu/gk20a.h>
32#include <nvgpu/nvgpu_err.h>
32 33
33#include "gk20a/pmu_gk20a.h" 34#include "gk20a/pmu_gk20a.h"
34#include "gp10b/pmu_gp10b.h" 35#include "gp10b/pmu_gp10b.h"
@@ -354,10 +355,18 @@ void gv11b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0)
354 "pmu ecc interrupt intr1: 0x%x", intr1); 355 "pmu ecc interrupt intr1: 0x%x", intr1);
355 356
356 if (ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) { 357 if (ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_imem_m()) {
358 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0,
359 GPU_PMU_FALCON_IMEM_ECC_CORRECTED,
360 ecc_addr,
361 g->ecc.pmu.pmu_ecc_corrected_err_count[0].counter);
357 nvgpu_log(g, gpu_dbg_intr, 362 nvgpu_log(g, gpu_dbg_intr,
358 "imem ecc error corrected"); 363 "imem ecc error corrected");
359 } 364 }
360 if (ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) { 365 if (ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m()) {
366 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0,
367 GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED,
368 ecc_addr,
369 g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter);
361 nvgpu_log(g, gpu_dbg_intr, 370 nvgpu_log(g, gpu_dbg_intr,
362 "imem ecc error uncorrected"); 371 "imem ecc error uncorrected");
363 } 372 }
@@ -366,6 +375,10 @@ void gv11b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0)
366 "dmem ecc error corrected"); 375 "dmem ecc error corrected");
367 } 376 }
368 if (ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) { 377 if (ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m()) {
378 nvgpu_report_ecc_err(g, NVGPU_ERR_MODULE_PMU, 0,
379 GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED,
380 ecc_addr,
381 g->ecc.pmu.pmu_ecc_uncorrected_err_count[0].counter);
369 nvgpu_log(g, gpu_dbg_intr, 382 nvgpu_log(g, gpu_dbg_intr,
370 "dmem ecc error uncorrected"); 383 "dmem ecc error uncorrected");
371 } 384 }