/*
* Copyright (c) 2021, NVIDIA Corporation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
#include <nvgpu/gk20a.h>
#include <nvgpu/nvgpu_err.h>
#include <nvgpu/nvgpu_init.h>
#include <nvgpu/timers.h>
#include <nvgpu/types.h>
#include "ecc_linux.h"
#include "os_linux.h"
#include "module.h"
/* This look-up table initializes the list of hw units and their errors.
 * It also specifies the error injection mechanism supported for each error.
 * In case of hw error injection support, this initialization will be overridden
 * by the values provided from the HAL layers of the corresponding hw units.
 */
static struct nvgpu_err_hw_module gv11b_err_lut[] = {
{
.name = "sm",
.hw_unit = (u32)NVGPU_ERR_MODULE_SM,
.num_errs = 21U,
.base_ecc_service_id =
NVGUARD_SERVICE_IGPU_SM_SWERR_L1_TAG_ECC_CORRECTED,
.errs = (struct nvgpu_err_desc[]) {
GPU_NONCRITERR("l1_tag_ecc_corrected",
GPU_SM_L1_TAG_ECC_CORRECTED, 0, 0),
GPU_CRITERR("l1_tag_ecc_uncorrected",
GPU_SM_L1_TAG_ECC_UNCORRECTED, 0, 0),
GPU_NONCRITERR("cbu_ecc_corrected", 0, 0, 0),
GPU_CRITERR("cbu_ecc_uncorrected",
GPU_SM_CBU_ECC_UNCORRECTED, 0, 0),
GPU_NONCRITERR("lrf_ecc_corrected", 0, 0, 0),
GPU_CRITERR("lrf_ecc_uncorrected",
GPU_SM_LRF_ECC_UNCORRECTED, 0, 0),
GPU_NONCRITERR("l1_data_ecc_corrected", 0, 0, 0),
GPU_CRITERR("l1_data_ecc_uncorrected",
GPU_SM_L1_DATA_ECC_UNCORRECTED, 0, 0),
GPU_NONCRITERR("icache_l0_data_ecc_corrected", 0, 0, 0),
GPU_CRITERR("icache_l0_data_ecc_uncorrected",
GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED, 0, 0),
GPU_NONCRITERR("icache_l1_data_ecc_corrected", 0, 0, 0),
GPU_CRITERR("icache_l1_data_ecc_uncorrected",
GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED, 0, 0),
GPU_NONCRITERR("icache_l0_predecode_ecc_corrected", 0, 0, 0),
GPU_CRITERR("icache_l0_predecode_ecc_uncorrected",
GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED, 0, 0),
GPU_NONCRITERR("l1_tag_miss_fifo_ecc_corrected", 0, 0, 0),
GPU_CRITERR("l1_tag_miss_fifo_ecc_uncorrected",
GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED, 0, 0),
GPU_NONCRITERR("l1_tag_s2r_pixprf_ecc_corrected", 0, 0, 0),
GPU_CRITERR("l1_tag_s2r_pixprf_ecc_uncorrected",
GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED, 0, 0),
GPU_CRITERR("machine_check_error", 0, 0, 0),
GPU_NONCRITERR("icache_l1_predecode_ecc_corrected", 0, 0, 0),
GPU_CRITERR("icache_l1_predecode_ecc_uncorrected",
GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED, 0, 0),
},
},
{
.name = "fecs",
.hw_unit = (u32)NVGPU_ERR_MODULE_FECS,
.num_errs = 4U,
.base_ecc_service_id =
NVGUARD_SERVICE_IGPU_FECS_SWERR_FALCON_IMEM_ECC_CORRECTED,
.errs = (struct nvgpu_err_desc[]) {
GPU_NONCRITERR("falcon_imem_ecc_corrected",
GPU_FECS_FALCON_IMEM_ECC_CORRECTED, 0, 0),
GPU_CRITERR("falcon_imem_ecc_uncorrected",
GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED, 0, 0),
GPU_NONCRITERR("falcon_dmem_ecc_corrected", 0, 0, 0),
GPU_CRITERR("falcon_dmem_ecc_uncorrected",
GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED, 0, 0),
},
},
{
.name = "pmu",
.hw_unit = NVGPU_ERR_MODULE_PMU,
.num_errs = 4U,
.base_ecc_service_id =
NVGUARD_SERVICE_IGPU_PMU_SWERR_FALCON_IMEM_ECC_CORRECTED,
.errs = (struct nvgpu_err_desc[]) {
GPU_NONCRITERR("falcon_imem_ecc_corrected",
GPU_PMU_FALCON_IMEM_ECC_CORRECTED, 0, 0),
GPU_CRITERR("falcon_imem_ecc_uncorrected",
GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED, 0, 0),
GPU_NONCRITERR("falcon_dmem_ecc_corrected", 0, 0, 0),
GPU_CRITERR("falcon_dmem_ecc_uncorrected",
GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED, 0, 0),
},
},
};
/*
 * Reset an error-report header to its default state: packet format
 * version 1.0 with all payload fields zeroed.
 */
static void nvgpu_init_err_msg_header(struct gpu_err_header *header)
{
	/* Clear the variable fields first ... */
	header->sub_err_type = 0U;
	header->sub_unit_id = 0UL;
	header->address = 0UL;
	header->timestamp_ns = 0UL;
	/* ... then stamp the report format version (1.0). */
	header->version.major = (u16)1U;
	header->version.minor = (u16)0U;
}
/* Reset an ECC error packet: default header plus a zero error count. */
static void nvgpu_init_ecc_err_msg(struct gpu_ecc_error_info *err_info)
{
	err_info->err_cnt = 0UL;
	nvgpu_init_err_msg_header(&err_info->header);
}
/*
 * Build and submit an ECC error report to the L1SS safety service.
 *
 * hw_unit/err_id select an entry from gv11b_err_lut; inst, err_addr and
 * err_count populate the report payload. Errors are counted atomically and
 * only submitted once the per-descriptor threshold is crossed (the counter
 * is then reset). Critical errors additionally put the GPU into quiesce.
 * Errors (bad ids, submit failure) are logged; the function returns void.
 */
static void nvgpu_report_ecc_error_linux(struct gk20a *g, u32 hw_unit, u32 inst,
		u32 err_id, u64 err_addr, u64 err_count)
{
	int err = 0;
	u32 s_id = 0;
	u8 err_status = 0;
	u8 err_info_size = 0;
	u64 timestamp = 0ULL;
	int err_threshold_counter = 0;
	struct gpu_ecc_error_info err_pkt;
	struct nvgpu_err_desc *err_desc = NULL;
	struct nvgpu_err_hw_module *hw_module = NULL;
	nv_guard_request_t req;

	memset(&req, 0, sizeof(req));
	nvgpu_init_ecc_err_msg(&err_pkt);

	/* hw_unit indexes gv11b_err_lut directly; reject out-of-range ids. */
	if (hw_unit >= sizeof(gv11b_err_lut)/sizeof(gv11b_err_lut[0])) {
		err = -EINVAL;
		goto done;
	}

	hw_module = &gv11b_err_lut[hw_unit];
	if (err_id >= hw_module->num_errs) {
		nvgpu_err(g, "invalid err_id (%u) for hw module (%u)",
			err_id, hw_module->hw_unit);
		err = -EINVAL;
		goto done;
	}
	err_desc = &hw_module->errs[err_id];

	/* Fill the error packet carried as the L1SS request payload. */
	timestamp = (u64)nvgpu_current_time_ns();
	err_pkt.header.timestamp_ns = timestamp;
	err_pkt.header.sub_unit_id = inst;
	err_pkt.header.address = err_addr;
	err_pkt.err_cnt = err_count;
	/* NOTE(review): err_info_size is u8 -- assumes sizeof(err_pkt) <= 255,
	 * otherwise the size silently truncates here. TODO confirm. */
	err_info_size = sizeof(err_pkt);

	/* Service ids are contiguous per hw unit, one per error id. */
	s_id = hw_module->base_ecc_service_id + err_id;

	/* Only critical (uncorrected) errors are flagged to the service. */
	if (err_desc->is_critical) {
		err_status = NVGUARD_ERROR_DETECTED;
	} else {
		err_status = NVGUARD_NO_ERROR;
	}

	nvgpu_atomic_inc(&err_desc->err_count);

	/*
	 * Atomically reset the counter once it reaches threshold + 1, and only
	 * report at that point; occurrences below the threshold are counted
	 * but not submitted.
	 */
	err_threshold_counter = nvgpu_atomic_cmpxchg(&err_desc->err_count,
			err_desc->err_threshold + 1, 0);

	if (unlikely(err_threshold_counter != err_desc->err_threshold + 1)) {
		goto done;
	}

	nvgpu_log(g, gpu_dbg_ecc, "ECC reporting hw: %s, desc:%s, count:%llu",
			hw_module->name, err_desc->name, err_count);

	/* Package the report into an L1SS service-status notification. */
	req.srv_id_cmd = NVGUARD_SERVICESTATUS_NOTIFICATION;
	req.srv_status.srv_id = (nv_guard_service_id_t)s_id;
	req.srv_status.status = err_status;
	req.srv_status.timestamp = timestamp;
	req.srv_status.error_info_size = err_info_size;
	memcpy(req.srv_status.error_info, (u8*)&err_pkt, err_info_size);

	/*
	 * l1ss_submit_rq may fail due to kmalloc failures but may pass in
	 * subsequent calls
	 */
	err = l1ss_submit_rq(&req, true);
	if (err != 0) {
		nvgpu_err(g, "Error returned from L1SS submit %d", err);
	}

	/* A critical error takes the GPU down into quiesce. */
	if (err_desc->is_critical) {
		nvgpu_quiesce(g);
	}

done:
	return;
}
/*
 * No-op reporter installed while the L1SS service is unavailable;
 * reports are dropped after a debug log line.
 */
static void nvgpu_report_ecc_error_empty(struct gk20a *g, u32 hw_unit, u32 inst,
		u32 err_id, u64 err_addr, u64 err_count)
{
	nvgpu_log(g, gpu_dbg_ecc, "ECC reporting empty");
}
/* Ops in effect while reporting is disabled: reports are dropped. */
const struct nvgpu_ecc_reporting_ops default_disabled_ecc_report_ops = {
	.report_ecc_err = nvgpu_report_ecc_error_empty,
};
/* Ops in effect while reporting is enabled: reports go to L1SS. */
const struct nvgpu_ecc_reporting_ops ecc_enable_report_ops = {
	.report_ecc_err = nvgpu_report_ecc_error_linux,
};
/*
 * L1SS service-state callback: flips the reporting ops between the enabled
 * and disabled tables as the service comes and goes. Returns -ENODEV when
 * no valid gk20a reference can be taken, -EINVAL for an unknown event.
 */
static int nvgpu_l1ss_callback(l1ss_cli_callback_param param, void *data)
{
	struct gk20a *g = (struct gk20a *)data;
	struct nvgpu_os_linux *l;
	struct nvgpu_ecc_reporting_linux *rep;
	int ret = 0;

	/* Bail out unless we can take a reference on a valid gk20a. */
	if ((g == NULL) || (gk20a_get(g) == NULL)) {
		return -ENODEV;
	}

	l = nvgpu_os_linux_from_gk20a(g);
	rep = &l->ecc_reporting_linux;

	nvgpu_spinlock_acquire(&rep->common.lock);
	switch (param) {
	case L1SS_READY:
		if (!rep->common.ecc_reporting_service_enabled) {
			rep->common.ecc_reporting_service_enabled = true;
			rep->common.ops = &ecc_enable_report_ops;
			nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is enabled");
		}
		break;
	case L1SS_NOT_READY:
		if (rep->common.ecc_reporting_service_enabled) {
			rep->common.ecc_reporting_service_enabled = false;
			rep->common.ops = &default_disabled_ecc_report_ops;
			nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is disabled");
		}
		break;
	default:
		ret = -EINVAL;
		break;
	}
	nvgpu_spinlock_release(&rep->common.lock);

	gk20a_put(g);

	return ret;
}
/*
 * Register this driver as an L1SS client and initialize the reporting state.
 * Reporting starts with the disabled ops; the real ops are installed later
 * by nvgpu_enable_ecc_reporting() / the L1SS callback.
 */
void nvgpu_init_ecc_reporting(struct gk20a *g)
{
	struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
	struct nvgpu_ecc_reporting_linux *rep = &l->ecc_reporting_linux;
	int ret;

	nvgpu_spinlock_init(&rep->common.lock);

	/* Client identity and callback handed to the L1SS framework. */
	rep->priv.id = (NVGUARD_GROUPID_IGPU & NVGUARD_GROUPINDEX_FIELDMASK);
	rep->priv.cli_callback = nvgpu_l1ss_callback;
	rep->priv.data = g;
	rep->common.ops = &default_disabled_ecc_report_ops;

	nvgpu_log(g, gpu_dbg_ecc, "ECC reporting Init");

	/*
	 * Registration return codes:
	 *   0 - service available but not active yet
	 *   1 - service available and active
	 *   anything else - failure
	 */
	ret = l1ss_register_client(&rep->priv);
	if (ret == 1) {
		rep->common.ecc_reporting_service_enabled = true;
		/* Actual ops will be replaced during nvgpu_enable_ecc_reporting
		 * called as part of gk20a_busy()
		 */
	} else if (ret == 0) {
		rep->common.ecc_reporting_service_enabled = false;
		nvgpu_log(g, gpu_dbg_ecc, "ECC reporting init success");
	} else {
		nvgpu_log(g, gpu_dbg_ecc, "ECC reporting init failure %d", ret);
	}
}
/*
 * Deregister from L1SS and wipe all reporting state. A no-op when the
 * service was never enabled.
 */
void nvgpu_deinit_ecc_reporting(struct gk20a *g)
{
	struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
	struct nvgpu_ecc_reporting_linux *rep = &l->ecc_reporting_linux;

	if (!rep->common.ecc_reporting_service_enabled) {
		return;
	}

	rep->common.ecc_reporting_service_enabled = false;
	l1ss_deregister_client(rep->priv.id);
	/* Clear everything, including the ops pointer and client identity. */
	memset(rep, 0, sizeof(*rep));
	nvgpu_log(g, gpu_dbg_ecc, "ECC reporting de-init success");
}
/*
 * Install the active reporting ops, but only if the L1SS service has
 * signalled that it is available.
 */
void nvgpu_enable_ecc_reporting(struct gk20a *g)
{
	struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
	struct nvgpu_ecc_reporting_linux *rep = &l->ecc_reporting_linux;
	struct nvgpu_ecc_reporting *common = &rep->common;

	nvgpu_spinlock_acquire(&rep->common.lock);
	if (common->ecc_reporting_service_enabled) {
		common->ops = &ecc_enable_report_ops;
		nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is enabled");
	}
	nvgpu_spinlock_release(&rep->common.lock);
}
/* Unconditionally swap in the disabled (drop-everything) reporting ops. */
void nvgpu_disable_ecc_reporting(struct gk20a *g)
{
	struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
	struct nvgpu_ecc_reporting_linux *rep = &l->ecc_reporting_linux;
	struct nvgpu_ecc_reporting *common = &rep->common;

	nvgpu_spinlock_acquire(&rep->common.lock);
	common->ops = &default_disabled_ecc_report_ops;
	nvgpu_log(g, gpu_dbg_ecc, "ECC reporting is disabled");
	nvgpu_spinlock_release(&rep->common.lock);
}
/*
 * Public entry point for ECC error reports: snapshot the current reporting
 * op under the lock, then invoke it outside the lock (the enabled reporter
 * can block on L1SS submission).
 */
void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
		u32 err_id, u64 err_addr, u64 err_count)
{
	struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
	struct nvgpu_ecc_reporting_linux *rep = &l->ecc_reporting_linux;
	struct nvgpu_ecc_reporting *common = &rep->common;
	void (*do_report)(struct gk20a *g, u32 hw_unit, u32 inst,
			u32 err_id, u64 err_addr, u64 err_count);

	/* Only the ops-pointer read is protected; the call itself is not. */
	nvgpu_spinlock_acquire(&rep->common.lock);
	do_report = common->ops->report_ecc_err;
	nvgpu_spinlock_release(&rep->common.lock);

	do_report(g, hw_unit, inst, err_id, err_addr, err_count);
}