/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef NVGPU_NVGPU_ERR_H
#define NVGPU_NVGPU_ERR_H
/**
* @file
*
* Define indices for HW units and errors. Define structures used to carry error
* information. Declare prototype for APIs that are used to report GPU HW errors
* to the Safety_Services framework.
*/
#include <nvgpu/types.h>
#include <nvgpu/atomic.h>
struct gk20a;
/**
* @defgroup INDICES_FOR_GPU_HW_UNITS
* Macros used to assign unique index to GPU HW units.
*
* These are the values documented as valid for the \a hw_unit argument of
* nvgpu_report_ecc_err().
*
* NOTE(review): only the SM, FECS and PMU indices are defined in this header,
* while the nvgpu_report_ecc_err() documentation also lists GPCCS, MMU, GCC,
* LTC and HUBMMU units - confirm where (or whether) those indices are defined.
* @{
*/
#define NVGPU_ERR_MODULE_SM (0U)
#define NVGPU_ERR_MODULE_FECS (1U)
#define NVGPU_ERR_MODULE_PMU (2U)
/**
* @}
*/
/**
* @defgroup LIST_OF_ERRORS_REPORTED_FROM_SM
* Macros used to assign unique index to errors reported from the SM unit.
*
* The numbering is sparse: GPU_SM_L1_TAG_ECC_CORRECTED (0) is the only
* "corrected" index defined here, and the "uncorrected" indices that follow
* are not contiguous. Code must not assume that every value between the
* smallest and the largest index names an error.
* @{
*/
#define GPU_SM_L1_TAG_ECC_CORRECTED (0U)
#define GPU_SM_L1_TAG_ECC_UNCORRECTED (1U)
#define GPU_SM_CBU_ECC_UNCORRECTED (3U)
#define GPU_SM_LRF_ECC_UNCORRECTED (5U)
#define GPU_SM_L1_DATA_ECC_UNCORRECTED (7U)
#define GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED (9U)
#define GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED (11U)
#define GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED (13U)
#define GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED (15U)
#define GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED (17U)
/* Largest SM index defined in this header; 18 and 19 are not defined here. */
#define GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED (20U)
/**
* @}
*/
/**
* @defgroup LIST_OF_ERRORS_REPORTED_FROM_FECS
* Macros used to assign unique index to errors reported from the FECS unit.
*
* Indices 0, 1 and 3 are defined; index 2 has no macro in this header.
* @{
*/
#define GPU_FECS_FALCON_IMEM_ECC_CORRECTED (0U)
#define GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED (1U)
#define GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED (3U)
/**
* @}
*/
/**
* @defgroup LIST_OF_ERRORS_REPORTED_FROM_GPCCS
* Macros used to assign unique index to errors reported from the GPCCS unit.
*
* Indices 0, 1 and 3 are defined; index 2 has no macro in this header.
* @{
*/
#define GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED (0U)
#define GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED (1U)
#define GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED (3U)
/**
* @}
*/
/**
* @defgroup LIST_OF_ERRORS_REPORTED_FROM_MMU
* Macros used to assign unique index to errors reported from the MMU unit.
*
* Only "uncorrected" indices (1 and 3) are defined for this unit; indices 0
* and 2 have no macro in this header.
* @{
*/
#define GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED (1U)
#define GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED (3U)
/**
* @}
*/
/**
* @defgroup LIST_OF_ERRORS_REPORTED_FROM_GCC
* Macros used to assign unique index to errors reported from the GCC unit.
*
* A single error index is defined for this unit, so it is both the minimum
* and the maximum valid GCC error index.
* @{
*/
#define GPU_GCC_L15_ECC_UNCORRECTED (1U)
/**
* @}
*/
/**
* @defgroup LIST_OF_ERRORS_REPORTED_FROM_PMU
* Macros used to assign unique index to errors reported from the PMU unit.
*
* Indices 0, 1 and 3 are defined; index 2 has no macro in this header.
* @{
*/
#define GPU_PMU_FALCON_IMEM_ECC_CORRECTED (0U)
#define GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED (1U)
#define GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED (3U)
/**
* @}
*/
/**
* @defgroup LIST_OF_ERRORS_REPORTED_FROM_LTC
* Macros used to assign unique index to errors reported from the LTC unit.
*
* Indices 0, 1, 3 and 7 are defined; the values in between have no macro in
* this header, so the range must not be treated as contiguous.
* @{
*/
#define GPU_LTC_CACHE_DSTG_ECC_CORRECTED (0U)
#define GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED (1U)
#define GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED (3U)
#define GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED (7U)
/**
* @}
*/
/**
* @defgroup LIST_OF_ERRORS_REPORTED_FROM_HUBMMU
* Macros used to assign unique index to errors reported from the HUBMMU unit.
*
* NOTE(review): GPU_HUBMMU_PAGE_FAULT_ERROR (8) is the only entry in this
* group whose name does not contain "ECC", and its value is larger than
* GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED (7), which the nvgpu_report_ecc_err()
* documentation names as the maximum HUBMMU error index - confirm whether the
* page fault index is meant to be reported through that API.
* @{
*/
#define GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED (1U)
#define GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED (3U)
#define GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED (5U)
#define GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED (7U)
#define GPU_HUBMMU_PAGE_FAULT_ERROR (8U)
/**
* @}
*/
/*
* The HUBMMU group above is closed before the conditional section starts, so
* the Doxygen group is terminated regardless of the build configuration.
*
* Everything from here up to the matching #else (structures, initializer
* macros and the reporting prototypes) is only declared when
* CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING is defined.
*/
#ifdef CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING
/**
* nvgpu_err_desc structure holds fields which describe a single error: its
* name, criticality, reporting threshold, occurrence counter and ID.
*
* Entries of this type can be initialized with the GPU_ERR(), GPU_CRITERR()
* and GPU_NONCRITERR() macros declared in this header.
*
* NOTE(review): the previous description mentioned a function callback used
* to inject the error; the structure has no such member.
*/
struct nvgpu_err_desc {
/** String representation of error. */
const char *name;
/** Flag to classify an error as critical or non-critical. */
bool is_critical;
/**
* Error Threshold: once this threshold value is reached, then the
* corresponding error counter will be reset to 0 and the error will be
* propagated to Safety_Services.
*/
int err_threshold;
/**
* Total number of times an error has occurred (since its last reset).
* Stored as an nvgpu_atomic_t.
*/
nvgpu_atomic_t err_count;
/** Error ID. Held in a u8, so only values 0..255 can be represented. */
u8 error_id;
};
/**
* gpu_err_header structure holds the fields which are common to an error
* record: the version of the header, sub-error type, sub-unit ID, error
* address and time stamp.
*/
struct gpu_err_header {
/** Version of GPU error header. */
struct {
/** Major version number. */
u16 major;
/** Minor version number. */
u16 minor;
} version;
/** Sub error type corresponding to the error that is being reported. */
u32 sub_err_type;
/** ID of the sub-unit in a HW unit which encountered an error. */
u64 sub_unit_id;
/** Location of the error. */
u64 address;
/** Timestamp in nanoseconds. */
u64 timestamp_ns;
};
/**
* gpu_ecc_error_info structure describes an ECC error: the common GPU error
* header followed by the number of ECC errors.
*/
struct gpu_ecc_error_info {
/** Common GPU error header (version, sub-error type, address, ...). */
struct gpu_err_header header;
/** Number of ECC errors. */
u64 err_cnt;
};
/**
* nvgpu_err_hw_module structure holds fields which describe the h/w modules
* error reporting capabilities.
*/
struct nvgpu_err_hw_module {
/** String representation of a given HW unit. */
const char *name;
/** HW unit ID. */
u32 hw_unit;
/**
* Total number of errors reported from a given HW unit.
* NOTE(review): presumably the number of entries reachable through
* \a errs - confirm against the code that builds the look-up table.
*/
u32 num_errs;
/**
* NOTE(review): undocumented in the original; presumably the first ECC
* service ID assigned to this HW unit - TODO confirm.
*/
u32 base_ecc_service_id;
/** Used to get error description from look-up table. */
struct nvgpu_err_desc *errs;
};
/**
* nvgpu_ecc_reporting_ops structure holds the callback used to report an ECC
* error. It is referenced through the \a ops member of
* struct nvgpu_ecc_reporting.
*/
struct nvgpu_ecc_reporting_ops {
/**
* ECC error reporting callback. It takes the same parameter list as
* nvgpu_report_ecc_err(): GPU driver struct, HW unit index, instance ID,
* error index, error address and error count.
*/
void (*report_ecc_err)(struct gk20a *g, u32 hw_unit, u32 inst,
u32 err_id, u64 err_addr, u64 err_count);
};
/**
* nvgpu_ecc_reporting structure holds the state of the ECC reporting service:
* a lock, the enable flag guarded by that lock, and the reporting callbacks.
*
* NOTE(review): struct nvgpu_spinlock is embedded by value, so its complete
* definition must be visible wherever this header is included. This header
* does not include a lock header directly - confirm that one of the included
* headers provides the definition.
*/
struct nvgpu_ecc_reporting {
/** Spinlock protecting \a ecc_reporting_service_enabled. */
struct nvgpu_spinlock lock;
/* This flag is protected by the above spinlock */
bool ecc_reporting_service_enabled;
/** Reporting callbacks; the pointed-to table is const. */
const struct nvgpu_ecc_reporting_ops *ops;
};
/**
* Expands to a brace-enclosed initializer for one struct nvgpu_err_desc entry.
*
* @param err       String representation of the error (stored in .name).
* @param critical  Criticality flag (stored in .is_critical).
* @param id        Error ID (stored in .error_id).
* @param threshold Error threshold (stored in .err_threshold).
* @param ecount    Initial error count; it is handed to NVGPU_ATOMIC_INIT()
*                  to build the .err_count member.
*
* Designated initializers are used, so the result does not depend on the
* order in which the members are listed; they are written here in the order
* in which struct nvgpu_err_desc declares them.
*/
#define GPU_ERR(err, critical, id, threshold, ecount) \
{ \
	.name          = (err), \
	.is_critical   = (critical), \
	.err_threshold = (threshold), \
	.err_count     = NVGPU_ATOMIC_INIT(ecount), \
	.error_id      = (id), \
}
/**
* This macro is used to initialize critical errors.
*
* It forwards \a err, \a id, \a threshold and \a ecount unchanged to GPU_ERR()
* and passes true as the criticality flag.
*/
#define GPU_CRITERR(err, id, threshold, ecount) \
GPU_ERR(err, true, id, threshold, ecount)
/**
* This macro is used to initialize non-critical errors.
*
* It forwards \a err, \a id, \a threshold and \a ecount unchanged to GPU_ERR()
* and passes false as the criticality flag.
*/
#define GPU_NONCRITERR(err, id, threshold, ecount) \
GPU_ERR(err, false, id, threshold, ecount)
/**
* @brief GPU HW errors need to be reported to Safety_Services via SDL unit.
* This function provides an interface to report ECC errors to SDL unit.
*
* @param g [in] - The GPU driver struct.
* @param hw_unit [in] - Index of HW unit.
*   - List of valid HW unit IDs
*     - NVGPU_ERR_MODULE_SM
*     - NVGPU_ERR_MODULE_FECS
*     - NVGPU_ERR_MODULE_GPCCS
*     - NVGPU_ERR_MODULE_MMU
*     - NVGPU_ERR_MODULE_GCC
*     - NVGPU_ERR_MODULE_PMU
*     - NVGPU_ERR_MODULE_LTC
*     - NVGPU_ERR_MODULE_HUBMMU
* @param inst [in] - Instance ID.
*   - In case of multiple instances of the same HW
*     unit (e.g., there are multiple instances of
*     SM), it is used to identify the instance
*     that encountered a fault.
* @param err_id [in] - Error index.
*   - For SM:
*     - Min: GPU_SM_L1_TAG_ECC_CORRECTED
*     - Max: GPU_SM_ICACHE_L1_PREDECODE_ECC_UNCORRECTED
*   - For FECS:
*     - Min: GPU_FECS_FALCON_IMEM_ECC_CORRECTED
*     - Max: GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED
*   - For GPCCS:
*     - Min: GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED
*     - Max: GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED
*   - For MMU:
*     - Min: GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED
*     - Max: GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED
*   - For GCC:
*     - Min: GPU_GCC_L15_ECC_UNCORRECTED
*     - Max: GPU_GCC_L15_ECC_UNCORRECTED
*   - For PMU:
*     - Min: GPU_PMU_FALCON_IMEM_ECC_CORRECTED
*     - Max: GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED
*   - For LTC:
*     - Min: GPU_LTC_CACHE_DSTG_ECC_CORRECTED
*     - Max: GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED
*   - For HUBMMU:
*     - Min: GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED
*     - Max: GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED
* @param err_addr [in] - Error address.
*   - This is the location at which correctable or
*     uncorrectable error has occurred.
* @param err_count [in] - Error count.
*
* - Checks whether SDL is supported in the current GPU platform. If SDL is not
*   supported, it simply returns.
* - Validates both \a hw_unit and \a err_id indices. In case of a failure,
*   invokes #nvgpu_sdl_handle_report_failure() api.
* - Gets the current time of a clock. In case of a failure, invokes
*   #nvgpu_sdl_handle_report_failure() api.
* - Gets error description from internal look-up table using \a hw_unit and
*   \a err_id indices.
* - Forms error packet using details such as time-stamp, \a hw_unit, \a err_id,
*   criticality of the error, \a inst, \a err_addr, \a err_count, error
*   description, and size of the error packet.
* - Performs compile-time assert check to ensure that the size of the error
*   packet does not exceed the maximum allowable size specified in
*   #MAX_ERR_MSG_SIZE.
*
* @note The error indices between Min and Max are not contiguous; only the
* values that have a macro in this header name an error.
*
* @note When CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING is not defined,
* this header provides an empty static inline function with the same
* signature instead of this declaration.
*
* NOTE(review): of the HW unit IDs listed above, only NVGPU_ERR_MODULE_SM,
* NVGPU_ERR_MODULE_FECS and NVGPU_ERR_MODULE_PMU are defined in this header.
* The FECS maximum previously named GPU_FECS_INVALID_ERROR, which this header
* does not define; it now names the largest FECS index defined here - confirm.
*
* @return None
*/
void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
u32 err_id, u64 err_addr, u64 err_count);
/*
* Life-cycle entry points of the ECC reporting service. Only the prototypes
* are visible in this header, so the per-function notes below are inferred
* from the names and must be confirmed against the implementation.
*
* NOTE(review): unlike nvgpu_report_ecc_err(), these four functions have no
* inline stub in the #else branch below, so every caller has to be compiled
* only when CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING is defined.
*/
/** Presumably sets up the ECC reporting state of \a g - TODO confirm. */
void nvgpu_init_ecc_reporting(struct gk20a *g);
/** Presumably turns ECC error reporting on for \a g - TODO confirm. */
void nvgpu_enable_ecc_reporting(struct gk20a *g);
/** Presumably turns ECC error reporting off for \a g - TODO confirm. */
void nvgpu_disable_ecc_reporting(struct gk20a *g);
/** Presumably releases the ECC reporting state of \a g - TODO confirm. */
void nvgpu_deinit_ecc_reporting(struct gk20a *g);
#else
/**
* No-op replacement for nvgpu_report_ecc_err(), used when
* CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING is not defined. It keeps the
* same signature so that callers compile unchanged; all arguments are ignored.
*
* NOTE(review): this is the only function that gets a stub in this branch;
* nvgpu_init_ecc_reporting() and its enable/disable/deinit siblings are not
* declared here.
*/
static inline void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit,
		u32 inst, u32 err_id, u64 err_addr, u64 err_count)
{
	/* ECC error reporting is compiled out: nothing to do. */
}
#endif /* CONFIG_NVGPU_SUPPORT_LINUX_ECC_ERROR_REPORTING */
#endif /* NVGPU_NVGPU_ERR_H */