From d84e822128a224eda4a703dad530716331dd36bd Mon Sep 17 00:00:00 2001
From: Vinod G
Date: Wed, 23 May 2018 17:22:03 -0700
Subject: gpu: nvgpu: Add Ctrl API to read SM error state

Expose IOCTL to Ctrl node to read Single SM error under
NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE

bug 200412642
JIRA NVGPU-700

Change-Id: I3cbcf4d7f23a53dbd2350b38a5e259559d5fd3af
Signed-off-by: Vinod G
Reviewed-on: https://git-master.nvidia.com/r/1728931
Reviewed-by: mobile promotions
Tested-by: mobile promotions
---
 drivers/gpu/nvgpu/common/linux/ioctl_ctrl.c | 77 +++++++++++++++++++++++++++++
 include/uapi/linux/nvgpu.h                  | 37 +++++++++++++-
 2 files changed, 113 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/nvgpu/common/linux/ioctl_ctrl.c b/drivers/gpu/nvgpu/common/linux/ioctl_ctrl.c
index b40efc0f..ee0739c9 100644
--- a/drivers/gpu/nvgpu/common/linux/ioctl_ctrl.c
+++ b/drivers/gpu/nvgpu/common/linux/ioctl_ctrl.c
@@ -1575,6 +1575,78 @@ out:
 	return err;
 }
 
+/*
+ * Copy the recorded error state of one SM into a user-supplied buffer.
+ *
+ * args->sm_id selects the SM and must be below gr->no_of_sm. When
+ * args->record_size is non-zero, at most
+ * sizeof(struct nvgpu_gpu_sm_error_state_record) bytes are written to the
+ * user address in args->record_mem and args->record_size is set to the
+ * number of bytes written.
+ *
+ * Returns 0 on success, -EINVAL if sm_id is out of range, or -EFAULT if the
+ * user buffer could not be written.
+ */
+static int nvgpu_gpu_read_single_sm_error_state(struct gk20a *g,
+		struct nvgpu_gpu_read_single_sm_error_state_args *args)
+{
+	struct gr_gk20a *gr = &g->gr;
+	struct nvgpu_gr_sm_error_state *sm_error_state;
+	struct nvgpu_gpu_sm_error_state_record sm_error_state_record;
+	unsigned long not_copied;
+	u32 sm_id;
+
+	sm_id = args->sm_id;
+	if (sm_id >= gr->no_of_sm)
+		return -EINVAL;
+
+	/* Barrier between the sm_id bounds check and the indexed access. */
+	nvgpu_speculation_barrier();
+
+	/*
+	 * NOTE(review): this snapshot is taken before dbg_sessions_lock is
+	 * acquired below; confirm whether the lock is meant to cover it.
+	 */
+	sm_error_state = gr->sm_error_states + sm_id;
+	sm_error_state_record.global_esr =
+		sm_error_state->hww_global_esr;
+	sm_error_state_record.warp_esr =
+		sm_error_state->hww_warp_esr;
+	sm_error_state_record.warp_esr_pc =
+		sm_error_state->hww_warp_esr_pc;
+	sm_error_state_record.global_esr_report_mask =
+		sm_error_state->hww_global_esr_report_mask;
+	sm_error_state_record.warp_esr_report_mask =
+		sm_error_state->hww_warp_esr_report_mask;
+
+	if (args->record_size > 0) {
+		/*
+		 * Bound the copy by the record actually being copied, not by
+		 * the driver-internal struct it was filled from.
+		 */
+		size_t write_size = sizeof(sm_error_state_record);
+
+		if (write_size > args->record_size)
+			write_size = args->record_size;
+
+		nvgpu_mutex_acquire(&g->dbg_sessions_lock);
+		not_copied = copy_to_user((void __user *)(uintptr_t)
+						args->record_mem,
+					  &sm_error_state_record,
+					  write_size);
+		nvgpu_mutex_release(&g->dbg_sessions_lock);
+		/* copy_to_user() returns the number of bytes NOT copied. */
+		if (not_copied) {
+			nvgpu_err(g, "copy_to_user failed!");
+			return -EFAULT;
+		}
+
+		args->record_size = write_size;
+	}
+
+	return 0;
+}
+
 long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct gk20a_ctrl_priv *priv = filp->private_data;
@@ -1887,6 +1959,11 @@ long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg
 			(struct nvgpu_gpu_set_deterministic_opts_args *)buf);
 		break;
 
+	case NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE:
+		err = nvgpu_gpu_read_single_sm_error_state(g,
+			(struct nvgpu_gpu_read_single_sm_error_state_args *)buf);
+		break;
+
 	default:
 		nvgpu_log_info(g, "unrecognized gpu ioctl cmd: 0x%x", cmd);
 		err = -ENOTTY;
diff --git a/include/uapi/linux/nvgpu.h b/include/uapi/linux/nvgpu.h
index b36509b0..908e5c57 100644
--- a/include/uapi/linux/nvgpu.h
+++ b/include/uapi/linux/nvgpu.h
@@ -864,6 +864,38 @@ struct nvgpu_gpu_set_deterministic_opts_args {
 	__u64 channels; /* in */
 };
 
+/*
+ * This struct helps to report the SM error state of a single SM.
+ * This acts upon the currently resident GR context.
+ * Global Error status register
+ * Warp Error status register
+ * Warp Error status register PC
+ * Global Error status register Report Mask
+ * Warp Error status register Report Mask
+ */
+struct nvgpu_gpu_sm_error_state_record {
+	__u32 global_esr;
+	__u32 warp_esr;
+	__u64 warp_esr_pc;
+	__u32 global_esr_report_mask;
+	__u32 warp_esr_report_mask;
+};
+
+/*
+ * Arguments for NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE.
+ */
+struct nvgpu_gpu_read_single_sm_error_state_args {
+	/* Valid SM ID */
+	__u32 sm_id;
+	__u32 reserved;
+	/*
+	 * User pointer to a struct nvgpu_gpu_sm_error_state_record buffer
+	 */
+	__u64 record_mem;
+	/* Size in bytes of the record to read */
+	__u64 record_size;
+};
+
 #define NVGPU_GPU_IOCTL_ZCULL_GET_CTX_SIZE \
 	_IOR(NVGPU_GPU_IOCTL_MAGIC, 1, struct nvgpu_gpu_zcull_get_ctx_size_args)
 #define NVGPU_GPU_IOCTL_ZCULL_GET_INFO \
@@ -949,8 +981,11 @@ struct nvgpu_gpu_set_deterministic_opts_args {
 #define NVGPU_GPU_IOCTL_SET_DETERMINISTIC_OPTS \
 	_IOWR(NVGPU_GPU_IOCTL_MAGIC, 40, \
 			struct nvgpu_gpu_set_deterministic_opts_args)
+#define NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE \
+	_IOWR(NVGPU_GPU_IOCTL_MAGIC, 41, \
+			struct nvgpu_gpu_read_single_sm_error_state_args)
 #define NVGPU_GPU_IOCTL_LAST		\
-	_IOC_NR(NVGPU_GPU_IOCTL_SET_DETERMINISTIC_OPTS)
+	_IOC_NR(NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE)
 #define NVGPU_GPU_IOCTL_MAX_ARG_SIZE	\
 	sizeof(struct nvgpu_gpu_get_cpu_time_correlation_info_args)
 
-- 
cgit v1.2.2