summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVinod G <vinodg@nvidia.com>2018-05-23 20:22:03 -0400
committerTejal Kudav <tkudav@nvidia.com>2018-06-14 09:44:06 -0400
commitd84e822128a224eda4a703dad530716331dd36bd (patch)
tree1be84de0d8e1407fa109b1e51e89cf8f5fa94b82
parent40cefb666f3767059383052346d4c0faa9195a48 (diff)
gpu: nvgpu: Add Ctrl API to read SM error state
Expose an IOCTL on the Ctrl node to read the error state of a single SM, under NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE. Bug 200412642; JIRA NVGPU-700; Change-Id: I3cbcf4d7f23a53dbd2350b38a5e259559d5fd3af; Signed-off-by: Vinod G <vinodg@nvidia.com>; Reviewed-on: https://git-master.nvidia.com/r/1728931; Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>; Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
-rw-r--r-- drivers/gpu/nvgpu/common/linux/ioctl_ctrl.c | 55
-rw-r--r-- include/uapi/linux/nvgpu.h | 37
2 files changed, 91 insertions, 1 deletions
diff --git a/drivers/gpu/nvgpu/common/linux/ioctl_ctrl.c b/drivers/gpu/nvgpu/common/linux/ioctl_ctrl.c
index b40efc0f..ee0739c9 100644
--- a/drivers/gpu/nvgpu/common/linux/ioctl_ctrl.c
+++ b/drivers/gpu/nvgpu/common/linux/ioctl_ctrl.c
@@ -1575,6 +1575,56 @@ out:
1575 return err; 1575 return err;
1576} 1576}
1577 1577
1578static int nvgpu_gpu_read_single_sm_error_state(struct gk20a *g,
1579 struct nvgpu_gpu_read_single_sm_error_state_args *args)
1580{
1581 struct gr_gk20a *gr = &g->gr;
1582 struct nvgpu_gr_sm_error_state *sm_error_state;
1583 struct nvgpu_gpu_sm_error_state_record sm_error_state_record;
1584 u32 sm_id;
1585 int err = 0;
1586
1587 sm_id = args->sm_id;
1588 if (sm_id >= gr->no_of_sm)
1589 return -EINVAL;
1590
1591 nvgpu_speculation_barrier();
1592
1593 sm_error_state = gr->sm_error_states + sm_id;
1594 sm_error_state_record.global_esr =
1595 sm_error_state->hww_global_esr;
1596 sm_error_state_record.warp_esr =
1597 sm_error_state->hww_warp_esr;
1598 sm_error_state_record.warp_esr_pc =
1599 sm_error_state->hww_warp_esr_pc;
1600 sm_error_state_record.global_esr_report_mask =
1601 sm_error_state->hww_global_esr_report_mask;
1602 sm_error_state_record.warp_esr_report_mask =
1603 sm_error_state->hww_warp_esr_report_mask;
1604
1605 if (args->record_size > 0) {
1606 size_t write_size = sizeof(*sm_error_state);
1607
1608 if (write_size > args->record_size)
1609 write_size = args->record_size;
1610
1611 nvgpu_mutex_acquire(&g->dbg_sessions_lock);
1612 err = copy_to_user((void __user *)(uintptr_t)
1613 args->record_mem,
1614 &sm_error_state_record,
1615 write_size);
1616 nvgpu_mutex_release(&g->dbg_sessions_lock);
1617 if (err) {
1618 nvgpu_err(g, "copy_to_user failed!");
1619 return err;
1620 }
1621
1622 args->record_size = write_size;
1623 }
1624
1625 return 0;
1626}
1627
1578long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 1628long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
1579{ 1629{
1580 struct gk20a_ctrl_priv *priv = filp->private_data; 1630 struct gk20a_ctrl_priv *priv = filp->private_data;
@@ -1887,6 +1937,11 @@ long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg
1887 (struct nvgpu_gpu_set_deterministic_opts_args *)buf); 1937 (struct nvgpu_gpu_set_deterministic_opts_args *)buf);
1888 break; 1938 break;
1889 1939
1940 case NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE:
1941 err = nvgpu_gpu_read_single_sm_error_state(g,
1942 (struct nvgpu_gpu_read_single_sm_error_state_args *)buf);
1943 break;
1944
1890 default: 1945 default:
1891 nvgpu_log_info(g, "unrecognized gpu ioctl cmd: 0x%x", cmd); 1946 nvgpu_log_info(g, "unrecognized gpu ioctl cmd: 0x%x", cmd);
1892 err = -ENOTTY; 1947 err = -ENOTTY;
diff --git a/include/uapi/linux/nvgpu.h b/include/uapi/linux/nvgpu.h
index b36509b0..908e5c57 100644
--- a/include/uapi/linux/nvgpu.h
+++ b/include/uapi/linux/nvgpu.h
@@ -864,6 +864,38 @@ struct nvgpu_gpu_set_deterministic_opts_args {
864 __u64 channels; /* in */ 864 __u64 channels; /* in */
865}; 865};
866 866
867/*
868 * Snapshot of the error state of a single SM, as copied to user space.
869 * This acts upon the currently resident GR context. Fields, in order:
870 * global_esr - Global Error status register
871 * warp_esr - Warp Error status register
872 * warp_esr_pc - Warp Error status register PC
873 * global_esr_report_mask - Global Error status register Report Mask
874 * warp_esr_report_mask - Warp Error status register Report Mask
875 */
876struct nvgpu_gpu_sm_error_state_record {
877 __u32 global_esr;
878 __u32 warp_esr;
879 __u64 warp_esr_pc;
880 __u32 global_esr_report_mask;
881 __u32 warp_esr_report_mask;
882};
883
884/*
885 * Arguments for NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE: read the error state of one SM.
886 */
887struct nvgpu_gpu_read_single_sm_error_state_args {
888 /* in: SM ID; the driver returns -EINVAL if it is not below its SM count */
889 __u32 sm_id;
890 __u32 reserved;
891 /*
892 * in: user-space address of a struct nvgpu_gpu_sm_error_state_record that receives the data
893 */
894 __u64 record_mem;
895 /* in: size in bytes of the buffer at record_mem; out: bytes written when non-zero on entry */
896 __u64 record_size;
897};
898
867#define NVGPU_GPU_IOCTL_ZCULL_GET_CTX_SIZE \ 899#define NVGPU_GPU_IOCTL_ZCULL_GET_CTX_SIZE \
868 _IOR(NVGPU_GPU_IOCTL_MAGIC, 1, struct nvgpu_gpu_zcull_get_ctx_size_args) 900 _IOR(NVGPU_GPU_IOCTL_MAGIC, 1, struct nvgpu_gpu_zcull_get_ctx_size_args)
869#define NVGPU_GPU_IOCTL_ZCULL_GET_INFO \ 901#define NVGPU_GPU_IOCTL_ZCULL_GET_INFO \
@@ -949,8 +981,11 @@ struct nvgpu_gpu_set_deterministic_opts_args {
949#define NVGPU_GPU_IOCTL_SET_DETERMINISTIC_OPTS \ 981#define NVGPU_GPU_IOCTL_SET_DETERMINISTIC_OPTS \
950 _IOWR(NVGPU_GPU_IOCTL_MAGIC, 40, \ 982 _IOWR(NVGPU_GPU_IOCTL_MAGIC, 40, \
951 struct nvgpu_gpu_set_deterministic_opts_args) 983 struct nvgpu_gpu_set_deterministic_opts_args)
/* Ctrl-node ioctl 41: read the error state record of a single SM. */
984#define NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE \
985 _IOWR(NVGPU_GPU_IOCTL_MAGIC, 41, \
986 struct nvgpu_gpu_read_single_sm_error_state_args)
952#define NVGPU_GPU_IOCTL_LAST \ 987#define NVGPU_GPU_IOCTL_LAST \
953 _IOC_NR(NVGPU_GPU_IOCTL_SET_DETERMINISTIC_OPTS) 988 _IOC_NR(NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE)
954#define NVGPU_GPU_IOCTL_MAX_ARG_SIZE \ 989#define NVGPU_GPU_IOCTL_MAX_ARG_SIZE \
955 sizeof(struct nvgpu_gpu_get_cpu_time_correlation_info_args) 990 sizeof(struct nvgpu_gpu_get_cpu_time_correlation_info_args)
956 991