From a09b9cd587d27a3ef6479a17631c3497d447e7a9 Mon Sep 17 00:00:00 2001 From: Vinod G Date: Thu, 24 May 2018 14:00:19 -0700 Subject: gpu: nvgpu: Add IOCTL for SM_EXCEPTION_TYPE_MASK Add a new ioctl to the dbg session to set the SM_EXCEPTION_TYPE_MASK. Currently the SM_EXCEPTION_TYPE_MASK_FATAL type is supported. If this type is set, the code skips RC recovery and instead triggers CILP preemption. bug 200412641 JIRA NVGPU-702 Change-Id: I4b1f18379ee792cd324ccc555939e0f4f5c9e3b4 Signed-off-by: Vinod G Reviewed-on: https://git-master.nvidia.com/r/1729792 Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.h | 6 +++ drivers/gpu/nvgpu/gk20a/gr_gk20a.h | 6 +++ drivers/gpu/nvgpu/gv11b/gr_gv11b.c | 22 +++++++++-- drivers/gpu/nvgpu/os/linux/ioctl_dbg.c | 68 +++++++++++++++++++++++++++++++++ include/uapi/linux/nvgpu.h | 21 +++++++++- 5 files changed, 118 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.h b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.h index 50002557..4d3c4d74 100644 --- a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.h @@ -72,6 +72,12 @@ struct dbg_session_gk20a { bool broadcast_stop_trigger; struct nvgpu_mutex ioctl_lock; + + /* + * sm set exception type mask flag, to check whether + * exception type mask is requested or not. 
+ */ + bool is_sm_exception_type_mask_set; }; struct dbg_session_data { diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h index 0c6be57b..804e0e25 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h @@ -437,6 +437,12 @@ struct gr_gk20a { u32 no_of_sm; struct sm_info *sm_to_cluster; struct nvgpu_gr_sm_error_state *sm_error_states; + +#define NVGPU_SM_EXCEPTION_TYPE_MASK_NONE (0x0U) +#define NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL (0x1U << 0) + u32 sm_exception_mask_type; + u32 sm_exception_mask_refcount; + #if defined(CONFIG_GK20A_CYCLE_STATS) struct nvgpu_mutex cs_lock; struct gk20a_cs_snapshot *cs_data; diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c index c925e5b6..9e36071f 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c @@ -2182,9 +2182,9 @@ static bool gr_gv11b_check_warp_esr_error(struct gk20a *g, u32 warp_esr_error) struct warp_esr_error_table_s warp_esr_error_table[] = { { gr_gpc0_tpc0_sm0_hww_warp_esr_error_stack_error_f(), - "STACK ERROR"}, + "STACK ERROR"}, { gr_gpc0_tpc0_sm0_hww_warp_esr_error_api_stack_error_f(), - "API STACK ERROR"}, + "API STACK ERROR"}, { gr_gpc0_tpc0_sm0_hww_warp_esr_error_pc_wrap_f(), "PC WRAP ERROR"}, { gr_gpc0_tpc0_sm0_hww_warp_esr_error_misaligned_pc_f(), @@ -2221,7 +2221,7 @@ static bool gr_gv11b_check_warp_esr_error(struct gk20a *g, u32 warp_esr_error) if (warp_esr_error_table[index].error_value == warp_esr_error) { esr_err = warp_esr_error_table[index].error_value; nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, - "ESR %s(0x%x)", + "WARP_ESR %s(0x%x)", warp_esr_error_table[index].error_name, esr_err); break; @@ -2250,6 +2250,21 @@ static int gr_gv11b_handle_all_warp_esr_errors(struct gk20a *g, return 0; } + /* + * Check whether SET_EXCEPTION_TYPE_MASK is set. + * If set, skip the recovery and trigger CILP. + * If not set, trigger the recovery. 
+ */ + if ((g->gr.sm_exception_mask_type & + NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL) == + NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL) { + nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, + "SM Exception Type Mask set %d," + "skip recovery", + g->gr.sm_exception_mask_type); + return 0; + } + if (fault_ch) { tsg = &g->fifo.tsg[fault_ch->tsgid]; @@ -2294,7 +2309,6 @@ int gr_gv11b_pre_process_sm_exception(struct gk20a *g, u32 warp_esr_error = gr_gpc0_tpc0_sm0_hww_warp_esr_error_v(warp_esr); struct tsg_gk20a *tsg; - *early_exit = false; *ignore_debugger = false; diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c b/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c index eadf1f93..ad4dfc0e 100644 --- a/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c +++ b/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c @@ -151,6 +151,10 @@ static int dbg_unbind_all_channels_gk20a(struct dbg_session_gk20a *dbg_s); static int gk20a_dbg_gpu_do_dev_open(struct inode *inode, struct file *filp, bool is_profiler); +static int nvgpu_set_sm_exception_type_mask_locked( + struct dbg_session_gk20a *dbg_s, + u32 exception_mask); + unsigned int gk20a_dbg_gpu_dev_poll(struct file *filep, poll_table *wait) { unsigned int mask = 0; @@ -217,6 +221,10 @@ int gk20a_dbg_gpu_dev_release(struct inode *inode, struct file *filp) nvgpu_kfree(g, prof_obj); } } + + nvgpu_set_sm_exception_type_mask_locked(dbg_s, + NVGPU_SM_EXCEPTION_TYPE_MASK_NONE); + nvgpu_mutex_release(&g->dbg_sessions_lock); nvgpu_mutex_destroy(&dbg_s->ch_list_lock); @@ -466,6 +474,7 @@ static int gk20a_dbg_gpu_do_dev_open(struct inode *inode, dbg_s->is_profiler = is_profiler; dbg_s->is_pg_disabled = false; dbg_s->is_timeout_disabled = false; + dbg_s->is_sm_exception_type_mask_set = false; nvgpu_cond_init(&dbg_s->dbg_events.wait_queue); nvgpu_init_list_node(&dbg_s->ch_list); @@ -478,6 +487,9 @@ static int gk20a_dbg_gpu_do_dev_open(struct inode *inode, dbg_s->dbg_events.events_enabled = false; dbg_s->dbg_events.num_pending_events = 0; + nvgpu_set_sm_exception_type_mask_locked(dbg_s, + 
NVGPU_SM_EXCEPTION_TYPE_MASK_NONE); + return 0; err_destroy_lock: @@ -1839,6 +1851,57 @@ out: return err; } +static int nvgpu_set_sm_exception_type_mask_locked( + struct dbg_session_gk20a *dbg_s, + u32 exception_mask) +{ + struct gk20a *g = dbg_s->g; + struct gr_gk20a *gr = &g->gr; + int err = 0; + + switch (exception_mask) { + case NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_FATAL: + gr->sm_exception_mask_type = NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL; + if (dbg_s->is_sm_exception_type_mask_set == false) { + gr->sm_exception_mask_refcount++; + dbg_s->is_sm_exception_type_mask_set = true; + } + break; + case NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_NONE: + if (dbg_s->is_sm_exception_type_mask_set) { + gr->sm_exception_mask_refcount--; + dbg_s->is_sm_exception_type_mask_set = false; + } + if (gr->sm_exception_mask_refcount == 0) + gr->sm_exception_mask_type = + NVGPU_SM_EXCEPTION_TYPE_MASK_NONE; + break; + default: + nvgpu_err(g, + "unrecognized dbg sm exception type mask: 0x%x", + exception_mask); + err = -EINVAL; + break; + } + + return err; +} + +static int nvgpu_dbg_gpu_set_sm_exception_type_mask( + struct dbg_session_gk20a *dbg_s, + struct nvgpu_dbg_gpu_set_sm_exception_type_mask_args *args) +{ + int err = 0; + struct gk20a *g = dbg_s->g; + + nvgpu_mutex_acquire(&g->dbg_sessions_lock); + err = nvgpu_set_sm_exception_type_mask_locked(dbg_s, + args->exception_type_mask); + nvgpu_mutex_release(&g->dbg_sessions_lock); + + return err; +} + int gk20a_dbg_gpu_dev_open(struct inode *inode, struct file *filp) { struct nvgpu_os_linux *l = container_of(inode->i_cdev, @@ -1994,6 +2057,11 @@ long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd, (struct nvgpu_dbg_gpu_profiler_reserve_args *)buf); break; + case NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK: + err = nvgpu_dbg_gpu_set_sm_exception_type_mask(dbg_s, + (struct nvgpu_dbg_gpu_set_sm_exception_type_mask_args *)buf); + break; + default: nvgpu_err(g, "unrecognized dbg gpu ioctl cmd: 0x%x", diff --git 
a/include/uapi/linux/nvgpu.h b/include/uapi/linux/nvgpu.h index 446f5bd3..0733a7b2 100644 --- a/include/uapi/linux/nvgpu.h +++ b/include/uapi/linux/nvgpu.h @@ -1411,8 +1411,27 @@ struct nvgpu_dbg_gpu_profiler_reserve_args { #define NVGPU_DBG_GPU_IOCTL_PROFILER_RESERVE \ _IOWR(NVGPU_DBG_GPU_IOCTL_MAGIC, 22, struct nvgpu_dbg_gpu_profiler_reserve_args) +/* + * This struct helps to set the exception mask. If mask is not set + * or set to NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_NONE + * then kernel code will follow recovery path on sm exception. + * If mask is set to NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_FATAL, then + * kernel code will skip recovery path on sm exception. + */ +struct nvgpu_dbg_gpu_set_sm_exception_type_mask_args { +#define NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_NONE (0x0U) +#define NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_FATAL (0x1U << 0U) + /* exception type mask value */ + __u32 exception_type_mask; + __u32 reserved; +}; + +#define NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK \ + _IOW(NVGPU_DBG_GPU_IOCTL_MAGIC, 23, \ + struct nvgpu_dbg_gpu_set_sm_exception_type_mask_args) + #define NVGPU_DBG_GPU_IOCTL_LAST \ - _IOC_NR(NVGPU_DBG_GPU_IOCTL_PROFILER_RESERVE) + _IOC_NR(NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK) #define NVGPU_DBG_GPU_IOCTL_MAX_ARG_SIZE \ sizeof(struct nvgpu_dbg_gpu_access_fb_memory_args) -- cgit v1.2.2