From c651adbeaacf063b856ef8126b74661b54066477 Mon Sep 17 00:00:00 2001
From: Deepak Nibade
Date: Thu, 24 Dec 2015 18:41:15 +0530
Subject: gpu: nvgpu: IOCTL to write/clear SM error states

Add the below IOCTLs to write/clear SM error states:

NVGPU_DBG_GPU_IOCTL_CLEAR_SINGLE_SM_ERROR_STATE
NVGPU_DBG_GPU_IOCTL_WRITE_SINGLE_SM_ERROR_STATE

Bug 200156699

Change-Id: I89e3ec51c33b8e131a67d28807d5acf57b3a48fd
Signed-off-by: Deepak Nibade
Reviewed-on: http://git-master/r/1120330
Reviewed-by: Automatic_Commit_Validation_User
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom
---
 drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c |  90 ++++++++++++++++++++++++++
 drivers/gpu/nvgpu/gk20a/gk20a.h         |   6 ++
 drivers/gpu/nvgpu/gk20a/gr_gk20a.c      | 107 ++++++++++++++++++++++++++++++
 drivers/gpu/nvgpu/gm20b/gr_gm20b.c      | 111 ++++++++++++++++++++++++++++++++
 4 files changed, 314 insertions(+)

diff --git a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c
index d9c96417..f717e207 100644
--- a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c
@@ -564,6 +564,86 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state(
 	return 0;
 }
 
+static int nvgpu_dbg_gpu_ioctl_clear_single_sm_error_state(
+		struct dbg_session_gk20a *dbg_s,
+		struct nvgpu_dbg_gpu_clear_single_sm_error_state_args *args)
+{
+	struct gk20a *g = get_gk20a(dbg_s->dev);
+	struct gr_gk20a *gr = &g->gr;
+	u32 sm_id;
+	struct channel_gk20a *ch = dbg_s->ch;
+	int err = 0;
+
+	sm_id = args->sm_id;
+
+	if (sm_id >= gr->no_of_sm)
+		return -EINVAL;
+
+	err = gk20a_busy(g->dev);
+	if (err)
+		return err;
+
+	err = gr_gk20a_elpg_protected_call(g,
+			g->ops.gr.clear_sm_error_state(g, ch, sm_id));
+
+	gk20a_idle(g->dev);
+
+	return err;
+}
+
+static int nvgpu_dbg_gpu_ioctl_write_single_sm_error_state(
+		struct dbg_session_gk20a *dbg_s,
+		struct nvgpu_dbg_gpu_write_single_sm_error_state_args *args)
+{
+	struct gk20a *g = get_gk20a(dbg_s->dev);
+	struct gr_gk20a *gr = &g->gr;
+	u32 sm_id;
+	struct channel_gk20a *ch = dbg_s->ch;
+	struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state;
+	int err = 0;
+
+	sm_id = args->sm_id;
+	if (sm_id >= gr->no_of_sm)
+		return -EINVAL;
+
+	sm_error_state = kzalloc(sizeof(*sm_error_state), GFP_KERNEL);
+	if (!sm_error_state)
+		return -ENOMEM;
+
+	if (args->sm_error_state_record_size > 0) {
+		size_t read_size = sizeof(*sm_error_state);
+
+		if (read_size > args->sm_error_state_record_size)
+			read_size = args->sm_error_state_record_size;
+
+		mutex_lock(&g->dbg_sessions_lock);
+		err = copy_from_user(sm_error_state,
+				(void __user *)(uintptr_t)
+					args->sm_error_state_record_mem,
+				read_size);
+		mutex_unlock(&g->dbg_sessions_lock);
+		if (err) {
+			err = -EFAULT;
+			goto err_free;
+		}
+	}
+
+	err = gk20a_busy(g->dev);
+	if (err)
+		goto err_free;
+
+	err = gr_gk20a_elpg_protected_call(g,
+			g->ops.gr.update_sm_error_state(g, ch,
+					sm_id, sm_error_state));
+
+	gk20a_idle(g->dev);
+
+err_free:
+	kfree(sm_error_state);
+
+	return err;
+}
+
 long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd,
 		unsigned long arg)
 {
@@ -666,6 +746,16 @@ long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd,
 			(struct nvgpu_dbg_gpu_read_single_sm_error_state_args *)buf);
 		break;
 
+	case NVGPU_DBG_GPU_IOCTL_CLEAR_SINGLE_SM_ERROR_STATE:
+		err = nvgpu_dbg_gpu_ioctl_clear_single_sm_error_state(dbg_s,
+			(struct nvgpu_dbg_gpu_clear_single_sm_error_state_args *)buf);
+		break;
+
+	case NVGPU_DBG_GPU_IOCTL_WRITE_SINGLE_SM_ERROR_STATE:
+		err = nvgpu_dbg_gpu_ioctl_write_single_sm_error_state(dbg_s,
+			(struct nvgpu_dbg_gpu_write_single_sm_error_state_args *)buf);
+		break;
+
 	default:
 		gk20a_err(dev_from_gk20a(g),
 			   "unrecognized dbg gpu ioctl cmd: 0x%x",
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index c70217ea..d1ce6afb 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -242,6 +242,12 @@ struct gpu_ops {
 		u32 (*get_lrf_tex_ltc_dram_override)(struct gk20a *g);
 		int (*record_sm_error_state)(struct gk20a *g,
 				u32 gpc, u32 tpc);
+		int (*update_sm_error_state)(struct gk20a *g,
+				struct channel_gk20a *ch, u32 sm_id,
+				struct nvgpu_dbg_gpu_sm_error_state_record *
+							sm_error_state);
+		int (*clear_sm_error_state)(struct gk20a *g,
+				struct channel_gk20a *ch, u32 sm_id);
 	} gr;
 	const char *name;
 	struct {
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index c0a25e68..4c88751e 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -5535,6 +5535,111 @@ static int gk20a_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc)
 	return 0;
 }
 
+static int gk20a_gr_update_sm_error_state(struct gk20a *g,
+		struct channel_gk20a *ch, u32 sm_id,
+		struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state)
+{
+	u32 gpc, tpc, offset;
+	struct gr_gk20a *gr = &g->gr;
+	struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx;
+	int err = 0;
+
+	mutex_lock(&g->dbg_sessions_lock);
+
+	gr->sm_error_states[sm_id].hww_global_esr =
+			sm_error_state->hww_global_esr;
+	gr->sm_error_states[sm_id].hww_warp_esr =
+			sm_error_state->hww_warp_esr;
+	gr->sm_error_states[sm_id].hww_global_esr_report_mask =
+			sm_error_state->hww_global_esr_report_mask;
+	gr->sm_error_states[sm_id].hww_warp_esr_report_mask =
+			sm_error_state->hww_warp_esr_report_mask;
+
+	err = gr_gk20a_disable_ctxsw(g);
+	if (err) {
+		gk20a_err(dev_from_gk20a(g), "unable to stop gr ctxsw\n");
+		goto fail;
+	}
+
+	gpc = g->gr.sm_to_cluster[sm_id].gpc_index;
+	tpc = g->gr.sm_to_cluster[sm_id].tpc_index;
+
+	offset = proj_gpc_stride_v() * gpc +
+			proj_tpc_in_gpc_stride_v() * tpc;
+
+	if (gk20a_is_channel_ctx_resident(ch)) {
+		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset,
+				gr->sm_error_states[sm_id].hww_global_esr);
+		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset,
+				gr->sm_error_states[sm_id].hww_warp_esr);
+		gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r() + offset,
+				gr->sm_error_states[sm_id].hww_global_esr_report_mask);
+		gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r() + offset,
+				gr->sm_error_states[sm_id].hww_warp_esr_report_mask);
+	} else {
+		err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
+		if (err)
+			goto enable_ctxsw;
+
+		gr_gk20a_ctx_patch_write(g, ch_ctx,
+				gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r() + offset,
+				gr->sm_error_states[sm_id].hww_global_esr_report_mask,
+				true);
+		gr_gk20a_ctx_patch_write(g, ch_ctx,
+				gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r() + offset,
+				gr->sm_error_states[sm_id].hww_warp_esr_report_mask,
+				true);
+
+		gr_gk20a_ctx_patch_write_end(g, ch_ctx);
+	}
+
+enable_ctxsw:
+	err = gr_gk20a_enable_ctxsw(g);
+
+fail:
+	mutex_unlock(&g->dbg_sessions_lock);
+	return err;
+}
+
+static int gk20a_gr_clear_sm_error_state(struct gk20a *g,
+		struct channel_gk20a *ch, u32 sm_id)
+{
+	u32 gpc, tpc, offset;
+	u32 val;
+	struct gr_gk20a *gr = &g->gr;
+	int err = 0;
+
+	mutex_lock(&g->dbg_sessions_lock);
+
+	memset(&gr->sm_error_states[sm_id], 0, sizeof(*gr->sm_error_states));
+
+	err = gr_gk20a_disable_ctxsw(g);
+	if (err) {
+		gk20a_err(dev_from_gk20a(g), "unable to stop gr ctxsw\n");
+		goto fail;
+	}
+
+	if (gk20a_is_channel_ctx_resident(ch)) {
+		gpc = g->gr.sm_to_cluster[sm_id].gpc_index;
+		tpc = g->gr.sm_to_cluster[sm_id].tpc_index;
+
+		offset = proj_gpc_stride_v() * gpc +
+				proj_tpc_in_gpc_stride_v() * tpc;
+
+		val = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset);
+		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset,
+				val);
+		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset,
+				0);
+	}
+
+	err = gr_gk20a_enable_ctxsw(g);
+
+fail:
+	mutex_unlock(&g->dbg_sessions_lock);
+	return err;
+}
+
 int gr_gk20a_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
 		bool *post_event, struct channel_gk20a *fault_ch)
 {
@@ -8415,4 +8520,6 @@ void gk20a_init_gr_ops(struct gpu_ops *gops)
 	gops->gr.update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode;
 	gops->gr.update_hwpm_ctxsw_mode = gr_gk20a_update_hwpm_ctxsw_mode;
 	gops->gr.record_sm_error_state = gk20a_gr_record_sm_error_state;
+	gops->gr.update_sm_error_state = gk20a_gr_update_sm_error_state;
+	gops->gr.clear_sm_error_state = gk20a_gr_clear_sm_error_state;
 }
diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
index eeb70d76..204a90f3 100644
--- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
@@ -1219,6 +1219,115 @@ static int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc)
 	return 0;
 }
 
+static int gm20b_gr_update_sm_error_state(struct gk20a *g,
+		struct channel_gk20a *ch, u32 sm_id,
+		struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state)
+{
+	u32 gpc, tpc, offset;
+	struct gr_gk20a *gr = &g->gr;
+	struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx;
+	int err = 0;
+
+	mutex_lock(&g->dbg_sessions_lock);
+
+	gr->sm_error_states[sm_id].hww_global_esr =
+			sm_error_state->hww_global_esr;
+	gr->sm_error_states[sm_id].hww_warp_esr =
+			sm_error_state->hww_warp_esr;
+	gr->sm_error_states[sm_id].hww_warp_esr_pc =
+			sm_error_state->hww_warp_esr_pc;
+	gr->sm_error_states[sm_id].hww_global_esr_report_mask =
+			sm_error_state->hww_global_esr_report_mask;
+	gr->sm_error_states[sm_id].hww_warp_esr_report_mask =
+			sm_error_state->hww_warp_esr_report_mask;
+
+	err = gr_gk20a_disable_ctxsw(g);
+	if (err) {
+		gk20a_err(dev_from_gk20a(g), "unable to stop gr ctxsw\n");
+		goto fail;
+	}
+
+	gpc = g->gr.sm_to_cluster[sm_id].gpc_index;
+	tpc = g->gr.sm_to_cluster[sm_id].tpc_index;
+
+	offset = proj_gpc_stride_v() * gpc +
+			proj_tpc_in_gpc_stride_v() * tpc;
+
+	if (gk20a_is_channel_ctx_resident(ch)) {
+		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset,
+				gr->sm_error_states[sm_id].hww_global_esr);
+		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset,
+				gr->sm_error_states[sm_id].hww_warp_esr);
+		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_pc_r() + offset,
+				gr->sm_error_states[sm_id].hww_warp_esr_pc);
+		gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r() + offset,
+				gr->sm_error_states[sm_id].hww_global_esr_report_mask);
+		gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r() + offset,
+				gr->sm_error_states[sm_id].hww_warp_esr_report_mask);
+	} else {
+		err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
+		if (err)
+			goto enable_ctxsw;
+
+		gr_gk20a_ctx_patch_write(g, ch_ctx,
+				gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r() + offset,
+				gr->sm_error_states[sm_id].hww_global_esr_report_mask,
+				true);
+		gr_gk20a_ctx_patch_write(g, ch_ctx,
+				gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r() + offset,
+				gr->sm_error_states[sm_id].hww_warp_esr_report_mask,
+				true);
+
+		gr_gk20a_ctx_patch_write_end(g, ch_ctx);
+	}
+
+enable_ctxsw:
+	err = gr_gk20a_enable_ctxsw(g);
+
+fail:
+	mutex_unlock(&g->dbg_sessions_lock);
+	return err;
+}
+
+static int gm20b_gr_clear_sm_error_state(struct gk20a *g,
+		struct channel_gk20a *ch, u32 sm_id)
+{
+	u32 gpc, tpc, offset;
+	u32 val;
+	struct gr_gk20a *gr = &g->gr;
+	int err = 0;
+
+	mutex_lock(&g->dbg_sessions_lock);
+
+	memset(&gr->sm_error_states[sm_id], 0, sizeof(*gr->sm_error_states));
+
+	err = gr_gk20a_disable_ctxsw(g);
+	if (err) {
+		gk20a_err(dev_from_gk20a(g), "unable to stop gr ctxsw\n");
+		goto fail;
+	}
+
+	if (gk20a_is_channel_ctx_resident(ch)) {
+		gpc = g->gr.sm_to_cluster[sm_id].gpc_index;
+		tpc = g->gr.sm_to_cluster[sm_id].tpc_index;
+
+		offset = proj_gpc_stride_v() * gpc +
+				proj_tpc_in_gpc_stride_v() * tpc;
+
+		val = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset);
+		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset,
+				val);
+		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset,
+				0);
+	}
+
+	err = gr_gk20a_enable_ctxsw(g);
+
+fail:
+	mutex_unlock(&g->dbg_sessions_lock);
+	return err;
+}
+
 void gm20b_init_gr(struct gpu_ops *gops)
 {
 	gops->gr.init_gpc_mmu = gr_gm20b_init_gpc_mmu;
@@ -1286,4 +1395,6 @@ void gm20b_init_gr(struct gpu_ops *gops)
 	gops->gr.update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode;
 	gops->gr.update_hwpm_ctxsw_mode = gr_gk20a_update_hwpm_ctxsw_mode;
 	gops->gr.record_sm_error_state = gm20b_gr_record_sm_error_state;
+	gops->gr.update_sm_error_state = gm20b_gr_update_sm_error_state;
+	gops->gr.clear_sm_error_state = gm20b_gr_clear_sm_error_state;
 }
--
cgit v1.2.2
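
For reference, the sketch below shows how a userspace debugger client might exercise the two new IOCTLs. It is a minimal illustration, not part of the patch: only the IOCTL names, the argument structs, and their fields (sm_id, sm_error_state_record_mem, sm_error_state_record_size, hww_global_esr_report_mask) are taken from the code above. The uapi header name, the helper name clear_and_write_sm_state, and the assumption that dbg_fd is a debugger-session fd already bound to a channel (the handlers dereference dbg_s->ch) are illustrative and may differ on a real system.

/*
 * Illustrative userspace sketch (not part of this patch).  Assumes a
 * dbg-gpu session fd already bound to a channel; the uapi header path
 * is an assumption.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/nvgpu.h>	/* assumed location of the dbg-gpu uapi */

static int clear_and_write_sm_state(int dbg_fd, uint32_t sm_id)
{
	struct nvgpu_dbg_gpu_clear_single_sm_error_state_args clear_args;
	struct nvgpu_dbg_gpu_write_single_sm_error_state_args write_args;
	struct nvgpu_dbg_gpu_sm_error_state_record record;

	/* Drop the driver's saved error state (and, if the channel context
	 * is resident, the HW ESR registers) for one SM. */
	memset(&clear_args, 0, sizeof(clear_args));
	clear_args.sm_id = sm_id;
	if (ioctl(dbg_fd, NVGPU_DBG_GPU_IOCTL_CLEAR_SINGLE_SM_ERROR_STATE,
		  &clear_args) != 0) {
		perror("clear single sm error state");
		return -1;
	}

	/* Write back a record; here only the global ESR report mask is set. */
	memset(&record, 0, sizeof(record));
	record.hww_global_esr_report_mask = ~0u;

	memset(&write_args, 0, sizeof(write_args));
	write_args.sm_id = sm_id;
	write_args.sm_error_state_record_mem = (uintptr_t)&record;
	write_args.sm_error_state_record_size = sizeof(record);
	if (ioctl(dbg_fd, NVGPU_DBG_GPU_IOCTL_WRITE_SINGLE_SM_ERROR_STATE,
		  &write_args) != 0) {
		perror("write single sm error state");
		return -1;
	}

	return 0;
}

Note the forgiving write path in the handler above: it zero-allocates the kernel-side record and copies only min(sm_error_state_record_size, sizeof(record)) bytes, so a shorter (older) record layout is accepted and a record_size of 0 writes an all-zero record.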