From bfe65407bde2b5d0776724301e215c6553c989f3 Mon Sep 17 00:00:00 2001
From: Vinod G
Date: Tue, 7 Aug 2018 23:09:30 -0700
Subject: gpu: nvgpu: Read sm error ioctl support for tsg

Add READ_SM_ERROR IOCTL support at the TSG level.

Move the struct that saves the sm_error details from gr to tsg,
since sm_error support is context based, not global.

Also correct a MISRA 21.1 error in the header file.

The nvgpu_dbg_gpu_ioctl_write_single_sm_error_state and
nvgpu_dbg_gpu_ioctl_read_single_sm_error_state functions are modified
to use the tsg struct nvgpu_tsg_sm_error_state.

Bug 200412642

Change-Id: I9e334b059078a4bb0e360b945444cc4bf1cc56ec
Signed-off-by: Vinod G
Reviewed-on: https://git-master.nvidia.com/r/1794856
Reviewed-by: svc-misra-checker
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom
Reviewed-by: mobile promotions
Tested-by: mobile promotions
---
 drivers/gpu/nvgpu/gk20a/gk20a.h                   |   2 +-
 drivers/gpu/nvgpu/gk20a/gr_gk20a.c                |  30 ------
 drivers/gpu/nvgpu/gk20a/gr_gk20a.h                |   9 --
 drivers/gpu/nvgpu/gk20a/tsg_gk20a.c               |  82 ++++++++++++++--
 drivers/gpu/nvgpu/gk20a/tsg_gk20a.h               |  21 ++++-
 drivers/gpu/nvgpu/gm20b/gr_gm20b.c                | 106 +++++++++++++--------
 drivers/gpu/nvgpu/gm20b/gr_gm20b.h                |   2 +-
 drivers/gpu/nvgpu/gv11b/gr_gv11b.c                | 109 +++++++++++++---------
 drivers/gpu/nvgpu/gv11b/gr_gv11b.h                |   4 +-
 drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h |   5 +-
 drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c           |  55 -----------
 drivers/gpu/nvgpu/os/linux/ioctl_dbg.c            |  50 +++++++---
 drivers/gpu/nvgpu/os/linux/ioctl_tsg.c            |  58 ++++++++++++
 drivers/gpu/nvgpu/vgpu/gr_vgpu.c                  |  36 ++++---
 14 files changed, 349 insertions(+), 220 deletions(-)

diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index cf202f14..192f4c3e 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -396,7 +396,7 @@ struct gpu_ops {
 			u32 sm, struct channel_gk20a *fault_ch);
 	int (*update_sm_error_state)(struct gk20a *g,
 			struct channel_gk20a *ch, u32 sm_id,
-			struct nvgpu_gr_sm_error_state *sm_error_state);
+			struct nvgpu_tsg_sm_error_state *sm_error_state);
 	int (*clear_sm_error_state)(struct gk20a *g,
 			struct channel_gk20a *ch, u32 sm_id);
 	int (*suspend_contexts)(struct gk20a *g,
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index f2b083d7..cdc00bbd 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -1561,19 +1561,6 @@ restore_fe_go_idle:
 	if (err)
 		goto clean_up;
 
-	nvgpu_kfree(g, gr->sm_error_states);
-
-	/* we need to allocate this after g->ops.gr.init_fs_state() since
-	 * we initialize gr->no_of_sm in this function
-	 */
-	gr->sm_error_states = nvgpu_kzalloc(g,
-			sizeof(struct nvgpu_gr_sm_error_state)
-			* gr->no_of_sm);
-	if (!gr->sm_error_states) {
-		err = -ENOMEM;
-		goto restore_fe_go_idle;
-	}
-
 	ctx_header_words = roundup(ctx_header_bytes, sizeof(u32));
 	ctx_header_words >>= 2;
@@ -3072,7 +3059,6 @@ static void gk20a_remove_gr_support(struct gr_gk20a *gr)
 
 	memset(&gr->compbit_store, 0, sizeof(struct compbit_store_desc));
 
-	nvgpu_kfree(g, gr->sm_error_states);
 	nvgpu_kfree(g, gr->gpc_tpc_count);
 	nvgpu_kfree(g, gr->gpc_zcb_count);
 	nvgpu_kfree(g, gr->gpc_ppc_count);
@@ -4545,22 +4531,6 @@ restore_fe_go_idle:
 	err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
 			GR_IDLE_CHECK_DEFAULT);
-	if (err)
-		goto out;
-
-	nvgpu_kfree(g, gr->sm_error_states);
-
-	/* we need to allocate this after g->ops.gr.init_fs_state() since
-	 * we initialize gr->no_of_sm in this function
-	 */
-	gr->sm_error_states = nvgpu_kzalloc(g,
-			sizeof(struct nvgpu_gr_sm_error_state) *
-			gr->no_of_sm);
-	if (!gr->sm_error_states) {
-		err = -ENOMEM;
-		goto restore_fe_go_idle;
-	}
-
 out:
 	nvgpu_log_fn(g, "done");
 	return err;
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
index 3fc7e55f..bd5e625d 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
@@ -254,14 +254,6 @@ struct nvgpu_preemption_modes_rec {
 	u32 default_compute_preempt_mode; /* default mode */
 };
 
-struct nvgpu_gr_sm_error_state {
-	u32 hww_global_esr;
-	u32 hww_warp_esr;
-	u64 hww_warp_esr_pc;
-	u32 hww_global_esr_report_mask;
-	u32 hww_warp_esr_report_mask;
-};
-
 struct gr_gk20a {
 	struct gk20a *g;
 	struct {
@@ -427,7 +419,6 @@ struct gr_gk20a {
 	u32 *fbp_rop_l2_en_mask;
 	u32 no_of_sm;
 	struct sm_info *sm_to_cluster;
-	struct nvgpu_gr_sm_error_state *sm_error_states;
 
 #define NVGPU_SM_EXCEPTION_TYPE_MASK_NONE (0x0U)
 #define NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL (0x1U << 0)
diff --git a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c
index 62763da3..624ee1d7 100644
--- a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c
@@ -275,8 +275,23 @@ struct tsg_gk20a *gk20a_tsg_open(struct gk20a *g, pid_t pid)
 	int err;
 
 	tsg = gk20a_tsg_acquire_unused_tsg(&g->fifo);
-	if (!tsg)
+	if (tsg == NULL) {
 		return NULL;
+	}
+
+	/* we need to allocate this after g->ops.gr.init_fs_state() since
+	 * we initialize gr->no_of_sm in this function
+	 */
+	if (g->gr.no_of_sm == 0U) {
+		nvgpu_err(g, "no_of_sm %d not set, failed allocation",
+			g->gr.no_of_sm);
+		return NULL;
+	}
+
+	err = gk20a_tsg_alloc_sm_error_states_mem(g, tsg, g->gr.no_of_sm);
+	if (err != 0) {
+		return NULL;
+	}
 
 	tsg->g = g;
 	tsg->num_active_channels = 0;
@@ -295,7 +310,7 @@ struct tsg_gk20a *gk20a_tsg_open(struct gk20a *g, pid_t pid)
 
 	if (g->ops.fifo.tsg_open) {
 		err = g->ops.fifo.tsg_open(tsg);
-		if (err) {
+		if (err != 0) {
 			nvgpu_err(g, "tsg %d fifo open failed %d",
 				tsg->tsgid, err);
 			goto clean_up;
@@ -307,6 +322,12 @@ struct tsg_gk20a *gk20a_tsg_open(struct gk20a *g, pid_t pid)
 	return tsg;
 
 clean_up:
+
+	if(tsg->sm_error_states != NULL) {
+		nvgpu_kfree(g, tsg->sm_error_states);
+		tsg->sm_error_states = NULL;
+	}
+
 	nvgpu_ref_put(&tsg->refcount, gk20a_tsg_release);
 	return NULL;
 }
@@ -317,20 +338,28 @@ void gk20a_tsg_release(struct nvgpu_ref *ref)
 	struct gk20a *g = tsg->g;
 	struct gk20a_event_id_data *event_id_data, *event_id_data_temp;
 
-	if (g->ops.fifo.tsg_release)
+	if (g->ops.fifo.tsg_release != NULL) {
 		g->ops.fifo.tsg_release(tsg);
+	}
 
-	if (nvgpu_mem_is_valid(&tsg->gr_ctx.mem))
+	if (nvgpu_mem_is_valid(&tsg->gr_ctx.mem)) {
 		gr_gk20a_free_tsg_gr_ctx(tsg);
+	}
 
-	if (g->ops.fifo.deinit_eng_method_buffers)
+	if (g->ops.fifo.deinit_eng_method_buffers != NULL) {
 		g->ops.fifo.deinit_eng_method_buffers(g, tsg);
+	}
 
-	if (tsg->vm) {
+	if (tsg->vm != NULL) {
 		nvgpu_vm_put(tsg->vm);
 		tsg->vm = NULL;
 	}
 
+	if(tsg->sm_error_states != NULL) {
+		nvgpu_kfree(g, tsg->sm_error_states);
+		tsg->sm_error_states = NULL;
+	}
+
 	/* unhook all events created on this TSG */
 	nvgpu_mutex_acquire(&tsg->event_id_list_lock);
 	nvgpu_list_for_each_entry_safe(event_id_data, event_id_data_temp,
@@ -360,3 +389,44 @@ struct tsg_gk20a *tsg_gk20a_from_ch(struct channel_gk20a *ch)
 
 	return tsg;
 }
+
+int gk20a_tsg_alloc_sm_error_states_mem(struct gk20a *g,
+					struct tsg_gk20a *tsg,
+					u32 num_sm)
+{
+	int err = 0;
+
+	if (tsg->sm_error_states != NULL) {
+		return err;
+	}
+
+	tsg->sm_error_states = nvgpu_kzalloc(g,
+			sizeof(struct nvgpu_tsg_sm_error_state)
+			* num_sm);
+	if (tsg->sm_error_states == NULL) {
+		nvgpu_err(g, "sm_error_states mem allocation failed");
+		err = -ENOMEM;
+	}
+
+	return err;
+}
+
+void gk20a_tsg_update_sm_error_state_locked(struct tsg_gk20a *tsg,
+			u32 sm_id,
+			struct nvgpu_tsg_sm_error_state *sm_error_state)
+{
+	struct nvgpu_tsg_sm_error_state *tsg_sm_error_states;
+
+	tsg_sm_error_states = tsg->sm_error_states + sm_id;
+
+	tsg_sm_error_states->hww_global_esr =
+		sm_error_state->hww_global_esr;
+	tsg_sm_error_states->hww_warp_esr =
+		sm_error_state->hww_warp_esr;
+	tsg_sm_error_states->hww_warp_esr_pc =
+		sm_error_state->hww_warp_esr_pc;
+	tsg_sm_error_states->hww_global_esr_report_mask =
+		sm_error_state->hww_global_esr_report_mask;
+	tsg_sm_error_states->hww_warp_esr_report_mask =
+		sm_error_state->hww_warp_esr_report_mask;
+}
diff --git a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h
index 552c3bb3..67ccb9f5 100644
--- a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h
@@ -19,8 +19,8 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
  */
-#ifndef __TSG_GK20A_H_
-#define __TSG_GK20A_H_
+#ifndef TSG_GK20A_H
+#define TSG_GK20A_H
 
 #include 
 #include 
@@ -39,6 +39,14 @@ void gk20a_tsg_release(struct nvgpu_ref *ref);
 int gk20a_init_tsg_support(struct gk20a *g, u32 tsgid);
 struct tsg_gk20a *tsg_gk20a_from_ch(struct channel_gk20a *ch);
 
+struct nvgpu_tsg_sm_error_state {
+	u32 hww_global_esr;
+	u32 hww_warp_esr;
+	u64 hww_warp_esr_pc;
+	u32 hww_global_esr_report_mask;
+	u32 hww_warp_esr_report_mask;
+};
+
 struct tsg_gk20a {
 	struct gk20a *g;
 
@@ -69,6 +77,7 @@ struct tsg_gk20a {
 	bool tpc_num_initialized;
 
 	bool in_use;
+	struct nvgpu_tsg_sm_error_state *sm_error_states;
 };
 
 int gk20a_enable_tsg(struct tsg_gk20a *tsg);
@@ -84,6 +93,12 @@ int gk20a_tsg_set_timeslice(struct tsg_gk20a *tsg, u32 timeslice);
 u32 gk20a_tsg_get_timeslice(struct tsg_gk20a *tsg);
 int gk20a_tsg_set_priority(struct gk20a *g, struct tsg_gk20a *tsg,
 				u32 priority);
+int gk20a_tsg_alloc_sm_error_states_mem(struct gk20a *g,
+					struct tsg_gk20a *tsg,
+					u32 num_sm);
+void gk20a_tsg_update_sm_error_state_locked(struct tsg_gk20a *tsg,
+			u32 sm_id,
+			struct nvgpu_tsg_sm_error_state *sm_error_state);
 
 struct gk20a_event_id_data {
 	struct gk20a *g;
@@ -106,4 +121,4 @@ gk20a_event_id_data_from_event_id_node(struct nvgpu_list_node *node)
 		((uintptr_t)node - offsetof(struct gk20a_event_id_data, event_id_node));
 };
 
-#endif /* __TSG_GK20A_H_ */
+#endif /* TSG_GK20A_H */
diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
index 68ae91e8..fc4ab3dd 100644
--- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
@@ -1268,32 +1268,68 @@ void gr_gm20b_get_access_map(struct gk20a *g,
 	*num_entries = ARRAY_SIZE(wl_addr_gm20b);
 }
 
+static void gm20b_gr_read_sm_error_state(struct gk20a *g,
+			u32 offset,
+			struct nvgpu_tsg_sm_error_state *sm_error_states)
+{
+	sm_error_states->hww_global_esr = gk20a_readl(g,
+		gr_gpc0_tpc0_sm_hww_global_esr_r() + offset);
+	sm_error_states->hww_warp_esr = gk20a_readl(g,
+		gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset);
+	sm_error_states->hww_warp_esr_pc = (u64)(gk20a_readl(g,
+		gr_gpc0_tpc0_sm_hww_warp_esr_pc_r() + offset));
+	sm_error_states->hww_global_esr_report_mask = gk20a_readl(g,
+		gr_gpc0_tpc0_sm_hww_global_esr_report_mask_r() + offset);
+	sm_error_states->hww_warp_esr_report_mask = gk20a_readl(g,
+		gr_gpc0_tpc0_sm_hww_warp_esr_report_mask_r() + offset);
+
+}
+
+static void gm20b_gr_write_sm_error_state(struct gk20a *g,
+			u32 offset,
+			struct nvgpu_tsg_sm_error_state *sm_error_states)
+{
+	gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset,
+			sm_error_states->hww_global_esr);
+	gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset,
+			sm_error_states->hww_warp_esr);
+	gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_pc_r() + offset,
+			u64_lo32(sm_error_states->hww_warp_esr_pc));
+	gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r() + offset,
+			sm_error_states->hww_global_esr_report_mask);
+	gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r() + offset,
+			sm_error_states->hww_warp_esr_report_mask);
+}
+
 int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
 				struct channel_gk20a *fault_ch)
 {
 	int sm_id;
-	struct gr_gk20a *gr = &g->gr;
 	u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
 	u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
 	u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc;
+	struct nvgpu_tsg_sm_error_state *sm_error_states = NULL;
+	struct tsg_gk20a *tsg = NULL;
 
 	nvgpu_mutex_acquire(&g->dbg_sessions_lock);
 
 	sm_id = gr_gpc0_tpc0_sm_cfg_sm_id_v(gk20a_readl(g,
 			gr_gpc0_tpc0_sm_cfg_r() + offset));
 
-	gr->sm_error_states[sm_id].hww_global_esr = gk20a_readl(g,
-			gr_gpc0_tpc0_sm_hww_global_esr_r() + offset);
-	gr->sm_error_states[sm_id].hww_warp_esr = gk20a_readl(g,
-			gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset);
-	gr->sm_error_states[sm_id].hww_warp_esr_pc = gk20a_readl(g,
-			gr_gpc0_tpc0_sm_hww_warp_esr_pc_r() + offset);
-	gr->sm_error_states[sm_id].hww_global_esr_report_mask = gk20a_readl(g,
-			gr_gpc0_tpc0_sm_hww_global_esr_report_mask_r() + offset);
-	gr->sm_error_states[sm_id].hww_warp_esr_report_mask = gk20a_readl(g,
-			gr_gpc0_tpc0_sm_hww_warp_esr_report_mask_r() + offset);
+	if (fault_ch != NULL) {
+		tsg = tsg_gk20a_from_ch(fault_ch);
+	}
+
+	if (tsg == NULL) {
+		nvgpu_err(g, "no valid tsg");
+		goto record_fail;
+	}
+
+	sm_error_states = tsg->sm_error_states + sm_id;
+	gm20b_gr_read_sm_error_state(g, offset, sm_error_states);
 
+record_fail:
 	nvgpu_mutex_release(&g->dbg_sessions_lock);
 
 	return sm_id;
@@ -1301,12 +1337,12 @@ int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
 
 int gm20b_gr_update_sm_error_state(struct gk20a *g,
 		struct channel_gk20a *ch, u32 sm_id,
-		struct nvgpu_gr_sm_error_state *sm_error_state)
+		struct nvgpu_tsg_sm_error_state *sm_error_state)
 {
 	u32 gpc, tpc, offset;
-	struct gr_gk20a *gr = &g->gr;
 	struct tsg_gk20a *tsg;
 	struct nvgpu_gr_ctx *ch_ctx;
+	struct nvgpu_tsg_sm_error_state *tsg_sm_error_states;
 	u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
 	u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
@@ -1320,16 +1356,8 @@ int gm20b_gr_update_sm_error_state(struct gk20a *g,
 
 	nvgpu_mutex_acquire(&g->dbg_sessions_lock);
 
-	gr->sm_error_states[sm_id].hww_global_esr =
-		sm_error_state->hww_global_esr;
-	gr->sm_error_states[sm_id].hww_warp_esr =
-		sm_error_state->hww_warp_esr;
-	gr->sm_error_states[sm_id].hww_warp_esr_pc =
-		sm_error_state->hww_warp_esr_pc;
-	gr->sm_error_states[sm_id].hww_global_esr_report_mask =
-		sm_error_state->hww_global_esr_report_mask;
-	gr->sm_error_states[sm_id].hww_warp_esr_report_mask =
-		sm_error_state->hww_warp_esr_report_mask;
+	tsg_sm_error_states = tsg->sm_error_states + sm_id;
+	gk20a_tsg_update_sm_error_state_locked(tsg, sm_id, sm_error_state);
 
 	err = gr_gk20a_disable_ctxsw(g);
 	if (err) {
@@ -1343,29 +1371,20 @@ int gm20b_gr_update_sm_error_state(struct gk20a *g,
 	offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc;
 
 	if (gk20a_is_channel_ctx_resident(ch)) {
-		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset,
-				gr->sm_error_states[sm_id].hww_global_esr);
-		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset,
-				gr->sm_error_states[sm_id].hww_warp_esr);
-		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_pc_r() + offset,
-				gr->sm_error_states[sm_id].hww_warp_esr_pc);
-		gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r() + offset,
-				gr->sm_error_states[sm_id].hww_global_esr_report_mask);
-		gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r() + offset,
-				gr->sm_error_states[sm_id].hww_warp_esr_report_mask);
+		gm20b_gr_write_sm_error_state(g, offset, tsg_sm_error_states);
 	} else {
 		err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx, false);
 		if (err)
 			goto enable_ctxsw;
 
 		gr_gk20a_ctx_patch_write(g, ch_ctx,
-				gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r() + offset,
-				gr->sm_error_states[sm_id].hww_global_esr_report_mask,
-				true);
+			gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r() + offset,
+			tsg_sm_error_states->hww_global_esr_report_mask,
+			true);
 		gr_gk20a_ctx_patch_write(g, ch_ctx,
-				gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r() + offset,
-				gr->sm_error_states[sm_id].hww_warp_esr_report_mask,
-				true);
+			gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r() + offset,
+			tsg_sm_error_states->hww_warp_esr_report_mask,
+			true);
 
 		gr_gk20a_ctx_patch_write_end(g, ch_ctx, false);
 	}
@@ -1383,15 +1402,20 @@ int gm20b_gr_clear_sm_error_state(struct gk20a *g,
 {
 	u32 gpc, tpc, offset;
 	u32 val;
-	struct gr_gk20a *gr = &g->gr;
+	struct tsg_gk20a *tsg;
 	u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
 	u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
 	int err = 0;
 
+	tsg = tsg_gk20a_from_ch(ch);
+	if (tsg == NULL) {
+		return -EINVAL;
+	}
+
 	nvgpu_mutex_acquire(&g->dbg_sessions_lock);
 
-	memset(&gr->sm_error_states[sm_id], 0, sizeof(*gr->sm_error_states));
+	memset(&tsg->sm_error_states[sm_id], 0, sizeof(*tsg->sm_error_states));
 
 	err = gr_gk20a_disable_ctxsw(g);
 	if (err) {
diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.h b/drivers/gpu/nvgpu/gm20b/gr_gm20b.h
index 9d8e5cdf..7c3baa59 100644
--- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.h
+++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.h
@@ -119,7 +119,7 @@ int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
 		struct channel_gk20a *fault_ch);
 int gm20b_gr_update_sm_error_state(struct gk20a *g,
 		struct channel_gk20a *ch, u32 sm_id,
-		struct nvgpu_gr_sm_error_state *sm_error_state);
+		struct nvgpu_tsg_sm_error_state *sm_error_state);
 int gm20b_gr_clear_sm_error_state(struct gk20a *g,
 		struct channel_gk20a *ch, u32 sm_id);
 int gr_gm20b_get_preemption_mode_flags(struct gk20a *g,
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
index 1e001824..bc659a7b 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -3212,18 +3212,42 @@ void gv11b_gr_bpt_reg_info(struct gk20a *g, struct nvgpu_warpstate *w_state)
 	}
 }
 
+static void gv11b_gr_write_sm_error_state(struct gk20a *g,
+		u32 offset,
+		struct nvgpu_tsg_sm_error_state *sm_error_states)
+{
+	nvgpu_writel(g,
+		gr_gpc0_tpc0_sm0_hww_global_esr_r() + offset,
+		sm_error_states->hww_global_esr);
+	nvgpu_writel(g,
+		gr_gpc0_tpc0_sm0_hww_warp_esr_r() + offset,
+		sm_error_states->hww_warp_esr);
+	nvgpu_writel(g,
+		gr_gpc0_tpc0_sm0_hww_warp_esr_pc_r() + offset,
+		u64_lo32(sm_error_states->hww_warp_esr_pc));
+	nvgpu_writel(g,
+		gr_gpc0_tpc0_sm0_hww_warp_esr_pc_hi_r() + offset,
+		u64_hi32(sm_error_states->hww_warp_esr_pc));
+	nvgpu_writel(g,
+		gr_gpc0_tpc0_sm0_hww_global_esr_report_mask_r() + offset,
+		sm_error_states->hww_global_esr_report_mask);
+	nvgpu_writel(g,
+		gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_r() + offset,
+		sm_error_states->hww_warp_esr_report_mask);
+}
+
 int gv11b_gr_update_sm_error_state(struct gk20a *g,
 		struct channel_gk20a *ch, u32 sm_id,
-		struct nvgpu_gr_sm_error_state *sm_error_state)
+		struct nvgpu_tsg_sm_error_state *sm_error_state)
 {
 	struct tsg_gk20a *tsg;
 	u32 gpc, tpc, sm, offset;
-	struct gr_gk20a *gr = &g->gr;
 	struct nvgpu_gr_ctx *ch_ctx;
 	int err = 0;
+	struct nvgpu_tsg_sm_error_state *tsg_sm_error_states;
 
 	tsg = tsg_gk20a_from_ch(ch);
-	if (!tsg) {
+	if (tsg == NULL) {
 		return -EINVAL;
 	}
 
@@ -3231,16 +3255,8 @@ int gv11b_gr_update_sm_error_state(struct gk20a *g,
 
 	nvgpu_mutex_acquire(&g->dbg_sessions_lock);
 
-	gr->sm_error_states[sm_id].hww_global_esr =
-		sm_error_state->hww_global_esr;
-	gr->sm_error_states[sm_id].hww_warp_esr =
-		sm_error_state->hww_warp_esr;
-	gr->sm_error_states[sm_id].hww_warp_esr_pc =
-		sm_error_state->hww_warp_esr_pc;
-	gr->sm_error_states[sm_id].hww_global_esr_report_mask =
-		sm_error_state->hww_global_esr_report_mask;
-	gr->sm_error_states[sm_id].hww_warp_esr_report_mask =
-		sm_error_state->hww_warp_esr_report_mask;
+	tsg_sm_error_states = tsg->sm_error_states + sm_id;
+	gk20a_tsg_update_sm_error_state_locked(tsg, sm_id, sm_error_state);
 
 	err = gr_gk20a_disable_ctxsw(g);
 	if (err) {
@@ -3257,21 +3273,7 @@ int gv11b_gr_update_sm_error_state(struct gk20a *g,
 			gv11b_gr_sm_offset(g, sm);
 
 	if (gk20a_is_channel_ctx_resident(ch)) {
-		gk20a_writel(g,
-			gr_gpc0_tpc0_sm0_hww_global_esr_r() + offset,
-			gr->sm_error_states[sm_id].hww_global_esr);
-		gk20a_writel(g,
-			gr_gpc0_tpc0_sm0_hww_warp_esr_r() + offset,
-			gr->sm_error_states[sm_id].hww_warp_esr);
-		gk20a_writel(g,
-			gr_gpc0_tpc0_sm0_hww_warp_esr_pc_r() + offset,
-			gr->sm_error_states[sm_id].hww_warp_esr_pc);
-		gk20a_writel(g,
-			gr_gpc0_tpc0_sm0_hww_global_esr_report_mask_r() + offset,
-			gr->sm_error_states[sm_id].hww_global_esr_report_mask);
-		gk20a_writel(g,
-			gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_r() + offset,
-			gr->sm_error_states[sm_id].hww_warp_esr_report_mask);
+		gv11b_gr_write_sm_error_state(g, offset, tsg_sm_error_states);
 	} else {
 		err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx, false);
 		if (err) {
@@ -3281,12 +3283,12 @@ int gv11b_gr_update_sm_error_state(struct gk20a *g,
 
 		gr_gk20a_ctx_patch_write(g, ch_ctx,
 				gr_gpcs_tpcs_sms_hww_global_esr_report_mask_r() + offset,
-				gr->sm_error_states[sm_id].hww_global_esr_report_mask,
+				tsg_sm_error_states->hww_global_esr_report_mask,
 				true);
 		gr_gk20a_ctx_patch_write(g, ch_ctx,
 				gr_gpcs_tpcs_sms_hww_warp_esr_report_mask_r() + offset,
-				gr->sm_error_states[sm_id].hww_warp_esr_report_mask,
+				tsg_sm_error_states->hww_warp_esr_report_mask,
 				true);
 
 		gr_gk20a_ctx_patch_write_end(g, ch_ctx, false);
@@ -3362,13 +3364,36 @@ int gv11b_gr_set_sm_debug_mode(struct gk20a *g,
 	return err;
 }
 
+static void gv11b_gr_read_sm_error_state(struct gk20a *g,
+		u32 offset,
+		struct nvgpu_tsg_sm_error_state *sm_error_states)
+{
+	sm_error_states->hww_global_esr = nvgpu_readl(g,
+		gr_gpc0_tpc0_sm0_hww_global_esr_r() + offset);
+
+	sm_error_states->hww_warp_esr = nvgpu_readl(g,
+		gr_gpc0_tpc0_sm0_hww_warp_esr_r() + offset);
+
+	sm_error_states->hww_warp_esr_pc = hi32_lo32_to_u64((nvgpu_readl(g,
+		gr_gpc0_tpc0_sm0_hww_warp_esr_pc_hi_r() + offset)),
+		(nvgpu_readl(g,
+			gr_gpc0_tpc0_sm0_hww_warp_esr_pc_r() + offset)));
+
+	sm_error_states->hww_global_esr_report_mask = nvgpu_readl(g,
+		gr_gpc0_tpc0_sm0_hww_global_esr_report_mask_r() + offset);
+
+	sm_error_states->hww_warp_esr_report_mask = nvgpu_readl(g,
+		gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_r() + offset);
+}
+
 int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
 			struct channel_gk20a *fault_ch)
 {
 	int sm_id;
-	struct gr_gk20a *gr = &g->gr;
 	u32 offset, sm_per_tpc, tpc_id;
 	u32 gpc_offset, gpc_tpc_offset;
+	struct nvgpu_tsg_sm_error_state *sm_error_states = NULL;
+	struct tsg_gk20a *tsg = NULL;
 
 	nvgpu_mutex_acquire(&g->dbg_sessions_lock);
 
@@ -3381,21 +3406,19 @@ int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
 
 	offset = gpc_tpc_offset + gv11b_gr_sm_offset(g, sm);
 
-	gr->sm_error_states[sm_id].hww_global_esr = gk20a_readl(g,
-		gr_gpc0_tpc0_sm0_hww_global_esr_r() + offset);
-
-	gr->sm_error_states[sm_id].hww_warp_esr = gk20a_readl(g,
-		gr_gpc0_tpc0_sm0_hww_warp_esr_r() + offset);
-
-	gr->sm_error_states[sm_id].hww_warp_esr_pc = gk20a_readl(g,
-		gr_gpc0_tpc0_sm0_hww_warp_esr_pc_r() + offset);
+	if (fault_ch != NULL) {
+		tsg = tsg_gk20a_from_ch(fault_ch);
+	}
 
-	gr->sm_error_states[sm_id].hww_global_esr_report_mask = gk20a_readl(g,
-		gr_gpc0_tpc0_sm0_hww_global_esr_report_mask_r() + offset);
+	if (tsg == NULL) {
+		nvgpu_err(g, "no valid tsg");
+		goto record_fail;
+	}
 
-	gr->sm_error_states[sm_id].hww_warp_esr_report_mask = gk20a_readl(g,
-		gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_r() + offset);
+	sm_error_states = tsg->sm_error_states + sm_id;
+	gv11b_gr_read_sm_error_state(g, offset, sm_error_states);
 
+record_fail:
 	nvgpu_mutex_release(&g->dbg_sessions_lock);
 
 	return sm_id;
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h
index 0f29ea24..30cc7f0a 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h
@@ -43,7 +43,7 @@ struct zbc_entry;
 struct zbc_query_params;
 struct nvgpu_gr_ctx;
 struct nvgpu_warpstate;
-struct nvgpu_gr_sm_error_state;
+struct nvgpu_tsg_sm_error_state;
 struct gr_ctx_desc;
 struct gr_gk20a_isr_data;
 struct gk20a_debug_output;
@@ -168,7 +168,7 @@ int gv11b_gr_sm_trigger_suspend(struct gk20a *g);
 void gv11b_gr_bpt_reg_info(struct gk20a *g, struct nvgpu_warpstate *w_state);
 int gv11b_gr_update_sm_error_state(struct gk20a *g,
 		struct channel_gk20a *ch, u32 sm_id,
-		struct nvgpu_gr_sm_error_state *sm_error_state);
+		struct nvgpu_tsg_sm_error_state *sm_error_state);
 int gv11b_gr_set_sm_debug_mode(struct gk20a *g,
 	struct channel_gk20a *ch, u64 sms, bool enable);
 int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
diff --git a/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h b/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h
index 39d68dd1..f7a58c87 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h
@@ -22,8 +22,8 @@
  * DEALINGS IN THE SOFTWARE.
  */
 
-#ifndef __TEGRA_VGPU_H
-#define __TEGRA_VGPU_H
+#ifndef TEGRA_VGPU_H
+#define TEGRA_VGPU_H
 
 #include 
 #include  /* For NVGPU_ECC_STAT_NAME_MAX_SIZE */
@@ -737,6 +737,7 @@ struct tegra_vgpu_channel_event_info {
 };
 
 struct tegra_vgpu_sm_esr_info {
+	u32 tsg_id;
 	u32 sm_id;
 	u32 hww_global_esr;
 	u32 hww_warp_esr;
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c b/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c
index fc1f7011..2f013029 100644
--- a/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c
@@ -1567,56 +1567,6 @@ out:
 	return err;
 }
 
-static int nvgpu_gpu_read_single_sm_error_state(struct gk20a *g,
-		struct nvgpu_gpu_read_single_sm_error_state_args *args)
-{
-	struct gr_gk20a *gr = &g->gr;
-	struct nvgpu_gr_sm_error_state *sm_error_state;
-	struct nvgpu_gpu_sm_error_state_record sm_error_state_record;
-	u32 sm_id;
-	int err = 0;
-
-	sm_id = args->sm_id;
-	if (sm_id >= gr->no_of_sm)
-		return -EINVAL;
-
-	nvgpu_speculation_barrier();
-
-	sm_error_state = gr->sm_error_states + sm_id;
-	sm_error_state_record.global_esr =
-		sm_error_state->hww_global_esr;
-	sm_error_state_record.warp_esr =
-		sm_error_state->hww_warp_esr;
-	sm_error_state_record.warp_esr_pc =
-		sm_error_state->hww_warp_esr_pc;
-	sm_error_state_record.global_esr_report_mask =
-		sm_error_state->hww_global_esr_report_mask;
-	sm_error_state_record.warp_esr_report_mask =
-		sm_error_state->hww_warp_esr_report_mask;
-
-	if (args->record_size > 0) {
-		size_t write_size = sizeof(*sm_error_state);
-
-		if (write_size > args->record_size)
-			write_size = args->record_size;
-
-		nvgpu_mutex_acquire(&g->dbg_sessions_lock);
-		err = copy_to_user((void __user *)(uintptr_t)
-				args->record_mem,
-				&sm_error_state_record,
-				write_size);
-		nvgpu_mutex_release(&g->dbg_sessions_lock);
-		if (err) {
-			nvgpu_err(g, "copy_to_user failed!");
-			return err;
-		}
-
-		args->record_size = write_size;
-	}
-
-	return 0;
-}
-
 long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct gk20a_ctrl_priv *priv = filp->private_data;
@@ -1925,11 +1875,6 @@ long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg
 			(struct nvgpu_gpu_set_deterministic_opts_args *)buf);
 		break;
 
-	case NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE:
-		err = nvgpu_gpu_read_single_sm_error_state(g,
-			(struct nvgpu_gpu_read_single_sm_error_state_args *)buf);
-		break;
-
 	default:
 		nvgpu_log_info(g, "unrecognized gpu ioctl cmd: 0x%x", cmd);
 		err = -ENOTTY;
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c b/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c
index ff4fcdca..4ac4fb62 100644
--- a/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c
@@ -35,6 +35,7 @@
 
 #include "gk20a/gk20a.h"
 #include "gk20a/gr_gk20a.h"
+#include "gk20a/tsg_gk20a.h"
 #include "gk20a/regops_gk20a.h"
 #include "gk20a/dbg_gpu_gk20a.h"
 #include "os_linux.h"
@@ -271,20 +272,23 @@ static int nvgpu_dbg_gpu_ioctl_write_single_sm_error_state(
 	u32 sm_id;
 	struct channel_gk20a *ch;
 	struct nvgpu_dbg_gpu_sm_error_state_record sm_error_state_record;
-	struct nvgpu_gr_sm_error_state sm_error_state;
+	struct nvgpu_tsg_sm_error_state sm_error_state;
 	int err = 0;
 
 	/* Not currently supported in the virtual case */
-	if (g->is_virtual)
+	if (g->is_virtual) {
 		return -ENOSYS;
+	}
 
 	ch = nvgpu_dbg_gpu_get_session_channel(dbg_s);
-	if (!ch)
+	if (ch == NULL) {
 		return -EINVAL;
+	}
 
 	sm_id = args->sm_id;
-	if (sm_id >= gr->no_of_sm)
+	if (sm_id >= gr->no_of_sm) {
 		return -EINVAL;
+	}
 
 	nvgpu_speculation_barrier();
 
@@ -300,13 +304,15 @@ static int nvgpu_dbg_gpu_ioctl_write_single_sm_error_state(
 			args->sm_error_state_record_mem,
 			read_size);
 		nvgpu_mutex_release(&g->dbg_sessions_lock);
-		if (err)
+		if (err != 0) {
 			return -ENOMEM;
+		}
 	}
 
 	err = gk20a_busy(g);
-	if (err)
+	if (err != 0) {
 		return err;
+	}
 
 	sm_error_state.hww_global_esr =
 		sm_error_state_record.hww_global_esr;
@@ -335,18 +341,36 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state(
 {
 	struct gk20a *g = dbg_s->g;
 	struct gr_gk20a *gr = &g->gr;
-	struct nvgpu_gr_sm_error_state *sm_error_state;
+	struct nvgpu_tsg_sm_error_state *sm_error_state;
 	struct nvgpu_dbg_gpu_sm_error_state_record sm_error_state_record;
+	struct channel_gk20a *ch;
+	struct tsg_gk20a *tsg;
 	u32 sm_id;
 	int err = 0;
 
+	ch = nvgpu_dbg_gpu_get_session_channel(dbg_s);
+	if (ch == NULL) {
+		return -EINVAL;
+	}
+
+	tsg = tsg_gk20a_from_ch(ch);
+	if (tsg == NULL) {
+		nvgpu_err(g, "no valid tsg from ch");
+		return -EINVAL;
+	}
+
 	sm_id = args->sm_id;
-	if (sm_id >= gr->no_of_sm)
+	if (sm_id >= gr->no_of_sm) {
 		return -EINVAL;
+	}
+
+	if (tsg->sm_error_states == NULL) {
+		return -EINVAL;
+	}
 
 	nvgpu_speculation_barrier();
 
-	sm_error_state = gr->sm_error_states + sm_id;
+	sm_error_state = tsg->sm_error_states + sm_id;
 	sm_error_state_record.hww_global_esr =
 		sm_error_state->hww_global_esr;
 	sm_error_state_record.hww_warp_esr =
@@ -370,7 +394,7 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state(
 		&sm_error_state_record,
 		write_size);
 	nvgpu_mutex_release(&g->dbg_sessions_lock);
-	if (err) {
+	if (err != 0) {
 		nvgpu_err(g, "copy_to_user failed!");
 		return err;
 	}
@@ -1500,8 +1524,9 @@ static int nvgpu_dbg_gpu_ioctl_clear_single_sm_error_state(
 	int err = 0;
 
 	ch = nvgpu_dbg_gpu_get_session_channel(dbg_s);
-	if (!ch)
+	if (ch == NULL) {
 		return -EINVAL;
+	}
 
 	sm_id = args->sm_id;
 	if (sm_id >= gr->no_of_sm)
@@ -1510,8 +1535,9 @@ static int nvgpu_dbg_gpu_ioctl_clear_single_sm_error_state(
 	nvgpu_speculation_barrier();
 
 	err = gk20a_busy(g);
-	if (err)
+	if (err != 0) {
 		return err;
+	}
 
 	err = gr_gk20a_elpg_protected_call(g,
 			g->ops.gr.clear_sm_error_state(g, ch, sm_id));
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_tsg.c b/drivers/gpu/nvgpu/os/linux/ioctl_tsg.c
index f7d20f34..6c68ca58 100644
--- a/drivers/gpu/nvgpu/os/linux/ioctl_tsg.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_tsg.c
@@ -536,6 +536,57 @@ static int gk20a_tsg_ioctl_get_timeslice(struct gk20a *g,
 	return 0;
 }
 
+static int gk20a_tsg_ioctl_read_single_sm_error_state(struct gk20a *g,
+		struct tsg_gk20a *tsg,
+		struct nvgpu_tsg_read_single_sm_error_state_args *args)
+{
+	struct gr_gk20a *gr = &g->gr;
+	struct nvgpu_tsg_sm_error_state *sm_error_state;
+	struct nvgpu_tsg_sm_error_state_record sm_error_state_record;
+	u32 sm_id;
+	int err = 0;
+
+	sm_id = args->sm_id;
+	if (sm_id >= gr->no_of_sm)
+		return -EINVAL;
+
+	nvgpu_speculation_barrier();
+
+	sm_error_state = tsg->sm_error_states + sm_id;
+	sm_error_state_record.global_esr =
+		sm_error_state->hww_global_esr;
+	sm_error_state_record.warp_esr =
+		sm_error_state->hww_warp_esr;
+	sm_error_state_record.warp_esr_pc =
+		sm_error_state->hww_warp_esr_pc;
+	sm_error_state_record.global_esr_report_mask =
+		sm_error_state->hww_global_esr_report_mask;
+	sm_error_state_record.warp_esr_report_mask =
+		sm_error_state->hww_warp_esr_report_mask;
+
+	if (args->record_size > 0) {
+		size_t write_size = sizeof(*sm_error_state);
+
+		if (write_size > args->record_size)
+			write_size = args->record_size;
+
+		nvgpu_mutex_acquire(&g->dbg_sessions_lock);
+		err = copy_to_user((void __user *)(uintptr_t)
+				args->record_mem,
+				&sm_error_state_record,
+				write_size);
+		nvgpu_mutex_release(&g->dbg_sessions_lock);
+		if (err) {
+			nvgpu_err(g, "copy_to_user failed!");
+			return err;
+		}
+
+		args->record_size = write_size;
+	}
+
+	return 0;
+}
+
 long nvgpu_ioctl_tsg_dev_ioctl(struct file *filp, unsigned int cmd,
 			     unsigned long arg)
 {
@@ -670,6 +721,13 @@ long nvgpu_ioctl_tsg_dev_ioctl(struct file *filp, unsigned int cmd,
 		break;
 		}
 
+	case NVGPU_TSG_IOCTL_READ_SINGLE_SM_ERROR_STATE:
+		{
+		err = gk20a_tsg_ioctl_read_single_sm_error_state(g, tsg,
+			(struct nvgpu_tsg_read_single_sm_error_state_args *)buf);
+		break;
+		}
+
 	default:
 		nvgpu_err(g, "unrecognized tsg gpu ioctl cmd: 0x%x",
 			   cmd);
diff --git a/drivers/gpu/nvgpu/vgpu/gr_vgpu.c b/drivers/gpu/nvgpu/vgpu/gr_vgpu.c
index fa64cb82..9ee57fb4 100644
--- a/drivers/gpu/nvgpu/vgpu/gr_vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/gr_vgpu.c
@@ -882,9 +882,6 @@ static void vgpu_remove_gr_support(struct gr_gk20a *gr)
 
 	gk20a_comptag_allocator_destroy(gr->g, &gr->comp_tags);
 
-	nvgpu_kfree(gr->g, gr->sm_error_states);
-	gr->sm_error_states = NULL;
-
 	nvgpu_kfree(gr->g, gr->gpc_tpc_mask);
 	gr->gpc_tpc_mask = NULL;
 
@@ -935,14 +932,6 @@ static int vgpu_gr_init_gr_setup_sw(struct gk20a *g)
 	nvgpu_mutex_init(&gr->ctx_mutex);
 	nvgpu_spinlock_init(&gr->ch_tlb_lock);
 
-	gr->sm_error_states = nvgpu_kzalloc(g,
-			sizeof(struct nvgpu_gr_sm_error_state) *
-			gr->no_of_sm);
-	if (!gr->sm_error_states) {
-		err = -ENOMEM;
-		goto clean_up;
-	}
-
 	gr->remove_support = vgpu_remove_gr_support;
 	gr->sw_ready = true;
 
@@ -1152,12 +1141,17 @@ int vgpu_gr_update_hwpm_ctxsw_mode(struct gk20a *g,
 int vgpu_gr_clear_sm_error_state(struct gk20a *g,
 		struct channel_gk20a *ch, u32 sm_id)
 {
-	struct gr_gk20a *gr = &g->gr;
 	struct tegra_vgpu_cmd_msg msg;
 	struct tegra_vgpu_clear_sm_error_state *p =
 			&msg.params.clear_sm_error_state;
+	struct tsg_gk20a *tsg;
 	int err;
 
+	tsg = tsg_gk20a_from_ch(ch);
+	if (!tsg) {
+		return -EINVAL;
+	}
+
 	nvgpu_mutex_acquire(&g->dbg_sessions_lock);
 	msg.cmd = TEGRA_VGPU_CMD_CLEAR_SM_ERROR_STATE;
 	msg.handle = vgpu_get_handle(g);
@@ -1167,7 +1161,7 @@ int vgpu_gr_clear_sm_error_state(struct gk20a *g,
 	err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
 	WARN_ON(err || msg.ret);
 
-	memset(&gr->sm_error_states[sm_id], 0, sizeof(*gr->sm_error_states));
+	memset(&tsg->sm_error_states[sm_id], 0, sizeof(*tsg->sm_error_states));
 	nvgpu_mutex_release(&g->dbg_sessions_lock);
 
 	return err ? err : msg.ret;
@@ -1264,7 +1258,8 @@ int vgpu_gr_resume_contexts(struct gk20a *g,
 void vgpu_gr_handle_sm_esr_event(struct gk20a *g,
 			struct tegra_vgpu_sm_esr_info *info)
 {
-	struct nvgpu_gr_sm_error_state *sm_error_states;
+	struct nvgpu_tsg_sm_error_state *sm_error_states;
+	struct tsg_gk20a *tsg;
 
 	if (info->sm_id >= g->gr.no_of_sm) {
 		nvgpu_err(g, "invalid smd_id %d / %d",
@@ -1272,9 +1267,20 @@ void vgpu_gr_handle_sm_esr_event(struct gk20a *g,
 		return;
 	}
 
+	if (info->tsg_id >= g->fifo.num_channels) {
+		nvgpu_err(g, "invalid tsg_id in sm esr event");
+		return;
+	}
+
+	tsg = &g->fifo.tsg[info->tsg_id];
+	if (tsg == NULL) {
+		nvgpu_err(g, "invalid tsg");
+		return;
+	}
+
 	nvgpu_mutex_acquire(&g->dbg_sessions_lock);
 
-	sm_error_states = &g->gr.sm_error_states[info->sm_id];
+	sm_error_states = &tsg->sm_error_states[info->sm_id];
 
 	sm_error_states->hww_global_esr = info->hww_global_esr;
 	sm_error_states->hww_warp_esr = info->hww_warp_esr;
-- 
cgit v1.2.2
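
For reference, a minimal user-space sketch of how the new TSG-level ioctl added by this patch might be exercised. This is illustrative only and not part of the patch: it assumes the nvgpu UAPI header that declares NVGPU_TSG_IOCTL_READ_SINGLE_SM_ERROR_STATE, struct nvgpu_tsg_read_single_sm_error_state_args, and struct nvgpu_tsg_sm_error_state_record is installed as <linux/nvgpu.h>, that tsg_fd is an already-open TSG device fd, and that the record_mem field is a 64-bit user pointer; error handling is simplified.

/*
 * Illustrative sketch (not part of the patch): query one SM's error state
 * through the TSG-level READ_SM_ERROR ioctl.  Header path, field types and
 * the open tsg_fd are assumptions, not taken from the patch itself.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/nvgpu.h>	/* assumed location of the nvgpu UAPI */

static int read_sm_error_state(int tsg_fd, uint32_t sm_id)
{
	struct nvgpu_tsg_sm_error_state_record record;
	struct nvgpu_tsg_read_single_sm_error_state_args args;

	memset(&record, 0, sizeof(record));
	memset(&args, 0, sizeof(args));

	args.sm_id = sm_id;
	/* kernel copy_to_user()s the record into this buffer */
	args.record_mem = (uint64_t)(uintptr_t)&record;
	args.record_size = sizeof(record);

	if (ioctl(tsg_fd, NVGPU_TSG_IOCTL_READ_SINGLE_SM_ERROR_STATE, &args) != 0)
		return -1;

	printf("sm %u: global_esr=0x%x warp_esr=0x%x warp_esr_pc=0x%llx\n",
	       sm_id, record.global_esr, record.warp_esr,
	       (unsigned long long)record.warp_esr_pc);
	return 0;
}

Because the SM error state now lives in the TSG rather than in the global gr struct, a reader must hold an fd for the TSG whose context faulted; the older NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE path removed from ioctl_ctrl.c above no longer exists.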