From bfe65407bde2b5d0776724301e215c6553c989f3 Mon Sep 17 00:00:00 2001 From: Vinod G Date: Tue, 7 Aug 2018 23:09:30 -0700 Subject: gpu: nvgpu: Read sm error ioctl support for tsg Add READ_SM_ERROR IOCTL support at the TSG level. Move the struct that saves the sm_error details from gr to tsg, since sm_error support is context based, not global. Also fix a MISRA 21.1 violation (reserved identifier used as the include guard) in the tsg_gk20a.h header file. The nvgpu_dbg_gpu_ioctl_write_single_sm_error_state and nvgpu_dbg_gpu_ioctl_read_single_sm_error_state functions are modified to use the tsg struct nvgpu_tsg_sm_error_state. Bug 200412642 Change-Id: I9e334b059078a4bb0e360b945444cc4bf1cc56ec Signed-off-by: Vinod G Reviewed-on: https://git-master.nvidia.com/r/1794856 Reviewed-by: svc-misra-checker GVS: Gerrit_Virtual_Submit Reviewed-by: Terje Bergstrom Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/gk20a/gk20a.h | 2 +- drivers/gpu/nvgpu/gk20a/gr_gk20a.c | 30 -------------- drivers/gpu/nvgpu/gk20a/gr_gk20a.h | 9 ---- drivers/gpu/nvgpu/gk20a/tsg_gk20a.c | 82 ++++++++++++++++++++++++++++++++++--- drivers/gpu/nvgpu/gk20a/tsg_gk20a.h | 21 ++++++++-- 5 files changed, 95 insertions(+), 49 deletions(-) (limited to 'drivers/gpu/nvgpu/gk20a') diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h index cf202f14..192f4c3e 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a.h @@ -396,7 +396,7 @@ struct gpu_ops { u32 sm, struct channel_gk20a *fault_ch); int (*update_sm_error_state)(struct gk20a *g, struct channel_gk20a *ch, u32 sm_id, - struct nvgpu_gr_sm_error_state *sm_error_state); + struct nvgpu_tsg_sm_error_state *sm_error_state); int (*clear_sm_error_state)(struct gk20a *g, struct channel_gk20a *ch, u32 sm_id); int (*suspend_contexts)(struct gk20a *g, diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c index f2b083d7..cdc00bbd 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c @@ -1561,19 
+1561,6 @@ restore_fe_go_idle: if (err) goto clean_up; - nvgpu_kfree(g, gr->sm_error_states); - - /* we need to allocate this after g->ops.gr.init_fs_state() since - * we initialize gr->no_of_sm in this function - */ - gr->sm_error_states = nvgpu_kzalloc(g, - sizeof(struct nvgpu_gr_sm_error_state) - * gr->no_of_sm); - if (!gr->sm_error_states) { - err = -ENOMEM; - goto restore_fe_go_idle; - } - ctx_header_words = roundup(ctx_header_bytes, sizeof(u32)); ctx_header_words >>= 2; @@ -3072,7 +3059,6 @@ static void gk20a_remove_gr_support(struct gr_gk20a *gr) memset(&gr->compbit_store, 0, sizeof(struct compbit_store_desc)); - nvgpu_kfree(g, gr->sm_error_states); nvgpu_kfree(g, gr->gpc_tpc_count); nvgpu_kfree(g, gr->gpc_zcb_count); nvgpu_kfree(g, gr->gpc_ppc_count); @@ -4545,22 +4531,6 @@ restore_fe_go_idle: err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g), GR_IDLE_CHECK_DEFAULT); - if (err) - goto out; - - nvgpu_kfree(g, gr->sm_error_states); - - /* we need to allocate this after g->ops.gr.init_fs_state() since - * we initialize gr->no_of_sm in this function - */ - gr->sm_error_states = nvgpu_kzalloc(g, - sizeof(struct nvgpu_gr_sm_error_state) * - gr->no_of_sm); - if (!gr->sm_error_states) { - err = -ENOMEM; - goto restore_fe_go_idle; - } - out: nvgpu_log_fn(g, "done"); return err; diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h index 3fc7e55f..bd5e625d 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h @@ -254,14 +254,6 @@ struct nvgpu_preemption_modes_rec { u32 default_compute_preempt_mode; /* default mode */ }; -struct nvgpu_gr_sm_error_state { - u32 hww_global_esr; - u32 hww_warp_esr; - u64 hww_warp_esr_pc; - u32 hww_global_esr_report_mask; - u32 hww_warp_esr_report_mask; -}; - struct gr_gk20a { struct gk20a *g; struct { @@ -427,7 +419,6 @@ struct gr_gk20a { u32 *fbp_rop_l2_en_mask; u32 no_of_sm; struct sm_info *sm_to_cluster; - struct nvgpu_gr_sm_error_state *sm_error_states; #define 
NVGPU_SM_EXCEPTION_TYPE_MASK_NONE (0x0U) #define NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL (0x1U << 0) diff --git a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c index 62763da3..624ee1d7 100644 --- a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c @@ -275,8 +275,23 @@ struct tsg_gk20a *gk20a_tsg_open(struct gk20a *g, pid_t pid) int err; tsg = gk20a_tsg_acquire_unused_tsg(&g->fifo); - if (!tsg) + if (tsg == NULL) { return NULL; + } + + /* we need to allocate this after g->ops.gr.init_fs_state() since + * we initialize gr->no_of_sm in this function + */ + if (g->gr.no_of_sm == 0U) { + nvgpu_err(g, "no_of_sm %d not set, failed allocation", + g->gr.no_of_sm); + return NULL; + } + + err = gk20a_tsg_alloc_sm_error_states_mem(g, tsg, g->gr.no_of_sm); + if (err != 0) { + return NULL; + } tsg->g = g; tsg->num_active_channels = 0; @@ -295,7 +310,7 @@ struct tsg_gk20a *gk20a_tsg_open(struct gk20a *g, pid_t pid) if (g->ops.fifo.tsg_open) { err = g->ops.fifo.tsg_open(tsg); - if (err) { + if (err != 0) { nvgpu_err(g, "tsg %d fifo open failed %d", tsg->tsgid, err); goto clean_up; @@ -307,6 +322,12 @@ struct tsg_gk20a *gk20a_tsg_open(struct gk20a *g, pid_t pid) return tsg; clean_up: + + if(tsg->sm_error_states != NULL) { + nvgpu_kfree(g, tsg->sm_error_states); + tsg->sm_error_states = NULL; + } + nvgpu_ref_put(&tsg->refcount, gk20a_tsg_release); return NULL; } @@ -317,20 +338,28 @@ void gk20a_tsg_release(struct nvgpu_ref *ref) struct gk20a *g = tsg->g; struct gk20a_event_id_data *event_id_data, *event_id_data_temp; - if (g->ops.fifo.tsg_release) + if (g->ops.fifo.tsg_release != NULL) { g->ops.fifo.tsg_release(tsg); + } - if (nvgpu_mem_is_valid(&tsg->gr_ctx.mem)) + if (nvgpu_mem_is_valid(&tsg->gr_ctx.mem)) { gr_gk20a_free_tsg_gr_ctx(tsg); + } - if (g->ops.fifo.deinit_eng_method_buffers) + if (g->ops.fifo.deinit_eng_method_buffers != NULL) { g->ops.fifo.deinit_eng_method_buffers(g, tsg); + } - if (tsg->vm) { + if (tsg->vm != NULL) { 
nvgpu_vm_put(tsg->vm); tsg->vm = NULL; } + if(tsg->sm_error_states != NULL) { + nvgpu_kfree(g, tsg->sm_error_states); + tsg->sm_error_states = NULL; + } + /* unhook all events created on this TSG */ nvgpu_mutex_acquire(&tsg->event_id_list_lock); nvgpu_list_for_each_entry_safe(event_id_data, event_id_data_temp, @@ -360,3 +389,44 @@ struct tsg_gk20a *tsg_gk20a_from_ch(struct channel_gk20a *ch) return tsg; } + +int gk20a_tsg_alloc_sm_error_states_mem(struct gk20a *g, + struct tsg_gk20a *tsg, + u32 num_sm) +{ + int err = 0; + + if (tsg->sm_error_states != NULL) { + return err; + } + + tsg->sm_error_states = nvgpu_kzalloc(g, + sizeof(struct nvgpu_tsg_sm_error_state) + * num_sm); + if (tsg->sm_error_states == NULL) { + nvgpu_err(g, "sm_error_states mem allocation failed"); + err = -ENOMEM; + } + + return err; +} + +void gk20a_tsg_update_sm_error_state_locked(struct tsg_gk20a *tsg, + u32 sm_id, + struct nvgpu_tsg_sm_error_state *sm_error_state) +{ + struct nvgpu_tsg_sm_error_state *tsg_sm_error_states; + + tsg_sm_error_states = tsg->sm_error_states + sm_id; + + tsg_sm_error_states->hww_global_esr = + sm_error_state->hww_global_esr; + tsg_sm_error_states->hww_warp_esr = + sm_error_state->hww_warp_esr; + tsg_sm_error_states->hww_warp_esr_pc = + sm_error_state->hww_warp_esr_pc; + tsg_sm_error_states->hww_global_esr_report_mask = + sm_error_state->hww_global_esr_report_mask; + tsg_sm_error_states->hww_warp_esr_report_mask = + sm_error_state->hww_warp_esr_report_mask; +} diff --git a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h index 552c3bb3..67ccb9f5 100644 --- a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h @@ -19,8 +19,8 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. 
*/ -#ifndef __TSG_GK20A_H_ -#define __TSG_GK20A_H_ +#ifndef TSG_GK20A_H +#define TSG_GK20A_H #include #include @@ -39,6 +39,14 @@ void gk20a_tsg_release(struct nvgpu_ref *ref); int gk20a_init_tsg_support(struct gk20a *g, u32 tsgid); struct tsg_gk20a *tsg_gk20a_from_ch(struct channel_gk20a *ch); +struct nvgpu_tsg_sm_error_state { + u32 hww_global_esr; + u32 hww_warp_esr; + u64 hww_warp_esr_pc; + u32 hww_global_esr_report_mask; + u32 hww_warp_esr_report_mask; +}; + struct tsg_gk20a { struct gk20a *g; @@ -69,6 +77,7 @@ struct tsg_gk20a { bool tpc_num_initialized; bool in_use; + struct nvgpu_tsg_sm_error_state *sm_error_states; }; int gk20a_enable_tsg(struct tsg_gk20a *tsg); @@ -84,6 +93,12 @@ int gk20a_tsg_set_timeslice(struct tsg_gk20a *tsg, u32 timeslice); u32 gk20a_tsg_get_timeslice(struct tsg_gk20a *tsg); int gk20a_tsg_set_priority(struct gk20a *g, struct tsg_gk20a *tsg, u32 priority); +int gk20a_tsg_alloc_sm_error_states_mem(struct gk20a *g, + struct tsg_gk20a *tsg, + u32 num_sm); +void gk20a_tsg_update_sm_error_state_locked(struct tsg_gk20a *tsg, + u32 sm_id, + struct nvgpu_tsg_sm_error_state *sm_error_state); struct gk20a_event_id_data { struct gk20a *g; @@ -106,4 +121,4 @@ gk20a_event_id_data_from_event_id_node(struct nvgpu_list_node *node) ((uintptr_t)node - offsetof(struct gk20a_event_id_data, event_id_node)); }; -#endif /* __TSG_GK20A_H_ */ +#endif /* TSG_GK20A_H */ -- cgit v1.2.2