From 9eebb7831facaa16b2975f50a716d2986c67b699 Mon Sep 17 00:00:00 2001 From: Terje Bergstrom Date: Wed, 25 Oct 2017 14:17:30 -0700 Subject: gpu: nvgpu: Linux specific sm_error_state_record Create an nvgpu internal nvgpu_gr_sm_error_state to store and propagate SM error state within driver. Use nvgpu_dbg_gpu_sm_error_state_record only in Linux code. JIRA NVGPU-259 Change-Id: I7365cdf5a1a42cbcdb418dfcef3e0020e02a960f Signed-off-by: Terje Bergstrom Reviewed-on: https://git-master.nvidia.com/r/1585645 Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/common/linux/ioctl_dbg.c | 50 +++++++++++++++++++----------- drivers/gpu/nvgpu/gk20a/gk20a.h | 3 +- drivers/gpu/nvgpu/gk20a/gr_gk20a.c | 4 +-- drivers/gpu/nvgpu/gk20a/gr_gk20a.h | 10 +++++- drivers/gpu/nvgpu/gm20b/gr_gm20b.c | 2 +- drivers/gpu/nvgpu/gm20b/gr_gm20b.h | 2 +- drivers/gpu/nvgpu/vgpu/gr_vgpu.c | 4 +-- 7 files changed, 48 insertions(+), 27 deletions(-) (limited to 'drivers/gpu') diff --git a/drivers/gpu/nvgpu/common/linux/ioctl_dbg.c b/drivers/gpu/nvgpu/common/linux/ioctl_dbg.c index 7e62bb5c..403d9261 100644 --- a/drivers/gpu/nvgpu/common/linux/ioctl_dbg.c +++ b/drivers/gpu/nvgpu/common/linux/ioctl_dbg.c @@ -239,7 +239,8 @@ static int nvgpu_dbg_gpu_ioctl_write_single_sm_error_state( struct gr_gk20a *gr = &g->gr; u32 sm_id; struct channel_gk20a *ch; - struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state; + struct nvgpu_dbg_gpu_sm_error_state_record sm_error_state_record; + struct nvgpu_gr_sm_error_state sm_error_state; int err = 0; ch = nvgpu_dbg_gpu_get_session_channel(dbg_s); @@ -250,41 +251,43 @@ static int nvgpu_dbg_gpu_ioctl_write_single_sm_error_state( if (sm_id >= gr->no_of_sm) return -EINVAL; - sm_error_state = nvgpu_kzalloc(g, sizeof(*sm_error_state)); - if (!sm_error_state) - return -ENOMEM; - if (args->sm_error_state_record_size > 0) { - size_t read_size = sizeof(*sm_error_state); + size_t read_size = sizeof(sm_error_state_record); if (read_size > args->sm_error_state_record_size) read_size = args->sm_error_state_record_size; nvgpu_mutex_acquire(&g->dbg_sessions_lock); - err = copy_from_user(sm_error_state, + err = copy_from_user(&sm_error_state_record, (void __user *)(uintptr_t) args->sm_error_state_record_mem, read_size); nvgpu_mutex_release(&g->dbg_sessions_lock); - if (err) { - err = -ENOMEM; - goto err_free; - } + if (err) + return -ENOMEM; } err = gk20a_busy(g); if (err) - goto err_free; + return err; + + sm_error_state.hww_global_esr = + sm_error_state_record.hww_global_esr; + sm_error_state.hww_warp_esr = + sm_error_state_record.hww_warp_esr; + sm_error_state.hww_warp_esr_pc = + sm_error_state_record.hww_warp_esr_pc; + sm_error_state.hww_global_esr_report_mask = + sm_error_state_record.hww_global_esr_report_mask; + sm_error_state.hww_warp_esr_report_mask = + sm_error_state_record.hww_warp_esr_report_mask; err = gr_gk20a_elpg_protected_call(g, g->ops.gr.update_sm_error_state(g, ch, - sm_id, sm_error_state)); + sm_id, &sm_error_state)); gk20a_idle(g); -err_free: - nvgpu_kfree(g, sm_error_state); - return err; } @@ -295,7 +298,8 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state( { struct gk20a *g = dbg_s->g; struct gr_gk20a *gr = &g->gr; - struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state; + struct nvgpu_gr_sm_error_state *sm_error_state; + struct nvgpu_dbg_gpu_sm_error_state_record sm_error_state_record; u32 sm_id; int err = 0; @@ -304,6 +308,16 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state( return -EINVAL; sm_error_state = gr->sm_error_states + sm_id; + sm_error_state_record.hww_global_esr = + sm_error_state->hww_global_esr; + sm_error_state_record.hww_warp_esr = + sm_error_state->hww_warp_esr; + sm_error_state_record.hww_warp_esr_pc = + sm_error_state->hww_warp_esr_pc; + sm_error_state_record.hww_global_esr_report_mask = + sm_error_state->hww_global_esr_report_mask; + sm_error_state_record.hww_warp_esr_report_mask = + sm_error_state->hww_warp_esr_report_mask; if (args->sm_error_state_record_size > 0) { size_t write_size = sizeof(*sm_error_state); @@ -314,7 +328,7 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state( nvgpu_mutex_acquire(&g->dbg_sessions_lock); err = copy_to_user((void __user *)(uintptr_t) args->sm_error_state_record_mem, - sm_error_state, + &sm_error_state_record, write_size); nvgpu_mutex_release(&g->dbg_sessions_lock); if (err) { diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h index 13d534c4..80d85d65 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a.h @@ -361,8 +361,7 @@ struct gpu_ops { u32 gpc, u32 tpc); int (*update_sm_error_state)(struct gk20a *g, struct channel_gk20a *ch, u32 sm_id, - struct nvgpu_dbg_gpu_sm_error_state_record * - sm_error_state); + struct nvgpu_gr_sm_error_state *sm_error_state); int (*clear_sm_error_state)(struct gk20a *g, struct channel_gk20a *ch, u32 sm_id); int (*suspend_contexts)(struct gk20a *g, diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c index 5910c7d9..2fd6f72c 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c @@ -1543,7 +1543,7 @@ restore_fe_go_idle: * we initialize gr->no_of_sm in this function */ gr->sm_error_states = nvgpu_kzalloc(g, - sizeof(struct nvgpu_dbg_gpu_sm_error_state_record) + sizeof(struct nvgpu_gr_sm_error_state) * gr->no_of_sm); if (!gr->sm_error_states) { err = -ENOMEM; @@ -4566,7 +4566,7 @@ restore_fe_go_idle: * we initialize gr->no_of_sm in this function */ gr->sm_error_states = nvgpu_kzalloc(g, - sizeof(struct nvgpu_dbg_gpu_sm_error_state_record) * + sizeof(struct nvgpu_gr_sm_error_state) * gr->no_of_sm); if (!gr->sm_error_states) { err = -ENOMEM; diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h index 6b422138..22fc40d1 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h @@ -231,6 +231,14 @@ struct nvgpu_preemption_modes_rec { u32 default_compute_preempt_mode; /* default mode */ }; +struct nvgpu_gr_sm_error_state { + u32 hww_global_esr; + u32 hww_warp_esr; + u64 hww_warp_esr_pc; + u32 hww_global_esr_report_mask; + u32 hww_warp_esr_report_mask; +}; + struct gr_gk20a { struct gk20a *g; struct { @@ -387,7 +395,7 @@ struct gr_gk20a { u32 *fbp_rop_l2_en_mask; u32 no_of_sm; struct sm_info *sm_to_cluster; - struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_states; + struct nvgpu_gr_sm_error_state *sm_error_states; #if defined(CONFIG_GK20A_CYCLE_STATS) struct nvgpu_mutex cs_lock; struct gk20a_cs_snapshot *cs_data; diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c index a1078b10..c10517b7 100644 --- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c +++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c @@ -1297,7 +1297,7 @@ int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc) int gm20b_gr_update_sm_error_state(struct gk20a *g, struct channel_gk20a *ch, u32 sm_id, - struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state) + struct nvgpu_gr_sm_error_state *sm_error_state) { u32 gpc, tpc, offset; struct gr_gk20a *gr = &g->gr; diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.h b/drivers/gpu/nvgpu/gm20b/gr_gm20b.h index 67f1ea29..15deaa0d 100644 --- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.h +++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.h @@ -119,7 +119,7 @@ void gr_gm20b_get_access_map(struct gk20a *g, int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc); int gm20b_gr_update_sm_error_state(struct gk20a *g, struct channel_gk20a *ch, u32 sm_id, - struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state); + struct nvgpu_gr_sm_error_state *sm_error_state); int gm20b_gr_clear_sm_error_state(struct gk20a *g, struct channel_gk20a *ch, u32 sm_id); int gr_gm20b_get_preemption_mode_flags(struct gk20a *g, diff --git a/drivers/gpu/nvgpu/vgpu/gr_vgpu.c b/drivers/gpu/nvgpu/vgpu/gr_vgpu.c index 2d6beda6..d400f08e 100644 --- a/drivers/gpu/nvgpu/vgpu/gr_vgpu.c +++ b/drivers/gpu/nvgpu/vgpu/gr_vgpu.c @@ -899,7 +899,7 @@ static int vgpu_gr_init_gr_setup_sw(struct gk20a *g) nvgpu_mutex_init(&gr->ctx_mutex); gr->sm_error_states = nvgpu_kzalloc(g, - sizeof(struct nvgpu_dbg_gpu_sm_error_state_record) * + sizeof(struct nvgpu_gr_sm_error_state) * gr->no_of_sm); if (!gr->sm_error_states) { err = -ENOMEM; @@ -1195,7 +1195,7 @@ int vgpu_gr_resume_contexts(struct gk20a *g, void vgpu_gr_handle_sm_esr_event(struct gk20a *g, struct tegra_vgpu_sm_esr_info *info) { - struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_states; + struct nvgpu_gr_sm_error_state *sm_error_states; if (info->sm_id >= g->gr.no_of_sm) { nvgpu_err(g, "invalid smd_id %d / %d", -- cgit v1.2.2