From 397c6d44ed3ee6cc0c24fce7711bda4f0d6cd9bf Mon Sep 17 00:00:00 2001
From: sujeet baranwal
Date: Thu, 20 Aug 2015 17:04:44 -0700
Subject: gpu: nvgpu: Wait for pause for SMs

The order of SM locking and SM register reads has been changed. Also,
the functions have been implemented based on gk20a and gm20b.

Change-Id: Iaf720d088130f84c4b2ca318d9860194c07966e1
Signed-off-by: sujeet baranwal
Signed-off-by: ashutosh jain
Signed-off-by: Terje Bergstrom
Reviewed-on: http://git-master/r/837236
---
 drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c  | 47 +++++++--------------
 drivers/gpu/nvgpu/gk20a/gk20a.h       |  2 +
 drivers/gpu/nvgpu/gk20a/gr_gk20a.c    | 59 ++++++++++++++++++++++----
 drivers/gpu/nvgpu/gm20b/gr_gm20b.c    | 78 +++++++++++++++++++++++++++++++++++
 drivers/gpu/nvgpu/gm20b/hw_gr_gm20b.h | 30 ++++++++++++--
 5 files changed, 174 insertions(+), 42 deletions(-)

diff --git a/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c b/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c
index 4f33c78f..e17e239b 100644
--- a/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c
@@ -30,7 +30,6 @@
 #include "hw_fb_gk20a.h"
 #include "hw_proj_gk20a.h"
 
-
 int gk20a_ctrl_dev_open(struct inode *inode, struct file *filp)
 {
 	struct gk20a *g;
@@ -389,64 +388,48 @@ static int nvgpu_gpu_ioctl_set_debug_mode(
 	return err;
 }
 
-static int nvgpu_gpu_ioctl_wait_for_pause(
-		struct gk20a *g,
+static int nvgpu_gpu_ioctl_wait_for_pause(struct gk20a *g,
 		struct nvgpu_gpu_wait_pause_args *args)
 {
-	int err = 0, gpc, tpc;
-	u32 sm_count, sm_id, size;
+	int err = 0;
 	struct warpstate *w_state;
 	struct gr_gk20a *gr = &g->gr;
-	u32 tpc_offset, gpc_offset, reg_offset, global_mask;
-	u64 warps_valid = 0, warps_paused = 0, warps_trapped = 0;
+	u32 gpc, tpc, sm_count, sm_id, size;
+	u32 global_mask;
 
 	sm_count = g->gr.gpc_count * g->gr.tpc_count;
 	size = sm_count * sizeof(struct warpstate);
 	w_state = kzalloc(size, GFP_KERNEL);
 
+	/* Wait for the SMs to reach full stop. This condition is:
+	 * 1) All SMs with valid warps must be in the trap handler (SM_IN_TRAP_MODE)
+	 * 2) All SMs in the trap handler must have equivalent VALID and PAUSED warp
+	 * masks.
+	 */
 	global_mask = gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f() |
 		gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f() |
 		gr_gpc0_tpc0_sm_hww_global_esr_single_step_complete_pending_f();
 
 	mutex_lock(&g->dbg_sessions_lock);
 
+	/* Lock down all SMs */
 	for (sm_id = 0; sm_id < gr->no_of_sm; sm_id++) {
 
 		gpc = g->gr.sm_to_cluster[sm_id].gpc_index;
 		tpc = g->gr.sm_to_cluster[sm_id].tpc_index;
 
-		tpc_offset = proj_tpc_in_gpc_stride_v() * tpc;
-		gpc_offset = proj_gpc_stride_v() * gpc;
-		reg_offset = tpc_offset + gpc_offset;
-
-		/* Wait until all valid warps on the sm are paused. The valid warp mask
-		 * must be re-read with the paused mask because new warps may become
-		 * valid as the sm is pausing.
-		 */
 		err = gk20a_gr_lock_down_sm(g, gpc, tpc, global_mask);
+
 		if (err) {
 			gk20a_err(dev_from_gk20a(g), "sm did not lock down!\n");
 			goto end;
 		}
-
-		/* 64 bit read */
-		warps_valid = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_warp_valid_mask_r() + reg_offset + 4) << 32;
-		warps_valid |= gk20a_readl(g, gr_gpc0_tpc0_sm_warp_valid_mask_r() + reg_offset);
-
-		/* 64 bit read */
-		warps_paused = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_pause_mask_r() + reg_offset + 4) << 32;
-		warps_paused |= gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_pause_mask_r() + reg_offset);
-
-		/* 64 bit read */
-		warps_trapped = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_r() + reg_offset + 4) << 32;
-		warps_trapped |= gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_r() + reg_offset);
-
-		w_state[sm_id].valid_warps = warps_valid;
-		w_state[sm_id].trapped_warps = warps_trapped;
-		w_state[sm_id].paused_warps = warps_paused;
 	}
 
+	/* Read the warp status */
+	g->ops.gr.bpt_reg_info(g, w_state);
+
+	/* Copy to user space - pointed by "args->pwarpstate" */
 	if (copy_to_user((void __user *)(uintptr_t)args->pwarpstate, w_state, size)) {
 		gk20a_dbg_fn("copy_to_user failed!");
 		err = -EFAULT;
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index 51955a3a..47256e24 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -185,6 +185,8 @@ struct gpu_ops {
 				u32 expect_delay);
 		void (*init_cyclestats)(struct gk20a *g);
 		void (*enable_cde_in_fecs)(void *ctx_ptr);
+		void (*bpt_reg_info)(struct gk20a *g,
+				struct warpstate *w_state);
 	} gr;
 	const char *name;
 	struct {
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index 090f95a5..ef24e078 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -6973,8 +6973,8 @@ static u32 gr_gk20a_get_tpc_num(u32 addr)
 static int gk20a_gr_wait_for_sm_lock_down(struct gk20a *g, u32 gpc, u32 tpc,
 		u32 global_esr_mask, bool check_errors)
 {
-	unsigned long end_jiffies = jiffies +
-		msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
+	bool locked_down;
+	bool no_error_pending;
 	u32 delay = GR_IDLE_CHECK_DEFAULT;
 	bool mmu_debug_mode_enabled = g->ops.mm.is_debug_mode_enabled(g);
 	u32 offset =
@@ -6991,10 +6991,10 @@ static int gk20a_gr_wait_for_sm_lock_down(struct gk20a *g, u32 gpc, u32 tpc,
 			gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset);
 		u32 dbgr_status0 = gk20a_readl(g,
 			gr_gpc0_tpc0_sm_dbgr_status0_r() + offset);
-		bool locked_down =
+		locked_down =
 		    (gr_gpc0_tpc0_sm_dbgr_status0_locked_down_v(dbgr_status0) ==
 		     gr_gpc0_tpc0_sm_dbgr_status0_locked_down_true_v());
-		bool no_error_pending =
+		no_error_pending =
 			check_errors &&
 			(gr_gpc0_tpc0_sm_hww_warp_esr_error_v(warp_esr) ==
 			 gr_gpc0_tpc0_sm_hww_warp_esr_error_none_v()) &&
@@ -7018,9 +7018,7 @@ static int gk20a_gr_wait_for_sm_lock_down(struct gk20a *g, u32 gpc, u32 tpc,
 		usleep_range(delay, delay * 2);
 		delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
-
-	} while (time_before(jiffies, end_jiffies)
-			|| !tegra_platform_is_silicon());
+	} while (!locked_down);
 
 	gk20a_err(dev_from_gk20a(g),
 		"GPC%d TPC%d: timed out while trying to lock down SM",
@@ -7273,6 +7271,52 @@ static void gr_gk20a_init_cyclestats(struct gk20a *g)
 #endif
 }
 
+void gr_gk20a_bpt_reg_info(struct gk20a *g, struct warpstate *w_state)
+{
+	/* Check if we have at least one valid warp */
+	struct gr_gk20a *gr = &g->gr;
+	u32 gpc, tpc, sm_id;
+	u32 tpc_offset, gpc_offset, reg_offset;
+	u64 warps_valid = 0, warps_paused = 0, warps_trapped = 0;
+
+	for (sm_id = 0; sm_id < gr->no_of_sm; sm_id++) {
+		gpc = g->gr.sm_to_cluster[sm_id].gpc_index;
+		tpc = g->gr.sm_to_cluster[sm_id].tpc_index;
+
+		tpc_offset = proj_tpc_in_gpc_stride_v() * tpc;
+		gpc_offset = proj_gpc_stride_v() * gpc;
+		reg_offset = tpc_offset + gpc_offset;
+
+		/* 64 bit read */
+		warps_valid = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_warp_valid_mask_r() + reg_offset + 4) << 32;
+		warps_valid |= gk20a_readl(g, gr_gpc0_tpc0_sm_warp_valid_mask_r() + reg_offset);
+
+		/* 64 bit read */
+		warps_paused = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_pause_mask_r() + reg_offset + 4) << 32;
+		warps_paused |= gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_pause_mask_r() + reg_offset);
+
+		/* 64 bit read */
+		warps_trapped = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_r() + reg_offset + 4) << 32;
+		warps_trapped |= gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_r() + reg_offset);
+
+		w_state[sm_id].valid_warps[0] = warps_valid;
+		w_state[sm_id].trapped_warps[0] = warps_trapped;
+		w_state[sm_id].paused_warps[0] = warps_paused;
+	}
+
+	/* Only for debug purpose */
+	for (sm_id = 0; sm_id < gr->no_of_sm; sm_id++) {
+		gk20a_dbg_fn("w_state[%d].valid_warps[0]: %llx\n",
+			sm_id, w_state[sm_id].valid_warps[0]);
+		gk20a_dbg_fn("w_state[%d].trapped_warps[0]: %llx\n",
+			sm_id, w_state[sm_id].trapped_warps[0]);
+		gk20a_dbg_fn("w_state[%d].paused_warps[0]: %llx\n",
+			sm_id, w_state[sm_id].paused_warps[0]);
+	}
+}
+
 void gk20a_init_gr_ops(struct gpu_ops *gops)
 {
 	gops->gr.access_smpc_reg = gr_gk20a_access_smpc_reg;
@@ -7324,4 +7368,5 @@ void gk20a_init_gr_ops(struct gpu_ops *gops)
 	gops->gr.init_sm_dsm_reg_info = gr_gk20a_init_sm_dsm_reg_info;
 	gops->gr.wait_empty = gr_gk20a_wait_idle;
 	gops->gr.init_cyclestats = gr_gk20a_init_cyclestats;
+	gops->gr.bpt_reg_info = gr_gk20a_bpt_reg_info;
 }
diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
index d590f566..512c470d 100644
--- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
@@ -1050,6 +1050,83 @@ static void gr_gm20b_enable_cde_in_fecs(void *ctx_ptr)
 	gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_ctl_o(), 0, cde_v);
 }
 
+void gr_gm20b_bpt_reg_info(struct gk20a *g, struct warpstate *w_state)
+{
+	/* Check if we have at least one valid warp */
+	/* get paused state on maxwell */
+	struct gr_gk20a *gr = &g->gr;
+	u32 gpc, tpc, sm_id;
+	u32 tpc_offset, gpc_offset, reg_offset;
+	u64 warps_valid = 0, warps_paused = 0, warps_trapped = 0;
+
+	/* for maxwell & kepler */
+	u32 numSmPerTpc = 1;
+	u32 numWarpPerTpc = g->gpu_characteristics.sm_arch_warp_count * numSmPerTpc;
+
+	for (sm_id = 0; sm_id < gr->no_of_sm; sm_id++) {
+		gpc = g->gr.sm_to_cluster[sm_id].gpc_index;
+		tpc = g->gr.sm_to_cluster[sm_id].tpc_index;
+
+		tpc_offset = proj_tpc_in_gpc_stride_v() * tpc;
+		gpc_offset = proj_gpc_stride_v() * gpc;
+		reg_offset = tpc_offset + gpc_offset;
+
+		/* 64 bit read */
+		warps_valid = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_warp_valid_mask_r() + reg_offset + 4) << 32;
+		warps_valid |= gk20a_readl(g, gr_gpc0_tpc0_sm_warp_valid_mask_r() + reg_offset);
+
+		/* 64 bit read */
+		warps_paused = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_pause_mask_r() + reg_offset + 4) << 32;
+		warps_paused |= gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_pause_mask_r() + reg_offset);
+
+		/* 64 bit read */
+		warps_trapped = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_r() + reg_offset + 4) << 32;
+		warps_trapped |= gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_r() + reg_offset);
+
+		w_state[sm_id].valid_warps[0] = warps_valid;
+		w_state[sm_id].trapped_warps[0] = warps_trapped;
+		w_state[sm_id].paused_warps[0] = warps_paused;
+
+		if (numWarpPerTpc > 64) {
+			/* 64 bit read */
+			warps_valid = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_warp_valid_mask_2_r() + reg_offset + 4) << 32;
+			warps_valid |= gk20a_readl(g, gr_gpc0_tpc0_sm_warp_valid_mask_2_r() + reg_offset);
+
+			/* 64 bit read */
+			warps_paused = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_pause_mask_2_r() + reg_offset + 4) << 32;
+			warps_paused |= gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_pause_mask_2_r() + reg_offset);
+
+			/* 64 bit read */
+			warps_trapped = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_2_r() + reg_offset + 4) << 32;
+			warps_trapped |= gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_2_r() + reg_offset);
+
+			w_state[sm_id].valid_warps[1] = warps_valid;
+			w_state[sm_id].trapped_warps[1] = warps_trapped;
+			w_state[sm_id].paused_warps[1] = warps_paused;
+		}
+	}
+
+	/* Only for debug purpose */
+	for (sm_id = 0; sm_id < gr->no_of_sm; sm_id++) {
+		gk20a_dbg_fn("w_state[%d].valid_warps[0]: %llx\n",
+			sm_id, w_state[sm_id].valid_warps[0]);
+		gk20a_dbg_fn("w_state[%d].valid_warps[1]: %llx\n",
+			sm_id, w_state[sm_id].valid_warps[1]);
+
+		gk20a_dbg_fn("w_state[%d].trapped_warps[0]: %llx\n",
+			sm_id, w_state[sm_id].trapped_warps[0]);
+		gk20a_dbg_fn("w_state[%d].trapped_warps[1]: %llx\n",
+			sm_id, w_state[sm_id].trapped_warps[1]);
+
+		gk20a_dbg_fn("w_state[%d].paused_warps[0]: %llx\n",
+			sm_id, w_state[sm_id].paused_warps[0]);
+		gk20a_dbg_fn("w_state[%d].paused_warps[1]: %llx\n",
+			sm_id, w_state[sm_id].paused_warps[1]);
+	}
+}
+
 void gm20b_init_gr(struct gpu_ops *gops)
 {
 	gops->gr.init_gpc_mmu = gr_gm20b_init_gpc_mmu;
@@ -1106,4 +1183,5 @@ void gm20b_init_gr(struct gpu_ops *gops)
 	gops->gr.wait_empty = gr_gk20a_wait_idle;
 	gops->gr.init_cyclestats = gr_gm20b_init_cyclestats;
 	gops->gr.enable_cde_in_fecs = gr_gm20b_enable_cde_in_fecs;
+	gops->gr.bpt_reg_info = gr_gm20b_bpt_reg_info;
 }
diff --git a/drivers/gpu/nvgpu/gm20b/hw_gr_gm20b.h b/drivers/gpu/nvgpu/gm20b/hw_gr_gm20b.h
index 4a712394..05f6cae5 100644
--- a/drivers/gpu/nvgpu/gm20b/hw_gr_gm20b.h
+++ b/drivers/gpu/nvgpu/gm20b/hw_gr_gm20b.h
@@ -3130,15 +3130,15 @@ static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_sm_disable_f(void)
 {
 	return 0x0;
 }
-static inline u32 gr_gpc0_tpc0_sm_warp_valid_mask_r(void)
+static inline u32 gr_gpc0_tpc0_sm_warp_valid_mask_0_r(void)
 {
 	return 0x00504614;
 }
-static inline u32 gr_gpc0_tpc0_sm_dbgr_bpt_pause_mask_r(void)
+static inline u32 gr_gpc0_tpc0_sm_dbgr_bpt_pause_mask_0_r(void)
 {
 	return 0x00504624;
 }
-static inline u32 gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_r(void)
+static inline u32 gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_0_r(void)
 {
 	return 0x00504634;
 }
@@ -3150,6 +3150,18 @@ static inline u32 gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_stop_on_any_sm_disable_v(vo
 {
 	return 0x00000000;
 }
+static inline u32 gr_gpc0_tpc0_sm_warp_valid_mask_2_r(void)
+{
+	return 0x0050461c;
+}
+static inline u32 gr_gpc0_tpc0_sm_dbgr_bpt_pause_mask_2_r(void)
+{
+	return 0x00504750;
+}
+static inline u32 gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_2_r(void)
+{
+	return 0x00504758;
+}
 static inline u32 gr_gpc0_tpc0_sm_dbgr_status0_r(void)
 {
 	return 0x0050460c;
@@ -3626,6 +3638,18 @@ static inline u32 gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_disable_f(void)
 {
 	return 0x0;
 }
+static inline u32 gr_gpc0_tpc0_sm_warp_valid_mask_r(void)
+{
+	return 0x00504614;
+}
+static inline u32 gr_gpc0_tpc0_sm_dbgr_bpt_pause_mask_r(void)
+{
+	return 0x00504624;
+}
+static inline u32 gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_r(void)
+{
+	return 0x00504634;
+}
 static inline u32 gr_gpcs_tpcs_sm_dbgr_control0_run_trigger_m(void)
 {
 	return 0x1 << 30;
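
Note on the register access pattern used throughout this patch: each warp
mask (valid, paused, trapped) is a 64-bit value exposed as a pair of 32-bit
registers, with the low word at the mask register's address plus the per-SM
reg_offset and the high word 4 bytes above it. The per-SM offset itself is
gpc_offset + tpc_offset, i.e. proj_gpc_stride_v() * gpc +
proj_tpc_in_gpc_stride_v() * tpc, which is why the same gr_gpc0_tpc0_*
register constants can be reused for every SM. A minimal stand-alone C
sketch of the idiom (read32() is a hypothetical stand-in for gk20a_readl()
and is not part of this patch):

	#include <stdint.h>

	/* Hypothetical stand-in for the driver's gk20a_readl() accessor. */
	extern uint32_t read32(uint32_t addr);

	/*
	 * Assemble one 64-bit warp mask from its two 32-bit halves:
	 * the low word lives at mask_reg + reg_offset, the high word
	 * 4 bytes above it.
	 */
	static uint64_t read_warp_mask64(uint32_t mask_reg, uint32_t reg_offset)
	{
		uint64_t mask;

		mask = (uint64_t)read32(mask_reg + reg_offset + 4) << 32;
		mask |= read32(mask_reg + reg_offset);
		return mask;
	}

On gm20b, where numWarpPerTpc can exceed 64, the same two-read pattern is
applied a second time to the *_mask_2_r() registers to fill element [1] of
each w_state array.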