gpu: nvgpu: fix suspending all SMs

In gk20a_suspend_all_sms(), we currently loop over all GPCs and then loop over all TPCs in the inner loop. This is incorrect and leads to SMs being addressed with invalid GPC/TPC ids. Fix this by looping over the number of TPCs in each GPC in the inner loop.

Also, fix gk20a_gr_wait_for_sm_lock_down() as follows:
- we currently wait indefinitely for the SM to lock down; restrict this wait with a timeout on silicon platforms
- return ETIMEDOUT instead of EAGAIN
- add more debug prints with additional data for SM lock down failures

Bug 200258704
Change-Id: Id6fe32e579647fd8ac287a4b2ec80cbf98791e0d
Signed-off-by: Cory Perry <cperry@nvidia.com>
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: http://git-master/r/1316471
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
author: Cory Perry <cperry@nvidia.com> 2017-03-07 12:32:53 -0500
committer: mobile promotions <svcmobile_promotions@nvidia.com> 2017-03-14 14:46:52 -0400
commit: de568db9dee599fa27cdc2ead88186099fff3c3b (patch)
tree: e7f448c7b79aece79e4a98aef334f17d8fedca40 /drivers/gpu/nvgpu/gk20a/gr_gk20a.c
parent: 403874fa75dbb00e974a8d0f88b6e92be01ba42e (diff)
1 files changed, 31 insertions, 6 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index afa665ab..4dec9e99 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -8588,17 +8588,23 @@ int gk20a_gr_wait_for_sm_lock_down(struct gk20a *g, u32 gpc, u32 tpc,
        u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
        u32 offset =
                gpc_stride * gpc + tpc_in_gpc_stride * tpc;
+        u32 dbgr_status0 = 0, dbgr_control0 = 0;
+        u64 warps_valid = 0, warps_paused = 0, warps_trapped = 0;
+        struct nvgpu_timeout timeout;
        gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
                "GPC%d TPC%d: locking down SM", gpc, tpc);
+        nvgpu_timeout_init(g, &timeout, gk20a_get_gr_idle_timeout(g),
+                           NVGPU_TIMER_CPU_TIMER);
        /* wait for the sm to lock down */
        do {
                u32 global_esr = gk20a_readl(g,
                                gr_gpc0_tpc0_sm_hww_global_esr_r() + offset);
                u32 warp_esr = gk20a_readl(g,
                                gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset);
-                u32 dbgr_status0 = gk20a_readl(g,
+                dbgr_status0 = gk20a_readl(g,
                                gr_gpc0_tpc0_sm_dbgr_status0_r() + offset);
                warp_esr = g->ops.gr.mask_hww_warp_esr(warp_esr);
@@ -8630,13 +8636,32 @@ int gk20a_gr_wait_for_sm_lock_down(struct gk20a *g, u32 gpc, u32 tpc,
                usleep_range(delay, delay * 2);
                delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
-        } while (!locked_down);
+        } while (!nvgpu_timeout_expired(&timeout)
+                        || !tegra_platform_is_silicon());
+        dbgr_control0 = gk20a_readl(g,
+                                gr_gpc0_tpc0_sm_dbgr_control0_r() + offset);
+        /* 64 bit read */
+        warps_valid = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_warp_valid_mask_1_r() + offset) << 32;
+        warps_valid |= gk20a_readl(g, gr_gpc0_tpc0_sm_warp_valid_mask_r() + offset);
+        /* 64 bit read */
+        warps_paused = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_pause_mask_1_r() + offset) << 32;
+        warps_paused |= gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_pause_mask_r() + offset);
+        /* 64 bit read */
+        warps_trapped = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_1_r() + offset) << 32;
+        warps_trapped |= gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_r() + offset);
+        gk20a_err(dev_from_gk20a(g),
+                "GPC%d TPC%d: timed out while trying to lock down SM", gpc, tpc);
        gk20a_err(dev_from_gk20a(g),
-                  "GPC%d TPC%d: timed out while trying to lock down SM",
+                "STATUS0(0x%x)=0x%x CONTROL0=0x%x VALID_MASK=0x%llx PAUSE_MASK=0x%llx TRAP_MASK=0x%llx\n",
-                  gpc, tpc);
+                gr_gpc0_tpc0_sm_dbgr_status0_r() + offset, dbgr_status0, dbgr_control0,
+                warps_valid, warps_paused, warps_trapped);
-        return -EAGAIN;
+        return -ETIMEDOUT;
 }
 void gk20a_suspend_single_sm(struct gk20a *g,
@@ -8699,7 +8724,7 @@ void gk20a_suspend_all_sms(struct gk20a *g,
                gr_gpcs_tpcs_sm_dbgr_control0_r(), dbgr_control0);
        for (gpc = 0; gpc < gr->gpc_count; gpc++) {
-                for (tpc = 0; tpc < gr->tpc_count; tpc++) {
+                for (tpc = 0; tpc < gr_gk20a_get_tpc_count(gr, gpc); tpc++) {
                        err =
                         gk20a_gr_wait_for_sm_lock_down(g, gpc, tpc,
                                        global_esr_mask, check_errors);
author	Cory Perry <cperry@nvidia.com>	2017-03-07 12:32:53 -0500
committer	mobile promotions <svcmobile_promotions@nvidia.com>	2017-03-14 14:46:52 -0400
commit	de568db9dee599fa27cdc2ead88186099fff3c3b (patch)
tree	e7f448c7b79aece79e4a98aef334f17d8fedca40 /drivers/gpu/nvgpu/gk20a/gr_gk20a.c
parent	403874fa75dbb00e974a8d0f88b6e92be01ba42e (diff)