From 1ab4754c052b639427f38202860d064c2fa03b57 Mon Sep 17 00:00:00 2001
From: Mahantesh Kumbar
Date: Fri, 20 Oct 2017 16:30:42 +0530
Subject: gpu: nvgpu: Kill pg init thread if pmu boot fails

- Created nvgpu_kill_task_pg_init() method to set
  pmu state to PMU_STATE_EXIT & make thread stop, and
  poll to confirm thread stopped.
- Check for PMU/SEC2 ACR secure boot completion status
  & initiate pg init thread kill if ACR boot exits with
  error, which fails to validate & boot LS-PMU.
- Set pmu state to PMU_STATE_OFF after thread kill
  during ACR boot failure.

Issue: pg init task blocks if PMU boot fails &
cause kernel to show message
"task nvgpu_pg_init_g:2120 blocked for more than 120 seconds"

Bug 200346134

Change-Id: I5270426080dcd628ccca4df798005294c19767a0
Signed-off-by: Mahantesh Kumbar
Reviewed-on: https://git-master.nvidia.com/r/1582593
Reviewed-by: Automatic_Commit_Validation_User
GVS: Gerrit_Virtual_Submit
Reviewed-by: Vijayakumar Subbu
Reviewed-by: mobile promotions
Tested-by: mobile promotions
---
 drivers/gpu/nvgpu/common/pmu/pmu.c    | 45 ++++++++++++++++++++---------------
 drivers/gpu/nvgpu/gm20b/acr_gm20b.c   | 11 +++++++--
 drivers/gpu/nvgpu/gp106/sec2_gp106.c  | 10 ++++++--
 drivers/gpu/nvgpu/include/nvgpu/pmu.h |  1 +
 4 files changed, 44 insertions(+), 23 deletions(-)

(limited to 'drivers/gpu')

diff --git a/drivers/gpu/nvgpu/common/pmu/pmu.c b/drivers/gpu/nvgpu/common/pmu/pmu.c
index 3447f40d..d595097b 100644
--- a/drivers/gpu/nvgpu/common/pmu/pmu.c
+++ b/drivers/gpu/nvgpu/common/pmu/pmu.c
@@ -155,6 +155,31 @@ static int nvgpu_init_task_pg_init(struct gk20a *g)
 	return err;
 }
 
+void nvgpu_kill_task_pg_init(struct gk20a *g)
+{
+	struct nvgpu_pmu *pmu = &g->pmu;
+	struct nvgpu_timeout timeout;
+
+	/* make sure the pending operations are finished before we continue */
+	if (nvgpu_thread_is_running(&pmu->pg_init.state_task)) {
+
+		/* post PMU_STATE_EXIT to exit PMU state machine loop */
+		nvgpu_pmu_state_change(g, PMU_STATE_EXIT, true);
+
+		/* Make thread stop*/
+		nvgpu_thread_stop(&pmu->pg_init.state_task);
+
+		/* wait to confirm thread stopped */
+		nvgpu_timeout_init(g, &timeout, 1000, NVGPU_TIMER_RETRY_TIMER);
+		do {
+			if (!nvgpu_thread_is_running(&pmu->pg_init.state_task))
+				break;
+			nvgpu_udelay(2);
+		} while (!nvgpu_timeout_expired_msg(&timeout,
+			"timeout - waiting PMU state machine thread stop"));
+	}
+}
+
 static int nvgpu_init_pmu_setup_sw(struct gk20a *g)
 {
 	struct nvgpu_pmu *pmu = &g->pmu;
@@ -469,7 +494,6 @@ int nvgpu_pmu_destroy(struct gk20a *g)
 {
 	struct nvgpu_pmu *pmu = &g->pmu;
 	struct pmu_pg_stats_data pg_stat_data = { 0 };
-	struct nvgpu_timeout timeout;
 	int i;
 
 	nvgpu_log_fn(g, " ");
@@ -477,24 +501,7 @@ int nvgpu_pmu_destroy(struct gk20a *g)
 	if (!g->support_pmu)
 		return 0;
 
-	/* make sure the pending operations are finished before we continue */
-	if (nvgpu_thread_is_running(&pmu->pg_init.state_task)) {
-
-		/* post PMU_STATE_EXIT to exit PMU state machine loop */
-		nvgpu_pmu_state_change(g, PMU_STATE_EXIT, true);
-
-		/* Make thread stop*/
-		nvgpu_thread_stop(&pmu->pg_init.state_task);
-
-		/* wait to confirm thread stopped */
-		nvgpu_timeout_init(g, &timeout, 1000, NVGPU_TIMER_RETRY_TIMER);
-		do {
-			if (!nvgpu_thread_is_running(&pmu->pg_init.state_task))
-				break;
-			nvgpu_udelay(2);
-		} while (!nvgpu_timeout_expired_msg(&timeout,
-			"timeout - waiting PMU state machine thread stop"));
-	}
+	nvgpu_kill_task_pg_init(g);
 
 	nvgpu_pmu_get_pg_stats(g, PMU_PG_ELPG_ENGINE_ID_GRAPHICS,
 		&pg_stat_data);
diff --git a/drivers/gpu/nvgpu/gm20b/acr_gm20b.c b/drivers/gpu/nvgpu/gm20b/acr_gm20b.c
index a39cdf2c..e5fd8692 100644
--- a/drivers/gpu/nvgpu/gm20b/acr_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/acr_gm20b.c
@@ -1407,12 +1407,12 @@ int pmu_wait_for_halt(struct gk20a *g, unsigned int timeout_ms)
 {
 	struct nvgpu_pmu *pmu = &g->pmu;
 	u32 data = 0;
-	int ret = -EBUSY;
+	int ret = 0;
 
 	ret = nvgpu_flcn_wait_for_halt(pmu->flcn, timeout_ms);
 	if (ret) {
 		nvgpu_err(g, "ACR boot timed out");
-		return ret;
+		goto exit;
 	}
 
 	g->acr.capabilities = gk20a_readl(g, pwr_falcon_mailbox1_r());
@@ -1421,6 +1421,13 @@ int pmu_wait_for_halt(struct gk20a *g, unsigned int timeout_ms)
 	if (data) {
 		nvgpu_err(g, "ACR boot failed, err %x", data);
 		ret = -EAGAIN;
+		goto exit;
+	}
+
+exit:
+	if (ret) {
+		nvgpu_kill_task_pg_init(g);
+		nvgpu_pmu_state_change(g, PMU_STATE_OFF, false);
 	}
 
 	return ret;
diff --git a/drivers/gpu/nvgpu/gp106/sec2_gp106.c b/drivers/gpu/nvgpu/gp106/sec2_gp106.c
index 26ded39e..332ac794 100644
--- a/drivers/gpu/nvgpu/gp106/sec2_gp106.c
+++ b/drivers/gpu/nvgpu/gp106/sec2_gp106.c
@@ -52,20 +52,26 @@ int sec2_wait_for_halt(struct gk20a *g, unsigned int timeout)
 	completion = nvgpu_flcn_wait_for_halt(&g->sec2_flcn, timeout);
 	if (completion) {
 		nvgpu_err(g, "ACR boot timed out");
-		return completion;
+		goto exit;
 	}
 
 	g->acr.capabilities = gk20a_readl(g, psec_falcon_mailbox1_r());
 	gm20b_dbg_pmu("ACR capabilities %x\n", g->acr.capabilities);
 	data = gk20a_readl(g, psec_falcon_mailbox0_r());
 	if (data) {
-
 		nvgpu_err(g, "ACR boot failed, err %x", data);
 		completion = -EAGAIN;
+		goto exit;
 	}
 
 	init_pmu_setup_hw1(g);
 
+exit:
+	if (completion) {
+		nvgpu_kill_task_pg_init(g);
+		nvgpu_pmu_state_change(g, PMU_STATE_OFF, false);
+	}
+
 	return completion;
 }
 
diff --git a/drivers/gpu/nvgpu/include/nvgpu/pmu.h b/drivers/gpu/nvgpu/include/nvgpu/pmu.h
index 045bf34c..a818f3d0 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/pmu.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/pmu.h
@@ -421,6 +421,7 @@ int nvgpu_pmu_process_init_msg(struct nvgpu_pmu *pmu,
 
 void nvgpu_pmu_state_change(struct gk20a *g, u32 pmu_state,
 	bool post_change_event);
+void nvgpu_kill_task_pg_init(struct gk20a *g);
 
 /* NVGPU-PMU MEM alloc */
 void nvgpu_pmu_surface_free(struct gk20a *g, struct nvgpu_mem *mem);
-- 
cgit v1.2.2