From 0d088ad70cb43e54661163971095409c76a79f51 Mon Sep 17 00:00:00 2001
From: Sagar Kamble
Date: Mon, 3 May 2021 23:17:16 +0530
Subject: gpu: nvgpu: wait for stalling interrupts to complete during TSG
 unbind preempt

Some of the engine stalling interrupts can block the context save off
the engine if not handled during fifo.preempt_tsg. They need to be
handled while polling for engine ctxsw status.

Bug 200711183
Bug 200726848

Change-Id: Ie45d76d9d1d8be3ffb842670843507f2d9aea6d0
Signed-off-by: Sagar Kamble
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2521971
(cherry picked from commit I7418a9e0354013b81fbefd8c0cab5068404fc44e)
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2523938
Reviewed-by: svc-mobile-coverity
Reviewed-by: Deepak Nibade
Reviewed-by: Bibek Basu
Reviewed-by: mobile promotions
Tested-by: mobile promotions
GVS: Gerrit_Virtual_Submit
---
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.c    | 24 +++++---
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.h    |  7 ++-
 drivers/gpu/nvgpu/gv11b/fifo_gv11b.c    | 97 ++++++++++++++++++++++-----------
 drivers/gpu/nvgpu/gv11b/fifo_gv11b.h    |  4 +-
 drivers/gpu/nvgpu/include/nvgpu/gk20a.h |  2 +-
 5 files changed, 90 insertions(+), 44 deletions(-)

diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index e91830f8..049b8da2 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -2981,7 +2981,7 @@ static u32 gk20a_fifo_get_preempt_timeout(struct gk20a *g)
 }
 
 int gk20a_fifo_is_preempt_pending(struct gk20a *g, u32 id,
-		unsigned int id_type)
+		unsigned int id_type, bool preempt_retries_left)
 {
 	struct nvgpu_timeout timeout;
 	u32 delay = GR_IDLE_CHECK_DEFAULT;
@@ -3037,7 +3037,8 @@ void gk20a_fifo_preempt_timeout_rc(struct gk20a *g, struct channel_gk20a *ch)
 			RC_TYPE_PREEMPT_TIMEOUT);
 }
 
-int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg)
+int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg,
+		bool preempt_retries_left)
 {
 	int ret;
 	unsigned int id_type;
@@ -3049,8 +3050,17 @@ int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg)
 
 	id_type = is_tsg ? ID_TYPE_TSG : ID_TYPE_CHANNEL;
 
-	/* wait for preempt */
-	ret = g->ops.fifo.is_preempt_pending(g, id, id_type);
+	/*
+	 * Poll for preempt done. if stalling interrupts are pending
+	 * while preempt is in progress we poll for stalling interrupts
+	 * to finish based on return value from this function and
+	 * retry preempt again.
+	 * If HW is hung, on the last retry instance we try to identify
+	 * the engines hung and set the runlist reset_eng_bitmask
+	 * and mark preemption completion.
+	 */
+	ret = g->ops.fifo.is_preempt_pending(g, id, id_type,
+			preempt_retries_left);
 
 	return ret;
 }
@@ -3072,7 +3082,7 @@ int gk20a_fifo_preempt_channel(struct gk20a *g, struct channel_gk20a *ch)
 
 	mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
 
-	ret = __locked_fifo_preempt(g, ch->chid, false);
+	ret = __locked_fifo_preempt(g, ch->chid, false, false);
 
 	if (!mutex_ret) {
 		nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
@@ -3112,7 +3122,7 @@ int gk20a_fifo_preempt_tsg(struct gk20a *g, struct tsg_gk20a *tsg)
 
 	mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
 
-	ret = __locked_fifo_preempt(g, tsg->tsgid, true);
+	ret = __locked_fifo_preempt(g, tsg->tsgid, true, false);
 
 	if (!mutex_ret) {
 		nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
@@ -3785,7 +3795,7 @@ static int __locked_fifo_reschedule_preempt_next(struct channel_gk20a *ch,
 			gk20a_readl(g, fifo_preempt_r()));
 #endif
 	if (wait_preempt) {
-		g->ops.fifo.is_preempt_pending(g, preempt_id, preempt_type);
+		g->ops.fifo.is_preempt_pending(g, preempt_id, preempt_type, false);
 	}
 #ifdef TRACEPOINTS_ENABLED
 	trace_gk20a_reschedule_preempted_next(ch->chid);
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
index 26365cae..078236d0 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -1,7 +1,7 @@
 /*
  * GK20A graphics fifo (gr host)
  *
- * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -388,8 +388,9 @@ void gk20a_fifo_channel_unbind(struct channel_gk20a *ch_gk20a);
 u32 gk20a_fifo_intr_0_error_mask(struct gk20a *g);
 
 int gk20a_fifo_is_preempt_pending(struct gk20a *g, u32 id,
-		unsigned int id_type);
-int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg);
+		unsigned int id_type, bool preempt_retries_left);
+int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg,
+		bool preempt_retries_left);
 void gk20a_fifo_preempt_timeout_rc_tsg(struct gk20a *g, struct tsg_gk20a *tsg);
 void gk20a_fifo_preempt_timeout_rc(struct gk20a *g, struct channel_gk20a *ch);
 int gk20a_fifo_setup_ramfc(struct channel_gk20a *c,
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
index be4d56a8..cc43ee33 100644
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
@@ -1,7 +1,7 @@
 /*
  * GV11B fifo
  *
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -498,7 +498,8 @@ static int gv11b_fifo_poll_pbdma_chan_status(struct gk20a *g, u32 id,
 }
 
 static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
-		u32 act_eng_id, u32 *reset_eng_bitmask)
+		u32 act_eng_id, u32 *reset_eng_bitmask,
+		bool preempt_retries_left)
 {
 	struct nvgpu_timeout timeout;
 	unsigned long delay = GR_IDLE_CHECK_DEFAULT; /* in micro seconds */
@@ -507,6 +508,7 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
 	int ret = -EBUSY;
 	unsigned int loop_count = 0;
 	u32 eng_intr_pending;
+	bool check_preempt_retry = false;
 
 	/* timeout in milli seconds */
 	nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g),
@@ -565,9 +567,7 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
			 fifo_engine_status_ctx_status_ctxsw_switch_v()) {
 			/* Eng save hasn't started yet. Continue polling */
 			if (eng_intr_pending) {
-				/* if eng intr, stop polling */
-				*reset_eng_bitmask |= BIT(act_eng_id);
-				ret = 0;
+				check_preempt_retry = true;
 				break;
 			}
 
@@ -578,9 +578,7 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
 
 			if (id == fifo_engine_status_id_v(eng_stat)) {
 				if (eng_intr_pending) {
-					/* preemption will not finish */
-					*reset_eng_bitmask |= BIT(act_eng_id);
-					ret = 0;
+					check_preempt_retry = true;
 					break;
 				}
 			} else {
@@ -594,9 +592,7 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
 
 			if (id == fifo_engine_status_next_id_v(eng_stat)) {
 				if (eng_intr_pending) {
-					/* preemption will not finish */
-					*reset_eng_bitmask |= BIT(act_eng_id);
-					ret = 0;
+					check_preempt_retry = true;
 					break;
 				}
 			} else {
@@ -606,8 +602,13 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
 			}
 
 		} else {
-			/* Preempt should be finished */
-			ret = 0;
+			if (eng_intr_pending) {
+				check_preempt_retry = true;
+			} else {
+				/* Preempt should be finished */
+				ret = 0;
+			}
+
 			break;
 		}
 		nvgpu_usleep_range(delay, delay * 2);
@@ -615,7 +616,19 @@
 		delay = min_t(unsigned long,
 				delay << 1, GR_IDLE_CHECK_MAX);
 	} while (!nvgpu_timeout_expired(&timeout));
-	if (ret) {
+
+	/* if eng intr, stop polling and check if we can retry preempts. */
+	if (check_preempt_retry) {
+		if (preempt_retries_left) {
+			ret = -EAGAIN;
+		} else {
+			/* preemption will not finish */
+			*reset_eng_bitmask |= BIT32(act_eng_id);
+			ret = 0;
+		}
+	}
+
+	if (ret && ret != -EAGAIN) {
 		/*
 		 * The reasons a preempt can fail are:
 		 * 1.Some other stalling interrupt is asserted preventing
@@ -770,7 +783,7 @@ static void gv11b_fifo_issue_runlist_preempt(struct gk20a *g,
 }
 
 int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id,
-		unsigned int id_type)
+		unsigned int id_type, bool preempt_retries_left)
 {
 	struct fifo_gk20a *f = &g->fifo;
 	unsigned long runlist_served_pbdmas;
@@ -778,7 +791,7 @@ int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id,
 	u32 pbdma_id;
 	u32 act_eng_id;
 	u32 runlist_id;
-	int ret = 0;
+	int err, ret = 0;
 	u32 tsgid;
 
 	if (id_type == ID_TYPE_TSG) {
@@ -795,14 +808,21 @@ int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id,
 	runlist_served_engines = f->runlist_info[runlist_id].eng_bitmask;
 
 	for_each_set_bit(pbdma_id, &runlist_served_pbdmas, f->num_pbdma) {
-		ret |= gv11b_fifo_poll_pbdma_chan_status(g, tsgid, pbdma_id);
+		err = gv11b_fifo_poll_pbdma_chan_status(g, tsgid, pbdma_id);
+		if (err != 0) {
+			ret = err;
+		}
 	}
 
 	f->runlist_info[runlist_id].reset_eng_bitmask = 0;
 
 	for_each_set_bit(act_eng_id, &runlist_served_engines, f->max_engines) {
-		ret |= gv11b_fifo_poll_eng_ctx_status(g, tsgid, act_eng_id,
-				&f->runlist_info[runlist_id].reset_eng_bitmask);
+		err = gv11b_fifo_poll_eng_ctx_status(g, tsgid, act_eng_id,
+				&f->runlist_info[runlist_id].reset_eng_bitmask,
+				preempt_retries_left);
+		if ((err != 0) && (ret == 0)) {
+			ret = err;
+		}
 	}
 	return ret;
 }
@@ -847,10 +867,13 @@ int gv11b_fifo_enable_tsg(struct tsg_gk20a *tsg)
 
 int gv11b_fifo_preempt_tsg(struct gk20a *g, struct tsg_gk20a *tsg)
 {
 	struct fifo_gk20a *f = &g->fifo;
-	u32 ret = 0;
+	int ret = 0;
 	u32 token = PMU_INVALID_MUTEX_OWNER_ID;
 	u32 mutex_ret = 0;
 	u32 runlist_id;
+	u32 preempt_retry_count = 10U;
+	u32 preempt_retry_timeout =
+			g->ops.fifo.get_preempt_timeout(g) / preempt_retry_count;
 
 	nvgpu_log_fn(g, "tsgid: %d", tsg->tsgid);
@@ -860,23 +883,35 @@ int gv11b_fifo_preempt_tsg(struct gk20a *g, struct tsg_gk20a *tsg)
 		return 0;
 	}
 
-	nvgpu_mutex_acquire(&f->runlist_info[runlist_id].runlist_lock);
+	do {
+		nvgpu_mutex_acquire(&f->runlist_info[runlist_id].runlist_lock);
 
-	/* WAR for Bug 2065990 */
-	gk20a_fifo_disable_tsg_sched(g, tsg);
+		/* WAR for Bug 2065990 */
+		gk20a_fifo_disable_tsg_sched(g, tsg);
 
-	mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
+		mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
 
-	ret = __locked_fifo_preempt(g, tsg->tsgid, true);
+		ret = __locked_fifo_preempt(g, tsg->tsgid, true,
+				preempt_retry_count > 1U);
 
-	if (!mutex_ret) {
-		nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
-	}
+		if (!mutex_ret) {
+			nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
+		}
+
+		/* WAR for Bug 2065990 */
+		gk20a_fifo_enable_tsg_sched(g, tsg);
 
-	/* WAR for Bug 2065990 */
-	gk20a_fifo_enable_tsg_sched(g, tsg);
+		nvgpu_mutex_release(&f->runlist_info[runlist_id].runlist_lock);
 
-	nvgpu_mutex_release(&f->runlist_info[runlist_id].runlist_lock);
+		if (ret != -EAGAIN) {
+			break;
+		}
+
+		ret = nvgpu_wait_for_stall_interrupts(g, preempt_retry_timeout);
+		if (ret != 0) {
+			nvgpu_log_info(g, "wait for stall interrupts failed %d", ret);
+		}
+	} while (--preempt_retry_count != 0U);
 
 	if (ret) {
 		if (nvgpu_platform_is_silicon(g)) {
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h
index abbf77a6..4e6bd6ba 100644
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h
@@ -1,7 +1,7 @@
 /*
  * GV11B Fifo
  *
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -80,7 +80,7 @@ void gv11b_dump_eng_status(struct gk20a *g,
 u32 gv11b_fifo_intr_0_error_mask(struct gk20a *g);
 int gv11b_fifo_reschedule_runlist(struct channel_gk20a *ch, bool preempt_next);
 int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id,
-		unsigned int id_type);
+		unsigned int id_type, bool preempt_retries_left);
 int gv11b_fifo_preempt_channel(struct gk20a *g, struct channel_gk20a *ch);
 int gv11b_fifo_preempt_tsg(struct gk20a *g, struct tsg_gk20a *tsg);
 int gv11b_fifo_enable_tsg(struct tsg_gk20a *tsg);
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
index 3b193dbe..d6d6e939 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
@@ -719,7 +719,7 @@ struct gpu_ops {
 				struct ch_state *ch_state);
 		u32 (*intr_0_error_mask)(struct gk20a *g);
 		int (*is_preempt_pending)(struct gk20a *g, u32 id,
-			unsigned int id_type);
+			unsigned int id_type, bool preempt_retries_left);
 		void (*init_pbdma_intr_descs)(struct fifo_gk20a *f);
 		int (*reset_enable_hw)(struct gk20a *g);
 		int (*setup_userd)(struct channel_gk20a *c);
--
cgit v1.2.2
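
The diff above makes gv11b_fifo_preempt_tsg() retry the whole preempt sequence
when the engine ctxsw-status poll bails out because a stalling interrupt is
pending. As a reading aid, here is a minimal, self-contained C sketch of that
retry shape under simplified assumptions: locked_preempt_stub() and
wait_for_stall_interrupts_stub() are hypothetical stand-ins for
__locked_fifo_preempt() and nvgpu_wait_for_stall_interrupts(), budget_ms stands
for g->ops.fifo.get_preempt_timeout(g), and the runlist lock, PMU mutex, and
TSG-scheduling WAR handling of the real driver are elided.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * Hypothetical stand-in for __locked_fifo_preempt(): while a stalling
 * interrupt is pending and retries remain, report -EAGAIN instead of
 * marking the engine for reset.
 */
static int locked_preempt_stub(unsigned int tsgid, bool retries_left)
{
	static int polls_with_stall_pending = 2;	/* simulated condition */

	(void)tsgid;
	if (polls_with_stall_pending > 0) {
		polls_with_stall_pending--;
		return retries_left ? -EAGAIN : 0;
	}
	return 0;
}

/* Hypothetical stand-in for nvgpu_wait_for_stall_interrupts(). */
static int wait_for_stall_interrupts_stub(unsigned int timeout_ms)
{
	printf("waiting up to %u ms for stalling interrupts\n", timeout_ms);
	return 0;
}

/*
 * Shape of the retry loop added to gv11b_fifo_preempt_tsg(): the preempt
 * timeout budget is split across preempt_retry_count attempts; -EAGAIN
 * means "stalling interrupt pending, let it be handled and try again".
 */
static int preempt_tsg_with_retries(unsigned int tsgid, unsigned int budget_ms)
{
	unsigned int preempt_retry_count = 10U;
	unsigned int preempt_retry_timeout = budget_ms / preempt_retry_count;
	int ret;

	do {
		/* locking and scheduling WARs from the real code elided */
		ret = locked_preempt_stub(tsgid, preempt_retry_count > 1U);
		if (ret != -EAGAIN) {
			break;
		}

		ret = wait_for_stall_interrupts_stub(preempt_retry_timeout);
		if (ret != 0) {
			printf("wait for stall interrupts failed %d\n", ret);
		}
	} while (--preempt_retry_count != 0U);

	return ret;
}

int main(void)
{
	return preempt_tsg_with_retries(0U, 3000U) != 0;
}

The key design point, per the commit's own comment, is that only the final
attempt (preempt_retry_count == 1, so retries_left is false) is allowed to mark
hung engines in reset_eng_bitmask and declare preemption complete; earlier
attempts return -EAGAIN so the caller can let stalling interrupts be handled
and try again.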