summaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
authorDebarshi Dutta <ddutta@nvidia.com>2019-04-30 05:03:05 -0400
committermobile promotions <svcmobile_promotions@nvidia.com>2019-05-09 17:41:50 -0400
commitbdaacf544127fcfaa474ccb5466aa93f81382416 (patch)
tree1a5ed5825b3680d5e199ee98ba85d0497c11d208 /drivers
parentc81cc032c48a1b25e095b17b77399166c9091ff3 (diff)
gpu: nvgpu: disable elpg before ctxsw_disable
If FECS is sent the stop_ctxsw method, ELPG entry/exit cannot happen and may time out. This can manifest as different error signatures depending on when the stop_ctxsw FECS method is sent relative to the PMU ELPG sequence: it could appear as a PMU halt, an abort, or possibly an ext error. Therefore, disable ELPG before sending stop_ctxsw and re-enable it after start_ctxsw. If ctxsw fails to disable, do not read engine info and just abort the TSG. Bug 2092051 Bug 2429295 Bug 2484211 Bug 1890287 Change-Id: I5f3ba07663bcafd3f0083d44c603420b0ccf6945 Signed-off-by: Seema Khowala <seemaj@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/2014914 Signed-off-by: Debarshi Dutta <ddutta@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/2018156 GVS: Gerrit_Virtual_Submit Reviewed-by: Bibek Basu <bbasu@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Diffstat (limited to 'drivers')
-rw-r--r--drivers/gpu/nvgpu/gk20a/fifo_gk20a.c35
-rw-r--r--drivers/gpu/nvgpu/gk20a/gr_gk20a.c43
2 files changed, 72 insertions, 6 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index 6d89940a..b96372b4 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -1943,14 +1943,42 @@ void gk20a_fifo_recover_ch(struct gk20a *g, struct channel_gk20a *ch,
1943void gk20a_fifo_recover_tsg(struct gk20a *g, struct tsg_gk20a *tsg, 1943void gk20a_fifo_recover_tsg(struct gk20a *g, struct tsg_gk20a *tsg,
1944 bool verbose, u32 rc_type) 1944 bool verbose, u32 rc_type)
1945{ 1945{
1946 u32 engines; 1946 u32 engines = 0U;
1947 int err;
1947 1948
1948 /* stop context switching to prevent engine assignments from 1949 /* stop context switching to prevent engine assignments from
1949 changing until TSG is recovered */ 1950 changing until TSG is recovered */
1950 nvgpu_mutex_acquire(&g->dbg_sessions_lock); 1951 nvgpu_mutex_acquire(&g->dbg_sessions_lock);
1951 gr_gk20a_disable_ctxsw(g);
1952 1952
1953 engines = gk20a_fifo_engines_on_id(g, tsg->tsgid, true); 1953 /* disable tsg so that it does not get scheduled again */
1954 g->ops.fifo.disable_tsg(tsg);
1955
1956 /*
1957 * stop context switching to prevent engine assignments from
1958 * changing until engine status is checked to make sure tsg
1959 * being recovered is not loaded on the engines
1960 */
1961 err = gr_gk20a_disable_ctxsw(g);
1962
1963 if (err != 0) {
1964 /* if failed to disable ctxsw, just abort tsg */
1965 nvgpu_err(g, "failed to disable ctxsw");
1966 } else {
1967 /* recover engines if tsg is loaded on the engines */
1968 engines = gk20a_fifo_engines_on_id(g, tsg->tsgid, true);
1969
1970 /*
1971 * it is ok to enable ctxsw before tsg is recovered. If engines
1972 * is 0, no engine recovery is needed and if it is non zero,
1973 * gk20a_fifo_recover will call get_engines_mask_on_id again.
1974 * By that time if tsg is not on the engine, engine need not
1975 * be reset.
1976 */
1977 err = gr_gk20a_enable_ctxsw(g);
1978 if (err != 0) {
1979 nvgpu_err(g, "failed to enable ctxsw");
1980 }
1981 }
1954 1982
1955 if (engines) { 1983 if (engines) {
1956 gk20a_fifo_recover(g, engines, tsg->tsgid, true, true, verbose, 1984 gk20a_fifo_recover(g, engines, tsg->tsgid, true, true, verbose,
@@ -1963,7 +1991,6 @@ void gk20a_fifo_recover_tsg(struct gk20a *g, struct tsg_gk20a *tsg,
1963 gk20a_fifo_abort_tsg(g, tsg, false); 1991 gk20a_fifo_abort_tsg(g, tsg, false);
1964 } 1992 }
1965 1993
1966 gr_gk20a_enable_ctxsw(g);
1967 nvgpu_mutex_release(&g->dbg_sessions_lock); 1994 nvgpu_mutex_release(&g->dbg_sessions_lock);
1968} 1995}
1969 1996
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index a4c1ce58..788ebf45 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -628,7 +628,14 @@ static int gr_gk20a_ctrl_ctxsw(struct gk20a *g, u32 fecs_method, u32 *ret)
628 .cond.fail = GR_IS_UCODE_OP_EQUAL }, true); 628 .cond.fail = GR_IS_UCODE_OP_EQUAL }, true);
629} 629}
630 630
631/* Stop processing (stall) context switches at FECS. */ 631/**
632 * Stop processing (stall) context switches at FECS:-
633 * If fecs is sent stop_ctxsw method, elpg entry/exit cannot happen
634 * and may timeout. It could manifest as different error signatures
635 * depending on when stop_ctxsw fecs method gets sent with respect
636 * to pmu elpg sequence. It could come as pmu halt or abort or
637 * maybe ext error too.
638*/
632int gr_gk20a_disable_ctxsw(struct gk20a *g) 639int gr_gk20a_disable_ctxsw(struct gk20a *g)
633{ 640{
634 int err = 0; 641 int err = 0;
@@ -638,8 +645,24 @@ int gr_gk20a_disable_ctxsw(struct gk20a *g)
638 nvgpu_mutex_acquire(&g->ctxsw_disable_lock); 645 nvgpu_mutex_acquire(&g->ctxsw_disable_lock);
639 g->ctxsw_disable_count++; 646 g->ctxsw_disable_count++;
640 if (g->ctxsw_disable_count == 1) { 647 if (g->ctxsw_disable_count == 1) {
641 err = gr_gk20a_ctrl_ctxsw(g, 648 err = nvgpu_pg_elpg_disable(g);
649 if (err != 0) {
650 nvgpu_err(g, "failed to disable elpg. not safe to "
651 "stop_ctxsw");
652 /* stop ctxsw command is not sent */
653 g->ctxsw_disable_count--;
654 } else {
655 err = gr_gk20a_ctrl_ctxsw(g,
642 gr_fecs_method_push_adr_stop_ctxsw_v(), NULL); 656 gr_fecs_method_push_adr_stop_ctxsw_v(), NULL);
657 if (err != 0) {
658 nvgpu_err(g, "failed to stop fecs ctxsw");
659 /* stop ctxsw failed */
660 g->ctxsw_disable_count--;
661 }
662 }
663 } else {
664 nvgpu_log_info(g, "ctxsw disabled, ctxsw_disable_count: %d",
665 g->ctxsw_disable_count);
643 } 666 }
644 nvgpu_mutex_release(&g->ctxsw_disable_lock); 667 nvgpu_mutex_release(&g->ctxsw_disable_lock);
645 668
@@ -654,12 +677,28 @@ int gr_gk20a_enable_ctxsw(struct gk20a *g)
654 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " "); 677 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");
655 678
656 nvgpu_mutex_acquire(&g->ctxsw_disable_lock); 679 nvgpu_mutex_acquire(&g->ctxsw_disable_lock);
680
681 if (g->ctxsw_disable_count == 0) {
682 goto ctxsw_already_enabled;
683 }
657 g->ctxsw_disable_count--; 684 g->ctxsw_disable_count--;
658 WARN_ON(g->ctxsw_disable_count < 0); 685 WARN_ON(g->ctxsw_disable_count < 0);
659 if (g->ctxsw_disable_count == 0) { 686 if (g->ctxsw_disable_count == 0) {
660 err = gr_gk20a_ctrl_ctxsw(g, 687 err = gr_gk20a_ctrl_ctxsw(g,
661 gr_fecs_method_push_adr_start_ctxsw_v(), NULL); 688 gr_fecs_method_push_adr_start_ctxsw_v(), NULL);
689 if (err != 0) {
690 nvgpu_err(g, "failed to start fecs ctxsw");
691 } else {
692 if (nvgpu_pg_elpg_enable(g) != 0) {
693 nvgpu_err(g, "failed to enable elpg "
694 "after start_ctxsw");
695 }
696 }
697 } else {
698 nvgpu_log_info(g, "ctxsw_disable_count: %d is not 0 yet",
699 g->ctxsw_disable_count);
662 } 700 }
701ctxsw_already_enabled:
663 nvgpu_mutex_release(&g->ctxsw_disable_lock); 702 nvgpu_mutex_release(&g->ctxsw_disable_lock);
664 703
665 return err; 704 return err;