path: root/drivers/gpu/nvgpu/gk20a
author    Vijayakumar <vsubbu@nvidia.com>	2015-08-04 07:44:54 -0400
committer Terje Bergstrom <tbergstrom@nvidia.com>	2015-09-16 12:44:00 -0400
commit    b8faddfe2ad3d52837b0f766d74feb8e6d6f4ce5 (patch)
tree      9cf25fcdd5e9ac2ff870f78c35213ce46b90ac7a /drivers/gpu/nvgpu/gk20a
parent    2359f247d18fbde3220e463543193ab06f75fe81 (diff)
gpu: nvgpu: fix runlist update timeout handling
bug 1625901

1) disable ELPG before doing GR reset when runlist update times out
2) add mutex for GR reset to avoid multiple threads resetting GR
3) protect GR reset with FECS mutex so that no one else submits methods

Change-Id: I02993fd1eabe6875ab1c58a40a06e6c79fcdeeae
Signed-off-by: Vijayakumar <vsubbu@nvidia.com>
Reviewed-on: http://git-master/r/793643
Reviewed-by: Automatic_Commit_Validation_User
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
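Fixes 2 and 3 share one idiom: sample the reset mutex before taking it, then skip the reset if the lock was already held, since whoever held it was performing the reset and waiting for the lock is enough. A minimal kernel-C sketch of that guard; reset_gr_once() and do_gr_reset() are hypothetical names, not nvgpu functions:

	#include <linux/mutex.h>

	static DEFINE_MUTEX(gr_reset_mutex);

	static void do_gr_reset(void);	/* hypothetical reset helper */

	static void reset_gr_once(void)
	{
		/* if the mutex is already held, a reset is in flight */
		bool was_reset = mutex_is_locked(&gr_reset_mutex);

		mutex_lock(&gr_reset_mutex);
		if (!was_reset)
			do_gr_reset();
		mutex_unlock(&gr_reset_mutex);
	}

Note that the sample-then-lock pair is not atomic: two threads that both observe the mutex unlocked will both run the reset, back to back, serialized by the lock. The guard only collapses resets that overlap one already in progress, which is all this bug requires.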
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a')
-rw-r--r--	drivers/gpu/nvgpu/gk20a/channel_gk20a.c	12
-rw-r--r--	drivers/gpu/nvgpu/gk20a/fifo_gk20a.c	31
-rw-r--r--	drivers/gpu/nvgpu/gk20a/fifo_gk20a.h	1
-rw-r--r--	drivers/gpu/nvgpu/gk20a/gr_gk20a.c	22
4 files changed, 47 insertions, 19 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index dae9c8cb..c18a4e5d 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -719,7 +719,7 @@ static void gk20a_free_channel(struct channel_gk20a *ch)
 	struct vm_gk20a *ch_vm = ch->vm;
 	unsigned long timeout = gk20a_get_gr_idle_timeout(g);
 	struct dbg_session_gk20a *dbg_s;
-
+	bool was_reset;
 	gk20a_dbg_fn("");
 
 	WARN_ON(ch->g == NULL);
@@ -764,7 +764,15 @@ static void gk20a_free_channel(struct channel_gk20a *ch)
 	if (g->fifo.deferred_reset_pending) {
 		gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "engine reset was"
 			" deferred, running now");
-		gk20a_fifo_reset_engine(g, g->fifo.deferred_fault_engines);
+		was_reset = mutex_is_locked(&g->fifo.gr_reset_mutex);
+		mutex_lock(&g->fifo.gr_reset_mutex);
+		/* if lock is already taken, a reset is taking place
+		   so no need to repeat */
+		if (!was_reset) {
+			gk20a_fifo_reset_engine(g,
+				g->fifo.deferred_fault_engines);
+		}
+		mutex_unlock(&g->fifo.gr_reset_mutex);
 		g->fifo.deferred_fault_engines = 0;
 		g->fifo.deferred_reset_pending = false;
 	}
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index 68c0ddcb..0bd75026 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -476,6 +476,7 @@ static int gk20a_init_fifo_setup_sw(struct gk20a *g)
 	f->g = g;
 
 	mutex_init(&f->intr.isr.mutex);
+	mutex_init(&f->gr_reset_mutex);
 	gk20a_init_fifo_pbdma_intr_descs(f); /* just filling in data/tables */
 
 	f->num_channels = g->ops.fifo.get_num_fifos(g);
@@ -767,12 +768,15 @@ void gk20a_fifo_reset_engine(struct gk20a *g, u32 engine_id)
 	gk20a_dbg_fn("");
 
 	if (engine_id == top_device_info_type_enum_graphics_v()) {
-		/*HALT_PIPELINE method, halt GR engine*/
-		if (gr_gk20a_halt_pipe(g))
-			gk20a_err(dev_from_gk20a(g), "failed to HALT gr pipe");
-		/* resetting engine using mc_enable_r() is not enough,
-		 * we do full init sequence */
-		gk20a_gr_reset(g);
+		if (support_gk20a_pmu(g->dev) && g->elpg_enabled)
+			gk20a_pmu_disable_elpg(g);
+		/*HALT_PIPELINE method, halt GR engine*/
+		if (gr_gk20a_halt_pipe(g))
+			gk20a_err(dev_from_gk20a(g),
+				"failed to HALT gr pipe");
+		/* resetting engine using mc_enable_r() is not
+		   enough, we do full init sequence */
+		gk20a_gr_reset(g);
 	}
 	if (engine_id == top_device_info_type_enum_copy0_v())
 		gk20a_reset(g, mc_enable_ce2_m());
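The hunk above is fix 1: with ELPG (engine-level power gating) left enabled, the PMU can power-gate the GR engine while the halt/reset sequence runs, so ELPG has to be disabled first. The ordering, as a sketch with hypothetical types and helpers (only support_gk20a_pmu(), gk20a_pmu_disable_elpg(), gr_gk20a_halt_pipe() and gk20a_gr_reset() in the hunk are real nvgpu calls):

	#include <linux/printk.h>

	struct gpu { bool elpg_enabled; };

	static bool pmu_present(struct gpu *g);
	static void pmu_disable_elpg(struct gpu *g);
	static int halt_gr_pipe(struct gpu *g);
	static int full_gr_init(struct gpu *g);

	static int reset_gr_engine(struct gpu *g)
	{
		/* 1. leave ELPG first so power-gating cannot race the reset */
		if (pmu_present(g) && g->elpg_enabled)
			pmu_disable_elpg(g);

		/* 2. halt the GR pipeline via the HALT_PIPELINE method */
		if (halt_gr_pipe(g))
			pr_err("failed to HALT gr pipe\n");

		/* 3. mc_enable_r() alone is not enough: full init sequence */
		return full_gr_init(g);
	}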
@@ -950,6 +954,7 @@ static bool gk20a_fifo_handle_mmu_fault(
 	struct channel_gk20a *ch = NULL;
 	struct tsg_gk20a *tsg = NULL;
 	struct channel_gk20a *referenced_channel = NULL;
+	bool was_reset;
 	/* read and parse engine status */
 	u32 status = gk20a_readl(g, fifo_engine_status_r(engine_id));
 	u32 ctx_status = fifo_engine_status_ctx_status_v(status);
@@ -1029,9 +1034,15 @@ static bool gk20a_fifo_handle_mmu_fault(
 
 		/* handled during channel free */
 		g->fifo.deferred_reset_pending = true;
-	} else if (engine_id != ~0)
-		gk20a_fifo_reset_engine(g, engine_id);
-
+	} else if (engine_id != ~0) {
+		was_reset = mutex_is_locked(&g->fifo.gr_reset_mutex);
+		mutex_lock(&g->fifo.gr_reset_mutex);
+		/* if lock is already taken, a reset is taking place
+		   so no need to repeat */
+		if (!was_reset)
+			gk20a_fifo_reset_engine(g, engine_id);
+		mutex_unlock(&g->fifo.gr_reset_mutex);
+	}
 	/* disable the channel/TSG from hw and increment
 	 * syncpoints */
 
@@ -2120,12 +2131,10 @@ static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
 		gk20a_fifo_runlist_reset_engines(g, runlist_id);
 
 		/* engine reset needs the lock. drop it */
-		mutex_unlock(&runlist->mutex);
 		/* wait until the runlist is active again */
 		ret = gk20a_fifo_runlist_wait_pending(g, runlist_id);
 		/* get the lock back. at this point everything should
 		 * should be fine */
-		mutex_lock(&runlist->mutex);
 
 		if (ret)
 			gk20a_err(dev_from_gk20a(g),
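The last fifo_gk20a.c hunk drops the unlock/relock of runlist->mutex around the wait. Previously the runlist lock was released so the engine-reset path could proceed; now that GR reset is serialized by gr_reset_mutex (and by fecs_mutex inside gk20a_gr_reset()), the recovery can apparently hold the runlist lock across the wait and keep other updaters out. Note the two comments about dropping and retaking the lock survive even though the unlock/lock pair is gone. A sketch of the resulting timeout path, with hypothetical names and return conventions:

	#include <linux/errno.h>
	#include <linux/types.h>

	struct gpu;

	static int submit_runlist_and_wait(struct gpu *g, u32 runlist_id);
	static void reset_runlist_engines(struct gpu *g, u32 runlist_id);
	static int wait_runlist_pending(struct gpu *g, u32 runlist_id);

	/* hypothetical shape of the (already locked) runlist update */
	static int update_runlist_locked(struct gpu *g, u32 runlist_id)
	{
		int ret = submit_runlist_and_wait(g, runlist_id);

		if (ret == -ETIMEDOUT) {
			/* reset the engines behind this runlist; the GR
			   case is serialized by gr_reset_mutex inside */
			reset_runlist_engines(g, runlist_id);
			/* wait for the runlist to go idle again, still
			   holding the caller's runlist mutex */
			ret = wait_runlist_pending(g, runlist_id);
		}
		return ret;
	}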
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
index 7385f9be..3eb193f6 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -109,6 +109,7 @@ struct fifo_gk20a {
 	/* zero-kref'd channels here */
 	struct list_head free_chs;
 	struct mutex free_chs_mutex;
+	struct mutex gr_reset_mutex;
 
 	struct tsg_gk20a *tsg;
 	struct mutex tsg_inuse_mutex;
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index 512a7d6b..0ae44c6f 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -4320,12 +4320,6 @@ static int gr_gk20a_init_ctxsw(struct gk20a *g)
 	if (err)
 		goto out;
 
-	/* this appears query for sw states but fecs actually init
-	   ramchain, etc so this is hw init */
-	err = g->ops.gr.init_ctx_state(g);
-	if (err)
-		goto out;
-
 out:
 	if (err)
 		gk20a_err(dev_from_gk20a(g), "fail");
@@ -4553,6 +4547,12 @@ int gk20a_init_gr_support(struct gk20a *g)
 	if (err)
 		return err;
 
+	/* this appears query for sw states but fecs actually init
+	   ramchain, etc so this is hw init */
+	err = g->ops.gr.init_ctx_state(g);
+	if (err)
+		return err;
+
 	err = gk20a_init_gr_setup_sw(g);
 	if (err)
 		return err;
@@ -4776,6 +4776,8 @@ int gk20a_gr_reset(struct gk20a *g)
 	int err;
 	u32 size;
 
+	mutex_lock(&g->gr.fecs_mutex);
+
 	err = gk20a_enable_gr_hw(g);
 	if (err)
 		return err;
@@ -4788,6 +4790,14 @@ int gk20a_gr_reset(struct gk20a *g)
 	if (err)
 		return err;
 
+	mutex_unlock(&g->gr.fecs_mutex);
+
+	/* this appears query for sw states but fecs actually init
+	   ramchain, etc so this is hw init */
+	err = g->ops.gr.init_ctx_state(g);
+	if (err)
+		return err;
+
 	size = 0;
 	err = gr_gk20a_fecs_get_reglist_img_size(g, &size);
 	if (err) {
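The gr_gk20a.c hunks implement fix 3 plus a relocation: gk20a_gr_reset() now holds g->gr.fecs_mutex across the whole re-init so no other thread can submit FECS methods mid-reset, and init_ctx_state() moves out from under the lock (and out of gr_gk20a_init_ctxsw() into gk20a_init_gr_support()). The likely reason for the move is that the FECS method-submission helper in nvgpu takes fecs_mutex itself, so calling init_ctx_state() with the lock held would self-deadlock. One caveat visible in the hunks: the early error returns between the lock and unlock exit with fecs_mutex still held. A simplified sketch of the intended flow, assuming a hypothetical reinit_gr_hw() for the enable/load steps in between, and releasing the lock on error (which the hunk as shown does not):

	static int reinit_gr_hw(struct gk20a *g);	/* hypothetical */

	static int gr_reset_sketch(struct gk20a *g)
	{
		int err;

		mutex_lock(&g->gr.fecs_mutex);	/* block FECS method submission */
		err = reinit_gr_hw(g);		/* full init sequence */
		mutex_unlock(&g->gr.fecs_mutex);
		if (err)
			return err;

		/* init_ctx_state() submits FECS methods and takes fecs_mutex
		   internally, so it must run after the unlock */
		return g->ops.gr.init_ctx_state(g);
	}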