diff options
author | Richard Zhao <rizhao@nvidia.com> | 2017-10-30 01:30:04 -0400 |
---|---|---|
committer | mobile promotions <svcmobile_promotions@nvidia.com> | 2017-11-01 22:06:45 -0400 |
commit | 5eedf06bf56489bc559a08347f60a7680ccd6897 (patch) | |
tree | 1fe1e6eb8ead4c55166c1607e599d548093a2f40 /drivers | |
parent | 88ee812d56333375f7ae44e28b483c1a161d75da (diff) |
gpu: nvgpu: vgpu: set mmu error for all channels of a tsg at once
In the current code, vgpu sets the error notifier only for the reporting
channel but aborts the whole TSG. When the TSG is aborted, all channels of
the TSG are supposed to have had their error notifiers set. Set it for all
channels once any of the channels gets an MMU fault.
For now, the RM server still reports a TSG MMU fault once per channel.
We may optimize this in the future.
Jira VFND-3798
Change-Id: I6deaca55e7420899af8eabec72ad888d2726ad3c
Signed-off-by: Richard Zhao <rizhao@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1588098
Reviewed-by: Aingara Paramakuru <aparamakuru@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Alex Waterman <alexw@nvidia.com>
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/gpu/nvgpu/vgpu/fifo_vgpu.c | 28 |
1 files changed, 26 insertions, 2 deletions
diff --git a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c index 2874e256..121a52f1 100644 --- a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c +++ b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c | |||
@@ -736,7 +736,7 @@ int vgpu_fifo_force_reset_ch(struct channel_gk20a *ch, | |||
736 | return err ? err : msg.ret; | 736 | return err ? err : msg.ret; |
737 | } | 737 | } |
738 | 738 | ||
739 | static void vgpu_fifo_set_ctx_mmu_error(struct gk20a *g, | 739 | static void vgpu_fifo_set_ctx_mmu_error_ch(struct gk20a *g, |
740 | struct channel_gk20a *ch) | 740 | struct channel_gk20a *ch) |
741 | { | 741 | { |
742 | nvgpu_mutex_acquire(&ch->error_notifier_mutex); | 742 | nvgpu_mutex_acquire(&ch->error_notifier_mutex); |
@@ -761,6 +761,30 @@ static void vgpu_fifo_set_ctx_mmu_error(struct gk20a *g, | |||
761 | nvgpu_cond_broadcast_interruptible(&ch->notifier_wq); | 761 | nvgpu_cond_broadcast_interruptible(&ch->notifier_wq); |
762 | } | 762 | } |
763 | 763 | ||
764 | static void vgpu_fifo_set_ctx_mmu_error_ch_tsg(struct gk20a *g, | ||
765 | struct channel_gk20a *ch) | ||
766 | { | ||
767 | struct tsg_gk20a *tsg = NULL; | ||
768 | struct channel_gk20a *ch_tsg = NULL; | ||
769 | |||
770 | if (gk20a_is_channel_marked_as_tsg(ch)) { | ||
771 | tsg = &g->fifo.tsg[ch->tsgid]; | ||
772 | |||
773 | nvgpu_rwsem_down_read(&tsg->ch_list_lock); | ||
774 | |||
775 | list_for_each_entry(ch_tsg, &tsg->ch_list, ch_entry) { | ||
776 | if (gk20a_channel_get(ch_tsg)) { | ||
777 | vgpu_fifo_set_ctx_mmu_error_ch(g, ch_tsg); | ||
778 | gk20a_channel_put(ch_tsg); | ||
779 | } | ||
780 | } | ||
781 | |||
782 | nvgpu_rwsem_up_read(&tsg->ch_list_lock); | ||
783 | } else { | ||
784 | vgpu_fifo_set_ctx_mmu_error_ch(g, ch); | ||
785 | } | ||
786 | } | ||
787 | |||
764 | int vgpu_fifo_isr(struct gk20a *g, struct tegra_vgpu_fifo_intr_info *info) | 788 | int vgpu_fifo_isr(struct gk20a *g, struct tegra_vgpu_fifo_intr_info *info) |
765 | { | 789 | { |
766 | struct fifo_gk20a *f = &g->fifo; | 790 | struct fifo_gk20a *f = &g->fifo; |
@@ -784,7 +808,7 @@ int vgpu_fifo_isr(struct gk20a *g, struct tegra_vgpu_fifo_intr_info *info) | |||
784 | NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT); | 808 | NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT); |
785 | break; | 809 | break; |
786 | case TEGRA_VGPU_FIFO_INTR_MMU_FAULT: | 810 | case TEGRA_VGPU_FIFO_INTR_MMU_FAULT: |
787 | vgpu_fifo_set_ctx_mmu_error(g, ch); | 811 | vgpu_fifo_set_ctx_mmu_error_ch_tsg(g, ch); |
788 | gk20a_channel_abort(ch, false); | 812 | gk20a_channel_abort(ch, false); |
789 | break; | 813 | break; |
790 | default: | 814 | default: |