From 220860d04383489a8e75684802a2ced1323831df Mon Sep 17 00:00:00 2001
From: Seema Khowala <seemaj@nvidia.com>
Date: Fri, 19 Oct 2018 12:08:46 -0700
Subject: gpu: nvgpu: rename has_timedout and make it thread safe

Currently has_timedout variable is protected by wmb at places
where it is being set and there is no correspoding rmb whenever
has_timedout variable is read. This is prone to errors for
concurrent execution. This change is supposed to fix this issue.
Rename has_timedout variable of channel struct to ch_timedout.
Also to avoid rmb every time ch_timedout is read,
ch_timedout_spinlock is added to protect ch_timedout
variable for taking care of concurrent execution.

Bug 2404865
Bug 2092051

Change-Id: I0bee9f50af0a48720aa8b54cbc3af97ef9f6df00
Signed-off-by: Seema Khowala <seemaj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1930935
Signed-off-by: Debarshi Dutta <ddutta@nvidia.com>
(cherry picked from commit 1f54ea09e3445d9ca3cf7a69b4967849cc9defc8
in dev-kernel)
Reviewed-on: https://git-master.nvidia.com/r/2016975
GVS: Gerrit_Virtual_Submit
Reviewed-by: Alex Waterman <alexw@nvidia.com>
Reviewed-by: Bibek Basu <bbasu@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
---
 drivers/gpu/nvgpu/common/fifo/channel.c    | 26 +++++++++++++++++++++++---
 drivers/gpu/nvgpu/common/fifo/submit.c     |  4 ++--
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.c       |  8 ++++----
 drivers/gpu/nvgpu/include/nvgpu/channel.h  |  6 +++++-
 drivers/gpu/nvgpu/os/linux/cde.c           |  5 +++--
 drivers/gpu/nvgpu/os/linux/ioctl_channel.c | 17 +++++++++++------
 drivers/gpu/nvgpu/os/linux/ioctl_tsg.c     |  2 +-
 drivers/gpu/nvgpu/vgpu/fifo_vgpu.c         |  8 ++++----
 drivers/gpu/nvgpu/vgpu/vgpu.c              |  2 +-
 9 files changed, 54 insertions(+), 24 deletions(-)

(limited to 'drivers/gpu/nvgpu')

diff --git a/drivers/gpu/nvgpu/common/fifo/channel.c b/drivers/gpu/nvgpu/common/fifo/channel.c
index 29720886..b5ae42d4 100644
--- a/drivers/gpu/nvgpu/common/fifo/channel.c
+++ b/drivers/gpu/nvgpu/common/fifo/channel.c
@@ -212,6 +212,24 @@ void gk20a_channel_abort_clean_up(struct channel_gk20a *ch)
 	gk20a_channel_update(ch);
 }
 
+void gk20a_channel_set_timedout(struct channel_gk20a *ch)
+{
+	nvgpu_spinlock_acquire(&ch->ch_timedout_lock);
+	ch->ch_timedout = true;
+	nvgpu_spinlock_release(&ch->ch_timedout_lock);
+}
+
+bool  gk20a_channel_check_timedout(struct channel_gk20a *ch)
+{
+	bool ch_timedout_status;
+
+	nvgpu_spinlock_acquire(&ch->ch_timedout_lock);
+	ch_timedout_status = ch->ch_timedout;
+	nvgpu_spinlock_release(&ch->ch_timedout_lock);
+
+	return ch_timedout_status;
+}
+
 void gk20a_channel_abort(struct channel_gk20a *ch, bool channel_preempt)
 {
 	struct tsg_gk20a *tsg = tsg_gk20a_from_ch(ch);
@@ -223,7 +241,7 @@ void gk20a_channel_abort(struct channel_gk20a *ch, bool channel_preempt)
 	}
 
 	/* make sure new kickoffs are prevented */
-	ch->has_timedout = true;
+	gk20a_channel_set_timedout(ch);
 
 	ch->g->ops.fifo.disable_channel(ch);
 
@@ -425,7 +443,7 @@ static void gk20a_free_channel(struct channel_gk20a *ch, bool force)
 		 * Set user managed syncpoint to safe state
 		 * But it's already done if channel has timedout
 		 */
-		if (ch->has_timedout) {
+		if (gk20a_channel_check_timedout(ch)) {
 			nvgpu_channel_sync_destroy(ch->user_sync, false);
 		} else {
 			nvgpu_channel_sync_destroy(ch->user_sync, true);
@@ -711,7 +729,7 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g,
 	/* set gr host default timeout */
 	ch->timeout_ms_max = gk20a_get_gr_idle_timeout(g);
 	ch->timeout_debug_dump = true;
-	ch->has_timedout = false;
+	ch->ch_timedout = false;
 
 	/* init kernel watchdog timeout */
 	ch->timeout.enabled = true;
@@ -2196,6 +2214,8 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid)
 	c->referenceable = false;
 	nvgpu_cond_init(&c->ref_count_dec_wq);
 
+	nvgpu_spinlock_init(&c->ch_timedout_lock);
+
 #if GK20A_CHANNEL_REFCOUNT_TRACKING
 	nvgpu_spinlock_init(&c->ref_actions_lock);
 #endif
diff --git a/drivers/gpu/nvgpu/common/fifo/submit.c b/drivers/gpu/nvgpu/common/fifo/submit.c
index 5e218fbc..599539cd 100644
--- a/drivers/gpu/nvgpu/common/fifo/submit.c
+++ b/drivers/gpu/nvgpu/common/fifo/submit.c
@@ -336,7 +336,7 @@ static int nvgpu_submit_channel_gpfifo(struct channel_gk20a *c,
 		return -ENODEV;
 	}
 
-	if (c->has_timedout) {
+	if (gk20a_channel_check_timedout(c)) {
 		return -ETIMEDOUT;
 	}
 
@@ -504,7 +504,7 @@ static int nvgpu_submit_channel_gpfifo(struct channel_gk20a *c,
 		}
 	}
 
-	if (c->has_timedout) {
+	if (gk20a_channel_check_timedout(c)) {
 		err = -ETIMEDOUT;
 		goto clean_up;
 	}
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index 06db0bb0..d4e386bd 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -1484,8 +1484,8 @@ static void gk20a_fifo_set_has_timedout_and_wake_up_wqs(struct gk20a *g,
 {
 	if (refch) {
 		/* mark channel as faulted */
-		refch->has_timedout = true;
-		nvgpu_smp_wmb();
+		gk20a_channel_set_timedout(refch);
+
 		/* unblock pending waits */
 		nvgpu_cond_broadcast_interruptible(&refch->semaphore_wq);
 		nvgpu_cond_broadcast_interruptible(&refch->notifier_wq);
@@ -1568,7 +1568,7 @@ void gk20a_fifo_abort_tsg(struct gk20a *g, struct tsg_gk20a *tsg, bool preempt)
 	nvgpu_rwsem_down_read(&tsg->ch_list_lock);
 	nvgpu_list_for_each_entry(ch, &tsg->ch_list, channel_gk20a, ch_entry) {
 		if (gk20a_channel_get(ch)) {
-			ch->has_timedout = true;
+			gk20a_channel_set_timedout(ch);
 			if (ch->g->ops.fifo.ch_abort_clean_up) {
 				ch->g->ops.fifo.ch_abort_clean_up(ch);
 			}
@@ -2181,7 +2181,7 @@ int gk20a_fifo_tsg_unbind_channel(struct channel_gk20a *ch)
 
 	/* If one channel in TSG times out, we disable all channels */
 	nvgpu_rwsem_down_write(&tsg->ch_list_lock);
-	tsg_timedout = ch->has_timedout;
+	tsg_timedout = gk20a_channel_check_timedout(ch);
 	nvgpu_rwsem_up_write(&tsg->ch_list_lock);
 
 	/* Disable TSG and examine status before unbinding channel */
diff --git a/drivers/gpu/nvgpu/include/nvgpu/channel.h b/drivers/gpu/nvgpu/include/nvgpu/channel.h
index 1851b9e2..0a956c66 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/channel.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/channel.h
@@ -298,6 +298,8 @@ struct channel_gk20a {
 
 	struct nvgpu_mem ctx_header;
 
+	struct nvgpu_spinlock ch_timedout_lock;
+	bool ch_timedout;
 	/* Any operating system specific data. */
 	void *os_priv;
 
@@ -313,7 +315,6 @@ struct channel_gk20a {
 	u32 runlist_id;
 
 	bool mmu_nack_handled;
-	bool has_timedout;
 	bool referenceable;
 	bool vpr;
 	bool deterministic;
@@ -464,4 +465,7 @@ static inline void trace_write_pushbuffers(struct channel_gk20a *c, u32 count)
 }
 #endif
 
+void gk20a_channel_set_timedout(struct channel_gk20a *ch);
+bool gk20a_channel_check_timedout(struct channel_gk20a *ch);
+
 #endif
diff --git a/drivers/gpu/nvgpu/os/linux/cde.c b/drivers/gpu/nvgpu/os/linux/cde.c
index a15ef560..7b2cba7d 100644
--- a/drivers/gpu/nvgpu/os/linux/cde.c
+++ b/drivers/gpu/nvgpu/os/linux/cde.c
@@ -1272,7 +1272,7 @@ __releases(&cde_app->mutex)
 		nvgpu_log_info(g, "double finish cde context %p on channel %p",
 				cde_ctx, ch);
 
-	if (ch->has_timedout) {
+	if (gk20a_channel_check_timedout(ch)) {
 		if (cde_ctx->is_temporary) {
 			nvgpu_warn(g,
 					"cde: channel had timed out"
@@ -1299,8 +1299,9 @@ __releases(&cde_app->mutex)
 			msecs_to_jiffies(CTX_DELETE_TIME));
 	}
 
-	if (!ch->has_timedout)
+	if (!gk20a_channel_check_timedout(ch)) {
 		gk20a_cde_ctx_release(cde_ctx);
+	}
 }
 
 static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx)
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_channel.c b/drivers/gpu/nvgpu/os/linux/ioctl_channel.c
index 45d49474..d0d4b1af 100644
--- a/drivers/gpu/nvgpu/os/linux/ioctl_channel.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_channel.c
@@ -636,8 +636,9 @@ static int gk20a_channel_wait_semaphore(struct channel_gk20a *ch,
 	int ret = 0;
 
 	/* do not wait if channel has timed out */
-	if (ch->has_timedout)
+	if (gk20a_channel_check_timedout(ch)) {
 		return -ETIMEDOUT;
+	}
 
 	dmabuf = dma_buf_get(id);
 	if (IS_ERR(dmabuf)) {
@@ -656,7 +657,8 @@ static int gk20a_channel_wait_semaphore(struct channel_gk20a *ch,
 
 	ret = NVGPU_COND_WAIT_INTERRUPTIBLE(
 			&ch->semaphore_wq,
-			*semaphore == payload || ch->has_timedout,
+			*semaphore == payload ||
+			gk20a_channel_check_timedout(ch),
 			timeout);
 
 	dma_buf_kunmap(dmabuf, offset >> PAGE_SHIFT, data);
@@ -680,8 +682,9 @@ static int gk20a_channel_wait(struct channel_gk20a *ch,
 
 	nvgpu_log_fn(g, " ");
 
-	if (ch->has_timedout)
+	if (gk20a_channel_check_timedout(ch)) {
 		return -ETIMEDOUT;
+	}
 
 	switch (args->type) {
 	case NVGPU_WAIT_TYPE_NOTIFIER:
@@ -716,7 +719,8 @@ static int gk20a_channel_wait(struct channel_gk20a *ch,
 		 * calling this ioctl */
 		remain = NVGPU_COND_WAIT_INTERRUPTIBLE(
 				&ch->notifier_wq,
-				notif->status == 0 || ch->has_timedout,
+				notif->status == 0 ||
+				gk20a_channel_check_timedout(ch),
 				args->timeout);
 
 		if (remain == 0 && notif->status != 0) {
@@ -786,8 +790,9 @@ static int gk20a_ioctl_channel_submit_gpfifo(
 	profile = gk20a_fifo_profile_acquire(ch->g);
 	gk20a_fifo_profile_snapshot(profile, PROFILE_IOCTL_ENTRY);
 
-	if (ch->has_timedout)
+	if (gk20a_channel_check_timedout(ch)) {
 		return -ETIMEDOUT;
+	}
 
 	nvgpu_get_fence_args(&args->fence, &fence);
 	submit_flags =
@@ -1249,7 +1254,7 @@ long gk20a_channel_ioctl(struct file *filp,
 	}
 	case NVGPU_IOCTL_CHANNEL_GET_TIMEDOUT:
 		((struct nvgpu_get_param_args *)buf)->value =
-			ch->has_timedout;
+			gk20a_channel_check_timedout(ch);
 		break;
 	case NVGPU_IOCTL_CHANNEL_ENABLE:
 		err = gk20a_busy(ch->g);
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_tsg.c b/drivers/gpu/nvgpu/os/linux/ioctl_tsg.c
index b0cdf5e5..c5079bd6 100644
--- a/drivers/gpu/nvgpu/os/linux/ioctl_tsg.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_tsg.c
@@ -141,7 +141,7 @@ static int gk20a_tsg_unbind_channel_fd(struct tsg_gk20a *tsg, int ch_fd)
 	 * Mark the channel timedout since channel unbound from TSG
 	 * has no context of its own so it can't serve any job
 	 */
-	ch->has_timedout = true;
+	gk20a_channel_set_timedout(ch);
 
 out:
 	gk20a_channel_put(ch);
diff --git a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c
index 234f6fd4..4055d5af 100644
--- a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c
@@ -651,7 +651,7 @@ int vgpu_fifo_force_reset_ch(struct channel_gk20a *ch,
 			if (gk20a_channel_get(ch_tsg)) {
 				g->ops.fifo.set_error_notifier(ch_tsg,
 								err_code);
-				ch_tsg->has_timedout = true;
+				gk20a_channel_set_timedout(ch_tsg);
 				gk20a_channel_put(ch_tsg);
 			}
 		}
@@ -659,7 +659,7 @@ int vgpu_fifo_force_reset_ch(struct channel_gk20a *ch,
 		nvgpu_rwsem_up_read(&tsg->ch_list_lock);
 	} else {
 		g->ops.fifo.set_error_notifier(ch, err_code);
-		ch->has_timedout = true;
+		gk20a_channel_set_timedout(ch);
 	}
 
 	msg.cmd = TEGRA_VGPU_CMD_CHANNEL_FORCE_RESET;
@@ -685,8 +685,8 @@ static void vgpu_fifo_set_ctx_mmu_error_ch(struct gk20a *g,
 		NVGPU_ERR_NOTIFIER_FIFO_ERROR_MMU_ERR_FLT);
 
 	/* mark channel as faulted */
-	ch->has_timedout = true;
-	nvgpu_smp_wmb();
+	gk20a_channel_set_timedout(ch);
+
 	/* unblock pending waits */
 	nvgpu_cond_broadcast_interruptible(&ch->semaphore_wq);
 	nvgpu_cond_broadcast_interruptible(&ch->notifier_wq);
diff --git a/drivers/gpu/nvgpu/vgpu/vgpu.c b/drivers/gpu/nvgpu/vgpu/vgpu.c
index 07361afe..c17a16df 100644
--- a/drivers/gpu/nvgpu/vgpu/vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/vgpu.c
@@ -126,7 +126,7 @@ static void vgpu_channel_abort_cleanup(struct gk20a *g, u32 chid)
 		return;
 	}
 
-	ch->has_timedout = true;
+	gk20a_channel_set_timedout(ch);
 	g->ops.fifo.ch_abort_clean_up(ch);
 	gk20a_channel_put(ch);
 }
-- 
cgit v1.2.2