From 34323b559590ed8f1c64ecbb7ffbd838a6478594 Mon Sep 17 00:00:00 2001
From: Konsta Holtta
Date: Tue, 13 Mar 2018 16:58:01 +0200
Subject: gpu: nvgpu: wait for all prefence semas on gpu

The pre-fence wait for semaphores in the submit path has supported a
fast path for fences that have only one underlying semaphore. The fast
path just inserts the wait on this sema directly into the pushbuffer.
For other fences, the path has been using a CPU wait indirection,
signaling another semaphore when we get the CPU-side callback.

Instead of only supporting prefences with one sema, unroll all the
individual semaphores and insert a wait for each into the pushbuffer,
like we've already been doing with syncpoints. Now all sema-backed
syncs get the fast path.

This simplifies the logic and makes it more explicit that only foreign
fences need the CPU wait.

There is no need to hold references to the sync fence or the semas
inside it: this submitted job only needs the global read-only sema
mapping that is guaranteed to stay alive while the VM of this channel
stays alive, and the job does not outlive this channel.

Jira NVGPU-43
Jira NVGPU-66
Jira NVGPU-513

Change-Id: I7cfbb510001d998a864aed8d6afd1582b9adb80d
Signed-off-by: Konsta Holtta
Reviewed-on: https://git-master.nvidia.com/r/1636345
Reviewed-by: Alex Waterman
Reviewed-by: Terje Bergstrom
Reviewed-by: mobile promotions
Tested-by: mobile promotions
---
 drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c | 175 +++++++++++++--------------
 drivers/gpu/nvgpu/gk20a/sync_gk20a.c         |  90 ++++++--------
 drivers/gpu/nvgpu/gk20a/sync_gk20a.h         |   6 +-
 3 files changed, 123 insertions(+), 148 deletions(-)

diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
index 4b1be8b9..c6b55bf8 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
@@ -502,10 +502,10 @@ static void gk20a_channel_semaphore_launcher(
 
 static void add_sema_cmd(struct gk20a *g, struct channel_gk20a *c,
 			 struct nvgpu_semaphore *s, struct priv_cmd_entry *cmd,
-			 int cmd_size, bool acquire, bool wfi)
+			 u32 offset, bool acquire, bool wfi)
 {
 	int ch = c->chid;
-	u32 ob, off = cmd->off;
+	u32 ob, off = cmd->off + offset;
 	u64 va;
 
 	ob = off;
@@ -588,108 +588,79 @@ static int gk20a_channel_semaphore_wait_syncpt(
 }
 
 #ifdef CONFIG_SYNC
-/*
- * Attempt a fast path for waiting on a sync_fence. Basically if the passed
- * sync_fence is backed by a nvgpu_semaphore then there's no reason to go
- * through the rigmarole of setting up a separate semaphore which waits on an
- * interrupt from the GPU and then triggers a worker thread to execute a SW
- * based semaphore release. Instead just have the GPU wait on the same semaphore
- * that is going to be incremented by the GPU.
- *
- * This function returns 2 possible values: -ENODEV or 0 on success. In the case
- * of -ENODEV the fastpath cannot be taken due to the fence not being backed by
- * a GPU semaphore.
- */
-static int __semaphore_wait_fd_fast_path(struct channel_gk20a *c,
-					 struct sync_fence *fence,
-					 struct priv_cmd_entry *wait_cmd,
-					 struct nvgpu_semaphore **fp_sema)
+static int semaphore_wait_fd_native(struct channel_gk20a *c, int fd,
+				    struct priv_cmd_entry *wait_cmd)
 {
-	struct nvgpu_semaphore *sema;
+	struct sync_fence *sync_fence;
 	int err;
+	const int wait_cmd_size = 8;
+	int num_wait_cmds;
+	int i;
 
-	if (!gk20a_is_sema_backed_sync_fence(fence))
-		return -ENODEV;
-
-	sema = gk20a_sync_fence_get_sema(fence);
+	sync_fence = gk20a_sync_fence_fdget(fd);
+	if (!sync_fence)
+		return -EINVAL;
 
-	/*
-	 * If there's no underlying sema then that means the underlying sema has
-	 * already signaled.
-	 */
-	if (!sema) {
-		*fp_sema = NULL;
-		return 0;
+	num_wait_cmds = sync_fence->num_fences;
+	if (num_wait_cmds == 0) {
+		err = 0;
+		goto put_fence;
 	}
 
-	err = gk20a_channel_alloc_priv_cmdbuf(c, 8, wait_cmd);
-	if (err)
-		return err;
+	err = gk20a_channel_alloc_priv_cmdbuf(c,
+			wait_cmd_size * num_wait_cmds,
+			wait_cmd);
+	if (err) {
+		nvgpu_err(c->g, "not enough priv cmd buffer space");
+		goto put_fence;
+	}
 
-	nvgpu_semaphore_get(sema);
-	BUG_ON(!sema->incremented);
-	add_sema_cmd(c->g, c, sema, wait_cmd, 8, true, false);
+	for (i = 0; i < sync_fence->num_fences; i++) {
+		struct fence *f = sync_fence->cbs[i].sync_pt;
+		struct sync_pt *pt = sync_pt_from_fence(f);
+		struct nvgpu_semaphore *sema;
 
-	/*
-	 * Make sure that gk20a_channel_semaphore_wait_fd() can create another
-	 * fence with the underlying semaphore.
-	 */
-	*fp_sema = sema;
+		sema = gk20a_sync_pt_sema(pt);
+		if (!sema) {
+			/* expired */
+			nvgpu_memset(c->g, wait_cmd->mem,
+				(wait_cmd->off + i * wait_cmd_size) * sizeof(u32),
+				0, wait_cmd_size * sizeof(u32));
+		} else {
+			WARN_ON(!sema->incremented);
+			add_sema_cmd(c->g, c, sema, wait_cmd,
+					i * wait_cmd_size, true, false);
+			nvgpu_semaphore_put(sema);
+		}
+	}
 
-	return 0;
+put_fence:
+	sync_fence_put(sync_fence);
+	return err;
 }
-#endif
 
-static int gk20a_channel_semaphore_wait_fd(
-		struct gk20a_channel_sync *s, int fd,
-		struct priv_cmd_entry *entry,
-		struct gk20a_fence *fence)
+static int semaphore_wait_fd_proxy(struct channel_gk20a *c, int fd,
+		struct priv_cmd_entry *wait_cmd,
+		struct gk20a_fence *fence_out,
+		struct sync_timeline *timeline)
 {
-	struct gk20a_channel_semaphore *sema =
-		container_of(s, struct gk20a_channel_semaphore, ops);
-	struct channel_gk20a *c = sema->c;
-#ifdef CONFIG_SYNC
-	struct nvgpu_semaphore *fp_sema;
+	const int wait_cmd_size = 8;
 	struct sync_fence *sync_fence;
-	struct priv_cmd_entry *wait_cmd = entry;
 	struct wait_fence_work *w = NULL;
-	int err, ret, status;
+	int err, status;
 
-	sync_fence = gk20a_sync_fence_fdget(fd);
+	sync_fence = sync_fence_fdget(fd);
 	if (!sync_fence)
 		return -EINVAL;
 
-	ret = __semaphore_wait_fd_fast_path(c, sync_fence, wait_cmd, &fp_sema);
-	if (ret == 0) {
-		if (fp_sema) {
-			err = gk20a_fence_from_semaphore(c->g, fence,
-					sema->timeline,
-					fp_sema,
-					&c->semaphore_wq,
-					false);
-			if (err) {
-				nvgpu_semaphore_put(fp_sema);
-				goto clean_up_priv_cmd;
-			}
-		} else
-			/*
-			 * Init an empty fence. It will instantly return
-			 * from gk20a_fence_wait().
-			 */
-			gk20a_init_fence(fence, NULL, NULL);
-
-		sync_fence_put(sync_fence);
-		goto skip_slow_path;
-	}
-
 	/* If the fence has signaled there is no reason to wait on it.
 	 */
 	status = atomic_read(&sync_fence->status);
 	if (status == 0) {
 		sync_fence_put(sync_fence);
-		goto skip_slow_path;
+		return 0;
 	}
 
-	err = gk20a_channel_alloc_priv_cmdbuf(c, 8, wait_cmd);
+	err = gk20a_channel_alloc_priv_cmdbuf(c, wait_cmd_size, wait_cmd);
 	if (err) {
 		nvgpu_err(c->g,
 				"not enough priv cmd buffer space");
@@ -718,34 +689,34 @@ static int gk20a_channel_semaphore_wait_fd(
 	nvgpu_semaphore_incr(w->sema, c->hw_sema);
 
 	/* GPU unblocked when the semaphore value increments. */
-	add_sema_cmd(c->g, c, w->sema, wait_cmd, 8, true, false);
+	add_sema_cmd(c->g, c, w->sema, wait_cmd, 0, true, false);
 
 	/*
 	 * We need to create the fence before adding the waiter to ensure
 	 * that we properly clean up in the event the sync_fence has
 	 * already signaled
 	 */
-	err = gk20a_fence_from_semaphore(c->g, fence, sema->timeline, w->sema,
-					 &c->semaphore_wq, false);
+	err = gk20a_fence_from_semaphore(c->g, fence_out, timeline,
+			w->sema, &c->semaphore_wq, false);
 	if (err)
 		goto clean_up_sema;
 
-	ret = sync_fence_wait_async(sync_fence, &w->waiter);
+	err = sync_fence_wait_async(sync_fence, &w->waiter);
 	gk20a_add_pending_sema_wait(c->g, w);
 
-	/*
-	 * If the sync_fence has already signaled then the above async_wait
-	 * will never trigger. This causes the semaphore release op to never
-	 * happen which, in turn, hangs the GPU. That's bad. So let's just
-	 * do the nvgpu_semaphore_release() right now.
+	/*
+	 * If the sync_fence has already signaled then the above wait_async
+	 * will not get scheduled; the fence completed just after doing the
+	 * status check above before allocs and waiter init, and won the race.
+	 * This causes the waiter to be skipped, so let's release the semaphore
+	 * here and put the refs taken for the worker.
 	 */
-	if (ret == 1) {
+	if (err == 1) {
 		sync_fence_put(sync_fence);
 		nvgpu_semaphore_release(w->sema, c->hw_sema);
 		nvgpu_semaphore_put(w->sema);
 	}
 
-skip_slow_path:
 	return 0;
 
 clean_up_sema:
@@ -758,10 +729,28 @@ clean_up_sema:
 clean_up_worker:
 	nvgpu_kfree(c->g, w);
 clean_up_priv_cmd:
-	gk20a_free_priv_cmdbuf(c, entry);
+	gk20a_free_priv_cmdbuf(c, wait_cmd);
 clean_up_sync_fence:
 	sync_fence_put(sync_fence);
 	return err;
+}
+#endif
+
+static int gk20a_channel_semaphore_wait_fd(
+		struct gk20a_channel_sync *s, int fd,
+		struct priv_cmd_entry *entry,
+		struct gk20a_fence *fence)
+{
+	struct gk20a_channel_semaphore *sema =
+		container_of(s, struct gk20a_channel_semaphore, ops);
+	struct channel_gk20a *c = sema->c;
+#ifdef CONFIG_SYNC
+	int err;
+
+	err = semaphore_wait_fd_native(c, fd, entry);
+	if (err)
+		err = semaphore_wait_fd_proxy(c, fd, entry, fence, sema->timeline);
+	return err;
 #else
 	nvgpu_err(c->g,
 		  "trying to use sync fds with CONFIG_SYNC disabled");
@@ -798,7 +787,7 @@ static int __gk20a_channel_semaphore_incr(
 	}
 
 	/* Release the completion semaphore. */
-	add_sema_cmd(c->g, c, semaphore, incr_cmd, 14, false, wfi_cmd);
+	add_sema_cmd(c->g, c, semaphore, incr_cmd, 0, false, wfi_cmd);
 
 	err = gk20a_fence_from_semaphore(c->g, fence,
 					 sp->timeline, semaphore,
diff --git a/drivers/gpu/nvgpu/gk20a/sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/sync_gk20a.c
index f6d16b90..a8600bce 100644
--- a/drivers/gpu/nvgpu/gk20a/sync_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/sync_gk20a.c
@@ -1,7 +1,7 @@
 /*
  * GK20A Sync Framework Integration
  *
- * Copyright (c) 2014-2017, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2014-2018, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -69,55 +69,6 @@ struct gk20a_sync_pt_inst {
 	struct gk20a_sync_pt *shared;
 };
 
-/**
- * Check if the passed sync_fence is backed by a single GPU semaphore. In such
- * cases we can short circuit a lot of SW involved in signaling pre-fences and
- * post fences.
- *
- * For now reject multi-sync_pt fences. This could be changed in future. It
- * would require that the sema fast path push a sema acquire for each semaphore
- * in the fence.
- */
-int gk20a_is_sema_backed_sync_fence(struct sync_fence *fence)
-{
-	struct sync_timeline *t;
-
-	struct fence *pt = fence->cbs[0].sync_pt;
-	struct sync_pt *spt = sync_pt_from_fence(pt);
-
-	if (fence->num_fences != 1)
-		return 0;
-
-	if (spt == NULL)
-		return 0;
-
-	t = sync_pt_parent(spt);
-
-	if (t->ops == &gk20a_sync_timeline_ops)
-		return 1;
-	return 0;
-}
-
-struct nvgpu_semaphore *gk20a_sync_fence_get_sema(struct sync_fence *f)
-{
-	struct sync_pt *spt;
-	struct gk20a_sync_pt_inst *pti;
-
-	struct fence *pt;
-
-	if (!f)
-		return NULL;
-
-	if (!gk20a_is_sema_backed_sync_fence(f))
-		return NULL;
-
-	pt = f->cbs[0].sync_pt;
-	spt = sync_pt_from_fence(pt);
-	pti = container_of(spt, struct gk20a_sync_pt_inst, pt);
-
-	return pti->shared->sema;
-}
-
 /**
  * Compares sync pt values a and b, both of which will trigger either before
  * or after ref (i.e. a and b trigger before ref, or a and b trigger after
@@ -371,7 +322,44 @@ static const struct sync_timeline_ops gk20a_sync_timeline_ops = {
 
 struct sync_fence *gk20a_sync_fence_fdget(int fd)
 {
-	return sync_fence_fdget(fd);
+	struct sync_fence *fence = sync_fence_fdget(fd);
+	int i;
+
+	if (!fence)
+		return NULL;
+
+	for (i = 0; i < fence->num_fences; i++) {
+		struct fence *pt = fence->cbs[i].sync_pt;
+		struct sync_pt *spt = sync_pt_from_fence(pt);
+		struct sync_timeline *t;
+
+		if (spt == NULL) {
+			sync_fence_put(fence);
+			return NULL;
+		}
+
+		t = sync_pt_parent(spt);
+		if (t->ops != &gk20a_sync_timeline_ops) {
+			sync_fence_put(fence);
+			return NULL;
+		}
+	}
+
+	return fence;
+}
+
+struct nvgpu_semaphore *gk20a_sync_pt_sema(struct sync_pt *spt)
+{
+	struct gk20a_sync_pt *pt = to_gk20a_sync_pt(spt);
+	struct nvgpu_semaphore *sema;
+
+	nvgpu_spinlock_acquire(&pt->lock);
+	sema = pt->sema;
+	if (sema)
+		nvgpu_semaphore_get(sema);
+	nvgpu_spinlock_release(&pt->lock);
+
+	return sema;
 }
 
 void gk20a_sync_timeline_signal(struct sync_timeline *timeline)
diff --git a/drivers/gpu/nvgpu/gk20a/sync_gk20a.h b/drivers/gpu/nvgpu/gk20a/sync_gk20a.h
index 7d7aff6d..8a6439ab 100644
--- a/drivers/gpu/nvgpu/gk20a/sync_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/sync_gk20a.h
@@ -3,7 +3,7 @@
  *
  * GK20A Sync Framework Integration
  *
- * Copyright (c) 2014-2017, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2014-2018, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -33,9 +33,6 @@ struct sync_pt;
 struct nvgpu_semaphore;
 struct fence;
 
-int gk20a_is_sema_backed_sync_fence(struct sync_fence *fence);
-struct nvgpu_semaphore *gk20a_sync_fence_get_sema(struct sync_fence *f);
-
 #ifdef CONFIG_SYNC
 struct sync_timeline *gk20a_sync_timeline_create(const char *fmt, ...);
 void gk20a_sync_timeline_destroy(struct sync_timeline *);
@@ -46,6 +43,7 @@ struct sync_fence *gk20a_sync_fence_create(
 		struct nvgpu_semaphore *,
 		const char *fmt, ...);
 struct sync_fence *gk20a_sync_fence_fdget(int fd);
+struct nvgpu_semaphore *gk20a_sync_pt_sema(struct sync_pt *spt);
 #else
 static inline void gk20a_sync_timeline_destroy(struct sync_timeline *obj) {}
 static inline void gk20a_sync_timeline_signal(struct sync_timeline *obj) {}
-- 
cgit v1.2.2
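
[ Editor's note: the sketch below is illustrative and not part of the
  commit. It models, in standalone userspace C, the scheme the commit
  message describes: instead of one single-sema fast path or a
  CPU-proxied wait, a fixed-size wait command is emitted per underlying
  fence point, and slots for already-expired points are zeroed into GPU
  no-ops, mirroring what semaphore_wait_fd_native() does with
  nvgpu_memset(). All names, types and the command encoding here are
  invented for illustration; none of this is the nvgpu API. ]

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define WAIT_CMD_SIZE 8		/* words per wait slot, as in the patch */

/* Toy stand-in for one underlying fence point of a sync_fence. */
struct toy_sema {
	uint64_t gpu_va;	/* address the GPU would poll */
	uint32_t threshold;	/* value that satisfies the wait */
	int expired;		/* already signaled, no wait needed */
};

/*
 * Emit one wait command per sema into a preallocated, fixed-stride
 * slice of a command buffer. Expired points get their slot zeroed so
 * the stream stays densely packed without any compaction pass.
 */
static void emit_wait_cmds(uint32_t *pushbuf, const struct toy_sema *semas,
			   int num)
{
	int i;

	for (i = 0; i < num; i++) {
		uint32_t *slot = &pushbuf[i * WAIT_CMD_SIZE];

		if (semas[i].expired) {
			memset(slot, 0, WAIT_CMD_SIZE * sizeof(uint32_t));
			continue;
		}
		/* Invented encoding: opcode, address hi/lo, threshold. */
		slot[0] = 0x1001;	/* hypothetical ACQUIRE opcode */
		slot[1] = (uint32_t)(semas[i].gpu_va >> 32);
		slot[2] = (uint32_t)semas[i].gpu_va;
		slot[3] = semas[i].threshold;
		memset(&slot[4], 0, 4 * sizeof(uint32_t)); /* pad slot */
	}
}

int main(void)
{
	struct toy_sema semas[] = {
		{ 0x100000000ull, 7, 0 },
		{ 0x100001000ull, 3, 1 },	/* expired: becomes no-ops */
		{ 0x100002000ull, 9, 0 },
	};
	int num = (int)(sizeof(semas) / sizeof(semas[0]));
	uint32_t pushbuf[3 * WAIT_CMD_SIZE];
	int i;

	emit_wait_cmds(pushbuf, semas, num);

	for (i = 0; i < num; i++)
		printf("slot %d: opcode 0x%x threshold %u\n", i,
		       (unsigned)pushbuf[i * WAIT_CMD_SIZE],
		       (unsigned)pushbuf[i * WAIT_CMD_SIZE + 3]);
	return 0;
}

The fixed stride is the point: since every wait occupies the same number
of words, an expired point can be neutralized in place by zeroing its
slot, which is why the patch turns the per-call cmd_size argument of
add_sema_cmd() into an offset into one shared priv cmd buffer.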