author    Alex Waterman <alexw@nvidia.com>    2016-04-27 15:27:36 -0400
committer Terje Bergstrom <tbergstrom@nvidia.com>    2016-06-28 18:49:11 -0400
commit    dfd5ec53fcce4ebae27f78242e6b788350337095 (patch)
tree      073ea380b9ee4734391d381745f57600c3525be5 /drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
parent    b30990ea6db564e885d5aee7a1a5ea87a1e5e8ee (diff)
gpu: nvgpu: Revamp semaphore support
Revamp the nvgpu driver's semaphore support.

The original problem with nvgpu's semaphore support is that it required a SW based wait for every semaphore release. This was because for every fence that gk20a_channel_semaphore_wait_fd() waited on, a new semaphore was created. This semaphore would then get released by SW when the fence signaled. This meant that for every release there was necessarily a sync_fence_wait_async() call which could block. The latency of this SW wait was enough to cause massive degradation in performance.

To fix this a fast path was implemented. When a fence passed to gk20a_channel_semaphore_wait_fd() is backed by a GPU semaphore, a semaphore acquire is used directly to block the GPU. No sync_fence_wait_async() is performed, nor is an extra semaphore created.

To implement this fast path the semaphore memory had to be shared between channels. Previously, since a new semaphore was created every time through gk20a_channel_semaphore_wait_fd(), it was irrelevant which address space a semaphore was mapped into. However, when using the fast path a semaphore may be released in one address space but acquired in another.

Sharing the semaphore memory was done by making a fixed GPU mapping in all channels. This mapping points to the semaphore memory (the so-called semaphore sea). This global fixed mapping is read-only to make sure no semaphores can be incremented (i.e. released) by a malicious channel. Each channel then gets a RW mapping of its own semaphore. This way a channel may only acquire other channels' semaphores but may both acquire and release its own semaphore.

The gk20a fence code was updated to allow introspection of GPU backed fences. This allows detection of when the fast path can be taken. If the fast path cannot be used (for example when a fence is sync-pt backed), the original slow path is still present. This gets used when the GPU needs to wait on an event from something which only understands how to use sync-pts.

Bug 1732449
JIRA DNVGPU-12

Change-Id: Ic0fea74994da5819a771deac726bb0d47a33c2de
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: http://git-master/r/1133792
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
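The fast path described above boils down to a simple decision: if the incoming fence is already backed by a GPU semaphore, push a semaphore acquire against that semaphore's read-only mapping; otherwise fall back to the SW wait. The sketch below is a minimal, self-contained user-space model of that decision only, not driver code: the model_* types and wait_fd_fast_path() are hypothetical stand-ins for the sync_fence, gk20a_semaphore and __semaphore_wait_fd_fast_path() seen in the diff below.

/* Hypothetical user-space model of the fast-path decision; the real driver
 * path is __semaphore_wait_fd_fast_path() in the diff below. */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for a GPU semaphore living in the shared "semaphore sea". */
struct model_sema {
        uint32_t value;      /* current value; read-only mapping in all channels */
        uint32_t next_value; /* value the owning channel will release to */
};

/* Stand-in for a sync fence that may or may not be semaphore backed. */
struct model_fence {
        struct model_sema *sema; /* NULL when the fence is sync-pt backed */
};

/*
 * Fast path: if the fence is backed by a GPU semaphore, the waiter only needs
 * an acquire of value >= next_value on that semaphore (no SW wait, no extra
 * semaphore). Returns -ENODEV when the slow path must be used instead.
 */
static int wait_fd_fast_path(const struct model_fence *f, uint32_t *acq_payload)
{
        if (!f->sema)
                return -ENODEV; /* sync-pt backed: fall back to the SW wait */

        *acq_payload = f->sema->next_value; /* GPU blocks until value >= payload */
        return 0;
}

int main(void)
{
        struct model_sema s = { .value = 0, .next_value = 1 };
        struct model_fence gpu_backed = { .sema = &s };
        struct model_fence syncpt_backed = { .sema = NULL };
        uint32_t payload;

        if (wait_fd_fast_path(&gpu_backed, &payload) == 0)
                printf("fast path: GPU acquires semaphore >= %u\n", payload);
        if (wait_fd_fast_path(&syncpt_backed, &payload) == -ENODEV)
                printf("slow path: SW wait + separate semaphore release\n");
        return 0;
}

In the real driver the -ENODEV return routes gk20a_channel_semaphore_wait_fd() to the existing sync_fence_wait_async() slow path, as shown in the diff below.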
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c')
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c  233
1 file changed, 176 insertions(+), 57 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
index d2d8c094..9c8911e9 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
@@ -424,28 +424,52 @@ static void gk20a_channel_semaphore_launcher(
 }
 #endif
 
-static int add_sema_cmd(struct gk20a *g, struct priv_cmd_entry *cmd,
-                u64 sema, u32 payload, bool acquire, bool wfi)
+static void add_sema_cmd(struct gk20a *g, struct channel_gk20a *c,
+                struct gk20a_semaphore *s, struct priv_cmd_entry *cmd,
+                int cmd_size, bool acquire, bool wfi)
 {
         u32 off = cmd->off;
+        u64 va;
+
+        /*
+         * RO for acquire (since we just need to read the mem) and RW for
+         * release since we will need to write back to the semaphore memory.
+         */
+        va = acquire ? gk20a_semaphore_gpu_ro_va(s) :
+                       gk20a_semaphore_gpu_rw_va(s);
+
+        /*
+         * If the op is not an acquire (so therefor a release) we should
+         * incr the underlying sema next_value.
+         */
+        if (!acquire)
+                gk20a_semaphore_incr(s);
+
         /* semaphore_a */
         gk20a_mem_wr32(g, cmd->mem, off++, 0x20010004);
         /* offset_upper */
-        gk20a_mem_wr32(g, cmd->mem, off++, (sema >> 32) & 0xff);
+        gk20a_mem_wr32(g, cmd->mem, off++, (va >> 32) & 0xff);
         /* semaphore_b */
         gk20a_mem_wr32(g, cmd->mem, off++, 0x20010005);
         /* offset */
-        gk20a_mem_wr32(g, cmd->mem, off++, sema & 0xffffffff);
-        /* semaphore_c */
-        gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006);
-        /* payload */
-        gk20a_mem_wr32(g, cmd->mem, off++, payload);
+        gk20a_mem_wr32(g, cmd->mem, off++, va & 0xffffffff);
+
         if (acquire) {
+                /* semaphore_c */
+                gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006);
+                /* payload */
+                gk20a_mem_wr32(g, cmd->mem, off++,
+                               gk20a_semaphore_get_value(s));
                 /* semaphore_d */
                 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010007);
                 /* operation: acq_geq, switch_en */
                 gk20a_mem_wr32(g, cmd->mem, off++, 0x4 | (0x1 << 12));
         } else {
+                /* semaphore_c */
+                gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006);
+                /* payload */
+                gk20a_mem_wr32(g, cmd->mem, off++,
+                               gk20a_semaphore_get_value(s));
                 /* semaphore_d */
                 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010007);
                 /* operation: release, wfi */
@@ -456,7 +480,6 @@ static int add_sema_cmd(struct gk20a *g, struct priv_cmd_entry *cmd,
                 /* ignored */
                 gk20a_mem_wr32(g, cmd->mem, off++, 0);
         }
-        return off - cmd->off;
 }
 
 static int gk20a_channel_semaphore_wait_syncpt(
@@ -471,6 +494,76 @@ static int gk20a_channel_semaphore_wait_syncpt(
         return -ENODEV;
 }
 
+/*
+ * UGHHH - the sync_fence underlying implementation changes from 3.10 to 3.18.
+ * But since there's no API for getting the underlying sync_pts we have to do
+ * some conditional compilation.
+ */
+#ifdef CONFIG_SYNC
+static struct gk20a_semaphore *sema_from_sync_fence(struct sync_fence *f)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0)
+        struct sync_pt *pt;
+
+        pt = list_first_entry(&f->pt_list_head, struct sync_pt, pt_list);
+        return gk20a_sync_pt_inst_get_sema(pt);
+#else
+        return gk20a_sync_pt_inst_get_sema(f->cbs[0].sync_pt);
+#endif
+}
+
+/*
+ * Attempt a fast path for waiting on a sync_fence. Basically if the passed
+ * sync_fence is backed by a gk20a_semaphore then there's no reason to go
+ * through the rigmarole of setting up a separate semaphore which waits on an
+ * interrupt from the GPU and then triggers a worker thread to execute a SW
+ * based semaphore release. Instead just have the GPU wait on the same semaphore
+ * that is going to be incremented by the GPU.
+ *
+ * This function returns 2 possible values: -ENODEV or 0 on success. In the case
+ * of -ENODEV the fastpath cannot be taken due to the fence not being backed by
+ * a GPU semaphore.
+ */
+static int __semaphore_wait_fd_fast_path(struct channel_gk20a *c,
+                struct sync_fence *fence,
+                struct priv_cmd_entry **wait_cmd,
+                struct gk20a_semaphore **fp_sema)
+{
+        struct gk20a_semaphore *sema;
+        int err;
+
+        if (!gk20a_is_sema_backed_sync_fence(fence))
+                return -ENODEV;
+
+        sema = sema_from_sync_fence(fence);
+
+        /*
+         * If there's no underlying sema then that means the underlying sema has
+         * already signaled.
+         */
+        if (!sema) {
+                *fp_sema = NULL;
+                return 0;
+        }
+
+        err = gk20a_channel_alloc_priv_cmdbuf(c, 8, wait_cmd);
+        if (err)
+                return err;
+
+        gk20a_semaphore_get(sema);
+        BUG_ON(!atomic_read(&sema->value));
+        add_sema_cmd(c->g, c, sema, *wait_cmd, 8, true, false);
+
+        /*
+         * Make sure that gk20a_channel_semaphore_wait_fd() can create another
+         * fence with the underlying semaphore.
+         */
+        *fp_sema = sema;
+
+        return 0;
+}
+#endif
+
 static int gk20a_channel_semaphore_wait_fd(
                 struct gk20a_channel_sync *s, int fd,
                 struct priv_cmd_entry **entry,
@@ -480,69 +573,107 @@ static int gk20a_channel_semaphore_wait_fd(
                 container_of(s, struct gk20a_channel_semaphore, ops);
         struct channel_gk20a *c = sema->c;
 #ifdef CONFIG_SYNC
+        struct gk20a_semaphore *fp_sema;
         struct sync_fence *sync_fence;
         struct priv_cmd_entry *wait_cmd = NULL;
-        struct wait_fence_work *w;
-        int written;
-        int err, ret;
-        u64 va;
+        struct wait_fence_work *w = NULL;
+        int err, ret, status;
 
         sync_fence = gk20a_sync_fence_fdget(fd);
         if (!sync_fence)
                 return -EINVAL;
 
+        ret = __semaphore_wait_fd_fast_path(c, sync_fence, &wait_cmd, &fp_sema);
+        if (ret == 0) {
+                if (fp_sema)
+                        *fence = gk20a_fence_from_semaphore(sema->timeline,
+                                        fp_sema,
+                                        &c->semaphore_wq,
+                                        NULL, false);
+                else
+                        /*
+                         * Allocate an empty fence. It will instantly return
+                         * from gk20a_fence_wait().
+                         */
+                        *fence = gk20a_alloc_fence(NULL, NULL, false);
+
+                sync_fence_put(sync_fence);
+                goto skip_slow_path;
+        }
+
+        /* If the fence has signaled there is no reason to wait on it. */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0)
+        status = sync_fence->status;
+#else
+        status = atomic_read(&sync_fence->status);
+#endif
+        if (status) {
+                sync_fence_put(sync_fence);
+                goto skip_slow_path;
+        }
+
+        err = gk20a_channel_alloc_priv_cmdbuf(c, 8, &wait_cmd);
+        if (err) {
+                gk20a_err(dev_from_gk20a(c->g),
+                          "not enough priv cmd buffer space");
+                sync_fence_put(sync_fence);
+                return -ENOMEM;
+        }
+
         w = kzalloc(sizeof(*w), GFP_KERNEL);
         if (!w) {
                 err = -ENOMEM;
-                goto fail;
+                goto fail_free_cmdbuf;
         }
+
         sync_fence_waiter_init(&w->waiter, gk20a_channel_semaphore_launcher);
         w->ch = c;
-        w->sema = gk20a_semaphore_alloc(sema->pool);
+        w->sema = gk20a_semaphore_alloc(c);
         if (!w->sema) {
                 gk20a_err(dev_from_gk20a(c->g), "ran out of semaphores");
                 err = -ENOMEM;
-                goto fail;
+                goto fail_free_worker;
         }
 
         /* worker takes one reference */
         gk20a_semaphore_get(w->sema);
+        gk20a_semaphore_incr(w->sema);
 
-        err = gk20a_channel_alloc_priv_cmdbuf(c, 8, &wait_cmd);
-        if (err) {
-                gk20a_err(dev_from_gk20a(c->g),
-                        "not enough priv cmd buffer space");
-                goto fail;
-        }
-
-        va = gk20a_semaphore_gpu_va(w->sema, c->vm);
-        /* GPU unblocked when when the semaphore value becomes 1. */
-        written = add_sema_cmd(c->g, wait_cmd, va, 1, true, false);
+        /* GPU unblocked when the semaphore value increments. */
+        add_sema_cmd(c->g, c, w->sema, wait_cmd, 8, true, false);
 
-        WARN_ON(written != wait_cmd->size);
         ret = sync_fence_wait_async(sync_fence, &w->waiter);
 
         /*
          * If the sync_fence has already signaled then the above async_wait
          * will never trigger. This causes the semaphore release op to never
          * happen which, in turn, hangs the GPU. That's bad. So let's just
-         * do the semaphore_release right now.
+         * do the gk20a_semaphore_release() right now.
          */
-        if (ret == 1)
+        if (ret == 1) {
+                sync_fence_put(sync_fence);
                 gk20a_semaphore_release(w->sema);
+                gk20a_semaphore_put(w->sema);
+        }
 
         /* XXX - this fixes an actual bug, we need to hold a ref to this
            semaphore while the job is in flight. */
         *fence = gk20a_fence_from_semaphore(sema->timeline, w->sema,
                                            &c->semaphore_wq,
                                            NULL, false);
+
+skip_slow_path:
         *entry = wait_cmd;
         return 0;
-fail:
+
+fail_free_worker:
         if (w && w->sema)
                 gk20a_semaphore_put(w->sema);
         kfree(w);
         sync_fence_put(sync_fence);
+fail_free_cmdbuf:
+        if (wait_cmd)
+                gk20a_free_priv_cmdbuf(c, wait_cmd);
         return err;
 #else
         gk20a_err(dev_from_gk20a(c->g),
@@ -558,9 +689,7 @@ static int __gk20a_channel_semaphore_incr(
                 struct gk20a_fence **fence,
                 bool need_sync_fence)
 {
-        u64 va;
         int incr_cmd_size;
-        int written;
         struct priv_cmd_entry *incr_cmd = NULL;
         struct gk20a_channel_semaphore *sp =
                 container_of(s, struct gk20a_channel_semaphore, ops);
@@ -568,7 +697,7 @@ static int __gk20a_channel_semaphore_incr(
         struct gk20a_semaphore *semaphore;
         int err = 0;
 
-        semaphore = gk20a_semaphore_alloc(sp->pool);
+        semaphore = gk20a_semaphore_alloc(c);
         if (!semaphore) {
                 gk20a_err(dev_from_gk20a(c->g),
                         "ran out of semaphores");
@@ -585,9 +714,7 @@ static int __gk20a_channel_semaphore_incr(
         }
 
         /* Release the completion semaphore. */
-        va = gk20a_semaphore_gpu_va(semaphore, c->vm);
-        written = add_sema_cmd(c->g, incr_cmd, va, 1, false, wfi_cmd);
-        WARN_ON(written != incr_cmd_size);
+        add_sema_cmd(c->g, c, semaphore, incr_cmd, 14, false, wfi_cmd);
 
         *fence = gk20a_fence_from_semaphore(sp->timeline, semaphore,
                                         &c->semaphore_wq,
@@ -615,8 +742,10 @@ static int gk20a_channel_semaphore_incr(
 {
         /* Don't put wfi cmd to this one since we're not returning
          * a fence to user space. */
-        return __gk20a_channel_semaphore_incr(s, false /* no wfi */,
-                        NULL, entry, fence, need_sync_fence);
+        return __gk20a_channel_semaphore_incr(s,
+                        false /* no wfi */,
+                        NULL,
+                        entry, fence, need_sync_fence);
 }
 
 static int gk20a_channel_semaphore_incr_user(
@@ -679,17 +808,16 @@ static void gk20a_channel_semaphore_destroy(struct gk20a_channel_sync *s)
                 container_of(s, struct gk20a_channel_semaphore, ops);
         if (sema->timeline)
                 gk20a_sync_timeline_destroy(sema->timeline);
-        if (sema->pool) {
-                gk20a_semaphore_pool_unmap(sema->pool, sema->c->vm);
-                gk20a_semaphore_pool_put(sema->pool);
-        }
+
+        /* The sema pool is cleaned up by the VM destroy. */
+        sema->pool = NULL;
+
         kfree(sema);
 }
 
 static struct gk20a_channel_sync *
 gk20a_channel_semaphore_create(struct channel_gk20a *c)
 {
-        int err;
         int asid = -1;
         struct gk20a_channel_semaphore *sema;
         char pool_name[20];
@@ -706,21 +834,15 @@ gk20a_channel_semaphore_create(struct channel_gk20a *c)
                 asid = c->vm->as_share->id;
 
         sprintf(pool_name, "semaphore_pool-%d", c->hw_chid);
-        sema->pool = gk20a_semaphore_pool_alloc(c->g, pool_name, 1024);
-        if (!sema->pool)
-                goto clean_up;
-
-        /* Map the semaphore pool to the channel vm. Map as read-write to the
-         * owner channel (all other channels should map as read only!). */
-        err = gk20a_semaphore_pool_map(sema->pool, c->vm, gk20a_mem_flag_none);
-        if (err)
-                goto clean_up;
+        sema->pool = c->vm->sema_pool;
 
 #ifdef CONFIG_SYNC
         sema->timeline = gk20a_sync_timeline_create(
                         "gk20a_ch%d_as%d", c->hw_chid, asid);
-        if (!sema->timeline)
-                goto clean_up;
+        if (!sema->timeline) {
+                gk20a_channel_semaphore_destroy(&sema->ops);
+                return NULL;
+        }
 #endif
         atomic_set(&sema->ops.refcount, 0);
         sema->ops.wait_syncpt = gk20a_channel_semaphore_wait_syncpt;
@@ -734,9 +856,6 @@ gk20a_channel_semaphore_create(struct channel_gk20a *c)
         sema->ops.destroy = gk20a_channel_semaphore_destroy;
 
         return &sema->ops;
-clean_up:
-        gk20a_channel_semaphore_destroy(&sema->ops);
-        return NULL;
 }
 
 void gk20a_channel_sync_destroy(struct gk20a_channel_sync *sync)