1 files changed, 176 insertions, 57 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
index d2d8c094..9c8911e9 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
@@ -424,28 +424,52 @@ static void gk20a_channel_semaphore_launcher(
 }
 #endif
-static int add_sema_cmd(struct gk20a *g, struct priv_cmd_entry *cmd,
+static void add_sema_cmd(struct gk20a *g, struct channel_gk20a *c,
-                u64 sema, u32 payload, bool acquire, bool wfi)
+                         struct gk20a_semaphore *s, struct priv_cmd_entry *cmd,
+                         int cmd_size, bool acquire, bool wfi)
 {
        u32 off = cmd->off;
+        u64 va;
+        /*
+         * RO for acquire (since we just need to read the mem) and RW for
+         * release since we will need to write back to the semaphore memory.
+         */
+        va = acquire ? gk20a_semaphore_gpu_ro_va(s) :
+                       gk20a_semaphore_gpu_rw_va(s);
+        /*
+         * If the op is not an acquire (and is therefore a release) we should
+         * incr the underlying sema next_value.
+         */
+        if (!acquire)
+                gk20a_semaphore_incr(s);
        /* semaphore_a */
        gk20a_mem_wr32(g, cmd->mem, off++, 0x20010004);
        /* offset_upper */
-        gk20a_mem_wr32(g, cmd->mem, off++, (sema >> 32) & 0xff);
+        gk20a_mem_wr32(g, cmd->mem, off++, (va >> 32) & 0xff);
        /* semaphore_b */
        gk20a_mem_wr32(g, cmd->mem, off++, 0x20010005);
        /* offset */
-        gk20a_mem_wr32(g, cmd->mem, off++, sema & 0xffffffff);
+        gk20a_mem_wr32(g, cmd->mem, off++, va & 0xffffffff);
-        /* semaphore_c */
-        gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006);
-        /* payload */
-        gk20a_mem_wr32(g, cmd->mem, off++, payload);
        if (acquire) {
+                /* semaphore_c */
+                gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006);
+                /* payload */
+                gk20a_mem_wr32(g, cmd->mem, off++,
+                               gk20a_semaphore_get_value(s));
                /* semaphore_d */
                gk20a_mem_wr32(g, cmd->mem, off++, 0x20010007);
                /* operation: acq_geq, switch_en */
                gk20a_mem_wr32(g, cmd->mem, off++, 0x4 | (0x1 << 12));
        } else {
+                /* semaphore_c */
+                gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006);
+                /* payload */
+                gk20a_mem_wr32(g, cmd->mem, off++,
+                               gk20a_semaphore_get_value(s));
                /* semaphore_d */
                gk20a_mem_wr32(g, cmd->mem, off++, 0x20010007);
                /* operation: release, wfi */
@@ -456,7 +480,6 @@ static int add_sema_cmd(struct gk20a *g, struct priv_cmd_entry *cmd,
                /* ignored */
                gk20a_mem_wr32(g, cmd->mem, off++, 0);
        }
-        return off - cmd->off;
 }
 static int gk20a_channel_semaphore_wait_syncpt(
@@ -471,6 +494,76 @@ static int gk20a_channel_semaphore_wait_syncpt(
        return -ENODEV;
 }
+/*
+ * UGHHH - the sync_fence underlying implementation changes from 3.10 to 3.18.
+ * But since there's no API for getting the underlying sync_pts we have to do
+ * some conditional compilation.
+ */
+#ifdef CONFIG_SYNC
+static struct gk20a_semaphore *sema_from_sync_fence(struct sync_fence *f)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0)
+        struct sync_pt *pt;
+        pt = list_first_entry(&f->pt_list_head, struct sync_pt, pt_list);
+        return gk20a_sync_pt_inst_get_sema(pt);
+#else
+        return gk20a_sync_pt_inst_get_sema(f->cbs[0].sync_pt);
+#endif
+}
+/*
+ * Attempt a fast path for waiting on a sync_fence. Basically if the passed
+ * sync_fence is backed by a gk20a_semaphore then there's no reason to go
+ * through the rigmarole of setting up a separate semaphore which waits on an
+ * interrupt from the GPU and then triggers a worker thread to execute a SW
+ * based semaphore release. Instead just have the GPU wait on the same semaphore
+ * that is going to be incremented by the GPU.
+ *
+ * Returns 0 on success, -ENODEV if the fast path cannot be taken because the
+ * fence is not backed by a GPU semaphore, or the error code returned by
+ * gk20a_channel_alloc_priv_cmdbuf() if the wait command cannot be allocated.
+ */
+static int __semaphore_wait_fd_fast_path(struct channel_gk20a *c,
+                                         struct sync_fence *fence,
+                                         struct priv_cmd_entry **wait_cmd,
+                                         struct gk20a_semaphore **fp_sema)
+{
+        struct gk20a_semaphore *sema;
+        int err;
+        if (!gk20a_is_sema_backed_sync_fence(fence))
+                return -ENODEV;
+        sema = sema_from_sync_fence(fence);
+        /*
+         * If there's no underlying sema then that means the underlying sema has
+         * already signaled.
+         */
+        if (!sema) {
+                *fp_sema = NULL;
+                return 0;
+        }
+        err = gk20a_channel_alloc_priv_cmdbuf(c, 8, wait_cmd);
+        if (err)
+                return err;
+        gk20a_semaphore_get(sema);
+        BUG_ON(!atomic_read(&sema->value));
+        add_sema_cmd(c->g, c, sema, *wait_cmd, 8, true, false);
+        /*
+         * Make sure that gk20a_channel_semaphore_wait_fd() can create another
+         * fence with the underlying semaphore.
+         */
+        *fp_sema = sema;
+        return 0;
+}
+#endif
 static int gk20a_channel_semaphore_wait_fd(
                struct gk20a_channel_sync *s, int fd,
                struct priv_cmd_entry **entry,
@@ -480,69 +573,107 @@ static int gk20a_channel_semaphore_wait_fd(
                container_of(s, struct gk20a_channel_semaphore, ops);
        struct channel_gk20a *c = sema->c;
 #ifdef CONFIG_SYNC
+        struct gk20a_semaphore *fp_sema;
        struct sync_fence *sync_fence;
        struct priv_cmd_entry *wait_cmd = NULL;
-        struct wait_fence_work *w;
+        struct wait_fence_work *w = NULL;
-        int written;
+        int err, ret, status;
-        int err, ret;
-        u64 va;
        sync_fence = gk20a_sync_fence_fdget(fd);
        if (!sync_fence)
                return -EINVAL;
+        ret = __semaphore_wait_fd_fast_path(c, sync_fence, &wait_cmd, &fp_sema);
+        if (ret == 0) {
+                if (fp_sema)
+                        *fence = gk20a_fence_from_semaphore(sema->timeline,
+                                                            fp_sema,
+                                                            &c->semaphore_wq,
+                                                            NULL, false);
+                else
+                        /*
+                         * Allocate an empty fence. It will instantly return
+                         * from gk20a_fence_wait().
+                         */
+                        *fence = gk20a_alloc_fence(NULL, NULL, false);
+                sync_fence_put(sync_fence);
+                goto skip_slow_path;
+        }
+        /* If the fence has signaled there is no reason to wait on it. */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0)
+        status = sync_fence->status;
+#else
+        status = atomic_read(&sync_fence->status);
+#endif
+        if (status) {
+                sync_fence_put(sync_fence);
+                goto skip_slow_path;
+        }
+        err = gk20a_channel_alloc_priv_cmdbuf(c, 8, &wait_cmd);
+        if (err) {
+                gk20a_err(dev_from_gk20a(c->g),
+                                "not enough priv cmd buffer space");
+                sync_fence_put(sync_fence);
+                return -ENOMEM;
+        }
        w = kzalloc(sizeof(*w), GFP_KERNEL);
        if (!w) {
                err = -ENOMEM;
-                goto fail;
+                goto fail_free_cmdbuf;
        }
        sync_fence_waiter_init(&w->waiter, gk20a_channel_semaphore_launcher);
        w->ch = c;
-        w->sema = gk20a_semaphore_alloc(sema->pool);
+        w->sema = gk20a_semaphore_alloc(c);
        if (!w->sema) {
                gk20a_err(dev_from_gk20a(c->g), "ran out of semaphores");
                err = -ENOMEM;
-                goto fail;
+                goto fail_free_worker;
        }
        /* worker takes one reference */
        gk20a_semaphore_get(w->sema);
+        gk20a_semaphore_incr(w->sema);
-        err = gk20a_channel_alloc_priv_cmdbuf(c, 8, &wait_cmd);
+        /* GPU unblocked when the semaphore value increments. */
-        if (err) {
+        add_sema_cmd(c->g, c, w->sema, wait_cmd, 8, true, false);
-                gk20a_err(dev_from_gk20a(c->g),
-                                "not enough priv cmd buffer space");
-                goto fail;
-        }
-        va = gk20a_semaphore_gpu_va(w->sema, c->vm);
-        /* GPU unblocked when when the semaphore value becomes 1. */
-        written = add_sema_cmd(c->g, wait_cmd, va, 1, true, false);
-        WARN_ON(written != wait_cmd->size);
        ret = sync_fence_wait_async(sync_fence, &w->waiter);
        /*
         * If the sync_fence has already signaled then the above async_wait
         * will never trigger. This causes the semaphore release op to never
         * happen which, in turn, hangs the GPU. That's bad. So let's just
-         * do the semaphore_release right now.
+         * do the gk20a_semaphore_release() right now.
         */
-        if (ret == 1)
+        if (ret == 1) {
+                sync_fence_put(sync_fence);
                gk20a_semaphore_release(w->sema);
+                gk20a_semaphore_put(w->sema);
+        }
        /* XXX - this fixes an actual bug, we need to hold a ref to this
           semaphore while the job is in flight. */
        *fence = gk20a_fence_from_semaphore(sema->timeline, w->sema,
                                            &c->semaphore_wq,
                                            NULL, false);
+skip_slow_path:
        *entry = wait_cmd;
        return 0;
-fail:
+fail_free_worker:
        if (w && w->sema)
                gk20a_semaphore_put(w->sema);
        kfree(w);
        sync_fence_put(sync_fence);
+fail_free_cmdbuf:
+        if (wait_cmd)
+                gk20a_free_priv_cmdbuf(c, wait_cmd);
        return err;
 #else
        gk20a_err(dev_from_gk20a(c->g),
@@ -558,9 +689,7 @@ static int __gk20a_channel_semaphore_incr(
                struct gk20a_fence **fence,
                bool need_sync_fence)
 {
-        u64 va;
        int incr_cmd_size;
-        int written;
        struct priv_cmd_entry *incr_cmd = NULL;
        struct gk20a_channel_semaphore *sp =
                container_of(s, struct gk20a_channel_semaphore, ops);
@@ -568,7 +697,7 @@ static int __gk20a_channel_semaphore_incr(
        struct gk20a_semaphore *semaphore;
        int err = 0;
-        semaphore = gk20a_semaphore_alloc(sp->pool);
+        semaphore = gk20a_semaphore_alloc(c);
        if (!semaphore) {
                gk20a_err(dev_from_gk20a(c->g),
                                "ran out of semaphores");
@@ -585,9 +714,7 @@ static int __gk20a_channel_semaphore_incr(
        }
        /* Release the completion semaphore. */
-        va = gk20a_semaphore_gpu_va(semaphore, c->vm);
+        add_sema_cmd(c->g, c, semaphore, incr_cmd, 14, false, wfi_cmd);
-        written = add_sema_cmd(c->g, incr_cmd, va, 1, false, wfi_cmd);
-        WARN_ON(written != incr_cmd_size);
        *fence = gk20a_fence_from_semaphore(sp->timeline, semaphore,
                                            &c->semaphore_wq,
@@ -615,8 +742,10 @@ static int gk20a_channel_semaphore_incr(
 {
        /* Don't put wfi cmd to this one since we're not returning
         * a fence to user space. */
-        return __gk20a_channel_semaphore_incr(s, false /* no wfi */,
+        return __gk20a_channel_semaphore_incr(s,
-                                      NULL, entry, fence, need_sync_fence);
+                        false /* no wfi */,
+                        NULL,
+                        entry, fence, need_sync_fence);
 }
 static int gk20a_channel_semaphore_incr_user(
@@ -679,17 +808,16 @@ static void gk20a_channel_semaphore_destroy(struct gk20a_channel_sync *s)
                container_of(s, struct gk20a_channel_semaphore, ops);
        if (sema->timeline)
                gk20a_sync_timeline_destroy(sema->timeline);
-        if (sema->pool) {
-                gk20a_semaphore_pool_unmap(sema->pool, sema->c->vm);
+        /* The sema pool is cleaned up by the VM destroy. */
-                gk20a_semaphore_pool_put(sema->pool);
+        sema->pool = NULL;
-        }
        kfree(sema);
 }
 static struct gk20a_channel_sync *
 gk20a_channel_semaphore_create(struct channel_gk20a *c)
 {
-        int err;
        int asid = -1;
        struct gk20a_channel_semaphore *sema;
        char pool_name[20];
@@ -706,21 +834,15 @@ gk20a_channel_semaphore_create(struct channel_gk20a *c)
                asid = c->vm->as_share->id;
        sprintf(pool_name, "semaphore_pool-%d", c->hw_chid);
-        sema->pool = gk20a_semaphore_pool_alloc(c->g, pool_name, 1024);
+        sema->pool = c->vm->sema_pool;
-        if (!sema->pool)
-                goto clean_up;
-        /* Map the semaphore pool to the channel vm. Map as read-write to the
-         * owner channel (all other channels should map as read only!). */
-        err = gk20a_semaphore_pool_map(sema->pool, c->vm, gk20a_mem_flag_none);
-        if (err)
-                goto clean_up;
 #ifdef CONFIG_SYNC
        sema->timeline = gk20a_sync_timeline_create(
                        "gk20a_ch%d_as%d", c->hw_chid, asid);
-        if (!sema->timeline)
+        if (!sema->timeline) {
-                goto clean_up;
+                gk20a_channel_semaphore_destroy(&sema->ops);
+                return NULL;
+        }
 #endif
        atomic_set(&sema->ops.refcount, 0);
        sema->ops.wait_syncpt   = gk20a_channel_semaphore_wait_syncpt;
@@ -734,9 +856,6 @@ gk20a_channel_semaphore_create(struct channel_gk20a *c)
        sema->ops.destroy       = gk20a_channel_semaphore_destroy;
        return &sema->ops;
-clean_up:
-        gk20a_channel_semaphore_destroy(&sema->ops);
-        return NULL;
 }
 void gk20a_channel_sync_destroy(struct gk20a_channel_sync *sync)

diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c index d2d8c094..9c8911e9 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
@@ -424,28 +424,52 @@ static void gk20a_channel_semaphore_launcher(
424	}	424	}
425	#endif	425	#endif
426		426
427	static int add_sema_cmd(struct gk20a g, struct priv_cmd_entry cmd,	427	static void add_sema_cmd(struct gk20a g, struct channel_gk20a c,
428	u64 sema, u32 payload, bool acquire, bool wfi)	428	struct gk20a_semaphore s, struct priv_cmd_entry cmd,
		429	int cmd_size, bool acquire, bool wfi)
429	{	430	{
430	u32 off = cmd->off;	431	u32 off = cmd->off;
		432	u64 va;
		433
		434	/*
		435	* RO for acquire (since we just need to read the mem) and RW for
		436	* release since we will need to write back to the semaphore memory.
		437	*/
		438	va = acquire ? gk20a_semaphore_gpu_ro_va(s) :
		439	gk20a_semaphore_gpu_rw_va(s);
		440
		441	/*
		442	* If the op is not an acquire (and is therefore a release) we should
		443	* incr the underlying sema next_value.
		444	*/
		445	if (!acquire)
		446	gk20a_semaphore_incr(s);
		447
431	/* semaphore_a */	448	/* semaphore_a */
432	gk20a_mem_wr32(g, cmd->mem, off++, 0x20010004);	449	gk20a_mem_wr32(g, cmd->mem, off++, 0x20010004);
433	/* offset_upper */	450	/* offset_upper */
434	gk20a_mem_wr32(g, cmd->mem, off++, (sema >> 32) & 0xff);	451	gk20a_mem_wr32(g, cmd->mem, off++, (va >> 32) & 0xff);
435	/* semaphore_b */	452	/* semaphore_b */
436	gk20a_mem_wr32(g, cmd->mem, off++, 0x20010005);	453	gk20a_mem_wr32(g, cmd->mem, off++, 0x20010005);
437	/* offset */	454	/* offset */
438	gk20a_mem_wr32(g, cmd->mem, off++, sema & 0xffffffff);	455	gk20a_mem_wr32(g, cmd->mem, off++, va & 0xffffffff);
439	/* semaphore_c */	456
440	gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006);
441	/* payload */
442	gk20a_mem_wr32(g, cmd->mem, off++, payload);
443	if (acquire) {	457	if (acquire) {
		458	/* semaphore_c */
		459	gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006);
		460	/* payload */
		461	gk20a_mem_wr32(g, cmd->mem, off++,
		462	gk20a_semaphore_get_value(s));
444	/* semaphore_d */	463	/* semaphore_d */
445	gk20a_mem_wr32(g, cmd->mem, off++, 0x20010007);	464	gk20a_mem_wr32(g, cmd->mem, off++, 0x20010007);
446	/* operation: acq_geq, switch_en */	465	/* operation: acq_geq, switch_en */
447	gk20a_mem_wr32(g, cmd->mem, off++, 0x4 \| (0x1 << 12));	466	gk20a_mem_wr32(g, cmd->mem, off++, 0x4 \| (0x1 << 12));
448	} else {	467	} else {
		468	/* semaphore_c */
		469	gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006);
		470	/* payload */
		471	gk20a_mem_wr32(g, cmd->mem, off++,
		472	gk20a_semaphore_get_value(s));
449	/* semaphore_d */	473	/* semaphore_d */
450	gk20a_mem_wr32(g, cmd->mem, off++, 0x20010007);	474	gk20a_mem_wr32(g, cmd->mem, off++, 0x20010007);
451	/* operation: release, wfi */	475	/* operation: release, wfi */
@@ -456,7 +480,6 @@ static int add_sema_cmd(struct gk20a g, struct priv_cmd_entry cmd,
456	/* ignored */	480	/* ignored */
457	gk20a_mem_wr32(g, cmd->mem, off++, 0);	481	gk20a_mem_wr32(g, cmd->mem, off++, 0);
458	}	482	}
459	return off - cmd->off;
460	}	483	}
461		484
462	static int gk20a_channel_semaphore_wait_syncpt(	485	static int gk20a_channel_semaphore_wait_syncpt(
@@ -471,6 +494,76 @@ static int gk20a_channel_semaphore_wait_syncpt(
471	return -ENODEV;	494	return -ENODEV;
472	}	495	}
473		496
		497	/*
		498	* UGHHH - the sync_fence underlying implementation changes from 3.10 to 3.18.
		499	* But since there's no API for getting the underlying sync_pts we have to do
		500	* some conditional compilation.
		501	*/
		502	#ifdef CONFIG_SYNC
		503	static struct gk20a_semaphore sema_from_sync_fence(struct sync_fence f)
		504	{
		505	#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0)
		506	struct sync_pt *pt;
		507
		508	pt = list_first_entry(&f->pt_list_head, struct sync_pt, pt_list);
		509	return gk20a_sync_pt_inst_get_sema(pt);
		510	#else
		511	return gk20a_sync_pt_inst_get_sema(f->cbs[0].sync_pt);
		512	#endif
		513	}
		514
		515	/*
		516	* Attempt a fast path for waiting on a sync_fence. Basically if the passed
		517	* sync_fence is backed by a gk20a_semaphore then there's no reason to go
		518	* through the rigmarole of setting up a separate semaphore which waits on an
		519	* interrupt from the GPU and then triggers a worker thread to execute a SW
		520	* based semaphore release. Instead just have the GPU wait on the same semaphore
		521	* that is going to be incremented by the GPU.
		522	*
		523	* Returns 0 on success, -ENODEV if the fast path cannot be taken because the
		524	* fence is not backed by a GPU semaphore, or the error code returned by
		525	* gk20a_channel_alloc_priv_cmdbuf() if the wait command cannot be allocated.
		526	*/
		527	static int __semaphore_wait_fd_fast_path(struct channel_gk20a *c,
		528	struct sync_fence *fence,
		529	struct priv_cmd_entry **wait_cmd,
		530	struct gk20a_semaphore **fp_sema)
		531	{
		532	struct gk20a_semaphore *sema;
		533	int err;
		534
		535	if (!gk20a_is_sema_backed_sync_fence(fence))
		536	return -ENODEV;
		537
		538	sema = sema_from_sync_fence(fence);
		539
		540	/*
		541	* If there's no underlying sema then that means the underlying sema has
		542	* already signaled.
		543	*/
		544	if (!sema) {
		545	*fp_sema = NULL;
		546	return 0;
		547	}
		548
		549	err = gk20a_channel_alloc_priv_cmdbuf(c, 8, wait_cmd);
		550	if (err)
		551	return err;
		552
		553	gk20a_semaphore_get(sema);
		554	BUG_ON(!atomic_read(&sema->value));
		555	add_sema_cmd(c->g, c, sema, *wait_cmd, 8, true, false);
		556
		557	/*
		558	* Make sure that gk20a_channel_semaphore_wait_fd() can create another
		559	* fence with the underlying semaphore.
		560	*/
		561	*fp_sema = sema;
		562
		563	return 0;
		564	}
		565	#endif
		566
474	static int gk20a_channel_semaphore_wait_fd(	567	static int gk20a_channel_semaphore_wait_fd(
475	struct gk20a_channel_sync *s, int fd,	568	struct gk20a_channel_sync *s, int fd,
476	struct priv_cmd_entry **entry,	569	struct priv_cmd_entry **entry,
@@ -480,69 +573,107 @@ static int gk20a_channel_semaphore_wait_fd(
480	container_of(s, struct gk20a_channel_semaphore, ops);	573	container_of(s, struct gk20a_channel_semaphore, ops);
481	struct channel_gk20a *c = sema->c;	574	struct channel_gk20a *c = sema->c;
482	#ifdef CONFIG_SYNC	575	#ifdef CONFIG_SYNC
		576	struct gk20a_semaphore *fp_sema;
483	struct sync_fence *sync_fence;	577	struct sync_fence *sync_fence;
484	struct priv_cmd_entry *wait_cmd = NULL;	578	struct priv_cmd_entry *wait_cmd = NULL;
485	struct wait_fence_work *w;	579	struct wait_fence_work *w = NULL;
486	int written;	580	int err, ret, status;
487	int err, ret;
488	u64 va;
489		581
490	sync_fence = gk20a_sync_fence_fdget(fd);	582	sync_fence = gk20a_sync_fence_fdget(fd);
491	if (!sync_fence)	583	if (!sync_fence)
492	return -EINVAL;	584	return -EINVAL;
493		585
		586	ret = __semaphore_wait_fd_fast_path(c, sync_fence, &wait_cmd, &fp_sema);
		587	if (ret == 0) {
		588	if (fp_sema)
		589	*fence = gk20a_fence_from_semaphore(sema->timeline,
		590	fp_sema,
		591	&c->semaphore_wq,
		592	NULL, false);
		593	else
		594	/*
		595	* Allocate an empty fence. It will instantly return
		596	* from gk20a_fence_wait().
		597	*/
		598	*fence = gk20a_alloc_fence(NULL, NULL, false);
		599
		600	sync_fence_put(sync_fence);
		601	goto skip_slow_path;
		602	}
		603
		604	/* If the fence has signaled there is no reason to wait on it. */
		605	#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0)
		606	status = sync_fence->status;
		607	#else
		608	status = atomic_read(&sync_fence->status);
		609	#endif
		610	if (status) {
		611	sync_fence_put(sync_fence);
		612	goto skip_slow_path;
		613	}
		614
		615	err = gk20a_channel_alloc_priv_cmdbuf(c, 8, &wait_cmd);
		616	if (err) {
		617	gk20a_err(dev_from_gk20a(c->g),
		618	"not enough priv cmd buffer space");
		619	sync_fence_put(sync_fence);
		620	return -ENOMEM;
		621	}
		622
494	w = kzalloc(sizeof(*w), GFP_KERNEL);	623	w = kzalloc(sizeof(*w), GFP_KERNEL);
495	if (!w) {	624	if (!w) {
496	err = -ENOMEM;	625	err = -ENOMEM;
497	goto fail;	626	goto fail_free_cmdbuf;
498	}	627	}
		628
499	sync_fence_waiter_init(&w->waiter, gk20a_channel_semaphore_launcher);	629	sync_fence_waiter_init(&w->waiter, gk20a_channel_semaphore_launcher);
500	w->ch = c;	630	w->ch = c;
501	w->sema = gk20a_semaphore_alloc(sema->pool);	631	w->sema = gk20a_semaphore_alloc(c);
502	if (!w->sema) {	632	if (!w->sema) {
503	gk20a_err(dev_from_gk20a(c->g), "ran out of semaphores");	633	gk20a_err(dev_from_gk20a(c->g), "ran out of semaphores");
504	err = -ENOMEM;	634	err = -ENOMEM;
505	goto fail;	635	goto fail_free_worker;
506	}	636	}
507		637
508	/* worker takes one reference */	638	/* worker takes one reference */
509	gk20a_semaphore_get(w->sema);	639	gk20a_semaphore_get(w->sema);
		640	gk20a_semaphore_incr(w->sema);
510		641
511	err = gk20a_channel_alloc_priv_cmdbuf(c, 8, &wait_cmd);	642	/* GPU unblocked when the semaphore value increments. */
512	if (err) {	643	add_sema_cmd(c->g, c, w->sema, wait_cmd, 8, true, false);
513	gk20a_err(dev_from_gk20a(c->g),
514	"not enough priv cmd buffer space");
515	goto fail;
516	}
517
518	va = gk20a_semaphore_gpu_va(w->sema, c->vm);
519	/* GPU unblocked when when the semaphore value becomes 1. */
520	written = add_sema_cmd(c->g, wait_cmd, va, 1, true, false);
521		644
522	WARN_ON(written != wait_cmd->size);
523	ret = sync_fence_wait_async(sync_fence, &w->waiter);	645	ret = sync_fence_wait_async(sync_fence, &w->waiter);
524		646
525	/*	647	/*
526	* If the sync_fence has already signaled then the above async_wait	648	* If the sync_fence has already signaled then the above async_wait
527	* will never trigger. This causes the semaphore release op to never	649	* will never trigger. This causes the semaphore release op to never
528	* happen which, in turn, hangs the GPU. That's bad. So let's just	650	* happen which, in turn, hangs the GPU. That's bad. So let's just
529	* do the semaphore_release right now.	651	* do the gk20a_semaphore_release() right now.
530	*/	652	*/
531	if (ret == 1)	653	if (ret == 1) {
		654	sync_fence_put(sync_fence);
532	gk20a_semaphore_release(w->sema);	655	gk20a_semaphore_release(w->sema);
		656	gk20a_semaphore_put(w->sema);
		657	}
533		658
534	/* XXX - this fixes an actual bug, we need to hold a ref to this	659	/* XXX - this fixes an actual bug, we need to hold a ref to this
535	semaphore while the job is in flight. */	660	semaphore while the job is in flight. */
536	*fence = gk20a_fence_from_semaphore(sema->timeline, w->sema,	661	*fence = gk20a_fence_from_semaphore(sema->timeline, w->sema,
537	&c->semaphore_wq,	662	&c->semaphore_wq,
538	NULL, false);	663	NULL, false);
		664
		665	skip_slow_path:
539	*entry = wait_cmd;	666	*entry = wait_cmd;
540	return 0;	667	return 0;
541	fail:	668
		669	fail_free_worker:
542	if (w && w->sema)	670	if (w && w->sema)
543	gk20a_semaphore_put(w->sema);	671	gk20a_semaphore_put(w->sema);
544	kfree(w);	672	kfree(w);
545	sync_fence_put(sync_fence);	673	sync_fence_put(sync_fence);
		674	fail_free_cmdbuf:
		675	if (wait_cmd)
		676	gk20a_free_priv_cmdbuf(c, wait_cmd);
546	return err;	677	return err;
547	#else	678	#else
548	gk20a_err(dev_from_gk20a(c->g),	679	gk20a_err(dev_from_gk20a(c->g),
@@ -558,9 +689,7 @@ static int __gk20a_channel_semaphore_incr(
558	struct gk20a_fence **fence,	689	struct gk20a_fence **fence,
559	bool need_sync_fence)	690	bool need_sync_fence)
560	{	691	{
561	u64 va;
562	int incr_cmd_size;	692	int incr_cmd_size;
563	int written;
564	struct priv_cmd_entry *incr_cmd = NULL;	693	struct priv_cmd_entry *incr_cmd = NULL;
565	struct gk20a_channel_semaphore *sp =	694	struct gk20a_channel_semaphore *sp =
566	container_of(s, struct gk20a_channel_semaphore, ops);	695	container_of(s, struct gk20a_channel_semaphore, ops);
@@ -568,7 +697,7 @@ static int __gk20a_channel_semaphore_incr(
568	struct gk20a_semaphore *semaphore;	697	struct gk20a_semaphore *semaphore;
569	int err = 0;	698	int err = 0;
570		699
571	semaphore = gk20a_semaphore_alloc(sp->pool);	700	semaphore = gk20a_semaphore_alloc(c);
572	if (!semaphore) {	701	if (!semaphore) {
573	gk20a_err(dev_from_gk20a(c->g),	702	gk20a_err(dev_from_gk20a(c->g),
574	"ran out of semaphores");	703	"ran out of semaphores");
@@ -585,9 +714,7 @@ static int __gk20a_channel_semaphore_incr(
585	}	714	}
586		715
587	/* Release the completion semaphore. */	716	/* Release the completion semaphore. */
588	va = gk20a_semaphore_gpu_va(semaphore, c->vm);	717	add_sema_cmd(c->g, c, semaphore, incr_cmd, 14, false, wfi_cmd);
589	written = add_sema_cmd(c->g, incr_cmd, va, 1, false, wfi_cmd);
590	WARN_ON(written != incr_cmd_size);
591		718
592	*fence = gk20a_fence_from_semaphore(sp->timeline, semaphore,	719	*fence = gk20a_fence_from_semaphore(sp->timeline, semaphore,
593	&c->semaphore_wq,	720	&c->semaphore_wq,
@@ -615,8 +742,10 @@ static int gk20a_channel_semaphore_incr(
615	{	742	{
616	/* Don't put wfi cmd to this one since we're not returning	743	/* Don't put wfi cmd to this one since we're not returning
617	* a fence to user space. */	744	* a fence to user space. */
618	return __gk20a_channel_semaphore_incr(s, false /* no wfi */,	745	return __gk20a_channel_semaphore_incr(s,
619	NULL, entry, fence, need_sync_fence);	746	false /* no wfi */,
		747	NULL,
		748	entry, fence, need_sync_fence);
620	}	749	}
621		750
622	static int gk20a_channel_semaphore_incr_user(	751	static int gk20a_channel_semaphore_incr_user(
@@ -679,17 +808,16 @@ static void gk20a_channel_semaphore_destroy(struct gk20a_channel_sync *s)
679	container_of(s, struct gk20a_channel_semaphore, ops);	808	container_of(s, struct gk20a_channel_semaphore, ops);
680	if (sema->timeline)	809	if (sema->timeline)
681	gk20a_sync_timeline_destroy(sema->timeline);	810	gk20a_sync_timeline_destroy(sema->timeline);
682	if (sema->pool) {	811
683	gk20a_semaphore_pool_unmap(sema->pool, sema->c->vm);	812	/* The sema pool is cleaned up by the VM destroy. */
684	gk20a_semaphore_pool_put(sema->pool);	813	sema->pool = NULL;
685	}	814
686	kfree(sema);	815	kfree(sema);
687	}	816	}
688		817
689	static struct gk20a_channel_sync *	818	static struct gk20a_channel_sync *
690	gk20a_channel_semaphore_create(struct channel_gk20a *c)	819	gk20a_channel_semaphore_create(struct channel_gk20a *c)
691	{	820	{
692	int err;
693	int asid = -1;	821	int asid = -1;
694	struct gk20a_channel_semaphore *sema;	822	struct gk20a_channel_semaphore *sema;
695	char pool_name[20];	823	char pool_name[20];
@@ -706,21 +834,15 @@ gk20a_channel_semaphore_create(struct channel_gk20a *c)
706	asid = c->vm->as_share->id;	834	asid = c->vm->as_share->id;
707		835
708	sprintf(pool_name, "semaphore_pool-%d", c->hw_chid);	836	sprintf(pool_name, "semaphore_pool-%d", c->hw_chid);
709	sema->pool = gk20a_semaphore_pool_alloc(c->g, pool_name, 1024);	837	sema->pool = c->vm->sema_pool;
710	if (!sema->pool)
711	goto clean_up;
712
713	/* Map the semaphore pool to the channel vm. Map as read-write to the
714	* owner channel (all other channels should map as read only!). */
715	err = gk20a_semaphore_pool_map(sema->pool, c->vm, gk20a_mem_flag_none);
716	if (err)
717	goto clean_up;
718		838
719	#ifdef CONFIG_SYNC	839	#ifdef CONFIG_SYNC
720	sema->timeline = gk20a_sync_timeline_create(	840	sema->timeline = gk20a_sync_timeline_create(
721	"gk20a_ch%d_as%d", c->hw_chid, asid);	841	"gk20a_ch%d_as%d", c->hw_chid, asid);
722	if (!sema->timeline)	842	if (!sema->timeline) {
723	goto clean_up;	843	gk20a_channel_semaphore_destroy(&sema->ops);
		844	return NULL;
		845	}
724	#endif	846	#endif
725	atomic_set(&sema->ops.refcount, 0);	847	atomic_set(&sema->ops.refcount, 0);
726	sema->ops.wait_syncpt = gk20a_channel_semaphore_wait_syncpt;	848	sema->ops.wait_syncpt = gk20a_channel_semaphore_wait_syncpt;
@@ -734,9 +856,6 @@ gk20a_channel_semaphore_create(struct channel_gk20a *c)
734	sema->ops.destroy = gk20a_channel_semaphore_destroy;	856	sema->ops.destroy = gk20a_channel_semaphore_destroy;
735		857
736	return &sema->ops;	858	return &sema->ops;
737	clean_up:
738	gk20a_channel_semaphore_destroy(&sema->ops);
739	return NULL;
740	}	859	}
741		860
742	void gk20a_channel_sync_destroy(struct gk20a_channel_sync *sync)	861	void gk20a_channel_sync_destroy(struct gk20a_channel_sync *sync)