author	Konsta Holtta <kholtta@nvidia.com>	2018-03-13 10:58:01 -0400
committer	mobile promotions <svcmobile_promotions@nvidia.com>	2018-03-16 10:34:01 -0400
commit	34323b559590ed8f1c64ecbb7ffbd838a6478594 (patch)
tree	c6258e44413a2f15ded4cf63e4a16f5118921703
parent	fb40f2a80739985abac273bc493e07341aa003af (diff)
gpu: nvgpu: wait for all prefence semas on gpu
The pre-fence wait for semaphores in the submit path has supported a fast path
for fences that have only one underlying semaphore. The fast path just inserts
the wait on this sema directly into the pushbuffer. For other fences, the path
has been using a CPU wait indirection, signaling another semaphore when we get
the CPU-side callback.

Instead of only supporting prefences with one sema, unroll all the individual
semaphores and insert a wait for each into the pushbuffer, like we've already
been doing with syncpoints. Now all sema-backed syncs get the fast path. This
simplifies the logic and makes it more explicit that only foreign fences need
the CPU wait.

There is no need to hold references to the sync fence or the semas inside: this
submitted job only needs the global read-only sema mapping that is guaranteed
to stay alive while the VM of this channel stays alive, and the job does not
outlive this channel.

Jira NVGPU-43
Jira NVGPU-66
Jira NVGPU-513

Change-Id: I7cfbb510001d998a864aed8d6afd1582b9adb80d
Signed-off-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1636345
Reviewed-by: Alex Waterman <alexw@nvidia.com>
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
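For illustration, a minimal standalone C sketch of the idea described above: walk every
sync point in a pre-fence and emit one fixed-size GPU semaphore-acquire command per point
into the pushbuffer, filling the slot with no-ops when the point has already expired. The
types and helpers here (fake_fence, fake_sema, emit_sema_acquire) and the opcode value are
illustrative stand-ins, not nvgpu APIs; the real logic is semaphore_wait_fd_native() in the
diff below.

#include <stdio.h>
#include <stddef.h>

#define WAIT_CMD_WORDS 8   /* one sema acquire occupies 8 words, as in the patch */

struct fake_sema { unsigned long gpu_va; unsigned int payload; };

struct fake_fence {
	size_t num_points;
	struct fake_sema *points[4];  /* NULL entry == point already expired */
};

/* Emit one acquire command, or NOP-fill the slot if the sema already signaled. */
static void emit_sema_acquire(unsigned int *pushbuf, size_t slot,
			      const struct fake_sema *sema)
{
	size_t base = slot * WAIT_CMD_WORDS;
	size_t i;

	if (!sema) {
		for (i = 0; i < WAIT_CMD_WORDS; i++)
			pushbuf[base + i] = 0;  /* expired: NOP-filled slot */
		return;
	}
	pushbuf[base + 0] = 0x2001;                 /* made-up "acquire" opcode */
	pushbuf[base + 1] = (unsigned int)sema->gpu_va;
	pushbuf[base + 2] = sema->payload;          /* value the GPU waits for */
	for (i = 3; i < WAIT_CMD_WORDS; i++)
		pushbuf[base + i] = 0;
}

int main(void)
{
	struct fake_sema a = { 0x1000, 7 }, b = { 0x2000, 3 };
	struct fake_fence fence = { 3, { &a, NULL, &b } };
	unsigned int pushbuf[3 * WAIT_CMD_WORDS];
	size_t i;

	/* One fixed-size wait slot per sync point, like wait_cmd_size * num_wait_cmds. */
	for (i = 0; i < fence.num_points; i++)
		emit_sema_acquire(pushbuf, i, fence.points[i]);

	for (i = 0; i < fence.num_points; i++)
		printf("slot %zu: opcode 0x%x\n", i, pushbuf[i * WAIT_CMD_WORDS]);
	return 0;
}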
-rw-r--r--	drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c	175
-rw-r--r--	drivers/gpu/nvgpu/gk20a/sync_gk20a.c	90
-rw-r--r--	drivers/gpu/nvgpu/gk20a/sync_gk20a.h	6
3 files changed, 123 insertions(+), 148 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
index 4b1be8b9..c6b55bf8 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
@@ -502,10 +502,10 @@ static void gk20a_channel_semaphore_launcher(
 
 static void add_sema_cmd(struct gk20a *g, struct channel_gk20a *c,
 		struct nvgpu_semaphore *s, struct priv_cmd_entry *cmd,
-		int cmd_size, bool acquire, bool wfi)
+		u32 offset, bool acquire, bool wfi)
 {
 	int ch = c->chid;
-	u32 ob, off = cmd->off;
+	u32 ob, off = cmd->off + offset;
 	u64 va;
 
 	ob = off;
@@ -588,108 +588,79 @@ static int gk20a_channel_semaphore_wait_syncpt(
 }
 
 #ifdef CONFIG_SYNC
-/*
- * Attempt a fast path for waiting on a sync_fence. Basically if the passed
- * sync_fence is backed by a nvgpu_semaphore then there's no reason to go
- * through the rigmarole of setting up a separate semaphore which waits on an
- * interrupt from the GPU and then triggers a worker thread to execute a SW
- * based semaphore release. Instead just have the GPU wait on the same semaphore
- * that is going to be incremented by the GPU.
- *
- * This function returns 2 possible values: -ENODEV or 0 on success. In the case
- * of -ENODEV the fastpath cannot be taken due to the fence not being backed by
- * a GPU semaphore.
- */
-static int __semaphore_wait_fd_fast_path(struct channel_gk20a *c,
-		struct sync_fence *fence,
-		struct priv_cmd_entry *wait_cmd,
-		struct nvgpu_semaphore **fp_sema)
+static int semaphore_wait_fd_native(struct channel_gk20a *c, int fd,
+		struct priv_cmd_entry *wait_cmd)
 {
-	struct nvgpu_semaphore *sema;
+	struct sync_fence *sync_fence;
 	int err;
+	const int wait_cmd_size = 8;
+	int num_wait_cmds;
+	int i;
 
-	if (!gk20a_is_sema_backed_sync_fence(fence))
-		return -ENODEV;
-
-	sema = gk20a_sync_fence_get_sema(fence);
+	sync_fence = gk20a_sync_fence_fdget(fd);
+	if (!sync_fence)
+		return -EINVAL;
 
-	/*
-	 * If there's no underlying sema then that means the underlying sema has
-	 * already signaled.
-	 */
-	if (!sema) {
-		*fp_sema = NULL;
-		return 0;
+	num_wait_cmds = sync_fence->num_fences;
+	if (num_wait_cmds == 0) {
+		err = 0;
+		goto put_fence;
 	}
 
-	err = gk20a_channel_alloc_priv_cmdbuf(c, 8, wait_cmd);
-	if (err)
-		return err;
+	err = gk20a_channel_alloc_priv_cmdbuf(c,
+			wait_cmd_size * num_wait_cmds,
+			wait_cmd);
+	if (err) {
+		nvgpu_err(c->g, "not enough priv cmd buffer space");
+		goto put_fence;
+	}
 
-	nvgpu_semaphore_get(sema);
-	BUG_ON(!sema->incremented);
-	add_sema_cmd(c->g, c, sema, wait_cmd, 8, true, false);
+	for (i = 0; i < sync_fence->num_fences; i++) {
+		struct fence *f = sync_fence->cbs[i].sync_pt;
+		struct sync_pt *pt = sync_pt_from_fence(f);
+		struct nvgpu_semaphore *sema;
 
-	/*
-	 * Make sure that gk20a_channel_semaphore_wait_fd() can create another
-	 * fence with the underlying semaphore.
-	 */
-	*fp_sema = sema;
+		sema = gk20a_sync_pt_sema(pt);
+		if (!sema) {
+			/* expired */
+			nvgpu_memset(c->g, wait_cmd->mem,
+			(wait_cmd->off + i * wait_cmd_size) * sizeof(u32),
+				0, wait_cmd_size * sizeof(u32));
+		} else {
+			WARN_ON(!sema->incremented);
+			add_sema_cmd(c->g, c, sema, wait_cmd,
+					i * wait_cmd_size, true, false);
+			nvgpu_semaphore_put(sema);
+		}
+	}
 
-	return 0;
+put_fence:
+	sync_fence_put(sync_fence);
+	return err;
 }
-#endif
 
-static int gk20a_channel_semaphore_wait_fd(
-		struct gk20a_channel_sync *s, int fd,
-		struct priv_cmd_entry *entry,
-		struct gk20a_fence *fence)
+static int semaphore_wait_fd_proxy(struct channel_gk20a *c, int fd,
+		struct priv_cmd_entry *wait_cmd,
+		struct gk20a_fence *fence_out,
+		struct sync_timeline *timeline)
 {
-	struct gk20a_channel_semaphore *sema =
-		container_of(s, struct gk20a_channel_semaphore, ops);
-	struct channel_gk20a *c = sema->c;
-#ifdef CONFIG_SYNC
-	struct nvgpu_semaphore *fp_sema;
+	const int wait_cmd_size = 8;
 	struct sync_fence *sync_fence;
-	struct priv_cmd_entry *wait_cmd = entry;
 	struct wait_fence_work *w = NULL;
-	int err, ret, status;
+	int err, status;
 
-	sync_fence = gk20a_sync_fence_fdget(fd);
+	sync_fence = sync_fence_fdget(fd);
 	if (!sync_fence)
 		return -EINVAL;
 
-	ret = __semaphore_wait_fd_fast_path(c, sync_fence, wait_cmd, &fp_sema);
-	if (ret == 0) {
-		if (fp_sema) {
-			err = gk20a_fence_from_semaphore(c->g, fence,
-					sema->timeline,
-					fp_sema,
-					&c->semaphore_wq,
-					false);
-			if (err) {
-				nvgpu_semaphore_put(fp_sema);
-				goto clean_up_priv_cmd;
-			}
-		} else
-			/*
-			 * Init an empty fence. It will instantly return
-			 * from gk20a_fence_wait().
-			 */
-			gk20a_init_fence(fence, NULL, NULL);
-
-		sync_fence_put(sync_fence);
-		goto skip_slow_path;
-	}
-
 	/* If the fence has signaled there is no reason to wait on it. */
 	status = atomic_read(&sync_fence->status);
 	if (status == 0) {
 		sync_fence_put(sync_fence);
-		goto skip_slow_path;
+		return 0;
 	}
 
-	err = gk20a_channel_alloc_priv_cmdbuf(c, 8, wait_cmd);
+	err = gk20a_channel_alloc_priv_cmdbuf(c, wait_cmd_size, wait_cmd);
 	if (err) {
 		nvgpu_err(c->g,
 			"not enough priv cmd buffer space");
@@ -718,34 +689,34 @@ static int gk20a_channel_semaphore_wait_fd(
 	nvgpu_semaphore_incr(w->sema, c->hw_sema);
 
 	/* GPU unblocked when the semaphore value increments. */
-	add_sema_cmd(c->g, c, w->sema, wait_cmd, 8, true, false);
+	add_sema_cmd(c->g, c, w->sema, wait_cmd, 0, true, false);
 
 	/*
 	 * We need to create the fence before adding the waiter to ensure
 	 * that we properly clean up in the event the sync_fence has
 	 * already signaled
 	 */
-	err = gk20a_fence_from_semaphore(c->g, fence, sema->timeline, w->sema,
-			&c->semaphore_wq, false);
+	err = gk20a_fence_from_semaphore(c->g, fence_out, timeline,
+			w->sema, &c->semaphore_wq, false);
 	if (err)
 		goto clean_up_sema;
 
-	ret = sync_fence_wait_async(sync_fence, &w->waiter);
+	err = sync_fence_wait_async(sync_fence, &w->waiter);
 	gk20a_add_pending_sema_wait(c->g, w);
 
 	/*
-	 * If the sync_fence has already signaled then the above async_wait
-	 * will never trigger. This causes the semaphore release op to never
-	 * happen which, in turn, hangs the GPU. That's bad. So let's just
-	 * do the nvgpu_semaphore_release() right now.
+	 * If the sync_fence has already signaled then the above wait_async
+	 * will not get scheduled; the fence completed just after doing the
+	 * status check above before allocs and waiter init, and won the race.
+	 * This causes the waiter to be skipped, so let's release the semaphore
+	 * here and put the refs taken for the worker.
 	 */
-	if (ret == 1) {
+	if (err == 1) {
 		sync_fence_put(sync_fence);
 		nvgpu_semaphore_release(w->sema, c->hw_sema);
 		nvgpu_semaphore_put(w->sema);
 	}
 
-skip_slow_path:
 	return 0;
 
 clean_up_sema:
@@ -758,10 +729,28 @@ clean_up_sema:
 clean_up_worker:
 	nvgpu_kfree(c->g, w);
 clean_up_priv_cmd:
-	gk20a_free_priv_cmdbuf(c, entry);
+	gk20a_free_priv_cmdbuf(c, wait_cmd);
 clean_up_sync_fence:
 	sync_fence_put(sync_fence);
 	return err;
+}
+#endif
+
+static int gk20a_channel_semaphore_wait_fd(
+		struct gk20a_channel_sync *s, int fd,
+		struct priv_cmd_entry *entry,
+		struct gk20a_fence *fence)
+{
+	struct gk20a_channel_semaphore *sema =
+		container_of(s, struct gk20a_channel_semaphore, ops);
+	struct channel_gk20a *c = sema->c;
+#ifdef CONFIG_SYNC
+	int err;
+
+	err = semaphore_wait_fd_native(c, fd, entry);
+	if (err)
+		err = semaphore_wait_fd_proxy(c, fd, entry, fence, sema->timeline);
+	return err;
 #else
 	nvgpu_err(c->g,
 		"trying to use sync fds with CONFIG_SYNC disabled");
@@ -798,7 +787,7 @@ static int __gk20a_channel_semaphore_incr(
 	}
 
 	/* Release the completion semaphore. */
-	add_sema_cmd(c->g, c, semaphore, incr_cmd, 14, false, wfi_cmd);
+	add_sema_cmd(c->g, c, semaphore, incr_cmd, 0, false, wfi_cmd);
 
 	err = gk20a_fence_from_semaphore(c->g, fence,
 			sp->timeline, semaphore,
diff --git a/drivers/gpu/nvgpu/gk20a/sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/sync_gk20a.c
index f6d16b90..a8600bce 100644
--- a/drivers/gpu/nvgpu/gk20a/sync_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/sync_gk20a.c
@@ -1,7 +1,7 @@
 /*
  * GK20A Sync Framework Integration
  *
- * Copyright (c) 2014-2017, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2014-2018, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -70,55 +70,6 @@ struct gk20a_sync_pt_inst {
 };
 
 /**
- * Check if the passed sync_fence is backed by a single GPU semaphore. In such
- * cases we can short circuit a lot of SW involved in signaling pre-fences and
- * post fences.
- *
- * For now reject multi-sync_pt fences. This could be changed in future. It
- * would require that the sema fast path push a sema acquire for each semaphore
- * in the fence.
- */
-int gk20a_is_sema_backed_sync_fence(struct sync_fence *fence)
-{
-	struct sync_timeline *t;
-
-	struct fence *pt = fence->cbs[0].sync_pt;
-	struct sync_pt *spt = sync_pt_from_fence(pt);
-
-	if (fence->num_fences != 1)
-		return 0;
-
-	if (spt == NULL)
-		return 0;
-
-	t = sync_pt_parent(spt);
-
-	if (t->ops == &gk20a_sync_timeline_ops)
-		return 1;
-	return 0;
-}
-
-struct nvgpu_semaphore *gk20a_sync_fence_get_sema(struct sync_fence *f)
-{
-	struct sync_pt *spt;
-	struct gk20a_sync_pt_inst *pti;
-
-	struct fence *pt;
-
-	if (!f)
-		return NULL;
-
-	if (!gk20a_is_sema_backed_sync_fence(f))
-		return NULL;
-
-	pt = f->cbs[0].sync_pt;
-	spt = sync_pt_from_fence(pt);
-	pti = container_of(spt, struct gk20a_sync_pt_inst, pt);
-
-	return pti->shared->sema;
-}
-
-/**
  * Compares sync pt values a and b, both of which will trigger either before
  * or after ref (i.e. a and b trigger before ref, or a and b trigger after
  * ref). Supplying ref allows us to handle wrapping correctly.
@@ -371,7 +322,44 @@ static const struct sync_timeline_ops gk20a_sync_timeline_ops = {
 
 struct sync_fence *gk20a_sync_fence_fdget(int fd)
 {
-	return sync_fence_fdget(fd);
+	struct sync_fence *fence = sync_fence_fdget(fd);
+	int i;
+
+	if (!fence)
+		return NULL;
+
+	for (i = 0; i < fence->num_fences; i++) {
+		struct fence *pt = fence->cbs[i].sync_pt;
+		struct sync_pt *spt = sync_pt_from_fence(pt);
+		struct sync_timeline *t;
+
+		if (spt == NULL) {
+			sync_fence_put(fence);
+			return NULL;
+		}
+
+		t = sync_pt_parent(spt);
+		if (t->ops != &gk20a_sync_timeline_ops) {
+			sync_fence_put(fence);
+			return NULL;
+		}
+	}
+
+	return fence;
+}
+
+struct nvgpu_semaphore *gk20a_sync_pt_sema(struct sync_pt *spt)
+{
+	struct gk20a_sync_pt *pt = to_gk20a_sync_pt(spt);
+	struct nvgpu_semaphore *sema;
+
+	nvgpu_spinlock_acquire(&pt->lock);
+	sema = pt->sema;
+	if (sema)
+		nvgpu_semaphore_get(sema);
+	nvgpu_spinlock_release(&pt->lock);
+
+	return sema;
 }
 
 void gk20a_sync_timeline_signal(struct sync_timeline *timeline)
diff --git a/drivers/gpu/nvgpu/gk20a/sync_gk20a.h b/drivers/gpu/nvgpu/gk20a/sync_gk20a.h
index 7d7aff6d..8a6439ab 100644
--- a/drivers/gpu/nvgpu/gk20a/sync_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/sync_gk20a.h
@@ -3,7 +3,7 @@
  *
  * GK20A Sync Framework Integration
  *
- * Copyright (c) 2014-2017, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2014-2018, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -33,9 +33,6 @@ struct sync_pt;
 struct nvgpu_semaphore;
 struct fence;
 
-int gk20a_is_sema_backed_sync_fence(struct sync_fence *fence);
-struct nvgpu_semaphore *gk20a_sync_fence_get_sema(struct sync_fence *f);
-
 #ifdef CONFIG_SYNC
 struct sync_timeline *gk20a_sync_timeline_create(const char *fmt, ...);
 void gk20a_sync_timeline_destroy(struct sync_timeline *);
@@ -46,6 +43,7 @@ struct sync_fence *gk20a_sync_fence_create(
 	struct nvgpu_semaphore *,
 	const char *fmt, ...);
 struct sync_fence *gk20a_sync_fence_fdget(int fd);
+struct nvgpu_semaphore *gk20a_sync_pt_sema(struct sync_pt *spt);
 #else
 static inline void gk20a_sync_timeline_destroy(struct sync_timeline *obj) {}
 static inline void gk20a_sync_timeline_signal(struct sync_timeline *obj) {}