3 files changed, 123 insertions, 148 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
index 4b1be8b9..c6b55bf8 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
@@ -502,10 +502,10 @@ static void gk20a_channel_semaphore_launcher(
 static void add_sema_cmd(struct gk20a *g, struct channel_gk20a *c,
                         struct nvgpu_semaphore *s, struct priv_cmd_entry *cmd,
-                         int cmd_size, bool acquire, bool wfi)
+                         u32 offset, bool acquire, bool wfi)
 {
        int ch = c->chid;
-        u32 ob, off = cmd->off;
+        u32 ob, off = cmd->off + offset;
        u64 va;
        ob = off;
@@ -588,108 +588,79 @@ static int gk20a_channel_semaphore_wait_syncpt(
 }
 #ifdef CONFIG_SYNC
-/*
+static int semaphore_wait_fd_native(struct channel_gk20a *c, int fd,
- * Attempt a fast path for waiting on a sync_fence. Basically if the passed
+                struct priv_cmd_entry *wait_cmd)
- * sync_fence is backed by a nvgpu_semaphore then there's no reason to go
- * through the rigmarole of setting up a separate semaphore which waits on an
- * interrupt from the GPU and then triggers a worker thread to execute a SW
- * based semaphore release. Instead just have the GPU wait on the same semaphore
- * that is going to be incremented by the GPU.
- *
- * This function returns 2 possible values: -ENODEV or 0 on success. In the case
- * of -ENODEV the fastpath cannot be taken due to the fence not being backed by
- * a GPU semaphore.
- */
-static int __semaphore_wait_fd_fast_path(struct channel_gk20a *c,
-                                         struct sync_fence *fence,
-                                         struct priv_cmd_entry *wait_cmd,
-                                         struct nvgpu_semaphore **fp_sema)
 {
-        struct nvgpu_semaphore *sema;
+        struct sync_fence *sync_fence;
        int err;
+        const int wait_cmd_size = 8;
+        int num_wait_cmds;
+        int i;
-        if (!gk20a_is_sema_backed_sync_fence(fence))
+        sync_fence = gk20a_sync_fence_fdget(fd);
-                return -ENODEV;
+        if (!sync_fence)
+                return -EINVAL;
-        sema = gk20a_sync_fence_get_sema(fence);
-        /*
+        num_wait_cmds = sync_fence->num_fences;
-         * If there's no underlying sema then that means the underlying sema has
+        if (num_wait_cmds == 0) {
-         * already signaled.
+                err = 0;
-         */
+                goto put_fence;
-        if (!sema) {
-                *fp_sema = NULL;
-                return 0;
        }
-        err = gk20a_channel_alloc_priv_cmdbuf(c, 8, wait_cmd);
+        err = gk20a_channel_alloc_priv_cmdbuf(c,
-        if (err)
+                        wait_cmd_size * num_wait_cmds,
-                return err;
+                        wait_cmd);
+        if (err) {
+                nvgpu_err(c->g, "not enough priv cmd buffer space");
+                goto put_fence;
+        }
-        nvgpu_semaphore_get(sema);
+        for (i = 0; i < sync_fence->num_fences; i++) {
-        BUG_ON(!sema->incremented);
+                struct fence *f = sync_fence->cbs[i].sync_pt;
-        add_sema_cmd(c->g, c, sema, wait_cmd, 8, true, false);
+                struct sync_pt *pt = sync_pt_from_fence(f);
+                struct nvgpu_semaphore *sema;
-        /*
+                sema = gk20a_sync_pt_sema(pt);
-         * Make sure that gk20a_channel_semaphore_wait_fd() can create another
+                if (!sema) {
-         * fence with the underlying semaphore.
+                        /* expired */
-         */
+                        nvgpu_memset(c->g, wait_cmd->mem,
-        *fp_sema = sema;
+                        (wait_cmd->off + i * wait_cmd_size) * sizeof(u32),
+                                0, wait_cmd_size * sizeof(u32));
+                } else {
+                        WARN_ON(!sema->incremented);
+                        add_sema_cmd(c->g, c, sema, wait_cmd,
+                                        i * wait_cmd_size, true, false);
+                        nvgpu_semaphore_put(sema);
+                }
+        }
-        return 0;
+put_fence:
+        sync_fence_put(sync_fence);
+        return err;
 }
-#endif
-static int gk20a_channel_semaphore_wait_fd(
+static int semaphore_wait_fd_proxy(struct channel_gk20a *c, int fd,
-                struct gk20a_channel_sync *s, int fd,
+                struct priv_cmd_entry *wait_cmd,
-                struct priv_cmd_entry *entry,
+                struct gk20a_fence *fence_out,
-                struct gk20a_fence *fence)
+                struct sync_timeline *timeline)
 {
-        struct gk20a_channel_semaphore *sema =
+        const int wait_cmd_size = 8;
-                container_of(s, struct gk20a_channel_semaphore, ops);
-        struct channel_gk20a *c = sema->c;
-#ifdef CONFIG_SYNC
-        struct nvgpu_semaphore *fp_sema;
        struct sync_fence *sync_fence;
-        struct priv_cmd_entry *wait_cmd = entry;
        struct wait_fence_work *w = NULL;
-        int err, ret, status;
+        int err, status;
-        sync_fence = gk20a_sync_fence_fdget(fd);
+        sync_fence = sync_fence_fdget(fd);
        if (!sync_fence)
                return -EINVAL;
-        ret = __semaphore_wait_fd_fast_path(c, sync_fence, wait_cmd, &fp_sema);
-        if (ret == 0) {
-                if (fp_sema) {
-                        err = gk20a_fence_from_semaphore(c->g, fence,
-                                        sema->timeline,
-                                        fp_sema,
-                                        &c->semaphore_wq,
-                                        false);
-                        if (err) {
-                                nvgpu_semaphore_put(fp_sema);
-                                goto clean_up_priv_cmd;
-                        }
-                } else
-                        /*
-                         * Init an empty fence. It will instantly return
-                         * from gk20a_fence_wait().
-                         */
-                        gk20a_init_fence(fence, NULL, NULL);
-                sync_fence_put(sync_fence);
-                goto skip_slow_path;
-        }
        /* If the fence has signaled there is no reason to wait on it. */
        status = atomic_read(&sync_fence->status);
        if (status == 0) {
                sync_fence_put(sync_fence);
-                goto skip_slow_path;
+                return 0;
        }
-        err = gk20a_channel_alloc_priv_cmdbuf(c, 8, wait_cmd);
+        err = gk20a_channel_alloc_priv_cmdbuf(c, wait_cmd_size, wait_cmd);
        if (err) {
                nvgpu_err(c->g,
                                "not enough priv cmd buffer space");
@@ -718,34 +689,34 @@ static int gk20a_channel_semaphore_wait_fd(
        nvgpu_semaphore_incr(w->sema, c->hw_sema);
        /* GPU unblocked when the semaphore value increments. */
-        add_sema_cmd(c->g, c, w->sema, wait_cmd, 8, true, false);
+        add_sema_cmd(c->g, c, w->sema, wait_cmd, 0, true, false);
        /*
         *  We need to create the fence before adding the waiter to ensure
         *  that we properly clean up in the event the sync_fence has
         *  already signaled
         */
-        err = gk20a_fence_from_semaphore(c->g, fence, sema->timeline, w->sema,
+        err = gk20a_fence_from_semaphore(c->g, fence_out, timeline,
-                        &c->semaphore_wq, false);
+                        w->sema, &c->semaphore_wq, false);
        if (err)
                goto clean_up_sema;
-        ret = sync_fence_wait_async(sync_fence, &w->waiter);
+        err = sync_fence_wait_async(sync_fence, &w->waiter);
        gk20a_add_pending_sema_wait(c->g, w);
-        /*
+        /*
-         * If the sync_fence has already signaled then the above async_wait
+         * If the sync_fence has already signaled then the above wait_async
-         * will never trigger. This causes the semaphore release op to never
+         * will not get scheduled; the fence completed just after doing the
-         * happen which, in turn, hangs the GPU. That's bad. So let's just
+         * status check above before allocs and waiter init, and won the race.
-         * do the nvgpu_semaphore_release() right now.
+         * This causes the waiter to be skipped, so let's release the semaphore
+         * here and put the refs taken for the worker.
         */
-        if (ret == 1) {
+        if (err == 1) {
                sync_fence_put(sync_fence);
                nvgpu_semaphore_release(w->sema, c->hw_sema);
                nvgpu_semaphore_put(w->sema);
        }
-skip_slow_path:
        return 0;
 clean_up_sema:
@@ -758,10 +729,28 @@ clean_up_sema:
 clean_up_worker:
        nvgpu_kfree(c->g, w);
 clean_up_priv_cmd:
-        gk20a_free_priv_cmdbuf(c, entry);
+        gk20a_free_priv_cmdbuf(c, wait_cmd);
 clean_up_sync_fence:
        sync_fence_put(sync_fence);
        return err;
+}
+#endif
+static int gk20a_channel_semaphore_wait_fd(
+                struct gk20a_channel_sync *s, int fd,
+                struct priv_cmd_entry *entry,
+                struct gk20a_fence *fence)
+{
+        struct gk20a_channel_semaphore *sema =
+                container_of(s, struct gk20a_channel_semaphore, ops);
+        struct channel_gk20a *c = sema->c;
+#ifdef CONFIG_SYNC
+        int err;
+        err = semaphore_wait_fd_native(c, fd, entry);
+        if (err)
+                err = semaphore_wait_fd_proxy(c, fd, entry, fence, sema->timeline);
+        return err;
 #else
        nvgpu_err(c->g,
                  "trying to use sync fds with CONFIG_SYNC disabled");
@@ -798,7 +787,7 @@ static int __gk20a_channel_semaphore_incr(
        }
        /* Release the completion semaphore. */
-        add_sema_cmd(c->g, c, semaphore, incr_cmd, 14, false, wfi_cmd);
+        add_sema_cmd(c->g, c, semaphore, incr_cmd, 0, false, wfi_cmd);
        err = gk20a_fence_from_semaphore(c->g, fence,
                        sp->timeline, semaphore,
diff --git a/drivers/gpu/nvgpu/gk20a/sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/sync_gk20a.c
index f6d16b90..a8600bce 100644
--- a/drivers/gpu/nvgpu/gk20a/sync_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/sync_gk20a.c
@@ -1,7 +1,7 @@
 /*
 * GK20A Sync Framework Integration
 *
- * Copyright (c) 2014-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2014-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -70,55 +70,6 @@ struct gk20a_sync_pt_inst {
 };
 /**
- * Check if the passed sync_fence is backed by a single GPU semaphore. In such
- * cases we can short circuit a lot of SW involved in signaling pre-fences and
- * post fences.
- *
- * For now reject multi-sync_pt fences. This could be changed in future. It
- * would require that the sema fast path push a sema acquire for each semaphore
- * in the fence.
- */
-int gk20a_is_sema_backed_sync_fence(struct sync_fence *fence)
-{
-        struct sync_timeline *t;
-        struct fence *pt = fence->cbs[0].sync_pt;
-        struct sync_pt *spt = sync_pt_from_fence(pt);
-        if (fence->num_fences != 1)
-                return 0;
-        if (spt == NULL)
-                return 0;
-        t = sync_pt_parent(spt);
-        if (t->ops == &gk20a_sync_timeline_ops)
-                return 1;
-        return 0;
-}
-struct nvgpu_semaphore *gk20a_sync_fence_get_sema(struct sync_fence *f)
-{
-        struct sync_pt *spt;
-        struct gk20a_sync_pt_inst *pti;
-        struct fence *pt;
-        if (!f)
-                return NULL;
-        if (!gk20a_is_sema_backed_sync_fence(f))
-                return NULL;
-        pt = f->cbs[0].sync_pt;
-        spt = sync_pt_from_fence(pt);
-        pti = container_of(spt, struct gk20a_sync_pt_inst, pt);
-        return pti->shared->sema;
-}
-/**
 * Compares sync pt values a and b, both of which will trigger either before
 * or after ref (i.e. a and b trigger before ref, or a and b trigger after
 * ref). Supplying ref allows us to handle wrapping correctly.
@@ -371,7 +322,44 @@ static const struct sync_timeline_ops gk20a_sync_timeline_ops = {
 struct sync_fence *gk20a_sync_fence_fdget(int fd)
 {
-        return sync_fence_fdget(fd);
+        struct sync_fence *fence = sync_fence_fdget(fd);
+        int i;
+        if (!fence)
+                return NULL;
+        for (i = 0; i < fence->num_fences; i++) {
+                struct fence *pt = fence->cbs[i].sync_pt;
+                struct sync_pt *spt = sync_pt_from_fence(pt);
+                struct sync_timeline *t;
+                if (spt == NULL) {
+                        sync_fence_put(fence);
+                        return NULL;
+                }
+                t = sync_pt_parent(spt);
+                if (t->ops != &gk20a_sync_timeline_ops) {
+                        sync_fence_put(fence);
+                        return NULL;
+                }
+        }
+        return fence;
+}
+struct nvgpu_semaphore *gk20a_sync_pt_sema(struct sync_pt *spt)
+{
+        struct gk20a_sync_pt *pt = to_gk20a_sync_pt(spt);
+        struct nvgpu_semaphore *sema;
+        nvgpu_spinlock_acquire(&pt->lock);
+        sema = pt->sema;
+        if (sema)
+                nvgpu_semaphore_get(sema);
+        nvgpu_spinlock_release(&pt->lock);
+        return sema;
 }
 void gk20a_sync_timeline_signal(struct sync_timeline *timeline)
diff --git a/drivers/gpu/nvgpu/gk20a/sync_gk20a.h b/drivers/gpu/nvgpu/gk20a/sync_gk20a.h
index 7d7aff6d..8a6439ab 100644
--- a/drivers/gpu/nvgpu/gk20a/sync_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/sync_gk20a.h
@@ -3,7 +3,7 @@
 *
 * GK20A Sync Framework Integration
 *
- * Copyright (c) 2014-2017, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2014-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -33,9 +33,6 @@ struct sync_pt;
 struct nvgpu_semaphore;
 struct fence;
-int gk20a_is_sema_backed_sync_fence(struct sync_fence *fence);
-struct nvgpu_semaphore *gk20a_sync_fence_get_sema(struct sync_fence *f);
 #ifdef CONFIG_SYNC
 struct sync_timeline *gk20a_sync_timeline_create(const char *fmt, ...);
 void gk20a_sync_timeline_destroy(struct sync_timeline *);
@@ -46,6 +43,7 @@ struct sync_fence *gk20a_sync_fence_create(
                struct nvgpu_semaphore *,
                const char *fmt, ...);
 struct sync_fence *gk20a_sync_fence_fdget(int fd);
+struct nvgpu_semaphore *gk20a_sync_pt_sema(struct sync_pt *spt);
 #else
 static inline void gk20a_sync_timeline_destroy(struct sync_timeline *obj) {}
 static inline void gk20a_sync_timeline_signal(struct sync_timeline *obj) {}

diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c index 4b1be8b9..c6b55bf8 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
@@ -502,10 +502,10 @@ static void gk20a_channel_semaphore_launcher(
502		502
503	static void add_sema_cmd(struct gk20a *g, struct channel_gk20a *c,	503	static void add_sema_cmd(struct gk20a *g, struct channel_gk20a *c,
504	struct nvgpu_semaphore *s, struct priv_cmd_entry *cmd,	504	struct nvgpu_semaphore *s, struct priv_cmd_entry *cmd,
505	int cmd_size, bool acquire, bool wfi)	505	u32 offset, bool acquire, bool wfi)
506	{	506	{
507	int ch = c->chid;	507	int ch = c->chid;
508	u32 ob, off = cmd->off;	508	u32 ob, off = cmd->off + offset;
509	u64 va;	509	u64 va;
510		510
511	ob = off;	511	ob = off;
@@ -588,108 +588,79 @@ static int gk20a_channel_semaphore_wait_syncpt(
588	}	588	}
589		589
590	#ifdef CONFIG_SYNC	590	#ifdef CONFIG_SYNC
591	/*	591	static int semaphore_wait_fd_native(struct channel_gk20a *c, int fd,
592	* Attempt a fast path for waiting on a sync_fence. Basically if the passed	592	struct priv_cmd_entry *wait_cmd)
593	* sync_fence is backed by a nvgpu_semaphore then there's no reason to go
594	* through the rigmarole of setting up a separate semaphore which waits on an
595	* interrupt from the GPU and then triggers a worker thread to execute a SW
596	* based semaphore release. Instead just have the GPU wait on the same semaphore
597	* that is going to be incremented by the GPU.
598	*
599	* This function returns 2 possible values: -ENODEV or 0 on success. In the case
600	* of -ENODEV the fastpath cannot be taken due to the fence not being backed by
601	* a GPU semaphore.
602	*/
603	static int __semaphore_wait_fd_fast_path(struct channel_gk20a *c,
604	struct sync_fence *fence,
605	struct priv_cmd_entry *wait_cmd,
606	struct nvgpu_semaphore **fp_sema)
607	{	593	{
608	struct nvgpu_semaphore *sema;	594	struct sync_fence *sync_fence;
609	int err;	595	int err;
		596	const int wait_cmd_size = 8;
		597	int num_wait_cmds;
		598	int i;
610		599
611	if (!gk20a_is_sema_backed_sync_fence(fence))	600	sync_fence = gk20a_sync_fence_fdget(fd);
612	return -ENODEV;	601	if (!sync_fence)
613		602	return -EINVAL;
614	sema = gk20a_sync_fence_get_sema(fence);
615		603
616	/*	604	num_wait_cmds = sync_fence->num_fences;
617	* If there's no underlying sema then that means the underlying sema has	605	if (num_wait_cmds == 0) {
618	* already signaled.	606	err = 0;
619	*/	607	goto put_fence;
620	if (!sema) {
621	*fp_sema = NULL;
622	return 0;
623	}	608	}
624		609
625	err = gk20a_channel_alloc_priv_cmdbuf(c, 8, wait_cmd);	610	err = gk20a_channel_alloc_priv_cmdbuf(c,
626	if (err)	611	wait_cmd_size * num_wait_cmds,
627	return err;	612	wait_cmd);
		613	if (err) {
		614	nvgpu_err(c->g, "not enough priv cmd buffer space");
		615	goto put_fence;
		616	}
628		617
629	nvgpu_semaphore_get(sema);	618	for (i = 0; i < sync_fence->num_fences; i++) {
630	BUG_ON(!sema->incremented);	619	struct fence *f = sync_fence->cbs[i].sync_pt;
631	add_sema_cmd(c->g, c, sema, wait_cmd, 8, true, false);	620	struct sync_pt *pt = sync_pt_from_fence(f);
		621	struct nvgpu_semaphore *sema;
632		622
633	/*	623	sema = gk20a_sync_pt_sema(pt);
634	* Make sure that gk20a_channel_semaphore_wait_fd() can create another	624	if (!sema) {
635	* fence with the underlying semaphore.	625	/* expired */
636	*/	626	nvgpu_memset(c->g, wait_cmd->mem,
637	*fp_sema = sema;	627	(wait_cmd->off + i * wait_cmd_size) * sizeof(u32),
		628	0, wait_cmd_size * sizeof(u32));
		629	} else {
		630	WARN_ON(!sema->incremented);
		631	add_sema_cmd(c->g, c, sema, wait_cmd,
		632	i * wait_cmd_size, true, false);
		633	nvgpu_semaphore_put(sema);
		634	}
		635	}
638		636
639	return 0;	637	put_fence:
		638	sync_fence_put(sync_fence);
		639	return err;
640	}	640	}
641	#endif
642		641
643	static int gk20a_channel_semaphore_wait_fd(	642	static int semaphore_wait_fd_proxy(struct channel_gk20a *c, int fd,
644	struct gk20a_channel_sync *s, int fd,	643	struct priv_cmd_entry *wait_cmd,
645	struct priv_cmd_entry *entry,	644	struct gk20a_fence *fence_out,
646	struct gk20a_fence *fence)	645	struct sync_timeline *timeline)
647	{	646	{
648	struct gk20a_channel_semaphore *sema =	647	const int wait_cmd_size = 8;
649	container_of(s, struct gk20a_channel_semaphore, ops);
650	struct channel_gk20a *c = sema->c;
651	#ifdef CONFIG_SYNC
652	struct nvgpu_semaphore *fp_sema;
653	struct sync_fence *sync_fence;	648	struct sync_fence *sync_fence;
654	struct priv_cmd_entry *wait_cmd = entry;
655	struct wait_fence_work *w = NULL;	649	struct wait_fence_work *w = NULL;
656	int err, ret, status;	650	int err, status;
657		651
658	sync_fence = gk20a_sync_fence_fdget(fd);	652	sync_fence = sync_fence_fdget(fd);
659	if (!sync_fence)	653	if (!sync_fence)
660	return -EINVAL;	654	return -EINVAL;
661		655
662	ret = __semaphore_wait_fd_fast_path(c, sync_fence, wait_cmd, &fp_sema);
663	if (ret == 0) {
664	if (fp_sema) {
665	err = gk20a_fence_from_semaphore(c->g, fence,
666	sema->timeline,
667	fp_sema,
668	&c->semaphore_wq,
669	false);
670	if (err) {
671	nvgpu_semaphore_put(fp_sema);
672	goto clean_up_priv_cmd;
673	}
674	} else
675	/*
676	* Init an empty fence. It will instantly return
677	* from gk20a_fence_wait().
678	*/
679	gk20a_init_fence(fence, NULL, NULL);
680
681	sync_fence_put(sync_fence);
682	goto skip_slow_path;
683	}
684
685	/* If the fence has signaled there is no reason to wait on it. */	656	/* If the fence has signaled there is no reason to wait on it. */
686	status = atomic_read(&sync_fence->status);	657	status = atomic_read(&sync_fence->status);
687	if (status == 0) {	658	if (status == 0) {
688	sync_fence_put(sync_fence);	659	sync_fence_put(sync_fence);
689	goto skip_slow_path;	660	return 0;
690	}	661	}
691		662
692	err = gk20a_channel_alloc_priv_cmdbuf(c, 8, wait_cmd);	663	err = gk20a_channel_alloc_priv_cmdbuf(c, wait_cmd_size, wait_cmd);
693	if (err) {	664	if (err) {
694	nvgpu_err(c->g,	665	nvgpu_err(c->g,
695	"not enough priv cmd buffer space");	666	"not enough priv cmd buffer space");
@@ -718,34 +689,34 @@ static int gk20a_channel_semaphore_wait_fd(
718	nvgpu_semaphore_incr(w->sema, c->hw_sema);	689	nvgpu_semaphore_incr(w->sema, c->hw_sema);
719		690
720	/* GPU unblocked when the semaphore value increments. */	691	/* GPU unblocked when the semaphore value increments. */
721	add_sema_cmd(c->g, c, w->sema, wait_cmd, 8, true, false);	692	add_sema_cmd(c->g, c, w->sema, wait_cmd, 0, true, false);
722		693
723	/*	694	/*
724	* We need to create the fence before adding the waiter to ensure	695	* We need to create the fence before adding the waiter to ensure
725	* that we properly clean up in the event the sync_fence has	696	* that we properly clean up in the event the sync_fence has
726	* already signaled	697	* already signaled
727	*/	698	*/
728	err = gk20a_fence_from_semaphore(c->g, fence, sema->timeline, w->sema,	699	err = gk20a_fence_from_semaphore(c->g, fence_out, timeline,
729	&c->semaphore_wq, false);	700	w->sema, &c->semaphore_wq, false);
730	if (err)	701	if (err)
731	goto clean_up_sema;	702	goto clean_up_sema;
732		703
733	ret = sync_fence_wait_async(sync_fence, &w->waiter);	704	err = sync_fence_wait_async(sync_fence, &w->waiter);
734	gk20a_add_pending_sema_wait(c->g, w);	705	gk20a_add_pending_sema_wait(c->g, w);
735		706
736	/*	707	/*
737	* If the sync_fence has already signaled then the above async_wait	708	* If the sync_fence has already signaled then the above wait_async
738	* will never trigger. This causes the semaphore release op to never	709	* will not get scheduled; the fence completed just after doing the
739	* happen which, in turn, hangs the GPU. That's bad. So let's just	710	* status check above before allocs and waiter init, and won the race.
740	* do the nvgpu_semaphore_release() right now.	711	* This causes the waiter to be skipped, so let's release the semaphore
		712	* here and put the refs taken for the worker.
741	*/	713	*/
742	if (ret == 1) {	714	if (err == 1) {
743	sync_fence_put(sync_fence);	715	sync_fence_put(sync_fence);
744	nvgpu_semaphore_release(w->sema, c->hw_sema);	716	nvgpu_semaphore_release(w->sema, c->hw_sema);
745	nvgpu_semaphore_put(w->sema);	717	nvgpu_semaphore_put(w->sema);
746	}	718	}
747		719
748	skip_slow_path:
749	return 0;	720	return 0;
750		721
751	clean_up_sema:	722	clean_up_sema:
@@ -758,10 +729,28 @@ clean_up_sema:
758	clean_up_worker:	729	clean_up_worker:
759	nvgpu_kfree(c->g, w);	730	nvgpu_kfree(c->g, w);
760	clean_up_priv_cmd:	731	clean_up_priv_cmd:
761	gk20a_free_priv_cmdbuf(c, entry);	732	gk20a_free_priv_cmdbuf(c, wait_cmd);
762	clean_up_sync_fence:	733	clean_up_sync_fence:
763	sync_fence_put(sync_fence);	734	sync_fence_put(sync_fence);
764	return err;	735	return err;
		736	}
		737	#endif
		738
		739	static int gk20a_channel_semaphore_wait_fd(
		740	struct gk20a_channel_sync *s, int fd,
		741	struct priv_cmd_entry *entry,
		742	struct gk20a_fence *fence)
		743	{
		744	struct gk20a_channel_semaphore *sema =
		745	container_of(s, struct gk20a_channel_semaphore, ops);
		746	struct channel_gk20a *c = sema->c;
		747	#ifdef CONFIG_SYNC
		748	int err;
		749
		750	err = semaphore_wait_fd_native(c, fd, entry);
		751	if (err)
		752	err = semaphore_wait_fd_proxy(c, fd, entry, fence, sema->timeline);
		753	return err;
765	#else	754	#else
766	nvgpu_err(c->g,	755	nvgpu_err(c->g,
767	"trying to use sync fds with CONFIG_SYNC disabled");	756	"trying to use sync fds with CONFIG_SYNC disabled");
@@ -798,7 +787,7 @@ static int __gk20a_channel_semaphore_incr(
798	}	787	}
799		788
800	/* Release the completion semaphore. */	789	/* Release the completion semaphore. */
801	add_sema_cmd(c->g, c, semaphore, incr_cmd, 14, false, wfi_cmd);	790	add_sema_cmd(c->g, c, semaphore, incr_cmd, 0, false, wfi_cmd);
802		791
803	err = gk20a_fence_from_semaphore(c->g, fence,	792	err = gk20a_fence_from_semaphore(c->g, fence,
804	sp->timeline, semaphore,	793	sp->timeline, semaphore,


diff --git a/drivers/gpu/nvgpu/gk20a/sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/sync_gk20a.c index f6d16b90..a8600bce 100644 --- a/drivers/gpu/nvgpu/gk20a/sync_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/sync_gk20a.c
@@ -1,7 +1,7 @@
1	/*	1	/*
2	* GK20A Sync Framework Integration	2	* GK20A Sync Framework Integration
3	*	3	*
4	* Copyright (c) 2014-2017, NVIDIA CORPORATION. All rights reserved.	4	* Copyright (c) 2014-2018, NVIDIA CORPORATION. All rights reserved.
5	*	5	*
6	* Permission is hereby granted, free of charge, to any person obtaining a	6	* Permission is hereby granted, free of charge, to any person obtaining a
7	* copy of this software and associated documentation files (the "Software"),	7	* copy of this software and associated documentation files (the "Software"),
@@ -70,55 +70,6 @@ struct gk20a_sync_pt_inst {
70	};	70	};
71		71
72	/**	72	/**
73	* Check if the passed sync_fence is backed by a single GPU semaphore. In such
74	* cases we can short circuit a lot of SW involved in signaling pre-fences and
75	* post fences.
76	*
77	* For now reject multi-sync_pt fences. This could be changed in future. It
78	* would require that the sema fast path push a sema acquire for each semaphore
79	* in the fence.
80	*/
81	int gk20a_is_sema_backed_sync_fence(struct sync_fence *fence)
82	{
83	struct sync_timeline *t;
84
85	struct fence *pt = fence->cbs[0].sync_pt;
86	struct sync_pt *spt = sync_pt_from_fence(pt);
87
88	if (fence->num_fences != 1)
89	return 0;
90
91	if (spt == NULL)
92	return 0;
93
94	t = sync_pt_parent(spt);
95
96	if (t->ops == &gk20a_sync_timeline_ops)
97	return 1;
98	return 0;
99	}
100
101	struct nvgpu_semaphore *gk20a_sync_fence_get_sema(struct sync_fence *f)
102	{
103	struct sync_pt *spt;
104	struct gk20a_sync_pt_inst *pti;
105
106	struct fence *pt;
107
108	if (!f)
109	return NULL;
110
111	if (!gk20a_is_sema_backed_sync_fence(f))
112	return NULL;
113
114	pt = f->cbs[0].sync_pt;
115	spt = sync_pt_from_fence(pt);
116	pti = container_of(spt, struct gk20a_sync_pt_inst, pt);
117
118	return pti->shared->sema;
119	}
120
121	/**
122	* Compares sync pt values a and b, both of which will trigger either before	73	* Compares sync pt values a and b, both of which will trigger either before
123	* or after ref (i.e. a and b trigger before ref, or a and b trigger after	74	* or after ref (i.e. a and b trigger before ref, or a and b trigger after
124	* ref). Supplying ref allows us to handle wrapping correctly.	75	* ref). Supplying ref allows us to handle wrapping correctly.
@@ -371,7 +322,44 @@ static const struct sync_timeline_ops gk20a_sync_timeline_ops = {
371		322
372	struct sync_fence *gk20a_sync_fence_fdget(int fd)	323	struct sync_fence *gk20a_sync_fence_fdget(int fd)
373	{	324	{
374	return sync_fence_fdget(fd);	325	struct sync_fence *fence = sync_fence_fdget(fd);
		326	int i;
		327
		328	if (!fence)
		329	return NULL;
		330
		331	for (i = 0; i < fence->num_fences; i++) {
		332	struct fence *pt = fence->cbs[i].sync_pt;
		333	struct sync_pt *spt = sync_pt_from_fence(pt);
		334	struct sync_timeline *t;
		335
		336	if (spt == NULL) {
		337	sync_fence_put(fence);
		338	return NULL;
		339	}
		340
		341	t = sync_pt_parent(spt);
		342	if (t->ops != &gk20a_sync_timeline_ops) {
		343	sync_fence_put(fence);
		344	return NULL;
		345	}
		346	}
		347
		348	return fence;
		349	}
		350
		351	struct nvgpu_semaphore *gk20a_sync_pt_sema(struct sync_pt *spt)
		352	{
		353	struct gk20a_sync_pt *pt = to_gk20a_sync_pt(spt);
		354	struct nvgpu_semaphore *sema;
		355
		356	nvgpu_spinlock_acquire(&pt->lock);
		357	sema = pt->sema;
		358	if (sema)
		359	nvgpu_semaphore_get(sema);
		360	nvgpu_spinlock_release(&pt->lock);
		361
		362	return sema;
375	}	363	}
376		364
377	void gk20a_sync_timeline_signal(struct sync_timeline *timeline)	365	void gk20a_sync_timeline_signal(struct sync_timeline *timeline)


diff --git a/drivers/gpu/nvgpu/gk20a/sync_gk20a.h b/drivers/gpu/nvgpu/gk20a/sync_gk20a.h index 7d7aff6d..8a6439ab 100644 --- a/drivers/gpu/nvgpu/gk20a/sync_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/sync_gk20a.h
@@ -3,7 +3,7 @@
3	*	3	*
4	* GK20A Sync Framework Integration	4	* GK20A Sync Framework Integration
5	*	5	*
6	* Copyright (c) 2014-2017, NVIDIA CORPORATION. All rights reserved.	6	* Copyright (c) 2014-2018, NVIDIA CORPORATION. All rights reserved.
7	*	7	*
8	* Permission is hereby granted, free of charge, to any person obtaining a	8	* Permission is hereby granted, free of charge, to any person obtaining a
9	* copy of this software and associated documentation files (the "Software"),	9	* copy of this software and associated documentation files (the "Software"),
@@ -33,9 +33,6 @@ struct sync_pt;
33	struct nvgpu_semaphore;	33	struct nvgpu_semaphore;
34	struct fence;	34	struct fence;
35		35
36	int gk20a_is_sema_backed_sync_fence(struct sync_fence *fence);
37	struct nvgpu_semaphore *gk20a_sync_fence_get_sema(struct sync_fence *f);
38
39	#ifdef CONFIG_SYNC	36	#ifdef CONFIG_SYNC
40	struct sync_timeline *gk20a_sync_timeline_create(const char *fmt, ...);	37	struct sync_timeline *gk20a_sync_timeline_create(const char *fmt, ...);
41	void gk20a_sync_timeline_destroy(struct sync_timeline *);	38	void gk20a_sync_timeline_destroy(struct sync_timeline *);
@@ -46,6 +43,7 @@ struct sync_fence *gk20a_sync_fence_create(
46	struct nvgpu_semaphore *,	43	struct nvgpu_semaphore *,
47	const char *fmt, ...);	44	const char *fmt, ...);
48	struct sync_fence *gk20a_sync_fence_fdget(int fd);	45	struct sync_fence *gk20a_sync_fence_fdget(int fd);
		46	struct nvgpu_semaphore *gk20a_sync_pt_sema(struct sync_pt *spt);
49	#else	47	#else
50	static inline void gk20a_sync_timeline_destroy(struct sync_timeline *obj) {}	48	static inline void gk20a_sync_timeline_destroy(struct sync_timeline *obj) {}
51	static inline void gk20a_sync_timeline_signal(struct sync_timeline *obj) {}	49	static inline void gk20a_sync_timeline_signal(struct sync_timeline *obj) {}