author	Konsta Holtta <kholtta@nvidia.com>	2018-03-13 10:58:01 -0400
committer	mobile promotions <svcmobile_promotions@nvidia.com>	2018-03-16 10:34:01 -0400
commit	34323b559590ed8f1c64ecbb7ffbd838a6478594 (patch)
tree	c6258e44413a2f15ded4cf63e4a16f5118921703
parent	fb40f2a80739985abac273bc493e07341aa003af (diff)
gpu: nvgpu: wait for all prefence semas on gpu
The pre-fence wait for semaphores in the submit path has supported a fast path
for fences that have only one underlying semaphore. The fast path just inserts
the wait on this sema directly into the pushbuffer. For other fences, the path
has been using a CPU wait indirection, signaling another semaphore when we get
the CPU-side callback.

Instead of only supporting prefences with one sema, unroll all the individual
semaphores and insert a wait for each into the pushbuffer, like we've already
been doing with syncpoints. Now all sema-backed syncs get the fast path. This
simplifies the logic and makes it more explicit that only foreign fences need
the CPU wait.

There is no need to hold references to the sync fence or the semas inside: this
submitted job only needs the global read-only sema mapping that is guaranteed
to stay alive while the VM of this channel stays alive, and the job does not
outlive this channel.

Jira NVGPU-43
Jira NVGPU-66
Jira NVGPU-513

Change-Id: I7cfbb510001d998a864aed8d6afd1582b9adb80d
Signed-off-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1636345
Reviewed-by: Alex Waterman <alexw@nvidia.com>
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
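For illustration, a minimal standalone C sketch of the idea described above: walk every
sync point in a pre-fence and emit one fixed-size GPU semaphore-acquire command per point
into the pushbuffer, filling the slot with no-ops when the point has already expired. The
types and helpers here (fake_fence, fake_sema, emit_sema_acquire) and the opcode value are
illustrative stand-ins, not nvgpu APIs; the real logic is semaphore_wait_fd_native() in the
diff below.

#include <stdio.h>
#include <stddef.h>

#define WAIT_CMD_WORDS 8   /* one sema acquire occupies 8 words, as in the patch */

struct fake_sema { unsigned long gpu_va; unsigned int payload; };

struct fake_fence {
	size_t num_points;
	struct fake_sema *points[4];  /* NULL entry == point already expired */
};

/* Emit one acquire command, or NOP-fill the slot if the sema already signaled. */
static void emit_sema_acquire(unsigned int *pushbuf, size_t slot,
			      const struct fake_sema *sema)
{
	size_t base = slot * WAIT_CMD_WORDS;
	size_t i;

	if (!sema) {
		for (i = 0; i < WAIT_CMD_WORDS; i++)
			pushbuf[base + i] = 0;  /* expired: NOP-filled slot */
		return;
	}
	pushbuf[base + 0] = 0x2001;                 /* made-up "acquire" opcode */
	pushbuf[base + 1] = (unsigned int)sema->gpu_va;
	pushbuf[base + 2] = sema->payload;          /* value the GPU waits for */
	for (i = 3; i < WAIT_CMD_WORDS; i++)
		pushbuf[base + i] = 0;
}

int main(void)
{
	struct fake_sema a = { 0x1000, 7 }, b = { 0x2000, 3 };
	struct fake_fence fence = { 3, { &a, NULL, &b } };
	unsigned int pushbuf[3 * WAIT_CMD_WORDS];
	size_t i;

	/* One fixed-size wait slot per sync point, like wait_cmd_size * num_wait_cmds. */
	for (i = 0; i < fence.num_points; i++)
		emit_sema_acquire(pushbuf, i, fence.points[i]);

	for (i = 0; i < fence.num_points; i++)
		printf("slot %zu: opcode 0x%x\n", i, pushbuf[i * WAIT_CMD_WORDS]);
	return 0;
}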
-rw-r--r--	drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c	175
-rw-r--r--	drivers/gpu/nvgpu/gk20a/sync_gk20a.c	90
-rw-r--r--	drivers/gpu/nvgpu/gk20a/sync_gk20a.h	6
3 files changed, 123 insertions(+), 148 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
index 4b1be8b9..c6b55bf8 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
@@ -502,10 +502,10 @@ static void gk20a_channel_semaphore_launcher(
 
 static void add_sema_cmd(struct gk20a *g, struct channel_gk20a *c,
 		struct nvgpu_semaphore *s, struct priv_cmd_entry *cmd,
-		int cmd_size, bool acquire, bool wfi)
+		u32 offset, bool acquire, bool wfi)
 {
 	int ch = c->chid;
-	u32 ob, off = cmd->off;
+	u32 ob, off = cmd->off + offset;
 	u64 va;
 
 	ob = off;
@@ -588,108 +588,79 @@ static int gk20a_channel_semaphore_wait_syncpt(
 }
 
 #ifdef CONFIG_SYNC
-/*
- * Attempt a fast path for waiting on a sync_fence. Basically if the passed
- * sync_fence is backed by a nvgpu_semaphore then there's no reason to go
- * through the rigmarole of setting up a separate semaphore which waits on an
- * interrupt from the GPU and then triggers a worker thread to execute a SW
- * based semaphore release. Instead just have the GPU wait on the same semaphore
- * that is going to be incremented by the GPU.
- *
- * This function returns 2 possible values: -ENODEV or 0 on success. In the case
- * of -ENODEV the fastpath cannot be taken due to the fence not being backed by
- * a GPU semaphore.
- */
-static int __semaphore_wait_fd_fast_path(struct channel_gk20a *c,
-		struct sync_fence *fence,
-		struct priv_cmd_entry *wait_cmd,
-		struct nvgpu_semaphore **fp_sema)
+static int semaphore_wait_fd_native(struct channel_gk20a *c, int fd,
+		struct priv_cmd_entry *wait_cmd)
 {
-	struct nvgpu_semaphore *sema;
+	struct sync_fence *sync_fence;
 	int err;
+	const int wait_cmd_size = 8;
+	int num_wait_cmds;
+	int i;
 
-	if (!gk20a_is_sema_backed_sync_fence(fence))
-		return -ENODEV;
-
-	sema = gk20a_sync_fence_get_sema(fence);
+	sync_fence = gk20a_sync_fence_fdget(fd);
+	if (!sync_fence)
+		return -EINVAL;
 
-	/*
-	 * If there's no underlying sema then that means the underlying sema has
-	 * already signaled.
-	 */
-	if (!sema) {
-		*fp_sema = NULL;
-		return 0;
+	num_wait_cmds = sync_fence->num_fences;
+	if (num_wait_cmds == 0) {
+		err = 0;
+		goto put_fence;
 	}
 
-	err = gk20a_channel_alloc_priv_cmdbuf(c, 8, wait_cmd);
-	if (err)
-		return err;
+	err = gk20a_channel_alloc_priv_cmdbuf(c,
+			wait_cmd_size * num_wait_cmds,
+			wait_cmd);
+	if (err) {
+		nvgpu_err(c->g, "not enough priv cmd buffer space");
+		goto put_fence;
+	}
 
-	nvgpu_semaphore_get(sema);
-	BUG_ON(!sema->incremented);
-	add_sema_cmd(c->g, c, sema, wait_cmd, 8, true, false);
+	for (i = 0; i < sync_fence->num_fences; i++) {
+		struct fence *f = sync_fence->cbs[i].sync_pt;
+		struct sync_pt *pt = sync_pt_from_fence(f);
+		struct nvgpu_semaphore *sema;
 
-	/*
-	 * Make sure that gk20a_channel_semaphore_wait_fd() can create another
-	 * fence with the underlying semaphore.
-	 */
-	*fp_sema = sema;
+		sema = gk20a_sync_pt_sema(pt);
+		if (!sema) {
+			/* expired */
+			nvgpu_memset(c->g, wait_cmd->mem,
+			(wait_cmd->off + i * wait_cmd_size) * sizeof(u32),
+				0, wait_cmd_size * sizeof(u32));
+		} else {
+			WARN_ON(!sema->incremented);
+			add_sema_cmd(c->g, c, sema, wait_cmd,
+					i * wait_cmd_size, true, false);
+			nvgpu_semaphore_put(sema);
+		}
+	}
 
-	return 0;
+put_fence:
+	sync_fence_put(sync_fence);
+	return err;
 }
-#endif
 
-static int gk20a_channel_semaphore_wait_fd(
-		struct gk20a_channel_sync *s, int fd,
-		struct priv_cmd_entry *entry,
-		struct gk20a_fence *fence)
+static int semaphore_wait_fd_proxy(struct channel_gk20a *c, int fd,
+		struct priv_cmd_entry *wait_cmd,
+		struct gk20a_fence *fence_out,
+		struct sync_timeline *timeline)
 {
-	struct gk20a_channel_semaphore *sema =
-		container_of(s, struct gk20a_channel_semaphore, ops);
-	struct channel_gk20a *c = sema->c;
-#ifdef CONFIG_SYNC
-	struct nvgpu_semaphore *fp_sema;
+	const int wait_cmd_size = 8;
 	struct sync_fence *sync_fence;
-	struct priv_cmd_entry *wait_cmd = entry;
 	struct wait_fence_work *w = NULL;
-	int err, ret, status;
+	int err, status;
 
-	sync_fence = gk20a_sync_fence_fdget(fd);
+	sync_fence = sync_fence_fdget(fd);
 	if (!sync_fence)
 		return -EINVAL;
 
-	ret = __semaphore_wait_fd_fast_path(c, sync_fence, wait_cmd, &fp_sema);
-	if (ret == 0) {
-		if (fp_sema) {
-			err = gk20a_fence_from_semaphore(c->g, fence,
-					sema->timeline,
-					fp_sema,
-					&c->semaphore_wq,
-					false);
-			if (err) {
-				nvgpu_semaphore_put(fp_sema);
-				goto clean_up_priv_cmd;
-			}
-		} else
-			/*
-			 * Init an empty fence. It will instantly return
-			 * from gk20a_fence_wait().
-			 */
-			gk20a_init_fence(fence, NULL, NULL);
-
-		sync_fence_put(sync_fence);
-		goto skip_slow_path;
-	}
-
 	/* If the fence has signaled there is no reason to wait on it. */
 	status = atomic_read(&sync_fence->status);
 	if (status == 0) {
 		sync_fence_put(sync_fence);
-		goto skip_slow_path;
+		return 0;
 	}
 
-	err = gk20a_channel_alloc_priv_cmdbuf(c, 8, wait_cmd);
+	err = gk20a_channel_alloc_priv_cmdbuf(c, wait_cmd_size, wait_cmd);
 	if (err) {
 		nvgpu_err(c->g,
 			"not enough priv cmd buffer space");
@@ -718,34 +689,34 @@ static int gk20a_channel_semaphore_wait_fd(
 	nvgpu_semaphore_incr(w->sema, c->hw_sema);
 
 	/* GPU unblocked when the semaphore value increments. */
-	add_sema_cmd(c->g, c, w->sema, wait_cmd, 8, true, false);
+	add_sema_cmd(c->g, c, w->sema, wait_cmd, 0, true, false);
 
 	/*
 	 * We need to create the fence before adding the waiter to ensure
 	 * that we properly clean up in the event the sync_fence has
 	 * already signaled
 	 */
-	err = gk20a_fence_from_semaphore(c->g, fence, sema->timeline, w->sema,
-			&c->semaphore_wq, false);
+	err = gk20a_fence_from_semaphore(c->g, fence_out, timeline,
+			w->sema, &c->semaphore_wq, false);
 	if (err)
 		goto clean_up_sema;
 
-	ret = sync_fence_wait_async(sync_fence, &w->waiter);
+	err = sync_fence_wait_async(sync_fence, &w->waiter);
 	gk20a_add_pending_sema_wait(c->g, w);
 
 	/*
-	 * If the sync_fence has already signaled then the above async_wait
-	 * will never trigger. This causes the semaphore release op to never
-	 * happen which, in turn, hangs the GPU. That's bad. So let's just
-	 * do the nvgpu_semaphore_release() right now.
+	 * If the sync_fence has already signaled then the above wait_async
+	 * will not get scheduled; the fence completed just after doing the
+	 * status check above before allocs and waiter init, and won the race.
+	 * This causes the waiter to be skipped, so let's release the semaphore
+	 * here and put the refs taken for the worker.
 	 */
-	if (ret == 1) {
+	if (err == 1) {
 		sync_fence_put(sync_fence);
 		nvgpu_semaphore_release(w->sema, c->hw_sema);
 		nvgpu_semaphore_put(w->sema);
 	}
 
-skip_slow_path:
 	return 0;
 
 clean_up_sema:
@@ -758,10 +729,28 @@ clean_up_sema:
 clean_up_worker:
 	nvgpu_kfree(c->g, w);
 clean_up_priv_cmd:
-	gk20a_free_priv_cmdbuf(c, entry);
+	gk20a_free_priv_cmdbuf(c, wait_cmd);
 clean_up_sync_fence:
 	sync_fence_put(sync_fence);
 	return err;
+}
+#endif
+
+static int gk20a_channel_semaphore_wait_fd(
+		struct gk20a_channel_sync *s, int fd,
+		struct priv_cmd_entry *entry,
+		struct gk20a_fence *fence)
+{
+	struct gk20a_channel_semaphore *sema =
+		container_of(s, struct gk20a_channel_semaphore, ops);
+	struct channel_gk20a *c = sema->c;
+#ifdef CONFIG_SYNC
+	int err;
+
+	err = semaphore_wait_fd_native(c, fd, entry);
+	if (err)
+		err = semaphore_wait_fd_proxy(c, fd, entry, fence, sema->timeline);
+	return err;
 #else
 	nvgpu_err(c->g,
 		"trying to use sync fds with CONFIG_SYNC disabled");
@@ -798,7 +787,7 @@ static int __gk20a_channel_semaphore_incr(
 	}
 
 	/* Release the completion semaphore. */
-	add_sema_cmd(c->g, c, semaphore, incr_cmd, 14, false, wfi_cmd);
+	add_sema_cmd(c->g, c, semaphore, incr_cmd, 0, false, wfi_cmd);
 
 	err = gk20a_fence_from_semaphore(c->g, fence,
 			sp->timeline, semaphore,
diff --git a/drivers/gpu/nvgpu/gk20a/sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/sync_gk20a.c
index f6d16b90..a8600bce 100644
--- a/drivers/gpu/nvgpu/gk20a/sync_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/sync_gk20a.c
@@ -1,7 +1,7 @@
 /*
  * GK20A Sync Framework Integration
  *
- * Copyright (c) 2014-2017, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2014-2018, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -70,55 +70,6 @@ struct gk20a_sync_pt_inst {
 };
 
 /**
- * Check if the passed sync_fence is backed by a single GPU semaphore. In such
- * cases we can short circuit a lot of SW involved in signaling pre-fences and
- * post fences.
- *
- * For now reject multi-sync_pt fences. This could be changed in future. It
- * would require that the sema fast path push a sema acquire for each semaphore
- * in the fence.
- */
-int gk20a_is_sema_backed_sync_fence(struct sync_fence *fence)
-{
-	struct sync_timeline *t;
-
-	struct fence *pt = fence->cbs[0].sync_pt;
-	struct sync_pt *spt = sync_pt_from_fence(pt);
-
-	if (fence->num_fences != 1)
-		return 0;
-
-	if (spt == NULL)
-		return 0;
-
-	t = sync_pt_parent(spt);
-
-	if (t->ops == &gk20a_sync_timeline_ops)
-		return 1;
-	return 0;
-}
-
-struct nvgpu_semaphore *gk20a_sync_fence_get_sema(struct sync_fence *f)
-{
-	struct sync_pt *spt;
-	struct gk20a_sync_pt_inst *pti;
-
-	struct fence *pt;
-
-	if (!f)
-		return NULL;
-
-	if (!gk20a_is_sema_backed_sync_fence(f))
-		return NULL;
-
-	pt = f->cbs[0].sync_pt;
-	spt = sync_pt_from_fence(pt);
-	pti = container_of(spt, struct gk20a_sync_pt_inst, pt);
-
-	return pti->shared->sema;
-}
-
-/**
  * Compares sync pt values a and b, both of which will trigger either before
  * or after ref (i.e. a and b trigger before ref, or a and b trigger after
  * ref). Supplying ref allows us to handle wrapping correctly.
@@ -371,7 +322,44 @@ static const struct sync_timeline_ops gk20a_sync_timeline_ops = {
 
 struct sync_fence *gk20a_sync_fence_fdget(int fd)
 {
-	return sync_fence_fdget(fd);
+	struct sync_fence *fence = sync_fence_fdget(fd);
+	int i;
+
+	if (!fence)
+		return NULL;
+
+	for (i = 0; i < fence->num_fences; i++) {
+		struct fence *pt = fence->cbs[i].sync_pt;
+		struct sync_pt *spt = sync_pt_from_fence(pt);
+		struct sync_timeline *t;
+
+		if (spt == NULL) {
+			sync_fence_put(fence);
+			return NULL;
+		}
+
+		t = sync_pt_parent(spt);
+		if (t->ops != &gk20a_sync_timeline_ops) {
+			sync_fence_put(fence);
+			return NULL;
+		}
+	}
+
+	return fence;
+}
+
+struct nvgpu_semaphore *gk20a_sync_pt_sema(struct sync_pt *spt)
+{
+	struct gk20a_sync_pt *pt = to_gk20a_sync_pt(spt);
+	struct nvgpu_semaphore *sema;
+
+	nvgpu_spinlock_acquire(&pt->lock);
+	sema = pt->sema;
+	if (sema)
+		nvgpu_semaphore_get(sema);
+	nvgpu_spinlock_release(&pt->lock);
+
+	return sema;
 }
 
 void gk20a_sync_timeline_signal(struct sync_timeline *timeline)
diff --git a/drivers/gpu/nvgpu/gk20a/sync_gk20a.h b/drivers/gpu/nvgpu/gk20a/sync_gk20a.h
index 7d7aff6d..8a6439ab 100644
--- a/drivers/gpu/nvgpu/gk20a/sync_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/sync_gk20a.h
@@ -3,7 +3,7 @@
  *
  * GK20A Sync Framework Integration
  *
- * Copyright (c) 2014-2017, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2014-2018, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -33,9 +33,6 @@ struct sync_pt;
 struct nvgpu_semaphore;
 struct fence;
 
-int gk20a_is_sema_backed_sync_fence(struct sync_fence *fence);
-struct nvgpu_semaphore *gk20a_sync_fence_get_sema(struct sync_fence *f);
-
 #ifdef CONFIG_SYNC
 struct sync_timeline *gk20a_sync_timeline_create(const char *fmt, ...);
 void gk20a_sync_timeline_destroy(struct sync_timeline *);
@@ -46,6 +43,7 @@ struct sync_fence *gk20a_sync_fence_create(
 	struct nvgpu_semaphore *,
 	const char *fmt, ...);
 struct sync_fence *gk20a_sync_fence_fdget(int fd);
+struct nvgpu_semaphore *gk20a_sync_pt_sema(struct sync_pt *spt);
 #else
 static inline void gk20a_sync_timeline_destroy(struct sync_timeline *obj) {}
 static inline void gk20a_sync_timeline_signal(struct sync_timeline *obj) {}