diff options
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c | 175 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/sync_gk20a.c | 90 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/sync_gk20a.h | 6 |
3 files changed, 123 insertions, 148 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c index 4b1be8b9..c6b55bf8 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c | |||
@@ -502,10 +502,10 @@ static void gk20a_channel_semaphore_launcher( | |||
502 | 502 | ||
503 | static void add_sema_cmd(struct gk20a *g, struct channel_gk20a *c, | 503 | static void add_sema_cmd(struct gk20a *g, struct channel_gk20a *c, |
504 | struct nvgpu_semaphore *s, struct priv_cmd_entry *cmd, | 504 | struct nvgpu_semaphore *s, struct priv_cmd_entry *cmd, |
505 | int cmd_size, bool acquire, bool wfi) | 505 | u32 offset, bool acquire, bool wfi) |
506 | { | 506 | { |
507 | int ch = c->chid; | 507 | int ch = c->chid; |
508 | u32 ob, off = cmd->off; | 508 | u32 ob, off = cmd->off + offset; |
509 | u64 va; | 509 | u64 va; |
510 | 510 | ||
511 | ob = off; | 511 | ob = off; |
@@ -588,108 +588,79 @@ static int gk20a_channel_semaphore_wait_syncpt( | |||
588 | } | 588 | } |
589 | 589 | ||
590 | #ifdef CONFIG_SYNC | 590 | #ifdef CONFIG_SYNC |
591 | /* | 591 | static int semaphore_wait_fd_native(struct channel_gk20a *c, int fd, |
592 | * Attempt a fast path for waiting on a sync_fence. Basically if the passed | 592 | struct priv_cmd_entry *wait_cmd) |
593 | * sync_fence is backed by a nvgpu_semaphore then there's no reason to go | ||
594 | * through the rigmarole of setting up a separate semaphore which waits on an | ||
595 | * interrupt from the GPU and then triggers a worker thread to execute a SW | ||
596 | * based semaphore release. Instead just have the GPU wait on the same semaphore | ||
597 | * that is going to be incremented by the GPU. | ||
598 | * | ||
599 | * This function returns 2 possible values: -ENODEV or 0 on success. In the case | ||
600 | * of -ENODEV the fastpath cannot be taken due to the fence not being backed by | ||
601 | * a GPU semaphore. | ||
602 | */ | ||
603 | static int __semaphore_wait_fd_fast_path(struct channel_gk20a *c, | ||
604 | struct sync_fence *fence, | ||
605 | struct priv_cmd_entry *wait_cmd, | ||
606 | struct nvgpu_semaphore **fp_sema) | ||
607 | { | 593 | { |
608 | struct nvgpu_semaphore *sema; | 594 | struct sync_fence *sync_fence; |
609 | int err; | 595 | int err; |
596 | const int wait_cmd_size = 8; | ||
597 | int num_wait_cmds; | ||
598 | int i; | ||
610 | 599 | ||
611 | if (!gk20a_is_sema_backed_sync_fence(fence)) | 600 | sync_fence = gk20a_sync_fence_fdget(fd); |
612 | return -ENODEV; | 601 | if (!sync_fence) |
613 | 602 | return -EINVAL; | |
614 | sema = gk20a_sync_fence_get_sema(fence); | ||
615 | 603 | ||
616 | /* | 604 | num_wait_cmds = sync_fence->num_fences; |
617 | * If there's no underlying sema then that means the underlying sema has | 605 | if (num_wait_cmds == 0) { |
618 | * already signaled. | 606 | err = 0; |
619 | */ | 607 | goto put_fence; |
620 | if (!sema) { | ||
621 | *fp_sema = NULL; | ||
622 | return 0; | ||
623 | } | 608 | } |
624 | 609 | ||
625 | err = gk20a_channel_alloc_priv_cmdbuf(c, 8, wait_cmd); | 610 | err = gk20a_channel_alloc_priv_cmdbuf(c, |
626 | if (err) | 611 | wait_cmd_size * num_wait_cmds, |
627 | return err; | 612 | wait_cmd); |
613 | if (err) { | ||
614 | nvgpu_err(c->g, "not enough priv cmd buffer space"); | ||
615 | goto put_fence; | ||
616 | } | ||
628 | 617 | ||
629 | nvgpu_semaphore_get(sema); | 618 | for (i = 0; i < sync_fence->num_fences; i++) { |
630 | BUG_ON(!sema->incremented); | 619 | struct fence *f = sync_fence->cbs[i].sync_pt; |
631 | add_sema_cmd(c->g, c, sema, wait_cmd, 8, true, false); | 620 | struct sync_pt *pt = sync_pt_from_fence(f); |
621 | struct nvgpu_semaphore *sema; | ||
632 | 622 | ||
633 | /* | 623 | sema = gk20a_sync_pt_sema(pt); |
634 | * Make sure that gk20a_channel_semaphore_wait_fd() can create another | 624 | if (!sema) { |
635 | * fence with the underlying semaphore. | 625 | /* expired */ |
636 | */ | 626 | nvgpu_memset(c->g, wait_cmd->mem, |
637 | *fp_sema = sema; | 627 | (wait_cmd->off + i * wait_cmd_size) * sizeof(u32), |
628 | 0, wait_cmd_size * sizeof(u32)); | ||
629 | } else { | ||
630 | WARN_ON(!sema->incremented); | ||
631 | add_sema_cmd(c->g, c, sema, wait_cmd, | ||
632 | i * wait_cmd_size, true, false); | ||
633 | nvgpu_semaphore_put(sema); | ||
634 | } | ||
635 | } | ||
638 | 636 | ||
639 | return 0; | 637 | put_fence: |
638 | sync_fence_put(sync_fence); | ||
639 | return err; | ||
640 | } | 640 | } |
641 | #endif | ||
642 | 641 | ||
643 | static int gk20a_channel_semaphore_wait_fd( | 642 | static int semaphore_wait_fd_proxy(struct channel_gk20a *c, int fd, |
644 | struct gk20a_channel_sync *s, int fd, | 643 | struct priv_cmd_entry *wait_cmd, |
645 | struct priv_cmd_entry *entry, | 644 | struct gk20a_fence *fence_out, |
646 | struct gk20a_fence *fence) | 645 | struct sync_timeline *timeline) |
647 | { | 646 | { |
648 | struct gk20a_channel_semaphore *sema = | 647 | const int wait_cmd_size = 8; |
649 | container_of(s, struct gk20a_channel_semaphore, ops); | ||
650 | struct channel_gk20a *c = sema->c; | ||
651 | #ifdef CONFIG_SYNC | ||
652 | struct nvgpu_semaphore *fp_sema; | ||
653 | struct sync_fence *sync_fence; | 648 | struct sync_fence *sync_fence; |
654 | struct priv_cmd_entry *wait_cmd = entry; | ||
655 | struct wait_fence_work *w = NULL; | 649 | struct wait_fence_work *w = NULL; |
656 | int err, ret, status; | 650 | int err, status; |
657 | 651 | ||
658 | sync_fence = gk20a_sync_fence_fdget(fd); | 652 | sync_fence = sync_fence_fdget(fd); |
659 | if (!sync_fence) | 653 | if (!sync_fence) |
660 | return -EINVAL; | 654 | return -EINVAL; |
661 | 655 | ||
662 | ret = __semaphore_wait_fd_fast_path(c, sync_fence, wait_cmd, &fp_sema); | ||
663 | if (ret == 0) { | ||
664 | if (fp_sema) { | ||
665 | err = gk20a_fence_from_semaphore(c->g, fence, | ||
666 | sema->timeline, | ||
667 | fp_sema, | ||
668 | &c->semaphore_wq, | ||
669 | false); | ||
670 | if (err) { | ||
671 | nvgpu_semaphore_put(fp_sema); | ||
672 | goto clean_up_priv_cmd; | ||
673 | } | ||
674 | } else | ||
675 | /* | ||
676 | * Init an empty fence. It will instantly return | ||
677 | * from gk20a_fence_wait(). | ||
678 | */ | ||
679 | gk20a_init_fence(fence, NULL, NULL); | ||
680 | |||
681 | sync_fence_put(sync_fence); | ||
682 | goto skip_slow_path; | ||
683 | } | ||
684 | |||
685 | /* If the fence has signaled there is no reason to wait on it. */ | 656 | /* If the fence has signaled there is no reason to wait on it. */ |
686 | status = atomic_read(&sync_fence->status); | 657 | status = atomic_read(&sync_fence->status); |
687 | if (status == 0) { | 658 | if (status == 0) { |
688 | sync_fence_put(sync_fence); | 659 | sync_fence_put(sync_fence); |
689 | goto skip_slow_path; | 660 | return 0; |
690 | } | 661 | } |
691 | 662 | ||
692 | err = gk20a_channel_alloc_priv_cmdbuf(c, 8, wait_cmd); | 663 | err = gk20a_channel_alloc_priv_cmdbuf(c, wait_cmd_size, wait_cmd); |
693 | if (err) { | 664 | if (err) { |
694 | nvgpu_err(c->g, | 665 | nvgpu_err(c->g, |
695 | "not enough priv cmd buffer space"); | 666 | "not enough priv cmd buffer space"); |
@@ -718,34 +689,34 @@ static int gk20a_channel_semaphore_wait_fd( | |||
718 | nvgpu_semaphore_incr(w->sema, c->hw_sema); | 689 | nvgpu_semaphore_incr(w->sema, c->hw_sema); |
719 | 690 | ||
720 | /* GPU unblocked when the semaphore value increments. */ | 691 | /* GPU unblocked when the semaphore value increments. */ |
721 | add_sema_cmd(c->g, c, w->sema, wait_cmd, 8, true, false); | 692 | add_sema_cmd(c->g, c, w->sema, wait_cmd, 0, true, false); |
722 | 693 | ||
723 | /* | 694 | /* |
724 | * We need to create the fence before adding the waiter to ensure | 695 | * We need to create the fence before adding the waiter to ensure |
725 | * that we properly clean up in the event the sync_fence has | 696 | * that we properly clean up in the event the sync_fence has |
726 | * already signaled | 697 | * already signaled |
727 | */ | 698 | */ |
728 | err = gk20a_fence_from_semaphore(c->g, fence, sema->timeline, w->sema, | 699 | err = gk20a_fence_from_semaphore(c->g, fence_out, timeline, |
729 | &c->semaphore_wq, false); | 700 | w->sema, &c->semaphore_wq, false); |
730 | if (err) | 701 | if (err) |
731 | goto clean_up_sema; | 702 | goto clean_up_sema; |
732 | 703 | ||
733 | ret = sync_fence_wait_async(sync_fence, &w->waiter); | 704 | err = sync_fence_wait_async(sync_fence, &w->waiter); |
734 | gk20a_add_pending_sema_wait(c->g, w); | 705 | gk20a_add_pending_sema_wait(c->g, w); |
735 | 706 | ||
736 | /* | 707 | /* |
737 | * If the sync_fence has already signaled then the above async_wait | 708 | * If the sync_fence has already signaled then the above wait_async |
738 | * will never trigger. This causes the semaphore release op to never | 709 | * will not get scheduled; the fence completed just after doing the |
739 | * happen which, in turn, hangs the GPU. That's bad. So let's just | 710 | * status check above before allocs and waiter init, and won the race. |
740 | * do the nvgpu_semaphore_release() right now. | 711 | * This causes the waiter to be skipped, so let's release the semaphore |
712 | * here and put the refs taken for the worker. | ||
741 | */ | 713 | */ |
742 | if (ret == 1) { | 714 | if (err == 1) { |
743 | sync_fence_put(sync_fence); | 715 | sync_fence_put(sync_fence); |
744 | nvgpu_semaphore_release(w->sema, c->hw_sema); | 716 | nvgpu_semaphore_release(w->sema, c->hw_sema); |
745 | nvgpu_semaphore_put(w->sema); | 717 | nvgpu_semaphore_put(w->sema); |
746 | } | 718 | } |
747 | 719 | ||
748 | skip_slow_path: | ||
749 | return 0; | 720 | return 0; |
750 | 721 | ||
751 | clean_up_sema: | 722 | clean_up_sema: |
@@ -758,10 +729,28 @@ clean_up_sema: | |||
758 | clean_up_worker: | 729 | clean_up_worker: |
759 | nvgpu_kfree(c->g, w); | 730 | nvgpu_kfree(c->g, w); |
760 | clean_up_priv_cmd: | 731 | clean_up_priv_cmd: |
761 | gk20a_free_priv_cmdbuf(c, entry); | 732 | gk20a_free_priv_cmdbuf(c, wait_cmd); |
762 | clean_up_sync_fence: | 733 | clean_up_sync_fence: |
763 | sync_fence_put(sync_fence); | 734 | sync_fence_put(sync_fence); |
764 | return err; | 735 | return err; |
736 | } | ||
737 | #endif | ||
738 | |||
739 | static int gk20a_channel_semaphore_wait_fd( | ||
740 | struct gk20a_channel_sync *s, int fd, | ||
741 | struct priv_cmd_entry *entry, | ||
742 | struct gk20a_fence *fence) | ||
743 | { | ||
744 | struct gk20a_channel_semaphore *sema = | ||
745 | container_of(s, struct gk20a_channel_semaphore, ops); | ||
746 | struct channel_gk20a *c = sema->c; | ||
747 | #ifdef CONFIG_SYNC | ||
748 | int err; | ||
749 | |||
750 | err = semaphore_wait_fd_native(c, fd, entry); | ||
751 | if (err) | ||
752 | err = semaphore_wait_fd_proxy(c, fd, entry, fence, sema->timeline); | ||
753 | return err; | ||
765 | #else | 754 | #else |
766 | nvgpu_err(c->g, | 755 | nvgpu_err(c->g, |
767 | "trying to use sync fds with CONFIG_SYNC disabled"); | 756 | "trying to use sync fds with CONFIG_SYNC disabled"); |
@@ -798,7 +787,7 @@ static int __gk20a_channel_semaphore_incr( | |||
798 | } | 787 | } |
799 | 788 | ||
800 | /* Release the completion semaphore. */ | 789 | /* Release the completion semaphore. */ |
801 | add_sema_cmd(c->g, c, semaphore, incr_cmd, 14, false, wfi_cmd); | 790 | add_sema_cmd(c->g, c, semaphore, incr_cmd, 0, false, wfi_cmd); |
802 | 791 | ||
803 | err = gk20a_fence_from_semaphore(c->g, fence, | 792 | err = gk20a_fence_from_semaphore(c->g, fence, |
804 | sp->timeline, semaphore, | 793 | sp->timeline, semaphore, |
diff --git a/drivers/gpu/nvgpu/gk20a/sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/sync_gk20a.c index f6d16b90..a8600bce 100644 --- a/drivers/gpu/nvgpu/gk20a/sync_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/sync_gk20a.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * GK20A Sync Framework Integration | 2 | * GK20A Sync Framework Integration |
3 | * | 3 | * |
4 | * Copyright (c) 2014-2017, NVIDIA CORPORATION. All rights reserved. | 4 | * Copyright (c) 2014-2018, NVIDIA CORPORATION. All rights reserved. |
5 | * | 5 | * |
6 | * Permission is hereby granted, free of charge, to any person obtaining a | 6 | * Permission is hereby granted, free of charge, to any person obtaining a |
7 | * copy of this software and associated documentation files (the "Software"), | 7 | * copy of this software and associated documentation files (the "Software"), |
@@ -70,55 +70,6 @@ struct gk20a_sync_pt_inst { | |||
70 | }; | 70 | }; |
71 | 71 | ||
72 | /** | 72 | /** |
73 | * Check if the passed sync_fence is backed by a single GPU semaphore. In such | ||
74 | * cases we can short circuit a lot of SW involved in signaling pre-fences and | ||
75 | * post fences. | ||
76 | * | ||
77 | * For now reject multi-sync_pt fences. This could be changed in future. It | ||
78 | * would require that the sema fast path push a sema acquire for each semaphore | ||
79 | * in the fence. | ||
80 | */ | ||
81 | int gk20a_is_sema_backed_sync_fence(struct sync_fence *fence) | ||
82 | { | ||
83 | struct sync_timeline *t; | ||
84 | |||
85 | struct fence *pt = fence->cbs[0].sync_pt; | ||
86 | struct sync_pt *spt = sync_pt_from_fence(pt); | ||
87 | |||
88 | if (fence->num_fences != 1) | ||
89 | return 0; | ||
90 | |||
91 | if (spt == NULL) | ||
92 | return 0; | ||
93 | |||
94 | t = sync_pt_parent(spt); | ||
95 | |||
96 | if (t->ops == &gk20a_sync_timeline_ops) | ||
97 | return 1; | ||
98 | return 0; | ||
99 | } | ||
100 | |||
101 | struct nvgpu_semaphore *gk20a_sync_fence_get_sema(struct sync_fence *f) | ||
102 | { | ||
103 | struct sync_pt *spt; | ||
104 | struct gk20a_sync_pt_inst *pti; | ||
105 | |||
106 | struct fence *pt; | ||
107 | |||
108 | if (!f) | ||
109 | return NULL; | ||
110 | |||
111 | if (!gk20a_is_sema_backed_sync_fence(f)) | ||
112 | return NULL; | ||
113 | |||
114 | pt = f->cbs[0].sync_pt; | ||
115 | spt = sync_pt_from_fence(pt); | ||
116 | pti = container_of(spt, struct gk20a_sync_pt_inst, pt); | ||
117 | |||
118 | return pti->shared->sema; | ||
119 | } | ||
120 | |||
121 | /** | ||
122 | * Compares sync pt values a and b, both of which will trigger either before | 73 | * Compares sync pt values a and b, both of which will trigger either before |
123 | * or after ref (i.e. a and b trigger before ref, or a and b trigger after | 74 | * or after ref (i.e. a and b trigger before ref, or a and b trigger after |
124 | * ref). Supplying ref allows us to handle wrapping correctly. | 75 | * ref). Supplying ref allows us to handle wrapping correctly. |
@@ -371,7 +322,44 @@ static const struct sync_timeline_ops gk20a_sync_timeline_ops = { | |||
371 | 322 | ||
372 | struct sync_fence *gk20a_sync_fence_fdget(int fd) | 323 | struct sync_fence *gk20a_sync_fence_fdget(int fd) |
373 | { | 324 | { |
374 | return sync_fence_fdget(fd); | 325 | struct sync_fence *fence = sync_fence_fdget(fd); |
326 | int i; | ||
327 | |||
328 | if (!fence) | ||
329 | return NULL; | ||
330 | |||
331 | for (i = 0; i < fence->num_fences; i++) { | ||
332 | struct fence *pt = fence->cbs[i].sync_pt; | ||
333 | struct sync_pt *spt = sync_pt_from_fence(pt); | ||
334 | struct sync_timeline *t; | ||
335 | |||
336 | if (spt == NULL) { | ||
337 | sync_fence_put(fence); | ||
338 | return NULL; | ||
339 | } | ||
340 | |||
341 | t = sync_pt_parent(spt); | ||
342 | if (t->ops != &gk20a_sync_timeline_ops) { | ||
343 | sync_fence_put(fence); | ||
344 | return NULL; | ||
345 | } | ||
346 | } | ||
347 | |||
348 | return fence; | ||
349 | } | ||
350 | |||
351 | struct nvgpu_semaphore *gk20a_sync_pt_sema(struct sync_pt *spt) | ||
352 | { | ||
353 | struct gk20a_sync_pt *pt = to_gk20a_sync_pt(spt); | ||
354 | struct nvgpu_semaphore *sema; | ||
355 | |||
356 | nvgpu_spinlock_acquire(&pt->lock); | ||
357 | sema = pt->sema; | ||
358 | if (sema) | ||
359 | nvgpu_semaphore_get(sema); | ||
360 | nvgpu_spinlock_release(&pt->lock); | ||
361 | |||
362 | return sema; | ||
375 | } | 363 | } |
376 | 364 | ||
377 | void gk20a_sync_timeline_signal(struct sync_timeline *timeline) | 365 | void gk20a_sync_timeline_signal(struct sync_timeline *timeline) |
diff --git a/drivers/gpu/nvgpu/gk20a/sync_gk20a.h b/drivers/gpu/nvgpu/gk20a/sync_gk20a.h index 7d7aff6d..8a6439ab 100644 --- a/drivers/gpu/nvgpu/gk20a/sync_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/sync_gk20a.h | |||
@@ -3,7 +3,7 @@ | |||
3 | * | 3 | * |
4 | * GK20A Sync Framework Integration | 4 | * GK20A Sync Framework Integration |
5 | * | 5 | * |
6 | * Copyright (c) 2014-2017, NVIDIA CORPORATION. All rights reserved. | 6 | * Copyright (c) 2014-2018, NVIDIA CORPORATION. All rights reserved. |
7 | * | 7 | * |
8 | * Permission is hereby granted, free of charge, to any person obtaining a | 8 | * Permission is hereby granted, free of charge, to any person obtaining a |
9 | * copy of this software and associated documentation files (the "Software"), | 9 | * copy of this software and associated documentation files (the "Software"), |
@@ -33,9 +33,6 @@ struct sync_pt; | |||
33 | struct nvgpu_semaphore; | 33 | struct nvgpu_semaphore; |
34 | struct fence; | 34 | struct fence; |
35 | 35 | ||
36 | int gk20a_is_sema_backed_sync_fence(struct sync_fence *fence); | ||
37 | struct nvgpu_semaphore *gk20a_sync_fence_get_sema(struct sync_fence *f); | ||
38 | |||
39 | #ifdef CONFIG_SYNC | 36 | #ifdef CONFIG_SYNC |
40 | struct sync_timeline *gk20a_sync_timeline_create(const char *fmt, ...); | 37 | struct sync_timeline *gk20a_sync_timeline_create(const char *fmt, ...); |
41 | void gk20a_sync_timeline_destroy(struct sync_timeline *); | 38 | void gk20a_sync_timeline_destroy(struct sync_timeline *); |
@@ -46,6 +43,7 @@ struct sync_fence *gk20a_sync_fence_create( | |||
46 | struct nvgpu_semaphore *, | 43 | struct nvgpu_semaphore *, |
47 | const char *fmt, ...); | 44 | const char *fmt, ...); |
48 | struct sync_fence *gk20a_sync_fence_fdget(int fd); | 45 | struct sync_fence *gk20a_sync_fence_fdget(int fd); |
46 | struct nvgpu_semaphore *gk20a_sync_pt_sema(struct sync_pt *spt); | ||
49 | #else | 47 | #else |
50 | static inline void gk20a_sync_timeline_destroy(struct sync_timeline *obj) {} | 48 | static inline void gk20a_sync_timeline_destroy(struct sync_timeline *obj) {} |
51 | static inline void gk20a_sync_timeline_signal(struct sync_timeline *obj) {} | 49 | static inline void gk20a_sync_timeline_signal(struct sync_timeline *obj) {} |