author		Chris Wilson <chris@chris-wilson.co.uk>	2018-02-22 04:25:44 -0500
committer	Chris Wilson <chris@chris-wilson.co.uk>	2018-03-06 07:12:45 -0500
commit		cd46c545b7db2a9ac14f6db66944b017cbf21faf (patch)
tree		769bcffed1af6fc14e57cf0b0f6f30e6964c19e8
parent		7cc62d0b8e257fbac8e2972074351bc766b96853 (diff)
drm/i915/breadcrumbs: Reduce signaler rbtree to a sorted list
The goal here is to try to reduce the latency of signaling additional
requests following the wakeup from interrupt, by reducing the list of
to-be-signaled requests from an rbtree to a sorted linked list. The
original choice of an rbtree was to facilitate random insertions of
requests into the signaler while maintaining a sorted list. However,
if we assume that most new requests are added when they are submitted,
we see those new requests in execution order, making an insertion sort
fast and the reduction in overhead of each signaler iteration
significant.

Since commit 56299fb7d904 ("drm/i915: Signal first fence from irq
handler if complete"), we signal most fences directly from
notify_ring() in the interrupt handler, greatly reducing the amount of
work that actually needs to be done by the signaler kthread. All the
thread is then required to do is operate as the bottom-half, cleaning
up after the interrupt handler and preparing the next waiter. This
includes signaling all later completed fences in a saturated system,
but on a mostly idle system we only have to rebuild the wait rbtree in
time for the next interrupt. With this de-emphasis of the signaler's
role, we want to rejig its data structures to reduce the amount of
work we require to both set up the signal list and maintain it on
every interrupt.

References: 56299fb7d904 ("drm/i915: Signal first fence from irq handler if complete")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180222092545.17216-1-chris@chris-wilson.co.uk
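For illustration only (not part of the patch): a minimal, self-contained
sketch of the reverse-walk insertion sort that the new insert_signal()
performs on b->signals. The list helpers and seqno_passed() below are
hypothetical userspace stand-ins for the kernel's list_head API and
i915_seqno_passed(); only the insertion pattern mirrors the patch.

#include <stdint.h>
#include <stdio.h>

struct node {
	struct node *prev, *next;
	uint32_t seqno;
};

/* Wrap-safe "a is at or after b", in the spirit of i915_seqno_passed(). */
static int seqno_passed(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) >= 0;
}

static void list_init(struct node *head)
{
	head->prev = head->next = head;
}

/* Link @entry immediately after @pos (like the kernel's list_add()). */
static void list_add_after(struct node *entry, struct node *pos)
{
	entry->prev = pos;
	entry->next = pos->next;
	pos->next->prev = entry;
	pos->next = entry;
}

/*
 * Walk backwards from the newest signal; as most signals are enabled in
 * submission order, the loop usually terminates after one comparison,
 * which is what keeps the O(N) insertion sort cheap in practice.
 */
static void insert_signal(struct node *head, struct node *rq)
{
	struct node *iter;

	for (iter = head->prev; iter != head; iter = iter->prev)
		if (seqno_passed(rq->seqno, iter->seqno))
			break;

	list_add_after(rq, iter);
}

int main(void)
{
	struct node head, a = { .seqno = 1 }, b = { .seqno = 3 }, c = { .seqno = 2 };
	struct node *it;

	list_init(&head);
	insert_signal(&head, &a);
	insert_signal(&head, &b);
	insert_signal(&head, &c);	/* out-of-order enable slots between 1 and 3 */

	for (it = head.next; it != &head; it = it->next)
		printf("%u\n", (unsigned int)it->seqno);	/* prints 1 2 3 */

	return 0;
}

When the walk finds no earlier seqno, the new entry is linked right after
the list head and becomes the oldest signal, which is the iter == head
case above; the patch's insert_signal() behaves the same way.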
-rw-r--r--	drivers/gpu/drm/i915/i915_request.h	2
-rw-r--r--	drivers/gpu/drm/i915/intel_breadcrumbs.c	261
-rw-r--r--	drivers/gpu/drm/i915/intel_ringbuffer.h	4
3 files changed, 116 insertions, 151 deletions
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index 74311fc53e2f..7d6eb82eeb91 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -44,8 +44,8 @@ struct intel_wait {
 };
 
 struct intel_signal_node {
-	struct rb_node node;
 	struct intel_wait wait;
+	struct list_head link;
 };
 
 struct i915_dependency {
diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
index 094f010908b8..03bbc1dfbc51 100644
--- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
@@ -340,7 +340,8 @@ static inline void __intel_breadcrumbs_finish(struct intel_breadcrumbs *b,
 	lockdep_assert_held(&b->rb_lock);
 	GEM_BUG_ON(b->irq_wait == wait);
 
-	/* This request is completed, so remove it from the tree, mark it as
+	/*
+	 * This request is completed, so remove it from the tree, mark it as
 	 * complete, and *then* wake up the associated task. N.B. when the
 	 * task wakes up, it will find the empty rb_node, discern that it
 	 * has already been removed from the tree and skip the serialisation
@@ -351,7 +352,8 @@ static inline void __intel_breadcrumbs_finish(struct intel_breadcrumbs *b,
 	rb_erase(&wait->node, &b->waiters);
 	RB_CLEAR_NODE(&wait->node);
 
-	wake_up_process(wait->tsk); /* implicit smp_wmb() */
+	if (wait->tsk->state != TASK_RUNNING)
+		wake_up_process(wait->tsk); /* implicit smp_wmb() */
 }
 
 static inline void __intel_breadcrumbs_next(struct intel_engine_cs *engine,
@@ -592,23 +594,6 @@ void intel_engine_remove_wait(struct intel_engine_cs *engine,
 	spin_unlock_irq(&b->rb_lock);
 }
 
-static bool signal_complete(const struct i915_request *request)
-{
-	if (!request)
-		return false;
-
-	/*
-	 * Carefully check if the request is complete, giving time for the
-	 * seqno to be visible or if the GPU hung.
-	 */
-	return __i915_request_irq_complete(request);
-}
-
-static struct i915_request *to_signaler(struct rb_node *rb)
-{
-	return rb_entry(rb, struct i915_request, signaling.node);
-}
-
 static void signaler_set_rtpriority(void)
 {
 	struct sched_param param = { .sched_priority = 1 };
@@ -616,78 +601,26 @@ static void signaler_set_rtpriority(void)
 	sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
 }
 
-static void __intel_engine_remove_signal(struct intel_engine_cs *engine,
-					 struct i915_request *request)
-{
-	struct intel_breadcrumbs *b = &engine->breadcrumbs;
-
-	lockdep_assert_held(&b->rb_lock);
-
-	/*
-	 * Wake up all other completed waiters and select the
-	 * next bottom-half for the next user interrupt.
-	 */
-	__intel_engine_remove_wait(engine, &request->signaling.wait);
-
-	/*
-	 * Find the next oldest signal. Note that as we have
-	 * not been holding the lock, another client may
-	 * have installed an even older signal than the one
-	 * we just completed - so double check we are still
-	 * the oldest before picking the next one.
-	 */
-	if (request->signaling.wait.seqno) {
-		if (request == rcu_access_pointer(b->first_signal)) {
-			struct rb_node *rb = rb_next(&request->signaling.node);
-			rcu_assign_pointer(b->first_signal,
-					   rb ? to_signaler(rb) : NULL);
-		}
-
-		rb_erase(&request->signaling.node, &b->signals);
-		request->signaling.wait.seqno = 0;
-	}
-}
-
-static struct i915_request *
-get_first_signal_rcu(struct intel_breadcrumbs *b)
-{
-	/*
-	 * See the big warnings for i915_gem_active_get_rcu() and similarly
-	 * for dma_fence_get_rcu_safe() that explain the intricacies involved
-	 * here with defeating CPU/compiler speculation and enforcing
-	 * the required memory barriers.
-	 */
-	do {
-		struct i915_request *request;
-
-		request = rcu_dereference(b->first_signal);
-		if (request)
-			request = i915_request_get_rcu(request);
-
-		barrier();
-
-		if (!request || request == rcu_access_pointer(b->first_signal))
-			return rcu_pointer_handoff(request);
-
-		i915_request_put(request);
-	} while (1);
-}
-
 static int intel_breadcrumbs_signaler(void *arg)
 {
 	struct intel_engine_cs *engine = arg;
 	struct intel_breadcrumbs *b = &engine->breadcrumbs;
-	struct i915_request *request;
+	struct i915_request *rq, *n;
 
 	/* Install ourselves with high priority to reduce signalling latency */
 	signaler_set_rtpriority();
 
 	do {
 		bool do_schedule = true;
+		LIST_HEAD(list);
+		u32 seqno;
 
 		set_current_state(TASK_INTERRUPTIBLE);
+		if (list_empty(&b->signals))
+			goto sleep;
 
-		/* We are either woken up by the interrupt bottom-half,
+		/*
+		 * We are either woken up by the interrupt bottom-half,
 		 * or by a client adding a new signaller. In both cases,
 		 * the GPU seqno may have advanced beyond our oldest signal.
 		 * If it has, propagate the signal, remove the waiter and
@@ -695,25 +628,45 @@ static int intel_breadcrumbs_signaler(void *arg)
 		 * need to wait for a new interrupt from the GPU or for
 		 * a new client.
 		 */
-		rcu_read_lock();
-		request = get_first_signal_rcu(b);
-		rcu_read_unlock();
-		if (signal_complete(request)) {
-			if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
-				      &request->fence.flags)) {
-				local_bh_disable();
-				dma_fence_signal(&request->fence);
-				GEM_BUG_ON(!i915_request_completed(request));
-				local_bh_enable(); /* kick start the tasklets */
-			}
+		seqno = intel_engine_get_seqno(engine);
+
+		spin_lock_irq(&b->rb_lock);
+		list_for_each_entry_safe(rq, n, &b->signals, signaling.link) {
+			u32 this = rq->signaling.wait.seqno;
+
+			GEM_BUG_ON(!rq->signaling.wait.seqno);
 
-			if (READ_ONCE(request->signaling.wait.seqno)) {
-				spin_lock_irq(&b->rb_lock);
-				__intel_engine_remove_signal(engine, request);
-				spin_unlock_irq(&b->rb_lock);
+			if (!i915_seqno_passed(seqno, this))
+				break;
+
+			if (likely(this == i915_request_global_seqno(rq))) {
+				__intel_engine_remove_wait(engine,
+							   &rq->signaling.wait);
+
+				rq->signaling.wait.seqno = 0;
+				__list_del_entry(&rq->signaling.link);
+
+				if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
+					      &rq->fence.flags)) {
+					list_add_tail(&rq->signaling.link,
+						      &list);
+					i915_request_get(rq);
+				}
+			}
+		}
+		spin_unlock_irq(&b->rb_lock);
+
+		if (!list_empty(&list)) {
+			local_bh_disable();
+			list_for_each_entry_safe(rq, n, &list, signaling.link) {
+				dma_fence_signal(&rq->fence);
+				GEM_BUG_ON(!i915_request_completed(rq));
+				i915_request_put(rq);
 			}
+			local_bh_enable(); /* kick start the tasklets */
 
-			/* If the engine is saturated we may be continually
+			/*
+			 * If the engine is saturated we may be continually
 			 * processing completed requests. This angers the
 			 * NMI watchdog if we never let anything else
 			 * have access to the CPU. Let's pretend to be nice
@@ -722,9 +675,19 @@ static int intel_breadcrumbs_signaler(void *arg)
 			 */
 			do_schedule = need_resched();
 		}
-		i915_request_put(request);
 
 		if (unlikely(do_schedule)) {
+			/* Before we sleep, check for a missed seqno */
+			if (current->state & TASK_NORMAL &&
+			    !list_empty(&b->signals) &&
+			    engine->irq_seqno_barrier &&
+			    test_and_clear_bit(ENGINE_IRQ_BREADCRUMB,
+					       &engine->irq_posted)) {
+				engine->irq_seqno_barrier(engine);
+				intel_engine_wakeup(engine);
+			}
+
+sleep:
 			if (kthread_should_park())
 				kthread_parkme();
 
@@ -739,13 +702,40 @@ static int intel_breadcrumbs_signaler(void *arg)
 	return 0;
 }
 
+static void insert_signal(struct intel_breadcrumbs *b,
+			  struct i915_request *request,
+			  const u32 seqno)
+{
+	struct i915_request *iter;
+
+	lockdep_assert_held(&b->rb_lock);
+
+	/*
+	 * A reasonable assumption is that we are called to add signals
+	 * in sequence, as the requests are submitted for execution and
+	 * assigned a global_seqno. This will be the case for the majority
+	 * of internally generated signals (inter-engine signaling).
+	 *
+	 * Out of order waiters triggering random signaling enabling will
+	 * be more problematic, but hopefully rare enough and the list
+	 * small enough that the O(N) insertion sort is not an issue.
+	 */
+
+	list_for_each_entry_reverse(iter, &b->signals, signaling.link)
+		if (i915_seqno_passed(seqno, iter->signaling.wait.seqno))
+			break;
+
+	list_add(&request->signaling.link, &iter->signaling.link);
+}
+
 void intel_engine_enable_signaling(struct i915_request *request, bool wakeup)
 {
 	struct intel_engine_cs *engine = request->engine;
 	struct intel_breadcrumbs *b = &engine->breadcrumbs;
 	u32 seqno;
 
-	/* Note that we may be called from an interrupt handler on another
+	/*
+	 * Note that we may be called from an interrupt handler on another
 	 * device (e.g. nouveau signaling a fence completion causing us
 	 * to submit a request, and so enable signaling). As such,
 	 * we need to make sure that all other users of b->rb_lock protect
@@ -757,17 +747,16 @@ void intel_engine_enable_signaling(struct i915_request *request, bool wakeup)
 	lockdep_assert_held(&request->lock);
 
 	seqno = i915_request_global_seqno(request);
-	if (!seqno)
+	if (!seqno) /* will be enabled later upon execution */
 		return;
 
-	spin_lock(&b->rb_lock);
-
 	GEM_BUG_ON(request->signaling.wait.seqno);
 	request->signaling.wait.tsk = b->signaler;
 	request->signaling.wait.request = request;
 	request->signaling.wait.seqno = seqno;
 
-	/* First add ourselves into the list of waiters, but register our
+	/*
+	 * Add ourselves into the list of waiters, but registering our
 	 * bottom-half as the signaller thread. As per usual, only the oldest
 	 * waiter (not just signaller) is tasked as the bottom-half waking
 	 * up all completed waiters after the user interrupt.
@@ -775,39 +764,9 @@ void intel_engine_enable_signaling(struct i915_request *request, bool wakeup)
 	 * If we are the oldest waiter, enable the irq (after which we
 	 * must double check that the seqno did not complete).
 	 */
+	spin_lock(&b->rb_lock);
+	insert_signal(b, request, seqno);
 	wakeup &= __intel_engine_add_wait(engine, &request->signaling.wait);
-
-	if (!__i915_request_completed(request, seqno)) {
-		struct rb_node *parent, **p;
-		bool first;
-
-		/* Now insert ourselves into the retirement ordered list of
-		 * signals on this engine. We track the oldest seqno as that
-		 * will be the first signal to complete.
-		 */
-		parent = NULL;
-		first = true;
-		p = &b->signals.rb_node;
-		while (*p) {
-			parent = *p;
-			if (i915_seqno_passed(seqno,
-					      to_signaler(parent)->signaling.wait.seqno)) {
-				p = &parent->rb_right;
-				first = false;
-			} else {
-				p = &parent->rb_left;
-			}
-		}
-		rb_link_node(&request->signaling.node, parent, p);
-		rb_insert_color(&request->signaling.node, &b->signals);
-		if (first)
-			rcu_assign_pointer(b->first_signal, request);
-	} else {
-		__intel_engine_remove_wait(engine, &request->signaling.wait);
-		request->signaling.wait.seqno = 0;
-		wakeup = false;
-	}
-
 	spin_unlock(&b->rb_lock);
 
 	if (wakeup)
@@ -816,17 +775,20 @@ void intel_engine_enable_signaling(struct i915_request *request, bool wakeup)
 
 void intel_engine_cancel_signaling(struct i915_request *request)
 {
+	struct intel_engine_cs *engine = request->engine;
+	struct intel_breadcrumbs *b = &engine->breadcrumbs;
+
 	GEM_BUG_ON(!irqs_disabled());
 	lockdep_assert_held(&request->lock);
 
-	if (READ_ONCE(request->signaling.wait.seqno)) {
-		struct intel_engine_cs *engine = request->engine;
-		struct intel_breadcrumbs *b = &engine->breadcrumbs;
+	if (!READ_ONCE(request->signaling.wait.seqno))
+		return;
 
-		spin_lock(&b->rb_lock);
-		__intel_engine_remove_signal(engine, request);
-		spin_unlock(&b->rb_lock);
-	}
+	spin_lock(&b->rb_lock);
+	__intel_engine_remove_wait(engine, &request->signaling.wait);
+	if (fetch_and_zero(&request->signaling.wait.seqno))
+		__list_del_entry(&request->signaling.link);
+	spin_unlock(&b->rb_lock);
 }
 
 int intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
@@ -840,6 +802,8 @@ int intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
 	timer_setup(&b->fake_irq, intel_breadcrumbs_fake_irq, 0);
 	timer_setup(&b->hangcheck, intel_breadcrumbs_hangcheck, 0);
 
+	INIT_LIST_HEAD(&b->signals);
+
 	/* Spawn a thread to provide a common bottom-half for all signals.
 	 * As this is an asynchronous interface we cannot steal the current
 	 * task for handling the bottom-half to the user interrupt, therefore
@@ -899,8 +863,7 @@ void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine)
 	/* The engines should be idle and all requests accounted for! */
 	WARN_ON(READ_ONCE(b->irq_wait));
 	WARN_ON(!RB_EMPTY_ROOT(&b->waiters));
-	WARN_ON(rcu_access_pointer(b->first_signal));
-	WARN_ON(!RB_EMPTY_ROOT(&b->signals));
+	WARN_ON(!list_empty(&b->signals));
 
 	if (!IS_ERR_OR_NULL(b->signaler))
 		kthread_stop(b->signaler);
@@ -913,20 +876,22 @@ bool intel_breadcrumbs_busy(struct intel_engine_cs *engine)
 	struct intel_breadcrumbs *b = &engine->breadcrumbs;
 	bool busy = false;
 
-	spin_lock_irq(&b->rb_lock);
-
 	if (b->irq_wait) {
-		wake_up_process(b->irq_wait->tsk);
-		busy = true;
+		spin_lock_irq(&b->irq_lock);
+
+		if (b->irq_wait) {
+			wake_up_process(b->irq_wait->tsk);
+			busy = true;
+		}
+
+		spin_unlock_irq(&b->irq_lock);
 	}
 
-	if (rcu_access_pointer(b->first_signal)) {
+	if (!busy && !list_empty(&b->signals)) {
 		wake_up_process(b->signaler);
 		busy = true;
 	}
 
-	spin_unlock_irq(&b->rb_lock);
-
 	return busy;
 }
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 90e4380cbdd5..e7526a4f05e5 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -356,9 +356,9 @@ struct intel_engine_cs {
 
 	spinlock_t rb_lock; /* protects the rb and wraps irq_lock */
 	struct rb_root waiters; /* sorted by retirement, priority */
-	struct rb_root signals; /* sorted by retirement */
+	struct list_head signals; /* sorted by retirement */
 	struct task_struct *signaler; /* used for fence signalling */
-	struct i915_request __rcu *first_signal;
+
 	struct timer_list fake_irq; /* used after a missed interrupt */
 	struct timer_list hangcheck; /* detect missed interrupts */
 