Diffstat (limited to 'drivers/gpu/drm/i915/i915_request.c')
 -rw-r--r--  drivers/gpu/drm/i915/i915_request.c | 60
 1 file changed, 59 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index b836721d3b13..f6c78c0fa74b 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -425,6 +425,26 @@ void __i915_request_submit(struct i915_request *request)
 	if (i915_gem_context_is_banned(request->gem_context))
 		i915_request_skip(request, -EIO);
 
+	/*
+	 * Are we using semaphores when the gpu is already saturated?
+	 *
+	 * Using semaphores incurs a cost in having the GPU poll a
+	 * memory location, busywaiting for it to change. The continual
+	 * memory reads can have a noticeable impact on the rest of the
+	 * system with the extra bus traffic, stalling the cpu as it too
+	 * tries to access memory across the bus (perf stat -e bus-cycles).
+	 *
+	 * If we installed a semaphore on this request and we only submit
+	 * the request after the signaler completed, that indicates the
+	 * system is overloaded and using semaphores at this time only
+	 * increases the amount of work we are doing. If so, we disable
+	 * further use of semaphores until we are idle again, whence we
+	 * optimistically try again.
+	 */
+	if (request->sched.semaphores &&
+	    i915_sw_fence_signaled(&request->semaphore))
+		request->hw_context->saturated |= request->sched.semaphores;
+
 	/* We may be recursing from the signal callback of another i915 fence */
 	spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
 
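The hunk above reduces the saturation heuristic to two bitmask operations: at submit time, record the engines whose semaphores ran uselessly late, and consult that record before emitting another semaphore (see already_busywaiting() further down). A minimal standalone sketch of the idea, using simplified toy_* stand-ins rather than the driver's real types:

    /* Illustrative sketch only; the toy_* types stand in for the i915
     * request/context structs and are not the driver's code. */
    #include <stdbool.h>

    typedef unsigned int engine_mask_t;

    struct toy_context {
            engine_mask_t saturated;   /* engines whose semaphores ran late */
    };

    struct toy_request {
            struct toy_context *ctx;
            engine_mask_t semaphores;  /* engines this request busywaits on */
            bool signaler_done;        /* signaler finished before we submitted */
    };

    /* At submit: a semaphore whose signaler has already finished bought us
     * nothing, so mark those engines as saturated for this context. */
    static void note_saturation(struct toy_request *rq)
    {
            if (rq->semaphores && rq->signaler_done)
                    rq->ctx->saturated |= rq->semaphores;
    }

Per the comment in the hunk, the real mask stays set until the context idles, at which point semaphores are optimistically retried.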
@@ -432,6 +452,7 @@ void __i915_request_submit(struct i915_request *request)
 	set_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags);
 
 	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags) &&
+	    !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags) &&
 	    !i915_request_enable_breadcrumb(request))
 		intel_engine_queue_breadcrumbs(engine);
 
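The one-line addition in this hunk stops us from arming the breadcrumb interrupt for a fence that has already signaled. As a sketch, the predicate now reads roughly as below; needs_breadcrumb() is a hypothetical helper for illustration, while test_bit() and the flag names are the kernel's own:

    /* Hypothetical helper showing the check order; not driver code. */
    static bool needs_breadcrumb(const struct dma_fence *fence)
    {
            return test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &fence->flags) &&
                   !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags);
    }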
@@ -799,6 +820,39 @@ err_unreserve:
 }
 
 static int
+i915_request_await_start(struct i915_request *rq, struct i915_request *signal)
+{
+	if (list_is_first(&signal->ring_link, &signal->ring->request_list))
+		return 0;
+
+	signal = list_prev_entry(signal, ring_link);
+	if (i915_timeline_sync_is_later(rq->timeline, &signal->fence))
+		return 0;
+
+	return i915_sw_fence_await_dma_fence(&rq->submit,
+					     &signal->fence, 0,
+					     I915_FENCE_GFP);
+}
+
+static intel_engine_mask_t
+already_busywaiting(struct i915_request *rq)
+{
+	/*
+	 * Polling a semaphore causes bus traffic, delaying other users of
+	 * both the GPU and CPU. We want to limit the impact on others,
+	 * while taking advantage of early submission to reduce GPU
+	 * latency. Therefore we restrict ourselves to not using more
+	 * than one semaphore from each source, and not using a semaphore
+	 * if we have detected the engine is saturated (i.e. would not be
+	 * submitted early and cause bus traffic reading an already passed
+	 * semaphore).
+	 *
+	 * See the are-we-too-late? check in __i915_request_submit().
+	 */
+	return rq->sched.semaphores | rq->hw_context->saturated;
+}
+
+static int
 emit_semaphore_wait(struct i915_request *to,
 		    struct i915_request *from,
 		    gfp_t gfp)
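i915_request_await_start() above keys off the signaler's position in its ring: unless the signaler is the oldest request on that ring, our submission is gated on the request queued immediately before it, so by the time our semaphore begins polling, the signaler itself must already be executing. A rough sketch of that rule, again with illustrative toy_* stand-ins rather than the driver's types:

    /* Illustrative sketch of the await-start rule; not driver code. */
    struct toy_request {
            struct toy_request *ring_prev; /* previous request on the ring */
            unsigned int seqno;
    };

    struct toy_timeline {
            unsigned int synced_until;     /* newest seqno already awaited */
    };

    /* Stands in for i915_sw_fence_await_dma_fence() on rq->submit. */
    static int toy_await(struct toy_timeline *tl, struct toy_request *rq)
    {
            if (rq->seqno > tl->synced_until)
                    tl->synced_until = rq->seqno;
            return 0;
    }

    static int toy_await_start(struct toy_timeline *tl,
                               struct toy_request *signal)
    {
            struct toy_request *prev = signal->ring_prev;

            /* Oldest on its ring: the signaler starts as soon as possible. */
            if (!prev)
                    return 0;

            /* Already ordered after the predecessor: nothing more to do. */
            if (prev->seqno <= tl->synced_until)
                    return 0;

            return toy_await(tl, prev);
    }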
@@ -811,11 +865,15 @@ emit_semaphore_wait(struct i915_request *to,
 	GEM_BUG_ON(INTEL_GEN(to->i915) < 8);
 
 	/* Just emit the first semaphore we see as request space is limited. */
-	if (to->sched.semaphores & from->engine->mask)
+	if (already_busywaiting(to) & from->engine->mask)
 		return i915_sw_fence_await_dma_fence(&to->submit,
 						     &from->fence, 0,
 						     I915_FENCE_GFP);
 
+	err = i915_request_await_start(to, from);
+	if (err < 0)
+		return err;
+
 	err = i915_sw_fence_await_dma_fence(&to->semaphore,
 					    &from->fence, 0,
 					    I915_FENCE_GFP);
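Taken together, the patched emit_semaphore_wait() degrades gracefully: a busywait is emitted only when neither this request nor its context has been seen saturating the signaler's engine, and even then only after ordering behind the signaler's start. A hypothetical summary of that cascade (all names are stand-ins, not the driver's):

    typedef unsigned int engine_mask_t;

    enum wait_kind { WAIT_FENCE, WAIT_SEMAPHORE };

    /* At most one semaphore per source engine, and none on engines already
     * marked saturated; otherwise prefer the GPU-side busywait. */
    static enum wait_kind choose_wait(engine_mask_t busywaiting,
                                      engine_mask_t saturated,
                                      engine_mask_t signaler)
    {
            if ((busywaiting | saturated) & signaler)
                    return WAIT_FENCE;     /* interrupt-driven, no GPU poll */
            return WAIT_SEMAPHORE;         /* GPU poll for lower latency */
    }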