author     Jesse Barnes <jbarnes@virtuousgeek.org>    2011-10-16 04:23:31 -0400
committer  Keith Packard <keithp@keithp.com>          2011-10-20 18:26:41 -0400
commit     8d31528703ceda6f631e39953130abe9b3ca52b2 (patch)
tree       f64ff55c111adb9e479cad97ceede6174b824aa6 /drivers/gpu/drm/i915/intel_ringbuffer.c
parent     9d971b37534fb268251f74cc04a36a0a16f7da04 (diff)
drm/i915: Use PIPE_CONTROL for flushing on gen6+.
v2 by danvet: Use a new flag to flush the render target cache on gen6+
(hw reuses the old write flush bit), as suggested by Ben Widawsky.
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
[danvet: this seems to fix cairo-perf-trace hangs on my snb]
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Keith Packard <keithp@keithp.com>
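
For readers who want the shape of the change before reading the diff: all gen6+ render-ring flushes now go through PIPE_CONTROL, and the Sandy Bridge errata quoted in the patch's new comment block force every such flush to be preceded by two workaround packets. The standalone C sketch below models only that three-packet ordering. The WA_* names and bit positions, the emit_pipe_control() helper, and the 0x1000 scratch address are invented for illustration; the real flag values live in i915_reg.h and the real emission goes through intel_ring_emit().

#include <stdio.h>

/* Placeholder flag bits -- illustrative only; the real values are in
 * i915_reg.h. */
enum {
	WA_CS_STALL            = 1 << 0,
	WA_STALL_AT_SCOREBOARD = 1 << 1,
	WA_QW_WRITE            = 1 << 2, /* post-sync op: quad-word write */
	WA_FLUSH_EVERYTHING    = 1 << 3, /* stands in for the seven
					  * flush/invalidate flags */
};

/* Models one 5-dword PIPE_CONTROL: header, flags, address, two data
 * dwords. */
static void emit_pipe_control(unsigned flags, unsigned scratch_addr)
{
	printf("PIPE_CONTROL flags=%#x addr=%#x data={0,0}\n",
	       flags, scratch_addr);
}

/* The ordering the patch enforces on every gen6 render flush:
 * 1. CS stall + stall-at-scoreboard (required before the post-sync packet),
 * 2. post-sync quad-word write to the scratch page (required before any
 *    PIPE_CONTROL that enables cache flushes),
 * 3. the PIPE_CONTROL that actually flushes and invalidates the caches.
 */
static void gen6_flush_sequence(unsigned scratch_addr)
{
	emit_pipe_control(WA_CS_STALL | WA_STALL_AT_SCOREBOARD, scratch_addr);
	emit_pipe_control(WA_QW_WRITE, scratch_addr);
	emit_pipe_control(WA_FLUSH_EVERYTHING, scratch_addr);
}

int main(void)
{
	/* The patch writes to pc->gtt_offset + 128; 0x1000 is made up. */
	gen6_flush_sequence(0x1000 + 128);
	return 0;
}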
Diffstat (limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c')
 -rw-r--r--  drivers/gpu/drm/i915/intel_ringbuffer.c | 136
 1 file changed, 124 insertions(+), 12 deletions(-)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index ca8363531a64..ca70e2f10445 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -34,6 +34,16 @@
 #include "i915_trace.h"
 #include "intel_drv.h"
 
+/*
+ * 965+ support PIPE_CONTROL commands, which provide finer grained control
+ * over cache flushing.
+ */
+struct pipe_control {
+	struct drm_i915_gem_object *obj;
+	volatile u32 *cpu_page;
+	u32 gtt_offset;
+};
+
 static inline int ring_space(struct intel_ring_buffer *ring)
 {
 	int space = (ring->head & HEAD_ADDR) - (ring->tail + 8);
@@ -123,6 +133,118 @@ render_ring_flush(struct intel_ring_buffer *ring,
 	return 0;
 }
 
+/**
+ * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
+ * implementing two workarounds on gen6.  From section 1.4.7.1
+ * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
+ *
+ * [DevSNB-C+{W/A}] Before any depth stall flush (including those
+ * produced by non-pipelined state commands), software needs to first
+ * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
+ * 0.
+ *
+ * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
+ * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
+ *
+ * And the workaround for these two requires this workaround first:
+ *
+ * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
+ * BEFORE the pipe-control with a post-sync op and no write-cache
+ * flushes.
+ *
+ * And this last workaround is tricky because of the requirements on
+ * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
+ * volume 2 part 1:
+ *
+ *     "1 of the following must also be set:
+ *      - Render Target Cache Flush Enable ([12] of DW1)
+ *      - Depth Cache Flush Enable ([0] of DW1)
+ *      - Stall at Pixel Scoreboard ([1] of DW1)
+ *      - Depth Stall ([13] of DW1)
+ *      - Post-Sync Operation ([13] of DW1)
+ *      - Notify Enable ([8] of DW1)"
+ *
+ * The cache flushes require the workaround flush that triggered this
+ * one, so we can't use it.  Depth stall would trigger the same.
+ * Post-sync nonzero is what triggered this second workaround, so we
+ * can't use that one either.  Notify enable is IRQs, which aren't
+ * really our business.  That leaves only stall at scoreboard.
+ */
+static int
+intel_emit_post_sync_nonzero_flush(struct intel_ring_buffer *ring)
+{
+	struct pipe_control *pc = ring->private;
+	u32 scratch_addr = pc->gtt_offset + 128;
+	int ret;
+
+
+	ret = intel_ring_begin(ring, 6);
+	if (ret)
+		return ret;
+
+	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(5));
+	intel_ring_emit(ring, PIPE_CONTROL_CS_STALL |
+			PIPE_CONTROL_STALL_AT_SCOREBOARD);
+	intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT); /* address */
+	intel_ring_emit(ring, 0); /* low dword */
+	intel_ring_emit(ring, 0); /* high dword */
+	intel_ring_emit(ring, MI_NOOP);
+	intel_ring_advance(ring);
+
+	ret = intel_ring_begin(ring, 6);
+	if (ret)
+		return ret;
+
+	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(5));
+	intel_ring_emit(ring, PIPE_CONTROL_QW_WRITE);
+	intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT); /* address */
+	intel_ring_emit(ring, 0);
+	intel_ring_emit(ring, 0);
+	intel_ring_emit(ring, MI_NOOP);
+	intel_ring_advance(ring);
+
+	return 0;
+}
+
+static int
+gen6_render_ring_flush(struct intel_ring_buffer *ring,
+		       u32 invalidate_domains, u32 flush_domains)
+{
+	u32 flags = 0;
+	struct pipe_control *pc = ring->private;
+	u32 scratch_addr = pc->gtt_offset + 128;
+	int ret;
+
+	/* Force SNB workarounds for PIPE_CONTROL flushes */
+	intel_emit_post_sync_nonzero_flush(ring);
+
+	/* Just flush everything.  Experiments have shown that reducing the
+	 * number of bits based on the write domains has little performance
+	 * impact.
+	 */
+	flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
+	flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
+	flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
+	flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
+	flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
+	flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
+	flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
+
+	ret = intel_ring_begin(ring, 6);
+	if (ret)
+		return ret;
+
+	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(5));
+	intel_ring_emit(ring, flags);
+	intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT);
+	intel_ring_emit(ring, 0); /* lower dword */
+	intel_ring_emit(ring, 0); /* upper dword */
+	intel_ring_emit(ring, MI_NOOP);
+	intel_ring_advance(ring);
+
+	return 0;
+}
+
 static void ring_write_tail(struct intel_ring_buffer *ring,
 			    u32 value)
 {
@@ -206,16 +328,6 @@ static int init_ring_common(struct intel_ring_buffer *ring)
 	return 0;
 }
 
-/*
- * 965+ support PIPE_CONTROL commands, which provide finer grained control
- * over cache flushing.
- */
-struct pipe_control {
-	struct drm_i915_gem_object *obj;
-	volatile u32 *cpu_page;
-	u32 gtt_offset;
-};
-
 static int
 init_pipe_control(struct intel_ring_buffer *ring)
 {
@@ -296,8 +408,7 @@ static int init_render_ring(struct intel_ring_buffer *ring)
 			   GFX_MODE_ENABLE(GFX_REPLAY_MODE));
 	}
 
-	if (INTEL_INFO(dev)->gen >= 6) {
-	} else if (IS_GEN5(dev)) {
+	if (INTEL_INFO(dev)->gen >= 5) {
 		ret = init_pipe_control(ring);
 		if (ret)
 			return ret;
@@ -1360,6 +1471,7 @@ int intel_init_render_ring_buffer(struct drm_device *dev)
 	*ring = render_ring;
 	if (INTEL_INFO(dev)->gen >= 6) {
 		ring->add_request = gen6_add_request;
+		ring->flush = gen6_render_ring_flush;
 		ring->irq_get = gen6_render_ring_get_irq;
 		ring->irq_put = gen6_render_ring_put_irq;
 	} else if (IS_GEN5(dev)) {