author    Chris Wilson <chris@chris-wilson.co.uk>	2017-06-16 10:05:24 -0400
committer Chris Wilson <chris@chris-wilson.co.uk>	2017-06-16 11:54:05 -0400
commit    7dd4f6729f9243bd7046c6f04c107a456bda38eb (patch)
tree      b3f453d82aee261b40dc54142966c1fb24e9c2a2
parent    1a71cf2fa646799d4397a49b223549d8617fece0 (diff)
drm/i915: Async GPU relocation processing
If the user requires patching of their batch or auxiliary buffers, we
currently make the alterations on the CPU. If they are active on the GPU
at the time, we wait under the struct_mutex for them to finish executing
before we rewrite the contents. This happens, for example, when shared
relocation trees are used between different contexts with separate
address spaces (the buffers then have different addresses in each), so
the 3D state has to be adjusted between execution on each context.

However, we don't need to use the CPU to do the relocation patching: we
can queue commands to the GPU to perform it and use fences to serialise
the operation with current and future activity, so the operation on the
GPU appears just as atomic as performing it immediately. Performing the
relocation rewrites on the GPU is not free; in terms of pure throughput,
the number of relocations/s is about halved - but, more importantly, so
is the time spent under the struct_mutex.

v2: Break out the request/batch allocation for clearer error flow.
v3: A few asserts to ensure rq ordering is maintained.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
-rw-r--r--	drivers/gpu/drm/i915/i915_gem.c	1
-rw-r--r--	drivers/gpu/drm/i915/i915_gem_execbuffer.c	227
2 files changed, 220 insertions(+), 8 deletions(-)
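For readers skimming the patch: the heart of the GPU path is simply emitting MI_STORE_DWORD_IMM commands into a pooled batch so that the GPU writes the relocated address itself. The standalone sketch below mirrors the gen8+ ("wide") encoding used by relocate_entry() in the diff; it is illustrative only, and the opcode value and helper name are assumptions for the sketch, not the driver's definitions.

/*
 * Illustrative sketch only -- not driver code. Encodes the command dwords
 * that make the GPU write a 64-bit relocation value at a GTT address,
 * mirroring the wide path added to relocate_entry() below.
 */
#include <stddef.h>
#include <stdint.h>

/* Assumed encoding for the sketch: opcode << 23 | (command length - 2). */
#define MI_STORE_DWORD_IMM_GEN4 ((0x20u << 23) | 2)

/* Emit commands that store a 64-bit value at GTT address addr; returns dwords used. */
size_t emit_reloc_qword(uint32_t *batch, uint64_t addr, uint64_t value)
{
        size_t n = 0;

        if (addr & 7) {
                /* Unaligned qword: split into two separate dword writes. */
                batch[n++] = MI_STORE_DWORD_IMM_GEN4;
                batch[n++] = (uint32_t)addr;
                batch[n++] = (uint32_t)(addr >> 32);
                batch[n++] = (uint32_t)value;

                addr += 4;
                batch[n++] = MI_STORE_DWORD_IMM_GEN4;
                batch[n++] = (uint32_t)addr;
                batch[n++] = (uint32_t)(addr >> 32);
                batch[n++] = (uint32_t)(value >> 32);
        } else {
                /* Aligned qword: one command carrying both halves of the value. */
                batch[n++] = (MI_STORE_DWORD_IMM_GEN4 | (1u << 21)) + 1;
                batch[n++] = (uint32_t)addr;
                batch[n++] = (uint32_t)(addr >> 32);
                batch[n++] = (uint32_t)value;
                batch[n++] = (uint32_t)(value >> 32);
        }

        return n; /* 8 dwords unaligned, 5 aligned -- matching len in relocate_entry() */
}

In the patch, such a batch is terminated with MI_BATCH_BUFFER_END and submitted as its own request, fenced against both the target object and the user's subsequent execbuf (see reloc_gpu_flush() and __reloc_gpu_alloc() below).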
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 96b344901a7b..7dcac3bfb771 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4397,7 +4397,6 @@ static void __i915_gem_free_objects(struct drm_i915_private *i915,
 		GEM_BUG_ON(i915_gem_object_is_active(obj));
 		list_for_each_entry_safe(vma, vn,
 					 &obj->vma_list, obj_link) {
-			GEM_BUG_ON(!i915_vma_is_ggtt(vma));
 			GEM_BUG_ON(i915_vma_is_active(vma));
 			vma->flags &= ~I915_VMA_PIN_MASK;
 			i915_vma_close(vma);
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index e262133a7cf5..2f7a2d2510fc 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -40,7 +40,12 @@
 #include "intel_drv.h"
 #include "intel_frontbuffer.h"
 
-#define DBG_USE_CPU_RELOC 0 /* -1 force GTT relocs; 1 force CPU relocs */
+enum {
+	FORCE_CPU_RELOC = 1,
+	FORCE_GTT_RELOC,
+	FORCE_GPU_RELOC,
+#define DBG_FORCE_RELOC 0 /* choose one of the above! */
+};
 
 #define __EXEC_OBJECT_HAS_REF		BIT(31)
 #define __EXEC_OBJECT_HAS_PIN		BIT(30)
@@ -212,10 +217,15 @@ struct i915_execbuffer {
 		struct drm_mm_node node; /** temporary GTT binding */
 		unsigned long vaddr; /** Current kmap address */
 		unsigned long page; /** Currently mapped page index */
+		unsigned int gen; /** Cached value of INTEL_GEN */
 		bool use_64bit_reloc : 1;
 		bool has_llc : 1;
 		bool has_fence : 1;
 		bool needs_unfenced : 1;
+
+		struct drm_i915_gem_request *rq;
+		u32 *rq_cmd;
+		unsigned int rq_size;
 	} reloc_cache;
 
 	u64 invalid_flags; /** Set of execobj.flags that are invalid */
@@ -496,8 +506,11 @@ static inline int use_cpu_reloc(const struct reloc_cache *cache,
 	if (!i915_gem_object_has_struct_page(obj))
 		return false;
 
-	if (DBG_USE_CPU_RELOC)
-		return DBG_USE_CPU_RELOC > 0;
+	if (DBG_FORCE_RELOC == FORCE_CPU_RELOC)
+		return true;
+
+	if (DBG_FORCE_RELOC == FORCE_GTT_RELOC)
+		return false;
 
 	return (cache->has_llc ||
 		obj->cache_dirty ||
@@ -887,6 +900,8 @@ static void eb_reset_vmas(const struct i915_execbuffer *eb)
 
 static void eb_destroy(const struct i915_execbuffer *eb)
 {
+	GEM_BUG_ON(eb->reloc_cache.rq);
+
 	if (eb->lut_size >= 0)
 		kfree(eb->buckets);
 }
@@ -904,11 +919,14 @@ static void reloc_cache_init(struct reloc_cache *cache,
 	cache->page = -1;
 	cache->vaddr = 0;
 	/* Must be a variable in the struct to allow GCC to unroll. */
+	cache->gen = INTEL_GEN(i915);
 	cache->has_llc = HAS_LLC(i915);
-	cache->has_fence = INTEL_GEN(i915) < 4;
-	cache->needs_unfenced = INTEL_INFO(i915)->unfenced_needs_alignment;
 	cache->use_64bit_reloc = HAS_64BIT_RELOC(i915);
+	cache->has_fence = cache->gen < 4;
+	cache->needs_unfenced = INTEL_INFO(i915)->unfenced_needs_alignment;
 	cache->node.allocated = false;
+	cache->rq = NULL;
+	cache->rq_size = 0;
 }
 
 static inline void *unmask_page(unsigned long p)
@@ -930,10 +948,24 @@ static inline struct i915_ggtt *cache_to_ggtt(struct reloc_cache *cache)
 	return &i915->ggtt;
 }
 
+static void reloc_gpu_flush(struct reloc_cache *cache)
+{
+	GEM_BUG_ON(cache->rq_size >= cache->rq->batch->obj->base.size / sizeof(u32));
+	cache->rq_cmd[cache->rq_size] = MI_BATCH_BUFFER_END;
+	i915_gem_object_unpin_map(cache->rq->batch->obj);
+	i915_gem_chipset_flush(cache->rq->i915);
+
+	__i915_add_request(cache->rq, true);
+	cache->rq = NULL;
+}
+
 static void reloc_cache_reset(struct reloc_cache *cache)
 {
 	void *vaddr;
 
+	if (cache->rq)
+		reloc_gpu_flush(cache);
+
 	if (!cache->vaddr)
 		return;
 
@@ -1099,6 +1131,121 @@ static void clflush_write32(u32 *addr, u32 value, unsigned int flushes)
 	*addr = value;
 }
 
+static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
+			     struct i915_vma *vma,
+			     unsigned int len)
+{
+	struct reloc_cache *cache = &eb->reloc_cache;
+	struct drm_i915_gem_object *obj;
+	struct drm_i915_gem_request *rq;
+	struct i915_vma *batch;
+	u32 *cmd;
+	int err;
+
+	GEM_BUG_ON(vma->obj->base.write_domain & I915_GEM_DOMAIN_CPU);
+
+	obj = i915_gem_batch_pool_get(&eb->engine->batch_pool, PAGE_SIZE);
+	if (IS_ERR(obj))
+		return PTR_ERR(obj);
+
+	cmd = i915_gem_object_pin_map(obj,
+				      cache->has_llc ? I915_MAP_WB : I915_MAP_WC);
+	i915_gem_object_unpin_pages(obj);
+	if (IS_ERR(cmd))
+		return PTR_ERR(cmd);
+
+	err = i915_gem_object_set_to_wc_domain(obj, false);
+	if (err)
+		goto err_unmap;
+
+	batch = i915_vma_instance(obj, vma->vm, NULL);
+	if (IS_ERR(batch)) {
+		err = PTR_ERR(batch);
+		goto err_unmap;
+	}
+
+	err = i915_vma_pin(batch, 0, 0, PIN_USER | PIN_NONBLOCK);
+	if (err)
+		goto err_unmap;
+
+	rq = i915_gem_request_alloc(eb->engine, eb->ctx);
+	if (IS_ERR(rq)) {
+		err = PTR_ERR(rq);
+		goto err_unpin;
+	}
+
+	err = i915_gem_request_await_object(rq, vma->obj, true);
+	if (err)
+		goto err_request;
+
+	err = eb->engine->emit_flush(rq, EMIT_INVALIDATE);
+	if (err)
+		goto err_request;
+
+	err = i915_switch_context(rq);
+	if (err)
+		goto err_request;
+
+	err = eb->engine->emit_bb_start(rq,
+					batch->node.start, PAGE_SIZE,
+					cache->gen > 5 ? 0 : I915_DISPATCH_SECURE);
+	if (err)
+		goto err_request;
+
+	GEM_BUG_ON(!reservation_object_test_signaled_rcu(obj->resv, true));
+	i915_vma_move_to_active(batch, rq, 0);
+	reservation_object_lock(obj->resv, NULL);
+	reservation_object_add_excl_fence(obj->resv, &rq->fence);
+	reservation_object_unlock(obj->resv);
+	i915_vma_unpin(batch);
+
+	i915_vma_move_to_active(vma, rq, true);
+	reservation_object_lock(vma->obj->resv, NULL);
+	reservation_object_add_excl_fence(vma->obj->resv, &rq->fence);
+	reservation_object_unlock(vma->obj->resv);
+
+	rq->batch = batch;
+
+	cache->rq = rq;
+	cache->rq_cmd = cmd;
+	cache->rq_size = 0;
+
+	/* Return with batch mapping (cmd) still pinned */
+	return 0;
+
+err_request:
+	i915_add_request(rq);
+err_unpin:
+	i915_vma_unpin(batch);
+err_unmap:
+	i915_gem_object_unpin_map(obj);
+	return err;
+}
+
+static u32 *reloc_gpu(struct i915_execbuffer *eb,
+		      struct i915_vma *vma,
+		      unsigned int len)
+{
+	struct reloc_cache *cache = &eb->reloc_cache;
+	u32 *cmd;
+
+	if (cache->rq_size > PAGE_SIZE/sizeof(u32) - (len + 1))
+		reloc_gpu_flush(cache);
+
+	if (unlikely(!cache->rq)) {
+		int err;
+
+		err = __reloc_gpu_alloc(eb, vma, len);
+		if (unlikely(err))
+			return ERR_PTR(err);
+	}
+
+	cmd = cache->rq_cmd + cache->rq_size;
+	cache->rq_size += len;
+
+	return cmd;
+}
+
 static u64
 relocate_entry(struct i915_vma *vma,
 	       const struct drm_i915_gem_relocation_entry *reloc,
@@ -1111,6 +1258,67 @@ relocate_entry(struct i915_vma *vma,
 	bool wide = eb->reloc_cache.use_64bit_reloc;
 	void *vaddr;
 
+	if (!eb->reloc_cache.vaddr &&
+	    (DBG_FORCE_RELOC == FORCE_GPU_RELOC ||
+	     !reservation_object_test_signaled_rcu(obj->resv, true))) {
+		const unsigned int gen = eb->reloc_cache.gen;
+		unsigned int len;
+		u32 *batch;
+		u64 addr;
+
+		if (wide)
+			len = offset & 7 ? 8 : 5;
+		else if (gen >= 4)
+			len = 4;
+		else if (gen >= 3)
+			len = 3;
+		else /* On gen2 MI_STORE_DWORD_IMM uses a physical address */
+			goto repeat;
+
+		batch = reloc_gpu(eb, vma, len);
+		if (IS_ERR(batch))
+			goto repeat;
+
+		addr = gen8_canonical_addr(vma->node.start + offset);
+		if (wide) {
+			if (offset & 7) {
+				*batch++ = MI_STORE_DWORD_IMM_GEN4;
+				*batch++ = lower_32_bits(addr);
+				*batch++ = upper_32_bits(addr);
+				*batch++ = lower_32_bits(target_offset);
+
+				addr = gen8_canonical_addr(addr + 4);
+
+				*batch++ = MI_STORE_DWORD_IMM_GEN4;
+				*batch++ = lower_32_bits(addr);
+				*batch++ = upper_32_bits(addr);
+				*batch++ = upper_32_bits(target_offset);
+			} else {
+				*batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1;
+				*batch++ = lower_32_bits(addr);
+				*batch++ = upper_32_bits(addr);
+				*batch++ = lower_32_bits(target_offset);
+				*batch++ = upper_32_bits(target_offset);
+			}
+		} else if (gen >= 6) {
+			*batch++ = MI_STORE_DWORD_IMM_GEN4;
+			*batch++ = 0;
+			*batch++ = addr;
+			*batch++ = target_offset;
+		} else if (gen >= 4) {
+			*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+			*batch++ = 0;
+			*batch++ = addr;
+			*batch++ = target_offset;
+		} else {
+			*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
+			*batch++ = addr;
+			*batch++ = target_offset;
+		}
+
+		goto out;
+	}
+
 repeat:
 	vaddr = reloc_vaddr(obj, &eb->reloc_cache, offset >> PAGE_SHIFT);
 	if (IS_ERR(vaddr))
@@ -1127,6 +1335,7 @@ repeat:
 		goto repeat;
 	}
 
+out:
 	return target->node.start | UPDATE;
 }
 
@@ -1189,7 +1398,8 @@ eb_relocate_entry(struct i915_execbuffer *eb,
 	 * If the relocation already has the right value in it, no
 	 * more work needs to be done.
 	 */
-	if (gen8_canonical_addr(target->node.start) == reloc->presumed_offset)
+	if (!DBG_FORCE_RELOC &&
+	    gen8_canonical_addr(target->node.start) == reloc->presumed_offset)
 		return 0;
 
 	/* Check that the relocation address is valid... */
@@ -1915,7 +2125,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	eb.i915 = to_i915(dev);
 	eb.file = file;
 	eb.args = args;
-	if (!(args->flags & I915_EXEC_NO_RELOC))
+	if (DBG_FORCE_RELOC || !(args->flags & I915_EXEC_NO_RELOC))
 		args->flags |= __EXEC_HAS_RELOC;
 	eb.exec = exec;
 	eb.ctx = NULL;
@@ -2068,6 +2278,9 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 		eb.batch = vma;
 	}
 
+	/* All GPU relocation batches must be submitted prior to the user rq */
+	GEM_BUG_ON(eb.reloc_cache.rq);
+
 	/* Allocate a request for this batch buffer nice and early. */
 	eb.request = i915_gem_request_alloc(eb.engine, eb.ctx);
 	if (IS_ERR(eb.request)) {