author    Chris Wilson <chris@chris-wilson.co.uk>	2017-06-16 10:05:24 -0400
committer Chris Wilson <chris@chris-wilson.co.uk>	2017-06-16 11:54:05 -0400
commit    7dd4f6729f9243bd7046c6f04c107a456bda38eb (patch)
tree      b3f453d82aee261b40dc54142966c1fb24e9c2a2
parent    1a71cf2fa646799d4397a49b223549d8617fece0 (diff)
drm/i915: Async GPU relocation processing
If the user requires patching of their batch or auxiliary buffers, we
currently make the alterations on the CPU. If they are active on the GPU
at the time, we wait under the struct_mutex for them to finish executing
before we rewrite the contents. This happens, for example, when shared
relocation trees are used between different contexts with separate
address spaces (the buffers then have different addresses in each), so
the 3D state has to be adjusted between execution on each context.

However, we don't need to use the CPU to do the relocation patching: we
can queue commands to the GPU to perform it and use fences to serialise
the operation with current and future activity, so the operation on the
GPU appears just as atomic as performing it immediately. Performing the
relocation rewrites on the GPU is not free; in terms of pure throughput,
the number of relocations/s is about halved - but, more importantly, so
is the time spent under the struct_mutex.

v2: Break out the request/batch allocation for clearer error flow.
v3: A few asserts to ensure rq ordering is maintained.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
-rw-r--r--	drivers/gpu/drm/i915/i915_gem.c	1
-rw-r--r--	drivers/gpu/drm/i915/i915_gem_execbuffer.c	227
2 files changed, 220 insertions(+), 8 deletions(-)
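For readers skimming the patch: the heart of the GPU path is simply emitting MI_STORE_DWORD_IMM commands into a pooled batch so that the GPU writes the relocated address itself. The standalone sketch below mirrors the gen8+ ("wide") encoding used by relocate_entry() in the diff; it is illustrative only, and the opcode value and helper name are assumptions for the sketch, not the driver's definitions.

/*
 * Illustrative sketch only -- not driver code. Encodes the command dwords
 * that make the GPU write a 64-bit relocation value at a GTT address,
 * mirroring the wide path added to relocate_entry() below.
 */
#include <stddef.h>
#include <stdint.h>

/* Assumed encoding for the sketch: opcode << 23 | (command length - 2). */
#define MI_STORE_DWORD_IMM_GEN4 ((0x20u << 23) | 2)

/* Emit commands that store a 64-bit value at GTT address addr; returns dwords used. */
size_t emit_reloc_qword(uint32_t *batch, uint64_t addr, uint64_t value)
{
        size_t n = 0;

        if (addr & 7) {
                /* Unaligned qword: split into two separate dword writes. */
                batch[n++] = MI_STORE_DWORD_IMM_GEN4;
                batch[n++] = (uint32_t)addr;
                batch[n++] = (uint32_t)(addr >> 32);
                batch[n++] = (uint32_t)value;

                addr += 4;
                batch[n++] = MI_STORE_DWORD_IMM_GEN4;
                batch[n++] = (uint32_t)addr;
                batch[n++] = (uint32_t)(addr >> 32);
                batch[n++] = (uint32_t)(value >> 32);
        } else {
                /* Aligned qword: one command carrying both halves of the value. */
                batch[n++] = (MI_STORE_DWORD_IMM_GEN4 | (1u << 21)) + 1;
                batch[n++] = (uint32_t)addr;
                batch[n++] = (uint32_t)(addr >> 32);
                batch[n++] = (uint32_t)value;
                batch[n++] = (uint32_t)(value >> 32);
        }

        return n; /* 8 dwords unaligned, 5 aligned -- matching len in relocate_entry() */
}

In the patch, such a batch is terminated with MI_BATCH_BUFFER_END and submitted as its own request, fenced against both the target object and the user's subsequent execbuf (see reloc_gpu_flush() and __reloc_gpu_alloc() below).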
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 96b344901a7b..7dcac3bfb771 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4397,7 +4397,6 @@ static void __i915_gem_free_objects(struct drm_i915_private *i915,
 		GEM_BUG_ON(i915_gem_object_is_active(obj));
 		list_for_each_entry_safe(vma, vn,
 					 &obj->vma_list, obj_link) {
-			GEM_BUG_ON(!i915_vma_is_ggtt(vma));
 			GEM_BUG_ON(i915_vma_is_active(vma));
 			vma->flags &= ~I915_VMA_PIN_MASK;
 			i915_vma_close(vma);
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index e262133a7cf5..2f7a2d2510fc 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -40,7 +40,12 @@
 #include "intel_drv.h"
 #include "intel_frontbuffer.h"
 
-#define DBG_USE_CPU_RELOC 0 /* -1 force GTT relocs; 1 force CPU relocs */
+enum {
+	FORCE_CPU_RELOC = 1,
+	FORCE_GTT_RELOC,
+	FORCE_GPU_RELOC,
+#define DBG_FORCE_RELOC 0 /* choose one of the above! */
+};
 
 #define __EXEC_OBJECT_HAS_REF		BIT(31)
 #define __EXEC_OBJECT_HAS_PIN		BIT(30)
@@ -212,10 +217,15 @@ struct i915_execbuffer {
 		struct drm_mm_node node; /** temporary GTT binding */
 		unsigned long vaddr; /** Current kmap address */
 		unsigned long page; /** Currently mapped page index */
+		unsigned int gen; /** Cached value of INTEL_GEN */
 		bool use_64bit_reloc : 1;
 		bool has_llc : 1;
 		bool has_fence : 1;
 		bool needs_unfenced : 1;
+
+		struct drm_i915_gem_request *rq;
+		u32 *rq_cmd;
+		unsigned int rq_size;
 	} reloc_cache;
 
 	u64 invalid_flags; /** Set of execobj.flags that are invalid */
@@ -496,8 +506,11 @@ static inline int use_cpu_reloc(const struct reloc_cache *cache,
 	if (!i915_gem_object_has_struct_page(obj))
 		return false;
 
-	if (DBG_USE_CPU_RELOC)
-		return DBG_USE_CPU_RELOC > 0;
+	if (DBG_FORCE_RELOC == FORCE_CPU_RELOC)
+		return true;
+
+	if (DBG_FORCE_RELOC == FORCE_GTT_RELOC)
+		return false;
 
 	return (cache->has_llc ||
 		obj->cache_dirty ||
@@ -887,6 +900,8 @@ static void eb_reset_vmas(const struct i915_execbuffer *eb)
 
 static void eb_destroy(const struct i915_execbuffer *eb)
 {
+	GEM_BUG_ON(eb->reloc_cache.rq);
+
 	if (eb->lut_size >= 0)
 		kfree(eb->buckets);
 }
@@ -904,11 +919,14 @@ static void reloc_cache_init(struct reloc_cache *cache,
 	cache->page = -1;
 	cache->vaddr = 0;
 	/* Must be a variable in the struct to allow GCC to unroll. */
+	cache->gen = INTEL_GEN(i915);
 	cache->has_llc = HAS_LLC(i915);
-	cache->has_fence = INTEL_GEN(i915) < 4;
-	cache->needs_unfenced = INTEL_INFO(i915)->unfenced_needs_alignment;
 	cache->use_64bit_reloc = HAS_64BIT_RELOC(i915);
+	cache->has_fence = cache->gen < 4;
+	cache->needs_unfenced = INTEL_INFO(i915)->unfenced_needs_alignment;
 	cache->node.allocated = false;
+	cache->rq = NULL;
+	cache->rq_size = 0;
 }
 
 static inline void *unmask_page(unsigned long p)
@@ -930,10 +948,24 @@ static inline struct i915_ggtt *cache_to_ggtt(struct reloc_cache *cache)
 	return &i915->ggtt;
 }
 
+static void reloc_gpu_flush(struct reloc_cache *cache)
+{
+	GEM_BUG_ON(cache->rq_size >= cache->rq->batch->obj->base.size / sizeof(u32));
+	cache->rq_cmd[cache->rq_size] = MI_BATCH_BUFFER_END;
+	i915_gem_object_unpin_map(cache->rq->batch->obj);
+	i915_gem_chipset_flush(cache->rq->i915);
+
+	__i915_add_request(cache->rq, true);
+	cache->rq = NULL;
+}
+
 static void reloc_cache_reset(struct reloc_cache *cache)
 {
 	void *vaddr;
 
+	if (cache->rq)
+		reloc_gpu_flush(cache);
+
 	if (!cache->vaddr)
 		return;
 
@@ -1099,6 +1131,121 @@ static void clflush_write32(u32 *addr, u32 value, unsigned int flushes)
 	*addr = value;
 }
 
+static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
+			     struct i915_vma *vma,
+			     unsigned int len)
+{
+	struct reloc_cache *cache = &eb->reloc_cache;
+	struct drm_i915_gem_object *obj;
+	struct drm_i915_gem_request *rq;
+	struct i915_vma *batch;
+	u32 *cmd;
+	int err;
+
+	GEM_BUG_ON(vma->obj->base.write_domain & I915_GEM_DOMAIN_CPU);
+
+	obj = i915_gem_batch_pool_get(&eb->engine->batch_pool, PAGE_SIZE);
+	if (IS_ERR(obj))
+		return PTR_ERR(obj);
+
+	cmd = i915_gem_object_pin_map(obj,
+				      cache->has_llc ? I915_MAP_WB : I915_MAP_WC);
+	i915_gem_object_unpin_pages(obj);
+	if (IS_ERR(cmd))
+		return PTR_ERR(cmd);
+
+	err = i915_gem_object_set_to_wc_domain(obj, false);
+	if (err)
+		goto err_unmap;
+
+	batch = i915_vma_instance(obj, vma->vm, NULL);
+	if (IS_ERR(batch)) {
+		err = PTR_ERR(batch);
+		goto err_unmap;
+	}
+
+	err = i915_vma_pin(batch, 0, 0, PIN_USER | PIN_NONBLOCK);
+	if (err)
+		goto err_unmap;
+
+	rq = i915_gem_request_alloc(eb->engine, eb->ctx);
+	if (IS_ERR(rq)) {
+		err = PTR_ERR(rq);
+		goto err_unpin;
+	}
+
+	err = i915_gem_request_await_object(rq, vma->obj, true);
+	if (err)
+		goto err_request;
+
+	err = eb->engine->emit_flush(rq, EMIT_INVALIDATE);
+	if (err)
+		goto err_request;
+
+	err = i915_switch_context(rq);
+	if (err)
+		goto err_request;
+
+	err = eb->engine->emit_bb_start(rq,
+					batch->node.start, PAGE_SIZE,
+					cache->gen > 5 ? 0 : I915_DISPATCH_SECURE);
+	if (err)
+		goto err_request;
+
+	GEM_BUG_ON(!reservation_object_test_signaled_rcu(obj->resv, true));
+	i915_vma_move_to_active(batch, rq, 0);
+	reservation_object_lock(obj->resv, NULL);
+	reservation_object_add_excl_fence(obj->resv, &rq->fence);
+	reservation_object_unlock(obj->resv);
+	i915_vma_unpin(batch);
+
+	i915_vma_move_to_active(vma, rq, true);
+	reservation_object_lock(vma->obj->resv, NULL);
+	reservation_object_add_excl_fence(vma->obj->resv, &rq->fence);
+	reservation_object_unlock(vma->obj->resv);
+
+	rq->batch = batch;
+
+	cache->rq = rq;
+	cache->rq_cmd = cmd;
+	cache->rq_size = 0;
+
+	/* Return with batch mapping (cmd) still pinned */
+	return 0;
+
+err_request:
+	i915_add_request(rq);
+err_unpin:
+	i915_vma_unpin(batch);
+err_unmap:
+	i915_gem_object_unpin_map(obj);
+	return err;
+}
+
+static u32 *reloc_gpu(struct i915_execbuffer *eb,
+		      struct i915_vma *vma,
+		      unsigned int len)
+{
+	struct reloc_cache *cache = &eb->reloc_cache;
+	u32 *cmd;
+
+	if (cache->rq_size > PAGE_SIZE/sizeof(u32) - (len + 1))
+		reloc_gpu_flush(cache);
+
+	if (unlikely(!cache->rq)) {
+		int err;
+
+		err = __reloc_gpu_alloc(eb, vma, len);
+		if (unlikely(err))
+			return ERR_PTR(err);
+	}
+
+	cmd = cache->rq_cmd + cache->rq_size;
+	cache->rq_size += len;
+
+	return cmd;
+}
+
 static u64
 relocate_entry(struct i915_vma *vma,
 	       const struct drm_i915_gem_relocation_entry *reloc,
@@ -1111,6 +1258,67 @@ relocate_entry(struct i915_vma *vma,
 	bool wide = eb->reloc_cache.use_64bit_reloc;
 	void *vaddr;
 
+	if (!eb->reloc_cache.vaddr &&
+	    (DBG_FORCE_RELOC == FORCE_GPU_RELOC ||
+	     !reservation_object_test_signaled_rcu(obj->resv, true))) {
+		const unsigned int gen = eb->reloc_cache.gen;
+		unsigned int len;
+		u32 *batch;
+		u64 addr;
+
+		if (wide)
+			len = offset & 7 ? 8 : 5;
+		else if (gen >= 4)
+			len = 4;
+		else if (gen >= 3)
+			len = 3;
+		else /* On gen2 MI_STORE_DWORD_IMM uses a physical address */
+			goto repeat;
+
+		batch = reloc_gpu(eb, vma, len);
+		if (IS_ERR(batch))
+			goto repeat;
+
+		addr = gen8_canonical_addr(vma->node.start + offset);
+		if (wide) {
+			if (offset & 7) {
+				*batch++ = MI_STORE_DWORD_IMM_GEN4;
+				*batch++ = lower_32_bits(addr);
+				*batch++ = upper_32_bits(addr);
+				*batch++ = lower_32_bits(target_offset);
+
+				addr = gen8_canonical_addr(addr + 4);
+
+				*batch++ = MI_STORE_DWORD_IMM_GEN4;
+				*batch++ = lower_32_bits(addr);
+				*batch++ = upper_32_bits(addr);
+				*batch++ = upper_32_bits(target_offset);
+			} else {
+				*batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1;
+				*batch++ = lower_32_bits(addr);
+				*batch++ = upper_32_bits(addr);
+				*batch++ = lower_32_bits(target_offset);
+				*batch++ = upper_32_bits(target_offset);
+			}
+		} else if (gen >= 6) {
+			*batch++ = MI_STORE_DWORD_IMM_GEN4;
+			*batch++ = 0;
+			*batch++ = addr;
+			*batch++ = target_offset;
+		} else if (gen >= 4) {
+			*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+			*batch++ = 0;
+			*batch++ = addr;
+			*batch++ = target_offset;
+		} else {
+			*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
+			*batch++ = addr;
+			*batch++ = target_offset;
+		}
+
+		goto out;
+	}
+
 repeat:
 	vaddr = reloc_vaddr(obj, &eb->reloc_cache, offset >> PAGE_SHIFT);
 	if (IS_ERR(vaddr))
@@ -1127,6 +1335,7 @@ repeat:
 		goto repeat;
 	}
 
+out:
 	return target->node.start | UPDATE;
 }
 
@@ -1189,7 +1398,8 @@ eb_relocate_entry(struct i915_execbuffer *eb,
 	 * If the relocation already has the right value in it, no
 	 * more work needs to be done.
 	 */
-	if (gen8_canonical_addr(target->node.start) == reloc->presumed_offset)
+	if (!DBG_FORCE_RELOC &&
+	    gen8_canonical_addr(target->node.start) == reloc->presumed_offset)
 		return 0;
 
 	/* Check that the relocation address is valid... */
@@ -1915,7 +2125,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	eb.i915 = to_i915(dev);
 	eb.file = file;
 	eb.args = args;
-	if (!(args->flags & I915_EXEC_NO_RELOC))
+	if (DBG_FORCE_RELOC || !(args->flags & I915_EXEC_NO_RELOC))
 		args->flags |= __EXEC_HAS_RELOC;
 	eb.exec = exec;
 	eb.ctx = NULL;
@@ -2068,6 +2278,9 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 		eb.batch = vma;
 	}
 
+	/* All GPU relocation batches must be submitted prior to the user rq */
+	GEM_BUG_ON(eb.reloc_cache.rq);
+
 	/* Allocate a request for this batch buffer nice and early. */
 	eb.request = i915_gem_request_alloc(eb.engine, eb.ctx);
 	if (IS_ERR(eb.request)) {