author    Daniel Vetter <daniel.vetter@ffwll.ch>  2013-01-17 16:23:36 -0500
committer Daniel Vetter <daniel.vetter@ffwll.ch>  2013-01-17 16:23:36 -0500
commit    ed5982e6ce5f106abcbf071f80730db344a6da42 (patch)
tree      3669b5e3640209cdf6ebfb3200dfd4947777dff3
parent    bcffc3faa692d6b2ef734e4f0c8f097175284db6 (diff)
drm/i915: Allow userspace to hint that the relocations were known
Userspace is able to hint to the kernel that its command stream and auxiliary
state buffers already hold the correct presumed addresses and so the
relocation process may be skipped if the kernel does not need to move any
buffers in preparation for the execbuffer. Thus for the common case where the
allotment of buffers is static between batches, we can avoid the overhead of
individually checking the relocation entries.

Note that this requires userspace to supply the domain tracking and requests
for workarounds itself that would otherwise be computed based upon the
relocation entries.

Using copywinwin10 as an example that is dependent upon emitting a lot of
relocations (2 per operation), we see improvements of:

  c2d/gm45: 618000.0/sec to 632000.0/sec
  i3-330m:  748000.0/sec to 830000.0/sec

(measured relative to a baseline with neither optimisations applied)

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Imre Deak <imre.deak@intel.com>
[danvet: Fixup merge conflict in userspace header due to different baseline
trees.]
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
-rw-r--r--  drivers/gpu/drm/i915/i915_dma.c             3
-rw-r--r--  drivers/gpu/drm/i915/i915_gem_execbuffer.c  68
-rw-r--r--  include/uapi/drm/i915_drm.h                 14
3 files changed, 62 insertions, 23 deletions
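As a rough illustration of how userspace consumes this interface (a hedged
sketch, not code from this patch: the fd, buffer handles, presumed offset and
helper name are hypothetical, while the ioctls, flags and struct fields are
the ones defined in i915_drm.h), the caller first probes
I915_PARAM_HAS_EXEC_NO_RELOC, then submits with I915_EXEC_NO_RELOC set,
supplying the presumed offsets and the write-domain hint that the kernel
would otherwise derive from the relocation entries:

/* Hypothetical sketch: assumes a valid drm fd, buffer handles obtained
 * elsewhere, and the libdrm headers (xf86drm.h, i915_drm.h) installed. */
#include <stdint.h>
#include <string.h>
#include <xf86drm.h>
#include <i915_drm.h>

static int exec_no_reloc(int fd, __u32 batch_handle, __u32 target_handle,
			 __u64 presumed_target_offset, __u32 batch_len)
{
	struct drm_i915_getparam gp;
	struct drm_i915_gem_exec_object2 objects[2];
	struct drm_i915_gem_execbuffer2 execbuf;
	int has_no_reloc = 0;

	/* Feature check: older kernels reject unknown execbuffer flags. */
	memset(&gp, 0, sizeof(gp));
	gp.param = I915_PARAM_HAS_EXEC_NO_RELOC;
	gp.value = &has_no_reloc;
	if (drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp) || !has_no_reloc)
		return -1; /* fall back to the relocation path */

	memset(objects, 0, sizeof(objects));
	/*
	 * A production driver would still populate relocs_ptr and
	 * relocation_count so the kernel can fall back to processing the
	 * relocations if any buffer had to be moved; this sketch assumes
	 * the buffer placement is static between batches.
	 */
	objects[0].handle = target_handle;
	objects[0].offset = presumed_target_offset;
	objects[0].flags = EXEC_OBJECT_WRITE;	/* domain hint supplied by us */
	objects[1].handle = batch_handle;	/* batch comes last, as usual */

	memset(&execbuf, 0, sizeof(execbuf));
	execbuf.buffers_ptr = (uintptr_t)objects;
	execbuf.buffer_count = 2;
	execbuf.batch_len = batch_len;
	execbuf.flags = I915_EXEC_RENDER | I915_EXEC_NO_RELOC;

	return drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
}

If a buffer does have to move, the kernel flips need_relocs back on and walks
whatever relocation entries userspace supplied, so the hint only pays off (and
is only safe to rely on alone) when the presumed addresses really are current.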
diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 6d8a1dc74934..a6e047d533ec 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -992,6 +992,9 @@ static int i915_getparam(struct drm_device *dev, void *data,
 	case I915_PARAM_HAS_PINNED_BATCHES:
 		value = 1;
 		break;
+	case I915_PARAM_HAS_EXEC_NO_RELOC:
+		value = 1;
+		break;
 	default:
 		DRM_DEBUG_DRIVER("Unknown parameter %d\n",
 				 param->param);
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 386677f8fd38..34f6cdffa9f8 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -373,7 +373,8 @@ need_reloc_mappable(struct drm_i915_gem_object *obj)
 
 static int
 i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
-				   struct intel_ring_buffer *ring)
+				   struct intel_ring_buffer *ring,
+				   bool *need_reloc)
 {
 	struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
 	struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
@@ -414,7 +415,20 @@ i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
 		obj->has_aliasing_ppgtt_mapping = 1;
 	}
 
-	entry->offset = obj->gtt_offset;
+	if (entry->offset != obj->gtt_offset) {
+		entry->offset = obj->gtt_offset;
+		*need_reloc = true;
+	}
+
+	if (entry->flags & EXEC_OBJECT_WRITE) {
+		obj->base.pending_read_domains = I915_GEM_DOMAIN_RENDER;
+		obj->base.pending_write_domain = I915_GEM_DOMAIN_RENDER;
+	}
+
+	if (entry->flags & EXEC_OBJECT_NEEDS_GTT &&
+	    !obj->has_global_gtt_mapping)
+		i915_gem_gtt_bind_object(obj, obj->cache_level);
+
 	return 0;
 }
 
420 434
@@ -440,7 +454,8 @@ i915_gem_execbuffer_unreserve_object(struct drm_i915_gem_object *obj)
 static int
 i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
 			    struct drm_file *file,
-			    struct list_head *objects)
+			    struct list_head *objects,
+			    bool *need_relocs)
 {
 	struct drm_i915_gem_object *obj;
 	struct list_head ordered_objects;
@@ -468,7 +483,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
 		else
 			list_move_tail(&obj->exec_list, &ordered_objects);
 
-		obj->base.pending_read_domains = 0;
+		obj->base.pending_read_domains = I915_GEM_GPU_DOMAINS & ~I915_GEM_DOMAIN_COMMAND;
 		obj->base.pending_write_domain = 0;
 		obj->pending_fenced_gpu_access = false;
 	}
@@ -508,7 +523,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
 		    (need_mappable && !obj->map_and_fenceable))
 			ret = i915_gem_object_unbind(obj);
 		else
-			ret = i915_gem_execbuffer_reserve_object(obj, ring);
+			ret = i915_gem_execbuffer_reserve_object(obj, ring, need_relocs);
 		if (ret)
 			goto err;
 	}
@@ -518,7 +533,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
 		if (obj->gtt_space)
 			continue;
 
-		ret = i915_gem_execbuffer_reserve_object(obj, ring);
+		ret = i915_gem_execbuffer_reserve_object(obj, ring, need_relocs);
 		if (ret)
 			goto err;
 	}
@@ -538,16 +553,18 @@ err: /* Decrement pin count for bound objects */
 
 static int
 i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
+				  struct drm_i915_gem_execbuffer2 *args,
 				  struct drm_file *file,
 				  struct intel_ring_buffer *ring,
 				  struct eb_objects *eb,
-				  struct drm_i915_gem_exec_object2 *exec,
-				  int count)
+				  struct drm_i915_gem_exec_object2 *exec)
 {
 	struct drm_i915_gem_relocation_entry *reloc;
 	struct drm_i915_gem_object *obj;
+	bool need_relocs;
 	int *reloc_offset;
 	int i, total, ret;
+	int count = args->buffer_count;
 
 	/* We may process another execbuffer during the unlock... */
 	while (!list_empty(&eb->objects)) {
@@ -602,7 +619,8 @@ i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
 	if (ret)
 		goto err;
 
-	ret = i915_gem_execbuffer_reserve(ring, file, &eb->objects);
+	need_relocs = (args->flags & I915_EXEC_NO_RELOC) == 0;
+	ret = i915_gem_execbuffer_reserve(ring, file, &eb->objects, &need_relocs);
 	if (ret)
 		goto err;
 
@@ -660,6 +678,9 @@ i915_gem_execbuffer_move_to_gpu(struct intel_ring_buffer *ring,
 static bool
 i915_gem_check_execbuffer(struct drm_i915_gem_execbuffer2 *exec)
 {
+	if (exec->flags & __I915_EXEC_UNKNOWN_FLAGS)
+		return false;
+
 	return ((exec->batch_start_offset | exec->batch_len) & 0x7) == 0;
 }
 
@@ -673,6 +694,9 @@ validate_exec_list(struct drm_i915_gem_exec_object2 *exec,
 		char __user *ptr = (char __user *)(uintptr_t)exec[i].relocs_ptr;
 		int length; /* limited by fault_in_pages_readable() */
 
+		if (exec[i].flags & __EXEC_OBJECT_UNKNOWN_FLAGS)
+			return -EINVAL;
+
 		/* First check for malicious input causing overflow */
 		if (exec[i].relocation_count >
 		    INT_MAX / sizeof(struct drm_i915_gem_relocation_entry))
@@ -680,9 +704,6 @@ validate_exec_list(struct drm_i915_gem_exec_object2 *exec,
 
 		length = exec[i].relocation_count *
 			sizeof(struct drm_i915_gem_relocation_entry);
-		if (!access_ok(VERIFY_READ, ptr, length))
-			return -EFAULT;
-
 		/* we may also need to update the presumed offsets */
 		if (!access_ok(VERIFY_WRITE, ptr, length))
 			return -EFAULT;
@@ -704,8 +725,10 @@ i915_gem_execbuffer_move_to_active(struct list_head *objects,
 		u32 old_read = obj->base.read_domains;
 		u32 old_write = obj->base.write_domain;
 
-		obj->base.read_domains = obj->base.pending_read_domains;
 		obj->base.write_domain = obj->base.pending_write_domain;
+		if (obj->base.write_domain == 0)
+			obj->base.pending_read_domains |= obj->base.read_domains;
+		obj->base.read_domains = obj->base.pending_read_domains;
 		obj->fenced_gpu_access = obj->pending_fenced_gpu_access;
 
 		i915_gem_object_move_to_active(obj, ring);
@@ -770,14 +793,12 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	struct intel_ring_buffer *ring;
 	u32 ctx_id = i915_execbuffer2_get_context_id(*args);
 	u32 exec_start, exec_len;
-	u32 mask;
-	u32 flags;
+	u32 mask, flags;
 	int ret, mode, i;
+	bool need_relocs;
 
-	if (!i915_gem_check_execbuffer(args)) {
-		DRM_DEBUG("execbuf with invalid offset/length\n");
+	if (!i915_gem_check_execbuffer(args))
 		return -EINVAL;
-	}
 
 	ret = validate_exec_list(exec, args->buffer_count);
 	if (ret)
@@ -916,17 +937,18 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 					       exec_list);
 
 	/* Move the objects en-masse into the GTT, evicting if necessary. */
-	ret = i915_gem_execbuffer_reserve(ring, file, &eb->objects);
+	need_relocs = (args->flags & I915_EXEC_NO_RELOC) == 0;
+	ret = i915_gem_execbuffer_reserve(ring, file, &eb->objects, &need_relocs);
 	if (ret)
 		goto err;
 
 	/* The objects are in their final locations, apply the relocations. */
-	ret = i915_gem_execbuffer_relocate(dev, eb);
+	if (need_relocs)
+		ret = i915_gem_execbuffer_relocate(dev, eb);
 	if (ret) {
 		if (ret == -EFAULT) {
-			ret = i915_gem_execbuffer_relocate_slow(dev, file, ring,
-								eb, exec,
-								args->buffer_count);
+			ret = i915_gem_execbuffer_relocate_slow(dev, args, file, ring,
+								eb, exec);
 			BUG_ON(!mutex_is_locked(&dev->struct_mutex));
 		}
 		if (ret)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index c4d2e9c74002..2430b6ad6a85 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -308,6 +308,7 @@ typedef struct drm_i915_irq_wait {
 #define I915_PARAM_RSVD_FOR_FUTURE_USE	 22
 #define I915_PARAM_HAS_SECURE_BATCHES	 23
 #define I915_PARAM_HAS_PINNED_BATCHES	 24
+#define I915_PARAM_HAS_EXEC_NO_RELOC	 25
 
 typedef struct drm_i915_getparam {
 	int param;
@@ -628,7 +629,11 @@ struct drm_i915_gem_exec_object2 {
 	__u64 offset;
 
 #define EXEC_OBJECT_NEEDS_FENCE (1<<0)
+#define EXEC_OBJECT_NEEDS_GTT	(1<<1)
+#define EXEC_OBJECT_WRITE	(1<<2)
+#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_WRITE<<1)
 	__u64 flags;
+
 	__u64 rsvd1;
 	__u64 rsvd2;
 };
@@ -687,6 +692,15 @@ struct drm_i915_gem_execbuffer2 {
  */
 #define I915_EXEC_IS_PINNED		(1<<10)
 
+/** Provide a hint to the kernel that the command stream and auxiliary
+ * state buffers already hold the correct presumed addresses and so the
+ * relocation process may be skipped if no buffers need to be moved in
+ * preparation for the execbuffer.
+ */
+#define I915_EXEC_NO_RELOC		(1<<11)
+
+#define __I915_EXEC_UNKNOWN_FLAGS -(I915_EXEC_NO_RELOC<<1)
+
 #define I915_EXEC_CONTEXT_ID_MASK	(0xffffffff)
 #define i915_execbuffer2_set_context_id(eb2, context) \
 	(eb2).rsvd1 = context & I915_EXEC_CONTEXT_ID_MASK
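One detail worth calling out in the header change above: __EXEC_OBJECT_UNKNOWN_FLAGS
and __I915_EXEC_UNKNOWN_FLAGS rely on two's-complement negation, where -(LAST_FLAG<<1)
sets every bit at or above the first undefined position, so unknown flags can be
rejected with a single AND. A small standalone illustration (the defines mirror the
header for demonstration only; the test values are made up):

#include <assert.h>
#include <stdint.h>

/* Mirrors the header's exec-object flag definitions for illustration. */
#define EXEC_OBJECT_NEEDS_FENCE (1<<0)
#define EXEC_OBJECT_NEEDS_GTT	(1<<1)
#define EXEC_OBJECT_WRITE	(1<<2)
/* -(x) == ~(x - 1) in two's complement, so -(EXEC_OBJECT_WRITE<<1) is a mask
 * of every bit from bit 3 upward. */
#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_WRITE<<1)

int main(void)
{
	uint64_t known = EXEC_OBJECT_NEEDS_FENCE | EXEC_OBJECT_NEEDS_GTT |
			 EXEC_OBJECT_WRITE;

	/* Every defined flag passes the check... */
	assert((known & __EXEC_OBJECT_UNKNOWN_FLAGS) == 0);
	/* ...while any higher, not-yet-defined bit is caught. */
	assert(((uint64_t)1 << 3) & __EXEC_OBJECT_UNKNOWN_FLAGS);
	return 0;
}

The same construction keeps working as new flags are added: only the single
-(NEWEST_FLAG<<1) definition needs to move.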