From 3bb73aba1ed5198a2c1dfaac4f3c95459930d84a Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 20 Jul 2012 12:40:59 +0100 Subject: drm/i915: Allow late allocation of request for i915_add_request() Request preallocation was added to i915_add_request() in order to support the overlay. However, not all users care and can quite happily ignore the failure to allocate the request as they will simply repeat the request in the future. By pushing the allocation down into i915_add_request(), we can then remove some rather ugly error handling in the callers. v2: Nullify request->file_priv otherwise we chase a garbage pointer when retiring requests. Signed-off-by: Chris Wilson Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'drivers/gpu/drm/i915/i915_gem_execbuffer.c') diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 5af631e788c8..f94ec574db2b 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -972,16 +972,11 @@ i915_gem_execbuffer_retire_commands(struct drm_device *dev, struct drm_file *file, struct intel_ring_buffer *ring) { - struct drm_i915_gem_request *request; - /* Unconditionally force add_request to emit a full flush. */ ring->gpu_caches_dirty = true; /* Add a breadcrumb for the completion of the batch buffer */ - request = kzalloc(sizeof(*request), GFP_KERNEL); - if (request == NULL || i915_add_request(ring, file, request)) { - kfree(request); - } + (void)i915_add_request(ring, file, NULL); } static int -- cgit v1.2.2 From 0201f1ecf4b81f08799b1fb9c8cdf1125b9b78a6 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 20 Jul 2012 12:41:01 +0100 Subject: drm/i915: Replace the pending_gpu_write flag with an explicit seqno As we always flush the GPU cache prior to emitting the breadcrumb, we no longer have to worry about the deferred flush causing the pending_gpu_write to be delayed. So we can instead utilize the known last_write_seqno to hopefully minimise the wait times. Signed-off-by: Chris Wilson Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/i915/i915_gem_execbuffer.c') diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index f94ec574db2b..2353e6ee2f0d 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -954,7 +954,7 @@ i915_gem_execbuffer_move_to_active(struct list_head *objects, i915_gem_object_move_to_active(obj, ring, seqno); if (obj->base.write_domain) { obj->dirty = 1; - obj->pending_gpu_write = true; + obj->last_write_seqno = seqno; list_move_tail(&obj->gpu_write_list, &ring->gpu_write_list); if (obj->pin_count) /* check for potential scanout */ -- cgit v1.2.2 From 69c2fc891343cb5217c866d10709343cff190bdc Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 20 Jul 2012 12:41:03 +0100 Subject: drm/i915: Remove the per-ring write list This is now handled by a global flag to ensure we emit a flush before the next serialisation point (if we failed to queue one previously). Signed-off-by: Chris Wilson Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'drivers/gpu/drm/i915/i915_gem_execbuffer.c') diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 2353e6ee2f0d..36c940c1a978 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -943,9 +943,8 @@ i915_gem_execbuffer_move_to_active(struct list_head *objects, struct drm_i915_gem_object *obj; list_for_each_entry(obj, objects, exec_list) { - u32 old_read = obj->base.read_domains; - u32 old_write = obj->base.write_domain; - + u32 old_read = obj->base.read_domains; + u32 old_write = obj->base.write_domain; obj->base.read_domains = obj->base.pending_read_domains; obj->base.write_domain = obj->base.pending_write_domain; @@ -955,8 +954,6 @@ i915_gem_execbuffer_move_to_active(struct list_head *objects, if (obj->base.write_domain) { obj->dirty = 1; obj->last_write_seqno = seqno; - list_move_tail(&obj->gpu_write_list, - &ring->gpu_write_list); if (obj->pin_count) /* check for potential scanout */ intel_mark_busy(ring->dev, obj); } -- cgit v1.2.2 From 6ac42f4148bc27e5ffd18a9ab0eac57f58822af4 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Sat, 21 Jul 2012 12:25:01 +0200 Subject: drm/i915: Replace the complex flushing logic with simple invalidate/flush all Now that we unconditionally flush and invalidate between every batch buffer, we no longer need the complex logic to decide which domains require flushing. Remove it and rejoice. v2 (danvet): Keep around the flip waiting logic. It's gross and broken, I know, but we can't just kill that thing ... even if we just keep it around as a reminder that things are broken. Signed-off-by: Chris Wilson Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 222 +++-------------------------- 1 file changed, 20 insertions(+), 202 deletions(-) (limited to 'drivers/gpu/drm/i915/i915_gem_execbuffer.c') diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 36c940c1a978..6c810798de92 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -34,180 +34,6 @@ #include "intel_drv.h" #include -struct change_domains { - uint32_t invalidate_domains; - uint32_t flush_domains; - uint32_t flush_rings; - uint32_t flips; -}; - -/* - * Set the next domain for the specified object. This - * may not actually perform the necessary flushing/invaliding though, - * as that may want to be batched with other set_domain operations - * - * This is (we hope) the only really tricky part of gem. The goal - * is fairly simple -- track which caches hold bits of the object - * and make sure they remain coherent. A few concrete examples may - * help to explain how it works. For shorthand, we use the notation - * (read_domains, write_domain), e.g. (CPU, CPU) to indicate the - * a pair of read and write domain masks. - * - * Case 1: the batch buffer - * - * 1. Allocated - * 2. Written by CPU - * 3. Mapped to GTT - * 4. Read by GPU - * 5. Unmapped from GTT - * 6. Freed - * - * Let's take these a step at a time - * - * 1. Allocated - * Pages allocated from the kernel may still have - * cache contents, so we set them to (CPU, CPU) always. - * 2. Written by CPU (using pwrite) - * The pwrite function calls set_domain (CPU, CPU) and - * this function does nothing (as nothing changes) - * 3. Mapped by GTT - * This function asserts that the object is not - * currently in any GPU-based read or write domains - * 4. Read by GPU - * i915_gem_execbuffer calls set_domain (COMMAND, 0). - * As write_domain is zero, this function adds in the - * current read domains (CPU+COMMAND, 0). - * flush_domains is set to CPU. - * invalidate_domains is set to COMMAND - * clflush is run to get data out of the CPU caches - * then i915_dev_set_domain calls i915_gem_flush to - * emit an MI_FLUSH and drm_agp_chipset_flush - * 5. Unmapped from GTT - * i915_gem_object_unbind calls set_domain (CPU, CPU) - * flush_domains and invalidate_domains end up both zero - * so no flushing/invalidating happens - * 6. Freed - * yay, done - * - * Case 2: The shared render buffer - * - * 1. Allocated - * 2. Mapped to GTT - * 3. Read/written by GPU - * 4. set_domain to (CPU,CPU) - * 5. Read/written by CPU - * 6. Read/written by GPU - * - * 1. Allocated - * Same as last example, (CPU, CPU) - * 2. Mapped to GTT - * Nothing changes (assertions find that it is not in the GPU) - * 3. Read/written by GPU - * execbuffer calls set_domain (RENDER, RENDER) - * flush_domains gets CPU - * invalidate_domains gets GPU - * clflush (obj) - * MI_FLUSH and drm_agp_chipset_flush - * 4. set_domain (CPU, CPU) - * flush_domains gets GPU - * invalidate_domains gets CPU - * wait_rendering (obj) to make sure all drawing is complete. - * This will include an MI_FLUSH to get the data from GPU - * to memory - * clflush (obj) to invalidate the CPU cache - * Another MI_FLUSH in i915_gem_flush (eliminate this somehow?) - * 5. Read/written by CPU - * cache lines are loaded and dirtied - * 6. Read written by GPU - * Same as last GPU access - * - * Case 3: The constant buffer - * - * 1. Allocated - * 2. Written by CPU - * 3. Read by GPU - * 4. Updated (written) by CPU again - * 5. Read by GPU - * - * 1. Allocated - * (CPU, CPU) - * 2. Written by CPU - * (CPU, CPU) - * 3. Read by GPU - * (CPU+RENDER, 0) - * flush_domains = CPU - * invalidate_domains = RENDER - * clflush (obj) - * MI_FLUSH - * drm_agp_chipset_flush - * 4. Updated (written) by CPU again - * (CPU, CPU) - * flush_domains = 0 (no previous write domain) - * invalidate_domains = 0 (no new read domains) - * 5. Read by GPU - * (CPU+RENDER, 0) - * flush_domains = CPU - * invalidate_domains = RENDER - * clflush (obj) - * MI_FLUSH - * drm_agp_chipset_flush - */ -static void -i915_gem_object_set_to_gpu_domain(struct drm_i915_gem_object *obj, - struct intel_ring_buffer *ring, - struct change_domains *cd) -{ - uint32_t invalidate_domains = 0, flush_domains = 0; - - /* - * If the object isn't moving to a new write domain, - * let the object stay in multiple read domains - */ - if (obj->base.pending_write_domain == 0) - obj->base.pending_read_domains |= obj->base.read_domains; - - /* - * Flush the current write domain if - * the new read domains don't match. Invalidate - * any read domains which differ from the old - * write domain - */ - if (obj->base.write_domain && - (((obj->base.write_domain != obj->base.pending_read_domains || - obj->ring != ring)) || - (obj->fenced_gpu_access && !obj->pending_fenced_gpu_access))) { - flush_domains |= obj->base.write_domain; - invalidate_domains |= - obj->base.pending_read_domains & ~obj->base.write_domain; - } - /* - * Invalidate any read caches which may have - * stale data. That is, any new read domains. - */ - invalidate_domains |= obj->base.pending_read_domains & ~obj->base.read_domains; - if ((flush_domains | invalidate_domains) & I915_GEM_DOMAIN_CPU) - i915_gem_clflush_object(obj); - - if (obj->base.pending_write_domain) - cd->flips |= atomic_read(&obj->pending_flip); - - /* The actual obj->write_domain will be updated with - * pending_write_domain after we emit the accumulated flush for all - * of our domain changes in execbuffers (which clears objects' - * write_domains). So if we have a current write domain that we - * aren't changing, set pending_write_domain to that. - */ - if (flush_domains == 0 && obj->base.pending_write_domain == 0) - obj->base.pending_write_domain = obj->base.write_domain; - - cd->invalidate_domains |= invalidate_domains; - cd->flush_domains |= flush_domains; - if (flush_domains & I915_GEM_GPU_DOMAINS) - cd->flush_rings |= intel_ring_flag(obj->ring); - if (invalidate_domains & I915_GEM_GPU_DOMAINS) - cd->flush_rings |= intel_ring_flag(ring); -} - struct eb_objects { int and; struct hlist_head buckets[0]; @@ -810,18 +636,6 @@ err: return ret; } -static void -i915_gem_execbuffer_flush(struct drm_device *dev, - uint32_t invalidate_domains, - uint32_t flush_domains) -{ - if (flush_domains & I915_GEM_DOMAIN_CPU) - intel_gtt_chipset_flush(); - - if (flush_domains & I915_GEM_DOMAIN_GTT) - wmb(); -} - static int i915_gem_execbuffer_wait_for_flips(struct intel_ring_buffer *ring, u32 flips) { @@ -854,37 +668,41 @@ i915_gem_execbuffer_wait_for_flips(struct intel_ring_buffer *ring, u32 flips) return 0; } - static int i915_gem_execbuffer_move_to_gpu(struct intel_ring_buffer *ring, struct list_head *objects) { struct drm_i915_gem_object *obj; - struct change_domains cd; + uint32_t flush_domains = 0; + uint32_t flips = 0; int ret; - memset(&cd, 0, sizeof(cd)); - list_for_each_entry(obj, objects, exec_list) - i915_gem_object_set_to_gpu_domain(obj, ring, &cd); - - if (cd.invalidate_domains | cd.flush_domains) { - i915_gem_execbuffer_flush(ring->dev, - cd.invalidate_domains, - cd.flush_domains); - } - - if (cd.flips) { - ret = i915_gem_execbuffer_wait_for_flips(ring, cd.flips); + list_for_each_entry(obj, objects, exec_list) { + ret = i915_gem_object_sync(obj, ring); if (ret) return ret; + + if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) + i915_gem_clflush_object(obj); + + if (obj->base.pending_write_domain) + flips |= atomic_read(&obj->pending_flip); + + flush_domains |= obj->base.write_domain; } - list_for_each_entry(obj, objects, exec_list) { - ret = i915_gem_object_sync(obj, ring); + if (flips) { + ret = i915_gem_execbuffer_wait_for_flips(ring, flips); if (ret) return ret; } + if (flush_domains & I915_GEM_DOMAIN_CPU) + intel_gtt_chipset_flush(); + + if (flush_domains & I915_GEM_DOMAIN_GTT) + wmb(); + /* Unconditionally invalidate gpu caches and ensure that we do flush * any residual writes from the previous batch. */ -- cgit v1.2.2 From 016fd0c1aee31902d82c1ac32312f1cc32298b66 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 20 Jul 2012 12:41:07 +0100 Subject: drm/i915: Clear the pending_gpu_fenced_access flag at the start of execbuffer Otherwise once we use the buffer with a BLT command on gen2/3, we will always regard future command submissions as continuing the fenced access. However, now that we flush/invalidate between every batch we can drop this pessimism. Signed-off-by: Chris Wilson Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/i915/i915_gem_execbuffer.c') diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 6c810798de92..55a94c1a1f59 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -413,6 +413,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring, obj->base.pending_read_domains = 0; obj->base.pending_write_domain = 0; + obj->pending_fenced_gpu_access = false; } list_splice(&ordered_objects, objects); -- cgit v1.2.2 From a7b9761d0a2ded58170ffb4d423ff3d7228103f4 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 20 Jul 2012 12:41:08 +0100 Subject: drm/i915: Split i915_gem_flush_ring() into seperate invalidate/flush funcs By moving the function to intel_ringbuffer and currying the appropriate parameter, hopefully we make the callsites easier to read and understand. Signed-off-by: Chris Wilson Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'drivers/gpu/drm/i915/i915_gem_execbuffer.c') diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 55a94c1a1f59..6be1a8920a84 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -707,14 +707,7 @@ i915_gem_execbuffer_move_to_gpu(struct intel_ring_buffer *ring, /* Unconditionally invalidate gpu caches and ensure that we do flush * any residual writes from the previous batch. */ - ret = i915_gem_flush_ring(ring, - I915_GEM_GPU_DOMAINS, - ring->gpu_caches_dirty ? I915_GEM_GPU_DOMAINS : 0); - if (ret) - return ret; - - ring->gpu_caches_dirty = false; - return 0; + return intel_ring_invalidate_all_caches(ring); } static bool -- cgit v1.2.2 From f047e395ddc9da6c307a10629a237502e627ed85 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Sat, 21 Jul 2012 12:31:41 +0100 Subject: drm/i915: Avoid concurrent access when marking the device as idle/busy As suggested by Daniel, rip out the independent timers for device and crtc busyness and integrate the manual powermanagement of the display engine into the GEM core and its request tracking. The benefits are that the code is a lot smaller, fewer moving parts and should fit more neatly into the overall activity tracking of the driver. v2: Complete overhaul and removal of the racy timers and workers. Signed-off-by: Chris Wilson Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/gpu/drm/i915/i915_gem_execbuffer.c') diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 6be1a8920a84..25b2c54e1261 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -767,13 +767,11 @@ i915_gem_execbuffer_move_to_active(struct list_head *objects, obj->dirty = 1; obj->last_write_seqno = seqno; if (obj->pin_count) /* check for potential scanout */ - intel_mark_busy(ring->dev, obj); + intel_mark_fb_busy(obj); } trace_i915_gem_object_change_domain(obj, old_read, old_write); } - - intel_mark_busy(ring->dev, NULL); } static void -- cgit v1.2.2 From 6c085a728cf000ac1865d66f8c9b52935558b328 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 20 Aug 2012 11:40:46 +0200 Subject: drm/i915: Track unbound pages When dealing with a working set larger than the GATT, or even the mappable aperture when touching through the GTT, we end up with evicting objects only to rebind them at a new offset again later. Moving an object into and out of the GTT requires clflushing the pages, thus causing a double-clflush penalty for rebinding. To avoid having to clflush on rebinding, we can track the pages as they are evicted from the GTT and only relinquish those pages on memory pressure. As usual, if it were not for the handling of out-of-memory condition and having to manually shrink our own bo caches, it would be a net reduction of code. Alas. Note: The patch also contains a few changes to the last-hope evict_everything logic in i916_gem_execbuffer.c - we no longer try to only evict the purgeable stuff in a first try (since that's superflous and only helps in OOM corner-cases, not fragmented-gtt trashing situations). Also, the extraction of the get_pages retry loop from bind_to_gtt (and other callsites) to get_pages should imo have been a separate patch. v2: Ditch the newly added put_pages (for unbound objects only) in i915_gem_reset. A quick irc discussion hasn't revealed any important reason for this, so if we need this, I'd like to have a git blame'able explanation for it. v3: Undo the s/drm_malloc_ab/kmalloc/ in get_pages that Chris noticed. Signed-off-by: Chris Wilson [danvet: Split out code movements and rant a bit in the commit message with a few Notes. Done v2] Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'drivers/gpu/drm/i915/i915_gem_execbuffer.c') diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index afb312ee050c..834a636b44f0 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -502,17 +502,12 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring, } } - if (ret != -ENOSPC || retry > 1) + if (ret != -ENOSPC || retry++) return ret; - /* First attempt, just clear anything that is purgeable. - * Second attempt, clear the entire GTT. - */ - ret = i915_gem_evict_everything(ring->dev, retry == 0); + ret = i915_gem_evict_everything(ring->dev); if (ret) return ret; - - retry++; } while (1); err: -- cgit v1.2.2 From 86a1ee26bb60e1ab8984e92f0e9186c354670aed Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Sat, 11 Aug 2012 15:41:04 +0100 Subject: drm/i915: Only pwrite through the GTT if there is space in the aperture Avoid stalling and waiting for the GPU by checking to see if there is sufficient inactive space in the aperture for us to bind the buffer prior to writing through the GTT. If there is inadequate space we will have to stall waiting for the GPU, and incur overheads moving objects about. Instead, only incur the clflush overhead on the target object by writing through shmem. Signed-off-by: Chris Wilson Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/i915/i915_gem_execbuffer.c') diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 834a636b44f0..f7346d876551 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -354,7 +354,7 @@ pin_and_fence_object(struct drm_i915_gem_object *obj, obj->tiling_mode != I915_TILING_NONE; need_mappable = need_fence || need_reloc_mappable(obj); - ret = i915_gem_object_pin(obj, entry->alignment, need_mappable); + ret = i915_gem_object_pin(obj, entry->alignment, need_mappable, false); if (ret) return ret; -- cgit v1.2.2 From 504c7267a1e84b157cbd7e9c1b805e1bc0c2c846 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 23 Aug 2012 13:12:52 +0100 Subject: drm/i915: Use cpu relocations if the object is in the GTT but not mappable This prevents the case of unbinding the object in order to process the relocations through the GTT and then rebinding it only to then proceed to use cpu relocations as the object is now in the CPU write domain. By choosing to use cpu relocations up front, we can therefore avoid the rebind penalty. Signed-off-by: Chris Wilson Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/i915/i915_gem_execbuffer.c') diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index f7346d876551..dc87563440f9 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -95,6 +95,7 @@ eb_destroy(struct eb_objects *eb) static inline int use_cpu_reloc(struct drm_i915_gem_object *obj) { return (obj->base.write_domain == I915_GEM_DOMAIN_CPU || + !obj->map_and_fenceable || obj->cache_level != I915_CACHE_NONE); } -- cgit v1.2.2 From 7788a765205f63abcb8645c16c85a968bd578f4f Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 24 Aug 2012 19:18:18 +0100 Subject: drm/i915: Avoid unbinding due to an interrupted pin_and_fence during execbuffer If we need to stall in order to complete the pin_and_fence operation during execbuffer reservation, there is a high likelihood that the operation will be interrupted by a signal (thanks X!). In order to simplify the cleanup along that error path, the object was unconditionally unbound and the error propagated. However, being interrupted here is far more common than I would like and so we can strive to avoid the extra work by eliminating the forced unbind. v2: In discussion over the indecent colour of the new functions and unwind path, we realised that we can use the new unreserve function to clean up the code even further. Signed-off-by: Chris Wilson Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 114 ++++++++++++----------------- 1 file changed, 45 insertions(+), 69 deletions(-) (limited to 'drivers/gpu/drm/i915/i915_gem_execbuffer.c') diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index dc87563440f9..e6b2205ecf6d 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -331,7 +331,8 @@ i915_gem_execbuffer_relocate(struct drm_device *dev, return ret; } -#define __EXEC_OBJECT_HAS_FENCE (1<<31) +#define __EXEC_OBJECT_HAS_PIN (1<<31) +#define __EXEC_OBJECT_HAS_FENCE (1<<30) static int need_reloc_mappable(struct drm_i915_gem_object *obj) @@ -341,9 +342,10 @@ need_reloc_mappable(struct drm_i915_gem_object *obj) } static int -pin_and_fence_object(struct drm_i915_gem_object *obj, - struct intel_ring_buffer *ring) +i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj, + struct intel_ring_buffer *ring) { + struct drm_i915_private *dev_priv = obj->base.dev->dev_private; struct drm_i915_gem_exec_object2 *entry = obj->exec_entry; bool has_fenced_gpu_access = INTEL_INFO(ring->dev)->gen < 4; bool need_fence, need_mappable; @@ -359,11 +361,13 @@ pin_and_fence_object(struct drm_i915_gem_object *obj, if (ret) return ret; + entry->flags |= __EXEC_OBJECT_HAS_PIN; + if (has_fenced_gpu_access) { if (entry->flags & EXEC_OBJECT_NEEDS_FENCE) { ret = i915_gem_object_get_fence(obj); if (ret) - goto err_unpin; + return ret; if (i915_gem_object_pin_fence(obj)) entry->flags |= __EXEC_OBJECT_HAS_FENCE; @@ -372,12 +376,35 @@ pin_and_fence_object(struct drm_i915_gem_object *obj, } } + /* Ensure ppgtt mapping exists if needed */ + if (dev_priv->mm.aliasing_ppgtt && !obj->has_aliasing_ppgtt_mapping) { + i915_ppgtt_bind_object(dev_priv->mm.aliasing_ppgtt, + obj, obj->cache_level); + + obj->has_aliasing_ppgtt_mapping = 1; + } + entry->offset = obj->gtt_offset; return 0; +} -err_unpin: - i915_gem_object_unpin(obj); - return ret; +static void +i915_gem_execbuffer_unreserve_object(struct drm_i915_gem_object *obj) +{ + struct drm_i915_gem_exec_object2 *entry; + + if (!obj->gtt_space) + return; + + entry = obj->exec_entry; + + if (entry->flags & __EXEC_OBJECT_HAS_FENCE) + i915_gem_object_unpin_fence(obj); + + if (entry->flags & __EXEC_OBJECT_HAS_PIN) + i915_gem_object_unpin(obj); + + entry->flags &= ~(__EXEC_OBJECT_HAS_FENCE | __EXEC_OBJECT_HAS_PIN); } static int @@ -385,11 +412,10 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring, struct drm_file *file, struct list_head *objects) { - drm_i915_private_t *dev_priv = ring->dev->dev_private; struct drm_i915_gem_object *obj; - int ret, retry; - bool has_fenced_gpu_access = INTEL_INFO(ring->dev)->gen < 4; struct list_head ordered_objects; + bool has_fenced_gpu_access = INTEL_INFO(ring->dev)->gen < 4; + int retry; INIT_LIST_HEAD(&ordered_objects); while (!list_empty(objects)) { @@ -427,12 +453,12 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring, * 2. Bind new objects. * 3. Decrement pin count. * - * This avoid unnecessary unbinding of later objects in order to makr + * This avoid unnecessary unbinding of later objects in order to make * room for the earlier objects *unless* we need to defragment. */ retry = 0; do { - ret = 0; + int ret = 0; /* Unbind any ill-fitting objects or pin. */ list_for_each_entry(obj, objects, exec_list) { @@ -452,7 +478,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring, (need_mappable && !obj->map_and_fenceable)) ret = i915_gem_object_unbind(obj); else - ret = pin_and_fence_object(obj, ring); + ret = i915_gem_execbuffer_reserve_object(obj, ring); if (ret) goto err; } @@ -462,46 +488,14 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring, if (obj->gtt_space) continue; - ret = pin_and_fence_object(obj, ring); - if (ret) { - int ret_ignore; - - /* This can potentially raise a harmless - * -EINVAL if we failed to bind in the above - * call. It cannot raise -EINTR since we know - * that the bo is freshly bound and so will - * not need to be flushed or waited upon. - */ - ret_ignore = i915_gem_object_unbind(obj); - (void)ret_ignore; - WARN_ON(obj->gtt_space); - break; - } + ret = i915_gem_execbuffer_reserve_object(obj, ring); + if (ret) + goto err; } - /* Decrement pin count for bound objects */ - list_for_each_entry(obj, objects, exec_list) { - struct drm_i915_gem_exec_object2 *entry; - - if (!obj->gtt_space) - continue; - - entry = obj->exec_entry; - if (entry->flags & __EXEC_OBJECT_HAS_FENCE) { - i915_gem_object_unpin_fence(obj); - entry->flags &= ~__EXEC_OBJECT_HAS_FENCE; - } - - i915_gem_object_unpin(obj); - - /* ... and ensure ppgtt mapping exist if needed. */ - if (dev_priv->mm.aliasing_ppgtt && !obj->has_aliasing_ppgtt_mapping) { - i915_ppgtt_bind_object(dev_priv->mm.aliasing_ppgtt, - obj, obj->cache_level); - - obj->has_aliasing_ppgtt_mapping = 1; - } - } +err: /* Decrement pin count for bound objects */ + list_for_each_entry(obj, objects, exec_list) + i915_gem_execbuffer_unreserve_object(obj); if (ret != -ENOSPC || retry++) return ret; @@ -510,24 +504,6 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring, if (ret) return ret; } while (1); - -err: - list_for_each_entry_continue_reverse(obj, objects, exec_list) { - struct drm_i915_gem_exec_object2 *entry; - - if (!obj->gtt_space) - continue; - - entry = obj->exec_entry; - if (entry->flags & __EXEC_OBJECT_HAS_FENCE) { - i915_gem_object_unpin_fence(obj); - entry->flags &= ~__EXEC_OBJECT_HAS_FENCE; - } - - i915_gem_object_unpin(obj); - } - - return ret; } static int -- cgit v1.2.2 From 9da3da660d8c19a54f6e93361d147509be3fff84 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 1 Jun 2012 15:20:22 +0100 Subject: drm/i915: Replace the array of pages with a scatterlist Rather than have multiple data structures for describing our page layout in conjunction with the array of pages, we can migrate all users over to a scatterlist. One major advantage, other than unifying the page tracking structures, this offers is that we replace the vmalloc'ed array (which can be up to a megabyte in size) with a chain of individual pages which helps reduce memory pressure. The disadvantage is that we then do not have a simple array to iterate, or to access randomly. The common case for this is in the relocation processing, which will typically fit within a single scatterlist page and so be almost the same cost as the simple array. For iterating over the array, the extra function call could be optimised away, but in reality is an insignificant cost of either binding the pages, or performing the pwrite/pread. v2: Fix drm_clflush_sg() to not invoke wbinvd as well! And fix the trivial compile error from rebasing. Signed-off-by: Chris Wilson Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/i915/i915_gem_execbuffer.c') diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index e6b2205ecf6d..4ab008397d60 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -210,7 +210,8 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj, if (ret) return ret; - vaddr = kmap_atomic(obj->pages[reloc->offset >> PAGE_SHIFT]); + vaddr = kmap_atomic(i915_gem_object_get_page(obj, + reloc->offset >> PAGE_SHIFT)); *(uint32_t *)(vaddr + page_offset) = reloc->delta; kunmap_atomic(vaddr); } else { -- cgit v1.2.2 From ba7a64587bcff8d85666a70b0a515d5fbcaa000c Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 14 Sep 2012 11:46:00 +0100 Subject: drm/i915: Drop the misleading cast to the wrong user pointer type The exec_list is of type drm_i915_gem_exec_object2 and so casting it to a drm_i915_gem_relocation_entry is very confusing! Signed-off-by: Chris Wilson Reviewed-by: Jani Nikula Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'drivers/gpu/drm/i915/i915_gem_execbuffer.c') diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 4ab008397d60..8186f63474f4 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -1102,8 +1102,7 @@ i915_gem_execbuffer(struct drm_device *dev, void *data, return -ENOMEM; } ret = copy_from_user(exec_list, - (struct drm_i915_relocation_entry __user *) - (uintptr_t) args->buffers_ptr, + (void __user *)(uintptr_t)args->buffers_ptr, sizeof(*exec_list) * args->buffer_count); if (ret != 0) { DRM_DEBUG("copy %d exec entries failed %d\n", @@ -1142,8 +1141,7 @@ i915_gem_execbuffer(struct drm_device *dev, void *data, for (i = 0; i < args->buffer_count; i++) exec_list[i].offset = exec2_list[i].offset; /* ... and back out to userspace */ - ret = copy_to_user((struct drm_i915_relocation_entry __user *) - (uintptr_t) args->buffers_ptr, + ret = copy_to_user((void __user *)(uintptr_t)args->buffers_ptr, exec_list, sizeof(*exec_list) * args->buffer_count); if (ret) { @@ -1197,8 +1195,7 @@ i915_gem_execbuffer2(struct drm_device *dev, void *data, ret = i915_gem_do_execbuffer(dev, data, file, args, exec2_list); if (!ret) { /* Copy the new buffer offsets back to the user's exec list. */ - ret = copy_to_user((struct drm_i915_relocation_entry __user *) - (uintptr_t) args->buffers_ptr, + ret = copy_to_user((void __user *)(uintptr_t)args->buffers_ptr, exec2_list, sizeof(*exec2_list) * args->buffer_count); if (ret) { -- cgit v1.2.2 From 41783eea1a7f7d033ace4fd78218dd5f37b2db5f Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 18 Sep 2012 10:04:02 +0100 Subject: drm/i915: Assert that the exec object lookup table is a power-of-two As we make the simplification of using a power-of-two size for the execbuffer handle-to-object TLB, we should validate that this is actually true and so clarify that premise. Signed-off-by: Chris Wilson Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/i915/i915_gem_execbuffer.c') diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 8186f63474f4..6a2f3e50c714 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -44,6 +44,7 @@ eb_create(int size) { struct eb_objects *eb; int count = PAGE_SIZE / sizeof(struct hlist_head) / 2; + BUILD_BUG_ON(!is_power_of_2(PAGE_SIZE / sizeof(struct hlist_head))); while (count > size) count >>= 1; eb = kzalloc(count*sizeof(struct hlist_head) + -- cgit v1.2.2