aboutsummaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
author Chris Wilson <chris@chris-wilson.co.uk> 2018-12-03 06:36:55 -0500
committer Chris Wilson <chris@chris-wilson.co.uk> 2018-12-04 06:26:33 -0500
commit 3800960afe158fa3d4c622774eaf59a9b3960a82 (patch)
tree 5831a4f236500f4b93778bfaa3026c18048231b6 /drivers
parent 0ce611c906bff3d6c3d72f3d5d8de79ea0490fa0 (diff)
drm/i915: Complete the fences as they are cancelled due to wedging
We inspect the requests under the assumption that they will be marked as completed when they are removed from the queue. Currently however, in the process of wedging the requests will be removed from the queue before they are completed, so rearrange the code to complete the fences before the locks are dropped.

<1>[ 354.473346] BUG: unable to handle kernel NULL pointer dereference at 0000000000000250
<6>[ 354.473363] PGD 0 P4D 0
<4>[ 354.473370] Oops: 0000 [#1] PREEMPT SMP PTI
<4>[ 354.473380] CPU: 0 PID: 4470 Comm: gem_eio Tainted: G U 4.20.0-rc4-CI-CI_DRM_5216+ #1
<4>[ 354.473393] Hardware name: Intel Corporation NUC7CJYH/NUC7JYB, BIOS JYGLKCPX.86A.0027.2018.0125.1347 01/25/2018
<4>[ 354.473480] RIP: 0010:__i915_schedule+0x311/0x5e0 [i915]
<4>[ 354.473490] Code: 49 89 44 24 20 4d 89 4c 24 28 4d 89 29 44 39 b3 a0 04 00 00 7d 3a 41 8b 44 24 78 85 c0 74 13 48 8b 93 78 04 00 00 48 83 e2 fc <39> 82 50 02 00 00 79 1e 44 89 b3 a0 04 00 00 48 8d bb d0 03 00 00
<4>[ 354.473515] RSP: 0018:ffffc900001bba90 EFLAGS: 00010046
<4>[ 354.473524] RAX: 0000000000000003 RBX: ffff8882624c8008 RCX: f34a737800000000
<4>[ 354.473535] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8882624c8048
<4>[ 354.473545] RBP: ffffc900001bbab0 R08: 000000005963f1f1 R09: 0000000000000000
<4>[ 354.473556] R10: ffffc900001bba10 R11: ffff8882624c8060 R12: ffff88824fdd7b98
<4>[ 354.473567] R13: ffff88824fdd7bb8 R14: 0000000000000001 R15: ffff88824fdd7750
<4>[ 354.473578] FS: 00007f44b4b5b980(0000) GS:ffff888277e00000(0000) knlGS:0000000000000000
<4>[ 354.473590] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
<4>[ 354.473599] CR2: 0000000000000250 CR3: 000000026976e000 CR4: 0000000000340ef0
<4>[ 354.473611] Call Trace:
<4>[ 354.473622] ? lock_acquire+0xa6/0x1c0
<4>[ 354.473677] ? i915_schedule_bump_priority+0x57/0xd0 [i915]
<4>[ 354.473736] i915_schedule_bump_priority+0x72/0xd0 [i915]
<4>[ 354.473792] i915_request_wait+0x4db/0x840 [i915]
<4>[ 354.473804] ? get_pwq.isra.4+0x2c/0x50
<4>[ 354.473813] ? ___preempt_schedule+0x16/0x18
<4>[ 354.473824] ? wake_up_q+0x70/0x70
<4>[ 354.473831] ? wake_up_q+0x70/0x70
<4>[ 354.473882] ? gen6_rps_boost+0x118/0x120 [i915]
<4>[ 354.473936] i915_gem_object_wait_fence+0x8a/0x110 [i915]
<4>[ 354.473991] i915_gem_object_wait+0x113/0x500 [i915]
<4>[ 354.474047] i915_gem_wait_ioctl+0x11c/0x2f0 [i915]
<4>[ 354.474101] ? i915_gem_unset_wedged+0x210/0x210 [i915]
<4>[ 354.474113] drm_ioctl_kernel+0x81/0xf0
<4>[ 354.474123] drm_ioctl+0x2de/0x390
<4>[ 354.474175] ? i915_gem_unset_wedged+0x210/0x210 [i915]
<4>[ 354.474187] ? finish_task_switch+0x95/0x260
<4>[ 354.474197] ? lock_acquire+0xa6/0x1c0
<4>[ 354.474207] do_vfs_ioctl+0xa0/0x6e0
<4>[ 354.474217] ? __fget+0xfc/0x1e0
<4>[ 354.474225] ksys_ioctl+0x35/0x60
<4>[ 354.474233] __x64_sys_ioctl+0x11/0x20
<4>[ 354.474241] do_syscall_64+0x55/0x190
<4>[ 354.474251] entry_SYSCALL_64_after_hwframe+0x49/0xbe
<4>[ 354.474260] RIP: 0033:0x7f44b3de65d7
<4>[ 354.474267] Code: b3 66 90 48 8b 05 b1 48 2d 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 81 48 2d 00 f7 d8 64 89 01 48
<4>[ 354.474293] RSP: 002b:00007fff974948e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
<4>[ 354.474305] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f44b3de65d7
<4>[ 354.474316] RDX: 00007fff97494940 RSI: 00000000c010646c RDI: 0000000000000007
<4>[ 354.474327] RBP: 00007fff97494940 R08: 0000000000000000 R09: 00007f44b40bbc40
<4>[ 354.474337] R10: 0000000000000000 R11: 0000000000000246 R12: 00000000c010646c
<4>[ 354.474348] R13: 0000000000000007 R14: 0000000000000000 R15: 0000000000000000

v2: Avoid floating requests.
v3: Can't call dma_fence_signal() under the timeline lock!
v4: Can't call dma_fence_signal() from inside another fence either.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20181203113701.12106-2-chris@chris-wilson.co.uk
Diffstat (limited to 'drivers')
-rw-r--r-- drivers/gpu/drm/i915/i915_gem.c | 54
-rw-r--r-- drivers/gpu/drm/i915/intel_lrc.c | 11
-rw-r--r-- drivers/gpu/drm/i915/intel_ringbuffer.c | 13
3 files changed, 30 insertions, 48 deletions
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index c55b1f75c980..834240a9b262 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3309,16 +3309,6 @@ void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
3309 3309
3310static void nop_submit_request(struct i915_request *request) 3310static void nop_submit_request(struct i915_request *request)
3311{ 3311{
3312 GEM_TRACE("%s fence %llx:%d -> -EIO\n",
3313 request->engine->name,
3314 request->fence.context, request->fence.seqno);
3315 dma_fence_set_error(&request->fence, -EIO);
3316
3317 i915_request_submit(request);
3318}
3319
3320static void nop_complete_submit_request(struct i915_request *request)
3321{
3322 unsigned long flags; 3312 unsigned long flags;
3323 3313
3324 GEM_TRACE("%s fence %llx:%d -> -EIO\n", 3314 GEM_TRACE("%s fence %llx:%d -> -EIO\n",
@@ -3354,57 +3344,33 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
3354 * rolling the global seqno forward (since this would complete requests 3344 * rolling the global seqno forward (since this would complete requests
3355 * for which we haven't set the fence error to EIO yet). 3345 * for which we haven't set the fence error to EIO yet).
3356 */ 3346 */
3357 for_each_engine(engine, i915, id) { 3347 for_each_engine(engine, i915, id)
3358 i915_gem_reset_prepare_engine(engine); 3348 i915_gem_reset_prepare_engine(engine);
3359 3349
3360 engine->submit_request = nop_submit_request;
3361 engine->schedule = NULL;
3362 }
3363 i915->caps.scheduler = 0;
3364
3365 /* Even if the GPU reset fails, it should still stop the engines */ 3350 /* Even if the GPU reset fails, it should still stop the engines */
3366 if (INTEL_GEN(i915) >= 5) 3351 if (INTEL_GEN(i915) >= 5)
3367 intel_gpu_reset(i915, ALL_ENGINES); 3352 intel_gpu_reset(i915, ALL_ENGINES);
3368 3353
3369 /*
3370 * Make sure no one is running the old callback before we proceed with
3371 * cancelling requests and resetting the completion tracking. Otherwise
3372 * we might submit a request to the hardware which never completes.
3373 */
3374 synchronize_rcu();
3375
3376 for_each_engine(engine, i915, id) { 3354 for_each_engine(engine, i915, id) {
3377 /* Mark all executing requests as skipped */ 3355 engine->submit_request = nop_submit_request;
3378 engine->cancel_requests(engine); 3356 engine->schedule = NULL;
3379
3380 /*
3381 * Only once we've force-cancelled all in-flight requests can we
3382 * start to complete all requests.
3383 */
3384 engine->submit_request = nop_complete_submit_request;
3385 } 3357 }
3358 i915->caps.scheduler = 0;
3386 3359
3387 /* 3360 /*
3388 * Make sure no request can slip through without getting completed by 3361 * Make sure no request can slip through without getting completed by
3389 * either this call here to intel_engine_init_global_seqno, or the one 3362 * either this call here to intel_engine_init_global_seqno, or the one
3390 * in nop_complete_submit_request. 3363 * in nop_submit_request.
3391 */ 3364 */
3392 synchronize_rcu(); 3365 synchronize_rcu();
3393 3366
3394 for_each_engine(engine, i915, id) { 3367 /* Mark all executing requests as skipped */
3395 unsigned long flags; 3368 for_each_engine(engine, i915, id)
3396 3369 engine->cancel_requests(engine);
3397 /*
3398 * Mark all pending requests as complete so that any concurrent
3399 * (lockless) lookup doesn't try and wait upon the request as we
3400 * reset it.
3401 */
3402 spin_lock_irqsave(&engine->timeline.lock, flags);
3403 intel_engine_init_global_seqno(engine,
3404 intel_engine_last_submit(engine));
3405 spin_unlock_irqrestore(&engine->timeline.lock, flags);
3406 3370
3371 for_each_engine(engine, i915, id) {
3407 i915_gem_reset_finish_engine(engine); 3372 i915_gem_reset_finish_engine(engine);
3373 intel_engine_wakeup(engine);
3408 } 3374 }
3409 3375
3410out: 3376out:
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 1f004683b777..87d42a2b9400 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -820,8 +820,11 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
820 /* Mark all executing requests as skipped. */ 820 /* Mark all executing requests as skipped. */
821 list_for_each_entry(rq, &engine->timeline.requests, link) { 821 list_for_each_entry(rq, &engine->timeline.requests, link) {
822 GEM_BUG_ON(!rq->global_seqno); 822 GEM_BUG_ON(!rq->global_seqno);
823 if (!i915_request_completed(rq)) 823
824 dma_fence_set_error(&rq->fence, -EIO); 824 if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags))
825 continue;
826
827 dma_fence_set_error(&rq->fence, -EIO);
825 } 828 }
826 829
827 /* Flush the queued requests to the timeline list (for retiring). */ 830 /* Flush the queued requests to the timeline list (for retiring). */
@@ -841,6 +844,10 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
841 kmem_cache_free(engine->i915->priorities, p); 844 kmem_cache_free(engine->i915->priorities, p);
842 } 845 }
843 846
847 intel_write_status_page(engine,
848 I915_GEM_HWS_INDEX,
849 intel_engine_last_submit(engine));
850
844 /* Remaining _unready_ requests will be nop'ed when submitted */ 851 /* Remaining _unready_ requests will be nop'ed when submitted */
845 852
846 execlists->queue_priority = INT_MIN; 853 execlists->queue_priority = INT_MIN;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index d81eaf5f6b3e..81b10d85b738 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -755,9 +755,18 @@ static void cancel_requests(struct intel_engine_cs *engine)
755 /* Mark all submitted requests as skipped. */ 755 /* Mark all submitted requests as skipped. */
756 list_for_each_entry(request, &engine->timeline.requests, link) { 756 list_for_each_entry(request, &engine->timeline.requests, link) {
757 GEM_BUG_ON(!request->global_seqno); 757 GEM_BUG_ON(!request->global_seqno);
758 if (!i915_request_completed(request)) 758
759 dma_fence_set_error(&request->fence, -EIO); 759 if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
760 &request->fence.flags))
761 continue;
762
763 dma_fence_set_error(&request->fence, -EIO);
760 } 764 }
765
766 intel_write_status_page(engine,
767 I915_GEM_HWS_INDEX,
768 intel_engine_last_submit(engine));
769
761 /* Remaining _unready_ requests will be nop'ed when submitted */ 770 /* Remaining _unready_ requests will be nop'ed when submitted */
762 771
763 spin_unlock_irqrestore(&engine->timeline.lock, flags); 772 spin_unlock_irqrestore(&engine->timeline.lock, flags);