aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Mika Kuoppala <mika.kuoppala@linux.intel.com> 2013-05-30 02:04:29 -0400
committer Daniel Vetter <daniel.vetter@ffwll.ch> 2013-06-03 04:58:21 -0400
commit 05407ff889ceebe383aa5907219f86582ef96b72 (patch)
tree d93841848ec8d135566a085809eb0eb4c7fa85d7
parent 35c20a60c7549b11fd1d8c5d5d7ab5b6b54d6ff9 (diff)
drm/i915: detect hang using per ring hangcheck_score
Keep track of ring seqno progress and, if no progress is detected, declare a hang. Use the actual head (acthd) to distinguish between a stuck ring and a looping batchbuffer. A stuck ring will be kicked to trigger progress.

This commit adds a hard limit for batchbuffer completion time. If batchbuffer completion time is more than 4.5 seconds, the gpu will be declared hung.

Review comment from Ben which nicely clarifies the semantic change:

"Maybe I'm just stating the functional changes of the patch, but in case they were unintended here is what I see as potential issues:

1. If ring B is waiting on ring A via semaphore, and ring A is making progress, albeit slowly - the hangcheck will fire. The check will determine that A is moving, however ring B will appear hung because the ACTHD doesn't move. I honestly can't say if that's actually a realistic problem to hit; it probably implies the timeout value is too low.

2. There's also another corner case on the kick. If the seqno = 2 (though not stuck), and on the 3rd hangcheck, the ring is stuck, and we try to kick it... we don't actually try to find out if the kick helped."

v2: use acthd to detect stuck ring from loop (Ben Widawsky)

v3: Use acthd to check when ring needs kicking. Declare hang on third time in order to give time for kick_ring to take effect.

v4: Update commit msg

Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
[danvet: Paste in Ben's review comment.]
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
-rw-r--r-- drivers/gpu/drm/i915/i915_irq.c 80
-rw-r--r-- drivers/gpu/drm/i915/intel_ringbuffer.h 2
2 files changed, 49 insertions, 33 deletions
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 5ae5ca8854d4..e88f173d6b33 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -683,7 +683,6 @@ static void notify_ring(struct drm_device *dev,
683 683
684 wake_up_all(&ring->irq_queue); 684 wake_up_all(&ring->irq_queue);
685 if (i915_enable_hangcheck) { 685 if (i915_enable_hangcheck) {
686 dev_priv->gpu_error.hangcheck_count = 0;
687 mod_timer(&dev_priv->gpu_error.hangcheck_timer, 686 mod_timer(&dev_priv->gpu_error.hangcheck_timer,
688 round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES)); 687 round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
689 } 688 }
@@ -2422,61 +2421,76 @@ static bool i915_hangcheck_hung(struct drm_device *dev)
2422 2421
2423/** 2422/**
2424 * This is called when the chip hasn't reported back with completed 2423 * This is called when the chip hasn't reported back with completed
2425 * batchbuffers in a long time. The first time this is called we simply record 2424 * batchbuffers in a long time. We keep track per ring seqno progress and
2426 * ACTHD. If ACTHD hasn't changed by the time the hangcheck timer elapses 2425 * if there are no progress, hangcheck score for that ring is increased.
2427 * again, we assume the chip is wedged and try to fix it. 2426 * Further, acthd is inspected to see if the ring is stuck. On stuck case
2427 * we kick the ring. If we see no progress on three subsequent calls
2428 * we assume chip is wedged and try to fix it by resetting the chip.
2428 */ 2429 */
2429void i915_hangcheck_elapsed(unsigned long data) 2430void i915_hangcheck_elapsed(unsigned long data)
2430{ 2431{
2431 struct drm_device *dev = (struct drm_device *)data; 2432 struct drm_device *dev = (struct drm_device *)data;
2432 drm_i915_private_t *dev_priv = dev->dev_private; 2433 drm_i915_private_t *dev_priv = dev->dev_private;
2433 struct intel_ring_buffer *ring; 2434 struct intel_ring_buffer *ring;
2434 bool err = false, idle;
2435 int i; 2435 int i;
2436 u32 seqno[I915_NUM_RINGS]; 2436 int busy_count = 0, rings_hung = 0;
2437 bool work_done; 2437 bool stuck[I915_NUM_RINGS];
2438 2438
2439 if (!i915_enable_hangcheck) 2439 if (!i915_enable_hangcheck)
2440 return; 2440 return;
2441 2441
2442 idle = true;
2443 for_each_ring(ring, dev_priv, i) { 2442 for_each_ring(ring, dev_priv, i) {
2444 seqno[i] = ring->get_seqno(ring, false); 2443 u32 seqno, acthd;
2445 idle &= i915_hangcheck_ring_idle(ring, seqno[i], &err); 2444 bool idle, err = false;
2446 } 2445
2446 seqno = ring->get_seqno(ring, false);
2447 acthd = intel_ring_get_active_head(ring);
2448 idle = i915_hangcheck_ring_idle(ring, seqno, &err);
2449 stuck[i] = ring->hangcheck.acthd == acthd;
2450
2451 if (idle) {
2452 if (err)
2453 ring->hangcheck.score += 2;
2454 else
2455 ring->hangcheck.score = 0;
2456 } else {
2457 busy_count++;
2447 2458
2448 /* If all work is done then ACTHD clearly hasn't advanced. */ 2459 if (ring->hangcheck.seqno == seqno) {
2449 if (idle) { 2460 ring->hangcheck.score++;
2450 if (err) {
2451 if (i915_hangcheck_hung(dev))
2452 return;
2453 2461
2454 goto repeat; 2462 /* Kick ring if stuck*/
2463 if (stuck[i])
2464 i915_hangcheck_ring_hung(ring);
2465 } else {
2466 ring->hangcheck.score = 0;
2467 }
2455 } 2468 }
2456 2469
2457 dev_priv->gpu_error.hangcheck_count = 0; 2470 ring->hangcheck.seqno = seqno;
2458 return; 2471 ring->hangcheck.acthd = acthd;
2459 } 2472 }
2460 2473
2461 work_done = false;
2462 for_each_ring(ring, dev_priv, i) { 2474 for_each_ring(ring, dev_priv, i) {
2463 if (ring->hangcheck.seqno != seqno[i]) { 2475 if (ring->hangcheck.score > 2) {
2464 work_done = true; 2476 rings_hung++;
2465 ring->hangcheck.seqno = seqno[i]; 2477 DRM_ERROR("%s: %s on %s 0x%x\n", ring->name,
2478 stuck[i] ? "stuck" : "no progress",
2479 stuck[i] ? "addr" : "seqno",
2480 stuck[i] ? ring->hangcheck.acthd & HEAD_ADDR :
2481 ring->hangcheck.seqno);
2466 } 2482 }
2467 } 2483 }
2468 2484
2469 if (!work_done) { 2485 if (rings_hung)
2470 if (i915_hangcheck_hung(dev)) 2486 return i915_handle_error(dev, true);
2471 return;
2472 } else {
2473 dev_priv->gpu_error.hangcheck_count = 0;
2474 }
2475 2487
2476repeat: 2488 if (busy_count)
2477 /* Reset timer case chip hangs without another request being added */ 2489 /* Reset timer case chip hangs without another request
2478 mod_timer(&dev_priv->gpu_error.hangcheck_timer, 2490 * being added */
2479 round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES)); 2491 mod_timer(&dev_priv->gpu_error.hangcheck_timer,
2492 round_jiffies_up(jiffies +
2493 DRM_I915_HANGCHECK_JIFFIES));
2480} 2494}
2481 2495
2482/* drm_dma.h hooks 2496/* drm_dma.h hooks
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 022d07e43d12..4c7e103e6fa4 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -39,6 +39,8 @@ struct intel_hw_status_page {
39 39
40struct intel_ring_hangcheck { 40struct intel_ring_hangcheck {
41 u32 seqno; 41 u32 seqno;
42 u32 acthd;
43 int score;
42}; 44};
43 45
44struct intel_ring_buffer { 46struct intel_ring_buffer {