aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Chris Wilson <chris@chris-wilson.co.uk> 2018-01-29 09:41:04 -0500
committer: Rodrigo Vivi <rodrigo.vivi@intel.com> 2018-02-01 10:33:02 -0500
commit: b26a32a82a901f894c79d1ec8b1ac94dde83e83c (patch)
tree: 5cd5f88fb395901f4776a07c4b7dd024deb6404c
parent: b5a756a722286af9702d565501e1f690d075d16b (diff)
drm/i915: Always run hangcheck while the GPU is busy
Previously, we relied on only running the hangcheck while somebody was waiting on the GPU, in order to minimise the amount of time hangcheck had to run. (If nobody was watching the GPU, nobody would notice if the GPU wasn't responding -- eventually somebody would care and so kick hangcheck into action.) However, this falls apart from around commit 4680816be336 ("drm/i915: Wait first for submission, before waiting for request completion"), as not all waiters declare themselves to hangcheck and so we could switch off hangcheck and miss GPU hangs even when waiting under the struct_mutex. If we enable hangcheck from the first request submission, and let it run until the GPU is idle again, we forgo all the complexity involved with only enabling around waiters. We just have to remember to be careful that we do not declare a GPU hang when idly waiting for the next request to become ready, as we will run hangcheck continuously even when the engines are stalled waiting for external events. This should be true already as we should only be tracking requests submitted to hardware for execution as an indicator that the engine is busy. Fixes: 4680816be336 ("drm/i915: Wait first for submission, before waiting for request completion") Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104840 Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20180129144104.3921-1-chris@chris-wilson.co.uk Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com> (cherry picked from commit 889230489b6b138ba97ba2f13fc9644a3d16d0d2) Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
-rw-r--r-- drivers/gpu/drm/i915/i915_gem.c 7
-rw-r--r-- drivers/gpu/drm/i915/i915_gem_request.c 2
-rw-r--r-- drivers/gpu/drm/i915/intel_breadcrumbs.c 11
-rw-r--r-- drivers/gpu/drm/i915/intel_hangcheck.c 7
4 files changed, 6 insertions, 21 deletions
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 1135a77b383a..dd89abd2263d 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3323,16 +3323,15 @@ i915_gem_retire_work_handler(struct work_struct *work)
3323 mutex_unlock(&dev->struct_mutex); 3323 mutex_unlock(&dev->struct_mutex);
3324 } 3324 }
3325 3325
3326 /* Keep the retire handler running until we are finally idle. 3326 /*
3327 * Keep the retire handler running until we are finally idle.
3327 * We do not need to do this test under locking as in the worst-case 3328 * We do not need to do this test under locking as in the worst-case
3328 * we queue the retire worker once too often. 3329 * we queue the retire worker once too often.
3329 */ 3330 */
3330 if (READ_ONCE(dev_priv->gt.awake)) { 3331 if (READ_ONCE(dev_priv->gt.awake))
3331 i915_queue_hangcheck(dev_priv);
3332 queue_delayed_work(dev_priv->wq, 3332 queue_delayed_work(dev_priv->wq,
3333 &dev_priv->gt.retire_work, 3333 &dev_priv->gt.retire_work,
3334 round_jiffies_up_relative(HZ)); 3334 round_jiffies_up_relative(HZ));
3335 }
3336} 3335}
3337 3336
3338static inline bool 3337static inline bool
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index d575109f7a7f..e09d18df8b7f 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -276,6 +276,8 @@ static void mark_busy(struct drm_i915_private *i915)
276 276
277 intel_engines_unpark(i915); 277 intel_engines_unpark(i915);
278 278
279 i915_queue_hangcheck(i915);
280
279 queue_delayed_work(i915->wq, 281 queue_delayed_work(i915->wq,
280 &i915->gt.retire_work, 282 &i915->gt.retire_work,
281 round_jiffies_up_relative(HZ)); 283 round_jiffies_up_relative(HZ));
diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
index 58c624f982d9..bd40fea16b4f 100644
--- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
@@ -149,17 +149,6 @@ static void intel_breadcrumbs_fake_irq(struct timer_list *t)
149 return; 149 return;
150 150
151 mod_timer(&b->fake_irq, jiffies + 1); 151 mod_timer(&b->fake_irq, jiffies + 1);
152
153 /* Ensure that even if the GPU hangs, we get woken up.
154 *
155 * However, note that if no one is waiting, we never notice
156 * a gpu hang. Eventually, we will have to wait for a resource
157 * held by the GPU and so trigger a hangcheck. In the most
158 * pathological case, this will be upon memory starvation! To
159 * prevent this, we also queue the hangcheck from the retire
160 * worker.
161 */
162 i915_queue_hangcheck(engine->i915);
163} 152}
164 153
165static void irq_enable(struct intel_engine_cs *engine) 154static void irq_enable(struct intel_engine_cs *engine)
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 31f01d64c021..348a4f7ffb67 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -411,7 +411,6 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
411 struct intel_engine_cs *engine; 411 struct intel_engine_cs *engine;
412 enum intel_engine_id id; 412 enum intel_engine_id id;
413 unsigned int hung = 0, stuck = 0; 413 unsigned int hung = 0, stuck = 0;
414 int busy_count = 0;
415 414
416 if (!i915_modparams.enable_hangcheck) 415 if (!i915_modparams.enable_hangcheck)
417 return; 416 return;
@@ -429,7 +428,6 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
429 intel_uncore_arm_unclaimed_mmio_detection(dev_priv); 428 intel_uncore_arm_unclaimed_mmio_detection(dev_priv);
430 429
431 for_each_engine(engine, dev_priv, id) { 430 for_each_engine(engine, dev_priv, id) {
432 const bool busy = intel_engine_has_waiter(engine);
433 struct intel_engine_hangcheck hc; 431 struct intel_engine_hangcheck hc;
434 432
435 semaphore_clear_deadlocks(dev_priv); 433 semaphore_clear_deadlocks(dev_priv);
@@ -443,16 +441,13 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
443 if (hc.action != ENGINE_DEAD) 441 if (hc.action != ENGINE_DEAD)
444 stuck |= intel_engine_flag(engine); 442 stuck |= intel_engine_flag(engine);
445 } 443 }
446
447 busy_count += busy;
448 } 444 }
449 445
450 if (hung) 446 if (hung)
451 hangcheck_declare_hang(dev_priv, hung, stuck); 447 hangcheck_declare_hang(dev_priv, hung, stuck);
452 448
453 /* Reset timer in case GPU hangs without another request being added */ 449 /* Reset timer in case GPU hangs without another request being added */
454 if (busy_count) 450 i915_queue_hangcheck(dev_priv);
455 i915_queue_hangcheck(dev_priv);
456} 451}
457 452
458void intel_engine_init_hangcheck(struct intel_engine_cs *engine) 453void intel_engine_init_hangcheck(struct intel_engine_cs *engine)