aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm
diff options
context:
space:
mode:
authorDaniel Vetter <daniel.vetter@ffwll.ch>2012-11-15 11:17:22 -0500
committerDaniel Vetter <daniel.vetter@ffwll.ch>2013-01-20 07:11:16 -0500
commit1f83fee08d625f8d0130f9fe5ef7b17c2e022f3c (patch)
treef876ef636a0e99c2afd8d20db81789e50d259439 /drivers/gpu/drm
parent308887aad14c4ecc3fc10a3c58ec42641c5e4423 (diff)
drm/i915: clear up wedged transitions
We have two important transitions of the wedged state in the current code: - 0 -> 1: This means a hang has been detected, and signals to everyone that they please get of any locks, so that the reset work item can do its job. - 1 -> 0: The reset handler has completed. Now the last transition mixes up two states: "Reset completed and successful" and "Reset failed". To distinguish these two we do some tricks with the reset completion, but I simply could not convince myself that this doesn't race under odd circumstances. Hence split this up, and add a new terminal state indicating that the hw is gone for good. Also add explicit #defines for both states, update comments. v2: Split out the reset handling bugfix for the throttle ioctl. v3: s/tmp/wedged/ sugested by Chris Wilson. Also fixup up a rebase error which prevented this patch from actually compiling. v4: To unify the wedged state with the reset counter, keep the reset-in-progress state just as a flag. The terminally-wedged state is now denoted with a big number. v5: Add a comment to the reset_counter special values explaining that WEDGED & RESET_IN_PROGRESS needs to be true for the code to be correct. v6: Fixup logic errors introduced with the wedged+reset_counter unification. Since WEDGED implies reset-in-progress (in a way we're terminally stuck in the dead-but-reset-not-completed state), we need ensure that we check for this everywhere. The specific bug was in wait_for_error, which would simply have timed out. v7: Extract an inline i915_reset_in_progress helper to make the code more readable. Also annote the reset-in-progress case with an unlikely, to help the compiler optimize the fastpath. Do the same for the terminally wedged case with i915_terminally_wedged. Reviewed-by: Damien Lespiau <damien.lespiau@intel.com> Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Diffstat (limited to 'drivers/gpu/drm')
-rw-r--r--drivers/gpu/drm/i915/i915_debugfs.c2
-rw-r--r--drivers/gpu/drm/i915/i915_drv.h40
-rw-r--r--drivers/gpu/drm/i915/i915_gem.c49
-rw-r--r--drivers/gpu/drm/i915/i915_irq.c23
-rw-r--r--drivers/gpu/drm/i915/intel_display.c4
5 files changed, 74 insertions, 44 deletions
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index e1b7eaf60d24..384f19368a1d 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1672,7 +1672,7 @@ i915_wedged_read(struct file *filp,
1672 1672
1673 len = snprintf(buf, sizeof(buf), 1673 len = snprintf(buf, sizeof(buf),
1674 "wedged : %d\n", 1674 "wedged : %d\n",
1675 atomic_read(&dev_priv->gpu_error.wedged)); 1675 atomic_read(&dev_priv->gpu_error.reset_counter));
1676 1676
1677 if (len > sizeof(buf)) 1677 if (len > sizeof(buf))
1678 len = sizeof(buf); 1678 len = sizeof(buf);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 62da6c7a4884..c84743bb6937 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -771,11 +771,37 @@ struct i915_gpu_error {
771 /* Protected by the above dev->gpu_error.lock. */ 771 /* Protected by the above dev->gpu_error.lock. */
772 struct drm_i915_error_state *first_error; 772 struct drm_i915_error_state *first_error;
773 struct work_struct work; 773 struct work_struct work;
774 struct completion completion;
775 774
776 unsigned long last_reset; 775 unsigned long last_reset;
777 776
778 atomic_t wedged; 777 /**
778 * State variable controlling the reset flow
779 *
780 * Upper bits are for the reset counter.
781 *
782 * Lowest bit controls the reset state machine: Set means a reset is in
783 * progress. This state will (presuming we don't have any bugs) decay
784 * into either unset (successful reset) or the special WEDGED value (hw
785 * terminally sour). All waiters on the reset_queue will be woken when
786 * that happens.
787 */
788 atomic_t reset_counter;
789
790 /**
791 * Special values/flags for reset_counter
792 *
793 * Note that the code relies on
794 * I915_WEDGED & I915_RESET_IN_PROGRESS_FLAG
795 * being true.
796 */
797#define I915_RESET_IN_PROGRESS_FLAG 1
798#define I915_WEDGED 0xffffffff
799
800 /**
801 * Waitqueue to signal when the reset has completed. Used by clients
802 * that wait for dev_priv->mm.wedged to settle.
803 */
804 wait_queue_head_t reset_queue;
779 805
780 /* For gpu hang simulation. */ 806 /* For gpu hang simulation. */
781 unsigned int stop_rings; 807 unsigned int stop_rings;
@@ -1543,6 +1569,16 @@ void i915_gem_retire_requests(struct drm_device *dev);
1543void i915_gem_retire_requests_ring(struct intel_ring_buffer *ring); 1569void i915_gem_retire_requests_ring(struct intel_ring_buffer *ring);
1544int __must_check i915_gem_check_wedge(struct i915_gpu_error *error, 1570int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
1545 bool interruptible); 1571 bool interruptible);
1572static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
1573{
1574 return unlikely(atomic_read(&error->reset_counter)
1575 & I915_RESET_IN_PROGRESS_FLAG);
1576}
1577
1578static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
1579{
1580 return atomic_read(&error->reset_counter) == I915_WEDGED;
1581}
1546 1582
1547void i915_gem_reset(struct drm_device *dev); 1583void i915_gem_reset(struct drm_device *dev);
1548void i915_gem_clflush_object(struct drm_i915_gem_object *obj); 1584void i915_gem_clflush_object(struct drm_i915_gem_object *obj);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index c96a501b8205..2ca901194824 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -89,36 +89,32 @@ static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
89static int 89static int
90i915_gem_wait_for_error(struct i915_gpu_error *error) 90i915_gem_wait_for_error(struct i915_gpu_error *error)
91{ 91{
92 struct completion *x = &error->completion;
93 unsigned long flags;
94 int ret; 92 int ret;
95 93
96 if (!atomic_read(&error->wedged)) 94#define EXIT_COND (!i915_reset_in_progress(error))
95 if (EXIT_COND)
97 return 0; 96 return 0;
98 97
98 /* GPU is already declared terminally dead, give up. */
99 if (i915_terminally_wedged(error))
100 return -EIO;
101
99 /* 102 /*
100 * Only wait 10 seconds for the gpu reset to complete to avoid hanging 103 * Only wait 10 seconds for the gpu reset to complete to avoid hanging
101 * userspace. If it takes that long something really bad is going on and 104 * userspace. If it takes that long something really bad is going on and
102 * we should simply try to bail out and fail as gracefully as possible. 105 * we should simply try to bail out and fail as gracefully as possible.
103 */ 106 */
104 ret = wait_for_completion_interruptible_timeout(x, 10*HZ); 107 ret = wait_event_interruptible_timeout(error->reset_queue,
108 EXIT_COND,
109 10*HZ);
105 if (ret == 0) { 110 if (ret == 0) {
106 DRM_ERROR("Timed out waiting for the gpu reset to complete\n"); 111 DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
107 return -EIO; 112 return -EIO;
108 } else if (ret < 0) { 113 } else if (ret < 0) {
109 return ret; 114 return ret;
110 } 115 }
116#undef EXIT_COND
111 117
112 if (atomic_read(&error->wedged)) {
113 /* GPU is hung, bump the completion count to account for
114 * the token we just consumed so that we never hit zero and
115 * end up waiting upon a subsequent completion event that
116 * will never happen.
117 */
118 spin_lock_irqsave(&x->wait.lock, flags);
119 x->done++;
120 spin_unlock_irqrestore(&x->wait.lock, flags);
121 }
122 return 0; 118 return 0;
123} 119}
124 120
@@ -942,23 +938,14 @@ int
942i915_gem_check_wedge(struct i915_gpu_error *error, 938i915_gem_check_wedge(struct i915_gpu_error *error,
943 bool interruptible) 939 bool interruptible)
944{ 940{
945 if (atomic_read(&error->wedged)) { 941 if (i915_reset_in_progress(error)) {
946 struct completion *x = &error->completion;
947 bool recovery_complete;
948 unsigned long flags;
949
950 /* Give the error handler a chance to run. */
951 spin_lock_irqsave(&x->wait.lock, flags);
952 recovery_complete = x->done > 0;
953 spin_unlock_irqrestore(&x->wait.lock, flags);
954
955 /* Non-interruptible callers can't handle -EAGAIN, hence return 942 /* Non-interruptible callers can't handle -EAGAIN, hence return
956 * -EIO unconditionally for these. */ 943 * -EIO unconditionally for these. */
957 if (!interruptible) 944 if (!interruptible)
958 return -EIO; 945 return -EIO;
959 946
960 /* Recovery complete, but still wedged means reset failure. */ 947 /* Recovery complete, but the reset failed ... */
961 if (recovery_complete) 948 if (i915_terminally_wedged(error))
962 return -EIO; 949 return -EIO;
963 950
964 return -EAGAIN; 951 return -EAGAIN;
@@ -1025,7 +1012,7 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
1025 1012
1026#define EXIT_COND \ 1013#define EXIT_COND \
1027 (i915_seqno_passed(ring->get_seqno(ring, false), seqno) || \ 1014 (i915_seqno_passed(ring->get_seqno(ring, false), seqno) || \
1028 atomic_read(&dev_priv->gpu_error.wedged)) 1015 i915_reset_in_progress(&dev_priv->gpu_error))
1029 do { 1016 do {
1030 if (interruptible) 1017 if (interruptible)
1031 end = wait_event_interruptible_timeout(ring->irq_queue, 1018 end = wait_event_interruptible_timeout(ring->irq_queue,
@@ -1379,7 +1366,7 @@ out:
1379 /* If this -EIO is due to a gpu hang, give the reset code a 1366 /* If this -EIO is due to a gpu hang, give the reset code a
1380 * chance to clean up the mess. Otherwise return the proper 1367 * chance to clean up the mess. Otherwise return the proper
1381 * SIGBUS. */ 1368 * SIGBUS. */
1382 if (!atomic_read(&dev_priv->gpu_error.wedged)) 1369 if (i915_terminally_wedged(&dev_priv->gpu_error))
1383 return VM_FAULT_SIGBUS; 1370 return VM_FAULT_SIGBUS;
1384 case -EAGAIN: 1371 case -EAGAIN:
1385 /* Give the error handler a chance to run and move the 1372 /* Give the error handler a chance to run and move the
@@ -3983,9 +3970,9 @@ i915_gem_entervt_ioctl(struct drm_device *dev, void *data,
3983 if (drm_core_check_feature(dev, DRIVER_MODESET)) 3970 if (drm_core_check_feature(dev, DRIVER_MODESET))
3984 return 0; 3971 return 0;
3985 3972
3986 if (atomic_read(&dev_priv->gpu_error.wedged)) { 3973 if (i915_reset_in_progress(&dev_priv->gpu_error)) {
3987 DRM_ERROR("Reenabling wedged hardware, good luck\n"); 3974 DRM_ERROR("Reenabling wedged hardware, good luck\n");
3988 atomic_set(&dev_priv->gpu_error.wedged, 0); 3975 atomic_set(&dev_priv->gpu_error.reset_counter, 0);
3989 } 3976 }
3990 3977
3991 mutex_lock(&dev->struct_mutex); 3978 mutex_lock(&dev->struct_mutex);
@@ -4069,7 +4056,7 @@ i915_gem_load(struct drm_device *dev)
4069 INIT_LIST_HEAD(&dev_priv->fence_regs[i].lru_list); 4056 INIT_LIST_HEAD(&dev_priv->fence_regs[i].lru_list);
4070 INIT_DELAYED_WORK(&dev_priv->mm.retire_work, 4057 INIT_DELAYED_WORK(&dev_priv->mm.retire_work,
4071 i915_gem_retire_work_handler); 4058 i915_gem_retire_work_handler);
4072 init_completion(&dev_priv->gpu_error.completion); 4059 init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
4073 4060
4074 /* On GEN3 we really need to make sure the ARB C3 LP bit is set */ 4061 /* On GEN3 we really need to make sure the ARB C3 LP bit is set */
4075 if (IS_GEN3(dev)) { 4062 if (IS_GEN3(dev)) {
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index f2c0016fa533..4562c5406ef8 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -862,8 +862,10 @@ done:
862 */ 862 */
863static void i915_error_work_func(struct work_struct *work) 863static void i915_error_work_func(struct work_struct *work)
864{ 864{
865 drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t, 865 struct i915_gpu_error *error = container_of(work, struct i915_gpu_error,
866 gpu_error.work); 866 work);
867 drm_i915_private_t *dev_priv = container_of(error, drm_i915_private_t,
868 gpu_error);
867 struct drm_device *dev = dev_priv->dev; 869 struct drm_device *dev = dev_priv->dev;
868 char *error_event[] = { "ERROR=1", NULL }; 870 char *error_event[] = { "ERROR=1", NULL };
869 char *reset_event[] = { "RESET=1", NULL }; 871 char *reset_event[] = { "RESET=1", NULL };
@@ -871,14 +873,18 @@ static void i915_error_work_func(struct work_struct *work)
871 873
872 kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, error_event); 874 kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, error_event);
873 875
874 if (atomic_read(&dev_priv->gpu_error.wedged)) { 876 if (i915_reset_in_progress(error)) {
875 DRM_DEBUG_DRIVER("resetting chip\n"); 877 DRM_DEBUG_DRIVER("resetting chip\n");
876 kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, reset_event); 878 kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, reset_event);
879
877 if (!i915_reset(dev)) { 880 if (!i915_reset(dev)) {
878 atomic_set(&dev_priv->gpu_error.wedged, 0); 881 atomic_set(&error->reset_counter, 0);
879 kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, reset_done_event); 882 kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, reset_done_event);
883 } else {
884 atomic_set(&error->reset_counter, I915_WEDGED);
880 } 885 }
881 complete_all(&dev_priv->gpu_error.completion); 886
887 wake_up_all(&dev_priv->gpu_error.reset_queue);
882 } 888 }
883} 889}
884 890
@@ -1482,11 +1488,12 @@ void i915_handle_error(struct drm_device *dev, bool wedged)
1482 i915_report_and_clear_eir(dev); 1488 i915_report_and_clear_eir(dev);
1483 1489
1484 if (wedged) { 1490 if (wedged) {
1485 INIT_COMPLETION(dev_priv->gpu_error.completion); 1491 atomic_set(&dev_priv->gpu_error.reset_counter,
1486 atomic_set(&dev_priv->gpu_error.wedged, 1); 1492 I915_RESET_IN_PROGRESS_FLAG);
1487 1493
1488 /* 1494 /*
1489 * Wakeup waiting processes so they don't hang 1495 * Wakeup waiting processes so that the reset work item
1496 * doesn't deadlock trying to grab various locks.
1490 */ 1497 */
1491 for_each_ring(ring, dev_priv, i) 1498 for_each_ring(ring, dev_priv, i)
1492 wake_up_all(&ring->irq_queue); 1499 wake_up_all(&ring->irq_queue);
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 160aa5f76118..77254460a5cb 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -2223,7 +2223,7 @@ intel_finish_fb(struct drm_framebuffer *old_fb)
2223 WARN_ON(waitqueue_active(&dev_priv->pending_flip_queue)); 2223 WARN_ON(waitqueue_active(&dev_priv->pending_flip_queue));
2224 2224
2225 wait_event(dev_priv->pending_flip_queue, 2225 wait_event(dev_priv->pending_flip_queue,
2226 atomic_read(&dev_priv->gpu_error.wedged) || 2226 i915_reset_in_progress(&dev_priv->gpu_error) ||
2227 atomic_read(&obj->pending_flip) == 0); 2227 atomic_read(&obj->pending_flip) == 0);
2228 2228
2229 /* Big Hammer, we also need to ensure that any pending 2229 /* Big Hammer, we also need to ensure that any pending
@@ -2871,7 +2871,7 @@ static bool intel_crtc_has_pending_flip(struct drm_crtc *crtc)
2871 unsigned long flags; 2871 unsigned long flags;
2872 bool pending; 2872 bool pending;
2873 2873
2874 if (atomic_read(&dev_priv->gpu_error.wedged)) 2874 if (i915_reset_in_progress(&dev_priv->gpu_error))
2875 return false; 2875 return false;
2876 2876
2877 spin_lock_irqsave(&dev->event_lock, flags); 2877 spin_lock_irqsave(&dev->event_lock, flags);