author		Daniel Vetter <daniel.vetter@ffwll.ch>		2012-11-15 11:17:22 -0500
committer	Daniel Vetter <daniel.vetter@ffwll.ch>		2013-01-20 07:11:16 -0500
commit		1f83fee08d625f8d0130f9fe5ef7b17c2e022f3c (patch)
tree		f876ef636a0e99c2afd8d20db81789e50d259439 /drivers/gpu
parent		308887aad14c4ecc3fc10a3c58ec42641c5e4423 (diff)
drm/i915: clear up wedged transitions
We have two important transitions of the wedged state in the current
code:
- 0 -> 1: This means a hang has been detected, and signals to everyone
  that they should get off any locks, so that the reset work item can
  do its job.
- 1 -> 0: The reset handler has completed.
Now the last transition mixes up two states: "Reset completed and
successful" and "Reset failed". To distinguish these two we do some
tricks with the reset completion, but I simply could not convince
myself that this doesn't race under odd circumstances.
Hence split this up, and add a new terminal state indicating that the
hw is gone for good.
Also add explicit #defines for both states, update comments.
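To make the encoding concrete, here is a minimal standalone sketch of
the state machine described above -- plain C, with the kernel's atomic_t
stood in by a plain integer; only the #define names and values are taken
from this patch, everything else is illustrative:

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Values as defined by this patch in i915_drv.h. */
  #define I915_RESET_IN_PROGRESS_FLAG 1
  #define I915_WEDGED                 0xffffffff

  /* Stand-in for atomic_read(&error->reset_counter); real code uses
   * atomic_t, this just models the transitions. */
  static uint32_t reset_counter;

  static int reset_in_progress(void)
  {
          return (reset_counter & I915_RESET_IN_PROGRESS_FLAG) != 0;
  }

  static int terminally_wedged(void)
  {
          return reset_counter == I915_WEDGED;
  }

  int main(void)
  {
          /* Hang detected: lowest bit set, everyone backs off locks. */
          reset_counter = I915_RESET_IN_PROGRESS_FLAG;
          assert(reset_in_progress() && !terminally_wedged());

          /* Reset completed and successful: flag decays to unset. */
          reset_counter = 0;
          assert(!reset_in_progress() && !terminally_wedged());

          /* Reset failed: terminal state. WEDGED deliberately keeps
           * the in-progress bit set (WEDGED & FLAG is true). */
          reset_counter = I915_WEDGED;
          assert(reset_in_progress() && terminally_wedged());

          puts("all transitions behave as described");
          return 0;
  }

Encoding WEDGED as all-ones rather than as a second flag bit is what
lets a "reset pending?" check stay a single masked read while still
implying the terminal state.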
v2: Split out the reset handling bugfix for the throttle ioctl.
v3: s/tmp/wedged/ suggested by Chris Wilson. Also fix up a rebase
error which prevented this patch from actually compiling.
v4: To unify the wedged state with the reset counter, keep the
reset-in-progress state just as a flag. The terminally-wedged state is
now denoted with a big number.
v5: Add a comment to the reset_counter special values explaining that
WEDGED & RESET_IN_PROGRESS needs to be true for the code to be
correct.
v6: Fixup logic errors introduced with the wedged+reset_counter
unification. Since WEDGED implies reset-in-progress (in a way we're
terminally stuck in the dead-but-reset-not-completed state), we need to
ensure that we check for this everywhere. The specific bug was in
wait_for_error, which would simply have timed out; see the sketch after
this changelog.
v7: Extract an inline i915_reset_in_progress helper to make the code
more readable. Also annotate the reset-in-progress case with an
unlikely, to help the compiler optimize the fastpath. Do the same for
the terminally wedged case with i915_terminally_wedged.
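A minimal userspace model of the v6 check ordering, with the actual
wait on reset_queue stubbed out as -EAGAIN (the names mirror this
patch; nothing here is real kernel code):

  #include <errno.h>
  #include <stdio.h>

  #define I915_RESET_IN_PROGRESS_FLAG 1
  #define I915_WEDGED                 0xffffffffu

  /* Models i915_gem_wait_for_error(): the terminal check must come
   * before blocking, because WEDGED keeps the in-progress bit set and
   * a wedged GPU would otherwise wait out the full 10s timeout. */
  static int wait_for_error(unsigned int reset_counter)
  {
          if (!(reset_counter & I915_RESET_IN_PROGRESS_FLAG))
                  return 0;       /* no reset pending, fast path */

          if (reset_counter == I915_WEDGED)
                  return -EIO;    /* terminally dead, give up now */

          /* real code: wait_event_interruptible_timeout(reset_queue, ...) */
          return -EAGAIN;         /* stub for "block until reset settles" */
  }

  int main(void)
  {
          printf("idle:   %d\n", wait_for_error(0));                           /* 0 */
          printf("reset:  %d\n", wait_for_error(I915_RESET_IN_PROGRESS_FLAG)); /* -EAGAIN */
          printf("wedged: %d\n", wait_for_error(I915_WEDGED));                 /* -EIO */
          return 0;
  }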
Reviewed-by: Damien Lespiau <damien.lespiau@intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Diffstat (limited to 'drivers/gpu')
 -rw-r--r--  drivers/gpu/drm/i915/i915_debugfs.c  |  2
 -rw-r--r--  drivers/gpu/drm/i915/i915_drv.h      | 40
 -rw-r--r--  drivers/gpu/drm/i915/i915_gem.c      | 49
 -rw-r--r--  drivers/gpu/drm/i915/i915_irq.c      | 23
 -rw-r--r--  drivers/gpu/drm/i915/intel_display.c |  4
 5 files changed, 74 insertions, 44 deletions
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index e1b7eaf60d24..384f19368a1d 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1672,7 +1672,7 @@ i915_wedged_read(struct file *filp,
 
 	len = snprintf(buf, sizeof(buf),
 		       "wedged : %d\n",
-		       atomic_read(&dev_priv->gpu_error.wedged));
+		       atomic_read(&dev_priv->gpu_error.reset_counter));
 
 	if (len > sizeof(buf))
 		len = sizeof(buf);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 62da6c7a4884..c84743bb6937 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -771,11 +771,37 @@ struct i915_gpu_error {
 	/* Protected by the above dev->gpu_error.lock. */
 	struct drm_i915_error_state *first_error;
 	struct work_struct work;
-	struct completion completion;
 
 	unsigned long last_reset;
 
-	atomic_t wedged;
+	/**
+	 * State variable controlling the reset flow
+	 *
+	 * Upper bits are for the reset counter.
+	 *
+	 * Lowest bit controls the reset state machine: Set means a reset is in
+	 * progress. This state will (presuming we don't have any bugs) decay
+	 * into either unset (successful reset) or the special WEDGED value (hw
+	 * terminally sour). All waiters on the reset_queue will be woken when
+	 * that happens.
+	 */
+	atomic_t reset_counter;
+
+	/**
+	 * Special values/flags for reset_counter
+	 *
+	 * Note that the code relies on
+	 * 	I915_WEDGED & I915_RESET_IN_PROGRESS_FLAG
+	 * being true.
+	 */
+#define I915_RESET_IN_PROGRESS_FLAG	1
+#define I915_WEDGED			0xffffffff
+
+	/**
+	 * Waitqueue to signal when the reset has completed. Used by clients
+	 * that wait for dev_priv->mm.wedged to settle.
+	 */
+	wait_queue_head_t reset_queue;
 
 	/* For gpu hang simulation. */
 	unsigned int stop_rings;
@@ -1543,6 +1569,16 @@ void i915_gem_retire_requests(struct drm_device *dev);
 void i915_gem_retire_requests_ring(struct intel_ring_buffer *ring);
 int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
 				      bool interruptible);
+static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
+{
+	return unlikely(atomic_read(&error->reset_counter)
+			& I915_RESET_IN_PROGRESS_FLAG);
+}
+
+static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
+{
+	return atomic_read(&error->reset_counter) == I915_WEDGED;
+}
 
 void i915_gem_reset(struct drm_device *dev);
 void i915_gem_clflush_object(struct drm_i915_gem_object *obj);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index c96a501b8205..2ca901194824 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -89,36 +89,32 @@ static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
 static int
 i915_gem_wait_for_error(struct i915_gpu_error *error)
 {
-	struct completion *x = &error->completion;
-	unsigned long flags;
 	int ret;
 
-	if (!atomic_read(&error->wedged))
+#define EXIT_COND (!i915_reset_in_progress(error))
+	if (EXIT_COND)
 		return 0;
 
+	/* GPU is already declared terminally dead, give up. */
+	if (i915_terminally_wedged(error))
+		return -EIO;
+
 	/*
 	 * Only wait 10 seconds for the gpu reset to complete to avoid hanging
 	 * userspace. If it takes that long something really bad is going on and
 	 * we should simply try to bail out and fail as gracefully as possible.
 	 */
-	ret = wait_for_completion_interruptible_timeout(x, 10*HZ);
+	ret = wait_event_interruptible_timeout(error->reset_queue,
+					       EXIT_COND,
+					       10*HZ);
 	if (ret == 0) {
 		DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
 		return -EIO;
 	} else if (ret < 0) {
 		return ret;
 	}
+#undef EXIT_COND
 
-	if (atomic_read(&error->wedged)) {
-		/* GPU is hung, bump the completion count to account for
-		 * the token we just consumed so that we never hit zero and
-		 * end up waiting upon a subsequent completion event that
-		 * will never happen.
-		 */
-		spin_lock_irqsave(&x->wait.lock, flags);
-		x->done++;
-		spin_unlock_irqrestore(&x->wait.lock, flags);
-	}
 	return 0;
 }
 
@@ -942,23 +938,14 @@ int
 i915_gem_check_wedge(struct i915_gpu_error *error,
 		     bool interruptible)
 {
-	if (atomic_read(&error->wedged)) {
-		struct completion *x = &error->completion;
-		bool recovery_complete;
-		unsigned long flags;
-
-		/* Give the error handler a chance to run. */
-		spin_lock_irqsave(&x->wait.lock, flags);
-		recovery_complete = x->done > 0;
-		spin_unlock_irqrestore(&x->wait.lock, flags);
-
+	if (i915_reset_in_progress(error)) {
 		/* Non-interruptible callers can't handle -EAGAIN, hence return
 		 * -EIO unconditionally for these. */
 		if (!interruptible)
 			return -EIO;
 
-		/* Recovery complete, but still wedged means reset failure. */
-		if (recovery_complete)
+		/* Recovery complete, but the reset failed ... */
+		if (i915_terminally_wedged(error))
 			return -EIO;
 
 		return -EAGAIN;
@@ -1025,7 +1012,7 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
 
 #define EXIT_COND \
 	(i915_seqno_passed(ring->get_seqno(ring, false), seqno) || \
-	 atomic_read(&dev_priv->gpu_error.wedged))
+	 i915_reset_in_progress(&dev_priv->gpu_error))
 	do {
 		if (interruptible)
 			end = wait_event_interruptible_timeout(ring->irq_queue,
@@ -1379,7 +1366,7 @@ out:
 	/* If this -EIO is due to a gpu hang, give the reset code a
 	 * chance to clean up the mess. Otherwise return the proper
 	 * SIGBUS. */
-	if (!atomic_read(&dev_priv->gpu_error.wedged))
+	if (i915_terminally_wedged(&dev_priv->gpu_error))
 		return VM_FAULT_SIGBUS;
 	case -EAGAIN:
 		/* Give the error handler a chance to run and move the
@@ -3983,9 +3970,9 @@ i915_gem_entervt_ioctl(struct drm_device *dev, void *data,
 	if (drm_core_check_feature(dev, DRIVER_MODESET))
 		return 0;
 
-	if (atomic_read(&dev_priv->gpu_error.wedged)) {
+	if (i915_reset_in_progress(&dev_priv->gpu_error)) {
 		DRM_ERROR("Reenabling wedged hardware, good luck\n");
-		atomic_set(&dev_priv->gpu_error.wedged, 0);
+		atomic_set(&dev_priv->gpu_error.reset_counter, 0);
 	}
 
 	mutex_lock(&dev->struct_mutex);
@@ -4069,7 +4056,7 @@ i915_gem_load(struct drm_device *dev)
 		INIT_LIST_HEAD(&dev_priv->fence_regs[i].lru_list);
 	INIT_DELAYED_WORK(&dev_priv->mm.retire_work,
 			  i915_gem_retire_work_handler);
-	init_completion(&dev_priv->gpu_error.completion);
+	init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
 
 	/* On GEN3 we really need to make sure the ARB C3 LP bit is set */
 	if (IS_GEN3(dev)) {
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index f2c0016fa533..4562c5406ef8 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -862,8 +862,10 @@ done:
  */
 static void i915_error_work_func(struct work_struct *work)
 {
-	drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t,
-						    gpu_error.work);
+	struct i915_gpu_error *error = container_of(work, struct i915_gpu_error,
+						    work);
+	drm_i915_private_t *dev_priv = container_of(error, drm_i915_private_t,
+						    gpu_error);
 	struct drm_device *dev = dev_priv->dev;
 	char *error_event[] = { "ERROR=1", NULL };
 	char *reset_event[] = { "RESET=1", NULL };
@@ -871,14 +873,18 @@ static void i915_error_work_func(struct work_struct *work)
 
 	kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, error_event);
 
-	if (atomic_read(&dev_priv->gpu_error.wedged)) {
+	if (i915_reset_in_progress(error)) {
 		DRM_DEBUG_DRIVER("resetting chip\n");
 		kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, reset_event);
+
 		if (!i915_reset(dev)) {
-			atomic_set(&dev_priv->gpu_error.wedged, 0);
+			atomic_set(&error->reset_counter, 0);
 			kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, reset_done_event);
+		} else {
+			atomic_set(&error->reset_counter, I915_WEDGED);
 		}
-		complete_all(&dev_priv->gpu_error.completion);
+
+		wake_up_all(&dev_priv->gpu_error.reset_queue);
 	}
 }
 
@@ -1482,11 +1488,12 @@ void i915_handle_error(struct drm_device *dev, bool wedged)
 	i915_report_and_clear_eir(dev);
 
 	if (wedged) {
-		INIT_COMPLETION(dev_priv->gpu_error.completion);
-		atomic_set(&dev_priv->gpu_error.wedged, 1);
+		atomic_set(&dev_priv->gpu_error.reset_counter,
+			   I915_RESET_IN_PROGRESS_FLAG);
 
 		/*
-		 * Wakeup waiting processes so they don't hang
+		 * Wakeup waiting processes so that the reset work item
+		 * doesn't deadlock trying to grab various locks.
 		 */
 		for_each_ring(ring, dev_priv, i)
 			wake_up_all(&ring->irq_queue);
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 160aa5f76118..77254460a5cb 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -2223,7 +2223,7 @@ intel_finish_fb(struct drm_framebuffer *old_fb)
 	WARN_ON(waitqueue_active(&dev_priv->pending_flip_queue));
 
 	wait_event(dev_priv->pending_flip_queue,
-		   atomic_read(&dev_priv->gpu_error.wedged) ||
+		   i915_reset_in_progress(&dev_priv->gpu_error) ||
		   atomic_read(&obj->pending_flip) == 0);
 
 	/* Big Hammer, we also need to ensure that any pending
@@ -2871,7 +2871,7 @@ static bool intel_crtc_has_pending_flip(struct drm_crtc *crtc)
 	unsigned long flags;
 	bool pending;
 
-	if (atomic_read(&dev_priv->gpu_error.wedged))
+	if (i915_reset_in_progress(&dev_priv->gpu_error))
 		return false;
 
 	spin_lock_irqsave(&dev->event_lock, flags);