author		Daniel Vetter <daniel.vetter@ffwll.ch>		2012-11-15 11:17:22 -0500
committer	Daniel Vetter <daniel.vetter@ffwll.ch>		2013-01-20 07:11:16 -0500
commit		1f83fee08d625f8d0130f9fe5ef7b17c2e022f3c (patch)
tree		f876ef636a0e99c2afd8d20db81789e50d259439 /drivers/gpu
parent		308887aad14c4ecc3fc10a3c58ec42641c5e4423 (diff)
drm/i915: clear up wedged transitions
We have two important transitions of the wedged state in the current
code:
- 0 -> 1: This means a hang has been detected, and signals to everyone
  that they should get off any locks, so that the reset work item can
  do its job.
- 1 -> 0: The reset handler has completed.
Now the last transition mixes up two states: "Reset completed and
successful" and "Reset failed". To distinguish these two we do some
tricks with the reset completion, but I simply could not convince
myself that this doesn't race under odd circumstances.
Hence split this up, and add a new terminal state indicating that the
hw is gone for good.
Also add explicit #defines for both states, update comments.
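To make the encoding concrete, here is a minimal standalone sketch of
the state machine described above -- plain C, with the kernel's atomic_t
stood in by a plain integer; only the #define names and values are taken
from this patch, everything else is illustrative:

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Values as defined by this patch in i915_drv.h. */
  #define I915_RESET_IN_PROGRESS_FLAG 1
  #define I915_WEDGED                 0xffffffff

  /* Stand-in for atomic_read(&error->reset_counter); real code uses
   * atomic_t, this just models the transitions. */
  static uint32_t reset_counter;

  static int reset_in_progress(void)
  {
          return (reset_counter & I915_RESET_IN_PROGRESS_FLAG) != 0;
  }

  static int terminally_wedged(void)
  {
          return reset_counter == I915_WEDGED;
  }

  int main(void)
  {
          /* Hang detected: lowest bit set, everyone backs off locks. */
          reset_counter = I915_RESET_IN_PROGRESS_FLAG;
          assert(reset_in_progress() && !terminally_wedged());

          /* Reset completed and successful: flag decays to unset. */
          reset_counter = 0;
          assert(!reset_in_progress() && !terminally_wedged());

          /* Reset failed: terminal state. WEDGED deliberately keeps
           * the in-progress bit set (WEDGED & FLAG is true). */
          reset_counter = I915_WEDGED;
          assert(reset_in_progress() && terminally_wedged());

          puts("all transitions behave as described");
          return 0;
  }

Encoding WEDGED as all-ones rather than as a second flag bit is what
lets a "reset pending?" check stay a single masked read while still
implying the terminal state.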
v2: Split out the reset handling bugfix for the throttle ioctl.
v3: s/tmp/wedged/ suggested by Chris Wilson. Also fix up a rebase
error which prevented this patch from actually compiling.
v4: To unify the wedged state with the reset counter, keep the
reset-in-progress state just as a flag. The terminally-wedged state is
now denoted with a big number.
v5: Add a comment to the reset_counter special values explaining that
WEDGED & RESET_IN_PROGRESS needs to be true for the code to be
correct.
v6: Fixup logic errors introduced with the wedged+reset_counter
unification. Since WEDGED implies reset-in-progress (in a way we're
terminally stuck in the dead-but-reset-not-completed state), we need to
ensure that we check for this everywhere. The specific bug was in
wait_for_error, which would simply have timed out; see the sketch after
this changelog.
v7: Extract an inline i915_reset_in_progress helper to make the code
more readable. Also annotate the reset-in-progress case with an
unlikely, to help the compiler optimize the fastpath. Do the same for
the terminally wedged case with i915_terminally_wedged.
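A minimal userspace model of the v6 check ordering, with the actual
wait on reset_queue stubbed out as -EAGAIN (the names mirror this
patch; nothing here is real kernel code):

  #include <errno.h>
  #include <stdio.h>

  #define I915_RESET_IN_PROGRESS_FLAG 1
  #define I915_WEDGED                 0xffffffffu

  /* Models i915_gem_wait_for_error(): the terminal check must come
   * before blocking, because WEDGED keeps the in-progress bit set and
   * a wedged GPU would otherwise wait out the full 10s timeout. */
  static int wait_for_error(unsigned int reset_counter)
  {
          if (!(reset_counter & I915_RESET_IN_PROGRESS_FLAG))
                  return 0;       /* no reset pending, fast path */

          if (reset_counter == I915_WEDGED)
                  return -EIO;    /* terminally dead, give up now */

          /* real code: wait_event_interruptible_timeout(reset_queue, ...) */
          return -EAGAIN;         /* stub for "block until reset settles" */
  }

  int main(void)
  {
          printf("idle:   %d\n", wait_for_error(0));                           /* 0 */
          printf("reset:  %d\n", wait_for_error(I915_RESET_IN_PROGRESS_FLAG)); /* -EAGAIN */
          printf("wedged: %d\n", wait_for_error(I915_WEDGED));                 /* -EIO */
          return 0;
  }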
Reviewed-by: Damien Lespiau <damien.lespiau@intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Diffstat (limited to 'drivers/gpu')
 -rw-r--r--  drivers/gpu/drm/i915/i915_debugfs.c  |  2
 -rw-r--r--  drivers/gpu/drm/i915/i915_drv.h      | 40
 -rw-r--r--  drivers/gpu/drm/i915/i915_gem.c      | 49
 -rw-r--r--  drivers/gpu/drm/i915/i915_irq.c      | 23
 -rw-r--r--  drivers/gpu/drm/i915/intel_display.c |  4
 5 files changed, 74 insertions, 44 deletions
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index e1b7eaf60d24..384f19368a1d 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1672,7 +1672,7 @@ i915_wedged_read(struct file *filp,
 
 	len = snprintf(buf, sizeof(buf),
 		       "wedged : %d\n",
-		       atomic_read(&dev_priv->gpu_error.wedged));
+		       atomic_read(&dev_priv->gpu_error.reset_counter));
 
 	if (len > sizeof(buf))
 		len = sizeof(buf);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 62da6c7a4884..c84743bb6937 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -771,11 +771,37 @@ struct i915_gpu_error {
 	/* Protected by the above dev->gpu_error.lock. */
 	struct drm_i915_error_state *first_error;
 	struct work_struct work;
-	struct completion completion;
 
 	unsigned long last_reset;
 
-	atomic_t wedged;
+	/**
+	 * State variable controlling the reset flow
+	 *
+	 * Upper bits are for the reset counter.
+	 *
+	 * Lowest bit controls the reset state machine: Set means a reset is in
+	 * progress. This state will (presuming we don't have any bugs) decay
+	 * into either unset (successful reset) or the special WEDGED value (hw
+	 * terminally sour). All waiters on the reset_queue will be woken when
+	 * that happens.
+	 */
+	atomic_t reset_counter;
+
+	/**
+	 * Special values/flags for reset_counter
+	 *
+	 * Note that the code relies on
+	 * 	I915_WEDGED & I915_RESET_IN_PROGRESS_FLAG
+	 * being true.
+	 */
+#define I915_RESET_IN_PROGRESS_FLAG	1
+#define I915_WEDGED			0xffffffff
+
+	/**
+	 * Waitqueue to signal when the reset has completed. Used by clients
+	 * that wait for dev_priv->mm.wedged to settle.
+	 */
+	wait_queue_head_t reset_queue;
 
 	/* For gpu hang simulation. */
 	unsigned int stop_rings;
@@ -1543,6 +1569,16 @@ void i915_gem_retire_requests(struct drm_device *dev);
 void i915_gem_retire_requests_ring(struct intel_ring_buffer *ring);
 int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
 				      bool interruptible);
+static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
+{
+	return unlikely(atomic_read(&error->reset_counter)
+			& I915_RESET_IN_PROGRESS_FLAG);
+}
+
+static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
+{
+	return atomic_read(&error->reset_counter) == I915_WEDGED;
+}
 
 void i915_gem_reset(struct drm_device *dev);
 void i915_gem_clflush_object(struct drm_i915_gem_object *obj);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index c96a501b8205..2ca901194824 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -89,36 +89,32 @@ static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
 static int
 i915_gem_wait_for_error(struct i915_gpu_error *error)
 {
-	struct completion *x = &error->completion;
-	unsigned long flags;
 	int ret;
 
-	if (!atomic_read(&error->wedged))
+#define EXIT_COND (!i915_reset_in_progress(error))
+	if (EXIT_COND)
 		return 0;
 
+	/* GPU is already declared terminally dead, give up. */
+	if (i915_terminally_wedged(error))
+		return -EIO;
+
 	/*
 	 * Only wait 10 seconds for the gpu reset to complete to avoid hanging
 	 * userspace. If it takes that long something really bad is going on and
 	 * we should simply try to bail out and fail as gracefully as possible.
 	 */
-	ret = wait_for_completion_interruptible_timeout(x, 10*HZ);
+	ret = wait_event_interruptible_timeout(error->reset_queue,
+					       EXIT_COND,
+					       10*HZ);
 	if (ret == 0) {
 		DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
 		return -EIO;
 	} else if (ret < 0) {
 		return ret;
 	}
+#undef EXIT_COND
 
-	if (atomic_read(&error->wedged)) {
-		/* GPU is hung, bump the completion count to account for
-		 * the token we just consumed so that we never hit zero and
-		 * end up waiting upon a subsequent completion event that
-		 * will never happen.
-		 */
-		spin_lock_irqsave(&x->wait.lock, flags);
-		x->done++;
-		spin_unlock_irqrestore(&x->wait.lock, flags);
-	}
 	return 0;
 }
 
@@ -942,23 +938,14 @@ int
 i915_gem_check_wedge(struct i915_gpu_error *error,
 		     bool interruptible)
 {
-	if (atomic_read(&error->wedged)) {
-		struct completion *x = &error->completion;
-		bool recovery_complete;
-		unsigned long flags;
-
-		/* Give the error handler a chance to run. */
-		spin_lock_irqsave(&x->wait.lock, flags);
-		recovery_complete = x->done > 0;
-		spin_unlock_irqrestore(&x->wait.lock, flags);
-
+	if (i915_reset_in_progress(error)) {
 		/* Non-interruptible callers can't handle -EAGAIN, hence return
 		 * -EIO unconditionally for these. */
 		if (!interruptible)
 			return -EIO;
 
-		/* Recovery complete, but still wedged means reset failure. */
-		if (recovery_complete)
+		/* Recovery complete, but the reset failed ... */
+		if (i915_terminally_wedged(error))
 			return -EIO;
 
 		return -EAGAIN;
@@ -1025,7 +1012,7 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
 
 #define EXIT_COND \
 	(i915_seqno_passed(ring->get_seqno(ring, false), seqno) || \
-	 atomic_read(&dev_priv->gpu_error.wedged))
+	 i915_reset_in_progress(&dev_priv->gpu_error))
 	do {
 		if (interruptible)
 			end = wait_event_interruptible_timeout(ring->irq_queue,
@@ -1379,7 +1366,7 @@ out:
 	/* If this -EIO is due to a gpu hang, give the reset code a
 	 * chance to clean up the mess. Otherwise return the proper
 	 * SIGBUS. */
-	if (!atomic_read(&dev_priv->gpu_error.wedged))
+	if (i915_terminally_wedged(&dev_priv->gpu_error))
 		return VM_FAULT_SIGBUS;
 	case -EAGAIN:
 		/* Give the error handler a chance to run and move the
@@ -3983,9 +3970,9 @@ i915_gem_entervt_ioctl(struct drm_device *dev, void *data,
 	if (drm_core_check_feature(dev, DRIVER_MODESET))
 		return 0;
 
-	if (atomic_read(&dev_priv->gpu_error.wedged)) {
+	if (i915_reset_in_progress(&dev_priv->gpu_error)) {
 		DRM_ERROR("Reenabling wedged hardware, good luck\n");
-		atomic_set(&dev_priv->gpu_error.wedged, 0);
+		atomic_set(&dev_priv->gpu_error.reset_counter, 0);
 	}
 
 	mutex_lock(&dev->struct_mutex);
@@ -4069,7 +4056,7 @@ i915_gem_load(struct drm_device *dev)
 		INIT_LIST_HEAD(&dev_priv->fence_regs[i].lru_list);
 	INIT_DELAYED_WORK(&dev_priv->mm.retire_work,
 			  i915_gem_retire_work_handler);
-	init_completion(&dev_priv->gpu_error.completion);
+	init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
 
 	/* On GEN3 we really need to make sure the ARB C3 LP bit is set */
 	if (IS_GEN3(dev)) {
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index f2c0016fa533..4562c5406ef8 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -862,8 +862,10 @@ done:
  */
 static void i915_error_work_func(struct work_struct *work)
 {
-	drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t,
-						    gpu_error.work);
+	struct i915_gpu_error *error = container_of(work, struct i915_gpu_error,
+						    work);
+	drm_i915_private_t *dev_priv = container_of(error, drm_i915_private_t,
+						    gpu_error);
 	struct drm_device *dev = dev_priv->dev;
 	char *error_event[] = { "ERROR=1", NULL };
 	char *reset_event[] = { "RESET=1", NULL };
@@ -871,14 +873,18 @@ static void i915_error_work_func(struct work_struct *work)
 
 	kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, error_event);
 
-	if (atomic_read(&dev_priv->gpu_error.wedged)) {
+	if (i915_reset_in_progress(error)) {
 		DRM_DEBUG_DRIVER("resetting chip\n");
 		kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, reset_event);
+
 		if (!i915_reset(dev)) {
-			atomic_set(&dev_priv->gpu_error.wedged, 0);
+			atomic_set(&error->reset_counter, 0);
 			kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, reset_done_event);
+		} else {
+			atomic_set(&error->reset_counter, I915_WEDGED);
 		}
-		complete_all(&dev_priv->gpu_error.completion);
+
+		wake_up_all(&dev_priv->gpu_error.reset_queue);
 	}
 }
 
@@ -1482,11 +1488,12 @@ void i915_handle_error(struct drm_device *dev, bool wedged)
 	i915_report_and_clear_eir(dev);
 
 	if (wedged) {
-		INIT_COMPLETION(dev_priv->gpu_error.completion);
-		atomic_set(&dev_priv->gpu_error.wedged, 1);
+		atomic_set(&dev_priv->gpu_error.reset_counter,
+			   I915_RESET_IN_PROGRESS_FLAG);
 
 		/*
-		 * Wakeup waiting processes so they don't hang
+		 * Wakeup waiting processes so that the reset work item
+		 * doesn't deadlock trying to grab various locks.
 		 */
 		for_each_ring(ring, dev_priv, i)
 			wake_up_all(&ring->irq_queue);
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 160aa5f76118..77254460a5cb 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -2223,7 +2223,7 @@ intel_finish_fb(struct drm_framebuffer *old_fb)
 	WARN_ON(waitqueue_active(&dev_priv->pending_flip_queue));
 
 	wait_event(dev_priv->pending_flip_queue,
-		   atomic_read(&dev_priv->gpu_error.wedged) ||
+		   i915_reset_in_progress(&dev_priv->gpu_error) ||
		   atomic_read(&obj->pending_flip) == 0);
 
 	/* Big Hammer, we also need to ensure that any pending
@@ -2871,7 +2871,7 @@ static bool intel_crtc_has_pending_flip(struct drm_crtc *crtc)
 	unsigned long flags;
 	bool pending;
 
-	if (atomic_read(&dev_priv->gpu_error.wedged))
+	if (i915_reset_in_progress(&dev_priv->gpu_error))
 		return false;
 
 	spin_lock_irqsave(&dev->event_lock, flags);