Diffstat (limited to 'drivers/gpu/drm/i915/i915_irq.c')
 drivers/gpu/drm/i915/i915_irq.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 54 insertions(+), 14 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 83cce0cdb769..4b91228fd9bd 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1469,6 +1469,34 @@ static irqreturn_t ironlake_irq_handler(int irq, void *arg)
         return ret;
 }
 
+static void i915_error_wake_up(struct drm_i915_private *dev_priv,
+                               bool reset_completed)
+{
+        struct intel_ring_buffer *ring;
+        int i;
+
+        /*
+         * Notify all waiters for GPU completion events that reset state has
+         * been changed, and that they need to restart their wait after
+         * checking for potential errors (and bail out to drop locks if there is
+         * a gpu reset pending so that i915_error_work_func can acquire them).
+         */
+
+        /* Wake up __wait_seqno, potentially holding dev->struct_mutex. */
+        for_each_ring(ring, dev_priv, i)
+                wake_up_all(&ring->irq_queue);
+
+        /* Wake up intel_crtc_wait_for_pending_flips, holding crtc->mutex. */
+        wake_up_all(&dev_priv->pending_flip_queue);
+
+        /*
+         * Signal tasks blocked in i915_gem_wait_for_error that the pending
+         * reset state is cleared.
+         */
+        if (reset_completed)
+                wake_up_all(&dev_priv->gpu_error.reset_queue);
+}
+
 /**
  * i915_error_work_func - do process context error handling work
  * @work: work struct
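
The comment in the new helper encodes a contract for the waiter side: anyone sleeping on these queues while holding a lock must re-check the reset state on every wakeup and bail out, so that i915_error_work_func can take that lock. A minimal userspace analogue of that pattern, as a sketch only (the pthread plumbing and the names reset_counter, RESET_PENDING and wait_for_event are illustrative, not the driver's API):

#include <errno.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

/* Illustrative stand-ins for the driver's reset bookkeeping. */
static atomic_uint reset_counter;
#define RESET_PENDING 0x1u

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;      /* role of dev->struct_mutex */
static pthread_cond_t irq_queue = PTHREAD_COND_INITIALIZER;   /* role of ring->irq_queue */
static pthread_cond_t reset_queue = PTHREAD_COND_INITIALIZER; /* role of gpu_error.reset_queue */

/*
 * Waiter side: sleep until the event completes, re-checking the reset
 * state on every wakeup.  Returning -EAGAIN makes the caller drop its
 * locks and retry, which is what lets the reset work make progress.
 */
static int wait_for_event(bool (*done)(void))
{
        int ret = 0;

        pthread_mutex_lock(&lock);
        while (!done()) {
                if (atomic_load(&reset_counter) & RESET_PENDING) {
                        ret = -EAGAIN;  /* reset pending: back off */
                        break;
                }
                pthread_cond_wait(&irq_queue, &lock);
        }
        pthread_mutex_unlock(&lock);
        return ret;
}

A caller that gets -EAGAIN is expected to release whatever locks it holds and retry once the reset has completed.
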
@@ -1483,11 +1511,10 @@ static void i915_error_work_func(struct work_struct *work)
         drm_i915_private_t *dev_priv = container_of(error, drm_i915_private_t,
                                                     gpu_error);
         struct drm_device *dev = dev_priv->dev;
-        struct intel_ring_buffer *ring;
         char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
         char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
         char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
-        int i, ret;
+        int ret;
 
         kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, error_event);
 
@@ -1506,8 +1533,16 @@ static void i915_error_work_func(struct work_struct *work)
                 kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE,
                                    reset_event);
 
+                /*
+                 * All state reset _must_ be completed before we update the
+                 * reset counter, for otherwise waiters might miss the reset
+                 * pending state and not properly drop locks, resulting in
+                 * deadlocks with the reset work.
+                 */
                 ret = i915_reset(dev);
 
+                intel_display_handle_reset(dev);
+
                 if (ret == 0) {
                         /*
                          * After all the gem state is reset, increment the reset
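
The comment added above i915_reset() states an ordering rule: every piece of reset state must be re-initialised before the counter moves on, or a woken waiter can miss the pending state and retake locks the reset path needs. In C11 terms this is publish-after-init with release/acquire pairing; continuing the sketch above (do_state_reset and complete_reset are hypothetical stand-ins for i915_reset() and the counter increment):

/*
 * Error-work side, continuing the sketch above: all reset state must be
 * visible before the counter changes, otherwise a waiter can observe
 * "no reset pending" while the state it depends on is still being
 * rewritten.
 */
static void do_state_reset(void)
{
        /* hypothetical stand-in for the actual i915_reset(dev) work */
}

static void complete_reset(void)
{
        do_state_reset();
        /* release ordering publishes the reset work before the counter
         * update; waiters pair with it through their atomic_load */
        atomic_fetch_add_explicit(&reset_counter, 1, memory_order_release);
}

Adding 1 to an odd counter clears the low pending bit and bumps the value in the upper bits, which appears to mirror how the driver encodes reset_counter (low bit pending, upper bits counting completed resets); treat that detail as an assumption of this sketch.
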
@@ -1528,12 +1563,11 @@ static void i915_error_work_func(struct work_struct *work)
                         atomic_set(&error->reset_counter, I915_WEDGED);
                 }
 
-                for_each_ring(ring, dev_priv, i)
-                        wake_up_all(&ring->irq_queue);
-
-                intel_display_handle_reset(dev);
-
-                wake_up_all(&dev_priv->gpu_error.reset_queue);
+                /*
+                 * Note: The wake_up also serves as a memory barrier so that
+                 * waiters see the updated value of the reset counter atomic_t.
+                 */
+                i915_error_wake_up(dev_priv, true);
         }
 }
 
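
The "wake_up also serves as a memory barrier" note is load-bearing: waiters re-read the counter without taking any lock, so the wakeup itself must order the atomic_set before their read. In the pthread sketch above, the mutex around the broadcast provides the equivalent guarantee (error_wake_up below mirrors i915_error_wake_up in shape only):

/*
 * Writer side, pairing with wait_for_event() above: update the counter
 * first, then wake every waiter.  In the kernel the wake_up itself
 * provides the memory barrier that makes the new counter value visible
 * to woken tasks; in this pthread analogue the mutex around the
 * broadcast plays that role.
 */
static void error_wake_up(bool reset_completed)
{
        pthread_mutex_lock(&lock);
        pthread_cond_broadcast(&irq_queue);           /* GPU completion waiters */
        if (reset_completed)
                pthread_cond_broadcast(&reset_queue); /* reset-completion waiters */
        pthread_mutex_unlock(&lock);
}
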
@@ -1642,8 +1676,6 @@ static void i915_report_and_clear_eir(struct drm_device *dev)
 void i915_handle_error(struct drm_device *dev, bool wedged)
 {
         struct drm_i915_private *dev_priv = dev->dev_private;
-        struct intel_ring_buffer *ring;
-        int i;
 
         i915_capture_error_state(dev);
         i915_report_and_clear_eir(dev);
@@ -1653,11 +1685,19 @@ void i915_handle_error(struct drm_device *dev, bool wedged)
                         &dev_priv->gpu_error.reset_counter);
 
                 /*
-                 * Wakeup waiting processes so that the reset work item
-                 * doesn't deadlock trying to grab various locks.
+                 * Wake up waiting processes so that the reset work function
+                 * i915_error_work_func doesn't deadlock trying to grab various
+                 * locks. By bumping the reset counter first, the woken
+                 * processes will see a reset in progress and back off,
+                 * releasing their locks, and then wait for the reset to
+                 * complete. We must do this for _all_ gpu waiters that might
+                 * hold locks that the reset work needs to acquire.
+                 *
+                 * Note: The wake_up serves as the required memory barrier to
+                 * ensure that the waiters see the updated value of the reset
+                 * counter atomic_t.
                  */
-                for_each_ring(ring, dev_priv, i)
-                        wake_up_all(&ring->irq_queue);
+                i915_error_wake_up(dev_priv, false);
         }
 
         /*
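
Taken together, i915_handle_error() and i915_error_work_func() form a two-phase protocol: flag the reset as pending and wake everyone so locks get dropped, then reset, publish the new counter, and wake everyone again. Tying the sketch together (handle_error is a stand-in; in the driver the second half runs from the error work item, not inline):

/*
 * Hang-handler side, mirroring the shape of i915_handle_error() plus
 * i915_error_work_func():
 *   1) flag the reset as pending, so freshly woken waiters back off,
 *   2) wake *all* waiters that might hold locks the reset work needs,
 *   3) reset, publish the new counter, and wake everyone once more
 *      with reset_completed = true.
 */
static void handle_error(bool wedged)
{
        if (!wedged)
                return;         /* error reporting only, nothing to reset */

        atomic_fetch_or(&reset_counter, RESET_PENDING);
        error_wake_up(false);   /* phase 1: waiters drop their locks */

        /* in the driver, the rest runs from a workqueue item */
        complete_reset();       /* phase 2: reset state, then publish */
        error_wake_up(true);    /* phase 3: also wake reset_queue waiters */
}
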