drm/radeon/kms: fence cleanup + more reliable GPU lockup detection V4

This patch cleans up the fence code: it drops the timeout field of the fence, as the time needed to complete each IB is unpredictable and shouldn't be bounded. The fence cleanup leads to improved GPU lockup detection: this patch introduces a callback that allows ASIC-specific tests for lockup detection. In this patch the CP is used as a first indicator of GPU lockup. If the CP doesn't make progress for 1 second, we assume we are facing a GPU lockup.

To avoid the overhead of testing for a GPU lockup too frequently while a fence is taking time to be signaled, we query the lockup callback every 500 msec. There are plenty of code comments explaining the design & choices inside the code.

This has been tested mostly on R3XX/R5XX hw; on a normally running desktop (compiz, firefox, quake3 running) the lockup callback wasn't called once (1 hour session). Also tested by forcing a GPU lockup, and the lockup was reported after the 1s CP activity timeout.

V2: switch to a 500ms timeout so the GPU lockup callback gets called at least 2 times in less than 2sec
V3: store the last jiffies in the fence struct so that on ERESTART, EBUSY we keep track of how long we have already waited for a given fence
V4: make sure we have an up-to-date CP read pointer so we don't get false positives

Signed-off-by: Jerome Glisse <jglisse@redhat.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
author: Jerome Glisse <jglisse@redhat.com> 2010-03-09 09:45:10 -0500
committer: Dave Airlie <airlied@redhat.com> 2010-04-05 20:42:45 -0400
commit: 225758d8ba4fdcc1e8c9cf617fd89529bd4a9596 (patch)
tree: a9ac2f23435d4a6db5aa33774ba94d9f0aeb5c4c /drivers/gpu/drm/radeon/r100.c
parent: 95beb690170e6ce918fe53c73a0fcc7cf64d704a (diff)
1 files changed, 86 insertions, 0 deletions
diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
index 3ae51ada1abf..845c8f3063fe 100644
--- a/drivers/gpu/drm/radeon/r100.c
+++ b/drivers/gpu/drm/radeon/r100.c
@@ -1777,6 +1777,92 @@ int r100_rb2d_reset(struct radeon_device *rdev)
        return -1;
 }
+/**
+ * r100_gpu_lockup_update() - refresh the CP lockup tracking information
+ * @lockup:     r100_gpu_lockup structure to refresh
+ * @cp:         radeon_cp structure supplying the read pointer to record
+ *
+ * Records the current jiffies value and the read pointer held in @cp as the
+ * new reference point for later lockup checks.
+ */
+void r100_gpu_lockup_update(struct r100_gpu_lockup *lockup, struct radeon_cp *cp)
+{
+        lockup->last_jiffies = jiffies;
+        lockup->last_cp_rptr = cp->rptr;
+}
+/**
+ * r100_gpu_cp_is_lockup() - check whether the CP looks locked up
+ * @rdev:       radeon device structure (used for error reporting)
+ * @lockup:     r100_gpu_lockup structure holding the CP lockup tracking state
+ * @cp:         radeon_cp structure holding the current CP read pointer
+ *
+ * The tracking state is not explicitly initialized: we rely on an early call
+ * finding either a CP rptr different from the recorded one or a jiffies value
+ * that is not after the recorded one (wrap around), both of which
+ * (re)initialize the tracking state.
+ *
+ * A lockup is reported when the CP rptr has not moved for at least 1 second
+ * but less than 3 seconds since the tracking state was last refreshed. Once
+ * 3 seconds or more have elapsed, a match between the recorded and the
+ * current rptr is treated as a probable false positive (we simply were not
+ * called for a while): the tracking state is refreshed and false is returned.
+ * Because of this the caller must call r100_gpu_cp_is_lockup() several times
+ * within that 3 second window for a lockup to be reported; the fencing code
+ * should be careful about that.
+ *
+ * The caller should write to the ring beforehand to force the CP to do
+ * something, so that a CP which was simply given nothing to do is not
+ * reported as locked up.
+ *
+ * Return: true if the CP is considered locked up, false otherwise.
+ */
+bool r100_gpu_cp_is_lockup(struct radeon_device *rdev, struct r100_gpu_lockup *lockup, struct radeon_cp *cp)
+{
+        unsigned long cjiffies, elapsed;
+
+        cjiffies = jiffies;
+        if (!time_after(cjiffies, lockup->last_jiffies)) {
+                /* likely a jiffies wrap around, restart the tracking */
+                r100_gpu_lockup_update(lockup, cp);
+                return false;
+        }
+        if (cp->rptr != lockup->last_cp_rptr) {
+                /* CP is still making progress, no lockup */
+                r100_gpu_lockup_update(lockup, cp);
+                return false;
+        }
+        elapsed = jiffies_to_msecs(cjiffies - lockup->last_jiffies);
+        if (elapsed >= 3000) {
+                /* Too long since the reference point was recorded: the
+                 * current rptr being equal to the one recorded a while ago
+                 * is more likely a coincidence than a stall. Refresh the
+                 * tracking information so that a later call can decide.
+                 */
+                r100_gpu_lockup_update(lockup, cp);
+                return false;
+        }
+        if (elapsed >= 1000) {
+                dev_err(rdev->dev, "GPU lockup CP stall for more than %lumsec\n", elapsed);
+                return true;
+        }
+        /* less than 1 second without progress, give the GPU a chance */
+        return false;
+}
+/**
+ * r100_gpu_is_lockup() - lockup check entry point for r100
+ * @rdev:       radeon device structure
+ *
+ * Reads RBBM_STATUS first: when its GUI_ACTIVE field is clear, the lockup
+ * tracking information is refreshed and no lockup is reported. Otherwise two
+ * PACKET2 NOPs are queued on the ring (if the ring could be locked) to force
+ * CP activity, the CP read pointer is re-read from its register and the
+ * decision is left to r100_gpu_cp_is_lockup().
+ *
+ * Return: true if a lockup is detected, false otherwise.
+ */
+bool r100_gpu_is_lockup(struct radeon_device *rdev)
+{
+        struct r100_gpu_lockup *lockup = &rdev->config.r100.lockup;
+        struct radeon_cp *cp = &rdev->cp;
+        u32 status;
+        int ret;
+
+        status = RREG32(R_000E40_RBBM_STATUS);
+        if (G_000E40_GUI_ACTIVE(status) == 0) {
+                r100_gpu_lockup_update(lockup, cp);
+                return false;
+        }
+        /* queue two PACKET2 NOPs so the CP has something to process */
+        ret = radeon_ring_lock(rdev, 2);
+        if (ret == 0) {
+                radeon_ring_write(rdev, 0x80000000);
+                radeon_ring_write(rdev, 0x80000000);
+                radeon_ring_unlock_commit(rdev);
+        }
+        /* refresh our copy of the read pointer before comparing it */
+        cp->rptr = RREG32(RADEON_CP_RB_RPTR);
+        return r100_gpu_cp_is_lockup(rdev, lockup, cp);
+}
 int r100_gpu_reset(struct radeon_device *rdev)
 {
        uint32_t status;
author	Jerome Glisse <jglisse@redhat.com>	2010-03-09 09:45:10 -0500
committer	Dave Airlie <airlied@redhat.com>	2010-04-05 20:42:45 -0400
commit	225758d8ba4fdcc1e8c9cf617fd89529bd4a9596 (patch)
tree	a9ac2f23435d4a6db5aa33774ba94d9f0aeb5c4c /drivers/gpu/drm/radeon/r100.c
parent	95beb690170e6ce918fe53c73a0fcc7cf64d704a (diff)

diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
index 3ae51ada1abf..845c8f3063fe 100644
--- a/drivers/gpu/drm/radeon/r100.c
+++ b/drivers/gpu/drm/radeon/r100.c
@@ -1777,6 +1777,92 @@ int r100_rb2d_reset(struct radeon_device *rdev)
1777	return -1;	1777	return -1;
1778	}	1778	}
1779		1779
		1780	void r100_gpu_lockup_update(struct r100_gpu_lockup *lockup, struct radeon_cp *cp)
		1781	{
		1782	lockup->last_cp_rptr = cp->rptr;
		1783	lockup->last_jiffies = jiffies;
		1784	}
		1785
		1786	/**
		1787	* r100_gpu_cp_is_lockup() - check if CP is lockup by recording information
		1788	* @rdev: radeon device structure
		1789	* @lockup: r100_gpu_lockup structure holding CP lockup tracking informations
		1790	* @cp: radeon_cp structure holding CP information
		1791	*
		1792	* We don't need to initialize the lockup tracking information as we will either
		1793	* have CP rptr to a different value of jiffies wrap around which will force
		1794	* initialization of the lockup tracking informations.
		1795	*
		1796	* A possible false positivie is if we get call after while and last_cp_rptr ==
		1797	* the current CP rptr, even if it's unlikely it might happen. To avoid this
		1798	* if the elapsed time since last call is bigger than 2 second than we return
		1799	* false and update the tracking information. Due to this the caller must call
		1800	* r100_gpu_cp_is_lockup several time in less than 2sec for lockup to be reported
		1801	* the fencing code should be cautious about that.
		1802	*
		1803	* Caller should write to the ring to force CP to do something so we don't get
		1804	* false positive when CP is just gived nothing to do.
		1805	*
		1806	**/
		1807	bool r100_gpu_cp_is_lockup(struct radeon_device *rdev, struct r100_gpu_lockup *lockup, struct radeon_cp *cp)
		1808	{
		1809	unsigned long cjiffies, elapsed;
		1810
		1811	cjiffies = jiffies;
		1812	if (!time_after(cjiffies, lockup->last_jiffies)) {
		1813	/* likely a wrap around */
		1814	lockup->last_cp_rptr = cp->rptr;
		1815	lockup->last_jiffies = jiffies;
		1816	return false;
		1817	}
		1818	if (cp->rptr != lockup->last_cp_rptr) {
		1819	/* CP is still working no lockup */
		1820	lockup->last_cp_rptr = cp->rptr;
		1821	lockup->last_jiffies = jiffies;
		1822	return false;
		1823	}
		1824	elapsed = jiffies_to_msecs(cjiffies - lockup->last_jiffies);
		1825	if (elapsed >= 3000) {
		1826	/* very likely the improbable case where current
		1827	* rptr is equal to last recorded, a while ago, rptr
		1828	* this is more likely a false positive update tracking
		1829	* information which should force us to be recall at
		1830	* latter point
		1831	*/
		1832	lockup->last_cp_rptr = cp->rptr;
		1833	lockup->last_jiffies = jiffies;
		1834	return false;
		1835	}
		1836	if (elapsed >= 1000) {
		1837	dev_err(rdev->dev, "GPU lockup CP stall for more than %lumsec\n", elapsed);
		1838	return true;
		1839	}
		1840	/* give a chance to the GPU ... */
		1841	return false;
		1842	}
		1843
		1844	bool r100_gpu_is_lockup(struct radeon_device *rdev)
		1845	{
		1846	u32 rbbm_status;
		1847	int r;
		1848
		1849	rbbm_status = RREG32(R_000E40_RBBM_STATUS);
		1850	if (!G_000E40_GUI_ACTIVE(rbbm_status)) {
		1851	r100_gpu_lockup_update(&rdev->config.r100.lockup, &rdev->cp);
		1852	return false;
		1853	}
		1854	/* force CP activities */
		1855	r = radeon_ring_lock(rdev, 2);
		1856	if (!r) {
		1857	/* PACKET2 NOP */
		1858	radeon_ring_write(rdev, 0x80000000);
		1859	radeon_ring_write(rdev, 0x80000000);
		1860	radeon_ring_unlock_commit(rdev);
		1861	}
		1862	rdev->cp.rptr = RREG32(RADEON_CP_RB_RPTR);
		1863	return r100_gpu_cp_is_lockup(rdev, &rdev->config.r100.lockup, &rdev->cp);
		1864	}
		1865
1780	int r100_gpu_reset(struct radeon_device *rdev)	1866	int r100_gpu_reset(struct radeon_device *rdev)
1781	{	1867	{
1782	uint32_t status;	1868	uint32_t status;