gpu: nvgpu: fix deadlock on railgate_lock during race condition

We have the below race condition between __gk20a_do_idle() and the
force_reset case:
- before execution of __gk20a_do_idle(), a process drops the last usage
  count of the GPU, which triggers the GPU railgate process
- but before the GPU is really railgated (there is a 500 ms delay), some
  process calls __gk20a_do_idle()
- in __gk20a_do_idle(), we first take railgate_lock
- then we check if the GPU is already railgated or not
- since it is not railgated yet (due to the 500 ms delay), this returns false
- then we call pm_runtime_get_noresume(), which just increases the usage
  counter
- in this particular case, this call just increases the usage count to 1
  from 0, whereas the GPU is already on its way to railgate
- while we check if the GPU usage count drops to one, the GPU gets railgated
- now if we have the force_reset=true case, we will end up calling
  pm_runtime_get_sync(), which will take railgate_lock _again_ and try to
  unrailgate the GPU
- this causes a deadlock on railgate_lock

To fix this, use the below sequence:
- take railgate_lock
- check if the GPU is already railgated
- release railgate_lock
- call pm_runtime_get_sync(), which will keep the GPU active even if
  railgating is already triggered
- take railgate_lock again to prevent unrailgate in any further process

Also, add more descriptive comments to explain the flow.

Bug 1624537

Change-Id: I0febc65d7bfac03ee738be200cf321322ffbe5a6
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: http://git-master/r/719625
(cherry picked from commit 480284eda16e2b50ee6368bad3d15574e098b231)
Reviewed-on: http://git-master/r/719620
Reviewed-by: Sachin Nikam <snikam@nvidia.com>
author: Deepak Nibade <dnibade@nvidia.com> 2015-03-19 09:29:11 -0400
committer: Ishan Mittal <imittal@nvidia.com> 2015-05-18 01:48:24 -0400
commit: c5f2d00d04f5048e1414f1a2cbe702026528b4db (patch)
tree: a1c29cd29b0f393eb9ca8ed7402d869b209af680 /drivers
parent: 900f63393d80cc1978c5758a368cf1edb3d267ae (diff)
1 files changed, 44 insertions, 19 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c
index eb6774da..6add8441 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -1774,6 +1774,10 @@ void gk20a_reset(struct gk20a *g, u32 units)
 * __gk20a_do_idle() - force the GPU to idle and railgate
 *
 * In success, this call MUST be balanced by caller with __gk20a_do_unidle()
+ *
+ * Acquires two locks : &g->busy_lock and &platform->railgate_lock
+ * In success, we hold these locks and return
+ * In failure, we release these locks and return
 */
 int __gk20a_do_idle(struct platform_device *pdev, bool force_reset)
 {
@@ -1794,15 +1798,15 @@ int __gk20a_do_idle(struct platform_device *pdev, bool force_reset)
        if (platform->is_railgated(pdev))
                return 0;
-        /* check if global force_reset flag is set */
+        /*
-        force_reset |= platform->force_reset_in_do_idle;
+         * release railgate_lock, prevent suspend by incrementing usage counter,
+         * re-acquire railgate_lock
-        /* prevent suspend by incrementing usage counter */
+         */
-        pm_runtime_get_noresume(&pdev->dev);
+        mutex_unlock(&platform->railgate_lock);
+        pm_runtime_get_sync(&pdev->dev);
+        mutex_lock(&platform->railgate_lock);
        /* check and wait until GPU is idle (with a timeout) */
-        pm_runtime_barrier(&pdev->dev);
        do {
                msleep(1);
                ref_cnt = atomic_read(&pdev->dev.power.usage_count);
@@ -1811,19 +1815,21 @@ int __gk20a_do_idle(struct platform_device *pdev, bool force_reset)
        if (ref_cnt != 1) {
                gk20a_err(&pdev->dev, "failed to idle - refcount %d != 1\n",
                        ref_cnt);
-                goto fail;
+                goto fail_drop_usage_count;
        }
-        /*
+        /* check if global force_reset flag is set */
-         * if GPU is now idle, we will have only one ref count
+        force_reset |= platform->force_reset_in_do_idle;
-         * drop this ref which will rail gate the GPU (if GPU
-         * railgate is supported)
-         * if GPU railgate is not supported then we need to
-         * explicitly reset it
-         */
-        pm_runtime_put_sync(&pdev->dev);
        if (platform->can_railgate && !force_reset) {
+                /*
+                 * Case 1 : GPU railgate is supported
+                 *
+                 * if GPU is now idle, we will have only one ref count,
+                 * drop this ref which will rail gate the GPU
+                 */
+                pm_runtime_put_sync(&pdev->dev);
                /* add sufficient delay to allow GPU to rail gate */
                msleep(platform->railgate_delay);
@@ -1842,12 +1848,24 @@ int __gk20a_do_idle(struct platform_device *pdev, bool force_reset)
                        goto fail_timeout;
                }
        } else {
+                /*
+                 * Case 2 : GPU railgate is not supported or we explicitly
+                 * do not want to do railgate
+                 *
+                 * if GPU is now idle, call prepare_poweroff() to save the
+                 * state and then assert the reset
+                 *
+                 * __gk20a_do_unidle() needs to deassert reset, call
+                 * finalize_poweron(), and then call pm_runtime_put_sync()
+                 * to balance the GPU usage counter
+                 */
                if (!platform->reset_assert || !platform->reset_deassert)
-                        goto fail_timeout;
+                        goto fail_drop_usage_count;
-                pm_runtime_get_sync(&pdev->dev);
+                /* Save the GPU state */
                gk20a_pm_prepare_poweroff(&pdev->dev);
+                /* assert GPU reset */
                platform->reset_assert(pdev);
                udelay(10);
@@ -1856,7 +1874,7 @@ int __gk20a_do_idle(struct platform_device *pdev, bool force_reset)
                return 0;
        }
-fail:
+fail_drop_usage_count:
        pm_runtime_put_noidle(&pdev->dev);
 fail_timeout:
        mutex_unlock(&platform->railgate_lock);
@@ -1892,9 +1910,16 @@ int __gk20a_do_unidle(struct platform_device *pdev)
        struct gk20a_platform *platform = dev_get_drvdata(&pdev->dev);
        if (g->forced_reset) {
+                /*
+                 * If we did a reset (and not railgate),
+                 * then deassert the GPU reset here first
+                 */
                platform->reset_deassert(pdev);
+                /* restore the GPU state */
                gk20a_pm_finalize_poweron(&pdev->dev);
+                /* balance GPU usage counter */
                pm_runtime_put_sync(&pdev->dev);
                g->forced_reset = false;
author	Deepak Nibade <dnibade@nvidia.com>	2015-03-19 09:29:11 -0400
committer	Ishan Mittal <imittal@nvidia.com>	2015-05-18 01:48:24 -0400
commit	c5f2d00d04f5048e1414f1a2cbe702026528b4db (patch)
tree	a1c29cd29b0f393eb9ca8ed7402d869b209af680 /drivers
parent	900f63393d80cc1978c5758a368cf1edb3d267ae (diff)

diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c index eb6774da..6add8441 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -1774,6 +1774,10 @@ void gk20a_reset(struct gk20a *g, u32 units)
1774	* __gk20a_do_idle() - force the GPU to idle and railgate	1774	* __gk20a_do_idle() - force the GPU to idle and railgate
1775	*	1775	*
1776	* In success, this call MUST be balanced by caller with __gk20a_do_unidle()	1776	* In success, this call MUST be balanced by caller with __gk20a_do_unidle()
		1777	*
		1778	* Acquires two locks : &g->busy_lock and &platform->railgate_lock
		1779	* In success, we hold these locks and return
		1780	* In failure, we release these locks and return
1777	*/	1781	*/
1778	int __gk20a_do_idle(struct platform_device *pdev, bool force_reset)	1782	int __gk20a_do_idle(struct platform_device *pdev, bool force_reset)
1779	{	1783	{
@@ -1794,15 +1798,15 @@ int __gk20a_do_idle(struct platform_device *pdev, bool force_reset)
1794	if (platform->is_railgated(pdev))	1798	if (platform->is_railgated(pdev))
1795	return 0;	1799	return 0;
1796		1800
1797	/* check if global force_reset flag is set */	1801	/*
1798	force_reset |= platform->force_reset_in_do_idle;	1802	* release railgate_lock, prevent suspend by incrementing usage counter,
1799		1803	* re-acquire railgate_lock
1800	/* prevent suspend by incrementing usage counter */	1804	*/
1801	pm_runtime_get_noresume(&pdev->dev);	1805	mutex_unlock(&platform->railgate_lock);
		1806	pm_runtime_get_sync(&pdev->dev);
		1807	mutex_lock(&platform->railgate_lock);
1802		1808
1803	/* check and wait until GPU is idle (with a timeout) */	1809	/* check and wait until GPU is idle (with a timeout) */
1804	pm_runtime_barrier(&pdev->dev);
1805
1806	do {	1810	do {
1807	msleep(1);	1811	msleep(1);
1808	ref_cnt = atomic_read(&pdev->dev.power.usage_count);	1812	ref_cnt = atomic_read(&pdev->dev.power.usage_count);
@@ -1811,19 +1815,21 @@ int __gk20a_do_idle(struct platform_device *pdev, bool force_reset)
1811	if (ref_cnt != 1) {	1815	if (ref_cnt != 1) {
1812	gk20a_err(&pdev->dev, "failed to idle - refcount %d != 1\n",	1816	gk20a_err(&pdev->dev, "failed to idle - refcount %d != 1\n",
1813	ref_cnt);	1817	ref_cnt);
1814	goto fail;	1818	goto fail_drop_usage_count;
1815	}	1819	}
1816		1820
1817	/*	1821	/* check if global force_reset flag is set */
1818	* if GPU is now idle, we will have only one ref count	1822	force_reset |= platform->force_reset_in_do_idle;
1819	* drop this ref which will rail gate the GPU (if GPU
1820	* railgate is supported)
1821	* if GPU railgate is not supported then we need to
1822	* explicitly reset it
1823	*/
1824	pm_runtime_put_sync(&pdev->dev);
1825		1823
1826	if (platform->can_railgate && !force_reset) {	1824	if (platform->can_railgate && !force_reset) {
		1825	/*
		1826	* Case 1 : GPU railgate is supported
		1827	*
		1828	* if GPU is now idle, we will have only one ref count,
		1829	* drop this ref which will rail gate the GPU
		1830	*/
		1831	pm_runtime_put_sync(&pdev->dev);
		1832
1827	/* add sufficient delay to allow GPU to rail gate */	1833	/* add sufficient delay to allow GPU to rail gate */
1828	msleep(platform->railgate_delay);	1834	msleep(platform->railgate_delay);
1829		1835
@@ -1842,12 +1848,24 @@ int __gk20a_do_idle(struct platform_device *pdev, bool force_reset)
1842	goto fail_timeout;	1848	goto fail_timeout;
1843	}	1849	}
1844	} else {	1850	} else {
		1851	/*
		1852	* Case 2 : GPU railgate is not supported or we explicitly
		1853	* do not want to do railgate
		1854	*
		1855	* if GPU is now idle, call prepare_poweroff() to save the
		1856	* state and then assert the reset
		1857	*
		1858	* __gk20a_do_unidle() needs to deassert reset, call
		1859	* finalize_poweron(), and then call pm_runtime_put_sync()
		1860	* to balance the GPU usage counter
		1861	*/
1845	if (!platform->reset_assert || !platform->reset_deassert)	1862	if (!platform->reset_assert || !platform->reset_deassert)
1846	goto fail_timeout;	1863	goto fail_drop_usage_count;
1847		1864
1848	pm_runtime_get_sync(&pdev->dev);	1865	/* Save the GPU state */
1849	gk20a_pm_prepare_poweroff(&pdev->dev);	1866	gk20a_pm_prepare_poweroff(&pdev->dev);
1850		1867
		1868	/* assert GPU reset */
1851	platform->reset_assert(pdev);	1869	platform->reset_assert(pdev);
1852		1870
1853	udelay(10);	1871	udelay(10);
@@ -1856,7 +1874,7 @@ int __gk20a_do_idle(struct platform_device *pdev, bool force_reset)
1856	return 0;	1874	return 0;
1857	}	1875	}
1858		1876
1859	fail:	1877	fail_drop_usage_count:
1860	pm_runtime_put_noidle(&pdev->dev);	1878	pm_runtime_put_noidle(&pdev->dev);
1861	fail_timeout:	1879	fail_timeout:
1862	mutex_unlock(&platform->railgate_lock);	1880	mutex_unlock(&platform->railgate_lock);
@@ -1892,9 +1910,16 @@ int __gk20a_do_unidle(struct platform_device *pdev)
1892	struct gk20a_platform *platform = dev_get_drvdata(&pdev->dev);	1910	struct gk20a_platform *platform = dev_get_drvdata(&pdev->dev);
1893		1911
1894	if (g->forced_reset) {	1912	if (g->forced_reset) {
		1913	/*
		1914	* If we did a reset (and not railgate),
		1915	* then deassert the GPU reset here first
		1916	*/
1895	platform->reset_deassert(pdev);	1917	platform->reset_deassert(pdev);
1896		1918
		1919	/* restore the GPU state */
1897	gk20a_pm_finalize_poweron(&pdev->dev);	1920	gk20a_pm_finalize_poweron(&pdev->dev);
		1921
		1922	/* balance GPU usage counter */
1898	pm_runtime_put_sync(&pdev->dev);	1923	pm_runtime_put_sync(&pdev->dev);
1899		1924
1900	g->forced_reset = false;	1925	g->forced_reset = false;