summaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
authorDeepak Nibade <dnibade@nvidia.com>2015-03-19 09:29:11 -0400
committerIshan Mittal <imittal@nvidia.com>2015-05-18 01:48:24 -0400
commitc5f2d00d04f5048e1414f1a2cbe702026528b4db (patch)
treea1c29cd29b0f393eb9ca8ed7402d869b209af680 /drivers
parent900f63393d80cc1978c5758a368cf1edb3d267ae (diff)
gpu: nvgpu: fix deadlock on railgate_lock during race condition
We have the following race condition between __gk20a_do_idle() and the force_reset case:
- before __gk20a_do_idle() executes, a process drops the last usage count of the GPU, which triggers the GPU railgate process
- but before the GPU is actually railgated (there is a 500 ms delay), some process calls __gk20a_do_idle()
- in __gk20a_do_idle(), we first take railgate_lock
- then we check whether the GPU is already railgated
- since it is not railgated yet (due to the 500 ms delay), this check returns false
- then we call pm_runtime_get_noresume(), which only increments the usage counter
- in this particular case, the call just raises the usage count from 0 to 1, even though the GPU is already on its way to being railgated
- while we wait for the GPU usage count to drop to one, the GPU gets railgated
- now, in the force_reset=true case, we end up calling pm_runtime_get_sync(), which takes railgate_lock _again_ and tries to unrailgate the GPU
- this causes a deadlock on railgate_lock

To fix this, use the following sequence:
- take railgate_lock
- check if the GPU is already railgated
- release railgate_lock
- call pm_runtime_get_sync(), which keeps the GPU active even if railgating has already been triggered
- take railgate_lock again to prevent an unrailgate later in the process

Also, add more descriptive comments to explain the flow.

Bug 1624537
Change-Id: I0febc65d7bfac03ee738be200cf321322ffbe5a6
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: http://git-master/r/719625
(cherry picked from commit 480284eda16e2b50ee6368bad3d15574e098b231)
Reviewed-on: http://git-master/r/719620
Reviewed-by: Sachin Nikam <snikam@nvidia.com>
Diffstat (limited to 'drivers')
-rw-r--r--drivers/gpu/nvgpu/gk20a/gk20a.c63
1 files changed, 44 insertions, 19 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c
index eb6774da..6add8441 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -1774,6 +1774,10 @@ void gk20a_reset(struct gk20a *g, u32 units)
1774 * __gk20a_do_idle() - force the GPU to idle and railgate 1774 * __gk20a_do_idle() - force the GPU to idle and railgate
1775 * 1775 *
1776 * In success, this call MUST be balanced by caller with __gk20a_do_unidle() 1776 * In success, this call MUST be balanced by caller with __gk20a_do_unidle()
1777 *
1778 * Acquires two locks : &g->busy_lock and &platform->railgate_lock
1779 * In success, we hold these locks and return
1780 * In failure, we release these locks and return
1777 */ 1781 */
1778int __gk20a_do_idle(struct platform_device *pdev, bool force_reset) 1782int __gk20a_do_idle(struct platform_device *pdev, bool force_reset)
1779{ 1783{
@@ -1794,15 +1798,15 @@ int __gk20a_do_idle(struct platform_device *pdev, bool force_reset)
1794 if (platform->is_railgated(pdev)) 1798 if (platform->is_railgated(pdev))
1795 return 0; 1799 return 0;
1796 1800
1797 /* check if global force_reset flag is set */ 1801 /*
1798 force_reset |= platform->force_reset_in_do_idle; 1802 * release railgate_lock, prevent suspend by incrementing usage counter,
1799 1803 * re-acquire railgate_lock
1800 /* prevent suspend by incrementing usage counter */ 1804 */
1801 pm_runtime_get_noresume(&pdev->dev); 1805 mutex_unlock(&platform->railgate_lock);
1806 pm_runtime_get_sync(&pdev->dev);
1807 mutex_lock(&platform->railgate_lock);
1802 1808
1803 /* check and wait until GPU is idle (with a timeout) */ 1809 /* check and wait until GPU is idle (with a timeout) */
1804 pm_runtime_barrier(&pdev->dev);
1805
1806 do { 1810 do {
1807 msleep(1); 1811 msleep(1);
1808 ref_cnt = atomic_read(&pdev->dev.power.usage_count); 1812 ref_cnt = atomic_read(&pdev->dev.power.usage_count);
@@ -1811,19 +1815,21 @@ int __gk20a_do_idle(struct platform_device *pdev, bool force_reset)
1811 if (ref_cnt != 1) { 1815 if (ref_cnt != 1) {
1812 gk20a_err(&pdev->dev, "failed to idle - refcount %d != 1\n", 1816 gk20a_err(&pdev->dev, "failed to idle - refcount %d != 1\n",
1813 ref_cnt); 1817 ref_cnt);
1814 goto fail; 1818 goto fail_drop_usage_count;
1815 } 1819 }
1816 1820
1817 /* 1821 /* check if global force_reset flag is set */
1818 * if GPU is now idle, we will have only one ref count 1822 force_reset |= platform->force_reset_in_do_idle;
1819 * drop this ref which will rail gate the GPU (if GPU
1820 * railgate is supported)
1821 * if GPU railgate is not supported then we need to
1822 * explicitly reset it
1823 */
1824 pm_runtime_put_sync(&pdev->dev);
1825 1823
1826 if (platform->can_railgate && !force_reset) { 1824 if (platform->can_railgate && !force_reset) {
1825 /*
1826 * Case 1 : GPU railgate is supported
1827 *
1828 * if GPU is now idle, we will have only one ref count,
1829 * drop this ref which will rail gate the GPU
1830 */
1831 pm_runtime_put_sync(&pdev->dev);
1832
1827 /* add sufficient delay to allow GPU to rail gate */ 1833 /* add sufficient delay to allow GPU to rail gate */
1828 msleep(platform->railgate_delay); 1834 msleep(platform->railgate_delay);
1829 1835
@@ -1842,12 +1848,24 @@ int __gk20a_do_idle(struct platform_device *pdev, bool force_reset)
1842 goto fail_timeout; 1848 goto fail_timeout;
1843 } 1849 }
1844 } else { 1850 } else {
1851 /*
1852 * Case 2 : GPU railgate is not supported or we explicitly
1853 * do not want to do railgate
1854 *
1855 * if GPU is now idle, call prepare_poweroff() to save the
1856 * state and then assert the reset
1857 *
1858 * __gk20a_do_unidle() needs to deassert reset, call
1859 * finalize_poweron(), and then call pm_runtime_put_sync()
1860 * to balance the GPU usage counter
1861 */
1845 if (!platform->reset_assert || !platform->reset_deassert) 1862 if (!platform->reset_assert || !platform->reset_deassert)
1846 goto fail_timeout; 1863 goto fail_drop_usage_count;
1847 1864
1848 pm_runtime_get_sync(&pdev->dev); 1865 /* Save the GPU state */
1849 gk20a_pm_prepare_poweroff(&pdev->dev); 1866 gk20a_pm_prepare_poweroff(&pdev->dev);
1850 1867
1868 /* assert GPU reset */
1851 platform->reset_assert(pdev); 1869 platform->reset_assert(pdev);
1852 1870
1853 udelay(10); 1871 udelay(10);
@@ -1856,7 +1874,7 @@ int __gk20a_do_idle(struct platform_device *pdev, bool force_reset)
1856 return 0; 1874 return 0;
1857 } 1875 }
1858 1876
1859fail: 1877fail_drop_usage_count:
1860 pm_runtime_put_noidle(&pdev->dev); 1878 pm_runtime_put_noidle(&pdev->dev);
1861fail_timeout: 1879fail_timeout:
1862 mutex_unlock(&platform->railgate_lock); 1880 mutex_unlock(&platform->railgate_lock);
@@ -1892,9 +1910,16 @@ int __gk20a_do_unidle(struct platform_device *pdev)
1892 struct gk20a_platform *platform = dev_get_drvdata(&pdev->dev); 1910 struct gk20a_platform *platform = dev_get_drvdata(&pdev->dev);
1893 1911
1894 if (g->forced_reset) { 1912 if (g->forced_reset) {
1913 /*
1914 * If we did a reset (and not railgate),
1915 * then deassert the GPU reset here first
1916 */
1895 platform->reset_deassert(pdev); 1917 platform->reset_deassert(pdev);
1896 1918
1919 /* restore the GPU state */
1897 gk20a_pm_finalize_poweron(&pdev->dev); 1920 gk20a_pm_finalize_poweron(&pdev->dev);
1921
1922 /* balance GPU usage counter */
1898 pm_runtime_put_sync(&pdev->dev); 1923 pm_runtime_put_sync(&pdev->dev);
1899 1924
1900 g->forced_reset = false; 1925 g->forced_reset = false;