aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-01-20 14:51:46 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2015-01-20 14:51:46 -0500
commitd4b2d0061d76e43f614e23eae3017f43a1a7c6c1 (patch)
treee33fcf87626f3c5f35c5ae4e986f1eaf3221a47c /kernel
parent06efe0e54018cb19cf0807447dc3ac747ffcfd1c (diff)
parent29187a9eeaf362d8422e62e17a22a6e115277a49 (diff)
Merge branch 'for-3.19-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq
Pull workqueue fix from Tejun Heo: "The xfs folks have been running into weird and very rare lockups for some time now. I didn't think this could have been from workqueue side because no one else was reporting it. This time, Eric had a kdump which we looked into and it turned out this actually was a workqueue bug and the bug has been there since the beginning of concurrency managed workqueue. A worker pool ensures forward progress of the workqueues associated with it by always having at least one worker reserved from executing work items. When the pool is under contention, the idle one tries to create more workers for the pool and if that doesn't succeed quickly enough, it calls the rescuers to the pool. This logic had a subtle race condition in an early exit path. When a worker invokes this manager function, the function may return %false indicating that the caller may proceed to executing work items either because another worker is already performing the role or conditions have changed and the pool is no longer under contention. The latter part depended on the assumption that whether more workers are necessary or not remains stable while the pool is locked; however, pool->nr_running (concurrency count) may change asynchronously and it getting bumped from zero asynchronously could send off the last idle worker to execute work items. The race window is fairly narrow, and, even when it gets triggered, the pool deadlocks iff if all work items get blocked on pending work items of the pool, which is highly unlikely but can be triggered by xfs. The patch removes the race window by removing the early exit path, which doesn't server any purpose anymore anyway" * 'for-3.19-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq: workqueue: fix subtle pool management issue which can stall whole worker_pool
Diffstat (limited to 'kernel')
-rw-r--r--kernel/workqueue.c25
1 files changed, 8 insertions, 17 deletions
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6202b08f1933..beeeac9e0e3e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1841,17 +1841,11 @@ static void pool_mayday_timeout(unsigned long __pool)
1841 * spin_lock_irq(pool->lock) which may be released and regrabbed 1841 * spin_lock_irq(pool->lock) which may be released and regrabbed
1842 * multiple times. Does GFP_KERNEL allocations. Called only from 1842 * multiple times. Does GFP_KERNEL allocations. Called only from
1843 * manager. 1843 * manager.
1844 *
1845 * Return:
1846 * %false if no action was taken and pool->lock stayed locked, %true
1847 * otherwise.
1848 */ 1844 */
1849static bool maybe_create_worker(struct worker_pool *pool) 1845static void maybe_create_worker(struct worker_pool *pool)
1850__releases(&pool->lock) 1846__releases(&pool->lock)
1851__acquires(&pool->lock) 1847__acquires(&pool->lock)
1852{ 1848{
1853 if (!need_to_create_worker(pool))
1854 return false;
1855restart: 1849restart:
1856 spin_unlock_irq(&pool->lock); 1850 spin_unlock_irq(&pool->lock);
1857 1851
@@ -1877,7 +1871,6 @@ restart:
1877 */ 1871 */
1878 if (need_to_create_worker(pool)) 1872 if (need_to_create_worker(pool))
1879 goto restart; 1873 goto restart;
1880 return true;
1881} 1874}
1882 1875
1883/** 1876/**
@@ -1897,16 +1890,14 @@ restart:
1897 * multiple times. Does GFP_KERNEL allocations. 1890 * multiple times. Does GFP_KERNEL allocations.
1898 * 1891 *
1899 * Return: 1892 * Return:
1900 * %false if the pool don't need management and the caller can safely start 1893 * %false if the pool doesn't need management and the caller can safely
1901 * processing works, %true indicates that the function released pool->lock 1894 * start processing works, %true if management function was performed and
1902 * and reacquired it to perform some management function and that the 1895 * the conditions that the caller verified before calling the function may
1903 * conditions that the caller verified while holding the lock before 1896 * no longer be true.
1904 * calling the function might no longer be true.
1905 */ 1897 */
1906static bool manage_workers(struct worker *worker) 1898static bool manage_workers(struct worker *worker)
1907{ 1899{
1908 struct worker_pool *pool = worker->pool; 1900 struct worker_pool *pool = worker->pool;
1909 bool ret = false;
1910 1901
1911 /* 1902 /*
1912 * Anyone who successfully grabs manager_arb wins the arbitration 1903 * Anyone who successfully grabs manager_arb wins the arbitration
@@ -1919,12 +1910,12 @@ static bool manage_workers(struct worker *worker)
1919 * actual management, the pool may stall indefinitely. 1910 * actual management, the pool may stall indefinitely.
1920 */ 1911 */
1921 if (!mutex_trylock(&pool->manager_arb)) 1912 if (!mutex_trylock(&pool->manager_arb))
1922 return ret; 1913 return false;
1923 1914
1924 ret |= maybe_create_worker(pool); 1915 maybe_create_worker(pool);
1925 1916
1926 mutex_unlock(&pool->manager_arb); 1917 mutex_unlock(&pool->manager_arb);
1927 return ret; 1918 return true;
1928} 1919}
1929 1920
1930/** 1921/**