Diffstat (limited to 'kernel/workqueue.c')
-rw-r--r--  kernel/workqueue.c | 94
1 file changed, 67 insertions, 27 deletions
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0b72e816b8d0..29b79852a845 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -16,9 +16,10 @@
  *
  * This is the generic async execution mechanism.  Work items as are
  * executed in process context.  The worker pool is shared and
- * automatically managed.  There is one worker pool for each CPU and
- * one extra for works which are better served by workers which are
- * not bound to any specific CPU.
+ * automatically managed.  There are two worker pools for each CPU (one for
+ * normal work items and the other for high priority ones) and some extra
+ * pools for workqueues which are not bound to any specific CPU - the
+ * number of these backing pools is dynamic.
  *
  * Please read Documentation/workqueue.txt for details.
  */
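As a rough illustration of the pool layout the updated comment describes (not part of this patch; the workqueue names and the init function below are hypothetical), a user selects between the per-CPU pools and the dynamically managed unbound pools purely through alloc_workqueue() flags:

#include <linux/errno.h>
#include <linux/workqueue.h>

static int example_init(void)
{
        struct workqueue_struct *percpu_wq, *hipri_wq, *unbound_wq;

        percpu_wq  = alloc_workqueue("ex_percpu", 0, 0);           /* normal per-CPU pools */
        hipri_wq   = alloc_workqueue("ex_hipri", WQ_HIGHPRI, 0);   /* high priority per-CPU pools */
        unbound_wq = alloc_workqueue("ex_unbound", WQ_UNBOUND, 0); /* dynamically created backing pools */

        return (percpu_wq && hipri_wq && unbound_wq) ? 0 : -ENOMEM;
}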
@@ -2033,8 +2034,11 @@ static bool maybe_destroy_workers(struct worker_pool *pool)
  * multiple times.  Does GFP_KERNEL allocations.
  *
  * RETURNS:
- * spin_lock_irq(pool->lock) which may be released and regrabbed
- * multiple times.  Does GFP_KERNEL allocations.
+ * %false if the pool don't need management and the caller can safely start
+ * processing works, %true indicates that the function released pool->lock
+ * and reacquired it to perform some management function and that the
+ * conditions that the caller verified while holding the lock before
+ * calling the function might no longer be true.
  */
 static bool manage_workers(struct worker *worker)
 {
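The new RETURNS wording matches how worker_thread(), the caller, already uses the return value. Roughly paraphrased (a fragment, not self-contained; the helpers are local to this file):

recheck:
        /* pool->lock is held here */
        if (!need_more_worker(pool))
                goto sleep;

        /*
         * manage_workers() may have dropped and retaken pool->lock; a %true
         * return means the checks above may be stale, so start over.
         */
        if (unlikely(!may_start_working(pool)) && manage_workers(worker))
                goto recheck;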
@@ -2201,6 +2205,15 @@ __acquires(&pool->lock)
                dump_stack();
        }
 
+       /*
+        * The following prevents a kworker from hogging CPU on !PREEMPT
+        * kernels, where a requeueing work item waiting for something to
+        * happen could deadlock with stop_machine as such work item could
+        * indefinitely requeue itself while all other CPUs are trapped in
+        * stop_machine.
+        */
+       cond_resched();
+
        spin_lock_irq(&pool->lock);
 
        /* clear cpu intensive status */
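A hedged sketch of the pattern the new comment is guarding against (the flag and the work item below are hypothetical): a work item that keeps requeueing itself while polling would, on a !PREEMPT kernel, never let its kworker schedule without the cond_resched() added above.

#include <linux/atomic.h>
#include <linux/workqueue.h>

static atomic_t ready;          /* hypothetical readiness flag set elsewhere */

static void poll_fn(struct work_struct *work)
{
        if (!atomic_read(&ready))
                /* requeue and poll again; without cond_resched() in the
                 * worker loop this can hog the CPU on !PREEMPT kernels */
                queue_work(system_wq, work);
}

static DECLARE_WORK(poll_work, poll_fn);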
@@ -2817,6 +2830,19 @@ already_gone:
        return false;
 }
 
+static bool __flush_work(struct work_struct *work)
+{
+       struct wq_barrier barr;
+
+       if (start_flush_work(work, &barr)) {
+               wait_for_completion(&barr.done);
+               destroy_work_on_stack(&barr.work);
+               return true;
+       } else {
+               return false;
+       }
+}
+
 /**
  * flush_work - wait for a work to finish executing the last queueing instance
  * @work: the work to flush
@@ -2830,18 +2856,10 @@ already_gone:
  */
 bool flush_work(struct work_struct *work)
 {
-       struct wq_barrier barr;
-
        lock_map_acquire(&work->lockdep_map);
        lock_map_release(&work->lockdep_map);
 
-       if (start_flush_work(work, &barr)) {
-               wait_for_completion(&barr.done);
-               destroy_work_on_stack(&barr.work);
-               return true;
-       } else {
-               return false;
-       }
+       return __flush_work(work);
 }
 EXPORT_SYMBOL_GPL(flush_work);
 
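The split keeps the lockdep annotation in the exported flush_work() while letting in-kernel callers wait on the barrier without it. A minimal usage sketch of the exported API (the work item and functions below are hypothetical, not from this patch):

#include <linux/printk.h>
#include <linux/workqueue.h>

static void update_fn(struct work_struct *work) { }    /* hypothetical */
static DECLARE_WORK(update_work, update_fn);

static void teardown_example(void)
{
        /*
         * Wait for the last queued instance of update_work to finish.
         * flush_work() returns true if it actually had to wait.
         */
        if (flush_work(&update_work))
                pr_debug("update_work was still pending or running\n");
}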
@@ -3081,25 +3099,26 @@ static struct workqueue_struct *dev_to_wq(struct device *dev)
        return wq_dev->wq;
 }
 
-static ssize_t wq_per_cpu_show(struct device *dev,
-                              struct device_attribute *attr, char *buf)
+static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
+                           char *buf)
 {
        struct workqueue_struct *wq = dev_to_wq(dev);
 
        return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
 }
+static DEVICE_ATTR_RO(per_cpu);
 
-static ssize_t wq_max_active_show(struct device *dev,
-                                 struct device_attribute *attr, char *buf)
+static ssize_t max_active_show(struct device *dev,
+                              struct device_attribute *attr, char *buf)
 {
        struct workqueue_struct *wq = dev_to_wq(dev);
 
        return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
 }
 
-static ssize_t wq_max_active_store(struct device *dev,
-                                  struct device_attribute *attr,
-                                  const char *buf, size_t count)
+static ssize_t max_active_store(struct device *dev,
+                               struct device_attribute *attr, const char *buf,
+                               size_t count)
 {
        struct workqueue_struct *wq = dev_to_wq(dev);
        int val;
@@ -3110,12 +3129,14 @@ static ssize_t wq_max_active_store(struct device *dev,
        workqueue_set_max_active(wq, val);
        return count;
 }
+static DEVICE_ATTR_RW(max_active);
 
-static struct device_attribute wq_sysfs_attrs[] = {
-       __ATTR(per_cpu, 0444, wq_per_cpu_show, NULL),
-       __ATTR(max_active, 0644, wq_max_active_show, wq_max_active_store),
-       __ATTR_NULL,
+static struct attribute *wq_sysfs_attrs[] = {
+       &dev_attr_per_cpu.attr,
+       &dev_attr_max_active.attr,
+       NULL,
 };
+ATTRIBUTE_GROUPS(wq_sysfs);
 
 static ssize_t wq_pool_ids_show(struct device *dev,
                                struct device_attribute *attr, char *buf)
@@ -3265,7 +3286,7 @@ static struct device_attribute wq_sysfs_unbound_attrs[] = {
 
 static struct bus_type wq_subsys = {
        .name                           = "workqueue",
-       .dev_attrs                      = wq_sysfs_attrs,
+       .dev_groups                     = wq_sysfs_groups,
 };
 
 static int __init wq_sysfs_init(void)
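The conversion relies on the naming conventions of these macros: DEVICE_ATTR_RO(per_cpu) expects per_cpu_show() and emits a struct device_attribute named dev_attr_per_cpu, and ATTRIBUTE_GROUPS(wq_sysfs) turns the wq_sysfs_attrs[] array into the wq_sysfs_groups pointer consumed by .dev_groups. A hedged sketch of the same pattern with a made-up attribute name:

#include <linux/device.h>
#include <linux/kernel.h>

/* "example" is hypothetical; the pattern mirrors per_cpu/max_active above */
static ssize_t example_show(struct device *dev, struct device_attribute *attr,
                            char *buf)
{
        return scnprintf(buf, PAGE_SIZE, "%d\n", 1);
}
static DEVICE_ATTR_RO(example);         /* defines dev_attr_example */

static struct attribute *example_attrs[] = {
        &dev_attr_example.attr,
        NULL,
};
ATTRIBUTE_GROUPS(example);              /* defines example_groups for .dev_groups */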
@@ -3411,6 +3432,12 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
 {
        to->nice = from->nice;
        cpumask_copy(to->cpumask, from->cpumask);
+       /*
+        * Unlike hash and equality test, this function doesn't ignore
+        * ->no_numa as it is used for both pool and wq attrs.  Instead,
+        * get_unbound_pool() explicitly clears ->no_numa after copying.
+        */
+       to->no_numa = from->no_numa;
 }
 
 /* hash value of the content of @attr */
@@ -3578,6 +3605,12 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
        lockdep_set_subclass(&pool->lock, 1);   /* see put_pwq() */
        copy_workqueue_attrs(pool->attrs, attrs);
 
+       /*
+        * no_numa isn't a worker_pool attribute, always clear it.  See
+        * 'struct workqueue_attrs' comments for detail.
+        */
+       pool->attrs->no_numa = false;
+
        /* if cpumask is contained inside a NUMA node, we belong to that node */
        if (wq_numa_enabled) {
                for_each_node(node) {
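For context on why clearing ->no_numa here is safe: the hash and equality test the comment refers to ignore that field, so two attrs differing only in ->no_numa already map to the same backing pool. A simplified sketch of that comparison (illustrative only; the real wqattrs_equal() lives elsewhere in this file):

#include <linux/cpumask.h>
#include <linux/workqueue.h>

static bool wqattrs_equal_sketch(const struct workqueue_attrs *a,
                                 const struct workqueue_attrs *b)
{
        if (a->nice != b->nice)
                return false;
        if (!cpumask_equal(a->cpumask, b->cpumask))
                return false;
        return true;    /* ->no_numa is deliberately not compared */
}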
@@ -4756,7 +4789,14 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
 
        INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
        schedule_work_on(cpu, &wfc.work);
-       flush_work(&wfc.work);
+
+       /*
+        * The work item is on-stack and can't lead to deadlock through
+        * flushing.  Use __flush_work() to avoid spurious lockdep warnings
+        * when work_on_cpu()s are nested.
+        */
+       __flush_work(&wfc.work);
+
        return wfc.ret;
 }
 EXPORT_SYMBOL_GPL(work_on_cpu);
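The nesting the comment refers to, sketched with hypothetical callers: fn() running on one CPU itself calls work_on_cpu() for another, so two on-stack work items sharing one lockdep class get flushed back to back, which used to trigger a false-positive deadlock report.

#include <linux/cpumask.h>
#include <linux/workqueue.h>

/* hypothetical nested callers, illustrative only */
static long inner_fn(void *arg)
{
        return 0;
}

static long outer_fn(void *arg)
{
        /* nested work_on_cpu(): both on-stack work items share a lockdep class */
        return work_on_cpu(cpumask_any(cpu_online_mask), inner_fn, NULL);
}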