 kernel/workqueue.c | 278 ++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 259 insertions(+), 19 deletions(-)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index d9a4aeb844d5..57cd77de4a4f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -45,6 +45,7 @@
 #include <linux/hashtable.h>
 #include <linux/rculist.h>
 #include <linux/nodemask.h>
+#include <linux/moduleparam.h>
 
 #include "workqueue_internal.h"
 
@@ -245,6 +246,7 @@ struct workqueue_struct {
         int                     saved_max_active; /* WQ: saved pwq max_active */
 
         struct workqueue_attrs  *unbound_attrs; /* WQ: only for unbound wqs */
+        struct pool_workqueue   *dfl_pwq;       /* WQ: only for unbound wqs */
 
 #ifdef CONFIG_SYSFS
         struct wq_device        *wq_dev;        /* I: for sysfs interface */
@@ -268,6 +270,9 @@ static cpumask_var_t *wq_numa_possible_cpumask;
268 270
269static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ 271static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
270 272
273/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
274static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
275
271static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ 276static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
272static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ 277static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
273 278
@@ -3710,6 +3715,61 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
         return pwq;
 }
 
+/* undo alloc_unbound_pwq(), used only in the error path */
+static void free_unbound_pwq(struct pool_workqueue *pwq)
+{
+        lockdep_assert_held(&wq_pool_mutex);
+
+        if (pwq) {
+                put_unbound_pool(pwq->pool);
+                kfree(pwq);
+        }
+}
+
+/**
+ * wq_calc_node_cpumask - calculate a wq_attrs' cpumask for the specified node
+ * @attrs: the wq_attrs of interest
+ * @node: the target NUMA node
+ * @cpu_going_down: if >= 0, the CPU to consider as offline
+ * @cpumask: outarg, the resulting cpumask
+ *
+ * Calculate the cpumask a workqueue with @attrs should use on @node.  If
+ * @cpu_going_down is >= 0, that cpu is considered offline during
+ * calculation.  The result is stored in @cpumask.  This function returns
+ * %true if the resulting @cpumask is different from @attrs->cpumask,
+ * %false if equal.
+ *
+ * If NUMA affinity is not enabled, @attrs->cpumask is always used.  If
+ * enabled and @node has online CPUs requested by @attrs, the returned
+ * cpumask is the intersection of the possible CPUs of @node and
+ * @attrs->cpumask.
+ *
+ * The caller is responsible for ensuring that the cpumask of @node stays
+ * stable.
+ */
+static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
+                                 int cpu_going_down, cpumask_t *cpumask)
+{
+        if (!wq_numa_enabled)
+                goto use_dfl;
+
+        /* does @node have any online CPUs @attrs wants? */
+        cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask);
+        if (cpu_going_down >= 0)
+                cpumask_clear_cpu(cpu_going_down, cpumask);
+
+        if (cpumask_empty(cpumask))
+                goto use_dfl;
+
+        /* yeap, return possible CPUs in @node that @attrs wants */
+        cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
+        return !cpumask_equal(cpumask, attrs->cpumask);
+
+use_dfl:
+        cpumask_copy(cpumask, attrs->cpumask);
+        return false;
+}
+
 /* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */
 static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
                                                    int node,
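The cpumask helper added above boils down to a small mask calculation: intersect the node's CPUs with the requested mask, fall back to the full mask when NUMA handling is off or the node has no usable CPU, and report whether the node-local result differs. Below is a minimal standalone model of that decision in userspace C, using 64-bit bitmasks in place of cpumask_t; the function and parameter names are invented for illustration and are not kernel API.

/*
 * Standalone model of the wq_calc_node_cpumask() decision above, using
 * plain 64-bit bitmasks instead of cpumask_t.  Illustration only.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* returns true and writes a node-local mask when it differs from @req */
static bool calc_node_mask(uint64_t req, uint64_t node_online,
                           uint64_t node_possible, int cpu_going_down,
                           bool numa_enabled, uint64_t *out)
{
        uint64_t online = req & node_online;

        if (cpu_going_down >= 0)
                online &= ~(1ULL << cpu_going_down);

        if (!numa_enabled || !online) {
                *out = req;             /* fall back to the full cpumask */
                return false;
        }

        *out = req & node_possible;     /* possible CPUs of the node */
        return *out != req;
}

int main(void)
{
        uint64_t out;
        /* request CPUs 0-7; node owns CPUs 4-7, CPU 4 is going down */
        bool differs = calc_node_mask(0xffULL, 0xf0ULL, 0xf0ULL, 4, true, &out);

        printf("differs=%d mask=%#llx\n", differs, (unsigned long long)out);
        return 0;
}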
@@ -3732,11 +3792,12 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
  * @wq: the target workqueue
  * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
  *
- * Apply @attrs to an unbound workqueue @wq.  If @attrs doesn't match the
- * current attributes, a new pwq is created and made the first pwq which
- * will serve all new work items.  Older pwqs are released as in-flight
- * work items finish.  Note that a work item which repeatedly requeues
- * itself back-to-back will stay on its current pwq.
+ * Apply @attrs to an unbound workqueue @wq.  Unless disabled, on NUMA
+ * machines, this function maps a separate pwq to each NUMA node with
+ * possible CPUs in @attrs->cpumask so that work items are affine to the
+ * NUMA node it was issued on.  Older pwqs are released as in-flight work
+ * items finish.  Note that a work item which repeatedly requeues itself
+ * back-to-back will stay on its current pwq.
  *
  * Performs GFP_KERNEL allocations.  Returns 0 on success and -errno on
  * failure.
@@ -3744,8 +3805,8 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
 int apply_workqueue_attrs(struct workqueue_struct *wq,
                           const struct workqueue_attrs *attrs)
 {
-        struct workqueue_attrs *new_attrs;
-        struct pool_workqueue *pwq, *last_pwq = NULL;
+        struct workqueue_attrs *new_attrs, *tmp_attrs;
+        struct pool_workqueue **pwq_tbl, *dfl_pwq;
         int node, ret;
 
         /* only unbound workqueues can change attributes */
@@ -3756,40 +3817,191 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
         if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
                 return -EINVAL;
 
-        /* make a copy of @attrs and sanitize it */
+        pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL);
         new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
-        if (!new_attrs)
+        tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
+        if (!pwq_tbl || !new_attrs || !tmp_attrs)
                 goto enomem;
 
+        /* make a copy of @attrs and sanitize it */
         copy_workqueue_attrs(new_attrs, attrs);
         cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
 
+        /*
+         * We may create multiple pwqs with differing cpumasks.  Make a
+         * copy of @new_attrs which will be modified and used to obtain
+         * pools.
+         */
+        copy_workqueue_attrs(tmp_attrs, new_attrs);
+
+        /*
+         * CPUs should stay stable across pwq creations and installations.
+         * Pin CPUs, determine the target cpumask for each node and create
+         * pwqs accordingly.
+         */
+        get_online_cpus();
+
         mutex_lock(&wq_pool_mutex);
-        pwq = alloc_unbound_pwq(wq, new_attrs);
+
+        /*
+         * If something goes wrong during CPU up/down, we'll fall back to
+         * the default pwq covering whole @attrs->cpumask.  Always create
+         * it even if we don't use it immediately.
+         */
+        dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
+        if (!dfl_pwq)
+                goto enomem_pwq;
+
+        for_each_node(node) {
+                if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
+                        pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
+                        if (!pwq_tbl[node])
+                                goto enomem_pwq;
+                } else {
+                        dfl_pwq->refcnt++;
+                        pwq_tbl[node] = dfl_pwq;
+                }
+        }
+
         mutex_unlock(&wq_pool_mutex);
-        if (!pwq)
-                goto enomem;
 
+        /* all pwqs have been created successfully, let's install'em */
         mutex_lock(&wq->mutex);
 
         copy_workqueue_attrs(wq->unbound_attrs, new_attrs);
+
+        /* save the previous pwq and install the new one */
         for_each_node(node)
-                last_pwq = numa_pwq_tbl_install(wq, node, pwq);
+                pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]);
+
+        /* @dfl_pwq might not have been used, ensure it's linked */
+        link_pwq(dfl_pwq);
+        swap(wq->dfl_pwq, dfl_pwq);
 
         mutex_unlock(&wq->mutex);
 
-        put_pwq_unlocked(last_pwq);
+        /* put the old pwqs */
+        for_each_node(node)
+                put_pwq_unlocked(pwq_tbl[node]);
+        put_pwq_unlocked(dfl_pwq);
+
+        put_online_cpus();
         ret = 0;
         /* fall through */
 out_free:
+        free_workqueue_attrs(tmp_attrs);
         free_workqueue_attrs(new_attrs);
+        kfree(pwq_tbl);
         return ret;
 
+enomem_pwq:
+        free_unbound_pwq(dfl_pwq);
+        for_each_node(node)
+                if (pwq_tbl && pwq_tbl[node] != dfl_pwq)
+                        free_unbound_pwq(pwq_tbl[node]);
+        mutex_unlock(&wq_pool_mutex);
+        put_online_cpus();
 enomem:
         ret = -ENOMEM;
         goto out_free;
 }
 
+/**
+ * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
+ * @wq: the target workqueue
+ * @cpu: the CPU coming up or going down
+ * @online: whether @cpu is coming up or going down
+ *
+ * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
+ * %CPU_DOWN_FAILED.  @cpu is being hot[un]plugged, update NUMA affinity of
+ * @wq accordingly.
+ *
+ * If NUMA affinity can't be adjusted due to memory allocation failure, it
+ * falls back to @wq->dfl_pwq which may not be optimal but is always
+ * correct.
+ *
+ * Note that when the last allowed CPU of a NUMA node goes offline for a
+ * workqueue with a cpumask spanning multiple nodes, the workers which were
+ * already executing the work items for the workqueue will lose their CPU
+ * affinity and may execute on any CPU.  This is similar to how per-cpu
+ * workqueues behave on CPU_DOWN.  If a workqueue user wants strict
+ * affinity, it's the user's responsibility to flush the work item from
+ * CPU_DOWN_PREPARE.
+ */
+static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
+                                   bool online)
+{
+        int node = cpu_to_node(cpu);
+        int cpu_off = online ? -1 : cpu;
+        struct pool_workqueue *old_pwq = NULL, *pwq;
+        struct workqueue_attrs *target_attrs;
+        cpumask_t *cpumask;
+
+        lockdep_assert_held(&wq_pool_mutex);
+
+        if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND))
+                return;
+
+        /*
+         * We don't wanna alloc/free wq_attrs for each wq for each CPU.
+         * Let's use a preallocated one.  The following buf is protected by
+         * CPU hotplug exclusion.
+         */
+        target_attrs = wq_update_unbound_numa_attrs_buf;
+        cpumask = target_attrs->cpumask;
+
+        mutex_lock(&wq->mutex);
+
+        copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
+        pwq = unbound_pwq_by_node(wq, node);
+
+        /*
+         * Let's determine what needs to be done.  If the target cpumask is
+         * different from wq's, we need to compare it to @pwq's and create
+         * a new one if they don't match.  If the target cpumask equals
+         * wq's, the default pwq should be used.  If @pwq is already the
+         * default one, nothing to do; otherwise, install the default one.
+         */
+        if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
+                if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
+                        goto out_unlock;
+        } else {
+                if (pwq == wq->dfl_pwq)
+                        goto out_unlock;
+                else
+                        goto use_dfl_pwq;
+        }
+
+        mutex_unlock(&wq->mutex);
+
+        /* create a new pwq */
+        pwq = alloc_unbound_pwq(wq, target_attrs);
+        if (!pwq) {
+                pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
+                           wq->name);
+                goto out_unlock;
+        }
+
+        /*
+         * Install the new pwq.  As this function is called only from CPU
+         * hotplug callbacks and applying a new attrs is wrapped with
+         * get/put_online_cpus(), @wq->unbound_attrs couldn't have changed
+         * in between.
+         */
+        mutex_lock(&wq->mutex);
+        old_pwq = numa_pwq_tbl_install(wq, node, pwq);
+        goto out_unlock;
+
+use_dfl_pwq:
+        spin_lock_irq(&wq->dfl_pwq->pool->lock);
+        get_pwq(wq->dfl_pwq);
+        spin_unlock_irq(&wq->dfl_pwq->pool->lock);
+        old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq);
+out_unlock:
+        mutex_unlock(&wq->mutex);
+        put_pwq_unlocked(old_pwq);
+}
+
 static int alloc_and_link_pwqs(struct workqueue_struct *wq)
 {
         bool highpri = wq->flags & WQ_HIGHPRI;
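For reference, the allocation pattern used by apply_workqueue_attrs() above — one pwq slot per node, with nodes that don't need their own pwq sharing a refcounted default entry — can be modelled in a few lines of ordinary C. This is only a sketch of the bookkeeping under invented names (struct pwq, NR_NODES, node_wants_own()); it is not the kernel's data structure.

/*
 * Userspace sketch of the per-node table with a shared, refcounted
 * default entry, mirroring the loop in apply_workqueue_attrs().
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_NODES 4

struct pwq {
        int refcnt;
        int node;               /* -1 for the default entry */
};

static struct pwq *alloc_pwq(int node)
{
        struct pwq *p = calloc(1, sizeof(*p));

        if (p) {
                p->refcnt = 1;
                p->node = node;
        }
        return p;
}

/* stand-in for wq_calc_node_cpumask() returning true */
static bool node_wants_own(int node)
{
        return node % 2 == 0;   /* pretend even nodes have local CPUs */
}

int main(void)
{
        struct pwq *tbl[NR_NODES] = { NULL };
        struct pwq *dfl = alloc_pwq(-1);
        int node;

        if (!dfl)
                return 1;

        for (node = 0; node < NR_NODES; node++) {
                if (node_wants_own(node)) {
                        tbl[node] = alloc_pwq(node);
                        if (!tbl[node])
                                return 1;       /* real code unwinds here */
                } else {
                        dfl->refcnt++;          /* share the default entry */
                        tbl[node] = dfl;
                }
        }

        for (node = 0; node < NR_NODES; node++)
                printf("node %d -> %s (refcnt %d)\n", node,
                       tbl[node] == dfl ? "dfl" : "own", tbl[node]->refcnt);
        return 0;
}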
@@ -3942,6 +4154,7 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
 void destroy_workqueue(struct workqueue_struct *wq)
 {
         struct pool_workqueue *pwq;
+        int node;
 
         /* drain it before proceeding with destruction */
         drain_workqueue(wq);
@@ -3993,11 +4206,21 @@ void destroy_workqueue(struct workqueue_struct *wq)
         } else {
                 /*
                  * We're the sole accessor of @wq at this point.  Directly
-                 * access the first pwq and put the base ref.  @wq will be
-                 * freed when the last pwq is released.
+                 * access numa_pwq_tbl[] and dfl_pwq to put the base refs.
+                 * @wq will be freed when the last pwq is released.
                  */
-                pwq = list_first_entry(&wq->pwqs, struct pool_workqueue,
-                                       pwqs_node);
+                for_each_node(node) {
+                        pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
+                        RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL);
+                        put_pwq_unlocked(pwq);
+                }
+
+                /*
+                 * Put dfl_pwq.  @wq may be freed any time after dfl_pwq is
+                 * put.  Don't access it afterwards.
+                 */
+                pwq = wq->dfl_pwq;
+                wq->dfl_pwq = NULL;
                 put_pwq_unlocked(pwq);
         }
 }
@@ -4285,6 +4508,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
 {
         int cpu = (unsigned long)hcpu;
         struct worker_pool *pool;
+        struct workqueue_struct *wq;
         int pi;
 
         switch (action & ~CPU_TASKS_FROZEN) {
@@ -4317,6 +4541,10 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
                         mutex_unlock(&pool->manager_mutex);
                 }
 
+                /* update NUMA affinity of unbound workqueues */
+                list_for_each_entry(wq, &workqueues, list)
+                        wq_update_unbound_numa(wq, cpu, true);
+
                 mutex_unlock(&wq_pool_mutex);
                 break;
         }
@@ -4333,12 +4561,21 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,
 {
         int cpu = (unsigned long)hcpu;
         struct work_struct unbind_work;
+        struct workqueue_struct *wq;
 
         switch (action & ~CPU_TASKS_FROZEN) {
         case CPU_DOWN_PREPARE:
-                /* unbinding should happen on the local CPU */
+                /* unbinding per-cpu workers should happen on the local CPU */
                 INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
                 queue_work_on(cpu, system_highpri_wq, &unbind_work);
+
+                /* update NUMA affinity of unbound workqueues */
+                mutex_lock(&wq_pool_mutex);
+                list_for_each_entry(wq, &workqueues, list)
+                        wq_update_unbound_numa(wq, cpu, false);
+                mutex_unlock(&wq_pool_mutex);
+
+                /* wait for per-cpu unbinding to finish */
                 flush_work(&unbind_work);
                 break;
         }
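Both hotplug paths above funnel into wq_update_unbound_numa(), whose core is a three-way choice: keep the installed pwq, fall back to wq->dfl_pwq, or allocate a node-local pwq. The tiny model below captures only that branch structure; the enum and function names are invented for this sketch and are not kernel code.

/*
 * Model of the decision made by wq_update_unbound_numa(): which of the
 * three outcomes applies, given the recalculated node cpumask.
 */
#include <stdbool.h>
#include <stdio.h>

enum numa_update_action {
        KEEP_CURRENT,           /* installed pwq already matches */
        USE_DEFAULT,            /* node mask equals wq mask, use dfl_pwq */
        ALLOC_NEW,              /* node needs its own pwq with a new mask */
};

static enum numa_update_action
decide(bool node_mask_differs, bool matches_current, bool current_is_dfl)
{
        if (node_mask_differs)
                return matches_current ? KEEP_CURRENT : ALLOC_NEW;
        return current_is_dfl ? KEEP_CURRENT : USE_DEFAULT;
}

int main(void)
{
        /* last node-local CPU going down: fall back to the default pwq */
        printf("%d\n", decide(false, false, false));    /* USE_DEFAULT */
        /* node regains a CPU on CPU_ONLINE: allocate a node-local pwq */
        printf("%d\n", decide(true, false, true));      /* ALLOC_NEW */
        return 0;
}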
@@ -4526,6 +4763,9 @@ static void __init wq_numa_init(void)
         if (num_possible_nodes() <= 1)
                 return;
 
+        wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL);
+        BUG_ON(!wq_update_unbound_numa_attrs_buf);
+
         /*
          * We want masks of possible CPUs of each node which isn't readily
          * available.  Build one from cpu_to_node() which should have been