 kernel/workqueue.c | 278 ++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 259 insertions(+), 19 deletions(-)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index d9a4aeb844d5..57cd77de4a4f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -45,6 +45,7 @@
 #include <linux/hashtable.h>
 #include <linux/rculist.h>
 #include <linux/nodemask.h>
+#include <linux/moduleparam.h>
 
 #include "workqueue_internal.h"
 
@@ -245,6 +246,7 @@ struct workqueue_struct {
         int                     saved_max_active; /* WQ: saved pwq max_active */
 
         struct workqueue_attrs  *unbound_attrs; /* WQ: only for unbound wqs */
+        struct pool_workqueue   *dfl_pwq;       /* WQ: only for unbound wqs */
 
 #ifdef CONFIG_SYSFS
         struct wq_device        *wq_dev;        /* I: for sysfs interface */
@@ -268,6 +270,9 @@ static cpumask_var_t *wq_numa_possible_cpumask;
268 270
269static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ 271static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
270 272
273/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
274static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
275
271static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ 276static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
272static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ 277static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
273 278
@@ -3710,6 +3715,61 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
         return pwq;
 }
 
+/* undo alloc_unbound_pwq(), used only in the error path */
+static void free_unbound_pwq(struct pool_workqueue *pwq)
+{
+        lockdep_assert_held(&wq_pool_mutex);
+
+        if (pwq) {
+                put_unbound_pool(pwq->pool);
+                kfree(pwq);
+        }
+}
+
+/**
+ * wq_calc_node_cpumask - calculate a wq_attrs' cpumask for the specified node
+ * @attrs: the wq_attrs of interest
+ * @node: the target NUMA node
+ * @cpu_going_down: if >= 0, the CPU to consider as offline
+ * @cpumask: outarg, the resulting cpumask
+ *
+ * Calculate the cpumask a workqueue with @attrs should use on @node.  If
+ * @cpu_going_down is >= 0, that cpu is considered offline during
+ * calculation.  The result is stored in @cpumask.  This function returns
+ * %true if the resulting @cpumask is different from @attrs->cpumask,
+ * %false if equal.
+ *
+ * If NUMA affinity is not enabled, @attrs->cpumask is always used.  If
+ * enabled and @node has online CPUs requested by @attrs, the returned
+ * cpumask is the intersection of the possible CPUs of @node and
+ * @attrs->cpumask.
+ *
+ * The caller is responsible for ensuring that the cpumask of @node stays
+ * stable.
+ */
+static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
+                                 int cpu_going_down, cpumask_t *cpumask)
+{
+        if (!wq_numa_enabled)
+                goto use_dfl;
+
+        /* does @node have any online CPUs @attrs wants? */
+        cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask);
+        if (cpu_going_down >= 0)
+                cpumask_clear_cpu(cpu_going_down, cpumask);
+
+        if (cpumask_empty(cpumask))
+                goto use_dfl;
+
+        /* yeap, return possible CPUs in @node that @attrs wants */
+        cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
+        return !cpumask_equal(cpumask, attrs->cpumask);
+
+use_dfl:
+        cpumask_copy(cpumask, attrs->cpumask);
+        return false;
+}
+
 /* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */
 static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
                                                    int node,
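The cpumask helper added above boils down to a small mask calculation: intersect the node's CPUs with the requested mask, fall back to the full mask when NUMA handling is off or the node has no usable CPU, and report whether the node-local result differs. Below is a minimal standalone model of that decision in userspace C, using 64-bit bitmasks in place of cpumask_t; the function and parameter names are invented for illustration and are not kernel API.

/*
 * Standalone model of the wq_calc_node_cpumask() decision above, using
 * plain 64-bit bitmasks instead of cpumask_t.  Illustration only.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* returns true and writes a node-local mask when it differs from @req */
static bool calc_node_mask(uint64_t req, uint64_t node_online,
                           uint64_t node_possible, int cpu_going_down,
                           bool numa_enabled, uint64_t *out)
{
        uint64_t online = req & node_online;

        if (cpu_going_down >= 0)
                online &= ~(1ULL << cpu_going_down);

        if (!numa_enabled || !online) {
                *out = req;             /* fall back to the full cpumask */
                return false;
        }

        *out = req & node_possible;     /* possible CPUs of the node */
        return *out != req;
}

int main(void)
{
        uint64_t out;
        /* request CPUs 0-7; node owns CPUs 4-7, CPU 4 is going down */
        bool differs = calc_node_mask(0xffULL, 0xf0ULL, 0xf0ULL, 4, true, &out);

        printf("differs=%d mask=%#llx\n", differs, (unsigned long long)out);
        return 0;
}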
@@ -3732,11 +3792,12 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
  * @wq: the target workqueue
  * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
  *
- * Apply @attrs to an unbound workqueue @wq.  If @attrs doesn't match the
- * current attributes, a new pwq is created and made the first pwq which
- * will serve all new work items.  Older pwqs are released as in-flight
- * work items finish.  Note that a work item which repeatedly requeues
- * itself back-to-back will stay on its current pwq.
+ * Apply @attrs to an unbound workqueue @wq.  Unless disabled, on NUMA
+ * machines, this function maps a separate pwq to each NUMA node with
+ * possible CPUs in @attrs->cpumask so that work items are affine to the
+ * NUMA node it was issued on.  Older pwqs are released as in-flight work
+ * items finish.  Note that a work item which repeatedly requeues itself
+ * back-to-back will stay on its current pwq.
  *
  * Performs GFP_KERNEL allocations.  Returns 0 on success and -errno on
  * failure.
@@ -3744,8 +3805,8 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
 int apply_workqueue_attrs(struct workqueue_struct *wq,
                           const struct workqueue_attrs *attrs)
 {
-        struct workqueue_attrs *new_attrs;
-        struct pool_workqueue *pwq, *last_pwq = NULL;
+        struct workqueue_attrs *new_attrs, *tmp_attrs;
+        struct pool_workqueue **pwq_tbl, *dfl_pwq;
         int node, ret;
 
         /* only unbound workqueues can change attributes */
@@ -3756,40 +3817,191 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
         if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
                 return -EINVAL;
 
-        /* make a copy of @attrs and sanitize it */
+        pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL);
         new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
-        if (!new_attrs)
+        tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
+        if (!pwq_tbl || !new_attrs || !tmp_attrs)
                 goto enomem;
 
+        /* make a copy of @attrs and sanitize it */
         copy_workqueue_attrs(new_attrs, attrs);
         cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
 
+        /*
+         * We may create multiple pwqs with differing cpumasks.  Make a
+         * copy of @new_attrs which will be modified and used to obtain
+         * pools.
+         */
+        copy_workqueue_attrs(tmp_attrs, new_attrs);
+
+        /*
+         * CPUs should stay stable across pwq creations and installations.
+         * Pin CPUs, determine the target cpumask for each node and create
+         * pwqs accordingly.
+         */
+        get_online_cpus();
+
         mutex_lock(&wq_pool_mutex);
-        pwq = alloc_unbound_pwq(wq, new_attrs);
+
+        /*
+         * If something goes wrong during CPU up/down, we'll fall back to
+         * the default pwq covering whole @attrs->cpumask.  Always create
+         * it even if we don't use it immediately.
+         */
+        dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
+        if (!dfl_pwq)
+                goto enomem_pwq;
+
+        for_each_node(node) {
+                if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
+                        pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
+                        if (!pwq_tbl[node])
+                                goto enomem_pwq;
+                } else {
+                        dfl_pwq->refcnt++;
+                        pwq_tbl[node] = dfl_pwq;
+                }
+        }
+
         mutex_unlock(&wq_pool_mutex);
-        if (!pwq)
-                goto enomem;
 
+        /* all pwqs have been created successfully, let's install'em */
         mutex_lock(&wq->mutex);
 
         copy_workqueue_attrs(wq->unbound_attrs, new_attrs);
+
+        /* save the previous pwq and install the new one */
         for_each_node(node)
-                last_pwq = numa_pwq_tbl_install(wq, node, pwq);
+                pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]);
+
+        /* @dfl_pwq might not have been used, ensure it's linked */
+        link_pwq(dfl_pwq);
+        swap(wq->dfl_pwq, dfl_pwq);
 
         mutex_unlock(&wq->mutex);
 
-        put_pwq_unlocked(last_pwq);
+        /* put the old pwqs */
+        for_each_node(node)
+                put_pwq_unlocked(pwq_tbl[node]);
+        put_pwq_unlocked(dfl_pwq);
+
+        put_online_cpus();
         ret = 0;
         /* fall through */
 out_free:
+        free_workqueue_attrs(tmp_attrs);
         free_workqueue_attrs(new_attrs);
+        kfree(pwq_tbl);
         return ret;
 
+enomem_pwq:
+        free_unbound_pwq(dfl_pwq);
+        for_each_node(node)
+                if (pwq_tbl && pwq_tbl[node] != dfl_pwq)
+                        free_unbound_pwq(pwq_tbl[node]);
+        mutex_unlock(&wq_pool_mutex);
+        put_online_cpus();
 enomem:
         ret = -ENOMEM;
         goto out_free;
 }
 
+/**
+ * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
+ * @wq: the target workqueue
+ * @cpu: the CPU coming up or going down
+ * @online: whether @cpu is coming up or going down
+ *
+ * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
+ * %CPU_DOWN_FAILED.  @cpu is being hot[un]plugged, update NUMA affinity of
+ * @wq accordingly.
+ *
+ * If NUMA affinity can't be adjusted due to memory allocation failure, it
+ * falls back to @wq->dfl_pwq which may not be optimal but is always
+ * correct.
+ *
+ * Note that when the last allowed CPU of a NUMA node goes offline for a
+ * workqueue with a cpumask spanning multiple nodes, the workers which were
+ * already executing the work items for the workqueue will lose their CPU
+ * affinity and may execute on any CPU.  This is similar to how per-cpu
+ * workqueues behave on CPU_DOWN.  If a workqueue user wants strict
+ * affinity, it's the user's responsibility to flush the work item from
+ * CPU_DOWN_PREPARE.
+ */
+static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
+                                   bool online)
+{
+        int node = cpu_to_node(cpu);
+        int cpu_off = online ? -1 : cpu;
+        struct pool_workqueue *old_pwq = NULL, *pwq;
+        struct workqueue_attrs *target_attrs;
+        cpumask_t *cpumask;
+
+        lockdep_assert_held(&wq_pool_mutex);
+
+        if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND))
+                return;
+
+        /*
+         * We don't wanna alloc/free wq_attrs for each wq for each CPU.
+         * Let's use a preallocated one.  The following buf is protected by
+         * CPU hotplug exclusion.
+         */
+        target_attrs = wq_update_unbound_numa_attrs_buf;
+        cpumask = target_attrs->cpumask;
+
+        mutex_lock(&wq->mutex);
+
+        copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
+        pwq = unbound_pwq_by_node(wq, node);
+
+        /*
+         * Let's determine what needs to be done.  If the target cpumask is
+         * different from wq's, we need to compare it to @pwq's and create
+         * a new one if they don't match.  If the target cpumask equals
+         * wq's, the default pwq should be used.  If @pwq is already the
+         * default one, nothing to do; otherwise, install the default one.
+         */
+        if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
+                if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
+                        goto out_unlock;
+        } else {
+                if (pwq == wq->dfl_pwq)
+                        goto out_unlock;
+                else
+                        goto use_dfl_pwq;
+        }
+
+        mutex_unlock(&wq->mutex);
+
+        /* create a new pwq */
+        pwq = alloc_unbound_pwq(wq, target_attrs);
+        if (!pwq) {
+                pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
+                           wq->name);
+                goto out_unlock;
+        }
+
+        /*
+         * Install the new pwq.  As this function is called only from CPU
+         * hotplug callbacks and applying a new attrs is wrapped with
+         * get/put_online_cpus(), @wq->unbound_attrs couldn't have changed
+         * in between.
+         */
+        mutex_lock(&wq->mutex);
+        old_pwq = numa_pwq_tbl_install(wq, node, pwq);
+        goto out_unlock;
+
+use_dfl_pwq:
+        spin_lock_irq(&wq->dfl_pwq->pool->lock);
+        get_pwq(wq->dfl_pwq);
+        spin_unlock_irq(&wq->dfl_pwq->pool->lock);
+        old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq);
+out_unlock:
+        mutex_unlock(&wq->mutex);
+        put_pwq_unlocked(old_pwq);
+}
+
 static int alloc_and_link_pwqs(struct workqueue_struct *wq)
 {
         bool highpri = wq->flags & WQ_HIGHPRI;
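For reference, the allocation pattern used by apply_workqueue_attrs() above — one pwq slot per node, with nodes that don't need their own pwq sharing a refcounted default entry — can be modelled in a few lines of ordinary C. This is only a sketch of the bookkeeping under invented names (struct pwq, NR_NODES, node_wants_own()); it is not the kernel's data structure.

/*
 * Userspace sketch of the per-node table with a shared, refcounted
 * default entry, mirroring the loop in apply_workqueue_attrs().
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_NODES 4

struct pwq {
        int refcnt;
        int node;               /* -1 for the default entry */
};

static struct pwq *alloc_pwq(int node)
{
        struct pwq *p = calloc(1, sizeof(*p));

        if (p) {
                p->refcnt = 1;
                p->node = node;
        }
        return p;
}

/* stand-in for wq_calc_node_cpumask() returning true */
static bool node_wants_own(int node)
{
        return node % 2 == 0;   /* pretend even nodes have local CPUs */
}

int main(void)
{
        struct pwq *tbl[NR_NODES] = { NULL };
        struct pwq *dfl = alloc_pwq(-1);
        int node;

        if (!dfl)
                return 1;

        for (node = 0; node < NR_NODES; node++) {
                if (node_wants_own(node)) {
                        tbl[node] = alloc_pwq(node);
                        if (!tbl[node])
                                return 1;       /* real code unwinds here */
                } else {
                        dfl->refcnt++;          /* share the default entry */
                        tbl[node] = dfl;
                }
        }

        for (node = 0; node < NR_NODES; node++)
                printf("node %d -> %s (refcnt %d)\n", node,
                       tbl[node] == dfl ? "dfl" : "own", tbl[node]->refcnt);
        return 0;
}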
@@ -3942,6 +4154,7 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
 void destroy_workqueue(struct workqueue_struct *wq)
 {
         struct pool_workqueue *pwq;
+        int node;
 
         /* drain it before proceeding with destruction */
         drain_workqueue(wq);
@@ -3993,11 +4206,21 @@ void destroy_workqueue(struct workqueue_struct *wq)
         } else {
                 /*
                  * We're the sole accessor of @wq at this point.  Directly
-                 * access the first pwq and put the base ref.  @wq will be
-                 * freed when the last pwq is released.
+                 * access numa_pwq_tbl[] and dfl_pwq to put the base refs.
+                 * @wq will be freed when the last pwq is released.
                  */
-                pwq = list_first_entry(&wq->pwqs, struct pool_workqueue,
-                                       pwqs_node);
+                for_each_node(node) {
+                        pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
+                        RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL);
+                        put_pwq_unlocked(pwq);
+                }
+
+                /*
+                 * Put dfl_pwq.  @wq may be freed any time after dfl_pwq is
+                 * put.  Don't access it afterwards.
+                 */
+                pwq = wq->dfl_pwq;
+                wq->dfl_pwq = NULL;
                 put_pwq_unlocked(pwq);
         }
 }
@@ -4285,6 +4508,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
 {
         int cpu = (unsigned long)hcpu;
         struct worker_pool *pool;
+        struct workqueue_struct *wq;
         int pi;
 
         switch (action & ~CPU_TASKS_FROZEN) {
@@ -4317,6 +4541,10 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
                         mutex_unlock(&pool->manager_mutex);
                 }
 
+                /* update NUMA affinity of unbound workqueues */
+                list_for_each_entry(wq, &workqueues, list)
+                        wq_update_unbound_numa(wq, cpu, true);
+
                 mutex_unlock(&wq_pool_mutex);
                 break;
         }
@@ -4333,12 +4561,21 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,
 {
         int cpu = (unsigned long)hcpu;
         struct work_struct unbind_work;
+        struct workqueue_struct *wq;
 
         switch (action & ~CPU_TASKS_FROZEN) {
         case CPU_DOWN_PREPARE:
-                /* unbinding should happen on the local CPU */
+                /* unbinding per-cpu workers should happen on the local CPU */
                 INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
                 queue_work_on(cpu, system_highpri_wq, &unbind_work);
+
+                /* update NUMA affinity of unbound workqueues */
+                mutex_lock(&wq_pool_mutex);
+                list_for_each_entry(wq, &workqueues, list)
+                        wq_update_unbound_numa(wq, cpu, false);
+                mutex_unlock(&wq_pool_mutex);
+
+                /* wait for per-cpu unbinding to finish */
                 flush_work(&unbind_work);
                 break;
         }
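Both hotplug paths above funnel into wq_update_unbound_numa(), whose core is a three-way choice: keep the installed pwq, fall back to wq->dfl_pwq, or allocate a node-local pwq. The tiny model below captures only that branch structure; the enum and function names are invented for this sketch and are not kernel code.

/*
 * Model of the decision made by wq_update_unbound_numa(): which of the
 * three outcomes applies, given the recalculated node cpumask.
 */
#include <stdbool.h>
#include <stdio.h>

enum numa_update_action {
        KEEP_CURRENT,           /* installed pwq already matches */
        USE_DEFAULT,            /* node mask equals wq mask, use dfl_pwq */
        ALLOC_NEW,              /* node needs its own pwq with a new mask */
};

static enum numa_update_action
decide(bool node_mask_differs, bool matches_current, bool current_is_dfl)
{
        if (node_mask_differs)
                return matches_current ? KEEP_CURRENT : ALLOC_NEW;
        return current_is_dfl ? KEEP_CURRENT : USE_DEFAULT;
}

int main(void)
{
        /* last node-local CPU going down: fall back to the default pwq */
        printf("%d\n", decide(false, false, false));    /* USE_DEFAULT */
        /* node regains a CPU on CPU_ONLINE: allocate a node-local pwq */
        printf("%d\n", decide(true, false, true));      /* ALLOC_NEW */
        return 0;
}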
@@ -4526,6 +4763,9 @@ static void __init wq_numa_init(void)
         if (num_possible_nodes() <= 1)
                 return;
 
+        wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL);
+        BUG_ON(!wq_update_unbound_numa_attrs_buf);
+
         /*
          * We want masks of possible CPUs of each node which isn't readily
          * available.  Build one from cpu_to_node() which should have been