author		Tejun Heo <tj@kernel.org>	2013-03-19 16:45:21 -0400
committer	Tejun Heo <tj@kernel.org>	2013-03-19 16:45:21 -0400
commit		a9ab775bcadf122d91e1a201eb66ae2eec90365a (patch)
tree		98f30f2272d2ad62258744a48570c49ecfab66af /kernel
parent		bd7c089eb25b26d2e03fd34f97e5517a4463f871 (diff)
workqueue: directly restore CPU affinity of workers from CPU_ONLINE
Rebinding workers of a per-cpu pool after a CPU comes online involves a lot of back-and-forth, mostly because only the task itself could adjust CPU affinity if PF_THREAD_BOUND was set.

As CPU_ONLINE itself couldn't adjust affinity, it had to somehow coerce the workers themselves into performing set_cpus_allowed_ptr(). Due to the various states a worker can be in, this led to three different paths through which a worker may be rebound. worker->rebind_work is queued to busy workers. Idle ones are signaled by unlinking worker->entry and call idle_worker_rebind(). The manager isn't covered by either and implements its own mechanism.

PF_THREAD_BOUND has been replaced with PF_NO_SETAFFINITY and CPU_ONLINE itself can now manipulate the CPU affinity of workers. This patch replaces the existing rebind mechanism with a direct one where CPU_ONLINE iterates over all workers using for_each_pool_worker(), restores CPU affinity, and clears WORKER_UNBOUND.

There are a couple of subtleties. All bound idle workers should have their runqueues set to that of the bound CPU; however, if the target task isn't running, set_cpus_allowed_ptr() just updates the cpus_allowed mask, deferring the actual migration to when the task wakes up. This is worked around by waking up idle workers after restoring CPU affinity and before any workers can become bound.

Another subtlety stems from matching @pool->nr_running with the number of running unbound workers. While DISASSOCIATED, all workers are unbound and nr_running is zero. As workers become bound again, nr_running needs to be adjusted accordingly; however, there is no good way to tell whether a given worker is running without poking into scheduler internals. Instead of clearing UNBOUND directly, rebind_workers() replaces UNBOUND with another new NOT_RUNNING flag - REBOUND, which will later be cleared by the workers themselves while preparing for the next round of work item execution. The only change needed for the workers is clearing REBOUND along with PREP.

* This patch leaves for_each_busy_worker() without any user. Removed.

* idle_worker_rebind(), busy_worker_rebind_fn(), worker->rebind_work and rebind logic in manage_workers() removed.

* worker_thread() now looks at WORKER_DIE instead of testing whether @worker->entry is empty to determine whether it needs to do something special, as dying is the only special thing now.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com>
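For quick reference, the new direct-rebind path condenses to roughly the following (abridged from the rebind_workers() hunk further down; the long comments and one WARN_ON_ONCE are trimmed, so this is an illustrative sketch rather than the exact in-tree code):

	static void rebind_workers(struct worker_pool *pool)
	{
		struct worker *worker;
		int wi;

		lockdep_assert_held(&pool->manager_mutex);

		/* restore CPU affinity of every worker; can't fail from CPU_ONLINE */
		for_each_pool_worker(worker, wi, pool)
			WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
							  pool->attrs->cpumask) < 0);

		spin_lock_irq(&pool->lock);

		for_each_pool_worker(worker, wi, pool) {
			unsigned int worker_flags = worker->flags;

			/* kick idle workers so they actually migrate to the bound CPU */
			if (worker_flags & WORKER_IDLE)
				wake_up_process(worker->task);

			/*
			 * Swap UNBOUND for the new NOT_RUNNING flag REBOUND instead
			 * of clearing it, so nr_running stays untouched until the
			 * worker itself clears REBOUND together with PREP.
			 */
			worker_flags |= WORKER_REBOUND;
			worker_flags &= ~WORKER_UNBOUND;
			ACCESS_ONCE(worker->flags) = worker_flags;
		}

		spin_unlock_irq(&pool->lock);
	}

On the worker side the only change is that worker_thread() clears REBOUND along with PREP, i.e. worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND), which restores concurrency management on the next execution cycle.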
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/workqueue.c	192
-rw-r--r--	kernel/workqueue_internal.h	3
2 files changed, 64 insertions, 131 deletions
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 3e297c574be8..9508b5ed7336 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -75,9 +75,10 @@ enum {
 	WORKER_PREP		= 1 << 3,	/* preparing to run works */
 	WORKER_CPU_INTENSIVE	= 1 << 6,	/* cpu intensive */
 	WORKER_UNBOUND		= 1 << 7,	/* worker is unbound */
+	WORKER_REBOUND		= 1 << 8,	/* worker was rebound */
 
-	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_UNBOUND |
-				  WORKER_CPU_INTENSIVE,
+	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_CPU_INTENSIVE |
+				  WORKER_UNBOUND | WORKER_REBOUND,
 
 	NR_STD_WORKER_POOLS	= 2,		/* # standard pools per cpu */
 
@@ -316,9 +317,6 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
 	     (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
 	     (pool)++)
 
-#define for_each_busy_worker(worker, i, pool)				\
-	hash_for_each(pool->busy_hash, i, worker, hentry)
-
 /**
  * for_each_pool - iterate through all worker_pools in the system
  * @pool: iteration cursor
@@ -1612,37 +1610,6 @@ __acquires(&pool->lock)
 	}
 }
 
-/*
- * Rebind an idle @worker to its CPU.  worker_thread() will test
- * list_empty(@worker->entry) before leaving idle and call this function.
- */
-static void idle_worker_rebind(struct worker *worker)
-{
-	/* CPU may go down again inbetween, clear UNBOUND only on success */
-	if (worker_maybe_bind_and_lock(worker->pool))
-		worker_clr_flags(worker, WORKER_UNBOUND);
-
-	/* rebind complete, become available again */
-	list_add(&worker->entry, &worker->pool->idle_list);
-	spin_unlock_irq(&worker->pool->lock);
-}
-
-/*
- * Function for @worker->rebind.work used to rebind unbound busy workers to
- * the associated cpu which is coming back online.  This is scheduled by
- * cpu up but can race with other cpu hotplug operations and may be
- * executed twice without intervening cpu down.
- */
-static void busy_worker_rebind_fn(struct work_struct *work)
-{
-	struct worker *worker = container_of(work, struct worker, rebind_work);
-
-	if (worker_maybe_bind_and_lock(worker->pool))
-		worker_clr_flags(worker, WORKER_UNBOUND);
-
-	spin_unlock_irq(&worker->pool->lock);
-}
-
 static struct worker *alloc_worker(void)
 {
 	struct worker *worker;
@@ -1651,7 +1618,6 @@ static struct worker *alloc_worker(void)
 	if (worker) {
 		INIT_LIST_HEAD(&worker->entry);
 		INIT_LIST_HEAD(&worker->scheduled);
-		INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn);
 		/* on creation a worker is in !idle && prep state */
 		worker->flags = WORKER_PREP;
 	}
@@ -2053,22 +2019,6 @@ static bool manage_workers(struct worker *worker)
 	if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
 		spin_unlock_irq(&pool->lock);
 		mutex_lock(&pool->manager_mutex);
-		/*
-		 * CPU hotplug could have happened while we were waiting
-		 * for assoc_mutex.  Hotplug itself can't handle us
-		 * because manager isn't either on idle or busy list, and
-		 * @pool's state and ours could have deviated.
-		 *
-		 * As hotplug is now excluded via manager_mutex, we can
-		 * simply try to bind.  It will succeed or fail depending
-		 * on @pool's current state.  Try it and adjust
-		 * %WORKER_UNBOUND accordingly.
-		 */
-		if (worker_maybe_bind_and_lock(pool))
-			worker->flags &= ~WORKER_UNBOUND;
-		else
-			worker->flags |= WORKER_UNBOUND;
-
 		ret = true;
 	}
 
@@ -2252,19 +2202,12 @@ static int worker_thread(void *__worker)
 woke_up:
 	spin_lock_irq(&pool->lock);
 
-	/* we are off idle list if destruction or rebind is requested */
-	if (unlikely(list_empty(&worker->entry))) {
+	/* am I supposed to die? */
+	if (unlikely(worker->flags & WORKER_DIE)) {
 		spin_unlock_irq(&pool->lock);
-
-		/* if DIE is set, destruction is requested */
-		if (worker->flags & WORKER_DIE) {
-			worker->task->flags &= ~PF_WQ_WORKER;
-			return 0;
-		}
-
-		/* otherwise, rebind */
-		idle_worker_rebind(worker);
-		goto woke_up;
+		WARN_ON_ONCE(!list_empty(&worker->entry));
+		worker->task->flags &= ~PF_WQ_WORKER;
+		return 0;
 	}
 
 	worker_leave_idle(worker);
@@ -2285,11 +2228,13 @@ recheck:
 	WARN_ON_ONCE(!list_empty(&worker->scheduled));
 
 	/*
-	 * When control reaches this point, we're guaranteed to have
-	 * at least one idle worker or that someone else has already
-	 * assumed the manager role.
+	 * Finish PREP stage.  We're guaranteed to have at least one idle
+	 * worker or that someone else has already assumed the manager
+	 * role.  This is where @worker starts participating in concurrency
+	 * management if applicable and concurrency management is restored
+	 * after being rebound.  See rebind_workers() for details.
 	 */
-	worker_clr_flags(worker, WORKER_PREP);
+	worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);
 
 	do {
 		struct work_struct *work =
@@ -4076,7 +4021,7 @@ static void wq_unbind_fn(struct work_struct *work)
 	int cpu = smp_processor_id();
 	struct worker_pool *pool;
 	struct worker *worker;
-	int i;
+	int wi;
 
 	for_each_cpu_worker_pool(pool, cpu) {
 		WARN_ON_ONCE(cpu != smp_processor_id());
@@ -4091,10 +4036,7 @@ static void wq_unbind_fn(struct work_struct *work)
 		 * before the last CPU down must be on the cpu.  After
 		 * this, they may become diasporas.
 		 */
-		list_for_each_entry(worker, &pool->idle_list, entry)
-			worker->flags |= WORKER_UNBOUND;
-
-		for_each_busy_worker(worker, i, pool)
+		for_each_pool_worker(worker, wi, pool)
 			worker->flags |= WORKER_UNBOUND;
 
 		pool->flags |= POOL_DISASSOCIATED;
@@ -4129,71 +4071,64 @@ static void wq_unbind_fn(struct work_struct *work)
  * rebind_workers - rebind all workers of a pool to the associated CPU
  * @pool: pool of interest
  *
- * @pool->cpu is coming online.  Rebind all workers to the CPU.  Rebinding
- * is different for idle and busy ones.
- *
- * Idle ones will be removed from the idle_list and woken up.  They will
- * add themselves back after completing rebind.  This ensures that the
- * idle_list doesn't contain any unbound workers when re-bound busy workers
- * try to perform local wake-ups for concurrency management.
- *
- * Busy workers can rebind after they finish their current work items.
- * Queueing the rebind work item at the head of the scheduled list is
- * enough.  Note that nr_running will be properly bumped as busy workers
- * rebind.
- *
- * On return, all non-manager workers are scheduled for rebind - see
- * manage_workers() for the manager special case.  Any idle worker
- * including the manager will not appear on @idle_list until rebind is
- * complete, making local wake-ups safe.
+ * @pool->cpu is coming online.  Rebind all workers to the CPU.
  */
 static void rebind_workers(struct worker_pool *pool)
 {
-	struct worker *worker, *n;
-	int i;
+	struct worker *worker;
+	int wi;
 
 	lockdep_assert_held(&pool->manager_mutex);
-	lockdep_assert_held(&pool->lock);
-
-	/* dequeue and kick idle ones */
-	list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
-		/*
-		 * idle workers should be off @pool->idle_list until rebind
-		 * is complete to avoid receiving premature local wake-ups.
-		 */
-		list_del_init(&worker->entry);
 
-		/*
-		 * worker_thread() will see the above dequeuing and call
-		 * idle_worker_rebind().
-		 */
-		wake_up_process(worker->task);
-	}
-
-	/* rebind busy workers */
-	for_each_busy_worker(worker, i, pool) {
-		struct work_struct *rebind_work = &worker->rebind_work;
-		struct workqueue_struct *wq;
+	/*
+	 * Restore CPU affinity of all workers.  As all idle workers should
+	 * be on the run-queue of the associated CPU before any local
+	 * wake-ups for concurrency management happen, restore CPU affinty
+	 * of all workers first and then clear UNBOUND.  As we're called
+	 * from CPU_ONLINE, the following shouldn't fail.
+	 */
+	for_each_pool_worker(worker, wi, pool)
+		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
+						  pool->attrs->cpumask) < 0);
 
-		if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
-				     work_data_bits(rebind_work)))
-			continue;
+	spin_lock_irq(&pool->lock);
 
-		debug_work_activate(rebind_work);
+	for_each_pool_worker(worker, wi, pool) {
+		unsigned int worker_flags = worker->flags;
 
 		/*
-		 * wq doesn't really matter but let's keep @worker->pool
-		 * and @pwq->pool consistent for sanity.
+		 * A bound idle worker should actually be on the runqueue
+		 * of the associated CPU for local wake-ups targeting it to
+		 * work.  Kick all idle workers so that they migrate to the
+		 * associated CPU.  Doing this in the same loop as
+		 * replacing UNBOUND with REBOUND is safe as no worker will
+		 * be bound before @pool->lock is released.
 		 */
-		if (worker->pool->attrs->nice < 0)
-			wq = system_highpri_wq;
-		else
-			wq = system_wq;
+		if (worker_flags & WORKER_IDLE)
+			wake_up_process(worker->task);
 
-		insert_work(per_cpu_ptr(wq->cpu_pwqs, pool->cpu), rebind_work,
-			    worker->scheduled.next,
-			    work_color_to_flags(WORK_NO_COLOR));
+		/*
+		 * We want to clear UNBOUND but can't directly call
+		 * worker_clr_flags() or adjust nr_running.  Atomically
+		 * replace UNBOUND with another NOT_RUNNING flag REBOUND.
+		 * @worker will clear REBOUND using worker_clr_flags() when
+		 * it initiates the next execution cycle thus restoring
+		 * concurrency management.  Note that when or whether
+		 * @worker clears REBOUND doesn't affect correctness.
+		 *
+		 * ACCESS_ONCE() is necessary because @worker->flags may be
+		 * tested without holding any lock in
+		 * wq_worker_waking_up().  Without it, NOT_RUNNING test may
+		 * fail incorrectly leading to premature concurrency
+		 * management operations.
+		 */
+		WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND));
+		worker_flags |= WORKER_REBOUND;
+		worker_flags &= ~WORKER_UNBOUND;
+		ACCESS_ONCE(worker->flags) = worker_flags;
 	}
+
+	spin_unlock_irq(&pool->lock);
 }
 
 /*
@@ -4221,12 +4156,13 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
 	case CPU_ONLINE:
 		for_each_cpu_worker_pool(pool, cpu) {
 			mutex_lock(&pool->manager_mutex);
-			spin_lock_irq(&pool->lock);
 
+			spin_lock_irq(&pool->lock);
 			pool->flags &= ~POOL_DISASSOCIATED;
+			spin_unlock_irq(&pool->lock);
+
 			rebind_workers(pool);
 
-			spin_unlock_irq(&pool->lock);
 			mutex_unlock(&pool->manager_mutex);
 		}
 		break;
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index f116f071d919..84ab6e1dc6fb 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -38,9 +38,6 @@ struct worker {
 	unsigned int		flags;		/* X: flags */
 	int			id;		/* I: worker id */
 
-	/* for rebinding worker to CPU */
-	struct work_struct	rebind_work;	/* L: for busy worker */
-
 	/* used only by rescuers to point to the target workqueue */
 	struct workqueue_struct	*rescue_wq;	/* I: the workqueue to rescue */
 };