author		Tejun Heo <tj@kernel.org>	2012-07-17 15:39:27 -0400
committer	Tejun Heo <tj@kernel.org>	2012-07-17 15:39:27 -0400
commit		628c78e7ea19d5b70d2b6a59030362168cdbe1ad (patch)
tree		7867a9f82aae3d31c40356f32ae24223ae0ddf0c /kernel/workqueue.c
parent		3ce63377305b694f53e7dd0c72907591c5344224 (diff)
workqueue: remove CPU offline trustee
With the previous changes, a disassociated global_cwq can now run as an unbound one on its own - it can create workers as necessary to drain remaining works after the CPU has been brought down, and it manages the number of workers using the usual idle timer mechanism, making the trustee completely redundant except for the actual unbinding operation.

This patch removes the trustee and lets a disassociated global_cwq manage itself. Unbinding is moved to a work item (for CPU affinity) which is scheduled and flushed from CPU_DOWN_PREPARE.

This patch also moves nr_running clearing outside the gcwq and manager locks to simplify the code. As nr_running is unused at that point, this is safe.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: "Rafael J. Wysocki" <rjw@sisk.pl>
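The work-item-based unbinding relies only on standard workqueue primitives that appear in the hunks below: INIT_WORK_ONSTACK() to set up a work item on the stack, schedule_work_on() to queue it on a specific CPU so the callback runs there, and flush_work() to wait for it to finish. A minimal sketch of the pattern, assuming a hypothetical callback - everything except the workqueue API itself is illustrative:

	#include <linux/workqueue.h>
	#include <linux/smp.h>
	#include <linux/printk.h>

	/* Hypothetical callback; a per-cpu kworker bound to @cpu executes it. */
	static void run_here_fn(struct work_struct *work)
	{
		pr_info("work ran on CPU %d\n", smp_processor_id());
	}

	/* Queue a work item on @cpu and wait until it has executed there. */
	static void run_on_cpu(int cpu)
	{
		struct work_struct w;

		/* On-stack work items must use INIT_WORK_ONSTACK(). */
		INIT_WORK_ONSTACK(&w, run_here_fn);
		schedule_work_on(cpu, &w);	/* run on @cpu, not locally */
		flush_work(&w);			/* wait for completion */
	}

Scheduling from CPU_DOWN_PREPARE and flushing immediately guarantees the unbind callback runs on the dying CPU before it goes away, without needing a dedicated kthread.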
Diffstat (limited to 'kernel/workqueue.c')
-rw-r--r--	kernel/workqueue.c	288
1 file changed, 36 insertions(+), 252 deletions(-)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index acfabb22e2c4..d1545daa74ad 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -79,13 +79,6 @@ enum {
 	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND |
 				  WORKER_CPU_INTENSIVE,
 
-	/* gcwq->trustee_state */
-	TRUSTEE_START		= 0,		/* start */
-	TRUSTEE_IN_CHARGE	= 1,		/* trustee in charge of gcwq */
-	TRUSTEE_BUTCHER		= 2,		/* butcher workers */
-	TRUSTEE_RELEASE		= 3,		/* release workers */
-	TRUSTEE_DONE		= 4,		/* trustee is done */
-
 	NR_WORKER_POOLS		= 2,		/* # worker pools per gcwq */
 
 	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */
@@ -100,7 +93,6 @@ enum {
 						(min two ticks) */
 	MAYDAY_INTERVAL		= HZ / 10,	/* and then every 100ms */
 	CREATE_COOLDOWN		= HZ,		/* time to breath after fail */
-	TRUSTEE_COOLDOWN	= HZ / 10,	/* for trustee draining */
 
 	/*
 	 * Rescue workers are used only on emergencies and shared by
@@ -194,10 +186,6 @@ struct global_cwq {
 	struct worker_pool	pools[2];	/* normal and highpri pools */
 
 	wait_queue_head_t	rebind_hold;	/* rebind hold wait */
-
-	struct task_struct	*trustee;	/* L: for gcwq shutdown */
-	unsigned int		trustee_state;	/* L: trustee state */
-	wait_queue_head_t	trustee_wait;	/* trustee wait */
 } ____cacheline_aligned_in_smp;
 
 /*
@@ -753,11 +741,11 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
 	 * worklist not empty test sequence is in insert_work().
 	 * Please read comment there.
 	 *
-	 * NOT_RUNNING is clear.  This means that trustee is not in
-	 * charge and we're running on the local cpu w/ rq lock held
-	 * and preemption disabled, which in turn means that none else
-	 * could be manipulating idle_list, so dereferencing idle_list
-	 * without gcwq lock is safe.
+	 * NOT_RUNNING is clear.  This means that we're bound to and
+	 * running on the local cpu w/ rq lock held and preemption
+	 * disabled, which in turn means that none else could be
+	 * manipulating idle_list, so dereferencing idle_list without gcwq
+	 * lock is safe.
 	 */
 	if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist))
 		to_wakeup = first_worker(pool);
@@ -1217,19 +1205,16 @@ static void worker_enter_idle(struct worker *worker)
 	/* idle_list is LIFO */
 	list_add(&worker->entry, &pool->idle_list);
 
-	if (likely(gcwq->trustee_state != TRUSTEE_DONE)) {
-		if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
-			mod_timer(&pool->idle_timer,
-				  jiffies + IDLE_WORKER_TIMEOUT);
-	} else
-		wake_up_all(&gcwq->trustee_wait);
+	if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
+		mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
 
 	/*
-	 * Sanity check nr_running.  Because trustee releases gcwq->lock
-	 * between setting %WORKER_UNBOUND and zapping nr_running, the
-	 * warning may trigger spuriously.  Check iff trustee is idle.
+	 * Sanity check nr_running.  Because gcwq_unbind_fn() releases
+	 * gcwq->lock between setting %WORKER_UNBOUND and zapping
+	 * nr_running, the warning may trigger spuriously.  Check iff
+	 * unbind is not in progress.
 	 */
-	WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE &&
+	WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) &&
 		     pool->nr_workers == pool->nr_idle &&
 		     atomic_read(get_pool_nr_running(pool)));
 }
@@ -3367,46 +3352,9 @@ EXPORT_SYMBOL_GPL(work_busy);
  * gcwqs serve mix of short, long and very long running works making
  * blocked draining impractical.
  *
- * This is solved by allowing a gcwq to be detached from CPU, running it
- * with unbound workers and allowing it to be reattached later if the cpu
- * comes back online.  A separate thread is created to govern a gcwq in
- * such state and is called the trustee of the gcwq.
- *
- * Trustee states and their descriptions.
- *
- * START	Command state used on startup.  On CPU_DOWN_PREPARE, a
- *		new trustee is started with this state.
- *
- * IN_CHARGE	Once started, trustee will enter this state after
- *		assuming the manager role and making all existing
- *		workers rogue.  DOWN_PREPARE waits for trustee to
- *		enter this state.  After reaching IN_CHARGE, trustee
- *		tries to execute the pending worklist until it's empty
- *		and the state is set to BUTCHER, or the state is set
- *		to RELEASE.
- *
- * BUTCHER	Command state which is set by the cpu callback after
- *		the cpu has went down.  Once this state is set trustee
- *		knows that there will be no new works on the worklist
- *		and once the worklist is empty it can proceed to
- *		killing idle workers.
- *
- * RELEASE	Command state which is set by the cpu callback if the
- *		cpu down has been canceled or it has come online
- *		again.  After recognizing this state, trustee stops
- *		trying to drain or butcher and clears ROGUE, rebinds
- *		all remaining workers back to the cpu and releases
- *		manager role.
- *
- * DONE		Trustee will enter this state after BUTCHER or RELEASE
- *		is complete.
- *
- *          trustee                 CPU                draining
- *          took over                down               complete
- * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
- *                        |                     |                  ^
- *                        | CPU is back online  v   return workers |
- *                         ----------------> RELEASE --------------
+ * This is solved by allowing a gcwq to be disassociated from the CPU
+ * running as an unbound one and allowing it to be reattached later if the
+ * cpu comes back online.
  */
 
 /* claim manager positions of all pools */
@@ -3427,61 +3375,11 @@ static void gcwq_release_management(struct global_cwq *gcwq)
 		mutex_unlock(&pool->manager_mutex);
 }
 
-/**
- * trustee_wait_event_timeout - timed event wait for trustee
- * @cond: condition to wait for
- * @timeout: timeout in jiffies
- *
- * wait_event_timeout() for trustee to use.  Handles locking and
- * checks for RELEASE request.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
- * multiple times.  To be used by trustee.
- *
- * RETURNS:
- * Positive indicating left time if @cond is satisfied, 0 if timed
- * out, -1 if canceled.
- */
-#define trustee_wait_event_timeout(cond, timeout) ({			\
-	long __ret = (timeout);						\
-	while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) &&	\
-	       __ret) {							\
-		spin_unlock_irq(&gcwq->lock);				\
-		__wait_event_timeout(gcwq->trustee_wait, (cond) ||	\
-			(gcwq->trustee_state == TRUSTEE_RELEASE),	\
-			__ret);						\
-		spin_lock_irq(&gcwq->lock);				\
-	}								\
-	gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret);		\
-})
-
-/**
- * trustee_wait_event - event wait for trustee
- * @cond: condition to wait for
- *
- * wait_event() for trustee to use.  Automatically handles locking and
- * checks for CANCEL request.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
- * multiple times.  To be used by trustee.
- *
- * RETURNS:
- * 0 if @cond is satisfied, -1 if canceled.
- */
-#define trustee_wait_event(cond) ({					\
-	long __ret1;							\
-	__ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
-	__ret1 < 0 ? -1 : 0;						\
-})
-
-static int __cpuinit trustee_thread(void *__gcwq)
+static void gcwq_unbind_fn(struct work_struct *work)
 {
-	struct global_cwq *gcwq = __gcwq;
+	struct global_cwq *gcwq = get_gcwq(smp_processor_id());
 	struct worker_pool *pool;
 	struct worker *worker;
-	struct work_struct *work;
 	struct hlist_node *pos;
 	int i;
 
@@ -3505,119 +3403,29 @@ static int __cpuinit trustee_thread(void *__gcwq)
 
 	gcwq->flags |= GCWQ_DISASSOCIATED;
 
+	spin_unlock_irq(&gcwq->lock);
+	gcwq_release_management(gcwq);
+
 	/*
 	 * Call schedule() so that we cross rq->lock and thus can guarantee
-	 * sched callbacks see the unbound flag.  This is necessary as
-	 * scheduler callbacks may be invoked from other cpus.
+	 * sched callbacks see the %WORKER_UNBOUND flag.  This is necessary
+	 * as scheduler callbacks may be invoked from other cpus.
 	 */
-	spin_unlock_irq(&gcwq->lock);
 	schedule();
-	spin_lock_irq(&gcwq->lock);
 
 	/*
-	 * Sched callbacks are disabled now.  Zap nr_running.  After
-	 * this, nr_running stays zero and need_more_worker() and
-	 * keep_working() are always true as long as the worklist is
-	 * not empty.
+	 * Sched callbacks are disabled now.  Zap nr_running.  After this,
+	 * nr_running stays zero and need_more_worker() and keep_working()
+	 * are always true as long as the worklist is not empty.  @gcwq now
+	 * behaves as unbound (in terms of concurrency management) gcwq
+	 * which is served by workers tied to the CPU.
+	 *
+	 * On return from this function, the current worker would trigger
+	 * unbound chain execution of pending work items if other workers
+	 * didn't already.
 	 */
 	for_each_worker_pool(pool, gcwq)
 		atomic_set(get_pool_nr_running(pool), 0);
-
-	spin_unlock_irq(&gcwq->lock);
-	for_each_worker_pool(pool, gcwq)
-		del_timer_sync(&pool->idle_timer);
-	spin_lock_irq(&gcwq->lock);
-
-	/*
-	 * We're now in charge.  Notify and proceed to drain.  We need
-	 * to keep the gcwq running during the whole CPU down
-	 * procedure as other cpu hotunplug callbacks may need to
-	 * flush currently running tasks.
-	 */
-	gcwq->trustee_state = TRUSTEE_IN_CHARGE;
-	wake_up_all(&gcwq->trustee_wait);
-
-	/*
-	 * The original cpu is in the process of dying and may go away
-	 * anytime now.  When that happens, we and all workers would
-	 * be migrated to other cpus.  Try draining any left work.  We
-	 * want to get it over with ASAP - spam rescuers, wake up as
-	 * many idlers as necessary and create new ones till the
-	 * worklist is empty.  Note that if the gcwq is frozen, there
-	 * may be frozen works in freezable cwqs.  Don't declare
-	 * completion while frozen.
-	 */
-	while (true) {
-		bool busy = false;
-
-		for_each_worker_pool(pool, gcwq)
-			busy |= pool->nr_workers != pool->nr_idle;
-
-		if (!busy && !(gcwq->flags & GCWQ_FREEZING) &&
-		    gcwq->trustee_state != TRUSTEE_IN_CHARGE)
-			break;
-
-		for_each_worker_pool(pool, gcwq) {
-			int nr_works = 0;
-
-			list_for_each_entry(work, &pool->worklist, entry) {
-				send_mayday(work);
-				nr_works++;
-			}
-
-			list_for_each_entry(worker, &pool->idle_list, entry) {
-				if (!nr_works--)
-					break;
-				wake_up_process(worker->task);
-			}
-
-			if (need_to_create_worker(pool)) {
-				spin_unlock_irq(&gcwq->lock);
-				worker = create_worker(pool);
-				spin_lock_irq(&gcwq->lock);
-				if (worker)
-					start_worker(worker);
-			}
-		}
-
-		/* give a breather */
-		if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
-			break;
-	}
-
-	gcwq_release_management(gcwq);
-
-	/* notify completion */
-	gcwq->trustee = NULL;
-	gcwq->trustee_state = TRUSTEE_DONE;
-	wake_up_all(&gcwq->trustee_wait);
-	spin_unlock_irq(&gcwq->lock);
-	return 0;
-}
-
-/**
- * wait_trustee_state - wait for trustee to enter the specified state
- * @gcwq: gcwq the trustee of interest belongs to
- * @state: target state to wait for
- *
- * Wait for the trustee to reach @state.  DONE is already matched.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
- * multiple times.  To be used by cpu_callback.
- */
-static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
-__releases(&gcwq->lock)
-__acquires(&gcwq->lock)
-{
-	if (!(gcwq->trustee_state == state ||
-	      gcwq->trustee_state == TRUSTEE_DONE)) {
-		spin_unlock_irq(&gcwq->lock);
-		__wait_event(gcwq->trustee_wait,
-			     gcwq->trustee_state == state ||
-			     gcwq->trustee_state == TRUSTEE_DONE);
-		spin_lock_irq(&gcwq->lock);
-	}
 }
 
 static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
@@ -3626,19 +3434,18 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 {
 	unsigned int cpu = (unsigned long)hcpu;
 	struct global_cwq *gcwq = get_gcwq(cpu);
-	struct task_struct *new_trustee = NULL;
 	struct worker_pool *pool;
+	struct work_struct unbind_work;
 	unsigned long flags;
 
 	action &= ~CPU_TASKS_FROZEN;
 
 	switch (action) {
 	case CPU_DOWN_PREPARE:
-		new_trustee = kthread_create(trustee_thread, gcwq,
-					     "workqueue_trustee/%d\n", cpu);
-		if (IS_ERR(new_trustee))
-			return notifier_from_errno(PTR_ERR(new_trustee));
-		kthread_bind(new_trustee, cpu);
+		/* unbinding should happen on the local CPU */
+		INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn);
+		schedule_work_on(cpu, &unbind_work);
+		flush_work(&unbind_work);
 		break;
 
 	case CPU_UP_PREPARE:
@@ -3662,27 +3469,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 	spin_lock_irqsave(&gcwq->lock, flags);
 
 	switch (action) {
-	case CPU_DOWN_PREPARE:
-		/* initialize trustee and tell it to acquire the gcwq */
-		BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
-		gcwq->trustee = new_trustee;
-		gcwq->trustee_state = TRUSTEE_START;
-		wake_up_process(gcwq->trustee);
-		wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
-		break;
-
-	case CPU_POST_DEAD:
-		gcwq->trustee_state = TRUSTEE_BUTCHER;
-		break;
-
 	case CPU_DOWN_FAILED:
 	case CPU_ONLINE:
-		if (gcwq->trustee_state != TRUSTEE_DONE) {
-			gcwq->trustee_state = TRUSTEE_RELEASE;
-			wake_up_process(gcwq->trustee);
-			wait_trustee_state(gcwq, TRUSTEE_DONE);
-		}
-
 		spin_unlock_irq(&gcwq->lock);
 		gcwq_claim_management(gcwq);
 		spin_lock_irq(&gcwq->lock);
@@ -3727,7 +3515,6 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
 {
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_DOWN_PREPARE:
-	case CPU_POST_DEAD:
 		return workqueue_cpu_callback(nfb, action, hcpu);
 	}
 	return NOTIFY_OK;
@@ -3960,9 +3747,6 @@ static int __init init_workqueues(void)
 		}
 
 		init_waitqueue_head(&gcwq->rebind_hold);
-
-		gcwq->trustee_state = TRUSTEE_DONE;
-		init_waitqueue_head(&gcwq->trustee_wait);
 	}
 
 	/* create the initial worker */