 kernel/workqueue.c | 288 +++++-----------------------------------------
 1 file changed, 36 insertions(+), 252 deletions(-)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index acfabb22e2c4..d1545daa74ad 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -79,13 +79,6 @@ enum {
 	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND |
 				  WORKER_CPU_INTENSIVE,
 
-	/* gcwq->trustee_state */
-	TRUSTEE_START		= 0,		/* start */
-	TRUSTEE_IN_CHARGE	= 1,		/* trustee in charge of gcwq */
-	TRUSTEE_BUTCHER		= 2,		/* butcher workers */
-	TRUSTEE_RELEASE		= 3,		/* release workers */
-	TRUSTEE_DONE		= 4,		/* trustee is done */
-
 	NR_WORKER_POOLS		= 2,		/* # worker pools per gcwq */
 
 	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */
@@ -100,7 +93,6 @@ enum {
 						  (min two ticks) */
 	MAYDAY_INTERVAL		= HZ / 10,	/* and then every 100ms */
 	CREATE_COOLDOWN		= HZ,		/* time to breath after fail */
-	TRUSTEE_COOLDOWN	= HZ / 10,	/* for trustee draining */
 
 	/*
 	 * Rescue workers are used only on emergencies and shared by
@@ -194,10 +186,6 @@ struct global_cwq {
 	struct worker_pool	pools[2];	/* normal and highpri pools */
 
 	wait_queue_head_t	rebind_hold;	/* rebind hold wait */
-
-	struct task_struct	*trustee;	/* L: for gcwq shutdown */
-	unsigned int		trustee_state;	/* L: trustee state */
-	wait_queue_head_t	trustee_wait;	/* trustee wait */
 } ____cacheline_aligned_in_smp;
 
 /*
@@ -753,11 +741,11 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
 	 * worklist not empty test sequence is in insert_work().
 	 * Please read comment there.
 	 *
-	 * NOT_RUNNING is clear.  This means that trustee is not in
-	 * charge and we're running on the local cpu w/ rq lock held
-	 * and preemption disabled, which in turn means that none else
-	 * could be manipulating idle_list, so dereferencing idle_list
-	 * without gcwq lock is safe.
+	 * NOT_RUNNING is clear.  This means that we're bound to and
+	 * running on the local cpu w/ rq lock held and preemption
+	 * disabled, which in turn means that none else could be
+	 * manipulating idle_list, so dereferencing idle_list without gcwq
+	 * lock is safe.
 	 */
 	if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist))
 		to_wakeup = first_worker(pool);
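
For context, the comment rewritten above guards the lockless idle_list
dereference at the tail of wq_worker_sleeping().  A minimal sketch of that
callback, reconstructed from the 3.5-era internals this diff touches (helper
names such as get_pool_nr_running() and first_worker() are taken from
elsewhere in this file, not from this hunk):

	/* sketch, not a verbatim quote of kernel/workqueue.c */
	struct task_struct *wq_worker_sleeping(struct task_struct *task,
					       unsigned int cpu)
	{
		struct worker *worker = kthread_data(task), *to_wakeup = NULL;
		struct worker_pool *pool = worker->pool;
		atomic_t *nr_running = get_pool_nr_running(pool);

		/* workers with NOT_RUNNING set don't take part in
		 * concurrency management */
		if (worker->flags & WORKER_NOT_RUNNING)
			return NULL;

		/* this can only happen on the local cpu */
		BUG_ON(cpu != raw_smp_processor_id());

		/*
		 * Last running worker going to sleep: wake up an idle
		 * worker, if any, so the worklist keeps being processed.
		 */
		if (atomic_dec_and_test(nr_running) &&
		    !list_empty(&pool->worklist))
			to_wakeup = first_worker(pool);

		return to_wakeup ? to_wakeup->task : NULL;
	}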
@@ -1217,19 +1205,16 @@ static void worker_enter_idle(struct worker *worker)
 	/* idle_list is LIFO */
 	list_add(&worker->entry, &pool->idle_list);
 
-	if (likely(gcwq->trustee_state != TRUSTEE_DONE)) {
-		if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
-			mod_timer(&pool->idle_timer,
-				  jiffies + IDLE_WORKER_TIMEOUT);
-	} else
-		wake_up_all(&gcwq->trustee_wait);
+	if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
+		mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
 
 	/*
-	 * Sanity check nr_running.  Because trustee releases gcwq->lock
-	 * between setting %WORKER_UNBOUND and zapping nr_running, the
-	 * warning may trigger spuriously.  Check iff trustee is idle.
+	 * Sanity check nr_running.  Because gcwq_unbind_fn() releases
+	 * gcwq->lock between setting %WORKER_UNBOUND and zapping
+	 * nr_running, the warning may trigger spuriously.  Check iff
+	 * unbind is not in progress.
 	 */
-	WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE &&
+	WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) &&
 		     pool->nr_workers == pool->nr_idle &&
 		     atomic_read(get_pool_nr_running(pool)));
 }
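
The idle-timer arming kept by this hunk hinges on too_many_workers(); a
plausible shape of that check, assuming the era's "a few idle workers plus
one per several busy ones" heuristic (the exact constants here are an
assumption, not quoted from this commit):

	/* assumed heuristic: tolerate 2 idle workers plus 1 per 4 busy */
	static bool too_many_workers(struct worker_pool *pool)
	{
		bool managing = mutex_is_locked(&pool->manager_mutex);
		int nr_idle = pool->nr_idle + managing; /* manager counts as idle */
		int nr_busy = pool->nr_workers - nr_idle;

		return nr_idle > 2 &&
			(nr_idle - 2) * MAX_IDLE_WORKERS_RATIO < nr_busy;
	}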
@@ -3367,46 +3352,9 @@ EXPORT_SYMBOL_GPL(work_busy);
  * gcwqs serve mix of short, long and very long running works making
  * blocked draining impractical.
  *
- * This is solved by allowing a gcwq to be detached from CPU, running it
- * with unbound workers and allowing it to be reattached later if the cpu
- * comes back online.  A separate thread is created to govern a gcwq in
- * such state and is called the trustee of the gcwq.
- *
- * Trustee states and their descriptions.
- *
- * START	Command state used on startup.  On CPU_DOWN_PREPARE, a
- *		new trustee is started with this state.
- *
- * IN_CHARGE	Once started, trustee will enter this state after
- *		assuming the manager role and making all existing
- *		workers rogue.  DOWN_PREPARE waits for trustee to
- *		enter this state.  After reaching IN_CHARGE, trustee
- *		tries to execute the pending worklist until it's empty
- *		and the state is set to BUTCHER, or the state is set
- *		to RELEASE.
- *
- * BUTCHER	Command state which is set by the cpu callback after
- *		the cpu has went down.  Once this state is set trustee
- *		knows that there will be no new works on the worklist
- *		and once the worklist is empty it can proceed to
- *		killing idle workers.
- *
- * RELEASE	Command state which is set by the cpu callback if the
- *		cpu down has been canceled or it has come online
- *		again.  After recognizing this state, trustee stops
- *		trying to drain or butcher and clears ROGUE, rebinds
- *		all remaining workers back to the cpu and releases
- *		manager role.
- *
- * DONE		Trustee will enter this state after BUTCHER or RELEASE
- *		is complete.
- *
- *          trustee                 CPU                draining
- *          took over               down               complete
- * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
- *                        |                     |                  ^
- *                        | CPU is back online  v   return workers |
- *                         ----------------> RELEASE --------------
+ * This is solved by allowing a gcwq to be disassociated from the CPU
+ * running as an unbound one and allowing it to be reattached later if the
+ * cpu comes back online.
  */
 
 /* claim manager positions of all pools */
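
The next hunk opens inside gcwq_release_management(); its counterpart
gcwq_claim_management(), called from the CPU_DOWN_FAILED/CPU_ONLINE path
further down, presumably looks like the sketch below (the
mutex_lock_nested() subclass trick is an assumption, not part of this diff):

	/* claim manager positions of all pools, always in the same order */
	static void gcwq_claim_management(struct global_cwq *gcwq)
	{
		struct worker_pool *pool;

		for_each_worker_pool(pool, gcwq)
			mutex_lock_nested(&pool->manager_mutex,
					  pool - gcwq->pools);
	}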
@@ -3427,61 +3375,11 @@ static void gcwq_release_management(struct global_cwq *gcwq)
 	mutex_unlock(&pool->manager_mutex);
 }
 
-/**
- * trustee_wait_event_timeout - timed event wait for trustee
- * @cond: condition to wait for
- * @timeout: timeout in jiffies
- *
- * wait_event_timeout() for trustee to use.  Handles locking and
- * checks for RELEASE request.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
- * multiple times.  To be used by trustee.
- *
- * RETURNS:
- * Positive indicating left time if @cond is satisfied, 0 if timed
- * out, -1 if canceled.
- */
-#define trustee_wait_event_timeout(cond, timeout) ({			\
-	long __ret = (timeout);						\
-	while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) &&	\
-	       __ret) {							\
-		spin_unlock_irq(&gcwq->lock);				\
-		__wait_event_timeout(gcwq->trustee_wait, (cond) ||	\
-			(gcwq->trustee_state == TRUSTEE_RELEASE),	\
-			__ret);						\
-		spin_lock_irq(&gcwq->lock);				\
-	}								\
-	gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret);		\
-})
-
-/**
- * trustee_wait_event - event wait for trustee
- * @cond: condition to wait for
- *
- * wait_event() for trustee to use.  Automatically handles locking and
- * checks for CANCEL request.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
- * multiple times.  To be used by trustee.
- *
- * RETURNS:
- * 0 if @cond is satisfied, -1 if canceled.
- */
-#define trustee_wait_event(cond) ({					\
-	long __ret1;							\
-	__ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
-	__ret1 < 0 ? -1 : 0;						\
-})
-
-static int __cpuinit trustee_thread(void *__gcwq)
+static void gcwq_unbind_fn(struct work_struct *work)
 {
-	struct global_cwq *gcwq = __gcwq;
+	struct global_cwq *gcwq = get_gcwq(smp_processor_id());
 	struct worker_pool *pool;
 	struct worker *worker;
-	struct work_struct *work;
 	struct hlist_node *pos;
 	int i;
 
@@ -3505,119 +3403,29 @@ static int __cpuinit trustee_thread(void *__gcwq)
 
 	gcwq->flags |= GCWQ_DISASSOCIATED;
 
+	spin_unlock_irq(&gcwq->lock);
+	gcwq_release_management(gcwq);
+
 	/*
-	 * Call schedule() so that we cross rq->lock and thus can guarantee
-	 * sched callbacks see the unbound flag.  This is necessary as
-	 * scheduler callbacks may be invoked from other cpus.
+	 * Call schedule() so that we cross rq->lock and thus can guarantee
+	 * sched callbacks see the %WORKER_UNBOUND flag.  This is necessary
+	 * as scheduler callbacks may be invoked from other cpus.
 	 */
-	spin_unlock_irq(&gcwq->lock);
 	schedule();
-	spin_lock_irq(&gcwq->lock);
 
 	/*
-	 * Sched callbacks are disabled now.  Zap nr_running.  After
-	 * this, nr_running stays zero and need_more_worker() and
-	 * keep_working() are always true as long as the worklist is
-	 * not empty.
+	 * Sched callbacks are disabled now.  Zap nr_running.  After this,
+	 * nr_running stays zero and need_more_worker() and keep_working()
+	 * are always true as long as the worklist is not empty.  @gcwq now
+	 * behaves as unbound (in terms of concurrency management) gcwq
+	 * which is served by workers tied to the CPU.
+	 *
+	 * On return from this function, the current worker would trigger
+	 * unbound chain execution of pending work items if other workers
+	 * didn't already.
 	 */
 	for_each_worker_pool(pool, gcwq)
 		atomic_set(get_pool_nr_running(pool), 0);
-
-	spin_unlock_irq(&gcwq->lock);
-	for_each_worker_pool(pool, gcwq)
-		del_timer_sync(&pool->idle_timer);
-	spin_lock_irq(&gcwq->lock);
-
-	/*
-	 * We're now in charge.  Notify and proceed to drain.  We need
-	 * to keep the gcwq running during the whole CPU down
-	 * procedure as other cpu hotunplug callbacks may need to
-	 * flush currently running tasks.
-	 */
-	gcwq->trustee_state = TRUSTEE_IN_CHARGE;
-	wake_up_all(&gcwq->trustee_wait);
-
-	/*
-	 * The original cpu is in the process of dying and may go away
-	 * anytime now.  When that happens, we and all workers would
-	 * be migrated to other cpus.  Try draining any left work.  We
-	 * want to get it over with ASAP - spam rescuers, wake up as
-	 * many idlers as necessary and create new ones till the
-	 * worklist is empty.  Note that if the gcwq is frozen, there
-	 * may be frozen works in freezable cwqs.  Don't declare
-	 * completion while frozen.
-	 */
-	while (true) {
-		bool busy = false;
-
-		for_each_worker_pool(pool, gcwq)
-			busy |= pool->nr_workers != pool->nr_idle;
-
-		if (!busy && !(gcwq->flags & GCWQ_FREEZING) &&
-		    gcwq->trustee_state != TRUSTEE_IN_CHARGE)
-			break;
-
-		for_each_worker_pool(pool, gcwq) {
-			int nr_works = 0;
-
-			list_for_each_entry(work, &pool->worklist, entry) {
-				send_mayday(work);
-				nr_works++;
-			}
-
-			list_for_each_entry(worker, &pool->idle_list, entry) {
-				if (!nr_works--)
-					break;
-				wake_up_process(worker->task);
-			}
-
-			if (need_to_create_worker(pool)) {
-				spin_unlock_irq(&gcwq->lock);
-				worker = create_worker(pool);
-				spin_lock_irq(&gcwq->lock);
-				if (worker)
-					start_worker(worker);
-			}
-		}
-
-		/* give a breather */
-		if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
-			break;
-	}
-
-	gcwq_release_management(gcwq);
-
-	/* notify completion */
-	gcwq->trustee = NULL;
-	gcwq->trustee_state = TRUSTEE_DONE;
-	wake_up_all(&gcwq->trustee_wait);
-	spin_unlock_irq(&gcwq->lock);
-	return 0;
-}
-
-/**
- * wait_trustee_state - wait for trustee to enter the specified state
- * @gcwq: gcwq the trustee of interest belongs to
- * @state: target state to wait for
- *
- * Wait for the trustee to reach @state.  DONE is already matched.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
- * multiple times.  To be used by cpu_callback.
- */
-static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
-__releases(&gcwq->lock)
-__acquires(&gcwq->lock)
-{
-	if (!(gcwq->trustee_state == state ||
-	      gcwq->trustee_state == TRUSTEE_DONE)) {
-		spin_unlock_irq(&gcwq->lock);
-		__wait_event(gcwq->trustee_wait,
-			     gcwq->trustee_state == state ||
-			     gcwq->trustee_state == TRUSTEE_DONE);
-		spin_lock_irq(&gcwq->lock);
-	}
 }
 
 static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
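
The new comment in gcwq_unbind_fn() leans on need_more_worker() and
keep_working() testing nr_running.  A sketch of those helpers as they appear
elsewhere in this file (reproduced here for context, not part of this diff):

	static bool __need_more_worker(struct worker_pool *pool)
	{
		return !atomic_read(get_pool_nr_running(pool));
	}

	/*
	 * With nr_running zapped to 0 these stay true whenever the worklist
	 * is non-empty, which is what makes pending work items chain-execute
	 * as on an unbound gcwq.
	 */
	static bool need_more_worker(struct worker_pool *pool)
	{
		return !list_empty(&pool->worklist) && __need_more_worker(pool);
	}

	static bool keep_working(struct worker_pool *pool)
	{
		atomic_t *nr_running = get_pool_nr_running(pool);

		return !list_empty(&pool->worklist) &&
			atomic_read(nr_running) <= 1;
	}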
@@ -3626,19 +3434,18 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 {
 	unsigned int cpu = (unsigned long)hcpu;
 	struct global_cwq *gcwq = get_gcwq(cpu);
-	struct task_struct *new_trustee = NULL;
 	struct worker_pool *pool;
+	struct work_struct unbind_work;
 	unsigned long flags;
 
 	action &= ~CPU_TASKS_FROZEN;
 
 	switch (action) {
 	case CPU_DOWN_PREPARE:
-		new_trustee = kthread_create(trustee_thread, gcwq,
-					     "workqueue_trustee/%d\n", cpu);
-		if (IS_ERR(new_trustee))
-			return notifier_from_errno(PTR_ERR(new_trustee));
-		kthread_bind(new_trustee, cpu);
+		/* unbinding should happen on the local CPU */
+		INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn);
+		schedule_work_on(cpu, &unbind_work);
+		flush_work(&unbind_work);
 		break;
 
 	case CPU_UP_PREPARE:
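
CPU_DOWN_PREPARE now uses the generic "run a function on a specific CPU"
idiom with an on-stack work item.  The same pattern in isolation, with
hypothetical function names (only INIT_WORK_ONSTACK(), schedule_work_on()
and flush_work() are real APIs):

	/* hypothetical example of the on-stack work idiom used above */
	static void report_cpu_fn(struct work_struct *work)
	{
		pr_info("running on CPU %d\n", raw_smp_processor_id());
	}

	static void run_report_on(int cpu)
	{
		struct work_struct report_work;

		INIT_WORK_ONSTACK(&report_work, report_cpu_fn);
		schedule_work_on(cpu, &report_work);	/* queue on @cpu */
		flush_work(&report_work);		/* wait for it */
	}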
@@ -3662,27 +3469,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 	spin_lock_irqsave(&gcwq->lock, flags);
 
 	switch (action) {
-	case CPU_DOWN_PREPARE:
-		/* initialize trustee and tell it to acquire the gcwq */
-		BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
-		gcwq->trustee = new_trustee;
-		gcwq->trustee_state = TRUSTEE_START;
-		wake_up_process(gcwq->trustee);
-		wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
-		break;
-
-	case CPU_POST_DEAD:
-		gcwq->trustee_state = TRUSTEE_BUTCHER;
-		break;
-
 	case CPU_DOWN_FAILED:
 	case CPU_ONLINE:
-		if (gcwq->trustee_state != TRUSTEE_DONE) {
-			gcwq->trustee_state = TRUSTEE_RELEASE;
-			wake_up_process(gcwq->trustee);
-			wait_trustee_state(gcwq, TRUSTEE_DONE);
-		}
-
 		spin_unlock_irq(&gcwq->lock);
 		gcwq_claim_management(gcwq);
 		spin_lock_irq(&gcwq->lock);
@@ -3727,7 +3515,6 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
 {
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_DOWN_PREPARE:
-	case CPU_POST_DEAD:
 		return workqueue_cpu_callback(nfb, action, hcpu);
 	}
 	return NOTIFY_OK;
@@ -3960,9 +3747,6 @@ static int __init init_workqueues(void)
 		}
 
 		init_waitqueue_head(&gcwq->rebind_hold);
-
-		gcwq->trustee_state = TRUSTEE_DONE;
-		init_waitqueue_head(&gcwq->trustee_wait);
 	}
 
 	/* create the initial worker */