-rw-r--r--   kernel/workqueue.c   288
1 file changed, 36 insertions(+), 252 deletions(-)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index acfabb22e2c4..d1545daa74ad 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -79,13 +79,6 @@ enum {
 	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND |
 				  WORKER_CPU_INTENSIVE,
 
-	/* gcwq->trustee_state */
-	TRUSTEE_START		= 0,		/* start */
-	TRUSTEE_IN_CHARGE	= 1,		/* trustee in charge of gcwq */
-	TRUSTEE_BUTCHER		= 2,		/* butcher workers */
-	TRUSTEE_RELEASE		= 3,		/* release workers */
-	TRUSTEE_DONE		= 4,		/* trustee is done */
-
 	NR_WORKER_POOLS		= 2,		/* # worker pools per gcwq */
 
 	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */
@@ -100,7 +93,6 @@ enum {
 						  (min two ticks) */
 	MAYDAY_INTERVAL		= HZ / 10,	/* and then every 100ms */
 	CREATE_COOLDOWN		= HZ,		/* time to breath after fail */
-	TRUSTEE_COOLDOWN	= HZ / 10,	/* for trustee draining */
 
 	/*
 	 * Rescue workers are used only on emergencies and shared by
@@ -194,10 +186,6 @@ struct global_cwq {
 	struct worker_pool	pools[2];	/* normal and highpri pools */
 
 	wait_queue_head_t	rebind_hold;	/* rebind hold wait */
-
-	struct task_struct	*trustee;	/* L: for gcwq shutdown */
-	unsigned int		trustee_state;	/* L: trustee state */
-	wait_queue_head_t	trustee_wait;	/* trustee wait */
 } ____cacheline_aligned_in_smp;
 
 /*
@@ -753,11 +741,11 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
 	 * worklist not empty test sequence is in insert_work().
 	 * Please read comment there.
 	 *
-	 * NOT_RUNNING is clear.  This means that trustee is not in
-	 * charge and we're running on the local cpu w/ rq lock held
-	 * and preemption disabled, which in turn means that none else
-	 * could be manipulating idle_list, so dereferencing idle_list
-	 * without gcwq lock is safe.
+	 * NOT_RUNNING is clear.  This means that we're bound to and
+	 * running on the local cpu w/ rq lock held and preemption
+	 * disabled, which in turn means that none else could be
+	 * manipulating idle_list, so dereferencing idle_list without gcwq
+	 * lock is safe.
 	 */
 	if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist))
 		to_wakeup = first_worker(pool);
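
The rewritten comment leans on workqueue concurrency management: each pool tracks its runnable workers in an atomic nr_running, and a worker about to block hands off to the first idle worker only if work is still queued. A minimal userspace analog of that hand-off rule — all names here are illustrative, not kernel identifiers:

```c
/* Userspace analog of the sleep hand-off: a blocking worker decrements
 * the pool's runnable count and, if work remains queued, nominates the
 * first idle worker so the pool stays saturated. Illustrative only. */
#include <stdatomic.h>
#include <stddef.h>

struct pool {
	atomic_int nr_running;   /* runnable workers, touched locklessly */
	int        nr_queued;    /* pending work items (lock-protected)  */
	void      *first_idle;   /* head of the idle LIFO, may be NULL   */
};

/* Called when a running worker is about to block.  Returns the worker
 * to wake, or NULL: wake someone only if we were the last runnable
 * worker (old count was 1) and the worklist is non-empty. */
static void *worker_sleeping(struct pool *p)
{
	if (atomic_fetch_sub(&p->nr_running, 1) == 1 && p->nr_queued > 0)
		return p->first_idle;
	return NULL;
}
```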
@@ -1217,19 +1205,16 @@ static void worker_enter_idle(struct worker *worker)
 	/* idle_list is LIFO */
 	list_add(&worker->entry, &pool->idle_list);
 
-	if (likely(gcwq->trustee_state != TRUSTEE_DONE)) {
-		if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
-			mod_timer(&pool->idle_timer,
-				  jiffies + IDLE_WORKER_TIMEOUT);
-	} else
-		wake_up_all(&gcwq->trustee_wait);
+	if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
+		mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
 
 	/*
-	 * Sanity check nr_running.  Because trustee releases gcwq->lock
-	 * between setting %WORKER_UNBOUND and zapping nr_running, the
-	 * warning may trigger spuriously.  Check iff trustee is idle.
+	 * Sanity check nr_running.  Because gcwq_unbind_fn() releases
+	 * gcwq->lock between setting %WORKER_UNBOUND and zapping
+	 * nr_running, the warning may trigger spuriously.  Check iff
+	 * unbind is not in progress.
 	 */
-	WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE &&
+	WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) &&
 		     pool->nr_workers == pool->nr_idle &&
 		     atomic_read(get_pool_nr_running(pool)));
 }
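
For reference, the too_many_workers() test that now gates the idle timer unconditionally looks roughly like this in kernels of this era (reconstructed from memory, so treat the details as approximate): it keeps a small idle cushion and lets the timer reap idlers beyond about a quarter of the busy count.

```c
/* Approximate shape of too_many_workers() in this era.  The manager,
 * if any, is parked and counts as idle here.  MAX_IDLE_WORKERS_RATIO
 * is 4: keep at most ~25% of busy workers idle, with a floor of two
 * idle workers that are never reaped. */
static bool too_many_workers(struct worker_pool *pool)
{
	bool managing = mutex_is_locked(&pool->manager_mutex);
	int nr_idle = pool->nr_idle + managing; /* manager is idle here */
	int nr_busy = pool->nr_workers - nr_idle;

	return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}
```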
@@ -3367,46 +3352,9 @@ EXPORT_SYMBOL_GPL(work_busy);
  * gcwqs serve mix of short, long and very long running works making
  * blocked draining impractical.
  *
- * This is solved by allowing a gcwq to be detached from CPU, running it
- * with unbound workers and allowing it to be reattached later if the cpu
- * comes back online.  A separate thread is created to govern a gcwq in
- * such state and is called the trustee of the gcwq.
- *
- * Trustee states and their descriptions.
- *
- * START	Command state used on startup.  On CPU_DOWN_PREPARE, a
- *		new trustee is started with this state.
- *
- * IN_CHARGE	Once started, trustee will enter this state after
- *		assuming the manager role and making all existing
- *		workers rogue.  DOWN_PREPARE waits for trustee to
- *		enter this state.  After reaching IN_CHARGE, trustee
- *		tries to execute the pending worklist until it's empty
- *		and the state is set to BUTCHER, or the state is set
- *		to RELEASE.
- *
- * BUTCHER	Command state which is set by the cpu callback after
- *		the cpu has went down.  Once this state is set trustee
- *		knows that there will be no new works on the worklist
- *		and once the worklist is empty it can proceed to
- *		killing idle workers.
- *
- * RELEASE	Command state which is set by the cpu callback if the
- *		cpu down has been canceled or it has come online
- *		again.  After recognizing this state, trustee stops
- *		trying to drain or butcher and clears ROGUE, rebinds
- *		all remaining workers back to the cpu and releases
- *		manager role.
- *
- * DONE		Trustee will enter this state after BUTCHER or RELEASE
- *		is complete.
- *
- *          trustee                 CPU                draining
- *          took over               down               complete
- * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
- *                        |                     |                  ^
- *                        | CPU is back online  v   return workers |
- *                         ----------------> RELEASE --------------
+ * This is solved by allowing a gcwq to be disassociated from the CPU
+ * running as an unbound one and allowing it to be reattached later if the
+ * cpu comes back online.
  */
 
 /* claim manager positions of all pools */
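
After this change, "disassociated" is no more than a gcwq flag plus the %WORKER_UNBOUND bit on each worker, replacing the five-state trustee machine. Two illustrative helpers — they do not exist in workqueue.c, and are shown only to make the invariant concrete:

```c
/* Illustrative only: association is now a single flag test under
 * gcwq->lock rather than a five-state machine. */
static bool gcwq_is_associated(struct global_cwq *gcwq)
{
	return !(gcwq->flags & GCWQ_DISASSOCIATED);
}

/* Likewise illustrative: WORKER_UNBOUND is folded into
 * WORKER_NOT_RUNNING (see the enum hunk above), so an unbound worker
 * never bumps nr_running and concurrency management is naturally off
 * for the whole time the gcwq stays disassociated. */
static bool worker_is_concurrency_managed(struct worker *worker)
{
	return !(worker->flags & WORKER_NOT_RUNNING);
}
```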
@@ -3427,61 +3375,11 @@ static void gcwq_release_management(struct global_cwq *gcwq)
 		mutex_unlock(&pool->manager_mutex);
 }
 
-/**
- * trustee_wait_event_timeout - timed event wait for trustee
- * @cond: condition to wait for
- * @timeout: timeout in jiffies
- *
- * wait_event_timeout() for trustee to use.  Handles locking and
- * checks for RELEASE request.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
- * multiple times.  To be used by trustee.
- *
- * RETURNS:
- * Positive indicating left time if @cond is satisfied, 0 if timed
- * out, -1 if canceled.
- */
-#define trustee_wait_event_timeout(cond, timeout) ({			\
-	long __ret = (timeout);						\
-	while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) &&	\
-	       __ret) {							\
-		spin_unlock_irq(&gcwq->lock);				\
-		__wait_event_timeout(gcwq->trustee_wait, (cond) ||	\
-			(gcwq->trustee_state == TRUSTEE_RELEASE),	\
-			__ret);						\
-		spin_lock_irq(&gcwq->lock);				\
-	}								\
-	gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret);		\
-})
-
-/**
- * trustee_wait_event - event wait for trustee
- * @cond: condition to wait for
- *
- * wait_event() for trustee to use.  Automatically handles locking and
- * checks for CANCEL request.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
- * multiple times.  To be used by trustee.
- *
- * RETURNS:
- * 0 if @cond is satisfied, -1 if canceled.
- */
-#define trustee_wait_event(cond) ({					\
-	long __ret1;							\
-	__ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
-	__ret1 < 0 ? -1 : 0;						\
-})
-
-static int __cpuinit trustee_thread(void *__gcwq)
+static void gcwq_unbind_fn(struct work_struct *work)
 {
-	struct global_cwq *gcwq = __gcwq;
+	struct global_cwq *gcwq = get_gcwq(smp_processor_id());
 	struct worker_pool *pool;
 	struct worker *worker;
-	struct work_struct *work;
 	struct hlist_node *pos;
 	int i;
 
@@ -3505,119 +3403,29 @@ static int __cpuinit trustee_thread(void *__gcwq)
 
 	gcwq->flags |= GCWQ_DISASSOCIATED;
 
+	spin_unlock_irq(&gcwq->lock);
+	gcwq_release_management(gcwq);
+
 	/*
 	 * Call schedule() so that we cross rq->lock and thus can guarantee
-	 * sched callbacks see the unbound flag.  This is necessary as
-	 * scheduler callbacks may be invoked from other cpus.
+	 * sched callbacks see the %WORKER_UNBOUND flag.  This is necessary
+	 * as scheduler callbacks may be invoked from other cpus.
 	 */
-	spin_unlock_irq(&gcwq->lock);
 	schedule();
-	spin_lock_irq(&gcwq->lock);
 
 	/*
-	 * Sched callbacks are disabled now.  Zap nr_running.  After
-	 * this, nr_running stays zero and need_more_worker() and
-	 * keep_working() are always true as long as the worklist is
-	 * not empty.
+	 * Sched callbacks are disabled now.  Zap nr_running.  After this,
+	 * nr_running stays zero and need_more_worker() and keep_working()
+	 * are always true as long as the worklist is not empty.  @gcwq now
+	 * behaves as unbound (in terms of concurrency management) gcwq
+	 * which is served by workers tied to the CPU.
+	 *
+	 * On return from this function, the current worker would trigger
+	 * unbound chain execution of pending work items if other workers
+	 * didn't already.
 	 */
 	for_each_worker_pool(pool, gcwq)
 		atomic_set(get_pool_nr_running(pool), 0);
-
-	spin_unlock_irq(&gcwq->lock);
-	for_each_worker_pool(pool, gcwq)
-		del_timer_sync(&pool->idle_timer);
-	spin_lock_irq(&gcwq->lock);
-
-	/*
-	 * We're now in charge.  Notify and proceed to drain.  We need
-	 * to keep the gcwq running during the whole CPU down
-	 * procedure as other cpu hotunplug callbacks may need to
-	 * flush currently running tasks.
-	 */
-	gcwq->trustee_state = TRUSTEE_IN_CHARGE;
-	wake_up_all(&gcwq->trustee_wait);
-
-	/*
-	 * The original cpu is in the process of dying and may go away
-	 * anytime now.  When that happens, we and all workers would
-	 * be migrated to other cpus.  Try draining any left work.  We
-	 * want to get it over with ASAP - spam rescuers, wake up as
-	 * many idlers as necessary and create new ones till the
-	 * worklist is empty.  Note that if the gcwq is frozen, there
-	 * may be frozen works in freezable cwqs.  Don't declare
-	 * completion while frozen.
-	 */
-	while (true) {
-		bool busy = false;
-
-		for_each_worker_pool(pool, gcwq)
-			busy |= pool->nr_workers != pool->nr_idle;
-
-		if (!busy && !(gcwq->flags & GCWQ_FREEZING) &&
-		    gcwq->trustee_state != TRUSTEE_IN_CHARGE)
-			break;
-
-		for_each_worker_pool(pool, gcwq) {
-			int nr_works = 0;
-
-			list_for_each_entry(work, &pool->worklist, entry) {
-				send_mayday(work);
-				nr_works++;
-			}
-
-			list_for_each_entry(worker, &pool->idle_list, entry) {
-				if (!nr_works--)
-					break;
-				wake_up_process(worker->task);
-			}
-
-			if (need_to_create_worker(pool)) {
-				spin_unlock_irq(&gcwq->lock);
-				worker = create_worker(pool);
-				spin_lock_irq(&gcwq->lock);
-				if (worker)
-					start_worker(worker);
-			}
-		}
-
-		/* give a breather */
-		if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
-			break;
-	}
-
-	gcwq_release_management(gcwq);
-
-	/* notify completion */
-	gcwq->trustee = NULL;
-	gcwq->trustee_state = TRUSTEE_DONE;
-	wake_up_all(&gcwq->trustee_wait);
-	spin_unlock_irq(&gcwq->lock);
-	return 0;
-}
-
-/**
- * wait_trustee_state - wait for trustee to enter the specified state
- * @gcwq: gcwq the trustee of interest belongs to
- * @state: target state to wait for
- *
- * Wait for the trustee to reach @state.  DONE is already matched.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
- * multiple times.  To be used by cpu_callback.
- */
-static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
-__releases(&gcwq->lock)
-__acquires(&gcwq->lock)
-{
-	if (!(gcwq->trustee_state == state ||
-	      gcwq->trustee_state == TRUSTEE_DONE)) {
-		spin_unlock_irq(&gcwq->lock);
-		__wait_event(gcwq->trustee_wait,
-			     gcwq->trustee_state == state ||
-			     gcwq->trustee_state == TRUSTEE_DONE);
-		spin_lock_irq(&gcwq->lock);
-	}
 }
 
 static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
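
Two details in gcwq_unbind_fn() above deserve unpacking. The bare schedule() works because the wq_worker_sleeping()/wq_worker_waking_up() hooks run under the local rq->lock; passing through the scheduler once on this CPU guarantees any later callback observes %WORKER_UNBOUND. And once nr_running is pinned at zero, need_more_worker() degenerates to a non-empty-worklist test, which is what drives the "unbound chain execution" the comment mentions. Sketched roughly below — simplified from this era's workqueue.c, not a verbatim quote, and unbound_chain_kick() is a hypothetical name for logic that lives inline in the work-processing path:

```c
/* Roughly how the chain works once nr_running is pinned at zero
 * (simplified; not verbatim from workqueue.c). */
static bool need_more_worker(struct worker_pool *pool)
{
	/* nr_running == 0 always holds after unbind, so this reduces
	 * to "is there queued work?" */
	return !list_empty(&pool->worklist) &&
	       !atomic_read(get_pool_nr_running(pool));
}

/* Hypothetical helper naming the inline step: an unbound worker kicks
 * the next idle worker before burying itself in a work item, so pending
 * items keep draining even with concurrency management switched off. */
static void unbound_chain_kick(struct worker *worker, struct worker_pool *pool)
{
	if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
		wake_up_worker(pool);	/* wakes first_worker(pool) */
}
```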
@@ -3626,19 +3434,18 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 {
 	unsigned int cpu = (unsigned long)hcpu;
 	struct global_cwq *gcwq = get_gcwq(cpu);
-	struct task_struct *new_trustee = NULL;
 	struct worker_pool *pool;
+	struct work_struct unbind_work;
 	unsigned long flags;
 
 	action &= ~CPU_TASKS_FROZEN;
 
 	switch (action) {
 	case CPU_DOWN_PREPARE:
-		new_trustee = kthread_create(trustee_thread, gcwq,
-					     "workqueue_trustee/%d\n", cpu);
-		if (IS_ERR(new_trustee))
-			return notifier_from_errno(PTR_ERR(new_trustee));
-		kthread_bind(new_trustee, cpu);
+		/* unbinding should happen on the local CPU */
+		INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn);
+		schedule_work_on(cpu, &unbind_work);
+		flush_work(&unbind_work);
 		break;
 
 	case CPU_UP_PREPARE:
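
The DOWN_PREPARE branch is an instance of a generally useful pattern: run a function on a specific CPU from process context and wait for it, using an on-stack work item. A minimal sketch of the pattern — my_on_cpu_fn and run_on_cpu are hypothetical names, not kernel APIs:

```c
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/printk.h>

/* Hypothetical payload: runs with smp_processor_id() == cpu for as
 * long as that CPU stays online for the duration. */
static void my_on_cpu_fn(struct work_struct *work)
{
	pr_info("running on cpu %d\n", smp_processor_id());
}

/* Hypothetical helper showing the pattern used above: the work item
 * lives on the stack, so flush_work() must complete before return. */
static void run_on_cpu(int cpu)
{
	struct work_struct w;

	INIT_WORK_ONSTACK(&w, my_on_cpu_fn);
	schedule_work_on(cpu, &w);	/* queue on that CPU's gcwq */
	flush_work(&w);			/* wait for it to finish */
}
```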
@@ -3662,27 +3469,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 	spin_lock_irqsave(&gcwq->lock, flags);
 
 	switch (action) {
-	case CPU_DOWN_PREPARE:
-		/* initialize trustee and tell it to acquire the gcwq */
-		BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
-		gcwq->trustee = new_trustee;
-		gcwq->trustee_state = TRUSTEE_START;
-		wake_up_process(gcwq->trustee);
-		wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
-		break;
-
-	case CPU_POST_DEAD:
-		gcwq->trustee_state = TRUSTEE_BUTCHER;
-		break;
-
 	case CPU_DOWN_FAILED:
 	case CPU_ONLINE:
-		if (gcwq->trustee_state != TRUSTEE_DONE) {
-			gcwq->trustee_state = TRUSTEE_RELEASE;
-			wake_up_process(gcwq->trustee);
-			wait_trustee_state(gcwq, TRUSTEE_DONE);
-		}
-
 		spin_unlock_irq(&gcwq->lock);
 		gcwq_claim_management(gcwq);
 		spin_lock_irq(&gcwq->lock);
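
DOWN_FAILED/ONLINE now funnels through the same claim-management step that gcwq_unbind_fn() uses on the way down: holding every pool's manager_mutex excludes worker creation and destruction while the association state flips. The claim side, sketched to mirror the release loop shown earlier — approximate, and the real code may pick lockdep subclasses differently:

```c
/* Approximate counterpart to gcwq_release_management() above: take
 * each pool's manager_mutex, using the pool index as the lockdep
 * subclass so nesting the two pools' mutexes doesn't warn. */
static void gcwq_claim_management(struct global_cwq *gcwq)
{
	struct worker_pool *pool;

	for_each_worker_pool(pool, gcwq)
		mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools);
}
```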
@@ -3727,7 +3515,6 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
 {
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_DOWN_PREPARE:
-	case CPU_POST_DEAD:
 		return workqueue_cpu_callback(nfb, action, hcpu);
 	}
 	return NOTIFY_OK;
@@ -3960,9 +3747,6 @@ static int __init init_workqueues(void)
 	}
 
 	init_waitqueue_head(&gcwq->rebind_hold);
-
-	gcwq->trustee_state = TRUSTEE_DONE;
-	init_waitqueue_head(&gcwq->trustee_wait);
 }
 
 /* create the initial worker */