author		Tejun Heo <tj@kernel.org>	2012-07-17 15:39:27 -0400
committer	Tejun Heo <tj@kernel.org>	2012-07-17 15:39:27 -0400
commit		628c78e7ea19d5b70d2b6a59030362168cdbe1ad (patch)
tree		7867a9f82aae3d31c40356f32ae24223ae0ddf0c /kernel/workqueue.c
parent		3ce63377305b694f53e7dd0c72907591c5344224 (diff)
workqueue: remove CPU offline trustee
With the previous changes, a disassociated global_cwq can now run as an unbound one on its own - it can create workers as necessary to drain remaining works after the CPU has been brought down, and it manages the number of workers using the usual idle timer mechanism, making the trustee completely redundant except for the actual unbinding operation.

This patch removes the trustee and lets a disassociated global_cwq manage itself. Unbinding is moved to a work item (for CPU affinity) which is scheduled and flushed from CPU_DOWN_PREPARE.

This patch also moves nr_running clearing outside the gcwq and manager locks to simplify the code. As nr_running is unused at that point, this is safe.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: "Rafael J. Wysocki" <rjw@sisk.pl>
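The work-item-based unbinding relies only on standard workqueue primitives that appear in the hunks below: INIT_WORK_ONSTACK() to set up a work item on the stack, schedule_work_on() to queue it on a specific CPU so the callback runs there, and flush_work() to wait for it to finish. A minimal sketch of the pattern, assuming a hypothetical callback - everything except the workqueue API itself is illustrative:

	#include <linux/workqueue.h>
	#include <linux/smp.h>
	#include <linux/printk.h>

	/* Hypothetical callback; a per-cpu kworker bound to @cpu executes it. */
	static void run_here_fn(struct work_struct *work)
	{
		pr_info("work ran on CPU %d\n", smp_processor_id());
	}

	/* Queue a work item on @cpu and wait until it has executed there. */
	static void run_on_cpu(int cpu)
	{
		struct work_struct w;

		/* On-stack work items must use INIT_WORK_ONSTACK(). */
		INIT_WORK_ONSTACK(&w, run_here_fn);
		schedule_work_on(cpu, &w);	/* run on @cpu, not locally */
		flush_work(&w);			/* wait for completion */
	}

Scheduling from CPU_DOWN_PREPARE and flushing immediately guarantees the unbind callback runs on the dying CPU before it goes away, without needing a dedicated kthread.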
Diffstat (limited to 'kernel/workqueue.c')
-rw-r--r--	kernel/workqueue.c	288
1 file changed, 36 insertions(+), 252 deletions(-)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index acfabb22e2c4..d1545daa74ad 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -79,13 +79,6 @@ enum {
 	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND |
 				  WORKER_CPU_INTENSIVE,
 
-	/* gcwq->trustee_state */
-	TRUSTEE_START		= 0,		/* start */
-	TRUSTEE_IN_CHARGE	= 1,		/* trustee in charge of gcwq */
-	TRUSTEE_BUTCHER		= 2,		/* butcher workers */
-	TRUSTEE_RELEASE		= 3,		/* release workers */
-	TRUSTEE_DONE		= 4,		/* trustee is done */
-
 	NR_WORKER_POOLS		= 2,		/* # worker pools per gcwq */
 
 	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */
@@ -100,7 +93,6 @@ enum {
 						(min two ticks) */
 	MAYDAY_INTERVAL		= HZ / 10,	/* and then every 100ms */
 	CREATE_COOLDOWN		= HZ,		/* time to breath after fail */
-	TRUSTEE_COOLDOWN	= HZ / 10,	/* for trustee draining */
 
 	/*
 	 * Rescue workers are used only on emergencies and shared by
@@ -194,10 +186,6 @@ struct global_cwq {
 	struct worker_pool	pools[2];	/* normal and highpri pools */
 
 	wait_queue_head_t	rebind_hold;	/* rebind hold wait */
-
-	struct task_struct	*trustee;	/* L: for gcwq shutdown */
-	unsigned int		trustee_state;	/* L: trustee state */
-	wait_queue_head_t	trustee_wait;	/* trustee wait */
 } ____cacheline_aligned_in_smp;
 
 /*
@@ -753,11 +741,11 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
 	 * worklist not empty test sequence is in insert_work().
 	 * Please read comment there.
 	 *
-	 * NOT_RUNNING is clear.  This means that trustee is not in
-	 * charge and we're running on the local cpu w/ rq lock held
-	 * and preemption disabled, which in turn means that none else
-	 * could be manipulating idle_list, so dereferencing idle_list
-	 * without gcwq lock is safe.
+	 * NOT_RUNNING is clear.  This means that we're bound to and
+	 * running on the local cpu w/ rq lock held and preemption
+	 * disabled, which in turn means that none else could be
+	 * manipulating idle_list, so dereferencing idle_list without gcwq
+	 * lock is safe.
 	 */
 	if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist))
 		to_wakeup = first_worker(pool);
@@ -1217,19 +1205,16 @@ static void worker_enter_idle(struct worker *worker)
 	/* idle_list is LIFO */
 	list_add(&worker->entry, &pool->idle_list);
 
-	if (likely(gcwq->trustee_state != TRUSTEE_DONE)) {
-		if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
-			mod_timer(&pool->idle_timer,
-				  jiffies + IDLE_WORKER_TIMEOUT);
-	} else
-		wake_up_all(&gcwq->trustee_wait);
+	if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
+		mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
 
 	/*
-	 * Sanity check nr_running.  Because trustee releases gcwq->lock
-	 * between setting %WORKER_UNBOUND and zapping nr_running, the
-	 * warning may trigger spuriously.  Check iff trustee is idle.
+	 * Sanity check nr_running.  Because gcwq_unbind_fn() releases
+	 * gcwq->lock between setting %WORKER_UNBOUND and zapping
+	 * nr_running, the warning may trigger spuriously.  Check iff
+	 * unbind is not in progress.
 	 */
-	WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE &&
+	WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) &&
 		     pool->nr_workers == pool->nr_idle &&
 		     atomic_read(get_pool_nr_running(pool)));
 }
@@ -3367,46 +3352,9 @@ EXPORT_SYMBOL_GPL(work_busy);
  * gcwqs serve mix of short, long and very long running works making
  * blocked draining impractical.
  *
- * This is solved by allowing a gcwq to be detached from CPU, running it
- * with unbound workers and allowing it to be reattached later if the cpu
- * comes back online.  A separate thread is created to govern a gcwq in
- * such state and is called the trustee of the gcwq.
- *
- * Trustee states and their descriptions.
- *
- * START	Command state used on startup.  On CPU_DOWN_PREPARE, a
- *		new trustee is started with this state.
- *
- * IN_CHARGE	Once started, trustee will enter this state after
- *		assuming the manager role and making all existing
- *		workers rogue.  DOWN_PREPARE waits for trustee to
- *		enter this state.  After reaching IN_CHARGE, trustee
- *		tries to execute the pending worklist until it's empty
- *		and the state is set to BUTCHER, or the state is set
- *		to RELEASE.
- *
- * BUTCHER	Command state which is set by the cpu callback after
- *		the cpu has went down.  Once this state is set trustee
- *		knows that there will be no new works on the worklist
- *		and once the worklist is empty it can proceed to
- *		killing idle workers.
- *
- * RELEASE	Command state which is set by the cpu callback if the
- *		cpu down has been canceled or it has come online
- *		again.  After recognizing this state, trustee stops
- *		trying to drain or butcher and clears ROGUE, rebinds
- *		all remaining workers back to the cpu and releases
- *		manager role.
- *
- * DONE		Trustee will enter this state after BUTCHER or RELEASE
- *		is complete.
- *
- *          trustee                 CPU                draining
- *          took over                down               complete
- * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
- *                        |                     |                  ^
- *                        | CPU is back online  v   return workers |
- *                         ----------------> RELEASE --------------
+ * This is solved by allowing a gcwq to be disassociated from the CPU
+ * running as an unbound one and allowing it to be reattached later if the
+ * cpu comes back online.
  */
 
 /* claim manager positions of all pools */
@@ -3427,61 +3375,11 @@ static void gcwq_release_management(struct global_cwq *gcwq)
 		mutex_unlock(&pool->manager_mutex);
 }
 
-/**
- * trustee_wait_event_timeout - timed event wait for trustee
- * @cond: condition to wait for
- * @timeout: timeout in jiffies
- *
- * wait_event_timeout() for trustee to use.  Handles locking and
- * checks for RELEASE request.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
- * multiple times.  To be used by trustee.
- *
- * RETURNS:
- * Positive indicating left time if @cond is satisfied, 0 if timed
- * out, -1 if canceled.
- */
-#define trustee_wait_event_timeout(cond, timeout) ({			\
-	long __ret = (timeout);						\
-	while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) &&	\
-	       __ret) {							\
-		spin_unlock_irq(&gcwq->lock);				\
-		__wait_event_timeout(gcwq->trustee_wait, (cond) ||	\
-			(gcwq->trustee_state == TRUSTEE_RELEASE),	\
-			__ret);						\
-		spin_lock_irq(&gcwq->lock);				\
-	}								\
-	gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret);		\
-})
-
-/**
- * trustee_wait_event - event wait for trustee
- * @cond: condition to wait for
- *
- * wait_event() for trustee to use.  Automatically handles locking and
- * checks for CANCEL request.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
- * multiple times.  To be used by trustee.
- *
- * RETURNS:
- * 0 if @cond is satisfied, -1 if canceled.
- */
-#define trustee_wait_event(cond) ({					\
-	long __ret1;							\
-	__ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
-	__ret1 < 0 ? -1 : 0;						\
-})
-
-static int __cpuinit trustee_thread(void *__gcwq)
+static void gcwq_unbind_fn(struct work_struct *work)
 {
-	struct global_cwq *gcwq = __gcwq;
+	struct global_cwq *gcwq = get_gcwq(smp_processor_id());
 	struct worker_pool *pool;
 	struct worker *worker;
-	struct work_struct *work;
 	struct hlist_node *pos;
 	int i;
 
@@ -3505,119 +3403,29 @@ static int __cpuinit trustee_thread(void *__gcwq)
 
 	gcwq->flags |= GCWQ_DISASSOCIATED;
 
+	spin_unlock_irq(&gcwq->lock);
+	gcwq_release_management(gcwq);
+
 	/*
 	 * Call schedule() so that we cross rq->lock and thus can guarantee
-	 * sched callbacks see the unbound flag.  This is necessary as
-	 * scheduler callbacks may be invoked from other cpus.
+	 * sched callbacks see the %WORKER_UNBOUND flag.  This is necessary
+	 * as scheduler callbacks may be invoked from other cpus.
 	 */
-	spin_unlock_irq(&gcwq->lock);
 	schedule();
-	spin_lock_irq(&gcwq->lock);
 
 	/*
-	 * Sched callbacks are disabled now.  Zap nr_running.  After
-	 * this, nr_running stays zero and need_more_worker() and
-	 * keep_working() are always true as long as the worklist is
-	 * not empty.
+	 * Sched callbacks are disabled now.  Zap nr_running.  After this,
+	 * nr_running stays zero and need_more_worker() and keep_working()
+	 * are always true as long as the worklist is not empty.  @gcwq now
+	 * behaves as unbound (in terms of concurrency management) gcwq
+	 * which is served by workers tied to the CPU.
+	 *
+	 * On return from this function, the current worker would trigger
+	 * unbound chain execution of pending work items if other workers
+	 * didn't already.
 	 */
 	for_each_worker_pool(pool, gcwq)
 		atomic_set(get_pool_nr_running(pool), 0);
-
-	spin_unlock_irq(&gcwq->lock);
-	for_each_worker_pool(pool, gcwq)
-		del_timer_sync(&pool->idle_timer);
-	spin_lock_irq(&gcwq->lock);
-
-	/*
-	 * We're now in charge.  Notify and proceed to drain.  We need
-	 * to keep the gcwq running during the whole CPU down
-	 * procedure as other cpu hotunplug callbacks may need to
-	 * flush currently running tasks.
-	 */
-	gcwq->trustee_state = TRUSTEE_IN_CHARGE;
-	wake_up_all(&gcwq->trustee_wait);
-
-	/*
-	 * The original cpu is in the process of dying and may go away
-	 * anytime now.  When that happens, we and all workers would
-	 * be migrated to other cpus.  Try draining any left work.  We
-	 * want to get it over with ASAP - spam rescuers, wake up as
-	 * many idlers as necessary and create new ones till the
-	 * worklist is empty.  Note that if the gcwq is frozen, there
-	 * may be frozen works in freezable cwqs.  Don't declare
-	 * completion while frozen.
-	 */
-	while (true) {
-		bool busy = false;
-
-		for_each_worker_pool(pool, gcwq)
-			busy |= pool->nr_workers != pool->nr_idle;
-
-		if (!busy && !(gcwq->flags & GCWQ_FREEZING) &&
-		    gcwq->trustee_state != TRUSTEE_IN_CHARGE)
-			break;
-
-		for_each_worker_pool(pool, gcwq) {
-			int nr_works = 0;
-
-			list_for_each_entry(work, &pool->worklist, entry) {
-				send_mayday(work);
-				nr_works++;
-			}
-
-			list_for_each_entry(worker, &pool->idle_list, entry) {
-				if (!nr_works--)
-					break;
-				wake_up_process(worker->task);
-			}
-
-			if (need_to_create_worker(pool)) {
-				spin_unlock_irq(&gcwq->lock);
-				worker = create_worker(pool);
-				spin_lock_irq(&gcwq->lock);
-				if (worker)
-					start_worker(worker);
-			}
-		}
-
-		/* give a breather */
-		if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
-			break;
-	}
-
-	gcwq_release_management(gcwq);
-
-	/* notify completion */
-	gcwq->trustee = NULL;
-	gcwq->trustee_state = TRUSTEE_DONE;
-	wake_up_all(&gcwq->trustee_wait);
-	spin_unlock_irq(&gcwq->lock);
-	return 0;
-}
-
-/**
- * wait_trustee_state - wait for trustee to enter the specified state
- * @gcwq: gcwq the trustee of interest belongs to
- * @state: target state to wait for
- *
- * Wait for the trustee to reach @state.  DONE is already matched.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
- * multiple times.  To be used by cpu_callback.
- */
-static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
-__releases(&gcwq->lock)
-__acquires(&gcwq->lock)
-{
-	if (!(gcwq->trustee_state == state ||
-	      gcwq->trustee_state == TRUSTEE_DONE)) {
-		spin_unlock_irq(&gcwq->lock);
-		__wait_event(gcwq->trustee_wait,
-			     gcwq->trustee_state == state ||
-			     gcwq->trustee_state == TRUSTEE_DONE);
-		spin_lock_irq(&gcwq->lock);
-	}
 }
 
 static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
@@ -3626,19 +3434,18 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 {
 	unsigned int cpu = (unsigned long)hcpu;
 	struct global_cwq *gcwq = get_gcwq(cpu);
-	struct task_struct *new_trustee = NULL;
 	struct worker_pool *pool;
+	struct work_struct unbind_work;
 	unsigned long flags;
 
 	action &= ~CPU_TASKS_FROZEN;
 
 	switch (action) {
 	case CPU_DOWN_PREPARE:
-		new_trustee = kthread_create(trustee_thread, gcwq,
-					     "workqueue_trustee/%d\n", cpu);
-		if (IS_ERR(new_trustee))
-			return notifier_from_errno(PTR_ERR(new_trustee));
-		kthread_bind(new_trustee, cpu);
+		/* unbinding should happen on the local CPU */
+		INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn);
+		schedule_work_on(cpu, &unbind_work);
+		flush_work(&unbind_work);
 		break;
 
 	case CPU_UP_PREPARE:
@@ -3662,27 +3469,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 	spin_lock_irqsave(&gcwq->lock, flags);
 
 	switch (action) {
-	case CPU_DOWN_PREPARE:
-		/* initialize trustee and tell it to acquire the gcwq */
-		BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
-		gcwq->trustee = new_trustee;
-		gcwq->trustee_state = TRUSTEE_START;
-		wake_up_process(gcwq->trustee);
-		wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
-		break;
-
-	case CPU_POST_DEAD:
-		gcwq->trustee_state = TRUSTEE_BUTCHER;
-		break;
-
 	case CPU_DOWN_FAILED:
 	case CPU_ONLINE:
-		if (gcwq->trustee_state != TRUSTEE_DONE) {
-			gcwq->trustee_state = TRUSTEE_RELEASE;
-			wake_up_process(gcwq->trustee);
-			wait_trustee_state(gcwq, TRUSTEE_DONE);
-		}
-
 		spin_unlock_irq(&gcwq->lock);
 		gcwq_claim_management(gcwq);
 		spin_lock_irq(&gcwq->lock);
@@ -3727,7 +3515,6 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
 {
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_DOWN_PREPARE:
-	case CPU_POST_DEAD:
 		return workqueue_cpu_callback(nfb, action, hcpu);
 	}
 	return NOTIFY_OK;
@@ -3960,9 +3747,6 @@ static int __init init_workqueues(void)
 		}
 
 		init_waitqueue_head(&gcwq->rebind_hold);
-
-		gcwq->trustee_state = TRUSTEE_DONE;
-		init_waitqueue_head(&gcwq->trustee_wait);
 	}
 
 	/* create the initial worker */