author	Linus Torvalds <torvalds@linux-foundation.org>	2012-07-24 20:46:16 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-07-24 20:46:16 -0400
commit	a08489c569dc174cff97d2cb165aa81e3f1501cc (patch)
tree	c583700a11bab82ea864425004dd5bb03bf8a987 /kernel
parent	08d9329c29ec98477e8ac2f7a513f2bfa3e9f3c5 (diff)
parent	6fec10a1a5866dda3cd6a825a521fc7c2f226ba5 (diff)
Merge branch 'for-3.6' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq
Pull workqueue changes from Tejun Heo:
 "There are three major changes.

   - WQ_HIGHPRI has been reimplemented so that high priority work items
     are served by worker threads with -20 nice value from dedicated
     highpri worker pools.

   - CPU hotplug support has been reimplemented such that idle workers
     are kept across CPU hotplug events.  This makes CPU hotplug cheaper
     (for PM) and makes the code simpler.

   - flush_kthread_work() has been reimplemented so that a work item can
     be freed while executing.  This removes an annoying behavior
     difference between kthread_worker and workqueue."

* 'for-3.6' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq:
  workqueue: fix spurious CPU locality WARN from process_one_work()
  kthread_worker: reimplement flush_kthread_work() to allow freeing the work item being executed
  kthread_worker: reorganize to prepare for flush_kthread_work() reimplementation
  workqueue: simplify CPU hotplug code
  workqueue: remove CPU offline trustee
  workqueue: don't butcher idle workers on an offline CPU
  workqueue: reimplement CPU online rebinding to handle idle workers
  workqueue: drop @bind from create_worker()
  workqueue: use mutex for global_cwq manager exclusion
  workqueue: ROGUE workers are UNBOUND workers
  workqueue: drop CPU_DYING notifier operation
  workqueue: perform cpu down operations from low priority cpu_notifier()
  workqueue: reimplement WQ_HIGHPRI using a separate worker_pool
  workqueue: introduce NR_WORKER_POOLS and for_each_worker_pool()
  workqueue: separate out worker_pool flags
  workqueue: use @pool instead of @gcwq or @cpu where applicable
  workqueue: factor out worker_pool from global_cwq
  workqueue: don't use WQ_HIGHPRI for unbound workqueues
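The WQ_HIGHPRI change is transparent to workqueue users: the flag is requested exactly as before, only the service policy behind it changes. A minimal sketch of a highpri workqueue user follows, using illustrative names (my_wq, my_work_fn) that do not appear in this merge; after this pull such work items are executed by the dedicated highpri worker_pool of each gcwq, whose workers run at nice -20, instead of being queued at the head of the shared per-gcwq worklist.

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;	/* hypothetical example workqueue */

static void my_work_fn(struct work_struct *work)
{
	/* after this merge, runs from a highpri pool worker (nice -20) */
}
static DECLARE_WORK(my_work, my_work_fn);

static int __init my_init(void)
{
	/* WQ_HIGHPRI now selects the highpri worker_pool of the gcwq */
	my_wq = alloc_workqueue("my_wq", WQ_HIGHPRI, 0);
	if (!my_wq)
		return -ENOMEM;
	queue_work(my_wq, &my_work);
	return 0;
}

static void __exit my_exit(void)
{
	destroy_workqueue(my_wq);	/* drains pending work first */
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");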
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/kthread.c	88
-rw-r--r--	kernel/workqueue.c	1144
2 files changed, 584 insertions(+), 648 deletions(-)
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3d3de633702e..b579af57ea10 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -360,16 +360,12 @@ repeat:
360 struct kthread_work, node); 360 struct kthread_work, node);
361 list_del_init(&work->node); 361 list_del_init(&work->node);
362 } 362 }
363 worker->current_work = work;
363 spin_unlock_irq(&worker->lock); 364 spin_unlock_irq(&worker->lock);
364 365
365 if (work) { 366 if (work) {
366 __set_current_state(TASK_RUNNING); 367 __set_current_state(TASK_RUNNING);
367 work->func(work); 368 work->func(work);
368 smp_wmb(); /* wmb worker-b0 paired with flush-b1 */
369 work->done_seq = work->queue_seq;
370 smp_mb(); /* mb worker-b1 paired with flush-b0 */
371 if (atomic_read(&work->flushing))
372 wake_up_all(&work->done);
373 } else if (!freezing(current)) 369 } else if (!freezing(current))
374 schedule(); 370 schedule();
375 371
@@ -378,6 +374,19 @@ repeat:
378} 374}
379EXPORT_SYMBOL_GPL(kthread_worker_fn); 375EXPORT_SYMBOL_GPL(kthread_worker_fn);
380 376
377/* insert @work before @pos in @worker */
378static void insert_kthread_work(struct kthread_worker *worker,
379 struct kthread_work *work,
380 struct list_head *pos)
381{
382 lockdep_assert_held(&worker->lock);
383
384 list_add_tail(&work->node, pos);
385 work->worker = worker;
386 if (likely(worker->task))
387 wake_up_process(worker->task);
388}
389
381/** 390/**
382 * queue_kthread_work - queue a kthread_work 391 * queue_kthread_work - queue a kthread_work
383 * @worker: target kthread_worker 392 * @worker: target kthread_worker
@@ -395,10 +404,7 @@ bool queue_kthread_work(struct kthread_worker *worker,
395 404
396 spin_lock_irqsave(&worker->lock, flags); 405 spin_lock_irqsave(&worker->lock, flags);
397 if (list_empty(&work->node)) { 406 if (list_empty(&work->node)) {
398 list_add_tail(&work->node, &worker->work_list); 407 insert_kthread_work(worker, work, &worker->work_list);
399 work->queue_seq++;
400 if (likely(worker->task))
401 wake_up_process(worker->task);
402 ret = true; 408 ret = true;
403 } 409 }
404 spin_unlock_irqrestore(&worker->lock, flags); 410 spin_unlock_irqrestore(&worker->lock, flags);
@@ -406,6 +412,18 @@ bool queue_kthread_work(struct kthread_worker *worker,
406} 412}
407EXPORT_SYMBOL_GPL(queue_kthread_work); 413EXPORT_SYMBOL_GPL(queue_kthread_work);
408 414
415struct kthread_flush_work {
416 struct kthread_work work;
417 struct completion done;
418};
419
420static void kthread_flush_work_fn(struct kthread_work *work)
421{
422 struct kthread_flush_work *fwork =
423 container_of(work, struct kthread_flush_work, work);
424 complete(&fwork->done);
425}
426
409/** 427/**
410 * flush_kthread_work - flush a kthread_work 428 * flush_kthread_work - flush a kthread_work
411 * @work: work to flush 429 * @work: work to flush
@@ -414,39 +432,37 @@ EXPORT_SYMBOL_GPL(queue_kthread_work);
414 */ 432 */
415void flush_kthread_work(struct kthread_work *work) 433void flush_kthread_work(struct kthread_work *work)
416{ 434{
417 int seq = work->queue_seq; 435 struct kthread_flush_work fwork = {
418 436 KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
419 atomic_inc(&work->flushing); 437 COMPLETION_INITIALIZER_ONSTACK(fwork.done),
438 };
439 struct kthread_worker *worker;
440 bool noop = false;
420 441
421 /* 442retry:
422 * mb flush-b0 paired with worker-b1, to make sure either 443 worker = work->worker;
423 * worker sees the above increment or we see done_seq update. 444 if (!worker)
424 */ 445 return;
425 smp_mb__after_atomic_inc();
426 446
427 /* A - B <= 0 tests whether B is in front of A regardless of overflow */ 447 spin_lock_irq(&worker->lock);
428 wait_event(work->done, seq - work->done_seq <= 0); 448 if (work->worker != worker) {
429 atomic_dec(&work->flushing); 449 spin_unlock_irq(&worker->lock);
450 goto retry;
451 }
430 452
431 /* 453 if (!list_empty(&work->node))
432 * rmb flush-b1 paired with worker-b0, to make sure our caller 454 insert_kthread_work(worker, &fwork.work, work->node.next);
433 * sees every change made by work->func(). 455 else if (worker->current_work == work)
434 */ 456 insert_kthread_work(worker, &fwork.work, worker->work_list.next);
435 smp_mb__after_atomic_dec(); 457 else
436} 458 noop = true;
437EXPORT_SYMBOL_GPL(flush_kthread_work);
438 459
439struct kthread_flush_work { 460 spin_unlock_irq(&worker->lock);
440 struct kthread_work work;
441 struct completion done;
442};
443 461
444static void kthread_flush_work_fn(struct kthread_work *work) 462 if (!noop)
445{ 463 wait_for_completion(&fwork.done);
446 struct kthread_flush_work *fwork =
447 container_of(work, struct kthread_flush_work, work);
448 complete(&fwork->done);
449} 464}
465EXPORT_SYMBOL_GPL(flush_kthread_work);
450 466
451/** 467/**
452 * flush_kthread_worker - flush all current works on a kthread_worker 468 * flush_kthread_worker - flush all current works on a kthread_worker
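To make the new flush semantics above concrete, here is a minimal usage sketch built around a hypothetical my_ctx driver context (none of these names appear in the patch; the kthread_worker calls are the ones this file implements). The old implementation tracked queue_seq/done_seq counters inside the work item itself, so the worker kept touching the item after work->func() returned; the new one queues a kthread_flush_work right behind the target and waits on its completion, which is what lets the caller free or reuse the work item as soon as flush_kthread_work() returns.

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/slab.h>

struct my_ctx {				/* hypothetical driver context */
	struct kthread_worker worker;
	struct task_struct *thread;
	struct kthread_work work;
};

static void my_work_fn(struct kthread_work *work)
{
	struct my_ctx *ctx = container_of(work, struct my_ctx, work);

	/* runs in the dedicated kthread, one item at a time */
	(void)ctx;
}

static struct my_ctx *my_ctx_create(void)
{
	struct my_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

	if (!ctx)
		return NULL;
	init_kthread_worker(&ctx->worker);
	ctx->thread = kthread_run(kthread_worker_fn, &ctx->worker, "my_ctx");
	if (IS_ERR(ctx->thread)) {
		kfree(ctx);
		return NULL;
	}
	init_kthread_work(&ctx->work, my_work_fn);
	queue_kthread_work(&ctx->worker, &ctx->work);
	return ctx;
}

static void my_ctx_destroy(struct my_ctx *ctx)
{
	/*
	 * flush_kthread_work() now waits via a kthread_flush_work queued
	 * right behind ctx->work, so ctx (and the work item embedded in
	 * it) may be freed as soon as the flush returns.
	 */
	flush_kthread_work(&ctx->work);
	kthread_stop(ctx->thread);
	kfree(ctx);
}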
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9a3128dc67df..692d97628a10 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -45,32 +45,41 @@
45#include "workqueue_sched.h" 45#include "workqueue_sched.h"
46 46
47enum { 47enum {
48 /* global_cwq flags */ 48 /*
49 GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ 49 * global_cwq flags
50 GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */ 50 *
51 GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ 51 * A bound gcwq is either associated or disassociated with its CPU.
52 GCWQ_FREEZING = 1 << 3, /* freeze in progress */ 52 * While associated (!DISASSOCIATED), all workers are bound to the
53 GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */ 53 * CPU and none has %WORKER_UNBOUND set and concurrency management
54 * is in effect.
55 *
56 * While DISASSOCIATED, the cpu may be offline and all workers have
57 * %WORKER_UNBOUND set and concurrency management disabled, and may
58 * be executing on any CPU. The gcwq behaves as an unbound one.
59 *
60 * Note that DISASSOCIATED can be flipped only while holding
61 * managership of all pools on the gcwq to avoid changing binding
62 * state while create_worker() is in progress.
63 */
64 GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */
65 GCWQ_FREEZING = 1 << 1, /* freeze in progress */
66
67 /* pool flags */
68 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
54 69
55 /* worker flags */ 70 /* worker flags */
56 WORKER_STARTED = 1 << 0, /* started */ 71 WORKER_STARTED = 1 << 0, /* started */
57 WORKER_DIE = 1 << 1, /* die die die */ 72 WORKER_DIE = 1 << 1, /* die die die */
58 WORKER_IDLE = 1 << 2, /* is idle */ 73 WORKER_IDLE = 1 << 2, /* is idle */
59 WORKER_PREP = 1 << 3, /* preparing to run works */ 74 WORKER_PREP = 1 << 3, /* preparing to run works */
60 WORKER_ROGUE = 1 << 4, /* not bound to any cpu */
61 WORKER_REBIND = 1 << 5, /* mom is home, come back */ 75 WORKER_REBIND = 1 << 5, /* mom is home, come back */
62 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ 76 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
63 WORKER_UNBOUND = 1 << 7, /* worker is unbound */ 77 WORKER_UNBOUND = 1 << 7, /* worker is unbound */
64 78
65 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND | 79 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND |
66 WORKER_CPU_INTENSIVE | WORKER_UNBOUND, 80 WORKER_CPU_INTENSIVE,
67 81
68 /* gcwq->trustee_state */ 82 NR_WORKER_POOLS = 2, /* # worker pools per gcwq */
69 TRUSTEE_START = 0, /* start */
70 TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */
71 TRUSTEE_BUTCHER = 2, /* butcher workers */
72 TRUSTEE_RELEASE = 3, /* release workers */
73 TRUSTEE_DONE = 4, /* trustee is done */
74 83
75 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ 84 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
76 BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, 85 BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER,
@@ -84,13 +93,13 @@ enum {
84 (min two ticks) */ 93 (min two ticks) */
85 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ 94 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
86 CREATE_COOLDOWN = HZ, /* time to breath after fail */ 95 CREATE_COOLDOWN = HZ, /* time to breath after fail */
87 TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */
88 96
89 /* 97 /*
90 * Rescue workers are used only on emergencies and shared by 98 * Rescue workers are used only on emergencies and shared by
91 * all cpus. Give -20. 99 * all cpus. Give -20.
92 */ 100 */
93 RESCUER_NICE_LEVEL = -20, 101 RESCUER_NICE_LEVEL = -20,
102 HIGHPRI_NICE_LEVEL = -20,
94}; 103};
95 104
96/* 105/*
@@ -115,6 +124,8 @@ enum {
115 */ 124 */
116 125
117struct global_cwq; 126struct global_cwq;
127struct worker_pool;
128struct idle_rebind;
118 129
119/* 130/*
120 * The poor guys doing the actual heavy lifting. All on-duty workers 131 * The poor guys doing the actual heavy lifting. All on-duty workers
@@ -131,12 +142,31 @@ struct worker {
131 struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ 142 struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
132 struct list_head scheduled; /* L: scheduled works */ 143 struct list_head scheduled; /* L: scheduled works */
133 struct task_struct *task; /* I: worker task */ 144 struct task_struct *task; /* I: worker task */
134 struct global_cwq *gcwq; /* I: the associated gcwq */ 145 struct worker_pool *pool; /* I: the associated pool */
135 /* 64 bytes boundary on 64bit, 32 on 32bit */ 146 /* 64 bytes boundary on 64bit, 32 on 32bit */
136 unsigned long last_active; /* L: last active timestamp */ 147 unsigned long last_active; /* L: last active timestamp */
137 unsigned int flags; /* X: flags */ 148 unsigned int flags; /* X: flags */
138 int id; /* I: worker id */ 149 int id; /* I: worker id */
139 struct work_struct rebind_work; /* L: rebind worker to cpu */ 150
151 /* for rebinding worker to CPU */
152 struct idle_rebind *idle_rebind; /* L: for idle worker */
153 struct work_struct rebind_work; /* L: for busy worker */
154};
155
156struct worker_pool {
157 struct global_cwq *gcwq; /* I: the owning gcwq */
158 unsigned int flags; /* X: flags */
159
160 struct list_head worklist; /* L: list of pending works */
161 int nr_workers; /* L: total number of workers */
162 int nr_idle; /* L: currently idle ones */
163
164 struct list_head idle_list; /* X: list of idle workers */
165 struct timer_list idle_timer; /* L: worker idle timeout */
166 struct timer_list mayday_timer; /* L: SOS timer for workers */
167
168 struct mutex manager_mutex; /* mutex manager should hold */
169 struct ida worker_ida; /* L: for worker IDs */
140}; 170};
141 171
142/* 172/*
@@ -146,27 +176,16 @@ struct worker {
146 */ 176 */
147struct global_cwq { 177struct global_cwq {
148 spinlock_t lock; /* the gcwq lock */ 178 spinlock_t lock; /* the gcwq lock */
149 struct list_head worklist; /* L: list of pending works */
150 unsigned int cpu; /* I: the associated cpu */ 179 unsigned int cpu; /* I: the associated cpu */
151 unsigned int flags; /* L: GCWQ_* flags */ 180 unsigned int flags; /* L: GCWQ_* flags */
152 181
153 int nr_workers; /* L: total number of workers */ 182 /* workers are chained either in busy_hash or pool idle_list */
154 int nr_idle; /* L: currently idle ones */
155
156 /* workers are chained either in the idle_list or busy_hash */
157 struct list_head idle_list; /* X: list of idle workers */
158 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; 183 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
159 /* L: hash of busy workers */ 184 /* L: hash of busy workers */
160 185
161 struct timer_list idle_timer; /* L: worker idle timeout */ 186 struct worker_pool pools[2]; /* normal and highpri pools */
162 struct timer_list mayday_timer; /* L: SOS timer for dworkers */
163
164 struct ida worker_ida; /* L: for worker IDs */
165 187
166 struct task_struct *trustee; /* L: for gcwq shutdown */ 188 wait_queue_head_t rebind_hold; /* rebind hold wait */
167 unsigned int trustee_state; /* L: trustee state */
168 wait_queue_head_t trustee_wait; /* trustee wait */
169 struct worker *first_idle; /* L: first idle worker */
170} ____cacheline_aligned_in_smp; 189} ____cacheline_aligned_in_smp;
171 190
172/* 191/*
@@ -175,7 +194,7 @@ struct global_cwq {
175 * aligned at two's power of the number of flag bits. 194 * aligned at two's power of the number of flag bits.
176 */ 195 */
177struct cpu_workqueue_struct { 196struct cpu_workqueue_struct {
178 struct global_cwq *gcwq; /* I: the associated gcwq */ 197 struct worker_pool *pool; /* I: the associated pool */
179 struct workqueue_struct *wq; /* I: the owning workqueue */ 198 struct workqueue_struct *wq; /* I: the owning workqueue */
180 int work_color; /* L: current color */ 199 int work_color; /* L: current color */
181 int flush_color; /* L: flushing color */ 200 int flush_color; /* L: flushing color */
@@ -264,6 +283,10 @@ EXPORT_SYMBOL_GPL(system_nrt_freezable_wq);
264#define CREATE_TRACE_POINTS 283#define CREATE_TRACE_POINTS
265#include <trace/events/workqueue.h> 284#include <trace/events/workqueue.h>
266 285
286#define for_each_worker_pool(pool, gcwq) \
287 for ((pool) = &(gcwq)->pools[0]; \
288 (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++)
289
267#define for_each_busy_worker(worker, i, pos, gcwq) \ 290#define for_each_busy_worker(worker, i, pos, gcwq) \
268 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ 291 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \
269 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) 292 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
@@ -444,7 +467,7 @@ static bool workqueue_freezing; /* W: have wqs started freezing? */
444 * try_to_wake_up(). Put it in a separate cacheline. 467 * try_to_wake_up(). Put it in a separate cacheline.
445 */ 468 */
446static DEFINE_PER_CPU(struct global_cwq, global_cwq); 469static DEFINE_PER_CPU(struct global_cwq, global_cwq);
447static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); 470static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]);
448 471
449/* 472/*
450 * Global cpu workqueue and nr_running counter for unbound gcwq. The 473 * Global cpu workqueue and nr_running counter for unbound gcwq. The
@@ -452,10 +475,17 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
452 * workers have WORKER_UNBOUND set. 475 * workers have WORKER_UNBOUND set.
453 */ 476 */
454static struct global_cwq unbound_global_cwq; 477static struct global_cwq unbound_global_cwq;
455static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */ 478static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = {
479 [0 ... NR_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */
480};
456 481
457static int worker_thread(void *__worker); 482static int worker_thread(void *__worker);
458 483
484static int worker_pool_pri(struct worker_pool *pool)
485{
486 return pool - pool->gcwq->pools;
487}
488
459static struct global_cwq *get_gcwq(unsigned int cpu) 489static struct global_cwq *get_gcwq(unsigned int cpu)
460{ 490{
461 if (cpu != WORK_CPU_UNBOUND) 491 if (cpu != WORK_CPU_UNBOUND)
@@ -464,12 +494,15 @@ static struct global_cwq *get_gcwq(unsigned int cpu)
464 return &unbound_global_cwq; 494 return &unbound_global_cwq;
465} 495}
466 496
467static atomic_t *get_gcwq_nr_running(unsigned int cpu) 497static atomic_t *get_pool_nr_running(struct worker_pool *pool)
468{ 498{
499 int cpu = pool->gcwq->cpu;
500 int idx = worker_pool_pri(pool);
501
469 if (cpu != WORK_CPU_UNBOUND) 502 if (cpu != WORK_CPU_UNBOUND)
470 return &per_cpu(gcwq_nr_running, cpu); 503 return &per_cpu(pool_nr_running, cpu)[idx];
471 else 504 else
472 return &unbound_gcwq_nr_running; 505 return &unbound_pool_nr_running[idx];
473} 506}
474 507
475static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, 508static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
@@ -555,7 +588,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
555 588
556 if (data & WORK_STRUCT_CWQ) 589 if (data & WORK_STRUCT_CWQ)
557 return ((struct cpu_workqueue_struct *) 590 return ((struct cpu_workqueue_struct *)
558 (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq; 591 (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq;
559 592
560 cpu = data >> WORK_STRUCT_FLAG_BITS; 593 cpu = data >> WORK_STRUCT_FLAG_BITS;
561 if (cpu == WORK_CPU_NONE) 594 if (cpu == WORK_CPU_NONE)
@@ -566,60 +599,62 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
566} 599}
567 600
568/* 601/*
569 * Policy functions. These define the policies on how the global 602 * Policy functions. These define the policies on how the global worker
570 * worker pool is managed. Unless noted otherwise, these functions 603 * pools are managed. Unless noted otherwise, these functions assume that
571 * assume that they're being called with gcwq->lock held. 604 * they're being called with gcwq->lock held.
572 */ 605 */
573 606
574static bool __need_more_worker(struct global_cwq *gcwq) 607static bool __need_more_worker(struct worker_pool *pool)
575{ 608{
576 return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) || 609 return !atomic_read(get_pool_nr_running(pool));
577 gcwq->flags & GCWQ_HIGHPRI_PENDING;
578} 610}
579 611
580/* 612/*
581 * Need to wake up a worker? Called from anything but currently 613 * Need to wake up a worker? Called from anything but currently
582 * running workers. 614 * running workers.
615 *
616 * Note that, because unbound workers never contribute to nr_running, this
617 * function will always return %true for unbound gcwq as long as the
618 * worklist isn't empty.
583 */ 619 */
584static bool need_more_worker(struct global_cwq *gcwq) 620static bool need_more_worker(struct worker_pool *pool)
585{ 621{
586 return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq); 622 return !list_empty(&pool->worklist) && __need_more_worker(pool);
587} 623}
588 624
589/* Can I start working? Called from busy but !running workers. */ 625/* Can I start working? Called from busy but !running workers. */
590static bool may_start_working(struct global_cwq *gcwq) 626static bool may_start_working(struct worker_pool *pool)
591{ 627{
592 return gcwq->nr_idle; 628 return pool->nr_idle;
593} 629}
594 630
595/* Do I need to keep working? Called from currently running workers. */ 631/* Do I need to keep working? Called from currently running workers. */
596static bool keep_working(struct global_cwq *gcwq) 632static bool keep_working(struct worker_pool *pool)
597{ 633{
598 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); 634 atomic_t *nr_running = get_pool_nr_running(pool);
599 635
600 return !list_empty(&gcwq->worklist) && 636 return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1;
601 (atomic_read(nr_running) <= 1 ||
602 gcwq->flags & GCWQ_HIGHPRI_PENDING);
603} 637}
604 638
605/* Do we need a new worker? Called from manager. */ 639/* Do we need a new worker? Called from manager. */
606static bool need_to_create_worker(struct global_cwq *gcwq) 640static bool need_to_create_worker(struct worker_pool *pool)
607{ 641{
608 return need_more_worker(gcwq) && !may_start_working(gcwq); 642 return need_more_worker(pool) && !may_start_working(pool);
609} 643}
610 644
611/* Do I need to be the manager? */ 645/* Do I need to be the manager? */
612static bool need_to_manage_workers(struct global_cwq *gcwq) 646static bool need_to_manage_workers(struct worker_pool *pool)
613{ 647{
614 return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS; 648 return need_to_create_worker(pool) ||
649 (pool->flags & POOL_MANAGE_WORKERS);
615} 650}
616 651
617/* Do we have too many workers and should some go away? */ 652/* Do we have too many workers and should some go away? */
618static bool too_many_workers(struct global_cwq *gcwq) 653static bool too_many_workers(struct worker_pool *pool)
619{ 654{
620 bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS; 655 bool managing = mutex_is_locked(&pool->manager_mutex);
621 int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */ 656 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
622 int nr_busy = gcwq->nr_workers - nr_idle; 657 int nr_busy = pool->nr_workers - nr_idle;
623 658
624 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; 659 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
625} 660}
@@ -629,26 +664,26 @@ static bool too_many_workers(struct global_cwq *gcwq)
629 */ 664 */
630 665
631/* Return the first worker. Safe with preemption disabled */ 666/* Return the first worker. Safe with preemption disabled */
632static struct worker *first_worker(struct global_cwq *gcwq) 667static struct worker *first_worker(struct worker_pool *pool)
633{ 668{
634 if (unlikely(list_empty(&gcwq->idle_list))) 669 if (unlikely(list_empty(&pool->idle_list)))
635 return NULL; 670 return NULL;
636 671
637 return list_first_entry(&gcwq->idle_list, struct worker, entry); 672 return list_first_entry(&pool->idle_list, struct worker, entry);
638} 673}
639 674
640/** 675/**
641 * wake_up_worker - wake up an idle worker 676 * wake_up_worker - wake up an idle worker
642 * @gcwq: gcwq to wake worker for 677 * @pool: worker pool to wake worker from
643 * 678 *
644 * Wake up the first idle worker of @gcwq. 679 * Wake up the first idle worker of @pool.
645 * 680 *
646 * CONTEXT: 681 * CONTEXT:
647 * spin_lock_irq(gcwq->lock). 682 * spin_lock_irq(gcwq->lock).
648 */ 683 */
649static void wake_up_worker(struct global_cwq *gcwq) 684static void wake_up_worker(struct worker_pool *pool)
650{ 685{
651 struct worker *worker = first_worker(gcwq); 686 struct worker *worker = first_worker(pool);
652 687
653 if (likely(worker)) 688 if (likely(worker))
654 wake_up_process(worker->task); 689 wake_up_process(worker->task);
@@ -670,7 +705,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
670 struct worker *worker = kthread_data(task); 705 struct worker *worker = kthread_data(task);
671 706
672 if (!(worker->flags & WORKER_NOT_RUNNING)) 707 if (!(worker->flags & WORKER_NOT_RUNNING))
673 atomic_inc(get_gcwq_nr_running(cpu)); 708 atomic_inc(get_pool_nr_running(worker->pool));
674} 709}
675 710
676/** 711/**
@@ -692,8 +727,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
692 unsigned int cpu) 727 unsigned int cpu)
693{ 728{
694 struct worker *worker = kthread_data(task), *to_wakeup = NULL; 729 struct worker *worker = kthread_data(task), *to_wakeup = NULL;
695 struct global_cwq *gcwq = get_gcwq(cpu); 730 struct worker_pool *pool = worker->pool;
696 atomic_t *nr_running = get_gcwq_nr_running(cpu); 731 atomic_t *nr_running = get_pool_nr_running(pool);
697 732
698 if (worker->flags & WORKER_NOT_RUNNING) 733 if (worker->flags & WORKER_NOT_RUNNING)
699 return NULL; 734 return NULL;
@@ -706,14 +741,14 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
706 * worklist not empty test sequence is in insert_work(). 741 * worklist not empty test sequence is in insert_work().
707 * Please read comment there. 742 * Please read comment there.
708 * 743 *
709 * NOT_RUNNING is clear. This means that trustee is not in 744 * NOT_RUNNING is clear. This means that we're bound to and
710 * charge and we're running on the local cpu w/ rq lock held 745 * running on the local cpu w/ rq lock held and preemption
711 * and preemption disabled, which in turn means that none else 746 * disabled, which in turn means that none else could be
712 * could be manipulating idle_list, so dereferencing idle_list 747 * manipulating idle_list, so dereferencing idle_list without gcwq
713 * without gcwq lock is safe. 748 * lock is safe.
714 */ 749 */
715 if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist)) 750 if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist))
716 to_wakeup = first_worker(gcwq); 751 to_wakeup = first_worker(pool);
717 return to_wakeup ? to_wakeup->task : NULL; 752 return to_wakeup ? to_wakeup->task : NULL;
718} 753}
719 754
@@ -733,7 +768,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
733static inline void worker_set_flags(struct worker *worker, unsigned int flags, 768static inline void worker_set_flags(struct worker *worker, unsigned int flags,
734 bool wakeup) 769 bool wakeup)
735{ 770{
736 struct global_cwq *gcwq = worker->gcwq; 771 struct worker_pool *pool = worker->pool;
737 772
738 WARN_ON_ONCE(worker->task != current); 773 WARN_ON_ONCE(worker->task != current);
739 774
@@ -744,12 +779,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
744 */ 779 */
745 if ((flags & WORKER_NOT_RUNNING) && 780 if ((flags & WORKER_NOT_RUNNING) &&
746 !(worker->flags & WORKER_NOT_RUNNING)) { 781 !(worker->flags & WORKER_NOT_RUNNING)) {
747 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); 782 atomic_t *nr_running = get_pool_nr_running(pool);
748 783
749 if (wakeup) { 784 if (wakeup) {
750 if (atomic_dec_and_test(nr_running) && 785 if (atomic_dec_and_test(nr_running) &&
751 !list_empty(&gcwq->worklist)) 786 !list_empty(&pool->worklist))
752 wake_up_worker(gcwq); 787 wake_up_worker(pool);
753 } else 788 } else
754 atomic_dec(nr_running); 789 atomic_dec(nr_running);
755 } 790 }
@@ -769,7 +804,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
769 */ 804 */
770static inline void worker_clr_flags(struct worker *worker, unsigned int flags) 805static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
771{ 806{
772 struct global_cwq *gcwq = worker->gcwq; 807 struct worker_pool *pool = worker->pool;
773 unsigned int oflags = worker->flags; 808 unsigned int oflags = worker->flags;
774 809
775 WARN_ON_ONCE(worker->task != current); 810 WARN_ON_ONCE(worker->task != current);
@@ -783,7 +818,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
783 */ 818 */
784 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) 819 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
785 if (!(worker->flags & WORKER_NOT_RUNNING)) 820 if (!(worker->flags & WORKER_NOT_RUNNING))
786 atomic_inc(get_gcwq_nr_running(gcwq->cpu)); 821 atomic_inc(get_pool_nr_running(pool));
787} 822}
788 823
789/** 824/**
@@ -867,43 +902,6 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
867} 902}
868 903
869/** 904/**
870 * gcwq_determine_ins_pos - find insertion position
871 * @gcwq: gcwq of interest
872 * @cwq: cwq a work is being queued for
873 *
874 * A work for @cwq is about to be queued on @gcwq, determine insertion
875 * position for the work. If @cwq is for HIGHPRI wq, the work is
876 * queued at the head of the queue but in FIFO order with respect to
877 * other HIGHPRI works; otherwise, at the end of the queue. This
878 * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that
879 * there are HIGHPRI works pending.
880 *
881 * CONTEXT:
882 * spin_lock_irq(gcwq->lock).
883 *
884 * RETURNS:
885 * Pointer to inserstion position.
886 */
887static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq,
888 struct cpu_workqueue_struct *cwq)
889{
890 struct work_struct *twork;
891
892 if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
893 return &gcwq->worklist;
894
895 list_for_each_entry(twork, &gcwq->worklist, entry) {
896 struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
897
898 if (!(tcwq->wq->flags & WQ_HIGHPRI))
899 break;
900 }
901
902 gcwq->flags |= GCWQ_HIGHPRI_PENDING;
903 return &twork->entry;
904}
905
906/**
907 * insert_work - insert a work into gcwq 905 * insert_work - insert a work into gcwq
908 * @cwq: cwq @work belongs to 906 * @cwq: cwq @work belongs to
909 * @work: work to insert 907 * @work: work to insert
@@ -920,7 +918,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
920 struct work_struct *work, struct list_head *head, 918 struct work_struct *work, struct list_head *head,
921 unsigned int extra_flags) 919 unsigned int extra_flags)
922{ 920{
923 struct global_cwq *gcwq = cwq->gcwq; 921 struct worker_pool *pool = cwq->pool;
924 922
925 /* we own @work, set data and link */ 923 /* we own @work, set data and link */
926 set_work_cwq(work, cwq, extra_flags); 924 set_work_cwq(work, cwq, extra_flags);
@@ -940,8 +938,8 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
940 */ 938 */
941 smp_mb(); 939 smp_mb();
942 940
943 if (__need_more_worker(gcwq)) 941 if (__need_more_worker(pool))
944 wake_up_worker(gcwq); 942 wake_up_worker(pool);
945} 943}
946 944
947/* 945/*
@@ -1043,7 +1041,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1043 if (likely(cwq->nr_active < cwq->max_active)) { 1041 if (likely(cwq->nr_active < cwq->max_active)) {
1044 trace_workqueue_activate_work(work); 1042 trace_workqueue_activate_work(work);
1045 cwq->nr_active++; 1043 cwq->nr_active++;
1046 worklist = gcwq_determine_ins_pos(gcwq, cwq); 1044 worklist = &cwq->pool->worklist;
1047 } else { 1045 } else {
1048 work_flags |= WORK_STRUCT_DELAYED; 1046 work_flags |= WORK_STRUCT_DELAYED;
1049 worklist = &cwq->delayed_works; 1047 worklist = &cwq->delayed_works;
@@ -1192,7 +1190,8 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on);
1192 */ 1190 */
1193static void worker_enter_idle(struct worker *worker) 1191static void worker_enter_idle(struct worker *worker)
1194{ 1192{
1195 struct global_cwq *gcwq = worker->gcwq; 1193 struct worker_pool *pool = worker->pool;
1194 struct global_cwq *gcwq = pool->gcwq;
1196 1195
1197 BUG_ON(worker->flags & WORKER_IDLE); 1196 BUG_ON(worker->flags & WORKER_IDLE);
1198 BUG_ON(!list_empty(&worker->entry) && 1197 BUG_ON(!list_empty(&worker->entry) &&
@@ -1200,27 +1199,24 @@ static void worker_enter_idle(struct worker *worker)
1200 1199
1201 /* can't use worker_set_flags(), also called from start_worker() */ 1200 /* can't use worker_set_flags(), also called from start_worker() */
1202 worker->flags |= WORKER_IDLE; 1201 worker->flags |= WORKER_IDLE;
1203 gcwq->nr_idle++; 1202 pool->nr_idle++;
1204 worker->last_active = jiffies; 1203 worker->last_active = jiffies;
1205 1204
1206 /* idle_list is LIFO */ 1205 /* idle_list is LIFO */
1207 list_add(&worker->entry, &gcwq->idle_list); 1206 list_add(&worker->entry, &pool->idle_list);
1208 1207
1209 if (likely(!(worker->flags & WORKER_ROGUE))) { 1208 if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
1210 if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) 1209 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
1211 mod_timer(&gcwq->idle_timer,
1212 jiffies + IDLE_WORKER_TIMEOUT);
1213 } else
1214 wake_up_all(&gcwq->trustee_wait);
1215 1210
1216 /* 1211 /*
1217 * Sanity check nr_running. Because trustee releases gcwq->lock 1212 * Sanity check nr_running. Because gcwq_unbind_fn() releases
1218 * between setting %WORKER_ROGUE and zapping nr_running, the 1213 * gcwq->lock between setting %WORKER_UNBOUND and zapping
1219 * warning may trigger spuriously. Check iff trustee is idle. 1214 * nr_running, the warning may trigger spuriously. Check iff
1215 * unbind is not in progress.
1220 */ 1216 */
1221 WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && 1217 WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) &&
1222 gcwq->nr_workers == gcwq->nr_idle && 1218 pool->nr_workers == pool->nr_idle &&
1223 atomic_read(get_gcwq_nr_running(gcwq->cpu))); 1219 atomic_read(get_pool_nr_running(pool)));
1224} 1220}
1225 1221
1226/** 1222/**
@@ -1234,11 +1230,11 @@ static void worker_enter_idle(struct worker *worker)
1234 */ 1230 */
1235static void worker_leave_idle(struct worker *worker) 1231static void worker_leave_idle(struct worker *worker)
1236{ 1232{
1237 struct global_cwq *gcwq = worker->gcwq; 1233 struct worker_pool *pool = worker->pool;
1238 1234
1239 BUG_ON(!(worker->flags & WORKER_IDLE)); 1235 BUG_ON(!(worker->flags & WORKER_IDLE));
1240 worker_clr_flags(worker, WORKER_IDLE); 1236 worker_clr_flags(worker, WORKER_IDLE);
1241 gcwq->nr_idle--; 1237 pool->nr_idle--;
1242 list_del_init(&worker->entry); 1238 list_del_init(&worker->entry);
1243} 1239}
1244 1240
@@ -1258,11 +1254,11 @@ static void worker_leave_idle(struct worker *worker)
1258 * verbatim as it's best effort and blocking and gcwq may be 1254 * verbatim as it's best effort and blocking and gcwq may be
1259 * [dis]associated in the meantime. 1255 * [dis]associated in the meantime.
1260 * 1256 *
1261 * This function tries set_cpus_allowed() and locks gcwq and verifies 1257 * This function tries set_cpus_allowed() and locks gcwq and verifies the
1262 * the binding against GCWQ_DISASSOCIATED which is set during 1258 * binding against %GCWQ_DISASSOCIATED which is set during
1263 * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters 1259 * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
1264 * idle state or fetches works without dropping lock, it can guarantee 1260 * enters idle state or fetches works without dropping lock, it can
1265 * the scheduling requirement described in the first paragraph. 1261 * guarantee the scheduling requirement described in the first paragraph.
1266 * 1262 *
1267 * CONTEXT: 1263 * CONTEXT:
1268 * Might sleep. Called without any lock but returns with gcwq->lock 1264 * Might sleep. Called without any lock but returns with gcwq->lock
@@ -1275,7 +1271,7 @@ static void worker_leave_idle(struct worker *worker)
1275static bool worker_maybe_bind_and_lock(struct worker *worker) 1271static bool worker_maybe_bind_and_lock(struct worker *worker)
1276__acquires(&gcwq->lock) 1272__acquires(&gcwq->lock)
1277{ 1273{
1278 struct global_cwq *gcwq = worker->gcwq; 1274 struct global_cwq *gcwq = worker->pool->gcwq;
1279 struct task_struct *task = worker->task; 1275 struct task_struct *task = worker->task;
1280 1276
1281 while (true) { 1277 while (true) {
@@ -1308,16 +1304,40 @@ __acquires(&gcwq->lock)
1308 } 1304 }
1309} 1305}
1310 1306
1307struct idle_rebind {
1308 int cnt; /* # workers to be rebound */
1309 struct completion done; /* all workers rebound */
1310};
1311
1312/*
1313 * Rebind an idle @worker to its CPU. During CPU onlining, this has to
1314 * happen synchronously for idle workers. worker_thread() will test
1315 * %WORKER_REBIND before leaving idle and call this function.
1316 */
1317static void idle_worker_rebind(struct worker *worker)
1318{
1319 struct global_cwq *gcwq = worker->pool->gcwq;
1320
1321 /* CPU must be online at this point */
1322 WARN_ON(!worker_maybe_bind_and_lock(worker));
1323 if (!--worker->idle_rebind->cnt)
1324 complete(&worker->idle_rebind->done);
1325 spin_unlock_irq(&worker->pool->gcwq->lock);
1326
1327 /* we did our part, wait for rebind_workers() to finish up */
1328 wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND));
1329}
1330
1311/* 1331/*
1312 * Function for worker->rebind_work used to rebind rogue busy workers 1332 * Function for @worker->rebind.work used to rebind unbound busy workers to
1313 * to the associated cpu which is coming back online. This is 1333 * the associated cpu which is coming back online. This is scheduled by
1314 * scheduled by cpu up but can race with other cpu hotplug operations 1334 * cpu up but can race with other cpu hotplug operations and may be
1315 * and may be executed twice without intervening cpu down. 1335 * executed twice without intervening cpu down.
1316 */ 1336 */
1317static void worker_rebind_fn(struct work_struct *work) 1337static void busy_worker_rebind_fn(struct work_struct *work)
1318{ 1338{
1319 struct worker *worker = container_of(work, struct worker, rebind_work); 1339 struct worker *worker = container_of(work, struct worker, rebind_work);
1320 struct global_cwq *gcwq = worker->gcwq; 1340 struct global_cwq *gcwq = worker->pool->gcwq;
1321 1341
1322 if (worker_maybe_bind_and_lock(worker)) 1342 if (worker_maybe_bind_and_lock(worker))
1323 worker_clr_flags(worker, WORKER_REBIND); 1343 worker_clr_flags(worker, WORKER_REBIND);
@@ -1325,6 +1345,112 @@ static void worker_rebind_fn(struct work_struct *work)
1325 spin_unlock_irq(&gcwq->lock); 1345 spin_unlock_irq(&gcwq->lock);
1326} 1346}
1327 1347
1348/**
1349 * rebind_workers - rebind all workers of a gcwq to the associated CPU
1350 * @gcwq: gcwq of interest
1351 *
1352 * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding
1353 * is different for idle and busy ones.
1354 *
1355 * The idle ones should be rebound synchronously and idle rebinding should
1356 * be complete before any worker starts executing work items with
1357 * concurrency management enabled; otherwise, scheduler may oops trying to
1358 * wake up non-local idle worker from wq_worker_sleeping().
1359 *
1360 * This is achieved by repeatedly requesting rebinding until all idle
1361 * workers are known to have been rebound under @gcwq->lock and holding all
1362 * idle workers from becoming busy until idle rebinding is complete.
1363 *
1364 * Once idle workers are rebound, busy workers can be rebound as they
1365 * finish executing their current work items. Queueing the rebind work at
1366 * the head of their scheduled lists is enough. Note that nr_running will
1367 * be properbly bumped as busy workers rebind.
1368 *
1369 * On return, all workers are guaranteed to either be bound or have rebind
1370 * work item scheduled.
1371 */
1372static void rebind_workers(struct global_cwq *gcwq)
1373 __releases(&gcwq->lock) __acquires(&gcwq->lock)
1374{
1375 struct idle_rebind idle_rebind;
1376 struct worker_pool *pool;
1377 struct worker *worker;
1378 struct hlist_node *pos;
1379 int i;
1380
1381 lockdep_assert_held(&gcwq->lock);
1382
1383 for_each_worker_pool(pool, gcwq)
1384 lockdep_assert_held(&pool->manager_mutex);
1385
1386 /*
1387 * Rebind idle workers. Interlocked both ways. We wait for
1388 * workers to rebind via @idle_rebind.done. Workers will wait for
1389 * us to finish up by watching %WORKER_REBIND.
1390 */
1391 init_completion(&idle_rebind.done);
1392retry:
1393 idle_rebind.cnt = 1;
1394 INIT_COMPLETION(idle_rebind.done);
1395
1396 /* set REBIND and kick idle ones, we'll wait for these later */
1397 for_each_worker_pool(pool, gcwq) {
1398 list_for_each_entry(worker, &pool->idle_list, entry) {
1399 if (worker->flags & WORKER_REBIND)
1400 continue;
1401
1402 /* morph UNBOUND to REBIND */
1403 worker->flags &= ~WORKER_UNBOUND;
1404 worker->flags |= WORKER_REBIND;
1405
1406 idle_rebind.cnt++;
1407 worker->idle_rebind = &idle_rebind;
1408
1409 /* worker_thread() will call idle_worker_rebind() */
1410 wake_up_process(worker->task);
1411 }
1412 }
1413
1414 if (--idle_rebind.cnt) {
1415 spin_unlock_irq(&gcwq->lock);
1416 wait_for_completion(&idle_rebind.done);
1417 spin_lock_irq(&gcwq->lock);
1418 /* busy ones might have become idle while waiting, retry */
1419 goto retry;
1420 }
1421
1422 /*
1423 * All idle workers are rebound and waiting for %WORKER_REBIND to
1424 * be cleared inside idle_worker_rebind(). Clear and release.
1425 * Clearing %WORKER_REBIND from this foreign context is safe
1426 * because these workers are still guaranteed to be idle.
1427 */
1428 for_each_worker_pool(pool, gcwq)
1429 list_for_each_entry(worker, &pool->idle_list, entry)
1430 worker->flags &= ~WORKER_REBIND;
1431
1432 wake_up_all(&gcwq->rebind_hold);
1433
1434 /* rebind busy workers */
1435 for_each_busy_worker(worker, i, pos, gcwq) {
1436 struct work_struct *rebind_work = &worker->rebind_work;
1437
1438 /* morph UNBOUND to REBIND */
1439 worker->flags &= ~WORKER_UNBOUND;
1440 worker->flags |= WORKER_REBIND;
1441
1442 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
1443 work_data_bits(rebind_work)))
1444 continue;
1445
1446 /* wq doesn't matter, use the default one */
1447 debug_work_activate(rebind_work);
1448 insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
1449 worker->scheduled.next,
1450 work_color_to_flags(WORK_NO_COLOR));
1451 }
1452}
1453
1328static struct worker *alloc_worker(void) 1454static struct worker *alloc_worker(void)
1329{ 1455{
1330 struct worker *worker; 1456 struct worker *worker;
@@ -1333,7 +1459,7 @@ static struct worker *alloc_worker(void)
1333 if (worker) { 1459 if (worker) {
1334 INIT_LIST_HEAD(&worker->entry); 1460 INIT_LIST_HEAD(&worker->entry);
1335 INIT_LIST_HEAD(&worker->scheduled); 1461 INIT_LIST_HEAD(&worker->scheduled);
1336 INIT_WORK(&worker->rebind_work, worker_rebind_fn); 1462 INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn);
1337 /* on creation a worker is in !idle && prep state */ 1463 /* on creation a worker is in !idle && prep state */
1338 worker->flags = WORKER_PREP; 1464 worker->flags = WORKER_PREP;
1339 } 1465 }
@@ -1342,10 +1468,9 @@ static struct worker *alloc_worker(void)
1342 1468
1343/** 1469/**
1344 * create_worker - create a new workqueue worker 1470 * create_worker - create a new workqueue worker
1345 * @gcwq: gcwq the new worker will belong to 1471 * @pool: pool the new worker will belong to
1346 * @bind: whether to set affinity to @cpu or not
1347 * 1472 *
1348 * Create a new worker which is bound to @gcwq. The returned worker 1473 * Create a new worker which is bound to @pool. The returned worker
1349 * can be started by calling start_worker() or destroyed using 1474 * can be started by calling start_worker() or destroyed using
1350 * destroy_worker(). 1475 * destroy_worker().
1351 * 1476 *
@@ -1355,16 +1480,17 @@ static struct worker *alloc_worker(void)
1355 * RETURNS: 1480 * RETURNS:
1356 * Pointer to the newly created worker. 1481 * Pointer to the newly created worker.
1357 */ 1482 */
1358static struct worker *create_worker(struct global_cwq *gcwq, bool bind) 1483static struct worker *create_worker(struct worker_pool *pool)
1359{ 1484{
1360 bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; 1485 struct global_cwq *gcwq = pool->gcwq;
1486 const char *pri = worker_pool_pri(pool) ? "H" : "";
1361 struct worker *worker = NULL; 1487 struct worker *worker = NULL;
1362 int id = -1; 1488 int id = -1;
1363 1489
1364 spin_lock_irq(&gcwq->lock); 1490 spin_lock_irq(&gcwq->lock);
1365 while (ida_get_new(&gcwq->worker_ida, &id)) { 1491 while (ida_get_new(&pool->worker_ida, &id)) {
1366 spin_unlock_irq(&gcwq->lock); 1492 spin_unlock_irq(&gcwq->lock);
1367 if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL)) 1493 if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL))
1368 goto fail; 1494 goto fail;
1369 spin_lock_irq(&gcwq->lock); 1495 spin_lock_irq(&gcwq->lock);
1370 } 1496 }
@@ -1374,38 +1500,43 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
1374 if (!worker) 1500 if (!worker)
1375 goto fail; 1501 goto fail;
1376 1502
1377 worker->gcwq = gcwq; 1503 worker->pool = pool;
1378 worker->id = id; 1504 worker->id = id;
1379 1505
1380 if (!on_unbound_cpu) 1506 if (gcwq->cpu != WORK_CPU_UNBOUND)
1381 worker->task = kthread_create_on_node(worker_thread, 1507 worker->task = kthread_create_on_node(worker_thread,
1382 worker, 1508 worker, cpu_to_node(gcwq->cpu),
1383 cpu_to_node(gcwq->cpu), 1509 "kworker/%u:%d%s", gcwq->cpu, id, pri);
1384 "kworker/%u:%d", gcwq->cpu, id);
1385 else 1510 else
1386 worker->task = kthread_create(worker_thread, worker, 1511 worker->task = kthread_create(worker_thread, worker,
1387 "kworker/u:%d", id); 1512 "kworker/u:%d%s", id, pri);
1388 if (IS_ERR(worker->task)) 1513 if (IS_ERR(worker->task))
1389 goto fail; 1514 goto fail;
1390 1515
1516 if (worker_pool_pri(pool))
1517 set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);
1518
1391 /* 1519 /*
1392 * A rogue worker will become a regular one if CPU comes 1520 * Determine CPU binding of the new worker depending on
1393 * online later on. Make sure every worker has 1521 * %GCWQ_DISASSOCIATED. The caller is responsible for ensuring the
1394 * PF_THREAD_BOUND set. 1522 * flag remains stable across this function. See the comments
1523 * above the flag definition for details.
1524 *
1525 * As an unbound worker may later become a regular one if CPU comes
1526 * online, make sure every worker has %PF_THREAD_BOUND set.
1395 */ 1527 */
1396 if (bind && !on_unbound_cpu) 1528 if (!(gcwq->flags & GCWQ_DISASSOCIATED)) {
1397 kthread_bind(worker->task, gcwq->cpu); 1529 kthread_bind(worker->task, gcwq->cpu);
1398 else { 1530 } else {
1399 worker->task->flags |= PF_THREAD_BOUND; 1531 worker->task->flags |= PF_THREAD_BOUND;
1400 if (on_unbound_cpu) 1532 worker->flags |= WORKER_UNBOUND;
1401 worker->flags |= WORKER_UNBOUND;
1402 } 1533 }
1403 1534
1404 return worker; 1535 return worker;
1405fail: 1536fail:
1406 if (id >= 0) { 1537 if (id >= 0) {
1407 spin_lock_irq(&gcwq->lock); 1538 spin_lock_irq(&gcwq->lock);
1408 ida_remove(&gcwq->worker_ida, id); 1539 ida_remove(&pool->worker_ida, id);
1409 spin_unlock_irq(&gcwq->lock); 1540 spin_unlock_irq(&gcwq->lock);
1410 } 1541 }
1411 kfree(worker); 1542 kfree(worker);
@@ -1424,7 +1555,7 @@ fail:
1424static void start_worker(struct worker *worker) 1555static void start_worker(struct worker *worker)
1425{ 1556{
1426 worker->flags |= WORKER_STARTED; 1557 worker->flags |= WORKER_STARTED;
1427 worker->gcwq->nr_workers++; 1558 worker->pool->nr_workers++;
1428 worker_enter_idle(worker); 1559 worker_enter_idle(worker);
1429 wake_up_process(worker->task); 1560 wake_up_process(worker->task);
1430} 1561}
@@ -1440,7 +1571,8 @@ static void start_worker(struct worker *worker)
1440 */ 1571 */
1441static void destroy_worker(struct worker *worker) 1572static void destroy_worker(struct worker *worker)
1442{ 1573{
1443 struct global_cwq *gcwq = worker->gcwq; 1574 struct worker_pool *pool = worker->pool;
1575 struct global_cwq *gcwq = pool->gcwq;
1444 int id = worker->id; 1576 int id = worker->id;
1445 1577
1446 /* sanity check frenzy */ 1578 /* sanity check frenzy */
@@ -1448,9 +1580,9 @@ static void destroy_worker(struct worker *worker)
1448 BUG_ON(!list_empty(&worker->scheduled)); 1580 BUG_ON(!list_empty(&worker->scheduled));
1449 1581
1450 if (worker->flags & WORKER_STARTED) 1582 if (worker->flags & WORKER_STARTED)
1451 gcwq->nr_workers--; 1583 pool->nr_workers--;
1452 if (worker->flags & WORKER_IDLE) 1584 if (worker->flags & WORKER_IDLE)
1453 gcwq->nr_idle--; 1585 pool->nr_idle--;
1454 1586
1455 list_del_init(&worker->entry); 1587 list_del_init(&worker->entry);
1456 worker->flags |= WORKER_DIE; 1588 worker->flags |= WORKER_DIE;
@@ -1461,29 +1593,30 @@ static void destroy_worker(struct worker *worker)
1461 kfree(worker); 1593 kfree(worker);
1462 1594
1463 spin_lock_irq(&gcwq->lock); 1595 spin_lock_irq(&gcwq->lock);
1464 ida_remove(&gcwq->worker_ida, id); 1596 ida_remove(&pool->worker_ida, id);
1465} 1597}
1466 1598
1467static void idle_worker_timeout(unsigned long __gcwq) 1599static void idle_worker_timeout(unsigned long __pool)
1468{ 1600{
1469 struct global_cwq *gcwq = (void *)__gcwq; 1601 struct worker_pool *pool = (void *)__pool;
1602 struct global_cwq *gcwq = pool->gcwq;
1470 1603
1471 spin_lock_irq(&gcwq->lock); 1604 spin_lock_irq(&gcwq->lock);
1472 1605
1473 if (too_many_workers(gcwq)) { 1606 if (too_many_workers(pool)) {
1474 struct worker *worker; 1607 struct worker *worker;
1475 unsigned long expires; 1608 unsigned long expires;
1476 1609
1477 /* idle_list is kept in LIFO order, check the last one */ 1610 /* idle_list is kept in LIFO order, check the last one */
1478 worker = list_entry(gcwq->idle_list.prev, struct worker, entry); 1611 worker = list_entry(pool->idle_list.prev, struct worker, entry);
1479 expires = worker->last_active + IDLE_WORKER_TIMEOUT; 1612 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1480 1613
1481 if (time_before(jiffies, expires)) 1614 if (time_before(jiffies, expires))
1482 mod_timer(&gcwq->idle_timer, expires); 1615 mod_timer(&pool->idle_timer, expires);
1483 else { 1616 else {
1484 /* it's been idle for too long, wake up manager */ 1617 /* it's been idle for too long, wake up manager */
1485 gcwq->flags |= GCWQ_MANAGE_WORKERS; 1618 pool->flags |= POOL_MANAGE_WORKERS;
1486 wake_up_worker(gcwq); 1619 wake_up_worker(pool);
1487 } 1620 }
1488 } 1621 }
1489 1622
@@ -1500,7 +1633,7 @@ static bool send_mayday(struct work_struct *work)
1500 return false; 1633 return false;
1501 1634
1502 /* mayday mayday mayday */ 1635 /* mayday mayday mayday */
1503 cpu = cwq->gcwq->cpu; 1636 cpu = cwq->pool->gcwq->cpu;
1504 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ 1637 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
1505 if (cpu == WORK_CPU_UNBOUND) 1638 if (cpu == WORK_CPU_UNBOUND)
1506 cpu = 0; 1639 cpu = 0;
@@ -1509,37 +1642,38 @@ static bool send_mayday(struct work_struct *work)
1509 return true; 1642 return true;
1510} 1643}
1511 1644
1512static void gcwq_mayday_timeout(unsigned long __gcwq) 1645static void gcwq_mayday_timeout(unsigned long __pool)
1513{ 1646{
1514 struct global_cwq *gcwq = (void *)__gcwq; 1647 struct worker_pool *pool = (void *)__pool;
1648 struct global_cwq *gcwq = pool->gcwq;
1515 struct work_struct *work; 1649 struct work_struct *work;
1516 1650
1517 spin_lock_irq(&gcwq->lock); 1651 spin_lock_irq(&gcwq->lock);
1518 1652
1519 if (need_to_create_worker(gcwq)) { 1653 if (need_to_create_worker(pool)) {
1520 /* 1654 /*
1521 * We've been trying to create a new worker but 1655 * We've been trying to create a new worker but
1522 * haven't been successful. We might be hitting an 1656 * haven't been successful. We might be hitting an
1523 * allocation deadlock. Send distress signals to 1657 * allocation deadlock. Send distress signals to
1524 * rescuers. 1658 * rescuers.
1525 */ 1659 */
1526 list_for_each_entry(work, &gcwq->worklist, entry) 1660 list_for_each_entry(work, &pool->worklist, entry)
1527 send_mayday(work); 1661 send_mayday(work);
1528 } 1662 }
1529 1663
1530 spin_unlock_irq(&gcwq->lock); 1664 spin_unlock_irq(&gcwq->lock);
1531 1665
1532 mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL); 1666 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
1533} 1667}
1534 1668
1535/** 1669/**
1536 * maybe_create_worker - create a new worker if necessary 1670 * maybe_create_worker - create a new worker if necessary
1537 * @gcwq: gcwq to create a new worker for 1671 * @pool: pool to create a new worker for
1538 * 1672 *
1539 * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to 1673 * Create a new worker for @pool if necessary. @pool is guaranteed to
1540 * have at least one idle worker on return from this function. If 1674 * have at least one idle worker on return from this function. If
1541 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is 1675 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
1542 * sent to all rescuers with works scheduled on @gcwq to resolve 1676 * sent to all rescuers with works scheduled on @pool to resolve
1543 * possible allocation deadlock. 1677 * possible allocation deadlock.
1544 * 1678 *
1545 * On return, need_to_create_worker() is guaranteed to be false and 1679 * On return, need_to_create_worker() is guaranteed to be false and
@@ -1554,52 +1688,54 @@ static void gcwq_mayday_timeout(unsigned long __gcwq)
1554 * false if no action was taken and gcwq->lock stayed locked, true 1688 * false if no action was taken and gcwq->lock stayed locked, true
1555 * otherwise. 1689 * otherwise.
1556 */ 1690 */
1557static bool maybe_create_worker(struct global_cwq *gcwq) 1691static bool maybe_create_worker(struct worker_pool *pool)
1558__releases(&gcwq->lock) 1692__releases(&gcwq->lock)
1559__acquires(&gcwq->lock) 1693__acquires(&gcwq->lock)
1560{ 1694{
1561 if (!need_to_create_worker(gcwq)) 1695 struct global_cwq *gcwq = pool->gcwq;
1696
1697 if (!need_to_create_worker(pool))
1562 return false; 1698 return false;
1563restart: 1699restart:
1564 spin_unlock_irq(&gcwq->lock); 1700 spin_unlock_irq(&gcwq->lock);
1565 1701
1566 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ 1702 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
1567 mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); 1703 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
1568 1704
1569 while (true) { 1705 while (true) {
1570 struct worker *worker; 1706 struct worker *worker;
1571 1707
1572 worker = create_worker(gcwq, true); 1708 worker = create_worker(pool);
1573 if (worker) { 1709 if (worker) {
1574 del_timer_sync(&gcwq->mayday_timer); 1710 del_timer_sync(&pool->mayday_timer);
1575 spin_lock_irq(&gcwq->lock); 1711 spin_lock_irq(&gcwq->lock);
1576 start_worker(worker); 1712 start_worker(worker);
1577 BUG_ON(need_to_create_worker(gcwq)); 1713 BUG_ON(need_to_create_worker(pool));
1578 return true; 1714 return true;
1579 } 1715 }
1580 1716
1581 if (!need_to_create_worker(gcwq)) 1717 if (!need_to_create_worker(pool))
1582 break; 1718 break;
1583 1719
1584 __set_current_state(TASK_INTERRUPTIBLE); 1720 __set_current_state(TASK_INTERRUPTIBLE);
1585 schedule_timeout(CREATE_COOLDOWN); 1721 schedule_timeout(CREATE_COOLDOWN);
1586 1722
1587 if (!need_to_create_worker(gcwq)) 1723 if (!need_to_create_worker(pool))
1588 break; 1724 break;
1589 } 1725 }
1590 1726
1591 del_timer_sync(&gcwq->mayday_timer); 1727 del_timer_sync(&pool->mayday_timer);
1592 spin_lock_irq(&gcwq->lock); 1728 spin_lock_irq(&gcwq->lock);
1593 if (need_to_create_worker(gcwq)) 1729 if (need_to_create_worker(pool))
1594 goto restart; 1730 goto restart;
1595 return true; 1731 return true;
1596} 1732}
1597 1733
1598/** 1734/**
1599 * maybe_destroy_worker - destroy workers which have been idle for a while 1735 * maybe_destroy_worker - destroy workers which have been idle for a while
1600 * @gcwq: gcwq to destroy workers for 1736 * @pool: pool to destroy workers for
1601 * 1737 *
1602 * Destroy @gcwq workers which have been idle for longer than 1738 * Destroy @pool workers which have been idle for longer than
1603 * IDLE_WORKER_TIMEOUT. 1739 * IDLE_WORKER_TIMEOUT.
1604 * 1740 *
1605 * LOCKING: 1741 * LOCKING:
@@ -1610,19 +1746,19 @@ restart:
1610 * false if no action was taken and gcwq->lock stayed locked, true 1746 * false if no action was taken and gcwq->lock stayed locked, true
1611 * otherwise. 1747 * otherwise.
1612 */ 1748 */
1613static bool maybe_destroy_workers(struct global_cwq *gcwq) 1749static bool maybe_destroy_workers(struct worker_pool *pool)
1614{ 1750{
1615 bool ret = false; 1751 bool ret = false;
1616 1752
1617 while (too_many_workers(gcwq)) { 1753 while (too_many_workers(pool)) {
1618 struct worker *worker; 1754 struct worker *worker;
1619 unsigned long expires; 1755 unsigned long expires;
1620 1756
1621 worker = list_entry(gcwq->idle_list.prev, struct worker, entry); 1757 worker = list_entry(pool->idle_list.prev, struct worker, entry);
1622 expires = worker->last_active + IDLE_WORKER_TIMEOUT; 1758 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1623 1759
1624 if (time_before(jiffies, expires)) { 1760 if (time_before(jiffies, expires)) {
1625 mod_timer(&gcwq->idle_timer, expires); 1761 mod_timer(&pool->idle_timer, expires);
1626 break; 1762 break;
1627 } 1763 }
1628 1764
@@ -1655,31 +1791,22 @@ static bool maybe_destroy_workers(struct global_cwq *gcwq)
1655 */ 1791 */
1656static bool manage_workers(struct worker *worker) 1792static bool manage_workers(struct worker *worker)
1657{ 1793{
1658 struct global_cwq *gcwq = worker->gcwq; 1794 struct worker_pool *pool = worker->pool;
1659 bool ret = false; 1795 bool ret = false;
1660 1796
1661 if (gcwq->flags & GCWQ_MANAGING_WORKERS) 1797 if (!mutex_trylock(&pool->manager_mutex))
1662 return ret; 1798 return ret;
1663 1799
1664 gcwq->flags &= ~GCWQ_MANAGE_WORKERS; 1800 pool->flags &= ~POOL_MANAGE_WORKERS;
1665 gcwq->flags |= GCWQ_MANAGING_WORKERS;
1666 1801
1667 /* 1802 /*
1668 * Destroy and then create so that may_start_working() is true 1803 * Destroy and then create so that may_start_working() is true
1669 * on return. 1804 * on return.
1670 */ 1805 */
1671 ret |= maybe_destroy_workers(gcwq); 1806 ret |= maybe_destroy_workers(pool);
1672 ret |= maybe_create_worker(gcwq); 1807 ret |= maybe_create_worker(pool);
1673
1674 gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
1675
1676 /*
1677 * The trustee might be waiting to take over the manager
1678 * position, tell it we're done.
1679 */
1680 if (unlikely(gcwq->trustee))
1681 wake_up_all(&gcwq->trustee_wait);
1682 1808
1809 mutex_unlock(&pool->manager_mutex);
1683 return ret; 1810 return ret;
1684} 1811}
1685 1812
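manage_workers() now relies on mutex_trylock() of the per-pool manager_mutex for exclusion instead of the GCWQ_MANAGING_WORKERS flag and the trustee hand-off: whoever gets the mutex manages, everyone else simply skips. A rough userspace analogue of that pattern, using pthreads; manage_pool() and the pool state are illustrative, not kernel code.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t manager_mutex = PTHREAD_MUTEX_INITIALIZER;

static bool manage_pool(int id)
{
    if (pthread_mutex_trylock(&manager_mutex) != 0)
        return false;                  /* someone else is already managing */

    printf("worker %d: managing the pool\n", id);
    /* ... maybe_destroy_workers(); maybe_create_worker(); ... */

    pthread_mutex_unlock(&manager_mutex);
    return true;
}

static void *worker(void *arg)
{
    manage_pool((int)(long)arg);
    return NULL;
}

int main(void)
{
    pthread_t t[4];

    for (long i = 0; i < 4; i++)
        pthread_create(&t[i], NULL, worker, (void *)i);
    for (int i = 0; i < 4; i++)
        pthread_join(t[i], NULL);
    return 0;
}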
@@ -1728,10 +1855,9 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1728{ 1855{
1729 struct work_struct *work = list_first_entry(&cwq->delayed_works, 1856 struct work_struct *work = list_first_entry(&cwq->delayed_works,
1730 struct work_struct, entry); 1857 struct work_struct, entry);
1731 struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
1732 1858
1733 trace_workqueue_activate_work(work); 1859 trace_workqueue_activate_work(work);
1734 move_linked_works(work, pos, NULL); 1860 move_linked_works(work, &cwq->pool->worklist, NULL);
1735 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); 1861 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
1736 cwq->nr_active++; 1862 cwq->nr_active++;
1737} 1863}
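cwq_activate_first_delayed() above now splices the first delayed item straight onto its pool's worklist; the per-gcwq insertion-position logic disappeared along with GCWQ_HIGHPRI_PENDING. A toy model of the delayed/active split it maintains, with made-up singly linked list types (the real code uses struct list_head and a per-cwq max_active):

#include <stddef.h>

struct work  { int id; struct work *next; };
struct queue {
    struct work *worklist;   /* ready to run */
    struct work *delayed;    /* parked until an active slot frees up */
    int nr_active, max_active;
};

static void append(struct work **list, struct work *w)
{
    w->next = NULL;
    while (*list)
        list = &(*list)->next;
    *list = w;
}

/* queueing path: honour max_active */
static void queue_item(struct queue *q, struct work *w)
{
    if (q->nr_active < q->max_active) {
        q->nr_active++;
        append(&q->worklist, w);
    } else {
        append(&q->delayed, w);
    }
}

/* completion path: what cwq_activate_first_delayed() models */
static void activate_first_delayed(struct queue *q)
{
    struct work *w = q->delayed;

    if (!w)
        return;
    q->delayed = w->next;
    append(&q->worklist, w);
    q->nr_active++;
}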
@@ -1804,7 +1930,8 @@ __releases(&gcwq->lock)
1804__acquires(&gcwq->lock) 1930__acquires(&gcwq->lock)
1805{ 1931{
1806 struct cpu_workqueue_struct *cwq = get_work_cwq(work); 1932 struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1807 struct global_cwq *gcwq = cwq->gcwq; 1933 struct worker_pool *pool = worker->pool;
1934 struct global_cwq *gcwq = pool->gcwq;
1808 struct hlist_head *bwh = busy_worker_head(gcwq, work); 1935 struct hlist_head *bwh = busy_worker_head(gcwq, work);
1809 bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; 1936 bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
1810 work_func_t f = work->func; 1937 work_func_t f = work->func;
@@ -1823,6 +1950,15 @@ __acquires(&gcwq->lock)
1823 lockdep_copy_map(&lockdep_map, &work->lockdep_map); 1950 lockdep_copy_map(&lockdep_map, &work->lockdep_map);
1824#endif 1951#endif
1825 /* 1952 /*
1953 * Ensure we're on the correct CPU. DISASSOCIATED test is
1954 * necessary to avoid spurious warnings from rescuers servicing the
1955 * unbound or a disassociated gcwq.
1956 */
1957 WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) &&
1958 !(gcwq->flags & GCWQ_DISASSOCIATED) &&
1959 raw_smp_processor_id() != gcwq->cpu);
1960
1961 /*
1826 * A single work shouldn't be executed concurrently by 1962 * A single work shouldn't be executed concurrently by
1827 * multiple workers on a single cpu. Check whether anyone is 1963 * multiple workers on a single cpu. Check whether anyone is
1828 * already processing the work. If so, defer the work to the 1964 * already processing the work. If so, defer the work to the
@@ -1846,27 +1982,19 @@ __acquires(&gcwq->lock)
1846 list_del_init(&work->entry); 1982 list_del_init(&work->entry);
1847 1983
1848 /* 1984 /*
1849 * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
1850 * wake up another worker; otherwise, clear HIGHPRI_PENDING.
1851 */
1852 if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) {
1853 struct work_struct *nwork = list_first_entry(&gcwq->worklist,
1854 struct work_struct, entry);
1855
1856 if (!list_empty(&gcwq->worklist) &&
1857 get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
1858 wake_up_worker(gcwq);
1859 else
1860 gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
1861 }
1862
1863 /*
1864 * CPU intensive works don't participate in concurrency 1985 * CPU intensive works don't participate in concurrency
1865 * management. They're the scheduler's responsibility. 1986 * management. They're the scheduler's responsibility.
1866 */ 1987 */
1867 if (unlikely(cpu_intensive)) 1988 if (unlikely(cpu_intensive))
1868 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); 1989 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
1869 1990
1991 /*
1992 * Unbound gcwq isn't concurrency managed and work items should be
1993 * executed ASAP. Wake up another worker if necessary.
1994 */
1995 if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
1996 wake_up_worker(pool);
1997
1870 spin_unlock_irq(&gcwq->lock); 1998 spin_unlock_irq(&gcwq->lock);
1871 1999
1872 work_clear_pending(work); 2000 work_clear_pending(work);
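The WARN_ON_ONCE() added near the top of this function only fires when CPU locality is genuinely violated: a bound worker, on a still-associated gcwq, running on the wrong CPU. Restated as a plain predicate with placeholder flag values (only the logic mirrors the check in the hunk above):

#include <assert.h>
#include <stdbool.h>

#define WORKER_UNBOUND  0x1    /* placeholder values for the example */
#define WORKER_REBIND   0x2

static bool wrong_cpu_warning(unsigned worker_flags, bool gcwq_disassociated,
                              int running_cpu, int gcwq_cpu)
{
    if (worker_flags & (WORKER_UNBOUND | WORKER_REBIND))
        return false;          /* rescuers / rebinding workers may roam */
    if (gcwq_disassociated)
        return false;          /* CPU went down, locality is not expected */
    return running_cpu != gcwq_cpu;
}

int main(void)
{
    /* rescuer running off-CPU: no warning */
    assert(!wrong_cpu_warning(WORKER_UNBOUND, false, 3, 1));
    /* bound worker on the wrong CPU of a live gcwq: warn */
    assert(wrong_cpu_warning(0, false, 3, 1));
    return 0;
}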
@@ -1939,28 +2067,38 @@ static void process_scheduled_works(struct worker *worker)
1939static int worker_thread(void *__worker) 2067static int worker_thread(void *__worker)
1940{ 2068{
1941 struct worker *worker = __worker; 2069 struct worker *worker = __worker;
1942 struct global_cwq *gcwq = worker->gcwq; 2070 struct worker_pool *pool = worker->pool;
2071 struct global_cwq *gcwq = pool->gcwq;
1943 2072
1944 /* tell the scheduler that this is a workqueue worker */ 2073 /* tell the scheduler that this is a workqueue worker */
1945 worker->task->flags |= PF_WQ_WORKER; 2074 worker->task->flags |= PF_WQ_WORKER;
1946woke_up: 2075woke_up:
1947 spin_lock_irq(&gcwq->lock); 2076 spin_lock_irq(&gcwq->lock);
1948 2077
1949 /* DIE can be set only while we're idle, checking here is enough */ 2078 /*
1950 if (worker->flags & WORKER_DIE) { 2079 * DIE can be set only while idle and REBIND set while busy has
2080 * @worker->rebind_work scheduled. Checking here is enough.
2081 */
2082 if (unlikely(worker->flags & (WORKER_REBIND | WORKER_DIE))) {
1951 spin_unlock_irq(&gcwq->lock); 2083 spin_unlock_irq(&gcwq->lock);
1952 worker->task->flags &= ~PF_WQ_WORKER; 2084
1953 return 0; 2085 if (worker->flags & WORKER_DIE) {
2086 worker->task->flags &= ~PF_WQ_WORKER;
2087 return 0;
2088 }
2089
2090 idle_worker_rebind(worker);
2091 goto woke_up;
1954 } 2092 }
1955 2093
1956 worker_leave_idle(worker); 2094 worker_leave_idle(worker);
1957recheck: 2095recheck:
1958 /* no more worker necessary? */ 2096 /* no more worker necessary? */
1959 if (!need_more_worker(gcwq)) 2097 if (!need_more_worker(pool))
1960 goto sleep; 2098 goto sleep;
1961 2099
1962 /* do we need to manage? */ 2100 /* do we need to manage? */
1963 if (unlikely(!may_start_working(gcwq)) && manage_workers(worker)) 2101 if (unlikely(!may_start_working(pool)) && manage_workers(worker))
1964 goto recheck; 2102 goto recheck;
1965 2103
1966 /* 2104 /*
@@ -1979,7 +2117,7 @@ recheck:
1979 2117
1980 do { 2118 do {
1981 struct work_struct *work = 2119 struct work_struct *work =
1982 list_first_entry(&gcwq->worklist, 2120 list_first_entry(&pool->worklist,
1983 struct work_struct, entry); 2121 struct work_struct, entry);
1984 2122
1985 if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { 2123 if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
@@ -1991,11 +2129,11 @@ recheck:
1991 move_linked_works(work, &worker->scheduled, NULL); 2129 move_linked_works(work, &worker->scheduled, NULL);
1992 process_scheduled_works(worker); 2130 process_scheduled_works(worker);
1993 } 2131 }
1994 } while (keep_working(gcwq)); 2132 } while (keep_working(pool));
1995 2133
1996 worker_set_flags(worker, WORKER_PREP, false); 2134 worker_set_flags(worker, WORKER_PREP, false);
1997sleep: 2135sleep:
1998 if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker)) 2136 if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker))
1999 goto recheck; 2137 goto recheck;
2000 2138
2001 /* 2139 /*
@@ -2053,14 +2191,15 @@ repeat:
2053 for_each_mayday_cpu(cpu, wq->mayday_mask) { 2191 for_each_mayday_cpu(cpu, wq->mayday_mask) {
2054 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; 2192 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
2055 struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); 2193 struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
2056 struct global_cwq *gcwq = cwq->gcwq; 2194 struct worker_pool *pool = cwq->pool;
2195 struct global_cwq *gcwq = pool->gcwq;
2057 struct work_struct *work, *n; 2196 struct work_struct *work, *n;
2058 2197
2059 __set_current_state(TASK_RUNNING); 2198 __set_current_state(TASK_RUNNING);
2060 mayday_clear_cpu(cpu, wq->mayday_mask); 2199 mayday_clear_cpu(cpu, wq->mayday_mask);
2061 2200
2062 /* migrate to the target cpu if possible */ 2201 /* migrate to the target cpu if possible */
2063 rescuer->gcwq = gcwq; 2202 rescuer->pool = pool;
2064 worker_maybe_bind_and_lock(rescuer); 2203 worker_maybe_bind_and_lock(rescuer);
2065 2204
2066 /* 2205 /*
@@ -2068,7 +2207,7 @@ repeat:
2068 * process'em. 2207 * process'em.
2069 */ 2208 */
2070 BUG_ON(!list_empty(&rescuer->scheduled)); 2209 BUG_ON(!list_empty(&rescuer->scheduled));
2071 list_for_each_entry_safe(work, n, &gcwq->worklist, entry) 2210 list_for_each_entry_safe(work, n, &pool->worklist, entry)
2072 if (get_work_cwq(work) == cwq) 2211 if (get_work_cwq(work) == cwq)
2073 move_linked_works(work, scheduled, &n); 2212 move_linked_works(work, scheduled, &n);
2074 2213
@@ -2079,8 +2218,8 @@ repeat:
2079 * regular worker; otherwise, we end up with 0 concurrency 2218 * regular worker; otherwise, we end up with 0 concurrency
2080 * and stalling the execution. 2219 * and stalling the execution.
2081 */ 2220 */
2082 if (keep_working(gcwq)) 2221 if (keep_working(pool))
2083 wake_up_worker(gcwq); 2222 wake_up_worker(pool);
2084 2223
2085 spin_unlock_irq(&gcwq->lock); 2224 spin_unlock_irq(&gcwq->lock);
2086 } 2225 }
@@ -2205,7 +2344,7 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
2205 2344
2206 for_each_cwq_cpu(cpu, wq) { 2345 for_each_cwq_cpu(cpu, wq) {
2207 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 2346 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2208 struct global_cwq *gcwq = cwq->gcwq; 2347 struct global_cwq *gcwq = cwq->pool->gcwq;
2209 2348
2210 spin_lock_irq(&gcwq->lock); 2349 spin_lock_irq(&gcwq->lock);
2211 2350
@@ -2421,9 +2560,9 @@ reflush:
2421 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 2560 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2422 bool drained; 2561 bool drained;
2423 2562
2424 spin_lock_irq(&cwq->gcwq->lock); 2563 spin_lock_irq(&cwq->pool->gcwq->lock);
2425 drained = !cwq->nr_active && list_empty(&cwq->delayed_works); 2564 drained = !cwq->nr_active && list_empty(&cwq->delayed_works);
2426 spin_unlock_irq(&cwq->gcwq->lock); 2565 spin_unlock_irq(&cwq->pool->gcwq->lock);
2427 2566
2428 if (drained) 2567 if (drained)
2429 continue; 2568 continue;
@@ -2463,7 +2602,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2463 */ 2602 */
2464 smp_rmb(); 2603 smp_rmb();
2465 cwq = get_work_cwq(work); 2604 cwq = get_work_cwq(work);
2466 if (unlikely(!cwq || gcwq != cwq->gcwq)) 2605 if (unlikely(!cwq || gcwq != cwq->pool->gcwq))
2467 goto already_gone; 2606 goto already_gone;
2468 } else if (wait_executing) { 2607 } else if (wait_executing) {
2469 worker = find_worker_executing_work(gcwq, work); 2608 worker = find_worker_executing_work(gcwq, work);
@@ -2984,13 +3123,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
2984 if (flags & WQ_MEM_RECLAIM) 3123 if (flags & WQ_MEM_RECLAIM)
2985 flags |= WQ_RESCUER; 3124 flags |= WQ_RESCUER;
2986 3125
2987 /*
2988 * Unbound workqueues aren't concurrency managed and should be
2989 * dispatched to workers immediately.
2990 */
2991 if (flags & WQ_UNBOUND)
2992 flags |= WQ_HIGHPRI;
2993
2994 max_active = max_active ?: WQ_DFL_ACTIVE; 3126 max_active = max_active ?: WQ_DFL_ACTIVE;
2995 max_active = wq_clamp_max_active(max_active, flags, wq->name); 3127 max_active = wq_clamp_max_active(max_active, flags, wq->name);
2996 3128
@@ -3011,9 +3143,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3011 for_each_cwq_cpu(cpu, wq) { 3143 for_each_cwq_cpu(cpu, wq) {
3012 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3144 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3013 struct global_cwq *gcwq = get_gcwq(cpu); 3145 struct global_cwq *gcwq = get_gcwq(cpu);
3146 int pool_idx = (bool)(flags & WQ_HIGHPRI);
3014 3147
3015 BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); 3148 BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
3016 cwq->gcwq = gcwq; 3149 cwq->pool = &gcwq->pools[pool_idx];
3017 cwq->wq = wq; 3150 cwq->wq = wq;
3018 cwq->flush_color = -1; 3151 cwq->flush_color = -1;
3019 cwq->max_active = max_active; 3152 cwq->max_active = max_active;
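With WQ_HIGHPRI reimplemented, each gcwq carries an array of worker pools and the flag only selects which pool a cwq points at, as the pool_idx computation above shows. A compilable sketch of that selection with simplified stand-in types and a placeholder WQ_HIGHPRI value:

#include <assert.h>

#define WQ_HIGHPRI        (1 << 4)   /* placeholder value for the example */
#define NR_WORKER_POOLS   2

struct worker_pool { int nice; };
struct global_cwq  { struct worker_pool pools[NR_WORKER_POOLS]; };

static struct worker_pool *pool_for_flags(struct global_cwq *gcwq,
                                          unsigned int wq_flags)
{
    int pool_idx = (wq_flags & WQ_HIGHPRI) ? 1 : 0;

    return &gcwq->pools[pool_idx];
}

int main(void)
{
    struct global_cwq gcwq = { .pools = { { .nice = 0 }, { .nice = -20 } } };

    assert(pool_for_flags(&gcwq, 0)          == &gcwq.pools[0]);
    assert(pool_for_flags(&gcwq, WQ_HIGHPRI) == &gcwq.pools[1]);
    return 0;
}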
@@ -3225,369 +3358,143 @@ EXPORT_SYMBOL_GPL(work_busy);
3225 * gcwqs serve mix of short, long and very long running works making 3358 * gcwqs serve mix of short, long and very long running works making
3226 * blocked draining impractical. 3359 * blocked draining impractical.
3227 * 3360 *
3228 * This is solved by allowing a gcwq to be detached from CPU, running 3361 * This is solved by allowing a gcwq to be disassociated from the CPU
3229 * it with unbound (rogue) workers and allowing it to be reattached 3362 * running as an unbound one and allowing it to be reattached later if the
3230 * later if the cpu comes back online. A separate thread is created 3363 * cpu comes back online.
3231 * to govern a gcwq in such state and is called the trustee of the
3232 * gcwq.
3233 *
3234 * Trustee states and their descriptions.
3235 *
3236 * START Command state used on startup. On CPU_DOWN_PREPARE, a
3237 * new trustee is started with this state.
3238 *
3239 * IN_CHARGE Once started, trustee will enter this state after
3240 * assuming the manager role and making all existing
3241 * workers rogue. DOWN_PREPARE waits for trustee to
3242 * enter this state. After reaching IN_CHARGE, trustee
3243 * tries to execute the pending worklist until it's empty
3244 * and the state is set to BUTCHER, or the state is set
3245 * to RELEASE.
3246 *
3247 * BUTCHER Command state which is set by the cpu callback after
3248 * the cpu has gone down. Once this state is set trustee
3249 * knows that there will be no new works on the worklist
3250 * and once the worklist is empty it can proceed to
3251 * killing idle workers.
3252 *
3253 * RELEASE Command state which is set by the cpu callback if the
3254 * cpu down has been canceled or it has come online
3255 * again. After recognizing this state, trustee stops
3256 * trying to drain or butcher and clears ROGUE, rebinds
3257 * all remaining workers back to the cpu and releases
3258 * manager role.
3259 *
3260 * DONE Trustee will enter this state after BUTCHER or RELEASE
3261 * is complete.
3262 *
3263 * trustee CPU draining
3264 * took over down complete
3265 * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
3266 * | | ^
3267 * | CPU is back online v return workers |
3268 * ----------------> RELEASE --------------
3269 */ 3364 */
3270 3365
3271/** 3366/* claim manager positions of all pools */
3272 * trustee_wait_event_timeout - timed event wait for trustee 3367static void gcwq_claim_management_and_lock(struct global_cwq *gcwq)
3273 * @cond: condition to wait for
3274 * @timeout: timeout in jiffies
3275 *
3276 * wait_event_timeout() for trustee to use. Handles locking and
3277 * checks for RELEASE request.
3278 *
3279 * CONTEXT:
3280 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3281 * multiple times. To be used by trustee.
3282 *
3283 * RETURNS:
3284 * Positive indicating left time if @cond is satisfied, 0 if timed
3285 * out, -1 if canceled.
3286 */
3287#define trustee_wait_event_timeout(cond, timeout) ({ \
3288 long __ret = (timeout); \
3289 while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \
3290 __ret) { \
3291 spin_unlock_irq(&gcwq->lock); \
3292 __wait_event_timeout(gcwq->trustee_wait, (cond) || \
3293 (gcwq->trustee_state == TRUSTEE_RELEASE), \
3294 __ret); \
3295 spin_lock_irq(&gcwq->lock); \
3296 } \
3297 gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \
3298})
3299
3300/**
3301 * trustee_wait_event - event wait for trustee
3302 * @cond: condition to wait for
3303 *
3304 * wait_event() for trustee to use. Automatically handles locking and
3305 * checks for CANCEL request.
3306 *
3307 * CONTEXT:
3308 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3309 * multiple times. To be used by trustee.
3310 *
3311 * RETURNS:
3312 * 0 if @cond is satisfied, -1 if canceled.
3313 */
3314#define trustee_wait_event(cond) ({ \
3315 long __ret1; \
3316 __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
3317 __ret1 < 0 ? -1 : 0; \
3318})
3319
3320static int __cpuinit trustee_thread(void *__gcwq)
3321{ 3368{
3322 struct global_cwq *gcwq = __gcwq; 3369 struct worker_pool *pool;
3323 struct worker *worker;
3324 struct work_struct *work;
3325 struct hlist_node *pos;
3326 long rc;
3327 int i;
3328
3329 BUG_ON(gcwq->cpu != smp_processor_id());
3330 3370
3371 for_each_worker_pool(pool, gcwq)
3372 mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools);
3331 spin_lock_irq(&gcwq->lock); 3373 spin_lock_irq(&gcwq->lock);
3332 /* 3374}
3333 * Claim the manager position and make all workers rogue.
3334 * Trustee must be bound to the target cpu and can't be
3335 * cancelled.
3336 */
3337 BUG_ON(gcwq->cpu != smp_processor_id());
3338 rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS));
3339 BUG_ON(rc < 0);
3340
3341 gcwq->flags |= GCWQ_MANAGING_WORKERS;
3342
3343 list_for_each_entry(worker, &gcwq->idle_list, entry)
3344 worker->flags |= WORKER_ROGUE;
3345 3375
3346 for_each_busy_worker(worker, i, pos, gcwq) 3376/* release manager positions */
3347 worker->flags |= WORKER_ROGUE; 3377static void gcwq_release_management_and_unlock(struct global_cwq *gcwq)
3378{
3379 struct worker_pool *pool;
3348 3380
3349 /*
3350 * Call schedule() so that we cross rq->lock and thus can
3351 * guarantee sched callbacks see the rogue flag. This is
3352 * necessary as scheduler callbacks may be invoked from other
3353 * cpus.
3354 */
3355 spin_unlock_irq(&gcwq->lock); 3381 spin_unlock_irq(&gcwq->lock);
3356 schedule(); 3382 for_each_worker_pool(pool, gcwq)
3357 spin_lock_irq(&gcwq->lock); 3383 mutex_unlock(&pool->manager_mutex);
3384}
3358 3385
3359 /* 3386static void gcwq_unbind_fn(struct work_struct *work)
3360 * Sched callbacks are disabled now. Zap nr_running. After 3387{
3361 * this, nr_running stays zero and need_more_worker() and 3388 struct global_cwq *gcwq = get_gcwq(smp_processor_id());
3362 * keep_working() are always true as long as the worklist is 3389 struct worker_pool *pool;
3363 * not empty. 3390 struct worker *worker;
3364 */ 3391 struct hlist_node *pos;
3365 atomic_set(get_gcwq_nr_running(gcwq->cpu), 0); 3392 int i;
3366 3393
3367 spin_unlock_irq(&gcwq->lock); 3394 BUG_ON(gcwq->cpu != smp_processor_id());
3368 del_timer_sync(&gcwq->idle_timer);
3369 spin_lock_irq(&gcwq->lock);
3370 3395
3371 /* 3396 gcwq_claim_management_and_lock(gcwq);
3372 * We're now in charge. Notify and proceed to drain. We need
3373 * to keep the gcwq running during the whole CPU down
3374 * procedure as other cpu hotunplug callbacks may need to
3375 * flush currently running tasks.
3376 */
3377 gcwq->trustee_state = TRUSTEE_IN_CHARGE;
3378 wake_up_all(&gcwq->trustee_wait);
3379 3397
3380 /* 3398 /*
3381 * The original cpu is in the process of dying and may go away 3399 * We've claimed all manager positions. Make all workers unbound
3382 * anytime now. When that happens, we and all workers would 3400 * and set DISASSOCIATED. Before this, all workers except for the
3383 * be migrated to other cpus. Try draining any left work. We 3401 * ones which are still executing works from before the last CPU
3384 * want to get it over with ASAP - spam rescuers, wake up as 3402 * down must be on the cpu. After this, they may become diasporas.
3385 * many idlers as necessary and create new ones till the
3386 * worklist is empty. Note that if the gcwq is frozen, there
3387 * may be frozen works in freezable cwqs. Don't declare
3388 * completion while frozen.
3389 */ 3403 */
3390 while (gcwq->nr_workers != gcwq->nr_idle || 3404 for_each_worker_pool(pool, gcwq)
3391 gcwq->flags & GCWQ_FREEZING || 3405 list_for_each_entry(worker, &pool->idle_list, entry)
3392 gcwq->trustee_state == TRUSTEE_IN_CHARGE) { 3406 worker->flags |= WORKER_UNBOUND;
3393 int nr_works = 0;
3394
3395 list_for_each_entry(work, &gcwq->worklist, entry) {
3396 send_mayday(work);
3397 nr_works++;
3398 }
3399 3407
3400 list_for_each_entry(worker, &gcwq->idle_list, entry) { 3408 for_each_busy_worker(worker, i, pos, gcwq)
3401 if (!nr_works--) 3409 worker->flags |= WORKER_UNBOUND;
3402 break;
3403 wake_up_process(worker->task);
3404 }
3405 3410
3406 if (need_to_create_worker(gcwq)) { 3411 gcwq->flags |= GCWQ_DISASSOCIATED;
3407 spin_unlock_irq(&gcwq->lock);
3408 worker = create_worker(gcwq, false);
3409 spin_lock_irq(&gcwq->lock);
3410 if (worker) {
3411 worker->flags |= WORKER_ROGUE;
3412 start_worker(worker);
3413 }
3414 }
3415 3412
3416 /* give a breather */ 3413 gcwq_release_management_and_unlock(gcwq);
3417 if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
3418 break;
3419 }
3420 3414
3421 /* 3415 /*
3422 * Either all works have been scheduled and cpu is down, or 3416 * Call schedule() so that we cross rq->lock and thus can guarantee
3423 * cpu down has already been canceled. Wait for and butcher 3417 * sched callbacks see the %WORKER_UNBOUND flag. This is necessary
3424 * all workers till we're canceled. 3418 * as scheduler callbacks may be invoked from other cpus.
3425 */ 3419 */
3426 do { 3420 schedule();
3427 rc = trustee_wait_event(!list_empty(&gcwq->idle_list));
3428 while (!list_empty(&gcwq->idle_list))
3429 destroy_worker(list_first_entry(&gcwq->idle_list,
3430 struct worker, entry));
3431 } while (gcwq->nr_workers && rc >= 0);
3432 3421
3433 /* 3422 /*
3434 * At this point, either draining has completed and no worker 3423 * Sched callbacks are disabled now. Zap nr_running. After this,
3435 * is left, or cpu down has been canceled or the cpu is being 3424 * nr_running stays zero and need_more_worker() and keep_working()
3436 * brought back up. There shouldn't be any idle one left. 3425 * are always true as long as the worklist is not empty. @gcwq now
3437 * Tell the remaining busy ones to rebind once it finishes the 3426 * behaves as unbound (in terms of concurrency management) gcwq
3438 * currently scheduled works by scheduling the rebind_work. 3427 * which is served by workers tied to the CPU.
3428 *
3429 * On return from this function, the current worker would trigger
3430 * unbound chain execution of pending work items if other workers
3431 * didn't already.
3439 */ 3432 */
3440 WARN_ON(!list_empty(&gcwq->idle_list)); 3433 for_each_worker_pool(pool, gcwq)
3441 3434 atomic_set(get_pool_nr_running(pool), 0);
3442 for_each_busy_worker(worker, i, pos, gcwq) {
3443 struct work_struct *rebind_work = &worker->rebind_work;
3444
3445 /*
3446 * Rebind_work may race with future cpu hotplug
3447 * operations. Use a separate flag to mark that
3448 * rebinding is scheduled.
3449 */
3450 worker->flags |= WORKER_REBIND;
3451 worker->flags &= ~WORKER_ROGUE;
3452
3453 /* queue rebind_work, wq doesn't matter, use the default one */
3454 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
3455 work_data_bits(rebind_work)))
3456 continue;
3457
3458 debug_work_activate(rebind_work);
3459 insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
3460 worker->scheduled.next,
3461 work_color_to_flags(WORK_NO_COLOR));
3462 }
3463
3464 /* relinquish manager role */
3465 gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
3466
3467 /* notify completion */
3468 gcwq->trustee = NULL;
3469 gcwq->trustee_state = TRUSTEE_DONE;
3470 wake_up_all(&gcwq->trustee_wait);
3471 spin_unlock_irq(&gcwq->lock);
3472 return 0;
3473} 3435}
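gcwq_unbind_fn() replaces the trustee: claim every pool's manager mutex in a fixed order, mark all workers WORKER_UNBOUND and the gcwq DISASSOCIATED, then release and let schedule() plus the nr_running reset finish the job. A userspace sketch of the claim-all-then-flip-flags pattern, with simplified stand-in types rather than the kernel structures:

#include <pthread.h>
#include <stdbool.h>

#define NR_POOLS 2

struct pool {
    pthread_mutex_t manager_mutex;
    bool workers_unbound;
};

static struct pool pools[NR_POOLS] = {
    { PTHREAD_MUTEX_INITIALIZER, false },
    { PTHREAD_MUTEX_INITIALIZER, false },
};
static bool disassociated;

static void unbind_all(void)
{
    /* fixed locking order (index order) keeps concurrent callers deadlock free */
    for (int i = 0; i < NR_POOLS; i++)
        pthread_mutex_lock(&pools[i].manager_mutex);

    for (int i = 0; i < NR_POOLS; i++)
        pools[i].workers_unbound = true;   /* WORKER_UNBOUND analogue */
    disassociated = true;                  /* GCWQ_DISASSOCIATED analogue */

    for (int i = 0; i < NR_POOLS; i++)
        pthread_mutex_unlock(&pools[i].manager_mutex);
}

int main(void)
{
    unbind_all();
    return disassociated ? 0 : 1;
}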
3474 3436
3475/** 3437/*
3476 * wait_trustee_state - wait for trustee to enter the specified state 3438 * Workqueues should be brought up before normal priority CPU notifiers.
3477 * @gcwq: gcwq the trustee of interest belongs to 3439 * This will be registered high priority CPU notifier.
3478 * @state: target state to wait for
3479 *
3480 * Wait for the trustee to reach @state. DONE is already matched.
3481 *
3482 * CONTEXT:
3483 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3484 * multiple times. To be used by cpu_callback.
3485 */ 3440 */
3486static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) 3441static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
3487__releases(&gcwq->lock) 3442 unsigned long action,
3488__acquires(&gcwq->lock) 3443 void *hcpu)
3489{
3490 if (!(gcwq->trustee_state == state ||
3491 gcwq->trustee_state == TRUSTEE_DONE)) {
3492 spin_unlock_irq(&gcwq->lock);
3493 __wait_event(gcwq->trustee_wait,
3494 gcwq->trustee_state == state ||
3495 gcwq->trustee_state == TRUSTEE_DONE);
3496 spin_lock_irq(&gcwq->lock);
3497 }
3498}
3499
3500static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
3501 unsigned long action,
3502 void *hcpu)
3503{ 3444{
3504 unsigned int cpu = (unsigned long)hcpu; 3445 unsigned int cpu = (unsigned long)hcpu;
3505 struct global_cwq *gcwq = get_gcwq(cpu); 3446 struct global_cwq *gcwq = get_gcwq(cpu);
3506 struct task_struct *new_trustee = NULL; 3447 struct worker_pool *pool;
3507 struct worker *uninitialized_var(new_worker);
3508 unsigned long flags;
3509
3510 action &= ~CPU_TASKS_FROZEN;
3511 3448
3512 switch (action) { 3449 switch (action & ~CPU_TASKS_FROZEN) {
3513 case CPU_DOWN_PREPARE:
3514 new_trustee = kthread_create(trustee_thread, gcwq,
3515 "workqueue_trustee/%d\n", cpu);
3516 if (IS_ERR(new_trustee))
3517 return notifier_from_errno(PTR_ERR(new_trustee));
3518 kthread_bind(new_trustee, cpu);
3519 /* fall through */
3520 case CPU_UP_PREPARE: 3450 case CPU_UP_PREPARE:
3521 BUG_ON(gcwq->first_idle); 3451 for_each_worker_pool(pool, gcwq) {
3522 new_worker = create_worker(gcwq, false); 3452 struct worker *worker;
3523 if (!new_worker) {
3524 if (new_trustee)
3525 kthread_stop(new_trustee);
3526 return NOTIFY_BAD;
3527 }
3528 }
3529
3530 /* some are called w/ irq disabled, don't disturb irq status */
3531 spin_lock_irqsave(&gcwq->lock, flags);
3532 3453
3533 switch (action) { 3454 if (pool->nr_workers)
3534 case CPU_DOWN_PREPARE: 3455 continue;
3535 /* initialize trustee and tell it to acquire the gcwq */
3536 BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
3537 gcwq->trustee = new_trustee;
3538 gcwq->trustee_state = TRUSTEE_START;
3539 wake_up_process(gcwq->trustee);
3540 wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
3541 /* fall through */
3542 case CPU_UP_PREPARE:
3543 BUG_ON(gcwq->first_idle);
3544 gcwq->first_idle = new_worker;
3545 break;
3546 3456
3547 case CPU_DYING: 3457 worker = create_worker(pool);
3548 /* 3458 if (!worker)
3549 * Before this, the trustee and all workers except for 3459 return NOTIFY_BAD;
3550 * the ones which are still executing works from
3551 * before the last CPU down must be on the cpu. After
3552 * this, they'll all be diasporas.
3553 */
3554 gcwq->flags |= GCWQ_DISASSOCIATED;
3555 break;
3556 3460
3557 case CPU_POST_DEAD: 3461 spin_lock_irq(&gcwq->lock);
3558 gcwq->trustee_state = TRUSTEE_BUTCHER; 3462 start_worker(worker);
3559 /* fall through */ 3463 spin_unlock_irq(&gcwq->lock);
3560 case CPU_UP_CANCELED: 3464 }
3561 destroy_worker(gcwq->first_idle);
3562 gcwq->first_idle = NULL;
3563 break; 3465 break;
3564 3466
3565 case CPU_DOWN_FAILED: 3467 case CPU_DOWN_FAILED:
3566 case CPU_ONLINE: 3468 case CPU_ONLINE:
3469 gcwq_claim_management_and_lock(gcwq);
3567 gcwq->flags &= ~GCWQ_DISASSOCIATED; 3470 gcwq->flags &= ~GCWQ_DISASSOCIATED;
3568 if (gcwq->trustee_state != TRUSTEE_DONE) { 3471 rebind_workers(gcwq);
3569 gcwq->trustee_state = TRUSTEE_RELEASE; 3472 gcwq_release_management_and_unlock(gcwq);
3570 wake_up_process(gcwq->trustee);
3571 wait_trustee_state(gcwq, TRUSTEE_DONE);
3572 }
3573
3574 /*
3575 * Trustee is done and there might be no worker left.
3576 * Put the first_idle in and request a real manager to
3577 * take a look.
3578 */
3579 spin_unlock_irq(&gcwq->lock);
3580 kthread_bind(gcwq->first_idle->task, cpu);
3581 spin_lock_irq(&gcwq->lock);
3582 gcwq->flags |= GCWQ_MANAGE_WORKERS;
3583 start_worker(gcwq->first_idle);
3584 gcwq->first_idle = NULL;
3585 break; 3473 break;
3586 } 3474 }
3475 return NOTIFY_OK;
3476}
3587 3477
3588 spin_unlock_irqrestore(&gcwq->lock, flags); 3478/*
3479 * Workqueues should be brought down after normal priority CPU notifiers.
3480 * This will be registered as low priority CPU notifier.
3481 */
3482static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
3483 unsigned long action,
3484 void *hcpu)
3485{
3486 unsigned int cpu = (unsigned long)hcpu;
3487 struct work_struct unbind_work;
3589 3488
3590 return notifier_from_errno(0); 3489 switch (action & ~CPU_TASKS_FROZEN) {
3490 case CPU_DOWN_PREPARE:
3491 /* unbinding should happen on the local CPU */
3492 INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn);
3493 schedule_work_on(cpu, &unbind_work);
3494 flush_work(&unbind_work);
3495 break;
3496 }
3497 return NOTIFY_OK;
3591} 3498}
3592 3499
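The down callback queues gcwq_unbind_fn() on the outgoing CPU with schedule_work_on() and flushes it, so the unbinding runs CPU-locally even though the notifier itself may not. A rough userspace analogue of run-on-this-CPU-and-wait, using pthread affinity (Linux-specific and illustrative only, not the kernel mechanism):

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static void *unbind_fn(void *arg)
{
    (void)arg;
    printf("unbind running on cpu %d\n", sched_getcpu());
    return NULL;
}

/* "schedule_work_on() + flush_work()" analogue: run fn pinned to @cpu, then wait */
static int run_on_cpu(int cpu, void *(*fn)(void *))
{
    pthread_t t;
    pthread_attr_t attr;
    cpu_set_t set;

    CPU_ZERO(&set);
    CPU_SET(cpu, &set);

    pthread_attr_init(&attr);
    pthread_attr_setaffinity_np(&attr, sizeof(set), &set);

    if (pthread_create(&t, &attr, fn, NULL)) {
        pthread_attr_destroy(&attr);
        return -1;
    }
    pthread_attr_destroy(&attr);
    return pthread_join(t, NULL);     /* the flush_work() part: wait for completion */
}

int main(void)
{
    return run_on_cpu(0, unbind_fn);
}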
3593#ifdef CONFIG_SMP 3500#ifdef CONFIG_SMP
@@ -3746,6 +3653,7 @@ void thaw_workqueues(void)
3746 3653
3747 for_each_gcwq_cpu(cpu) { 3654 for_each_gcwq_cpu(cpu) {
3748 struct global_cwq *gcwq = get_gcwq(cpu); 3655 struct global_cwq *gcwq = get_gcwq(cpu);
3656 struct worker_pool *pool;
3749 struct workqueue_struct *wq; 3657 struct workqueue_struct *wq;
3750 3658
3751 spin_lock_irq(&gcwq->lock); 3659 spin_lock_irq(&gcwq->lock);
@@ -3767,7 +3675,8 @@ void thaw_workqueues(void)
3767 cwq_activate_first_delayed(cwq); 3675 cwq_activate_first_delayed(cwq);
3768 } 3676 }
3769 3677
3770 wake_up_worker(gcwq); 3678 for_each_worker_pool(pool, gcwq)
3679 wake_up_worker(pool);
3771 3680
3772 spin_unlock_irq(&gcwq->lock); 3681 spin_unlock_irq(&gcwq->lock);
3773 } 3682 }
@@ -3783,46 +3692,57 @@ static int __init init_workqueues(void)
3783 unsigned int cpu; 3692 unsigned int cpu;
3784 int i; 3693 int i;
3785 3694
3786 cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); 3695 cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
3696 cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
3787 3697
3788 /* initialize gcwqs */ 3698 /* initialize gcwqs */
3789 for_each_gcwq_cpu(cpu) { 3699 for_each_gcwq_cpu(cpu) {
3790 struct global_cwq *gcwq = get_gcwq(cpu); 3700 struct global_cwq *gcwq = get_gcwq(cpu);
3701 struct worker_pool *pool;
3791 3702
3792 spin_lock_init(&gcwq->lock); 3703 spin_lock_init(&gcwq->lock);
3793 INIT_LIST_HEAD(&gcwq->worklist);
3794 gcwq->cpu = cpu; 3704 gcwq->cpu = cpu;
3795 gcwq->flags |= GCWQ_DISASSOCIATED; 3705 gcwq->flags |= GCWQ_DISASSOCIATED;
3796 3706
3797 INIT_LIST_HEAD(&gcwq->idle_list);
3798 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) 3707 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
3799 INIT_HLIST_HEAD(&gcwq->busy_hash[i]); 3708 INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
3800 3709
3801 init_timer_deferrable(&gcwq->idle_timer); 3710 for_each_worker_pool(pool, gcwq) {
3802 gcwq->idle_timer.function = idle_worker_timeout; 3711 pool->gcwq = gcwq;
3803 gcwq->idle_timer.data = (unsigned long)gcwq; 3712 INIT_LIST_HEAD(&pool->worklist);
3713 INIT_LIST_HEAD(&pool->idle_list);
3714
3715 init_timer_deferrable(&pool->idle_timer);
3716 pool->idle_timer.function = idle_worker_timeout;
3717 pool->idle_timer.data = (unsigned long)pool;
3804 3718
3805 setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout, 3719 setup_timer(&pool->mayday_timer, gcwq_mayday_timeout,
3806 (unsigned long)gcwq); 3720 (unsigned long)pool);
3807 3721
3808 ida_init(&gcwq->worker_ida); 3722 mutex_init(&pool->manager_mutex);
3723 ida_init(&pool->worker_ida);
3724 }
3809 3725
3810 gcwq->trustee_state = TRUSTEE_DONE; 3726 init_waitqueue_head(&gcwq->rebind_hold);
3811 init_waitqueue_head(&gcwq->trustee_wait);
3812 } 3727 }
3813 3728
3814 /* create the initial worker */ 3729 /* create the initial worker */
3815 for_each_online_gcwq_cpu(cpu) { 3730 for_each_online_gcwq_cpu(cpu) {
3816 struct global_cwq *gcwq = get_gcwq(cpu); 3731 struct global_cwq *gcwq = get_gcwq(cpu);
3817 struct worker *worker; 3732 struct worker_pool *pool;
3818 3733
3819 if (cpu != WORK_CPU_UNBOUND) 3734 if (cpu != WORK_CPU_UNBOUND)
3820 gcwq->flags &= ~GCWQ_DISASSOCIATED; 3735 gcwq->flags &= ~GCWQ_DISASSOCIATED;
3821 worker = create_worker(gcwq, true); 3736
3822 BUG_ON(!worker); 3737 for_each_worker_pool(pool, gcwq) {
3823 spin_lock_irq(&gcwq->lock); 3738 struct worker *worker;
3824 start_worker(worker); 3739
3825 spin_unlock_irq(&gcwq->lock); 3740 worker = create_worker(pool);
3741 BUG_ON(!worker);
3742 spin_lock_irq(&gcwq->lock);
3743 start_worker(worker);
3744 spin_unlock_irq(&gcwq->lock);
3745 }
3826 } 3746 }
3827 3747
3828 system_wq = alloc_workqueue("events", 0, 0); 3748 system_wq = alloc_workqueue("events", 0, 0);
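init_workqueues() now walks each gcwq's embedded pools with for_each_worker_pool() when setting up worklists, timers, the worker ida and the manager mutex. The macro's definition is outside this hunk; a plausible shape for such an iterator over an embedded array, shown with simplified stand-in types, is:

#include <stdio.h>

#define NR_WORKER_POOLS 2

struct worker_pool { int id; };
struct global_cwq  { struct worker_pool pools[NR_WORKER_POOLS]; };

/* assumed shape of the iterator; not copied from the patch */
#define for_each_worker_pool(pool, gcwq) \
    for ((pool) = &(gcwq)->pools[0]; \
         (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++)

int main(void)
{
    struct global_cwq gcwq = { .pools = { { .id = 0 }, { .id = 1 } } };
    struct worker_pool *pool;

    for_each_worker_pool(pool, &gcwq)
        printf("initializing pool %d\n", pool->id);
    return 0;
}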