10 files changed, 207 insertions, 153 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b7935fcec7d9..7fee567153f0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1253,7 +1253,7 @@ retry:
 /*
 * Cross CPU call to disable a performance event
 */
-static int __perf_event_disable(void *info)
+int __perf_event_disable(void *info)
 {
        struct perf_event *event = info;
        struct perf_event_context *ctx = event->ctx;
@@ -2935,12 +2935,12 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
 /*
 * Called when the last reference to the file is gone.
 */
-static int perf_release(struct inode *inode, struct file *file)
+static void put_event(struct perf_event *event)
 {
-        struct perf_event *event = file->private_data;
        struct task_struct *owner;
-        file->private_data = NULL;
+        if (!atomic_long_dec_and_test(&event->refcount))
+                return;
        rcu_read_lock();
        owner = ACCESS_ONCE(event->owner);
@@ -2975,7 +2975,13 @@ static int perf_release(struct inode *inode, struct file *file)
                put_task_struct(owner);
        }
-        return perf_event_release_kernel(event);
+        perf_event_release_kernel(event);
+}
+static int perf_release(struct inode *inode, struct file *file)
+{
+        put_event(file->private_data);
+        return 0;
 }
 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
@@ -3227,7 +3233,7 @@ unlock:
 static const struct file_operations perf_fops;
-static struct perf_event *perf_fget_light(int fd, int *fput_needed)
+static struct file *perf_fget_light(int fd, int *fput_needed)
 {
        struct file *file;
@@ -3241,7 +3247,7 @@ static struct perf_event *perf_fget_light(int fd, int *fput_needed)
                return ERR_PTR(-EBADF);
        }
-        return file->private_data;
+        return file;
 }
 static int perf_event_set_output(struct perf_event *event,
@@ -3273,19 +3279,21 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
        case PERF_EVENT_IOC_SET_OUTPUT:
        {
+                struct file *output_file = NULL;
                struct perf_event *output_event = NULL;
                int fput_needed = 0;
                int ret;
                if (arg != -1) {
-                        output_event = perf_fget_light(arg, &fput_needed);
+                        output_file = perf_fget_light(arg, &fput_needed);
-                        if (IS_ERR(output_event))
+                        if (IS_ERR(output_file))
-                                return PTR_ERR(output_event);
+                                return PTR_ERR(output_file);
+                        output_event = output_file->private_data;
                }
                ret = perf_event_set_output(event, output_event);
                if (output_event)
-                        fput_light(output_event->filp, fput_needed);
+                        fput_light(output_file, fput_needed);
                return ret;
        }
@@ -5950,6 +5958,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
        mutex_init(&event->mmap_mutex);
+        atomic_long_set(&event->refcount, 1);
        event->cpu              = cpu;
        event->attr             = *attr;
        event->group_leader     = group_leader;
@@ -6260,12 +6269,12 @@ SYSCALL_DEFINE5(perf_event_open,
                return event_fd;
        if (group_fd != -1) {
-                group_leader = perf_fget_light(group_fd, &fput_needed);
+                group_file = perf_fget_light(group_fd, &fput_needed);
-                if (IS_ERR(group_leader)) {
+                if (IS_ERR(group_file)) {
-                        err = PTR_ERR(group_leader);
+                        err = PTR_ERR(group_file);
                        goto err_fd;
                }
-                group_file = group_leader->filp;
+                group_leader = group_file->private_data;
                if (flags & PERF_FLAG_FD_OUTPUT)
                        output_event = group_leader;
                if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -6402,7 +6411,6 @@ SYSCALL_DEFINE5(perf_event_open,
                put_ctx(gctx);
        }
-        event->filp = event_file;
        WARN_ON_ONCE(ctx->parent_ctx);
        mutex_lock(&ctx->mutex);
@@ -6496,7 +6504,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
                goto err_free;
        }
-        event->filp = NULL;
        WARN_ON_ONCE(ctx->parent_ctx);
        mutex_lock(&ctx->mutex);
        perf_install_in_context(ctx, event, cpu);
@@ -6578,7 +6585,7 @@ static void sync_child_event(struct perf_event *child_event,
         * Release the parent event, if this was the last
         * reference to it.
         */
-        fput(parent_event->filp);
+        put_event(parent_event);
 }
 static void
@@ -6654,9 +6661,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
         *
         *   __perf_event_exit_task()
         *     sync_child_event()
-         *       fput(parent_event->filp)
+         *       put_event()
-         *         perf_release()
+         *         mutex_lock(&ctx->mutex)
-         *           mutex_lock(&ctx->mutex)
         *
         * But since its the parent context it won't be the same instance.
         */
@@ -6724,7 +6730,7 @@ static void perf_free_event(struct perf_event *event,
        list_del_init(&event->child_list);
        mutex_unlock(&parent->child_mutex);
-        fput(parent->filp);
+        put_event(parent);
        perf_group_detach(event);
        list_del_event(event, ctx);
@@ -6804,6 +6810,12 @@ inherit_event(struct perf_event *parent_event,
                                           NULL, NULL);
        if (IS_ERR(child_event))
                return child_event;
+        if (!atomic_long_inc_not_zero(&parent_event->refcount)) {
+                free_event(child_event);
+                return NULL;
+        }
        get_ctx(child_ctx);
        /*
@@ -6845,14 +6857,6 @@ inherit_event(struct perf_event *parent_event,
        raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
        /*
-         * Get a reference to the parent filp - we will fput it
-         * when the child event exits. This is safe to do because
-         * we are in the parent and we know that the filp still
-         * exists and has a nonzero count:
-         */
-        atomic_long_inc(&parent_event->filp->f_count);
-        /*
         * Link this into the parent event's child list
         */
        WARN_ON_ONCE(parent_event->ctx->parent_ctx);
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index bb38c4d3ee12..9a7b487c6fe2 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -453,7 +453,16 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
        int old_type = bp->attr.bp_type;
        int err = 0;
-        perf_event_disable(bp);
+        /*
+         * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it
+         * will not be possible to raise IPIs that invoke __perf_event_disable.
+         * So call the function directly after making sure we are targeting the
+         * current task.
+         */
+        if (irqs_disabled() && bp->ctx && bp->ctx->task == current)
+                __perf_event_disable(bp);
+        else
+                perf_event_disable(bp);
        bp->attr.bp_addr = attr->bp_addr;
        bp->attr.bp_type = attr->bp_type;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index b3c7fd554250..6144bab8fd8e 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -232,15 +232,19 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
         */
        tmp.data = &current->nsproxy->pid_ns->last_pid;
-        return proc_dointvec(&tmp, write, buffer, lenp, ppos);
+        return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 }
+extern int pid_max;
+static int zero = 0;
 static struct ctl_table pid_ns_ctl_table[] = {
        {
                .procname = "ns_last_pid",
                .maxlen = sizeof(int),
                .mode = 0666, /* permissions are checked in the handler */
                .proc_handler = pid_ns_ctl_handler,
+                .extra1 = &zero,
+                .extra2 = &pid_max,
        },
        { }
 };
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fbf1fd098dc6..649c9f876cb1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5304,27 +5304,17 @@ void idle_task_exit(void)
 }
 /*
- * While a dead CPU has no uninterruptible tasks queued at this point,
+ * Since this CPU is going 'away' for a while, fold any nr_active delta
- * it might still have a nonzero ->nr_uninterruptible counter, because
+ * we might have. Assumes we're called after migrate_tasks() so that the
- * for performance reasons the counter is not stricly tracking tasks to
+ * nr_active count is stable.
- * their home CPUs. So we just add the counter to another CPU's counter,
+ *
- * to keep the global sum constant after CPU-down:
+ * Also see the comment "Global load-average calculations".
- */
-static void migrate_nr_uninterruptible(struct rq *rq_src)
-{
-        struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
-        rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
-        rq_src->nr_uninterruptible = 0;
-}
-/*
- * remove the tasks which were accounted by rq from calc_load_tasks.
 */
-static void calc_global_load_remove(struct rq *rq)
+static void calc_load_migrate(struct rq *rq)
 {
-        atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+        long delta = calc_load_fold_active(rq);
-        rq->calc_load_active = 0;
+        if (delta)
+                atomic_long_add(delta, &calc_load_tasks);
 }
 /*
@@ -5352,9 +5342,6 @@ static void migrate_tasks(unsigned int dead_cpu)
         */
        rq->stop = NULL;
-        /* Ensure any throttled groups are reachable by pick_next_task */
-        unthrottle_offline_cfs_rqs(rq);
        for ( ; ; ) {
                /*
                 * There's this thread running, bail when that's the only
@@ -5618,8 +5605,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                BUG_ON(rq->nr_running != 1); /* the migration thread */
                raw_spin_unlock_irqrestore(&rq->lock, flags);
-                migrate_nr_uninterruptible(rq);
+                calc_load_migrate(rq);
-                calc_global_load_remove(rq);
                break;
 #endif
        }
@@ -6028,11 +6014,6 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
 * allows us to avoid some pointer chasing select_idle_sibling().
 *
- * Iterate domains and sched_groups downward, assigning CPUs to be
- * select_idle_sibling() hw buddy.  Cross-wiring hw makes bouncing
- * due to random perturbation self canceling, ie sw buddies pull
- * their counterpart to their CPU's hw counterpart.
- *
 * Also keep a unique ID per domain (we use the first cpu number in
 * the cpumask of the domain), this allows us to quickly tell if
 * two cpus are in the same cache domain, see cpus_share_cache().
@@ -6046,40 +6027,8 @@ static void update_top_cache_domain(int cpu)
        int id = cpu;
        sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
-        if (sd) {
+        if (sd)
-                struct sched_domain *tmp = sd;
-                struct sched_group *sg, *prev;
-                bool right;
-                /*
-                 * Traverse to first CPU in group, and count hops
-                 * to cpu from there, switching direction on each
-                 * hop, never ever pointing the last CPU rightward.
-                 */
-                do {
-                        id = cpumask_first(sched_domain_span(tmp));
-                        prev = sg = tmp->groups;
-                        right = 1;
-                        while (cpumask_first(sched_group_cpus(sg)) != id)
-                                sg = sg->next;
-                        while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
-                                prev = sg;
-                                sg = sg->next;
-                                right = !right;
-                        }
-                        /* A CPU went down, never point back to domain start. */
-                        if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
-                                right = false;
-                        sg = right ? sg->next : prev;
-                        tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
-                } while ((tmp = tmp->child));
                id = cpumask_first(sched_domain_span(sd));
-        }
        rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
        per_cpu(sd_llc_id, cpu) = id;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c219bf8d704c..96e2b18b6283 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2052,7 +2052,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
        hrtimer_cancel(&cfs_b->slack_timer);
 }
-void unthrottle_offline_cfs_rqs(struct rq *rq)
+static void unthrottle_offline_cfs_rqs(struct rq *rq)
 {
        struct cfs_rq *cfs_rq;
@@ -2106,7 +2106,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
        return NULL;
 }
 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
-void unthrottle_offline_cfs_rqs(struct rq *rq) {}
+static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
 #endif /* CONFIG_CFS_BANDWIDTH */
@@ -2637,6 +2637,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
        int cpu = smp_processor_id();
        int prev_cpu = task_cpu(p);
        struct sched_domain *sd;
+        struct sched_group *sg;
+        int i;
        /*
         * If the task is going to be woken-up on this cpu and if it is
@@ -2653,17 +2655,29 @@ static int select_idle_sibling(struct task_struct *p, int target)
                return prev_cpu;
        /*
-         * Otherwise, check assigned siblings to find an elegible idle cpu.
+         * Otherwise, iterate the domains and find an eligible idle cpu.
         */
        sd = rcu_dereference(per_cpu(sd_llc, target));
        for_each_lower_domain(sd) {
-                if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
+                sg = sd->groups;
-                        continue;
+                do {
-                if (idle_cpu(sd->idle_buddy))
+                        if (!cpumask_intersects(sched_group_cpus(sg),
-                        return sd->idle_buddy;
+                                                tsk_cpus_allowed(p)))
-        }
+                                goto next;
+                        for_each_cpu(i, sched_group_cpus(sg)) {
+                                if (!idle_cpu(i))
+                                        goto next;
+                        }
+                        target = cpumask_first_and(sched_group_cpus(sg),
+                                        tsk_cpus_allowed(p));
+                        goto done;
+next:
+                        sg = sg->next;
+                } while (sg != sd->groups);
+        }
+done:
        return target;
 }
@@ -3658,7 +3672,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
 * @group: sched_group whose statistics are to be updated.
 * @load_idx: Load index of sched_domain of this_cpu for load calc.
 * @local_group: Does group contain this_cpu.
- * @cpus: Set of cpus considered for load balancing.
 * @balance: Should we balance.
 * @sgs: variable to hold the statistics for this group.
 */
@@ -3805,7 +3818,6 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 /**
 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
 * @env: The load balancing environment.
- * @cpus: Set of cpus considered for load balancing.
 * @balance: Should we balance.
 * @sds: variable to hold the statistics for this sched_domain.
 */
@@ -4956,6 +4968,9 @@ static void rq_online_fair(struct rq *rq)
 static void rq_offline_fair(struct rq *rq)
 {
        update_sysctl();
+        /* Ensure any throttled groups are reachable by pick_next_task */
+        unthrottle_offline_cfs_rqs(rq);
 }
 #endif /* CONFIG_SMP */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 944cb68420e9..e0b7ba9c040f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -691,6 +691,7 @@ balanced:
                 * runtime - in which case borrowing doesn't make sense.
                 */
                rt_rq->rt_runtime = RUNTIME_INF;
+                rt_rq->rt_throttled = 0;
                raw_spin_unlock(&rt_rq->rt_runtime_lock);
                raw_spin_unlock(&rt_b->rt_runtime_lock);
        }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f6714d009e77..0848fa36c383 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1144,7 +1144,6 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
 extern void init_cfs_rq(struct cfs_rq *cfs_rq);
 extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
-extern void unthrottle_offline_cfs_rqs(struct rq *rq);
 extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 024540f97f74..3a9e5d5c1091 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -573,6 +573,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
        tick_do_update_jiffies64(now);
        update_cpu_load_nohz();
+        calc_load_exit_idle();
        touch_softlockup_watchdog();
        /*
         * Cancel the scheduled timer and restore the tick
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 34e5eac81424..d3b91e75cecd 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -303,10 +303,11 @@ void getnstimeofday(struct timespec *ts)
                seq = read_seqbegin(&tk->lock);
                ts->tv_sec = tk->xtime_sec;
-                ts->tv_nsec = timekeeping_get_ns(tk);
+                nsecs = timekeeping_get_ns(tk);
        } while (read_seqretry(&tk->lock, seq));
+        ts->tv_nsec = 0;
        timespec_add_ns(ts, nsecs);
 }
 EXPORT_SYMBOL(getnstimeofday);
@@ -345,6 +346,7 @@ void ktime_get_ts(struct timespec *ts)
 {
        struct timekeeper *tk = &timekeeper;
        struct timespec tomono;
+        s64 nsec;
        unsigned int seq;
        WARN_ON(timekeeping_suspended);
@@ -352,13 +354,14 @@ void ktime_get_ts(struct timespec *ts)
        do {
                seq = read_seqbegin(&tk->lock);
                ts->tv_sec = tk->xtime_sec;
-                ts->tv_nsec = timekeeping_get_ns(tk);
+                nsec = timekeeping_get_ns(tk);
                tomono = tk->wall_to_monotonic;
        } while (read_seqretry(&tk->lock, seq));
-        set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
+        ts->tv_sec += tomono.tv_sec;
-                                ts->tv_nsec + tomono.tv_nsec);
+        ts->tv_nsec = 0;
+        timespec_add_ns(ts, nsec + tomono.tv_nsec);
 }
 EXPORT_SYMBOL_GPL(ktime_get_ts);
@@ -1244,6 +1247,7 @@ void get_monotonic_boottime(struct timespec *ts)
 {
        struct timekeeper *tk = &timekeeper;
        struct timespec tomono, sleep;
+        s64 nsec;
        unsigned int seq;
        WARN_ON(timekeeping_suspended);
@@ -1251,14 +1255,15 @@ void get_monotonic_boottime(struct timespec *ts)
        do {
                seq = read_seqbegin(&tk->lock);
                ts->tv_sec = tk->xtime_sec;
-                ts->tv_nsec = timekeeping_get_ns(tk);
+                nsec = timekeeping_get_ns(tk);
                tomono = tk->wall_to_monotonic;
                sleep = tk->total_sleep_time;
        } while (read_seqretry(&tk->lock, seq));
-        set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
+        ts->tv_sec += tomono.tv_sec + sleep.tv_sec;
-                        ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec);
+        ts->tv_nsec = 0;
+        timespec_add_ns(ts, nsec + tomono.tv_nsec + sleep.tv_nsec);
 }
 EXPORT_SYMBOL_GPL(get_monotonic_boottime);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 692d97628a10..3c5a79e2134c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -66,6 +66,7 @@ enum {
        /* pool flags */
        POOL_MANAGE_WORKERS     = 1 << 0,       /* need to manage workers */
+        POOL_MANAGING_WORKERS   = 1 << 1,       /* managing workers */
        /* worker flags */
        WORKER_STARTED          = 1 << 0,       /* started */
@@ -652,7 +653,7 @@ static bool need_to_manage_workers(struct worker_pool *pool)
 /* Do we have too many workers and should some go away? */
 static bool too_many_workers(struct worker_pool *pool)
 {
-        bool managing = mutex_is_locked(&pool->manager_mutex);
+        bool managing = pool->flags & POOL_MANAGING_WORKERS;
        int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
        int nr_busy = pool->nr_workers - nr_idle;
@@ -1326,6 +1327,15 @@ static void idle_worker_rebind(struct worker *worker)
        /* we did our part, wait for rebind_workers() to finish up */
        wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND));
+        /*
+         * rebind_workers() shouldn't finish until all workers passed the
+         * above WORKER_REBIND wait.  Tell it when done.
+         */
+        spin_lock_irq(&worker->pool->gcwq->lock);
+        if (!--worker->idle_rebind->cnt)
+                complete(&worker->idle_rebind->done);
+        spin_unlock_irq(&worker->pool->gcwq->lock);
 }
 /*
@@ -1339,8 +1349,16 @@ static void busy_worker_rebind_fn(struct work_struct *work)
        struct worker *worker = container_of(work, struct worker, rebind_work);
        struct global_cwq *gcwq = worker->pool->gcwq;
-        if (worker_maybe_bind_and_lock(worker))
+        worker_maybe_bind_and_lock(worker);
-                worker_clr_flags(worker, WORKER_REBIND);
+        /*
+         * %WORKER_REBIND must be cleared even if the above binding failed;
+         * otherwise, we may confuse the next CPU_UP cycle or oops / get
+         * stuck by calling idle_worker_rebind() prematurely.  If CPU went
+         * down again in between, %WORKER_UNBOUND would be set, so clearing
+         * %WORKER_REBIND is always safe.
+         */
+        worker_clr_flags(worker, WORKER_REBIND);
        spin_unlock_irq(&gcwq->lock);
 }
@@ -1396,12 +1414,15 @@ retry:
        /* set REBIND and kick idle ones, we'll wait for these later */
        for_each_worker_pool(pool, gcwq) {
                list_for_each_entry(worker, &pool->idle_list, entry) {
+                        unsigned long worker_flags = worker->flags;
                        if (worker->flags & WORKER_REBIND)
                                continue;
-                        /* morph UNBOUND to REBIND */
+                        /* morph UNBOUND to REBIND atomically */
-                        worker->flags &= ~WORKER_UNBOUND;
+                        worker_flags &= ~WORKER_UNBOUND;
-                        worker->flags |= WORKER_REBIND;
+                        worker_flags |= WORKER_REBIND;
+                        ACCESS_ONCE(worker->flags) = worker_flags;
                        idle_rebind.cnt++;
                        worker->idle_rebind = &idle_rebind;
@@ -1419,25 +1440,15 @@ retry:
                goto retry;
        }
-        /*
+        /* all idle workers are rebound, rebind busy workers */
-         * All idle workers are rebound and waiting for %WORKER_REBIND to
-         * be cleared inside idle_worker_rebind().  Clear and release.
-         * Clearing %WORKER_REBIND from this foreign context is safe
-         * because these workers are still guaranteed to be idle.
-         */
-        for_each_worker_pool(pool, gcwq)
-                list_for_each_entry(worker, &pool->idle_list, entry)
-                        worker->flags &= ~WORKER_REBIND;
-        wake_up_all(&gcwq->rebind_hold);
-        /* rebind busy workers */
        for_each_busy_worker(worker, i, pos, gcwq) {
                struct work_struct *rebind_work = &worker->rebind_work;
+                unsigned long worker_flags = worker->flags;
-                /* morph UNBOUND to REBIND */
+                /* morph UNBOUND to REBIND atomically */
-                worker->flags &= ~WORKER_UNBOUND;
+                worker_flags &= ~WORKER_UNBOUND;
-                worker->flags |= WORKER_REBIND;
+                worker_flags |= WORKER_REBIND;
+                ACCESS_ONCE(worker->flags) = worker_flags;
                if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
                                     work_data_bits(rebind_work)))
@@ -1449,6 +1460,34 @@ retry:
                            worker->scheduled.next,
                            work_color_to_flags(WORK_NO_COLOR));
        }
+        /*
+         * All idle workers are rebound and waiting for %WORKER_REBIND to
+         * be cleared inside idle_worker_rebind().  Clear and release.
+         * Clearing %WORKER_REBIND from this foreign context is safe
+         * because these workers are still guaranteed to be idle.
+         *
+         * We need to make sure all idle workers passed WORKER_REBIND wait
+         * in idle_worker_rebind() before returning; otherwise, workers can
+         * get stuck at the wait if hotplug cycle repeats.
+         */
+        idle_rebind.cnt = 1;
+        INIT_COMPLETION(idle_rebind.done);
+        for_each_worker_pool(pool, gcwq) {
+                list_for_each_entry(worker, &pool->idle_list, entry) {
+                        worker->flags &= ~WORKER_REBIND;
+                        idle_rebind.cnt++;
+                }
+        }
+        wake_up_all(&gcwq->rebind_hold);
+        if (--idle_rebind.cnt) {
+                spin_unlock_irq(&gcwq->lock);
+                wait_for_completion(&idle_rebind.done);
+                spin_lock_irq(&gcwq->lock);
+        }
 }
 static struct worker *alloc_worker(void)
@@ -1794,9 +1833,45 @@ static bool manage_workers(struct worker *worker)
        struct worker_pool *pool = worker->pool;
        bool ret = false;
-        if (!mutex_trylock(&pool->manager_mutex))
+        if (pool->flags & POOL_MANAGING_WORKERS)
                return ret;
+        pool->flags |= POOL_MANAGING_WORKERS;
+        /*
+         * To simplify both worker management and CPU hotplug, hold off
+         * management while hotplug is in progress.  CPU hotplug path can't
+         * grab %POOL_MANAGING_WORKERS to achieve this because that can
+         * lead to idle worker depletion (all become busy thinking someone
+         * else is managing) which in turn can result in deadlock under
+         * extreme circumstances.  Use @pool->manager_mutex to synchronize
+         * manager against CPU hotplug.
+         *
+         * manager_mutex would always be free unless CPU hotplug is in
+         * progress.  trylock first without dropping @gcwq->lock.
+         */
+        if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
+                spin_unlock_irq(&pool->gcwq->lock);
+                mutex_lock(&pool->manager_mutex);
+                /*
+                 * CPU hotplug could have happened while we were waiting
+                 * for manager_mutex.  Hotplug itself can't handle us
+                 * because a manager is on neither the idle nor busy list, and
+                 * @gcwq's state and ours could have deviated.
+                 *
+                 * As hotplug is now excluded via manager_mutex, we can
+                 * simply try to bind.  It will succeed or fail depending
+                 * on @gcwq's current state.  Try it and adjust
+                 * %WORKER_UNBOUND accordingly.
+                 */
+                if (worker_maybe_bind_and_lock(worker))
+                        worker->flags &= ~WORKER_UNBOUND;
+                else
+                        worker->flags |= WORKER_UNBOUND;
+                ret = true;
+        }
        pool->flags &= ~POOL_MANAGE_WORKERS;
        /*
@@ -1806,6 +1881,7 @@ static bool manage_workers(struct worker *worker)
        ret |= maybe_destroy_workers(pool);
        ret |= maybe_create_worker(pool);
+        pool->flags &= ~POOL_MANAGING_WORKERS;
        mutex_unlock(&pool->manager_mutex);
        return ret;
 }
@@ -3500,18 +3576,17 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
 #ifdef CONFIG_SMP
 struct work_for_cpu {
-        struct completion completion;
+        struct work_struct work;
        long (*fn)(void *);
        void *arg;
        long ret;
 };
-static int do_work_for_cpu(void *_wfc)
+static void work_for_cpu_fn(struct work_struct *work)
 {
-        struct work_for_cpu *wfc = _wfc;
+        struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);
        wfc->ret = wfc->fn(wfc->arg);
-        complete(&wfc->completion);
-        return 0;
 }
 /**
@@ -3526,19 +3601,11 @@ static int do_work_for_cpu(void *_wfc)
 */
 long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
 {
-        struct task_struct *sub_thread;
+        struct work_for_cpu wfc = { .fn = fn, .arg = arg };
-        struct work_for_cpu wfc = {
-                .completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion),
-                .fn = fn,
-                .arg = arg,
-        };
-        sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu");
+        INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
-        if (IS_ERR(sub_thread))
+        schedule_work_on(cpu, &wfc.work);
-                return PTR_ERR(sub_thread);
+        flush_work(&wfc.work);
-        kthread_bind(sub_thread, cpu);
-        wake_up_process(sub_thread);
-        wait_for_completion(&wfc.completion);
        return wfc.ret;
 }
 EXPORT_SYMBOL_GPL(work_on_cpu);