path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/events/core.c	22
-rw-r--r--	kernel/exit.c	31
-rw-r--r--	kernel/gcov/Kconfig	3
-rw-r--r--	kernel/irq/generic-chip.c	18
-rw-r--r--	kernel/irq/handle.c	6
-rw-r--r--	kernel/irq/irqdesc.c	14
-rw-r--r--	kernel/irq/manage.c	27
-rw-r--r--	kernel/irq/spurious.c	31
-rw-r--r--	kernel/jump_label.c	14
-rw-r--r--	kernel/kmod.c	16
-rw-r--r--	kernel/lockdep.c	2
-rw-r--r--	kernel/power/snapshot.c	6
-rw-r--r--	kernel/power/user.c	4
-rw-r--r--	kernel/rcutree.c	390
-rw-r--r--	kernel/rcutree.h	12
-rw-r--r--	kernel/rcutree_plugin.h	427
-rw-r--r--	kernel/rcutree_trace.c	32
-rw-r--r--	kernel/resource.c	116
-rw-r--r--	kernel/sched.c	45
-rw-r--r--	kernel/sched_rt.c	6
-rw-r--r--	kernel/signal.c	2
-rw-r--r--	kernel/smp.c	5
-rw-r--r--	kernel/softirq.c	2
-rw-r--r--	kernel/sysctl.c	6
-rw-r--r--	kernel/taskstats.c	15
-rw-r--r--	kernel/time/alarmtimer.c	158
-rw-r--r--	kernel/time/clockevents.c	5
-rw-r--r--	kernel/time/clocksource.c	24
-rw-r--r--	kernel/timer.c	15
-rw-r--r--	kernel/trace/ftrace.c	9
-rw-r--r--	kernel/trace/trace_kprobe.c	8
-rw-r--r--	kernel/trace/trace_printk.c	5
32 files changed, 897 insertions, 579 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d863b3c057bb..9efe7108ccaf 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7402,26 +7402,12 @@ static int __perf_cgroup_move(void *info)
 	return 0;
 }
 
-static void perf_cgroup_move(struct task_struct *task)
+static void
+perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task)
 {
 	task_function_call(task, __perf_cgroup_move, task);
 }
 
-static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-		struct cgroup *old_cgrp, struct task_struct *task,
-		bool threadgroup)
-{
-	perf_cgroup_move(task);
-	if (threadgroup) {
-		struct task_struct *c;
-		rcu_read_lock();
-		list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
-			perf_cgroup_move(c);
-		}
-		rcu_read_unlock();
-	}
-}
-
 static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
 		struct cgroup *old_cgrp, struct task_struct *task)
 {
@@ -7433,7 +7419,7 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
 	if (!(task->flags & PF_EXITING))
 		return;
 
-	perf_cgroup_move(task);
+	perf_cgroup_attach_task(cgrp, task);
 }
 
 struct cgroup_subsys perf_subsys = {
@@ -7442,6 +7428,6 @@ struct cgroup_subsys perf_subsys = {
 	.create		= perf_cgroup_create,
 	.destroy	= perf_cgroup_destroy,
 	.exit		= perf_cgroup_exit,
-	.attach		= perf_cgroup_attach,
+	.attach_task	= perf_cgroup_attach_task,
 };
 #endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/exit.c b/kernel/exit.c
index 20a406471525..f2b321bae440 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -561,29 +561,28 @@ void exit_files(struct task_struct *tsk)
 
 #ifdef CONFIG_MM_OWNER
 /*
- * Task p is exiting and it owned mm, lets find a new owner for it
+ * A task is exiting.  If it owned this mm, find a new owner for the mm.
  */
-static inline int
-mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
-{
-	/*
-	 * If there are other users of the mm and the owner (us) is exiting
-	 * we need to find a new owner to take on the responsibility.
-	 */
-	if (atomic_read(&mm->mm_users) <= 1)
-		return 0;
-	if (mm->owner != p)
-		return 0;
-	return 1;
-}
-
 void mm_update_next_owner(struct mm_struct *mm)
 {
 	struct task_struct *c, *g, *p = current;
 
 retry:
-	if (!mm_need_new_owner(mm, p))
+	/*
+	 * If the exiting or execing task is not the owner, it's
+	 * someone else's problem.
+	 */
+	if (mm->owner != p)
 		return;
+	/*
+	 * The current owner is exiting/execing and there are no other
+	 * candidates.  Do not leave the mm pointing to a possibly
+	 * freed task structure.
+	 */
+	if (atomic_read(&mm->mm_users) <= 1) {
+		mm->owner = NULL;
+		return;
+	}
 
 	read_lock(&tasklist_lock);
 	/*
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index b8cadf70b1fb..5bf924d80b5c 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -2,7 +2,8 @@ menu "GCOV-based kernel profiling"
 
 config GCOV_KERNEL
 	bool "Enable gcov-based kernel profiling"
-	depends on DEBUG_FS && CONSTRUCTORS
+	depends on DEBUG_FS
+	select CONSTRUCTORS
 	default n
 	---help---
 	This option enables gcov-based code profiling (e.g. for code coverage
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 31a9db711906..3a2cab407b93 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -101,10 +101,10 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
 }
 
 /**
- * irq_gc_ack - Ack pending interrupt
+ * irq_gc_ack_set_bit - Ack pending interrupt via setting bit
  * @d: irq_data
  */
-void irq_gc_ack(struct irq_data *d)
+void irq_gc_ack_set_bit(struct irq_data *d)
 {
 	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
 	u32 mask = 1 << (d->irq - gc->irq_base);
@@ -115,6 +115,20 @@ void irq_gc_ack(struct irq_data *d)
 }
 
 /**
+ * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit
+ * @d: irq_data
+ */
+void irq_gc_ack_clr_bit(struct irq_data *d)
+{
+	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+	u32 mask = ~(1 << (d->irq - gc->irq_base));
+
+	irq_gc_lock(gc);
+	irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
+	irq_gc_unlock(gc);
+}
+
+/**
  * irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt
  * @d: irq_data
  */
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 90cb55f6d7eb..470d08c82bbe 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -133,12 +133,6 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
 	switch (res) {
 	case IRQ_WAKE_THREAD:
 		/*
-		 * Set result to handled so the spurious check
-		 * does not trigger.
-		 */
-		res = IRQ_HANDLED;
-
-		/*
 		 * Catch drivers which return WAKE_THREAD but
 		 * did not set up a thread function
 		 */
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 886e80347b32..4c60a50e66b2 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -257,13 +257,11 @@ int __init early_irq_init(void)
 	count = ARRAY_SIZE(irq_desc);
 
 	for (i = 0; i < count; i++) {
-		desc[i].irq_data.irq = i;
-		desc[i].irq_data.chip = &no_irq_chip;
 		desc[i].kstat_irqs = alloc_percpu(unsigned int);
-		irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
-		alloc_masks(desc + i, GFP_KERNEL, node);
-		desc_smp_init(desc + i, node);
+		alloc_masks(&desc[i], GFP_KERNEL, node);
+		raw_spin_lock_init(&desc[i].lock);
 		lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
+		desc_set_defaults(i, &desc[i], node);
 	}
 	return arch_early_irq_init();
 }
@@ -346,6 +344,12 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
 	if (!cnt)
 		return -EINVAL;
 
+	if (irq >= 0) {
+		if (from > irq)
+			return -EINVAL;
+		from = irq;
+	}
+
 	mutex_lock(&sparse_irq_lock);
 
 	start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
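A minimal usage sketch of the new argument check, for illustration only (the caller function is made up; irq_alloc_descs() itself is the function shown in the hunk header above): requesting a specific first interrupt number while passing a larger search offset is now rejected instead of silently allocating somewhere else, and "from" is clamped to "irq".

static int __init example_alloc_irqs(void)		/* hypothetical caller */
{
	int irq;

	/* Request 4 descriptors starting exactly at IRQ 16; returns 16 on success. */
	irq = irq_alloc_descs(16, 16, 4, -1);
	if (irq < 0)
		return irq;

	/* irq_alloc_descs(16, 32, 4, -1) would now fail with -EINVAL. */
	return 0;
}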
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index f7ce0021e1c4..0a7840aeb0fb 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -491,6 +491,9 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on)
 	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
 	int ret = 0;
 
+	if (!desc)
+		return -EINVAL;
+
 	/* wakeup-capable irqs can be shared between drivers that
 	 * don't need to have the same sleep mode behaviors.
 	 */
@@ -723,13 +726,16 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
  * context. So we need to disable bh here to avoid deadlocks and other
  * side effects.
  */
-static void
+static irqreturn_t
 irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
 {
+	irqreturn_t ret;
+
 	local_bh_disable();
-	action->thread_fn(action->irq, action->dev_id);
+	ret = action->thread_fn(action->irq, action->dev_id);
 	irq_finalize_oneshot(desc, action, false);
 	local_bh_enable();
+	return ret;
 }
 
 /*
@@ -737,10 +743,14 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
  * preemtible - many of them need to sleep and wait for slow busses to
  * complete.
  */
-static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action)
+static irqreturn_t irq_thread_fn(struct irq_desc *desc,
+		struct irqaction *action)
 {
-	action->thread_fn(action->irq, action->dev_id);
+	irqreturn_t ret;
+
+	ret = action->thread_fn(action->irq, action->dev_id);
 	irq_finalize_oneshot(desc, action, false);
+	return ret;
 }
 
 /*
@@ -753,7 +763,8 @@ static int irq_thread(void *data)
 	};
 	struct irqaction *action = data;
 	struct irq_desc *desc = irq_to_desc(action->irq);
-	void (*handler_fn)(struct irq_desc *desc, struct irqaction *action);
+	irqreturn_t (*handler_fn)(struct irq_desc *desc,
+			struct irqaction *action);
 	int wake;
 
 	if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD,
@@ -783,8 +794,12 @@ static int irq_thread(void *data)
 			desc->istate |= IRQS_PENDING;
 			raw_spin_unlock_irq(&desc->lock);
 		} else {
+			irqreturn_t action_ret;
+
 			raw_spin_unlock_irq(&desc->lock);
-			handler_fn(desc, action);
+			action_ret = handler_fn(desc, action);
+			if (!noirqdebug)
+				note_interrupt(action->irq, desc, action_ret);
 		}
 
 		wake = atomic_dec_and_test(&desc->threads_active);
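For illustration only (device name and handlers are made up, not part of the patch): with the thread wrappers now returning irqreturn_t, the value a driver's threaded handler returns is fed back into note_interrupt(), so the same spurious-IRQ accounting that already covered primary handlers covers threaded ones too. A typical driver set up with request_threaded_irq() might look like this sketch:

#include <linux/interrupt.h>

static irqreturn_t foo_quick_check(int irq, void *dev_id)
{
	/* ack/mask in hard-irq context, then defer the real work */
	return IRQ_WAKE_THREAD;
}

static irqreturn_t foo_thread_fn(int irq, void *dev_id)
{
	/*
	 * The value returned here now reaches note_interrupt(), so
	 * returning IRQ_NONE for interrupts that were not ours lets the
	 * spurious-IRQ logic see them as well.
	 */
	return IRQ_HANDLED;
}

/* request_threaded_irq(irq, foo_quick_check, foo_thread_fn,
 *			IRQF_SHARED, "foo", dev); */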
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index dfbd550401b2..aa57d5da18c1 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -167,6 +167,13 @@ out:
 			  jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
 }
 
+static inline int bad_action_ret(irqreturn_t action_ret)
+{
+	if (likely(action_ret <= (IRQ_HANDLED | IRQ_WAKE_THREAD)))
+		return 0;
+	return 1;
+}
+
 /*
  * If 99,900 of the previous 100,000 interrupts have not been handled
  * then assume that the IRQ is stuck in some manner. Drop a diagnostic
@@ -182,7 +189,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
 	struct irqaction *action;
 	unsigned long flags;
 
-	if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) {
+	if (bad_action_ret(action_ret)) {
 		printk(KERN_ERR "irq event %d: bogus return value %x\n",
 				irq, action_ret);
 	} else {
@@ -201,10 +208,11 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
 	raw_spin_lock_irqsave(&desc->lock, flags);
 	action = desc->action;
 	while (action) {
-		printk(KERN_ERR "[<%p>]", action->handler);
-		print_symbol(" (%s)",
-			(unsigned long)action->handler);
-		printk("\n");
+		printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler);
+		if (action->thread_fn)
+			printk(KERN_CONT " threaded [<%p>] %pf",
+					action->thread_fn, action->thread_fn);
+		printk(KERN_CONT "\n");
 		action = action->next;
 	}
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -262,7 +270,16 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
 	if (desc->istate & IRQS_POLL_INPROGRESS)
 		return;
 
-	if (unlikely(action_ret != IRQ_HANDLED)) {
+	/* we get here again via the threaded handler */
+	if (action_ret == IRQ_WAKE_THREAD)
+		return;
+
+	if (bad_action_ret(action_ret)) {
+		report_bad_irq(irq, desc, action_ret);
+		return;
+	}
+
+	if (unlikely(action_ret == IRQ_NONE)) {
 		/*
 		 * If we are seeing only the odd spurious IRQ caused by
 		 * bus asynchronicity then don't eventually trigger an error,
@@ -274,8 +291,6 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
 		else
 			desc->irqs_unhandled++;
 		desc->last_unhandled = jiffies;
-		if (unlikely(action_ret != IRQ_NONE))
-			report_bad_irq(irq, desc, action_ret);
 	}
 
 	if (unlikely(try_misrouted_irq(irq, desc, action_ret))) {
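Illustration only, not part of the patch: with IRQ_NONE == 0, IRQ_HANDLED == 1 and IRQ_WAKE_THREAD == 2 in <linux/irqreturn.h>, any OR-combination of the legitimate flags is at most (IRQ_HANDLED | IRQ_WAKE_THREAD) == 3, so the new bad_action_ret() only flags values a handler should never return:

	bad_action_ret(IRQ_NONE);			/* 0 -> fine */
	bad_action_ret(IRQ_HANDLED);			/* 0 -> fine */
	bad_action_ret(IRQ_HANDLED | IRQ_WAKE_THREAD);	/* 0 -> fine */
	bad_action_ret((irqreturn_t)0x10);		/* 1 -> reported as bogus */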
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index fa27e750dbc0..a8ce45097f3d 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -375,15 +375,19 @@ int jump_label_text_reserved(void *start, void *end)
 
 static void jump_label_update(struct jump_label_key *key, int enable)
 {
-	struct jump_entry *entry = key->entries;
-
-	/* if there are no users, entry can be NULL */
-	if (entry)
-		__jump_label_update(key, entry, __stop___jump_table, enable);
+	struct jump_entry *entry = key->entries, *stop = __stop___jump_table;
 
 #ifdef CONFIG_MODULES
+	struct module *mod = __module_address((jump_label_t)key);
+
 	__jump_label_mod_update(key, enable);
+
+	if (mod)
+		stop = mod->jump_entries + mod->num_jump_entries;
 #endif
+	/* if there are no users, entry can be NULL */
+	if (entry)
+		__jump_label_update(key, entry, stop, enable);
 }
 
 #endif
diff --git a/kernel/kmod.c b/kernel/kmod.c
index ad6a81c58b44..47613dfb7b28 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -156,12 +156,6 @@ static int ____call_usermodehelper(void *data)
 	 */
 	set_user_nice(current, 0);
 
-	if (sub_info->init) {
-		retval = sub_info->init(sub_info);
-		if (retval)
-			goto fail;
-	}
-
 	retval = -ENOMEM;
 	new = prepare_kernel_cred(current);
 	if (!new)
@@ -173,6 +167,14 @@ static int ____call_usermodehelper(void *data)
 			     new->cap_inheritable);
 	spin_unlock(&umh_sysctl_lock);
 
+	if (sub_info->init) {
+		retval = sub_info->init(sub_info, new);
+		if (retval) {
+			abort_creds(new);
+			goto fail;
+		}
+	}
+
 	commit_creds(new);
 
 	retval = kernel_execve(sub_info->path,
@@ -388,7 +390,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
  * context in which call_usermodehelper_exec is called.
  */
 void call_usermodehelper_setfns(struct subprocess_info *info,
-		    int (*init)(struct subprocess_info *info),
+		    int (*init)(struct subprocess_info *info, struct cred *new),
 		    void (*cleanup)(struct subprocess_info *info),
 		    void *data)
 {
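A hedged sketch of what a caller's init callback can now do (the foo_* names and the fsuid tweak are invented for illustration; only the callback signature comes from the patch): because the not-yet-committed credentials are passed in, the callback can adjust them directly instead of poking at the helper's current->cred after the fact.

static int foo_umh_init(struct subprocess_info *info, struct cred *new)
{
	struct foo_ctx *ctx = info->data;	/* set via call_usermodehelper_setfns() */

	/* adjust the helper's credentials before they are committed */
	new->fsuid = ctx->fsuid;		/* purely illustrative */
	return 0;
}

/* call_usermodehelper_setfns(info, foo_umh_init, foo_umh_cleanup, ctx); */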
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 63437d065ac8..298c9276dfdb 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3426,7 +3426,7 @@ int lock_is_held(struct lockdep_map *lock)
 	int ret = 0;
 
 	if (unlikely(current->lockdep_recursion))
-		return ret;
+		return 1; /* avoid false negative lockdep_assert_held() */
 
 	raw_local_irq_save(flags);
 	check_flags(flags);
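Illustration only (foo_update() and its lock are hypothetical): lockdep_assert_held() reduces to roughly WARN_ON(debug_locks && !lockdep_is_held(lock)), and lockdep_is_held() ends up in lock_is_held(). Returning 0 while lockdep is recursing would make such an assertion warn even though the lock really is held; returning 1 in that window suppresses the false positive.

void foo_update(struct foo *f)
{
	lockdep_assert_held(&f->lock);	/* no spurious warning during lockdep recursion */
	/* ... */
}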
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index ace55889f702..06efa54f93d6 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1211,7 +1211,11 @@ static void free_unnecessary_pages(void)
 		to_free_highmem = alloc_highmem - save;
 	} else {
 		to_free_highmem = 0;
-		to_free_normal -= save - alloc_highmem;
+		save -= alloc_highmem;
+		if (to_free_normal > save)
+			to_free_normal -= save;
+		else
+			to_free_normal = 0;
 	}
 
 	memory_bm_position_reset(&copy_bm);
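For illustration only (the concrete numbers are made up): the page counts here are unsigned, so the old expression could wrap around when save - alloc_highmem exceeded to_free_normal, making the code try to free a huge number of pages; the new form clamps at zero instead.

	unsigned long to_free_normal = 10, save = 100, alloc_highmem = 20;

	to_free_normal -= save - alloc_highmem;	/* old code: wraps to ~ULONG_MAX - 69 */

	/* new code: */
	save -= alloc_highmem;			/* 80 */
	if (to_free_normal > save)
		to_free_normal -= save;
	else
		to_free_normal = 0;		/* taken here: clamp instead of wrap */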
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 7d02d33be699..42ddbc6f0de6 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -113,8 +113,10 @@ static int snapshot_open(struct inode *inode, struct file *filp)
 		if (error)
 			pm_notifier_call_chain(PM_POST_RESTORE);
 	}
-	if (error)
+	if (error) {
+		free_basic_memory_bitmaps();
 		atomic_inc(&snapshot_device_available);
+	}
 	data->frozen = 0;
 	data->ready = 0;
 	data->platform_support = 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 77a7671dd147..ba06207b1dd3 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -84,10 +84,35 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
 
 static struct rcu_state *rcu_state;
 
+/*
+ * The rcu_scheduler_active variable transitions from zero to one just
+ * before the first task is spawned.  So when this variable is zero, RCU
+ * can assume that there is but one task, allowing RCU to (for example)
+ * optimized synchronize_sched() to a simple barrier().  When this variable
+ * is one, RCU must actually do all the hard work required to detect real
+ * grace periods.  This variable is also used to suppress boot-time false
+ * positives from lockdep-RCU error checking.
+ */
 int rcu_scheduler_active __read_mostly;
 EXPORT_SYMBOL_GPL(rcu_scheduler_active);
 
 /*
+ * The rcu_scheduler_fully_active variable transitions from zero to one
+ * during the early_initcall() processing, which is after the scheduler
+ * is capable of creating new tasks.  So RCU processing (for example,
+ * creating tasks for RCU priority boosting) must be delayed until after
+ * rcu_scheduler_fully_active transitions from zero to one.  We also
+ * currently delay invocation of any RCU callbacks until after this point.
+ *
+ * It might later prove better for people registering RCU callbacks during
+ * early boot to take responsibility for these callbacks, but one step at
+ * a time.
+ */
+static int rcu_scheduler_fully_active __read_mostly;
+
+#ifdef CONFIG_RCU_BOOST
+
+/*
  * Control variables for per-CPU and per-rcu_node kthreads.  These
  * handle all flavors of RCU.
  */
@@ -96,10 +121,12 @@ DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
 DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
 DEFINE_PER_CPU(char, rcu_cpu_has_work);
-static char rcu_kthreads_spawnable;
+
+#endif /* #ifdef CONFIG_RCU_BOOST */
 
 static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
-static void invoke_rcu_cpu_kthread(void);
+static void invoke_rcu_core(void);
+static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
 
 #define RCU_KTHREAD_PRIO 1	/* RT priority for per-CPU kthreads. */
 
@@ -1088,14 +1115,8 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
 	int need_report = 0;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp;
-	struct task_struct *t;
 
-	/* Stop the CPU's kthread. */
-	t = per_cpu(rcu_cpu_kthread_task, cpu);
-	if (t != NULL) {
-		per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
-		kthread_stop(t);
-	}
+	rcu_stop_cpu_kthread(cpu);
 
 	/* Exclude any attempts to start a new grace period. */
 	raw_spin_lock_irqsave(&rsp->onofflock, flags);
@@ -1231,7 +1252,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 
 	/* Re-raise the RCU softirq if there are callbacks remaining. */
 	if (cpu_has_callbacks_ready_to_invoke(rdp))
-		invoke_rcu_cpu_kthread();
+		invoke_rcu_core();
 }
 
 /*
@@ -1277,7 +1298,7 @@ void rcu_check_callbacks(int cpu, int user)
 	}
 	rcu_preempt_check_callbacks(cpu);
 	if (rcu_pending(cpu))
-		invoke_rcu_cpu_kthread();
+		invoke_rcu_core();
 }
 
 #ifdef CONFIG_SMP
@@ -1442,13 +1463,14 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
 	}
 
 	/* If there are callbacks ready, invoke them. */
-	rcu_do_batch(rsp, rdp);
+	if (cpu_has_callbacks_ready_to_invoke(rdp))
+		invoke_rcu_callbacks(rsp, rdp);
 }
 
 /*
  * Do softirq processing for the current CPU.
  */
-static void rcu_process_callbacks(void)
+static void rcu_process_callbacks(struct softirq_action *unused)
 {
 	__rcu_process_callbacks(&rcu_sched_state,
 				&__get_cpu_var(rcu_sched_data));
@@ -1465,330 +1487,22 @@ static void rcu_process_callbacks(void)
  * the current CPU with interrupts disabled, the rcu_cpu_kthread_task
  * cannot disappear out from under us.
  */
-static void invoke_rcu_cpu_kthread(void)
+static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
 {
-	unsigned long flags;
-
-	local_irq_save(flags);
-	__this_cpu_write(rcu_cpu_has_work, 1);
-	if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) {
-		local_irq_restore(flags);
+	if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active)))
 		return;
-	}
-	wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
-	local_irq_restore(flags);
-}
-
-/*
- * Wake up the specified per-rcu_node-structure kthread.
- * Because the per-rcu_node kthreads are immortal, we don't need
- * to do anything to keep them alive.
- */
-static void invoke_rcu_node_kthread(struct rcu_node *rnp)
-{
-	struct task_struct *t;
-
-	t = rnp->node_kthread_task;
-	if (t != NULL)
-		wake_up_process(t);
-}
-
-/*
- * Set the specified CPU's kthread to run RT or not, as specified by
- * the to_rt argument.  The CPU-hotplug locks are held, so the task
- * is not going away.
- */
-static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
-{
-	int policy;
-	struct sched_param sp;
-	struct task_struct *t;
-
-	t = per_cpu(rcu_cpu_kthread_task, cpu);
-	if (t == NULL)
+	if (likely(!rsp->boost)) {
+		rcu_do_batch(rsp, rdp);
 		return;
-	if (to_rt) {
-		policy = SCHED_FIFO;
-		sp.sched_priority = RCU_KTHREAD_PRIO;
-	} else {
-		policy = SCHED_NORMAL;
-		sp.sched_priority = 0;
 	}
-	sched_setscheduler_nocheck(t, policy, &sp);
+	invoke_rcu_callbacks_kthread();
 }
 
-/*
- * Timer handler to initiate the waking up of per-CPU kthreads that
- * have yielded the CPU due to excess numbers of RCU callbacks.
- * We wake up the per-rcu_node kthread, which in turn will wake up
- * the booster kthread.
- */
-static void rcu_cpu_kthread_timer(unsigned long arg)
+static void invoke_rcu_core(void)
 {
-	struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
-	struct rcu_node *rnp = rdp->mynode;
-
-	atomic_or(rdp->grpmask, &rnp->wakemask);
-	invoke_rcu_node_kthread(rnp);
-}
-
-/*
- * Drop to non-real-time priority and yield, but only after posting a
- * timer that will cause us to regain our real-time priority if we
- * remain preempted.  Either way, we restore our real-time priority
- * before returning.
- */
-static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
-{
-	struct sched_param sp;
-	struct timer_list yield_timer;
-
-	setup_timer_on_stack(&yield_timer, f, arg);
-	mod_timer(&yield_timer, jiffies + 2);
-	sp.sched_priority = 0;
-	sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
-	set_user_nice(current, 19);
-	schedule();
-	sp.sched_priority = RCU_KTHREAD_PRIO;
-	sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
-	del_timer(&yield_timer);
-}
-
-/*
- * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
- * This can happen while the corresponding CPU is either coming online
- * or going offline.  We cannot wait until the CPU is fully online
- * before starting the kthread, because the various notifier functions
- * can wait for RCU grace periods.  So we park rcu_cpu_kthread() until
- * the corresponding CPU is online.
- *
- * Return 1 if the kthread needs to stop, 0 otherwise.
- *
- * Caller must disable bh.  This function can momentarily enable it.
- */
-static int rcu_cpu_kthread_should_stop(int cpu)
-{
-	while (cpu_is_offline(cpu) ||
-	       !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
-	       smp_processor_id() != cpu) {
-		if (kthread_should_stop())
-			return 1;
-		per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
-		per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
-		local_bh_enable();
-		schedule_timeout_uninterruptible(1);
-		if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
-			set_cpus_allowed_ptr(current, cpumask_of(cpu));
-		local_bh_disable();
-	}
-	per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
-	return 0;
+	raise_softirq(RCU_SOFTIRQ);
 }
 
-/*
- * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
- * earlier RCU softirq.
- */
-static int rcu_cpu_kthread(void *arg)
-{
-	int cpu = (int)(long)arg;
-	unsigned long flags;
-	int spincnt = 0;
-	unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
-	char work;
-	char *workp = &per_cpu(rcu_cpu_has_work, cpu);
-
-	for (;;) {
-		*statusp = RCU_KTHREAD_WAITING;
-		rcu_wait(*workp != 0 || kthread_should_stop());
-		local_bh_disable();
-		if (rcu_cpu_kthread_should_stop(cpu)) {
-			local_bh_enable();
-			break;
-		}
-		*statusp = RCU_KTHREAD_RUNNING;
-		per_cpu(rcu_cpu_kthread_loops, cpu)++;
-		local_irq_save(flags);
-		work = *workp;
-		*workp = 0;
-		local_irq_restore(flags);
-		if (work)
-			rcu_process_callbacks();
-		local_bh_enable();
-		if (*workp != 0)
-			spincnt++;
-		else
-			spincnt = 0;
-		if (spincnt > 10) {
-			*statusp = RCU_KTHREAD_YIELDING;
-			rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
-			spincnt = 0;
-		}
-	}
-	*statusp = RCU_KTHREAD_STOPPED;
-	return 0;
-}
-
-/*
- * Spawn a per-CPU kthread, setting up affinity and priority.
- * Because the CPU hotplug lock is held, no other CPU will be attempting
- * to manipulate rcu_cpu_kthread_task.  There might be another CPU
- * attempting to access it during boot, but the locking in kthread_bind()
- * will enforce sufficient ordering.
- */
-static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
-{
-	struct sched_param sp;
-	struct task_struct *t;
-
-	if (!rcu_kthreads_spawnable ||
-	    per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
-		return 0;
-	t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu);
-	if (IS_ERR(t))
-		return PTR_ERR(t);
-	kthread_bind(t, cpu);
-	set_task_state(t, TASK_INTERRUPTIBLE);
-	per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
-	WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
-	per_cpu(rcu_cpu_kthread_task, cpu) = t;
-	sp.sched_priority = RCU_KTHREAD_PRIO;
-	sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
-	return 0;
-}
-
-/*
- * Per-rcu_node kthread, which is in charge of waking up the per-CPU
- * kthreads when needed.  We ignore requests to wake up kthreads
- * for offline CPUs, which is OK because force_quiescent_state()
- * takes care of this case.
- */
-static int rcu_node_kthread(void *arg)
-{
-	int cpu;
-	unsigned long flags;
-	unsigned long mask;
-	struct rcu_node *rnp = (struct rcu_node *)arg;
-	struct sched_param sp;
-	struct task_struct *t;
-
-	for (;;) {
-		rnp->node_kthread_status = RCU_KTHREAD_WAITING;
-		rcu_wait(atomic_read(&rnp->wakemask) != 0);
-		rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
-		raw_spin_lock_irqsave(&rnp->lock, flags);
-		mask = atomic_xchg(&rnp->wakemask, 0);
-		rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
-		for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
-			if ((mask & 0x1) == 0)
-				continue;
-			preempt_disable();
-			t = per_cpu(rcu_cpu_kthread_task, cpu);
-			if (!cpu_online(cpu) || t == NULL) {
-				preempt_enable();
-				continue;
-			}
-			per_cpu(rcu_cpu_has_work, cpu) = 1;
-			sp.sched_priority = RCU_KTHREAD_PRIO;
-			sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
-			preempt_enable();
-		}
-	}
-	/* NOTREACHED */
-	rnp->node_kthread_status = RCU_KTHREAD_STOPPED;
-	return 0;
-}
-
-/*
- * Set the per-rcu_node kthread's affinity to cover all CPUs that are
- * served by the rcu_node in question.  The CPU hotplug lock is still
- * held, so the value of rnp->qsmaskinit will be stable.
- *
- * We don't include outgoingcpu in the affinity set, use -1 if there is
- * no outgoing CPU.  If there are no CPUs left in the affinity set,
- * this function allows the kthread to execute on any CPU.
- */
-static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
-{
-	cpumask_var_t cm;
-	int cpu;
-	unsigned long mask = rnp->qsmaskinit;
-
-	if (rnp->node_kthread_task == NULL)
-		return;
-	if (!alloc_cpumask_var(&cm, GFP_KERNEL))
-		return;
-	cpumask_clear(cm);
-	for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
-		if ((mask & 0x1) && cpu != outgoingcpu)
-			cpumask_set_cpu(cpu, cm);
-	if (cpumask_weight(cm) == 0) {
-		cpumask_setall(cm);
-		for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
-			cpumask_clear_cpu(cpu, cm);
-		WARN_ON_ONCE(cpumask_weight(cm) == 0);
-	}
-	set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
-	rcu_boost_kthread_setaffinity(rnp, cm);
-	free_cpumask_var(cm);
-}
-
-/*
- * Spawn a per-rcu_node kthread, setting priority and affinity.
- * Called during boot before online/offline can happen, or, if
- * during runtime, with the main CPU-hotplug locks held.  So only
- * one of these can be executing at a time.
- */
-static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
-						struct rcu_node *rnp)
-{
-	unsigned long flags;
-	int rnp_index = rnp - &rsp->node[0];
-	struct sched_param sp;
-	struct task_struct *t;
-
-	if (!rcu_kthreads_spawnable ||
-	    rnp->qsmaskinit == 0)
-		return 0;
-	if (rnp->node_kthread_task == NULL) {
-		t = kthread_create(rcu_node_kthread, (void *)rnp,
-				   "rcun%d", rnp_index);
-		if (IS_ERR(t))
-			return PTR_ERR(t);
-		raw_spin_lock_irqsave(&rnp->lock, flags);
-		set_task_state(t, TASK_INTERRUPTIBLE);
-		rnp->node_kthread_task = t;
-		raw_spin_unlock_irqrestore(&rnp->lock, flags);
-		sp.sched_priority = 99;
-		sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
-	}
-	return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
-}
-
-/*
- * Spawn all kthreads -- called as soon as the scheduler is running.
- */
-static int __init rcu_spawn_kthreads(void)
-{
-	int cpu;
-	struct rcu_node *rnp;
-
-	rcu_kthreads_spawnable = 1;
-	for_each_possible_cpu(cpu) {
-		per_cpu(rcu_cpu_has_work, cpu) = 0;
-		if (cpu_online(cpu))
-			(void)rcu_spawn_one_cpu_kthread(cpu);
-	}
-	rnp = rcu_get_root(rcu_state);
-	(void)rcu_spawn_one_node_kthread(rcu_state, rnp);
-	if (NUM_RCU_NODES > 1) {
-		rcu_for_each_leaf_node(rcu_state, rnp)
-			(void)rcu_spawn_one_node_kthread(rcu_state, rnp);
-	}
-	return 0;
-}
-early_initcall(rcu_spawn_kthreads);
-
 static void
 __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
 	   struct rcu_state *rsp)
@@ -2188,26 +1902,13 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
 	raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
 }
 
-static void __cpuinit rcu_online_cpu(int cpu)
+static void __cpuinit rcu_prepare_cpu(int cpu)
 {
 	rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
 	rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
 	rcu_preempt_init_percpu_data(cpu);
 }
 
-static void __cpuinit rcu_online_kthreads(int cpu)
-{
-	struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
-	struct rcu_node *rnp = rdp->mynode;
-
-	/* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
-	if (rcu_kthreads_spawnable) {
-		(void)rcu_spawn_one_cpu_kthread(cpu);
-		if (rnp->node_kthread_task == NULL)
-			(void)rcu_spawn_one_node_kthread(rcu_state, rnp);
-	}
-}
-
 /*
  * Handle CPU online/offline notification events.
  */
@@ -2221,8 +1922,8 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 	switch (action) {
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
-		rcu_online_cpu(cpu);
-		rcu_online_kthreads(cpu);
+		rcu_prepare_cpu(cpu);
+		rcu_prepare_kthreads(cpu);
 		break;
 	case CPU_ONLINE:
 	case CPU_DOWN_FAILED:
@@ -2372,6 +2073,7 @@ void __init rcu_init(void)
 	rcu_init_one(&rcu_sched_state, &rcu_sched_data);
 	rcu_init_one(&rcu_bh_state, &rcu_bh_data);
 	__rcu_init_preempt();
+	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 
 	/*
 	 * We don't need protection against CPU-hotplug here because
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 7b9a08b4aaea..01b2ccda26fb 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -369,6 +369,7 @@ struct rcu_state {
 						/*  period because */
 						/*  force_quiescent_state() */
 						/*  was running. */
+	u8	boost;				/* Subject to priority boost. */
 	unsigned long gpnum;			/* Current gp number. */
 	unsigned long completed;		/* # of last completed gp. */
 
@@ -426,6 +427,7 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
 #ifdef CONFIG_HOTPLUG_CPU
 static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
 				      unsigned long flags);
+static void rcu_stop_cpu_kthread(int cpu);
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 static void rcu_print_detail_task_stall(struct rcu_state *rsp);
 static void rcu_print_task_stall(struct rcu_node *rnp);
@@ -450,11 +452,19 @@ static void rcu_preempt_send_cbs_to_online(void);
 static void __init __rcu_init_preempt(void);
 static void rcu_needs_cpu_flush(void);
 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
+static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
+static void invoke_rcu_callbacks_kthread(void);
+#ifdef CONFIG_RCU_BOOST
+static void rcu_preempt_do_callbacks(void);
 static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
 					  cpumask_var_t cm);
-static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
 static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
 						 struct rcu_node *rnp,
 						 int rnp_index);
+static void invoke_rcu_node_kthread(struct rcu_node *rnp);
+static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
+static void __cpuinit rcu_prepare_kthreads(int cpu);
 
 #endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index a767b7dac365..75113cb7c4fb 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -602,6 +602,15 @@ static void rcu_preempt_process_callbacks(void)
 			 &__get_cpu_var(rcu_preempt_data));
 }
 
+#ifdef CONFIG_RCU_BOOST
+
+static void rcu_preempt_do_callbacks(void)
+{
+	rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data));
+}
+
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
 /*
  * Queue a preemptible-RCU callback for invocation after a grace period.
  */
@@ -1249,6 +1258,23 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
 }
 
 /*
+ * Wake up the per-CPU kthread to invoke RCU callbacks.
+ */
+static void invoke_rcu_callbacks_kthread(void)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__this_cpu_write(rcu_cpu_has_work, 1);
+	if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) {
+		local_irq_restore(flags);
+		return;
+	}
+	wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
+	local_irq_restore(flags);
+}
+
+/*
  * Set the affinity of the boost kthread.  The CPU-hotplug locks are
  * held, so no one should be messing with the existence of the boost
  * kthread.
@@ -1288,6 +1314,7 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
 
 	if (&rcu_preempt_state != rsp)
 		return 0;
+	rsp->boost = 1;
 	if (rnp->boost_kthread_task != NULL)
 		return 0;
 	t = kthread_create(rcu_boost_kthread, (void *)rnp,
@@ -1295,14 +1322,378 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
 	if (IS_ERR(t))
 		return PTR_ERR(t);
 	raw_spin_lock_irqsave(&rnp->lock, flags);
-	set_task_state(t, TASK_INTERRUPTIBLE);
 	rnp->boost_kthread_task = t;
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 	sp.sched_priority = RCU_KTHREAD_PRIO;
 	sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+	wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
+	return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Stop the RCU's per-CPU kthread when its CPU goes offline,.
+ */
+static void rcu_stop_cpu_kthread(int cpu)
+{
+	struct task_struct *t;
+
+	/* Stop the CPU's kthread. */
+	t = per_cpu(rcu_cpu_kthread_task, cpu);
+	if (t != NULL) {
+		per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
+		kthread_stop(t);
+	}
+}
+
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+
+static void rcu_kthread_do_work(void)
+{
+	rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data));
+	rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
+	rcu_preempt_do_callbacks();
+}
+
+/*
+ * Wake up the specified per-rcu_node-structure kthread.
+ * Because the per-rcu_node kthreads are immortal, we don't need
+ * to do anything to keep them alive.
+ */
+static void invoke_rcu_node_kthread(struct rcu_node *rnp)
+{
+	struct task_struct *t;
+
+	t = rnp->node_kthread_task;
+	if (t != NULL)
+		wake_up_process(t);
+}
+
+/*
+ * Set the specified CPU's kthread to run RT or not, as specified by
+ * the to_rt argument.  The CPU-hotplug locks are held, so the task
+ * is not going away.
+ */
+static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
+{
+	int policy;
+	struct sched_param sp;
+	struct task_struct *t;
+
+	t = per_cpu(rcu_cpu_kthread_task, cpu);
+	if (t == NULL)
+		return;
+	if (to_rt) {
+		policy = SCHED_FIFO;
+		sp.sched_priority = RCU_KTHREAD_PRIO;
+	} else {
+		policy = SCHED_NORMAL;
+		sp.sched_priority = 0;
+	}
+	sched_setscheduler_nocheck(t, policy, &sp);
+}
+
+/*
+ * Timer handler to initiate the waking up of per-CPU kthreads that
+ * have yielded the CPU due to excess numbers of RCU callbacks.
+ * We wake up the per-rcu_node kthread, which in turn will wake up
+ * the booster kthread.
+ */
+static void rcu_cpu_kthread_timer(unsigned long arg)
+{
+	struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
+	struct rcu_node *rnp = rdp->mynode;
+
+	atomic_or(rdp->grpmask, &rnp->wakemask);
+	invoke_rcu_node_kthread(rnp);
+}
+
+/*
+ * Drop to non-real-time priority and yield, but only after posting a
+ * timer that will cause us to regain our real-time priority if we
+ * remain preempted.  Either way, we restore our real-time priority
+ * before returning.
+ */
+static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
+{
+	struct sched_param sp;
+	struct timer_list yield_timer;
+
+	setup_timer_on_stack(&yield_timer, f, arg);
+	mod_timer(&yield_timer, jiffies + 2);
+	sp.sched_priority = 0;
+	sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
+	set_user_nice(current, 19);
+	schedule();
+	sp.sched_priority = RCU_KTHREAD_PRIO;
+	sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+	del_timer(&yield_timer);
+}
+
+/*
+ * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
+ * This can happen while the corresponding CPU is either coming online
+ * or going offline.  We cannot wait until the CPU is fully online
+ * before starting the kthread, because the various notifier functions
+ * can wait for RCU grace periods.  So we park rcu_cpu_kthread() until
+ * the corresponding CPU is online.
+ *
+ * Return 1 if the kthread needs to stop, 0 otherwise.
+ *
+ * Caller must disable bh.  This function can momentarily enable it.
+ */
+static int rcu_cpu_kthread_should_stop(int cpu)
+{
+	while (cpu_is_offline(cpu) ||
+	       !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
+	       smp_processor_id() != cpu) {
+		if (kthread_should_stop())
+			return 1;
+		per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
+		per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
+		local_bh_enable();
+		schedule_timeout_uninterruptible(1);
+		if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
+			set_cpus_allowed_ptr(current, cpumask_of(cpu));
+		local_bh_disable();
+	}
+	per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
+	return 0;
+}
+
+/*
+ * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
+ * earlier RCU softirq.
+ */
+static int rcu_cpu_kthread(void *arg)
+{
+	int cpu = (int)(long)arg;
+	unsigned long flags;
+	int spincnt = 0;
+	unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
+	char work;
+	char *workp = &per_cpu(rcu_cpu_has_work, cpu);
+
+	for (;;) {
+		*statusp = RCU_KTHREAD_WAITING;
+		rcu_wait(*workp != 0 || kthread_should_stop());
+		local_bh_disable();
+		if (rcu_cpu_kthread_should_stop(cpu)) {
+			local_bh_enable();
+			break;
+		}
+		*statusp = RCU_KTHREAD_RUNNING;
+		per_cpu(rcu_cpu_kthread_loops, cpu)++;
+		local_irq_save(flags);
+		work = *workp;
+		*workp = 0;
+		local_irq_restore(flags);
+		if (work)
+			rcu_kthread_do_work();
+		local_bh_enable();
+		if (*workp != 0)
+			spincnt++;
+		else
+			spincnt = 0;
+		if (spincnt > 10) {
+			*statusp = RCU_KTHREAD_YIELDING;
+			rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
+			spincnt = 0;
+		}
+	}
+	*statusp = RCU_KTHREAD_STOPPED;
 	return 0;
 }
 
+/*
+ * Spawn a per-CPU kthread, setting up affinity and priority.
+ * Because the CPU hotplug lock is held, no other CPU will be attempting
+ * to manipulate rcu_cpu_kthread_task.  There might be another CPU
+ * attempting to access it during boot, but the locking in kthread_bind()
+ * will enforce sufficient ordering.
+ *
+ * Please note that we cannot simply refuse to wake up the per-CPU
+ * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state,
+ * which can result in softlockup complaints if the task ends up being
+ * idle for more than a couple of minutes.
+ *
+ * However, please note also that we cannot bind the per-CPU kthread to its
+ * CPU until that CPU is fully online.  We also cannot wait until the
+ * CPU is fully online before we create its per-CPU kthread, as this would
+ * deadlock the system when CPU notifiers tried waiting for grace
+ * periods.  So we bind the per-CPU kthread to its CPU only if the CPU
+ * is online.  If its CPU is not yet fully online, then the code in
+ * rcu_cpu_kthread() will wait until it is fully online, and then do
+ * the binding.
+ */
+static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
+{
+	struct sched_param sp;
+	struct task_struct *t;
+
+	if (!rcu_scheduler_fully_active ||
+	    per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
+		return 0;
+	t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu);
+	if (IS_ERR(t))
+		return PTR_ERR(t);
+	if (cpu_online(cpu))
+		kthread_bind(t, cpu);
+	per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
+	WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
+	sp.sched_priority = RCU_KTHREAD_PRIO;
+	sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+	per_cpu(rcu_cpu_kthread_task, cpu) = t;
+	wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */
+	return 0;
+}
+
+/*
+ * Per-rcu_node kthread, which is in charge of waking up the per-CPU
+ * kthreads when needed.  We ignore requests to wake up kthreads
+ * for offline CPUs, which is OK because force_quiescent_state()
+ * takes care of this case.
+ */
+static int rcu_node_kthread(void *arg)
+{
+	int cpu;
+	unsigned long flags;
+	unsigned long mask;
+	struct rcu_node *rnp = (struct rcu_node *)arg;
+	struct sched_param sp;
+	struct task_struct *t;
+
+	for (;;) {
+		rnp->node_kthread_status = RCU_KTHREAD_WAITING;
+		rcu_wait(atomic_read(&rnp->wakemask) != 0);
+		rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
+		raw_spin_lock_irqsave(&rnp->lock, flags);
+		mask = atomic_xchg(&rnp->wakemask, 0);
+		rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
+		for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
+			if ((mask & 0x1) == 0)
+				continue;
+			preempt_disable();
+			t = per_cpu(rcu_cpu_kthread_task, cpu);
+			if (!cpu_online(cpu) || t == NULL) {
+				preempt_enable();
+				continue;
+			}
+			per_cpu(rcu_cpu_has_work, cpu) = 1;
+			sp.sched_priority = RCU_KTHREAD_PRIO;
+			sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+			preempt_enable();
+		}
+	}
+	/* NOTREACHED */
+	rnp->node_kthread_status = RCU_KTHREAD_STOPPED;
+	return 0;
+}
+
+/*
+ * Set the per-rcu_node kthread's affinity to cover all CPUs that are
+ * served by the rcu_node in question.  The CPU hotplug lock is still
+ * held, so the value of rnp->qsmaskinit will be stable.
+ *
+ * We don't include outgoingcpu in the affinity set, use -1 if there is
+ * no outgoing CPU.  If there are no CPUs left in the affinity set,
+ * this function allows the kthread to execute on any CPU.
+ */
+static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+{
+	cpumask_var_t cm;
+	int cpu;
+	unsigned long mask = rnp->qsmaskinit;
+
+	if (rnp->node_kthread_task == NULL)
+		return;
+	if (!alloc_cpumask_var(&cm, GFP_KERNEL))
+		return;
+	cpumask_clear(cm);
+	for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
+		if ((mask & 0x1) && cpu != outgoingcpu)
+			cpumask_set_cpu(cpu, cm);
+	if (cpumask_weight(cm) == 0) {
+		cpumask_setall(cm);
+		for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
+			cpumask_clear_cpu(cpu, cm);
+		WARN_ON_ONCE(cpumask_weight(cm) == 0);
+	}
+	set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
+	rcu_boost_kthread_setaffinity(rnp, cm);
+	free_cpumask_var(cm);
+}
+
+/*
+ * Spawn a per-rcu_node kthread, setting priority and affinity.
+ * Called during boot before online/offline can happen, or, if
+ * during runtime, with the main CPU-hotplug locks held.  So only
+ * one of these can be executing at a time.
+ */
+static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
+						struct rcu_node *rnp)
+{
+	unsigned long flags;
+	int rnp_index = rnp - &rsp->node[0];
+	struct sched_param sp;
+	struct task_struct *t;
+
+	if (!rcu_scheduler_fully_active ||
+	    rnp->qsmaskinit == 0)
+		return 0;
+	if (rnp->node_kthread_task == NULL) {
+		t = kthread_create(rcu_node_kthread, (void *)rnp,
+				   "rcun%d", rnp_index);
+		if (IS_ERR(t))
+			return PTR_ERR(t);
+		raw_spin_lock_irqsave(&rnp->lock, flags);
+		rnp->node_kthread_task = t;
+		raw_spin_unlock_irqrestore(&rnp->lock, flags);
+		sp.sched_priority = 99;
+		sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+		wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
+	}
+	return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
+}
+
+/*
+ * Spawn all kthreads -- called as soon as the scheduler is running.
+ */
+static int __init rcu_spawn_kthreads(void)
+{
+	int cpu;
+	struct rcu_node *rnp;
+
+	rcu_scheduler_fully_active = 1;
+	for_each_possible_cpu(cpu) {
+		per_cpu(rcu_cpu_has_work, cpu) = 0;
+		if (cpu_online(cpu))
+			(void)rcu_spawn_one_cpu_kthread(cpu);
+	}
+	rnp = rcu_get_root(rcu_state);
+	(void)rcu_spawn_one_node_kthread(rcu_state, rnp);
+	if (NUM_RCU_NODES > 1) {
+		rcu_for_each_leaf_node(rcu_state, rnp)
+			(void)rcu_spawn_one_node_kthread(rcu_state, rnp);
+	}
+	return 0;
+}
+early_initcall(rcu_spawn_kthreads);
+
+static void __cpuinit rcu_prepare_kthreads(int cpu)
+{
+	struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
+	struct rcu_node *rnp = rdp->mynode;
+
+	/* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
+	if (rcu_scheduler_fully_active) {
+		(void)rcu_spawn_one_cpu_kthread(cpu);
+		if (rnp->node_kthread_task == NULL)
+			(void)rcu_spawn_one_node_kthread(rcu_state, rnp);
+	}
+}
+
 #else /* #ifdef CONFIG_RCU_BOOST */
 
 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
@@ -1310,21 +1701,41 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1310 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1701 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1311} 1702}
1312 1703
1313static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, 1704static void invoke_rcu_callbacks_kthread(void)
1314 cpumask_var_t cm)
1315{ 1705{
1706 WARN_ON_ONCE(1);
1316} 1707}
1317 1708
1318static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) 1709static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1319{ 1710{
1320} 1711}
1321 1712
1322static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 1713#ifdef CONFIG_HOTPLUG_CPU
1323 struct rcu_node *rnp, 1714
1324 int rnp_index) 1715static void rcu_stop_cpu_kthread(int cpu)
1325{ 1716{
1717}
1718
1719#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1720
1721static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1722{
1723}
1724
1725static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1726{
1727}
1728
1729static int __init rcu_scheduler_really_started(void)
1730{
1731 rcu_scheduler_fully_active = 1;
1326 return 0; 1732 return 0;
1327} 1733}
1734early_initcall(rcu_scheduler_really_started);
1735
1736static void __cpuinit rcu_prepare_kthreads(int cpu)
1737{
1738}
1328 1739
1329#endif /* #else #ifdef CONFIG_RCU_BOOST */ 1740#endif /* #else #ifdef CONFIG_RCU_BOOST */
1330 1741
@@ -1500,7 +1911,7 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1500 * 1911 *
1501 * Because it is not legal to invoke rcu_process_callbacks() with irqs 1912 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1502 * disabled, we do one pass of force_quiescent_state(), then do a 1913 * disabled, we do one pass of force_quiescent_state(), then do a
1503 * invoke_rcu_cpu_kthread() to cause rcu_process_callbacks() to be invoked 1914 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
1504 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. 1915 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing.
1505 */ 1916 */
1506int rcu_needs_cpu(int cpu) 1917int rcu_needs_cpu(int cpu)
@@ -1551,7 +1962,7 @@ int rcu_needs_cpu(int cpu)
1551 1962
1552 /* If RCU callbacks are still pending, RCU still needs this CPU. */ 1963 /* If RCU callbacks are still pending, RCU still needs this CPU. */
1553 if (c) 1964 if (c)
1554 invoke_rcu_cpu_kthread(); 1965 invoke_rcu_core();
1555 return c; 1966 return c;
1556} 1967}
1557 1968
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 9678cc3650f5..4e144876dc68 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,6 +46,8 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "rcutree.h"
48 48
49#ifdef CONFIG_RCU_BOOST
50
49DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); 51DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
50DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu); 52DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu);
51DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); 53DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
@@ -58,6 +60,8 @@ static char convert_kthread_status(unsigned int kthread_status)
58 return "SRWOY"[kthread_status]; 60 return "SRWOY"[kthread_status];
59} 61}
60 62
63#endif /* #ifdef CONFIG_RCU_BOOST */
64
61static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 65static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
62{ 66{
63 if (!rdp->beenonline) 67 if (!rdp->beenonline)
@@ -76,7 +80,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
76 rdp->dynticks_fqs); 80 rdp->dynticks_fqs);
77#endif /* #ifdef CONFIG_NO_HZ */ 81#endif /* #ifdef CONFIG_NO_HZ */
78 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 82 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
79 seq_printf(m, " ql=%ld qs=%c%c%c%c kt=%d/%c/%d ktl=%x b=%ld", 83 seq_printf(m, " ql=%ld qs=%c%c%c%c",
80 rdp->qlen, 84 rdp->qlen,
81 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 85 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
82 rdp->nxttail[RCU_NEXT_TAIL]], 86 rdp->nxttail[RCU_NEXT_TAIL]],
@@ -84,13 +88,16 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
84 rdp->nxttail[RCU_NEXT_READY_TAIL]], 88 rdp->nxttail[RCU_NEXT_READY_TAIL]],
85 ".W"[rdp->nxttail[RCU_DONE_TAIL] != 89 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
86 rdp->nxttail[RCU_WAIT_TAIL]], 90 rdp->nxttail[RCU_WAIT_TAIL]],
87 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]], 91 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
92#ifdef CONFIG_RCU_BOOST
93 seq_printf(m, " kt=%d/%c/%d ktl=%x",
88 per_cpu(rcu_cpu_has_work, rdp->cpu), 94 per_cpu(rcu_cpu_has_work, rdp->cpu),
89 convert_kthread_status(per_cpu(rcu_cpu_kthread_status, 95 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
90 rdp->cpu)), 96 rdp->cpu)),
91 per_cpu(rcu_cpu_kthread_cpu, rdp->cpu), 97 per_cpu(rcu_cpu_kthread_cpu, rdp->cpu),
92 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff, 98 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
93 rdp->blimit); 99#endif /* #ifdef CONFIG_RCU_BOOST */
100 seq_printf(m, " b=%ld", rdp->blimit);
94 seq_printf(m, " ci=%lu co=%lu ca=%lu\n", 101 seq_printf(m, " ci=%lu co=%lu ca=%lu\n",
95 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); 102 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
96} 103}
@@ -147,18 +154,21 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
147 rdp->dynticks_fqs); 154 rdp->dynticks_fqs);
148#endif /* #ifdef CONFIG_NO_HZ */ 155#endif /* #ifdef CONFIG_NO_HZ */
149 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 156 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
150 seq_printf(m, ",%ld,\"%c%c%c%c\",%d,\"%c\",%ld", rdp->qlen, 157 seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen,
151 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 158 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
152 rdp->nxttail[RCU_NEXT_TAIL]], 159 rdp->nxttail[RCU_NEXT_TAIL]],
153 ".R"[rdp->nxttail[RCU_WAIT_TAIL] != 160 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
154 rdp->nxttail[RCU_NEXT_READY_TAIL]], 161 rdp->nxttail[RCU_NEXT_READY_TAIL]],
155 ".W"[rdp->nxttail[RCU_DONE_TAIL] != 162 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
156 rdp->nxttail[RCU_WAIT_TAIL]], 163 rdp->nxttail[RCU_WAIT_TAIL]],
157 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]], 164 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
165#ifdef CONFIG_RCU_BOOST
166 seq_printf(m, ",%d,\"%c\"",
158 per_cpu(rcu_cpu_has_work, rdp->cpu), 167 per_cpu(rcu_cpu_has_work, rdp->cpu),
159 convert_kthread_status(per_cpu(rcu_cpu_kthread_status, 168 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
160 rdp->cpu)), 169 rdp->cpu)));
161 rdp->blimit); 170#endif /* #ifdef CONFIG_RCU_BOOST */
171 seq_printf(m, ",%ld", rdp->blimit);
162 seq_printf(m, ",%lu,%lu,%lu\n", 172 seq_printf(m, ",%lu,%lu,%lu\n",
163 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); 173 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
164} 174}
@@ -169,7 +179,11 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
169#ifdef CONFIG_NO_HZ 179#ifdef CONFIG_NO_HZ
170 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); 180 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
171#endif /* #ifdef CONFIG_NO_HZ */ 181#endif /* #ifdef CONFIG_NO_HZ */
172 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n"); 182 seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\"");
183#ifdef CONFIG_RCU_BOOST
184 seq_puts(m, "\"kt\",\"ktl\"");
185#endif /* #ifdef CONFIG_RCU_BOOST */
186 seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n");
173#ifdef CONFIG_TREE_PREEMPT_RCU 187#ifdef CONFIG_TREE_PREEMPT_RCU
174 seq_puts(m, "\"rcu_preempt:\"\n"); 188 seq_puts(m, "\"rcu_preempt:\"\n");
175 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); 189 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
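
The rcutree_trace.c hunks split each seq_printf() into an unconditional part plus a CONFIG_RCU_BOOST-only part, and the CSV header is split the same way so the columns line up whether or not boosting is built in. A userspace sketch of that pattern, with printf() standing in for seq_printf() and placeholder field values:

/* Sketch only: keeping an optional group of CSV columns in sync between
 * the header and the data rows, the way the rcutree_trace.c hunks split
 * seq_printf() around CONFIG_RCU_BOOST.  Field values are placeholders. */
#include <stdio.h>

/* #define CONFIG_RCU_BOOST 1 */	/* flip to include the extra columns */

static void print_header(void)
{
	printf("\"of\",\"ri\",\"ql\",\"qs\"");
#ifdef CONFIG_RCU_BOOST
	printf(",\"kt\",\"ktl\"");
#endif
	printf(",\"b\",\"ci\",\"co\",\"ca\"\n");
}

static void print_row(void)
{
	printf("%lu,%lu,%ld,\"%s\"", 0UL, 0UL, 3L, ".RWD");
#ifdef CONFIG_RCU_BOOST
	printf(",%d,\"%c\"", 1, 'R');
#endif
	printf(",%ld,%lu,%lu,%lu\n", 10L, 42UL, 0UL, 0UL);
}

int main(void)
{
	print_header();
	print_row();
	return 0;
}
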
diff --git a/kernel/resource.c b/kernel/resource.c
index 798e2fae2a06..3ff40178dce7 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -38,6 +38,14 @@ struct resource iomem_resource = {
38}; 38};
39EXPORT_SYMBOL(iomem_resource); 39EXPORT_SYMBOL(iomem_resource);
40 40
41/* constraints to be met while allocating resources */
42struct resource_constraint {
43 resource_size_t min, max, align;
44 resource_size_t (*alignf)(void *, const struct resource *,
45 resource_size_t, resource_size_t);
46 void *alignf_data;
47};
48
41static DEFINE_RWLOCK(resource_lock); 49static DEFINE_RWLOCK(resource_lock);
42 50
43static void *r_next(struct seq_file *m, void *v, loff_t *pos) 51static void *r_next(struct seq_file *m, void *v, loff_t *pos)
@@ -384,16 +392,13 @@ static bool resource_contains(struct resource *res1, struct resource *res2)
384} 392}
385 393
386/* 394/*
387 * Find empty slot in the resource tree given range and alignment. 395 * Find empty slot in the resource tree with the given range and
396 * alignment constraints
388 */ 397 */
389static int find_resource(struct resource *root, struct resource *new, 398static int __find_resource(struct resource *root, struct resource *old,
390 resource_size_t size, resource_size_t min, 399 struct resource *new,
391 resource_size_t max, resource_size_t align, 400 resource_size_t size,
392 resource_size_t (*alignf)(void *, 401 struct resource_constraint *constraint)
393 const struct resource *,
394 resource_size_t,
395 resource_size_t),
396 void *alignf_data)
397{ 402{
398 struct resource *this = root->child; 403 struct resource *this = root->child;
399 struct resource tmp = *new, avail, alloc; 404 struct resource tmp = *new, avail, alloc;
@@ -404,25 +409,26 @@ static int find_resource(struct resource *root, struct resource *new,
404 * Skip past an allocated resource that starts at 0, since the assignment 409 * Skip past an allocated resource that starts at 0, since the assignment
405 * of this->start - 1 to tmp->end below would cause an underflow. 410 * of this->start - 1 to tmp->end below would cause an underflow.
406 */ 411 */
407 if (this && this->start == 0) { 412 if (this && this->start == root->start) {
408 tmp.start = this->end + 1; 413 tmp.start = (this == old) ? old->start : this->end + 1;
409 this = this->sibling; 414 this = this->sibling;
410 } 415 }
411 for(;;) { 416 for(;;) {
412 if (this) 417 if (this)
413 tmp.end = this->start - 1; 418 tmp.end = (this == old) ? this->end : this->start - 1;
414 else 419 else
415 tmp.end = root->end; 420 tmp.end = root->end;
416 421
417 resource_clip(&tmp, min, max); 422 resource_clip(&tmp, constraint->min, constraint->max);
418 arch_remove_reservations(&tmp); 423 arch_remove_reservations(&tmp);
419 424
420 /* Check for overflow after ALIGN() */ 425 /* Check for overflow after ALIGN() */
421 avail = *new; 426 avail = *new;
422 avail.start = ALIGN(tmp.start, align); 427 avail.start = ALIGN(tmp.start, constraint->align);
423 avail.end = tmp.end; 428 avail.end = tmp.end;
424 if (avail.start >= tmp.start) { 429 if (avail.start >= tmp.start) {
425 alloc.start = alignf(alignf_data, &avail, size, align); 430 alloc.start = constraint->alignf(constraint->alignf_data, &avail,
431 size, constraint->align);
426 alloc.end = alloc.start + size - 1; 432 alloc.end = alloc.start + size - 1;
427 if (resource_contains(&avail, &alloc)) { 433 if (resource_contains(&avail, &alloc)) {
428 new->start = alloc.start; 434 new->start = alloc.start;
@@ -432,14 +438,75 @@ static int find_resource(struct resource *root, struct resource *new,
432 } 438 }
433 if (!this) 439 if (!this)
434 break; 440 break;
435 tmp.start = this->end + 1; 441 if (this != old)
442 tmp.start = this->end + 1;
436 this = this->sibling; 443 this = this->sibling;
437 } 444 }
438 return -EBUSY; 445 return -EBUSY;
439} 446}
440 447
448/*
449 * Find empty slot in the resource tree given range and alignment.
450 */
451static int find_resource(struct resource *root, struct resource *new,
452 resource_size_t size,
453 struct resource_constraint *constraint)
454{
455 return __find_resource(root, NULL, new, size, constraint);
456}
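
__find_resource() above walks the gaps between sibling resources, clips each gap to constraint->min/max, aligns the start, and checks that the aligned allocation still fits. A compact userspace model of that aligned gap search, using a sorted array instead of the resource tree and illustrative addresses:

/* Sketch only: a userspace model of __find_resource()'s gap search.
 * Allocated ranges live in a sorted array rather than a resource tree,
 * and the constraint carries min/max/align as in struct
 * resource_constraint.  All values are illustrative. */
#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start, end; };		/* inclusive, like struct resource */
struct constraint { uint64_t min, max, align; };

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

/* Find a gap of 'size' bytes between allocated ranges that satisfies c. */
static int find_gap(const struct range *busy, int n, uint64_t root_end,
		    uint64_t size, const struct constraint *c,
		    struct range *out)
{
	uint64_t start = 0;
	int i;

	for (i = 0; i <= n; i++) {
		uint64_t end = (i < n) ? busy[i].start - 1 : root_end;
		uint64_t s = start, e = end;

		/* resource_clip(): restrict the gap to [min, max]. */
		if (s < c->min) s = c->min;
		if (e > c->max) e = c->max;

		s = ALIGN_UP(s, c->align);
		/* s >= start guards against ALIGN_UP() overflowing. */
		if (s >= start && e >= s && e - s + 1 >= size) {
			out->start = s;
			out->end = s + size - 1;
			return 0;
		}
		if (i < n)
			start = busy[i].end + 1;
	}
	return -1;			/* -EBUSY in the kernel */
}

int main(void)
{
	struct range busy[] = { { 0x500, 0xfff }, { 0x3000, 0x3fff } };
	struct constraint c = { .min = 0x1000, .max = 0xffff, .align = 0x800 };
	struct range got;

	if (!find_gap(busy, 2, 0xffff, 0x1000, &c, &got))
		printf("found [%#llx-%#llx]\n",
		       (unsigned long long)got.start,
		       (unsigned long long)got.end);
	return 0;
}
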
457
441/** 458/**
442 * allocate_resource - allocate empty slot in the resource tree given range & alignment 459 * reallocate_resource - allocate a slot in the resource tree given range & alignment.
460 * The resource will be relocated if the new size cannot be reallocated in the
461 * current location.
462 *
463 * @root: root resource descriptor
464 * @old: resource descriptor desired by caller
465 * @newsize: new size of the resource descriptor
466 * @constraint: the size and alignment constraints to be met.
467 */
468int reallocate_resource(struct resource *root, struct resource *old,
469 resource_size_t newsize,
470 struct resource_constraint *constraint)
471{
472 int err=0;
473 struct resource new = *old;
474 struct resource *conflict;
475
476 write_lock(&resource_lock);
477
478 if ((err = __find_resource(root, old, &new, newsize, constraint)))
479 goto out;
480
481 if (resource_contains(&new, old)) {
482 old->start = new.start;
483 old->end = new.end;
484 goto out;
485 }
486
487 if (old->child) {
488 err = -EBUSY;
489 goto out;
490 }
491
492 if (resource_contains(old, &new)) {
493 old->start = new.start;
494 old->end = new.end;
495 } else {
496 __release_resource(old);
497 *old = new;
498 conflict = __request_resource(root, old);
499 BUG_ON(conflict);
500 }
501out:
502 write_unlock(&resource_lock);
503 return err;
504}
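
reallocate_resource() above distinguishes three outcomes for the candidate returned by __find_resource(): grow in place when the new region contains the old one, shrink in place when the old region contains the new one, and release-and-re-request otherwise (refused with -EBUSY if the resource has children). A tiny sketch of that containment-driven decision, with inclusive ranges as in struct resource:

/* Sketch only: the containment checks that drive reallocate_resource()'s
 * grow-in-place / shrink-in-place / relocate decision.  Ranges are
 * inclusive, like struct resource. */
#include <stdio.h>

struct range { unsigned long start, end; };

static int contains(const struct range *a, const struct range *b)
{
	return a->start <= b->start && a->end >= b->end;
}

static const char *decide(const struct range *old_r, const struct range *new_r,
			  int has_children)
{
	if (contains(new_r, old_r))
		return "grow in place";
	if (has_children)
		return "refuse (-EBUSY): children would fall outside";
	if (contains(old_r, new_r))
		return "shrink in place";
	return "release and re-request at the new location";
}

int main(void)
{
	struct range old_r = { 0x1000, 0x1fff };
	struct range grown = { 0x1000, 0x2fff };
	struct range moved = { 0x8000, 0x8fff };

	printf("%s\n", decide(&old_r, &grown, 0));
	printf("%s\n", decide(&old_r, &moved, 1));
	return 0;
}
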
505
506
507/**
508 * allocate_resource - allocate empty slot in the resource tree given range & alignment.
509 * The resource will be reallocated with a new size if it was already allocated
443 * @root: root resource descriptor 510 * @root: root resource descriptor
444 * @new: resource descriptor desired by caller 511 * @new: resource descriptor desired by caller
445 * @size: requested resource region size 512 * @size: requested resource region size
@@ -459,12 +526,25 @@ int allocate_resource(struct resource *root, struct resource *new,
459 void *alignf_data) 526 void *alignf_data)
460{ 527{
461 int err; 528 int err;
529 struct resource_constraint constraint;
462 530
463 if (!alignf) 531 if (!alignf)
464 alignf = simple_align_resource; 532 alignf = simple_align_resource;
465 533
534 constraint.min = min;
535 constraint.max = max;
536 constraint.align = align;
537 constraint.alignf = alignf;
538 constraint.alignf_data = alignf_data;
539
540 if ( new->parent ) {
541 /* resource is already allocated, try reallocating with
542 the new constraints */
543 return reallocate_resource(root, new, size, &constraint);
544 }
545
466 write_lock(&resource_lock); 546 write_lock(&resource_lock);
467 err = find_resource(root, new, size, min, max, align, alignf, alignf_data); 547 err = find_resource(root, new, size, &constraint);
468 if (err >= 0 && __request_resource(root, new)) 548 if (err >= 0 && __request_resource(root, new))
469 err = -EBUSY; 549 err = -EBUSY;
470 write_unlock(&resource_lock); 550 write_unlock(&resource_lock);
diff --git a/kernel/sched.c b/kernel/sched.c
index cbb3a0eee58e..3dc716f6d8ad 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -292,8 +292,8 @@ static DEFINE_SPINLOCK(task_group_lock);
292 * (The default weight is 1024 - so there's no practical 292 * (The default weight is 1024 - so there's no practical
293 * limitation from this.) 293 * limitation from this.)
294 */ 294 */
295#define MIN_SHARES 2 295#define MIN_SHARES (1UL << 1)
296#define MAX_SHARES (1UL << (18 + SCHED_LOAD_RESOLUTION)) 296#define MAX_SHARES (1UL << 18)
297 297
298static int root_task_group_load = ROOT_TASK_GROUP_LOAD; 298static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
299#endif 299#endif
@@ -605,10 +605,10 @@ static inline int cpu_of(struct rq *rq)
605/* 605/*
606 * Return the group to which this task belongs. 606 * Return the group to which this task belongs.
607 * 607 *
608 * We use task_subsys_state_check() and extend the RCU verification 608 * We use task_subsys_state_check() and extend the RCU verification with
609 * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach() 609 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
610 * holds that lock for each task it moves into the cgroup. Therefore 610 * task it moves into the cgroup. Therefore by holding either of those locks,
611 * by holding that lock, we pin the task to the current cgroup. 611 * we pin the task to the current cgroup.
612 */ 612 */
613static inline struct task_group *task_group(struct task_struct *p) 613static inline struct task_group *task_group(struct task_struct *p)
614{ 614{
@@ -616,7 +616,8 @@ static inline struct task_group *task_group(struct task_struct *p)
616 struct cgroup_subsys_state *css; 616 struct cgroup_subsys_state *css;
617 617
618 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 618 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
619 lockdep_is_held(&p->pi_lock)); 619 lockdep_is_held(&p->pi_lock) ||
620 lockdep_is_held(&task_rq(p)->lock));
620 tg = container_of(css, struct task_group, css); 621 tg = container_of(css, struct task_group, css);
621 622
622 return autogroup_task_group(p, tg); 623 return autogroup_task_group(p, tg);
@@ -2200,6 +2201,16 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2200 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 2201 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2201 2202
2202#ifdef CONFIG_LOCKDEP 2203#ifdef CONFIG_LOCKDEP
2204 /*
2205 * The caller should hold either p->pi_lock or rq->lock, when changing
2206 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
2207 *
2208 * sched_move_task() holds both and thus holding either pins the cgroup,
2209 * see set_task_rq().
2210 *
2211 * Furthermore, all task_rq users should acquire both locks, see
2212 * task_rq_lock().
2213 */
2203 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || 2214 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2204 lockdep_is_held(&task_rq(p)->lock))); 2215 lockdep_is_held(&task_rq(p)->lock)));
2205#endif 2216#endif
@@ -2447,6 +2458,10 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2447 } 2458 }
2448 rcu_read_unlock(); 2459 rcu_read_unlock();
2449 } 2460 }
2461
2462 if (wake_flags & WF_MIGRATED)
2463 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2464
2450#endif /* CONFIG_SMP */ 2465#endif /* CONFIG_SMP */
2451 2466
2452 schedstat_inc(rq, ttwu_count); 2467 schedstat_inc(rq, ttwu_count);
@@ -2455,9 +2470,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2455 if (wake_flags & WF_SYNC) 2470 if (wake_flags & WF_SYNC)
2456 schedstat_inc(p, se.statistics.nr_wakeups_sync); 2471 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2457 2472
2458 if (cpu != task_cpu(p))
2459 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2460
2461#endif /* CONFIG_SCHEDSTATS */ 2473#endif /* CONFIG_SCHEDSTATS */
2462} 2474}
2463 2475
@@ -2600,6 +2612,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
2600 2612
2601#if defined(CONFIG_SMP) 2613#if defined(CONFIG_SMP)
2602 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { 2614 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
2615 sched_clock_cpu(cpu); /* sync clocks x-cpu */
2603 ttwu_queue_remote(p, cpu); 2616 ttwu_queue_remote(p, cpu);
2604 return; 2617 return;
2605 } 2618 }
@@ -2674,8 +2687,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2674 p->sched_class->task_waking(p); 2687 p->sched_class->task_waking(p);
2675 2688
2676 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2689 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2677 if (task_cpu(p) != cpu) 2690 if (task_cpu(p) != cpu) {
2691 wake_flags |= WF_MIGRATED;
2678 set_task_cpu(p, cpu); 2692 set_task_cpu(p, cpu);
2693 }
2679#endif /* CONFIG_SMP */ 2694#endif /* CONFIG_SMP */
2680 2695
2681 ttwu_queue(p, cpu); 2696 ttwu_queue(p, cpu);
@@ -7742,6 +7757,9 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
7742#endif 7757#endif
7743#endif 7758#endif
7744 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 7759 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7760#ifndef CONFIG_64BIT
7761 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
7762#endif
7745} 7763}
7746 7764
7747static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) 7765static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
@@ -8435,10 +8453,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8435 if (!tg->se[0]) 8453 if (!tg->se[0])
8436 return -EINVAL; 8454 return -EINVAL;
8437 8455
8438 if (shares < MIN_SHARES) 8456 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8439 shares = MIN_SHARES;
8440 else if (shares > MAX_SHARES)
8441 shares = MAX_SHARES;
8442 8457
8443 mutex_lock(&shares_mutex); 8458 mutex_lock(&shares_mutex);
8444 if (tg->shares == shares) 8459 if (tg->shares == shares)
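
The sched_group_set_shares() hunk replaces the open-coded bounds checks with clamp() between scale_load(MIN_SHARES) and scale_load(MAX_SHARES). A small sketch of that clamp-and-scale arithmetic; the 10-bit SCHED_LOAD_RESOLUTION below is an assumed example value, not something taken from this diff:

/* Sketch only: the clamp-and-scale arithmetic used by
 * sched_group_set_shares() after this hunk.  SCHED_LOAD_RESOLUTION is
 * taken as 10 bits purely for illustration. */
#include <stdio.h>

#define SCHED_LOAD_RESOLUTION	10
#define scale_load(w)		((unsigned long)(w) << SCHED_LOAD_RESOLUTION)

#define MIN_SHARES		(1UL << 1)
#define MAX_SHARES		(1UL << 18)

static unsigned long clamp_ul(unsigned long v, unsigned long lo,
			      unsigned long hi)
{
	return v < lo ? lo : v > hi ? hi : v;
}

int main(void)
{
	unsigned long requested[] = { 0, 1024, 1UL << 30 };
	int i;

	for (i = 0; i < 3; i++) {
		unsigned long shares = clamp_ul(requested[i],
						scale_load(MIN_SHARES),
						scale_load(MAX_SHARES));
		printf("request %lu -> shares %lu\n", requested[i], shares);
	}
	return 0;
}
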
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 88725c939e0b..10d018212bab 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1096,7 +1096,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
1096 * to move current somewhere else, making room for our non-migratable 1096 * to move current somewhere else, making room for our non-migratable
1097 * task. 1097 * task.
1098 */ 1098 */
1099 if (p->prio == rq->curr->prio && !need_resched()) 1099 if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1100 check_preempt_equal_prio(rq, p); 1100 check_preempt_equal_prio(rq, p);
1101#endif 1101#endif
1102} 1102}
@@ -1239,6 +1239,10 @@ static int find_lowest_rq(struct task_struct *task)
1239 int this_cpu = smp_processor_id(); 1239 int this_cpu = smp_processor_id();
1240 int cpu = task_cpu(task); 1240 int cpu = task_cpu(task);
1241 1241
1242 /* Make sure the mask is initialized first */
1243 if (unlikely(!lowest_mask))
1244 return -1;
1245
1242 if (task->rt.nr_cpus_allowed == 1) 1246 if (task->rt.nr_cpus_allowed == 1)
1243 return -1; /* No other targets possible */ 1247 return -1; /* No other targets possible */
1244 1248
diff --git a/kernel/signal.c b/kernel/signal.c
index 86c32b884f8e..ff7678603328 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2365,7 +2365,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
2365/** 2365/**
2366 * sys_rt_sigprocmask - change the list of currently blocked signals 2366 * sys_rt_sigprocmask - change the list of currently blocked signals
2367 * @how: whether to add, remove, or set signals 2367 * @how: whether to add, remove, or set signals
2368 * @set: stores pending signals 2368 * @nset: stores pending signals
2369 * @oset: previous value of signal mask if non-null 2369 * @oset: previous value of signal mask if non-null
2370 * @sigsetsize: size of sigset_t type 2370 * @sigsetsize: size of sigset_t type
2371 */ 2371 */
diff --git a/kernel/smp.c b/kernel/smp.c
index 73a195193558..fb67dfa8394e 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -74,7 +74,7 @@ static struct notifier_block __cpuinitdata hotplug_cfd_notifier = {
74 .notifier_call = hotplug_cfd, 74 .notifier_call = hotplug_cfd,
75}; 75};
76 76
77static int __cpuinit init_call_single_data(void) 77void __init call_function_init(void)
78{ 78{
79 void *cpu = (void *)(long)smp_processor_id(); 79 void *cpu = (void *)(long)smp_processor_id();
80 int i; 80 int i;
@@ -88,10 +88,7 @@ static int __cpuinit init_call_single_data(void)
88 88
89 hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); 89 hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu);
90 register_cpu_notifier(&hotplug_cfd_notifier); 90 register_cpu_notifier(&hotplug_cfd_notifier);
91
92 return 0;
93} 91}
94early_initcall(init_call_single_data);
95 92
96/* 93/*
97 * csd_lock/csd_unlock used to serialize access to per-cpu csd resources 94 * csd_lock/csd_unlock used to serialize access to per-cpu csd resources
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 13960170cad4..40cf63ddd4b3 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -58,7 +58,7 @@ DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59char *softirq_to_name[NR_SOFTIRQS] = { 59char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
61 "TASKLET", "SCHED", "HRTIMER" 61 "TASKLET", "SCHED", "HRTIMER", "RCU"
62}; 62};
63 63
64/* 64/*
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4fc92445a29c..f175d98bd355 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -938,6 +938,12 @@ static struct ctl_table kern_table[] = {
938 }, 938 },
939#endif 939#endif
940#ifdef CONFIG_PERF_EVENTS 940#ifdef CONFIG_PERF_EVENTS
941 /*
942 * User-space scripts rely on the existence of this file
943 * as a feature check for perf_events being enabled.
944 *
945 * So it's an ABI, do not remove!
946 */
941 { 947 {
942 .procname = "perf_event_paranoid", 948 .procname = "perf_event_paranoid",
943 .data = &sysctl_perf_event_paranoid, 949 .data = &sysctl_perf_event_paranoid,
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 9ffea360a778..fc0f22005417 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -285,16 +285,18 @@ ret:
285static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) 285static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
286{ 286{
287 struct listener_list *listeners; 287 struct listener_list *listeners;
288 struct listener *s, *tmp; 288 struct listener *s, *tmp, *s2;
289 unsigned int cpu; 289 unsigned int cpu;
290 290
291 if (!cpumask_subset(mask, cpu_possible_mask)) 291 if (!cpumask_subset(mask, cpu_possible_mask))
292 return -EINVAL; 292 return -EINVAL;
293 293
294 s = NULL;
294 if (isadd == REGISTER) { 295 if (isadd == REGISTER) {
295 for_each_cpu(cpu, mask) { 296 for_each_cpu(cpu, mask) {
296 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, 297 if (!s)
297 cpu_to_node(cpu)); 298 s = kmalloc_node(sizeof(struct listener),
299 GFP_KERNEL, cpu_to_node(cpu));
298 if (!s) 300 if (!s)
299 goto cleanup; 301 goto cleanup;
300 s->pid = pid; 302 s->pid = pid;
@@ -303,9 +305,16 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
303 305
304 listeners = &per_cpu(listener_array, cpu); 306 listeners = &per_cpu(listener_array, cpu);
305 down_write(&listeners->sem); 307 down_write(&listeners->sem);
308 list_for_each_entry_safe(s2, tmp, &listeners->list, list) {
309 if (s2->pid == pid)
310 goto next_cpu;
311 }
306 list_add(&s->list, &listeners->list); 312 list_add(&s->list, &listeners->list);
313 s = NULL;
314next_cpu:
307 up_write(&listeners->sem); 315 up_write(&listeners->sem);
308 } 316 }
317 kfree(s);
309 return 0; 318 return 0;
310 } 319 }
311 320
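
The taskstats hunk makes listener registration reuse a single allocation across CPUs, skip CPUs where the pid is already registered, and free any leftover allocation at the end. A userspace sketch of that allocate-once / skip-duplicates / free-leftover pattern over plain linked lists (the per-CPU array and pid values are stand-ins):

/* Sketch only: models the add_del_listener() REGISTER path after this
 * hunk -- reuse a single allocation across CPUs, skip CPUs where the
 * pid is already on the list, free whatever is left over. */
#include <stdio.h>
#include <stdlib.h>

struct listener {
	int pid;
	struct listener *next;
};

#define NR_CPUS 4
static struct listener *per_cpu_list[NR_CPUS];

static int listener_registered(struct listener *head, int pid)
{
	for (; head; head = head->next)
		if (head->pid == pid)
			return 1;
	return 0;
}

static int register_listener(int pid)
{
	struct listener *s = NULL;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!s)
			s = malloc(sizeof(*s));
		if (!s)
			return -1;		/* -ENOMEM in the kernel */
		s->pid = pid;

		if (listener_registered(per_cpu_list[cpu], pid))
			continue;		/* the "goto next_cpu" case */

		s->next = per_cpu_list[cpu];
		per_cpu_list[cpu] = s;
		s = NULL;			/* consumed; allocate anew */
	}
	free(s);				/* leftover, if any */
	return 0;
}

int main(void)
{
	register_listener(42);
	register_listener(42);		/* second call adds nothing */
	printf("cpu0 head pid = %d\n", per_cpu_list[0]->pid);
	return 0;
}
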
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 2d966244ea60..59f369f98a04 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -42,15 +42,75 @@ static struct alarm_base {
42 clockid_t base_clockid; 42 clockid_t base_clockid;
43} alarm_bases[ALARM_NUMTYPE]; 43} alarm_bases[ALARM_NUMTYPE];
44 44
45/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */
46static ktime_t freezer_delta;
47static DEFINE_SPINLOCK(freezer_delta_lock);
48
45#ifdef CONFIG_RTC_CLASS 49#ifdef CONFIG_RTC_CLASS
46/* rtc timer and device for setting alarm wakeups at suspend */ 50/* rtc timer and device for setting alarm wakeups at suspend */
47static struct rtc_timer rtctimer; 51static struct rtc_timer rtctimer;
48static struct rtc_device *rtcdev; 52static struct rtc_device *rtcdev;
49#endif 53static DEFINE_SPINLOCK(rtcdev_lock);
50 54
51/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */ 55/**
52static ktime_t freezer_delta; 56 * has_wakealarm - check rtc device has wakealarm ability
53static DEFINE_SPINLOCK(freezer_delta_lock); 57 * @dev: current device
58 * @name_ptr: name to be returned
59 *
60 * This helper function checks to see if the rtc device can wake
61 * from suspend.
62 */
63static int has_wakealarm(struct device *dev, void *name_ptr)
64{
65 struct rtc_device *candidate = to_rtc_device(dev);
66
67 if (!candidate->ops->set_alarm)
68 return 0;
69 if (!device_may_wakeup(candidate->dev.parent))
70 return 0;
71
72 *(const char **)name_ptr = dev_name(dev);
73 return 1;
74}
75
76/**
77 * alarmtimer_get_rtcdev - Return selected rtcdevice
78 *
79 * This function returns the rtc device to use for wakealarms.
80 * If one has not already been chosen, it checks to see if a
81 * functional rtc device is available.
82 */
83static struct rtc_device *alarmtimer_get_rtcdev(void)
84{
85 struct device *dev;
86 char *str;
87 unsigned long flags;
88 struct rtc_device *ret;
89
90 spin_lock_irqsave(&rtcdev_lock, flags);
91 if (!rtcdev) {
92 /* Find an rtc device and init the rtc_timer */
93 dev = class_find_device(rtc_class, NULL, &str, has_wakealarm);
94 /* If we have a device then str is valid. See has_wakealarm() */
95 if (dev) {
96 rtcdev = rtc_class_open(str);
97 /*
98 * Drop the reference we got in class_find_device,
99 * rtc_open takes its own.
100 */
101 put_device(dev);
102 rtc_timer_init(&rtctimer, NULL, NULL);
103 }
104 }
105 ret = rtcdev;
106 spin_unlock_irqrestore(&rtcdev_lock, flags);
107
108 return ret;
109}
110#else
111#define alarmtimer_get_rtcdev() (0)
112#define rtcdev (0)
113#endif
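
alarmtimer_get_rtcdev() above replaces the old late_initcall probe with a lazy, lock-protected lookup: the first caller that finds no cached device searches for one and caches it, and later callers just read the pointer. A userspace sketch of the same lazy-singleton-under-a-lock shape, with a pthread mutex in place of rtcdev_lock and a made-up probe function:

/* Sketch only: the lazy, lock-protected device lookup pattern used by
 * alarmtimer_get_rtcdev().  probe_rtc_device() is a made-up stand-in
 * for class_find_device()/rtc_class_open().  Build with -pthread. */
#include <pthread.h>
#include <stdio.h>

struct rtc_device { const char *name; };

static struct rtc_device *rtcdev;
static pthread_mutex_t rtcdev_lock = PTHREAD_MUTEX_INITIALIZER;

static struct rtc_device *probe_rtc_device(void)
{
	static struct rtc_device fake = { .name = "rtc0" };
	return &fake;			/* pretend a wake-capable RTC exists */
}

static struct rtc_device *get_rtcdev(void)
{
	struct rtc_device *ret;

	pthread_mutex_lock(&rtcdev_lock);
	if (!rtcdev)
		rtcdev = probe_rtc_device();	/* first caller pays the cost */
	ret = rtcdev;
	pthread_mutex_unlock(&rtcdev_lock);
	return ret;
}

int main(void)
{
	printf("using %s\n", get_rtcdev()->name);
	printf("using %s (cached)\n", get_rtcdev()->name);
	return 0;
}
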
54 114
55 115
56/** 116/**
@@ -166,6 +226,7 @@ static int alarmtimer_suspend(struct device *dev)
166 struct rtc_time tm; 226 struct rtc_time tm;
167 ktime_t min, now; 227 ktime_t min, now;
168 unsigned long flags; 228 unsigned long flags;
229 struct rtc_device *rtc;
169 int i; 230 int i;
170 231
171 spin_lock_irqsave(&freezer_delta_lock, flags); 232 spin_lock_irqsave(&freezer_delta_lock, flags);
@@ -173,8 +234,9 @@ static int alarmtimer_suspend(struct device *dev)
173 freezer_delta = ktime_set(0, 0); 234 freezer_delta = ktime_set(0, 0);
174 spin_unlock_irqrestore(&freezer_delta_lock, flags); 235 spin_unlock_irqrestore(&freezer_delta_lock, flags);
175 236
237 rtc = rtcdev;
176 /* If we have no rtcdev, just return */ 238 /* If we have no rtcdev, just return */
177 if (!rtcdev) 239 if (!rtc)
178 return 0; 240 return 0;
179 241
180 /* Find the soonest timer to expire*/ 242 /* Find the soonest timer to expire*/
@@ -199,12 +261,12 @@ static int alarmtimer_suspend(struct device *dev)
199 WARN_ON(min.tv64 < NSEC_PER_SEC); 261 WARN_ON(min.tv64 < NSEC_PER_SEC);
200 262
201 /* Setup an rtc timer to fire that far in the future */ 263 /* Setup an rtc timer to fire that far in the future */
202 rtc_timer_cancel(rtcdev, &rtctimer); 264 rtc_timer_cancel(rtc, &rtctimer);
203 rtc_read_time(rtcdev, &tm); 265 rtc_read_time(rtc, &tm);
204 now = rtc_tm_to_ktime(tm); 266 now = rtc_tm_to_ktime(tm);
205 now = ktime_add(now, min); 267 now = ktime_add(now, min);
206 268
207 rtc_timer_start(rtcdev, &rtctimer, now, ktime_set(0, 0)); 269 rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
208 270
209 return 0; 271 return 0;
210} 272}
@@ -322,6 +384,9 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
322{ 384{
323 clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; 385 clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
324 386
387 if (!alarmtimer_get_rtcdev())
388 return -ENOTSUPP;
389
325 return hrtimer_get_res(baseid, tp); 390 return hrtimer_get_res(baseid, tp);
326} 391}
327 392
@@ -336,6 +401,9 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
336{ 401{
337 struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; 402 struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
338 403
404 if (!alarmtimer_get_rtcdev())
405 return -ENOTSUPP;
406
339 *tp = ktime_to_timespec(base->gettime()); 407 *tp = ktime_to_timespec(base->gettime());
340 return 0; 408 return 0;
341} 409}
@@ -351,6 +419,9 @@ static int alarm_timer_create(struct k_itimer *new_timer)
351 enum alarmtimer_type type; 419 enum alarmtimer_type type;
352 struct alarm_base *base; 420 struct alarm_base *base;
353 421
422 if (!alarmtimer_get_rtcdev())
423 return -ENOTSUPP;
424
354 if (!capable(CAP_WAKE_ALARM)) 425 if (!capable(CAP_WAKE_ALARM))
355 return -EPERM; 426 return -EPERM;
356 427
@@ -385,6 +456,9 @@ static void alarm_timer_get(struct k_itimer *timr,
385 */ 456 */
386static int alarm_timer_del(struct k_itimer *timr) 457static int alarm_timer_del(struct k_itimer *timr)
387{ 458{
459 if (!rtcdev)
460 return -ENOTSUPP;
461
388 alarm_cancel(&timr->it.alarmtimer); 462 alarm_cancel(&timr->it.alarmtimer);
389 return 0; 463 return 0;
390} 464}
@@ -402,6 +476,9 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
402 struct itimerspec *new_setting, 476 struct itimerspec *new_setting,
403 struct itimerspec *old_setting) 477 struct itimerspec *old_setting)
404{ 478{
479 if (!rtcdev)
480 return -ENOTSUPP;
481
405 /* Save old values */ 482 /* Save old values */
406 old_setting->it_interval = 483 old_setting->it_interval =
407 ktime_to_timespec(timr->it.alarmtimer.period); 484 ktime_to_timespec(timr->it.alarmtimer.period);
@@ -541,6 +618,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
541 int ret = 0; 618 int ret = 0;
542 struct restart_block *restart; 619 struct restart_block *restart;
543 620
621 if (!alarmtimer_get_rtcdev())
622 return -ENOTSUPP;
623
544 if (!capable(CAP_WAKE_ALARM)) 624 if (!capable(CAP_WAKE_ALARM))
545 return -EPERM; 625 return -EPERM;
546 626
@@ -638,65 +718,3 @@ static int __init alarmtimer_init(void)
638} 718}
639device_initcall(alarmtimer_init); 719device_initcall(alarmtimer_init);
640 720
641#ifdef CONFIG_RTC_CLASS
642/**
643 * has_wakealarm - check rtc device has wakealarm ability
644 * @dev: current device
645 * @name_ptr: name to be returned
646 *
647 * This helper function checks to see if the rtc device can wake
648 * from suspend.
649 */
650static int __init has_wakealarm(struct device *dev, void *name_ptr)
651{
652 struct rtc_device *candidate = to_rtc_device(dev);
653
654 if (!candidate->ops->set_alarm)
655 return 0;
656 if (!device_may_wakeup(candidate->dev.parent))
657 return 0;
658
659 *(const char **)name_ptr = dev_name(dev);
660 return 1;
661}
662
663/**
664 * alarmtimer_init_late - Late initializing of alarmtimer code
665 *
666 * This function locates a rtc device to use for wakealarms.
667 * Run as late_initcall to make sure rtc devices have been
668 * registered.
669 */
670static int __init alarmtimer_init_late(void)
671{
672 struct device *dev;
673 char *str;
674
675 /* Find an rtc device and init the rtc_timer */
676 dev = class_find_device(rtc_class, NULL, &str, has_wakealarm);
677 /* If we have a device then str is valid. See has_wakealarm() */
678 if (dev) {
679 rtcdev = rtc_class_open(str);
680 /*
681 * Drop the reference we got in class_find_device,
682 * rtc_open takes its own.
683 */
684 put_device(dev);
685 }
686 if (!rtcdev) {
687 printk(KERN_WARNING "No RTC device found, ALARM timers will"
688 " not wake from suspend");
689 }
690 rtc_timer_init(&rtctimer, NULL, NULL);
691
692 return 0;
693}
694#else
695static int __init alarmtimer_init_late(void)
696{
697 printk(KERN_WARNING "Kernel not built with RTC support, ALARM timers"
698 " will not wake from suspend");
699 return 0;
700}
701#endif
702late_initcall(alarmtimer_init_late);
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index c027d4f602f1..e4c699dfa4e8 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -182,7 +182,10 @@ void clockevents_register_device(struct clock_event_device *dev)
182 unsigned long flags; 182 unsigned long flags;
183 183
184 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 184 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
185 BUG_ON(!dev->cpumask); 185 if (!dev->cpumask) {
186 WARN_ON(num_possible_cpus() > 1);
187 dev->cpumask = cpumask_of(smp_processor_id());
188 }
186 189
187 raw_spin_lock_irqsave(&clockevents_lock, flags); 190 raw_spin_lock_irqsave(&clockevents_lock, flags);
188 191
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 1c95fd677328..e0980f0d9a0a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -185,7 +185,6 @@ static struct clocksource *watchdog;
185static struct timer_list watchdog_timer; 185static struct timer_list watchdog_timer;
186static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); 186static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
187static DEFINE_SPINLOCK(watchdog_lock); 187static DEFINE_SPINLOCK(watchdog_lock);
188static cycle_t watchdog_last;
189static int watchdog_running; 188static int watchdog_running;
190 189
191static int clocksource_watchdog_kthread(void *data); 190static int clocksource_watchdog_kthread(void *data);
@@ -254,11 +253,6 @@ static void clocksource_watchdog(unsigned long data)
254 if (!watchdog_running) 253 if (!watchdog_running)
255 goto out; 254 goto out;
256 255
257 wdnow = watchdog->read(watchdog);
258 wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask,
259 watchdog->mult, watchdog->shift);
260 watchdog_last = wdnow;
261
262 list_for_each_entry(cs, &watchdog_list, wd_list) { 256 list_for_each_entry(cs, &watchdog_list, wd_list) {
263 257
264 /* Clocksource already marked unstable? */ 258 /* Clocksource already marked unstable? */
@@ -268,19 +262,28 @@ static void clocksource_watchdog(unsigned long data)
268 continue; 262 continue;
269 } 263 }
270 264
265 local_irq_disable();
271 csnow = cs->read(cs); 266 csnow = cs->read(cs);
267 wdnow = watchdog->read(watchdog);
268 local_irq_enable();
272 269
273 /* Clocksource initialized ? */ 270 /* Clocksource initialized ? */
274 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { 271 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
275 cs->flags |= CLOCK_SOURCE_WATCHDOG; 272 cs->flags |= CLOCK_SOURCE_WATCHDOG;
276 cs->wd_last = csnow; 273 cs->wd_last = wdnow;
274 cs->cs_last = csnow;
277 continue; 275 continue;
278 } 276 }
279 277
280 /* Check the deviation from the watchdog clocksource. */ 278 wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask,
281 cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) & 279 watchdog->mult, watchdog->shift);
280
281 cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) &
282 cs->mask, cs->mult, cs->shift); 282 cs->mask, cs->mult, cs->shift);
283 cs->wd_last = csnow; 283 cs->cs_last = csnow;
284 cs->wd_last = wdnow;
285
286 /* Check the deviation from the watchdog clocksource. */
284 if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { 287 if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
285 clocksource_unstable(cs, cs_nsec - wd_nsec); 288 clocksource_unstable(cs, cs_nsec - wd_nsec);
286 continue; 289 continue;
@@ -318,7 +321,6 @@ static inline void clocksource_start_watchdog(void)
318 return; 321 return;
319 init_timer(&watchdog_timer); 322 init_timer(&watchdog_timer);
320 watchdog_timer.function = clocksource_watchdog; 323 watchdog_timer.function = clocksource_watchdog;
321 watchdog_last = watchdog->read(watchdog);
322 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; 324 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
323 add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); 325 add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
324 watchdog_running = 1; 326 watchdog_running = 1;
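
The clocksource hunk keeps per-clocksource wd_last/cs_last values, reads both clocks back to back with interrupts off, and retains the "(now - last) & mask" form so a wrapping counter still yields the right delta. A small sketch of that wrap-safe delta plus the cyc2ns(mult, shift) conversion, with arbitrary example values:

/* Sketch only: the wrap-safe "(now - last) & mask" delta plus the
 * cyc2ns(mult, shift) conversion used by the clocksource watchdog.
 * The mask, mult and shift values are arbitrary examples. */
#include <stdint.h>
#include <stdio.h>

static uint64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
	return (cycles * mult) >> shift;	/* clocksource_cyc2ns() shape */
}

int main(void)
{
	/* A 32-bit counter that wrapped between the two reads. */
	uint64_t mask = 0xffffffffULL;
	uint64_t last = 0xfffffff0ULL, now = 0x00000010ULL;
	uint32_t mult = 1000, shift = 0;	/* pretend 1 cycle == 1 us */

	uint64_t delta = (now - last) & mask;	/* 0x20, despite the wrap */

	printf("delta = %llu cycles, %llu ns\n",
	       (unsigned long long)delta,
	       (unsigned long long)cyc2ns(delta, mult, shift));
	return 0;
}
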
diff --git a/kernel/timer.c b/kernel/timer.c
index fd6198692b57..8cff36119e4d 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -749,16 +749,15 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
749 unsigned long expires_limit, mask; 749 unsigned long expires_limit, mask;
750 int bit; 750 int bit;
751 751
752 expires_limit = expires;
753
754 if (timer->slack >= 0) { 752 if (timer->slack >= 0) {
755 expires_limit = expires + timer->slack; 753 expires_limit = expires + timer->slack;
756 } else { 754 } else {
757 unsigned long now = jiffies; 755 long delta = expires - jiffies;
756
757 if (delta < 256)
758 return expires;
758 759
759 /* No slack, if already expired else auto slack 0.4% */ 760 expires_limit = expires + delta / 256;
760 if (time_after(expires, now))
761 expires_limit = expires + (expires - now)/256;
762 } 761 }
763 mask = expires ^ expires_limit; 762 mask = expires ^ expires_limit;
764 if (mask == 0) 763 if (mask == 0)
@@ -795,6 +794,8 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
795 */ 794 */
796int mod_timer(struct timer_list *timer, unsigned long expires) 795int mod_timer(struct timer_list *timer, unsigned long expires)
797{ 796{
797 expires = apply_slack(timer, expires);
798
798 /* 799 /*
799 * This is a common optimization triggered by the 800 * This is a common optimization triggered by the
800 * networking code - if the timer is re-modified 801 * networking code - if the timer is re-modified
@@ -803,8 +804,6 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
803 if (timer_pending(timer) && timer->expires == expires) 804 if (timer_pending(timer) && timer->expires == expires)
804 return 1; 805 return 1;
805 806
806 expires = apply_slack(timer, expires);
807
808 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); 807 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
809} 808}
810EXPORT_SYMBOL(mod_timer); 809EXPORT_SYMBOL(mod_timer);
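
The apply_slack() change gives short timers (delta < 256 jiffies) no automatic slack and otherwise caps the slack at delta/256, about 0.4%; the surrounding code then rounds the expiry at the highest bit where expires and the limit differ so nearby timers coalesce. A userspace re-implementation of that arithmetic; the rounding tail is reconstructed from the surrounding kernel code rather than shown in this hunk, so treat it as an approximation:

/* Sketch only: the apply_slack() arithmetic for automatic (negative)
 * slack after this hunk.  'now' stands in for jiffies and a portable
 * fls() replaces the kernel bit helpers. */
#include <stdio.h>

static int fls_ul(unsigned long x)	/* highest set bit, 1-based; 0 if none */
{
	int bit = 0;

	while (x) {
		bit++;
		x >>= 1;
	}
	return bit;
}

static unsigned long apply_auto_slack(unsigned long expires, unsigned long now)
{
	unsigned long expires_limit, mask;
	long delta = (long)(expires - now);
	int bit;

	if (delta < 256)
		return expires;			/* too close: no slack */

	expires_limit = expires + delta / 256;	/* ~0.4% of the delay */

	mask = expires ^ expires_limit;
	if (mask == 0)
		return expires;

	/* Clear the bits below the highest bit where the two values
	 * differ, coarsening the expiry so nearby timers coalesce. */
	bit = fls_ul(mask) - 1;
	mask = (1UL << bit) - 1;
	return expires_limit & ~mask;
}

int main(void)
{
	unsigned long now = 100000;

	printf("expires %lu -> %lu\n", now + 10000,
	       apply_auto_slack(now + 10000, now));
	printf("expires %lu -> %lu\n", now + 100,
	       apply_auto_slack(now + 100, now));
	return 0;
}
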
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1ee417fcbfa5..908038f57440 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2740,7 +2740,7 @@ static int ftrace_process_regex(struct ftrace_hash *hash,
2740{ 2740{
2741 char *func, *command, *next = buff; 2741 char *func, *command, *next = buff;
2742 struct ftrace_func_command *p; 2742 struct ftrace_func_command *p;
2743 int ret; 2743 int ret = -EINVAL;
2744 2744
2745 func = strsep(&next, ":"); 2745 func = strsep(&next, ":");
2746 2746
@@ -3330,6 +3330,7 @@ static int ftrace_process_locs(struct module *mod,
3330{ 3330{
3331 unsigned long *p; 3331 unsigned long *p;
3332 unsigned long addr; 3332 unsigned long addr;
3333 unsigned long flags;
3333 3334
3334 mutex_lock(&ftrace_lock); 3335 mutex_lock(&ftrace_lock);
3335 p = start; 3336 p = start;
@@ -3346,7 +3347,13 @@ static int ftrace_process_locs(struct module *mod,
3346 ftrace_record_ip(addr); 3347 ftrace_record_ip(addr);
3347 } 3348 }
3348 3349
3350 /*
3351 * Disable interrupts to prevent interrupts from executing
3352 * code that is being modified.
3353 */
3354 local_irq_save(flags);
3349 ftrace_update_code(mod); 3355 ftrace_update_code(mod);
3356 local_irq_restore(flags);
3350 mutex_unlock(&ftrace_lock); 3357 mutex_unlock(&ftrace_lock);
3351 3358
3352 return 0; 3359 return 0;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index f925c45f0afa..27d13b36b8be 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1870,8 +1870,12 @@ fs_initcall(init_kprobe_trace);
1870 1870
1871#ifdef CONFIG_FTRACE_STARTUP_TEST 1871#ifdef CONFIG_FTRACE_STARTUP_TEST
1872 1872
1873static int kprobe_trace_selftest_target(int a1, int a2, int a3, 1873/*
1874 int a4, int a5, int a6) 1874 * The "__used" keeps gcc from removing the function symbol
1875 * from the kallsyms table.
1876 */
1877static __used int kprobe_trace_selftest_target(int a1, int a2, int a3,
1878 int a4, int a5, int a6)
1875{ 1879{
1876 return a1 + a2 + a3 + a4 + a5 + a6; 1880 return a1 + a2 + a3 + a4 + a5 + a6;
1877} 1881}
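
The __used annotation above keeps gcc from discarding the selftest target, whose only reference is the by-name lookup done by the kprobe selftest. A minimal standalone example of the same attribute; build it with -O2 and inspect the object with nm to see the symbol retained:

/* Sketch only: __attribute__((used)) keeps a symbol that nothing in the
 * translation unit references, the effect the __used annotation has on
 * kprobe_trace_selftest_target().  Try:
 *   gcc -O2 -c used_demo.c && nm used_demo.o | grep selftest_target
 */
#include <stdio.h>

static __attribute__((used)) int selftest_target(int a1, int a2, int a3,
						 int a4, int a5, int a6)
{
	return a1 + a2 + a3 + a4 + a5 + a6;
}

int main(void)
{
	/* main never calls selftest_target(); without "used" the static
	 * function could be dropped entirely at -O2. */
	puts("see nm output for selftest_target");
	return 0;
}
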
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index dff763b7baf1..1f06468a10d7 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -240,13 +240,10 @@ static const char **find_next(void *v, loff_t *pos)
240 const char **fmt = v; 240 const char **fmt = v;
241 int start_index; 241 int start_index;
242 242
243 if (!fmt)
244 fmt = __start___trace_bprintk_fmt + *pos;
245
246 start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; 243 start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt;
247 244
248 if (*pos < start_index) 245 if (*pos < start_index)
249 return fmt; 246 return __start___trace_bprintk_fmt + *pos;
250 247
251 return find_next_mod_format(start_index, v, fmt, pos); 248 return find_next_mod_format(start_index, v, fmt, pos);
252} 249}
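
The trace_printk change drops the redundant fmt fixup and indexes the built-in format section directly by *pos, handing off to module formats once the position passes the end of the section. A userspace sketch of that positional walk over a linker-section-like table; the arrays and the module fallback here are made up:

/* Sketch only: positional iteration over a contiguous table of format
 * strings, the way find_next() walks __start/__stop___trace_bprintk_fmt
 * after this hunk.  The arrays and the "module" fallback are made up. */
#include <stdio.h>

static const char *builtin_fmts[] = { "fmt A %d\n", "fmt B %s\n" };
static const char *module_fmts[]  = { "mod fmt %p\n" };

#define ARRAY_SIZE(a)	(sizeof(a) / sizeof((a)[0]))

static const char **find_next(long *pos)
{
	long start_index = ARRAY_SIZE(builtin_fmts);

	if (*pos < start_index)
		return &builtin_fmts[*pos];	/* index the section by *pos */

	/* Past the built-in section: hand off to the module formats. */
	if (*pos - start_index < (long)ARRAY_SIZE(module_fmts))
		return &module_fmts[*pos - start_index];
	return NULL;
}

int main(void)
{
	const char **fmt;
	long pos;

	for (pos = 0; (fmt = find_next(&pos)) != NULL; pos++)
		printf("format %ld: %s", pos, *fmt);
	return 0;
}
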