author    Thomas Gleixner <tglx@linutronix.de>    2011-01-27 06:29:13 -0500
committer Thomas Gleixner <tglx@linutronix.de>    2011-01-27 06:29:37 -0500
commit    f97b12cce6dea51880a6a89d4607c29c70a6a841 (patch)
tree      1f05f6d39975bd213e7506e8a73ae0a59188c75e /kernel
parent    ccaa8d657117bb1876d471bd91579d774106778d (diff)
parent    1bae4ce27c9c90344f23c65ea6966c50ffeae2f5 (diff)
Merge commit 'v2.6.38-rc2' into core/locking
Reason: Update to mainline before adding the locking cleanup

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 5
-rw-r--r--  kernel/audit.c | 2
-rw-r--r--  kernel/cgroup.c | 55
-rw-r--r--  kernel/cpu.c | 29
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 23
-rw-r--r--  kernel/exit.c | 25
-rw-r--r--  kernel/fork.c | 46
-rw-r--r--  kernel/freezer.c | 9
-rw-r--r--  kernel/futex.c | 300
-rw-r--r--  kernel/futex_compat.c | 3
-rw-r--r--  kernel/hrtimer.c | 87
-rw-r--r--  kernel/hw_breakpoint.c | 5
-rw-r--r--  kernel/irq/Kconfig | 3
-rw-r--r--  kernel/irq/handle.c | 111
-rw-r--r--  kernel/irq/irqdesc.c | 40
-rw-r--r--  kernel/irq/manage.c | 4
-rw-r--r--  kernel/irq/proc.c | 2
-rw-r--r--  kernel/irq_work.c | 22
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/kprobes.c | 573
-rw-r--r--  kernel/kthread.c | 13
-rw-r--r--  kernel/latencytop.c | 40
-rw-r--r--  kernel/lockdep.c | 18
-rw-r--r--  kernel/module.c | 183
-rw-r--r--  kernel/mutex.c | 2
-rw-r--r--  kernel/panic.c | 1
-rw-r--r--  kernel/perf_event.c | 844
-rw-r--r--  kernel/pm_qos_params.c | 4
-rw-r--r--  kernel/posix-cpu-timers.c | 12
-rw-r--r--  kernel/posix-timers.c | 10
-rw-r--r--  kernel/power/Kconfig | 9
-rw-r--r--  kernel/power/Makefile | 6
-rw-r--r--  kernel/power/hibernate.c | 33
-rw-r--r--  kernel/power/nvs.c | 136
-rw-r--r--  kernel/power/process.c | 8
-rw-r--r--  kernel/power/suspend.c | 14
-rw-r--r--  kernel/power/swap.c | 62
-rw-r--r--  kernel/power/user.c | 4
-rw-r--r--  kernel/printk.c | 75
-rw-r--r--  kernel/range.c | 2
-rw-r--r--  kernel/rcutiny.c | 106
-rw-r--r--  kernel/rcutiny_plugin.h | 433
-rw-r--r--  kernel/rcutorture.c | 270
-rw-r--r--  kernel/rcutree.c | 160
-rw-r--r--  kernel/rcutree.h | 61
-rw-r--r--  kernel/rcutree_plugin.h | 135
-rw-r--r--  kernel/rcutree_trace.c | 12
-rw-r--r--  kernel/resource.c | 104
-rw-r--r--  kernel/sched.c | 1033
-rw-r--r--  kernel/sched_autogroup.c | 270
-rw-r--r--  kernel/sched_autogroup.h | 36
-rw-r--r--  kernel/sched_clock.c | 2
-rw-r--r--  kernel/sched_debug.c | 123
-rw-r--r--  kernel/sched_fair.c | 401
-rw-r--r--  kernel/sched_features.h | 2
-rw-r--r--  kernel/sched_rt.c | 24
-rw-r--r--  kernel/sched_stoptask.c | 4
-rw-r--r--  kernel/smp.c | 75
-rw-r--r--  kernel/softirq.c | 65
-rw-r--r--  kernel/srcu.c | 19
-rw-r--r--  kernel/sys.c | 10
-rw-r--r--  kernel/sysctl.c | 93
-rw-r--r--  kernel/sysctl_binary.c | 3
-rw-r--r--  kernel/taskstats.c | 62
-rw-r--r--  kernel/time.c | 4
-rw-r--r--  kernel/time/clocksource.c | 11
-rw-r--r--  kernel/time/ntp.c | 425
-rw-r--r--  kernel/time/tick-common.c | 2
-rw-r--r--  kernel/time/tick-oneshot.c | 4
-rw-r--r--  kernel/time/timecompare.c | 5
-rw-r--r--  kernel/time/timekeeping.c | 56
-rw-r--r--  kernel/time/timer_list.c | 8
-rw-r--r--  kernel/timer.c | 58
-rw-r--r--  kernel/trace/Kconfig | 17
-rw-r--r--  kernel/trace/Makefile | 2
-rw-r--r--  kernel/trace/blktrace.c | 41
-rw-r--r--  kernel/trace/power-traces.c | 5
-rw-r--r--  kernel/trace/ring_buffer.c | 9
-rw-r--r--  kernel/trace/trace.c | 30
-rw-r--r--  kernel/trace/trace_entries.h | 2
-rw-r--r--  kernel/trace/trace_event_perf.c | 31
-rw-r--r--  kernel/trace/trace_events.c | 6
-rw-r--r--  kernel/trace/trace_export.c | 14
-rw-r--r--  kernel/trace/trace_irqsoff.c | 8
-rw-r--r--  kernel/trace/trace_selftest.c | 2
-rw-r--r--  kernel/trace/trace_syscalls.c | 33
-rw-r--r--  kernel/user.c | 1
-rw-r--r--  kernel/user_namespace.c | 15
-rw-r--r--  kernel/watchdog.c | 50
-rw-r--r--  kernel/workqueue.c | 87
90 files changed, 4867 insertions, 2384 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 0b5ff083fa22..353d3fe8ba33 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -43,7 +43,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
43obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o 43obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
44obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o 44obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
45obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 45obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
46obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o 46obj-$(CONFIG_SMP) += smp.o
47ifneq ($(CONFIG_SMP),y) 47ifneq ($(CONFIG_SMP),y)
48obj-y += up.o 48obj-y += up.o
49endif 49endif
@@ -100,6 +100,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
100obj-$(CONFIG_TRACING) += trace/ 100obj-$(CONFIG_TRACING) += trace/
101obj-$(CONFIG_X86_DS) += trace/ 101obj-$(CONFIG_X86_DS) += trace/
102obj-$(CONFIG_RING_BUFFER) += trace/ 102obj-$(CONFIG_RING_BUFFER) += trace/
103obj-$(CONFIG_TRACEPOINTS) += trace/
103obj-$(CONFIG_SMP) += sched_cpupri.o 104obj-$(CONFIG_SMP) += sched_cpupri.o
104obj-$(CONFIG_IRQ_WORK) += irq_work.o 105obj-$(CONFIG_IRQ_WORK) += irq_work.o
105obj-$(CONFIG_PERF_EVENTS) += perf_event.o 106obj-$(CONFIG_PERF_EVENTS) += perf_event.o
@@ -121,7 +122,7 @@ $(obj)/configs.o: $(obj)/config_data.h
121# config_data.h contains the same information as ikconfig.h but gzipped. 122# config_data.h contains the same information as ikconfig.h but gzipped.
122# Info from config_data can be extracted from /proc/config* 123# Info from config_data can be extracted from /proc/config*
123targets += config_data.gz 124targets += config_data.gz
124$(obj)/config_data.gz: .config FORCE 125$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
125 $(call if_changed,gzip) 126 $(call if_changed,gzip)
126 127
127quiet_cmd_ikconfiggz = IKCFG $@ 128quiet_cmd_ikconfiggz = IKCFG $@
diff --git a/kernel/audit.c b/kernel/audit.c
index 77770a034d59..e4956244ae50 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -400,7 +400,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
400 if (err < 0) { 400 if (err < 0) {
401 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ 401 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
402 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); 402 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
403 audit_log_lost("auditd dissapeared\n"); 403 audit_log_lost("auditd disappeared\n");
404 audit_pid = 0; 404 audit_pid = 0;
405 /* we might get lucky and get this in the next auditd */ 405 /* we might get lucky and get this in the next auditd */
406 audit_hold_skb(skb); 406 audit_hold_skb(skb);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 66a416b42c18..b24d7027b83c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -764,6 +764,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
764 */ 764 */
765 765
766static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); 766static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
767static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
767static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 768static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
768static int cgroup_populate_dir(struct cgroup *cgrp); 769static int cgroup_populate_dir(struct cgroup *cgrp);
769static const struct inode_operations cgroup_dir_inode_operations; 770static const struct inode_operations cgroup_dir_inode_operations;
@@ -860,6 +861,11 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
860 iput(inode); 861 iput(inode);
861} 862}
862 863
864static int cgroup_delete(const struct dentry *d)
865{
866 return 1;
867}
868
863static void remove_dir(struct dentry *d) 869static void remove_dir(struct dentry *d)
864{ 870{
865 struct dentry *parent = dget(d->d_parent); 871 struct dentry *parent = dget(d->d_parent);
@@ -874,25 +880,29 @@ static void cgroup_clear_directory(struct dentry *dentry)
874 struct list_head *node; 880 struct list_head *node;
875 881
876 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); 882 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
877 spin_lock(&dcache_lock); 883 spin_lock(&dentry->d_lock);
878 node = dentry->d_subdirs.next; 884 node = dentry->d_subdirs.next;
879 while (node != &dentry->d_subdirs) { 885 while (node != &dentry->d_subdirs) {
880 struct dentry *d = list_entry(node, struct dentry, d_u.d_child); 886 struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
887
888 spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
881 list_del_init(node); 889 list_del_init(node);
882 if (d->d_inode) { 890 if (d->d_inode) {
883 /* This should never be called on a cgroup 891 /* This should never be called on a cgroup
884 * directory with child cgroups */ 892 * directory with child cgroups */
885 BUG_ON(d->d_inode->i_mode & S_IFDIR); 893 BUG_ON(d->d_inode->i_mode & S_IFDIR);
886 d = dget_locked(d); 894 dget_dlock(d);
887 spin_unlock(&dcache_lock); 895 spin_unlock(&d->d_lock);
896 spin_unlock(&dentry->d_lock);
888 d_delete(d); 897 d_delete(d);
889 simple_unlink(dentry->d_inode, d); 898 simple_unlink(dentry->d_inode, d);
890 dput(d); 899 dput(d);
891 spin_lock(&dcache_lock); 900 spin_lock(&dentry->d_lock);
892 } 901 } else
902 spin_unlock(&d->d_lock);
893 node = dentry->d_subdirs.next; 903 node = dentry->d_subdirs.next;
894 } 904 }
895 spin_unlock(&dcache_lock); 905 spin_unlock(&dentry->d_lock);
896} 906}
897 907
898/* 908/*
@@ -900,11 +910,16 @@ static void cgroup_clear_directory(struct dentry *dentry)
900 */ 910 */
901static void cgroup_d_remove_dir(struct dentry *dentry) 911static void cgroup_d_remove_dir(struct dentry *dentry)
902{ 912{
913 struct dentry *parent;
914
903 cgroup_clear_directory(dentry); 915 cgroup_clear_directory(dentry);
904 916
905 spin_lock(&dcache_lock); 917 parent = dentry->d_parent;
918 spin_lock(&parent->d_lock);
919 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
906 list_del_init(&dentry->d_u.d_child); 920 list_del_init(&dentry->d_u.d_child);
907 spin_unlock(&dcache_lock); 921 spin_unlock(&dentry->d_lock);
922 spin_unlock(&parent->d_lock);
908 remove_dir(dentry); 923 remove_dir(dentry);
909} 924}
910 925
@@ -1440,6 +1455,11 @@ static int cgroup_set_super(struct super_block *sb, void *data)
1440 1455
1441static int cgroup_get_rootdir(struct super_block *sb) 1456static int cgroup_get_rootdir(struct super_block *sb)
1442{ 1457{
1458 static const struct dentry_operations cgroup_dops = {
1459 .d_iput = cgroup_diput,
1460 .d_delete = cgroup_delete,
1461 };
1462
1443 struct inode *inode = 1463 struct inode *inode =
1444 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); 1464 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
1445 struct dentry *dentry; 1465 struct dentry *dentry;
@@ -1457,6 +1477,8 @@ static int cgroup_get_rootdir(struct super_block *sb)
1457 return -ENOMEM; 1477 return -ENOMEM;
1458 } 1478 }
1459 sb->s_root = dentry; 1479 sb->s_root = dentry;
1480 /* for everything else we want ->d_op set */
1481 sb->s_d_op = &cgroup_dops;
1460 return 0; 1482 return 0;
1461} 1483}
1462 1484
@@ -2180,12 +2202,20 @@ static const struct file_operations cgroup_file_operations = {
2180}; 2202};
2181 2203
2182static const struct inode_operations cgroup_dir_inode_operations = { 2204static const struct inode_operations cgroup_dir_inode_operations = {
2183 .lookup = simple_lookup, 2205 .lookup = cgroup_lookup,
2184 .mkdir = cgroup_mkdir, 2206 .mkdir = cgroup_mkdir,
2185 .rmdir = cgroup_rmdir, 2207 .rmdir = cgroup_rmdir,
2186 .rename = cgroup_rename, 2208 .rename = cgroup_rename,
2187}; 2209};
2188 2210
2211static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
2212{
2213 if (dentry->d_name.len > NAME_MAX)
2214 return ERR_PTR(-ENAMETOOLONG);
2215 d_add(dentry, NULL);
2216 return NULL;
2217}
2218
2189/* 2219/*
2190 * Check if a file is a control file 2220 * Check if a file is a control file
2191 */ 2221 */
@@ -2199,10 +2229,6 @@ static inline struct cftype *__file_cft(struct file *file)
2199static int cgroup_create_file(struct dentry *dentry, mode_t mode, 2229static int cgroup_create_file(struct dentry *dentry, mode_t mode,
2200 struct super_block *sb) 2230 struct super_block *sb)
2201{ 2231{
2202 static const struct dentry_operations cgroup_dops = {
2203 .d_iput = cgroup_diput,
2204 };
2205
2206 struct inode *inode; 2232 struct inode *inode;
2207 2233
2208 if (!dentry) 2234 if (!dentry)
@@ -2228,7 +2254,6 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode,
2228 inode->i_size = 0; 2254 inode->i_size = 0;
2229 inode->i_fop = &cgroup_file_operations; 2255 inode->i_fop = &cgroup_file_operations;
2230 } 2256 }
2231 dentry->d_op = &cgroup_dops;
2232 d_instantiate(dentry, inode); 2257 d_instantiate(dentry, inode);
2233 dget(dentry); /* Extra count - pin the dentry in core */ 2258 dget(dentry); /* Extra count - pin the dentry in core */
2234 return 0; 2259 return 0;
@@ -3638,9 +3663,7 @@ again:
3638 list_del(&cgrp->sibling); 3663 list_del(&cgrp->sibling);
3639 cgroup_unlock_hierarchy(cgrp->root); 3664 cgroup_unlock_hierarchy(cgrp->root);
3640 3665
3641 spin_lock(&cgrp->dentry->d_lock);
3642 d = dget(cgrp->dentry); 3666 d = dget(cgrp->dentry);
3643 spin_unlock(&d->d_lock);
3644 3667
3645 cgroup_d_remove_dir(d); 3668 cgroup_d_remove_dir(d);
3646 dput(d); 3669 dput(d);
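The cgroup hunks above track two VFS transitions picked up by this merge: the global dcache_lock is replaced by per-dentry d_lock (with spin_lock_nested() on the child while the parent's lock is held), and the per-dentry d_op assignment in cgroup_create_file() moves to a single sb->s_d_op set once at mount time. A minimal sketch of the sb->s_d_op pattern, using hypothetical names unrelated to cgroup and assuming a simple fill_super-style setup:

	#include <linux/dcache.h>
	#include <linux/fs.h>

	/* sketch: d_op inherited by every dentry allocated on this superblock */
	static int example_d_delete(const struct dentry *dentry)
	{
		return 1;	/* do not keep unused dentries cached */
	}

	static const struct dentry_operations example_dops = {
		.d_delete = example_d_delete,
	};

	static int example_fill_super(struct super_block *sb, void *data, int silent)
	{
		sb->s_d_op = &example_dops;	/* d_alloc() copies this into each new dentry */
		/* ... allocate the root inode and sb->s_root as usual ... */
		return 0;
	}

With sb->s_d_op published once, the create/lookup paths no longer need to set dentry->d_op by hand, which is exactly why the assignment in cgroup_create_file() is deleted above.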
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f6e726f18491..156cc5556140 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -189,7 +189,6 @@ static inline void check_for_tasks(int cpu)
189} 189}
190 190
191struct take_cpu_down_param { 191struct take_cpu_down_param {
192 struct task_struct *caller;
193 unsigned long mod; 192 unsigned long mod;
194 void *hcpu; 193 void *hcpu;
195}; 194};
@@ -198,7 +197,6 @@ struct take_cpu_down_param {
198static int __ref take_cpu_down(void *_param) 197static int __ref take_cpu_down(void *_param)
199{ 198{
200 struct take_cpu_down_param *param = _param; 199 struct take_cpu_down_param *param = _param;
201 unsigned int cpu = (unsigned long)param->hcpu;
202 int err; 200 int err;
203 201
204 /* Ensure this CPU doesn't handle any more interrupts. */ 202 /* Ensure this CPU doesn't handle any more interrupts. */
@@ -208,11 +206,6 @@ static int __ref take_cpu_down(void *_param)
208 206
209 cpu_notify(CPU_DYING | param->mod, param->hcpu); 207 cpu_notify(CPU_DYING | param->mod, param->hcpu);
210 208
211 if (task_cpu(param->caller) == cpu)
212 move_task_off_dead_cpu(cpu, param->caller);
213 /* Force idle task to run as soon as we yield: it should
214 immediately notice cpu is offline and die quickly. */
215 sched_idle_next();
216 return 0; 209 return 0;
217} 210}
218 211
@@ -223,7 +216,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
223 void *hcpu = (void *)(long)cpu; 216 void *hcpu = (void *)(long)cpu;
224 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 217 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
225 struct take_cpu_down_param tcd_param = { 218 struct take_cpu_down_param tcd_param = {
226 .caller = current,
227 .mod = mod, 219 .mod = mod,
228 .hcpu = hcpu, 220 .hcpu = hcpu,
229 }; 221 };
@@ -253,9 +245,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
253 } 245 }
254 BUG_ON(cpu_online(cpu)); 246 BUG_ON(cpu_online(cpu));
255 247
256 /* Wait for it to sleep (leaving idle task). */ 248 /*
249 * The migration_call() CPU_DYING callback will have removed all
250 * runnable tasks from the cpu, there's only the idle task left now
251 * that the migration thread is done doing the stop_machine thing.
252 *
253 * Wait for the stop thread to go away.
254 */
257 while (!idle_cpu(cpu)) 255 while (!idle_cpu(cpu))
258 yield(); 256 cpu_relax();
259 257
260 /* This actually kills the CPU. */ 258 /* This actually kills the CPU. */
261 __cpu_die(cpu); 259 __cpu_die(cpu);
@@ -386,6 +384,14 @@ out:
386#ifdef CONFIG_PM_SLEEP_SMP 384#ifdef CONFIG_PM_SLEEP_SMP
387static cpumask_var_t frozen_cpus; 385static cpumask_var_t frozen_cpus;
388 386
387void __weak arch_disable_nonboot_cpus_begin(void)
388{
389}
390
391void __weak arch_disable_nonboot_cpus_end(void)
392{
393}
394
389int disable_nonboot_cpus(void) 395int disable_nonboot_cpus(void)
390{ 396{
391 int cpu, first_cpu, error = 0; 397 int cpu, first_cpu, error = 0;
@@ -397,6 +403,7 @@ int disable_nonboot_cpus(void)
397 * with the userspace trying to use the CPU hotplug at the same time 403 * with the userspace trying to use the CPU hotplug at the same time
398 */ 404 */
399 cpumask_clear(frozen_cpus); 405 cpumask_clear(frozen_cpus);
406 arch_disable_nonboot_cpus_begin();
400 407
401 printk("Disabling non-boot CPUs ...\n"); 408 printk("Disabling non-boot CPUs ...\n");
402 for_each_online_cpu(cpu) { 409 for_each_online_cpu(cpu) {
@@ -412,6 +419,8 @@ int disable_nonboot_cpus(void)
412 } 419 }
413 } 420 }
414 421
422 arch_disable_nonboot_cpus_end();
423
415 if (!error) { 424 if (!error) {
416 BUG_ON(num_online_cpus() > 1); 425 BUG_ON(num_online_cpus() > 1);
417 /* Make sure the CPUs won't be enabled by someone else */ 426 /* Make sure the CPUs won't be enabled by someone else */
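disable_nonboot_cpus() now brackets its hot-unplug loop with a pair of __weak hooks. The empty definitions above are the generic fallback; an architecture overrides them simply by providing strong functions with the same names, which the linker prefers. A hedged sketch of such an override (the bodies are purely illustrative, not taken from any real architecture):

	/* somewhere in arch code: strong definitions override the __weak
	 * stubs in kernel/cpu.c at link time */
	void arch_disable_nonboot_cpus_begin(void)
	{
		/* e.g. put shared hardware state into a mode that stays safe
		 * while secondary CPUs are taken offline */
	}

	void arch_disable_nonboot_cpus_end(void)
	{
		/* runs after every non-boot CPU has gone offline */
	}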
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 37755d621924..bd3e8e29caa3 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -82,7 +82,7 @@ static kdbtab_t kdb_base_commands[50];
82#define for_each_kdbcmd(cmd, num) \ 82#define for_each_kdbcmd(cmd, num) \
83 for ((cmd) = kdb_base_commands, (num) = 0; \ 83 for ((cmd) = kdb_base_commands, (num) = 0; \
84 num < kdb_max_commands; \ 84 num < kdb_max_commands; \
85 num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++, num++) 85 num++, num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++)
86 86
87typedef struct _kdbmsg { 87typedef struct _kdbmsg {
88 int km_diag; /* kdb diagnostic */ 88 int km_diag; /* kdb diagnostic */
@@ -646,7 +646,7 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0)
646 } 646 }
647 if (!s->usable) 647 if (!s->usable)
648 return KDB_NOTIMP; 648 return KDB_NOTIMP;
649 s->command = kmalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB); 649 s->command = kzalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB);
650 if (!s->command) { 650 if (!s->command) {
651 kdb_printf("Could not allocate new kdb_defcmd table for %s\n", 651 kdb_printf("Could not allocate new kdb_defcmd table for %s\n",
652 cmdstr); 652 cmdstr);
@@ -2361,7 +2361,7 @@ static int kdb_pid(int argc, const char **argv)
2361 */ 2361 */
2362static int kdb_ll(int argc, const char **argv) 2362static int kdb_ll(int argc, const char **argv)
2363{ 2363{
2364 int diag; 2364 int diag = 0;
2365 unsigned long addr; 2365 unsigned long addr;
2366 long offset = 0; 2366 long offset = 0;
2367 unsigned long va; 2367 unsigned long va;
@@ -2400,20 +2400,21 @@ static int kdb_ll(int argc, const char **argv)
2400 char buf[80]; 2400 char buf[80];
2401 2401
2402 if (KDB_FLAG(CMD_INTERRUPT)) 2402 if (KDB_FLAG(CMD_INTERRUPT))
2403 return 0; 2403 goto out;
2404 2404
2405 sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va); 2405 sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va);
2406 diag = kdb_parse(buf); 2406 diag = kdb_parse(buf);
2407 if (diag) 2407 if (diag)
2408 return diag; 2408 goto out;
2409 2409
2410 addr = va + linkoffset; 2410 addr = va + linkoffset;
2411 if (kdb_getword(&va, addr, sizeof(va))) 2411 if (kdb_getword(&va, addr, sizeof(va)))
2412 return 0; 2412 goto out;
2413 } 2413 }
2414 kfree(command);
2415 2414
2416 return 0; 2415out:
2416 kfree(command);
2417 return diag;
2417} 2418}
2418 2419
2419static int kdb_kgdb(int argc, const char **argv) 2420static int kdb_kgdb(int argc, const char **argv)
@@ -2739,13 +2740,13 @@ int kdb_register_repeat(char *cmd,
2739 } 2740 }
2740 if (kdb_commands) { 2741 if (kdb_commands) {
2741 memcpy(new, kdb_commands, 2742 memcpy(new, kdb_commands,
2742 kdb_max_commands * sizeof(*new)); 2743 (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new));
2743 kfree(kdb_commands); 2744 kfree(kdb_commands);
2744 } 2745 }
2745 memset(new + kdb_max_commands, 0, 2746 memset(new + kdb_max_commands, 0,
2746 kdb_command_extend * sizeof(*new)); 2747 kdb_command_extend * sizeof(*new));
2747 kdb_commands = new; 2748 kdb_commands = new;
2748 kp = kdb_commands + kdb_max_commands; 2749 kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX;
2749 kdb_max_commands += kdb_command_extend; 2750 kdb_max_commands += kdb_command_extend;
2750 } 2751 }
2751 2752
@@ -2913,7 +2914,7 @@ static void __init kdb_cmd_init(void)
2913 } 2914 }
2914} 2915}
2915 2916
2916/* Intialize kdb_printf, breakpoint tables and kdb state */ 2917/* Initialize kdb_printf, breakpoint tables and kdb state */
2917void __init kdb_init(int lvl) 2918void __init kdb_init(int lvl)
2918{ 2919{
2919 static int kdb_init_lvl = KDB_NOT_INITIALIZED; 2920 static int kdb_init_lvl = KDB_NOT_INITIALIZED;
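Two of the kdb fixes above are easy to misread. The for_each_kdbcmd() change is an evaluation-order fix: the old update expression tested num == KDB_BASE_CMD_MAX before incrementing num, so the cursor stepped one slot past kdb_base_commands[] before hopping to the kmalloc'ed kdb_commands[] table; incrementing first makes the hop happen exactly when the fixed table is exhausted. The kdb_register_repeat() change reflects that kdb_commands[] only stores entries beyond KDB_BASE_CMD_MAX, so both the memcpy length and the insertion cursor must subtract that offset. A small self-contained sketch of the corrected walk, with hypothetical table names and sizes:

	#include <stddef.h>

	#define BASE_MAX 4			/* stands in for KDB_BASE_CMD_MAX */

	struct cmd { const char *name; };

	static struct cmd base_cmds[BASE_MAX];	/* fixed table */
	static struct cmd *extra_cmds;		/* kmalloc'ed extension, indices >= BASE_MAX */
	static int max_cmds = BASE_MAX;

	static void walk(void (*visit)(struct cmd *))
	{
		struct cmd *cmd;
		int num;

		/* bump num first, then decide whether to hop tables: the
		 * iteration with num == BASE_MAX already reads extra_cmds[0],
		 * and base_cmds[] is never overrun. */
		for (cmd = base_cmds, num = 0; num < max_cmds;
		     num++, num == BASE_MAX ? cmd = extra_cmds : cmd++)
			visit(cmd);
	}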
diff --git a/kernel/exit.c b/kernel/exit.c
index 21aa7b3001fb..f9a45ebcc7b1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -69,7 +69,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
69 69
70 list_del_rcu(&p->tasks); 70 list_del_rcu(&p->tasks);
71 list_del_init(&p->sibling); 71 list_del_init(&p->sibling);
72 __get_cpu_var(process_counts)--; 72 __this_cpu_dec(process_counts);
73 } 73 }
74 list_del_rcu(&p->thread_group); 74 list_del_rcu(&p->thread_group);
75} 75}
@@ -914,6 +914,15 @@ NORET_TYPE void do_exit(long code)
914 if (unlikely(!tsk->pid)) 914 if (unlikely(!tsk->pid))
915 panic("Attempted to kill the idle task!"); 915 panic("Attempted to kill the idle task!");
916 916
917 /*
918 * If do_exit is called because this processes oopsed, it's possible
919 * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
920 * continuing. Amongst other possible reasons, this is to prevent
921 * mm_release()->clear_child_tid() from writing to a user-controlled
922 * kernel address.
923 */
924 set_fs(USER_DS);
925
917 tracehook_report_exit(&code); 926 tracehook_report_exit(&code);
918 927
919 validate_creds_for_do_exit(tsk); 928 validate_creds_for_do_exit(tsk);
@@ -985,6 +994,15 @@ NORET_TYPE void do_exit(long code)
985 exit_fs(tsk); 994 exit_fs(tsk);
986 check_stack_usage(); 995 check_stack_usage();
987 exit_thread(); 996 exit_thread();
997
998 /*
999 * Flush inherited counters to the parent - before the parent
1000 * gets woken up by child-exit notifications.
1001 *
1002 * because of cgroup mode, must be called before cgroup_exit()
1003 */
1004 perf_event_exit_task(tsk);
1005
988 cgroup_exit(tsk, 1); 1006 cgroup_exit(tsk, 1);
989 1007
990 if (group_dead) 1008 if (group_dead)
@@ -998,11 +1016,6 @@ NORET_TYPE void do_exit(long code)
998 * FIXME: do that only when needed, using sched_exit tracepoint 1016 * FIXME: do that only when needed, using sched_exit tracepoint
999 */ 1017 */
1000 flush_ptrace_hw_breakpoint(tsk); 1018 flush_ptrace_hw_breakpoint(tsk);
1001 /*
1002 * Flush inherited counters to the parent - before the parent
1003 * gets woken up by child-exit notifications.
1004 */
1005 perf_event_exit_task(tsk);
1006 1019
1007 exit_notify(tsk, group_dead); 1020 exit_notify(tsk, group_dead);
1008#ifdef CONFIG_NUMA 1021#ifdef CONFIG_NUMA
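Besides reordering perf_event_exit_task() ahead of cgroup_exit() and hardening the oops path with set_fs(USER_DS), the exit.c hunk converts the process_counts update from __get_cpu_var() to __this_cpu_dec() (fork.c below gains the matching __this_cpu_inc()). Both forms touch the current CPU's instance of a per-cpu variable and both still rely on the caller keeping the task pinned to that CPU; the __this_cpu_*() form lets the architecture fold the per-cpu addressing into a single instruction instead of first materialising this CPU's address. A sketch with a hypothetical counter:

	#include <linux/percpu.h>

	DEFINE_PER_CPU(unsigned long, example_counts);

	static void count_down_old_style(void)
	{
		__get_cpu_var(example_counts)--;	/* compute this CPU's address, then decrement */
	}

	static void count_down_new_style(void)
	{
		__this_cpu_dec(example_counts);		/* single per-cpu-addressed decrement where possible */
	}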
diff --git a/kernel/fork.c b/kernel/fork.c
index 3b159c5991b7..25e429152ddc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -66,6 +66,7 @@
66#include <linux/posix-timers.h> 66#include <linux/posix-timers.h>
67#include <linux/user-return-notifier.h> 67#include <linux/user-return-notifier.h>
68#include <linux/oom.h> 68#include <linux/oom.h>
69#include <linux/khugepaged.h>
69 70
70#include <asm/pgtable.h> 71#include <asm/pgtable.h>
71#include <asm/pgalloc.h> 72#include <asm/pgalloc.h>
@@ -169,6 +170,7 @@ EXPORT_SYMBOL(free_task);
169static inline void free_signal_struct(struct signal_struct *sig) 170static inline void free_signal_struct(struct signal_struct *sig)
170{ 171{
171 taskstats_tgid_free(sig); 172 taskstats_tgid_free(sig);
173 sched_autogroup_exit(sig);
172 kmem_cache_free(signal_cachep, sig); 174 kmem_cache_free(signal_cachep, sig);
173} 175}
174 176
@@ -273,6 +275,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
273 275
274 setup_thread_stack(tsk, orig); 276 setup_thread_stack(tsk, orig);
275 clear_user_return_notifier(tsk); 277 clear_user_return_notifier(tsk);
278 clear_tsk_need_resched(tsk);
276 stackend = end_of_stack(tsk); 279 stackend = end_of_stack(tsk);
277 *stackend = STACK_END_MAGIC; /* for overflow detection */ 280 *stackend = STACK_END_MAGIC; /* for overflow detection */
278 281
@@ -328,6 +331,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
328 retval = ksm_fork(mm, oldmm); 331 retval = ksm_fork(mm, oldmm);
329 if (retval) 332 if (retval)
330 goto out; 333 goto out;
334 retval = khugepaged_fork(mm, oldmm);
335 if (retval)
336 goto out;
331 337
332 prev = NULL; 338 prev = NULL;
333 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { 339 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
@@ -527,6 +533,9 @@ void __mmdrop(struct mm_struct *mm)
527 mm_free_pgd(mm); 533 mm_free_pgd(mm);
528 destroy_context(mm); 534 destroy_context(mm);
529 mmu_notifier_mm_destroy(mm); 535 mmu_notifier_mm_destroy(mm);
536#ifdef CONFIG_TRANSPARENT_HUGEPAGE
537 VM_BUG_ON(mm->pmd_huge_pte);
538#endif
530 free_mm(mm); 539 free_mm(mm);
531} 540}
532EXPORT_SYMBOL_GPL(__mmdrop); 541EXPORT_SYMBOL_GPL(__mmdrop);
@@ -541,6 +550,7 @@ void mmput(struct mm_struct *mm)
541 if (atomic_dec_and_test(&mm->mm_users)) { 550 if (atomic_dec_and_test(&mm->mm_users)) {
542 exit_aio(mm); 551 exit_aio(mm);
543 ksm_exit(mm); 552 ksm_exit(mm);
553 khugepaged_exit(mm); /* must run before exit_mmap */
544 exit_mmap(mm); 554 exit_mmap(mm);
545 set_mm_exe_file(mm, NULL); 555 set_mm_exe_file(mm, NULL);
546 if (!list_empty(&mm->mmlist)) { 556 if (!list_empty(&mm->mmlist)) {
@@ -667,6 +677,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
667 mm->token_priority = 0; 677 mm->token_priority = 0;
668 mm->last_interval = 0; 678 mm->last_interval = 0;
669 679
680#ifdef CONFIG_TRANSPARENT_HUGEPAGE
681 mm->pmd_huge_pte = NULL;
682#endif
683
670 if (!mm_init(mm, tsk)) 684 if (!mm_init(mm, tsk))
671 goto fail_nomem; 685 goto fail_nomem;
672 686
@@ -904,9 +918,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
904 posix_cpu_timers_init_group(sig); 918 posix_cpu_timers_init_group(sig);
905 919
906 tty_audit_fork(sig); 920 tty_audit_fork(sig);
921 sched_autogroup_fork(sig);
907 922
908 sig->oom_adj = current->signal->oom_adj; 923 sig->oom_adj = current->signal->oom_adj;
909 sig->oom_score_adj = current->signal->oom_score_adj; 924 sig->oom_score_adj = current->signal->oom_score_adj;
925 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
910 926
911 mutex_init(&sig->cred_guard_mutex); 927 mutex_init(&sig->cred_guard_mutex);
912 928
@@ -1282,7 +1298,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1282 attach_pid(p, PIDTYPE_SID, task_session(current)); 1298 attach_pid(p, PIDTYPE_SID, task_session(current));
1283 list_add_tail(&p->sibling, &p->real_parent->children); 1299 list_add_tail(&p->sibling, &p->real_parent->children);
1284 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1300 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1285 __get_cpu_var(process_counts)++; 1301 __this_cpu_inc(process_counts);
1286 } 1302 }
1287 attach_pid(p, PIDTYPE_PID, pid); 1303 attach_pid(p, PIDTYPE_PID, pid);
1288 nr_threads++; 1304 nr_threads++;
@@ -1407,23 +1423,6 @@ long do_fork(unsigned long clone_flags,
1407 } 1423 }
1408 1424
1409 /* 1425 /*
1410 * We hope to recycle these flags after 2.6.26
1411 */
1412 if (unlikely(clone_flags & CLONE_STOPPED)) {
1413 static int __read_mostly count = 100;
1414
1415 if (count > 0 && printk_ratelimit()) {
1416 char comm[TASK_COMM_LEN];
1417
1418 count--;
1419 printk(KERN_INFO "fork(): process `%s' used deprecated "
1420 "clone flags 0x%lx\n",
1421 get_task_comm(comm, current),
1422 clone_flags & CLONE_STOPPED);
1423 }
1424 }
1425
1426 /*
1427 * When called from kernel_thread, don't do user tracing stuff. 1426 * When called from kernel_thread, don't do user tracing stuff.
1428 */ 1427 */
1429 if (likely(user_mode(regs))) 1428 if (likely(user_mode(regs)))
@@ -1461,16 +1460,7 @@ long do_fork(unsigned long clone_flags,
1461 */ 1460 */
1462 p->flags &= ~PF_STARTING; 1461 p->flags &= ~PF_STARTING;
1463 1462
1464 if (unlikely(clone_flags & CLONE_STOPPED)) { 1463 wake_up_new_task(p, clone_flags);
1465 /*
1466 * We'll start up with an immediate SIGSTOP.
1467 */
1468 sigaddset(&p->pending.signal, SIGSTOP);
1469 set_tsk_thread_flag(p, TIF_SIGPENDING);
1470 __set_task_state(p, TASK_STOPPED);
1471 } else {
1472 wake_up_new_task(p, clone_flags);
1473 }
1474 1464
1475 tracehook_report_clone_complete(trace, regs, 1465 tracehook_report_clone_complete(trace, regs,
1476 clone_flags, nr, p); 1466 clone_flags, nr, p);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index bd1d42b17cb2..66ecd2ead215 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -104,8 +104,13 @@ bool freeze_task(struct task_struct *p, bool sig_only)
104 } 104 }
105 105
106 if (should_send_signal(p)) { 106 if (should_send_signal(p)) {
107 if (!signal_pending(p)) 107 fake_signal_wake_up(p);
108 fake_signal_wake_up(p); 108 /*
109 * fake_signal_wake_up() goes through p's scheduler
110 * lock and guarantees that TASK_STOPPED/TRACED ->
111 * TASK_RUNNING transition can't race with task state
112 * testing in try_to_freeze_tasks().
113 */
109 } else if (sig_only) { 114 } else if (sig_only) {
110 return false; 115 return false;
111 } else { 116 } else {
diff --git a/kernel/futex.c b/kernel/futex.c
index 6c683b37f2ce..b766d28accd6 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -69,6 +69,14 @@ int __read_mostly futex_cmpxchg_enabled;
69#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 69#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
70 70
71/* 71/*
72 * Futex flags used to encode options to functions and preserve them across
73 * restarts.
74 */
75#define FLAGS_SHARED 0x01
76#define FLAGS_CLOCKRT 0x02
77#define FLAGS_HAS_TIMEOUT 0x04
78
79/*
72 * Priority Inheritance state: 80 * Priority Inheritance state:
73 */ 81 */
74struct futex_pi_state { 82struct futex_pi_state {
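The FLAGS_SHARED/FLAGS_CLOCKRT/FLAGS_HAS_TIMEOUT bits previously lived next to the restart-block code (a later hunk deletes that copy); hoisting them here lets every futex_*() function take one unsigned int flags argument instead of separate fshared/clockrt ints. The caller-side encoding is outside this excerpt, but it would look roughly like the following, using the opcode bits from linux/futex.h:

	/* sketch: build the internal flags word from the futex(2) op argument */
	int cmd = op & FUTEX_CMD_MASK;
	unsigned int flags = 0;

	if (!(op & FUTEX_PRIVATE_FLAG))
		flags |= FLAGS_SHARED;		/* key may refer to a shared mapping */

	if (op & FUTEX_CLOCK_REALTIME)
		flags |= FLAGS_CLOCKRT;		/* absolute timeouts measured on CLOCK_REALTIME */

	/* cmd then selects futex_wait(), futex_wake(), futex_requeue(), ... */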
@@ -123,6 +131,12 @@ struct futex_q {
123 u32 bitset; 131 u32 bitset;
124}; 132};
125 133
134static const struct futex_q futex_q_init = {
135 /* list gets initialized in queue_me()*/
136 .key = FUTEX_KEY_INIT,
137 .bitset = FUTEX_BITSET_MATCH_ANY
138};
139
126/* 140/*
127 * Hash buckets are shared by all the futex_keys that hash to the same 141 * Hash buckets are shared by all the futex_keys that hash to the same
128 * location. Each key may have multiple futex_q structures, one for each task 142 * location. Each key may have multiple futex_q structures, one for each task
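futex_q_init gives on-stack futex_q instances one known starting state: in a designated initializer, every member that is not named (pi_state, rt_waiter, requeue_pi_key, ...) is implicitly zeroed, which is why later hunks can drop the explicit q.pi_state = NULL style assignments in futex_wait(), futex_lock_pi() and futex_wait_requeue_pi(). A tiny user-space illustration of that C rule, with made-up field names:

	#include <assert.h>
	#include <stddef.h>

	struct q {
		void		*key;
		unsigned int	 bitset;
		void		*pi_state;	/* deliberately not named below */
	};

	static const struct q q_init = {
		.key	= NULL,
		.bitset	= ~0u,			/* stands in for FUTEX_BITSET_MATCH_ANY */
	};

	int main(void)
	{
		struct q q = q_init;		/* one struct copy... */

		assert(q.pi_state == NULL);	/* ...and the unnamed member came out zero */
		return 0;
	}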
@@ -219,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
219{ 233{
220 unsigned long address = (unsigned long)uaddr; 234 unsigned long address = (unsigned long)uaddr;
221 struct mm_struct *mm = current->mm; 235 struct mm_struct *mm = current->mm;
222 struct page *page; 236 struct page *page, *page_head;
223 int err; 237 int err;
224 238
225 /* 239 /*
@@ -251,11 +265,46 @@ again:
251 if (err < 0) 265 if (err < 0)
252 return err; 266 return err;
253 267
254 page = compound_head(page); 268#ifdef CONFIG_TRANSPARENT_HUGEPAGE
255 lock_page(page); 269 page_head = page;
256 if (!page->mapping) { 270 if (unlikely(PageTail(page))) {
257 unlock_page(page);
258 put_page(page); 271 put_page(page);
272 /* serialize against __split_huge_page_splitting() */
273 local_irq_disable();
274 if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
275 page_head = compound_head(page);
276 /*
277 * page_head is valid pointer but we must pin
278 * it before taking the PG_lock and/or
279 * PG_compound_lock. The moment we re-enable
280 * irqs __split_huge_page_splitting() can
281 * return and the head page can be freed from
282 * under us. We can't take the PG_lock and/or
283 * PG_compound_lock on a page that could be
284 * freed from under us.
285 */
286 if (page != page_head) {
287 get_page(page_head);
288 put_page(page);
289 }
290 local_irq_enable();
291 } else {
292 local_irq_enable();
293 goto again;
294 }
295 }
296#else
297 page_head = compound_head(page);
298 if (page != page_head) {
299 get_page(page_head);
300 put_page(page);
301 }
302#endif
303
304 lock_page(page_head);
305 if (!page_head->mapping) {
306 unlock_page(page_head);
307 put_page(page_head);
259 goto again; 308 goto again;
260 } 309 }
261 310
@@ -266,25 +315,24 @@ again:
266 * it's a read-only handle, it's expected that futexes attach to 315 * it's a read-only handle, it's expected that futexes attach to
267 * the object not the particular process. 316 * the object not the particular process.
268 */ 317 */
269 if (PageAnon(page)) { 318 if (PageAnon(page_head)) {
270 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ 319 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
271 key->private.mm = mm; 320 key->private.mm = mm;
272 key->private.address = address; 321 key->private.address = address;
273 } else { 322 } else {
274 key->both.offset |= FUT_OFF_INODE; /* inode-based key */ 323 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
275 key->shared.inode = page->mapping->host; 324 key->shared.inode = page_head->mapping->host;
276 key->shared.pgoff = page->index; 325 key->shared.pgoff = page_head->index;
277 } 326 }
278 327
279 get_futex_key_refs(key); 328 get_futex_key_refs(key);
280 329
281 unlock_page(page); 330 unlock_page(page_head);
282 put_page(page); 331 put_page(page_head);
283 return 0; 332 return 0;
284} 333}
285 334
286static inline 335static inline void put_futex_key(union futex_key *key)
287void put_futex_key(int fshared, union futex_key *key)
288{ 336{
289 drop_futex_key_refs(key); 337 drop_futex_key_refs(key);
290} 338}
@@ -778,10 +826,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
778 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 826 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
779 827
780 /* 828 /*
781 * This happens when we have stolen the lock and the original 829 * It is possible that the next waiter (the one that brought
782 * pending owner did not enqueue itself back on the rt_mutex. 830 * this owner to the kernel) timed out and is no longer
783 * Thats not a tragedy. We know that way, that a lock waiter 831 * waiting on the lock.
784 * is on the fly. We make the futex_q waiter the pending owner.
785 */ 832 */
786 if (!new_owner) 833 if (!new_owner)
787 new_owner = this->task; 834 new_owner = this->task;
@@ -870,7 +917,8 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
870/* 917/*
871 * Wake up waiters matching bitset queued on this futex (uaddr). 918 * Wake up waiters matching bitset queued on this futex (uaddr).
872 */ 919 */
873static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) 920static int
921futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
874{ 922{
875 struct futex_hash_bucket *hb; 923 struct futex_hash_bucket *hb;
876 struct futex_q *this, *next; 924 struct futex_q *this, *next;
@@ -881,7 +929,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
881 if (!bitset) 929 if (!bitset)
882 return -EINVAL; 930 return -EINVAL;
883 931
884 ret = get_futex_key(uaddr, fshared, &key); 932 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
885 if (unlikely(ret != 0)) 933 if (unlikely(ret != 0))
886 goto out; 934 goto out;
887 935
@@ -907,7 +955,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
907 } 955 }
908 956
909 spin_unlock(&hb->lock); 957 spin_unlock(&hb->lock);
910 put_futex_key(fshared, &key); 958 put_futex_key(&key);
911out: 959out:
912 return ret; 960 return ret;
913} 961}
@@ -917,7 +965,7 @@ out:
917 * to this virtual address: 965 * to this virtual address:
918 */ 966 */
919static int 967static int
920futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, 968futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
921 int nr_wake, int nr_wake2, int op) 969 int nr_wake, int nr_wake2, int op)
922{ 970{
923 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 971 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
@@ -927,10 +975,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
927 int ret, op_ret; 975 int ret, op_ret;
928 976
929retry: 977retry:
930 ret = get_futex_key(uaddr1, fshared, &key1); 978 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
931 if (unlikely(ret != 0)) 979 if (unlikely(ret != 0))
932 goto out; 980 goto out;
933 ret = get_futex_key(uaddr2, fshared, &key2); 981 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
934 if (unlikely(ret != 0)) 982 if (unlikely(ret != 0))
935 goto out_put_key1; 983 goto out_put_key1;
936 984
@@ -962,11 +1010,11 @@ retry_private:
962 if (ret) 1010 if (ret)
963 goto out_put_keys; 1011 goto out_put_keys;
964 1012
965 if (!fshared) 1013 if (!(flags & FLAGS_SHARED))
966 goto retry_private; 1014 goto retry_private;
967 1015
968 put_futex_key(fshared, &key2); 1016 put_futex_key(&key2);
969 put_futex_key(fshared, &key1); 1017 put_futex_key(&key1);
970 goto retry; 1018 goto retry;
971 } 1019 }
972 1020
@@ -996,9 +1044,9 @@ retry_private:
996 1044
997 double_unlock_hb(hb1, hb2); 1045 double_unlock_hb(hb1, hb2);
998out_put_keys: 1046out_put_keys:
999 put_futex_key(fshared, &key2); 1047 put_futex_key(&key2);
1000out_put_key1: 1048out_put_key1:
1001 put_futex_key(fshared, &key1); 1049 put_futex_key(&key1);
1002out: 1050out:
1003 return ret; 1051 return ret;
1004} 1052}
@@ -1133,13 +1181,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1133/** 1181/**
1134 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 1182 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1135 * @uaddr1: source futex user address 1183 * @uaddr1: source futex user address
1136 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 1184 * @flags: futex flags (FLAGS_SHARED, etc.)
1137 * @uaddr2: target futex user address 1185 * @uaddr2: target futex user address
1138 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) 1186 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
1139 * @nr_requeue: number of waiters to requeue (0-INT_MAX) 1187 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1140 * @cmpval: @uaddr1 expected value (or %NULL) 1188 * @cmpval: @uaddr1 expected value (or %NULL)
1141 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a 1189 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1142 * pi futex (pi to pi requeue is not supported) 1190 * pi futex (pi to pi requeue is not supported)
1143 * 1191 *
1144 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire 1192 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1145 * uaddr2 atomically on behalf of the top waiter. 1193 * uaddr2 atomically on behalf of the top waiter.
@@ -1148,9 +1196,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1148 * >=0 - on success, the number of tasks requeued or woken 1196 * >=0 - on success, the number of tasks requeued or woken
1149 * <0 - on error 1197 * <0 - on error
1150 */ 1198 */
1151static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, 1199static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1152 int nr_wake, int nr_requeue, u32 *cmpval, 1200 u32 __user *uaddr2, int nr_wake, int nr_requeue,
1153 int requeue_pi) 1201 u32 *cmpval, int requeue_pi)
1154{ 1202{
1155 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1203 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1156 int drop_count = 0, task_count = 0, ret; 1204 int drop_count = 0, task_count = 0, ret;
@@ -1191,10 +1239,10 @@ retry:
1191 pi_state = NULL; 1239 pi_state = NULL;
1192 } 1240 }
1193 1241
1194 ret = get_futex_key(uaddr1, fshared, &key1); 1242 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
1195 if (unlikely(ret != 0)) 1243 if (unlikely(ret != 0))
1196 goto out; 1244 goto out;
1197 ret = get_futex_key(uaddr2, fshared, &key2); 1245 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
1198 if (unlikely(ret != 0)) 1246 if (unlikely(ret != 0))
1199 goto out_put_key1; 1247 goto out_put_key1;
1200 1248
@@ -1216,11 +1264,11 @@ retry_private:
1216 if (ret) 1264 if (ret)
1217 goto out_put_keys; 1265 goto out_put_keys;
1218 1266
1219 if (!fshared) 1267 if (!(flags & FLAGS_SHARED))
1220 goto retry_private; 1268 goto retry_private;
1221 1269
1222 put_futex_key(fshared, &key2); 1270 put_futex_key(&key2);
1223 put_futex_key(fshared, &key1); 1271 put_futex_key(&key1);
1224 goto retry; 1272 goto retry;
1225 } 1273 }
1226 if (curval != *cmpval) { 1274 if (curval != *cmpval) {
@@ -1260,8 +1308,8 @@ retry_private:
1260 break; 1308 break;
1261 case -EFAULT: 1309 case -EFAULT:
1262 double_unlock_hb(hb1, hb2); 1310 double_unlock_hb(hb1, hb2);
1263 put_futex_key(fshared, &key2); 1311 put_futex_key(&key2);
1264 put_futex_key(fshared, &key1); 1312 put_futex_key(&key1);
1265 ret = fault_in_user_writeable(uaddr2); 1313 ret = fault_in_user_writeable(uaddr2);
1266 if (!ret) 1314 if (!ret)
1267 goto retry; 1315 goto retry;
@@ -1269,8 +1317,8 @@ retry_private:
1269 case -EAGAIN: 1317 case -EAGAIN:
1270 /* The owner was exiting, try again. */ 1318 /* The owner was exiting, try again. */
1271 double_unlock_hb(hb1, hb2); 1319 double_unlock_hb(hb1, hb2);
1272 put_futex_key(fshared, &key2); 1320 put_futex_key(&key2);
1273 put_futex_key(fshared, &key1); 1321 put_futex_key(&key1);
1274 cond_resched(); 1322 cond_resched();
1275 goto retry; 1323 goto retry;
1276 default: 1324 default:
@@ -1352,9 +1400,9 @@ out_unlock:
1352 drop_futex_key_refs(&key1); 1400 drop_futex_key_refs(&key1);
1353 1401
1354out_put_keys: 1402out_put_keys:
1355 put_futex_key(fshared, &key2); 1403 put_futex_key(&key2);
1356out_put_key1: 1404out_put_key1:
1357 put_futex_key(fshared, &key1); 1405 put_futex_key(&key1);
1358out: 1406out:
1359 if (pi_state != NULL) 1407 if (pi_state != NULL)
1360 free_pi_state(pi_state); 1408 free_pi_state(pi_state);
@@ -1494,7 +1542,7 @@ static void unqueue_me_pi(struct futex_q *q)
1494 * private futexes. 1542 * private futexes.
1495 */ 1543 */
1496static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1544static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1497 struct task_struct *newowner, int fshared) 1545 struct task_struct *newowner)
1498{ 1546{
1499 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1547 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1500 struct futex_pi_state *pi_state = q->pi_state; 1548 struct futex_pi_state *pi_state = q->pi_state;
@@ -1587,20 +1635,11 @@ handle_fault:
1587 goto retry; 1635 goto retry;
1588} 1636}
1589 1637
1590/*
1591 * In case we must use restart_block to restart a futex_wait,
1592 * we encode in the 'flags' shared capability
1593 */
1594#define FLAGS_SHARED 0x01
1595#define FLAGS_CLOCKRT 0x02
1596#define FLAGS_HAS_TIMEOUT 0x04
1597
1598static long futex_wait_restart(struct restart_block *restart); 1638static long futex_wait_restart(struct restart_block *restart);
1599 1639
1600/** 1640/**
1601 * fixup_owner() - Post lock pi_state and corner case management 1641 * fixup_owner() - Post lock pi_state and corner case management
1602 * @uaddr: user address of the futex 1642 * @uaddr: user address of the futex
1603 * @fshared: whether the futex is shared (1) or not (0)
1604 * @q: futex_q (contains pi_state and access to the rt_mutex) 1643 * @q: futex_q (contains pi_state and access to the rt_mutex)
1605 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) 1644 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
1606 * 1645 *
@@ -1613,8 +1652,7 @@ static long futex_wait_restart(struct restart_block *restart);
1613 * 0 - success, lock not taken 1652 * 0 - success, lock not taken
1614 * <0 - on error (-EFAULT) 1653 * <0 - on error (-EFAULT)
1615 */ 1654 */
1616static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, 1655static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
1617 int locked)
1618{ 1656{
1619 struct task_struct *owner; 1657 struct task_struct *owner;
1620 int ret = 0; 1658 int ret = 0;
@@ -1625,7 +1663,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
1625 * did a lock-steal - fix up the PI-state in that case: 1663 * did a lock-steal - fix up the PI-state in that case:
1626 */ 1664 */
1627 if (q->pi_state->owner != current) 1665 if (q->pi_state->owner != current)
1628 ret = fixup_pi_state_owner(uaddr, q, current, fshared); 1666 ret = fixup_pi_state_owner(uaddr, q, current);
1629 goto out; 1667 goto out;
1630 } 1668 }
1631 1669
@@ -1652,7 +1690,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
1652 * lock. Fix the state up. 1690 * lock. Fix the state up.
1653 */ 1691 */
1654 owner = rt_mutex_owner(&q->pi_state->pi_mutex); 1692 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1655 ret = fixup_pi_state_owner(uaddr, q, owner, fshared); 1693 ret = fixup_pi_state_owner(uaddr, q, owner);
1656 goto out; 1694 goto out;
1657 } 1695 }
1658 1696
@@ -1715,7 +1753,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1715 * futex_wait_setup() - Prepare to wait on a futex 1753 * futex_wait_setup() - Prepare to wait on a futex
1716 * @uaddr: the futex userspace address 1754 * @uaddr: the futex userspace address
1717 * @val: the expected value 1755 * @val: the expected value
1718 * @fshared: whether the futex is shared (1) or not (0) 1756 * @flags: futex flags (FLAGS_SHARED, etc.)
1719 * @q: the associated futex_q 1757 * @q: the associated futex_q
1720 * @hb: storage for hash_bucket pointer to be returned to caller 1758 * @hb: storage for hash_bucket pointer to be returned to caller
1721 * 1759 *
@@ -1728,7 +1766,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1728 * 0 - uaddr contains val and hb has been locked 1766 * 0 - uaddr contains val and hb has been locked
1729 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked 1767 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked
1730 */ 1768 */
1731static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, 1769static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1732 struct futex_q *q, struct futex_hash_bucket **hb) 1770 struct futex_q *q, struct futex_hash_bucket **hb)
1733{ 1771{
1734 u32 uval; 1772 u32 uval;
@@ -1752,8 +1790,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
1752 * rare, but normal. 1790 * rare, but normal.
1753 */ 1791 */
1754retry: 1792retry:
1755 q->key = FUTEX_KEY_INIT; 1793 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
1756 ret = get_futex_key(uaddr, fshared, &q->key);
1757 if (unlikely(ret != 0)) 1794 if (unlikely(ret != 0))
1758 return ret; 1795 return ret;
1759 1796
@@ -1769,10 +1806,10 @@ retry_private:
1769 if (ret) 1806 if (ret)
1770 goto out; 1807 goto out;
1771 1808
1772 if (!fshared) 1809 if (!(flags & FLAGS_SHARED))
1773 goto retry_private; 1810 goto retry_private;
1774 1811
1775 put_futex_key(fshared, &q->key); 1812 put_futex_key(&q->key);
1776 goto retry; 1813 goto retry;
1777 } 1814 }
1778 1815
@@ -1783,32 +1820,29 @@ retry_private:
1783 1820
1784out: 1821out:
1785 if (ret) 1822 if (ret)
1786 put_futex_key(fshared, &q->key); 1823 put_futex_key(&q->key);
1787 return ret; 1824 return ret;
1788} 1825}
1789 1826
1790static int futex_wait(u32 __user *uaddr, int fshared, 1827static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
1791 u32 val, ktime_t *abs_time, u32 bitset, int clockrt) 1828 ktime_t *abs_time, u32 bitset)
1792{ 1829{
1793 struct hrtimer_sleeper timeout, *to = NULL; 1830 struct hrtimer_sleeper timeout, *to = NULL;
1794 struct restart_block *restart; 1831 struct restart_block *restart;
1795 struct futex_hash_bucket *hb; 1832 struct futex_hash_bucket *hb;
1796 struct futex_q q; 1833 struct futex_q q = futex_q_init;
1797 int ret; 1834 int ret;
1798 1835
1799 if (!bitset) 1836 if (!bitset)
1800 return -EINVAL; 1837 return -EINVAL;
1801
1802 q.pi_state = NULL;
1803 q.bitset = bitset; 1838 q.bitset = bitset;
1804 q.rt_waiter = NULL;
1805 q.requeue_pi_key = NULL;
1806 1839
1807 if (abs_time) { 1840 if (abs_time) {
1808 to = &timeout; 1841 to = &timeout;
1809 1842
1810 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : 1843 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
1811 CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 1844 CLOCK_REALTIME : CLOCK_MONOTONIC,
1845 HRTIMER_MODE_ABS);
1812 hrtimer_init_sleeper(to, current); 1846 hrtimer_init_sleeper(to, current);
1813 hrtimer_set_expires_range_ns(&to->timer, *abs_time, 1847 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
1814 current->timer_slack_ns); 1848 current->timer_slack_ns);
@@ -1819,7 +1853,7 @@ retry:
1819 * Prepare to wait on uaddr. On success, holds hb lock and increments 1853 * Prepare to wait on uaddr. On success, holds hb lock and increments
1820 * q.key refs. 1854 * q.key refs.
1821 */ 1855 */
1822 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 1856 ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
1823 if (ret) 1857 if (ret)
1824 goto out; 1858 goto out;
1825 1859
@@ -1852,12 +1886,7 @@ retry:
1852 restart->futex.val = val; 1886 restart->futex.val = val;
1853 restart->futex.time = abs_time->tv64; 1887 restart->futex.time = abs_time->tv64;
1854 restart->futex.bitset = bitset; 1888 restart->futex.bitset = bitset;
1855 restart->futex.flags = FLAGS_HAS_TIMEOUT; 1889 restart->futex.flags = flags;
1856
1857 if (fshared)
1858 restart->futex.flags |= FLAGS_SHARED;
1859 if (clockrt)
1860 restart->futex.flags |= FLAGS_CLOCKRT;
1861 1890
1862 ret = -ERESTART_RESTARTBLOCK; 1891 ret = -ERESTART_RESTARTBLOCK;
1863 1892
@@ -1873,7 +1902,6 @@ out:
1873static long futex_wait_restart(struct restart_block *restart) 1902static long futex_wait_restart(struct restart_block *restart)
1874{ 1903{
1875 u32 __user *uaddr = restart->futex.uaddr; 1904 u32 __user *uaddr = restart->futex.uaddr;
1876 int fshared = 0;
1877 ktime_t t, *tp = NULL; 1905 ktime_t t, *tp = NULL;
1878 1906
1879 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { 1907 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
@@ -1881,11 +1909,9 @@ static long futex_wait_restart(struct restart_block *restart)
1881 tp = &t; 1909 tp = &t;
1882 } 1910 }
1883 restart->fn = do_no_restart_syscall; 1911 restart->fn = do_no_restart_syscall;
1884 if (restart->futex.flags & FLAGS_SHARED) 1912
1885 fshared = 1; 1913 return (long)futex_wait(uaddr, restart->futex.flags,
1886 return (long)futex_wait(uaddr, fshared, restart->futex.val, tp, 1914 restart->futex.val, tp, restart->futex.bitset);
1887 restart->futex.bitset,
1888 restart->futex.flags & FLAGS_CLOCKRT);
1889} 1915}
1890 1916
1891 1917
@@ -1895,12 +1921,12 @@ static long futex_wait_restart(struct restart_block *restart)
1895 * if there are waiters then it will block, it does PI, etc. (Due to 1921 * if there are waiters then it will block, it does PI, etc. (Due to
1896 * races the kernel might see a 0 value of the futex too.) 1922 * races the kernel might see a 0 value of the futex too.)
1897 */ 1923 */
1898static int futex_lock_pi(u32 __user *uaddr, int fshared, 1924static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
1899 int detect, ktime_t *time, int trylock) 1925 ktime_t *time, int trylock)
1900{ 1926{
1901 struct hrtimer_sleeper timeout, *to = NULL; 1927 struct hrtimer_sleeper timeout, *to = NULL;
1902 struct futex_hash_bucket *hb; 1928 struct futex_hash_bucket *hb;
1903 struct futex_q q; 1929 struct futex_q q = futex_q_init;
1904 int res, ret; 1930 int res, ret;
1905 1931
1906 if (refill_pi_state_cache()) 1932 if (refill_pi_state_cache())
@@ -1914,12 +1940,8 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1914 hrtimer_set_expires(&to->timer, *time); 1940 hrtimer_set_expires(&to->timer, *time);
1915 } 1941 }
1916 1942
1917 q.pi_state = NULL;
1918 q.rt_waiter = NULL;
1919 q.requeue_pi_key = NULL;
1920retry: 1943retry:
1921 q.key = FUTEX_KEY_INIT; 1944 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key);
1922 ret = get_futex_key(uaddr, fshared, &q.key);
1923 if (unlikely(ret != 0)) 1945 if (unlikely(ret != 0))
1924 goto out; 1946 goto out;
1925 1947
@@ -1941,7 +1963,7 @@ retry_private:
1941 * exit to complete. 1963 * exit to complete.
1942 */ 1964 */
1943 queue_unlock(&q, hb); 1965 queue_unlock(&q, hb);
1944 put_futex_key(fshared, &q.key); 1966 put_futex_key(&q.key);
1945 cond_resched(); 1967 cond_resched();
1946 goto retry; 1968 goto retry;
1947 default: 1969 default:
@@ -1971,7 +1993,7 @@ retry_private:
1971 * Fixup the pi_state owner and possibly acquire the lock if we 1993 * Fixup the pi_state owner and possibly acquire the lock if we
1972 * haven't already. 1994 * haven't already.
1973 */ 1995 */
1974 res = fixup_owner(uaddr, fshared, &q, !ret); 1996 res = fixup_owner(uaddr, &q, !ret);
1975 /* 1997 /*
1976 * If fixup_owner() returned an error, proprogate that. If it acquired 1998 * If fixup_owner() returned an error, proprogate that. If it acquired
1977 * the lock, clear our -ETIMEDOUT or -EINTR. 1999 * the lock, clear our -ETIMEDOUT or -EINTR.
@@ -1995,7 +2017,7 @@ out_unlock_put_key:
1995 queue_unlock(&q, hb); 2017 queue_unlock(&q, hb);
1996 2018
1997out_put_key: 2019out_put_key:
1998 put_futex_key(fshared, &q.key); 2020 put_futex_key(&q.key);
1999out: 2021out:
2000 if (to) 2022 if (to)
2001 destroy_hrtimer_on_stack(&to->timer); 2023 destroy_hrtimer_on_stack(&to->timer);
@@ -2008,10 +2030,10 @@ uaddr_faulted:
2008 if (ret) 2030 if (ret)
2009 goto out_put_key; 2031 goto out_put_key;
2010 2032
2011 if (!fshared) 2033 if (!(flags & FLAGS_SHARED))
2012 goto retry_private; 2034 goto retry_private;
2013 2035
2014 put_futex_key(fshared, &q.key); 2036 put_futex_key(&q.key);
2015 goto retry; 2037 goto retry;
2016} 2038}
2017 2039
@@ -2020,7 +2042,7 @@ uaddr_faulted:
2020 * This is the in-kernel slowpath: we look up the PI state (if any), 2042 * This is the in-kernel slowpath: we look up the PI state (if any),
2021 * and do the rt-mutex unlock. 2043 * and do the rt-mutex unlock.
2022 */ 2044 */
2023static int futex_unlock_pi(u32 __user *uaddr, int fshared) 2045static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2024{ 2046{
2025 struct futex_hash_bucket *hb; 2047 struct futex_hash_bucket *hb;
2026 struct futex_q *this, *next; 2048 struct futex_q *this, *next;
@@ -2038,7 +2060,7 @@ retry:
2038 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 2060 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
2039 return -EPERM; 2061 return -EPERM;
2040 2062
2041 ret = get_futex_key(uaddr, fshared, &key); 2063 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
2042 if (unlikely(ret != 0)) 2064 if (unlikely(ret != 0))
2043 goto out; 2065 goto out;
2044 2066
@@ -2093,14 +2115,14 @@ retry:
2093 2115
2094out_unlock: 2116out_unlock:
2095 spin_unlock(&hb->lock); 2117 spin_unlock(&hb->lock);
2096 put_futex_key(fshared, &key); 2118 put_futex_key(&key);
2097 2119
2098out: 2120out:
2099 return ret; 2121 return ret;
2100 2122
2101pi_faulted: 2123pi_faulted:
2102 spin_unlock(&hb->lock); 2124 spin_unlock(&hb->lock);
2103 put_futex_key(fshared, &key); 2125 put_futex_key(&key);
2104 2126
2105 ret = fault_in_user_writeable(uaddr); 2127 ret = fault_in_user_writeable(uaddr);
2106 if (!ret) 2128 if (!ret)
@@ -2160,7 +2182,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2160/** 2182/**
2161 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 2183 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2162 * @uaddr: the futex we initially wait on (non-pi) 2184 * @uaddr: the futex we initially wait on (non-pi)
2163 * @fshared: whether the futexes are shared (1) or not (0). They must be 2185 * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
2164 * the same type, no requeueing from private to shared, etc. 2186 * the same type, no requeueing from private to shared, etc.
2165 * @val: the expected value of uaddr 2187 * @val: the expected value of uaddr
2166 * @abs_time: absolute timeout 2188 * @abs_time: absolute timeout
@@ -2198,16 +2220,16 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2198 * 0 - On success 2220 * 0 - On success
2199 * <0 - On error 2221 * <0 - On error
2200 */ 2222 */
2201static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, 2223static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2202 u32 val, ktime_t *abs_time, u32 bitset, 2224 u32 val, ktime_t *abs_time, u32 bitset,
2203 int clockrt, u32 __user *uaddr2) 2225 u32 __user *uaddr2)
2204{ 2226{
2205 struct hrtimer_sleeper timeout, *to = NULL; 2227 struct hrtimer_sleeper timeout, *to = NULL;
2206 struct rt_mutex_waiter rt_waiter; 2228 struct rt_mutex_waiter rt_waiter;
2207 struct rt_mutex *pi_mutex = NULL; 2229 struct rt_mutex *pi_mutex = NULL;
2208 struct futex_hash_bucket *hb; 2230 struct futex_hash_bucket *hb;
2209 union futex_key key2; 2231 union futex_key key2 = FUTEX_KEY_INIT;
2210 struct futex_q q; 2232 struct futex_q q = futex_q_init;
2211 int res, ret; 2233 int res, ret;
2212 2234
2213 if (!bitset) 2235 if (!bitset)
@@ -2215,8 +2237,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2215 2237
2216 if (abs_time) { 2238 if (abs_time) {
2217 to = &timeout; 2239 to = &timeout;
2218 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : 2240 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2219 CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 2241 CLOCK_REALTIME : CLOCK_MONOTONIC,
2242 HRTIMER_MODE_ABS);
2220 hrtimer_init_sleeper(to, current); 2243 hrtimer_init_sleeper(to, current);
2221 hrtimer_set_expires_range_ns(&to->timer, *abs_time, 2244 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2222 current->timer_slack_ns); 2245 current->timer_slack_ns);
@@ -2229,12 +2252,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2229 debug_rt_mutex_init_waiter(&rt_waiter); 2252 debug_rt_mutex_init_waiter(&rt_waiter);
2230 rt_waiter.task = NULL; 2253 rt_waiter.task = NULL;
2231 2254
2232 key2 = FUTEX_KEY_INIT; 2255 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
2233 ret = get_futex_key(uaddr2, fshared, &key2);
2234 if (unlikely(ret != 0)) 2256 if (unlikely(ret != 0))
2235 goto out; 2257 goto out;
2236 2258
2237 q.pi_state = NULL;
2238 q.bitset = bitset; 2259 q.bitset = bitset;
2239 q.rt_waiter = &rt_waiter; 2260 q.rt_waiter = &rt_waiter;
2240 q.requeue_pi_key = &key2; 2261 q.requeue_pi_key = &key2;
@@ -2243,7 +2264,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2243 * Prepare to wait on uaddr. On success, increments q.key (key1) ref 2264 * Prepare to wait on uaddr. On success, increments q.key (key1) ref
2244 * count. 2265 * count.
2245 */ 2266 */
2246 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 2267 ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2247 if (ret) 2268 if (ret)
2248 goto out_key2; 2269 goto out_key2;
2249 2270
@@ -2273,8 +2294,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2273 */ 2294 */
2274 if (q.pi_state && (q.pi_state->owner != current)) { 2295 if (q.pi_state && (q.pi_state->owner != current)) {
2275 spin_lock(q.lock_ptr); 2296 spin_lock(q.lock_ptr);
2276 ret = fixup_pi_state_owner(uaddr2, &q, current, 2297 ret = fixup_pi_state_owner(uaddr2, &q, current);
2277 fshared);
2278 spin_unlock(q.lock_ptr); 2298 spin_unlock(q.lock_ptr);
2279 } 2299 }
2280 } else { 2300 } else {
@@ -2293,7 +2313,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2293 * Fixup the pi_state owner and possibly acquire the lock if we 2313 * Fixup the pi_state owner and possibly acquire the lock if we
2294 * haven't already. 2314 * haven't already.
2295 */ 2315 */
2296 res = fixup_owner(uaddr2, fshared, &q, !ret); 2316 res = fixup_owner(uaddr2, &q, !ret);
2297 /* 2317 /*
2298 * If fixup_owner() returned an error, proprogate that. If it 2318 * If fixup_owner() returned an error, proprogate that. If it
2299 * acquired the lock, clear -ETIMEDOUT or -EINTR. 2319 * acquired the lock, clear -ETIMEDOUT or -EINTR.
@@ -2324,9 +2344,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2324 } 2344 }
2325 2345
2326out_put_keys: 2346out_put_keys:
2327 put_futex_key(fshared, &q.key); 2347 put_futex_key(&q.key);
2328out_key2: 2348out_key2:
2329 put_futex_key(fshared, &key2); 2349 put_futex_key(&key2);
2330 2350
2331out: 2351out:
2332 if (to) { 2352 if (to) {
@@ -2489,7 +2509,8 @@ void exit_robust_list(struct task_struct *curr)
2489{ 2509{
2490 struct robust_list_head __user *head = curr->robust_list; 2510 struct robust_list_head __user *head = curr->robust_list;
2491 struct robust_list __user *entry, *next_entry, *pending; 2511 struct robust_list __user *entry, *next_entry, *pending;
2492 unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; 2512 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
2513 unsigned int uninitialized_var(next_pi);
2493 unsigned long futex_offset; 2514 unsigned long futex_offset;
2494 int rc; 2515 int rc;
2495 2516
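
The hunk above replaces the plain next_pi declaration with the uninitialized_var() annotation so gcc stops emitting a false-positive "may be used uninitialized" warning in exit_robust_list(). A minimal sketch of how the annotation works, assuming the macro definition from the compiler headers of that era (roughly x = x); demo() and its variables are made up for illustration:

/* Approximate definition from the kernel compiler headers of that era (assumption). */
#define uninitialized_var(x) x = x

static int demo(int flag)
{
        int plain;                        /* gcc may warn: used uninitialized */
        int uninitialized_var(quiet);     /* same flow, warning suppressed    */

        if (flag) {
                plain = 1;
                quiet = 2;
        }
        return flag ? plain + quiet : 0;  /* only read when initialized */
}
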
@@ -2550,58 +2571,57 @@ void exit_robust_list(struct task_struct *curr)
2550long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 2571long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2551 u32 __user *uaddr2, u32 val2, u32 val3) 2572 u32 __user *uaddr2, u32 val2, u32 val3)
2552{ 2573{
2553 int clockrt, ret = -ENOSYS; 2574 int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK;
2554 int cmd = op & FUTEX_CMD_MASK; 2575 unsigned int flags = 0;
2555 int fshared = 0;
2556 2576
2557 if (!(op & FUTEX_PRIVATE_FLAG)) 2577 if (!(op & FUTEX_PRIVATE_FLAG))
2558 fshared = 1; 2578 flags |= FLAGS_SHARED;
2559 2579
2560 clockrt = op & FUTEX_CLOCK_REALTIME; 2580 if (op & FUTEX_CLOCK_REALTIME) {
2561 if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) 2581 flags |= FLAGS_CLOCKRT;
2562 return -ENOSYS; 2582 if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
2583 return -ENOSYS;
2584 }
2563 2585
2564 switch (cmd) { 2586 switch (cmd) {
2565 case FUTEX_WAIT: 2587 case FUTEX_WAIT:
2566 val3 = FUTEX_BITSET_MATCH_ANY; 2588 val3 = FUTEX_BITSET_MATCH_ANY;
2567 case FUTEX_WAIT_BITSET: 2589 case FUTEX_WAIT_BITSET:
2568 ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt); 2590 ret = futex_wait(uaddr, flags, val, timeout, val3);
2569 break; 2591 break;
2570 case FUTEX_WAKE: 2592 case FUTEX_WAKE:
2571 val3 = FUTEX_BITSET_MATCH_ANY; 2593 val3 = FUTEX_BITSET_MATCH_ANY;
2572 case FUTEX_WAKE_BITSET: 2594 case FUTEX_WAKE_BITSET:
2573 ret = futex_wake(uaddr, fshared, val, val3); 2595 ret = futex_wake(uaddr, flags, val, val3);
2574 break; 2596 break;
2575 case FUTEX_REQUEUE: 2597 case FUTEX_REQUEUE:
2576 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); 2598 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
2577 break; 2599 break;
2578 case FUTEX_CMP_REQUEUE: 2600 case FUTEX_CMP_REQUEUE:
2579 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 2601 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
2580 0);
2581 break; 2602 break;
2582 case FUTEX_WAKE_OP: 2603 case FUTEX_WAKE_OP:
2583 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); 2604 ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
2584 break; 2605 break;
2585 case FUTEX_LOCK_PI: 2606 case FUTEX_LOCK_PI:
2586 if (futex_cmpxchg_enabled) 2607 if (futex_cmpxchg_enabled)
2587 ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); 2608 ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
2588 break; 2609 break;
2589 case FUTEX_UNLOCK_PI: 2610 case FUTEX_UNLOCK_PI:
2590 if (futex_cmpxchg_enabled) 2611 if (futex_cmpxchg_enabled)
2591 ret = futex_unlock_pi(uaddr, fshared); 2612 ret = futex_unlock_pi(uaddr, flags);
2592 break; 2613 break;
2593 case FUTEX_TRYLOCK_PI: 2614 case FUTEX_TRYLOCK_PI:
2594 if (futex_cmpxchg_enabled) 2615 if (futex_cmpxchg_enabled)
2595 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); 2616 ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
2596 break; 2617 break;
2597 case FUTEX_WAIT_REQUEUE_PI: 2618 case FUTEX_WAIT_REQUEUE_PI:
2598 val3 = FUTEX_BITSET_MATCH_ANY; 2619 val3 = FUTEX_BITSET_MATCH_ANY;
2599 ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, 2620 ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
2600 clockrt, uaddr2); 2621 uaddr2);
2601 break; 2622 break;
2602 case FUTEX_CMP_REQUEUE_PI: 2623 case FUTEX_CMP_REQUEUE_PI:
2603 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 2624 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
2604 1);
2605 break; 2625 break;
2606 default: 2626 default:
2607 ret = -ENOSYS; 2627 ret = -ENOSYS;
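
With this hunk do_futex() stops passing separate fshared and clockrt integers and derives a single flags word from the op argument; FLAGS_SHARED and FLAGS_CLOCKRT are the futex.c-internal bits used throughout the hunks above. A small sketch of the same decode, compilable in userspace; the FLAGS_* values are assumptions mirroring kernel/futex.c and decode_futex_op() is an illustrative name:

#include <errno.h>
#include <linux/futex.h>   /* FUTEX_PRIVATE_FLAG, FUTEX_CLOCK_REALTIME, FUTEX_CMD_MASK */

#define FLAGS_SHARED  0x01   /* assumed to match kernel/futex.c */
#define FLAGS_CLOCKRT 0x02

/* Returns the command, or -ENOSYS when CLOCK_REALTIME is used with the wrong command. */
static int decode_futex_op(int op, unsigned int *flags)
{
        int cmd = op & FUTEX_CMD_MASK;

        *flags = 0;
        if (!(op & FUTEX_PRIVATE_FLAG))
                *flags |= FLAGS_SHARED;

        if (op & FUTEX_CLOCK_REALTIME) {
                *flags |= FLAGS_CLOCKRT;
                if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
                        return -ENOSYS;
        }
        return cmd;
}
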
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 06da4dfc339b..a7934ac75e5b 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -49,7 +49,8 @@ void compat_exit_robust_list(struct task_struct *curr)
49{ 49{
50 struct compat_robust_list_head __user *head = curr->compat_robust_list; 50 struct compat_robust_list_head __user *head = curr->compat_robust_list;
51 struct robust_list __user *entry, *next_entry, *pending; 51 struct robust_list __user *entry, *next_entry, *pending;
52 unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; 52 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
53 unsigned int uninitialized_var(next_pi);
53 compat_uptr_t uentry, next_uentry, upending; 54 compat_uptr_t uentry, next_uentry, upending;
54 compat_long_t futex_offset; 55 compat_long_t futex_offset;
55 int rc; 56 int rc;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 72206cf5c6cf..0c8d7c048615 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -497,7 +497,7 @@ static inline int hrtimer_is_hres_enabled(void)
497 */ 497 */
498static inline int hrtimer_hres_active(void) 498static inline int hrtimer_hres_active(void)
499{ 499{
500 return __get_cpu_var(hrtimer_bases).hres_active; 500 return __this_cpu_read(hrtimer_bases.hres_active);
501} 501}
502 502
503/* 503/*
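
The one-line change above is part of a wider conversion in this merge (also visible below in irq_work.c and kprobes.c): __get_cpu_var() computes the address of this CPU's instance and then dereferences it, while __this_cpu_read() lets the architecture access the slot in place (a single segment-prefixed load on x86). A sketch on an illustrative per-CPU variable:

#include <linux/percpu.h>

static DEFINE_PER_CPU(int, demo_flag);

static int demo_old_style(void)
{
        return __get_cpu_var(demo_flag);    /* take the address of this CPU's copy, then load */
}

static int demo_new_style(void)
{
        return __this_cpu_read(demo_flag);  /* read the per-CPU slot directly */
}
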
@@ -516,10 +516,13 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
516 516
517 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 517 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
518 struct hrtimer *timer; 518 struct hrtimer *timer;
519 struct timerqueue_node *next;
519 520
520 if (!base->first) 521 next = timerqueue_getnext(&base->active);
522 if (!next)
521 continue; 523 continue;
522 timer = rb_entry(base->first, struct hrtimer, node); 524 timer = container_of(next, struct hrtimer, node);
525
523 expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 526 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
524 /* 527 /*
525 * clock_was_set() has changed base->offset so the 528 * clock_was_set() has changed base->offset so the
@@ -840,48 +843,17 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
840static int enqueue_hrtimer(struct hrtimer *timer, 843static int enqueue_hrtimer(struct hrtimer *timer,
841 struct hrtimer_clock_base *base) 844 struct hrtimer_clock_base *base)
842{ 845{
843 struct rb_node **link = &base->active.rb_node;
844 struct rb_node *parent = NULL;
845 struct hrtimer *entry;
846 int leftmost = 1;
847
848 debug_activate(timer); 846 debug_activate(timer);
849 847
850 /* 848 timerqueue_add(&base->active, &timer->node);
851 * Find the right place in the rbtree:
852 */
853 while (*link) {
854 parent = *link;
855 entry = rb_entry(parent, struct hrtimer, node);
856 /*
857 * We dont care about collisions. Nodes with
858 * the same expiry time stay together.
859 */
860 if (hrtimer_get_expires_tv64(timer) <
861 hrtimer_get_expires_tv64(entry)) {
862 link = &(*link)->rb_left;
863 } else {
864 link = &(*link)->rb_right;
865 leftmost = 0;
866 }
867 }
868
869 /*
870 * Insert the timer to the rbtree and check whether it
871 * replaces the first pending timer
872 */
873 if (leftmost)
874 base->first = &timer->node;
875 849
876 rb_link_node(&timer->node, parent, link);
877 rb_insert_color(&timer->node, &base->active);
878 /* 850 /*
879 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the 851 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
880 * state of a possibly running callback. 852 * state of a possibly running callback.
881 */ 853 */
882 timer->state |= HRTIMER_STATE_ENQUEUED; 854 timer->state |= HRTIMER_STATE_ENQUEUED;
883 855
884 return leftmost; 856 return (&timer->node == base->active.next);
885} 857}
886 858
887/* 859/*
@@ -901,12 +873,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
901 if (!(timer->state & HRTIMER_STATE_ENQUEUED)) 873 if (!(timer->state & HRTIMER_STATE_ENQUEUED))
902 goto out; 874 goto out;
903 875
904 /* 876 if (&timer->node == timerqueue_getnext(&base->active)) {
905 * Remove the timer from the rbtree and replace the first
906 * entry pointer if necessary.
907 */
908 if (base->first == &timer->node) {
909 base->first = rb_next(&timer->node);
910#ifdef CONFIG_HIGH_RES_TIMERS 877#ifdef CONFIG_HIGH_RES_TIMERS
911 /* Reprogram the clock event device. if enabled */ 878 /* Reprogram the clock event device. if enabled */
912 if (reprogram && hrtimer_hres_active()) { 879 if (reprogram && hrtimer_hres_active()) {
@@ -919,7 +886,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
919 } 886 }
920#endif 887#endif
921 } 888 }
922 rb_erase(&timer->node, &base->active); 889 timerqueue_del(&base->active, &timer->node);
923out: 890out:
924 timer->state = newstate; 891 timer->state = newstate;
925} 892}
@@ -1128,11 +1095,13 @@ ktime_t hrtimer_get_next_event(void)
1128 if (!hrtimer_hres_active()) { 1095 if (!hrtimer_hres_active()) {
1129 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 1096 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
1130 struct hrtimer *timer; 1097 struct hrtimer *timer;
1098 struct timerqueue_node *next;
1131 1099
1132 if (!base->first) 1100 next = timerqueue_getnext(&base->active);
1101 if (!next)
1133 continue; 1102 continue;
1134 1103
1135 timer = rb_entry(base->first, struct hrtimer, node); 1104 timer = container_of(next, struct hrtimer, node);
1136 delta.tv64 = hrtimer_get_expires_tv64(timer); 1105 delta.tv64 = hrtimer_get_expires_tv64(timer);
1137 delta = ktime_sub(delta, base->get_time()); 1106 delta = ktime_sub(delta, base->get_time());
1138 if (delta.tv64 < mindelta.tv64) 1107 if (delta.tv64 < mindelta.tv64)
@@ -1162,6 +1131,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1162 1131
1163 timer->base = &cpu_base->clock_base[clock_id]; 1132 timer->base = &cpu_base->clock_base[clock_id];
1164 hrtimer_init_timer_hres(timer); 1133 hrtimer_init_timer_hres(timer);
1134 timerqueue_init(&timer->node);
1165 1135
1166#ifdef CONFIG_TIMER_STATS 1136#ifdef CONFIG_TIMER_STATS
1167 timer->start_site = NULL; 1137 timer->start_site = NULL;
@@ -1278,14 +1248,14 @@ retry:
1278 1248
1279 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1249 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1280 ktime_t basenow; 1250 ktime_t basenow;
1281 struct rb_node *node; 1251 struct timerqueue_node *node;
1282 1252
1283 basenow = ktime_add(now, base->offset); 1253 basenow = ktime_add(now, base->offset);
1284 1254
1285 while ((node = base->first)) { 1255 while ((node = timerqueue_getnext(&base->active))) {
1286 struct hrtimer *timer; 1256 struct hrtimer *timer;
1287 1257
1288 timer = rb_entry(node, struct hrtimer, node); 1258 timer = container_of(node, struct hrtimer, node);
1289 1259
1290 /* 1260 /*
1291 * The immediate goal for using the softexpires is 1261 * The immediate goal for using the softexpires is
@@ -1441,7 +1411,7 @@ void hrtimer_run_pending(void)
1441 */ 1411 */
1442void hrtimer_run_queues(void) 1412void hrtimer_run_queues(void)
1443{ 1413{
1444 struct rb_node *node; 1414 struct timerqueue_node *node;
1445 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1415 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1446 struct hrtimer_clock_base *base; 1416 struct hrtimer_clock_base *base;
1447 int index, gettime = 1; 1417 int index, gettime = 1;
@@ -1451,8 +1421,7 @@ void hrtimer_run_queues(void)
1451 1421
1452 for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { 1422 for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
1453 base = &cpu_base->clock_base[index]; 1423 base = &cpu_base->clock_base[index];
1454 1424 if (!timerqueue_getnext(&base->active))
1455 if (!base->first)
1456 continue; 1425 continue;
1457 1426
1458 if (gettime) { 1427 if (gettime) {
@@ -1462,10 +1431,10 @@ void hrtimer_run_queues(void)
1462 1431
1463 raw_spin_lock(&cpu_base->lock); 1432 raw_spin_lock(&cpu_base->lock);
1464 1433
1465 while ((node = base->first)) { 1434 while ((node = timerqueue_getnext(&base->active))) {
1466 struct hrtimer *timer; 1435 struct hrtimer *timer;
1467 1436
1468 timer = rb_entry(node, struct hrtimer, node); 1437 timer = container_of(node, struct hrtimer, node);
1469 if (base->softirq_time.tv64 <= 1438 if (base->softirq_time.tv64 <=
1470 hrtimer_get_expires_tv64(timer)) 1439 hrtimer_get_expires_tv64(timer))
1471 break; 1440 break;
@@ -1630,8 +1599,10 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1630 1599
1631 raw_spin_lock_init(&cpu_base->lock); 1600 raw_spin_lock_init(&cpu_base->lock);
1632 1601
1633 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1602 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1634 cpu_base->clock_base[i].cpu_base = cpu_base; 1603 cpu_base->clock_base[i].cpu_base = cpu_base;
1604 timerqueue_init_head(&cpu_base->clock_base[i].active);
1605 }
1635 1606
1636 hrtimer_init_hres(cpu_base); 1607 hrtimer_init_hres(cpu_base);
1637} 1608}
@@ -1642,10 +1613,10 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1642 struct hrtimer_clock_base *new_base) 1613 struct hrtimer_clock_base *new_base)
1643{ 1614{
1644 struct hrtimer *timer; 1615 struct hrtimer *timer;
1645 struct rb_node *node; 1616 struct timerqueue_node *node;
1646 1617
1647 while ((node = rb_first(&old_base->active))) { 1618 while ((node = timerqueue_getnext(&old_base->active))) {
1648 timer = rb_entry(node, struct hrtimer, node); 1619 timer = container_of(node, struct hrtimer, node);
1649 BUG_ON(hrtimer_callback_running(timer)); 1620 BUG_ON(hrtimer_callback_running(timer));
1650 debug_deactivate(timer); 1621 debug_deactivate(timer);
1651 1622
@@ -1774,7 +1745,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
1774 } 1745 }
1775 1746
1776 /* 1747 /*
1777 * A NULL parameter means "inifinte" 1748 * A NULL parameter means "infinite"
1778 */ 1749 */
1779 if (!expires) { 1750 if (!expires) {
1780 schedule(); 1751 schedule();
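
The hrtimer changes above drop the open-coded rbtree bookkeeping (base->first plus rb_link_node()/rb_insert_color()) in favour of the new timerqueue helpers: timerqueue_add()/timerqueue_del() keep the tree sorted by node->expires and cache the leftmost (earliest) node, so callers just ask timerqueue_getnext() and recover the embedding structure with container_of(). A minimal sketch of that pattern; struct my_timer and the demo_* functions are illustrative, not kernel code:

#include <linux/kernel.h>
#include <linux/ktime.h>
#include <linux/timerqueue.h>

struct my_timer {
        struct timerqueue_node node;    /* holds the rb_node and the expiry time */
        void (*fn)(struct my_timer *);
};

static struct timerqueue_head demo_queue;

static void demo_queue_setup(void)
{
        timerqueue_init_head(&demo_queue);
}

static void demo_timer_add(struct my_timer *t, ktime_t expires)
{
        timerqueue_init(&t->node);
        t->node.expires = expires;
        timerqueue_add(&demo_queue, &t->node);   /* insert and update the cached leftmost */
}

static struct my_timer *demo_timer_peek(void)
{
        struct timerqueue_node *next = timerqueue_getnext(&demo_queue);

        return next ? container_of(next, struct my_timer, node) : NULL;
}

static void demo_timer_del(struct my_timer *t)
{
        timerqueue_del(&demo_queue, &t->node);   /* also refreshes the cached leftmost */
}
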
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 2c9120f0afca..086adf25a55e 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -620,7 +620,7 @@ static struct pmu perf_breakpoint = {
620 .read = hw_breakpoint_pmu_read, 620 .read = hw_breakpoint_pmu_read,
621}; 621};
622 622
623static int __init init_hw_breakpoint(void) 623int __init init_hw_breakpoint(void)
624{ 624{
625 unsigned int **task_bp_pinned; 625 unsigned int **task_bp_pinned;
626 int cpu, err_cpu; 626 int cpu, err_cpu;
@@ -641,7 +641,7 @@ static int __init init_hw_breakpoint(void)
641 641
642 constraints_initialized = 1; 642 constraints_initialized = 1;
643 643
644 perf_pmu_register(&perf_breakpoint); 644 perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);
645 645
646 return register_die_notifier(&hw_breakpoint_exceptions_nb); 646 return register_die_notifier(&hw_breakpoint_exceptions_nb);
647 647
@@ -655,6 +655,5 @@ static int __init init_hw_breakpoint(void)
655 655
656 return -ENOMEM; 656 return -ENOMEM;
657} 657}
658core_initcall(init_hw_breakpoint);
659 658
660 659
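
Two things change above: init_hw_breakpoint() loses its core_initcall and its static so the perf core can call it directly during its own init (the caller sits outside this hunk), and perf_pmu_register() now takes a name and a type id. A sketch of the new registration call shape; demo_pmu is illustrative, and a real PMU must fill in .event_init/.add/.del/.start/.stop/.read as the breakpoint PMU does:

#include <linux/init.h>
#include <linux/perf_event.h>

static struct pmu demo_pmu = {
        .task_ctx_nr = perf_sw_context,
        /* .event_init, .add, .del, .start, .stop, .read must be set for real use */
};

static int __init demo_pmu_init(void)
{
        /* A negative type asks the core to allocate a dynamic PERF_TYPE_* id. */
        return perf_pmu_register(&demo_pmu, "demo", -1);
}
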
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 31d766bf5d2e..8e42fec7686d 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -9,9 +9,6 @@ menu "IRQ subsystem"
9config GENERIC_HARDIRQS 9config GENERIC_HARDIRQS
10 def_bool y 10 def_bool y
11 11
12config GENERIC_HARDIRQS_NO__DO_IRQ
13 def_bool y
14
15# Select this to disable the deprecated stuff 12# Select this to disable the deprecated stuff
16config GENERIC_HARDIRQS_NO_DEPRECATED 13config GENERIC_HARDIRQS_NO_DEPRECATED
17 def_bool n 14 def_bool n
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index e2347eb63306..3540a7190122 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -118,114 +118,3 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
118 118
119 return retval; 119 return retval;
120} 120}
121
122#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
123
124#ifdef CONFIG_ENABLE_WARN_DEPRECATED
125# warning __do_IRQ is deprecated. Please convert to proper flow handlers
126#endif
127
128/**
129 * __do_IRQ - original all in one highlevel IRQ handler
130 * @irq: the interrupt number
131 *
132 * __do_IRQ handles all normal device IRQ's (the special
133 * SMP cross-CPU interrupts have their own specific
134 * handlers).
135 *
136 * This is the original x86 implementation which is used for every
137 * interrupt type.
138 */
139unsigned int __do_IRQ(unsigned int irq)
140{
141 struct irq_desc *desc = irq_to_desc(irq);
142 struct irqaction *action;
143 unsigned int status;
144
145 kstat_incr_irqs_this_cpu(irq, desc);
146
147 if (CHECK_IRQ_PER_CPU(desc->status)) {
148 irqreturn_t action_ret;
149
150 /*
151 * No locking required for CPU-local interrupts:
152 */
153 if (desc->irq_data.chip->ack)
154 desc->irq_data.chip->ack(irq);
155 if (likely(!(desc->status & IRQ_DISABLED))) {
156 action_ret = handle_IRQ_event(irq, desc->action);
157 if (!noirqdebug)
158 note_interrupt(irq, desc, action_ret);
159 }
160 desc->irq_data.chip->end(irq);
161 return 1;
162 }
163
164 raw_spin_lock(&desc->lock);
165 if (desc->irq_data.chip->ack)
166 desc->irq_data.chip->ack(irq);
167 /*
168 * REPLAY is when Linux resends an IRQ that was dropped earlier
169 * WAITING is used by probe to mark irqs that are being tested
170 */
171 status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING);
172 status |= IRQ_PENDING; /* we _want_ to handle it */
173
174 /*
175 * If the IRQ is disabled for whatever reason, we cannot
176 * use the action we have.
177 */
178 action = NULL;
179 if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) {
180 action = desc->action;
181 status &= ~IRQ_PENDING; /* we commit to handling */
182 status |= IRQ_INPROGRESS; /* we are handling it */
183 }
184 desc->status = status;
185
186 /*
187 * If there is no IRQ handler or it was disabled, exit early.
188 * Since we set PENDING, if another processor is handling
189 * a different instance of this same irq, the other processor
190 * will take care of it.
191 */
192 if (unlikely(!action))
193 goto out;
194
195 /*
196 * Edge triggered interrupts need to remember
197 * pending events.
198 * This applies to any hw interrupts that allow a second
199 * instance of the same irq to arrive while we are in do_IRQ
200 * or in the handler. But the code here only handles the _second_
201 * instance of the irq, not the third or fourth. So it is mostly
202 * useful for irq hardware that does not mask cleanly in an
203 * SMP environment.
204 */
205 for (;;) {
206 irqreturn_t action_ret;
207
208 raw_spin_unlock(&desc->lock);
209
210 action_ret = handle_IRQ_event(irq, action);
211 if (!noirqdebug)
212 note_interrupt(irq, desc, action_ret);
213
214 raw_spin_lock(&desc->lock);
215 if (likely(!(desc->status & IRQ_PENDING)))
216 break;
217 desc->status &= ~IRQ_PENDING;
218 }
219 desc->status &= ~IRQ_INPROGRESS;
220
221out:
222 /*
223 * The ->end() handler has to deal with interrupts which got
224 * disabled while the handler was running.
225 */
226 desc->irq_data.chip->end(irq);
227 raw_spin_unlock(&desc->lock);
228
229 return 1;
230}
231#endif
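
This removes the legacy __do_IRQ() path entirely, together with the GENERIC_HARDIRQS_NO__DO_IRQ option above: every architecture is now expected to use per-descriptor flow handlers. A rough sketch of that model with an illustrative stub chip; note the setter was renamed to irq_set_chip_and_handler() shortly after this release, so treat the exact helper name as version-dependent:

#include <linux/irq.h>
#include <linux/interrupt.h>

static struct irq_chip demo_chip = {
        .name = "demo",
        /* a real chip supplies .irq_mask/.irq_unmask/.irq_ack for the controller hardware */
};

static void demo_setup_irq(unsigned int irq)
{
        /* Pick the flow handler matching the line's trigger semantics. */
        set_irq_chip_and_handler(irq, &demo_chip, handle_level_irq);
}

/* The arch's interrupt entry code hands the decoded irq number to the core. */
static void demo_arch_entry(unsigned int irq)
{
        generic_handle_irq(irq);        /* runs the per-descriptor flow handler */
}
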
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9988d03797f5..282f20230e67 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -72,6 +72,8 @@ static inline int desc_node(struct irq_desc *desc) { return 0; }
72 72
73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) 73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
74{ 74{
75 int cpu;
76
75 desc->irq_data.irq = irq; 77 desc->irq_data.irq = irq;
76 desc->irq_data.chip = &no_irq_chip; 78 desc->irq_data.chip = &no_irq_chip;
77 desc->irq_data.chip_data = NULL; 79 desc->irq_data.chip_data = NULL;
@@ -83,7 +85,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
83 desc->irq_count = 0; 85 desc->irq_count = 0;
84 desc->irqs_unhandled = 0; 86 desc->irqs_unhandled = 0;
85 desc->name = NULL; 87 desc->name = NULL;
86 memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); 88 for_each_possible_cpu(cpu)
89 *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
87 desc_smp_init(desc, node); 90 desc_smp_init(desc, node);
88} 91}
89 92
@@ -133,8 +136,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
133 if (!desc) 136 if (!desc)
134 return NULL; 137 return NULL;
135 /* allocate based on nr_cpu_ids */ 138 /* allocate based on nr_cpu_ids */
136 desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs), 139 desc->kstat_irqs = alloc_percpu(unsigned int);
137 gfp, node);
138 if (!desc->kstat_irqs) 140 if (!desc->kstat_irqs)
139 goto err_desc; 141 goto err_desc;
140 142
@@ -149,7 +151,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
149 return desc; 151 return desc;
150 152
151err_kstat: 153err_kstat:
152 kfree(desc->kstat_irqs); 154 free_percpu(desc->kstat_irqs);
153err_desc: 155err_desc:
154 kfree(desc); 156 kfree(desc);
155 return NULL; 157 return NULL;
@@ -166,7 +168,7 @@ static void free_desc(unsigned int irq)
166 mutex_unlock(&sparse_irq_lock); 168 mutex_unlock(&sparse_irq_lock);
167 169
168 free_masks(desc); 170 free_masks(desc);
169 kfree(desc->kstat_irqs); 171 free_percpu(desc->kstat_irqs);
170 kfree(desc); 172 kfree(desc);
171} 173}
172 174
@@ -234,7 +236,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
234 } 236 }
235}; 237};
236 238
237static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
238int __init early_irq_init(void) 239int __init early_irq_init(void)
239{ 240{
240 int count, i, node = first_online_node; 241 int count, i, node = first_online_node;
@@ -250,7 +251,8 @@ int __init early_irq_init(void)
250 for (i = 0; i < count; i++) { 251 for (i = 0; i < count; i++) {
251 desc[i].irq_data.irq = i; 252 desc[i].irq_data.irq = i;
252 desc[i].irq_data.chip = &no_irq_chip; 253 desc[i].irq_data.chip = &no_irq_chip;
253 desc[i].kstat_irqs = kstat_irqs_all[i]; 254 /* TODO : do this allocation on-demand ... */
255 desc[i].kstat_irqs = alloc_percpu(unsigned int);
254 alloc_masks(desc + i, GFP_KERNEL, node); 256 alloc_masks(desc + i, GFP_KERNEL, node);
255 desc_smp_init(desc + i, node); 257 desc_smp_init(desc + i, node);
256 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 258 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
@@ -275,6 +277,22 @@ static void free_desc(unsigned int irq)
275 277
276static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) 278static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
277{ 279{
280#if defined(CONFIG_KSTAT_IRQS_ONDEMAND)
281 struct irq_desc *desc;
282 unsigned int i;
283
284 for (i = 0; i < cnt; i++) {
285 desc = irq_to_desc(start + i);
286 if (desc && !desc->kstat_irqs) {
287 unsigned int __percpu *stats = alloc_percpu(unsigned int);
288
289 if (!stats)
290 return -1;
291 if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL)
292 free_percpu(stats);
293 }
294 }
295#endif
278 return start; 296 return start;
279} 297}
280#endif /* !CONFIG_SPARSE_IRQ */ 298#endif /* !CONFIG_SPARSE_IRQ */
@@ -391,7 +409,9 @@ void dynamic_irq_cleanup(unsigned int irq)
391unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) 409unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
392{ 410{
393 struct irq_desc *desc = irq_to_desc(irq); 411 struct irq_desc *desc = irq_to_desc(irq);
394 return desc ? desc->kstat_irqs[cpu] : 0; 412
413 return desc && desc->kstat_irqs ?
414 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
395} 415}
396 416
397#ifdef CONFIG_GENERIC_HARDIRQS 417#ifdef CONFIG_GENERIC_HARDIRQS
@@ -401,10 +421,10 @@ unsigned int kstat_irqs(unsigned int irq)
401 int cpu; 421 int cpu;
402 int sum = 0; 422 int sum = 0;
403 423
404 if (!desc) 424 if (!desc || !desc->kstat_irqs)
405 return 0; 425 return 0;
406 for_each_possible_cpu(cpu) 426 for_each_possible_cpu(cpu)
407 sum += desc->kstat_irqs[cpu]; 427 sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
408 return sum; 428 return sum;
409} 429}
410#endif /* CONFIG_GENERIC_HARDIRQS */ 430#endif /* CONFIG_GENERIC_HARDIRQS */
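
The irqdesc changes above convert desc->kstat_irqs from a flat nr_cpu_ids array (and the static NR_IRQS x NR_CPUS table) into a dynamically allocated per-CPU counter: alloc_percpu() on creation, __this_cpu_inc() on the hot path, per_cpu_ptr() plus for_each_possible_cpu() when summing, free_percpu() on teardown. A self-contained sketch of that pattern with illustrative names:

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/percpu.h>

static unsigned int __percpu *demo_stat;

static int demo_stat_init(void)
{
        demo_stat = alloc_percpu(unsigned int);
        return demo_stat ? 0 : -ENOMEM;
}

/* Hot path: bump this CPU's slot without touching other CPUs' cache lines. */
static void demo_stat_inc(void)
{
        __this_cpu_inc(*demo_stat);
}

/* Slow path (e.g. /proc output): fold every possible CPU into one total. */
static unsigned int demo_stat_sum(void)
{
        unsigned int cpu, sum = 0;

        for_each_possible_cpu(cpu)
                sum += *per_cpu_ptr(demo_stat, cpu);
        return sum;
}

static void demo_stat_exit(void)
{
        free_percpu(demo_stat);
}
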
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5f92acc5f952..0caa59f747dd 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -577,7 +577,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
577 */ 577 */
578static int irq_thread(void *data) 578static int irq_thread(void *data)
579{ 579{
580 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; 580 static const struct sched_param param = {
581 .sched_priority = MAX_USER_RT_PRIO/2,
582 };
581 struct irqaction *action = data; 583 struct irqaction *action = data;
582 struct irq_desc *desc = irq_to_desc(action->irq); 584 struct irq_desc *desc = irq_to_desc(action->irq);
583 int wake, oneshot = desc->status & IRQ_ONESHOT; 585 int wake, oneshot = desc->status & IRQ_ONESHOT;
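
The change above turns irq_thread()'s scheduling parameter into a static const object, so it lives in rodata instead of being rebuilt on each thread's stack. The same idea in isolation; demo_make_rt() is illustrative, and the sched_setscheduler() call that irq_thread() goes on to make sits outside this hunk:

#include <linux/sched.h>

static const struct sched_param demo_param = {
        .sched_priority = MAX_USER_RT_PRIO / 2,
};

static int demo_make_rt(struct task_struct *task)
{
        return sched_setscheduler(task, SCHED_FIFO, &demo_param);
}
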
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 01b1d3a88983..6c8a2a9f8a7b 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -214,7 +214,7 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v)
214 214
215static int irq_spurious_proc_open(struct inode *inode, struct file *file) 215static int irq_spurious_proc_open(struct inode *inode, struct file *file)
216{ 216{
217 return single_open(file, irq_spurious_proc_show, NULL); 217 return single_open(file, irq_spurious_proc_show, PDE(inode)->data);
218} 218}
219 219
220static const struct file_operations irq_spurious_proc_fops = { 220static const struct file_operations irq_spurious_proc_fops = {
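
The fix above passes the per-IRQ data pointer stored at proc_create_data() time into single_open(), instead of NULL, so the show routine knows which interrupt it is printing. The seq_file plumbing it relies on, with illustrative names:

#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_proc_show(struct seq_file *m, void *v)
{
        long irq = (long)m->private;    /* whatever was handed to single_open() */

        seq_printf(m, "irq %ld\n", irq);
        return 0;
}

static int demo_proc_open(struct inode *inode, struct file *file)
{
        /* PDE(inode)->data was set by the matching proc_create_data() call. */
        return single_open(file, demo_proc_show, PDE(inode)->data);
}
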
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index f16763ff8481..c58fa7da8aef 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -77,21 +77,21 @@ void __weak arch_irq_work_raise(void)
77 */ 77 */
78static void __irq_work_queue(struct irq_work *entry) 78static void __irq_work_queue(struct irq_work *entry)
79{ 79{
80 struct irq_work **head, *next; 80 struct irq_work *next;
81 81
82 head = &get_cpu_var(irq_work_list); 82 preempt_disable();
83 83
84 do { 84 do {
85 next = *head; 85 next = __this_cpu_read(irq_work_list);
86 /* Can assign non-atomic because we keep the flags set. */ 86 /* Can assign non-atomic because we keep the flags set. */
87 entry->next = next_flags(next, IRQ_WORK_FLAGS); 87 entry->next = next_flags(next, IRQ_WORK_FLAGS);
88 } while (cmpxchg(head, next, entry) != next); 88 } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
89 89
90 /* The list was empty, raise self-interrupt to start processing. */ 90 /* The list was empty, raise self-interrupt to start processing. */
91 if (!irq_work_next(entry)) 91 if (!irq_work_next(entry))
92 arch_irq_work_raise(); 92 arch_irq_work_raise();
93 93
94 put_cpu_var(irq_work_list); 94 preempt_enable();
95} 95}
96 96
97/* 97/*
@@ -120,16 +120,16 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
120 */ 120 */
121void irq_work_run(void) 121void irq_work_run(void)
122{ 122{
123 struct irq_work *list, **head; 123 struct irq_work *list;
124 124
125 head = &__get_cpu_var(irq_work_list); 125 if (this_cpu_read(irq_work_list) == NULL)
126 if (*head == NULL)
127 return; 126 return;
128 127
129 BUG_ON(!in_irq()); 128 BUG_ON(!in_irq());
130 BUG_ON(!irqs_disabled()); 129 BUG_ON(!irqs_disabled());
131 130
132 list = xchg(head, NULL); 131 list = this_cpu_xchg(irq_work_list, NULL);
132
133 while (list != NULL) { 133 while (list != NULL) {
134 struct irq_work *entry = list; 134 struct irq_work *entry = list;
135 135
@@ -145,7 +145,9 @@ void irq_work_run(void)
145 * Clear the BUSY bit and return to the free state if 145 * Clear the BUSY bit and return to the free state if
146 * no-one else claimed it meanwhile. 146 * no-one else claimed it meanwhile.
147 */ 147 */
148 cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL); 148 (void)cmpxchg(&entry->next,
149 next_flags(NULL, IRQ_WORK_BUSY),
150 NULL);
149 } 151 }
150} 152}
151EXPORT_SYMBOL_GPL(irq_work_run); 153EXPORT_SYMBOL_GPL(irq_work_run);
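
irq_work now manipulates its per-CPU list head with this_cpu_read()/this_cpu_cmpxchg()/this_cpu_xchg() under preempt_disable() rather than taking the variable's address via get_cpu_var(). The core pattern, on an illustrative per-CPU stack (this is not the kernel's irq_work list, and it omits the flag bits packed into the pointers):

#include <linux/percpu.h>
#include <linux/preempt.h>

struct demo_work {
        struct demo_work *next;
};

static DEFINE_PER_CPU(struct demo_work *, demo_list);

/* Push onto this CPU's list; the cmpxchg guards against local interrupt handlers. */
static void demo_push(struct demo_work *w)
{
        struct demo_work *head;

        preempt_disable();
        do {
                head = __this_cpu_read(demo_list);
                w->next = head;
        } while (this_cpu_cmpxchg(demo_list, head, w) != head);
        preempt_enable();
}

/* Detach the whole list in one shot, as irq_work_run() does with this_cpu_xchg(). */
static struct demo_work *demo_pop_all(void)
{
        return this_cpu_xchg(demo_list, NULL);
}
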
diff --git a/kernel/kexec.c b/kernel/kexec.c
index b55045bc7563..ec19b92c7ebd 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -163,7 +163,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
163 * just verifies it is an address we can use. 163 * just verifies it is an address we can use.
164 * 164 *
165 * Since the kernel does everything in page size chunks ensure 165 * Since the kernel does everything in page size chunks ensure
166 * the destination addreses are page aligned. Too many 166 * the destination addresses are page aligned. Too many
167 * special cases crop of when we don't do this. The most 167 * special cases crop of when we don't do this. The most
168 * insidious is getting overlapping destination addresses 168 * insidious is getting overlapping destination addresses
169 * simply because addresses are changed to page size 169 * simply because addresses are changed to page size
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9737a76e106f..77981813a1e7 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -317,12 +317,12 @@ void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
317/* We have preemption disabled.. so it is safe to use __ versions */ 317/* We have preemption disabled.. so it is safe to use __ versions */
318static inline void set_kprobe_instance(struct kprobe *kp) 318static inline void set_kprobe_instance(struct kprobe *kp)
319{ 319{
320 __get_cpu_var(kprobe_instance) = kp; 320 __this_cpu_write(kprobe_instance, kp);
321} 321}
322 322
323static inline void reset_kprobe_instance(void) 323static inline void reset_kprobe_instance(void)
324{ 324{
325 __get_cpu_var(kprobe_instance) = NULL; 325 __this_cpu_write(kprobe_instance, NULL);
326} 326}
327 327
328/* 328/*
@@ -354,13 +354,20 @@ static inline int kprobe_aggrprobe(struct kprobe *p)
354 return p->pre_handler == aggr_pre_handler; 354 return p->pre_handler == aggr_pre_handler;
355} 355}
356 356
357/* Return true(!0) if the kprobe is unused */
358static inline int kprobe_unused(struct kprobe *p)
359{
360 return kprobe_aggrprobe(p) && kprobe_disabled(p) &&
361 list_empty(&p->list);
362}
363
357/* 364/*
358 * Keep all fields in the kprobe consistent 365 * Keep all fields in the kprobe consistent
359 */ 366 */
360static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) 367static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
361{ 368{
362 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); 369 memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t));
363 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); 370 memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn));
364} 371}
365 372
366#ifdef CONFIG_OPTPROBES 373#ifdef CONFIG_OPTPROBES
@@ -384,6 +391,17 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
384 } 391 }
385} 392}
386 393
394/* Free optimized instructions and optimized_kprobe */
395static __kprobes void free_aggr_kprobe(struct kprobe *p)
396{
397 struct optimized_kprobe *op;
398
399 op = container_of(p, struct optimized_kprobe, kp);
400 arch_remove_optimized_kprobe(op);
401 arch_remove_kprobe(p);
402 kfree(op);
403}
404
387/* Return true(!0) if the kprobe is ready for optimization. */ 405/* Return true(!0) if the kprobe is ready for optimization. */
388static inline int kprobe_optready(struct kprobe *p) 406static inline int kprobe_optready(struct kprobe *p)
389{ 407{
@@ -397,6 +415,33 @@ static inline int kprobe_optready(struct kprobe *p)
397 return 0; 415 return 0;
398} 416}
399 417
418/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */
419static inline int kprobe_disarmed(struct kprobe *p)
420{
421 struct optimized_kprobe *op;
422
423 /* If kprobe is not aggr/opt probe, just return kprobe is disabled */
424 if (!kprobe_aggrprobe(p))
425 return kprobe_disabled(p);
426
427 op = container_of(p, struct optimized_kprobe, kp);
428
429 return kprobe_disabled(p) && list_empty(&op->list);
430}
431
432/* Return true(!0) if the probe is queued on (un)optimizing lists */
433static int __kprobes kprobe_queued(struct kprobe *p)
434{
435 struct optimized_kprobe *op;
436
437 if (kprobe_aggrprobe(p)) {
438 op = container_of(p, struct optimized_kprobe, kp);
439 if (!list_empty(&op->list))
440 return 1;
441 }
442 return 0;
443}
444
400/* 445/*
401 * Return an optimized kprobe whose optimizing code replaces 446 * Return an optimized kprobe whose optimizing code replaces
402 * instructions including addr (exclude breakpoint). 447 * instructions including addr (exclude breakpoint).
@@ -422,30 +467,23 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
422 467
423/* Optimization staging list, protected by kprobe_mutex */ 468/* Optimization staging list, protected by kprobe_mutex */
424static LIST_HEAD(optimizing_list); 469static LIST_HEAD(optimizing_list);
470static LIST_HEAD(unoptimizing_list);
425 471
426static void kprobe_optimizer(struct work_struct *work); 472static void kprobe_optimizer(struct work_struct *work);
427static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); 473static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
474static DECLARE_COMPLETION(optimizer_comp);
428#define OPTIMIZE_DELAY 5 475#define OPTIMIZE_DELAY 5
429 476
430/* Kprobe jump optimizer */ 477/*
431static __kprobes void kprobe_optimizer(struct work_struct *work) 478 * Optimize (replace a breakpoint with a jump) kprobes listed on
479 * optimizing_list.
480 */
481static __kprobes void do_optimize_kprobes(void)
432{ 482{
433 struct optimized_kprobe *op, *tmp; 483 /* Optimization never be done when disarmed */
434 484 if (kprobes_all_disarmed || !kprobes_allow_optimization ||
435 /* Lock modules while optimizing kprobes */ 485 list_empty(&optimizing_list))
436 mutex_lock(&module_mutex); 486 return;
437 mutex_lock(&kprobe_mutex);
438 if (kprobes_all_disarmed || !kprobes_allow_optimization)
439 goto end;
440
441 /*
442 * Wait for quiesence period to ensure all running interrupts
443 * are done. Because optprobe may modify multiple instructions
444 * there is a chance that Nth instruction is interrupted. In that
445 * case, running interrupt can return to 2nd-Nth byte of jump
446 * instruction. This wait is for avoiding it.
447 */
448 synchronize_sched();
449 487
450 /* 488 /*
451 * The optimization/unoptimization refers online_cpus via 489 * The optimization/unoptimization refers online_cpus via
@@ -459,17 +497,111 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
459 */ 497 */
460 get_online_cpus(); 498 get_online_cpus();
461 mutex_lock(&text_mutex); 499 mutex_lock(&text_mutex);
462 list_for_each_entry_safe(op, tmp, &optimizing_list, list) { 500 arch_optimize_kprobes(&optimizing_list);
463 WARN_ON(kprobe_disabled(&op->kp)); 501 mutex_unlock(&text_mutex);
464 if (arch_optimize_kprobe(op) < 0) 502 put_online_cpus();
465 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 503}
466 list_del_init(&op->list); 504
505/*
506 * Unoptimize (replace a jump with a breakpoint and remove the breakpoint
507 * if need) kprobes listed on unoptimizing_list.
508 */
509static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
510{
511 struct optimized_kprobe *op, *tmp;
512
513 /* Unoptimization must be done anytime */
514 if (list_empty(&unoptimizing_list))
515 return;
516
517 /* Ditto to do_optimize_kprobes */
518 get_online_cpus();
519 mutex_lock(&text_mutex);
520 arch_unoptimize_kprobes(&unoptimizing_list, free_list);
521 /* Loop free_list for disarming */
522 list_for_each_entry_safe(op, tmp, free_list, list) {
523 /* Disarm probes if marked disabled */
524 if (kprobe_disabled(&op->kp))
525 arch_disarm_kprobe(&op->kp);
526 if (kprobe_unused(&op->kp)) {
527 /*
528 * Remove unused probes from hash list. After waiting
529 * for synchronization, these probes are reclaimed.
530 * (reclaiming is done by do_free_cleaned_kprobes.)
531 */
532 hlist_del_rcu(&op->kp.hlist);
533 } else
534 list_del_init(&op->list);
467 } 535 }
468 mutex_unlock(&text_mutex); 536 mutex_unlock(&text_mutex);
469 put_online_cpus(); 537 put_online_cpus();
470end: 538}
539
540/* Reclaim all kprobes on the free_list */
541static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list)
542{
543 struct optimized_kprobe *op, *tmp;
544
545 list_for_each_entry_safe(op, tmp, free_list, list) {
546 BUG_ON(!kprobe_unused(&op->kp));
547 list_del_init(&op->list);
548 free_aggr_kprobe(&op->kp);
549 }
550}
551
552/* Start optimizer after OPTIMIZE_DELAY passed */
553static __kprobes void kick_kprobe_optimizer(void)
554{
555 if (!delayed_work_pending(&optimizing_work))
556 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
557}
558
559/* Kprobe jump optimizer */
560static __kprobes void kprobe_optimizer(struct work_struct *work)
561{
562 LIST_HEAD(free_list);
563
564 /* Lock modules while optimizing kprobes */
565 mutex_lock(&module_mutex);
566 mutex_lock(&kprobe_mutex);
567
568 /*
569 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
570 * kprobes before waiting for quiesence period.
571 */
572 do_unoptimize_kprobes(&free_list);
573
574 /*
575 * Step 2: Wait for quiesence period to ensure all running interrupts
576 * are done. Because optprobe may modify multiple instructions
577 * there is a chance that Nth instruction is interrupted. In that
578 * case, running interrupt can return to 2nd-Nth byte of jump
579 * instruction. This wait is for avoiding it.
580 */
581 synchronize_sched();
582
583 /* Step 3: Optimize kprobes after quiesence period */
584 do_optimize_kprobes();
585
586 /* Step 4: Free cleaned kprobes after quiesence period */
587 do_free_cleaned_kprobes(&free_list);
588
471 mutex_unlock(&kprobe_mutex); 589 mutex_unlock(&kprobe_mutex);
472 mutex_unlock(&module_mutex); 590 mutex_unlock(&module_mutex);
591
592 /* Step 5: Kick optimizer again if needed */
593 if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
594 kick_kprobe_optimizer();
595 else
596 /* Wake up all waiters */
597 complete_all(&optimizer_comp);
598}
599
600/* Wait for completing optimization and unoptimization */
601static __kprobes void wait_for_kprobe_optimizer(void)
602{
603 if (delayed_work_pending(&optimizing_work))
604 wait_for_completion(&optimizer_comp);
473} 605}
474 606
475/* Optimize kprobe if p is ready to be optimized */ 607/* Optimize kprobe if p is ready to be optimized */
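
The rework above splits the optimizer into staged helpers: unoptimizing requests collect on unoptimizing_list, the delayed work first undoes those (gathering dead probes on a local free_list), waits one synchronize_sched() grace period so no CPU can still be executing inside a patched jump, then optimizes the queued probes, frees the collected ones, and either re-kicks itself or wakes waiters through a completion. A generic sketch of that batch-plus-grace-period scheme with made-up names (the real code drives two lists plus text_mutex and get_online_cpus(), omitted here):

#include <linux/completion.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct demo_req {
        struct list_head list;
};

static LIST_HEAD(demo_pending);
static DEFINE_MUTEX(demo_mutex);
static DECLARE_COMPLETION(demo_done);
static void demo_worker(struct work_struct *work);
static DECLARE_DELAYED_WORK(demo_work, demo_worker);
#define DEMO_DELAY 5

static void demo_kick(void)
{
        if (!delayed_work_pending(&demo_work))
                schedule_delayed_work(&demo_work, DEMO_DELAY);
}

static void demo_queue(struct demo_req *req)
{
        mutex_lock(&demo_mutex);
        list_add(&req->list, &demo_pending);
        demo_kick();
        mutex_unlock(&demo_mutex);
}

static void demo_worker(struct work_struct *work)
{
        struct demo_req *req, *tmp;
        LIST_HEAD(batch);

        mutex_lock(&demo_mutex);
        list_splice_init(&demo_pending, &batch);
        mutex_unlock(&demo_mutex);

        synchronize_sched();            /* let every in-flight user drain first */

        list_for_each_entry_safe(req, tmp, &batch, list) {
                list_del(&req->list);
                kfree(req);             /* "process" the request */
        }

        mutex_lock(&demo_mutex);
        if (!list_empty(&demo_pending))
                demo_kick();            /* more work arrived meanwhile: run again */
        else
                complete_all(&demo_done);
        mutex_unlock(&demo_mutex);
}

/* Same waiting scheme as the hunk above: block only while a batch is still pending. */
static void demo_wait(void)
{
        if (delayed_work_pending(&demo_work))
                wait_for_completion(&demo_done);
}
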
@@ -495,42 +627,99 @@ static __kprobes void optimize_kprobe(struct kprobe *p)
495 /* Check if it is already optimized. */ 627 /* Check if it is already optimized. */
496 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) 628 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
497 return; 629 return;
498
499 op->kp.flags |= KPROBE_FLAG_OPTIMIZED; 630 op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
500 list_add(&op->list, &optimizing_list); 631
501 if (!delayed_work_pending(&optimizing_work)) 632 if (!list_empty(&op->list))
502 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); 633 /* This is under unoptimizing. Just dequeue the probe */
634 list_del_init(&op->list);
635 else {
636 list_add(&op->list, &optimizing_list);
637 kick_kprobe_optimizer();
638 }
639}
640
641/* Short cut to direct unoptimizing */
642static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op)
643{
644 get_online_cpus();
645 arch_unoptimize_kprobe(op);
646 put_online_cpus();
647 if (kprobe_disabled(&op->kp))
648 arch_disarm_kprobe(&op->kp);
503} 649}
504 650
505/* Unoptimize a kprobe if p is optimized */ 651/* Unoptimize a kprobe if p is optimized */
506static __kprobes void unoptimize_kprobe(struct kprobe *p) 652static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force)
507{ 653{
508 struct optimized_kprobe *op; 654 struct optimized_kprobe *op;
509 655
510 if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) { 656 if (!kprobe_aggrprobe(p) || kprobe_disarmed(p))
511 op = container_of(p, struct optimized_kprobe, kp); 657 return; /* This is not an optprobe nor optimized */
512 if (!list_empty(&op->list)) 658
513 /* Dequeue from the optimization queue */ 659 op = container_of(p, struct optimized_kprobe, kp);
660 if (!kprobe_optimized(p)) {
661 /* Unoptimized or unoptimizing case */
662 if (force && !list_empty(&op->list)) {
663 /*
664 * Only if this is unoptimizing kprobe and forced,
665 * forcibly unoptimize it. (No need to unoptimize
666 * unoptimized kprobe again :)
667 */
514 list_del_init(&op->list); 668 list_del_init(&op->list);
515 else 669 force_unoptimize_kprobe(op);
516 /* Replace jump with break */ 670 }
517 arch_unoptimize_kprobe(op); 671 return;
518 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 672 }
673
674 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
675 if (!list_empty(&op->list)) {
676 /* Dequeue from the optimization queue */
677 list_del_init(&op->list);
678 return;
679 }
680 /* Optimized kprobe case */
681 if (force)
682 /* Forcibly update the code: this is a special case */
683 force_unoptimize_kprobe(op);
684 else {
685 list_add(&op->list, &unoptimizing_list);
686 kick_kprobe_optimizer();
519 } 687 }
520} 688}
521 689
690/* Cancel unoptimizing for reusing */
691static void reuse_unused_kprobe(struct kprobe *ap)
692{
693 struct optimized_kprobe *op;
694
695 BUG_ON(!kprobe_unused(ap));
696 /*
697 * Unused kprobe MUST be on the way of delayed unoptimizing (means
698 * there is still a relative jump) and disabled.
699 */
700 op = container_of(ap, struct optimized_kprobe, kp);
701 if (unlikely(list_empty(&op->list)))
702 printk(KERN_WARNING "Warning: found a stray unused "
703 "aggrprobe@%p\n", ap->addr);
704 /* Enable the probe again */
705 ap->flags &= ~KPROBE_FLAG_DISABLED;
706 /* Optimize it again (remove from op->list) */
707 BUG_ON(!kprobe_optready(ap));
708 optimize_kprobe(ap);
709}
710
522/* Remove optimized instructions */ 711/* Remove optimized instructions */
523static void __kprobes kill_optimized_kprobe(struct kprobe *p) 712static void __kprobes kill_optimized_kprobe(struct kprobe *p)
524{ 713{
525 struct optimized_kprobe *op; 714 struct optimized_kprobe *op;
526 715
527 op = container_of(p, struct optimized_kprobe, kp); 716 op = container_of(p, struct optimized_kprobe, kp);
528 if (!list_empty(&op->list)) { 717 if (!list_empty(&op->list))
529 /* Dequeue from the optimization queue */ 718 /* Dequeue from the (un)optimization queue */
530 list_del_init(&op->list); 719 list_del_init(&op->list);
531 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 720
532 } 721 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
533 /* Don't unoptimize, because the target code will be freed. */ 722 /* Don't touch the code, because it is already freed. */
534 arch_remove_optimized_kprobe(op); 723 arch_remove_optimized_kprobe(op);
535} 724}
536 725
@@ -543,16 +732,6 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
543 arch_prepare_optimized_kprobe(op); 732 arch_prepare_optimized_kprobe(op);
544} 733}
545 734
546/* Free optimized instructions and optimized_kprobe */
547static __kprobes void free_aggr_kprobe(struct kprobe *p)
548{
549 struct optimized_kprobe *op;
550
551 op = container_of(p, struct optimized_kprobe, kp);
552 arch_remove_optimized_kprobe(op);
553 kfree(op);
554}
555
556/* Allocate new optimized_kprobe and try to prepare optimized instructions */ 735/* Allocate new optimized_kprobe and try to prepare optimized instructions */
557static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) 736static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
558{ 737{
@@ -587,7 +766,8 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
587 op = container_of(ap, struct optimized_kprobe, kp); 766 op = container_of(ap, struct optimized_kprobe, kp);
588 if (!arch_prepared_optinsn(&op->optinsn)) { 767 if (!arch_prepared_optinsn(&op->optinsn)) {
589 /* If failed to setup optimizing, fallback to kprobe */ 768 /* If failed to setup optimizing, fallback to kprobe */
590 free_aggr_kprobe(ap); 769 arch_remove_optimized_kprobe(op);
770 kfree(op);
591 return; 771 return;
592 } 772 }
593 773
@@ -631,21 +811,16 @@ static void __kprobes unoptimize_all_kprobes(void)
631 return; 811 return;
632 812
633 kprobes_allow_optimization = false; 813 kprobes_allow_optimization = false;
634 printk(KERN_INFO "Kprobes globally unoptimized\n");
635 get_online_cpus(); /* For avoiding text_mutex deadlock */
636 mutex_lock(&text_mutex);
637 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 814 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
638 head = &kprobe_table[i]; 815 head = &kprobe_table[i];
639 hlist_for_each_entry_rcu(p, node, head, hlist) { 816 hlist_for_each_entry_rcu(p, node, head, hlist) {
640 if (!kprobe_disabled(p)) 817 if (!kprobe_disabled(p))
641 unoptimize_kprobe(p); 818 unoptimize_kprobe(p, false);
642 } 819 }
643 } 820 }
644 821 /* Wait for unoptimizing completion */
645 mutex_unlock(&text_mutex); 822 wait_for_kprobe_optimizer();
646 put_online_cpus(); 823 printk(KERN_INFO "Kprobes globally unoptimized\n");
647 /* Allow all currently running kprobes to complete */
648 synchronize_sched();
649} 824}
650 825
651int sysctl_kprobes_optimization; 826int sysctl_kprobes_optimization;
@@ -669,44 +844,60 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
669} 844}
670#endif /* CONFIG_SYSCTL */ 845#endif /* CONFIG_SYSCTL */
671 846
847/* Put a breakpoint for a probe. Must be called with text_mutex locked */
672static void __kprobes __arm_kprobe(struct kprobe *p) 848static void __kprobes __arm_kprobe(struct kprobe *p)
673{ 849{
674 struct kprobe *old_p; 850 struct kprobe *_p;
675 851
676 /* Check collision with other optimized kprobes */ 852 /* Check collision with other optimized kprobes */
677 old_p = get_optimized_kprobe((unsigned long)p->addr); 853 _p = get_optimized_kprobe((unsigned long)p->addr);
678 if (unlikely(old_p)) 854 if (unlikely(_p))
679 unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */ 855 /* Fallback to unoptimized kprobe */
856 unoptimize_kprobe(_p, true);
680 857
681 arch_arm_kprobe(p); 858 arch_arm_kprobe(p);
682 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ 859 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
683} 860}
684 861
685static void __kprobes __disarm_kprobe(struct kprobe *p) 862/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
863static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt)
686{ 864{
687 struct kprobe *old_p; 865 struct kprobe *_p;
688 866
689 unoptimize_kprobe(p); /* Try to unoptimize */ 867 unoptimize_kprobe(p, false); /* Try to unoptimize */
690 arch_disarm_kprobe(p);
691 868
692 /* If another kprobe was blocked, optimize it. */ 869 if (!kprobe_queued(p)) {
693 old_p = get_optimized_kprobe((unsigned long)p->addr); 870 arch_disarm_kprobe(p);
694 if (unlikely(old_p)) 871 /* If another kprobe was blocked, optimize it. */
695 optimize_kprobe(old_p); 872 _p = get_optimized_kprobe((unsigned long)p->addr);
873 if (unlikely(_p) && reopt)
874 optimize_kprobe(_p);
875 }
876 /* TODO: reoptimize others after unoptimized this probe */
696} 877}
697 878
698#else /* !CONFIG_OPTPROBES */ 879#else /* !CONFIG_OPTPROBES */
699 880
700#define optimize_kprobe(p) do {} while (0) 881#define optimize_kprobe(p) do {} while (0)
701#define unoptimize_kprobe(p) do {} while (0) 882#define unoptimize_kprobe(p, f) do {} while (0)
702#define kill_optimized_kprobe(p) do {} while (0) 883#define kill_optimized_kprobe(p) do {} while (0)
703#define prepare_optimized_kprobe(p) do {} while (0) 884#define prepare_optimized_kprobe(p) do {} while (0)
704#define try_to_optimize_kprobe(p) do {} while (0) 885#define try_to_optimize_kprobe(p) do {} while (0)
705#define __arm_kprobe(p) arch_arm_kprobe(p) 886#define __arm_kprobe(p) arch_arm_kprobe(p)
706#define __disarm_kprobe(p) arch_disarm_kprobe(p) 887#define __disarm_kprobe(p, o) arch_disarm_kprobe(p)
888#define kprobe_disarmed(p) kprobe_disabled(p)
889#define wait_for_kprobe_optimizer() do {} while (0)
890
891/* There should be no unused kprobes can be reused without optimization */
892static void reuse_unused_kprobe(struct kprobe *ap)
893{
894 printk(KERN_ERR "Error: There should be no unused kprobe here.\n");
895 BUG_ON(kprobe_unused(ap));
896}
707 897
708static __kprobes void free_aggr_kprobe(struct kprobe *p) 898static __kprobes void free_aggr_kprobe(struct kprobe *p)
709{ 899{
900 arch_remove_kprobe(p);
710 kfree(p); 901 kfree(p);
711} 902}
712 903
@@ -732,11 +923,10 @@ static void __kprobes arm_kprobe(struct kprobe *kp)
732/* Disarm a kprobe with text_mutex */ 923/* Disarm a kprobe with text_mutex */
733static void __kprobes disarm_kprobe(struct kprobe *kp) 924static void __kprobes disarm_kprobe(struct kprobe *kp)
734{ 925{
735 get_online_cpus(); /* For avoiding text_mutex deadlock */ 926 /* Ditto */
736 mutex_lock(&text_mutex); 927 mutex_lock(&text_mutex);
737 __disarm_kprobe(kp); 928 __disarm_kprobe(kp, true);
738 mutex_unlock(&text_mutex); 929 mutex_unlock(&text_mutex);
739 put_online_cpus();
740} 930}
741 931
742/* 932/*
@@ -775,7 +965,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
775static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, 965static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
776 int trapnr) 966 int trapnr)
777{ 967{
778 struct kprobe *cur = __get_cpu_var(kprobe_instance); 968 struct kprobe *cur = __this_cpu_read(kprobe_instance);
779 969
780 /* 970 /*
781 * if we faulted "during" the execution of a user specified 971 * if we faulted "during" the execution of a user specified
@@ -790,7 +980,7 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
790 980
791static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) 981static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
792{ 982{
793 struct kprobe *cur = __get_cpu_var(kprobe_instance); 983 struct kprobe *cur = __this_cpu_read(kprobe_instance);
794 int ret = 0; 984 int ret = 0;
795 985
796 if (cur && cur->break_handler) { 986 if (cur && cur->break_handler) {
@@ -942,7 +1132,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
942 BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); 1132 BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
943 1133
944 if (p->break_handler || p->post_handler) 1134 if (p->break_handler || p->post_handler)
945 unoptimize_kprobe(ap); /* Fall back to normal kprobe */ 1135 unoptimize_kprobe(ap, true); /* Fall back to normal kprobe */
946 1136
947 if (p->break_handler) { 1137 if (p->break_handler) {
948 if (ap->break_handler) 1138 if (ap->break_handler)
@@ -993,19 +1183,21 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
993 * This is the second or subsequent kprobe at the address - handle 1183 * This is the second or subsequent kprobe at the address - handle
994 * the intricacies 1184 * the intricacies
995 */ 1185 */
996static int __kprobes register_aggr_kprobe(struct kprobe *old_p, 1186static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
997 struct kprobe *p) 1187 struct kprobe *p)
998{ 1188{
999 int ret = 0; 1189 int ret = 0;
1000 struct kprobe *ap = old_p; 1190 struct kprobe *ap = orig_p;
1001 1191
1002 if (!kprobe_aggrprobe(old_p)) { 1192 if (!kprobe_aggrprobe(orig_p)) {
1003 /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */ 1193 /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
1004 ap = alloc_aggr_kprobe(old_p); 1194 ap = alloc_aggr_kprobe(orig_p);
1005 if (!ap) 1195 if (!ap)
1006 return -ENOMEM; 1196 return -ENOMEM;
1007 init_aggr_kprobe(ap, old_p); 1197 init_aggr_kprobe(ap, orig_p);
1008 } 1198 } else if (kprobe_unused(ap))
1199 /* This probe is going to die. Rescue it */
1200 reuse_unused_kprobe(ap);
1009 1201
1010 if (kprobe_gone(ap)) { 1202 if (kprobe_gone(ap)) {
1011 /* 1203 /*
@@ -1039,23 +1231,6 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
1039 return add_new_kprobe(ap, p); 1231 return add_new_kprobe(ap, p);
1040} 1232}
1041 1233
1042/* Try to disable aggr_kprobe, and return 1 if succeeded.*/
1043static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p)
1044{
1045 struct kprobe *kp;
1046
1047 list_for_each_entry_rcu(kp, &p->list, list) {
1048 if (!kprobe_disabled(kp))
1049 /*
1050 * There is an active probe on the list.
1051 * We can't disable aggr_kprobe.
1052 */
1053 return 0;
1054 }
1055 p->flags |= KPROBE_FLAG_DISABLED;
1056 return 1;
1057}
1058
1059static int __kprobes in_kprobes_functions(unsigned long addr) 1234static int __kprobes in_kprobes_functions(unsigned long addr)
1060{ 1235{
1061 struct kprobe_blackpoint *kb; 1236 struct kprobe_blackpoint *kb;
@@ -1098,34 +1273,33 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
1098/* Check passed kprobe is valid and return kprobe in kprobe_table. */ 1273/* Check passed kprobe is valid and return kprobe in kprobe_table. */
1099static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) 1274static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
1100{ 1275{
1101 struct kprobe *old_p, *list_p; 1276 struct kprobe *ap, *list_p;
1102 1277
1103 old_p = get_kprobe(p->addr); 1278 ap = get_kprobe(p->addr);
1104 if (unlikely(!old_p)) 1279 if (unlikely(!ap))
1105 return NULL; 1280 return NULL;
1106 1281
1107 if (p != old_p) { 1282 if (p != ap) {
1108 list_for_each_entry_rcu(list_p, &old_p->list, list) 1283 list_for_each_entry_rcu(list_p, &ap->list, list)
1109 if (list_p == p) 1284 if (list_p == p)
1110 /* kprobe p is a valid probe */ 1285 /* kprobe p is a valid probe */
1111 goto valid; 1286 goto valid;
1112 return NULL; 1287 return NULL;
1113 } 1288 }
1114valid: 1289valid:
1115 return old_p; 1290 return ap;
1116} 1291}
1117 1292
1118/* Return error if the kprobe is being re-registered */ 1293/* Return error if the kprobe is being re-registered */
1119static inline int check_kprobe_rereg(struct kprobe *p) 1294static inline int check_kprobe_rereg(struct kprobe *p)
1120{ 1295{
1121 int ret = 0; 1296 int ret = 0;
1122 struct kprobe *old_p;
1123 1297
1124 mutex_lock(&kprobe_mutex); 1298 mutex_lock(&kprobe_mutex);
1125 old_p = __get_valid_kprobe(p); 1299 if (__get_valid_kprobe(p))
1126 if (old_p)
1127 ret = -EINVAL; 1300 ret = -EINVAL;
1128 mutex_unlock(&kprobe_mutex); 1301 mutex_unlock(&kprobe_mutex);
1302
1129 return ret; 1303 return ret;
1130} 1304}
1131 1305
@@ -1229,67 +1403,121 @@ fail_with_jump_label:
1229} 1403}
1230EXPORT_SYMBOL_GPL(register_kprobe); 1404EXPORT_SYMBOL_GPL(register_kprobe);
1231 1405
1406/* Check if all probes on the aggrprobe are disabled */
1407static int __kprobes aggr_kprobe_disabled(struct kprobe *ap)
1408{
1409 struct kprobe *kp;
1410
1411 list_for_each_entry_rcu(kp, &ap->list, list)
1412 if (!kprobe_disabled(kp))
1413 /*
1414 * There is an active probe on the list.
1415 * We can't disable this ap.
1416 */
1417 return 0;
1418
1419 return 1;
1420}
1421
 1422/* Disable one kprobe: the caller must hold kprobe_mutex */
1423static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
1424{
1425 struct kprobe *orig_p;
1426
 1427	/* Get the original kprobe to return */
1428 orig_p = __get_valid_kprobe(p);
1429 if (unlikely(orig_p == NULL))
1430 return NULL;
1431
1432 if (!kprobe_disabled(p)) {
1433 /* Disable probe if it is a child probe */
1434 if (p != orig_p)
1435 p->flags |= KPROBE_FLAG_DISABLED;
1436
1437 /* Try to disarm and disable this/parent probe */
1438 if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
1439 disarm_kprobe(orig_p);
1440 orig_p->flags |= KPROBE_FLAG_DISABLED;
1441 }
1442 }
1443
1444 return orig_p;
1445}
1446
1232/* 1447/*
1233 * Unregister a kprobe without a scheduler synchronization. 1448 * Unregister a kprobe without a scheduler synchronization.
1234 */ 1449 */
1235static int __kprobes __unregister_kprobe_top(struct kprobe *p) 1450static int __kprobes __unregister_kprobe_top(struct kprobe *p)
1236{ 1451{
1237 struct kprobe *old_p, *list_p; 1452 struct kprobe *ap, *list_p;
1238 1453
1239 old_p = __get_valid_kprobe(p); 1454 /* Disable kprobe. This will disarm it if needed. */
1240 if (old_p == NULL) 1455 ap = __disable_kprobe(p);
1456 if (ap == NULL)
1241 return -EINVAL; 1457 return -EINVAL;
1242 1458
1243 if (old_p == p || 1459 if (ap == p)
1244 (kprobe_aggrprobe(old_p) &&
1245 list_is_singular(&old_p->list))) {
1246 /* 1460 /*
 1247	 * Only probe on the hash list. Disarm only if kprobes are 1461	 * This probe is an independent (and non-optimized) kprobe
1248 * enabled and not gone - otherwise, the breakpoint would 1462 * (not an aggrprobe). Remove from the hash list.
1249 * already have been removed. We save on flushing icache.
1250 */ 1463 */
1251 if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) 1464 goto disarmed;
1252 disarm_kprobe(old_p); 1465
1253 hlist_del_rcu(&old_p->hlist); 1466 /* Following process expects this probe is an aggrprobe */
1254 } else { 1467 WARN_ON(!kprobe_aggrprobe(ap));
1468
1469 if (list_is_singular(&ap->list) && kprobe_disarmed(ap))
1470 /*
 1471		 * !disarmed can happen if the probe is under delayed
 1472		 * unoptimizing.
1473 */
1474 goto disarmed;
1475 else {
 1476		/* If the probe being disabled has special handlers, update the aggrprobe */
1255 if (p->break_handler && !kprobe_gone(p)) 1477 if (p->break_handler && !kprobe_gone(p))
1256 old_p->break_handler = NULL; 1478 ap->break_handler = NULL;
1257 if (p->post_handler && !kprobe_gone(p)) { 1479 if (p->post_handler && !kprobe_gone(p)) {
1258 list_for_each_entry_rcu(list_p, &old_p->list, list) { 1480 list_for_each_entry_rcu(list_p, &ap->list, list) {
1259 if ((list_p != p) && (list_p->post_handler)) 1481 if ((list_p != p) && (list_p->post_handler))
1260 goto noclean; 1482 goto noclean;
1261 } 1483 }
1262 old_p->post_handler = NULL; 1484 ap->post_handler = NULL;
1263 } 1485 }
1264noclean: 1486noclean:
1487 /*
1488 * Remove from the aggrprobe: this path will do nothing in
1489 * __unregister_kprobe_bottom().
1490 */
1265 list_del_rcu(&p->list); 1491 list_del_rcu(&p->list);
1266 if (!kprobe_disabled(old_p)) { 1492 if (!kprobe_disabled(ap) && !kprobes_all_disarmed)
1267 try_to_disable_aggr_kprobe(old_p); 1493 /*
1268 if (!kprobes_all_disarmed) { 1494 * Try to optimize this probe again, because post
1269 if (kprobe_disabled(old_p)) 1495 * handler may have been changed.
1270 disarm_kprobe(old_p); 1496 */
1271 else 1497 optimize_kprobe(ap);
1272 /* Try to optimize this probe again */
1273 optimize_kprobe(old_p);
1274 }
1275 }
1276 } 1498 }
1277 return 0; 1499 return 0;
1500
1501disarmed:
1502 BUG_ON(!kprobe_disarmed(ap));
1503 hlist_del_rcu(&ap->hlist);
1504 return 0;
1278} 1505}
1279 1506
1280static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) 1507static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
1281{ 1508{
1282 struct kprobe *old_p; 1509 struct kprobe *ap;
1283 1510
1284 if (list_empty(&p->list)) 1511 if (list_empty(&p->list))
1512 /* This is an independent kprobe */
1285 arch_remove_kprobe(p); 1513 arch_remove_kprobe(p);
1286 else if (list_is_singular(&p->list)) { 1514 else if (list_is_singular(&p->list)) {
1287 /* "p" is the last child of an aggr_kprobe */ 1515 /* This is the last child of an aggrprobe */
1288 old_p = list_entry(p->list.next, struct kprobe, list); 1516 ap = list_entry(p->list.next, struct kprobe, list);
1289 list_del(&p->list); 1517 list_del(&p->list);
1290 arch_remove_kprobe(old_p); 1518 free_aggr_kprobe(ap);
1291 free_aggr_kprobe(old_p);
1292 } 1519 }
1520 /* Otherwise, do nothing. */
1293} 1521}
1294 1522
1295int __kprobes register_kprobes(struct kprobe **kps, int num) 1523int __kprobes register_kprobes(struct kprobe **kps, int num)
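The __disable_kprobe()/aggr_kprobe_disabled() rework above is what now backs the exported disable_kprobe()/enable_kprobe() interface. As a rough usage sketch only (the probed symbol "do_fork", the module name, and the messages are illustrative assumptions, not part of this patch), a client module of that API might look like:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>

/* Pre-handler: runs just before the probed instruction. */
static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
	pr_info("kprobe hit at %p\n", p->addr);
	return 0;
}

static struct kprobe kp = {
	.symbol_name = "do_fork",	/* hypothetical probe target */
	.pre_handler = handler_pre,
};

static int __init kp_demo_init(void)
{
	int ret = register_kprobe(&kp);
	if (ret < 0)
		return ret;

	/*
	 * Temporarily disable the probe; __disable_kprobe() disarms it
	 * (or the whole aggrprobe once no sibling handler is left enabled).
	 */
	disable_kprobe(&kp);

	/* ... and arm it again later. */
	enable_kprobe(&kp);
	return 0;
}

static void __exit kp_demo_exit(void)
{
	unregister_kprobe(&kp);
}

module_init(kp_demo_init);
module_exit(kp_demo_exit);
MODULE_LICENSE("GPL");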
@@ -1607,29 +1835,13 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1607int __kprobes disable_kprobe(struct kprobe *kp) 1835int __kprobes disable_kprobe(struct kprobe *kp)
1608{ 1836{
1609 int ret = 0; 1837 int ret = 0;
1610 struct kprobe *p;
1611 1838
1612 mutex_lock(&kprobe_mutex); 1839 mutex_lock(&kprobe_mutex);
1613 1840
1614 /* Check whether specified probe is valid. */ 1841 /* Disable this kprobe */
1615 p = __get_valid_kprobe(kp); 1842 if (__disable_kprobe(kp) == NULL)
1616 if (unlikely(p == NULL)) {
1617 ret = -EINVAL; 1843 ret = -EINVAL;
1618 goto out;
1619 }
1620 1844
1621 /* If the probe is already disabled (or gone), just return */
1622 if (kprobe_disabled(kp))
1623 goto out;
1624
1625 kp->flags |= KPROBE_FLAG_DISABLED;
1626 if (p != kp)
1627 /* When kp != p, p is always enabled. */
1628 try_to_disable_aggr_kprobe(p);
1629
1630 if (!kprobes_all_disarmed && kprobe_disabled(p))
1631 disarm_kprobe(p);
1632out:
1633 mutex_unlock(&kprobe_mutex); 1845 mutex_unlock(&kprobe_mutex);
1634 return ret; 1846 return ret;
1635} 1847}
@@ -1927,36 +2139,27 @@ static void __kprobes disarm_all_kprobes(void)
1927 mutex_lock(&kprobe_mutex); 2139 mutex_lock(&kprobe_mutex);
1928 2140
1929 /* If kprobes are already disarmed, just return */ 2141 /* If kprobes are already disarmed, just return */
1930 if (kprobes_all_disarmed) 2142 if (kprobes_all_disarmed) {
1931 goto already_disabled; 2143 mutex_unlock(&kprobe_mutex);
2144 return;
2145 }
1932 2146
1933 kprobes_all_disarmed = true; 2147 kprobes_all_disarmed = true;
1934 printk(KERN_INFO "Kprobes globally disabled\n"); 2148 printk(KERN_INFO "Kprobes globally disabled\n");
1935 2149
1936 /*
1937 * Here we call get_online_cpus() for avoiding text_mutex deadlock,
1938 * because disarming may also unoptimize kprobes.
1939 */
1940 get_online_cpus();
1941 mutex_lock(&text_mutex); 2150 mutex_lock(&text_mutex);
1942 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2151 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1943 head = &kprobe_table[i]; 2152 head = &kprobe_table[i];
1944 hlist_for_each_entry_rcu(p, node, head, hlist) { 2153 hlist_for_each_entry_rcu(p, node, head, hlist) {
1945 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 2154 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
1946 __disarm_kprobe(p); 2155 __disarm_kprobe(p, false);
1947 } 2156 }
1948 } 2157 }
1949
1950 mutex_unlock(&text_mutex); 2158 mutex_unlock(&text_mutex);
1951 put_online_cpus();
1952 mutex_unlock(&kprobe_mutex); 2159 mutex_unlock(&kprobe_mutex);
1953 /* Allow all currently running kprobes to complete */
1954 synchronize_sched();
1955 return;
1956 2160
1957already_disabled: 2161 /* Wait for the optimizer to finish disarming all kprobes */
1958 mutex_unlock(&kprobe_mutex); 2162 wait_for_kprobe_optimizer();
1959 return;
1960} 2163}
1961 2164
1962/* 2165/*
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 2dc3786349d1..c55afba990a3 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
148 wait_for_completion(&create.done); 148 wait_for_completion(&create.done);
149 149
150 if (!IS_ERR(create.result)) { 150 if (!IS_ERR(create.result)) {
151 struct sched_param param = { .sched_priority = 0 }; 151 static const struct sched_param param = { .sched_priority = 0 };
152 va_list args; 152 va_list args;
153 153
154 va_start(args, namefmt); 154 va_start(args, namefmt);
@@ -265,6 +265,17 @@ int kthreadd(void *unused)
265 return 0; 265 return 0;
266} 266}
267 267
268void __init_kthread_worker(struct kthread_worker *worker,
269 const char *name,
270 struct lock_class_key *key)
271{
272 spin_lock_init(&worker->lock);
273 lockdep_set_class_and_name(&worker->lock, key, name);
274 INIT_LIST_HEAD(&worker->work_list);
275 worker->task = NULL;
276}
277EXPORT_SYMBOL_GPL(__init_kthread_worker);
278
268/** 279/**
269 * kthread_worker_fn - kthread function to process kthread_worker 280 * kthread_worker_fn - kthread function to process kthread_worker
270 * @worker_ptr: pointer to initialized kthread_worker 281 * @worker_ptr: pointer to initialized kthread_worker
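The new __init_kthread_worker() exists so the init_kthread_worker() wrapper in <linux/kthread.h> can hand lockdep a per-caller lock class for the worker's spinlock. A minimal sketch of how the kthread_worker API of this era is typically wired up follows; the thread name, the work body, and the module boilerplate are illustrative assumptions, not part of this patch.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kthread.h>

static struct kthread_worker demo_worker;
static struct kthread_work demo_work;
static struct task_struct *demo_thread;

/* The work callback runs in the dedicated worker thread's context. */
static void demo_work_fn(struct kthread_work *work)
{
	pr_info("kthread_work executed\n");
}

static int __init demo_init(void)
{
	init_kthread_worker(&demo_worker);	/* lockdep class set via __init_kthread_worker() */
	init_kthread_work(&demo_work, demo_work_fn);

	/* Dedicated thread that pumps the worker's work_list. */
	demo_thread = kthread_run(kthread_worker_fn, &demo_worker, "demo-worker");
	if (IS_ERR(demo_thread))
		return PTR_ERR(demo_thread);

	queue_kthread_work(&demo_worker, &demo_work);
	flush_kthread_worker(&demo_worker);	/* wait until the queued work has run */
	return 0;
}

static void __exit demo_exit(void)
{
	flush_kthread_worker(&demo_worker);
	kthread_stop(demo_thread);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");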
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 877fb306d415..ee74b35e528d 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -194,14 +194,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
194 194
195 account_global_scheduler_latency(tsk, &lat); 195 account_global_scheduler_latency(tsk, &lat);
196 196
197 /* 197 for (i = 0; i < tsk->latency_record_count; i++) {
198 * short term hack; if we're > 32 we stop; future we recycle:
199 */
200 tsk->latency_record_count++;
201 if (tsk->latency_record_count >= LT_SAVECOUNT)
202 goto out_unlock;
203
204 for (i = 0; i < LT_SAVECOUNT; i++) {
205 struct latency_record *mylat; 198 struct latency_record *mylat;
206 int same = 1; 199 int same = 1;
207 200
@@ -227,8 +220,14 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
227 } 220 }
228 } 221 }
229 222
223 /*
224 * short term hack; if we're > 32 we stop; future we recycle:
225 */
226 if (tsk->latency_record_count >= LT_SAVECOUNT)
227 goto out_unlock;
228
 230	/* Allocate a new one: */ 229	/* Allocate a new one: */
231 i = tsk->latency_record_count; 230 i = tsk->latency_record_count++;
232 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); 231 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
233 232
234out_unlock: 233out_unlock:
@@ -242,24 +241,19 @@ static int lstats_show(struct seq_file *m, void *v)
242 seq_puts(m, "Latency Top version : v0.1\n"); 241 seq_puts(m, "Latency Top version : v0.1\n");
243 242
244 for (i = 0; i < MAXLR; i++) { 243 for (i = 0; i < MAXLR; i++) {
245 if (latency_record[i].backtrace[0]) { 244 struct latency_record *lr = &latency_record[i];
245
246 if (lr->backtrace[0]) {
246 int q; 247 int q;
247 seq_printf(m, "%i %lu %lu ", 248 seq_printf(m, "%i %lu %lu",
248 latency_record[i].count, 249 lr->count, lr->time, lr->max);
249 latency_record[i].time,
250 latency_record[i].max);
251 for (q = 0; q < LT_BACKTRACEDEPTH; q++) { 250 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
252 char sym[KSYM_SYMBOL_LEN]; 251 unsigned long bt = lr->backtrace[q];
253 char *c; 252 if (!bt)
254 if (!latency_record[i].backtrace[q])
255 break; 253 break;
256 if (latency_record[i].backtrace[q] == ULONG_MAX) 254 if (bt == ULONG_MAX)
257 break; 255 break;
258 sprint_symbol(sym, latency_record[i].backtrace[q]); 256 seq_printf(m, " %ps", (void *)bt);
259 c = strchr(sym, '+');
260 if (c)
261 *c = 0;
262 seq_printf(m, "%s ", sym);
263 } 257 }
264 seq_printf(m, "\n"); 258 seq_printf(m, "\n");
265 } 259 }
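The rewritten loop above leans on the %ps vsnprintf extension (bare symbol name; %pS would add the +offset/size suffix) instead of sprint_symbol() plus manual '+' trimming. A tiny illustrative helper, not part of the patch and with a made-up function name:

#include <linux/kernel.h>
#include <linux/seq_file.h>

/*
 * Print one backtrace entry as a symbol name. %ps resolves the text
 * address to its symbol, which is exactly what the old sprint_symbol()
 * + strchr('+') dance did by hand.
 */
static void print_backtrace_entry(struct seq_file *m, unsigned long addr)
{
	seq_printf(m, " %ps", (void *)addr);
}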
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 42ba65dff7d9..0d2058da80f5 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2292,22 +2292,6 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
2292} 2292}
2293 2293
2294/* 2294/*
2295 * Debugging helper: via this flag we know that we are in
2296 * 'early bootup code', and will warn about any invalid irqs-on event:
2297 */
2298static int early_boot_irqs_enabled;
2299
2300void early_boot_irqs_off(void)
2301{
2302 early_boot_irqs_enabled = 0;
2303}
2304
2305void early_boot_irqs_on(void)
2306{
2307 early_boot_irqs_enabled = 1;
2308}
2309
2310/*
2311 * Hardirqs will be enabled: 2295 * Hardirqs will be enabled:
2312 */ 2296 */
2313void trace_hardirqs_on_caller(unsigned long ip) 2297void trace_hardirqs_on_caller(unsigned long ip)
@@ -2319,7 +2303,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
2319 if (unlikely(!debug_locks || current->lockdep_recursion)) 2303 if (unlikely(!debug_locks || current->lockdep_recursion))
2320 return; 2304 return;
2321 2305
2322 if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled))) 2306 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
2323 return; 2307 return;
2324 2308
2325 if (unlikely(curr->hardirqs_enabled)) { 2309 if (unlikely(curr->hardirqs_enabled)) {
diff --git a/kernel/module.c b/kernel/module.c
index 437a74a7524a..34e00b708fad 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -56,6 +56,7 @@
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/kmemleak.h> 57#include <linux/kmemleak.h>
58#include <linux/jump_label.h> 58#include <linux/jump_label.h>
59#include <linux/pfn.h>
59 60
60#define CREATE_TRACE_POINTS 61#define CREATE_TRACE_POINTS
61#include <trace/events/module.h> 62#include <trace/events/module.h>
@@ -70,6 +71,26 @@
70#define ARCH_SHF_SMALL 0 71#define ARCH_SHF_SMALL 0
71#endif 72#endif
72 73
74/*
75 * Modules' sections will be aligned on page boundaries
76 * to ensure complete separation of code and data, but
77 * only when CONFIG_DEBUG_SET_MODULE_RONX=y
78 */
79#ifdef CONFIG_DEBUG_SET_MODULE_RONX
80# define debug_align(X) ALIGN(X, PAGE_SIZE)
81#else
82# define debug_align(X) (X)
83#endif
84
85/*
 86 * Given BASE and SIZE, this macro calculates the number of pages the
 87 * memory region occupies
88 */
89#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \
90 (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \
91 PFN_DOWN((unsigned long)BASE) + 1) \
92 : (0UL))
93
73/* If this is set, the section belongs in the init part of the module */ 94/* If this is set, the section belongs in the init part of the module */
74#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) 95#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
75 96
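A quick way to sanity-check MOD_NUMBER_OF_PAGES() is to mirror it in user space. The snippet below assumes 4 KiB pages, and the addresses and sizes are made up purely for illustration; none of it is part of the patch:

#include <stdio.h>

#define PAGE_SHIFT	12			/* assume 4 KiB pages */
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)

/* Mirror of MOD_NUMBER_OF_PAGES(): pages touched by [BASE, BASE + SIZE) */
#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ?		\
	(PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) -		\
	 PFN_DOWN((unsigned long)(BASE)) + 1)			\
	: (0UL))

int main(void)
{
	/* Starts 0x800 into a page and spans 0x1000 bytes: crosses one
	 * page boundary, so it touches two pages. */
	printf("%lu\n", MOD_NUMBER_OF_PAGES(0x10000800UL, 0x1000UL));	/* 2 */

	/* Fits entirely inside one page. */
	printf("%lu\n", MOD_NUMBER_OF_PAGES(0x10000000UL, 0x200UL));	/* 1 */

	/* Empty region touches no pages. */
	printf("%lu\n", MOD_NUMBER_OF_PAGES(0x10000000UL, 0UL));	/* 0 */
	return 0;
}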
@@ -1542,6 +1563,115 @@ static int __unlink_module(void *_mod)
1542 return 0; 1563 return 0;
1543} 1564}
1544 1565
1566#ifdef CONFIG_DEBUG_SET_MODULE_RONX
1567/*
1568 * LKM RO/NX protection: protect module's text/ro-data
1569 * from modification and any data from execution.
1570 */
1571void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages))
1572{
1573 unsigned long begin_pfn = PFN_DOWN((unsigned long)start);
1574 unsigned long end_pfn = PFN_DOWN((unsigned long)end);
1575
1576 if (end_pfn > begin_pfn)
1577 set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
1578}
1579
1580static void set_section_ro_nx(void *base,
1581 unsigned long text_size,
1582 unsigned long ro_size,
1583 unsigned long total_size)
1584{
1585 /* begin and end PFNs of the current subsection */
1586 unsigned long begin_pfn;
1587 unsigned long end_pfn;
1588
1589 /*
1590 * Set RO for module text and RO-data:
1591 * - Always protect first page.
1592 * - Do not protect last partial page.
1593 */
1594 if (ro_size > 0)
1595 set_page_attributes(base, base + ro_size, set_memory_ro);
1596
1597 /*
1598 * Set NX permissions for module data:
1599 * - Do not protect first partial page.
1600 * - Always protect last page.
1601 */
1602 if (total_size > text_size) {
1603 begin_pfn = PFN_UP((unsigned long)base + text_size);
1604 end_pfn = PFN_UP((unsigned long)base + total_size);
1605 if (end_pfn > begin_pfn)
1606 set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
1607 }
1608}
1609
1610/* Setting memory back to RW+NX before releasing it */
1611void unset_section_ro_nx(struct module *mod, void *module_region)
1612{
1613 unsigned long total_pages;
1614
1615 if (mod->module_core == module_region) {
1616 /* Set core as NX+RW */
1617 total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size);
1618 set_memory_nx((unsigned long)mod->module_core, total_pages);
1619 set_memory_rw((unsigned long)mod->module_core, total_pages);
1620
1621 } else if (mod->module_init == module_region) {
1622 /* Set init as NX+RW */
1623 total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size);
1624 set_memory_nx((unsigned long)mod->module_init, total_pages);
1625 set_memory_rw((unsigned long)mod->module_init, total_pages);
1626 }
1627}
1628
1629/* Iterate through all modules and set each module's text as RW */
1630void set_all_modules_text_rw()
1631{
1632 struct module *mod;
1633
1634 mutex_lock(&module_mutex);
1635 list_for_each_entry_rcu(mod, &modules, list) {
1636 if ((mod->module_core) && (mod->core_text_size)) {
1637 set_page_attributes(mod->module_core,
1638 mod->module_core + mod->core_text_size,
1639 set_memory_rw);
1640 }
1641 if ((mod->module_init) && (mod->init_text_size)) {
1642 set_page_attributes(mod->module_init,
1643 mod->module_init + mod->init_text_size,
1644 set_memory_rw);
1645 }
1646 }
1647 mutex_unlock(&module_mutex);
1648}
1649
1650/* Iterate through all modules and set each module's text as RO */
1651void set_all_modules_text_ro()
1652{
1653 struct module *mod;
1654
1655 mutex_lock(&module_mutex);
1656 list_for_each_entry_rcu(mod, &modules, list) {
1657 if ((mod->module_core) && (mod->core_text_size)) {
1658 set_page_attributes(mod->module_core,
1659 mod->module_core + mod->core_text_size,
1660 set_memory_ro);
1661 }
1662 if ((mod->module_init) && (mod->init_text_size)) {
1663 set_page_attributes(mod->module_init,
1664 mod->module_init + mod->init_text_size,
1665 set_memory_ro);
1666 }
1667 }
1668 mutex_unlock(&module_mutex);
1669}
1670#else
1671static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { }
1672static inline void unset_section_ro_nx(struct module *mod, void *module_region) { }
1673#endif
1674
1545/* Free a module, remove from lists, etc. */ 1675/* Free a module, remove from lists, etc. */
1546static void free_module(struct module *mod) 1676static void free_module(struct module *mod)
1547{ 1677{
@@ -1566,6 +1696,7 @@ static void free_module(struct module *mod)
1566 destroy_params(mod->kp, mod->num_kp); 1696 destroy_params(mod->kp, mod->num_kp);
1567 1697
1568 /* This may be NULL, but that's OK */ 1698 /* This may be NULL, but that's OK */
1699 unset_section_ro_nx(mod, mod->module_init);
1569 module_free(mod, mod->module_init); 1700 module_free(mod, mod->module_init);
1570 kfree(mod->args); 1701 kfree(mod->args);
1571 percpu_modfree(mod); 1702 percpu_modfree(mod);
@@ -1574,6 +1705,7 @@ static void free_module(struct module *mod)
1574 lockdep_free_key_range(mod->module_core, mod->core_size); 1705 lockdep_free_key_range(mod->module_core, mod->core_size);
1575 1706
1576 /* Finally, free the core (containing the module structure) */ 1707 /* Finally, free the core (containing the module structure) */
1708 unset_section_ro_nx(mod, mod->module_core);
1577 module_free(mod, mod->module_core); 1709 module_free(mod, mod->module_core);
1578 1710
1579#ifdef CONFIG_MPU 1711#ifdef CONFIG_MPU
@@ -1777,8 +1909,19 @@ static void layout_sections(struct module *mod, struct load_info *info)
1777 s->sh_entsize = get_offset(mod, &mod->core_size, s, i); 1909 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
1778 DEBUGP("\t%s\n", name); 1910 DEBUGP("\t%s\n", name);
1779 } 1911 }
1780 if (m == 0) 1912 switch (m) {
1913 case 0: /* executable */
1914 mod->core_size = debug_align(mod->core_size);
1781 mod->core_text_size = mod->core_size; 1915 mod->core_text_size = mod->core_size;
1916 break;
1917 case 1: /* RO: text and ro-data */
1918 mod->core_size = debug_align(mod->core_size);
1919 mod->core_ro_size = mod->core_size;
1920 break;
1921 case 3: /* whole core */
1922 mod->core_size = debug_align(mod->core_size);
1923 break;
1924 }
1782 } 1925 }
1783 1926
1784 DEBUGP("Init section allocation order:\n"); 1927 DEBUGP("Init section allocation order:\n");
@@ -1796,8 +1939,19 @@ static void layout_sections(struct module *mod, struct load_info *info)
1796 | INIT_OFFSET_MASK); 1939 | INIT_OFFSET_MASK);
1797 DEBUGP("\t%s\n", sname); 1940 DEBUGP("\t%s\n", sname);
1798 } 1941 }
1799 if (m == 0) 1942 switch (m) {
1943 case 0: /* executable */
1944 mod->init_size = debug_align(mod->init_size);
1800 mod->init_text_size = mod->init_size; 1945 mod->init_text_size = mod->init_size;
1946 break;
1947 case 1: /* RO: text and ro-data */
1948 mod->init_size = debug_align(mod->init_size);
1949 mod->init_ro_size = mod->init_size;
1950 break;
1951 case 3: /* whole init */
1952 mod->init_size = debug_align(mod->init_size);
1953 break;
1954 }
1801 } 1955 }
1802} 1956}
1803 1957
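With CONFIG_DEBUG_SET_MODULE_RONX=y, the switches above page-align the running core/init size at each group boundary, so text, RO data, and the remaining data each start on their own page. A rough user-space mirror of that bookkeeping for the core layout, using made-up section sizes and an assumed 4 KiB page (group 2, the "small" sections, is skipped here for brevity):

#include <stdio.h>

#define PAGE_SIZE	4096UL				/* assumed */
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))
#define debug_align(x)	ALIGN((x), PAGE_SIZE)		/* RONX-enabled case */

int main(void)
{
	unsigned long core_size = 0;
	unsigned long core_text_size, core_ro_size;

	core_size += 0x1234;			/* group 0: executable sections */
	core_size = debug_align(core_size);
	core_text_size = core_size;		/* 0x2000: text ends page-aligned */

	core_size += 0x80;			/* group 1: read-only data */
	core_size = debug_align(core_size);
	core_ro_size = core_size;		/* 0x3000: RO region page-aligned */

	core_size += 0x400;			/* group 3: remaining (RW) data */
	core_size = debug_align(core_size);	/* 0x4000: whole core */

	printf("text=%#lx ro=%#lx core=%#lx\n",
	       core_text_size, core_ro_size, core_size);
	return 0;
}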
@@ -2326,6 +2480,18 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2326 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * 2480 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2327 mod->num_trace_events, GFP_KERNEL); 2481 mod->num_trace_events, GFP_KERNEL);
2328#endif 2482#endif
2483#ifdef CONFIG_TRACING
2484 mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
2485 sizeof(*mod->trace_bprintk_fmt_start),
2486 &mod->num_trace_bprintk_fmt);
2487 /*
2488 * This section contains pointers to allocated objects in the trace
2489 * code and not scanning it leads to false positives.
2490 */
2491 kmemleak_scan_area(mod->trace_bprintk_fmt_start,
2492 sizeof(*mod->trace_bprintk_fmt_start) *
2493 mod->num_trace_bprintk_fmt, GFP_KERNEL);
2494#endif
2329#ifdef CONFIG_FTRACE_MCOUNT_RECORD 2495#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2330 /* sechdrs[0].sh_size is always zero */ 2496 /* sechdrs[0].sh_size is always zero */
2331 mod->ftrace_callsites = section_objs(info, "__mcount_loc", 2497 mod->ftrace_callsites = section_objs(info, "__mcount_loc",
@@ -2710,6 +2876,18 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2710 blocking_notifier_call_chain(&module_notify_list, 2876 blocking_notifier_call_chain(&module_notify_list,
2711 MODULE_STATE_COMING, mod); 2877 MODULE_STATE_COMING, mod);
2712 2878
2879 /* Set RO and NX regions for core */
2880 set_section_ro_nx(mod->module_core,
2881 mod->core_text_size,
2882 mod->core_ro_size,
2883 mod->core_size);
2884
2885 /* Set RO and NX regions for init */
2886 set_section_ro_nx(mod->module_init,
2887 mod->init_text_size,
2888 mod->init_ro_size,
2889 mod->init_size);
2890
2713 do_mod_ctors(mod); 2891 do_mod_ctors(mod);
2714 /* Start the module */ 2892 /* Start the module */
2715 if (mod->init != NULL) 2893 if (mod->init != NULL)
@@ -2753,6 +2931,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2753 mod->symtab = mod->core_symtab; 2931 mod->symtab = mod->core_symtab;
2754 mod->strtab = mod->core_strtab; 2932 mod->strtab = mod->core_strtab;
2755#endif 2933#endif
2934 unset_section_ro_nx(mod, mod->module_init);
2756 module_free(mod, mod->module_init); 2935 module_free(mod, mod->module_init);
2757 mod->module_init = NULL; 2936 mod->module_init = NULL;
2758 mod->init_size = 0; 2937 mod->init_size = 0;
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 200407c1502f..a5889fb28ecf 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -199,7 +199,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
199 * memory barriers as we'll eventually observe the right 199 * memory barriers as we'll eventually observe the right
200 * values at the cost of a few extra spins. 200 * values at the cost of a few extra spins.
201 */ 201 */
202 cpu_relax(); 202 arch_mutex_cpu_relax();
203 } 203 }
204#endif 204#endif
205 spin_lock_mutex(&lock->wait_lock, flags); 205 spin_lock_mutex(&lock->wait_lock, flags);
diff --git a/kernel/panic.c b/kernel/panic.c
index 4c13b1a88ebb..991bb87a1704 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -34,6 +34,7 @@ static int pause_on_oops_flag;
34static DEFINE_SPINLOCK(pause_on_oops_lock); 34static DEFINE_SPINLOCK(pause_on_oops_lock);
35 35
36int panic_timeout; 36int panic_timeout;
37EXPORT_SYMBOL_GPL(panic_timeout);
37 38
38ATOMIC_NOTIFIER_HEAD(panic_notifier_list); 39ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
39 40
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 517d827f4982..84522c796987 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -13,6 +13,7 @@
13#include <linux/mm.h> 13#include <linux/mm.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/smp.h> 15#include <linux/smp.h>
16#include <linux/idr.h>
16#include <linux/file.h> 17#include <linux/file.h>
17#include <linux/poll.h> 18#include <linux/poll.h>
18#include <linux/slab.h> 19#include <linux/slab.h>
@@ -21,7 +22,9 @@
21#include <linux/dcache.h> 22#include <linux/dcache.h>
22#include <linux/percpu.h> 23#include <linux/percpu.h>
23#include <linux/ptrace.h> 24#include <linux/ptrace.h>
25#include <linux/reboot.h>
24#include <linux/vmstat.h> 26#include <linux/vmstat.h>
27#include <linux/device.h>
25#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
26#include <linux/hardirq.h> 29#include <linux/hardirq.h>
27#include <linux/rculist.h> 30#include <linux/rculist.h>
@@ -31,9 +34,16 @@
31#include <linux/kernel_stat.h> 34#include <linux/kernel_stat.h>
32#include <linux/perf_event.h> 35#include <linux/perf_event.h>
33#include <linux/ftrace_event.h> 36#include <linux/ftrace_event.h>
37#include <linux/hw_breakpoint.h>
34 38
35#include <asm/irq_regs.h> 39#include <asm/irq_regs.h>
36 40
41enum event_type_t {
42 EVENT_FLEXIBLE = 0x1,
43 EVENT_PINNED = 0x2,
44 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
45};
46
37atomic_t perf_task_events __read_mostly; 47atomic_t perf_task_events __read_mostly;
38static atomic_t nr_mmap_events __read_mostly; 48static atomic_t nr_mmap_events __read_mostly;
39static atomic_t nr_comm_events __read_mostly; 49static atomic_t nr_comm_events __read_mostly;
@@ -61,6 +71,12 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000;
61 71
62static atomic64_t perf_event_id; 72static atomic64_t perf_event_id;
63 73
74static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
75 enum event_type_t event_type);
76
77static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
78 enum event_type_t event_type);
79
64void __weak perf_event_print_debug(void) { } 80void __weak perf_event_print_debug(void) { }
65 81
66extern __weak const char *perf_pmu_name(void) 82extern __weak const char *perf_pmu_name(void)
@@ -68,6 +84,11 @@ extern __weak const char *perf_pmu_name(void)
68 return "pmu"; 84 return "pmu";
69} 85}
70 86
87static inline u64 perf_clock(void)
88{
89 return local_clock();
90}
91
71void perf_pmu_disable(struct pmu *pmu) 92void perf_pmu_disable(struct pmu *pmu)
72{ 93{
73 int *count = this_cpu_ptr(pmu->pmu_disable_count); 94 int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -132,6 +153,28 @@ static void unclone_ctx(struct perf_event_context *ctx)
132 } 153 }
133} 154}
134 155
156static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
157{
158 /*
159 * only top level events have the pid namespace they were created in
160 */
161 if (event->parent)
162 event = event->parent;
163
164 return task_tgid_nr_ns(p, event->ns);
165}
166
167static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
168{
169 /*
170 * only top level events have the pid namespace they were created in
171 */
172 if (event->parent)
173 event = event->parent;
174
175 return task_pid_nr_ns(p, event->ns);
176}
177
135/* 178/*
136 * If we inherit events we want to return the parent event id 179 * If we inherit events we want to return the parent event id
137 * to userspace. 180 * to userspace.
@@ -214,11 +257,6 @@ static void perf_unpin_context(struct perf_event_context *ctx)
214 put_ctx(ctx); 257 put_ctx(ctx);
215} 258}
216 259
217static inline u64 perf_clock(void)
218{
219 return local_clock();
220}
221
222/* 260/*
223 * Update the record of the current time in a context. 261 * Update the record of the current time in a context.
224 */ 262 */
@@ -230,6 +268,12 @@ static void update_context_time(struct perf_event_context *ctx)
230 ctx->timestamp = now; 268 ctx->timestamp = now;
231} 269}
232 270
271static u64 perf_event_time(struct perf_event *event)
272{
273 struct perf_event_context *ctx = event->ctx;
274 return ctx ? ctx->time : 0;
275}
276
233/* 277/*
234 * Update the total_time_enabled and total_time_running fields for a event. 278 * Update the total_time_enabled and total_time_running fields for a event.
235 */ 279 */
@@ -243,7 +287,7 @@ static void update_event_times(struct perf_event *event)
243 return; 287 return;
244 288
245 if (ctx->is_active) 289 if (ctx->is_active)
246 run_end = ctx->time; 290 run_end = perf_event_time(event);
247 else 291 else
248 run_end = event->tstamp_stopped; 292 run_end = event->tstamp_stopped;
249 293
@@ -252,7 +296,7 @@ static void update_event_times(struct perf_event *event)
252 if (event->state == PERF_EVENT_STATE_INACTIVE) 296 if (event->state == PERF_EVENT_STATE_INACTIVE)
253 run_end = event->tstamp_stopped; 297 run_end = event->tstamp_stopped;
254 else 298 else
255 run_end = ctx->time; 299 run_end = perf_event_time(event);
256 300
257 event->total_time_running = run_end - event->tstamp_running; 301 event->total_time_running = run_end - event->tstamp_running;
258} 302}
@@ -311,9 +355,84 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
311 ctx->nr_stat++; 355 ctx->nr_stat++;
312} 356}
313 357
358/*
359 * Called at perf_event creation and when events are attached/detached from a
360 * group.
361 */
362static void perf_event__read_size(struct perf_event *event)
363{
364 int entry = sizeof(u64); /* value */
365 int size = 0;
366 int nr = 1;
367
368 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
369 size += sizeof(u64);
370
371 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
372 size += sizeof(u64);
373
374 if (event->attr.read_format & PERF_FORMAT_ID)
375 entry += sizeof(u64);
376
377 if (event->attr.read_format & PERF_FORMAT_GROUP) {
378 nr += event->group_leader->nr_siblings;
379 size += sizeof(u64);
380 }
381
382 size += entry * nr;
383 event->read_size = size;
384}
385
386static void perf_event__header_size(struct perf_event *event)
387{
388 struct perf_sample_data *data;
389 u64 sample_type = event->attr.sample_type;
390 u16 size = 0;
391
392 perf_event__read_size(event);
393
394 if (sample_type & PERF_SAMPLE_IP)
395 size += sizeof(data->ip);
396
397 if (sample_type & PERF_SAMPLE_ADDR)
398 size += sizeof(data->addr);
399
400 if (sample_type & PERF_SAMPLE_PERIOD)
401 size += sizeof(data->period);
402
403 if (sample_type & PERF_SAMPLE_READ)
404 size += event->read_size;
405
406 event->header_size = size;
407}
408
409static void perf_event__id_header_size(struct perf_event *event)
410{
411 struct perf_sample_data *data;
412 u64 sample_type = event->attr.sample_type;
413 u16 size = 0;
414
415 if (sample_type & PERF_SAMPLE_TID)
416 size += sizeof(data->tid_entry);
417
418 if (sample_type & PERF_SAMPLE_TIME)
419 size += sizeof(data->time);
420
421 if (sample_type & PERF_SAMPLE_ID)
422 size += sizeof(data->id);
423
424 if (sample_type & PERF_SAMPLE_STREAM_ID)
425 size += sizeof(data->stream_id);
426
427 if (sample_type & PERF_SAMPLE_CPU)
428 size += sizeof(data->cpu_entry);
429
430 event->id_header_size = size;
431}
432
314static void perf_group_attach(struct perf_event *event) 433static void perf_group_attach(struct perf_event *event)
315{ 434{
316 struct perf_event *group_leader = event->group_leader; 435 struct perf_event *group_leader = event->group_leader, *pos;
317 436
318 /* 437 /*
319 * We can have double attach due to group movement in perf_event_open. 438 * We can have double attach due to group movement in perf_event_open.
@@ -332,6 +451,11 @@ static void perf_group_attach(struct perf_event *event)
332 451
333 list_add_tail(&event->group_entry, &group_leader->sibling_list); 452 list_add_tail(&event->group_entry, &group_leader->sibling_list);
334 group_leader->nr_siblings++; 453 group_leader->nr_siblings++;
454
455 perf_event__header_size(group_leader);
456
457 list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
458 perf_event__header_size(pos);
335} 459}
336 460
337/* 461/*
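The cached event->read_size computed above replaces the old on-demand perf_event_read_size(). A user-space mirror of the same arithmetic, with assumed read_format bits and sibling count (none of this is kernel code; the bit names are local stand-ins):

#include <stdio.h>
#include <stdint.h>

#define FMT_TOTAL_TIME_ENABLED	(1U << 0)
#define FMT_TOTAL_TIME_RUNNING	(1U << 1)
#define FMT_ID			(1U << 2)
#define FMT_GROUP		(1U << 3)

/* Mirrors perf_event__read_size() from the hunk above. */
static int read_size(uint64_t read_format, int nr_siblings)
{
	int entry = sizeof(uint64_t);	/* value */
	int size = 0;
	int nr = 1;

	if (read_format & FMT_TOTAL_TIME_ENABLED)
		size += sizeof(uint64_t);
	if (read_format & FMT_TOTAL_TIME_RUNNING)
		size += sizeof(uint64_t);
	if (read_format & FMT_ID)
		entry += sizeof(uint64_t);
	if (read_format & FMT_GROUP) {
		nr += nr_siblings;
		size += sizeof(uint64_t);	/* the 'nr' field itself */
	}
	return size + entry * nr;
}

int main(void)
{
	/* Leader with two siblings, times + id + group: 8+8+8 + 16*3 = 72. */
	printf("%d\n", read_size(FMT_TOTAL_TIME_ENABLED | FMT_TOTAL_TIME_RUNNING |
				 FMT_ID | FMT_GROUP, 2));

	/* Plain counter, no extra format bits: a single 8-byte value. */
	printf("%d\n", read_size(0, 0));
	return 0;
}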
@@ -390,7 +514,7 @@ static void perf_group_detach(struct perf_event *event)
390 if (event->group_leader != event) { 514 if (event->group_leader != event) {
391 list_del_init(&event->group_entry); 515 list_del_init(&event->group_entry);
392 event->group_leader->nr_siblings--; 516 event->group_leader->nr_siblings--;
393 return; 517 goto out;
394 } 518 }
395 519
396 if (!list_empty(&event->group_entry)) 520 if (!list_empty(&event->group_entry))
@@ -409,6 +533,12 @@ static void perf_group_detach(struct perf_event *event)
409 /* Inherit group flags from the previous leader */ 533 /* Inherit group flags from the previous leader */
410 sibling->group_flags = event->group_flags; 534 sibling->group_flags = event->group_flags;
411 } 535 }
536
537out:
538 perf_event__header_size(event->group_leader);
539
540 list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
541 perf_event__header_size(tmp);
412} 542}
413 543
414static inline int 544static inline int
@@ -422,6 +552,7 @@ event_sched_out(struct perf_event *event,
422 struct perf_cpu_context *cpuctx, 552 struct perf_cpu_context *cpuctx,
423 struct perf_event_context *ctx) 553 struct perf_event_context *ctx)
424{ 554{
555 u64 tstamp = perf_event_time(event);
425 u64 delta; 556 u64 delta;
426 /* 557 /*
427 * An event which could not be activated because of 558 * An event which could not be activated because of
@@ -433,7 +564,7 @@ event_sched_out(struct perf_event *event,
433 && !event_filter_match(event)) { 564 && !event_filter_match(event)) {
434 delta = ctx->time - event->tstamp_stopped; 565 delta = ctx->time - event->tstamp_stopped;
435 event->tstamp_running += delta; 566 event->tstamp_running += delta;
436 event->tstamp_stopped = ctx->time; 567 event->tstamp_stopped = tstamp;
437 } 568 }
438 569
439 if (event->state != PERF_EVENT_STATE_ACTIVE) 570 if (event->state != PERF_EVENT_STATE_ACTIVE)
@@ -444,7 +575,7 @@ event_sched_out(struct perf_event *event,
444 event->pending_disable = 0; 575 event->pending_disable = 0;
445 event->state = PERF_EVENT_STATE_OFF; 576 event->state = PERF_EVENT_STATE_OFF;
446 } 577 }
447 event->tstamp_stopped = ctx->time; 578 event->tstamp_stopped = tstamp;
448 event->pmu->del(event, 0); 579 event->pmu->del(event, 0);
449 event->oncpu = -1; 580 event->oncpu = -1;
450 581
@@ -656,6 +787,8 @@ event_sched_in(struct perf_event *event,
656 struct perf_cpu_context *cpuctx, 787 struct perf_cpu_context *cpuctx,
657 struct perf_event_context *ctx) 788 struct perf_event_context *ctx)
658{ 789{
790 u64 tstamp = perf_event_time(event);
791
659 if (event->state <= PERF_EVENT_STATE_OFF) 792 if (event->state <= PERF_EVENT_STATE_OFF)
660 return 0; 793 return 0;
661 794
@@ -672,7 +805,9 @@ event_sched_in(struct perf_event *event,
672 return -EAGAIN; 805 return -EAGAIN;
673 } 806 }
674 807
675 event->tstamp_running += ctx->time - event->tstamp_stopped; 808 event->tstamp_running += tstamp - event->tstamp_stopped;
809
810 event->shadow_ctx_time = tstamp - ctx->timestamp;
676 811
677 if (!is_software_event(event)) 812 if (!is_software_event(event))
678 cpuctx->active_oncpu++; 813 cpuctx->active_oncpu++;
@@ -784,11 +919,13 @@ static int group_can_go_on(struct perf_event *event,
784static void add_event_to_ctx(struct perf_event *event, 919static void add_event_to_ctx(struct perf_event *event,
785 struct perf_event_context *ctx) 920 struct perf_event_context *ctx)
786{ 921{
922 u64 tstamp = perf_event_time(event);
923
787 list_add_event(event, ctx); 924 list_add_event(event, ctx);
788 perf_group_attach(event); 925 perf_group_attach(event);
789 event->tstamp_enabled = ctx->time; 926 event->tstamp_enabled = tstamp;
790 event->tstamp_running = ctx->time; 927 event->tstamp_running = tstamp;
791 event->tstamp_stopped = ctx->time; 928 event->tstamp_stopped = tstamp;
792} 929}
793 930
794/* 931/*
@@ -823,7 +960,7 @@ static void __perf_install_in_context(void *info)
823 960
824 add_event_to_ctx(event, ctx); 961 add_event_to_ctx(event, ctx);
825 962
826 if (event->cpu != -1 && event->cpu != smp_processor_id()) 963 if (!event_filter_match(event))
827 goto unlock; 964 goto unlock;
828 965
829 /* 966 /*
@@ -928,14 +1065,13 @@ static void __perf_event_mark_enabled(struct perf_event *event,
928 struct perf_event_context *ctx) 1065 struct perf_event_context *ctx)
929{ 1066{
930 struct perf_event *sub; 1067 struct perf_event *sub;
1068 u64 tstamp = perf_event_time(event);
931 1069
932 event->state = PERF_EVENT_STATE_INACTIVE; 1070 event->state = PERF_EVENT_STATE_INACTIVE;
933 event->tstamp_enabled = ctx->time - event->total_time_enabled; 1071 event->tstamp_enabled = tstamp - event->total_time_enabled;
934 list_for_each_entry(sub, &event->sibling_list, group_entry) { 1072 list_for_each_entry(sub, &event->sibling_list, group_entry) {
935 if (sub->state >= PERF_EVENT_STATE_INACTIVE) { 1073 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
936 sub->tstamp_enabled = 1074 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
937 ctx->time - sub->total_time_enabled;
938 }
939 } 1075 }
940} 1076}
941 1077
@@ -968,7 +1104,7 @@ static void __perf_event_enable(void *info)
968 goto unlock; 1104 goto unlock;
969 __perf_event_mark_enabled(event, ctx); 1105 __perf_event_mark_enabled(event, ctx);
970 1106
971 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1107 if (!event_filter_match(event))
972 goto unlock; 1108 goto unlock;
973 1109
974 /* 1110 /*
@@ -1070,7 +1206,7 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1070 /* 1206 /*
1071 * not supported on inherited events 1207 * not supported on inherited events
1072 */ 1208 */
1073 if (event->attr.inherit) 1209 if (event->attr.inherit || !is_sampling_event(event))
1074 return -EINVAL; 1210 return -EINVAL;
1075 1211
1076 atomic_add(refresh, &event->event_limit); 1212 atomic_add(refresh, &event->event_limit);
@@ -1079,12 +1215,6 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1079 return 0; 1215 return 0;
1080} 1216}
1081 1217
1082enum event_type_t {
1083 EVENT_FLEXIBLE = 0x1,
1084 EVENT_PINNED = 0x2,
1085 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
1086};
1087
1088static void ctx_sched_out(struct perf_event_context *ctx, 1218static void ctx_sched_out(struct perf_event_context *ctx,
1089 struct perf_cpu_context *cpuctx, 1219 struct perf_cpu_context *cpuctx,
1090 enum event_type_t event_type) 1220 enum event_type_t event_type)
@@ -1284,8 +1414,6 @@ void __perf_event_task_sched_out(struct task_struct *task,
1284{ 1414{
1285 int ctxn; 1415 int ctxn;
1286 1416
1287 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
1288
1289 for_each_task_context_nr(ctxn) 1417 for_each_task_context_nr(ctxn)
1290 perf_event_context_sched_out(task, ctxn, next); 1418 perf_event_context_sched_out(task, ctxn, next);
1291} 1419}
@@ -1323,7 +1451,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
1323 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 1451 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1324 if (event->state <= PERF_EVENT_STATE_OFF) 1452 if (event->state <= PERF_EVENT_STATE_OFF)
1325 continue; 1453 continue;
1326 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1454 if (!event_filter_match(event))
1327 continue; 1455 continue;
1328 1456
1329 if (group_can_go_on(event, cpuctx, 1)) 1457 if (group_can_go_on(event, cpuctx, 1))
@@ -1355,7 +1483,7 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1355 * Listen to the 'cpu' scheduling filter constraint 1483 * Listen to the 'cpu' scheduling filter constraint
1356 * of events: 1484 * of events:
1357 */ 1485 */
1358 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1486 if (!event_filter_match(event))
1359 continue; 1487 continue;
1360 1488
1361 if (group_can_go_on(event, cpuctx, can_add_hw)) { 1489 if (group_can_go_on(event, cpuctx, can_add_hw)) {
@@ -1582,7 +1710,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
1582 if (event->state != PERF_EVENT_STATE_ACTIVE) 1710 if (event->state != PERF_EVENT_STATE_ACTIVE)
1583 continue; 1711 continue;
1584 1712
1585 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1713 if (!event_filter_match(event))
1586 continue; 1714 continue;
1587 1715
1588 hwc = &event->hw; 1716 hwc = &event->hw;
@@ -1619,8 +1747,12 @@ static void rotate_ctx(struct perf_event_context *ctx)
1619{ 1747{
1620 raw_spin_lock(&ctx->lock); 1748 raw_spin_lock(&ctx->lock);
1621 1749
1622 /* Rotate the first entry last of non-pinned groups */ 1750 /*
1623 list_rotate_left(&ctx->flexible_groups); 1751 * Rotate the first entry last of non-pinned groups. Rotation might be
1752 * disabled by the inheritance code.
1753 */
1754 if (!ctx->rotate_disable)
1755 list_rotate_left(&ctx->flexible_groups);
1624 1756
1625 raw_spin_unlock(&ctx->lock); 1757 raw_spin_unlock(&ctx->lock);
1626} 1758}
@@ -2096,14 +2228,11 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2096 unsigned long flags; 2228 unsigned long flags;
2097 int ctxn, err; 2229 int ctxn, err;
2098 2230
2099 if (!task && cpu != -1) { 2231 if (!task) {
2100 /* Must be root to operate on a CPU event: */ 2232 /* Must be root to operate on a CPU event: */
2101 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) 2233 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
2102 return ERR_PTR(-EACCES); 2234 return ERR_PTR(-EACCES);
2103 2235
2104 if (cpu < 0 || cpu >= nr_cpumask_bits)
2105 return ERR_PTR(-EINVAL);
2106
2107 /* 2236 /*
2108 * We could be clever and allow to attach a event to an 2237 * We could be clever and allow to attach a event to an
2109 * offline CPU and activate it when the CPU comes up, but 2238 * offline CPU and activate it when the CPU comes up, but
@@ -2232,11 +2361,6 @@ int perf_event_release_kernel(struct perf_event *event)
2232 raw_spin_unlock_irq(&ctx->lock); 2361 raw_spin_unlock_irq(&ctx->lock);
2233 mutex_unlock(&ctx->mutex); 2362 mutex_unlock(&ctx->mutex);
2234 2363
2235 mutex_lock(&event->owner->perf_event_mutex);
2236 list_del_init(&event->owner_entry);
2237 mutex_unlock(&event->owner->perf_event_mutex);
2238 put_task_struct(event->owner);
2239
2240 free_event(event); 2364 free_event(event);
2241 2365
2242 return 0; 2366 return 0;
@@ -2249,35 +2373,44 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
2249static int perf_release(struct inode *inode, struct file *file) 2373static int perf_release(struct inode *inode, struct file *file)
2250{ 2374{
2251 struct perf_event *event = file->private_data; 2375 struct perf_event *event = file->private_data;
2376 struct task_struct *owner;
2252 2377
2253 file->private_data = NULL; 2378 file->private_data = NULL;
2254 2379
2255 return perf_event_release_kernel(event); 2380 rcu_read_lock();
2256} 2381 owner = ACCESS_ONCE(event->owner);
2257 2382 /*
2258static int perf_event_read_size(struct perf_event *event) 2383 * Matches the smp_wmb() in perf_event_exit_task(). If we observe
2259{ 2384 * !owner it means the list deletion is complete and we can indeed
2260 int entry = sizeof(u64); /* value */ 2385 * free this event, otherwise we need to serialize on
2261 int size = 0; 2386 * owner->perf_event_mutex.
2262 int nr = 1; 2387 */
2263 2388 smp_read_barrier_depends();
2264 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 2389 if (owner) {
2265 size += sizeof(u64); 2390 /*
2266 2391 * Since delayed_put_task_struct() also drops the last
2267 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 2392 * task reference we can safely take a new reference
2268 size += sizeof(u64); 2393 * while holding the rcu_read_lock().
2269 2394 */
2270 if (event->attr.read_format & PERF_FORMAT_ID) 2395 get_task_struct(owner);
2271 entry += sizeof(u64);
2272
2273 if (event->attr.read_format & PERF_FORMAT_GROUP) {
2274 nr += event->group_leader->nr_siblings;
2275 size += sizeof(u64);
2276 } 2396 }
2397 rcu_read_unlock();
2277 2398
2278 size += entry * nr; 2399 if (owner) {
2400 mutex_lock(&owner->perf_event_mutex);
2401 /*
2402 * We have to re-check the event->owner field, if it is cleared
2403 * we raced with perf_event_exit_task(), acquiring the mutex
2404 * ensured they're done, and we can proceed with freeing the
2405 * event.
2406 */
2407 if (event->owner)
2408 list_del_init(&event->owner_entry);
2409 mutex_unlock(&owner->perf_event_mutex);
2410 put_task_struct(owner);
2411 }
2279 2412
2280 return size; 2413 return perf_event_release_kernel(event);
2281} 2414}
2282 2415
2283u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) 2416u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
@@ -2394,7 +2527,7 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
2394 if (event->state == PERF_EVENT_STATE_ERROR) 2527 if (event->state == PERF_EVENT_STATE_ERROR)
2395 return 0; 2528 return 0;
2396 2529
2397 if (count < perf_event_read_size(event)) 2530 if (count < event->read_size)
2398 return -ENOSPC; 2531 return -ENOSPC;
2399 2532
2400 WARN_ON_ONCE(event->ctx->parent_ctx); 2533 WARN_ON_ONCE(event->ctx->parent_ctx);
@@ -2480,7 +2613,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
2480 int ret = 0; 2613 int ret = 0;
2481 u64 value; 2614 u64 value;
2482 2615
2483 if (!event->attr.sample_period) 2616 if (!is_sampling_event(event))
2484 return -EINVAL; 2617 return -EINVAL;
2485 2618
2486 if (copy_from_user(&value, arg, sizeof(value))) 2619 if (copy_from_user(&value, arg, sizeof(value)))
@@ -3271,6 +3404,73 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle,
3271 } while (len); 3404 } while (len);
3272} 3405}
3273 3406
3407static void __perf_event_header__init_id(struct perf_event_header *header,
3408 struct perf_sample_data *data,
3409 struct perf_event *event)
3410{
3411 u64 sample_type = event->attr.sample_type;
3412
3413 data->type = sample_type;
3414 header->size += event->id_header_size;
3415
3416 if (sample_type & PERF_SAMPLE_TID) {
3417 /* namespace issues */
3418 data->tid_entry.pid = perf_event_pid(event, current);
3419 data->tid_entry.tid = perf_event_tid(event, current);
3420 }
3421
3422 if (sample_type & PERF_SAMPLE_TIME)
3423 data->time = perf_clock();
3424
3425 if (sample_type & PERF_SAMPLE_ID)
3426 data->id = primary_event_id(event);
3427
3428 if (sample_type & PERF_SAMPLE_STREAM_ID)
3429 data->stream_id = event->id;
3430
3431 if (sample_type & PERF_SAMPLE_CPU) {
3432 data->cpu_entry.cpu = raw_smp_processor_id();
3433 data->cpu_entry.reserved = 0;
3434 }
3435}
3436
3437static void perf_event_header__init_id(struct perf_event_header *header,
3438 struct perf_sample_data *data,
3439 struct perf_event *event)
3440{
3441 if (event->attr.sample_id_all)
3442 __perf_event_header__init_id(header, data, event);
3443}
3444
3445static void __perf_event__output_id_sample(struct perf_output_handle *handle,
3446 struct perf_sample_data *data)
3447{
3448 u64 sample_type = data->type;
3449
3450 if (sample_type & PERF_SAMPLE_TID)
3451 perf_output_put(handle, data->tid_entry);
3452
3453 if (sample_type & PERF_SAMPLE_TIME)
3454 perf_output_put(handle, data->time);
3455
3456 if (sample_type & PERF_SAMPLE_ID)
3457 perf_output_put(handle, data->id);
3458
3459 if (sample_type & PERF_SAMPLE_STREAM_ID)
3460 perf_output_put(handle, data->stream_id);
3461
3462 if (sample_type & PERF_SAMPLE_CPU)
3463 perf_output_put(handle, data->cpu_entry);
3464}
3465
3466static void perf_event__output_id_sample(struct perf_event *event,
3467 struct perf_output_handle *handle,
3468 struct perf_sample_data *sample)
3469{
3470 if (event->attr.sample_id_all)
3471 __perf_event__output_id_sample(handle, sample);
3472}
3473
3274int perf_output_begin(struct perf_output_handle *handle, 3474int perf_output_begin(struct perf_output_handle *handle,
3275 struct perf_event *event, unsigned int size, 3475 struct perf_event *event, unsigned int size,
3276 int nmi, int sample) 3476 int nmi, int sample)
@@ -3278,6 +3478,7 @@ int perf_output_begin(struct perf_output_handle *handle,
3278 struct perf_buffer *buffer; 3478 struct perf_buffer *buffer;
3279 unsigned long tail, offset, head; 3479 unsigned long tail, offset, head;
3280 int have_lost; 3480 int have_lost;
3481 struct perf_sample_data sample_data;
3281 struct { 3482 struct {
3282 struct perf_event_header header; 3483 struct perf_event_header header;
3283 u64 id; 3484 u64 id;
@@ -3304,8 +3505,12 @@ int perf_output_begin(struct perf_output_handle *handle,
3304 goto out; 3505 goto out;
3305 3506
3306 have_lost = local_read(&buffer->lost); 3507 have_lost = local_read(&buffer->lost);
3307 if (have_lost) 3508 if (have_lost) {
3308 size += sizeof(lost_event); 3509 lost_event.header.size = sizeof(lost_event);
3510 perf_event_header__init_id(&lost_event.header, &sample_data,
3511 event);
3512 size += lost_event.header.size;
3513 }
3309 3514
3310 perf_output_get_handle(handle); 3515 perf_output_get_handle(handle);
3311 3516
@@ -3336,11 +3541,11 @@ int perf_output_begin(struct perf_output_handle *handle,
3336 if (have_lost) { 3541 if (have_lost) {
3337 lost_event.header.type = PERF_RECORD_LOST; 3542 lost_event.header.type = PERF_RECORD_LOST;
3338 lost_event.header.misc = 0; 3543 lost_event.header.misc = 0;
3339 lost_event.header.size = sizeof(lost_event);
3340 lost_event.id = event->id; 3544 lost_event.id = event->id;
3341 lost_event.lost = local_xchg(&buffer->lost, 0); 3545 lost_event.lost = local_xchg(&buffer->lost, 0);
3342 3546
3343 perf_output_put(handle, lost_event); 3547 perf_output_put(handle, lost_event);
3548 perf_event__output_id_sample(event, handle, &sample_data);
3344 } 3549 }
3345 3550
3346 return 0; 3551 return 0;
@@ -3373,30 +3578,9 @@ void perf_output_end(struct perf_output_handle *handle)
3373 rcu_read_unlock(); 3578 rcu_read_unlock();
3374} 3579}
3375 3580
3376static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
3377{
3378 /*
3379 * only top level events have the pid namespace they were created in
3380 */
3381 if (event->parent)
3382 event = event->parent;
3383
3384 return task_tgid_nr_ns(p, event->ns);
3385}
3386
3387static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
3388{
3389 /*
3390 * only top level events have the pid namespace they were created in
3391 */
3392 if (event->parent)
3393 event = event->parent;
3394
3395 return task_pid_nr_ns(p, event->ns);
3396}
3397
3398static void perf_output_read_one(struct perf_output_handle *handle, 3581static void perf_output_read_one(struct perf_output_handle *handle,
3399 struct perf_event *event) 3582 struct perf_event *event,
3583 u64 enabled, u64 running)
3400{ 3584{
3401 u64 read_format = event->attr.read_format; 3585 u64 read_format = event->attr.read_format;
3402 u64 values[4]; 3586 u64 values[4];
@@ -3404,11 +3588,11 @@ static void perf_output_read_one(struct perf_output_handle *handle,
3404 3588
3405 values[n++] = perf_event_count(event); 3589 values[n++] = perf_event_count(event);
3406 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 3590 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3407 values[n++] = event->total_time_enabled + 3591 values[n++] = enabled +
3408 atomic64_read(&event->child_total_time_enabled); 3592 atomic64_read(&event->child_total_time_enabled);
3409 } 3593 }
3410 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 3594 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3411 values[n++] = event->total_time_running + 3595 values[n++] = running +
3412 atomic64_read(&event->child_total_time_running); 3596 atomic64_read(&event->child_total_time_running);
3413 } 3597 }
3414 if (read_format & PERF_FORMAT_ID) 3598 if (read_format & PERF_FORMAT_ID)
@@ -3421,7 +3605,8 @@ static void perf_output_read_one(struct perf_output_handle *handle,
3421 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. 3605 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
3422 */ 3606 */
3423static void perf_output_read_group(struct perf_output_handle *handle, 3607static void perf_output_read_group(struct perf_output_handle *handle,
3424 struct perf_event *event) 3608 struct perf_event *event,
3609 u64 enabled, u64 running)
3425{ 3610{
3426 struct perf_event *leader = event->group_leader, *sub; 3611 struct perf_event *leader = event->group_leader, *sub;
3427 u64 read_format = event->attr.read_format; 3612 u64 read_format = event->attr.read_format;
@@ -3431,10 +3616,10 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3431 values[n++] = 1 + leader->nr_siblings; 3616 values[n++] = 1 + leader->nr_siblings;
3432 3617
3433 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 3618 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3434 values[n++] = leader->total_time_enabled; 3619 values[n++] = enabled;
3435 3620
3436 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 3621 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3437 values[n++] = leader->total_time_running; 3622 values[n++] = running;
3438 3623
3439 if (leader != event) 3624 if (leader != event)
3440 leader->pmu->read(leader); 3625 leader->pmu->read(leader);
@@ -3459,13 +3644,35 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3459 } 3644 }
3460} 3645}
3461 3646
3647#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
3648 PERF_FORMAT_TOTAL_TIME_RUNNING)
3649
3462static void perf_output_read(struct perf_output_handle *handle, 3650static void perf_output_read(struct perf_output_handle *handle,
3463 struct perf_event *event) 3651 struct perf_event *event)
3464{ 3652{
3653 u64 enabled = 0, running = 0, now, ctx_time;
3654 u64 read_format = event->attr.read_format;
3655
3656 /*
3657 * compute total_time_enabled, total_time_running
3658 * based on snapshot values taken when the event
3659 * was last scheduled in.
3660 *
 3661	 * we cannot simply call update_context_time()
 3662	 * because of locking issues, as we are called in
 3663	 * NMI context
3664 */
3665 if (read_format & PERF_FORMAT_TOTAL_TIMES) {
3666 now = perf_clock();
3667 ctx_time = event->shadow_ctx_time + now;
3668 enabled = ctx_time - event->tstamp_enabled;
3669 running = ctx_time - event->tstamp_running;
3670 }
3671
3465 if (event->attr.read_format & PERF_FORMAT_GROUP) 3672 if (event->attr.read_format & PERF_FORMAT_GROUP)
3466 perf_output_read_group(handle, event); 3673 perf_output_read_group(handle, event, enabled, running);
3467 else 3674 else
3468 perf_output_read_one(handle, event); 3675 perf_output_read_one(handle, event, enabled, running);
3469} 3676}
3470 3677
3471void perf_output_sample(struct perf_output_handle *handle, 3678void perf_output_sample(struct perf_output_handle *handle,
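perf_output_read() above reconstructs enabled/running from event->shadow_ctx_time because update_context_time() cannot be called from NMI context. A small numeric mirror of that reconstruction, with entirely made-up timestamps (illustration only, not kernel code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Hypothetical nanosecond values, all invented for illustration. */
	uint64_t ctx_timestamp	 = 1000;  /* ctx->timestamp (raw clock) at sched-in */
	uint64_t ctx_time_in	 = 1300;  /* ctx->time (accumulated) at sched-in    */
	uint64_t tstamp_enabled	 = 200;
	uint64_t tstamp_running	 = 1300;
	uint64_t now		 = 5000;  /* perf_clock() inside the NMI            */

	/* event->shadow_ctx_time is recorded in event_sched_in(). */
	uint64_t shadow_ctx_time = ctx_time_in - ctx_timestamp;	/* 300 */

	/* perf_output_read(): approximate ctx->time without taking ctx->lock. */
	uint64_t ctx_time = shadow_ctx_time + now;		/* 5300 */
	uint64_t enabled  = ctx_time - tstamp_enabled;		/* 5100 */
	uint64_t running  = ctx_time - tstamp_running;		/* 4000 */

	printf("enabled=%llu running=%llu\n",
	       (unsigned long long)enabled, (unsigned long long)running);
	return 0;
}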
@@ -3545,61 +3752,16 @@ void perf_prepare_sample(struct perf_event_header *header,
3545{ 3752{
3546 u64 sample_type = event->attr.sample_type; 3753 u64 sample_type = event->attr.sample_type;
3547 3754
3548 data->type = sample_type;
3549
3550 header->type = PERF_RECORD_SAMPLE; 3755 header->type = PERF_RECORD_SAMPLE;
3551 header->size = sizeof(*header); 3756 header->size = sizeof(*header) + event->header_size;
3552 3757
3553 header->misc = 0; 3758 header->misc = 0;
3554 header->misc |= perf_misc_flags(regs); 3759 header->misc |= perf_misc_flags(regs);
3555 3760
3556 if (sample_type & PERF_SAMPLE_IP) { 3761 __perf_event_header__init_id(header, data, event);
3557 data->ip = perf_instruction_pointer(regs);
3558 3762
3559 header->size += sizeof(data->ip); 3763 if (sample_type & PERF_SAMPLE_IP)
3560 } 3764 data->ip = perf_instruction_pointer(regs);
3561
3562 if (sample_type & PERF_SAMPLE_TID) {
3563 /* namespace issues */
3564 data->tid_entry.pid = perf_event_pid(event, current);
3565 data->tid_entry.tid = perf_event_tid(event, current);
3566
3567 header->size += sizeof(data->tid_entry);
3568 }
3569
3570 if (sample_type & PERF_SAMPLE_TIME) {
3571 data->time = perf_clock();
3572
3573 header->size += sizeof(data->time);
3574 }
3575
3576 if (sample_type & PERF_SAMPLE_ADDR)
3577 header->size += sizeof(data->addr);
3578
3579 if (sample_type & PERF_SAMPLE_ID) {
3580 data->id = primary_event_id(event);
3581
3582 header->size += sizeof(data->id);
3583 }
3584
3585 if (sample_type & PERF_SAMPLE_STREAM_ID) {
3586 data->stream_id = event->id;
3587
3588 header->size += sizeof(data->stream_id);
3589 }
3590
3591 if (sample_type & PERF_SAMPLE_CPU) {
3592 data->cpu_entry.cpu = raw_smp_processor_id();
3593 data->cpu_entry.reserved = 0;
3594
3595 header->size += sizeof(data->cpu_entry);
3596 }
3597
3598 if (sample_type & PERF_SAMPLE_PERIOD)
3599 header->size += sizeof(data->period);
3600
3601 if (sample_type & PERF_SAMPLE_READ)
3602 header->size += perf_event_read_size(event);
3603 3765
3604 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 3766 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3605 int size = 1; 3767 int size = 1;
@@ -3664,23 +3826,26 @@ perf_event_read_event(struct perf_event *event,
3664 struct task_struct *task) 3826 struct task_struct *task)
3665{ 3827{
3666 struct perf_output_handle handle; 3828 struct perf_output_handle handle;
3829 struct perf_sample_data sample;
3667 struct perf_read_event read_event = { 3830 struct perf_read_event read_event = {
3668 .header = { 3831 .header = {
3669 .type = PERF_RECORD_READ, 3832 .type = PERF_RECORD_READ,
3670 .misc = 0, 3833 .misc = 0,
3671 .size = sizeof(read_event) + perf_event_read_size(event), 3834 .size = sizeof(read_event) + event->read_size,
3672 }, 3835 },
3673 .pid = perf_event_pid(event, task), 3836 .pid = perf_event_pid(event, task),
3674 .tid = perf_event_tid(event, task), 3837 .tid = perf_event_tid(event, task),
3675 }; 3838 };
3676 int ret; 3839 int ret;
3677 3840
3841 perf_event_header__init_id(&read_event.header, &sample, event);
3678 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); 3842 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3679 if (ret) 3843 if (ret)
3680 return; 3844 return;
3681 3845
3682 perf_output_put(&handle, read_event); 3846 perf_output_put(&handle, read_event);
3683 perf_output_read(&handle, event); 3847 perf_output_read(&handle, event);
3848 perf_event__output_id_sample(event, &handle, &sample);
3684 3849
3685 perf_output_end(&handle); 3850 perf_output_end(&handle);
3686} 3851}
@@ -3710,14 +3875,16 @@ static void perf_event_task_output(struct perf_event *event,
3710 struct perf_task_event *task_event) 3875 struct perf_task_event *task_event)
3711{ 3876{
3712 struct perf_output_handle handle; 3877 struct perf_output_handle handle;
3878 struct perf_sample_data sample;
3713 struct task_struct *task = task_event->task; 3879 struct task_struct *task = task_event->task;
3714 int size, ret; 3880 int ret, size = task_event->event_id.header.size;
3715 3881
3716 size = task_event->event_id.header.size; 3882 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
3717 ret = perf_output_begin(&handle, event, size, 0, 0);
3718 3883
3884 ret = perf_output_begin(&handle, event,
3885 task_event->event_id.header.size, 0, 0);
3719 if (ret) 3886 if (ret)
3720 return; 3887 goto out;
3721 3888
3722 task_event->event_id.pid = perf_event_pid(event, task); 3889 task_event->event_id.pid = perf_event_pid(event, task);
3723 task_event->event_id.ppid = perf_event_pid(event, current); 3890 task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3727,7 +3894,11 @@ static void perf_event_task_output(struct perf_event *event,
3727 3894
3728 perf_output_put(&handle, task_event->event_id); 3895 perf_output_put(&handle, task_event->event_id);
3729 3896
3897 perf_event__output_id_sample(event, &handle, &sample);
3898
3730 perf_output_end(&handle); 3899 perf_output_end(&handle);
3900out:
3901 task_event->event_id.header.size = size;
3731} 3902}
3732 3903
3733static int perf_event_task_match(struct perf_event *event) 3904static int perf_event_task_match(struct perf_event *event)
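
Several of the following hunks repeat one shape for side-band records (READ, TASK, COMM, MMAP, THROTTLE): grow the header for an optional sample-id trailer, emit the record, append the id sample, then restore the cached header size. A hedged reconstruction of that shape as it might sit inside kernel/perf_event.c; the wrapper name and the rec_len parameter are invented, and hdr is assumed to be the header embedded at the start of rec:

static void output_sideband_record(struct perf_event *event,
				   struct perf_event_header *hdr,
				   const void *rec, unsigned int rec_len)
{
	struct perf_output_handle handle;
	struct perf_sample_data sample;
	u16 saved_size = hdr->size;

	/* May grow hdr->size to make room for the sample-id trailer. */
	perf_event_header__init_id(hdr, &sample, event);

	if (perf_output_begin(&handle, event, hdr->size, 0, 0))
		goto out;

	perf_output_copy(&handle, rec, rec_len);	/* fixed payload */
	perf_event__output_id_sample(event, &handle, &sample);
	perf_output_end(&handle);
out:
	hdr->size = saved_size;	/* undo the growth for the next consumer */
}
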
@@ -3735,7 +3906,7 @@ static int perf_event_task_match(struct perf_event *event)
3735 if (event->state < PERF_EVENT_STATE_INACTIVE) 3906 if (event->state < PERF_EVENT_STATE_INACTIVE)
3736 return 0; 3907 return 0;
3737 3908
3738 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3909 if (!event_filter_match(event))
3739 return 0; 3910 return 0;
3740 3911
3741 if (event->attr.comm || event->attr.mmap || 3912 if (event->attr.comm || event->attr.mmap ||
@@ -3766,6 +3937,8 @@ static void perf_event_task_event(struct perf_task_event *task_event)
3766 rcu_read_lock(); 3937 rcu_read_lock();
3767 list_for_each_entry_rcu(pmu, &pmus, entry) { 3938 list_for_each_entry_rcu(pmu, &pmus, entry) {
3768 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 3939 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3940 if (cpuctx->active_pmu != pmu)
3941 goto next;
3769 perf_event_task_ctx(&cpuctx->ctx, task_event); 3942 perf_event_task_ctx(&cpuctx->ctx, task_event);
3770 3943
3771 ctx = task_event->task_ctx; 3944 ctx = task_event->task_ctx;
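
The added active_pmu test above (and the matching ones in the comm and mmap paths below) guards against double delivery: when several software PMUs share one per-CPU context, only the context's designated active_pmu should walk it. A toy sketch of that dedup, with hypothetical types:

struct toy_cpu_ctx;

struct toy_pmu {
	struct toy_cpu_ctx *cpuctx;	/* may be shared with other pmus */
};

struct toy_cpu_ctx {
	struct toy_pmu *active_pmu;	/* the one owner allowed to act */
};

static void visit_unique_contexts(struct toy_pmu *pmu, int nr,
				  void (*fn)(struct toy_cpu_ctx *))
{
	int i;

	for (i = 0; i < nr; i++) {
		struct toy_cpu_ctx *ctx = pmu[i].cpuctx;

		if (ctx->active_pmu != &pmu[i])
			continue;	/* shared context, handled elsewhere */
		fn(ctx);
	}
}
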
@@ -3840,11 +4013,16 @@ static void perf_event_comm_output(struct perf_event *event,
3840 struct perf_comm_event *comm_event) 4013 struct perf_comm_event *comm_event)
3841{ 4014{
3842 struct perf_output_handle handle; 4015 struct perf_output_handle handle;
4016 struct perf_sample_data sample;
3843 int size = comm_event->event_id.header.size; 4017 int size = comm_event->event_id.header.size;
3844 int ret = perf_output_begin(&handle, event, size, 0, 0); 4018 int ret;
4019
4020 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4021 ret = perf_output_begin(&handle, event,
4022 comm_event->event_id.header.size, 0, 0);
3845 4023
3846 if (ret) 4024 if (ret)
3847 return; 4025 goto out;
3848 4026
3849 comm_event->event_id.pid = perf_event_pid(event, comm_event->task); 4027 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
3850 comm_event->event_id.tid = perf_event_tid(event, comm_event->task); 4028 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
@@ -3852,7 +4030,12 @@ static void perf_event_comm_output(struct perf_event *event,
3852 perf_output_put(&handle, comm_event->event_id); 4030 perf_output_put(&handle, comm_event->event_id);
3853 perf_output_copy(&handle, comm_event->comm, 4031 perf_output_copy(&handle, comm_event->comm,
3854 comm_event->comm_size); 4032 comm_event->comm_size);
4033
4034 perf_event__output_id_sample(event, &handle, &sample);
4035
3855 perf_output_end(&handle); 4036 perf_output_end(&handle);
4037out:
4038 comm_event->event_id.header.size = size;
3856} 4039}
3857 4040
3858static int perf_event_comm_match(struct perf_event *event) 4041static int perf_event_comm_match(struct perf_event *event)
@@ -3860,7 +4043,7 @@ static int perf_event_comm_match(struct perf_event *event)
3860 if (event->state < PERF_EVENT_STATE_INACTIVE) 4043 if (event->state < PERF_EVENT_STATE_INACTIVE)
3861 return 0; 4044 return 0;
3862 4045
3863 if (event->cpu != -1 && event->cpu != smp_processor_id()) 4046 if (!event_filter_match(event))
3864 return 0; 4047 return 0;
3865 4048
3866 if (event->attr.comm) 4049 if (event->attr.comm)
@@ -3897,10 +4080,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3897 comm_event->comm_size = size; 4080 comm_event->comm_size = size;
3898 4081
3899 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 4082 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3900
3901 rcu_read_lock(); 4083 rcu_read_lock();
3902 list_for_each_entry_rcu(pmu, &pmus, entry) { 4084 list_for_each_entry_rcu(pmu, &pmus, entry) {
3903 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 4085 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4086 if (cpuctx->active_pmu != pmu)
4087 goto next;
3904 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 4088 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3905 4089
3906 ctxn = pmu->task_ctx_nr; 4090 ctxn = pmu->task_ctx_nr;
@@ -3976,11 +4160,15 @@ static void perf_event_mmap_output(struct perf_event *event,
3976 struct perf_mmap_event *mmap_event) 4160 struct perf_mmap_event *mmap_event)
3977{ 4161{
3978 struct perf_output_handle handle; 4162 struct perf_output_handle handle;
4163 struct perf_sample_data sample;
3979 int size = mmap_event->event_id.header.size; 4164 int size = mmap_event->event_id.header.size;
3980 int ret = perf_output_begin(&handle, event, size, 0, 0); 4165 int ret;
3981 4166
4167 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4168 ret = perf_output_begin(&handle, event,
4169 mmap_event->event_id.header.size, 0, 0);
3982 if (ret) 4170 if (ret)
3983 return; 4171 goto out;
3984 4172
3985 mmap_event->event_id.pid = perf_event_pid(event, current); 4173 mmap_event->event_id.pid = perf_event_pid(event, current);
3986 mmap_event->event_id.tid = perf_event_tid(event, current); 4174 mmap_event->event_id.tid = perf_event_tid(event, current);
@@ -3988,7 +4176,12 @@ static void perf_event_mmap_output(struct perf_event *event,
3988 perf_output_put(&handle, mmap_event->event_id); 4176 perf_output_put(&handle, mmap_event->event_id);
3989 perf_output_copy(&handle, mmap_event->file_name, 4177 perf_output_copy(&handle, mmap_event->file_name,
3990 mmap_event->file_size); 4178 mmap_event->file_size);
4179
4180 perf_event__output_id_sample(event, &handle, &sample);
4181
3991 perf_output_end(&handle); 4182 perf_output_end(&handle);
4183out:
4184 mmap_event->event_id.header.size = size;
3992} 4185}
3993 4186
3994static int perf_event_mmap_match(struct perf_event *event, 4187static int perf_event_mmap_match(struct perf_event *event,
@@ -3998,7 +4191,7 @@ static int perf_event_mmap_match(struct perf_event *event,
3998 if (event->state < PERF_EVENT_STATE_INACTIVE) 4191 if (event->state < PERF_EVENT_STATE_INACTIVE)
3999 return 0; 4192 return 0;
4000 4193
4001 if (event->cpu != -1 && event->cpu != smp_processor_id()) 4194 if (!event_filter_match(event))
4002 return 0; 4195 return 0;
4003 4196
4004 if ((!executable && event->attr.mmap_data) || 4197 if ((!executable && event->attr.mmap_data) ||
@@ -4086,6 +4279,8 @@ got_name:
4086 rcu_read_lock(); 4279 rcu_read_lock();
4087 list_for_each_entry_rcu(pmu, &pmus, entry) { 4280 list_for_each_entry_rcu(pmu, &pmus, entry) {
4088 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 4281 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4282 if (cpuctx->active_pmu != pmu)
4283 goto next;
4089 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, 4284 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
4090 vma->vm_flags & VM_EXEC); 4285 vma->vm_flags & VM_EXEC);
4091 4286
@@ -4141,6 +4336,7 @@ void perf_event_mmap(struct vm_area_struct *vma)
4141static void perf_log_throttle(struct perf_event *event, int enable) 4336static void perf_log_throttle(struct perf_event *event, int enable)
4142{ 4337{
4143 struct perf_output_handle handle; 4338 struct perf_output_handle handle;
4339 struct perf_sample_data sample;
4144 int ret; 4340 int ret;
4145 4341
4146 struct { 4342 struct {
@@ -4162,11 +4358,15 @@ static void perf_log_throttle(struct perf_event *event, int enable)
4162 if (enable) 4358 if (enable)
4163 throttle_event.header.type = PERF_RECORD_UNTHROTTLE; 4359 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
4164 4360
4165 ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0); 4361 perf_event_header__init_id(&throttle_event.header, &sample, event);
4362
4363 ret = perf_output_begin(&handle, event,
4364 throttle_event.header.size, 1, 0);
4166 if (ret) 4365 if (ret)
4167 return; 4366 return;
4168 4367
4169 perf_output_put(&handle, throttle_event); 4368 perf_output_put(&handle, throttle_event);
4369 perf_event__output_id_sample(event, &handle, &sample);
4170 perf_output_end(&handle); 4370 perf_output_end(&handle);
4171} 4371}
4172 4372
@@ -4182,6 +4382,13 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
4182 struct hw_perf_event *hwc = &event->hw; 4382 struct hw_perf_event *hwc = &event->hw;
4183 int ret = 0; 4383 int ret = 0;
4184 4384
4385 /*
4386 * Non-sampling counters might still use the PMI to fold short
4387 * hardware counters, ignore those.
4388 */
4389 if (unlikely(!is_sampling_event(event)))
4390 return 0;
4391
4185 if (!throttle) { 4392 if (!throttle) {
4186 hwc->interrupts++; 4393 hwc->interrupts++;
4187 } else { 4394 } else {
@@ -4327,7 +4534,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
4327 if (!regs) 4534 if (!regs)
4328 return; 4535 return;
4329 4536
4330 if (!hwc->sample_period) 4537 if (!is_sampling_event(event))
4331 return; 4538 return;
4332 4539
4333 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4540 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
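
The raw hwc->sample_period checks above are being replaced by is_sampling_event(); presumably the helper (defined in the perf headers of this release) just tests whether the event was configured with a non-zero sample period, so the call sites read as intent rather than as a field poke. Roughly:

#include <linux/perf_event.h>

/* Assumed shape of the helper; the real definition lives in the header. */
static inline bool is_sampling_event_assumed(struct perf_event *event)
{
	return event->attr.sample_period != 0;
}
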
@@ -4454,7 +4661,7 @@ int perf_swevent_get_recursion_context(void)
4454} 4661}
4455EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); 4662EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4456 4663
4457void inline perf_swevent_put_recursion_context(int rctx) 4664inline void perf_swevent_put_recursion_context(int rctx)
4458{ 4665{
4459 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); 4666 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4460 4667
@@ -4490,7 +4697,7 @@ static int perf_swevent_add(struct perf_event *event, int flags)
4490 struct hw_perf_event *hwc = &event->hw; 4697 struct hw_perf_event *hwc = &event->hw;
4491 struct hlist_head *head; 4698 struct hlist_head *head;
4492 4699
4493 if (hwc->sample_period) { 4700 if (is_sampling_event(event)) {
4494 hwc->last_period = hwc->sample_period; 4701 hwc->last_period = hwc->sample_period;
4495 perf_swevent_set_period(event); 4702 perf_swevent_set_period(event);
4496 } 4703 }
@@ -4655,7 +4862,7 @@ static int perf_swevent_init(struct perf_event *event)
4655 break; 4862 break;
4656 } 4863 }
4657 4864
4658 if (event_id > PERF_COUNT_SW_MAX) 4865 if (event_id >= PERF_COUNT_SW_MAX)
4659 return -ENOENT; 4866 return -ENOENT;
4660 4867
4661 if (!event->parent) { 4868 if (!event->parent) {
@@ -4747,15 +4954,6 @@ static int perf_tp_event_init(struct perf_event *event)
4747 if (event->attr.type != PERF_TYPE_TRACEPOINT) 4954 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4748 return -ENOENT; 4955 return -ENOENT;
4749 4956
4750 /*
4751 * Raw tracepoint data is a severe data leak, only allow root to
4752 * have these.
4753 */
4754 if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4755 perf_paranoid_tracepoint_raw() &&
4756 !capable(CAP_SYS_ADMIN))
4757 return -EPERM;
4758
4759 err = perf_trace_init(event); 4957 err = perf_trace_init(event);
4760 if (err) 4958 if (err)
4761 return err; 4959 return err;
@@ -4778,7 +4976,7 @@ static struct pmu perf_tracepoint = {
4778 4976
4779static inline void perf_tp_register(void) 4977static inline void perf_tp_register(void)
4780{ 4978{
4781 perf_pmu_register(&perf_tracepoint); 4979 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
4782} 4980}
4783 4981
4784static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4982static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4868,31 +5066,33 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4868static void perf_swevent_start_hrtimer(struct perf_event *event) 5066static void perf_swevent_start_hrtimer(struct perf_event *event)
4869{ 5067{
4870 struct hw_perf_event *hwc = &event->hw; 5068 struct hw_perf_event *hwc = &event->hw;
5069 s64 period;
5070
5071 if (!is_sampling_event(event))
5072 return;
4871 5073
4872 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 5074 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4873 hwc->hrtimer.function = perf_swevent_hrtimer; 5075 hwc->hrtimer.function = perf_swevent_hrtimer;
4874 if (hwc->sample_period) {
4875 s64 period = local64_read(&hwc->period_left);
4876 5076
4877 if (period) { 5077 period = local64_read(&hwc->period_left);
4878 if (period < 0) 5078 if (period) {
4879 period = 10000; 5079 if (period < 0)
5080 period = 10000;
4880 5081
4881 local64_set(&hwc->period_left, 0); 5082 local64_set(&hwc->period_left, 0);
4882 } else { 5083 } else {
4883 period = max_t(u64, 10000, hwc->sample_period); 5084 period = max_t(u64, 10000, hwc->sample_period);
4884 } 5085 }
4885 __hrtimer_start_range_ns(&hwc->hrtimer, 5086 __hrtimer_start_range_ns(&hwc->hrtimer,
4886 ns_to_ktime(period), 0, 5087 ns_to_ktime(period), 0,
4887 HRTIMER_MODE_REL_PINNED, 0); 5088 HRTIMER_MODE_REL_PINNED, 0);
4888 }
4889} 5089}
4890 5090
4891static void perf_swevent_cancel_hrtimer(struct perf_event *event) 5091static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4892{ 5092{
4893 struct hw_perf_event *hwc = &event->hw; 5093 struct hw_perf_event *hwc = &event->hw;
4894 5094
4895 if (hwc->sample_period) { 5095 if (is_sampling_event(event)) {
4896 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); 5096 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4897 local64_set(&hwc->period_left, ktime_to_ns(remaining)); 5097 local64_set(&hwc->period_left, ktime_to_ns(remaining));
4898 5098
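
The restructured perf_swevent_start_hrtimer() above bails out early for non-sampling events and then picks a timer period: resume any leftover period (clamping bogus negative values), otherwise fall back to the configured sample period, never below 10 microseconds so the timer cannot effectively spin. A side-effect-free sketch of just the selection (the real code also clears period_left); the helper name is made up:

#include <linux/kernel.h>	/* max_t */
#include <linux/types.h>

static s64 pick_hrtimer_period(s64 period_left, u64 sample_period)
{
	if (period_left) {
		if (period_left < 0)		/* stale/bogus leftover */
			period_left = 10000;	/* 10 us floor */
		return period_left;
	}
	return max_t(u64, 10000, sample_period);
}
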
@@ -5087,25 +5287,94 @@ static void *find_pmu_context(int ctxn)
5087 return NULL; 5287 return NULL;
5088} 5288}
5089 5289
5090static void free_pmu_context(void * __percpu cpu_context) 5290static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
5091{ 5291{
5092 struct pmu *pmu; 5292 int cpu;
5293
5294 for_each_possible_cpu(cpu) {
5295 struct perf_cpu_context *cpuctx;
5296
5297 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5298
5299 if (cpuctx->active_pmu == old_pmu)
5300 cpuctx->active_pmu = pmu;
5301 }
5302}
5303
5304static void free_pmu_context(struct pmu *pmu)
5305{
5306 struct pmu *i;
5093 5307
5094 mutex_lock(&pmus_lock); 5308 mutex_lock(&pmus_lock);
5095 /* 5309 /*
5096 * Like a real lame refcount. 5310 * Like a real lame refcount.
5097 */ 5311 */
5098 list_for_each_entry(pmu, &pmus, entry) { 5312 list_for_each_entry(i, &pmus, entry) {
5099 if (pmu->pmu_cpu_context == cpu_context) 5313 if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
5314 update_pmu_context(i, pmu);
5100 goto out; 5315 goto out;
5316 }
5101 } 5317 }
5102 5318
5103 free_percpu(cpu_context); 5319 free_percpu(pmu->pmu_cpu_context);
5104out: 5320out:
5105 mutex_unlock(&pmus_lock); 5321 mutex_unlock(&pmus_lock);
5106} 5322}
5323static struct idr pmu_idr;
5324
5325static ssize_t
5326type_show(struct device *dev, struct device_attribute *attr, char *page)
5327{
5328 struct pmu *pmu = dev_get_drvdata(dev);
5329
5330 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
5331}
5332
5333static struct device_attribute pmu_dev_attrs[] = {
5334 __ATTR_RO(type),
5335 __ATTR_NULL,
5336};
5337
5338static int pmu_bus_running;
5339static struct bus_type pmu_bus = {
5340 .name = "event_source",
5341 .dev_attrs = pmu_dev_attrs,
5342};
5343
5344static void pmu_dev_release(struct device *dev)
5345{
5346 kfree(dev);
5347}
5348
5349static int pmu_dev_alloc(struct pmu *pmu)
5350{
5351 int ret = -ENOMEM;
5352
5353 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
5354 if (!pmu->dev)
5355 goto out;
5356
5357 device_initialize(pmu->dev);
5358 ret = dev_set_name(pmu->dev, "%s", pmu->name);
5359 if (ret)
5360 goto free_dev;
5361
5362 dev_set_drvdata(pmu->dev, pmu);
5363 pmu->dev->bus = &pmu_bus;
5364 pmu->dev->release = pmu_dev_release;
5365 ret = device_add(pmu->dev);
5366 if (ret)
5367 goto free_dev;
5368
5369out:
5370 return ret;
5371
5372free_dev:
5373 put_device(pmu->dev);
5374 goto out;
5375}
5107 5376
5108int perf_pmu_register(struct pmu *pmu) 5377int perf_pmu_register(struct pmu *pmu, char *name, int type)
5109{ 5378{
5110 int cpu, ret; 5379 int cpu, ret;
5111 5380
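
pmu_dev_alloc() above follows the usual pattern for a dynamically allocated struct device: allocate, device_initialize(), name it, attach bus/drvdata/release, device_add(), and on any failure drop the reference with put_device() so the release callback frees the allocation. A trimmed, generic sketch under those assumptions; all names here are illustrative:

#include <linux/device.h>
#include <linux/slab.h>

static void toy_dev_release(struct device *dev)
{
	kfree(dev);			/* we kzalloc'ed it below */
}

static struct device *toy_dev_create(struct bus_type *bus, const char *name,
				     void *drvdata)
{
	struct device *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return NULL;

	device_initialize(dev);		/* refcount = 1, nothing in sysfs yet */
	dev->bus = bus;
	dev->release = toy_dev_release;
	dev_set_drvdata(dev, drvdata);

	if (dev_set_name(dev, "%s", name) || device_add(dev)) {
		put_device(dev);	/* drops the ref; ->release frees */
		return NULL;
	}

	return dev;
}
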
@@ -5115,13 +5384,38 @@ int perf_pmu_register(struct pmu *pmu)
5115 if (!pmu->pmu_disable_count) 5384 if (!pmu->pmu_disable_count)
5116 goto unlock; 5385 goto unlock;
5117 5386
5387 pmu->type = -1;
5388 if (!name)
5389 goto skip_type;
5390 pmu->name = name;
5391
5392 if (type < 0) {
5393 int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
5394 if (!err)
5395 goto free_pdc;
5396
5397 err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
5398 if (err) {
5399 ret = err;
5400 goto free_pdc;
5401 }
5402 }
5403 pmu->type = type;
5404
5405 if (pmu_bus_running) {
5406 ret = pmu_dev_alloc(pmu);
5407 if (ret)
5408 goto free_idr;
5409 }
5410
5411skip_type:
5118 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); 5412 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
5119 if (pmu->pmu_cpu_context) 5413 if (pmu->pmu_cpu_context)
5120 goto got_cpu_context; 5414 goto got_cpu_context;
5121 5415
5122 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); 5416 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
5123 if (!pmu->pmu_cpu_context) 5417 if (!pmu->pmu_cpu_context)
5124 goto free_pdc; 5418 goto free_dev;
5125 5419
5126 for_each_possible_cpu(cpu) { 5420 for_each_possible_cpu(cpu) {
5127 struct perf_cpu_context *cpuctx; 5421 struct perf_cpu_context *cpuctx;
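
The dynamic-type branch above uses the two-step IDR API of this era: idr_pre_get() preallocates under GFP_KERNEL, then idr_get_new_above() hands out an id at or above the floor (here PERF_TYPE_MAX, so dynamic PMU types never collide with the fixed ones) and stores the pointer for later idr_find(). A minimal sketch of the same pattern with an invented idr and helper:

#include <linux/idr.h>
#include <linux/gfp.h>
#include <linux/errno.h>

static DEFINE_IDR(toy_idr);

/* Returns an id >= floor on success, or a negative errno. */
static int toy_idr_alloc(void *ptr, int floor)
{
	int id, err;

	if (!idr_pre_get(&toy_idr, GFP_KERNEL))	/* preallocate nodes */
		return -ENOMEM;

	err = idr_get_new_above(&toy_idr, ptr, floor, &id);
	if (err)
		return err;

	return id;	/* look up later with idr_find(&toy_idr, id) */
}
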
@@ -5132,6 +5426,7 @@ int perf_pmu_register(struct pmu *pmu)
5132 cpuctx->ctx.pmu = pmu; 5426 cpuctx->ctx.pmu = pmu;
5133 cpuctx->jiffies_interval = 1; 5427 cpuctx->jiffies_interval = 1;
5134 INIT_LIST_HEAD(&cpuctx->rotation_list); 5428 INIT_LIST_HEAD(&cpuctx->rotation_list);
5429 cpuctx->active_pmu = pmu;
5135 } 5430 }
5136 5431
5137got_cpu_context: 5432got_cpu_context:
@@ -5164,6 +5459,14 @@ unlock:
5164 5459
5165 return ret; 5460 return ret;
5166 5461
5462free_dev:
5463 device_del(pmu->dev);
5464 put_device(pmu->dev);
5465
5466free_idr:
5467 if (pmu->type >= PERF_TYPE_MAX)
5468 idr_remove(&pmu_idr, pmu->type);
5469
5167free_pdc: 5470free_pdc:
5168 free_percpu(pmu->pmu_disable_count); 5471 free_percpu(pmu->pmu_disable_count);
5169 goto unlock; 5472 goto unlock;
@@ -5183,7 +5486,11 @@ void perf_pmu_unregister(struct pmu *pmu)
5183 synchronize_rcu(); 5486 synchronize_rcu();
5184 5487
5185 free_percpu(pmu->pmu_disable_count); 5488 free_percpu(pmu->pmu_disable_count);
5186 free_pmu_context(pmu->pmu_cpu_context); 5489 if (pmu->type >= PERF_TYPE_MAX)
5490 idr_remove(&pmu_idr, pmu->type);
5491 device_del(pmu->dev);
5492 put_device(pmu->dev);
5493 free_pmu_context(pmu);
5187} 5494}
5188 5495
5189struct pmu *perf_init_event(struct perf_event *event) 5496struct pmu *perf_init_event(struct perf_event *event)
@@ -5192,6 +5499,13 @@ struct pmu *perf_init_event(struct perf_event *event)
5192 int idx; 5499 int idx;
5193 5500
5194 idx = srcu_read_lock(&pmus_srcu); 5501 idx = srcu_read_lock(&pmus_srcu);
5502
5503 rcu_read_lock();
5504 pmu = idr_find(&pmu_idr, event->attr.type);
5505 rcu_read_unlock();
5506 if (pmu)
5507 goto unlock;
5508
5195 list_for_each_entry_rcu(pmu, &pmus, entry) { 5509 list_for_each_entry_rcu(pmu, &pmus, entry) {
5196 int ret = pmu->event_init(event); 5510 int ret = pmu->event_init(event);
5197 if (!ret) 5511 if (!ret)
@@ -5224,6 +5538,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
5224 struct hw_perf_event *hwc; 5538 struct hw_perf_event *hwc;
5225 long err; 5539 long err;
5226 5540
5541 if ((unsigned)cpu >= nr_cpu_ids) {
5542 if (!task || cpu != -1)
5543 return ERR_PTR(-EINVAL);
5544 }
5545
5227 event = kzalloc(sizeof(*event), GFP_KERNEL); 5546 event = kzalloc(sizeof(*event), GFP_KERNEL);
5228 if (!event) 5547 if (!event)
5229 return ERR_PTR(-ENOMEM); 5548 return ERR_PTR(-ENOMEM);
@@ -5272,7 +5591,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
5272 5591
5273 if (!overflow_handler && parent_event) 5592 if (!overflow_handler && parent_event)
5274 overflow_handler = parent_event->overflow_handler; 5593 overflow_handler = parent_event->overflow_handler;
5275 5594
5276 event->overflow_handler = overflow_handler; 5595 event->overflow_handler = overflow_handler;
5277 5596
5278 if (attr->disabled) 5597 if (attr->disabled)
@@ -5651,12 +5970,18 @@ SYSCALL_DEFINE5(perf_event_open,
5651 mutex_unlock(&ctx->mutex); 5970 mutex_unlock(&ctx->mutex);
5652 5971
5653 event->owner = current; 5972 event->owner = current;
5654 get_task_struct(current); 5973
5655 mutex_lock(&current->perf_event_mutex); 5974 mutex_lock(&current->perf_event_mutex);
5656 list_add_tail(&event->owner_entry, &current->perf_event_list); 5975 list_add_tail(&event->owner_entry, &current->perf_event_list);
5657 mutex_unlock(&current->perf_event_mutex); 5976 mutex_unlock(&current->perf_event_mutex);
5658 5977
5659 /* 5978 /*
5979 * Precalculate sample_data sizes
5980 */
5981 perf_event__header_size(event);
5982 perf_event__id_header_size(event);
5983
5984 /*
5660 * Drop the reference on the group_event after placing the 5985 * Drop the reference on the group_event after placing the
5661 * new event on the sibling_list. This ensures destruction 5986 * new event on the sibling_list. This ensures destruction
5662 * of the group leader will find the pointer to itself in 5987 * of the group leader will find the pointer to itself in
@@ -5719,12 +6044,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5719 ++ctx->generation; 6044 ++ctx->generation;
5720 mutex_unlock(&ctx->mutex); 6045 mutex_unlock(&ctx->mutex);
5721 6046
5722 event->owner = current;
5723 get_task_struct(current);
5724 mutex_lock(&current->perf_event_mutex);
5725 list_add_tail(&event->owner_entry, &current->perf_event_list);
5726 mutex_unlock(&current->perf_event_mutex);
5727
5728 return event; 6047 return event;
5729 6048
5730err_free: 6049err_free:
@@ -5875,8 +6194,24 @@ again:
5875 */ 6194 */
5876void perf_event_exit_task(struct task_struct *child) 6195void perf_event_exit_task(struct task_struct *child)
5877{ 6196{
6197 struct perf_event *event, *tmp;
5878 int ctxn; 6198 int ctxn;
5879 6199
6200 mutex_lock(&child->perf_event_mutex);
6201 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
6202 owner_entry) {
6203 list_del_init(&event->owner_entry);
6204
6205 /*
6206 * Ensure the list deletion is visible before we clear
 6207 * the owner; this closes a race against perf_release(), where
6208 * we need to serialize on the owner->perf_event_mutex.
6209 */
6210 smp_wmb();
6211 event->owner = NULL;
6212 }
6213 mutex_unlock(&child->perf_event_mutex);
6214
5880 for_each_task_context_nr(ctxn) 6215 for_each_task_context_nr(ctxn)
5881 perf_event_exit_task_context(child, ctxn); 6216 perf_event_exit_task_context(child, ctxn);
5882} 6217}
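
The new loop in perf_event_exit_task() above detaches every event from the exiting owner and relies on a write barrier so that anyone who still observes event->owner also observes the list removal. A reduced sketch of that publication order, with hypothetical types (smp_wmb() comes from the arch headers pulled in via sched.h):

#include <linux/list.h>
#include <linux/sched.h>

struct toy_event {
	struct list_head owner_entry;
	struct task_struct *owner;
};

static void toy_disown(struct toy_event *event)
{
	list_del_init(&event->owner_entry);

	/*
	 * Make the list removal visible before clearing ->owner, so a
	 * racing reader that still sees a non-NULL owner will also see
	 * the entry already unlinked (mirrors the comment above).
	 */
	smp_wmb();
	event->owner = NULL;
}
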
@@ -5999,6 +6334,12 @@ inherit_event(struct perf_event *parent_event,
5999 child_event->overflow_handler = parent_event->overflow_handler; 6334 child_event->overflow_handler = parent_event->overflow_handler;
6000 6335
6001 /* 6336 /*
6337 * Precalculate sample_data sizes
6338 */
6339 perf_event__header_size(child_event);
6340 perf_event__id_header_size(child_event);
6341
6342 /*
6002 * Link it up in the child's context: 6343 * Link it up in the child's context:
6003 */ 6344 */
6004 raw_spin_lock_irqsave(&child_ctx->lock, flags); 6345 raw_spin_lock_irqsave(&child_ctx->lock, flags);
@@ -6096,6 +6437,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6096 struct perf_event *event; 6437 struct perf_event *event;
6097 struct task_struct *parent = current; 6438 struct task_struct *parent = current;
6098 int inherited_all = 1; 6439 int inherited_all = 1;
6440 unsigned long flags;
6099 int ret = 0; 6441 int ret = 0;
6100 6442
6101 child->perf_event_ctxp[ctxn] = NULL; 6443 child->perf_event_ctxp[ctxn] = NULL;
@@ -6136,6 +6478,15 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6136 break; 6478 break;
6137 } 6479 }
6138 6480
6481 /*
6482 * We can't hold ctx->lock when iterating the ->flexible_group list due
6483 * to allocations, but we need to prevent rotation because
6484 * rotate_ctx() will change the list from interrupt context.
6485 */
6486 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
6487 parent_ctx->rotate_disable = 1;
6488 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
6489
6139 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { 6490 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
6140 ret = inherit_task_group(event, parent, parent_ctx, 6491 ret = inherit_task_group(event, parent, parent_ctx,
6141 child, ctxn, &inherited_all); 6492 child, ctxn, &inherited_all);
@@ -6143,18 +6494,20 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6143 break; 6494 break;
6144 } 6495 }
6145 6496
6497 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
6498 parent_ctx->rotate_disable = 0;
6499
6146 child_ctx = child->perf_event_ctxp[ctxn]; 6500 child_ctx = child->perf_event_ctxp[ctxn];
6147 6501
6148 if (child_ctx && inherited_all) { 6502 if (child_ctx && inherited_all) {
6149 /* 6503 /*
6150 * Mark the child context as a clone of the parent 6504 * Mark the child context as a clone of the parent
6151 * context, or of whatever the parent is a clone of. 6505 * context, or of whatever the parent is a clone of.
6152 * Note that if the parent is a clone, it could get 6506 *
6153 * uncloned at any point, but that doesn't matter 6507 * Note that if the parent is a clone, the holding of
 6154 * because the list of events and the generation 6508 * parent_ctx->lock prevents it from being uncloned.
6155 * count can't have changed since we took the mutex.
6156 */ 6509 */
6157 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); 6510 cloned_ctx = parent_ctx->parent_ctx;
6158 if (cloned_ctx) { 6511 if (cloned_ctx) {
6159 child_ctx->parent_ctx = cloned_ctx; 6512 child_ctx->parent_ctx = cloned_ctx;
6160 child_ctx->parent_gen = parent_ctx->parent_gen; 6513 child_ctx->parent_gen = parent_ctx->parent_gen;
@@ -6165,6 +6518,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6165 get_ctx(child_ctx->parent_ctx); 6518 get_ctx(child_ctx->parent_ctx);
6166 } 6519 }
6167 6520
6521 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
6168 mutex_unlock(&parent_ctx->mutex); 6522 mutex_unlock(&parent_ctx->mutex);
6169 6523
6170 perf_unpin_context(parent_ctx); 6524 perf_unpin_context(parent_ctx);
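
The rotate_disable handling added across the hunks above follows one pattern: take ctx->lock just long enough to set a flag that stops the timer-interrupt rotation path, walk the list without the lock (the copy may sleep and allocate), then clear the flag under the lock again. A minimal sketch of that pattern with hypothetical types:

#include <linux/spinlock.h>
#include <linux/list.h>

struct ex_ctx {
	raw_spinlock_t lock;
	int rotate_disable;
	struct list_head flexible_groups;
};

static void copy_flexible_groups(struct ex_ctx *ctx,
				 void (*inherit_one)(struct list_head *))
{
	struct list_head *pos;
	unsigned long flags;

	raw_spin_lock_irqsave(&ctx->lock, flags);
	ctx->rotate_disable = 1;	/* rotation must leave the list alone */
	raw_spin_unlock_irqrestore(&ctx->lock, flags);

	list_for_each(pos, &ctx->flexible_groups)
		inherit_one(pos);	/* may allocate; must not hold ctx->lock */

	raw_spin_lock_irqsave(&ctx->lock, flags);
	ctx->rotate_disable = 0;
	raw_spin_unlock_irqrestore(&ctx->lock, flags);
}
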
@@ -6215,7 +6569,7 @@ static void __cpuinit perf_event_init_cpu(int cpu)
6215 mutex_unlock(&swhash->hlist_mutex); 6569 mutex_unlock(&swhash->hlist_mutex);
6216} 6570}
6217 6571
6218#ifdef CONFIG_HOTPLUG_CPU 6572#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
6219static void perf_pmu_rotate_stop(struct pmu *pmu) 6573static void perf_pmu_rotate_stop(struct pmu *pmu)
6220{ 6574{
6221 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 6575 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
@@ -6269,6 +6623,26 @@ static void perf_event_exit_cpu(int cpu)
6269static inline void perf_event_exit_cpu(int cpu) { } 6623static inline void perf_event_exit_cpu(int cpu) { }
6270#endif 6624#endif
6271 6625
6626static int
6627perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
6628{
6629 int cpu;
6630
6631 for_each_online_cpu(cpu)
6632 perf_event_exit_cpu(cpu);
6633
6634 return NOTIFY_OK;
6635}
6636
6637/*
6638 * Run the perf reboot notifier at the very last possible moment so that
6639 * the generic watchdog code runs as long as possible.
6640 */
6641static struct notifier_block perf_reboot_notifier = {
6642 .notifier_call = perf_reboot,
6643 .priority = INT_MIN,
6644};
6645
6272static int __cpuinit 6646static int __cpuinit
6273perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) 6647perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
6274{ 6648{
@@ -6295,11 +6669,47 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
6295 6669
6296void __init perf_event_init(void) 6670void __init perf_event_init(void)
6297{ 6671{
6672 int ret;
6673
6674 idr_init(&pmu_idr);
6675
6298 perf_event_init_all_cpus(); 6676 perf_event_init_all_cpus();
6299 init_srcu_struct(&pmus_srcu); 6677 init_srcu_struct(&pmus_srcu);
6300 perf_pmu_register(&perf_swevent); 6678 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
6301 perf_pmu_register(&perf_cpu_clock); 6679 perf_pmu_register(&perf_cpu_clock, NULL, -1);
6302 perf_pmu_register(&perf_task_clock); 6680 perf_pmu_register(&perf_task_clock, NULL, -1);
6303 perf_tp_register(); 6681 perf_tp_register();
6304 perf_cpu_notifier(perf_cpu_notify); 6682 perf_cpu_notifier(perf_cpu_notify);
6683 register_reboot_notifier(&perf_reboot_notifier);
6684
6685 ret = init_hw_breakpoint();
6686 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
6687}
6688
6689static int __init perf_event_sysfs_init(void)
6690{
6691 struct pmu *pmu;
6692 int ret;
6693
6694 mutex_lock(&pmus_lock);
6695
6696 ret = bus_register(&pmu_bus);
6697 if (ret)
6698 goto unlock;
6699
6700 list_for_each_entry(pmu, &pmus, entry) {
6701 if (!pmu->name || pmu->type < 0)
6702 continue;
6703
6704 ret = pmu_dev_alloc(pmu);
6705 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
6706 }
6707 pmu_bus_running = 1;
6708 ret = 0;
6709
6710unlock:
6711 mutex_unlock(&pmus_lock);
6712
6713 return ret;
6305} 6714}
6715device_initcall(perf_event_sysfs_init);
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index c7a8f453919e..aeaa7f846821 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -121,10 +121,10 @@ static inline int pm_qos_get_value(struct pm_qos_object *o)
121 121
122 switch (o->type) { 122 switch (o->type) {
123 case PM_QOS_MIN: 123 case PM_QOS_MIN:
124 return plist_last(&o->requests)->prio; 124 return plist_first(&o->requests)->prio;
125 125
126 case PM_QOS_MAX: 126 case PM_QOS_MAX:
127 return plist_first(&o->requests)->prio; 127 return plist_last(&o->requests)->prio;
128 128
129 default: 129 default:
130 /* runtime check for not using enum */ 130 /* runtime check for not using enum */
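
The swap above restores the intended plist semantics: a plist keeps its nodes sorted by ascending ->prio, so the smallest value sits at plist_first() and the largest at plist_last(); PM_QOS_MIN therefore wants the former and PM_QOS_MAX the latter. A small sketch under that assumption, with an invented helper name:

#include <linux/plist.h>
#include <linux/types.h>

/* Assumes the list is non-empty, as the caller above already checks. */
static int toy_qos_value(struct plist_head *requests, bool want_min)
{
	return want_min ? plist_first(requests)->prio	/* smallest prio */
			: plist_last(requests)->prio;	/* largest prio */
}
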
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 6842eeba5879..05bb7173850e 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -37,13 +37,13 @@ static int check_clock(const clockid_t which_clock)
37 if (pid == 0) 37 if (pid == 0)
38 return 0; 38 return 0;
39 39
40 read_lock(&tasklist_lock); 40 rcu_read_lock();
41 p = find_task_by_vpid(pid); 41 p = find_task_by_vpid(pid);
42 if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? 42 if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ?
43 same_thread_group(p, current) : thread_group_leader(p))) { 43 same_thread_group(p, current) : has_group_leader_pid(p))) {
44 error = -EINVAL; 44 error = -EINVAL;
45 } 45 }
46 read_unlock(&tasklist_lock); 46 rcu_read_unlock();
47 47
48 return error; 48 return error;
49} 49}
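
check_clock() above drops the tasklist_lock read lock in favour of RCU: find_task_by_vpid() is safe under rcu_read_lock() as long as the task is only inspected inside the read-side section and no reference is kept past rcu_read_unlock(). A minimal sketch of that usage, with an invented helper:

#include <linux/sched.h>
#include <linux/rcupdate.h>
#include <linux/errno.h>

/* Returns 0 if a task with this vpid currently exists, -EINVAL otherwise. */
static int toy_vpid_exists(pid_t pid)
{
	struct task_struct *p;
	int ret = -EINVAL;

	rcu_read_lock();
	p = find_task_by_vpid(pid);
	if (p)
		ret = 0;	/* only inspected under RCU; no get_task_struct() */
	rcu_read_unlock();

	return ret;
}
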
@@ -390,7 +390,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
390 390
391 INIT_LIST_HEAD(&new_timer->it.cpu.entry); 391 INIT_LIST_HEAD(&new_timer->it.cpu.entry);
392 392
393 read_lock(&tasklist_lock); 393 rcu_read_lock();
394 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { 394 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
395 if (pid == 0) { 395 if (pid == 0) {
396 p = current; 396 p = current;
@@ -404,7 +404,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
404 p = current->group_leader; 404 p = current->group_leader;
405 } else { 405 } else {
406 p = find_task_by_vpid(pid); 406 p = find_task_by_vpid(pid);
407 if (p && !thread_group_leader(p)) 407 if (p && !has_group_leader_pid(p))
408 p = NULL; 408 p = NULL;
409 } 409 }
410 } 410 }
@@ -414,7 +414,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
414 } else { 414 } else {
415 ret = -EINVAL; 415 ret = -EINVAL;
416 } 416 }
417 read_unlock(&tasklist_lock); 417 rcu_read_unlock();
418 418
419 return ret; 419 return ret;
420} 420}
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9ca4973f736d..93bd2eb2bc53 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -145,7 +145,13 @@ static int common_timer_del(struct k_itimer *timer);
145 145
146static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); 146static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
147 147
148static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); 148static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
149
150#define lock_timer(tid, flags) \
151({ struct k_itimer *__timr; \
152 __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \
153 __timr; \
154})
149 155
150static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) 156static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
151{ 157{
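
The lock_timer() wrapper above exists for sparse: __cond_lock(x, c) evaluates to plain c in normal builds, but under __CHECKER__ it tells sparse that lock x is acquired exactly when c is non-zero, so a "look up and lock" helper can be checked even though the lock is taken conditionally. A stripped-down sketch of the same trick on a hypothetical object:

#include <linux/compiler.h>
#include <linux/spinlock.h>

struct toy_obj {
	spinlock_t lock;
	/* ... */
};

/* The annotated worker; returns the object with its lock held, or NULL. */
struct toy_obj *__toy_lookup_locked(int id, unsigned long *flags);

/*
 * Callers use the macro; __cond_lock() ties "lock held" to a non-NULL
 * return value, matching how __lock_timer()/lock_timer() pair up above.
 */
#define toy_lookup_locked(id, flags)					\
({									\
	struct toy_obj *__o;						\
	__cond_lock(&__o->lock, __o = __toy_lookup_locked(id, flags));	\
	__o;								\
})
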
@@ -619,7 +625,7 @@ out:
619 * the find to the timer lock. To avoid a deadlock, the timer id MUST 625 * the find to the timer lock. To avoid a deadlock, the timer id MUST
620 * be released without holding the timer lock. 626 * be released without holding the timer lock.
621 */ 627 */
622static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags) 628static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
623{ 629{
624 struct k_itimer *timr; 630 struct k_itimer *timr;
625 /* 631 /*
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 29bff6117abc..265729966ece 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -100,13 +100,9 @@ config PM_SLEEP_ADVANCED_DEBUG
100 depends on PM_ADVANCED_DEBUG 100 depends on PM_ADVANCED_DEBUG
101 default n 101 default n
102 102
103config SUSPEND_NVS
104 bool
105
106config SUSPEND 103config SUSPEND
107 bool "Suspend to RAM and standby" 104 bool "Suspend to RAM and standby"
108 depends on PM && ARCH_SUSPEND_POSSIBLE 105 depends on PM && ARCH_SUSPEND_POSSIBLE
109 select SUSPEND_NVS if HAS_IOMEM
110 default y 106 default y
111 ---help--- 107 ---help---
112 Allow the system to enter sleep states in which main memory is 108 Allow the system to enter sleep states in which main memory is
@@ -140,7 +136,6 @@ config HIBERNATION
140 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE 136 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
141 select LZO_COMPRESS 137 select LZO_COMPRESS
142 select LZO_DECOMPRESS 138 select LZO_DECOMPRESS
143 select SUSPEND_NVS if HAS_IOMEM
144 ---help--- 139 ---help---
145 Enable the suspend to disk (STD) functionality, which is usually 140 Enable the suspend to disk (STD) functionality, which is usually
146 called "hibernation" in user interfaces. STD checkpoints the 141 called "hibernation" in user interfaces. STD checkpoints the
@@ -246,9 +241,13 @@ config PM_OPS
246 depends on PM_SLEEP || PM_RUNTIME 241 depends on PM_SLEEP || PM_RUNTIME
247 default y 242 default y
248 243
244config ARCH_HAS_OPP
245 bool
246
249config PM_OPP 247config PM_OPP
250 bool "Operating Performance Point (OPP) Layer library" 248 bool "Operating Performance Point (OPP) Layer library"
251 depends on PM 249 depends on PM
250 depends on ARCH_HAS_OPP
252 ---help--- 251 ---help---
253 SOCs have a standard set of tuples consisting of frequency and 252 SOCs have a standard set of tuples consisting of frequency and
254 voltage pairs that the device will support per voltage domain. This 253 voltage pairs that the device will support per voltage domain. This
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index f9063c6b185d..c350e18b53e3 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,7 +1,4 @@
1 1ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
2ifeq ($(CONFIG_PM_DEBUG),y)
3EXTRA_CFLAGS += -DDEBUG
4endif
5 2
6obj-$(CONFIG_PM) += main.o 3obj-$(CONFIG_PM) += main.o
7obj-$(CONFIG_PM_SLEEP) += console.o 4obj-$(CONFIG_PM_SLEEP) += console.o
@@ -10,6 +7,5 @@ obj-$(CONFIG_SUSPEND) += suspend.o
10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 7obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ 8obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
12 block_io.o 9 block_io.o
13obj-$(CONFIG_SUSPEND_NVS) += nvs.o
14 10
15obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 11obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 657272e91d0a..1832bd264219 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -51,18 +51,18 @@ enum {
51 51
52static int hibernation_mode = HIBERNATION_SHUTDOWN; 52static int hibernation_mode = HIBERNATION_SHUTDOWN;
53 53
54static struct platform_hibernation_ops *hibernation_ops; 54static const struct platform_hibernation_ops *hibernation_ops;
55 55
56/** 56/**
57 * hibernation_set_ops - set the global hibernate operations 57 * hibernation_set_ops - set the global hibernate operations
58 * @ops: the hibernation operations to use in subsequent hibernation transitions 58 * @ops: the hibernation operations to use in subsequent hibernation transitions
59 */ 59 */
60 60
61void hibernation_set_ops(struct platform_hibernation_ops *ops) 61void hibernation_set_ops(const struct platform_hibernation_ops *ops)
62{ 62{
63 if (ops && !(ops->begin && ops->end && ops->pre_snapshot 63 if (ops && !(ops->begin && ops->end && ops->pre_snapshot
64 && ops->prepare && ops->finish && ops->enter && ops->pre_restore 64 && ops->prepare && ops->finish && ops->enter && ops->pre_restore
65 && ops->restore_cleanup)) { 65 && ops->restore_cleanup && ops->leave)) {
66 WARN_ON(1); 66 WARN_ON(1);
67 return; 67 return;
68 } 68 }
@@ -278,7 +278,7 @@ static int create_image(int platform_mode)
278 goto Enable_irqs; 278 goto Enable_irqs;
279 } 279 }
280 280
281 if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events()) 281 if (hibernation_test(TEST_CORE) || pm_wakeup_pending())
282 goto Power_up; 282 goto Power_up;
283 283
284 in_suspend = 1; 284 in_suspend = 1;
@@ -327,7 +327,6 @@ static int create_image(int platform_mode)
327int hibernation_snapshot(int platform_mode) 327int hibernation_snapshot(int platform_mode)
328{ 328{
329 int error; 329 int error;
330 gfp_t saved_mask;
331 330
332 error = platform_begin(platform_mode); 331 error = platform_begin(platform_mode);
333 if (error) 332 if (error)
@@ -339,7 +338,7 @@ int hibernation_snapshot(int platform_mode)
339 goto Close; 338 goto Close;
340 339
341 suspend_console(); 340 suspend_console();
342 saved_mask = clear_gfp_allowed_mask(GFP_IOFS); 341 pm_restrict_gfp_mask();
343 error = dpm_suspend_start(PMSG_FREEZE); 342 error = dpm_suspend_start(PMSG_FREEZE);
344 if (error) 343 if (error)
345 goto Recover_platform; 344 goto Recover_platform;
@@ -348,7 +347,10 @@ int hibernation_snapshot(int platform_mode)
348 goto Recover_platform; 347 goto Recover_platform;
349 348
350 error = create_image(platform_mode); 349 error = create_image(platform_mode);
351 /* Control returns here after successful restore */ 350 /*
351 * Control returns here (1) after the image has been created or the
352 * image creation has failed and (2) after a successful restore.
353 */
352 354
353 Resume_devices: 355 Resume_devices:
354 /* We may need to release the preallocated image pages here. */ 356 /* We may need to release the preallocated image pages here. */
@@ -357,7 +359,10 @@ int hibernation_snapshot(int platform_mode)
357 359
358 dpm_resume_end(in_suspend ? 360 dpm_resume_end(in_suspend ?
359 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 361 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
360 set_gfp_allowed_mask(saved_mask); 362
363 if (error || !in_suspend)
364 pm_restore_gfp_mask();
365
361 resume_console(); 366 resume_console();
362 Close: 367 Close:
363 platform_end(platform_mode); 368 platform_end(platform_mode);
@@ -452,17 +457,16 @@ static int resume_target_kernel(bool platform_mode)
452int hibernation_restore(int platform_mode) 457int hibernation_restore(int platform_mode)
453{ 458{
454 int error; 459 int error;
455 gfp_t saved_mask;
456 460
457 pm_prepare_console(); 461 pm_prepare_console();
458 suspend_console(); 462 suspend_console();
459 saved_mask = clear_gfp_allowed_mask(GFP_IOFS); 463 pm_restrict_gfp_mask();
460 error = dpm_suspend_start(PMSG_QUIESCE); 464 error = dpm_suspend_start(PMSG_QUIESCE);
461 if (!error) { 465 if (!error) {
462 error = resume_target_kernel(platform_mode); 466 error = resume_target_kernel(platform_mode);
463 dpm_resume_end(PMSG_RECOVER); 467 dpm_resume_end(PMSG_RECOVER);
464 } 468 }
465 set_gfp_allowed_mask(saved_mask); 469 pm_restore_gfp_mask();
466 resume_console(); 470 resume_console();
467 pm_restore_console(); 471 pm_restore_console();
468 return error; 472 return error;
@@ -476,7 +480,6 @@ int hibernation_restore(int platform_mode)
476int hibernation_platform_enter(void) 480int hibernation_platform_enter(void)
477{ 481{
478 int error; 482 int error;
479 gfp_t saved_mask;
480 483
481 if (!hibernation_ops) 484 if (!hibernation_ops)
482 return -ENOSYS; 485 return -ENOSYS;
@@ -492,7 +495,6 @@ int hibernation_platform_enter(void)
492 495
493 entering_platform_hibernation = true; 496 entering_platform_hibernation = true;
494 suspend_console(); 497 suspend_console();
495 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
496 error = dpm_suspend_start(PMSG_HIBERNATE); 498 error = dpm_suspend_start(PMSG_HIBERNATE);
497 if (error) { 499 if (error) {
498 if (hibernation_ops->recover) 500 if (hibernation_ops->recover)
@@ -514,7 +516,7 @@ int hibernation_platform_enter(void)
514 516
515 local_irq_disable(); 517 local_irq_disable();
516 sysdev_suspend(PMSG_HIBERNATE); 518 sysdev_suspend(PMSG_HIBERNATE);
517 if (!pm_check_wakeup_events()) { 519 if (pm_wakeup_pending()) {
518 error = -EAGAIN; 520 error = -EAGAIN;
519 goto Power_up; 521 goto Power_up;
520 } 522 }
@@ -536,7 +538,6 @@ int hibernation_platform_enter(void)
536 Resume_devices: 538 Resume_devices:
537 entering_platform_hibernation = false; 539 entering_platform_hibernation = false;
538 dpm_resume_end(PMSG_RESTORE); 540 dpm_resume_end(PMSG_RESTORE);
539 set_gfp_allowed_mask(saved_mask);
540 resume_console(); 541 resume_console();
541 542
542 Close: 543 Close:
@@ -646,6 +647,8 @@ int hibernate(void)
646 swsusp_free(); 647 swsusp_free();
647 if (!error) 648 if (!error)
648 power_down(); 649 power_down();
650 in_suspend = 0;
651 pm_restore_gfp_mask();
649 } else { 652 } else {
650 pr_debug("PM: Image restored successfully.\n"); 653 pr_debug("PM: Image restored successfully.\n");
651 } 654 }
diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c
deleted file mode 100644
index 1836db60bbb6..000000000000
--- a/kernel/power/nvs.c
+++ /dev/null
@@ -1,136 +0,0 @@
1/*
2 * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory
3 *
4 * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
5 *
6 * This file is released under the GPLv2.
7 */
8
9#include <linux/io.h>
10#include <linux/kernel.h>
11#include <linux/list.h>
12#include <linux/mm.h>
13#include <linux/slab.h>
14#include <linux/suspend.h>
15
16/*
17 * Platforms, like ACPI, may want us to save some memory used by them during
18 * suspend and to restore the contents of this memory during the subsequent
19 * resume. The code below implements a mechanism allowing us to do that.
20 */
21
22struct nvs_page {
23 unsigned long phys_start;
24 unsigned int size;
25 void *kaddr;
26 void *data;
27 struct list_head node;
28};
29
30static LIST_HEAD(nvs_list);
31
32/**
33 * suspend_nvs_register - register platform NVS memory region to save
34 * @start - physical address of the region
35 * @size - size of the region
36 *
37 * The NVS region need not be page-aligned (both ends) and we arrange
38 * things so that the data from page-aligned addresses in this region will
39 * be copied into separate RAM pages.
40 */
41int suspend_nvs_register(unsigned long start, unsigned long size)
42{
43 struct nvs_page *entry, *next;
44
45 while (size > 0) {
46 unsigned int nr_bytes;
47
48 entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
49 if (!entry)
50 goto Error;
51
52 list_add_tail(&entry->node, &nvs_list);
53 entry->phys_start = start;
54 nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
55 entry->size = (size < nr_bytes) ? size : nr_bytes;
56
57 start += entry->size;
58 size -= entry->size;
59 }
60 return 0;
61
62 Error:
63 list_for_each_entry_safe(entry, next, &nvs_list, node) {
64 list_del(&entry->node);
65 kfree(entry);
66 }
67 return -ENOMEM;
68}
69
70/**
71 * suspend_nvs_free - free data pages allocated for saving NVS regions
72 */
73void suspend_nvs_free(void)
74{
75 struct nvs_page *entry;
76
77 list_for_each_entry(entry, &nvs_list, node)
78 if (entry->data) {
79 free_page((unsigned long)entry->data);
80 entry->data = NULL;
81 if (entry->kaddr) {
82 iounmap(entry->kaddr);
83 entry->kaddr = NULL;
84 }
85 }
86}
87
88/**
89 * suspend_nvs_alloc - allocate memory necessary for saving NVS regions
90 */
91int suspend_nvs_alloc(void)
92{
93 struct nvs_page *entry;
94
95 list_for_each_entry(entry, &nvs_list, node) {
96 entry->data = (void *)__get_free_page(GFP_KERNEL);
97 if (!entry->data) {
98 suspend_nvs_free();
99 return -ENOMEM;
100 }
101 }
102 return 0;
103}
104
105/**
106 * suspend_nvs_save - save NVS memory regions
107 */
108void suspend_nvs_save(void)
109{
110 struct nvs_page *entry;
111
112 printk(KERN_INFO "PM: Saving platform NVS memory\n");
113
114 list_for_each_entry(entry, &nvs_list, node)
115 if (entry->data) {
116 entry->kaddr = ioremap(entry->phys_start, entry->size);
117 memcpy(entry->data, entry->kaddr, entry->size);
118 }
119}
120
121/**
122 * suspend_nvs_restore - restore NVS memory regions
123 *
124 * This function is going to be called with interrupts disabled, so it
125 * cannot iounmap the virtual addresses used to access the NVS region.
126 */
127void suspend_nvs_restore(void)
128{
129 struct nvs_page *entry;
130
131 printk(KERN_INFO "PM: Restoring platform NVS memory\n");
132
133 list_for_each_entry(entry, &nvs_list, node)
134 if (entry->data)
135 memcpy(entry->kaddr, entry->data, entry->size);
136}
diff --git a/kernel/power/process.c b/kernel/power/process.c
index e50b4c1b2a0f..d6d2a10320e0 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -64,6 +64,12 @@ static int try_to_freeze_tasks(bool sig_only)
64 * perturb a task in TASK_STOPPED or TASK_TRACED. 64 * perturb a task in TASK_STOPPED or TASK_TRACED.
65 * It is "frozen enough". If the task does wake 65 * It is "frozen enough". If the task does wake
66 * up, it will immediately call try_to_freeze. 66 * up, it will immediately call try_to_freeze.
67 *
68 * Because freeze_task() goes through p's
 69 * scheduler lock after setting TIF_FREEZE, it is
 70 * guaranteed that either we see TASK_RUNNING, or that
71 * try_to_stop() after schedule() in ptrace/signal
72 * stop sees TIF_FREEZE.
67 */ 73 */
68 if (!task_is_stopped_or_traced(p) && 74 if (!task_is_stopped_or_traced(p) &&
69 !freezer_should_skip(p)) 75 !freezer_should_skip(p))
@@ -79,7 +85,7 @@ static int try_to_freeze_tasks(bool sig_only)
79 if (!todo || time_after(jiffies, end_time)) 85 if (!todo || time_after(jiffies, end_time))
80 break; 86 break;
81 87
82 if (!pm_check_wakeup_events()) { 88 if (pm_wakeup_pending()) {
83 wakeup = true; 89 wakeup = true;
84 break; 90 break;
85 } 91 }
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 7335952ee473..de6f86bfa303 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -22,6 +22,7 @@
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/suspend.h> 24#include <linux/suspend.h>
25#include <trace/events/power.h>
25 26
26#include "power.h" 27#include "power.h"
27 28
@@ -30,13 +31,13 @@ const char *const pm_states[PM_SUSPEND_MAX] = {
30 [PM_SUSPEND_MEM] = "mem", 31 [PM_SUSPEND_MEM] = "mem",
31}; 32};
32 33
33static struct platform_suspend_ops *suspend_ops; 34static const struct platform_suspend_ops *suspend_ops;
34 35
35/** 36/**
36 * suspend_set_ops - Set the global suspend method table. 37 * suspend_set_ops - Set the global suspend method table.
37 * @ops: Pointer to ops structure. 38 * @ops: Pointer to ops structure.
38 */ 39 */
39void suspend_set_ops(struct platform_suspend_ops *ops) 40void suspend_set_ops(const struct platform_suspend_ops *ops)
40{ 41{
41 mutex_lock(&pm_mutex); 42 mutex_lock(&pm_mutex);
42 suspend_ops = ops; 43 suspend_ops = ops;
@@ -163,7 +164,7 @@ static int suspend_enter(suspend_state_t state)
163 164
164 error = sysdev_suspend(PMSG_SUSPEND); 165 error = sysdev_suspend(PMSG_SUSPEND);
165 if (!error) { 166 if (!error) {
166 if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) { 167 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
167 error = suspend_ops->enter(state); 168 error = suspend_ops->enter(state);
168 events_check_enabled = false; 169 events_check_enabled = false;
169 } 170 }
@@ -197,18 +198,18 @@ static int suspend_enter(suspend_state_t state)
197int suspend_devices_and_enter(suspend_state_t state) 198int suspend_devices_and_enter(suspend_state_t state)
198{ 199{
199 int error; 200 int error;
200 gfp_t saved_mask;
201 201
202 if (!suspend_ops) 202 if (!suspend_ops)
203 return -ENOSYS; 203 return -ENOSYS;
204 204
205 trace_machine_suspend(state);
205 if (suspend_ops->begin) { 206 if (suspend_ops->begin) {
206 error = suspend_ops->begin(state); 207 error = suspend_ops->begin(state);
207 if (error) 208 if (error)
208 goto Close; 209 goto Close;
209 } 210 }
210 suspend_console(); 211 suspend_console();
211 saved_mask = clear_gfp_allowed_mask(GFP_IOFS); 212 pm_restrict_gfp_mask();
212 suspend_test_start(); 213 suspend_test_start();
213 error = dpm_suspend_start(PMSG_SUSPEND); 214 error = dpm_suspend_start(PMSG_SUSPEND);
214 if (error) { 215 if (error) {
@@ -225,11 +226,12 @@ int suspend_devices_and_enter(suspend_state_t state)
225 suspend_test_start(); 226 suspend_test_start();
226 dpm_resume_end(PMSG_RESUME); 227 dpm_resume_end(PMSG_RESUME);
227 suspend_test_finish("resume devices"); 228 suspend_test_finish("resume devices");
228 set_gfp_allowed_mask(saved_mask); 229 pm_restore_gfp_mask();
229 resume_console(); 230 resume_console();
230 Close: 231 Close:
231 if (suspend_ops->end) 232 if (suspend_ops->end)
232 suspend_ops->end(); 233 suspend_ops->end();
234 trace_machine_suspend(PWR_EVENT_EXIT);
233 return error; 235 return error;
234 236
235 Recover_platform: 237 Recover_platform:
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index a0e4a86ccf94..7c97c3a0eee3 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -6,6 +6,7 @@
6 * 6 *
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> 7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
9 * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com>
9 * 10 *
10 * This file is released under the GPLv2. 11 * This file is released under the GPLv2.
11 * 12 *
@@ -29,7 +30,7 @@
29 30
30#include "power.h" 31#include "power.h"
31 32
32#define HIBERNATE_SIG "LINHIB0001" 33#define HIBERNATE_SIG "S1SUSPEND"
33 34
34/* 35/*
35 * The swap map is a data structure used for keeping track of each page 36 * The swap map is a data structure used for keeping track of each page
@@ -223,7 +224,7 @@ static int swsusp_swap_check(void)
223 return res; 224 return res;
224 225
225 root_swap = res; 226 root_swap = res;
226 res = blkdev_get(hib_resume_bdev, FMODE_WRITE); 227 res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL);
227 if (res) 228 if (res)
228 return res; 229 return res;
229 230
@@ -753,30 +754,43 @@ static int load_image_lzo(struct swap_map_handle *handle,
753{ 754{
754 unsigned int m; 755 unsigned int m;
755 int error = 0; 756 int error = 0;
757 struct bio *bio;
756 struct timeval start; 758 struct timeval start;
757 struct timeval stop; 759 struct timeval stop;
758 unsigned nr_pages; 760 unsigned nr_pages;
759 size_t off, unc_len, cmp_len; 761 size_t i, off, unc_len, cmp_len;
760 unsigned char *unc, *cmp, *page; 762 unsigned char *unc, *cmp, *page[LZO_CMP_PAGES];
761 763
762 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 764 for (i = 0; i < LZO_CMP_PAGES; i++) {
763 if (!page) { 765 page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
764 printk(KERN_ERR "PM: Failed to allocate LZO page\n"); 766 if (!page[i]) {
765 return -ENOMEM; 767 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
768
769 while (i)
770 free_page((unsigned long)page[--i]);
771
772 return -ENOMEM;
773 }
766 } 774 }
767 775
768 unc = vmalloc(LZO_UNC_SIZE); 776 unc = vmalloc(LZO_UNC_SIZE);
769 if (!unc) { 777 if (!unc) {
770 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); 778 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n");
771 free_page((unsigned long)page); 779
780 for (i = 0; i < LZO_CMP_PAGES; i++)
781 free_page((unsigned long)page[i]);
782
772 return -ENOMEM; 783 return -ENOMEM;
773 } 784 }
774 785
775 cmp = vmalloc(LZO_CMP_SIZE); 786 cmp = vmalloc(LZO_CMP_SIZE);
776 if (!cmp) { 787 if (!cmp) {
777 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); 788 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n");
789
778 vfree(unc); 790 vfree(unc);
779 free_page((unsigned long)page); 791 for (i = 0; i < LZO_CMP_PAGES; i++)
792 free_page((unsigned long)page[i]);
793
780 return -ENOMEM; 794 return -ENOMEM;
781 } 795 }
782 796
@@ -787,6 +801,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
787 if (!m) 801 if (!m)
788 m = 1; 802 m = 1;
789 nr_pages = 0; 803 nr_pages = 0;
804 bio = NULL;
790 do_gettimeofday(&start); 805 do_gettimeofday(&start);
791 806
792 error = snapshot_write_next(snapshot); 807 error = snapshot_write_next(snapshot);
@@ -794,11 +809,11 @@ static int load_image_lzo(struct swap_map_handle *handle,
794 goto out_finish; 809 goto out_finish;
795 810
796 for (;;) { 811 for (;;) {
797 error = swap_read_page(handle, page, NULL); /* sync */ 812 error = swap_read_page(handle, page[0], NULL); /* sync */
798 if (error) 813 if (error)
799 break; 814 break;
800 815
801 cmp_len = *(size_t *)page; 816 cmp_len = *(size_t *)page[0];
802 if (unlikely(!cmp_len || 817 if (unlikely(!cmp_len ||
803 cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { 818 cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) {
804 printk(KERN_ERR "PM: Invalid LZO compressed length\n"); 819 printk(KERN_ERR "PM: Invalid LZO compressed length\n");
@@ -806,13 +821,20 @@ static int load_image_lzo(struct swap_map_handle *handle,
806 break; 821 break;
807 } 822 }
808 823
809 memcpy(cmp, page, PAGE_SIZE); 824 for (off = PAGE_SIZE, i = 1;
810 for (off = PAGE_SIZE; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) { 825 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
811 error = swap_read_page(handle, page, NULL); /* sync */ 826 error = swap_read_page(handle, page[i], &bio);
812 if (error) 827 if (error)
813 goto out_finish; 828 goto out_finish;
829 }
814 830
815 memcpy(cmp + off, page, PAGE_SIZE); 831 error = hib_wait_on_bio_chain(&bio); /* need all data now */
832 if (error)
833 goto out_finish;
834
835 for (off = 0, i = 0;
836 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
837 memcpy(cmp + off, page[i], PAGE_SIZE);
816 } 838 }
817 839
818 unc_len = LZO_UNC_SIZE; 840 unc_len = LZO_UNC_SIZE;
@@ -857,7 +879,8 @@ out_finish:
857 879
858 vfree(cmp); 880 vfree(cmp);
859 vfree(unc); 881 vfree(unc);
860 free_page((unsigned long)page); 882 for (i = 0; i < LZO_CMP_PAGES; i++)
883 free_page((unsigned long)page[i]);
861 884
862 return error; 885 return error;
863} 886}
@@ -865,7 +888,7 @@ out_finish:
865/** 888/**
866 * swsusp_read - read the hibernation image. 889 * swsusp_read - read the hibernation image.
867 * @flags_p: flags passed by the "frozen" kernel in the image header should 890 * @flags_p: flags passed by the "frozen" kernel in the image header should
868 * be written into this memeory location 891 * be written into this memory location
869 */ 892 */
870 893
871int swsusp_read(unsigned int *flags_p) 894int swsusp_read(unsigned int *flags_p)
@@ -907,7 +930,8 @@ int swsusp_check(void)
907{ 930{
908 int error; 931 int error;
909 932
910 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 933 hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device,
934 FMODE_READ, NULL);
911 if (!IS_ERR(hib_resume_bdev)) { 935 if (!IS_ERR(hib_resume_bdev)) {
912 set_blocksize(hib_resume_bdev, PAGE_SIZE); 936 set_blocksize(hib_resume_bdev, PAGE_SIZE);
913 clear_page(swsusp_header); 937 clear_page(swsusp_header);
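load_image_lzo() previously read the compressed stream one synchronous page at a time through a single bounce page; the version above allocates an array of LZO_CMP_PAGES pages, issues the reads for a whole compressed block asynchronously by chaining them on one struct bio pointer, and waits once with hib_wait_on_bio_chain() before decompressing. The allocation side relies on the usual partial-unwind idiom; here is a stand-alone sketch of just that idiom, with an illustrative page count and helper names.

#include <linux/gfp.h>
#include <linux/errno.h>
#include <linux/types.h>

#define NR_BOUNCE_PAGES 32      /* illustrative; the patch uses LZO_CMP_PAGES */

static unsigned long bounce[NR_BOUNCE_PAGES];

static int alloc_bounce_pages(void)
{
        size_t i;

        for (i = 0; i < NR_BOUNCE_PAGES; i++) {
                bounce[i] = __get_free_page(__GFP_WAIT | __GFP_HIGH);
                if (!bounce[i])
                        goto unwind;
        }
        return 0;

unwind:
        /* Free only what was successfully allocated, in reverse order. */
        while (i)
                free_page(bounce[--i]);
        return -ENOMEM;
}

static void free_bounce_pages(void)
{
        size_t i;

        for (i = 0; i < NR_BOUNCE_PAGES; i++)
                free_page(bounce[i]);
}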
diff --git a/kernel/power/user.c b/kernel/power/user.c
index e819e17877ca..c36c3b9e8a84 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -137,7 +137,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
137 free_all_swap_pages(data->swap); 137 free_all_swap_pages(data->swap);
138 if (data->frozen) 138 if (data->frozen)
139 thaw_processes(); 139 thaw_processes();
140 pm_notifier_call_chain(data->mode == O_WRONLY ? 140 pm_notifier_call_chain(data->mode == O_RDONLY ?
141 PM_POST_HIBERNATION : PM_POST_RESTORE); 141 PM_POST_HIBERNATION : PM_POST_RESTORE);
142 atomic_inc(&snapshot_device_available); 142 atomic_inc(&snapshot_device_available);
143 143
@@ -263,6 +263,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
263 case SNAPSHOT_UNFREEZE: 263 case SNAPSHOT_UNFREEZE:
264 if (!data->frozen || data->ready) 264 if (!data->frozen || data->ready)
265 break; 265 break;
266 pm_restore_gfp_mask();
266 thaw_processes(); 267 thaw_processes();
267 usermodehelper_enable(); 268 usermodehelper_enable();
268 data->frozen = 0; 269 data->frozen = 0;
@@ -275,6 +276,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
275 error = -EPERM; 276 error = -EPERM;
276 break; 277 break;
277 } 278 }
279 pm_restore_gfp_mask();
278 error = hibernation_snapshot(data->platform_support); 280 error = hibernation_snapshot(data->platform_support);
279 if (!error) 281 if (!error)
280 error = put_user(in_suspend, (int __user *)arg); 282 error = put_user(in_suspend, (int __user *)arg);
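The snapshot_release() fix matters because the snapshot device is opened read-only by the tool that saves a hibernation image and write-only by the one that feeds an image back for restore, so keying on O_RDONLY makes readers report PM_POST_HIBERNATION and writers PM_POST_RESTORE as intended. Drivers observe these events through the PM notifier chain; the following is a minimal sketch of a consumer, with illustrative names.

#include <linux/suspend.h>
#include <linux/notifier.h>
#include <linux/kernel.h>

static int example_pm_notify(struct notifier_block *nb,
                             unsigned long event, void *unused)
{
        switch (event) {
        case PM_POST_HIBERNATION:       /* image saved, machine kept running */
                pr_info("hibernation image written\n");
                break;
        case PM_POST_RESTORE:           /* restore attempt finished */
                pr_info("restore path completed\n");
                break;
        }
        return NOTIFY_DONE;
}

static struct notifier_block example_pm_nb = {
        .notifier_call = example_pm_notify,
};

/* register_pm_notifier(&example_pm_nb) from module init,
 * unregister_pm_notifier(&example_pm_nb) from module exit. */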
diff --git a/kernel/printk.c b/kernel/printk.c
index b2ebaee8c377..53d9a9ec88e6 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -39,16 +39,11 @@
39#include <linux/syslog.h> 39#include <linux/syslog.h>
40#include <linux/cpu.h> 40#include <linux/cpu.h>
41#include <linux/notifier.h> 41#include <linux/notifier.h>
42#include <linux/rculist.h>
42 43
43#include <asm/uaccess.h> 44#include <asm/uaccess.h>
44 45
45/* 46/*
46 * for_each_console() allows you to iterate on each console
47 */
48#define for_each_console(con) \
49 for (con = console_drivers; con != NULL; con = con->next)
50
51/*
52 * Architectures can override it: 47 * Architectures can override it:
53 */ 48 */
54void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) 49void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
@@ -261,6 +256,12 @@ static inline void boot_delay_msec(void)
261} 256}
262#endif 257#endif
263 258
259#ifdef CONFIG_SECURITY_DMESG_RESTRICT
260int dmesg_restrict = 1;
261#else
262int dmesg_restrict;
263#endif
264
264int do_syslog(int type, char __user *buf, int len, bool from_file) 265int do_syslog(int type, char __user *buf, int len, bool from_file)
265{ 266{
266 unsigned i, j, limit, count; 267 unsigned i, j, limit, count;
@@ -268,7 +269,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
268 char c; 269 char c;
269 int error = 0; 270 int error = 0;
270 271
271 error = security_syslog(type, from_file); 272 /*
273 * If this is from /proc/kmsg we only do the capabilities checks
274 * at open time.
275 */
276 if (type == SYSLOG_ACTION_OPEN || !from_file) {
277 if (dmesg_restrict && !capable(CAP_SYSLOG))
278 goto warn; /* switch to return -EPERM after 2.6.39 */
279 if ((type != SYSLOG_ACTION_READ_ALL &&
280 type != SYSLOG_ACTION_SIZE_BUFFER) &&
281 !capable(CAP_SYSLOG))
282 goto warn; /* switch to return -EPERM after 2.6.39 */
283 }
284
285 error = security_syslog(type);
272 if (error) 286 if (error)
273 return error; 287 return error;
274 288
@@ -409,6 +423,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
409 } 423 }
410out: 424out:
411 return error; 425 return error;
426warn:
427 /* remove after 2.6.39 */
428 if (capable(CAP_SYS_ADMIN))
429 WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN "
430 "but no CAP_SYSLOG (deprecated and denied).\n");
431 return -EPERM;
412} 432}
413 433
414SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) 434SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
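The do_syslog() changes add the dmesg_restrict knob (its default set by CONFIG_SECURITY_DMESG_RESTRICT) and move the privilege test to the new CAP_SYSLOG capability, warning once during the transition when a caller only holds CAP_SYS_ADMIN. Read-only actions (READ_ALL, SIZE_BUFFER) remain unprivileged unless dmesg_restrict is set, and /proc/kmsg readers are only checked when the file is opened. A sketch of the same policy pulled out into a helper follows; the helper is hypothetical, not the kernel's code.

#include <linux/capability.h>
#include <linux/syslog.h>
#include <linux/errno.h>
#include <linux/types.h>

extern int dmesg_restrict;      /* the flag defined in the hunk above */

static int example_syslog_permission(int type, bool from_file)
{
        /* /proc/kmsg reads were already checked when the file was opened. */
        if (type != SYSLOG_ACTION_OPEN && from_file)
                return 0;

        if (dmesg_restrict && !capable(CAP_SYSLOG))
                return -EPERM;

        /* Anything beyond plain reading of the buffer needs CAP_SYSLOG. */
        if (type != SYSLOG_ACTION_READ_ALL &&
            type != SYSLOG_ACTION_SIZE_BUFFER &&
            !capable(CAP_SYSLOG))
                return -EPERM;

        return 0;
}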
@@ -1055,21 +1075,23 @@ static DEFINE_PER_CPU(int, printk_pending);
1055 1075
1056void printk_tick(void) 1076void printk_tick(void)
1057{ 1077{
1058 if (__get_cpu_var(printk_pending)) { 1078 if (__this_cpu_read(printk_pending)) {
1059 __get_cpu_var(printk_pending) = 0; 1079 __this_cpu_write(printk_pending, 0);
1060 wake_up_interruptible(&log_wait); 1080 wake_up_interruptible(&log_wait);
1061 } 1081 }
1062} 1082}
1063 1083
1064int printk_needs_cpu(int cpu) 1084int printk_needs_cpu(int cpu)
1065{ 1085{
1066 return per_cpu(printk_pending, cpu); 1086 if (cpu_is_offline(cpu))
1087 printk_tick();
1088 return __this_cpu_read(printk_pending);
1067} 1089}
1068 1090
1069void wake_up_klogd(void) 1091void wake_up_klogd(void)
1070{ 1092{
1071 if (waitqueue_active(&log_wait)) 1093 if (waitqueue_active(&log_wait))
1072 __raw_get_cpu_var(printk_pending) = 1; 1094 this_cpu_write(printk_pending, 1);
1073} 1095}
1074 1096
1075/** 1097/**
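printk_tick() and wake_up_klogd() move from the address-based __get_cpu_var()/__raw_get_cpu_var() accessors to the __this_cpu_read()/__this_cpu_write()/this_cpu_write() operations, which architectures can implement as a single instruction without first computing the per-CPU address; printk_needs_cpu() additionally drains a pending wakeup when asked about a CPU that is going offline. A minimal sketch of the accessor pattern, with illustrative names:

#include <linux/percpu.h>
#include <linux/types.h>

static DEFINE_PER_CPU(int, example_pending);

/* Caller is expected to be pinned to its CPU (irq context or preempt off). */
static void example_mark_pending(void)
{
        this_cpu_write(example_pending, 1);
}

static bool example_test_and_clear_pending(void)
{
        if (__this_cpu_read(example_pending)) {
                __this_cpu_write(example_pending, 0);
                return true;
        }
        return false;
}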
@@ -1338,6 +1360,7 @@ void register_console(struct console *newcon)
1338 spin_unlock_irqrestore(&logbuf_lock, flags); 1360 spin_unlock_irqrestore(&logbuf_lock, flags);
1339 } 1361 }
1340 release_console_sem(); 1362 release_console_sem();
1363 console_sysfs_notify();
1341 1364
1342 /* 1365 /*
1343 * By unregistering the bootconsoles after we enable the real console 1366 * By unregistering the bootconsoles after we enable the real console
@@ -1396,6 +1419,7 @@ int unregister_console(struct console *console)
1396 console_drivers->flags |= CON_CONSDEV; 1419 console_drivers->flags |= CON_CONSDEV;
1397 1420
1398 release_console_sem(); 1421 release_console_sem();
1422 console_sysfs_notify();
1399 return res; 1423 return res;
1400} 1424}
1401EXPORT_SYMBOL(unregister_console); 1425EXPORT_SYMBOL(unregister_console);
@@ -1479,7 +1503,7 @@ int kmsg_dump_register(struct kmsg_dumper *dumper)
1479 /* Don't allow registering multiple times */ 1503 /* Don't allow registering multiple times */
1480 if (!dumper->registered) { 1504 if (!dumper->registered) {
1481 dumper->registered = 1; 1505 dumper->registered = 1;
1482 list_add_tail(&dumper->list, &dump_list); 1506 list_add_tail_rcu(&dumper->list, &dump_list);
1483 err = 0; 1507 err = 0;
1484 } 1508 }
1485 spin_unlock_irqrestore(&dump_list_lock, flags); 1509 spin_unlock_irqrestore(&dump_list_lock, flags);
@@ -1503,29 +1527,16 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)
1503 spin_lock_irqsave(&dump_list_lock, flags); 1527 spin_lock_irqsave(&dump_list_lock, flags);
1504 if (dumper->registered) { 1528 if (dumper->registered) {
1505 dumper->registered = 0; 1529 dumper->registered = 0;
1506 list_del(&dumper->list); 1530 list_del_rcu(&dumper->list);
1507 err = 0; 1531 err = 0;
1508 } 1532 }
1509 spin_unlock_irqrestore(&dump_list_lock, flags); 1533 spin_unlock_irqrestore(&dump_list_lock, flags);
1534 synchronize_rcu();
1510 1535
1511 return err; 1536 return err;
1512} 1537}
1513EXPORT_SYMBOL_GPL(kmsg_dump_unregister); 1538EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1514 1539
1515static const char * const kmsg_reasons[] = {
1516 [KMSG_DUMP_OOPS] = "oops",
1517 [KMSG_DUMP_PANIC] = "panic",
1518 [KMSG_DUMP_KEXEC] = "kexec",
1519};
1520
1521static const char *kmsg_to_str(enum kmsg_dump_reason reason)
1522{
1523 if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0)
1524 return "unknown";
1525
1526 return kmsg_reasons[reason];
1527}
1528
1529/** 1540/**
1530 * kmsg_dump - dump kernel log to kernel message dumpers. 1541 * kmsg_dump - dump kernel log to kernel message dumpers.
1531 * @reason: the reason (oops, panic etc) for dumping 1542 * @reason: the reason (oops, panic etc) for dumping
@@ -1564,13 +1575,9 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1564 l2 = chars; 1575 l2 = chars;
1565 } 1576 }
1566 1577
1567 if (!spin_trylock_irqsave(&dump_list_lock, flags)) { 1578 rcu_read_lock();
1568 printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n", 1579 list_for_each_entry_rcu(dumper, &dump_list, list)
1569 kmsg_to_str(reason));
1570 return;
1571 }
1572 list_for_each_entry(dumper, &dump_list, list)
1573 dumper->dump(dumper, reason, s1, l1, s2, l2); 1580 dumper->dump(dumper, reason, s1, l1, s2, l2);
1574 spin_unlock_irqrestore(&dump_list_lock, flags); 1581 rcu_read_unlock();
1575} 1582}
1576#endif 1583#endif
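The kmsg_dump changes convert the dumper list to RCU so kmsg_dump() can walk it from oops and panic context without trying, and possibly failing, to take dump_list_lock: writers still serialize on the spinlock and use list_add_tail_rcu()/list_del_rcu(), followed by synchronize_rcu() before the dumper may be reused, while the reader runs under rcu_read_lock(). Here is a condensed sketch of that pattern on a hypothetical callback list, assuming the writers run in process context.

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct example_hook {
        struct list_head list;
        void (*fn)(void);
};

static LIST_HEAD(example_hooks);
static DEFINE_SPINLOCK(example_hooks_lock);

static void example_hook_add(struct example_hook *h)
{
        spin_lock(&example_hooks_lock);
        list_add_tail_rcu(&h->list, &example_hooks);
        spin_unlock(&example_hooks_lock);
}

static void example_hook_del(struct example_hook *h)
{
        spin_lock(&example_hooks_lock);
        list_del_rcu(&h->list);
        spin_unlock(&example_hooks_lock);
        synchronize_rcu();      /* no reader can still see 'h' after this */
}

static void example_run_hooks(void)
{
        struct example_hook *h;

        rcu_read_lock();
        list_for_each_entry_rcu(h, &example_hooks, list)
                h->fn();
        rcu_read_unlock();
}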
diff --git a/kernel/range.c b/kernel/range.c
index 471b66acabb5..37fa9b99ad58 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -119,7 +119,7 @@ static int cmp_range(const void *x1, const void *x2)
119 119
120int clean_sort_range(struct range *range, int az) 120int clean_sort_range(struct range *range, int az)
121{ 121{
122 int i, j, k = az - 1, nr_range = 0; 122 int i, j, k = az - 1, nr_range = az;
123 123
124 for (i = 0; i < k; i++) { 124 for (i = 0; i < k; i++) {
125 if (range[i].end) 125 if (range[i].end)
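The one-line change to clean_sort_range() matters because the function derives its return value by scanning for the first empty slot after compaction; when the array is completely full there is no empty slot, the scan never assigns nr_range, and an initial value of 0 made a full array look empty. Initializing to az makes the no-hole case return the full length. A stand-alone user-space illustration of that counting step, with a simplified struct:

#include <stdio.h>

struct range { unsigned long long start, end; };        /* simplified */

/* Count entries up to the first hole; 'init' mimics nr_range's initial value. */
static int count_used(struct range *r, int az, int init)
{
        int i, nr = init;

        for (i = 0; i < az; i++) {
                if (!r[i].end) {        /* empty slot ends the used region */
                        nr = i;
                        break;
                }
        }
        return nr;
}

int main(void)
{
        struct range full[2] = { { 0, 4096 }, { 8192, 16384 } };

        printf("old init (0):  %d entries\n", count_used(full, 2, 0));  /* 0: wrong */
        printf("new init (az): %d entries\n", count_used(full, 2, 2));  /* 2: right */
        return 0;
}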
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index d806735342ac..0c343b9a46d5 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -36,31 +36,16 @@
36#include <linux/time.h> 36#include <linux/time.h>
37#include <linux/cpu.h> 37#include <linux/cpu.h>
38 38
39/* Global control variables for rcupdate callback mechanism. */ 39/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
40struct rcu_ctrlblk { 40static struct task_struct *rcu_kthread_task;
41 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ 41static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
42 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ 42static unsigned long have_rcu_kthread_work;
43 struct rcu_head **curtail; /* ->next pointer of last CB. */ 43static void invoke_rcu_kthread(void);
44};
45
46/* Definition for rcupdate control block. */
47static struct rcu_ctrlblk rcu_sched_ctrlblk = {
48 .donetail = &rcu_sched_ctrlblk.rcucblist,
49 .curtail = &rcu_sched_ctrlblk.rcucblist,
50};
51
52static struct rcu_ctrlblk rcu_bh_ctrlblk = {
53 .donetail = &rcu_bh_ctrlblk.rcucblist,
54 .curtail = &rcu_bh_ctrlblk.rcucblist,
55};
56
57#ifdef CONFIG_DEBUG_LOCK_ALLOC
58int rcu_scheduler_active __read_mostly;
59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
61 44
62/* Forward declarations for rcutiny_plugin.h. */ 45/* Forward declarations for rcutiny_plugin.h. */
63static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); 46struct rcu_ctrlblk;
47static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
48static int rcu_kthread(void *arg);
64static void __call_rcu(struct rcu_head *head, 49static void __call_rcu(struct rcu_head *head,
65 void (*func)(struct rcu_head *rcu), 50 void (*func)(struct rcu_head *rcu),
66 struct rcu_ctrlblk *rcp); 51 struct rcu_ctrlblk *rcp);
@@ -123,7 +108,7 @@ void rcu_sched_qs(int cpu)
123{ 108{
124 if (rcu_qsctr_help(&rcu_sched_ctrlblk) + 109 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
125 rcu_qsctr_help(&rcu_bh_ctrlblk)) 110 rcu_qsctr_help(&rcu_bh_ctrlblk))
126 raise_softirq(RCU_SOFTIRQ); 111 invoke_rcu_kthread();
127} 112}
128 113
129/* 114/*
@@ -132,7 +117,7 @@ void rcu_sched_qs(int cpu)
132void rcu_bh_qs(int cpu) 117void rcu_bh_qs(int cpu)
133{ 118{
134 if (rcu_qsctr_help(&rcu_bh_ctrlblk)) 119 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
135 raise_softirq(RCU_SOFTIRQ); 120 invoke_rcu_kthread();
136} 121}
137 122
138/* 123/*
@@ -152,13 +137,14 @@ void rcu_check_callbacks(int cpu, int user)
152} 137}
153 138
154/* 139/*
 155 * Helper function for rcu_process_callbacks() that operates on the 140 * Invoke the RCU callbacks on the specified rcu_ctrlblk structure
156 * specified rcu_ctrlkblk structure. 141 * whose grace period has elapsed.
157 */ 142 */
158static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) 143static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
159{ 144{
160 struct rcu_head *next, *list; 145 struct rcu_head *next, *list;
161 unsigned long flags; 146 unsigned long flags;
147 RCU_TRACE(int cb_count = 0);
162 148
163 /* If no RCU callbacks ready to invoke, just return. */ 149 /* If no RCU callbacks ready to invoke, just return. */
164 if (&rcp->rcucblist == rcp->donetail) 150 if (&rcp->rcucblist == rcp->donetail)
@@ -180,19 +166,59 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
180 next = list->next; 166 next = list->next;
181 prefetch(next); 167 prefetch(next);
182 debug_rcu_head_unqueue(list); 168 debug_rcu_head_unqueue(list);
169 local_bh_disable();
183 list->func(list); 170 list->func(list);
171 local_bh_enable();
184 list = next; 172 list = next;
173 RCU_TRACE(cb_count++);
185 } 174 }
175 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
186} 176}
187 177
188/* 178/*
189 * Invoke any callbacks whose grace period has completed. 179 * This kthread invokes RCU callbacks whose grace periods have
180 * elapsed. It is awakened as needed, and takes the place of the
181 * RCU_SOFTIRQ that was used previously for this purpose.
182 * This is a kthread, but it is never stopped, at least not until
183 * the system goes down.
190 */ 184 */
191static void rcu_process_callbacks(struct softirq_action *unused) 185static int rcu_kthread(void *arg)
192{ 186{
193 __rcu_process_callbacks(&rcu_sched_ctrlblk); 187 unsigned long work;
194 __rcu_process_callbacks(&rcu_bh_ctrlblk); 188 unsigned long morework;
195 rcu_preempt_process_callbacks(); 189 unsigned long flags;
190
191 for (;;) {
192 wait_event_interruptible(rcu_kthread_wq,
193 have_rcu_kthread_work != 0);
194 morework = rcu_boost();
195 local_irq_save(flags);
196 work = have_rcu_kthread_work;
197 have_rcu_kthread_work = morework;
198 local_irq_restore(flags);
199 if (work) {
200 rcu_process_callbacks(&rcu_sched_ctrlblk);
201 rcu_process_callbacks(&rcu_bh_ctrlblk);
202 rcu_preempt_process_callbacks();
203 }
204 schedule_timeout_interruptible(1); /* Leave CPU for others. */
205 }
206
207 return 0; /* Not reached, but needed to shut gcc up. */
208}
209
210/*
211 * Wake up rcu_kthread() to process callbacks now eligible for invocation
212 * or to boost readers.
213 */
214static void invoke_rcu_kthread(void)
215{
216 unsigned long flags;
217
218 local_irq_save(flags);
219 have_rcu_kthread_work = 1;
220 wake_up(&rcu_kthread_wq);
221 local_irq_restore(flags);
196} 222}
197 223
198/* 224/*
@@ -230,6 +256,7 @@ static void __call_rcu(struct rcu_head *head,
230 local_irq_save(flags); 256 local_irq_save(flags);
231 *rcp->curtail = head; 257 *rcp->curtail = head;
232 rcp->curtail = &head->next; 258 rcp->curtail = &head->next;
259 RCU_TRACE(rcp->qlen++);
233 local_irq_restore(flags); 260 local_irq_restore(flags);
234} 261}
235 262
@@ -282,7 +309,16 @@ void rcu_barrier_sched(void)
282} 309}
283EXPORT_SYMBOL_GPL(rcu_barrier_sched); 310EXPORT_SYMBOL_GPL(rcu_barrier_sched);
284 311
285void __init rcu_init(void) 312/*
313 * Spawn the kthread that invokes RCU callbacks.
314 */
315static int __init rcu_spawn_kthreads(void)
286{ 316{
287 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 317 struct sched_param sp;
318
319 rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
320 sp.sched_priority = RCU_BOOST_PRIO;
321 sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
322 return 0;
288} 323}
324early_initcall(rcu_spawn_kthreads);
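The rcutiny.c rework retires RCU_SOFTIRQ for the TINY_RCU flavors and moves callback invocation into a dedicated kthread: invoke_rcu_kthread() sets a work flag with interrupts disabled and wakes a waitqueue, while rcu_kthread() sleeps on that flag, invokes the callback lists with bottom halves disabled around each callback, and yields for a jiffy between passes; rcu_spawn_kthreads() starts it at SCHED_FIFO priority from an early_initcall. A stripped-down sketch of the wake-flag-and-kthread pattern, with illustrative names:

#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/irqflags.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static unsigned long example_have_work;

/* Callable from irq context: record work and wake the worker. */
static void example_kick(void)
{
        unsigned long flags;

        local_irq_save(flags);
        example_have_work = 1;
        wake_up(&example_wq);
        local_irq_restore(flags);
}

static int example_worker(void *unused)
{
        for (;;) {
                wait_event_interruptible(example_wq, example_have_work != 0);
                local_irq_disable();
                example_have_work = 0;
                local_irq_enable();
                /* ... process the queued work here ... */
                schedule_timeout_interruptible(1);      /* leave the CPU for others */
        }
        return 0;
}

/* Spawn with: kthread_run(example_worker, NULL, "example_worker"); */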
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 6ceca4f745ff..015abaea962a 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -22,6 +22,40 @@
22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> 22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */ 23 */
24 24
25#include <linux/kthread.h>
26#include <linux/debugfs.h>
27#include <linux/seq_file.h>
28
29#ifdef CONFIG_RCU_TRACE
30#define RCU_TRACE(stmt) stmt
31#else /* #ifdef CONFIG_RCU_TRACE */
32#define RCU_TRACE(stmt)
33#endif /* #else #ifdef CONFIG_RCU_TRACE */
34
35/* Global control variables for rcupdate callback mechanism. */
36struct rcu_ctrlblk {
37 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
38 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
39 struct rcu_head **curtail; /* ->next pointer of last CB. */
40 RCU_TRACE(long qlen); /* Number of pending CBs. */
41};
42
43/* Definition for rcupdate control block. */
44static struct rcu_ctrlblk rcu_sched_ctrlblk = {
45 .donetail = &rcu_sched_ctrlblk.rcucblist,
46 .curtail = &rcu_sched_ctrlblk.rcucblist,
47};
48
49static struct rcu_ctrlblk rcu_bh_ctrlblk = {
50 .donetail = &rcu_bh_ctrlblk.rcucblist,
51 .curtail = &rcu_bh_ctrlblk.rcucblist,
52};
53
54#ifdef CONFIG_DEBUG_LOCK_ALLOC
55int rcu_scheduler_active __read_mostly;
56EXPORT_SYMBOL_GPL(rcu_scheduler_active);
57#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
58
25#ifdef CONFIG_TINY_PREEMPT_RCU 59#ifdef CONFIG_TINY_PREEMPT_RCU
26 60
27#include <linux/delay.h> 61#include <linux/delay.h>
@@ -46,17 +80,45 @@ struct rcu_preempt_ctrlblk {
46 struct list_head *gp_tasks; 80 struct list_head *gp_tasks;
47 /* Pointer to the first task blocking the */ 81 /* Pointer to the first task blocking the */
48 /* current grace period, or NULL if there */ 82 /* current grace period, or NULL if there */
49 /* is not such task. */ 83 /* is no such task. */
50 struct list_head *exp_tasks; 84 struct list_head *exp_tasks;
51 /* Pointer to first task blocking the */ 85 /* Pointer to first task blocking the */
52 /* current expedited grace period, or NULL */ 86 /* current expedited grace period, or NULL */
53 /* if there is no such task. If there */ 87 /* if there is no such task. If there */
54 /* is no current expedited grace period, */ 88 /* is no current expedited grace period, */
55 /* then there cannot be any such task. */ 89 /* then there cannot be any such task. */
90#ifdef CONFIG_RCU_BOOST
91 struct list_head *boost_tasks;
92 /* Pointer to first task that needs to be */
93 /* priority-boosted, or NULL if no priority */
94 /* boosting is needed. If there is no */
95 /* current or expedited grace period, there */
96 /* can be no such task. */
97#endif /* #ifdef CONFIG_RCU_BOOST */
56 u8 gpnum; /* Current grace period. */ 98 u8 gpnum; /* Current grace period. */
57 u8 gpcpu; /* Last grace period blocked by the CPU. */ 99 u8 gpcpu; /* Last grace period blocked by the CPU. */
58 u8 completed; /* Last grace period completed. */ 100 u8 completed; /* Last grace period completed. */
59 /* If all three are equal, RCU is idle. */ 101 /* If all three are equal, RCU is idle. */
102#ifdef CONFIG_RCU_BOOST
103 s8 boosted_this_gp; /* Has boosting already happened? */
104 unsigned long boost_time; /* When to start boosting (jiffies) */
105#endif /* #ifdef CONFIG_RCU_BOOST */
106#ifdef CONFIG_RCU_TRACE
107 unsigned long n_grace_periods;
108#ifdef CONFIG_RCU_BOOST
109 unsigned long n_tasks_boosted;
110 unsigned long n_exp_boosts;
111 unsigned long n_normal_boosts;
112 unsigned long n_normal_balk_blkd_tasks;
113 unsigned long n_normal_balk_gp_tasks;
114 unsigned long n_normal_balk_boost_tasks;
115 unsigned long n_normal_balk_boosted;
116 unsigned long n_normal_balk_notyet;
117 unsigned long n_normal_balk_nos;
118 unsigned long n_exp_balk_blkd_tasks;
119 unsigned long n_exp_balk_nos;
120#endif /* #ifdef CONFIG_RCU_BOOST */
121#endif /* #ifdef CONFIG_RCU_TRACE */
60}; 122};
61 123
62static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { 124static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
@@ -122,6 +184,210 @@ static int rcu_preempt_gp_in_progress(void)
122} 184}
123 185
124/* 186/*
 187 * Advance a ->blkd_tasks-list pointer to the next entry, returning
 188 * NULL instead if at the end of the list.
189 */
190static struct list_head *rcu_next_node_entry(struct task_struct *t)
191{
192 struct list_head *np;
193
194 np = t->rcu_node_entry.next;
195 if (np == &rcu_preempt_ctrlblk.blkd_tasks)
196 np = NULL;
197 return np;
198}
199
200#ifdef CONFIG_RCU_TRACE
201
202#ifdef CONFIG_RCU_BOOST
203static void rcu_initiate_boost_trace(void);
204static void rcu_initiate_exp_boost_trace(void);
205#endif /* #ifdef CONFIG_RCU_BOOST */
206
207/*
 208 * Dump additional statistics for TINY_PREEMPT_RCU.
209 */
210static void show_tiny_preempt_stats(struct seq_file *m)
211{
212 seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n",
213 rcu_preempt_ctrlblk.rcb.qlen,
214 rcu_preempt_ctrlblk.n_grace_periods,
215 rcu_preempt_ctrlblk.gpnum,
216 rcu_preempt_ctrlblk.gpcpu,
217 rcu_preempt_ctrlblk.completed,
218 "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)],
219 "N."[!rcu_preempt_ctrlblk.gp_tasks],
220 "E."[!rcu_preempt_ctrlblk.exp_tasks]);
221#ifdef CONFIG_RCU_BOOST
222 seq_printf(m, " ttb=%c btg=",
223 "B."[!rcu_preempt_ctrlblk.boost_tasks]);
224 switch (rcu_preempt_ctrlblk.boosted_this_gp) {
225 case -1:
226 seq_puts(m, "exp");
227 break;
228 case 0:
229 seq_puts(m, "no");
230 break;
231 case 1:
232 seq_puts(m, "begun");
233 break;
234 case 2:
235 seq_puts(m, "done");
236 break;
237 default:
238 seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp);
239 }
240 seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
241 rcu_preempt_ctrlblk.n_tasks_boosted,
242 rcu_preempt_ctrlblk.n_exp_boosts,
243 rcu_preempt_ctrlblk.n_normal_boosts,
244 (int)(jiffies & 0xffff),
245 (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
246 seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n",
247 "normal balk",
248 rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks,
249 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks,
250 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks,
251 rcu_preempt_ctrlblk.n_normal_balk_boosted,
252 rcu_preempt_ctrlblk.n_normal_balk_notyet,
253 rcu_preempt_ctrlblk.n_normal_balk_nos);
254 seq_printf(m, " exp balk: bt=%lu nos=%lu\n",
255 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks,
256 rcu_preempt_ctrlblk.n_exp_balk_nos);
257#endif /* #ifdef CONFIG_RCU_BOOST */
258}
259
260#endif /* #ifdef CONFIG_RCU_TRACE */
261
262#ifdef CONFIG_RCU_BOOST
263
264#include "rtmutex_common.h"
265
266/*
267 * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
268 * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
269 */
270static int rcu_boost(void)
271{
272 unsigned long flags;
273 struct rt_mutex mtx;
274 struct list_head *np;
275 struct task_struct *t;
276
277 if (rcu_preempt_ctrlblk.boost_tasks == NULL)
278 return 0; /* Nothing to boost. */
279 raw_local_irq_save(flags);
280 rcu_preempt_ctrlblk.boosted_this_gp++;
281 t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct,
282 rcu_node_entry);
283 np = rcu_next_node_entry(t);
284 rt_mutex_init_proxy_locked(&mtx, t);
285 t->rcu_boost_mutex = &mtx;
286 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
287 raw_local_irq_restore(flags);
288 rt_mutex_lock(&mtx);
289 RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
290 rcu_preempt_ctrlblk.boosted_this_gp++;
291 rt_mutex_unlock(&mtx);
292 return rcu_preempt_ctrlblk.boost_tasks != NULL;
293}
294
295/*
296 * Check to see if it is now time to start boosting RCU readers blocking
297 * the current grace period, and, if so, tell the rcu_kthread_task to
298 * start boosting them. If there is an expedited boost in progress,
299 * we wait for it to complete.
300 *
301 * If there are no blocked readers blocking the current grace period,
302 * return 0 to let the caller know, otherwise return 1. Note that this
303 * return value is independent of whether or not boosting was done.
304 */
305static int rcu_initiate_boost(void)
306{
307 if (!rcu_preempt_blocked_readers_cgp()) {
308 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++);
309 return 0;
310 }
311 if (rcu_preempt_ctrlblk.gp_tasks != NULL &&
312 rcu_preempt_ctrlblk.boost_tasks == NULL &&
313 rcu_preempt_ctrlblk.boosted_this_gp == 0 &&
314 ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) {
315 rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks;
316 invoke_rcu_kthread();
317 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
318 } else
319 RCU_TRACE(rcu_initiate_boost_trace());
320 return 1;
321}
322
323/*
324 * Initiate boosting for an expedited grace period.
325 */
326static void rcu_initiate_expedited_boost(void)
327{
328 unsigned long flags;
329
330 raw_local_irq_save(flags);
331 if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) {
332 rcu_preempt_ctrlblk.boost_tasks =
333 rcu_preempt_ctrlblk.blkd_tasks.next;
334 rcu_preempt_ctrlblk.boosted_this_gp = -1;
335 invoke_rcu_kthread();
336 RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
337 } else
338 RCU_TRACE(rcu_initiate_exp_boost_trace());
339 raw_local_irq_restore(flags);
340}
341
342#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000);
343
344/*
345 * Do priority-boost accounting for the start of a new grace period.
346 */
347static void rcu_preempt_boost_start_gp(void)
348{
349 rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
350 if (rcu_preempt_ctrlblk.boosted_this_gp > 0)
351 rcu_preempt_ctrlblk.boosted_this_gp = 0;
352}
353
354#else /* #ifdef CONFIG_RCU_BOOST */
355
356/*
357 * If there is no RCU priority boosting, we don't boost.
358 */
359static int rcu_boost(void)
360{
361 return 0;
362}
363
364/*
365 * If there is no RCU priority boosting, we don't initiate boosting,
366 * but we do indicate whether there are blocked readers blocking the
367 * current grace period.
368 */
369static int rcu_initiate_boost(void)
370{
371 return rcu_preempt_blocked_readers_cgp();
372}
373
374/*
375 * If there is no RCU priority boosting, we don't initiate expedited boosting.
376 */
377static void rcu_initiate_expedited_boost(void)
378{
379}
380
381/*
382 * If there is no RCU priority boosting, nothing to do at grace-period start.
383 */
384static void rcu_preempt_boost_start_gp(void)
385{
386}
387
388#endif /* else #ifdef CONFIG_RCU_BOOST */
389
390/*
125 * Record a preemptible-RCU quiescent state for the specified CPU. Note 391 * Record a preemptible-RCU quiescent state for the specified CPU. Note
126 * that this just means that the task currently running on the CPU is 392 * that this just means that the task currently running on the CPU is
127 * in a quiescent state. There might be any number of tasks blocked 393 * in a quiescent state. There might be any number of tasks blocked
@@ -148,11 +414,14 @@ static void rcu_preempt_cpu_qs(void)
148 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; 414 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
149 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 415 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
150 416
417 /* If there is no GP then there is nothing more to do. */
418 if (!rcu_preempt_gp_in_progress())
419 return;
151 /* 420 /*
152 * If there is no GP, or if blocked readers are still blocking GP, 421 * Check up on boosting. If there are no readers blocking the
153 * then there is nothing more to do. 422 * current grace period, leave.
154 */ 423 */
155 if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp()) 424 if (rcu_initiate_boost())
156 return; 425 return;
157 426
158 /* Advance callbacks. */ 427 /* Advance callbacks. */
@@ -164,9 +433,9 @@ static void rcu_preempt_cpu_qs(void)
164 if (!rcu_preempt_blocked_readers_any()) 433 if (!rcu_preempt_blocked_readers_any())
165 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; 434 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
166 435
167 /* If there are done callbacks, make RCU_SOFTIRQ process them. */ 436 /* If there are done callbacks, cause them to be invoked. */
168 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) 437 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
169 raise_softirq(RCU_SOFTIRQ); 438 invoke_rcu_kthread();
170} 439}
171 440
172/* 441/*
@@ -178,12 +447,16 @@ static void rcu_preempt_start_gp(void)
178 447
179 /* Official start of GP. */ 448 /* Official start of GP. */
180 rcu_preempt_ctrlblk.gpnum++; 449 rcu_preempt_ctrlblk.gpnum++;
450 RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
181 451
182 /* Any blocked RCU readers block new GP. */ 452 /* Any blocked RCU readers block new GP. */
183 if (rcu_preempt_blocked_readers_any()) 453 if (rcu_preempt_blocked_readers_any())
184 rcu_preempt_ctrlblk.gp_tasks = 454 rcu_preempt_ctrlblk.gp_tasks =
185 rcu_preempt_ctrlblk.blkd_tasks.next; 455 rcu_preempt_ctrlblk.blkd_tasks.next;
186 456
457 /* Set up for RCU priority boosting. */
458 rcu_preempt_boost_start_gp();
459
187 /* If there is no running reader, CPU is done with GP. */ 460 /* If there is no running reader, CPU is done with GP. */
188 if (!rcu_preempt_running_reader()) 461 if (!rcu_preempt_running_reader())
189 rcu_preempt_cpu_qs(); 462 rcu_preempt_cpu_qs();
@@ -304,14 +577,16 @@ static void rcu_read_unlock_special(struct task_struct *t)
304 */ 577 */
305 empty = !rcu_preempt_blocked_readers_cgp(); 578 empty = !rcu_preempt_blocked_readers_cgp();
306 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; 579 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
307 np = t->rcu_node_entry.next; 580 np = rcu_next_node_entry(t);
308 if (np == &rcu_preempt_ctrlblk.blkd_tasks)
309 np = NULL;
310 list_del(&t->rcu_node_entry); 581 list_del(&t->rcu_node_entry);
311 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) 582 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
312 rcu_preempt_ctrlblk.gp_tasks = np; 583 rcu_preempt_ctrlblk.gp_tasks = np;
313 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) 584 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
314 rcu_preempt_ctrlblk.exp_tasks = np; 585 rcu_preempt_ctrlblk.exp_tasks = np;
586#ifdef CONFIG_RCU_BOOST
587 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
588 rcu_preempt_ctrlblk.boost_tasks = np;
589#endif /* #ifdef CONFIG_RCU_BOOST */
315 INIT_LIST_HEAD(&t->rcu_node_entry); 590 INIT_LIST_HEAD(&t->rcu_node_entry);
316 591
317 /* 592 /*
@@ -331,6 +606,14 @@ static void rcu_read_unlock_special(struct task_struct *t)
331 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) 606 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
332 rcu_report_exp_done(); 607 rcu_report_exp_done();
333 } 608 }
609#ifdef CONFIG_RCU_BOOST
610 /* Unboost self if was boosted. */
611 if (special & RCU_READ_UNLOCK_BOOSTED) {
612 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
613 rt_mutex_unlock(t->rcu_boost_mutex);
614 t->rcu_boost_mutex = NULL;
615 }
616#endif /* #ifdef CONFIG_RCU_BOOST */
334 local_irq_restore(flags); 617 local_irq_restore(flags);
335} 618}
336 619
@@ -374,7 +657,7 @@ static void rcu_preempt_check_callbacks(void)
374 rcu_preempt_cpu_qs(); 657 rcu_preempt_cpu_qs();
375 if (&rcu_preempt_ctrlblk.rcb.rcucblist != 658 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
376 rcu_preempt_ctrlblk.rcb.donetail) 659 rcu_preempt_ctrlblk.rcb.donetail)
377 raise_softirq(RCU_SOFTIRQ); 660 invoke_rcu_kthread();
378 if (rcu_preempt_gp_in_progress() && 661 if (rcu_preempt_gp_in_progress() &&
379 rcu_cpu_blocking_cur_gp() && 662 rcu_cpu_blocking_cur_gp() &&
380 rcu_preempt_running_reader()) 663 rcu_preempt_running_reader())
@@ -383,7 +666,7 @@ static void rcu_preempt_check_callbacks(void)
383 666
384/* 667/*
385 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to 668 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
386 * update, so this is invoked from __rcu_process_callbacks() to 669 * update, so this is invoked from rcu_process_callbacks() to
387 * handle that case. Of course, it is invoked for all flavors of 670 * handle that case. Of course, it is invoked for all flavors of
388 * RCU, but RCU callbacks can appear only on one of the lists, and 671 * RCU, but RCU callbacks can appear only on one of the lists, and
389 * neither ->nexttail nor ->donetail can possibly be NULL, so there 672 * neither ->nexttail nor ->donetail can possibly be NULL, so there
@@ -400,7 +683,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
400 */ 683 */
401static void rcu_preempt_process_callbacks(void) 684static void rcu_preempt_process_callbacks(void)
402{ 685{
403 __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); 686 rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
404} 687}
405 688
406/* 689/*
@@ -417,6 +700,7 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
417 local_irq_save(flags); 700 local_irq_save(flags);
418 *rcu_preempt_ctrlblk.nexttail = head; 701 *rcu_preempt_ctrlblk.nexttail = head;
419 rcu_preempt_ctrlblk.nexttail = &head->next; 702 rcu_preempt_ctrlblk.nexttail = &head->next;
703 RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++);
420 rcu_preempt_start_gp(); /* checks to see if GP needed. */ 704 rcu_preempt_start_gp(); /* checks to see if GP needed. */
421 local_irq_restore(flags); 705 local_irq_restore(flags);
422} 706}
@@ -532,6 +816,7 @@ void synchronize_rcu_expedited(void)
532 816
533 /* Wait for tail of ->blkd_tasks list to drain. */ 817 /* Wait for tail of ->blkd_tasks list to drain. */
534 if (rcu_preempted_readers_exp()) 818 if (rcu_preempted_readers_exp())
819 rcu_initiate_expedited_boost();
535 wait_event(sync_rcu_preempt_exp_wq, 820 wait_event(sync_rcu_preempt_exp_wq,
536 !rcu_preempted_readers_exp()); 821 !rcu_preempted_readers_exp());
537 822
@@ -572,6 +857,27 @@ void exit_rcu(void)
572 857
573#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ 858#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
574 859
860#ifdef CONFIG_RCU_TRACE
861
862/*
863 * Because preemptible RCU does not exist, it is not necessary to
864 * dump out its statistics.
865 */
866static void show_tiny_preempt_stats(struct seq_file *m)
867{
868}
869
870#endif /* #ifdef CONFIG_RCU_TRACE */
871
872/*
873 * Because preemptible RCU does not exist, it is never necessary to
874 * boost preempted RCU readers.
875 */
876static int rcu_boost(void)
877{
878 return 0;
879}
880
575/* 881/*
576 * Because preemptible RCU does not exist, it never has any callbacks 882 * Because preemptible RCU does not exist, it never has any callbacks
577 * to check. 883 * to check.
@@ -599,17 +905,116 @@ static void rcu_preempt_process_callbacks(void)
599#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ 905#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
600 906
601#ifdef CONFIG_DEBUG_LOCK_ALLOC 907#ifdef CONFIG_DEBUG_LOCK_ALLOC
602
603#include <linux/kernel_stat.h> 908#include <linux/kernel_stat.h>
604 909
605/* 910/*
606 * During boot, we forgive RCU lockdep issues. After this function is 911 * During boot, we forgive RCU lockdep issues. After this function is
607 * invoked, we start taking RCU lockdep issues seriously. 912 * invoked, we start taking RCU lockdep issues seriously.
608 */ 913 */
609void rcu_scheduler_starting(void) 914void __init rcu_scheduler_starting(void)
610{ 915{
611 WARN_ON(nr_context_switches() > 0); 916 WARN_ON(nr_context_switches() > 0);
612 rcu_scheduler_active = 1; 917 rcu_scheduler_active = 1;
613} 918}
614 919
615#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 920#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
921
922#ifdef CONFIG_RCU_BOOST
923#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
924#else /* #ifdef CONFIG_RCU_BOOST */
925#define RCU_BOOST_PRIO 1
926#endif /* #else #ifdef CONFIG_RCU_BOOST */
927
928#ifdef CONFIG_RCU_TRACE
929
930#ifdef CONFIG_RCU_BOOST
931
932static void rcu_initiate_boost_trace(void)
933{
934 if (rcu_preempt_ctrlblk.gp_tasks == NULL)
935 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++;
936 else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
937 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++;
938 else if (rcu_preempt_ctrlblk.boosted_this_gp != 0)
939 rcu_preempt_ctrlblk.n_normal_balk_boosted++;
940 else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
941 rcu_preempt_ctrlblk.n_normal_balk_notyet++;
942 else
943 rcu_preempt_ctrlblk.n_normal_balk_nos++;
944}
945
946static void rcu_initiate_exp_boost_trace(void)
947{
948 if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
949 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++;
950 else
951 rcu_preempt_ctrlblk.n_exp_balk_nos++;
952}
953
954#endif /* #ifdef CONFIG_RCU_BOOST */
955
956static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
957{
958 unsigned long flags;
959
960 raw_local_irq_save(flags);
961 rcp->qlen -= n;
962 raw_local_irq_restore(flags);
963}
964
965/*
966 * Dump statistics for TINY_RCU, such as they are.
967 */
968static int show_tiny_stats(struct seq_file *m, void *unused)
969{
970 show_tiny_preempt_stats(m);
971 seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
972 seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
973 return 0;
974}
975
976static int show_tiny_stats_open(struct inode *inode, struct file *file)
977{
978 return single_open(file, show_tiny_stats, NULL);
979}
980
981static const struct file_operations show_tiny_stats_fops = {
982 .owner = THIS_MODULE,
983 .open = show_tiny_stats_open,
984 .read = seq_read,
985 .llseek = seq_lseek,
986 .release = single_release,
987};
988
989static struct dentry *rcudir;
990
991static int __init rcutiny_trace_init(void)
992{
993 struct dentry *retval;
994
995 rcudir = debugfs_create_dir("rcu", NULL);
996 if (!rcudir)
997 goto free_out;
998 retval = debugfs_create_file("rcudata", 0444, rcudir,
999 NULL, &show_tiny_stats_fops);
1000 if (!retval)
1001 goto free_out;
1002 return 0;
1003free_out:
1004 debugfs_remove_recursive(rcudir);
1005 return 1;
1006}
1007
1008static void __exit rcutiny_trace_cleanup(void)
1009{
1010 debugfs_remove_recursive(rcudir);
1011}
1012
1013module_init(rcutiny_trace_init);
1014module_exit(rcutiny_trace_cleanup);
1015
1016MODULE_AUTHOR("Paul E. McKenney");
1017MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
1018MODULE_LICENSE("GPL");
1019
1020#endif /* #ifdef CONFIG_RCU_TRACE */
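The CONFIG_RCU_TRACE block added at the end of rcutiny_plugin.h exposes the new qlen and boost counters through a debugfs file built on the standard single_open()/seq_file pairing, and tears the directory down with debugfs_remove_recursive(). The same boilerplate reduced to its skeleton, with illustrative file and symbol names:

#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/errno.h>

static struct dentry *example_dir;

static int example_show(struct seq_file *m, void *unused)
{
        seq_printf(m, "example: nothing to report\n");
        return 0;
}

static int example_open(struct inode *inode, struct file *file)
{
        return single_open(file, example_show, NULL);
}

static const struct file_operations example_fops = {
        .owner          = THIS_MODULE,
        .open           = example_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};

static int __init example_init(void)
{
        example_dir = debugfs_create_dir("example", NULL);
        if (!example_dir)
                return -ENOMEM;
        if (!debugfs_create_file("stats", 0444, example_dir, NULL,
                                 &example_fops)) {
                debugfs_remove_recursive(example_dir);
                return -ENOMEM;
        }
        return 0;
}

static void __exit example_exit(void)
{
        debugfs_remove_recursive(example_dir);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");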
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9d8e8fb2515f..89613f97ff26 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -47,6 +47,7 @@
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <asm/byteorder.h> 49#include <asm/byteorder.h>
50#include <linux/sched.h>
50 51
51MODULE_LICENSE("GPL"); 52MODULE_LICENSE("GPL");
52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
@@ -64,6 +65,9 @@ static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ 65static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff = 0; /* Hold time within burst (us). */ 66static int fqs_holdoff = 0; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 67static int fqs_stutter = 3; /* Wait time between bursts (s). */
68static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
69static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
70static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
67static char *torture_type = "rcu"; /* What RCU implementation to torture. */ 71static char *torture_type = "rcu"; /* What RCU implementation to torture. */
68 72
69module_param(nreaders, int, 0444); 73module_param(nreaders, int, 0444);
@@ -88,6 +92,12 @@ module_param(fqs_holdoff, int, 0444);
88MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 92MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
89module_param(fqs_stutter, int, 0444); 93module_param(fqs_stutter, int, 0444);
90MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 94MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
95module_param(test_boost, int, 0444);
96MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
97module_param(test_boost_interval, int, 0444);
98MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
99module_param(test_boost_duration, int, 0444);
100MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
91module_param(torture_type, charp, 0444); 101module_param(torture_type, charp, 0444);
92MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 102MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
93 103
@@ -109,6 +119,7 @@ static struct task_struct *stats_task;
109static struct task_struct *shuffler_task; 119static struct task_struct *shuffler_task;
110static struct task_struct *stutter_task; 120static struct task_struct *stutter_task;
111static struct task_struct *fqs_task; 121static struct task_struct *fqs_task;
122static struct task_struct *boost_tasks[NR_CPUS];
112 123
113#define RCU_TORTURE_PIPE_LEN 10 124#define RCU_TORTURE_PIPE_LEN 10
114 125
@@ -134,6 +145,12 @@ static atomic_t n_rcu_torture_alloc_fail;
134static atomic_t n_rcu_torture_free; 145static atomic_t n_rcu_torture_free;
135static atomic_t n_rcu_torture_mberror; 146static atomic_t n_rcu_torture_mberror;
136static atomic_t n_rcu_torture_error; 147static atomic_t n_rcu_torture_error;
148static long n_rcu_torture_boost_ktrerror;
149static long n_rcu_torture_boost_rterror;
150static long n_rcu_torture_boost_allocerror;
151static long n_rcu_torture_boost_afferror;
152static long n_rcu_torture_boost_failure;
153static long n_rcu_torture_boosts;
137static long n_rcu_torture_timers; 154static long n_rcu_torture_timers;
138static struct list_head rcu_torture_removed; 155static struct list_head rcu_torture_removed;
139static cpumask_var_t shuffle_tmp_mask; 156static cpumask_var_t shuffle_tmp_mask;
@@ -147,6 +164,16 @@ static int stutter_pause_test;
147#endif 164#endif
148int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 165int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
149 166
167#ifdef CONFIG_RCU_BOOST
168#define rcu_can_boost() 1
169#else /* #ifdef CONFIG_RCU_BOOST */
170#define rcu_can_boost() 0
171#endif /* #else #ifdef CONFIG_RCU_BOOST */
172
173static unsigned long boost_starttime; /* jiffies of next boost test start. */
174DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
175 /* and boost task create/destroy. */
176
150/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ 177/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
151 178
152#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ 179#define FULLSTOP_DONTSTOP 0 /* Normal operation. */
@@ -277,6 +304,7 @@ struct rcu_torture_ops {
277 void (*fqs)(void); 304 void (*fqs)(void);
278 int (*stats)(char *page); 305 int (*stats)(char *page);
279 int irq_capable; 306 int irq_capable;
307 int can_boost;
280 char *name; 308 char *name;
281}; 309};
282 310
@@ -366,6 +394,7 @@ static struct rcu_torture_ops rcu_ops = {
366 .fqs = rcu_force_quiescent_state, 394 .fqs = rcu_force_quiescent_state,
367 .stats = NULL, 395 .stats = NULL,
368 .irq_capable = 1, 396 .irq_capable = 1,
397 .can_boost = rcu_can_boost(),
369 .name = "rcu" 398 .name = "rcu"
370}; 399};
371 400
@@ -408,6 +437,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
408 .fqs = rcu_force_quiescent_state, 437 .fqs = rcu_force_quiescent_state,
409 .stats = NULL, 438 .stats = NULL,
410 .irq_capable = 1, 439 .irq_capable = 1,
440 .can_boost = rcu_can_boost(),
411 .name = "rcu_sync" 441 .name = "rcu_sync"
412}; 442};
413 443
@@ -424,6 +454,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
424 .fqs = rcu_force_quiescent_state, 454 .fqs = rcu_force_quiescent_state,
425 .stats = NULL, 455 .stats = NULL,
426 .irq_capable = 1, 456 .irq_capable = 1,
457 .can_boost = rcu_can_boost(),
427 .name = "rcu_expedited" 458 .name = "rcu_expedited"
428}; 459};
429 460
@@ -684,6 +715,110 @@ static struct rcu_torture_ops sched_expedited_ops = {
684}; 715};
685 716
686/* 717/*
718 * RCU torture priority-boost testing. Runs one real-time thread per
719 * CPU for moderate bursts, repeatedly registering RCU callbacks and
720 * spinning waiting for them to be invoked. If a given callback takes
721 * too long to be invoked, we assume that priority inversion has occurred.
722 */
723
724struct rcu_boost_inflight {
725 struct rcu_head rcu;
726 int inflight;
727};
728
729static void rcu_torture_boost_cb(struct rcu_head *head)
730{
731 struct rcu_boost_inflight *rbip =
732 container_of(head, struct rcu_boost_inflight, rcu);
733
734 smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
735 rbip->inflight = 0;
736}
737
738static int rcu_torture_boost(void *arg)
739{
740 unsigned long call_rcu_time;
741 unsigned long endtime;
742 unsigned long oldstarttime;
743 struct rcu_boost_inflight rbi = { .inflight = 0 };
744 struct sched_param sp;
745
746 VERBOSE_PRINTK_STRING("rcu_torture_boost started");
747
748 /* Set real-time priority. */
749 sp.sched_priority = 1;
750 if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
751 VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!");
752 n_rcu_torture_boost_rterror++;
753 }
754
755 /* Each pass through the following loop does one boost-test cycle. */
756 do {
757 /* Wait for the next test interval. */
758 oldstarttime = boost_starttime;
759 while (jiffies - oldstarttime > ULONG_MAX / 2) {
760 schedule_timeout_uninterruptible(1);
761 rcu_stutter_wait("rcu_torture_boost");
762 if (kthread_should_stop() ||
763 fullstop != FULLSTOP_DONTSTOP)
764 goto checkwait;
765 }
766
767 /* Do one boost-test interval. */
768 endtime = oldstarttime + test_boost_duration * HZ;
769 call_rcu_time = jiffies;
770 while (jiffies - endtime > ULONG_MAX / 2) {
771 /* If we don't have a callback in flight, post one. */
772 if (!rbi.inflight) {
773 smp_mb(); /* RCU core before ->inflight = 1. */
774 rbi.inflight = 1;
775 call_rcu(&rbi.rcu, rcu_torture_boost_cb);
776 if (jiffies - call_rcu_time >
777 test_boost_duration * HZ - HZ / 2) {
778 VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed");
779 n_rcu_torture_boost_failure++;
780 }
781 call_rcu_time = jiffies;
782 }
783 cond_resched();
784 rcu_stutter_wait("rcu_torture_boost");
785 if (kthread_should_stop() ||
786 fullstop != FULLSTOP_DONTSTOP)
787 goto checkwait;
788 }
789
790 /*
791 * Set the start time of the next test interval.
792 * Yes, this is vulnerable to long delays, but such
793 * delays simply cause a false negative for the next
794 * interval. Besides, we are running at RT priority,
795 * so delays should be relatively rare.
796 */
797 while (oldstarttime == boost_starttime) {
798 if (mutex_trylock(&boost_mutex)) {
799 boost_starttime = jiffies +
800 test_boost_interval * HZ;
801 n_rcu_torture_boosts++;
802 mutex_unlock(&boost_mutex);
803 break;
804 }
805 schedule_timeout_uninterruptible(1);
806 }
807
808 /* Go do the stutter. */
809checkwait: rcu_stutter_wait("rcu_torture_boost");
810 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
811
812 /* Clean up and exit. */
813 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
814 rcutorture_shutdown_absorb("rcu_torture_boost");
815 while (!kthread_should_stop() || rbi.inflight)
816 schedule_timeout_uninterruptible(1);
817 smp_mb(); /* order accesses to ->inflight before stack-frame death. */
818 return 0;
819}
820
821/*
687 * RCU torture force-quiescent-state kthread. Repeatedly induces 822 * RCU torture force-quiescent-state kthread. Repeatedly induces
688 * bursts of calls to force_quiescent_state(), increasing the probability 823 * bursts of calls to force_quiescent_state(), increasing the probability
689 * of occurrence of some important types of race conditions. 824 * of occurrence of some important types of race conditions.
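A note for readers of rcu_torture_boost() above: the loops compare times with "jiffies - x > ULONG_MAX / 2", an open-coded, wrap-safe way of asking whether jiffies is still before x. The conventional spelling uses the time_before()/time_after() macros from <linux/jiffies.h>; a sketch of the equivalent wait, not the module's code:

#include <linux/jiffies.h>
#include <linux/sched.h>

/* Wait until 'when' (in jiffies), wrap-safe; mirrors the loops above. */
static void example_wait_until(unsigned long when)
{
        while (time_before(jiffies, when))
                schedule_timeout_uninterruptible(1);
}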
@@ -933,7 +1068,8 @@ rcu_torture_printk(char *page)
933 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 1068 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
934 cnt += sprintf(&page[cnt], 1069 cnt += sprintf(&page[cnt],
935 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " 1070 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
936 "rtmbe: %d nt: %ld", 1071 "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld "
1072 "rtbf: %ld rtb: %ld nt: %ld",
937 rcu_torture_current, 1073 rcu_torture_current,
938 rcu_torture_current_version, 1074 rcu_torture_current_version,
939 list_empty(&rcu_torture_freelist), 1075 list_empty(&rcu_torture_freelist),
@@ -941,8 +1077,19 @@ rcu_torture_printk(char *page)
941 atomic_read(&n_rcu_torture_alloc_fail), 1077 atomic_read(&n_rcu_torture_alloc_fail),
942 atomic_read(&n_rcu_torture_free), 1078 atomic_read(&n_rcu_torture_free),
943 atomic_read(&n_rcu_torture_mberror), 1079 atomic_read(&n_rcu_torture_mberror),
1080 n_rcu_torture_boost_ktrerror,
1081 n_rcu_torture_boost_rterror,
1082 n_rcu_torture_boost_allocerror,
1083 n_rcu_torture_boost_afferror,
1084 n_rcu_torture_boost_failure,
1085 n_rcu_torture_boosts,
944 n_rcu_torture_timers); 1086 n_rcu_torture_timers);
945 if (atomic_read(&n_rcu_torture_mberror) != 0) 1087 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1088 n_rcu_torture_boost_ktrerror != 0 ||
1089 n_rcu_torture_boost_rterror != 0 ||
1090 n_rcu_torture_boost_allocerror != 0 ||
1091 n_rcu_torture_boost_afferror != 0 ||
1092 n_rcu_torture_boost_failure != 0)
946 cnt += sprintf(&page[cnt], " !!!"); 1093 cnt += sprintf(&page[cnt], " !!!");
947 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1094 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
948 if (i > 1) { 1095 if (i > 1) {
@@ -1094,22 +1241,91 @@ rcu_torture_stutter(void *arg)
1094} 1241}
1095 1242
1096static inline void 1243static inline void
1097rcu_torture_print_module_parms(char *tag) 1244rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1098{ 1245{
1099 printk(KERN_ALERT "%s" TORTURE_FLAG 1246 printk(KERN_ALERT "%s" TORTURE_FLAG
1100 "--- %s: nreaders=%d nfakewriters=%d " 1247 "--- %s: nreaders=%d nfakewriters=%d "
1101 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 1248 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
1102 "shuffle_interval=%d stutter=%d irqreader=%d " 1249 "shuffle_interval=%d stutter=%d irqreader=%d "
1103 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n", 1250 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1251 "test_boost=%d/%d test_boost_interval=%d "
1252 "test_boost_duration=%d\n",
1104 torture_type, tag, nrealreaders, nfakewriters, 1253 torture_type, tag, nrealreaders, nfakewriters,
1105 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1254 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1106 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter); 1255 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1256 test_boost, cur_ops->can_boost,
1257 test_boost_interval, test_boost_duration);
1107} 1258}
1108 1259
1109static struct notifier_block rcutorture_nb = { 1260static struct notifier_block rcutorture_shutdown_nb = {
1110 .notifier_call = rcutorture_shutdown_notify, 1261 .notifier_call = rcutorture_shutdown_notify,
1111}; 1262};
1112 1263
1264static void rcutorture_booster_cleanup(int cpu)
1265{
1266 struct task_struct *t;
1267
1268 if (boost_tasks[cpu] == NULL)
1269 return;
1270 mutex_lock(&boost_mutex);
1271 VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task");
1272 t = boost_tasks[cpu];
1273 boost_tasks[cpu] = NULL;
1274 mutex_unlock(&boost_mutex);
1275
1276 /* This must be outside of the mutex, otherwise deadlock! */
1277 kthread_stop(t);
1278}
1279
1280static int rcutorture_booster_init(int cpu)
1281{
1282 int retval;
1283
1284 if (boost_tasks[cpu] != NULL)
1285 return 0; /* Already created, nothing more to do. */
1286
1287 /* Don't allow time recalculation while creating a new task. */
1288 mutex_lock(&boost_mutex);
1289 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
1290 boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL,
1291 "rcu_torture_boost");
1292 if (IS_ERR(boost_tasks[cpu])) {
1293 retval = PTR_ERR(boost_tasks[cpu]);
1294 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
1295 n_rcu_torture_boost_ktrerror++;
1296 boost_tasks[cpu] = NULL;
1297 mutex_unlock(&boost_mutex);
1298 return retval;
1299 }
1300 kthread_bind(boost_tasks[cpu], cpu);
1301 wake_up_process(boost_tasks[cpu]);
1302 mutex_unlock(&boost_mutex);
1303 return 0;
1304}
1305
1306static int rcutorture_cpu_notify(struct notifier_block *self,
1307 unsigned long action, void *hcpu)
1308{
1309 long cpu = (long)hcpu;
1310
1311 switch (action) {
1312 case CPU_ONLINE:
1313 case CPU_DOWN_FAILED:
1314 (void)rcutorture_booster_init(cpu);
1315 break;
1316 case CPU_DOWN_PREPARE:
1317 rcutorture_booster_cleanup(cpu);
1318 break;
1319 default:
1320 break;
1321 }
1322 return NOTIFY_OK;
1323}
1324
1325static struct notifier_block rcutorture_cpu_nb = {
1326 .notifier_call = rcutorture_cpu_notify,
1327};
1328
1113static void 1329static void
1114rcu_torture_cleanup(void) 1330rcu_torture_cleanup(void)
1115{ 1331{
@@ -1127,7 +1343,7 @@ rcu_torture_cleanup(void)
1127 } 1343 }
1128 fullstop = FULLSTOP_RMMOD; 1344 fullstop = FULLSTOP_RMMOD;
1129 mutex_unlock(&fullstop_mutex); 1345 mutex_unlock(&fullstop_mutex);
1130 unregister_reboot_notifier(&rcutorture_nb); 1346 unregister_reboot_notifier(&rcutorture_shutdown_nb);
1131 if (stutter_task) { 1347 if (stutter_task) {
1132 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 1348 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
1133 kthread_stop(stutter_task); 1349 kthread_stop(stutter_task);
@@ -1184,6 +1400,12 @@ rcu_torture_cleanup(void)
1184 kthread_stop(fqs_task); 1400 kthread_stop(fqs_task);
1185 } 1401 }
1186 fqs_task = NULL; 1402 fqs_task = NULL;
1403 if ((test_boost == 1 && cur_ops->can_boost) ||
1404 test_boost == 2) {
1405 unregister_cpu_notifier(&rcutorture_cpu_nb);
1406 for_each_possible_cpu(i)
1407 rcutorture_booster_cleanup(i);
1408 }
1187 1409
1188 /* Wait for all RCU callbacks to fire. */ 1410 /* Wait for all RCU callbacks to fire. */
1189 1411
@@ -1195,9 +1417,9 @@ rcu_torture_cleanup(void)
1195 if (cur_ops->cleanup) 1417 if (cur_ops->cleanup)
1196 cur_ops->cleanup(); 1418 cur_ops->cleanup();
1197 if (atomic_read(&n_rcu_torture_error)) 1419 if (atomic_read(&n_rcu_torture_error))
1198 rcu_torture_print_module_parms("End of test: FAILURE"); 1420 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1199 else 1421 else
1200 rcu_torture_print_module_parms("End of test: SUCCESS"); 1422 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
1201} 1423}
1202 1424
1203static int __init 1425static int __init
@@ -1242,7 +1464,7 @@ rcu_torture_init(void)
1242 nrealreaders = nreaders; 1464 nrealreaders = nreaders;
1243 else 1465 else
1244 nrealreaders = 2 * num_online_cpus(); 1466 nrealreaders = 2 * num_online_cpus();
1245 rcu_torture_print_module_parms("Start of test"); 1467 rcu_torture_print_module_parms(cur_ops, "Start of test");
1246 fullstop = FULLSTOP_DONTSTOP; 1468 fullstop = FULLSTOP_DONTSTOP;
1247 1469
1248 /* Set up the freelist. */ 1470 /* Set up the freelist. */
@@ -1263,6 +1485,12 @@ rcu_torture_init(void)
1263 atomic_set(&n_rcu_torture_free, 0); 1485 atomic_set(&n_rcu_torture_free, 0);
1264 atomic_set(&n_rcu_torture_mberror, 0); 1486 atomic_set(&n_rcu_torture_mberror, 0);
1265 atomic_set(&n_rcu_torture_error, 0); 1487 atomic_set(&n_rcu_torture_error, 0);
1488 n_rcu_torture_boost_ktrerror = 0;
1489 n_rcu_torture_boost_rterror = 0;
1490 n_rcu_torture_boost_allocerror = 0;
1491 n_rcu_torture_boost_afferror = 0;
1492 n_rcu_torture_boost_failure = 0;
1493 n_rcu_torture_boosts = 0;
1266 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1494 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1267 atomic_set(&rcu_torture_wcount[i], 0); 1495 atomic_set(&rcu_torture_wcount[i], 0);
1268 for_each_possible_cpu(cpu) { 1496 for_each_possible_cpu(cpu) {
@@ -1376,7 +1604,27 @@ rcu_torture_init(void)
1376 goto unwind; 1604 goto unwind;
1377 } 1605 }
1378 } 1606 }
1379 register_reboot_notifier(&rcutorture_nb); 1607 if (test_boost_interval < 1)
1608 test_boost_interval = 1;
1609 if (test_boost_duration < 2)
1610 test_boost_duration = 2;
1611 if ((test_boost == 1 && cur_ops->can_boost) ||
1612 test_boost == 2) {
1613 int retval;
1614
1615 boost_starttime = jiffies + test_boost_interval * HZ;
1616 register_cpu_notifier(&rcutorture_cpu_nb);
1617 for_each_possible_cpu(i) {
1618 if (cpu_is_offline(i))
1619 continue; /* Heuristic: CPU can go offline. */
1620 retval = rcutorture_booster_init(i);
1621 if (retval < 0) {
1622 firsterr = retval;
1623 goto unwind;
1624 }
1625 }
1626 }
1627 register_reboot_notifier(&rcutorture_shutdown_nb);
1380 mutex_unlock(&fullstop_mutex); 1628 mutex_unlock(&fullstop_mutex);
1381 return 0; 1629 return 0;
1382 1630
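
Editor's note: the boost hunks above only wire up per-CPU kthread creation, CPU-hotplug handling, and teardown; the thread body itself, rcu_torture_boost(), is outside this excerpt. A purely hypothetical sketch of what such a boost test does -- run a SCHED_FIFO thread that can starve preempted RCU readers unless priority boosting rescues them -- might look like the following (names and structure are illustrative, not the patch's code):

/* Hypothetical illustration only -- not the rcu_torture_boost() added by this patch. */
static int example_rcu_boost_thread(void *arg)
{
	struct sched_param sp = { .sched_priority = 1 };
	unsigned long deadline;

	sched_setscheduler(current, SCHED_FIFO, &sp);	/* become an RT CPU hog */
	while (!kthread_should_stop()) {
		deadline = jiffies + test_boost_duration * HZ;
		/*
		 * Post callbacks and spin at RT priority; if they have not
		 * completed by "deadline", preempted readers were starved
		 * and a boost failure would be recorded.
		 */
		while (time_before(jiffies, deadline) && !kthread_should_stop())
			cpu_relax();
		schedule_timeout_interruptible(test_boost_interval * HZ);
	}
	return 0;
}
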
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ccdc04c47981..dd4aea806f8e 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -67,9 +67,6 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
67 .gpnum = -300, \ 67 .gpnum = -300, \
68 .completed = -300, \ 68 .completed = -300, \
69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ 69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
70 .orphan_cbs_list = NULL, \
71 .orphan_cbs_tail = &structname.orphan_cbs_list, \
72 .orphan_qlen = 0, \
73 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ 70 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
74 .n_force_qs = 0, \ 71 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 72 .n_force_qs_ngp = 0, \
@@ -367,8 +364,8 @@ void rcu_irq_exit(void)
367 WARN_ON_ONCE(rdtp->dynticks & 0x1); 364 WARN_ON_ONCE(rdtp->dynticks & 0x1);
368 365
369 /* If the interrupt queued a callback, get out of dyntick mode. */ 366 /* If the interrupt queued a callback, get out of dyntick mode. */
370 if (__get_cpu_var(rcu_sched_data).nxtlist || 367 if (__this_cpu_read(rcu_sched_data.nxtlist) ||
371 __get_cpu_var(rcu_bh_data).nxtlist) 368 __this_cpu_read(rcu_bh_data.nxtlist))
372 set_need_resched(); 369 set_need_resched();
373} 370}
374 371
@@ -620,9 +617,17 @@ static void __init check_cpu_stall_init(void)
620static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) 617static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
621{ 618{
622 if (rdp->gpnum != rnp->gpnum) { 619 if (rdp->gpnum != rnp->gpnum) {
623 rdp->qs_pending = 1; 620 /*
624 rdp->passed_quiesc = 0; 621 * If the current grace period is waiting for this CPU,
622 * set up to detect a quiescent state, otherwise don't
623 * go looking for one.
624 */
625 rdp->gpnum = rnp->gpnum; 625 rdp->gpnum = rnp->gpnum;
626 if (rnp->qsmask & rdp->grpmask) {
627 rdp->qs_pending = 1;
628 rdp->passed_quiesc = 0;
629 } else
630 rdp->qs_pending = 0;
626 } 631 }
627} 632}
628 633
@@ -681,6 +686,24 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
681 686
682 /* Remember that we saw this grace-period completion. */ 687 /* Remember that we saw this grace-period completion. */
683 rdp->completed = rnp->completed; 688 rdp->completed = rnp->completed;
689
690 /*
691 * If we were in an extended quiescent state, we may have
 692	 * missed some grace periods that other CPUs handled on
693 * our behalf. Catch up with this state to avoid noting
694 * spurious new grace periods. If another grace period
695 * has started, then rnp->gpnum will have advanced, so
696 * we will detect this later on.
697 */
698 if (ULONG_CMP_LT(rdp->gpnum, rdp->completed))
699 rdp->gpnum = rdp->completed;
700
701 /*
702 * If RCU does not need a quiescent state from this CPU,
703 * then make sure that this CPU doesn't go looking for one.
704 */
705 if ((rnp->qsmask & rdp->grpmask) == 0)
706 rdp->qs_pending = 0;
684 } 707 }
685} 708}
686 709
@@ -984,53 +1007,31 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
984#ifdef CONFIG_HOTPLUG_CPU 1007#ifdef CONFIG_HOTPLUG_CPU
985 1008
986/* 1009/*
 987 * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the 1010 * Move a dying CPU's RCU callbacks to an online CPU's callback list.
988 * specified flavor of RCU. The callbacks will be adopted by the next 1011 * Synchronization is not required because this function executes
989 * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever 1012 * in stop_machine() context.
990 * comes first. Because this is invoked from the CPU_DYING notifier,
991 * irqs are already disabled.
992 */ 1013 */
993static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) 1014static void rcu_send_cbs_to_online(struct rcu_state *rsp)
994{ 1015{
995 int i; 1016 int i;
1017 /* current DYING CPU is cleared in the cpu_online_mask */
1018 int receive_cpu = cpumask_any(cpu_online_mask);
996 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1019 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1020 struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
997 1021
998 if (rdp->nxtlist == NULL) 1022 if (rdp->nxtlist == NULL)
999 return; /* irqs disabled, so comparison is stable. */ 1023 return; /* irqs disabled, so comparison is stable. */
1000 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ 1024
1001 *rsp->orphan_cbs_tail = rdp->nxtlist; 1025 *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
1002 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; 1026 receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1027 receive_rdp->qlen += rdp->qlen;
1028 receive_rdp->n_cbs_adopted += rdp->qlen;
1029 rdp->n_cbs_orphaned += rdp->qlen;
1030
1003 rdp->nxtlist = NULL; 1031 rdp->nxtlist = NULL;
1004 for (i = 0; i < RCU_NEXT_SIZE; i++) 1032 for (i = 0; i < RCU_NEXT_SIZE; i++)
1005 rdp->nxttail[i] = &rdp->nxtlist; 1033 rdp->nxttail[i] = &rdp->nxtlist;
1006 rsp->orphan_qlen += rdp->qlen;
1007 rdp->n_cbs_orphaned += rdp->qlen;
1008 rdp->qlen = 0; 1034 rdp->qlen = 0;
1009 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
1010}
1011
1012/*
1013 * Adopt previously orphaned RCU callbacks.
1014 */
1015static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1016{
1017 unsigned long flags;
1018 struct rcu_data *rdp;
1019
1020 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1021 rdp = this_cpu_ptr(rsp->rda);
1022 if (rsp->orphan_cbs_list == NULL) {
1023 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1024 return;
1025 }
1026 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
1027 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
1028 rdp->qlen += rsp->orphan_qlen;
1029 rdp->n_cbs_adopted += rsp->orphan_qlen;
1030 rsp->orphan_cbs_list = NULL;
1031 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
1032 rsp->orphan_qlen = 0;
1033 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1034} 1035}
1035 1036
1036/* 1037/*
@@ -1081,8 +1082,6 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1081 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1082 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1082 if (need_report & RCU_OFL_TASKS_EXP_GP) 1083 if (need_report & RCU_OFL_TASKS_EXP_GP)
1083 rcu_report_exp_rnp(rsp, rnp); 1084 rcu_report_exp_rnp(rsp, rnp);
1084
1085 rcu_adopt_orphan_cbs(rsp);
1086} 1085}
1087 1086
1088/* 1087/*
@@ -1100,11 +1099,7 @@ static void rcu_offline_cpu(int cpu)
1100 1099
1101#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1100#else /* #ifdef CONFIG_HOTPLUG_CPU */
1102 1101
1103static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) 1102static void rcu_send_cbs_to_online(struct rcu_state *rsp)
1104{
1105}
1106
1107static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1108{ 1103{
1109} 1104}
1110 1105
@@ -1440,22 +1435,11 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1440 */ 1435 */
1441 local_irq_save(flags); 1436 local_irq_save(flags);
1442 rdp = this_cpu_ptr(rsp->rda); 1437 rdp = this_cpu_ptr(rsp->rda);
1443 rcu_process_gp_end(rsp, rdp);
1444 check_for_new_grace_period(rsp, rdp);
1445 1438
1446 /* Add the callback to our list. */ 1439 /* Add the callback to our list. */
1447 *rdp->nxttail[RCU_NEXT_TAIL] = head; 1440 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1448 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1441 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1449 1442
1450 /* Start a new grace period if one not already started. */
1451 if (!rcu_gp_in_progress(rsp)) {
1452 unsigned long nestflag;
1453 struct rcu_node *rnp_root = rcu_get_root(rsp);
1454
1455 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1456 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1457 }
1458
1459 /* 1443 /*
1460 * Force the grace period if too many callbacks or too long waiting. 1444 * Force the grace period if too many callbacks or too long waiting.
1461 * Enforce hysteresis, and don't invoke force_quiescent_state() 1445 * Enforce hysteresis, and don't invoke force_quiescent_state()
@@ -1464,12 +1448,27 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1464 * is the only one waiting for a grace period to complete. 1448 * is the only one waiting for a grace period to complete.
1465 */ 1449 */
1466 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 1450 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1467 rdp->blimit = LONG_MAX; 1451
1468 if (rsp->n_force_qs == rdp->n_force_qs_snap && 1452 /* Are we ignoring a completed grace period? */
1469 *rdp->nxttail[RCU_DONE_TAIL] != head) 1453 rcu_process_gp_end(rsp, rdp);
1470 force_quiescent_state(rsp, 0); 1454 check_for_new_grace_period(rsp, rdp);
1471 rdp->n_force_qs_snap = rsp->n_force_qs; 1455
1472 rdp->qlen_last_fqs_check = rdp->qlen; 1456 /* Start a new grace period if one not already started. */
1457 if (!rcu_gp_in_progress(rsp)) {
1458 unsigned long nestflag;
1459 struct rcu_node *rnp_root = rcu_get_root(rsp);
1460
1461 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1462 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock */
1463 } else {
1464 /* Give the grace period a kick. */
1465 rdp->blimit = LONG_MAX;
1466 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
1467 *rdp->nxttail[RCU_DONE_TAIL] != head)
1468 force_quiescent_state(rsp, 0);
1469 rdp->n_force_qs_snap = rsp->n_force_qs;
1470 rdp->qlen_last_fqs_check = rdp->qlen;
1471 }
1473 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) 1472 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1474 force_quiescent_state(rsp, 1); 1473 force_quiescent_state(rsp, 1);
1475 local_irq_restore(flags); 1474 local_irq_restore(flags);
@@ -1699,13 +1698,12 @@ static void _rcu_barrier(struct rcu_state *rsp,
1699 * decrement rcu_barrier_cpu_count -- otherwise the first CPU 1698 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
1700 * might complete its grace period before all of the other CPUs 1699 * might complete its grace period before all of the other CPUs
1701 * did their increment, causing this function to return too 1700 * did their increment, causing this function to return too
1702 * early. 1701 * early. Note that on_each_cpu() disables irqs, which prevents
1702 * any CPUs from coming online or going offline until each online
1703 * CPU has queued its RCU-barrier callback.
1703 */ 1704 */
1704 atomic_set(&rcu_barrier_cpu_count, 1); 1705 atomic_set(&rcu_barrier_cpu_count, 1);
1705 preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */
1706 rcu_adopt_orphan_cbs(rsp);
1707 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); 1706 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
1708 preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */
1709 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 1707 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
1710 complete(&rcu_barrier_completion); 1708 complete(&rcu_barrier_completion);
1711 wait_for_completion(&rcu_barrier_completion); 1709 wait_for_completion(&rcu_barrier_completion);
@@ -1831,18 +1829,13 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1831 case CPU_DYING: 1829 case CPU_DYING:
1832 case CPU_DYING_FROZEN: 1830 case CPU_DYING_FROZEN:
1833 /* 1831 /*
1834 * preempt_disable() in _rcu_barrier() prevents stop_machine(), 1832 * The whole machine is "stopped" except this CPU, so we can
1835 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" 1833 * touch any data without introducing corruption. We send the
1836 * returns, all online cpus have queued rcu_barrier_func(). 1834 * dying CPU's callbacks to an arbitrarily chosen online CPU.
1837 * The dying CPU clears its cpu_online_mask bit and
1838 * moves all of its RCU callbacks to ->orphan_cbs_list
1839 * in the context of stop_machine(), so subsequent calls
1840 * to _rcu_barrier() will adopt these callbacks and only
1841 * then queue rcu_barrier_func() on all remaining CPUs.
1842 */ 1835 */
1843 rcu_send_cbs_to_orphanage(&rcu_bh_state); 1836 rcu_send_cbs_to_online(&rcu_bh_state);
1844 rcu_send_cbs_to_orphanage(&rcu_sched_state); 1837 rcu_send_cbs_to_online(&rcu_sched_state);
1845 rcu_preempt_send_cbs_to_orphanage(); 1838 rcu_preempt_send_cbs_to_online();
1846 break; 1839 break;
1847 case CPU_DEAD: 1840 case CPU_DEAD:
1848 case CPU_DEAD_FROZEN: 1841 case CPU_DEAD_FROZEN:
@@ -1880,8 +1873,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
1880{ 1873{
1881 int i; 1874 int i;
1882 1875
1883 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) 1876 for (i = NUM_RCU_LVLS - 1; i > 0; i--)
1884 rsp->levelspread[i] = CONFIG_RCU_FANOUT; 1877 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
1878 rsp->levelspread[0] = RCU_FANOUT_LEAF;
1885} 1879}
1886#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ 1880#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
1887static void __init rcu_init_levelspread(struct rcu_state *rsp) 1881static void __init rcu_init_levelspread(struct rcu_state *rsp)
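
Editor's note: the new rcu_send_cbs_to_online() above splices the dying CPU's singly linked callback list straight onto a surviving CPU's list using the head/tail-pointer idiom (*tail = donor_head; tail = donor_tail). The user-space sketch below shows just that splice; the struct names are made up, and the kernel's per-segment nxttail[] array is collapsed into a single tail pointer.

#include <stdio.h>

struct cb {				/* stand-in for struct rcu_head */
	struct cb *next;
	int id;
};

struct cpu_cbs {			/* stand-in for struct rcu_data */
	struct cb *head;
	struct cb **tail;		/* points at the terminating NULL link */
	long qlen;
};

static void cpu_cbs_init(struct cpu_cbs *c)
{
	c->head = NULL;
	c->tail = &c->head;
	c->qlen = 0;
}

static void enqueue(struct cpu_cbs *c, struct cb *cb)
{
	cb->next = NULL;
	*c->tail = cb;			/* append at the tail ... */
	c->tail = &cb->next;		/* ... and advance the tail pointer */
	c->qlen++;
}

/* Splice all of "dying"'s callbacks onto "online", like rcu_send_cbs_to_online(). */
static void send_cbs_to_online(struct cpu_cbs *dying, struct cpu_cbs *online)
{
	if (!dying->head)
		return;
	*online->tail = dying->head;
	online->tail = dying->tail;
	online->qlen += dying->qlen;
	cpu_cbs_init(dying);		/* dying CPU ends up with an empty list */
}

int main(void)
{
	struct cpu_cbs a, b;
	struct cb cbs[5];
	int i;

	cpu_cbs_init(&a);
	cpu_cbs_init(&b);
	for (i = 0; i < 5; i++) {
		cbs[i].id = i;
		enqueue(i < 3 ? &a : &b, &cbs[i]);
	}
	send_cbs_to_online(&a, &b);
	for (struct cb *p = b.head; p; p = p->next)
		printf("cb %d\n", p->id);	/* prints 3 4 0 1 2 */
	return 0;
}

Because only pointers move, the splice is O(1) no matter how many callbacks the dying CPU had queued.
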
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 91d4170c5c13..e8f057e44e3e 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -31,46 +31,51 @@
31/* 31/*
32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. 32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
33 * In theory, it should be possible to add more levels straightforwardly. 33 * In theory, it should be possible to add more levels straightforwardly.
34 * In practice, this has not been tested, so there is probably some 34 * In practice, this did work well going from three levels to four.
35 * bug somewhere. 35 * Of course, your mileage may vary.
36 */ 36 */
37#define MAX_RCU_LVLS 4 37#define MAX_RCU_LVLS 4
38#define RCU_FANOUT (CONFIG_RCU_FANOUT) 38#if CONFIG_RCU_FANOUT > 16
39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) 39#define RCU_FANOUT_LEAF 16
40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) 40#else /* #if CONFIG_RCU_FANOUT > 16 */
41#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT) 41#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
42 42#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
43#if NR_CPUS <= RCU_FANOUT 43#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
44#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
45#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
46#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
47
48#if NR_CPUS <= RCU_FANOUT_1
44# define NUM_RCU_LVLS 1 49# define NUM_RCU_LVLS 1
45# define NUM_RCU_LVL_0 1 50# define NUM_RCU_LVL_0 1
46# define NUM_RCU_LVL_1 (NR_CPUS) 51# define NUM_RCU_LVL_1 (NR_CPUS)
47# define NUM_RCU_LVL_2 0 52# define NUM_RCU_LVL_2 0
48# define NUM_RCU_LVL_3 0 53# define NUM_RCU_LVL_3 0
49# define NUM_RCU_LVL_4 0 54# define NUM_RCU_LVL_4 0
50#elif NR_CPUS <= RCU_FANOUT_SQ 55#elif NR_CPUS <= RCU_FANOUT_2
51# define NUM_RCU_LVLS 2 56# define NUM_RCU_LVLS 2
52# define NUM_RCU_LVL_0 1 57# define NUM_RCU_LVL_0 1
53# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 58# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
54# define NUM_RCU_LVL_2 (NR_CPUS) 59# define NUM_RCU_LVL_2 (NR_CPUS)
55# define NUM_RCU_LVL_3 0 60# define NUM_RCU_LVL_3 0
56# define NUM_RCU_LVL_4 0 61# define NUM_RCU_LVL_4 0
57#elif NR_CPUS <= RCU_FANOUT_CUBE 62#elif NR_CPUS <= RCU_FANOUT_3
58# define NUM_RCU_LVLS 3 63# define NUM_RCU_LVLS 3
59# define NUM_RCU_LVL_0 1 64# define NUM_RCU_LVL_0 1
60# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) 65# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
61# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 66# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
62# define NUM_RCU_LVL_3 NR_CPUS 67# define NUM_RCU_LVL_3 (NR_CPUS)
63# define NUM_RCU_LVL_4 0 68# define NUM_RCU_LVL_4 0
64#elif NR_CPUS <= RCU_FANOUT_FOURTH 69#elif NR_CPUS <= RCU_FANOUT_4
65# define NUM_RCU_LVLS 4 70# define NUM_RCU_LVLS 4
66# define NUM_RCU_LVL_0 1 71# define NUM_RCU_LVL_0 1
67# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE) 72# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
68# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) 73# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
69# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) 74# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
70# define NUM_RCU_LVL_4 NR_CPUS 75# define NUM_RCU_LVL_4 (NR_CPUS)
71#else 76#else
72# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" 77# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
73#endif /* #if (NR_CPUS) <= RCU_FANOUT */ 78#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
74 79
75#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) 80#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
76#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) 81#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
@@ -203,8 +208,8 @@ struct rcu_data {
203 long qlen_last_fqs_check; 208 long qlen_last_fqs_check;
204 /* qlen at last check for QS forcing */ 209 /* qlen at last check for QS forcing */
205 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ 210 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
206 unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */ 211 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
207 unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */ 212 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
208 unsigned long n_force_qs_snap; 213 unsigned long n_force_qs_snap;
209 /* did other CPU force QS recently? */ 214 /* did other CPU force QS recently? */
210 long blimit; /* Upper limit on a processed batch */ 215 long blimit; /* Upper limit on a processed batch */
@@ -309,15 +314,7 @@ struct rcu_state {
309 /* End of fields guarded by root rcu_node's lock. */ 314 /* End of fields guarded by root rcu_node's lock. */
310 315
311 raw_spinlock_t onofflock; /* exclude on/offline and */ 316 raw_spinlock_t onofflock; /* exclude on/offline and */
312 /* starting new GP. Also */ 317 /* starting new GP. */
313 /* protects the following */
314 /* orphan_cbs fields. */
315 struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */
316 /* orphaned by all CPUs in */
317 /* a given leaf rcu_node */
318 /* going offline. */
319 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
320 long orphan_qlen; /* Number of orphaned cbs. */
321 raw_spinlock_t fqslock; /* Only one task forcing */ 318 raw_spinlock_t fqslock; /* Only one task forcing */
322 /* quiescent states. */ 319 /* quiescent states. */
323 unsigned long jiffies_force_qs; /* Time at which to invoke */ 320 unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -390,7 +387,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
390static int rcu_preempt_pending(int cpu); 387static int rcu_preempt_pending(int cpu);
391static int rcu_preempt_needs_cpu(int cpu); 388static int rcu_preempt_needs_cpu(int cpu);
392static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 389static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
393static void rcu_preempt_send_cbs_to_orphanage(void); 390static void rcu_preempt_send_cbs_to_online(void);
394static void __init __rcu_init_preempt(void); 391static void __init __rcu_init_preempt(void);
395static void rcu_needs_cpu_flush(void); 392static void rcu_needs_cpu_flush(void);
396 393
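
Editor's note: the new RCU_FANOUT_LEAF / RCU_FANOUT_n macros above just precompute how many rcu_node structures each level of the tree needs for a given NR_CPUS. A small user-space program that mirrors the same arithmetic (with NR_CPUS and the fanout as ordinary variables rather than Kconfig values) makes the shape easy to check; note that the trailing NR_CPUS entries in the header describe per-CPU rcu_data structures, which NUM_RCU_NODES subtracts back out, so only rcu_node levels are counted here.

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	int nr_cpus = 4096;			/* stand-in for NR_CPUS */
	int fanout = 64;			/* stand-in for CONFIG_RCU_FANOUT */
	int leaf = fanout > 16 ? 16 : fanout;	/* RCU_FANOUT_LEAF cap */
	long f1 = leaf;				/* RCU_FANOUT_1 */
	long f2 = f1 * fanout;			/* RCU_FANOUT_2 */
	long f3 = f2 * fanout;			/* RCU_FANOUT_3 */
	long f4 = f3 * fanout;			/* RCU_FANOUT_4 */

	if (nr_cpus <= f1)
		printf("1 level:  1 leaf node\n");
	else if (nr_cpus <= f2)
		printf("2 levels: 1 root, %ld leaves\n",
		       DIV_ROUND_UP((long)nr_cpus, f1));
	else if (nr_cpus <= f3)
		printf("3 levels: 1 root, %ld internal, %ld leaves\n",
		       DIV_ROUND_UP((long)nr_cpus, f2),
		       DIV_ROUND_UP((long)nr_cpus, f1));
	else if (nr_cpus <= f4)
		printf("4 levels: 1, %ld, %ld, %ld\n",
		       DIV_ROUND_UP((long)nr_cpus, f3),
		       DIV_ROUND_UP((long)nr_cpus, f2),
		       DIV_ROUND_UP((long)nr_cpus, f1));
	else
		printf("CONFIG_RCU_FANOUT insufficient for NR_CPUS\n");
	return 0;
}

With NR_CPUS=4096 and CONFIG_RCU_FANOUT=64 this prints "3 levels: 1 root, 4 internal, 256 leaves", matching the capped 16-wide leaf level the patch introduces.
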
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 71a4147473f9..a3638710dc67 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,7 @@
25 */ 25 */
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/stop_machine.h>
28 29
29/* 30/*
30 * Check the RCU kernel configuration parameters and print informative 31 * Check the RCU kernel configuration parameters and print informative
@@ -773,11 +774,11 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
773} 774}
774 775
775/* 776/*
 776 * Move preemptable RCU's callbacks to ->orphan_cbs_list. 777 * Move preemptable RCU's callbacks from the dying CPU to another online CPU.
777 */ 778 */
778static void rcu_preempt_send_cbs_to_orphanage(void) 779static void rcu_preempt_send_cbs_to_online(void)
779{ 780{
780 rcu_send_cbs_to_orphanage(&rcu_preempt_state); 781 rcu_send_cbs_to_online(&rcu_preempt_state);
781} 782}
782 783
783/* 784/*
@@ -1001,7 +1002,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
1001/* 1002/*
1002 * Because there is no preemptable RCU, there are no callbacks to move. 1003 * Because there is no preemptable RCU, there are no callbacks to move.
1003 */ 1004 */
1004static void rcu_preempt_send_cbs_to_orphanage(void) 1005static void rcu_preempt_send_cbs_to_online(void)
1005{ 1006{
1006} 1007}
1007 1008
@@ -1014,6 +1015,132 @@ static void __init __rcu_init_preempt(void)
1014 1015
1015#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1016#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1016 1017
1018#ifndef CONFIG_SMP
1019
1020void synchronize_sched_expedited(void)
1021{
1022 cond_resched();
1023}
1024EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1025
1026#else /* #ifndef CONFIG_SMP */
1027
1028static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
1029static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
1030
1031static int synchronize_sched_expedited_cpu_stop(void *data)
1032{
1033 /*
1034 * There must be a full memory barrier on each affected CPU
1035 * between the time that try_stop_cpus() is called and the
1036 * time that it returns.
1037 *
1038 * In the current initial implementation of cpu_stop, the
1039 * above condition is already met when the control reaches
1040 * this point and the following smp_mb() is not strictly
1041 * necessary. Do smp_mb() anyway for documentation and
1042 * robustness against future implementation changes.
1043 */
1044 smp_mb(); /* See above comment block. */
1045 return 0;
1046}
1047
1048/*
1049 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
1050 * approach to force grace period to end quickly. This consumes
1051 * significant time on all CPUs, and is thus not recommended for
1052 * any sort of common-case code.
1053 *
1054 * Note that it is illegal to call this function while holding any
1055 * lock that is acquired by a CPU-hotplug notifier. Failing to
1056 * observe this restriction will result in deadlock.
1057 *
1058 * This implementation can be thought of as an application of ticket
1059 * locking to RCU, with sync_sched_expedited_started and
1060 * sync_sched_expedited_done taking on the roles of the halves
1061 * of the ticket-lock word. Each task atomically increments
1062 * sync_sched_expedited_started upon entry, snapshotting the old value,
1063 * then attempts to stop all the CPUs. If this succeeds, then each
1064 * CPU will have executed a context switch, resulting in an RCU-sched
1065 * grace period. We are then done, so we use atomic_cmpxchg() to
1066 * update sync_sched_expedited_done to match our snapshot -- but
1067 * only if someone else has not already advanced past our snapshot.
1068 *
1069 * On the other hand, if try_stop_cpus() fails, we check the value
1070 * of sync_sched_expedited_done. If it has advanced past our
1071 * initial snapshot, then someone else must have forced a grace period
1072 * some time after we took our snapshot. In this case, our work is
1073 * done for us, and we can simply return. Otherwise, we try again,
1074 * but keep our initial snapshot for purposes of checking for someone
1075 * doing our work for us.
1076 *
1077 * If we fail too many times in a row, we fall back to synchronize_sched().
1078 */
1079void synchronize_sched_expedited(void)
1080{
1081 int firstsnap, s, snap, trycount = 0;
1082
1083 /* Note that atomic_inc_return() implies full memory barrier. */
1084 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
1085 get_online_cpus();
1086
1087 /*
1088 * Each pass through the following loop attempts to force a
1089 * context switch on each CPU.
1090 */
1091 while (try_stop_cpus(cpu_online_mask,
1092 synchronize_sched_expedited_cpu_stop,
1093 NULL) == -EAGAIN) {
1094 put_online_cpus();
1095
1096 /* No joy, try again later. Or just synchronize_sched(). */
1097 if (trycount++ < 10)
1098 udelay(trycount * num_online_cpus());
1099 else {
1100 synchronize_sched();
1101 return;
1102 }
1103
1104 /* Check to see if someone else did our work for us. */
1105 s = atomic_read(&sync_sched_expedited_done);
1106 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
1107 smp_mb(); /* ensure test happens before caller kfree */
1108 return;
1109 }
1110
1111 /*
1112 * Refetching sync_sched_expedited_started allows later
1113 * callers to piggyback on our grace period. We subtract
1114 * 1 to get the same token that the last incrementer got.
1115 * We retry after they started, so our grace period works
1116 * for them, and they started after our first try, so their
1117 * grace period works for us.
1118 */
1119 get_online_cpus();
1120 snap = atomic_read(&sync_sched_expedited_started) - 1;
1121 smp_mb(); /* ensure read is before try_stop_cpus(). */
1122 }
1123
1124 /*
1125 * Everyone up to our most recent fetch is covered by our grace
1126 * period. Update the counter, but only if our work is still
1127 * relevant -- which it won't be if someone who started later
1128 * than we did beat us to the punch.
1129 */
1130 do {
1131 s = atomic_read(&sync_sched_expedited_done);
1132 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
1133 smp_mb(); /* ensure test happens before caller kfree */
1134 break;
1135 }
1136 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
1137
1138 put_online_cpus();
1139}
1140EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1141
1142#endif /* #else #ifndef CONFIG_SMP */
1143
1017#if !defined(CONFIG_RCU_FAST_NO_HZ) 1144#if !defined(CONFIG_RCU_FAST_NO_HZ)
1018 1145
1019/* 1146/*
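
Editor's note: the block comment added above describes synchronize_sched_expedited() as a ticket-lock variant built from the started/done counter pair. The user-space model below exercises only that counter protocol, with threads standing in for concurrent callers; it drops try_stop_cpus(), the retry/udelay backoff, and the synchronize_sched() fallback, and uses GCC __sync builtins instead of kernel atomics.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static int started;	/* like sync_sched_expedited_started */
static int done;	/* like sync_sched_expedited_done */

static void do_expensive_grace_period(void)
{
	usleep(1000);	/* stands in for try_stop_cpus() forcing a grace period */
}

static void expedited(void)
{
	int snap, s;

	/* Take a ticket: everything started before this point is covered by us. */
	snap = __sync_add_and_fetch(&started, 1);

	/* Has somebody already completed a pass that covers our ticket? */
	s = __sync_fetch_and_add(&done, 0);
	if (s - snap >= 0)
		return;			/* their grace period serves us too */

	do_expensive_grace_period();

	/* Publish completion, but never move "done" backwards. */
	do {
		s = __sync_fetch_and_add(&done, 0);
		if (s - snap >= 0)
			break;		/* someone newer already advanced it */
	} while (!__sync_bool_compare_and_swap(&done, s, snap));
}

static void *worker(void *arg)
{
	for (int i = 0; i < 100; i++)
		expedited();
	return NULL;
}

int main(void)
{
	pthread_t t[4];

	for (int i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, worker, NULL);
	for (int i = 0; i < 4; i++)
		pthread_join(t[i], NULL);
	printf("started=%d done=%d\n", started, done);
	return 0;
}

The signed difference "s - snap >= 0" plays the role of the patch's UINT_CMP_GE(), keeping the comparison correct even when the counters wrap.
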
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d15430b9d122..c8e97853b970 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -166,13 +166,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
166 166
167 gpnum = rsp->gpnum; 167 gpnum = rsp->gpnum;
168 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " 168 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
169 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", 169 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
170 rsp->completed, gpnum, rsp->signaled, 170 rsp->completed, gpnum, rsp->signaled,
171 (long)(rsp->jiffies_force_qs - jiffies), 171 (long)(rsp->jiffies_force_qs - jiffies),
172 (int)(jiffies & 0xffff), 172 (int)(jiffies & 0xffff),
173 rsp->n_force_qs, rsp->n_force_qs_ngp, 173 rsp->n_force_qs, rsp->n_force_qs_ngp,
174 rsp->n_force_qs - rsp->n_force_qs_ngp, 174 rsp->n_force_qs - rsp->n_force_qs_ngp,
175 rsp->n_force_qs_lh, rsp->orphan_qlen); 175 rsp->n_force_qs_lh);
176 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { 176 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
177 if (rnp->level != level) { 177 if (rnp->level != level) {
178 seq_puts(m, "\n"); 178 seq_puts(m, "\n");
@@ -300,7 +300,7 @@ static const struct file_operations rcu_pending_fops = {
300 300
301static struct dentry *rcudir; 301static struct dentry *rcudir;
302 302
303static int __init rcuclassic_trace_init(void) 303static int __init rcutree_trace_init(void)
304{ 304{
305 struct dentry *retval; 305 struct dentry *retval;
306 306
@@ -337,14 +337,14 @@ free_out:
337 return 1; 337 return 1;
338} 338}
339 339
340static void __exit rcuclassic_trace_cleanup(void) 340static void __exit rcutree_trace_cleanup(void)
341{ 341{
342 debugfs_remove_recursive(rcudir); 342 debugfs_remove_recursive(rcudir);
343} 343}
344 344
345 345
346module_init(rcuclassic_trace_init); 346module_init(rcutree_trace_init);
347module_exit(rcuclassic_trace_cleanup); 347module_exit(rcutree_trace_cleanup);
348 348
349MODULE_AUTHOR("Paul E. McKenney"); 349MODULE_AUTHOR("Paul E. McKenney");
350MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); 350MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
diff --git a/kernel/resource.c b/kernel/resource.c
index 9fad33efd0db..798e2fae2a06 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -40,23 +40,6 @@ EXPORT_SYMBOL(iomem_resource);
40 40
41static DEFINE_RWLOCK(resource_lock); 41static DEFINE_RWLOCK(resource_lock);
42 42
43/*
44 * By default, we allocate free space bottom-up. The architecture can request
45 * top-down by clearing this flag. The user can override the architecture's
46 * choice with the "resource_alloc_from_bottom" kernel boot option, but that
47 * should only be a debugging tool.
48 */
49int resource_alloc_from_bottom = 1;
50
51static __init int setup_alloc_from_bottom(char *s)
52{
53 printk(KERN_INFO
54 "resource: allocating from bottom-up; please report a bug\n");
55 resource_alloc_from_bottom = 1;
56 return 0;
57}
58early_param("resource_alloc_from_bottom", setup_alloc_from_bottom);
59
60static void *r_next(struct seq_file *m, void *v, loff_t *pos) 43static void *r_next(struct seq_file *m, void *v, loff_t *pos)
61{ 44{
62 struct resource *p = v; 45 struct resource *p = v;
@@ -374,6 +357,10 @@ int __weak page_is_ram(unsigned long pfn)
374 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; 357 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
375} 358}
376 359
360void __weak arch_remove_reservations(struct resource *avail)
361{
362}
363
377static resource_size_t simple_align_resource(void *data, 364static resource_size_t simple_align_resource(void *data,
378 const struct resource *avail, 365 const struct resource *avail,
379 resource_size_t size, 366 resource_size_t size,
@@ -397,74 +384,7 @@ static bool resource_contains(struct resource *res1, struct resource *res2)
397} 384}
398 385
399/* 386/*
400 * Find the resource before "child" in the sibling list of "root" children.
401 */
402static struct resource *find_sibling_prev(struct resource *root, struct resource *child)
403{
404 struct resource *this;
405
406 for (this = root->child; this; this = this->sibling)
407 if (this->sibling == child)
408 return this;
409
410 return NULL;
411}
412
413/*
414 * Find empty slot in the resource tree given range and alignment. 387 * Find empty slot in the resource tree given range and alignment.
415 * This version allocates from the end of the root resource first.
416 */
417static int find_resource_from_top(struct resource *root, struct resource *new,
418 resource_size_t size, resource_size_t min,
419 resource_size_t max, resource_size_t align,
420 resource_size_t (*alignf)(void *,
421 const struct resource *,
422 resource_size_t,
423 resource_size_t),
424 void *alignf_data)
425{
426 struct resource *this;
427 struct resource tmp, avail, alloc;
428
429 tmp.start = root->end;
430 tmp.end = root->end;
431
432 this = find_sibling_prev(root, NULL);
433 for (;;) {
434 if (this) {
435 if (this->end < root->end)
436 tmp.start = this->end + 1;
437 } else
438 tmp.start = root->start;
439
440 resource_clip(&tmp, min, max);
441
442 /* Check for overflow after ALIGN() */
443 avail = *new;
444 avail.start = ALIGN(tmp.start, align);
445 avail.end = tmp.end;
446 if (avail.start >= tmp.start) {
447 alloc.start = alignf(alignf_data, &avail, size, align);
448 alloc.end = alloc.start + size - 1;
449 if (resource_contains(&avail, &alloc)) {
450 new->start = alloc.start;
451 new->end = alloc.end;
452 return 0;
453 }
454 }
455
456 if (!this || this->start == root->start)
457 break;
458
459 tmp.end = this->start - 1;
460 this = find_sibling_prev(root, this);
461 }
462 return -EBUSY;
463}
464
465/*
466 * Find empty slot in the resource tree given range and alignment.
467 * This version allocates from the beginning of the root resource first.
468 */ 388 */
469static int find_resource(struct resource *root, struct resource *new, 389static int find_resource(struct resource *root, struct resource *new,
470 resource_size_t size, resource_size_t min, 390 resource_size_t size, resource_size_t min,
@@ -478,23 +398,24 @@ static int find_resource(struct resource *root, struct resource *new,
478 struct resource *this = root->child; 398 struct resource *this = root->child;
479 struct resource tmp = *new, avail, alloc; 399 struct resource tmp = *new, avail, alloc;
480 400
401 tmp.flags = new->flags;
481 tmp.start = root->start; 402 tmp.start = root->start;
482 /* 403 /*
483 * Skip past an allocated resource that starts at 0, since the 404 * Skip past an allocated resource that starts at 0, since the assignment
484 * assignment of this->start - 1 to tmp->end below would cause an 405 * of this->start - 1 to tmp->end below would cause an underflow.
485 * underflow.
486 */ 406 */
487 if (this && this->start == 0) { 407 if (this && this->start == 0) {
488 tmp.start = this->end + 1; 408 tmp.start = this->end + 1;
489 this = this->sibling; 409 this = this->sibling;
490 } 410 }
491 for (;;) { 411 for(;;) {
492 if (this) 412 if (this)
493 tmp.end = this->start - 1; 413 tmp.end = this->start - 1;
494 else 414 else
495 tmp.end = root->end; 415 tmp.end = root->end;
496 416
497 resource_clip(&tmp, min, max); 417 resource_clip(&tmp, min, max);
418 arch_remove_reservations(&tmp);
498 419
499 /* Check for overflow after ALIGN() */ 420 /* Check for overflow after ALIGN() */
500 avail = *new; 421 avail = *new;
@@ -509,10 +430,8 @@ static int find_resource(struct resource *root, struct resource *new,
509 return 0; 430 return 0;
510 } 431 }
511 } 432 }
512
513 if (!this) 433 if (!this)
514 break; 434 break;
515
516 tmp.start = this->end + 1; 435 tmp.start = this->end + 1;
517 this = this->sibling; 436 this = this->sibling;
518 } 437 }
@@ -545,10 +464,7 @@ int allocate_resource(struct resource *root, struct resource *new,
545 alignf = simple_align_resource; 464 alignf = simple_align_resource;
546 465
547 write_lock(&resource_lock); 466 write_lock(&resource_lock);
548 if (resource_alloc_from_bottom) 467 err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
549 err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
550 else
551 err = find_resource_from_top(root, new, size, min, max, align, alignf, alignf_data);
552 if (err >= 0 && __request_resource(root, new)) 468 if (err >= 0 && __request_resource(root, new))
553 err = -EBUSY; 469 err = -EBUSY;
554 write_unlock(&resource_lock); 470 write_unlock(&resource_lock);
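
Editor's note: with find_resource_from_top() and the resource_alloc_from_bottom switch gone, allocation is again one bottom-up walk: take the gap after the previous child, stop before the next one, align, and use the first hole that fits. The sketch below reproduces that loop over a sorted sibling list with made-up types; it omits the min/max clipping, the alignf callback, the new arch_remove_reservations() hook, and the special case for a child starting at address 0.

#include <stdio.h>

struct res {
	unsigned long start, end;	/* inclusive range, like struct resource */
	struct res *sibling;		/* next child, sorted and non-overlapping */
	struct res *child;		/* first child */
};

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

/* First-fit bottom-up hole search, in the spirit of find_resource(). */
static int find_hole(struct res *root, unsigned long size, unsigned long align,
		     unsigned long *out_start)
{
	struct res *this = root->child;
	unsigned long start = root->start, end;

	for (;;) {
		end = this ? this->start - 1 : root->end;	/* gap ends before next child */
		start = ALIGN_UP(start, align);
		if (start <= end && end - start + 1 >= size) {
			*out_start = start;
			return 0;
		}
		if (!this)
			return -1;				/* no gap fits: -EBUSY */
		start = this->end + 1;				/* try after this child */
		this = this->sibling;
	}
}

int main(void)
{
	struct res c2 = { 0x5000, 0x5fff, NULL, NULL };
	struct res c1 = { 0x1000, 0x2fff, &c2, NULL };
	struct res root = { 0x0000, 0xffff, NULL, &c1 };
	unsigned long start;

	if (!find_hole(&root, 0x1000, 0x1000, &start))
		printf("hole at 0x%lx\n", start);	/* expect 0x0 */
	if (!find_hole(&root, 0x3000, 0x1000, &start))
		printf("hole at 0x%lx\n", start);	/* expect 0x6000 */
	return 0;
}
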
diff --git a/kernel/sched.c b/kernel/sched.c
index aa14a56f9d03..18d38e4ec7ba 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,9 +75,11 @@
75 75
76#include <asm/tlb.h> 76#include <asm/tlb.h>
77#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
78#include <asm/mutex.h>
78 79
79#include "sched_cpupri.h" 80#include "sched_cpupri.h"
80#include "workqueue_sched.h" 81#include "workqueue_sched.h"
82#include "sched_autogroup.h"
81 83
82#define CREATE_TRACE_POINTS 84#define CREATE_TRACE_POINTS
83#include <trace/events/sched.h> 85#include <trace/events/sched.h>
@@ -253,6 +255,8 @@ struct task_group {
253 /* runqueue "owned" by this group on each cpu */ 255 /* runqueue "owned" by this group on each cpu */
254 struct cfs_rq **cfs_rq; 256 struct cfs_rq **cfs_rq;
255 unsigned long shares; 257 unsigned long shares;
258
259 atomic_t load_weight;
256#endif 260#endif
257 261
258#ifdef CONFIG_RT_GROUP_SCHED 262#ifdef CONFIG_RT_GROUP_SCHED
@@ -268,25 +272,18 @@ struct task_group {
268 struct task_group *parent; 272 struct task_group *parent;
269 struct list_head siblings; 273 struct list_head siblings;
270 struct list_head children; 274 struct list_head children;
271};
272 275
273#define root_task_group init_task_group 276#ifdef CONFIG_SCHED_AUTOGROUP
277 struct autogroup *autogroup;
278#endif
279};
274 280
275/* task_group_lock serializes add/remove of task groups and also changes to 281/* task_group_lock serializes the addition/removal of task groups */
276 * a task group's cpu shares.
277 */
278static DEFINE_SPINLOCK(task_group_lock); 282static DEFINE_SPINLOCK(task_group_lock);
279 283
280#ifdef CONFIG_FAIR_GROUP_SCHED 284#ifdef CONFIG_FAIR_GROUP_SCHED
281 285
282#ifdef CONFIG_SMP 286# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
283static int root_task_group_empty(void)
284{
285 return list_empty(&root_task_group.children);
286}
287#endif
288
289# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
290 287
291/* 288/*
292 * A weight of 0 or 1 can cause arithmetics problems. 289 * A weight of 0 or 1 can cause arithmetics problems.
@@ -299,13 +296,13 @@ static int root_task_group_empty(void)
299#define MIN_SHARES 2 296#define MIN_SHARES 2
300#define MAX_SHARES (1UL << 18) 297#define MAX_SHARES (1UL << 18)
301 298
302static int init_task_group_load = INIT_TASK_GROUP_LOAD; 299static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
303#endif 300#endif
304 301
305/* Default task group. 302/* Default task group.
306 * Every task in the system belongs to this group at bootup. 303 * Every task in the system belongs to this group at bootup.
307 */ 304 */
308struct task_group init_task_group; 305struct task_group root_task_group;
309 306
310#endif /* CONFIG_CGROUP_SCHED */ 307#endif /* CONFIG_CGROUP_SCHED */
311 308
@@ -342,6 +339,7 @@ struct cfs_rq {
342 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 339 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
343 * list is used during load balance. 340 * list is used during load balance.
344 */ 341 */
342 int on_list;
345 struct list_head leaf_cfs_rq_list; 343 struct list_head leaf_cfs_rq_list;
346 struct task_group *tg; /* group that "owns" this runqueue */ 344 struct task_group *tg; /* group that "owns" this runqueue */
347 345
@@ -360,14 +358,17 @@ struct cfs_rq {
360 unsigned long h_load; 358 unsigned long h_load;
361 359
362 /* 360 /*
363 * this cpu's part of tg->shares 361 * Maintaining per-cpu shares distribution for group scheduling
362 *
363 * load_stamp is the last time we updated the load average
364 * load_last is the last time we updated the load average and saw load
365 * load_unacc_exec_time is currently unaccounted execution time
364 */ 366 */
365 unsigned long shares; 367 u64 load_avg;
368 u64 load_period;
369 u64 load_stamp, load_last, load_unacc_exec_time;
366 370
367 /* 371 unsigned long load_contribution;
368 * load.weight at the time we set shares
369 */
370 unsigned long rq_weight;
371#endif 372#endif
372#endif 373#endif
373}; 374};
@@ -552,26 +553,13 @@ struct rq {
552 /* try_to_wake_up() stats */ 553 /* try_to_wake_up() stats */
553 unsigned int ttwu_count; 554 unsigned int ttwu_count;
554 unsigned int ttwu_local; 555 unsigned int ttwu_local;
555
556 /* BKL stats */
557 unsigned int bkl_count;
558#endif 556#endif
559}; 557};
560 558
561static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 559static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
562 560
563static inline
564void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
565{
566 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
567 561
568 /* 562static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
569 * A queue event has occurred, and we're going to schedule. In
570 * this case, we can save a useless back to back clock update.
571 */
572 if (test_tsk_need_resched(p))
573 rq->skip_clock_update = 1;
574}
575 563
576static inline int cpu_of(struct rq *rq) 564static inline int cpu_of(struct rq *rq)
577{ 565{
@@ -615,11 +603,17 @@ static inline int cpu_of(struct rq *rq)
615 */ 603 */
616static inline struct task_group *task_group(struct task_struct *p) 604static inline struct task_group *task_group(struct task_struct *p)
617{ 605{
606 struct task_group *tg;
618 struct cgroup_subsys_state *css; 607 struct cgroup_subsys_state *css;
619 608
609 if (p->flags & PF_EXITING)
610 return &root_task_group;
611
620 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 612 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
621 lockdep_is_held(&task_rq(p)->lock)); 613 lockdep_is_held(&task_rq(p)->lock));
622 return container_of(css, struct task_group, css); 614 tg = container_of(css, struct task_group, css);
615
616 return autogroup_task_group(p, tg);
623} 617}
624 618
625/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 619/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -646,22 +640,18 @@ static inline struct task_group *task_group(struct task_struct *p)
646 640
647#endif /* CONFIG_CGROUP_SCHED */ 641#endif /* CONFIG_CGROUP_SCHED */
648 642
649static u64 irq_time_cpu(int cpu); 643static void update_rq_clock_task(struct rq *rq, s64 delta);
650static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
651 644
652inline void update_rq_clock(struct rq *rq) 645static void update_rq_clock(struct rq *rq)
653{ 646{
654 if (!rq->skip_clock_update) { 647 s64 delta;
655 int cpu = cpu_of(rq);
656 u64 irq_time;
657 648
658 rq->clock = sched_clock_cpu(cpu); 649 if (rq->skip_clock_update)
659 irq_time = irq_time_cpu(cpu); 650 return;
660 if (rq->clock - irq_time > rq->clock_task)
661 rq->clock_task = rq->clock - irq_time;
662 651
663 sched_irq_time_avg_update(rq, irq_time); 652 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
664 } 653 rq->clock += delta;
654 update_rq_clock_task(rq, delta);
665} 655}
666 656
667/* 657/*
@@ -751,7 +741,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
751 buf[cnt] = 0; 741 buf[cnt] = 0;
752 cmp = strstrip(buf); 742 cmp = strstrip(buf);
753 743
754 if (strncmp(buf, "NO_", 3) == 0) { 744 if (strncmp(cmp, "NO_", 3) == 0) {
755 neg = 1; 745 neg = 1;
756 cmp += 3; 746 cmp += 3;
757 } 747 }
@@ -807,20 +797,6 @@ late_initcall(sched_init_debug);
807const_debug unsigned int sysctl_sched_nr_migrate = 32; 797const_debug unsigned int sysctl_sched_nr_migrate = 32;
808 798
809/* 799/*
810 * ratelimit for updating the group shares.
811 * default: 0.25ms
812 */
813unsigned int sysctl_sched_shares_ratelimit = 250000;
814unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
815
816/*
817 * Inject some fuzzyness into changing the per-cpu group shares
818 * this avoids remote rq-locks at the expense of fairness.
819 * default: 4
820 */
821unsigned int sysctl_sched_shares_thresh = 4;
822
823/*
824 * period over which we average the RT time consumption, measured 800 * period over which we average the RT time consumption, measured
825 * in ms. 801 * in ms.
826 * 802 *
@@ -1369,6 +1345,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1369 lw->inv_weight = 0; 1345 lw->inv_weight = 0;
1370} 1346}
1371 1347
1348static inline void update_load_set(struct load_weight *lw, unsigned long w)
1349{
1350 lw->weight = w;
1351 lw->inv_weight = 0;
1352}
1353
1372/* 1354/*
1373 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1355 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1374 * of tasks with abnormal "nice" values across CPUs the contribution that 1356 * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1557,101 +1539,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1557 1539
1558#ifdef CONFIG_FAIR_GROUP_SCHED 1540#ifdef CONFIG_FAIR_GROUP_SCHED
1559 1541
1560static __read_mostly unsigned long __percpu *update_shares_data;
1561
1562static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1563
1564/*
1565 * Calculate and set the cpu's group shares.
1566 */
1567static void update_group_shares_cpu(struct task_group *tg, int cpu,
1568 unsigned long sd_shares,
1569 unsigned long sd_rq_weight,
1570 unsigned long *usd_rq_weight)
1571{
1572 unsigned long shares, rq_weight;
1573 int boost = 0;
1574
1575 rq_weight = usd_rq_weight[cpu];
1576 if (!rq_weight) {
1577 boost = 1;
1578 rq_weight = NICE_0_LOAD;
1579 }
1580
1581 /*
1582 * \Sum_j shares_j * rq_weight_i
1583 * shares_i = -----------------------------
1584 * \Sum_j rq_weight_j
1585 */
1586 shares = (sd_shares * rq_weight) / sd_rq_weight;
1587 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1588
1589 if (abs(shares - tg->se[cpu]->load.weight) >
1590 sysctl_sched_shares_thresh) {
1591 struct rq *rq = cpu_rq(cpu);
1592 unsigned long flags;
1593
1594 raw_spin_lock_irqsave(&rq->lock, flags);
1595 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1596 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1597 __set_se_shares(tg->se[cpu], shares);
1598 raw_spin_unlock_irqrestore(&rq->lock, flags);
1599 }
1600}
1601
1602/*
1603 * Re-compute the task group their per cpu shares over the given domain.
1604 * This needs to be done in a bottom-up fashion because the rq weight of a
1605 * parent group depends on the shares of its child groups.
1606 */
1607static int tg_shares_up(struct task_group *tg, void *data)
1608{
1609 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1610 unsigned long *usd_rq_weight;
1611 struct sched_domain *sd = data;
1612 unsigned long flags;
1613 int i;
1614
1615 if (!tg->se[0])
1616 return 0;
1617
1618 local_irq_save(flags);
1619 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1620
1621 for_each_cpu(i, sched_domain_span(sd)) {
1622 weight = tg->cfs_rq[i]->load.weight;
1623 usd_rq_weight[i] = weight;
1624
1625 rq_weight += weight;
1626 /*
1627 * If there are currently no tasks on the cpu pretend there
1628 * is one of average load so that when a new task gets to
1629 * run here it will not get delayed by group starvation.
1630 */
1631 if (!weight)
1632 weight = NICE_0_LOAD;
1633
1634 sum_weight += weight;
1635 shares += tg->cfs_rq[i]->shares;
1636 }
1637
1638 if (!rq_weight)
1639 rq_weight = sum_weight;
1640
1641 if ((!shares && rq_weight) || shares > tg->shares)
1642 shares = tg->shares;
1643
1644 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1645 shares = tg->shares;
1646
1647 for_each_cpu(i, sched_domain_span(sd))
1648 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1649
1650 local_irq_restore(flags);
1651
1652 return 0;
1653}
1654
1655/* 1542/*
1656 * Compute the cpu's hierarchical load factor for each task group. 1543 * Compute the cpu's hierarchical load factor for each task group.
1657 * This needs to be done in a top-down fashion because the load of a child 1544 * This needs to be done in a top-down fashion because the load of a child
@@ -1666,7 +1553,7 @@ static int tg_load_down(struct task_group *tg, void *data)
1666 load = cpu_rq(cpu)->load.weight; 1553 load = cpu_rq(cpu)->load.weight;
1667 } else { 1554 } else {
1668 load = tg->parent->cfs_rq[cpu]->h_load; 1555 load = tg->parent->cfs_rq[cpu]->h_load;
1669 load *= tg->cfs_rq[cpu]->shares; 1556 load *= tg->se[cpu]->load.weight;
1670 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1557 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1671 } 1558 }
1672 1559
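
Editor's note, a rough worked example for the tg_load_down() change above, with invented numbers: if the parent group's h_load on a CPU is 2048, this group's sched_entity weight there is 512, and the parent cfs_rq's total load.weight is 1024, then h_load = 2048 * 512 / (1024 + 1) ≈ 1023, i.e. roughly half of the parent's hierarchical load is now attributed through the entity's own weight rather than through the removed cfs_rq->shares value.
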
@@ -1675,34 +1562,11 @@ static int tg_load_down(struct task_group *tg, void *data)
1675 return 0; 1562 return 0;
1676} 1563}
1677 1564
1678static void update_shares(struct sched_domain *sd)
1679{
1680 s64 elapsed;
1681 u64 now;
1682
1683 if (root_task_group_empty())
1684 return;
1685
1686 now = local_clock();
1687 elapsed = now - sd->last_update;
1688
1689 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1690 sd->last_update = now;
1691 walk_tg_tree(tg_nop, tg_shares_up, sd);
1692 }
1693}
1694
1695static void update_h_load(long cpu) 1565static void update_h_load(long cpu)
1696{ 1566{
1697 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1567 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1698} 1568}
1699 1569
1700#else
1701
1702static inline void update_shares(struct sched_domain *sd)
1703{
1704}
1705
1706#endif 1570#endif
1707 1571
1708#ifdef CONFIG_PREEMPT 1572#ifdef CONFIG_PREEMPT
@@ -1824,15 +1688,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1824 1688
1825#endif 1689#endif
1826 1690
1827#ifdef CONFIG_FAIR_GROUP_SCHED
1828static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1829{
1830#ifdef CONFIG_SMP
1831 cfs_rq->shares = shares;
1832#endif
1833}
1834#endif
1835
1836static void calc_load_account_idle(struct rq *this_rq); 1691static void calc_load_account_idle(struct rq *this_rq);
1837static void update_sysctl(void); 1692static void update_sysctl(void);
1838static int get_update_sysctl_factor(void); 1693static int get_update_sysctl_factor(void);
@@ -1934,10 +1789,9 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1934 * They are read and saved off onto struct rq in update_rq_clock(). 1789 * They are read and saved off onto struct rq in update_rq_clock().
1935 * This may result in another CPU reading this CPU's irq time and can 1790 * This may result in another CPU reading this CPU's irq time and can
1936 * race with irq/account_system_vtime on this CPU. We would either get old 1791 * race with irq/account_system_vtime on this CPU. We would either get old
1937 * or new value (or semi updated value on 32 bit) with a side effect of 1792 * or new value with a side effect of accounting a slice of irq time to wrong
1938 * accounting a slice of irq time to wrong task when irq is in progress 1793 * task when irq is in progress while we read rq->clock. That is a worthy
1939 * while we read rq->clock. That is a worthy compromise in place of having 1794 * compromise in place of having locks on each irq in account_system_time.
1940 * locks on each irq in account_system_time.
1941 */ 1795 */
1942static DEFINE_PER_CPU(u64, cpu_hardirq_time); 1796static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1943static DEFINE_PER_CPU(u64, cpu_softirq_time); 1797static DEFINE_PER_CPU(u64, cpu_softirq_time);
@@ -1955,19 +1809,58 @@ void disable_sched_clock_irqtime(void)
1955 sched_clock_irqtime = 0; 1809 sched_clock_irqtime = 0;
1956} 1810}
1957 1811
1958static u64 irq_time_cpu(int cpu) 1812#ifndef CONFIG_64BIT
1813static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
1814
1815static inline void irq_time_write_begin(void)
1959{ 1816{
1960 if (!sched_clock_irqtime) 1817 __this_cpu_inc(irq_time_seq.sequence);
1961 return 0; 1818 smp_wmb();
1819}
1820
1821static inline void irq_time_write_end(void)
1822{
1823 smp_wmb();
1824 __this_cpu_inc(irq_time_seq.sequence);
1825}
1826
1827static inline u64 irq_time_read(int cpu)
1828{
1829 u64 irq_time;
1830 unsigned seq;
1831
1832 do {
1833 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1834 irq_time = per_cpu(cpu_softirq_time, cpu) +
1835 per_cpu(cpu_hardirq_time, cpu);
1836 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1962 1837
1838 return irq_time;
1839}
1840#else /* CONFIG_64BIT */
1841static inline void irq_time_write_begin(void)
1842{
1843}
1844
1845static inline void irq_time_write_end(void)
1846{
1847}
1848
1849static inline u64 irq_time_read(int cpu)
1850{
1963 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); 1851 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1964} 1852}
1853#endif /* CONFIG_64BIT */
1965 1854
1855/*
1856 * Called before incrementing preempt_count on {soft,}irq_enter
1857 * and before decrementing preempt_count on {soft,}irq_exit.
1858 */
1966void account_system_vtime(struct task_struct *curr) 1859void account_system_vtime(struct task_struct *curr)
1967{ 1860{
1968 unsigned long flags; 1861 unsigned long flags;
1862 s64 delta;
1969 int cpu; 1863 int cpu;
1970 u64 now, delta;
1971 1864
1972 if (!sched_clock_irqtime) 1865 if (!sched_clock_irqtime)
1973 return; 1866 return;
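
Editor's note: on 32-bit builds the new irq_time_read() guards the two per-cpu u64 counters with a seqcount so readers never observe a torn 64-bit update. The user-space sketch below shows the same begin/retry pattern for a single 64-bit value, using volatile plus GCC barriers in place of the kernel's seqcount_t and smp_wmb()/smp_rmb(); it is a model of the idea, not the kernel API.

#include <pthread.h>
#include <stdio.h>
#include <stdint.h>

static volatile unsigned int seq;	/* even = stable, odd = write in progress */
static uint64_t irq_time;		/* value a 32-bit CPU cannot update atomically */

static void write_irq_time(uint64_t delta)
{
	__sync_fetch_and_add(&seq, 1);		/* begin: make the count odd */
	__sync_synchronize();			/* like smp_wmb() */
	irq_time += delta;			/* possibly torn on 32-bit */
	__sync_synchronize();			/* like smp_wmb() */
	__sync_fetch_and_add(&seq, 1);		/* end: make the count even again */
}

static uint64_t read_irq_time(void)
{
	unsigned int s;
	uint64_t v;

	do {
		while ((s = seq) & 1)		/* writer active: wait */
			;
		__sync_synchronize();		/* like smp_rmb() */
		v = irq_time;
		__sync_synchronize();		/* like smp_rmb() */
	} while (seq != s);			/* retry if a write slipped in */
	return v;
}

static void *writer(void *arg)
{
	for (int i = 0; i < 1000000; i++)
		write_irq_time(1);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, writer, NULL);
	for (int i = 0; i < 10; i++)
		printf("snapshot: %llu\n", (unsigned long long)read_irq_time());
	pthread_join(t, NULL);
	printf("final: %llu\n", (unsigned long long)read_irq_time());
	return 0;
}
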
@@ -1975,9 +1868,10 @@ void account_system_vtime(struct task_struct *curr)
1975 local_irq_save(flags); 1868 local_irq_save(flags);
1976 1869
1977 cpu = smp_processor_id(); 1870 cpu = smp_processor_id();
1978 now = sched_clock_cpu(cpu); 1871 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
1979 delta = now - per_cpu(irq_start_time, cpu); 1872 __this_cpu_add(irq_start_time, delta);
1980 per_cpu(irq_start_time, cpu) = now; 1873
1874 irq_time_write_begin();
1981 /* 1875 /*
1982 * We do not account for softirq time from ksoftirqd here. 1876 * We do not account for softirq time from ksoftirqd here.
1983 * We want to continue accounting softirq time to ksoftirqd thread 1877 * We want to continue accounting softirq time to ksoftirqd thread
@@ -1985,37 +1879,60 @@ void account_system_vtime(struct task_struct *curr)
1985 * that do not consume any time, but still want to run. 1879 * that do not consume any time, but still want to run.
1986 */ 1880 */
1987 if (hardirq_count()) 1881 if (hardirq_count())
1988 per_cpu(cpu_hardirq_time, cpu) += delta; 1882 __this_cpu_add(cpu_hardirq_time, delta);
1989 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) 1883 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
1990 per_cpu(cpu_softirq_time, cpu) += delta; 1884 __this_cpu_add(cpu_softirq_time, delta);
1991 1885
1886 irq_time_write_end();
1992 local_irq_restore(flags); 1887 local_irq_restore(flags);
1993} 1888}
1994EXPORT_SYMBOL_GPL(account_system_vtime); 1889EXPORT_SYMBOL_GPL(account_system_vtime);
1995 1890
1996static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) 1891static void update_rq_clock_task(struct rq *rq, s64 delta)
1997{ 1892{
1998 if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) { 1893 s64 irq_delta;
1999 u64 delta_irq = curr_irq_time - rq->prev_irq_time; 1894
2000 rq->prev_irq_time = curr_irq_time; 1895 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
2001 sched_rt_avg_update(rq, delta_irq); 1896
2002 } 1897 /*
1898 * Since irq_time is only updated on {soft,}irq_exit, we might run into
1899 * this case when a previous update_rq_clock() happened inside a
1900 * {soft,}irq region.
1901 *
1902 * When this happens, we stop ->clock_task and only update the
1903 * prev_irq_time stamp to account for the part that fit, so that a next
1904 * update will consume the rest. This ensures ->clock_task is
1905 * monotonic.
1906 *
1907 * It does however cause some slight misattribution of {soft,}irq
1908 * time; a more accurate solution would be to update the irq_time using
1909 * the current rq->clock timestamp, except that would require using
1910 * atomic ops.
1911 */
1912 if (irq_delta > delta)
1913 irq_delta = delta;
1914
1915 rq->prev_irq_time += irq_delta;
1916 delta -= irq_delta;
1917 rq->clock_task += delta;
1918
1919 if (irq_delta && sched_feat(NONIRQ_POWER))
1920 sched_rt_avg_update(rq, irq_delta);
2003} 1921}
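The clamp described in the comment above is easier to see with numbers. A compressed sketch under assumed values; demo_clock_task_step() is illustrative, not kernel code:

#include <linux/types.h>

/*
 * Example: rq->clock advanced by delta = 2000 ns, but 3000 ns of irq time
 * accrued since prev_irq_time was last sampled (the previous update ran
 * inside an irq region).  irq_delta is clamped to 2000, clock_task does
 * not advance, and the leftover 1000 ns is subtracted from the next
 * update's delta, which keeps clock_task monotonic.
 */
static void demo_clock_task_step(u64 *clock_task, u64 *prev_irq_time,
				 u64 irq_time_now, s64 delta)
{
	s64 irq_delta = irq_time_now - *prev_irq_time;

	if (irq_delta > delta)
		irq_delta = delta;

	*prev_irq_time += irq_delta;
	*clock_task += delta - irq_delta;
}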
2004 1922
2005#else 1923#else /* CONFIG_IRQ_TIME_ACCOUNTING */
2006 1924
2007static u64 irq_time_cpu(int cpu) 1925static void update_rq_clock_task(struct rq *rq, s64 delta)
2008{ 1926{
2009 return 0; 1927 rq->clock_task += delta;
2010} 1928}
2011 1929
2012static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { } 1930#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2013
2014#endif
2015 1931
2016#include "sched_idletask.c" 1932#include "sched_idletask.c"
2017#include "sched_fair.c" 1933#include "sched_fair.c"
2018#include "sched_rt.c" 1934#include "sched_rt.c"
1935#include "sched_autogroup.c"
2019#include "sched_stoptask.c" 1936#include "sched_stoptask.c"
2020#ifdef CONFIG_SCHED_DEBUG 1937#ifdef CONFIG_SCHED_DEBUG
2021# include "sched_debug.c" 1938# include "sched_debug.c"
@@ -2118,6 +2035,31 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2118 p->sched_class->prio_changed(rq, p, oldprio, running); 2035 p->sched_class->prio_changed(rq, p, oldprio, running);
2119} 2036}
2120 2037
2038static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2039{
2040 const struct sched_class *class;
2041
2042 if (p->sched_class == rq->curr->sched_class) {
2043 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2044 } else {
2045 for_each_class(class) {
2046 if (class == rq->curr->sched_class)
2047 break;
2048 if (class == p->sched_class) {
2049 resched_task(rq->curr);
2050 break;
2051 }
2052 }
2053 }
2054
2055 /*
2056 * A queue event has occurred, and we're going to schedule. In
2057 * this case, we can save a useless back to back clock update.
2058 */
2059 if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
2060 rq->skip_clock_update = 1;
2061}
2062
2121#ifdef CONFIG_SMP 2063#ifdef CONFIG_SMP
2122/* 2064/*
2123 * Is this task likely cache-hot: 2065 * Is this task likely cache-hot:
@@ -2183,10 +2125,8 @@ static int migration_cpu_stop(void *data);
2183 * The task's runqueue lock must be held. 2125 * The task's runqueue lock must be held.
2184 * Returns true if you have to wait for migration thread. 2126 * Returns true if you have to wait for migration thread.
2185 */ 2127 */
2186static bool migrate_task(struct task_struct *p, int dest_cpu) 2128static bool migrate_task(struct task_struct *p, struct rq *rq)
2187{ 2129{
2188 struct rq *rq = task_rq(p);
2189
2190 /* 2130 /*
2191 * If the task is not on a runqueue (and not running), then 2131 * If the task is not on a runqueue (and not running), then
2192 * the next wake-up will properly place the task. 2132 * the next wake-up will properly place the task.
@@ -2366,18 +2306,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2366 return dest_cpu; 2306 return dest_cpu;
2367 2307
2368 /* No more Mr. Nice Guy. */ 2308 /* No more Mr. Nice Guy. */
2369 if (unlikely(dest_cpu >= nr_cpu_ids)) { 2309 dest_cpu = cpuset_cpus_allowed_fallback(p);
2370 dest_cpu = cpuset_cpus_allowed_fallback(p); 2310 /*
2371 /* 2311 * Don't tell them about moving exiting tasks or
2372 * Don't tell them about moving exiting tasks or 2312 * kernel threads (both mm NULL), since they never
2373 * kernel threads (both mm NULL), since they never 2313 * leave kernel.
2374 * leave kernel. 2314 */
2375 */ 2315 if (p->mm && printk_ratelimit()) {
2376 if (p->mm && printk_ratelimit()) { 2316 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
2377 printk(KERN_INFO "process %d (%s) no " 2317 task_pid_nr(p), p->comm, cpu);
2378 "longer affine to cpu%d\n",
2379 task_pid_nr(p), p->comm, cpu);
2380 }
2381 } 2318 }
2382 2319
2383 return dest_cpu; 2320 return dest_cpu;
@@ -2568,7 +2505,7 @@ out:
2568 * try_to_wake_up_local - try to wake up a local task with rq lock held 2505 * try_to_wake_up_local - try to wake up a local task with rq lock held
2569 * @p: the thread to be awakened 2506 * @p: the thread to be awakened
2570 * 2507 *
2571 * Put @p on the run-queue if it's not alredy there. The caller must 2508 * Put @p on the run-queue if it's not already there. The caller must
2572 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2509 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2573 * the current task. this_rq() stays locked over invocation. 2510 * the current task. this_rq() stays locked over invocation.
2574 */ 2511 */
@@ -2713,7 +2650,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
2713 /* Want to start with kernel preemption disabled. */ 2650 /* Want to start with kernel preemption disabled. */
2714 task_thread_info(p)->preempt_count = 1; 2651 task_thread_info(p)->preempt_count = 1;
2715#endif 2652#endif
2653#ifdef CONFIG_SMP
2716 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2654 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2655#endif
2717 2656
2718 put_cpu(); 2657 put_cpu();
2719} 2658}
@@ -3104,6 +3043,15 @@ static long calc_load_fold_active(struct rq *this_rq)
3104 return delta; 3043 return delta;
3105} 3044}
3106 3045
3046static unsigned long
3047calc_load(unsigned long load, unsigned long exp, unsigned long active)
3048{
3049 load *= exp;
3050 load += active * (FIXED_1 - exp);
3051 load += 1UL << (FSHIFT - 1);
3052 return load >> FSHIFT;
3053}
3054
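To make the fixed-point update concrete, here is one worked step, assuming the usual sched.h constants FSHIFT = 11, FIXED_1 = 2048 and EXP_1 = 1884 (1/exp(5 s / 1 min)):

/*
 * Worked example (sketch): one calc_load() step for the 1-minute average,
 * starting from 0 with two runnable tasks, so active = 2 * FIXED_1 = 4096:
 *
 *	load  = 0 * 1884		= 0
 *	load += 4096 * (2048 - 1884)	= 671744
 *	load += 1 << 10			= 672768	(round to nearest)
 *	load >>= 11			= 328		-> printed as 0.16
 *
 * One 5-second LOAD_FREQ period therefore moves the average about 8%
 * (1 - 1884/2048) of the way toward the instantaneous task count.
 */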
3107#ifdef CONFIG_NO_HZ 3055#ifdef CONFIG_NO_HZ
3108/* 3056/*
3109 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 3057 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
@@ -3133,6 +3081,128 @@ static long calc_load_fold_idle(void)
3133 3081
3134 return delta; 3082 return delta;
3135} 3083}
3084
3085/**
3086 * fixed_power_int - compute: x^n, in O(log n) time
3087 *
3088 * @x: base of the power
3089 * @frac_bits: fractional bits of @x
3090 * @n: power to raise @x to.
3091 *
3092 * By exploiting the relation between the definition of the natural power
3093 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
3094 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
3095 * (where: n_i \elem {0, 1}, the binary vector representing n),
3096 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
3097 * of course trivially computable in O(log_2 n), the length of our binary
3098 * vector.
3099 */
3100static unsigned long
3101fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
3102{
3103 unsigned long result = 1UL << frac_bits;
3104
3105 if (n) for (;;) {
3106 if (n & 1) {
3107 result *= x;
3108 result += 1UL << (frac_bits - 1);
3109 result >>= frac_bits;
3110 }
3111 n >>= 1;
3112 if (!n)
3113 break;
3114 x *= x;
3115 x += 1UL << (frac_bits - 1);
3116 x >>= frac_bits;
3117 }
3118
3119 return result;
3120}
3121
3122/*
3123 * a1 = a0 * e + a * (1 - e)
3124 *
3125 * a2 = a1 * e + a * (1 - e)
3126 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
3127 * = a0 * e^2 + a * (1 - e) * (1 + e)
3128 *
3129 * a3 = a2 * e + a * (1 - e)
3130 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
3131 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
3132 *
3133 * ...
3134 *
 3135 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^(n-1)) [1]
3136 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
3137 * = a0 * e^n + a * (1 - e^n)
3138 *
3139 * [1] application of the geometric series:
3140 *
3141 * n 1 - x^(n+1)
3142 * S_n := \Sum x^i = -------------
3143 * i=0 1 - x
3144 */
3145static unsigned long
3146calc_load_n(unsigned long load, unsigned long exp,
3147 unsigned long active, unsigned int n)
3148{
3149
3150 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
3151}
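What calc_load_n() buys over calling calc_load() in a loop is aging the average over n missed periods at once, by raising the decay factor to the n-th power in fixed point. A sketch with assumed constants (EXP_1 = 1884, FSHIFT = 11):

/*
 * Sketch: if nothing was runnable for n = 3 missed LOAD_FREQ periods,
 * the 1-minute average decays as
 *
 *	avenrun' = calc_load_n(avenrun, EXP_1, 0, 3)
 *		 = calc_load(avenrun, fixed_power_int(1884, 11, 3), 0)
 *		 ~ avenrun * (1884/2048)^3 ~ 0.78 * avenrun
 *
 * fixed_power_int() squares-and-multiplies, so even a very long tickless
 * idle period costs O(log n) multiplies rather than one pass per period.
 */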
3152
3153/*
3154 * NO_HZ can leave us missing all per-cpu ticks calling
3155 * calc_load_account_active(), but since an idle CPU folds its delta into
 3156 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
3157 * in the pending idle delta if our idle period crossed a load cycle boundary.
3158 *
3159 * Once we've updated the global active value, we need to apply the exponential
3160 * weights adjusted to the number of cycles missed.
3161 */
3162static void calc_global_nohz(unsigned long ticks)
3163{
3164 long delta, active, n;
3165
3166 if (time_before(jiffies, calc_load_update))
3167 return;
3168
3169 /*
3170 * If we crossed a calc_load_update boundary, make sure to fold
3171 * any pending idle changes, the respective CPUs might have
3172 * missed the tick driven calc_load_account_active() update
3173 * due to NO_HZ.
3174 */
3175 delta = calc_load_fold_idle();
3176 if (delta)
3177 atomic_long_add(delta, &calc_load_tasks);
3178
3179 /*
3180 * If we were idle for multiple load cycles, apply them.
3181 */
3182 if (ticks >= LOAD_FREQ) {
3183 n = ticks / LOAD_FREQ;
3184
3185 active = atomic_long_read(&calc_load_tasks);
3186 active = active > 0 ? active * FIXED_1 : 0;
3187
3188 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
3189 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
3190 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
3191
3192 calc_load_update += n * LOAD_FREQ;
3193 }
3194
3195 /*
 3196 * It's possible the remainder of the above division also crosses
 3197 * a LOAD_FREQ period; the regular check in calc_global_load(),
 3198 * which comes after this, will take care of that.
3199 *
3200 * Consider us being 11 ticks before a cycle completion, and us
3201 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
3202 * age us 4 cycles, and the test in calc_global_load() will
3203 * pick up the final one.
3204 */
3205}
3136#else 3206#else
3137static void calc_load_account_idle(struct rq *this_rq) 3207static void calc_load_account_idle(struct rq *this_rq)
3138{ 3208{
@@ -3142,6 +3212,10 @@ static inline long calc_load_fold_idle(void)
3142{ 3212{
3143 return 0; 3213 return 0;
3144} 3214}
3215
3216static void calc_global_nohz(unsigned long ticks)
3217{
3218}
3145#endif 3219#endif
3146 3220
3147/** 3221/**
@@ -3159,24 +3233,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
3159 loads[2] = (avenrun[2] + offset) << shift; 3233 loads[2] = (avenrun[2] + offset) << shift;
3160} 3234}
3161 3235
3162static unsigned long
3163calc_load(unsigned long load, unsigned long exp, unsigned long active)
3164{
3165 load *= exp;
3166 load += active * (FIXED_1 - exp);
3167 return load >> FSHIFT;
3168}
3169
3170/* 3236/*
3171 * calc_load - update the avenrun load estimates 10 ticks after the 3237 * calc_load - update the avenrun load estimates 10 ticks after the
3172 * CPUs have updated calc_load_tasks. 3238 * CPUs have updated calc_load_tasks.
3173 */ 3239 */
3174void calc_global_load(void) 3240void calc_global_load(unsigned long ticks)
3175{ 3241{
3176 unsigned long upd = calc_load_update + 10;
3177 long active; 3242 long active;
3178 3243
3179 if (time_before(jiffies, upd)) 3244 calc_global_nohz(ticks);
3245
3246 if (time_before(jiffies, calc_load_update + 10))
3180 return; 3247 return;
3181 3248
3182 active = atomic_long_read(&calc_load_tasks); 3249 active = atomic_long_read(&calc_load_tasks);
@@ -3349,7 +3416,7 @@ void sched_exec(void)
3349 * select_task_rq() can race against ->cpus_allowed 3416 * select_task_rq() can race against ->cpus_allowed
3350 */ 3417 */
3351 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && 3418 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3352 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { 3419 likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
3353 struct migration_arg arg = { p, dest_cpu }; 3420 struct migration_arg arg = { p, dest_cpu };
3354 3421
3355 task_rq_unlock(rq, &flags); 3422 task_rq_unlock(rq, &flags);
@@ -3820,7 +3887,7 @@ static inline void schedule_debug(struct task_struct *prev)
3820 schedstat_inc(this_rq(), sched_count); 3887 schedstat_inc(this_rq(), sched_count);
3821#ifdef CONFIG_SCHEDSTATS 3888#ifdef CONFIG_SCHEDSTATS
3822 if (unlikely(prev->lock_depth >= 0)) { 3889 if (unlikely(prev->lock_depth >= 0)) {
3823 schedstat_inc(this_rq(), bkl_count); 3890 schedstat_inc(this_rq(), rq_sched_info.bkl_count);
3824 schedstat_inc(prev, sched_info.bkl_count); 3891 schedstat_inc(prev, sched_info.bkl_count);
3825 } 3892 }
3826#endif 3893#endif
@@ -3830,7 +3897,6 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
3830{ 3897{
3831 if (prev->se.on_rq) 3898 if (prev->se.on_rq)
3832 update_rq_clock(rq); 3899 update_rq_clock(rq);
3833 rq->skip_clock_update = 0;
3834 prev->sched_class->put_prev_task(rq, prev); 3900 prev->sched_class->put_prev_task(rq, prev);
3835} 3901}
3836 3902
@@ -3888,7 +3954,6 @@ need_resched_nonpreemptible:
3888 hrtick_clear(rq); 3954 hrtick_clear(rq);
3889 3955
3890 raw_spin_lock_irq(&rq->lock); 3956 raw_spin_lock_irq(&rq->lock);
3891 clear_tsk_need_resched(prev);
3892 3957
3893 switch_count = &prev->nivcsw; 3958 switch_count = &prev->nivcsw;
3894 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3959 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -3920,6 +3985,8 @@ need_resched_nonpreemptible:
3920 3985
3921 put_prev_task(rq, prev); 3986 put_prev_task(rq, prev);
3922 next = pick_next_task(rq); 3987 next = pick_next_task(rq);
3988 clear_tsk_need_resched(prev);
3989 rq->skip_clock_update = 0;
3923 3990
3924 if (likely(prev != next)) { 3991 if (likely(prev != next)) {
3925 sched_info_switch(prev, next); 3992 sched_info_switch(prev, next);
@@ -4014,7 +4081,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4014 if (task_thread_info(rq->curr) != owner || need_resched()) 4081 if (task_thread_info(rq->curr) != owner || need_resched())
4015 return 0; 4082 return 0;
4016 4083
4017 cpu_relax(); 4084 arch_mutex_cpu_relax();
4018 } 4085 }
4019 4086
4020 return 1; 4087 return 1;
@@ -4326,7 +4393,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
4326 * This waits for either a completion of a specific task to be signaled or for a 4393 * This waits for either a completion of a specific task to be signaled or for a
4327 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 4394 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4328 */ 4395 */
4329unsigned long __sched 4396long __sched
4330wait_for_completion_interruptible_timeout(struct completion *x, 4397wait_for_completion_interruptible_timeout(struct completion *x,
4331 unsigned long timeout) 4398 unsigned long timeout)
4332{ 4399{
@@ -4359,7 +4426,7 @@ EXPORT_SYMBOL(wait_for_completion_killable);
4359 * signaled or for a specified timeout to expire. It can be 4426 * signaled or for a specified timeout to expire. It can be
4360 * interrupted by a kill signal. The timeout is in jiffies. 4427 * interrupted by a kill signal. The timeout is in jiffies.
4361 */ 4428 */
4362unsigned long __sched 4429long __sched
4363wait_for_completion_killable_timeout(struct completion *x, 4430wait_for_completion_killable_timeout(struct completion *x,
4364 unsigned long timeout) 4431 unsigned long timeout)
4365{ 4432{
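The switch from unsigned long to long matters to callers: the interruptible and killable variants can return -ERESTARTSYS, which an unsigned return type silently turns into a huge remaining-time value. A hedged usage sketch (the demo_wait() caller is illustrative, not kernel code):

#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/jiffies.h>

static int demo_wait(struct completion *done)
{
	long ret = wait_for_completion_interruptible_timeout(done,
						msecs_to_jiffies(100));

	if (ret < 0)		/* interrupted by a signal: -ERESTARTSYS */
		return ret;
	if (ret == 0)		/* 100 ms elapsed without the completion */
		return -ETIMEDOUT;
	return 0;		/* ret > 0: jiffies left when it completed */
}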
@@ -4701,7 +4768,7 @@ static bool check_same_owner(struct task_struct *p)
4701} 4768}
4702 4769
4703static int __sched_setscheduler(struct task_struct *p, int policy, 4770static int __sched_setscheduler(struct task_struct *p, int policy,
4704 struct sched_param *param, bool user) 4771 const struct sched_param *param, bool user)
4705{ 4772{
4706 int retval, oldprio, oldpolicy = -1, on_rq, running; 4773 int retval, oldprio, oldpolicy = -1, on_rq, running;
4707 unsigned long flags; 4774 unsigned long flags;
@@ -4804,7 +4871,8 @@ recheck:
4804 * assigned. 4871 * assigned.
4805 */ 4872 */
4806 if (rt_bandwidth_enabled() && rt_policy(policy) && 4873 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4807 task_group(p)->rt_bandwidth.rt_runtime == 0) { 4874 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4875 !task_group_is_autogroup(task_group(p))) {
4808 __task_rq_unlock(rq); 4876 __task_rq_unlock(rq);
4809 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4877 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4810 return -EPERM; 4878 return -EPERM;
@@ -4856,7 +4924,7 @@ recheck:
4856 * NOTE that the task may be already dead. 4924 * NOTE that the task may be already dead.
4857 */ 4925 */
4858int sched_setscheduler(struct task_struct *p, int policy, 4926int sched_setscheduler(struct task_struct *p, int policy,
4859 struct sched_param *param) 4927 const struct sched_param *param)
4860{ 4928{
4861 return __sched_setscheduler(p, policy, param, true); 4929 return __sched_setscheduler(p, policy, param, true);
4862} 4930}
@@ -4874,7 +4942,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
4874 * but our caller might not have that capability. 4942 * but our caller might not have that capability.
4875 */ 4943 */
4876int sched_setscheduler_nocheck(struct task_struct *p, int policy, 4944int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4877 struct sched_param *param) 4945 const struct sched_param *param)
4878{ 4946{
4879 return __sched_setscheduler(p, policy, param, false); 4947 return __sched_setscheduler(p, policy, param, false);
4880} 4948}
@@ -5390,7 +5458,7 @@ void sched_show_task(struct task_struct *p)
5390 unsigned state; 5458 unsigned state;
5391 5459
5392 state = p->state ? __ffs(p->state) + 1 : 0; 5460 state = p->state ? __ffs(p->state) + 1 : 0;
5393 printk(KERN_INFO "%-13.13s %c", p->comm, 5461 printk(KERN_INFO "%-15.15s %c", p->comm,
5394 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5462 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5395#if BITS_PER_LONG == 32 5463#if BITS_PER_LONG == 32
5396 if (state == TASK_RUNNING) 5464 if (state == TASK_RUNNING)
@@ -5554,7 +5622,6 @@ static void update_sysctl(void)
5554 SET_SYSCTL(sched_min_granularity); 5622 SET_SYSCTL(sched_min_granularity);
5555 SET_SYSCTL(sched_latency); 5623 SET_SYSCTL(sched_latency);
5556 SET_SYSCTL(sched_wakeup_granularity); 5624 SET_SYSCTL(sched_wakeup_granularity);
5557 SET_SYSCTL(sched_shares_ratelimit);
5558#undef SET_SYSCTL 5625#undef SET_SYSCTL
5559} 5626}
5560 5627
@@ -5630,7 +5697,7 @@ again:
5630 goto out; 5697 goto out;
5631 5698
5632 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 5699 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5633 if (migrate_task(p, dest_cpu)) { 5700 if (migrate_task(p, rq)) {
5634 struct migration_arg arg = { p, dest_cpu }; 5701 struct migration_arg arg = { p, dest_cpu };
5635 /* Need help from migration thread: drop lock and wait. */ 5702 /* Need help from migration thread: drop lock and wait. */
5636 task_rq_unlock(rq, &flags); 5703 task_rq_unlock(rq, &flags);
@@ -5712,29 +5779,20 @@ static int migration_cpu_stop(void *data)
5712} 5779}
5713 5780
5714#ifdef CONFIG_HOTPLUG_CPU 5781#ifdef CONFIG_HOTPLUG_CPU
5782
5715/* 5783/*
5716 * Figure out where task on dead CPU should go, use force if necessary. 5784 * Ensures that the idle task is using init_mm right before its cpu goes
5785 * offline.
5717 */ 5786 */
5718void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5787void idle_task_exit(void)
5719{ 5788{
5720 struct rq *rq = cpu_rq(dead_cpu); 5789 struct mm_struct *mm = current->active_mm;
5721 int needs_cpu, uninitialized_var(dest_cpu);
5722 unsigned long flags;
5723 5790
5724 local_irq_save(flags); 5791 BUG_ON(cpu_online(smp_processor_id()));
5725 5792
5726 raw_spin_lock(&rq->lock); 5793 if (mm != &init_mm)
5727 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); 5794 switch_mm(mm, &init_mm, current);
5728 if (needs_cpu) 5795 mmdrop(mm);
5729 dest_cpu = select_fallback_rq(dead_cpu, p);
5730 raw_spin_unlock(&rq->lock);
5731 /*
5732 * It can only fail if we race with set_cpus_allowed(),
5733 * in the racer should migrate the task anyway.
5734 */
5735 if (needs_cpu)
5736 __migrate_task(p, dead_cpu, dest_cpu);
5737 local_irq_restore(flags);
5738} 5796}
5739 5797
5740/* 5798/*
@@ -5747,128 +5805,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5747static void migrate_nr_uninterruptible(struct rq *rq_src) 5805static void migrate_nr_uninterruptible(struct rq *rq_src)
5748{ 5806{
5749 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 5807 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
5750 unsigned long flags;
5751 5808
5752 local_irq_save(flags);
5753 double_rq_lock(rq_src, rq_dest);
5754 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 5809 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5755 rq_src->nr_uninterruptible = 0; 5810 rq_src->nr_uninterruptible = 0;
5756 double_rq_unlock(rq_src, rq_dest);
5757 local_irq_restore(flags);
5758}
5759
5760/* Run through task list and migrate tasks from the dead cpu. */
5761static void migrate_live_tasks(int src_cpu)
5762{
5763 struct task_struct *p, *t;
5764
5765 read_lock(&tasklist_lock);
5766
5767 do_each_thread(t, p) {
5768 if (p == current)
5769 continue;
5770
5771 if (task_cpu(p) == src_cpu)
5772 move_task_off_dead_cpu(src_cpu, p);
5773 } while_each_thread(t, p);
5774
5775 read_unlock(&tasklist_lock);
5776} 5811}
5777 5812
5778/* 5813/*
5779 * Schedules idle task to be the next runnable task on current CPU. 5814 * remove the tasks which were accounted by rq from calc_load_tasks.
5780 * It does so by boosting its priority to highest possible.
5781 * Used by CPU offline code.
5782 */ 5815 */
5783void sched_idle_next(void) 5816static void calc_global_load_remove(struct rq *rq)
5784{ 5817{
5785 int this_cpu = smp_processor_id(); 5818 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5786 struct rq *rq = cpu_rq(this_cpu); 5819 rq->calc_load_active = 0;
5787 struct task_struct *p = rq->idle;
5788 unsigned long flags;
5789
5790 /* cpu has to be offline */
5791 BUG_ON(cpu_online(this_cpu));
5792
5793 /*
5794 * Strictly not necessary since rest of the CPUs are stopped by now
5795 * and interrupts disabled on the current cpu.
5796 */
5797 raw_spin_lock_irqsave(&rq->lock, flags);
5798
5799 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5800
5801 activate_task(rq, p, 0);
5802
5803 raw_spin_unlock_irqrestore(&rq->lock, flags);
5804} 5820}
5805 5821
5806/* 5822/*
5807 * Ensures that the idle task is using init_mm right before its cpu goes 5823 * Migrate all tasks from the rq, sleeping tasks will be migrated by
5808 * offline. 5824 * try_to_wake_up()->select_task_rq().
5825 *
 5826 * Called with rq->lock held even though we're in stop_machine() and
 5827 * there's no concurrency possible; we hold the required locks anyway
5828 * because of lock validation efforts.
5809 */ 5829 */
5810void idle_task_exit(void) 5830static void migrate_tasks(unsigned int dead_cpu)
5811{
5812 struct mm_struct *mm = current->active_mm;
5813
5814 BUG_ON(cpu_online(smp_processor_id()));
5815
5816 if (mm != &init_mm)
5817 switch_mm(mm, &init_mm, current);
5818 mmdrop(mm);
5819}
5820
5821/* called under rq->lock with disabled interrupts */
5822static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5823{ 5831{
5824 struct rq *rq = cpu_rq(dead_cpu); 5832 struct rq *rq = cpu_rq(dead_cpu);
5825 5833 struct task_struct *next, *stop = rq->stop;
5826 /* Must be exiting, otherwise would be on tasklist. */ 5834 int dest_cpu;
5827 BUG_ON(!p->exit_state);
5828
5829 /* Cannot have done final schedule yet: would have vanished. */
5830 BUG_ON(p->state == TASK_DEAD);
5831
5832 get_task_struct(p);
5833 5835
5834 /* 5836 /*
5835 * Drop lock around migration; if someone else moves it, 5837 * Fudge the rq selection such that the below task selection loop
5836 * that's OK. No task can be added to this CPU, so iteration is 5838 * doesn't get stuck on the currently eligible stop task.
5837 * fine. 5839 *
5840 * We're currently inside stop_machine() and the rq is either stuck
 5841 * in the stop_machine_cpu_stop() loop, or we're executing this code;
 5842 * either way, we should never end up calling schedule() until we're
5843 * done here.
5838 */ 5844 */
5839 raw_spin_unlock_irq(&rq->lock); 5845 rq->stop = NULL;
5840 move_task_off_dead_cpu(dead_cpu, p);
5841 raw_spin_lock_irq(&rq->lock);
5842
5843 put_task_struct(p);
5844}
5845
5846/* release_task() removes task from tasklist, so we won't find dead tasks. */
5847static void migrate_dead_tasks(unsigned int dead_cpu)
5848{
5849 struct rq *rq = cpu_rq(dead_cpu);
5850 struct task_struct *next;
5851 5846
5852 for ( ; ; ) { 5847 for ( ; ; ) {
5853 if (!rq->nr_running) 5848 /*
 5849 * There's this thread running; bail when that's the only
5850 * remaining thread.
5851 */
5852 if (rq->nr_running == 1)
5854 break; 5853 break;
5854
5855 next = pick_next_task(rq); 5855 next = pick_next_task(rq);
5856 if (!next) 5856 BUG_ON(!next);
5857 break;
5858 next->sched_class->put_prev_task(rq, next); 5857 next->sched_class->put_prev_task(rq, next);
5859 migrate_dead(dead_cpu, next);
5860 5858
5859 /* Find suitable destination for @next, with force if needed. */
5860 dest_cpu = select_fallback_rq(dead_cpu, next);
5861 raw_spin_unlock(&rq->lock);
5862
5863 __migrate_task(next, dead_cpu, dest_cpu);
5864
5865 raw_spin_lock(&rq->lock);
5861 } 5866 }
5862}
5863 5867
5864/* 5868 rq->stop = stop;
5865 * remove the tasks which were accounted by rq from calc_load_tasks.
5866 */
5867static void calc_global_load_remove(struct rq *rq)
5868{
5869 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5870 rq->calc_load_active = 0;
5871} 5869}
5870
5872#endif /* CONFIG_HOTPLUG_CPU */ 5871#endif /* CONFIG_HOTPLUG_CPU */
5873 5872
5874#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 5873#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -6078,15 +6077,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6078 unsigned long flags; 6077 unsigned long flags;
6079 struct rq *rq = cpu_rq(cpu); 6078 struct rq *rq = cpu_rq(cpu);
6080 6079
6081 switch (action) { 6080 switch (action & ~CPU_TASKS_FROZEN) {
6082 6081
6083 case CPU_UP_PREPARE: 6082 case CPU_UP_PREPARE:
6084 case CPU_UP_PREPARE_FROZEN:
6085 rq->calc_load_update = calc_load_update; 6083 rq->calc_load_update = calc_load_update;
6086 break; 6084 break;
6087 6085
6088 case CPU_ONLINE: 6086 case CPU_ONLINE:
6089 case CPU_ONLINE_FROZEN:
6090 /* Update our root-domain */ 6087 /* Update our root-domain */
6091 raw_spin_lock_irqsave(&rq->lock, flags); 6088 raw_spin_lock_irqsave(&rq->lock, flags);
6092 if (rq->rd) { 6089 if (rq->rd) {
@@ -6098,30 +6095,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6098 break; 6095 break;
6099 6096
6100#ifdef CONFIG_HOTPLUG_CPU 6097#ifdef CONFIG_HOTPLUG_CPU
6101 case CPU_DEAD:
6102 case CPU_DEAD_FROZEN:
6103 migrate_live_tasks(cpu);
6104 /* Idle task back to normal (off runqueue, low prio) */
6105 raw_spin_lock_irq(&rq->lock);
6106 deactivate_task(rq, rq->idle, 0);
6107 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
6108 rq->idle->sched_class = &idle_sched_class;
6109 migrate_dead_tasks(cpu);
6110 raw_spin_unlock_irq(&rq->lock);
6111 migrate_nr_uninterruptible(rq);
6112 BUG_ON(rq->nr_running != 0);
6113 calc_global_load_remove(rq);
6114 break;
6115
6116 case CPU_DYING: 6098 case CPU_DYING:
6117 case CPU_DYING_FROZEN:
6118 /* Update our root-domain */ 6099 /* Update our root-domain */
6119 raw_spin_lock_irqsave(&rq->lock, flags); 6100 raw_spin_lock_irqsave(&rq->lock, flags);
6120 if (rq->rd) { 6101 if (rq->rd) {
6121 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6102 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6122 set_rq_offline(rq); 6103 set_rq_offline(rq);
6123 } 6104 }
6105 migrate_tasks(cpu);
6106 BUG_ON(rq->nr_running != 1); /* the migration thread */
6124 raw_spin_unlock_irqrestore(&rq->lock, flags); 6107 raw_spin_unlock_irqrestore(&rq->lock, flags);
6108
6109 migrate_nr_uninterruptible(rq);
6110 calc_global_load_remove(rq);
6125 break; 6111 break;
6126#endif 6112#endif
6127 } 6113 }
@@ -6960,6 +6946,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6960 if (cpu != group_first_cpu(sd->groups)) 6946 if (cpu != group_first_cpu(sd->groups))
6961 return; 6947 return;
6962 6948
6949 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
6950
6963 child = sd->child; 6951 child = sd->child;
6964 6952
6965 sd->groups->cpu_power = 0; 6953 sd->groups->cpu_power = 0;
@@ -7850,18 +7838,16 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7850 7838
7851#ifdef CONFIG_FAIR_GROUP_SCHED 7839#ifdef CONFIG_FAIR_GROUP_SCHED
7852static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 7840static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7853 struct sched_entity *se, int cpu, int add, 7841 struct sched_entity *se, int cpu,
7854 struct sched_entity *parent) 7842 struct sched_entity *parent)
7855{ 7843{
7856 struct rq *rq = cpu_rq(cpu); 7844 struct rq *rq = cpu_rq(cpu);
7857 tg->cfs_rq[cpu] = cfs_rq; 7845 tg->cfs_rq[cpu] = cfs_rq;
7858 init_cfs_rq(cfs_rq, rq); 7846 init_cfs_rq(cfs_rq, rq);
7859 cfs_rq->tg = tg; 7847 cfs_rq->tg = tg;
7860 if (add)
7861 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7862 7848
7863 tg->se[cpu] = se; 7849 tg->se[cpu] = se;
7864 /* se could be NULL for init_task_group */ 7850 /* se could be NULL for root_task_group */
7865 if (!se) 7851 if (!se)
7866 return; 7852 return;
7867 7853
@@ -7871,15 +7857,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7871 se->cfs_rq = parent->my_q; 7857 se->cfs_rq = parent->my_q;
7872 7858
7873 se->my_q = cfs_rq; 7859 se->my_q = cfs_rq;
7874 se->load.weight = tg->shares; 7860 update_load_set(&se->load, 0);
7875 se->load.inv_weight = 0;
7876 se->parent = parent; 7861 se->parent = parent;
7877} 7862}
7878#endif 7863#endif
7879 7864
7880#ifdef CONFIG_RT_GROUP_SCHED 7865#ifdef CONFIG_RT_GROUP_SCHED
7881static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 7866static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7882 struct sched_rt_entity *rt_se, int cpu, int add, 7867 struct sched_rt_entity *rt_se, int cpu,
7883 struct sched_rt_entity *parent) 7868 struct sched_rt_entity *parent)
7884{ 7869{
7885 struct rq *rq = cpu_rq(cpu); 7870 struct rq *rq = cpu_rq(cpu);
@@ -7888,8 +7873,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7888 init_rt_rq(rt_rq, rq); 7873 init_rt_rq(rt_rq, rq);
7889 rt_rq->tg = tg; 7874 rt_rq->tg = tg;
7890 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7875 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
7891 if (add)
7892 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7893 7876
7894 tg->rt_se[cpu] = rt_se; 7877 tg->rt_se[cpu] = rt_se;
7895 if (!rt_se) 7878 if (!rt_se)
@@ -7924,18 +7907,18 @@ void __init sched_init(void)
7924 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 7907 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7925 7908
7926#ifdef CONFIG_FAIR_GROUP_SCHED 7909#ifdef CONFIG_FAIR_GROUP_SCHED
7927 init_task_group.se = (struct sched_entity **)ptr; 7910 root_task_group.se = (struct sched_entity **)ptr;
7928 ptr += nr_cpu_ids * sizeof(void **); 7911 ptr += nr_cpu_ids * sizeof(void **);
7929 7912
7930 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7913 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7931 ptr += nr_cpu_ids * sizeof(void **); 7914 ptr += nr_cpu_ids * sizeof(void **);
7932 7915
7933#endif /* CONFIG_FAIR_GROUP_SCHED */ 7916#endif /* CONFIG_FAIR_GROUP_SCHED */
7934#ifdef CONFIG_RT_GROUP_SCHED 7917#ifdef CONFIG_RT_GROUP_SCHED
7935 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7918 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
7936 ptr += nr_cpu_ids * sizeof(void **); 7919 ptr += nr_cpu_ids * sizeof(void **);
7937 7920
7938 init_task_group.rt_rq = (struct rt_rq **)ptr; 7921 root_task_group.rt_rq = (struct rt_rq **)ptr;
7939 ptr += nr_cpu_ids * sizeof(void **); 7922 ptr += nr_cpu_ids * sizeof(void **);
7940 7923
7941#endif /* CONFIG_RT_GROUP_SCHED */ 7924#endif /* CONFIG_RT_GROUP_SCHED */
@@ -7955,20 +7938,16 @@ void __init sched_init(void)
7955 global_rt_period(), global_rt_runtime()); 7938 global_rt_period(), global_rt_runtime());
7956 7939
7957#ifdef CONFIG_RT_GROUP_SCHED 7940#ifdef CONFIG_RT_GROUP_SCHED
7958 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7941 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7959 global_rt_period(), global_rt_runtime()); 7942 global_rt_period(), global_rt_runtime());
7960#endif /* CONFIG_RT_GROUP_SCHED */ 7943#endif /* CONFIG_RT_GROUP_SCHED */
7961 7944
7962#ifdef CONFIG_CGROUP_SCHED 7945#ifdef CONFIG_CGROUP_SCHED
7963 list_add(&init_task_group.list, &task_groups); 7946 list_add(&root_task_group.list, &task_groups);
7964 INIT_LIST_HEAD(&init_task_group.children); 7947 INIT_LIST_HEAD(&root_task_group.children);
7965 7948 autogroup_init(&init_task);
7966#endif /* CONFIG_CGROUP_SCHED */ 7949#endif /* CONFIG_CGROUP_SCHED */
7967 7950
7968#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
7969 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
7970 __alignof__(unsigned long));
7971#endif
7972 for_each_possible_cpu(i) { 7951 for_each_possible_cpu(i) {
7973 struct rq *rq; 7952 struct rq *rq;
7974 7953
@@ -7980,38 +7959,34 @@ void __init sched_init(void)
7980 init_cfs_rq(&rq->cfs, rq); 7959 init_cfs_rq(&rq->cfs, rq);
7981 init_rt_rq(&rq->rt, rq); 7960 init_rt_rq(&rq->rt, rq);
7982#ifdef CONFIG_FAIR_GROUP_SCHED 7961#ifdef CONFIG_FAIR_GROUP_SCHED
7983 init_task_group.shares = init_task_group_load; 7962 root_task_group.shares = root_task_group_load;
7984 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 7963 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7985#ifdef CONFIG_CGROUP_SCHED
7986 /* 7964 /*
7987 * How much cpu bandwidth does init_task_group get? 7965 * How much cpu bandwidth does root_task_group get?
7988 * 7966 *
7989 * In case of task-groups formed thr' the cgroup filesystem, it 7967 * In case of task-groups formed thr' the cgroup filesystem, it
7990 * gets 100% of the cpu resources in the system. This overall 7968 * gets 100% of the cpu resources in the system. This overall
7991 * system cpu resource is divided among the tasks of 7969 * system cpu resource is divided among the tasks of
7992 * init_task_group and its child task-groups in a fair manner, 7970 * root_task_group and its child task-groups in a fair manner,
7993 * based on each entity's (task or task-group's) weight 7971 * based on each entity's (task or task-group's) weight
7994 * (se->load.weight). 7972 * (se->load.weight).
7995 * 7973 *
7996 * In other words, if init_task_group has 10 tasks of weight 7974 * In other words, if root_task_group has 10 tasks of weight
 7997 * 1024 and two child groups A0 and A1 (of weight 1024 each), 7975 * 1024 and two child groups A0 and A1 (of weight 1024 each),
7998 * then A0's share of the cpu resource is: 7976 * then A0's share of the cpu resource is:
7999 * 7977 *
8000 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 7978 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
8001 * 7979 *
8002 * We achieve this by letting init_task_group's tasks sit 7980 * We achieve this by letting root_task_group's tasks sit
8003 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7981 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
8004 */ 7982 */
8005 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7983 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
8006#endif
8007#endif /* CONFIG_FAIR_GROUP_SCHED */ 7984#endif /* CONFIG_FAIR_GROUP_SCHED */
8008 7985
8009 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 7986 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
8010#ifdef CONFIG_RT_GROUP_SCHED 7987#ifdef CONFIG_RT_GROUP_SCHED
8011 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7988 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
8012#ifdef CONFIG_CGROUP_SCHED 7989 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
8013 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
8014#endif
8015#endif 7990#endif
8016 7991
8017 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7992 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -8091,8 +8066,6 @@ void __init sched_init(void)
8091 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 8066 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
8092#endif /* SMP */ 8067#endif /* SMP */
8093 8068
8094 perf_event_init();
8095
8096 scheduler_running = 1; 8069 scheduler_running = 1;
8097} 8070}
8098 8071
@@ -8286,7 +8259,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8286 if (!se) 8259 if (!se)
8287 goto err_free_rq; 8260 goto err_free_rq;
8288 8261
8289 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 8262 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8290 } 8263 }
8291 8264
8292 return 1; 8265 return 1;
@@ -8297,15 +8270,21 @@ err:
8297 return 0; 8270 return 0;
8298} 8271}
8299 8272
8300static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8301{
8302 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
8303 &cpu_rq(cpu)->leaf_cfs_rq_list);
8304}
8305
8306static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8273static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8307{ 8274{
8308 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8275 struct rq *rq = cpu_rq(cpu);
8276 unsigned long flags;
8277
8278 /*
8279 * Only empty task groups can be destroyed; so we can speculatively
8280 * check on_list without danger of it being re-added.
8281 */
8282 if (!tg->cfs_rq[cpu]->on_list)
8283 return;
8284
8285 raw_spin_lock_irqsave(&rq->lock, flags);
8286 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8287 raw_spin_unlock_irqrestore(&rq->lock, flags);
8309} 8288}
 8310#else /* !CONFIG_FAIR_GROUP_SCHED */ 8289#else /* !CONFIG_FAIR_GROUP_SCHED */
8311static inline void free_fair_sched_group(struct task_group *tg) 8290static inline void free_fair_sched_group(struct task_group *tg)
@@ -8318,10 +8297,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8318 return 1; 8297 return 1;
8319} 8298}
8320 8299
8321static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8322{
8323}
8324
8325static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8300static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8326{ 8301{
8327} 8302}
@@ -8376,7 +8351,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8376 if (!rt_se) 8351 if (!rt_se)
8377 goto err_free_rq; 8352 goto err_free_rq;
8378 8353
8379 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 8354 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8380 } 8355 }
8381 8356
8382 return 1; 8357 return 1;
@@ -8386,17 +8361,6 @@ err_free_rq:
8386err: 8361err:
8387 return 0; 8362 return 0;
8388} 8363}
8389
8390static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8391{
8392 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
8393 &cpu_rq(cpu)->leaf_rt_rq_list);
8394}
8395
8396static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8397{
8398 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8399}
8400#else /* !CONFIG_RT_GROUP_SCHED */ 8364#else /* !CONFIG_RT_GROUP_SCHED */
8401static inline void free_rt_sched_group(struct task_group *tg) 8365static inline void free_rt_sched_group(struct task_group *tg)
8402{ 8366{
@@ -8407,14 +8371,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8407{ 8371{
8408 return 1; 8372 return 1;
8409} 8373}
8410
8411static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8412{
8413}
8414
8415static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8416{
8417}
8418#endif /* CONFIG_RT_GROUP_SCHED */ 8374#endif /* CONFIG_RT_GROUP_SCHED */
8419 8375
8420#ifdef CONFIG_CGROUP_SCHED 8376#ifdef CONFIG_CGROUP_SCHED
@@ -8422,6 +8378,7 @@ static void free_sched_group(struct task_group *tg)
8422{ 8378{
8423 free_fair_sched_group(tg); 8379 free_fair_sched_group(tg);
8424 free_rt_sched_group(tg); 8380 free_rt_sched_group(tg);
8381 autogroup_free(tg);
8425 kfree(tg); 8382 kfree(tg);
8426} 8383}
8427 8384
@@ -8430,7 +8387,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8430{ 8387{
8431 struct task_group *tg; 8388 struct task_group *tg;
8432 unsigned long flags; 8389 unsigned long flags;
8433 int i;
8434 8390
8435 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 8391 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8436 if (!tg) 8392 if (!tg)
@@ -8443,10 +8399,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8443 goto err; 8399 goto err;
8444 8400
8445 spin_lock_irqsave(&task_group_lock, flags); 8401 spin_lock_irqsave(&task_group_lock, flags);
8446 for_each_possible_cpu(i) {
8447 register_fair_sched_group(tg, i);
8448 register_rt_sched_group(tg, i);
8449 }
8450 list_add_rcu(&tg->list, &task_groups); 8402 list_add_rcu(&tg->list, &task_groups);
8451 8403
8452 WARN_ON(!parent); /* root should already exist */ 8404 WARN_ON(!parent); /* root should already exist */
@@ -8476,11 +8428,11 @@ void sched_destroy_group(struct task_group *tg)
8476 unsigned long flags; 8428 unsigned long flags;
8477 int i; 8429 int i;
8478 8430
8479 spin_lock_irqsave(&task_group_lock, flags); 8431 /* end participation in shares distribution */
8480 for_each_possible_cpu(i) { 8432 for_each_possible_cpu(i)
8481 unregister_fair_sched_group(tg, i); 8433 unregister_fair_sched_group(tg, i);
8482 unregister_rt_sched_group(tg, i); 8434
8483 } 8435 spin_lock_irqsave(&task_group_lock, flags);
8484 list_del_rcu(&tg->list); 8436 list_del_rcu(&tg->list);
8485 list_del_rcu(&tg->siblings); 8437 list_del_rcu(&tg->siblings);
8486 spin_unlock_irqrestore(&task_group_lock, flags); 8438 spin_unlock_irqrestore(&task_group_lock, flags);
@@ -8527,33 +8479,6 @@ void sched_move_task(struct task_struct *tsk)
8527#endif /* CONFIG_CGROUP_SCHED */ 8479#endif /* CONFIG_CGROUP_SCHED */
8528 8480
8529#ifdef CONFIG_FAIR_GROUP_SCHED 8481#ifdef CONFIG_FAIR_GROUP_SCHED
8530static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8531{
8532 struct cfs_rq *cfs_rq = se->cfs_rq;
8533 int on_rq;
8534
8535 on_rq = se->on_rq;
8536 if (on_rq)
8537 dequeue_entity(cfs_rq, se, 0);
8538
8539 se->load.weight = shares;
8540 se->load.inv_weight = 0;
8541
8542 if (on_rq)
8543 enqueue_entity(cfs_rq, se, 0);
8544}
8545
8546static void set_se_shares(struct sched_entity *se, unsigned long shares)
8547{
8548 struct cfs_rq *cfs_rq = se->cfs_rq;
8549 struct rq *rq = cfs_rq->rq;
8550 unsigned long flags;
8551
8552 raw_spin_lock_irqsave(&rq->lock, flags);
8553 __set_se_shares(se, shares);
8554 raw_spin_unlock_irqrestore(&rq->lock, flags);
8555}
8556
8557static DEFINE_MUTEX(shares_mutex); 8482static DEFINE_MUTEX(shares_mutex);
8558 8483
8559int sched_group_set_shares(struct task_group *tg, unsigned long shares) 8484int sched_group_set_shares(struct task_group *tg, unsigned long shares)
@@ -8576,37 +8501,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8576 if (tg->shares == shares) 8501 if (tg->shares == shares)
8577 goto done; 8502 goto done;
8578 8503
8579 spin_lock_irqsave(&task_group_lock, flags);
8580 for_each_possible_cpu(i)
8581 unregister_fair_sched_group(tg, i);
8582 list_del_rcu(&tg->siblings);
8583 spin_unlock_irqrestore(&task_group_lock, flags);
8584
8585 /* wait for any ongoing reference to this group to finish */
8586 synchronize_sched();
8587
8588 /*
8589 * Now we are free to modify the group's share on each cpu
8590 * w/o tripping rebalance_share or load_balance_fair.
8591 */
8592 tg->shares = shares; 8504 tg->shares = shares;
8593 for_each_possible_cpu(i) { 8505 for_each_possible_cpu(i) {
8594 /* 8506 struct rq *rq = cpu_rq(i);
8595 * force a rebalance 8507 struct sched_entity *se;
8596 */ 8508
8597 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8509 se = tg->se[i];
8598 set_se_shares(tg->se[i], shares); 8510 /* Propagate contribution to hierarchy */
8511 raw_spin_lock_irqsave(&rq->lock, flags);
8512 for_each_sched_entity(se)
8513 update_cfs_shares(group_cfs_rq(se), 0);
8514 raw_spin_unlock_irqrestore(&rq->lock, flags);
8599 } 8515 }
8600 8516
8601 /*
8602 * Enable load balance activity on this group, by inserting it back on
8603 * each cpu's rq->leaf_cfs_rq_list.
8604 */
8605 spin_lock_irqsave(&task_group_lock, flags);
8606 for_each_possible_cpu(i)
8607 register_fair_sched_group(tg, i);
8608 list_add_rcu(&tg->siblings, &tg->parent->children);
8609 spin_unlock_irqrestore(&task_group_lock, flags);
8610done: 8517done:
8611 mutex_unlock(&shares_mutex); 8518 mutex_unlock(&shares_mutex);
8612 return 0; 8519 return 0;
@@ -8905,7 +8812,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8905 8812
8906 if (!cgrp->parent) { 8813 if (!cgrp->parent) {
8907 /* This is early initialization for the top cgroup */ 8814 /* This is early initialization for the top cgroup */
8908 return &init_task_group.css; 8815 return &root_task_group.css;
8909 } 8816 }
8910 8817
8911 parent = cgroup_tg(cgrp->parent); 8818 parent = cgroup_tg(cgrp->parent);
@@ -8976,6 +8883,20 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8976 } 8883 }
8977} 8884}
8978 8885
8886static void
8887cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
8888{
8889 /*
8890 * cgroup_exit() is called in the copy_process() failure path.
 8891	 * Ignore this case since the task hasn't run yet; this avoids
 8892	 * trying to poke a half-freed task state from generic code.
8893 */
8894 if (!(task->flags & PF_EXITING))
8895 return;
8896
8897 sched_move_task(task);
8898}
8899
8979#ifdef CONFIG_FAIR_GROUP_SCHED 8900#ifdef CONFIG_FAIR_GROUP_SCHED
8980static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 8901static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
8981 u64 shareval) 8902 u64 shareval)
@@ -9048,6 +8969,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9048 .destroy = cpu_cgroup_destroy, 8969 .destroy = cpu_cgroup_destroy,
9049 .can_attach = cpu_cgroup_can_attach, 8970 .can_attach = cpu_cgroup_can_attach,
9050 .attach = cpu_cgroup_attach, 8971 .attach = cpu_cgroup_attach,
8972 .exit = cpu_cgroup_exit,
9051 .populate = cpu_cgroup_populate, 8973 .populate = cpu_cgroup_populate,
9052 .subsys_id = cpu_cgroup_subsys_id, 8974 .subsys_id = cpu_cgroup_subsys_id,
9053 .early_init = 1, 8975 .early_init = 1,
@@ -9332,72 +9254,3 @@ struct cgroup_subsys cpuacct_subsys = {
9332}; 9254};
9333#endif /* CONFIG_CGROUP_CPUACCT */ 9255#endif /* CONFIG_CGROUP_CPUACCT */
9334 9256
9335#ifndef CONFIG_SMP
9336
9337void synchronize_sched_expedited(void)
9338{
9339 barrier();
9340}
9341EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9342
9343#else /* #ifndef CONFIG_SMP */
9344
9345static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9346
9347static int synchronize_sched_expedited_cpu_stop(void *data)
9348{
9349 /*
9350 * There must be a full memory barrier on each affected CPU
9351 * between the time that try_stop_cpus() is called and the
9352 * time that it returns.
9353 *
9354 * In the current initial implementation of cpu_stop, the
9355 * above condition is already met when the control reaches
9356 * this point and the following smp_mb() is not strictly
9357 * necessary. Do smp_mb() anyway for documentation and
9358 * robustness against future implementation changes.
9359 */
9360 smp_mb(); /* See above comment block. */
9361 return 0;
9362}
9363
9364/*
9365 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
9366 * approach to force grace period to end quickly. This consumes
9367 * significant time on all CPUs, and is thus not recommended for
9368 * any sort of common-case code.
9369 *
9370 * Note that it is illegal to call this function while holding any
9371 * lock that is acquired by a CPU-hotplug notifier. Failing to
9372 * observe this restriction will result in deadlock.
9373 */
9374void synchronize_sched_expedited(void)
9375{
9376 int snap, trycount = 0;
9377
9378 smp_mb(); /* ensure prior mod happens before capturing snap. */
9379 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9380 get_online_cpus();
9381 while (try_stop_cpus(cpu_online_mask,
9382 synchronize_sched_expedited_cpu_stop,
9383 NULL) == -EAGAIN) {
9384 put_online_cpus();
9385 if (trycount++ < 10)
9386 udelay(trycount * num_online_cpus());
9387 else {
9388 synchronize_sched();
9389 return;
9390 }
9391 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9392 smp_mb(); /* ensure test happens before caller kfree */
9393 return;
9394 }
9395 get_online_cpus();
9396 }
9397 atomic_inc(&synchronize_sched_expedited_count);
9398 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
9399 put_online_cpus();
9400}
9401EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9402
9403#endif /* #else #ifndef CONFIG_SMP */
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
new file mode 100644
index 000000000000..9fb656283157
--- /dev/null
+++ b/kernel/sched_autogroup.c
@@ -0,0 +1,270 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3#include <linux/proc_fs.h>
4#include <linux/seq_file.h>
5#include <linux/kallsyms.h>
6#include <linux/utsname.h>
7
8unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
9static struct autogroup autogroup_default;
10static atomic_t autogroup_seq_nr;
11
12static void __init autogroup_init(struct task_struct *init_task)
13{
14 autogroup_default.tg = &root_task_group;
15 root_task_group.autogroup = &autogroup_default;
16 kref_init(&autogroup_default.kref);
17 init_rwsem(&autogroup_default.lock);
18 init_task->signal->autogroup = &autogroup_default;
19}
20
21static inline void autogroup_free(struct task_group *tg)
22{
23 kfree(tg->autogroup);
24}
25
26static inline void autogroup_destroy(struct kref *kref)
27{
28 struct autogroup *ag = container_of(kref, struct autogroup, kref);
29
30#ifdef CONFIG_RT_GROUP_SCHED
31 /* We've redirected RT tasks to the root task group... */
32 ag->tg->rt_se = NULL;
33 ag->tg->rt_rq = NULL;
34#endif
35 sched_destroy_group(ag->tg);
36}
37
38static inline void autogroup_kref_put(struct autogroup *ag)
39{
40 kref_put(&ag->kref, autogroup_destroy);
41}
42
43static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
44{
45 kref_get(&ag->kref);
46 return ag;
47}
48
49static inline struct autogroup *autogroup_task_get(struct task_struct *p)
50{
51 struct autogroup *ag;
52 unsigned long flags;
53
54 if (!lock_task_sighand(p, &flags))
55 return autogroup_kref_get(&autogroup_default);
56
57 ag = autogroup_kref_get(p->signal->autogroup);
58 unlock_task_sighand(p, &flags);
59
60 return ag;
61}
62
63#ifdef CONFIG_RT_GROUP_SCHED
64static void free_rt_sched_group(struct task_group *tg);
65#endif
66
67static inline struct autogroup *autogroup_create(void)
68{
69 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
70 struct task_group *tg;
71
72 if (!ag)
73 goto out_fail;
74
75 tg = sched_create_group(&root_task_group);
76
77 if (IS_ERR(tg))
78 goto out_free;
79
80 kref_init(&ag->kref);
81 init_rwsem(&ag->lock);
82 ag->id = atomic_inc_return(&autogroup_seq_nr);
83 ag->tg = tg;
84#ifdef CONFIG_RT_GROUP_SCHED
85 /*
86 * Autogroup RT tasks are redirected to the root task group
87 * so we don't have to move tasks around upon policy change,
88 * or flail around trying to allocate bandwidth on the fly.
89 * A bandwidth exception in __sched_setscheduler() allows
90 * the policy change to proceed. Thereafter, task_group()
91 * returns &root_task_group, so zero bandwidth is required.
92 */
93 free_rt_sched_group(tg);
94 tg->rt_se = root_task_group.rt_se;
95 tg->rt_rq = root_task_group.rt_rq;
96#endif
97 tg->autogroup = ag;
98
99 return ag;
100
101out_free:
102 kfree(ag);
103out_fail:
104 if (printk_ratelimit()) {
105 printk(KERN_WARNING "autogroup_create: %s failure.\n",
106 ag ? "sched_create_group()" : "kmalloc()");
107 }
108
109 return autogroup_kref_get(&autogroup_default);
110}
111
112static inline bool
113task_wants_autogroup(struct task_struct *p, struct task_group *tg)
114{
115 if (tg != &root_task_group)
116 return false;
117
118 if (p->sched_class != &fair_sched_class)
119 return false;
120
121 /*
122 * We can only assume the task group can't go away on us if
123 * autogroup_move_group() can see us on ->thread_group list.
124 */
125 if (p->flags & PF_EXITING)
126 return false;
127
128 return true;
129}
130
131static inline bool task_group_is_autogroup(struct task_group *tg)
132{
133 return tg != &root_task_group && tg->autogroup;
134}
135
136static inline struct task_group *
137autogroup_task_group(struct task_struct *p, struct task_group *tg)
138{
139 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
140
141 if (enabled && task_wants_autogroup(p, tg))
142 return p->signal->autogroup->tg;
143
144 return tg;
145}
146
147static void
148autogroup_move_group(struct task_struct *p, struct autogroup *ag)
149{
150 struct autogroup *prev;
151 struct task_struct *t;
152 unsigned long flags;
153
154 BUG_ON(!lock_task_sighand(p, &flags));
155
156 prev = p->signal->autogroup;
157 if (prev == ag) {
158 unlock_task_sighand(p, &flags);
159 return;
160 }
161
162 p->signal->autogroup = autogroup_kref_get(ag);
163
164 t = p;
165 do {
166 sched_move_task(t);
167 } while_each_thread(p, t);
168
169 unlock_task_sighand(p, &flags);
170 autogroup_kref_put(prev);
171}
172
 173/* Allocates with GFP_KERNEL; cannot be called under any spinlock */
174void sched_autogroup_create_attach(struct task_struct *p)
175{
176 struct autogroup *ag = autogroup_create();
177
178 autogroup_move_group(p, ag);
 179	/* drop extra reference added by autogroup_create() */
180 autogroup_kref_put(ag);
181}
182EXPORT_SYMBOL(sched_autogroup_create_attach);
183
184/* Cannot be called under siglock. Currently has no users */
185void sched_autogroup_detach(struct task_struct *p)
186{
187 autogroup_move_group(p, &autogroup_default);
188}
189EXPORT_SYMBOL(sched_autogroup_detach);
190
191void sched_autogroup_fork(struct signal_struct *sig)
192{
193 sig->autogroup = autogroup_task_get(current);
194}
195
196void sched_autogroup_exit(struct signal_struct *sig)
197{
198 autogroup_kref_put(sig->autogroup);
199}
200
201static int __init setup_autogroup(char *str)
202{
203 sysctl_sched_autogroup_enabled = 0;
204
205 return 1;
206}
207
208__setup("noautogroup", setup_autogroup);
209
210#ifdef CONFIG_PROC_FS
211
212int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
213{
214 static unsigned long next = INITIAL_JIFFIES;
215 struct autogroup *ag;
216 int err;
217
218 if (*nice < -20 || *nice > 19)
219 return -EINVAL;
220
221 err = security_task_setnice(current, *nice);
222 if (err)
223 return err;
224
225 if (*nice < 0 && !can_nice(current, *nice))
226 return -EPERM;
227
228 /* this is a heavy operation taking global locks.. */
229 if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
230 return -EAGAIN;
231
232 next = HZ / 10 + jiffies;
233 ag = autogroup_task_get(p);
234
235 down_write(&ag->lock);
236 err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]);
237 if (!err)
238 ag->nice = *nice;
239 up_write(&ag->lock);
240
241 autogroup_kref_put(ag);
242
243 return err;
244}
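Roughly speaking, a write such as echo 10 > /proc/<pid>/autogroup lands here and becomes sched_group_set_shares(ag->tg, prio_to_weight[30]). A sketch of that mapping, assuming the prio_to_weight[] table defined earlier in sched.c (sample entries: nice -10 -> 9548, 0 -> 1024, 10 -> 110, 19 -> 15); demo_autogroup_shares() is illustrative only:

#include <linux/errno.h>

static int demo_autogroup_shares(int nice)
{
	if (nice < -20 || nice > 19)
		return -EINVAL;

	return prio_to_weight[nice + 20];	/* becomes ag->tg->shares */
}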
245
246void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
247{
248 struct autogroup *ag = autogroup_task_get(p);
249
250 down_read(&ag->lock);
251 seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
252 up_read(&ag->lock);
253
254 autogroup_kref_put(ag);
255}
256#endif /* CONFIG_PROC_FS */
257
258#ifdef CONFIG_SCHED_DEBUG
259static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
260{
261 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
262
263 if (!enabled || !tg->autogroup)
264 return 0;
265
266 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
267}
268#endif /* CONFIG_SCHED_DEBUG */
269
270#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
new file mode 100644
index 000000000000..7b859ffe5dad
--- /dev/null
+++ b/kernel/sched_autogroup.h
@@ -0,0 +1,36 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3struct autogroup {
4 struct kref kref;
5 struct task_group *tg;
6 struct rw_semaphore lock;
7 unsigned long id;
8 int nice;
9};
10
11static inline struct task_group *
12autogroup_task_group(struct task_struct *p, struct task_group *tg);
13
14#else /* !CONFIG_SCHED_AUTOGROUP */
15
16static inline void autogroup_init(struct task_struct *init_task) { }
17static inline void autogroup_free(struct task_group *tg) { }
18static inline bool task_group_is_autogroup(struct task_group *tg)
19{
20 return 0;
21}
22
23static inline struct task_group *
24autogroup_task_group(struct task_struct *p, struct task_group *tg)
25{
26 return tg;
27}
28
29#ifdef CONFIG_SCHED_DEBUG
30static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
31{
32 return 0;
33}
34#endif
35
36#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 52f1a149bfb1..9d8af0b3fb64 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -79,7 +79,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
79} 79}
80EXPORT_SYMBOL_GPL(sched_clock); 80EXPORT_SYMBOL_GPL(sched_clock);
81 81
82static __read_mostly int sched_clock_running; 82__read_mostly int sched_clock_running;
83 83
84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
85__read_mostly int sched_clock_stable; 85__read_mostly int sched_clock_stable;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 2e1b0d17dd9b..eb6cb8edd075 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -16,6 +16,8 @@
16#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
17#include <linux/utsname.h> 17#include <linux/utsname.h>
18 18
19static DEFINE_SPINLOCK(sched_debug_lock);
20
19/* 21/*
20 * This allows printing both to /proc/sched_debug and 22 * This allows printing both to /proc/sched_debug and
21 * to the console 23 * to the console
@@ -54,8 +56,7 @@ static unsigned long nsec_low(unsigned long long nsec)
54#define SPLIT_NS(x) nsec_high(x), nsec_low(x) 56#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
55 57
56#ifdef CONFIG_FAIR_GROUP_SCHED 58#ifdef CONFIG_FAIR_GROUP_SCHED
57static void print_cfs_group_stats(struct seq_file *m, int cpu, 59static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
58 struct task_group *tg)
59{ 60{
60 struct sched_entity *se = tg->se[cpu]; 61 struct sched_entity *se = tg->se[cpu];
61 if (!se) 62 if (!se)
@@ -87,6 +88,26 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu,
87} 88}
88#endif 89#endif
89 90
91#ifdef CONFIG_CGROUP_SCHED
92static char group_path[PATH_MAX];
93
94static char *task_group_path(struct task_group *tg)
95{
96 if (autogroup_path(tg, group_path, PATH_MAX))
97 return group_path;
98
99 /*
100 * May be NULL if the underlying cgroup isn't fully-created yet
101 */
102 if (!tg->css.cgroup) {
103 group_path[0] = '\0';
104 return group_path;
105 }
106 cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
107 return group_path;
108}
109#endif
110
90static void 111static void
91print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) 112print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
92{ 113{
@@ -109,17 +130,10 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
109 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 130 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 131 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
111#endif 132#endif
112
113#ifdef CONFIG_CGROUP_SCHED 133#ifdef CONFIG_CGROUP_SCHED
114 { 134 SEQ_printf(m, " %s", task_group_path(task_group(p)));
115 char path[64];
116
117 rcu_read_lock();
118 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
119 rcu_read_unlock();
120 SEQ_printf(m, " %s", path);
121 }
122#endif 135#endif
136
123 SEQ_printf(m, "\n"); 137 SEQ_printf(m, "\n");
124} 138}
125 139
@@ -147,19 +161,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
147 read_unlock_irqrestore(&tasklist_lock, flags); 161 read_unlock_irqrestore(&tasklist_lock, flags);
148} 162}
149 163
150#if defined(CONFIG_CGROUP_SCHED) && \
151 (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED))
152static void task_group_path(struct task_group *tg, char *buf, int buflen)
153{
154 /* may be NULL if the underlying cgroup isn't fully-created yet */
155 if (!tg->css.cgroup) {
156 buf[0] = '\0';
157 return;
158 }
159 cgroup_path(tg->css.cgroup, buf, buflen);
160}
161#endif
162
163void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 164void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
164{ 165{
165 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, 166 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
@@ -168,13 +169,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
168 struct sched_entity *last; 169 struct sched_entity *last;
169 unsigned long flags; 170 unsigned long flags;
170 171
171#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) 172#ifdef CONFIG_FAIR_GROUP_SCHED
172 char path[128]; 173 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
173 struct task_group *tg = cfs_rq->tg;
174
175 task_group_path(tg, path, sizeof(path));
176
177 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
178#else 174#else
179 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 175 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
180#endif 176#endif
@@ -202,33 +198,34 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
202 spread0 = min_vruntime - rq0_min_vruntime; 198 spread0 = min_vruntime - rq0_min_vruntime;
203 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", 199 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
204 SPLIT_NS(spread0)); 200 SPLIT_NS(spread0));
205 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207
208 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", 201 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
209 cfs_rq->nr_spread_over); 202 cfs_rq->nr_spread_over);
203 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
204 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
210#ifdef CONFIG_FAIR_GROUP_SCHED 205#ifdef CONFIG_FAIR_GROUP_SCHED
211#ifdef CONFIG_SMP 206#ifdef CONFIG_SMP
212 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); 207 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
208 SPLIT_NS(cfs_rq->load_avg));
209 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
210 SPLIT_NS(cfs_rq->load_period));
211 SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
212 cfs_rq->load_contribution);
213 SEQ_printf(m, " .%-30s: %d\n", "load_tg",
214 atomic_read(&cfs_rq->tg->load_weight));
213#endif 215#endif
216
214 print_cfs_group_stats(m, cpu, cfs_rq->tg); 217 print_cfs_group_stats(m, cpu, cfs_rq->tg);
215#endif 218#endif
216} 219}
217 220
218void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) 221void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
219{ 222{
220#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) 223#ifdef CONFIG_RT_GROUP_SCHED
221 char path[128]; 224 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
222 struct task_group *tg = rt_rq->tg;
223
224 task_group_path(tg, path, sizeof(path));
225
226 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
227#else 225#else
228 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); 226 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
229#endif 227#endif
230 228
231
232#define P(x) \ 229#define P(x) \
233 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) 230 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
234#define PN(x) \ 231#define PN(x) \
@@ -243,9 +240,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
243#undef P 240#undef P
244} 241}
245 242
243extern __read_mostly int sched_clock_running;
244
246static void print_cpu(struct seq_file *m, int cpu) 245static void print_cpu(struct seq_file *m, int cpu)
247{ 246{
248 struct rq *rq = cpu_rq(cpu); 247 struct rq *rq = cpu_rq(cpu);
248 unsigned long flags;
249 249
250#ifdef CONFIG_X86 250#ifdef CONFIG_X86
251 { 251 {
@@ -296,14 +296,20 @@ static void print_cpu(struct seq_file *m, int cpu)
296 P(ttwu_count); 296 P(ttwu_count);
297 P(ttwu_local); 297 P(ttwu_local);
298 298
299 P(bkl_count); 299 SEQ_printf(m, " .%-30s: %d\n", "bkl_count",
300 rq->rq_sched_info.bkl_count);
300 301
301#undef P 302#undef P
303#undef P64
302#endif 304#endif
305 spin_lock_irqsave(&sched_debug_lock, flags);
303 print_cfs_stats(m, cpu); 306 print_cfs_stats(m, cpu);
304 print_rt_stats(m, cpu); 307 print_rt_stats(m, cpu);
305 308
309 rcu_read_lock();
306 print_rq(m, rq, cpu); 310 print_rq(m, rq, cpu);
311 rcu_read_unlock();
312 spin_unlock_irqrestore(&sched_debug_lock, flags);
307} 313}
308 314
309static const char *sched_tunable_scaling_names[] = { 315static const char *sched_tunable_scaling_names[] = {
@@ -314,21 +320,42 @@ static const char *sched_tunable_scaling_names[] = {
314 320
315static int sched_debug_show(struct seq_file *m, void *v) 321static int sched_debug_show(struct seq_file *m, void *v)
316{ 322{
317 u64 now = ktime_to_ns(ktime_get()); 323 u64 ktime, sched_clk, cpu_clk;
324 unsigned long flags;
318 int cpu; 325 int cpu;
319 326
320 SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n", 327 local_irq_save(flags);
328 ktime = ktime_to_ns(ktime_get());
329 sched_clk = sched_clock();
330 cpu_clk = local_clock();
331 local_irq_restore(flags);
332
333 SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
321 init_utsname()->release, 334 init_utsname()->release,
322 (int)strcspn(init_utsname()->version, " "), 335 (int)strcspn(init_utsname()->version, " "),
323 init_utsname()->version); 336 init_utsname()->version);
324 337
325 SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); 338#define P(x) \
339 SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
340#define PN(x) \
341 SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
342 PN(ktime);
343 PN(sched_clk);
344 PN(cpu_clk);
345 P(jiffies);
346#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
347 P(sched_clock_stable);
348#endif
349#undef PN
350#undef P
351
352 SEQ_printf(m, "\n");
353 SEQ_printf(m, "sysctl_sched\n");
326 354
327#define P(x) \ 355#define P(x) \
328 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) 356 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
329#define PN(x) \ 357#define PN(x) \
330 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) 358 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
331 P(jiffies);
332 PN(sysctl_sched_latency); 359 PN(sysctl_sched_latency);
333 PN(sysctl_sched_min_granularity); 360 PN(sysctl_sched_min_granularity);
334 PN(sysctl_sched_wakeup_granularity); 361 PN(sysctl_sched_wakeup_granularity);
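
The new P()/PN() macros in sched_debug_show() print the sampled ktime, sched_clock() and local_clock() values, with SPLIT_NS() splitting a nanosecond count into the two fields consumed by the "%Ld.%06ld" format. A small standalone sketch of that split, assuming the usual divide-by-1e6 definition of nsec_high()/nsec_low(); the helper names below are placeholders, not the kernel's.

#include <stdio.h>

/* Illustrative split of a nanosecond count into the "high.remainder"
 * pair printed with "%lld.%06lld". */
static long long ns_high(long long ns) { return ns / 1000000; }
static long long ns_low(long long ns)  { return ns % 1000000; }

int main(void)
{
        long long ktime_ns = 7123456789LL;      /* example clock reading */

        printf("ktime: %lld.%06lld\n", ns_high(ktime_ns), ns_low(ktime_ns));
        return 0;
}
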
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f4f6a8326dd0..77e9166d7bbf 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
89 89
90const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 90const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
91 91
92/*
93 * The exponential sliding window over which load is averaged for shares
94 * distribution.
95 * (default: 10msec)
96 */
97unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
98
92static const struct sched_class fair_sched_class; 99static const struct sched_class fair_sched_class;
93 100
94/************************************************************** 101/**************************************************************
@@ -143,6 +150,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
143 return cfs_rq->tg->cfs_rq[this_cpu]; 150 return cfs_rq->tg->cfs_rq[this_cpu];
144} 151}
145 152
153static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
154{
155 if (!cfs_rq->on_list) {
156 /*
157 * Ensure we either appear before our parent (if already
158 * enqueued) or force our parent to appear after us when it is
159 * enqueued. The fact that we always enqueue bottom-up
160 * reduces this to two cases.
161 */
162 if (cfs_rq->tg->parent &&
163 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
164 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
165 &rq_of(cfs_rq)->leaf_cfs_rq_list);
166 } else {
167 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
168 &rq_of(cfs_rq)->leaf_cfs_rq_list);
169 }
170
171 cfs_rq->on_list = 1;
172 }
173}
174
175static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
176{
177 if (cfs_rq->on_list) {
178 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
179 cfs_rq->on_list = 0;
180 }
181}
182
146/* Iterate through all leaf cfs_rq's on a runqueue */ 183/* Iterate through all leaf cfs_rq's on a runqueue */
147#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 184#define for_each_leaf_cfs_rq(rq, cfs_rq) \
148 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 185 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
@@ -246,6 +283,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
246 return &cpu_rq(this_cpu)->cfs; 283 return &cpu_rq(this_cpu)->cfs;
247} 284}
248 285
286static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
287{
288}
289
290static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
291{
292}
293
249#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 294#define for_each_leaf_cfs_rq(rq, cfs_rq) \
250 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) 295 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
251 296
@@ -417,7 +462,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
417 WRT_SYSCTL(sched_min_granularity); 462 WRT_SYSCTL(sched_min_granularity);
418 WRT_SYSCTL(sched_latency); 463 WRT_SYSCTL(sched_latency);
419 WRT_SYSCTL(sched_wakeup_granularity); 464 WRT_SYSCTL(sched_wakeup_granularity);
420 WRT_SYSCTL(sched_shares_ratelimit);
421#undef WRT_SYSCTL 465#undef WRT_SYSCTL
422 466
423 return 0; 467 return 0;
@@ -495,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
495 return calc_delta_fair(sched_slice(cfs_rq, se), se); 539 return calc_delta_fair(sched_slice(cfs_rq, se), se);
496} 540}
497 541
542static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
543static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
544
498/* 545/*
499 * Update the current task's runtime statistics. Skip current tasks that 546 * Update the current task's runtime statistics. Skip current tasks that
500 * are not in our scheduling class. 547 * are not in our scheduling class.
@@ -514,6 +561,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
514 561
515 curr->vruntime += delta_exec_weighted; 562 curr->vruntime += delta_exec_weighted;
516 update_min_vruntime(cfs_rq); 563 update_min_vruntime(cfs_rq);
564
565#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
566 cfs_rq->load_unacc_exec_time += delta_exec;
567#endif
517} 568}
518 569
519static void update_curr(struct cfs_rq *cfs_rq) 570static void update_curr(struct cfs_rq *cfs_rq)
@@ -633,7 +684,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
633 list_add(&se->group_node, &cfs_rq->tasks); 684 list_add(&se->group_node, &cfs_rq->tasks);
634 } 685 }
635 cfs_rq->nr_running++; 686 cfs_rq->nr_running++;
636 se->on_rq = 1;
637} 687}
638 688
639static void 689static void
@@ -647,9 +697,140 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
647 list_del_init(&se->group_node); 697 list_del_init(&se->group_node);
648 } 698 }
649 cfs_rq->nr_running--; 699 cfs_rq->nr_running--;
650 se->on_rq = 0;
651} 700}
652 701
702#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
703static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
704 int global_update)
705{
706 struct task_group *tg = cfs_rq->tg;
707 long load_avg;
708
709 load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
710 load_avg -= cfs_rq->load_contribution;
711
712 if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
713 atomic_add(load_avg, &tg->load_weight);
714 cfs_rq->load_contribution += load_avg;
715 }
716}
717
718static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
719{
720 u64 period = sysctl_sched_shares_window;
721 u64 now, delta;
722 unsigned long load = cfs_rq->load.weight;
723
724 if (!cfs_rq)
725 return;
726
727 now = rq_of(cfs_rq)->clock;
728 delta = now - cfs_rq->load_stamp;
729
730 /* truncate load history at 4 idle periods */
731 if (cfs_rq->load_stamp > cfs_rq->load_last &&
732 now - cfs_rq->load_last > 4 * period) {
733 cfs_rq->load_period = 0;
734 cfs_rq->load_avg = 0;
735 }
736
737 cfs_rq->load_stamp = now;
738 cfs_rq->load_unacc_exec_time = 0;
739 cfs_rq->load_period += delta;
740 if (load) {
741 cfs_rq->load_last = now;
742 cfs_rq->load_avg += delta * load;
743 }
744
745 /* consider updating load contribution on each fold or truncate */
746 if (global_update || cfs_rq->load_period > period
747 || !cfs_rq->load_period)
748 update_cfs_rq_load_contribution(cfs_rq, global_update);
749
750 while (cfs_rq->load_period > period) {
751 /*
752 * Inline assembly required to prevent the compiler
753 * optimising this loop into a divmod call.
754 * See __iter_div_u64_rem() for another example of this.
755 */
756 asm("" : "+rm" (cfs_rq->load_period));
757 cfs_rq->load_period /= 2;
758 cfs_rq->load_avg /= 2;
759 }
760
761 if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
762 list_del_leaf_cfs_rq(cfs_rq);
763}
764
765static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
766 unsigned long weight)
767{
768 if (se->on_rq) {
769 /* commit outstanding execution time */
770 if (cfs_rq->curr == se)
771 update_curr(cfs_rq);
772 account_entity_dequeue(cfs_rq, se);
773 }
774
775 update_load_set(&se->load, weight);
776
777 if (se->on_rq)
778 account_entity_enqueue(cfs_rq, se);
779}
780
781static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
782{
783 struct task_group *tg;
784 struct sched_entity *se;
785 long load_weight, load, shares;
786
787 if (!cfs_rq)
788 return;
789
790 tg = cfs_rq->tg;
791 se = tg->se[cpu_of(rq_of(cfs_rq))];
792 if (!se)
793 return;
794
795 load = cfs_rq->load.weight + weight_delta;
796
797 load_weight = atomic_read(&tg->load_weight);
798 load_weight -= cfs_rq->load_contribution;
799 load_weight += load;
800
801 shares = (tg->shares * load);
802 if (load_weight)
803 shares /= load_weight;
804
805 if (shares < MIN_SHARES)
806 shares = MIN_SHARES;
807 if (shares > tg->shares)
808 shares = tg->shares;
809
810 reweight_entity(cfs_rq_of(se), se, shares);
811}
812
813static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
814{
815 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
816 update_cfs_load(cfs_rq, 0);
817 update_cfs_shares(cfs_rq, 0);
818 }
819}
820#else /* CONFIG_FAIR_GROUP_SCHED */
821static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
822{
823}
824
825static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
826{
827}
828
829static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
830{
831}
832#endif /* CONFIG_FAIR_GROUP_SCHED */
833
653static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 834static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
654{ 835{
655#ifdef CONFIG_SCHEDSTATS 836#ifdef CONFIG_SCHEDSTATS
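
update_cfs_load() above averages queue load over sysctl_sched_shares_window by accumulating weight*delta into load_avg, then halving both load_period and load_avg whenever the accumulated period exceeds the window, so older history decays geometrically; update_cfs_rq_load_contribution() folds the resulting load_avg/(load_period+1) ratio into tg->load_weight. The toy model below reproduces that fold in userspace; struct toy_cfs, account() and the constants are illustrative only.

#include <stdio.h>
#include <stdint.h>

#define WINDOW_NS 10000000ULL   /* 10 ms, mirroring sysctl_sched_shares_window */

struct toy_cfs {
        uint64_t load_avg;
        uint64_t load_period;
};

/* Accumulate weight*time, then halve both period and average whenever the
 * period exceeds the window, so old history decays geometrically. */
static void account(struct toy_cfs *c, uint64_t delta_ns, unsigned long weight)
{
        c->load_period += delta_ns;
        c->load_avg    += delta_ns * weight;

        while (c->load_period > WINDOW_NS) {
                c->load_period /= 2;
                c->load_avg    /= 2;
        }
}

int main(void)
{
        struct toy_cfs c = { 0, 0 };

        for (int i = 0; i < 8; i++) {
                account(&c, 3000000, 1024);     /* 3 ms slices at weight 1024 */
                printf("period=%llu avg=%llu avg/(period+1)=%llu\n",
                       (unsigned long long)c.load_period,
                       (unsigned long long)c.load_avg,
                       (unsigned long long)(c.load_avg / (c.load_period + 1)));
        }
        return 0;
}
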
@@ -771,6 +952,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
771 * Update run-time statistics of the 'current'. 952 * Update run-time statistics of the 'current'.
772 */ 953 */
773 update_curr(cfs_rq); 954 update_curr(cfs_rq);
955 update_cfs_load(cfs_rq, 0);
956 update_cfs_shares(cfs_rq, se->load.weight);
774 account_entity_enqueue(cfs_rq, se); 957 account_entity_enqueue(cfs_rq, se);
775 958
776 if (flags & ENQUEUE_WAKEUP) { 959 if (flags & ENQUEUE_WAKEUP) {
@@ -782,6 +965,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
782 check_spread(cfs_rq, se); 965 check_spread(cfs_rq, se);
783 if (se != cfs_rq->curr) 966 if (se != cfs_rq->curr)
784 __enqueue_entity(cfs_rq, se); 967 __enqueue_entity(cfs_rq, se);
968 se->on_rq = 1;
969
970 if (cfs_rq->nr_running == 1)
971 list_add_leaf_cfs_rq(cfs_rq);
785} 972}
786 973
787static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 974static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -825,8 +1012,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
825 1012
826 if (se != cfs_rq->curr) 1013 if (se != cfs_rq->curr)
827 __dequeue_entity(cfs_rq, se); 1014 __dequeue_entity(cfs_rq, se);
1015 se->on_rq = 0;
1016 update_cfs_load(cfs_rq, 0);
828 account_entity_dequeue(cfs_rq, se); 1017 account_entity_dequeue(cfs_rq, se);
829 update_min_vruntime(cfs_rq); 1018 update_min_vruntime(cfs_rq);
1019 update_cfs_shares(cfs_rq, 0);
830 1020
831 /* 1021 /*
832 * Normalize the entity after updating the min_vruntime because the 1022 * Normalize the entity after updating the min_vruntime because the
@@ -872,6 +1062,9 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
872 struct sched_entity *se = __pick_next_entity(cfs_rq); 1062 struct sched_entity *se = __pick_next_entity(cfs_rq);
873 s64 delta = curr->vruntime - se->vruntime; 1063 s64 delta = curr->vruntime - se->vruntime;
874 1064
1065 if (delta < 0)
1066 return;
1067
875 if (delta > ideal_runtime) 1068 if (delta > ideal_runtime)
876 resched_task(rq_of(cfs_rq)->curr); 1069 resched_task(rq_of(cfs_rq)->curr);
877 } 1070 }
@@ -955,6 +1148,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
955 */ 1148 */
956 update_curr(cfs_rq); 1149 update_curr(cfs_rq);
957 1150
1151 /*
1152 * Update share accounting for long-running entities.
1153 */
1154 update_entity_shares_tick(cfs_rq);
1155
958#ifdef CONFIG_SCHED_HRTICK 1156#ifdef CONFIG_SCHED_HRTICK
959 /* 1157 /*
960 * queued ticks are scheduled to match the slice, so don't bother 1158 * queued ticks are scheduled to match the slice, so don't bother
@@ -1055,6 +1253,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1055 flags = ENQUEUE_WAKEUP; 1253 flags = ENQUEUE_WAKEUP;
1056 } 1254 }
1057 1255
1256 for_each_sched_entity(se) {
1257 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1258
1259 update_cfs_load(cfs_rq, 0);
1260 update_cfs_shares(cfs_rq, 0);
1261 }
1262
1058 hrtick_update(rq); 1263 hrtick_update(rq);
1059} 1264}
1060 1265
@@ -1071,12 +1276,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1071 for_each_sched_entity(se) { 1276 for_each_sched_entity(se) {
1072 cfs_rq = cfs_rq_of(se); 1277 cfs_rq = cfs_rq_of(se);
1073 dequeue_entity(cfs_rq, se, flags); 1278 dequeue_entity(cfs_rq, se, flags);
1279
1074 /* Don't dequeue parent if it has other entities besides us */ 1280 /* Don't dequeue parent if it has other entities besides us */
1075 if (cfs_rq->load.weight) 1281 if (cfs_rq->load.weight)
1076 break; 1282 break;
1077 flags |= DEQUEUE_SLEEP; 1283 flags |= DEQUEUE_SLEEP;
1078 } 1284 }
1079 1285
1286 for_each_sched_entity(se) {
1287 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1288
1289 update_cfs_load(cfs_rq, 0);
1290 update_cfs_shares(cfs_rq, 0);
1291 }
1292
1080 hrtick_update(rq); 1293 hrtick_update(rq);
1081} 1294}
1082 1295
@@ -1143,67 +1356,36 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
1143 * Adding load to a group doesn't make a group heavier, but can cause movement 1356 * Adding load to a group doesn't make a group heavier, but can cause movement
1144 * of group shares between cpus. Assuming the shares were perfectly aligned one 1357 * of group shares between cpus. Assuming the shares were perfectly aligned one
1145 * can calculate the shift in shares. 1358 * can calculate the shift in shares.
1146 *
1147 * The problem is that perfectly aligning the shares is rather expensive, hence
1148 * we try to avoid doing that too often - see update_shares(), which ratelimits
1149 * this change.
1150 *
1151 * We compensate this by not only taking the current delta into account, but
1152 * also considering the delta between when the shares were last adjusted and
1153 * now.
1154 *
1155 * We still saw a performance dip, some tracing taught us that between
1156 * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
1157 * significantly. Therefore try to bias the error in direction of failing
1158 * the affine wakeup.
1159 *
1160 */ 1359 */
1161static long effective_load(struct task_group *tg, int cpu, 1360static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
1162 long wl, long wg)
1163{ 1361{
1164 struct sched_entity *se = tg->se[cpu]; 1362 struct sched_entity *se = tg->se[cpu];
1165 1363
1166 if (!tg->parent) 1364 if (!tg->parent)
1167 return wl; 1365 return wl;
1168 1366
1169 /*
1170 * By not taking the decrease of shares on the other cpu into
1171 * account our error leans towards reducing the affine wakeups.
1172 */
1173 if (!wl && sched_feat(ASYM_EFF_LOAD))
1174 return wl;
1175
1176 for_each_sched_entity(se) { 1367 for_each_sched_entity(se) {
1177 long S, rw, s, a, b; 1368 long lw, w;
1178 long more_w;
1179 1369
1180 /* 1370 tg = se->my_q->tg;
1181 * Instead of using this increment, also add the difference 1371 w = se->my_q->load.weight;
1182 * between when the shares were last updated and now.
1183 */
1184 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1185 wl += more_w;
1186 wg += more_w;
1187
1188 S = se->my_q->tg->shares;
1189 s = se->my_q->shares;
1190 rw = se->my_q->rq_weight;
1191 1372
1192 a = S*(rw + wl); 1373 /* use this cpu's instantaneous contribution */
1193 b = S*rw + s*wg; 1374 lw = atomic_read(&tg->load_weight);
1375 lw -= se->my_q->load_contribution;
1376 lw += w + wg;
1194 1377
1195 wl = s*(a-b); 1378 wl += w;
1196 1379
1197 if (likely(b)) 1380 if (lw > 0 && wl < lw)
1198 wl /= b; 1381 wl = (wl * tg->shares) / lw;
1382 else
1383 wl = tg->shares;
1199 1384
1200 /* 1385 /* zero point is MIN_SHARES */
1201 * Assume the group is already running and will 1386 if (wl < MIN_SHARES)
1202 * thus already be accounted for in the weight. 1387 wl = MIN_SHARES;
1203 * 1388 wl -= se->load.weight;
1204 * That is, moving shares between CPUs does not
1205 * alter the group weight.
1206 */
1207 wg = 0; 1389 wg = 0;
1208 } 1390 }
1209 1391
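
The rewritten effective_load() above drops the old shares-delta algebra in favour of the instantaneous contribution: at each level it recomputes the group entity's weight as wl * tg->shares / lw, clamps it to at least MIN_SHARES, and passes the difference from the current entity weight up the hierarchy. Below is a rough single-level model of that calculation with made-up numbers; the MIN_SHARES floor is assumed to be 2 and everything else is illustrative.

#include <stdio.h>

#define MIN_SHARES 2    /* assumed floor, as in the scheduler's clamp */

/* Given a group's total shares, the group load seen across CPUs (lw, already
 * including the weight being added) and this CPU's queue weight (wl, likewise
 * including the addition), compute the weight this CPU's group entity gets. */
static long group_entity_weight(long tg_shares, long lw, long wl)
{
        long w;

        if (lw > 0 && wl < lw)
                w = (wl * tg_shares) / lw;
        else
                w = tg_shares;

        if (w < MIN_SHARES)
                w = MIN_SHARES;
        return w;
}

int main(void)
{
        long shares = 1024;
        long old_cpu_load = 2048, group_load = 4096, task_load = 1024;

        long before = group_entity_weight(shares, group_load, old_cpu_load);
        long after  = group_entity_weight(shares, group_load + task_load,
                                          old_cpu_load + task_load);

        printf("entity weight before: %ld, after adding task: %ld, delta: %ld\n",
               before, after, after - before);
        return 0;
}
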
@@ -1508,23 +1690,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1508 sd = tmp; 1690 sd = tmp;
1509 } 1691 }
1510 1692
1511#ifdef CONFIG_FAIR_GROUP_SCHED
1512 if (sched_feat(LB_SHARES_UPDATE)) {
1513 /*
1514 * Pick the largest domain to update shares over
1515 */
1516 tmp = sd;
1517 if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
1518 tmp = affine_sd;
1519
1520 if (tmp) {
1521 raw_spin_unlock(&rq->lock);
1522 update_shares(tmp);
1523 raw_spin_lock(&rq->lock);
1524 }
1525 }
1526#endif
1527
1528 if (affine_sd) { 1693 if (affine_sd) {
1529 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) 1694 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1530 return select_idle_sibling(p, cpu); 1695 return select_idle_sibling(p, cpu);
@@ -1654,12 +1819,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1654 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1819 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1655 int scale = cfs_rq->nr_running >= sched_nr_latency; 1820 int scale = cfs_rq->nr_running >= sched_nr_latency;
1656 1821
1657 if (unlikely(rt_prio(p->prio)))
1658 goto preempt;
1659
1660 if (unlikely(p->sched_class != &fair_sched_class))
1661 return;
1662
1663 if (unlikely(se == pse)) 1822 if (unlikely(se == pse))
1664 return; 1823 return;
1665 1824
@@ -1764,10 +1923,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
1764 set_task_cpu(p, this_cpu); 1923 set_task_cpu(p, this_cpu);
1765 activate_task(this_rq, p, 0); 1924 activate_task(this_rq, p, 0);
1766 check_preempt_curr(this_rq, p, 0); 1925 check_preempt_curr(this_rq, p, 0);
1767
1768 /* re-arm NEWIDLE balancing when moving tasks */
1769 src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
1770 this_rq->idle_stamp = 0;
1771} 1926}
1772 1927
1773/* 1928/*
@@ -1919,6 +2074,48 @@ out:
1919} 2074}
1920 2075
1921#ifdef CONFIG_FAIR_GROUP_SCHED 2076#ifdef CONFIG_FAIR_GROUP_SCHED
2077/*
2078 * update tg->load_weight by folding this cpu's load_avg
2079 */
2080static int update_shares_cpu(struct task_group *tg, int cpu)
2081{
2082 struct cfs_rq *cfs_rq;
2083 unsigned long flags;
2084 struct rq *rq;
2085
2086 if (!tg->se[cpu])
2087 return 0;
2088
2089 rq = cpu_rq(cpu);
2090 cfs_rq = tg->cfs_rq[cpu];
2091
2092 raw_spin_lock_irqsave(&rq->lock, flags);
2093
2094 update_rq_clock(rq);
2095 update_cfs_load(cfs_rq, 1);
2096
2097 /*
2098 * We need to update shares after updating tg->load_weight in
2099 * order to adjust the weight of groups with long running tasks.
2100 */
2101 update_cfs_shares(cfs_rq, 0);
2102
2103 raw_spin_unlock_irqrestore(&rq->lock, flags);
2104
2105 return 0;
2106}
2107
2108static void update_shares(int cpu)
2109{
2110 struct cfs_rq *cfs_rq;
2111 struct rq *rq = cpu_rq(cpu);
2112
2113 rcu_read_lock();
2114 for_each_leaf_cfs_rq(rq, cfs_rq)
2115 update_shares_cpu(cfs_rq->tg, cpu);
2116 rcu_read_unlock();
2117}
2118
1922static unsigned long 2119static unsigned long
1923load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2120load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1924 unsigned long max_load_move, 2121 unsigned long max_load_move,
@@ -1966,6 +2163,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1966 return max_load_move - rem_load_move; 2163 return max_load_move - rem_load_move;
1967} 2164}
1968#else 2165#else
2166static inline void update_shares(int cpu)
2167{
2168}
2169
1969static unsigned long 2170static unsigned long
1970load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2171load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1971 unsigned long max_load_move, 2172 unsigned long max_load_move,
@@ -2035,13 +2236,16 @@ struct sd_lb_stats {
2035 unsigned long this_load_per_task; 2236 unsigned long this_load_per_task;
2036 unsigned long this_nr_running; 2237 unsigned long this_nr_running;
2037 unsigned long this_has_capacity; 2238 unsigned long this_has_capacity;
2239 unsigned int this_idle_cpus;
2038 2240
2039 /* Statistics of the busiest group */ 2241 /* Statistics of the busiest group */
2242 unsigned int busiest_idle_cpus;
2040 unsigned long max_load; 2243 unsigned long max_load;
2041 unsigned long busiest_load_per_task; 2244 unsigned long busiest_load_per_task;
2042 unsigned long busiest_nr_running; 2245 unsigned long busiest_nr_running;
2043 unsigned long busiest_group_capacity; 2246 unsigned long busiest_group_capacity;
2044 unsigned long busiest_has_capacity; 2247 unsigned long busiest_has_capacity;
2248 unsigned int busiest_group_weight;
2045 2249
2046 int group_imb; /* Is there imbalance in this sd */ 2250 int group_imb; /* Is there imbalance in this sd */
2047#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2251#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2063,6 +2267,8 @@ struct sg_lb_stats {
2063 unsigned long sum_nr_running; /* Nr tasks running in the group */ 2267 unsigned long sum_nr_running; /* Nr tasks running in the group */
2064 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 2268 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2065 unsigned long group_capacity; 2269 unsigned long group_capacity;
2270 unsigned long idle_cpus;
2271 unsigned long group_weight;
2066 int group_imb; /* Is there an imbalance in the group ? */ 2272 int group_imb; /* Is there an imbalance in the group ? */
2067 int group_has_capacity; /* Is there extra capacity in the group? */ 2273 int group_has_capacity; /* Is there extra capacity in the group? */
2068}; 2274};
@@ -2431,7 +2637,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2431 sgs->group_load += load; 2637 sgs->group_load += load;
2432 sgs->sum_nr_running += rq->nr_running; 2638 sgs->sum_nr_running += rq->nr_running;
2433 sgs->sum_weighted_load += weighted_cpuload(i); 2639 sgs->sum_weighted_load += weighted_cpuload(i);
2434 2640 if (idle_cpu(i))
2641 sgs->idle_cpus++;
2435 } 2642 }
2436 2643
2437 /* 2644 /*
@@ -2469,6 +2676,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2469 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2676 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2470 if (!sgs->group_capacity) 2677 if (!sgs->group_capacity)
2471 sgs->group_capacity = fix_small_capacity(sd, group); 2678 sgs->group_capacity = fix_small_capacity(sd, group);
2679 sgs->group_weight = group->group_weight;
2472 2680
2473 if (sgs->group_capacity > sgs->sum_nr_running) 2681 if (sgs->group_capacity > sgs->sum_nr_running)
2474 sgs->group_has_capacity = 1; 2682 sgs->group_has_capacity = 1;
@@ -2576,13 +2784,16 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2576 sds->this_nr_running = sgs.sum_nr_running; 2784 sds->this_nr_running = sgs.sum_nr_running;
2577 sds->this_load_per_task = sgs.sum_weighted_load; 2785 sds->this_load_per_task = sgs.sum_weighted_load;
2578 sds->this_has_capacity = sgs.group_has_capacity; 2786 sds->this_has_capacity = sgs.group_has_capacity;
2787 sds->this_idle_cpus = sgs.idle_cpus;
2579 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { 2788 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
2580 sds->max_load = sgs.avg_load; 2789 sds->max_load = sgs.avg_load;
2581 sds->busiest = sg; 2790 sds->busiest = sg;
2582 sds->busiest_nr_running = sgs.sum_nr_running; 2791 sds->busiest_nr_running = sgs.sum_nr_running;
2792 sds->busiest_idle_cpus = sgs.idle_cpus;
2583 sds->busiest_group_capacity = sgs.group_capacity; 2793 sds->busiest_group_capacity = sgs.group_capacity;
2584 sds->busiest_load_per_task = sgs.sum_weighted_load; 2794 sds->busiest_load_per_task = sgs.sum_weighted_load;
2585 sds->busiest_has_capacity = sgs.group_has_capacity; 2795 sds->busiest_has_capacity = sgs.group_has_capacity;
2796 sds->busiest_group_weight = sgs.group_weight;
2586 sds->group_imb = sgs.group_imb; 2797 sds->group_imb = sgs.group_imb;
2587 } 2798 }
2588 2799
@@ -2860,8 +3071,26 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2860 if (sds.this_load >= sds.avg_load) 3071 if (sds.this_load >= sds.avg_load)
2861 goto out_balanced; 3072 goto out_balanced;
2862 3073
2863 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 3074 /*
2864 goto out_balanced; 3075 * In the CPU_NEWLY_IDLE case, use imbalance_pct to be conservative.
3076 * And to check for busy balance use !idle_cpu instead of
3077 * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
3078 * even when they are idle.
3079 */
3080 if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
3081 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3082 goto out_balanced;
3083 } else {
3084 /*
 3085 * This cpu is idle. If the busiest group doesn't
 3086 * have more tasks than the number of available cpu's and
 3087 * there is no imbalance between this and the busiest group
 3088 * wrt idle cpu's, it is balanced.
3089 */
3090 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
3091 sds.busiest_nr_running <= sds.busiest_group_weight)
3092 goto out_balanced;
3093 }
2865 3094
2866force_balance: 3095force_balance:
2867 /* Looks like there is an imbalance. Compute it */ 3096 /* Looks like there is an imbalance. Compute it */
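
The new branch in find_busiest_group() treats the domain as balanced for an idle CPU when this group has at most one more idle CPU than the busiest group and the busiest group is not running more tasks than it has CPUs. A sketch of just that predicate, keeping the field names shown in the diff but nothing else:

#include <stdbool.h>
#include <stdio.h>

/* Field names follow the sd_lb_stats additions; the struct itself is a
 * cut-down illustration, not the kernel's. */
struct lb_stats {
        unsigned int this_idle_cpus;
        unsigned int busiest_idle_cpus;
        unsigned long busiest_nr_running;
        unsigned int busiest_group_weight;
};

static bool idle_cpu_sees_balance(const struct lb_stats *s)
{
        return s->this_idle_cpus <= s->busiest_idle_cpus + 1 &&
               s->busiest_nr_running <= s->busiest_group_weight;
}

int main(void)
{
        struct lb_stats s = {
                .this_idle_cpus = 2, .busiest_idle_cpus = 1,
                .busiest_nr_running = 3, .busiest_group_weight = 4,
        };

        printf("treat as balanced: %d\n", idle_cpu_sees_balance(&s));
        return 0;
}
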
@@ -3014,7 +3243,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3014 schedstat_inc(sd, lb_count[idle]); 3243 schedstat_inc(sd, lb_count[idle]);
3015 3244
3016redo: 3245redo:
3017 update_shares(sd);
3018 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3246 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3019 cpus, balance); 3247 cpus, balance);
3020 3248
@@ -3156,8 +3384,6 @@ out_one_pinned:
3156 else 3384 else
3157 ld_moved = 0; 3385 ld_moved = 0;
3158out: 3386out:
3159 if (ld_moved)
3160 update_shares(sd);
3161 return ld_moved; 3387 return ld_moved;
3162} 3388}
3163 3389
@@ -3181,6 +3407,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3181 */ 3407 */
3182 raw_spin_unlock(&this_rq->lock); 3408 raw_spin_unlock(&this_rq->lock);
3183 3409
3410 update_shares(this_cpu);
3184 for_each_domain(this_cpu, sd) { 3411 for_each_domain(this_cpu, sd) {
3185 unsigned long interval; 3412 unsigned long interval;
3186 int balance = 1; 3413 int balance = 1;
@@ -3197,8 +3424,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3197 interval = msecs_to_jiffies(sd->balance_interval); 3424 interval = msecs_to_jiffies(sd->balance_interval);
3198 if (time_after(next_balance, sd->last_balance + interval)) 3425 if (time_after(next_balance, sd->last_balance + interval))
3199 next_balance = sd->last_balance + interval; 3426 next_balance = sd->last_balance + interval;
3200 if (pulled_task) 3427 if (pulled_task) {
3428 this_rq->idle_stamp = 0;
3201 break; 3429 break;
3430 }
3202 } 3431 }
3203 3432
3204 raw_spin_lock(&this_rq->lock); 3433 raw_spin_lock(&this_rq->lock);
@@ -3549,6 +3778,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3549 int update_next_balance = 0; 3778 int update_next_balance = 0;
3550 int need_serialize; 3779 int need_serialize;
3551 3780
3781 update_shares(cpu);
3782
3552 for_each_domain(cpu, sd) { 3783 for_each_domain(cpu, sd) {
3553 if (!(sd->flags & SD_LOAD_BALANCE)) 3784 if (!(sd->flags & SD_LOAD_BALANCE))
3554 continue; 3785 continue;
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 185f920ec1a2..68e69acc29b9 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0)
52SCHED_FEAT(HRTICK, 0) 52SCHED_FEAT(HRTICK, 0)
53SCHED_FEAT(DOUBLE_TICK, 0) 53SCHED_FEAT(DOUBLE_TICK, 0)
54SCHED_FEAT(LB_BIAS, 1) 54SCHED_FEAT(LB_BIAS, 1)
55SCHED_FEAT(LB_SHARES_UPDATE, 1)
56SCHED_FEAT(ASYM_EFF_LOAD, 1)
57 55
58/* 56/*
59 * Spin-wait on mutex acquisition when the mutex owner is running on 57 * Spin-wait on mutex acquisition when the mutex owner is running on
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index bea7d79f7e9c..c914ec747ca6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -183,6 +183,17 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); 183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
184} 184}
185 185
186static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
187{
188 list_add_rcu(&rt_rq->leaf_rt_rq_list,
189 &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
190}
191
192static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
193{
194 list_del_rcu(&rt_rq->leaf_rt_rq_list);
195}
196
186#define for_each_leaf_rt_rq(rt_rq, rq) \ 197#define for_each_leaf_rt_rq(rt_rq, rq) \
187 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) 198 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
188 199
@@ -276,6 +287,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
276 return ktime_to_ns(def_rt_bandwidth.rt_period); 287 return ktime_to_ns(def_rt_bandwidth.rt_period);
277} 288}
278 289
290static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
291{
292}
293
294static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
295{
296}
297
279#define for_each_leaf_rt_rq(rt_rq, rq) \ 298#define for_each_leaf_rt_rq(rt_rq, rq) \
280 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) 299 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
281 300
@@ -825,6 +844,9 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
825 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 844 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
826 return; 845 return;
827 846
847 if (!rt_rq->rt_nr_running)
848 list_add_leaf_rt_rq(rt_rq);
849
828 if (head) 850 if (head)
829 list_add(&rt_se->run_list, queue); 851 list_add(&rt_se->run_list, queue);
830 else 852 else
@@ -844,6 +866,8 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
844 __clear_bit(rt_se_prio(rt_se), array->bitmap); 866 __clear_bit(rt_se_prio(rt_se), array->bitmap);
845 867
846 dec_rt_tasks(rt_se, rt_rq); 868 dec_rt_tasks(rt_se, rt_rq);
869 if (!rt_rq->rt_nr_running)
870 list_del_leaf_rt_rq(rt_rq);
847} 871}
848 872
849/* 873/*
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 45bddc0c1048..2bf6b47058c1 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -19,14 +19,14 @@ select_task_rq_stop(struct rq *rq, struct task_struct *p,
19static void 19static void
20check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) 20check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
21{ 21{
22 resched_task(rq->curr); /* we preempt everything */ 22 /* we're never preempted */
23} 23}
24 24
25static struct task_struct *pick_next_task_stop(struct rq *rq) 25static struct task_struct *pick_next_task_stop(struct rq *rq)
26{ 26{
27 struct task_struct *stop = rq->stop; 27 struct task_struct *stop = rq->stop;
28 28
29 if (stop && stop->state == TASK_RUNNING) 29 if (stop && stop->se.on_rq)
30 return stop; 30 return stop;
31 31
32 return NULL; 32 return NULL;
diff --git a/kernel/smp.c b/kernel/smp.c
index 12ed8b013e2d..9910744f0856 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -13,6 +13,7 @@
13#include <linux/smp.h> 13#include <linux/smp.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15 15
16#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
16static struct { 17static struct {
17 struct list_head queue; 18 struct list_head queue;
18 raw_spinlock_t lock; 19 raw_spinlock_t lock;
@@ -193,23 +194,52 @@ void generic_smp_call_function_interrupt(void)
193 */ 194 */
194 list_for_each_entry_rcu(data, &call_function.queue, csd.list) { 195 list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
195 int refs; 196 int refs;
197 void (*func) (void *info);
196 198
197 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) 199 /*
200 * Since we walk the list without any locks, we might
201 * see an entry that was completed, removed from the
202 * list and is in the process of being reused.
203 *
204 * We must check that the cpu is in the cpumask before
205 * checking the refs, and both must be set before
206 * executing the callback on this cpu.
207 */
208
209 if (!cpumask_test_cpu(cpu, data->cpumask))
210 continue;
211
212 smp_rmb();
213
214 if (atomic_read(&data->refs) == 0)
198 continue; 215 continue;
199 216
217 func = data->csd.func; /* for later warn */
200 data->csd.func(data->csd.info); 218 data->csd.func(data->csd.info);
201 219
220 /*
221 * If the cpu mask is not still set then it enabled interrupts,
222 * we took another smp interrupt, and executed the function
223 * twice on this cpu. In theory that copy decremented refs.
224 */
225 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
226 WARN(1, "%pS enabled interrupts and double executed\n",
227 func);
228 continue;
229 }
230
202 refs = atomic_dec_return(&data->refs); 231 refs = atomic_dec_return(&data->refs);
203 WARN_ON(refs < 0); 232 WARN_ON(refs < 0);
204 if (!refs) {
205 raw_spin_lock(&call_function.lock);
206 list_del_rcu(&data->csd.list);
207 raw_spin_unlock(&call_function.lock);
208 }
209 233
210 if (refs) 234 if (refs)
211 continue; 235 continue;
212 236
237 WARN_ON(!cpumask_empty(data->cpumask));
238
239 raw_spin_lock(&call_function.lock);
240 list_del_rcu(&data->csd.list);
241 raw_spin_unlock(&call_function.lock);
242
213 csd_unlock(&data->csd); 243 csd_unlock(&data->csd);
214 } 244 }
215 245
@@ -429,7 +459,7 @@ void smp_call_function_many(const struct cpumask *mask,
429 * can't happen. 459 * can't happen.
430 */ 460 */
431 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() 461 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
432 && !oops_in_progress); 462 && !oops_in_progress && !early_boot_irqs_disabled);
433 463
434 /* So, what's a CPU they want? Ignoring this one. */ 464 /* So, what's a CPU they want? Ignoring this one. */
435 cpu = cpumask_first_and(mask, cpu_online_mask); 465 cpu = cpumask_first_and(mask, cpu_online_mask);
@@ -453,11 +483,21 @@ void smp_call_function_many(const struct cpumask *mask,
453 483
454 data = &__get_cpu_var(cfd_data); 484 data = &__get_cpu_var(cfd_data);
455 csd_lock(&data->csd); 485 csd_lock(&data->csd);
486 BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
456 487
457 data->csd.func = func; 488 data->csd.func = func;
458 data->csd.info = info; 489 data->csd.info = info;
459 cpumask_and(data->cpumask, mask, cpu_online_mask); 490 cpumask_and(data->cpumask, mask, cpu_online_mask);
460 cpumask_clear_cpu(this_cpu, data->cpumask); 491 cpumask_clear_cpu(this_cpu, data->cpumask);
492
493 /*
 494 * To ensure the interrupt handler gets a complete view
 495 * we order the cpumask and refs writes and order the reads
 496 * of them in the interrupt handler. In addition we may
497 * only clear our own cpu bit from the mask.
498 */
499 smp_wmb();
500
461 atomic_set(&data->refs, cpumask_weight(data->cpumask)); 501 atomic_set(&data->refs, cpumask_weight(data->cpumask));
462 502
463 raw_spin_lock_irqsave(&call_function.lock, flags); 503 raw_spin_lock_irqsave(&call_function.lock, flags);
@@ -529,3 +569,24 @@ void ipi_call_unlock_irq(void)
529{ 569{
530 raw_spin_unlock_irq(&call_function.lock); 570 raw_spin_unlock_irq(&call_function.lock);
531} 571}
572#endif /* USE_GENERIC_SMP_HELPERS */
573
574/*
575 * Call a function on all processors. May be used during early boot while
576 * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead
577 * of local_irq_disable/enable().
578 */
579int on_each_cpu(void (*func) (void *info), void *info, int wait)
580{
581 unsigned long flags;
582 int ret = 0;
583
584 preempt_disable();
585 ret = smp_call_function(func, info, wait);
586 local_irq_save(flags);
587 func(info);
588 local_irq_restore(flags);
589 preempt_enable();
590 return ret;
591}
592EXPORT_SYMBOL(on_each_cpu);
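
The smp.c changes add an smp_wmb() between writing the cpumask and setting refs in smp_call_function_many(), paired with an smp_rmb() between the cpumask and refs checks in the interrupt handler. The userspace sketch below shows the general write-then-publish idiom that such barrier pairs implement, using C11 release/acquire atomics; it is a loose analogue of the pairing, not the kernel's exact protocol (build with -pthread).

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

static unsigned long mask;      /* payload, written before publication */
static atomic_int refs;         /* zero until the payload is ready     */

static void *receiver(void *arg)
{
        (void)arg;
        while (atomic_load_explicit(&refs, memory_order_acquire) == 0)
                sched_yield();
        /* The acquire load pairs with the sender's release store, so the
         * earlier write to 'mask' is guaranteed to be visible here. */
        printf("receiver sees mask=0x%lx\n", mask);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, receiver, NULL);

        mask = 0xf0;                                            /* write payload */
        atomic_store_explicit(&refs, 1, memory_order_release);  /* then publish  */

        pthread_join(t, NULL);
        return 0;
}
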
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 18f4be0d5fe0..68eb5efec388 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -70,7 +70,7 @@ char *softirq_to_name[NR_SOFTIRQS] = {
70static void wakeup_softirqd(void) 70static void wakeup_softirqd(void)
71{ 71{
72 /* Interrupts are disabled: no need to stop preemption */ 72 /* Interrupts are disabled: no need to stop preemption */
73 struct task_struct *tsk = __get_cpu_var(ksoftirqd); 73 struct task_struct *tsk = __this_cpu_read(ksoftirqd);
74 74
75 if (tsk && tsk->state != TASK_RUNNING) 75 if (tsk && tsk->state != TASK_RUNNING)
76 wake_up_process(tsk); 76 wake_up_process(tsk);
@@ -388,8 +388,8 @@ void __tasklet_schedule(struct tasklet_struct *t)
388 388
389 local_irq_save(flags); 389 local_irq_save(flags);
390 t->next = NULL; 390 t->next = NULL;
391 *__get_cpu_var(tasklet_vec).tail = t; 391 *__this_cpu_read(tasklet_vec.tail) = t;
392 __get_cpu_var(tasklet_vec).tail = &(t->next); 392 __this_cpu_write(tasklet_vec.tail, &(t->next));
393 raise_softirq_irqoff(TASKLET_SOFTIRQ); 393 raise_softirq_irqoff(TASKLET_SOFTIRQ);
394 local_irq_restore(flags); 394 local_irq_restore(flags);
395} 395}
@@ -402,8 +402,8 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
402 402
403 local_irq_save(flags); 403 local_irq_save(flags);
404 t->next = NULL; 404 t->next = NULL;
405 *__get_cpu_var(tasklet_hi_vec).tail = t; 405 *__this_cpu_read(tasklet_hi_vec.tail) = t;
406 __get_cpu_var(tasklet_hi_vec).tail = &(t->next); 406 __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
407 raise_softirq_irqoff(HI_SOFTIRQ); 407 raise_softirq_irqoff(HI_SOFTIRQ);
408 local_irq_restore(flags); 408 local_irq_restore(flags);
409} 409}
@@ -414,8 +414,8 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)
414{ 414{
415 BUG_ON(!irqs_disabled()); 415 BUG_ON(!irqs_disabled());
416 416
417 t->next = __get_cpu_var(tasklet_hi_vec).head; 417 t->next = __this_cpu_read(tasklet_hi_vec.head);
418 __get_cpu_var(tasklet_hi_vec).head = t; 418 __this_cpu_write(tasklet_hi_vec.head, t);
419 __raise_softirq_irqoff(HI_SOFTIRQ); 419 __raise_softirq_irqoff(HI_SOFTIRQ);
420} 420}
421 421
@@ -426,9 +426,9 @@ static void tasklet_action(struct softirq_action *a)
426 struct tasklet_struct *list; 426 struct tasklet_struct *list;
427 427
428 local_irq_disable(); 428 local_irq_disable();
429 list = __get_cpu_var(tasklet_vec).head; 429 list = __this_cpu_read(tasklet_vec.head);
430 __get_cpu_var(tasklet_vec).head = NULL; 430 __this_cpu_write(tasklet_vec.head, NULL);
431 __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; 431 __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head);
432 local_irq_enable(); 432 local_irq_enable();
433 433
434 while (list) { 434 while (list) {
@@ -449,8 +449,8 @@ static void tasklet_action(struct softirq_action *a)
449 449
450 local_irq_disable(); 450 local_irq_disable();
451 t->next = NULL; 451 t->next = NULL;
452 *__get_cpu_var(tasklet_vec).tail = t; 452 *__this_cpu_read(tasklet_vec.tail) = t;
453 __get_cpu_var(tasklet_vec).tail = &(t->next); 453 __this_cpu_write(tasklet_vec.tail, &(t->next));
454 __raise_softirq_irqoff(TASKLET_SOFTIRQ); 454 __raise_softirq_irqoff(TASKLET_SOFTIRQ);
455 local_irq_enable(); 455 local_irq_enable();
456 } 456 }
@@ -461,9 +461,9 @@ static void tasklet_hi_action(struct softirq_action *a)
461 struct tasklet_struct *list; 461 struct tasklet_struct *list;
462 462
463 local_irq_disable(); 463 local_irq_disable();
464 list = __get_cpu_var(tasklet_hi_vec).head; 464 list = __this_cpu_read(tasklet_hi_vec.head);
465 __get_cpu_var(tasklet_hi_vec).head = NULL; 465 __this_cpu_write(tasklet_hi_vec.head, NULL);
466 __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head; 466 __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head);
467 local_irq_enable(); 467 local_irq_enable();
468 468
469 while (list) { 469 while (list) {
@@ -484,8 +484,8 @@ static void tasklet_hi_action(struct softirq_action *a)
484 484
485 local_irq_disable(); 485 local_irq_disable();
486 t->next = NULL; 486 t->next = NULL;
487 *__get_cpu_var(tasklet_hi_vec).tail = t; 487 *__this_cpu_read(tasklet_hi_vec.tail) = t;
488 __get_cpu_var(tasklet_hi_vec).tail = &(t->next); 488 __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
489 __raise_softirq_irqoff(HI_SOFTIRQ); 489 __raise_softirq_irqoff(HI_SOFTIRQ);
490 local_irq_enable(); 490 local_irq_enable();
491 } 491 }
@@ -802,16 +802,16 @@ static void takeover_tasklets(unsigned int cpu)
802 802
803 /* Find end, append list for that CPU. */ 803 /* Find end, append list for that CPU. */
804 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { 804 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) {
805 *(__get_cpu_var(tasklet_vec).tail) = per_cpu(tasklet_vec, cpu).head; 805 *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head;
806 __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail; 806 this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail);
807 per_cpu(tasklet_vec, cpu).head = NULL; 807 per_cpu(tasklet_vec, cpu).head = NULL;
808 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; 808 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
809 } 809 }
810 raise_softirq_irqoff(TASKLET_SOFTIRQ); 810 raise_softirq_irqoff(TASKLET_SOFTIRQ);
811 811
812 if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { 812 if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) {
813 *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head; 813 *__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head;
814 __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail; 814 __this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail);
815 per_cpu(tasklet_hi_vec, cpu).head = NULL; 815 per_cpu(tasklet_hi_vec, cpu).head = NULL;
816 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; 816 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;
817 } 817 }
@@ -853,7 +853,9 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
853 cpumask_any(cpu_online_mask)); 853 cpumask_any(cpu_online_mask));
854 case CPU_DEAD: 854 case CPU_DEAD:
855 case CPU_DEAD_FROZEN: { 855 case CPU_DEAD_FROZEN: {
856 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 856 static const struct sched_param param = {
857 .sched_priority = MAX_RT_PRIO-1
858 };
857 859
858 p = per_cpu(ksoftirqd, hotcpu); 860 p = per_cpu(ksoftirqd, hotcpu);
859 per_cpu(ksoftirqd, hotcpu) = NULL; 861 per_cpu(ksoftirqd, hotcpu) = NULL;
@@ -883,25 +885,6 @@ static __init int spawn_ksoftirqd(void)
883} 885}
884early_initcall(spawn_ksoftirqd); 886early_initcall(spawn_ksoftirqd);
885 887
886#ifdef CONFIG_SMP
887/*
888 * Call a function on all processors
889 */
890int on_each_cpu(void (*func) (void *info), void *info, int wait)
891{
892 int ret = 0;
893
894 preempt_disable();
895 ret = smp_call_function(func, info, wait);
896 local_irq_disable();
897 func(info);
898 local_irq_enable();
899 preempt_enable();
900 return ret;
901}
902EXPORT_SYMBOL(on_each_cpu);
903#endif
904
905/* 888/*
906 * [ These __weak aliases are kept in a separate compilation unit, so that 889 * [ These __weak aliases are kept in a separate compilation unit, so that
907 * GCC does not inline them incorrectly. ] 890 * GCC does not inline them incorrectly. ]
diff --git a/kernel/srcu.c b/kernel/srcu.c
index c71e07500536..73ce23feaea9 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -31,6 +31,7 @@
31#include <linux/rcupdate.h> 31#include <linux/rcupdate.h>
32#include <linux/sched.h> 32#include <linux/sched.h>
33#include <linux/smp.h> 33#include <linux/smp.h>
34#include <linux/delay.h>
34#include <linux/srcu.h> 35#include <linux/srcu.h>
35 36
36static int init_srcu_struct_fields(struct srcu_struct *sp) 37static int init_srcu_struct_fields(struct srcu_struct *sp)
@@ -155,6 +156,16 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx)
155EXPORT_SYMBOL_GPL(__srcu_read_unlock); 156EXPORT_SYMBOL_GPL(__srcu_read_unlock);
156 157
157/* 158/*
159 * We use an adaptive strategy for synchronize_srcu() and especially for
160 * synchronize_srcu_expedited(). We spin for a fixed time period
161 * (defined below) to allow SRCU readers to exit their read-side critical
162 * sections. If there are still some readers after 10 microseconds,
163 * we repeatedly block for 1-millisecond time periods. This approach
164 * has done well in testing, so there is no need for a config parameter.
165 */
166#define SYNCHRONIZE_SRCU_READER_DELAY 10
167
168/*
158 * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 169 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
159 */ 170 */
160static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) 171static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
@@ -203,9 +214,15 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
203 * all srcu_read_lock() calls using the old counters have completed. 214 * all srcu_read_lock() calls using the old counters have completed.
204 * Their corresponding critical sections might well be still 215 * Their corresponding critical sections might well be still
205 * executing, but the srcu_read_lock() primitives themselves 216 * executing, but the srcu_read_lock() primitives themselves
206 * will have finished executing. 217 * will have finished executing. We initially give readers
218 * an arbitrarily chosen 10 microseconds to get out of their
219 * SRCU read-side critical sections, then loop waiting 1/HZ
220 * seconds per iteration. The 10-microsecond value has done
221 * very well in testing.
207 */ 222 */
208 223
224 if (srcu_readers_active_idx(sp, idx))
225 udelay(SYNCHRONIZE_SRCU_READER_DELAY);
209 while (srcu_readers_active_idx(sp, idx)) 226 while (srcu_readers_active_idx(sp, idx))
210 schedule_timeout_interruptible(1); 227 schedule_timeout_interruptible(1);
211 228
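
The comment added to __synchronize_srcu() describes an adaptive wait: readers first get a short busy delay (10 microseconds) to leave their read-side critical sections, and only then does the updater fall back to sleeping a jiffy at a time. A userspace sketch of that shape, where readers_active() is a purely illustrative stand-in for srcu_readers_active_idx():

#include <stdbool.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

#define READER_DELAY_US 10      /* mirrors SYNCHRONIZE_SRCU_READER_DELAY */

static int pending = 3;

/* Pretend one reader finishes each time we look. */
static bool readers_active(void)
{
        return pending-- > 0;
}

static void wait_for_readers(void)
{
        struct timespec one_ms = { .tv_sec = 0, .tv_nsec = 1000 * 1000 };

        if (readers_active())
                usleep(READER_DELAY_US);        /* cheap first chance to finish */

        while (readers_active())
                nanosleep(&one_ms, NULL);       /* then block in longer steps   */
}

int main(void)
{
        wait_for_readers();
        printf("all readers done\n");
        return 0;
}
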
diff --git a/kernel/sys.c b/kernel/sys.c
index 7f5a0cd296a9..31b71a276b40 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -43,6 +43,8 @@
43#include <linux/kprobes.h> 43#include <linux/kprobes.h>
44#include <linux/user_namespace.h> 44#include <linux/user_namespace.h>
45 45
46#include <linux/kmsg_dump.h>
47
46#include <asm/uaccess.h> 48#include <asm/uaccess.h>
47#include <asm/io.h> 49#include <asm/io.h>
48#include <asm/unistd.h> 50#include <asm/unistd.h>
@@ -285,6 +287,7 @@ out_unlock:
285 */ 287 */
286void emergency_restart(void) 288void emergency_restart(void)
287{ 289{
290 kmsg_dump(KMSG_DUMP_EMERG);
288 machine_emergency_restart(); 291 machine_emergency_restart();
289} 292}
290EXPORT_SYMBOL_GPL(emergency_restart); 293EXPORT_SYMBOL_GPL(emergency_restart);
@@ -312,6 +315,7 @@ void kernel_restart(char *cmd)
312 printk(KERN_EMERG "Restarting system.\n"); 315 printk(KERN_EMERG "Restarting system.\n");
313 else 316 else
314 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); 317 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
318 kmsg_dump(KMSG_DUMP_RESTART);
315 machine_restart(cmd); 319 machine_restart(cmd);
316} 320}
317EXPORT_SYMBOL_GPL(kernel_restart); 321EXPORT_SYMBOL_GPL(kernel_restart);
@@ -333,6 +337,7 @@ void kernel_halt(void)
333 kernel_shutdown_prepare(SYSTEM_HALT); 337 kernel_shutdown_prepare(SYSTEM_HALT);
334 sysdev_shutdown(); 338 sysdev_shutdown();
335 printk(KERN_EMERG "System halted.\n"); 339 printk(KERN_EMERG "System halted.\n");
340 kmsg_dump(KMSG_DUMP_HALT);
336 machine_halt(); 341 machine_halt();
337} 342}
338 343
@@ -351,6 +356,7 @@ void kernel_power_off(void)
351 disable_nonboot_cpus(); 356 disable_nonboot_cpus();
352 sysdev_shutdown(); 357 sysdev_shutdown();
353 printk(KERN_EMERG "Power down.\n"); 358 printk(KERN_EMERG "Power down.\n");
359 kmsg_dump(KMSG_DUMP_POWEROFF);
354 machine_power_off(); 360 machine_power_off();
355} 361}
356EXPORT_SYMBOL_GPL(kernel_power_off); 362EXPORT_SYMBOL_GPL(kernel_power_off);
@@ -1080,8 +1086,10 @@ SYSCALL_DEFINE0(setsid)
1080 err = session; 1086 err = session;
1081out: 1087out:
1082 write_unlock_irq(&tasklist_lock); 1088 write_unlock_irq(&tasklist_lock);
1083 if (err > 0) 1089 if (err > 0) {
1084 proc_sid_connector(group_leader); 1090 proc_sid_connector(group_leader);
1091 sched_autogroup_create_attach(group_leader);
1092 }
1085 return err; 1093 return err;
1086} 1094}
1087 1095
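
Editor's note: the kmsg_dump(KMSG_DUMP_EMERG/RESTART/HALT/POWEROFF) calls added above give registered dumpers one last look at the log buffer before the machine goes away. A minimal consumer sketch follows, assuming the two-segment dump() callback used by kernels of this generation; my_dumper and my_dump are invented names and the actual persistence step is left as a comment.

#include <linux/module.h>
#include <linux/kmsg_dump.h>

/* Called with the final chunks of the log buffer on shutdown paths. */
static void my_dump(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason,
		    const char *s1, unsigned long l1,
		    const char *s2, unsigned long l2)
{
	if (reason != KMSG_DUMP_RESTART && reason != KMSG_DUMP_HALT &&
	    reason != KMSG_DUMP_POWEROFF)
		return;
	/* Persist s1[0..l1) and s2[0..l2) to flash, an MTD oops area, etc. */
}

static struct kmsg_dumper my_dumper = {
	.dump = my_dump,
};

static int __init my_init(void)
{
	return kmsg_dump_register(&my_dumper);
}

static void __exit my_exit(void)
{
	kmsg_dump_unregister(&my_dumper);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");
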
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c33a1edb799f..bc86bb32e126 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -24,6 +24,7 @@
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/sysctl.h> 25#include <linux/sysctl.h>
26#include <linux/signal.h> 26#include <linux/signal.h>
27#include <linux/printk.h>
27#include <linux/proc_fs.h> 28#include <linux/proc_fs.h>
28#include <linux/security.h> 29#include <linux/security.h>
29#include <linux/ctype.h> 30#include <linux/ctype.h>
@@ -245,10 +246,6 @@ static struct ctl_table root_table[] = {
245 .mode = 0555, 246 .mode = 0555,
246 .child = dev_table, 247 .child = dev_table,
247 }, 248 },
248/*
249 * NOTE: do not add new entries to this table unless you have read
250 * Documentation/sysctl/ctl_unnumbered.txt
251 */
252 { } 249 { }
253}; 250};
254 251
@@ -259,8 +256,6 @@ static int min_wakeup_granularity_ns; /* 0 usecs */
259static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 256static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
260static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; 257static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
261static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; 258static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
262static int min_sched_shares_ratelimit = 100000; /* 100 usec */
263static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
264#endif 259#endif
265 260
266#ifdef CONFIG_COMPACTION 261#ifdef CONFIG_COMPACTION
@@ -305,15 +300,6 @@ static struct ctl_table kern_table[] = {
305 .extra2 = &max_wakeup_granularity_ns, 300 .extra2 = &max_wakeup_granularity_ns,
306 }, 301 },
307 { 302 {
308 .procname = "sched_shares_ratelimit",
309 .data = &sysctl_sched_shares_ratelimit,
310 .maxlen = sizeof(unsigned int),
311 .mode = 0644,
312 .proc_handler = sched_proc_update_handler,
313 .extra1 = &min_sched_shares_ratelimit,
314 .extra2 = &max_sched_shares_ratelimit,
315 },
316 {
317 .procname = "sched_tunable_scaling", 303 .procname = "sched_tunable_scaling",
318 .data = &sysctl_sched_tunable_scaling, 304 .data = &sysctl_sched_tunable_scaling,
319 .maxlen = sizeof(enum sched_tunable_scaling), 305 .maxlen = sizeof(enum sched_tunable_scaling),
@@ -323,14 +309,6 @@ static struct ctl_table kern_table[] = {
323 .extra2 = &max_sched_tunable_scaling, 309 .extra2 = &max_sched_tunable_scaling,
324 }, 310 },
325 { 311 {
326 .procname = "sched_shares_thresh",
327 .data = &sysctl_sched_shares_thresh,
328 .maxlen = sizeof(unsigned int),
329 .mode = 0644,
330 .proc_handler = proc_dointvec_minmax,
331 .extra1 = &zero,
332 },
333 {
334 .procname = "sched_migration_cost", 312 .procname = "sched_migration_cost",
335 .data = &sysctl_sched_migration_cost, 313 .data = &sysctl_sched_migration_cost,
336 .maxlen = sizeof(unsigned int), 314 .maxlen = sizeof(unsigned int),
@@ -352,6 +330,13 @@ static struct ctl_table kern_table[] = {
352 .proc_handler = proc_dointvec, 330 .proc_handler = proc_dointvec,
353 }, 331 },
354 { 332 {
333 .procname = "sched_shares_window",
334 .data = &sysctl_sched_shares_window,
335 .maxlen = sizeof(unsigned int),
336 .mode = 0644,
337 .proc_handler = proc_dointvec,
338 },
339 {
355 .procname = "timer_migration", 340 .procname = "timer_migration",
356 .data = &sysctl_timer_migration, 341 .data = &sysctl_timer_migration,
357 .maxlen = sizeof(unsigned int), 342 .maxlen = sizeof(unsigned int),
@@ -382,6 +367,17 @@ static struct ctl_table kern_table[] = {
382 .mode = 0644, 367 .mode = 0644,
383 .proc_handler = proc_dointvec, 368 .proc_handler = proc_dointvec,
384 }, 369 },
370#ifdef CONFIG_SCHED_AUTOGROUP
371 {
372 .procname = "sched_autogroup_enabled",
373 .data = &sysctl_sched_autogroup_enabled,
374 .maxlen = sizeof(unsigned int),
375 .mode = 0644,
376 .proc_handler = proc_dointvec,
377 .extra1 = &zero,
378 .extra2 = &one,
379 },
380#endif
385#ifdef CONFIG_PROVE_LOCKING 381#ifdef CONFIG_PROVE_LOCKING
386 { 382 {
387 .procname = "prove_locking", 383 .procname = "prove_locking",
@@ -702,6 +698,24 @@ static struct ctl_table kern_table[] = {
702 .extra1 = &zero, 698 .extra1 = &zero,
703 .extra2 = &ten_thousand, 699 .extra2 = &ten_thousand,
704 }, 700 },
701 {
702 .procname = "dmesg_restrict",
703 .data = &dmesg_restrict,
704 .maxlen = sizeof(int),
705 .mode = 0644,
706 .proc_handler = proc_dointvec_minmax,
707 .extra1 = &zero,
708 .extra2 = &one,
709 },
710 {
711 .procname = "kptr_restrict",
712 .data = &kptr_restrict,
713 .maxlen = sizeof(int),
714 .mode = 0644,
715 .proc_handler = proc_dointvec_minmax,
716 .extra1 = &zero,
717 .extra2 = &two,
718 },
705#endif 719#endif
706 { 720 {
707 .procname = "ngroups_max", 721 .procname = "ngroups_max",
@@ -736,21 +750,21 @@ static struct ctl_table kern_table[] = {
736 .extra1 = &zero, 750 .extra1 = &zero,
737 .extra2 = &one, 751 .extra2 = &one,
738 }, 752 },
739#endif
740#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR)
741 { 753 {
742 .procname = "unknown_nmi_panic", 754 .procname = "nmi_watchdog",
743 .data = &unknown_nmi_panic, 755 .data = &watchdog_enabled,
744 .maxlen = sizeof (int), 756 .maxlen = sizeof (int),
745 .mode = 0644, 757 .mode = 0644,
746 .proc_handler = proc_dointvec, 758 .proc_handler = proc_dowatchdog_enabled,
747 }, 759 },
760#endif
761#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
748 { 762 {
749 .procname = "nmi_watchdog", 763 .procname = "unknown_nmi_panic",
750 .data = &nmi_watchdog_enabled, 764 .data = &unknown_nmi_panic,
751 .maxlen = sizeof (int), 765 .maxlen = sizeof (int),
752 .mode = 0644, 766 .mode = 0644,
753 .proc_handler = proc_nmi_enabled, 767 .proc_handler = proc_dointvec,
754 }, 768 },
755#endif 769#endif
756#if defined(CONFIG_X86) 770#if defined(CONFIG_X86)
@@ -954,10 +968,6 @@ static struct ctl_table kern_table[] = {
954 .proc_handler = proc_dointvec, 968 .proc_handler = proc_dointvec,
955 }, 969 },
956#endif 970#endif
957/*
958 * NOTE: do not add new entries to this table unless you have read
959 * Documentation/sysctl/ctl_unnumbered.txt
960 */
961 { } 971 { }
962}; 972};
963 973
@@ -1318,11 +1328,6 @@ static struct ctl_table vm_table[] = {
1318 .extra2 = &one, 1328 .extra2 = &one,
1319 }, 1329 },
1320#endif 1330#endif
1321
1322/*
1323 * NOTE: do not add new entries to this table unless you have read
1324 * Documentation/sysctl/ctl_unnumbered.txt
1325 */
1326 { } 1331 { }
1327}; 1332};
1328 1333
@@ -1478,10 +1483,6 @@ static struct ctl_table fs_table[] = {
1478 .proc_handler = &pipe_proc_fn, 1483 .proc_handler = &pipe_proc_fn,
1479 .extra1 = &pipe_min_size, 1484 .extra1 = &pipe_min_size,
1480 }, 1485 },
1481/*
1482 * NOTE: do not add new entries to this table unless you have read
1483 * Documentation/sysctl/ctl_unnumbered.txt
1484 */
1485 { } 1486 { }
1486}; 1487};
1487 1488
@@ -2891,7 +2892,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
2891 } 2892 }
2892} 2893}
2893 2894
2894#else /* CONFIG_PROC_FS */ 2895#else /* CONFIG_PROC_SYSCTL */
2895 2896
2896int proc_dostring(struct ctl_table *table, int write, 2897int proc_dostring(struct ctl_table *table, int write,
2897 void __user *buffer, size_t *lenp, loff_t *ppos) 2898 void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -2943,7 +2944,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2943} 2944}
2944 2945
2945 2946
2946#endif /* CONFIG_PROC_FS */ 2947#endif /* CONFIG_PROC_SYSCTL */
2947 2948
2948/* 2949/*
2949 * No sense putting this after each symbol definition, twice, 2950 * No sense putting this after each symbol definition, twice,
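
Editor's note: the new dmesg_restrict, kptr_restrict and sched_autogroup_enabled entries above all follow the same pattern: proc_dointvec_minmax with extra1/extra2 bounds, so out-of-range writes from /proc/sys are refused. A hedged sketch of the same pattern for an out-of-tree boolean knob under /proc/sys/kernel; my_knob and the module scaffolding are invented for the example, while register_sysctl_table() and the .child chaining match the interface this tree still uses.

#include <linux/module.h>
#include <linux/sysctl.h>

static int my_knob;
static int zero;
static int one = 1;

static struct ctl_table my_table[] = {
	{
		.procname	= "my_knob",
		.data		= &my_knob,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,	/* writes below 0 are refused */
		.extra2		= &one,		/* writes above 1 are refused */
	},
	{ }
};

static struct ctl_table my_root[] = {
	{
		.procname	= "kernel",
		.mode		= 0555,
		.child		= my_table,
	},
	{ }
};

static struct ctl_table_header *my_header;

static int __init my_init(void)
{
	my_header = register_sysctl_table(my_root);
	return my_header ? 0 : -ENOMEM;
}

static void __exit my_exit(void)
{
	unregister_sysctl_table(my_header);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");

A write such as "echo 2 > /proc/sys/kernel/my_knob" is then rejected, just as kptr_restrict refuses values above 2.
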
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 1357c5786064..b875bedf7c9a 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -136,7 +136,6 @@ static const struct bin_table bin_kern_table[] = {
136 { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, 136 { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" },
137 { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, 137 { CTL_INT, KERN_COMPAT_LOG, "compat-log" },
138 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, 138 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
139 { CTL_INT, KERN_NMI_WATCHDOG, "nmi_watchdog" },
140 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, 139 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
141 {} 140 {}
142}; 141};
@@ -1193,7 +1192,7 @@ static ssize_t bin_dn_node_address(struct file *file,
1193 1192
1194 buf[result] = '\0'; 1193 buf[result] = '\0';
1195 1194
1196 /* Convert the decnet addresss to binary */ 1195 /* Convert the decnet address to binary */
1197 result = -EIO; 1196 result = -EIO;
1198 nodep = strchr(buf, '.') + 1; 1197 nodep = strchr(buf, '.') + 1;
1199 if (!nodep) 1198 if (!nodep)
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index c8231fb15708..3971c6b9d58d 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -89,8 +89,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
89 return -ENOMEM; 89 return -ENOMEM;
90 90
91 if (!info) { 91 if (!info) {
92 int seq = get_cpu_var(taskstats_seqnum)++; 92 int seq = this_cpu_inc_return(taskstats_seqnum) - 1;
93 put_cpu_var(taskstats_seqnum);
94 93
95 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); 94 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
96 } else 95 } else
@@ -349,25 +348,47 @@ static int parse(struct nlattr *na, struct cpumask *mask)
349 return ret; 348 return ret;
350} 349}
351 350
351#if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
352#define TASKSTATS_NEEDS_PADDING 1
353#endif
354
352static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) 355static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
353{ 356{
354 struct nlattr *na, *ret; 357 struct nlattr *na, *ret;
355 int aggr; 358 int aggr;
356 359
357 /* If we don't pad, we end up with alignment on a 4 byte boundary.
358 * This causes lots of runtime warnings on systems requiring 8 byte
359 * alignment */
360 u32 pids[2] = { pid, 0 };
361 int pid_size = ALIGN(sizeof(pid), sizeof(long));
362
363 aggr = (type == TASKSTATS_TYPE_PID) 360 aggr = (type == TASKSTATS_TYPE_PID)
364 ? TASKSTATS_TYPE_AGGR_PID 361 ? TASKSTATS_TYPE_AGGR_PID
365 : TASKSTATS_TYPE_AGGR_TGID; 362 : TASKSTATS_TYPE_AGGR_TGID;
366 363
364 /*
365 * The taskstats structure is internally aligned on 8 byte
 366 * boundaries but the layout of the aggregate reply, with
 367 * two NLA headers and the pid (each 4 bytes), actually
 368 * forces the entire structure to be unaligned. This causes
369 * the kernel to issue unaligned access warnings on some
370 * architectures like ia64. Unfortunately, some software out there
371 * doesn't properly unroll the NLA packet and assumes that the start
372 * of the taskstats structure will always be 20 bytes from the start
373 * of the netlink payload. Aligning the start of the taskstats
374 * structure breaks this software, which we don't want. So, for now
375 * the alignment only happens on architectures that require it
376 * and those users will have to update to fixed versions of those
377 * packages. Space is reserved in the packet only when needed.
378 * This ifdef should be removed in several years e.g. 2012 once
379 * we can be confident that fixed versions are installed on most
380 * systems. We add the padding before the aggregate since the
381 * aggregate is already a defined type.
382 */
383#ifdef TASKSTATS_NEEDS_PADDING
384 if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0)
385 goto err;
386#endif
367 na = nla_nest_start(skb, aggr); 387 na = nla_nest_start(skb, aggr);
368 if (!na) 388 if (!na)
369 goto err; 389 goto err;
370 if (nla_put(skb, type, pid_size, pids) < 0) 390
391 if (nla_put(skb, type, sizeof(pid), &pid) < 0)
371 goto err; 392 goto err;
372 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); 393 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
373 if (!ret) 394 if (!ret)
@@ -456,6 +477,18 @@ out:
456 return rc; 477 return rc;
457} 478}
458 479
480static size_t taskstats_packet_size(void)
481{
482 size_t size;
483
484 size = nla_total_size(sizeof(u32)) +
485 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
486#ifdef TASKSTATS_NEEDS_PADDING
487 size += nla_total_size(0); /* Padding for alignment */
488#endif
489 return size;
490}
491
459static int cmd_attr_pid(struct genl_info *info) 492static int cmd_attr_pid(struct genl_info *info)
460{ 493{
461 struct taskstats *stats; 494 struct taskstats *stats;
@@ -464,8 +497,7 @@ static int cmd_attr_pid(struct genl_info *info)
464 u32 pid; 497 u32 pid;
465 int rc; 498 int rc;
466 499
467 size = nla_total_size(sizeof(u32)) + 500 size = taskstats_packet_size();
468 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
469 501
470 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 502 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
471 if (rc < 0) 503 if (rc < 0)
@@ -494,8 +526,7 @@ static int cmd_attr_tgid(struct genl_info *info)
494 u32 tgid; 526 u32 tgid;
495 int rc; 527 int rc;
496 528
497 size = nla_total_size(sizeof(u32)) + 529 size = taskstats_packet_size();
498 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
499 530
500 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 531 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
501 if (rc < 0) 532 if (rc < 0)
@@ -570,8 +601,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
570 /* 601 /*
571 * Size includes space for nested attributes 602 * Size includes space for nested attributes
572 */ 603 */
573 size = nla_total_size(sizeof(u32)) + 604 size = taskstats_packet_size();
574 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
575 605
576 is_thread_group = !!taskstats_tgid_alloc(tsk); 606 is_thread_group = !!taskstats_tgid_alloc(tsk);
577 if (is_thread_group) { 607 if (is_thread_group) {
@@ -581,7 +611,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
581 fill_tgid_exit(tsk); 611 fill_tgid_exit(tsk);
582 } 612 }
583 613
584 listeners = &__raw_get_cpu_var(listener_array); 614 listeners = __this_cpu_ptr(&listener_array);
585 if (list_empty(&listeners->list)) 615 if (list_empty(&listeners->list))
586 return; 616 return;
587 617
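
Editor's note: the alignment comment above has a userspace consequence: a TASKSTATS_CMD_NEW reply may now begin with a zero-length TASKSTATS_TYPE_NULL pad attribute, so parsers must walk the netlink attribute headers rather than assume the stats sit 20 bytes into the payload. The sketch below shows such a walk over the genetlink payload; find_taskstats and the calling convention are illustrative and error handling is minimal.

#include <stddef.h>
#include <string.h>
#include <linux/netlink.h>
#include <linux/genetlink.h>
#include <linux/taskstats.h>

/*
 * Walk the attributes of a TASKSTATS_CMD_NEW payload (the bytes after
 * the genlmsghdr), descend into the AGGR_PID/AGGR_TGID nest, and copy
 * out the taskstats block, skipping any padding attribute on the way.
 */
static int find_taskstats(const void *payload, int len, struct taskstats *out)
{
	const struct nlattr *na = payload;

	while (len >= NLA_HDRLEN && na->nla_len >= NLA_HDRLEN &&
	       na->nla_len <= len) {
		int type = na->nla_type & NLA_TYPE_MASK;

		if (type == TASKSTATS_TYPE_AGGR_PID ||
		    type == TASKSTATS_TYPE_AGGR_TGID)
			/* Recurse into the nested PID/STATS attributes. */
			return find_taskstats((const char *)na + NLA_HDRLEN,
					      na->nla_len - NLA_HDRLEN, out);

		if (type == TASKSTATS_TYPE_STATS) {
			memcpy(out, (const char *)na + NLA_HDRLEN, sizeof(*out));
			return 0;
		}
		/* TASKSTATS_TYPE_NULL padding and PID/TGID are skipped here. */
		len -= NLA_ALIGN(na->nla_len);
		na = (const struct nlattr *)((const char *)na +
					     NLA_ALIGN(na->nla_len));
	}
	return -1;
}

Because the walk respects nla_len and NLA_ALIGN(), the extra padding attribute is skipped transparently; only code that hard-codes the 20-byte offset mentioned in the comment breaks.
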
diff --git a/kernel/time.c b/kernel/time.c
index ba9b338d1835..32174359576f 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -238,7 +238,7 @@ EXPORT_SYMBOL(current_fs_time);
238 * Avoid unnecessary multiplications/divisions in the 238 * Avoid unnecessary multiplications/divisions in the
239 * two most common HZ cases: 239 * two most common HZ cases:
240 */ 240 */
241unsigned int inline jiffies_to_msecs(const unsigned long j) 241inline unsigned int jiffies_to_msecs(const unsigned long j)
242{ 242{
243#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) 243#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
244 return (MSEC_PER_SEC / HZ) * j; 244 return (MSEC_PER_SEC / HZ) * j;
@@ -254,7 +254,7 @@ unsigned int inline jiffies_to_msecs(const unsigned long j)
254} 254}
255EXPORT_SYMBOL(jiffies_to_msecs); 255EXPORT_SYMBOL(jiffies_to_msecs);
256 256
257unsigned int inline jiffies_to_usecs(const unsigned long j) 257inline unsigned int jiffies_to_usecs(const unsigned long j)
258{ 258{
259#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) 259#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
260 return (USEC_PER_SEC / HZ) * j; 260 return (USEC_PER_SEC / HZ) * j;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c18d7efa1b4b..6519cf62d9cd 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
113 * @shift: pointer to shift variable 113 * @shift: pointer to shift variable
114 * @from: frequency to convert from 114 * @from: frequency to convert from
115 * @to: frequency to convert to 115 * @to: frequency to convert to
116 * @minsec: guaranteed runtime conversion range in seconds 116 * @maxsec: guaranteed runtime conversion range in seconds
117 * 117 *
118 * The function evaluates the shift/mult pair for the scaled math 118 * The function evaluates the shift/mult pair for the scaled math
119 * operations of clocksources and clockevents. 119 * operations of clocksources and clockevents.
@@ -122,7 +122,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
122 * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock 122 * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
123 * event @to is the counter frequency and @from is NSEC_PER_SEC. 123 * event @to is the counter frequency and @from is NSEC_PER_SEC.
124 * 124 *
125 * The @minsec conversion range argument controls the time frame in 125 * The @maxsec conversion range argument controls the time frame in
126 * seconds which must be covered by the runtime conversion with the 126 * seconds which must be covered by the runtime conversion with the
127 * calculated mult and shift factors. This guarantees that no 64bit 127 * calculated mult and shift factors. This guarantees that no 64bit
128 * overflow happens when the input value of the conversion is 128 * overflow happens when the input value of the conversion is
@@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
131 * factors. 131 * factors.
132 */ 132 */
133void 133void
134clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) 134clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
135{ 135{
136 u64 tmp; 136 u64 tmp;
137 u32 sft, sftacc= 32; 137 u32 sft, sftacc= 32;
@@ -140,7 +140,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
140 * Calculate the shift factor which is limiting the conversion 140 * Calculate the shift factor which is limiting the conversion
141 * range: 141 * range:
142 */ 142 */
143 tmp = ((u64)minsec * from) >> 32; 143 tmp = ((u64)maxsec * from) >> 32;
144 while (tmp) { 144 while (tmp) {
145 tmp >>=1; 145 tmp >>=1;
146 sftacc--; 146 sftacc--;
@@ -152,6 +152,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
152 */ 152 */
153 for (sft = 32; sft > 0; sft--) { 153 for (sft = 32; sft > 0; sft--) {
154 tmp = (u64) to << sft; 154 tmp = (u64) to << sft;
155 tmp += from / 2;
155 do_div(tmp, from); 156 do_div(tmp, from);
156 if ((tmp >> sftacc) == 0) 157 if ((tmp >> sftacc) == 0)
157 break; 158 break;
@@ -678,7 +679,7 @@ EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
678int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) 679int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
679{ 680{
680 681
681 /* Intialize mult/shift and max_idle_ns */ 682 /* Initialize mult/shift and max_idle_ns */
682 __clocksource_updatefreq_scale(cs, scale, freq); 683 __clocksource_updatefreq_scale(cs, scale, freq);
683 684
684 /* Add clocksource to the clcoksource list */ 685 /* Add clocksource to the clcoksource list */
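
Editor's note: the one-line change above (tmp += from / 2) rounds the mult computation to nearest instead of truncating, tightening the cycles-to-nanoseconds conversion ns = (cycles * mult) >> shift. A standalone sketch of the same calculation for a 19.2 MHz counter; the frequency and the calc_mult_shift name are just examples.

#include <stdint.h>
#include <stdio.h>

/* Mirror of clocks_calc_mult_shift(): find mult/shift so that
 * (cycles * mult) >> shift converts @from ticks/s into @to ticks/s
 * without 64-bit overflow for intervals up to @maxsec seconds. */
static void calc_mult_shift(uint32_t *mult, uint32_t *shift,
			    uint32_t from, uint32_t to, uint32_t maxsec)
{
	uint64_t tmp;
	uint32_t sft, sftacc = 32;

	tmp = ((uint64_t)maxsec * from) >> 32;
	while (tmp) {
		tmp >>= 1;
		sftacc--;
	}

	for (sft = 32; sft > 0; sft--) {
		tmp = (uint64_t)to << sft;
		tmp += from / 2;		/* the rounding added above */
		tmp /= from;
		if ((tmp >> sftacc) == 0)
			break;
	}
	*mult = (uint32_t)tmp;
	*shift = sft;
}

int main(void)
{
	uint32_t mult, shift;

	calc_mult_shift(&mult, &shift, 19200000, 1000000000, 600);
	/* One second worth of 19.2 MHz cycles should print as ~1e9 ns. */
	printf("mult=%u shift=%u 1s=%llu ns\n", mult, shift,
	       (unsigned long long)(((uint64_t)19200000 * mult) >> shift));
	return 0;
}
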
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index d2321891538f..5c00242fa921 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -14,6 +14,7 @@
14#include <linux/timex.h> 14#include <linux/timex.h>
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/module.h>
17 18
18/* 19/*
19 * NTP timekeeping variables: 20 * NTP timekeeping variables:
@@ -74,6 +75,162 @@ static long time_adjust;
74/* constant (boot-param configurable) NTP tick adjustment (upscaled) */ 75/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
75static s64 ntp_tick_adj; 76static s64 ntp_tick_adj;
76 77
78#ifdef CONFIG_NTP_PPS
79
80/*
81 * The following variables are used when a pulse-per-second (PPS) signal
82 * is available. They establish the engineering parameters of the clock
83 * discipline loop when controlled by the PPS signal.
84 */
85#define PPS_VALID 10 /* PPS signal watchdog max (s) */
86#define PPS_POPCORN 4 /* popcorn spike threshold (shift) */
87#define PPS_INTMIN 2 /* min freq interval (s) (shift) */
88#define PPS_INTMAX 8 /* max freq interval (s) (shift) */
89#define PPS_INTCOUNT 4 /* number of consecutive good intervals to
90 increase pps_shift or consecutive bad
91 intervals to decrease it */
92#define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */
93
94static int pps_valid; /* signal watchdog counter */
95static long pps_tf[3]; /* phase median filter */
96static long pps_jitter; /* current jitter (ns) */
97static struct timespec pps_fbase; /* beginning of the last freq interval */
98static int pps_shift; /* current interval duration (s) (shift) */
99static int pps_intcnt; /* interval counter */
100static s64 pps_freq; /* frequency offset (scaled ns/s) */
101static long pps_stabil; /* current stability (scaled ns/s) */
102
103/*
104 * PPS signal quality monitors
105 */
106static long pps_calcnt; /* calibration intervals */
107static long pps_jitcnt; /* jitter limit exceeded */
108static long pps_stbcnt; /* stability limit exceeded */
109static long pps_errcnt; /* calibration errors */
110
111
112/* PPS kernel consumer compensates the whole phase error immediately.
113 * Otherwise, reduce the offset by a fixed factor times the time constant.
114 */
115static inline s64 ntp_offset_chunk(s64 offset)
116{
117 if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL)
118 return offset;
119 else
120 return shift_right(offset, SHIFT_PLL + time_constant);
121}
122
123static inline void pps_reset_freq_interval(void)
124{
125 /* the PPS calibration interval may end
126 surprisingly early */
127 pps_shift = PPS_INTMIN;
128 pps_intcnt = 0;
129}
130
131/**
132 * pps_clear - Clears the PPS state variables
133 *
134 * Must be called while holding a write on the xtime_lock
135 */
136static inline void pps_clear(void)
137{
138 pps_reset_freq_interval();
139 pps_tf[0] = 0;
140 pps_tf[1] = 0;
141 pps_tf[2] = 0;
142 pps_fbase.tv_sec = pps_fbase.tv_nsec = 0;
143 pps_freq = 0;
144}
145
146/* Decrease pps_valid to indicate that another second has passed since
147 * the last PPS signal. When it reaches 0, indicate that PPS signal is
148 * missing.
149 *
150 * Must be called while holding a write on the xtime_lock
151 */
152static inline void pps_dec_valid(void)
153{
154 if (pps_valid > 0)
155 pps_valid--;
156 else {
157 time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
158 STA_PPSWANDER | STA_PPSERROR);
159 pps_clear();
160 }
161}
162
163static inline void pps_set_freq(s64 freq)
164{
165 pps_freq = freq;
166}
167
168static inline int is_error_status(int status)
169{
170 return (time_status & (STA_UNSYNC|STA_CLOCKERR))
171 /* PPS signal lost when either PPS time or
172 * PPS frequency synchronization requested
173 */
174 || ((time_status & (STA_PPSFREQ|STA_PPSTIME))
175 && !(time_status & STA_PPSSIGNAL))
176 /* PPS jitter exceeded when
177 * PPS time synchronization requested */
178 || ((time_status & (STA_PPSTIME|STA_PPSJITTER))
179 == (STA_PPSTIME|STA_PPSJITTER))
180 /* PPS wander exceeded or calibration error when
181 * PPS frequency synchronization requested
182 */
183 || ((time_status & STA_PPSFREQ)
184 && (time_status & (STA_PPSWANDER|STA_PPSERROR)));
185}
186
187static inline void pps_fill_timex(struct timex *txc)
188{
189 txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) *
190 PPM_SCALE_INV, NTP_SCALE_SHIFT);
191 txc->jitter = pps_jitter;
192 if (!(time_status & STA_NANO))
193 txc->jitter /= NSEC_PER_USEC;
194 txc->shift = pps_shift;
195 txc->stabil = pps_stabil;
196 txc->jitcnt = pps_jitcnt;
197 txc->calcnt = pps_calcnt;
198 txc->errcnt = pps_errcnt;
199 txc->stbcnt = pps_stbcnt;
200}
201
202#else /* !CONFIG_NTP_PPS */
203
204static inline s64 ntp_offset_chunk(s64 offset)
205{
206 return shift_right(offset, SHIFT_PLL + time_constant);
207}
208
209static inline void pps_reset_freq_interval(void) {}
210static inline void pps_clear(void) {}
211static inline void pps_dec_valid(void) {}
212static inline void pps_set_freq(s64 freq) {}
213
214static inline int is_error_status(int status)
215{
216 return status & (STA_UNSYNC|STA_CLOCKERR);
217}
218
219static inline void pps_fill_timex(struct timex *txc)
220{
221 /* PPS is not implemented, so these are zero */
222 txc->ppsfreq = 0;
223 txc->jitter = 0;
224 txc->shift = 0;
225 txc->stabil = 0;
226 txc->jitcnt = 0;
227 txc->calcnt = 0;
228 txc->errcnt = 0;
229 txc->stbcnt = 0;
230}
231
232#endif /* CONFIG_NTP_PPS */
233
77/* 234/*
78 * NTP methods: 235 * NTP methods:
79 */ 236 */
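
Editor's note: everything the PPS block above accumulates is exported through adjtimex(2); pps_fill_timex() (and its !CONFIG_NTP_PPS stub) fills the timex fields that used to be hard-wired to zero. A small read-only query using glibc's adjtimex() wrapper:

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx = { .modes = 0 };	/* modes = 0: read-only query */
	int state = adjtimex(&tx);

	if (state == -1) {
		perror("adjtimex");
		return 1;
	}
	printf("clock state      : %d%s\n", state,
	       state == TIME_ERROR ? " (TIME_ERROR)" : "");
	printf("PPS signal       : %s\n",
	       (tx.status & STA_PPSSIGNAL) ? "present" : "absent");
	printf("PPS jitter       : %ld\n", tx.jitter);
	printf("PPS stability    : %ld\n", tx.stabil);
	printf("calibration intvl: 2^%d s\n", tx.shift);
	printf("calcnt/jitcnt/stbcnt/errcnt: %ld/%ld/%ld/%ld\n",
	       tx.calcnt, tx.jitcnt, tx.stbcnt, tx.errcnt);
	return 0;
}
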
@@ -185,6 +342,9 @@ void ntp_clear(void)
185 342
186 tick_length = tick_length_base; 343 tick_length = tick_length_base;
187 time_offset = 0; 344 time_offset = 0;
345
346 /* Clear PPS state variables */
347 pps_clear();
188} 348}
189 349
190/* 350/*
@@ -250,16 +410,16 @@ void second_overflow(void)
250 time_status |= STA_UNSYNC; 410 time_status |= STA_UNSYNC;
251 } 411 }
252 412
253 /* 413 /* Compute the phase adjustment for the next second */
254 * Compute the phase adjustment for the next second. The offset is
255 * reduced by a fixed factor times the time constant.
256 */
257 tick_length = tick_length_base; 414 tick_length = tick_length_base;
258 415
259 delta = shift_right(time_offset, SHIFT_PLL + time_constant); 416 delta = ntp_offset_chunk(time_offset);
260 time_offset -= delta; 417 time_offset -= delta;
261 tick_length += delta; 418 tick_length += delta;
262 419
420 /* Check PPS signal */
421 pps_dec_valid();
422
263 if (!time_adjust) 423 if (!time_adjust)
264 return; 424 return;
265 425
@@ -369,6 +529,8 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
369 if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { 529 if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
370 time_state = TIME_OK; 530 time_state = TIME_OK;
371 time_status = STA_UNSYNC; 531 time_status = STA_UNSYNC;
532 /* restart PPS frequency calibration */
533 pps_reset_freq_interval();
372 } 534 }
373 535
374 /* 536 /*
@@ -418,6 +580,8 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts
418 time_freq = txc->freq * PPM_SCALE; 580 time_freq = txc->freq * PPM_SCALE;
419 time_freq = min(time_freq, MAXFREQ_SCALED); 581 time_freq = min(time_freq, MAXFREQ_SCALED);
420 time_freq = max(time_freq, -MAXFREQ_SCALED); 582 time_freq = max(time_freq, -MAXFREQ_SCALED);
583 /* update pps_freq */
584 pps_set_freq(time_freq);
421 } 585 }
422 586
423 if (txc->modes & ADJ_MAXERROR) 587 if (txc->modes & ADJ_MAXERROR)
@@ -508,7 +672,8 @@ int do_adjtimex(struct timex *txc)
508 } 672 }
509 673
510 result = time_state; /* mostly `TIME_OK' */ 674 result = time_state; /* mostly `TIME_OK' */
511 if (time_status & (STA_UNSYNC|STA_CLOCKERR)) 675 /* check for errors */
676 if (is_error_status(time_status))
512 result = TIME_ERROR; 677 result = TIME_ERROR;
513 678
514 txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * 679 txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
@@ -522,15 +687,8 @@ int do_adjtimex(struct timex *txc)
522 txc->tick = tick_usec; 687 txc->tick = tick_usec;
523 txc->tai = time_tai; 688 txc->tai = time_tai;
524 689
525 /* PPS is not implemented, so these are zero */ 690 /* fill PPS status fields */
526 txc->ppsfreq = 0; 691 pps_fill_timex(txc);
527 txc->jitter = 0;
528 txc->shift = 0;
529 txc->stabil = 0;
530 txc->jitcnt = 0;
531 txc->calcnt = 0;
532 txc->errcnt = 0;
533 txc->stbcnt = 0;
534 692
535 write_sequnlock_irq(&xtime_lock); 693 write_sequnlock_irq(&xtime_lock);
536 694
@@ -544,6 +702,243 @@ int do_adjtimex(struct timex *txc)
544 return result; 702 return result;
545} 703}
546 704
705#ifdef CONFIG_NTP_PPS
706
707/* actually struct pps_normtime is good old struct timespec, but it is
708 * semantically different (and it is the reason why it was invented):
709 * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ]
710 * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */
711struct pps_normtime {
712 __kernel_time_t sec; /* seconds */
713 long nsec; /* nanoseconds */
714};
715
716/* normalize the timestamp so that nsec is in the
717 ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */
718static inline struct pps_normtime pps_normalize_ts(struct timespec ts)
719{
720 struct pps_normtime norm = {
721 .sec = ts.tv_sec,
722 .nsec = ts.tv_nsec
723 };
724
725 if (norm.nsec > (NSEC_PER_SEC >> 1)) {
726 norm.nsec -= NSEC_PER_SEC;
727 norm.sec++;
728 }
729
730 return norm;
731}
732
733/* get current phase correction and jitter */
734static inline long pps_phase_filter_get(long *jitter)
735{
736 *jitter = pps_tf[0] - pps_tf[1];
737 if (*jitter < 0)
738 *jitter = -*jitter;
739
740 /* TODO: test various filters */
741 return pps_tf[0];
742}
743
744/* add the sample to the phase filter */
745static inline void pps_phase_filter_add(long err)
746{
747 pps_tf[2] = pps_tf[1];
748 pps_tf[1] = pps_tf[0];
749 pps_tf[0] = err;
750}
751
752/* decrease frequency calibration interval length.
753 * It is halved after four consecutive unstable intervals.
754 */
755static inline void pps_dec_freq_interval(void)
756{
757 if (--pps_intcnt <= -PPS_INTCOUNT) {
758 pps_intcnt = -PPS_INTCOUNT;
759 if (pps_shift > PPS_INTMIN) {
760 pps_shift--;
761 pps_intcnt = 0;
762 }
763 }
764}
765
766/* increase frequency calibration interval length.
767 * It is doubled after four consecutive stable intervals.
768 */
769static inline void pps_inc_freq_interval(void)
770{
771 if (++pps_intcnt >= PPS_INTCOUNT) {
772 pps_intcnt = PPS_INTCOUNT;
773 if (pps_shift < PPS_INTMAX) {
774 pps_shift++;
775 pps_intcnt = 0;
776 }
777 }
778}
779
780/* update clock frequency based on MONOTONIC_RAW clock PPS signal
781 * timestamps
782 *
783 * At the end of the calibration interval the difference between the
784 * first and last MONOTONIC_RAW clock timestamps divided by the length
785 * of the interval becomes the frequency update. If the interval was
786 * too long, the data are discarded.
787 * Returns the difference between old and new frequency values.
788 */
789static long hardpps_update_freq(struct pps_normtime freq_norm)
790{
791 long delta, delta_mod;
792 s64 ftemp;
793
794 /* check if the frequency interval was too long */
795 if (freq_norm.sec > (2 << pps_shift)) {
796 time_status |= STA_PPSERROR;
797 pps_errcnt++;
798 pps_dec_freq_interval();
799 pr_err("hardpps: PPSERROR: interval too long - %ld s\n",
800 freq_norm.sec);
801 return 0;
802 }
803
804 /* here the raw frequency offset and wander (stability) is
805 * calculated. If the wander is less than the wander threshold
806 * the interval is increased; otherwise it is decreased.
807 */
808 ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT,
809 freq_norm.sec);
810 delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT);
811 pps_freq = ftemp;
812 if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) {
813 pr_warning("hardpps: PPSWANDER: change=%ld\n", delta);
814 time_status |= STA_PPSWANDER;
815 pps_stbcnt++;
816 pps_dec_freq_interval();
817 } else { /* good sample */
818 pps_inc_freq_interval();
819 }
820
821 /* the stability metric is calculated as the average of recent
822 * frequency changes, but is used only for performance
823 * monitoring
824 */
825 delta_mod = delta;
826 if (delta_mod < 0)
827 delta_mod = -delta_mod;
828 pps_stabil += (div_s64(((s64)delta_mod) <<
829 (NTP_SCALE_SHIFT - SHIFT_USEC),
830 NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN;
831
832 /* if enabled, the system clock frequency is updated */
833 if ((time_status & STA_PPSFREQ) != 0 &&
834 (time_status & STA_FREQHOLD) == 0) {
835 time_freq = pps_freq;
836 ntp_update_frequency();
837 }
838
839 return delta;
840}
841
842/* correct REALTIME clock phase error against PPS signal */
843static void hardpps_update_phase(long error)
844{
845 long correction = -error;
846 long jitter;
847
848 /* add the sample to the median filter */
849 pps_phase_filter_add(correction);
850 correction = pps_phase_filter_get(&jitter);
851
852 /* Nominal jitter is due to PPS signal noise. If it exceeds the
853 * threshold, the sample is discarded; otherwise, if so enabled,
854 * the time offset is updated.
855 */
856 if (jitter > (pps_jitter << PPS_POPCORN)) {
857 pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
858 jitter, (pps_jitter << PPS_POPCORN));
859 time_status |= STA_PPSJITTER;
860 pps_jitcnt++;
861 } else if (time_status & STA_PPSTIME) {
862 /* correct the time using the phase offset */
863 time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT,
864 NTP_INTERVAL_FREQ);
865 /* cancel running adjtime() */
866 time_adjust = 0;
867 }
868 /* update jitter */
869 pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN;
870}
871
872/*
873 * hardpps() - discipline CPU clock oscillator to external PPS signal
874 *
875 * This routine is called at each PPS signal arrival in order to
876 * discipline the CPU clock oscillator to the PPS signal. It takes two
877 * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former
878 * is used to correct clock phase error and the latter is used to
879 * correct the frequency.
880 *
881 * This code is based on David Mills's reference nanokernel
882 * implementation. It was mostly rewritten but keeps the same idea.
883 */
884void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
885{
886 struct pps_normtime pts_norm, freq_norm;
887 unsigned long flags;
888
889 pts_norm = pps_normalize_ts(*phase_ts);
890
891 write_seqlock_irqsave(&xtime_lock, flags);
892
893 /* clear the error bits, they will be set again if needed */
894 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
895
896 /* indicate signal presence */
897 time_status |= STA_PPSSIGNAL;
898 pps_valid = PPS_VALID;
899
900 /* when called for the first time,
901 * just start the frequency interval */
902 if (unlikely(pps_fbase.tv_sec == 0)) {
903 pps_fbase = *raw_ts;
904 write_sequnlock_irqrestore(&xtime_lock, flags);
905 return;
906 }
907
908 /* ok, now we have a base for frequency calculation */
909 freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase));
910
911 /* check that the signal is in the range
912 * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */
913 if ((freq_norm.sec == 0) ||
914 (freq_norm.nsec > MAXFREQ * freq_norm.sec) ||
915 (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) {
916 time_status |= STA_PPSJITTER;
917 /* restart the frequency calibration interval */
918 pps_fbase = *raw_ts;
919 write_sequnlock_irqrestore(&xtime_lock, flags);
920 pr_err("hardpps: PPSJITTER: bad pulse\n");
921 return;
922 }
923
924 /* signal is ok */
925
926 /* check if the current frequency interval is finished */
927 if (freq_norm.sec >= (1 << pps_shift)) {
928 pps_calcnt++;
929 /* restart the frequency calibration interval */
930 pps_fbase = *raw_ts;
931 hardpps_update_freq(freq_norm);
932 }
933
934 hardpps_update_phase(pts_norm.nsec);
935
936 write_sequnlock_irqrestore(&xtime_lock, flags);
937}
938EXPORT_SYMBOL(hardpps);
939
940#endif /* CONFIG_NTP_PPS */
941
547static int __init ntp_tick_adj_setup(char *str) 942static int __init ntp_tick_adj_setup(char *str)
548{ 943{
549 ntp_tick_adj = simple_strtol(str, NULL, 0); 944 ntp_tick_adj = simple_strtol(str, NULL, 0);
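
Editor's note: the pps_normtime representation introduced above is what keeps the frequency math simple. After normalization, nsec is the signed error of the pulse against a whole second, so a calibration interval measured as 4.9999995 s becomes {sec = 5, nsec = -500} and the raw frequency offset is just -nsec/sec. A self-contained illustration; the names mirror the patch but this is not kernel code.

#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000L

struct pps_normtime { time_t sec; long nsec; };

/* Shift tv_nsec from [0, 1s) into (-0.5s, 0.5s], carrying into sec. */
static struct pps_normtime pps_normalize_ts(struct timespec ts)
{
	struct pps_normtime norm = { ts.tv_sec, ts.tv_nsec };

	if (norm.nsec > (NSEC_PER_SEC >> 1)) {
		norm.nsec -= NSEC_PER_SEC;
		norm.sec++;
	}
	return norm;
}

int main(void)
{
	/* Raw length of a nominal 5 s calibration interval, 500 ns short. */
	struct timespec raw = { 4, 999999500 };
	struct pps_normtime n = pps_normalize_ts(raw);

	/* The clock counted 500 ns too few over 5 s, so it runs slow;
	 * the correction works out to +100 ns/s (+0.1 ppm). */
	printf("normalized: %ld s %+ld ns, freq correction %+ld ns/s\n",
	       (long)n.sec, n.nsec, -n.nsec / (long)n.sec);
	return 0;
}
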
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index b6b898d2eeef..051bc80a0c43 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -49,7 +49,7 @@ struct tick_device *tick_get_device(int cpu)
49 */ 49 */
50int tick_is_oneshot_available(void) 50int tick_is_oneshot_available(void)
51{ 51{
52 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 52 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
53 53
54 return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); 54 return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT);
55} 55}
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index aada0e52680a..5cbc101f908b 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -95,7 +95,7 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
95 */ 95 */
96int tick_program_event(ktime_t expires, int force) 96int tick_program_event(ktime_t expires, int force)
97{ 97{
98 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 98 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
99 99
100 return tick_dev_program_event(dev, expires, force); 100 return tick_dev_program_event(dev, expires, force);
101} 101}
@@ -167,7 +167,7 @@ int tick_oneshot_mode_active(void)
167 int ret; 167 int ret;
168 168
169 local_irq_save(flags); 169 local_irq_save(flags);
170 ret = __get_cpu_var(tick_cpu_device).mode == TICKDEV_MODE_ONESHOT; 170 ret = __this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT;
171 local_irq_restore(flags); 171 local_irq_restore(flags);
172 172
173 return ret; 173 return ret;
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
index ac38fbb176cc..a9ae369925ce 100644
--- a/kernel/time/timecompare.c
+++ b/kernel/time/timecompare.c
@@ -21,6 +21,7 @@
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/slab.h> 22#include <linux/slab.h>
23#include <linux/math64.h> 23#include <linux/math64.h>
24#include <linux/kernel.h>
24 25
25/* 26/*
26 * fixed point arithmetic scale factor for skew 27 * fixed point arithmetic scale factor for skew
@@ -57,11 +58,11 @@ int timecompare_offset(struct timecompare *sync,
57 int index; 58 int index;
58 int num_samples = sync->num_samples; 59 int num_samples = sync->num_samples;
59 60
60 if (num_samples > sizeof(buffer)/sizeof(buffer[0])) { 61 if (num_samples > ARRAY_SIZE(buffer)) {
61 samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); 62 samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC);
62 if (!samples) { 63 if (!samples) {
63 samples = buffer; 64 samples = buffer;
64 num_samples = sizeof(buffer)/sizeof(buffer[0]); 65 num_samples = ARRAY_SIZE(buffer);
65 } 66 }
66 } else { 67 } else {
67 samples = buffer; 68 samples = buffer;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 49010d822f72..d27c7562902c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -32,6 +32,8 @@ struct timekeeper {
32 cycle_t cycle_interval; 32 cycle_t cycle_interval;
33 /* Number of clock shifted nano seconds in one NTP interval. */ 33 /* Number of clock shifted nano seconds in one NTP interval. */
34 u64 xtime_interval; 34 u64 xtime_interval;
35 /* shifted nano seconds left over when rounding cycle_interval */
36 s64 xtime_remainder;
35 /* Raw nano seconds accumulated per NTP interval. */ 37 /* Raw nano seconds accumulated per NTP interval. */
36 u32 raw_interval; 38 u32 raw_interval;
37 39
@@ -47,7 +49,7 @@ struct timekeeper {
47 u32 mult; 49 u32 mult;
48}; 50};
49 51
50struct timekeeper timekeeper; 52static struct timekeeper timekeeper;
51 53
52/** 54/**
53 * timekeeper_setup_internals - Set up internals to use clocksource clock. 55 * timekeeper_setup_internals - Set up internals to use clocksource clock.
@@ -62,7 +64,7 @@ struct timekeeper timekeeper;
62static void timekeeper_setup_internals(struct clocksource *clock) 64static void timekeeper_setup_internals(struct clocksource *clock)
63{ 65{
64 cycle_t interval; 66 cycle_t interval;
65 u64 tmp; 67 u64 tmp, ntpinterval;
66 68
67 timekeeper.clock = clock; 69 timekeeper.clock = clock;
68 clock->cycle_last = clock->read(clock); 70 clock->cycle_last = clock->read(clock);
@@ -70,6 +72,7 @@ static void timekeeper_setup_internals(struct clocksource *clock)
70 /* Do the ns -> cycle conversion first, using original mult */ 72 /* Do the ns -> cycle conversion first, using original mult */
71 tmp = NTP_INTERVAL_LENGTH; 73 tmp = NTP_INTERVAL_LENGTH;
72 tmp <<= clock->shift; 74 tmp <<= clock->shift;
75 ntpinterval = tmp;
73 tmp += clock->mult/2; 76 tmp += clock->mult/2;
74 do_div(tmp, clock->mult); 77 do_div(tmp, clock->mult);
75 if (tmp == 0) 78 if (tmp == 0)
@@ -80,6 +83,7 @@ static void timekeeper_setup_internals(struct clocksource *clock)
80 83
81 /* Go back from cycles -> shifted ns */ 84 /* Go back from cycles -> shifted ns */
82 timekeeper.xtime_interval = (u64) interval * clock->mult; 85 timekeeper.xtime_interval = (u64) interval * clock->mult;
86 timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval;
83 timekeeper.raw_interval = 87 timekeeper.raw_interval =
84 ((u64) interval * clock->mult) >> clock->shift; 88 ((u64) interval * clock->mult) >> clock->shift;
85 89
@@ -160,7 +164,7 @@ static struct timespec total_sleep_time;
160/* 164/*
161 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. 165 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
162 */ 166 */
163struct timespec raw_time; 167static struct timespec raw_time;
164 168
165/* flag for if timekeeping is suspended */ 169/* flag for if timekeeping is suspended */
166int __read_mostly timekeeping_suspended; 170int __read_mostly timekeeping_suspended;
@@ -284,6 +288,49 @@ void ktime_get_ts(struct timespec *ts)
284} 288}
285EXPORT_SYMBOL_GPL(ktime_get_ts); 289EXPORT_SYMBOL_GPL(ktime_get_ts);
286 290
291#ifdef CONFIG_NTP_PPS
292
293/**
294 * getnstime_raw_and_real - get day and raw monotonic time in timespec format
295 * @ts_raw: pointer to the timespec to be set to raw monotonic time
296 * @ts_real: pointer to the timespec to be set to the time of day
297 *
298 * This function reads both the time of day and raw monotonic time at the
299 * same time atomically and stores the resulting timestamps in timespec
300 * format.
301 */
302void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
303{
304 unsigned long seq;
305 s64 nsecs_raw, nsecs_real;
306
307 WARN_ON_ONCE(timekeeping_suspended);
308
309 do {
310 u32 arch_offset;
311
312 seq = read_seqbegin(&xtime_lock);
313
314 *ts_raw = raw_time;
315 *ts_real = xtime;
316
317 nsecs_raw = timekeeping_get_ns_raw();
318 nsecs_real = timekeeping_get_ns();
319
320 /* If arch requires, add in gettimeoffset() */
321 arch_offset = arch_gettimeoffset();
322 nsecs_raw += arch_offset;
323 nsecs_real += arch_offset;
324
325 } while (read_seqretry(&xtime_lock, seq));
326
327 timespec_add_ns(ts_raw, nsecs_raw);
328 timespec_add_ns(ts_real, nsecs_real);
329}
330EXPORT_SYMBOL(getnstime_raw_and_real);
331
332#endif /* CONFIG_NTP_PPS */
333
287/** 334/**
288 * do_gettimeofday - Returns the time of day in a timeval 335 * do_gettimeofday - Returns the time of day in a timeval
289 * @tv: pointer to the timeval to be set 336 * @tv: pointer to the timeval to be set
@@ -719,7 +766,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
719 766
720 /* Accumulate error between NTP and clock interval */ 767 /* Accumulate error between NTP and clock interval */
721 timekeeper.ntp_error += tick_length << shift; 768 timekeeper.ntp_error += tick_length << shift;
722 timekeeper.ntp_error -= timekeeper.xtime_interval << 769 timekeeper.ntp_error -=
770 (timekeeper.xtime_interval + timekeeper.xtime_remainder) <<
723 (timekeeper.ntp_error_shift + shift); 771 (timekeeper.ntp_error_shift + shift);
724 772
725 return offset; 773 return offset;
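
Editor's note: getnstime_raw_and_real() above is a textbook seqlock reader: both timestamps are sampled inside one read_seqbegin()/read_seqretry() window, so they are guaranteed to describe the same timekeeper update. Below is a C11 userspace analogue of that reader side; struct seq_clock and its fields are invented for the sketch, and the kernel version additionally folds in arch_gettimeoffset() as shown above.

#include <stdatomic.h>
#include <time.h>

struct seq_clock {
	atomic_uint seq;	/* odd while the writer is updating */
	struct timespec raw;	/* protected data */
	struct timespec real;
};

/* Reader: retry until a whole, consistent snapshot has been observed. */
static void read_both(struct seq_clock *c,
		      struct timespec *raw, struct timespec *real)
{
	unsigned int seq;

	for (;;) {
		seq = atomic_load_explicit(&c->seq, memory_order_acquire);
		if (seq & 1)
			continue;	/* writer in progress, try again */
		*raw  = c->raw;
		*real = c->real;
		atomic_thread_fence(memory_order_acquire);
		if (atomic_load_explicit(&c->seq, memory_order_relaxed) == seq)
			break;		/* no writer interleaved: snapshot ok */
	}
}
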
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index ab8f5e33fa92..32a19f9397fc 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -79,26 +79,26 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
79{ 79{
80 struct hrtimer *timer, tmp; 80 struct hrtimer *timer, tmp;
81 unsigned long next = 0, i; 81 unsigned long next = 0, i;
82 struct rb_node *curr; 82 struct timerqueue_node *curr;
83 unsigned long flags; 83 unsigned long flags;
84 84
85next_one: 85next_one:
86 i = 0; 86 i = 0;
87 raw_spin_lock_irqsave(&base->cpu_base->lock, flags); 87 raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
88 88
89 curr = base->first; 89 curr = timerqueue_getnext(&base->active);
90 /* 90 /*
91 * Crude but we have to do this O(N*N) thing, because 91 * Crude but we have to do this O(N*N) thing, because
92 * we have to unlock the base when printing: 92 * we have to unlock the base when printing:
93 */ 93 */
94 while (curr && i < next) { 94 while (curr && i < next) {
95 curr = rb_next(curr); 95 curr = timerqueue_iterate_next(curr);
96 i++; 96 i++;
97 } 97 }
98 98
99 if (curr) { 99 if (curr) {
100 100
101 timer = rb_entry(curr, struct hrtimer, node); 101 timer = container_of(curr, struct hrtimer, node);
102 tmp = *timer; 102 tmp = *timer;
103 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); 103 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
104 104
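
Editor's note: the timer_list.c hunk swaps the rb_node walk for the new timerqueue API; the only non-obvious piece is container_of(), which recovers the hrtimer from its embedded timerqueue_node. A tiny userspace illustration of the same pointer arithmetic, with a simplified container_of and made-up struct names:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct node { struct node *next; };

struct item {
	int value;
	struct node node;	/* embedded, like hrtimer.node */
};

int main(void)
{
	struct item it = { .value = 42 };
	struct node *n = &it.node;	/* what an iterator hands back */

	/* Subtracting the member offset recovers the containing object. */
	struct item *owner = container_of(n, struct item, node);
	printf("value = %d\n", owner->value);
	return 0;
}
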
diff --git a/kernel/timer.c b/kernel/timer.c
index 68a9ae7679b7..43ca9936f2d0 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -88,18 +88,6 @@ struct tvec_base boot_tvec_bases;
88EXPORT_SYMBOL(boot_tvec_bases); 88EXPORT_SYMBOL(boot_tvec_bases);
89static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; 89static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
90 90
91/*
92 * Note that all tvec_bases are 2 byte aligned and lower bit of
93 * base in timer_list is guaranteed to be zero. Use the LSB to
94 * indicate whether the timer is deferrable.
95 *
96 * A deferrable timer will work normally when the system is busy, but
97 * will not cause a CPU to come out of idle just to service it; instead,
98 * the timer will be serviced when the CPU eventually wakes up with a
99 * subsequent non-deferrable timer.
100 */
101#define TBASE_DEFERRABLE_FLAG (0x1)
102
103/* Functions below help us manage 'deferrable' flag */ 91/* Functions below help us manage 'deferrable' flag */
104static inline unsigned int tbase_get_deferrable(struct tvec_base *base) 92static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
105{ 93{
@@ -113,8 +101,7 @@ static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
113 101
114static inline void timer_set_deferrable(struct timer_list *timer) 102static inline void timer_set_deferrable(struct timer_list *timer)
115{ 103{
116 timer->base = ((struct tvec_base *)((unsigned long)(timer->base) | 104 timer->base = TBASE_MAKE_DEFERRED(timer->base);
117 TBASE_DEFERRABLE_FLAG));
118} 105}
119 106
120static inline void 107static inline void
@@ -343,15 +330,6 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
343} 330}
344EXPORT_SYMBOL_GPL(set_timer_slack); 331EXPORT_SYMBOL_GPL(set_timer_slack);
345 332
346
347static inline void set_running_timer(struct tvec_base *base,
348 struct timer_list *timer)
349{
350#ifdef CONFIG_SMP
351 base->running_timer = timer;
352#endif
353}
354
355static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) 333static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
356{ 334{
357 unsigned long expires = timer->expires; 335 unsigned long expires = timer->expires;
@@ -936,15 +914,12 @@ int del_timer(struct timer_list *timer)
936} 914}
937EXPORT_SYMBOL(del_timer); 915EXPORT_SYMBOL(del_timer);
938 916
939#ifdef CONFIG_SMP
940/** 917/**
941 * try_to_del_timer_sync - Try to deactivate a timer 918 * try_to_del_timer_sync - Try to deactivate a timer
942 * @timer: timer do del 919 * @timer: timer do del
943 * 920 *
944 * This function tries to deactivate a timer. Upon successful (ret >= 0) 921 * This function tries to deactivate a timer. Upon successful (ret >= 0)
945 * exit the timer is not queued and the handler is not running on any CPU. 922 * exit the timer is not queued and the handler is not running on any CPU.
946 *
947 * It must not be called from interrupt contexts.
948 */ 923 */
949int try_to_del_timer_sync(struct timer_list *timer) 924int try_to_del_timer_sync(struct timer_list *timer)
950{ 925{
@@ -973,6 +948,7 @@ out:
973} 948}
974EXPORT_SYMBOL(try_to_del_timer_sync); 949EXPORT_SYMBOL(try_to_del_timer_sync);
975 950
951#ifdef CONFIG_SMP
976/** 952/**
977 * del_timer_sync - deactivate a timer and wait for the handler to finish. 953 * del_timer_sync - deactivate a timer and wait for the handler to finish.
978 * @timer: the timer to be deactivated 954 * @timer: the timer to be deactivated
@@ -983,7 +959,7 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
983 * 959 *
984 * Synchronization rules: Callers must prevent restarting of the timer, 960 * Synchronization rules: Callers must prevent restarting of the timer,
985 * otherwise this function is meaningless. It must not be called from 961 * otherwise this function is meaningless. It must not be called from
986 * interrupt contexts. The caller must not hold locks which would prevent 962 * hardirq contexts. The caller must not hold locks which would prevent
987 * completion of the timer's handler. The timer's handler must not call 963 * completion of the timer's handler. The timer's handler must not call
988 * add_timer_on(). Upon exit the timer is not queued and the handler is 964 * add_timer_on(). Upon exit the timer is not queued and the handler is
989 * not running on any CPU. 965 * not running on any CPU.
@@ -993,14 +969,16 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
993int del_timer_sync(struct timer_list *timer) 969int del_timer_sync(struct timer_list *timer)
994{ 970{
995#ifdef CONFIG_LOCKDEP 971#ifdef CONFIG_LOCKDEP
996 unsigned long flags; 972 local_bh_disable();
997
998 local_irq_save(flags);
999 lock_map_acquire(&timer->lockdep_map); 973 lock_map_acquire(&timer->lockdep_map);
1000 lock_map_release(&timer->lockdep_map); 974 lock_map_release(&timer->lockdep_map);
1001 local_irq_restore(flags); 975 local_bh_enable();
1002#endif 976#endif
1003 977 /*
978 * don't use it in hardirq context, because it
979 * could lead to deadlock.
980 */
981 WARN_ON(in_irq());
1004 for (;;) { 982 for (;;) {
1005 int ret = try_to_del_timer_sync(timer); 983 int ret = try_to_del_timer_sync(timer);
1006 if (ret >= 0) 984 if (ret >= 0)
@@ -1111,7 +1089,7 @@ static inline void __run_timers(struct tvec_base *base)
1111 1089
1112 timer_stats_account_timer(timer); 1090 timer_stats_account_timer(timer);
1113 1091
1114 set_running_timer(base, timer); 1092 base->running_timer = timer;
1115 detach_timer(timer, 1); 1093 detach_timer(timer, 1);
1116 1094
1117 spin_unlock_irq(&base->lock); 1095 spin_unlock_irq(&base->lock);
@@ -1119,7 +1097,7 @@ static inline void __run_timers(struct tvec_base *base)
1119 spin_lock_irq(&base->lock); 1097 spin_lock_irq(&base->lock);
1120 } 1098 }
1121 } 1099 }
1122 set_running_timer(base, NULL); 1100 base->running_timer = NULL;
1123 spin_unlock_irq(&base->lock); 1101 spin_unlock_irq(&base->lock);
1124} 1102}
1125 1103
@@ -1249,9 +1227,15 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
1249 */ 1227 */
1250unsigned long get_next_timer_interrupt(unsigned long now) 1228unsigned long get_next_timer_interrupt(unsigned long now)
1251{ 1229{
1252 struct tvec_base *base = __get_cpu_var(tvec_bases); 1230 struct tvec_base *base = __this_cpu_read(tvec_bases);
1253 unsigned long expires; 1231 unsigned long expires;
1254 1232
1233 /*
1234 * Pretend that there is no timer pending if the cpu is offline.
1235 * Possible pending timers will be migrated later to an active cpu.
1236 */
1237 if (cpu_is_offline(smp_processor_id()))
1238 return now + NEXT_TIMER_MAX_DELTA;
1255 spin_lock(&base->lock); 1239 spin_lock(&base->lock);
1256 if (time_before_eq(base->next_timer, base->timer_jiffies)) 1240 if (time_before_eq(base->next_timer, base->timer_jiffies))
1257 base->next_timer = __next_timer_interrupt(base); 1241 base->next_timer = __next_timer_interrupt(base);
@@ -1292,7 +1276,7 @@ void update_process_times(int user_tick)
1292 */ 1276 */
1293static void run_timer_softirq(struct softirq_action *h) 1277static void run_timer_softirq(struct softirq_action *h)
1294{ 1278{
1295 struct tvec_base *base = __get_cpu_var(tvec_bases); 1279 struct tvec_base *base = __this_cpu_read(tvec_bases);
1296 1280
1297 hrtimer_run_pending(); 1281 hrtimer_run_pending();
1298 1282
@@ -1319,7 +1303,7 @@ void do_timer(unsigned long ticks)
1319{ 1303{
1320 jiffies_64 += ticks; 1304 jiffies_64 += ticks;
1321 update_wall_time(); 1305 update_wall_time();
1322 calc_global_load(); 1306 calc_global_load(ticks);
1323} 1307}
1324 1308
1325#ifdef __ARCH_WANT_SYS_ALARM 1309#ifdef __ARCH_WANT_SYS_ALARM
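
Editor's note: the reworked del_timer_sync() above drops the pretence of hardirq safety; the lockdep annotation now runs with only BH disabled and WARN_ON(in_irq()) documents that calling it from hard interrupt context can deadlock, since the handler may be running on another CPU while spinning on a lock the caller holds. A sketch of the intended teardown pattern for a self-rearming timer, assuming the setup_timer()/unsigned-long callback API of this era; my_dev and its fields are illustrative.

#include <linux/module.h>
#include <linux/timer.h>
#include <linux/jiffies.h>

struct my_dev {
	struct timer_list poll_timer;
	bool shutting_down;
};

static void my_poll(unsigned long data)
{
	struct my_dev *dev = (struct my_dev *)data;

	/* ... periodic work ... */

	/* Re-arm only while the device is live, so teardown can win. */
	if (!dev->shutting_down)
		mod_timer(&dev->poll_timer, jiffies + HZ);
}

static void my_start(struct my_dev *dev)
{
	dev->shutting_down = false;
	setup_timer(&dev->poll_timer, my_poll, (unsigned long)dev);
	mod_timer(&dev->poll_timer, jiffies + HZ);
}

static void my_stop(struct my_dev *dev)
{
	/* Process context, and no locks the handler takes: both required. */
	dev->shutting_down = true;
	del_timer_sync(&dev->poll_timer);
}
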
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e04b8bcdef88..14674dce77a6 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -69,6 +69,21 @@ config EVENT_TRACING
69 select CONTEXT_SWITCH_TRACER 69 select CONTEXT_SWITCH_TRACER
70 bool 70 bool
71 71
72config EVENT_POWER_TRACING_DEPRECATED
73 depends on EVENT_TRACING
74 bool "Deprecated power event trace API, to be removed"
75 default y
76 help
77 Provides old power event types:
78 C-state/idle accounting events:
79 power:power_start
80 power:power_end
81 and old cpufreq accounting event:
82 power:power_frequency
83 This is for userspace compatibility
84 and will vanish after 5 kernel iterations,
85 namely 2.6.41.
86
72config CONTEXT_SWITCH_TRACER 87config CONTEXT_SWITCH_TRACER
73 bool 88 bool
74 89
@@ -126,7 +141,7 @@ if FTRACE
126config FUNCTION_TRACER 141config FUNCTION_TRACER
127 bool "Kernel Function Tracer" 142 bool "Kernel Function Tracer"
128 depends on HAVE_FUNCTION_TRACER 143 depends on HAVE_FUNCTION_TRACER
129 select FRAME_POINTER if (!ARM_UNWIND) 144 select FRAME_POINTER if !ARM_UNWIND && !S390
130 select KALLSYMS 145 select KALLSYMS
131 select GENERIC_TRACER 146 select GENERIC_TRACER
132 select CONTEXT_SWITCH_TRACER 147 select CONTEXT_SWITCH_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 53f338190b26..761c510a06c5 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
52endif 52endif
53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
55obj-$(CONFIG_EVENT_TRACING) += power-traces.o 55obj-$(CONFIG_TRACEPOINTS) += power-traces.o
56ifeq ($(CONFIG_TRACING),y) 56ifeq ($(CONFIG_TRACING),y)
57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o 57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
58endif 58endif
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index bc251ed66724..153562d0b93c 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -168,7 +168,6 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
168static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), 168static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
169 BLK_TC_ACT(BLK_TC_WRITE) }; 169 BLK_TC_ACT(BLK_TC_WRITE) };
170 170
171#define BLK_TC_HARDBARRIER BLK_TC_BARRIER
172#define BLK_TC_RAHEAD BLK_TC_AHEAD 171#define BLK_TC_RAHEAD BLK_TC_AHEAD
173 172
174/* The ilog2() calls fall out because they're constant */ 173/* The ilog2() calls fall out because they're constant */
@@ -196,7 +195,6 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
196 return; 195 return;
197 196
198 what |= ddir_act[rw & WRITE]; 197 what |= ddir_act[rw & WRITE];
199 what |= MASK_TC_BIT(rw, HARDBARRIER);
200 what |= MASK_TC_BIT(rw, SYNC); 198 what |= MASK_TC_BIT(rw, SYNC);
201 what |= MASK_TC_BIT(rw, RAHEAD); 199 what |= MASK_TC_BIT(rw, RAHEAD);
202 what |= MASK_TC_BIT(rw, META); 200 what |= MASK_TC_BIT(rw, META);
@@ -760,53 +758,58 @@ static void blk_add_trace_rq_complete(void *ignore,
760 * @q: queue the io is for 758 * @q: queue the io is for
761 * @bio: the source bio 759 * @bio: the source bio
762 * @what: the action 760 * @what: the action
761 * @error: error, if any
763 * 762 *
764 * Description: 763 * Description:
765 * Records an action against a bio. Will log the bio offset + size. 764 * Records an action against a bio. Will log the bio offset + size.
766 * 765 *
767 **/ 766 **/
768static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, 767static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
769 u32 what) 768 u32 what, int error)
770{ 769{
771 struct blk_trace *bt = q->blk_trace; 770 struct blk_trace *bt = q->blk_trace;
772 771
773 if (likely(!bt)) 772 if (likely(!bt))
774 return; 773 return;
775 774
775 if (!error && !bio_flagged(bio, BIO_UPTODATE))
776 error = EIO;
777
776 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, 778 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
777 !bio_flagged(bio, BIO_UPTODATE), 0, NULL); 779 error, 0, NULL);
778} 780}
779 781
780static void blk_add_trace_bio_bounce(void *ignore, 782static void blk_add_trace_bio_bounce(void *ignore,
781 struct request_queue *q, struct bio *bio) 783 struct request_queue *q, struct bio *bio)
782{ 784{
783 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); 785 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
784} 786}
785 787
786static void blk_add_trace_bio_complete(void *ignore, 788static void blk_add_trace_bio_complete(void *ignore,
787 struct request_queue *q, struct bio *bio) 789 struct request_queue *q, struct bio *bio,
790 int error)
788{ 791{
789 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); 792 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
790} 793}
791 794
792static void blk_add_trace_bio_backmerge(void *ignore, 795static void blk_add_trace_bio_backmerge(void *ignore,
793 struct request_queue *q, 796 struct request_queue *q,
794 struct bio *bio) 797 struct bio *bio)
795{ 798{
796 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); 799 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
797} 800}
798 801
799static void blk_add_trace_bio_frontmerge(void *ignore, 802static void blk_add_trace_bio_frontmerge(void *ignore,
800 struct request_queue *q, 803 struct request_queue *q,
801 struct bio *bio) 804 struct bio *bio)
802{ 805{
803 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); 806 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
804} 807}
805 808
806static void blk_add_trace_bio_queue(void *ignore, 809static void blk_add_trace_bio_queue(void *ignore,
807 struct request_queue *q, struct bio *bio) 810 struct request_queue *q, struct bio *bio)
808{ 811{
809 blk_add_trace_bio(q, bio, BLK_TA_QUEUE); 812 blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
810} 813}
811 814
812static void blk_add_trace_getrq(void *ignore, 815static void blk_add_trace_getrq(void *ignore,
@@ -814,7 +817,7 @@ static void blk_add_trace_getrq(void *ignore,
814 struct bio *bio, int rw) 817 struct bio *bio, int rw)
815{ 818{
816 if (bio) 819 if (bio)
817 blk_add_trace_bio(q, bio, BLK_TA_GETRQ); 820 blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
818 else { 821 else {
819 struct blk_trace *bt = q->blk_trace; 822 struct blk_trace *bt = q->blk_trace;
820 823
@@ -829,7 +832,7 @@ static void blk_add_trace_sleeprq(void *ignore,
829 struct bio *bio, int rw) 832 struct bio *bio, int rw)
830{ 833{
831 if (bio) 834 if (bio)
832 blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); 835 blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
833 else { 836 else {
834 struct blk_trace *bt = q->blk_trace; 837 struct blk_trace *bt = q->blk_trace;
835 838
@@ -889,7 +892,7 @@ static void blk_add_trace_split(void *ignore,
889} 892}
890 893
891/** 894/**
892 * blk_add_trace_remap - Add a trace for a remap operation 895 * blk_add_trace_bio_remap - Add a trace for a bio-remap operation
893 * @ignore: trace callback data parameter (not used) 896 * @ignore: trace callback data parameter (not used)
894 * @q: queue the io is for 897 * @q: queue the io is for
895 * @bio: the source bio 898 * @bio: the source bio
@@ -901,9 +904,9 @@ static void blk_add_trace_split(void *ignore,
901 * it spans a stripe (or similar). Add a trace for that action. 904 * it spans a stripe (or similar). Add a trace for that action.
902 * 905 *
903 **/ 906 **/
904static void blk_add_trace_remap(void *ignore, 907static void blk_add_trace_bio_remap(void *ignore,
905 struct request_queue *q, struct bio *bio, 908 struct request_queue *q, struct bio *bio,
906 dev_t dev, sector_t from) 909 dev_t dev, sector_t from)
907{ 910{
908 struct blk_trace *bt = q->blk_trace; 911 struct blk_trace *bt = q->blk_trace;
909 struct blk_io_trace_remap r; 912 struct blk_io_trace_remap r;
@@ -1018,7 +1021,7 @@ static void blk_register_tracepoints(void)
1018 WARN_ON(ret); 1021 WARN_ON(ret);
1019 ret = register_trace_block_split(blk_add_trace_split, NULL); 1022 ret = register_trace_block_split(blk_add_trace_split, NULL);
1020 WARN_ON(ret); 1023 WARN_ON(ret);
1021 ret = register_trace_block_remap(blk_add_trace_remap, NULL); 1024 ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1022 WARN_ON(ret); 1025 WARN_ON(ret);
1023 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); 1026 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1024 WARN_ON(ret); 1027 WARN_ON(ret);
@@ -1027,7 +1030,7 @@ static void blk_register_tracepoints(void)
1027static void blk_unregister_tracepoints(void) 1030static void blk_unregister_tracepoints(void)
1028{ 1031{
1029 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); 1032 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1030 unregister_trace_block_remap(blk_add_trace_remap, NULL); 1033 unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1031 unregister_trace_block_split(blk_add_trace_split, NULL); 1034 unregister_trace_block_split(blk_add_trace_split, NULL);
1032 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); 1035 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
1033 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); 1036 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
@@ -1807,8 +1810,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1807 1810
1808 if (rw & REQ_RAHEAD) 1811 if (rw & REQ_RAHEAD)
1809 rwbs[i++] = 'A'; 1812 rwbs[i++] = 'A';
1810 if (rw & REQ_HARDBARRIER)
1811 rwbs[i++] = 'B';
1812 if (rw & REQ_SYNC) 1813 if (rw & REQ_SYNC)
1813 rwbs[i++] = 'S'; 1814 rwbs[i++] = 'S';
1814 if (rw & REQ_META) 1815 if (rw & REQ_META)
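
The blktrace.c hunks above drop the HARDBARRIER bit from both the trace-action mask and the rwbs string builder, and teach blk_add_trace_bio() to carry an explicit error that falls back to EIO when BIO_UPTODATE is clear. As a side illustration of the rwbs encoding these hooks feed, here is a minimal userspace sketch of a flag-to-letter builder in the style of blk_fill_rwbs(); the RW_* bit values are invented for the example and are not the kernel's REQ_* flags:

#include <stdio.h>

/* Illustrative flag bits -- not the kernel's REQ_* values. */
#define RW_WRITE  (1u << 0)
#define RW_RAHEAD (1u << 1)
#define RW_SYNC   (1u << 2)
#define RW_META   (1u << 3)

/* Encode a request-flag word into a short action string, blk_fill_rwbs style. */
static void fill_rwbs(char *rwbs, unsigned int rw, int bytes)
{
    int i = 0;

    rwbs[i++] = (rw & RW_WRITE) ? 'W' : (bytes ? 'R' : 'N');
    if (rw & RW_RAHEAD)
        rwbs[i++] = 'A';
    if (rw & RW_SYNC)
        rwbs[i++] = 'S';
    if (rw & RW_META)
        rwbs[i++] = 'M';
    rwbs[i] = '\0';
}

int main(void)
{
    char buf[8];

    fill_rwbs(buf, RW_WRITE | RW_SYNC, 4096);
    printf("%s\n", buf);          /* prints "WS" -- no 'B' barrier letter anymore */
    return 0;
}
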
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index a22582a06161..f55fcf61b223 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,5 +13,8 @@
13#define CREATE_TRACE_POINTS 13#define CREATE_TRACE_POINTS
14#include <trace/events/power.h> 14#include <trace/events/power.h>
15 15
16EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); 16#ifdef EVENT_POWER_TRACING_DEPRECATED
17EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
18#endif
19EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
17 20
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9ed509a015d8..bd1c35a4fbcc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3853,6 +3853,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3853 3853
3854 /* Need to copy one event at a time */ 3854 /* Need to copy one event at a time */
3855 do { 3855 do {
3856 /* We need the size of one event, because
3857 * rb_advance_reader only advances by one event,
3858 * whereas rb_event_ts_length may include the size of
3859 * one or two events.
3860 * We have already ensured there's enough space if this
3861 * is a time extend. */
3862 size = rb_event_length(event);
3856 memcpy(bpage->data + pos, rpage->data + rpos, size); 3863 memcpy(bpage->data + pos, rpage->data + rpos, size);
3857 3864
3858 len -= size; 3865 len -= size;
@@ -3867,7 +3874,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3867 event = rb_reader_event(cpu_buffer); 3874 event = rb_reader_event(cpu_buffer);
3868 /* Always keep the time extend and data together */ 3875 /* Always keep the time extend and data together */
3869 size = rb_event_ts_length(event); 3876 size = rb_event_ts_length(event);
3870 } while (len > size); 3877 } while (len >= size);
3871 3878
3872 /* update bpage */ 3879 /* update bpage */
3873 local_set(&bpage->commit, pos); 3880 local_set(&bpage->commit, pos);
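
The new comment and the relaxed len >= size condition in ring_buffer_read_page() hinge on the difference between rb_event_length() (exactly one event) and rb_event_ts_length() (a time extend plus its data). A rough, runnable userspace model of that copy loop follows; the toy record format and the one_len()/pair_len() helpers are inventions that only mirror the one-event versus maybe-two-events distinction:

#include <stdio.h>
#include <string.h>

/* Toy record format: [len][payload...]; a leading 0xFF marks a "time extend"
 * header (marker + one delta byte) that is always followed by a data record. */
static const unsigned char src[] = {
    3, 'a', 'b', 'c',
    0xFF, 5,                  /* time-extend header: marker + delta byte */
    4, 'd', 'e', 'f', 'g',
    2, 'h', 'i',
};

static size_t one_len(const unsigned char *p)   /* rb_event_length analogue */
{
    return (p[0] == 0xFF) ? 2 : (size_t)(1 + p[0]);
}

static size_t pair_len(const unsigned char *p)  /* rb_event_ts_length analogue */
{
    return (p[0] == 0xFF) ? 2 + one_len(p + 2) : one_len(p);
}

int main(void)
{
    unsigned char dst[sizeof(src)];
    size_t rpos = 0, pos = 0;
    size_t len = sizeof(src);
    size_t size = 0;

    do {
        size_t one = one_len(src + rpos);   /* copy exactly one record */
        memcpy(dst + pos, src + rpos, one);
        len  -= one;
        rpos += one;
        pos  += one;
        if (rpos >= sizeof(src))
            break;
        /* the remaining-space check spans a possible extend + data pair */
        size = pair_len(src + rpos);
    } while (len >= size);

    printf("copied %zu of %zu bytes\n", pos, sizeof(src));
    return 0;
}
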
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 82d9b8106cd0..dc53ecb80589 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -17,7 +17,6 @@
17#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/smp_lock.h>
21#include <linux/notifier.h> 20#include <linux/notifier.h>
22#include <linux/irqflags.h> 21#include <linux/irqflags.h>
23#include <linux/debugfs.h> 22#include <linux/debugfs.h>
@@ -1284,6 +1283,8 @@ void trace_dump_stack(void)
1284 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); 1283 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count());
1285} 1284}
1286 1285
1286static DEFINE_PER_CPU(int, user_stack_count);
1287
1287void 1288void
1288ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) 1289ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1289{ 1290{
@@ -1302,10 +1303,20 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1302 if (unlikely(in_nmi())) 1303 if (unlikely(in_nmi()))
1303 return; 1304 return;
1304 1305
1306 /*
1307 * prevent recursion, since the user stack tracing may
1308 * trigger other kernel events.
1309 */
1310 preempt_disable();
1311 if (__this_cpu_read(user_stack_count))
1312 goto out;
1313
1314 __this_cpu_inc(user_stack_count);
1315
1305 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, 1316 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
1306 sizeof(*entry), flags, pc); 1317 sizeof(*entry), flags, pc);
1307 if (!event) 1318 if (!event)
1308 return; 1319 goto out_drop_count;
1309 entry = ring_buffer_event_data(event); 1320 entry = ring_buffer_event_data(event);
1310 1321
1311 entry->tgid = current->tgid; 1322 entry->tgid = current->tgid;
@@ -1319,6 +1330,11 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1319 save_stack_trace_user(&trace); 1330 save_stack_trace_user(&trace);
1320 if (!filter_check_discard(call, entry, buffer, event)) 1331 if (!filter_check_discard(call, entry, buffer, event))
1321 ring_buffer_unlock_commit(buffer, event); 1332 ring_buffer_unlock_commit(buffer, event);
1333
1334 out_drop_count:
1335 __this_cpu_dec(user_stack_count);
1336 out:
1337 preempt_enable();
1322} 1338}
1323 1339
1324#ifdef UNUSED 1340#ifdef UNUSED
@@ -2320,11 +2336,19 @@ tracing_write_stub(struct file *filp, const char __user *ubuf,
2320 return count; 2336 return count;
2321} 2337}
2322 2338
2339static loff_t tracing_seek(struct file *file, loff_t offset, int origin)
2340{
2341 if (file->f_mode & FMODE_READ)
2342 return seq_lseek(file, offset, origin);
2343 else
2344 return 0;
2345}
2346
2323static const struct file_operations tracing_fops = { 2347static const struct file_operations tracing_fops = {
2324 .open = tracing_open, 2348 .open = tracing_open,
2325 .read = seq_read, 2349 .read = seq_read,
2326 .write = tracing_write_stub, 2350 .write = tracing_write_stub,
2327 .llseek = seq_lseek, 2351 .llseek = tracing_seek,
2328 .release = tracing_release, 2352 .release = tracing_release,
2329}; 2353};
2330 2354
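
The ftrace_trace_userstack() change guards against recursion with a per-CPU user_stack_count bracketed by preempt_disable()/preempt_enable(), because saving a user stack can itself generate further trace events. A minimal userspace sketch of the same guard shape, using a thread-local counter where the kernel uses a per-CPU variable; record_event() and capture_user_stack() are made-up names:

#include <stdio.h>

static _Thread_local int user_stack_count;   /* stands in for the per-CPU counter */

static void record_event(const char *what);

/* Something the event path may call back into, re-entering record_event(). */
static void capture_user_stack(void)
{
    record_event("nested-while-capturing");  /* would recurse without the guard */
}

static void record_event(const char *what)
{
    if (user_stack_count)                    /* already inside: bail out */
        return;
    user_stack_count++;

    printf("event: %s\n", what);
    capture_user_stack();

    user_stack_count--;
}

int main(void)
{
    record_event("top-level");               /* prints exactly one event */
    return 0;
}
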
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e3dfecaf13e6..6cf223764be8 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -53,7 +53,7 @@
53 */ 53 */
54 54
55/* 55/*
56 * Function trace entry - function address and parent function addres: 56 * Function trace entry - function address and parent function address:
57 */ 57 */
58FTRACE_ENTRY(function, ftrace_entry, 58FTRACE_ENTRY(function, ftrace_entry,
59 59
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 39c059ca670e..19a359d5e6d5 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -21,17 +21,46 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
21/* Count the events in use (per event id, not per instance) */ 21/* Count the events in use (per event id, not per instance) */
22static int total_ref_count; 22static int total_ref_count;
23 23
24static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event)
26{
27 /* No tracing, just counting, so no obvious leak */
28 if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
29 return 0;
30
31 /* Some events are ok to be traced by non-root users... */
32 if (p_event->attach_state == PERF_ATTACH_TASK) {
33 if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
34 return 0;
35 }
36
37 /*
38 * ...otherwise raw tracepoint data can be a severe data leak,
39 * only allow root to have these.
40 */
41 if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
42 return -EPERM;
43
44 return 0;
45}
46
24static int perf_trace_event_init(struct ftrace_event_call *tp_event, 47static int perf_trace_event_init(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event) 48 struct perf_event *p_event)
26{ 49{
27 struct hlist_head __percpu *list; 50 struct hlist_head __percpu *list;
28 int ret = -ENOMEM; 51 int ret;
29 int cpu; 52 int cpu;
30 53
54 ret = perf_trace_event_perm(tp_event, p_event);
55 if (ret)
56 return ret;
57
31 p_event->tp_event = tp_event; 58 p_event->tp_event = tp_event;
32 if (tp_event->perf_refcount++ > 0) 59 if (tp_event->perf_refcount++ > 0)
33 return 0; 60 return 0;
34 61
62 ret = -ENOMEM;
63
35 list = alloc_percpu(struct hlist_head); 64 list = alloc_percpu(struct hlist_head);
36 if (!list) 65 if (!list)
37 goto fail; 66 goto fail;
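
perf_trace_event_perm() above boils down to a small decision table: no raw samples means nothing sensitive to leak, per-task events on a TRACE_EVENT_FL_CAP_ANY tracepoint are allowed, and anything else needs CAP_SYS_ADMIN while tracepoint raw access is restricted. A standalone restatement of that logic, with invented struct and field names and plain booleans standing in for the kernel predicates:

#include <stdio.h>
#include <errno.h>
#include <stdbool.h>

struct perm_query {
    bool wants_raw;     /* PERF_SAMPLE_RAW requested */
    bool per_task;      /* attach_state == PERF_ATTACH_TASK */
    bool cap_any;       /* tracepoint flagged TRACE_EVENT_FL_CAP_ANY */
    bool paranoid;      /* perf_paranoid_tracepoint_raw() */
    bool is_admin;      /* capable(CAP_SYS_ADMIN) */
};

static int tracepoint_perm(const struct perm_query *q)
{
    if (!q->wants_raw)                 /* just counting: no data to leak */
        return 0;
    if (q->per_task && q->cap_any)     /* explicitly allowed tracepoint */
        return 0;
    if (q->paranoid && !q->is_admin)   /* raw payloads are root-only */
        return -EPERM;
    return 0;
}

int main(void)
{
    struct perm_query q = { .wants_raw = true, .paranoid = true };

    printf("unprivileged raw access -> %d\n", tracepoint_perm(&q));  /* -EPERM */
    q.is_admin = true;
    printf("admin raw access        -> %d\n", tracepoint_perm(&q));  /* 0 */
    return 0;
}
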
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0725eeab1937..35fde09b81de 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -27,6 +27,12 @@
27 27
28DEFINE_MUTEX(event_mutex); 28DEFINE_MUTEX(event_mutex);
29 29
30DEFINE_MUTEX(event_storage_mutex);
31EXPORT_SYMBOL_GPL(event_storage_mutex);
32
33char event_storage[EVENT_STORAGE_SIZE];
34EXPORT_SYMBOL_GPL(event_storage);
35
30LIST_HEAD(ftrace_events); 36LIST_HEAD(ftrace_events);
31LIST_HEAD(ftrace_common_fields); 37LIST_HEAD(ftrace_common_fields);
32 38
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 4ba44deaac25..4b74d71705c0 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -83,13 +83,19 @@ static void __always_unused ____ftrace_check_##name(void) \
83 83
84#undef __array 84#undef __array
85#define __array(type, item, len) \ 85#define __array(type, item, len) \
86 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ 86 do { \
87 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 87 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
88 mutex_lock(&event_storage_mutex); \
89 snprintf(event_storage, sizeof(event_storage), \
90 "%s[%d]", #type, len); \
91 ret = trace_define_field(event_call, event_storage, #item, \
88 offsetof(typeof(field), item), \ 92 offsetof(typeof(field), item), \
89 sizeof(field.item), \ 93 sizeof(field.item), \
90 is_signed_type(type), FILTER_OTHER); \ 94 is_signed_type(type), FILTER_OTHER); \
91 if (ret) \ 95 mutex_unlock(&event_storage_mutex); \
92 return ret; 96 if (ret) \
97 return ret; \
98 } while (0);
93 99
94#undef __array_desc 100#undef __array_desc
95#define __array_desc(type, container, item, len) \ 101#define __array_desc(type, container, item, len) \
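
The rewritten __array() macro formats its "type[len]" string into the shared event_storage buffer while holding event_storage_mutex (both defined in the trace_events.c hunk above). A userspace sketch of that one-scratch-buffer-under-a-mutex pattern using pthreads; define_array_field() and print_field() are illustrative names, not kernel APIs:

#include <pthread.h>
#include <stdio.h>

#define EVENT_STORAGE_SIZE 128

static char event_storage[EVENT_STORAGE_SIZE];
static pthread_mutex_t event_storage_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Format "type[len]" into the shared buffer and hand it to a consumer while
 * still holding the lock, so no other definer can scribble over it. */
static int define_array_field(const char *type, int len,
                              int (*define)(const char *type_str))
{
    int ret;

    pthread_mutex_lock(&event_storage_mutex);
    snprintf(event_storage, sizeof(event_storage), "%s[%d]", type, len);
    ret = define(event_storage);
    pthread_mutex_unlock(&event_storage_mutex);
    return ret;
}

static int print_field(const char *type_str)
{
    printf("field type: %s\n", type_str);
    return 0;
}

int main(void)
{
    return define_array_field("char", 16, print_field);   /* "char[16]" */
}
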
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 5cf8c602b880..92b6e1e12d98 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -453,14 +453,6 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
453 * Stubs: 453 * Stubs:
454 */ 454 */
455 455
456void early_boot_irqs_off(void)
457{
458}
459
460void early_boot_irqs_on(void)
461{
462}
463
464void trace_softirqs_on(unsigned long ip) 456void trace_softirqs_on(unsigned long ip)
465{ 457{
466} 458}
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 155a415b3209..659732eba07c 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
558static int trace_wakeup_test_thread(void *data) 558static int trace_wakeup_test_thread(void *data)
559{ 559{
560 /* Make this a RT thread, doesn't need to be too high */ 560 /* Make this a RT thread, doesn't need to be too high */
561 struct sched_param param = { .sched_priority = 5 }; 561 static const struct sched_param param = { .sched_priority = 5 };
562 struct completion *x = data; 562 struct completion *x = data;
563 563
564 sched_setscheduler(current, SCHED_FIFO, &param); 564 sched_setscheduler(current, SCHED_FIFO, &param);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index bac752f0cfb5..b706529b4fc7 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -23,9 +23,6 @@ static int syscall_exit_register(struct ftrace_event_call *event,
23static int syscall_enter_define_fields(struct ftrace_event_call *call); 23static int syscall_enter_define_fields(struct ftrace_event_call *call);
24static int syscall_exit_define_fields(struct ftrace_event_call *call); 24static int syscall_exit_define_fields(struct ftrace_event_call *call);
25 25
26/* All syscall exit events have the same fields */
27static LIST_HEAD(syscall_exit_fields);
28
29static struct list_head * 26static struct list_head *
30syscall_get_enter_fields(struct ftrace_event_call *call) 27syscall_get_enter_fields(struct ftrace_event_call *call)
31{ 28{
@@ -34,34 +31,28 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
34 return &entry->enter_fields; 31 return &entry->enter_fields;
35} 32}
36 33
37static struct list_head *
38syscall_get_exit_fields(struct ftrace_event_call *call)
39{
40 return &syscall_exit_fields;
41}
42
43struct trace_event_functions enter_syscall_print_funcs = { 34struct trace_event_functions enter_syscall_print_funcs = {
44 .trace = print_syscall_enter, 35 .trace = print_syscall_enter,
45}; 36};
46 37
47struct trace_event_functions exit_syscall_print_funcs = { 38struct trace_event_functions exit_syscall_print_funcs = {
48 .trace = print_syscall_exit, 39 .trace = print_syscall_exit,
49}; 40};
50 41
51struct ftrace_event_class event_class_syscall_enter = { 42struct ftrace_event_class event_class_syscall_enter = {
52 .system = "syscalls", 43 .system = "syscalls",
53 .reg = syscall_enter_register, 44 .reg = syscall_enter_register,
54 .define_fields = syscall_enter_define_fields, 45 .define_fields = syscall_enter_define_fields,
55 .get_fields = syscall_get_enter_fields, 46 .get_fields = syscall_get_enter_fields,
56 .raw_init = init_syscall_trace, 47 .raw_init = init_syscall_trace,
57}; 48};
58 49
59struct ftrace_event_class event_class_syscall_exit = { 50struct ftrace_event_class event_class_syscall_exit = {
60 .system = "syscalls", 51 .system = "syscalls",
61 .reg = syscall_exit_register, 52 .reg = syscall_exit_register,
62 .define_fields = syscall_exit_define_fields, 53 .define_fields = syscall_exit_define_fields,
63 .get_fields = syscall_get_exit_fields, 54 .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
64 .raw_init = init_syscall_trace, 55 .raw_init = init_syscall_trace,
65}; 56};
66 57
67extern unsigned long __start_syscalls_metadata[]; 58extern unsigned long __start_syscalls_metadata[];
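
The syscalls hunk removes the shared syscall_exit_fields list and its get_fields() helper in favour of statically initializing the class's own embedded .fields list head with LIST_HEAD_INIT(). A tiny self-contained model of such a self-referential static initializer; the list_head here is a stripped-down analogue of the kernel type:

#include <stdio.h>

struct list_head {
    struct list_head *next, *prev;
};

#define LIST_HEAD_INIT(name) { &(name), &(name) }

struct event_class {
    const char      *system;
    struct list_head fields;
};

/* The embedded list head is wired to itself at compile time, so the class
 * starts with a valid, empty field list without any runtime init call. */
static struct event_class event_class_syscall_exit = {
    .system = "syscalls",
    .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
};

static int list_empty(const struct list_head *head)
{
    return head->next == head;
}

int main(void)
{
    printf("%s fields empty: %d\n",
           event_class_syscall_exit.system,
           list_empty(&event_class_syscall_exit.fields));
    return 0;
}
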
diff --git a/kernel/user.c b/kernel/user.c
index 2c7d8d5914b1..5c598ca781df 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -158,6 +158,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
158 spin_lock_irq(&uidhash_lock); 158 spin_lock_irq(&uidhash_lock);
159 up = uid_hash_find(uid, hashent); 159 up = uid_hash_find(uid, hashent);
160 if (up) { 160 if (up) {
161 put_user_ns(ns);
161 key_put(new->uid_keyring); 162 key_put(new->uid_keyring);
162 key_put(new->session_keyring); 163 key_put(new->session_keyring);
163 kmem_cache_free(uid_cachep, new); 164 kmem_cache_free(uid_cachep, new);
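
The single added put_user_ns() covers the race path in alloc_uid(): the new user_struct pinned the namespace before the hash was re-checked under uidhash_lock, so when the lookup finds an existing entry the unused copy must drop every reference it took. A compact userspace model of that allocate-optimistically, re-check-under-the-lock, release-the-loser pattern, with toy types and a much simplified lookup:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct ns { int refs; };                 /* toy namespace with a refcount */

static void ns_get(struct ns *ns) { ns->refs++; }
static void ns_put(struct ns *ns) { ns->refs--; }

struct user { struct ns *ns; int uid; };

static pthread_mutex_t uidhash_lock = PTHREAD_MUTEX_INITIALIZER;
static struct user *uid_table[16];       /* stand-in for the uid hash */

static struct user *alloc_uid(struct ns *ns, int uid)
{
    /* Optimistic allocation outside the lock; pins the namespace. */
    struct user *new = malloc(sizeof(*new));
    struct user *up;

    new->uid = uid;
    new->ns  = ns;
    ns_get(ns);

    pthread_mutex_lock(&uidhash_lock);
    up = uid_table[uid % 16];
    if (up) {
        /* Lost the race: drop everything the unused copy was holding. */
        ns_put(ns);                      /* the reference the fix above releases */
        free(new);
    } else {
        uid_table[uid % 16] = up = new;
    }
    pthread_mutex_unlock(&uidhash_lock);
    return up;
}

int main(void)
{
    struct ns root_ns = { .refs = 1 };

    alloc_uid(&root_ns, 42);
    alloc_uid(&root_ns, 42);                 /* duplicate: must not leak a ref */
    printf("ns refs: %d\n", root_ns.refs);   /* 2: the base ref + one user */
    return 0;
}
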
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 25915832291a..9da289c34f22 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -12,6 +12,8 @@
12#include <linux/highuid.h> 12#include <linux/highuid.h>
13#include <linux/cred.h> 13#include <linux/cred.h>
14 14
15static struct kmem_cache *user_ns_cachep __read_mostly;
16
15/* 17/*
16 * Create a new user namespace, deriving the creator from the user in the 18 * Create a new user namespace, deriving the creator from the user in the
17 * passed credentials, and replacing that user with the new root user for the 19 * passed credentials, and replacing that user with the new root user for the
@@ -26,7 +28,7 @@ int create_user_ns(struct cred *new)
26 struct user_struct *root_user; 28 struct user_struct *root_user;
27 int n; 29 int n;
28 30
29 ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); 31 ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL);
30 if (!ns) 32 if (!ns)
31 return -ENOMEM; 33 return -ENOMEM;
32 34
@@ -38,7 +40,7 @@ int create_user_ns(struct cred *new)
38 /* Alloc new root user. */ 40 /* Alloc new root user. */
39 root_user = alloc_uid(ns, 0); 41 root_user = alloc_uid(ns, 0);
40 if (!root_user) { 42 if (!root_user) {
41 kfree(ns); 43 kmem_cache_free(user_ns_cachep, ns);
42 return -ENOMEM; 44 return -ENOMEM;
43 } 45 }
44 46
@@ -71,7 +73,7 @@ static void free_user_ns_work(struct work_struct *work)
71 struct user_namespace *ns = 73 struct user_namespace *ns =
72 container_of(work, struct user_namespace, destroyer); 74 container_of(work, struct user_namespace, destroyer);
73 free_uid(ns->creator); 75 free_uid(ns->creator);
74 kfree(ns); 76 kmem_cache_free(user_ns_cachep, ns);
75} 77}
76 78
77void free_user_ns(struct kref *kref) 79void free_user_ns(struct kref *kref)
@@ -126,3 +128,10 @@ gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t
126 /* No useful relationship so no mapping */ 128 /* No useful relationship so no mapping */
127 return overflowgid; 129 return overflowgid;
128} 130}
131
132static __init int user_namespaces_init(void)
133{
134 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
135 return 0;
136}
137module_init(user_namespaces_init);
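
user_namespace.c moves from kmalloc()/kfree() to a dedicated kmem_cache set up once from an initcall. The slab allocator has no direct userspace counterpart, so the sketch below only models the create-once, alloc, free-back-to-the-cache shape with a trivial free-list cache; obj_cache and its helpers are inventions:

#include <stdio.h>
#include <stdlib.h>

struct user_namespace { int dummy[8]; };   /* stand-in payload */

/* A toy fixed-size object cache: freed objects go on a free list for reuse. */
struct obj_cache {
    size_t  size;
    void   *free_list;
};

static struct obj_cache *cache_create(size_t size)
{
    struct obj_cache *c = malloc(sizeof(*c));

    c->size = size < sizeof(void *) ? sizeof(void *) : size;
    c->free_list = NULL;
    return c;
}

static void *cache_alloc(struct obj_cache *c)
{
    if (c->free_list) {
        void *obj = c->free_list;
        c->free_list = *(void **)obj;      /* pop a recycled object */
        return obj;
    }
    return malloc(c->size);
}

static void cache_free(struct obj_cache *c, void *obj)
{
    *(void **)obj = c->free_list;          /* push back for reuse */
    c->free_list = obj;
}

static struct obj_cache *user_ns_cachep;   /* set up once, like the initcall */

int main(void)
{
    user_ns_cachep = cache_create(sizeof(struct user_namespace));

    struct user_namespace *ns = cache_alloc(user_ns_cachep);
    cache_free(user_ns_cachep, ns);
    printf("reused: %d\n", cache_alloc(user_ns_cachep) == (void *)ns);  /* 1 */
    return 0;
}
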
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 6e3c41a4024c..d7ebdf4cea98 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -57,6 +57,8 @@ static int __init hardlockup_panic_setup(char *str)
57{ 57{
58 if (!strncmp(str, "panic", 5)) 58 if (!strncmp(str, "panic", 5))
59 hardlockup_panic = 1; 59 hardlockup_panic = 1;
60 else if (!strncmp(str, "0", 1))
61 no_watchdog = 1;
60 return 1; 62 return 1;
61} 63}
62__setup("nmi_watchdog=", hardlockup_panic_setup); 64__setup("nmi_watchdog=", hardlockup_panic_setup);
@@ -116,12 +118,12 @@ static void __touch_watchdog(void)
116{ 118{
117 int this_cpu = smp_processor_id(); 119 int this_cpu = smp_processor_id();
118 120
119 __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); 121 __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu));
120} 122}
121 123
122void touch_softlockup_watchdog(void) 124void touch_softlockup_watchdog(void)
123{ 125{
124 __raw_get_cpu_var(watchdog_touch_ts) = 0; 126 __this_cpu_write(watchdog_touch_ts, 0);
125} 127}
126EXPORT_SYMBOL(touch_softlockup_watchdog); 128EXPORT_SYMBOL(touch_softlockup_watchdog);
127 129
@@ -165,12 +167,12 @@ void touch_softlockup_watchdog_sync(void)
165/* watchdog detector functions */ 167/* watchdog detector functions */
166static int is_hardlockup(void) 168static int is_hardlockup(void)
167{ 169{
168 unsigned long hrint = __get_cpu_var(hrtimer_interrupts); 170 unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
169 171
170 if (__get_cpu_var(hrtimer_interrupts_saved) == hrint) 172 if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
171 return 1; 173 return 1;
172 174
173 __get_cpu_var(hrtimer_interrupts_saved) = hrint; 175 __this_cpu_write(hrtimer_interrupts_saved, hrint);
174 return 0; 176 return 0;
175} 177}
176#endif 178#endif
@@ -203,8 +205,8 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
203 /* Ensure the watchdog never gets throttled */ 205 /* Ensure the watchdog never gets throttled */
204 event->hw.interrupts = 0; 206 event->hw.interrupts = 0;
205 207
206 if (__get_cpu_var(watchdog_nmi_touch) == true) { 208 if (__this_cpu_read(watchdog_nmi_touch) == true) {
207 __get_cpu_var(watchdog_nmi_touch) = false; 209 __this_cpu_write(watchdog_nmi_touch, false);
208 return; 210 return;
209 } 211 }
210 212
@@ -218,7 +220,7 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
218 int this_cpu = smp_processor_id(); 220 int this_cpu = smp_processor_id();
219 221
220 /* only print hardlockups once */ 222 /* only print hardlockups once */
221 if (__get_cpu_var(hard_watchdog_warn) == true) 223 if (__this_cpu_read(hard_watchdog_warn) == true)
222 return; 224 return;
223 225
224 if (hardlockup_panic) 226 if (hardlockup_panic)
@@ -226,16 +228,16 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
226 else 228 else
227 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); 229 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
228 230
229 __get_cpu_var(hard_watchdog_warn) = true; 231 __this_cpu_write(hard_watchdog_warn, true);
230 return; 232 return;
231 } 233 }
232 234
233 __get_cpu_var(hard_watchdog_warn) = false; 235 __this_cpu_write(hard_watchdog_warn, false);
234 return; 236 return;
235} 237}
236static void watchdog_interrupt_count(void) 238static void watchdog_interrupt_count(void)
237{ 239{
238 __get_cpu_var(hrtimer_interrupts)++; 240 __this_cpu_inc(hrtimer_interrupts);
239} 241}
240#else 242#else
241static inline void watchdog_interrupt_count(void) { return; } 243static inline void watchdog_interrupt_count(void) { return; }
@@ -244,7 +246,7 @@ static inline void watchdog_interrupt_count(void) { return; }
244/* watchdog kicker functions */ 246/* watchdog kicker functions */
245static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) 247static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
246{ 248{
247 unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts); 249 unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
248 struct pt_regs *regs = get_irq_regs(); 250 struct pt_regs *regs = get_irq_regs();
249 int duration; 251 int duration;
250 252
@@ -252,18 +254,18 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
252 watchdog_interrupt_count(); 254 watchdog_interrupt_count();
253 255
254 /* kick the softlockup detector */ 256 /* kick the softlockup detector */
255 wake_up_process(__get_cpu_var(softlockup_watchdog)); 257 wake_up_process(__this_cpu_read(softlockup_watchdog));
256 258
257 /* .. and repeat */ 259 /* .. and repeat */
258 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); 260 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));
259 261
260 if (touch_ts == 0) { 262 if (touch_ts == 0) {
261 if (unlikely(__get_cpu_var(softlockup_touch_sync))) { 263 if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
262 /* 264 /*
263 * If the time stamp was touched atomically 265 * If the time stamp was touched atomically
264 * make sure the scheduler tick is up to date. 266 * make sure the scheduler tick is up to date.
265 */ 267 */
266 __get_cpu_var(softlockup_touch_sync) = false; 268 __this_cpu_write(softlockup_touch_sync, false);
267 sched_clock_tick(); 269 sched_clock_tick();
268 } 270 }
269 __touch_watchdog(); 271 __touch_watchdog();
@@ -279,7 +281,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
279 duration = is_softlockup(touch_ts); 281 duration = is_softlockup(touch_ts);
280 if (unlikely(duration)) { 282 if (unlikely(duration)) {
281 /* only warn once */ 283 /* only warn once */
282 if (__get_cpu_var(soft_watchdog_warn) == true) 284 if (__this_cpu_read(soft_watchdog_warn) == true)
283 return HRTIMER_RESTART; 285 return HRTIMER_RESTART;
284 286
285 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", 287 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
@@ -294,9 +296,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
294 296
295 if (softlockup_panic) 297 if (softlockup_panic)
296 panic("softlockup: hung tasks"); 298 panic("softlockup: hung tasks");
297 __get_cpu_var(soft_watchdog_warn) = true; 299 __this_cpu_write(soft_watchdog_warn, true);
298 } else 300 } else
299 __get_cpu_var(soft_watchdog_warn) = false; 301 __this_cpu_write(soft_watchdog_warn, false);
300 302
301 return HRTIMER_RESTART; 303 return HRTIMER_RESTART;
302} 304}
@@ -307,7 +309,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
307 */ 309 */
308static int watchdog(void *unused) 310static int watchdog(void *unused)
309{ 311{
310 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 312 static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
311 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 313 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
312 314
313 sched_setscheduler(current, SCHED_FIFO, &param); 315 sched_setscheduler(current, SCHED_FIFO, &param);
@@ -364,7 +366,8 @@ static int watchdog_nmi_enable(int cpu)
364 goto out_save; 366 goto out_save;
365 } 367 }
366 368
367 printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); 369 printk(KERN_ERR "NMI watchdog disabled for cpu%i: unable to create perf event: %ld\n",
370 cpu, PTR_ERR(event));
368 return PTR_ERR(event); 371 return PTR_ERR(event);
369 372
370 /* success path */ 373 /* success path */
@@ -547,13 +550,13 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
547 .notifier_call = cpu_callback 550 .notifier_call = cpu_callback
548}; 551};
549 552
550static int __init spawn_watchdog_task(void) 553void __init lockup_detector_init(void)
551{ 554{
552 void *cpu = (void *)(long)smp_processor_id(); 555 void *cpu = (void *)(long)smp_processor_id();
553 int err; 556 int err;
554 557
555 if (no_watchdog) 558 if (no_watchdog)
556 return 0; 559 return;
557 560
558 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 561 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
559 WARN_ON(notifier_to_errno(err)); 562 WARN_ON(notifier_to_errno(err));
@@ -561,6 +564,5 @@ static int __init spawn_watchdog_task(void)
561 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 564 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
562 register_cpu_notifier(&cpu_nfb); 565 register_cpu_notifier(&cpu_nfb);
563 566
564 return 0; 567 return;
565} 568}
566early_initcall(spawn_watchdog_task);
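
The watchdog conversions replace __get_cpu_var() expressions with __this_cpu_read()/__this_cpu_write()/__this_cpu_inc(), each naming a single access to the current CPU's copy of the variable. As an analogy only, the sketch below uses thread-local storage as the "per-CPU" area and trivial macros of the same shape; the is_hardlockup() logic mirrors the hunk above:

#include <pthread.h>
#include <stdio.h>

/* One copy per thread stands in for one copy per CPU. */
static _Thread_local unsigned long hrtimer_interrupts;
static _Thread_local unsigned long hrtimer_interrupts_saved;

#define this_cpu_read(var)        (var)
#define this_cpu_write(var, val)  ((var) = (val))
#define this_cpu_inc(var)         ((var)++)

static int is_hardlockup(void)
{
    unsigned long hrint = this_cpu_read(hrtimer_interrupts);

    if (this_cpu_read(hrtimer_interrupts_saved) == hrint)
        return 1;                        /* timer has not fired since last check */

    this_cpu_write(hrtimer_interrupts_saved, hrint);
    return 0;
}

static void *cpu_thread(void *unused)
{
    (void)unused;
    this_cpu_inc(hrtimer_interrupts);    /* pretend the hrtimer fired once */
    printf("first check:  %d\n", is_hardlockup());   /* 0: progress seen */
    printf("second check: %d\n", is_hardlockup());   /* 1: no new interrupts */
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, cpu_thread, NULL);
    pthread_join(t, NULL);
    return 0;
}
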
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 90db1bd1a978..11869faa6819 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -661,7 +661,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
661{ 661{
662 struct worker *worker = kthread_data(task); 662 struct worker *worker = kthread_data(task);
663 663
664 if (likely(!(worker->flags & WORKER_NOT_RUNNING))) 664 if (!(worker->flags & WORKER_NOT_RUNNING))
665 atomic_inc(get_gcwq_nr_running(cpu)); 665 atomic_inc(get_gcwq_nr_running(cpu));
666} 666}
667 667
@@ -687,7 +687,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
687 struct global_cwq *gcwq = get_gcwq(cpu); 687 struct global_cwq *gcwq = get_gcwq(cpu);
688 atomic_t *nr_running = get_gcwq_nr_running(cpu); 688 atomic_t *nr_running = get_gcwq_nr_running(cpu);
689 689
690 if (unlikely(worker->flags & WORKER_NOT_RUNNING)) 690 if (worker->flags & WORKER_NOT_RUNNING)
691 return NULL; 691 return NULL;
692 692
693 /* this can only happen on the local cpu */ 693 /* this can only happen on the local cpu */
@@ -768,7 +768,11 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
768 768
769 worker->flags &= ~flags; 769 worker->flags &= ~flags;
770 770
771 /* if transitioning out of NOT_RUNNING, increment nr_running */ 771 /*
772 * If transitioning out of NOT_RUNNING, increment nr_running. Note
773 * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask
774 * of multiple flags, not a single flag.
775 */
772 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) 776 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
773 if (!(worker->flags & WORKER_NOT_RUNNING)) 777 if (!(worker->flags & WORKER_NOT_RUNNING))
774 atomic_inc(get_gcwq_nr_running(gcwq->cpu)); 778 atomic_inc(get_gcwq_nr_running(gcwq->cpu));
@@ -932,6 +936,38 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
932 wake_up_worker(gcwq); 936 wake_up_worker(gcwq);
933} 937}
934 938
939/*
940 * Test whether @work is being queued from another work executing on the
941 * same workqueue. This is rather expensive and should only be used from
942 * cold paths.
943 */
944static bool is_chained_work(struct workqueue_struct *wq)
945{
946 unsigned long flags;
947 unsigned int cpu;
948
949 for_each_gcwq_cpu(cpu) {
950 struct global_cwq *gcwq = get_gcwq(cpu);
951 struct worker *worker;
952 struct hlist_node *pos;
953 int i;
954
955 spin_lock_irqsave(&gcwq->lock, flags);
956 for_each_busy_worker(worker, i, pos, gcwq) {
957 if (worker->task != current)
958 continue;
959 spin_unlock_irqrestore(&gcwq->lock, flags);
960 /*
961 * I'm @worker, no locking necessary. See if @work
962 * is headed to the same workqueue.
963 */
964 return worker->current_cwq->wq == wq;
965 }
966 spin_unlock_irqrestore(&gcwq->lock, flags);
967 }
968 return false;
969}
970
935static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, 971static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
936 struct work_struct *work) 972 struct work_struct *work)
937{ 973{
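
is_chained_work() walks the busy workers looking for one whose task is current and, if the caller is such a worker, reports whether it is executing an item of the same workqueue; that is what lets a dying workqueue accept work only from work already running on it. A reduced userspace model with one worker thread and a global busy-worker table (all structure and function names here are mine):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct workqueue { const char *name; };

struct worker {
    pthread_t         task;
    struct workqueue *current_wq;       /* what this worker is running now */
    bool              busy;
};

static pthread_mutex_t gcwq_lock = PTHREAD_MUTEX_INITIALIZER;
static struct worker workers[4];        /* stand-in for the busy-worker hash */

/* Is the caller itself a worker currently executing an item of @wq? */
static bool is_chained_work(struct workqueue *wq)
{
    bool chained = false;

    pthread_mutex_lock(&gcwq_lock);
    for (int i = 0; i < 4; i++) {
        if (!workers[i].busy || !pthread_equal(workers[i].task, pthread_self()))
            continue;
        chained = (workers[i].current_wq == wq);
        break;
    }
    pthread_mutex_unlock(&gcwq_lock);
    return chained;
}

static struct workqueue wq = { "events" };

static void *worker_fn(void *unused)
{
    (void)unused;
    /* Mark ourselves busy on @wq, as a real worker would before running work. */
    pthread_mutex_lock(&gcwq_lock);
    workers[0] = (struct worker){ pthread_self(), &wq, true };
    pthread_mutex_unlock(&gcwq_lock);

    printf("from a worker of wq: %d\n", is_chained_work(&wq));       /* 1 */
    return NULL;
}

int main(void)
{
    pthread_t t;

    printf("from outside any worker: %d\n", is_chained_work(&wq));   /* 0 */
    pthread_create(&t, NULL, worker_fn, NULL);
    pthread_join(t, NULL);
    return 0;
}
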
@@ -943,7 +979,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
943 979
944 debug_work_activate(work); 980 debug_work_activate(work);
945 981
946 if (WARN_ON_ONCE(wq->flags & WQ_DYING)) 982 /* if dying, only works from the same workqueue are allowed */
983 if (unlikely(wq->flags & WQ_DYING) &&
984 WARN_ON_ONCE(!is_chained_work(wq)))
947 return; 985 return;
948 986
949 /* determine gcwq to use */ 987 /* determine gcwq to use */
@@ -1806,7 +1844,7 @@ __acquires(&gcwq->lock)
1806 spin_unlock_irq(&gcwq->lock); 1844 spin_unlock_irq(&gcwq->lock);
1807 1845
1808 work_clear_pending(work); 1846 work_clear_pending(work);
1809 lock_map_acquire(&cwq->wq->lockdep_map); 1847 lock_map_acquire_read(&cwq->wq->lockdep_map);
1810 lock_map_acquire(&lockdep_map); 1848 lock_map_acquire(&lockdep_map);
1811 trace_workqueue_execute_start(work); 1849 trace_workqueue_execute_start(work);
1812 f(work); 1850 f(work);
@@ -2350,8 +2388,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2350 insert_wq_barrier(cwq, barr, work, worker); 2388 insert_wq_barrier(cwq, barr, work, worker);
2351 spin_unlock_irq(&gcwq->lock); 2389 spin_unlock_irq(&gcwq->lock);
2352 2390
2353 lock_map_acquire(&cwq->wq->lockdep_map); 2391 /*
2392 * If @max_active is 1 or rescuer is in use, flushing another work
2393 * item on the same workqueue may lead to deadlock. Make sure the
2394 * flusher is not running on the same workqueue by verifying write
2395 * access.
2396 */
2397 if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER)
2398 lock_map_acquire(&cwq->wq->lockdep_map);
2399 else
2400 lock_map_acquire_read(&cwq->wq->lockdep_map);
2354 lock_map_release(&cwq->wq->lockdep_map); 2401 lock_map_release(&cwq->wq->lockdep_map);
2402
2355 return true; 2403 return true;
2356already_gone: 2404already_gone:
2357 spin_unlock_irq(&gcwq->lock); 2405 spin_unlock_irq(&gcwq->lock);
@@ -2936,11 +2984,35 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
2936 */ 2984 */
2937void destroy_workqueue(struct workqueue_struct *wq) 2985void destroy_workqueue(struct workqueue_struct *wq)
2938{ 2986{
2987 unsigned int flush_cnt = 0;
2939 unsigned int cpu; 2988 unsigned int cpu;
2940 2989
2990 /*
2991 * Mark @wq dying and drain all pending works. Once WQ_DYING is
2992 * set, only chain queueing is allowed. IOW, only currently
2993 * pending or running work items on @wq can queue further work
2994 * items on it. @wq is flushed repeatedly until it becomes empty.
 2995 * The number of flushes is determined by the depth of chaining and
2996 * should be relatively short. Whine if it takes too long.
2997 */
2941 wq->flags |= WQ_DYING; 2998 wq->flags |= WQ_DYING;
2999reflush:
2942 flush_workqueue(wq); 3000 flush_workqueue(wq);
2943 3001
3002 for_each_cwq_cpu(cpu, wq) {
3003 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3004
3005 if (!cwq->nr_active && list_empty(&cwq->delayed_works))
3006 continue;
3007
3008 if (++flush_cnt == 10 ||
3009 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
3010 printk(KERN_WARNING "workqueue %s: flush on "
3011 "destruction isn't complete after %u tries\n",
3012 wq->name, flush_cnt);
3013 goto reflush;
3014 }
3015
2944 /* 3016 /*
2945 * wq list is used to freeze wq, remove from list after 3017 * wq list is used to freeze wq, remove from list after
2946 * flushing is complete in case freeze races us. 3018 * flushing is complete in case freeze races us.
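
destroy_workqueue() now flushes in a loop: because a dying workqueue still accepts chained work, one flush may leave newly queued items behind, so it re-flushes until every per-CPU queue is empty and warns after suspiciously many passes. A userspace cartoon of that drain loop, where the chain depth is faked by a simple countdown:

#include <stdio.h>

/* Pretend each flush pass can reveal more chained work; model that with a
 * counter of how many extra passes the chain needs. */
static int pending_chain = 3;

static int queue_is_empty(void)
{
    return pending_chain == 0;
}

static void flush_queue(void)
{
    if (pending_chain > 0)
        pending_chain--;        /* one level of chained work drained */
}

static void destroy_queue(const char *name)
{
    unsigned int flush_cnt = 0;

reflush:
    flush_queue();
    if (!queue_is_empty()) {
        if (++flush_cnt == 10 ||
            (flush_cnt % 100 == 0 && flush_cnt <= 1000))
            printf("workqueue %s: flush on destruction isn't complete after %u tries\n",
                   name, flush_cnt);
        goto reflush;
    }
    printf("workqueue %s: drained after %u extra flushes\n", name, flush_cnt);
}

int main(void)
{
    destroy_queue("events");
    return 0;
}
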
@@ -3692,7 +3764,8 @@ static int __init init_workqueues(void)
3692 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); 3764 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
3693 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, 3765 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
3694 WQ_UNBOUND_MAX_ACTIVE); 3766 WQ_UNBOUND_MAX_ACTIVE);
3695 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq); 3767 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
3768 !system_unbound_wq);
3696 return 0; 3769 return 0;
3697} 3770}
3698early_initcall(init_workqueues); 3771early_initcall(init_workqueues);