diff options
Diffstat (limited to 'kernel')
68 files changed, 1742 insertions, 1146 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 76768ee812b2..08561f1acd13 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks | |||
| @@ -231,6 +231,10 @@ config RWSEM_SPIN_ON_OWNER | |||
| 231 | def_bool y | 231 | def_bool y |
| 232 | depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW | 232 | depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW |
| 233 | 233 | ||
| 234 | config LOCK_SPIN_ON_OWNER | ||
| 235 | def_bool y | ||
| 236 | depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER | ||
| 237 | |||
| 234 | config ARCH_USE_QUEUE_RWLOCK | 238 | config ARCH_USE_QUEUE_RWLOCK |
| 235 | bool | 239 | bool |
| 236 | 240 | ||
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d6594e457a25..a64e7a207d2b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c | |||
| @@ -163,7 +163,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, | |||
| 163 | 163 | ||
| 164 | void bpf_jit_binary_free(struct bpf_binary_header *hdr) | 164 | void bpf_jit_binary_free(struct bpf_binary_header *hdr) |
| 165 | { | 165 | { |
| 166 | module_free(NULL, hdr); | 166 | module_memfree(hdr); |
| 167 | } | 167 | } |
| 168 | #endif /* CONFIG_BPF_JIT */ | 168 | #endif /* CONFIG_BPF_JIT */ |
| 169 | 169 | ||
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 088ac0b1b106..536edc2be307 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c | |||
| @@ -150,7 +150,7 @@ static int map_lookup_elem(union bpf_attr *attr) | |||
| 150 | int ufd = attr->map_fd; | 150 | int ufd = attr->map_fd; |
| 151 | struct fd f = fdget(ufd); | 151 | struct fd f = fdget(ufd); |
| 152 | struct bpf_map *map; | 152 | struct bpf_map *map; |
| 153 | void *key, *value; | 153 | void *key, *value, *ptr; |
| 154 | int err; | 154 | int err; |
| 155 | 155 | ||
| 156 | if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) | 156 | if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) |
| @@ -169,20 +169,29 @@ static int map_lookup_elem(union bpf_attr *attr) | |||
| 169 | if (copy_from_user(key, ukey, map->key_size) != 0) | 169 | if (copy_from_user(key, ukey, map->key_size) != 0) |
| 170 | goto free_key; | 170 | goto free_key; |
| 171 | 171 | ||
| 172 | err = -ENOENT; | 172 | err = -ENOMEM; |
| 173 | rcu_read_lock(); | 173 | value = kmalloc(map->value_size, GFP_USER); |
| 174 | value = map->ops->map_lookup_elem(map, key); | ||
| 175 | if (!value) | 174 | if (!value) |
| 176 | goto err_unlock; | 175 | goto free_key; |
| 176 | |||
| 177 | rcu_read_lock(); | ||
| 178 | ptr = map->ops->map_lookup_elem(map, key); | ||
| 179 | if (ptr) | ||
| 180 | memcpy(value, ptr, map->value_size); | ||
| 181 | rcu_read_unlock(); | ||
| 182 | |||
| 183 | err = -ENOENT; | ||
| 184 | if (!ptr) | ||
| 185 | goto free_value; | ||
| 177 | 186 | ||
| 178 | err = -EFAULT; | 187 | err = -EFAULT; |
| 179 | if (copy_to_user(uvalue, value, map->value_size) != 0) | 188 | if (copy_to_user(uvalue, value, map->value_size) != 0) |
| 180 | goto err_unlock; | 189 | goto free_value; |
| 181 | 190 | ||
| 182 | err = 0; | 191 | err = 0; |
| 183 | 192 | ||
| 184 | err_unlock: | 193 | free_value: |
| 185 | rcu_read_unlock(); | 194 | kfree(value); |
| 186 | free_key: | 195 | free_key: |
| 187 | kfree(key); | 196 | kfree(key); |
| 188 | err_put: | 197 | err_put: |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bb263d0caab3..04cfe8ace520 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
| @@ -1909,7 +1909,7 @@ static void cgroup_kill_sb(struct super_block *sb) | |||
| 1909 | * | 1909 | * |
| 1910 | * And don't kill the default root. | 1910 | * And don't kill the default root. |
| 1911 | */ | 1911 | */ |
| 1912 | if (css_has_online_children(&root->cgrp.self) || | 1912 | if (!list_empty(&root->cgrp.self.children) || |
| 1913 | root == &cgrp_dfl_root) | 1913 | root == &cgrp_dfl_root) |
| 1914 | cgroup_put(&root->cgrp); | 1914 | cgroup_put(&root->cgrp); |
| 1915 | else | 1915 | else |
diff --git a/kernel/cpu.c b/kernel/cpu.c index 5d220234b3ca..1972b161c61e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
| @@ -58,22 +58,23 @@ static int cpu_hotplug_disabled; | |||
| 58 | 58 | ||
| 59 | static struct { | 59 | static struct { |
| 60 | struct task_struct *active_writer; | 60 | struct task_struct *active_writer; |
| 61 | struct mutex lock; /* Synchronizes accesses to refcount, */ | 61 | /* wait queue to wake up the active_writer */ |
| 62 | wait_queue_head_t wq; | ||
| 63 | /* verifies that no writer will get active while readers are active */ | ||
| 64 | struct mutex lock; | ||
| 62 | /* | 65 | /* |
| 63 | * Also blocks the new readers during | 66 | * Also blocks the new readers during |
| 64 | * an ongoing cpu hotplug operation. | 67 | * an ongoing cpu hotplug operation. |
| 65 | */ | 68 | */ |
| 66 | int refcount; | 69 | atomic_t refcount; |
| 67 | /* And allows lockless put_online_cpus(). */ | ||
| 68 | atomic_t puts_pending; | ||
| 69 | 70 | ||
| 70 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 71 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
| 71 | struct lockdep_map dep_map; | 72 | struct lockdep_map dep_map; |
| 72 | #endif | 73 | #endif |
| 73 | } cpu_hotplug = { | 74 | } cpu_hotplug = { |
| 74 | .active_writer = NULL, | 75 | .active_writer = NULL, |
| 76 | .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq), | ||
| 75 | .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), | 77 | .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), |
| 76 | .refcount = 0, | ||
| 77 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 78 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
| 78 | .dep_map = {.name = "cpu_hotplug.lock" }, | 79 | .dep_map = {.name = "cpu_hotplug.lock" }, |
| 79 | #endif | 80 | #endif |
| @@ -86,15 +87,6 @@ static struct { | |||
| 86 | #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) | 87 | #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) |
| 87 | #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) | 88 | #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) |
| 88 | 89 | ||
| 89 | static void apply_puts_pending(int max) | ||
| 90 | { | ||
| 91 | int delta; | ||
| 92 | |||
| 93 | if (atomic_read(&cpu_hotplug.puts_pending) >= max) { | ||
| 94 | delta = atomic_xchg(&cpu_hotplug.puts_pending, 0); | ||
| 95 | cpu_hotplug.refcount -= delta; | ||
| 96 | } | ||
| 97 | } | ||
| 98 | 90 | ||
| 99 | void get_online_cpus(void) | 91 | void get_online_cpus(void) |
| 100 | { | 92 | { |
| @@ -103,8 +95,7 @@ void get_online_cpus(void) | |||
| 103 | return; | 95 | return; |
| 104 | cpuhp_lock_acquire_read(); | 96 | cpuhp_lock_acquire_read(); |
| 105 | mutex_lock(&cpu_hotplug.lock); | 97 | mutex_lock(&cpu_hotplug.lock); |
| 106 | apply_puts_pending(65536); | 98 | atomic_inc(&cpu_hotplug.refcount); |
| 107 | cpu_hotplug.refcount++; | ||
| 108 | mutex_unlock(&cpu_hotplug.lock); | 99 | mutex_unlock(&cpu_hotplug.lock); |
| 109 | } | 100 | } |
| 110 | EXPORT_SYMBOL_GPL(get_online_cpus); | 101 | EXPORT_SYMBOL_GPL(get_online_cpus); |
| @@ -116,8 +107,7 @@ bool try_get_online_cpus(void) | |||
| 116 | if (!mutex_trylock(&cpu_hotplug.lock)) | 107 | if (!mutex_trylock(&cpu_hotplug.lock)) |
| 117 | return false; | 108 | return false; |
| 118 | cpuhp_lock_acquire_tryread(); | 109 | cpuhp_lock_acquire_tryread(); |
| 119 | apply_puts_pending(65536); | 110 | atomic_inc(&cpu_hotplug.refcount); |
| 120 | cpu_hotplug.refcount++; | ||
| 121 | mutex_unlock(&cpu_hotplug.lock); | 111 | mutex_unlock(&cpu_hotplug.lock); |
| 122 | return true; | 112 | return true; |
| 123 | } | 113 | } |
| @@ -125,20 +115,18 @@ EXPORT_SYMBOL_GPL(try_get_online_cpus); | |||
| 125 | 115 | ||
| 126 | void put_online_cpus(void) | 116 | void put_online_cpus(void) |
| 127 | { | 117 | { |
| 118 | int refcount; | ||
| 119 | |||
| 128 | if (cpu_hotplug.active_writer == current) | 120 | if (cpu_hotplug.active_writer == current) |
| 129 | return; | 121 | return; |
| 130 | if (!mutex_trylock(&cpu_hotplug.lock)) { | ||
| 131 | atomic_inc(&cpu_hotplug.puts_pending); | ||
| 132 | cpuhp_lock_release(); | ||
| 133 | return; | ||
| 134 | } | ||
| 135 | 122 | ||
| 136 | if (WARN_ON(!cpu_hotplug.refcount)) | 123 | refcount = atomic_dec_return(&cpu_hotplug.refcount); |
| 137 | cpu_hotplug.refcount++; /* try to fix things up */ | 124 | if (WARN_ON(refcount < 0)) /* try to fix things up */ |
| 125 | atomic_inc(&cpu_hotplug.refcount); | ||
| 126 | |||
| 127 | if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq)) | ||
| 128 | wake_up(&cpu_hotplug.wq); | ||
| 138 | 129 | ||
| 139 | if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) | ||
| 140 | wake_up_process(cpu_hotplug.active_writer); | ||
| 141 | mutex_unlock(&cpu_hotplug.lock); | ||
| 142 | cpuhp_lock_release(); | 130 | cpuhp_lock_release(); |
| 143 | 131 | ||
| 144 | } | 132 | } |
| @@ -168,18 +156,20 @@ EXPORT_SYMBOL_GPL(put_online_cpus); | |||
| 168 | */ | 156 | */ |
| 169 | void cpu_hotplug_begin(void) | 157 | void cpu_hotplug_begin(void) |
| 170 | { | 158 | { |
| 171 | cpu_hotplug.active_writer = current; | 159 | DEFINE_WAIT(wait); |
| 172 | 160 | ||
| 161 | cpu_hotplug.active_writer = current; | ||
| 173 | cpuhp_lock_acquire(); | 162 | cpuhp_lock_acquire(); |
| 163 | |||
| 174 | for (;;) { | 164 | for (;;) { |
| 175 | mutex_lock(&cpu_hotplug.lock); | 165 | mutex_lock(&cpu_hotplug.lock); |
| 176 | apply_puts_pending(1); | 166 | prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE); |
| 177 | if (likely(!cpu_hotplug.refcount)) | 167 | if (likely(!atomic_read(&cpu_hotplug.refcount))) |
| 178 | break; | 168 | break; |
| 179 | __set_current_state(TASK_UNINTERRUPTIBLE); | ||
| 180 | mutex_unlock(&cpu_hotplug.lock); | 169 | mutex_unlock(&cpu_hotplug.lock); |
| 181 | schedule(); | 170 | schedule(); |
| 182 | } | 171 | } |
| 172 | finish_wait(&cpu_hotplug.wq, &wait); | ||
| 183 | } | 173 | } |
| 184 | 174 | ||
| 185 | void cpu_hotplug_done(void) | 175 | void cpu_hotplug_done(void) |
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 1adf62b39b96..07ce18ca71e0 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c | |||
| @@ -27,6 +27,9 @@ | |||
| 27 | * version 2. This program is licensed "as is" without any warranty of any | 27 | * version 2. This program is licensed "as is" without any warranty of any |
| 28 | * kind, whether express or implied. | 28 | * kind, whether express or implied. |
| 29 | */ | 29 | */ |
| 30 | |||
| 31 | #define pr_fmt(fmt) "KGDB: " fmt | ||
| 32 | |||
| 30 | #include <linux/pid_namespace.h> | 33 | #include <linux/pid_namespace.h> |
| 31 | #include <linux/clocksource.h> | 34 | #include <linux/clocksource.h> |
| 32 | #include <linux/serial_core.h> | 35 | #include <linux/serial_core.h> |
| @@ -196,8 +199,8 @@ int __weak kgdb_validate_break_address(unsigned long addr) | |||
| 196 | return err; | 199 | return err; |
| 197 | err = kgdb_arch_remove_breakpoint(&tmp); | 200 | err = kgdb_arch_remove_breakpoint(&tmp); |
| 198 | if (err) | 201 | if (err) |
| 199 | printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " | 202 | pr_err("Critical breakpoint error, kernel memory destroyed at: %lx\n", |
| 200 | "memory destroyed at: %lx", addr); | 203 | addr); |
| 201 | return err; | 204 | return err; |
| 202 | } | 205 | } |
| 203 | 206 | ||
| @@ -256,8 +259,8 @@ int dbg_activate_sw_breakpoints(void) | |||
| 256 | error = kgdb_arch_set_breakpoint(&kgdb_break[i]); | 259 | error = kgdb_arch_set_breakpoint(&kgdb_break[i]); |
| 257 | if (error) { | 260 | if (error) { |
| 258 | ret = error; | 261 | ret = error; |
| 259 | printk(KERN_INFO "KGDB: BP install failed: %lx", | 262 | pr_info("BP install failed: %lx\n", |
| 260 | kgdb_break[i].bpt_addr); | 263 | kgdb_break[i].bpt_addr); |
| 261 | continue; | 264 | continue; |
| 262 | } | 265 | } |
| 263 | 266 | ||
| @@ -319,8 +322,8 @@ int dbg_deactivate_sw_breakpoints(void) | |||
| 319 | continue; | 322 | continue; |
| 320 | error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); | 323 | error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); |
| 321 | if (error) { | 324 | if (error) { |
| 322 | printk(KERN_INFO "KGDB: BP remove failed: %lx\n", | 325 | pr_info("BP remove failed: %lx\n", |
| 323 | kgdb_break[i].bpt_addr); | 326 | kgdb_break[i].bpt_addr); |
| 324 | ret = error; | 327 | ret = error; |
| 325 | } | 328 | } |
| 326 | 329 | ||
| @@ -367,7 +370,7 @@ int dbg_remove_all_break(void) | |||
| 367 | goto setundefined; | 370 | goto setundefined; |
| 368 | error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); | 371 | error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); |
| 369 | if (error) | 372 | if (error) |
| 370 | printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", | 373 | pr_err("breakpoint remove failed: %lx\n", |
| 371 | kgdb_break[i].bpt_addr); | 374 | kgdb_break[i].bpt_addr); |
| 372 | setundefined: | 375 | setundefined: |
| 373 | kgdb_break[i].state = BP_UNDEFINED; | 376 | kgdb_break[i].state = BP_UNDEFINED; |
| @@ -400,9 +403,9 @@ static int kgdb_io_ready(int print_wait) | |||
| 400 | if (print_wait) { | 403 | if (print_wait) { |
| 401 | #ifdef CONFIG_KGDB_KDB | 404 | #ifdef CONFIG_KGDB_KDB |
| 402 | if (!dbg_kdb_mode) | 405 | if (!dbg_kdb_mode) |
| 403 | printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n"); | 406 | pr_crit("waiting... or $3#33 for KDB\n"); |
| 404 | #else | 407 | #else |
| 405 | printk(KERN_CRIT "KGDB: Waiting for remote debugger\n"); | 408 | pr_crit("Waiting for remote debugger\n"); |
| 406 | #endif | 409 | #endif |
| 407 | } | 410 | } |
| 408 | return 1; | 411 | return 1; |
| @@ -430,8 +433,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks) | |||
| 430 | exception_level = 0; | 433 | exception_level = 0; |
| 431 | kgdb_skipexception(ks->ex_vector, ks->linux_regs); | 434 | kgdb_skipexception(ks->ex_vector, ks->linux_regs); |
| 432 | dbg_activate_sw_breakpoints(); | 435 | dbg_activate_sw_breakpoints(); |
| 433 | printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n", | 436 | pr_crit("re-enter error: breakpoint removed %lx\n", addr); |
| 434 | addr); | ||
| 435 | WARN_ON_ONCE(1); | 437 | WARN_ON_ONCE(1); |
| 436 | 438 | ||
| 437 | return 1; | 439 | return 1; |
| @@ -444,7 +446,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks) | |||
| 444 | panic("Recursive entry to debugger"); | 446 | panic("Recursive entry to debugger"); |
| 445 | } | 447 | } |
| 446 | 448 | ||
| 447 | printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n"); | 449 | pr_crit("re-enter exception: ALL breakpoints killed\n"); |
| 448 | #ifdef CONFIG_KGDB_KDB | 450 | #ifdef CONFIG_KGDB_KDB |
| 449 | /* Allow kdb to debug itself one level */ | 451 | /* Allow kdb to debug itself one level */ |
| 450 | return 0; | 452 | return 0; |
| @@ -471,6 +473,7 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs, | |||
| 471 | int cpu; | 473 | int cpu; |
| 472 | int trace_on = 0; | 474 | int trace_on = 0; |
| 473 | int online_cpus = num_online_cpus(); | 475 | int online_cpus = num_online_cpus(); |
| 476 | u64 time_left; | ||
| 474 | 477 | ||
| 475 | kgdb_info[ks->cpu].enter_kgdb++; | 478 | kgdb_info[ks->cpu].enter_kgdb++; |
| 476 | kgdb_info[ks->cpu].exception_state |= exception_state; | 479 | kgdb_info[ks->cpu].exception_state |= exception_state; |
| @@ -595,9 +598,13 @@ return_normal: | |||
| 595 | /* | 598 | /* |
| 596 | * Wait for the other CPUs to be notified and be waiting for us: | 599 | * Wait for the other CPUs to be notified and be waiting for us: |
| 597 | */ | 600 | */ |
| 598 | while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) + | 601 | time_left = loops_per_jiffy * HZ; |
| 599 | atomic_read(&slaves_in_kgdb)) != online_cpus) | 602 | while (kgdb_do_roundup && --time_left && |
| 603 | (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) != | ||
| 604 | online_cpus) | ||
| 600 | cpu_relax(); | 605 | cpu_relax(); |
| 606 | if (!time_left) | ||
| 607 | pr_crit("KGDB: Timed out waiting for secondary CPUs.\n"); | ||
| 601 | 608 | ||
| 602 | /* | 609 | /* |
| 603 | * At this point the primary processor is completely | 610 | * At this point the primary processor is completely |
| @@ -795,15 +802,15 @@ static struct console kgdbcons = { | |||
| 795 | static void sysrq_handle_dbg(int key) | 802 | static void sysrq_handle_dbg(int key) |
| 796 | { | 803 | { |
| 797 | if (!dbg_io_ops) { | 804 | if (!dbg_io_ops) { |
| 798 | printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); | 805 | pr_crit("ERROR: No KGDB I/O module available\n"); |
| 799 | return; | 806 | return; |
| 800 | } | 807 | } |
| 801 | if (!kgdb_connected) { | 808 | if (!kgdb_connected) { |
| 802 | #ifdef CONFIG_KGDB_KDB | 809 | #ifdef CONFIG_KGDB_KDB |
| 803 | if (!dbg_kdb_mode) | 810 | if (!dbg_kdb_mode) |
| 804 | printk(KERN_CRIT "KGDB or $3#33 for KDB\n"); | 811 | pr_crit("KGDB or $3#33 for KDB\n"); |
| 805 | #else | 812 | #else |
| 806 | printk(KERN_CRIT "Entering KGDB\n"); | 813 | pr_crit("Entering KGDB\n"); |
| 807 | #endif | 814 | #endif |
| 808 | } | 815 | } |
| 809 | 816 | ||
| @@ -945,7 +952,7 @@ static void kgdb_initial_breakpoint(void) | |||
| 945 | { | 952 | { |
| 946 | kgdb_break_asap = 0; | 953 | kgdb_break_asap = 0; |
| 947 | 954 | ||
| 948 | printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n"); | 955 | pr_crit("Waiting for connection from remote gdb...\n"); |
| 949 | kgdb_breakpoint(); | 956 | kgdb_breakpoint(); |
| 950 | } | 957 | } |
| 951 | 958 | ||
| @@ -964,8 +971,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) | |||
| 964 | if (dbg_io_ops) { | 971 | if (dbg_io_ops) { |
| 965 | spin_unlock(&kgdb_registration_lock); | 972 | spin_unlock(&kgdb_registration_lock); |
| 966 | 973 | ||
| 967 | printk(KERN_ERR "kgdb: Another I/O driver is already " | 974 | pr_err("Another I/O driver is already registered with KGDB\n"); |
| 968 | "registered with KGDB.\n"); | ||
| 969 | return -EBUSY; | 975 | return -EBUSY; |
| 970 | } | 976 | } |
| 971 | 977 | ||
| @@ -981,8 +987,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) | |||
| 981 | 987 | ||
| 982 | spin_unlock(&kgdb_registration_lock); | 988 | spin_unlock(&kgdb_registration_lock); |
| 983 | 989 | ||
| 984 | printk(KERN_INFO "kgdb: Registered I/O driver %s.\n", | 990 | pr_info("Registered I/O driver %s\n", new_dbg_io_ops->name); |
| 985 | new_dbg_io_ops->name); | ||
| 986 | 991 | ||
| 987 | /* Arm KGDB now. */ | 992 | /* Arm KGDB now. */ |
| 988 | kgdb_register_callbacks(); | 993 | kgdb_register_callbacks(); |
| @@ -1017,8 +1022,7 @@ void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops) | |||
| 1017 | 1022 | ||
| 1018 | spin_unlock(&kgdb_registration_lock); | 1023 | spin_unlock(&kgdb_registration_lock); |
| 1019 | 1024 | ||
| 1020 | printk(KERN_INFO | 1025 | pr_info("Unregistered I/O driver %s, debugger disabled\n", |
| 1021 | "kgdb: Unregistered I/O driver %s, debugger disabled.\n", | ||
| 1022 | old_dbg_io_ops->name); | 1026 | old_dbg_io_ops->name); |
| 1023 | } | 1027 | } |
| 1024 | EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); | 1028 | EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); |
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index b20d544f20c2..e1dbf4a2c69e 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c | |||
| @@ -531,22 +531,29 @@ void __init kdb_initbptab(void) | |||
| 531 | for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) | 531 | for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) |
| 532 | bp->bp_free = 1; | 532 | bp->bp_free = 1; |
| 533 | 533 | ||
| 534 | kdb_register_repeat("bp", kdb_bp, "[<vaddr>]", | 534 | kdb_register_flags("bp", kdb_bp, "[<vaddr>]", |
| 535 | "Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS); | 535 | "Set/Display breakpoints", 0, |
| 536 | kdb_register_repeat("bl", kdb_bp, "[<vaddr>]", | 536 | KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); |
| 537 | "Display breakpoints", 0, KDB_REPEAT_NO_ARGS); | 537 | kdb_register_flags("bl", kdb_bp, "[<vaddr>]", |
| 538 | "Display breakpoints", 0, | ||
| 539 | KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); | ||
| 538 | if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT) | 540 | if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT) |
| 539 | kdb_register_repeat("bph", kdb_bp, "[<vaddr>]", | 541 | kdb_register_flags("bph", kdb_bp, "[<vaddr>]", |
| 540 | "[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS); | 542 | "[datar [length]|dataw [length]] Set hw brk", 0, |
| 541 | kdb_register_repeat("bc", kdb_bc, "<bpnum>", | 543 | KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); |
| 542 | "Clear Breakpoint", 0, KDB_REPEAT_NONE); | 544 | kdb_register_flags("bc", kdb_bc, "<bpnum>", |
| 543 | kdb_register_repeat("be", kdb_bc, "<bpnum>", | 545 | "Clear Breakpoint", 0, |
| 544 | "Enable Breakpoint", 0, KDB_REPEAT_NONE); | 546 | KDB_ENABLE_FLOW_CTRL); |
| 545 | kdb_register_repeat("bd", kdb_bc, "<bpnum>", | 547 | kdb_register_flags("be", kdb_bc, "<bpnum>", |
| 546 | "Disable Breakpoint", 0, KDB_REPEAT_NONE); | 548 | "Enable Breakpoint", 0, |
| 547 | 549 | KDB_ENABLE_FLOW_CTRL); | |
| 548 | kdb_register_repeat("ss", kdb_ss, "", | 550 | kdb_register_flags("bd", kdb_bc, "<bpnum>", |
| 549 | "Single Step", 1, KDB_REPEAT_NO_ARGS); | 551 | "Disable Breakpoint", 0, |
| 552 | KDB_ENABLE_FLOW_CTRL); | ||
| 553 | |||
| 554 | kdb_register_flags("ss", kdb_ss, "", | ||
| 555 | "Single Step", 1, | ||
| 556 | KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); | ||
| 550 | /* | 557 | /* |
| 551 | * Architecture dependent initialization. | 558 | * Architecture dependent initialization. |
| 552 | */ | 559 | */ |
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index 8859ca34dcfe..15e1a7af5dd0 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c | |||
| @@ -129,6 +129,10 @@ int kdb_stub(struct kgdb_state *ks) | |||
| 129 | ks->pass_exception = 1; | 129 | ks->pass_exception = 1; |
| 130 | KDB_FLAG_SET(CATASTROPHIC); | 130 | KDB_FLAG_SET(CATASTROPHIC); |
| 131 | } | 131 | } |
| 132 | /* set CATASTROPHIC if the system contains unresponsive processors */ | ||
| 133 | for_each_online_cpu(i) | ||
| 134 | if (!kgdb_info[i].enter_kgdb) | ||
| 135 | KDB_FLAG_SET(CATASTROPHIC); | ||
| 132 | if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { | 136 | if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { |
| 133 | KDB_STATE_CLEAR(SSBPT); | 137 | KDB_STATE_CLEAR(SSBPT); |
| 134 | KDB_STATE_CLEAR(DOING_SS); | 138 | KDB_STATE_CLEAR(DOING_SS); |
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 379650b984f8..7b40c5f07dce 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
| @@ -12,6 +12,7 @@ | |||
| 12 | */ | 12 | */ |
| 13 | 13 | ||
| 14 | #include <linux/ctype.h> | 14 | #include <linux/ctype.h> |
| 15 | #include <linux/types.h> | ||
| 15 | #include <linux/string.h> | 16 | #include <linux/string.h> |
| 16 | #include <linux/kernel.h> | 17 | #include <linux/kernel.h> |
| 17 | #include <linux/kmsg_dump.h> | 18 | #include <linux/kmsg_dump.h> |
| @@ -23,6 +24,7 @@ | |||
| 23 | #include <linux/vmalloc.h> | 24 | #include <linux/vmalloc.h> |
| 24 | #include <linux/atomic.h> | 25 | #include <linux/atomic.h> |
| 25 | #include <linux/module.h> | 26 | #include <linux/module.h> |
| 27 | #include <linux/moduleparam.h> | ||
| 26 | #include <linux/mm.h> | 28 | #include <linux/mm.h> |
| 27 | #include <linux/init.h> | 29 | #include <linux/init.h> |
| 28 | #include <linux/kallsyms.h> | 30 | #include <linux/kallsyms.h> |
| @@ -42,6 +44,12 @@ | |||
| 42 | #include <linux/slab.h> | 44 | #include <linux/slab.h> |
| 43 | #include "kdb_private.h" | 45 | #include "kdb_private.h" |
| 44 | 46 | ||
| 47 | #undef MODULE_PARAM_PREFIX | ||
| 48 | #define MODULE_PARAM_PREFIX "kdb." | ||
| 49 | |||
| 50 | static int kdb_cmd_enabled = CONFIG_KDB_DEFAULT_ENABLE; | ||
| 51 | module_param_named(cmd_enable, kdb_cmd_enabled, int, 0600); | ||
| 52 | |||
| 45 | #define GREP_LEN 256 | 53 | #define GREP_LEN 256 |
| 46 | char kdb_grep_string[GREP_LEN]; | 54 | char kdb_grep_string[GREP_LEN]; |
| 47 | int kdb_grepping_flag; | 55 | int kdb_grepping_flag; |
| @@ -121,6 +129,7 @@ static kdbmsg_t kdbmsgs[] = { | |||
| 121 | KDBMSG(BADLENGTH, "Invalid length field"), | 129 | KDBMSG(BADLENGTH, "Invalid length field"), |
| 122 | KDBMSG(NOBP, "No Breakpoint exists"), | 130 | KDBMSG(NOBP, "No Breakpoint exists"), |
| 123 | KDBMSG(BADADDR, "Invalid address"), | 131 | KDBMSG(BADADDR, "Invalid address"), |
| 132 | KDBMSG(NOPERM, "Permission denied"), | ||
| 124 | }; | 133 | }; |
| 125 | #undef KDBMSG | 134 | #undef KDBMSG |
| 126 | 135 | ||
| @@ -188,6 +197,26 @@ struct task_struct *kdb_curr_task(int cpu) | |||
| 188 | } | 197 | } |
| 189 | 198 | ||
| 190 | /* | 199 | /* |
| 200 | * Check whether the flags of the current command and the permissions | ||
| 201 | * of the kdb console has allow a command to be run. | ||
| 202 | */ | ||
| 203 | static inline bool kdb_check_flags(kdb_cmdflags_t flags, int permissions, | ||
| 204 | bool no_args) | ||
| 205 | { | ||
| 206 | /* permissions comes from userspace so needs massaging slightly */ | ||
| 207 | permissions &= KDB_ENABLE_MASK; | ||
| 208 | permissions |= KDB_ENABLE_ALWAYS_SAFE; | ||
| 209 | |||
| 210 | /* some commands change group when launched with no arguments */ | ||
| 211 | if (no_args) | ||
| 212 | permissions |= permissions << KDB_ENABLE_NO_ARGS_SHIFT; | ||
| 213 | |||
| 214 | flags |= KDB_ENABLE_ALL; | ||
| 215 | |||
| 216 | return permissions & flags; | ||
| 217 | } | ||
| 218 | |||
| 219 | /* | ||
| 191 | * kdbgetenv - This function will return the character string value of | 220 | * kdbgetenv - This function will return the character string value of |
| 192 | * an environment variable. | 221 | * an environment variable. |
| 193 | * Parameters: | 222 | * Parameters: |
| @@ -476,6 +505,15 @@ int kdbgetaddrarg(int argc, const char **argv, int *nextarg, | |||
| 476 | kdb_symtab_t symtab; | 505 | kdb_symtab_t symtab; |
| 477 | 506 | ||
| 478 | /* | 507 | /* |
| 508 | * If the enable flags prohibit both arbitrary memory access | ||
| 509 | * and flow control then there are no reasonable grounds to | ||
| 510 | * provide symbol lookup. | ||
| 511 | */ | ||
| 512 | if (!kdb_check_flags(KDB_ENABLE_MEM_READ | KDB_ENABLE_FLOW_CTRL, | ||
| 513 | kdb_cmd_enabled, false)) | ||
| 514 | return KDB_NOPERM; | ||
| 515 | |||
| 516 | /* | ||
| 479 | * Process arguments which follow the following syntax: | 517 | * Process arguments which follow the following syntax: |
| 480 | * | 518 | * |
| 481 | * symbol | numeric-address [+/- numeric-offset] | 519 | * symbol | numeric-address [+/- numeric-offset] |
| @@ -641,8 +679,13 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0) | |||
| 641 | if (!s->count) | 679 | if (!s->count) |
| 642 | s->usable = 0; | 680 | s->usable = 0; |
| 643 | if (s->usable) | 681 | if (s->usable) |
| 644 | kdb_register(s->name, kdb_exec_defcmd, | 682 | /* macros are always safe because when executed each |
| 645 | s->usage, s->help, 0); | 683 | * internal command re-enters kdb_parse() and is |
| 684 | * safety checked individually. | ||
| 685 | */ | ||
| 686 | kdb_register_flags(s->name, kdb_exec_defcmd, s->usage, | ||
| 687 | s->help, 0, | ||
| 688 | KDB_ENABLE_ALWAYS_SAFE); | ||
| 646 | return 0; | 689 | return 0; |
| 647 | } | 690 | } |
| 648 | if (!s->usable) | 691 | if (!s->usable) |
| @@ -1003,25 +1046,22 @@ int kdb_parse(const char *cmdstr) | |||
| 1003 | 1046 | ||
| 1004 | if (i < kdb_max_commands) { | 1047 | if (i < kdb_max_commands) { |
| 1005 | int result; | 1048 | int result; |
| 1049 | |||
| 1050 | if (!kdb_check_flags(tp->cmd_flags, kdb_cmd_enabled, argc <= 1)) | ||
| 1051 | return KDB_NOPERM; | ||
| 1052 | |||
| 1006 | KDB_STATE_SET(CMD); | 1053 | KDB_STATE_SET(CMD); |
| 1007 | result = (*tp->cmd_func)(argc-1, (const char **)argv); | 1054 | result = (*tp->cmd_func)(argc-1, (const char **)argv); |
| 1008 | if (result && ignore_errors && result > KDB_CMD_GO) | 1055 | if (result && ignore_errors && result > KDB_CMD_GO) |
| 1009 | result = 0; | 1056 | result = 0; |
| 1010 | KDB_STATE_CLEAR(CMD); | 1057 | KDB_STATE_CLEAR(CMD); |
| 1011 | switch (tp->cmd_repeat) { | 1058 | |
| 1012 | case KDB_REPEAT_NONE: | 1059 | if (tp->cmd_flags & KDB_REPEAT_WITH_ARGS) |
| 1013 | argc = 0; | 1060 | return result; |
| 1014 | if (argv[0]) | 1061 | |
| 1015 | *(argv[0]) = '\0'; | 1062 | argc = tp->cmd_flags & KDB_REPEAT_NO_ARGS ? 1 : 0; |
| 1016 | break; | 1063 | if (argv[argc]) |
| 1017 | case KDB_REPEAT_NO_ARGS: | 1064 | *(argv[argc]) = '\0'; |
| 1018 | argc = 1; | ||
| 1019 | if (argv[1]) | ||
| 1020 | *(argv[1]) = '\0'; | ||
| 1021 | break; | ||
| 1022 | case KDB_REPEAT_WITH_ARGS: | ||
| 1023 | break; | ||
| 1024 | } | ||
| 1025 | return result; | 1065 | return result; |
| 1026 | } | 1066 | } |
| 1027 | 1067 | ||
| @@ -1921,10 +1961,14 @@ static int kdb_rm(int argc, const char **argv) | |||
| 1921 | */ | 1961 | */ |
| 1922 | static int kdb_sr(int argc, const char **argv) | 1962 | static int kdb_sr(int argc, const char **argv) |
| 1923 | { | 1963 | { |
| 1964 | bool check_mask = | ||
| 1965 | !kdb_check_flags(KDB_ENABLE_ALL, kdb_cmd_enabled, false); | ||
| 1966 | |||
| 1924 | if (argc != 1) | 1967 | if (argc != 1) |
| 1925 | return KDB_ARGCOUNT; | 1968 | return KDB_ARGCOUNT; |
| 1969 | |||
| 1926 | kdb_trap_printk++; | 1970 | kdb_trap_printk++; |
| 1927 | __handle_sysrq(*argv[1], false); | 1971 | __handle_sysrq(*argv[1], check_mask); |
| 1928 | kdb_trap_printk--; | 1972 | kdb_trap_printk--; |
| 1929 | 1973 | ||
| 1930 | return 0; | 1974 | return 0; |
| @@ -1979,7 +2023,7 @@ static int kdb_lsmod(int argc, const char **argv) | |||
| 1979 | kdb_printf("%-20s%8u 0x%p ", mod->name, | 2023 | kdb_printf("%-20s%8u 0x%p ", mod->name, |
| 1980 | mod->core_size, (void *)mod); | 2024 | mod->core_size, (void *)mod); |
| 1981 | #ifdef CONFIG_MODULE_UNLOAD | 2025 | #ifdef CONFIG_MODULE_UNLOAD |
| 1982 | kdb_printf("%4ld ", module_refcount(mod)); | 2026 | kdb_printf("%4d ", module_refcount(mod)); |
| 1983 | #endif | 2027 | #endif |
| 1984 | if (mod->state == MODULE_STATE_GOING) | 2028 | if (mod->state == MODULE_STATE_GOING) |
| 1985 | kdb_printf(" (Unloading)"); | 2029 | kdb_printf(" (Unloading)"); |
| @@ -2157,6 +2201,8 @@ static void kdb_cpu_status(void) | |||
| 2157 | for (start_cpu = -1, i = 0; i < NR_CPUS; i++) { | 2201 | for (start_cpu = -1, i = 0; i < NR_CPUS; i++) { |
| 2158 | if (!cpu_online(i)) { | 2202 | if (!cpu_online(i)) { |
| 2159 | state = 'F'; /* cpu is offline */ | 2203 | state = 'F'; /* cpu is offline */ |
| 2204 | } else if (!kgdb_info[i].enter_kgdb) { | ||
| 2205 | state = 'D'; /* cpu is online but unresponsive */ | ||
| 2160 | } else { | 2206 | } else { |
| 2161 | state = ' '; /* cpu is responding to kdb */ | 2207 | state = ' '; /* cpu is responding to kdb */ |
| 2162 | if (kdb_task_state_char(KDB_TSK(i)) == 'I') | 2208 | if (kdb_task_state_char(KDB_TSK(i)) == 'I') |
| @@ -2210,7 +2256,7 @@ static int kdb_cpu(int argc, const char **argv) | |||
| 2210 | /* | 2256 | /* |
| 2211 | * Validate cpunum | 2257 | * Validate cpunum |
| 2212 | */ | 2258 | */ |
| 2213 | if ((cpunum > NR_CPUS) || !cpu_online(cpunum)) | 2259 | if ((cpunum > NR_CPUS) || !kgdb_info[cpunum].enter_kgdb) |
| 2214 | return KDB_BADCPUNUM; | 2260 | return KDB_BADCPUNUM; |
| 2215 | 2261 | ||
| 2216 | dbg_switch_cpu = cpunum; | 2262 | dbg_switch_cpu = cpunum; |
| @@ -2375,6 +2421,8 @@ static int kdb_help(int argc, const char **argv) | |||
| 2375 | return 0; | 2421 | return 0; |
| 2376 | if (!kt->cmd_name) | 2422 | if (!kt->cmd_name) |
| 2377 | continue; | 2423 | continue; |
| 2424 | if (!kdb_check_flags(kt->cmd_flags, kdb_cmd_enabled, true)) | ||
| 2425 | continue; | ||
| 2378 | if (strlen(kt->cmd_usage) > 20) | 2426 | if (strlen(kt->cmd_usage) > 20) |
| 2379 | space = "\n "; | 2427 | space = "\n "; |
| 2380 | kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name, | 2428 | kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name, |
| @@ -2629,7 +2677,7 @@ static int kdb_grep_help(int argc, const char **argv) | |||
| 2629 | } | 2677 | } |
| 2630 | 2678 | ||
| 2631 | /* | 2679 | /* |
| 2632 | * kdb_register_repeat - This function is used to register a kernel | 2680 | * kdb_register_flags - This function is used to register a kernel |
| 2633 | * debugger command. | 2681 | * debugger command. |
| 2634 | * Inputs: | 2682 | * Inputs: |
| 2635 | * cmd Command name | 2683 | * cmd Command name |
| @@ -2641,12 +2689,12 @@ static int kdb_grep_help(int argc, const char **argv) | |||
| 2641 | * zero for success, one if a duplicate command. | 2689 | * zero for success, one if a duplicate command. |
| 2642 | */ | 2690 | */ |
| 2643 | #define kdb_command_extend 50 /* arbitrary */ | 2691 | #define kdb_command_extend 50 /* arbitrary */ |
| 2644 | int kdb_register_repeat(char *cmd, | 2692 | int kdb_register_flags(char *cmd, |
| 2645 | kdb_func_t func, | 2693 | kdb_func_t func, |
| 2646 | char *usage, | 2694 | char *usage, |
| 2647 | char *help, | 2695 | char *help, |
| 2648 | short minlen, | 2696 | short minlen, |
| 2649 | kdb_repeat_t repeat) | 2697 | kdb_cmdflags_t flags) |
| 2650 | { | 2698 | { |
| 2651 | int i; | 2699 | int i; |
| 2652 | kdbtab_t *kp; | 2700 | kdbtab_t *kp; |
| @@ -2694,19 +2742,18 @@ int kdb_register_repeat(char *cmd, | |||
| 2694 | kp->cmd_func = func; | 2742 | kp->cmd_func = func; |
| 2695 | kp->cmd_usage = usage; | 2743 | kp->cmd_usage = usage; |
| 2696 | kp->cmd_help = help; | 2744 | kp->cmd_help = help; |
| 2697 | kp->cmd_flags = 0; | ||
| 2698 | kp->cmd_minlen = minlen; | 2745 | kp->cmd_minlen = minlen; |
| 2699 | kp->cmd_repeat = repeat; | 2746 | kp->cmd_flags = flags; |
| 2700 | 2747 | ||
| 2701 | return 0; | 2748 | return 0; |
| 2702 | } | 2749 | } |
| 2703 | EXPORT_SYMBOL_GPL(kdb_register_repeat); | 2750 | EXPORT_SYMBOL_GPL(kdb_register_flags); |
| 2704 | 2751 | ||
| 2705 | 2752 | ||
| 2706 | /* | 2753 | /* |
| 2707 | * kdb_register - Compatibility register function for commands that do | 2754 | * kdb_register - Compatibility register function for commands that do |
| 2708 | * not need to specify a repeat state. Equivalent to | 2755 | * not need to specify a repeat state. Equivalent to |
| 2709 | * kdb_register_repeat with KDB_REPEAT_NONE. | 2756 | * kdb_register_flags with flags set to 0. |
| 2710 | * Inputs: | 2757 | * Inputs: |
| 2711 | * cmd Command name | 2758 | * cmd Command name |
| 2712 | * func Function to execute the command | 2759 | * func Function to execute the command |
| @@ -2721,8 +2768,7 @@ int kdb_register(char *cmd, | |||
| 2721 | char *help, | 2768 | char *help, |
| 2722 | short minlen) | 2769 | short minlen) |
| 2723 | { | 2770 | { |
| 2724 | return kdb_register_repeat(cmd, func, usage, help, minlen, | 2771 | return kdb_register_flags(cmd, func, usage, help, minlen, 0); |
| 2725 | KDB_REPEAT_NONE); | ||
| 2726 | } | 2772 | } |
| 2727 | EXPORT_SYMBOL_GPL(kdb_register); | 2773 | EXPORT_SYMBOL_GPL(kdb_register); |
| 2728 | 2774 | ||
| @@ -2764,80 +2810,109 @@ static void __init kdb_inittab(void) | |||
| 2764 | for_each_kdbcmd(kp, i) | 2810 | for_each_kdbcmd(kp, i) |
| 2765 | kp->cmd_name = NULL; | 2811 | kp->cmd_name = NULL; |
| 2766 | 2812 | ||
| 2767 | kdb_register_repeat("md", kdb_md, "<vaddr>", | 2813 | kdb_register_flags("md", kdb_md, "<vaddr>", |
| 2768 | "Display Memory Contents, also mdWcN, e.g. md8c1", 1, | 2814 | "Display Memory Contents, also mdWcN, e.g. md8c1", 1, |
| 2769 | KDB_REPEAT_NO_ARGS); | 2815 | KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); |
| 2770 | kdb_register_repeat("mdr", kdb_md, "<vaddr> <bytes>", | 2816 | kdb_register_flags("mdr", kdb_md, "<vaddr> <bytes>", |
| 2771 | "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS); | 2817 | "Display Raw Memory", 0, |
| 2772 | kdb_register_repeat("mdp", kdb_md, "<paddr> <bytes>", | 2818 | KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); |
| 2773 | "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS); | 2819 | kdb_register_flags("mdp", kdb_md, "<paddr> <bytes>", |
| 2774 | kdb_register_repeat("mds", kdb_md, "<vaddr>", | 2820 | "Display Physical Memory", 0, |
| 2775 | "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS); | 2821 | KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); |
| 2776 | kdb_register_repeat("mm", kdb_mm, "<vaddr> <contents>", | 2822 | kdb_register_flags("mds", kdb_md, "<vaddr>", |
| 2777 | "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS); | 2823 | "Display Memory Symbolically", 0, |
| 2778 | kdb_register_repeat("go", kdb_go, "[<vaddr>]", | 2824 | KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); |
| 2779 | "Continue Execution", 1, KDB_REPEAT_NONE); | 2825 | kdb_register_flags("mm", kdb_mm, "<vaddr> <contents>", |
| 2780 | kdb_register_repeat("rd", kdb_rd, "", | 2826 | "Modify Memory Contents", 0, |
| 2781 | "Display Registers", 0, KDB_REPEAT_NONE); | 2827 | KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS); |
| 2782 | kdb_register_repeat("rm", kdb_rm, "<reg> <contents>", | 2828 | kdb_register_flags("go", kdb_go, "[<vaddr>]", |
| 2783 | "Modify Registers", 0, KDB_REPEAT_NONE); | 2829 | "Continue Execution", 1, |
| 2784 | kdb_register_repeat("ef", kdb_ef, "<vaddr>", | 2830 | KDB_ENABLE_REG_WRITE | KDB_ENABLE_ALWAYS_SAFE_NO_ARGS); |
| 2785 | "Display exception frame", 0, KDB_REPEAT_NONE); | 2831 | kdb_register_flags("rd", kdb_rd, "", |
| 2786 | kdb_register_repeat("bt", kdb_bt, "[<vaddr>]", | 2832 | "Display Registers", 0, |
| 2787 | "Stack traceback", 1, KDB_REPEAT_NONE); | 2833 | KDB_ENABLE_REG_READ); |
| 2788 | kdb_register_repeat("btp", kdb_bt, "<pid>", | 2834 | kdb_register_flags("rm", kdb_rm, "<reg> <contents>", |
| 2789 | "Display stack for process <pid>", 0, KDB_REPEAT_NONE); | 2835 | "Modify Registers", 0, |
| 2790 | kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]", | 2836 | KDB_ENABLE_REG_WRITE); |
| 2791 | "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE); | 2837 | kdb_register_flags("ef", kdb_ef, "<vaddr>", |
| 2792 | kdb_register_repeat("btc", kdb_bt, "", | 2838 | "Display exception frame", 0, |
| 2793 | "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE); | 2839 | KDB_ENABLE_MEM_READ); |
| 2794 | kdb_register_repeat("btt", kdb_bt, "<vaddr>", | 2840 | kdb_register_flags("bt", kdb_bt, "[<vaddr>]", |
| 2841 | "Stack traceback", 1, | ||
| 2842 | KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS); | ||
| 2843 | kdb_register_flags("btp", kdb_bt, "<pid>", | ||
| 2844 | "Display stack for process <pid>", 0, | ||
| 2845 | KDB_ENABLE_INSPECT); | ||
| 2846 | kdb_register_flags("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]", | ||
| 2847 | "Backtrace all processes matching state flag", 0, | ||
| 2848 | KDB_ENABLE_INSPECT); | ||
| 2849 | kdb_register_flags("btc", kdb_bt, "", | ||
| 2850 | "Backtrace current process on each cpu", 0, | ||
| 2851 | KDB_ENABLE_INSPECT); | ||
| 2852 | kdb_register_flags("btt", kdb_bt, "<vaddr>", | ||
| 2795 | "Backtrace process given its struct task address", 0, | 2853 | "Backtrace process given its struct task address", 0, |
| 2796 | KDB_REPEAT_NONE); | 2854 | KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS); |
| 2797 | kdb_register_repeat("env", kdb_env, "", | 2855 | kdb_register_flags("env", kdb_env, "", |
| 2798 | "Show environment variables", 0, KDB_REPEAT_NONE); | 2856 | "Show environment variables", 0, |
| 2799 | kdb_register_repeat("set", kdb_set, "", | 2857 | KDB_ENABLE_ALWAYS_SAFE); |
| 2800 | "Set environment variables", 0, KDB_REPEAT_NONE); | 2858 | kdb_register_flags("set", kdb_set, "", |
| 2801 | kdb_register_repeat("help", kdb_help, "", | 2859 | "Set environment variables", 0, |
| 2802 | "Display Help Message", 1, KDB_REPEAT_NONE); | 2860 | KDB_ENABLE_ALWAYS_SAFE); |
| 2803 | kdb_register_repeat("?", kdb_help, "", | 2861 | kdb_register_flags("help", kdb_help, "", |
| 2804 | "Display Help Message", 0, KDB_REPEAT_NONE); | 2862 | "Display Help Message", 1, |
| 2805 | kdb_register_repeat("cpu", kdb_cpu, "<cpunum>", | 2863 | KDB_ENABLE_ALWAYS_SAFE); |
| 2806 | "Switch to new cpu", 0, KDB_REPEAT_NONE); | 2864 | kdb_register_flags("?", kdb_help, "", |
| 2807 | kdb_register_repeat("kgdb", kdb_kgdb, "", | 2865 | "Display Help Message", 0, |
| 2808 | "Enter kgdb mode", 0, KDB_REPEAT_NONE); | 2866 | KDB_ENABLE_ALWAYS_SAFE); |
| 2809 | kdb_register_repeat("ps", kdb_ps, "[<flags>|A]", | 2867 | kdb_register_flags("cpu", kdb_cpu, "<cpunum>", |
| 2810 | "Display active task list", 0, KDB_REPEAT_NONE); | 2868 | "Switch to new cpu", 0, |
| 2811 | kdb_register_repeat("pid", kdb_pid, "<pidnum>", | 2869 | KDB_ENABLE_ALWAYS_SAFE_NO_ARGS); |
| 2812 | "Switch to another task", 0, KDB_REPEAT_NONE); | 2870 | kdb_register_flags("kgdb", kdb_kgdb, "", |
| 2813 | kdb_register_repeat("reboot", kdb_reboot, "", | 2871 | "Enter kgdb mode", 0, 0); |
| 2814 | "Reboot the machine immediately", 0, KDB_REPEAT_NONE); | 2872 | kdb_register_flags("ps", kdb_ps, "[<flags>|A]", |
| 2873 | "Display active task list", 0, | ||
| 2874 | KDB_ENABLE_INSPECT); | ||
| 2875 | kdb_register_flags("pid", kdb_pid, "<pidnum>", | ||
| 2876 | "Switch to another task", 0, | ||
| 2877 | KDB_ENABLE_INSPECT); | ||
| 2878 | kdb_register_flags("reboot", kdb_reboot, "", | ||
| 2879 | "Reboot the machine immediately", 0, | ||
| 2880 | KDB_ENABLE_REBOOT); | ||
| 2815 | #if defined(CONFIG_MODULES) | 2881 | #if defined(CONFIG_MODULES) |
| 2816 | kdb_register_repeat("lsmod", kdb_lsmod, "", | 2882 | kdb_register_flags("lsmod", kdb_lsmod, "", |
| 2817 | "List loaded kernel modules", 0, KDB_REPEAT_NONE); | 2883 | "List loaded kernel modules", 0, |
| 2884 | KDB_ENABLE_INSPECT); | ||
| 2818 | #endif | 2885 | #endif |
| 2819 | #if defined(CONFIG_MAGIC_SYSRQ) | 2886 | #if defined(CONFIG_MAGIC_SYSRQ) |
| 2820 | kdb_register_repeat("sr", kdb_sr, "<key>", | 2887 | kdb_register_flags("sr", kdb_sr, "<key>", |
| 2821 | "Magic SysRq key", 0, KDB_REPEAT_NONE); | 2888 | "Magic SysRq key", 0, |
| 2889 | KDB_ENABLE_ALWAYS_SAFE); | ||
| 2822 | #endif | 2890 | #endif |
| 2823 | #if defined(CONFIG_PRINTK) | 2891 | #if defined(CONFIG_PRINTK) |
| 2824 | kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", | 2892 | kdb_register_flags("dmesg", kdb_dmesg, "[lines]", |
| 2825 | "Display syslog buffer", 0, KDB_REPEAT_NONE); | 2893 | "Display syslog buffer", 0, |
| 2894 | KDB_ENABLE_ALWAYS_SAFE); | ||
| 2826 | #endif | 2895 | #endif |
| 2827 | if (arch_kgdb_ops.enable_nmi) { | 2896 | if (arch_kgdb_ops.enable_nmi) { |
| 2828 | kdb_register_repeat("disable_nmi", kdb_disable_nmi, "", | 2897 | kdb_register_flags("disable_nmi", kdb_disable_nmi, "", |
| 2829 | "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE); | 2898 | "Disable NMI entry to KDB", 0, |
| 2830 | } | 2899 | KDB_ENABLE_ALWAYS_SAFE); |
| 2831 | kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", | 2900 | } |
| 2832 | "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); | 2901 | kdb_register_flags("defcmd", kdb_defcmd, "name \"usage\" \"help\"", |
| 2833 | kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>", | 2902 | "Define a set of commands, down to endefcmd", 0, |
| 2834 | "Send a signal to a process", 0, KDB_REPEAT_NONE); | 2903 | KDB_ENABLE_ALWAYS_SAFE); |
| 2835 | kdb_register_repeat("summary", kdb_summary, "", | 2904 | kdb_register_flags("kill", kdb_kill, "<-signal> <pid>", |
| 2836 | "Summarize the system", 4, KDB_REPEAT_NONE); | 2905 | "Send a signal to a process", 0, |
| 2837 | kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]", | 2906 | KDB_ENABLE_SIGNAL); |
| 2838 | "Display per_cpu variables", 3, KDB_REPEAT_NONE); | 2907 | kdb_register_flags("summary", kdb_summary, "", |
| 2839 | kdb_register_repeat("grephelp", kdb_grep_help, "", | 2908 | "Summarize the system", 4, |
| 2840 | "Display help on | grep", 0, KDB_REPEAT_NONE); | 2909 | KDB_ENABLE_ALWAYS_SAFE); |
| 2910 | kdb_register_flags("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]", | ||
| 2911 | "Display per_cpu variables", 3, | ||
| 2912 | KDB_ENABLE_MEM_READ); | ||
| 2913 | kdb_register_flags("grephelp", kdb_grep_help, "", | ||
| 2914 | "Display help on | grep", 0, | ||
| 2915 | KDB_ENABLE_ALWAYS_SAFE); | ||
| 2841 | } | 2916 | } |
| 2842 | 2917 | ||
| 2843 | /* Execute any commands defined in kdb_cmds. */ | 2918 | /* Execute any commands defined in kdb_cmds. */ |
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 7afd3c8c41d5..eaacd1693954 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h | |||
| @@ -172,10 +172,9 @@ typedef struct _kdbtab { | |||
| 172 | kdb_func_t cmd_func; /* Function to execute command */ | 172 | kdb_func_t cmd_func; /* Function to execute command */ |
| 173 | char *cmd_usage; /* Usage String for this command */ | 173 | char *cmd_usage; /* Usage String for this command */ |
| 174 | char *cmd_help; /* Help message for this command */ | 174 | char *cmd_help; /* Help message for this command */ |
| 175 | short cmd_flags; /* Parsing flags */ | ||
| 176 | short cmd_minlen; /* Minimum legal # command | 175 | short cmd_minlen; /* Minimum legal # command |
| 177 | * chars required */ | 176 | * chars required */ |
| 178 | kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? */ | 177 | kdb_cmdflags_t cmd_flags; /* Command behaviour flags */ |
| 179 | } kdbtab_t; | 178 | } kdbtab_t; |
| 180 | 179 | ||
| 181 | extern int kdb_bt(int, const char **); /* KDB display back trace */ | 180 | extern int kdb_bt(int, const char **); /* KDB display back trace */ |
diff --git a/kernel/events/core.c b/kernel/events/core.c index 4c1ee7f2bebc..7f2fbb8b5069 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
| @@ -872,22 +872,32 @@ void perf_pmu_enable(struct pmu *pmu) | |||
| 872 | pmu->pmu_enable(pmu); | 872 | pmu->pmu_enable(pmu); |
| 873 | } | 873 | } |
| 874 | 874 | ||
| 875 | static DEFINE_PER_CPU(struct list_head, rotation_list); | 875 | static DEFINE_PER_CPU(struct list_head, active_ctx_list); |
| 876 | 876 | ||
| 877 | /* | 877 | /* |
| 878 | * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized | 878 | * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and |
| 879 | * because they're strictly cpu affine and rotate_start is called with IRQs | 879 | * perf_event_task_tick() are fully serialized because they're strictly cpu |
| 880 | * disabled, while rotate_context is called from IRQ context. | 880 | * affine and perf_event_ctx{activate,deactivate} are called with IRQs |
| 881 | * disabled, while perf_event_task_tick is called from IRQ context. | ||
| 881 | */ | 882 | */ |
| 882 | static void perf_pmu_rotate_start(struct pmu *pmu) | 883 | static void perf_event_ctx_activate(struct perf_event_context *ctx) |
| 883 | { | 884 | { |
| 884 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | 885 | struct list_head *head = this_cpu_ptr(&active_ctx_list); |
| 885 | struct list_head *head = this_cpu_ptr(&rotation_list); | ||
| 886 | 886 | ||
| 887 | WARN_ON(!irqs_disabled()); | 887 | WARN_ON(!irqs_disabled()); |
| 888 | 888 | ||
| 889 | if (list_empty(&cpuctx->rotation_list)) | 889 | WARN_ON(!list_empty(&ctx->active_ctx_list)); |
| 890 | list_add(&cpuctx->rotation_list, head); | 890 | |
| 891 | list_add(&ctx->active_ctx_list, head); | ||
| 892 | } | ||
| 893 | |||
| 894 | static void perf_event_ctx_deactivate(struct perf_event_context *ctx) | ||
| 895 | { | ||
| 896 | WARN_ON(!irqs_disabled()); | ||
| 897 | |||
| 898 | WARN_ON(list_empty(&ctx->active_ctx_list)); | ||
| 899 | |||
| 900 | list_del_init(&ctx->active_ctx_list); | ||
| 891 | } | 901 | } |
| 892 | 902 | ||
| 893 | static void get_ctx(struct perf_event_context *ctx) | 903 | static void get_ctx(struct perf_event_context *ctx) |
| @@ -907,6 +917,84 @@ static void put_ctx(struct perf_event_context *ctx) | |||
| 907 | } | 917 | } |
| 908 | 918 | ||
| 909 | /* | 919 | /* |
| 920 | * Because of perf_event::ctx migration in sys_perf_event_open::move_group and | ||
| 921 | * perf_pmu_migrate_context() we need some magic. | ||
| 922 | * | ||
| 923 | * Those places that change perf_event::ctx will hold both | ||
| 924 | * perf_event_ctx::mutex of the 'old' and 'new' ctx value. | ||
| 925 | * | ||
| 926 | * Lock ordering is by mutex address. There is one other site where | ||
| 927 | * perf_event_context::mutex nests and that is put_event(). But remember that | ||
| 928 | * that is a parent<->child context relation, and migration does not affect | ||
| 929 | * children, therefore these two orderings should not interact. | ||
| 930 | * | ||
| 931 | * The change in perf_event::ctx does not affect children (as claimed above) | ||
| 932 | * because the sys_perf_event_open() case will install a new event and break | ||
| 933 | * the ctx parent<->child relation, and perf_pmu_migrate_context() is only | ||
| 934 | * concerned with cpuctx and that doesn't have children. | ||
| 935 | * | ||
| 936 | * The places that change perf_event::ctx will issue: | ||
| 937 | * | ||
| 938 | * perf_remove_from_context(); | ||
| 939 | * synchronize_rcu(); | ||
| 940 | * perf_install_in_context(); | ||
| 941 | * | ||
| 942 | * to affect the change. The remove_from_context() + synchronize_rcu() should | ||
| 943 | * quiesce the event, after which we can install it in the new location. This | ||
| 944 | * means that only external vectors (perf_fops, prctl) can perturb the event | ||
| 945 | * while in transit. Therefore all such accessors should also acquire | ||
| 946 | * perf_event_context::mutex to serialize against this. | ||
| 947 | * | ||
| 948 | * However; because event->ctx can change while we're waiting to acquire | ||
| 949 | * ctx->mutex we must be careful and use the below perf_event_ctx_lock() | ||
| 950 | * function. | ||
| 951 | * | ||
| 952 | * Lock order: | ||
| 953 | * task_struct::perf_event_mutex | ||
| 954 | * perf_event_context::mutex | ||
| 955 | * perf_event_context::lock | ||
| 956 | * perf_event::child_mutex; | ||
| 957 | * perf_event::mmap_mutex | ||
| 958 | * mmap_sem | ||
| 959 | */ | ||
| 960 | static struct perf_event_context * | ||
| 961 | perf_event_ctx_lock_nested(struct perf_event *event, int nesting) | ||
| 962 | { | ||
| 963 | struct perf_event_context *ctx; | ||
| 964 | |||
| 965 | again: | ||
| 966 | rcu_read_lock(); | ||
| 967 | ctx = ACCESS_ONCE(event->ctx); | ||
| 968 | if (!atomic_inc_not_zero(&ctx->refcount)) { | ||
| 969 | rcu_read_unlock(); | ||
| 970 | goto again; | ||
| 971 | } | ||
| 972 | rcu_read_unlock(); | ||
| 973 | |||
| 974 | mutex_lock_nested(&ctx->mutex, nesting); | ||
| 975 | if (event->ctx != ctx) { | ||
| 976 | mutex_unlock(&ctx->mutex); | ||
| 977 | put_ctx(ctx); | ||
| 978 | goto again; | ||
| 979 | } | ||
| 980 | |||
| 981 | return ctx; | ||
| 982 | } | ||
| 983 | |||
| 984 | static inline struct perf_event_context * | ||
| 985 | perf_event_ctx_lock(struct perf_event *event) | ||
| 986 | { | ||
| 987 | return perf_event_ctx_lock_nested(event, 0); | ||
| 988 | } | ||
| 989 | |||
| 990 | static void perf_event_ctx_unlock(struct perf_event *event, | ||
| 991 | struct perf_event_context *ctx) | ||
| 992 | { | ||
| 993 | mutex_unlock(&ctx->mutex); | ||
| 994 | put_ctx(ctx); | ||
| 995 | } | ||
| 996 | |||
| 997 | /* | ||
| 910 | * This must be done under the ctx->lock, such as to serialize against | 998 | * This must be done under the ctx->lock, such as to serialize against |
| 911 | * context_equiv(), therefore we cannot call put_ctx() since that might end up | 999 | * context_equiv(), therefore we cannot call put_ctx() since that might end up |
| 912 | * calling scheduler related locks and ctx->lock nests inside those. | 1000 | * calling scheduler related locks and ctx->lock nests inside those. |
| @@ -1155,8 +1243,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
| 1155 | ctx->nr_branch_stack++; | 1243 | ctx->nr_branch_stack++; |
| 1156 | 1244 | ||
| 1157 | list_add_rcu(&event->event_entry, &ctx->event_list); | 1245 | list_add_rcu(&event->event_entry, &ctx->event_list); |
| 1158 | if (!ctx->nr_events) | ||
| 1159 | perf_pmu_rotate_start(ctx->pmu); | ||
| 1160 | ctx->nr_events++; | 1246 | ctx->nr_events++; |
| 1161 | if (event->attr.inherit_stat) | 1247 | if (event->attr.inherit_stat) |
| 1162 | ctx->nr_stat++; | 1248 | ctx->nr_stat++; |
| @@ -1275,6 +1361,8 @@ static void perf_group_attach(struct perf_event *event) | |||
| 1275 | if (group_leader == event) | 1361 | if (group_leader == event) |
| 1276 | return; | 1362 | return; |
| 1277 | 1363 | ||
| 1364 | WARN_ON_ONCE(group_leader->ctx != event->ctx); | ||
| 1365 | |||
| 1278 | if (group_leader->group_flags & PERF_GROUP_SOFTWARE && | 1366 | if (group_leader->group_flags & PERF_GROUP_SOFTWARE && |
| 1279 | !is_software_event(event)) | 1367 | !is_software_event(event)) |
| 1280 | group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; | 1368 | group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; |
| @@ -1296,6 +1384,10 @@ static void | |||
| 1296 | list_del_event(struct perf_event *event, struct perf_event_context *ctx) | 1384 | list_del_event(struct perf_event *event, struct perf_event_context *ctx) |
| 1297 | { | 1385 | { |
| 1298 | struct perf_cpu_context *cpuctx; | 1386 | struct perf_cpu_context *cpuctx; |
| 1387 | |||
| 1388 | WARN_ON_ONCE(event->ctx != ctx); | ||
| 1389 | lockdep_assert_held(&ctx->lock); | ||
| 1390 | |||
| 1299 | /* | 1391 | /* |
| 1300 | * We can have double detach due to exit/hot-unplug + close. | 1392 | * We can have double detach due to exit/hot-unplug + close. |
| 1301 | */ | 1393 | */ |
| @@ -1380,6 +1472,8 @@ static void perf_group_detach(struct perf_event *event) | |||
| 1380 | 1472 | ||
| 1381 | /* Inherit group flags from the previous leader */ | 1473 | /* Inherit group flags from the previous leader */ |
| 1382 | sibling->group_flags = event->group_flags; | 1474 | sibling->group_flags = event->group_flags; |
| 1475 | |||
| 1476 | WARN_ON_ONCE(sibling->ctx != event->ctx); | ||
| 1383 | } | 1477 | } |
| 1384 | 1478 | ||
| 1385 | out: | 1479 | out: |
| @@ -1442,6 +1536,10 @@ event_sched_out(struct perf_event *event, | |||
| 1442 | { | 1536 | { |
| 1443 | u64 tstamp = perf_event_time(event); | 1537 | u64 tstamp = perf_event_time(event); |
| 1444 | u64 delta; | 1538 | u64 delta; |
| 1539 | |||
| 1540 | WARN_ON_ONCE(event->ctx != ctx); | ||
| 1541 | lockdep_assert_held(&ctx->lock); | ||
| 1542 | |||
| 1445 | /* | 1543 | /* |
| 1446 | * An event which could not be activated because of | 1544 | * An event which could not be activated because of |
| 1447 | * filter mismatch still needs to have its timings | 1545 | * filter mismatch still needs to have its timings |
| @@ -1471,7 +1569,8 @@ event_sched_out(struct perf_event *event, | |||
| 1471 | 1569 | ||
| 1472 | if (!is_software_event(event)) | 1570 | if (!is_software_event(event)) |
| 1473 | cpuctx->active_oncpu--; | 1571 | cpuctx->active_oncpu--; |
| 1474 | ctx->nr_active--; | 1572 | if (!--ctx->nr_active) |
| 1573 | perf_event_ctx_deactivate(ctx); | ||
| 1475 | if (event->attr.freq && event->attr.sample_freq) | 1574 | if (event->attr.freq && event->attr.sample_freq) |
| 1476 | ctx->nr_freq--; | 1575 | ctx->nr_freq--; |
| 1477 | if (event->attr.exclusive || !cpuctx->active_oncpu) | 1576 | if (event->attr.exclusive || !cpuctx->active_oncpu) |
| @@ -1654,7 +1753,7 @@ int __perf_event_disable(void *info) | |||
| 1654 | * is the current context on this CPU and preemption is disabled, | 1753 | * is the current context on this CPU and preemption is disabled, |
| 1655 | * hence we can't get into perf_event_task_sched_out for this context. | 1754 | * hence we can't get into perf_event_task_sched_out for this context. |
| 1656 | */ | 1755 | */ |
| 1657 | void perf_event_disable(struct perf_event *event) | 1756 | static void _perf_event_disable(struct perf_event *event) |
| 1658 | { | 1757 | { |
| 1659 | struct perf_event_context *ctx = event->ctx; | 1758 | struct perf_event_context *ctx = event->ctx; |
| 1660 | struct task_struct *task = ctx->task; | 1759 | struct task_struct *task = ctx->task; |
| @@ -1695,6 +1794,19 @@ retry: | |||
| 1695 | } | 1794 | } |
| 1696 | raw_spin_unlock_irq(&ctx->lock); | 1795 | raw_spin_unlock_irq(&ctx->lock); |
| 1697 | } | 1796 | } |
| 1797 | |||
| 1798 | /* | ||
| 1799 | * Strictly speaking kernel users cannot create groups and therefore this | ||
| 1800 | * interface does not need the perf_event_ctx_lock() magic. | ||
| 1801 | */ | ||
| 1802 | void perf_event_disable(struct perf_event *event) | ||
| 1803 | { | ||
| 1804 | struct perf_event_context *ctx; | ||
| 1805 | |||
| 1806 | ctx = perf_event_ctx_lock(event); | ||
| 1807 | _perf_event_disable(event); | ||
| 1808 | perf_event_ctx_unlock(event, ctx); | ||
| 1809 | } | ||
| 1698 | EXPORT_SYMBOL_GPL(perf_event_disable); | 1810 | EXPORT_SYMBOL_GPL(perf_event_disable); |
| 1699 | 1811 | ||
| 1700 | static void perf_set_shadow_time(struct perf_event *event, | 1812 | static void perf_set_shadow_time(struct perf_event *event, |
| @@ -1782,7 +1894,8 @@ event_sched_in(struct perf_event *event, | |||
| 1782 | 1894 | ||
| 1783 | if (!is_software_event(event)) | 1895 | if (!is_software_event(event)) |
| 1784 | cpuctx->active_oncpu++; | 1896 | cpuctx->active_oncpu++; |
| 1785 | ctx->nr_active++; | 1897 | if (!ctx->nr_active++) |
| 1898 | perf_event_ctx_activate(ctx); | ||
| 1786 | if (event->attr.freq && event->attr.sample_freq) | 1899 | if (event->attr.freq && event->attr.sample_freq) |
| 1787 | ctx->nr_freq++; | 1900 | ctx->nr_freq++; |
| 1788 | 1901 | ||
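The two hunks above turn the plain nr_active counter updates into transition-triggered calls: perf_event_ctx_activate() fires only on the 0 -> 1 transition in event_sched_in() and perf_event_ctx_deactivate() only on the 1 -> 0 transition in event_sched_out(). A small sketch of that act-only-on-first/last counting idiom; the ctx struct, sched_in() and sched_out() below are invented stand-ins, not the kernel code:

/* Act only on the 0 -> 1 and 1 -> 0 transitions of a per-context count. */
#include <assert.h>
#include <stdio.h>

struct ctx {
	int nr_active;
	int on_active_list;	/* stands in for membership of the active list */
};

static void ctx_activate(struct ctx *c)   { c->on_active_list = 1; }
static void ctx_deactivate(struct ctx *c) { c->on_active_list = 0; }

static void sched_in(struct ctx *c)
{
	if (!c->nr_active++)		/* first active member: publish the context */
		ctx_activate(c);
}

static void sched_out(struct ctx *c)
{
	assert(c->nr_active > 0);
	if (!--c->nr_active)		/* last active member: withdraw it */
		ctx_deactivate(c);
}

int main(void)
{
	struct ctx c = { 0, 0 };

	sched_in(&c);
	sched_in(&c);
	sched_out(&c);
	printf("after one sched_out: on_list=%d\n", c.on_active_list);  /* still 1 */
	sched_out(&c);
	printf("after last sched_out: on_list=%d\n", c.on_active_list); /* now 0 */
	return 0;
}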
| @@ -2158,7 +2271,7 @@ unlock: | |||
| 2158 | * perf_event_for_each_child or perf_event_for_each as described | 2271 | * perf_event_for_each_child or perf_event_for_each as described |
| 2159 | * for perf_event_disable. | 2272 | * for perf_event_disable. |
| 2160 | */ | 2273 | */ |
| 2161 | void perf_event_enable(struct perf_event *event) | 2274 | static void _perf_event_enable(struct perf_event *event) |
| 2162 | { | 2275 | { |
| 2163 | struct perf_event_context *ctx = event->ctx; | 2276 | struct perf_event_context *ctx = event->ctx; |
| 2164 | struct task_struct *task = ctx->task; | 2277 | struct task_struct *task = ctx->task; |
| @@ -2214,9 +2327,21 @@ retry: | |||
| 2214 | out: | 2327 | out: |
| 2215 | raw_spin_unlock_irq(&ctx->lock); | 2328 | raw_spin_unlock_irq(&ctx->lock); |
| 2216 | } | 2329 | } |
| 2330 | |||
| 2331 | /* | ||
| 2332 | * See perf_event_disable(); | ||
| 2333 | */ | ||
| 2334 | void perf_event_enable(struct perf_event *event) | ||
| 2335 | { | ||
| 2336 | struct perf_event_context *ctx; | ||
| 2337 | |||
| 2338 | ctx = perf_event_ctx_lock(event); | ||
| 2339 | _perf_event_enable(event); | ||
| 2340 | perf_event_ctx_unlock(event, ctx); | ||
| 2341 | } | ||
| 2217 | EXPORT_SYMBOL_GPL(perf_event_enable); | 2342 | EXPORT_SYMBOL_GPL(perf_event_enable); |
| 2218 | 2343 | ||
| 2219 | int perf_event_refresh(struct perf_event *event, int refresh) | 2344 | static int _perf_event_refresh(struct perf_event *event, int refresh) |
| 2220 | { | 2345 | { |
| 2221 | /* | 2346 | /* |
| 2222 | * not supported on inherited events | 2347 | * not supported on inherited events |
| @@ -2225,10 +2350,25 @@ int perf_event_refresh(struct perf_event *event, int refresh) | |||
| 2225 | return -EINVAL; | 2350 | return -EINVAL; |
| 2226 | 2351 | ||
| 2227 | atomic_add(refresh, &event->event_limit); | 2352 | atomic_add(refresh, &event->event_limit); |
| 2228 | perf_event_enable(event); | 2353 | _perf_event_enable(event); |
| 2229 | 2354 | ||
| 2230 | return 0; | 2355 | return 0; |
| 2231 | } | 2356 | } |
| 2357 | |||
| 2358 | /* | ||
| 2359 | * See perf_event_disable() | ||
| 2360 | */ | ||
| 2361 | int perf_event_refresh(struct perf_event *event, int refresh) | ||
| 2362 | { | ||
| 2363 | struct perf_event_context *ctx; | ||
| 2364 | int ret; | ||
| 2365 | |||
| 2366 | ctx = perf_event_ctx_lock(event); | ||
| 2367 | ret = _perf_event_refresh(event, refresh); | ||
| 2368 | perf_event_ctx_unlock(event, ctx); | ||
| 2369 | |||
| 2370 | return ret; | ||
| 2371 | } | ||
| 2232 | EXPORT_SYMBOL_GPL(perf_event_refresh); | 2372 | EXPORT_SYMBOL_GPL(perf_event_refresh); |
| 2233 | 2373 | ||
| 2234 | static void ctx_sched_out(struct perf_event_context *ctx, | 2374 | static void ctx_sched_out(struct perf_event_context *ctx, |
| @@ -2612,12 +2752,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, | |||
| 2612 | 2752 | ||
| 2613 | perf_pmu_enable(ctx->pmu); | 2753 | perf_pmu_enable(ctx->pmu); |
| 2614 | perf_ctx_unlock(cpuctx, ctx); | 2754 | perf_ctx_unlock(cpuctx, ctx); |
| 2615 | |||
| 2616 | /* | ||
| 2617 | * Since these rotations are per-cpu, we need to ensure the | ||
| 2618 | * cpu-context we got scheduled on is actually rotating. | ||
| 2619 | */ | ||
| 2620 | perf_pmu_rotate_start(ctx->pmu); | ||
| 2621 | } | 2755 | } |
| 2622 | 2756 | ||
| 2623 | /* | 2757 | /* |
| @@ -2905,25 +3039,18 @@ static void rotate_ctx(struct perf_event_context *ctx) | |||
| 2905 | list_rotate_left(&ctx->flexible_groups); | 3039 | list_rotate_left(&ctx->flexible_groups); |
| 2906 | } | 3040 | } |
| 2907 | 3041 | ||
| 2908 | /* | ||
| 2909 | * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized | ||
| 2910 | * because they're strictly cpu affine and rotate_start is called with IRQs | ||
| 2911 | * disabled, while rotate_context is called from IRQ context. | ||
| 2912 | */ | ||
| 2913 | static int perf_rotate_context(struct perf_cpu_context *cpuctx) | 3042 | static int perf_rotate_context(struct perf_cpu_context *cpuctx) |
| 2914 | { | 3043 | { |
| 2915 | struct perf_event_context *ctx = NULL; | 3044 | struct perf_event_context *ctx = NULL; |
| 2916 | int rotate = 0, remove = 1; | 3045 | int rotate = 0; |
| 2917 | 3046 | ||
| 2918 | if (cpuctx->ctx.nr_events) { | 3047 | if (cpuctx->ctx.nr_events) { |
| 2919 | remove = 0; | ||
| 2920 | if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) | 3048 | if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) |
| 2921 | rotate = 1; | 3049 | rotate = 1; |
| 2922 | } | 3050 | } |
| 2923 | 3051 | ||
| 2924 | ctx = cpuctx->task_ctx; | 3052 | ctx = cpuctx->task_ctx; |
| 2925 | if (ctx && ctx->nr_events) { | 3053 | if (ctx && ctx->nr_events) { |
| 2926 | remove = 0; | ||
| 2927 | if (ctx->nr_events != ctx->nr_active) | 3054 | if (ctx->nr_events != ctx->nr_active) |
| 2928 | rotate = 1; | 3055 | rotate = 1; |
| 2929 | } | 3056 | } |
| @@ -2947,8 +3074,6 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx) | |||
| 2947 | perf_pmu_enable(cpuctx->ctx.pmu); | 3074 | perf_pmu_enable(cpuctx->ctx.pmu); |
| 2948 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); | 3075 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); |
| 2949 | done: | 3076 | done: |
| 2950 | if (remove) | ||
| 2951 | list_del_init(&cpuctx->rotation_list); | ||
| 2952 | 3077 | ||
| 2953 | return rotate; | 3078 | return rotate; |
| 2954 | } | 3079 | } |
| @@ -2966,9 +3091,8 @@ bool perf_event_can_stop_tick(void) | |||
| 2966 | 3091 | ||
| 2967 | void perf_event_task_tick(void) | 3092 | void perf_event_task_tick(void) |
| 2968 | { | 3093 | { |
| 2969 | struct list_head *head = this_cpu_ptr(&rotation_list); | 3094 | struct list_head *head = this_cpu_ptr(&active_ctx_list); |
| 2970 | struct perf_cpu_context *cpuctx, *tmp; | 3095 | struct perf_event_context *ctx, *tmp; |
| 2971 | struct perf_event_context *ctx; | ||
| 2972 | int throttled; | 3096 | int throttled; |
| 2973 | 3097 | ||
| 2974 | WARN_ON(!irqs_disabled()); | 3098 | WARN_ON(!irqs_disabled()); |
| @@ -2976,14 +3100,8 @@ void perf_event_task_tick(void) | |||
| 2976 | __this_cpu_inc(perf_throttled_seq); | 3100 | __this_cpu_inc(perf_throttled_seq); |
| 2977 | throttled = __this_cpu_xchg(perf_throttled_count, 0); | 3101 | throttled = __this_cpu_xchg(perf_throttled_count, 0); |
| 2978 | 3102 | ||
| 2979 | list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { | 3103 | list_for_each_entry_safe(ctx, tmp, head, active_ctx_list) |
| 2980 | ctx = &cpuctx->ctx; | ||
| 2981 | perf_adjust_freq_unthr_context(ctx, throttled); | 3104 | perf_adjust_freq_unthr_context(ctx, throttled); |
| 2982 | |||
| 2983 | ctx = cpuctx->task_ctx; | ||
| 2984 | if (ctx) | ||
| 2985 | perf_adjust_freq_unthr_context(ctx, throttled); | ||
| 2986 | } | ||
| 2987 | } | 3105 | } |
| 2988 | 3106 | ||
| 2989 | static int event_enable_on_exec(struct perf_event *event, | 3107 | static int event_enable_on_exec(struct perf_event *event, |
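perf_event_task_tick() now walks a per-CPU list of active contexts directly, using the _safe list iterator so an entry may drop off the list while it is being visited. A hedged userspace sketch of that removal-safe walk — the node struct and tick_one() are invented, and a hand-rolled singly linked list stands in for the kernel's list_head:

/* Removal-safe walk: save the next pointer before visiting the current node. */
#include <stdio.h>
#include <stdlib.h>

struct node {
	int id;
	int still_active;
	struct node *next;
};

static void tick_one(struct node *n, struct node **head)
{
	if (!n->still_active) {		/* node deactivates itself: unlink and free it */
		struct node **pp = head;
		while (*pp != n)
			pp = &(*pp)->next;
		*pp = n->next;
		free(n);
		return;
	}
	printf("ticking context %d\n", n->id);
}

int main(void)
{
	struct node *head = NULL;

	for (int i = 3; i >= 1; i--) {
		struct node *n = malloc(sizeof(*n));
		n->id = i;
		n->still_active = (i != 2);	/* node 2 will drop out mid-walk */
		n->next = head;
		head = n;
	}

	for (struct node *n = head, *tmp; n; n = tmp) {
		tmp = n->next;			/* saved before tick_one() may free n */
		tick_one(n, &head);
	}

	while (head) {				/* cleanup */
		struct node *n = head;
		head = head->next;
		free(n);
	}
	return 0;
}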
| @@ -3142,6 +3260,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx) | |||
| 3142 | { | 3260 | { |
| 3143 | raw_spin_lock_init(&ctx->lock); | 3261 | raw_spin_lock_init(&ctx->lock); |
| 3144 | mutex_init(&ctx->mutex); | 3262 | mutex_init(&ctx->mutex); |
| 3263 | INIT_LIST_HEAD(&ctx->active_ctx_list); | ||
| 3145 | INIT_LIST_HEAD(&ctx->pinned_groups); | 3264 | INIT_LIST_HEAD(&ctx->pinned_groups); |
| 3146 | INIT_LIST_HEAD(&ctx->flexible_groups); | 3265 | INIT_LIST_HEAD(&ctx->flexible_groups); |
| 3147 | INIT_LIST_HEAD(&ctx->event_list); | 3266 | INIT_LIST_HEAD(&ctx->event_list); |
| @@ -3421,7 +3540,16 @@ static void perf_remove_from_owner(struct perf_event *event) | |||
| 3421 | rcu_read_unlock(); | 3540 | rcu_read_unlock(); |
| 3422 | 3541 | ||
| 3423 | if (owner) { | 3542 | if (owner) { |
| 3424 | mutex_lock(&owner->perf_event_mutex); | 3543 | /* |
| 3544 | * If we're here through perf_event_exit_task() we're already | ||
| 3545 | * holding ctx->mutex which would be an inversion wrt. the | ||
| 3546 | * normal lock order. | ||
| 3547 | * | ||
| 3548 | * However, we can safely take this lock because it's the child | ||
| 3549 | * ctx->mutex. | ||
| 3550 | */ | ||
| 3551 | mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING); | ||
| 3552 | |||
| 3425 | /* | 3553 | /* |
| 3426 | * We have to re-check the event->owner field, if it is cleared | 3554 | * We have to re-check the event->owner field, if it is cleared |
| 3427 | * we raced with perf_event_exit_task(), acquiring the mutex | 3555 | * we raced with perf_event_exit_task(), acquiring the mutex |
| @@ -3440,7 +3568,7 @@ static void perf_remove_from_owner(struct perf_event *event) | |||
| 3440 | */ | 3568 | */ |
| 3441 | static void put_event(struct perf_event *event) | 3569 | static void put_event(struct perf_event *event) |
| 3442 | { | 3570 | { |
| 3443 | struct perf_event_context *ctx = event->ctx; | 3571 | struct perf_event_context *ctx; |
| 3444 | 3572 | ||
| 3445 | if (!atomic_long_dec_and_test(&event->refcount)) | 3573 | if (!atomic_long_dec_and_test(&event->refcount)) |
| 3446 | return; | 3574 | return; |
| @@ -3448,7 +3576,6 @@ static void put_event(struct perf_event *event) | |||
| 3448 | if (!is_kernel_event(event)) | 3576 | if (!is_kernel_event(event)) |
| 3449 | perf_remove_from_owner(event); | 3577 | perf_remove_from_owner(event); |
| 3450 | 3578 | ||
| 3451 | WARN_ON_ONCE(ctx->parent_ctx); | ||
| 3452 | /* | 3579 | /* |
| 3453 | * There are two ways this annotation is useful: | 3580 | * There are two ways this annotation is useful: |
| 3454 | * | 3581 | * |
| @@ -3461,7 +3588,8 @@ static void put_event(struct perf_event *event) | |||
| 3461 | * the last filedesc died, so there is no possibility | 3588 | * the last filedesc died, so there is no possibility |
| 3462 | * to trigger the AB-BA case. | 3589 | * to trigger the AB-BA case. |
| 3463 | */ | 3590 | */ |
| 3464 | mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); | 3591 | ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING); |
| 3592 | WARN_ON_ONCE(ctx->parent_ctx); | ||
| 3465 | perf_remove_from_context(event, true); | 3593 | perf_remove_from_context(event, true); |
| 3466 | mutex_unlock(&ctx->mutex); | 3594 | mutex_unlock(&ctx->mutex); |
| 3467 | 3595 | ||
| @@ -3547,12 +3675,13 @@ static int perf_event_read_group(struct perf_event *event, | |||
| 3547 | u64 read_format, char __user *buf) | 3675 | u64 read_format, char __user *buf) |
| 3548 | { | 3676 | { |
| 3549 | struct perf_event *leader = event->group_leader, *sub; | 3677 | struct perf_event *leader = event->group_leader, *sub; |
| 3550 | int n = 0, size = 0, ret = -EFAULT; | ||
| 3551 | struct perf_event_context *ctx = leader->ctx; | 3678 | struct perf_event_context *ctx = leader->ctx; |
| 3552 | u64 values[5]; | 3679 | int n = 0, size = 0, ret; |
| 3553 | u64 count, enabled, running; | 3680 | u64 count, enabled, running; |
| 3681 | u64 values[5]; | ||
| 3682 | |||
| 3683 | lockdep_assert_held(&ctx->mutex); | ||
| 3554 | 3684 | ||
| 3555 | mutex_lock(&ctx->mutex); | ||
| 3556 | count = perf_event_read_value(leader, &enabled, &running); | 3685 | count = perf_event_read_value(leader, &enabled, &running); |
| 3557 | 3686 | ||
| 3558 | values[n++] = 1 + leader->nr_siblings; | 3687 | values[n++] = 1 + leader->nr_siblings; |
| @@ -3567,7 +3696,7 @@ static int perf_event_read_group(struct perf_event *event, | |||
| 3567 | size = n * sizeof(u64); | 3696 | size = n * sizeof(u64); |
| 3568 | 3697 | ||
| 3569 | if (copy_to_user(buf, values, size)) | 3698 | if (copy_to_user(buf, values, size)) |
| 3570 | goto unlock; | 3699 | return -EFAULT; |
| 3571 | 3700 | ||
| 3572 | ret = size; | 3701 | ret = size; |
| 3573 | 3702 | ||
| @@ -3581,14 +3710,11 @@ static int perf_event_read_group(struct perf_event *event, | |||
| 3581 | size = n * sizeof(u64); | 3710 | size = n * sizeof(u64); |
| 3582 | 3711 | ||
| 3583 | if (copy_to_user(buf + ret, values, size)) { | 3712 | if (copy_to_user(buf + ret, values, size)) { |
| 3584 | ret = -EFAULT; | 3713 | return -EFAULT; |
| 3585 | goto unlock; | ||
| 3586 | } | 3714 | } |
| 3587 | 3715 | ||
| 3588 | ret += size; | 3716 | ret += size; |
| 3589 | } | 3717 | } |
| 3590 | unlock: | ||
| 3591 | mutex_unlock(&ctx->mutex); | ||
| 3592 | 3718 | ||
| 3593 | return ret; | 3719 | return ret; |
| 3594 | } | 3720 | } |
| @@ -3660,8 +3786,14 @@ static ssize_t | |||
| 3660 | perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) | 3786 | perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) |
| 3661 | { | 3787 | { |
| 3662 | struct perf_event *event = file->private_data; | 3788 | struct perf_event *event = file->private_data; |
| 3789 | struct perf_event_context *ctx; | ||
| 3790 | int ret; | ||
| 3663 | 3791 | ||
| 3664 | return perf_read_hw(event, buf, count); | 3792 | ctx = perf_event_ctx_lock(event); |
| 3793 | ret = perf_read_hw(event, buf, count); | ||
| 3794 | perf_event_ctx_unlock(event, ctx); | ||
| 3795 | |||
| 3796 | return ret; | ||
| 3665 | } | 3797 | } |
| 3666 | 3798 | ||
| 3667 | static unsigned int perf_poll(struct file *file, poll_table *wait) | 3799 | static unsigned int perf_poll(struct file *file, poll_table *wait) |
| @@ -3687,7 +3819,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) | |||
| 3687 | return events; | 3819 | return events; |
| 3688 | } | 3820 | } |
| 3689 | 3821 | ||
| 3690 | static void perf_event_reset(struct perf_event *event) | 3822 | static void _perf_event_reset(struct perf_event *event) |
| 3691 | { | 3823 | { |
| 3692 | (void)perf_event_read(event); | 3824 | (void)perf_event_read(event); |
| 3693 | local64_set(&event->count, 0); | 3825 | local64_set(&event->count, 0); |
| @@ -3706,6 +3838,7 @@ static void perf_event_for_each_child(struct perf_event *event, | |||
| 3706 | struct perf_event *child; | 3838 | struct perf_event *child; |
| 3707 | 3839 | ||
| 3708 | WARN_ON_ONCE(event->ctx->parent_ctx); | 3840 | WARN_ON_ONCE(event->ctx->parent_ctx); |
| 3841 | |||
| 3709 | mutex_lock(&event->child_mutex); | 3842 | mutex_lock(&event->child_mutex); |
| 3710 | func(event); | 3843 | func(event); |
| 3711 | list_for_each_entry(child, &event->child_list, child_list) | 3844 | list_for_each_entry(child, &event->child_list, child_list) |
| @@ -3719,14 +3852,13 @@ static void perf_event_for_each(struct perf_event *event, | |||
| 3719 | struct perf_event_context *ctx = event->ctx; | 3852 | struct perf_event_context *ctx = event->ctx; |
| 3720 | struct perf_event *sibling; | 3853 | struct perf_event *sibling; |
| 3721 | 3854 | ||
| 3722 | WARN_ON_ONCE(ctx->parent_ctx); | 3855 | lockdep_assert_held(&ctx->mutex); |
| 3723 | mutex_lock(&ctx->mutex); | 3856 | |
| 3724 | event = event->group_leader; | 3857 | event = event->group_leader; |
| 3725 | 3858 | ||
| 3726 | perf_event_for_each_child(event, func); | 3859 | perf_event_for_each_child(event, func); |
| 3727 | list_for_each_entry(sibling, &event->sibling_list, group_entry) | 3860 | list_for_each_entry(sibling, &event->sibling_list, group_entry) |
| 3728 | perf_event_for_each_child(sibling, func); | 3861 | perf_event_for_each_child(sibling, func); |
| 3729 | mutex_unlock(&ctx->mutex); | ||
| 3730 | } | 3862 | } |
| 3731 | 3863 | ||
| 3732 | static int perf_event_period(struct perf_event *event, u64 __user *arg) | 3864 | static int perf_event_period(struct perf_event *event, u64 __user *arg) |
| @@ -3796,25 +3928,24 @@ static int perf_event_set_output(struct perf_event *event, | |||
| 3796 | struct perf_event *output_event); | 3928 | struct perf_event *output_event); |
| 3797 | static int perf_event_set_filter(struct perf_event *event, void __user *arg); | 3929 | static int perf_event_set_filter(struct perf_event *event, void __user *arg); |
| 3798 | 3930 | ||
| 3799 | static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | 3931 | static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) |
| 3800 | { | 3932 | { |
| 3801 | struct perf_event *event = file->private_data; | ||
| 3802 | void (*func)(struct perf_event *); | 3933 | void (*func)(struct perf_event *); |
| 3803 | u32 flags = arg; | 3934 | u32 flags = arg; |
| 3804 | 3935 | ||
| 3805 | switch (cmd) { | 3936 | switch (cmd) { |
| 3806 | case PERF_EVENT_IOC_ENABLE: | 3937 | case PERF_EVENT_IOC_ENABLE: |
| 3807 | func = perf_event_enable; | 3938 | func = _perf_event_enable; |
| 3808 | break; | 3939 | break; |
| 3809 | case PERF_EVENT_IOC_DISABLE: | 3940 | case PERF_EVENT_IOC_DISABLE: |
| 3810 | func = perf_event_disable; | 3941 | func = _perf_event_disable; |
| 3811 | break; | 3942 | break; |
| 3812 | case PERF_EVENT_IOC_RESET: | 3943 | case PERF_EVENT_IOC_RESET: |
| 3813 | func = perf_event_reset; | 3944 | func = _perf_event_reset; |
| 3814 | break; | 3945 | break; |
| 3815 | 3946 | ||
| 3816 | case PERF_EVENT_IOC_REFRESH: | 3947 | case PERF_EVENT_IOC_REFRESH: |
| 3817 | return perf_event_refresh(event, arg); | 3948 | return _perf_event_refresh(event, arg); |
| 3818 | 3949 | ||
| 3819 | case PERF_EVENT_IOC_PERIOD: | 3950 | case PERF_EVENT_IOC_PERIOD: |
| 3820 | return perf_event_period(event, (u64 __user *)arg); | 3951 | return perf_event_period(event, (u64 __user *)arg); |
| @@ -3861,6 +3992,19 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | |||
| 3861 | return 0; | 3992 | return 0; |
| 3862 | } | 3993 | } |
| 3863 | 3994 | ||
| 3995 | static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | ||
| 3996 | { | ||
| 3997 | struct perf_event *event = file->private_data; | ||
| 3998 | struct perf_event_context *ctx; | ||
| 3999 | long ret; | ||
| 4000 | |||
| 4001 | ctx = perf_event_ctx_lock(event); | ||
| 4002 | ret = _perf_ioctl(event, cmd, arg); | ||
| 4003 | perf_event_ctx_unlock(event, ctx); | ||
| 4004 | |||
| 4005 | return ret; | ||
| 4006 | } | ||
| 4007 | |||
| 3864 | #ifdef CONFIG_COMPAT | 4008 | #ifdef CONFIG_COMPAT |
| 3865 | static long perf_compat_ioctl(struct file *file, unsigned int cmd, | 4009 | static long perf_compat_ioctl(struct file *file, unsigned int cmd, |
| 3866 | unsigned long arg) | 4010 | unsigned long arg) |
| @@ -3883,11 +4027,15 @@ static long perf_compat_ioctl(struct file *file, unsigned int cmd, | |||
| 3883 | 4027 | ||
| 3884 | int perf_event_task_enable(void) | 4028 | int perf_event_task_enable(void) |
| 3885 | { | 4029 | { |
| 4030 | struct perf_event_context *ctx; | ||
| 3886 | struct perf_event *event; | 4031 | struct perf_event *event; |
| 3887 | 4032 | ||
| 3888 | mutex_lock(¤t->perf_event_mutex); | 4033 | mutex_lock(¤t->perf_event_mutex); |
| 3889 | list_for_each_entry(event, ¤t->perf_event_list, owner_entry) | 4034 | list_for_each_entry(event, ¤t->perf_event_list, owner_entry) { |
| 3890 | perf_event_for_each_child(event, perf_event_enable); | 4035 | ctx = perf_event_ctx_lock(event); |
| 4036 | perf_event_for_each_child(event, _perf_event_enable); | ||
| 4037 | perf_event_ctx_unlock(event, ctx); | ||
| 4038 | } | ||
| 3891 | mutex_unlock(¤t->perf_event_mutex); | 4039 | mutex_unlock(¤t->perf_event_mutex); |
| 3892 | 4040 | ||
| 3893 | return 0; | 4041 | return 0; |
| @@ -3895,11 +4043,15 @@ int perf_event_task_enable(void) | |||
| 3895 | 4043 | ||
| 3896 | int perf_event_task_disable(void) | 4044 | int perf_event_task_disable(void) |
| 3897 | { | 4045 | { |
| 4046 | struct perf_event_context *ctx; | ||
| 3898 | struct perf_event *event; | 4047 | struct perf_event *event; |
| 3899 | 4048 | ||
| 3900 | mutex_lock(¤t->perf_event_mutex); | 4049 | mutex_lock(¤t->perf_event_mutex); |
| 3901 | list_for_each_entry(event, ¤t->perf_event_list, owner_entry) | 4050 | list_for_each_entry(event, ¤t->perf_event_list, owner_entry) { |
| 3902 | perf_event_for_each_child(event, perf_event_disable); | 4051 | ctx = perf_event_ctx_lock(event); |
| 4052 | perf_event_for_each_child(event, _perf_event_disable); | ||
| 4053 | perf_event_ctx_unlock(event, ctx); | ||
| 4054 | } | ||
| 3903 | mutex_unlock(¤t->perf_event_mutex); | 4055 | mutex_unlock(¤t->perf_event_mutex); |
| 3904 | 4056 | ||
| 3905 | return 0; | 4057 | return 0; |
| @@ -4461,18 +4613,14 @@ perf_output_sample_regs(struct perf_output_handle *handle, | |||
| 4461 | } | 4613 | } |
| 4462 | 4614 | ||
| 4463 | static void perf_sample_regs_user(struct perf_regs *regs_user, | 4615 | static void perf_sample_regs_user(struct perf_regs *regs_user, |
| 4464 | struct pt_regs *regs) | 4616 | struct pt_regs *regs, |
| 4617 | struct pt_regs *regs_user_copy) | ||
| 4465 | { | 4618 | { |
| 4466 | if (!user_mode(regs)) { | 4619 | if (user_mode(regs)) { |
| 4467 | if (current->mm) | 4620 | regs_user->abi = perf_reg_abi(current); |
| 4468 | regs = task_pt_regs(current); | ||
| 4469 | else | ||
| 4470 | regs = NULL; | ||
| 4471 | } | ||
| 4472 | |||
| 4473 | if (regs) { | ||
| 4474 | regs_user->abi = perf_reg_abi(current); | ||
| 4475 | regs_user->regs = regs; | 4621 | regs_user->regs = regs; |
| 4622 | } else if (current->mm) { | ||
| 4623 | perf_get_regs_user(regs_user, regs, regs_user_copy); | ||
| 4476 | } else { | 4624 | } else { |
| 4477 | regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; | 4625 | regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; |
| 4478 | regs_user->regs = NULL; | 4626 | regs_user->regs = NULL; |
| @@ -4951,7 +5099,8 @@ void perf_prepare_sample(struct perf_event_header *header, | |||
| 4951 | } | 5099 | } |
| 4952 | 5100 | ||
| 4953 | if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) | 5101 | if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) |
| 4954 | perf_sample_regs_user(&data->regs_user, regs); | 5102 | perf_sample_regs_user(&data->regs_user, regs, |
| 5103 | &data->regs_user_copy); | ||
| 4955 | 5104 | ||
| 4956 | if (sample_type & PERF_SAMPLE_REGS_USER) { | 5105 | if (sample_type & PERF_SAMPLE_REGS_USER) { |
| 4957 | /* regs dump ABI info */ | 5106 | /* regs dump ABI info */ |
| @@ -5892,6 +6041,8 @@ end: | |||
| 5892 | rcu_read_unlock(); | 6041 | rcu_read_unlock(); |
| 5893 | } | 6042 | } |
| 5894 | 6043 | ||
| 6044 | DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]); | ||
| 6045 | |||
| 5895 | int perf_swevent_get_recursion_context(void) | 6046 | int perf_swevent_get_recursion_context(void) |
| 5896 | { | 6047 | { |
| 5897 | struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); | 6048 | struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); |
| @@ -5907,21 +6058,30 @@ inline void perf_swevent_put_recursion_context(int rctx) | |||
| 5907 | put_recursion_context(swhash->recursion, rctx); | 6058 | put_recursion_context(swhash->recursion, rctx); |
| 5908 | } | 6059 | } |
| 5909 | 6060 | ||
| 5910 | void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) | 6061 | void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) |
| 5911 | { | 6062 | { |
| 5912 | struct perf_sample_data data; | 6063 | struct perf_sample_data data; |
| 5913 | int rctx; | ||
| 5914 | 6064 | ||
| 5915 | preempt_disable_notrace(); | 6065 | if (WARN_ON_ONCE(!regs)) |
| 5916 | rctx = perf_swevent_get_recursion_context(); | ||
| 5917 | if (rctx < 0) | ||
| 5918 | return; | 6066 | return; |
| 5919 | 6067 | ||
| 5920 | perf_sample_data_init(&data, addr, 0); | 6068 | perf_sample_data_init(&data, addr, 0); |
| 5921 | |||
| 5922 | do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); | 6069 | do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); |
| 6070 | } | ||
| 6071 | |||
| 6072 | void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) | ||
| 6073 | { | ||
| 6074 | int rctx; | ||
| 6075 | |||
| 6076 | preempt_disable_notrace(); | ||
| 6077 | rctx = perf_swevent_get_recursion_context(); | ||
| 6078 | if (unlikely(rctx < 0)) | ||
| 6079 | goto fail; | ||
| 6080 | |||
| 6081 | ___perf_sw_event(event_id, nr, regs, addr); | ||
| 5923 | 6082 | ||
| 5924 | perf_swevent_put_recursion_context(rctx); | 6083 | perf_swevent_put_recursion_context(rctx); |
| 6084 | fail: | ||
| 5925 | preempt_enable_notrace(); | 6085 | preempt_enable_notrace(); |
| 5926 | } | 6086 | } |
| 5927 | 6087 | ||
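The hunk above splits the software-event path so that ___perf_sw_event() does the work while __perf_sw_event() wraps it in get/put of the recursion context and bails out when recursion is detected. A simplified sketch of that recursion-guarded wrapper split, assuming a single per-thread flag instead of the kernel's per-CPU, per-context recursion counters; all names below are invented:

/* Outer entry point guards against recursion; inner helper does the work. */
#include <stdio.h>

static _Thread_local int in_event;	/* per-thread recursion flag */

static void ___record_event(int id)
{
	printf("recorded event %d\n", id);
	/* If this path itself raised an event, the outer guard would drop it. */
}

static void __record_event(int id)
{
	if (in_event)			/* already inside: drop instead of recursing */
		return;
	in_event = 1;
	___record_event(id);
	in_event = 0;
}

int main(void)
{
	__record_event(1);		/* recorded */
	in_event = 1;			/* simulate being mid-event */
	__record_event(2);		/* silently dropped */
	in_event = 0;
	___record_event(3);		/* caller manages the guard itself */
	return 0;
}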
| @@ -6779,12 +6939,10 @@ skip_type: | |||
| 6779 | __perf_event_init_context(&cpuctx->ctx); | 6939 | __perf_event_init_context(&cpuctx->ctx); |
| 6780 | lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); | 6940 | lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); |
| 6781 | lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); | 6941 | lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); |
| 6782 | cpuctx->ctx.type = cpu_context; | ||
| 6783 | cpuctx->ctx.pmu = pmu; | 6942 | cpuctx->ctx.pmu = pmu; |
| 6784 | 6943 | ||
| 6785 | __perf_cpu_hrtimer_init(cpuctx, cpu); | 6944 | __perf_cpu_hrtimer_init(cpuctx, cpu); |
| 6786 | 6945 | ||
| 6787 | INIT_LIST_HEAD(&cpuctx->rotation_list); | ||
| 6788 | cpuctx->unique_pmu = pmu; | 6946 | cpuctx->unique_pmu = pmu; |
| 6789 | } | 6947 | } |
| 6790 | 6948 | ||
| @@ -6857,6 +7015,20 @@ void perf_pmu_unregister(struct pmu *pmu) | |||
| 6857 | } | 7015 | } |
| 6858 | EXPORT_SYMBOL_GPL(perf_pmu_unregister); | 7016 | EXPORT_SYMBOL_GPL(perf_pmu_unregister); |
| 6859 | 7017 | ||
| 7018 | static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) | ||
| 7019 | { | ||
| 7020 | int ret; | ||
| 7021 | |||
| 7022 | if (!try_module_get(pmu->module)) | ||
| 7023 | return -ENODEV; | ||
| 7024 | event->pmu = pmu; | ||
| 7025 | ret = pmu->event_init(event); | ||
| 7026 | if (ret) | ||
| 7027 | module_put(pmu->module); | ||
| 7028 | |||
| 7029 | return ret; | ||
| 7030 | } | ||
| 7031 | |||
| 6860 | struct pmu *perf_init_event(struct perf_event *event) | 7032 | struct pmu *perf_init_event(struct perf_event *event) |
| 6861 | { | 7033 | { |
| 6862 | struct pmu *pmu = NULL; | 7034 | struct pmu *pmu = NULL; |
| @@ -6869,24 +7041,14 @@ struct pmu *perf_init_event(struct perf_event *event) | |||
| 6869 | pmu = idr_find(&pmu_idr, event->attr.type); | 7041 | pmu = idr_find(&pmu_idr, event->attr.type); |
| 6870 | rcu_read_unlock(); | 7042 | rcu_read_unlock(); |
| 6871 | if (pmu) { | 7043 | if (pmu) { |
| 6872 | if (!try_module_get(pmu->module)) { | 7044 | ret = perf_try_init_event(pmu, event); |
| 6873 | pmu = ERR_PTR(-ENODEV); | ||
| 6874 | goto unlock; | ||
| 6875 | } | ||
| 6876 | event->pmu = pmu; | ||
| 6877 | ret = pmu->event_init(event); | ||
| 6878 | if (ret) | 7045 | if (ret) |
| 6879 | pmu = ERR_PTR(ret); | 7046 | pmu = ERR_PTR(ret); |
| 6880 | goto unlock; | 7047 | goto unlock; |
| 6881 | } | 7048 | } |
| 6882 | 7049 | ||
| 6883 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 7050 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
| 6884 | if (!try_module_get(pmu->module)) { | 7051 | ret = perf_try_init_event(pmu, event); |
| 6885 | pmu = ERR_PTR(-ENODEV); | ||
| 6886 | goto unlock; | ||
| 6887 | } | ||
| 6888 | event->pmu = pmu; | ||
| 6889 | ret = pmu->event_init(event); | ||
| 6890 | if (!ret) | 7052 | if (!ret) |
| 6891 | goto unlock; | 7053 | goto unlock; |
| 6892 | 7054 | ||
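perf_try_init_event() factors out the "take a module reference, try the PMU's event_init(), drop the reference again on failure" sequence that both call sites in perf_init_event() previously open-coded. A rough userspace analogue of that try-get/init/put-on-failure helper, with an invented refcounted provider in place of a PMU module:

/* Grab a reference, attempt initialisation, release the reference on failure. */
#include <stdio.h>

struct provider {
	int refcount;
	int (*init)(void);	/* 0 on success, negative errno-style on failure */
};

static int try_get(struct provider *p) { p->refcount++; return 1; }
static void put(struct provider *p)    { p->refcount--; }

static int provider_try_init(struct provider *p)
{
	int ret;

	if (!try_get(p))
		return -1;		/* provider is going away */
	ret = p->init();
	if (ret)
		put(p);			/* failure: do not keep the reference */
	return ret;
}

static int ok_init(void)  { return 0; }
static int bad_init(void) { return -22; }

int main(void)
{
	struct provider good = { 0, ok_init }, bad = { 0, bad_init };

	printf("good: ret=%d ref=%d\n", provider_try_init(&good), good.refcount); /* 0, 1 */
	printf("bad:  ret=%d ref=%d\n", provider_try_init(&bad), bad.refcount);   /* -22, 0 */
	return 0;
}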
| @@ -7250,6 +7412,15 @@ out: | |||
| 7250 | return ret; | 7412 | return ret; |
| 7251 | } | 7413 | } |
| 7252 | 7414 | ||
| 7415 | static void mutex_lock_double(struct mutex *a, struct mutex *b) | ||
| 7416 | { | ||
| 7417 | if (b < a) | ||
| 7418 | swap(a, b); | ||
| 7419 | |||
| 7420 | mutex_lock(a); | ||
| 7421 | mutex_lock_nested(b, SINGLE_DEPTH_NESTING); | ||
| 7422 | } | ||
| 7423 | |||
| 7253 | /** | 7424 | /** |
| 7254 | * sys_perf_event_open - open a performance event, associate it to a task/cpu | 7425 | * sys_perf_event_open - open a performance event, associate it to a task/cpu |
| 7255 | * | 7426 | * |
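mutex_lock_double() above avoids an AB-BA deadlock by always acquiring the two context mutexes in a fixed (address) order, taking the second one with a nested lockdep class. A userspace sketch of the same ordering trick with pthread mutexes; the lockdep annotation has no userspace equivalent, so only the ordering is shown:

/* Lock two mutexes in a global (address) order so no pair of callers deadlocks. */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static void mutex_lock_double(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if ((uintptr_t)b < (uintptr_t)a) {	/* impose a total order on the pair */
		pthread_mutex_t *t = a;
		a = b;
		b = t;
	}
	pthread_mutex_lock(a);
	pthread_mutex_lock(b);
}

static void mutex_unlock_double(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_unlock(a);	/* unlock order does not matter */
	pthread_mutex_unlock(b);
}

static pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

int main(void)
{
	/* Both call sites are safe, whichever order the arguments arrive in. */
	mutex_lock_double(&m1, &m2);
	mutex_unlock_double(&m1, &m2);

	mutex_lock_double(&m2, &m1);
	mutex_unlock_double(&m2, &m1);

	puts("no deadlock");
	return 0;
}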
| @@ -7265,7 +7436,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 7265 | struct perf_event *group_leader = NULL, *output_event = NULL; | 7436 | struct perf_event *group_leader = NULL, *output_event = NULL; |
| 7266 | struct perf_event *event, *sibling; | 7437 | struct perf_event *event, *sibling; |
| 7267 | struct perf_event_attr attr; | 7438 | struct perf_event_attr attr; |
| 7268 | struct perf_event_context *ctx; | 7439 | struct perf_event_context *ctx, *uninitialized_var(gctx); |
| 7269 | struct file *event_file = NULL; | 7440 | struct file *event_file = NULL; |
| 7270 | struct fd group = {NULL, 0}; | 7441 | struct fd group = {NULL, 0}; |
| 7271 | struct task_struct *task = NULL; | 7442 | struct task_struct *task = NULL; |
| @@ -7423,7 +7594,19 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 7423 | * task or CPU context: | 7594 | * task or CPU context: |
| 7424 | */ | 7595 | */ |
| 7425 | if (move_group) { | 7596 | if (move_group) { |
| 7426 | if (group_leader->ctx->type != ctx->type) | 7597 | /* |
| 7598 | * Make sure we're both on the same task, or both | ||
| 7599 | * per-cpu events. | ||
| 7600 | */ | ||
| 7601 | if (group_leader->ctx->task != ctx->task) | ||
| 7602 | goto err_context; | ||
| 7603 | |||
| 7604 | /* | ||
| 7605 | * Make sure we're both events for the same CPU; | ||
| 7606 | * grouping events for different CPUs is broken, since | ||
| 7607 | * you can never concurrently schedule them anyhow. | ||
| 7608 | */ | ||
| 7609 | if (group_leader->cpu != event->cpu) | ||
| 7427 | goto err_context; | 7610 | goto err_context; |
| 7428 | } else { | 7611 | } else { |
| 7429 | if (group_leader->ctx != ctx) | 7612 | if (group_leader->ctx != ctx) |
| @@ -7451,43 +7634,68 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 7451 | } | 7634 | } |
| 7452 | 7635 | ||
| 7453 | if (move_group) { | 7636 | if (move_group) { |
| 7454 | struct perf_event_context *gctx = group_leader->ctx; | 7637 | gctx = group_leader->ctx; |
| 7455 | |||
| 7456 | mutex_lock(&gctx->mutex); | ||
| 7457 | perf_remove_from_context(group_leader, false); | ||
| 7458 | 7638 | ||
| 7459 | /* | 7639 | /* |
| 7460 | * Removing from the context ends up with disabled | 7640 | * See perf_event_ctx_lock() for comments on the details |
| 7461 | * event. What we want here is event in the initial | 7641 | * of swizzling perf_event::ctx. |
| 7462 | * startup state, ready to be add into new context. | ||
| 7463 | */ | 7642 | */ |
| 7464 | perf_event__state_init(group_leader); | 7643 | mutex_lock_double(&gctx->mutex, &ctx->mutex); |
| 7644 | |||
| 7645 | perf_remove_from_context(group_leader, false); | ||
| 7646 | |||
| 7465 | list_for_each_entry(sibling, &group_leader->sibling_list, | 7647 | list_for_each_entry(sibling, &group_leader->sibling_list, |
| 7466 | group_entry) { | 7648 | group_entry) { |
| 7467 | perf_remove_from_context(sibling, false); | 7649 | perf_remove_from_context(sibling, false); |
| 7468 | perf_event__state_init(sibling); | ||
| 7469 | put_ctx(gctx); | 7650 | put_ctx(gctx); |
| 7470 | } | 7651 | } |
| 7471 | mutex_unlock(&gctx->mutex); | 7652 | } else { |
| 7472 | put_ctx(gctx); | 7653 | mutex_lock(&ctx->mutex); |
| 7473 | } | 7654 | } |
| 7474 | 7655 | ||
| 7475 | WARN_ON_ONCE(ctx->parent_ctx); | 7656 | WARN_ON_ONCE(ctx->parent_ctx); |
| 7476 | mutex_lock(&ctx->mutex); | ||
| 7477 | 7657 | ||
| 7478 | if (move_group) { | 7658 | if (move_group) { |
| 7659 | /* | ||
| 7660 | * Wait for everybody to stop referencing the events through | ||
| 7661 | * the old lists, before installing it on new lists. | ||
| 7662 | */ | ||
| 7479 | synchronize_rcu(); | 7663 | synchronize_rcu(); |
| 7480 | perf_install_in_context(ctx, group_leader, group_leader->cpu); | 7664 | |
| 7481 | get_ctx(ctx); | 7665 | /* |
| 7666 | * Install the group siblings before the group leader. | ||
| 7667 | * | ||
| 7669 | * Because a group leader will try to install the entire group | ||
| 7670 | * (through the sibling list, which is still intact), we can | ||
| 7670 | * end up with siblings installed in the wrong context. | ||
| 7671 | * | ||
| 7672 | * By installing siblings first, those installs are NO-OPs because they're not | ||
| 7673 | * reachable through the group lists. | ||
| 7674 | */ | ||
| 7482 | list_for_each_entry(sibling, &group_leader->sibling_list, | 7675 | list_for_each_entry(sibling, &group_leader->sibling_list, |
| 7483 | group_entry) { | 7676 | group_entry) { |
| 7677 | perf_event__state_init(sibling); | ||
| 7484 | perf_install_in_context(ctx, sibling, sibling->cpu); | 7678 | perf_install_in_context(ctx, sibling, sibling->cpu); |
| 7485 | get_ctx(ctx); | 7679 | get_ctx(ctx); |
| 7486 | } | 7680 | } |
| 7681 | |||
| 7682 | /* | ||
| 7683 | * Removing from the context ends up with a disabled | ||
| 7684 | * event. What we want here is the event in its initial | ||
| 7685 | * startup state, ready to be added into a new context. | ||
| 7686 | */ | ||
| 7687 | perf_event__state_init(group_leader); | ||
| 7688 | perf_install_in_context(ctx, group_leader, group_leader->cpu); | ||
| 7689 | get_ctx(ctx); | ||
| 7487 | } | 7690 | } |
| 7488 | 7691 | ||
| 7489 | perf_install_in_context(ctx, event, event->cpu); | 7692 | perf_install_in_context(ctx, event, event->cpu); |
| 7490 | perf_unpin_context(ctx); | 7693 | perf_unpin_context(ctx); |
| 7694 | |||
| 7695 | if (move_group) { | ||
| 7696 | mutex_unlock(&gctx->mutex); | ||
| 7697 | put_ctx(gctx); | ||
| 7698 | } | ||
| 7491 | mutex_unlock(&ctx->mutex); | 7699 | mutex_unlock(&ctx->mutex); |
| 7492 | 7700 | ||
| 7493 | put_online_cpus(); | 7701 | put_online_cpus(); |
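The move_group path above now holds both context mutexes, removes the whole group, and installs the siblings before the group leader, so that whatever group-wide work the leader's installation triggers only ever sees members that already live in the destination context. A toy sketch of that install-siblings-first ordering — the ctx/event structs, install() and its assertion are invented and far simpler than the perf code:

/* Move a leader and its siblings between containers, members first. */
#include <assert.h>
#include <stdio.h>

#define GROUP_SIZE 3

struct ctx { const char *name; };

struct event {
	struct ctx *ctx;
	struct event *leader;	/* points to itself for the leader */
};

/* Installing a leader performs a group-wide pass over its members. */
static void install(struct ctx *dst, struct event *ev, struct event group[])
{
	ev->ctx = dst;
	if (ev->leader == ev) {
		for (int i = 0; i < GROUP_SIZE; i++)
			if (group[i].leader == ev)
				assert(group[i].ctx == dst);	/* wrong order trips this */
	}
}

int main(void)
{
	struct ctx src = { "src" }, dst = { "dst" };
	struct event group[GROUP_SIZE];

	for (int i = 0; i < GROUP_SIZE; i++) {
		group[i].ctx = &src;
		group[i].leader = &group[0];
	}

	/* Siblings first, leader last: the group-wide pass sees a consistent view. */
	for (int i = GROUP_SIZE - 1; i > 0; i--)
		install(&dst, &group[i], group);
	install(&dst, &group[0], group);

	puts("group moved without tripping the assertion");
	return 0;
}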
| @@ -7595,7 +7803,11 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) | |||
| 7595 | src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; | 7803 | src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; |
| 7596 | dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; | 7804 | dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; |
| 7597 | 7805 | ||
| 7598 | mutex_lock(&src_ctx->mutex); | 7806 | /* |
| 7807 | * See perf_event_ctx_lock() for comments on the details | ||
| 7808 | * of swizzling perf_event::ctx. | ||
| 7809 | */ | ||
| 7810 | mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); | ||
| 7599 | list_for_each_entry_safe(event, tmp, &src_ctx->event_list, | 7811 | list_for_each_entry_safe(event, tmp, &src_ctx->event_list, |
| 7600 | event_entry) { | 7812 | event_entry) { |
| 7601 | perf_remove_from_context(event, false); | 7813 | perf_remove_from_context(event, false); |
| @@ -7603,11 +7815,36 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) | |||
| 7603 | put_ctx(src_ctx); | 7815 | put_ctx(src_ctx); |
| 7604 | list_add(&event->migrate_entry, &events); | 7816 | list_add(&event->migrate_entry, &events); |
| 7605 | } | 7817 | } |
| 7606 | mutex_unlock(&src_ctx->mutex); | ||
| 7607 | 7818 | ||
| 7819 | /* | ||
| 7820 | * Wait for the events to quiesce before re-instating them. | ||
| 7821 | */ | ||
| 7608 | synchronize_rcu(); | 7822 | synchronize_rcu(); |
| 7609 | 7823 | ||
| 7610 | mutex_lock(&dst_ctx->mutex); | 7824 | /* |
| 7825 | * Re-instate events in 2 passes. | ||
| 7826 | * | ||
| 7827 | * Skip over group leaders and only install siblings on this first | ||
| 7828 | * pass, siblings will not get enabled without a leader, however a | ||
| 7829 | * leader will enable its siblings, even if those are still on the old | ||
| 7830 | * context. | ||
| 7831 | */ | ||
| 7832 | list_for_each_entry_safe(event, tmp, &events, migrate_entry) { | ||
| 7833 | if (event->group_leader == event) | ||
| 7834 | continue; | ||
| 7835 | |||
| 7836 | list_del(&event->migrate_entry); | ||
| 7837 | if (event->state >= PERF_EVENT_STATE_OFF) | ||
| 7838 | event->state = PERF_EVENT_STATE_INACTIVE; | ||
| 7839 | account_event_cpu(event, dst_cpu); | ||
| 7840 | perf_install_in_context(dst_ctx, event, dst_cpu); | ||
| 7841 | get_ctx(dst_ctx); | ||
| 7842 | } | ||
| 7843 | |||
| 7844 | /* | ||
| 7845 | * Once all the siblings are setup properly, install the group leaders | ||
| 7846 | * to make it go. | ||
| 7847 | */ | ||
| 7611 | list_for_each_entry_safe(event, tmp, &events, migrate_entry) { | 7848 | list_for_each_entry_safe(event, tmp, &events, migrate_entry) { |
| 7612 | list_del(&event->migrate_entry); | 7849 | list_del(&event->migrate_entry); |
| 7613 | if (event->state >= PERF_EVENT_STATE_OFF) | 7850 | if (event->state >= PERF_EVENT_STATE_OFF) |
| @@ -7617,6 +7854,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) | |||
| 7617 | get_ctx(dst_ctx); | 7854 | get_ctx(dst_ctx); |
| 7618 | } | 7855 | } |
| 7619 | mutex_unlock(&dst_ctx->mutex); | 7856 | mutex_unlock(&dst_ctx->mutex); |
| 7857 | mutex_unlock(&src_ctx->mutex); | ||
| 7620 | } | 7858 | } |
| 7621 | EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); | 7859 | EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); |
| 7622 | 7860 | ||
| @@ -7803,14 +8041,19 @@ static void perf_free_event(struct perf_event *event, | |||
| 7803 | 8041 | ||
| 7804 | put_event(parent); | 8042 | put_event(parent); |
| 7805 | 8043 | ||
| 8044 | raw_spin_lock_irq(&ctx->lock); | ||
| 7806 | perf_group_detach(event); | 8045 | perf_group_detach(event); |
| 7807 | list_del_event(event, ctx); | 8046 | list_del_event(event, ctx); |
| 8047 | raw_spin_unlock_irq(&ctx->lock); | ||
| 7808 | free_event(event); | 8048 | free_event(event); |
| 7809 | } | 8049 | } |
| 7810 | 8050 | ||
| 7811 | /* | 8051 | /* |
| 7812 | * free an unexposed, unused context as created by inheritance by | 8052 | * Free an unexposed, unused context as created by inheritance by |
| 7813 | * perf_event_init_task below, used by fork() in case of fail. | 8053 | * perf_event_init_task below, used by fork() in case of fail. |
| 8054 | * | ||
| 8055 | * Not all locks are strictly required, but take them anyway to be nice and | ||
| 8056 | * help out with the lockdep assertions. | ||
| 7814 | */ | 8057 | */ |
| 7815 | void perf_event_free_task(struct task_struct *task) | 8058 | void perf_event_free_task(struct task_struct *task) |
| 7816 | { | 8059 | { |
| @@ -8129,7 +8372,7 @@ static void __init perf_event_init_all_cpus(void) | |||
| 8129 | for_each_possible_cpu(cpu) { | 8372 | for_each_possible_cpu(cpu) { |
| 8130 | swhash = &per_cpu(swevent_htable, cpu); | 8373 | swhash = &per_cpu(swevent_htable, cpu); |
| 8131 | mutex_init(&swhash->hlist_mutex); | 8374 | mutex_init(&swhash->hlist_mutex); |
| 8132 | INIT_LIST_HEAD(&per_cpu(rotation_list, cpu)); | 8375 | INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); |
| 8133 | } | 8376 | } |
| 8134 | } | 8377 | } |
| 8135 | 8378 | ||
| @@ -8150,22 +8393,11 @@ static void perf_event_init_cpu(int cpu) | |||
| 8150 | } | 8393 | } |
| 8151 | 8394 | ||
| 8152 | #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC | 8395 | #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC |
| 8153 | static void perf_pmu_rotate_stop(struct pmu *pmu) | ||
| 8154 | { | ||
| 8155 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
| 8156 | |||
| 8157 | WARN_ON(!irqs_disabled()); | ||
| 8158 | |||
| 8159 | list_del_init(&cpuctx->rotation_list); | ||
| 8160 | } | ||
| 8161 | |||
| 8162 | static void __perf_event_exit_context(void *__info) | 8396 | static void __perf_event_exit_context(void *__info) |
| 8163 | { | 8397 | { |
| 8164 | struct remove_event re = { .detach_group = true }; | 8398 | struct remove_event re = { .detach_group = true }; |
| 8165 | struct perf_event_context *ctx = __info; | 8399 | struct perf_event_context *ctx = __info; |
| 8166 | 8400 | ||
| 8167 | perf_pmu_rotate_stop(ctx->pmu); | ||
| 8168 | |||
| 8169 | rcu_read_lock(); | 8401 | rcu_read_lock(); |
| 8170 | list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) | 8402 | list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) |
| 8171 | __perf_remove_from_context(&re); | 8403 | __perf_remove_from_context(&re); |
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 146a5792b1d2..eadb95ce7aac 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c | |||
| @@ -13,12 +13,13 @@ | |||
| 13 | #include <linux/vmalloc.h> | 13 | #include <linux/vmalloc.h> |
| 14 | #include <linux/slab.h> | 14 | #include <linux/slab.h> |
| 15 | #include <linux/circ_buf.h> | 15 | #include <linux/circ_buf.h> |
| 16 | #include <linux/poll.h> | ||
| 16 | 17 | ||
| 17 | #include "internal.h" | 18 | #include "internal.h" |
| 18 | 19 | ||
| 19 | static void perf_output_wakeup(struct perf_output_handle *handle) | 20 | static void perf_output_wakeup(struct perf_output_handle *handle) |
| 20 | { | 21 | { |
| 21 | atomic_set(&handle->rb->poll, POLL_IN); | 22 | atomic_set(&handle->rb->poll, POLLIN); |
| 22 | 23 | ||
| 23 | handle->event->pending_wakeup = 1; | 24 | handle->event->pending_wakeup = 1; |
| 24 | irq_work_queue(&handle->event->pending); | 25 | irq_work_queue(&handle->event->pending); |
diff --git a/kernel/exit.c b/kernel/exit.c index 1ea4369890a3..6806c55475ee 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -1287,9 +1287,15 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) | |||
| 1287 | static int wait_consider_task(struct wait_opts *wo, int ptrace, | 1287 | static int wait_consider_task(struct wait_opts *wo, int ptrace, |
| 1288 | struct task_struct *p) | 1288 | struct task_struct *p) |
| 1289 | { | 1289 | { |
| 1290 | /* | ||
| 1291 | * We can race with wait_task_zombie() from another thread. | ||
| 1292 | * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition | ||
| 1293 | * can't confuse the checks below. | ||
| 1294 | */ | ||
| 1295 | int exit_state = ACCESS_ONCE(p->exit_state); | ||
| 1290 | int ret; | 1296 | int ret; |
| 1291 | 1297 | ||
| 1292 | if (unlikely(p->exit_state == EXIT_DEAD)) | 1298 | if (unlikely(exit_state == EXIT_DEAD)) |
| 1293 | return 0; | 1299 | return 0; |
| 1294 | 1300 | ||
| 1295 | ret = eligible_child(wo, p); | 1301 | ret = eligible_child(wo, p); |
| @@ -1310,7 +1316,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, | |||
| 1310 | return 0; | 1316 | return 0; |
| 1311 | } | 1317 | } |
| 1312 | 1318 | ||
| 1313 | if (unlikely(p->exit_state == EXIT_TRACE)) { | 1319 | if (unlikely(exit_state == EXIT_TRACE)) { |
| 1314 | /* | 1320 | /* |
| 1315 | * ptrace == 0 means we are the natural parent. In this case | 1321 | * ptrace == 0 means we are the natural parent. In this case |
| 1316 | * we should clear notask_error, debugger will notify us. | 1322 | * we should clear notask_error, debugger will notify us. |
| @@ -1337,7 +1343,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, | |||
| 1337 | } | 1343 | } |
| 1338 | 1344 | ||
| 1339 | /* slay zombie? */ | 1345 | /* slay zombie? */ |
| 1340 | if (p->exit_state == EXIT_ZOMBIE) { | 1346 | if (exit_state == EXIT_ZOMBIE) { |
| 1341 | /* we don't reap group leaders with subthreads */ | 1347 | /* we don't reap group leaders with subthreads */ |
| 1342 | if (!delay_group_leader(p)) { | 1348 | if (!delay_group_leader(p)) { |
| 1343 | /* | 1349 | /* |
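The kernel/exit.c change snapshots p->exit_state once with ACCESS_ONCE() and performs every subsequent check against that local copy, so a concurrent EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition cannot make the checks disagree with each other. A minimal sketch of that read-once idiom using a C11 atomic load in place of ACCESS_ONCE(); the states and consider() function are invented:

/* Snapshot a racy field once and test the snapshot, not the shared variable. */
#include <stdatomic.h>
#include <stdio.h>

enum { STATE_RUNNING, STATE_ZOMBIE, STATE_DEAD };

static _Atomic int exit_state = STATE_ZOMBIE;

static int consider(void)
{
	int state = atomic_load(&exit_state);	/* one snapshot, used throughout */

	/* Re-reading exit_state in each branch could see two different values. */
	if (state == STATE_DEAD)
		return 0;			/* nothing to do */
	if (state == STATE_ZOMBIE)
		return 1;			/* reap it */
	return -1;				/* still running */
}

int main(void)
{
	printf("decision: %d\n", consider());
	return 0;
}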
diff --git a/kernel/futex.c b/kernel/futex.c index 63678b573d61..4eeb63de7e54 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
| @@ -2258,7 +2258,7 @@ static long futex_wait_restart(struct restart_block *restart) | |||
| 2258 | * if there are waiters then it will block, it does PI, etc. (Due to | 2258 | * if there are waiters then it will block, it does PI, etc. (Due to |
| 2259 | * races the kernel might see a 0 value of the futex too.) | 2259 | * races the kernel might see a 0 value of the futex too.) |
| 2260 | */ | 2260 | */ |
| 2261 | static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect, | 2261 | static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, |
| 2262 | ktime_t *time, int trylock) | 2262 | ktime_t *time, int trylock) |
| 2263 | { | 2263 | { |
| 2264 | struct hrtimer_sleeper timeout, *to = NULL; | 2264 | struct hrtimer_sleeper timeout, *to = NULL; |
| @@ -2953,11 +2953,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, | |||
| 2953 | case FUTEX_WAKE_OP: | 2953 | case FUTEX_WAKE_OP: |
| 2954 | return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); | 2954 | return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); |
| 2955 | case FUTEX_LOCK_PI: | 2955 | case FUTEX_LOCK_PI: |
| 2956 | return futex_lock_pi(uaddr, flags, val, timeout, 0); | 2956 | return futex_lock_pi(uaddr, flags, timeout, 0); |
| 2957 | case FUTEX_UNLOCK_PI: | 2957 | case FUTEX_UNLOCK_PI: |
| 2958 | return futex_unlock_pi(uaddr, flags); | 2958 | return futex_unlock_pi(uaddr, flags); |
| 2959 | case FUTEX_TRYLOCK_PI: | 2959 | case FUTEX_TRYLOCK_PI: |
| 2960 | return futex_lock_pi(uaddr, flags, 0, timeout, 1); | 2960 | return futex_lock_pi(uaddr, flags, NULL, 1); |
| 2961 | case FUTEX_WAIT_REQUEUE_PI: | 2961 | case FUTEX_WAIT_REQUEUE_PI: |
| 2962 | val3 = FUTEX_BITSET_MATCH_ANY; | 2962 | val3 = FUTEX_BITSET_MATCH_ANY; |
| 2963 | return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, | 2963 | return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 06f58309fed2..ee619929cf90 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
| @@ -127,7 +127,7 @@ static void *alloc_insn_page(void) | |||
| 127 | 127 | ||
| 128 | static void free_insn_page(void *page) | 128 | static void free_insn_page(void *page) |
| 129 | { | 129 | { |
| 130 | module_free(NULL, page); | 130 | module_memfree(page); |
| 131 | } | 131 | } |
| 132 | 132 | ||
| 133 | struct kprobe_insn_cache kprobe_insn_slots = { | 133 | struct kprobe_insn_cache kprobe_insn_slots = { |
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 8541bfdfd232..4ca8eb151975 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | 1 | ||
| 2 | obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o | 2 | obj-y += mutex.o semaphore.o rwsem.o |
| 3 | 3 | ||
| 4 | ifdef CONFIG_FUNCTION_TRACER | 4 | ifdef CONFIG_FUNCTION_TRACER |
| 5 | CFLAGS_REMOVE_lockdep.o = -pg | 5 | CFLAGS_REMOVE_lockdep.o = -pg |
| @@ -14,6 +14,7 @@ ifeq ($(CONFIG_PROC_FS),y) | |||
| 14 | obj-$(CONFIG_LOCKDEP) += lockdep_proc.o | 14 | obj-$(CONFIG_LOCKDEP) += lockdep_proc.o |
| 15 | endif | 15 | endif |
| 16 | obj-$(CONFIG_SMP) += spinlock.o | 16 | obj-$(CONFIG_SMP) += spinlock.o |
| 17 | obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o | ||
| 17 | obj-$(CONFIG_SMP) += lglock.o | 18 | obj-$(CONFIG_SMP) += lglock.o |
| 18 | obj-$(CONFIG_PROVE_LOCKING) += spinlock.o | 19 | obj-$(CONFIG_PROVE_LOCKING) += spinlock.o |
| 19 | obj-$(CONFIG_RT_MUTEXES) += rtmutex.o | 20 | obj-$(CONFIG_RT_MUTEXES) += rtmutex.o |
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 4d60986fcbee..d1fe2ba5bac9 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h | |||
| @@ -108,20 +108,4 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) | |||
| 108 | arch_mcs_spin_unlock_contended(&next->locked); | 108 | arch_mcs_spin_unlock_contended(&next->locked); |
| 109 | } | 109 | } |
| 110 | 110 | ||
| 111 | /* | ||
| 112 | * Cancellable version of the MCS lock above. | ||
| 113 | * | ||
| 114 | * Intended for adaptive spinning of sleeping locks: | ||
| 115 | * mutex_lock()/rwsem_down_{read,write}() etc. | ||
| 116 | */ | ||
| 117 | |||
| 118 | struct optimistic_spin_node { | ||
| 119 | struct optimistic_spin_node *next, *prev; | ||
| 120 | int locked; /* 1 if lock acquired */ | ||
| 121 | int cpu; /* encoded CPU # value */ | ||
| 122 | }; | ||
| 123 | |||
| 124 | extern bool osq_lock(struct optimistic_spin_queue *lock); | ||
| 125 | extern void osq_unlock(struct optimistic_spin_queue *lock); | ||
| 126 | |||
| 127 | #endif /* __LINUX_MCS_SPINLOCK_H */ | 111 | #endif /* __LINUX_MCS_SPINLOCK_H */ |
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 5cf6731b98e9..3ef3736002d8 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c | |||
| @@ -80,13 +80,13 @@ void debug_mutex_unlock(struct mutex *lock) | |||
| 80 | DEBUG_LOCKS_WARN_ON(lock->owner != current); | 80 | DEBUG_LOCKS_WARN_ON(lock->owner != current); |
| 81 | 81 | ||
| 82 | DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); | 82 | DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); |
| 83 | mutex_clear_owner(lock); | ||
| 84 | } | 83 | } |
| 85 | 84 | ||
| 86 | /* | 85 | /* |
| 87 | * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug | 86 | * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug |
| 88 | * mutexes so that we can do it here after we've verified state. | 87 | * mutexes so that we can do it here after we've verified state. |
| 89 | */ | 88 | */ |
| 89 | mutex_clear_owner(lock); | ||
| 90 | atomic_set(&lock->count, 1); | 90 | atomic_set(&lock->count, 1); |
| 91 | } | 91 | } |
| 92 | 92 | ||
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 454195194d4a..94674e5919cb 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c | |||
| @@ -81,7 +81,7 @@ __visible void __sched __mutex_lock_slowpath(atomic_t *lock_count); | |||
| 81 | * The mutex must later on be released by the same task that | 81 | * The mutex must later on be released by the same task that |
| 82 | * acquired it. Recursive locking is not allowed. The task | 82 | * acquired it. Recursive locking is not allowed. The task |
| 83 | * may not exit without first unlocking the mutex. Also, kernel | 83 | * may not exit without first unlocking the mutex. Also, kernel |
| 84 | * memory where the mutex resides mutex must not be freed with | 84 | * memory where the mutex resides must not be freed with |
| 85 | * the mutex still locked. The mutex must first be initialized | 85 | * the mutex still locked. The mutex must first be initialized |
| 86 | * (or statically defined) before it can be locked. memset()-ing | 86 | * (or statically defined) before it can be locked. memset()-ing |
| 87 | * the mutex to 0 is not allowed. | 87 | * the mutex to 0 is not allowed. |
| @@ -147,7 +147,7 @@ static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, | |||
| 147 | } | 147 | } |
| 148 | 148 | ||
| 149 | /* | 149 | /* |
| 150 | * after acquiring lock with fastpath or when we lost out in contested | 150 | * After acquiring lock with fastpath or when we lost out in contested |
| 151 | * slowpath, set ctx and wake up any waiters so they can recheck. | 151 | * slowpath, set ctx and wake up any waiters so they can recheck. |
| 152 | * | 152 | * |
| 153 | * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, | 153 | * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, |
| @@ -191,19 +191,32 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, | |||
| 191 | spin_unlock_mutex(&lock->base.wait_lock, flags); | 191 | spin_unlock_mutex(&lock->base.wait_lock, flags); |
| 192 | } | 192 | } |
| 193 | 193 | ||
| 194 | |||
| 195 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | ||
| 196 | /* | 194 | /* |
| 197 | * In order to avoid a stampede of mutex spinners from acquiring the mutex | 195 | * After acquiring lock in the slowpath set ctx and wake up any |
| 198 | * more or less simultaneously, the spinners need to acquire a MCS lock | 196 | * waiters so they can recheck. |
| 199 | * first before spinning on the owner field. | ||
| 200 | * | 197 | * |
| 198 | * Callers must hold the mutex wait_lock. | ||
| 201 | */ | 199 | */ |
| 200 | static __always_inline void | ||
| 201 | ww_mutex_set_context_slowpath(struct ww_mutex *lock, | ||
| 202 | struct ww_acquire_ctx *ctx) | ||
| 203 | { | ||
| 204 | struct mutex_waiter *cur; | ||
| 202 | 205 | ||
| 203 | /* | 206 | ww_mutex_lock_acquired(lock, ctx); |
| 204 | * Mutex spinning code migrated from kernel/sched/core.c | 207 | lock->ctx = ctx; |
| 205 | */ | 208 | |
| 209 | /* | ||
| 210 | * Give any possible sleeping processes the chance to wake up, | ||
| 211 | * so they can recheck if they have to back off. | ||
| 212 | */ | ||
| 213 | list_for_each_entry(cur, &lock->base.wait_list, list) { | ||
| 214 | debug_mutex_wake_waiter(&lock->base, cur); | ||
| 215 | wake_up_process(cur->task); | ||
| 216 | } | ||
| 217 | } | ||
| 206 | 218 | ||
| 219 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | ||
| 207 | static inline bool owner_running(struct mutex *lock, struct task_struct *owner) | 220 | static inline bool owner_running(struct mutex *lock, struct task_struct *owner) |
| 208 | { | 221 | { |
| 209 | if (lock->owner != owner) | 222 | if (lock->owner != owner) |
| @@ -307,6 +320,11 @@ static bool mutex_optimistic_spin(struct mutex *lock, | |||
| 307 | if (!mutex_can_spin_on_owner(lock)) | 320 | if (!mutex_can_spin_on_owner(lock)) |
| 308 | goto done; | 321 | goto done; |
| 309 | 322 | ||
| 323 | /* | ||
| 324 | * In order to avoid a stampede of mutex spinners trying to | ||
| 325 | * acquire the mutex all at once, the spinners need to take an ||
| 326 | * MCS (queued) lock first before spinning on the owner field. | ||
| 327 | */ | ||
| 310 | if (!osq_lock(&lock->osq)) | 328 | if (!osq_lock(&lock->osq)) |
| 311 | goto done; | 329 | goto done; |
| 312 | 330 | ||
| @@ -469,7 +487,7 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock) | |||
| 469 | EXPORT_SYMBOL(ww_mutex_unlock); | 487 | EXPORT_SYMBOL(ww_mutex_unlock); |
| 470 | 488 | ||
| 471 | static inline int __sched | 489 | static inline int __sched |
| 472 | __mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) | 490 | __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) |
| 473 | { | 491 | { |
| 474 | struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); | 492 | struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); |
| 475 | struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); | 493 | struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); |
| @@ -557,7 +575,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
| 557 | } | 575 | } |
| 558 | 576 | ||
| 559 | if (use_ww_ctx && ww_ctx->acquired > 0) { | 577 | if (use_ww_ctx && ww_ctx->acquired > 0) { |
| 560 | ret = __mutex_lock_check_stamp(lock, ww_ctx); | 578 | ret = __ww_mutex_lock_check_stamp(lock, ww_ctx); |
| 561 | if (ret) | 579 | if (ret) |
| 562 | goto err; | 580 | goto err; |
| 563 | } | 581 | } |
| @@ -569,6 +587,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
| 569 | schedule_preempt_disabled(); | 587 | schedule_preempt_disabled(); |
| 570 | spin_lock_mutex(&lock->wait_lock, flags); | 588 | spin_lock_mutex(&lock->wait_lock, flags); |
| 571 | } | 589 | } |
| 590 | __set_task_state(task, TASK_RUNNING); | ||
| 591 | |||
| 572 | mutex_remove_waiter(lock, &waiter, current_thread_info()); | 592 | mutex_remove_waiter(lock, &waiter, current_thread_info()); |
| 573 | /* set it to 0 if there are no waiters left: */ | 593 | /* set it to 0 if there are no waiters left: */ |
| 574 | if (likely(list_empty(&lock->wait_list))) | 594 | if (likely(list_empty(&lock->wait_list))) |
| @@ -582,23 +602,7 @@ skip_wait: | |||
| 582 | 602 | ||
| 583 | if (use_ww_ctx) { | 603 | if (use_ww_ctx) { |
| 584 | struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); | 604 | struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); |
| 585 | struct mutex_waiter *cur; | 605 | ww_mutex_set_context_slowpath(ww, ww_ctx); |
| 586 | |||
| 587 | /* | ||
| 588 | * This branch gets optimized out for the common case, | ||
| 589 | * and is only important for ww_mutex_lock. | ||
| 590 | */ | ||
| 591 | ww_mutex_lock_acquired(ww, ww_ctx); | ||
| 592 | ww->ctx = ww_ctx; | ||
| 593 | |||
| 594 | /* | ||
| 595 | * Give any possible sleeping processes the chance to wake up, | ||
| 596 | * so they can recheck if they have to back off. | ||
| 597 | */ | ||
| 598 | list_for_each_entry(cur, &lock->wait_list, list) { | ||
| 599 | debug_mutex_wake_waiter(lock, cur); | ||
| 600 | wake_up_process(cur->task); | ||
| 601 | } | ||
| 602 | } | 606 | } |
| 603 | 607 | ||
| 604 | spin_unlock_mutex(&lock->wait_lock, flags); | 608 | spin_unlock_mutex(&lock->wait_lock, flags); |
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/osq_lock.c index 9887a905a762..c112d00341b0 100644 --- a/kernel/locking/mcs_spinlock.c +++ b/kernel/locking/osq_lock.c | |||
| @@ -1,8 +1,6 @@ | |||
| 1 | #include <linux/percpu.h> | 1 | #include <linux/percpu.h> |
| 2 | #include <linux/sched.h> | 2 | #include <linux/sched.h> |
| 3 | #include "mcs_spinlock.h" | 3 | #include <linux/osq_lock.h> |
| 4 | |||
| 5 | #ifdef CONFIG_SMP | ||
| 6 | 4 | ||
| 7 | /* | 5 | /* |
| 8 | * An MCS like lock especially tailored for optimistic spinning for sleeping | 6 | * An MCS like lock especially tailored for optimistic spinning for sleeping |
| @@ -111,7 +109,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) | |||
| 111 | * cmpxchg in an attempt to undo our queueing. | 109 | * cmpxchg in an attempt to undo our queueing. |
| 112 | */ | 110 | */ |
| 113 | 111 | ||
| 114 | while (!smp_load_acquire(&node->locked)) { | 112 | while (!ACCESS_ONCE(node->locked)) { |
| 115 | /* | 113 | /* |
| 116 | * If we need to reschedule bail... so we can block. | 114 | * If we need to reschedule bail... so we can block. |
| 117 | */ | 115 | */ |
| @@ -203,6 +201,3 @@ void osq_unlock(struct optimistic_spin_queue *lock) | |||
| 203 | if (next) | 201 | if (next) |
| 204 | ACCESS_ONCE(next->locked) = 1; | 202 | ACCESS_ONCE(next->locked) = 1; |
| 205 | } | 203 | } |
| 206 | |||
| 207 | #endif | ||
| 208 | |||
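Besides moving the optimistic-spin queue into its own osq_lock.c (and dropping the #ifdef CONFIG_SMP wrapper, presumably because the file is now only built where it is needed), the hunk above relaxes the polling load: the per-iteration smp_load_acquire() becomes a plain ACCESS_ONCE() read, relying on ordering established outside the spin loop. A rough userspace C11 analogue of that shape, with wait_for_flag() as an illustrative name and a single acquire fence standing in for the ordering the kernel gets elsewhere:

    #include <stdatomic.h>
    #include <stdbool.h>

    /* Spin with relaxed loads, then order later accesses once, instead of
     * paying for acquire semantics on every iteration of the loop. */
    void wait_for_flag(atomic_bool *flag)
    {
            while (!atomic_load_explicit(flag, memory_order_relaxed))
                    ;   /* a real lock would also check need_resched() here */
            atomic_thread_fence(memory_order_acquire);
    }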
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 7c98873a3077..3059bc2f022d 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c | |||
| @@ -1130,6 +1130,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
| 1130 | set_current_state(state); | 1130 | set_current_state(state); |
| 1131 | } | 1131 | } |
| 1132 | 1132 | ||
| 1133 | __set_current_state(TASK_RUNNING); | ||
| 1133 | return ret; | 1134 | return ret; |
| 1134 | } | 1135 | } |
| 1135 | 1136 | ||
| @@ -1188,10 +1189,9 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
| 1188 | ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); | 1189 | ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); |
| 1189 | 1190 | ||
| 1190 | if (likely(!ret)) | 1191 | if (likely(!ret)) |
| 1192 | /* sleep on the mutex */ | ||
| 1191 | ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); | 1193 | ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); |
| 1192 | 1194 | ||
| 1193 | set_current_state(TASK_RUNNING); | ||
| 1194 | |||
| 1195 | if (unlikely(ret)) { | 1195 | if (unlikely(ret)) { |
| 1196 | remove_waiter(lock, &waiter); | 1196 | remove_waiter(lock, &waiter); |
| 1197 | rt_mutex_handle_deadlock(ret, chwalk, &waiter); | 1197 | rt_mutex_handle_deadlock(ret, chwalk, &waiter); |
| @@ -1626,10 +1626,9 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, | |||
| 1626 | 1626 | ||
| 1627 | set_current_state(TASK_INTERRUPTIBLE); | 1627 | set_current_state(TASK_INTERRUPTIBLE); |
| 1628 | 1628 | ||
| 1629 | /* sleep on the mutex */ | ||
| 1629 | ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); | 1630 | ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); |
| 1630 | 1631 | ||
| 1631 | set_current_state(TASK_RUNNING); | ||
| 1632 | |||
| 1633 | if (unlikely(ret)) | 1632 | if (unlikely(ret)) |
| 1634 | remove_waiter(lock, waiter); | 1633 | remove_waiter(lock, waiter); |
| 1635 | 1634 | ||
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 2c93571162cb..2555ae15ec14 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c | |||
| @@ -154,7 +154,7 @@ void __sched __down_read(struct rw_semaphore *sem) | |||
| 154 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | 154 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); |
| 155 | } | 155 | } |
| 156 | 156 | ||
| 157 | tsk->state = TASK_RUNNING; | 157 | __set_task_state(tsk, TASK_RUNNING); |
| 158 | out: | 158 | out: |
| 159 | ; | 159 | ; |
| 160 | } | 160 | } |
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 7628c3fc37ca..2f7cc4076f50 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c | |||
| @@ -242,8 +242,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) | |||
| 242 | schedule(); | 242 | schedule(); |
| 243 | } | 243 | } |
| 244 | 244 | ||
| 245 | tsk->state = TASK_RUNNING; | 245 | __set_task_state(tsk, TASK_RUNNING); |
| 246 | |||
| 247 | return sem; | 246 | return sem; |
| 248 | } | 247 | } |
| 249 | EXPORT_SYMBOL(rwsem_down_read_failed); | 248 | EXPORT_SYMBOL(rwsem_down_read_failed); |
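The two rwsem hunks above make the same substitution: the bare tsk->state = TASK_RUNNING store becomes __set_task_state(tsk, TASK_RUNNING). The double-underscore helper is still an unordered store, but it goes through the common accessor, so any state-change bookkeeping the helpers carry (the nested-sleep debugging work from around this time is presumably the motivation) applies here too. Side by side, as a fragment rather than the exact rwsem code:

    /* open-coded: a bare store that state-tracking debug code never sees */
    tsk->state = TASK_RUNNING;

    /* helper form: same store, no implied barrier, but routed through the
     * accessor so debugging hooks can record the transition */
    __set_task_state(tsk, TASK_RUNNING);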
diff --git a/kernel/module.c b/kernel/module.c index 3965511ae133..d856e96a3cce 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -772,9 +772,18 @@ static int try_stop_module(struct module *mod, int flags, int *forced) | |||
| 772 | return 0; | 772 | return 0; |
| 773 | } | 773 | } |
| 774 | 774 | ||
| 775 | unsigned long module_refcount(struct module *mod) | 775 | /** |
| 776 | * module_refcount - return the refcount or -1 if unloading | ||
| 777 | * | ||
| 778 | * @mod: the module we're checking | ||
| 779 | * | ||
| 780 | * Returns: | ||
| 781 | * -1 if the module is in the process of unloading | ||
| 782 | * otherwise the number of references in the kernel to the module | ||
| 783 | */ | ||
| 784 | int module_refcount(struct module *mod) | ||
| 776 | { | 785 | { |
| 777 | return (unsigned long)atomic_read(&mod->refcnt) - MODULE_REF_BASE; | 786 | return atomic_read(&mod->refcnt) - MODULE_REF_BASE; |
| 778 | } | 787 | } |
| 779 | EXPORT_SYMBOL(module_refcount); | 788 | EXPORT_SYMBOL(module_refcount); |
| 780 | 789 | ||
| @@ -856,7 +865,7 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) | |||
| 856 | struct module_use *use; | 865 | struct module_use *use; |
| 857 | int printed_something = 0; | 866 | int printed_something = 0; |
| 858 | 867 | ||
| 859 | seq_printf(m, " %lu ", module_refcount(mod)); | 868 | seq_printf(m, " %i ", module_refcount(mod)); |
| 860 | 869 | ||
| 861 | /* | 870 | /* |
| 862 | * Always include a trailing , so userspace can differentiate | 871 | * Always include a trailing , so userspace can differentiate |
| @@ -908,7 +917,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr); | |||
| 908 | static ssize_t show_refcnt(struct module_attribute *mattr, | 917 | static ssize_t show_refcnt(struct module_attribute *mattr, |
| 909 | struct module_kobject *mk, char *buffer) | 918 | struct module_kobject *mk, char *buffer) |
| 910 | { | 919 | { |
| 911 | return sprintf(buffer, "%lu\n", module_refcount(mk->mod)); | 920 | return sprintf(buffer, "%i\n", module_refcount(mk->mod)); |
| 912 | } | 921 | } |
| 913 | 922 | ||
| 914 | static struct module_attribute modinfo_refcnt = | 923 | static struct module_attribute modinfo_refcnt = |
| @@ -1795,7 +1804,7 @@ static void unset_module_core_ro_nx(struct module *mod) { } | |||
| 1795 | static void unset_module_init_ro_nx(struct module *mod) { } | 1804 | static void unset_module_init_ro_nx(struct module *mod) { } |
| 1796 | #endif | 1805 | #endif |
| 1797 | 1806 | ||
| 1798 | void __weak module_free(struct module *mod, void *module_region) | 1807 | void __weak module_memfree(void *module_region) |
| 1799 | { | 1808 | { |
| 1800 | vfree(module_region); | 1809 | vfree(module_region); |
| 1801 | } | 1810 | } |
| @@ -1804,6 +1813,10 @@ void __weak module_arch_cleanup(struct module *mod) | |||
| 1804 | { | 1813 | { |
| 1805 | } | 1814 | } |
| 1806 | 1815 | ||
| 1816 | void __weak module_arch_freeing_init(struct module *mod) | ||
| 1817 | { | ||
| 1818 | } | ||
| 1819 | |||
| 1807 | /* Free a module, remove from lists, etc. */ | 1820 | /* Free a module, remove from lists, etc. */ |
| 1808 | static void free_module(struct module *mod) | 1821 | static void free_module(struct module *mod) |
| 1809 | { | 1822 | { |
| @@ -1841,7 +1854,8 @@ static void free_module(struct module *mod) | |||
| 1841 | 1854 | ||
| 1842 | /* This may be NULL, but that's OK */ | 1855 | /* This may be NULL, but that's OK */ |
| 1843 | unset_module_init_ro_nx(mod); | 1856 | unset_module_init_ro_nx(mod); |
| 1844 | module_free(mod, mod->module_init); | 1857 | module_arch_freeing_init(mod); |
| 1858 | module_memfree(mod->module_init); | ||
| 1845 | kfree(mod->args); | 1859 | kfree(mod->args); |
| 1846 | percpu_modfree(mod); | 1860 | percpu_modfree(mod); |
| 1847 | 1861 | ||
| @@ -1850,7 +1864,7 @@ static void free_module(struct module *mod) | |||
| 1850 | 1864 | ||
| 1851 | /* Finally, free the core (containing the module structure) */ | 1865 | /* Finally, free the core (containing the module structure) */ |
| 1852 | unset_module_core_ro_nx(mod); | 1866 | unset_module_core_ro_nx(mod); |
| 1853 | module_free(mod, mod->module_core); | 1867 | module_memfree(mod->module_core); |
| 1854 | 1868 | ||
| 1855 | #ifdef CONFIG_MPU | 1869 | #ifdef CONFIG_MPU |
| 1856 | update_protections(current->mm); | 1870 | update_protections(current->mm); |
| @@ -2785,7 +2799,7 @@ static int move_module(struct module *mod, struct load_info *info) | |||
| 2785 | */ | 2799 | */ |
| 2786 | kmemleak_ignore(ptr); | 2800 | kmemleak_ignore(ptr); |
| 2787 | if (!ptr) { | 2801 | if (!ptr) { |
| 2788 | module_free(mod, mod->module_core); | 2802 | module_memfree(mod->module_core); |
| 2789 | return -ENOMEM; | 2803 | return -ENOMEM; |
| 2790 | } | 2804 | } |
| 2791 | memset(ptr, 0, mod->init_size); | 2805 | memset(ptr, 0, mod->init_size); |
| @@ -2930,8 +2944,9 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) | |||
| 2930 | static void module_deallocate(struct module *mod, struct load_info *info) | 2944 | static void module_deallocate(struct module *mod, struct load_info *info) |
| 2931 | { | 2945 | { |
| 2932 | percpu_modfree(mod); | 2946 | percpu_modfree(mod); |
| 2933 | module_free(mod, mod->module_init); | 2947 | module_arch_freeing_init(mod); |
| 2934 | module_free(mod, mod->module_core); | 2948 | module_memfree(mod->module_init); |
| 2949 | module_memfree(mod->module_core); | ||
| 2935 | } | 2950 | } |
| 2936 | 2951 | ||
| 2937 | int __weak module_finalize(const Elf_Ehdr *hdr, | 2952 | int __weak module_finalize(const Elf_Ehdr *hdr, |
| @@ -2983,10 +2998,31 @@ static void do_mod_ctors(struct module *mod) | |||
| 2983 | #endif | 2998 | #endif |
| 2984 | } | 2999 | } |
| 2985 | 3000 | ||
| 3001 | /* For freeing module_init on success, in case kallsyms traversing */ | ||
| 3002 | struct mod_initfree { | ||
| 3003 | struct rcu_head rcu; | ||
| 3004 | void *module_init; | ||
| 3005 | }; | ||
| 3006 | |||
| 3007 | static void do_free_init(struct rcu_head *head) | ||
| 3008 | { | ||
| 3009 | struct mod_initfree *m = container_of(head, struct mod_initfree, rcu); | ||
| 3010 | module_memfree(m->module_init); | ||
| 3011 | kfree(m); | ||
| 3012 | } | ||
| 3013 | |||
| 2986 | /* This is where the real work happens */ | 3014 | /* This is where the real work happens */ |
| 2987 | static int do_init_module(struct module *mod) | 3015 | static int do_init_module(struct module *mod) |
| 2988 | { | 3016 | { |
| 2989 | int ret = 0; | 3017 | int ret = 0; |
| 3018 | struct mod_initfree *freeinit; | ||
| 3019 | |||
| 3020 | freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL); | ||
| 3021 | if (!freeinit) { | ||
| 3022 | ret = -ENOMEM; | ||
| 3023 | goto fail; | ||
| 3024 | } | ||
| 3025 | freeinit->module_init = mod->module_init; | ||
| 2990 | 3026 | ||
| 2991 | /* | 3027 | /* |
| 2992 | * We want to find out whether @mod uses async during init. Clear | 3028 | * We want to find out whether @mod uses async during init. Clear |
| @@ -2999,18 +3035,7 @@ static int do_init_module(struct module *mod) | |||
| 2999 | if (mod->init != NULL) | 3035 | if (mod->init != NULL) |
| 3000 | ret = do_one_initcall(mod->init); | 3036 | ret = do_one_initcall(mod->init); |
| 3001 | if (ret < 0) { | 3037 | if (ret < 0) { |
| 3002 | /* | 3038 | goto fail_free_freeinit; |
| 3003 | * Init routine failed: abort. Try to protect us from | ||
| 3004 | * buggy refcounters. | ||
| 3005 | */ | ||
| 3006 | mod->state = MODULE_STATE_GOING; | ||
| 3007 | synchronize_sched(); | ||
| 3008 | module_put(mod); | ||
| 3009 | blocking_notifier_call_chain(&module_notify_list, | ||
| 3010 | MODULE_STATE_GOING, mod); | ||
| 3011 | free_module(mod); | ||
| 3012 | wake_up_all(&module_wq); | ||
| 3013 | return ret; | ||
| 3014 | } | 3039 | } |
| 3015 | if (ret > 0) { | 3040 | if (ret > 0) { |
| 3016 | pr_warn("%s: '%s'->init suspiciously returned %d, it should " | 3041 | pr_warn("%s: '%s'->init suspiciously returned %d, it should " |
| @@ -3055,15 +3080,35 @@ static int do_init_module(struct module *mod) | |||
| 3055 | mod->strtab = mod->core_strtab; | 3080 | mod->strtab = mod->core_strtab; |
| 3056 | #endif | 3081 | #endif |
| 3057 | unset_module_init_ro_nx(mod); | 3082 | unset_module_init_ro_nx(mod); |
| 3058 | module_free(mod, mod->module_init); | 3083 | module_arch_freeing_init(mod); |
| 3059 | mod->module_init = NULL; | 3084 | mod->module_init = NULL; |
| 3060 | mod->init_size = 0; | 3085 | mod->init_size = 0; |
| 3061 | mod->init_ro_size = 0; | 3086 | mod->init_ro_size = 0; |
| 3062 | mod->init_text_size = 0; | 3087 | mod->init_text_size = 0; |
| 3088 | /* | ||
| 3089 | * We want to free module_init, but be aware that kallsyms may be | ||
| 3090 | * walking this with preempt disabled. In all the failure paths, | ||
| 3091 | * we call synchronize_rcu/synchronize_sched, but we don't want | ||
| 3092 | * to slow down the success path, so use actual RCU here. | ||
| 3093 | */ | ||
| 3094 | call_rcu(&freeinit->rcu, do_free_init); | ||
| 3063 | mutex_unlock(&module_mutex); | 3095 | mutex_unlock(&module_mutex); |
| 3064 | wake_up_all(&module_wq); | 3096 | wake_up_all(&module_wq); |
| 3065 | 3097 | ||
| 3066 | return 0; | 3098 | return 0; |
| 3099 | |||
| 3100 | fail_free_freeinit: | ||
| 3101 | kfree(freeinit); | ||
| 3102 | fail: | ||
| 3103 | /* Try to protect us from buggy refcounters. */ | ||
| 3104 | mod->state = MODULE_STATE_GOING; | ||
| 3105 | synchronize_sched(); | ||
| 3106 | module_put(mod); | ||
| 3107 | blocking_notifier_call_chain(&module_notify_list, | ||
| 3108 | MODULE_STATE_GOING, mod); | ||
| 3109 | free_module(mod); | ||
| 3110 | wake_up_all(&module_wq); | ||
| 3111 | return ret; | ||
| 3067 | } | 3112 | } |
| 3068 | 3113 | ||
| 3069 | static int may_init_module(void) | 3114 | static int may_init_module(void) |
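Two themes run through the kernel/module.c hunks: the module_free()/module_arch_freeing_init() bookkeeping is reorganised around the new module_memfree(), with the init section on the success path now released through call_rcu() so kallsyms walkers running with preemption disabled cannot trip over it; and module_refcount() changes type from unsigned long to int, returning -1 while the module is unloading. Callers therefore want the signed form, roughly like this fragment (the pr_info() lines are illustrative, not taken from the patch):

    int refs = module_refcount(mod);

    if (refs < 0)
            pr_info("%s: module is being unloaded\n", mod->name);
    else
            pr_info("%s: %i reference(s) held\n", mod->name, refs);

This is also why the seq_printf()/sprintf() format strings in the same file switch from %lu to %i.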
diff --git a/kernel/notifier.c b/kernel/notifier.c index 4803da6eab62..ae9fc7cc360e 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c | |||
| @@ -402,6 +402,7 @@ int raw_notifier_call_chain(struct raw_notifier_head *nh, | |||
| 402 | } | 402 | } |
| 403 | EXPORT_SYMBOL_GPL(raw_notifier_call_chain); | 403 | EXPORT_SYMBOL_GPL(raw_notifier_call_chain); |
| 404 | 404 | ||
| 405 | #ifdef CONFIG_SRCU | ||
| 405 | /* | 406 | /* |
| 406 | * SRCU notifier chain routines. Registration and unregistration | 407 | * SRCU notifier chain routines. Registration and unregistration |
| 407 | * use a mutex, and call_chain is synchronized by SRCU (no locks). | 408 | * use a mutex, and call_chain is synchronized by SRCU (no locks). |
| @@ -528,6 +529,8 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh) | |||
| 528 | } | 529 | } |
| 529 | EXPORT_SYMBOL_GPL(srcu_init_notifier_head); | 530 | EXPORT_SYMBOL_GPL(srcu_init_notifier_head); |
| 530 | 531 | ||
| 532 | #endif /* CONFIG_SRCU */ | ||
| 533 | |||
| 531 | static ATOMIC_NOTIFIER_HEAD(die_chain); | 534 | static ATOMIC_NOTIFIER_HEAD(die_chain); |
| 532 | 535 | ||
| 533 | int notrace notify_die(enum die_val val, const char *str, | 536 | int notrace notify_die(enum die_val val, const char *str, |
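With the hunk above, the SRCU flavour of notifier chains only exists when CONFIG_SRCU is set; atomic, blocking and raw chains are unaffected. Code that registers an SRCU notifier chain must therefore either live behind the same config symbol or make sure SRCU gets enabled, as the PM_OPP Kconfig hunk further down does with "select SRCU". A minimal guarded user, with my_notifier_chain, my_chain_init() and my_register() as illustrative names:

    #include <linux/notifier.h>

    #ifdef CONFIG_SRCU
    static struct srcu_notifier_head my_notifier_chain;

    int my_chain_init(void)
    {
            /* SRCU notifier heads need runtime initialization */
            srcu_init_notifier_head(&my_notifier_chain);
            return 0;
    }

    int my_register(struct notifier_block *nb)
    {
            return srcu_notifier_chain_register(&my_notifier_chain, nb);
    }
    #endif /* CONFIG_SRCU */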
diff --git a/kernel/params.c b/kernel/params.c index 0af9b2c4e56c..728e05b167de 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
| @@ -642,12 +642,15 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, | |||
| 642 | mk->mp->grp.attrs = new_attrs; | 642 | mk->mp->grp.attrs = new_attrs; |
| 643 | 643 | ||
| 644 | /* Tack new one on the end. */ | 644 | /* Tack new one on the end. */ |
| 645 | memset(&mk->mp->attrs[mk->mp->num], 0, sizeof(mk->mp->attrs[0])); | ||
| 645 | sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr); | 646 | sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr); |
| 646 | mk->mp->attrs[mk->mp->num].param = kp; | 647 | mk->mp->attrs[mk->mp->num].param = kp; |
| 647 | mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show; | 648 | mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show; |
| 648 | /* Do not allow runtime DAC changes to make param writable. */ | 649 | /* Do not allow runtime DAC changes to make param writable. */ |
| 649 | if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0) | 650 | if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0) |
| 650 | mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store; | 651 | mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store; |
| 652 | else | ||
| 653 | mk->mp->attrs[mk->mp->num].mattr.store = NULL; | ||
| 651 | mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name; | 654 | mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name; |
| 652 | mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm; | 655 | mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm; |
| 653 | mk->mp->num++; | 656 | mk->mp->num++; |
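The params.c hunk clears two flavours of stale state in the freshly appended sysfs attribute slot: the attribute array is grown with krealloc(), which does not zero the newly added tail, so the slot is memset() first, and .store is explicitly set to NULL when the parameter is not writable instead of being left at whatever the old memory held. The general shape, as a fragment with struct my_attr and my_show as illustrative names:

    struct my_attr *new;

    /* krealloc() preserves the existing entries but leaves the newly
     * extended tail uninitialized. */
    new = krealloc(attrs, sizeof(*attrs) * (num + 1), GFP_KERNEL);
    if (!new)
            return -ENOMEM;
    attrs = new;

    memset(&attrs[num], 0, sizeof(attrs[0]));   /* no stale callbacks */
    attrs[num].show = my_show;
    /* .store stays NULL unless the attribute is meant to be writable */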
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 48b28d387c7f..7e01f78f0417 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
| @@ -251,6 +251,7 @@ config APM_EMULATION | |||
| 251 | 251 | ||
| 252 | config PM_OPP | 252 | config PM_OPP |
| 253 | bool | 253 | bool |
| 254 | select SRCU | ||
| 254 | ---help--- | 255 | ---help--- |
| 255 | SOCs have a standard set of tuples consisting of frequency and | 256 | SOCs have a standard set of tuples consisting of frequency and |
| 256 | voltage pairs that the device will support per voltage domain. This | 257 | voltage pairs that the device will support per voltage domain. This |
diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 5f4c006c4b1e..97b0df71303e 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c | |||
| @@ -41,6 +41,8 @@ | |||
| 41 | #include <linux/platform_device.h> | 41 | #include <linux/platform_device.h> |
| 42 | #include <linux/init.h> | 42 | #include <linux/init.h> |
| 43 | #include <linux/kernel.h> | 43 | #include <linux/kernel.h> |
| 44 | #include <linux/debugfs.h> | ||
| 45 | #include <linux/seq_file.h> | ||
| 44 | 46 | ||
| 45 | #include <linux/uaccess.h> | 47 | #include <linux/uaccess.h> |
| 46 | #include <linux/export.h> | 48 | #include <linux/export.h> |
| @@ -182,6 +184,81 @@ static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) | |||
| 182 | c->target_value = value; | 184 | c->target_value = value; |
| 183 | } | 185 | } |
| 184 | 186 | ||
| 187 | static inline int pm_qos_get_value(struct pm_qos_constraints *c); | ||
| 188 | static int pm_qos_dbg_show_requests(struct seq_file *s, void *unused) | ||
| 189 | { | ||
| 190 | struct pm_qos_object *qos = (struct pm_qos_object *)s->private; | ||
| 191 | struct pm_qos_constraints *c; | ||
| 192 | struct pm_qos_request *req; | ||
| 193 | char *type; | ||
| 194 | unsigned long flags; | ||
| 195 | int tot_reqs = 0; | ||
| 196 | int active_reqs = 0; | ||
| 197 | |||
| 198 | if (IS_ERR_OR_NULL(qos)) { | ||
| 199 | pr_err("%s: bad qos param!\n", __func__); | ||
| 200 | return -EINVAL; | ||
| 201 | } | ||
| 202 | c = qos->constraints; | ||
| 203 | if (IS_ERR_OR_NULL(c)) { | ||
| 204 | pr_err("%s: Bad constraints on qos?\n", __func__); | ||
| 205 | return -EINVAL; | ||
| 206 | } | ||
| 207 | |||
| 208 | /* Lock to ensure we have a snapshot */ | ||
| 209 | spin_lock_irqsave(&pm_qos_lock, flags); | ||
| 210 | if (plist_head_empty(&c->list)) { | ||
| 211 | seq_puts(s, "Empty!\n"); | ||
| 212 | goto out; | ||
| 213 | } | ||
| 214 | |||
| 215 | switch (c->type) { | ||
| 216 | case PM_QOS_MIN: | ||
| 217 | type = "Minimum"; | ||
| 218 | break; | ||
| 219 | case PM_QOS_MAX: | ||
| 220 | type = "Maximum"; | ||
| 221 | break; | ||
| 222 | case PM_QOS_SUM: | ||
| 223 | type = "Sum"; | ||
| 224 | break; | ||
| 225 | default: | ||
| 226 | type = "Unknown"; | ||
| 227 | } | ||
| 228 | |||
| 229 | plist_for_each_entry(req, &c->list, node) { | ||
| 230 | char *state = "Default"; | ||
| 231 | |||
| 232 | if ((req->node).prio != c->default_value) { | ||
| 233 | active_reqs++; | ||
| 234 | state = "Active"; | ||
| 235 | } | ||
| 236 | tot_reqs++; | ||
| 237 | seq_printf(s, "%d: %d: %s\n", tot_reqs, | ||
| 238 | (req->node).prio, state); | ||
| 239 | } | ||
| 240 | |||
| 241 | seq_printf(s, "Type=%s, Value=%d, Requests: active=%d / total=%d\n", | ||
| 242 | type, pm_qos_get_value(c), active_reqs, tot_reqs); | ||
| 243 | |||
| 244 | out: | ||
| 245 | spin_unlock_irqrestore(&pm_qos_lock, flags); | ||
| 246 | return 0; | ||
| 247 | } | ||
| 248 | |||
| 249 | static int pm_qos_dbg_open(struct inode *inode, struct file *file) | ||
| 250 | { | ||
| 251 | return single_open(file, pm_qos_dbg_show_requests, | ||
| 252 | inode->i_private); | ||
| 253 | } | ||
| 254 | |||
| 255 | static const struct file_operations pm_qos_debug_fops = { | ||
| 256 | .open = pm_qos_dbg_open, | ||
| 257 | .read = seq_read, | ||
| 258 | .llseek = seq_lseek, | ||
| 259 | .release = single_release, | ||
| 260 | }; | ||
| 261 | |||
| 185 | /** | 262 | /** |
| 186 | * pm_qos_update_target - manages the constraints list and calls the notifiers | 263 | * pm_qos_update_target - manages the constraints list and calls the notifiers |
| 187 | * if needed | 264 | * if needed |
| @@ -509,12 +586,17 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) | |||
| 509 | EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); | 586 | EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); |
| 510 | 587 | ||
| 511 | /* User space interface to PM QoS classes via misc devices */ | 588 | /* User space interface to PM QoS classes via misc devices */ |
| 512 | static int register_pm_qos_misc(struct pm_qos_object *qos) | 589 | static int register_pm_qos_misc(struct pm_qos_object *qos, struct dentry *d) |
| 513 | { | 590 | { |
| 514 | qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; | 591 | qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; |
| 515 | qos->pm_qos_power_miscdev.name = qos->name; | 592 | qos->pm_qos_power_miscdev.name = qos->name; |
| 516 | qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; | 593 | qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; |
| 517 | 594 | ||
| 595 | if (d) { | ||
| 596 | (void)debugfs_create_file(qos->name, S_IRUGO, d, | ||
| 597 | (void *)qos, &pm_qos_debug_fops); | ||
| 598 | } | ||
| 599 | |||
| 518 | return misc_register(&qos->pm_qos_power_miscdev); | 600 | return misc_register(&qos->pm_qos_power_miscdev); |
| 519 | } | 601 | } |
| 520 | 602 | ||
| @@ -608,11 +690,16 @@ static int __init pm_qos_power_init(void) | |||
| 608 | { | 690 | { |
| 609 | int ret = 0; | 691 | int ret = 0; |
| 610 | int i; | 692 | int i; |
| 693 | struct dentry *d; | ||
| 611 | 694 | ||
| 612 | BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); | 695 | BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); |
| 613 | 696 | ||
| 697 | d = debugfs_create_dir("pm_qos", NULL); | ||
| 698 | if (IS_ERR_OR_NULL(d)) | ||
| 699 | d = NULL; | ||
| 700 | |||
| 614 | for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) { | 701 | for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) { |
| 615 | ret = register_pm_qos_misc(pm_qos_array[i]); | 702 | ret = register_pm_qos_misc(pm_qos_array[i], d); |
| 616 | if (ret < 0) { | 703 | if (ret < 0) { |
| 617 | printk(KERN_ERR "pm_qos_param: %s setup failed\n", | 704 | printk(KERN_ERR "pm_qos_param: %s setup failed\n", |
| 618 | pm_qos_array[i]->name); | 705 | pm_qos_array[i]->name); |
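The pm_qos hunks add a read-only debugfs view of each constraint list using the usual debugfs + seq_file single_open() pattern, registered alongside the existing misc device. A stripped-down sketch of that same pattern ("my_stats", my_show() and my_debugfs_init() are illustrative names, not part of the patch):

    #include <linux/debugfs.h>
    #include <linux/fs.h>
    #include <linux/init.h>
    #include <linux/seq_file.h>

    static int my_show(struct seq_file *s, void *unused)
    {
            seq_printf(s, "value=%d\n", 42);
            return 0;
    }

    static int my_open(struct inode *inode, struct file *file)
    {
            /* hand inode->i_private through as the seq_file private data */
            return single_open(file, my_show, inode->i_private);
    }

    static const struct file_operations my_fops = {
            .open           = my_open,
            .read           = seq_read,
            .llseek         = seq_lseek,
            .release        = single_release,
    };

    static int __init my_debugfs_init(void)
    {
            /* NULL parent places the file directly under /sys/kernel/debug */
            debugfs_create_file("my_stats", S_IRUGO, NULL, NULL, &my_fops);
            return 0;
    }

The pm_qos code passes the pm_qos_object as i_private, which is why pm_qos_dbg_show_requests() can recover the constraint list from s->private.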
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 0c40c16174b4..c24d5a23bf93 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
| @@ -1472,9 +1472,9 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, | |||
| 1472 | /** | 1472 | /** |
| 1473 | * free_unnecessary_pages - Release preallocated pages not needed for the image | 1473 | * free_unnecessary_pages - Release preallocated pages not needed for the image |
| 1474 | */ | 1474 | */ |
| 1475 | static void free_unnecessary_pages(void) | 1475 | static unsigned long free_unnecessary_pages(void) |
| 1476 | { | 1476 | { |
| 1477 | unsigned long save, to_free_normal, to_free_highmem; | 1477 | unsigned long save, to_free_normal, to_free_highmem, free; |
| 1478 | 1478 | ||
| 1479 | save = count_data_pages(); | 1479 | save = count_data_pages(); |
| 1480 | if (alloc_normal >= save) { | 1480 | if (alloc_normal >= save) { |
| @@ -1495,6 +1495,7 @@ static void free_unnecessary_pages(void) | |||
| 1495 | else | 1495 | else |
| 1496 | to_free_normal = 0; | 1496 | to_free_normal = 0; |
| 1497 | } | 1497 | } |
| 1498 | free = to_free_normal + to_free_highmem; | ||
| 1498 | 1499 | ||
| 1499 | memory_bm_position_reset(©_bm); | 1500 | memory_bm_position_reset(©_bm); |
| 1500 | 1501 | ||
| @@ -1518,6 +1519,8 @@ static void free_unnecessary_pages(void) | |||
| 1518 | swsusp_unset_page_free(page); | 1519 | swsusp_unset_page_free(page); |
| 1519 | __free_page(page); | 1520 | __free_page(page); |
| 1520 | } | 1521 | } |
| 1522 | |||
| 1523 | return free; | ||
| 1521 | } | 1524 | } |
| 1522 | 1525 | ||
| 1523 | /** | 1526 | /** |
| @@ -1707,7 +1710,7 @@ int hibernate_preallocate_memory(void) | |||
| 1707 | * pages in memory, but we have allocated more. Release the excessive | 1710 | * pages in memory, but we have allocated more. Release the excessive |
| 1708 | * ones now. | 1711 | * ones now. |
| 1709 | */ | 1712 | */ |
| 1710 | free_unnecessary_pages(); | 1713 | pages -= free_unnecessary_pages(); |
| 1711 | 1714 | ||
| 1712 | out: | 1715 | out: |
| 1713 | stop = ktime_get(); | 1716 | stop = ktime_get(); |
| @@ -2310,8 +2313,6 @@ static inline void free_highmem_data(void) | |||
| 2310 | free_image_page(buffer, PG_UNSAFE_CLEAR); | 2313 | free_image_page(buffer, PG_UNSAFE_CLEAR); |
| 2311 | } | 2314 | } |
| 2312 | #else | 2315 | #else |
| 2313 | static inline int get_safe_write_buffer(void) { return 0; } | ||
| 2314 | |||
| 2315 | static unsigned int | 2316 | static unsigned int |
| 2316 | count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } | 2317 | count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } |
| 2317 | 2318 | ||
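The snapshot.c change makes free_unnecessary_pages() report how many preallocated pages it actually released, and hibernate_preallocate_memory() subtracts that from its running total, so the page count it works with afterwards reflects what is really still allocated. Caller-side shape, as a fragment (the pr_info() line is illustrative, not from the patch):

    /* 'pages' holds the preallocation estimate built up earlier */
    pages -= free_unnecessary_pages();
    pr_info("PM: hibernation image will use at most %lu pages\n", pages);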
diff --git a/kernel/range.c b/kernel/range.c index 322ea8e93e4b..82cfc285b046 100644 --- a/kernel/range.c +++ b/kernel/range.c | |||
| @@ -113,12 +113,12 @@ static int cmp_range(const void *x1, const void *x2) | |||
| 113 | { | 113 | { |
| 114 | const struct range *r1 = x1; | 114 | const struct range *r1 = x1; |
| 115 | const struct range *r2 = x2; | 115 | const struct range *r2 = x2; |
| 116 | s64 start1, start2; | ||
| 117 | 116 | ||
| 118 | start1 = r1->start; | 117 | if (r1->start < r2->start) |
| 119 | start2 = r2->start; | 118 | return -1; |
| 120 | 119 | if (r1->start > r2->start) | |
| 121 | return start1 - start2; | 120 | return 1; |
| 121 | return 0; | ||
| 122 | } | 122 | } |
| 123 | 123 | ||
| 124 | int clean_sort_range(struct range *range, int az) | 124 | int clean_sort_range(struct range *range, int az) |
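The cmp_range() fix replaces "return the difference" with a three-way comparison. The old code computed an s64 difference and returned it through an int, so ranges far enough apart had their difference truncated, and two clearly different starts could compare as equal or even in the wrong order. A small userspace demonstration of the truncation (names and values are illustrative):

    #include <stdio.h>

    /* Overflow-proof three-way comparison, the same shape as the fix. */
    static int cmp_u64(unsigned long long a, unsigned long long b)
    {
            if (a < b)
                    return -1;
            if (a > b)
                    return 1;
            return 0;
    }

    int main(void)
    {
            unsigned long long lo = 0, hi = 1ULL << 32;

            /* (int)(lo - hi) == 0: subtraction says "equal", which is wrong. */
            printf("subtraction: %d, three-way: %d\n",
                   (int)(lo - hi), cmp_u64(lo, hi));
            return 0;
    }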
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index e6fae503d1bc..50a808424b06 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile | |||
| @@ -1,4 +1,5 @@ | |||
| 1 | obj-y += update.o srcu.o | 1 | obj-y += update.o |
| 2 | obj-$(CONFIG_SRCU) += srcu.o | ||
| 2 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | 3 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o |
| 3 | obj-$(CONFIG_TREE_RCU) += tree.o | 4 | obj-$(CONFIG_TREE_RCU) += tree.o |
| 4 | obj-$(CONFIG_PREEMPT_RCU) += tree.o | 5 | obj-$(CONFIG_PREEMPT_RCU) += tree.o |
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 07bb02eda844..80adef7d4c3d 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h | |||
| @@ -137,4 +137,10 @@ int rcu_jiffies_till_stall_check(void); | |||
| 137 | 137 | ||
| 138 | void rcu_early_boot_tests(void); | 138 | void rcu_early_boot_tests(void); |
| 139 | 139 | ||
| 140 | /* | ||
| 141 | * This function really isn't for public consumption, but RCU is special in | ||
| 142 | * that context switches can allow the state machine to make progress. | ||
| 143 | */ | ||
| 144 | extern void resched_cpu(int cpu); | ||
| 145 | |||
| 140 | #endif /* __LINUX_RCU_H */ | 146 | #endif /* __LINUX_RCU_H */ |
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 4d559baf06e0..30d42aa55d83 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c | |||
| @@ -244,7 +244,8 @@ struct rcu_torture_ops { | |||
| 244 | int (*readlock)(void); | 244 | int (*readlock)(void); |
| 245 | void (*read_delay)(struct torture_random_state *rrsp); | 245 | void (*read_delay)(struct torture_random_state *rrsp); |
| 246 | void (*readunlock)(int idx); | 246 | void (*readunlock)(int idx); |
| 247 | int (*completed)(void); | 247 | unsigned long (*started)(void); |
| 248 | unsigned long (*completed)(void); | ||
| 248 | void (*deferred_free)(struct rcu_torture *p); | 249 | void (*deferred_free)(struct rcu_torture *p); |
| 249 | void (*sync)(void); | 250 | void (*sync)(void); |
| 250 | void (*exp_sync)(void); | 251 | void (*exp_sync)(void); |
| @@ -296,11 +297,6 @@ static void rcu_torture_read_unlock(int idx) __releases(RCU) | |||
| 296 | rcu_read_unlock(); | 297 | rcu_read_unlock(); |
| 297 | } | 298 | } |
| 298 | 299 | ||
| 299 | static int rcu_torture_completed(void) | ||
| 300 | { | ||
| 301 | return rcu_batches_completed(); | ||
| 302 | } | ||
| 303 | |||
| 304 | /* | 300 | /* |
| 305 | * Update callback in the pipe. This should be invoked after a grace period. | 301 | * Update callback in the pipe. This should be invoked after a grace period. |
| 306 | */ | 302 | */ |
| @@ -356,7 +352,7 @@ rcu_torture_cb(struct rcu_head *p) | |||
| 356 | cur_ops->deferred_free(rp); | 352 | cur_ops->deferred_free(rp); |
| 357 | } | 353 | } |
| 358 | 354 | ||
| 359 | static int rcu_no_completed(void) | 355 | static unsigned long rcu_no_completed(void) |
| 360 | { | 356 | { |
| 361 | return 0; | 357 | return 0; |
| 362 | } | 358 | } |
| @@ -377,7 +373,8 @@ static struct rcu_torture_ops rcu_ops = { | |||
| 377 | .readlock = rcu_torture_read_lock, | 373 | .readlock = rcu_torture_read_lock, |
| 378 | .read_delay = rcu_read_delay, | 374 | .read_delay = rcu_read_delay, |
| 379 | .readunlock = rcu_torture_read_unlock, | 375 | .readunlock = rcu_torture_read_unlock, |
| 380 | .completed = rcu_torture_completed, | 376 | .started = rcu_batches_started, |
| 377 | .completed = rcu_batches_completed, | ||
| 381 | .deferred_free = rcu_torture_deferred_free, | 378 | .deferred_free = rcu_torture_deferred_free, |
| 382 | .sync = synchronize_rcu, | 379 | .sync = synchronize_rcu, |
| 383 | .exp_sync = synchronize_rcu_expedited, | 380 | .exp_sync = synchronize_rcu_expedited, |
| @@ -407,11 +404,6 @@ static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) | |||
| 407 | rcu_read_unlock_bh(); | 404 | rcu_read_unlock_bh(); |
| 408 | } | 405 | } |
| 409 | 406 | ||
| 410 | static int rcu_bh_torture_completed(void) | ||
| 411 | { | ||
| 412 | return rcu_batches_completed_bh(); | ||
| 413 | } | ||
| 414 | |||
| 415 | static void rcu_bh_torture_deferred_free(struct rcu_torture *p) | 407 | static void rcu_bh_torture_deferred_free(struct rcu_torture *p) |
| 416 | { | 408 | { |
| 417 | call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); | 409 | call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); |
| @@ -423,7 +415,8 @@ static struct rcu_torture_ops rcu_bh_ops = { | |||
| 423 | .readlock = rcu_bh_torture_read_lock, | 415 | .readlock = rcu_bh_torture_read_lock, |
| 424 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 416 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
| 425 | .readunlock = rcu_bh_torture_read_unlock, | 417 | .readunlock = rcu_bh_torture_read_unlock, |
| 426 | .completed = rcu_bh_torture_completed, | 418 | .started = rcu_batches_started_bh, |
| 419 | .completed = rcu_batches_completed_bh, | ||
| 427 | .deferred_free = rcu_bh_torture_deferred_free, | 420 | .deferred_free = rcu_bh_torture_deferred_free, |
| 428 | .sync = synchronize_rcu_bh, | 421 | .sync = synchronize_rcu_bh, |
| 429 | .exp_sync = synchronize_rcu_bh_expedited, | 422 | .exp_sync = synchronize_rcu_bh_expedited, |
| @@ -466,6 +459,7 @@ static struct rcu_torture_ops rcu_busted_ops = { | |||
| 466 | .readlock = rcu_torture_read_lock, | 459 | .readlock = rcu_torture_read_lock, |
| 467 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 460 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
| 468 | .readunlock = rcu_torture_read_unlock, | 461 | .readunlock = rcu_torture_read_unlock, |
| 462 | .started = rcu_no_completed, | ||
| 469 | .completed = rcu_no_completed, | 463 | .completed = rcu_no_completed, |
| 470 | .deferred_free = rcu_busted_torture_deferred_free, | 464 | .deferred_free = rcu_busted_torture_deferred_free, |
| 471 | .sync = synchronize_rcu_busted, | 465 | .sync = synchronize_rcu_busted, |
| @@ -510,7 +504,7 @@ static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) | |||
| 510 | srcu_read_unlock(&srcu_ctl, idx); | 504 | srcu_read_unlock(&srcu_ctl, idx); |
| 511 | } | 505 | } |
| 512 | 506 | ||
| 513 | static int srcu_torture_completed(void) | 507 | static unsigned long srcu_torture_completed(void) |
| 514 | { | 508 | { |
| 515 | return srcu_batches_completed(&srcu_ctl); | 509 | return srcu_batches_completed(&srcu_ctl); |
| 516 | } | 510 | } |
| @@ -564,6 +558,7 @@ static struct rcu_torture_ops srcu_ops = { | |||
| 564 | .readlock = srcu_torture_read_lock, | 558 | .readlock = srcu_torture_read_lock, |
| 565 | .read_delay = srcu_read_delay, | 559 | .read_delay = srcu_read_delay, |
| 566 | .readunlock = srcu_torture_read_unlock, | 560 | .readunlock = srcu_torture_read_unlock, |
| 561 | .started = NULL, | ||
| 567 | .completed = srcu_torture_completed, | 562 | .completed = srcu_torture_completed, |
| 568 | .deferred_free = srcu_torture_deferred_free, | 563 | .deferred_free = srcu_torture_deferred_free, |
| 569 | .sync = srcu_torture_synchronize, | 564 | .sync = srcu_torture_synchronize, |
| @@ -600,7 +595,8 @@ static struct rcu_torture_ops sched_ops = { | |||
| 600 | .readlock = sched_torture_read_lock, | 595 | .readlock = sched_torture_read_lock, |
| 601 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 596 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
| 602 | .readunlock = sched_torture_read_unlock, | 597 | .readunlock = sched_torture_read_unlock, |
| 603 | .completed = rcu_no_completed, | 598 | .started = rcu_batches_started_sched, |
| 599 | .completed = rcu_batches_completed_sched, | ||
| 604 | .deferred_free = rcu_sched_torture_deferred_free, | 600 | .deferred_free = rcu_sched_torture_deferred_free, |
| 605 | .sync = synchronize_sched, | 601 | .sync = synchronize_sched, |
| 606 | .exp_sync = synchronize_sched_expedited, | 602 | .exp_sync = synchronize_sched_expedited, |
| @@ -638,6 +634,7 @@ static struct rcu_torture_ops tasks_ops = { | |||
| 638 | .readlock = tasks_torture_read_lock, | 634 | .readlock = tasks_torture_read_lock, |
| 639 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 635 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
| 640 | .readunlock = tasks_torture_read_unlock, | 636 | .readunlock = tasks_torture_read_unlock, |
| 637 | .started = rcu_no_completed, | ||
| 641 | .completed = rcu_no_completed, | 638 | .completed = rcu_no_completed, |
| 642 | .deferred_free = rcu_tasks_torture_deferred_free, | 639 | .deferred_free = rcu_tasks_torture_deferred_free, |
| 643 | .sync = synchronize_rcu_tasks, | 640 | .sync = synchronize_rcu_tasks, |
| @@ -1015,8 +1012,8 @@ static void rcutorture_trace_dump(void) | |||
| 1015 | static void rcu_torture_timer(unsigned long unused) | 1012 | static void rcu_torture_timer(unsigned long unused) |
| 1016 | { | 1013 | { |
| 1017 | int idx; | 1014 | int idx; |
| 1018 | int completed; | 1015 | unsigned long started; |
| 1019 | int completed_end; | 1016 | unsigned long completed; |
| 1020 | static DEFINE_TORTURE_RANDOM(rand); | 1017 | static DEFINE_TORTURE_RANDOM(rand); |
| 1021 | static DEFINE_SPINLOCK(rand_lock); | 1018 | static DEFINE_SPINLOCK(rand_lock); |
| 1022 | struct rcu_torture *p; | 1019 | struct rcu_torture *p; |
| @@ -1024,7 +1021,10 @@ static void rcu_torture_timer(unsigned long unused) | |||
| 1024 | unsigned long long ts; | 1021 | unsigned long long ts; |
| 1025 | 1022 | ||
| 1026 | idx = cur_ops->readlock(); | 1023 | idx = cur_ops->readlock(); |
| 1027 | completed = cur_ops->completed(); | 1024 | if (cur_ops->started) |
| 1025 | started = cur_ops->started(); | ||
| 1026 | else | ||
| 1027 | started = cur_ops->completed(); | ||
| 1028 | ts = rcu_trace_clock_local(); | 1028 | ts = rcu_trace_clock_local(); |
| 1029 | p = rcu_dereference_check(rcu_torture_current, | 1029 | p = rcu_dereference_check(rcu_torture_current, |
| 1030 | rcu_read_lock_bh_held() || | 1030 | rcu_read_lock_bh_held() || |
| @@ -1047,14 +1047,16 @@ static void rcu_torture_timer(unsigned long unused) | |||
| 1047 | /* Should not happen, but... */ | 1047 | /* Should not happen, but... */ |
| 1048 | pipe_count = RCU_TORTURE_PIPE_LEN; | 1048 | pipe_count = RCU_TORTURE_PIPE_LEN; |
| 1049 | } | 1049 | } |
| 1050 | completed_end = cur_ops->completed(); | 1050 | completed = cur_ops->completed(); |
| 1051 | if (pipe_count > 1) { | 1051 | if (pipe_count > 1) { |
| 1052 | do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, | 1052 | do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, |
| 1053 | completed, completed_end); | 1053 | started, completed); |
| 1054 | rcutorture_trace_dump(); | 1054 | rcutorture_trace_dump(); |
| 1055 | } | 1055 | } |
| 1056 | __this_cpu_inc(rcu_torture_count[pipe_count]); | 1056 | __this_cpu_inc(rcu_torture_count[pipe_count]); |
| 1057 | completed = completed_end - completed; | 1057 | completed = completed - started; |
| 1058 | if (cur_ops->started) | ||
| 1059 | completed++; | ||
| 1058 | if (completed > RCU_TORTURE_PIPE_LEN) { | 1060 | if (completed > RCU_TORTURE_PIPE_LEN) { |
| 1059 | /* Should not happen, but... */ | 1061 | /* Should not happen, but... */ |
| 1060 | completed = RCU_TORTURE_PIPE_LEN; | 1062 | completed = RCU_TORTURE_PIPE_LEN; |
| @@ -1073,8 +1075,8 @@ static void rcu_torture_timer(unsigned long unused) | |||
| 1073 | static int | 1075 | static int |
| 1074 | rcu_torture_reader(void *arg) | 1076 | rcu_torture_reader(void *arg) |
| 1075 | { | 1077 | { |
| 1076 | int completed; | 1078 | unsigned long started; |
| 1077 | int completed_end; | 1079 | unsigned long completed; |
| 1078 | int idx; | 1080 | int idx; |
| 1079 | DEFINE_TORTURE_RANDOM(rand); | 1081 | DEFINE_TORTURE_RANDOM(rand); |
| 1080 | struct rcu_torture *p; | 1082 | struct rcu_torture *p; |
| @@ -1093,7 +1095,10 @@ rcu_torture_reader(void *arg) | |||
| 1093 | mod_timer(&t, jiffies + 1); | 1095 | mod_timer(&t, jiffies + 1); |
| 1094 | } | 1096 | } |
| 1095 | idx = cur_ops->readlock(); | 1097 | idx = cur_ops->readlock(); |
| 1096 | completed = cur_ops->completed(); | 1098 | if (cur_ops->started) |
| 1099 | started = cur_ops->started(); | ||
| 1100 | else | ||
| 1101 | started = cur_ops->completed(); | ||
| 1097 | ts = rcu_trace_clock_local(); | 1102 | ts = rcu_trace_clock_local(); |
| 1098 | p = rcu_dereference_check(rcu_torture_current, | 1103 | p = rcu_dereference_check(rcu_torture_current, |
| 1099 | rcu_read_lock_bh_held() || | 1104 | rcu_read_lock_bh_held() || |
| @@ -1114,14 +1119,16 @@ rcu_torture_reader(void *arg) | |||
| 1114 | /* Should not happen, but... */ | 1119 | /* Should not happen, but... */ |
| 1115 | pipe_count = RCU_TORTURE_PIPE_LEN; | 1120 | pipe_count = RCU_TORTURE_PIPE_LEN; |
| 1116 | } | 1121 | } |
| 1117 | completed_end = cur_ops->completed(); | 1122 | completed = cur_ops->completed(); |
| 1118 | if (pipe_count > 1) { | 1123 | if (pipe_count > 1) { |
| 1119 | do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, | 1124 | do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, |
| 1120 | ts, completed, completed_end); | 1125 | ts, started, completed); |
| 1121 | rcutorture_trace_dump(); | 1126 | rcutorture_trace_dump(); |
| 1122 | } | 1127 | } |
| 1123 | __this_cpu_inc(rcu_torture_count[pipe_count]); | 1128 | __this_cpu_inc(rcu_torture_count[pipe_count]); |
| 1124 | completed = completed_end - completed; | 1129 | completed = completed - started; |
| 1130 | if (cur_ops->started) | ||
| 1131 | completed++; | ||
| 1125 | if (completed > RCU_TORTURE_PIPE_LEN) { | 1132 | if (completed > RCU_TORTURE_PIPE_LEN) { |
| 1126 | /* Should not happen, but... */ | 1133 | /* Should not happen, but... */ |
| 1127 | completed = RCU_TORTURE_PIPE_LEN; | 1134 | completed = RCU_TORTURE_PIPE_LEN; |
| @@ -1420,6 +1427,9 @@ static int rcu_torture_barrier(void *arg) | |||
| 1420 | cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */ | 1427 | cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */ |
| 1421 | if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { | 1428 | if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { |
| 1422 | n_rcu_torture_barrier_error++; | 1429 | n_rcu_torture_barrier_error++; |
| 1430 | pr_err("barrier_cbs_invoked = %d, n_barrier_cbs = %d\n", | ||
| 1431 | atomic_read(&barrier_cbs_invoked), | ||
| 1432 | n_barrier_cbs); | ||
| 1423 | WARN_ON_ONCE(1); | 1433 | WARN_ON_ONCE(1); |
| 1424 | } | 1434 | } |
| 1425 | n_barrier_successes++; | 1435 | n_barrier_successes++; |
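The rcutorture hunks split the single ->completed() snapshot into ->started() taken before the read-side critical section and ->completed() taken after it, and both now return unsigned long. Unsigned, free-running counters make the "how many batches elapsed" arithmetic trivially wrap-safe, which is presumably why the type changed; a tiny standalone illustration:

    #include <limits.h>
    #include <stdio.h>

    int main(void)
    {
            /* Free-running grace-period counters may wrap; unsigned
             * subtraction still yields the number of elapsed batches. */
            unsigned long started = ULONG_MAX - 1;  /* snapshot before the read */
            unsigned long completed = 2;            /* snapshot after the read */

            printf("batches elapsed: %lu\n", completed - started); /* prints 4 */
            return 0;
    }

The extra "completed++" when a separate ->started() exists accounts for the batch already in flight when the first snapshot was taken.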
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index e037f3eb2f7b..445bf8ffe3fb 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c | |||
| @@ -546,7 +546,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier); | |||
| 546 | * Report the number of batches, correlated with, but not necessarily | 546 | * Report the number of batches, correlated with, but not necessarily |
| 547 | * precisely the same as, the number of grace periods that have elapsed. | 547 | * precisely the same as, the number of grace periods that have elapsed. |
| 548 | */ | 548 | */ |
| 549 | long srcu_batches_completed(struct srcu_struct *sp) | 549 | unsigned long srcu_batches_completed(struct srcu_struct *sp) |
| 550 | { | 550 | { |
| 551 | return sp->completed; | 551 | return sp->completed; |
| 552 | } | 552 | } |
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 0db5649f8817..cc9ceca7bde1 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c | |||
| @@ -47,54 +47,14 @@ static void __call_rcu(struct rcu_head *head, | |||
| 47 | void (*func)(struct rcu_head *rcu), | 47 | void (*func)(struct rcu_head *rcu), |
| 48 | struct rcu_ctrlblk *rcp); | 48 | struct rcu_ctrlblk *rcp); |
| 49 | 49 | ||
| 50 | static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | ||
| 51 | |||
| 52 | #include "tiny_plugin.h" | 50 | #include "tiny_plugin.h" |
| 53 | 51 | ||
| 54 | /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcu/tree.c. */ | ||
| 55 | static void rcu_idle_enter_common(long long newval) | ||
| 56 | { | ||
| 57 | if (newval) { | ||
| 58 | RCU_TRACE(trace_rcu_dyntick(TPS("--="), | ||
| 59 | rcu_dynticks_nesting, newval)); | ||
| 60 | rcu_dynticks_nesting = newval; | ||
| 61 | return; | ||
| 62 | } | ||
| 63 | RCU_TRACE(trace_rcu_dyntick(TPS("Start"), | ||
| 64 | rcu_dynticks_nesting, newval)); | ||
| 65 | if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) { | ||
| 66 | struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); | ||
| 67 | |||
| 68 | RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"), | ||
| 69 | rcu_dynticks_nesting, newval)); | ||
| 70 | ftrace_dump(DUMP_ALL); | ||
| 71 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | ||
| 72 | current->pid, current->comm, | ||
| 73 | idle->pid, idle->comm); /* must be idle task! */ | ||
| 74 | } | ||
| 75 | rcu_sched_qs(); /* implies rcu_bh_inc() */ | ||
| 76 | barrier(); | ||
| 77 | rcu_dynticks_nesting = newval; | ||
| 78 | } | ||
| 79 | |||
| 80 | /* | 52 | /* |
| 81 | * Enter idle, which is an extended quiescent state if we have fully | 53 | * Enter idle, which is an extended quiescent state if we have fully |
| 82 | * entered that mode (i.e., if the new value of dynticks_nesting is zero). | 54 | * entered that mode. |
| 83 | */ | 55 | */ |
| 84 | void rcu_idle_enter(void) | 56 | void rcu_idle_enter(void) |
| 85 | { | 57 | { |
| 86 | unsigned long flags; | ||
| 87 | long long newval; | ||
| 88 | |||
| 89 | local_irq_save(flags); | ||
| 90 | WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); | ||
| 91 | if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == | ||
| 92 | DYNTICK_TASK_NEST_VALUE) | ||
| 93 | newval = 0; | ||
| 94 | else | ||
| 95 | newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE; | ||
| 96 | rcu_idle_enter_common(newval); | ||
| 97 | local_irq_restore(flags); | ||
| 98 | } | 58 | } |
| 99 | EXPORT_SYMBOL_GPL(rcu_idle_enter); | 59 | EXPORT_SYMBOL_GPL(rcu_idle_enter); |
| 100 | 60 | ||
| @@ -103,55 +63,14 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter); | |||
| 103 | */ | 63 | */ |
| 104 | void rcu_irq_exit(void) | 64 | void rcu_irq_exit(void) |
| 105 | { | 65 | { |
| 106 | unsigned long flags; | ||
| 107 | long long newval; | ||
| 108 | |||
| 109 | local_irq_save(flags); | ||
| 110 | newval = rcu_dynticks_nesting - 1; | ||
| 111 | WARN_ON_ONCE(newval < 0); | ||
| 112 | rcu_idle_enter_common(newval); | ||
| 113 | local_irq_restore(flags); | ||
| 114 | } | 66 | } |
| 115 | EXPORT_SYMBOL_GPL(rcu_irq_exit); | 67 | EXPORT_SYMBOL_GPL(rcu_irq_exit); |
| 116 | 68 | ||
| 117 | /* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcu/tree.c. */ | ||
| 118 | static void rcu_idle_exit_common(long long oldval) | ||
| 119 | { | ||
| 120 | if (oldval) { | ||
| 121 | RCU_TRACE(trace_rcu_dyntick(TPS("++="), | ||
| 122 | oldval, rcu_dynticks_nesting)); | ||
| 123 | return; | ||
| 124 | } | ||
| 125 | RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting)); | ||
| 126 | if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) { | ||
| 127 | struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); | ||
| 128 | |||
| 129 | RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"), | ||
| 130 | oldval, rcu_dynticks_nesting)); | ||
| 131 | ftrace_dump(DUMP_ALL); | ||
| 132 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | ||
| 133 | current->pid, current->comm, | ||
| 134 | idle->pid, idle->comm); /* must be idle task! */ | ||
| 135 | } | ||
| 136 | } | ||
| 137 | |||
| 138 | /* | 69 | /* |
| 139 | * Exit idle, so that we are no longer in an extended quiescent state. | 70 | * Exit idle, so that we are no longer in an extended quiescent state. |
| 140 | */ | 71 | */ |
| 141 | void rcu_idle_exit(void) | 72 | void rcu_idle_exit(void) |
| 142 | { | 73 | { |
| 143 | unsigned long flags; | ||
| 144 | long long oldval; | ||
| 145 | |||
| 146 | local_irq_save(flags); | ||
| 147 | oldval = rcu_dynticks_nesting; | ||
| 148 | WARN_ON_ONCE(rcu_dynticks_nesting < 0); | ||
| 149 | if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) | ||
| 150 | rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE; | ||
| 151 | else | ||
| 152 | rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | ||
| 153 | rcu_idle_exit_common(oldval); | ||
| 154 | local_irq_restore(flags); | ||
| 155 | } | 74 | } |
| 156 | EXPORT_SYMBOL_GPL(rcu_idle_exit); | 75 | EXPORT_SYMBOL_GPL(rcu_idle_exit); |
| 157 | 76 | ||
| @@ -160,15 +79,6 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit); | |||
| 160 | */ | 79 | */ |
| 161 | void rcu_irq_enter(void) | 80 | void rcu_irq_enter(void) |
| 162 | { | 81 | { |
| 163 | unsigned long flags; | ||
| 164 | long long oldval; | ||
| 165 | |||
| 166 | local_irq_save(flags); | ||
| 167 | oldval = rcu_dynticks_nesting; | ||
| 168 | rcu_dynticks_nesting++; | ||
| 169 | WARN_ON_ONCE(rcu_dynticks_nesting == 0); | ||
| 170 | rcu_idle_exit_common(oldval); | ||
| 171 | local_irq_restore(flags); | ||
| 172 | } | 82 | } |
| 173 | EXPORT_SYMBOL_GPL(rcu_irq_enter); | 83 | EXPORT_SYMBOL_GPL(rcu_irq_enter); |
| 174 | 84 | ||
| @@ -179,23 +89,13 @@ EXPORT_SYMBOL_GPL(rcu_irq_enter); | |||
| 179 | */ | 89 | */ |
| 180 | bool notrace __rcu_is_watching(void) | 90 | bool notrace __rcu_is_watching(void) |
| 181 | { | 91 | { |
| 182 | return rcu_dynticks_nesting; | 92 | return true; |
| 183 | } | 93 | } |
| 184 | EXPORT_SYMBOL(__rcu_is_watching); | 94 | EXPORT_SYMBOL(__rcu_is_watching); |
| 185 | 95 | ||
| 186 | #endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ | 96 | #endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ |
| 187 | 97 | ||
| 188 | /* | 98 | /* |
| 189 | * Test whether the current CPU was interrupted from idle. Nested | ||
| 190 | * interrupts don't count, we must be running at the first interrupt | ||
| 191 | * level. | ||
| 192 | */ | ||
| 193 | static int rcu_is_cpu_rrupt_from_idle(void) | ||
| 194 | { | ||
| 195 | return rcu_dynticks_nesting <= 1; | ||
| 196 | } | ||
| 197 | |||
| 198 | /* | ||
| 199 | * Helper function for rcu_sched_qs() and rcu_bh_qs(). | 99 | * Helper function for rcu_sched_qs() and rcu_bh_qs(). |
| 200 | * Also irqs are disabled to avoid confusion due to interrupt handlers | 100 | * Also irqs are disabled to avoid confusion due to interrupt handlers |
| 201 | * invoking call_rcu(). | 101 | * invoking call_rcu(). |
| @@ -250,7 +150,7 @@ void rcu_bh_qs(void) | |||
| 250 | void rcu_check_callbacks(int user) | 150 | void rcu_check_callbacks(int user) |
| 251 | { | 151 | { |
| 252 | RCU_TRACE(check_cpu_stalls()); | 152 | RCU_TRACE(check_cpu_stalls()); |
| 253 | if (user || rcu_is_cpu_rrupt_from_idle()) | 153 | if (user) |
| 254 | rcu_sched_qs(); | 154 | rcu_sched_qs(); |
| 255 | else if (!in_softirq()) | 155 | else if (!in_softirq()) |
| 256 | rcu_bh_qs(); | 156 | rcu_bh_qs(); |
| @@ -357,6 +257,11 @@ static void __call_rcu(struct rcu_head *head, | |||
| 357 | rcp->curtail = &head->next; | 257 | rcp->curtail = &head->next; |
| 358 | RCU_TRACE(rcp->qlen++); | 258 | RCU_TRACE(rcp->qlen++); |
| 359 | local_irq_restore(flags); | 259 | local_irq_restore(flags); |
| 260 | |||
| 261 | if (unlikely(is_idle_task(current))) { | ||
| 262 | /* force scheduling for rcu_sched_qs() */ | ||
| 263 | resched_cpu(0); | ||
| 264 | } | ||
| 360 | } | 265 | } |
| 361 | 266 | ||
| 362 | /* | 267 | /* |
| @@ -383,6 +288,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); | |||
| 383 | void __init rcu_init(void) | 288 | void __init rcu_init(void) |
| 384 | { | 289 | { |
| 385 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 290 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
| 291 | RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk)); | ||
| 292 | RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk)); | ||
| 386 | 293 | ||
| 387 | rcu_early_boot_tests(); | 294 | rcu_early_boot_tests(); |
| 388 | } | 295 | } |
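The tiny.c hunks drop the dyntick-idle nesting counter entirely: on the uniprocessor configurations Tiny RCU serves, the idle and irq entry/exit hooks become empty, __rcu_is_watching() is unconditionally true, and the one case that still needs help, a callback queued from the idle task, is handled by kicking the scheduler so a quiescent state is reported via the resulting context switch. The key piece, annotated (this mirrors the __call_rcu() tail added above rather than introducing new logic):

    if (unlikely(is_idle_task(current))) {
            /* Nothing else will schedule on an idle CPU, so poke CPU 0
             * (Tiny RCU is UP-only) and let rcu_sched_qs() run from the
             * context switch that follows. */
            resched_cpu(0);
    }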
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index 858c56569127..f94e209a10d6 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h | |||
| @@ -145,17 +145,16 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) | |||
| 145 | rcp->ticks_this_gp++; | 145 | rcp->ticks_this_gp++; |
| 146 | j = jiffies; | 146 | j = jiffies; |
| 147 | js = ACCESS_ONCE(rcp->jiffies_stall); | 147 | js = ACCESS_ONCE(rcp->jiffies_stall); |
| 148 | if (*rcp->curtail && ULONG_CMP_GE(j, js)) { | 148 | if (rcp->rcucblist && ULONG_CMP_GE(j, js)) { |
| 149 | pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", | 149 | pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", |
| 150 | rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, | 150 | rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE, |
| 151 | jiffies - rcp->gp_start, rcp->qlen); | 151 | jiffies - rcp->gp_start, rcp->qlen); |
| 152 | dump_stack(); | 152 | dump_stack(); |
| 153 | } | ||
| 154 | if (*rcp->curtail && ULONG_CMP_GE(j, js)) | ||
| 155 | ACCESS_ONCE(rcp->jiffies_stall) = jiffies + | 153 | ACCESS_ONCE(rcp->jiffies_stall) = jiffies + |
| 156 | 3 * rcu_jiffies_till_stall_check() + 3; | 154 | 3 * rcu_jiffies_till_stall_check() + 3; |
| 157 | else if (ULONG_CMP_GE(j, js)) | 155 | } else if (ULONG_CMP_GE(j, js)) { |
| 158 | ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); | 156 | ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); |
| 157 | } | ||
| 159 | } | 158 | } |
| 160 | 159 | ||
| 161 | static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) | 160 | static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) |
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7680fc275036..48d640ca1a05 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
| @@ -156,6 +156,10 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | |||
| 156 | static void invoke_rcu_core(void); | 156 | static void invoke_rcu_core(void); |
| 157 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | 157 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); |
| 158 | 158 | ||
| 159 | /* rcuc/rcub kthread realtime priority */ | ||
| 160 | static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; | ||
| 161 | module_param(kthread_prio, int, 0644); | ||
| 162 | |||
| 159 | /* | 163 | /* |
| 160 | * Track the rcutorture test sequence number and the update version | 164 | * Track the rcutorture test sequence number and the update version |
| 161 | * number within a given test. The rcutorture_testseq is incremented | 165 | * number within a given test. The rcutorture_testseq is incremented |
| @@ -215,6 +219,9 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | |||
| 215 | #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | 219 | #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ |
| 216 | }; | 220 | }; |
| 217 | 221 | ||
| 222 | DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); | ||
| 223 | EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); | ||
| 224 | |||
| 218 | /* | 225 | /* |
| 219 | * Let the RCU core know that this CPU has gone through the scheduler, | 226 | * Let the RCU core know that this CPU has gone through the scheduler, |
| 220 | * which is a quiescent state. This is called when the need for a | 227 | * which is a quiescent state. This is called when the need for a |
| @@ -284,6 +291,22 @@ void rcu_note_context_switch(void) | |||
| 284 | } | 291 | } |
| 285 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 292 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
| 286 | 293 | ||
| 294 | /* | ||
| 295 | * Register a quiescent state for all RCU flavors. If there is an | ||
| 296 | * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight | ||
| 297 | * dyntick-idle quiescent state visible to other CPUs (but only for those | ||
| 298 | * RCU flavors in desperate need of a quiescent state, which will normally | ||
| 299 | * be none of them). Either way, do a lightweight quiescent state for | ||
| 300 | * all RCU flavors. | ||
| 301 | */ | ||
| 302 | void rcu_all_qs(void) | ||
| 303 | { | ||
| 304 | if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) | ||
| 305 | rcu_momentary_dyntick_idle(); | ||
| 306 | this_cpu_inc(rcu_qs_ctr); | ||
| 307 | } | ||
| 308 | EXPORT_SYMBOL_GPL(rcu_all_qs); | ||
| 309 | |||
| 287 | static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ | 310 | static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ |
| 288 | static long qhimark = 10000; /* If this many pending, ignore blimit. */ | 311 | static long qhimark = 10000; /* If this many pending, ignore blimit. */ |
| 289 | static long qlowmark = 100; /* Once only this many pending, use blimit. */ | 312 | static long qlowmark = 100; /* Once only this many pending, use blimit. */ |
| @@ -315,18 +338,54 @@ static void force_quiescent_state(struct rcu_state *rsp); | |||
| 315 | static int rcu_pending(void); | 338 | static int rcu_pending(void); |
| 316 | 339 | ||
| 317 | /* | 340 | /* |
| 318 | * Return the number of RCU-sched batches processed thus far for debug & stats. | 341 | * Return the number of RCU batches started thus far for debug & stats. |
| 342 | */ | ||
| 343 | unsigned long rcu_batches_started(void) | ||
| 344 | { | ||
| 345 | return rcu_state_p->gpnum; | ||
| 346 | } | ||
| 347 | EXPORT_SYMBOL_GPL(rcu_batches_started); | ||
| 348 | |||
| 349 | /* | ||
| 350 | * Return the number of RCU-sched batches started thus far for debug & stats. | ||
| 351 | */ | ||
| 352 | unsigned long rcu_batches_started_sched(void) | ||
| 353 | { | ||
| 354 | return rcu_sched_state.gpnum; | ||
| 355 | } | ||
| 356 | EXPORT_SYMBOL_GPL(rcu_batches_started_sched); | ||
| 357 | |||
| 358 | /* | ||
| 359 | * Return the number of RCU BH batches started thus far for debug & stats. | ||
| 319 | */ | 360 | */ |
| 320 | long rcu_batches_completed_sched(void) | 361 | unsigned long rcu_batches_started_bh(void) |
| 362 | { | ||
| 363 | return rcu_bh_state.gpnum; | ||
| 364 | } | ||
| 365 | EXPORT_SYMBOL_GPL(rcu_batches_started_bh); | ||
| 366 | |||
| 367 | /* | ||
| 368 | * Return the number of RCU batches completed thus far for debug & stats. | ||
| 369 | */ | ||
| 370 | unsigned long rcu_batches_completed(void) | ||
| 371 | { | ||
| 372 | return rcu_state_p->completed; | ||
| 373 | } | ||
| 374 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
| 375 | |||
| 376 | /* | ||
| 377 | * Return the number of RCU-sched batches completed thus far for debug & stats. | ||
| 378 | */ | ||
| 379 | unsigned long rcu_batches_completed_sched(void) | ||
| 321 | { | 380 | { |
| 322 | return rcu_sched_state.completed; | 381 | return rcu_sched_state.completed; |
| 323 | } | 382 | } |
| 324 | EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); | 383 | EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); |
| 325 | 384 | ||
| 326 | /* | 385 | /* |
| 327 | * Return the number of RCU BH batches processed thus far for debug & stats. | 386 | * Return the number of RCU BH batches completed thus far for debug & stats. |
| 328 | */ | 387 | */ |
| 329 | long rcu_batches_completed_bh(void) | 388 | unsigned long rcu_batches_completed_bh(void) |
| 330 | { | 389 | { |
| 331 | return rcu_bh_state.completed; | 390 | return rcu_bh_state.completed; |
| 332 | } | 391 | } |
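The accessors above switch from long to unsigned long because gpnum and completed are free-running counters: callers compare snapshots with modular arithmetic (ULONG_CMP_GE/ULONG_CMP_LT style) rather than signed subtraction, and the ->gpwrap handling later in this patch, which flags a CPU whose gpnum has fallen roughly ULONG_MAX/4 behind, relies on the same idea. A standalone sketch of one way to write the wrap-safe comparison (a model, not the kernel's actual macro):

#include <limits.h>
#include <stdio.h>

/* Wrap-safe "a >= b" for free-running counters: judge the modular
 * difference by which half of the unsigned range it lands in. */
static int cmp_ge(unsigned long a, unsigned long b)
{
        return a - b <= ULONG_MAX / 2;
}

int main(void)
{
        unsigned long before = ULONG_MAX - 1;   /* counter about to wrap */
        unsigned long after = before + 3;       /* three grace periods later */

        printf("elapsed: %lu\n", after - before);               /* 3 */
        printf("after >= before: %d\n", cmp_ge(after, before)); /* 1 */
        printf("before >= after: %d\n", cmp_ge(before, after)); /* 0 */
        return 0;
}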
| @@ -759,39 +818,71 @@ void rcu_irq_enter(void) | |||
| 759 | /** | 818 | /** |
| 760 | * rcu_nmi_enter - inform RCU of entry to NMI context | 819 | * rcu_nmi_enter - inform RCU of entry to NMI context |
| 761 | * | 820 | * |
| 762 | * If the CPU was idle with dynamic ticks active, and there is no | 821 | * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and |
| 763 | * irq handler running, this updates rdtp->dynticks_nmi to let the | 822 | * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know |
| 764 | * RCU grace-period handling know that the CPU is active. | 823 | * that the CPU is active. This implementation permits nested NMIs, as |
| 824 | * long as the nesting level does not overflow an int. (You will probably | ||
| 825 | * run out of stack space first.) | ||
| 765 | */ | 826 | */ |
| 766 | void rcu_nmi_enter(void) | 827 | void rcu_nmi_enter(void) |
| 767 | { | 828 | { |
| 768 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | 829 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
| 830 | int incby = 2; | ||
| 769 | 831 | ||
| 770 | if (rdtp->dynticks_nmi_nesting == 0 && | 832 | /* Complain about underflow. */ |
| 771 | (atomic_read(&rdtp->dynticks) & 0x1)) | 833 | WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0); |
| 772 | return; | 834 | |
| 773 | rdtp->dynticks_nmi_nesting++; | 835 | /* |
| 774 | smp_mb__before_atomic(); /* Force delay from prior write. */ | 836 | * If idle from RCU viewpoint, atomically increment ->dynticks |
| 775 | atomic_inc(&rdtp->dynticks); | 837 | * to mark non-idle and increment ->dynticks_nmi_nesting by one. |
| 776 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ | 838 | * Otherwise, increment ->dynticks_nmi_nesting by two. This means |
| 777 | smp_mb__after_atomic(); /* See above. */ | 839 | * if ->dynticks_nmi_nesting is equal to one, we are guaranteed |
| 778 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | 840 | * to be in the outermost NMI handler that interrupted an RCU-idle |
| 841 | * period (observation due to Andy Lutomirski). | ||
| 842 | */ | ||
| 843 | if (!(atomic_read(&rdtp->dynticks) & 0x1)) { | ||
| 844 | smp_mb__before_atomic(); /* Force delay from prior write. */ | ||
| 845 | atomic_inc(&rdtp->dynticks); | ||
| 846 | /* atomic_inc() before later RCU read-side crit sects */ | ||
| 847 | smp_mb__after_atomic(); /* See above. */ | ||
| 848 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | ||
| 849 | incby = 1; | ||
| 850 | } | ||
| 851 | rdtp->dynticks_nmi_nesting += incby; | ||
| 852 | barrier(); | ||
| 779 | } | 853 | } |
| 780 | 854 | ||
| 781 | /** | 855 | /** |
| 782 | * rcu_nmi_exit - inform RCU of exit from NMI context | 856 | * rcu_nmi_exit - inform RCU of exit from NMI context |
| 783 | * | 857 | * |
| 784 | * If the CPU was idle with dynamic ticks active, and there is no | 858 | * If we are returning from the outermost NMI handler that interrupted an |
| 785 | * irq handler running, this updates rdtp->dynticks_nmi to let the | 859 | * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting |
| 786 | * RCU grace-period handling know that the CPU is no longer active. | 860 | * to let the RCU grace-period handling know that the CPU is back to |
| 861 | * being RCU-idle. | ||
| 787 | */ | 862 | */ |
| 788 | void rcu_nmi_exit(void) | 863 | void rcu_nmi_exit(void) |
| 789 | { | 864 | { |
| 790 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | 865 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
| 791 | 866 | ||
| 792 | if (rdtp->dynticks_nmi_nesting == 0 || | 867 | /* |
| 793 | --rdtp->dynticks_nmi_nesting != 0) | 868 | * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. |
| 869 | * (We are exiting an NMI handler, so RCU better be paying attention | ||
| 870 | * to us!) | ||
| 871 | */ | ||
| 872 | WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0); | ||
| 873 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | ||
| 874 | |||
| 875 | /* | ||
| 876 | * If the nesting level is not 1, the CPU wasn't RCU-idle, so | ||
| 877 | * leave it in non-RCU-idle state. | ||
| 878 | */ | ||
| 879 | if (rdtp->dynticks_nmi_nesting != 1) { | ||
| 880 | rdtp->dynticks_nmi_nesting -= 2; | ||
| 794 | return; | 881 | return; |
| 882 | } | ||
| 883 | |||
| 884 | /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ | ||
| 885 | rdtp->dynticks_nmi_nesting = 0; | ||
| 795 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ | 886 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ |
| 796 | smp_mb__before_atomic(); /* See above. */ | 887 | smp_mb__before_atomic(); /* See above. */ |
| 797 | atomic_inc(&rdtp->dynticks); | 888 | atomic_inc(&rdtp->dynticks); |
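The rewritten rcu_nmi_enter()/rcu_nmi_exit() above encode "did this NMI interrupt an RCU-idle period?" in the nesting counter itself: entry from idle adds one (after marking the CPU non-idle), nested entries add two, and exits subtract two unless the count is exactly one, which can only be the outermost NMI that interrupted an RCU-idle period. A standalone model of just that bookkeeping (not kernel code; the real functions also order the ->dynticks updates with memory barriers):

#include <assert.h>
#include <stdio.h>

static int dynticks;            /* even = RCU-idle, odd = non-idle */
static int nmi_nesting;

static void nmi_enter(void)
{
        int incby = 2;

        if (!(dynticks & 0x1)) {        /* was RCU-idle: mark non-idle */
                dynticks++;
                incby = 1;
        }
        nmi_nesting += incby;
}

static void nmi_exit(void)
{
        assert(nmi_nesting > 0 && (dynticks & 0x1));
        if (nmi_nesting != 1) {         /* nested NMI: stay non-idle */
                nmi_nesting -= 2;
                return;
        }
        nmi_nesting = 0;                /* outermost: return to RCU-idle */
        dynticks++;
}

int main(void)
{
        nmi_enter();            /* from idle:  nesting = 1 */
        nmi_enter();            /* nested:     nesting = 3 */
        nmi_exit();             /* nested:     nesting = 1 */
        nmi_exit();             /* outermost:  nesting = 0, idle again */
        printf("nesting=%d idle=%d\n", nmi_nesting, !(dynticks & 0x1));
        return 0;
}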
| @@ -898,17 +989,14 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, | |||
| 898 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); | 989 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); |
| 899 | return 1; | 990 | return 1; |
| 900 | } else { | 991 | } else { |
| 992 | if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4, | ||
| 993 | rdp->mynode->gpnum)) | ||
| 994 | ACCESS_ONCE(rdp->gpwrap) = true; | ||
| 901 | return 0; | 995 | return 0; |
| 902 | } | 996 | } |
| 903 | } | 997 | } |
| 904 | 998 | ||
| 905 | /* | 999 | /* |
| 906 | * This function really isn't for public consumption, but RCU is special in | ||
| 907 | * that context switches can allow the state machine to make progress. | ||
| 908 | */ | ||
| 909 | extern void resched_cpu(int cpu); | ||
| 910 | |||
| 911 | /* | ||
| 912 | * Return true if the specified CPU has passed through a quiescent | 1000 | * Return true if the specified CPU has passed through a quiescent |
| 913 | * state by virtue of being in or having passed through a dynticks | 1001 |
| 914 | * idle state since the last call to dyntick_save_progress_counter() | 1002 | * idle state since the last call to dyntick_save_progress_counter() |
| @@ -1011,6 +1099,22 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) | |||
| 1011 | j1 = rcu_jiffies_till_stall_check(); | 1099 | j1 = rcu_jiffies_till_stall_check(); |
| 1012 | ACCESS_ONCE(rsp->jiffies_stall) = j + j1; | 1100 | ACCESS_ONCE(rsp->jiffies_stall) = j + j1; |
| 1013 | rsp->jiffies_resched = j + j1 / 2; | 1101 | rsp->jiffies_resched = j + j1 / 2; |
| 1102 | rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs); | ||
| 1103 | } | ||
| 1104 | |||
| 1105 | /* | ||
| 1106 | * Complain about starvation of grace-period kthread. | ||
| 1107 | */ | ||
| 1108 | static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) | ||
| 1109 | { | ||
| 1110 | unsigned long gpa; | ||
| 1111 | unsigned long j; | ||
| 1112 | |||
| 1113 | j = jiffies; | ||
| 1114 | gpa = ACCESS_ONCE(rsp->gp_activity); | ||
| 1115 | if (j - gpa > 2 * HZ) | ||
| 1116 | pr_err("%s kthread starved for %ld jiffies!\n", | ||
| 1117 | rsp->name, j - gpa); | ||
| 1014 | } | 1118 | } |
| 1015 | 1119 | ||
| 1016 | /* | 1120 | /* |
| @@ -1033,11 +1137,13 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) | |||
| 1033 | } | 1137 | } |
| 1034 | } | 1138 | } |
| 1035 | 1139 | ||
| 1036 | static void print_other_cpu_stall(struct rcu_state *rsp) | 1140 | static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) |
| 1037 | { | 1141 | { |
| 1038 | int cpu; | 1142 | int cpu; |
| 1039 | long delta; | 1143 | long delta; |
| 1040 | unsigned long flags; | 1144 | unsigned long flags; |
| 1145 | unsigned long gpa; | ||
| 1146 | unsigned long j; | ||
| 1041 | int ndetected = 0; | 1147 | int ndetected = 0; |
| 1042 | struct rcu_node *rnp = rcu_get_root(rsp); | 1148 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 1043 | long totqlen = 0; | 1149 | long totqlen = 0; |
| @@ -1075,30 +1181,34 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
| 1075 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1181 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 1076 | } | 1182 | } |
| 1077 | 1183 | ||
| 1078 | /* | ||
| 1079 | * Now rat on any tasks that got kicked up to the root rcu_node | ||
| 1080 | * due to CPU offlining. | ||
| 1081 | */ | ||
| 1082 | rnp = rcu_get_root(rsp); | ||
| 1083 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 1084 | ndetected += rcu_print_task_stall(rnp); | ||
| 1085 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1086 | |||
| 1087 | print_cpu_stall_info_end(); | 1184 | print_cpu_stall_info_end(); |
| 1088 | for_each_possible_cpu(cpu) | 1185 | for_each_possible_cpu(cpu) |
| 1089 | totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; | 1186 | totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; |
| 1090 | pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", | 1187 | pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", |
| 1091 | smp_processor_id(), (long)(jiffies - rsp->gp_start), | 1188 | smp_processor_id(), (long)(jiffies - rsp->gp_start), |
| 1092 | (long)rsp->gpnum, (long)rsp->completed, totqlen); | 1189 | (long)rsp->gpnum, (long)rsp->completed, totqlen); |
| 1093 | if (ndetected == 0) | 1190 | if (ndetected) { |
| 1094 | pr_err("INFO: Stall ended before state dump start\n"); | ||
| 1095 | else | ||
| 1096 | rcu_dump_cpu_stacks(rsp); | 1191 | rcu_dump_cpu_stacks(rsp); |
| 1192 | } else { | ||
| 1193 | if (ACCESS_ONCE(rsp->gpnum) != gpnum || | ||
| 1194 | ACCESS_ONCE(rsp->completed) == gpnum) { | ||
| 1195 | pr_err("INFO: Stall ended before state dump start\n"); | ||
| 1196 | } else { | ||
| 1197 | j = jiffies; | ||
| 1198 | gpa = ACCESS_ONCE(rsp->gp_activity); | ||
| 1199 | pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n", | ||
| 1200 | rsp->name, j - gpa, j, gpa, | ||
| 1201 | jiffies_till_next_fqs); | ||
| 1202 | /* In this case, the current CPU might be at fault. */ | ||
| 1203 | sched_show_task(current); | ||
| 1204 | } | ||
| 1205 | } | ||
| 1097 | 1206 | ||
| 1098 | /* Complain about tasks blocking the grace period. */ | 1207 | /* Complain about tasks blocking the grace period. */ |
| 1099 | |||
| 1100 | rcu_print_detail_task_stall(rsp); | 1208 | rcu_print_detail_task_stall(rsp); |
| 1101 | 1209 | ||
| 1210 | rcu_check_gp_kthread_starvation(rsp); | ||
| 1211 | |||
| 1102 | force_quiescent_state(rsp); /* Kick them all. */ | 1212 | force_quiescent_state(rsp); /* Kick them all. */ |
| 1103 | } | 1213 | } |
| 1104 | 1214 | ||
| @@ -1123,6 +1233,9 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
| 1123 | pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", | 1233 | pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", |
| 1124 | jiffies - rsp->gp_start, | 1234 | jiffies - rsp->gp_start, |
| 1125 | (long)rsp->gpnum, (long)rsp->completed, totqlen); | 1235 | (long)rsp->gpnum, (long)rsp->completed, totqlen); |
| 1236 | |||
| 1237 | rcu_check_gp_kthread_starvation(rsp); | ||
| 1238 | |||
| 1126 | rcu_dump_cpu_stacks(rsp); | 1239 | rcu_dump_cpu_stacks(rsp); |
| 1127 | 1240 | ||
| 1128 | raw_spin_lock_irqsave(&rnp->lock, flags); | 1241 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| @@ -1193,7 +1306,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1193 | ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { | 1306 | ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { |
| 1194 | 1307 | ||
| 1195 | /* They had a few time units to dump stack, so complain. */ | 1308 | /* They had a few time units to dump stack, so complain. */ |
| 1196 | print_other_cpu_stall(rsp); | 1309 | print_other_cpu_stall(rsp, gpnum); |
| 1197 | } | 1310 | } |
| 1198 | } | 1311 | } |
| 1199 | 1312 | ||
| @@ -1530,7 +1643,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 1530 | bool ret; | 1643 | bool ret; |
| 1531 | 1644 | ||
| 1532 | /* Handle the ends of any preceding grace periods first. */ | 1645 | /* Handle the ends of any preceding grace periods first. */ |
| 1533 | if (rdp->completed == rnp->completed) { | 1646 | if (rdp->completed == rnp->completed && |
| 1647 | !unlikely(ACCESS_ONCE(rdp->gpwrap))) { | ||
| 1534 | 1648 | ||
| 1535 | /* No grace period end, so just accelerate recent callbacks. */ | 1649 | /* No grace period end, so just accelerate recent callbacks. */ |
| 1536 | ret = rcu_accelerate_cbs(rsp, rnp, rdp); | 1650 | ret = rcu_accelerate_cbs(rsp, rnp, rdp); |
| @@ -1545,7 +1659,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 1545 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); | 1659 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); |
| 1546 | } | 1660 | } |
| 1547 | 1661 | ||
| 1548 | if (rdp->gpnum != rnp->gpnum) { | 1662 | if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) { |
| 1549 | /* | 1663 | /* |
| 1550 | * If the current grace period is waiting for this CPU, | 1664 | * If the current grace period is waiting for this CPU, |
| 1551 | * set up to detect a quiescent state, otherwise don't | 1665 | * set up to detect a quiescent state, otherwise don't |
| @@ -1554,8 +1668,10 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 1554 | rdp->gpnum = rnp->gpnum; | 1668 | rdp->gpnum = rnp->gpnum; |
| 1555 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); | 1669 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); |
| 1556 | rdp->passed_quiesce = 0; | 1670 | rdp->passed_quiesce = 0; |
| 1671 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); | ||
| 1557 | rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); | 1672 | rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); |
| 1558 | zero_cpu_stall_ticks(rdp); | 1673 | zero_cpu_stall_ticks(rdp); |
| 1674 | ACCESS_ONCE(rdp->gpwrap) = false; | ||
| 1559 | } | 1675 | } |
| 1560 | return ret; | 1676 | return ret; |
| 1561 | } | 1677 | } |
| @@ -1569,7 +1685,8 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1569 | local_irq_save(flags); | 1685 | local_irq_save(flags); |
| 1570 | rnp = rdp->mynode; | 1686 | rnp = rdp->mynode; |
| 1571 | if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) && | 1687 | if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) && |
| 1572 | rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */ | 1688 | rdp->completed == ACCESS_ONCE(rnp->completed) && |
| 1689 | !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */ | ||
| 1573 | !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ | 1690 | !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ |
| 1574 | local_irq_restore(flags); | 1691 | local_irq_restore(flags); |
| 1575 | return; | 1692 | return; |
| @@ -1589,6 +1706,7 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
| 1589 | struct rcu_data *rdp; | 1706 | struct rcu_data *rdp; |
| 1590 | struct rcu_node *rnp = rcu_get_root(rsp); | 1707 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 1591 | 1708 | ||
| 1709 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | ||
| 1592 | rcu_bind_gp_kthread(); | 1710 | rcu_bind_gp_kthread(); |
| 1593 | raw_spin_lock_irq(&rnp->lock); | 1711 | raw_spin_lock_irq(&rnp->lock); |
| 1594 | smp_mb__after_unlock_lock(); | 1712 | smp_mb__after_unlock_lock(); |
| @@ -1649,6 +1767,7 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
| 1649 | rnp->grphi, rnp->qsmask); | 1767 | rnp->grphi, rnp->qsmask); |
| 1650 | raw_spin_unlock_irq(&rnp->lock); | 1768 | raw_spin_unlock_irq(&rnp->lock); |
| 1651 | cond_resched_rcu_qs(); | 1769 | cond_resched_rcu_qs(); |
| 1770 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | ||
| 1652 | } | 1771 | } |
| 1653 | 1772 | ||
| 1654 | mutex_unlock(&rsp->onoff_mutex); | 1773 | mutex_unlock(&rsp->onoff_mutex); |
| @@ -1665,6 +1784,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) | |||
| 1665 | unsigned long maxj; | 1784 | unsigned long maxj; |
| 1666 | struct rcu_node *rnp = rcu_get_root(rsp); | 1785 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 1667 | 1786 | ||
| 1787 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | ||
| 1668 | rsp->n_force_qs++; | 1788 | rsp->n_force_qs++; |
| 1669 | if (fqs_state == RCU_SAVE_DYNTICK) { | 1789 | if (fqs_state == RCU_SAVE_DYNTICK) { |
| 1670 | /* Collect dyntick-idle snapshots. */ | 1790 | /* Collect dyntick-idle snapshots. */ |
| @@ -1703,6 +1823,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
| 1703 | struct rcu_data *rdp; | 1823 | struct rcu_data *rdp; |
| 1704 | struct rcu_node *rnp = rcu_get_root(rsp); | 1824 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 1705 | 1825 | ||
| 1826 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | ||
| 1706 | raw_spin_lock_irq(&rnp->lock); | 1827 | raw_spin_lock_irq(&rnp->lock); |
| 1707 | smp_mb__after_unlock_lock(); | 1828 | smp_mb__after_unlock_lock(); |
| 1708 | gp_duration = jiffies - rsp->gp_start; | 1829 | gp_duration = jiffies - rsp->gp_start; |
| @@ -1739,6 +1860,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
| 1739 | nocb += rcu_future_gp_cleanup(rsp, rnp); | 1860 | nocb += rcu_future_gp_cleanup(rsp, rnp); |
| 1740 | raw_spin_unlock_irq(&rnp->lock); | 1861 | raw_spin_unlock_irq(&rnp->lock); |
| 1741 | cond_resched_rcu_qs(); | 1862 | cond_resched_rcu_qs(); |
| 1863 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | ||
| 1742 | } | 1864 | } |
| 1743 | rnp = rcu_get_root(rsp); | 1865 | rnp = rcu_get_root(rsp); |
| 1744 | raw_spin_lock_irq(&rnp->lock); | 1866 | raw_spin_lock_irq(&rnp->lock); |
| @@ -1788,6 +1910,7 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
| 1788 | if (rcu_gp_init(rsp)) | 1910 | if (rcu_gp_init(rsp)) |
| 1789 | break; | 1911 | break; |
| 1790 | cond_resched_rcu_qs(); | 1912 | cond_resched_rcu_qs(); |
| 1913 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | ||
| 1791 | WARN_ON(signal_pending(current)); | 1914 | WARN_ON(signal_pending(current)); |
| 1792 | trace_rcu_grace_period(rsp->name, | 1915 | trace_rcu_grace_period(rsp->name, |
| 1793 | ACCESS_ONCE(rsp->gpnum), | 1916 | ACCESS_ONCE(rsp->gpnum), |
| @@ -1831,9 +1954,11 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
| 1831 | ACCESS_ONCE(rsp->gpnum), | 1954 | ACCESS_ONCE(rsp->gpnum), |
| 1832 | TPS("fqsend")); | 1955 | TPS("fqsend")); |
| 1833 | cond_resched_rcu_qs(); | 1956 | cond_resched_rcu_qs(); |
| 1957 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | ||
| 1834 | } else { | 1958 | } else { |
| 1835 | /* Deal with stray signal. */ | 1959 | /* Deal with stray signal. */ |
| 1836 | cond_resched_rcu_qs(); | 1960 | cond_resched_rcu_qs(); |
| 1961 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | ||
| 1837 | WARN_ON(signal_pending(current)); | 1962 | WARN_ON(signal_pending(current)); |
| 1838 | trace_rcu_grace_period(rsp->name, | 1963 | trace_rcu_grace_period(rsp->name, |
| 1839 | ACCESS_ONCE(rsp->gpnum), | 1964 | ACCESS_ONCE(rsp->gpnum), |
| @@ -2010,8 +2135,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 2010 | rnp = rdp->mynode; | 2135 | rnp = rdp->mynode; |
| 2011 | raw_spin_lock_irqsave(&rnp->lock, flags); | 2136 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 2012 | smp_mb__after_unlock_lock(); | 2137 | smp_mb__after_unlock_lock(); |
| 2013 | if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || | 2138 | if ((rdp->passed_quiesce == 0 && |
| 2014 | rnp->completed == rnp->gpnum) { | 2139 | rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || |
| 2140 | rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || | ||
| 2141 | rdp->gpwrap) { | ||
| 2015 | 2142 | ||
| 2016 | /* | 2143 | /* |
| 2017 | * The grace period in which this quiescent state was | 2144 | * The grace period in which this quiescent state was |
| @@ -2020,6 +2147,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 2020 | * within the current grace period. | 2147 | * within the current grace period. |
| 2021 | */ | 2148 | */ |
| 2022 | rdp->passed_quiesce = 0; /* need qs for new gp. */ | 2149 | rdp->passed_quiesce = 0; /* need qs for new gp. */ |
| 2150 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); | ||
| 2023 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 2151 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 2024 | return; | 2152 | return; |
| 2025 | } | 2153 | } |
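The new rcu_qs_ctr_snap comparisons above, and the matching ones in rcu_check_quiescent_state() and __rcu_pending() further down, all implement the same test: a CPU is credited with a quiescent state either when it explicitly reports one or when its per-CPU counter has moved past the snapshot taken when it noticed the grace period. A minimal standalone model of that test (illustrative names only, not kernel code):

#include <stdbool.h>
#include <stdio.h>

static unsigned long qs_ctr;            /* bumped by rcu_all_qs()-style events */
static unsigned long qs_ctr_snap;       /* snapshot taken when the GP is noticed */
static bool passed_quiesce;             /* explicit quiescent-state report */

static bool cpu_has_quiesced(void)
{
        return passed_quiesce || qs_ctr != qs_ctr_snap;
}

int main(void)
{
        qs_ctr_snap = qs_ctr;                   /* new grace period starts */
        printf("%d\n", cpu_has_quiesced());     /* 0: nothing seen yet */
        qs_ctr++;                               /* lightweight QS happened */
        printf("%d\n", cpu_has_quiesced());     /* 1: counter moved past snapshot */
        return 0;
}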
| @@ -2064,7 +2192,8 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 2064 | * Was there a quiescent state since the beginning of the grace | 2192 | * Was there a quiescent state since the beginning of the grace |
| 2065 | * period? If no, then exit and wait for the next call. | 2193 | * period? If no, then exit and wait for the next call. |
| 2066 | */ | 2194 | */ |
| 2067 | if (!rdp->passed_quiesce) | 2195 | if (!rdp->passed_quiesce && |
| 2196 | rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) | ||
| 2068 | return; | 2197 | return; |
| 2069 | 2198 | ||
| 2070 | /* | 2199 | /* |
| @@ -2195,6 +2324,46 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | |||
| 2195 | } | 2324 | } |
| 2196 | 2325 | ||
| 2197 | /* | 2326 | /* |
| 2327 | * All CPUs for the specified rcu_node structure have gone offline, | ||
| 2328 | * and all tasks that were preempted within an RCU read-side critical | ||
| 2329 | * section while running on one of those CPUs have since exited their RCU | ||
| 2330 | * read-side critical section. Some other CPU is reporting this fact with | ||
| 2331 | * the specified rcu_node structure's ->lock held and interrupts disabled. | ||
| 2332 | * This function therefore goes up the tree of rcu_node structures, | ||
| 2333 | * clearing the corresponding bits in the ->qsmaskinit fields. Note that | ||
| 2334 | * the leaf rcu_node structure's ->qsmaskinit field has already been | ||
| 2335 | * updated. | ||
| 2336 | * | ||
| 2337 | * This function does check that the specified rcu_node structure has | ||
| 2338 | * all CPUs offline and no blocked tasks, so it is OK to invoke it | ||
| 2339 | * prematurely. That said, invoking it after the fact will cost you | ||
| 2340 | * a needless lock acquisition. So once it has done its work, don't | ||
| 2341 | * invoke it again. | ||
| 2342 | */ | ||
| 2343 | static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) | ||
| 2344 | { | ||
| 2345 | long mask; | ||
| 2346 | struct rcu_node *rnp = rnp_leaf; | ||
| 2347 | |||
| 2348 | if (rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) | ||
| 2349 | return; | ||
| 2350 | for (;;) { | ||
| 2351 | mask = rnp->grpmask; | ||
| 2352 | rnp = rnp->parent; | ||
| 2353 | if (!rnp) | ||
| 2354 | break; | ||
| 2355 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | ||
| 2356 | smp_mb__after_unlock_lock(); /* GP memory ordering. */ | ||
| 2357 | rnp->qsmaskinit &= ~mask; | ||
| 2358 | if (rnp->qsmaskinit) { | ||
| 2359 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
| 2360 | return; | ||
| 2361 | } | ||
| 2362 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
| 2363 | } | ||
| 2364 | } | ||
| 2365 | |||
| 2366 | /* | ||
| 2198 | * The CPU has been completely removed, and some other CPU is reporting | 2367 | * The CPU has been completely removed, and some other CPU is reporting |
| 2199 | * this fact from process context. Do the remainder of the cleanup, | 2368 | * this fact from process context. Do the remainder of the cleanup, |
| 2200 | * including orphaning the outgoing CPU's RCU callbacks, and also | 2369 | * including orphaning the outgoing CPU's RCU callbacks, and also |
| @@ -2204,8 +2373,6 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | |||
| 2204 | static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | 2373 | static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) |
| 2205 | { | 2374 | { |
| 2206 | unsigned long flags; | 2375 | unsigned long flags; |
| 2207 | unsigned long mask; | ||
| 2208 | int need_report = 0; | ||
| 2209 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 2376 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
| 2210 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ | 2377 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ |
| 2211 | 2378 | ||
| @@ -2219,40 +2386,15 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
| 2219 | /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ | 2386 | /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ |
| 2220 | rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); | 2387 | rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); |
| 2221 | rcu_adopt_orphan_cbs(rsp, flags); | 2388 | rcu_adopt_orphan_cbs(rsp, flags); |
| 2389 | raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); | ||
| 2222 | 2390 | ||
| 2223 | /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ | 2391 | /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ |
| 2224 | mask = rdp->grpmask; /* rnp->grplo is constant. */ | 2392 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 2225 | do { | 2393 | smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ |
| 2226 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 2394 | rnp->qsmaskinit &= ~rdp->grpmask; |
| 2227 | smp_mb__after_unlock_lock(); | 2395 | if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp)) |
| 2228 | rnp->qsmaskinit &= ~mask; | 2396 | rcu_cleanup_dead_rnp(rnp); |
| 2229 | if (rnp->qsmaskinit != 0) { | 2397 | rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags); /* Rlses rnp->lock. */ |
| 2230 | if (rnp != rdp->mynode) | ||
| 2231 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
| 2232 | break; | ||
| 2233 | } | ||
| 2234 | if (rnp == rdp->mynode) | ||
| 2235 | need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); | ||
| 2236 | else | ||
| 2237 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
| 2238 | mask = rnp->grpmask; | ||
| 2239 | rnp = rnp->parent; | ||
| 2240 | } while (rnp != NULL); | ||
| 2241 | |||
| 2242 | /* | ||
| 2243 | * We still hold the leaf rcu_node structure lock here, and | ||
| 2244 | * irqs are still disabled. The reason for this subterfuge is | ||
| 2245 | * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock | ||
| 2246 | * held leads to deadlock. | ||
| 2247 | */ | ||
| 2248 | raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */ | ||
| 2249 | rnp = rdp->mynode; | ||
| 2250 | if (need_report & RCU_OFL_TASKS_NORM_GP) | ||
| 2251 | rcu_report_unblock_qs_rnp(rnp, flags); | ||
| 2252 | else | ||
| 2253 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 2254 | if (need_report & RCU_OFL_TASKS_EXP_GP) | ||
| 2255 | rcu_report_exp_rnp(rsp, rnp, true); | ||
| 2256 | WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, | 2398 | WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, |
| 2257 | "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", | 2399 | "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", |
| 2258 | cpu, rdp->qlen, rdp->nxtlist); | 2400 | cpu, rdp->qlen, rdp->nxtlist); |
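rcu_cleanup_dead_rnp() and its call site above replace the old open-coded loop: clear the departing subtree's bit in the parent's ->qsmaskinit, and keep climbing only while entire subtrees have emptied out. A standalone model of that upward propagation (not kernel code; locking and the blocked-tasks check are omitted):

#include <stdio.h>

struct node {
        struct node *parent;
        unsigned long qsmaskinit;       /* one bit per child still online */
        unsigned long grpmask;          /* this node's bit in its parent */
};

/* Clear the departing subtree's bit upward, stopping as soon as a level
 * still has other online children (or we fall off the root). */
static void cleanup_dead_node(struct node *leaf)
{
        struct node *np = leaf;
        unsigned long mask;

        if (np->qsmaskinit)             /* leaf still has online CPUs */
                return;
        for (;;) {
                mask = np->grpmask;
                np = np->parent;
                if (!np)
                        break;
                np->qsmaskinit &= ~mask;
                if (np->qsmaskinit)     /* siblings remain: stop here */
                        return;
        }
}

int main(void)
{
        struct node root = { NULL, 0x3, 0x0 };
        struct node a = { &root, 0x0, 0x1 };    /* all of a's CPUs offline */
        struct node b = { &root, 0x1, 0x2 };    /* b still has a CPU online */

        cleanup_dead_node(&a);
        printf("root mask: %#lx\n", root.qsmaskinit);   /* 0x2: only b left */
        (void)b;
        return 0;
}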
| @@ -2268,6 +2410,10 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | |||
| 2268 | { | 2410 | { |
| 2269 | } | 2411 | } |
| 2270 | 2412 | ||
| 2413 | static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) | ||
| 2414 | { | ||
| 2415 | } | ||
| 2416 | |||
| 2271 | static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | 2417 | static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) |
| 2272 | { | 2418 | { |
| 2273 | } | 2419 | } |
| @@ -2464,12 +2610,6 @@ static void force_qs_rnp(struct rcu_state *rsp, | |||
| 2464 | } | 2610 | } |
| 2465 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 2611 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 2466 | } | 2612 | } |
| 2467 | rnp = rcu_get_root(rsp); | ||
| 2468 | if (rnp->qsmask == 0) { | ||
| 2469 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 2470 | smp_mb__after_unlock_lock(); | ||
| 2471 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ | ||
| 2472 | } | ||
| 2473 | } | 2613 | } |
| 2474 | 2614 | ||
| 2475 | /* | 2615 | /* |
| @@ -2569,7 +2709,7 @@ static void rcu_process_callbacks(struct softirq_action *unused) | |||
| 2569 | * Schedule RCU callback invocation. If the specified type of RCU | 2709 | * Schedule RCU callback invocation. If the specified type of RCU |
| 2570 | * does not support RCU priority boosting, just do a direct call, | 2710 | * does not support RCU priority boosting, just do a direct call, |
| 2571 | * otherwise wake up the per-CPU kernel kthread. Note that because we | 2711 | * otherwise wake up the per-CPU kernel kthread. Note that because we |
| 2572 | * are running on the current CPU with interrupts disabled, the | 2712 | * are running on the current CPU with softirqs disabled, the |
| 2573 | * rcu_cpu_kthread_task cannot disappear out from under us. | 2713 | * rcu_cpu_kthread_task cannot disappear out from under us. |
| 2574 | */ | 2714 | */ |
| 2575 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | 2715 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) |
| @@ -3109,9 +3249,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 3109 | 3249 | ||
| 3110 | /* Is the RCU core waiting for a quiescent state from this CPU? */ | 3250 | /* Is the RCU core waiting for a quiescent state from this CPU? */ |
| 3111 | if (rcu_scheduler_fully_active && | 3251 | if (rcu_scheduler_fully_active && |
| 3112 | rdp->qs_pending && !rdp->passed_quiesce) { | 3252 | rdp->qs_pending && !rdp->passed_quiesce && |
| 3253 | rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { | ||
| 3113 | rdp->n_rp_qs_pending++; | 3254 | rdp->n_rp_qs_pending++; |
| 3114 | } else if (rdp->qs_pending && rdp->passed_quiesce) { | 3255 | } else if (rdp->qs_pending && |
| 3256 | (rdp->passed_quiesce || | ||
| 3257 | rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { | ||
| 3115 | rdp->n_rp_report_qs++; | 3258 | rdp->n_rp_report_qs++; |
| 3116 | return 1; | 3259 | return 1; |
| 3117 | } | 3260 | } |
| @@ -3135,7 +3278,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 3135 | } | 3278 | } |
| 3136 | 3279 | ||
| 3137 | /* Has a new RCU grace period started? */ | 3280 | /* Has a new RCU grace period started? */ |
| 3138 | if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */ | 3281 | if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum || |
| 3282 | unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */ | ||
| 3139 | rdp->n_rp_gp_started++; | 3283 | rdp->n_rp_gp_started++; |
| 3140 | return 1; | 3284 | return 1; |
| 3141 | } | 3285 | } |
| @@ -3318,6 +3462,7 @@ static void _rcu_barrier(struct rcu_state *rsp) | |||
| 3318 | } else { | 3462 | } else { |
| 3319 | _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, | 3463 | _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, |
| 3320 | rsp->n_barrier_done); | 3464 | rsp->n_barrier_done); |
| 3465 | smp_mb__before_atomic(); | ||
| 3321 | atomic_inc(&rsp->barrier_cpu_count); | 3466 | atomic_inc(&rsp->barrier_cpu_count); |
| 3322 | __call_rcu(&rdp->barrier_head, | 3467 | __call_rcu(&rdp->barrier_head, |
| 3323 | rcu_barrier_callback, rsp, cpu, 0); | 3468 | rcu_barrier_callback, rsp, cpu, 0); |
| @@ -3385,9 +3530,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
| 3385 | /* Set up local state, ensuring consistent view of global state. */ | 3530 | /* Set up local state, ensuring consistent view of global state. */ |
| 3386 | raw_spin_lock_irqsave(&rnp->lock, flags); | 3531 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 3387 | rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); | 3532 | rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); |
| 3388 | init_callback_list(rdp); | ||
| 3389 | rdp->qlen_lazy = 0; | ||
| 3390 | ACCESS_ONCE(rdp->qlen) = 0; | ||
| 3391 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); | 3533 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); |
| 3392 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); | 3534 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); |
| 3393 | WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); | 3535 | WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); |
| @@ -3444,6 +3586,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
| 3444 | rdp->gpnum = rnp->completed; | 3586 | rdp->gpnum = rnp->completed; |
| 3445 | rdp->completed = rnp->completed; | 3587 | rdp->completed = rnp->completed; |
| 3446 | rdp->passed_quiesce = 0; | 3588 | rdp->passed_quiesce = 0; |
| 3589 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); | ||
| 3447 | rdp->qs_pending = 0; | 3590 | rdp->qs_pending = 0; |
| 3448 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); | 3591 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); |
| 3449 | } | 3592 | } |
| @@ -3535,17 +3678,35 @@ static int rcu_pm_notify(struct notifier_block *self, | |||
| 3535 | static int __init rcu_spawn_gp_kthread(void) | 3678 | static int __init rcu_spawn_gp_kthread(void) |
| 3536 | { | 3679 | { |
| 3537 | unsigned long flags; | 3680 | unsigned long flags; |
| 3681 | int kthread_prio_in = kthread_prio; | ||
| 3538 | struct rcu_node *rnp; | 3682 | struct rcu_node *rnp; |
| 3539 | struct rcu_state *rsp; | 3683 | struct rcu_state *rsp; |
| 3684 | struct sched_param sp; | ||
| 3540 | struct task_struct *t; | 3685 | struct task_struct *t; |
| 3541 | 3686 | ||
| 3687 | /* Force priority into range. */ | ||
| 3688 | if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1) | ||
| 3689 | kthread_prio = 1; | ||
| 3690 | else if (kthread_prio < 0) | ||
| 3691 | kthread_prio = 0; | ||
| 3692 | else if (kthread_prio > 99) | ||
| 3693 | kthread_prio = 99; | ||
| 3694 | if (kthread_prio != kthread_prio_in) | ||
| 3695 | pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n", | ||
| 3696 | kthread_prio, kthread_prio_in); | ||
| 3697 | |||
| 3542 | rcu_scheduler_fully_active = 1; | 3698 | rcu_scheduler_fully_active = 1; |
| 3543 | for_each_rcu_flavor(rsp) { | 3699 | for_each_rcu_flavor(rsp) { |
| 3544 | t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name); | 3700 | t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name); |
| 3545 | BUG_ON(IS_ERR(t)); | 3701 | BUG_ON(IS_ERR(t)); |
| 3546 | rnp = rcu_get_root(rsp); | 3702 | rnp = rcu_get_root(rsp); |
| 3547 | raw_spin_lock_irqsave(&rnp->lock, flags); | 3703 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 3548 | rsp->gp_kthread = t; | 3704 | rsp->gp_kthread = t; |
| 3705 | if (kthread_prio) { | ||
| 3706 | sp.sched_priority = kthread_prio; | ||
| 3707 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | ||
| 3708 | } | ||
| 3709 | wake_up_process(t); | ||
| 3549 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 3710 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 3550 | } | 3711 | } |
| 3551 | rcu_spawn_nocb_kthreads(); | 3712 | rcu_spawn_nocb_kthreads(); |
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8e7b1843896e..119de399eb2f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
| @@ -27,7 +27,6 @@ | |||
| 27 | #include <linux/threads.h> | 27 | #include <linux/threads.h> |
| 28 | #include <linux/cpumask.h> | 28 | #include <linux/cpumask.h> |
| 29 | #include <linux/seqlock.h> | 29 | #include <linux/seqlock.h> |
| 30 | #include <linux/irq_work.h> | ||
| 31 | 30 | ||
| 32 | /* | 31 | /* |
| 33 | * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and | 32 | * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and |
| @@ -172,11 +171,6 @@ struct rcu_node { | |||
| 172 | /* queued on this rcu_node structure that */ | 171 | /* queued on this rcu_node structure that */ |
| 173 | /* are blocking the current grace period, */ | 172 | /* are blocking the current grace period, */ |
| 174 | /* there can be no such task. */ | 173 | /* there can be no such task. */ |
| 175 | struct completion boost_completion; | ||
| 176 | /* Used to ensure that the rt_mutex used */ | ||
| 177 | /* to carry out the boosting is fully */ | ||
| 178 | /* released with no future boostee accesses */ | ||
| 179 | /* before that rt_mutex is re-initialized. */ | ||
| 180 | struct rt_mutex boost_mtx; | 174 | struct rt_mutex boost_mtx; |
| 181 | /* Used only for the priority-boosting */ | 175 | /* Used only for the priority-boosting */ |
| 182 | /* side effect, not as a lock. */ | 176 | /* side effect, not as a lock. */ |
| @@ -257,9 +251,12 @@ struct rcu_data { | |||
| 257 | /* in order to detect GP end. */ | 251 | /* in order to detect GP end. */ |
| 258 | unsigned long gpnum; /* Highest gp number that this CPU */ | 252 | unsigned long gpnum; /* Highest gp number that this CPU */ |
| 259 | /* is aware of having started. */ | 253 | /* is aware of having started. */ |
| 254 | unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ | ||
| 255 | /* for rcu_all_qs() invocations. */ | ||
| 260 | bool passed_quiesce; /* User-mode/idle loop etc. */ | 256 | bool passed_quiesce; /* User-mode/idle loop etc. */ |
| 261 | bool qs_pending; /* Core waits for quiesc state. */ | 257 | bool qs_pending; /* Core waits for quiesc state. */ |
| 262 | bool beenonline; /* CPU online at least once. */ | 258 | bool beenonline; /* CPU online at least once. */ |
| 259 | bool gpwrap; /* Possible gpnum/completed wrap. */ | ||
| 263 | struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ | 260 | struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ |
| 264 | unsigned long grpmask; /* Mask to apply to leaf qsmask. */ | 261 | unsigned long grpmask; /* Mask to apply to leaf qsmask. */ |
| 265 | #ifdef CONFIG_RCU_CPU_STALL_INFO | 262 | #ifdef CONFIG_RCU_CPU_STALL_INFO |
| @@ -340,14 +337,10 @@ struct rcu_data { | |||
| 340 | #ifdef CONFIG_RCU_NOCB_CPU | 337 | #ifdef CONFIG_RCU_NOCB_CPU |
| 341 | struct rcu_head *nocb_head; /* CBs waiting for kthread. */ | 338 | struct rcu_head *nocb_head; /* CBs waiting for kthread. */ |
| 342 | struct rcu_head **nocb_tail; | 339 | struct rcu_head **nocb_tail; |
| 343 | atomic_long_t nocb_q_count; /* # CBs waiting for kthread */ | 340 | atomic_long_t nocb_q_count; /* # CBs waiting for nocb */ |
| 344 | atomic_long_t nocb_q_count_lazy; /* (approximate). */ | 341 | atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ |
| 345 | struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ | 342 | struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ |
| 346 | struct rcu_head **nocb_follower_tail; | 343 | struct rcu_head **nocb_follower_tail; |
| 347 | atomic_long_t nocb_follower_count; /* # CBs ready to invoke. */ | ||
| 348 | atomic_long_t nocb_follower_count_lazy; /* (approximate). */ | ||
| 349 | int nocb_p_count; /* # CBs being invoked by kthread */ | ||
| 350 | int nocb_p_count_lazy; /* (approximate). */ | ||
| 351 | wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ | 344 | wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ |
| 352 | struct task_struct *nocb_kthread; | 345 | struct task_struct *nocb_kthread; |
| 353 | int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ | 346 | int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ |
| @@ -356,8 +349,6 @@ struct rcu_data { | |||
| 356 | struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; | 349 | struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; |
| 357 | /* CBs waiting for GP. */ | 350 | /* CBs waiting for GP. */ |
| 358 | struct rcu_head **nocb_gp_tail; | 351 | struct rcu_head **nocb_gp_tail; |
| 359 | long nocb_gp_count; | ||
| 360 | long nocb_gp_count_lazy; | ||
| 361 | bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */ | 352 | bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */ |
| 362 | struct rcu_data *nocb_next_follower; | 353 | struct rcu_data *nocb_next_follower; |
| 363 | /* Next follower in wakeup chain. */ | 354 | /* Next follower in wakeup chain. */ |
| @@ -488,10 +479,14 @@ struct rcu_state { | |||
| 488 | /* due to no GP active. */ | 479 | /* due to no GP active. */ |
| 489 | unsigned long gp_start; /* Time at which GP started, */ | 480 | unsigned long gp_start; /* Time at which GP started, */ |
| 490 | /* but in jiffies. */ | 481 | /* but in jiffies. */ |
| 482 | unsigned long gp_activity; /* Time of last GP kthread */ | ||
| 483 | /* activity in jiffies. */ | ||
| 491 | unsigned long jiffies_stall; /* Time at which to check */ | 484 | unsigned long jiffies_stall; /* Time at which to check */ |
| 492 | /* for CPU stalls. */ | 485 | /* for CPU stalls. */ |
| 493 | unsigned long jiffies_resched; /* Time at which to resched */ | 486 | unsigned long jiffies_resched; /* Time at which to resched */ |
| 494 | /* a reluctant CPU. */ | 487 | /* a reluctant CPU. */ |
| 488 | unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */ | ||
| 489 | /* GP start. */ | ||
| 495 | unsigned long gp_max; /* Maximum GP duration in */ | 490 | unsigned long gp_max; /* Maximum GP duration in */ |
| 496 | /* jiffies. */ | 491 | /* jiffies. */ |
| 497 | const char *name; /* Name of structure. */ | 492 | const char *name; /* Name of structure. */ |
| @@ -514,13 +509,6 @@ extern struct list_head rcu_struct_flavors; | |||
| 514 | #define for_each_rcu_flavor(rsp) \ | 509 | #define for_each_rcu_flavor(rsp) \ |
| 515 | list_for_each_entry((rsp), &rcu_struct_flavors, flavors) | 510 | list_for_each_entry((rsp), &rcu_struct_flavors, flavors) |
| 516 | 511 | ||
| 517 | /* Return values for rcu_preempt_offline_tasks(). */ | ||
| 518 | |||
| 519 | #define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ | ||
| 520 | /* GP were moved to root. */ | ||
| 521 | #define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ | ||
| 522 | /* GP were moved to root. */ | ||
| 523 | |||
| 524 | /* | 512 | /* |
| 525 | * RCU implementation internal declarations: | 513 | * RCU implementation internal declarations: |
| 526 | */ | 514 | */ |
| @@ -546,27 +534,16 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); | |||
| 546 | 534 | ||
| 547 | /* Forward declarations for rcutree_plugin.h */ | 535 | /* Forward declarations for rcutree_plugin.h */ |
| 548 | static void rcu_bootup_announce(void); | 536 | static void rcu_bootup_announce(void); |
| 549 | long rcu_batches_completed(void); | ||
| 550 | static void rcu_preempt_note_context_switch(void); | 537 | static void rcu_preempt_note_context_switch(void); |
| 551 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); | 538 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); |
| 552 | #ifdef CONFIG_HOTPLUG_CPU | 539 | #ifdef CONFIG_HOTPLUG_CPU |
| 553 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, | 540 | static bool rcu_preempt_has_tasks(struct rcu_node *rnp); |
| 554 | unsigned long flags); | ||
| 555 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 541 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
| 556 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); | 542 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); |
| 557 | static int rcu_print_task_stall(struct rcu_node *rnp); | 543 | static int rcu_print_task_stall(struct rcu_node *rnp); |
| 558 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); | 544 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); |
| 559 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 560 | static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | ||
| 561 | struct rcu_node *rnp, | ||
| 562 | struct rcu_data *rdp); | ||
| 563 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 564 | static void rcu_preempt_check_callbacks(void); | 545 | static void rcu_preempt_check_callbacks(void); |
| 565 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | 546 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); |
| 566 | #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) | ||
| 567 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | ||
| 568 | bool wake); | ||
| 569 | #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) */ | ||
| 570 | static void __init __rcu_init_preempt(void); | 547 | static void __init __rcu_init_preempt(void); |
| 571 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); | 548 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); |
| 572 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); | 549 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); |
| @@ -622,24 +599,15 @@ static void rcu_dynticks_task_exit(void); | |||
| 622 | #endif /* #ifndef RCU_TREE_NONCORE */ | 599 | #endif /* #ifndef RCU_TREE_NONCORE */ |
| 623 | 600 | ||
| 624 | #ifdef CONFIG_RCU_TRACE | 601 | #ifdef CONFIG_RCU_TRACE |
| 625 | #ifdef CONFIG_RCU_NOCB_CPU | 602 | /* Read out queue lengths for tracing. */ |
| 626 | /* Sum up queue lengths for tracing. */ | ||
| 627 | static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) | 603 | static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) |
| 628 | { | 604 | { |
| 629 | *ql = atomic_long_read(&rdp->nocb_q_count) + | 605 | #ifdef CONFIG_RCU_NOCB_CPU |
| 630 | rdp->nocb_p_count + | 606 | *ql = atomic_long_read(&rdp->nocb_q_count); |
| 631 | atomic_long_read(&rdp->nocb_follower_count) + | 607 | *qll = atomic_long_read(&rdp->nocb_q_count_lazy); |
| 632 | rdp->nocb_p_count + rdp->nocb_gp_count; | ||
| 633 | *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + | ||
| 634 | rdp->nocb_p_count_lazy + | ||
| 635 | atomic_long_read(&rdp->nocb_follower_count_lazy) + | ||
| 636 | rdp->nocb_p_count_lazy + rdp->nocb_gp_count_lazy; | ||
| 637 | } | ||
| 638 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ | 608 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ |
| 639 | static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) | ||
| 640 | { | ||
| 641 | *ql = 0; | 609 | *ql = 0; |
| 642 | *qll = 0; | 610 | *qll = 0; |
| 643 | } | ||
| 644 | #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ | 611 | #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ |
| 612 | } | ||
| 645 | #endif /* #ifdef CONFIG_RCU_TRACE */ | 613 | #endif /* #ifdef CONFIG_RCU_TRACE */ |
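The tree.h changes above fold the per-stage no-CBs counters (follower, being-invoked, gp) into the single nocb_q_count/nocb_q_count_lazy pair, which now covers every stage until a callback has actually run; that is why the tracing helper shrinks to two atomic reads. A standalone sketch of the counting discipline (a model, not the kernel's implementation):

#include <stdio.h>

static long nocb_q_count;       /* covers queued, pending, and ready callbacks */

static void enqueue_cb(void)
{
        nocb_q_count++;         /* counted as soon as it is handed off */
}

static void invoke_cbs(int n)
{
        /* ...callbacks run here... */
        nocb_q_count -= n;      /* uncounted only after they have run */
}

int main(void)
{
        enqueue_cb();
        enqueue_cb();
        printf("in flight: %ld\n", nocb_q_count);       /* 2, whatever the stage */
        invoke_cbs(2);
        printf("in flight: %ld\n", nocb_q_count);       /* 0 */
        return 0;
}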
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3ec85cb5d544..2e850a51bb8f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
| @@ -34,10 +34,6 @@ | |||
| 34 | 34 | ||
| 35 | #include "../locking/rtmutex_common.h" | 35 | #include "../locking/rtmutex_common.h" |
| 36 | 36 | ||
| 37 | /* rcuc/rcub kthread realtime priority */ | ||
| 38 | static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; | ||
| 39 | module_param(kthread_prio, int, 0644); | ||
| 40 | |||
| 41 | /* | 37 | /* |
| 42 | * Control variables for per-CPU and per-rcu_node kthreads. These | 38 | * Control variables for per-CPU and per-rcu_node kthreads. These |
| 43 | * handle all flavors of RCU. | 39 | * handle all flavors of RCU. |
| @@ -103,6 +99,8 @@ RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); | |||
| 103 | static struct rcu_state *rcu_state_p = &rcu_preempt_state; | 99 | static struct rcu_state *rcu_state_p = &rcu_preempt_state; |
| 104 | 100 | ||
| 105 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); | 101 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); |
| 102 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | ||
| 103 | bool wake); | ||
| 106 | 104 | ||
| 107 | /* | 105 | /* |
| 108 | * Tell them what RCU they are running. | 106 | * Tell them what RCU they are running. |
| @@ -114,25 +112,6 @@ static void __init rcu_bootup_announce(void) | |||
| 114 | } | 112 | } |
| 115 | 113 | ||
| 116 | /* | 114 | /* |
| 117 | * Return the number of RCU-preempt batches processed thus far | ||
| 118 | * for debug and statistics. | ||
| 119 | */ | ||
| 120 | static long rcu_batches_completed_preempt(void) | ||
| 121 | { | ||
| 122 | return rcu_preempt_state.completed; | ||
| 123 | } | ||
| 124 | EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); | ||
| 125 | |||
| 126 | /* | ||
| 127 | * Return the number of RCU batches processed thus far for debug & stats. | ||
| 128 | */ | ||
| 129 | long rcu_batches_completed(void) | ||
| 130 | { | ||
| 131 | return rcu_batches_completed_preempt(); | ||
| 132 | } | ||
| 133 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
| 134 | |||
| 135 | /* | ||
| 136 | * Record a preemptible-RCU quiescent state for the specified CPU. Note | 115 | * Record a preemptible-RCU quiescent state for the specified CPU. Note |
| 137 | * that this just means that the task currently running on the CPU is | 116 | * that this just means that the task currently running on the CPU is |
| 138 | * not in a quiescent state. There might be any number of tasks blocked | 117 | * not in a quiescent state. There might be any number of tasks blocked |
| @@ -307,15 +286,25 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t, | |||
| 307 | } | 286 | } |
| 308 | 287 | ||
| 309 | /* | 288 | /* |
| 289 | * Return true if the specified rcu_node structure has tasks that were | ||
| 290 | * preempted within an RCU read-side critical section. | ||
| 291 | */ | ||
| 292 | static bool rcu_preempt_has_tasks(struct rcu_node *rnp) | ||
| 293 | { | ||
| 294 | return !list_empty(&rnp->blkd_tasks); | ||
| 295 | } | ||
| 296 | |||
| 297 | /* | ||
| 310 | * Handle special cases during rcu_read_unlock(), such as needing to | 298 | * Handle special cases during rcu_read_unlock(), such as needing to |
| 311 | * notify RCU core processing or task having blocked during the RCU | 299 | * notify RCU core processing or task having blocked during the RCU |
| 312 | * read-side critical section. | 300 | * read-side critical section. |
| 313 | */ | 301 | */ |
| 314 | void rcu_read_unlock_special(struct task_struct *t) | 302 | void rcu_read_unlock_special(struct task_struct *t) |
| 315 | { | 303 | { |
| 316 | int empty; | 304 | bool empty; |
| 317 | int empty_exp; | 305 | bool empty_exp; |
| 318 | int empty_exp_now; | 306 | bool empty_norm; |
| 307 | bool empty_exp_now; | ||
| 319 | unsigned long flags; | 308 | unsigned long flags; |
| 320 | struct list_head *np; | 309 | struct list_head *np; |
| 321 | #ifdef CONFIG_RCU_BOOST | 310 | #ifdef CONFIG_RCU_BOOST |
| @@ -367,7 +356,8 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
| 367 | break; | 356 | break; |
| 368 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 357 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
| 369 | } | 358 | } |
| 370 | empty = !rcu_preempt_blocked_readers_cgp(rnp); | 359 | empty = !rcu_preempt_has_tasks(rnp); |
| 360 | empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); | ||
| 371 | empty_exp = !rcu_preempted_readers_exp(rnp); | 361 | empty_exp = !rcu_preempted_readers_exp(rnp); |
| 372 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ | 362 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ |
| 373 | np = rcu_next_node_entry(t, rnp); | 363 | np = rcu_next_node_entry(t, rnp); |
| @@ -387,13 +377,21 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
| 387 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 377 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
| 388 | 378 | ||
| 389 | /* | 379 | /* |
| 380 | * If this was the last task on the list, go see if we | ||
| 381 | * need to propagate ->qsmaskinit bit clearing up the | ||
| 382 | * rcu_node tree. | ||
| 383 | */ | ||
| 384 | if (!empty && !rcu_preempt_has_tasks(rnp)) | ||
| 385 | rcu_cleanup_dead_rnp(rnp); | ||
| 386 | |||
| 387 | /* | ||
| 390 | * If this was the last task on the current list, and if | 388 | * If this was the last task on the current list, and if |
| 391 | * we aren't waiting on any CPUs, report the quiescent state. | 389 | * we aren't waiting on any CPUs, report the quiescent state. |
| 392 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, | 390 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, |
| 393 | * so we must take a snapshot of the expedited state. | 391 | * so we must take a snapshot of the expedited state. |
| 394 | */ | 392 | */ |
| 395 | empty_exp_now = !rcu_preempted_readers_exp(rnp); | 393 | empty_exp_now = !rcu_preempted_readers_exp(rnp); |
| 396 | if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { | 394 | if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { |
| 397 | trace_rcu_quiescent_state_report(TPS("preempt_rcu"), | 395 | trace_rcu_quiescent_state_report(TPS("preempt_rcu"), |
| 398 | rnp->gpnum, | 396 | rnp->gpnum, |
| 399 | 0, rnp->qsmask, | 397 | 0, rnp->qsmask, |
| @@ -408,10 +406,8 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
| 408 | 406 | ||
| 409 | #ifdef CONFIG_RCU_BOOST | 407 | #ifdef CONFIG_RCU_BOOST |
| 410 | /* Unboost if we were boosted. */ | 408 | /* Unboost if we were boosted. */ |
| 411 | if (drop_boost_mutex) { | 409 | if (drop_boost_mutex) |
| 412 | rt_mutex_unlock(&rnp->boost_mtx); | 410 | rt_mutex_unlock(&rnp->boost_mtx); |
| 413 | complete(&rnp->boost_completion); | ||
| 414 | } | ||
| 415 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 411 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
| 416 | 412 | ||
| 417 | /* | 413 | /* |
| @@ -519,99 +515,13 @@ static int rcu_print_task_stall(struct rcu_node *rnp) | |||
| 519 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) | 515 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) |
| 520 | { | 516 | { |
| 521 | WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); | 517 | WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); |
| 522 | if (!list_empty(&rnp->blkd_tasks)) | 518 | if (rcu_preempt_has_tasks(rnp)) |
| 523 | rnp->gp_tasks = rnp->blkd_tasks.next; | 519 | rnp->gp_tasks = rnp->blkd_tasks.next; |
| 524 | WARN_ON_ONCE(rnp->qsmask); | 520 | WARN_ON_ONCE(rnp->qsmask); |
| 525 | } | 521 | } |
| 526 | 522 | ||
| 527 | #ifdef CONFIG_HOTPLUG_CPU | 523 | #ifdef CONFIG_HOTPLUG_CPU |
| 528 | 524 | ||
| 529 | /* | ||
| 530 | * Handle tasklist migration for case in which all CPUs covered by the | ||
| 531 | * specified rcu_node have gone offline. Move them up to the root | ||
| 532 | * rcu_node. The reason for not just moving them to the immediate | ||
| 533 | * parent is to remove the need for rcu_read_unlock_special() to | ||
| 534 | * make more than two attempts to acquire the target rcu_node's lock. | ||
| 535 | * Returns true if there were tasks blocking the current RCU grace | ||
| 536 | * period. | ||
| 537 | * | ||
| 538 | * Returns 1 if there was previously a task blocking the current grace | ||
| 539 | * period on the specified rcu_node structure. | ||
| 540 | * | ||
| 541 | * The caller must hold rnp->lock with irqs disabled. | ||
| 542 | */ | ||
| 543 | static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | ||
| 544 | struct rcu_node *rnp, | ||
| 545 | struct rcu_data *rdp) | ||
| 546 | { | ||
| 547 | struct list_head *lp; | ||
| 548 | struct list_head *lp_root; | ||
| 549 | int retval = 0; | ||
| 550 | struct rcu_node *rnp_root = rcu_get_root(rsp); | ||
| 551 | struct task_struct *t; | ||
| 552 | |||
| 553 | if (rnp == rnp_root) { | ||
| 554 | WARN_ONCE(1, "Last CPU thought to be offlined?"); | ||
| 555 | return 0; /* Shouldn't happen: at least one CPU online. */ | ||
| 556 | } | ||
| 557 | |||
| 558 | /* If we are on an internal node, complain bitterly. */ | ||
| 559 | WARN_ON_ONCE(rnp != rdp->mynode); | ||
| 560 | |||
| 561 | /* | ||
| 562 | * Move tasks up to root rcu_node. Don't try to get fancy for | ||
| 563 | * this corner-case operation -- just put this node's tasks | ||
| 564 | * at the head of the root node's list, and update the root node's | ||
| 565 | * ->gp_tasks and ->exp_tasks pointers to those of this node's, | ||
| 566 | * if non-NULL. This might result in waiting for more tasks than | ||
| 567 | * absolutely necessary, but this is a good performance/complexity | ||
| 568 | * tradeoff. | ||
| 569 | */ | ||
| 570 | if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0) | ||
| 571 | retval |= RCU_OFL_TASKS_NORM_GP; | ||
| 572 | if (rcu_preempted_readers_exp(rnp)) | ||
| 573 | retval |= RCU_OFL_TASKS_EXP_GP; | ||
| 574 | lp = &rnp->blkd_tasks; | ||
| 575 | lp_root = &rnp_root->blkd_tasks; | ||
| 576 | while (!list_empty(lp)) { | ||
| 577 | t = list_entry(lp->next, typeof(*t), rcu_node_entry); | ||
| 578 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ | ||
| 579 | smp_mb__after_unlock_lock(); | ||
| 580 | list_del(&t->rcu_node_entry); | ||
| 581 | t->rcu_blocked_node = rnp_root; | ||
| 582 | list_add(&t->rcu_node_entry, lp_root); | ||
| 583 | if (&t->rcu_node_entry == rnp->gp_tasks) | ||
| 584 | rnp_root->gp_tasks = rnp->gp_tasks; | ||
| 585 | if (&t->rcu_node_entry == rnp->exp_tasks) | ||
| 586 | rnp_root->exp_tasks = rnp->exp_tasks; | ||
| 587 | #ifdef CONFIG_RCU_BOOST | ||
| 588 | if (&t->rcu_node_entry == rnp->boost_tasks) | ||
| 589 | rnp_root->boost_tasks = rnp->boost_tasks; | ||
| 590 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 591 | raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ | ||
| 592 | } | ||
| 593 | |||
| 594 | rnp->gp_tasks = NULL; | ||
| 595 | rnp->exp_tasks = NULL; | ||
| 596 | #ifdef CONFIG_RCU_BOOST | ||
| 597 | rnp->boost_tasks = NULL; | ||
| 598 | /* | ||
| 599 | * In case root is being boosted and leaf was not. Make sure | ||
| 600 | * that we boost the tasks blocking the current grace period | ||
| 601 | * in this case. | ||
| 602 | */ | ||
| 603 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ | ||
| 604 | smp_mb__after_unlock_lock(); | ||
| 605 | if (rnp_root->boost_tasks != NULL && | ||
| 606 | rnp_root->boost_tasks != rnp_root->gp_tasks && | ||
| 607 | rnp_root->boost_tasks != rnp_root->exp_tasks) | ||
| 608 | rnp_root->boost_tasks = rnp_root->gp_tasks; | ||
| 609 | raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ | ||
| 610 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 611 | |||
| 612 | return retval; | ||
| 613 | } | ||
| 614 | |||
| 615 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 525 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
| 616 | 526 | ||
| 617 | /* | 527 | /* |
| @@ -771,7 +681,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | |||
| 771 | 681 | ||
| 772 | raw_spin_lock_irqsave(&rnp->lock, flags); | 682 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 773 | smp_mb__after_unlock_lock(); | 683 | smp_mb__after_unlock_lock(); |
| 774 | if (list_empty(&rnp->blkd_tasks)) { | 684 | if (!rcu_preempt_has_tasks(rnp)) { |
| 775 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 685 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 776 | } else { | 686 | } else { |
| 777 | rnp->exp_tasks = rnp->blkd_tasks.next; | 687 | rnp->exp_tasks = rnp->blkd_tasks.next; |
| @@ -933,15 +843,6 @@ static void __init rcu_bootup_announce(void) | |||
| 933 | } | 843 | } |
| 934 | 844 | ||
| 935 | /* | 845 | /* |
| 936 | * Return the number of RCU batches processed thus far for debug & stats. | ||
| 937 | */ | ||
| 938 | long rcu_batches_completed(void) | ||
| 939 | { | ||
| 940 | return rcu_batches_completed_sched(); | ||
| 941 | } | ||
| 942 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
| 943 | |||
| 944 | /* | ||
| 945 | * Because preemptible RCU does not exist, we never have to check for | 846 | * Because preemptible RCU does not exist, we never have to check for |
| 946 | * CPUs being in quiescent states. | 847 | * CPUs being in quiescent states. |
| 947 | */ | 848 | */ |
| @@ -960,11 +861,12 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) | |||
| 960 | 861 | ||
| 961 | #ifdef CONFIG_HOTPLUG_CPU | 862 | #ifdef CONFIG_HOTPLUG_CPU |
| 962 | 863 | ||
| 963 | /* Because preemptible RCU does not exist, no quieting of tasks. */ | 864 | /* |
| 964 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | 865 | * Because there is no preemptible RCU, there can be no readers blocked. |
| 965 | __releases(rnp->lock) | 866 | */ |
| 867 | static bool rcu_preempt_has_tasks(struct rcu_node *rnp) | ||
| 966 | { | 868 | { |
| 967 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 869 | return false; |
| 968 | } | 870 | } |
| 969 | 871 | ||
| 970 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 872 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
| @@ -996,23 +898,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) | |||
| 996 | WARN_ON_ONCE(rnp->qsmask); | 898 | WARN_ON_ONCE(rnp->qsmask); |
| 997 | } | 899 | } |
| 998 | 900 | ||
| 999 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 1000 | |||
| 1001 | /* | ||
| 1002 | * Because preemptible RCU does not exist, it never needs to migrate | ||
| 1003 | * tasks that were blocked within RCU read-side critical sections, and | ||
| 1004 | * such non-existent tasks cannot possibly have been blocking the current | ||
| 1005 | * grace period. | ||
| 1006 | */ | ||
| 1007 | static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | ||
| 1008 | struct rcu_node *rnp, | ||
| 1009 | struct rcu_data *rdp) | ||
| 1010 | { | ||
| 1011 | return 0; | ||
| 1012 | } | ||
| 1013 | |||
| 1014 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 1015 | |||
| 1016 | /* | 901 | /* |
| 1017 | * Because preemptible RCU does not exist, it never has any callbacks | 902 | * Because preemptible RCU does not exist, it never has any callbacks |
| 1018 | * to check. | 903 | * to check. |
| @@ -1031,20 +916,6 @@ void synchronize_rcu_expedited(void) | |||
| 1031 | } | 916 | } |
| 1032 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | 917 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); |
| 1033 | 918 | ||
| 1034 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 1035 | |||
| 1036 | /* | ||
| 1037 | * Because preemptible RCU does not exist, there is never any need to | ||
| 1038 | * report on tasks preempted in RCU read-side critical sections during | ||
| 1039 | * expedited RCU grace periods. | ||
| 1040 | */ | ||
| 1041 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | ||
| 1042 | bool wake) | ||
| 1043 | { | ||
| 1044 | } | ||
| 1045 | |||
| 1046 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 1047 | |||
| 1048 | /* | 919 | /* |
| 1049 | * Because preemptible RCU does not exist, rcu_barrier() is just | 920 | * Because preemptible RCU does not exist, rcu_barrier() is just |
| 1050 | * another name for rcu_barrier_sched(). | 921 | * another name for rcu_barrier_sched(). |
| @@ -1080,7 +951,7 @@ void exit_rcu(void) | |||
| 1080 | 951 | ||
| 1081 | static void rcu_initiate_boost_trace(struct rcu_node *rnp) | 952 | static void rcu_initiate_boost_trace(struct rcu_node *rnp) |
| 1082 | { | 953 | { |
| 1083 | if (list_empty(&rnp->blkd_tasks)) | 954 | if (!rcu_preempt_has_tasks(rnp)) |
| 1084 | rnp->n_balk_blkd_tasks++; | 955 | rnp->n_balk_blkd_tasks++; |
| 1085 | else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) | 956 | else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) |
| 1086 | rnp->n_balk_exp_gp_tasks++; | 957 | rnp->n_balk_exp_gp_tasks++; |
| @@ -1127,7 +998,8 @@ static int rcu_boost(struct rcu_node *rnp) | |||
| 1127 | struct task_struct *t; | 998 | struct task_struct *t; |
| 1128 | struct list_head *tb; | 999 | struct list_head *tb; |
| 1129 | 1000 | ||
| 1130 | if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) | 1001 | if (ACCESS_ONCE(rnp->exp_tasks) == NULL && |
| 1002 | ACCESS_ONCE(rnp->boost_tasks) == NULL) | ||
| 1131 | return 0; /* Nothing left to boost. */ | 1003 | return 0; /* Nothing left to boost. */ |
| 1132 | 1004 | ||
| 1133 | raw_spin_lock_irqsave(&rnp->lock, flags); | 1005 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| @@ -1175,15 +1047,11 @@ static int rcu_boost(struct rcu_node *rnp) | |||
| 1175 | */ | 1047 | */ |
| 1176 | t = container_of(tb, struct task_struct, rcu_node_entry); | 1048 | t = container_of(tb, struct task_struct, rcu_node_entry); |
| 1177 | rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); | 1049 | rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); |
| 1178 | init_completion(&rnp->boost_completion); | ||
| 1179 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1050 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 1180 | /* Lock only for side effect: boosts task t's priority. */ | 1051 | /* Lock only for side effect: boosts task t's priority. */ |
| 1181 | rt_mutex_lock(&rnp->boost_mtx); | 1052 | rt_mutex_lock(&rnp->boost_mtx); |
| 1182 | rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ | 1053 | rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ |
| 1183 | 1054 | ||
| 1184 | /* Wait for boostee to be done w/boost_mtx before reinitializing. */ | ||
| 1185 | wait_for_completion(&rnp->boost_completion); | ||
| 1186 | |||
| 1187 | return ACCESS_ONCE(rnp->exp_tasks) != NULL || | 1055 | return ACCESS_ONCE(rnp->exp_tasks) != NULL || |
| 1188 | ACCESS_ONCE(rnp->boost_tasks) != NULL; | 1056 | ACCESS_ONCE(rnp->boost_tasks) != NULL; |
| 1189 | } | 1057 | } |
| @@ -1416,12 +1284,8 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | |||
| 1416 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) | 1284 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) |
| 1417 | if ((mask & 0x1) && cpu != outgoingcpu) | 1285 | if ((mask & 0x1) && cpu != outgoingcpu) |
| 1418 | cpumask_set_cpu(cpu, cm); | 1286 | cpumask_set_cpu(cpu, cm); |
| 1419 | if (cpumask_weight(cm) == 0) { | 1287 | if (cpumask_weight(cm) == 0) |
| 1420 | cpumask_setall(cm); | 1288 | cpumask_setall(cm); |
| 1421 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) | ||
| 1422 | cpumask_clear_cpu(cpu, cm); | ||
| 1423 | WARN_ON_ONCE(cpumask_weight(cm) == 0); | ||
| 1424 | } | ||
| 1425 | set_cpus_allowed_ptr(t, cm); | 1289 | set_cpus_allowed_ptr(t, cm); |
| 1426 | free_cpumask_var(cm); | 1290 | free_cpumask_var(cm); |
| 1427 | } | 1291 | } |
| @@ -1446,12 +1310,8 @@ static void __init rcu_spawn_boost_kthreads(void) | |||
| 1446 | for_each_possible_cpu(cpu) | 1310 | for_each_possible_cpu(cpu) |
| 1447 | per_cpu(rcu_cpu_has_work, cpu) = 0; | 1311 | per_cpu(rcu_cpu_has_work, cpu) = 0; |
| 1448 | BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); | 1312 | BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); |
| 1449 | rnp = rcu_get_root(rcu_state_p); | 1313 | rcu_for_each_leaf_node(rcu_state_p, rnp) |
| 1450 | (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); | 1314 | (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); |
| 1451 | if (NUM_RCU_NODES > 1) { | ||
| 1452 | rcu_for_each_leaf_node(rcu_state_p, rnp) | ||
| 1453 | (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); | ||
| 1454 | } | ||
| 1455 | } | 1315 | } |
| 1456 | 1316 | ||
| 1457 | static void rcu_prepare_kthreads(int cpu) | 1317 | static void rcu_prepare_kthreads(int cpu) |
| @@ -1605,7 +1465,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) | |||
| 1605 | * completed since we last checked and there are | 1465 | * completed since we last checked and there are |
| 1606 | * callbacks not yet ready to invoke. | 1466 | * callbacks not yet ready to invoke. |
| 1607 | */ | 1467 | */ |
| 1608 | if (rdp->completed != rnp->completed && | 1468 | if ((rdp->completed != rnp->completed || |
| 1469 | unlikely(ACCESS_ONCE(rdp->gpwrap))) && | ||
| 1609 | rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) | 1470 | rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) |
| 1610 | note_gp_changes(rsp, rdp); | 1471 | note_gp_changes(rsp, rdp); |
| 1611 | 1472 | ||
| @@ -1898,11 +1759,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) | |||
| 1898 | ticks_value = rsp->gpnum - rdp->gpnum; | 1759 | ticks_value = rsp->gpnum - rdp->gpnum; |
| 1899 | } | 1760 | } |
| 1900 | print_cpu_stall_fast_no_hz(fast_no_hz, cpu); | 1761 | print_cpu_stall_fast_no_hz(fast_no_hz, cpu); |
| 1901 | pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", | 1762 | pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", |
| 1902 | cpu, ticks_value, ticks_title, | 1763 | cpu, ticks_value, ticks_title, |
| 1903 | atomic_read(&rdtp->dynticks) & 0xfff, | 1764 | atomic_read(&rdtp->dynticks) & 0xfff, |
| 1904 | rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, | 1765 | rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, |
| 1905 | rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), | 1766 | rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), |
| 1767 | ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart, | ||
| 1906 | fast_no_hz); | 1768 | fast_no_hz); |
| 1907 | } | 1769 | } |
| 1908 | 1770 | ||
| @@ -2056,9 +1918,26 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) | |||
| 2056 | static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) | 1918 | static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) |
| 2057 | { | 1919 | { |
| 2058 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 1920 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
| 1921 | unsigned long ret; | ||
| 1922 | #ifdef CONFIG_PROVE_RCU | ||
| 2059 | struct rcu_head *rhp; | 1923 | struct rcu_head *rhp; |
| 1924 | #endif /* #ifdef CONFIG_PROVE_RCU */ | ||
| 2060 | 1925 | ||
| 2061 | /* No-CBs CPUs might have callbacks on any of three lists. */ | 1926 | /* |
| 1927 | * Check count of all no-CBs callbacks awaiting invocation. | ||
| 1928 | * There needs to be a barrier before this function is called, | ||
| 1929 | * but associated with a prior determination that no more | ||
| 1930 | * callbacks would be posted. In the worst case, the first | ||
| 1931 | * barrier in _rcu_barrier() suffices (but the caller cannot | ||
| 1932 | * necessarily rely on this, not a substitute for the caller | ||
| 1933 | * getting the concurrency design right!). There must also be | ||
| 1934 | * a barrier between the following load and posting of a callback | ||
| 1935 | * (if a callback is in fact needed). This is associated with an | ||
| 1936 | * atomic_inc() in the caller. | ||
| 1937 | */ | ||
| 1938 | ret = atomic_long_read(&rdp->nocb_q_count); | ||
| 1939 | |||
| 1940 | #ifdef CONFIG_PROVE_RCU | ||
| 2062 | rhp = ACCESS_ONCE(rdp->nocb_head); | 1941 | rhp = ACCESS_ONCE(rdp->nocb_head); |
| 2063 | if (!rhp) | 1942 | if (!rhp) |
| 2064 | rhp = ACCESS_ONCE(rdp->nocb_gp_head); | 1943 | rhp = ACCESS_ONCE(rdp->nocb_gp_head); |
| @@ -2072,8 +1951,9 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) | |||
| 2072 | cpu, rhp->func); | 1951 | cpu, rhp->func); |
| 2073 | WARN_ON_ONCE(1); | 1952 | WARN_ON_ONCE(1); |
| 2074 | } | 1953 | } |
| 1954 | #endif /* #ifdef CONFIG_PROVE_RCU */ | ||
| 2075 | 1955 | ||
| 2076 | return !!rhp; | 1956 | return !!ret; |
| 2077 | } | 1957 | } |
| 2078 | 1958 | ||
| 2079 | /* | 1959 | /* |
| @@ -2095,9 +1975,10 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, | |||
| 2095 | struct task_struct *t; | 1975 | struct task_struct *t; |
| 2096 | 1976 | ||
| 2097 | /* Enqueue the callback on the nocb list and update counts. */ | 1977 | /* Enqueue the callback on the nocb list and update counts. */ |
| 1978 | atomic_long_add(rhcount, &rdp->nocb_q_count); | ||
| 1979 | /* rcu_barrier() relies on ->nocb_q_count add before xchg. */ | ||
| 2098 | old_rhpp = xchg(&rdp->nocb_tail, rhtp); | 1980 | old_rhpp = xchg(&rdp->nocb_tail, rhtp); |
| 2099 | ACCESS_ONCE(*old_rhpp) = rhp; | 1981 | ACCESS_ONCE(*old_rhpp) = rhp; |
| 2100 | atomic_long_add(rhcount, &rdp->nocb_q_count); | ||
| 2101 | atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); | 1982 | atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); |
| 2102 | smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */ | 1983 | smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */ |
| 2103 | 1984 | ||
| @@ -2288,9 +2169,6 @@ wait_again: | |||
| 2288 | /* Move callbacks to wait-for-GP list, which is empty. */ | 2169 | /* Move callbacks to wait-for-GP list, which is empty. */ |
| 2289 | ACCESS_ONCE(rdp->nocb_head) = NULL; | 2170 | ACCESS_ONCE(rdp->nocb_head) = NULL; |
| 2290 | rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); | 2171 | rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); |
| 2291 | rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0); | ||
| 2292 | rdp->nocb_gp_count_lazy = | ||
| 2293 | atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); | ||
| 2294 | gotcbs = true; | 2172 | gotcbs = true; |
| 2295 | } | 2173 | } |
| 2296 | 2174 | ||
| @@ -2338,9 +2216,6 @@ wait_again: | |||
| 2338 | /* Append callbacks to follower's "done" list. */ | 2216 | /* Append callbacks to follower's "done" list. */ |
| 2339 | tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); | 2217 | tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); |
| 2340 | *tail = rdp->nocb_gp_head; | 2218 | *tail = rdp->nocb_gp_head; |
| 2341 | atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count); | ||
| 2342 | atomic_long_add(rdp->nocb_gp_count_lazy, | ||
| 2343 | &rdp->nocb_follower_count_lazy); | ||
| 2344 | smp_mb__after_atomic(); /* Store *tail before wakeup. */ | 2219 | smp_mb__after_atomic(); /* Store *tail before wakeup. */ |
| 2345 | if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { | 2220 | if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { |
| 2346 | /* | 2221 | /* |
| @@ -2415,13 +2290,11 @@ static int rcu_nocb_kthread(void *arg) | |||
| 2415 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); | 2290 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); |
| 2416 | ACCESS_ONCE(rdp->nocb_follower_head) = NULL; | 2291 | ACCESS_ONCE(rdp->nocb_follower_head) = NULL; |
| 2417 | tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); | 2292 | tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); |
| 2418 | c = atomic_long_xchg(&rdp->nocb_follower_count, 0); | ||
| 2419 | cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0); | ||
| 2420 | rdp->nocb_p_count += c; | ||
| 2421 | rdp->nocb_p_count_lazy += cl; | ||
| 2422 | 2293 | ||
| 2423 | /* Each pass through the following loop invokes a callback. */ | 2294 | /* Each pass through the following loop invokes a callback. */ |
| 2424 | trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); | 2295 | trace_rcu_batch_start(rdp->rsp->name, |
| 2296 | atomic_long_read(&rdp->nocb_q_count_lazy), | ||
| 2297 | atomic_long_read(&rdp->nocb_q_count), -1); | ||
| 2425 | c = cl = 0; | 2298 | c = cl = 0; |
| 2426 | while (list) { | 2299 | while (list) { |
| 2427 | next = list->next; | 2300 | next = list->next; |
| @@ -2443,9 +2316,9 @@ static int rcu_nocb_kthread(void *arg) | |||
| 2443 | list = next; | 2316 | list = next; |
| 2444 | } | 2317 | } |
| 2445 | trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); | 2318 | trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); |
| 2446 | ACCESS_ONCE(rdp->nocb_p_count) = rdp->nocb_p_count - c; | 2319 | smp_mb__before_atomic(); /* _add after CB invocation. */ |
| 2447 | ACCESS_ONCE(rdp->nocb_p_count_lazy) = | 2320 | atomic_long_add(-c, &rdp->nocb_q_count); |
| 2448 | rdp->nocb_p_count_lazy - cl; | 2321 | atomic_long_add(-cl, &rdp->nocb_q_count_lazy); |
| 2449 | rdp->n_nocbs_invoked += c; | 2322 | rdp->n_nocbs_invoked += c; |
| 2450 | } | 2323 | } |
| 2451 | return 0; | 2324 | return 0; |
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 5cdc62e1beeb..fbb6240509ea 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c | |||
| @@ -46,6 +46,8 @@ | |||
| 46 | #define RCU_TREE_NONCORE | 46 | #define RCU_TREE_NONCORE |
| 47 | #include "tree.h" | 47 | #include "tree.h" |
| 48 | 48 | ||
| 49 | DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); | ||
| 50 | |||
| 49 | static int r_open(struct inode *inode, struct file *file, | 51 | static int r_open(struct inode *inode, struct file *file, |
| 50 | const struct seq_operations *op) | 52 | const struct seq_operations *op) |
| 51 | { | 53 | { |
| @@ -115,11 +117,13 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
| 115 | 117 | ||
| 116 | if (!rdp->beenonline) | 118 | if (!rdp->beenonline) |
| 117 | return; | 119 | return; |
| 118 | seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", | 120 | seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d", |
| 119 | rdp->cpu, | 121 | rdp->cpu, |
| 120 | cpu_is_offline(rdp->cpu) ? '!' : ' ', | 122 | cpu_is_offline(rdp->cpu) ? '!' : ' ', |
| 121 | ulong2long(rdp->completed), ulong2long(rdp->gpnum), | 123 | ulong2long(rdp->completed), ulong2long(rdp->gpnum), |
| 122 | rdp->passed_quiesce, rdp->qs_pending); | 124 | rdp->passed_quiesce, |
| 125 | rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), | ||
| 126 | rdp->qs_pending); | ||
| 123 | seq_printf(m, " dt=%d/%llx/%d df=%lu", | 127 | seq_printf(m, " dt=%d/%llx/%d df=%lu", |
| 124 | atomic_read(&rdp->dynticks->dynticks), | 128 | atomic_read(&rdp->dynticks->dynticks), |
| 125 | rdp->dynticks->dynticks_nesting, | 129 | rdp->dynticks->dynticks_nesting, |
diff --git a/kernel/resource.c b/kernel/resource.c index 0bcebffc4e77..19f2357dfda3 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
| @@ -22,6 +22,7 @@ | |||
| 22 | #include <linux/device.h> | 22 | #include <linux/device.h> |
| 23 | #include <linux/pfn.h> | 23 | #include <linux/pfn.h> |
| 24 | #include <linux/mm.h> | 24 | #include <linux/mm.h> |
| 25 | #include <linux/resource_ext.h> | ||
| 25 | #include <asm/io.h> | 26 | #include <asm/io.h> |
| 26 | 27 | ||
| 27 | 28 | ||
| @@ -1529,6 +1530,30 @@ int iomem_is_exclusive(u64 addr) | |||
| 1529 | return err; | 1530 | return err; |
| 1530 | } | 1531 | } |
| 1531 | 1532 | ||
| 1533 | struct resource_entry *resource_list_create_entry(struct resource *res, | ||
| 1534 | size_t extra_size) | ||
| 1535 | { | ||
| 1536 | struct resource_entry *entry; | ||
| 1537 | |||
| 1538 | entry = kzalloc(sizeof(*entry) + extra_size, GFP_KERNEL); | ||
| 1539 | if (entry) { | ||
| 1540 | INIT_LIST_HEAD(&entry->node); | ||
| 1541 | entry->res = res ? res : &entry->__res; | ||
| 1542 | } | ||
| 1543 | |||
| 1544 | return entry; | ||
| 1545 | } | ||
| 1546 | EXPORT_SYMBOL(resource_list_create_entry); | ||
| 1547 | |||
| 1548 | void resource_list_free(struct list_head *head) | ||
| 1549 | { | ||
| 1550 | struct resource_entry *entry, *tmp; | ||
| 1551 | |||
| 1552 | list_for_each_entry_safe(entry, tmp, head, node) | ||
| 1553 | resource_list_destroy_entry(entry); | ||
| 1554 | } | ||
| 1555 | EXPORT_SYMBOL(resource_list_free); | ||
| 1556 | |||
| 1532 | static int __init strict_iomem(char *str) | 1557 | static int __init strict_iomem(char *str) |
| 1533 | { | 1558 | { |
| 1534 | if (strstr(str, "relaxed")) | 1559 | if (strstr(str, "relaxed")) |
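Editor's note: a minimal usage sketch of the two resource-list helpers added above. Everything except resource_list_create_entry() and resource_list_free() (plus the standard list helpers) is illustrative and not part of the patch.

#include <linux/errno.h>
#include <linux/ioport.h>
#include <linux/list.h>
#include <linux/resource_ext.h>

/* Hypothetical helper: remember a window in a caller-owned list.  Passing a
 * NULL resource makes the entry fall back to its embedded __res storage. */
static int example_track_window(struct list_head *head, struct resource *res)
{
	struct resource_entry *entry = resource_list_create_entry(res, 0);

	if (!entry)
		return -ENOMEM;
	list_add_tail(&entry->node, head);
	return 0;
}

/* Teardown: one call walks the list and destroys every entry. */
static void example_release_windows(struct list_head *head)
{
	resource_list_free(head);
}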
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 607f852b4d04..7052d3fd4e7b 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c | |||
| @@ -268,6 +268,15 @@ bool try_wait_for_completion(struct completion *x) | |||
| 268 | unsigned long flags; | 268 | unsigned long flags; |
| 269 | int ret = 1; | 269 | int ret = 1; |
| 270 | 270 | ||
| 271 | /* | ||
| 272 | * Since x->done will need to be locked only | ||
| 273 | * in the non-blocking case, we check x->done | ||
| 274 | * first without taking the lock so we can | ||
| 275 | * return early in the blocking case. | ||
| 276 | */ | ||
| 277 | if (!ACCESS_ONCE(x->done)) | ||
| 278 | return 0; | ||
| 279 | |||
| 271 | spin_lock_irqsave(&x->wait.lock, flags); | 280 | spin_lock_irqsave(&x->wait.lock, flags); |
| 272 | if (!x->done) | 281 | if (!x->done) |
| 273 | ret = 0; | 282 | ret = 0; |
| @@ -288,13 +297,6 @@ EXPORT_SYMBOL(try_wait_for_completion); | |||
| 288 | */ | 297 | */ |
| 289 | bool completion_done(struct completion *x) | 298 | bool completion_done(struct completion *x) |
| 290 | { | 299 | { |
| 291 | unsigned long flags; | 300 | return !!ACCESS_ONCE(x->done); |
| 292 | int ret = 1; | ||
| 293 | |||
| 294 | spin_lock_irqsave(&x->wait.lock, flags); | ||
| 295 | if (!x->done) | ||
| 296 | ret = 0; | ||
| 297 | spin_unlock_irqrestore(&x->wait.lock, flags); | ||
| 298 | return ret; | ||
| 299 | } | 301 | } |
| 300 | EXPORT_SYMBOL(completion_done); | 302 | EXPORT_SYMBOL(completion_done); |
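Editor's note: a brief sketch of the fast path the two completion changes above create (hypothetical caller, not from the patch). When the completion has not been signalled yet, both calls below now return without touching the wait-queue lock; only a successful consume still takes it.

#include <linux/completion.h>

static DECLARE_COMPLETION(example_done);

/* Poll-style consumer: cheap lockless check first, locked consume second. */
static bool example_try_consume(void)
{
	if (!completion_done(&example_done))		/* lockless ->done read */
		return false;
	return try_wait_for_completion(&example_done);	/* decrements ->done */
}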
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b5797b78add6..1f37fe7f77a4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
| @@ -119,7 +119,9 @@ void update_rq_clock(struct rq *rq) | |||
| 119 | { | 119 | { |
| 120 | s64 delta; | 120 | s64 delta; |
| 121 | 121 | ||
| 122 | if (rq->skip_clock_update > 0) | 122 | lockdep_assert_held(&rq->lock); |
| 123 | |||
| 124 | if (rq->clock_skip_update & RQCF_ACT_SKIP) | ||
| 123 | return; | 125 | return; |
| 124 | 126 | ||
| 125 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; | 127 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; |
| @@ -490,6 +492,11 @@ static __init void init_hrtick(void) | |||
| 490 | */ | 492 | */ |
| 491 | void hrtick_start(struct rq *rq, u64 delay) | 493 | void hrtick_start(struct rq *rq, u64 delay) |
| 492 | { | 494 | { |
| 495 | /* | ||
| 496 | * Don't schedule slices shorter than 10000ns, that just | ||
| 497 | * doesn't make sense. Rely on vruntime for fairness. | ||
| 498 | */ | ||
| 499 | delay = max_t(u64, delay, 10000LL); | ||
| 493 | __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, | 500 | __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, |
| 494 | HRTIMER_MODE_REL_PINNED, 0); | 501 | HRTIMER_MODE_REL_PINNED, 0); |
| 495 | } | 502 | } |
| @@ -1046,7 +1053,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
| 1046 | * this case, we can save a useless back to back clock update. | 1053 | * this case, we can save a useless back to back clock update. |
| 1047 | */ | 1054 | */ |
| 1048 | if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) | 1055 | if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) |
| 1049 | rq->skip_clock_update = 1; | 1056 | rq_clock_skip_update(rq, true); |
| 1050 | } | 1057 | } |
| 1051 | 1058 | ||
| 1052 | #ifdef CONFIG_SMP | 1059 | #ifdef CONFIG_SMP |
| @@ -1082,7 +1089,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
| 1082 | if (p->sched_class->migrate_task_rq) | 1089 | if (p->sched_class->migrate_task_rq) |
| 1083 | p->sched_class->migrate_task_rq(p, new_cpu); | 1090 | p->sched_class->migrate_task_rq(p, new_cpu); |
| 1084 | p->se.nr_migrations++; | 1091 | p->se.nr_migrations++; |
| 1085 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); | 1092 | perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); |
| 1086 | } | 1093 | } |
| 1087 | 1094 | ||
| 1088 | __set_task_cpu(p, new_cpu); | 1095 | __set_task_cpu(p, new_cpu); |
| @@ -1814,6 +1821,10 @@ void __dl_clear_params(struct task_struct *p) | |||
| 1814 | dl_se->dl_period = 0; | 1821 | dl_se->dl_period = 0; |
| 1815 | dl_se->flags = 0; | 1822 | dl_se->flags = 0; |
| 1816 | dl_se->dl_bw = 0; | 1823 | dl_se->dl_bw = 0; |
| 1824 | |||
| 1825 | dl_se->dl_throttled = 0; | ||
| 1826 | dl_se->dl_new = 1; | ||
| 1827 | dl_se->dl_yielded = 0; | ||
| 1817 | } | 1828 | } |
| 1818 | 1829 | ||
| 1819 | /* | 1830 | /* |
| @@ -1832,6 +1843,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
| 1832 | p->se.prev_sum_exec_runtime = 0; | 1843 | p->se.prev_sum_exec_runtime = 0; |
| 1833 | p->se.nr_migrations = 0; | 1844 | p->se.nr_migrations = 0; |
| 1834 | p->se.vruntime = 0; | 1845 | p->se.vruntime = 0; |
| 1846 | #ifdef CONFIG_SMP | ||
| 1847 | p->se.avg.decay_count = 0; | ||
| 1848 | #endif | ||
| 1835 | INIT_LIST_HEAD(&p->se.group_node); | 1849 | INIT_LIST_HEAD(&p->se.group_node); |
| 1836 | 1850 | ||
| 1837 | #ifdef CONFIG_SCHEDSTATS | 1851 | #ifdef CONFIG_SCHEDSTATS |
| @@ -1839,7 +1853,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
| 1839 | #endif | 1853 | #endif |
| 1840 | 1854 | ||
| 1841 | RB_CLEAR_NODE(&p->dl.rb_node); | 1855 | RB_CLEAR_NODE(&p->dl.rb_node); |
| 1842 | hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 1856 | init_dl_task_timer(&p->dl); |
| 1843 | __dl_clear_params(p); | 1857 | __dl_clear_params(p); |
| 1844 | 1858 | ||
| 1845 | INIT_LIST_HEAD(&p->rt.run_list); | 1859 | INIT_LIST_HEAD(&p->rt.run_list); |
| @@ -2049,6 +2063,9 @@ static inline int dl_bw_cpus(int i) | |||
| 2049 | * allocated bandwidth to reflect the new situation. | 2063 | * allocated bandwidth to reflect the new situation. |
| 2050 | * | 2064 | * |
| 2051 | * This function is called while holding p's rq->lock. | 2065 | * This function is called while holding p's rq->lock. |
| 2066 | * | ||
| 2067 | * XXX we should delay bw change until the task's 0-lag point, see | ||
| 2068 | * __setparam_dl(). | ||
| 2052 | */ | 2069 | */ |
| 2053 | static int dl_overflow(struct task_struct *p, int policy, | 2070 | static int dl_overflow(struct task_struct *p, int policy, |
| 2054 | const struct sched_attr *attr) | 2071 | const struct sched_attr *attr) |
| @@ -2748,6 +2765,10 @@ again: | |||
| 2748 | * - explicit schedule() call | 2765 | * - explicit schedule() call |
| 2749 | * - return from syscall or exception to user-space | 2766 | * - return from syscall or exception to user-space |
| 2750 | * - return from interrupt-handler to user-space | 2767 | * - return from interrupt-handler to user-space |
| 2768 | * | ||
| 2769 | * WARNING: all callers must re-check need_resched() afterward and reschedule | ||
| 2770 | * accordingly in case an event triggered the need for rescheduling (such as | ||
| 2771 | * an interrupt waking up a task) while preemption was disabled in __schedule(). | ||
| 2751 | */ | 2772 | */ |
| 2752 | static void __sched __schedule(void) | 2773 | static void __sched __schedule(void) |
| 2753 | { | 2774 | { |
| @@ -2756,7 +2777,6 @@ static void __sched __schedule(void) | |||
| 2756 | struct rq *rq; | 2777 | struct rq *rq; |
| 2757 | int cpu; | 2778 | int cpu; |
| 2758 | 2779 | ||
| 2759 | need_resched: | ||
| 2760 | preempt_disable(); | 2780 | preempt_disable(); |
| 2761 | cpu = smp_processor_id(); | 2781 | cpu = smp_processor_id(); |
| 2762 | rq = cpu_rq(cpu); | 2782 | rq = cpu_rq(cpu); |
| @@ -2776,6 +2796,8 @@ need_resched: | |||
| 2776 | smp_mb__before_spinlock(); | 2796 | smp_mb__before_spinlock(); |
| 2777 | raw_spin_lock_irq(&rq->lock); | 2797 | raw_spin_lock_irq(&rq->lock); |
| 2778 | 2798 | ||
| 2799 | rq->clock_skip_update <<= 1; /* promote REQ to ACT */ | ||
| 2800 | |||
| 2779 | switch_count = &prev->nivcsw; | 2801 | switch_count = &prev->nivcsw; |
| 2780 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 2802 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
| 2781 | if (unlikely(signal_pending_state(prev->state, prev))) { | 2803 | if (unlikely(signal_pending_state(prev->state, prev))) { |
| @@ -2800,13 +2822,13 @@ need_resched: | |||
| 2800 | switch_count = &prev->nvcsw; | 2822 | switch_count = &prev->nvcsw; |
| 2801 | } | 2823 | } |
| 2802 | 2824 | ||
| 2803 | if (task_on_rq_queued(prev) || rq->skip_clock_update < 0) | 2825 | if (task_on_rq_queued(prev)) |
| 2804 | update_rq_clock(rq); | 2826 | update_rq_clock(rq); |
| 2805 | 2827 | ||
| 2806 | next = pick_next_task(rq, prev); | 2828 | next = pick_next_task(rq, prev); |
| 2807 | clear_tsk_need_resched(prev); | 2829 | clear_tsk_need_resched(prev); |
| 2808 | clear_preempt_need_resched(); | 2830 | clear_preempt_need_resched(); |
| 2809 | rq->skip_clock_update = 0; | 2831 | rq->clock_skip_update = 0; |
| 2810 | 2832 | ||
| 2811 | if (likely(prev != next)) { | 2833 | if (likely(prev != next)) { |
| 2812 | rq->nr_switches++; | 2834 | rq->nr_switches++; |
| @@ -2821,8 +2843,6 @@ need_resched: | |||
| 2821 | post_schedule(rq); | 2843 | post_schedule(rq); |
| 2822 | 2844 | ||
| 2823 | sched_preempt_enable_no_resched(); | 2845 | sched_preempt_enable_no_resched(); |
| 2824 | if (need_resched()) | ||
| 2825 | goto need_resched; | ||
| 2826 | } | 2846 | } |
| 2827 | 2847 | ||
| 2828 | static inline void sched_submit_work(struct task_struct *tsk) | 2848 | static inline void sched_submit_work(struct task_struct *tsk) |
| @@ -2842,7 +2862,9 @@ asmlinkage __visible void __sched schedule(void) | |||
| 2842 | struct task_struct *tsk = current; | 2862 | struct task_struct *tsk = current; |
| 2843 | 2863 | ||
| 2844 | sched_submit_work(tsk); | 2864 | sched_submit_work(tsk); |
| 2845 | __schedule(); | 2865 | do { |
| 2866 | __schedule(); | ||
| 2867 | } while (need_resched()); | ||
| 2846 | } | 2868 | } |
| 2847 | EXPORT_SYMBOL(schedule); | 2869 | EXPORT_SYMBOL(schedule); |
| 2848 | 2870 | ||
| @@ -2877,6 +2899,21 @@ void __sched schedule_preempt_disabled(void) | |||
| 2877 | preempt_disable(); | 2899 | preempt_disable(); |
| 2878 | } | 2900 | } |
| 2879 | 2901 | ||
| 2902 | static void preempt_schedule_common(void) | ||
| 2903 | { | ||
| 2904 | do { | ||
| 2905 | __preempt_count_add(PREEMPT_ACTIVE); | ||
| 2906 | __schedule(); | ||
| 2907 | __preempt_count_sub(PREEMPT_ACTIVE); | ||
| 2908 | |||
| 2909 | /* | ||
| 2910 | * Check again in case we missed a preemption opportunity | ||
| 2911 | * between schedule and now. | ||
| 2912 | */ | ||
| 2913 | barrier(); | ||
| 2914 | } while (need_resched()); | ||
| 2915 | } | ||
| 2916 | |||
| 2880 | #ifdef CONFIG_PREEMPT | 2917 | #ifdef CONFIG_PREEMPT |
| 2881 | /* | 2918 | /* |
| 2882 | * this is the entry point to schedule() from in-kernel preemption | 2919 | * this is the entry point to schedule() from in-kernel preemption |
| @@ -2892,17 +2929,7 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) | |||
| 2892 | if (likely(!preemptible())) | 2929 | if (likely(!preemptible())) |
| 2893 | return; | 2930 | return; |
| 2894 | 2931 | ||
| 2895 | do { | 2932 | preempt_schedule_common(); |
| 2896 | __preempt_count_add(PREEMPT_ACTIVE); | ||
| 2897 | __schedule(); | ||
| 2898 | __preempt_count_sub(PREEMPT_ACTIVE); | ||
| 2899 | |||
| 2900 | /* | ||
| 2901 | * Check again in case we missed a preemption opportunity | ||
| 2902 | * between schedule and now. | ||
| 2903 | */ | ||
| 2904 | barrier(); | ||
| 2905 | } while (need_resched()); | ||
| 2906 | } | 2933 | } |
| 2907 | NOKPROBE_SYMBOL(preempt_schedule); | 2934 | NOKPROBE_SYMBOL(preempt_schedule); |
| 2908 | EXPORT_SYMBOL(preempt_schedule); | 2935 | EXPORT_SYMBOL(preempt_schedule); |
| @@ -3251,15 +3278,31 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) | |||
| 3251 | { | 3278 | { |
| 3252 | struct sched_dl_entity *dl_se = &p->dl; | 3279 | struct sched_dl_entity *dl_se = &p->dl; |
| 3253 | 3280 | ||
| 3254 | init_dl_task_timer(dl_se); | ||
| 3255 | dl_se->dl_runtime = attr->sched_runtime; | 3281 | dl_se->dl_runtime = attr->sched_runtime; |
| 3256 | dl_se->dl_deadline = attr->sched_deadline; | 3282 | dl_se->dl_deadline = attr->sched_deadline; |
| 3257 | dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; | 3283 | dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; |
| 3258 | dl_se->flags = attr->sched_flags; | 3284 | dl_se->flags = attr->sched_flags; |
| 3259 | dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); | 3285 | dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); |
| 3260 | dl_se->dl_throttled = 0; | 3286 | |
| 3261 | dl_se->dl_new = 1; | 3287 | /* |
| 3262 | dl_se->dl_yielded = 0; | 3288 | * Changing the parameters of a task is 'tricky' and we're not doing |
| 3289 | * the correct thing -- also see task_dead_dl() and switched_from_dl(). | ||
| 3290 | * | ||
| 3291 | * What we SHOULD do is delay the bandwidth release until the 0-lag | ||
| 3292 | * point. This would include retaining the task_struct until that time | ||
| 3293 | * and change dl_overflow() to not immediately decrement the current | ||
| 3294 | * amount. | ||
| 3295 | * | ||
| 3296 | * Instead we retain the current runtime/deadline and let the new | ||
| 3297 | * parameters take effect after the current reservation period lapses. | ||
| 3298 | * This is safe (albeit pessimistic) because the 0-lag point is always | ||
| 3299 | * before the current scheduling deadline. | ||
| 3300 | * | ||
| 3301 | * We can still have temporary overloads because we do not delay the | ||
| 3302 | * change in bandwidth until that time; so admission control is | ||
| 3303 | * not on the safe side. It does however guarantee tasks will never | ||
| 3304 | * consume more than promised. | ||
| 3305 | */ | ||
| 3263 | } | 3306 | } |
| 3264 | 3307 | ||
| 3265 | /* | 3308 | /* |
| @@ -3382,6 +3425,20 @@ static bool check_same_owner(struct task_struct *p) | |||
| 3382 | return match; | 3425 | return match; |
| 3383 | } | 3426 | } |
| 3384 | 3427 | ||
| 3428 | static bool dl_param_changed(struct task_struct *p, | ||
| 3429 | const struct sched_attr *attr) | ||
| 3430 | { | ||
| 3431 | struct sched_dl_entity *dl_se = &p->dl; | ||
| 3432 | |||
| 3433 | if (dl_se->dl_runtime != attr->sched_runtime || | ||
| 3434 | dl_se->dl_deadline != attr->sched_deadline || | ||
| 3435 | dl_se->dl_period != attr->sched_period || | ||
| 3436 | dl_se->flags != attr->sched_flags) | ||
| 3437 | return true; | ||
| 3438 | |||
| 3439 | return false; | ||
| 3440 | } | ||
| 3441 | |||
| 3385 | static int __sched_setscheduler(struct task_struct *p, | 3442 | static int __sched_setscheduler(struct task_struct *p, |
| 3386 | const struct sched_attr *attr, | 3443 | const struct sched_attr *attr, |
| 3387 | bool user) | 3444 | bool user) |
| @@ -3510,7 +3567,7 @@ recheck: | |||
| 3510 | goto change; | 3567 | goto change; |
| 3511 | if (rt_policy(policy) && attr->sched_priority != p->rt_priority) | 3568 | if (rt_policy(policy) && attr->sched_priority != p->rt_priority) |
| 3512 | goto change; | 3569 | goto change; |
| 3513 | if (dl_policy(policy)) | 3570 | if (dl_policy(policy) && dl_param_changed(p, attr)) |
| 3514 | goto change; | 3571 | goto change; |
| 3515 | 3572 | ||
| 3516 | p->sched_reset_on_fork = reset_on_fork; | 3573 | p->sched_reset_on_fork = reset_on_fork; |
| @@ -4202,17 +4259,10 @@ SYSCALL_DEFINE0(sched_yield) | |||
| 4202 | return 0; | 4259 | return 0; |
| 4203 | } | 4260 | } |
| 4204 | 4261 | ||
| 4205 | static void __cond_resched(void) | ||
| 4206 | { | ||
| 4207 | __preempt_count_add(PREEMPT_ACTIVE); | ||
| 4208 | __schedule(); | ||
| 4209 | __preempt_count_sub(PREEMPT_ACTIVE); | ||
| 4210 | } | ||
| 4211 | |||
| 4212 | int __sched _cond_resched(void) | 4262 | int __sched _cond_resched(void) |
| 4213 | { | 4263 | { |
| 4214 | if (should_resched()) { | 4264 | if (should_resched()) { |
| 4215 | __cond_resched(); | 4265 | preempt_schedule_common(); |
| 4216 | return 1; | 4266 | return 1; |
| 4217 | } | 4267 | } |
| 4218 | return 0; | 4268 | return 0; |
| @@ -4237,7 +4287,7 @@ int __cond_resched_lock(spinlock_t *lock) | |||
| 4237 | if (spin_needbreak(lock) || resched) { | 4287 | if (spin_needbreak(lock) || resched) { |
| 4238 | spin_unlock(lock); | 4288 | spin_unlock(lock); |
| 4239 | if (resched) | 4289 | if (resched) |
| 4240 | __cond_resched(); | 4290 | preempt_schedule_common(); |
| 4241 | else | 4291 | else |
| 4242 | cpu_relax(); | 4292 | cpu_relax(); |
| 4243 | ret = 1; | 4293 | ret = 1; |
| @@ -4253,7 +4303,7 @@ int __sched __cond_resched_softirq(void) | |||
| 4253 | 4303 | ||
| 4254 | if (should_resched()) { | 4304 | if (should_resched()) { |
| 4255 | local_bh_enable(); | 4305 | local_bh_enable(); |
| 4256 | __cond_resched(); | 4306 | preempt_schedule_common(); |
| 4257 | local_bh_disable(); | 4307 | local_bh_disable(); |
| 4258 | return 1; | 4308 | return 1; |
| 4259 | } | 4309 | } |
| @@ -4508,9 +4558,10 @@ void sched_show_task(struct task_struct *p) | |||
| 4508 | { | 4558 | { |
| 4509 | unsigned long free = 0; | 4559 | unsigned long free = 0; |
| 4510 | int ppid; | 4560 | int ppid; |
| 4511 | unsigned state; | 4561 | unsigned long state = p->state; |
| 4512 | 4562 | ||
| 4513 | state = p->state ? __ffs(p->state) + 1 : 0; | 4563 | if (state) |
| 4564 | state = __ffs(state) + 1; | ||
| 4514 | printk(KERN_INFO "%-15.15s %c", p->comm, | 4565 | printk(KERN_INFO "%-15.15s %c", p->comm, |
| 4515 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); | 4566 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); |
| 4516 | #if BITS_PER_LONG == 32 | 4567 | #if BITS_PER_LONG == 32 |
| @@ -4642,6 +4693,9 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur, | |||
| 4642 | struct dl_bw *cur_dl_b; | 4693 | struct dl_bw *cur_dl_b; |
| 4643 | unsigned long flags; | 4694 | unsigned long flags; |
| 4644 | 4695 | ||
| 4696 | if (!cpumask_weight(cur)) | ||
| 4697 | return ret; | ||
| 4698 | |||
| 4645 | rcu_read_lock_sched(); | 4699 | rcu_read_lock_sched(); |
| 4646 | cur_dl_b = dl_bw_of(cpumask_any(cur)); | 4700 | cur_dl_b = dl_bw_of(cpumask_any(cur)); |
| 4647 | trial_cpus = cpumask_weight(trial); | 4701 | trial_cpus = cpumask_weight(trial); |
| @@ -4740,7 +4794,7 @@ static struct rq *move_queued_task(struct task_struct *p, int new_cpu) | |||
| 4740 | 4794 | ||
| 4741 | void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | 4795 | void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) |
| 4742 | { | 4796 | { |
| 4743 | if (p->sched_class && p->sched_class->set_cpus_allowed) | 4797 | if (p->sched_class->set_cpus_allowed) |
| 4744 | p->sched_class->set_cpus_allowed(p, new_mask); | 4798 | p->sched_class->set_cpus_allowed(p, new_mask); |
| 4745 | 4799 | ||
| 4746 | cpumask_copy(&p->cpus_allowed, new_mask); | 4800 | cpumask_copy(&p->cpus_allowed, new_mask); |
| @@ -7113,9 +7167,6 @@ void __init sched_init(void) | |||
| 7113 | #ifdef CONFIG_RT_GROUP_SCHED | 7167 | #ifdef CONFIG_RT_GROUP_SCHED |
| 7114 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); | 7168 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); |
| 7115 | #endif | 7169 | #endif |
| 7116 | #ifdef CONFIG_CPUMASK_OFFSTACK | ||
| 7117 | alloc_size += num_possible_cpus() * cpumask_size(); | ||
| 7118 | #endif | ||
| 7119 | if (alloc_size) { | 7170 | if (alloc_size) { |
| 7120 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); | 7171 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); |
| 7121 | 7172 | ||
| @@ -7135,13 +7186,13 @@ void __init sched_init(void) | |||
| 7135 | ptr += nr_cpu_ids * sizeof(void **); | 7186 | ptr += nr_cpu_ids * sizeof(void **); |
| 7136 | 7187 | ||
| 7137 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7188 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 7189 | } | ||
| 7138 | #ifdef CONFIG_CPUMASK_OFFSTACK | 7190 | #ifdef CONFIG_CPUMASK_OFFSTACK |
| 7139 | for_each_possible_cpu(i) { | 7191 | for_each_possible_cpu(i) { |
| 7140 | per_cpu(load_balance_mask, i) = (void *)ptr; | 7192 | per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( |
| 7141 | ptr += cpumask_size(); | 7193 | cpumask_size(), GFP_KERNEL, cpu_to_node(i)); |
| 7142 | } | ||
| 7143 | #endif /* CONFIG_CPUMASK_OFFSTACK */ | ||
| 7144 | } | 7194 | } |
| 7195 | #endif /* CONFIG_CPUMASK_OFFSTACK */ | ||
| 7145 | 7196 | ||
| 7146 | init_rt_bandwidth(&def_rt_bandwidth, | 7197 | init_rt_bandwidth(&def_rt_bandwidth, |
| 7147 | global_rt_period(), global_rt_runtime()); | 7198 | global_rt_period(), global_rt_runtime()); |
| @@ -7253,6 +7304,11 @@ void __init sched_init(void) | |||
| 7253 | enter_lazy_tlb(&init_mm, current); | 7304 | enter_lazy_tlb(&init_mm, current); |
| 7254 | 7305 | ||
| 7255 | /* | 7306 | /* |
| 7307 | * During early bootup we pretend to be a normal task: | ||
| 7308 | */ | ||
| 7309 | current->sched_class = &fair_sched_class; | ||
| 7310 | |||
| 7311 | /* | ||
| 7256 | * Make us the idle thread. Technically, schedule() should not be | 7312 | * Make us the idle thread. Technically, schedule() should not be |
| 7257 | * called from this thread, however somewhere below it might be, | 7313 | * called from this thread, however somewhere below it might be, |
| 7258 | * but because we are the idle thread, we just pick up running again | 7314 | * but because we are the idle thread, we just pick up running again |
| @@ -7262,11 +7318,6 @@ void __init sched_init(void) | |||
| 7262 | 7318 | ||
| 7263 | calc_load_update = jiffies + LOAD_FREQ; | 7319 | calc_load_update = jiffies + LOAD_FREQ; |
| 7264 | 7320 | ||
| 7265 | /* | ||
| 7266 | * During early bootup we pretend to be a normal task: | ||
| 7267 | */ | ||
| 7268 | current->sched_class = &fair_sched_class; | ||
| 7269 | |||
| 7270 | #ifdef CONFIG_SMP | 7321 | #ifdef CONFIG_SMP |
| 7271 | zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); | 7322 | zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); |
| 7272 | /* May be allocated at isolcpus cmdline parse time */ | 7323 | /* May be allocated at isolcpus cmdline parse time */ |
| @@ -7295,13 +7346,12 @@ void __might_sleep(const char *file, int line, int preempt_offset) | |||
| 7295 | * since we will exit with TASK_RUNNING make sure we enter with it, | 7346 | * since we will exit with TASK_RUNNING make sure we enter with it, |
| 7296 | * otherwise we will destroy state. | 7347 | * otherwise we will destroy state. |
| 7297 | */ | 7348 | */ |
| 7298 | if (WARN_ONCE(current->state != TASK_RUNNING, | 7349 | WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, |
| 7299 | "do not call blocking ops when !TASK_RUNNING; " | 7350 | "do not call blocking ops when !TASK_RUNNING; " |
| 7300 | "state=%lx set at [<%p>] %pS\n", | 7351 | "state=%lx set at [<%p>] %pS\n", |
| 7301 | current->state, | 7352 | current->state, |
| 7302 | (void *)current->task_state_change, | 7353 | (void *)current->task_state_change, |
| 7303 | (void *)current->task_state_change)) | 7354 | (void *)current->task_state_change); |
| 7304 | __set_current_state(TASK_RUNNING); | ||
| 7305 | 7355 | ||
| 7306 | ___might_sleep(file, line, preempt_offset); | 7356 | ___might_sleep(file, line, preempt_offset); |
| 7307 | } | 7357 | } |
| @@ -7328,6 +7378,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset) | |||
| 7328 | in_atomic(), irqs_disabled(), | 7378 | in_atomic(), irqs_disabled(), |
| 7329 | current->pid, current->comm); | 7379 | current->pid, current->comm); |
| 7330 | 7380 | ||
| 7381 | if (task_stack_end_corrupted(current)) | ||
| 7382 | printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); | ||
| 7383 | |||
| 7331 | debug_show_held_locks(current); | 7384 | debug_show_held_locks(current); |
| 7332 | if (irqs_disabled()) | 7385 | if (irqs_disabled()) |
| 7333 | print_irqtrace_events(current); | 7386 | print_irqtrace_events(current); |
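Editor's note on the "0-lag point" that the comments in __setparam_dl() and dl_overflow() above keep deferring to: it is the instant at which a deadline entity's lag reaches zero, and it can be derived from the entity's own parameters. The helper below is purely illustrative (it is not added by this patch) and assumes the usual CBS definition of lag.

#include <linux/math64.h>
#include <linux/sched.h>

/* Hypothetical helper: lag(t) = runtime - (deadline - t) * dl_runtime/dl_period
 * reaches zero at  t0 = deadline - runtime * dl_period / dl_runtime,
 * which is never later than the current deadline while runtime >= 0. */
static u64 example_zero_lag_time(const struct sched_dl_entity *dl_se)
{
	return dl_se->deadline -
	       div64_long(dl_se->runtime * dl_se->dl_period, dl_se->dl_runtime);
}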
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 539ca3ce071b..c6acb07466bb 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c | |||
| @@ -107,7 +107,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, | |||
| 107 | int best_cpu = -1; | 107 | int best_cpu = -1; |
| 108 | const struct sched_dl_entity *dl_se = &p->dl; | 108 | const struct sched_dl_entity *dl_se = &p->dl; |
| 109 | 109 | ||
| 110 | if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) { | 110 | if (later_mask && |
| 111 | cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { | ||
| 111 | best_cpu = cpumask_any(later_mask); | 112 | best_cpu = cpumask_any(later_mask); |
| 112 | goto out; | 113 | goto out; |
| 113 | } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && | 114 | } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && |
| @@ -186,6 +187,26 @@ out: | |||
| 186 | } | 187 | } |
| 187 | 188 | ||
| 188 | /* | 189 | /* |
| 190 | * cpudl_set_freecpu - Set the cpudl.free_cpus | ||
| 191 | * @cp: the cpudl max-heap context | ||
| 192 | * @cpu: rd attached cpu | ||
| 193 | */ | ||
| 194 | void cpudl_set_freecpu(struct cpudl *cp, int cpu) | ||
| 195 | { | ||
| 196 | cpumask_set_cpu(cpu, cp->free_cpus); | ||
| 197 | } | ||
| 198 | |||
| 199 | /* | ||
| 200 | * cpudl_clear_freecpu - Clear the cpudl.free_cpus | ||
| 201 | * @cp: the cpudl max-heap context | ||
| 202 | * @cpu: rd attached cpu | ||
| 203 | */ | ||
| 204 | void cpudl_clear_freecpu(struct cpudl *cp, int cpu) | ||
| 205 | { | ||
| 206 | cpumask_clear_cpu(cpu, cp->free_cpus); | ||
| 207 | } | ||
| 208 | |||
| 209 | /* | ||
| 189 | * cpudl_init - initialize the cpudl structure | 210 | * cpudl_init - initialize the cpudl structure |
| 190 | * @cp: the cpudl max-heap context | 211 | * @cp: the cpudl max-heap context |
| 191 | */ | 212 | */ |
| @@ -203,7 +224,7 @@ int cpudl_init(struct cpudl *cp) | |||
| 203 | if (!cp->elements) | 224 | if (!cp->elements) |
| 204 | return -ENOMEM; | 225 | return -ENOMEM; |
| 205 | 226 | ||
| 206 | if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) { | 227 | if (!zalloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) { |
| 207 | kfree(cp->elements); | 228 | kfree(cp->elements); |
| 208 | return -ENOMEM; | 229 | return -ENOMEM; |
| 209 | } | 230 | } |
| @@ -211,8 +232,6 @@ int cpudl_init(struct cpudl *cp) | |||
| 211 | for_each_possible_cpu(i) | 232 | for_each_possible_cpu(i) |
| 212 | cp->elements[i].idx = IDX_INVALID; | 233 | cp->elements[i].idx = IDX_INVALID; |
| 213 | 234 | ||
| 214 | cpumask_setall(cp->free_cpus); | ||
| 215 | |||
| 216 | return 0; | 235 | return 0; |
| 217 | } | 236 | } |
| 218 | 237 | ||
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 020039bd1326..1a0a6ef2fbe1 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h | |||
| @@ -24,6 +24,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, | |||
| 24 | struct cpumask *later_mask); | 24 | struct cpumask *later_mask); |
| 25 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); | 25 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); |
| 26 | int cpudl_init(struct cpudl *cp); | 26 | int cpudl_init(struct cpudl *cp); |
| 27 | void cpudl_set_freecpu(struct cpudl *cp, int cpu); | ||
| 28 | void cpudl_clear_freecpu(struct cpudl *cp, int cpu); | ||
| 27 | void cpudl_cleanup(struct cpudl *cp); | 29 | void cpudl_cleanup(struct cpudl *cp); |
| 28 | #endif /* CONFIG_SMP */ | 30 | #endif /* CONFIG_SMP */ |
| 29 | 31 | ||
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index e5db8c6feebd..a027799ae130 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
| @@ -350,6 +350,11 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, | |||
| 350 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | 350 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; |
| 351 | dl_se->runtime = pi_se->dl_runtime; | 351 | dl_se->runtime = pi_se->dl_runtime; |
| 352 | } | 352 | } |
| 353 | |||
| 354 | if (dl_se->dl_yielded) | ||
| 355 | dl_se->dl_yielded = 0; | ||
| 356 | if (dl_se->dl_throttled) | ||
| 357 | dl_se->dl_throttled = 0; | ||
| 353 | } | 358 | } |
| 354 | 359 | ||
| 355 | /* | 360 | /* |
| @@ -536,23 +541,19 @@ again: | |||
| 536 | 541 | ||
| 537 | sched_clock_tick(); | 542 | sched_clock_tick(); |
| 538 | update_rq_clock(rq); | 543 | update_rq_clock(rq); |
| 539 | dl_se->dl_throttled = 0; | 544 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); |
| 540 | dl_se->dl_yielded = 0; | 545 | if (dl_task(rq->curr)) |
| 541 | if (task_on_rq_queued(p)) { | 546 | check_preempt_curr_dl(rq, p, 0); |
| 542 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); | 547 | else |
| 543 | if (dl_task(rq->curr)) | 548 | resched_curr(rq); |
| 544 | check_preempt_curr_dl(rq, p, 0); | ||
| 545 | else | ||
| 546 | resched_curr(rq); | ||
| 547 | #ifdef CONFIG_SMP | 549 | #ifdef CONFIG_SMP |
| 548 | /* | 550 | /* |
| 549 | * Queueing this task back might have overloaded rq, | 551 | * Queueing this task back might have overloaded rq, |
| 550 | * check if we need to kick someone away. | 552 | * check if we need to kick someone away. |
| 551 | */ | 553 | */ |
| 552 | if (has_pushable_dl_tasks(rq)) | 554 | if (has_pushable_dl_tasks(rq)) |
| 553 | push_dl_task(rq); | 555 | push_dl_task(rq); |
| 554 | #endif | 556 | #endif |
| 555 | } | ||
| 556 | unlock: | 557 | unlock: |
| 557 | raw_spin_unlock(&rq->lock); | 558 | raw_spin_unlock(&rq->lock); |
| 558 | 559 | ||
| @@ -570,24 +571,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) | |||
| 570 | static | 571 | static |
| 571 | int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) | 572 | int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) |
| 572 | { | 573 | { |
| 573 | int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq)); | 574 | return (dl_se->runtime <= 0); |
| 574 | int rorun = dl_se->runtime <= 0; | ||
| 575 | |||
| 576 | if (!rorun && !dmiss) | ||
| 577 | return 0; | ||
| 578 | |||
| 579 | /* | ||
| 580 | * If we are beyond our current deadline and we are still | ||
| 581 | * executing, then we have already used some of the runtime of | ||
| 582 | * the next instance. Thus, if we do not account that, we are | ||
| 583 | * stealing bandwidth from the system at each deadline miss! | ||
| 584 | */ | ||
| 585 | if (dmiss) { | ||
| 586 | dl_se->runtime = rorun ? dl_se->runtime : 0; | ||
| 587 | dl_se->runtime -= rq_clock(rq) - dl_se->deadline; | ||
| 588 | } | ||
| 589 | |||
| 590 | return 1; | ||
| 591 | } | 575 | } |
| 592 | 576 | ||
| 593 | extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); | 577 | extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); |
| @@ -630,10 +614,9 @@ static void update_curr_dl(struct rq *rq) | |||
| 630 | 614 | ||
| 631 | dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; | 615 | dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; |
| 632 | if (dl_runtime_exceeded(rq, dl_se)) { | 616 | if (dl_runtime_exceeded(rq, dl_se)) { |
| 617 | dl_se->dl_throttled = 1; | ||
| 633 | __dequeue_task_dl(rq, curr, 0); | 618 | __dequeue_task_dl(rq, curr, 0); |
| 634 | if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) | 619 | if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted))) |
| 635 | dl_se->dl_throttled = 1; | ||
| 636 | else | ||
| 637 | enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); | 620 | enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); |
| 638 | 621 | ||
| 639 | if (!is_leftmost(curr, &rq->dl)) | 622 | if (!is_leftmost(curr, &rq->dl)) |
| @@ -826,10 +809,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, | |||
| 826 | * parameters of the task might need updating. Otherwise, | 809 | * parameters of the task might need updating. Otherwise, |
| 827 | * we want a replenishment of its runtime. | 810 | * we want a replenishment of its runtime. |
| 828 | */ | 811 | */ |
| 829 | if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH) | 812 | if (dl_se->dl_new || flags & ENQUEUE_WAKEUP) |
| 830 | replenish_dl_entity(dl_se, pi_se); | ||
| 831 | else | ||
| 832 | update_dl_entity(dl_se, pi_se); | 813 | update_dl_entity(dl_se, pi_se); |
| 814 | else if (flags & ENQUEUE_REPLENISH) | ||
| 815 | replenish_dl_entity(dl_se, pi_se); | ||
| 833 | 816 | ||
| 834 | __enqueue_dl_entity(dl_se); | 817 | __enqueue_dl_entity(dl_se); |
| 835 | } | 818 | } |
| @@ -870,7 +853,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) | |||
| 870 | * its rq, the bandwidth timer callback (which clearly has not | 853 | * its rq, the bandwidth timer callback (which clearly has not |
| 871 | * run yet) will take care of this. | 854 | * run yet) will take care of this. |
| 872 | */ | 855 | */ |
| 873 | if (p->dl.dl_throttled) | 856 | if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) |
| 874 | return; | 857 | return; |
| 875 | 858 | ||
| 876 | enqueue_dl_entity(&p->dl, pi_se, flags); | 859 | enqueue_dl_entity(&p->dl, pi_se, flags); |
| @@ -1090,7 +1073,13 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) | |||
| 1090 | { | 1073 | { |
| 1091 | update_curr_dl(rq); | 1074 | update_curr_dl(rq); |
| 1092 | 1075 | ||
| 1093 | if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) | 1076 | /* |
| 1077 | * Even when we have runtime, update_curr_dl() might have resulted in us | ||
| 1078 | * not being the leftmost task anymore. In that case NEED_RESCHED will | ||
| 1079 | * be set and schedule() will start a new hrtick for the next task. | ||
| 1080 | */ | ||
| 1081 | if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 && | ||
| 1082 | is_leftmost(p, &rq->dl)) | ||
| 1094 | start_hrtick_dl(rq, p); | 1083 | start_hrtick_dl(rq, p); |
| 1095 | } | 1084 | } |
| 1096 | 1085 | ||
| @@ -1111,6 +1100,7 @@ static void task_dead_dl(struct task_struct *p) | |||
| 1111 | * Since we are TASK_DEAD we won't slip out of the domain! | 1100 | * Since we are TASK_DEAD we won't slip out of the domain! |
| 1112 | */ | 1101 | */ |
| 1113 | raw_spin_lock_irq(&dl_b->lock); | 1102 | raw_spin_lock_irq(&dl_b->lock); |
| 1103 | /* XXX we should retain the bw until 0-lag */ | ||
| 1114 | dl_b->total_bw -= p->dl.dl_bw; | 1104 | dl_b->total_bw -= p->dl.dl_bw; |
| 1115 | raw_spin_unlock_irq(&dl_b->lock); | 1105 | raw_spin_unlock_irq(&dl_b->lock); |
| 1116 | 1106 | ||
| @@ -1182,9 +1172,6 @@ static int find_later_rq(struct task_struct *task) | |||
| 1182 | * We have to consider system topology and task affinity | 1172 | * We have to consider system topology and task affinity |
| 1183 | * first, then we can look for a suitable cpu. | 1173 | * first, then we can look for a suitable cpu. |
| 1184 | */ | 1174 | */ |
| 1185 | cpumask_copy(later_mask, task_rq(task)->rd->span); | ||
| 1186 | cpumask_and(later_mask, later_mask, cpu_active_mask); | ||
| 1187 | cpumask_and(later_mask, later_mask, &task->cpus_allowed); | ||
| 1188 | best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, | 1175 | best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, |
| 1189 | task, later_mask); | 1176 | task, later_mask); |
| 1190 | if (best_cpu == -1) | 1177 | if (best_cpu == -1) |
| @@ -1579,6 +1566,7 @@ static void rq_online_dl(struct rq *rq) | |||
| 1579 | if (rq->dl.overloaded) | 1566 | if (rq->dl.overloaded) |
| 1580 | dl_set_overload(rq); | 1567 | dl_set_overload(rq); |
| 1581 | 1568 | ||
| 1569 | cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); | ||
| 1582 | if (rq->dl.dl_nr_running > 0) | 1570 | if (rq->dl.dl_nr_running > 0) |
| 1583 | cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); | 1571 | cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); |
| 1584 | } | 1572 | } |
| @@ -1590,6 +1578,7 @@ static void rq_offline_dl(struct rq *rq) | |||
| 1590 | dl_clear_overload(rq); | 1578 | dl_clear_overload(rq); |
| 1591 | 1579 | ||
| 1592 | cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); | 1580 | cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); |
| 1581 | cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); | ||
| 1593 | } | 1582 | } |
| 1594 | 1583 | ||
| 1595 | void init_sched_dl_class(void) | 1584 | void init_sched_dl_class(void) |
| @@ -1631,8 +1620,8 @@ static void cancel_dl_timer(struct rq *rq, struct task_struct *p) | |||
| 1631 | 1620 | ||
| 1632 | static void switched_from_dl(struct rq *rq, struct task_struct *p) | 1621 | static void switched_from_dl(struct rq *rq, struct task_struct *p) |
| 1633 | { | 1622 | { |
| 1623 | /* XXX we should retain the bw until 0-lag */ | ||
| 1634 | cancel_dl_timer(rq, p); | 1624 | cancel_dl_timer(rq, p); |
| 1635 | |||
| 1636 | __dl_clear_params(p); | 1625 | __dl_clear_params(p); |
| 1637 | 1626 | ||
| 1638 | /* | 1627 | /* |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 92cc52001e74..8baaf858d25c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
| @@ -305,6 +305,7 @@ do { \ | |||
| 305 | PN(next_balance); | 305 | PN(next_balance); |
| 306 | SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); | 306 | SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); |
| 307 | PN(clock); | 307 | PN(clock); |
| 308 | PN(clock_task); | ||
| 308 | P(cpu_load[0]); | 309 | P(cpu_load[0]); |
| 309 | P(cpu_load[1]); | 310 | P(cpu_load[1]); |
| 310 | P(cpu_load[2]); | 311 | P(cpu_load[2]); |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index df2cdf77f899..7ce18f3c097a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -676,7 +676,6 @@ void init_task_runnable_average(struct task_struct *p) | |||
| 676 | { | 676 | { |
| 677 | u32 slice; | 677 | u32 slice; |
| 678 | 678 | ||
| 679 | p->se.avg.decay_count = 0; | ||
| 680 | slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; | 679 | slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; |
| 681 | p->se.avg.runnable_avg_sum = slice; | 680 | p->se.avg.runnable_avg_sum = slice; |
| 682 | p->se.avg.runnable_avg_period = slice; | 681 | p->se.avg.runnable_avg_period = slice; |
| @@ -1730,7 +1729,7 @@ static int preferred_group_nid(struct task_struct *p, int nid) | |||
| 1730 | nodes = node_online_map; | 1729 | nodes = node_online_map; |
| 1731 | for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { | 1730 | for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { |
| 1732 | unsigned long max_faults = 0; | 1731 | unsigned long max_faults = 0; |
| 1733 | nodemask_t max_group; | 1732 | nodemask_t max_group = NODE_MASK_NONE; |
| 1734 | int a, b; | 1733 | int a, b; |
| 1735 | 1734 | ||
| 1736 | /* Are there nodes at this distance from each other? */ | 1735 | /* Are there nodes at this distance from each other? */ |
| @@ -2574,11 +2573,11 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se) | |||
| 2574 | u64 decays = atomic64_read(&cfs_rq->decay_counter); | 2573 | u64 decays = atomic64_read(&cfs_rq->decay_counter); |
| 2575 | 2574 | ||
| 2576 | decays -= se->avg.decay_count; | 2575 | decays -= se->avg.decay_count; |
| 2576 | se->avg.decay_count = 0; | ||
| 2577 | if (!decays) | 2577 | if (!decays) |
| 2578 | return 0; | 2578 | return 0; |
| 2579 | 2579 | ||
| 2580 | se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); | 2580 | se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); |
| 2581 | se->avg.decay_count = 0; | ||
| 2582 | 2581 | ||
| 2583 | return decays; | 2582 | return decays; |
| 2584 | } | 2583 | } |
| @@ -4005,6 +4004,10 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force) | |||
| 4005 | 4004 | ||
| 4006 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | 4005 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) |
| 4007 | { | 4006 | { |
| 4007 | /* init_cfs_bandwidth() was not called */ | ||
| 4008 | if (!cfs_b->throttled_cfs_rq.next) | ||
| 4009 | return; | ||
| 4010 | |||
| 4008 | hrtimer_cancel(&cfs_b->period_timer); | 4011 | hrtimer_cancel(&cfs_b->period_timer); |
| 4009 | hrtimer_cancel(&cfs_b->slack_timer); | 4012 | hrtimer_cancel(&cfs_b->slack_timer); |
| 4010 | } | 4013 | } |
| @@ -4424,7 +4427,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
| 4424 | * wl = S * s'_i; see (2) | 4427 | * wl = S * s'_i; see (2) |
| 4425 | */ | 4428 | */ |
| 4426 | if (W > 0 && w < W) | 4429 | if (W > 0 && w < W) |
| 4427 | wl = (w * tg->shares) / W; | 4430 | wl = (w * (long)tg->shares) / W; |
| 4428 | else | 4431 | else |
| 4429 | wl = tg->shares; | 4432 | wl = tg->shares; |
| 4430 | 4433 | ||
| @@ -5153,7 +5156,7 @@ static void yield_task_fair(struct rq *rq) | |||
| 5153 | * so we don't do microscopic update in schedule() | 5156 | * so we don't do microscopic update in schedule() |
| 5154 | * and double the fastpath cost. | 5157 | * and double the fastpath cost. |
| 5155 | */ | 5158 | */ |
| 5156 | rq->skip_clock_update = 1; | 5159 | rq_clock_skip_update(rq, true); |
| 5157 | } | 5160 | } |
| 5158 | 5161 | ||
| 5159 | set_skip_buddy(se); | 5162 | set_skip_buddy(se); |
| @@ -5945,8 +5948,8 @@ static unsigned long scale_rt_capacity(int cpu) | |||
| 5945 | */ | 5948 | */ |
| 5946 | age_stamp = ACCESS_ONCE(rq->age_stamp); | 5949 | age_stamp = ACCESS_ONCE(rq->age_stamp); |
| 5947 | avg = ACCESS_ONCE(rq->rt_avg); | 5950 | avg = ACCESS_ONCE(rq->rt_avg); |
| 5951 | delta = __rq_clock_broken(rq) - age_stamp; | ||
| 5948 | 5952 | ||
| 5949 | delta = rq_clock(rq) - age_stamp; | ||
| 5950 | if (unlikely(delta < 0)) | 5953 | if (unlikely(delta < 0)) |
| 5951 | delta = 0; | 5954 | delta = 0; |
| 5952 | 5955 | ||
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c47fce75e666..aaf1c1d5cf5d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
| @@ -47,7 +47,8 @@ static inline int cpu_idle_poll(void) | |||
| 47 | rcu_idle_enter(); | 47 | rcu_idle_enter(); |
| 48 | trace_cpu_idle_rcuidle(0, smp_processor_id()); | 48 | trace_cpu_idle_rcuidle(0, smp_processor_id()); |
| 49 | local_irq_enable(); | 49 | local_irq_enable(); |
| 50 | while (!tif_need_resched()) | 50 | while (!tif_need_resched() && |
| 51 | (cpu_idle_force_poll || tick_check_broadcast_expired())) | ||
| 51 | cpu_relax(); | 52 | cpu_relax(); |
| 52 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); | 53 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); |
| 53 | rcu_idle_exit(); | 54 | rcu_idle_exit(); |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ee15f5a0d1c1..f4d4b077eba0 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
| @@ -831,11 +831,14 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | |||
| 831 | enqueue = 1; | 831 | enqueue = 1; |
| 832 | 832 | ||
| 833 | /* | 833 | /* |
| 834 | * Force a clock update if the CPU was idle, | 834 | * When we're idle and a woken (rt) task is |
| 835 | * lest wakeup -> unthrottle time accumulate. | 835 | * throttled check_preempt_curr() will set |
| 836 | * skip_update and the time between the wakeup | ||
| 837 | * and this unthrottle will get accounted as | ||
| 838 | * 'runtime'. | ||
| 836 | */ | 839 | */ |
| 837 | if (rt_rq->rt_nr_running && rq->curr == rq->idle) | 840 | if (rt_rq->rt_nr_running && rq->curr == rq->idle) |
| 838 | rq->skip_clock_update = -1; | 841 | rq_clock_skip_update(rq, false); |
| 839 | } | 842 | } |
| 840 | if (rt_rq->rt_time || rt_rq->rt_nr_running) | 843 | if (rt_rq->rt_time || rt_rq->rt_nr_running) |
| 841 | idle = 0; | 844 | idle = 0; |
| @@ -1337,7 +1340,12 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) | |||
| 1337 | curr->prio <= p->prio)) { | 1340 | curr->prio <= p->prio)) { |
| 1338 | int target = find_lowest_rq(p); | 1341 | int target = find_lowest_rq(p); |
| 1339 | 1342 | ||
| 1340 | if (target != -1) | 1343 | /* |
| 1344 | * Don't bother moving it if the destination CPU is | ||
| 1345 | * not running a lower priority task. | ||
| 1346 | */ | ||
| 1347 | if (target != -1 && | ||
| 1348 | p->prio < cpu_rq(target)->rt.highest_prio.curr) | ||
| 1341 | cpu = target; | 1349 | cpu = target; |
| 1342 | } | 1350 | } |
| 1343 | rcu_read_unlock(); | 1351 | rcu_read_unlock(); |
| @@ -1614,6 +1622,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | |||
| 1614 | 1622 | ||
| 1615 | lowest_rq = cpu_rq(cpu); | 1623 | lowest_rq = cpu_rq(cpu); |
| 1616 | 1624 | ||
| 1625 | if (lowest_rq->rt.highest_prio.curr <= task->prio) { | ||
| 1626 | /* | ||
| 1627 | * Target rq has tasks of equal or higher priority, | ||
| 1628 | * retrying does not release any lock and is unlikely | ||
| 1629 | * to yield a different result. | ||
| 1630 | */ | ||
| 1631 | lowest_rq = NULL; | ||
| 1632 | break; | ||
| 1633 | } | ||
| 1634 | |||
| 1617 | /* if the prio of this runqueue changed, try again */ | 1635 | /* if the prio of this runqueue changed, try again */ |
| 1618 | if (double_lock_balance(rq, lowest_rq)) { | 1636 | if (double_lock_balance(rq, lowest_rq)) { |
| 1619 | /* | 1637 | /* |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9a2a45c970e7..0870db23d79c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -558,8 +558,6 @@ struct rq { | |||
| 558 | #ifdef CONFIG_NO_HZ_FULL | 558 | #ifdef CONFIG_NO_HZ_FULL |
| 559 | unsigned long last_sched_tick; | 559 | unsigned long last_sched_tick; |
| 560 | #endif | 560 | #endif |
| 561 | int skip_clock_update; | ||
| 562 | |||
| 563 | /* capture load from *all* tasks on this cpu: */ | 561 | /* capture load from *all* tasks on this cpu: */ |
| 564 | struct load_weight load; | 562 | struct load_weight load; |
| 565 | unsigned long nr_load_updates; | 563 | unsigned long nr_load_updates; |
| @@ -588,6 +586,7 @@ struct rq { | |||
| 588 | unsigned long next_balance; | 586 | unsigned long next_balance; |
| 589 | struct mm_struct *prev_mm; | 587 | struct mm_struct *prev_mm; |
| 590 | 588 | ||
| 589 | unsigned int clock_skip_update; | ||
| 591 | u64 clock; | 590 | u64 clock; |
| 592 | u64 clock_task; | 591 | u64 clock_task; |
| 593 | 592 | ||
| @@ -687,16 +686,35 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | |||
| 687 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 686 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
| 688 | #define raw_rq() raw_cpu_ptr(&runqueues) | 687 | #define raw_rq() raw_cpu_ptr(&runqueues) |
| 689 | 688 | ||
| 689 | static inline u64 __rq_clock_broken(struct rq *rq) | ||
| 690 | { | ||
| 691 | return ACCESS_ONCE(rq->clock); | ||
| 692 | } | ||
| 693 | |||
| 690 | static inline u64 rq_clock(struct rq *rq) | 694 | static inline u64 rq_clock(struct rq *rq) |
| 691 | { | 695 | { |
| 696 | lockdep_assert_held(&rq->lock); | ||
| 692 | return rq->clock; | 697 | return rq->clock; |
| 693 | } | 698 | } |
| 694 | 699 | ||
| 695 | static inline u64 rq_clock_task(struct rq *rq) | 700 | static inline u64 rq_clock_task(struct rq *rq) |
| 696 | { | 701 | { |
| 702 | lockdep_assert_held(&rq->lock); | ||
| 697 | return rq->clock_task; | 703 | return rq->clock_task; |
| 698 | } | 704 | } |
| 699 | 705 | ||
| 706 | #define RQCF_REQ_SKIP 0x01 | ||
| 707 | #define RQCF_ACT_SKIP 0x02 | ||
| 708 | |||
| 709 | static inline void rq_clock_skip_update(struct rq *rq, bool skip) | ||
| 710 | { | ||
| 711 | lockdep_assert_held(&rq->lock); | ||
| 712 | if (skip) | ||
| 713 | rq->clock_skip_update |= RQCF_REQ_SKIP; | ||
| 714 | else | ||
| 715 | rq->clock_skip_update &= ~RQCF_REQ_SKIP; | ||
| 716 | } | ||
| 717 | |||
| 700 | #ifdef CONFIG_NUMA | 718 | #ifdef CONFIG_NUMA |
| 701 | enum numa_topology_type { | 719 | enum numa_topology_type { |
| 702 | NUMA_DIRECT, | 720 | NUMA_DIRECT, |
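
The sched.h hunks above replace the old rq->skip_clock_update integer with a small flag word: rq_clock_skip_update() only records a request (RQCF_REQ_SKIP), and the scheduler core is then expected to promote that request to RQCF_ACT_SKIP for exactly one clock update. The promotion step and the check inside update_rq_clock() are not shown in this diff, so the toy model below treats them as assumptions inferred from the flag names; it is a stand-alone user-space sketch of the intended two-stage behaviour, not kernel code:

	#include <stdio.h>

	#define RQCF_REQ_SKIP	0x01
	#define RQCF_ACT_SKIP	0x02

	struct toy_rq {
		unsigned int clock_skip_update;
		unsigned long long clock;
	};

	/* mirrors the new helper: record or drop a *request* to skip */
	static void rq_clock_skip_update(struct toy_rq *rq, int skip)
	{
		if (skip)
			rq->clock_skip_update |= RQCF_REQ_SKIP;
		else
			rq->clock_skip_update &= ~RQCF_REQ_SKIP;
	}

	/* assumption: only an *active* skip suppresses the update */
	static void update_rq_clock(struct toy_rq *rq, unsigned long long now)
	{
		if (rq->clock_skip_update & RQCF_ACT_SKIP)
			return;
		rq->clock = now;
	}

	int main(void)
	{
		struct toy_rq rq = { 0, 0 };

		rq_clock_skip_update(&rq, 1);	/* e.g. yield_task_fair() requests a skip */
		update_rq_clock(&rq, 100);	/* a bare request does not suppress updates */
		printf("after request: clock=%llu\n", rq.clock);

		rq.clock_skip_update <<= 1;	/* assumed promotion REQ -> ACT in the schedule path */
		update_rq_clock(&rq, 200);	/* this one update is skipped */
		printf("after promotion: clock=%llu\n", rq.clock);

		rq.clock_skip_update = 0;	/* flags cleared again afterwards */
		update_rq_clock(&rq, 300);
		printf("after clearing: clock=%llu\n", rq.clock);
		return 0;
	}

The split lets callers such as yield_task_fair() and do_sched_rt_period_timer() (both converted above) ask for a skip without racing with an update that is already in flight.
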
diff --git a/kernel/smpboot.c b/kernel/smpboot.c index f032fb5284e3..40190f28db35 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c | |||
| @@ -280,6 +280,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) | |||
| 280 | unsigned int cpu; | 280 | unsigned int cpu; |
| 281 | int ret = 0; | 281 | int ret = 0; |
| 282 | 282 | ||
| 283 | get_online_cpus(); | ||
| 283 | mutex_lock(&smpboot_threads_lock); | 284 | mutex_lock(&smpboot_threads_lock); |
| 284 | for_each_online_cpu(cpu) { | 285 | for_each_online_cpu(cpu) { |
| 285 | ret = __smpboot_create_thread(plug_thread, cpu); | 286 | ret = __smpboot_create_thread(plug_thread, cpu); |
| @@ -292,6 +293,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) | |||
| 292 | list_add(&plug_thread->list, &hotplug_threads); | 293 | list_add(&plug_thread->list, &hotplug_threads); |
| 293 | out: | 294 | out: |
| 294 | mutex_unlock(&smpboot_threads_lock); | 295 | mutex_unlock(&smpboot_threads_lock); |
| 296 | put_online_cpus(); | ||
| 295 | return ret; | 297 | return ret; |
| 296 | } | 298 | } |
| 297 | EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); | 299 | EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); |
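
The smpboot hunk brackets thread registration with get_online_cpus()/put_online_cpus() so that a CPU cannot come online between the for_each_online_cpu() walk and the list_add() onto hotplug_threads, which would leave that CPU without its per-CPU thread. Condensed shape of the patched function, using the names from the hunk above (error handling and the thread-creation call elided, so this is a skeleton rather than compilable code):

	get_online_cpus();			/* block CPU hotplug */
	mutex_lock(&smpboot_threads_lock);
	for_each_online_cpu(cpu) {
		/* __smpboot_create_thread(plug_thread, cpu) for each CPU ... */
	}
	list_add(&plug_thread->list, &hotplug_threads);
	mutex_unlock(&smpboot_threads_lock);
	put_online_cpus();			/* re-enable hotplug */

Holding the hotplug read lock across both steps is what closes the window; the mutex alone only serialises against other registrations, not against CPUs coming and going.
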
diff --git a/kernel/softirq.c b/kernel/softirq.c index 501baa9ac1be..479e4436f787 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
| @@ -114,8 +114,12 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) | |||
| 114 | trace_softirqs_off(ip); | 114 | trace_softirqs_off(ip); |
| 115 | raw_local_irq_restore(flags); | 115 | raw_local_irq_restore(flags); |
| 116 | 116 | ||
| 117 | if (preempt_count() == cnt) | 117 | if (preempt_count() == cnt) { |
| 118 | #ifdef CONFIG_DEBUG_PREEMPT | ||
| 119 | current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1); | ||
| 120 | #endif | ||
| 118 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | 121 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); |
| 122 | } | ||
| 119 | } | 123 | } |
| 120 | EXPORT_SYMBOL(__local_bh_disable_ip); | 124 | EXPORT_SYMBOL(__local_bh_disable_ip); |
| 121 | #endif /* CONFIG_TRACE_IRQFLAGS */ | 125 | #endif /* CONFIG_TRACE_IRQFLAGS */ |
| @@ -656,9 +660,8 @@ static void run_ksoftirqd(unsigned int cpu) | |||
| 656 | * in the task stack here. | 660 | * in the task stack here. |
| 657 | */ | 661 | */ |
| 658 | __do_softirq(); | 662 | __do_softirq(); |
| 659 | rcu_note_context_switch(); | ||
| 660 | local_irq_enable(); | 663 | local_irq_enable(); |
| 661 | cond_resched(); | 664 | cond_resched_rcu_qs(); |
| 662 | return; | 665 | return; |
| 663 | } | 666 | } |
| 664 | local_irq_enable(); | 667 | local_irq_enable(); |
diff --git a/kernel/sys.c b/kernel/sys.c index a8c9f5a7dda6..ea9c88109894 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -2210,9 +2210,13 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
| 2210 | up_write(&me->mm->mmap_sem); | 2210 | up_write(&me->mm->mmap_sem); |
| 2211 | break; | 2211 | break; |
| 2212 | case PR_MPX_ENABLE_MANAGEMENT: | 2212 | case PR_MPX_ENABLE_MANAGEMENT: |
| 2213 | if (arg2 || arg3 || arg4 || arg5) | ||
| 2214 | return -EINVAL; | ||
| 2213 | error = MPX_ENABLE_MANAGEMENT(me); | 2215 | error = MPX_ENABLE_MANAGEMENT(me); |
| 2214 | break; | 2216 | break; |
| 2215 | case PR_MPX_DISABLE_MANAGEMENT: | 2217 | case PR_MPX_DISABLE_MANAGEMENT: |
| 2218 | if (arg2 || arg3 || arg4 || arg5) | ||
| 2219 | return -EINVAL; | ||
| 2216 | error = MPX_DISABLE_MANAGEMENT(me); | 2220 | error = MPX_DISABLE_MANAGEMENT(me); |
| 2217 | break; | 2221 | break; |
| 2218 | default: | 2222 | default: |
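
The prctl() hunk tightens the MPX management ABI: PR_MPX_ENABLE_MANAGEMENT and PR_MPX_DISABLE_MANAGEMENT now reject stray arguments with EINVAL instead of silently ignoring them. A small user-space probe; the fallback define is an assumption for headers that predate these prctls (the value 43 matches the prctl.h of this era), and on kernels or CPUs without MPX the call fails anyway, so the only thing being demonstrated is the stricter argument checking:

	#include <stdio.h>
	#include <errno.h>
	#include <string.h>
	#include <sys/prctl.h>

	#ifndef PR_MPX_ENABLE_MANAGEMENT
	#define PR_MPX_ENABLE_MANAGEMENT 43	/* assumption: value from linux/prctl.h of this era */
	#endif

	int main(void)
	{
		/* non-zero arg2: kernels with the check above return EINVAL */
		if (prctl(PR_MPX_ENABLE_MANAGEMENT, 1, 0, 0, 0) == -1)
			printf("prctl: %s\n", strerror(errno));
		return 0;
	}
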
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 37e50aadd471..3f5e183c3d97 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c | |||
| @@ -122,7 +122,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) | |||
| 122 | mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai); | 122 | mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai); |
| 123 | boot = ktime_add(mono, off_boot); | 123 | boot = ktime_add(mono, off_boot); |
| 124 | xtim = ktime_add(mono, off_real); | 124 | xtim = ktime_add(mono, off_real); |
| 125 | tai = ktime_add(xtim, off_tai); | 125 | tai = ktime_add(mono, off_tai); |
| 126 | 126 | ||
| 127 | base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; | 127 | base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; |
| 128 | base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; | 128 | base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; |
| @@ -266,7 +266,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) | |||
| 266 | /* | 266 | /* |
| 267 | * Divide a ktime value by a nanosecond value | 267 | * Divide a ktime value by a nanosecond value |
| 268 | */ | 268 | */ |
| 269 | u64 ktime_divns(const ktime_t kt, s64 div) | 269 | u64 __ktime_divns(const ktime_t kt, s64 div) |
| 270 | { | 270 | { |
| 271 | u64 dclc; | 271 | u64 dclc; |
| 272 | int sft = 0; | 272 | int sft = 0; |
| @@ -282,7 +282,7 @@ u64 ktime_divns(const ktime_t kt, s64 div) | |||
| 282 | 282 | ||
| 283 | return dclc; | 283 | return dclc; |
| 284 | } | 284 | } |
| 285 | EXPORT_SYMBOL_GPL(ktime_divns); | 285 | EXPORT_SYMBOL_GPL(__ktime_divns); |
| 286 | #endif /* BITS_PER_LONG >= 64 */ | 286 | #endif /* BITS_PER_LONG >= 64 */ |
| 287 | 287 | ||
| 288 | /* | 288 | /* |
| @@ -440,6 +440,37 @@ static inline void debug_deactivate(struct hrtimer *timer) | |||
| 440 | trace_hrtimer_cancel(timer); | 440 | trace_hrtimer_cancel(timer); |
| 441 | } | 441 | } |
| 442 | 442 | ||
| 443 | #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) | ||
| 444 | static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) | ||
| 445 | { | ||
| 446 | struct hrtimer_clock_base *base = cpu_base->clock_base; | ||
| 447 | ktime_t expires, expires_next = { .tv64 = KTIME_MAX }; | ||
| 448 | int i; | ||
| 449 | |||
| 450 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { | ||
| 451 | struct timerqueue_node *next; | ||
| 452 | struct hrtimer *timer; | ||
| 453 | |||
| 454 | next = timerqueue_getnext(&base->active); | ||
| 455 | if (!next) | ||
| 456 | continue; | ||
| 457 | |||
| 458 | timer = container_of(next, struct hrtimer, node); | ||
| 459 | expires = ktime_sub(hrtimer_get_expires(timer), base->offset); | ||
| 460 | if (expires.tv64 < expires_next.tv64) | ||
| 461 | expires_next = expires; | ||
| 462 | } | ||
| 463 | /* | ||
| 464 | * clock_was_set() might have changed base->offset of any of | ||
| 465 | * the clock bases so the result might be negative. Fix it up | ||
| 466 | * to prevent a false positive in clockevents_program_event(). | ||
| 467 | */ | ||
| 468 | if (expires_next.tv64 < 0) | ||
| 469 | expires_next.tv64 = 0; | ||
| 470 | return expires_next; | ||
| 471 | } | ||
| 472 | #endif | ||
| 473 | |||
| 443 | /* High resolution timer related functions */ | 474 | /* High resolution timer related functions */ |
| 444 | #ifdef CONFIG_HIGH_RES_TIMERS | 475 | #ifdef CONFIG_HIGH_RES_TIMERS |
| 445 | 476 | ||
| @@ -488,32 +519,7 @@ static inline int hrtimer_hres_active(void) | |||
| 488 | static void | 519 | static void |
| 489 | hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) | 520 | hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) |
| 490 | { | 521 | { |
| 491 | int i; | 522 | ktime_t expires_next = __hrtimer_get_next_event(cpu_base); |
| 492 | struct hrtimer_clock_base *base = cpu_base->clock_base; | ||
| 493 | ktime_t expires, expires_next; | ||
| 494 | |||
| 495 | expires_next.tv64 = KTIME_MAX; | ||
| 496 | |||
| 497 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { | ||
| 498 | struct hrtimer *timer; | ||
| 499 | struct timerqueue_node *next; | ||
| 500 | |||
| 501 | next = timerqueue_getnext(&base->active); | ||
| 502 | if (!next) | ||
| 503 | continue; | ||
| 504 | timer = container_of(next, struct hrtimer, node); | ||
| 505 | |||
| 506 | expires = ktime_sub(hrtimer_get_expires(timer), base->offset); | ||
| 507 | /* | ||
| 508 | * clock_was_set() has changed base->offset so the | ||
| 509 | * result might be negative. Fix it up to prevent a | ||
| 510 | * false positive in clockevents_program_event() | ||
| 511 | */ | ||
| 512 | if (expires.tv64 < 0) | ||
| 513 | expires.tv64 = 0; | ||
| 514 | if (expires.tv64 < expires_next.tv64) | ||
| 515 | expires_next = expires; | ||
| 516 | } | ||
| 517 | 523 | ||
| 518 | if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) | 524 | if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) |
| 519 | return; | 525 | return; |
| @@ -587,6 +593,15 @@ static int hrtimer_reprogram(struct hrtimer *timer, | |||
| 587 | return 0; | 593 | return 0; |
| 588 | 594 | ||
| 589 | /* | 595 | /* |
| 596 | * When the target cpu of the timer is currently executing | ||
| 597 | * hrtimer_interrupt(), then we do not touch the clock event | ||
| 598 | * device. hrtimer_interrupt() will reevaluate all clock bases | ||
| 599 | * before reprogramming the device. | ||
| 600 | */ | ||
| 601 | if (cpu_base->in_hrtirq) | ||
| 602 | return 0; | ||
| 603 | |||
| 604 | /* | ||
| 590 | * If a hang was detected in the last timer interrupt then we | 605 | * If a hang was detected in the last timer interrupt then we |
| 591 | * do not schedule a timer which is earlier than the expiry | 606 | * do not schedule a timer which is earlier than the expiry |
| 592 | * which we enforced in the hang detection. We want the system | 607 | * which we enforced in the hang detection. We want the system |
| @@ -1104,29 +1119,14 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining); | |||
| 1104 | ktime_t hrtimer_get_next_event(void) | 1119 | ktime_t hrtimer_get_next_event(void) |
| 1105 | { | 1120 | { |
| 1106 | struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); | 1121 | struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); |
| 1107 | struct hrtimer_clock_base *base = cpu_base->clock_base; | 1122 | ktime_t mindelta = { .tv64 = KTIME_MAX }; |
| 1108 | ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; | ||
| 1109 | unsigned long flags; | 1123 | unsigned long flags; |
| 1110 | int i; | ||
| 1111 | 1124 | ||
| 1112 | raw_spin_lock_irqsave(&cpu_base->lock, flags); | 1125 | raw_spin_lock_irqsave(&cpu_base->lock, flags); |
| 1113 | 1126 | ||
| 1114 | if (!hrtimer_hres_active()) { | 1127 | if (!hrtimer_hres_active()) |
| 1115 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { | 1128 | mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base), |
| 1116 | struct hrtimer *timer; | 1129 | ktime_get()); |
| 1117 | struct timerqueue_node *next; | ||
| 1118 | |||
| 1119 | next = timerqueue_getnext(&base->active); | ||
| 1120 | if (!next) | ||
| 1121 | continue; | ||
| 1122 | |||
| 1123 | timer = container_of(next, struct hrtimer, node); | ||
| 1124 | delta.tv64 = hrtimer_get_expires_tv64(timer); | ||
| 1125 | delta = ktime_sub(delta, base->get_time()); | ||
| 1126 | if (delta.tv64 < mindelta.tv64) | ||
| 1127 | mindelta.tv64 = delta.tv64; | ||
| 1128 | } | ||
| 1129 | } | ||
| 1130 | 1130 | ||
| 1131 | raw_spin_unlock_irqrestore(&cpu_base->lock, flags); | 1131 | raw_spin_unlock_irqrestore(&cpu_base->lock, flags); |
| 1132 | 1132 | ||
| @@ -1253,7 +1253,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) | |||
| 1253 | raw_spin_lock(&cpu_base->lock); | 1253 | raw_spin_lock(&cpu_base->lock); |
| 1254 | entry_time = now = hrtimer_update_base(cpu_base); | 1254 | entry_time = now = hrtimer_update_base(cpu_base); |
| 1255 | retry: | 1255 | retry: |
| 1256 | expires_next.tv64 = KTIME_MAX; | 1256 | cpu_base->in_hrtirq = 1; |
| 1257 | /* | 1257 | /* |
| 1258 | * We set expires_next to KTIME_MAX here with cpu_base->lock | 1258 | * We set expires_next to KTIME_MAX here with cpu_base->lock |
| 1259 | * held to prevent that a timer is enqueued in our queue via | 1259 | * held to prevent that a timer is enqueued in our queue via |
| @@ -1291,28 +1291,20 @@ retry: | |||
| 1291 | * are right-of a not yet expired timer, because that | 1291 | * are right-of a not yet expired timer, because that |
| 1292 | * timer will have to trigger a wakeup anyway. | 1292 | * timer will have to trigger a wakeup anyway. |
| 1293 | */ | 1293 | */ |
| 1294 | 1294 | if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) | |
| 1295 | if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) { | ||
| 1296 | ktime_t expires; | ||
| 1297 | |||
| 1298 | expires = ktime_sub(hrtimer_get_expires(timer), | ||
| 1299 | base->offset); | ||
| 1300 | if (expires.tv64 < 0) | ||
| 1301 | expires.tv64 = KTIME_MAX; | ||
| 1302 | if (expires.tv64 < expires_next.tv64) | ||
| 1303 | expires_next = expires; | ||
| 1304 | break; | 1295 | break; |
| 1305 | } | ||
| 1306 | 1296 | ||
| 1307 | __run_hrtimer(timer, &basenow); | 1297 | __run_hrtimer(timer, &basenow); |
| 1308 | } | 1298 | } |
| 1309 | } | 1299 | } |
| 1310 | 1300 | /* Reevaluate the clock bases for the next expiry */ | |
| 1301 | expires_next = __hrtimer_get_next_event(cpu_base); | ||
| 1311 | /* | 1302 | /* |
| 1312 | * Store the new expiry value so the migration code can verify | 1303 | * Store the new expiry value so the migration code can verify |
| 1313 | * against it. | 1304 | * against it. |
| 1314 | */ | 1305 | */ |
| 1315 | cpu_base->expires_next = expires_next; | 1306 | cpu_base->expires_next = expires_next; |
| 1307 | cpu_base->in_hrtirq = 0; | ||
| 1316 | raw_spin_unlock(&cpu_base->lock); | 1308 | raw_spin_unlock(&cpu_base->lock); |
| 1317 | 1309 | ||
| 1318 | /* Reprogramming necessary ? */ | 1310 | /* Reprogramming necessary ? */ |
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 87a346fd6d61..4b585e0fdd22 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
| @@ -488,13 +488,13 @@ static void sync_cmos_clock(struct work_struct *work) | |||
| 488 | 488 | ||
| 489 | getnstimeofday64(&now); | 489 | getnstimeofday64(&now); |
| 490 | if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { | 490 | if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { |
| 491 | struct timespec adjust = timespec64_to_timespec(now); | 491 | struct timespec64 adjust = now; |
| 492 | 492 | ||
| 493 | fail = -ENODEV; | 493 | fail = -ENODEV; |
| 494 | if (persistent_clock_is_local) | 494 | if (persistent_clock_is_local) |
| 495 | adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); | 495 | adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); |
| 496 | #ifdef CONFIG_GENERIC_CMOS_UPDATE | 496 | #ifdef CONFIG_GENERIC_CMOS_UPDATE |
| 497 | fail = update_persistent_clock(adjust); | 497 | fail = update_persistent_clock(timespec64_to_timespec(adjust)); |
| 498 | #endif | 498 | #endif |
| 499 | #ifdef CONFIG_RTC_SYSTOHC | 499 | #ifdef CONFIG_RTC_SYSTOHC |
| 500 | if (fail == -ENODEV) | 500 | if (fail == -ENODEV) |
| @@ -633,6 +633,13 @@ int ntp_validate_timex(struct timex *txc) | |||
| 633 | if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) | 633 | if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) |
| 634 | return -EPERM; | 634 | return -EPERM; |
| 635 | 635 | ||
| 636 | if (txc->modes & ADJ_FREQUENCY) { | ||
| 637 | if (LONG_MIN / PPM_SCALE > txc->freq) | ||
| 638 | return -EINVAL; | ||
| 639 | if (LONG_MAX / PPM_SCALE < txc->freq) | ||
| 640 | return -EINVAL; | ||
| 641 | } | ||
| 642 | |||
| 636 | return 0; | 643 | return 0; |
| 637 | } | 644 | } |
| 638 | 645 | ||
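
The new ADJ_FREQUENCY bounds in ntp_validate_timex() reject frequency offsets that would overflow once scaled by PPM_SCALE, instead of letting them wrap inside the kernel's ppm conversion. A small user-space probe of the new behaviour; run it as root, since unprivileged callers will typically hit the existing CAP_SYS_TIME check and see EPERM before the range check is reached:

	#include <stdio.h>
	#include <errno.h>
	#include <string.h>
	#include <limits.h>
	#include <sys/timex.h>

	int main(void)
	{
		struct timex txc = {
			.modes = ADJ_FREQUENCY,
			.freq  = LONG_MAX,	/* far beyond LONG_MAX / PPM_SCALE */
		};

		/* with the check above this is rejected with EINVAL and the
		 * clock frequency is left untouched */
		if (adjtimex(&txc) == -1)
			printf("adjtimex: %s\n", strerror(errno));
		return 0;
	}
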
diff --git a/kernel/time/time.c b/kernel/time/time.c index 6390517e77d4..2c85b7724af4 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c | |||
| @@ -196,6 +196,10 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, | |||
| 196 | if (tv) { | 196 | if (tv) { |
| 197 | if (copy_from_user(&user_tv, tv, sizeof(*tv))) | 197 | if (copy_from_user(&user_tv, tv, sizeof(*tv))) |
| 198 | return -EFAULT; | 198 | return -EFAULT; |
| 199 | |||
| 200 | if (!timeval_valid(&user_tv)) | ||
| 201 | return -EINVAL; | ||
| 202 | |||
| 199 | new_ts.tv_sec = user_tv.tv_sec; | 203 | new_ts.tv_sec = user_tv.tv_sec; |
| 200 | new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; | 204 | new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; |
| 201 | } | 205 | } |
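
The settimeofday() hunk validates the user-supplied timeval before it is converted to a timespec, so malformed values are rejected up front with EINVAL. A tiny probe that never actually changes the clock, because the value it passes is invalid and the call can only fail (newer C libraries may also reject it in the wrapper; the point is the kernel-side check):

	#include <stdio.h>
	#include <errno.h>
	#include <string.h>
	#include <sys/time.h>

	int main(void)
	{
		/* tv_usec out of range: must be 0..999999 */
		struct timeval tv = { .tv_sec = 0, .tv_usec = 1000000 };

		if (settimeofday(&tv, NULL) == -1)
			printf("settimeofday: %s\n", strerror(errno));
		return 0;
	}
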
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6a931852082f..b124af259800 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -1659,24 +1659,24 @@ out: | |||
| 1659 | } | 1659 | } |
| 1660 | 1660 | ||
| 1661 | /** | 1661 | /** |
| 1662 | * getboottime - Return the real time of system boot. | 1662 | * getboottime64 - Return the real time of system boot. |
| 1663 | * @ts: pointer to the timespec to be set | 1663 | * @ts: pointer to the timespec64 to be set |
| 1664 | * | 1664 | * |
| 1665 | * Returns the wall-time of boot in a timespec. | 1665 | * Returns the wall-time of boot in a timespec64. |
| 1666 | * | 1666 | * |
| 1667 | * This is based on the wall_to_monotonic offset and the total suspend | 1667 | * This is based on the wall_to_monotonic offset and the total suspend |
| 1668 | * time. Calls to settimeofday will affect the value returned (which | 1668 | * time. Calls to settimeofday will affect the value returned (which |
| 1669 | * basically means that however wrong your real time clock is at boot time, | 1669 | * basically means that however wrong your real time clock is at boot time, |
| 1670 | * you get the right time here). | 1670 | * you get the right time here). |
| 1671 | */ | 1671 | */ |
| 1672 | void getboottime(struct timespec *ts) | 1672 | void getboottime64(struct timespec64 *ts) |
| 1673 | { | 1673 | { |
| 1674 | struct timekeeper *tk = &tk_core.timekeeper; | 1674 | struct timekeeper *tk = &tk_core.timekeeper; |
| 1675 | ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); | 1675 | ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); |
| 1676 | 1676 | ||
| 1677 | *ts = ktime_to_timespec(t); | 1677 | *ts = ktime_to_timespec64(t); |
| 1678 | } | 1678 | } |
| 1679 | EXPORT_SYMBOL_GPL(getboottime); | 1679 | EXPORT_SYMBOL_GPL(getboottime64); |
| 1680 | 1680 | ||
| 1681 | unsigned long get_seconds(void) | 1681 | unsigned long get_seconds(void) |
| 1682 | { | 1682 | { |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 929a733d302e..224e768bdc73 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
| @@ -2497,12 +2497,14 @@ static void ftrace_run_update_code(int command) | |||
| 2497 | } | 2497 | } |
| 2498 | 2498 | ||
| 2499 | static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, | 2499 | static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, |
| 2500 | struct ftrace_hash *old_hash) | 2500 | struct ftrace_ops_hash *old_hash) |
| 2501 | { | 2501 | { |
| 2502 | ops->flags |= FTRACE_OPS_FL_MODIFYING; | 2502 | ops->flags |= FTRACE_OPS_FL_MODIFYING; |
| 2503 | ops->old_hash.filter_hash = old_hash; | 2503 | ops->old_hash.filter_hash = old_hash->filter_hash; |
| 2504 | ops->old_hash.notrace_hash = old_hash->notrace_hash; | ||
| 2504 | ftrace_run_update_code(command); | 2505 | ftrace_run_update_code(command); |
| 2505 | ops->old_hash.filter_hash = NULL; | 2506 | ops->old_hash.filter_hash = NULL; |
| 2507 | ops->old_hash.notrace_hash = NULL; | ||
| 2506 | ops->flags &= ~FTRACE_OPS_FL_MODIFYING; | 2508 | ops->flags &= ~FTRACE_OPS_FL_MODIFYING; |
| 2507 | } | 2509 | } |
| 2508 | 2510 | ||
| @@ -3579,7 +3581,7 @@ static struct ftrace_ops trace_probe_ops __read_mostly = | |||
| 3579 | 3581 | ||
| 3580 | static int ftrace_probe_registered; | 3582 | static int ftrace_probe_registered; |
| 3581 | 3583 | ||
| 3582 | static void __enable_ftrace_function_probe(struct ftrace_hash *old_hash) | 3584 | static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash) |
| 3583 | { | 3585 | { |
| 3584 | int ret; | 3586 | int ret; |
| 3585 | int i; | 3587 | int i; |
| @@ -3637,6 +3639,7 @@ int | |||
| 3637 | register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | 3639 | register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, |
| 3638 | void *data) | 3640 | void *data) |
| 3639 | { | 3641 | { |
| 3642 | struct ftrace_ops_hash old_hash_ops; | ||
| 3640 | struct ftrace_func_probe *entry; | 3643 | struct ftrace_func_probe *entry; |
| 3641 | struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; | 3644 | struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; |
| 3642 | struct ftrace_hash *old_hash = *orig_hash; | 3645 | struct ftrace_hash *old_hash = *orig_hash; |
| @@ -3658,6 +3661,10 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
| 3658 | 3661 | ||
| 3659 | mutex_lock(&trace_probe_ops.func_hash->regex_lock); | 3662 | mutex_lock(&trace_probe_ops.func_hash->regex_lock); |
| 3660 | 3663 | ||
| 3664 | old_hash_ops.filter_hash = old_hash; | ||
| 3665 | /* Probes only have filters */ | ||
| 3666 | old_hash_ops.notrace_hash = NULL; | ||
| 3667 | |||
| 3661 | hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); | 3668 | hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); |
| 3662 | if (!hash) { | 3669 | if (!hash) { |
| 3663 | count = -ENOMEM; | 3670 | count = -ENOMEM; |
| @@ -3718,7 +3725,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
| 3718 | 3725 | ||
| 3719 | ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); | 3726 | ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); |
| 3720 | 3727 | ||
| 3721 | __enable_ftrace_function_probe(old_hash); | 3728 | __enable_ftrace_function_probe(&old_hash_ops); |
| 3722 | 3729 | ||
| 3723 | if (!ret) | 3730 | if (!ret) |
| 3724 | free_ftrace_hash_rcu(old_hash); | 3731 | free_ftrace_hash_rcu(old_hash); |
| @@ -4006,10 +4013,34 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) | |||
| 4006 | } | 4013 | } |
| 4007 | 4014 | ||
| 4008 | static void ftrace_ops_update_code(struct ftrace_ops *ops, | 4015 | static void ftrace_ops_update_code(struct ftrace_ops *ops, |
| 4009 | struct ftrace_hash *old_hash) | 4016 | struct ftrace_ops_hash *old_hash) |
| 4010 | { | 4017 | { |
| 4011 | if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) | 4018 | struct ftrace_ops *op; |
| 4019 | |||
| 4020 | if (!ftrace_enabled) | ||
| 4021 | return; | ||
| 4022 | |||
| 4023 | if (ops->flags & FTRACE_OPS_FL_ENABLED) { | ||
| 4012 | ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); | 4024 | ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); |
| 4025 | return; | ||
| 4026 | } | ||
| 4027 | |||
| 4028 | /* | ||
| 4029 | * If this is the shared global_ops filter, then we need to | ||
| 4030 | * check if there is another ops that shares it, is enabled. | ||
| 4031 | * If so, we still need to run the modify code. | ||
| 4032 | */ | ||
| 4033 | if (ops->func_hash != &global_ops.local_hash) | ||
| 4034 | return; | ||
| 4035 | |||
| 4036 | do_for_each_ftrace_op(op, ftrace_ops_list) { | ||
| 4037 | if (op->func_hash == &global_ops.local_hash && | ||
| 4038 | op->flags & FTRACE_OPS_FL_ENABLED) { | ||
| 4039 | ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash); | ||
| 4040 | /* Only need to do this once */ | ||
| 4041 | return; | ||
| 4042 | } | ||
| 4043 | } while_for_each_ftrace_op(op); | ||
| 4013 | } | 4044 | } |
| 4014 | 4045 | ||
| 4015 | static int | 4046 | static int |
| @@ -4017,6 +4048,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, | |||
| 4017 | unsigned long ip, int remove, int reset, int enable) | 4048 | unsigned long ip, int remove, int reset, int enable) |
| 4018 | { | 4049 | { |
| 4019 | struct ftrace_hash **orig_hash; | 4050 | struct ftrace_hash **orig_hash; |
| 4051 | struct ftrace_ops_hash old_hash_ops; | ||
| 4020 | struct ftrace_hash *old_hash; | 4052 | struct ftrace_hash *old_hash; |
| 4021 | struct ftrace_hash *hash; | 4053 | struct ftrace_hash *hash; |
| 4022 | int ret; | 4054 | int ret; |
| @@ -4053,9 +4085,11 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, | |||
| 4053 | 4085 | ||
| 4054 | mutex_lock(&ftrace_lock); | 4086 | mutex_lock(&ftrace_lock); |
| 4055 | old_hash = *orig_hash; | 4087 | old_hash = *orig_hash; |
| 4088 | old_hash_ops.filter_hash = ops->func_hash->filter_hash; | ||
| 4089 | old_hash_ops.notrace_hash = ops->func_hash->notrace_hash; | ||
| 4056 | ret = ftrace_hash_move(ops, enable, orig_hash, hash); | 4090 | ret = ftrace_hash_move(ops, enable, orig_hash, hash); |
| 4057 | if (!ret) { | 4091 | if (!ret) { |
| 4058 | ftrace_ops_update_code(ops, old_hash); | 4092 | ftrace_ops_update_code(ops, &old_hash_ops); |
| 4059 | free_ftrace_hash_rcu(old_hash); | 4093 | free_ftrace_hash_rcu(old_hash); |
| 4060 | } | 4094 | } |
| 4061 | mutex_unlock(&ftrace_lock); | 4095 | mutex_unlock(&ftrace_lock); |
| @@ -4267,6 +4301,7 @@ static void __init set_ftrace_early_filters(void) | |||
| 4267 | int ftrace_regex_release(struct inode *inode, struct file *file) | 4301 | int ftrace_regex_release(struct inode *inode, struct file *file) |
| 4268 | { | 4302 | { |
| 4269 | struct seq_file *m = (struct seq_file *)file->private_data; | 4303 | struct seq_file *m = (struct seq_file *)file->private_data; |
| 4304 | struct ftrace_ops_hash old_hash_ops; | ||
| 4270 | struct ftrace_iterator *iter; | 4305 | struct ftrace_iterator *iter; |
| 4271 | struct ftrace_hash **orig_hash; | 4306 | struct ftrace_hash **orig_hash; |
| 4272 | struct ftrace_hash *old_hash; | 4307 | struct ftrace_hash *old_hash; |
| @@ -4300,10 +4335,12 @@ int ftrace_regex_release(struct inode *inode, struct file *file) | |||
| 4300 | 4335 | ||
| 4301 | mutex_lock(&ftrace_lock); | 4336 | mutex_lock(&ftrace_lock); |
| 4302 | old_hash = *orig_hash; | 4337 | old_hash = *orig_hash; |
| 4338 | old_hash_ops.filter_hash = iter->ops->func_hash->filter_hash; | ||
| 4339 | old_hash_ops.notrace_hash = iter->ops->func_hash->notrace_hash; | ||
| 4303 | ret = ftrace_hash_move(iter->ops, filter_hash, | 4340 | ret = ftrace_hash_move(iter->ops, filter_hash, |
| 4304 | orig_hash, iter->hash); | 4341 | orig_hash, iter->hash); |
| 4305 | if (!ret) { | 4342 | if (!ret) { |
| 4306 | ftrace_ops_update_code(iter->ops, old_hash); | 4343 | ftrace_ops_update_code(iter->ops, &old_hash_ops); |
| 4307 | free_ftrace_hash_rcu(old_hash); | 4344 | free_ftrace_hash_rcu(old_hash); |
| 4308 | } | 4345 | } |
| 4309 | mutex_unlock(&ftrace_lock); | 4346 | mutex_unlock(&ftrace_lock); |
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index 1c71382b283d..eb4220a132ec 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c | |||
| @@ -13,5 +13,6 @@ | |||
| 13 | #define CREATE_TRACE_POINTS | 13 | #define CREATE_TRACE_POINTS |
| 14 | #include <trace/events/power.h> | 14 | #include <trace/events/power.h> |
| 15 | 15 | ||
| 16 | EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); | ||
| 16 | EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); | 17 | EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); |
| 17 | 18 | ||
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2e767972e99c..4a9079b9f082 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -6918,7 +6918,6 @@ void __init trace_init(void) | |||
| 6918 | tracepoint_printk = 0; | 6918 | tracepoint_printk = 0; |
| 6919 | } | 6919 | } |
| 6920 | tracer_alloc_buffers(); | 6920 | tracer_alloc_buffers(); |
| 6921 | init_ftrace_syscalls(); | ||
| 6922 | trace_event_init(); | 6921 | trace_event_init(); |
| 6923 | } | 6922 | } |
| 6924 | 6923 | ||
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 4b9c114ee9de..6fa484de2ba1 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c | |||
| @@ -261,7 +261,7 @@ void perf_trace_del(struct perf_event *p_event, int flags) | |||
| 261 | } | 261 | } |
| 262 | 262 | ||
| 263 | void *perf_trace_buf_prepare(int size, unsigned short type, | 263 | void *perf_trace_buf_prepare(int size, unsigned short type, |
| 264 | struct pt_regs *regs, int *rctxp) | 264 | struct pt_regs **regs, int *rctxp) |
| 265 | { | 265 | { |
| 266 | struct trace_entry *entry; | 266 | struct trace_entry *entry; |
| 267 | unsigned long flags; | 267 | unsigned long flags; |
| @@ -280,6 +280,8 @@ void *perf_trace_buf_prepare(int size, unsigned short type, | |||
| 280 | if (*rctxp < 0) | 280 | if (*rctxp < 0) |
| 281 | return NULL; | 281 | return NULL; |
| 282 | 282 | ||
| 283 | if (regs) | ||
| 284 | *regs = this_cpu_ptr(&__perf_regs[*rctxp]); | ||
| 283 | raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); | 285 | raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); |
| 284 | 286 | ||
| 285 | /* zero the dead bytes from align to not leak stack to user */ | 287 | /* zero the dead bytes from align to not leak stack to user */ |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 366a78a3e61e..b03a0ea77b99 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
| @@ -2429,12 +2429,39 @@ static __init int event_trace_memsetup(void) | |||
| 2429 | return 0; | 2429 | return 0; |
| 2430 | } | 2430 | } |
| 2431 | 2431 | ||
| 2432 | static __init void | ||
| 2433 | early_enable_events(struct trace_array *tr, bool disable_first) | ||
| 2434 | { | ||
| 2435 | char *buf = bootup_event_buf; | ||
| 2436 | char *token; | ||
| 2437 | int ret; | ||
| 2438 | |||
| 2439 | while (true) { | ||
| 2440 | token = strsep(&buf, ","); | ||
| 2441 | |||
| 2442 | if (!token) | ||
| 2443 | break; | ||
| 2444 | if (!*token) | ||
| 2445 | continue; | ||
| 2446 | |||
| 2447 | /* Restarting syscalls requires that we stop them first */ | ||
| 2448 | if (disable_first) | ||
| 2449 | ftrace_set_clr_event(tr, token, 0); | ||
| 2450 | |||
| 2451 | ret = ftrace_set_clr_event(tr, token, 1); | ||
| 2452 | if (ret) | ||
| 2453 | pr_warn("Failed to enable trace event: %s\n", token); | ||
| 2454 | |||
| 2455 | /* Put back the comma to allow this to be called again */ | ||
| 2456 | if (buf) | ||
| 2457 | *(buf - 1) = ','; | ||
| 2458 | } | ||
| 2459 | } | ||
| 2460 | |||
| 2432 | static __init int event_trace_enable(void) | 2461 | static __init int event_trace_enable(void) |
| 2433 | { | 2462 | { |
| 2434 | struct trace_array *tr = top_trace_array(); | 2463 | struct trace_array *tr = top_trace_array(); |
| 2435 | struct ftrace_event_call **iter, *call; | 2464 | struct ftrace_event_call **iter, *call; |
| 2436 | char *buf = bootup_event_buf; | ||
| 2437 | char *token; | ||
| 2438 | int ret; | 2465 | int ret; |
| 2439 | 2466 | ||
| 2440 | if (!tr) | 2467 | if (!tr) |
| @@ -2456,18 +2483,7 @@ static __init int event_trace_enable(void) | |||
| 2456 | */ | 2483 | */ |
| 2457 | __trace_early_add_events(tr); | 2484 | __trace_early_add_events(tr); |
| 2458 | 2485 | ||
| 2459 | while (true) { | 2486 | early_enable_events(tr, false); |
| 2460 | token = strsep(&buf, ","); | ||
| 2461 | |||
| 2462 | if (!token) | ||
| 2463 | break; | ||
| 2464 | if (!*token) | ||
| 2465 | continue; | ||
| 2466 | |||
| 2467 | ret = ftrace_set_clr_event(tr, token, 1); | ||
| 2468 | if (ret) | ||
| 2469 | pr_warn("Failed to enable trace event: %s\n", token); | ||
| 2470 | } | ||
| 2471 | 2487 | ||
| 2472 | trace_printk_start_comm(); | 2488 | trace_printk_start_comm(); |
| 2473 | 2489 | ||
| @@ -2478,6 +2494,31 @@ static __init int event_trace_enable(void) | |||
| 2478 | return 0; | 2494 | return 0; |
| 2479 | } | 2495 | } |
| 2480 | 2496 | ||
| 2497 | /* | ||
| 2498 | * event_trace_enable() is called from trace_event_init() first to | ||
| 2499 | * initialize events and perhaps start any events that are on the | ||
| 2500 | * command line. Unfortunately, there are some events that will not | ||
| 2501 | * start this early, like the system call tracepoints that need | ||
| 2502 | * to set the TIF_SYSCALL_TRACEPOINT flag of pid 1. But event_trace_enable() | ||
| 2503 | * is called before pid 1 starts, and this flag is never set, making | ||
| 2504 | * the syscall tracepoint never get reached, but the event is enabled | ||
| 2505 | * regardless (and not doing anything). | ||
| 2506 | */ | ||
| 2507 | static __init int event_trace_enable_again(void) | ||
| 2508 | { | ||
| 2509 | struct trace_array *tr; | ||
| 2510 | |||
| 2511 | tr = top_trace_array(); | ||
| 2512 | if (!tr) | ||
| 2513 | return -ENODEV; | ||
| 2514 | |||
| 2515 | early_enable_events(tr, true); | ||
| 2516 | |||
| 2517 | return 0; | ||
| 2518 | } | ||
| 2519 | |||
| 2520 | early_initcall(event_trace_enable_again); | ||
| 2521 | |||
| 2481 | static __init int event_trace_init(void) | 2522 | static __init int event_trace_init(void) |
| 2482 | { | 2523 | { |
| 2483 | struct trace_array *tr; | 2524 | struct trace_array *tr; |
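
The new early_enable_events() helper above parses bootup_event_buf destructively with strsep() and then writes each ',' back, so that event_trace_enable_again() can walk the very same buffer a second time once pid 1 exists and the syscall tracepoints can really be armed. A stand-alone illustration of that restore-the-separator trick (generic names and data, plain user-space C, not kernel code):

	#define _DEFAULT_SOURCE
	#include <stdio.h>
	#include <string.h>

	static void walk(char *buf)
	{
		char *token;

		while (1) {
			token = strsep(&buf, ",");	/* cuts the string at ',' */
			if (!token)
				break;
			if (*token)
				printf("  token: %s\n", token);
			/* put the ',' back so the buffer can be parsed again */
			if (buf)
				*(buf - 1) = ',';
		}
	}

	int main(void)
	{
		char events[] = "sched:sched_switch,syscalls,irq:irq_handler_entry";

		printf("first pass:\n");
		walk(events);
		printf("second pass (buffer intact):\n");
		walk(events);
		return 0;
	}

Both passes print the same tokens because each cut is undone after the token has been consumed, which is exactly what lets the kernel call the parser once at event_trace_enable() time and again from the early_initcall.
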
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index b0b1c44e923a..3ccf5c2c1320 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c | |||
| @@ -132,8 +132,8 @@ static int kdb_ftdump(int argc, const char **argv) | |||
| 132 | 132 | ||
| 133 | static __init int kdb_ftrace_register(void) | 133 | static __init int kdb_ftrace_register(void) |
| 134 | { | 134 | { |
| 135 | kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", | 135 | kdb_register_flags("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", |
| 136 | "Dump ftrace log", 0, KDB_REPEAT_NONE); | 136 | "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE); |
| 137 | return 0; | 137 | return 0; |
| 138 | } | 138 | } |
| 139 | 139 | ||
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5edb518be345..296079ae6583 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
| @@ -1148,7 +1148,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) | |||
| 1148 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | 1148 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); |
| 1149 | size -= sizeof(u32); | 1149 | size -= sizeof(u32); |
| 1150 | 1150 | ||
| 1151 | entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); | 1151 | entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); |
| 1152 | if (!entry) | 1152 | if (!entry) |
| 1153 | return; | 1153 | return; |
| 1154 | 1154 | ||
| @@ -1179,7 +1179,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, | |||
| 1179 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | 1179 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); |
| 1180 | size -= sizeof(u32); | 1180 | size -= sizeof(u32); |
| 1181 | 1181 | ||
| 1182 | entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); | 1182 | entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); |
| 1183 | if (!entry) | 1183 | if (!entry) |
| 1184 | return; | 1184 | return; |
| 1185 | 1185 | ||
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index c6ee36fcbf90..f97f6e3a676c 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
| @@ -574,7 +574,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) | |||
| 574 | size -= sizeof(u32); | 574 | size -= sizeof(u32); |
| 575 | 575 | ||
| 576 | rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, | 576 | rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, |
| 577 | sys_data->enter_event->event.type, regs, &rctx); | 577 | sys_data->enter_event->event.type, NULL, &rctx); |
| 578 | if (!rec) | 578 | if (!rec) |
| 579 | return; | 579 | return; |
| 580 | 580 | ||
| @@ -647,7 +647,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) | |||
| 647 | size -= sizeof(u32); | 647 | size -= sizeof(u32); |
| 648 | 648 | ||
| 649 | rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, | 649 | rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, |
| 650 | sys_data->exit_event->event.type, regs, &rctx); | 650 | sys_data->exit_event->event.type, NULL, &rctx); |
| 651 | if (!rec) | 651 | if (!rec) |
| 652 | return; | 652 | return; |
| 653 | 653 | ||
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8520acc34b18..b11441321e7a 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
| @@ -1111,7 +1111,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, | |||
| 1111 | if (hlist_empty(head)) | 1111 | if (hlist_empty(head)) |
| 1112 | goto out; | 1112 | goto out; |
| 1113 | 1113 | ||
| 1114 | entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); | 1114 | entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); |
| 1115 | if (!entry) | 1115 | if (!entry) |
| 1116 | goto out; | 1116 | goto out; |
| 1117 | 1117 | ||
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6202b08f1933..beeeac9e0e3e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -1841,17 +1841,11 @@ static void pool_mayday_timeout(unsigned long __pool) | |||
| 1841 | * spin_lock_irq(pool->lock) which may be released and regrabbed | 1841 | * spin_lock_irq(pool->lock) which may be released and regrabbed |
| 1842 | * multiple times. Does GFP_KERNEL allocations. Called only from | 1842 | * multiple times. Does GFP_KERNEL allocations. Called only from |
| 1843 | * manager. | 1843 | * manager. |
| 1844 | * | ||
| 1845 | * Return: | ||
| 1846 | * %false if no action was taken and pool->lock stayed locked, %true | ||
| 1847 | * otherwise. | ||
| 1848 | */ | 1844 | */ |
| 1849 | static bool maybe_create_worker(struct worker_pool *pool) | 1845 | static void maybe_create_worker(struct worker_pool *pool) |
| 1850 | __releases(&pool->lock) | 1846 | __releases(&pool->lock) |
| 1851 | __acquires(&pool->lock) | 1847 | __acquires(&pool->lock) |
| 1852 | { | 1848 | { |
| 1853 | if (!need_to_create_worker(pool)) | ||
| 1854 | return false; | ||
| 1855 | restart: | 1849 | restart: |
| 1856 | spin_unlock_irq(&pool->lock); | 1850 | spin_unlock_irq(&pool->lock); |
| 1857 | 1851 | ||
| @@ -1877,7 +1871,6 @@ restart: | |||
| 1877 | */ | 1871 | */ |
| 1878 | if (need_to_create_worker(pool)) | 1872 | if (need_to_create_worker(pool)) |
| 1879 | goto restart; | 1873 | goto restart; |
| 1880 | return true; | ||
| 1881 | } | 1874 | } |
| 1882 | 1875 | ||
| 1883 | /** | 1876 | /** |
| @@ -1897,16 +1890,14 @@ restart: | |||
| 1897 | * multiple times. Does GFP_KERNEL allocations. | 1890 | * multiple times. Does GFP_KERNEL allocations. |
| 1898 | * | 1891 | * |
| 1899 | * Return: | 1892 | * Return: |
| 1900 | * %false if the pool don't need management and the caller can safely start | 1893 | * %false if the pool doesn't need management and the caller can safely |
| 1901 | * processing works, %true indicates that the function released pool->lock | 1894 | * start processing works, %true if management function was performed and |
| 1902 | * and reacquired it to perform some management function and that the | 1895 | * the conditions that the caller verified before calling the function may |
| 1903 | * conditions that the caller verified while holding the lock before | 1896 | * no longer be true. |
| 1904 | * calling the function might no longer be true. | ||
| 1905 | */ | 1897 | */ |
| 1906 | static bool manage_workers(struct worker *worker) | 1898 | static bool manage_workers(struct worker *worker) |
| 1907 | { | 1899 | { |
| 1908 | struct worker_pool *pool = worker->pool; | 1900 | struct worker_pool *pool = worker->pool; |
| 1909 | bool ret = false; | ||
| 1910 | 1901 | ||
| 1911 | /* | 1902 | /* |
| 1912 | * Anyone who successfully grabs manager_arb wins the arbitration | 1903 | * Anyone who successfully grabs manager_arb wins the arbitration |
| @@ -1919,12 +1910,12 @@ static bool manage_workers(struct worker *worker) | |||
| 1919 | * actual management, the pool may stall indefinitely. | 1910 | * actual management, the pool may stall indefinitely. |
| 1920 | */ | 1911 | */ |
| 1921 | if (!mutex_trylock(&pool->manager_arb)) | 1912 | if (!mutex_trylock(&pool->manager_arb)) |
| 1922 | return ret; | 1913 | return false; |
| 1923 | 1914 | ||
| 1924 | ret |= maybe_create_worker(pool); | 1915 | maybe_create_worker(pool); |
| 1925 | 1916 | ||
| 1926 | mutex_unlock(&pool->manager_arb); | 1917 | mutex_unlock(&pool->manager_arb); |
| 1927 | return ret; | 1918 | return true; |
| 1928 | } | 1919 | } |
| 1929 | 1920 | ||
| 1930 | /** | 1921 | /** |
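
With the workqueue change above, manage_workers() no longer reports whether pool->lock was dropped; per the updated kernel-doc, a true return now means management work was actually performed and the caller's earlier checks may be stale. Roughly how a caller such as worker_thread() is expected to consume that return value; this is a reconstruction from the comment above, not code shown in this diff, and it is a fragment rather than a self-contained function:

	recheck:
		/* ... holding pool->lock ... */
		if (!need_more_worker(pool))
			goto sleep;

		/* manage_workers() returning true means it may have released and
		 * re-taken pool->lock while creating workers, so the conditions
		 * checked above must be re-evaluated from scratch */
		if (unlikely(!may_start_working(pool)) && manage_workers(worker))
			goto recheck;
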
