Diffstat (limited to 'kernel')
104 files changed, 5964 insertions, 2670 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 1408b3353a3c..0f8f8b0bc1bf 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
| @@ -9,7 +9,9 @@ obj-y = fork.o exec_domain.o panic.o \ | |||
| 9 | extable.o params.o \ | 9 | extable.o params.o \ | 
| 10 | kthread.o sys_ni.o nsproxy.o \ | 10 | kthread.o sys_ni.o nsproxy.o \ | 
| 11 | notifier.o ksysfs.o cred.o reboot.o \ | 11 | notifier.o ksysfs.o cred.o reboot.o \ | 
| 12 | async.o range.o groups.o smpboot.o | 12 | async.o range.o smpboot.o | 
| 13 | |||
| 14 | obj-$(CONFIG_MULTIUSER) += groups.o | ||
| 13 | 15 | ||
| 14 | ifdef CONFIG_FUNCTION_TRACER | 16 | ifdef CONFIG_FUNCTION_TRACER | 
| 15 | # Do not trace debug files and internal ftrace files | 17 | # Do not trace debug files and internal ftrace files | 
diff --git a/kernel/acct.c b/kernel/acct.c
index e6c10d1a4058..74963d192c5d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
| @@ -213,7 +213,7 @@ static int acct_on(struct filename *pathname) | |||
| 213 | return -EACCES; | 213 | return -EACCES; | 
| 214 | } | 214 | } | 
| 215 | 215 | ||
| 216 | if (!file->f_op->write) { | 216 | if (!(file->f_mode & FMODE_CAN_WRITE)) { | 
| 217 | kfree(acct); | 217 | kfree(acct); | 
| 218 | filp_close(file, NULL); | 218 | filp_close(file, NULL); | 
| 219 | return -EIO; | 219 | return -EIO; | 
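The acct_on() change above swaps a method-pointer test for a mode-flag test: a file may implement writing through either ->write or ->write_iter, and FMODE_CAN_WRITE is set by the VFS at open time when either is usable. A minimal sketch of the resulting check (illustrative helper name, not kernel code as-is):

        /* Accept a file for kernel-side accounting writes only when the VFS
         * marked it writable at open time; FMODE_CAN_WRITE covers both
         * ->write and ->write_iter, which the old f_op->write test did not.
         */
        static int check_acct_file(struct file *file)
        {
                if (!(file->f_mode & FMODE_CAN_WRITE))
                        return -EIO;
                return 0;
        }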
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index a5ae60f0b0a2..e6983be12bd3 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
| @@ -1,5 +1,2 @@ | |||
| 1 | obj-y := core.o | 1 | obj-y := core.o | 
| 2 | obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o | 2 | obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o | 
| 3 | ifdef CONFIG_TEST_BPF | ||
| 4 | obj-$(CONFIG_BPF_SYSCALL) += test_stub.o | ||
| 5 | endif | ||
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 9eb4d8a7cd87..8a6616583f38 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
| @@ -134,7 +134,7 @@ static void array_map_free(struct bpf_map *map) | |||
| 134 | kvfree(array); | 134 | kvfree(array); | 
| 135 | } | 135 | } | 
| 136 | 136 | ||
| 137 | static struct bpf_map_ops array_ops = { | 137 | static const struct bpf_map_ops array_ops = { | 
| 138 | .map_alloc = array_map_alloc, | 138 | .map_alloc = array_map_alloc, | 
| 139 | .map_free = array_map_free, | 139 | .map_free = array_map_free, | 
| 140 | .map_get_next_key = array_map_get_next_key, | 140 | .map_get_next_key = array_map_get_next_key, | 
| @@ -143,14 +143,14 @@ static struct bpf_map_ops array_ops = { | |||
| 143 | .map_delete_elem = array_map_delete_elem, | 143 | .map_delete_elem = array_map_delete_elem, | 
| 144 | }; | 144 | }; | 
| 145 | 145 | ||
| 146 | static struct bpf_map_type_list tl = { | 146 | static struct bpf_map_type_list array_type __read_mostly = { | 
| 147 | .ops = &array_ops, | 147 | .ops = &array_ops, | 
| 148 | .type = BPF_MAP_TYPE_ARRAY, | 148 | .type = BPF_MAP_TYPE_ARRAY, | 
| 149 | }; | 149 | }; | 
| 150 | 150 | ||
| 151 | static int __init register_array_map(void) | 151 | static int __init register_array_map(void) | 
| 152 | { | 152 | { | 
| 153 | bpf_register_map_type(&tl); | 153 | bpf_register_map_type(&array_type); | 
| 154 | return 0; | 154 | return 0; | 
| 155 | } | 155 | } | 
| 156 | late_initcall(register_array_map); | 156 | late_initcall(register_array_map); | 
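The arraymap.c hunks (and the matching hashtab.c ones below) only constify the ops table and give the registration object a unique name; the registration flow itself is unchanged. A hedged sketch of that pattern for a hypothetical additional map type, assuming it lived alongside the code above (the demo_* names and the reuse of array_ops are purely illustrative):

        /* Each map implementation supplies a const ops table and a uniquely
         * named bpf_map_type_list hooked in at late_initcall time.
         */
        static struct bpf_map_type_list demo_array_type __read_mostly = {
                .ops    = &array_ops,          /* const ops table from above */
                .type   = BPF_MAP_TYPE_ARRAY,  /* a new BPF_MAP_TYPE_* id in practice */
        };

        static int __init register_demo_array_map(void)
        {
                bpf_register_map_type(&demo_array_type);
                return 0;
        }
        late_initcall(register_demo_array_map);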
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index a64e7a207d2b..4139a0f8b558 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
| @@ -656,6 +656,14 @@ void bpf_prog_free(struct bpf_prog *fp) | |||
| 656 | } | 656 | } | 
| 657 | EXPORT_SYMBOL_GPL(bpf_prog_free); | 657 | EXPORT_SYMBOL_GPL(bpf_prog_free); | 
| 658 | 658 | ||
| 659 | /* Weak definitions of helper functions in case we don't have bpf syscall. */ | ||
| 660 | const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; | ||
| 661 | const struct bpf_func_proto bpf_map_update_elem_proto __weak; | ||
| 662 | const struct bpf_func_proto bpf_map_delete_elem_proto __weak; | ||
| 663 | |||
| 664 | const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; | ||
| 665 | const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; | ||
| 666 | |||
| 659 | /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call | 667 | /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call | 
| 660 | * skb_copy_bits(), so provide a weak definition of it for NET-less config. | 668 | * skb_copy_bits(), so provide a weak definition of it for NET-less config. | 
| 661 | */ | 669 | */ | 
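The new __weak objects keep unconditionally built code linkable when CONFIG_BPF_SYSCALL=n, since helpers.c, which provides the strong definitions, is only compiled with the syscall enabled. A tiny illustration of the idiom with a made-up symbol name:

        /* Zero-initialized weak fallback: used only when no strong definition
         * of the same symbol is linked into the image.
         */
        const struct bpf_func_proto demo_helper_proto __weak;

        /* elsewhere, when the real implementation is built in, the strong
         * definition wins at link time:
         *   const struct bpf_func_proto demo_helper_proto = { .func = demo_helper };
         */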
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index b3ba43674310..83c209d9b17a 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
| @@ -345,7 +345,7 @@ static void htab_map_free(struct bpf_map *map) | |||
| 345 | kfree(htab); | 345 | kfree(htab); | 
| 346 | } | 346 | } | 
| 347 | 347 | ||
| 348 | static struct bpf_map_ops htab_ops = { | 348 | static const struct bpf_map_ops htab_ops = { | 
| 349 | .map_alloc = htab_map_alloc, | 349 | .map_alloc = htab_map_alloc, | 
| 350 | .map_free = htab_map_free, | 350 | .map_free = htab_map_free, | 
| 351 | .map_get_next_key = htab_map_get_next_key, | 351 | .map_get_next_key = htab_map_get_next_key, | 
| @@ -354,14 +354,14 @@ static struct bpf_map_ops htab_ops = { | |||
| 354 | .map_delete_elem = htab_map_delete_elem, | 354 | .map_delete_elem = htab_map_delete_elem, | 
| 355 | }; | 355 | }; | 
| 356 | 356 | ||
| 357 | static struct bpf_map_type_list tl = { | 357 | static struct bpf_map_type_list htab_type __read_mostly = { | 
| 358 | .ops = &htab_ops, | 358 | .ops = &htab_ops, | 
| 359 | .type = BPF_MAP_TYPE_HASH, | 359 | .type = BPF_MAP_TYPE_HASH, | 
| 360 | }; | 360 | }; | 
| 361 | 361 | ||
| 362 | static int __init register_htab_map(void) | 362 | static int __init register_htab_map(void) | 
| 363 | { | 363 | { | 
| 364 | bpf_register_map_type(&tl); | 364 | bpf_register_map_type(&htab_type); | 
| 365 | return 0; | 365 | return 0; | 
| 366 | } | 366 | } | 
| 367 | late_initcall(register_htab_map); | 367 | late_initcall(register_htab_map); | 
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 9e3414d85459..bd7f5988ed9c 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
| @@ -11,6 +11,8 @@ | |||
| 11 | */ | 11 | */ | 
| 12 | #include <linux/bpf.h> | 12 | #include <linux/bpf.h> | 
| 13 | #include <linux/rcupdate.h> | 13 | #include <linux/rcupdate.h> | 
| 14 | #include <linux/random.h> | ||
| 15 | #include <linux/smp.h> | ||
| 14 | 16 | ||
| 15 | /* If kernel subsystem is allowing eBPF programs to call this function, | 17 | /* If kernel subsystem is allowing eBPF programs to call this function, | 
| 16 | * inside its own verifier_ops->get_func_proto() callback it should return | 18 | * inside its own verifier_ops->get_func_proto() callback it should return | 
| @@ -41,7 +43,7 @@ static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | |||
| 41 | return (unsigned long) value; | 43 | return (unsigned long) value; | 
| 42 | } | 44 | } | 
| 43 | 45 | ||
| 44 | struct bpf_func_proto bpf_map_lookup_elem_proto = { | 46 | const struct bpf_func_proto bpf_map_lookup_elem_proto = { | 
| 45 | .func = bpf_map_lookup_elem, | 47 | .func = bpf_map_lookup_elem, | 
| 46 | .gpl_only = false, | 48 | .gpl_only = false, | 
| 47 | .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, | 49 | .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, | 
| @@ -60,7 +62,7 @@ static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | |||
| 60 | return map->ops->map_update_elem(map, key, value, r4); | 62 | return map->ops->map_update_elem(map, key, value, r4); | 
| 61 | } | 63 | } | 
| 62 | 64 | ||
| 63 | struct bpf_func_proto bpf_map_update_elem_proto = { | 65 | const struct bpf_func_proto bpf_map_update_elem_proto = { | 
| 64 | .func = bpf_map_update_elem, | 66 | .func = bpf_map_update_elem, | 
| 65 | .gpl_only = false, | 67 | .gpl_only = false, | 
| 66 | .ret_type = RET_INTEGER, | 68 | .ret_type = RET_INTEGER, | 
| @@ -80,10 +82,32 @@ static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | |||
| 80 | return map->ops->map_delete_elem(map, key); | 82 | return map->ops->map_delete_elem(map, key); | 
| 81 | } | 83 | } | 
| 82 | 84 | ||
| 83 | struct bpf_func_proto bpf_map_delete_elem_proto = { | 85 | const struct bpf_func_proto bpf_map_delete_elem_proto = { | 
| 84 | .func = bpf_map_delete_elem, | 86 | .func = bpf_map_delete_elem, | 
| 85 | .gpl_only = false, | 87 | .gpl_only = false, | 
| 86 | .ret_type = RET_INTEGER, | 88 | .ret_type = RET_INTEGER, | 
| 87 | .arg1_type = ARG_CONST_MAP_PTR, | 89 | .arg1_type = ARG_CONST_MAP_PTR, | 
| 88 | .arg2_type = ARG_PTR_TO_MAP_KEY, | 90 | .arg2_type = ARG_PTR_TO_MAP_KEY, | 
| 89 | }; | 91 | }; | 
| 92 | |||
| 93 | static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | ||
| 94 | { | ||
| 95 | return prandom_u32(); | ||
| 96 | } | ||
| 97 | |||
| 98 | const struct bpf_func_proto bpf_get_prandom_u32_proto = { | ||
| 99 | .func = bpf_get_prandom_u32, | ||
| 100 | .gpl_only = false, | ||
| 101 | .ret_type = RET_INTEGER, | ||
| 102 | }; | ||
| 103 | |||
| 104 | static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | ||
| 105 | { | ||
| 106 | return raw_smp_processor_id(); | ||
| 107 | } | ||
| 108 | |||
| 109 | const struct bpf_func_proto bpf_get_smp_processor_id_proto = { | ||
| 110 | .func = bpf_get_smp_processor_id, | ||
| 111 | .gpl_only = false, | ||
| 112 | .ret_type = RET_INTEGER, | ||
| 113 | }; | ||
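The two new helpers are thin wrappers around prandom_u32() and raw_smp_processor_id(); programs reach them through their uapi BPF_FUNC_* ids. A hedged sketch of program-side usage following the samples/bpf function-pointer idiom of the time (SEC() is the usual section macro from the samples; demo_prog and the local binding names are illustrative):

        static unsigned long long (*get_prandom_u32)(void) =
                (void *) BPF_FUNC_get_prandom_u32;
        static unsigned long long (*get_smp_processor_id)(void) =
                (void *) BPF_FUNC_get_smp_processor_id;

        SEC("socket")
        int demo_prog(struct __sk_buff *skb)
        {
                /* randomly drop roughly half of the packets, tag the rest by CPU */
                if (get_prandom_u32() & 1)
                        return 0;
                return get_smp_processor_id() + 1;
        }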
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 536edc2be307..3bae6c591914 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
| @@ -16,6 +16,7 @@ | |||
| 16 | #include <linux/file.h> | 16 | #include <linux/file.h> | 
| 17 | #include <linux/license.h> | 17 | #include <linux/license.h> | 
| 18 | #include <linux/filter.h> | 18 | #include <linux/filter.h> | 
| 19 | #include <linux/version.h> | ||
| 19 | 20 | ||
| 20 | static LIST_HEAD(bpf_map_types); | 21 | static LIST_HEAD(bpf_map_types); | 
| 21 | 22 | ||
| @@ -354,10 +355,11 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) | |||
| 354 | list_for_each_entry(tl, &bpf_prog_types, list_node) { | 355 | list_for_each_entry(tl, &bpf_prog_types, list_node) { | 
| 355 | if (tl->type == type) { | 356 | if (tl->type == type) { | 
| 356 | prog->aux->ops = tl->ops; | 357 | prog->aux->ops = tl->ops; | 
| 357 | prog->aux->prog_type = type; | 358 | prog->type = type; | 
| 358 | return 0; | 359 | return 0; | 
| 359 | } | 360 | } | 
| 360 | } | 361 | } | 
| 362 | |||
| 361 | return -EINVAL; | 363 | return -EINVAL; | 
| 362 | } | 364 | } | 
| 363 | 365 | ||
| @@ -418,6 +420,7 @@ void bpf_prog_put(struct bpf_prog *prog) | |||
| 418 | bpf_prog_free(prog); | 420 | bpf_prog_free(prog); | 
| 419 | } | 421 | } | 
| 420 | } | 422 | } | 
| 423 | EXPORT_SYMBOL_GPL(bpf_prog_put); | ||
| 421 | 424 | ||
| 422 | static int bpf_prog_release(struct inode *inode, struct file *filp) | 425 | static int bpf_prog_release(struct inode *inode, struct file *filp) | 
| 423 | { | 426 | { | 
| @@ -465,9 +468,10 @@ struct bpf_prog *bpf_prog_get(u32 ufd) | |||
| 465 | fdput(f); | 468 | fdput(f); | 
| 466 | return prog; | 469 | return prog; | 
| 467 | } | 470 | } | 
| 471 | EXPORT_SYMBOL_GPL(bpf_prog_get); | ||
| 468 | 472 | ||
| 469 | /* last field in 'union bpf_attr' used by this command */ | 473 | /* last field in 'union bpf_attr' used by this command */ | 
| 470 | #define BPF_PROG_LOAD_LAST_FIELD log_buf | 474 | #define BPF_PROG_LOAD_LAST_FIELD kern_version | 
| 471 | 475 | ||
| 472 | static int bpf_prog_load(union bpf_attr *attr) | 476 | static int bpf_prog_load(union bpf_attr *attr) | 
| 473 | { | 477 | { | 
| @@ -492,6 +496,10 @@ static int bpf_prog_load(union bpf_attr *attr) | |||
| 492 | if (attr->insn_cnt >= BPF_MAXINSNS) | 496 | if (attr->insn_cnt >= BPF_MAXINSNS) | 
| 493 | return -EINVAL; | 497 | return -EINVAL; | 
| 494 | 498 | ||
| 499 | if (type == BPF_PROG_TYPE_KPROBE && | ||
| 500 | attr->kern_version != LINUX_VERSION_CODE) | ||
| 501 | return -EINVAL; | ||
| 502 | |||
| 495 | /* plain bpf_prog allocation */ | 503 | /* plain bpf_prog allocation */ | 
| 496 | prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); | 504 | prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); | 
| 497 | if (!prog) | 505 | if (!prog) | 
| @@ -508,7 +516,7 @@ static int bpf_prog_load(union bpf_attr *attr) | |||
| 508 | prog->jited = false; | 516 | prog->jited = false; | 
| 509 | 517 | ||
| 510 | atomic_set(&prog->aux->refcnt, 1); | 518 | atomic_set(&prog->aux->refcnt, 1); | 
| 511 | prog->aux->is_gpl_compatible = is_gpl; | 519 | prog->gpl_compatible = is_gpl; | 
| 512 | 520 | ||
| 513 | /* find program type: socket_filter vs tracing_filter */ | 521 | /* find program type: socket_filter vs tracing_filter */ | 
| 514 | err = find_prog_type(type, prog); | 522 | err = find_prog_type(type, prog); | 
| @@ -516,8 +524,7 @@ static int bpf_prog_load(union bpf_attr *attr) | |||
| 516 | goto free_prog; | 524 | goto free_prog; | 
| 517 | 525 | ||
| 518 | /* run eBPF verifier */ | 526 | /* run eBPF verifier */ | 
| 519 | err = bpf_check(prog, attr); | 527 | err = bpf_check(&prog, attr); | 
| 520 | |||
| 521 | if (err < 0) | 528 | if (err < 0) | 
| 522 | goto free_used_maps; | 529 | goto free_used_maps; | 
| 523 | 530 | ||
| @@ -528,7 +535,6 @@ static int bpf_prog_load(union bpf_attr *attr) | |||
| 528 | bpf_prog_select_runtime(prog); | 535 | bpf_prog_select_runtime(prog); | 
| 529 | 536 | ||
| 530 | err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); | 537 | err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); | 
| 531 | |||
| 532 | if (err < 0) | 538 | if (err < 0) | 
| 533 | /* failed to allocate fd */ | 539 | /* failed to allocate fd */ | 
| 534 | goto free_used_maps; | 540 | goto free_used_maps; | 
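With kern_version now the last BPF_PROG_LOAD field, a kprobe program must declare which kernel it was built against, and the kernel rejects a mismatch with -EINVAL. A user-space sketch of the loading side, with field names as in the uapi of this series and error handling omitted (needs <linux/bpf.h>, <linux/version.h>, <sys/syscall.h>, <unistd.h>, <string.h>):

        static int load_kprobe_prog(const struct bpf_insn *insns, unsigned int cnt)
        {
                union bpf_attr attr;

                memset(&attr, 0, sizeof(attr));
                attr.prog_type    = BPF_PROG_TYPE_KPROBE;
                attr.insns        = (__u64)(unsigned long)insns;
                attr.insn_cnt     = cnt;
                attr.license      = (__u64)(unsigned long)"GPL";
                attr.kern_version = LINUX_VERSION_CODE; /* must match the running kernel */

                return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
        }

Because LINUX_VERSION_CODE is baked in at build time, out-of-tree loaders typically read the running kernel's version at run time instead of relying on the build host's headers.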
diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c
deleted file mode 100644
index 0ceae1e6e8b5..000000000000
--- a/kernel/bpf/test_stub.c
+++ /dev/null
| @@ -1,78 +0,0 @@ | |||
| 1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com | ||
| 2 | * | ||
| 3 | * This program is free software; you can redistribute it and/or | ||
| 4 | * modify it under the terms of version 2 of the GNU General Public | ||
| 5 | * License as published by the Free Software Foundation. | ||
| 6 | */ | ||
| 7 | #include <linux/kernel.h> | ||
| 8 | #include <linux/types.h> | ||
| 9 | #include <linux/slab.h> | ||
| 10 | #include <linux/err.h> | ||
| 11 | #include <linux/bpf.h> | ||
| 12 | |||
| 13 | /* test stubs for BPF_MAP_TYPE_UNSPEC and for BPF_PROG_TYPE_UNSPEC | ||
| 14 | * to be used by user space verifier testsuite | ||
| 15 | */ | ||
| 16 | struct bpf_context { | ||
| 17 | u64 arg1; | ||
| 18 | u64 arg2; | ||
| 19 | }; | ||
| 20 | |||
| 21 | static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id) | ||
| 22 | { | ||
| 23 | switch (func_id) { | ||
| 24 | case BPF_FUNC_map_lookup_elem: | ||
| 25 | return &bpf_map_lookup_elem_proto; | ||
| 26 | case BPF_FUNC_map_update_elem: | ||
| 27 | return &bpf_map_update_elem_proto; | ||
| 28 | case BPF_FUNC_map_delete_elem: | ||
| 29 | return &bpf_map_delete_elem_proto; | ||
| 30 | default: | ||
| 31 | return NULL; | ||
| 32 | } | ||
| 33 | } | ||
| 34 | |||
| 35 | static const struct bpf_context_access { | ||
| 36 | int size; | ||
| 37 | enum bpf_access_type type; | ||
| 38 | } test_ctx_access[] = { | ||
| 39 | [offsetof(struct bpf_context, arg1)] = { | ||
| 40 | FIELD_SIZEOF(struct bpf_context, arg1), | ||
| 41 | BPF_READ | ||
| 42 | }, | ||
| 43 | [offsetof(struct bpf_context, arg2)] = { | ||
| 44 | FIELD_SIZEOF(struct bpf_context, arg2), | ||
| 45 | BPF_READ | ||
| 46 | }, | ||
| 47 | }; | ||
| 48 | |||
| 49 | static bool test_is_valid_access(int off, int size, enum bpf_access_type type) | ||
| 50 | { | ||
| 51 | const struct bpf_context_access *access; | ||
| 52 | |||
| 53 | if (off < 0 || off >= ARRAY_SIZE(test_ctx_access)) | ||
| 54 | return false; | ||
| 55 | |||
| 56 | access = &test_ctx_access[off]; | ||
| 57 | if (access->size == size && (access->type & type)) | ||
| 58 | return true; | ||
| 59 | |||
| 60 | return false; | ||
| 61 | } | ||
| 62 | |||
| 63 | static struct bpf_verifier_ops test_ops = { | ||
| 64 | .get_func_proto = test_func_proto, | ||
| 65 | .is_valid_access = test_is_valid_access, | ||
| 66 | }; | ||
| 67 | |||
| 68 | static struct bpf_prog_type_list tl_prog = { | ||
| 69 | .ops = &test_ops, | ||
| 70 | .type = BPF_PROG_TYPE_UNSPEC, | ||
| 71 | }; | ||
| 72 | |||
| 73 | static int __init register_test_ops(void) | ||
| 74 | { | ||
| 75 | bpf_register_prog_type(&tl_prog); | ||
| 76 | return 0; | ||
| 77 | } | ||
| 78 | late_initcall(register_test_ops); | ||
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a28e09c7825d..630a7bac1e51 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
| @@ -755,7 +755,7 @@ static int check_func_arg(struct verifier_env *env, u32 regno, | |||
| 755 | enum bpf_reg_type expected_type; | 755 | enum bpf_reg_type expected_type; | 
| 756 | int err = 0; | 756 | int err = 0; | 
| 757 | 757 | ||
| 758 | if (arg_type == ARG_ANYTHING) | 758 | if (arg_type == ARG_DONTCARE) | 
| 759 | return 0; | 759 | return 0; | 
| 760 | 760 | ||
| 761 | if (reg->type == NOT_INIT) { | 761 | if (reg->type == NOT_INIT) { | 
| @@ -763,6 +763,9 @@ static int check_func_arg(struct verifier_env *env, u32 regno, | |||
| 763 | return -EACCES; | 763 | return -EACCES; | 
| 764 | } | 764 | } | 
| 765 | 765 | ||
| 766 | if (arg_type == ARG_ANYTHING) | ||
| 767 | return 0; | ||
| 768 | |||
| 766 | if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || | 769 | if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || | 
| 767 | arg_type == ARG_PTR_TO_MAP_VALUE) { | 770 | arg_type == ARG_PTR_TO_MAP_VALUE) { | 
| 768 | expected_type = PTR_TO_STACK; | 771 | expected_type = PTR_TO_STACK; | 
| @@ -770,6 +773,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, | |||
| 770 | expected_type = CONST_IMM; | 773 | expected_type = CONST_IMM; | 
| 771 | } else if (arg_type == ARG_CONST_MAP_PTR) { | 774 | } else if (arg_type == ARG_CONST_MAP_PTR) { | 
| 772 | expected_type = CONST_PTR_TO_MAP; | 775 | expected_type = CONST_PTR_TO_MAP; | 
| 776 | } else if (arg_type == ARG_PTR_TO_CTX) { | ||
| 777 | expected_type = PTR_TO_CTX; | ||
| 773 | } else { | 778 | } else { | 
| 774 | verbose("unsupported arg_type %d\n", arg_type); | 779 | verbose("unsupported arg_type %d\n", arg_type); | 
| 775 | return -EFAULT; | 780 | return -EFAULT; | 
| @@ -852,7 +857,7 @@ static int check_call(struct verifier_env *env, int func_id) | |||
| 852 | } | 857 | } | 
| 853 | 858 | ||
| 854 | /* eBPF programs must be GPL compatible to use GPL-ed functions */ | 859 | /* eBPF programs must be GPL compatible to use GPL-ed functions */ | 
| 855 | if (!env->prog->aux->is_gpl_compatible && fn->gpl_only) { | 860 | if (!env->prog->gpl_compatible && fn->gpl_only) { | 
| 856 | verbose("cannot call GPL only function from proprietary program\n"); | 861 | verbose("cannot call GPL only function from proprietary program\n"); | 
| 857 | return -EINVAL; | 862 | return -EINVAL; | 
| 858 | } | 863 | } | 
| @@ -1172,6 +1177,18 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) | |||
| 1172 | return 0; | 1177 | return 0; | 
| 1173 | } | 1178 | } | 
| 1174 | 1179 | ||
| 1180 | static bool may_access_skb(enum bpf_prog_type type) | ||
| 1181 | { | ||
| 1182 | switch (type) { | ||
| 1183 | case BPF_PROG_TYPE_SOCKET_FILTER: | ||
| 1184 | case BPF_PROG_TYPE_SCHED_CLS: | ||
| 1185 | case BPF_PROG_TYPE_SCHED_ACT: | ||
| 1186 | return true; | ||
| 1187 | default: | ||
| 1188 | return false; | ||
| 1189 | } | ||
| 1190 | } | ||
| 1191 | |||
| 1175 | /* verify safety of LD_ABS|LD_IND instructions: | 1192 | /* verify safety of LD_ABS|LD_IND instructions: | 
| 1176 | * - they can only appear in the programs where ctx == skb | 1193 | * - they can only appear in the programs where ctx == skb | 
| 1177 | * - since they are wrappers of function calls, they scratch R1-R5 registers, | 1194 | * - since they are wrappers of function calls, they scratch R1-R5 registers, | 
| @@ -1194,8 +1211,8 @@ static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) | |||
| 1194 | struct reg_state *reg; | 1211 | struct reg_state *reg; | 
| 1195 | int i, err; | 1212 | int i, err; | 
| 1196 | 1213 | ||
| 1197 | if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) { | 1214 | if (!may_access_skb(env->prog->type)) { | 
| 1198 | verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n"); | 1215 | verbose("BPF_LD_ABS|IND instructions not allowed for this program type\n"); | 
| 1199 | return -EINVAL; | 1216 | return -EINVAL; | 
| 1200 | } | 1217 | } | 
| 1201 | 1218 | ||
| @@ -1606,11 +1623,10 @@ static int do_check(struct verifier_env *env) | |||
| 1606 | return err; | 1623 | return err; | 
| 1607 | 1624 | ||
| 1608 | } else if (class == BPF_LDX) { | 1625 | } else if (class == BPF_LDX) { | 
| 1609 | if (BPF_MODE(insn->code) != BPF_MEM || | 1626 | enum bpf_reg_type src_reg_type; | 
| 1610 | insn->imm != 0) { | 1627 | |
| 1611 | verbose("BPF_LDX uses reserved fields\n"); | 1628 | /* check for reserved fields is already done */ | 
| 1612 | return -EINVAL; | 1629 | |
| 1613 | } | ||
| 1614 | /* check src operand */ | 1630 | /* check src operand */ | 
| 1615 | err = check_reg_arg(regs, insn->src_reg, SRC_OP); | 1631 | err = check_reg_arg(regs, insn->src_reg, SRC_OP); | 
| 1616 | if (err) | 1632 | if (err) | 
| @@ -1629,6 +1645,29 @@ static int do_check(struct verifier_env *env) | |||
| 1629 | if (err) | 1645 | if (err) | 
| 1630 | return err; | 1646 | return err; | 
| 1631 | 1647 | ||
| 1648 | src_reg_type = regs[insn->src_reg].type; | ||
| 1649 | |||
| 1650 | if (insn->imm == 0 && BPF_SIZE(insn->code) == BPF_W) { | ||
| 1651 | /* saw a valid insn | ||
| 1652 | * dst_reg = *(u32 *)(src_reg + off) | ||
| 1653 | * use reserved 'imm' field to mark this insn | ||
| 1654 | */ | ||
| 1655 | insn->imm = src_reg_type; | ||
| 1656 | |||
| 1657 | } else if (src_reg_type != insn->imm && | ||
| 1658 | (src_reg_type == PTR_TO_CTX || | ||
| 1659 | insn->imm == PTR_TO_CTX)) { | ||
| 1660 | * Abuser program is trying to use the same insn | ||
| 1661 | * dst_reg = *(u32*) (src_reg + off) | ||
| 1662 | * with different pointer types: | ||
| 1663 | * src_reg == ctx in one branch and | ||
| 1664 | * src_reg == stack|map in some other branch. | ||
| 1665 | * Reject it. | ||
| 1666 | */ | ||
| 1667 | verbose("same insn cannot be used with different pointers\n"); | ||
| 1668 | return -EINVAL; | ||
| 1669 | } | ||
| 1670 | |||
| 1632 | } else if (class == BPF_STX) { | 1671 | } else if (class == BPF_STX) { | 
| 1633 | if (BPF_MODE(insn->code) == BPF_XADD) { | 1672 | if (BPF_MODE(insn->code) == BPF_XADD) { | 
| 1634 | err = check_xadd(env, insn); | 1673 | err = check_xadd(env, insn); | 
| @@ -1776,6 +1815,13 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) | |||
| 1776 | int i, j; | 1815 | int i, j; | 
| 1777 | 1816 | ||
| 1778 | for (i = 0; i < insn_cnt; i++, insn++) { | 1817 | for (i = 0; i < insn_cnt; i++, insn++) { | 
| 1818 | if (BPF_CLASS(insn->code) == BPF_LDX && | ||
| 1819 | (BPF_MODE(insn->code) != BPF_MEM || | ||
| 1820 | insn->imm != 0)) { | ||
| 1821 | verbose("BPF_LDX uses reserved fields\n"); | ||
| 1822 | return -EINVAL; | ||
| 1823 | } | ||
| 1824 | |||
| 1779 | if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { | 1825 | if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { | 
| 1780 | struct bpf_map *map; | 1826 | struct bpf_map *map; | 
| 1781 | struct fd f; | 1827 | struct fd f; | 
| @@ -1867,6 +1913,92 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) | |||
| 1867 | insn->src_reg = 0; | 1913 | insn->src_reg = 0; | 
| 1868 | } | 1914 | } | 
| 1869 | 1915 | ||
| 1916 | static void adjust_branches(struct bpf_prog *prog, int pos, int delta) | ||
| 1917 | { | ||
| 1918 | struct bpf_insn *insn = prog->insnsi; | ||
| 1919 | int insn_cnt = prog->len; | ||
| 1920 | int i; | ||
| 1921 | |||
| 1922 | for (i = 0; i < insn_cnt; i++, insn++) { | ||
| 1923 | if (BPF_CLASS(insn->code) != BPF_JMP || | ||
| 1924 | BPF_OP(insn->code) == BPF_CALL || | ||
| 1925 | BPF_OP(insn->code) == BPF_EXIT) | ||
| 1926 | continue; | ||
| 1927 | |||
| 1928 | /* adjust offset of jmps if necessary */ | ||
| 1929 | if (i < pos && i + insn->off + 1 > pos) | ||
| 1930 | insn->off += delta; | ||
| 1931 | else if (i > pos && i + insn->off + 1 < pos) | ||
| 1932 | insn->off -= delta; | ||
| 1933 | } | ||
| 1934 | } | ||
| 1935 | |||
| 1936 | /* convert load instructions that access fields of 'struct __sk_buff' | ||
| 1937 | * into sequence of instructions that access fields of 'struct sk_buff' | ||
| 1938 | */ | ||
| 1939 | static int convert_ctx_accesses(struct verifier_env *env) | ||
| 1940 | { | ||
| 1941 | struct bpf_insn *insn = env->prog->insnsi; | ||
| 1942 | int insn_cnt = env->prog->len; | ||
| 1943 | struct bpf_insn insn_buf[16]; | ||
| 1944 | struct bpf_prog *new_prog; | ||
| 1945 | u32 cnt; | ||
| 1946 | int i; | ||
| 1947 | |||
| 1948 | if (!env->prog->aux->ops->convert_ctx_access) | ||
| 1949 | return 0; | ||
| 1950 | |||
| 1951 | for (i = 0; i < insn_cnt; i++, insn++) { | ||
| 1952 | if (insn->code != (BPF_LDX | BPF_MEM | BPF_W)) | ||
| 1953 | continue; | ||
| 1954 | |||
| 1955 | if (insn->imm != PTR_TO_CTX) { | ||
| 1956 | /* clear internal mark */ | ||
| 1957 | insn->imm = 0; | ||
| 1958 | continue; | ||
| 1959 | } | ||
| 1960 | |||
| 1961 | cnt = env->prog->aux->ops-> | ||
| 1962 | convert_ctx_access(insn->dst_reg, insn->src_reg, | ||
| 1963 | insn->off, insn_buf); | ||
| 1964 | if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { | ||
| 1965 | verbose("bpf verifier is misconfigured\n"); | ||
| 1966 | return -EINVAL; | ||
| 1967 | } | ||
| 1968 | |||
| 1969 | if (cnt == 1) { | ||
| 1970 | memcpy(insn, insn_buf, sizeof(*insn)); | ||
| 1971 | continue; | ||
| 1972 | } | ||
| 1973 | |||
| 1974 | /* several new insns need to be inserted. Make room for them */ | ||
| 1975 | insn_cnt += cnt - 1; | ||
| 1976 | new_prog = bpf_prog_realloc(env->prog, | ||
| 1977 | bpf_prog_size(insn_cnt), | ||
| 1978 | GFP_USER); | ||
| 1979 | if (!new_prog) | ||
| 1980 | return -ENOMEM; | ||
| 1981 | |||
| 1982 | new_prog->len = insn_cnt; | ||
| 1983 | |||
| 1984 | memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1, | ||
| 1985 | sizeof(*insn) * (insn_cnt - i - cnt)); | ||
| 1986 | |||
| 1987 | /* copy substitute insns in place of load instruction */ | ||
| 1988 | memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt); | ||
| 1989 | |||
| 1990 | /* adjust branches in the whole program */ | ||
| 1991 | adjust_branches(new_prog, i, cnt - 1); | ||
| 1992 | |||
| 1993 | /* keep walking new program and skip insns we just inserted */ | ||
| 1994 | env->prog = new_prog; | ||
| 1995 | insn = new_prog->insnsi + i + cnt - 1; | ||
| 1996 | i += cnt - 1; | ||
| 1997 | } | ||
| 1998 | |||
| 1999 | return 0; | ||
| 2000 | } | ||
| 2001 | |||
| 1870 | static void free_states(struct verifier_env *env) | 2002 | static void free_states(struct verifier_env *env) | 
| 1871 | { | 2003 | { | 
| 1872 | struct verifier_state_list *sl, *sln; | 2004 | struct verifier_state_list *sl, *sln; | 
| @@ -1889,13 +2021,13 @@ static void free_states(struct verifier_env *env) | |||
| 1889 | kfree(env->explored_states); | 2021 | kfree(env->explored_states); | 
| 1890 | } | 2022 | } | 
| 1891 | 2023 | ||
| 1892 | int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) | 2024 | int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) | 
| 1893 | { | 2025 | { | 
| 1894 | char __user *log_ubuf = NULL; | 2026 | char __user *log_ubuf = NULL; | 
| 1895 | struct verifier_env *env; | 2027 | struct verifier_env *env; | 
| 1896 | int ret = -EINVAL; | 2028 | int ret = -EINVAL; | 
| 1897 | 2029 | ||
| 1898 | if (prog->len <= 0 || prog->len > BPF_MAXINSNS) | 2030 | if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) | 
| 1899 | return -E2BIG; | 2031 | return -E2BIG; | 
| 1900 | 2032 | ||
| 1901 | /* 'struct verifier_env' can be global, but since it's not small, | 2033 | /* 'struct verifier_env' can be global, but since it's not small, | 
| @@ -1905,7 +2037,7 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) | |||
| 1905 | if (!env) | 2037 | if (!env) | 
| 1906 | return -ENOMEM; | 2038 | return -ENOMEM; | 
| 1907 | 2039 | ||
| 1908 | env->prog = prog; | 2040 | env->prog = *prog; | 
| 1909 | 2041 | ||
| 1910 | /* grab the mutex to protect few globals used by verifier */ | 2042 | /* grab the mutex to protect few globals used by verifier */ | 
| 1911 | mutex_lock(&bpf_verifier_lock); | 2043 | mutex_lock(&bpf_verifier_lock); | 
| @@ -1937,7 +2069,7 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) | |||
| 1937 | if (ret < 0) | 2069 | if (ret < 0) | 
| 1938 | goto skip_full_check; | 2070 | goto skip_full_check; | 
| 1939 | 2071 | ||
| 1940 | env->explored_states = kcalloc(prog->len, | 2072 | env->explored_states = kcalloc(env->prog->len, | 
| 1941 | sizeof(struct verifier_state_list *), | 2073 | sizeof(struct verifier_state_list *), | 
| 1942 | GFP_USER); | 2074 | GFP_USER); | 
| 1943 | ret = -ENOMEM; | 2075 | ret = -ENOMEM; | 
| @@ -1954,6 +2086,10 @@ skip_full_check: | |||
| 1954 | while (pop_stack(env, NULL) >= 0); | 2086 | while (pop_stack(env, NULL) >= 0); | 
| 1955 | free_states(env); | 2087 | free_states(env); | 
| 1956 | 2088 | ||
| 2089 | if (ret == 0) | ||
| 2090 | /* program is valid, convert *(u32*)(ctx + off) accesses */ | ||
| 2091 | ret = convert_ctx_accesses(env); | ||
| 2092 | |||
| 1957 | if (log_level && log_len >= log_size - 1) { | 2093 | if (log_level && log_len >= log_size - 1) { | 
| 1958 | BUG_ON(log_len >= log_size); | 2094 | BUG_ON(log_len >= log_size); | 
| 1959 | /* verifier log exceeded user supplied buffer */ | 2095 | /* verifier log exceeded user supplied buffer */ | 
| @@ -1969,18 +2105,18 @@ skip_full_check: | |||
| 1969 | 2105 | ||
| 1970 | if (ret == 0 && env->used_map_cnt) { | 2106 | if (ret == 0 && env->used_map_cnt) { | 
| 1971 | /* if program passed verifier, update used_maps in bpf_prog_info */ | 2107 | /* if program passed verifier, update used_maps in bpf_prog_info */ | 
| 1972 | prog->aux->used_maps = kmalloc_array(env->used_map_cnt, | 2108 | env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt, | 
| 1973 | sizeof(env->used_maps[0]), | 2109 | sizeof(env->used_maps[0]), | 
| 1974 | GFP_KERNEL); | 2110 | GFP_KERNEL); | 
| 1975 | 2111 | ||
| 1976 | if (!prog->aux->used_maps) { | 2112 | if (!env->prog->aux->used_maps) { | 
| 1977 | ret = -ENOMEM; | 2113 | ret = -ENOMEM; | 
| 1978 | goto free_log_buf; | 2114 | goto free_log_buf; | 
| 1979 | } | 2115 | } | 
| 1980 | 2116 | ||
| 1981 | memcpy(prog->aux->used_maps, env->used_maps, | 2117 | memcpy(env->prog->aux->used_maps, env->used_maps, | 
| 1982 | sizeof(env->used_maps[0]) * env->used_map_cnt); | 2118 | sizeof(env->used_maps[0]) * env->used_map_cnt); | 
| 1983 | prog->aux->used_map_cnt = env->used_map_cnt; | 2119 | env->prog->aux->used_map_cnt = env->used_map_cnt; | 
| 1984 | 2120 | ||
| 1985 | /* program is valid. Convert pseudo bpf_ld_imm64 into generic | 2121 | /* program is valid. Convert pseudo bpf_ld_imm64 into generic | 
| 1986 | * bpf_ld_imm64 instructions | 2122 | * bpf_ld_imm64 instructions | 
| @@ -1992,11 +2128,12 @@ free_log_buf: | |||
| 1992 | if (log_level) | 2128 | if (log_level) | 
| 1993 | vfree(log_buf); | 2129 | vfree(log_buf); | 
| 1994 | free_env: | 2130 | free_env: | 
| 1995 | if (!prog->aux->used_maps) | 2131 | if (!env->prog->aux->used_maps) | 
| 1996 | /* if we didn't copy map pointers into bpf_prog_info, release | 2132 | /* if we didn't copy map pointers into bpf_prog_info, release | 
| 1997 | * them now. Otherwise free_bpf_prog_info() will release them. | 2133 | * them now. Otherwise free_bpf_prog_info() will release them. | 
| 1998 | */ | 2134 | */ | 
| 1999 | release_maps(env); | 2135 | release_maps(env); | 
| 2136 | *prog = env->prog; | ||
| 2000 | kfree(env); | 2137 | kfree(env); | 
| 2001 | mutex_unlock(&bpf_verifier_lock); | 2138 | mutex_unlock(&bpf_verifier_lock); | 
| 2002 | return ret; | 2139 | return ret; | 
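The verifier now stashes the pointer type of each 32-bit context load in the otherwise reserved imm field, and convert_ctx_accesses() later asks the program type's convert_ctx_access callback to rewrite loads from the user-visible context into loads from the real kernel object, fixing up branch offsets when the rewrite grows the program. A hedged sketch of what such a callback looks like for a single __sk_buff field, loosely modeled on the networking side of the same series (the demo_* name and the fallback behaviour are illustrative assumptions):

        /* Rewrite a load of __sk_buff->len into a load from the corresponding
         * offset in struct sk_buff; return how many insns were emitted.
         */
        static u32 demo_convert_ctx_access(int dst_reg, int src_reg, int ctx_off,
                                           struct bpf_insn *insn_buf)
        {
                struct bpf_insn *insn = insn_buf;

                switch (ctx_off) {
                case offsetof(struct __sk_buff, len):
                        *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
                                              offsetof(struct sk_buff, len));
                        break;
                default:
                        /* unknown field: load 0 so the program stays valid */
                        *insn++ = BPF_MOV64_IMM(dst_reg, 0);
                        break;
                }

                return insn - insn_buf;
        }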
diff --git a/kernel/capability.c b/kernel/capability.c
index 989f5bfc57dc..45432b54d5c6 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
| @@ -35,6 +35,7 @@ static int __init file_caps_disable(char *str) | |||
| 35 | } | 35 | } | 
| 36 | __setup("no_file_caps", file_caps_disable); | 36 | __setup("no_file_caps", file_caps_disable); | 
| 37 | 37 | ||
| 38 | #ifdef CONFIG_MULTIUSER | ||
| 38 | /* | 39 | /* | 
| 39 | * More recent versions of libcap are available from: | 40 | * More recent versions of libcap are available from: | 
| 40 | * | 41 | * | 
| @@ -386,6 +387,24 @@ bool ns_capable(struct user_namespace *ns, int cap) | |||
| 386 | } | 387 | } | 
| 387 | EXPORT_SYMBOL(ns_capable); | 388 | EXPORT_SYMBOL(ns_capable); | 
| 388 | 389 | ||
| 390 | |||
| 391 | /** | ||
| 392 | * capable - Determine if the current task has a superior capability in effect | ||
| 393 | * @cap: The capability to be tested for | ||
| 394 | * | ||
| 395 | * Return true if the current task has the given superior capability currently | ||
| 396 | * available for use, false if not. | ||
| 397 | * | ||
| 398 | * This sets PF_SUPERPRIV on the task if the capability is available on the | ||
| 399 | * assumption that it's about to be used. | ||
| 400 | */ | ||
| 401 | bool capable(int cap) | ||
| 402 | { | ||
| 403 | return ns_capable(&init_user_ns, cap); | ||
| 404 | } | ||
| 405 | EXPORT_SYMBOL(capable); | ||
| 406 | #endif /* CONFIG_MULTIUSER */ | ||
| 407 | |||
| 389 | /** | 408 | /** | 
| 390 | * file_ns_capable - Determine if the file's opener had a capability in effect | 409 | * file_ns_capable - Determine if the file's opener had a capability in effect | 
| 391 | * @file: The file we want to check | 410 | * @file: The file we want to check | 
| @@ -412,22 +431,6 @@ bool file_ns_capable(const struct file *file, struct user_namespace *ns, | |||
| 412 | EXPORT_SYMBOL(file_ns_capable); | 431 | EXPORT_SYMBOL(file_ns_capable); | 
| 413 | 432 | ||
| 414 | /** | 433 | /** | 
| 415 | * capable - Determine if the current task has a superior capability in effect | ||
| 416 | * @cap: The capability to be tested for | ||
| 417 | * | ||
| 418 | * Return true if the current task has the given superior capability currently | ||
| 419 | * available for use, false if not. | ||
| 420 | * | ||
| 421 | * This sets PF_SUPERPRIV on the task if the capability is available on the | ||
| 422 | * assumption that it's about to be used. | ||
| 423 | */ | ||
| 424 | bool capable(int cap) | ||
| 425 | { | ||
| 426 | return ns_capable(&init_user_ns, cap); | ||
| 427 | } | ||
| 428 | EXPORT_SYMBOL(capable); | ||
| 429 | |||
| 430 | /** | ||
| 431 | * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped | 434 | * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped | 
| 432 | * @inode: The inode in question | 435 | * @inode: The inode in question | 
| 433 | * @cap: The capability in question | 436 | * @cap: The capability in question | 
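Moving capable() inside the new #ifdef only works because the CONFIG_MULTIUSER=n configuration is expected to provide trivial header-side stubs, so existing callers keep compiling. A hedged sketch of the fallback this relies on (the real stub lives in the capability headers; treat the exact form and location here as an assumption):

        #ifndef CONFIG_MULTIUSER
        /* with only the root user present, capability checks can simply succeed */
        static inline bool capable(int cap)
        {
                return true;
        }
        #endif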
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 29a7b2cc593e..469dd547770c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
| @@ -3806,10 +3806,7 @@ static void *pidlist_allocate(int count) | |||
| 3806 | 3806 | ||
| 3807 | static void pidlist_free(void *p) | 3807 | static void pidlist_free(void *p) | 
| 3808 | { | 3808 | { | 
| 3809 | if (is_vmalloc_addr(p)) | 3809 | kvfree(p); | 
| 3810 | vfree(p); | ||
| 3811 | else | ||
| 3812 | kfree(p); | ||
| 3813 | } | 3810 | } | 
| 3814 | 3811 | ||
| 3815 | /* | 3812 | /* | 
| @@ -4199,7 +4196,9 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) | |||
| 4199 | 4196 | ||
| 4200 | static int cgroup_pidlist_show(struct seq_file *s, void *v) | 4197 | static int cgroup_pidlist_show(struct seq_file *s, void *v) | 
| 4201 | { | 4198 | { | 
| 4202 | return seq_printf(s, "%d\n", *(int *)v); | 4199 | seq_printf(s, "%d\n", *(int *)v); | 
| 4200 | |||
| 4201 | return 0; | ||
| 4203 | } | 4202 | } | 
| 4204 | 4203 | ||
| 4205 | static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, | 4204 | static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, | 
| @@ -5040,6 +5039,9 @@ int __init cgroup_init(void) | |||
| 5040 | WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes)); | 5039 | WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes)); | 
| 5041 | WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes)); | 5040 | WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes)); | 
| 5042 | } | 5041 | } | 
| 5042 | |||
| 5043 | if (ss->bind) | ||
| 5044 | ss->bind(init_css_set.subsys[ssid]); | ||
| 5043 | } | 5045 | } | 
| 5044 | 5046 | ||
| 5045 | cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); | 5047 | cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); | 
| @@ -5451,7 +5453,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, | |||
| 5451 | struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) | 5453 | struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) | 
| 5452 | { | 5454 | { | 
| 5453 | WARN_ON_ONCE(!rcu_read_lock_held()); | 5455 | WARN_ON_ONCE(!rcu_read_lock_held()); | 
| 5454 | return idr_find(&ss->css_idr, id); | 5456 | return id > 0 ? idr_find(&ss->css_idr, id) : NULL; | 
| 5455 | } | 5457 | } | 
| 5456 | 5458 | ||
| 5457 | #ifdef CONFIG_CGROUP_DEBUG | 5459 | #ifdef CONFIG_CGROUP_DEBUG | 
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 937ecdfdf258..72d59a1a6eb6 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
| @@ -39,15 +39,15 @@ void context_tracking_cpu_set(int cpu) | |||
| 39 | } | 39 | } | 
| 40 | 40 | ||
| 41 | /** | 41 | /** | 
| 42 | * context_tracking_user_enter - Inform the context tracking that the CPU is going to | 42 | * context_tracking_enter - Inform the context tracking that the CPU is going | 
| 43 | * enter userspace mode. | 43 | * enter user or guest space mode. | 
| 44 | * | 44 | * | 
| 45 | * This function must be called right before we switch from the kernel | 45 | * This function must be called right before we switch from the kernel | 
| 46 | * to userspace, when it's guaranteed the remaining kernel instructions | 46 | * to user or guest space, when it's guaranteed the remaining kernel | 
| 47 | * to execute won't use any RCU read side critical section because this | 47 | * instructions to execute won't use any RCU read side critical section | 
| 48 | * function sets RCU in extended quiescent state. | 48 | * because this function sets RCU in extended quiescent state. | 
| 49 | */ | 49 | */ | 
| 50 | void context_tracking_user_enter(void) | 50 | void context_tracking_enter(enum ctx_state state) | 
| 51 | { | 51 | { | 
| 52 | unsigned long flags; | 52 | unsigned long flags; | 
| 53 | 53 | ||
| @@ -75,9 +75,8 @@ void context_tracking_user_enter(void) | |||
| 75 | WARN_ON_ONCE(!current->mm); | 75 | WARN_ON_ONCE(!current->mm); | 
| 76 | 76 | ||
| 77 | local_irq_save(flags); | 77 | local_irq_save(flags); | 
| 78 | if ( __this_cpu_read(context_tracking.state) != IN_USER) { | 78 | if ( __this_cpu_read(context_tracking.state) != state) { | 
| 79 | if (__this_cpu_read(context_tracking.active)) { | 79 | if (__this_cpu_read(context_tracking.active)) { | 
| 80 | trace_user_enter(0); | ||
| 81 | /* | 80 | /* | 
| 82 | * At this stage, only low level arch entry code remains and | 81 | * At this stage, only low level arch entry code remains and | 
| 83 | * then we'll run in userspace. We can assume there won't be | 82 | * then we'll run in userspace. We can assume there won't be | 
| @@ -85,7 +84,10 @@ void context_tracking_user_enter(void) | |||
| 85 | * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency | 84 | * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency | 
| 86 | * on the tick. | 85 | * on the tick. | 
| 87 | */ | 86 | */ | 
| 88 | vtime_user_enter(current); | 87 | if (state == CONTEXT_USER) { | 
| 88 | trace_user_enter(0); | ||
| 89 | vtime_user_enter(current); | ||
| 90 | } | ||
| 89 | rcu_user_enter(); | 91 | rcu_user_enter(); | 
| 90 | } | 92 | } | 
| 91 | /* | 93 | /* | 
| @@ -101,24 +103,32 @@ void context_tracking_user_enter(void) | |||
| 101 | * OTOH we can spare the calls to vtime and RCU when context_tracking.active | 103 | * OTOH we can spare the calls to vtime and RCU when context_tracking.active | 
| 102 | * is false because we know that CPU is not tickless. | 104 | * is false because we know that CPU is not tickless. | 
| 103 | */ | 105 | */ | 
| 104 | __this_cpu_write(context_tracking.state, IN_USER); | 106 | __this_cpu_write(context_tracking.state, state); | 
| 105 | } | 107 | } | 
| 106 | local_irq_restore(flags); | 108 | local_irq_restore(flags); | 
| 107 | } | 109 | } | 
| 110 | NOKPROBE_SYMBOL(context_tracking_enter); | ||
| 111 | EXPORT_SYMBOL_GPL(context_tracking_enter); | ||
| 112 | |||
| 113 | void context_tracking_user_enter(void) | ||
| 114 | { | ||
| 115 | context_tracking_enter(CONTEXT_USER); | ||
| 116 | } | ||
| 108 | NOKPROBE_SYMBOL(context_tracking_user_enter); | 117 | NOKPROBE_SYMBOL(context_tracking_user_enter); | 
| 109 | 118 | ||
| 110 | /** | 119 | /** | 
| 111 | * context_tracking_user_exit - Inform the context tracking that the CPU is | 120 | * context_tracking_exit - Inform the context tracking that the CPU is | 
| 112 | * exiting userspace mode and entering the kernel. | 121 | * exiting user or guest mode and entering the kernel. | 
| 113 | * | 122 | * | 
| 114 | * This function must be called after we entered the kernel from userspace | 123 | * This function must be called after we entered the kernel from user or | 
| 115 | * before any use of RCU read side critical section. This potentially include | 124 | * guest space before any use of RCU read side critical section. This | 
| 116 | * any high level kernel code like syscalls, exceptions, signal handling, etc... | 125 | * potentially include any high level kernel code like syscalls, exceptions, | 
| 126 | * signal handling, etc... | ||
| 117 | * | 127 | * | 
| 118 | * This call supports re-entrancy. This way it can be called from any exception | 128 | * This call supports re-entrancy. This way it can be called from any exception | 
| 119 | * handler without needing to know if we came from userspace or not. | 129 | * handler without needing to know if we came from userspace or not. | 
| 120 | */ | 130 | */ | 
| 121 | void context_tracking_user_exit(void) | 131 | void context_tracking_exit(enum ctx_state state) | 
| 122 | { | 132 | { | 
| 123 | unsigned long flags; | 133 | unsigned long flags; | 
| 124 | 134 | ||
| @@ -129,20 +139,29 @@ void context_tracking_user_exit(void) | |||
| 129 | return; | 139 | return; | 
| 130 | 140 | ||
| 131 | local_irq_save(flags); | 141 | local_irq_save(flags); | 
| 132 | if (__this_cpu_read(context_tracking.state) == IN_USER) { | 142 | if (__this_cpu_read(context_tracking.state) == state) { | 
| 133 | if (__this_cpu_read(context_tracking.active)) { | 143 | if (__this_cpu_read(context_tracking.active)) { | 
| 134 | /* | 144 | /* | 
| 135 | * We are going to run code that may use RCU. Inform | 145 | * We are going to run code that may use RCU. Inform | 
| 136 | * RCU core about that (ie: we may need the tick again). | 146 | * RCU core about that (ie: we may need the tick again). | 
| 137 | */ | 147 | */ | 
| 138 | rcu_user_exit(); | 148 | rcu_user_exit(); | 
| 139 | vtime_user_exit(current); | 149 | if (state == CONTEXT_USER) { | 
| 140 | trace_user_exit(0); | 150 | vtime_user_exit(current); | 
| 151 | trace_user_exit(0); | ||
| 152 | } | ||
| 141 | } | 153 | } | 
| 142 | __this_cpu_write(context_tracking.state, IN_KERNEL); | 154 | __this_cpu_write(context_tracking.state, CONTEXT_KERNEL); | 
| 143 | } | 155 | } | 
| 144 | local_irq_restore(flags); | 156 | local_irq_restore(flags); | 
| 145 | } | 157 | } | 
| 158 | NOKPROBE_SYMBOL(context_tracking_exit); | ||
| 159 | EXPORT_SYMBOL_GPL(context_tracking_exit); | ||
| 160 | |||
| 161 | void context_tracking_user_exit(void) | ||
| 162 | { | ||
| 163 | context_tracking_exit(CONTEXT_USER); | ||
| 164 | } | ||
| 146 | NOKPROBE_SYMBOL(context_tracking_user_exit); | 165 | NOKPROBE_SYMBOL(context_tracking_user_exit); | 
| 147 | 166 | ||
| 148 | /** | 167 | /** | 
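context_tracking_user_enter()/_exit() become thin wrappers around the state-parameterised context_tracking_enter()/_exit(), and exporting the latter lets the guest-entry path reuse the same RCU plumbing while keeping the vtime/trace hooks specific to CONTEXT_USER. An illustrative pair of wrappers showing the intended guest usage, assuming the CONTEXT_GUEST state introduced by the same series (the demo_ names are not kernel symbols):

        static inline void demo_guest_enter(void)
        {
                context_tracking_enter(CONTEXT_GUEST);
        }

        static inline void demo_guest_exit(void)
        {
                context_tracking_exit(CONTEXT_GUEST);
        }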
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 1972b161c61e..94bbe4695232 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
| @@ -20,6 +20,7 @@ | |||
| 20 | #include <linux/gfp.h> | 20 | #include <linux/gfp.h> | 
| 21 | #include <linux/suspend.h> | 21 | #include <linux/suspend.h> | 
| 22 | #include <linux/lockdep.h> | 22 | #include <linux/lockdep.h> | 
| 23 | #include <linux/tick.h> | ||
| 23 | #include <trace/events/power.h> | 24 | #include <trace/events/power.h> | 
| 24 | 25 | ||
| 25 | #include "smpboot.h" | 26 | #include "smpboot.h" | 
| @@ -338,6 +339,8 @@ static int __ref take_cpu_down(void *_param) | |||
| 338 | return err; | 339 | return err; | 
| 339 | 340 | ||
| 340 | cpu_notify(CPU_DYING | param->mod, param->hcpu); | 341 | cpu_notify(CPU_DYING | param->mod, param->hcpu); | 
| 342 | /* Give up timekeeping duties */ | ||
| 343 | tick_handover_do_timer(); | ||
| 341 | /* Park the stopper thread */ | 344 | /* Park the stopper thread */ | 
| 342 | kthread_park(current); | 345 | kthread_park(current); | 
| 343 | return 0; | 346 | return 0; | 
| @@ -408,13 +411,17 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
| 408 | * | 411 | * | 
| 409 | * Wait for the stop thread to go away. | 412 | * Wait for the stop thread to go away. | 
| 410 | */ | 413 | */ | 
| 411 | while (!idle_cpu(cpu)) | 414 | while (!per_cpu(cpu_dead_idle, cpu)) | 
| 412 | cpu_relax(); | 415 | cpu_relax(); | 
| 416 | smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */ | ||
| 417 | per_cpu(cpu_dead_idle, cpu) = false; | ||
| 413 | 418 | ||
| 419 | hotplug_cpu__broadcast_tick_pull(cpu); | ||
| 414 | /* This actually kills the CPU. */ | 420 | /* This actually kills the CPU. */ | 
| 415 | __cpu_die(cpu); | 421 | __cpu_die(cpu); | 
| 416 | 422 | ||
| 417 | /* CPU is completely dead: tell everyone. Too late to complain. */ | 423 | /* CPU is completely dead: tell everyone. Too late to complain. */ | 
| 424 | tick_cleanup_dead_cpu(cpu); | ||
| 418 | cpu_notify_nofail(CPU_DEAD | mod, hcpu); | 425 | cpu_notify_nofail(CPU_DEAD | mod, hcpu); | 
| 419 | 426 | ||
| 420 | check_for_tasks(cpu); | 427 | check_for_tasks(cpu); | 
| @@ -446,6 +453,37 @@ out: | |||
| 446 | EXPORT_SYMBOL(cpu_down); | 453 | EXPORT_SYMBOL(cpu_down); | 
| 447 | #endif /*CONFIG_HOTPLUG_CPU*/ | 454 | #endif /*CONFIG_HOTPLUG_CPU*/ | 
| 448 | 455 | ||
| 456 | /* | ||
| 457 | * Unpark per-CPU smpboot kthreads at CPU-online time. | ||
| 458 | */ | ||
| 459 | static int smpboot_thread_call(struct notifier_block *nfb, | ||
| 460 | unsigned long action, void *hcpu) | ||
| 461 | { | ||
| 462 | int cpu = (long)hcpu; | ||
| 463 | |||
| 464 | switch (action & ~CPU_TASKS_FROZEN) { | ||
| 465 | |||
| 466 | case CPU_ONLINE: | ||
| 467 | smpboot_unpark_threads(cpu); | ||
| 468 | break; | ||
| 469 | |||
| 470 | default: | ||
| 471 | break; | ||
| 472 | } | ||
| 473 | |||
| 474 | return NOTIFY_OK; | ||
| 475 | } | ||
| 476 | |||
| 477 | static struct notifier_block smpboot_thread_notifier = { | ||
| 478 | .notifier_call = smpboot_thread_call, | ||
| 479 | .priority = CPU_PRI_SMPBOOT, | ||
| 480 | }; | ||
| 481 | |||
| 482 | void __cpuinit smpboot_thread_init(void) | ||
| 483 | { | ||
| 484 | register_cpu_notifier(&smpboot_thread_notifier); | ||
| 485 | } | ||
| 486 | |||
| 449 | /* Requires cpu_add_remove_lock to be held */ | 487 | /* Requires cpu_add_remove_lock to be held */ | 
| 450 | static int _cpu_up(unsigned int cpu, int tasks_frozen) | 488 | static int _cpu_up(unsigned int cpu, int tasks_frozen) | 
| 451 | { | 489 | { | 
| @@ -485,9 +523,6 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) | |||
| 485 | goto out_notify; | 523 | goto out_notify; | 
| 486 | BUG_ON(!cpu_online(cpu)); | 524 | BUG_ON(!cpu_online(cpu)); | 
| 487 | 525 | ||
| 488 | /* Wake the per cpu threads */ | ||
| 489 | smpboot_unpark_threads(cpu); | ||
| 490 | |||
| 491 | /* Now call notifier in preparation. */ | 526 | /* Now call notifier in preparation. */ | 
| 492 | cpu_notify(CPU_ONLINE | mod, hcpu); | 527 | cpu_notify(CPU_ONLINE | mod, hcpu); | 
| 493 | 528 | ||
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index fc7f4748d34a..ee14e3a35a29 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
| @@ -622,6 +622,7 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
| 622 | int csn; /* how many cpuset ptrs in csa so far */ | 622 | int csn; /* how many cpuset ptrs in csa so far */ | 
| 623 | int i, j, k; /* indices for partition finding loops */ | 623 | int i, j, k; /* indices for partition finding loops */ | 
| 624 | cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ | 624 | cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ | 
| 625 | cpumask_var_t non_isolated_cpus; /* load balanced CPUs */ | ||
| 625 | struct sched_domain_attr *dattr; /* attributes for custom domains */ | 626 | struct sched_domain_attr *dattr; /* attributes for custom domains */ | 
| 626 | int ndoms = 0; /* number of sched domains in result */ | 627 | int ndoms = 0; /* number of sched domains in result */ | 
| 627 | int nslot; /* next empty doms[] struct cpumask slot */ | 628 | int nslot; /* next empty doms[] struct cpumask slot */ | 
| @@ -631,6 +632,10 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
| 631 | dattr = NULL; | 632 | dattr = NULL; | 
| 632 | csa = NULL; | 633 | csa = NULL; | 
| 633 | 634 | ||
| 635 | if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL)) | ||
| 636 | goto done; | ||
| 637 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); | ||
| 638 | |||
| 634 | /* Special case for the 99% of systems with one, full, sched domain */ | 639 | /* Special case for the 99% of systems with one, full, sched domain */ | 
| 635 | if (is_sched_load_balance(&top_cpuset)) { | 640 | if (is_sched_load_balance(&top_cpuset)) { | 
| 636 | ndoms = 1; | 641 | ndoms = 1; | 
| @@ -643,7 +648,8 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
| 643 | *dattr = SD_ATTR_INIT; | 648 | *dattr = SD_ATTR_INIT; | 
| 644 | update_domain_attr_tree(dattr, &top_cpuset); | 649 | update_domain_attr_tree(dattr, &top_cpuset); | 
| 645 | } | 650 | } | 
| 646 | cpumask_copy(doms[0], top_cpuset.effective_cpus); | 651 | cpumask_and(doms[0], top_cpuset.effective_cpus, | 
| 652 | non_isolated_cpus); | ||
| 647 | 653 | ||
| 648 | goto done; | 654 | goto done; | 
| 649 | } | 655 | } | 
| @@ -666,7 +672,8 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
| 666 | * the corresponding sched domain. | 672 | * the corresponding sched domain. | 
| 667 | */ | 673 | */ | 
| 668 | if (!cpumask_empty(cp->cpus_allowed) && | 674 | if (!cpumask_empty(cp->cpus_allowed) && | 
| 669 | !is_sched_load_balance(cp)) | 675 | !(is_sched_load_balance(cp) && | 
| 676 | cpumask_intersects(cp->cpus_allowed, non_isolated_cpus))) | ||
| 670 | continue; | 677 | continue; | 
| 671 | 678 | ||
| 672 | if (is_sched_load_balance(cp)) | 679 | if (is_sched_load_balance(cp)) | 
| @@ -748,6 +755,7 @@ restart: | |||
| 748 | 755 | ||
| 749 | if (apn == b->pn) { | 756 | if (apn == b->pn) { | 
| 750 | cpumask_or(dp, dp, b->effective_cpus); | 757 | cpumask_or(dp, dp, b->effective_cpus); | 
| 758 | cpumask_and(dp, dp, non_isolated_cpus); | ||
| 751 | if (dattr) | 759 | if (dattr) | 
| 752 | update_domain_attr_tree(dattr + nslot, b); | 760 | update_domain_attr_tree(dattr + nslot, b); | 
| 753 | 761 | ||
| @@ -760,6 +768,7 @@ restart: | |||
| 760 | BUG_ON(nslot != ndoms); | 768 | BUG_ON(nslot != ndoms); | 
| 761 | 769 | ||
| 762 | done: | 770 | done: | 
| 771 | free_cpumask_var(non_isolated_cpus); | ||
| 763 | kfree(csa); | 772 | kfree(csa); | 
| 764 | 773 | ||
| 765 | /* | 774 | /* | 
| @@ -2444,20 +2453,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) | |||
| 2444 | * @node: is this an allowed node? | 2453 | * @node: is this an allowed node? | 
| 2445 | * @gfp_mask: memory allocation flags | 2454 | * @gfp_mask: memory allocation flags | 
| 2446 | * | 2455 | * | 
| 2447 | * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is | 2456 | * If we're in interrupt, yes, we can always allocate. If @node is set in | 
| 2448 | * set, yes, we can always allocate. If node is in our task's mems_allowed, | 2457 | * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this | 
| 2449 | * yes. If it's not a __GFP_HARDWALL request and this node is in the nearest | 2458 | * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, | 
| 2450 | * hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been | 2459 | * yes. If current has access to memory reserves due to TIF_MEMDIE, yes. | 
| 2451 | * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE | ||
| 2452 | * flag, yes. | ||
| 2453 | * Otherwise, no. | 2460 | * Otherwise, no. | 
| 2454 | * | 2461 | * | 
| 2455 | * The __GFP_THISNODE placement logic is really handled elsewhere, | ||
| 2456 | * by forcibly using a zonelist starting at a specified node, and by | ||
| 2457 | * (in get_page_from_freelist()) refusing to consider the zones for | ||
| 2458 | * any node on the zonelist except the first. By the time any such | ||
| 2459 | * calls get to this routine, we should just shut up and say 'yes'. | ||
| 2460 | * | ||
| 2461 | * GFP_USER allocations are marked with the __GFP_HARDWALL bit, | 2462 | * GFP_USER allocations are marked with the __GFP_HARDWALL bit, | 
| 2462 | * and do not allow allocations outside the current tasks cpuset | 2463 | * and do not allow allocations outside the current tasks cpuset | 
| 2463 | * unless the task has been OOM killed as is marked TIF_MEMDIE. | 2464 | * unless the task has been OOM killed as is marked TIF_MEMDIE. | 
| @@ -2493,7 +2494,7 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask) | |||
| 2493 | int allowed; /* is allocation in zone z allowed? */ | 2494 | int allowed; /* is allocation in zone z allowed? */ | 
| 2494 | unsigned long flags; | 2495 | unsigned long flags; | 
| 2495 | 2496 | ||
| 2496 | if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) | 2497 | if (in_interrupt()) | 
| 2497 | return 1; | 2498 | return 1; | 
| 2498 | if (node_isset(node, current->mems_allowed)) | 2499 | if (node_isset(node, current->mems_allowed)) | 
| 2499 | return 1; | 2500 | return 1; | 
diff --git a/kernel/cred.c b/kernel/cred.c
index e0573a43c7df..ec1c07667ec1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
| @@ -29,6 +29,9 @@ | |||
| 29 | 29 | ||
| 30 | static struct kmem_cache *cred_jar; | 30 | static struct kmem_cache *cred_jar; | 
| 31 | 31 | ||
| 32 | /* init to 2 - one for init_task, one to ensure it is never freed */ | ||
| 33 | struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; | ||
| 34 | |||
| 32 | /* | 35 | /* | 
| 33 | * The initial credentials for the initial task | 36 | * The initial credentials for the initial task | 
| 34 | */ | 37 | */ | 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f04daabfd1cf..81aa3a4ece9f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
| @@ -34,14 +34,16 @@ | |||
| 34 | #include <linux/syscalls.h> | 34 | #include <linux/syscalls.h> | 
| 35 | #include <linux/anon_inodes.h> | 35 | #include <linux/anon_inodes.h> | 
| 36 | #include <linux/kernel_stat.h> | 36 | #include <linux/kernel_stat.h> | 
| 37 | #include <linux/cgroup.h> | ||
| 37 | #include <linux/perf_event.h> | 38 | #include <linux/perf_event.h> | 
| 38 | #include <linux/ftrace_event.h> | 39 | #include <linux/ftrace_event.h> | 
| 39 | #include <linux/hw_breakpoint.h> | 40 | #include <linux/hw_breakpoint.h> | 
| 40 | #include <linux/mm_types.h> | 41 | #include <linux/mm_types.h> | 
| 41 | #include <linux/cgroup.h> | ||
| 42 | #include <linux/module.h> | 42 | #include <linux/module.h> | 
| 43 | #include <linux/mman.h> | 43 | #include <linux/mman.h> | 
| 44 | #include <linux/compat.h> | 44 | #include <linux/compat.h> | 
| 45 | #include <linux/bpf.h> | ||
| 46 | #include <linux/filter.h> | ||
| 45 | 47 | ||
| 46 | #include "internal.h" | 48 | #include "internal.h" | 
| 47 | 49 | ||
| @@ -153,7 +155,7 @@ enum event_type_t { | |||
| 153 | */ | 155 | */ | 
| 154 | struct static_key_deferred perf_sched_events __read_mostly; | 156 | struct static_key_deferred perf_sched_events __read_mostly; | 
| 155 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); | 157 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); | 
| 156 | static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events); | 158 | static DEFINE_PER_CPU(int, perf_sched_cb_usages); | 
| 157 | 159 | ||
| 158 | static atomic_t nr_mmap_events __read_mostly; | 160 | static atomic_t nr_mmap_events __read_mostly; | 
| 159 | static atomic_t nr_comm_events __read_mostly; | 161 | static atomic_t nr_comm_events __read_mostly; | 
| @@ -327,6 +329,11 @@ static inline u64 perf_clock(void) | |||
| 327 | return local_clock(); | 329 | return local_clock(); | 
| 328 | } | 330 | } | 
| 329 | 331 | ||
| 332 | static inline u64 perf_event_clock(struct perf_event *event) | ||
| 333 | { | ||
| 334 | return event->clock(); | ||
| 335 | } | ||
| 336 | |||
| 330 | static inline struct perf_cpu_context * | 337 | static inline struct perf_cpu_context * | 
| 331 | __get_cpu_context(struct perf_event_context *ctx) | 338 | __get_cpu_context(struct perf_event_context *ctx) | 
| 332 | { | 339 | { | 
| @@ -351,32 +358,6 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, | |||
| 351 | 358 | ||
| 352 | #ifdef CONFIG_CGROUP_PERF | 359 | #ifdef CONFIG_CGROUP_PERF | 
| 353 | 360 | ||
| 354 | /* | ||
| 355 | * perf_cgroup_info keeps track of time_enabled for a cgroup. | ||
| 356 | * This is a per-cpu dynamically allocated data structure. | ||
| 357 | */ | ||
| 358 | struct perf_cgroup_info { | ||
| 359 | u64 time; | ||
| 360 | u64 timestamp; | ||
| 361 | }; | ||
| 362 | |||
| 363 | struct perf_cgroup { | ||
| 364 | struct cgroup_subsys_state css; | ||
| 365 | struct perf_cgroup_info __percpu *info; | ||
| 366 | }; | ||
| 367 | |||
| 368 | /* | ||
| 369 | * Must ensure cgroup is pinned (css_get) before calling | ||
| 370 | * this function. In other words, we cannot call this function | ||
| 371 | * if there is no cgroup event for the current CPU context. | ||
| 372 | */ | ||
| 373 | static inline struct perf_cgroup * | ||
| 374 | perf_cgroup_from_task(struct task_struct *task) | ||
| 375 | { | ||
| 376 | return container_of(task_css(task, perf_event_cgrp_id), | ||
| 377 | struct perf_cgroup, css); | ||
| 378 | } | ||
| 379 | |||
| 380 | static inline bool | 361 | static inline bool | 
| 381 | perf_cgroup_match(struct perf_event *event) | 362 | perf_cgroup_match(struct perf_event *event) | 
| 382 | { | 363 | { | 
| @@ -905,6 +886,15 @@ static void get_ctx(struct perf_event_context *ctx) | |||
| 905 | WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); | 886 | WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); | 
| 906 | } | 887 | } | 
| 907 | 888 | ||
| 889 | static void free_ctx(struct rcu_head *head) | ||
| 890 | { | ||
| 891 | struct perf_event_context *ctx; | ||
| 892 | |||
| 893 | ctx = container_of(head, struct perf_event_context, rcu_head); | ||
| 894 | kfree(ctx->task_ctx_data); | ||
| 895 | kfree(ctx); | ||
| 896 | } | ||
| 897 | |||
| 908 | static void put_ctx(struct perf_event_context *ctx) | 898 | static void put_ctx(struct perf_event_context *ctx) | 
| 909 | { | 899 | { | 
| 910 | if (atomic_dec_and_test(&ctx->refcount)) { | 900 | if (atomic_dec_and_test(&ctx->refcount)) { | 
| @@ -912,7 +902,7 @@ static void put_ctx(struct perf_event_context *ctx) | |||
| 912 | put_ctx(ctx->parent_ctx); | 902 | put_ctx(ctx->parent_ctx); | 
| 913 | if (ctx->task) | 903 | if (ctx->task) | 
| 914 | put_task_struct(ctx->task); | 904 | put_task_struct(ctx->task); | 
| 915 | kfree_rcu(ctx, rcu_head); | 905 | call_rcu(&ctx->rcu_head, free_ctx); | 
| 916 | } | 906 | } | 
| 917 | } | 907 | } | 
| 918 | 908 | ||
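The switch from kfree_rcu() to call_rcu() is needed because kfree_rcu() can only free the object that embeds the rcu_head; the context now also owns a separately allocated task_ctx_data buffer, which the new free_ctx() callback above releases together with the context itself. The general pattern, with illustrative names:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
        void *extra;             /* separately allocated member */
        struct rcu_head rcu;
};

static void foo_free_rcu(struct rcu_head *head)
{
        struct foo *f = container_of(head, struct foo, rcu);

        kfree(f->extra);         /* this is what kfree_rcu() could not do */
        kfree(f);
}

static void foo_release(struct foo *f)
{
        call_rcu(&f->rcu, foo_free_rcu);
}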
| @@ -1239,9 +1229,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
| 1239 | if (is_cgroup_event(event)) | 1229 | if (is_cgroup_event(event)) | 
| 1240 | ctx->nr_cgroups++; | 1230 | ctx->nr_cgroups++; | 
| 1241 | 1231 | ||
| 1242 | if (has_branch_stack(event)) | ||
| 1243 | ctx->nr_branch_stack++; | ||
| 1244 | |||
| 1245 | list_add_rcu(&event->event_entry, &ctx->event_list); | 1232 | list_add_rcu(&event->event_entry, &ctx->event_list); | 
| 1246 | ctx->nr_events++; | 1233 | ctx->nr_events++; | 
| 1247 | if (event->attr.inherit_stat) | 1234 | if (event->attr.inherit_stat) | 
| @@ -1408,9 +1395,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) | |||
| 1408 | cpuctx->cgrp = NULL; | 1395 | cpuctx->cgrp = NULL; | 
| 1409 | } | 1396 | } | 
| 1410 | 1397 | ||
| 1411 | if (has_branch_stack(event)) | ||
| 1412 | ctx->nr_branch_stack--; | ||
| 1413 | |||
| 1414 | ctx->nr_events--; | 1398 | ctx->nr_events--; | 
| 1415 | if (event->attr.inherit_stat) | 1399 | if (event->attr.inherit_stat) | 
| 1416 | ctx->nr_stat--; | 1400 | ctx->nr_stat--; | 
| @@ -1847,6 +1831,7 @@ static void perf_set_shadow_time(struct perf_event *event, | |||
| 1847 | #define MAX_INTERRUPTS (~0ULL) | 1831 | #define MAX_INTERRUPTS (~0ULL) | 
| 1848 | 1832 | ||
| 1849 | static void perf_log_throttle(struct perf_event *event, int enable); | 1833 | static void perf_log_throttle(struct perf_event *event, int enable); | 
| 1834 | static void perf_log_itrace_start(struct perf_event *event); | ||
| 1850 | 1835 | ||
| 1851 | static int | 1836 | static int | 
| 1852 | event_sched_in(struct perf_event *event, | 1837 | event_sched_in(struct perf_event *event, | 
| @@ -1881,6 +1866,12 @@ event_sched_in(struct perf_event *event, | |||
| 1881 | 1866 | ||
| 1882 | perf_pmu_disable(event->pmu); | 1867 | perf_pmu_disable(event->pmu); | 
| 1883 | 1868 | ||
| 1869 | event->tstamp_running += tstamp - event->tstamp_stopped; | ||
| 1870 | |||
| 1871 | perf_set_shadow_time(event, ctx, tstamp); | ||
| 1872 | |||
| 1873 | perf_log_itrace_start(event); | ||
| 1874 | |||
| 1884 | if (event->pmu->add(event, PERF_EF_START)) { | 1875 | if (event->pmu->add(event, PERF_EF_START)) { | 
| 1885 | event->state = PERF_EVENT_STATE_INACTIVE; | 1876 | event->state = PERF_EVENT_STATE_INACTIVE; | 
| 1886 | event->oncpu = -1; | 1877 | event->oncpu = -1; | 
| @@ -1888,10 +1879,6 @@ event_sched_in(struct perf_event *event, | |||
| 1888 | goto out; | 1879 | goto out; | 
| 1889 | } | 1880 | } | 
| 1890 | 1881 | ||
| 1891 | event->tstamp_running += tstamp - event->tstamp_stopped; | ||
| 1892 | |||
| 1893 | perf_set_shadow_time(event, ctx, tstamp); | ||
| 1894 | |||
| 1895 | if (!is_software_event(event)) | 1882 | if (!is_software_event(event)) | 
| 1896 | cpuctx->active_oncpu++; | 1883 | cpuctx->active_oncpu++; | 
| 1897 | if (!ctx->nr_active++) | 1884 | if (!ctx->nr_active++) | 
| @@ -2559,6 +2546,9 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, | |||
| 2559 | next->perf_event_ctxp[ctxn] = ctx; | 2546 | next->perf_event_ctxp[ctxn] = ctx; | 
| 2560 | ctx->task = next; | 2547 | ctx->task = next; | 
| 2561 | next_ctx->task = task; | 2548 | next_ctx->task = task; | 
| 2549 | |||
| 2550 | swap(ctx->task_ctx_data, next_ctx->task_ctx_data); | ||
| 2551 | |||
| 2562 | do_switch = 0; | 2552 | do_switch = 0; | 
| 2563 | 2553 | ||
| 2564 | perf_event_sync_stat(ctx, next_ctx); | 2554 | perf_event_sync_stat(ctx, next_ctx); | 
| @@ -2577,6 +2567,56 @@ unlock: | |||
| 2577 | } | 2567 | } | 
| 2578 | } | 2568 | } | 
| 2579 | 2569 | ||
| 2570 | void perf_sched_cb_dec(struct pmu *pmu) | ||
| 2571 | { | ||
| 2572 | this_cpu_dec(perf_sched_cb_usages); | ||
| 2573 | } | ||
| 2574 | |||
| 2575 | void perf_sched_cb_inc(struct pmu *pmu) | ||
| 2576 | { | ||
| 2577 | this_cpu_inc(perf_sched_cb_usages); | ||
| 2578 | } | ||
| 2579 | |||
| 2580 | /* | ||
| 2581 | * This function provides the context switch callback to the lower code | ||
| 2582 | * layer. It is invoked ONLY when the context switch callback is enabled. | ||
| 2583 | */ | ||
| 2584 | static void perf_pmu_sched_task(struct task_struct *prev, | ||
| 2585 | struct task_struct *next, | ||
| 2586 | bool sched_in) | ||
| 2587 | { | ||
| 2588 | struct perf_cpu_context *cpuctx; | ||
| 2589 | struct pmu *pmu; | ||
| 2590 | unsigned long flags; | ||
| 2591 | |||
| 2592 | if (prev == next) | ||
| 2593 | return; | ||
| 2594 | |||
| 2595 | local_irq_save(flags); | ||
| 2596 | |||
| 2597 | rcu_read_lock(); | ||
| 2598 | |||
| 2599 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
| 2600 | if (pmu->sched_task) { | ||
| 2601 | cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
| 2602 | |||
| 2603 | perf_ctx_lock(cpuctx, cpuctx->task_ctx); | ||
| 2604 | |||
| 2605 | perf_pmu_disable(pmu); | ||
| 2606 | |||
| 2607 | pmu->sched_task(cpuctx->task_ctx, sched_in); | ||
| 2608 | |||
| 2609 | perf_pmu_enable(pmu); | ||
| 2610 | |||
| 2611 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); | ||
| 2612 | } | ||
| 2613 | } | ||
| 2614 | |||
| 2615 | rcu_read_unlock(); | ||
| 2616 | |||
| 2617 | local_irq_restore(flags); | ||
| 2618 | } | ||
| 2619 | |||
| 2580 | #define for_each_task_context_nr(ctxn) \ | 2620 | #define for_each_task_context_nr(ctxn) \ | 
| 2581 | for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) | 2621 | for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) | 
| 2582 | 2622 | ||
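perf_sched_cb_inc()/perf_sched_cb_dec() replace the branch-stack-specific plumbing with a generic per-cpu interest counter: perf_pmu_sched_task() only walks the PMU list when at least one caller on this CPU asked for the context-switch hook. A rough sketch of how a driver might use it; the my_pmu_* names and the choice of bumping the counter in add/del are assumptions, not taken from this patch.

static void my_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
{
        /* save, restore or flush per-task hardware state here;
         * ctx may be NULL if this CPU has no task context */
}

static int my_pmu_add(struct perf_event *event, int flags)
{
        /* ... program the counter ... */

        /* register interest in context switches while this event is on */
        perf_sched_cb_inc(event->ctx->pmu);
        return 0;
}

static void my_pmu_del(struct perf_event *event, int flags)
{
        perf_sched_cb_dec(event->ctx->pmu);
        /* ... stop the counter ... */
}

static struct pmu my_pmu = {
        /* other callbacks omitted */
        .add            = my_pmu_add,
        .del            = my_pmu_del,
        .sched_task     = my_pmu_sched_task,
};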
| @@ -2596,6 +2636,9 @@ void __perf_event_task_sched_out(struct task_struct *task, | |||
| 2596 | { | 2636 | { | 
| 2597 | int ctxn; | 2637 | int ctxn; | 
| 2598 | 2638 | ||
| 2639 | if (__this_cpu_read(perf_sched_cb_usages)) | ||
| 2640 | perf_pmu_sched_task(task, next, false); | ||
| 2641 | |||
| 2599 | for_each_task_context_nr(ctxn) | 2642 | for_each_task_context_nr(ctxn) | 
| 2600 | perf_event_context_sched_out(task, ctxn, next); | 2643 | perf_event_context_sched_out(task, ctxn, next); | 
| 2601 | 2644 | ||
| @@ -2755,64 +2798,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, | |||
| 2755 | } | 2798 | } | 
| 2756 | 2799 | ||
| 2757 | /* | 2800 | /* | 
| 2758 | * When sampling the branck stack in system-wide, it may be necessary | ||
| 2759 | * to flush the stack on context switch. This happens when the branch | ||
| 2760 | * stack does not tag its entries with the pid of the current task. | ||
| 2761 | * Otherwise it becomes impossible to associate a branch entry with a | ||
| 2762 | * task. This ambiguity is more likely to appear when the branch stack | ||
| 2763 | * supports priv level filtering and the user sets it to monitor only | ||
| 2764 | * at the user level (which could be a useful measurement in system-wide | ||
| 2765 | * mode). In that case, the risk is high of having a branch stack with | ||
| 2766 | * branch from multiple tasks. Flushing may mean dropping the existing | ||
| 2767 | * entries or stashing them somewhere in the PMU specific code layer. | ||
| 2768 | * | ||
| 2769 | * This function provides the context switch callback to the lower code | ||
| 2770 | * layer. It is invoked ONLY when there is at least one system-wide context | ||
| 2771 | * with at least one active event using taken branch sampling. | ||
| 2772 | */ | ||
| 2773 | static void perf_branch_stack_sched_in(struct task_struct *prev, | ||
| 2774 | struct task_struct *task) | ||
| 2775 | { | ||
| 2776 | struct perf_cpu_context *cpuctx; | ||
| 2777 | struct pmu *pmu; | ||
| 2778 | unsigned long flags; | ||
| 2779 | |||
| 2780 | /* no need to flush branch stack if not changing task */ | ||
| 2781 | if (prev == task) | ||
| 2782 | return; | ||
| 2783 | |||
| 2784 | local_irq_save(flags); | ||
| 2785 | |||
| 2786 | rcu_read_lock(); | ||
| 2787 | |||
| 2788 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
| 2789 | cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
| 2790 | |||
| 2791 | /* | ||
| 2792 | * check if the context has at least one | ||
| 2793 | * event using PERF_SAMPLE_BRANCH_STACK | ||
| 2794 | */ | ||
| 2795 | if (cpuctx->ctx.nr_branch_stack > 0 | ||
| 2796 | && pmu->flush_branch_stack) { | ||
| 2797 | |||
| 2798 | perf_ctx_lock(cpuctx, cpuctx->task_ctx); | ||
| 2799 | |||
| 2800 | perf_pmu_disable(pmu); | ||
| 2801 | |||
| 2802 | pmu->flush_branch_stack(); | ||
| 2803 | |||
| 2804 | perf_pmu_enable(pmu); | ||
| 2805 | |||
| 2806 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); | ||
| 2807 | } | ||
| 2808 | } | ||
| 2809 | |||
| 2810 | rcu_read_unlock(); | ||
| 2811 | |||
| 2812 | local_irq_restore(flags); | ||
| 2813 | } | ||
| 2814 | |||
| 2815 | /* | ||
| 2816 | * Called from scheduler to add the events of the current task | 2801 | * Called from scheduler to add the events of the current task | 
| 2817 | * with interrupts disabled. | 2802 | * with interrupts disabled. | 
| 2818 | * | 2803 | * | 
| @@ -2844,9 +2829,8 @@ void __perf_event_task_sched_in(struct task_struct *prev, | |||
| 2844 | if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) | 2829 | if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) | 
| 2845 | perf_cgroup_sched_in(prev, task); | 2830 | perf_cgroup_sched_in(prev, task); | 
| 2846 | 2831 | ||
| 2847 | /* check for system-wide branch_stack events */ | 2832 | if (__this_cpu_read(perf_sched_cb_usages)) | 
| 2848 | if (atomic_read(this_cpu_ptr(&perf_branch_stack_events))) | 2833 | perf_pmu_sched_task(prev, task, true); | 
| 2849 | perf_branch_stack_sched_in(prev, task); | ||
| 2850 | } | 2834 | } | 
| 2851 | 2835 | ||
| 2852 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) | 2836 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) | 
| @@ -3220,7 +3204,10 @@ static void __perf_event_read(void *info) | |||
| 3220 | 3204 | ||
| 3221 | static inline u64 perf_event_count(struct perf_event *event) | 3205 | static inline u64 perf_event_count(struct perf_event *event) | 
| 3222 | { | 3206 | { | 
| 3223 | return local64_read(&event->count) + atomic64_read(&event->child_count); | 3207 | if (event->pmu->count) | 
| 3208 | return event->pmu->count(event); | ||
| 3209 | |||
| 3210 | return __perf_event_count(event); | ||
| 3224 | } | 3211 | } | 
| 3225 | 3212 | ||
| 3226 | static u64 perf_event_read(struct perf_event *event) | 3213 | static u64 perf_event_read(struct perf_event *event) | 
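perf_event_count() now defers to an optional pmu::count() callback, so a PMU whose authoritative value lives outside event->count (for example in a hardware register it has not folded in yet) can report an up-to-date number. A hedged sketch; my_pmu_read_pending() is a hypothetical helper.

/* hypothetical: delta accumulated in hardware since the last update of
 * event->count */
static u64 my_pmu_read_pending(struct perf_event *event);

static u64 my_pmu_count(struct perf_event *event)
{
        return local64_read(&event->count) + my_pmu_read_pending(event);
}

/* wired up with .count = my_pmu_count in the struct pmu definition */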
| @@ -3321,12 +3308,15 @@ errout: | |||
| 3321 | * Returns a matching context with refcount and pincount. | 3308 | * Returns a matching context with refcount and pincount. | 
| 3322 | */ | 3309 | */ | 
| 3323 | static struct perf_event_context * | 3310 | static struct perf_event_context * | 
| 3324 | find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) | 3311 | find_get_context(struct pmu *pmu, struct task_struct *task, | 
| 3312 | struct perf_event *event) | ||
| 3325 | { | 3313 | { | 
| 3326 | struct perf_event_context *ctx, *clone_ctx = NULL; | 3314 | struct perf_event_context *ctx, *clone_ctx = NULL; | 
| 3327 | struct perf_cpu_context *cpuctx; | 3315 | struct perf_cpu_context *cpuctx; | 
| 3316 | void *task_ctx_data = NULL; | ||
| 3328 | unsigned long flags; | 3317 | unsigned long flags; | 
| 3329 | int ctxn, err; | 3318 | int ctxn, err; | 
| 3319 | int cpu = event->cpu; | ||
| 3330 | 3320 | ||
| 3331 | if (!task) { | 3321 | if (!task) { | 
| 3332 | /* Must be root to operate on a CPU event: */ | 3322 | /* Must be root to operate on a CPU event: */ | 
| @@ -3354,11 +3344,24 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) | |||
| 3354 | if (ctxn < 0) | 3344 | if (ctxn < 0) | 
| 3355 | goto errout; | 3345 | goto errout; | 
| 3356 | 3346 | ||
| 3347 | if (event->attach_state & PERF_ATTACH_TASK_DATA) { | ||
| 3348 | task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL); | ||
| 3349 | if (!task_ctx_data) { | ||
| 3350 | err = -ENOMEM; | ||
| 3351 | goto errout; | ||
| 3352 | } | ||
| 3353 | } | ||
| 3354 | |||
| 3357 | retry: | 3355 | retry: | 
| 3358 | ctx = perf_lock_task_context(task, ctxn, &flags); | 3356 | ctx = perf_lock_task_context(task, ctxn, &flags); | 
| 3359 | if (ctx) { | 3357 | if (ctx) { | 
| 3360 | clone_ctx = unclone_ctx(ctx); | 3358 | clone_ctx = unclone_ctx(ctx); | 
| 3361 | ++ctx->pin_count; | 3359 | ++ctx->pin_count; | 
| 3360 | |||
| 3361 | if (task_ctx_data && !ctx->task_ctx_data) { | ||
| 3362 | ctx->task_ctx_data = task_ctx_data; | ||
| 3363 | task_ctx_data = NULL; | ||
| 3364 | } | ||
| 3362 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 3365 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 
| 3363 | 3366 | ||
| 3364 | if (clone_ctx) | 3367 | if (clone_ctx) | 
| @@ -3369,6 +3372,11 @@ retry: | |||
| 3369 | if (!ctx) | 3372 | if (!ctx) | 
| 3370 | goto errout; | 3373 | goto errout; | 
| 3371 | 3374 | ||
| 3375 | if (task_ctx_data) { | ||
| 3376 | ctx->task_ctx_data = task_ctx_data; | ||
| 3377 | task_ctx_data = NULL; | ||
| 3378 | } | ||
| 3379 | |||
| 3372 | err = 0; | 3380 | err = 0; | 
| 3373 | mutex_lock(&task->perf_event_mutex); | 3381 | mutex_lock(&task->perf_event_mutex); | 
| 3374 | /* | 3382 | /* | 
| @@ -3395,13 +3403,16 @@ retry: | |||
| 3395 | } | 3403 | } | 
| 3396 | } | 3404 | } | 
| 3397 | 3405 | ||
| 3406 | kfree(task_ctx_data); | ||
| 3398 | return ctx; | 3407 | return ctx; | 
| 3399 | 3408 | ||
| 3400 | errout: | 3409 | errout: | 
| 3410 | kfree(task_ctx_data); | ||
| 3401 | return ERR_PTR(err); | 3411 | return ERR_PTR(err); | 
| 3402 | } | 3412 | } | 
| 3403 | 3413 | ||
| 3404 | static void perf_event_free_filter(struct perf_event *event); | 3414 | static void perf_event_free_filter(struct perf_event *event); | 
| 3415 | static void perf_event_free_bpf_prog(struct perf_event *event); | ||
| 3405 | 3416 | ||
| 3406 | static void free_event_rcu(struct rcu_head *head) | 3417 | static void free_event_rcu(struct rcu_head *head) | 
| 3407 | { | 3418 | { | 
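The task_ctx_data plumbing gives a PMU per-task storage managed by the core: the driver opts in with PERF_ATTACH_TASK_DATA and sets pmu::task_ctx_size, the buffer is allocated lazily here in find_get_context(), swapped between contexts in the optimized sched-out path above, and freed from free_ctx(). A hedged sketch of the driver side; struct my_task_state and the callbacks are illustrative.

struct my_task_state {
        u64 saved_ctl;          /* hypothetical per-task hardware state */
};

static int my_pmu_event_init(struct perf_event *event)
{
        /* ... validate event->attr ... */

        /* ask the core to allocate and manage per-task storage */
        event->attach_state |= PERF_ATTACH_TASK_DATA;
        return 0;
}

static void my_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
{
        struct my_task_state *st = ctx ? ctx->task_ctx_data : NULL;

        if (!st)
                return;
        /* save to or restore from st->saved_ctl depending on sched_in */
}

static struct pmu my_pmu = {
        .task_ctx_size  = sizeof(struct my_task_state),
        .event_init     = my_pmu_event_init,
        .sched_task     = my_pmu_sched_task,
};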
| @@ -3411,10 +3422,10 @@ static void free_event_rcu(struct rcu_head *head) | |||
| 3411 | if (event->ns) | 3422 | if (event->ns) | 
| 3412 | put_pid_ns(event->ns); | 3423 | put_pid_ns(event->ns); | 
| 3413 | perf_event_free_filter(event); | 3424 | perf_event_free_filter(event); | 
| 3425 | perf_event_free_bpf_prog(event); | ||
| 3414 | kfree(event); | 3426 | kfree(event); | 
| 3415 | } | 3427 | } | 
| 3416 | 3428 | ||
| 3417 | static void ring_buffer_put(struct ring_buffer *rb); | ||
| 3418 | static void ring_buffer_attach(struct perf_event *event, | 3429 | static void ring_buffer_attach(struct perf_event *event, | 
| 3419 | struct ring_buffer *rb); | 3430 | struct ring_buffer *rb); | 
| 3420 | 3431 | ||
| @@ -3423,10 +3434,6 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu) | |||
| 3423 | if (event->parent) | 3434 | if (event->parent) | 
| 3424 | return; | 3435 | return; | 
| 3425 | 3436 | ||
| 3426 | if (has_branch_stack(event)) { | ||
| 3427 | if (!(event->attach_state & PERF_ATTACH_TASK)) | ||
| 3428 | atomic_dec(&per_cpu(perf_branch_stack_events, cpu)); | ||
| 3429 | } | ||
| 3430 | if (is_cgroup_event(event)) | 3437 | if (is_cgroup_event(event)) | 
| 3431 | atomic_dec(&per_cpu(perf_cgroup_events, cpu)); | 3438 | atomic_dec(&per_cpu(perf_cgroup_events, cpu)); | 
| 3432 | } | 3439 | } | 
| @@ -3454,6 +3461,91 @@ static void unaccount_event(struct perf_event *event) | |||
| 3454 | unaccount_event_cpu(event, event->cpu); | 3461 | unaccount_event_cpu(event, event->cpu); | 
| 3455 | } | 3462 | } | 
| 3456 | 3463 | ||
| 3464 | /* | ||
| 3465 | * The following implement mutual exclusion of events on "exclusive" pmus | ||
| 3466 | * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled | ||
| 3467 | * at a time, so we disallow creating events that might conflict, namely: | ||
| 3468 | * | ||
| 3469 | * 1) cpu-wide events in the presence of per-task events, | ||
| 3470 | * 2) per-task events in the presence of cpu-wide events, | ||
| 3471 | * 3) two matching events on the same context. | ||
| 3472 | * | ||
| 3473 | * The former two cases are handled in the allocation path (perf_event_alloc(), | ||
| 3474 | * __free_event()), the latter -- before the first perf_install_in_context(). | ||
| 3475 | */ | ||
| 3476 | static int exclusive_event_init(struct perf_event *event) | ||
| 3477 | { | ||
| 3478 | struct pmu *pmu = event->pmu; | ||
| 3479 | |||
| 3480 | if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) | ||
| 3481 | return 0; | ||
| 3482 | |||
| 3483 | /* | ||
| 3484 | * Prevent co-existence of per-task and cpu-wide events on the | ||
| 3485 | * same exclusive pmu. | ||
| 3486 | * | ||
| 3487 | * Negative pmu::exclusive_cnt means there are cpu-wide | ||
| 3488 | * events on this "exclusive" pmu, positive means there are | ||
| 3489 | * per-task events. | ||
| 3490 | * | ||
| 3491 | * Since this is called in perf_event_alloc() path, event::ctx | ||
| 3492 | * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK | ||
| 3493 | * to mean "per-task event", because unlike other attach states it | ||
| 3494 | * never gets cleared. | ||
| 3495 | */ | ||
| 3496 | if (event->attach_state & PERF_ATTACH_TASK) { | ||
| 3497 | if (!atomic_inc_unless_negative(&pmu->exclusive_cnt)) | ||
| 3498 | return -EBUSY; | ||
| 3499 | } else { | ||
| 3500 | if (!atomic_dec_unless_positive(&pmu->exclusive_cnt)) | ||
| 3501 | return -EBUSY; | ||
| 3502 | } | ||
| 3503 | |||
| 3504 | return 0; | ||
| 3505 | } | ||
| 3506 | |||
| 3507 | static void exclusive_event_destroy(struct perf_event *event) | ||
| 3508 | { | ||
| 3509 | struct pmu *pmu = event->pmu; | ||
| 3510 | |||
| 3511 | if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) | ||
| 3512 | return; | ||
| 3513 | |||
| 3514 | /* see comment in exclusive_event_init() */ | ||
| 3515 | if (event->attach_state & PERF_ATTACH_TASK) | ||
| 3516 | atomic_dec(&pmu->exclusive_cnt); | ||
| 3517 | else | ||
| 3518 | atomic_inc(&pmu->exclusive_cnt); | ||
| 3519 | } | ||
| 3520 | |||
| 3521 | static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) | ||
| 3522 | { | ||
| 3523 | if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && | ||
| 3524 | (e1->cpu == e2->cpu || | ||
| 3525 | e1->cpu == -1 || | ||
| 3526 | e2->cpu == -1)) | ||
| 3527 | return true; | ||
| 3528 | return false; | ||
| 3529 | } | ||
| 3530 | |||
| 3531 | /* Called under the same ctx::mutex as perf_install_in_context() */ | ||
| 3532 | static bool exclusive_event_installable(struct perf_event *event, | ||
| 3533 | struct perf_event_context *ctx) | ||
| 3534 | { | ||
| 3535 | struct perf_event *iter_event; | ||
| 3536 | struct pmu *pmu = event->pmu; | ||
| 3537 | |||
| 3538 | if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) | ||
| 3539 | return true; | ||
| 3540 | |||
| 3541 | list_for_each_entry(iter_event, &ctx->event_list, event_entry) { | ||
| 3542 | if (exclusive_event_match(iter_event, event)) | ||
| 3543 | return false; | ||
| 3544 | } | ||
| 3545 | |||
| 3546 | return true; | ||
| 3547 | } | ||
| 3548 | |||
| 3457 | static void __free_event(struct perf_event *event) | 3549 | static void __free_event(struct perf_event *event) | 
| 3458 | { | 3550 | { | 
| 3459 | if (!event->parent) { | 3551 | if (!event->parent) { | 
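The exclusive-PMU accounting above packs two modes into one signed counter: per-task events drive pmu->exclusive_cnt positive, cpu-wide events drive it negative, and atomic_inc_unless_negative()/atomic_dec_unless_positive() guarantee the two directions never coexist. Reduced to just the counter discipline, the scheme looks like this (names are illustrative):

#include <linux/atomic.h>
#include <linux/types.h>

static atomic_t excl_cnt = ATOMIC_INIT(0);

static bool get_per_task_slot(void)     /* fails while cpu-wide users exist */
{
        return atomic_inc_unless_negative(&excl_cnt);
}

static bool get_cpu_wide_slot(void)     /* fails while per-task users exist */
{
        return atomic_dec_unless_positive(&excl_cnt);
}

static void put_per_task_slot(void)
{
        atomic_dec(&excl_cnt);
}

static void put_cpu_wide_slot(void)
{
        atomic_inc(&excl_cnt);
}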
| @@ -3467,8 +3559,10 @@ static void __free_event(struct perf_event *event) | |||
| 3467 | if (event->ctx) | 3559 | if (event->ctx) | 
| 3468 | put_ctx(event->ctx); | 3560 | put_ctx(event->ctx); | 
| 3469 | 3561 | ||
| 3470 | if (event->pmu) | 3562 | if (event->pmu) { | 
| 3563 | exclusive_event_destroy(event); | ||
| 3471 | module_put(event->pmu->module); | 3564 | module_put(event->pmu->module); | 
| 3565 | } | ||
| 3472 | 3566 | ||
| 3473 | call_rcu(&event->rcu_head, free_event_rcu); | 3567 | call_rcu(&event->rcu_head, free_event_rcu); | 
| 3474 | } | 3568 | } | 
| @@ -3591,7 +3685,7 @@ static void put_event(struct perf_event *event) | |||
| 3591 | ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING); | 3685 | ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING); | 
| 3592 | WARN_ON_ONCE(ctx->parent_ctx); | 3686 | WARN_ON_ONCE(ctx->parent_ctx); | 
| 3593 | perf_remove_from_context(event, true); | 3687 | perf_remove_from_context(event, true); | 
| 3594 | mutex_unlock(&ctx->mutex); | 3688 | perf_event_ctx_unlock(event, ctx); | 
| 3595 | 3689 | ||
| 3596 | _free_event(event); | 3690 | _free_event(event); | 
| 3597 | } | 3691 | } | 
| @@ -3927,6 +4021,7 @@ static inline int perf_fget_light(int fd, struct fd *p) | |||
| 3927 | static int perf_event_set_output(struct perf_event *event, | 4021 | static int perf_event_set_output(struct perf_event *event, | 
| 3928 | struct perf_event *output_event); | 4022 | struct perf_event *output_event); | 
| 3929 | static int perf_event_set_filter(struct perf_event *event, void __user *arg); | 4023 | static int perf_event_set_filter(struct perf_event *event, void __user *arg); | 
| 4024 | static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); | ||
| 3930 | 4025 | ||
| 3931 | static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) | 4026 | static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) | 
| 3932 | { | 4027 | { | 
| @@ -3980,6 +4075,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon | |||
| 3980 | case PERF_EVENT_IOC_SET_FILTER: | 4075 | case PERF_EVENT_IOC_SET_FILTER: | 
| 3981 | return perf_event_set_filter(event, (void __user *)arg); | 4076 | return perf_event_set_filter(event, (void __user *)arg); | 
| 3982 | 4077 | ||
| 4078 | case PERF_EVENT_IOC_SET_BPF: | ||
| 4079 | return perf_event_set_bpf_prog(event, arg); | ||
| 4080 | |||
| 3983 | default: | 4081 | default: | 
| 3984 | return -ENOTTY; | 4082 | return -ENOTTY; | 
| 3985 | } | 4083 | } | 
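From userspace, the new ioctl is a one-liner once a BPF program fd is in hand: open a kprobe-backed perf event, load a BPF_PROG_TYPE_KPROBE program via the bpf() syscall, then attach it. A minimal sketch of the attach step only; obtaining perf_fd and bpf_prog_fd is assumed. As the hunk further down shows, the kernel rejects events that are not kprobe tracepoints and programs that are not of the kprobe type.

#include <sys/ioctl.h>
#include <linux/perf_event.h>

/* perf_fd: kprobe-based tracepoint event; bpf_prog_fd: BPF_PROG_TYPE_KPROBE
 * program already loaded with bpf(BPF_PROG_LOAD, ...) */
static int attach_bpf_to_kprobe_event(int perf_fd, int bpf_prog_fd)
{
        return ioctl(perf_fd, PERF_EVENT_IOC_SET_BPF, bpf_prog_fd);
}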
| @@ -4096,6 +4194,8 @@ static void perf_event_init_userpage(struct perf_event *event) | |||
| 4096 | /* Allow new userspace to detect that bit 0 is deprecated */ | 4194 | /* Allow new userspace to detect that bit 0 is deprecated */ | 
| 4097 | userpg->cap_bit0_is_deprecated = 1; | 4195 | userpg->cap_bit0_is_deprecated = 1; | 
| 4098 | userpg->size = offsetof(struct perf_event_mmap_page, __reserved); | 4196 | userpg->size = offsetof(struct perf_event_mmap_page, __reserved); | 
| 4197 | userpg->data_offset = PAGE_SIZE; | ||
| 4198 | userpg->data_size = perf_data_size(rb); | ||
| 4099 | 4199 | ||
| 4100 | unlock: | 4200 | unlock: | 
| 4101 | rcu_read_unlock(); | 4201 | rcu_read_unlock(); | 
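With data_offset/data_size published in the control page, a reader no longer has to hard-code "the data area starts one page in". A small userspace sketch (the mapping itself is assumed to exist already):

#include <linux/perf_event.h>
#include <stdint.h>

/* base: start of the perf mmap, i.e. the struct perf_event_mmap_page */
static void *perf_data_area(void *base, uint64_t *size)
{
        struct perf_event_mmap_page *up = base;

        *size = up->data_size;                  /* size of the data area */
        return (char *)base + up->data_offset;  /* currently PAGE_SIZE */
}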
| @@ -4263,7 +4363,7 @@ static void rb_free_rcu(struct rcu_head *rcu_head) | |||
| 4263 | rb_free(rb); | 4363 | rb_free(rb); | 
| 4264 | } | 4364 | } | 
| 4265 | 4365 | ||
| 4266 | static struct ring_buffer *ring_buffer_get(struct perf_event *event) | 4366 | struct ring_buffer *ring_buffer_get(struct perf_event *event) | 
| 4267 | { | 4367 | { | 
| 4268 | struct ring_buffer *rb; | 4368 | struct ring_buffer *rb; | 
| 4269 | 4369 | ||
| @@ -4278,7 +4378,7 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event) | |||
| 4278 | return rb; | 4378 | return rb; | 
| 4279 | } | 4379 | } | 
| 4280 | 4380 | ||
| 4281 | static void ring_buffer_put(struct ring_buffer *rb) | 4381 | void ring_buffer_put(struct ring_buffer *rb) | 
| 4282 | { | 4382 | { | 
| 4283 | if (!atomic_dec_and_test(&rb->refcount)) | 4383 | if (!atomic_dec_and_test(&rb->refcount)) | 
| 4284 | return; | 4384 | return; | 
| @@ -4295,6 +4395,9 @@ static void perf_mmap_open(struct vm_area_struct *vma) | |||
| 4295 | atomic_inc(&event->mmap_count); | 4395 | atomic_inc(&event->mmap_count); | 
| 4296 | atomic_inc(&event->rb->mmap_count); | 4396 | atomic_inc(&event->rb->mmap_count); | 
| 4297 | 4397 | ||
| 4398 | if (vma->vm_pgoff) | ||
| 4399 | atomic_inc(&event->rb->aux_mmap_count); | ||
| 4400 | |||
| 4298 | if (event->pmu->event_mapped) | 4401 | if (event->pmu->event_mapped) | 
| 4299 | event->pmu->event_mapped(event); | 4402 | event->pmu->event_mapped(event); | 
| 4300 | } | 4403 | } | 
| @@ -4319,6 +4422,20 @@ static void perf_mmap_close(struct vm_area_struct *vma) | |||
| 4319 | if (event->pmu->event_unmapped) | 4422 | if (event->pmu->event_unmapped) | 
| 4320 | event->pmu->event_unmapped(event); | 4423 | event->pmu->event_unmapped(event); | 
| 4321 | 4424 | ||
| 4425 | /* | ||
| 4426 | * rb->aux_mmap_count will always drop before rb->mmap_count and | ||
| 4427 | * event->mmap_count, so it is ok to use event->mmap_mutex to | ||
| 4428 | * serialize with perf_mmap here. | ||
| 4429 | */ | ||
| 4430 | if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && | ||
| 4431 | atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) { | ||
| 4432 | atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); | ||
| 4433 | vma->vm_mm->pinned_vm -= rb->aux_mmap_locked; | ||
| 4434 | |||
| 4435 | rb_free_aux(rb); | ||
| 4436 | mutex_unlock(&event->mmap_mutex); | ||
| 4437 | } | ||
| 4438 | |||
| 4322 | atomic_dec(&rb->mmap_count); | 4439 | atomic_dec(&rb->mmap_count); | 
| 4323 | 4440 | ||
| 4324 | if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) | 4441 | if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) | 
| @@ -4392,7 +4509,7 @@ out_put: | |||
| 4392 | 4509 | ||
| 4393 | static const struct vm_operations_struct perf_mmap_vmops = { | 4510 | static const struct vm_operations_struct perf_mmap_vmops = { | 
| 4394 | .open = perf_mmap_open, | 4511 | .open = perf_mmap_open, | 
| 4395 | .close = perf_mmap_close, | 4512 | .close = perf_mmap_close, /* non mergable */ | 
| 4396 | .fault = perf_mmap_fault, | 4513 | .fault = perf_mmap_fault, | 
| 4397 | .page_mkwrite = perf_mmap_fault, | 4514 | .page_mkwrite = perf_mmap_fault, | 
| 4398 | }; | 4515 | }; | 
| @@ -4403,10 +4520,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
| 4403 | unsigned long user_locked, user_lock_limit; | 4520 | unsigned long user_locked, user_lock_limit; | 
| 4404 | struct user_struct *user = current_user(); | 4521 | struct user_struct *user = current_user(); | 
| 4405 | unsigned long locked, lock_limit; | 4522 | unsigned long locked, lock_limit; | 
| 4406 | struct ring_buffer *rb; | 4523 | struct ring_buffer *rb = NULL; | 
| 4407 | unsigned long vma_size; | 4524 | unsigned long vma_size; | 
| 4408 | unsigned long nr_pages; | 4525 | unsigned long nr_pages; | 
| 4409 | long user_extra, extra; | 4526 | long user_extra = 0, extra = 0; | 
| 4410 | int ret = 0, flags = 0; | 4527 | int ret = 0, flags = 0; | 
| 4411 | 4528 | ||
| 4412 | /* | 4529 | /* | 
| @@ -4421,7 +4538,66 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
| 4421 | return -EINVAL; | 4538 | return -EINVAL; | 
| 4422 | 4539 | ||
| 4423 | vma_size = vma->vm_end - vma->vm_start; | 4540 | vma_size = vma->vm_end - vma->vm_start; | 
| 4424 | nr_pages = (vma_size / PAGE_SIZE) - 1; | 4541 | |
| 4542 | if (vma->vm_pgoff == 0) { | ||
| 4543 | nr_pages = (vma_size / PAGE_SIZE) - 1; | ||
| 4544 | } else { | ||
| 4545 | /* | ||
| 4546 | * AUX area mapping: if rb->aux_nr_pages != 0, it's already | ||
| 4547 | * mapped, all subsequent mappings should have the same size | ||
| 4548 | * and offset. Must be above the normal perf buffer. | ||
| 4549 | */ | ||
| 4550 | u64 aux_offset, aux_size; | ||
| 4551 | |||
| 4552 | if (!event->rb) | ||
| 4553 | return -EINVAL; | ||
| 4554 | |||
| 4555 | nr_pages = vma_size / PAGE_SIZE; | ||
| 4556 | |||
| 4557 | mutex_lock(&event->mmap_mutex); | ||
| 4558 | ret = -EINVAL; | ||
| 4559 | |||
| 4560 | rb = event->rb; | ||
| 4561 | if (!rb) | ||
| 4562 | goto aux_unlock; | ||
| 4563 | |||
| 4564 | aux_offset = ACCESS_ONCE(rb->user_page->aux_offset); | ||
| 4565 | aux_size = ACCESS_ONCE(rb->user_page->aux_size); | ||
| 4566 | |||
| 4567 | if (aux_offset < perf_data_size(rb) + PAGE_SIZE) | ||
| 4568 | goto aux_unlock; | ||
| 4569 | |||
| 4570 | if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) | ||
| 4571 | goto aux_unlock; | ||
| 4572 | |||
| 4573 | /* already mapped with a different offset */ | ||
| 4574 | if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) | ||
| 4575 | goto aux_unlock; | ||
| 4576 | |||
| 4577 | if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE) | ||
| 4578 | goto aux_unlock; | ||
| 4579 | |||
| 4580 | /* already mapped with a different size */ | ||
| 4581 | if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) | ||
| 4582 | goto aux_unlock; | ||
| 4583 | |||
| 4584 | if (!is_power_of_2(nr_pages)) | ||
| 4585 | goto aux_unlock; | ||
| 4586 | |||
| 4587 | if (!atomic_inc_not_zero(&rb->mmap_count)) | ||
| 4588 | goto aux_unlock; | ||
| 4589 | |||
| 4590 | if (rb_has_aux(rb)) { | ||
| 4591 | atomic_inc(&rb->aux_mmap_count); | ||
| 4592 | ret = 0; | ||
| 4593 | goto unlock; | ||
| 4594 | } | ||
| 4595 | |||
| 4596 | atomic_set(&rb->aux_mmap_count, 1); | ||
| 4597 | user_extra = nr_pages; | ||
| 4598 | |||
| 4599 | goto accounting; | ||
| 4600 | } | ||
| 4425 | 4601 | ||
| 4426 | /* | 4602 | /* | 
| 4427 | * If we have rb pages ensure they're a power-of-two number, so we | 4603 | * If we have rb pages ensure they're a power-of-two number, so we | 
| @@ -4433,9 +4609,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
| 4433 | if (vma_size != PAGE_SIZE * (1 + nr_pages)) | 4609 | if (vma_size != PAGE_SIZE * (1 + nr_pages)) | 
| 4434 | return -EINVAL; | 4610 | return -EINVAL; | 
| 4435 | 4611 | ||
| 4436 | if (vma->vm_pgoff != 0) | ||
| 4437 | return -EINVAL; | ||
| 4438 | |||
| 4439 | WARN_ON_ONCE(event->ctx->parent_ctx); | 4612 | WARN_ON_ONCE(event->ctx->parent_ctx); | 
| 4440 | again: | 4613 | again: | 
| 4441 | mutex_lock(&event->mmap_mutex); | 4614 | mutex_lock(&event->mmap_mutex); | 
| @@ -4459,6 +4632,8 @@ again: | |||
| 4459 | } | 4632 | } | 
| 4460 | 4633 | ||
| 4461 | user_extra = nr_pages + 1; | 4634 | user_extra = nr_pages + 1; | 
| 4635 | |||
| 4636 | accounting: | ||
| 4462 | user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); | 4637 | user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); | 
| 4463 | 4638 | ||
| 4464 | /* | 4639 | /* | 
| @@ -4468,7 +4643,6 @@ again: | |||
| 4468 | 4643 | ||
| 4469 | user_locked = atomic_long_read(&user->locked_vm) + user_extra; | 4644 | user_locked = atomic_long_read(&user->locked_vm) + user_extra; | 
| 4470 | 4645 | ||
| 4471 | extra = 0; | ||
| 4472 | if (user_locked > user_lock_limit) | 4646 | if (user_locked > user_lock_limit) | 
| 4473 | extra = user_locked - user_lock_limit; | 4647 | extra = user_locked - user_lock_limit; | 
| 4474 | 4648 | ||
| @@ -4482,35 +4656,46 @@ again: | |||
| 4482 | goto unlock; | 4656 | goto unlock; | 
| 4483 | } | 4657 | } | 
| 4484 | 4658 | ||
| 4485 | WARN_ON(event->rb); | 4659 | WARN_ON(!rb && event->rb); | 
| 4486 | 4660 | ||
| 4487 | if (vma->vm_flags & VM_WRITE) | 4661 | if (vma->vm_flags & VM_WRITE) | 
| 4488 | flags |= RING_BUFFER_WRITABLE; | 4662 | flags |= RING_BUFFER_WRITABLE; | 
| 4489 | 4663 | ||
| 4490 | rb = rb_alloc(nr_pages, | ||
| 4491 | event->attr.watermark ? event->attr.wakeup_watermark : 0, | ||
| 4492 | event->cpu, flags); | ||
| 4493 | |||
| 4494 | if (!rb) { | 4664 | if (!rb) { | 
| 4495 | ret = -ENOMEM; | 4665 | rb = rb_alloc(nr_pages, | 
| 4496 | goto unlock; | 4666 | event->attr.watermark ? event->attr.wakeup_watermark : 0, | 
| 4497 | } | 4667 | event->cpu, flags); | 
| 4498 | 4668 | ||
| 4499 | atomic_set(&rb->mmap_count, 1); | 4669 | if (!rb) { | 
| 4500 | rb->mmap_locked = extra; | 4670 | ret = -ENOMEM; | 
| 4501 | rb->mmap_user = get_current_user(); | 4671 | goto unlock; | 
| 4672 | } | ||
| 4502 | 4673 | ||
| 4503 | atomic_long_add(user_extra, &user->locked_vm); | 4674 | atomic_set(&rb->mmap_count, 1); | 
| 4504 | vma->vm_mm->pinned_vm += extra; | 4675 | rb->mmap_user = get_current_user(); | 
| 4676 | rb->mmap_locked = extra; | ||
| 4505 | 4677 | ||
| 4506 | ring_buffer_attach(event, rb); | 4678 | ring_buffer_attach(event, rb); | 
| 4507 | 4679 | ||
| 4508 | perf_event_init_userpage(event); | 4680 | perf_event_init_userpage(event); | 
| 4509 | perf_event_update_userpage(event); | 4681 | perf_event_update_userpage(event); | 
| 4682 | } else { | ||
| 4683 | ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, | ||
| 4684 | event->attr.aux_watermark, flags); | ||
| 4685 | if (!ret) | ||
| 4686 | rb->aux_mmap_locked = extra; | ||
| 4687 | } | ||
| 4510 | 4688 | ||
| 4511 | unlock: | 4689 | unlock: | 
| 4512 | if (!ret) | 4690 | if (!ret) { | 
| 4691 | atomic_long_add(user_extra, &user->locked_vm); | ||
| 4692 | vma->vm_mm->pinned_vm += extra; | ||
| 4693 | |||
| 4513 | atomic_inc(&event->mmap_count); | 4694 | atomic_inc(&event->mmap_count); | 
| 4695 | } else if (rb) { | ||
| 4696 | atomic_dec(&rb->mmap_count); | ||
| 4697 | } | ||
| 4698 | aux_unlock: | ||
| 4514 | mutex_unlock(&event->mmap_mutex); | 4699 | mutex_unlock(&event->mmap_mutex); | 
| 4515 | 4700 | ||
| 4516 | /* | 4701 | /* | 
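Taken together, the perf_mmap() changes let userspace establish the AUX area with a second mmap() on the same event fd: map the regular buffer first, publish the desired aux_offset/aux_size through the control page, then map exactly that range (a power-of-two number of pages, placed above the data area, as the checks above require). A hedged sketch:

#include <linux/perf_event.h>
#include <sys/mman.h>
#include <unistd.h>
#include <stddef.h>

/* fd: the perf event; up: its already mmap'ed control page, with the data
 * area in place; aux_pages must be a power of two. */
static void *map_aux_area(int fd, struct perf_event_mmap_page *up,
                          size_t aux_pages)
{
        size_t len = aux_pages * sysconf(_SC_PAGESIZE);

        /* must lie above the control page + data area */
        up->aux_offset = up->data_offset + up->data_size;
        up->aux_size   = len;

        return mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
                    fd, up->aux_offset);
}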
| @@ -4574,6 +4759,13 @@ static void perf_pending_event(struct irq_work *entry) | |||
| 4574 | { | 4759 | { | 
| 4575 | struct perf_event *event = container_of(entry, | 4760 | struct perf_event *event = container_of(entry, | 
| 4576 | struct perf_event, pending); | 4761 | struct perf_event, pending); | 
| 4762 | int rctx; | ||
| 4763 | |||
| 4764 | rctx = perf_swevent_get_recursion_context(); | ||
| 4765 | /* | ||
| 4766 | * If we 'fail' here, that's OK, it means recursion is already disabled | ||
| 4767 | * and we won't recurse 'further'. | ||
| 4768 | */ | ||
| 4577 | 4769 | ||
| 4578 | if (event->pending_disable) { | 4770 | if (event->pending_disable) { | 
| 4579 | event->pending_disable = 0; | 4771 | event->pending_disable = 0; | 
| @@ -4584,6 +4776,9 @@ static void perf_pending_event(struct irq_work *entry) | |||
| 4584 | event->pending_wakeup = 0; | 4776 | event->pending_wakeup = 0; | 
| 4585 | perf_event_wakeup(event); | 4777 | perf_event_wakeup(event); | 
| 4586 | } | 4778 | } | 
| 4779 | |||
| 4780 | if (rctx >= 0) | ||
| 4781 | perf_swevent_put_recursion_context(rctx); | ||
| 4587 | } | 4782 | } | 
| 4588 | 4783 | ||
| 4589 | /* | 4784 | /* | 
| @@ -4756,7 +4951,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header, | |||
| 4756 | } | 4951 | } | 
| 4757 | 4952 | ||
| 4758 | if (sample_type & PERF_SAMPLE_TIME) | 4953 | if (sample_type & PERF_SAMPLE_TIME) | 
| 4759 | data->time = perf_clock(); | 4954 | data->time = perf_event_clock(event); | 
| 4760 | 4955 | ||
| 4761 | if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) | 4956 | if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) | 
| 4762 | data->id = primary_event_id(event); | 4957 | data->id = primary_event_id(event); | 
| @@ -5334,6 +5529,8 @@ static void perf_event_task_output(struct perf_event *event, | |||
| 5334 | task_event->event_id.tid = perf_event_tid(event, task); | 5529 | task_event->event_id.tid = perf_event_tid(event, task); | 
| 5335 | task_event->event_id.ptid = perf_event_tid(event, current); | 5530 | task_event->event_id.ptid = perf_event_tid(event, current); | 
| 5336 | 5531 | ||
| 5532 | task_event->event_id.time = perf_event_clock(event); | ||
| 5533 | |||
| 5337 | perf_output_put(&handle, task_event->event_id); | 5534 | perf_output_put(&handle, task_event->event_id); | 
| 5338 | 5535 | ||
| 5339 | perf_event__output_id_sample(event, &handle, &sample); | 5536 | perf_event__output_id_sample(event, &handle, &sample); | 
| @@ -5367,7 +5564,7 @@ static void perf_event_task(struct task_struct *task, | |||
| 5367 | /* .ppid */ | 5564 | /* .ppid */ | 
| 5368 | /* .tid */ | 5565 | /* .tid */ | 
| 5369 | /* .ptid */ | 5566 | /* .ptid */ | 
| 5370 | .time = perf_clock(), | 5567 | /* .time */ | 
| 5371 | }, | 5568 | }, | 
| 5372 | }; | 5569 | }; | 
| 5373 | 5570 | ||
| @@ -5722,6 +5919,40 @@ void perf_event_mmap(struct vm_area_struct *vma) | |||
| 5722 | perf_event_mmap_event(&mmap_event); | 5919 | perf_event_mmap_event(&mmap_event); | 
| 5723 | } | 5920 | } | 
| 5724 | 5921 | ||
| 5922 | void perf_event_aux_event(struct perf_event *event, unsigned long head, | ||
| 5923 | unsigned long size, u64 flags) | ||
| 5924 | { | ||
| 5925 | struct perf_output_handle handle; | ||
| 5926 | struct perf_sample_data sample; | ||
| 5927 | struct perf_aux_event { | ||
| 5928 | struct perf_event_header header; | ||
| 5929 | u64 offset; | ||
| 5930 | u64 size; | ||
| 5931 | u64 flags; | ||
| 5932 | } rec = { | ||
| 5933 | .header = { | ||
| 5934 | .type = PERF_RECORD_AUX, | ||
| 5935 | .misc = 0, | ||
| 5936 | .size = sizeof(rec), | ||
| 5937 | }, | ||
| 5938 | .offset = head, | ||
| 5939 | .size = size, | ||
| 5940 | .flags = flags, | ||
| 5941 | }; | ||
| 5942 | int ret; | ||
| 5943 | |||
| 5944 | perf_event_header__init_id(&rec.header, &sample, event); | ||
| 5945 | ret = perf_output_begin(&handle, event, rec.header.size); | ||
| 5946 | |||
| 5947 | if (ret) | ||
| 5948 | return; | ||
| 5949 | |||
| 5950 | perf_output_put(&handle, rec); | ||
| 5951 | perf_event__output_id_sample(event, &handle, &sample); | ||
| 5952 | |||
| 5953 | perf_output_end(&handle); | ||
| 5954 | } | ||
| 5955 | |||
| 5725 | /* | 5956 | /* | 
| 5726 | * IRQ throttle logging | 5957 | * IRQ throttle logging | 
| 5727 | */ | 5958 | */ | 
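perf_event_aux_event() emits the new PERF_RECORD_AUX record so consumers know which part of the AUX area was just filled. The layout matches the on-stack struct above: offset into the AUX buffer, size, and flags, followed by the usual sample_id fields when enabled. A hedged sketch of the reader side (sample_id parsing omitted):

#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>

struct aux_record {                      /* mirrors the record emitted above */
        struct perf_event_header header; /* header.type == PERF_RECORD_AUX */
        uint64_t offset;                 /* where in the AUX area the data is */
        uint64_t size;                   /* how much of it is new */
        uint64_t flags;
};

static void handle_aux_record(const struct perf_event_header *hdr)
{
        const struct aux_record *rec = (const void *)hdr;

        if (hdr->type != PERF_RECORD_AUX)
                return;

        printf("aux: %llu bytes at offset %llu, flags %#llx\n",
               (unsigned long long)rec->size,
               (unsigned long long)rec->offset,
               (unsigned long long)rec->flags);
}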
| @@ -5743,7 +5974,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) | |||
| 5743 | .misc = 0, | 5974 | .misc = 0, | 
| 5744 | .size = sizeof(throttle_event), | 5975 | .size = sizeof(throttle_event), | 
| 5745 | }, | 5976 | }, | 
| 5746 | .time = perf_clock(), | 5977 | .time = perf_event_clock(event), | 
| 5747 | .id = primary_event_id(event), | 5978 | .id = primary_event_id(event), | 
| 5748 | .stream_id = event->id, | 5979 | .stream_id = event->id, | 
| 5749 | }; | 5980 | }; | 
| @@ -5763,6 +5994,44 @@ static void perf_log_throttle(struct perf_event *event, int enable) | |||
| 5763 | perf_output_end(&handle); | 5994 | perf_output_end(&handle); | 
| 5764 | } | 5995 | } | 
| 5765 | 5996 | ||
| 5997 | static void perf_log_itrace_start(struct perf_event *event) | ||
| 5998 | { | ||
| 5999 | struct perf_output_handle handle; | ||
| 6000 | struct perf_sample_data sample; | ||
| 6001 | struct perf_aux_event { | ||
| 6002 | struct perf_event_header header; | ||
| 6003 | u32 pid; | ||
| 6004 | u32 tid; | ||
| 6005 | } rec; | ||
| 6006 | int ret; | ||
| 6007 | |||
| 6008 | if (event->parent) | ||
| 6009 | event = event->parent; | ||
| 6010 | |||
| 6011 | if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) || | ||
| 6012 | event->hw.itrace_started) | ||
| 6013 | return; | ||
| 6014 | |||
| 6015 | event->hw.itrace_started = 1; | ||
| 6016 | |||
| 6017 | rec.header.type = PERF_RECORD_ITRACE_START; | ||
| 6018 | rec.header.misc = 0; | ||
| 6019 | rec.header.size = sizeof(rec); | ||
| 6020 | rec.pid = perf_event_pid(event, current); | ||
| 6021 | rec.tid = perf_event_tid(event, current); | ||
| 6022 | |||
| 6023 | perf_event_header__init_id(&rec.header, &sample, event); | ||
| 6024 | ret = perf_output_begin(&handle, event, rec.header.size); | ||
| 6025 | |||
| 6026 | if (ret) | ||
| 6027 | return; | ||
| 6028 | |||
| 6029 | perf_output_put(&handle, rec); | ||
| 6030 | perf_event__output_id_sample(event, &handle, &sample); | ||
| 6031 | |||
| 6032 | perf_output_end(&handle); | ||
| 6033 | } | ||
| 6034 | |||
| 5766 | /* | 6035 | /* | 
| 5767 | * Generic event overflow handling, sampling. | 6036 | * Generic event overflow handling, sampling. | 
| 5768 | */ | 6037 | */ | 
| @@ -6123,6 +6392,7 @@ static int perf_swevent_add(struct perf_event *event, int flags) | |||
| 6123 | } | 6392 | } | 
| 6124 | 6393 | ||
| 6125 | hlist_add_head_rcu(&event->hlist_entry, head); | 6394 | hlist_add_head_rcu(&event->hlist_entry, head); | 
| 6395 | perf_event_update_userpage(event); | ||
| 6126 | 6396 | ||
| 6127 | return 0; | 6397 | return 0; | 
| 6128 | } | 6398 | } | 
| @@ -6286,6 +6556,8 @@ static int perf_swevent_init(struct perf_event *event) | |||
| 6286 | static struct pmu perf_swevent = { | 6556 | static struct pmu perf_swevent = { | 
| 6287 | .task_ctx_nr = perf_sw_context, | 6557 | .task_ctx_nr = perf_sw_context, | 
| 6288 | 6558 | ||
| 6559 | .capabilities = PERF_PMU_CAP_NO_NMI, | ||
| 6560 | |||
| 6289 | .event_init = perf_swevent_init, | 6561 | .event_init = perf_swevent_init, | 
| 6290 | .add = perf_swevent_add, | 6562 | .add = perf_swevent_add, | 
| 6291 | .del = perf_swevent_del, | 6563 | .del = perf_swevent_del, | 
| @@ -6439,6 +6711,49 @@ static void perf_event_free_filter(struct perf_event *event) | |||
| 6439 | ftrace_profile_free_filter(event); | 6711 | ftrace_profile_free_filter(event); | 
| 6440 | } | 6712 | } | 
| 6441 | 6713 | ||
| 6714 | static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) | ||
| 6715 | { | ||
| 6716 | struct bpf_prog *prog; | ||
| 6717 | |||
| 6718 | if (event->attr.type != PERF_TYPE_TRACEPOINT) | ||
| 6719 | return -EINVAL; | ||
| 6720 | |||
| 6721 | if (event->tp_event->prog) | ||
| 6722 | return -EEXIST; | ||
| 6723 | |||
| 6724 | if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) | ||
| 6725 | /* bpf programs can only be attached to kprobes */ | ||
| 6726 | return -EINVAL; | ||
| 6727 | |||
| 6728 | prog = bpf_prog_get(prog_fd); | ||
| 6729 | if (IS_ERR(prog)) | ||
| 6730 | return PTR_ERR(prog); | ||
| 6731 | |||
| 6732 | if (prog->type != BPF_PROG_TYPE_KPROBE) { | ||
| 6733 | /* valid fd, but invalid bpf program type */ | ||
| 6734 | bpf_prog_put(prog); | ||
| 6735 | return -EINVAL; | ||
| 6736 | } | ||
| 6737 | |||
| 6738 | event->tp_event->prog = prog; | ||
| 6739 | |||
| 6740 | return 0; | ||
| 6741 | } | ||
| 6742 | |||
| 6743 | static void perf_event_free_bpf_prog(struct perf_event *event) | ||
| 6744 | { | ||
| 6745 | struct bpf_prog *prog; | ||
| 6746 | |||
| 6747 | if (!event->tp_event) | ||
| 6748 | return; | ||
| 6749 | |||
| 6750 | prog = event->tp_event->prog; | ||
| 6751 | if (prog) { | ||
| 6752 | event->tp_event->prog = NULL; | ||
| 6753 | bpf_prog_put(prog); | ||
| 6754 | } | ||
| 6755 | } | ||
| 6756 | |||
| 6442 | #else | 6757 | #else | 
| 6443 | 6758 | ||
| 6444 | static inline void perf_tp_register(void) | 6759 | static inline void perf_tp_register(void) | 
| @@ -6454,6 +6769,14 @@ static void perf_event_free_filter(struct perf_event *event) | |||
| 6454 | { | 6769 | { | 
| 6455 | } | 6770 | } | 
| 6456 | 6771 | ||
| 6772 | static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) | ||
| 6773 | { | ||
| 6774 | return -ENOENT; | ||
| 6775 | } | ||
| 6776 | |||
| 6777 | static void perf_event_free_bpf_prog(struct perf_event *event) | ||
| 6778 | { | ||
| 6779 | } | ||
| 6457 | #endif /* CONFIG_EVENT_TRACING */ | 6780 | #endif /* CONFIG_EVENT_TRACING */ | 
| 6458 | 6781 | ||
| 6459 | #ifdef CONFIG_HAVE_HW_BREAKPOINT | 6782 | #ifdef CONFIG_HAVE_HW_BREAKPOINT | 
| @@ -6592,6 +6915,7 @@ static int cpu_clock_event_add(struct perf_event *event, int flags) | |||
| 6592 | { | 6915 | { | 
| 6593 | if (flags & PERF_EF_START) | 6916 | if (flags & PERF_EF_START) | 
| 6594 | cpu_clock_event_start(event, flags); | 6917 | cpu_clock_event_start(event, flags); | 
| 6918 | perf_event_update_userpage(event); | ||
| 6595 | 6919 | ||
| 6596 | return 0; | 6920 | return 0; | 
| 6597 | } | 6921 | } | 
| @@ -6628,6 +6952,8 @@ static int cpu_clock_event_init(struct perf_event *event) | |||
| 6628 | static struct pmu perf_cpu_clock = { | 6952 | static struct pmu perf_cpu_clock = { | 
| 6629 | .task_ctx_nr = perf_sw_context, | 6953 | .task_ctx_nr = perf_sw_context, | 
| 6630 | 6954 | ||
| 6955 | .capabilities = PERF_PMU_CAP_NO_NMI, | ||
| 6956 | |||
| 6631 | .event_init = cpu_clock_event_init, | 6957 | .event_init = cpu_clock_event_init, | 
| 6632 | .add = cpu_clock_event_add, | 6958 | .add = cpu_clock_event_add, | 
| 6633 | .del = cpu_clock_event_del, | 6959 | .del = cpu_clock_event_del, | 
| @@ -6666,6 +6992,7 @@ static int task_clock_event_add(struct perf_event *event, int flags) | |||
| 6666 | { | 6992 | { | 
| 6667 | if (flags & PERF_EF_START) | 6993 | if (flags & PERF_EF_START) | 
| 6668 | task_clock_event_start(event, flags); | 6994 | task_clock_event_start(event, flags); | 
| 6995 | perf_event_update_userpage(event); | ||
| 6669 | 6996 | ||
| 6670 | return 0; | 6997 | return 0; | 
| 6671 | } | 6998 | } | 
| @@ -6706,6 +7033,8 @@ static int task_clock_event_init(struct perf_event *event) | |||
| 6706 | static struct pmu perf_task_clock = { | 7033 | static struct pmu perf_task_clock = { | 
| 6707 | .task_ctx_nr = perf_sw_context, | 7034 | .task_ctx_nr = perf_sw_context, | 
| 6708 | 7035 | ||
| 7036 | .capabilities = PERF_PMU_CAP_NO_NMI, | ||
| 7037 | |||
| 6709 | .event_init = task_clock_event_init, | 7038 | .event_init = task_clock_event_init, | 
| 6710 | .add = task_clock_event_add, | 7039 | .add = task_clock_event_add, | 
| 6711 | .del = task_clock_event_del, | 7040 | .del = task_clock_event_del, | 
| @@ -6983,6 +7312,7 @@ got_cpu_context: | |||
| 6983 | pmu->event_idx = perf_event_idx_default; | 7312 | pmu->event_idx = perf_event_idx_default; | 
| 6984 | 7313 | ||
| 6985 | list_add_rcu(&pmu->entry, &pmus); | 7314 | list_add_rcu(&pmu->entry, &pmus); | 
| 7315 | atomic_set(&pmu->exclusive_cnt, 0); | ||
| 6986 | ret = 0; | 7316 | ret = 0; | 
| 6987 | unlock: | 7317 | unlock: | 
| 6988 | mutex_unlock(&pmus_lock); | 7318 | mutex_unlock(&pmus_lock); | 
| @@ -7027,12 +7357,23 @@ EXPORT_SYMBOL_GPL(perf_pmu_unregister); | |||
| 7027 | 7357 | ||
| 7028 | static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) | 7358 | static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) | 
| 7029 | { | 7359 | { | 
| 7360 | struct perf_event_context *ctx = NULL; | ||
| 7030 | int ret; | 7361 | int ret; | 
| 7031 | 7362 | ||
| 7032 | if (!try_module_get(pmu->module)) | 7363 | if (!try_module_get(pmu->module)) | 
| 7033 | return -ENODEV; | 7364 | return -ENODEV; | 
| 7365 | |||
| 7366 | if (event->group_leader != event) { | ||
| 7367 | ctx = perf_event_ctx_lock(event->group_leader); | ||
| 7368 | BUG_ON(!ctx); | ||
| 7369 | } | ||
| 7370 | |||
| 7034 | event->pmu = pmu; | 7371 | event->pmu = pmu; | 
| 7035 | ret = pmu->event_init(event); | 7372 | ret = pmu->event_init(event); | 
| 7373 | |||
| 7374 | if (ctx) | ||
| 7375 | perf_event_ctx_unlock(event->group_leader, ctx); | ||
| 7376 | |||
| 7036 | if (ret) | 7377 | if (ret) | 
| 7037 | module_put(pmu->module); | 7378 | module_put(pmu->module); | 
| 7038 | 7379 | ||
| @@ -7079,10 +7420,6 @@ static void account_event_cpu(struct perf_event *event, int cpu) | |||
| 7079 | if (event->parent) | 7420 | if (event->parent) | 
| 7080 | return; | 7421 | return; | 
| 7081 | 7422 | ||
| 7082 | if (has_branch_stack(event)) { | ||
| 7083 | if (!(event->attach_state & PERF_ATTACH_TASK)) | ||
| 7084 | atomic_inc(&per_cpu(perf_branch_stack_events, cpu)); | ||
| 7085 | } | ||
| 7086 | if (is_cgroup_event(event)) | 7423 | if (is_cgroup_event(event)) | 
| 7087 | atomic_inc(&per_cpu(perf_cgroup_events, cpu)); | 7424 | atomic_inc(&per_cpu(perf_cgroup_events, cpu)); | 
| 7088 | } | 7425 | } | 
| @@ -7121,7 +7458,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
| 7121 | struct perf_event *group_leader, | 7458 | struct perf_event *group_leader, | 
| 7122 | struct perf_event *parent_event, | 7459 | struct perf_event *parent_event, | 
| 7123 | perf_overflow_handler_t overflow_handler, | 7460 | perf_overflow_handler_t overflow_handler, | 
| 7124 | void *context) | 7461 | void *context, int cgroup_fd) | 
| 7125 | { | 7462 | { | 
| 7126 | struct pmu *pmu; | 7463 | struct pmu *pmu; | 
| 7127 | struct perf_event *event; | 7464 | struct perf_event *event; | 
| @@ -7176,18 +7513,18 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
| 7176 | 7513 | ||
| 7177 | if (task) { | 7514 | if (task) { | 
| 7178 | event->attach_state = PERF_ATTACH_TASK; | 7515 | event->attach_state = PERF_ATTACH_TASK; | 
| 7179 | |||
| 7180 | if (attr->type == PERF_TYPE_TRACEPOINT) | ||
| 7181 | event->hw.tp_target = task; | ||
| 7182 | #ifdef CONFIG_HAVE_HW_BREAKPOINT | ||
| 7183 | /* | 7516 | /* | 
| 7184 | * hw_breakpoint is a bit difficult here.. | 7517 | * XXX pmu::event_init needs to know what task to account to | 
| 7518 | * and we cannot use the ctx information because we need the | ||
| 7519 | * pmu before we get a ctx. | ||
| 7185 | */ | 7520 | */ | 
| 7186 | else if (attr->type == PERF_TYPE_BREAKPOINT) | 7521 | event->hw.target = task; | 
| 7187 | event->hw.bp_target = task; | ||
| 7188 | #endif | ||
| 7189 | } | 7522 | } | 
| 7190 | 7523 | ||
| 7524 | event->clock = &local_clock; | ||
| 7525 | if (parent_event) | ||
| 7526 | event->clock = parent_event->clock; | ||
| 7527 | |||
| 7191 | if (!overflow_handler && parent_event) { | 7528 | if (!overflow_handler && parent_event) { | 
| 7192 | overflow_handler = parent_event->overflow_handler; | 7529 | overflow_handler = parent_event->overflow_handler; | 
| 7193 | context = parent_event->overflow_handler_context; | 7530 | context = parent_event->overflow_handler_context; | 
| @@ -7214,6 +7551,15 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
| 7214 | if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) | 7551 | if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) | 
| 7215 | goto err_ns; | 7552 | goto err_ns; | 
| 7216 | 7553 | ||
| 7554 | if (!has_branch_stack(event)) | ||
| 7555 | event->attr.branch_sample_type = 0; | ||
| 7556 | |||
| 7557 | if (cgroup_fd != -1) { | ||
| 7558 | err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); | ||
| 7559 | if (err) | ||
| 7560 | goto err_ns; | ||
| 7561 | } | ||
| 7562 | |||
| 7217 | pmu = perf_init_event(event); | 7563 | pmu = perf_init_event(event); | 
| 7218 | if (!pmu) | 7564 | if (!pmu) | 
| 7219 | goto err_ns; | 7565 | goto err_ns; | 
| @@ -7222,21 +7568,30 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
| 7222 | goto err_ns; | 7568 | goto err_ns; | 
| 7223 | } | 7569 | } | 
| 7224 | 7570 | ||
| 7571 | err = exclusive_event_init(event); | ||
| 7572 | if (err) | ||
| 7573 | goto err_pmu; | ||
| 7574 | |||
| 7225 | if (!event->parent) { | 7575 | if (!event->parent) { | 
| 7226 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { | 7576 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { | 
| 7227 | err = get_callchain_buffers(); | 7577 | err = get_callchain_buffers(); | 
| 7228 | if (err) | 7578 | if (err) | 
| 7229 | goto err_pmu; | 7579 | goto err_per_task; | 
| 7230 | } | 7580 | } | 
| 7231 | } | 7581 | } | 
| 7232 | 7582 | ||
| 7233 | return event; | 7583 | return event; | 
| 7234 | 7584 | ||
| 7585 | err_per_task: | ||
| 7586 | exclusive_event_destroy(event); | ||
| 7587 | |||
| 7235 | err_pmu: | 7588 | err_pmu: | 
| 7236 | if (event->destroy) | 7589 | if (event->destroy) | 
| 7237 | event->destroy(event); | 7590 | event->destroy(event); | 
| 7238 | module_put(pmu->module); | 7591 | module_put(pmu->module); | 
| 7239 | err_ns: | 7592 | err_ns: | 
| 7593 | if (is_cgroup_event(event)) | ||
| 7594 | perf_detach_cgroup(event); | ||
| 7240 | if (event->ns) | 7595 | if (event->ns) | 
| 7241 | put_pid_ns(event->ns); | 7596 | put_pid_ns(event->ns); | 
| 7242 | kfree(event); | 7597 | kfree(event); | 
| @@ -7399,6 +7754,19 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) | |||
| 7399 | if (output_event->cpu == -1 && output_event->ctx != event->ctx) | 7754 | if (output_event->cpu == -1 && output_event->ctx != event->ctx) | 
| 7400 | goto out; | 7755 | goto out; | 
| 7401 | 7756 | ||
| 7757 | /* | ||
| 7758 | * Mixing clocks in the same buffer is trouble you don't need. | ||
| 7759 | */ | ||
| 7760 | if (output_event->clock != event->clock) | ||
| 7761 | goto out; | ||
| 7762 | |||
| 7763 | /* | ||
| 7764 | * If both events generate aux data, they must be on the same PMU | ||
| 7765 | */ | ||
| 7766 | if (has_aux(event) && has_aux(output_event) && | ||
| 7767 | event->pmu != output_event->pmu) | ||
| 7768 | goto out; | ||
| 7769 | |||
| 7402 | set: | 7770 | set: | 
| 7403 | mutex_lock(&event->mmap_mutex); | 7771 | mutex_lock(&event->mmap_mutex); | 
| 7404 | /* Can't redirect output if we've got an active mmap() */ | 7772 | /* Can't redirect output if we've got an active mmap() */ | 
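The two new bail-outs tighten perf_event_set_output(), which backs the PERF_EVENT_IOC_SET_OUTPUT ioctl: events may only share a ring buffer if their clocks match, and AUX-generating events must additionally come from the same PMU. The userspace side is unchanged; for reference, redirecting one event into another's buffer is just:

#include <sys/ioctl.h>
#include <linux/perf_event.h>

/* Route ev_fd's records into the buffer already mmap'ed for target_fd.
 * With this patch both events must also use the same clock. */
static int share_ring_buffer(int ev_fd, int target_fd)
{
        return ioctl(ev_fd, PERF_EVENT_IOC_SET_OUTPUT, target_fd);
}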
| @@ -7431,6 +7799,43 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b) | |||
| 7431 | mutex_lock_nested(b, SINGLE_DEPTH_NESTING); | 7799 | mutex_lock_nested(b, SINGLE_DEPTH_NESTING); | 
| 7432 | } | 7800 | } | 
| 7433 | 7801 | ||
| 7802 | static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) | ||
| 7803 | { | ||
| 7804 | bool nmi_safe = false; | ||
| 7805 | |||
| 7806 | switch (clk_id) { | ||
| 7807 | case CLOCK_MONOTONIC: | ||
| 7808 | event->clock = &ktime_get_mono_fast_ns; | ||
| 7809 | nmi_safe = true; | ||
| 7810 | break; | ||
| 7811 | |||
| 7812 | case CLOCK_MONOTONIC_RAW: | ||
| 7813 | event->clock = &ktime_get_raw_fast_ns; | ||
| 7814 | nmi_safe = true; | ||
| 7815 | break; | ||
| 7816 | |||
| 7817 | case CLOCK_REALTIME: | ||
| 7818 | event->clock = &ktime_get_real_ns; | ||
| 7819 | break; | ||
| 7820 | |||
| 7821 | case CLOCK_BOOTTIME: | ||
| 7822 | event->clock = &ktime_get_boot_ns; | ||
| 7823 | break; | ||
| 7824 | |||
| 7825 | case CLOCK_TAI: | ||
| 7826 | event->clock = &ktime_get_tai_ns; | ||
| 7827 | break; | ||
| 7828 | |||
| 7829 | default: | ||
| 7830 | return -EINVAL; | ||
| 7831 | } | ||
| 7832 | |||
| 7833 | if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI)) | ||
| 7834 | return -EINVAL; | ||
| 7835 | |||
| 7836 | return 0; | ||
| 7837 | } | ||
| 7838 | |||
| 7434 | /** | 7839 | /** | 
| 7435 | * sys_perf_event_open - open a performance event, associate it to a task/cpu | 7840 | * sys_perf_event_open - open a performance event, associate it to a task/cpu | 
| 7436 | * | 7841 | * | 
| @@ -7455,6 +7860,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 7455 | int move_group = 0; | 7860 | int move_group = 0; | 
| 7456 | int err; | 7861 | int err; | 
| 7457 | int f_flags = O_RDWR; | 7862 | int f_flags = O_RDWR; | 
| 7863 | int cgroup_fd = -1; | ||
| 7458 | 7864 | ||
| 7459 | /* for future expandability... */ | 7865 | /* for future expandability... */ | 
| 7460 | if (flags & ~PERF_FLAG_ALL) | 7866 | if (flags & ~PERF_FLAG_ALL) | 
| @@ -7520,21 +7926,16 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 7520 | 7926 | ||
| 7521 | get_online_cpus(); | 7927 | get_online_cpus(); | 
| 7522 | 7928 | ||
| 7929 | if (flags & PERF_FLAG_PID_CGROUP) | ||
| 7930 | cgroup_fd = pid; | ||
| 7931 | |||
| 7523 | event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, | 7932 | event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, | 
| 7524 | NULL, NULL); | 7933 | NULL, NULL, cgroup_fd); | 
| 7525 | if (IS_ERR(event)) { | 7934 | if (IS_ERR(event)) { | 
| 7526 | err = PTR_ERR(event); | 7935 | err = PTR_ERR(event); | 
| 7527 | goto err_cpus; | 7936 | goto err_cpus; | 
| 7528 | } | 7937 | } | 
| 7529 | 7938 | ||
| 7530 | if (flags & PERF_FLAG_PID_CGROUP) { | ||
| 7531 | err = perf_cgroup_connect(pid, event, &attr, group_leader); | ||
| 7532 | if (err) { | ||
| 7533 | __free_event(event); | ||
| 7534 | goto err_cpus; | ||
| 7535 | } | ||
| 7536 | } | ||
| 7537 | |||
| 7538 | if (is_sampling_event(event)) { | 7939 | if (is_sampling_event(event)) { | 
| 7539 | if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { | 7940 | if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { | 
| 7540 | err = -ENOTSUPP; | 7941 | err = -ENOTSUPP; | 
| @@ -7550,6 +7951,12 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 7550 | */ | 7951 | */ | 
| 7551 | pmu = event->pmu; | 7952 | pmu = event->pmu; | 
| 7552 | 7953 | ||
| 7954 | if (attr.use_clockid) { | ||
| 7955 | err = perf_event_set_clock(event, attr.clockid); | ||
| 7956 | if (err) | ||
| 7957 | goto err_alloc; | ||
| 7958 | } | ||
| 7959 | |||
| 7553 | if (group_leader && | 7960 | if (group_leader && | 
| 7554 | (is_software_event(event) != is_software_event(group_leader))) { | 7961 | (is_software_event(event) != is_software_event(group_leader))) { | 
| 7555 | if (is_software_event(event)) { | 7962 | if (is_software_event(event)) { | 
| @@ -7576,12 +7983,17 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 7576 | /* | 7983 | /* | 
| 7577 | * Get the target context (task or percpu): | 7984 | * Get the target context (task or percpu): | 
| 7578 | */ | 7985 | */ | 
| 7579 | ctx = find_get_context(pmu, task, event->cpu); | 7986 | ctx = find_get_context(pmu, task, event); | 
| 7580 | if (IS_ERR(ctx)) { | 7987 | if (IS_ERR(ctx)) { | 
| 7581 | err = PTR_ERR(ctx); | 7988 | err = PTR_ERR(ctx); | 
| 7582 | goto err_alloc; | 7989 | goto err_alloc; | 
| 7583 | } | 7990 | } | 
| 7584 | 7991 | ||
| 7992 | if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) { | ||
| 7993 | err = -EBUSY; | ||
| 7994 | goto err_context; | ||
| 7995 | } | ||
| 7996 | |||
| 7585 | if (task) { | 7997 | if (task) { | 
| 7586 | put_task_struct(task); | 7998 | put_task_struct(task); | 
| 7587 | task = NULL; | 7999 | task = NULL; | 
| @@ -7599,6 +8011,11 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 7599 | */ | 8011 | */ | 
| 7600 | if (group_leader->group_leader != group_leader) | 8012 | if (group_leader->group_leader != group_leader) | 
| 7601 | goto err_context; | 8013 | goto err_context; | 
| 8014 | |||
| 8015 | /* All events in a group should have the same clock */ | ||
| 8016 | if (group_leader->clock != event->clock) | ||
| 8017 | goto err_context; | ||
| 8018 | |||
| 7602 | /* | 8019 | /* | 
| 7603 | * Do not allow to attach to a group in a different | 8020 | * Do not allow to attach to a group in a different | 
| 7604 | * task or CPU context: | 8021 | * task or CPU context: | 
| @@ -7699,6 +8116,13 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 7699 | get_ctx(ctx); | 8116 | get_ctx(ctx); | 
| 7700 | } | 8117 | } | 
| 7701 | 8118 | ||
| 8119 | if (!exclusive_event_installable(event, ctx)) { | ||
| 8120 | err = -EBUSY; | ||
| 8121 | mutex_unlock(&ctx->mutex); | ||
| 8122 | fput(event_file); | ||
| 8123 | goto err_context; | ||
| 8124 | } | ||
| 8125 | |||
| 7702 | perf_install_in_context(ctx, event, event->cpu); | 8126 | perf_install_in_context(ctx, event, event->cpu); | 
| 7703 | perf_unpin_context(ctx); | 8127 | perf_unpin_context(ctx); | 
| 7704 | 8128 | ||
| @@ -7771,7 +8195,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
| 7771 | */ | 8195 | */ | 
| 7772 | 8196 | ||
| 7773 | event = perf_event_alloc(attr, cpu, task, NULL, NULL, | 8197 | event = perf_event_alloc(attr, cpu, task, NULL, NULL, | 
| 7774 | overflow_handler, context); | 8198 | overflow_handler, context, -1); | 
| 7775 | if (IS_ERR(event)) { | 8199 | if (IS_ERR(event)) { | 
| 7776 | err = PTR_ERR(event); | 8200 | err = PTR_ERR(event); | 
| 7777 | goto err; | 8201 | goto err; | 
| @@ -7782,7 +8206,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
| 7782 | 8206 | ||
| 7783 | account_event(event); | 8207 | account_event(event); | 
| 7784 | 8208 | ||
| 7785 | ctx = find_get_context(event->pmu, task, cpu); | 8209 | ctx = find_get_context(event->pmu, task, event); | 
| 7786 | if (IS_ERR(ctx)) { | 8210 | if (IS_ERR(ctx)) { | 
| 7787 | err = PTR_ERR(ctx); | 8211 | err = PTR_ERR(ctx); | 
| 7788 | goto err_free; | 8212 | goto err_free; | 
| @@ -7790,6 +8214,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
| 7790 | 8214 | ||
| 7791 | WARN_ON_ONCE(ctx->parent_ctx); | 8215 | WARN_ON_ONCE(ctx->parent_ctx); | 
| 7792 | mutex_lock(&ctx->mutex); | 8216 | mutex_lock(&ctx->mutex); | 
| 8217 | if (!exclusive_event_installable(event, ctx)) { | ||
| 8218 | mutex_unlock(&ctx->mutex); | ||
| 8219 | perf_unpin_context(ctx); | ||
| 8220 | put_ctx(ctx); | ||
| 8221 | err = -EBUSY; | ||
| 8222 | goto err_free; | ||
| 8223 | } | ||
| 8224 | |||
| 7793 | perf_install_in_context(ctx, event, cpu); | 8225 | perf_install_in_context(ctx, event, cpu); | 
| 7794 | perf_unpin_context(ctx); | 8226 | perf_unpin_context(ctx); | 
| 7795 | mutex_unlock(&ctx->mutex); | 8227 | mutex_unlock(&ctx->mutex); | 
| @@ -8132,7 +8564,7 @@ inherit_event(struct perf_event *parent_event, | |||
| 8132 | parent_event->cpu, | 8564 | parent_event->cpu, | 
| 8133 | child, | 8565 | child, | 
| 8134 | group_leader, parent_event, | 8566 | group_leader, parent_event, | 
| 8135 | NULL, NULL); | 8567 | NULL, NULL, -1); | 
| 8136 | if (IS_ERR(child_event)) | 8568 | if (IS_ERR(child_event)) | 
| 8137 | return child_event; | 8569 | return child_event; | 
| 8138 | 8570 | ||
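
perf_event_set_clock() above, together with the group-leader clock check and the output-redirect check, is driven by two perf_event_attr fields (use_clockid and clockid) that user space fills in before calling the syscall. A minimal user-space sketch; the event type and sample flags are chosen purely for illustration and are not taken from this patch:

	/* Illustrative only: request CLOCK_MONOTONIC_RAW timestamps on a
	 * CPU-cycles event for the current task. */
	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <string.h>
	#include <time.h>
	#include <unistd.h>

	static int open_raw_clock_event(void)
	{
		struct perf_event_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_HARDWARE;
		attr.config = PERF_COUNT_HW_CPU_CYCLES;
		attr.sample_type = PERF_SAMPLE_TIME;
		attr.use_clockid = 1;
		attr.clockid = CLOCK_MONOTONIC_RAW;

		/* pid = 0 (this task), cpu = -1 (any CPU), no group, no flags */
		return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	}

Clocks that are not NMI-safe (CLOCK_REALTIME, CLOCK_BOOTTIME, CLOCK_TAI) are only accepted when the PMU advertises PERF_PMU_CAP_NO_NMI; otherwise the open fails with EINVAL, per the check at the end of perf_event_set_clock().
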
| diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 9803a6600d49..92ce5f4ccc26 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c | |||
| @@ -116,12 +116,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) | |||
| 116 | */ | 116 | */ | 
| 117 | static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) | 117 | static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) | 
| 118 | { | 118 | { | 
| 119 | struct task_struct *tsk = bp->hw.bp_target; | 119 | struct task_struct *tsk = bp->hw.target; | 
| 120 | struct perf_event *iter; | 120 | struct perf_event *iter; | 
| 121 | int count = 0; | 121 | int count = 0; | 
| 122 | 122 | ||
| 123 | list_for_each_entry(iter, &bp_task_head, hw.bp_list) { | 123 | list_for_each_entry(iter, &bp_task_head, hw.bp_list) { | 
| 124 | if (iter->hw.bp_target == tsk && | 124 | if (iter->hw.target == tsk && | 
| 125 | find_slot_idx(iter) == type && | 125 | find_slot_idx(iter) == type && | 
| 126 | (iter->cpu < 0 || cpu == iter->cpu)) | 126 | (iter->cpu < 0 || cpu == iter->cpu)) | 
| 127 | count += hw_breakpoint_weight(iter); | 127 | count += hw_breakpoint_weight(iter); | 
| @@ -153,7 +153,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, | |||
| 153 | int nr; | 153 | int nr; | 
| 154 | 154 | ||
| 155 | nr = info->cpu_pinned; | 155 | nr = info->cpu_pinned; | 
| 156 | if (!bp->hw.bp_target) | 156 | if (!bp->hw.target) | 
| 157 | nr += max_task_bp_pinned(cpu, type); | 157 | nr += max_task_bp_pinned(cpu, type); | 
| 158 | else | 158 | else | 
| 159 | nr += task_bp_pinned(cpu, bp, type); | 159 | nr += task_bp_pinned(cpu, bp, type); | 
| @@ -210,7 +210,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, | |||
| 210 | weight = -weight; | 210 | weight = -weight; | 
| 211 | 211 | ||
| 212 | /* Pinned counter cpu profiling */ | 212 | /* Pinned counter cpu profiling */ | 
| 213 | if (!bp->hw.bp_target) { | 213 | if (!bp->hw.target) { | 
| 214 | get_bp_info(bp->cpu, type)->cpu_pinned += weight; | 214 | get_bp_info(bp->cpu, type)->cpu_pinned += weight; | 
| 215 | return; | 215 | return; | 
| 216 | } | 216 | } | 
| diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 569b218782ad..9f6ce9ba4a04 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h | |||
| @@ -27,6 +27,7 @@ struct ring_buffer { | |||
| 27 | local_t lost; /* nr records lost */ | 27 | local_t lost; /* nr records lost */ | 
| 28 | 28 | ||
| 29 | long watermark; /* wakeup watermark */ | 29 | long watermark; /* wakeup watermark */ | 
| 30 | long aux_watermark; | ||
| 30 | /* poll crap */ | 31 | /* poll crap */ | 
| 31 | spinlock_t event_lock; | 32 | spinlock_t event_lock; | 
| 32 | struct list_head event_list; | 33 | struct list_head event_list; | 
| @@ -35,6 +36,20 @@ struct ring_buffer { | |||
| 35 | unsigned long mmap_locked; | 36 | unsigned long mmap_locked; | 
| 36 | struct user_struct *mmap_user; | 37 | struct user_struct *mmap_user; | 
| 37 | 38 | ||
| 39 | /* AUX area */ | ||
| 40 | local_t aux_head; | ||
| 41 | local_t aux_nest; | ||
| 42 | local_t aux_wakeup; | ||
| 43 | unsigned long aux_pgoff; | ||
| 44 | int aux_nr_pages; | ||
| 45 | int aux_overwrite; | ||
| 46 | atomic_t aux_mmap_count; | ||
| 47 | unsigned long aux_mmap_locked; | ||
| 48 | void (*free_aux)(void *); | ||
| 49 | atomic_t aux_refcount; | ||
| 50 | void **aux_pages; | ||
| 51 | void *aux_priv; | ||
| 52 | |||
| 38 | struct perf_event_mmap_page *user_page; | 53 | struct perf_event_mmap_page *user_page; | 
| 39 | void *data_pages[0]; | 54 | void *data_pages[0]; | 
| 40 | }; | 55 | }; | 
| @@ -43,6 +58,19 @@ extern void rb_free(struct ring_buffer *rb); | |||
| 43 | extern struct ring_buffer * | 58 | extern struct ring_buffer * | 
| 44 | rb_alloc(int nr_pages, long watermark, int cpu, int flags); | 59 | rb_alloc(int nr_pages, long watermark, int cpu, int flags); | 
| 45 | extern void perf_event_wakeup(struct perf_event *event); | 60 | extern void perf_event_wakeup(struct perf_event *event); | 
| 61 | extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, | ||
| 62 | pgoff_t pgoff, int nr_pages, long watermark, int flags); | ||
| 63 | extern void rb_free_aux(struct ring_buffer *rb); | ||
| 64 | extern struct ring_buffer *ring_buffer_get(struct perf_event *event); | ||
| 65 | extern void ring_buffer_put(struct ring_buffer *rb); | ||
| 66 | |||
| 67 | static inline bool rb_has_aux(struct ring_buffer *rb) | ||
| 68 | { | ||
| 69 | return !!rb->aux_nr_pages; | ||
| 70 | } | ||
| 71 | |||
| 72 | void perf_event_aux_event(struct perf_event *event, unsigned long head, | ||
| 73 | unsigned long size, u64 flags); | ||
| 46 | 74 | ||
| 47 | extern void | 75 | extern void | 
| 48 | perf_event_header__init_id(struct perf_event_header *header, | 76 | perf_event_header__init_id(struct perf_event_header *header, | 
| @@ -81,6 +109,11 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb) | |||
| 81 | return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); | 109 | return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); | 
| 82 | } | 110 | } | 
| 83 | 111 | ||
| 112 | static inline unsigned long perf_aux_size(struct ring_buffer *rb) | ||
| 113 | { | ||
| 114 | return rb->aux_nr_pages << PAGE_SHIFT; | ||
| 115 | } | ||
| 116 | |||
| 84 | #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ | 117 | #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ | 
| 85 | static inline unsigned long \ | 118 | static inline unsigned long \ | 
| 86 | func_name(struct perf_output_handle *handle, \ | 119 | func_name(struct perf_output_handle *handle, \ | 
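
The aux_pgoff/aux_nr_pages bookkeeping added to struct ring_buffer backs a second mmap() region. User space describes that region through the aux_offset/aux_size fields this series adds to the perf_event_mmap_page; the kernel-side handshake lives in perf_mmap() and is not part of this hunk. A rough user-space sketch, with page counts picked arbitrarily:

	/* Sketch: map the normal data buffer first, then describe and map
	 * the AUX area through the user page. */
	#include <linux/perf_event.h>
	#include <sys/mman.h>
	#include <unistd.h>

	static void *map_aux_area(int fd, int nr_data_pages, int nr_aux_pages)
	{
		size_t page_size = sysconf(_SC_PAGESIZE);
		size_t data_len = (nr_data_pages + 1) * page_size; /* +1: user page */
		struct perf_event_mmap_page *pc;

		pc = mmap(NULL, data_len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (pc == MAP_FAILED)
			return NULL;

		/* Tell the kernel where the AUX area starts and how big it is. */
		pc->aux_offset = data_len;
		pc->aux_size = (size_t)nr_aux_pages * page_size;

		/* Mapping this range PROT_READ-only would request the overwrite
		 * ("snapshot") mode that rb_alloc_aux() derives from the absence
		 * of RING_BUFFER_WRITABLE. */
		return mmap(NULL, pc->aux_size, PROT_READ | PROT_WRITE, MAP_SHARED,
			    fd, pc->aux_offset);
	}
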
| diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index eadb95ce7aac..232f00f273cb 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c | |||
| @@ -243,14 +243,317 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) | |||
| 243 | spin_lock_init(&rb->event_lock); | 243 | spin_lock_init(&rb->event_lock); | 
| 244 | } | 244 | } | 
| 245 | 245 | ||
| 246 | /* | ||
| 247 | * This is called before hardware starts writing to the AUX area to | ||
| 248 | * obtain an output handle and make sure there's room in the buffer. | ||
| 249 | * When the capture completes, call perf_aux_output_end() to commit | ||
| 250 | * the recorded data to the buffer. | ||
| 251 | * | ||
| 252 | * The ordering is similar to that of perf_output_{begin,end}, with | ||
| 253 | * the exception of (B), which should be taken care of by the pmu | ||
| 254 | * driver, since ordering rules will differ depending on hardware. | ||
| 255 | */ | ||
| 256 | void *perf_aux_output_begin(struct perf_output_handle *handle, | ||
| 257 | struct perf_event *event) | ||
| 258 | { | ||
| 259 | struct perf_event *output_event = event; | ||
| 260 | unsigned long aux_head, aux_tail; | ||
| 261 | struct ring_buffer *rb; | ||
| 262 | |||
| 263 | if (output_event->parent) | ||
| 264 | output_event = output_event->parent; | ||
| 265 | |||
| 266 | /* | ||
| 267 | * Since this will typically be open across pmu::add/pmu::del, we | ||
| 268 | * grab ring_buffer's refcount instead of holding rcu read lock | ||
| 269 | * to make sure it doesn't disappear under us. | ||
| 270 | */ | ||
| 271 | rb = ring_buffer_get(output_event); | ||
| 272 | if (!rb) | ||
| 273 | return NULL; | ||
| 274 | |||
| 275 | if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount)) | ||
| 276 | goto err; | ||
| 277 | |||
| 278 | /* | ||
| 279 | * Nesting is not supported for the AUX area; make sure nested | ||
| 280 | * writers are caught early | ||
| 281 | */ | ||
| 282 | if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1))) | ||
| 283 | goto err_put; | ||
| 284 | |||
| 285 | aux_head = local_read(&rb->aux_head); | ||
| 286 | |||
| 287 | handle->rb = rb; | ||
| 288 | handle->event = event; | ||
| 289 | handle->head = aux_head; | ||
| 290 | handle->size = 0; | ||
| 291 | |||
| 292 | /* | ||
| 293 | * In overwrite mode, AUX data stores do not depend on aux_tail, | ||
| 294 | * therefore (A) control dependency barrier does not exist. The | ||
| 295 | * (B) <-> (C) ordering is still observed by the pmu driver. | ||
| 296 | */ | ||
| 297 | if (!rb->aux_overwrite) { | ||
| 298 | aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); | ||
| 299 | handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark; | ||
| 300 | if (aux_head - aux_tail < perf_aux_size(rb)) | ||
| 301 | handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); | ||
| 302 | |||
| 303 | /* | ||
| 304 | * handle->size computation depends on aux_tail load; this forms a | ||
| 305 | * control dependency barrier separating aux_tail load from aux data | ||
| 306 | * store that will be enabled on successful return | ||
| 307 | */ | ||
| 308 | if (!handle->size) { /* A, matches D */ | ||
| 309 | event->pending_disable = 1; | ||
| 310 | perf_output_wakeup(handle); | ||
| 311 | local_set(&rb->aux_nest, 0); | ||
| 312 | goto err_put; | ||
| 313 | } | ||
| 314 | } | ||
| 315 | |||
| 316 | return handle->rb->aux_priv; | ||
| 317 | |||
| 318 | err_put: | ||
| 319 | rb_free_aux(rb); | ||
| 320 | |||
| 321 | err: | ||
| 322 | ring_buffer_put(rb); | ||
| 323 | handle->event = NULL; | ||
| 324 | |||
| 325 | return NULL; | ||
| 326 | } | ||
| 327 | |||
| 328 | /* | ||
| 329 | * Commit the data written by hardware into the ring buffer by adjusting | ||
| 330 | * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the | ||
| 331 | * pmu driver's responsibility to observe ordering rules of the hardware, | ||
| 332 | * so that all the data is externally visible before this is called. | ||
| 333 | */ | ||
| 334 | void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, | ||
| 335 | bool truncated) | ||
| 336 | { | ||
| 337 | struct ring_buffer *rb = handle->rb; | ||
| 338 | unsigned long aux_head; | ||
| 339 | u64 flags = 0; | ||
| 340 | |||
| 341 | if (truncated) | ||
| 342 | flags |= PERF_AUX_FLAG_TRUNCATED; | ||
| 343 | |||
| 344 | /* in overwrite mode, driver provides aux_head via handle */ | ||
| 345 | if (rb->aux_overwrite) { | ||
| 346 | flags |= PERF_AUX_FLAG_OVERWRITE; | ||
| 347 | |||
| 348 | aux_head = handle->head; | ||
| 349 | local_set(&rb->aux_head, aux_head); | ||
| 350 | } else { | ||
| 351 | aux_head = local_read(&rb->aux_head); | ||
| 352 | local_add(size, &rb->aux_head); | ||
| 353 | } | ||
| 354 | |||
| 355 | if (size || flags) { | ||
| 356 | /* | ||
| 357 | * Only send RECORD_AUX if we have something useful to communicate | ||
| 358 | */ | ||
| 359 | |||
| 360 | perf_event_aux_event(handle->event, aux_head, size, flags); | ||
| 361 | } | ||
| 362 | |||
| 363 | aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); | ||
| 364 | |||
| 365 | if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { | ||
| 366 | perf_output_wakeup(handle); | ||
| 367 | local_add(rb->aux_watermark, &rb->aux_wakeup); | ||
| 368 | } | ||
| 369 | handle->event = NULL; | ||
| 370 | |||
| 371 | local_set(&rb->aux_nest, 0); | ||
| 372 | rb_free_aux(rb); | ||
| 373 | ring_buffer_put(rb); | ||
| 374 | } | ||
| 375 | |||
| 376 | /* | ||
| 377 | * Skip over a given number of bytes in the AUX buffer, due to, for example, | ||
| 378 | * hardware's alignment constraints. | ||
| 379 | */ | ||
| 380 | int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) | ||
| 381 | { | ||
| 382 | struct ring_buffer *rb = handle->rb; | ||
| 383 | unsigned long aux_head; | ||
| 384 | |||
| 385 | if (size > handle->size) | ||
| 386 | return -ENOSPC; | ||
| 387 | |||
| 388 | local_add(size, &rb->aux_head); | ||
| 389 | |||
| 390 | aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); | ||
| 391 | if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { | ||
| 392 | perf_output_wakeup(handle); | ||
| 393 | local_add(rb->aux_watermark, &rb->aux_wakeup); | ||
| 394 | handle->wakeup = local_read(&rb->aux_wakeup) + | ||
| 395 | rb->aux_watermark; | ||
| 396 | } | ||
| 397 | |||
| 398 | handle->head = aux_head; | ||
| 399 | handle->size -= size; | ||
| 400 | |||
| 401 | return 0; | ||
| 402 | } | ||
| 403 | |||
| 404 | void *perf_get_aux(struct perf_output_handle *handle) | ||
| 405 | { | ||
| 406 | /* this is only valid between perf_aux_output_begin and *_end */ | ||
| 407 | if (!handle->event) | ||
| 408 | return NULL; | ||
| 409 | |||
| 410 | return handle->rb->aux_priv; | ||
| 411 | } | ||
| 412 | |||
| 413 | #define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY) | ||
| 414 | |||
| 415 | static struct page *rb_alloc_aux_page(int node, int order) | ||
| 416 | { | ||
| 417 | struct page *page; | ||
| 418 | |||
| 419 | if (order > MAX_ORDER) | ||
| 420 | order = MAX_ORDER; | ||
| 421 | |||
| 422 | do { | ||
| 423 | page = alloc_pages_node(node, PERF_AUX_GFP, order); | ||
| 424 | } while (!page && order--); | ||
| 425 | |||
| 426 | if (page && order) { | ||
| 427 | /* | ||
| 428 | * Communicate the allocation size to the driver | ||
| 429 | */ | ||
| 430 | split_page(page, order); | ||
| 431 | SetPagePrivate(page); | ||
| 432 | set_page_private(page, order); | ||
| 433 | } | ||
| 434 | |||
| 435 | return page; | ||
| 436 | } | ||
| 437 | |||
| 438 | static void rb_free_aux_page(struct ring_buffer *rb, int idx) | ||
| 439 | { | ||
| 440 | struct page *page = virt_to_page(rb->aux_pages[idx]); | ||
| 441 | |||
| 442 | ClearPagePrivate(page); | ||
| 443 | page->mapping = NULL; | ||
| 444 | __free_page(page); | ||
| 445 | } | ||
| 446 | |||
| 447 | int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, | ||
| 448 | pgoff_t pgoff, int nr_pages, long watermark, int flags) | ||
| 449 | { | ||
| 450 | bool overwrite = !(flags & RING_BUFFER_WRITABLE); | ||
| 451 | int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu); | ||
| 452 | int ret = -ENOMEM, max_order = 0; | ||
| 453 | |||
| 454 | if (!has_aux(event)) | ||
| 455 | return -ENOTSUPP; | ||
| 456 | |||
| 457 | if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) { | ||
| 458 | /* | ||
| 459 | * We need to start with the max_order that fits in nr_pages, | ||
| 460 | * not the other way around, hence ilog2() and not get_order. | ||
| 461 | */ | ||
| 462 | max_order = ilog2(nr_pages); | ||
| 463 | |||
| 464 | /* | ||
| 465 | * PMU requests more than one contiguous chunk of memory | ||
| 466 | * for SW double buffering | ||
| 467 | */ | ||
| 468 | if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) && | ||
| 469 | !overwrite) { | ||
| 470 | if (!max_order) | ||
| 471 | return -EINVAL; | ||
| 472 | |||
| 473 | max_order--; | ||
| 474 | } | ||
| 475 | } | ||
| 476 | |||
| 477 | rb->aux_pages = kzalloc_node(nr_pages * sizeof(void *), GFP_KERNEL, node); | ||
| 478 | if (!rb->aux_pages) | ||
| 479 | return -ENOMEM; | ||
| 480 | |||
| 481 | rb->free_aux = event->pmu->free_aux; | ||
| 482 | for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) { | ||
| 483 | struct page *page; | ||
| 484 | int last, order; | ||
| 485 | |||
| 486 | order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages)); | ||
| 487 | page = rb_alloc_aux_page(node, order); | ||
| 488 | if (!page) | ||
| 489 | goto out; | ||
| 490 | |||
| 491 | for (last = rb->aux_nr_pages + (1 << page_private(page)); | ||
| 492 | last > rb->aux_nr_pages; rb->aux_nr_pages++) | ||
| 493 | rb->aux_pages[rb->aux_nr_pages] = page_address(page++); | ||
| 494 | } | ||
| 495 | |||
| 496 | rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages, | ||
| 497 | overwrite); | ||
| 498 | if (!rb->aux_priv) | ||
| 499 | goto out; | ||
| 500 | |||
| 501 | ret = 0; | ||
| 502 | |||
| 503 | /* | ||
| 504 | * aux_pages (and pmu driver's private data, aux_priv) will be | ||
| 505 | * referenced in both producer's and consumer's contexts, thus | ||
| 506 | * we keep a refcount here to make sure either of the two can | ||
| 507 | * reference them safely. | ||
| 508 | */ | ||
| 509 | atomic_set(&rb->aux_refcount, 1); | ||
| 510 | |||
| 511 | rb->aux_overwrite = overwrite; | ||
| 512 | rb->aux_watermark = watermark; | ||
| 513 | |||
| 514 | if (!rb->aux_watermark && !rb->aux_overwrite) | ||
| 515 | rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1); | ||
| 516 | |||
| 517 | out: | ||
| 518 | if (!ret) | ||
| 519 | rb->aux_pgoff = pgoff; | ||
| 520 | else | ||
| 521 | rb_free_aux(rb); | ||
| 522 | |||
| 523 | return ret; | ||
| 524 | } | ||
| 525 | |||
| 526 | static void __rb_free_aux(struct ring_buffer *rb) | ||
| 527 | { | ||
| 528 | int pg; | ||
| 529 | |||
| 530 | if (rb->aux_priv) { | ||
| 531 | rb->free_aux(rb->aux_priv); | ||
| 532 | rb->free_aux = NULL; | ||
| 533 | rb->aux_priv = NULL; | ||
| 534 | } | ||
| 535 | |||
| 536 | for (pg = 0; pg < rb->aux_nr_pages; pg++) | ||
| 537 | rb_free_aux_page(rb, pg); | ||
| 538 | |||
| 539 | kfree(rb->aux_pages); | ||
| 540 | rb->aux_nr_pages = 0; | ||
| 541 | } | ||
| 542 | |||
| 543 | void rb_free_aux(struct ring_buffer *rb) | ||
| 544 | { | ||
| 545 | if (atomic_dec_and_test(&rb->aux_refcount)) | ||
| 546 | __rb_free_aux(rb); | ||
| 547 | } | ||
| 548 | |||
| 246 | #ifndef CONFIG_PERF_USE_VMALLOC | 549 | #ifndef CONFIG_PERF_USE_VMALLOC | 
| 247 | 550 | ||
| 248 | /* | 551 | /* | 
| 249 | * Back perf_mmap() with regular GFP_KERNEL-0 pages. | 552 | * Back perf_mmap() with regular GFP_KERNEL-0 pages. | 
| 250 | */ | 553 | */ | 
| 251 | 554 | ||
| 252 | struct page * | 555 | static struct page * | 
| 253 | perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) | 556 | __perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) | 
| 254 | { | 557 | { | 
| 255 | if (pgoff > rb->nr_pages) | 558 | if (pgoff > rb->nr_pages) | 
| 256 | return NULL; | 559 | return NULL; | 
| @@ -340,8 +643,8 @@ static int data_page_nr(struct ring_buffer *rb) | |||
| 340 | return rb->nr_pages << page_order(rb); | 643 | return rb->nr_pages << page_order(rb); | 
| 341 | } | 644 | } | 
| 342 | 645 | ||
| 343 | struct page * | 646 | static struct page * | 
| 344 | perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) | 647 | __perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) | 
| 345 | { | 648 | { | 
| 346 | /* The '>' counts in the user page. */ | 649 | /* The '>' counts in the user page. */ | 
| 347 | if (pgoff > data_page_nr(rb)) | 650 | if (pgoff > data_page_nr(rb)) | 
| @@ -416,3 +719,19 @@ fail: | |||
| 416 | } | 719 | } | 
| 417 | 720 | ||
| 418 | #endif | 721 | #endif | 
| 722 | |||
| 723 | struct page * | ||
| 724 | perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) | ||
| 725 | { | ||
| 726 | if (rb->aux_nr_pages) { | ||
| 727 | /* above AUX space */ | ||
| 728 | if (pgoff > rb->aux_pgoff + rb->aux_nr_pages) | ||
| 729 | return NULL; | ||
| 730 | |||
| 731 | /* AUX space */ | ||
| 732 | if (pgoff >= rb->aux_pgoff) | ||
| 733 | return virt_to_page(rb->aux_pages[pgoff - rb->aux_pgoff]); | ||
| 734 | } | ||
| 735 | |||
| 736 | return __perf_mmap_to_page(rb, pgoff); | ||
| 737 | } | ||
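
perf_aux_output_begin() and perf_aux_output_end() are the only points where a PMU driver touches the AUX machinery above. A hedged sketch of how an imaginary driver might bracket a capture; struct my_trace_buf and my_hw_drain() are assumptions for illustration, not kernel APIs:

	#include <linux/perf_event.h>

	struct my_trace_buf;					/* invented driver type */
	unsigned long my_hw_drain(struct my_trace_buf *tbuf,
				  unsigned long room);		/* invented helper */

	static void my_pmu_flush_aux(struct perf_event *event)
	{
		struct perf_output_handle handle;
		struct my_trace_buf *tbuf;
		unsigned long bytes;

		/* Claims the AUX buffer, takes rb->aux_refcount and returns the
		 * aux_priv cookie that pmu->setup_aux() produced. */
		tbuf = perf_aux_output_begin(&handle, event);
		if (!tbuf)
			return;	/* no AUX mapping, nested writer, or no space left */

		/* In non-overwrite mode at most handle.size bytes may be produced. */
		bytes = my_hw_drain(tbuf, handle.size);

		/* Advances aux_head, emits PERF_RECORD_AUX, drops the references. */
		perf_aux_output_end(&handle, bytes, false);
	}

Passing true as the final argument would set PERF_AUX_FLAG_TRUNCATED on the emitted PERF_RECORD_AUX, which is how a driver reports that the hardware ran out of room.
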
| diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 83d4382f5699..6873bb3e6b7e 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c | |||
| @@ -20,145 +20,10 @@ | |||
| 20 | #include <linux/types.h> | 20 | #include <linux/types.h> | 
| 21 | #include <linux/fs_struct.h> | 21 | #include <linux/fs_struct.h> | 
| 22 | 22 | ||
| 23 | |||
| 24 | static void default_handler(int, struct pt_regs *); | ||
| 25 | |||
| 26 | static struct exec_domain *exec_domains = &default_exec_domain; | ||
| 27 | static DEFINE_RWLOCK(exec_domains_lock); | ||
| 28 | |||
| 29 | |||
| 30 | static unsigned long ident_map[32] = { | ||
| 31 | 0, 1, 2, 3, 4, 5, 6, 7, | ||
| 32 | 8, 9, 10, 11, 12, 13, 14, 15, | ||
| 33 | 16, 17, 18, 19, 20, 21, 22, 23, | ||
| 34 | 24, 25, 26, 27, 28, 29, 30, 31 | ||
| 35 | }; | ||
| 36 | |||
| 37 | struct exec_domain default_exec_domain = { | ||
| 38 | .name = "Linux", /* name */ | ||
| 39 | .handler = default_handler, /* lcall7 causes a seg fault. */ | ||
| 40 | .pers_low = 0, /* PER_LINUX personality. */ | ||
| 41 | .pers_high = 0, /* PER_LINUX personality. */ | ||
| 42 | .signal_map = ident_map, /* Identity map signals. */ | ||
| 43 | .signal_invmap = ident_map, /* - both ways. */ | ||
| 44 | }; | ||
| 45 | |||
| 46 | |||
| 47 | static void | ||
| 48 | default_handler(int segment, struct pt_regs *regp) | ||
| 49 | { | ||
| 50 | set_personality(0); | ||
| 51 | |||
| 52 | if (current_thread_info()->exec_domain->handler != default_handler) | ||
| 53 | current_thread_info()->exec_domain->handler(segment, regp); | ||
| 54 | else | ||
| 55 | send_sig(SIGSEGV, current, 1); | ||
| 56 | } | ||
| 57 | |||
| 58 | static struct exec_domain * | ||
| 59 | lookup_exec_domain(unsigned int personality) | ||
| 60 | { | ||
| 61 | unsigned int pers = personality(personality); | ||
| 62 | struct exec_domain *ep; | ||
| 63 | |||
| 64 | read_lock(&exec_domains_lock); | ||
| 65 | for (ep = exec_domains; ep; ep = ep->next) { | ||
| 66 | if (pers >= ep->pers_low && pers <= ep->pers_high) | ||
| 67 | if (try_module_get(ep->module)) | ||
| 68 | goto out; | ||
| 69 | } | ||
| 70 | |||
| 71 | #ifdef CONFIG_MODULES | ||
| 72 | read_unlock(&exec_domains_lock); | ||
| 73 | request_module("personality-%d", pers); | ||
| 74 | read_lock(&exec_domains_lock); | ||
| 75 | |||
| 76 | for (ep = exec_domains; ep; ep = ep->next) { | ||
| 77 | if (pers >= ep->pers_low && pers <= ep->pers_high) | ||
| 78 | if (try_module_get(ep->module)) | ||
| 79 | goto out; | ||
| 80 | } | ||
| 81 | #endif | ||
| 82 | |||
| 83 | ep = &default_exec_domain; | ||
| 84 | out: | ||
| 85 | read_unlock(&exec_domains_lock); | ||
| 86 | return ep; | ||
| 87 | } | ||
| 88 | |||
| 89 | int | ||
| 90 | register_exec_domain(struct exec_domain *ep) | ||
| 91 | { | ||
| 92 | struct exec_domain *tmp; | ||
| 93 | int err = -EBUSY; | ||
| 94 | |||
| 95 | if (ep == NULL) | ||
| 96 | return -EINVAL; | ||
| 97 | |||
| 98 | if (ep->next != NULL) | ||
| 99 | return -EBUSY; | ||
| 100 | |||
| 101 | write_lock(&exec_domains_lock); | ||
| 102 | for (tmp = exec_domains; tmp; tmp = tmp->next) { | ||
| 103 | if (tmp == ep) | ||
| 104 | goto out; | ||
| 105 | } | ||
| 106 | |||
| 107 | ep->next = exec_domains; | ||
| 108 | exec_domains = ep; | ||
| 109 | err = 0; | ||
| 110 | |||
| 111 | out: | ||
| 112 | write_unlock(&exec_domains_lock); | ||
| 113 | return err; | ||
| 114 | } | ||
| 115 | EXPORT_SYMBOL(register_exec_domain); | ||
| 116 | |||
| 117 | int | ||
| 118 | unregister_exec_domain(struct exec_domain *ep) | ||
| 119 | { | ||
| 120 | struct exec_domain **epp; | ||
| 121 | |||
| 122 | epp = &exec_domains; | ||
| 123 | write_lock(&exec_domains_lock); | ||
| 124 | for (epp = &exec_domains; *epp; epp = &(*epp)->next) { | ||
| 125 | if (ep == *epp) | ||
| 126 | goto unregister; | ||
| 127 | } | ||
| 128 | write_unlock(&exec_domains_lock); | ||
| 129 | return -EINVAL; | ||
| 130 | |||
| 131 | unregister: | ||
| 132 | *epp = ep->next; | ||
| 133 | ep->next = NULL; | ||
| 134 | write_unlock(&exec_domains_lock); | ||
| 135 | return 0; | ||
| 136 | } | ||
| 137 | EXPORT_SYMBOL(unregister_exec_domain); | ||
| 138 | |||
| 139 | int __set_personality(unsigned int personality) | ||
| 140 | { | ||
| 141 | struct exec_domain *oep = current_thread_info()->exec_domain; | ||
| 142 | |||
| 143 | current_thread_info()->exec_domain = lookup_exec_domain(personality); | ||
| 144 | current->personality = personality; | ||
| 145 | module_put(oep->module); | ||
| 146 | |||
| 147 | return 0; | ||
| 148 | } | ||
| 149 | EXPORT_SYMBOL(__set_personality); | ||
| 150 | |||
| 151 | #ifdef CONFIG_PROC_FS | 23 | #ifdef CONFIG_PROC_FS | 
| 152 | static int execdomains_proc_show(struct seq_file *m, void *v) | 24 | static int execdomains_proc_show(struct seq_file *m, void *v) | 
| 153 | { | 25 | { | 
| 154 | struct exec_domain *ep; | 26 | seq_puts(m, "0-0\tLinux \t[kernel]\n"); | 
| 155 | |||
| 156 | read_lock(&exec_domains_lock); | ||
| 157 | for (ep = exec_domains; ep; ep = ep->next) | ||
| 158 | seq_printf(m, "%d-%d\t%-16s\t[%s]\n", | ||
| 159 | ep->pers_low, ep->pers_high, ep->name, | ||
| 160 | module_name(ep->module)); | ||
| 161 | read_unlock(&exec_domains_lock); | ||
| 162 | return 0; | 27 | return 0; | 
| 163 | } | 28 | } | 
| 164 | 29 | ||
| diff --git a/kernel/exit.c b/kernel/exit.c index feff10bbb307..22fcc05dec40 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -756,8 +756,6 @@ void do_exit(long code) | |||
| 756 | 756 | ||
| 757 | cgroup_exit(tsk); | 757 | cgroup_exit(tsk); | 
| 758 | 758 | ||
| 759 | module_put(task_thread_info(tsk)->exec_domain->module); | ||
| 760 | |||
| 761 | /* | 759 | /* | 
| 762 | * FIXME: do that only when needed, using sched_exit tracepoint | 760 | * FIXME: do that only when needed, using sched_exit tracepoint | 
| 763 | */ | 761 | */ | 
| diff --git a/kernel/fork.c b/kernel/fork.c index cf65139615a0..f2c1e7352298 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -1279,9 +1279,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1279 | if (nr_threads >= max_threads) | 1279 | if (nr_threads >= max_threads) | 
| 1280 | goto bad_fork_cleanup_count; | 1280 | goto bad_fork_cleanup_count; | 
| 1281 | 1281 | ||
| 1282 | if (!try_module_get(task_thread_info(p)->exec_domain->module)) | ||
| 1283 | goto bad_fork_cleanup_count; | ||
| 1284 | |||
| 1285 | delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ | 1282 | delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ | 
| 1286 | p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); | 1283 | p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); | 
| 1287 | p->flags |= PF_FORKNOEXEC; | 1284 | p->flags |= PF_FORKNOEXEC; | 
| @@ -1590,7 +1587,6 @@ bad_fork_cleanup_threadgroup_lock: | |||
| 1590 | if (clone_flags & CLONE_THREAD) | 1587 | if (clone_flags & CLONE_THREAD) | 
| 1591 | threadgroup_change_end(current); | 1588 | threadgroup_change_end(current); | 
| 1592 | delayacct_tsk_free(p); | 1589 | delayacct_tsk_free(p); | 
| 1593 | module_put(task_thread_info(p)->exec_domain->module); | ||
| 1594 | bad_fork_cleanup_count: | 1590 | bad_fork_cleanup_count: | 
| 1595 | atomic_dec(&p->cred->user->processes); | 1591 | atomic_dec(&p->cred->user->processes); | 
| 1596 | exit_creds(p); | 1592 | exit_creds(p); | 
| diff --git a/kernel/futex.c b/kernel/futex.c index 2a5e3830e953..2579e407ff67 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
| @@ -900,7 +900,7 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, | |||
| 900 | if (!p) | 900 | if (!p) | 
| 901 | return -ESRCH; | 901 | return -ESRCH; | 
| 902 | 902 | ||
| 903 | if (!p->mm) { | 903 | if (unlikely(p->flags & PF_KTHREAD)) { | 
| 904 | put_task_struct(p); | 904 | put_task_struct(p); | 
| 905 | return -EPERM; | 905 | return -EPERM; | 
| 906 | } | 906 | } | 
| diff --git a/kernel/groups.c b/kernel/groups.c index 664411f171b5..74d431d25251 100644 --- a/kernel/groups.c +++ b/kernel/groups.c | |||
| @@ -9,9 +9,6 @@ | |||
| 9 | #include <linux/user_namespace.h> | 9 | #include <linux/user_namespace.h> | 
| 10 | #include <asm/uaccess.h> | 10 | #include <asm/uaccess.h> | 
| 11 | 11 | ||
| 12 | /* init to 2 - one for init_task, one to ensure it is never freed */ | ||
| 13 | struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; | ||
| 14 | |||
| 15 | struct group_info *groups_alloc(int gidsetsize) | 12 | struct group_info *groups_alloc(int gidsetsize) | 
| 16 | { | 13 | { | 
| 17 | struct group_info *group_info; | 14 | struct group_info *group_info; | 
| diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 06db12434d72..e0f90c2b57aa 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
| @@ -169,7 +169,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) | |||
| 169 | return; | 169 | return; | 
| 170 | 170 | ||
| 171 | rcu_read_lock(); | 171 | rcu_read_lock(); | 
| 172 | do_each_thread(g, t) { | 172 | for_each_process_thread(g, t) { | 
| 173 | if (!max_count--) | 173 | if (!max_count--) | 
| 174 | goto unlock; | 174 | goto unlock; | 
| 175 | if (!--batch_count) { | 175 | if (!--batch_count) { | 
| @@ -180,7 +180,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) | |||
| 180 | /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ | 180 | /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ | 
| 181 | if (t->state == TASK_UNINTERRUPTIBLE) | 181 | if (t->state == TASK_UNINTERRUPTIBLE) | 
| 182 | check_hung_task(t, timeout); | 182 | check_hung_task(t, timeout); | 
| 183 | } while_each_thread(g, t); | 183 | } | 
| 184 | unlock: | 184 | unlock: | 
| 185 | rcu_read_unlock(); | 185 | rcu_read_unlock(); | 
| 186 | } | 186 | } | 
| diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6f1c7a566b95..eb9a4ea394ab 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
| @@ -948,6 +948,22 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data) | |||
| 948 | 948 | ||
| 949 | return -ENOSYS; | 949 | return -ENOSYS; | 
| 950 | } | 950 | } | 
| 951 | |||
| 952 | /** | ||
| 953 | * irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt | ||
| 954 | * @data: Pointer to interrupt specific data | ||
| 955 | * @on: Whether to set or reset the wake-up capability of this irq | ||
| 956 | * | ||
| 957 | * Conditional, as the underlying parent chip might not implement it. | ||
| 958 | */ | ||
| 959 | int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on) | ||
| 960 | { | ||
| 961 | data = data->parent_data; | ||
| 962 | if (data->chip->irq_set_wake) | ||
| 963 | return data->chip->irq_set_wake(data, on); | ||
| 964 | |||
| 965 | return -ENOSYS; | ||
| 966 | } | ||
| 951 | #endif | 967 | #endif | 
| 952 | 968 | ||
| 953 | /** | 969 | /** | 
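
irq_chip_set_wake_parent() is meant to be wired directly into a hierarchical irqchip's callbacks, the same way the existing *_parent helpers are. A minimal sketch; the chip itself is hypothetical:

	#include <linux/irq.h>

	/* Child chip that simply forwards everything to its parent domain. */
	static struct irq_chip my_child_chip = {
		.name		= "my-child",
		.irq_mask	= irq_chip_mask_parent,
		.irq_unmask	= irq_chip_unmask_parent,
		.irq_eoi	= irq_chip_eoi_parent,
		.irq_set_wake	= irq_chip_set_wake_parent,	/* helper added above */
	};

Like the other *_parent helpers, it returns -ENOSYS when the parent chip does not implement the callback.
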
| diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 886d09e691d5..e68932bb308e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
| @@ -68,14 +68,20 @@ static void __synchronize_hardirq(struct irq_desc *desc) | |||
| 68 | * Do not use this for shutdown scenarios where you must be sure | 68 | * Do not use this for shutdown scenarios where you must be sure | 
| 69 | * that all parts (hardirq and threaded handler) have completed. | 69 | * that all parts (hardirq and threaded handler) have completed. | 
| 70 | * | 70 | * | 
| 71 | * Returns: false if a threaded handler is active. | ||
| 72 | * | ||
| 71 | * This function may be called - with care - from IRQ context. | 73 | * This function may be called - with care - from IRQ context. | 
| 72 | */ | 74 | */ | 
| 73 | void synchronize_hardirq(unsigned int irq) | 75 | bool synchronize_hardirq(unsigned int irq) | 
| 74 | { | 76 | { | 
| 75 | struct irq_desc *desc = irq_to_desc(irq); | 77 | struct irq_desc *desc = irq_to_desc(irq); | 
| 76 | 78 | ||
| 77 | if (desc) | 79 | if (desc) { | 
| 78 | __synchronize_hardirq(desc); | 80 | __synchronize_hardirq(desc); | 
| 81 | return !atomic_read(&desc->threads_active); | ||
| 82 | } | ||
| 83 | |||
| 84 | return true; | ||
| 79 | } | 85 | } | 
| 80 | EXPORT_SYMBOL(synchronize_hardirq); | 86 | EXPORT_SYMBOL(synchronize_hardirq); | 
| 81 | 87 | ||
| @@ -440,6 +446,32 @@ void disable_irq(unsigned int irq) | |||
| 440 | } | 446 | } | 
| 441 | EXPORT_SYMBOL(disable_irq); | 447 | EXPORT_SYMBOL(disable_irq); | 
| 442 | 448 | ||
| 449 | /** | ||
| 450 | * disable_hardirq - disables an irq and waits for hardirq completion | ||
| 451 | * @irq: Interrupt to disable | ||
| 452 | * | ||
| 453 | * Disable the selected interrupt line. Enables and Disables are | ||
| 454 | * nested. | ||
| 455 | * This function waits for any pending hard IRQ handlers for this | ||
| 456 | * interrupt to complete before returning. If you use this function while | ||
| 457 | * holding a resource the hard IRQ handler may need you will deadlock. | ||
| 458 | * | ||
| 459 | * When used to optimistically disable an interrupt from atomic context | ||
| 460 | * the return value must be checked. | ||
| 461 | * | ||
| 462 | * Returns: false if a threaded handler is active. | ||
| 463 | * | ||
| 464 | * This function may be called - with care - from IRQ context. | ||
| 465 | */ | ||
| 466 | bool disable_hardirq(unsigned int irq) | ||
| 467 | { | ||
| 468 | if (!__disable_irq_nosync(irq)) | ||
| 469 | return synchronize_hardirq(irq); | ||
| 470 | |||
| 471 | return false; | ||
| 472 | } | ||
| 473 | EXPORT_SYMBOL_GPL(disable_hardirq); | ||
| 474 | |||
| 443 | void __enable_irq(struct irq_desc *desc, unsigned int irq) | 475 | void __enable_irq(struct irq_desc *desc, unsigned int irq) | 
| 444 | { | 476 | { | 
| 445 | switch (desc->depth) { | 477 | switch (desc->depth) { | 
| @@ -1766,3 +1798,94 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, | |||
| 1766 | 1798 | ||
| 1767 | return retval; | 1799 | return retval; | 
| 1768 | } | 1800 | } | 
| 1801 | |||
| 1802 | /** | ||
| 1803 | * irq_get_irqchip_state - returns the irqchip state of an interrupt. | ||
| 1804 | * @irq: Interrupt line that is forwarded to a VM | ||
| 1805 | * @which: One of IRQCHIP_STATE_* the caller wants to know about | ||
| 1806 | * @state: a pointer to a boolean where the state is to be stored | ||
| 1807 | * | ||
| 1808 | * This call snapshots the internal irqchip state of an | ||
| 1809 | * interrupt, returning into @state the bit corresponding to | ||
| 1810 | * state @which | ||
| 1811 | * | ||
| 1812 | * This function should be called with preemption disabled if the | ||
| 1813 | * interrupt controller has per-cpu registers. | ||
| 1814 | */ | ||
| 1815 | int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, | ||
| 1816 | bool *state) | ||
| 1817 | { | ||
| 1818 | struct irq_desc *desc; | ||
| 1819 | struct irq_data *data; | ||
| 1820 | struct irq_chip *chip; | ||
| 1821 | unsigned long flags; | ||
| 1822 | int err = -EINVAL; | ||
| 1823 | |||
| 1824 | desc = irq_get_desc_buslock(irq, &flags, 0); | ||
| 1825 | if (!desc) | ||
| 1826 | return err; | ||
| 1827 | |||
| 1828 | data = irq_desc_get_irq_data(desc); | ||
| 1829 | |||
| 1830 | do { | ||
| 1831 | chip = irq_data_get_irq_chip(data); | ||
| 1832 | if (chip->irq_get_irqchip_state) | ||
| 1833 | break; | ||
| 1834 | #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY | ||
| 1835 | data = data->parent_data; | ||
| 1836 | #else | ||
| 1837 | data = NULL; | ||
| 1838 | #endif | ||
| 1839 | } while (data); | ||
| 1840 | |||
| 1841 | if (data) | ||
| 1842 | err = chip->irq_get_irqchip_state(data, which, state); | ||
| 1843 | |||
| 1844 | irq_put_desc_busunlock(desc, flags); | ||
| 1845 | return err; | ||
| 1846 | } | ||
| 1847 | |||
| 1848 | /** | ||
| 1849 | * irq_set_irqchip_state - set the state of a forwarded interrupt. | ||
| 1850 | * @irq: Interrupt line that is forwarded to a VM | ||
| 1851 | * @which: State to be restored (one of IRQCHIP_STATE_*) | ||
| 1852 | * @val: Value corresponding to @which | ||
| 1853 | * | ||
| 1854 | * This call sets the internal irqchip state of an interrupt, | ||
| 1855 | * depending on the value of @which. | ||
| 1856 | * | ||
| 1857 | * This function should be called with preemption disabled if the | ||
| 1858 | * interrupt controller has per-cpu registers. | ||
| 1859 | */ | ||
| 1860 | int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, | ||
| 1861 | bool val) | ||
| 1862 | { | ||
| 1863 | struct irq_desc *desc; | ||
| 1864 | struct irq_data *data; | ||
| 1865 | struct irq_chip *chip; | ||
| 1866 | unsigned long flags; | ||
| 1867 | int err = -EINVAL; | ||
| 1868 | |||
| 1869 | desc = irq_get_desc_buslock(irq, &flags, 0); | ||
| 1870 | if (!desc) | ||
| 1871 | return err; | ||
| 1872 | |||
| 1873 | data = irq_desc_get_irq_data(desc); | ||
| 1874 | |||
| 1875 | do { | ||
| 1876 | chip = irq_data_get_irq_chip(data); | ||
| 1877 | if (chip->irq_set_irqchip_state) | ||
| 1878 | break; | ||
| 1879 | #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY | ||
| 1880 | data = data->parent_data; | ||
| 1881 | #else | ||
| 1882 | data = NULL; | ||
| 1883 | #endif | ||
| 1884 | } while (data); | ||
| 1885 | |||
| 1886 | if (data) | ||
| 1887 | err = chip->irq_set_irqchip_state(data, which, val); | ||
| 1888 | |||
| 1889 | irq_put_desc_busunlock(desc, flags); | ||
| 1890 | return err; | ||
| 1891 | } | ||
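
disable_hardirq() and the two irqchip-state accessors added above are aimed at callers that need to reason about what the hardware handler and the irqchip are doing right now; the kerneldoc names lines forwarded to a VM as the target for the latter. A hedged sketch; struct my_dev and my_device_quiesce() are invented for illustration:

	#include <linux/interrupt.h>
	#include <linux/irq.h>
	#include <linux/workqueue.h>

	struct my_dev {
		struct work_struct quiesce_work;
	};

	void my_device_quiesce(struct my_dev *dev);	/* invented helper */

	static void my_quiesce_irq(struct my_dev *dev, unsigned int irq)
	{
		bool pending;

		/* Hard-IRQ completion is waited for; a false return means a
		 * threaded handler is still running, so defer to a path that
		 * can sleep. */
		if (disable_hardirq(irq)) {
			my_device_quiesce(dev);
			enable_irq(irq);
		} else {
			schedule_work(&dev->quiesce_work);
		}

		/* Snapshot (and clear) a pending latch at the irqchip, e.g.
		 * before handing the line over to a guest. */
		if (!irq_get_irqchip_state(irq, IRQCHIP_STATE_PENDING, &pending) &&
		    pending)
			irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, false);
	}

Both accessors walk data->parent_data until they find a chip that implements the matching callback, so the same call works unchanged for stacked irq domains.
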
| diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 3e18163f336f..474de5cb394d 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c | |||
| @@ -310,8 +310,15 @@ void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) | |||
| 310 | struct msi_desc *desc; | 310 | struct msi_desc *desc; | 
| 311 | 311 | ||
| 312 | for_each_msi_entry(desc, dev) { | 312 | for_each_msi_entry(desc, dev) { | 
| 313 | irq_domain_free_irqs(desc->irq, desc->nvec_used); | 313 | /* | 
| 314 | desc->irq = 0; | 314 | * We might have failed to allocate an MSI early | 
| 315 | * enough that there is no IRQ associated with this | ||
| 316 | * entry. If that's the case, don't do anything. | ||
| 317 | */ | ||
| 318 | if (desc->irq) { | ||
| 319 | irq_domain_free_irqs(desc->irq, desc->nvec_used); | ||
| 320 | desc->irq = 0; | ||
| 321 | } | ||
| 315 | } | 322 | } | 
| 316 | } | 323 | } | 
| 317 | 324 | ||
| diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 01ca08804f51..284e2691e380 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c | |||
| @@ -89,16 +89,28 @@ static bool klp_is_object_loaded(struct klp_object *obj) | |||
| 89 | /* sets obj->mod if object is not vmlinux and module is found */ | 89 | /* sets obj->mod if object is not vmlinux and module is found */ | 
| 90 | static void klp_find_object_module(struct klp_object *obj) | 90 | static void klp_find_object_module(struct klp_object *obj) | 
| 91 | { | 91 | { | 
| 92 | struct module *mod; | ||
| 93 | |||
| 92 | if (!klp_is_module(obj)) | 94 | if (!klp_is_module(obj)) | 
| 93 | return; | 95 | return; | 
| 94 | 96 | ||
| 95 | mutex_lock(&module_mutex); | 97 | mutex_lock(&module_mutex); | 
| 96 | /* | 98 | /* | 
| 97 | * We don't need to take a reference on the module here because we have | 99 | * We do not want to block removal of patched modules and therefore | 
| 98 | * the klp_mutex, which is also taken by the module notifier. This | 100 | * we do not take a reference here. The patches are removed by | 
| 99 | * prevents any module from unloading until we release the klp_mutex. | 101 | * a going module handler instead. | 
| 102 | */ | ||
| 103 | mod = find_module(obj->name); | ||
| 104 | /* | ||
| 105 | * Do not mess with the work of the module coming and going notifiers. | ||
| 106 | * Note that the patch might still be needed before the going handler | ||
| 107 | * is called. Module functions can be called even in the GOING state | ||
| 108 | * until mod->exit() finishes. This is especially important for | ||
| 109 | * patches that modify the semantics of the functions. | ||
| 100 | */ | 110 | */ | 
| 101 | obj->mod = find_module(obj->name); | 111 | if (mod && mod->klp_alive) | 
| 112 | obj->mod = mod; | ||
| 113 | |||
| 102 | mutex_unlock(&module_mutex); | 114 | mutex_unlock(&module_mutex); | 
| 103 | } | 115 | } | 
| 104 | 116 | ||
| @@ -323,32 +335,20 @@ unlock: | |||
| 323 | rcu_read_unlock(); | 335 | rcu_read_unlock(); | 
| 324 | } | 336 | } | 
| 325 | 337 | ||
| 326 | static int klp_disable_func(struct klp_func *func) | 338 | static void klp_disable_func(struct klp_func *func) | 
| 327 | { | 339 | { | 
| 328 | struct klp_ops *ops; | 340 | struct klp_ops *ops; | 
| 329 | int ret; | ||
| 330 | 341 | ||
| 331 | if (WARN_ON(func->state != KLP_ENABLED)) | 342 | WARN_ON(func->state != KLP_ENABLED); | 
| 332 | return -EINVAL; | 343 | WARN_ON(!func->old_addr); | 
| 333 | |||
| 334 | if (WARN_ON(!func->old_addr)) | ||
| 335 | return -EINVAL; | ||
| 336 | 344 | ||
| 337 | ops = klp_find_ops(func->old_addr); | 345 | ops = klp_find_ops(func->old_addr); | 
| 338 | if (WARN_ON(!ops)) | 346 | if (WARN_ON(!ops)) | 
| 339 | return -EINVAL; | 347 | return; | 
| 340 | 348 | ||
| 341 | if (list_is_singular(&ops->func_stack)) { | 349 | if (list_is_singular(&ops->func_stack)) { | 
| 342 | ret = unregister_ftrace_function(&ops->fops); | 350 | WARN_ON(unregister_ftrace_function(&ops->fops)); | 
| 343 | if (ret) { | 351 | WARN_ON(ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0)); | 
| 344 | pr_err("failed to unregister ftrace handler for function '%s' (%d)\n", | ||
| 345 | func->old_name, ret); | ||
| 346 | return ret; | ||
| 347 | } | ||
| 348 | |||
| 349 | ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0); | ||
| 350 | if (ret) | ||
| 351 | pr_warn("function unregister succeeded but failed to clear the filter\n"); | ||
| 352 | 352 | ||
| 353 | list_del_rcu(&func->stack_node); | 353 | list_del_rcu(&func->stack_node); | 
| 354 | list_del(&ops->node); | 354 | list_del(&ops->node); | 
| @@ -358,8 +358,6 @@ static int klp_disable_func(struct klp_func *func) | |||
| 358 | } | 358 | } | 
| 359 | 359 | ||
| 360 | func->state = KLP_DISABLED; | 360 | func->state = KLP_DISABLED; | 
| 361 | |||
| 362 | return 0; | ||
| 363 | } | 361 | } | 
| 364 | 362 | ||
| 365 | static int klp_enable_func(struct klp_func *func) | 363 | static int klp_enable_func(struct klp_func *func) | 
| @@ -420,23 +418,15 @@ err: | |||
| 420 | return ret; | 418 | return ret; | 
| 421 | } | 419 | } | 
| 422 | 420 | ||
| 423 | static int klp_disable_object(struct klp_object *obj) | 421 | static void klp_disable_object(struct klp_object *obj) | 
| 424 | { | 422 | { | 
| 425 | struct klp_func *func; | 423 | struct klp_func *func; | 
| 426 | int ret; | ||
| 427 | |||
| 428 | for (func = obj->funcs; func->old_name; func++) { | ||
| 429 | if (func->state != KLP_ENABLED) | ||
| 430 | continue; | ||
| 431 | 424 | ||
| 432 | ret = klp_disable_func(func); | 425 | for (func = obj->funcs; func->old_name; func++) | 
| 433 | if (ret) | 426 | if (func->state == KLP_ENABLED) | 
| 434 | return ret; | 427 | klp_disable_func(func); | 
| 435 | } | ||
| 436 | 428 | ||
| 437 | obj->state = KLP_DISABLED; | 429 | obj->state = KLP_DISABLED; | 
| 438 | |||
| 439 | return 0; | ||
| 440 | } | 430 | } | 
| 441 | 431 | ||
| 442 | static int klp_enable_object(struct klp_object *obj) | 432 | static int klp_enable_object(struct klp_object *obj) | 
| @@ -452,22 +442,19 @@ static int klp_enable_object(struct klp_object *obj) | |||
| 452 | 442 | ||
| 453 | for (func = obj->funcs; func->old_name; func++) { | 443 | for (func = obj->funcs; func->old_name; func++) { | 
| 454 | ret = klp_enable_func(func); | 444 | ret = klp_enable_func(func); | 
| 455 | if (ret) | 445 | if (ret) { | 
| 456 | goto unregister; | 446 | klp_disable_object(obj); | 
| 447 | return ret; | ||
| 448 | } | ||
| 457 | } | 449 | } | 
| 458 | obj->state = KLP_ENABLED; | 450 | obj->state = KLP_ENABLED; | 
| 459 | 451 | ||
| 460 | return 0; | 452 | return 0; | 
| 461 | |||
| 462 | unregister: | ||
| 463 | WARN_ON(klp_disable_object(obj)); | ||
| 464 | return ret; | ||
| 465 | } | 453 | } | 
| 466 | 454 | ||
| 467 | static int __klp_disable_patch(struct klp_patch *patch) | 455 | static int __klp_disable_patch(struct klp_patch *patch) | 
| 468 | { | 456 | { | 
| 469 | struct klp_object *obj; | 457 | struct klp_object *obj; | 
| 470 | int ret; | ||
| 471 | 458 | ||
| 472 | /* enforce stacking: only the last enabled patch can be disabled */ | 459 | /* enforce stacking: only the last enabled patch can be disabled */ | 
| 473 | if (!list_is_last(&patch->list, &klp_patches) && | 460 | if (!list_is_last(&patch->list, &klp_patches) && | 
| @@ -477,12 +464,8 @@ static int __klp_disable_patch(struct klp_patch *patch) | |||
| 477 | pr_notice("disabling patch '%s'\n", patch->mod->name); | 464 | pr_notice("disabling patch '%s'\n", patch->mod->name); | 
| 478 | 465 | ||
| 479 | for (obj = patch->objs; obj->funcs; obj++) { | 466 | for (obj = patch->objs; obj->funcs; obj++) { | 
| 480 | if (obj->state != KLP_ENABLED) | 467 | if (obj->state == KLP_ENABLED) | 
| 481 | continue; | 468 | klp_disable_object(obj); | 
| 482 | |||
| 483 | ret = klp_disable_object(obj); | ||
| 484 | if (ret) | ||
| 485 | return ret; | ||
| 486 | } | 469 | } | 
| 487 | 470 | ||
| 488 | patch->state = KLP_DISABLED; | 471 | patch->state = KLP_DISABLED; | 
| @@ -541,8 +524,6 @@ static int __klp_enable_patch(struct klp_patch *patch) | |||
| 541 | pr_notice("enabling patch '%s'\n", patch->mod->name); | 524 | pr_notice("enabling patch '%s'\n", patch->mod->name); | 
| 542 | 525 | ||
| 543 | for (obj = patch->objs; obj->funcs; obj++) { | 526 | for (obj = patch->objs; obj->funcs; obj++) { | 
| 544 | klp_find_object_module(obj); | ||
| 545 | |||
| 546 | if (!klp_is_object_loaded(obj)) | 527 | if (!klp_is_object_loaded(obj)) | 
| 547 | continue; | 528 | continue; | 
| 548 | 529 | ||
| @@ -767,6 +748,7 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) | |||
| 767 | return -EINVAL; | 748 | return -EINVAL; | 
| 768 | 749 | ||
| 769 | obj->state = KLP_DISABLED; | 750 | obj->state = KLP_DISABLED; | 
| 751 | obj->mod = NULL; | ||
| 770 | 752 | ||
| 771 | klp_find_object_module(obj); | 753 | klp_find_object_module(obj); | 
| 772 | 754 | ||
| @@ -932,7 +914,6 @@ static void klp_module_notify_going(struct klp_patch *patch, | |||
| 932 | { | 914 | { | 
| 933 | struct module *pmod = patch->mod; | 915 | struct module *pmod = patch->mod; | 
| 934 | struct module *mod = obj->mod; | 916 | struct module *mod = obj->mod; | 
| 935 | int ret; | ||
| 936 | 917 | ||
| 937 | if (patch->state == KLP_DISABLED) | 918 | if (patch->state == KLP_DISABLED) | 
| 938 | goto disabled; | 919 | goto disabled; | 
| @@ -940,10 +921,7 @@ static void klp_module_notify_going(struct klp_patch *patch, | |||
| 940 | pr_notice("reverting patch '%s' on unloading module '%s'\n", | 921 | pr_notice("reverting patch '%s' on unloading module '%s'\n", | 
| 941 | pmod->name, mod->name); | 922 | pmod->name, mod->name); | 
| 942 | 923 | ||
| 943 | ret = klp_disable_object(obj); | 924 | klp_disable_object(obj); | 
| 944 | if (ret) | ||
| 945 | pr_warn("failed to revert patch '%s' on module '%s' (%d)\n", | ||
| 946 | pmod->name, mod->name, ret); | ||
| 947 | 925 | ||
| 948 | disabled: | 926 | disabled: | 
| 949 | klp_free_object_loaded(obj); | 927 | klp_free_object_loaded(obj); | 
| @@ -961,6 +939,15 @@ static int klp_module_notify(struct notifier_block *nb, unsigned long action, | |||
| 961 | 939 | ||
| 962 | mutex_lock(&klp_mutex); | 940 | mutex_lock(&klp_mutex); | 
| 963 | 941 | ||
| 942 | /* | ||
| 943 | * Each module has to know that the notifier has been called. | ||
| 944 | * We never know what module will get patched by a new patch. | ||
| 945 | */ | ||
| 946 | if (action == MODULE_STATE_COMING) | ||
| 947 | mod->klp_alive = true; | ||
| 948 | else /* MODULE_STATE_GOING */ | ||
| 949 | mod->klp_alive = false; | ||
| 950 | |||
| 964 | list_for_each_entry(patch, &klp_patches, list) { | 951 | list_for_each_entry(patch, &klp_patches, list) { | 
| 965 | for (obj = patch->objs; obj->funcs; obj++) { | 952 | for (obj = patch->objs; obj->funcs; obj++) { | 
| 966 | if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) | 953 | if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) | 
| diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 88d0d4420ad2..ba77ab5f64dd 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c | |||
| @@ -633,7 +633,7 @@ static int count_matching_names(struct lock_class *new_class) | |||
| 633 | if (!new_class->name) | 633 | if (!new_class->name) | 
| 634 | return 0; | 634 | return 0; | 
| 635 | 635 | ||
| 636 | list_for_each_entry(class, &all_lock_classes, lock_entry) { | 636 | list_for_each_entry_rcu(class, &all_lock_classes, lock_entry) { | 
| 637 | if (new_class->key - new_class->subclass == class->key) | 637 | if (new_class->key - new_class->subclass == class->key) | 
| 638 | return class->name_version; | 638 | return class->name_version; | 
| 639 | if (class->name && !strcmp(class->name, new_class->name)) | 639 | if (class->name && !strcmp(class->name, new_class->name)) | 
| @@ -700,10 +700,12 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) | |||
| 700 | hash_head = classhashentry(key); | 700 | hash_head = classhashentry(key); | 
| 701 | 701 | ||
| 702 | /* | 702 | /* | 
| 703 | * We can walk the hash lockfree, because the hash only | 703 | * We do an RCU walk of the hash, see lockdep_free_key_range(). | 
| 704 | * grows, and we are careful when adding entries to the end: | ||
| 705 | */ | 704 | */ | 
| 706 | list_for_each_entry(class, hash_head, hash_entry) { | 705 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 
| 706 | return NULL; | ||
| 707 | |||
| 708 | list_for_each_entry_rcu(class, hash_head, hash_entry) { | ||
| 707 | if (class->key == key) { | 709 | if (class->key == key) { | 
| 708 | /* | 710 | /* | 
| 709 | * Huh! same key, different name? Did someone trample | 711 | * Huh! same key, different name? Did someone trample | 
| @@ -728,7 +730,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | |||
| 728 | struct lockdep_subclass_key *key; | 730 | struct lockdep_subclass_key *key; | 
| 729 | struct list_head *hash_head; | 731 | struct list_head *hash_head; | 
| 730 | struct lock_class *class; | 732 | struct lock_class *class; | 
| 731 | unsigned long flags; | 733 | |
| 734 | DEBUG_LOCKS_WARN_ON(!irqs_disabled()); | ||
| 732 | 735 | ||
| 733 | class = look_up_lock_class(lock, subclass); | 736 | class = look_up_lock_class(lock, subclass); | 
| 734 | if (likely(class)) | 737 | if (likely(class)) | 
| @@ -750,28 +753,26 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | |||
| 750 | key = lock->key->subkeys + subclass; | 753 | key = lock->key->subkeys + subclass; | 
| 751 | hash_head = classhashentry(key); | 754 | hash_head = classhashentry(key); | 
| 752 | 755 | ||
| 753 | raw_local_irq_save(flags); | ||
| 754 | if (!graph_lock()) { | 756 | if (!graph_lock()) { | 
| 755 | raw_local_irq_restore(flags); | ||
| 756 | return NULL; | 757 | return NULL; | 
| 757 | } | 758 | } | 
| 758 | /* | 759 | /* | 
| 759 | * We have to do the hash-walk again, to avoid races | 760 | * We have to do the hash-walk again, to avoid races | 
| 760 | * with another CPU: | 761 | * with another CPU: | 
| 761 | */ | 762 | */ | 
| 762 | list_for_each_entry(class, hash_head, hash_entry) | 763 | list_for_each_entry_rcu(class, hash_head, hash_entry) { | 
| 763 | if (class->key == key) | 764 | if (class->key == key) | 
| 764 | goto out_unlock_set; | 765 | goto out_unlock_set; | 
| 766 | } | ||
| 767 | |||
| 765 | /* | 768 | /* | 
| 766 | * Allocate a new key from the static array, and add it to | 769 | * Allocate a new key from the static array, and add it to | 
| 767 | * the hash: | 770 | * the hash: | 
| 768 | */ | 771 | */ | 
| 769 | if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { | 772 | if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { | 
| 770 | if (!debug_locks_off_graph_unlock()) { | 773 | if (!debug_locks_off_graph_unlock()) { | 
| 771 | raw_local_irq_restore(flags); | ||
| 772 | return NULL; | 774 | return NULL; | 
| 773 | } | 775 | } | 
| 774 | raw_local_irq_restore(flags); | ||
| 775 | 776 | ||
| 776 | print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!"); | 777 | print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!"); | 
| 777 | dump_stack(); | 778 | dump_stack(); | 
| @@ -798,7 +799,6 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | |||
| 798 | 799 | ||
| 799 | if (verbose(class)) { | 800 | if (verbose(class)) { | 
| 800 | graph_unlock(); | 801 | graph_unlock(); | 
| 801 | raw_local_irq_restore(flags); | ||
| 802 | 802 | ||
| 803 | printk("\nnew class %p: %s", class->key, class->name); | 803 | printk("\nnew class %p: %s", class->key, class->name); | 
| 804 | if (class->name_version > 1) | 804 | if (class->name_version > 1) | 
| @@ -806,15 +806,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | |||
| 806 | printk("\n"); | 806 | printk("\n"); | 
| 807 | dump_stack(); | 807 | dump_stack(); | 
| 808 | 808 | ||
| 809 | raw_local_irq_save(flags); | ||
| 810 | if (!graph_lock()) { | 809 | if (!graph_lock()) { | 
| 811 | raw_local_irq_restore(flags); | ||
| 812 | return NULL; | 810 | return NULL; | 
| 813 | } | 811 | } | 
| 814 | } | 812 | } | 
| 815 | out_unlock_set: | 813 | out_unlock_set: | 
| 816 | graph_unlock(); | 814 | graph_unlock(); | 
| 817 | raw_local_irq_restore(flags); | ||
| 818 | 815 | ||
| 819 | out_set_class_cache: | 816 | out_set_class_cache: | 
| 820 | if (!subclass || force) | 817 | if (!subclass || force) | 
| @@ -870,11 +867,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, | |||
| 870 | entry->distance = distance; | 867 | entry->distance = distance; | 
| 871 | entry->trace = *trace; | 868 | entry->trace = *trace; | 
| 872 | /* | 869 | /* | 
| 873 | * Since we never remove from the dependency list, the list can | 870 | * Both allocation and removal are done under the graph lock; but | 
| 874 | * be walked lockless by other CPUs, it's only allocation | 871 | * iteration is under RCU-sched; see look_up_lock_class() and | 
| 875 | * that must be protected by the spinlock. But this also means | 872 | * lockdep_free_key_range(). | 
| 876 | * we must make new entries visible only once writes to the | ||
| 877 | * entry become visible - hence the RCU op: | ||
| 878 | */ | 873 | */ | 
| 879 | list_add_tail_rcu(&entry->entry, head); | 874 | list_add_tail_rcu(&entry->entry, head); | 
| 880 | 875 | ||
| @@ -1025,7 +1020,9 @@ static int __bfs(struct lock_list *source_entry, | |||
| 1025 | else | 1020 | else | 
| 1026 | head = &lock->class->locks_before; | 1021 | head = &lock->class->locks_before; | 
| 1027 | 1022 | ||
| 1028 | list_for_each_entry(entry, head, entry) { | 1023 | DEBUG_LOCKS_WARN_ON(!irqs_disabled()); | 
| 1024 | |||
| 1025 | list_for_each_entry_rcu(entry, head, entry) { | ||
| 1029 | if (!lock_accessed(entry)) { | 1026 | if (!lock_accessed(entry)) { | 
| 1030 | unsigned int cq_depth; | 1027 | unsigned int cq_depth; | 
| 1031 | mark_lock_accessed(entry, lock); | 1028 | mark_lock_accessed(entry, lock); | 
| @@ -2022,7 +2019,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, | |||
| 2022 | * We can walk it lock-free, because entries only get added | 2019 | * We can walk it lock-free, because entries only get added | 
| 2023 | * to the hash: | 2020 | * to the hash: | 
| 2024 | */ | 2021 | */ | 
| 2025 | list_for_each_entry(chain, hash_head, entry) { | 2022 | list_for_each_entry_rcu(chain, hash_head, entry) { | 
| 2026 | if (chain->chain_key == chain_key) { | 2023 | if (chain->chain_key == chain_key) { | 
| 2027 | cache_hit: | 2024 | cache_hit: | 
| 2028 | debug_atomic_inc(chain_lookup_hits); | 2025 | debug_atomic_inc(chain_lookup_hits); | 
| @@ -2996,8 +2993,18 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, | |||
| 2996 | if (unlikely(!debug_locks)) | 2993 | if (unlikely(!debug_locks)) | 
| 2997 | return; | 2994 | return; | 
| 2998 | 2995 | ||
| 2999 | if (subclass) | 2996 | if (subclass) { | 
| 2997 | unsigned long flags; | ||
| 2998 | |||
| 2999 | if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion)) | ||
| 3000 | return; | ||
| 3001 | |||
| 3002 | raw_local_irq_save(flags); | ||
| 3003 | current->lockdep_recursion = 1; | ||
| 3000 | register_lock_class(lock, subclass, 1); | 3004 | register_lock_class(lock, subclass, 1); | 
| 3005 | current->lockdep_recursion = 0; | ||
| 3006 | raw_local_irq_restore(flags); | ||
| 3007 | } | ||
| 3001 | } | 3008 | } | 
| 3002 | EXPORT_SYMBOL_GPL(lockdep_init_map); | 3009 | EXPORT_SYMBOL_GPL(lockdep_init_map); | 
| 3003 | 3010 | ||
| @@ -3887,9 +3894,17 @@ static inline int within(const void *addr, void *start, unsigned long size) | |||
| 3887 | return addr >= start && addr < start + size; | 3894 | return addr >= start && addr < start + size; | 
| 3888 | } | 3895 | } | 
| 3889 | 3896 | ||
| 3897 | /* | ||
| 3898 | * Used in module.c to remove lock classes from memory that is going to be | ||
| 3899 | * freed; and possibly re-used by other modules. | ||
| 3900 | * | ||
| 3901 | * We will have had one sync_sched() before getting here, so we're guaranteed | ||
| 3902 | * nobody will look up these exact classes -- they're properly dead but still | ||
| 3903 | * allocated. | ||
| 3904 | */ | ||
| 3890 | void lockdep_free_key_range(void *start, unsigned long size) | 3905 | void lockdep_free_key_range(void *start, unsigned long size) | 
| 3891 | { | 3906 | { | 
| 3892 | struct lock_class *class, *next; | 3907 | struct lock_class *class; | 
| 3893 | struct list_head *head; | 3908 | struct list_head *head; | 
| 3894 | unsigned long flags; | 3909 | unsigned long flags; | 
| 3895 | int i; | 3910 | int i; | 
| @@ -3905,7 +3920,7 @@ void lockdep_free_key_range(void *start, unsigned long size) | |||
| 3905 | head = classhash_table + i; | 3920 | head = classhash_table + i; | 
| 3906 | if (list_empty(head)) | 3921 | if (list_empty(head)) | 
| 3907 | continue; | 3922 | continue; | 
| 3908 | list_for_each_entry_safe(class, next, head, hash_entry) { | 3923 | list_for_each_entry_rcu(class, head, hash_entry) { | 
| 3909 | if (within(class->key, start, size)) | 3924 | if (within(class->key, start, size)) | 
| 3910 | zap_class(class); | 3925 | zap_class(class); | 
| 3911 | else if (within(class->name, start, size)) | 3926 | else if (within(class->name, start, size)) | 
| @@ -3916,11 +3931,25 @@ void lockdep_free_key_range(void *start, unsigned long size) | |||
| 3916 | if (locked) | 3931 | if (locked) | 
| 3917 | graph_unlock(); | 3932 | graph_unlock(); | 
| 3918 | raw_local_irq_restore(flags); | 3933 | raw_local_irq_restore(flags); | 
| 3934 | |||
| 3935 | /* | ||
| 3936 | * Wait for any possible iterators from look_up_lock_class() to pass | ||
| 3937 | * before continuing to free the memory they refer to. | ||
| 3938 | * | ||
| 3939 | * sync_sched() is sufficient because the read-side has IRQs disabled. | ||
| 3940 | */ | ||
| 3941 | synchronize_sched(); | ||
| 3942 | |||
| 3943 | /* | ||
| 3944 | * XXX at this point we could return the resources to the pool; | ||
| 3945 | * instead we leak them. We would need to change to bitmap allocators | ||
| 3946 | * instead of the linear allocators we have now. | ||
| 3947 | */ | ||
| 3919 | } | 3948 | } | 
| 3920 | 3949 | ||
| 3921 | void lockdep_reset_lock(struct lockdep_map *lock) | 3950 | void lockdep_reset_lock(struct lockdep_map *lock) | 
| 3922 | { | 3951 | { | 
| 3923 | struct lock_class *class, *next; | 3952 | struct lock_class *class; | 
| 3924 | struct list_head *head; | 3953 | struct list_head *head; | 
| 3925 | unsigned long flags; | 3954 | unsigned long flags; | 
| 3926 | int i, j; | 3955 | int i, j; | 
| @@ -3948,7 +3977,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) | |||
| 3948 | head = classhash_table + i; | 3977 | head = classhash_table + i; | 
| 3949 | if (list_empty(head)) | 3978 | if (list_empty(head)) | 
| 3950 | continue; | 3979 | continue; | 
| 3951 | list_for_each_entry_safe(class, next, head, hash_entry) { | 3980 | list_for_each_entry_rcu(class, head, hash_entry) { | 
| 3952 | int match = 0; | 3981 | int match = 0; | 
| 3953 | 3982 | ||
| 3954 | for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) | 3983 | for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) | 
| diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index d1fe2ba5bac9..75e114bdf3f2 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h | |||
| @@ -78,7 +78,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) | |||
| 78 | */ | 78 | */ | 
| 79 | return; | 79 | return; | 
| 80 | } | 80 | } | 
| 81 | ACCESS_ONCE(prev->next) = node; | 81 | WRITE_ONCE(prev->next, node); | 
| 82 | 82 | ||
| 83 | /* Wait until the lock holder passes the lock down. */ | 83 | /* Wait until the lock holder passes the lock down. */ | 
| 84 | arch_mcs_spin_lock_contended(&node->locked); | 84 | arch_mcs_spin_lock_contended(&node->locked); | 
| @@ -91,7 +91,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) | |||
| 91 | static inline | 91 | static inline | 
| 92 | void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) | 92 | void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) | 
| 93 | { | 93 | { | 
| 94 | struct mcs_spinlock *next = ACCESS_ONCE(node->next); | 94 | struct mcs_spinlock *next = READ_ONCE(node->next); | 
| 95 | 95 | ||
| 96 | if (likely(!next)) { | 96 | if (likely(!next)) { | 
| 97 | /* | 97 | /* | 
| @@ -100,7 +100,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) | |||
| 100 | if (likely(cmpxchg(lock, node, NULL) == node)) | 100 | if (likely(cmpxchg(lock, node, NULL) == node)) | 
| 101 | return; | 101 | return; | 
| 102 | /* Wait until the next pointer is set */ | 102 | /* Wait until the next pointer is set */ | 
| 103 | while (!(next = ACCESS_ONCE(node->next))) | 103 | while (!(next = READ_ONCE(node->next))) | 
| 104 | cpu_relax_lowlatency(); | 104 | cpu_relax_lowlatency(); | 
| 105 | } | 105 | } | 
| 106 | 106 | ||
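The ACCESS_ONCE() conversions here (and in mutex.c, osq_lock.c and rwsem-xadd.c below) switch to the READ_ONCE()/WRITE_ONCE() helpers from <linux/compiler.h>, which make the direction of the access explicit and do not depend on a volatile cast that compilers may mishandle for non-scalar types. The mechanical translation, shown on the mcs_spinlock pointers used above:

/* Before: one cast-based macro for both loads and stores. */
struct mcs_spinlock *next = ACCESS_ONCE(node->next);
ACCESS_ONCE(prev->next) = node;

/* After: explicit one-way helpers. */
struct mcs_spinlock *next = READ_ONCE(node->next);
WRITE_ONCE(prev->next, node);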
| diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 94674e5919cb..4cccea6b8934 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c | |||
| @@ -25,7 +25,7 @@ | |||
| 25 | #include <linux/spinlock.h> | 25 | #include <linux/spinlock.h> | 
| 26 | #include <linux/interrupt.h> | 26 | #include <linux/interrupt.h> | 
| 27 | #include <linux/debug_locks.h> | 27 | #include <linux/debug_locks.h> | 
| 28 | #include "mcs_spinlock.h" | 28 | #include <linux/osq_lock.h> | 
| 29 | 29 | ||
| 30 | /* | 30 | /* | 
| 31 | * In the DEBUG case we are using the "NULL fastpath" for mutexes, | 31 | * In the DEBUG case we are using the "NULL fastpath" for mutexes, | 
| @@ -217,44 +217,35 @@ ww_mutex_set_context_slowpath(struct ww_mutex *lock, | |||
| 217 | } | 217 | } | 
| 218 | 218 | ||
| 219 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 219 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 
| 220 | static inline bool owner_running(struct mutex *lock, struct task_struct *owner) | ||
| 221 | { | ||
| 222 | if (lock->owner != owner) | ||
| 223 | return false; | ||
| 224 | |||
| 225 | /* | ||
| 226 | * Ensure we emit the owner->on_cpu, dereference _after_ checking | ||
| 227 | * lock->owner still matches owner, if that fails, owner might | ||
| 228 | * point to free()d memory, if it still matches, the rcu_read_lock() | ||
| 229 | * ensures the memory stays valid. | ||
| 230 | */ | ||
| 231 | barrier(); | ||
| 232 | |||
| 233 | return owner->on_cpu; | ||
| 234 | } | ||
| 235 | |||
| 236 | /* | 220 | /* | 
| 237 | * Look out! "owner" is an entirely speculative pointer | 221 | * Look out! "owner" is an entirely speculative pointer | 
| 238 | * access and not reliable. | 222 | * access and not reliable. | 
| 239 | */ | 223 | */ | 
| 240 | static noinline | 224 | static noinline | 
| 241 | int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) | 225 | bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) | 
| 242 | { | 226 | { | 
| 227 | bool ret = true; | ||
| 228 | |||
| 243 | rcu_read_lock(); | 229 | rcu_read_lock(); | 
| 244 | while (owner_running(lock, owner)) { | 230 | while (lock->owner == owner) { | 
| 245 | if (need_resched()) | 231 | /* | 
| 232 | * Ensure we emit the owner->on_cpu, dereference _after_ | ||
| 233 | * checking lock->owner still matches owner. If that fails, | ||
| 234 | * owner might point to freed memory. If it still matches, | ||
| 235 | * the rcu_read_lock() ensures the memory stays valid. | ||
| 236 | */ | ||
| 237 | barrier(); | ||
| 238 | |||
| 239 | if (!owner->on_cpu || need_resched()) { | ||
| 240 | ret = false; | ||
| 246 | break; | 241 | break; | 
| 242 | } | ||
| 247 | 243 | ||
| 248 | cpu_relax_lowlatency(); | 244 | cpu_relax_lowlatency(); | 
| 249 | } | 245 | } | 
| 250 | rcu_read_unlock(); | 246 | rcu_read_unlock(); | 
| 251 | 247 | ||
| 252 | /* | 248 | return ret; | 
| 253 | * We break out the loop above on need_resched() and when the | ||
| 254 | * owner changed, which is a sign for heavy contention. Return | ||
| 255 | * success only when lock->owner is NULL. | ||
| 256 | */ | ||
| 257 | return lock->owner == NULL; | ||
| 258 | } | 249 | } | 
| 259 | 250 | ||
| 260 | /* | 251 | /* | 
| @@ -269,7 +260,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) | |||
| 269 | return 0; | 260 | return 0; | 
| 270 | 261 | ||
| 271 | rcu_read_lock(); | 262 | rcu_read_lock(); | 
| 272 | owner = ACCESS_ONCE(lock->owner); | 263 | owner = READ_ONCE(lock->owner); | 
| 273 | if (owner) | 264 | if (owner) | 
| 274 | retval = owner->on_cpu; | 265 | retval = owner->on_cpu; | 
| 275 | rcu_read_unlock(); | 266 | rcu_read_unlock(); | 
| @@ -343,7 +334,7 @@ static bool mutex_optimistic_spin(struct mutex *lock, | |||
| 343 | * As such, when deadlock detection needs to be | 334 | * As such, when deadlock detection needs to be | 
| 344 | * performed the optimistic spinning cannot be done. | 335 | * performed the optimistic spinning cannot be done. | 
| 345 | */ | 336 | */ | 
| 346 | if (ACCESS_ONCE(ww->ctx)) | 337 | if (READ_ONCE(ww->ctx)) | 
| 347 | break; | 338 | break; | 
| 348 | } | 339 | } | 
| 349 | 340 | ||
| @@ -351,7 +342,7 @@ static bool mutex_optimistic_spin(struct mutex *lock, | |||
| 351 | * If there's an owner, wait for it to either | 342 | * If there's an owner, wait for it to either | 
| 352 | * release the lock or go to sleep. | 343 | * release the lock or go to sleep. | 
| 353 | */ | 344 | */ | 
| 354 | owner = ACCESS_ONCE(lock->owner); | 345 | owner = READ_ONCE(lock->owner); | 
| 355 | if (owner && !mutex_spin_on_owner(lock, owner)) | 346 | if (owner && !mutex_spin_on_owner(lock, owner)) | 
| 356 | break; | 347 | break; | 
| 357 | 348 | ||
| @@ -490,7 +481,7 @@ static inline int __sched | |||
| 490 | __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) | 481 | __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) | 
| 491 | { | 482 | { | 
| 492 | struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); | 483 | struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); | 
| 493 | struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); | 484 | struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); | 
| 494 | 485 | ||
| 495 | if (!hold_ctx) | 486 | if (!hold_ctx) | 
| 496 | return 0; | 487 | return 0; | 
| diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index c112d00341b0..dc85ee23a26f 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c | |||
| @@ -98,7 +98,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) | |||
| 98 | 98 | ||
| 99 | prev = decode_cpu(old); | 99 | prev = decode_cpu(old); | 
| 100 | node->prev = prev; | 100 | node->prev = prev; | 
| 101 | ACCESS_ONCE(prev->next) = node; | 101 | WRITE_ONCE(prev->next, node); | 
| 102 | 102 | ||
| 103 | /* | 103 | /* | 
| 104 | * Normally @prev is untouchable after the above store; because at that | 104 | * Normally @prev is untouchable after the above store; because at that | 
| @@ -109,7 +109,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) | |||
| 109 | * cmpxchg in an attempt to undo our queueing. | 109 | * cmpxchg in an attempt to undo our queueing. | 
| 110 | */ | 110 | */ | 
| 111 | 111 | ||
| 112 | while (!ACCESS_ONCE(node->locked)) { | 112 | while (!READ_ONCE(node->locked)) { | 
| 113 | /* | 113 | /* | 
| 114 | * If we need to reschedule bail... so we can block. | 114 | * If we need to reschedule bail... so we can block. | 
| 115 | */ | 115 | */ | 
| @@ -148,7 +148,7 @@ unqueue: | |||
| 148 | * Or we race against a concurrent unqueue()'s step-B, in which | 148 | * Or we race against a concurrent unqueue()'s step-B, in which | 
| 149 | * case its step-C will write us a new @node->prev pointer. | 149 | * case its step-C will write us a new @node->prev pointer. | 
| 150 | */ | 150 | */ | 
| 151 | prev = ACCESS_ONCE(node->prev); | 151 | prev = READ_ONCE(node->prev); | 
| 152 | } | 152 | } | 
| 153 | 153 | ||
| 154 | /* | 154 | /* | 
| @@ -170,8 +170,8 @@ unqueue: | |||
| 170 | * it will wait in Step-A. | 170 | * it will wait in Step-A. | 
| 171 | */ | 171 | */ | 
| 172 | 172 | ||
| 173 | ACCESS_ONCE(next->prev) = prev; | 173 | WRITE_ONCE(next->prev, prev); | 
| 174 | ACCESS_ONCE(prev->next) = next; | 174 | WRITE_ONCE(prev->next, next); | 
| 175 | 175 | ||
| 176 | return false; | 176 | return false; | 
| 177 | } | 177 | } | 
| @@ -193,11 +193,11 @@ void osq_unlock(struct optimistic_spin_queue *lock) | |||
| 193 | node = this_cpu_ptr(&osq_node); | 193 | node = this_cpu_ptr(&osq_node); | 
| 194 | next = xchg(&node->next, NULL); | 194 | next = xchg(&node->next, NULL); | 
| 195 | if (next) { | 195 | if (next) { | 
| 196 | ACCESS_ONCE(next->locked) = 1; | 196 | WRITE_ONCE(next->locked, 1); | 
| 197 | return; | 197 | return; | 
| 198 | } | 198 | } | 
| 199 | 199 | ||
| 200 | next = osq_wait_next(lock, node, NULL); | 200 | next = osq_wait_next(lock, node, NULL); | 
| 201 | if (next) | 201 | if (next) | 
| 202 | ACCESS_ONCE(next->locked) = 1; | 202 | WRITE_ONCE(next->locked, 1); | 
| 203 | } | 203 | } | 
| diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 6357265a31ad..b73279367087 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c | |||
| @@ -349,7 +349,7 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) | |||
| 349 | * | 349 | * | 
| 350 | * @task: the task owning the mutex (owner) for which a chain walk is | 350 | * @task: the task owning the mutex (owner) for which a chain walk is | 
| 351 | * probably needed | 351 | * probably needed | 
| 352 | * @deadlock_detect: do we have to carry out deadlock detection? | 352 | * @chwalk: do we have to carry out deadlock detection? | 
| 353 | * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck | 353 | * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck | 
| 354 | * things for a task that has just got its priority adjusted, and | 354 | * things for a task that has just got its priority adjusted, and | 
| 355 | * is waiting on a mutex) | 355 | * is waiting on a mutex) | 
| diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 2555ae15ec14..3a5048572065 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c | |||
| @@ -85,6 +85,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) | |||
| 85 | 85 | ||
| 86 | list_del(&waiter->list); | 86 | list_del(&waiter->list); | 
| 87 | tsk = waiter->task; | 87 | tsk = waiter->task; | 
| 88 | /* | ||
| 89 | * Make sure we do not wakeup the next reader before | ||
| 90 | * setting the nil condition to grant the next reader; | ||
| 91 | * otherwise we could miss the wakeup on the other | ||
| 92 | * side and end up sleeping again. See the pairing | ||
| 93 | * in rwsem_down_read_failed(). | ||
| 94 | */ | ||
| 88 | smp_mb(); | 95 | smp_mb(); | 
| 89 | waiter->task = NULL; | 96 | waiter->task = NULL; | 
| 90 | wake_up_process(tsk); | 97 | wake_up_process(tsk); | 
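Both copies of this new comment (here and in rwsem-xadd.c below) point at the sleeper in rwsem_down_read_failed(), which does not appear in this diff. Paraphrased, not the exact upstream body, the wait loop has roughly the shape below: the waker's smp_mb() before the waiter->task = NULL store guarantees the lock grant is visible before the sleeper can observe NULL, break out, and proceed without waiting for wake_up_process().

/*
 * Waiter side, paraphrased from rwsem_down_read_failed(): tsk is current,
 * waiter is the on-stack rwsem_waiter queued on sem->wait_list.
 */
while (true) {
	set_task_state(tsk, TASK_UNINTERRUPTIBLE);
	if (!waiter.task)	/* cleared by the waker only after the grant */
		break;
	schedule();
}
__set_task_state(tsk, TASK_RUNNING);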
| diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 2f7cc4076f50..3417d0172a5d 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c | |||
| @@ -14,8 +14,9 @@ | |||
| 14 | #include <linux/init.h> | 14 | #include <linux/init.h> | 
| 15 | #include <linux/export.h> | 15 | #include <linux/export.h> | 
| 16 | #include <linux/sched/rt.h> | 16 | #include <linux/sched/rt.h> | 
| 17 | #include <linux/osq_lock.h> | ||
| 17 | 18 | ||
| 18 | #include "mcs_spinlock.h" | 19 | #include "rwsem.h" | 
| 19 | 20 | ||
| 20 | /* | 21 | /* | 
| 21 | * Guide to the rw_semaphore's count field for common values. | 22 | * Guide to the rw_semaphore's count field for common values. | 
| @@ -186,6 +187,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) | |||
| 186 | waiter = list_entry(next, struct rwsem_waiter, list); | 187 | waiter = list_entry(next, struct rwsem_waiter, list); | 
| 187 | next = waiter->list.next; | 188 | next = waiter->list.next; | 
| 188 | tsk = waiter->task; | 189 | tsk = waiter->task; | 
| 190 | /* | ||
| 191 | * Make sure we do not wakeup the next reader before | ||
| 192 | * setting the nil condition to grant the next reader; | ||
| 193 | * otherwise we could miss the wakeup on the other | ||
| 194 | * side and end up sleeping again. See the pairing | ||
| 195 | * in rwsem_down_read_failed(). | ||
| 196 | */ | ||
| 189 | smp_mb(); | 197 | smp_mb(); | 
| 190 | waiter->task = NULL; | 198 | waiter->task = NULL; | 
| 191 | wake_up_process(tsk); | 199 | wake_up_process(tsk); | 
| @@ -258,6 +266,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) | |||
| 258 | RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { | 266 | RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { | 
| 259 | if (!list_is_singular(&sem->wait_list)) | 267 | if (!list_is_singular(&sem->wait_list)) | 
| 260 | rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); | 268 | rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); | 
| 269 | rwsem_set_owner(sem); | ||
| 261 | return true; | 270 | return true; | 
| 262 | } | 271 | } | 
| 263 | 272 | ||
| @@ -270,15 +279,17 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) | |||
| 270 | */ | 279 | */ | 
| 271 | static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) | 280 | static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) | 
| 272 | { | 281 | { | 
| 273 | long old, count = ACCESS_ONCE(sem->count); | 282 | long old, count = READ_ONCE(sem->count); | 
| 274 | 283 | ||
| 275 | while (true) { | 284 | while (true) { | 
| 276 | if (!(count == 0 || count == RWSEM_WAITING_BIAS)) | 285 | if (!(count == 0 || count == RWSEM_WAITING_BIAS)) | 
| 277 | return false; | 286 | return false; | 
| 278 | 287 | ||
| 279 | old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); | 288 | old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); | 
| 280 | if (old == count) | 289 | if (old == count) { | 
| 290 | rwsem_set_owner(sem); | ||
| 281 | return true; | 291 | return true; | 
| 292 | } | ||
| 282 | 293 | ||
| 283 | count = old; | 294 | count = old; | 
| 284 | } | 295 | } | 
| @@ -287,60 +298,67 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) | |||
| 287 | static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) | 298 | static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) | 
| 288 | { | 299 | { | 
| 289 | struct task_struct *owner; | 300 | struct task_struct *owner; | 
| 290 | bool on_cpu = false; | 301 | bool ret = true; | 
| 291 | 302 | ||
| 292 | if (need_resched()) | 303 | if (need_resched()) | 
| 293 | return false; | 304 | return false; | 
| 294 | 305 | ||
| 295 | rcu_read_lock(); | 306 | rcu_read_lock(); | 
| 296 | owner = ACCESS_ONCE(sem->owner); | 307 | owner = READ_ONCE(sem->owner); | 
| 297 | if (owner) | 308 | if (!owner) { | 
| 298 | on_cpu = owner->on_cpu; | 309 | long count = READ_ONCE(sem->count); | 
| 299 | rcu_read_unlock(); | 310 | /* | 
| 300 | 311 | * If sem->owner is not set, yet we have just recently entered the | |
| 301 | /* | 312 | * slowpath with the lock being active, then there is a possibility | 
| 302 | * If sem->owner is not set, yet we have just recently entered the | 313 | * reader(s) may have the lock. To be safe, bail spinning in these | 
| 303 | * slowpath, then there is a possibility reader(s) may have the lock. | 314 | * situations. | 
| 304 | * To be safe, avoid spinning in these situations. | 315 | */ | 
| 305 | */ | 316 | if (count & RWSEM_ACTIVE_MASK) | 
| 306 | return on_cpu; | 317 | ret = false; | 
| 307 | } | 318 | goto done; | 
| 308 | 319 | } | |
| 309 | static inline bool owner_running(struct rw_semaphore *sem, | ||
| 310 | struct task_struct *owner) | ||
| 311 | { | ||
| 312 | if (sem->owner != owner) | ||
| 313 | return false; | ||
| 314 | |||
| 315 | /* | ||
| 316 | * Ensure we emit the owner->on_cpu, dereference _after_ checking | ||
| 317 | * sem->owner still matches owner, if that fails, owner might | ||
| 318 | * point to free()d memory, if it still matches, the rcu_read_lock() | ||
| 319 | * ensures the memory stays valid. | ||
| 320 | */ | ||
| 321 | barrier(); | ||
| 322 | 320 | ||
| 323 | return owner->on_cpu; | 321 | ret = owner->on_cpu; | 
| 322 | done: | ||
| 323 | rcu_read_unlock(); | ||
| 324 | return ret; | ||
| 324 | } | 325 | } | 
| 325 | 326 | ||
| 326 | static noinline | 327 | static noinline | 
| 327 | bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) | 328 | bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) | 
| 328 | { | 329 | { | 
| 330 | long count; | ||
| 331 | |||
| 329 | rcu_read_lock(); | 332 | rcu_read_lock(); | 
| 330 | while (owner_running(sem, owner)) { | 333 | while (sem->owner == owner) { | 
| 331 | if (need_resched()) | 334 | /* | 
| 332 | break; | 335 | * Ensure we emit the owner->on_cpu, dereference _after_ | 
| 336 | * checking sem->owner still matches owner, if that fails, | ||
| 337 | * owner might point to free()d memory, if it still matches, | ||
| 338 | * the rcu_read_lock() ensures the memory stays valid. | ||
| 339 | */ | ||
| 340 | barrier(); | ||
| 341 | |||
| 342 | /* abort spinning when need_resched or owner is not running */ | ||
| 343 | if (!owner->on_cpu || need_resched()) { | ||
| 344 | rcu_read_unlock(); | ||
| 345 | return false; | ||
| 346 | } | ||
| 333 | 347 | ||
| 334 | cpu_relax_lowlatency(); | 348 | cpu_relax_lowlatency(); | 
| 335 | } | 349 | } | 
| 336 | rcu_read_unlock(); | 350 | rcu_read_unlock(); | 
| 337 | 351 | ||
| 352 | if (READ_ONCE(sem->owner)) | ||
| 353 | return true; /* new owner, continue spinning */ | ||
| 354 | |||
| 338 | /* | 355 | /* | 
| 339 | * We break out the loop above on need_resched() or when the | 356 | * When the owner is not set, the lock could be free or | 
| 340 | * owner changed, which is a sign for heavy contention. Return | 357 | * held by readers. Check the counter to verify the | 
| 341 | * success only when sem->owner is NULL. | 358 | * state. | 
| 342 | */ | 359 | */ | 
| 343 | return sem->owner == NULL; | 360 | count = READ_ONCE(sem->count); | 
| 361 | return (count == 0 || count == RWSEM_WAITING_BIAS); | ||
| 344 | } | 362 | } | 
| 345 | 363 | ||
| 346 | static bool rwsem_optimistic_spin(struct rw_semaphore *sem) | 364 | static bool rwsem_optimistic_spin(struct rw_semaphore *sem) | 
| @@ -358,7 +376,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) | |||
| 358 | goto done; | 376 | goto done; | 
| 359 | 377 | ||
| 360 | while (true) { | 378 | while (true) { | 
| 361 | owner = ACCESS_ONCE(sem->owner); | 379 | owner = READ_ONCE(sem->owner); | 
| 362 | if (owner && !rwsem_spin_on_owner(sem, owner)) | 380 | if (owner && !rwsem_spin_on_owner(sem, owner)) | 
| 363 | break; | 381 | break; | 
| 364 | 382 | ||
| @@ -432,7 +450,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) | |||
| 432 | 450 | ||
| 433 | /* we're now waiting on the lock, but no longer actively locking */ | 451 | /* we're now waiting on the lock, but no longer actively locking */ | 
| 434 | if (waiting) { | 452 | if (waiting) { | 
| 435 | count = ACCESS_ONCE(sem->count); | 453 | count = READ_ONCE(sem->count); | 
| 436 | 454 | ||
| 437 | /* | 455 | /* | 
| 438 | * If there were already threads queued before us and there are | 456 | * If there were already threads queued before us and there are | 
| diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index e2d3bc7f03b4..205be0ce34de 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c | |||
| @@ -9,29 +9,9 @@ | |||
| 9 | #include <linux/sched.h> | 9 | #include <linux/sched.h> | 
| 10 | #include <linux/export.h> | 10 | #include <linux/export.h> | 
| 11 | #include <linux/rwsem.h> | 11 | #include <linux/rwsem.h> | 
| 12 | |||
| 13 | #include <linux/atomic.h> | 12 | #include <linux/atomic.h> | 
| 14 | 13 | ||
| 15 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER | 14 | #include "rwsem.h" | 
| 16 | static inline void rwsem_set_owner(struct rw_semaphore *sem) | ||
| 17 | { | ||
| 18 | sem->owner = current; | ||
| 19 | } | ||
| 20 | |||
| 21 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) | ||
| 22 | { | ||
| 23 | sem->owner = NULL; | ||
| 24 | } | ||
| 25 | |||
| 26 | #else | ||
| 27 | static inline void rwsem_set_owner(struct rw_semaphore *sem) | ||
| 28 | { | ||
| 29 | } | ||
| 30 | |||
| 31 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) | ||
| 32 | { | ||
| 33 | } | ||
| 34 | #endif | ||
| 35 | 15 | ||
| 36 | /* | 16 | /* | 
| 37 | * lock for reading | 17 | * lock for reading | 
| diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h new file mode 100644 index 000000000000..870ed9a5b426 --- /dev/null +++ b/kernel/locking/rwsem.h | |||
| @@ -0,0 +1,20 @@ | |||
| 1 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER | ||
| 2 | static inline void rwsem_set_owner(struct rw_semaphore *sem) | ||
| 3 | { | ||
| 4 | sem->owner = current; | ||
| 5 | } | ||
| 6 | |||
| 7 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) | ||
| 8 | { | ||
| 9 | sem->owner = NULL; | ||
| 10 | } | ||
| 11 | |||
| 12 | #else | ||
| 13 | static inline void rwsem_set_owner(struct rw_semaphore *sem) | ||
| 14 | { | ||
| 15 | } | ||
| 16 | |||
| 17 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) | ||
| 18 | { | ||
| 19 | } | ||
| 20 | #endif | ||
| diff --git a/kernel/module.c b/kernel/module.c index b3d634ed06c9..650b038ae520 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -1865,7 +1865,7 @@ static void free_module(struct module *mod) | |||
| 1865 | kfree(mod->args); | 1865 | kfree(mod->args); | 
| 1866 | percpu_modfree(mod); | 1866 | percpu_modfree(mod); | 
| 1867 | 1867 | ||
| 1868 | /* Free lock-classes: */ | 1868 | /* Free lock-classes; relies on the preceding sync_rcu(). */ | 
| 1869 | lockdep_free_key_range(mod->module_core, mod->core_size); | 1869 | lockdep_free_key_range(mod->module_core, mod->core_size); | 
| 1870 | 1870 | ||
| 1871 | /* Finally, free the core (containing the module structure) */ | 1871 | /* Finally, free the core (containing the module structure) */ | 
| @@ -2479,6 +2479,23 @@ static int elf_header_check(struct load_info *info) | |||
| 2479 | return 0; | 2479 | return 0; | 
| 2480 | } | 2480 | } | 
| 2481 | 2481 | ||
| 2482 | #define COPY_CHUNK_SIZE (16*PAGE_SIZE) | ||
| 2483 | |||
| 2484 | static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len) | ||
| 2485 | { | ||
| 2486 | do { | ||
| 2487 | unsigned long n = min(len, COPY_CHUNK_SIZE); | ||
| 2488 | |||
| 2489 | if (copy_from_user(dst, usrc, n) != 0) | ||
| 2490 | return -EFAULT; | ||
| 2491 | cond_resched(); | ||
| 2492 | dst += n; | ||
| 2493 | usrc += n; | ||
| 2494 | len -= n; | ||
| 2495 | } while (len); | ||
| 2496 | return 0; | ||
| 2497 | } | ||
| 2498 | |||
| 2482 | /* Sets info->hdr and info->len. */ | 2499 | /* Sets info->hdr and info->len. */ | 
| 2483 | static int copy_module_from_user(const void __user *umod, unsigned long len, | 2500 | static int copy_module_from_user(const void __user *umod, unsigned long len, | 
| 2484 | struct load_info *info) | 2501 | struct load_info *info) | 
| @@ -2498,7 +2515,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, | |||
| 2498 | if (!info->hdr) | 2515 | if (!info->hdr) | 
| 2499 | return -ENOMEM; | 2516 | return -ENOMEM; | 
| 2500 | 2517 | ||
| 2501 | if (copy_from_user(info->hdr, umod, info->len) != 0) { | 2518 | if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) { | 
| 2502 | vfree(info->hdr); | 2519 | vfree(info->hdr); | 
| 2503 | return -EFAULT; | 2520 | return -EFAULT; | 
| 2504 | } | 2521 | } | 
| @@ -2753,6 +2770,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) | |||
| 2753 | mod->trace_events = section_objs(info, "_ftrace_events", | 2770 | mod->trace_events = section_objs(info, "_ftrace_events", | 
| 2754 | sizeof(*mod->trace_events), | 2771 | sizeof(*mod->trace_events), | 
| 2755 | &mod->num_trace_events); | 2772 | &mod->num_trace_events); | 
| 2773 | mod->trace_enums = section_objs(info, "_ftrace_enum_map", | ||
| 2774 | sizeof(*mod->trace_enums), | ||
| 2775 | &mod->num_trace_enums); | ||
| 2756 | #endif | 2776 | #endif | 
| 2757 | #ifdef CONFIG_TRACING | 2777 | #ifdef CONFIG_TRACING | 
| 2758 | mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", | 2778 | mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", | 
| @@ -3349,9 +3369,6 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
| 3349 | module_bug_cleanup(mod); | 3369 | module_bug_cleanup(mod); | 
| 3350 | mutex_unlock(&module_mutex); | 3370 | mutex_unlock(&module_mutex); | 
| 3351 | 3371 | ||
| 3352 | /* Free lock-classes: */ | ||
| 3353 | lockdep_free_key_range(mod->module_core, mod->core_size); | ||
| 3354 | |||
| 3355 | /* we can't deallocate the module until we clear memory protection */ | 3372 | /* we can't deallocate the module until we clear memory protection */ | 
| 3356 | unset_module_init_ro_nx(mod); | 3373 | unset_module_init_ro_nx(mod); | 
| 3357 | unset_module_core_ro_nx(mod); | 3374 | unset_module_core_ro_nx(mod); | 
| @@ -3375,6 +3392,9 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
| 3375 | synchronize_rcu(); | 3392 | synchronize_rcu(); | 
| 3376 | mutex_unlock(&module_mutex); | 3393 | mutex_unlock(&module_mutex); | 
| 3377 | free_module: | 3394 | free_module: | 
| 3395 | /* Free lock-classes; relies on the preceding sync_rcu() */ | ||
| 3396 | lockdep_free_key_range(mod->module_core, mod->core_size); | ||
| 3397 | |||
| 3378 | module_deallocate(mod, info); | 3398 | module_deallocate(mod, info); | 
| 3379 | free_copy: | 3399 | free_copy: | 
| 3380 | free_copy(info); | 3400 | free_copy(info); | 
| diff --git a/kernel/power/main.c b/kernel/power/main.c index 9a59d042ea84..86e8157a450f 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
| @@ -11,7 +11,7 @@ | |||
| 11 | #include <linux/export.h> | 11 | #include <linux/export.h> | 
| 12 | #include <linux/kobject.h> | 12 | #include <linux/kobject.h> | 
| 13 | #include <linux/string.h> | 13 | #include <linux/string.h> | 
| 14 | #include <linux/resume-trace.h> | 14 | #include <linux/pm-trace.h> | 
| 15 | #include <linux/workqueue.h> | 15 | #include <linux/workqueue.h> | 
| 16 | #include <linux/debugfs.h> | 16 | #include <linux/debugfs.h> | 
| 17 | #include <linux/seq_file.h> | 17 | #include <linux/seq_file.h> | 
| diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index c24d5a23bf93..5235dd4e1e2f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
| @@ -955,25 +955,6 @@ static void mark_nosave_pages(struct memory_bitmap *bm) | |||
| 955 | } | 955 | } | 
| 956 | } | 956 | } | 
| 957 | 957 | ||
| 958 | static bool is_nosave_page(unsigned long pfn) | ||
| 959 | { | ||
| 960 | struct nosave_region *region; | ||
| 961 | |||
| 962 | list_for_each_entry(region, &nosave_regions, list) { | ||
| 963 | if (pfn >= region->start_pfn && pfn < region->end_pfn) { | ||
| 964 | pr_err("PM: %#010llx in e820 nosave region: " | ||
| 965 | "[mem %#010llx-%#010llx]\n", | ||
| 966 | (unsigned long long) pfn << PAGE_SHIFT, | ||
| 967 | (unsigned long long) region->start_pfn << PAGE_SHIFT, | ||
| 968 | ((unsigned long long) region->end_pfn << PAGE_SHIFT) | ||
| 969 | - 1); | ||
| 970 | return true; | ||
| 971 | } | ||
| 972 | } | ||
| 973 | |||
| 974 | return false; | ||
| 975 | } | ||
| 976 | |||
| 977 | /** | 958 | /** | 
| 978 | * create_basic_memory_bitmaps - create bitmaps needed for marking page | 959 | * create_basic_memory_bitmaps - create bitmaps needed for marking page | 
| 979 | * frames that should not be saved and free page frames. The pointers | 960 | * frames that should not be saved and free page frames. The pointers | 
| @@ -2042,7 +2023,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) | |||
| 2042 | do { | 2023 | do { | 
| 2043 | pfn = memory_bm_next_pfn(bm); | 2024 | pfn = memory_bm_next_pfn(bm); | 
| 2044 | if (likely(pfn != BM_END_OF_MAP)) { | 2025 | if (likely(pfn != BM_END_OF_MAP)) { | 
| 2045 | if (likely(pfn_valid(pfn)) && !is_nosave_page(pfn)) | 2026 | if (likely(pfn_valid(pfn))) | 
| 2046 | swsusp_set_page_free(pfn_to_page(pfn)); | 2027 | swsusp_set_page_free(pfn_to_page(pfn)); | 
| 2047 | else | 2028 | else | 
| 2048 | return -EFAULT; | 2029 | return -EFAULT; | 
| diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index b7d6b3a721b1..8d7a1ef72758 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
| @@ -28,6 +28,7 @@ | |||
| 28 | #include <linux/ftrace.h> | 28 | #include <linux/ftrace.h> | 
| 29 | #include <trace/events/power.h> | 29 | #include <trace/events/power.h> | 
| 30 | #include <linux/compiler.h> | 30 | #include <linux/compiler.h> | 
| 31 | #include <linux/moduleparam.h> | ||
| 31 | 32 | ||
| 32 | #include "power.h" | 33 | #include "power.h" | 
| 33 | 34 | ||
| @@ -233,12 +234,20 @@ static bool platform_suspend_again(suspend_state_t state) | |||
| 233 | suspend_ops->suspend_again() : false; | 234 | suspend_ops->suspend_again() : false; | 
| 234 | } | 235 | } | 
| 235 | 236 | ||
| 237 | #ifdef CONFIG_PM_DEBUG | ||
| 238 | static unsigned int pm_test_delay = 5; | ||
| 239 | module_param(pm_test_delay, uint, 0644); | ||
| 240 | MODULE_PARM_DESC(pm_test_delay, | ||
| 241 | "Number of seconds to wait before resuming from suspend test"); | ||
| 242 | #endif | ||
| 243 | |||
| 236 | static int suspend_test(int level) | 244 | static int suspend_test(int level) | 
| 237 | { | 245 | { | 
| 238 | #ifdef CONFIG_PM_DEBUG | 246 | #ifdef CONFIG_PM_DEBUG | 
| 239 | if (pm_test_level == level) { | 247 | if (pm_test_level == level) { | 
| 240 | printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n"); | 248 | printk(KERN_INFO "suspend debug: Waiting for %d second(s).\n", | 
| 241 | mdelay(5000); | 249 | pm_test_delay); | 
| 250 | mdelay(pm_test_delay * 1000); | ||
| 242 | return 1; | 251 | return 1; | 
| 243 | } | 252 | } | 
| 244 | #endif /* !CONFIG_PM_DEBUG */ | 253 | #endif /* !CONFIG_PM_DEBUG */ | 
| diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index bb0635bd74f2..879edfc5ee52 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
| @@ -32,7 +32,6 @@ | |||
| 32 | #include <linux/security.h> | 32 | #include <linux/security.h> | 
| 33 | #include <linux/bootmem.h> | 33 | #include <linux/bootmem.h> | 
| 34 | #include <linux/memblock.h> | 34 | #include <linux/memblock.h> | 
| 35 | #include <linux/aio.h> | ||
| 36 | #include <linux/syscalls.h> | 35 | #include <linux/syscalls.h> | 
| 37 | #include <linux/kexec.h> | 36 | #include <linux/kexec.h> | 
| 38 | #include <linux/kdb.h> | 37 | #include <linux/kdb.h> | 
| @@ -46,6 +45,7 @@ | |||
| 46 | #include <linux/irq_work.h> | 45 | #include <linux/irq_work.h> | 
| 47 | #include <linux/utsname.h> | 46 | #include <linux/utsname.h> | 
| 48 | #include <linux/ctype.h> | 47 | #include <linux/ctype.h> | 
| 48 | #include <linux/uio.h> | ||
| 49 | 49 | ||
| 50 | #include <asm/uaccess.h> | 50 | #include <asm/uaccess.h> | 
| 51 | 51 | ||
| @@ -521,7 +521,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) | |||
| 521 | int i; | 521 | int i; | 
| 522 | int level = default_message_loglevel; | 522 | int level = default_message_loglevel; | 
| 523 | int facility = 1; /* LOG_USER */ | 523 | int facility = 1; /* LOG_USER */ | 
| 524 | size_t len = iocb->ki_nbytes; | 524 | size_t len = iov_iter_count(from); | 
| 525 | ssize_t ret = len; | 525 | ssize_t ret = len; | 
| 526 | 526 | ||
| 527 | if (len > LOG_LINE_MAX) | 527 | if (len > LOG_LINE_MAX) | 
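devkmsg_write() now takes its length from the iov_iter instead of the removed iocb->ki_nbytes field, matching the header switch from <linux/aio.h> to <linux/uio.h>. A generic sketch of consuming a write through an iov_iter is shown below (illustrative only, not the actual devkmsg_write() body; needs <linux/uio.h> and <linux/slab.h>):

static ssize_t example_write(struct kiocb *iocb, struct iov_iter *from)
{
	size_t len = iov_iter_count(from);
	char *buf;

	buf = kmalloc(len + 1, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	if (copy_from_iter(buf, len, from) != len) {
		kfree(buf);
		return -EFAULT;
	}
	buf[len] = '\0';

	/* ... parse and consume buf ... */
	kfree(buf);
	return len;
}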
| diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 30d42aa55d83..8dbe27611ec3 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c | |||
| @@ -853,6 +853,8 @@ rcu_torture_fqs(void *arg) | |||
| 853 | static int | 853 | static int | 
| 854 | rcu_torture_writer(void *arg) | 854 | rcu_torture_writer(void *arg) | 
| 855 | { | 855 | { | 
| 856 | bool can_expedite = !rcu_gp_is_expedited(); | ||
| 857 | int expediting = 0; | ||
| 856 | unsigned long gp_snap; | 858 | unsigned long gp_snap; | 
| 857 | bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; | 859 | bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; | 
| 858 | bool gp_sync1 = gp_sync; | 860 | bool gp_sync1 = gp_sync; | 
| @@ -865,9 +867,15 @@ rcu_torture_writer(void *arg) | |||
| 865 | int nsynctypes = 0; | 867 | int nsynctypes = 0; | 
| 866 | 868 | ||
| 867 | VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); | 869 | VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); | 
| 870 | pr_alert("%s" TORTURE_FLAG | ||
| 871 | " Grace periods expedited from boot/sysfs for %s,\n", | ||
| 872 | torture_type, cur_ops->name); | ||
| 873 | pr_alert("%s" TORTURE_FLAG | ||
| 874 | " Testing of dynamic grace-period expediting disabled.\n", | ||
| 875 | torture_type); | ||
| 868 | 876 | ||
| 869 | /* Initialize synctype[] array. If none set, take default. */ | 877 | /* Initialize synctype[] array. If none set, take default. */ | 
| 870 | if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync) | 878 | if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) | 
| 871 | gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; | 879 | gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; | 
| 872 | if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) | 880 | if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) | 
| 873 | synctype[nsynctypes++] = RTWS_COND_GET; | 881 | synctype[nsynctypes++] = RTWS_COND_GET; | 
| @@ -949,9 +957,26 @@ rcu_torture_writer(void *arg) | |||
| 949 | } | 957 | } | 
| 950 | } | 958 | } | 
| 951 | rcutorture_record_progress(++rcu_torture_current_version); | 959 | rcutorture_record_progress(++rcu_torture_current_version); | 
| 960 | /* Cycle through nesting levels of rcu_expedite_gp() calls. */ | ||
| 961 | if (can_expedite && | ||
| 962 | !(torture_random(&rand) & 0xff & (!!expediting - 1))) { | ||
| 963 | WARN_ON_ONCE(expediting == 0 && rcu_gp_is_expedited()); | ||
| 964 | if (expediting >= 0) | ||
| 965 | rcu_expedite_gp(); | ||
| 966 | else | ||
| 967 | rcu_unexpedite_gp(); | ||
| 968 | if (++expediting > 3) | ||
| 969 | expediting = -expediting; | ||
| 970 | } | ||
| 952 | rcu_torture_writer_state = RTWS_STUTTER; | 971 | rcu_torture_writer_state = RTWS_STUTTER; | 
| 953 | stutter_wait("rcu_torture_writer"); | 972 | stutter_wait("rcu_torture_writer"); | 
| 954 | } while (!torture_must_stop()); | 973 | } while (!torture_must_stop()); | 
| 974 | /* Reset expediting back to unexpedited. */ | ||
| 975 | if (expediting > 0) | ||
| 976 | expediting = -expediting; | ||
| 977 | while (can_expedite && expediting++ < 0) | ||
| 978 | rcu_unexpedite_gp(); | ||
| 979 | WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited()); | ||
| 955 | rcu_torture_writer_state = RTWS_STOPPING; | 980 | rcu_torture_writer_state = RTWS_STOPPING; | 
| 956 | torture_kthread_stopping("rcu_torture_writer"); | 981 | torture_kthread_stopping("rcu_torture_writer"); | 
| 957 | return 0; | 982 | return 0; | 
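The cycling test !(torture_random(&rand) & 0xff & (!!expediting - 1)) added above is compact; a logically equivalent form is sketched below (the original also advances the RNG on every pass, which this form does not):

/*
 * While no expediting cycle is in progress, start one with probability
 * 1/256 per pass; once a cycle is running (expediting != 0), step it on
 * every pass until it wraps back to zero.
 */
bool step;

if (expediting == 0)
	step = (torture_random(&rand) & 0xff) == 0;
else
	step = true;

if (can_expedite && step) {
	/* ... the rcu_expedite_gp()/rcu_unexpedite_gp() stepping above ... */
}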
| diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 445bf8ffe3fb..cad76e76b4e7 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c | |||
| @@ -402,23 +402,6 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, | |||
| 402 | } | 402 | } | 
| 403 | EXPORT_SYMBOL_GPL(call_srcu); | 403 | EXPORT_SYMBOL_GPL(call_srcu); | 
| 404 | 404 | ||
| 405 | struct rcu_synchronize { | ||
| 406 | struct rcu_head head; | ||
| 407 | struct completion completion; | ||
| 408 | }; | ||
| 409 | |||
| 410 | /* | ||
| 411 | * Awaken the corresponding synchronize_srcu() instance now that a | ||
| 412 | * grace period has elapsed. | ||
| 413 | */ | ||
| 414 | static void wakeme_after_rcu(struct rcu_head *head) | ||
| 415 | { | ||
| 416 | struct rcu_synchronize *rcu; | ||
| 417 | |||
| 418 | rcu = container_of(head, struct rcu_synchronize, head); | ||
| 419 | complete(&rcu->completion); | ||
| 420 | } | ||
| 421 | |||
| 422 | static void srcu_advance_batches(struct srcu_struct *sp, int trycount); | 405 | static void srcu_advance_batches(struct srcu_struct *sp, int trycount); | 
| 423 | static void srcu_reschedule(struct srcu_struct *sp); | 406 | static void srcu_reschedule(struct srcu_struct *sp); | 
| 424 | 407 | ||
| @@ -507,7 +490,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) | |||
| 507 | */ | 490 | */ | 
| 508 | void synchronize_srcu(struct srcu_struct *sp) | 491 | void synchronize_srcu(struct srcu_struct *sp) | 
| 509 | { | 492 | { | 
| 510 | __synchronize_srcu(sp, rcu_expedited | 493 | __synchronize_srcu(sp, rcu_gp_is_expedited() | 
| 511 | ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT | 494 | ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT | 
| 512 | : SYNCHRONIZE_SRCU_TRYCOUNT); | 495 | : SYNCHRONIZE_SRCU_TRYCOUNT); | 
| 513 | } | 496 | } | 
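The rcu_synchronize/wakeme_after_rcu pair removed here is the generic wait-for-grace-period-via-completion helper; the removal presumably consolidates it with the identical copy in the common RCU update code rather than changing behaviour. The pattern that __synchronize_srcu() builds on looks roughly like the sketch below, reusing the removed names (wait_for_srcu_gp is a hypothetical wrapper, not an upstream function):

struct rcu_synchronize {
	struct rcu_head head;
	struct completion completion;
};

/* Grace-period callback: wake whoever is waiting on the completion. */
static void wakeme_after_rcu(struct rcu_head *head)
{
	struct rcu_synchronize *rcu =
		container_of(head, struct rcu_synchronize, head);

	complete(&rcu->completion);
}

/* Sketch of waiting for an SRCU grace period via call_srcu(). */
static void wait_for_srcu_gp(struct srcu_struct *sp)
{
	struct rcu_synchronize rcu;

	init_rcu_head_on_stack(&rcu.head);
	init_completion(&rcu.completion);
	call_srcu(sp, &rcu.head, wakeme_after_rcu);
	wait_for_completion(&rcu.completion);
	destroy_rcu_head_on_stack(&rcu.head);
}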
| diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index cc9ceca7bde1..069742d61c68 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c | |||
| @@ -103,8 +103,7 @@ EXPORT_SYMBOL(__rcu_is_watching); | |||
| 103 | static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) | 103 | static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) | 
| 104 | { | 104 | { | 
| 105 | RCU_TRACE(reset_cpu_stall_ticks(rcp)); | 105 | RCU_TRACE(reset_cpu_stall_ticks(rcp)); | 
| 106 | if (rcp->rcucblist != NULL && | 106 | if (rcp->donetail != rcp->curtail) { | 
| 107 | rcp->donetail != rcp->curtail) { | ||
| 108 | rcp->donetail = rcp->curtail; | 107 | rcp->donetail = rcp->curtail; | 
| 109 | return 1; | 108 | return 1; | 
| 110 | } | 109 | } | 
| @@ -169,17 +168,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
| 169 | unsigned long flags; | 168 | unsigned long flags; | 
| 170 | RCU_TRACE(int cb_count = 0); | 169 | RCU_TRACE(int cb_count = 0); | 
| 171 | 170 | ||
| 172 | /* If no RCU callbacks ready to invoke, just return. */ | ||
| 173 | if (&rcp->rcucblist == rcp->donetail) { | ||
| 174 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1)); | ||
| 175 | RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, | ||
| 176 | !!ACCESS_ONCE(rcp->rcucblist), | ||
| 177 | need_resched(), | ||
| 178 | is_idle_task(current), | ||
| 179 | false)); | ||
| 180 | return; | ||
| 181 | } | ||
| 182 | |||
| 183 | /* Move the ready-to-invoke callbacks to a local list. */ | 171 | /* Move the ready-to-invoke callbacks to a local list. */ | 
| 184 | local_irq_save(flags); | 172 | local_irq_save(flags); | 
| 185 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); | 173 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); | 
| diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 48d640ca1a05..233165da782f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
| @@ -91,8 +91,10 @@ static const char *tp_##sname##_varname __used __tracepoint_string = sname##_var | |||
| 91 | 91 | ||
| 92 | #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ | 92 | #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ | 
| 93 | DEFINE_RCU_TPS(sname) \ | 93 | DEFINE_RCU_TPS(sname) \ | 
| 94 | DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \ | ||
| 94 | struct rcu_state sname##_state = { \ | 95 | struct rcu_state sname##_state = { \ | 
| 95 | .level = { &sname##_state.node[0] }, \ | 96 | .level = { &sname##_state.node[0] }, \ | 
| 97 | .rda = &sname##_data, \ | ||
| 96 | .call = cr, \ | 98 | .call = cr, \ | 
| 97 | .fqs_state = RCU_GP_IDLE, \ | 99 | .fqs_state = RCU_GP_IDLE, \ | 
| 98 | .gpnum = 0UL - 300UL, \ | 100 | .gpnum = 0UL - 300UL, \ | 
| @@ -101,11 +103,9 @@ struct rcu_state sname##_state = { \ | |||
| 101 | .orphan_nxttail = &sname##_state.orphan_nxtlist, \ | 103 | .orphan_nxttail = &sname##_state.orphan_nxtlist, \ | 
| 102 | .orphan_donetail = &sname##_state.orphan_donelist, \ | 104 | .orphan_donetail = &sname##_state.orphan_donelist, \ | 
| 103 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ | 105 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ | 
| 104 | .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ | ||
| 105 | .name = RCU_STATE_NAME(sname), \ | 106 | .name = RCU_STATE_NAME(sname), \ | 
| 106 | .abbr = sabbr, \ | 107 | .abbr = sabbr, \ | 
| 107 | }; \ | 108 | } | 
| 108 | DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data) | ||
| 109 | 109 | ||
| 110 | RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); | 110 | RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); | 
| 111 | RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); | 111 | RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); | 
| @@ -152,6 +152,8 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); | |||
| 152 | */ | 152 | */ | 
| 153 | static int rcu_scheduler_fully_active __read_mostly; | 153 | static int rcu_scheduler_fully_active __read_mostly; | 
| 154 | 154 | ||
| 155 | static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); | ||
| 156 | static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); | ||
| 155 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); | 157 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); | 
| 156 | static void invoke_rcu_core(void); | 158 | static void invoke_rcu_core(void); | 
| 157 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | 159 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | 
| @@ -160,6 +162,12 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | |||
| 160 | static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; | 162 | static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; | 
| 161 | module_param(kthread_prio, int, 0644); | 163 | module_param(kthread_prio, int, 0644); | 
| 162 | 164 | ||
| 165 | /* Delay in jiffies for grace-period initialization delays. */ | ||
| 166 | static int gp_init_delay = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT) | ||
| 167 | ? CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY | ||
| 168 | : 0; | ||
| 169 | module_param(gp_init_delay, int, 0644); | ||
| 170 | |||
| 163 | /* | 171 | /* | 
| 164 | * Track the rcutorture test sequence number and the update version | 172 | * Track the rcutorture test sequence number and the update version | 
| 165 | * number within a given test. The rcutorture_testseq is incremented | 173 | * number within a given test. The rcutorture_testseq is incremented | 
| @@ -173,6 +181,17 @@ unsigned long rcutorture_testseq; | |||
| 173 | unsigned long rcutorture_vernum; | 181 | unsigned long rcutorture_vernum; | 
| 174 | 182 | ||
| 175 | /* | 183 | /* | 
| 184 | * Compute the mask of online CPUs for the specified rcu_node structure. | ||
| 185 | * This will not be stable unless the rcu_node structure's ->lock is | ||
| 186 | * held, but the bit corresponding to the current CPU will be stable | ||
| 187 | * in most contexts. | ||
| 188 | */ | ||
| 189 | unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) | ||
| 190 | { | ||
| 191 | return ACCESS_ONCE(rnp->qsmaskinitnext); | ||
| 192 | } | ||
| 193 | |||
| 194 | /* | ||
| 176 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s | 195 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s | 
| 177 | * permit this function to be invoked without holding the root rcu_node | 196 | * permit this function to be invoked without holding the root rcu_node | 
| 178 | * structure's ->lock, but of course results can be subject to change. | 197 | * structure's ->lock, but of course results can be subject to change. | 
| @@ -292,10 +311,10 @@ void rcu_note_context_switch(void) | |||
| 292 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 311 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 
| 293 | 312 | ||
| 294 | /* | 313 | /* | 
| 295 | * Register a quiesecent state for all RCU flavors. If there is an | 314 | * Register a quiescent state for all RCU flavors. If there is an | 
| 296 | * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight | 315 | * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight | 
| 297 | * dyntick-idle quiescent state visible to other CPUs (but only for those | 316 | * dyntick-idle quiescent state visible to other CPUs (but only for those | 
| 298 | * RCU flavors in desparate need of a quiescent state, which will normally | 317 | * RCU flavors in desperate need of a quiescent state, which will normally | 
| 299 | * be none of them). Either way, do a lightweight quiescent state for | 318 | * be none of them). Either way, do a lightweight quiescent state for | 
| 300 | * all RCU flavors. | 319 | * all RCU flavors. | 
| 301 | */ | 320 | */ | 
| @@ -410,6 +429,15 @@ void rcu_bh_force_quiescent_state(void) | |||
| 410 | EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); | 429 | EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); | 
| 411 | 430 | ||
| 412 | /* | 431 | /* | 
| 432 | * Force a quiescent state for RCU-sched. | ||
| 433 | */ | ||
| 434 | void rcu_sched_force_quiescent_state(void) | ||
| 435 | { | ||
| 436 | force_quiescent_state(&rcu_sched_state); | ||
| 437 | } | ||
| 438 | EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); | ||
| 439 | |||
| 440 | /* | ||
| 413 | * Show the state of the grace-period kthreads. | 441 | * Show the state of the grace-period kthreads. | 
| 414 | */ | 442 | */ | 
| 415 | void show_rcu_gp_kthreads(void) | 443 | void show_rcu_gp_kthreads(void) | 
| @@ -483,15 +511,6 @@ void rcutorture_record_progress(unsigned long vernum) | |||
| 483 | EXPORT_SYMBOL_GPL(rcutorture_record_progress); | 511 | EXPORT_SYMBOL_GPL(rcutorture_record_progress); | 
| 484 | 512 | ||
| 485 | /* | 513 | /* | 
| 486 | * Force a quiescent state for RCU-sched. | ||
| 487 | */ | ||
| 488 | void rcu_sched_force_quiescent_state(void) | ||
| 489 | { | ||
| 490 | force_quiescent_state(&rcu_sched_state); | ||
| 491 | } | ||
| 492 | EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); | ||
| 493 | |||
| 494 | /* | ||
| 495 | * Does the CPU have callbacks ready to be invoked? | 514 | * Does the CPU have callbacks ready to be invoked? | 
| 496 | */ | 515 | */ | 
| 497 | static int | 516 | static int | 
| @@ -954,7 +973,7 @@ bool rcu_lockdep_current_cpu_online(void) | |||
| 954 | preempt_disable(); | 973 | preempt_disable(); | 
| 955 | rdp = this_cpu_ptr(&rcu_sched_data); | 974 | rdp = this_cpu_ptr(&rcu_sched_data); | 
| 956 | rnp = rdp->mynode; | 975 | rnp = rdp->mynode; | 
| 957 | ret = (rdp->grpmask & rnp->qsmaskinit) || | 976 | ret = (rdp->grpmask & rcu_rnp_online_cpus(rnp)) || | 
| 958 | !rcu_scheduler_fully_active; | 977 | !rcu_scheduler_fully_active; | 
| 959 | preempt_enable(); | 978 | preempt_enable(); | 
| 960 | return ret; | 979 | return ret; | 
| @@ -1196,9 +1215,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) | |||
| 1196 | } else { | 1215 | } else { | 
| 1197 | j = jiffies; | 1216 | j = jiffies; | 
| 1198 | gpa = ACCESS_ONCE(rsp->gp_activity); | 1217 | gpa = ACCESS_ONCE(rsp->gp_activity); | 
| 1199 | pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n", | 1218 | pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", | 
| 1200 | rsp->name, j - gpa, j, gpa, | 1219 | rsp->name, j - gpa, j, gpa, | 
| 1201 | jiffies_till_next_fqs); | 1220 | jiffies_till_next_fqs, | 
| 1221 | rcu_get_root(rsp)->qsmask); | ||
| 1202 | /* In this case, the current CPU might be at fault. */ | 1222 | /* In this case, the current CPU might be at fault. */ | 
| 1203 | sched_show_task(current); | 1223 | sched_show_task(current); | 
| 1204 | } | 1224 | } | 
| @@ -1328,20 +1348,30 @@ void rcu_cpu_stall_reset(void) | |||
| 1328 | } | 1348 | } | 
| 1329 | 1349 | ||
| 1330 | /* | 1350 | /* | 
| 1331 | * Initialize the specified rcu_data structure's callback list to empty. | 1351 | * Initialize the specified rcu_data structure's default callback list | 
| 1352 | * to empty. The default callback list is the one that is not used by | ||
| 1353 | * no-callbacks CPUs. | ||
| 1332 | */ | 1354 | */ | 
| 1333 | static void init_callback_list(struct rcu_data *rdp) | 1355 | static void init_default_callback_list(struct rcu_data *rdp) | 
| 1334 | { | 1356 | { | 
| 1335 | int i; | 1357 | int i; | 
| 1336 | 1358 | ||
| 1337 | if (init_nocb_callback_list(rdp)) | ||
| 1338 | return; | ||
| 1339 | rdp->nxtlist = NULL; | 1359 | rdp->nxtlist = NULL; | 
| 1340 | for (i = 0; i < RCU_NEXT_SIZE; i++) | 1360 | for (i = 0; i < RCU_NEXT_SIZE; i++) | 
| 1341 | rdp->nxttail[i] = &rdp->nxtlist; | 1361 | rdp->nxttail[i] = &rdp->nxtlist; | 
| 1342 | } | 1362 | } | 
| 1343 | 1363 | ||
| 1344 | /* | 1364 | /* | 
| 1365 | * Initialize the specified rcu_data structure's callback list to empty. | ||
| 1366 | */ | ||
| 1367 | static void init_callback_list(struct rcu_data *rdp) | ||
| 1368 | { | ||
| 1369 | if (init_nocb_callback_list(rdp)) | ||
| 1370 | return; | ||
| 1371 | init_default_callback_list(rdp); | ||
| 1372 | } | ||
| 1373 | |||
| 1374 | /* | ||
| 1345 | * Determine the value that ->completed will have at the end of the | 1375 | * Determine the value that ->completed will have at the end of the | 
| 1346 | * next subsequent grace period. This is used to tag callbacks so that | 1376 | * next subsequent grace period. This is used to tag callbacks so that | 
| 1347 | * a CPU can invoke callbacks in a timely fashion even if that CPU has | 1377 | * a CPU can invoke callbacks in a timely fashion even if that CPU has | 
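The hunk above splits the callback-list setup so that init_default_callback_list() only resets the segmented list, while init_callback_list() keeps the no-CBs check. The underlying data structure is one singly linked list with an array of tail pointers marking segment ends; the sketch below is a user-space model of that idiom. The names (cb, seglist, NSEG) are illustrative, not kernel identifiers; only the "when empty, every tail points at the head pointer" invariant is taken from the diff, and the bookkeeping that keeps later tails in step is omitted.

    /* Stand-alone model of the rcu_data callback-list layout. */
    #include <stdio.h>

    #define NSEG 4                      /* stands in for RCU_NEXT_SIZE */

    struct cb {
        struct cb *next;
        int id;
    };

    struct seglist {
        struct cb *head;                /* plays the role of ->nxtlist   */
        struct cb **tail[NSEG];         /* plays the role of ->nxttail[] */
    };

    static void seglist_init(struct seglist *sl)   /* ~init_default_callback_list() */
    {
        int i;

        sl->head = NULL;
        for (i = 0; i < NSEG; i++)
            sl->tail[i] = &sl->head;    /* empty: all tails are the head pointer */
    }

    static void seglist_enqueue(struct seglist *sl, struct cb *c)
    {
        c->next = NULL;
        *sl->tail[NSEG - 1] = c;        /* append to the final segment */
        sl->tail[NSEG - 1] = &c->next;  /* (real code also advances aliased tails) */
    }

    int main(void)
    {
        struct seglist sl;
        struct cb a = { .id = 1 }, b = { .id = 2 };
        struct cb *p;

        seglist_init(&sl);
        seglist_enqueue(&sl, &a);
        seglist_enqueue(&sl, &b);
        for (p = sl.head; p; p = p->next)
            printf("callback %d\n", p->id);
        return 0;
    }

Pointing every tail at &head is what makes "empty" and "just reinitialized" indistinguishable, which is why the split helper can be reused from the early-boot path later in this diff.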
| @@ -1703,11 +1733,11 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1703 | */ | 1733 | */ | 
| 1704 | static int rcu_gp_init(struct rcu_state *rsp) | 1734 | static int rcu_gp_init(struct rcu_state *rsp) | 
| 1705 | { | 1735 | { | 
| 1736 | unsigned long oldmask; | ||
| 1706 | struct rcu_data *rdp; | 1737 | struct rcu_data *rdp; | 
| 1707 | struct rcu_node *rnp = rcu_get_root(rsp); | 1738 | struct rcu_node *rnp = rcu_get_root(rsp); | 
| 1708 | 1739 | ||
| 1709 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | 1740 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | 
| 1710 | rcu_bind_gp_kthread(); | ||
| 1711 | raw_spin_lock_irq(&rnp->lock); | 1741 | raw_spin_lock_irq(&rnp->lock); | 
| 1712 | smp_mb__after_unlock_lock(); | 1742 | smp_mb__after_unlock_lock(); | 
| 1713 | if (!ACCESS_ONCE(rsp->gp_flags)) { | 1743 | if (!ACCESS_ONCE(rsp->gp_flags)) { | 
| @@ -1733,9 +1763,54 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
| 1733 | trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); | 1763 | trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); | 
| 1734 | raw_spin_unlock_irq(&rnp->lock); | 1764 | raw_spin_unlock_irq(&rnp->lock); | 
| 1735 | 1765 | ||
| 1736 | /* Exclude any concurrent CPU-hotplug operations. */ | 1766 | /* | 
| 1737 | mutex_lock(&rsp->onoff_mutex); | 1767 | * Apply per-leaf buffered online and offline operations to the | 
| 1738 | smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */ | 1768 | * rcu_node tree. Note that this new grace period need not wait | 
| 1769 | * for subsequent online CPUs, and that quiescent-state forcing | ||
| 1770 | * will handle subsequent offline CPUs. | ||
| 1771 | */ | ||
| 1772 | rcu_for_each_leaf_node(rsp, rnp) { | ||
| 1773 | raw_spin_lock_irq(&rnp->lock); | ||
| 1774 | smp_mb__after_unlock_lock(); | ||
| 1775 | if (rnp->qsmaskinit == rnp->qsmaskinitnext && | ||
| 1776 | !rnp->wait_blkd_tasks) { | ||
| 1777 | /* Nothing to do on this leaf rcu_node structure. */ | ||
| 1778 | raw_spin_unlock_irq(&rnp->lock); | ||
| 1779 | continue; | ||
| 1780 | } | ||
| 1781 | |||
| 1782 | /* Record old state, apply changes to ->qsmaskinit field. */ | ||
| 1783 | oldmask = rnp->qsmaskinit; | ||
| 1784 | rnp->qsmaskinit = rnp->qsmaskinitnext; | ||
| 1785 | |||
| 1786 | /* If zero-ness of ->qsmaskinit changed, propagate up tree. */ | ||
| 1787 | if (!oldmask != !rnp->qsmaskinit) { | ||
| 1788 | if (!oldmask) /* First online CPU for this rcu_node. */ | ||
| 1789 | rcu_init_new_rnp(rnp); | ||
| 1790 | else if (rcu_preempt_has_tasks(rnp)) /* blocked tasks */ | ||
| 1791 | rnp->wait_blkd_tasks = true; | ||
| 1792 | else /* Last offline CPU and can propagate. */ | ||
| 1793 | rcu_cleanup_dead_rnp(rnp); | ||
| 1794 | } | ||
| 1795 | |||
| 1796 | /* | ||
| 1797 | * If all waited-on tasks from prior grace period are | ||
| 1798 | * done, and if all this rcu_node structure's CPUs are | ||
| 1799 | * still offline, propagate up the rcu_node tree and | ||
| 1800 | * clear ->wait_blkd_tasks. Otherwise, if one of this | ||
| 1801 | * rcu_node structure's CPUs has since come back online, | ||
| 1802 | * simply clear ->wait_blkd_tasks (but rcu_cleanup_dead_rnp() | ||
| 1803 | * checks for this, so just call it unconditionally). | ||
| 1804 | */ | ||
| 1805 | if (rnp->wait_blkd_tasks && | ||
| 1806 | (!rcu_preempt_has_tasks(rnp) || | ||
| 1807 | rnp->qsmaskinit)) { | ||
| 1808 | rnp->wait_blkd_tasks = false; | ||
| 1809 | rcu_cleanup_dead_rnp(rnp); | ||
| 1810 | } | ||
| 1811 | |||
| 1812 | raw_spin_unlock_irq(&rnp->lock); | ||
| 1813 | } | ||
| 1739 | 1814 | ||
| 1740 | /* | 1815 | /* | 
| 1741 | * Set the quiescent-state-needed bits in all the rcu_node | 1816 | * Set the quiescent-state-needed bits in all the rcu_node | 
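The new loop in rcu_gp_init() above folds the buffered ->qsmaskinitnext into ->qsmaskinit once per grace period, and only propagates up the tree when the leaf's mask changes between zero and nonzero. The model below isolates just that decision; first_online/last_offline handling is reduced to printf placeholders for rcu_init_new_rnp(), the ->wait_blkd_tasks case, and rcu_cleanup_dead_rnp(), none of which are reproduced here.

    /* Minimal model of the "did zero-ness change?" test in rcu_gp_init(). */
    #include <stdbool.h>
    #include <stdio.h>

    static void apply_buffered_hotplug(unsigned long *qsmaskinit,
                                       unsigned long qsmaskinitnext,
                                       bool blocked_tasks)
    {
        unsigned long oldmask = *qsmaskinit;

        *qsmaskinit = qsmaskinitnext;       /* fold in buffered online/offline */

        if (!oldmask == !*qsmaskinit)
            return;                         /* zero-ness unchanged: nothing to propagate */

        if (!oldmask)
            printf("first CPU online: propagate setup up the tree\n");
        else if (blocked_tasks)
            printf("last CPU offline but readers blocked: set wait_blkd_tasks\n");
        else
            printf("last CPU offline: propagate cleanup up the tree\n");
    }

    int main(void)
    {
        unsigned long init = 0x0;

        apply_buffered_hotplug(&init, 0x3, false);  /* leaf goes from empty to CPUs 0-1   */
        apply_buffered_hotplug(&init, 0x0, true);   /* all CPUs gone, blocked readers left */
        return 0;
    }

Because only zero-to-nonzero and nonzero-to-zero transitions matter, a leaf whose set of online CPUs merely shrinks or grows needs no tree walk at all at grace-period start.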
| @@ -1757,8 +1832,8 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
| 1757 | rcu_preempt_check_blocked_tasks(rnp); | 1832 | rcu_preempt_check_blocked_tasks(rnp); | 
| 1758 | rnp->qsmask = rnp->qsmaskinit; | 1833 | rnp->qsmask = rnp->qsmaskinit; | 
| 1759 | ACCESS_ONCE(rnp->gpnum) = rsp->gpnum; | 1834 | ACCESS_ONCE(rnp->gpnum) = rsp->gpnum; | 
| 1760 | WARN_ON_ONCE(rnp->completed != rsp->completed); | 1835 | if (WARN_ON_ONCE(rnp->completed != rsp->completed)) | 
| 1761 | ACCESS_ONCE(rnp->completed) = rsp->completed; | 1836 | ACCESS_ONCE(rnp->completed) = rsp->completed; | 
| 1762 | if (rnp == rdp->mynode) | 1837 | if (rnp == rdp->mynode) | 
| 1763 | (void)__note_gp_changes(rsp, rnp, rdp); | 1838 | (void)__note_gp_changes(rsp, rnp, rdp); | 
| 1764 | rcu_preempt_boost_start_gp(rnp); | 1839 | rcu_preempt_boost_start_gp(rnp); | 
| @@ -1768,9 +1843,12 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
| 1768 | raw_spin_unlock_irq(&rnp->lock); | 1843 | raw_spin_unlock_irq(&rnp->lock); | 
| 1769 | cond_resched_rcu_qs(); | 1844 | cond_resched_rcu_qs(); | 
| 1770 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | 1845 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | 
| 1846 | if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT) && | ||
| 1847 | gp_init_delay > 0 && | ||
| 1848 | !(rsp->gpnum % (rcu_num_nodes * 10))) | ||
| 1849 | schedule_timeout_uninterruptible(gp_init_delay); | ||
| 1771 | } | 1850 | } | 
| 1772 | 1851 | ||
| 1773 | mutex_unlock(&rsp->onoff_mutex); | ||
| 1774 | return 1; | 1852 | return 1; | 
| 1775 | } | 1853 | } | 
| 1776 | 1854 | ||
| @@ -1798,7 +1876,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) | |||
| 1798 | fqs_state = RCU_FORCE_QS; | 1876 | fqs_state = RCU_FORCE_QS; | 
| 1799 | } else { | 1877 | } else { | 
| 1800 | /* Handle dyntick-idle and offline CPUs. */ | 1878 | /* Handle dyntick-idle and offline CPUs. */ | 
| 1801 | isidle = false; | 1879 | isidle = true; | 
| 1802 | force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); | 1880 | force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); | 
| 1803 | } | 1881 | } | 
| 1804 | /* Clear flag to prevent immediate re-entry. */ | 1882 | /* Clear flag to prevent immediate re-entry. */ | 
| @@ -1852,6 +1930,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
| 1852 | rcu_for_each_node_breadth_first(rsp, rnp) { | 1930 | rcu_for_each_node_breadth_first(rsp, rnp) { | 
| 1853 | raw_spin_lock_irq(&rnp->lock); | 1931 | raw_spin_lock_irq(&rnp->lock); | 
| 1854 | smp_mb__after_unlock_lock(); | 1932 | smp_mb__after_unlock_lock(); | 
| 1933 | WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); | ||
| 1934 | WARN_ON_ONCE(rnp->qsmask); | ||
| 1855 | ACCESS_ONCE(rnp->completed) = rsp->gpnum; | 1935 | ACCESS_ONCE(rnp->completed) = rsp->gpnum; | 
| 1856 | rdp = this_cpu_ptr(rsp->rda); | 1936 | rdp = this_cpu_ptr(rsp->rda); | 
| 1857 | if (rnp == rdp->mynode) | 1937 | if (rnp == rdp->mynode) | 
| @@ -1895,6 +1975,7 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
| 1895 | struct rcu_state *rsp = arg; | 1975 | struct rcu_state *rsp = arg; | 
| 1896 | struct rcu_node *rnp = rcu_get_root(rsp); | 1976 | struct rcu_node *rnp = rcu_get_root(rsp); | 
| 1897 | 1977 | ||
| 1978 | rcu_bind_gp_kthread(); | ||
| 1898 | for (;;) { | 1979 | for (;;) { | 
| 1899 | 1980 | ||
| 1900 | /* Handle grace-period start. */ | 1981 | /* Handle grace-period start. */ | 
| @@ -2062,25 +2143,32 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | |||
| 2062 | * Similar to rcu_report_qs_rdp(), for which it is a helper function. | 2143 | * Similar to rcu_report_qs_rdp(), for which it is a helper function. | 
| 2063 | * Allows quiescent states for a group of CPUs to be reported at one go | 2144 | * Allows quiescent states for a group of CPUs to be reported at one go | 
| 2064 | * to the specified rcu_node structure, though all the CPUs in the group | 2145 | * to the specified rcu_node structure, though all the CPUs in the group | 
| 2065 | * must be represented by the same rcu_node structure (which need not be | 2146 | * must be represented by the same rcu_node structure (which need not be a | 
| 2066 | * a leaf rcu_node structure, though it often will be). That structure's | 2147 | * leaf rcu_node structure, though it often will be). The gps parameter | 
| 2067 | * lock must be held upon entry, and it is released before return. | 2148 | * is the grace-period snapshot, which means that the quiescent states | 
| 2149 | * are valid only if rnp->gpnum is equal to gps. That structure's lock | ||
| 2150 | * must be held upon entry, and it is released before return. | ||
| 2068 | */ | 2151 | */ | 
| 2069 | static void | 2152 | static void | 
| 2070 | rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | 2153 | rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | 
| 2071 | struct rcu_node *rnp, unsigned long flags) | 2154 | struct rcu_node *rnp, unsigned long gps, unsigned long flags) | 
| 2072 | __releases(rnp->lock) | 2155 | __releases(rnp->lock) | 
| 2073 | { | 2156 | { | 
| 2157 | unsigned long oldmask = 0; | ||
| 2074 | struct rcu_node *rnp_c; | 2158 | struct rcu_node *rnp_c; | 
| 2075 | 2159 | ||
| 2076 | /* Walk up the rcu_node hierarchy. */ | 2160 | /* Walk up the rcu_node hierarchy. */ | 
| 2077 | for (;;) { | 2161 | for (;;) { | 
| 2078 | if (!(rnp->qsmask & mask)) { | 2162 | if (!(rnp->qsmask & mask) || rnp->gpnum != gps) { | 
| 2079 | 2163 | ||
| 2080 | /* Our bit has already been cleared, so done. */ | 2164 | /* | 
| 2165 | * Our bit has already been cleared, or the | ||
| 2166 | * relevant grace period is already over, so done. | ||
| 2167 | */ | ||
| 2081 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 2168 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 
| 2082 | return; | 2169 | return; | 
| 2083 | } | 2170 | } | 
| 2171 | WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ | ||
| 2084 | rnp->qsmask &= ~mask; | 2172 | rnp->qsmask &= ~mask; | 
| 2085 | trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, | 2173 | trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, | 
| 2086 | mask, rnp->qsmask, rnp->level, | 2174 | mask, rnp->qsmask, rnp->level, | 
| @@ -2104,7 +2192,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | |||
| 2104 | rnp = rnp->parent; | 2192 | rnp = rnp->parent; | 
| 2105 | raw_spin_lock_irqsave(&rnp->lock, flags); | 2193 | raw_spin_lock_irqsave(&rnp->lock, flags); | 
| 2106 | smp_mb__after_unlock_lock(); | 2194 | smp_mb__after_unlock_lock(); | 
| 2107 | WARN_ON_ONCE(rnp_c->qsmask); | 2195 | oldmask = rnp_c->qsmask; | 
| 2108 | } | 2196 | } | 
| 2109 | 2197 | ||
| 2110 | /* | 2198 | /* | 
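rcu_report_qs_rnp() now carries a grace-period snapshot (gps) and bails out if the rcu_node has moved on to a different grace period, which is what lets callers drop the leaf lock before the upward walk. The toy below mirrors only that control flow; struct node and its initial values are invented for the demo.

    /* Toy model of the upward quiescent-state walk with a GP snapshot. */
    #include <stdio.h>

    struct node {
        struct node *parent;
        unsigned long qsmask;   /* CPUs/children still owing a QS    */
        unsigned long grpmask;  /* this node's bit in parent->qsmask */
        unsigned long gpnum;    /* current grace-period number       */
    };

    static void report_qs(struct node *np, unsigned long mask, unsigned long gps)
    {
        for (;;) {
            if (!(np->qsmask & mask) || np->gpnum != gps)
                return;             /* already cleared, or stale grace period */
            np->qsmask &= ~mask;
            if (np->qsmask)
                return;             /* others still owe a quiescent state */
            if (!np->parent)
                break;              /* root is clear: grace period can end */
            mask = np->grpmask;
            np = np->parent;
        }
        printf("grace period %lu: all quiescent states seen\n", gps);
    }

    int main(void)
    {
        struct node root = { .qsmask = 0x1, .gpnum = 42 };
        struct node leaf = { .parent = &root, .qsmask = 0x3,
                             .grpmask = 0x1, .gpnum = 42 };

        report_qs(&leaf, 0x1, 42);  /* CPU 0 reports; leaf still waits on CPU 1 */
        report_qs(&leaf, 0x2, 42);  /* CPU 1 reports; clears leaf, then root    */
        return 0;
    }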
| @@ -2116,6 +2204,46 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | |||
| 2116 | } | 2204 | } | 
| 2117 | 2205 | ||
| 2118 | /* | 2206 | /* | 
| 2207 | * Record a quiescent state for all tasks that were previously queued | ||
| 2208 | * on the specified rcu_node structure and that were blocking the current | ||
| 2209 | * RCU grace period. The caller must hold the specified rnp->lock with | ||
| 2210 | * irqs disabled, and this lock is released upon return, but irqs remain | ||
| 2211 | * disabled. | ||
| 2212 | */ | ||
| 2213 | static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, | ||
| 2214 | struct rcu_node *rnp, unsigned long flags) | ||
| 2215 | __releases(rnp->lock) | ||
| 2216 | { | ||
| 2217 | unsigned long gps; | ||
| 2218 | unsigned long mask; | ||
| 2219 | struct rcu_node *rnp_p; | ||
| 2220 | |||
| 2221 | if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || | ||
| 2222 | rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { | ||
| 2223 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 2224 | return; /* Still need more quiescent states! */ | ||
| 2225 | } | ||
| 2226 | |||
| 2227 | rnp_p = rnp->parent; | ||
| 2228 | if (rnp_p == NULL) { | ||
| 2229 | /* | ||
| 2230 | * Only one rcu_node structure in the tree, so don't | ||
| 2231 | * try to report up to its nonexistent parent! | ||
| 2232 | */ | ||
| 2233 | rcu_report_qs_rsp(rsp, flags); | ||
| 2234 | return; | ||
| 2235 | } | ||
| 2236 | |||
| 2237 | /* Report up the rest of the hierarchy, tracking current ->gpnum. */ | ||
| 2238 | gps = rnp->gpnum; | ||
| 2239 | mask = rnp->grpmask; | ||
| 2240 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
| 2241 | raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ | ||
| 2242 | smp_mb__after_unlock_lock(); | ||
| 2243 | rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); | ||
| 2244 | } | ||
| 2245 | |||
| 2246 | /* | ||
| 2119 | * Record a quiescent state for the specified CPU to that CPU's rcu_data | 2247 | * Record a quiescent state for the specified CPU to that CPU's rcu_data | 
| 2120 | * structure. This must be either called from the specified CPU, or | 2248 | * structure. This must be either called from the specified CPU, or | 
| 2121 | * called when the specified CPU is known to be offline (and when it is | 2249 | * called when the specified CPU is known to be offline (and when it is | 
| @@ -2163,7 +2291,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 2163 | */ | 2291 | */ | 
| 2164 | needwake = rcu_accelerate_cbs(rsp, rnp, rdp); | 2292 | needwake = rcu_accelerate_cbs(rsp, rnp, rdp); | 
| 2165 | 2293 | ||
| 2166 | rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ | 2294 | rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); | 
| 2295 | /* ^^^ Released rnp->lock */ | ||
| 2167 | if (needwake) | 2296 | if (needwake) | 
| 2168 | rcu_gp_kthread_wake(rsp); | 2297 | rcu_gp_kthread_wake(rsp); | 
| 2169 | } | 2298 | } | 
| @@ -2256,8 +2385,12 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | |||
| 2256 | rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; | 2385 | rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; | 
| 2257 | } | 2386 | } | 
| 2258 | 2387 | ||
| 2259 | /* Finally, initialize the rcu_data structure's list to empty. */ | 2388 | /* | 
| 2389 | * Finally, initialize the rcu_data structure's list to empty and | ||
| 2390 | * disallow further callbacks on this CPU. | ||
| 2391 | */ | ||
| 2260 | init_callback_list(rdp); | 2392 | init_callback_list(rdp); | 
| 2393 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | ||
| 2261 | } | 2394 | } | 
| 2262 | 2395 | ||
| 2263 | /* | 2396 | /* | 
| @@ -2355,6 +2488,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) | |||
| 2355 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 2488 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 
| 2356 | smp_mb__after_unlock_lock(); /* GP memory ordering. */ | 2489 | smp_mb__after_unlock_lock(); /* GP memory ordering. */ | 
| 2357 | rnp->qsmaskinit &= ~mask; | 2490 | rnp->qsmaskinit &= ~mask; | 
| 2491 | rnp->qsmask &= ~mask; | ||
| 2358 | if (rnp->qsmaskinit) { | 2492 | if (rnp->qsmaskinit) { | 
| 2359 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 2493 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 
| 2360 | return; | 2494 | return; | 
| @@ -2364,6 +2498,26 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) | |||
| 2364 | } | 2498 | } | 
| 2365 | 2499 | ||
| 2366 | /* | 2500 | /* | 
| 2501 | * The CPU is exiting the idle loop into the arch_cpu_idle_dead() | ||
| 2502 | * function. We now remove it from the rcu_node tree's ->qsmaskinit | ||
| 2503 | * bit masks. | ||
| 2504 | */ | ||
| 2505 | static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) | ||
| 2506 | { | ||
| 2507 | unsigned long flags; | ||
| 2508 | unsigned long mask; | ||
| 2509 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | ||
| 2510 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ | ||
| 2511 | |||
| 2512 | /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ | ||
| 2513 | mask = rdp->grpmask; | ||
| 2514 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 2515 | smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ | ||
| 2516 | rnp->qsmaskinitnext &= ~mask; | ||
| 2517 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 2518 | } | ||
| 2519 | |||
| 2520 | /* | ||
| 2367 | * The CPU has been completely removed, and some other CPU is reporting | 2521 | * The CPU has been completely removed, and some other CPU is reporting | 
| 2368 | * this fact from process context. Do the remainder of the cleanup, | 2522 | * this fact from process context. Do the remainder of the cleanup, | 
| 2369 | * including orphaning the outgoing CPU's RCU callbacks, and also | 2523 | * including orphaning the outgoing CPU's RCU callbacks, and also | 
| @@ -2379,29 +2533,15 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
| 2379 | /* Adjust any no-longer-needed kthreads. */ | 2533 | /* Adjust any no-longer-needed kthreads. */ | 
| 2380 | rcu_boost_kthread_setaffinity(rnp, -1); | 2534 | rcu_boost_kthread_setaffinity(rnp, -1); | 
| 2381 | 2535 | ||
| 2382 | /* Exclude any attempts to start a new grace period. */ | ||
| 2383 | mutex_lock(&rsp->onoff_mutex); | ||
| 2384 | raw_spin_lock_irqsave(&rsp->orphan_lock, flags); | ||
| 2385 | |||
| 2386 | /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ | 2536 | /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ | 
| 2537 | raw_spin_lock_irqsave(&rsp->orphan_lock, flags); | ||
| 2387 | rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); | 2538 | rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); | 
| 2388 | rcu_adopt_orphan_cbs(rsp, flags); | 2539 | rcu_adopt_orphan_cbs(rsp, flags); | 
| 2389 | raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); | 2540 | raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); | 
| 2390 | 2541 | ||
| 2391 | /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ | ||
| 2392 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 2393 | smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ | ||
| 2394 | rnp->qsmaskinit &= ~rdp->grpmask; | ||
| 2395 | if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp)) | ||
| 2396 | rcu_cleanup_dead_rnp(rnp); | ||
| 2397 | rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags); /* Rlses rnp->lock. */ | ||
| 2398 | WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, | 2542 | WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, | 
| 2399 | "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", | 2543 | "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", | 
| 2400 | cpu, rdp->qlen, rdp->nxtlist); | 2544 | cpu, rdp->qlen, rdp->nxtlist); | 
| 2401 | init_callback_list(rdp); | ||
| 2402 | /* Disallow further callbacks on this CPU. */ | ||
| 2403 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | ||
| 2404 | mutex_unlock(&rsp->onoff_mutex); | ||
| 2405 | } | 2545 | } | 
| 2406 | 2546 | ||
| 2407 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 2547 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 
| @@ -2414,6 +2554,10 @@ static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) | |||
| 2414 | { | 2554 | { | 
| 2415 | } | 2555 | } | 
| 2416 | 2556 | ||
| 2557 | static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) | ||
| 2558 | { | ||
| 2559 | } | ||
| 2560 | |||
| 2417 | static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | 2561 | static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | 
| 2418 | { | 2562 | { | 
| 2419 | } | 2563 | } | 
| @@ -2589,26 +2733,47 @@ static void force_qs_rnp(struct rcu_state *rsp, | |||
| 2589 | return; | 2733 | return; | 
| 2590 | } | 2734 | } | 
| 2591 | if (rnp->qsmask == 0) { | 2735 | if (rnp->qsmask == 0) { | 
| 2592 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ | 2736 | if (rcu_state_p == &rcu_sched_state || | 
| 2593 | continue; | 2737 | rsp != rcu_state_p || | 
| 2738 | rcu_preempt_blocked_readers_cgp(rnp)) { | ||
| 2739 | /* | ||
| 2740 | * No point in scanning bits because they | ||
| 2741 | * are all zero. But we might need to | ||
| 2742 | * priority-boost blocked readers. | ||
| 2743 | */ | ||
| 2744 | rcu_initiate_boost(rnp, flags); | ||
| 2745 | /* rcu_initiate_boost() releases rnp->lock */ | ||
| 2746 | continue; | ||
| 2747 | } | ||
| 2748 | if (rnp->parent && | ||
| 2749 | (rnp->parent->qsmask & rnp->grpmask)) { | ||
| 2750 | /* | ||
| 2751 | * Race between grace-period | ||
| 2752 | * initialization and task exiting RCU | ||
| 2753 | * read-side critical section: Report. | ||
| 2754 | */ | ||
| 2755 | rcu_report_unblock_qs_rnp(rsp, rnp, flags); | ||
| 2756 | /* rcu_report_unblock_qs_rnp() rlses ->lock */ | ||
| 2757 | continue; | ||
| 2758 | } | ||
| 2594 | } | 2759 | } | 
| 2595 | cpu = rnp->grplo; | 2760 | cpu = rnp->grplo; | 
| 2596 | bit = 1; | 2761 | bit = 1; | 
| 2597 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { | 2762 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { | 
| 2598 | if ((rnp->qsmask & bit) != 0) { | 2763 | if ((rnp->qsmask & bit) != 0) { | 
| 2599 | if ((rnp->qsmaskinit & bit) != 0) | 2764 | if ((rnp->qsmaskinit & bit) == 0) | 
| 2600 | *isidle = false; | 2765 | *isidle = false; /* Pending hotplug. */ | 
| 2601 | if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) | 2766 | if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) | 
| 2602 | mask |= bit; | 2767 | mask |= bit; | 
| 2603 | } | 2768 | } | 
| 2604 | } | 2769 | } | 
| 2605 | if (mask != 0) { | 2770 | if (mask != 0) { | 
| 2606 | 2771 | /* Idle/offline CPUs, report (releases rnp->lock). */ | |
| 2607 | /* rcu_report_qs_rnp() releases rnp->lock. */ | 2772 | rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); | 
| 2608 | rcu_report_qs_rnp(mask, rsp, rnp, flags); | 2773 | } else { | 
| 2609 | continue; | 2774 | /* Nothing to do here, so just drop the lock. */ | 
| 2775 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 2610 | } | 2776 | } | 
| 2611 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 2612 | } | 2777 | } | 
| 2613 | } | 2778 | } | 
| 2614 | 2779 | ||
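The reworked force_qs_rnp() loop above scans each leaf's still-pending CPUs, accumulates a mask of the ones found idle or offline, and reports that whole mask at once (or simply drops the lock when nothing was found). The sketch below models only the per-leaf scan; cpu_is_quiescent() is a stand-in predicate, not a kernel helper, and the chosen masks are arbitrary.

    /* Compact model of the per-leaf scan in force_qs_rnp(). */
    #include <stdbool.h>
    #include <stdio.h>

    static bool cpu_is_quiescent(int cpu)
    {
        return cpu == 2 || cpu == 5;    /* pretend CPUs 2 and 5 are idle */
    }

    int main(void)
    {
        unsigned long qsmask = 0x3c;    /* leaf still waiting on CPUs 2-5 */
        unsigned long mask = 0;
        unsigned long bit = 1;
        int grplo = 0, grphi = 7, cpu;

        for (cpu = grplo; cpu <= grphi; cpu++, bit <<= 1)
            if ((qsmask & bit) && cpu_is_quiescent(cpu))
                mask |= bit;

        if (mask)
            printf("reporting quiescent states for mask %#lx\n", mask);
        else
            printf("nothing to report for this leaf\n");
        return 0;
    }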
| @@ -2741,7 +2906,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | |||
| 2741 | * If called from an extended quiescent state, invoke the RCU | 2906 | * If called from an extended quiescent state, invoke the RCU | 
| 2742 | * core in order to force a re-evaluation of RCU's idleness. | 2907 | * core in order to force a re-evaluation of RCU's idleness. | 
| 2743 | */ | 2908 | */ | 
| 2744 | if (!rcu_is_watching() && cpu_online(smp_processor_id())) | 2909 | if (!rcu_is_watching()) | 
| 2745 | invoke_rcu_core(); | 2910 | invoke_rcu_core(); | 
| 2746 | 2911 | ||
| 2747 | /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ | 2912 | /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ | 
| @@ -2827,11 +2992,22 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
| 2827 | 2992 | ||
| 2828 | if (cpu != -1) | 2993 | if (cpu != -1) | 
| 2829 | rdp = per_cpu_ptr(rsp->rda, cpu); | 2994 | rdp = per_cpu_ptr(rsp->rda, cpu); | 
| 2830 | offline = !__call_rcu_nocb(rdp, head, lazy, flags); | 2995 | if (likely(rdp->mynode)) { | 
| 2831 | WARN_ON_ONCE(offline); | 2996 | /* Post-boot, so this should be for a no-CBs CPU. */ | 
| 2832 | /* _call_rcu() is illegal on offline CPU; leak the callback. */ | 2997 | offline = !__call_rcu_nocb(rdp, head, lazy, flags); | 
| 2833 | local_irq_restore(flags); | 2998 | WARN_ON_ONCE(offline); | 
| 2834 | return; | 2999 | /* Offline CPU, _call_rcu() illegal, leak callback. */ | 
| 3000 | local_irq_restore(flags); | ||
| 3001 | return; | ||
| 3002 | } | ||
| 3003 | /* | ||
| 3004 | * Very early boot, before rcu_init(). Initialize if needed | ||
| 3005 | * and then drop through to queue the callback. | ||
| 3006 | */ | ||
| 3007 | BUG_ON(cpu != -1); | ||
| 3008 | WARN_ON_ONCE(!rcu_is_watching()); | ||
| 3009 | if (!likely(rdp->nxtlist)) | ||
| 3010 | init_default_callback_list(rdp); | ||
| 2835 | } | 3011 | } | 
| 2836 | ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1; | 3012 | ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1; | 
| 2837 | if (lazy) | 3013 | if (lazy) | 
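The new branch in __call_rcu() above recognizes the "before rcu_init()" case by rdp->mynode still being NULL and initializes the default callback list on first use before queueing. The fragment below is a generic illustration of that initialize-on-first-enqueue pattern; the queue type and helpers are demo-only and not taken from the kernel.

    /* Sketch of lazy initialization on the enqueue path. */
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct queue {
        int items[8];
        size_t len;
        bool ready;
    };

    static void queue_init(struct queue *q)     /* ~init_default_callback_list() */
    {
        q->len = 0;
        q->ready = true;
    }

    static void enqueue(struct queue *q, int item)
    {
        if (!q->ready)          /* very early caller: set the queue up first */
            queue_init(q);
        if (q->len < 8)
            q->items[q->len++] = item;
    }

    int main(void)
    {
        static struct queue early_q;    /* zero-initialized, like early per-CPU data */

        enqueue(&early_q, 1);           /* works even before any explicit init call */
        printf("queued %zu item(s)\n", early_q.len);
        return 0;
    }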
| @@ -2954,7 +3130,7 @@ void synchronize_sched(void) | |||
| 2954 | "Illegal synchronize_sched() in RCU-sched read-side critical section"); | 3130 | "Illegal synchronize_sched() in RCU-sched read-side critical section"); | 
| 2955 | if (rcu_blocking_is_gp()) | 3131 | if (rcu_blocking_is_gp()) | 
| 2956 | return; | 3132 | return; | 
| 2957 | if (rcu_expedited) | 3133 | if (rcu_gp_is_expedited()) | 
| 2958 | synchronize_sched_expedited(); | 3134 | synchronize_sched_expedited(); | 
| 2959 | else | 3135 | else | 
| 2960 | wait_rcu_gp(call_rcu_sched); | 3136 | wait_rcu_gp(call_rcu_sched); | 
| @@ -2981,7 +3157,7 @@ void synchronize_rcu_bh(void) | |||
| 2981 | "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); | 3157 | "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); | 
| 2982 | if (rcu_blocking_is_gp()) | 3158 | if (rcu_blocking_is_gp()) | 
| 2983 | return; | 3159 | return; | 
| 2984 | if (rcu_expedited) | 3160 | if (rcu_gp_is_expedited()) | 
| 2985 | synchronize_rcu_bh_expedited(); | 3161 | synchronize_rcu_bh_expedited(); | 
| 2986 | else | 3162 | else | 
| 2987 | wait_rcu_gp(call_rcu_bh); | 3163 | wait_rcu_gp(call_rcu_bh); | 
| @@ -3518,6 +3694,28 @@ void rcu_barrier_sched(void) | |||
| 3518 | EXPORT_SYMBOL_GPL(rcu_barrier_sched); | 3694 | EXPORT_SYMBOL_GPL(rcu_barrier_sched); | 
| 3519 | 3695 | ||
| 3520 | /* | 3696 | /* | 
| 3697 | * Propagate ->qsmaskinit bits up the rcu_node tree to account for the | ||
| 3698 | * first CPU in a given leaf rcu_node structure coming online. The caller | ||
| 3699 | * must hold the corresponding leaf rcu_node ->lock with interrupts | ||
| 3700 | * disabled. | ||
| 3701 | */ | ||
| 3702 | static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) | ||
| 3703 | { | ||
| 3704 | long mask; | ||
| 3705 | struct rcu_node *rnp = rnp_leaf; | ||
| 3706 | |||
| 3707 | for (;;) { | ||
| 3708 | mask = rnp->grpmask; | ||
| 3709 | rnp = rnp->parent; | ||
| 3710 | if (rnp == NULL) | ||
| 3711 | return; | ||
| 3712 | raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */ | ||
| 3713 | rnp->qsmaskinit |= mask; | ||
| 3714 | raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ | ||
| 3715 | } | ||
| 3716 | } | ||
| 3717 | |||
| 3718 | /* | ||
| 3521 | * Do boot-time initialization of a CPU's per-CPU RCU data. | 3719 | * Do boot-time initialization of a CPU's per-CPU RCU data. | 
| 3522 | */ | 3720 | */ | 
| 3523 | static void __init | 3721 | static void __init | 
| @@ -3553,49 +3751,37 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
| 3553 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 3751 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 
| 3554 | struct rcu_node *rnp = rcu_get_root(rsp); | 3752 | struct rcu_node *rnp = rcu_get_root(rsp); | 
| 3555 | 3753 | ||
| 3556 | /* Exclude new grace periods. */ | ||
| 3557 | mutex_lock(&rsp->onoff_mutex); | ||
| 3558 | |||
| 3559 | /* Set up local state, ensuring consistent view of global state. */ | 3754 | /* Set up local state, ensuring consistent view of global state. */ | 
| 3560 | raw_spin_lock_irqsave(&rnp->lock, flags); | 3755 | raw_spin_lock_irqsave(&rnp->lock, flags); | 
| 3561 | rdp->beenonline = 1; /* We have now been online. */ | 3756 | rdp->beenonline = 1; /* We have now been online. */ | 
| 3562 | rdp->qlen_last_fqs_check = 0; | 3757 | rdp->qlen_last_fqs_check = 0; | 
| 3563 | rdp->n_force_qs_snap = rsp->n_force_qs; | 3758 | rdp->n_force_qs_snap = rsp->n_force_qs; | 
| 3564 | rdp->blimit = blimit; | 3759 | rdp->blimit = blimit; | 
| 3565 | init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ | 3760 | if (!rdp->nxtlist) | 
| 3761 | init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ | ||
| 3566 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | 3762 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | 
| 3567 | rcu_sysidle_init_percpu_data(rdp->dynticks); | 3763 | rcu_sysidle_init_percpu_data(rdp->dynticks); | 
| 3568 | atomic_set(&rdp->dynticks->dynticks, | 3764 | atomic_set(&rdp->dynticks->dynticks, | 
| 3569 | (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); | 3765 | (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); | 
| 3570 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 3766 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 
| 3571 | 3767 | ||
| 3572 | /* Add CPU to rcu_node bitmasks. */ | 3768 | /* | 
| 3769 | * Add CPU to leaf rcu_node pending-online bitmask. Any needed | ||
| 3770 | * propagation up the rcu_node tree will happen at the beginning | ||
| 3771 | * of the next grace period. | ||
| 3772 | */ | ||
| 3573 | rnp = rdp->mynode; | 3773 | rnp = rdp->mynode; | 
| 3574 | mask = rdp->grpmask; | 3774 | mask = rdp->grpmask; | 
| 3575 | do { | 3775 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 
| 3576 | /* Exclude any attempts to start a new GP on small systems. */ | 3776 | smp_mb__after_unlock_lock(); | 
| 3577 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 3777 | rnp->qsmaskinitnext |= mask; | 
| 3578 | rnp->qsmaskinit |= mask; | 3778 | rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ | 
| 3579 | mask = rnp->grpmask; | 3779 | rdp->completed = rnp->completed; | 
| 3580 | if (rnp == rdp->mynode) { | 3780 | rdp->passed_quiesce = false; | 
| 3581 | /* | 3781 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); | 
| 3582 | * If there is a grace period in progress, we will | 3782 | rdp->qs_pending = false; | 
| 3583 | * set up to wait for it next time we run the | 3783 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); | 
| 3584 | * RCU core code. | 3784 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 
| 3585 | */ | ||
| 3586 | rdp->gpnum = rnp->completed; | ||
| 3587 | rdp->completed = rnp->completed; | ||
| 3588 | rdp->passed_quiesce = 0; | ||
| 3589 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); | ||
| 3590 | rdp->qs_pending = 0; | ||
| 3591 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); | ||
| 3592 | } | ||
| 3593 | raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ | ||
| 3594 | rnp = rnp->parent; | ||
| 3595 | } while (rnp != NULL && !(rnp->qsmaskinit & mask)); | ||
| 3596 | local_irq_restore(flags); | ||
| 3597 | |||
| 3598 | mutex_unlock(&rsp->onoff_mutex); | ||
| 3599 | } | 3785 | } | 
| 3600 | 3786 | ||
| 3601 | static void rcu_prepare_cpu(int cpu) | 3787 | static void rcu_prepare_cpu(int cpu) | 
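After this hunk, onlining a CPU only sets its bit in the leaf's ->qsmaskinitnext (and rcu_cleanup_dying_idle_cpu(), added earlier in the diff, only clears it); reconciling the rest of the tree is deferred to the next grace period, which is what allows onoff_mutex to disappear. The bit-level model below assumes the grpmask is one bit per CPU relative to the leaf's lowest CPU number; that derivation and the struct are demo assumptions, only the set-now/reconcile-later idea comes from the diff.

    /* Bit-level model of the online/offline buffering. */
    #include <stdio.h>

    struct leaf {
        int grplo;                      /* lowest CPU handled by this leaf */
        unsigned long qsmaskinit;       /* what the current GP believes    */
        unsigned long qsmaskinitnext;   /* what the next GP will see       */
    };

    static unsigned long grpmask(const struct leaf *l, int cpu)
    {
        return 1UL << (cpu - l->grplo);
    }

    int main(void)
    {
        struct leaf l = { .grplo = 8, .qsmaskinit = 0x1, .qsmaskinitnext = 0x1 };

        l.qsmaskinitnext |= grpmask(&l, 9);     /* CPU 9 comes online       */
        l.qsmaskinitnext &= ~grpmask(&l, 8);    /* CPU 8 goes offline later */

        /* rcu_gp_init() folds this in under the leaf lock at the next GP: */
        l.qsmaskinit = l.qsmaskinitnext;

        printf("next GP waits on mask %#lx\n", l.qsmaskinit);
        return 0;
    }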
| @@ -3609,15 +3795,14 @@ static void rcu_prepare_cpu(int cpu) | |||
| 3609 | /* | 3795 | /* | 
| 3610 | * Handle CPU online/offline notification events. | 3796 | * Handle CPU online/offline notification events. | 
| 3611 | */ | 3797 | */ | 
| 3612 | static int rcu_cpu_notify(struct notifier_block *self, | 3798 | int rcu_cpu_notify(struct notifier_block *self, | 
| 3613 | unsigned long action, void *hcpu) | 3799 | unsigned long action, void *hcpu) | 
| 3614 | { | 3800 | { | 
| 3615 | long cpu = (long)hcpu; | 3801 | long cpu = (long)hcpu; | 
| 3616 | struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); | 3802 | struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); | 
| 3617 | struct rcu_node *rnp = rdp->mynode; | 3803 | struct rcu_node *rnp = rdp->mynode; | 
| 3618 | struct rcu_state *rsp; | 3804 | struct rcu_state *rsp; | 
| 3619 | 3805 | ||
| 3620 | trace_rcu_utilization(TPS("Start CPU hotplug")); | ||
| 3621 | switch (action) { | 3806 | switch (action) { | 
| 3622 | case CPU_UP_PREPARE: | 3807 | case CPU_UP_PREPARE: | 
| 3623 | case CPU_UP_PREPARE_FROZEN: | 3808 | case CPU_UP_PREPARE_FROZEN: | 
| @@ -3637,6 +3822,11 @@ static int rcu_cpu_notify(struct notifier_block *self, | |||
| 3637 | for_each_rcu_flavor(rsp) | 3822 | for_each_rcu_flavor(rsp) | 
| 3638 | rcu_cleanup_dying_cpu(rsp); | 3823 | rcu_cleanup_dying_cpu(rsp); | 
| 3639 | break; | 3824 | break; | 
| 3825 | case CPU_DYING_IDLE: | ||
| 3826 | for_each_rcu_flavor(rsp) { | ||
| 3827 | rcu_cleanup_dying_idle_cpu(cpu, rsp); | ||
| 3828 | } | ||
| 3829 | break; | ||
| 3640 | case CPU_DEAD: | 3830 | case CPU_DEAD: | 
| 3641 | case CPU_DEAD_FROZEN: | 3831 | case CPU_DEAD_FROZEN: | 
| 3642 | case CPU_UP_CANCELED: | 3832 | case CPU_UP_CANCELED: | 
| @@ -3649,7 +3839,6 @@ static int rcu_cpu_notify(struct notifier_block *self, | |||
| 3649 | default: | 3839 | default: | 
| 3650 | break; | 3840 | break; | 
| 3651 | } | 3841 | } | 
| 3652 | trace_rcu_utilization(TPS("End CPU hotplug")); | ||
| 3653 | return NOTIFY_OK; | 3842 | return NOTIFY_OK; | 
| 3654 | } | 3843 | } | 
| 3655 | 3844 | ||
| @@ -3660,11 +3849,12 @@ static int rcu_pm_notify(struct notifier_block *self, | |||
| 3660 | case PM_HIBERNATION_PREPARE: | 3849 | case PM_HIBERNATION_PREPARE: | 
| 3661 | case PM_SUSPEND_PREPARE: | 3850 | case PM_SUSPEND_PREPARE: | 
| 3662 | if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ | 3851 | if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ | 
| 3663 | rcu_expedited = 1; | 3852 | rcu_expedite_gp(); | 
| 3664 | break; | 3853 | break; | 
| 3665 | case PM_POST_HIBERNATION: | 3854 | case PM_POST_HIBERNATION: | 
| 3666 | case PM_POST_SUSPEND: | 3855 | case PM_POST_SUSPEND: | 
| 3667 | rcu_expedited = 0; | 3856 | if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ | 
| 3857 | rcu_unexpedite_gp(); | ||
| 3668 | break; | 3858 | break; | 
| 3669 | default: | 3859 | default: | 
| 3670 | break; | 3860 | break; | 
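The PM notifier above now calls rcu_expedite_gp()/rcu_unexpedite_gp() instead of writing the rcu_expedited flag directly; a nesting counter lets independent callers stack their requests without clobbering each other. The sketch below is a user-space model of that counter pattern under the assumption that the boot-time knob and the nesting count are simply OR-ed together; the real kernel implementation may differ in detail.

    /* Hedged sketch of a nesting counter behind an "expedite GPs" switch. */
    #include <stdbool.h>
    #include <stdio.h>

    static int rcu_expedited_boot;      /* stands in for the rcu_expedited knob */
    static int expedited_nesting;

    static void expedite_gp(void)   { expedited_nesting++; }
    static void unexpedite_gp(void) { expedited_nesting--; }

    static bool gp_is_expedited(void)
    {
        return rcu_expedited_boot || expedited_nesting;
    }

    int main(void)
    {
        expedite_gp();                  /* e.g. the suspend path requests expediting */
        expedite_gp();                  /* a second, independent caller does too     */
        unexpedite_gp();                /* first caller is done...                   */
        printf("expedited? %s\n", gp_is_expedited() ? "yes" : "no");  /* still yes */
        unexpedite_gp();
        printf("expedited? %s\n", gp_is_expedited() ? "yes" : "no");  /* now no    */
        return 0;
    }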
| @@ -3734,30 +3924,26 @@ void rcu_scheduler_starting(void) | |||
| 3734 | * Compute the per-level fanout, either using the exact fanout specified | 3924 | * Compute the per-level fanout, either using the exact fanout specified | 
| 3735 | * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. | 3925 | * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. | 
| 3736 | */ | 3926 | */ | 
| 3737 | #ifdef CONFIG_RCU_FANOUT_EXACT | ||
| 3738 | static void __init rcu_init_levelspread(struct rcu_state *rsp) | ||
| 3739 | { | ||
| 3740 | int i; | ||
| 3741 | |||
| 3742 | rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; | ||
| 3743 | for (i = rcu_num_lvls - 2; i >= 0; i--) | ||
| 3744 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; | ||
| 3745 | } | ||
| 3746 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ | ||
| 3747 | static void __init rcu_init_levelspread(struct rcu_state *rsp) | 3927 | static void __init rcu_init_levelspread(struct rcu_state *rsp) | 
| 3748 | { | 3928 | { | 
| 3749 | int ccur; | ||
| 3750 | int cprv; | ||
| 3751 | int i; | 3929 | int i; | 
| 3752 | 3930 | ||
| 3753 | cprv = nr_cpu_ids; | 3931 | if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) { | 
| 3754 | for (i = rcu_num_lvls - 1; i >= 0; i--) { | 3932 | rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; | 
| 3755 | ccur = rsp->levelcnt[i]; | 3933 | for (i = rcu_num_lvls - 2; i >= 0; i--) | 
| 3756 | rsp->levelspread[i] = (cprv + ccur - 1) / ccur; | 3934 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; | 
| 3757 | cprv = ccur; | 3935 | } else { | 
| 3936 | int ccur; | ||
| 3937 | int cprv; | ||
| 3938 | |||
| 3939 | cprv = nr_cpu_ids; | ||
| 3940 | for (i = rcu_num_lvls - 1; i >= 0; i--) { | ||
| 3941 | ccur = rsp->levelcnt[i]; | ||
| 3942 | rsp->levelspread[i] = (cprv + ccur - 1) / ccur; | ||
| 3943 | cprv = ccur; | ||
| 3944 | } | ||
| 3758 | } | 3945 | } | 
| 3759 | } | 3946 | } | 
| 3760 | #endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */ | ||
| 3761 | 3947 | ||
| 3762 | /* | 3948 | /* | 
| 3763 | * Helper function for rcu_init() that initializes one rcu_state structure. | 3949 | * Helper function for rcu_init() that initializes one rcu_state structure. | 
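In the balanced (non-exact) branch above, levelspread[i] = (cprv + ccur - 1) / ccur is simply a ceiling division of "things at the level below" by "nodes at this level". The worked example below uses a made-up two-level tree (1 root, 5 leaves, 72 CPUs) to show the arithmetic.

    /* Worked example of the balanced-fanout ceiling division. */
    #include <stdio.h>

    int main(void)
    {
        int levelcnt[2] = { 1, 5 };     /* rcu_node structures per level */
        int levelspread[2];
        int nr_cpu_ids = 72;
        int cprv = nr_cpu_ids;
        int i;

        for (i = 1; i >= 0; i--) {
            int ccur = levelcnt[i];

            levelspread[i] = (cprv + ccur - 1) / ccur;  /* ceil(cprv / ccur) */
            cprv = ccur;
        }
        /* leaves fan out to ceil(72/5) = 15 CPUs each, the root to 5 leaves */
        printf("levelspread = { %d, %d }\n", levelspread[0], levelspread[1]);
        return 0;
    }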
| @@ -3833,7 +4019,6 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
| 3833 | } | 4019 | } | 
| 3834 | } | 4020 | } | 
| 3835 | 4021 | ||
| 3836 | rsp->rda = rda; | ||
| 3837 | init_waitqueue_head(&rsp->gp_wq); | 4022 | init_waitqueue_head(&rsp->gp_wq); | 
| 3838 | rnp = rsp->level[rcu_num_lvls - 1]; | 4023 | rnp = rsp->level[rcu_num_lvls - 1]; | 
| 3839 | for_each_possible_cpu(i) { | 4024 | for_each_possible_cpu(i) { | 
| @@ -3926,6 +4111,8 @@ void __init rcu_init(void) | |||
| 3926 | { | 4111 | { | 
| 3927 | int cpu; | 4112 | int cpu; | 
| 3928 | 4113 | ||
| 4114 | rcu_early_boot_tests(); | ||
| 4115 | |||
| 3929 | rcu_bootup_announce(); | 4116 | rcu_bootup_announce(); | 
| 3930 | rcu_init_geometry(); | 4117 | rcu_init_geometry(); | 
| 3931 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); | 4118 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); | 
| @@ -3942,8 +4129,6 @@ void __init rcu_init(void) | |||
| 3942 | pm_notifier(rcu_pm_notify, 0); | 4129 | pm_notifier(rcu_pm_notify, 0); | 
| 3943 | for_each_online_cpu(cpu) | 4130 | for_each_online_cpu(cpu) | 
| 3944 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); | 4131 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); | 
| 3945 | |||
| 3946 | rcu_early_boot_tests(); | ||
| 3947 | } | 4132 | } | 
| 3948 | 4133 | ||
| 3949 | #include "tree_plugin.h" | 4134 | #include "tree_plugin.h" | 
| diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 119de399eb2f..a69d3dab2ec4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
| @@ -141,12 +141,20 @@ struct rcu_node { | |||
| 141 | /* complete (only for PREEMPT_RCU). */ | 141 | /* complete (only for PREEMPT_RCU). */ | 
| 142 | unsigned long qsmaskinit; | 142 | unsigned long qsmaskinit; | 
| 143 | /* Per-GP initial value for qsmask & expmask. */ | 143 | /* Per-GP initial value for qsmask & expmask. */ | 
| 144 | /* Initialized from ->qsmaskinitnext at the */ | ||
| 145 | /* beginning of each grace period. */ | ||
| 146 | unsigned long qsmaskinitnext; | ||
| 147 | /* Online CPUs for next grace period. */ | ||
| 144 | unsigned long grpmask; /* Mask to apply to parent qsmask. */ | 148 | unsigned long grpmask; /* Mask to apply to parent qsmask. */ | 
| 145 | /* Only one bit will be set in this mask. */ | 149 | /* Only one bit will be set in this mask. */ | 
| 146 | int grplo; /* lowest-numbered CPU or group here. */ | 150 | int grplo; /* lowest-numbered CPU or group here. */ | 
| 147 | int grphi; /* highest-numbered CPU or group here. */ | 151 | int grphi; /* highest-numbered CPU or group here. */ | 
| 148 | u8 grpnum; /* CPU/group number for next level up. */ | 152 | u8 grpnum; /* CPU/group number for next level up. */ | 
| 149 | u8 level; /* root is at level 0. */ | 153 | u8 level; /* root is at level 0. */ | 
| 154 | bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */ | ||
| 155 | /* exit RCU read-side critical sections */ | ||
| 156 | /* before propagating offline up the */ | ||
| 157 | /* rcu_node tree? */ | ||
| 150 | struct rcu_node *parent; | 158 | struct rcu_node *parent; | 
| 151 | struct list_head blkd_tasks; | 159 | struct list_head blkd_tasks; | 
| 152 | /* Tasks blocked in RCU read-side critical */ | 160 | /* Tasks blocked in RCU read-side critical */ | 
| @@ -448,8 +456,6 @@ struct rcu_state { | |||
| 448 | long qlen; /* Total number of callbacks. */ | 456 | long qlen; /* Total number of callbacks. */ | 
| 449 | /* End of fields guarded by orphan_lock. */ | 457 | /* End of fields guarded by orphan_lock. */ | 
| 450 | 458 | ||
| 451 | struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */ | ||
| 452 | |||
| 453 | struct mutex barrier_mutex; /* Guards barrier fields. */ | 459 | struct mutex barrier_mutex; /* Guards barrier fields. */ | 
| 454 | atomic_t barrier_cpu_count; /* # CPUs waiting on. */ | 460 | atomic_t barrier_cpu_count; /* # CPUs waiting on. */ | 
| 455 | struct completion barrier_completion; /* Wake at barrier end. */ | 461 | struct completion barrier_completion; /* Wake at barrier end. */ | 
| @@ -559,6 +565,7 @@ static void rcu_prepare_kthreads(int cpu); | |||
| 559 | static void rcu_cleanup_after_idle(void); | 565 | static void rcu_cleanup_after_idle(void); | 
| 560 | static void rcu_prepare_for_idle(void); | 566 | static void rcu_prepare_for_idle(void); | 
| 561 | static void rcu_idle_count_callbacks_posted(void); | 567 | static void rcu_idle_count_callbacks_posted(void); | 
| 568 | static bool rcu_preempt_has_tasks(struct rcu_node *rnp); | ||
| 562 | static void print_cpu_stall_info_begin(void); | 569 | static void print_cpu_stall_info_begin(void); | 
| 563 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); | 570 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); | 
| 564 | static void print_cpu_stall_info_end(void); | 571 | static void print_cpu_stall_info_end(void); | 
| diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0a571e9a0f1d..8c0ec0f5a027 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
| @@ -58,38 +58,33 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ | |||
| 58 | */ | 58 | */ | 
| 59 | static void __init rcu_bootup_announce_oddness(void) | 59 | static void __init rcu_bootup_announce_oddness(void) | 
| 60 | { | 60 | { | 
| 61 | #ifdef CONFIG_RCU_TRACE | 61 | if (IS_ENABLED(CONFIG_RCU_TRACE)) | 
| 62 | pr_info("\tRCU debugfs-based tracing is enabled.\n"); | 62 | pr_info("\tRCU debugfs-based tracing is enabled.\n"); | 
| 63 | #endif | 63 | if ((IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || | 
| 64 | #if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) | 64 | (!IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)) | 
| 65 | pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", | 65 | pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", | 
| 66 | CONFIG_RCU_FANOUT); | 66 | CONFIG_RCU_FANOUT); | 
| 67 | #endif | 67 | if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) | 
| 68 | #ifdef CONFIG_RCU_FANOUT_EXACT | 68 | pr_info("\tHierarchical RCU autobalancing is disabled.\n"); | 
| 69 | pr_info("\tHierarchical RCU autobalancing is disabled.\n"); | 69 | if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ)) | 
| 70 | #endif | 70 | pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); | 
| 71 | #ifdef CONFIG_RCU_FAST_NO_HZ | 71 | if (IS_ENABLED(CONFIG_PROVE_RCU)) | 
| 72 | pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); | 72 | pr_info("\tRCU lockdep checking is enabled.\n"); | 
| 73 | #endif | 73 | if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE)) | 
| 74 | #ifdef CONFIG_PROVE_RCU | 74 | pr_info("\tRCU torture testing starts during boot.\n"); | 
| 75 | pr_info("\tRCU lockdep checking is enabled.\n"); | 75 | if (IS_ENABLED(CONFIG_RCU_CPU_STALL_INFO)) | 
| 76 | #endif | 76 | pr_info("\tAdditional per-CPU info printed with stalls.\n"); | 
| 77 | #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE | 77 | if (NUM_RCU_LVL_4 != 0) | 
| 78 | pr_info("\tRCU torture testing starts during boot.\n"); | 78 | pr_info("\tFour-level hierarchy is enabled.\n"); | 
| 79 | #endif | 79 | if (CONFIG_RCU_FANOUT_LEAF != 16) | 
| 80 | #if defined(CONFIG_RCU_CPU_STALL_INFO) | 80 | pr_info("\tBuild-time adjustment of leaf fanout to %d.\n", | 
| 81 | pr_info("\tAdditional per-CPU info printed with stalls.\n"); | 81 | CONFIG_RCU_FANOUT_LEAF); | 
| 82 | #endif | ||
| 83 | #if NUM_RCU_LVL_4 != 0 | ||
| 84 | pr_info("\tFour-level hierarchy is enabled.\n"); | ||
| 85 | #endif | ||
| 86 | if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) | 82 | if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) | 
| 87 | pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); | 83 | pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); | 
| 88 | if (nr_cpu_ids != NR_CPUS) | 84 | if (nr_cpu_ids != NR_CPUS) | 
| 89 | pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); | 85 | pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); | 
| 90 | #ifdef CONFIG_RCU_BOOST | 86 | if (IS_ENABLED(CONFIG_RCU_BOOST)) | 
| 91 | pr_info("\tRCU kthread priority: %d.\n", kthread_prio); | 87 | pr_info("\tRCU kthread priority: %d.\n", kthread_prio); | 
| 92 | #endif | ||
| 93 | } | 88 | } | 
| 94 | 89 | ||
| 95 | #ifdef CONFIG_PREEMPT_RCU | 90 | #ifdef CONFIG_PREEMPT_RCU | 
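The rewritten rcu_bootup_announce_oddness() above trades #ifdef blocks for if (IS_ENABLED(CONFIG_...)), so the disabled branches are still parsed and type-checked and then discarded by the optimizer. The fragment below illustrates that point with a deliberately simplified stand-in macro; the real IS_ENABLED() in <linux/kconfig.h> also copes with =m and unset symbols and is not reproduced here.

    /* Minimal illustration of the #ifdef -> IS_ENABLED() conversion. */
    #include <stdio.h>

    #define CONFIG_DEMO_TRACE 1             /* pretend Kconfig set this to y */
    #define IS_ENABLED(option) (option)     /* simplified stand-in           */

    int main(void)
    {
        if (IS_ENABLED(CONFIG_DEMO_TRACE))
            printf("\tDemo tracing is enabled.\n");
        /* With #ifdef, a typo inside a disabled block would go unnoticed
         * until someone built with that option turned on. */
        return 0;
    }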
| @@ -180,7 +175,7 @@ static void rcu_preempt_note_context_switch(void) | |||
| 180 | * But first, note that the current CPU must still be | 175 | * But first, note that the current CPU must still be | 
| 181 | * on line! | 176 | * on line! | 
| 182 | */ | 177 | */ | 
| 183 | WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); | 178 | WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0); | 
| 184 | WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); | 179 | WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); | 
| 185 | if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { | 180 | if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { | 
| 186 | list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); | 181 | list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); | 
| @@ -233,43 +228,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) | |||
| 233 | } | 228 | } | 
| 234 | 229 | ||
| 235 | /* | 230 | /* | 
| 236 | * Record a quiescent state for all tasks that were previously queued | ||
| 237 | * on the specified rcu_node structure and that were blocking the current | ||
| 238 | * RCU grace period. The caller must hold the specified rnp->lock with | ||
| 239 | * irqs disabled, and this lock is released upon return, but irqs remain | ||
| 240 | * disabled. | ||
| 241 | */ | ||
| 242 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | ||
| 243 | __releases(rnp->lock) | ||
| 244 | { | ||
| 245 | unsigned long mask; | ||
| 246 | struct rcu_node *rnp_p; | ||
| 247 | |||
| 248 | if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { | ||
| 249 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 250 | return; /* Still need more quiescent states! */ | ||
| 251 | } | ||
| 252 | |||
| 253 | rnp_p = rnp->parent; | ||
| 254 | if (rnp_p == NULL) { | ||
| 255 | /* | ||
| 256 | * Either there is only one rcu_node in the tree, | ||
| 257 | * or tasks were kicked up to root rcu_node due to | ||
| 258 | * CPUs going offline. | ||
| 259 | */ | ||
| 260 | rcu_report_qs_rsp(&rcu_preempt_state, flags); | ||
| 261 | return; | ||
| 262 | } | ||
| 263 | |||
| 264 | /* Report up the rest of the hierarchy. */ | ||
| 265 | mask = rnp->grpmask; | ||
| 266 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
| 267 | raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ | ||
| 268 | smp_mb__after_unlock_lock(); | ||
| 269 | rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); | ||
| 270 | } | ||
| 271 | |||
| 272 | /* | ||
| 273 | * Advance a ->blkd_tasks-list pointer to the next entry, instead | 231 | * Advance a ->blkd_tasks-list pointer to the next entry, instead | 
| 274 | * returning NULL if at the end of the list. | 232 | * returning NULL if at the end of the list. | 
| 275 | */ | 233 | */ | 
| @@ -300,7 +258,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) | |||
| 300 | */ | 258 | */ | 
| 301 | void rcu_read_unlock_special(struct task_struct *t) | 259 | void rcu_read_unlock_special(struct task_struct *t) | 
| 302 | { | 260 | { | 
| 303 | bool empty; | ||
| 304 | bool empty_exp; | 261 | bool empty_exp; | 
| 305 | bool empty_norm; | 262 | bool empty_norm; | 
| 306 | bool empty_exp_now; | 263 | bool empty_exp_now; | 
| @@ -334,7 +291,13 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
| 334 | } | 291 | } | 
| 335 | 292 | ||
| 336 | /* Hardware IRQ handlers cannot block, complain if they get here. */ | 293 | /* Hardware IRQ handlers cannot block, complain if they get here. */ | 
| 337 | if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) { | 294 | if (in_irq() || in_serving_softirq()) { | 
| 295 | lockdep_rcu_suspicious(__FILE__, __LINE__, | ||
| 296 | "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n"); | ||
| 297 | pr_alert("->rcu_read_unlock_special: %#x (b: %d, nq: %d)\n", | ||
| 298 | t->rcu_read_unlock_special.s, | ||
| 299 | t->rcu_read_unlock_special.b.blocked, | ||
| 300 | t->rcu_read_unlock_special.b.need_qs); | ||
| 338 | local_irq_restore(flags); | 301 | local_irq_restore(flags); | 
| 339 | return; | 302 | return; | 
| 340 | } | 303 | } | 
| @@ -356,7 +319,6 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
| 356 | break; | 319 | break; | 
| 357 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 320 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 
| 358 | } | 321 | } | 
| 359 | empty = !rcu_preempt_has_tasks(rnp); | ||
| 360 | empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); | 322 | empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); | 
| 361 | empty_exp = !rcu_preempted_readers_exp(rnp); | 323 | empty_exp = !rcu_preempted_readers_exp(rnp); | 
| 362 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ | 324 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ | 
| @@ -377,14 +339,6 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
| 377 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 339 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 
| 378 | 340 | ||
| 379 | /* | 341 | /* | 
| 380 | * If this was the last task on the list, go see if we | ||
| 381 | * need to propagate ->qsmaskinit bit clearing up the | ||
| 382 | * rcu_node tree. | ||
| 383 | */ | ||
| 384 | if (!empty && !rcu_preempt_has_tasks(rnp)) | ||
| 385 | rcu_cleanup_dead_rnp(rnp); | ||
| 386 | |||
| 387 | /* | ||
| 388 | * If this was the last task on the current list, and if | 342 | * If this was the last task on the current list, and if | 
| 389 | * we aren't waiting on any CPUs, report the quiescent state. | 343 | * we aren't waiting on any CPUs, report the quiescent state. | 
| 390 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, | 344 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, | 
| @@ -399,7 +353,8 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
| 399 | rnp->grplo, | 353 | rnp->grplo, | 
| 400 | rnp->grphi, | 354 | rnp->grphi, | 
| 401 | !!rnp->gp_tasks); | 355 | !!rnp->gp_tasks); | 
| 402 | rcu_report_unblock_qs_rnp(rnp, flags); | 356 | rcu_report_unblock_qs_rnp(&rcu_preempt_state, | 
| 357 | rnp, flags); | ||
| 403 | } else { | 358 | } else { | 
| 404 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 359 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 
| 405 | } | 360 | } | 
| @@ -520,10 +475,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) | |||
| 520 | WARN_ON_ONCE(rnp->qsmask); | 475 | WARN_ON_ONCE(rnp->qsmask); | 
| 521 | } | 476 | } | 
| 522 | 477 | ||
| 523 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 524 | |||
| 525 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 526 | |||
| 527 | /* | 478 | /* | 
| 528 | * Check for a quiescent state from the current CPU. When a task blocks, | 479 | * Check for a quiescent state from the current CPU. When a task blocks, | 
| 529 | * the task is recorded in the corresponding CPU's rcu_node structure, | 480 | * the task is recorded in the corresponding CPU's rcu_node structure, | 
| @@ -585,7 +536,7 @@ void synchronize_rcu(void) | |||
| 585 | "Illegal synchronize_rcu() in RCU read-side critical section"); | 536 | "Illegal synchronize_rcu() in RCU read-side critical section"); | 
| 586 | if (!rcu_scheduler_active) | 537 | if (!rcu_scheduler_active) | 
| 587 | return; | 538 | return; | 
| 588 | if (rcu_expedited) | 539 | if (rcu_gp_is_expedited()) | 
| 589 | synchronize_rcu_expedited(); | 540 | synchronize_rcu_expedited(); | 
| 590 | else | 541 | else | 
| 591 | wait_rcu_gp(call_rcu); | 542 | wait_rcu_gp(call_rcu); | 
| @@ -630,9 +581,6 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) | |||
| 630 | * recursively up the tree. (Calm down, calm down, we do the recursion | 581 | * recursively up the tree. (Calm down, calm down, we do the recursion | 
| 631 | * iteratively!) | 582 | * iteratively!) | 
| 632 | * | 583 | * | 
| 633 | * Most callers will set the "wake" flag, but the task initiating the | ||
| 634 | * expedited grace period need not wake itself. | ||
| 635 | * | ||
| 636 | * Caller must hold sync_rcu_preempt_exp_mutex. | 584 | * Caller must hold sync_rcu_preempt_exp_mutex. | 
| 637 | */ | 585 | */ | 
| 638 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | 586 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | 
| @@ -667,29 +615,85 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 667 | 615 | ||
| 668 | /* | 616 | /* | 
| 669 | * Snapshot the tasks blocking the newly started preemptible-RCU expedited | 617 | * Snapshot the tasks blocking the newly started preemptible-RCU expedited | 
| 670 | * grace period for the specified rcu_node structure. If there are no such | 618 | * grace period for the specified rcu_node structure, phase 1. If there | 
| 671 | * tasks, report it up the rcu_node hierarchy. | 619 | * are such tasks, set the ->expmask bits up the rcu_node tree and also | 
| 620 | * set the ->expmask bits on the leaf rcu_node structures to tell phase 2 | ||
| 621 | * that work is needed here. | ||
| 672 | * | 622 | * | 
| 673 | * Caller must hold sync_rcu_preempt_exp_mutex and must exclude | 623 | * Caller must hold sync_rcu_preempt_exp_mutex. | 
| 674 | * CPU hotplug operations. | ||
| 675 | */ | 624 | */ | 
| 676 | static void | 625 | static void | 
| 677 | sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | 626 | sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp) | 
| 678 | { | 627 | { | 
| 679 | unsigned long flags; | 628 | unsigned long flags; | 
| 680 | int must_wait = 0; | 629 | unsigned long mask; | 
| 630 | struct rcu_node *rnp_up; | ||
| 681 | 631 | ||
| 682 | raw_spin_lock_irqsave(&rnp->lock, flags); | 632 | raw_spin_lock_irqsave(&rnp->lock, flags); | 
| 683 | smp_mb__after_unlock_lock(); | 633 | smp_mb__after_unlock_lock(); | 
| 634 | WARN_ON_ONCE(rnp->expmask); | ||
| 635 | WARN_ON_ONCE(rnp->exp_tasks); | ||
| 684 | if (!rcu_preempt_has_tasks(rnp)) { | 636 | if (!rcu_preempt_has_tasks(rnp)) { | 
| 637 | /* No blocked tasks, nothing to do. */ | ||
| 685 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 638 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 
| 686 | } else { | 639 | return; | 
| 640 | } | ||
| 641 | /* Call for Phase 2 and propagate ->expmask bits up the tree. */ | ||
| 642 | rnp->expmask = 1; | ||
| 643 | rnp_up = rnp; | ||
| 644 | while (rnp_up->parent) { | ||
| 645 | mask = rnp_up->grpmask; | ||
| 646 | rnp_up = rnp_up->parent; | ||
| 647 | if (rnp_up->expmask & mask) | ||
| 648 | break; | ||
| 649 | raw_spin_lock(&rnp_up->lock); /* irqs already off */ | ||
| 650 | smp_mb__after_unlock_lock(); | ||
| 651 | rnp_up->expmask |= mask; | ||
| 652 | raw_spin_unlock(&rnp_up->lock); /* irqs still off */ | ||
| 653 | } | ||
| 654 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 655 | } | ||
| 656 | |||
| 657 | /* | ||
| 658 | * Snapshot the tasks blocking the newly started preemptible-RCU expedited | ||
| 659 | * grace period for the specified rcu_node structure, phase 2. If the | ||
| 660 | * leaf rcu_node structure has its ->expmask field set, check for tasks. | ||
| 661 | * If there are some, clear ->expmask and set ->exp_tasks accordingly, | ||
| 662 | * then initiate RCU priority boosting. Otherwise, clear ->expmask and | ||
| 663 | * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits, | ||
| 664 | * enabling rcu_read_unlock_special() to do the bit-clearing. | ||
| 665 | * | ||
| 666 | * Caller must hold sync_rcu_preempt_exp_mutex. | ||
| 667 | */ | ||
| 668 | static void | ||
| 669 | sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp) | ||
| 670 | { | ||
| 671 | unsigned long flags; | ||
| 672 | |||
| 673 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 674 | smp_mb__after_unlock_lock(); | ||
| 675 | if (!rnp->expmask) { | ||
| 676 | /* Phase 1 didn't do anything, so Phase 2 doesn't either. */ | ||
| 677 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 678 | return; | ||
| 679 | } | ||
| 680 | |||
| 681 | /* Phase 1 is over. */ | ||
| 682 | rnp->expmask = 0; | ||
| 683 | |||
| 684 | /* | ||
| 685 | * If there are still blocked tasks, set up ->exp_tasks so that | ||
| 686 | * rcu_read_unlock_special() will wake us and then boost them. | ||
| 687 | */ | ||
| 688 | if (rcu_preempt_has_tasks(rnp)) { | ||
| 687 | rnp->exp_tasks = rnp->blkd_tasks.next; | 689 | rnp->exp_tasks = rnp->blkd_tasks.next; | 
| 688 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ | 690 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ | 
| 689 | must_wait = 1; | 691 | return; | 
| 690 | } | 692 | } | 
| 691 | if (!must_wait) | 693 | |
| 692 | rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ | 694 | /* No longer any blocked tasks, so undo bit setting. */ | 
| 695 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 696 | rcu_report_exp_rnp(rsp, rnp, false); | ||
| 693 | } | 697 | } | 
| 694 | 698 | ||
| 695 | /** | 699 | /** | 
| @@ -706,7 +710,6 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | |||
| 706 | */ | 710 | */ | 
| 707 | void synchronize_rcu_expedited(void) | 711 | void synchronize_rcu_expedited(void) | 
| 708 | { | 712 | { | 
| 709 | unsigned long flags; | ||
| 710 | struct rcu_node *rnp; | 713 | struct rcu_node *rnp; | 
| 711 | struct rcu_state *rsp = &rcu_preempt_state; | 714 | struct rcu_state *rsp = &rcu_preempt_state; | 
| 712 | unsigned long snap; | 715 | unsigned long snap; | 
| @@ -757,19 +760,16 @@ void synchronize_rcu_expedited(void) | |||
| 757 | /* force all RCU readers onto ->blkd_tasks lists. */ | 760 | /* force all RCU readers onto ->blkd_tasks lists. */ | 
| 758 | synchronize_sched_expedited(); | 761 | synchronize_sched_expedited(); | 
| 759 | 762 | ||
| 760 | /* Initialize ->expmask for all non-leaf rcu_node structures. */ | 763 | /* | 
| 761 | rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { | 764 | * Snapshot current state of ->blkd_tasks lists into ->expmask. | 
| 762 | raw_spin_lock_irqsave(&rnp->lock, flags); | 765 | * Phase 1 sets bits and phase 2 permits rcu_read_unlock_special() | 
| 763 | smp_mb__after_unlock_lock(); | 766 | * to start clearing them. Doing this in one phase leads to | 
| 764 | rnp->expmask = rnp->qsmaskinit; | 767 | * strange races between setting and clearing bits, so just say "no"! | 
| 765 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 768 | */ | 
| 766 | } | 769 | rcu_for_each_leaf_node(rsp, rnp) | 
| 767 | 770 | sync_rcu_preempt_exp_init1(rsp, rnp); | |
| 768 | /* Snapshot current state of ->blkd_tasks lists. */ | ||
| 769 | rcu_for_each_leaf_node(rsp, rnp) | 771 | rcu_for_each_leaf_node(rsp, rnp) | 
| 770 | sync_rcu_preempt_exp_init(rsp, rnp); | 772 | sync_rcu_preempt_exp_init2(rsp, rnp); | 
| 771 | if (NUM_RCU_NODES > 1) | ||
| 772 | sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); | ||
| 773 | 773 | ||
| 774 | put_online_cpus(); | 774 | put_online_cpus(); | 
| 775 | 775 | ||
| @@ -859,8 +859,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) | |||
| 859 | return 0; | 859 | return 0; | 
| 860 | } | 860 | } | 
| 861 | 861 | ||
| 862 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 863 | |||
| 864 | /* | 862 | /* | 
| 865 | * Because there is no preemptible RCU, there can be no readers blocked. | 863 | * Because there is no preemptible RCU, there can be no readers blocked. | 
| 866 | */ | 864 | */ | 
| @@ -869,8 +867,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) | |||
| 869 | return false; | 867 | return false; | 
| 870 | } | 868 | } | 
| 871 | 869 | ||
| 872 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 873 | |||
| 874 | /* | 870 | /* | 
| 875 | * Because preemptible RCU does not exist, we never have to check for | 871 | * Because preemptible RCU does not exist, we never have to check for | 
| 876 | * tasks blocked within RCU read-side critical sections. | 872 | * tasks blocked within RCU read-side critical sections. | 
| @@ -1170,7 +1166,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) | |||
| 1170 | * Returns zero if all is well, a negated errno otherwise. | 1166 | * Returns zero if all is well, a negated errno otherwise. | 
| 1171 | */ | 1167 | */ | 
| 1172 | static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | 1168 | static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | 
| 1173 | struct rcu_node *rnp) | 1169 | struct rcu_node *rnp) | 
| 1174 | { | 1170 | { | 
| 1175 | int rnp_index = rnp - &rsp->node[0]; | 1171 | int rnp_index = rnp - &rsp->node[0]; | 
| 1176 | unsigned long flags; | 1172 | unsigned long flags; | 
| @@ -1180,7 +1176,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | |||
| 1180 | if (&rcu_preempt_state != rsp) | 1176 | if (&rcu_preempt_state != rsp) | 
| 1181 | return 0; | 1177 | return 0; | 
| 1182 | 1178 | ||
| 1183 | if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0) | 1179 | if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0) | 
| 1184 | return 0; | 1180 | return 0; | 
| 1185 | 1181 | ||
| 1186 | rsp->boost = 1; | 1182 | rsp->boost = 1; | 
| @@ -1273,7 +1269,7 @@ static void rcu_cpu_kthread(unsigned int cpu) | |||
| 1273 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | 1269 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | 
| 1274 | { | 1270 | { | 
| 1275 | struct task_struct *t = rnp->boost_kthread_task; | 1271 | struct task_struct *t = rnp->boost_kthread_task; | 
| 1276 | unsigned long mask = rnp->qsmaskinit; | 1272 | unsigned long mask = rcu_rnp_online_cpus(rnp); | 
| 1277 | cpumask_var_t cm; | 1273 | cpumask_var_t cm; | 
| 1278 | int cpu; | 1274 | int cpu; | 
| 1279 | 1275 | ||
| @@ -1945,7 +1941,8 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) | |||
| 1945 | rhp = ACCESS_ONCE(rdp->nocb_follower_head); | 1941 | rhp = ACCESS_ONCE(rdp->nocb_follower_head); | 
| 1946 | 1942 | ||
| 1947 | /* Having no rcuo kthread but CBs after scheduler starts is bad! */ | 1943 | /* Having no rcuo kthread but CBs after scheduler starts is bad! */ | 
| 1948 | if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp) { | 1944 | if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp && | 
| 1945 | rcu_scheduler_fully_active) { | ||
| 1949 | /* RCU callback enqueued before CPU first came online??? */ | 1946 | /* RCU callback enqueued before CPU first came online??? */ | 
| 1950 | pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", | 1947 | pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", | 
| 1951 | cpu, rhp->func); | 1948 | cpu, rhp->func); | 
| @@ -2392,18 +2389,8 @@ void __init rcu_init_nohz(void) | |||
| 2392 | pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); | 2389 | pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); | 
| 2393 | 2390 | ||
| 2394 | for_each_rcu_flavor(rsp) { | 2391 | for_each_rcu_flavor(rsp) { | 
| 2395 | for_each_cpu(cpu, rcu_nocb_mask) { | 2392 | for_each_cpu(cpu, rcu_nocb_mask) | 
| 2396 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 2393 | init_nocb_callback_list(per_cpu_ptr(rsp->rda, cpu)); | 
| 2397 | |||
| 2398 | /* | ||
| 2399 | * If there are early callbacks, they will need | ||
| 2400 | * to be moved to the nocb lists. | ||
| 2401 | */ | ||
| 2402 | WARN_ON_ONCE(rdp->nxttail[RCU_NEXT_TAIL] != | ||
| 2403 | &rdp->nxtlist && | ||
| 2404 | rdp->nxttail[RCU_NEXT_TAIL] != NULL); | ||
| 2405 | init_nocb_callback_list(rdp); | ||
| 2406 | } | ||
| 2407 | rcu_organize_nocb_kthreads(rsp); | 2394 | rcu_organize_nocb_kthreads(rsp); | 
| 2408 | } | 2395 | } | 
| 2409 | } | 2396 | } | 
| @@ -2540,6 +2527,16 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) | |||
| 2540 | if (!rcu_is_nocb_cpu(rdp->cpu)) | 2527 | if (!rcu_is_nocb_cpu(rdp->cpu)) | 
| 2541 | return false; | 2528 | return false; | 
| 2542 | 2529 | ||
| 2530 | /* If there are early-boot callbacks, move them to nocb lists. */ | ||
| 2531 | if (rdp->nxtlist) { | ||
| 2532 | rdp->nocb_head = rdp->nxtlist; | ||
| 2533 | rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL]; | ||
| 2534 | atomic_long_set(&rdp->nocb_q_count, rdp->qlen); | ||
| 2535 | atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy); | ||
| 2536 | rdp->nxtlist = NULL; | ||
| 2537 | rdp->qlen = 0; | ||
| 2538 | rdp->qlen_lazy = 0; | ||
| 2539 | } | ||
| 2543 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | 2540 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | 
| 2544 | return true; | 2541 | return true; | 
| 2545 | } | 2542 | } | 
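The early-boot handoff added to init_nocb_callback_list() above is a constant-time list splice: the nocb queue takes over the head and tail pointers of whatever was already queued on ->nxtlist and absorbs the length counters. A generic sketch of that tail-pointer splice idiom (names are illustrative, not the kernel's rcu_data fields):

    /*
     * Constant-time splice of a singly-linked callback list that keeps a
     * pointer to the last ->next field as its tail. Generic sketch.
     */
    #include <stdio.h>

    struct cb {
        struct cb *next;
        int id;
    };

    struct queue {
        struct cb *head;
        struct cb **tail;       /* address of the last ->next pointer */
        long count;
    };

    static void queue_init(struct queue *q)
    {
        q->head = NULL;
        q->tail = &q->head;
        q->count = 0;
    }

    static void enqueue(struct queue *q, struct cb *cb)
    {
        cb->next = NULL;
        *q->tail = cb;
        q->tail = &cb->next;
        q->count++;
    }

    /* Move everything queued on @src onto @dst and empty @src. */
    static void splice(struct queue *dst, struct queue *src)
    {
        if (!src->head)
            return;
        *dst->tail = src->head;
        dst->tail = src->tail;
        dst->count += src->count;
        queue_init(src);
    }

    int main(void)
    {
        struct queue early, nocb;
        struct cb a = { .id = 1 }, b = { .id = 2 };

        queue_init(&early);
        queue_init(&nocb);
        enqueue(&early, &a);            /* callbacks posted before the */
        enqueue(&early, &b);            /* nocb kthreads exist */
        splice(&nocb, &early);          /* roughly the handoff done above */
        printf("nocb: %ld callbacks, early: %ld\n", nocb.count, early.count);
        return 0;
    }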
| @@ -2763,7 +2760,8 @@ static void rcu_sysidle_exit(int irq) | |||
| 2763 | 2760 | ||
| 2764 | /* | 2761 | /* | 
| 2765 | * Check to see if the current CPU is idle. Note that usermode execution | 2762 | * Check to see if the current CPU is idle. Note that usermode execution | 
| 2766 | * does not count as idle. The caller must have disabled interrupts. | 2763 | * does not count as idle. The caller must have disabled interrupts, | 
| 2764 | * and must be running on tick_do_timer_cpu. | ||
| 2767 | */ | 2765 | */ | 
| 2768 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | 2766 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | 
| 2769 | unsigned long *maxj) | 2767 | unsigned long *maxj) | 
| @@ -2784,8 +2782,8 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | |||
| 2784 | if (!*isidle || rdp->rsp != rcu_state_p || | 2782 | if (!*isidle || rdp->rsp != rcu_state_p || | 
| 2785 | cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) | 2783 | cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) | 
| 2786 | return; | 2784 | return; | 
| 2787 | if (rcu_gp_in_progress(rdp->rsp)) | 2785 | /* Verify affinity of current kthread. */ | 
| 2788 | WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); | 2786 | WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); | 
| 2789 | 2787 | ||
| 2790 | /* Pick up current idle and NMI-nesting counter and check. */ | 2788 | /* Pick up current idle and NMI-nesting counter and check. */ | 
| 2791 | cur = atomic_read(&rdtp->dynticks_idle); | 2789 | cur = atomic_read(&rdtp->dynticks_idle); | 
| @@ -3068,11 +3066,10 @@ static void rcu_bind_gp_kthread(void) | |||
| 3068 | return; | 3066 | return; | 
| 3069 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | 3067 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | 
| 3070 | cpu = tick_do_timer_cpu; | 3068 | cpu = tick_do_timer_cpu; | 
| 3071 | if (cpu >= 0 && cpu < nr_cpu_ids && raw_smp_processor_id() != cpu) | 3069 | if (cpu >= 0 && cpu < nr_cpu_ids) | 
| 3072 | set_cpus_allowed_ptr(current, cpumask_of(cpu)); | 3070 | set_cpus_allowed_ptr(current, cpumask_of(cpu)); | 
| 3073 | #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | 3071 | #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | 
| 3074 | if (!is_housekeeping_cpu(raw_smp_processor_id())) | 3072 | housekeeping_affine(current); | 
| 3075 | housekeeping_affine(current); | ||
| 3076 | #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | 3073 | #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | 
| 3077 | } | 3074 | } | 
| 3078 | 3075 | ||
| diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index fbb6240509ea..f92361efd0f5 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c | |||
| @@ -283,8 +283,8 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
| 283 | seq_puts(m, "\n"); | 283 | seq_puts(m, "\n"); | 
| 284 | level = rnp->level; | 284 | level = rnp->level; | 
| 285 | } | 285 | } | 
| 286 | seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ", | 286 | seq_printf(m, "%lx/%lx->%lx %c%c>%c %d:%d ^%d ", | 
| 287 | rnp->qsmask, rnp->qsmaskinit, | 287 | rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext, | 
| 288 | ".G"[rnp->gp_tasks != NULL], | 288 | ".G"[rnp->gp_tasks != NULL], | 
| 289 | ".E"[rnp->exp_tasks != NULL], | 289 | ".E"[rnp->exp_tasks != NULL], | 
| 290 | ".T"[!list_empty(&rnp->blkd_tasks)], | 290 | ".T"[!list_empty(&rnp->blkd_tasks)], | 
| diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index e0d31a345ee6..1f133350da01 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c | |||
| @@ -62,6 +62,63 @@ MODULE_ALIAS("rcupdate"); | |||
| 62 | 62 | ||
| 63 | module_param(rcu_expedited, int, 0); | 63 | module_param(rcu_expedited, int, 0); | 
| 64 | 64 | ||
| 65 | #ifndef CONFIG_TINY_RCU | ||
| 66 | |||
| 67 | static atomic_t rcu_expedited_nesting = | ||
| 68 | ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0); | ||
| 69 | |||
| 70 | /* | ||
| 71 | * Should normal grace-period primitives be expedited? Intended for | ||
| 72 | * use within RCU. Note that this function takes the rcu_expedited | ||
| 73 | * sysfs/boot variable into account as well as the rcu_expedite_gp() | ||
| 74 | * nesting. So looping on rcu_unexpedite_gp() until rcu_gp_is_expedited() | ||
| 75 | * returns false is a -really- bad idea. | ||
| 76 | */ | ||
| 77 | bool rcu_gp_is_expedited(void) | ||
| 78 | { | ||
| 79 | return rcu_expedited || atomic_read(&rcu_expedited_nesting); | ||
| 80 | } | ||
| 81 | EXPORT_SYMBOL_GPL(rcu_gp_is_expedited); | ||
| 82 | |||
| 83 | /** | ||
| 84 | * rcu_expedite_gp - Expedite future RCU grace periods | ||
| 85 | * | ||
| 86 | * After a call to this function, future calls to synchronize_rcu() and | ||
| 87 | * friends act as if the corresponding synchronize_rcu_expedited() function |||
| 88 | * had been called instead. |||
| 89 | */ | ||
| 90 | void rcu_expedite_gp(void) | ||
| 91 | { | ||
| 92 | atomic_inc(&rcu_expedited_nesting); | ||
| 93 | } | ||
| 94 | EXPORT_SYMBOL_GPL(rcu_expedite_gp); | ||
| 95 | |||
| 96 | /** | ||
| 97 | * rcu_unexpedite_gp - Cancel prior rcu_expedite_gp() invocation | ||
| 98 | * | ||
| 99 | * Undo a prior call to rcu_expedite_gp(). If all prior calls to | ||
| 100 | * rcu_expedite_gp() are undone by a subsequent call to rcu_unexpedite_gp(), | ||
| 101 | * and if the rcu_expedited sysfs/boot parameter is not set, then all | ||
| 102 | * subsequent calls to synchronize_rcu() and friends will return to | ||
| 103 | * their normal non-expedited behavior. | ||
| 104 | */ | ||
| 105 | void rcu_unexpedite_gp(void) | ||
| 106 | { | ||
| 107 | atomic_dec(&rcu_expedited_nesting); | ||
| 108 | } | ||
| 109 | EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); | ||
| 110 | |||
| 111 | #endif /* #ifndef CONFIG_TINY_RCU */ | ||
| 112 | |||
| 113 | /* | ||
| 114 | * Inform RCU of the end of the in-kernel boot sequence. | ||
| 115 | */ | ||
| 116 | void rcu_end_inkernel_boot(void) | ||
| 117 | { | ||
| 118 | if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT)) | ||
| 119 | rcu_unexpedite_gp(); | ||
| 120 | } | ||
| 121 | |||
| 65 | #ifdef CONFIG_PREEMPT_RCU | 122 | #ifdef CONFIG_PREEMPT_RCU | 
| 66 | 123 | ||
| 67 | /* | 124 | /* | 
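The new rcu_expedite_gp()/rcu_unexpedite_gp() pair keeps a nesting count, and rcu_gp_is_expedited() reports true while either that count or the rcu_expedited sysfs/boot flag is nonzero; with CONFIG_RCU_EXPEDITE_BOOT the count starts at 1 and rcu_end_inkernel_boot() drops that initial reference. A standalone model of the behavior, using C11 atomics in place of the kernel's atomic_t (a sketch, not the kernel implementation):

    /*
     * Model of the expedite-GP nesting counter: grace periods are treated
     * as expedited while either the sysfs/boot flag is set or at least one
     * caller holds an rcu_expedite_gp() reference.
     */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static int rcu_expedited;       /* models the sysfs/boot parameter */
    static atomic_int rcu_expedited_nesting = ATOMIC_VAR_INIT(1);
                                    /* as if CONFIG_RCU_EXPEDITE_BOOT=y */

    static bool rcu_gp_is_expedited(void)
    {
        return rcu_expedited || atomic_load(&rcu_expedited_nesting);
    }

    static void rcu_expedite_gp(void)
    {
        atomic_fetch_add(&rcu_expedited_nesting, 1);
    }

    static void rcu_unexpedite_gp(void)
    {
        atomic_fetch_sub(&rcu_expedited_nesting, 1);
    }

    static void rcu_end_inkernel_boot(void)
    {
        rcu_unexpedite_gp();        /* drop the boot-time reference */
    }

    int main(void)
    {
        printf("during boot: %d\n", rcu_gp_is_expedited());    /* 1 */
        rcu_end_inkernel_boot();
        printf("after boot:  %d\n", rcu_gp_is_expedited());    /* 0 */
        rcu_expedite_gp();
        printf("user nested: %d\n", rcu_gp_is_expedited());    /* 1 */
        rcu_unexpedite_gp();
        printf("undone:      %d\n", rcu_gp_is_expedited());    /* 0 */
        return 0;
    }

As the comment in the hunk warns, the flag and the nesting count are independent inputs, so spinning on rcu_unexpedite_gp() until rcu_gp_is_expedited() goes false can loop forever if the boot/sysfs flag is set.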
| @@ -199,16 +256,13 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); | |||
| 199 | 256 | ||
| 200 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 257 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 
| 201 | 258 | ||
| 202 | struct rcu_synchronize { | 259 | /** | 
| 203 | struct rcu_head head; | 260 | * wakeme_after_rcu() - Callback function to awaken a task after grace period | 
| 204 | struct completion completion; | 261 | * @head: Pointer to rcu_head member within rcu_synchronize structure | 
| 205 | }; | 262 | * | 
| 206 | 263 | * Awaken the corresponding task now that a grace period has elapsed. | |
| 207 | /* | ||
| 208 | * Awaken the corresponding synchronize_rcu() instance now that a | ||
| 209 | * grace period has elapsed. | ||
| 210 | */ | 264 | */ | 
| 211 | static void wakeme_after_rcu(struct rcu_head *head) | 265 | void wakeme_after_rcu(struct rcu_head *head) | 
| 212 | { | 266 | { | 
| 213 | struct rcu_synchronize *rcu; | 267 | struct rcu_synchronize *rcu; | 
| 214 | 268 | ||
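With struct rcu_synchronize moving out of this file and wakeme_after_rcu() losing its static marker, the callback half of the wait-for-grace-period pattern becomes reusable: the waiter embeds a completion next to an rcu_head, queues the callback, and sleeps until the callback fires. A rough userspace model of that handshake, with pthreads standing in for RCU and struct completion (compile with -pthread; all names are stand-ins):

    /*
     * Userspace model of the handshake served by wakeme_after_rcu(): the
     * waiter queues a callback and sleeps on a completion; the callback
     * sets the flag and wakes it.
     */
    #include <pthread.h>
    #include <stdio.h>

    struct rcu_head_model {
        void (*func)(struct rcu_head_model *);
    };

    struct rcu_synchronize_model {
        struct rcu_head_model head;     /* must stay the first member */
        pthread_mutex_t lock;
        pthread_cond_t cond;
        int done;
    };

    static void wakeme_after_gp(struct rcu_head_model *head)
    {
        /* Equivalent of container_of(): head is the first member. */
        struct rcu_synchronize_model *rcu =
            (struct rcu_synchronize_model *)head;

        pthread_mutex_lock(&rcu->lock);
        rcu->done = 1;
        pthread_cond_signal(&rcu->cond);
        pthread_mutex_unlock(&rcu->lock);
    }

    /* "Grace period" thread: invokes the queued callback. */
    static void *gp_thread(void *arg)
    {
        struct rcu_head_model *head = arg;

        head->func(head);
        return NULL;
    }

    int main(void)
    {
        static struct rcu_synchronize_model rcu = {
            .head = { .func = wakeme_after_gp },
            .lock = PTHREAD_MUTEX_INITIALIZER,
            .cond = PTHREAD_COND_INITIALIZER,
        };
        pthread_t t;

        pthread_create(&t, NULL, gp_thread, &rcu.head);
        pthread_mutex_lock(&rcu.lock);
        while (!rcu.done)               /* the wait_for_completion() step */
            pthread_cond_wait(&rcu.cond, &rcu.lock);
        pthread_mutex_unlock(&rcu.lock);
        pthread_join(t, NULL);
        printf("grace period elapsed, waiter woken\n");
        return 0;
    }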
| diff --git a/kernel/reboot.c b/kernel/reboot.c index 5925f5ae8dff..d20c85d9f8c0 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c | |||
| @@ -387,8 +387,9 @@ void ctrl_alt_del(void) | |||
| 387 | } | 387 | } | 
| 388 | 388 | ||
| 389 | char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; | 389 | char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; | 
| 390 | static const char reboot_cmd[] = "/sbin/reboot"; | ||
| 390 | 391 | ||
| 391 | static int __orderly_poweroff(bool force) | 392 | static int run_cmd(const char *cmd) | 
| 392 | { | 393 | { | 
| 393 | char **argv; | 394 | char **argv; | 
| 394 | static char *envp[] = { | 395 | static char *envp[] = { | 
| @@ -397,8 +398,7 @@ static int __orderly_poweroff(bool force) | |||
| 397 | NULL | 398 | NULL | 
| 398 | }; | 399 | }; | 
| 399 | int ret; | 400 | int ret; | 
| 400 | 401 | argv = argv_split(GFP_KERNEL, cmd, NULL); | |
| 401 | argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); | ||
| 402 | if (argv) { | 402 | if (argv) { | 
| 403 | ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); | 403 | ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); | 
| 404 | argv_free(argv); | 404 | argv_free(argv); | 
| @@ -406,8 +406,33 @@ static int __orderly_poweroff(bool force) | |||
| 406 | ret = -ENOMEM; | 406 | ret = -ENOMEM; | 
| 407 | } | 407 | } | 
| 408 | 408 | ||
| 409 | return ret; | ||
| 410 | } | ||
| 411 | |||
| 412 | static int __orderly_reboot(void) | ||
| 413 | { | ||
| 414 | int ret; | ||
| 415 | |||
| 416 | ret = run_cmd(reboot_cmd); | ||
| 417 | |||
| 418 | if (ret) { | ||
| 419 | pr_warn("Failed to start orderly reboot: forcing the issue\n"); | ||
| 420 | emergency_sync(); | ||
| 421 | kernel_restart(NULL); | ||
| 422 | } | ||
| 423 | |||
| 424 | return ret; | ||
| 425 | } | ||
| 426 | |||
| 427 | static int __orderly_poweroff(bool force) | ||
| 428 | { | ||
| 429 | int ret; | ||
| 430 | |||
| 431 | ret = run_cmd(poweroff_cmd); | ||
| 432 | |||
| 409 | if (ret && force) { | 433 | if (ret && force) { | 
| 410 | pr_warn("Failed to start orderly shutdown: forcing the issue\n"); | 434 | pr_warn("Failed to start orderly shutdown: forcing the issue\n"); | 
| 435 | |||
| 411 | /* | 436 | /* | 
| 412 | * I guess this should try to kick off some daemon to sync and | 437 | * I guess this should try to kick off some daemon to sync and | 
| 413 | * poweroff asap. Or not even bother syncing if we're doing an | 438 | * poweroff asap. Or not even bother syncing if we're doing an | 
| @@ -436,15 +461,33 @@ static DECLARE_WORK(poweroff_work, poweroff_work_func); | |||
| 436 | * This may be called from any context to trigger a system shutdown. | 461 | * This may be called from any context to trigger a system shutdown. | 
| 437 | * If the orderly shutdown fails, it will force an immediate shutdown. | 462 | * If the orderly shutdown fails, it will force an immediate shutdown. | 
| 438 | */ | 463 | */ | 
| 439 | int orderly_poweroff(bool force) | 464 | void orderly_poweroff(bool force) | 
| 440 | { | 465 | { | 
| 441 | if (force) /* do not override the pending "true" */ | 466 | if (force) /* do not override the pending "true" */ | 
| 442 | poweroff_force = true; | 467 | poweroff_force = true; | 
| 443 | schedule_work(&poweroff_work); | 468 | schedule_work(&poweroff_work); | 
| 444 | return 0; | ||
| 445 | } | 469 | } | 
| 446 | EXPORT_SYMBOL_GPL(orderly_poweroff); | 470 | EXPORT_SYMBOL_GPL(orderly_poweroff); | 
| 447 | 471 | ||
| 472 | static void reboot_work_func(struct work_struct *work) | ||
| 473 | { | ||
| 474 | __orderly_reboot(); | ||
| 475 | } | ||
| 476 | |||
| 477 | static DECLARE_WORK(reboot_work, reboot_work_func); | ||
| 478 | |||
| 479 | /** | ||
| 480 | * orderly_reboot - Trigger an orderly system reboot | ||
| 481 | * | ||
| 482 | * This may be called from any context to trigger a system reboot. | ||
| 483 | * If the orderly reboot fails, it will force an immediate reboot. | ||
| 484 | */ | ||
| 485 | void orderly_reboot(void) | ||
| 486 | { | ||
| 487 | schedule_work(&reboot_work); | ||
| 488 | } | ||
| 489 | EXPORT_SYMBOL_GPL(orderly_reboot); | ||
| 490 | |||
| 448 | static int __init reboot_setup(char *str) | 491 | static int __init reboot_setup(char *str) | 
| 449 | { | 492 | { | 
| 450 | for (;;) { | 493 | for (;;) { | 
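The reboot.c changes above factor the argv-splitting and usermode-helper call into run_cmd(), which orderly_reboot() then reuses with a fixed "/sbin/reboot". The same split-then-exec shape looks roughly like this in userspace, with fork()/execv() standing in for call_usermodehelper() (a sketch; the paths and limits are arbitrary):

    /*
     * Userspace analogue of run_cmd(): split a command line into argv[]
     * and run it, returning non-zero on failure.
     */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/wait.h>
    #include <unistd.h>

    static int run_cmd(const char *cmd)
    {
        char *copy = strdup(cmd);
        char *argv[16];
        int argc = 0;
        int status = -1;
        pid_t pid;

        if (!copy)
            return -1;
        for (char *tok = strtok(copy, " "); tok && argc < 15;
             tok = strtok(NULL, " "))
            argv[argc++] = tok;
        argv[argc] = NULL;

        pid = fork();
        if (pid == 0) {
            execv(argv[0], argv);
            _exit(127);                 /* exec failed in the child */
        }
        if (pid > 0)
            waitpid(pid, &status, 0);
        free(copy);
        return (pid > 0 && WIFEXITED(status) &&
                WEXITSTATUS(status) == 0) ? 0 : -1;
    }

    int main(void)
    {
        /* /bin/echo stands in for the reboot/poweroff helpers. */
        if (run_cmd("/bin/echo orderly reboot requested"))
            fprintf(stderr, "falling back to forced restart\n");
        return 0;
    }

As in __orderly_reboot() above, the caller treats a non-zero return as "the helper never ran" and falls back to the forced path.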
| diff --git a/kernel/resource.c b/kernel/resource.c index 19f2357dfda3..90552aab5f2d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
| @@ -1034,8 +1034,6 @@ resource_size_t resource_alignment(struct resource *res) | |||
| 1034 | * | 1034 | * | 
| 1035 | * request_region creates a new busy region. | 1035 | * request_region creates a new busy region. | 
| 1036 | * | 1036 | * | 
| 1037 | * check_region returns non-zero if the area is already busy. | ||
| 1038 | * | ||
| 1039 | * release_region releases a matching busy region. | 1037 | * release_region releases a matching busy region. | 
| 1040 | */ | 1038 | */ | 
| 1041 | 1039 | ||
| @@ -1098,36 +1096,6 @@ struct resource * __request_region(struct resource *parent, | |||
| 1098 | EXPORT_SYMBOL(__request_region); | 1096 | EXPORT_SYMBOL(__request_region); | 
| 1099 | 1097 | ||
| 1100 | /** | 1098 | /** | 
| 1101 | * __check_region - check if a resource region is busy or free | ||
| 1102 | * @parent: parent resource descriptor | ||
| 1103 | * @start: resource start address | ||
| 1104 | * @n: resource region size | ||
| 1105 | * | ||
| 1106 | * Returns 0 if the region is free at the moment it is checked, | ||
| 1107 | * returns %-EBUSY if the region is busy. | ||
| 1108 | * | ||
| 1109 | * NOTE: | ||
| 1110 | * This function is deprecated because its use is racy. | ||
| 1111 | * Even if it returns 0, a subsequent call to request_region() | ||
| 1112 | * may fail because another driver etc. just allocated the region. | ||
| 1113 | * Do NOT use it. It will be removed from the kernel. | ||
| 1114 | */ | ||
| 1115 | int __check_region(struct resource *parent, resource_size_t start, | ||
| 1116 | resource_size_t n) | ||
| 1117 | { | ||
| 1118 | struct resource * res; | ||
| 1119 | |||
| 1120 | res = __request_region(parent, start, n, "check-region", 0); | ||
| 1121 | if (!res) | ||
| 1122 | return -EBUSY; | ||
| 1123 | |||
| 1124 | release_resource(res); | ||
| 1125 | free_resource(res); | ||
| 1126 | return 0; | ||
| 1127 | } | ||
| 1128 | EXPORT_SYMBOL(__check_region); | ||
| 1129 | |||
| 1130 | /** | ||
| 1131 | * __release_region - release a previously reserved resource region | 1099 | * __release_region - release a previously reserved resource region | 
| 1132 | * @parent: parent resource descriptor | 1100 | * @parent: parent resource descriptor | 
| 1133 | * @start: resource start address | 1101 | * @start: resource start address | 
| diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f0f831e8a345..f9123a82cbb6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
| @@ -306,6 +306,9 @@ __read_mostly int scheduler_running; | |||
| 306 | */ | 306 | */ | 
| 307 | int sysctl_sched_rt_runtime = 950000; | 307 | int sysctl_sched_rt_runtime = 950000; | 
| 308 | 308 | ||
| 309 | /* cpus with isolated domains */ | ||
| 310 | cpumask_var_t cpu_isolated_map; | ||
| 311 | |||
| 309 | /* | 312 | /* | 
| 310 | * this_rq_lock - lock this runqueue and disable interrupts. | 313 | * this_rq_lock - lock this runqueue and disable interrupts. | 
| 311 | */ | 314 | */ | 
| @@ -690,6 +693,23 @@ static inline bool got_nohz_idle_kick(void) | |||
| 690 | bool sched_can_stop_tick(void) | 693 | bool sched_can_stop_tick(void) | 
| 691 | { | 694 | { | 
| 692 | /* | 695 | /* | 
| 696 | * FIFO realtime policy runs the highest priority task. Other runnable | ||
| 697 | * tasks are of a lower priority. The scheduler tick does nothing. | ||
| 698 | */ | ||
| 699 | if (current->policy == SCHED_FIFO) | ||
| 700 | return true; | ||
| 701 | |||
| 702 | /* | ||
| 703 | * Round-robin realtime tasks time slice with other tasks at the same | ||
| 704 | * realtime priority. Is this task the only one at this priority? | ||
| 705 | */ | ||
| 706 | if (current->policy == SCHED_RR) { | ||
| 707 | struct sched_rt_entity *rt_se = ¤t->rt; | ||
| 708 | |||
| 709 | return rt_se->run_list.prev == rt_se->run_list.next; | ||
| 710 | } | ||
| 711 | |||
| 712 | /* | ||
| 693 | * More than one running task need preemption. | 713 | * More than one running task need preemption. | 
| 694 | * nr_running update is assumed to be visible | 714 | * nr_running update is assumed to be visible | 
| 695 | * after IPI is sent from wakers. | 715 | * after IPI is sent from wakers. | 
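The SCHED_RR test added to sched_can_stop_tick() above asks whether the current entity is the only one on its priority run list; for a circular doubly-linked list that is exactly the case when the node's prev and next both point at the list head, i.e. prev == next. A minimal list_head-style demonstration (generic code, not the kernel's <linux/list.h>):

    /*
     * A node in a circular doubly-linked list is the sole element exactly
     * when its prev and next pointers are equal (both point at the head).
     */
    #include <stdbool.h>
    #include <stdio.h>

    struct list_head {
        struct list_head *prev, *next;
    };

    static void list_init(struct list_head *h)
    {
        h->prev = h->next = h;
    }

    static void list_add_tail(struct list_head *n, struct list_head *h)
    {
        n->prev = h->prev;
        n->next = h;
        h->prev->next = n;
        h->prev = n;
    }

    static bool only_entry(const struct list_head *n)
    {
        return n->prev == n->next;
    }

    int main(void)
    {
        struct list_head queue, a, b;

        list_init(&queue);
        list_add_tail(&a, &queue);
        printf("alone: %d\n", only_entry(&a));  /* 1: tick can stop */
        list_add_tail(&b, &queue);
        printf("alone: %d\n", only_entry(&a));  /* 0: round-robin needs tick */
        return 0;
    }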
| @@ -996,6 +1016,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
| 996 | rq_clock_skip_update(rq, true); | 1016 | rq_clock_skip_update(rq, true); | 
| 997 | } | 1017 | } | 
| 998 | 1018 | ||
| 1019 | static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); | ||
| 1020 | |||
| 1021 | void register_task_migration_notifier(struct notifier_block *n) | ||
| 1022 | { | ||
| 1023 | atomic_notifier_chain_register(&task_migration_notifier, n); | ||
| 1024 | } | ||
| 1025 | |||
| 999 | #ifdef CONFIG_SMP | 1026 | #ifdef CONFIG_SMP | 
| 1000 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | 1027 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | 
| 1001 | { | 1028 | { | 
| @@ -1026,10 +1053,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
| 1026 | trace_sched_migrate_task(p, new_cpu); | 1053 | trace_sched_migrate_task(p, new_cpu); | 
| 1027 | 1054 | ||
| 1028 | if (task_cpu(p) != new_cpu) { | 1055 | if (task_cpu(p) != new_cpu) { | 
| 1056 | struct task_migration_notifier tmn; | ||
| 1057 | |||
| 1029 | if (p->sched_class->migrate_task_rq) | 1058 | if (p->sched_class->migrate_task_rq) | 
| 1030 | p->sched_class->migrate_task_rq(p, new_cpu); | 1059 | p->sched_class->migrate_task_rq(p, new_cpu); | 
| 1031 | p->se.nr_migrations++; | 1060 | p->se.nr_migrations++; | 
| 1032 | perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); | 1061 | perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); | 
| 1062 | |||
| 1063 | tmn.task = p; | ||
| 1064 | tmn.from_cpu = task_cpu(p); | ||
| 1065 | tmn.to_cpu = new_cpu; | ||
| 1066 | |||
| 1067 | atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); | ||
| 1033 | } | 1068 | } | 
| 1034 | 1069 | ||
| 1035 | __set_task_cpu(p, new_cpu); | 1070 | __set_task_cpu(p, new_cpu); | 
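set_task_cpu() now fires an atomic notifier chain carrying the task and its source and destination CPUs whenever a task actually migrates. A stripped-down model of the register-then-call-chain pattern (array-based and not thread-safe, purely for illustration):

    /*
     * Minimal notifier-chain model: callbacks register once and are
     * invoked in order with an event payload.
     */
    #include <stdio.h>

    struct migration_event {
        int from_cpu, to_cpu;
    };

    typedef void (*notifier_fn)(const struct migration_event *);

    #define MAX_NOTIFIERS 8
    static notifier_fn chain[MAX_NOTIFIERS];
    static int nr_notifiers;

    static void register_notifier(notifier_fn fn)
    {
        if (nr_notifiers < MAX_NOTIFIERS)
            chain[nr_notifiers++] = fn;
    }

    static void call_chain(const struct migration_event *ev)
    {
        for (int i = 0; i < nr_notifiers; i++)
            chain[i](ev);
    }

    static void log_migration(const struct migration_event *ev)
    {
        printf("task moved: cpu %d -> cpu %d\n", ev->from_cpu, ev->to_cpu);
    }

    int main(void)
    {
        struct migration_event ev = { .from_cpu = 0, .to_cpu = 3 };

        register_notifier(log_migration);
        call_chain(&ev);    /* fired from the migration path on a real move */
        return 0;
    }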
| @@ -2818,7 +2853,7 @@ asmlinkage __visible void __sched schedule_user(void) | |||
| 2818 | * we find a better solution. | 2853 | * we find a better solution. | 
| 2819 | * | 2854 | * | 
| 2820 | * NB: There are buggy callers of this function. Ideally we | 2855 | * NB: There are buggy callers of this function. Ideally we | 
| 2821 | * should warn if prev_state != IN_USER, but that will trigger | 2856 | * should warn if prev_state != CONTEXT_USER, but that will trigger | 
| 2822 | * too frequently to make sense yet. | 2857 | * too frequently to make sense yet. | 
| 2823 | */ | 2858 | */ | 
| 2824 | enum ctx_state prev_state = exception_enter(); | 2859 | enum ctx_state prev_state = exception_enter(); | 
| @@ -3034,6 +3069,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 3034 | } else { | 3069 | } else { | 
| 3035 | if (dl_prio(oldprio)) | 3070 | if (dl_prio(oldprio)) | 
| 3036 | p->dl.dl_boosted = 0; | 3071 | p->dl.dl_boosted = 0; | 
| 3072 | if (rt_prio(oldprio)) | ||
| 3073 | p->rt.timeout = 0; | ||
| 3037 | p->sched_class = &fair_sched_class; | 3074 | p->sched_class = &fair_sched_class; | 
| 3038 | } | 3075 | } | 
| 3039 | 3076 | ||
| @@ -5318,36 +5355,13 @@ static int sched_cpu_active(struct notifier_block *nfb, | |||
| 5318 | static int sched_cpu_inactive(struct notifier_block *nfb, | 5355 | static int sched_cpu_inactive(struct notifier_block *nfb, | 
| 5319 | unsigned long action, void *hcpu) | 5356 | unsigned long action, void *hcpu) | 
| 5320 | { | 5357 | { | 
| 5321 | unsigned long flags; | ||
| 5322 | long cpu = (long)hcpu; | ||
| 5323 | struct dl_bw *dl_b; | ||
| 5324 | |||
| 5325 | switch (action & ~CPU_TASKS_FROZEN) { | 5358 | switch (action & ~CPU_TASKS_FROZEN) { | 
| 5326 | case CPU_DOWN_PREPARE: | 5359 | case CPU_DOWN_PREPARE: | 
| 5327 | set_cpu_active(cpu, false); | 5360 | set_cpu_active((long)hcpu, false); | 
| 5328 | |||
| 5329 | /* explicitly allow suspend */ | ||
| 5330 | if (!(action & CPU_TASKS_FROZEN)) { | ||
| 5331 | bool overflow; | ||
| 5332 | int cpus; | ||
| 5333 | |||
| 5334 | rcu_read_lock_sched(); | ||
| 5335 | dl_b = dl_bw_of(cpu); | ||
| 5336 | |||
| 5337 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
| 5338 | cpus = dl_bw_cpus(cpu); | ||
| 5339 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | ||
| 5340 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
| 5341 | |||
| 5342 | rcu_read_unlock_sched(); | ||
| 5343 | |||
| 5344 | if (overflow) | ||
| 5345 | return notifier_from_errno(-EBUSY); | ||
| 5346 | } | ||
| 5347 | return NOTIFY_OK; | 5361 | return NOTIFY_OK; | 
| 5362 | default: | ||
| 5363 | return NOTIFY_DONE; | ||
| 5348 | } | 5364 | } | 
| 5349 | |||
| 5350 | return NOTIFY_DONE; | ||
| 5351 | } | 5365 | } | 
| 5352 | 5366 | ||
| 5353 | static int __init migration_init(void) | 5367 | static int __init migration_init(void) | 
| @@ -5428,17 +5442,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
| 5428 | break; | 5442 | break; | 
| 5429 | } | 5443 | } | 
| 5430 | 5444 | ||
| 5431 | /* | ||
| 5432 | * Even though we initialize ->capacity to something semi-sane, | ||
| 5433 | * we leave capacity_orig unset. This allows us to detect if | ||
| 5434 | * domain iteration is still funny without causing /0 traps. | ||
| 5435 | */ | ||
| 5436 | if (!group->sgc->capacity_orig) { | ||
| 5437 | printk(KERN_CONT "\n"); | ||
| 5438 | printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n"); | ||
| 5439 | break; | ||
| 5440 | } | ||
| 5441 | |||
| 5442 | if (!cpumask_weight(sched_group_cpus(group))) { | 5445 | if (!cpumask_weight(sched_group_cpus(group))) { | 
| 5443 | printk(KERN_CONT "\n"); | 5446 | printk(KERN_CONT "\n"); | 
| 5444 | printk(KERN_ERR "ERROR: empty group\n"); | 5447 | printk(KERN_ERR "ERROR: empty group\n"); | 
| @@ -5811,9 +5814,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
| 5811 | update_top_cache_domain(cpu); | 5814 | update_top_cache_domain(cpu); | 
| 5812 | } | 5815 | } | 
| 5813 | 5816 | ||
| 5814 | /* cpus with isolated domains */ | ||
| 5815 | static cpumask_var_t cpu_isolated_map; | ||
| 5816 | |||
| 5817 | /* Setup the mask of cpus configured for isolated domains */ | 5817 | /* Setup the mask of cpus configured for isolated domains */ | 
| 5818 | static int __init isolated_cpu_setup(char *str) | 5818 | static int __init isolated_cpu_setup(char *str) | 
| 5819 | { | 5819 | { | 
| @@ -5922,7 +5922,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
| 5922 | * die on a /0 trap. | 5922 | * die on a /0 trap. | 
| 5923 | */ | 5923 | */ | 
| 5924 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); | 5924 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); | 
| 5925 | sg->sgc->capacity_orig = sg->sgc->capacity; | ||
| 5926 | 5925 | ||
| 5927 | /* | 5926 | /* | 
| 5928 | * Make sure the first group of this domain contains the | 5927 | * Make sure the first group of this domain contains the | 
| @@ -6233,6 +6232,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) | |||
| 6233 | */ | 6232 | */ | 
| 6234 | 6233 | ||
| 6235 | if (sd->flags & SD_SHARE_CPUCAPACITY) { | 6234 | if (sd->flags & SD_SHARE_CPUCAPACITY) { | 
| 6235 | sd->flags |= SD_PREFER_SIBLING; | ||
| 6236 | sd->imbalance_pct = 110; | 6236 | sd->imbalance_pct = 110; | 
| 6237 | sd->smt_gain = 1178; /* ~15% */ | 6237 | sd->smt_gain = 1178; /* ~15% */ | 
| 6238 | 6238 | ||
| @@ -6998,7 +6998,6 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, | |||
| 6998 | */ | 6998 | */ | 
| 6999 | 6999 | ||
| 7000 | case CPU_ONLINE: | 7000 | case CPU_ONLINE: | 
| 7001 | case CPU_DOWN_FAILED: | ||
| 7002 | cpuset_update_active_cpus(true); | 7001 | cpuset_update_active_cpus(true); | 
| 7003 | break; | 7002 | break; | 
| 7004 | default: | 7003 | default: | 
| @@ -7010,8 +7009,30 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, | |||
| 7010 | static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, | 7009 | static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, | 
| 7011 | void *hcpu) | 7010 | void *hcpu) | 
| 7012 | { | 7011 | { | 
| 7013 | switch (action) { | 7012 | unsigned long flags; | 
| 7013 | long cpu = (long)hcpu; | ||
| 7014 | struct dl_bw *dl_b; | ||
| 7015 | |||
| 7016 | switch (action & ~CPU_TASKS_FROZEN) { | ||
| 7014 | case CPU_DOWN_PREPARE: | 7017 | case CPU_DOWN_PREPARE: | 
| 7018 | /* explicitly allow suspend */ | ||
| 7019 | if (!(action & CPU_TASKS_FROZEN)) { | ||
| 7020 | bool overflow; | ||
| 7021 | int cpus; | ||
| 7022 | |||
| 7023 | rcu_read_lock_sched(); | ||
| 7024 | dl_b = dl_bw_of(cpu); | ||
| 7025 | |||
| 7026 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
| 7027 | cpus = dl_bw_cpus(cpu); | ||
| 7028 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | ||
| 7029 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
| 7030 | |||
| 7031 | rcu_read_unlock_sched(); | ||
| 7032 | |||
| 7033 | if (overflow) | ||
| 7034 | return notifier_from_errno(-EBUSY); | ||
| 7035 | } | ||
| 7015 | cpuset_update_active_cpus(false); | 7036 | cpuset_update_active_cpus(false); | 
| 7016 | break; | 7037 | break; | 
| 7017 | case CPU_DOWN_PREPARE_FROZEN: | 7038 | case CPU_DOWN_PREPARE_FROZEN: | 
| @@ -7156,8 +7177,8 @@ void __init sched_init(void) | |||
| 7156 | rq->calc_load_active = 0; | 7177 | rq->calc_load_active = 0; | 
| 7157 | rq->calc_load_update = jiffies + LOAD_FREQ; | 7178 | rq->calc_load_update = jiffies + LOAD_FREQ; | 
| 7158 | init_cfs_rq(&rq->cfs); | 7179 | init_cfs_rq(&rq->cfs); | 
| 7159 | init_rt_rq(&rq->rt, rq); | 7180 | init_rt_rq(&rq->rt); | 
| 7160 | init_dl_rq(&rq->dl, rq); | 7181 | init_dl_rq(&rq->dl); | 
| 7161 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7182 | #ifdef CONFIG_FAIR_GROUP_SCHED | 
| 7162 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; | 7183 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; | 
| 7163 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 7184 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 
| @@ -7197,7 +7218,7 @@ void __init sched_init(void) | |||
| 7197 | #ifdef CONFIG_SMP | 7218 | #ifdef CONFIG_SMP | 
| 7198 | rq->sd = NULL; | 7219 | rq->sd = NULL; | 
| 7199 | rq->rd = NULL; | 7220 | rq->rd = NULL; | 
| 7200 | rq->cpu_capacity = SCHED_CAPACITY_SCALE; | 7221 | rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; | 
| 7201 | rq->post_schedule = 0; | 7222 | rq->post_schedule = 0; | 
| 7202 | rq->active_balance = 0; | 7223 | rq->active_balance = 0; | 
| 7203 | rq->next_balance = jiffies; | 7224 | rq->next_balance = jiffies; | 
| @@ -7796,7 +7817,7 @@ static int sched_rt_global_constraints(void) | |||
| 7796 | } | 7817 | } | 
| 7797 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7818 | #endif /* CONFIG_RT_GROUP_SCHED */ | 
| 7798 | 7819 | ||
| 7799 | static int sched_dl_global_constraints(void) | 7820 | static int sched_dl_global_validate(void) | 
| 7800 | { | 7821 | { | 
| 7801 | u64 runtime = global_rt_runtime(); | 7822 | u64 runtime = global_rt_runtime(); | 
| 7802 | u64 period = global_rt_period(); | 7823 | u64 period = global_rt_period(); | 
| @@ -7897,11 +7918,11 @@ int sched_rt_handler(struct ctl_table *table, int write, | |||
| 7897 | if (ret) | 7918 | if (ret) | 
| 7898 | goto undo; | 7919 | goto undo; | 
| 7899 | 7920 | ||
| 7900 | ret = sched_rt_global_constraints(); | 7921 | ret = sched_dl_global_validate(); | 
| 7901 | if (ret) | 7922 | if (ret) | 
| 7902 | goto undo; | 7923 | goto undo; | 
| 7903 | 7924 | ||
| 7904 | ret = sched_dl_global_constraints(); | 7925 | ret = sched_rt_global_constraints(); | 
| 7905 | if (ret) | 7926 | if (ret) | 
| 7906 | goto undo; | 7927 | goto undo; | 
| 7907 | 7928 | ||
| diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 3fa8fa6d9403..5e95145088fd 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
| @@ -69,7 +69,7 @@ void init_dl_bw(struct dl_bw *dl_b) | |||
| 69 | dl_b->total_bw = 0; | 69 | dl_b->total_bw = 0; | 
| 70 | } | 70 | } | 
| 71 | 71 | ||
| 72 | void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) | 72 | void init_dl_rq(struct dl_rq *dl_rq) | 
| 73 | { | 73 | { | 
| 74 | dl_rq->rb_root = RB_ROOT; | 74 | dl_rq->rb_root = RB_ROOT; | 
| 75 | 75 | ||
| @@ -218,6 +218,52 @@ static inline void set_post_schedule(struct rq *rq) | |||
| 218 | rq->post_schedule = has_pushable_dl_tasks(rq); | 218 | rq->post_schedule = has_pushable_dl_tasks(rq); | 
| 219 | } | 219 | } | 
| 220 | 220 | ||
| 221 | static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); | ||
| 222 | |||
| 223 | static void dl_task_offline_migration(struct rq *rq, struct task_struct *p) | ||
| 224 | { | ||
| 225 | struct rq *later_rq = NULL; | ||
| 226 | bool fallback = false; | ||
| 227 | |||
| 228 | later_rq = find_lock_later_rq(p, rq); | ||
| 229 | |||
| 230 | if (!later_rq) { | ||
| 231 | int cpu; | ||
| 232 | |||
| 233 | /* | ||
| 234 | * If we cannot preempt any rq, fall back to pick any | ||
| 235 | * online cpu. | ||
| 236 | */ | ||
| 237 | fallback = true; | ||
| 238 | cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); | ||
| 239 | if (cpu >= nr_cpu_ids) { | ||
| 240 | /* | ||
| 241 | * Failed to find any suitable cpu. |||
| 242 | * The task will never come back! | ||
| 243 | */ | ||
| 244 | BUG_ON(dl_bandwidth_enabled()); | ||
| 245 | |||
| 246 | /* | ||
| 247 | * If admission control is disabled we | ||
| 248 | * try a little harder to let the task | ||
| 249 | * run. | ||
| 250 | */ | ||
| 251 | cpu = cpumask_any(cpu_active_mask); | ||
| 252 | } | ||
| 253 | later_rq = cpu_rq(cpu); | ||
| 254 | double_lock_balance(rq, later_rq); | ||
| 255 | } | ||
| 256 | |||
| 257 | deactivate_task(rq, p, 0); | ||
| 258 | set_task_cpu(p, later_rq->cpu); | ||
| 259 | activate_task(later_rq, p, ENQUEUE_REPLENISH); | ||
| 260 | |||
| 261 | if (!fallback) | ||
| 262 | resched_curr(later_rq); | ||
| 263 | |||
| 264 | double_unlock_balance(rq, later_rq); | ||
| 265 | } | ||
| 266 | |||
| 221 | #else | 267 | #else | 
| 222 | 268 | ||
| 223 | static inline | 269 | static inline | 
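dl_task_offline_migration() above picks a new runqueue for a deadline task whose timer fired on a runqueue that has since gone offline: it prefers a runqueue it can preempt, and failing that ignores affinity and takes any active CPU (which is only legitimate when admission control is disabled, hence the BUG_ON). The fallback selection reduces to a two-step mask scan, roughly as below (bitmask toy, illustrative only):

    /*
     * Prefer a CPU that is both active and allowed for the task; only if
     * none exists, ignore affinity and take any active CPU.
     */
    #include <stdio.h>

    #define NR_CPUS 8

    static int pick_cpu(unsigned long active, unsigned long allowed)
    {
        unsigned long both = active & allowed;

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            if (both & (1UL << cpu))
                return cpu;
        /* "Try a little harder": any active CPU will do. */
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            if (active & (1UL << cpu))
                return cpu;
        return -1;
    }

    int main(void)
    {
        /* CPUs 0-1 are offline; the task is only allowed on 0-1. */
        printf("picked cpu %d\n", pick_cpu(0xfc, 0x03));    /* prints 2 */
        return 0;
    }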
| @@ -514,7 +560,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
| 514 | unsigned long flags; | 560 | unsigned long flags; | 
| 515 | struct rq *rq; | 561 | struct rq *rq; | 
| 516 | 562 | ||
| 517 | rq = task_rq_lock(current, &flags); | 563 | rq = task_rq_lock(p, &flags); | 
| 518 | 564 | ||
| 519 | /* | 565 | /* | 
| 520 | * We need to take care of several possible races here: | 566 | * We need to take care of several possible races here: | 
| @@ -536,6 +582,17 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
| 536 | sched_clock_tick(); | 582 | sched_clock_tick(); | 
| 537 | update_rq_clock(rq); | 583 | update_rq_clock(rq); | 
| 538 | 584 | ||
| 585 | #ifdef CONFIG_SMP | ||
| 586 | /* | ||
| 587 | * If we find that the rq the task was on is no longer | ||
| 588 | * available, we need to select a new rq. | ||
| 589 | */ | ||
| 590 | if (unlikely(!rq->online)) { | ||
| 591 | dl_task_offline_migration(rq, p); | ||
| 592 | goto unlock; | ||
| 593 | } | ||
| 594 | #endif | ||
| 595 | |||
| 539 | /* | 596 | /* | 
| 540 | * If the throttle happened during sched-out; like: | 597 | * If the throttle happened during sched-out; like: | 
| 541 | * | 598 | * | 
| @@ -569,7 +626,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
| 569 | push_dl_task(rq); | 626 | push_dl_task(rq); | 
| 570 | #endif | 627 | #endif | 
| 571 | unlock: | 628 | unlock: | 
| 572 | task_rq_unlock(rq, current, &flags); | 629 | task_rq_unlock(rq, p, &flags); | 
| 573 | 630 | ||
| 574 | return HRTIMER_NORESTART; | 631 | return HRTIMER_NORESTART; | 
| 575 | } | 632 | } | 
| @@ -914,6 +971,12 @@ static void yield_task_dl(struct rq *rq) | |||
| 914 | } | 971 | } | 
| 915 | update_rq_clock(rq); | 972 | update_rq_clock(rq); | 
| 916 | update_curr_dl(rq); | 973 | update_curr_dl(rq); | 
| 974 | /* | ||
| 975 | * Tell update_rq_clock() that we've just updated, | ||
| 976 | * so we don't do microscopic update in schedule() | ||
| 977 | * and double the fastpath cost. | ||
| 978 | */ | ||
| 979 | rq_clock_skip_update(rq, true); | ||
| 917 | } | 980 | } | 
| 918 | 981 | ||
| 919 | #ifdef CONFIG_SMP | 982 | #ifdef CONFIG_SMP | 
| @@ -1659,14 +1722,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) | |||
| 1659 | { | 1722 | { | 
| 1660 | int check_resched = 1; | 1723 | int check_resched = 1; | 
| 1661 | 1724 | ||
| 1662 | /* | ||
| 1663 | * If p is throttled, don't consider the possibility | ||
| 1664 | * of preempting rq->curr, the check will be done right | ||
| 1665 | * after its runtime will get replenished. | ||
| 1666 | */ | ||
| 1667 | if (unlikely(p->dl.dl_throttled)) | ||
| 1668 | return; | ||
| 1669 | |||
| 1670 | if (task_on_rq_queued(p) && rq->curr != p) { | 1725 | if (task_on_rq_queued(p) && rq->curr != p) { | 
| 1671 | #ifdef CONFIG_SMP | 1726 | #ifdef CONFIG_SMP | 
| 1672 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && | 1727 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && | 
| diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 8baaf858d25c..a245c1fc6f0a 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
| @@ -71,7 +71,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
| 71 | if (!se) { | 71 | if (!se) { | 
| 72 | struct sched_avg *avg = &cpu_rq(cpu)->avg; | 72 | struct sched_avg *avg = &cpu_rq(cpu)->avg; | 
| 73 | P(avg->runnable_avg_sum); | 73 | P(avg->runnable_avg_sum); | 
| 74 | P(avg->runnable_avg_period); | 74 | P(avg->avg_period); | 
| 75 | return; | 75 | return; | 
| 76 | } | 76 | } | 
| 77 | 77 | ||
| @@ -94,8 +94,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
| 94 | P(se->load.weight); | 94 | P(se->load.weight); | 
| 95 | #ifdef CONFIG_SMP | 95 | #ifdef CONFIG_SMP | 
| 96 | P(se->avg.runnable_avg_sum); | 96 | P(se->avg.runnable_avg_sum); | 
| 97 | P(se->avg.runnable_avg_period); | 97 | P(se->avg.running_avg_sum); | 
| 98 | P(se->avg.avg_period); | ||
| 98 | P(se->avg.load_avg_contrib); | 99 | P(se->avg.load_avg_contrib); | 
| 100 | P(se->avg.utilization_avg_contrib); | ||
| 99 | P(se->avg.decay_count); | 101 | P(se->avg.decay_count); | 
| 100 | #endif | 102 | #endif | 
| 101 | #undef PN | 103 | #undef PN | 
| @@ -214,6 +216,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
| 214 | cfs_rq->runnable_load_avg); | 216 | cfs_rq->runnable_load_avg); | 
| 215 | SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", | 217 | SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", | 
| 216 | cfs_rq->blocked_load_avg); | 218 | cfs_rq->blocked_load_avg); | 
| 219 | SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg", | ||
| 220 | cfs_rq->utilization_load_avg); | ||
| 217 | #ifdef CONFIG_FAIR_GROUP_SCHED | 221 | #ifdef CONFIG_FAIR_GROUP_SCHED | 
| 218 | SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", | 222 | SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", | 
| 219 | cfs_rq->tg_load_contrib); | 223 | cfs_rq->tg_load_contrib); | 
| @@ -636,8 +640,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
| 636 | P(se.load.weight); | 640 | P(se.load.weight); | 
| 637 | #ifdef CONFIG_SMP | 641 | #ifdef CONFIG_SMP | 
| 638 | P(se.avg.runnable_avg_sum); | 642 | P(se.avg.runnable_avg_sum); | 
| 639 | P(se.avg.runnable_avg_period); | 643 | P(se.avg.running_avg_sum); | 
| 644 | P(se.avg.avg_period); | ||
| 640 | P(se.avg.load_avg_contrib); | 645 | P(se.avg.load_avg_contrib); | 
| 646 | P(se.avg.utilization_avg_contrib); | ||
| 641 | P(se.avg.decay_count); | 647 | P(se.avg.decay_count); | 
| 642 | #endif | 648 | #endif | 
| 643 | P(policy); | 649 | P(policy); | 
| diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7ce18f3c097a..ffeaa4105e48 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -670,6 +670,7 @@ static int select_idle_sibling(struct task_struct *p, int cpu); | |||
| 670 | static unsigned long task_h_load(struct task_struct *p); | 670 | static unsigned long task_h_load(struct task_struct *p); | 
| 671 | 671 | ||
| 672 | static inline void __update_task_entity_contrib(struct sched_entity *se); | 672 | static inline void __update_task_entity_contrib(struct sched_entity *se); | 
| 673 | static inline void __update_task_entity_utilization(struct sched_entity *se); | ||
| 673 | 674 | ||
| 674 | /* Give new task start runnable values to heavy its load in infant time */ | 675 | /* Give new task start runnable values to heavy its load in infant time */ | 
| 675 | void init_task_runnable_average(struct task_struct *p) | 676 | void init_task_runnable_average(struct task_struct *p) | 
| @@ -677,9 +678,10 @@ void init_task_runnable_average(struct task_struct *p) | |||
| 677 | u32 slice; | 678 | u32 slice; | 
| 678 | 679 | ||
| 679 | slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; | 680 | slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; | 
| 680 | p->se.avg.runnable_avg_sum = slice; | 681 | p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice; | 
| 681 | p->se.avg.runnable_avg_period = slice; | 682 | p->se.avg.avg_period = slice; | 
| 682 | __update_task_entity_contrib(&p->se); | 683 | __update_task_entity_contrib(&p->se); | 
| 684 | __update_task_entity_utilization(&p->se); | ||
| 683 | } | 685 | } | 
| 684 | #else | 686 | #else | 
| 685 | void init_task_runnable_average(struct task_struct *p) | 687 | void init_task_runnable_average(struct task_struct *p) | 
| @@ -1196,9 +1198,11 @@ static void task_numa_assign(struct task_numa_env *env, | |||
| 1196 | static bool load_too_imbalanced(long src_load, long dst_load, | 1198 | static bool load_too_imbalanced(long src_load, long dst_load, | 
| 1197 | struct task_numa_env *env) | 1199 | struct task_numa_env *env) | 
| 1198 | { | 1200 | { | 
| 1199 | long imb, old_imb; | ||
| 1200 | long orig_src_load, orig_dst_load; | ||
| 1201 | long src_capacity, dst_capacity; | 1201 | long src_capacity, dst_capacity; | 
| 1202 | long orig_src_load; | ||
| 1203 | long load_a, load_b; | ||
| 1204 | long moved_load; | ||
| 1205 | long imb; | ||
| 1202 | 1206 | ||
| 1203 | /* | 1207 | /* | 
| 1204 | * The load is corrected for the CPU capacity available on each node. | 1208 | * The load is corrected for the CPU capacity available on each node. | 
| @@ -1211,30 +1215,39 @@ static bool load_too_imbalanced(long src_load, long dst_load, | |||
| 1211 | dst_capacity = env->dst_stats.compute_capacity; | 1215 | dst_capacity = env->dst_stats.compute_capacity; | 
| 1212 | 1216 | ||
| 1213 | /* We care about the slope of the imbalance, not the direction. */ | 1217 | /* We care about the slope of the imbalance, not the direction. */ | 
| 1214 | if (dst_load < src_load) | 1218 | load_a = dst_load; | 
| 1215 | swap(dst_load, src_load); | 1219 | load_b = src_load; | 
| 1220 | if (load_a < load_b) | ||
| 1221 | swap(load_a, load_b); | ||
| 1216 | 1222 | ||
| 1217 | /* Is the difference below the threshold? */ | 1223 | /* Is the difference below the threshold? */ | 
| 1218 | imb = dst_load * src_capacity * 100 - | 1224 | imb = load_a * src_capacity * 100 - | 
| 1219 | src_load * dst_capacity * env->imbalance_pct; | 1225 | load_b * dst_capacity * env->imbalance_pct; | 
| 1220 | if (imb <= 0) | 1226 | if (imb <= 0) | 
| 1221 | return false; | 1227 | return false; | 
| 1222 | 1228 | ||
| 1223 | /* | 1229 | /* | 
| 1224 | * The imbalance is above the allowed threshold. | 1230 | * The imbalance is above the allowed threshold. | 
| 1225 | * Compare it with the old imbalance. | 1231 | * Allow a move that brings us closer to a balanced situation, | 
| 1232 | * without moving things past the point of balance. | ||
| 1226 | */ | 1233 | */ | 
| 1227 | orig_src_load = env->src_stats.load; | 1234 | orig_src_load = env->src_stats.load; | 
| 1228 | orig_dst_load = env->dst_stats.load; | ||
| 1229 | 1235 | ||
| 1230 | if (orig_dst_load < orig_src_load) | 1236 | /* | 
| 1231 | swap(orig_dst_load, orig_src_load); | 1237 | * In a task swap, there will be one load moving from src to dst, | 
| 1232 | 1238 | * and another moving back. This is the net sum of both moves. | |
| 1233 | old_imb = orig_dst_load * src_capacity * 100 - | 1239 | * A simple task move will always have a positive value. | 
| 1234 | orig_src_load * dst_capacity * env->imbalance_pct; | 1240 | * Allow the move if it brings the system closer to a balanced | 
| 1241 | * situation, without crossing over the balance point. | ||
| 1242 | */ | ||
| 1243 | moved_load = orig_src_load - src_load; | ||
| 1235 | 1244 | ||
| 1236 | /* Would this change make things worse? */ | 1245 | if (moved_load > 0) | 
| 1237 | return (imb > old_imb); | 1246 | /* Moving src -> dst. Did we overshoot balance? */ | 
| 1247 | return src_load * dst_capacity < dst_load * src_capacity; | ||
| 1248 | else | ||
| 1249 | /* Moving dst -> src. Did we overshoot balance? */ | ||
| 1250 | return dst_load * src_capacity < src_load * dst_capacity; | ||
| 1238 | } | 1251 | } | 
| 1239 | 1252 | ||
| 1240 | /* | 1253 | /* | 
| @@ -1609,9 +1622,11 @@ static void update_task_scan_period(struct task_struct *p, | |||
| 1609 | /* | 1622 | /* | 
| 1610 | * If there were no record hinting faults then either the task is | 1623 | * If there were no record hinting faults then either the task is | 
| 1611 | * completely idle or all activity is areas that are not of interest | 1624 | * completely idle or all activity is areas that are not of interest | 
| 1612 | * to automatic numa balancing. Scan slower | 1625 | * to automatic numa balancing. Related to that, if there were failed | 
| 1626 | * migration then it implies we are migrating too quickly or the local | ||
| 1627 | * node is overloaded. In either case, scan slower | ||
| 1613 | */ | 1628 | */ | 
| 1614 | if (local + shared == 0) { | 1629 | if (local + shared == 0 || p->numa_faults_locality[2]) { | 
| 1615 | p->numa_scan_period = min(p->numa_scan_period_max, | 1630 | p->numa_scan_period = min(p->numa_scan_period_max, | 
| 1616 | p->numa_scan_period << 1); | 1631 | p->numa_scan_period << 1); | 
| 1617 | 1632 | ||
| @@ -1673,7 +1688,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) | |||
| 1673 | *period = now - p->last_task_numa_placement; | 1688 | *period = now - p->last_task_numa_placement; | 
| 1674 | } else { | 1689 | } else { | 
| 1675 | delta = p->se.avg.runnable_avg_sum; | 1690 | delta = p->se.avg.runnable_avg_sum; | 
| 1676 | *period = p->se.avg.runnable_avg_period; | 1691 | *period = p->se.avg.avg_period; | 
| 1677 | } | 1692 | } | 
| 1678 | 1693 | ||
| 1679 | p->last_sum_exec_runtime = runtime; | 1694 | p->last_sum_exec_runtime = runtime; | 
| @@ -1763,6 +1778,8 @@ static int preferred_group_nid(struct task_struct *p, int nid) | |||
| 1763 | } | 1778 | } | 
| 1764 | } | 1779 | } | 
| 1765 | /* Next round, evaluate the nodes within max_group. */ | 1780 | /* Next round, evaluate the nodes within max_group. */ | 
| 1781 | if (!max_faults) | ||
| 1782 | break; | ||
| 1766 | nodes = max_group; | 1783 | nodes = max_group; | 
| 1767 | } | 1784 | } | 
| 1768 | return nid; | 1785 | return nid; | 
| @@ -2080,6 +2097,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) | |||
| 2080 | 2097 | ||
| 2081 | if (migrated) | 2098 | if (migrated) | 
| 2082 | p->numa_pages_migrated += pages; | 2099 | p->numa_pages_migrated += pages; | 
| 2100 | if (flags & TNF_MIGRATE_FAIL) | ||
| 2101 | p->numa_faults_locality[2] += pages; | ||
| 2083 | 2102 | ||
| 2084 | p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; | 2103 | p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; | 
| 2085 | p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; | 2104 | p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; | 
| @@ -2161,8 +2180,10 @@ void task_numa_work(struct callback_head *work) | |||
| 2161 | vma = mm->mmap; | 2180 | vma = mm->mmap; | 
| 2162 | } | 2181 | } | 
| 2163 | for (; vma; vma = vma->vm_next) { | 2182 | for (; vma; vma = vma->vm_next) { | 
| 2164 | if (!vma_migratable(vma) || !vma_policy_mof(vma)) | 2183 | if (!vma_migratable(vma) || !vma_policy_mof(vma) || | 
| 2184 | is_vm_hugetlb_page(vma)) { | ||
| 2165 | continue; | 2185 | continue; | 
| 2186 | } | ||
| 2166 | 2187 | ||
| 2167 | /* | 2188 | /* | 
| 2168 | * Shared library pages mapped by multiple processes are not | 2189 | * Shared library pages mapped by multiple processes are not | 
| @@ -2497,13 +2518,15 @@ static u32 __compute_runnable_contrib(u64 n) | |||
| 2497 | * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) | 2518 | * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) | 
| 2498 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] | 2519 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] | 
| 2499 | */ | 2520 | */ | 
| 2500 | static __always_inline int __update_entity_runnable_avg(u64 now, | 2521 | static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, | 
| 2501 | struct sched_avg *sa, | 2522 | struct sched_avg *sa, | 
| 2502 | int runnable) | 2523 | int runnable, | 
| 2524 | int running) | ||
| 2503 | { | 2525 | { | 
| 2504 | u64 delta, periods; | 2526 | u64 delta, periods; | 
| 2505 | u32 runnable_contrib; | 2527 | u32 runnable_contrib; | 
| 2506 | int delta_w, decayed = 0; | 2528 | int delta_w, decayed = 0; | 
| 2529 | unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); | ||
| 2507 | 2530 | ||
| 2508 | delta = now - sa->last_runnable_update; | 2531 | delta = now - sa->last_runnable_update; | 
| 2509 | /* | 2532 | /* | 
| @@ -2525,7 +2548,7 @@ static __always_inline int __update_entity_runnable_avg(u64 now, | |||
| 2525 | sa->last_runnable_update = now; | 2548 | sa->last_runnable_update = now; | 
| 2526 | 2549 | ||
| 2527 | /* delta_w is the amount already accumulated against our next period */ | 2550 | /* delta_w is the amount already accumulated against our next period */ | 
| 2528 | delta_w = sa->runnable_avg_period % 1024; | 2551 | delta_w = sa->avg_period % 1024; | 
| 2529 | if (delta + delta_w >= 1024) { | 2552 | if (delta + delta_w >= 1024) { | 
| 2530 | /* period roll-over */ | 2553 | /* period roll-over */ | 
| 2531 | decayed = 1; | 2554 | decayed = 1; | 
| @@ -2538,7 +2561,10 @@ static __always_inline int __update_entity_runnable_avg(u64 now, | |||
| 2538 | delta_w = 1024 - delta_w; | 2561 | delta_w = 1024 - delta_w; | 
| 2539 | if (runnable) | 2562 | if (runnable) | 
| 2540 | sa->runnable_avg_sum += delta_w; | 2563 | sa->runnable_avg_sum += delta_w; | 
| 2541 | sa->runnable_avg_period += delta_w; | 2564 | if (running) | 
| 2565 | sa->running_avg_sum += delta_w * scale_freq | ||
| 2566 | >> SCHED_CAPACITY_SHIFT; | ||
| 2567 | sa->avg_period += delta_w; | ||
| 2542 | 2568 | ||
| 2543 | delta -= delta_w; | 2569 | delta -= delta_w; | 
| 2544 | 2570 | ||
| @@ -2548,20 +2574,28 @@ static __always_inline int __update_entity_runnable_avg(u64 now, | |||
| 2548 | 2574 | ||
| 2549 | sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, | 2575 | sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, | 
| 2550 | periods + 1); | 2576 | periods + 1); | 
| 2551 | sa->runnable_avg_period = decay_load(sa->runnable_avg_period, | 2577 | sa->running_avg_sum = decay_load(sa->running_avg_sum, | 
| 2578 | periods + 1); | ||
| 2579 | sa->avg_period = decay_load(sa->avg_period, | ||
| 2552 | periods + 1); | 2580 | periods + 1); | 
| 2553 | 2581 | ||
| 2554 | /* Efficiently calculate \sum (1..n_period) 1024*y^i */ | 2582 | /* Efficiently calculate \sum (1..n_period) 1024*y^i */ | 
| 2555 | runnable_contrib = __compute_runnable_contrib(periods); | 2583 | runnable_contrib = __compute_runnable_contrib(periods); | 
| 2556 | if (runnable) | 2584 | if (runnable) | 
| 2557 | sa->runnable_avg_sum += runnable_contrib; | 2585 | sa->runnable_avg_sum += runnable_contrib; | 
| 2558 | sa->runnable_avg_period += runnable_contrib; | 2586 | if (running) | 
| 2587 | sa->running_avg_sum += runnable_contrib * scale_freq | ||
| 2588 | >> SCHED_CAPACITY_SHIFT; | ||
| 2589 | sa->avg_period += runnable_contrib; | ||
| 2559 | } | 2590 | } | 
| 2560 | 2591 | ||
| 2561 | /* Remainder of delta accrued against u_0` */ | 2592 | /* Remainder of delta accrued against u_0` */ | 
| 2562 | if (runnable) | 2593 | if (runnable) | 
| 2563 | sa->runnable_avg_sum += delta; | 2594 | sa->runnable_avg_sum += delta; | 
| 2564 | sa->runnable_avg_period += delta; | 2595 | if (running) | 
| 2596 | sa->running_avg_sum += delta * scale_freq | ||
| 2597 | >> SCHED_CAPACITY_SHIFT; | ||
| 2598 | sa->avg_period += delta; | ||
| 2565 | 2599 | ||
| 2566 | return decayed; | 2600 | return decayed; | 
| 2567 | } | 2601 | } | 
| @@ -2578,6 +2612,8 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se) | |||
| 2578 | return 0; | 2612 | return 0; | 
| 2579 | 2613 | ||
| 2580 | se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); | 2614 | se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); | 
| 2615 | se->avg.utilization_avg_contrib = | ||
| 2616 | decay_load(se->avg.utilization_avg_contrib, decays); | ||
| 2581 | 2617 | ||
| 2582 | return decays; | 2618 | return decays; | 
| 2583 | } | 2619 | } | 
| @@ -2613,7 +2649,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa, | |||
| 2613 | 2649 | ||
| 2614 | /* The fraction of a cpu used by this cfs_rq */ | 2650 | /* The fraction of a cpu used by this cfs_rq */ | 
| 2615 | contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, | 2651 | contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, | 
| 2616 | sa->runnable_avg_period + 1); | 2652 | sa->avg_period + 1); | 
| 2617 | contrib -= cfs_rq->tg_runnable_contrib; | 2653 | contrib -= cfs_rq->tg_runnable_contrib; | 
| 2618 | 2654 | ||
| 2619 | if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { | 2655 | if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { | 
| @@ -2666,7 +2702,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) | |||
| 2666 | 2702 | ||
| 2667 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) | 2703 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) | 
| 2668 | { | 2704 | { | 
| 2669 | __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); | 2705 | __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg, | 
| 2706 | runnable, runnable); | ||
| 2670 | __update_tg_runnable_avg(&rq->avg, &rq->cfs); | 2707 | __update_tg_runnable_avg(&rq->avg, &rq->cfs); | 
| 2671 | } | 2708 | } | 
| 2672 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 2709 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 
| @@ -2684,7 +2721,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se) | |||
| 2684 | 2721 | ||
| 2685 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ | 2722 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ | 
| 2686 | contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); | 2723 | contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); | 
| 2687 | contrib /= (se->avg.runnable_avg_period + 1); | 2724 | contrib /= (se->avg.avg_period + 1); | 
| 2688 | se->avg.load_avg_contrib = scale_load(contrib); | 2725 | se->avg.load_avg_contrib = scale_load(contrib); | 
| 2689 | } | 2726 | } | 
| 2690 | 2727 | ||
| @@ -2703,6 +2740,30 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se) | |||
| 2703 | return se->avg.load_avg_contrib - old_contrib; | 2740 | return se->avg.load_avg_contrib - old_contrib; | 
| 2704 | } | 2741 | } | 
| 2705 | 2742 | ||
| 2743 | |||
| 2744 | static inline void __update_task_entity_utilization(struct sched_entity *se) | ||
| 2745 | { | ||
| 2746 | u32 contrib; | ||
| 2747 | |||
| 2748 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ | ||
| 2749 | contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE); | ||
| 2750 | contrib /= (se->avg.avg_period + 1); | ||
| 2751 | se->avg.utilization_avg_contrib = scale_load(contrib); | ||
| 2752 | } | ||
| 2753 | |||
| 2754 | static long __update_entity_utilization_avg_contrib(struct sched_entity *se) | ||
| 2755 | { | ||
| 2756 | long old_contrib = se->avg.utilization_avg_contrib; | ||
| 2757 | |||
| 2758 | if (entity_is_task(se)) | ||
| 2759 | __update_task_entity_utilization(se); | ||
| 2760 | else | ||
| 2761 | se->avg.utilization_avg_contrib = | ||
| 2762 | group_cfs_rq(se)->utilization_load_avg; | ||
| 2763 | |||
| 2764 | return se->avg.utilization_avg_contrib - old_contrib; | ||
| 2765 | } | ||
| 2766 | |||
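The new helpers above turn the frequency-scaled running sum into a utilization contribution by dividing it by the accumulated period. A rough stand-alone model of that ratio, with LOAD_SCALE standing in for SCHED_LOAD_SCALE (an assumption of this sketch, not a value taken from the diff):

#include <stdio.h>

#define LOAD_SCALE 1024

/* ratio of time spent running, expressed on a 0..LOAD_SCALE scale,
 * mirroring the shape of __update_task_entity_utilization() */
static unsigned int utilization_contrib(unsigned int running_sum,
                                        unsigned int avg_period)
{
    return (unsigned int)((unsigned long long)running_sum * LOAD_SCALE /
                          (avg_period + 1));
}

int main(void)
{
    /* a task that ran 256 of the last 1024 accounted units */
    printf("contrib = %u\n", utilization_contrib(256, 1024));
    return 0;
}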
| 2706 | static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, | 2767 | static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, | 
| 2707 | long load_contrib) | 2768 | long load_contrib) | 
| 2708 | { | 2769 | { | 
| @@ -2719,7 +2780,8 @@ static inline void update_entity_load_avg(struct sched_entity *se, | |||
| 2719 | int update_cfs_rq) | 2780 | int update_cfs_rq) | 
| 2720 | { | 2781 | { | 
| 2721 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 2782 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 
| 2722 | long contrib_delta; | 2783 | long contrib_delta, utilization_delta; | 
| 2784 | int cpu = cpu_of(rq_of(cfs_rq)); | ||
| 2723 | u64 now; | 2785 | u64 now; | 
| 2724 | 2786 | ||
| 2725 | /* | 2787 | /* | 
| @@ -2731,18 +2793,22 @@ static inline void update_entity_load_avg(struct sched_entity *se, | |||
| 2731 | else | 2793 | else | 
| 2732 | now = cfs_rq_clock_task(group_cfs_rq(se)); | 2794 | now = cfs_rq_clock_task(group_cfs_rq(se)); | 
| 2733 | 2795 | ||
| 2734 | if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) | 2796 | if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq, | 
| 2797 | cfs_rq->curr == se)) | ||
| 2735 | return; | 2798 | return; | 
| 2736 | 2799 | ||
| 2737 | contrib_delta = __update_entity_load_avg_contrib(se); | 2800 | contrib_delta = __update_entity_load_avg_contrib(se); | 
| 2801 | utilization_delta = __update_entity_utilization_avg_contrib(se); | ||
| 2738 | 2802 | ||
| 2739 | if (!update_cfs_rq) | 2803 | if (!update_cfs_rq) | 
| 2740 | return; | 2804 | return; | 
| 2741 | 2805 | ||
| 2742 | if (se->on_rq) | 2806 | if (se->on_rq) { | 
| 2743 | cfs_rq->runnable_load_avg += contrib_delta; | 2807 | cfs_rq->runnable_load_avg += contrib_delta; | 
| 2744 | else | 2808 | cfs_rq->utilization_load_avg += utilization_delta; | 
| 2809 | } else { | ||
| 2745 | subtract_blocked_load_contrib(cfs_rq, -contrib_delta); | 2810 | subtract_blocked_load_contrib(cfs_rq, -contrib_delta); | 
| 2811 | } | ||
| 2746 | } | 2812 | } | 
| 2747 | 2813 | ||
| 2748 | /* | 2814 | /* | 
| @@ -2817,6 +2883,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
| 2817 | } | 2883 | } | 
| 2818 | 2884 | ||
| 2819 | cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; | 2885 | cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; | 
| 2886 | cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib; | ||
| 2820 | /* we force update consideration on load-balancer moves */ | 2887 | /* we force update consideration on load-balancer moves */ | 
| 2821 | update_cfs_rq_blocked_load(cfs_rq, !wakeup); | 2888 | update_cfs_rq_blocked_load(cfs_rq, !wakeup); | 
| 2822 | } | 2889 | } | 
| @@ -2835,6 +2902,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
| 2835 | update_cfs_rq_blocked_load(cfs_rq, !sleep); | 2902 | update_cfs_rq_blocked_load(cfs_rq, !sleep); | 
| 2836 | 2903 | ||
| 2837 | cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; | 2904 | cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; | 
| 2905 | cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib; | ||
| 2838 | if (sleep) { | 2906 | if (sleep) { | 
| 2839 | cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; | 2907 | cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; | 
| 2840 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); | 2908 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); | 
| @@ -3172,6 +3240,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 3172 | */ | 3240 | */ | 
| 3173 | update_stats_wait_end(cfs_rq, se); | 3241 | update_stats_wait_end(cfs_rq, se); | 
| 3174 | __dequeue_entity(cfs_rq, se); | 3242 | __dequeue_entity(cfs_rq, se); | 
| 3243 | update_entity_load_avg(se, 1); | ||
| 3175 | } | 3244 | } | 
| 3176 | 3245 | ||
| 3177 | update_stats_curr_start(cfs_rq, se); | 3246 | update_stats_curr_start(cfs_rq, se); | 
| @@ -4298,6 +4367,11 @@ static unsigned long capacity_of(int cpu) | |||
| 4298 | return cpu_rq(cpu)->cpu_capacity; | 4367 | return cpu_rq(cpu)->cpu_capacity; | 
| 4299 | } | 4368 | } | 
| 4300 | 4369 | ||
| 4370 | static unsigned long capacity_orig_of(int cpu) | ||
| 4371 | { | ||
| 4372 | return cpu_rq(cpu)->cpu_capacity_orig; | ||
| 4373 | } | ||
| 4374 | |||
| 4301 | static unsigned long cpu_avg_load_per_task(int cpu) | 4375 | static unsigned long cpu_avg_load_per_task(int cpu) | 
| 4302 | { | 4376 | { | 
| 4303 | struct rq *rq = cpu_rq(cpu); | 4377 | struct rq *rq = cpu_rq(cpu); | 
| @@ -4711,6 +4785,33 @@ next: | |||
| 4711 | done: | 4785 | done: | 
| 4712 | return target; | 4786 | return target; | 
| 4713 | } | 4787 | } | 
| 4788 | /* | ||
| 4789 | * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS | ||
| 4790 | * tasks. The unit of the return value must be the same as the one of capacity | ||
| 4791 | * so we can compare the usage with the capacity of the CPU that is available | ||
| 4792 | * for CFS tasks (i.e. cpu_capacity). | ||
| 4793 | * cfs.utilization_load_avg is the sum of running time of runnable tasks on a | ||
| 4794 | * CPU. It represents the amount of utilization of a CPU in the range | ||
| 4795 | * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full | ||
| 4796 | * capacity of the CPU because it's about the running time on this CPU. | ||
| 4797 | * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE | ||
| 4798 | * because of unfortunate rounding in avg_period and running_load_avg or just | ||
| 4799 | * after migrating tasks until the average stabilizes with the new running | ||
| 4800 | * time. So we need to check that the usage stays within the range | ||
| 4801 | * [0..cpu_capacity_orig] and cap if necessary. | ||
| 4802 | * Without capping the usage, a group could be seen as overloaded (CPU0 usage | ||
| 4803 | * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity | ||
| 4804 | */ | ||
| 4805 | static int get_cpu_usage(int cpu) | ||
| 4806 | { | ||
| 4807 | unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg; | ||
| 4808 | unsigned long capacity = capacity_orig_of(cpu); | ||
| 4809 | |||
| 4810 | if (usage >= SCHED_LOAD_SCALE) | ||
| 4811 | return capacity; | ||
| 4812 | |||
| 4813 | return (usage * capacity) >> SCHED_LOAD_SHIFT; | ||
| 4814 | } | ||
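get_cpu_usage() rescales the utilization into capacity units and caps it at the CPU's original capacity. The same arithmetic in a self-contained sketch; the constants mirror SCHED_LOAD_SCALE/SCHED_LOAD_SHIFT but are assumptions of this example:

#include <stdio.h>

#define LOAD_SCALE  1024
#define LOAD_SHIFT  10

/* mirror of get_cpu_usage(): scale usage to capacity units and cap it */
static unsigned long cpu_usage(unsigned long util, unsigned long cap_orig)
{
    if (util >= LOAD_SCALE)          /* rounding/migration overshoot */
        return cap_orig;
    return (util * cap_orig) >> LOAD_SHIFT;
}

int main(void)
{
    printf("%lu\n", cpu_usage(512, 800));   /* half busy on an 800 CPU -> 400 */
    printf("%lu\n", cpu_usage(1100, 800));  /* overshoot is capped     -> 800 */
    return 0;
}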
| 4714 | 4815 | ||
| 4715 | /* | 4816 | /* | 
| 4716 | * select_task_rq_fair: Select target runqueue for the waking task in domains | 4817 | * select_task_rq_fair: Select target runqueue for the waking task in domains | 
| @@ -5837,12 +5938,12 @@ struct sg_lb_stats { | |||
| 5837 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | 5938 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | 
| 5838 | unsigned long load_per_task; | 5939 | unsigned long load_per_task; | 
| 5839 | unsigned long group_capacity; | 5940 | unsigned long group_capacity; | 
| 5941 | unsigned long group_usage; /* Total usage of the group */ | ||
| 5840 | unsigned int sum_nr_running; /* Nr tasks running in the group */ | 5942 | unsigned int sum_nr_running; /* Nr tasks running in the group */ | 
| 5841 | unsigned int group_capacity_factor; | ||
| 5842 | unsigned int idle_cpus; | 5943 | unsigned int idle_cpus; | 
| 5843 | unsigned int group_weight; | 5944 | unsigned int group_weight; | 
| 5844 | enum group_type group_type; | 5945 | enum group_type group_type; | 
| 5845 | int group_has_free_capacity; | 5946 | int group_no_capacity; | 
| 5846 | #ifdef CONFIG_NUMA_BALANCING | 5947 | #ifdef CONFIG_NUMA_BALANCING | 
| 5847 | unsigned int nr_numa_running; | 5948 | unsigned int nr_numa_running; | 
| 5848 | unsigned int nr_preferred_running; | 5949 | unsigned int nr_preferred_running; | 
| @@ -5913,16 +6014,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, | |||
| 5913 | return load_idx; | 6014 | return load_idx; | 
| 5914 | } | 6015 | } | 
| 5915 | 6016 | ||
| 5916 | static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu) | ||
| 5917 | { | ||
| 5918 | return SCHED_CAPACITY_SCALE; | ||
| 5919 | } | ||
| 5920 | |||
| 5921 | unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) | ||
| 5922 | { | ||
| 5923 | return default_scale_capacity(sd, cpu); | ||
| 5924 | } | ||
| 5925 | |||
| 5926 | static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) | 6017 | static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) | 
| 5927 | { | 6018 | { | 
| 5928 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) | 6019 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) | 
| @@ -5939,7 +6030,7 @@ unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) | |||
| 5939 | static unsigned long scale_rt_capacity(int cpu) | 6030 | static unsigned long scale_rt_capacity(int cpu) | 
| 5940 | { | 6031 | { | 
| 5941 | struct rq *rq = cpu_rq(cpu); | 6032 | struct rq *rq = cpu_rq(cpu); | 
| 5942 | u64 total, available, age_stamp, avg; | 6033 | u64 total, used, age_stamp, avg; | 
| 5943 | s64 delta; | 6034 | s64 delta; | 
| 5944 | 6035 | ||
| 5945 | /* | 6036 | /* | 
| @@ -5955,19 +6046,12 @@ static unsigned long scale_rt_capacity(int cpu) | |||
| 5955 | 6046 | ||
| 5956 | total = sched_avg_period() + delta; | 6047 | total = sched_avg_period() + delta; | 
| 5957 | 6048 | ||
| 5958 | if (unlikely(total < avg)) { | 6049 | used = div_u64(avg, total); | 
| 5959 | /* Ensures that capacity won't end up being negative */ | ||
| 5960 | available = 0; | ||
| 5961 | } else { | ||
| 5962 | available = total - avg; | ||
| 5963 | } | ||
| 5964 | |||
| 5965 | if (unlikely((s64)total < SCHED_CAPACITY_SCALE)) | ||
| 5966 | total = SCHED_CAPACITY_SCALE; | ||
| 5967 | 6050 | ||
| 5968 | total >>= SCHED_CAPACITY_SHIFT; | 6051 | if (likely(used < SCHED_CAPACITY_SCALE)) | 
| 6052 | return SCHED_CAPACITY_SCALE - used; | ||
| 5969 | 6053 | ||
| 5970 | return div_u64(available, total); | 6054 | return 1; | 
| 5971 | } | 6055 | } | 
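The reworked scale_rt_capacity() no longer needs the explicit frequency scaling or the negative-capacity guard: it divides the (already scaled) rt average by the period and returns what is left for CFS, never reporting less than 1. A simplified user-space model of that computation:

#include <stdio.h>
#include <stdint.h>

#define CAPACITY_SCALE 1024UL

/* fraction of capacity left for CFS once the rt average is divided
 * by the averaging period, mirroring the new scale_rt_capacity() */
static unsigned long cfs_fraction(uint64_t rt_avg, uint64_t total_period)
{
    uint64_t used = rt_avg / total_period;

    if (used < CAPACITY_SCALE)
        return CAPACITY_SCALE - used;
    return 1;   /* never report zero capacity */
}

int main(void)
{
    printf("%lu\n", cfs_fraction(256 * 1000, 1000));  /* 25% eaten by RT -> 768 */
    printf("%lu\n", cfs_fraction(2000 * 1000, 1000)); /* saturated       -> 1   */
    return 0;
}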
| 5972 | 6056 | ||
| 5973 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) | 6057 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) | 
| @@ -5982,14 +6066,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) | |||
| 5982 | 6066 | ||
| 5983 | capacity >>= SCHED_CAPACITY_SHIFT; | 6067 | capacity >>= SCHED_CAPACITY_SHIFT; | 
| 5984 | 6068 | ||
| 5985 | sdg->sgc->capacity_orig = capacity; | 6069 | cpu_rq(cpu)->cpu_capacity_orig = capacity; | 
| 5986 | |||
| 5987 | if (sched_feat(ARCH_CAPACITY)) | ||
| 5988 | capacity *= arch_scale_freq_capacity(sd, cpu); | ||
| 5989 | else | ||
| 5990 | capacity *= default_scale_capacity(sd, cpu); | ||
| 5991 | |||
| 5992 | capacity >>= SCHED_CAPACITY_SHIFT; | ||
| 5993 | 6070 | ||
| 5994 | capacity *= scale_rt_capacity(cpu); | 6071 | capacity *= scale_rt_capacity(cpu); | 
| 5995 | capacity >>= SCHED_CAPACITY_SHIFT; | 6072 | capacity >>= SCHED_CAPACITY_SHIFT; | 
| @@ -6005,7 +6082,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
| 6005 | { | 6082 | { | 
| 6006 | struct sched_domain *child = sd->child; | 6083 | struct sched_domain *child = sd->child; | 
| 6007 | struct sched_group *group, *sdg = sd->groups; | 6084 | struct sched_group *group, *sdg = sd->groups; | 
| 6008 | unsigned long capacity, capacity_orig; | 6085 | unsigned long capacity; | 
| 6009 | unsigned long interval; | 6086 | unsigned long interval; | 
| 6010 | 6087 | ||
| 6011 | interval = msecs_to_jiffies(sd->balance_interval); | 6088 | interval = msecs_to_jiffies(sd->balance_interval); | 
| @@ -6017,7 +6094,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
| 6017 | return; | 6094 | return; | 
| 6018 | } | 6095 | } | 
| 6019 | 6096 | ||
| 6020 | capacity_orig = capacity = 0; | 6097 | capacity = 0; | 
| 6021 | 6098 | ||
| 6022 | if (child->flags & SD_OVERLAP) { | 6099 | if (child->flags & SD_OVERLAP) { | 
| 6023 | /* | 6100 | /* | 
| @@ -6037,19 +6114,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
| 6037 | * Use capacity_of(), which is set irrespective of domains | 6114 | * Use capacity_of(), which is set irrespective of domains | 
| 6038 | * in update_cpu_capacity(). | 6115 | * in update_cpu_capacity(). | 
| 6039 | * | 6116 | * | 
| 6040 | * This avoids capacity/capacity_orig from being 0 and | 6117 | * This avoids capacity from being 0 and | 
| 6041 | * causing divide-by-zero issues on boot. | 6118 | * causing divide-by-zero issues on boot. | 
| 6042 | * | ||
| 6043 | * Runtime updates will correct capacity_orig. | ||
| 6044 | */ | 6119 | */ | 
| 6045 | if (unlikely(!rq->sd)) { | 6120 | if (unlikely(!rq->sd)) { | 
| 6046 | capacity_orig += capacity_of(cpu); | ||
| 6047 | capacity += capacity_of(cpu); | 6121 | capacity += capacity_of(cpu); | 
| 6048 | continue; | 6122 | continue; | 
| 6049 | } | 6123 | } | 
| 6050 | 6124 | ||
| 6051 | sgc = rq->sd->groups->sgc; | 6125 | sgc = rq->sd->groups->sgc; | 
| 6052 | capacity_orig += sgc->capacity_orig; | ||
| 6053 | capacity += sgc->capacity; | 6126 | capacity += sgc->capacity; | 
| 6054 | } | 6127 | } | 
| 6055 | } else { | 6128 | } else { | 
| @@ -6060,39 +6133,24 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
| 6060 | 6133 | ||
| 6061 | group = child->groups; | 6134 | group = child->groups; | 
| 6062 | do { | 6135 | do { | 
| 6063 | capacity_orig += group->sgc->capacity_orig; | ||
| 6064 | capacity += group->sgc->capacity; | 6136 | capacity += group->sgc->capacity; | 
| 6065 | group = group->next; | 6137 | group = group->next; | 
| 6066 | } while (group != child->groups); | 6138 | } while (group != child->groups); | 
| 6067 | } | 6139 | } | 
| 6068 | 6140 | ||
| 6069 | sdg->sgc->capacity_orig = capacity_orig; | ||
| 6070 | sdg->sgc->capacity = capacity; | 6141 | sdg->sgc->capacity = capacity; | 
| 6071 | } | 6142 | } | 
| 6072 | 6143 | ||
| 6073 | /* | 6144 | /* | 
| 6074 | * Try and fix up capacity for tiny siblings, this is needed when | 6145 | * Check whether the capacity of the rq has been noticeably reduced by side | 
| 6075 | * things like SD_ASYM_PACKING need f_b_g to select another sibling | 6146 | * activity. The imbalance_pct is used for the threshold. | 
| 6076 | * which on its own isn't powerful enough. | 6147 | * Return true if the capacity is reduced. | 
| 6077 | * | ||
| 6078 | * See update_sd_pick_busiest() and check_asym_packing(). | ||
| 6079 | */ | 6148 | */ | 
| 6080 | static inline int | 6149 | static inline int | 
| 6081 | fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | 6150 | check_cpu_capacity(struct rq *rq, struct sched_domain *sd) | 
| 6082 | { | 6151 | { | 
| 6083 | /* | 6152 | return ((rq->cpu_capacity * sd->imbalance_pct) < | 
| 6084 | * Only siblings can have significantly less than SCHED_CAPACITY_SCALE | 6153 | (rq->cpu_capacity_orig * 100)); | 
| 6085 | */ | ||
| 6086 | if (!(sd->flags & SD_SHARE_CPUCAPACITY)) | ||
| 6087 | return 0; | ||
| 6088 | |||
| 6089 | /* | ||
| 6090 | * If ~90% of the cpu_capacity is still there, we're good. | ||
| 6091 | */ | ||
| 6092 | if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29) | ||
| 6093 | return 1; | ||
| 6094 | |||
| 6095 | return 0; | ||
| 6096 | } | 6154 | } | 
| 6097 | 6155 | ||
| 6098 | /* | 6156 | /* | 
| @@ -6130,37 +6188,56 @@ static inline int sg_imbalanced(struct sched_group *group) | |||
| 6130 | } | 6188 | } | 
| 6131 | 6189 | ||
| 6132 | /* | 6190 | /* | 
| 6133 | * Compute the group capacity factor. | 6191 | * group_has_capacity returns true if the group has spare capacity that could | 
| 6134 | * | 6192 | * be used by some tasks. | 
| 6135 | * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by | 6193 | * We consider that a group has spare capacity if the number of tasks is | 
| 6136 | * first dividing out the smt factor and computing the actual number of cores | 6194 | * smaller than the number of CPUs or if the usage is lower than the available | 
| 6137 | * and limit unit capacity with that. | 6195 | * capacity for CFS tasks. | 
| 6196 | * For the latter, we use a threshold to stabilize the state, to take into | ||
| 6197 | * account the variance of the tasks' load and to return true if the available | ||
| 6198 | * capacity is meaningful for the load balancer. | ||
| 6199 | * As an example, an available capacity of 1% can appear but it doesn't bring | ||
| 6200 | * any benefit to the load balancer. | ||
| 6138 | */ | 6201 | */ | 
| 6139 | static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) | 6202 | static inline bool | 
| 6203 | group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) | ||
| 6140 | { | 6204 | { | 
| 6141 | unsigned int capacity_factor, smt, cpus; | 6205 | if (sgs->sum_nr_running < sgs->group_weight) | 
| 6142 | unsigned int capacity, capacity_orig; | 6206 | return true; | 
| 6143 | 6207 | ||
| 6144 | capacity = group->sgc->capacity; | 6208 | if ((sgs->group_capacity * 100) > | 
| 6145 | capacity_orig = group->sgc->capacity_orig; | 6209 | (sgs->group_usage * env->sd->imbalance_pct)) | 
| 6146 | cpus = group->group_weight; | 6210 | return true; | 
| 6211 | |||
| 6212 | return false; | ||
| 6213 | } | ||
| 6147 | 6214 | ||
| 6148 | /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ | 6215 | /* | 
| 6149 | smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); | 6216 | * group_is_overloaded returns true if the group has more tasks than it can | 
| 6150 | capacity_factor = cpus / smt; /* cores */ | 6217 | * handle. | 
| 6218 | * group_is_overloaded is not equals to !group_has_capacity because a group | ||
| 6219 | * with the exact right number of tasks, has no more spare capacity but is not | ||
| 6220 | * overloaded so both group_has_capacity and group_is_overloaded return | ||
| 6221 | * false. | ||
| 6222 | */ | ||
| 6223 | static inline bool | ||
| 6224 | group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) | ||
| 6225 | { | ||
| 6226 | if (sgs->sum_nr_running <= sgs->group_weight) | ||
| 6227 | return false; | ||
| 6151 | 6228 | ||
| 6152 | capacity_factor = min_t(unsigned, | 6229 | if ((sgs->group_capacity * 100) < | 
| 6153 | capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); | 6230 | (sgs->group_usage * env->sd->imbalance_pct)) | 
| 6154 | if (!capacity_factor) | 6231 | return true; | 
| 6155 | capacity_factor = fix_small_capacity(env->sd, group); | ||
| 6156 | 6232 | ||
| 6157 | return capacity_factor; | 6233 | return false; | 
| 6158 | } | 6234 | } | 
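group_has_capacity() and group_is_overloaded() replace the old capacity_factor logic with task-count and usage-versus-capacity tests. The sketch below reproduces both predicates on a toy stats structure and exercises the in-between state described in the comment, where neither returns true; the struct and field names are simplified stand-ins for sg_lb_stats:

#include <stdio.h>
#include <stdbool.h>

struct toy_sgs {
    unsigned int nr_running;
    unsigned int weight;         /* number of CPUs in the group */
    unsigned long capacity;
    unsigned long usage;
};

static bool has_capacity(const struct toy_sgs *s, unsigned int imb_pct)
{
    if (s->nr_running < s->weight)
        return true;
    return s->capacity * 100 > s->usage * imb_pct;
}

static bool is_overloaded(const struct toy_sgs *s, unsigned int imb_pct)
{
    if (s->nr_running <= s->weight)
        return false;
    return s->capacity * 100 < s->usage * imb_pct;
}

int main(void)
{
    /* exactly as many tasks as CPUs and fully used:
     * no spare capacity, yet not overloaded either */
    struct toy_sgs s = { .nr_running = 4, .weight = 4,
                         .capacity = 4096, .usage = 4096 };

    printf("has=%d over=%d\n", has_capacity(&s, 125), is_overloaded(&s, 125));
    return 0;
}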
| 6159 | 6235 | ||
| 6160 | static enum group_type | 6236 | static enum group_type group_classify(struct lb_env *env, | 
| 6161 | group_classify(struct sched_group *group, struct sg_lb_stats *sgs) | 6237 | struct sched_group *group, | 
| 6238 | struct sg_lb_stats *sgs) | ||
| 6162 | { | 6239 | { | 
| 6163 | if (sgs->sum_nr_running > sgs->group_capacity_factor) | 6240 | if (sgs->group_no_capacity) | 
| 6164 | return group_overloaded; | 6241 | return group_overloaded; | 
| 6165 | 6242 | ||
| 6166 | if (sg_imbalanced(group)) | 6243 | if (sg_imbalanced(group)) | 
| @@ -6198,6 +6275,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 6198 | load = source_load(i, load_idx); | 6275 | load = source_load(i, load_idx); | 
| 6199 | 6276 | ||
| 6200 | sgs->group_load += load; | 6277 | sgs->group_load += load; | 
| 6278 | sgs->group_usage += get_cpu_usage(i); | ||
| 6201 | sgs->sum_nr_running += rq->cfs.h_nr_running; | 6279 | sgs->sum_nr_running += rq->cfs.h_nr_running; | 
| 6202 | 6280 | ||
| 6203 | if (rq->nr_running > 1) | 6281 | if (rq->nr_running > 1) | 
| @@ -6220,11 +6298,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 6220 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 6298 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 
| 6221 | 6299 | ||
| 6222 | sgs->group_weight = group->group_weight; | 6300 | sgs->group_weight = group->group_weight; | 
| 6223 | sgs->group_capacity_factor = sg_capacity_factor(env, group); | ||
| 6224 | sgs->group_type = group_classify(group, sgs); | ||
| 6225 | 6301 | ||
| 6226 | if (sgs->group_capacity_factor > sgs->sum_nr_running) | 6302 | sgs->group_no_capacity = group_is_overloaded(env, sgs); | 
| 6227 | sgs->group_has_free_capacity = 1; | 6303 | sgs->group_type = group_classify(env, group, sgs); | 
| 6228 | } | 6304 | } | 
| 6229 | 6305 | ||
| 6230 | /** | 6306 | /** | 
| @@ -6346,18 +6422,19 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
| 6346 | 6422 | ||
| 6347 | /* | 6423 | /* | 
| 6348 | * In case the child domain prefers tasks go to siblings | 6424 | * In case the child domain prefers tasks go to siblings | 
| 6349 | * first, lower the sg capacity factor to one so that we'll try | 6425 | * first, lower the sg capacity so that we'll try | 
| 6350 | * and move all the excess tasks away. We lower the capacity | 6426 | * and move all the excess tasks away. We lower the capacity | 
| 6351 | * of a group only if the local group has the capacity to fit | 6427 | * of a group only if the local group has the capacity to fit | 
| 6352 | * these excess tasks, i.e. nr_running < group_capacity_factor. The | 6428 | * these excess tasks. The extra check prevents the case where | 
| 6353 | * extra check prevents the case where you always pull from the | 6429 | * you always pull from the heaviest group when it is already | 
| 6354 | * heaviest group when it is already under-utilized (possible | 6430 | * under-utilized (possible with a large weight task outweighs | 
| 6355 | * with a large weight task outweighs the tasks on the system). | 6431 | * the tasks on the system). | 
| 6356 | */ | 6432 | */ | 
| 6357 | if (prefer_sibling && sds->local && | 6433 | if (prefer_sibling && sds->local && | 
| 6358 | sds->local_stat.group_has_free_capacity) { | 6434 | group_has_capacity(env, &sds->local_stat) && | 
| 6359 | sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); | 6435 | (sgs->sum_nr_running > 1)) { | 
| 6360 | sgs->group_type = group_classify(sg, sgs); | 6436 | sgs->group_no_capacity = 1; | 
| 6437 | sgs->group_type = group_overloaded; | ||
| 6361 | } | 6438 | } | 
| 6362 | 6439 | ||
| 6363 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { | 6440 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { | 
| @@ -6537,11 +6614,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 6537 | */ | 6614 | */ | 
| 6538 | if (busiest->group_type == group_overloaded && | 6615 | if (busiest->group_type == group_overloaded && | 
| 6539 | local->group_type == group_overloaded) { | 6616 | local->group_type == group_overloaded) { | 
| 6540 | load_above_capacity = | 6617 | load_above_capacity = busiest->sum_nr_running * | 
| 6541 | (busiest->sum_nr_running - busiest->group_capacity_factor); | 6618 | SCHED_LOAD_SCALE; | 
| 6542 | 6619 | if (load_above_capacity > busiest->group_capacity) | |
| 6543 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); | 6620 | load_above_capacity -= busiest->group_capacity; | 
| 6544 | load_above_capacity /= busiest->group_capacity; | 6621 | else | 
| 6622 | load_above_capacity = ~0UL; | ||
| 6545 | } | 6623 | } | 
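The new imbalance path expresses the excess load directly as nr_running * SCHED_LOAD_SCALE minus the group capacity, with ~0UL as a sentinel when there is no excess so this term stops limiting the pull. Approximately, as a stand-alone sketch:

#include <stdio.h>

#define LOAD_SCALE 1024

static unsigned long load_above_capacity(unsigned int nr_running,
                                         unsigned long group_capacity)
{
    unsigned long load = (unsigned long)nr_running * LOAD_SCALE;

    if (load > group_capacity)
        return load - group_capacity;
    return ~0UL;   /* no excess: the sentinel keeps this term from capping */
}

int main(void)
{
    printf("%lu\n", load_above_capacity(6, 4096));  /* 6144 - 4096 = 2048 */
    printf("%lu\n", load_above_capacity(3, 4096));  /* sentinel ~0UL      */
    return 0;
}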
| 6546 | 6624 | ||
| 6547 | /* | 6625 | /* | 
| @@ -6604,6 +6682,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
| 6604 | local = &sds.local_stat; | 6682 | local = &sds.local_stat; | 
| 6605 | busiest = &sds.busiest_stat; | 6683 | busiest = &sds.busiest_stat; | 
| 6606 | 6684 | ||
| 6685 | /* ASYM feature bypasses nice load balance check */ | ||
| 6607 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && | 6686 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && | 
| 6608 | check_asym_packing(env, &sds)) | 6687 | check_asym_packing(env, &sds)) | 
| 6609 | return sds.busiest; | 6688 | return sds.busiest; | 
| @@ -6624,8 +6703,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
| 6624 | goto force_balance; | 6703 | goto force_balance; | 
| 6625 | 6704 | ||
| 6626 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 6705 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 
| 6627 | if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity && | 6706 | if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && | 
| 6628 | !busiest->group_has_free_capacity) | 6707 | busiest->group_no_capacity) | 
| 6629 | goto force_balance; | 6708 | goto force_balance; | 
| 6630 | 6709 | ||
| 6631 | /* | 6710 | /* | 
| @@ -6684,7 +6763,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
| 6684 | int i; | 6763 | int i; | 
| 6685 | 6764 | ||
| 6686 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 6765 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 
| 6687 | unsigned long capacity, capacity_factor, wl; | 6766 | unsigned long capacity, wl; | 
| 6688 | enum fbq_type rt; | 6767 | enum fbq_type rt; | 
| 6689 | 6768 | ||
| 6690 | rq = cpu_rq(i); | 6769 | rq = cpu_rq(i); | 
| @@ -6713,9 +6792,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
| 6713 | continue; | 6792 | continue; | 
| 6714 | 6793 | ||
| 6715 | capacity = capacity_of(i); | 6794 | capacity = capacity_of(i); | 
| 6716 | capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE); | ||
| 6717 | if (!capacity_factor) | ||
| 6718 | capacity_factor = fix_small_capacity(env->sd, group); | ||
| 6719 | 6795 | ||
| 6720 | wl = weighted_cpuload(i); | 6796 | wl = weighted_cpuload(i); | 
| 6721 | 6797 | ||
| @@ -6723,7 +6799,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
| 6723 | * When comparing with imbalance, use weighted_cpuload() | 6799 | * When comparing with imbalance, use weighted_cpuload() | 
| 6724 | * which is not scaled with the cpu capacity. | 6800 | * which is not scaled with the cpu capacity. | 
| 6725 | */ | 6801 | */ | 
| 6726 | if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) | 6802 | |
| 6803 | if (rq->nr_running == 1 && wl > env->imbalance && | ||
| 6804 | !check_cpu_capacity(rq, env->sd)) | ||
| 6727 | continue; | 6805 | continue; | 
| 6728 | 6806 | ||
| 6729 | /* | 6807 | /* | 
| @@ -6771,6 +6849,19 @@ static int need_active_balance(struct lb_env *env) | |||
| 6771 | return 1; | 6849 | return 1; | 
| 6772 | } | 6850 | } | 
| 6773 | 6851 | ||
| 6852 | /* | ||
| 6853 | * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. | ||
| 6854 | * It's worth migrating the task if the src_cpu's capacity is reduced | ||
| 6855 | * because of other sched_class or IRQs if more capacity stays | ||
| 6856 | * available on dst_cpu. | ||
| 6857 | */ | ||
| 6858 | if ((env->idle != CPU_NOT_IDLE) && | ||
| 6859 | (env->src_rq->cfs.h_nr_running == 1)) { | ||
| 6860 | if ((check_cpu_capacity(env->src_rq, sd)) && | ||
| 6861 | (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) | ||
| 6862 | return 1; | ||
| 6863 | } | ||
| 6864 | |||
| 6774 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); | 6865 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); | 
| 6775 | } | 6866 | } | 
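The added need_active_balance() clause migrates a lone CFS task away from a CPU whose capacity has been eaten by RT/IRQ activity, provided the idle destination keeps more capacity. A condensed model of that condition; the parameter names are illustrative, not kernel identifiers:

#include <stdio.h>
#include <stdbool.h>

static bool worth_active_migration(unsigned long src_cap,
                                   unsigned long src_cap_orig,
                                   unsigned long dst_cap,
                                   unsigned int imb_pct,
                                   unsigned int src_cfs_tasks,
                                   bool dst_idle)
{
    if (!dst_idle || src_cfs_tasks != 1)
        return false;
    if (src_cap * imb_pct >= src_cap_orig * 100)   /* not reduced enough */
        return false;
    return src_cap * imb_pct < dst_cap * 100;      /* destination has headroom */
}

int main(void)
{
    /* source CPU hammered by IRQs (600/1024), idle destination at 1024 */
    printf("%d\n", worth_active_migration(600, 1024, 1024, 125, 1, true));
    return 0;
}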
| 6776 | 6867 | ||
| @@ -6870,6 +6961,9 @@ redo: | |||
| 6870 | 6961 | ||
| 6871 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); | 6962 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); | 
| 6872 | 6963 | ||
| 6964 | env.src_cpu = busiest->cpu; | ||
| 6965 | env.src_rq = busiest; | ||
| 6966 | |||
| 6873 | ld_moved = 0; | 6967 | ld_moved = 0; | 
| 6874 | if (busiest->nr_running > 1) { | 6968 | if (busiest->nr_running > 1) { | 
| 6875 | /* | 6969 | /* | 
| @@ -6879,8 +6973,6 @@ redo: | |||
| 6879 | * correctly treated as an imbalance. | 6973 | * correctly treated as an imbalance. | 
| 6880 | */ | 6974 | */ | 
| 6881 | env.flags |= LBF_ALL_PINNED; | 6975 | env.flags |= LBF_ALL_PINNED; | 
| 6882 | env.src_cpu = busiest->cpu; | ||
| 6883 | env.src_rq = busiest; | ||
| 6884 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); | 6976 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); | 
| 6885 | 6977 | ||
| 6886 | more_balance: | 6978 | more_balance: | 
| @@ -7580,22 +7672,25 @@ end: | |||
| 7580 | 7672 | ||
| 7581 | /* | 7673 | /* | 
| 7582 | * Current heuristic for kicking the idle load balancer in the presence | 7674 | * Current heuristic for kicking the idle load balancer in the presence | 
| 7583 | * of an idle cpu is the system. | 7675 | * of an idle cpu in the system. | 
| 7584 | * - This rq has more than one task. | 7676 | * - This rq has more than one task. | 
| 7585 | * - At any scheduler domain level, this cpu's scheduler group has multiple | 7677 | * - This rq has at least one CFS task and the capacity of the CPU is | 
| 7586 | * busy cpu's exceeding the group's capacity. | 7678 | * significantly reduced because of RT tasks or IRQs. | 
| 7679 | * - At the parent of the LLC scheduler domain level, this cpu's scheduler | ||
| 7680 | * group has multiple busy cpus. | ||
| 7587 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler | 7681 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler | 
| 7588 | * domain span are idle. | 7682 | * domain span are idle. | 
| 7589 | */ | 7683 | */ | 
| 7590 | static inline int nohz_kick_needed(struct rq *rq) | 7684 | static inline bool nohz_kick_needed(struct rq *rq) | 
| 7591 | { | 7685 | { | 
| 7592 | unsigned long now = jiffies; | 7686 | unsigned long now = jiffies; | 
| 7593 | struct sched_domain *sd; | 7687 | struct sched_domain *sd; | 
| 7594 | struct sched_group_capacity *sgc; | 7688 | struct sched_group_capacity *sgc; | 
| 7595 | int nr_busy, cpu = rq->cpu; | 7689 | int nr_busy, cpu = rq->cpu; | 
| 7690 | bool kick = false; | ||
| 7596 | 7691 | ||
| 7597 | if (unlikely(rq->idle_balance)) | 7692 | if (unlikely(rq->idle_balance)) | 
| 7598 | return 0; | 7693 | return false; | 
| 7599 | 7694 | ||
| 7600 | /* | 7695 | /* | 
| 7601 | * We may be recently in ticked or tickless idle mode. At the first | 7696 | * We may be recently in ticked or tickless idle mode. At the first | 
| @@ -7609,38 +7704,46 @@ static inline int nohz_kick_needed(struct rq *rq) | |||
| 7609 | * balancing. | 7704 | * balancing. | 
| 7610 | */ | 7705 | */ | 
| 7611 | if (likely(!atomic_read(&nohz.nr_cpus))) | 7706 | if (likely(!atomic_read(&nohz.nr_cpus))) | 
| 7612 | return 0; | 7707 | return false; | 
| 7613 | 7708 | ||
| 7614 | if (time_before(now, nohz.next_balance)) | 7709 | if (time_before(now, nohz.next_balance)) | 
| 7615 | return 0; | 7710 | return false; | 
| 7616 | 7711 | ||
| 7617 | if (rq->nr_running >= 2) | 7712 | if (rq->nr_running >= 2) | 
| 7618 | goto need_kick; | 7713 | return true; | 
| 7619 | 7714 | ||
| 7620 | rcu_read_lock(); | 7715 | rcu_read_lock(); | 
| 7621 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); | 7716 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); | 
| 7622 | |||
| 7623 | if (sd) { | 7717 | if (sd) { | 
| 7624 | sgc = sd->groups->sgc; | 7718 | sgc = sd->groups->sgc; | 
| 7625 | nr_busy = atomic_read(&sgc->nr_busy_cpus); | 7719 | nr_busy = atomic_read(&sgc->nr_busy_cpus); | 
| 7626 | 7720 | ||
| 7627 | if (nr_busy > 1) | 7721 | if (nr_busy > 1) { | 
| 7628 | goto need_kick_unlock; | 7722 | kick = true; | 
| 7723 | goto unlock; | ||
| 7724 | } | ||
| 7725 | |||
| 7629 | } | 7726 | } | 
| 7630 | 7727 | ||
| 7631 | sd = rcu_dereference(per_cpu(sd_asym, cpu)); | 7728 | sd = rcu_dereference(rq->sd); | 
| 7729 | if (sd) { | ||
| 7730 | if ((rq->cfs.h_nr_running >= 1) && | ||
| 7731 | check_cpu_capacity(rq, sd)) { | ||
| 7732 | kick = true; | ||
| 7733 | goto unlock; | ||
| 7734 | } | ||
| 7735 | } | ||
| 7632 | 7736 | ||
| 7737 | sd = rcu_dereference(per_cpu(sd_asym, cpu)); | ||
| 7633 | if (sd && (cpumask_first_and(nohz.idle_cpus_mask, | 7738 | if (sd && (cpumask_first_and(nohz.idle_cpus_mask, | 
| 7634 | sched_domain_span(sd)) < cpu)) | 7739 | sched_domain_span(sd)) < cpu)) { | 
| 7635 | goto need_kick_unlock; | 7740 | kick = true; | 
| 7636 | 7741 | goto unlock; | |
| 7637 | rcu_read_unlock(); | 7742 | } | 
| 7638 | return 0; | ||
| 7639 | 7743 | ||
| 7640 | need_kick_unlock: | 7744 | unlock: | 
| 7641 | rcu_read_unlock(); | 7745 | rcu_read_unlock(); | 
| 7642 | need_kick: | 7746 | return kick; | 
| 7643 | return 1; | ||
| 7644 | } | 7747 | } | 
| 7645 | #else | 7748 | #else | 
| 7646 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } | 7749 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } | 
| @@ -7656,14 +7759,16 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
| 7656 | enum cpu_idle_type idle = this_rq->idle_balance ? | 7759 | enum cpu_idle_type idle = this_rq->idle_balance ? | 
| 7657 | CPU_IDLE : CPU_NOT_IDLE; | 7760 | CPU_IDLE : CPU_NOT_IDLE; | 
| 7658 | 7761 | ||
| 7659 | rebalance_domains(this_rq, idle); | ||
| 7660 | |||
| 7661 | /* | 7762 | /* | 
| 7662 | * If this cpu has a pending nohz_balance_kick, then do the | 7763 | * If this cpu has a pending nohz_balance_kick, then do the | 
| 7663 | * balancing on behalf of the other idle cpus whose ticks are | 7764 | * balancing on behalf of the other idle cpus whose ticks are | 
| 7664 | * stopped. | 7765 | * stopped. Do nohz_idle_balance *before* rebalance_domains to | 
| 7766 | * give the idle cpus a chance to load balance. Else we may | ||
| 7767 | * load balance only within the local sched_domain hierarchy | ||
| 7768 | * and abort nohz_idle_balance altogether if we pull some load. | ||
| 7665 | */ | 7769 | */ | 
| 7666 | nohz_idle_balance(this_rq, idle); | 7770 | nohz_idle_balance(this_rq, idle); | 
| 7771 | rebalance_domains(this_rq, idle); | ||
| 7667 | } | 7772 | } | 
| 7668 | 7773 | ||
| 7669 | /* | 7774 | /* | 
| diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 90284d117fe6..91e33cd485f6 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
| @@ -56,6 +56,19 @@ SCHED_FEAT(NONTASK_CAPACITY, true) | |||
| 56 | */ | 56 | */ | 
| 57 | SCHED_FEAT(TTWU_QUEUE, true) | 57 | SCHED_FEAT(TTWU_QUEUE, true) | 
| 58 | 58 | ||
| 59 | #ifdef HAVE_RT_PUSH_IPI | ||
| 60 | /* | ||
| 61 | * When many CPUs lower their priorities at the same time while a | ||
| 62 | * single CPU has an RT task that can migrate and is waiting to run, | ||
| 63 | * all of those CPUs would try to take that CPU's rq lock and could | ||
| 64 | * create a lot of contention, in a thundering herd. In that case, | ||
| 65 | * sending an IPI to the CPU with the waiting RT task and letting | ||
| 66 | * that CPU push the task to where it should go may be the better | ||
| 67 | * approach. | ||
| 68 | */ | ||
| 69 | SCHED_FEAT(RT_PUSH_IPI, true) | ||
| 70 | #endif | ||
| 71 | |||
| 59 | SCHED_FEAT(FORCE_SD_OVERLAP, false) | 72 | SCHED_FEAT(FORCE_SD_OVERLAP, false) | 
| 60 | SCHED_FEAT(RT_RUNTIME_SHARE, true) | 73 | SCHED_FEAT(RT_RUNTIME_SHARE, true) | 
| 61 | SCHED_FEAT(LB_MIN, false) | 74 | SCHED_FEAT(LB_MIN, false) | 
| diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 80014a178342..deef1caa94c6 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
| @@ -158,8 +158,7 @@ static void cpuidle_idle_call(void) | |||
| 158 | * is used from another cpu as a broadcast timer, this call may | 158 | * is used from another cpu as a broadcast timer, this call may | 
| 159 | * fail if it is not available | 159 | * fail if it is not available | 
| 160 | */ | 160 | */ | 
| 161 | if (broadcast && | 161 | if (broadcast && tick_broadcast_enter()) | 
| 162 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) | ||
| 163 | goto use_default; | 162 | goto use_default; | 
| 164 | 163 | ||
| 165 | /* Take note of the planned idle state. */ | 164 | /* Take note of the planned idle state. */ | 
| @@ -176,7 +175,7 @@ static void cpuidle_idle_call(void) | |||
| 176 | idle_set_state(this_rq(), NULL); | 175 | idle_set_state(this_rq(), NULL); | 
| 177 | 176 | ||
| 178 | if (broadcast) | 177 | if (broadcast) | 
| 179 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); | 178 | tick_broadcast_exit(); | 
| 180 | 179 | ||
| 181 | /* | 180 | /* | 
| 182 | * Give the governor an opportunity to reflect on the outcome | 181 | * Give the governor an opportunity to reflect on the outcome | 
| @@ -210,6 +209,8 @@ use_default: | |||
| 210 | goto exit_idle; | 209 | goto exit_idle; | 
| 211 | } | 210 | } | 
| 212 | 211 | ||
| 212 | DEFINE_PER_CPU(bool, cpu_dead_idle); | ||
| 213 | |||
| 213 | /* | 214 | /* | 
| 214 | * Generic idle loop implementation | 215 | * Generic idle loop implementation | 
| 215 | * | 216 | * | 
| @@ -234,8 +235,13 @@ static void cpu_idle_loop(void) | |||
| 234 | check_pgt_cache(); | 235 | check_pgt_cache(); | 
| 235 | rmb(); | 236 | rmb(); | 
| 236 | 237 | ||
| 237 | if (cpu_is_offline(smp_processor_id())) | 238 | if (cpu_is_offline(smp_processor_id())) { | 
| 239 | rcu_cpu_notify(NULL, CPU_DYING_IDLE, | ||
| 240 | (void *)(long)smp_processor_id()); | ||
| 241 | smp_mb(); /* all activity before dead. */ | ||
| 242 | this_cpu_write(cpu_dead_idle, true); | ||
| 238 | arch_cpu_idle_dead(); | 243 | arch_cpu_idle_dead(); | 
| 244 | } | ||
| 239 | 245 | ||
| 240 | local_irq_disable(); | 246 | local_irq_disable(); | 
| 241 | arch_cpu_idle_enter(); | 247 | arch_cpu_idle_enter(); | 
| diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f4d4b077eba0..575da76a3874 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
| @@ -6,6 +6,7 @@ | |||
| 6 | #include "sched.h" | 6 | #include "sched.h" | 
| 7 | 7 | ||
| 8 | #include <linux/slab.h> | 8 | #include <linux/slab.h> | 
| 9 | #include <linux/irq_work.h> | ||
| 9 | 10 | ||
| 10 | int sched_rr_timeslice = RR_TIMESLICE; | 11 | int sched_rr_timeslice = RR_TIMESLICE; | 
| 11 | 12 | ||
| @@ -59,7 +60,11 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
| 59 | raw_spin_unlock(&rt_b->rt_runtime_lock); | 60 | raw_spin_unlock(&rt_b->rt_runtime_lock); | 
| 60 | } | 61 | } | 
| 61 | 62 | ||
| 62 | void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | 63 | #ifdef CONFIG_SMP | 
| 64 | static void push_irq_work_func(struct irq_work *work); | ||
| 65 | #endif | ||
| 66 | |||
| 67 | void init_rt_rq(struct rt_rq *rt_rq) | ||
| 63 | { | 68 | { | 
| 64 | struct rt_prio_array *array; | 69 | struct rt_prio_array *array; | 
| 65 | int i; | 70 | int i; | 
| @@ -78,7 +83,14 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
| 78 | rt_rq->rt_nr_migratory = 0; | 83 | rt_rq->rt_nr_migratory = 0; | 
| 79 | rt_rq->overloaded = 0; | 84 | rt_rq->overloaded = 0; | 
| 80 | plist_head_init(&rt_rq->pushable_tasks); | 85 | plist_head_init(&rt_rq->pushable_tasks); | 
| 86 | |||
| 87 | #ifdef HAVE_RT_PUSH_IPI | ||
| 88 | rt_rq->push_flags = 0; | ||
| 89 | rt_rq->push_cpu = nr_cpu_ids; | ||
| 90 | raw_spin_lock_init(&rt_rq->push_lock); | ||
| 91 | init_irq_work(&rt_rq->push_work, push_irq_work_func); | ||
| 81 | #endif | 92 | #endif | 
| 93 | #endif /* CONFIG_SMP */ | ||
| 82 | /* We start is dequeued state, because no RT tasks are queued */ | 94 | /* We start is dequeued state, because no RT tasks are queued */ | 
| 83 | rt_rq->rt_queued = 0; | 95 | rt_rq->rt_queued = 0; | 
| 84 | 96 | ||
| @@ -193,7 +205,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
| 193 | if (!rt_se) | 205 | if (!rt_se) | 
| 194 | goto err_free_rq; | 206 | goto err_free_rq; | 
| 195 | 207 | ||
| 196 | init_rt_rq(rt_rq, cpu_rq(i)); | 208 | init_rt_rq(rt_rq); | 
| 197 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | 209 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | 
| 198 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); | 210 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); | 
| 199 | } | 211 | } | 
| @@ -1778,6 +1790,164 @@ static void push_rt_tasks(struct rq *rq) | |||
| 1778 | ; | 1790 | ; | 
| 1779 | } | 1791 | } | 
| 1780 | 1792 | ||
| 1793 | #ifdef HAVE_RT_PUSH_IPI | ||
| 1794 | /* | ||
| 1795 | * The search for the next cpu always starts at rq->cpu and ends | ||
| 1796 | * when we reach rq->cpu again. It will never return rq->cpu. | ||
| 1797 | * This returns the next cpu to check, or nr_cpu_ids if the loop | ||
| 1798 | * is complete. | ||
| 1799 | * | ||
| 1800 | * rq->rt.push_cpu holds the last cpu returned by this function, | ||
| 1801 | * or if this is the first instance, it must hold rq->cpu. | ||
| 1802 | */ | ||
| 1803 | static int rto_next_cpu(struct rq *rq) | ||
| 1804 | { | ||
| 1805 | int prev_cpu = rq->rt.push_cpu; | ||
| 1806 | int cpu; | ||
| 1807 | |||
| 1808 | cpu = cpumask_next(prev_cpu, rq->rd->rto_mask); | ||
| 1809 | |||
| 1810 | /* | ||
| 1811 | * If the previous cpu is less than the rq's CPU, then it already | ||
| 1812 | * passed the end of the mask, and has started from the beginning. | ||
| 1813 | * We end if the next CPU is greater or equal to rq's CPU. | ||
| 1814 | */ | ||
| 1815 | if (prev_cpu < rq->cpu) { | ||
| 1816 | if (cpu >= rq->cpu) | ||
| 1817 | return nr_cpu_ids; | ||
| 1818 | |||
| 1819 | } else if (cpu >= nr_cpu_ids) { | ||
| 1820 | /* | ||
| 1821 | * We passed the end of the mask, start at the beginning. | ||
| 1822 | * If the result is greater or equal to the rq's CPU, then | ||
| 1823 | * the loop is finished. | ||
| 1824 | */ | ||
| 1825 | cpu = cpumask_first(rq->rd->rto_mask); | ||
| 1826 | if (cpu >= rq->cpu) | ||
| 1827 | return nr_cpu_ids; | ||
| 1828 | } | ||
| 1829 | rq->rt.push_cpu = cpu; | ||
| 1830 | |||
| 1831 | /* Return cpu to let the caller know if the loop is finished or not */ | ||
| 1832 | return cpu; | ||
| 1833 | } | ||
| 1834 | |||
| 1835 | static int find_next_push_cpu(struct rq *rq) | ||
| 1836 | { | ||
| 1837 | struct rq *next_rq; | ||
| 1838 | int cpu; | ||
| 1839 | |||
| 1840 | while (1) { | ||
| 1841 | cpu = rto_next_cpu(rq); | ||
| 1842 | if (cpu >= nr_cpu_ids) | ||
| 1843 | break; | ||
| 1844 | next_rq = cpu_rq(cpu); | ||
| 1845 | |||
| 1846 | /* Make sure the next rq can push to this rq */ | ||
| 1847 | if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr) | ||
| 1848 | break; | ||
| 1849 | } | ||
| 1850 | |||
| 1851 | return cpu; | ||
| 1852 | } | ||
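rto_next_cpu()/find_next_push_cpu() walk the root domain's rto_mask circularly, starting after the requesting CPU and stopping before coming back around to it. A toy user-space version of that circular walk; the bitmap handling is simplified to an int array and the priority check is dropped:

#include <stdio.h>

#define NR_CPUS 8

/* return the next set CPU after 'prev', wrapping around, or -1 once the
 * walk would reach 'self' again (self itself is never returned) */
static int next_rto_cpu(const int mask[NR_CPUS], int self, int prev)
{
    int cpu;

    for (cpu = prev + 1; cpu != prev + 1 + NR_CPUS; cpu++) {
        int c = cpu % NR_CPUS;

        if (c == self)
            return -1;              /* full loop done */
        if (mask[c])
            return c;
    }
    return -1;
}

int main(void)
{
    int rto_mask[NR_CPUS] = { 0, 0, 1, 0, 1, 0, 0, 1 };
    int self = 4, cpu = self;

    while ((cpu = next_rto_cpu(rto_mask, self, cpu)) >= 0)
        printf("push attempt on cpu %d\n", cpu);   /* visits 7, then 2 */
    return 0;
}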
| 1853 | |||
| 1854 | #define RT_PUSH_IPI_EXECUTING 1 | ||
| 1855 | #define RT_PUSH_IPI_RESTART 2 | ||
| 1856 | |||
| 1857 | static void tell_cpu_to_push(struct rq *rq) | ||
| 1858 | { | ||
| 1859 | int cpu; | ||
| 1860 | |||
| 1861 | if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { | ||
| 1862 | raw_spin_lock(&rq->rt.push_lock); | ||
| 1863 | /* Make sure it's still executing */ | ||
| 1864 | if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { | ||
| 1865 | /* | ||
| 1866 | * Tell the IPI to restart the loop as things have | ||
| 1867 | * changed since it started. | ||
| 1868 | */ | ||
| 1869 | rq->rt.push_flags |= RT_PUSH_IPI_RESTART; | ||
| 1870 | raw_spin_unlock(&rq->rt.push_lock); | ||
| 1871 | return; | ||
| 1872 | } | ||
| 1873 | raw_spin_unlock(&rq->rt.push_lock); | ||
| 1874 | } | ||
| 1875 | |||
| 1876 | /* When here, there's no IPI going around */ | ||
| 1877 | |||
| 1878 | rq->rt.push_cpu = rq->cpu; | ||
| 1879 | cpu = find_next_push_cpu(rq); | ||
| 1880 | if (cpu >= nr_cpu_ids) | ||
| 1881 | return; | ||
| 1882 | |||
| 1883 | rq->rt.push_flags = RT_PUSH_IPI_EXECUTING; | ||
| 1884 | |||
| 1885 | irq_work_queue_on(&rq->rt.push_work, cpu); | ||
| 1886 | } | ||
| 1887 | |||
| 1888 | /* Called from hardirq context */ | ||
| 1889 | static void try_to_push_tasks(void *arg) | ||
| 1890 | { | ||
| 1891 | struct rt_rq *rt_rq = arg; | ||
| 1892 | struct rq *rq, *src_rq; | ||
| 1893 | int this_cpu; | ||
| 1894 | int cpu; | ||
| 1895 | |||
| 1896 | this_cpu = rt_rq->push_cpu; | ||
| 1897 | |||
| 1898 | /* Paranoid check */ | ||
| 1899 | BUG_ON(this_cpu != smp_processor_id()); | ||
| 1900 | |||
| 1901 | rq = cpu_rq(this_cpu); | ||
| 1902 | src_rq = rq_of_rt_rq(rt_rq); | ||
| 1903 | |||
| 1904 | again: | ||
| 1905 | if (has_pushable_tasks(rq)) { | ||
| 1906 | raw_spin_lock(&rq->lock); | ||
| 1907 | push_rt_task(rq); | ||
| 1908 | raw_spin_unlock(&rq->lock); | ||
| 1909 | } | ||
| 1910 | |||
| 1911 | /* Pass the IPI to the next rt overloaded queue */ | ||
| 1912 | raw_spin_lock(&rt_rq->push_lock); | ||
| 1913 | /* | ||
| 1914 | * If the source queue changed since the IPI went out, | ||
| 1915 | * we need to restart the search from that CPU again. | ||
| 1916 | */ | ||
| 1917 | if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) { | ||
| 1918 | rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART; | ||
| 1919 | rt_rq->push_cpu = src_rq->cpu; | ||
| 1920 | } | ||
| 1921 | |||
| 1922 | cpu = find_next_push_cpu(src_rq); | ||
| 1923 | |||
| 1924 | if (cpu >= nr_cpu_ids) | ||
| 1925 | rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING; | ||
| 1926 | raw_spin_unlock(&rt_rq->push_lock); | ||
| 1927 | |||
| 1928 | if (cpu >= nr_cpu_ids) | ||
| 1929 | return; | ||
| 1930 | |||
| 1931 | /* | ||
| 1932 | * It is possible that a restart caused this CPU to be | ||
| 1933 | * chosen again. Don't bother with an IPI, just see if we | ||
| 1934 | * have more to push. | ||
| 1935 | */ | ||
| 1936 | if (unlikely(cpu == rq->cpu)) | ||
| 1937 | goto again; | ||
| 1938 | |||
| 1939 | /* Try the next RT overloaded CPU */ | ||
| 1940 | irq_work_queue_on(&rt_rq->push_work, cpu); | ||
| 1941 | } | ||
| 1942 | |||
| 1943 | static void push_irq_work_func(struct irq_work *work) | ||
| 1944 | { | ||
| 1945 | struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work); | ||
| 1946 | |||
| 1947 | try_to_push_tasks(rt_rq); | ||
| 1948 | } | ||
| 1949 | #endif /* HAVE_RT_PUSH_IPI */ | ||
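The push_flags handshake lets tell_cpu_to_push() either start an IPI chain or, if one is already running, merely flag it to restart. The single-threaded sketch below models that handshake; the real code serializes these transitions with rt_rq->push_lock and irq_work, both omitted here:

#include <stdio.h>

#define PUSH_EXECUTING 1
#define PUSH_RESTART   2

static int push_flags;   /* stands in for rt_rq->push_flags */

static void tell_cpu_to_push(void)
{
    if (push_flags & PUSH_EXECUTING) {
        /* an IPI chain is already running: just ask it to restart */
        push_flags |= PUSH_RESTART;
        return;
    }
    push_flags = PUSH_EXECUTING;
    printf("IPI sent to first candidate CPU\n");
}

static void ipi_handler(int more_candidates)
{
    if (push_flags & PUSH_RESTART) {
        push_flags &= ~PUSH_RESTART;
        printf("source changed, restarting the scan\n");
        return;
    }
    if (!more_candidates) {
        push_flags &= ~PUSH_EXECUTING;
        printf("chain finished\n");
        return;
    }
    printf("passing the IPI to the next overloaded CPU\n");
}

int main(void)
{
    tell_cpu_to_push();   /* starts a chain */
    tell_cpu_to_push();   /* a second request only sets RESTART */
    ipi_handler(1);       /* sees RESTART, rescans from the source */
    ipi_handler(0);       /* nothing left, clears EXECUTING */
    return 0;
}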
| 1950 | |||
| 1781 | static int pull_rt_task(struct rq *this_rq) | 1951 | static int pull_rt_task(struct rq *this_rq) | 
| 1782 | { | 1952 | { | 
| 1783 | int this_cpu = this_rq->cpu, ret = 0, cpu; | 1953 | int this_cpu = this_rq->cpu, ret = 0, cpu; | 
| @@ -1793,6 +1963,13 @@ static int pull_rt_task(struct rq *this_rq) | |||
| 1793 | */ | 1963 | */ | 
| 1794 | smp_rmb(); | 1964 | smp_rmb(); | 
| 1795 | 1965 | ||
| 1966 | #ifdef HAVE_RT_PUSH_IPI | ||
| 1967 | if (sched_feat(RT_PUSH_IPI)) { | ||
| 1968 | tell_cpu_to_push(this_rq); | ||
| 1969 | return 0; | ||
| 1970 | } | ||
| 1971 | #endif | ||
| 1972 | |||
| 1796 | for_each_cpu(cpu, this_rq->rd->rto_mask) { | 1973 | for_each_cpu(cpu, this_rq->rd->rto_mask) { | 
| 1797 | if (this_cpu == cpu) | 1974 | if (this_cpu == cpu) | 
| 1798 | continue; | 1975 | continue; | 
| diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index dc0f435a2779..e0e129993958 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -6,6 +6,7 @@ | |||
| 6 | #include <linux/mutex.h> | 6 | #include <linux/mutex.h> | 
| 7 | #include <linux/spinlock.h> | 7 | #include <linux/spinlock.h> | 
| 8 | #include <linux/stop_machine.h> | 8 | #include <linux/stop_machine.h> | 
| 9 | #include <linux/irq_work.h> | ||
| 9 | #include <linux/tick.h> | 10 | #include <linux/tick.h> | 
| 10 | #include <linux/slab.h> | 11 | #include <linux/slab.h> | 
| 11 | 12 | ||
| @@ -362,8 +363,14 @@ struct cfs_rq { | |||
| 362 | * Under CFS, load is tracked on a per-entity basis and aggregated up. | 363 | * Under CFS, load is tracked on a per-entity basis and aggregated up. | 
| 363 | * This allows for the description of both thread and group usage (in | 364 | * This allows for the description of both thread and group usage (in | 
| 364 | * the FAIR_GROUP_SCHED case). | 365 | * the FAIR_GROUP_SCHED case). | 
| 366 | * runnable_load_avg is the sum of the load_avg_contrib of the | ||
| 367 | * sched_entities on the rq. | ||
| 368 | * blocked_load_avg is similar to runnable_load_avg except that its | ||
| 369 | * the blocked sched_entities on the rq. | ||
| 370 | * utilization_load_avg is the sum of the average running time of the | ||
| 371 | * sched_entities on the rq. | ||
| 365 | */ | 372 | */ | 
| 366 | unsigned long runnable_load_avg, blocked_load_avg; | 373 | unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg; | 
| 367 | atomic64_t decay_counter; | 374 | atomic64_t decay_counter; | 
| 368 | u64 last_decay; | 375 | u64 last_decay; | 
| 369 | atomic_long_t removed_load; | 376 | atomic_long_t removed_load; | 
| @@ -418,6 +425,11 @@ static inline int rt_bandwidth_enabled(void) | |||
| 418 | return sysctl_sched_rt_runtime >= 0; | 425 | return sysctl_sched_rt_runtime >= 0; | 
| 419 | } | 426 | } | 
| 420 | 427 | ||
| 428 | /* RT IPI pull logic requires IRQ_WORK */ | ||
| 429 | #ifdef CONFIG_IRQ_WORK | ||
| 430 | # define HAVE_RT_PUSH_IPI | ||
| 431 | #endif | ||
| 432 | |||
| 421 | /* Real-Time classes' related field in a runqueue: */ | 433 | /* Real-Time classes' related field in a runqueue: */ | 
| 422 | struct rt_rq { | 434 | struct rt_rq { | 
| 423 | struct rt_prio_array active; | 435 | struct rt_prio_array active; | 
| @@ -435,7 +447,13 @@ struct rt_rq { | |||
| 435 | unsigned long rt_nr_total; | 447 | unsigned long rt_nr_total; | 
| 436 | int overloaded; | 448 | int overloaded; | 
| 437 | struct plist_head pushable_tasks; | 449 | struct plist_head pushable_tasks; | 
| 450 | #ifdef HAVE_RT_PUSH_IPI | ||
| 451 | int push_flags; | ||
| 452 | int push_cpu; | ||
| 453 | struct irq_work push_work; | ||
| 454 | raw_spinlock_t push_lock; | ||
| 438 | #endif | 455 | #endif | 
| 456 | #endif /* CONFIG_SMP */ | ||
| 439 | int rt_queued; | 457 | int rt_queued; | 
| 440 | 458 | ||
| 441 | int rt_throttled; | 459 | int rt_throttled; | 
| @@ -597,6 +615,7 @@ struct rq { | |||
| 597 | struct sched_domain *sd; | 615 | struct sched_domain *sd; | 
| 598 | 616 | ||
| 599 | unsigned long cpu_capacity; | 617 | unsigned long cpu_capacity; | 
| 618 | unsigned long cpu_capacity_orig; | ||
| 600 | 619 | ||
| 601 | unsigned char idle_balance; | 620 | unsigned char idle_balance; | 
| 602 | /* For active balancing */ | 621 | /* For active balancing */ | 
| @@ -807,7 +826,7 @@ struct sched_group_capacity { | |||
| 807 | * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity | 826 | * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity | 
| 808 | * for a single CPU. | 827 | * for a single CPU. | 
| 809 | */ | 828 | */ | 
| 810 | unsigned int capacity, capacity_orig; | 829 | unsigned int capacity; | 
| 811 | unsigned long next_update; | 830 | unsigned long next_update; | 
| 812 | int imbalance; /* XXX unrelated to capacity but shared group state */ | 831 | int imbalance; /* XXX unrelated to capacity but shared group state */ | 
| 813 | /* | 832 | /* | 
| @@ -1368,9 +1387,18 @@ static inline int hrtick_enabled(struct rq *rq) | |||
| 1368 | 1387 | ||
| 1369 | #ifdef CONFIG_SMP | 1388 | #ifdef CONFIG_SMP | 
| 1370 | extern void sched_avg_update(struct rq *rq); | 1389 | extern void sched_avg_update(struct rq *rq); | 
| 1390 | |||
| 1391 | #ifndef arch_scale_freq_capacity | ||
| 1392 | static __always_inline | ||
| 1393 | unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) | ||
| 1394 | { | ||
| 1395 | return SCHED_CAPACITY_SCALE; | ||
| 1396 | } | ||
| 1397 | #endif | ||
| 1398 | |||
| 1371 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | 1399 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | 
| 1372 | { | 1400 | { | 
| 1373 | rq->rt_avg += rt_delta; | 1401 | rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); | 
| 1374 | sched_avg_update(rq); | 1402 | sched_avg_update(rq); | 
| 1375 | } | 1403 | } | 
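The #ifndef block above is the pattern that lets an architecture override arch_scale_freq_capacity() while everyone else gets the full-capacity default, and sched_rt_avg_update() now accumulates rt time in frequency-scaled units. A user-space illustration of the same override pattern; where the normalization happens is simplified relative to the kernel:

#include <stdio.h>

#define CAPACITY_SCALE 1024UL

/* an architecture can #define arch_scale_freq_capacity to its own
 * implementation before this point; otherwise the generic default applies */
#ifndef arch_scale_freq_capacity
static inline unsigned long arch_scale_freq_capacity(void *sd, int cpu)
{
    (void)sd; (void)cpu;
    return CAPACITY_SCALE;          /* generic: assume full frequency */
}
#endif

int main(void)
{
    unsigned long rt_avg = 0, rt_delta = 100;

    /* rt time is accumulated in frequency-scaled units and normalized
     * later, when the remaining CFS capacity is computed */
    rt_avg += rt_delta * arch_scale_freq_capacity(NULL, 0);
    printf("rt_avg = %lu\n", rt_avg);    /* 100 * 1024 at full speed */
    return 0;
}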
| 1376 | #else | 1404 | #else | 
| @@ -1643,8 +1671,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu); | |||
| 1643 | extern void print_dl_stats(struct seq_file *m, int cpu); | 1671 | extern void print_dl_stats(struct seq_file *m, int cpu); | 
| 1644 | 1672 | ||
| 1645 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); | 1673 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); | 
| 1646 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); | 1674 | extern void init_rt_rq(struct rt_rq *rt_rq); | 
| 1647 | extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq); | 1675 | extern void init_dl_rq(struct dl_rq *dl_rq); | 
| 1648 | 1676 | ||
| 1649 | extern void cfs_bandwidth_usage_inc(void); | 1677 | extern void cfs_bandwidth_usage_inc(void); | 
| 1650 | extern void cfs_bandwidth_usage_dec(void); | 1678 | extern void cfs_bandwidth_usage_dec(void); | 
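
The sched.h hunks above give arch_scale_freq_capacity() a default that returns SCHED_CAPACITY_SCALE and fold it into sched_rt_avg_update(), so RT runtime is accumulated in frequency-scaled units. A rough standalone sketch of that weighting; SCALE, the delta and the capacity values below are made-up demo numbers, not kernel data:

#include <stdio.h>

#define SCALE 1024	/* stands in for SCHED_CAPACITY_SCALE */

/* Default hook: report full capacity, as the fallback above does. */
static unsigned long scale_freq_capacity(int cpu)
{
	(void)cpu;
	return SCALE;
}

int main(void)
{
	unsigned long long rt_avg = 0;
	unsigned long long delta = 1000000;	/* 1 ms of RT execution per sample */
	unsigned long caps[] = { scale_freq_capacity(0), SCALE / 2, SCALE / 2 };

	for (int i = 0; i < 3; i++)
		rt_avg += delta * caps[i];	/* mirrors rt_delta * capacity */

	/* Samples taken at half speed contribute half as much. */
	printf("scaled rt_avg: %llu, flat sum: %llu\n", rt_avg / SCALE, 3 * delta);
	return 0;
}
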
| diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 40190f28db35..c697f73d82d6 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c | |||
| @@ -4,6 +4,7 @@ | |||
| 4 | #include <linux/cpu.h> | 4 | #include <linux/cpu.h> | 
| 5 | #include <linux/err.h> | 5 | #include <linux/err.h> | 
| 6 | #include <linux/smp.h> | 6 | #include <linux/smp.h> | 
| 7 | #include <linux/delay.h> | ||
| 7 | #include <linux/init.h> | 8 | #include <linux/init.h> | 
| 8 | #include <linux/list.h> | 9 | #include <linux/list.h> | 
| 9 | #include <linux/slab.h> | 10 | #include <linux/slab.h> | 
| @@ -314,3 +315,158 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread) | |||
| 314 | put_online_cpus(); | 315 | put_online_cpus(); | 
| 315 | } | 316 | } | 
| 316 | EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); | 317 | EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); | 
| 318 | |||
| 319 | static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); | ||
| 320 | |||
| 321 | /* | ||
| 322 | * Called to poll specified CPU's state, for example, when waiting for | ||
| 323 | * a CPU to come online. | ||
| 324 | */ | ||
| 325 | int cpu_report_state(int cpu) | ||
| 326 | { | ||
| 327 | return atomic_read(&per_cpu(cpu_hotplug_state, cpu)); | ||
| 328 | } | ||
| 329 | |||
| 330 | /* | ||
| 331 | * If the CPU has died properly, set its state to CPU_UP_PREPARE and | ||
| 332 | * return success. Otherwise, return -EBUSY if the CPU died after | ||
| 333 | * cpu_wait_death() timed out. Return -EAGAIN instead | ||
| 334 | * if cpu_wait_death() timed out and the CPU still hasn't gotten around | ||
| 335 | * to dying. In the latter two cases, the CPU might not be set up | ||
| 336 | * properly, but it is up to the arch-specific code to decide. | ||
| 337 | * Finally, -EIO indicates an unanticipated problem. | ||
| 338 | * | ||
| 339 | * Note that it is permissible to omit this call entirely, as is | ||
| 340 | * done in architectures that do no CPU-hotplug error checking. | ||
| 341 | */ | ||
| 342 | int cpu_check_up_prepare(int cpu) | ||
| 343 | { | ||
| 344 | if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) { | ||
| 345 | atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE); | ||
| 346 | return 0; | ||
| 347 | } | ||
| 348 | |||
| 349 | switch (atomic_read(&per_cpu(cpu_hotplug_state, cpu))) { | ||
| 350 | |||
| 351 | case CPU_POST_DEAD: | ||
| 352 | |||
| 353 | /* The CPU died properly, so just start it up again. */ | ||
| 354 | atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE); | ||
| 355 | return 0; | ||
| 356 | |||
| 357 | case CPU_DEAD_FROZEN: | ||
| 358 | |||
| 359 | /* | ||
| 360 | * Timeout during CPU death, so let caller know. | ||
| 361 | * The outgoing CPU completed its processing, but after | ||
| 362 | * cpu_wait_death() timed out and reported the error. The | ||
| 363 | * caller is free to proceed, in which case the state | ||
| 364 | * will be reset properly by cpu_set_state_online(). | ||
| 365 | * Proceeding despite this -EBUSY return makes sense | ||
| 366 | * for systems where the outgoing CPUs take themselves | ||
| 367 | * offline, with no post-death manipulation required from | ||
| 368 | * a surviving CPU. | ||
| 369 | */ | ||
| 370 | return -EBUSY; | ||
| 371 | |||
| 372 | case CPU_BROKEN: | ||
| 373 | |||
| 374 | /* | ||
| 375 | * The most likely reason we got here is that there was | ||
| 376 | * a timeout during CPU death, and the outgoing CPU never | ||
| 377 | * did complete its processing. This could happen on | ||
| 378 | * a virtualized system if the outgoing VCPU gets preempted | ||
| 379 | * for more than five seconds, and the user attempts to | ||
| 380 | * immediately online that same CPU. Trying again later | ||
| 381 | * might return -EBUSY above, hence -EAGAIN. | ||
| 382 | */ | ||
| 383 | return -EAGAIN; | ||
| 384 | |||
| 385 | default: | ||
| 386 | |||
| 387 | /* Should not happen. Famous last words. */ | ||
| 388 | return -EIO; | ||
| 389 | } | ||
| 390 | } | ||
| 391 | |||
| 392 | /* | ||
| 393 | * Mark the specified CPU online. | ||
| 394 | * | ||
| 395 | * Note that it is permissible to omit this call entirely, as is | ||
| 396 | * done in architectures that do no CPU-hotplug error checking. | ||
| 397 | */ | ||
| 398 | void cpu_set_state_online(int cpu) | ||
| 399 | { | ||
| 400 | (void)atomic_xchg(&per_cpu(cpu_hotplug_state, cpu), CPU_ONLINE); | ||
| 401 | } | ||
| 402 | |||
| 403 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 404 | |||
| 405 | /* | ||
| 406 | * Wait for the specified CPU to exit the idle loop and die. | ||
| 407 | */ | ||
| 408 | bool cpu_wait_death(unsigned int cpu, int seconds) | ||
| 409 | { | ||
| 410 | int jf_left = seconds * HZ; | ||
| 411 | int oldstate; | ||
| 412 | bool ret = true; | ||
| 413 | int sleep_jf = 1; | ||
| 414 | |||
| 415 | might_sleep(); | ||
| 416 | |||
| 417 | /* The outgoing CPU will normally get done quite quickly. */ | ||
| 418 | if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD) | ||
| 419 | goto update_state; | ||
| 420 | udelay(5); | ||
| 421 | |||
| 422 | /* But if the outgoing CPU dawdles, wait increasingly long times. */ | ||
| 423 | while (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) != CPU_DEAD) { | ||
| 424 | schedule_timeout_uninterruptible(sleep_jf); | ||
| 425 | jf_left -= sleep_jf; | ||
| 426 | if (jf_left <= 0) | ||
| 427 | break; | ||
| 428 | sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10); | ||
| 429 | } | ||
| 430 | update_state: | ||
| 431 | oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); | ||
| 432 | if (oldstate == CPU_DEAD) { | ||
| 433 | /* Outgoing CPU died normally, update state. */ | ||
| 434 | smp_mb(); /* atomic_read() before update. */ | ||
| 435 | atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD); | ||
| 436 | } else { | ||
| 437 | /* Outgoing CPU still hasn't died, set state accordingly. */ | ||
| 438 | if (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), | ||
| 439 | oldstate, CPU_BROKEN) != oldstate) | ||
| 440 | goto update_state; | ||
| 441 | ret = false; | ||
| 442 | } | ||
| 443 | return ret; | ||
| 444 | } | ||
| 445 | |||
| 446 | /* | ||
| 447 | * Called by the outgoing CPU to report its successful death. Return | ||
| 448 | * false if this report follows the surviving CPU's timing out. | ||
| 449 | * | ||
| 450 | * A separate "CPU_DEAD_FROZEN" is used when the surviving CPU | ||
| 451 | * timed out. This approach allows architectures to omit calls to | ||
| 452 | * cpu_check_up_prepare() and cpu_set_state_online() without defeating | ||
| 453 | * the next cpu_wait_death()'s polling loop. | ||
| 454 | */ | ||
| 455 | bool cpu_report_death(void) | ||
| 456 | { | ||
| 457 | int oldstate; | ||
| 458 | int newstate; | ||
| 459 | int cpu = smp_processor_id(); | ||
| 460 | |||
| 461 | do { | ||
| 462 | oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); | ||
| 463 | if (oldstate != CPU_BROKEN) | ||
| 464 | newstate = CPU_DEAD; | ||
| 465 | else | ||
| 466 | newstate = CPU_DEAD_FROZEN; | ||
| 467 | } while (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), | ||
| 468 | oldstate, newstate) != oldstate); | ||
| 469 | return newstate == CPU_DEAD; | ||
| 470 | } | ||
| 471 | |||
| 472 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
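
cpu_wait_death() above polls the dying CPU with an interval that grows by roughly 10% per round until its jiffies budget is spent. A standalone sketch of just that backoff arithmetic; HZ and the 5-second budget are assumed values for the printout, not taken from the patch:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	int hz = 1000;			/* assumed HZ */
	int jf_left = 5 * hz;		/* assumed 5-second budget */
	int sleep_jf = 1, polls = 0;

	while (jf_left > 0) {
		jf_left -= sleep_jf;
		polls++;
		sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10);	/* grow ~10% per round */
	}
	printf("budget exhausted after %d polls\n", polls);
	return 0;
}
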
| diff --git a/kernel/sys.c b/kernel/sys.c index a03d9cd23ed7..3be344902316 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -325,6 +325,7 @@ out_unlock: | |||
| 325 | * SMP: There are not races, the GIDs are checked only by filesystem | 325 | * SMP: There are not races, the GIDs are checked only by filesystem | 
| 326 | * operations (as far as semantic preservation is concerned). | 326 | * operations (as far as semantic preservation is concerned). | 
| 327 | */ | 327 | */ | 
| 328 | #ifdef CONFIG_MULTIUSER | ||
| 328 | SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) | 329 | SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) | 
| 329 | { | 330 | { | 
| 330 | struct user_namespace *ns = current_user_ns(); | 331 | struct user_namespace *ns = current_user_ns(); | 
| @@ -815,6 +816,7 @@ change_okay: | |||
| 815 | commit_creds(new); | 816 | commit_creds(new); | 
| 816 | return old_fsgid; | 817 | return old_fsgid; | 
| 817 | } | 818 | } | 
| 819 | #endif /* CONFIG_MULTIUSER */ | ||
| 818 | 820 | ||
| 819 | /** | 821 | /** | 
| 820 | * sys_getpid - return the thread group id of the current process | 822 | * sys_getpid - return the thread group id of the current process | 
| diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 5adcb0ae3a58..7995ef5868d8 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
| @@ -159,6 +159,20 @@ cond_syscall(sys_uselib); | |||
| 159 | cond_syscall(sys_fadvise64); | 159 | cond_syscall(sys_fadvise64); | 
| 160 | cond_syscall(sys_fadvise64_64); | 160 | cond_syscall(sys_fadvise64_64); | 
| 161 | cond_syscall(sys_madvise); | 161 | cond_syscall(sys_madvise); | 
| 162 | cond_syscall(sys_setuid); | ||
| 163 | cond_syscall(sys_setregid); | ||
| 164 | cond_syscall(sys_setgid); | ||
| 165 | cond_syscall(sys_setreuid); | ||
| 166 | cond_syscall(sys_setresuid); | ||
| 167 | cond_syscall(sys_getresuid); | ||
| 168 | cond_syscall(sys_setresgid); | ||
| 169 | cond_syscall(sys_getresgid); | ||
| 170 | cond_syscall(sys_setgroups); | ||
| 171 | cond_syscall(sys_getgroups); | ||
| 172 | cond_syscall(sys_setfsuid); | ||
| 173 | cond_syscall(sys_setfsgid); | ||
| 174 | cond_syscall(sys_capget); | ||
| 175 | cond_syscall(sys_capset); | ||
| 162 | 176 | ||
| 163 | /* arch-specific weak syscall entries */ | 177 | /* arch-specific weak syscall entries */ | 
| 164 | cond_syscall(sys_pciconfig_read); | 178 | cond_syscall(sys_pciconfig_read); | 
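
The cond_syscall() entries added above give a CONFIG_MULTIUSER=n build ENOSYS stubs for the uid/gid syscalls it no longer compiles. The mechanism is, roughly, a weak alias to the not-implemented stub; a userspace-flavoured sketch of that linker trick, with illustrative names rather than the kernel's macro:

#include <stdio.h>
#include <errno.h>

long ni_syscall(void)
{
	return -ENOSYS;		/* the "not implemented" stub */
}

/* Weak alias: a strong sys_setgroups_demo() elsewhere would override it. */
long sys_setgroups_demo(void) __attribute__((weak, alias("ni_syscall")));

int main(void)
{
	printf("sys_setgroups_demo() -> %ld\n", sys_setgroups_demo());
	return 0;
}
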
| diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 88ea2d6e0031..42b7fc2860c1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -19,6 +19,7 @@ | |||
| 19 | */ | 19 | */ | 
| 20 | 20 | ||
| 21 | #include <linux/module.h> | 21 | #include <linux/module.h> | 
| 22 | #include <linux/aio.h> | ||
| 22 | #include <linux/mm.h> | 23 | #include <linux/mm.h> | 
| 23 | #include <linux/swap.h> | 24 | #include <linux/swap.h> | 
| 24 | #include <linux/slab.h> | 25 | #include <linux/slab.h> | 
| @@ -846,7 +847,7 @@ static struct ctl_table kern_table[] = { | |||
| 846 | .data = &watchdog_user_enabled, | 847 | .data = &watchdog_user_enabled, | 
| 847 | .maxlen = sizeof (int), | 848 | .maxlen = sizeof (int), | 
| 848 | .mode = 0644, | 849 | .mode = 0644, | 
| 849 | .proc_handler = proc_dowatchdog, | 850 | .proc_handler = proc_watchdog, | 
| 850 | .extra1 = &zero, | 851 | .extra1 = &zero, | 
| 851 | .extra2 = &one, | 852 | .extra2 = &one, | 
| 852 | }, | 853 | }, | 
| @@ -855,11 +856,33 @@ static struct ctl_table kern_table[] = { | |||
| 855 | .data = &watchdog_thresh, | 856 | .data = &watchdog_thresh, | 
| 856 | .maxlen = sizeof(int), | 857 | .maxlen = sizeof(int), | 
| 857 | .mode = 0644, | 858 | .mode = 0644, | 
| 858 | .proc_handler = proc_dowatchdog, | 859 | .proc_handler = proc_watchdog_thresh, | 
| 859 | .extra1 = &zero, | 860 | .extra1 = &zero, | 
| 860 | .extra2 = &sixty, | 861 | .extra2 = &sixty, | 
| 861 | }, | 862 | }, | 
| 862 | { | 863 | { | 
| 864 | .procname = "nmi_watchdog", | ||
| 865 | .data = &nmi_watchdog_enabled, | ||
| 866 | .maxlen = sizeof (int), | ||
| 867 | .mode = 0644, | ||
| 868 | .proc_handler = proc_nmi_watchdog, | ||
| 869 | .extra1 = &zero, | ||
| 870 | #if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) | ||
| 871 | .extra2 = &one, | ||
| 872 | #else | ||
| 873 | .extra2 = &zero, | ||
| 874 | #endif | ||
| 875 | }, | ||
| 876 | { | ||
| 877 | .procname = "soft_watchdog", | ||
| 878 | .data = &soft_watchdog_enabled, | ||
| 879 | .maxlen = sizeof (int), | ||
| 880 | .mode = 0644, | ||
| 881 | .proc_handler = proc_soft_watchdog, | ||
| 882 | .extra1 = &zero, | ||
| 883 | .extra2 = &one, | ||
| 884 | }, | ||
| 885 | { | ||
| 863 | .procname = "softlockup_panic", | 886 | .procname = "softlockup_panic", | 
| 864 | .data = &softlockup_panic, | 887 | .data = &softlockup_panic, | 
| 865 | .maxlen = sizeof(int), | 888 | .maxlen = sizeof(int), | 
| @@ -879,15 +902,6 @@ static struct ctl_table kern_table[] = { | |||
| 879 | .extra2 = &one, | 902 | .extra2 = &one, | 
| 880 | }, | 903 | }, | 
| 881 | #endif /* CONFIG_SMP */ | 904 | #endif /* CONFIG_SMP */ | 
| 882 | { | ||
| 883 | .procname = "nmi_watchdog", | ||
| 884 | .data = &watchdog_user_enabled, | ||
| 885 | .maxlen = sizeof (int), | ||
| 886 | .mode = 0644, | ||
| 887 | .proc_handler = proc_dowatchdog, | ||
| 888 | .extra1 = &zero, | ||
| 889 | .extra2 = &one, | ||
| 890 | }, | ||
| 891 | #endif | 905 | #endif | 
| 892 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | 906 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | 
| 893 | { | 907 | { | 
| @@ -1228,6 +1242,14 @@ static struct ctl_table vm_table[] = { | |||
| 1228 | .extra1 = &zero, | 1242 | .extra1 = &zero, | 
| 1229 | }, | 1243 | }, | 
| 1230 | { | 1244 | { | 
| 1245 | .procname = "dirtytime_expire_seconds", | ||
| 1246 | .data = &dirtytime_expire_interval, | ||
| 1247 | .maxlen = sizeof(dirty_expire_interval), | ||
| 1248 | .mode = 0644, | ||
| 1249 | .proc_handler = dirtytime_interval_handler, | ||
| 1250 | .extra1 = &zero, | ||
| 1251 | }, | ||
| 1252 | { | ||
| 1231 | .procname = "nr_pdflush_threads", | 1253 | .procname = "nr_pdflush_threads", | 
| 1232 | .mode = 0444 /* read-only */, | 1254 | .mode = 0444 /* read-only */, | 
| 1233 | .proc_handler = pdflush_proc_obsolete, | 1255 | .proc_handler = pdflush_proc_obsolete, | 
| @@ -1313,6 +1335,15 @@ static struct ctl_table vm_table[] = { | |||
| 1313 | .extra1 = &min_extfrag_threshold, | 1335 | .extra1 = &min_extfrag_threshold, | 
| 1314 | .extra2 = &max_extfrag_threshold, | 1336 | .extra2 = &max_extfrag_threshold, | 
| 1315 | }, | 1337 | }, | 
| 1338 | { | ||
| 1339 | .procname = "compact_unevictable_allowed", | ||
| 1340 | .data = &sysctl_compact_unevictable_allowed, | ||
| 1341 | .maxlen = sizeof(int), | ||
| 1342 | .mode = 0644, | ||
| 1343 | .proc_handler = proc_dointvec, | ||
| 1344 | .extra1 = &zero, | ||
| 1345 | .extra2 = &one, | ||
| 1346 | }, | ||
| 1316 | 1347 | ||
| 1317 | #endif /* CONFIG_COMPACTION */ | 1348 | #endif /* CONFIG_COMPACTION */ | 
| 1318 | { | 1349 | { | 
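
The new table entries surface as regular files under /proc/sys (kernel/nmi_watchdog, kernel/soft_watchdog, vm/dirtytime_expire_seconds, vm/compact_unevictable_allowed). A small sketch that reads a few of them back, assuming the usual procfs mapping of the names above:

#include <stdio.h>

static int read_knob(const char *path)
{
	FILE *f = fopen(path, "r");
	int val = -1;

	if (!f)
		return -1;
	if (fscanf(f, "%d", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

int main(void)
{
	printf("soft_watchdog            = %d\n",
	       read_knob("/proc/sys/kernel/soft_watchdog"));
	printf("nmi_watchdog             = %d\n",
	       read_knob("/proc/sys/kernel/nmi_watchdog"));
	printf("dirtytime_expire_seconds = %d\n",
	       read_knob("/proc/sys/vm/dirtytime_expire_seconds"));
	return 0;
}
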
| diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index d626dc98e8df..579ce1b929af 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
| @@ -33,12 +33,6 @@ config ARCH_USES_GETTIMEOFFSET | |||
| 33 | config GENERIC_CLOCKEVENTS | 33 | config GENERIC_CLOCKEVENTS | 
| 34 | bool | 34 | bool | 
| 35 | 35 | ||
| 36 | # Migration helper. Builds, but does not invoke | ||
| 37 | config GENERIC_CLOCKEVENTS_BUILD | ||
| 38 | bool | ||
| 39 | default y | ||
| 40 | depends on GENERIC_CLOCKEVENTS | ||
| 41 | |||
| 42 | # Architecture can handle broadcast in a driver-agnostic way | 36 | # Architecture can handle broadcast in a driver-agnostic way | 
| 43 | config ARCH_HAS_TICK_BROADCAST | 37 | config ARCH_HAS_TICK_BROADCAST | 
| 44 | bool | 38 | bool | 
| diff --git a/kernel/time/Makefile b/kernel/time/Makefile index c09c07817d7a..01f0312419b3 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile | |||
| @@ -2,15 +2,13 @@ obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o | |||
| 2 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o | 2 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o | 
| 3 | obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o | 3 | obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o | 
| 4 | 4 | ||
| 5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o | 5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o tick-common.o | 
| 6 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o | ||
| 7 | ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) | 6 | ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) | 
| 8 | obj-y += tick-broadcast.o | 7 | obj-y += tick-broadcast.o | 
| 9 | obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o | 8 | obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o | 
| 10 | endif | 9 | endif | 
| 11 | obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o | 10 | obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o | 
| 12 | obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o | 11 | obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o | 
| 13 | obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o | ||
| 14 | obj-$(CONFIG_TIMER_STATS) += timer_stats.o | 12 | obj-$(CONFIG_TIMER_STATS) += timer_stats.o | 
| 15 | obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o | 13 | obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o | 
| 16 | obj-$(CONFIG_TEST_UDELAY) += test_udelay.o | 14 | obj-$(CONFIG_TEST_UDELAY) += test_udelay.o | 
| diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 55449909f114..25d942d1da27 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
| @@ -94,25 +94,76 @@ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) | |||
| 94 | } | 94 | } | 
| 95 | EXPORT_SYMBOL_GPL(clockevent_delta2ns); | 95 | EXPORT_SYMBOL_GPL(clockevent_delta2ns); | 
| 96 | 96 | ||
| 97 | static int __clockevents_set_state(struct clock_event_device *dev, | ||
| 98 | enum clock_event_state state) | ||
| 99 | { | ||
| 100 | /* Transition with legacy set_mode() callback */ | ||
| 101 | if (dev->set_mode) { | ||
| 102 | /* Legacy callback doesn't support new modes */ | ||
| 103 | if (state > CLOCK_EVT_STATE_ONESHOT) | ||
| 104 | return -ENOSYS; | ||
| 105 | /* | ||
| 106 | * 'clock_event_state' and 'clock_event_mode' have 1-to-1 | ||
| 107 | * mapping until *_ONESHOT, and so a simple cast will work. | ||
| 108 | */ | ||
| 109 | dev->set_mode((enum clock_event_mode)state, dev); | ||
| 110 | dev->mode = (enum clock_event_mode)state; | ||
| 111 | return 0; | ||
| 112 | } | ||
| 113 | |||
| 114 | if (dev->features & CLOCK_EVT_FEAT_DUMMY) | ||
| 115 | return 0; | ||
| 116 | |||
| 117 | /* Transition with new state-specific callbacks */ | ||
| 118 | switch (state) { | ||
| 119 | case CLOCK_EVT_STATE_DETACHED: | ||
| 120 | /* | ||
| 121 | * This is an internal state, which is guaranteed to go from | ||
| 122 | * SHUTDOWN to DETACHED. No driver interaction required. | ||
| 123 | */ | ||
| 124 | return 0; | ||
| 125 | |||
| 126 | case CLOCK_EVT_STATE_SHUTDOWN: | ||
| 127 | return dev->set_state_shutdown(dev); | ||
| 128 | |||
| 129 | case CLOCK_EVT_STATE_PERIODIC: | ||
| 130 | /* Core internal bug */ | ||
| 131 | if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC)) | ||
| 132 | return -ENOSYS; | ||
| 133 | return dev->set_state_periodic(dev); | ||
| 134 | |||
| 135 | case CLOCK_EVT_STATE_ONESHOT: | ||
| 136 | /* Core internal bug */ | ||
| 137 | if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) | ||
| 138 | return -ENOSYS; | ||
| 139 | return dev->set_state_oneshot(dev); | ||
| 140 | |||
| 141 | default: | ||
| 142 | return -ENOSYS; | ||
| 143 | } | ||
| 144 | } | ||
| 145 | |||
| 97 | /** | 146 | /** | 
| 98 | * clockevents_set_mode - set the operating mode of a clock event device | 147 | * clockevents_set_state - set the operating state of a clock event device | 
| 99 | * @dev: device to modify | 148 | * @dev: device to modify | 
| 100 | * @mode: new mode | 149 | * @state: new state | 
| 101 | * | 150 | * | 
| 102 | * Must be called with interrupts disabled ! | 151 | * Must be called with interrupts disabled ! | 
| 103 | */ | 152 | */ | 
| 104 | void clockevents_set_mode(struct clock_event_device *dev, | 153 | void clockevents_set_state(struct clock_event_device *dev, | 
| 105 | enum clock_event_mode mode) | 154 | enum clock_event_state state) | 
| 106 | { | 155 | { | 
| 107 | if (dev->mode != mode) { | 156 | if (dev->state != state) { | 
| 108 | dev->set_mode(mode, dev); | 157 | if (__clockevents_set_state(dev, state)) | 
| 109 | dev->mode = mode; | 158 | return; | 
| 159 | |||
| 160 | dev->state = state; | ||
| 110 | 161 | ||
| 111 | /* | 162 | /* | 
| 112 | * A nsec2cyc multiplicator of 0 is invalid and we'd crash | 163 | * A nsec2cyc multiplicator of 0 is invalid and we'd crash | 
| 113 | * on it, so fix it up and emit a warning: | 164 | * on it, so fix it up and emit a warning: | 
| 114 | */ | 165 | */ | 
| 115 | if (mode == CLOCK_EVT_MODE_ONESHOT) { | 166 | if (state == CLOCK_EVT_STATE_ONESHOT) { | 
| 116 | if (unlikely(!dev->mult)) { | 167 | if (unlikely(!dev->mult)) { | 
| 117 | dev->mult = 1; | 168 | dev->mult = 1; | 
| 118 | WARN_ON(1); | 169 | WARN_ON(1); | 
| @@ -127,10 +178,28 @@ void clockevents_set_mode(struct clock_event_device *dev, | |||
| 127 | */ | 178 | */ | 
| 128 | void clockevents_shutdown(struct clock_event_device *dev) | 179 | void clockevents_shutdown(struct clock_event_device *dev) | 
| 129 | { | 180 | { | 
| 130 | clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); | 181 | clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN); | 
| 131 | dev->next_event.tv64 = KTIME_MAX; | 182 | dev->next_event.tv64 = KTIME_MAX; | 
| 132 | } | 183 | } | 
| 133 | 184 | ||
| 185 | /** | ||
| 186 | * clockevents_tick_resume - Resume the tick device before using it again | ||
| 187 | * @dev: device to resume | ||
| 188 | */ | ||
| 189 | int clockevents_tick_resume(struct clock_event_device *dev) | ||
| 190 | { | ||
| 191 | int ret = 0; | ||
| 192 | |||
| 193 | if (dev->set_mode) { | ||
| 194 | dev->set_mode(CLOCK_EVT_MODE_RESUME, dev); | ||
| 195 | dev->mode = CLOCK_EVT_MODE_RESUME; | ||
| 196 | } else if (dev->tick_resume) { | ||
| 197 | ret = dev->tick_resume(dev); | ||
| 198 | } | ||
| 199 | |||
| 200 | return ret; | ||
| 201 | } | ||
| 202 | |||
| 134 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST | 203 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST | 
| 135 | 204 | ||
| 136 | /* Limit min_delta to a jiffie */ | 205 | /* Limit min_delta to a jiffie */ | 
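
__clockevents_set_state() above either funnels the request through the legacy set_mode() callback or dispatches to one per-state callback. A reduced, standalone model of that dispatch; the struct and callbacks here only mimic the kernel's clock_event_device, they are not it:

#include <stdio.h>
#include <errno.h>

enum state { ST_DETACHED, ST_SHUTDOWN, ST_PERIODIC, ST_ONESHOT };

struct evtdev {
	enum state state;
	int (*set_state_shutdown)(struct evtdev *);
	int (*set_state_periodic)(struct evtdev *);
	int (*set_state_oneshot)(struct evtdev *);
};

static int set_state(struct evtdev *d, enum state s)
{
	switch (s) {
	case ST_DETACHED:	/* internal transition, no driver call */
		return 0;
	case ST_SHUTDOWN:
		return d->set_state_shutdown(d);
	case ST_PERIODIC:	/* only if the device supports the feature */
		return d->set_state_periodic ? d->set_state_periodic(d) : -ENOSYS;
	case ST_ONESHOT:
		return d->set_state_oneshot ? d->set_state_oneshot(d) : -ENOSYS;
	}
	return -ENOSYS;
}

static int shutdown_cb(struct evtdev *d) { (void)d; puts("-> shutdown"); return 0; }
static int oneshot_cb(struct evtdev *d)  { (void)d; puts("-> oneshot");  return 0; }

int main(void)
{
	struct evtdev d = {
		.state = ST_DETACHED,
		.set_state_shutdown = shutdown_cb,
		.set_state_oneshot  = oneshot_cb,	/* no periodic support */
	};

	if (!set_state(&d, ST_ONESHOT))
		d.state = ST_ONESHOT;
	printf("periodic request: %d\n", set_state(&d, ST_PERIODIC));
	if (!set_state(&d, ST_SHUTDOWN))
		d.state = ST_SHUTDOWN;
	return 0;
}

Unlike the old void set_mode(), each per-state callback can return an error, which is what lets clockevents_set_state() above bail out without updating dev->state.
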
| @@ -183,7 +252,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) | |||
| 183 | delta = dev->min_delta_ns; | 252 | delta = dev->min_delta_ns; | 
| 184 | dev->next_event = ktime_add_ns(ktime_get(), delta); | 253 | dev->next_event = ktime_add_ns(ktime_get(), delta); | 
| 185 | 254 | ||
| 186 | if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) | 255 | if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) | 
| 187 | return 0; | 256 | return 0; | 
| 188 | 257 | ||
| 189 | dev->retries++; | 258 | dev->retries++; | 
| @@ -220,7 +289,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) | |||
| 220 | delta = dev->min_delta_ns; | 289 | delta = dev->min_delta_ns; | 
| 221 | dev->next_event = ktime_add_ns(ktime_get(), delta); | 290 | dev->next_event = ktime_add_ns(ktime_get(), delta); | 
| 222 | 291 | ||
| 223 | if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) | 292 | if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) | 
| 224 | return 0; | 293 | return 0; | 
| 225 | 294 | ||
| 226 | dev->retries++; | 295 | dev->retries++; | 
| @@ -252,7 +321,7 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, | |||
| 252 | 321 | ||
| 253 | dev->next_event = expires; | 322 | dev->next_event = expires; | 
| 254 | 323 | ||
| 255 | if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) | 324 | if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) | 
| 256 | return 0; | 325 | return 0; | 
| 257 | 326 | ||
| 258 | /* Shortcut for clockevent devices that can deal with ktime. */ | 327 | /* Shortcut for clockevent devices that can deal with ktime. */ | 
| @@ -297,7 +366,7 @@ static int clockevents_replace(struct clock_event_device *ced) | |||
| 297 | struct clock_event_device *dev, *newdev = NULL; | 366 | struct clock_event_device *dev, *newdev = NULL; | 
| 298 | 367 | ||
| 299 | list_for_each_entry(dev, &clockevent_devices, list) { | 368 | list_for_each_entry(dev, &clockevent_devices, list) { | 
| 300 | if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED) | 369 | if (dev == ced || dev->state != CLOCK_EVT_STATE_DETACHED) | 
| 301 | continue; | 370 | continue; | 
| 302 | 371 | ||
| 303 | if (!tick_check_replacement(newdev, dev)) | 372 | if (!tick_check_replacement(newdev, dev)) | 
| @@ -323,7 +392,7 @@ static int clockevents_replace(struct clock_event_device *ced) | |||
| 323 | static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) | 392 | static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) | 
| 324 | { | 393 | { | 
| 325 | /* Fast track. Device is unused */ | 394 | /* Fast track. Device is unused */ | 
| 326 | if (ced->mode == CLOCK_EVT_MODE_UNUSED) { | 395 | if (ced->state == CLOCK_EVT_STATE_DETACHED) { | 
| 327 | list_del_init(&ced->list); | 396 | list_del_init(&ced->list); | 
| 328 | return 0; | 397 | return 0; | 
| 329 | } | 398 | } | 
| @@ -373,6 +442,37 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu) | |||
| 373 | } | 442 | } | 
| 374 | EXPORT_SYMBOL_GPL(clockevents_unbind); | 443 | EXPORT_SYMBOL_GPL(clockevents_unbind); | 
| 375 | 444 | ||
| 445 | /* Sanity check of state transition callbacks */ | ||
| 446 | static int clockevents_sanity_check(struct clock_event_device *dev) | ||
| 447 | { | ||
| 448 | /* Legacy set_mode() callback */ | ||
| 449 | if (dev->set_mode) { | ||
| 450 | /* We shouldn't be supporting new modes now */ | ||
| 451 | WARN_ON(dev->set_state_periodic || dev->set_state_oneshot || | ||
| 452 | dev->set_state_shutdown || dev->tick_resume); | ||
| 453 | |||
| 454 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); | ||
| 455 | return 0; | ||
| 456 | } | ||
| 457 | |||
| 458 | if (dev->features & CLOCK_EVT_FEAT_DUMMY) | ||
| 459 | return 0; | ||
| 460 | |||
| 461 | /* New state-specific callbacks */ | ||
| 462 | if (!dev->set_state_shutdown) | ||
| 463 | return -EINVAL; | ||
| 464 | |||
| 465 | if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && | ||
| 466 | !dev->set_state_periodic) | ||
| 467 | return -EINVAL; | ||
| 468 | |||
| 469 | if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) && | ||
| 470 | !dev->set_state_oneshot) | ||
| 471 | return -EINVAL; | ||
| 472 | |||
| 473 | return 0; | ||
| 474 | } | ||
| 475 | |||
| 376 | /** | 476 | /** | 
| 377 | * clockevents_register_device - register a clock event device | 477 | * clockevents_register_device - register a clock event device | 
| 378 | * @dev: device to register | 478 | * @dev: device to register | 
| @@ -381,7 +481,11 @@ void clockevents_register_device(struct clock_event_device *dev) | |||
| 381 | { | 481 | { | 
| 382 | unsigned long flags; | 482 | unsigned long flags; | 
| 383 | 483 | ||
| 384 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); | 484 | BUG_ON(clockevents_sanity_check(dev)); | 
| 485 | |||
| 486 | /* Initialize state to DETACHED */ | ||
| 487 | dev->state = CLOCK_EVT_STATE_DETACHED; | ||
| 488 | |||
| 385 | if (!dev->cpumask) { | 489 | if (!dev->cpumask) { | 
| 386 | WARN_ON(num_possible_cpus() > 1); | 490 | WARN_ON(num_possible_cpus() > 1); | 
| 387 | dev->cpumask = cpumask_of(smp_processor_id()); | 491 | dev->cpumask = cpumask_of(smp_processor_id()); | 
| @@ -445,11 +549,11 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) | |||
| 445 | { | 549 | { | 
| 446 | clockevents_config(dev, freq); | 550 | clockevents_config(dev, freq); | 
| 447 | 551 | ||
| 448 | if (dev->mode == CLOCK_EVT_MODE_ONESHOT) | 552 | if (dev->state == CLOCK_EVT_STATE_ONESHOT) | 
| 449 | return clockevents_program_event(dev, dev->next_event, false); | 553 | return clockevents_program_event(dev, dev->next_event, false); | 
| 450 | 554 | ||
| 451 | if (dev->mode == CLOCK_EVT_MODE_PERIODIC) | 555 | if (dev->state == CLOCK_EVT_STATE_PERIODIC) | 
| 452 | dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev); | 556 | return __clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC); | 
| 453 | 557 | ||
| 454 | return 0; | 558 | return 0; | 
| 455 | } | 559 | } | 
| @@ -491,30 +595,27 @@ void clockevents_handle_noop(struct clock_event_device *dev) | |||
| 491 | * @old: device to release (can be NULL) | 595 | * @old: device to release (can be NULL) | 
| 492 | * @new: device to request (can be NULL) | 596 | * @new: device to request (can be NULL) | 
| 493 | * | 597 | * | 
| 494 | * Called from the notifier chain. clockevents_lock is held already | 598 | * Called from various tick functions with clockevents_lock held and | 
| 599 | * interrupts disabled. | ||
| 495 | */ | 600 | */ | 
| 496 | void clockevents_exchange_device(struct clock_event_device *old, | 601 | void clockevents_exchange_device(struct clock_event_device *old, | 
| 497 | struct clock_event_device *new) | 602 | struct clock_event_device *new) | 
| 498 | { | 603 | { | 
| 499 | unsigned long flags; | ||
| 500 | |||
| 501 | local_irq_save(flags); | ||
| 502 | /* | 604 | /* | 
| 503 | * Caller releases a clock event device. We queue it into the | 605 | * Caller releases a clock event device. We queue it into the | 
| 504 | * released list and do a notify add later. | 606 | * released list and do a notify add later. | 
| 505 | */ | 607 | */ | 
| 506 | if (old) { | 608 | if (old) { | 
| 507 | module_put(old->owner); | 609 | module_put(old->owner); | 
| 508 | clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); | 610 | clockevents_set_state(old, CLOCK_EVT_STATE_DETACHED); | 
| 509 | list_del(&old->list); | 611 | list_del(&old->list); | 
| 510 | list_add(&old->list, &clockevents_released); | 612 | list_add(&old->list, &clockevents_released); | 
| 511 | } | 613 | } | 
| 512 | 614 | ||
| 513 | if (new) { | 615 | if (new) { | 
| 514 | BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); | 616 | BUG_ON(new->state != CLOCK_EVT_STATE_DETACHED); | 
| 515 | clockevents_shutdown(new); | 617 | clockevents_shutdown(new); | 
| 516 | } | 618 | } | 
| 517 | local_irq_restore(flags); | ||
| 518 | } | 619 | } | 
| 519 | 620 | ||
| 520 | /** | 621 | /** | 
| @@ -541,74 +642,40 @@ void clockevents_resume(void) | |||
| 541 | dev->resume(dev); | 642 | dev->resume(dev); | 
| 542 | } | 643 | } | 
| 543 | 644 | ||
| 544 | #ifdef CONFIG_GENERIC_CLOCKEVENTS | 645 | #ifdef CONFIG_HOTPLUG_CPU | 
| 545 | /** | 646 | /** | 
| 546 | * clockevents_notify - notification about relevant events | 647 | * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu | 
| 547 | * Returns 0 on success, any other value on error | ||
| 548 | */ | 648 | */ | 
| 549 | int clockevents_notify(unsigned long reason, void *arg) | 649 | void tick_cleanup_dead_cpu(int cpu) | 
| 550 | { | 650 | { | 
| 551 | struct clock_event_device *dev, *tmp; | 651 | struct clock_event_device *dev, *tmp; | 
| 552 | unsigned long flags; | 652 | unsigned long flags; | 
| 553 | int cpu, ret = 0; | ||
| 554 | 653 | ||
| 555 | raw_spin_lock_irqsave(&clockevents_lock, flags); | 654 | raw_spin_lock_irqsave(&clockevents_lock, flags); | 
| 556 | 655 | ||
| 557 | switch (reason) { | 656 | tick_shutdown_broadcast_oneshot(cpu); | 
| 558 | case CLOCK_EVT_NOTIFY_BROADCAST_ON: | 657 | tick_shutdown_broadcast(cpu); | 
| 559 | case CLOCK_EVT_NOTIFY_BROADCAST_OFF: | 658 | tick_shutdown(cpu); | 
| 560 | case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: | 659 | /* | 
| 561 | tick_broadcast_on_off(reason, arg); | 660 | * Unregister the clock event devices which were | 
| 562 | break; | 661 | * released from the users in the notify chain. | 
| 563 | 662 | */ | |
| 564 | case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: | 663 | list_for_each_entry_safe(dev, tmp, &clockevents_released, list) | 
| 565 | case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: | 664 | list_del(&dev->list); | 
| 566 | ret = tick_broadcast_oneshot_control(reason); | 665 | /* | 
| 567 | break; | 666 | * Now check whether the CPU has left unused per cpu devices | 
| 568 | 667 | */ | |
| 569 | case CLOCK_EVT_NOTIFY_CPU_DYING: | 668 | list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { | 
| 570 | tick_handover_do_timer(arg); | 669 | if (cpumask_test_cpu(cpu, dev->cpumask) && | 
| 571 | break; | 670 | cpumask_weight(dev->cpumask) == 1 && | 
| 572 | 671 | !tick_is_broadcast_device(dev)) { | |
| 573 | case CLOCK_EVT_NOTIFY_SUSPEND: | 672 | BUG_ON(dev->state != CLOCK_EVT_STATE_DETACHED); | 
| 574 | tick_suspend(); | ||
| 575 | tick_suspend_broadcast(); | ||
| 576 | break; | ||
| 577 | |||
| 578 | case CLOCK_EVT_NOTIFY_RESUME: | ||
| 579 | tick_resume(); | ||
| 580 | break; | ||
| 581 | |||
| 582 | case CLOCK_EVT_NOTIFY_CPU_DEAD: | ||
| 583 | tick_shutdown_broadcast_oneshot(arg); | ||
| 584 | tick_shutdown_broadcast(arg); | ||
| 585 | tick_shutdown(arg); | ||
| 586 | /* | ||
| 587 | * Unregister the clock event devices which were | ||
| 588 | * released from the users in the notify chain. | ||
| 589 | */ | ||
| 590 | list_for_each_entry_safe(dev, tmp, &clockevents_released, list) | ||
| 591 | list_del(&dev->list); | 673 | list_del(&dev->list); | 
| 592 | /* | ||
| 593 | * Now check whether the CPU has left unused per cpu devices | ||
| 594 | */ | ||
| 595 | cpu = *((int *)arg); | ||
| 596 | list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { | ||
| 597 | if (cpumask_test_cpu(cpu, dev->cpumask) && | ||
| 598 | cpumask_weight(dev->cpumask) == 1 && | ||
| 599 | !tick_is_broadcast_device(dev)) { | ||
| 600 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); | ||
| 601 | list_del(&dev->list); | ||
| 602 | } | ||
| 603 | } | 674 | } | 
| 604 | break; | ||
| 605 | default: | ||
| 606 | break; | ||
| 607 | } | 675 | } | 
| 608 | raw_spin_unlock_irqrestore(&clockevents_lock, flags); | 676 | raw_spin_unlock_irqrestore(&clockevents_lock, flags); | 
| 609 | return ret; | ||
| 610 | } | 677 | } | 
| 611 | EXPORT_SYMBOL_GPL(clockevents_notify); | 678 | #endif | 
| 612 | 679 | ||
| 613 | #ifdef CONFIG_SYSFS | 680 | #ifdef CONFIG_SYSFS | 
| 614 | struct bus_type clockevents_subsys = { | 681 | struct bus_type clockevents_subsys = { | 
| @@ -727,5 +794,3 @@ static int __init clockevents_init_sysfs(void) | |||
| 727 | } | 794 | } | 
| 728 | device_initcall(clockevents_init_sysfs); | 795 | device_initcall(clockevents_init_sysfs); | 
| 729 | #endif /* SYSFS */ | 796 | #endif /* SYSFS */ | 
| 730 | |||
| 731 | #endif /* GENERIC_CLOCK_EVENTS */ | ||
| diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 4892352f0e49..15facb1b9c60 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
| @@ -142,13 +142,6 @@ static void __clocksource_unstable(struct clocksource *cs) | |||
| 142 | schedule_work(&watchdog_work); | 142 | schedule_work(&watchdog_work); | 
| 143 | } | 143 | } | 
| 144 | 144 | ||
| 145 | static void clocksource_unstable(struct clocksource *cs, int64_t delta) | ||
| 146 | { | ||
| 147 | printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", | ||
| 148 | cs->name, delta); | ||
| 149 | __clocksource_unstable(cs); | ||
| 150 | } | ||
| 151 | |||
| 152 | /** | 145 | /** | 
| 153 | * clocksource_mark_unstable - mark clocksource unstable via watchdog | 146 | * clocksource_mark_unstable - mark clocksource unstable via watchdog | 
| 154 | * @cs: clocksource to be marked unstable | 147 | * @cs: clocksource to be marked unstable | 
| @@ -174,7 +167,7 @@ void clocksource_mark_unstable(struct clocksource *cs) | |||
| 174 | static void clocksource_watchdog(unsigned long data) | 167 | static void clocksource_watchdog(unsigned long data) | 
| 175 | { | 168 | { | 
| 176 | struct clocksource *cs; | 169 | struct clocksource *cs; | 
| 177 | cycle_t csnow, wdnow, delta; | 170 | cycle_t csnow, wdnow, cslast, wdlast, delta; | 
| 178 | int64_t wd_nsec, cs_nsec; | 171 | int64_t wd_nsec, cs_nsec; | 
| 179 | int next_cpu, reset_pending; | 172 | int next_cpu, reset_pending; | 
| 180 | 173 | ||
| @@ -213,6 +206,8 @@ static void clocksource_watchdog(unsigned long data) | |||
| 213 | 206 | ||
| 214 | delta = clocksource_delta(csnow, cs->cs_last, cs->mask); | 207 | delta = clocksource_delta(csnow, cs->cs_last, cs->mask); | 
| 215 | cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); | 208 | cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); | 
| 209 | wdlast = cs->wd_last; /* save these in case we print them */ | ||
| 210 | cslast = cs->cs_last; | ||
| 216 | cs->cs_last = csnow; | 211 | cs->cs_last = csnow; | 
| 217 | cs->wd_last = wdnow; | 212 | cs->wd_last = wdnow; | 
| 218 | 213 | ||
| @@ -221,7 +216,12 @@ static void clocksource_watchdog(unsigned long data) | |||
| 221 | 216 | ||
| 222 | /* Check the deviation from the watchdog clocksource. */ | 217 | /* Check the deviation from the watchdog clocksource. */ | 
| 223 | if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { | 218 | if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { | 
| 224 | clocksource_unstable(cs, cs_nsec - wd_nsec); | 219 | pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable, because the skew is too large:\n", cs->name); | 
| 220 | pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", | ||
| 221 | watchdog->name, wdnow, wdlast, watchdog->mask); | ||
| 222 | pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n", | ||
| 223 | cs->name, csnow, cslast, cs->mask); | ||
| 224 | __clocksource_unstable(cs); | ||
| 225 | continue; | 225 | continue; | 
| 226 | } | 226 | } | 
| 227 | 227 | ||
| @@ -469,26 +469,25 @@ static u32 clocksource_max_adjustment(struct clocksource *cs) | |||
| 469 | * @shift: cycle to nanosecond divisor (power of two) | 469 | * @shift: cycle to nanosecond divisor (power of two) | 
| 470 | * @maxadj: maximum adjustment value to mult (~11%) | 470 | * @maxadj: maximum adjustment value to mult (~11%) | 
| 471 | * @mask: bitmask for two's complement subtraction of non 64 bit counters | 471 | * @mask: bitmask for two's complement subtraction of non 64 bit counters | 
| 472 | * @max_cyc: maximum cycle value before potential overflow (does not include | ||
| 473 | * any safety margin) | ||
| 474 | * | ||
| 475 | * NOTE: This function includes a safety margin of 50%, in other words, we | ||
| 476 | * return half the number of nanoseconds the hardware counter can technically | ||
| 477 | * cover. This is done so that we can potentially detect problems caused by | ||
| 478 | * delayed timers or bad hardware, which might result in time intervals that | ||
| 479 | * are larger than what the math used can handle without overflows. | ||
| 472 | */ | 480 | */ | 
| 473 | u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) | 481 | u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc) | 
| 474 | { | 482 | { | 
| 475 | u64 max_nsecs, max_cycles; | 483 | u64 max_nsecs, max_cycles; | 
| 476 | 484 | ||
| 477 | /* | 485 | /* | 
| 478 | * Calculate the maximum number of cycles that we can pass to the | 486 | * Calculate the maximum number of cycles that we can pass to the | 
| 479 | * cyc2ns function without overflowing a 64-bit signed result. The | 487 | * cyc2ns() function without overflowing a 64-bit result. | 
| 480 | * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj) | ||
| 481 | * which is equivalent to the below. | ||
| 482 | * max_cycles < (2^63)/(mult + maxadj) | ||
| 483 | * max_cycles < 2^(log2((2^63)/(mult + maxadj))) | ||
| 484 | * max_cycles < 2^(log2(2^63) - log2(mult + maxadj)) | ||
| 485 | * max_cycles < 2^(63 - log2(mult + maxadj)) | ||
| 486 | * max_cycles < 1 << (63 - log2(mult + maxadj)) | ||
| 487 | * Please note that we add 1 to the result of the log2 to account for | ||
| 488 | * any rounding errors, ensure the above inequality is satisfied and | ||
| 489 | * no overflow will occur. | ||
| 490 | */ | 488 | */ | 
| 491 | max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1)); | 489 | max_cycles = ULLONG_MAX; | 
| 490 | do_div(max_cycles, mult+maxadj); | ||
| 492 | 491 | ||
| 493 | /* | 492 | /* | 
| 494 | * The actual maximum number of cycles we can defer the clocksource is | 493 | * The actual maximum number of cycles we can defer the clocksource is | 
| @@ -499,27 +498,26 @@ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) | |||
| 499 | max_cycles = min(max_cycles, mask); | 498 | max_cycles = min(max_cycles, mask); | 
| 500 | max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); | 499 | max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); | 
| 501 | 500 | ||
| 501 | /* return the max_cycles value as well if requested */ | ||
| 502 | if (max_cyc) | ||
| 503 | *max_cyc = max_cycles; | ||
| 504 | |||
| 505 | /* Return 50% of the actual maximum, so we can detect bad values */ | ||
| 506 | max_nsecs >>= 1; | ||
| 507 | |||
| 502 | return max_nsecs; | 508 | return max_nsecs; | 
| 503 | } | 509 | } | 
| 504 | 510 | ||
| 505 | /** | 511 | /** | 
| 506 | * clocksource_max_deferment - Returns max time the clocksource can be deferred | 512 | * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles | 
| 507 | * @cs: Pointer to clocksource | 513 | * @cs: Pointer to clocksource to be updated | 
| 508 | * | 514 | * | 
| 509 | */ | 515 | */ | 
| 510 | static u64 clocksource_max_deferment(struct clocksource *cs) | 516 | static inline void clocksource_update_max_deferment(struct clocksource *cs) | 
| 511 | { | 517 | { | 
| 512 | u64 max_nsecs; | 518 | cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift, | 
| 513 | 519 | cs->maxadj, cs->mask, | |
| 514 | max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj, | 520 | &cs->max_cycles); | 
| 515 | cs->mask); | ||
| 516 | /* | ||
| 517 | * To ensure that the clocksource does not wrap whilst we are idle, | ||
| 518 | * limit the time the clocksource can be deferred by 12.5%. Please | ||
| 519 | * note a margin of 12.5% is used because this can be computed with | ||
| 520 | * a shift, versus say 10% which would require division. | ||
| 521 | */ | ||
| 522 | return max_nsecs - (max_nsecs >> 3); | ||
| 523 | } | 521 | } | 
| 524 | 522 | ||
| 525 | #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET | 523 | #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET | 
| @@ -648,7 +646,7 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
| 648 | } | 646 | } | 
| 649 | 647 | ||
| 650 | /** | 648 | /** | 
| 651 | * __clocksource_updatefreq_scale - Used update clocksource with new freq | 649 | * __clocksource_update_freq_scale - Used update clocksource with new freq | 
| 652 | * @cs: clocksource to be registered | 650 | * @cs: clocksource to be registered | 
| 653 | * @scale: Scale factor multiplied against freq to get clocksource hz | 651 | * @scale: Scale factor multiplied against freq to get clocksource hz | 
| 654 | * @freq: clocksource frequency (cycles per second) divided by scale | 652 | * @freq: clocksource frequency (cycles per second) divided by scale | 
| @@ -656,48 +654,64 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
| 656 | * This should only be called from the clocksource->enable() method. | 654 | * This should only be called from the clocksource->enable() method. | 
| 657 | * | 655 | * | 
| 658 | * This *SHOULD NOT* be called directly! Please use the | 656 | * This *SHOULD NOT* be called directly! Please use the | 
| 659 | * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions. | 657 | * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper | 
| 658 | * functions. | ||
| 660 | */ | 659 | */ | 
| 661 | void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) | 660 | void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq) | 
| 662 | { | 661 | { | 
| 663 | u64 sec; | 662 | u64 sec; | 
| 663 | |||
| 664 | /* | 664 | /* | 
| 665 | * Calc the maximum number of seconds which we can run before | 665 | * Default clocksources are *special* and self-define their mult/shift. | 
| 666 | * wrapping around. For clocksources which have a mask > 32bit | 666 | * But, you're not special, so you should specify a freq value. | 
| 667 | * we need to limit the max sleep time to have a good | ||
| 668 | * conversion precision. 10 minutes is still a reasonable | ||
| 669 | * amount. That results in a shift value of 24 for a | ||
| 670 | * clocksource with mask >= 40bit and f >= 4GHz. That maps to | ||
| 671 | * ~ 0.06ppm granularity for NTP. We apply the same 12.5% | ||
| 672 | * margin as we do in clocksource_max_deferment() | ||
| 673 | */ | 667 | */ | 
| 674 | sec = (cs->mask - (cs->mask >> 3)); | 668 | if (freq) { | 
| 675 | do_div(sec, freq); | 669 | /* | 
| 676 | do_div(sec, scale); | 670 | * Calc the maximum number of seconds which we can run before | 
| 677 | if (!sec) | 671 | * wrapping around. For clocksources which have a mask > 32-bit | 
| 678 | sec = 1; | 672 | * we need to limit the max sleep time to have a good | 
| 679 | else if (sec > 600 && cs->mask > UINT_MAX) | 673 | * conversion precision. 10 minutes is still a reasonable | 
| 680 | sec = 600; | 674 | * amount. That results in a shift value of 24 for a | 
| 681 | 675 | * clocksource with mask >= 40-bit and f >= 4GHz. That maps to | |
| 682 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, | 676 | * ~ 0.06ppm granularity for NTP. | 
| 683 | NSEC_PER_SEC / scale, sec * scale); | 677 | */ | 
| 684 | 678 | sec = cs->mask; | |
| 679 | do_div(sec, freq); | ||
| 680 | do_div(sec, scale); | ||
| 681 | if (!sec) | ||
| 682 | sec = 1; | ||
| 683 | else if (sec > 600 && cs->mask > UINT_MAX) | ||
| 684 | sec = 600; | ||
| 685 | |||
| 686 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, | ||
| 687 | NSEC_PER_SEC / scale, sec * scale); | ||
| 688 | } | ||
| 685 | /* | 689 | /* | 
| 686 | * for clocksources that have large mults, to avoid overflow. | 690 | * Ensure clocksources that have large 'mult' values don't overflow | 
| 687 | * Since mult may be adjusted by ntp, add an safety extra margin | 691 | * when adjusted. | 
| 688 | * | ||
| 689 | */ | 692 | */ | 
| 690 | cs->maxadj = clocksource_max_adjustment(cs); | 693 | cs->maxadj = clocksource_max_adjustment(cs); | 
| 691 | while ((cs->mult + cs->maxadj < cs->mult) | 694 | while (freq && ((cs->mult + cs->maxadj < cs->mult) | 
| 692 | || (cs->mult - cs->maxadj > cs->mult)) { | 695 | || (cs->mult - cs->maxadj > cs->mult))) { | 
| 693 | cs->mult >>= 1; | 696 | cs->mult >>= 1; | 
| 694 | cs->shift--; | 697 | cs->shift--; | 
| 695 | cs->maxadj = clocksource_max_adjustment(cs); | 698 | cs->maxadj = clocksource_max_adjustment(cs); | 
| 696 | } | 699 | } | 
| 697 | 700 | ||
| 698 | cs->max_idle_ns = clocksource_max_deferment(cs); | 701 | /* | 
| 702 | * Only warn for *special* clocksources that self-define | ||
| 703 | * their mult/shift values and don't specify a freq. | ||
| 704 | */ | ||
| 705 | WARN_ONCE(cs->mult + cs->maxadj < cs->mult, | ||
| 706 | "timekeeping: Clocksource %s might overflow on 11%% adjustment\n", | ||
| 707 | cs->name); | ||
| 708 | |||
| 709 | clocksource_update_max_deferment(cs); | ||
| 710 | |||
| 711 | pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n", | ||
| 712 | cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns); | ||
| 699 | } | 713 | } | 
| 700 | EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); | 714 | EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale); | 
| 701 | 715 | ||
| 702 | /** | 716 | /** | 
| 703 | * __clocksource_register_scale - Used to install new clocksources | 717 | * __clocksource_register_scale - Used to install new clocksources | 
| @@ -714,7 +728,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) | |||
| 714 | { | 728 | { | 
| 715 | 729 | ||
| 716 | /* Initialize mult/shift and max_idle_ns */ | 730 | /* Initialize mult/shift and max_idle_ns */ | 
| 717 | __clocksource_updatefreq_scale(cs, scale, freq); | 731 | __clocksource_update_freq_scale(cs, scale, freq); | 
| 718 | 732 | ||
| 719 | /* Add clocksource to the clocksource list */ | 733 | /* Add clocksource to the clocksource list */ | 
| 720 | mutex_lock(&clocksource_mutex); | 734 | mutex_lock(&clocksource_mutex); | 
| @@ -726,33 +740,6 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) | |||
| 726 | } | 740 | } | 
| 727 | EXPORT_SYMBOL_GPL(__clocksource_register_scale); | 741 | EXPORT_SYMBOL_GPL(__clocksource_register_scale); | 
| 728 | 742 | ||
| 729 | |||
| 730 | /** | ||
| 731 | * clocksource_register - Used to install new clocksources | ||
| 732 | * @cs: clocksource to be registered | ||
| 733 | * | ||
| 734 | * Returns -EBUSY if registration fails, zero otherwise. | ||
| 735 | */ | ||
| 736 | int clocksource_register(struct clocksource *cs) | ||
| 737 | { | ||
| 738 | /* calculate max adjustment for given mult/shift */ | ||
| 739 | cs->maxadj = clocksource_max_adjustment(cs); | ||
| 740 | WARN_ONCE(cs->mult + cs->maxadj < cs->mult, | ||
| 741 | "Clocksource %s might overflow on 11%% adjustment\n", | ||
| 742 | cs->name); | ||
| 743 | |||
| 744 | /* calculate max idle time permitted for this clocksource */ | ||
| 745 | cs->max_idle_ns = clocksource_max_deferment(cs); | ||
| 746 | |||
| 747 | mutex_lock(&clocksource_mutex); | ||
| 748 | clocksource_enqueue(cs); | ||
| 749 | clocksource_enqueue_watchdog(cs); | ||
| 750 | clocksource_select(); | ||
| 751 | mutex_unlock(&clocksource_mutex); | ||
| 752 | return 0; | ||
| 753 | } | ||
| 754 | EXPORT_SYMBOL(clocksource_register); | ||
| 755 | |||
| 756 | static void __clocksource_change_rating(struct clocksource *cs, int rating) | 743 | static void __clocksource_change_rating(struct clocksource *cs, int rating) | 
| 757 | { | 744 | { | 
| 758 | list_del(&cs->list); | 745 | list_del(&cs->list); | 
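
The reworked clocks_calc_max_nsecs() above replaces the conservative power-of-two bound with an exact division, reports max_cycles back to the caller, and keeps a 50% safety margin. Its arithmetic, reduced to a standalone sketch with made-up mult/shift/mask values:

#include <stdio.h>
#include <stdint.h>

static uint64_t cyc2ns(uint64_t cyc, uint32_t mult, uint32_t shift)
{
	return (cyc * mult) >> shift;
}

int main(void)
{
	/* Example values: roughly a 1 GHz, 32-bit counter. */
	uint32_t mult = 4194304, shift = 22;
	uint32_t maxadj = mult / 9;			/* ~11% adjustment headroom */
	uint64_t mask = 0xffffffffULL;

	uint64_t max_cycles = UINT64_MAX / (mult + maxadj);	/* exact bound */
	if (max_cycles > mask)
		max_cycles = mask;		/* counter width limits us first */

	uint64_t max_nsecs = cyc2ns(max_cycles, mult - maxadj, shift);
	printf("max_cycles: %llu, max_idle_ns with 50%% margin: %llu\n",
	       (unsigned long long)max_cycles,
	       (unsigned long long)(max_nsecs >> 1));
	return 0;
}
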
| diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index bee0c1f78091..76d4bd962b19 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c | |||
| @@ -54,7 +54,7 @@ | |||
| 54 | 54 | ||
| 55 | #include <trace/events/timer.h> | 55 | #include <trace/events/timer.h> | 
| 56 | 56 | ||
| 57 | #include "timekeeping.h" | 57 | #include "tick-internal.h" | 
| 58 | 58 | ||
| 59 | /* | 59 | /* | 
| 60 | * The timer bases: | 60 | * The timer bases: | 
| @@ -1707,17 +1707,10 @@ static int hrtimer_cpu_notify(struct notifier_block *self, | |||
| 1707 | break; | 1707 | break; | 
| 1708 | 1708 | ||
| 1709 | #ifdef CONFIG_HOTPLUG_CPU | 1709 | #ifdef CONFIG_HOTPLUG_CPU | 
| 1710 | case CPU_DYING: | ||
| 1711 | case CPU_DYING_FROZEN: | ||
| 1712 | clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu); | ||
| 1713 | break; | ||
| 1714 | case CPU_DEAD: | 1710 | case CPU_DEAD: | 
| 1715 | case CPU_DEAD_FROZEN: | 1711 | case CPU_DEAD_FROZEN: | 
| 1716 | { | ||
| 1717 | clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu); | ||
| 1718 | migrate_hrtimers(scpu); | 1712 | migrate_hrtimers(scpu); | 
| 1719 | break; | 1713 | break; | 
| 1720 | } | ||
| 1721 | #endif | 1714 | #endif | 
| 1722 | 1715 | ||
| 1723 | default: | 1716 | default: | 
| diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a6a5bf53e86d..347fecf86a3f 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c | |||
| @@ -25,7 +25,7 @@ | |||
| 25 | #include <linux/module.h> | 25 | #include <linux/module.h> | 
| 26 | #include <linux/init.h> | 26 | #include <linux/init.h> | 
| 27 | 27 | ||
| 28 | #include "tick-internal.h" | 28 | #include "timekeeping.h" | 
| 29 | 29 | ||
| 30 | /* The Jiffies based clocksource is the lowest common | 30 | /* The Jiffies based clocksource is the lowest common | 
| 31 | * denominator clock source which should function on | 31 | * denominator clock source which should function on | 
| @@ -71,6 +71,7 @@ static struct clocksource clocksource_jiffies = { | |||
| 71 | .mask = 0xffffffff, /*32bits*/ | 71 | .mask = 0xffffffff, /*32bits*/ | 
| 72 | .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ | 72 | .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ | 
| 73 | .shift = JIFFIES_SHIFT, | 73 | .shift = JIFFIES_SHIFT, | 
| 74 | .max_cycles = 10, | ||
| 74 | }; | 75 | }; | 
| 75 | 76 | ||
| 76 | __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); | 77 | __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); | 
| @@ -94,7 +95,7 @@ EXPORT_SYMBOL(jiffies); | |||
| 94 | 95 | ||
| 95 | static int __init init_jiffies_clocksource(void) | 96 | static int __init init_jiffies_clocksource(void) | 
| 96 | { | 97 | { | 
| 97 | return clocksource_register(&clocksource_jiffies); | 98 | return __clocksource_register(&clocksource_jiffies); | 
| 98 | } | 99 | } | 
| 99 | 100 | ||
| 100 | core_initcall(init_jiffies_clocksource); | 101 | core_initcall(init_jiffies_clocksource); | 
| @@ -130,6 +131,6 @@ int register_refined_jiffies(long cycles_per_second) | |||
| 130 | 131 | ||
| 131 | refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; | 132 | refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; | 
| 132 | 133 | ||
| 133 | clocksource_register(&refined_jiffies); | 134 | __clocksource_register(&refined_jiffies); | 
| 134 | return 0; | 135 | return 0; | 
| 135 | } | 136 | } | 
| diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 0f60b08a4f07..7a681003001c 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
| @@ -17,7 +17,6 @@ | |||
| 17 | #include <linux/module.h> | 17 | #include <linux/module.h> | 
| 18 | #include <linux/rtc.h> | 18 | #include <linux/rtc.h> | 
| 19 | 19 | ||
| 20 | #include "tick-internal.h" | ||
| 21 | #include "ntp_internal.h" | 20 | #include "ntp_internal.h" | 
| 22 | 21 | ||
| 23 | /* | 22 | /* | 
| @@ -459,6 +458,16 @@ out: | |||
| 459 | return leap; | 458 | return leap; | 
| 460 | } | 459 | } | 
| 461 | 460 | ||
| 461 | #ifdef CONFIG_GENERIC_CMOS_UPDATE | ||
| 462 | int __weak update_persistent_clock64(struct timespec64 now64) | ||
| 463 | { | ||
| 464 | struct timespec now; | ||
| 465 | |||
| 466 | now = timespec64_to_timespec(now64); | ||
| 467 | return update_persistent_clock(now); | ||
| 468 | } | ||
| 469 | #endif | ||
| 470 | |||
| 462 | #if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) | 471 | #if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) | 
| 463 | static void sync_cmos_clock(struct work_struct *work); | 472 | static void sync_cmos_clock(struct work_struct *work); | 
| 464 | 473 | ||
| @@ -494,8 +503,9 @@ static void sync_cmos_clock(struct work_struct *work) | |||
| 494 | if (persistent_clock_is_local) | 503 | if (persistent_clock_is_local) | 
| 495 | adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); | 504 | adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); | 
| 496 | #ifdef CONFIG_GENERIC_CMOS_UPDATE | 505 | #ifdef CONFIG_GENERIC_CMOS_UPDATE | 
| 497 | fail = update_persistent_clock(timespec64_to_timespec(adjust)); | 506 | fail = update_persistent_clock64(adjust); | 
| 498 | #endif | 507 | #endif | 
| 508 | |||
| 499 | #ifdef CONFIG_RTC_SYSTOHC | 509 | #ifdef CONFIG_RTC_SYSTOHC | 
| 500 | if (fail == -ENODEV) | 510 | if (fail == -ENODEV) | 
| 501 | fail = rtc_set_ntp_time(adjust); | 511 | fail = rtc_set_ntp_time(adjust); | 
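
The weak update_persistent_clock64() above bridges the new 64-bit path to architectures that still only provide update_persistent_clock(). The pattern, in a self-contained sketch with stand-in types rather than the kernel's timespec/timespec64:

#include <stdio.h>
#include <stdint.h>

struct ts32 { int32_t tv_sec; long tv_nsec; };
struct ts64 { int64_t tv_sec; long tv_nsec; };

static int update_persistent_clock(struct ts32 now)
{
	printf("legacy RTC hook: sec=%ld\n", (long)now.tv_sec);
	return 0;
}

/* Weak default: an arch with a native 64-bit RTC path supplies a strong
 * definition and this narrowing shim is never used. */
__attribute__((weak)) int update_persistent_clock64(struct ts64 now64)
{
	struct ts32 now = { (int32_t)now64.tv_sec, now64.tv_nsec };
	return update_persistent_clock(now);
}

int main(void)
{
	struct ts64 t = { 1430000000LL, 0 };
	return update_persistent_clock64(t);
}
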
| diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 01d2d15aa662..a26036d37a38 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c | |||
| @@ -1,5 +1,6 @@ | |||
| 1 | /* | 1 | /* | 
| 2 | * sched_clock.c: support for extending counters to full 64-bit ns counter | 2 | * sched_clock.c: Generic sched_clock() support, to extend low level | 
| 3 | * hardware time counters to full 64-bit ns values. | ||
| 3 | * | 4 | * | 
| 4 | * This program is free software; you can redistribute it and/or modify | 5 | * This program is free software; you can redistribute it and/or modify | 
| 5 | * it under the terms of the GNU General Public License version 2 as | 6 | * it under the terms of the GNU General Public License version 2 as | 
| @@ -18,15 +19,53 @@ | |||
| 18 | #include <linux/seqlock.h> | 19 | #include <linux/seqlock.h> | 
| 19 | #include <linux/bitops.h> | 20 | #include <linux/bitops.h> | 
| 20 | 21 | ||
| 21 | struct clock_data { | 22 | /** | 
| 22 | ktime_t wrap_kt; | 23 | * struct clock_read_data - data required to read from sched_clock() | 
| 24 | * | ||
| 25 | * @epoch_ns: sched_clock() value at last update | ||
| 26 | * @epoch_cyc: Clock cycle value at last update. | ||
| 27 | * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit | ||
| 28 | * clocks. | ||
| 29 | * @read_sched_clock: Current clock source (or dummy source when suspended). | ||
| 30 | * @mult: Multiplier for scaled math conversion. | ||
| 31 | * @shift: Shift value for scaled math conversion. | ||
| 32 | * | ||
| 33 | * Care must be taken when updating this structure; it is read by | ||
| 34 | * some very hot code paths. It occupies <=40 bytes and, when combined | ||
| 35 | * with the seqcount used to synchronize access, comfortably fits into | ||
| 36 | * a 64 byte cache line. | ||
| 37 | */ | ||
| 38 | struct clock_read_data { | ||
| 23 | u64 epoch_ns; | 39 | u64 epoch_ns; | 
| 24 | u64 epoch_cyc; | 40 | u64 epoch_cyc; | 
| 25 | seqcount_t seq; | 41 | u64 sched_clock_mask; | 
| 26 | unsigned long rate; | 42 | u64 (*read_sched_clock)(void); | 
| 27 | u32 mult; | 43 | u32 mult; | 
| 28 | u32 shift; | 44 | u32 shift; | 
| 29 | bool suspended; | 45 | }; | 
| 46 | |||
| 47 | /** | ||
| 48 | * struct clock_data - all data needed for sched_clock() (including | ||
| 49 | * registration of a new clock source) | ||
| 50 | * | ||
| 51 | * @seq: Sequence counter for protecting updates. The lowest | ||
| 52 | * bit is the index for @read_data. | ||
| 53 | * @read_data: Data required to read from sched_clock. | ||
| 54 | * @wrap_kt: Duration for which clock can run before wrapping. | ||
| 55 | * @rate: Tick rate of the registered clock. | ||
| 56 | * @actual_read_sched_clock: Registered hardware level clock read function. | ||
| 57 | * | ||
| 58 | * The ordering of this structure has been chosen to optimize cache | ||
| 59 | * performance. In particular 'seq' and 'read_data[0]' (combined) should fit | ||
| 60 | * into a single 64-byte cache line. | ||
| 61 | */ | ||
| 62 | struct clock_data { | ||
| 63 | seqcount_t seq; | ||
| 64 | struct clock_read_data read_data[2]; | ||
| 65 | ktime_t wrap_kt; | ||
| 66 | unsigned long rate; | ||
| 67 | |||
| 68 | u64 (*actual_read_sched_clock)(void); | ||
| 30 | }; | 69 | }; | 
| 31 | 70 | ||
| 32 | static struct hrtimer sched_clock_timer; | 71 | static struct hrtimer sched_clock_timer; | 
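The comments above budget clock_read_data at 40 bytes or less so that, together with the sequence counter, the hot-path data stays inside one 64-byte cache line. A compile-time check of that budget on a simplified userspace mirror of the layout (kernel types replaced by fixed-width equivalents, seqcount_t approximated by an unsigned int) could look like:

#include <stdint.h>

struct clock_read_data_model {
    uint64_t epoch_ns;
    uint64_t epoch_cyc;
    uint64_t sched_clock_mask;
    uint64_t (*read_sched_clock)(void);
    uint32_t mult;
    uint32_t shift;
};

/* 3 x u64 + one pointer + 2 x u32 = 40 bytes on LP64 targets */
_Static_assert(sizeof(struct clock_read_data_model) <= 40,
               "read data exceeds the documented 40-byte budget");
_Static_assert(sizeof(unsigned int) + sizeof(struct clock_read_data_model) <= 64,
               "seq plus read_data[0] no longer fit one cache line");

int main(void) { return 0; }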
| @@ -34,12 +73,6 @@ static int irqtime = -1; | |||
| 34 | 73 | ||
| 35 | core_param(irqtime, irqtime, int, 0400); | 74 | core_param(irqtime, irqtime, int, 0400); | 
| 36 | 75 | ||
| 37 | static struct clock_data cd = { | ||
| 38 | .mult = NSEC_PER_SEC / HZ, | ||
| 39 | }; | ||
| 40 | |||
| 41 | static u64 __read_mostly sched_clock_mask; | ||
| 42 | |||
| 43 | static u64 notrace jiffy_sched_clock_read(void) | 76 | static u64 notrace jiffy_sched_clock_read(void) | 
| 44 | { | 77 | { | 
| 45 | /* | 78 | /* | 
| @@ -49,7 +82,11 @@ static u64 notrace jiffy_sched_clock_read(void) | |||
| 49 | return (u64)(jiffies - INITIAL_JIFFIES); | 82 | return (u64)(jiffies - INITIAL_JIFFIES); | 
| 50 | } | 83 | } | 
| 51 | 84 | ||
| 52 | static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; | 85 | static struct clock_data cd ____cacheline_aligned = { | 
| 86 | .read_data[0] = { .mult = NSEC_PER_SEC / HZ, | ||
| 87 | .read_sched_clock = jiffy_sched_clock_read, }, | ||
| 88 | .actual_read_sched_clock = jiffy_sched_clock_read, | ||
| 89 | }; | ||
| 53 | 90 | ||
| 54 | static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) | 91 | static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) | 
| 55 | { | 92 | { | 
| @@ -58,111 +95,136 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) | |||
| 58 | 95 | ||
| 59 | unsigned long long notrace sched_clock(void) | 96 | unsigned long long notrace sched_clock(void) | 
| 60 | { | 97 | { | 
| 61 | u64 epoch_ns; | 98 | u64 cyc, res; | 
| 62 | u64 epoch_cyc; | ||
| 63 | u64 cyc; | ||
| 64 | unsigned long seq; | 99 | unsigned long seq; | 
| 65 | 100 | struct clock_read_data *rd; | |
| 66 | if (cd.suspended) | ||
| 67 | return cd.epoch_ns; | ||
| 68 | 101 | ||
| 69 | do { | 102 | do { | 
| 70 | seq = raw_read_seqcount_begin(&cd.seq); | 103 | seq = raw_read_seqcount(&cd.seq); | 
| 71 | epoch_cyc = cd.epoch_cyc; | 104 | rd = cd.read_data + (seq & 1); | 
| 72 | epoch_ns = cd.epoch_ns; | 105 | |
| 106 | cyc = (rd->read_sched_clock() - rd->epoch_cyc) & | ||
| 107 | rd->sched_clock_mask; | ||
| 108 | res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift); | ||
| 73 | } while (read_seqcount_retry(&cd.seq, seq)); | 109 | } while (read_seqcount_retry(&cd.seq, seq)); | 
| 74 | 110 | ||
| 75 | cyc = read_sched_clock(); | 111 | return res; | 
| 76 | cyc = (cyc - epoch_cyc) & sched_clock_mask; | 112 | } | 
| 77 | return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift); | 113 | |
| 114 | /* | ||
| 115 | * Updating the data required to read the clock. | ||
| 116 | * | ||
| 117 | * sched_clock() will never observe mis-matched data even if called from | ||
| 118 | * an NMI. We do this by maintaining an odd/even copy of the data and | ||
| 119 | * steering sched_clock() to one or the other using a sequence counter. | ||
| 120 | * In order to preserve the data cache profile of sched_clock() as much | ||
| 121 | * as possible the system reverts back to the even copy when the update | ||
| 122 | * completes; the odd copy is used *only* during an update. | ||
| 123 | */ | ||
| 124 | static void update_clock_read_data(struct clock_read_data *rd) | ||
| 125 | { | ||
| 126 | /* update the backup (odd) copy with the new data */ | ||
| 127 | cd.read_data[1] = *rd; | ||
| 128 | |||
| 129 | /* steer readers towards the odd copy */ | ||
| 130 | raw_write_seqcount_latch(&cd.seq); | ||
| 131 | |||
| 132 | /* now it's safe for us to update the normal (even) copy */ | ||
| 133 | cd.read_data[0] = *rd; | ||
| 134 | |||
| 135 | /* switch readers back to the even copy */ | ||
| 136 | raw_write_seqcount_latch(&cd.seq); | ||
| 78 | } | 137 | } | 
| 79 | 138 | ||
| 80 | /* | 139 | /* | 
| 81 | * Atomically update the sched_clock epoch. | 140 | * Atomically update the sched_clock() epoch. | 
| 82 | */ | 141 | */ | 
| 83 | static void notrace update_sched_clock(void) | 142 | static void update_sched_clock(void) | 
| 84 | { | 143 | { | 
| 85 | unsigned long flags; | ||
| 86 | u64 cyc; | 144 | u64 cyc; | 
| 87 | u64 ns; | 145 | u64 ns; | 
| 146 | struct clock_read_data rd; | ||
| 147 | |||
| 148 | rd = cd.read_data[0]; | ||
| 149 | |||
| 150 | cyc = cd.actual_read_sched_clock(); | ||
| 151 | ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift); | ||
| 152 | |||
| 153 | rd.epoch_ns = ns; | ||
| 154 | rd.epoch_cyc = cyc; | ||
| 88 | 155 | ||
| 89 | cyc = read_sched_clock(); | 156 | update_clock_read_data(&rd); | 
| 90 | ns = cd.epoch_ns + | ||
| 91 | cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, | ||
| 92 | cd.mult, cd.shift); | ||
| 93 | |||
| 94 | raw_local_irq_save(flags); | ||
| 95 | raw_write_seqcount_begin(&cd.seq); | ||
| 96 | cd.epoch_ns = ns; | ||
| 97 | cd.epoch_cyc = cyc; | ||
| 98 | raw_write_seqcount_end(&cd.seq); | ||
| 99 | raw_local_irq_restore(flags); | ||
| 100 | } | 157 | } | 
| 101 | 158 | ||
| 102 | static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) | 159 | static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) | 
| 103 | { | 160 | { | 
| 104 | update_sched_clock(); | 161 | update_sched_clock(); | 
| 105 | hrtimer_forward_now(hrt, cd.wrap_kt); | 162 | hrtimer_forward_now(hrt, cd.wrap_kt); | 
| 163 | |||
| 106 | return HRTIMER_RESTART; | 164 | return HRTIMER_RESTART; | 
| 107 | } | 165 | } | 
| 108 | 166 | ||
| 109 | void __init sched_clock_register(u64 (*read)(void), int bits, | 167 | void __init | 
| 110 | unsigned long rate) | 168 | sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) | 
| 111 | { | 169 | { | 
| 112 | u64 res, wrap, new_mask, new_epoch, cyc, ns; | 170 | u64 res, wrap, new_mask, new_epoch, cyc, ns; | 
| 113 | u32 new_mult, new_shift; | 171 | u32 new_mult, new_shift; | 
| 114 | ktime_t new_wrap_kt; | ||
| 115 | unsigned long r; | 172 | unsigned long r; | 
| 116 | char r_unit; | 173 | char r_unit; | 
| 174 | struct clock_read_data rd; | ||
| 117 | 175 | ||
| 118 | if (cd.rate > rate) | 176 | if (cd.rate > rate) | 
| 119 | return; | 177 | return; | 
| 120 | 178 | ||
| 121 | WARN_ON(!irqs_disabled()); | 179 | WARN_ON(!irqs_disabled()); | 
| 122 | 180 | ||
| 123 | /* calculate the mult/shift to convert counter ticks to ns. */ | 181 | /* Calculate the mult/shift to convert counter ticks to ns. */ | 
| 124 | clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); | 182 | clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); | 
| 125 | 183 | ||
| 126 | new_mask = CLOCKSOURCE_MASK(bits); | 184 | new_mask = CLOCKSOURCE_MASK(bits); | 
| 185 | cd.rate = rate; | ||
| 186 | |||
| 187 | /* Calculate how many nanosecs until we risk wrapping */ | ||
| 188 | wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL); | ||
| 189 | cd.wrap_kt = ns_to_ktime(wrap); | ||
| 127 | 190 | ||
| 128 | /* calculate how many ns until we wrap */ | 191 | rd = cd.read_data[0]; | 
| 129 | wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); | ||
| 130 | new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); | ||
| 131 | 192 | ||
| 132 | /* update epoch for new counter and update epoch_ns from old counter*/ | 193 | /* Update epoch for new counter and update 'epoch_ns' from old counter*/ | 
| 133 | new_epoch = read(); | 194 | new_epoch = read(); | 
| 134 | cyc = read_sched_clock(); | 195 | cyc = cd.actual_read_sched_clock(); | 
| 135 | ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, | 196 | ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift); | 
| 136 | cd.mult, cd.shift); | 197 | cd.actual_read_sched_clock = read; | 
| 137 | 198 | ||
| 138 | raw_write_seqcount_begin(&cd.seq); | 199 | rd.read_sched_clock = read; | 
| 139 | read_sched_clock = read; | 200 | rd.sched_clock_mask = new_mask; | 
| 140 | sched_clock_mask = new_mask; | 201 | rd.mult = new_mult; | 
| 141 | cd.rate = rate; | 202 | rd.shift = new_shift; | 
| 142 | cd.wrap_kt = new_wrap_kt; | 203 | rd.epoch_cyc = new_epoch; | 
| 143 | cd.mult = new_mult; | 204 | rd.epoch_ns = ns; | 
| 144 | cd.shift = new_shift; | 205 | |
| 145 | cd.epoch_cyc = new_epoch; | 206 | update_clock_read_data(&rd); | 
| 146 | cd.epoch_ns = ns; | ||
| 147 | raw_write_seqcount_end(&cd.seq); | ||
| 148 | 207 | ||
| 149 | r = rate; | 208 | r = rate; | 
| 150 | if (r >= 4000000) { | 209 | if (r >= 4000000) { | 
| 151 | r /= 1000000; | 210 | r /= 1000000; | 
| 152 | r_unit = 'M'; | 211 | r_unit = 'M'; | 
| 153 | } else if (r >= 1000) { | 212 | } else { | 
| 154 | r /= 1000; | 213 | if (r >= 1000) { | 
| 155 | r_unit = 'k'; | 214 | r /= 1000; | 
| 156 | } else | 215 | r_unit = 'k'; | 
| 157 | r_unit = ' '; | 216 | } else { | 
| 158 | 217 | r_unit = ' '; | |
| 159 | /* calculate the ns resolution of this counter */ | 218 | } | 
| 219 | } | ||
| 220 | |||
| 221 | /* Calculate the ns resolution of this counter */ | ||
| 160 | res = cyc_to_ns(1ULL, new_mult, new_shift); | 222 | res = cyc_to_ns(1ULL, new_mult, new_shift); | 
| 161 | 223 | ||
| 162 | pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", | 224 | pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", | 
| 163 | bits, r, r_unit, res, wrap); | 225 | bits, r, r_unit, res, wrap); | 
| 164 | 226 | ||
| 165 | /* Enable IRQ time accounting if we have a fast enough sched_clock */ | 227 | /* Enable IRQ time accounting if we have a fast enough sched_clock() */ | 
| 166 | if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) | 228 | if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) | 
| 167 | enable_sched_clock_irqtime(); | 229 | enable_sched_clock_irqtime(); | 
| 168 | 230 | ||
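update_clock_read_data() above is the writer side of a seqcount "latch": readers index read_data[seq & 1], the writer refreshes the copy readers are not currently steered to, bumps the counter, then repeats for the other copy. The single-threaded sketch below models only that index-steering logic with a plain counter; it deliberately leaves out the memory barriers supplied by the kernel's seqcount primitives, which are what make the scheme safe against concurrent (even NMI) readers.

#include <stdint.h>
#include <stdio.h>

struct read_data { uint64_t epoch_ns, epoch_cyc; };

static unsigned int seq;              /* lowest bit selects the copy */
static struct read_data data[2];

static void writer_update(struct read_data new_rd)
{
    data[1] = new_rd;   /* refresh the odd copy (readers are on even) */
    seq++;              /* steer readers to the odd copy              */
    data[0] = new_rd;   /* refresh the even copy                      */
    seq++;              /* steer readers back to the even copy        */
}

static struct read_data reader_snapshot(void)
{
    /* a real reader re-checks seq afterwards and retries on change */
    return data[seq & 1];
}

int main(void)
{
    struct read_data rd = { .epoch_ns = 123456789, .epoch_cyc = 42 };

    writer_update(rd);
    rd = reader_snapshot();
    printf("epoch_ns=%llu epoch_cyc=%llu (reading copy %u)\n",
           (unsigned long long)rd.epoch_ns,
           (unsigned long long)rd.epoch_cyc, seq & 1);
    return 0;
}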
| @@ -172,10 +234,10 @@ void __init sched_clock_register(u64 (*read)(void), int bits, | |||
| 172 | void __init sched_clock_postinit(void) | 234 | void __init sched_clock_postinit(void) | 
| 173 | { | 235 | { | 
| 174 | /* | 236 | /* | 
| 175 | * If no sched_clock function has been provided at that point, | 237 | * If no sched_clock() function has been provided at that point, | 
| 176 | * make it the final one. | 238 | * make it the final one. | 
| 177 | */ | 239 | */ | 
| 178 | if (read_sched_clock == jiffy_sched_clock_read) | 240 | if (cd.actual_read_sched_clock == jiffy_sched_clock_read) | 
| 179 | sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); | 241 | sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); | 
| 180 | 242 | ||
| 181 | update_sched_clock(); | 243 | update_sched_clock(); | 
| @@ -189,29 +251,53 @@ void __init sched_clock_postinit(void) | |||
| 189 | hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); | 251 | hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); | 
| 190 | } | 252 | } | 
| 191 | 253 | ||
| 254 | /* | ||
| 255 | * Clock read function for use when the clock is suspended. | ||
| 256 | * | ||
| 257 | * This function makes it appear to sched_clock() as if the clock | ||
| 258 | * stopped counting at its last update. | ||
| 259 | * | ||
| 260 | * This function must only be called from the critical | ||
| 261 | * section in sched_clock(). It relies on the read_seqcount_retry() | ||
| 262 | * at the end of the critical section to be sure we observe the | ||
| 263 | * correct copy of 'epoch_cyc'. | ||
| 264 | */ | ||
| 265 | static u64 notrace suspended_sched_clock_read(void) | ||
| 266 | { | ||
| 267 | unsigned long seq = raw_read_seqcount(&cd.seq); | ||
| 268 | |||
| 269 | return cd.read_data[seq & 1].epoch_cyc; | ||
| 270 | } | ||
| 271 | |||
| 192 | static int sched_clock_suspend(void) | 272 | static int sched_clock_suspend(void) | 
| 193 | { | 273 | { | 
| 274 | struct clock_read_data *rd = &cd.read_data[0]; | ||
| 275 | |||
| 194 | update_sched_clock(); | 276 | update_sched_clock(); | 
| 195 | hrtimer_cancel(&sched_clock_timer); | 277 | hrtimer_cancel(&sched_clock_timer); | 
| 196 | cd.suspended = true; | 278 | rd->read_sched_clock = suspended_sched_clock_read; | 
| 279 | |||
| 197 | return 0; | 280 | return 0; | 
| 198 | } | 281 | } | 
| 199 | 282 | ||
| 200 | static void sched_clock_resume(void) | 283 | static void sched_clock_resume(void) | 
| 201 | { | 284 | { | 
| 202 | cd.epoch_cyc = read_sched_clock(); | 285 | struct clock_read_data *rd = &cd.read_data[0]; | 
| 286 | |||
| 287 | rd->epoch_cyc = cd.actual_read_sched_clock(); | ||
| 203 | hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); | 288 | hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); | 
| 204 | cd.suspended = false; | 289 | rd->read_sched_clock = cd.actual_read_sched_clock; | 
| 205 | } | 290 | } | 
| 206 | 291 | ||
| 207 | static struct syscore_ops sched_clock_ops = { | 292 | static struct syscore_ops sched_clock_ops = { | 
| 208 | .suspend = sched_clock_suspend, | 293 | .suspend = sched_clock_suspend, | 
| 209 | .resume = sched_clock_resume, | 294 | .resume = sched_clock_resume, | 
| 210 | }; | 295 | }; | 
| 211 | 296 | ||
| 212 | static int __init sched_clock_syscore_init(void) | 297 | static int __init sched_clock_syscore_init(void) | 
| 213 | { | 298 | { | 
| 214 | register_syscore_ops(&sched_clock_ops); | 299 | register_syscore_ops(&sched_clock_ops); | 
| 300 | |||
| 215 | return 0; | 301 | return 0; | 
| 216 | } | 302 | } | 
| 217 | device_initcall(sched_clock_syscore_init); | 303 | device_initcall(sched_clock_syscore_init); | 
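sched_clock_register() above converts counter ticks to nanoseconds with the usual mult/shift fixed-point scheme (ns = (cyc * mult) >> shift) and derives the wrap interval from how far the multiplication can go before overflowing 64 bits. The standalone sketch below reproduces that arithmetic for an assumed 24 MHz, 56-bit counter; the mult value is a simple rounded quotient rather than the search clocks_calc_mult_shift() performs, and the extra margin handling of clocks_calc_max_nsecs() is omitted.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

static uint64_t cyc_to_ns(uint64_t cyc, uint32_t mult, uint32_t shift)
{
    return (cyc * mult) >> shift;
}

int main(void)
{
    uint64_t rate = 24000000;                 /* assumed 24 MHz counter */
    uint32_t shift = 24;                      /* assumed scale factor   */
    uint32_t mult = (uint32_t)(((NSEC_PER_SEC << shift) + rate / 2) / rate);
    uint64_t mask = (1ULL << 56) - 1;         /* 56-bit counter         */

    /* largest cycle delta before (cyc * mult) overflows 64 bits */
    uint64_t max_cycles = UINT64_MAX / mult;
    if (max_cycles > mask)
        max_cycles = mask;

    printf("resolution: %llu ns/tick\n",
           (unsigned long long)cyc_to_ns(1, mult, shift));
    printf("wraps after about %llu seconds\n",
           (unsigned long long)(cyc_to_ns(max_cycles, mult, shift) / NSEC_PER_SEC));
    return 0;
}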
| diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index eb682d5c697c..6aac4beedbbe 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c | |||
| @@ -49,6 +49,7 @@ static void bc_set_mode(enum clock_event_mode mode, | |||
| 49 | */ | 49 | */ | 
| 50 | static int bc_set_next(ktime_t expires, struct clock_event_device *bc) | 50 | static int bc_set_next(ktime_t expires, struct clock_event_device *bc) | 
| 51 | { | 51 | { | 
| 52 | int bc_moved; | ||
| 52 | /* | 53 | /* | 
| 53 | * We try to cancel the timer first. If the callback is on | 54 | * We try to cancel the timer first. If the callback is on | 
| 54 | * flight on some other cpu then we let it handle it. If we | 55 | * flight on some other cpu then we let it handle it. If we | 
| @@ -60,9 +61,15 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) | |||
| 60 | * restart the timer because we are in the callback, but we | 61 | * restart the timer because we are in the callback, but we | 
| 61 | * can set the expiry time and let the callback return | 62 | * can set the expiry time and let the callback return | 
| 62 | * HRTIMER_RESTART. | 63 | * HRTIMER_RESTART. | 
| 64 | * | ||
| 65 | * Since we are in the idle loop at this point and because | ||
| 66 | * hrtimer_{start/cancel} functions call into tracing, | ||
| 67 | * calls to these functions must be bound within RCU_NONIDLE. | ||
| 63 | */ | 68 | */ | 
| 64 | if (hrtimer_try_to_cancel(&bctimer) >= 0) { | 69 | RCU_NONIDLE(bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ? | 
| 65 | hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED); | 70 | !hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) : | 
| 71 | 0); | ||
| 72 | if (bc_moved) { | ||
| 66 | /* Bind the "device" to the cpu */ | 73 | /* Bind the "device" to the cpu */ | 
| 67 | bc->bound_on = smp_processor_id(); | 74 | bc->bound_on = smp_processor_id(); | 
| 68 | } else if (bc->bound_on == smp_processor_id()) { | 75 | } else if (bc->bound_on == smp_processor_id()) { | 
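In bc_set_next() above, the hrtimer_try_to_cancel()/hrtimer_start() calls and the decision derived from them are folded into one expression so the whole thing can sit inside RCU_NONIDLE() while the outcome is still captured in bc_moved. A toy userspace illustration of that shape, with a made-up WITH_SECTION() wrapper standing in for RCU_NONIDLE() and trivial stub functions:

#include <stdio.h>

static int section_depth;

#define WITH_SECTION(expr)                          \
    do {                                            \
        section_depth++;   /* "leave idle" */       \
        expr;                                       \
        section_depth--;   /* "re-enter idle" */    \
    } while (0)

static int try_to_cancel(void) { return 1; }   /* pretend cancel succeeded */
static int start_timer(void)   { return 0; }   /* 0 == started OK          */

int main(void)
{
    int moved;

    WITH_SECTION(moved = (try_to_cancel() >= 0) ? !start_timer() : 0);
    printf("timer moved to this CPU: %d (depth back to %d)\n",
           moved, section_depth);
    return 0;
}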
| diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 066f0ec05e48..7e8ca4f448a8 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
| @@ -33,12 +33,14 @@ static cpumask_var_t tick_broadcast_mask; | |||
| 33 | static cpumask_var_t tick_broadcast_on; | 33 | static cpumask_var_t tick_broadcast_on; | 
| 34 | static cpumask_var_t tmpmask; | 34 | static cpumask_var_t tmpmask; | 
| 35 | static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); | 35 | static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); | 
| 36 | static int tick_broadcast_force; | 36 | static int tick_broadcast_forced; | 
| 37 | 37 | ||
| 38 | #ifdef CONFIG_TICK_ONESHOT | 38 | #ifdef CONFIG_TICK_ONESHOT | 
| 39 | static void tick_broadcast_clear_oneshot(int cpu); | 39 | static void tick_broadcast_clear_oneshot(int cpu); | 
| 40 | static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); | ||
| 40 | #else | 41 | #else | 
| 41 | static inline void tick_broadcast_clear_oneshot(int cpu) { } | 42 | static inline void tick_broadcast_clear_oneshot(int cpu) { } | 
| 43 | static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } | ||
| 42 | #endif | 44 | #endif | 
| 43 | 45 | ||
| 44 | /* | 46 | /* | 
| @@ -303,7 +305,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) | |||
| 303 | /* | 305 | /* | 
| 304 | * The device is in periodic mode. No reprogramming necessary: | 306 | * The device is in periodic mode. No reprogramming necessary: | 
| 305 | */ | 307 | */ | 
| 306 | if (dev->mode == CLOCK_EVT_MODE_PERIODIC) | 308 | if (dev->state == CLOCK_EVT_STATE_PERIODIC) | 
| 307 | goto unlock; | 309 | goto unlock; | 
| 308 | 310 | ||
| 309 | /* | 311 | /* | 
| @@ -324,49 +326,54 @@ unlock: | |||
| 324 | raw_spin_unlock(&tick_broadcast_lock); | 326 | raw_spin_unlock(&tick_broadcast_lock); | 
| 325 | } | 327 | } | 
| 326 | 328 | ||
| 327 | /* | 329 | /** | 
| 328 | * Powerstate information: The system enters/leaves a state, where | 330 | * tick_broadcast_control - Enable/disable or force broadcast mode | 
| 329 | * affected devices might stop | 331 | * @mode: The selected broadcast mode | 
| 332 | * | ||
| 333 | * Called when the system enters a state where affected tick devices | ||
| 334 | * might stop. Note: TICK_BROADCAST_FORCE cannot be undone. | ||
| 335 | * | ||
| 336 | * Called with interrupts disabled, so clockevents_lock is not | ||
| 337 | * required here because the local clock event device cannot go away | ||
| 338 | * under us. | ||
| 330 | */ | 339 | */ | 
| 331 | static void tick_do_broadcast_on_off(unsigned long *reason) | 340 | void tick_broadcast_control(enum tick_broadcast_mode mode) | 
| 332 | { | 341 | { | 
| 333 | struct clock_event_device *bc, *dev; | 342 | struct clock_event_device *bc, *dev; | 
| 334 | struct tick_device *td; | 343 | struct tick_device *td; | 
| 335 | unsigned long flags; | ||
| 336 | int cpu, bc_stopped; | 344 | int cpu, bc_stopped; | 
| 337 | 345 | ||
| 338 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 346 | td = this_cpu_ptr(&tick_cpu_device); | 
| 339 | |||
| 340 | cpu = smp_processor_id(); | ||
| 341 | td = &per_cpu(tick_cpu_device, cpu); | ||
| 342 | dev = td->evtdev; | 347 | dev = td->evtdev; | 
| 343 | bc = tick_broadcast_device.evtdev; | ||
| 344 | 348 | ||
| 345 | /* | 349 | /* | 
| 346 | * Is the device not affected by the powerstate ? | 350 | * Is the device not affected by the powerstate ? | 
| 347 | */ | 351 | */ | 
| 348 | if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP)) | 352 | if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP)) | 
| 349 | goto out; | 353 | return; | 
| 350 | 354 | ||
| 351 | if (!tick_device_is_functional(dev)) | 355 | if (!tick_device_is_functional(dev)) | 
| 352 | goto out; | 356 | return; | 
| 353 | 357 | ||
| 358 | raw_spin_lock(&tick_broadcast_lock); | ||
| 359 | cpu = smp_processor_id(); | ||
| 360 | bc = tick_broadcast_device.evtdev; | ||
| 354 | bc_stopped = cpumask_empty(tick_broadcast_mask); | 361 | bc_stopped = cpumask_empty(tick_broadcast_mask); | 
| 355 | 362 | ||
| 356 | switch (*reason) { | 363 | switch (mode) { | 
| 357 | case CLOCK_EVT_NOTIFY_BROADCAST_ON: | 364 | case TICK_BROADCAST_FORCE: | 
| 358 | case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: | 365 | tick_broadcast_forced = 1; | 
| 366 | case TICK_BROADCAST_ON: | ||
| 359 | cpumask_set_cpu(cpu, tick_broadcast_on); | 367 | cpumask_set_cpu(cpu, tick_broadcast_on); | 
| 360 | if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { | 368 | if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { | 
| 361 | if (tick_broadcast_device.mode == | 369 | if (tick_broadcast_device.mode == | 
| 362 | TICKDEV_MODE_PERIODIC) | 370 | TICKDEV_MODE_PERIODIC) | 
| 363 | clockevents_shutdown(dev); | 371 | clockevents_shutdown(dev); | 
| 364 | } | 372 | } | 
| 365 | if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) | ||
| 366 | tick_broadcast_force = 1; | ||
| 367 | break; | 373 | break; | 
| 368 | case CLOCK_EVT_NOTIFY_BROADCAST_OFF: | 374 | |
| 369 | if (tick_broadcast_force) | 375 | case TICK_BROADCAST_OFF: | 
| 376 | if (tick_broadcast_forced) | ||
| 370 | break; | 377 | break; | 
| 371 | cpumask_clear_cpu(cpu, tick_broadcast_on); | 378 | cpumask_clear_cpu(cpu, tick_broadcast_on); | 
| 372 | if (!tick_device_is_functional(dev)) | 379 | if (!tick_device_is_functional(dev)) | 
| @@ -388,22 +395,9 @@ static void tick_do_broadcast_on_off(unsigned long *reason) | |||
| 388 | else | 395 | else | 
| 389 | tick_broadcast_setup_oneshot(bc); | 396 | tick_broadcast_setup_oneshot(bc); | 
| 390 | } | 397 | } | 
| 391 | out: | 398 | raw_spin_unlock(&tick_broadcast_lock); | 
| 392 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | ||
| 393 | } | ||
| 394 | |||
| 395 | /* | ||
| 396 | * Powerstate information: The system enters/leaves a state, where | ||
| 397 | * affected devices might stop. | ||
| 398 | */ | ||
| 399 | void tick_broadcast_on_off(unsigned long reason, int *oncpu) | ||
| 400 | { | ||
| 401 | if (!cpumask_test_cpu(*oncpu, cpu_online_mask)) | ||
| 402 | printk(KERN_ERR "tick-broadcast: ignoring broadcast for " | ||
| 403 | "offline CPU #%d\n", *oncpu); | ||
| 404 | else | ||
| 405 | tick_do_broadcast_on_off(&reason); | ||
| 406 | } | 399 | } | 
| 400 | EXPORT_SYMBOL_GPL(tick_broadcast_control); | ||
| 407 | 401 | ||
| 408 | /* | 402 | /* | 
| 409 | * Set the periodic handler depending on broadcast on/off | 403 | * Set the periodic handler depending on broadcast on/off | 
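tick_broadcast_control() above replaces the old reason codes with an enum, and TICK_BROADCAST_FORCE deliberately falls through into TICK_BROADCAST_ON after latching the forced flag, which is also why TICK_BROADCAST_OFF refuses to undo it. A compact model of that control flow, with a 64-bit word standing in for the cpumask and the clockevents calls reduced to prints:

#include <stdint.h>
#include <stdio.h>

enum tick_broadcast_mode { TICK_BROADCAST_OFF, TICK_BROADCAST_ON, TICK_BROADCAST_FORCE };

static uint64_t broadcast_mask;        /* stand-in for tick_broadcast_mask */
static int broadcast_forced;

static void broadcast_control(enum tick_broadcast_mode mode, int cpu)
{
    switch (mode) {
    case TICK_BROADCAST_FORCE:
        broadcast_forced = 1;
        /* fall through: force implies on */
    case TICK_BROADCAST_ON:
        if (!(broadcast_mask & (1ULL << cpu))) {
            broadcast_mask |= 1ULL << cpu;
            printf("cpu%d: local tick shut down, broadcast takes over\n", cpu);
        }
        break;
    case TICK_BROADCAST_OFF:
        if (broadcast_forced)
            break;                      /* FORCE cannot be undone */
        broadcast_mask &= ~(1ULL << cpu);
        printf("cpu%d: local tick restored\n", cpu);
        break;
    }
}

int main(void)
{
    broadcast_control(TICK_BROADCAST_FORCE, 0);
    broadcast_control(TICK_BROADCAST_OFF, 0);   /* ignored: forced */
    printf("mask=%#llx forced=%d\n",
           (unsigned long long)broadcast_mask, broadcast_forced);
    return 0;
}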
| @@ -416,14 +410,14 @@ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) | |||
| 416 | dev->event_handler = tick_handle_periodic_broadcast; | 410 | dev->event_handler = tick_handle_periodic_broadcast; | 
| 417 | } | 411 | } | 
| 418 | 412 | ||
| 413 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 419 | /* | 414 | /* | 
| 420 | * Remove a CPU from broadcasting | 415 | * Remove a CPU from broadcasting | 
| 421 | */ | 416 | */ | 
| 422 | void tick_shutdown_broadcast(unsigned int *cpup) | 417 | void tick_shutdown_broadcast(unsigned int cpu) | 
| 423 | { | 418 | { | 
| 424 | struct clock_event_device *bc; | 419 | struct clock_event_device *bc; | 
| 425 | unsigned long flags; | 420 | unsigned long flags; | 
| 426 | unsigned int cpu = *cpup; | ||
| 427 | 421 | ||
| 428 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 422 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 
| 429 | 423 | ||
| @@ -438,6 +432,7 @@ void tick_shutdown_broadcast(unsigned int *cpup) | |||
| 438 | 432 | ||
| 439 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 433 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 
| 440 | } | 434 | } | 
| 435 | #endif | ||
| 441 | 436 | ||
| 442 | void tick_suspend_broadcast(void) | 437 | void tick_suspend_broadcast(void) | 
| 443 | { | 438 | { | 
| @@ -453,38 +448,48 @@ void tick_suspend_broadcast(void) | |||
| 453 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 448 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 
| 454 | } | 449 | } | 
| 455 | 450 | ||
| 456 | int tick_resume_broadcast(void) | 451 | /* | 
| 452 | * This is called from tick_resume_local() on a resuming CPU. That's | ||
| 453 | * called from the core resume function, tick_unfreeze() and the magic XEN | ||
| 454 | * resume hackery. | ||
| 455 | * | ||
| 456 | * In none of these cases the broadcast device mode can change and the | ||
| 457 | * bit of the resuming CPU in the broadcast mask is safe as well. | ||
| 458 | */ | ||
| 459 | bool tick_resume_check_broadcast(void) | ||
| 460 | { | ||
| 461 | if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) | ||
| 462 | return false; | ||
| 463 | else | ||
| 464 | return cpumask_test_cpu(smp_processor_id(), tick_broadcast_mask); | ||
| 465 | } | ||
| 466 | |||
| 467 | void tick_resume_broadcast(void) | ||
| 457 | { | 468 | { | 
| 458 | struct clock_event_device *bc; | 469 | struct clock_event_device *bc; | 
| 459 | unsigned long flags; | 470 | unsigned long flags; | 
| 460 | int broadcast = 0; | ||
| 461 | 471 | ||
| 462 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 472 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 
| 463 | 473 | ||
| 464 | bc = tick_broadcast_device.evtdev; | 474 | bc = tick_broadcast_device.evtdev; | 
| 465 | 475 | ||
| 466 | if (bc) { | 476 | if (bc) { | 
| 467 | clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME); | 477 | clockevents_tick_resume(bc); | 
| 468 | 478 | ||
| 469 | switch (tick_broadcast_device.mode) { | 479 | switch (tick_broadcast_device.mode) { | 
| 470 | case TICKDEV_MODE_PERIODIC: | 480 | case TICKDEV_MODE_PERIODIC: | 
| 471 | if (!cpumask_empty(tick_broadcast_mask)) | 481 | if (!cpumask_empty(tick_broadcast_mask)) | 
| 472 | tick_broadcast_start_periodic(bc); | 482 | tick_broadcast_start_periodic(bc); | 
| 473 | broadcast = cpumask_test_cpu(smp_processor_id(), | ||
| 474 | tick_broadcast_mask); | ||
| 475 | break; | 483 | break; | 
| 476 | case TICKDEV_MODE_ONESHOT: | 484 | case TICKDEV_MODE_ONESHOT: | 
| 477 | if (!cpumask_empty(tick_broadcast_mask)) | 485 | if (!cpumask_empty(tick_broadcast_mask)) | 
| 478 | broadcast = tick_resume_broadcast_oneshot(bc); | 486 | tick_resume_broadcast_oneshot(bc); | 
| 479 | break; | 487 | break; | 
| 480 | } | 488 | } | 
| 481 | } | 489 | } | 
| 482 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 490 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 
| 483 | |||
| 484 | return broadcast; | ||
| 485 | } | 491 | } | 
| 486 | 492 | ||
| 487 | |||
| 488 | #ifdef CONFIG_TICK_ONESHOT | 493 | #ifdef CONFIG_TICK_ONESHOT | 
| 489 | 494 | ||
| 490 | static cpumask_var_t tick_broadcast_oneshot_mask; | 495 | static cpumask_var_t tick_broadcast_oneshot_mask; | 
| @@ -532,8 +537,8 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, | |||
| 532 | { | 537 | { | 
| 533 | int ret; | 538 | int ret; | 
| 534 | 539 | ||
| 535 | if (bc->mode != CLOCK_EVT_MODE_ONESHOT) | 540 | if (bc->state != CLOCK_EVT_STATE_ONESHOT) | 
| 536 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | 541 | clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); | 
| 537 | 542 | ||
| 538 | ret = clockevents_program_event(bc, expires, force); | 543 | ret = clockevents_program_event(bc, expires, force); | 
| 539 | if (!ret) | 544 | if (!ret) | 
| @@ -541,10 +546,9 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, | |||
| 541 | return ret; | 546 | return ret; | 
| 542 | } | 547 | } | 
| 543 | 548 | ||
| 544 | int tick_resume_broadcast_oneshot(struct clock_event_device *bc) | 549 | static void tick_resume_broadcast_oneshot(struct clock_event_device *bc) | 
| 545 | { | 550 | { | 
| 546 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | 551 | clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); | 
| 547 | return 0; | ||
| 548 | } | 552 | } | 
| 549 | 553 | ||
| 550 | /* | 554 | /* | 
| @@ -562,8 +566,8 @@ void tick_check_oneshot_broadcast_this_cpu(void) | |||
| 562 | * switched over, leave the device alone. | 566 | * switched over, leave the device alone. | 
| 563 | */ | 567 | */ | 
| 564 | if (td->mode == TICKDEV_MODE_ONESHOT) { | 568 | if (td->mode == TICKDEV_MODE_ONESHOT) { | 
| 565 | clockevents_set_mode(td->evtdev, | 569 | clockevents_set_state(td->evtdev, | 
| 566 | CLOCK_EVT_MODE_ONESHOT); | 570 | CLOCK_EVT_STATE_ONESHOT); | 
| 567 | } | 571 | } | 
| 568 | } | 572 | } | 
| 569 | } | 573 | } | 
| @@ -666,31 +670,26 @@ static void broadcast_shutdown_local(struct clock_event_device *bc, | |||
| 666 | if (dev->next_event.tv64 < bc->next_event.tv64) | 670 | if (dev->next_event.tv64 < bc->next_event.tv64) | 
| 667 | return; | 671 | return; | 
| 668 | } | 672 | } | 
| 669 | clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); | 673 | clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN); | 
| 670 | } | 674 | } | 
| 671 | 675 | ||
| 672 | static void broadcast_move_bc(int deadcpu) | 676 | /** | 
| 673 | { | 677 | * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode | 
| 674 | struct clock_event_device *bc = tick_broadcast_device.evtdev; | 678 | * @state: The target state (enter/exit) | 
| 675 | 679 | * | |
| 676 | if (!bc || !broadcast_needs_cpu(bc, deadcpu)) | 680 | * The system enters/leaves a state, where affected devices might stop | 
| 677 | return; | ||
| 678 | /* This moves the broadcast assignment to this cpu */ | ||
| 679 | clockevents_program_event(bc, bc->next_event, 1); | ||
| 680 | } | ||
| 681 | |||
| 682 | /* | ||
| 683 | * Powerstate information: The system enters/leaves a state, where | ||
| 684 | * affected devices might stop | ||
| 685 | * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. | 681 | * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. | 
| 682 | * | ||
| 683 | * Called with interrupts disabled, so clockevents_lock is not | ||
| 684 | * required here because the local clock event device cannot go away | ||
| 685 | * under us. | ||
| 686 | */ | 686 | */ | 
| 687 | int tick_broadcast_oneshot_control(unsigned long reason) | 687 | int tick_broadcast_oneshot_control(enum tick_broadcast_state state) | 
| 688 | { | 688 | { | 
| 689 | struct clock_event_device *bc, *dev; | 689 | struct clock_event_device *bc, *dev; | 
| 690 | struct tick_device *td; | 690 | struct tick_device *td; | 
| 691 | unsigned long flags; | ||
| 692 | ktime_t now; | ||
| 693 | int cpu, ret = 0; | 691 | int cpu, ret = 0; | 
| 692 | ktime_t now; | ||
| 694 | 693 | ||
| 695 | /* | 694 | /* | 
| 696 | * Periodic mode does not care about the enter/exit of power | 695 | * Periodic mode does not care about the enter/exit of power | 
| @@ -703,17 +702,17 @@ int tick_broadcast_oneshot_control(unsigned long reason) | |||
| 703 | * We are called with preemption disabled from the depth of the | 702 | * We are called with preemption disabled from the depth of the | 
| 704 | * idle code, so we can't be moved away. | 703 | * idle code, so we can't be moved away. | 
| 705 | */ | 704 | */ | 
| 706 | cpu = smp_processor_id(); | 705 | td = this_cpu_ptr(&tick_cpu_device); | 
| 707 | td = &per_cpu(tick_cpu_device, cpu); | ||
| 708 | dev = td->evtdev; | 706 | dev = td->evtdev; | 
| 709 | 707 | ||
| 710 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) | 708 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) | 
| 711 | return 0; | 709 | return 0; | 
| 712 | 710 | ||
| 711 | raw_spin_lock(&tick_broadcast_lock); | ||
| 713 | bc = tick_broadcast_device.evtdev; | 712 | bc = tick_broadcast_device.evtdev; | 
| 713 | cpu = smp_processor_id(); | ||
| 714 | 714 | ||
| 715 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 715 | if (state == TICK_BROADCAST_ENTER) { | 
| 716 | if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { | ||
| 717 | if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { | 716 | if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { | 
| 718 | WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); | 717 | WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); | 
| 719 | broadcast_shutdown_local(bc, dev); | 718 | broadcast_shutdown_local(bc, dev); | 
| @@ -741,7 +740,7 @@ int tick_broadcast_oneshot_control(unsigned long reason) | |||
| 741 | cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); | 740 | cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); | 
| 742 | } else { | 741 | } else { | 
| 743 | if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { | 742 | if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { | 
| 744 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 743 | clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); | 
| 745 | /* | 744 | /* | 
| 746 | * The cpu which was handling the broadcast | 745 | * The cpu which was handling the broadcast | 
| 747 | * timer marked this cpu in the broadcast | 746 | * timer marked this cpu in the broadcast | 
| @@ -805,9 +804,10 @@ int tick_broadcast_oneshot_control(unsigned long reason) | |||
| 805 | } | 804 | } | 
| 806 | } | 805 | } | 
| 807 | out: | 806 | out: | 
| 808 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 807 | raw_spin_unlock(&tick_broadcast_lock); | 
| 809 | return ret; | 808 | return ret; | 
| 810 | } | 809 | } | 
| 810 | EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control); | ||
| 811 | 811 | ||
| 812 | /* | 812 | /* | 
| 813 | * Reset the one shot broadcast for a cpu | 813 | * Reset the one shot broadcast for a cpu | 
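The enter/exit sides of tick_broadcast_oneshot_control() pivot on atomic test-and-set / test-and-clear of the CPU's bit in tick_broadcast_oneshot_mask: set the bit and shut the local device down when going idle, clear it and reprogram on the way out, with the pending/force masks catching the races. A stripped-down model of just that bit bookkeeping (one word as the mask, pending/force handling omitted):

#include <stdint.h>
#include <stdio.h>

enum tick_broadcast_state { TICK_BROADCAST_ENTER, TICK_BROADCAST_EXIT };

static uint64_t oneshot_mask;   /* stand-in for tick_broadcast_oneshot_mask */

static int oneshot_control(enum tick_broadcast_state state, int cpu)
{
    uint64_t bit = 1ULL << cpu;

    if (state == TICK_BROADCAST_ENTER) {
        if (!(oneshot_mask & bit)) {    /* test_and_set */
            oneshot_mask |= bit;
            printf("cpu%d: local device shut down, broadcast armed\n", cpu);
        }
    } else {
        if (oneshot_mask & bit) {       /* test_and_clear */
            oneshot_mask &= ~bit;
            printf("cpu%d: local device back in oneshot mode\n", cpu);
        }
    }
    return 0;   /* the real code can also return -EBUSY */
}

int main(void)
{
    oneshot_control(TICK_BROADCAST_ENTER, 1);
    oneshot_control(TICK_BROADCAST_EXIT, 1);
    return 0;
}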
| @@ -842,7 +842,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | |||
| 842 | 842 | ||
| 843 | /* Set it up only once ! */ | 843 | /* Set it up only once ! */ | 
| 844 | if (bc->event_handler != tick_handle_oneshot_broadcast) { | 844 | if (bc->event_handler != tick_handle_oneshot_broadcast) { | 
| 845 | int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; | 845 | int was_periodic = bc->state == CLOCK_EVT_STATE_PERIODIC; | 
| 846 | 846 | ||
| 847 | bc->event_handler = tick_handle_oneshot_broadcast; | 847 | bc->event_handler = tick_handle_oneshot_broadcast; | 
| 848 | 848 | ||
| @@ -858,7 +858,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | |||
| 858 | tick_broadcast_oneshot_mask, tmpmask); | 858 | tick_broadcast_oneshot_mask, tmpmask); | 
| 859 | 859 | ||
| 860 | if (was_periodic && !cpumask_empty(tmpmask)) { | 860 | if (was_periodic && !cpumask_empty(tmpmask)) { | 
| 861 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | 861 | clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); | 
| 862 | tick_broadcast_init_next_event(tmpmask, | 862 | tick_broadcast_init_next_event(tmpmask, | 
| 863 | tick_next_period); | 863 | tick_next_period); | 
| 864 | tick_broadcast_set_event(bc, cpu, tick_next_period, 1); | 864 | tick_broadcast_set_event(bc, cpu, tick_next_period, 1); | 
| @@ -894,14 +894,28 @@ void tick_broadcast_switch_to_oneshot(void) | |||
| 894 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 894 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 
| 895 | } | 895 | } | 
| 896 | 896 | ||
| 897 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 898 | void hotplug_cpu__broadcast_tick_pull(int deadcpu) | ||
| 899 | { | ||
| 900 | struct clock_event_device *bc; | ||
| 901 | unsigned long flags; | ||
| 902 | |||
| 903 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | ||
| 904 | bc = tick_broadcast_device.evtdev; | ||
| 905 | |||
| 906 | if (bc && broadcast_needs_cpu(bc, deadcpu)) { | ||
| 907 | /* This moves the broadcast assignment to this CPU: */ | ||
| 908 | clockevents_program_event(bc, bc->next_event, 1); | ||
| 909 | } | ||
| 910 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | ||
| 911 | } | ||
| 897 | 912 | ||
| 898 | /* | 913 | /* | 
| 899 | * Remove a dead CPU from broadcasting | 914 | * Remove a dead CPU from broadcasting | 
| 900 | */ | 915 | */ | 
| 901 | void tick_shutdown_broadcast_oneshot(unsigned int *cpup) | 916 | void tick_shutdown_broadcast_oneshot(unsigned int cpu) | 
| 902 | { | 917 | { | 
| 903 | unsigned long flags; | 918 | unsigned long flags; | 
| 904 | unsigned int cpu = *cpup; | ||
| 905 | 919 | ||
| 906 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 920 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 
| 907 | 921 | ||
| @@ -913,10 +927,9 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) | |||
| 913 | cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); | 927 | cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); | 
| 914 | cpumask_clear_cpu(cpu, tick_broadcast_force_mask); | 928 | cpumask_clear_cpu(cpu, tick_broadcast_force_mask); | 
| 915 | 929 | ||
| 916 | broadcast_move_bc(cpu); | ||
| 917 | |||
| 918 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 930 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 
| 919 | } | 931 | } | 
| 932 | #endif | ||
| 920 | 933 | ||
| 921 | /* | 934 | /* | 
| 922 | * Check, whether the broadcast device is in one shot mode | 935 | * Check, whether the broadcast device is in one shot mode | 
| diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index f7c515595b42..3ae6afa1eb98 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
| @@ -102,7 +102,7 @@ void tick_handle_periodic(struct clock_event_device *dev) | |||
| 102 | 102 | ||
| 103 | tick_periodic(cpu); | 103 | tick_periodic(cpu); | 
| 104 | 104 | ||
| 105 | if (dev->mode != CLOCK_EVT_MODE_ONESHOT) | 105 | if (dev->state != CLOCK_EVT_STATE_ONESHOT) | 
| 106 | return; | 106 | return; | 
| 107 | for (;;) { | 107 | for (;;) { | 
| 108 | /* | 108 | /* | 
| @@ -140,7 +140,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) | |||
| 140 | 140 | ||
| 141 | if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && | 141 | if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && | 
| 142 | !tick_broadcast_oneshot_active()) { | 142 | !tick_broadcast_oneshot_active()) { | 
| 143 | clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); | 143 | clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC); | 
| 144 | } else { | 144 | } else { | 
| 145 | unsigned long seq; | 145 | unsigned long seq; | 
| 146 | ktime_t next; | 146 | ktime_t next; | 
| @@ -150,7 +150,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) | |||
| 150 | next = tick_next_period; | 150 | next = tick_next_period; | 
| 151 | } while (read_seqretry(&jiffies_lock, seq)); | 151 | } while (read_seqretry(&jiffies_lock, seq)); | 
| 152 | 152 | ||
| 153 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 153 | clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); | 
| 154 | 154 | ||
| 155 | for (;;) { | 155 | for (;;) { | 
| 156 | if (!clockevents_program_event(dev, next, false)) | 156 | if (!clockevents_program_event(dev, next, false)) | 
| @@ -332,14 +332,16 @@ out_bc: | |||
| 332 | tick_install_broadcast_device(newdev); | 332 | tick_install_broadcast_device(newdev); | 
| 333 | } | 333 | } | 
| 334 | 334 | ||
| 335 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 335 | /* | 336 | /* | 
| 336 | * Transfer the do_timer job away from a dying cpu. | 337 | * Transfer the do_timer job away from a dying cpu. | 
| 337 | * | 338 | * | 
| 338 | * Called with interrupts disabled. | 339 | * Called with interrupts disabled. Not locking required. If | 
| 340 | * tick_do_timer_cpu is owned by this cpu, nothing can change it. | ||
| 339 | */ | 341 | */ | 
| 340 | void tick_handover_do_timer(int *cpup) | 342 | void tick_handover_do_timer(void) | 
| 341 | { | 343 | { | 
| 342 | if (*cpup == tick_do_timer_cpu) { | 344 | if (tick_do_timer_cpu == smp_processor_id()) { | 
| 343 | int cpu = cpumask_first(cpu_online_mask); | 345 | int cpu = cpumask_first(cpu_online_mask); | 
| 344 | 346 | ||
| 345 | tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : | 347 | tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : | 
| @@ -354,9 +356,9 @@ void tick_handover_do_timer(int *cpup) | |||
| 354 | * access the hardware device itself. | 356 | * access the hardware device itself. | 
| 355 | * We just set the mode and remove it from the lists. | 357 | * We just set the mode and remove it from the lists. | 
| 356 | */ | 358 | */ | 
| 357 | void tick_shutdown(unsigned int *cpup) | 359 | void tick_shutdown(unsigned int cpu) | 
| 358 | { | 360 | { | 
| 359 | struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); | 361 | struct tick_device *td = &per_cpu(tick_cpu_device, cpu); | 
| 360 | struct clock_event_device *dev = td->evtdev; | 362 | struct clock_event_device *dev = td->evtdev; | 
| 361 | 363 | ||
| 362 | td->mode = TICKDEV_MODE_PERIODIC; | 364 | td->mode = TICKDEV_MODE_PERIODIC; | 
| @@ -365,27 +367,42 @@ void tick_shutdown(unsigned int *cpup) | |||
| 365 | * Prevent that the clock events layer tries to call | 367 | * Prevent that the clock events layer tries to call | 
| 366 | * the set mode function! | 368 | * the set mode function! | 
| 367 | */ | 369 | */ | 
| 370 | dev->state = CLOCK_EVT_STATE_DETACHED; | ||
| 368 | dev->mode = CLOCK_EVT_MODE_UNUSED; | 371 | dev->mode = CLOCK_EVT_MODE_UNUSED; | 
| 369 | clockevents_exchange_device(dev, NULL); | 372 | clockevents_exchange_device(dev, NULL); | 
| 370 | dev->event_handler = clockevents_handle_noop; | 373 | dev->event_handler = clockevents_handle_noop; | 
| 371 | td->evtdev = NULL; | 374 | td->evtdev = NULL; | 
| 372 | } | 375 | } | 
| 373 | } | 376 | } | 
| 377 | #endif | ||
| 374 | 378 | ||
| 375 | void tick_suspend(void) | 379 | /** | 
| 380 | * tick_suspend_local - Suspend the local tick device | ||
| 381 | * | ||
| 382 | * Called from the local cpu for freeze with interrupts disabled. | ||
| 383 | * | ||
| 384 | * No locks required. Nothing can change the per cpu device. | ||
| 385 | */ | ||
| 386 | void tick_suspend_local(void) | ||
| 376 | { | 387 | { | 
| 377 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); | 388 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); | 
| 378 | 389 | ||
| 379 | clockevents_shutdown(td->evtdev); | 390 | clockevents_shutdown(td->evtdev); | 
| 380 | } | 391 | } | 
| 381 | 392 | ||
| 382 | void tick_resume(void) | 393 | /** | 
| 394 | * tick_resume_local - Resume the local tick device | ||
| 395 | * | ||
| 396 | * Called from the local CPU for unfreeze or XEN resume magic. | ||
| 397 | * | ||
| 398 | * No locks required. Nothing can change the per cpu device. | ||
| 399 | */ | ||
| 400 | void tick_resume_local(void) | ||
| 383 | { | 401 | { | 
| 384 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); | 402 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); | 
| 385 | int broadcast = tick_resume_broadcast(); | 403 | bool broadcast = tick_resume_check_broadcast(); | 
| 386 | |||
| 387 | clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); | ||
| 388 | 404 | ||
| 405 | clockevents_tick_resume(td->evtdev); | ||
| 389 | if (!broadcast) { | 406 | if (!broadcast) { | 
| 390 | if (td->mode == TICKDEV_MODE_PERIODIC) | 407 | if (td->mode == TICKDEV_MODE_PERIODIC) | 
| 391 | tick_setup_periodic(td->evtdev, 0); | 408 | tick_setup_periodic(td->evtdev, 0); | 
| @@ -394,6 +411,35 @@ void tick_resume(void) | |||
| 394 | } | 411 | } | 
| 395 | } | 412 | } | 
| 396 | 413 | ||
| 414 | /** | ||
| 415 | * tick_suspend - Suspend the tick and the broadcast device | ||
| 416 | * | ||
| 417 | * Called from syscore_suspend() via timekeeping_suspend with only one | ||
| 418 | * CPU online and interrupts disabled or from tick_unfreeze() under | ||
| 419 | * tick_freeze_lock. | ||
| 420 | * | ||
| 421 | * No locks required. Nothing can change the per cpu device. | ||
| 422 | */ | ||
| 423 | void tick_suspend(void) | ||
| 424 | { | ||
| 425 | tick_suspend_local(); | ||
| 426 | tick_suspend_broadcast(); | ||
| 427 | } | ||
| 428 | |||
| 429 | /** | ||
| 430 | * tick_resume - Resume the tick and the broadcast device | ||
| 431 | * | ||
| 432 | * Called from syscore_resume() via timekeeping_resume with only one | ||
| 433 | * CPU online and interrupts disabled. | ||
| 434 | * | ||
| 435 | * No locks required. Nothing can change the per cpu device. | ||
| 436 | */ | ||
| 437 | void tick_resume(void) | ||
| 438 | { | ||
| 439 | tick_resume_broadcast(); | ||
| 440 | tick_resume_local(); | ||
| 441 | } | ||
| 442 | |||
| 397 | static DEFINE_RAW_SPINLOCK(tick_freeze_lock); | 443 | static DEFINE_RAW_SPINLOCK(tick_freeze_lock); | 
| 398 | static unsigned int tick_freeze_depth; | 444 | static unsigned int tick_freeze_depth; | 
| 399 | 445 | ||
| @@ -411,12 +457,10 @@ void tick_freeze(void) | |||
| 411 | raw_spin_lock(&tick_freeze_lock); | 457 | raw_spin_lock(&tick_freeze_lock); | 
| 412 | 458 | ||
| 413 | tick_freeze_depth++; | 459 | tick_freeze_depth++; | 
| 414 | if (tick_freeze_depth == num_online_cpus()) { | 460 | if (tick_freeze_depth == num_online_cpus()) | 
| 415 | timekeeping_suspend(); | 461 | timekeeping_suspend(); | 
| 416 | } else { | 462 | else | 
| 417 | tick_suspend(); | 463 | tick_suspend_local(); | 
| 418 | tick_suspend_broadcast(); | ||
| 419 | } | ||
| 420 | 464 | ||
| 421 | raw_spin_unlock(&tick_freeze_lock); | 465 | raw_spin_unlock(&tick_freeze_lock); | 
| 422 | } | 466 | } | 
| @@ -437,7 +481,7 @@ void tick_unfreeze(void) | |||
| 437 | if (tick_freeze_depth == num_online_cpus()) | 481 | if (tick_freeze_depth == num_online_cpus()) | 
| 438 | timekeeping_resume(); | 482 | timekeeping_resume(); | 
| 439 | else | 483 | else | 
| 440 | tick_resume(); | 484 | tick_resume_local(); | 
| 441 | 485 | ||
| 442 | tick_freeze_depth--; | 486 | tick_freeze_depth--; | 
| 443 | 487 | ||
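tick_freeze()/tick_unfreeze() above count CPUs under tick_freeze_lock: every CPU suspends or resumes its local tick, and only the last one to freeze (and the first one to unfreeze) touches timekeeping. A small model of that depth counting, with the per-CPU and global operations reduced to prints and the number of online CPUs assumed fixed:

#include <stdio.h>

#define NUM_ONLINE_CPUS 4              /* assumption for the sketch */

static unsigned int tick_freeze_depth;

static void tick_freeze_model(int cpu)
{
    /* the real code takes tick_freeze_lock around this */
    tick_freeze_depth++;
    if (tick_freeze_depth == NUM_ONLINE_CPUS)
        printf("cpu%d: last one in, suspend timekeeping\n", cpu);
    else
        printf("cpu%d: suspend local tick only\n", cpu);
}

static void tick_unfreeze_model(int cpu)
{
    if (tick_freeze_depth == NUM_ONLINE_CPUS)
        printf("cpu%d: first one out, resume timekeeping\n", cpu);
    else
        printf("cpu%d: resume local tick only\n", cpu);
    tick_freeze_depth--;
}

int main(void)
{
    for (int cpu = 0; cpu < NUM_ONLINE_CPUS; cpu++)
        tick_freeze_model(cpu);
    for (int cpu = NUM_ONLINE_CPUS - 1; cpu >= 0; cpu--)
        tick_unfreeze_model(cpu);
    return 0;
}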
| diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 366aeb4f2c66..b64fdd8054c5 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
| @@ -5,15 +5,12 @@ | |||
| 5 | #include <linux/tick.h> | 5 | #include <linux/tick.h> | 
| 6 | 6 | ||
| 7 | #include "timekeeping.h" | 7 | #include "timekeeping.h" | 
| 8 | #include "tick-sched.h" | ||
| 8 | 9 | ||
| 9 | extern seqlock_t jiffies_lock; | 10 | #ifdef CONFIG_GENERIC_CLOCKEVENTS | 
| 10 | 11 | ||
| 11 | #define CS_NAME_LEN 32 | 12 | # define TICK_DO_TIMER_NONE -1 | 
| 12 | 13 | # define TICK_DO_TIMER_BOOT -2 | |
| 13 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD | ||
| 14 | |||
| 15 | #define TICK_DO_TIMER_NONE -1 | ||
| 16 | #define TICK_DO_TIMER_BOOT -2 | ||
| 17 | 14 | ||
| 18 | DECLARE_PER_CPU(struct tick_device, tick_cpu_device); | 15 | DECLARE_PER_CPU(struct tick_device, tick_cpu_device); | 
| 19 | extern ktime_t tick_next_period; | 16 | extern ktime_t tick_next_period; | 
| @@ -23,21 +20,72 @@ extern int tick_do_timer_cpu __read_mostly; | |||
| 23 | extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); | 20 | extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); | 
| 24 | extern void tick_handle_periodic(struct clock_event_device *dev); | 21 | extern void tick_handle_periodic(struct clock_event_device *dev); | 
| 25 | extern void tick_check_new_device(struct clock_event_device *dev); | 22 | extern void tick_check_new_device(struct clock_event_device *dev); | 
| 26 | extern void tick_handover_do_timer(int *cpup); | 23 | extern void tick_shutdown(unsigned int cpu); | 
| 27 | extern void tick_shutdown(unsigned int *cpup); | ||
| 28 | extern void tick_suspend(void); | 24 | extern void tick_suspend(void); | 
| 29 | extern void tick_resume(void); | 25 | extern void tick_resume(void); | 
| 30 | extern bool tick_check_replacement(struct clock_event_device *curdev, | 26 | extern bool tick_check_replacement(struct clock_event_device *curdev, | 
| 31 | struct clock_event_device *newdev); | 27 | struct clock_event_device *newdev); | 
| 32 | extern void tick_install_replacement(struct clock_event_device *dev); | 28 | extern void tick_install_replacement(struct clock_event_device *dev); | 
| 29 | extern int tick_is_oneshot_available(void); | ||
| 30 | extern struct tick_device *tick_get_device(int cpu); | ||
| 33 | 31 | ||
| 34 | extern void clockevents_shutdown(struct clock_event_device *dev); | 32 | extern int clockevents_tick_resume(struct clock_event_device *dev); | 
| 33 | /* Check, if the device is functional or a dummy for broadcast */ | ||
| 34 | static inline int tick_device_is_functional(struct clock_event_device *dev) | ||
| 35 | { | ||
| 36 | return !(dev->features & CLOCK_EVT_FEAT_DUMMY); | ||
| 37 | } | ||
| 35 | 38 | ||
| 39 | extern void clockevents_shutdown(struct clock_event_device *dev); | ||
| 40 | extern void clockevents_exchange_device(struct clock_event_device *old, | ||
| 41 | struct clock_event_device *new); | ||
| 42 | extern void clockevents_set_state(struct clock_event_device *dev, | ||
| 43 | enum clock_event_state state); | ||
| 44 | extern int clockevents_program_event(struct clock_event_device *dev, | ||
| 45 | ktime_t expires, bool force); | ||
| 46 | extern void clockevents_handle_noop(struct clock_event_device *dev); | ||
| 47 | extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); | ||
| 36 | extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); | 48 | extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); | 
| 37 | 49 | ||
| 38 | /* | 50 | /* Broadcasting support */ | 
| 39 | * NO_HZ / high resolution timer shared code | 51 | # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST | 
| 40 | */ | 52 | extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); | 
| 53 | extern void tick_install_broadcast_device(struct clock_event_device *dev); | ||
| 54 | extern int tick_is_broadcast_device(struct clock_event_device *dev); | ||
| 55 | extern void tick_shutdown_broadcast(unsigned int cpu); | ||
| 56 | extern void tick_suspend_broadcast(void); | ||
| 57 | extern void tick_resume_broadcast(void); | ||
| 58 | extern bool tick_resume_check_broadcast(void); | ||
| 59 | extern void tick_broadcast_init(void); | ||
| 60 | extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); | ||
| 61 | extern int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq); | ||
| 62 | extern struct tick_device *tick_get_broadcast_device(void); | ||
| 63 | extern struct cpumask *tick_get_broadcast_mask(void); | ||
| 64 | # else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST: */ | ||
| 65 | static inline void tick_install_broadcast_device(struct clock_event_device *dev) { } | ||
| 66 | static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; } | ||
| 67 | static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; } | ||
| 68 | static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } | ||
| 69 | static inline void tick_shutdown_broadcast(unsigned int cpu) { } | ||
| 70 | static inline void tick_suspend_broadcast(void) { } | ||
| 71 | static inline void tick_resume_broadcast(void) { } | ||
| 72 | static inline bool tick_resume_check_broadcast(void) { return false; } | ||
| 73 | static inline void tick_broadcast_init(void) { } | ||
| 74 | static inline int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) { return -ENODEV; } | ||
| 75 | |||
| 76 | /* Set the periodic handler in non broadcast mode */ | ||
| 77 | static inline void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) | ||
| 78 | { | ||
| 79 | dev->event_handler = tick_handle_periodic; | ||
| 80 | } | ||
| 81 | # endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST */ | ||
| 82 | |||
| 83 | #else /* !GENERIC_CLOCKEVENTS: */ | ||
| 84 | static inline void tick_suspend(void) { } | ||
| 85 | static inline void tick_resume(void) { } | ||
| 86 | #endif /* !GENERIC_CLOCKEVENTS */ | ||
| 87 | |||
| 88 | /* Oneshot related functions */ | ||
| 41 | #ifdef CONFIG_TICK_ONESHOT | 89 | #ifdef CONFIG_TICK_ONESHOT | 
| 42 | extern void tick_setup_oneshot(struct clock_event_device *newdev, | 90 | extern void tick_setup_oneshot(struct clock_event_device *newdev, | 
| 43 | void (*handler)(struct clock_event_device *), | 91 | void (*handler)(struct clock_event_device *), | 
| @@ -46,58 +94,42 @@ extern int tick_program_event(ktime_t expires, int force); | |||
| 46 | extern void tick_oneshot_notify(void); | 94 | extern void tick_oneshot_notify(void); | 
| 47 | extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); | 95 | extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); | 
| 48 | extern void tick_resume_oneshot(void); | 96 | extern void tick_resume_oneshot(void); | 
| 49 | # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST | 97 | static inline bool tick_oneshot_possible(void) { return true; } | 
| 98 | extern int tick_oneshot_mode_active(void); | ||
| 99 | extern void tick_clock_notify(void); | ||
| 100 | extern int tick_check_oneshot_change(int allow_nohz); | ||
| 101 | extern int tick_init_highres(void); | ||
| 102 | #else /* !CONFIG_TICK_ONESHOT: */ | ||
| 103 | static inline | ||
| 104 | void tick_setup_oneshot(struct clock_event_device *newdev, | ||
| 105 | void (*handler)(struct clock_event_device *), | ||
| 106 | ktime_t nextevt) { BUG(); } | ||
| 107 | static inline void tick_resume_oneshot(void) { BUG(); } | ||
| 108 | static inline int tick_program_event(ktime_t expires, int force) { return 0; } | ||
| 109 | static inline void tick_oneshot_notify(void) { } | ||
| 110 | static inline bool tick_oneshot_possible(void) { return false; } | ||
| 111 | static inline int tick_oneshot_mode_active(void) { return 0; } | ||
| 112 | static inline void tick_clock_notify(void) { } | ||
| 113 | static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } | ||
| 114 | #endif /* !CONFIG_TICK_ONESHOT */ | ||
| 115 | |||
| 116 | /* Functions related to oneshot broadcasting */ | ||
| 117 | #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) | ||
| 50 | extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); | 118 | extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); | 
| 51 | extern int tick_broadcast_oneshot_control(unsigned long reason); | ||
| 52 | extern void tick_broadcast_switch_to_oneshot(void); | 119 | extern void tick_broadcast_switch_to_oneshot(void); | 
| 53 | extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); | 120 | extern void tick_shutdown_broadcast_oneshot(unsigned int cpu); | 
| 54 | extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); | ||
| 55 | extern int tick_broadcast_oneshot_active(void); | 121 | extern int tick_broadcast_oneshot_active(void); | 
| 56 | extern void tick_check_oneshot_broadcast_this_cpu(void); | 122 | extern void tick_check_oneshot_broadcast_this_cpu(void); | 
| 57 | bool tick_broadcast_oneshot_available(void); | 123 | bool tick_broadcast_oneshot_available(void); | 
| 58 | # else /* BROADCAST */ | 124 | extern struct cpumask *tick_get_broadcast_oneshot_mask(void); | 
| 59 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | 125 | #else /* !(BROADCAST && ONESHOT): */ | 
| 60 | { | 126 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } | 
| 61 | BUG(); | ||
| 62 | } | ||
| 63 | static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } | ||
| 64 | static inline void tick_broadcast_switch_to_oneshot(void) { } | 127 | static inline void tick_broadcast_switch_to_oneshot(void) { } | 
| 65 | static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } | 128 | static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { } | 
| 66 | static inline int tick_broadcast_oneshot_active(void) { return 0; } | 129 | static inline int tick_broadcast_oneshot_active(void) { return 0; } | 
| 67 | static inline void tick_check_oneshot_broadcast_this_cpu(void) { } | 130 | static inline void tick_check_oneshot_broadcast_this_cpu(void) { } | 
| 68 | static inline bool tick_broadcast_oneshot_available(void) { return true; } | 131 | static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } | 
| 69 | # endif /* !BROADCAST */ | 132 | #endif /* !(BROADCAST && ONESHOT) */ | 
| 70 | |||
| 71 | #else /* !ONESHOT */ | ||
| 72 | static inline | ||
| 73 | void tick_setup_oneshot(struct clock_event_device *newdev, | ||
| 74 | void (*handler)(struct clock_event_device *), | ||
| 75 | ktime_t nextevt) | ||
| 76 | { | ||
| 77 | BUG(); | ||
| 78 | } | ||
| 79 | static inline void tick_resume_oneshot(void) | ||
| 80 | { | ||
| 81 | BUG(); | ||
| 82 | } | ||
| 83 | static inline int tick_program_event(ktime_t expires, int force) | ||
| 84 | { | ||
| 85 | return 0; | ||
| 86 | } | ||
| 87 | static inline void tick_oneshot_notify(void) { } | ||
| 88 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | ||
| 89 | { | ||
| 90 | BUG(); | ||
| 91 | } | ||
| 92 | static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } | ||
| 93 | static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } | ||
| 94 | static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) | ||
| 95 | { | ||
| 96 | return 0; | ||
| 97 | } | ||
| 98 | static inline int tick_broadcast_oneshot_active(void) { return 0; } | ||
| 99 | static inline bool tick_broadcast_oneshot_available(void) { return false; } | ||
| 100 | #endif /* !TICK_ONESHOT */ | ||
| 101 | 133 | ||
| 102 | /* NO_HZ_FULL internal */ | 134 | /* NO_HZ_FULL internal */ | 
| 103 | #ifdef CONFIG_NO_HZ_FULL | 135 | #ifdef CONFIG_NO_HZ_FULL | 
| @@ -105,68 +137,3 @@ extern void tick_nohz_init(void); | |||
| 105 | # else | 137 | # else | 
| 106 | static inline void tick_nohz_init(void) { } | 138 | static inline void tick_nohz_init(void) { } | 
| 107 | #endif | 139 | #endif | 
| 108 | |||
| 109 | /* | ||
| 110 | * Broadcasting support | ||
| 111 | */ | ||
| 112 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST | ||
| 113 | extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); | ||
| 114 | extern void tick_install_broadcast_device(struct clock_event_device *dev); | ||
| 115 | extern int tick_is_broadcast_device(struct clock_event_device *dev); | ||
| 116 | extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); | ||
| 117 | extern void tick_shutdown_broadcast(unsigned int *cpup); | ||
| 118 | extern void tick_suspend_broadcast(void); | ||
| 119 | extern int tick_resume_broadcast(void); | ||
| 120 | extern void tick_broadcast_init(void); | ||
| 121 | extern void | ||
| 122 | tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); | ||
| 123 | int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq); | ||
| 124 | |||
| 125 | #else /* !BROADCAST */ | ||
| 126 | |||
| 127 | static inline void tick_install_broadcast_device(struct clock_event_device *dev) | ||
| 128 | { | ||
| 129 | } | ||
| 130 | |||
| 131 | static inline int tick_is_broadcast_device(struct clock_event_device *dev) | ||
| 132 | { | ||
| 133 | return 0; | ||
| 134 | } | ||
| 135 | static inline int tick_device_uses_broadcast(struct clock_event_device *dev, | ||
| 136 | int cpu) | ||
| 137 | { | ||
| 138 | return 0; | ||
| 139 | } | ||
| 140 | static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } | ||
| 141 | static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { } | ||
| 142 | static inline void tick_shutdown_broadcast(unsigned int *cpup) { } | ||
| 143 | static inline void tick_suspend_broadcast(void) { } | ||
| 144 | static inline int tick_resume_broadcast(void) { return 0; } | ||
| 145 | static inline void tick_broadcast_init(void) { } | ||
| 146 | static inline int tick_broadcast_update_freq(struct clock_event_device *dev, | ||
| 147 | u32 freq) { return -ENODEV; } | ||
| 148 | |||
| 149 | /* | ||
| 150 | * Set the periodic handler in non broadcast mode | ||
| 151 | */ | ||
| 152 | static inline void tick_set_periodic_handler(struct clock_event_device *dev, | ||
| 153 | int broadcast) | ||
| 154 | { | ||
| 155 | dev->event_handler = tick_handle_periodic; | ||
| 156 | } | ||
| 157 | #endif /* !BROADCAST */ | ||
| 158 | |||
| 159 | /* | ||
| 160 | * Check, if the device is functional or a dummy for broadcast | ||
| 161 | */ | ||
| 162 | static inline int tick_device_is_functional(struct clock_event_device *dev) | ||
| 163 | { | ||
| 164 | return !(dev->features & CLOCK_EVT_FEAT_DUMMY); | ||
| 165 | } | ||
| 166 | |||
| 167 | int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); | ||
| 168 | |||
| 169 | #endif | ||
| 170 | |||
| 171 | extern void do_timer(unsigned long ticks); | ||
| 172 | extern void update_wall_time(void); | ||
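The tick-internal.h reshuffle above groups every oneshot-dependent declaration under CONFIG_TICK_ONESHOT and lets the stub of tick_broadcast_oneshot_available() defer to the new tick_oneshot_possible() helper. A minimal standalone sketch of that stub pattern follows; the config macro and helpers are modeled locally here, not pulled from the kernel headers.

#include <stdbool.h>
#include <stdio.h>

#ifdef CONFIG_TICK_ONESHOT
static bool tick_oneshot_possible(void) { return true; }
#else
static bool tick_oneshot_possible(void) { return false; }
#endif

/* Stub used when broadcast + oneshot support is not built in: availability
 * simply collapses to "is oneshot mode compiled in at all?". */
static bool tick_broadcast_oneshot_available(void)
{
	return tick_oneshot_possible();
}

int main(void)
{
	/* Callers test availability unconditionally; the #ifdefs stay in the header. */
	if (tick_broadcast_oneshot_available())
		printf("oneshot broadcast can be used\n");
	else
		printf("stay in periodic mode\n");
	return 0;
}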
| diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 7ce740e78e1b..67a64b1670bf 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c | |||
| @@ -38,7 +38,7 @@ void tick_resume_oneshot(void) | |||
| 38 | { | 38 | { | 
| 39 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); | 39 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); | 
| 40 | 40 | ||
| 41 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 41 | clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); | 
| 42 | clockevents_program_event(dev, ktime_get(), true); | 42 | clockevents_program_event(dev, ktime_get(), true); | 
| 43 | } | 43 | } | 
| 44 | 44 | ||
| @@ -50,7 +50,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, | |||
| 50 | ktime_t next_event) | 50 | ktime_t next_event) | 
| 51 | { | 51 | { | 
| 52 | newdev->event_handler = handler; | 52 | newdev->event_handler = handler; | 
| 53 | clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); | 53 | clockevents_set_state(newdev, CLOCK_EVT_STATE_ONESHOT); | 
| 54 | clockevents_program_event(newdev, next_event, true); | 54 | clockevents_program_event(newdev, next_event, true); | 
| 55 | } | 55 | } | 
| 56 | 56 | ||
| @@ -81,7 +81,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) | |||
| 81 | 81 | ||
| 82 | td->mode = TICKDEV_MODE_ONESHOT; | 82 | td->mode = TICKDEV_MODE_ONESHOT; | 
| 83 | dev->event_handler = handler; | 83 | dev->event_handler = handler; | 
| 84 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 84 | clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); | 
| 85 | tick_broadcast_switch_to_oneshot(); | 85 | tick_broadcast_switch_to_oneshot(); | 
| 86 | return 0; | 86 | return 0; | 
| 87 | } | 87 | } | 
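The tick-oneshot.c hunks only swap clockevents_set_mode()/CLOCK_EVT_MODE_ONESHOT for clockevents_set_state()/CLOCK_EVT_STATE_ONESHOT; the setup sequence itself is unchanged. Below is a toy model of that sequence with local stand-in types, not the kernel clockevents API.

#include <stdint.h>
#include <stdio.h>

enum toy_clock_state { TOY_STATE_DETACHED, TOY_STATE_PERIODIC, TOY_STATE_ONESHOT };

struct toy_clock_event {
	void (*event_handler)(struct toy_clock_event *);
	enum toy_clock_state state;
	int64_t next_event_ns;
};

/* Stand-in for clockevents_set_state() */
static void toy_set_state(struct toy_clock_event *dev, enum toy_clock_state s)
{
	dev->state = s;
}

/* Stand-in for clockevents_program_event() */
static void toy_program_event(struct toy_clock_event *dev, int64_t expires_ns)
{
	dev->next_event_ns = expires_ns;
}

static void toy_tick_handler(struct toy_clock_event *dev)
{
	(void)dev;
}

int main(void)
{
	struct toy_clock_event dev = { 0 };

	/* Mirrors tick_setup_oneshot(): install handler, switch the device
	 * to the oneshot state, program the first expiry. */
	dev.event_handler = toy_tick_handler;
	toy_set_state(&dev, TOY_STATE_ONESHOT);
	toy_program_event(&dev, 1000000);	/* 1 ms from "now" */

	printf("state=%d next=%lld ns\n", dev.state, (long long)dev.next_event_ns);
	return 0;
}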
| diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a4c4edac4528..914259128145 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -34,7 +34,7 @@ | |||
| 34 | /* | 34 | /* | 
| 35 | * Per cpu nohz control structure | 35 | * Per cpu nohz control structure | 
| 36 | */ | 36 | */ | 
| 37 | DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); | 37 | static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); | 
| 38 | 38 | ||
| 39 | /* | 39 | /* | 
| 40 | * The time, when the last jiffy update happened. Protected by jiffies_lock. | 40 | * The time, when the last jiffy update happened. Protected by jiffies_lock. | 
| @@ -416,6 +416,11 @@ static int __init setup_tick_nohz(char *str) | |||
| 416 | 416 | ||
| 417 | __setup("nohz=", setup_tick_nohz); | 417 | __setup("nohz=", setup_tick_nohz); | 
| 418 | 418 | ||
| 419 | int tick_nohz_tick_stopped(void) | ||
| 420 | { | ||
| 421 | return __this_cpu_read(tick_cpu_sched.tick_stopped); | ||
| 422 | } | ||
| 423 | |||
| 419 | /** | 424 | /** | 
| 420 | * tick_nohz_update_jiffies - update jiffies when idle was interrupted | 425 | * tick_nohz_update_jiffies - update jiffies when idle was interrupted | 
| 421 | * | 426 | * | 
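With tick_cpu_sched made static, code outside tick-sched.c reads the per-CPU "tick stopped" flag through the new tick_nohz_tick_stopped() accessor. A standalone sketch of that accessor-over-private-state pattern, with the per-CPU machinery reduced to a plain array:

#include <stdio.h>

#define NR_CPUS 4

struct tick_sched_model { int tick_stopped; };

/* Private to this translation unit, as the patch makes tick_cpu_sched. */
static struct tick_sched_model tick_cpu_sched_model[NR_CPUS];

static int this_cpu(void) { return 0; }	/* stand-in for smp_processor_id() */

int tick_nohz_tick_stopped_model(void)
{
	return tick_cpu_sched_model[this_cpu()].tick_stopped;
}

int main(void)
{
	tick_cpu_sched_model[this_cpu()].tick_stopped = 1;
	printf("tick stopped on this cpu: %d\n", tick_nohz_tick_stopped_model());
	return 0;
}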
| diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h new file mode 100644 index 000000000000..28b5da3e1a17 --- /dev/null +++ b/kernel/time/tick-sched.h | |||
| @@ -0,0 +1,74 @@ | |||
| 1 | #ifndef _TICK_SCHED_H | ||
| 2 | #define _TICK_SCHED_H | ||
| 3 | |||
| 4 | #include <linux/hrtimer.h> | ||
| 5 | |||
| 6 | enum tick_device_mode { | ||
| 7 | TICKDEV_MODE_PERIODIC, | ||
| 8 | TICKDEV_MODE_ONESHOT, | ||
| 9 | }; | ||
| 10 | |||
| 11 | struct tick_device { | ||
| 12 | struct clock_event_device *evtdev; | ||
| 13 | enum tick_device_mode mode; | ||
| 14 | }; | ||
| 15 | |||
| 16 | enum tick_nohz_mode { | ||
| 17 | NOHZ_MODE_INACTIVE, | ||
| 18 | NOHZ_MODE_LOWRES, | ||
| 19 | NOHZ_MODE_HIGHRES, | ||
| 20 | }; | ||
| 21 | |||
| 22 | /** | ||
| 23 | * struct tick_sched - sched tick emulation and no idle tick control/stats | ||
| 24 | * @sched_timer: hrtimer to schedule the periodic tick in high | ||
| 25 | * resolution mode | ||
| 26 | * @last_tick: Store the last tick expiry time when the tick | ||
| 27 | * timer is modified for nohz sleeps. This is necessary | ||
| 28 | * to resume the tick timer operation in the timeline | ||
| 29 | * when the CPU returns from nohz sleep. | ||
| 30 | * @tick_stopped: Indicator that the idle tick has been stopped | ||
| 31 | * @idle_jiffies: jiffies at the entry to idle for idle time accounting | ||
| 32 | * @idle_calls: Total number of idle calls | ||
| 33 | * @idle_sleeps: Number of idle calls, where the sched tick was stopped | ||
| 34 | * @idle_entrytime: Time when the idle call was entered | ||
| 35 | * @idle_waketime: Time when the idle was interrupted | ||
| 36 | * @idle_exittime: Time when the idle state was left | ||
| 37 | * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped | ||
| 38 | * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding | ||
| 39 | * @sleep_length: Duration of the current idle sleep | ||
| 40 | * @do_timer_last: CPU was the last one doing do_timer before going idle | ||
| 41 | */ | ||
| 42 | struct tick_sched { | ||
| 43 | struct hrtimer sched_timer; | ||
| 44 | unsigned long check_clocks; | ||
| 45 | enum tick_nohz_mode nohz_mode; | ||
| 46 | ktime_t last_tick; | ||
| 47 | int inidle; | ||
| 48 | int tick_stopped; | ||
| 49 | unsigned long idle_jiffies; | ||
| 50 | unsigned long idle_calls; | ||
| 51 | unsigned long idle_sleeps; | ||
| 52 | int idle_active; | ||
| 53 | ktime_t idle_entrytime; | ||
| 54 | ktime_t idle_waketime; | ||
| 55 | ktime_t idle_exittime; | ||
| 56 | ktime_t idle_sleeptime; | ||
| 57 | ktime_t iowait_sleeptime; | ||
| 58 | ktime_t sleep_length; | ||
| 59 | unsigned long last_jiffies; | ||
| 60 | unsigned long next_jiffies; | ||
| 61 | ktime_t idle_expires; | ||
| 62 | int do_timer_last; | ||
| 63 | }; | ||
| 64 | |||
| 65 | extern struct tick_sched *tick_get_tick_sched(int cpu); | ||
| 66 | |||
| 67 | extern void tick_setup_sched_timer(void); | ||
| 68 | #if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS | ||
| 69 | extern void tick_cancel_sched_timer(int cpu); | ||
| 70 | #else | ||
| 71 | static inline void tick_cancel_sched_timer(int cpu) { } | ||
| 72 | #endif | ||
| 73 | |||
| 74 | #endif | ||
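The new private header gives kernel/time a single home for struct tick_sched and its idle statistics. As a small illustration, here is a toy reader of two of the documented counters; the structure is a reduced local copy, not the header itself.

#include <stdio.h>

struct tick_sched_stats {
	unsigned long idle_calls;	/* total idle entries (@idle_calls) */
	unsigned long idle_sleeps;	/* idle entries with the tick stopped (@idle_sleeps) */
};

static double stopped_ratio(const struct tick_sched_stats *ts)
{
	if (!ts->idle_calls)
		return 0.0;
	return (double)ts->idle_sleeps / (double)ts->idle_calls;
}

int main(void)
{
	struct tick_sched_stats ts = { .idle_calls = 1000, .idle_sleeps = 850 };

	printf("tick stopped in %.0f%% of idle entries\n", 100.0 * stopped_ratio(&ts));
	return 0;
}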
| diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 91db94136c10..946acb72179f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -59,17 +59,15 @@ struct tk_fast { | |||
| 59 | }; | 59 | }; | 
| 60 | 60 | ||
| 61 | static struct tk_fast tk_fast_mono ____cacheline_aligned; | 61 | static struct tk_fast tk_fast_mono ____cacheline_aligned; | 
| 62 | static struct tk_fast tk_fast_raw ____cacheline_aligned; | ||
| 62 | 63 | ||
| 63 | /* flag for if timekeeping is suspended */ | 64 | /* flag for if timekeeping is suspended */ | 
| 64 | int __read_mostly timekeeping_suspended; | 65 | int __read_mostly timekeeping_suspended; | 
| 65 | 66 | ||
| 66 | /* Flag for if there is a persistent clock on this platform */ | ||
| 67 | bool __read_mostly persistent_clock_exist = false; | ||
| 68 | |||
| 69 | static inline void tk_normalize_xtime(struct timekeeper *tk) | 67 | static inline void tk_normalize_xtime(struct timekeeper *tk) | 
| 70 | { | 68 | { | 
| 71 | while (tk->tkr.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr.shift)) { | 69 | while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) { | 
| 72 | tk->tkr.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr.shift; | 70 | tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift; | 
| 73 | tk->xtime_sec++; | 71 | tk->xtime_sec++; | 
| 74 | } | 72 | } | 
| 75 | } | 73 | } | 
| @@ -79,20 +77,20 @@ static inline struct timespec64 tk_xtime(struct timekeeper *tk) | |||
| 79 | struct timespec64 ts; | 77 | struct timespec64 ts; | 
| 80 | 78 | ||
| 81 | ts.tv_sec = tk->xtime_sec; | 79 | ts.tv_sec = tk->xtime_sec; | 
| 82 | ts.tv_nsec = (long)(tk->tkr.xtime_nsec >> tk->tkr.shift); | 80 | ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); | 
| 83 | return ts; | 81 | return ts; | 
| 84 | } | 82 | } | 
| 85 | 83 | ||
| 86 | static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) | 84 | static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) | 
| 87 | { | 85 | { | 
| 88 | tk->xtime_sec = ts->tv_sec; | 86 | tk->xtime_sec = ts->tv_sec; | 
| 89 | tk->tkr.xtime_nsec = (u64)ts->tv_nsec << tk->tkr.shift; | 87 | tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift; | 
| 90 | } | 88 | } | 
| 91 | 89 | ||
| 92 | static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) | 90 | static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) | 
| 93 | { | 91 | { | 
| 94 | tk->xtime_sec += ts->tv_sec; | 92 | tk->xtime_sec += ts->tv_sec; | 
| 95 | tk->tkr.xtime_nsec += (u64)ts->tv_nsec << tk->tkr.shift; | 93 | tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift; | 
| 96 | tk_normalize_xtime(tk); | 94 | tk_normalize_xtime(tk); | 
| 97 | } | 95 | } | 
| 98 | 96 | ||
| @@ -118,6 +116,117 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) | |||
| 118 | tk->offs_boot = ktime_add(tk->offs_boot, delta); | 116 | tk->offs_boot = ktime_add(tk->offs_boot, delta); | 
| 119 | } | 117 | } | 
| 120 | 118 | ||
| 119 | #ifdef CONFIG_DEBUG_TIMEKEEPING | ||
| 120 | #define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ | ||
| 121 | /* | ||
| 122 | * These simple flag variables are managed | ||
| 123 | * without locks, which is racy, but ok since | ||
| 124 | * we don't really care about being super | ||
| 125 | * precise about how many events were seen, | ||
| 126 | * just that a problem was observed. | ||
| 127 | */ | ||
| 128 | static int timekeeping_underflow_seen; | ||
| 129 | static int timekeeping_overflow_seen; | ||
| 130 | |||
| 131 | /* last_warning is only modified under the timekeeping lock */ | ||
| 132 | static long timekeeping_last_warning; | ||
| 133 | |||
| 134 | static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) | ||
| 135 | { | ||
| 136 | |||
| 137 | cycle_t max_cycles = tk->tkr_mono.clock->max_cycles; | ||
| 138 | const char *name = tk->tkr_mono.clock->name; | ||
| 139 | |||
| 140 | if (offset > max_cycles) { | ||
| 141 | printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n", | ||
| 142 | offset, name, max_cycles); | ||
| 143 | printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n"); | ||
| 144 | } else { | ||
| 145 | if (offset > (max_cycles >> 1)) { | ||
| 146 | printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the the '%s' clock's 50%% safety margin (%lld)\n", | ||
| 147 | offset, name, max_cycles >> 1); | ||
| 148 | printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); | ||
| 149 | } | ||
| 150 | } | ||
| 151 | |||
| 152 | if (timekeeping_underflow_seen) { | ||
| 153 | if (jiffies - timekeeping_last_warning > WARNING_FREQ) { | ||
| 154 | printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name); | ||
| 155 | printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); | ||
| 156 | printk_deferred(" Your kernel is probably still fine.\n"); | ||
| 157 | timekeeping_last_warning = jiffies; | ||
| 158 | } | ||
| 159 | timekeeping_underflow_seen = 0; | ||
| 160 | } | ||
| 161 | |||
| 162 | if (timekeeping_overflow_seen) { | ||
| 163 | if (jiffies - timekeeping_last_warning > WARNING_FREQ) { | ||
| 164 | printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name); | ||
| 165 | printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); | ||
| 166 | printk_deferred(" Your kernel is probably still fine.\n"); | ||
| 167 | timekeeping_last_warning = jiffies; | ||
| 168 | } | ||
| 169 | timekeeping_overflow_seen = 0; | ||
| 170 | } | ||
| 171 | } | ||
| 172 | |||
| 173 | static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) | ||
| 174 | { | ||
| 175 | cycle_t now, last, mask, max, delta; | ||
| 176 | unsigned int seq; | ||
| 177 | |||
| 178 | /* | ||
| 179 | * Since we're called holding a seqlock, the data may shift | ||
| 180 | * under us while we're doing the calculation. This can cause | ||
| 181 | * false positives, since we'd note a problem but throw the | ||
| 182 | * results away. So nest another seqlock here to atomically | ||
| 183 | * grab the points we are checking with. | ||
| 184 | */ | ||
| 185 | do { | ||
| 186 | seq = read_seqcount_begin(&tk_core.seq); | ||
| 187 | now = tkr->read(tkr->clock); | ||
| 188 | last = tkr->cycle_last; | ||
| 189 | mask = tkr->mask; | ||
| 190 | max = tkr->clock->max_cycles; | ||
| 191 | } while (read_seqcount_retry(&tk_core.seq, seq)); | ||
| 192 | |||
| 193 | delta = clocksource_delta(now, last, mask); | ||
| 194 | |||
| 195 | /* | ||
| 196 | * Try to catch underflows by checking if we are seeing small | ||
| 197 | * mask-relative negative values. | ||
| 198 | */ | ||
| 199 | if (unlikely((~delta & mask) < (mask >> 3))) { | ||
| 200 | timekeeping_underflow_seen = 1; | ||
| 201 | delta = 0; | ||
| 202 | } | ||
| 203 | |||
| 204 | /* Cap delta value to the max_cycles values to avoid mult overflows */ | ||
| 205 | if (unlikely(delta > max)) { | ||
| 206 | timekeeping_overflow_seen = 1; | ||
| 207 | delta = tkr->clock->max_cycles; | ||
| 208 | } | ||
| 209 | |||
| 210 | return delta; | ||
| 211 | } | ||
| 212 | #else | ||
| 213 | static inline void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) | ||
| 214 | { | ||
| 215 | } | ||
| 216 | static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) | ||
| 217 | { | ||
| 218 | cycle_t cycle_now, delta; | ||
| 219 | |||
| 220 | /* read clocksource */ | ||
| 221 | cycle_now = tkr->read(tkr->clock); | ||
| 222 | |||
| 223 | /* calculate the delta since the last update_wall_time */ | ||
| 224 | delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); | ||
| 225 | |||
| 226 | return delta; | ||
| 227 | } | ||
| 228 | #endif | ||
| 229 | |||
| 121 | /** | 230 | /** | 
| 122 | * tk_setup_internals - Set up internals to use clocksource clock. | 231 | * tk_setup_internals - Set up internals to use clocksource clock. | 
| 123 | * | 232 | * | 
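The CONFIG_DEBUG_TIMEKEEPING block above flags a suspected clocksource underflow when the mask-relative delta is "slightly negative" (its complement under the mask is small) and caps any delta above max_cycles before it can overflow the later delta * mult. A standalone demonstration of both checks on a toy 32-bit counter:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t cycle_t;

static cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
{
	return (now - last) & mask;
}

static cycle_t checked_delta(cycle_t now, cycle_t last, cycle_t mask,
			     cycle_t max_cycles)
{
	cycle_t delta = clocksource_delta(now, last, mask);

	/* A slightly "negative" delta wraps to a value just below the mask,
	 * so its complement under the mask is small: treat it as an
	 * underflow and throw the update away. */
	if ((~delta & mask) < (mask >> 3)) {
		printf("underflow suspected, delta ignored\n");
		return 0;
	}

	/* Cap runaway deltas so the later delta * mult cannot overflow. */
	if (delta > max_cycles) {
		printf("overflow risk, delta capped\n");
		return max_cycles;
	}
	return delta;
}

int main(void)
{
	cycle_t mask = 0xffffffffULL;		/* toy 32-bit clocksource */
	cycle_t max_cycles = 0x10000000ULL;	/* arbitrary example cap */

	checked_delta(100, 200, mask, max_cycles);		/* counter "went backwards" */
	checked_delta(0x30000000ULL, 0, mask, max_cycles);	/* suspiciously large gap */
	return 0;
}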
| @@ -135,11 +244,16 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) | |||
| 135 | u64 tmp, ntpinterval; | 244 | u64 tmp, ntpinterval; | 
| 136 | struct clocksource *old_clock; | 245 | struct clocksource *old_clock; | 
| 137 | 246 | ||
| 138 | old_clock = tk->tkr.clock; | 247 | old_clock = tk->tkr_mono.clock; | 
| 139 | tk->tkr.clock = clock; | 248 | tk->tkr_mono.clock = clock; | 
| 140 | tk->tkr.read = clock->read; | 249 | tk->tkr_mono.read = clock->read; | 
| 141 | tk->tkr.mask = clock->mask; | 250 | tk->tkr_mono.mask = clock->mask; | 
| 142 | tk->tkr.cycle_last = tk->tkr.read(clock); | 251 | tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock); | 
| 252 | |||
| 253 | tk->tkr_raw.clock = clock; | ||
| 254 | tk->tkr_raw.read = clock->read; | ||
| 255 | tk->tkr_raw.mask = clock->mask; | ||
| 256 | tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last; | ||
| 143 | 257 | ||
| 144 | /* Do the ns -> cycle conversion first, using original mult */ | 258 | /* Do the ns -> cycle conversion first, using original mult */ | 
| 145 | tmp = NTP_INTERVAL_LENGTH; | 259 | tmp = NTP_INTERVAL_LENGTH; | 
| @@ -163,11 +277,14 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) | |||
| 163 | if (old_clock) { | 277 | if (old_clock) { | 
| 164 | int shift_change = clock->shift - old_clock->shift; | 278 | int shift_change = clock->shift - old_clock->shift; | 
| 165 | if (shift_change < 0) | 279 | if (shift_change < 0) | 
| 166 | tk->tkr.xtime_nsec >>= -shift_change; | 280 | tk->tkr_mono.xtime_nsec >>= -shift_change; | 
| 167 | else | 281 | else | 
| 168 | tk->tkr.xtime_nsec <<= shift_change; | 282 | tk->tkr_mono.xtime_nsec <<= shift_change; | 
| 169 | } | 283 | } | 
| 170 | tk->tkr.shift = clock->shift; | 284 | tk->tkr_raw.xtime_nsec = 0; | 
| 285 | |||
| 286 | tk->tkr_mono.shift = clock->shift; | ||
| 287 | tk->tkr_raw.shift = clock->shift; | ||
| 171 | 288 | ||
| 172 | tk->ntp_error = 0; | 289 | tk->ntp_error = 0; | 
| 173 | tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; | 290 | tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; | 
| @@ -178,7 +295,8 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) | |||
| 178 | * active clocksource. These value will be adjusted via NTP | 295 | * active clocksource. These value will be adjusted via NTP | 
| 179 | * to counteract clock drifting. | 296 | * to counteract clock drifting. | 
| 180 | */ | 297 | */ | 
| 181 | tk->tkr.mult = clock->mult; | 298 | tk->tkr_mono.mult = clock->mult; | 
| 299 | tk->tkr_raw.mult = clock->mult; | ||
| 182 | tk->ntp_err_mult = 0; | 300 | tk->ntp_err_mult = 0; | 
| 183 | } | 301 | } | 
| 184 | 302 | ||
| @@ -193,14 +311,10 @@ static inline u32 arch_gettimeoffset(void) { return 0; } | |||
| 193 | 311 | ||
| 194 | static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) | 312 | static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) | 
| 195 | { | 313 | { | 
| 196 | cycle_t cycle_now, delta; | 314 | cycle_t delta; | 
| 197 | s64 nsec; | 315 | s64 nsec; | 
| 198 | 316 | ||
| 199 | /* read clocksource: */ | 317 | delta = timekeeping_get_delta(tkr); | 
| 200 | cycle_now = tkr->read(tkr->clock); | ||
| 201 | |||
| 202 | /* calculate the delta since the last update_wall_time: */ | ||
| 203 | delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); | ||
| 204 | 318 | ||
| 205 | nsec = delta * tkr->mult + tkr->xtime_nsec; | 319 | nsec = delta * tkr->mult + tkr->xtime_nsec; | 
| 206 | nsec >>= tkr->shift; | 320 | nsec >>= tkr->shift; | 
| @@ -209,25 +323,6 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) | |||
| 209 | return nsec + arch_gettimeoffset(); | 323 | return nsec + arch_gettimeoffset(); | 
| 210 | } | 324 | } | 
| 211 | 325 | ||
| 212 | static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) | ||
| 213 | { | ||
| 214 | struct clocksource *clock = tk->tkr.clock; | ||
| 215 | cycle_t cycle_now, delta; | ||
| 216 | s64 nsec; | ||
| 217 | |||
| 218 | /* read clocksource: */ | ||
| 219 | cycle_now = tk->tkr.read(clock); | ||
| 220 | |||
| 221 | /* calculate the delta since the last update_wall_time: */ | ||
| 222 | delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); | ||
| 223 | |||
| 224 | /* convert delta to nanoseconds. */ | ||
| 225 | nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); | ||
| 226 | |||
| 227 | /* If arch requires, add in get_arch_timeoffset() */ | ||
| 228 | return nsec + arch_gettimeoffset(); | ||
| 229 | } | ||
| 230 | |||
| 231 | /** | 326 | /** | 
| 232 | * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. | 327 | * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. | 
| 233 | * @tkr: Timekeeping readout base from which we take the update | 328 | * @tkr: Timekeeping readout base from which we take the update | 
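With both readouts described by a struct tk_read_base, a single timekeeping_get_ns() now performs the cycles-to-nanoseconds conversion nsec = (delta * mult + xtime_nsec) >> shift for either base. A worked standalone version of that conversion, using a reduced local struct and illustrative scaling values:

#include <inttypes.h>
#include <stdio.h>

struct tk_read_base_model {
	uint64_t cycle_last;	/* counter value at the last update */
	uint64_t mask;		/* counter width mask */
	uint32_t mult;		/* cycles -> shifted-ns multiplier */
	uint32_t shift;		/* fixed-point shift */
	uint64_t xtime_nsec;	/* accumulated shifted nanoseconds */
};

static int64_t get_ns(const struct tk_read_base_model *tkr, uint64_t now)
{
	uint64_t delta = (now - tkr->cycle_last) & tkr->mask;

	/* nsec = (delta * mult + xtime_nsec) >> shift */
	return (int64_t)((delta * tkr->mult + tkr->xtime_nsec) >> tkr->shift);
}

int main(void)
{
	/* Example scaling: mult / 2^shift = 1000, i.e. a 1 MHz counter. */
	struct tk_read_base_model tkr = {
		.cycle_last = 1000, .mask = 0xffffffffULL,
		.mult = 1000 << 8, .shift = 8, .xtime_nsec = 0,
	};

	printf("%" PRId64 " ns since the last update\n", get_ns(&tkr, 1500));
	return 0;
}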
| @@ -267,18 +362,18 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) | |||
| 267 | * slightly wrong timestamp (a few nanoseconds). See | 362 | * slightly wrong timestamp (a few nanoseconds). See | 
| 268 | * @ktime_get_mono_fast_ns. | 363 | * @ktime_get_mono_fast_ns. | 
| 269 | */ | 364 | */ | 
| 270 | static void update_fast_timekeeper(struct tk_read_base *tkr) | 365 | static void update_fast_timekeeper(struct tk_read_base *tkr, struct tk_fast *tkf) | 
| 271 | { | 366 | { | 
| 272 | struct tk_read_base *base = tk_fast_mono.base; | 367 | struct tk_read_base *base = tkf->base; | 
| 273 | 368 | ||
| 274 | /* Force readers off to base[1] */ | 369 | /* Force readers off to base[1] */ | 
| 275 | raw_write_seqcount_latch(&tk_fast_mono.seq); | 370 | raw_write_seqcount_latch(&tkf->seq); | 
| 276 | 371 | ||
| 277 | /* Update base[0] */ | 372 | /* Update base[0] */ | 
| 278 | memcpy(base, tkr, sizeof(*base)); | 373 | memcpy(base, tkr, sizeof(*base)); | 
| 279 | 374 | ||
| 280 | /* Force readers back to base[0] */ | 375 | /* Force readers back to base[0] */ | 
| 281 | raw_write_seqcount_latch(&tk_fast_mono.seq); | 376 | raw_write_seqcount_latch(&tkf->seq); | 
| 282 | 377 | ||
| 283 | /* Update base[1] */ | 378 | /* Update base[1] */ | 
| 284 | memcpy(base + 1, base, sizeof(*base)); | 379 | memcpy(base + 1, base, sizeof(*base)); | 
| @@ -316,22 +411,33 @@ static void update_fast_timekeeper(struct tk_read_base *tkr) | |||
| 316 | * of the following timestamps. Callers need to be aware of that and | 411 | * of the following timestamps. Callers need to be aware of that and | 
| 317 | * deal with it. | 412 | * deal with it. | 
| 318 | */ | 413 | */ | 
| 319 | u64 notrace ktime_get_mono_fast_ns(void) | 414 | static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) | 
| 320 | { | 415 | { | 
| 321 | struct tk_read_base *tkr; | 416 | struct tk_read_base *tkr; | 
| 322 | unsigned int seq; | 417 | unsigned int seq; | 
| 323 | u64 now; | 418 | u64 now; | 
| 324 | 419 | ||
| 325 | do { | 420 | do { | 
| 326 | seq = raw_read_seqcount(&tk_fast_mono.seq); | 421 | seq = raw_read_seqcount(&tkf->seq); | 
| 327 | tkr = tk_fast_mono.base + (seq & 0x01); | 422 | tkr = tkf->base + (seq & 0x01); | 
| 328 | now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr); | 423 | now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr); | 
| 424 | } while (read_seqcount_retry(&tkf->seq, seq)); | ||
| 329 | 425 | ||
| 330 | } while (read_seqcount_retry(&tk_fast_mono.seq, seq)); | ||
| 331 | return now; | 426 | return now; | 
| 332 | } | 427 | } | 
| 428 | |||
| 429 | u64 ktime_get_mono_fast_ns(void) | ||
| 430 | { | ||
| 431 | return __ktime_get_fast_ns(&tk_fast_mono); | ||
| 432 | } | ||
| 333 | EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); | 433 | EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); | 
| 334 | 434 | ||
| 435 | u64 ktime_get_raw_fast_ns(void) | ||
| 436 | { | ||
| 437 | return __ktime_get_fast_ns(&tk_fast_raw); | ||
| 438 | } | ||
| 439 | EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); | ||
| 440 | |||
| 335 | /* Suspend-time cycles value for halted fast timekeeper. */ | 441 | /* Suspend-time cycles value for halted fast timekeeper. */ | 
| 336 | static cycle_t cycles_at_suspend; | 442 | static cycle_t cycles_at_suspend; | 
| 337 | 443 | ||
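__ktime_get_fast_ns() reads whichever of the two tk_read_base copies the latch sequence points at (seq & 0x01) and retries if the writer ran in between, which is what makes the fast path usable from NMI context. A standalone sketch of the latch indexing follows; plain variables stand in for the kernel's seqcount primitives, so this toy only shows the logic and is not itself NMI- or SMP-safe.

#include <inttypes.h>
#include <stdio.h>
#include <string.h>

struct base_model { uint64_t offset_ns; };

static struct base_model bases[2];
static volatile unsigned int seq;

static void latch_update(const struct base_model *newb)
{
	seq++;					/* readers move to bases[1] */
	memcpy(&bases[0], newb, sizeof(*newb));
	seq++;					/* readers move back to bases[0] */
	memcpy(&bases[1], &bases[0], sizeof(bases[0]));
}

static uint64_t latch_read(uint64_t raw_counter_ns)
{
	unsigned int s;
	uint64_t now;

	do {
		s = seq;
		now = bases[s & 1].offset_ns + raw_counter_ns;
	} while (s != seq);			/* retry if the writer ran */

	return now;
}

int main(void)
{
	struct base_model b = { .offset_ns = 1000000 };

	latch_update(&b);
	printf("%" PRIu64 " ns\n", latch_read(2500));
	return 0;
}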
| @@ -353,12 +459,17 @@ static cycle_t dummy_clock_read(struct clocksource *cs) | |||
| 353 | static void halt_fast_timekeeper(struct timekeeper *tk) | 459 | static void halt_fast_timekeeper(struct timekeeper *tk) | 
| 354 | { | 460 | { | 
| 355 | static struct tk_read_base tkr_dummy; | 461 | static struct tk_read_base tkr_dummy; | 
| 356 | struct tk_read_base *tkr = &tk->tkr; | 462 | struct tk_read_base *tkr = &tk->tkr_mono; | 
| 357 | 463 | ||
| 358 | memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); | 464 | memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); | 
| 359 | cycles_at_suspend = tkr->read(tkr->clock); | 465 | cycles_at_suspend = tkr->read(tkr->clock); | 
| 360 | tkr_dummy.read = dummy_clock_read; | 466 | tkr_dummy.read = dummy_clock_read; | 
| 361 | update_fast_timekeeper(&tkr_dummy); | 467 | update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); | 
| 468 | |||
| 469 | tkr = &tk->tkr_raw; | ||
| 470 | memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); | ||
| 471 | tkr_dummy.read = dummy_clock_read; | ||
| 472 | update_fast_timekeeper(&tkr_dummy, &tk_fast_raw); | ||
| 362 | } | 473 | } | 
| 363 | 474 | ||
| 364 | #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD | 475 | #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD | 
| @@ -369,8 +480,8 @@ static inline void update_vsyscall(struct timekeeper *tk) | |||
| 369 | 480 | ||
| 370 | xt = timespec64_to_timespec(tk_xtime(tk)); | 481 | xt = timespec64_to_timespec(tk_xtime(tk)); | 
| 371 | wm = timespec64_to_timespec(tk->wall_to_monotonic); | 482 | wm = timespec64_to_timespec(tk->wall_to_monotonic); | 
| 372 | update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult, | 483 | update_vsyscall_old(&xt, &wm, tk->tkr_mono.clock, tk->tkr_mono.mult, | 
| 373 | tk->tkr.cycle_last); | 484 | tk->tkr_mono.cycle_last); | 
| 374 | } | 485 | } | 
| 375 | 486 | ||
| 376 | static inline void old_vsyscall_fixup(struct timekeeper *tk) | 487 | static inline void old_vsyscall_fixup(struct timekeeper *tk) | 
| @@ -387,11 +498,11 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) | |||
| 387 | * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD | 498 | * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD | 
| 388 | * users are removed, this can be killed. | 499 | * users are removed, this can be killed. | 
| 389 | */ | 500 | */ | 
| 390 | remainder = tk->tkr.xtime_nsec & ((1ULL << tk->tkr.shift) - 1); | 501 | remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1); | 
| 391 | tk->tkr.xtime_nsec -= remainder; | 502 | tk->tkr_mono.xtime_nsec -= remainder; | 
| 392 | tk->tkr.xtime_nsec += 1ULL << tk->tkr.shift; | 503 | tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift; | 
| 393 | tk->ntp_error += remainder << tk->ntp_error_shift; | 504 | tk->ntp_error += remainder << tk->ntp_error_shift; | 
| 394 | tk->ntp_error -= (1ULL << tk->tkr.shift) << tk->ntp_error_shift; | 505 | tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift; | 
| 395 | } | 506 | } | 
| 396 | #else | 507 | #else | 
| 397 | #define old_vsyscall_fixup(tk) | 508 | #define old_vsyscall_fixup(tk) | 
| @@ -456,17 +567,17 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) | |||
| 456 | */ | 567 | */ | 
| 457 | seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); | 568 | seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); | 
| 458 | nsec = (u32) tk->wall_to_monotonic.tv_nsec; | 569 | nsec = (u32) tk->wall_to_monotonic.tv_nsec; | 
| 459 | tk->tkr.base_mono = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); | 570 | tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); | 
| 460 | 571 | ||
| 461 | /* Update the monotonic raw base */ | 572 | /* Update the monotonic raw base */ | 
| 462 | tk->base_raw = timespec64_to_ktime(tk->raw_time); | 573 | tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time); | 
| 463 | 574 | ||
| 464 | /* | 575 | /* | 
| 465 | * The sum of the nanoseconds portions of xtime and | 576 | * The sum of the nanoseconds portions of xtime and | 
| 466 | * wall_to_monotonic can be greater/equal one second. Take | 577 | * wall_to_monotonic can be greater/equal one second. Take | 
| 467 | * this into account before updating tk->ktime_sec. | 578 | * this into account before updating tk->ktime_sec. | 
| 468 | */ | 579 | */ | 
| 469 | nsec += (u32)(tk->tkr.xtime_nsec >> tk->tkr.shift); | 580 | nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); | 
| 470 | if (nsec >= NSEC_PER_SEC) | 581 | if (nsec >= NSEC_PER_SEC) | 
| 471 | seconds++; | 582 | seconds++; | 
| 472 | tk->ktime_sec = seconds; | 583 | tk->ktime_sec = seconds; | 
| @@ -489,7 +600,8 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) | |||
| 489 | memcpy(&shadow_timekeeper, &tk_core.timekeeper, | 600 | memcpy(&shadow_timekeeper, &tk_core.timekeeper, | 
| 490 | sizeof(tk_core.timekeeper)); | 601 | sizeof(tk_core.timekeeper)); | 
| 491 | 602 | ||
| 492 | update_fast_timekeeper(&tk->tkr); | 603 | update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); | 
| 604 | update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); | ||
| 493 | } | 605 | } | 
| 494 | 606 | ||
| 495 | /** | 607 | /** | 
| @@ -501,22 +613,23 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) | |||
| 501 | */ | 613 | */ | 
| 502 | static void timekeeping_forward_now(struct timekeeper *tk) | 614 | static void timekeeping_forward_now(struct timekeeper *tk) | 
| 503 | { | 615 | { | 
| 504 | struct clocksource *clock = tk->tkr.clock; | 616 | struct clocksource *clock = tk->tkr_mono.clock; | 
| 505 | cycle_t cycle_now, delta; | 617 | cycle_t cycle_now, delta; | 
| 506 | s64 nsec; | 618 | s64 nsec; | 
| 507 | 619 | ||
| 508 | cycle_now = tk->tkr.read(clock); | 620 | cycle_now = tk->tkr_mono.read(clock); | 
| 509 | delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); | 621 | delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); | 
| 510 | tk->tkr.cycle_last = cycle_now; | 622 | tk->tkr_mono.cycle_last = cycle_now; | 
| 623 | tk->tkr_raw.cycle_last = cycle_now; | ||
| 511 | 624 | ||
| 512 | tk->tkr.xtime_nsec += delta * tk->tkr.mult; | 625 | tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult; | 
| 513 | 626 | ||
| 514 | /* If arch requires, add in get_arch_timeoffset() */ | 627 | /* If arch requires, add in get_arch_timeoffset() */ | 
| 515 | tk->tkr.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr.shift; | 628 | tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift; | 
| 516 | 629 | ||
| 517 | tk_normalize_xtime(tk); | 630 | tk_normalize_xtime(tk); | 
| 518 | 631 | ||
| 519 | nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); | 632 | nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift); | 
| 520 | timespec64_add_ns(&tk->raw_time, nsec); | 633 | timespec64_add_ns(&tk->raw_time, nsec); | 
| 521 | } | 634 | } | 
| 522 | 635 | ||
| @@ -537,7 +650,7 @@ int __getnstimeofday64(struct timespec64 *ts) | |||
| 537 | seq = read_seqcount_begin(&tk_core.seq); | 650 | seq = read_seqcount_begin(&tk_core.seq); | 
| 538 | 651 | ||
| 539 | ts->tv_sec = tk->xtime_sec; | 652 | ts->tv_sec = tk->xtime_sec; | 
| 540 | nsecs = timekeeping_get_ns(&tk->tkr); | 653 | nsecs = timekeeping_get_ns(&tk->tkr_mono); | 
| 541 | 654 | ||
| 542 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 655 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 
| 543 | 656 | ||
| @@ -577,8 +690,8 @@ ktime_t ktime_get(void) | |||
| 577 | 690 | ||
| 578 | do { | 691 | do { | 
| 579 | seq = read_seqcount_begin(&tk_core.seq); | 692 | seq = read_seqcount_begin(&tk_core.seq); | 
| 580 | base = tk->tkr.base_mono; | 693 | base = tk->tkr_mono.base; | 
| 581 | nsecs = timekeeping_get_ns(&tk->tkr); | 694 | nsecs = timekeeping_get_ns(&tk->tkr_mono); | 
| 582 | 695 | ||
| 583 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 696 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 
| 584 | 697 | ||
| @@ -603,8 +716,8 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs) | |||
| 603 | 716 | ||
| 604 | do { | 717 | do { | 
| 605 | seq = read_seqcount_begin(&tk_core.seq); | 718 | seq = read_seqcount_begin(&tk_core.seq); | 
| 606 | base = ktime_add(tk->tkr.base_mono, *offset); | 719 | base = ktime_add(tk->tkr_mono.base, *offset); | 
| 607 | nsecs = timekeeping_get_ns(&tk->tkr); | 720 | nsecs = timekeeping_get_ns(&tk->tkr_mono); | 
| 608 | 721 | ||
| 609 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 722 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 
| 610 | 723 | ||
| @@ -645,8 +758,8 @@ ktime_t ktime_get_raw(void) | |||
| 645 | 758 | ||
| 646 | do { | 759 | do { | 
| 647 | seq = read_seqcount_begin(&tk_core.seq); | 760 | seq = read_seqcount_begin(&tk_core.seq); | 
| 648 | base = tk->base_raw; | 761 | base = tk->tkr_raw.base; | 
| 649 | nsecs = timekeeping_get_ns_raw(tk); | 762 | nsecs = timekeeping_get_ns(&tk->tkr_raw); | 
| 650 | 763 | ||
| 651 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 764 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 
| 652 | 765 | ||
| @@ -674,7 +787,7 @@ void ktime_get_ts64(struct timespec64 *ts) | |||
| 674 | do { | 787 | do { | 
| 675 | seq = read_seqcount_begin(&tk_core.seq); | 788 | seq = read_seqcount_begin(&tk_core.seq); | 
| 676 | ts->tv_sec = tk->xtime_sec; | 789 | ts->tv_sec = tk->xtime_sec; | 
| 677 | nsec = timekeeping_get_ns(&tk->tkr); | 790 | nsec = timekeeping_get_ns(&tk->tkr_mono); | 
| 678 | tomono = tk->wall_to_monotonic; | 791 | tomono = tk->wall_to_monotonic; | 
| 679 | 792 | ||
| 680 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 793 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 
| @@ -759,8 +872,8 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) | |||
| 759 | ts_real->tv_sec = tk->xtime_sec; | 872 | ts_real->tv_sec = tk->xtime_sec; | 
| 760 | ts_real->tv_nsec = 0; | 873 | ts_real->tv_nsec = 0; | 
| 761 | 874 | ||
| 762 | nsecs_raw = timekeeping_get_ns_raw(tk); | 875 | nsecs_raw = timekeeping_get_ns(&tk->tkr_raw); | 
| 763 | nsecs_real = timekeeping_get_ns(&tk->tkr); | 876 | nsecs_real = timekeeping_get_ns(&tk->tkr_mono); | 
| 764 | 877 | ||
| 765 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 878 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 
| 766 | 879 | ||
| @@ -943,7 +1056,7 @@ static int change_clocksource(void *data) | |||
| 943 | */ | 1056 | */ | 
| 944 | if (try_module_get(new->owner)) { | 1057 | if (try_module_get(new->owner)) { | 
| 945 | if (!new->enable || new->enable(new) == 0) { | 1058 | if (!new->enable || new->enable(new) == 0) { | 
| 946 | old = tk->tkr.clock; | 1059 | old = tk->tkr_mono.clock; | 
| 947 | tk_setup_internals(tk, new); | 1060 | tk_setup_internals(tk, new); | 
| 948 | if (old->disable) | 1061 | if (old->disable) | 
| 949 | old->disable(old); | 1062 | old->disable(old); | 
| @@ -971,11 +1084,11 @@ int timekeeping_notify(struct clocksource *clock) | |||
| 971 | { | 1084 | { | 
| 972 | struct timekeeper *tk = &tk_core.timekeeper; | 1085 | struct timekeeper *tk = &tk_core.timekeeper; | 
| 973 | 1086 | ||
| 974 | if (tk->tkr.clock == clock) | 1087 | if (tk->tkr_mono.clock == clock) | 
| 975 | return 0; | 1088 | return 0; | 
| 976 | stop_machine(change_clocksource, clock, NULL); | 1089 | stop_machine(change_clocksource, clock, NULL); | 
| 977 | tick_clock_notify(); | 1090 | tick_clock_notify(); | 
| 978 | return tk->tkr.clock == clock ? 0 : -1; | 1091 | return tk->tkr_mono.clock == clock ? 0 : -1; | 
| 979 | } | 1092 | } | 
| 980 | 1093 | ||
| 981 | /** | 1094 | /** | 
| @@ -993,7 +1106,7 @@ void getrawmonotonic64(struct timespec64 *ts) | |||
| 993 | 1106 | ||
| 994 | do { | 1107 | do { | 
| 995 | seq = read_seqcount_begin(&tk_core.seq); | 1108 | seq = read_seqcount_begin(&tk_core.seq); | 
| 996 | nsecs = timekeeping_get_ns_raw(tk); | 1109 | nsecs = timekeeping_get_ns(&tk->tkr_raw); | 
| 997 | ts64 = tk->raw_time; | 1110 | ts64 = tk->raw_time; | 
| 998 | 1111 | ||
| 999 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 1112 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 
| @@ -1016,7 +1129,7 @@ int timekeeping_valid_for_hres(void) | |||
| 1016 | do { | 1129 | do { | 
| 1017 | seq = read_seqcount_begin(&tk_core.seq); | 1130 | seq = read_seqcount_begin(&tk_core.seq); | 
| 1018 | 1131 | ||
| 1019 | ret = tk->tkr.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; | 1132 | ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; | 
| 1020 | 1133 | ||
| 1021 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 1134 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 
| 1022 | 1135 | ||
| @@ -1035,7 +1148,7 @@ u64 timekeeping_max_deferment(void) | |||
| 1035 | do { | 1148 | do { | 
| 1036 | seq = read_seqcount_begin(&tk_core.seq); | 1149 | seq = read_seqcount_begin(&tk_core.seq); | 
| 1037 | 1150 | ||
| 1038 | ret = tk->tkr.clock->max_idle_ns; | 1151 | ret = tk->tkr_mono.clock->max_idle_ns; | 
| 1039 | 1152 | ||
| 1040 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 1153 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 
| 1041 | 1154 | ||
| @@ -1057,6 +1170,14 @@ void __weak read_persistent_clock(struct timespec *ts) | |||
| 1057 | ts->tv_nsec = 0; | 1170 | ts->tv_nsec = 0; | 
| 1058 | } | 1171 | } | 
| 1059 | 1172 | ||
| 1173 | void __weak read_persistent_clock64(struct timespec64 *ts64) | ||
| 1174 | { | ||
| 1175 | struct timespec ts; | ||
| 1176 | |||
| 1177 | read_persistent_clock(&ts); | ||
| 1178 | *ts64 = timespec_to_timespec64(ts); | ||
| 1179 | } | ||
| 1180 | |||
| 1060 | /** | 1181 | /** | 
| 1061 | * read_boot_clock - Return time of the system start. | 1182 | * read_boot_clock - Return time of the system start. | 
| 1062 | * | 1183 | * | 
| @@ -1072,6 +1193,20 @@ void __weak read_boot_clock(struct timespec *ts) | |||
| 1072 | ts->tv_nsec = 0; | 1193 | ts->tv_nsec = 0; | 
| 1073 | } | 1194 | } | 
| 1074 | 1195 | ||
| 1196 | void __weak read_boot_clock64(struct timespec64 *ts64) | ||
| 1197 | { | ||
| 1198 | struct timespec ts; | ||
| 1199 | |||
| 1200 | read_boot_clock(&ts); | ||
| 1201 | *ts64 = timespec_to_timespec64(ts); | ||
| 1202 | } | ||
| 1203 | |||
| 1204 | /* Flag for if timekeeping_resume() has injected sleeptime */ | ||
| 1205 | static bool sleeptime_injected; | ||
| 1206 | |||
| 1207 | /* Flag for if there is a persistent clock on this platform */ | ||
| 1208 | static bool persistent_clock_exists; | ||
| 1209 | |||
| 1075 | /* | 1210 | /* | 
| 1076 | * timekeeping_init - Initializes the clocksource and common timekeeping values | 1211 | * timekeeping_init - Initializes the clocksource and common timekeeping values | 
| 1077 | */ | 1212 | */ | 
| @@ -1081,20 +1216,17 @@ void __init timekeeping_init(void) | |||
| 1081 | struct clocksource *clock; | 1216 | struct clocksource *clock; | 
| 1082 | unsigned long flags; | 1217 | unsigned long flags; | 
| 1083 | struct timespec64 now, boot, tmp; | 1218 | struct timespec64 now, boot, tmp; | 
| 1084 | struct timespec ts; | ||
| 1085 | 1219 | ||
| 1086 | read_persistent_clock(&ts); | 1220 | read_persistent_clock64(&now); | 
| 1087 | now = timespec_to_timespec64(ts); | ||
| 1088 | if (!timespec64_valid_strict(&now)) { | 1221 | if (!timespec64_valid_strict(&now)) { | 
| 1089 | pr_warn("WARNING: Persistent clock returned invalid value!\n" | 1222 | pr_warn("WARNING: Persistent clock returned invalid value!\n" | 
| 1090 | " Check your CMOS/BIOS settings.\n"); | 1223 | " Check your CMOS/BIOS settings.\n"); | 
| 1091 | now.tv_sec = 0; | 1224 | now.tv_sec = 0; | 
| 1092 | now.tv_nsec = 0; | 1225 | now.tv_nsec = 0; | 
| 1093 | } else if (now.tv_sec || now.tv_nsec) | 1226 | } else if (now.tv_sec || now.tv_nsec) | 
| 1094 | persistent_clock_exist = true; | 1227 | persistent_clock_exists = true; | 
| 1095 | 1228 | ||
| 1096 | read_boot_clock(&ts); | 1229 | read_boot_clock64(&boot); | 
| 1097 | boot = timespec_to_timespec64(ts); | ||
| 1098 | if (!timespec64_valid_strict(&boot)) { | 1230 | if (!timespec64_valid_strict(&boot)) { | 
| 1099 | pr_warn("WARNING: Boot clock returned invalid value!\n" | 1231 | pr_warn("WARNING: Boot clock returned invalid value!\n" | 
| 1100 | " Check your CMOS/BIOS settings.\n"); | 1232 | " Check your CMOS/BIOS settings.\n"); | 
| @@ -1114,7 +1246,6 @@ void __init timekeeping_init(void) | |||
| 1114 | tk_set_xtime(tk, &now); | 1246 | tk_set_xtime(tk, &now); | 
| 1115 | tk->raw_time.tv_sec = 0; | 1247 | tk->raw_time.tv_sec = 0; | 
| 1116 | tk->raw_time.tv_nsec = 0; | 1248 | tk->raw_time.tv_nsec = 0; | 
| 1117 | tk->base_raw.tv64 = 0; | ||
| 1118 | if (boot.tv_sec == 0 && boot.tv_nsec == 0) | 1249 | if (boot.tv_sec == 0 && boot.tv_nsec == 0) | 
| 1119 | boot = tk_xtime(tk); | 1250 | boot = tk_xtime(tk); | 
| 1120 | 1251 | ||
| @@ -1127,7 +1258,7 @@ void __init timekeeping_init(void) | |||
| 1127 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 1258 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 
| 1128 | } | 1259 | } | 
| 1129 | 1260 | ||
| 1130 | /* time in seconds when suspend began */ | 1261 | /* time in seconds when suspend began for persistent clock */ | 
| 1131 | static struct timespec64 timekeeping_suspend_time; | 1262 | static struct timespec64 timekeeping_suspend_time; | 
| 1132 | 1263 | ||
| 1133 | /** | 1264 | /** | 
| @@ -1152,12 +1283,49 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, | |||
| 1152 | tk_debug_account_sleep_time(delta); | 1283 | tk_debug_account_sleep_time(delta); | 
| 1153 | } | 1284 | } | 
| 1154 | 1285 | ||
| 1286 | #if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE) | ||
| 1287 | /** | ||
| 1288 | * We have three kinds of time sources to use for sleep time | ||
| 1289 | * injection, the preference order is: | ||
| 1290 | * 1) non-stop clocksource | ||
| 1291 | * 2) persistent clock (ie: RTC accessible when irqs are off) | ||
| 1292 | * 3) RTC | ||
| 1293 | * | ||
| 1294 | * 1) and 2) are used by timekeeping, 3) by RTC subsystem. | ||
| 1295 | * If system has neither 1) nor 2), 3) will be used finally. | ||
| 1296 | * | ||
| 1297 | * | ||
| 1298 | * If timekeeping has injected sleeptime via either 1) or 2), | ||
| 1299 | * 3) becomes needless, so in this case we don't need to call | ||
| 1300 | * rtc_resume(), and this is what timekeeping_rtc_skipresume() | ||
| 1301 | * means. | ||
| 1302 | */ | ||
| 1303 | bool timekeeping_rtc_skipresume(void) | ||
| 1304 | { | ||
| 1305 | return sleeptime_injected; | ||
| 1306 | } | ||
| 1307 | |||
| 1308 | /** | ||
| 1309 | * Whether 1) can be used is only known in timekeeping_resume(), | ||
| 1310 | * which is invoked after rtc_suspend(), so we cannot reliably | ||
| 1311 | * skip rtc_suspend() if the system has 1). | ||
| 1312 | * | ||
| 1313 | * But if system has 2), 2) will definitely be used, so in this | ||
| 1314 | * case we don't need to call rtc_suspend(), and this is what | ||
| 1315 | * timekeeping_rtc_skipsuspend() means. | ||
| 1316 | */ | ||
| 1317 | bool timekeeping_rtc_skipsuspend(void) | ||
| 1318 | { | ||
| 1319 | return persistent_clock_exists; | ||
| 1320 | } | ||
| 1321 | |||
| 1155 | /** | 1322 | /** | 
| 1156 | * timekeeping_inject_sleeptime64 - Adds suspend interval to timekeeping values | 1323 | * timekeeping_inject_sleeptime64 - Adds suspend interval to timekeeping values | 
| 1157 | * @delta: pointer to a timespec64 delta value | 1324 | * @delta: pointer to a timespec64 delta value | 
| 1158 | * | 1325 | * | 
| 1159 | * This hook is for architectures that cannot support read_persistent_clock | 1326 | * This hook is for architectures that cannot support read_persistent_clock64 | 
| 1160 | * because their RTC/persistent clock is only accessible when irqs are enabled. | 1327 | * because their RTC/persistent clock is only accessible when irqs are enabled. | 
| 1328 | * and also don't have an effective nonstop clocksource. | ||
| 1161 | * | 1329 | * | 
| 1162 | * This function should only be called by rtc_resume(), and allows | 1330 | * This function should only be called by rtc_resume(), and allows | 
| 1163 | * a suspend offset to be injected into the timekeeping values. | 1331 | * a suspend offset to be injected into the timekeeping values. | 
| @@ -1167,13 +1335,6 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta) | |||
| 1167 | struct timekeeper *tk = &tk_core.timekeeper; | 1335 | struct timekeeper *tk = &tk_core.timekeeper; | 
| 1168 | unsigned long flags; | 1336 | unsigned long flags; | 
| 1169 | 1337 | ||
| 1170 | /* | ||
| 1171 | * Make sure we don't set the clock twice, as timekeeping_resume() | ||
| 1172 | * already did it | ||
| 1173 | */ | ||
| 1174 | if (has_persistent_clock()) | ||
| 1175 | return; | ||
| 1176 | |||
| 1177 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 1338 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 
| 1178 | write_seqcount_begin(&tk_core.seq); | 1339 | write_seqcount_begin(&tk_core.seq); | 
| 1179 | 1340 | ||
| @@ -1189,26 +1350,21 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta) | |||
| 1189 | /* signal hrtimers about time change */ | 1350 | /* signal hrtimers about time change */ | 
| 1190 | clock_was_set(); | 1351 | clock_was_set(); | 
| 1191 | } | 1352 | } | 
| 1353 | #endif | ||
| 1192 | 1354 | ||
| 1193 | /** | 1355 | /** | 
| 1194 | * timekeeping_resume - Resumes the generic timekeeping subsystem. | 1356 | * timekeeping_resume - Resumes the generic timekeeping subsystem. | 
| 1195 | * | ||
| 1196 | * This is for the generic clocksource timekeeping. | ||
| 1197 | * xtime/wall_to_monotonic/jiffies/etc are | ||
| 1198 | * still managed by arch specific suspend/resume code. | ||
| 1199 | */ | 1357 | */ | 
| 1200 | void timekeeping_resume(void) | 1358 | void timekeeping_resume(void) | 
| 1201 | { | 1359 | { | 
| 1202 | struct timekeeper *tk = &tk_core.timekeeper; | 1360 | struct timekeeper *tk = &tk_core.timekeeper; | 
| 1203 | struct clocksource *clock = tk->tkr.clock; | 1361 | struct clocksource *clock = tk->tkr_mono.clock; | 
| 1204 | unsigned long flags; | 1362 | unsigned long flags; | 
| 1205 | struct timespec64 ts_new, ts_delta; | 1363 | struct timespec64 ts_new, ts_delta; | 
| 1206 | struct timespec tmp; | ||
| 1207 | cycle_t cycle_now, cycle_delta; | 1364 | cycle_t cycle_now, cycle_delta; | 
| 1208 | bool suspendtime_found = false; | ||
| 1209 | 1365 | ||
| 1210 | read_persistent_clock(&tmp); | 1366 | sleeptime_injected = false; | 
| 1211 | ts_new = timespec_to_timespec64(tmp); | 1367 | read_persistent_clock64(&ts_new); | 
| 1212 | 1368 | ||
| 1213 | clockevents_resume(); | 1369 | clockevents_resume(); | 
| 1214 | clocksource_resume(); | 1370 | clocksource_resume(); | 
| @@ -1228,16 +1384,16 @@ void timekeeping_resume(void) | |||
| 1228 | * The less preferred source will only be tried if there is no better | 1384 | * The less preferred source will only be tried if there is no better | 
| 1229 | * usable source. The rtc part is handled separately in rtc core code. | 1385 | * usable source. The rtc part is handled separately in rtc core code. | 
| 1230 | */ | 1386 | */ | 
| 1231 | cycle_now = tk->tkr.read(clock); | 1387 | cycle_now = tk->tkr_mono.read(clock); | 
| 1232 | if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && | 1388 | if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && | 
| 1233 | cycle_now > tk->tkr.cycle_last) { | 1389 | cycle_now > tk->tkr_mono.cycle_last) { | 
| 1234 | u64 num, max = ULLONG_MAX; | 1390 | u64 num, max = ULLONG_MAX; | 
| 1235 | u32 mult = clock->mult; | 1391 | u32 mult = clock->mult; | 
| 1236 | u32 shift = clock->shift; | 1392 | u32 shift = clock->shift; | 
| 1237 | s64 nsec = 0; | 1393 | s64 nsec = 0; | 
| 1238 | 1394 | ||
| 1239 | cycle_delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, | 1395 | cycle_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, | 
| 1240 | tk->tkr.mask); | 1396 | tk->tkr_mono.mask); | 
| 1241 | 1397 | ||
| 1242 | /* | 1398 | /* | 
| 1243 | * "cycle_delta * mutl" may cause 64 bits overflow, if the | 1399 | * "cycle_delta * mutl" may cause 64 bits overflow, if the | 
| @@ -1253,17 +1409,19 @@ void timekeeping_resume(void) | |||
| 1253 | nsec += ((u64) cycle_delta * mult) >> shift; | 1409 | nsec += ((u64) cycle_delta * mult) >> shift; | 
| 1254 | 1410 | ||
| 1255 | ts_delta = ns_to_timespec64(nsec); | 1411 | ts_delta = ns_to_timespec64(nsec); | 
| 1256 | suspendtime_found = true; | 1412 | sleeptime_injected = true; | 
| 1257 | } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { | 1413 | } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { | 
| 1258 | ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); | 1414 | ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); | 
| 1259 | suspendtime_found = true; | 1415 | sleeptime_injected = true; | 
| 1260 | } | 1416 | } | 
| 1261 | 1417 | ||
| 1262 | if (suspendtime_found) | 1418 | if (sleeptime_injected) | 
| 1263 | __timekeeping_inject_sleeptime(tk, &ts_delta); | 1419 | __timekeeping_inject_sleeptime(tk, &ts_delta); | 
| 1264 | 1420 | ||
| 1265 | /* Re-base the last cycle value */ | 1421 | /* Re-base the last cycle value */ | 
| 1266 | tk->tkr.cycle_last = cycle_now; | 1422 | tk->tkr_mono.cycle_last = cycle_now; | 
| 1423 | tk->tkr_raw.cycle_last = cycle_now; | ||
| 1424 | |||
| 1267 | tk->ntp_error = 0; | 1425 | tk->ntp_error = 0; | 
| 1268 | timekeeping_suspended = 0; | 1426 | timekeeping_suspended = 0; | 
| 1269 | timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); | 1427 | timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); | 
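When a nonstop clocksource is available, timekeeping_resume() reconstructs the sleep length from the raw cycle delta, taking care (per the comment above) that cycle_delta * mult cannot overflow 64 bits. The sketch below shows one generic way to do that conversion in overflow-safe chunks; it is an illustration, not necessarily the kernel's exact arithmetic.

#include <inttypes.h>
#include <stdio.h>

static uint64_t cycles_to_ns(uint64_t delta, uint32_t mult, uint32_t shift)
{
	/* Largest chunk whose product with mult is guaranteed to fit in 64 bits. */
	uint64_t chunk = UINT64_MAX / mult;
	uint64_t ns = 0;

	while (delta > chunk) {
		ns += (chunk * mult) >> shift;
		delta -= chunk;
	}
	return ns + ((delta * mult) >> shift);
}

int main(void)
{
	/* One hour asleep on a 1 MHz counter (mult / 2^shift = 1000). */
	printf("slept for %" PRIu64 " ns\n",
	       cycles_to_ns(3600ULL * 1000000, 1000 << 8, 8));
	return 0;
}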
| @@ -1272,9 +1430,7 @@ void timekeeping_resume(void) | |||
| 1272 | 1430 | ||
| 1273 | touch_softlockup_watchdog(); | 1431 | touch_softlockup_watchdog(); | 
| 1274 | 1432 | ||
| 1275 | clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); | 1433 | tick_resume(); | 
| 1276 | |||
| 1277 | /* Resume hrtimers */ | ||
| 1278 | hrtimers_resume(); | 1434 | hrtimers_resume(); | 
| 1279 | } | 1435 | } | 
| 1280 | 1436 | ||
| @@ -1284,10 +1440,8 @@ int timekeeping_suspend(void) | |||
| 1284 | unsigned long flags; | 1440 | unsigned long flags; | 
| 1285 | struct timespec64 delta, delta_delta; | 1441 | struct timespec64 delta, delta_delta; | 
| 1286 | static struct timespec64 old_delta; | 1442 | static struct timespec64 old_delta; | 
| 1287 | struct timespec tmp; | ||
| 1288 | 1443 | ||
| 1289 | read_persistent_clock(&tmp); | 1444 | read_persistent_clock64(&timekeeping_suspend_time); | 
| 1290 | timekeeping_suspend_time = timespec_to_timespec64(tmp); | ||
| 1291 | 1445 | ||
| 1292 | /* | 1446 | /* | 
| 1293 | * On some systems the persistent_clock can not be detected at | 1447 | * On some systems the persistent_clock can not be detected at | 
| @@ -1295,31 +1449,33 @@ int timekeeping_suspend(void) | |||
| 1295 | * value returned, update the persistent_clock_exists flag. | 1449 | * value returned, update the persistent_clock_exists flag. | 
| 1296 | */ | 1450 | */ | 
| 1297 | if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) | 1451 | if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) | 
| 1298 | persistent_clock_exist = true; | 1452 | persistent_clock_exists = true; | 
| 1299 | 1453 | ||
| 1300 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 1454 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 
| 1301 | write_seqcount_begin(&tk_core.seq); | 1455 | write_seqcount_begin(&tk_core.seq); | 
| 1302 | timekeeping_forward_now(tk); | 1456 | timekeeping_forward_now(tk); | 
| 1303 | timekeeping_suspended = 1; | 1457 | timekeeping_suspended = 1; | 
| 1304 | 1458 | ||
| 1305 | /* | 1459 | if (persistent_clock_exists) { | 
| 1306 | * To avoid drift caused by repeated suspend/resumes, | ||
| 1307 | * which each can add ~1 second drift error, | ||
| 1308 | * try to compensate so the difference in system time | ||
| 1309 | * and persistent_clock time stays close to constant. | ||
| 1310 | */ | ||
| 1311 | delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time); | ||
| 1312 | delta_delta = timespec64_sub(delta, old_delta); | ||
| 1313 | if (abs(delta_delta.tv_sec) >= 2) { | ||
| 1314 | /* | 1460 | /* | 
| 1315 | * if delta_delta is too large, assume time correction | 1461 | * To avoid drift caused by repeated suspend/resumes, | 
| 1316 | * has occured and set old_delta to the current delta. | 1462 | * which each can add ~1 second drift error, | 
| 1463 | * try to compensate so the difference in system time | ||
| 1464 | * and persistent_clock time stays close to constant. | ||
| 1317 | */ | 1465 | */ | 
| 1318 | old_delta = delta; | 1466 | delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time); | 
| 1319 | } else { | 1467 | delta_delta = timespec64_sub(delta, old_delta); | 
| 1320 | /* Otherwise try to adjust old_system to compensate */ | 1468 | if (abs(delta_delta.tv_sec) >= 2) { | 
| 1321 | timekeeping_suspend_time = | 1469 | /* | 
| 1322 | timespec64_add(timekeeping_suspend_time, delta_delta); | 1470 | * if delta_delta is too large, assume time correction | 
| 1471 | * has occurred and set old_delta to the current delta. | ||
| 1472 | */ | ||
| 1473 | old_delta = delta; | ||
| 1474 | } else { | ||
| 1475 | /* Otherwise try to adjust old_system to compensate */ | ||
| 1476 | timekeeping_suspend_time = | ||
| 1477 | timespec64_add(timekeeping_suspend_time, delta_delta); | ||
| 1478 | } | ||
| 1323 | } | 1479 | } | 
| 1324 | 1480 | ||
| 1325 | timekeeping_update(tk, TK_MIRROR); | 1481 | timekeeping_update(tk, TK_MIRROR); | 
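The suspend path keeps its drift compensation, now only when a persistent clock exists: if the gap between system time and the persistent clock moved by two seconds or more since the last suspend it assumes a deliberate time correction, otherwise it folds the small drift back into the recorded suspend time. A toy, seconds-only version of that decision:

#include <stdio.h>
#include <stdlib.h>

static long old_delta_sec;	/* system time minus persistent clock at the previous suspend */

static long compensated_suspend_time(long system_sec, long persistent_sec)
{
	long delta = system_sec - persistent_sec;
	long delta_delta = delta - old_delta_sec;

	if (labs(delta_delta) >= 2) {
		/* Large jump: assume a deliberate time correction happened
		 * and just remember the new offset. */
		old_delta_sec = delta;
		return persistent_sec;
	}
	/* Small drift: nudge the recorded suspend time so repeated
	 * suspend/resume cycles do not accumulate error. */
	return persistent_sec + delta_delta;
}

int main(void)
{
	old_delta_sec = 5;	/* offset remembered from the previous suspend */
	printf("adjusted suspend time: %ld s\n", compensated_suspend_time(1005, 999));
	return 0;
}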
| @@ -1327,7 +1483,7 @@ int timekeeping_suspend(void) | |||
| 1327 | write_seqcount_end(&tk_core.seq); | 1483 | write_seqcount_end(&tk_core.seq); | 
| 1328 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 1484 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 
| 1329 | 1485 | ||
| 1330 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); | 1486 | tick_suspend(); | 
| 1331 | clocksource_suspend(); | 1487 | clocksource_suspend(); | 
| 1332 | clockevents_suspend(); | 1488 | clockevents_suspend(); | 
| 1333 | 1489 | ||
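A minimal user-space sketch of the suspend drift compensation that the hunk above moves under the new persistent_clock_exists check. The timespec helpers and the sample values are stand-ins for the kernel's timespec64 routines; only the delta/delta_delta logic and the two-second threshold mirror the code.

	/* Sketch only: keeps |system time - persistent clock| roughly constant
	 * across repeated suspend/resume cycles, as the kernel code above does.
	 */
	#include <stdio.h>
	#include <stdlib.h>

	struct ts64 { long long tv_sec; long tv_nsec; };

	static struct ts64 ts_sub(struct ts64 a, struct ts64 b)
	{
		struct ts64 r = { a.tv_sec - b.tv_sec, a.tv_nsec - b.tv_nsec };
		if (r.tv_nsec < 0) { r.tv_sec--; r.tv_nsec += 1000000000L; }
		return r;
	}

	static struct ts64 ts_add(struct ts64 a, struct ts64 b)
	{
		struct ts64 r = { a.tv_sec + b.tv_sec, a.tv_nsec + b.tv_nsec };
		if (r.tv_nsec >= 1000000000L) { r.tv_sec++; r.tv_nsec -= 1000000000L; }
		return r;
	}

	/* persists across calls, like the static old_delta in the kernel */
	static struct ts64 old_delta;

	static struct ts64 compensate(struct ts64 xtime, struct ts64 suspend_time)
	{
		struct ts64 delta = ts_sub(xtime, suspend_time);
		struct ts64 delta_delta = ts_sub(delta, old_delta);

		if (llabs(delta_delta.tv_sec) >= 2) {
			/* large jump: assume a time correction, resync old_delta */
			old_delta = delta;
		} else {
			/* small jitter: fold it back into the suspend timestamp */
			suspend_time = ts_add(suspend_time, delta_delta);
		}
		return suspend_time;
	}

	int main(void)
	{
		struct ts64 xtime = { 1000, 0 }, suspend = { 990, 0 };

		suspend = compensate(xtime, suspend);        /* first suspend: resyncs old_delta */
		xtime.tv_sec += 100; suspend.tv_sec += 99;   /* one second of drift by next suspend */
		suspend = compensate(xtime, suspend);
		printf("compensated suspend time: %lld.%09ld\n", suspend.tv_sec, suspend.tv_nsec);
		return 0;
	}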
| @@ -1416,15 +1572,15 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, | |||
| 1416 | * | 1572 | * | 
| 1417 | * XXX - TODO: Doc ntp_error calculation. | 1573 | * XXX - TODO: Doc ntp_error calculation. | 
| 1418 | */ | 1574 | */ | 
| 1419 | if ((mult_adj > 0) && (tk->tkr.mult + mult_adj < mult_adj)) { | 1575 | if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) { | 
| 1420 | /* NTP adjustment caused clocksource mult overflow */ | 1576 | /* NTP adjustment caused clocksource mult overflow */ | 
| 1421 | WARN_ON_ONCE(1); | 1577 | WARN_ON_ONCE(1); | 
| 1422 | return; | 1578 | return; | 
| 1423 | } | 1579 | } | 
| 1424 | 1580 | ||
| 1425 | tk->tkr.mult += mult_adj; | 1581 | tk->tkr_mono.mult += mult_adj; | 
| 1426 | tk->xtime_interval += interval; | 1582 | tk->xtime_interval += interval; | 
| 1427 | tk->tkr.xtime_nsec -= offset; | 1583 | tk->tkr_mono.xtime_nsec -= offset; | 
| 1428 | tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; | 1584 | tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; | 
| 1429 | } | 1585 | } | 
| 1430 | 1586 | ||
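The overflow guard above works because unsigned addition wraps: if tkr_mono.mult + mult_adj wrapped around, the sum ends up smaller than mult_adj. A tiny worked example with made-up values:

	#include <stdio.h>

	int main(void)
	{
		unsigned int mult = 0xfffffff0u;   /* hypothetical tkr_mono.mult */
		unsigned int mult_adj = 0x20u;     /* hypothetical positive NTP adjustment */

		/* 0xfffffff0 + 0x20 wraps to 0x10, which is < 0x20 */
		if (mult + mult_adj < mult_adj)
			printf("overflow detected, adjustment rejected\n");
		return 0;
	}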
| @@ -1486,13 +1642,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) | |||
| 1486 | tk->ntp_err_mult = 0; | 1642 | tk->ntp_err_mult = 0; | 
| 1487 | } | 1643 | } | 
| 1488 | 1644 | ||
| 1489 | if (unlikely(tk->tkr.clock->maxadj && | 1645 | if (unlikely(tk->tkr_mono.clock->maxadj && | 
| 1490 | (abs(tk->tkr.mult - tk->tkr.clock->mult) | 1646 | (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult) | 
| 1491 | > tk->tkr.clock->maxadj))) { | 1647 | > tk->tkr_mono.clock->maxadj))) { | 
| 1492 | printk_once(KERN_WARNING | 1648 | printk_once(KERN_WARNING | 
| 1493 | "Adjusting %s more than 11%% (%ld vs %ld)\n", | 1649 | "Adjusting %s more than 11%% (%ld vs %ld)\n", | 
| 1494 | tk->tkr.clock->name, (long)tk->tkr.mult, | 1650 | tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult, | 
| 1495 | (long)tk->tkr.clock->mult + tk->tkr.clock->maxadj); | 1651 | (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj); | 
| 1496 | } | 1652 | } | 
| 1497 | 1653 | ||
| 1498 | /* | 1654 | /* | 
| @@ -1509,9 +1665,9 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) | |||
| 1509 | * We'll correct this error next time through this function, when | 1665 | * We'll correct this error next time through this function, when | 
| 1510 | * xtime_nsec is not as small. | 1666 | * xtime_nsec is not as small. | 
| 1511 | */ | 1667 | */ | 
| 1512 | if (unlikely((s64)tk->tkr.xtime_nsec < 0)) { | 1668 | if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) { | 
| 1513 | s64 neg = -(s64)tk->tkr.xtime_nsec; | 1669 | s64 neg = -(s64)tk->tkr_mono.xtime_nsec; | 
| 1514 | tk->tkr.xtime_nsec = 0; | 1670 | tk->tkr_mono.xtime_nsec = 0; | 
| 1515 | tk->ntp_error += neg << tk->ntp_error_shift; | 1671 | tk->ntp_error += neg << tk->ntp_error_shift; | 
| 1516 | } | 1672 | } | 
| 1517 | } | 1673 | } | 
| @@ -1526,13 +1682,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) | |||
| 1526 | */ | 1682 | */ | 
| 1527 | static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) | 1683 | static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) | 
| 1528 | { | 1684 | { | 
| 1529 | u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr.shift; | 1685 | u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift; | 
| 1530 | unsigned int clock_set = 0; | 1686 | unsigned int clock_set = 0; | 
| 1531 | 1687 | ||
| 1532 | while (tk->tkr.xtime_nsec >= nsecps) { | 1688 | while (tk->tkr_mono.xtime_nsec >= nsecps) { | 
| 1533 | int leap; | 1689 | int leap; | 
| 1534 | 1690 | ||
| 1535 | tk->tkr.xtime_nsec -= nsecps; | 1691 | tk->tkr_mono.xtime_nsec -= nsecps; | 
| 1536 | tk->xtime_sec++; | 1692 | tk->xtime_sec++; | 
| 1537 | 1693 | ||
| 1538 | /* Figure out if its a leap sec and apply if needed */ | 1694 | /* Figure out if its a leap sec and apply if needed */ | 
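xtime_nsec is stored left-shifted by the clocksource shift, so one second in that representation is NSEC_PER_SEC << shift and the loop above peels whole seconds off. A stand-alone illustration; the shift value is arbitrary, not taken from any real clocksource:

	#include <stdio.h>

	#define NSEC_PER_SEC 1000000000ULL

	int main(void)
	{
		unsigned int shift = 8;                     /* arbitrary example shift */
		unsigned long long xtime_nsec = (3ULL * NSEC_PER_SEC + 250000000ULL) << shift;
		unsigned long long nsecps = NSEC_PER_SEC << shift;
		unsigned long long xtime_sec = 0;

		while (xtime_nsec >= nsecps) {              /* same loop shape as above */
			xtime_nsec -= nsecps;
			xtime_sec++;
		}
		printf("accumulated %llu s, %llu ns left\n",
		       xtime_sec, xtime_nsec >> shift);     /* prints 3 s, 250000000 ns */
		return 0;
	}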
| @@ -1577,9 +1733,10 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, | |||
| 1577 | 1733 | ||
| 1578 | /* Accumulate one shifted interval */ | 1734 | /* Accumulate one shifted interval */ | 
| 1579 | offset -= interval; | 1735 | offset -= interval; | 
| 1580 | tk->tkr.cycle_last += interval; | 1736 | tk->tkr_mono.cycle_last += interval; | 
| 1737 | tk->tkr_raw.cycle_last += interval; | ||
| 1581 | 1738 | ||
| 1582 | tk->tkr.xtime_nsec += tk->xtime_interval << shift; | 1739 | tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift; | 
| 1583 | *clock_set |= accumulate_nsecs_to_secs(tk); | 1740 | *clock_set |= accumulate_nsecs_to_secs(tk); | 
| 1584 | 1741 | ||
| 1585 | /* Accumulate raw time */ | 1742 | /* Accumulate raw time */ | 
| @@ -1622,14 +1779,17 @@ void update_wall_time(void) | |||
| 1622 | #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET | 1779 | #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET | 
| 1623 | offset = real_tk->cycle_interval; | 1780 | offset = real_tk->cycle_interval; | 
| 1624 | #else | 1781 | #else | 
| 1625 | offset = clocksource_delta(tk->tkr.read(tk->tkr.clock), | 1782 | offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock), | 
| 1626 | tk->tkr.cycle_last, tk->tkr.mask); | 1783 | tk->tkr_mono.cycle_last, tk->tkr_mono.mask); | 
| 1627 | #endif | 1784 | #endif | 
| 1628 | 1785 | ||
| 1629 | /* Check if there's really nothing to do */ | 1786 | /* Check if there's really nothing to do */ | 
| 1630 | if (offset < real_tk->cycle_interval) | 1787 | if (offset < real_tk->cycle_interval) | 
| 1631 | goto out; | 1788 | goto out; | 
| 1632 | 1789 | ||
| 1790 | /* Do some additional sanity checking */ | ||
| 1791 | timekeeping_check_update(real_tk, offset); | ||
| 1792 | |||
| 1633 | /* | 1793 | /* | 
| 1634 | * With NO_HZ we may have to accumulate many cycle_intervals | 1794 | * With NO_HZ we may have to accumulate many cycle_intervals | 
| 1635 | * (think "ticks") worth of time at once. To do this efficiently, | 1795 | * (think "ticks") worth of time at once. To do this efficiently, | 
| @@ -1784,8 +1944,8 @@ ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot, | |||
| 1784 | do { | 1944 | do { | 
| 1785 | seq = read_seqcount_begin(&tk_core.seq); | 1945 | seq = read_seqcount_begin(&tk_core.seq); | 
| 1786 | 1946 | ||
| 1787 | base = tk->tkr.base_mono; | 1947 | base = tk->tkr_mono.base; | 
| 1788 | nsecs = tk->tkr.xtime_nsec >> tk->tkr.shift; | 1948 | nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; | 
| 1789 | 1949 | ||
| 1790 | *offs_real = tk->offs_real; | 1950 | *offs_real = tk->offs_real; | 
| 1791 | *offs_boot = tk->offs_boot; | 1951 | *offs_boot = tk->offs_boot; | 
| @@ -1816,8 +1976,8 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, | |||
| 1816 | do { | 1976 | do { | 
| 1817 | seq = read_seqcount_begin(&tk_core.seq); | 1977 | seq = read_seqcount_begin(&tk_core.seq); | 
| 1818 | 1978 | ||
| 1819 | base = tk->tkr.base_mono; | 1979 | base = tk->tkr_mono.base; | 
| 1820 | nsecs = timekeeping_get_ns(&tk->tkr); | 1980 | nsecs = timekeeping_get_ns(&tk->tkr_mono); | 
| 1821 | 1981 | ||
| 1822 | *offs_real = tk->offs_real; | 1982 | *offs_real = tk->offs_real; | 
| 1823 | *offs_boot = tk->offs_boot; | 1983 | *offs_boot = tk->offs_boot; | 
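Both readers above follow the seqcount pattern: copy the fields after read_seqcount_begin() and redo the copy if a writer intervened (the retry check sits just past the shown context). A simplified user-space model of that pattern; it deliberately ignores the memory-barrier details a real seqlock needs:

	#include <stdatomic.h>
	#include <stdio.h>

	struct snapshot { long long base; long long nsec; };

	static _Atomic unsigned int seq;      /* even = stable, odd = write in progress */
	static struct snapshot shared;

	static struct snapshot read_snapshot(void)
	{
		struct snapshot s;
		unsigned int start;

		do {
			while ((start = atomic_load(&seq)) & 1)
				;                         /* writer active, spin */
			s = shared;                       /* tentative copy, validated below */
		} while (atomic_load(&seq) != start);     /* retry if a writer slipped in */

		return s;
	}

	static void write_snapshot(struct snapshot s)
	{
		atomic_fetch_add(&seq, 1);                /* odd: readers will retry */
		shared = s;
		atomic_fetch_add(&seq, 1);                /* even again: publish */
	}

	int main(void)
	{
		write_snapshot((struct snapshot){ .base = 42, .nsec = 7 });
		struct snapshot s = read_snapshot();
		printf("base=%lld nsec=%lld\n", s.base, s.nsec);
		return 0;
	}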
| diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 1d91416055d5..ead8794b9a4e 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h | |||
| @@ -19,4 +19,11 @@ extern void timekeeping_clocktai(struct timespec *ts); | |||
| 19 | extern int timekeeping_suspend(void); | 19 | extern int timekeeping_suspend(void); | 
| 20 | extern void timekeeping_resume(void); | 20 | extern void timekeeping_resume(void); | 
| 21 | 21 | ||
| 22 | extern void do_timer(unsigned long ticks); | ||
| 23 | extern void update_wall_time(void); | ||
| 24 | |||
| 25 | extern seqlock_t jiffies_lock; | ||
| 26 | |||
| 27 | #define CS_NAME_LEN 32 | ||
| 28 | |||
| 22 | #endif | 29 | #endif | 
| diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2d3f5c504939..2ece3aa5069c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c | |||
| @@ -90,8 +90,18 @@ struct tvec_base { | |||
| 90 | struct tvec tv5; | 90 | struct tvec tv5; | 
| 91 | } ____cacheline_aligned; | 91 | } ____cacheline_aligned; | 
| 92 | 92 | ||
| 93 | /* | ||
| 94 | * __TIMER_INITIALIZER() needs to set ->base to a valid pointer (because we've | ||
| 95 | * made NULL special, hint: lock_timer_base()) and we cannot get a compile time | ||
| 96 | * pointer to per-cpu entries because we don't know where we'll map the section, | ||
| 97 | * even for the boot cpu. | ||
| 98 | * | ||
| 99 | * And so we use boot_tvec_bases for boot CPU and per-cpu __tvec_bases for the | ||
| 100 | * rest of them. | ||
| 101 | */ | ||
| 93 | struct tvec_base boot_tvec_bases; | 102 | struct tvec_base boot_tvec_bases; | 
| 94 | EXPORT_SYMBOL(boot_tvec_bases); | 103 | EXPORT_SYMBOL(boot_tvec_bases); | 
| 104 | |||
| 95 | static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; | 105 | static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; | 
| 96 | 106 | ||
| 97 | /* Functions below help us manage 'deferrable' flag */ | 107 | /* Functions below help us manage 'deferrable' flag */ | 
| @@ -1027,6 +1037,8 @@ int try_to_del_timer_sync(struct timer_list *timer) | |||
| 1027 | EXPORT_SYMBOL(try_to_del_timer_sync); | 1037 | EXPORT_SYMBOL(try_to_del_timer_sync); | 
| 1028 | 1038 | ||
| 1029 | #ifdef CONFIG_SMP | 1039 | #ifdef CONFIG_SMP | 
| 1040 | static DEFINE_PER_CPU(struct tvec_base, __tvec_bases); | ||
| 1041 | |||
| 1030 | /** | 1042 | /** | 
| 1031 | * del_timer_sync - deactivate a timer and wait for the handler to finish. | 1043 | * del_timer_sync - deactivate a timer and wait for the handler to finish. | 
| 1032 | * @timer: the timer to be deactivated | 1044 | * @timer: the timer to be deactivated | 
| @@ -1532,64 +1544,6 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) | |||
| 1532 | } | 1544 | } | 
| 1533 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); | 1545 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); | 
| 1534 | 1546 | ||
| 1535 | static int init_timers_cpu(int cpu) | ||
| 1536 | { | ||
| 1537 | int j; | ||
| 1538 | struct tvec_base *base; | ||
| 1539 | static char tvec_base_done[NR_CPUS]; | ||
| 1540 | |||
| 1541 | if (!tvec_base_done[cpu]) { | ||
| 1542 | static char boot_done; | ||
| 1543 | |||
| 1544 | if (boot_done) { | ||
| 1545 | /* | ||
| 1546 | * The APs use this path later in boot | ||
| 1547 | */ | ||
| 1548 | base = kzalloc_node(sizeof(*base), GFP_KERNEL, | ||
| 1549 | cpu_to_node(cpu)); | ||
| 1550 | if (!base) | ||
| 1551 | return -ENOMEM; | ||
| 1552 | |||
| 1553 | /* Make sure tvec_base has TIMER_FLAG_MASK bits free */ | ||
| 1554 | if (WARN_ON(base != tbase_get_base(base))) { | ||
| 1555 | kfree(base); | ||
| 1556 | return -ENOMEM; | ||
| 1557 | } | ||
| 1558 | per_cpu(tvec_bases, cpu) = base; | ||
| 1559 | } else { | ||
| 1560 | /* | ||
| 1561 | * This is for the boot CPU - we use compile-time | ||
| 1562 | * static initialisation because per-cpu memory isn't | ||
| 1563 | * ready yet and because the memory allocators are not | ||
| 1564 | * initialised either. | ||
| 1565 | */ | ||
| 1566 | boot_done = 1; | ||
| 1567 | base = &boot_tvec_bases; | ||
| 1568 | } | ||
| 1569 | spin_lock_init(&base->lock); | ||
| 1570 | tvec_base_done[cpu] = 1; | ||
| 1571 | base->cpu = cpu; | ||
| 1572 | } else { | ||
| 1573 | base = per_cpu(tvec_bases, cpu); | ||
| 1574 | } | ||
| 1575 | |||
| 1576 | |||
| 1577 | for (j = 0; j < TVN_SIZE; j++) { | ||
| 1578 | INIT_LIST_HEAD(base->tv5.vec + j); | ||
| 1579 | INIT_LIST_HEAD(base->tv4.vec + j); | ||
| 1580 | INIT_LIST_HEAD(base->tv3.vec + j); | ||
| 1581 | INIT_LIST_HEAD(base->tv2.vec + j); | ||
| 1582 | } | ||
| 1583 | for (j = 0; j < TVR_SIZE; j++) | ||
| 1584 | INIT_LIST_HEAD(base->tv1.vec + j); | ||
| 1585 | |||
| 1586 | base->timer_jiffies = jiffies; | ||
| 1587 | base->next_timer = base->timer_jiffies; | ||
| 1588 | base->active_timers = 0; | ||
| 1589 | base->all_timers = 0; | ||
| 1590 | return 0; | ||
| 1591 | } | ||
| 1592 | |||
| 1593 | #ifdef CONFIG_HOTPLUG_CPU | 1547 | #ifdef CONFIG_HOTPLUG_CPU | 
| 1594 | static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) | 1548 | static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) | 
| 1595 | { | 1549 | { | 
| @@ -1631,55 +1585,86 @@ static void migrate_timers(int cpu) | |||
| 1631 | migrate_timer_list(new_base, old_base->tv5.vec + i); | 1585 | migrate_timer_list(new_base, old_base->tv5.vec + i); | 
| 1632 | } | 1586 | } | 
| 1633 | 1587 | ||
| 1588 | old_base->active_timers = 0; | ||
| 1589 | old_base->all_timers = 0; | ||
| 1590 | |||
| 1634 | spin_unlock(&old_base->lock); | 1591 | spin_unlock(&old_base->lock); | 
| 1635 | spin_unlock_irq(&new_base->lock); | 1592 | spin_unlock_irq(&new_base->lock); | 
| 1636 | put_cpu_var(tvec_bases); | 1593 | put_cpu_var(tvec_bases); | 
| 1637 | } | 1594 | } | 
| 1638 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
| 1639 | 1595 | ||
| 1640 | static int timer_cpu_notify(struct notifier_block *self, | 1596 | static int timer_cpu_notify(struct notifier_block *self, | 
| 1641 | unsigned long action, void *hcpu) | 1597 | unsigned long action, void *hcpu) | 
| 1642 | { | 1598 | { | 
| 1643 | long cpu = (long)hcpu; | 1599 | switch (action) { | 
| 1644 | int err; | ||
| 1645 | |||
| 1646 | switch(action) { | ||
| 1647 | case CPU_UP_PREPARE: | ||
| 1648 | case CPU_UP_PREPARE_FROZEN: | ||
| 1649 | err = init_timers_cpu(cpu); | ||
| 1650 | if (err < 0) | ||
| 1651 | return notifier_from_errno(err); | ||
| 1652 | break; | ||
| 1653 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 1654 | case CPU_DEAD: | 1600 | case CPU_DEAD: | 
| 1655 | case CPU_DEAD_FROZEN: | 1601 | case CPU_DEAD_FROZEN: | 
| 1656 | migrate_timers(cpu); | 1602 | migrate_timers((long)hcpu); | 
| 1657 | break; | 1603 | break; | 
| 1658 | #endif | ||
| 1659 | default: | 1604 | default: | 
| 1660 | break; | 1605 | break; | 
| 1661 | } | 1606 | } | 
| 1607 | |||
| 1662 | return NOTIFY_OK; | 1608 | return NOTIFY_OK; | 
| 1663 | } | 1609 | } | 
| 1664 | 1610 | ||
| 1665 | static struct notifier_block timers_nb = { | 1611 | static inline void timer_register_cpu_notifier(void) | 
| 1666 | .notifier_call = timer_cpu_notify, | 1612 | { | 
| 1667 | }; | 1613 | cpu_notifier(timer_cpu_notify, 0); | 
| 1614 | } | ||
| 1615 | #else | ||
| 1616 | static inline void timer_register_cpu_notifier(void) { } | ||
| 1617 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
| 1668 | 1618 | ||
| 1619 | static void __init init_timer_cpu(struct tvec_base *base, int cpu) | ||
| 1620 | { | ||
| 1621 | int j; | ||
| 1669 | 1622 | ||
| 1670 | void __init init_timers(void) | 1623 | BUG_ON(base != tbase_get_base(base)); | 
| 1624 | |||
| 1625 | base->cpu = cpu; | ||
| 1626 | per_cpu(tvec_bases, cpu) = base; | ||
| 1627 | spin_lock_init(&base->lock); | ||
| 1628 | |||
| 1629 | for (j = 0; j < TVN_SIZE; j++) { | ||
| 1630 | INIT_LIST_HEAD(base->tv5.vec + j); | ||
| 1631 | INIT_LIST_HEAD(base->tv4.vec + j); | ||
| 1632 | INIT_LIST_HEAD(base->tv3.vec + j); | ||
| 1633 | INIT_LIST_HEAD(base->tv2.vec + j); | ||
| 1634 | } | ||
| 1635 | for (j = 0; j < TVR_SIZE; j++) | ||
| 1636 | INIT_LIST_HEAD(base->tv1.vec + j); | ||
| 1637 | |||
| 1638 | base->timer_jiffies = jiffies; | ||
| 1639 | base->next_timer = base->timer_jiffies; | ||
| 1640 | } | ||
| 1641 | |||
| 1642 | static void __init init_timer_cpus(void) | ||
| 1671 | { | 1643 | { | 
| 1672 | int err; | 1644 | struct tvec_base *base; | 
| 1645 | int local_cpu = smp_processor_id(); | ||
| 1646 | int cpu; | ||
| 1673 | 1647 | ||
| 1648 | for_each_possible_cpu(cpu) { | ||
| 1649 | if (cpu == local_cpu) | ||
| 1650 | base = &boot_tvec_bases; | ||
| 1651 | #ifdef CONFIG_SMP | ||
| 1652 | else | ||
| 1653 | base = per_cpu_ptr(&__tvec_bases, cpu); | ||
| 1654 | #endif | ||
| 1655 | |||
| 1656 | init_timer_cpu(base, cpu); | ||
| 1657 | } | ||
| 1658 | } | ||
| 1659 | |||
| 1660 | void __init init_timers(void) | ||
| 1661 | { | ||
| 1674 | /* ensure there are enough low bits for flags in timer->base pointer */ | 1662 | /* ensure there are enough low bits for flags in timer->base pointer */ | 
| 1675 | BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK); | 1663 | BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK); | 
| 1676 | 1664 | ||
| 1677 | err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, | 1665 | init_timer_cpus(); | 
| 1678 | (void *)(long)smp_processor_id()); | ||
| 1679 | BUG_ON(err != NOTIFY_OK); | ||
| 1680 | |||
| 1681 | init_timer_stats(); | 1666 | init_timer_stats(); | 
| 1682 | register_cpu_notifier(&timers_nb); | 1667 | timer_register_cpu_notifier(); | 
| 1683 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq); | 1668 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq); | 
| 1684 | } | 1669 | } | 
| 1685 | 1670 | ||
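The BUILD_BUG_ON on the tvec_base alignment and the BUG_ON(base != tbase_get_base(base)) in init_timer_cpu() exist because timer->base stores flag bits in the low bits of an aligned tvec_base pointer. A stand-alone sketch of that tagging trick; the mask and flag values are illustrative, not the kernel's TIMER_FLAG_MASK:

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	#define FLAG_MASK 0x3UL     /* illustrative: two low bits reserved for flags */

	struct base { int cpu; } __attribute__((aligned(4)));

	static struct base *get_base(struct base *tagged)
	{
		return (struct base *)((uintptr_t)tagged & ~FLAG_MASK);
	}

	static struct base *set_flag(struct base *b, unsigned long flag)
	{
		return (struct base *)((uintptr_t)b | (flag & FLAG_MASK));
	}

	int main(void)
	{
		static struct base b = { .cpu = 0 };

		/* alignment guarantees the low bits are free, mirroring the BUILD_BUG_ON */
		assert(((uintptr_t)&b & FLAG_MASK) == 0);

		struct base *tagged = set_flag(&b, 0x1);   /* e.g. a "deferrable" marker */
		printf("flags=%lu cpu=%d\n",
		       (unsigned long)((uintptr_t)tagged & FLAG_MASK), get_base(tagged)->cpu);
		return 0;
	}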
| diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 61ed862cdd37..e878c2e0ba45 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c | |||
| @@ -16,10 +16,10 @@ | |||
| 16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> | 
| 17 | #include <linux/seq_file.h> | 17 | #include <linux/seq_file.h> | 
| 18 | #include <linux/kallsyms.h> | 18 | #include <linux/kallsyms.h> | 
| 19 | #include <linux/tick.h> | ||
| 20 | 19 | ||
| 21 | #include <asm/uaccess.h> | 20 | #include <asm/uaccess.h> | 
| 22 | 21 | ||
| 22 | #include "tick-internal.h" | ||
| 23 | 23 | ||
| 24 | struct timer_list_iter { | 24 | struct timer_list_iter { | 
| 25 | int cpu; | 25 | int cpu; | 
| @@ -228,9 +228,35 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) | |||
| 228 | print_name_offset(m, dev->set_next_event); | 228 | print_name_offset(m, dev->set_next_event); | 
| 229 | SEQ_printf(m, "\n"); | 229 | SEQ_printf(m, "\n"); | 
| 230 | 230 | ||
| 231 | SEQ_printf(m, " set_mode: "); | 231 | if (dev->set_mode) { | 
| 232 | print_name_offset(m, dev->set_mode); | 232 | SEQ_printf(m, " set_mode: "); | 
| 233 | SEQ_printf(m, "\n"); | 233 | print_name_offset(m, dev->set_mode); | 
| 234 | SEQ_printf(m, "\n"); | ||
| 235 | } else { | ||
| 236 | if (dev->set_state_shutdown) { | ||
| 237 | SEQ_printf(m, " shutdown: "); | ||
| 238 | print_name_offset(m, dev->set_state_shutdown); | ||
| 239 | SEQ_printf(m, "\n"); | ||
| 240 | } | ||
| 241 | |||
| 242 | if (dev->set_state_periodic) { | ||
| 243 | SEQ_printf(m, " periodic: "); | ||
| 244 | print_name_offset(m, dev->set_state_periodic); | ||
| 245 | SEQ_printf(m, "\n"); | ||
| 246 | } | ||
| 247 | |||
| 248 | if (dev->set_state_oneshot) { | ||
| 249 | SEQ_printf(m, " oneshot: "); | ||
| 250 | print_name_offset(m, dev->set_state_oneshot); | ||
| 251 | SEQ_printf(m, "\n"); | ||
| 252 | } | ||
| 253 | |||
| 254 | if (dev->tick_resume) { | ||
| 255 | SEQ_printf(m, " resume: "); | ||
| 256 | print_name_offset(m, dev->tick_resume); | ||
| 257 | SEQ_printf(m, "\n"); | ||
| 258 | } | ||
| 259 | } | ||
| 234 | 260 | ||
| 235 | SEQ_printf(m, " event_handler: "); | 261 | SEQ_printf(m, " event_handler: "); | 
| 236 | print_name_offset(m, dev->event_handler); | 262 | print_name_offset(m, dev->event_handler); | 
| diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a5da09c899dd..3b9a48ae153a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
| @@ -432,6 +432,14 @@ config UPROBE_EVENT | |||
| 432 | This option is required if you plan to use perf-probe subcommand | 432 | This option is required if you plan to use perf-probe subcommand | 
| 433 | of perf tools on user space applications. | 433 | of perf tools on user space applications. | 
| 434 | 434 | ||
| 435 | config BPF_EVENTS | ||
| 436 | depends on BPF_SYSCALL | ||
| 437 | depends on KPROBE_EVENT | ||
| 438 | bool | ||
| 439 | default y | ||
| 440 | help | ||
| 441 | This allows the user to attach BPF programs to kprobe events. | ||
| 442 | |||
| 435 | config PROBE_EVENTS | 443 | config PROBE_EVENTS | 
| 436 | def_bool n | 444 | def_bool n | 
| 437 | 445 | ||
| @@ -599,6 +607,34 @@ config RING_BUFFER_STARTUP_TEST | |||
| 599 | 607 | ||
| 600 | If unsure, say N | 608 | If unsure, say N | 
| 601 | 609 | ||
| 610 | config TRACE_ENUM_MAP_FILE | ||
| 611 | bool "Show enum mappings for trace events" | ||
| 612 | depends on TRACING | ||
| 613 | help | ||
| 614 | The "print fmt" of the trace events will show the enum names instead | ||
| 615 | of their values. This can cause problems for user space tools that | ||
| 616 | use this string to parse the raw data as user space does not know | ||
| 617 | how to convert the string to its value. | ||
| 618 | |||
| 619 | To fix this, there's a special macro in the kernel that can be used | ||
| 620 | to convert the enum into its value. If this macro is used, then the | ||
| 621 | print fmt strings will have the enums converted to their values. | ||
| 622 | |||
| 623 | If something does not get converted properly, this option can be | ||
| 624 | used to show what enums the kernel tried to convert. | ||
| 625 | |||
| 626 | This option is for debugging the enum conversions. A file is created | ||
| 627 | in the tracing directory called "enum_map" that will show the enum | ||
| 628 | names matched with their values and what trace event system they | ||
| 629 | belong to. | ||
| 630 | |||
| 631 | Normally, the mapping of the strings to values will be freed after | ||
| 632 | boot up or module load. With this option, they will not be freed, as | ||
| 633 | they are needed for the "enum_map" file. Enabling this option will | ||
| 634 | increase the memory footprint of the running kernel. | ||
| 635 | |||
| 636 | If unsure, say N | ||
| 637 | |||
| 602 | endif # FTRACE | 638 | endif # FTRACE | 
| 603 | 639 | ||
| 604 | endif # TRACING_SUPPORT | 640 | endif # TRACING_SUPPORT | 
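The "special macro" this help text alludes to is TRACE_DEFINE_ENUM(). A fragment of a hypothetical event header using it together with __print_symbolic(); every event, enum and field name below is invented, and the TRACE_SYSTEM/include-guard boilerplate a real trace header needs is omitted:

	/* Hypothetical fragment: TRACE_DEFINE_ENUM() records the enum so the
	 * value, not the name, ends up in the event's "print fmt" string.
	 */
	#include <linux/tracepoint.h>

	enum foo_state {
		FOO_IDLE,
		FOO_BUSY,
	};

	TRACE_DEFINE_ENUM(FOO_IDLE);
	TRACE_DEFINE_ENUM(FOO_BUSY);

	TRACE_EVENT(foo_state_change,
		TP_PROTO(int state),
		TP_ARGS(state),
		TP_STRUCT__entry(
			__field(int, state)
		),
		TP_fast_assign(
			__entry->state = state;
		),
		TP_printk("state=%s",
			  __print_symbolic(__entry->state,
					   { FOO_IDLE, "idle" },
					   { FOO_BUSY, "busy" }))
	);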
| diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 98f26588255e..9b1044e936a6 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
| @@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o | |||
| 53 | endif | 53 | endif | 
| 54 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o | 54 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o | 
| 55 | obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o | 55 | obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o | 
| 56 | obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o | ||
| 56 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o | 57 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o | 
| 57 | obj-$(CONFIG_TRACEPOINTS) += power-traces.o | 58 | obj-$(CONFIG_TRACEPOINTS) += power-traces.o | 
| 58 | ifeq ($(CONFIG_PM),y) | 59 | ifeq ($(CONFIG_PM),y) | 
| diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c new file mode 100644 index 000000000000..2d56ce501632 --- /dev/null +++ b/kernel/trace/bpf_trace.c | |||
| @@ -0,0 +1,222 @@ | |||
| 1 | /* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com | ||
| 2 | * | ||
| 3 | * This program is free software; you can redistribute it and/or | ||
| 4 | * modify it under the terms of version 2 of the GNU General Public | ||
| 5 | * License as published by the Free Software Foundation. | ||
| 6 | */ | ||
| 7 | #include <linux/kernel.h> | ||
| 8 | #include <linux/types.h> | ||
| 9 | #include <linux/slab.h> | ||
| 10 | #include <linux/bpf.h> | ||
| 11 | #include <linux/filter.h> | ||
| 12 | #include <linux/uaccess.h> | ||
| 13 | #include <linux/ctype.h> | ||
| 14 | #include "trace.h" | ||
| 15 | |||
| 16 | static DEFINE_PER_CPU(int, bpf_prog_active); | ||
| 17 | |||
| 18 | /** | ||
| 19 | * trace_call_bpf - invoke BPF program | ||
| 20 | * @prog: BPF program | ||
| 21 | * @ctx: opaque context pointer | ||
| 22 | * | ||
| 23 | * kprobe handlers execute BPF programs via this helper. | ||
| 24 | * Can be used from static tracepoints in the future. | ||
| 25 | * | ||
| 26 | * Return: BPF programs always return an integer which is interpreted by | ||
| 27 | * kprobe handler as: | ||
| 28 | * 0 - return from kprobe (event is filtered out) | ||
| 29 | * 1 - store kprobe event into ring buffer | ||
| 30 | * Other values are reserved and currently alias to 1 | ||
| 31 | */ | ||
| 32 | unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) | ||
| 33 | { | ||
| 34 | unsigned int ret; | ||
| 35 | |||
| 36 | if (in_nmi()) /* not supported yet */ | ||
| 37 | return 1; | ||
| 38 | |||
| 39 | preempt_disable(); | ||
| 40 | |||
| 41 | if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { | ||
| 42 | /* | ||
| 43 | * since some bpf program is already running on this cpu, | ||
| 44 | * don't call into another bpf program (same or different) | ||
| 45 | * and don't send kprobe event into ring-buffer, | ||
| 46 | * so return zero here | ||
| 47 | */ | ||
| 48 | ret = 0; | ||
| 49 | goto out; | ||
| 50 | } | ||
| 51 | |||
| 52 | rcu_read_lock(); | ||
| 53 | ret = BPF_PROG_RUN(prog, ctx); | ||
| 54 | rcu_read_unlock(); | ||
| 55 | |||
| 56 | out: | ||
| 57 | __this_cpu_dec(bpf_prog_active); | ||
| 58 | preempt_enable(); | ||
| 59 | |||
| 60 | return ret; | ||
| 61 | } | ||
| 62 | EXPORT_SYMBOL_GPL(trace_call_bpf); | ||
| 63 | |||
| 64 | static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | ||
| 65 | { | ||
| 66 | void *dst = (void *) (long) r1; | ||
| 67 | int size = (int) r2; | ||
| 68 | void *unsafe_ptr = (void *) (long) r3; | ||
| 69 | |||
| 70 | return probe_kernel_read(dst, unsafe_ptr, size); | ||
| 71 | } | ||
| 72 | |||
| 73 | static const struct bpf_func_proto bpf_probe_read_proto = { | ||
| 74 | .func = bpf_probe_read, | ||
| 75 | .gpl_only = true, | ||
| 76 | .ret_type = RET_INTEGER, | ||
| 77 | .arg1_type = ARG_PTR_TO_STACK, | ||
| 78 | .arg2_type = ARG_CONST_STACK_SIZE, | ||
| 79 | .arg3_type = ARG_ANYTHING, | ||
| 80 | }; | ||
| 81 | |||
| 82 | static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | ||
| 83 | { | ||
| 84 | /* NMI safe access to clock monotonic */ | ||
| 85 | return ktime_get_mono_fast_ns(); | ||
| 86 | } | ||
| 87 | |||
| 88 | static const struct bpf_func_proto bpf_ktime_get_ns_proto = { | ||
| 89 | .func = bpf_ktime_get_ns, | ||
| 90 | .gpl_only = true, | ||
| 91 | .ret_type = RET_INTEGER, | ||
| 92 | }; | ||
| 93 | |||
| 94 | /* | ||
| 95 | * limited trace_printk() | ||
| 96 | * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed | ||
| 97 | */ | ||
| 98 | static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) | ||
| 99 | { | ||
| 100 | char *fmt = (char *) (long) r1; | ||
| 101 | int mod[3] = {}; | ||
| 102 | int fmt_cnt = 0; | ||
| 103 | int i; | ||
| 104 | |||
| 105 | /* | ||
| 106 | * bpf_check()->check_func_arg()->check_stack_boundary() | ||
| 107 | * guarantees that fmt points to bpf program stack, | ||
| 108 | * fmt_size bytes of it were initialized and fmt_size > 0 | ||
| 109 | */ | ||
| 110 | if (fmt[--fmt_size] != 0) | ||
| 111 | return -EINVAL; | ||
| 112 | |||
| 113 | /* check format string for allowed specifiers */ | ||
| 114 | for (i = 0; i < fmt_size; i++) { | ||
| 115 | if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) | ||
| 116 | return -EINVAL; | ||
| 117 | |||
| 118 | if (fmt[i] != '%') | ||
| 119 | continue; | ||
| 120 | |||
| 121 | if (fmt_cnt >= 3) | ||
| 122 | return -EINVAL; | ||
| 123 | |||
| 124 | /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ | ||
| 125 | i++; | ||
| 126 | if (fmt[i] == 'l') { | ||
| 127 | mod[fmt_cnt]++; | ||
| 128 | i++; | ||
| 129 | } else if (fmt[i] == 'p') { | ||
| 130 | mod[fmt_cnt]++; | ||
| 131 | i++; | ||
| 132 | if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0) | ||
| 133 | return -EINVAL; | ||
| 134 | fmt_cnt++; | ||
| 135 | continue; | ||
| 136 | } | ||
| 137 | |||
| 138 | if (fmt[i] == 'l') { | ||
| 139 | mod[fmt_cnt]++; | ||
| 140 | i++; | ||
| 141 | } | ||
| 142 | |||
| 143 | if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x') | ||
| 144 | return -EINVAL; | ||
| 145 | fmt_cnt++; | ||
| 146 | } | ||
| 147 | |||
| 148 | return __trace_printk(1/* fake ip will not be printed */, fmt, | ||
| 149 | mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3, | ||
| 150 | mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4, | ||
| 151 | mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5); | ||
| 152 | } | ||
| 153 | |||
| 154 | static const struct bpf_func_proto bpf_trace_printk_proto = { | ||
| 155 | .func = bpf_trace_printk, | ||
| 156 | .gpl_only = true, | ||
| 157 | .ret_type = RET_INTEGER, | ||
| 158 | .arg1_type = ARG_PTR_TO_STACK, | ||
| 159 | .arg2_type = ARG_CONST_STACK_SIZE, | ||
| 160 | }; | ||
| 161 | |||
| 162 | static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) | ||
| 163 | { | ||
| 164 | switch (func_id) { | ||
| 165 | case BPF_FUNC_map_lookup_elem: | ||
| 166 | return &bpf_map_lookup_elem_proto; | ||
| 167 | case BPF_FUNC_map_update_elem: | ||
| 168 | return &bpf_map_update_elem_proto; | ||
| 169 | case BPF_FUNC_map_delete_elem: | ||
| 170 | return &bpf_map_delete_elem_proto; | ||
| 171 | case BPF_FUNC_probe_read: | ||
| 172 | return &bpf_probe_read_proto; | ||
| 173 | case BPF_FUNC_ktime_get_ns: | ||
| 174 | return &bpf_ktime_get_ns_proto; | ||
| 175 | |||
| 176 | case BPF_FUNC_trace_printk: | ||
| 177 | /* | ||
| 178 | * this program might be calling bpf_trace_printk, | ||
| 179 | * so allocate per-cpu printk buffers | ||
| 180 | */ | ||
| 181 | trace_printk_init_buffers(); | ||
| 182 | |||
| 183 | return &bpf_trace_printk_proto; | ||
| 184 | default: | ||
| 185 | return NULL; | ||
| 186 | } | ||
| 187 | } | ||
| 188 | |||
| 189 | /* bpf+kprobe programs can access fields of 'struct pt_regs' */ | ||
| 190 | static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type) | ||
| 191 | { | ||
| 192 | /* check bounds */ | ||
| 193 | if (off < 0 || off >= sizeof(struct pt_regs)) | ||
| 194 | return false; | ||
| 195 | |||
| 196 | /* only read is allowed */ | ||
| 197 | if (type != BPF_READ) | ||
| 198 | return false; | ||
| 199 | |||
| 200 | /* disallow misaligned access */ | ||
| 201 | if (off % size != 0) | ||
| 202 | return false; | ||
| 203 | |||
| 204 | return true; | ||
| 205 | } | ||
| 206 | |||
| 207 | static struct bpf_verifier_ops kprobe_prog_ops = { | ||
| 208 | .get_func_proto = kprobe_prog_func_proto, | ||
| 209 | .is_valid_access = kprobe_prog_is_valid_access, | ||
| 210 | }; | ||
| 211 | |||
| 212 | static struct bpf_prog_type_list kprobe_tl = { | ||
| 213 | .ops = &kprobe_prog_ops, | ||
| 214 | .type = BPF_PROG_TYPE_KPROBE, | ||
| 215 | }; | ||
| 216 | |||
| 217 | static int __init register_kprobe_prog_ops(void) | ||
| 218 | { | ||
| 219 | bpf_register_prog_type(&kprobe_tl); | ||
| 220 | return 0; | ||
| 221 | } | ||
| 222 | late_initcall(register_kprobe_prog_ops); | ||
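A user-space re-implementation of the format-string check in bpf_trace_printk() above, handy for seeing which format strings the helper accepts. -EINVAL is modelled as -1 and the sample strings are arbitrary:

	#include <ctype.h>
	#include <stdio.h>
	#include <string.h>

	static int check_fmt(const char *fmt, int fmt_size)
	{
		int mod[3] = {0};   /* counts 'l' modifiers per argument, as the helper does */
		int fmt_cnt = 0;
		int i;

		if (fmt[--fmt_size] != 0)
			return -1;

		for (i = 0; i < fmt_size; i++) {
			if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
				return -1;
			if (fmt[i] != '%')
				continue;
			if (fmt_cnt >= 3)
				return -1;
			i++;
			if (fmt[i] == 'l') {
				mod[fmt_cnt]++;
				i++;
			} else if (fmt[i] == 'p') {
				mod[fmt_cnt]++;
				i++;
				if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0)
					return -1;
				fmt_cnt++;
				continue;
			}
			if (fmt[i] == 'l') {
				mod[fmt_cnt]++;
				i++;
			}
			if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x')
				return -1;
			fmt_cnt++;
		}
		return 0;
	}

	int main(void)
	{
		const char *samples[] = { "cnt=%d", "%llx %p", "%s", "%d %d %d %d" };
		for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
			printf("%-16s -> %s\n", samples[i],
			       check_fmt(samples[i], strlen(samples[i]) + 1) ? "rejected" : "ok");
		return 0;
	}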
| diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4f228024055b..02bece4a99ea 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
| @@ -18,7 +18,7 @@ | |||
| 18 | #include <linux/kallsyms.h> | 18 | #include <linux/kallsyms.h> | 
| 19 | #include <linux/seq_file.h> | 19 | #include <linux/seq_file.h> | 
| 20 | #include <linux/suspend.h> | 20 | #include <linux/suspend.h> | 
| 21 | #include <linux/debugfs.h> | 21 | #include <linux/tracefs.h> | 
| 22 | #include <linux/hardirq.h> | 22 | #include <linux/hardirq.h> | 
| 23 | #include <linux/kthread.h> | 23 | #include <linux/kthread.h> | 
| 24 | #include <linux/uaccess.h> | 24 | #include <linux/uaccess.h> | 
| @@ -249,6 +249,19 @@ static void update_function_graph_func(void); | |||
| 249 | static inline void update_function_graph_func(void) { } | 249 | static inline void update_function_graph_func(void) { } | 
| 250 | #endif | 250 | #endif | 
| 251 | 251 | ||
| 252 | |||
| 253 | static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops) | ||
| 254 | { | ||
| 255 | /* | ||
| 256 | * If this is a dynamic ops or we force list func, | ||
| 257 | * then it needs to call the list anyway. | ||
| 258 | */ | ||
| 259 | if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC) | ||
| 260 | return ftrace_ops_list_func; | ||
| 261 | |||
| 262 | return ftrace_ops_get_func(ops); | ||
| 263 | } | ||
| 264 | |||
| 252 | static void update_ftrace_function(void) | 265 | static void update_ftrace_function(void) | 
| 253 | { | 266 | { | 
| 254 | ftrace_func_t func; | 267 | ftrace_func_t func; | 
| @@ -270,7 +283,7 @@ static void update_ftrace_function(void) | |||
| 270 | * then have the mcount trampoline call the function directly. | 283 | * then have the mcount trampoline call the function directly. | 
| 271 | */ | 284 | */ | 
| 272 | } else if (ftrace_ops_list->next == &ftrace_list_end) { | 285 | } else if (ftrace_ops_list->next == &ftrace_list_end) { | 
| 273 | func = ftrace_ops_get_func(ftrace_ops_list); | 286 | func = ftrace_ops_get_list_func(ftrace_ops_list); | 
| 274 | 287 | ||
| 275 | } else { | 288 | } else { | 
| 276 | /* Just use the default ftrace_ops */ | 289 | /* Just use the default ftrace_ops */ | 
| @@ -1008,7 +1021,7 @@ static struct tracer_stat function_stats __initdata = { | |||
| 1008 | .stat_show = function_stat_show | 1021 | .stat_show = function_stat_show | 
| 1009 | }; | 1022 | }; | 
| 1010 | 1023 | ||
| 1011 | static __init void ftrace_profile_debugfs(struct dentry *d_tracer) | 1024 | static __init void ftrace_profile_tracefs(struct dentry *d_tracer) | 
| 1012 | { | 1025 | { | 
| 1013 | struct ftrace_profile_stat *stat; | 1026 | struct ftrace_profile_stat *stat; | 
| 1014 | struct dentry *entry; | 1027 | struct dentry *entry; | 
| @@ -1044,15 +1057,15 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer) | |||
| 1044 | } | 1057 | } | 
| 1045 | } | 1058 | } | 
| 1046 | 1059 | ||
| 1047 | entry = debugfs_create_file("function_profile_enabled", 0644, | 1060 | entry = tracefs_create_file("function_profile_enabled", 0644, | 
| 1048 | d_tracer, NULL, &ftrace_profile_fops); | 1061 | d_tracer, NULL, &ftrace_profile_fops); | 
| 1049 | if (!entry) | 1062 | if (!entry) | 
| 1050 | pr_warning("Could not create debugfs " | 1063 | pr_warning("Could not create tracefs " | 
| 1051 | "'function_profile_enabled' entry\n"); | 1064 | "'function_profile_enabled' entry\n"); | 
| 1052 | } | 1065 | } | 
| 1053 | 1066 | ||
| 1054 | #else /* CONFIG_FUNCTION_PROFILER */ | 1067 | #else /* CONFIG_FUNCTION_PROFILER */ | 
| 1055 | static __init void ftrace_profile_debugfs(struct dentry *d_tracer) | 1068 | static __init void ftrace_profile_tracefs(struct dentry *d_tracer) | 
| 1056 | { | 1069 | { | 
| 1057 | } | 1070 | } | 
| 1058 | #endif /* CONFIG_FUNCTION_PROFILER */ | 1071 | #endif /* CONFIG_FUNCTION_PROFILER */ | 
| @@ -4712,7 +4725,7 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops) | |||
| 4712 | mutex_unlock(&ftrace_lock); | 4725 | mutex_unlock(&ftrace_lock); | 
| 4713 | } | 4726 | } | 
| 4714 | 4727 | ||
| 4715 | static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) | 4728 | static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer) | 
| 4716 | { | 4729 | { | 
| 4717 | 4730 | ||
| 4718 | trace_create_file("available_filter_functions", 0444, | 4731 | trace_create_file("available_filter_functions", 0444, | 
| @@ -5020,7 +5033,7 @@ static int __init ftrace_nodyn_init(void) | |||
| 5020 | } | 5033 | } | 
| 5021 | core_initcall(ftrace_nodyn_init); | 5034 | core_initcall(ftrace_nodyn_init); | 
| 5022 | 5035 | ||
| 5023 | static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } | 5036 | static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; } | 
| 5024 | static inline void ftrace_startup_enable(int command) { } | 5037 | static inline void ftrace_startup_enable(int command) { } | 
| 5025 | static inline void ftrace_startup_all(int command) { } | 5038 | static inline void ftrace_startup_all(int command) { } | 
| 5026 | /* Keep as macros so we do not need to define the commands */ | 5039 | /* Keep as macros so we do not need to define the commands */ | 
| @@ -5209,13 +5222,6 @@ static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, | |||
| 5209 | ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) | 5222 | ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) | 
| 5210 | { | 5223 | { | 
| 5211 | /* | 5224 | /* | 
| 5212 | * If this is a dynamic ops or we force list func, | ||
| 5213 | * then it needs to call the list anyway. | ||
| 5214 | */ | ||
| 5215 | if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC) | ||
| 5216 | return ftrace_ops_list_func; | ||
| 5217 | |||
| 5218 | /* | ||
| 5219 | * If the func handles its own recursion, call it directly. | 5225 | * If the func handles its own recursion, call it directly. | 
| 5220 | * Otherwise call the recursion protected function that | 5226 | * Otherwise call the recursion protected function that | 
| 5221 | * will call the ftrace ops function. | 5227 | * will call the ftrace ops function. | 
| @@ -5473,7 +5479,7 @@ static const struct file_operations ftrace_pid_fops = { | |||
| 5473 | .release = ftrace_pid_release, | 5479 | .release = ftrace_pid_release, | 
| 5474 | }; | 5480 | }; | 
| 5475 | 5481 | ||
| 5476 | static __init int ftrace_init_debugfs(void) | 5482 | static __init int ftrace_init_tracefs(void) | 
| 5477 | { | 5483 | { | 
| 5478 | struct dentry *d_tracer; | 5484 | struct dentry *d_tracer; | 
| 5479 | 5485 | ||
| @@ -5481,16 +5487,16 @@ static __init int ftrace_init_debugfs(void) | |||
| 5481 | if (IS_ERR(d_tracer)) | 5487 | if (IS_ERR(d_tracer)) | 
| 5482 | return 0; | 5488 | return 0; | 
| 5483 | 5489 | ||
| 5484 | ftrace_init_dyn_debugfs(d_tracer); | 5490 | ftrace_init_dyn_tracefs(d_tracer); | 
| 5485 | 5491 | ||
| 5486 | trace_create_file("set_ftrace_pid", 0644, d_tracer, | 5492 | trace_create_file("set_ftrace_pid", 0644, d_tracer, | 
| 5487 | NULL, &ftrace_pid_fops); | 5493 | NULL, &ftrace_pid_fops); | 
| 5488 | 5494 | ||
| 5489 | ftrace_profile_debugfs(d_tracer); | 5495 | ftrace_profile_tracefs(d_tracer); | 
| 5490 | 5496 | ||
| 5491 | return 0; | 5497 | return 0; | 
| 5492 | } | 5498 | } | 
| 5493 | fs_initcall(ftrace_init_debugfs); | 5499 | fs_initcall(ftrace_init_tracefs); | 
| 5494 | 5500 | ||
| 5495 | /** | 5501 | /** | 
| 5496 | * ftrace_kill - kill ftrace | 5502 | * ftrace_kill - kill ftrace | 
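The ftrace.c hunks above move the "dynamic ops must go through the list function" test out of ftrace_ops_get_func() into the new ftrace_ops_get_list_func() helper used by update_ftrace_function(). A reduced model of the resulting dispatch decision; every name and flag value below is a stand-in, and the recursion guard is deliberately crude:

	#include <stdbool.h>
	#include <stdio.h>

	struct ops;
	typedef void (*trace_func_t)(unsigned long ip, struct ops *op);

	struct ops {
		unsigned int flags;
		trace_func_t func;
	};

	#define FL_DYNAMIC        0x1
	#define FL_RECURSION_SAFE 0x2

	static void list_func(unsigned long ip, struct ops *op)
	{
		/* would walk every registered ops; here there is just one */
		printf("list walk   ip=%#lx\n", ip);
		op->func(ip, op);
	}

	static int in_handler;            /* crude stand-in for the recursion test */

	static void recurs_wrapper(unsigned long ip, struct ops *op)
	{
		if (in_handler++)
			goto out;             /* already tracing: drop the nested call */
		op->func(ip, op);
	out:
		in_handler--;
	}

	static void my_handler(unsigned long ip, struct ops *op)
	{
		(void)op;
		printf("handler     ip=%#lx\n", ip);
	}

	static trace_func_t get_func(struct ops *op)
	{
		/* handler copes with recursion itself: call it directly */
		return (op->flags & FL_RECURSION_SAFE) ? op->func : recurs_wrapper;
	}

	static trace_func_t get_list_func(struct ops *op, bool force_list)
	{
		/* dynamic ops (or a forced build option) always take the list walker */
		return ((op->flags & FL_DYNAMIC) || force_list) ? list_func : get_func(op);
	}

	int main(void)
	{
		struct ops direct  = { .flags = FL_RECURSION_SAFE, .func = my_handler };
		struct ops dynamic = { .flags = FL_DYNAMIC,        .func = my_handler };

		get_list_func(&direct, false)(0x1000, &direct);
		get_list_func(&dynamic, false)(0x2000, &dynamic);
		return 0;
	}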
| diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 5040d44fe5a3..0315d43176d8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
| @@ -2679,7 +2679,7 @@ static DEFINE_PER_CPU(unsigned int, current_context); | |||
| 2679 | 2679 | ||
| 2680 | static __always_inline int trace_recursive_lock(void) | 2680 | static __always_inline int trace_recursive_lock(void) | 
| 2681 | { | 2681 | { | 
| 2682 | unsigned int val = this_cpu_read(current_context); | 2682 | unsigned int val = __this_cpu_read(current_context); | 
| 2683 | int bit; | 2683 | int bit; | 
| 2684 | 2684 | ||
| 2685 | if (in_interrupt()) { | 2685 | if (in_interrupt()) { | 
| @@ -2696,18 +2696,14 @@ static __always_inline int trace_recursive_lock(void) | |||
| 2696 | return 1; | 2696 | return 1; | 
| 2697 | 2697 | ||
| 2698 | val |= (1 << bit); | 2698 | val |= (1 << bit); | 
| 2699 | this_cpu_write(current_context, val); | 2699 | __this_cpu_write(current_context, val); | 
| 2700 | 2700 | ||
| 2701 | return 0; | 2701 | return 0; | 
| 2702 | } | 2702 | } | 
| 2703 | 2703 | ||
| 2704 | static __always_inline void trace_recursive_unlock(void) | 2704 | static __always_inline void trace_recursive_unlock(void) | 
| 2705 | { | 2705 | { | 
| 2706 | unsigned int val = this_cpu_read(current_context); | 2706 | __this_cpu_and(current_context, __this_cpu_read(current_context) - 1); | 
| 2707 | |||
| 2708 | val--; | ||
| 2709 | val &= this_cpu_read(current_context); | ||
| 2710 | this_cpu_write(current_context, val); | ||
| 2711 | } | 2707 | } | 
| 2712 | 2708 | ||
| 2713 | #else | 2709 | #else | 
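The one-line trace_recursive_unlock() relies on the identity val & (val - 1), which clears the lowest set bit of val, i.e. the bit the matching trace_recursive_lock() set. A short worked example:

	#include <stdio.h>

	int main(void)
	{
		unsigned int val = 0x6;              /* bits 1 and 2 set */

		/* (val - 1) = 0x5; 0x6 & 0x5 = 0x4: only the lowest set bit is cleared */
		val &= val - 1;
		printf("0x%x\n", val);               /* prints 0x4 */
		return 0;
	}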
| diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 62c6506d663f..91eecaaa43e0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -20,6 +20,7 @@ | |||
| 20 | #include <linux/notifier.h> | 20 | #include <linux/notifier.h> | 
| 21 | #include <linux/irqflags.h> | 21 | #include <linux/irqflags.h> | 
| 22 | #include <linux/debugfs.h> | 22 | #include <linux/debugfs.h> | 
| 23 | #include <linux/tracefs.h> | ||
| 23 | #include <linux/pagemap.h> | 24 | #include <linux/pagemap.h> | 
| 24 | #include <linux/hardirq.h> | 25 | #include <linux/hardirq.h> | 
| 25 | #include <linux/linkage.h> | 26 | #include <linux/linkage.h> | 
| @@ -31,6 +32,7 @@ | |||
| 31 | #include <linux/splice.h> | 32 | #include <linux/splice.h> | 
| 32 | #include <linux/kdebug.h> | 33 | #include <linux/kdebug.h> | 
| 33 | #include <linux/string.h> | 34 | #include <linux/string.h> | 
| 35 | #include <linux/mount.h> | ||
| 34 | #include <linux/rwsem.h> | 36 | #include <linux/rwsem.h> | 
| 35 | #include <linux/slab.h> | 37 | #include <linux/slab.h> | 
| 36 | #include <linux/ctype.h> | 38 | #include <linux/ctype.h> | 
| @@ -123,6 +125,42 @@ enum ftrace_dump_mode ftrace_dump_on_oops; | |||
| 123 | /* When set, tracing will stop when a WARN*() is hit */ | 125 | /* When set, tracing will stop when a WARN*() is hit */ | 
| 124 | int __disable_trace_on_warning; | 126 | int __disable_trace_on_warning; | 
| 125 | 127 | ||
| 128 | #ifdef CONFIG_TRACE_ENUM_MAP_FILE | ||
| 129 | /* Map of enums to their values, for "enum_map" file */ | ||
| 130 | struct trace_enum_map_head { | ||
| 131 | struct module *mod; | ||
| 132 | unsigned long length; | ||
| 133 | }; | ||
| 134 | |||
| 135 | union trace_enum_map_item; | ||
| 136 | |||
| 137 | struct trace_enum_map_tail { | ||
| 138 | /* | ||
| 139 | * "end" is first and points to NULL as it must be different | ||
| 140 | * than "mod" or "enum_string" | ||
| 141 | */ | ||
| 142 | union trace_enum_map_item *next; | ||
| 143 | const char *end; /* points to NULL */ | ||
| 144 | }; | ||
| 145 | |||
| 146 | static DEFINE_MUTEX(trace_enum_mutex); | ||
| 147 | |||
| 148 | /* | ||
| 149 | * The trace_enum_maps are saved in an array with two extra elements, | ||
| 150 | * one at the beginning, and one at the end. The beginning item contains | ||
| 151 | * the count of the saved maps (head.length), and the module they | ||
| 152 | * belong to if not built in (head.mod). The ending item contains a | ||
| 153 | * pointer to the next array of saved enum_map items. | ||
| 154 | */ | ||
| 155 | union trace_enum_map_item { | ||
| 156 | struct trace_enum_map map; | ||
| 157 | struct trace_enum_map_head head; | ||
| 158 | struct trace_enum_map_tail tail; | ||
| 159 | }; | ||
| 160 | |||
| 161 | static union trace_enum_map_item *trace_enum_maps; | ||
| 162 | #endif /* CONFIG_TRACE_ENUM_MAP_FILE */ | ||
| 163 | |||
| 126 | static int tracing_set_tracer(struct trace_array *tr, const char *buf); | 164 | static int tracing_set_tracer(struct trace_array *tr, const char *buf); | 
| 127 | 165 | ||
| 128 | #define MAX_TRACER_SIZE 100 | 166 | #define MAX_TRACER_SIZE 100 | 
| @@ -3908,6 +3946,182 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = { | |||
| 3908 | .write = tracing_saved_cmdlines_size_write, | 3946 | .write = tracing_saved_cmdlines_size_write, | 
| 3909 | }; | 3947 | }; | 
| 3910 | 3948 | ||
| 3949 | #ifdef CONFIG_TRACE_ENUM_MAP_FILE | ||
| 3950 | static union trace_enum_map_item * | ||
| 3951 | update_enum_map(union trace_enum_map_item *ptr) | ||
| 3952 | { | ||
| 3953 | if (!ptr->map.enum_string) { | ||
| 3954 | if (ptr->tail.next) { | ||
| 3955 | ptr = ptr->tail.next; | ||
| 3956 | /* Set ptr to the next real item (skip head) */ | ||
| 3957 | ptr++; | ||
| 3958 | } else | ||
| 3959 | return NULL; | ||
| 3960 | } | ||
| 3961 | return ptr; | ||
| 3962 | } | ||
| 3963 | |||
| 3964 | static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) | ||
| 3965 | { | ||
| 3966 | union trace_enum_map_item *ptr = v; | ||
| 3967 | |||
| 3968 | /* | ||
| 3969 | * Paranoid! If ptr points to end, we don't want to increment past it. | ||
| 3970 | * This really should never happen. | ||
| 3971 | */ | ||
| 3972 | ptr = update_enum_map(ptr); | ||
| 3973 | if (WARN_ON_ONCE(!ptr)) | ||
| 3974 | return NULL; | ||
| 3975 | |||
| 3976 | ptr++; | ||
| 3977 | |||
| 3978 | (*pos)++; | ||
| 3979 | |||
| 3980 | ptr = update_enum_map(ptr); | ||
| 3981 | |||
| 3982 | return ptr; | ||
| 3983 | } | ||
| 3984 | |||
| 3985 | static void *enum_map_start(struct seq_file *m, loff_t *pos) | ||
| 3986 | { | ||
| 3987 | union trace_enum_map_item *v; | ||
| 3988 | loff_t l = 0; | ||
| 3989 | |||
| 3990 | mutex_lock(&trace_enum_mutex); | ||
| 3991 | |||
| 3992 | v = trace_enum_maps; | ||
| 3993 | if (v) | ||
| 3994 | v++; | ||
| 3995 | |||
| 3996 | while (v && l < *pos) { | ||
| 3997 | v = enum_map_next(m, v, &l); | ||
| 3998 | } | ||
| 3999 | |||
| 4000 | return v; | ||
| 4001 | } | ||
| 4002 | |||
| 4003 | static void enum_map_stop(struct seq_file *m, void *v) | ||
| 4004 | { | ||
| 4005 | mutex_unlock(&trace_enum_mutex); | ||
| 4006 | } | ||
| 4007 | |||
| 4008 | static int enum_map_show(struct seq_file *m, void *v) | ||
| 4009 | { | ||
| 4010 | union trace_enum_map_item *ptr = v; | ||
| 4011 | |||
| 4012 | seq_printf(m, "%s %ld (%s)\n", | ||
| 4013 | ptr->map.enum_string, ptr->map.enum_value, | ||
| 4014 | ptr->map.system); | ||
| 4015 | |||
| 4016 | return 0; | ||
| 4017 | } | ||
| 4018 | |||
| 4019 | static const struct seq_operations tracing_enum_map_seq_ops = { | ||
| 4020 | .start = enum_map_start, | ||
| 4021 | .next = enum_map_next, | ||
| 4022 | .stop = enum_map_stop, | ||
| 4023 | .show = enum_map_show, | ||
| 4024 | }; | ||
| 4025 | |||
| 4026 | static int tracing_enum_map_open(struct inode *inode, struct file *filp) | ||
| 4027 | { | ||
| 4028 | if (tracing_disabled) | ||
| 4029 | return -ENODEV; | ||
| 4030 | |||
| 4031 | return seq_open(filp, &tracing_enum_map_seq_ops); | ||
| 4032 | } | ||
| 4033 | |||
| 4034 | static const struct file_operations tracing_enum_map_fops = { | ||
| 4035 | .open = tracing_enum_map_open, | ||
| 4036 | .read = seq_read, | ||
| 4037 | .llseek = seq_lseek, | ||
| 4038 | .release = seq_release, | ||
| 4039 | }; | ||
| 4040 | |||
| 4041 | static inline union trace_enum_map_item * | ||
| 4042 | trace_enum_jmp_to_tail(union trace_enum_map_item *ptr) | ||
| 4043 | { | ||
| 4044 | /* Return tail of array given the head */ | ||
| 4045 | return ptr + ptr->head.length + 1; | ||
| 4046 | } | ||
| 4047 | |||
| 4048 | static void | ||
| 4049 | trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start, | ||
| 4050 | int len) | ||
| 4051 | { | ||
| 4052 | struct trace_enum_map **stop; | ||
| 4053 | struct trace_enum_map **map; | ||
| 4054 | union trace_enum_map_item *map_array; | ||
| 4055 | union trace_enum_map_item *ptr; | ||
| 4056 | |||
| 4057 | stop = start + len; | ||
| 4058 | |||
| 4059 | /* | ||
| 4060 | * The trace_enum_maps contains the map plus a head and tail item, | ||
| 4061 | * where the head holds the module and length of array, and the | ||
| 4062 | * tail holds a pointer to the next list. | ||
| 4063 | */ | ||
| 4064 | map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL); | ||
| 4065 | if (!map_array) { | ||
| 4066 | pr_warning("Unable to allocate trace enum mapping\n"); | ||
| 4067 | return; | ||
| 4068 | } | ||
| 4069 | |||
| 4070 | mutex_lock(&trace_enum_mutex); | ||
| 4071 | |||
| 4072 | if (!trace_enum_maps) | ||
| 4073 | trace_enum_maps = map_array; | ||
| 4074 | else { | ||
| 4075 | ptr = trace_enum_maps; | ||
| 4076 | for (;;) { | ||
| 4077 | ptr = trace_enum_jmp_to_tail(ptr); | ||
| 4078 | if (!ptr->tail.next) | ||
| 4079 | break; | ||
| 4080 | ptr = ptr->tail.next; | ||
| 4081 | |||
| 4082 | } | ||
| 4083 | ptr->tail.next = map_array; | ||
| 4084 | } | ||
| 4085 | map_array->head.mod = mod; | ||
| 4086 | map_array->head.length = len; | ||
| 4087 | map_array++; | ||
| 4088 | |||
| 4089 | for (map = start; (unsigned long)map < (unsigned long)stop; map++) { | ||
| 4090 | map_array->map = **map; | ||
| 4091 | map_array++; | ||
| 4092 | } | ||
| 4093 | memset(map_array, 0, sizeof(*map_array)); | ||
| 4094 | |||
| 4095 | mutex_unlock(&trace_enum_mutex); | ||
| 4096 | } | ||
| 4097 | |||
| 4098 | static void trace_create_enum_file(struct dentry *d_tracer) | ||
| 4099 | { | ||
| 4100 | trace_create_file("enum_map", 0444, d_tracer, | ||
| 4101 | NULL, &tracing_enum_map_fops); | ||
| 4102 | } | ||
| 4103 | |||
| 4104 | #else /* CONFIG_TRACE_ENUM_MAP_FILE */ | ||
| 4105 | static inline void trace_create_enum_file(struct dentry *d_tracer) { } | ||
| 4106 | static inline void trace_insert_enum_map_file(struct module *mod, | ||
| 4107 | struct trace_enum_map **start, int len) { } | ||
| 4108 | #endif /* !CONFIG_TRACE_ENUM_MAP_FILE */ | ||
| 4109 | |||
| 4110 | static void trace_insert_enum_map(struct module *mod, | ||
| 4111 | struct trace_enum_map **start, int len) | ||
| 4112 | { | ||
| 4113 | struct trace_enum_map **map; | ||
| 4114 | |||
| 4115 | if (len <= 0) | ||
| 4116 | return; | ||
| 4117 | |||
| 4118 | map = start; | ||
| 4119 | |||
| 4120 | trace_event_enum_update(map, len); | ||
| 4121 | |||
| 4122 | trace_insert_enum_map_file(mod, start, len); | ||
| 4123 | } | ||
| 4124 | |||
| 3911 | static ssize_t | 4125 | static ssize_t | 
| 3912 | tracing_set_trace_read(struct file *filp, char __user *ubuf, | 4126 | tracing_set_trace_read(struct file *filp, char __user *ubuf, | 
| 3913 | size_t cnt, loff_t *ppos) | 4127 | size_t cnt, loff_t *ppos) | 
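trace_insert_enum_map_file() above lays each allocation out as one head item, the copied map items, and a zeroed tail item whose next pointer chains to the following allocation; the seq_file callbacks walk that chain, skipping heads, and enum_map_show() prints one "name value (system)" line per map. Member order matters: a tail's second pointer stays NULL, which is what the enum_string test keys off. A stand-alone model of the layout and the walk; the enum names, values and systems are invented:

	#include <stdio.h>
	#include <stdlib.h>

	union item;

	struct map  { const char *system; const char *enum_string; long enum_value; };
	struct head { void *mod; unsigned long length; };
	struct tail { union item *next; const char *end; };   /* end stays NULL */

	union item {
		struct map  map;
		struct head head;
		struct tail tail;
	};

	/* head + n map items + zeroed tail, mirroring kmalloc(len + 2) above */
	static union item *make_chunk(const struct map *maps, unsigned long n)
	{
		union item *arr = calloc(n + 2, sizeof(*arr));
		arr[0].head.length = n;
		for (unsigned long i = 0; i < n; i++)
			arr[1 + i].map = maps[i];
		return arr;
	}

	int main(void)
	{
		const struct map a[] = { { "foo", "FOO_IDLE", 0 }, { "foo", "FOO_BUSY", 1 } };
		const struct map b[] = { { "bar", "BAR_MAX",  4 } };

		union item *first  = make_chunk(a, 2);
		union item *second = make_chunk(b, 1);

		/* chain: tail of the first chunk points at the second chunk */
		first[1 + first[0].head.length].tail.next = second;

		/* walk: start past the head, hop chunks at each tail item */
		union item *ptr = first + 1;
		while (ptr) {
			if (!ptr->map.enum_string) {                 /* tail item reached */
				ptr = ptr->tail.next ? ptr->tail.next + 1 : NULL;
				continue;
			}
			printf("%s %ld (%s)\n", ptr->map.enum_string,
			       ptr->map.enum_value, ptr->map.system);
			ptr++;
		}
		free(first);
		free(second);
		return 0;
	}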
| @@ -4105,9 +4319,24 @@ static void tracing_set_nop(struct trace_array *tr) | |||
| 4105 | tr->current_trace = &nop_trace; | 4319 | tr->current_trace = &nop_trace; | 
| 4106 | } | 4320 | } | 
| 4107 | 4321 | ||
| 4108 | static int tracing_set_tracer(struct trace_array *tr, const char *buf) | 4322 | static void update_tracer_options(struct trace_array *tr, struct tracer *t) | 
| 4109 | { | 4323 | { | 
| 4110 | static struct trace_option_dentry *topts; | 4324 | static struct trace_option_dentry *topts; | 
| 4325 | |||
| 4326 | /* Only enable if the directory has been created already. */ | ||
| 4327 | if (!tr->dir) | ||
| 4328 | return; | ||
| 4329 | |||
| 4330 | /* Currently, only the top instance has options */ | ||
| 4331 | if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) | ||
| 4332 | return; | ||
| 4333 | |||
| 4334 | destroy_trace_option_files(topts); | ||
| 4335 | topts = create_trace_option_files(tr, t); | ||
| 4336 | } | ||
| 4337 | |||
| 4338 | static int tracing_set_tracer(struct trace_array *tr, const char *buf) | ||
| 4339 | { | ||
| 4111 | struct tracer *t; | 4340 | struct tracer *t; | 
| 4112 | #ifdef CONFIG_TRACER_MAX_TRACE | 4341 | #ifdef CONFIG_TRACER_MAX_TRACE | 
| 4113 | bool had_max_tr; | 4342 | bool had_max_tr; | 
| @@ -4172,11 +4401,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) | |||
| 4172 | free_snapshot(tr); | 4401 | free_snapshot(tr); | 
| 4173 | } | 4402 | } | 
| 4174 | #endif | 4403 | #endif | 
| 4175 | /* Currently, only the top instance has options */ | 4404 | update_tracer_options(tr, t); | 
| 4176 | if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { | ||
| 4177 | destroy_trace_option_files(topts); | ||
| 4178 | topts = create_trace_option_files(tr, t); | ||
| 4179 | } | ||
| 4180 | 4405 | ||
| 4181 | #ifdef CONFIG_TRACER_MAX_TRACE | 4406 | #ifdef CONFIG_TRACER_MAX_TRACE | 
| 4182 | if (t->use_max_tr && !had_max_tr) { | 4407 | if (t->use_max_tr && !had_max_tr) { | 
| @@ -5817,6 +6042,14 @@ static inline __init int register_snapshot_cmd(void) { return 0; } | |||
| 5817 | 6042 | ||
| 5818 | static struct dentry *tracing_get_dentry(struct trace_array *tr) | 6043 | static struct dentry *tracing_get_dentry(struct trace_array *tr) | 
| 5819 | { | 6044 | { | 
| 6045 | if (WARN_ON(!tr->dir)) | ||
| 6046 | return ERR_PTR(-ENODEV); | ||
| 6047 | |||
| 6048 | /* Top directory uses NULL as the parent */ | ||
| 6049 | if (tr->flags & TRACE_ARRAY_FL_GLOBAL) | ||
| 6050 | return NULL; | ||
| 6051 | |||
| 6052 | /* All sub buffers have a descriptor */ | ||
| 5820 | return tr->dir; | 6053 | return tr->dir; | 
| 5821 | } | 6054 | } | 
| 5822 | 6055 | ||
| @@ -5831,10 +6064,10 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) | |||
| 5831 | if (IS_ERR(d_tracer)) | 6064 | if (IS_ERR(d_tracer)) | 
| 5832 | return NULL; | 6065 | return NULL; | 
| 5833 | 6066 | ||
| 5834 | tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); | 6067 | tr->percpu_dir = tracefs_create_dir("per_cpu", d_tracer); | 
| 5835 | 6068 | ||
| 5836 | WARN_ONCE(!tr->percpu_dir, | 6069 | WARN_ONCE(!tr->percpu_dir, | 
| 5837 | "Could not create debugfs directory 'per_cpu/%d'\n", cpu); | 6070 | "Could not create tracefs directory 'per_cpu/%d'\n", cpu); | 
| 5838 | 6071 | ||
| 5839 | return tr->percpu_dir; | 6072 | return tr->percpu_dir; | 
| 5840 | } | 6073 | } | 
| @@ -5851,7 +6084,7 @@ trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, | |||
| 5851 | } | 6084 | } | 
| 5852 | 6085 | ||
| 5853 | static void | 6086 | static void | 
| 5854 | tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) | 6087 | tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) | 
| 5855 | { | 6088 | { | 
| 5856 | struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); | 6089 | struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); | 
| 5857 | struct dentry *d_cpu; | 6090 | struct dentry *d_cpu; | 
| @@ -5861,9 +6094,9 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) | |||
| 5861 | return; | 6094 | return; | 
| 5862 | 6095 | ||
| 5863 | snprintf(cpu_dir, 30, "cpu%ld", cpu); | 6096 | snprintf(cpu_dir, 30, "cpu%ld", cpu); | 
| 5864 | d_cpu = debugfs_create_dir(cpu_dir, d_percpu); | 6097 | d_cpu = tracefs_create_dir(cpu_dir, d_percpu); | 
| 5865 | if (!d_cpu) { | 6098 | if (!d_cpu) { | 
| 5866 | pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); | 6099 | pr_warning("Could not create tracefs '%s' entry\n", cpu_dir); | 
| 5867 | return; | 6100 | return; | 
| 5868 | } | 6101 | } | 
| 5869 | 6102 | ||
| @@ -6015,9 +6248,9 @@ struct dentry *trace_create_file(const char *name, | |||
| 6015 | { | 6248 | { | 
| 6016 | struct dentry *ret; | 6249 | struct dentry *ret; | 
| 6017 | 6250 | ||
| 6018 | ret = debugfs_create_file(name, mode, parent, data, fops); | 6251 | ret = tracefs_create_file(name, mode, parent, data, fops); | 
| 6019 | if (!ret) | 6252 | if (!ret) | 
| 6020 | pr_warning("Could not create debugfs '%s' entry\n", name); | 6253 | pr_warning("Could not create tracefs '%s' entry\n", name); | 
| 6021 | 6254 | ||
| 6022 | return ret; | 6255 | return ret; | 
| 6023 | } | 6256 | } | 
| @@ -6034,9 +6267,9 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr) | |||
| 6034 | if (IS_ERR(d_tracer)) | 6267 | if (IS_ERR(d_tracer)) | 
| 6035 | return NULL; | 6268 | return NULL; | 
| 6036 | 6269 | ||
| 6037 | tr->options = debugfs_create_dir("options", d_tracer); | 6270 | tr->options = tracefs_create_dir("options", d_tracer); | 
| 6038 | if (!tr->options) { | 6271 | if (!tr->options) { | 
| 6039 | pr_warning("Could not create debugfs directory 'options'\n"); | 6272 | pr_warning("Could not create tracefs directory 'options'\n"); | 
| 6040 | return NULL; | 6273 | return NULL; | 
| 6041 | } | 6274 | } | 
| 6042 | 6275 | ||
| @@ -6105,7 +6338,7 @@ destroy_trace_option_files(struct trace_option_dentry *topts) | |||
| 6105 | return; | 6338 | return; | 
| 6106 | 6339 | ||
| 6107 | for (cnt = 0; topts[cnt].opt; cnt++) | 6340 | for (cnt = 0; topts[cnt].opt; cnt++) | 
| 6108 | debugfs_remove(topts[cnt].entry); | 6341 | tracefs_remove(topts[cnt].entry); | 
| 6109 | 6342 | ||
| 6110 | kfree(topts); | 6343 | kfree(topts); | 
| 6111 | } | 6344 | } | 
| @@ -6194,7 +6427,7 @@ static const struct file_operations rb_simple_fops = { | |||
| 6194 | struct dentry *trace_instance_dir; | 6427 | struct dentry *trace_instance_dir; | 
| 6195 | 6428 | ||
| 6196 | static void | 6429 | static void | 
| 6197 | init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); | 6430 | init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); | 
| 6198 | 6431 | ||
| 6199 | static int | 6432 | static int | 
| 6200 | allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) | 6433 | allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) | 
| @@ -6271,7 +6504,7 @@ static void free_trace_buffers(struct trace_array *tr) | |||
| 6271 | #endif | 6504 | #endif | 
| 6272 | } | 6505 | } | 
| 6273 | 6506 | ||
| 6274 | static int new_instance_create(const char *name) | 6507 | static int instance_mkdir(const char *name) | 
| 6275 | { | 6508 | { | 
| 6276 | struct trace_array *tr; | 6509 | struct trace_array *tr; | 
| 6277 | int ret; | 6510 | int ret; | 
| @@ -6310,17 +6543,17 @@ static int new_instance_create(const char *name) | |||
| 6310 | if (allocate_trace_buffers(tr, trace_buf_size) < 0) | 6543 | if (allocate_trace_buffers(tr, trace_buf_size) < 0) | 
| 6311 | goto out_free_tr; | 6544 | goto out_free_tr; | 
| 6312 | 6545 | ||
| 6313 | tr->dir = debugfs_create_dir(name, trace_instance_dir); | 6546 | tr->dir = tracefs_create_dir(name, trace_instance_dir); | 
| 6314 | if (!tr->dir) | 6547 | if (!tr->dir) | 
| 6315 | goto out_free_tr; | 6548 | goto out_free_tr; | 
| 6316 | 6549 | ||
| 6317 | ret = event_trace_add_tracer(tr->dir, tr); | 6550 | ret = event_trace_add_tracer(tr->dir, tr); | 
| 6318 | if (ret) { | 6551 | if (ret) { | 
| 6319 | debugfs_remove_recursive(tr->dir); | 6552 | tracefs_remove_recursive(tr->dir); | 
| 6320 | goto out_free_tr; | 6553 | goto out_free_tr; | 
| 6321 | } | 6554 | } | 
| 6322 | 6555 | ||
| 6323 | init_tracer_debugfs(tr, tr->dir); | 6556 | init_tracer_tracefs(tr, tr->dir); | 
| 6324 | 6557 | ||
| 6325 | list_add(&tr->list, &ftrace_trace_arrays); | 6558 | list_add(&tr->list, &ftrace_trace_arrays); | 
| 6326 | 6559 | ||
| @@ -6341,7 +6574,7 @@ static int new_instance_create(const char *name) | |||
| 6341 | 6574 | ||
| 6342 | } | 6575 | } | 
| 6343 | 6576 | ||
| 6344 | static int instance_delete(const char *name) | 6577 | static int instance_rmdir(const char *name) | 
| 6345 | { | 6578 | { | 
| 6346 | struct trace_array *tr; | 6579 | struct trace_array *tr; | 
| 6347 | int found = 0; | 6580 | int found = 0; | 
| @@ -6382,82 +6615,17 @@ static int instance_delete(const char *name) | |||
| 6382 | return ret; | 6615 | return ret; | 
| 6383 | } | 6616 | } | 
| 6384 | 6617 | ||
| 6385 | static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode) | ||
| 6386 | { | ||
| 6387 | struct dentry *parent; | ||
| 6388 | int ret; | ||
| 6389 | |||
| 6390 | /* Paranoid: Make sure the parent is the "instances" directory */ | ||
| 6391 | parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); | ||
| 6392 | if (WARN_ON_ONCE(parent != trace_instance_dir)) | ||
| 6393 | return -ENOENT; | ||
| 6394 | |||
| 6395 | /* | ||
| 6396 | * The inode mutex is locked, but debugfs_create_dir() will also | ||
| 6397 | * take the mutex. As the instances directory can not be destroyed | ||
| 6398 | * or changed in any other way, it is safe to unlock it, and | ||
| 6399 | * let the dentry try. If two users try to make the same dir at | ||
| 6400 | * the same time, then the new_instance_create() will determine the | ||
| 6401 | * winner. | ||
| 6402 | */ | ||
| 6403 | mutex_unlock(&inode->i_mutex); | ||
| 6404 | |||
| 6405 | ret = new_instance_create(dentry->d_iname); | ||
| 6406 | |||
| 6407 | mutex_lock(&inode->i_mutex); | ||
| 6408 | |||
| 6409 | return ret; | ||
| 6410 | } | ||
| 6411 | |||
| 6412 | static int instance_rmdir(struct inode *inode, struct dentry *dentry) | ||
| 6413 | { | ||
| 6414 | struct dentry *parent; | ||
| 6415 | int ret; | ||
| 6416 | |||
| 6417 | /* Paranoid: Make sure the parent is the "instances" directory */ | ||
| 6418 | parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); | ||
| 6419 | if (WARN_ON_ONCE(parent != trace_instance_dir)) | ||
| 6420 | return -ENOENT; | ||
| 6421 | |||
| 6422 | /* The caller did a dget() on dentry */ | ||
| 6423 | mutex_unlock(&dentry->d_inode->i_mutex); | ||
| 6424 | |||
| 6425 | /* | ||
| 6426 | * The inode mutex is locked, but debugfs_create_dir() will also | ||
| 6427 | * take the mutex. As the instances directory can not be destroyed | ||
| 6428 | * or changed in any other way, it is safe to unlock it, and | ||
| 6429 | * let the dentry try. If two users try to make the same dir at | ||
| 6430 | * the same time, then the instance_delete() will determine the | ||
| 6431 | * winner. | ||
| 6432 | */ | ||
| 6433 | mutex_unlock(&inode->i_mutex); | ||
| 6434 | |||
| 6435 | ret = instance_delete(dentry->d_iname); | ||
| 6436 | |||
| 6437 | mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); | ||
| 6438 | mutex_lock(&dentry->d_inode->i_mutex); | ||
| 6439 | |||
| 6440 | return ret; | ||
| 6441 | } | ||
| 6442 | |||
| 6443 | static const struct inode_operations instance_dir_inode_operations = { | ||
| 6444 | .lookup = simple_lookup, | ||
| 6445 | .mkdir = instance_mkdir, | ||
| 6446 | .rmdir = instance_rmdir, | ||
| 6447 | }; | ||
| 6448 | |||
| 6449 | static __init void create_trace_instances(struct dentry *d_tracer) | 6618 | static __init void create_trace_instances(struct dentry *d_tracer) | 
| 6450 | { | 6619 | { | 
| 6451 | trace_instance_dir = debugfs_create_dir("instances", d_tracer); | 6620 | trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer, | 
| 6621 | instance_mkdir, | ||
| 6622 | instance_rmdir); | ||
| 6452 | if (WARN_ON(!trace_instance_dir)) | 6623 | if (WARN_ON(!trace_instance_dir)) | 
| 6453 | return; | 6624 | return; | 
| 6454 | |||
| 6455 | /* Hijack the dir inode operations, to allow mkdir */ | ||
| 6456 | trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations; | ||
| 6457 | } | 6625 | } | 
| 6458 | 6626 | ||
| 6459 | static void | 6627 | static void | 
| 6460 | init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) | 6628 | init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) | 
| 6461 | { | 6629 | { | 
| 6462 | int cpu; | 6630 | int cpu; | 
| 6463 | 6631 | ||
| @@ -6511,10 +6679,32 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) | |||
| 6511 | #endif | 6679 | #endif | 
| 6512 | 6680 | ||
| 6513 | for_each_tracing_cpu(cpu) | 6681 | for_each_tracing_cpu(cpu) | 
| 6514 | tracing_init_debugfs_percpu(tr, cpu); | 6682 | tracing_init_tracefs_percpu(tr, cpu); | 
| 6515 | 6683 | ||
| 6516 | } | 6684 | } | 
| 6517 | 6685 | ||
| 6686 | static struct vfsmount *trace_automount(void *ignore) | ||
| 6687 | { | ||
| 6688 | struct vfsmount *mnt; | ||
| 6689 | struct file_system_type *type; | ||
| 6690 | |||
| 6691 | /* | ||
| 6692 | * To maintain backward compatibility for tools that mount | ||
| 6693 | * debugfs to get to the tracing facility, tracefs is automatically | ||
| 6694 | * mounted to the debugfs/tracing directory. | ||
| 6695 | */ | ||
| 6696 | type = get_fs_type("tracefs"); | ||
| 6697 | if (!type) | ||
| 6698 | return NULL; | ||
| 6699 | mnt = vfs_kern_mount(type, 0, "tracefs", NULL); | ||
| 6700 | put_filesystem(type); | ||
| 6701 | if (IS_ERR(mnt)) | ||
| 6702 | return NULL; | ||
| 6703 | mntget(mnt); | ||
| 6704 | |||
| 6705 | return mnt; | ||
| 6706 | } | ||
| 6707 | |||
| 6518 | /** | 6708 | /** | 
| 6519 | * tracing_init_dentry - initialize top level trace array | 6709 | * tracing_init_dentry - initialize top level trace array | 
| 6520 | * | 6710 | * | 
| @@ -6526,23 +6716,112 @@ struct dentry *tracing_init_dentry(void) | |||
| 6526 | { | 6716 | { | 
| 6527 | struct trace_array *tr = &global_trace; | 6717 | struct trace_array *tr = &global_trace; | 
| 6528 | 6718 | ||
| 6719 | /* The top level trace array uses NULL as parent */ | ||
| 6529 | if (tr->dir) | 6720 | if (tr->dir) | 
| 6530 | return tr->dir; | 6721 | return NULL; | 
| 6531 | 6722 | ||
| 6532 | if (WARN_ON(!debugfs_initialized())) | 6723 | if (WARN_ON(!debugfs_initialized())) | 
| 6533 | return ERR_PTR(-ENODEV); | 6724 | return ERR_PTR(-ENODEV); | 
| 6534 | 6725 | ||
| 6535 | tr->dir = debugfs_create_dir("tracing", NULL); | 6726 | /* | 
| 6536 | 6727 | * As there may still be users that expect the tracing | |
| 6728 | * files to exist in debugfs/tracing, we must automount | ||
| 6729 | * the tracefs file system there, so older tools still | ||
| 6730 | * work with the newer kerenl. | ||
| 6731 | */ | ||
| 6732 | tr->dir = debugfs_create_automount("tracing", NULL, | ||
| 6733 | trace_automount, NULL); | ||
| 6537 | if (!tr->dir) { | 6734 | if (!tr->dir) { | 
| 6538 | pr_warn_once("Could not create debugfs directory 'tracing'\n"); | 6735 | pr_warn_once("Could not create debugfs directory 'tracing'\n"); | 
| 6539 | return ERR_PTR(-ENOMEM); | 6736 | return ERR_PTR(-ENOMEM); | 
| 6540 | } | 6737 | } | 
| 6541 | 6738 | ||
| 6542 | return tr->dir; | 6739 | return NULL; | 
| 6740 | } | ||
| 6741 | |||
| 6742 | extern struct trace_enum_map *__start_ftrace_enum_maps[]; | ||
| 6743 | extern struct trace_enum_map *__stop_ftrace_enum_maps[]; | ||
| 6744 | |||
| 6745 | static void __init trace_enum_init(void) | ||
| 6746 | { | ||
| 6747 | int len; | ||
| 6748 | |||
| 6749 | len = __stop_ftrace_enum_maps - __start_ftrace_enum_maps; | ||
| 6750 | trace_insert_enum_map(NULL, __start_ftrace_enum_maps, len); | ||
| 6751 | } | ||
| 6752 | |||
| 6753 | #ifdef CONFIG_MODULES | ||
| 6754 | static void trace_module_add_enums(struct module *mod) | ||
| 6755 | { | ||
| 6756 | if (!mod->num_trace_enums) | ||
| 6757 | return; | ||
| 6758 | |||
| 6759 | /* | ||
| 6760 | * Modules with bad taint do not have events created, so do | ||
| 6761 | * not bother with enums either. | ||
| 6762 | */ | ||
| 6763 | if (trace_module_has_bad_taint(mod)) | ||
| 6764 | return; | ||
| 6765 | |||
| 6766 | trace_insert_enum_map(mod, mod->trace_enums, mod->num_trace_enums); | ||
| 6543 | } | 6767 | } | 
| 6544 | 6768 | ||
| 6545 | static __init int tracer_init_debugfs(void) | 6769 | #ifdef CONFIG_TRACE_ENUM_MAP_FILE | 
| 6770 | static void trace_module_remove_enums(struct module *mod) | ||
| 6771 | { | ||
| 6772 | union trace_enum_map_item *map; | ||
| 6773 | union trace_enum_map_item **last = &trace_enum_maps; | ||
| 6774 | |||
| 6775 | if (!mod->num_trace_enums) | ||
| 6776 | return; | ||
| 6777 | |||
| 6778 | mutex_lock(&trace_enum_mutex); | ||
| 6779 | |||
| 6780 | map = trace_enum_maps; | ||
| 6781 | |||
| 6782 | while (map) { | ||
| 6783 | if (map->head.mod == mod) | ||
| 6784 | break; | ||
| 6785 | map = trace_enum_jmp_to_tail(map); | ||
| 6786 | last = &map->tail.next; | ||
| 6787 | map = map->tail.next; | ||
| 6788 | } | ||
| 6789 | if (!map) | ||
| 6790 | goto out; | ||
| 6791 | |||
| 6792 | *last = trace_enum_jmp_to_tail(map)->tail.next; | ||
| 6793 | kfree(map); | ||
| 6794 | out: | ||
| 6795 | mutex_unlock(&trace_enum_mutex); | ||
| 6796 | } | ||
| 6797 | #else | ||
| 6798 | static inline void trace_module_remove_enums(struct module *mod) { } | ||
| 6799 | #endif /* CONFIG_TRACE_ENUM_MAP_FILE */ | ||
| 6800 | |||
| 6801 | static int trace_module_notify(struct notifier_block *self, | ||
| 6802 | unsigned long val, void *data) | ||
| 6803 | { | ||
| 6804 | struct module *mod = data; | ||
| 6805 | |||
| 6806 | switch (val) { | ||
| 6807 | case MODULE_STATE_COMING: | ||
| 6808 | trace_module_add_enums(mod); | ||
| 6809 | break; | ||
| 6810 | case MODULE_STATE_GOING: | ||
| 6811 | trace_module_remove_enums(mod); | ||
| 6812 | break; | ||
| 6813 | } | ||
| 6814 | |||
| 6815 | return 0; | ||
| 6816 | } | ||
| 6817 | |||
| 6818 | static struct notifier_block trace_module_nb = { | ||
| 6819 | .notifier_call = trace_module_notify, | ||
| 6820 | .priority = 0, | ||
| 6821 | }; | ||
| 6822 | #endif /* CONFIG_MODULES */ | ||
| 6823 | |||
| 6824 | static __init int tracer_init_tracefs(void) | ||
| 6546 | { | 6825 | { | 
| 6547 | struct dentry *d_tracer; | 6826 | struct dentry *d_tracer; | 
| 6548 | 6827 | ||
| @@ -6552,7 +6831,7 @@ static __init int tracer_init_debugfs(void) | |||
| 6552 | if (IS_ERR(d_tracer)) | 6831 | if (IS_ERR(d_tracer)) | 
| 6553 | return 0; | 6832 | return 0; | 
| 6554 | 6833 | ||
| 6555 | init_tracer_debugfs(&global_trace, d_tracer); | 6834 | init_tracer_tracefs(&global_trace, d_tracer); | 
| 6556 | 6835 | ||
| 6557 | trace_create_file("tracing_thresh", 0644, d_tracer, | 6836 | trace_create_file("tracing_thresh", 0644, d_tracer, | 
| 6558 | &global_trace, &tracing_thresh_fops); | 6837 | &global_trace, &tracing_thresh_fops); | 
| @@ -6566,6 +6845,14 @@ static __init int tracer_init_debugfs(void) | |||
| 6566 | trace_create_file("saved_cmdlines_size", 0644, d_tracer, | 6845 | trace_create_file("saved_cmdlines_size", 0644, d_tracer, | 
| 6567 | NULL, &tracing_saved_cmdlines_size_fops); | 6846 | NULL, &tracing_saved_cmdlines_size_fops); | 
| 6568 | 6847 | ||
| 6848 | trace_enum_init(); | ||
| 6849 | |||
| 6850 | trace_create_enum_file(d_tracer); | ||
| 6851 | |||
| 6852 | #ifdef CONFIG_MODULES | ||
| 6853 | register_module_notifier(&trace_module_nb); | ||
| 6854 | #endif | ||
| 6855 | |||
| 6569 | #ifdef CONFIG_DYNAMIC_FTRACE | 6856 | #ifdef CONFIG_DYNAMIC_FTRACE | 
| 6570 | trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, | 6857 | trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, | 
| 6571 | &ftrace_update_tot_cnt, &tracing_dyn_info_fops); | 6858 | &ftrace_update_tot_cnt, &tracing_dyn_info_fops); | 
| @@ -6575,6 +6862,10 @@ static __init int tracer_init_debugfs(void) | |||
| 6575 | 6862 | ||
| 6576 | create_trace_options_dir(&global_trace); | 6863 | create_trace_options_dir(&global_trace); | 
| 6577 | 6864 | ||
| 6865 | /* If the tracer was started via cmdline, create options for it here */ | ||
| 6866 | if (global_trace.current_trace != &nop_trace) | ||
| 6867 | update_tracer_options(&global_trace, global_trace.current_trace); | ||
| 6868 | |||
| 6578 | return 0; | 6869 | return 0; | 
| 6579 | } | 6870 | } | 
| 6580 | 6871 | ||
| @@ -6888,7 +7179,7 @@ void __init trace_init(void) | |||
| 6888 | tracepoint_printk = 0; | 7179 | tracepoint_printk = 0; | 
| 6889 | } | 7180 | } | 
| 6890 | tracer_alloc_buffers(); | 7181 | tracer_alloc_buffers(); | 
| 6891 | trace_event_init(); | 7182 | trace_event_init(); | 
| 6892 | } | 7183 | } | 
| 6893 | 7184 | ||
| 6894 | __init static int clear_boot_tracer(void) | 7185 | __init static int clear_boot_tracer(void) | 
| @@ -6910,5 +7201,5 @@ __init static int clear_boot_tracer(void) | |||
| 6910 | return 0; | 7201 | return 0; | 
| 6911 | } | 7202 | } | 
| 6912 | 7203 | ||
| 6913 | fs_initcall(tracer_init_debugfs); | 7204 | fs_initcall(tracer_init_tracefs); | 
| 6914 | late_initcall(clear_boot_tracer); | 7205 | late_initcall(clear_boot_tracer); | 
| diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index dd8205a35760..d2612016de94 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
| @@ -334,7 +334,7 @@ struct tracer_flags { | |||
| 334 | 334 | ||
| 335 | 335 | ||
| 336 | /** | 336 | /** | 
| 337 | * struct tracer - a specific tracer and its callbacks to interact with debugfs | 337 | * struct tracer - a specific tracer and its callbacks to interact with tracefs | 
| 338 | * @name: the name chosen to select it on the available_tracers file | 338 | * @name: the name chosen to select it on the available_tracers file | 
| 339 | * @init: called when one switches to this tracer (echo name > current_tracer) | 339 | * @init: called when one switches to this tracer (echo name > current_tracer) | 
| 340 | * @reset: called when one switches to another tracer | 340 | * @reset: called when one switches to another tracer | 
| @@ -1309,8 +1309,10 @@ static inline void init_ftrace_syscalls(void) { } | |||
| 1309 | 1309 | ||
| 1310 | #ifdef CONFIG_EVENT_TRACING | 1310 | #ifdef CONFIG_EVENT_TRACING | 
| 1311 | void trace_event_init(void); | 1311 | void trace_event_init(void); | 
| 1312 | void trace_event_enum_update(struct trace_enum_map **map, int len); | ||
| 1312 | #else | 1313 | #else | 
| 1313 | static inline void __init trace_event_init(void) { } | 1314 | static inline void __init trace_event_init(void) { } | 
| 1315 | static inline void trace_event_enum_update(struct trace_enum_map **map, int len) { } | ||
| 1314 | #endif | 1316 | #endif | 
| 1315 | 1317 | ||
| 1316 | extern struct trace_iterator *tracepoint_print_iter; | 1318 | extern struct trace_iterator *tracepoint_print_iter; | 
| diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e2d027ac66a2..ee7b94a4810a 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h | |||
| @@ -223,7 +223,7 @@ FTRACE_ENTRY(bprint, bprint_entry, | |||
| 223 | __dynamic_array( u32, buf ) | 223 | __dynamic_array( u32, buf ) | 
| 224 | ), | 224 | ), | 
| 225 | 225 | ||
| 226 | F_printk("%pf: %s", | 226 | F_printk("%ps: %s", | 
| 227 | (void *)__entry->ip, __entry->fmt), | 227 | (void *)__entry->ip, __entry->fmt), | 
| 228 | 228 | ||
| 229 | FILTER_OTHER | 229 | FILTER_OTHER | 
| @@ -238,7 +238,7 @@ FTRACE_ENTRY(print, print_entry, | |||
| 238 | __dynamic_array( char, buf ) | 238 | __dynamic_array( char, buf ) | 
| 239 | ), | 239 | ), | 
| 240 | 240 | ||
| 241 | F_printk("%pf: %s", | 241 | F_printk("%ps: %s", | 
| 242 | (void *)__entry->ip, __entry->buf), | 242 | (void *)__entry->ip, __entry->buf), | 
| 243 | 243 | ||
| 244 | FILTER_OTHER | 244 | FILTER_OTHER | 
| @@ -253,7 +253,7 @@ FTRACE_ENTRY(bputs, bputs_entry, | |||
| 253 | __field( const char *, str ) | 253 | __field( const char *, str ) | 
| 254 | ), | 254 | ), | 
| 255 | 255 | ||
| 256 | F_printk("%pf: %s", | 256 | F_printk("%ps: %s", | 
| 257 | (void *)__entry->ip, __entry->str), | 257 | (void *)__entry->ip, __entry->str), | 
| 258 | 258 | ||
| 259 | FILTER_OTHER | 259 | FILTER_OTHER | 
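The three F_printk() changes above switch the specifier from %pf to %ps because the stored ip is a plain code address, not a function descriptor: %ps/%pS resolve a symbol directly from such an address, while %pf/%pF dereference a descriptor on ia64, ppc64 ABIv1 and parisc. A small hedged illustration; show_caller() is an invented helper, not from the patch:

#include <linux/kernel.h>
#include <linux/printk.h>

/* Invented helper, only to contrast the two specifiers. */
static void show_caller(void)
{
	/*
	 * _RET_IP_ is a raw text address (the caller's return address).
	 * %pS looks up the symbol for such an address; %pF/%pf expect a
	 * function descriptor and are only right for function pointers
	 * on descriptor ABIs (ia64, ppc64 ABIv1, parisc).
	 */
	pr_info("entered from %pS\n", (void *)_RET_IP_);
}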
| diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index db54dda10ccc..7da1dfeb322e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
| @@ -13,7 +13,7 @@ | |||
| 13 | #include <linux/workqueue.h> | 13 | #include <linux/workqueue.h> | 
| 14 | #include <linux/spinlock.h> | 14 | #include <linux/spinlock.h> | 
| 15 | #include <linux/kthread.h> | 15 | #include <linux/kthread.h> | 
| 16 | #include <linux/debugfs.h> | 16 | #include <linux/tracefs.h> | 
| 17 | #include <linux/uaccess.h> | 17 | #include <linux/uaccess.h> | 
| 18 | #include <linux/module.h> | 18 | #include <linux/module.h> | 
| 19 | #include <linux/ctype.h> | 19 | #include <linux/ctype.h> | 
| @@ -480,7 +480,7 @@ static void remove_subsystem(struct ftrace_subsystem_dir *dir) | |||
| 480 | return; | 480 | return; | 
| 481 | 481 | ||
| 482 | if (!--dir->nr_events) { | 482 | if (!--dir->nr_events) { | 
| 483 | debugfs_remove_recursive(dir->entry); | 483 | tracefs_remove_recursive(dir->entry); | 
| 484 | list_del(&dir->list); | 484 | list_del(&dir->list); | 
| 485 | __put_system_dir(dir); | 485 | __put_system_dir(dir); | 
| 486 | } | 486 | } | 
| @@ -499,7 +499,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file) | |||
| 499 | } | 499 | } | 
| 500 | spin_unlock(&dir->d_lock); | 500 | spin_unlock(&dir->d_lock); | 
| 501 | 501 | ||
| 502 | debugfs_remove_recursive(dir); | 502 | tracefs_remove_recursive(dir); | 
| 503 | } | 503 | } | 
| 504 | 504 | ||
| 505 | list_del(&file->list); | 505 | list_del(&file->list); | 
| @@ -1526,7 +1526,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, | |||
| 1526 | } else | 1526 | } else | 
| 1527 | __get_system(system); | 1527 | __get_system(system); | 
| 1528 | 1528 | ||
| 1529 | dir->entry = debugfs_create_dir(name, parent); | 1529 | dir->entry = tracefs_create_dir(name, parent); | 
| 1530 | if (!dir->entry) { | 1530 | if (!dir->entry) { | 
| 1531 | pr_warn("Failed to create system directory %s\n", name); | 1531 | pr_warn("Failed to create system directory %s\n", name); | 
| 1532 | __put_system(system); | 1532 | __put_system(system); | 
| @@ -1539,12 +1539,12 @@ event_subsystem_dir(struct trace_array *tr, const char *name, | |||
| 1539 | dir->subsystem = system; | 1539 | dir->subsystem = system; | 
| 1540 | file->system = dir; | 1540 | file->system = dir; | 
| 1541 | 1541 | ||
| 1542 | entry = debugfs_create_file("filter", 0644, dir->entry, dir, | 1542 | entry = tracefs_create_file("filter", 0644, dir->entry, dir, | 
| 1543 | &ftrace_subsystem_filter_fops); | 1543 | &ftrace_subsystem_filter_fops); | 
| 1544 | if (!entry) { | 1544 | if (!entry) { | 
| 1545 | kfree(system->filter); | 1545 | kfree(system->filter); | 
| 1546 | system->filter = NULL; | 1546 | system->filter = NULL; | 
| 1547 | pr_warn("Could not create debugfs '%s/filter' entry\n", name); | 1547 | pr_warn("Could not create tracefs '%s/filter' entry\n", name); | 
| 1548 | } | 1548 | } | 
| 1549 | 1549 | ||
| 1550 | trace_create_file("enable", 0644, dir->entry, dir, | 1550 | trace_create_file("enable", 0644, dir->entry, dir, | 
| @@ -1585,9 +1585,9 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) | |||
| 1585 | d_events = parent; | 1585 | d_events = parent; | 
| 1586 | 1586 | ||
| 1587 | name = ftrace_event_name(call); | 1587 | name = ftrace_event_name(call); | 
| 1588 | file->dir = debugfs_create_dir(name, d_events); | 1588 | file->dir = tracefs_create_dir(name, d_events); | 
| 1589 | if (!file->dir) { | 1589 | if (!file->dir) { | 
| 1590 | pr_warn("Could not create debugfs '%s' directory\n", name); | 1590 | pr_warn("Could not create tracefs '%s' directory\n", name); | 
| 1591 | return -1; | 1591 | return -1; | 
| 1592 | } | 1592 | } | 
| 1593 | 1593 | ||
| @@ -1704,6 +1704,125 @@ __register_event(struct ftrace_event_call *call, struct module *mod) | |||
| 1704 | return 0; | 1704 | return 0; | 
| 1705 | } | 1705 | } | 
| 1706 | 1706 | ||
| 1707 | static char *enum_replace(char *ptr, struct trace_enum_map *map, int len) | ||
| 1708 | { | ||
| 1709 | int rlen; | ||
| 1710 | int elen; | ||
| 1711 | |||
| 1712 | /* Find the length of the enum value as a string */ | ||
| 1713 | elen = snprintf(ptr, 0, "%ld", map->enum_value); | ||
| 1714 | /* Make sure there's enough room to replace the string with the value */ | ||
| 1715 | if (len < elen) | ||
| 1716 | return NULL; | ||
| 1717 | |||
| 1718 | snprintf(ptr, elen + 1, "%ld", map->enum_value); | ||
| 1719 | |||
| 1720 | /* Get the rest of the string of ptr */ | ||
| 1721 | rlen = strlen(ptr + len); | ||
| 1722 | memmove(ptr + elen, ptr + len, rlen); | ||
| 1723 | /* Make sure we end the new string */ | ||
| 1724 | ptr[elen + rlen] = 0; | ||
| 1725 | |||
| 1726 | return ptr + elen; | ||
| 1727 | } | ||
| 1728 | |||
| 1729 | static void update_event_printk(struct ftrace_event_call *call, | ||
| 1730 | struct trace_enum_map *map) | ||
| 1731 | { | ||
| 1732 | char *ptr; | ||
| 1733 | int quote = 0; | ||
| 1734 | int len = strlen(map->enum_string); | ||
| 1735 | |||
| 1736 | for (ptr = call->print_fmt; *ptr; ptr++) { | ||
| 1737 | if (*ptr == '\\') { | ||
| 1738 | ptr++; | ||
| 1739 | /* paranoid */ | ||
| 1740 | if (!*ptr) | ||
| 1741 | break; | ||
| 1742 | continue; | ||
| 1743 | } | ||
| 1744 | if (*ptr == '"') { | ||
| 1745 | quote ^= 1; | ||
| 1746 | continue; | ||
| 1747 | } | ||
| 1748 | if (quote) | ||
| 1749 | continue; | ||
| 1750 | if (isdigit(*ptr)) { | ||
| 1751 | /* skip numbers */ | ||
| 1752 | do { | ||
| 1753 | ptr++; | ||
| 1754 | /* Check for alpha chars like ULL */ | ||
| 1755 | } while (isalnum(*ptr)); | ||
| 1756 | /* | ||
| 1757 | * A number must have some kind of delimiter after | ||
| 1758 | * it, and we can ignore that too. | ||
| 1759 | */ | ||
| 1760 | continue; | ||
| 1761 | } | ||
| 1762 | if (isalpha(*ptr) || *ptr == '_') { | ||
| 1763 | if (strncmp(map->enum_string, ptr, len) == 0 && | ||
| 1764 | !isalnum(ptr[len]) && ptr[len] != '_') { | ||
| 1765 | ptr = enum_replace(ptr, map, len); | ||
| 1766 | /* Hmm, enum string smaller than value */ | ||
| 1767 | if (WARN_ON_ONCE(!ptr)) | ||
| 1768 | return; | ||
| 1769 | /* | ||
| 1770 | * No need to decrement here, as enum_replace() | ||
| 1771 | * returns the pointer to the character passed | ||
| 1772 | * the enum, and two enums can not be placed | ||
| 1773 | * back to back without something in between. | ||
| 1774 | * We can skip that something in between. | ||
| 1775 | */ | ||
| 1776 | continue; | ||
| 1777 | } | ||
| 1778 | skip_more: | ||
| 1779 | do { | ||
| 1780 | ptr++; | ||
| 1781 | } while (isalnum(*ptr) || *ptr == '_'); | ||
| 1782 | /* | ||
| 1783 | * If what comes after this variable is a '.' or | ||
| 1784 | * '->' then we can continue to ignore that string. | ||
| 1785 | */ | ||
| 1786 | if (*ptr == '.' || (ptr[0] == '-' && ptr[1] == '>')) { | ||
| 1787 | ptr += *ptr == '.' ? 1 : 2; | ||
| 1788 | goto skip_more; | ||
| 1789 | } | ||
| 1790 | /* | ||
| 1791 | * Once again, we can skip the delimiter that came | ||
| 1792 | * after the string. | ||
| 1793 | */ | ||
| 1794 | continue; | ||
| 1795 | } | ||
| 1796 | } | ||
| 1797 | } | ||
| 1798 | |||
| 1799 | void trace_event_enum_update(struct trace_enum_map **map, int len) | ||
| 1800 | { | ||
| 1801 | struct ftrace_event_call *call, *p; | ||
| 1802 | const char *last_system = NULL; | ||
| 1803 | int last_i; | ||
| 1804 | int i; | ||
| 1805 | |||
| 1806 | down_write(&trace_event_sem); | ||
| 1807 | list_for_each_entry_safe(call, p, &ftrace_events, list) { | ||
| 1808 | /* events are usually grouped together with systems */ | ||
| 1809 | if (!last_system || call->class->system != last_system) { | ||
| 1810 | last_i = 0; | ||
| 1811 | last_system = call->class->system; | ||
| 1812 | } | ||
| 1813 | |||
| 1814 | for (i = last_i; i < len; i++) { | ||
| 1815 | if (call->class->system == map[i]->system) { | ||
| 1816 | /* Save the first system if need be */ | ||
| 1817 | if (!last_i) | ||
| 1818 | last_i = i; | ||
| 1819 | update_event_printk(call, map[i]); | ||
| 1820 | } | ||
| 1821 | } | ||
| 1822 | } | ||
| 1823 | up_write(&trace_event_sem); | ||
| 1824 | } | ||
| 1825 | |||
| 1707 | static struct ftrace_event_file * | 1826 | static struct ftrace_event_file * | 
| 1708 | trace_create_new_event(struct ftrace_event_call *call, | 1827 | trace_create_new_event(struct ftrace_event_call *call, | 
| 1709 | struct trace_array *tr) | 1828 | struct trace_array *tr) | 
| @@ -1915,7 +2034,7 @@ static int trace_module_notify(struct notifier_block *self, | |||
| 1915 | 2034 | ||
| 1916 | static struct notifier_block trace_module_nb = { | 2035 | static struct notifier_block trace_module_nb = { | 
| 1917 | .notifier_call = trace_module_notify, | 2036 | .notifier_call = trace_module_notify, | 
| 1918 | .priority = 0, | 2037 | .priority = 1, /* higher than trace.c module notify */ | 
| 1919 | }; | 2038 | }; | 
| 1920 | #endif /* CONFIG_MODULES */ | 2039 | #endif /* CONFIG_MODULES */ | 
| 1921 | 2040 | ||
| @@ -2228,7 +2347,7 @@ static inline int register_event_cmds(void) { return 0; } | |||
| 2228 | /* | 2347 | /* | 
| 2229 | * The top level array has already had its ftrace_event_file | 2348 | * The top level array has already had its ftrace_event_file | 
| 2230 | * descriptors created in order to allow for early events to | 2349 | * descriptors created in order to allow for early events to | 
| 2231 | * be recorded. This function is called after the debugfs has been | 2350 | * be recorded. This function is called after the tracefs has been | 
| 2232 | * initialized, and we now have to create the files associated | 2351 | * initialized, and we now have to create the files associated | 
| 2233 | * to the events. | 2352 | * to the events. | 
| 2234 | */ | 2353 | */ | 
| @@ -2311,16 +2430,16 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) | |||
| 2311 | struct dentry *d_events; | 2430 | struct dentry *d_events; | 
| 2312 | struct dentry *entry; | 2431 | struct dentry *entry; | 
| 2313 | 2432 | ||
| 2314 | entry = debugfs_create_file("set_event", 0644, parent, | 2433 | entry = tracefs_create_file("set_event", 0644, parent, | 
| 2315 | tr, &ftrace_set_event_fops); | 2434 | tr, &ftrace_set_event_fops); | 
| 2316 | if (!entry) { | 2435 | if (!entry) { | 
| 2317 | pr_warn("Could not create debugfs 'set_event' entry\n"); | 2436 | pr_warn("Could not create tracefs 'set_event' entry\n"); | 
| 2318 | return -ENOMEM; | 2437 | return -ENOMEM; | 
| 2319 | } | 2438 | } | 
| 2320 | 2439 | ||
| 2321 | d_events = debugfs_create_dir("events", parent); | 2440 | d_events = tracefs_create_dir("events", parent); | 
| 2322 | if (!d_events) { | 2441 | if (!d_events) { | 
| 2323 | pr_warn("Could not create debugfs 'events' directory\n"); | 2442 | pr_warn("Could not create tracefs 'events' directory\n"); | 
| 2324 | return -ENOMEM; | 2443 | return -ENOMEM; | 
| 2325 | } | 2444 | } | 
| 2326 | 2445 | ||
| @@ -2412,7 +2531,7 @@ int event_trace_del_tracer(struct trace_array *tr) | |||
| 2412 | 2531 | ||
| 2413 | down_write(&trace_event_sem); | 2532 | down_write(&trace_event_sem); | 
| 2414 | __trace_remove_event_dirs(tr); | 2533 | __trace_remove_event_dirs(tr); | 
| 2415 | debugfs_remove_recursive(tr->event_dir); | 2534 | tracefs_remove_recursive(tr->event_dir); | 
| 2416 | up_write(&trace_event_sem); | 2535 | up_write(&trace_event_sem); | 
| 2417 | 2536 | ||
| 2418 | tr->event_dir = NULL; | 2537 | tr->event_dir = NULL; | 
| @@ -2534,10 +2653,10 @@ static __init int event_trace_init(void) | |||
| 2534 | if (IS_ERR(d_tracer)) | 2653 | if (IS_ERR(d_tracer)) | 
| 2535 | return 0; | 2654 | return 0; | 
| 2536 | 2655 | ||
| 2537 | entry = debugfs_create_file("available_events", 0444, d_tracer, | 2656 | entry = tracefs_create_file("available_events", 0444, d_tracer, | 
| 2538 | tr, &ftrace_avail_fops); | 2657 | tr, &ftrace_avail_fops); | 
| 2539 | if (!entry) | 2658 | if (!entry) | 
| 2540 | pr_warn("Could not create debugfs 'available_events' entry\n"); | 2659 | pr_warn("Could not create tracefs 'available_events' entry\n"); | 
| 2541 | 2660 | ||
| 2542 | if (trace_define_common_fields()) | 2661 | if (trace_define_common_fields()) | 
| 2543 | pr_warn("tracing: Failed to allocate common fields"); | 2662 | pr_warn("tracing: Failed to allocate common fields"); | 
| diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 12e2b99be862..174a6a71146c 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
| @@ -177,7 +177,7 @@ struct ftrace_event_call __used event_##call = { \ | |||
| 177 | }, \ | 177 | }, \ | 
| 178 | .event.type = etype, \ | 178 | .event.type = etype, \ | 
| 179 | .print_fmt = print, \ | 179 | .print_fmt = print, \ | 
| 180 | .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \ | 180 | .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ | 
| 181 | }; \ | 181 | }; \ | 
| 182 | struct ftrace_event_call __used \ | 182 | struct ftrace_event_call __used \ | 
| 183 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; | 183 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; | 
| diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 2d25ad1526bb..9cfea4c6d314 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
| @@ -6,7 +6,6 @@ | |||
| 6 | * is Copyright (c) Steven Rostedt <srostedt@redhat.com> | 6 | * is Copyright (c) Steven Rostedt <srostedt@redhat.com> | 
| 7 | * | 7 | * | 
| 8 | */ | 8 | */ | 
| 9 | #include <linux/debugfs.h> | ||
| 10 | #include <linux/uaccess.h> | 9 | #include <linux/uaccess.h> | 
| 11 | #include <linux/ftrace.h> | 10 | #include <linux/ftrace.h> | 
| 12 | #include <linux/slab.h> | 11 | #include <linux/slab.h> | 
| @@ -151,7 +150,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, | |||
| 151 | * The curr_ret_stack is initialized to -1 and get increased | 150 | * The curr_ret_stack is initialized to -1 and get increased | 
| 152 | * in this function. So it can be less than -1 only if it was | 151 | * in this function. So it can be less than -1 only if it was | 
| 153 | * filtered out via ftrace_graph_notrace_addr() which can be | 152 | * filtered out via ftrace_graph_notrace_addr() which can be | 
| 154 | * set from set_graph_notrace file in debugfs by user. | 153 | * set from set_graph_notrace file in tracefs by user. | 
| 155 | */ | 154 | */ | 
| 156 | if (current->curr_ret_stack < -1) | 155 | if (current->curr_ret_stack < -1) | 
| 157 | return -EBUSY; | 156 | return -EBUSY; | 
| @@ -1432,7 +1431,7 @@ static const struct file_operations graph_depth_fops = { | |||
| 1432 | .llseek = generic_file_llseek, | 1431 | .llseek = generic_file_llseek, | 
| 1433 | }; | 1432 | }; | 
| 1434 | 1433 | ||
| 1435 | static __init int init_graph_debugfs(void) | 1434 | static __init int init_graph_tracefs(void) | 
| 1436 | { | 1435 | { | 
| 1437 | struct dentry *d_tracer; | 1436 | struct dentry *d_tracer; | 
| 1438 | 1437 | ||
| @@ -1445,7 +1444,7 @@ static __init int init_graph_debugfs(void) | |||
| 1445 | 1444 | ||
| 1446 | return 0; | 1445 | return 0; | 
| 1447 | } | 1446 | } | 
| 1448 | fs_initcall(init_graph_debugfs); | 1447 | fs_initcall(init_graph_tracefs); | 
| 1449 | 1448 | ||
| 1450 | static __init int init_graph_trace(void) | 1449 | static __init int init_graph_trace(void) | 
| 1451 | { | 1450 | { | 
| diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d73f565b4e06..d0ce590f06e1 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
| @@ -250,7 +250,7 @@ DEFINE_FETCH_symbol(string_size) | |||
| 250 | #define fetch_file_offset_string_size NULL | 250 | #define fetch_file_offset_string_size NULL | 
| 251 | 251 | ||
| 252 | /* Fetch type information table */ | 252 | /* Fetch type information table */ | 
| 253 | const struct fetch_type kprobes_fetch_type_table[] = { | 253 | static const struct fetch_type kprobes_fetch_type_table[] = { | 
| 254 | /* Special types */ | 254 | /* Special types */ | 
| 255 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, | 255 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, | 
| 256 | sizeof(u32), 1, "__data_loc char[]"), | 256 | sizeof(u32), 1, "__data_loc char[]"), | 
| @@ -760,7 +760,8 @@ static int create_trace_kprobe(int argc, char **argv) | |||
| 760 | 760 | ||
| 761 | /* Parse fetch argument */ | 761 | /* Parse fetch argument */ | 
| 762 | ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, | 762 | ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, | 
| 763 | is_return, true); | 763 | is_return, true, | 
| 764 | kprobes_fetch_type_table); | ||
| 764 | if (ret) { | 765 | if (ret) { | 
| 765 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); | 766 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); | 
| 766 | goto error; | 767 | goto error; | 
| @@ -1134,11 +1135,15 @@ static void | |||
| 1134 | kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) | 1135 | kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) | 
| 1135 | { | 1136 | { | 
| 1136 | struct ftrace_event_call *call = &tk->tp.call; | 1137 | struct ftrace_event_call *call = &tk->tp.call; | 
| 1138 | struct bpf_prog *prog = call->prog; | ||
| 1137 | struct kprobe_trace_entry_head *entry; | 1139 | struct kprobe_trace_entry_head *entry; | 
| 1138 | struct hlist_head *head; | 1140 | struct hlist_head *head; | 
| 1139 | int size, __size, dsize; | 1141 | int size, __size, dsize; | 
| 1140 | int rctx; | 1142 | int rctx; | 
| 1141 | 1143 | ||
| 1144 | if (prog && !trace_call_bpf(prog, regs)) | ||
| 1145 | return; | ||
| 1146 | |||
| 1142 | head = this_cpu_ptr(call->perf_events); | 1147 | head = this_cpu_ptr(call->perf_events); | 
| 1143 | if (hlist_empty(head)) | 1148 | if (hlist_empty(head)) | 
| 1144 | return; | 1149 | return; | 
| @@ -1165,11 +1170,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, | |||
| 1165 | struct pt_regs *regs) | 1170 | struct pt_regs *regs) | 
| 1166 | { | 1171 | { | 
| 1167 | struct ftrace_event_call *call = &tk->tp.call; | 1172 | struct ftrace_event_call *call = &tk->tp.call; | 
| 1173 | struct bpf_prog *prog = call->prog; | ||
| 1168 | struct kretprobe_trace_entry_head *entry; | 1174 | struct kretprobe_trace_entry_head *entry; | 
| 1169 | struct hlist_head *head; | 1175 | struct hlist_head *head; | 
| 1170 | int size, __size, dsize; | 1176 | int size, __size, dsize; | 
| 1171 | int rctx; | 1177 | int rctx; | 
| 1172 | 1178 | ||
| 1179 | if (prog && !trace_call_bpf(prog, regs)) | ||
| 1180 | return; | ||
| 1181 | |||
| 1173 | head = this_cpu_ptr(call->perf_events); | 1182 | head = this_cpu_ptr(call->perf_events); | 
| 1174 | if (hlist_empty(head)) | 1183 | if (hlist_empty(head)) | 
| 1175 | return; | 1184 | return; | 
| @@ -1286,7 +1295,7 @@ static int register_kprobe_event(struct trace_kprobe *tk) | |||
| 1286 | kfree(call->print_fmt); | 1295 | kfree(call->print_fmt); | 
| 1287 | return -ENODEV; | 1296 | return -ENODEV; | 
| 1288 | } | 1297 | } | 
| 1289 | call->flags = 0; | 1298 | call->flags = TRACE_EVENT_FL_KPROBE; | 
| 1290 | call->class->reg = kprobe_register; | 1299 | call->class->reg = kprobe_register; | 
| 1291 | call->data = tk; | 1300 | call->data = tk; | 
| 1292 | ret = trace_add_event_call(call); | 1301 | ret = trace_add_event_call(call); | 
| @@ -1310,7 +1319,7 @@ static int unregister_kprobe_event(struct trace_kprobe *tk) | |||
| 1310 | return ret; | 1319 | return ret; | 
| 1311 | } | 1320 | } | 
| 1312 | 1321 | ||
| 1313 | /* Make a debugfs interface for controlling probe points */ | 1322 | /* Make a tracefs interface for controlling probe points */ | 
| 1314 | static __init int init_kprobe_trace(void) | 1323 | static __init int init_kprobe_trace(void) | 
| 1315 | { | 1324 | { | 
| 1316 | struct dentry *d_tracer; | 1325 | struct dentry *d_tracer; | 
| @@ -1323,20 +1332,20 @@ static __init int init_kprobe_trace(void) | |||
| 1323 | if (IS_ERR(d_tracer)) | 1332 | if (IS_ERR(d_tracer)) | 
| 1324 | return 0; | 1333 | return 0; | 
| 1325 | 1334 | ||
| 1326 | entry = debugfs_create_file("kprobe_events", 0644, d_tracer, | 1335 | entry = tracefs_create_file("kprobe_events", 0644, d_tracer, | 
| 1327 | NULL, &kprobe_events_ops); | 1336 | NULL, &kprobe_events_ops); | 
| 1328 | 1337 | ||
| 1329 | /* Event list interface */ | 1338 | /* Event list interface */ | 
| 1330 | if (!entry) | 1339 | if (!entry) | 
| 1331 | pr_warning("Could not create debugfs " | 1340 | pr_warning("Could not create tracefs " | 
| 1332 | "'kprobe_events' entry\n"); | 1341 | "'kprobe_events' entry\n"); | 
| 1333 | 1342 | ||
| 1334 | /* Profile interface */ | 1343 | /* Profile interface */ | 
| 1335 | entry = debugfs_create_file("kprobe_profile", 0444, d_tracer, | 1344 | entry = tracefs_create_file("kprobe_profile", 0444, d_tracer, | 
| 1336 | NULL, &kprobe_profile_ops); | 1345 | NULL, &kprobe_profile_ops); | 
| 1337 | 1346 | ||
| 1338 | if (!entry) | 1347 | if (!entry) | 
| 1339 | pr_warning("Could not create debugfs " | 1348 | pr_warning("Could not create tracefs " | 
| 1340 | "'kprobe_profile' entry\n"); | 1349 | "'kprobe_profile' entry\n"); | 
| 1341 | return 0; | 1350 | return 0; | 
| 1342 | } | 1351 | } | 
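The trace_kprobe.c hunks above wire BPF into the kprobe perf path: if a program is attached to the event (call->prog) and trace_call_bpf() returns 0, the sample is dropped before any buffer space is reserved, and TRACE_EVENT_FL_KPROBE marks these events as valid attach targets. The shape of that gate, reduced to a non-compilable sketch; only call->prog and trace_call_bpf() come from the patch, the rest is elided:

/*
 * Reduced sketch: an attached BPF program runs first and can veto the
 * event before any perf buffer space is reserved.
 */
static void kprobe_perf_sketch(struct ftrace_event_call *call,
			       struct pt_regs *regs)
{
	struct bpf_prog *prog = call->prog;

	if (prog && !trace_call_bpf(prog, regs))
		return;		/* the program vetoed this sample */

	/* ... size the entry, reserve perf buffer space, fill and submit ... */
}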
| diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index b983b2fd2ca1..1769a81da8a7 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c | |||
| @@ -356,17 +356,14 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, | |||
| 356 | 356 | ||
| 357 | /* Recursive argument parser */ | 357 | /* Recursive argument parser */ | 
| 358 | static int parse_probe_arg(char *arg, const struct fetch_type *t, | 358 | static int parse_probe_arg(char *arg, const struct fetch_type *t, | 
| 359 | struct fetch_param *f, bool is_return, bool is_kprobe) | 359 | struct fetch_param *f, bool is_return, bool is_kprobe, | 
| 360 | const struct fetch_type *ftbl) | ||
| 360 | { | 361 | { | 
| 361 | const struct fetch_type *ftbl; | ||
| 362 | unsigned long param; | 362 | unsigned long param; | 
| 363 | long offset; | 363 | long offset; | 
| 364 | char *tmp; | 364 | char *tmp; | 
| 365 | int ret = 0; | 365 | int ret = 0; | 
| 366 | 366 | ||
| 367 | ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table; | ||
| 368 | BUG_ON(ftbl == NULL); | ||
| 369 | |||
| 370 | switch (arg[0]) { | 367 | switch (arg[0]) { | 
| 371 | case '$': | 368 | case '$': | 
| 372 | ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe); | 369 | ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe); | 
| @@ -447,7 +444,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, | |||
| 447 | dprm->fetch_size = get_fetch_size_function(t, | 444 | dprm->fetch_size = get_fetch_size_function(t, | 
| 448 | dprm->fetch, ftbl); | 445 | dprm->fetch, ftbl); | 
| 449 | ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, | 446 | ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, | 
| 450 | is_kprobe); | 447 | is_kprobe, ftbl); | 
| 451 | if (ret) | 448 | if (ret) | 
| 452 | kfree(dprm); | 449 | kfree(dprm); | 
| 453 | else { | 450 | else { | 
| @@ -505,15 +502,12 @@ static int __parse_bitfield_probe_arg(const char *bf, | |||
| 505 | 502 | ||
| 506 | /* String length checking wrapper */ | 503 | /* String length checking wrapper */ | 
| 507 | int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | 504 | int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | 
| 508 | struct probe_arg *parg, bool is_return, bool is_kprobe) | 505 | struct probe_arg *parg, bool is_return, bool is_kprobe, | 
| 506 | const struct fetch_type *ftbl) | ||
| 509 | { | 507 | { | 
| 510 | const struct fetch_type *ftbl; | ||
| 511 | const char *t; | 508 | const char *t; | 
| 512 | int ret; | 509 | int ret; | 
| 513 | 510 | ||
| 514 | ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table; | ||
| 515 | BUG_ON(ftbl == NULL); | ||
| 516 | |||
| 517 | if (strlen(arg) > MAX_ARGSTR_LEN) { | 511 | if (strlen(arg) > MAX_ARGSTR_LEN) { | 
| 518 | pr_info("Argument is too long.: %s\n", arg); | 512 | pr_info("Argument is too long.: %s\n", arg); | 
| 519 | return -ENOSPC; | 513 | return -ENOSPC; | 
| @@ -535,7 +529,8 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | |||
| 535 | } | 529 | } | 
| 536 | parg->offset = *size; | 530 | parg->offset = *size; | 
| 537 | *size += parg->type->size; | 531 | *size += parg->type->size; | 
| 538 | ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, is_kprobe); | 532 | ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, | 
| 533 | is_kprobe, ftbl); | ||
| 539 | 534 | ||
| 540 | if (ret >= 0 && t != NULL) | 535 | if (ret >= 0 && t != NULL) | 
| 541 | ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); | 536 | ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); | 
| diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 4f815fbce16d..ab283e146b70 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h | |||
| @@ -25,7 +25,7 @@ | |||
| 25 | #include <linux/seq_file.h> | 25 | #include <linux/seq_file.h> | 
| 26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> | 
| 27 | #include <linux/smp.h> | 27 | #include <linux/smp.h> | 
| 28 | #include <linux/debugfs.h> | 28 | #include <linux/tracefs.h> | 
| 29 | #include <linux/types.h> | 29 | #include <linux/types.h> | 
| 30 | #include <linux/string.h> | 30 | #include <linux/string.h> | 
| 31 | #include <linux/ctype.h> | 31 | #include <linux/ctype.h> | 
| @@ -229,13 +229,6 @@ ASSIGN_FETCH_FUNC(file_offset, ftype), \ | |||
| 229 | #define FETCH_TYPE_STRING 0 | 229 | #define FETCH_TYPE_STRING 0 | 
| 230 | #define FETCH_TYPE_STRSIZE 1 | 230 | #define FETCH_TYPE_STRSIZE 1 | 
| 231 | 231 | ||
| 232 | /* | ||
| 233 | * Fetch type information table. | ||
| 234 | * It's declared as a weak symbol due to conditional compilation. | ||
| 235 | */ | ||
| 236 | extern __weak const struct fetch_type kprobes_fetch_type_table[]; | ||
| 237 | extern __weak const struct fetch_type uprobes_fetch_type_table[]; | ||
| 238 | |||
| 239 | #ifdef CONFIG_KPROBE_EVENT | 232 | #ifdef CONFIG_KPROBE_EVENT | 
| 240 | struct symbol_cache; | 233 | struct symbol_cache; | 
| 241 | unsigned long update_symbol_cache(struct symbol_cache *sc); | 234 | unsigned long update_symbol_cache(struct symbol_cache *sc); | 
| @@ -333,7 +326,8 @@ find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) | |||
| 333 | } | 326 | } | 
| 334 | 327 | ||
| 335 | extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | 328 | extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | 
| 336 | struct probe_arg *parg, bool is_return, bool is_kprobe); | 329 | struct probe_arg *parg, bool is_return, bool is_kprobe, | 
| 330 | const struct fetch_type *ftbl); | ||
| 337 | 331 | ||
| 338 | extern int traceprobe_conflict_field_name(const char *name, | 332 | extern int traceprobe_conflict_field_name(const char *name, | 
| 339 | struct probe_arg *args, int narg); | 333 | struct probe_arg *args, int narg); | 
| diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index c3e4fcfddd45..3f34496244e9 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
| @@ -327,11 +327,11 @@ static void t_stop(struct seq_file *m, void *p) | |||
| 327 | local_irq_enable(); | 327 | local_irq_enable(); | 
| 328 | } | 328 | } | 
| 329 | 329 | ||
| 330 | static int trace_lookup_stack(struct seq_file *m, long i) | 330 | static void trace_lookup_stack(struct seq_file *m, long i) | 
| 331 | { | 331 | { | 
| 332 | unsigned long addr = stack_dump_trace[i]; | 332 | unsigned long addr = stack_dump_trace[i]; | 
| 333 | 333 | ||
| 334 | return seq_printf(m, "%pS\n", (void *)addr); | 334 | seq_printf(m, "%pS\n", (void *)addr); | 
| 335 | } | 335 | } | 
| 336 | 336 | ||
| 337 | static void print_disabled(struct seq_file *m) | 337 | static void print_disabled(struct seq_file *m) | 
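The trace_stack.c change above follows seq_printf() becoming void: seq_file users no longer get overflow information from the return value, and the seq_file core retries ->show() with a larger buffer on overflow anyway. Callers that still want the information can query the seq_file itself; a hedged sketch with an invented show_symbol() helper:

#include <linux/seq_file.h>

/* Hedged sketch; show_symbol() is an invented helper, not from the patch. */
static int show_symbol(struct seq_file *m, unsigned long addr)
{
	seq_printf(m, "%pS\n", (void *)addr);	/* returns void now */

	/* Overflow state lives in the seq_file itself. */
	return seq_has_overflowed(m) ? -1 : 0;
}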
| diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 75e19e86c954..6cf935316769 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c | |||
| @@ -12,7 +12,7 @@ | |||
| 12 | #include <linux/list.h> | 12 | #include <linux/list.h> | 
| 13 | #include <linux/slab.h> | 13 | #include <linux/slab.h> | 
| 14 | #include <linux/rbtree.h> | 14 | #include <linux/rbtree.h> | 
| 15 | #include <linux/debugfs.h> | 15 | #include <linux/tracefs.h> | 
| 16 | #include "trace_stat.h" | 16 | #include "trace_stat.h" | 
| 17 | #include "trace.h" | 17 | #include "trace.h" | 
| 18 | 18 | ||
| @@ -65,7 +65,7 @@ static void reset_stat_session(struct stat_session *session) | |||
| 65 | 65 | ||
| 66 | static void destroy_session(struct stat_session *session) | 66 | static void destroy_session(struct stat_session *session) | 
| 67 | { | 67 | { | 
| 68 | debugfs_remove(session->file); | 68 | tracefs_remove(session->file); | 
| 69 | __reset_stat_session(session); | 69 | __reset_stat_session(session); | 
| 70 | mutex_destroy(&session->stat_mutex); | 70 | mutex_destroy(&session->stat_mutex); | 
| 71 | kfree(session); | 71 | kfree(session); | 
| @@ -279,9 +279,9 @@ static int tracing_stat_init(void) | |||
| 279 | if (IS_ERR(d_tracing)) | 279 | if (IS_ERR(d_tracing)) | 
| 280 | return 0; | 280 | return 0; | 
| 281 | 281 | ||
| 282 | stat_dir = debugfs_create_dir("trace_stat", d_tracing); | 282 | stat_dir = tracefs_create_dir("trace_stat", d_tracing); | 
| 283 | if (!stat_dir) | 283 | if (!stat_dir) | 
| 284 | pr_warning("Could not create debugfs " | 284 | pr_warning("Could not create tracefs " | 
| 285 | "'trace_stat' entry\n"); | 285 | "'trace_stat' entry\n"); | 
| 286 | return 0; | 286 | return 0; | 
| 287 | } | 287 | } | 
| @@ -291,7 +291,7 @@ static int init_stat_file(struct stat_session *session) | |||
| 291 | if (!stat_dir && tracing_stat_init()) | 291 | if (!stat_dir && tracing_stat_init()) | 
| 292 | return -ENODEV; | 292 | return -ENODEV; | 
| 293 | 293 | ||
| 294 | session->file = debugfs_create_file(session->ts->name, 0644, | 294 | session->file = tracefs_create_file(session->ts->name, 0644, | 
| 295 | stat_dir, | 295 | stat_dir, | 
| 296 | session, &tracing_stat_fops); | 296 | session, &tracing_stat_fops); | 
| 297 | if (!session->file) | 297 | if (!session->file) | 
| diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 7dc1c8abecd6..d60fe62ec4fa 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
| @@ -196,7 +196,7 @@ DEFINE_FETCH_file_offset(string) | |||
| 196 | DEFINE_FETCH_file_offset(string_size) | 196 | DEFINE_FETCH_file_offset(string_size) | 
| 197 | 197 | ||
| 198 | /* Fetch type information table */ | 198 | /* Fetch type information table */ | 
| 199 | const struct fetch_type uprobes_fetch_type_table[] = { | 199 | static const struct fetch_type uprobes_fetch_type_table[] = { | 
| 200 | /* Special types */ | 200 | /* Special types */ | 
| 201 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, | 201 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, | 
| 202 | sizeof(u32), 1, "__data_loc char[]"), | 202 | sizeof(u32), 1, "__data_loc char[]"), | 
| @@ -535,7 +535,8 @@ static int create_trace_uprobe(int argc, char **argv) | |||
| 535 | 535 | ||
| 536 | /* Parse fetch argument */ | 536 | /* Parse fetch argument */ | 
| 537 | ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, | 537 | ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, | 
| 538 | is_return, false); | 538 | is_return, false, | 
| 539 | uprobes_fetch_type_table); | ||
| 539 | if (ret) { | 540 | if (ret) { | 
| 540 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); | 541 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); | 
| 541 | goto error; | 542 | goto error; | 
| @@ -1005,7 +1006,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) | |||
| 1005 | return true; | 1006 | return true; | 
| 1006 | 1007 | ||
| 1007 | list_for_each_entry(event, &filter->perf_events, hw.tp_list) { | 1008 | list_for_each_entry(event, &filter->perf_events, hw.tp_list) { | 
| 1008 | if (event->hw.tp_target->mm == mm) | 1009 | if (event->hw.target->mm == mm) | 
| 1009 | return true; | 1010 | return true; | 
| 1010 | } | 1011 | } | 
| 1011 | 1012 | ||
| @@ -1015,7 +1016,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) | |||
| 1015 | static inline bool | 1016 | static inline bool | 
| 1016 | uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) | 1017 | uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) | 
| 1017 | { | 1018 | { | 
| 1018 | return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); | 1019 | return __uprobe_perf_filter(&tu->filter, event->hw.target->mm); | 
| 1019 | } | 1020 | } | 
| 1020 | 1021 | ||
| 1021 | static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) | 1022 | static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) | 
| @@ -1023,10 +1024,10 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) | |||
| 1023 | bool done; | 1024 | bool done; | 
| 1024 | 1025 | ||
| 1025 | write_lock(&tu->filter.rwlock); | 1026 | write_lock(&tu->filter.rwlock); | 
| 1026 | if (event->hw.tp_target) { | 1027 | if (event->hw.target) { | 
| 1027 | list_del(&event->hw.tp_list); | 1028 | list_del(&event->hw.tp_list); | 
| 1028 | done = tu->filter.nr_systemwide || | 1029 | done = tu->filter.nr_systemwide || | 
| 1029 | (event->hw.tp_target->flags & PF_EXITING) || | 1030 | (event->hw.target->flags & PF_EXITING) || | 
| 1030 | uprobe_filter_event(tu, event); | 1031 | uprobe_filter_event(tu, event); | 
| 1031 | } else { | 1032 | } else { | 
| 1032 | tu->filter.nr_systemwide--; | 1033 | tu->filter.nr_systemwide--; | 
| @@ -1046,7 +1047,7 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) | |||
| 1046 | int err; | 1047 | int err; | 
| 1047 | 1048 | ||
| 1048 | write_lock(&tu->filter.rwlock); | 1049 | write_lock(&tu->filter.rwlock); | 
| 1049 | if (event->hw.tp_target) { | 1050 | if (event->hw.target) { | 
| 1050 | /* | 1051 | /* | 
| 1051 | * event->parent != NULL means copy_process(), we can avoid | 1052 | * event->parent != NULL means copy_process(), we can avoid | 
| 1052 | * uprobe_apply(). current->mm must be probed and we can rely | 1053 | * uprobe_apply(). current->mm must be probed and we can rely | 
| diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 3174bf8e3538..2316f50b07a4 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
| @@ -24,8 +24,33 @@ | |||
| 24 | #include <linux/kvm_para.h> | 24 | #include <linux/kvm_para.h> | 
| 25 | #include <linux/perf_event.h> | 25 | #include <linux/perf_event.h> | 
| 26 | 26 | ||
| 27 | int watchdog_user_enabled = 1; | 27 | /* | 
| 28 | * The run state of the lockup detectors is controlled by the content of the | ||
| 29 | * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit - | ||
| 30 | * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector. | ||
| 31 | * | ||
| 32 | * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled' | ||
| 33 | * are variables that are only used as an 'interface' between the parameters | ||
| 34 | * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The | ||
| 35 | * 'watchdog_thresh' variable is handled differently because its value is not | ||
| 36 | * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh' | ||
| 37 | * is equal to zero. | ||
| 38 | */ | ||
| 39 | #define NMI_WATCHDOG_ENABLED_BIT 0 | ||
| 40 | #define SOFT_WATCHDOG_ENABLED_BIT 1 | ||
| 41 | #define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) | ||
| 42 | #define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) | ||
| 43 | |||
| 44 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
| 45 | static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; | ||
| 46 | #else | ||
| 47 | static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; | ||
| 48 | #endif | ||
| 49 | int __read_mostly nmi_watchdog_enabled; | ||
| 50 | int __read_mostly soft_watchdog_enabled; | ||
| 51 | int __read_mostly watchdog_user_enabled; | ||
| 28 | int __read_mostly watchdog_thresh = 10; | 52 | int __read_mostly watchdog_thresh = 10; | 
| 53 | |||
| 29 | #ifdef CONFIG_SMP | 54 | #ifdef CONFIG_SMP | 
| 30 | int __read_mostly sysctl_softlockup_all_cpu_backtrace; | 55 | int __read_mostly sysctl_softlockup_all_cpu_backtrace; | 
| 31 | #else | 56 | #else | 
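The comment block introduced above splits 'watchdog_enabled' into one bit per lockup detector. A compressed recap of how the boot parameters in the following hunks map onto those bits; the defines and the default are taken from the diff, the summary itself is illustrative:

/* Illustrative recap; defines and default mirror the diff. */
#define NMI_WATCHDOG_ENABLED	(1 << 0)	/* hard lockup detector */
#define SOFT_WATCHDOG_ENABLED	(1 << 1)	/* soft lockup detector */

/* Default with CONFIG_HARDLOCKUP_DETECTOR: both detectors on. */
static unsigned long watchdog_enabled = SOFT_WATCHDOG_ENABLED | NMI_WATCHDOG_ENABLED;

/*
 * Boot parameters become plain bit operations on watchdog_enabled:
 *   nowatchdog       ->  watchdog_enabled  = 0;
 *   nosoftlockup     ->  watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED;
 *   nmi_watchdog=0   ->  watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
 *   nmi_watchdog=1   ->  watchdog_enabled |=  NMI_WATCHDOG_ENABLED;
 */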
| @@ -58,8 +83,6 @@ static unsigned long soft_lockup_nmi_warn; | |||
| 58 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 83 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 
| 59 | static int hardlockup_panic = | 84 | static int hardlockup_panic = | 
| 60 | CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; | 85 | CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; | 
| 61 | |||
| 62 | static bool hardlockup_detector_enabled = true; | ||
| 63 | /* | 86 | /* | 
| 64 | * We may not want to enable hard lockup detection by default in all cases, | 87 | * We may not want to enable hard lockup detection by default in all cases, | 
| 65 | * for example when running the kernel as a guest on a hypervisor. In these | 88 | * for example when running the kernel as a guest on a hypervisor. In these | 
| @@ -68,14 +91,9 @@ static bool hardlockup_detector_enabled = true; | |||
| 68 | * kernel command line parameters are parsed, because otherwise it is not | 91 | * kernel command line parameters are parsed, because otherwise it is not | 
| 69 | * possible to override this in hardlockup_panic_setup(). | 92 | * possible to override this in hardlockup_panic_setup(). | 
| 70 | */ | 93 | */ | 
| 71 | void watchdog_enable_hardlockup_detector(bool val) | 94 | void hardlockup_detector_disable(void) | 
| 72 | { | ||
| 73 | hardlockup_detector_enabled = val; | ||
| 74 | } | ||
| 75 | |||
| 76 | bool watchdog_hardlockup_detector_is_enabled(void) | ||
| 77 | { | 95 | { | 
| 78 | return hardlockup_detector_enabled; | 96 | watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; | 
| 79 | } | 97 | } | 
| 80 | 98 | ||
| 81 | static int __init hardlockup_panic_setup(char *str) | 99 | static int __init hardlockup_panic_setup(char *str) | 
| @@ -85,15 +103,9 @@ static int __init hardlockup_panic_setup(char *str) | |||
| 85 | else if (!strncmp(str, "nopanic", 7)) | 103 | else if (!strncmp(str, "nopanic", 7)) | 
| 86 | hardlockup_panic = 0; | 104 | hardlockup_panic = 0; | 
| 87 | else if (!strncmp(str, "0", 1)) | 105 | else if (!strncmp(str, "0", 1)) | 
| 88 | watchdog_user_enabled = 0; | 106 | watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; | 
| 89 | else if (!strncmp(str, "1", 1) || !strncmp(str, "2", 1)) { | 107 | else if (!strncmp(str, "1", 1)) | 
| 90 | /* | 108 | watchdog_enabled |= NMI_WATCHDOG_ENABLED; | 
| 91 | * Setting 'nmi_watchdog=1' or 'nmi_watchdog=2' (legacy option) | ||
| 92 | * has the same effect. | ||
| 93 | */ | ||
| 94 | watchdog_user_enabled = 1; | ||
| 95 | watchdog_enable_hardlockup_detector(true); | ||
| 96 | } | ||
| 97 | return 1; | 109 | return 1; | 
| 98 | } | 110 | } | 
| 99 | __setup("nmi_watchdog=", hardlockup_panic_setup); | 111 | __setup("nmi_watchdog=", hardlockup_panic_setup); | 
| @@ -112,19 +124,18 @@ __setup("softlockup_panic=", softlockup_panic_setup); | |||
| 112 | 124 | ||
| 113 | static int __init nowatchdog_setup(char *str) | 125 | static int __init nowatchdog_setup(char *str) | 
| 114 | { | 126 | { | 
| 115 | watchdog_user_enabled = 0; | 127 | watchdog_enabled = 0; | 
| 116 | return 1; | 128 | return 1; | 
| 117 | } | 129 | } | 
| 118 | __setup("nowatchdog", nowatchdog_setup); | 130 | __setup("nowatchdog", nowatchdog_setup); | 
| 119 | 131 | ||
| 120 | /* deprecated */ | ||
| 121 | static int __init nosoftlockup_setup(char *str) | 132 | static int __init nosoftlockup_setup(char *str) | 
| 122 | { | 133 | { | 
| 123 | watchdog_user_enabled = 0; | 134 | watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED; | 
| 124 | return 1; | 135 | return 1; | 
| 125 | } | 136 | } | 
| 126 | __setup("nosoftlockup", nosoftlockup_setup); | 137 | __setup("nosoftlockup", nosoftlockup_setup); | 
| 127 | /* */ | 138 | |
| 128 | #ifdef CONFIG_SMP | 139 | #ifdef CONFIG_SMP | 
| 129 | static int __init softlockup_all_cpu_backtrace_setup(char *str) | 140 | static int __init softlockup_all_cpu_backtrace_setup(char *str) | 
| 130 | { | 141 | { | 
| @@ -239,10 +250,11 @@ static int is_softlockup(unsigned long touch_ts) | |||
| 239 | { | 250 | { | 
| 240 | unsigned long now = get_timestamp(); | 251 | unsigned long now = get_timestamp(); | 
| 241 | 252 | ||
| 242 | /* Warn about unreasonable delays: */ | 253 | if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) { | 
| 243 | if (time_after(now, touch_ts + get_softlockup_thresh())) | 254 | /* Warn about unreasonable delays. */ | 
| 244 | return now - touch_ts; | 255 | if (time_after(now, touch_ts + get_softlockup_thresh())) | 
| 245 | 256 | return now - touch_ts; | |
| 257 | } | ||
| 246 | return 0; | 258 | return 0; | 
| 247 | } | 259 | } | 
| 248 | 260 | ||
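The hunk above gates the soft-lockup check on the SOFT_WATCHDOG_ENABLED bit. The following stand-alone sketch shows the same decision in user space, assuming a timestamp in seconds and a fixed threshold; the kernel's wrap-safe time_after() comparison is simplified to plain arithmetic, and the bit value is a stand-in.

#include <stdio.h>

#define SOFT_WATCHDOG_ENABLED	(1UL << 1)	/* stand-in bit value */

static unsigned long watchdog_enabled = SOFT_WATCHDOG_ENABLED;
static unsigned long softlockup_thresh = 20;	/* roughly 2 * watchdog_thresh */

/* Return the stall duration when the soft watchdog is enabled and the
 * per-CPU touch timestamp is older than the threshold, else 0. */
static unsigned long is_softlockup(unsigned long now, unsigned long touch_ts)
{
	if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) {
		if (now > touch_ts + softlockup_thresh)
			return now - touch_ts;
	}
	return 0;
}

int main(void)
{
	printf("%lu\n", is_softlockup(100, 90));	/* 0: within threshold */
	printf("%lu\n", is_softlockup(125, 90));	/* 35: stalled too long */
	watchdog_enabled = 0;
	printf("%lu\n", is_softlockup(125, 90));	/* 0: detector disabled */
	return 0;
}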
| @@ -477,6 +489,21 @@ static void watchdog(unsigned int cpu) | |||
| 477 | __this_cpu_write(soft_lockup_hrtimer_cnt, | 489 | __this_cpu_write(soft_lockup_hrtimer_cnt, | 
| 478 | __this_cpu_read(hrtimer_interrupts)); | 490 | __this_cpu_read(hrtimer_interrupts)); | 
| 479 | __touch_watchdog(); | 491 | __touch_watchdog(); | 
| 492 | |||
| 493 | /* | ||
| 494 | * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the | ||
| 495 | * failure path. Check for failures that can occur asynchronously - | ||
| 496 | * for example, when CPUs are on-lined - and shut down the hardware | ||
| 497 | * perf event on each CPU accordingly. | ||
| 498 | * | ||
| 499 | * The only non-obvious place this bit can be cleared is through | ||
| 500 | * watchdog_nmi_enable(), so a pr_info() is placed there. Placing a | ||
| 501 | * pr_info here would be too noisy as it would result in a message | ||
| 502 | * every few seconds if the hardlockup was disabled but the softlockup | ||
| 503 | * enabled. | ||
| 504 | */ | ||
| 505 | if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) | ||
| 506 | watchdog_nmi_disable(cpu); | ||
| 480 | } | 507 | } | 
| 481 | 508 | ||
| 482 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 509 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 
| @@ -492,14 +519,9 @@ static int watchdog_nmi_enable(unsigned int cpu) | |||
| 492 | struct perf_event_attr *wd_attr; | 519 | struct perf_event_attr *wd_attr; | 
| 493 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | 520 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | 
| 494 | 521 | ||
| 495 | /* | 522 | /* nothing to do if the hard lockup detector is disabled */ | 
| 496 | * Some kernels need to default hard lockup detection to | 523 | if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) | 
| 497 | * 'disabled', for example a guest on a hypervisor. | 524 | goto out; | 
| 498 | */ | ||
| 499 | if (!watchdog_hardlockup_detector_is_enabled()) { | ||
| 500 | event = ERR_PTR(-ENOENT); | ||
| 501 | goto handle_err; | ||
| 502 | } | ||
| 503 | 525 | ||
| 504 | /* is it already setup and enabled? */ | 526 | /* is it already setup and enabled? */ | 
| 505 | if (event && event->state > PERF_EVENT_STATE_OFF) | 527 | if (event && event->state > PERF_EVENT_STATE_OFF) | 
| @@ -515,7 +537,6 @@ static int watchdog_nmi_enable(unsigned int cpu) | |||
| 515 | /* Try to register using hardware perf events */ | 537 | /* Try to register using hardware perf events */ | 
| 516 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); | 538 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); | 
| 517 | 539 | ||
| 518 | handle_err: | ||
| 519 | /* save cpu0 error for future comparison */ | 540 | /* save cpu0 error for future comparison */ | 
| 520 | if (cpu == 0 && IS_ERR(event)) | 541 | if (cpu == 0 && IS_ERR(event)) | 
| 521 | cpu0_err = PTR_ERR(event); | 542 | cpu0_err = PTR_ERR(event); | 
| @@ -527,6 +548,18 @@ handle_err: | |||
| 527 | goto out_save; | 548 | goto out_save; | 
| 528 | } | 549 | } | 
| 529 | 550 | ||
| 551 | /* | ||
| 552 | * Disable the hard lockup detector if _any_ CPU fails to set up | ||
| 553 | * the hardware perf event. The watchdog() function checks | ||
| 554 | * the NMI_WATCHDOG_ENABLED bit periodically. | ||
| 555 | * | ||
| 556 | * The barriers are for syncing up watchdog_enabled across all the | ||
| 557 | * cpus, as clear_bit() does not use barriers. | ||
| 558 | */ | ||
| 559 | smp_mb__before_atomic(); | ||
| 560 | clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); | ||
| 561 | smp_mb__after_atomic(); | ||
| 562 | |||
| 530 | /* skip displaying the same error again */ | 563 | /* skip displaying the same error again */ | 
| 531 | if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) | 564 | if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) | 
| 532 | return PTR_ERR(event); | 565 | return PTR_ERR(event); | 
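The clear_bit()/smp_mb__*_atomic() pairing above is kernel-internal and cannot be compiled outside the tree, but the visibility pattern it implements can be illustrated with C11 atomics: one context clears the NMI bit with full ordering while other CPUs test it on their next watchdog tick. This is only an analogue of the pattern, not the kernel primitives; the bit values are stand-ins.

#include <stdatomic.h>
#include <stdio.h>

#define NMI_WATCHDOG_ENABLED	(1UL << 0)	/* stand-in bit values, not */
#define SOFT_WATCHDOG_ENABLED	(1UL << 1)	/* the kernel's definitions  */

static atomic_ulong watchdog_enabled;

/* Analogue of the failure path above: atomically clear the NMI bit with
 * full ordering so concurrent readers observe the update promptly. */
static void hardlockup_detector_shutdown(void)
{
	atomic_fetch_and_explicit(&watchdog_enabled, ~NMI_WATCHDOG_ENABLED,
				  memory_order_seq_cst);
}

int main(void)
{
	atomic_store(&watchdog_enabled,
		     NMI_WATCHDOG_ENABLED | SOFT_WATCHDOG_ENABLED);
	hardlockup_detector_shutdown();

	if (!(atomic_load(&watchdog_enabled) & NMI_WATCHDOG_ENABLED))
		printf("NMI bit cleared, remaining flags: %#lx\n",
		       (unsigned long)atomic_load(&watchdog_enabled));
	return 0;
}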
| @@ -540,6 +573,9 @@ handle_err: | |||
| 540 | else | 573 | else | 
| 541 | pr_err("disabled (cpu%i): unable to create perf event: %ld\n", | 574 | pr_err("disabled (cpu%i): unable to create perf event: %ld\n", | 
| 542 | cpu, PTR_ERR(event)); | 575 | cpu, PTR_ERR(event)); | 
| 576 | |||
| 577 | pr_info("Shutting down hard lockup detector on all cpus\n"); | ||
| 578 | |||
| 543 | return PTR_ERR(event); | 579 | return PTR_ERR(event); | 
| 544 | 580 | ||
| 545 | /* success path */ | 581 | /* success path */ | 
| @@ -567,9 +603,37 @@ static void watchdog_nmi_disable(unsigned int cpu) | |||
| 567 | cpu0_err = 0; | 603 | cpu0_err = 0; | 
| 568 | } | 604 | } | 
| 569 | } | 605 | } | 
| 606 | |||
| 607 | void watchdog_nmi_enable_all(void) | ||
| 608 | { | ||
| 609 | int cpu; | ||
| 610 | |||
| 611 | if (!watchdog_user_enabled) | ||
| 612 | return; | ||
| 613 | |||
| 614 | get_online_cpus(); | ||
| 615 | for_each_online_cpu(cpu) | ||
| 616 | watchdog_nmi_enable(cpu); | ||
| 617 | put_online_cpus(); | ||
| 618 | } | ||
| 619 | |||
| 620 | void watchdog_nmi_disable_all(void) | ||
| 621 | { | ||
| 622 | int cpu; | ||
| 623 | |||
| 624 | if (!watchdog_running) | ||
| 625 | return; | ||
| 626 | |||
| 627 | get_online_cpus(); | ||
| 628 | for_each_online_cpu(cpu) | ||
| 629 | watchdog_nmi_disable(cpu); | ||
| 630 | put_online_cpus(); | ||
| 631 | } | ||
| 570 | #else | 632 | #else | 
| 571 | static int watchdog_nmi_enable(unsigned int cpu) { return 0; } | 633 | static int watchdog_nmi_enable(unsigned int cpu) { return 0; } | 
| 572 | static void watchdog_nmi_disable(unsigned int cpu) { return; } | 634 | static void watchdog_nmi_disable(unsigned int cpu) { return; } | 
| 635 | void watchdog_nmi_enable_all(void) {} | ||
| 636 | void watchdog_nmi_disable_all(void) {} | ||
| 573 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ | 637 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ | 
| 574 | 638 | ||
| 575 | static struct smp_hotplug_thread watchdog_threads = { | 639 | static struct smp_hotplug_thread watchdog_threads = { | 
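watchdog_nmi_enable_all()/watchdog_nmi_disable_all() above walk the online CPUs under get_online_cpus(). There is no hotplug lock in user space, but the same "visit every online CPU" idea can be sketched by parsing the range list the kernel exports in sysfs; the helper below is purely illustrative and assumes the standard /sys/devices/system/cpu/online file.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Parse the online-CPU range list (e.g. "0-3,6") and visit each CPU id,
 * roughly what for_each_online_cpu() does inside the functions above. */
static void for_each_online_cpu_userspace(void (*fn)(int cpu))
{
	char buf[256];
	FILE *f = fopen("/sys/devices/system/cpu/online", "r");

	if (!f || !fgets(buf, sizeof(buf), f)) {
		perror("online cpu list");
		if (f)
			fclose(f);
		return;
	}
	fclose(f);

	for (char *tok = strtok(buf, ",\n"); tok; tok = strtok(NULL, ",\n")) {
		int lo, hi;

		if (sscanf(tok, "%d-%d", &lo, &hi) != 2)
			hi = lo = atoi(tok);
		for (int cpu = lo; cpu <= hi; cpu++)
			fn(cpu);
	}
}

static void visit(int cpu)
{
	printf("would enable the NMI watchdog on cpu%d\n", cpu);
}

int main(void)
{
	for_each_online_cpu_userspace(visit);
	return 0;
}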
| @@ -600,7 +664,7 @@ static void restart_watchdog_hrtimer(void *info) | |||
| 600 | HRTIMER_MODE_REL_PINNED); | 664 | HRTIMER_MODE_REL_PINNED); | 
| 601 | } | 665 | } | 
| 602 | 666 | ||
| 603 | static void update_timers(int cpu) | 667 | static void update_watchdog(int cpu) | 
| 604 | { | 668 | { | 
| 605 | /* | 669 | /* | 
| 606 | * Make sure that perf event counter will adapt to a new | 670 | * Make sure that perf event counter will adapt to a new | 
| @@ -615,17 +679,17 @@ static void update_timers(int cpu) | |||
| 615 | watchdog_nmi_enable(cpu); | 679 | watchdog_nmi_enable(cpu); | 
| 616 | } | 680 | } | 
| 617 | 681 | ||
| 618 | static void update_timers_all_cpus(void) | 682 | static void update_watchdog_all_cpus(void) | 
| 619 | { | 683 | { | 
| 620 | int cpu; | 684 | int cpu; | 
| 621 | 685 | ||
| 622 | get_online_cpus(); | 686 | get_online_cpus(); | 
| 623 | for_each_online_cpu(cpu) | 687 | for_each_online_cpu(cpu) | 
| 624 | update_timers(cpu); | 688 | update_watchdog(cpu); | 
| 625 | put_online_cpus(); | 689 | put_online_cpus(); | 
| 626 | } | 690 | } | 
| 627 | 691 | ||
| 628 | static int watchdog_enable_all_cpus(bool sample_period_changed) | 692 | static int watchdog_enable_all_cpus(void) | 
| 629 | { | 693 | { | 
| 630 | int err = 0; | 694 | int err = 0; | 
| 631 | 695 | ||
| @@ -635,8 +699,12 @@ static int watchdog_enable_all_cpus(bool sample_period_changed) | |||
| 635 | pr_err("Failed to create watchdog threads, disabled\n"); | 699 | pr_err("Failed to create watchdog threads, disabled\n"); | 
| 636 | else | 700 | else | 
| 637 | watchdog_running = 1; | 701 | watchdog_running = 1; | 
| 638 | } else if (sample_period_changed) { | 702 | } else { | 
| 639 | update_timers_all_cpus(); | 703 | /* | 
| 704 | * Enable/disable the lockup detectors or | ||
| 705 | * change the sample period 'on the fly'. | ||
| 706 | */ | ||
| 707 | update_watchdog_all_cpus(); | ||
| 640 | } | 708 | } | 
| 641 | 709 | ||
| 642 | return err; | 710 | return err; | 
| @@ -654,58 +722,159 @@ static void watchdog_disable_all_cpus(void) | |||
| 654 | } | 722 | } | 
| 655 | 723 | ||
| 656 | /* | 724 | /* | 
| 657 | * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh | 725 | * Update the run state of the lockup detectors. | 
| 658 | */ | 726 | */ | 
| 727 | static int proc_watchdog_update(void) | ||
| 728 | { | ||
| 729 | int err = 0; | ||
| 659 | 730 | ||
| 660 | int proc_dowatchdog(struct ctl_table *table, int write, | 731 | /* | 
| 661 | void __user *buffer, size_t *lenp, loff_t *ppos) | 732 | * Watchdog threads won't be started if they are already active. | 
| 733 | * The 'watchdog_running' variable in watchdog_*_all_cpus() takes | ||
| 734 | * care of this. If those threads are already active, the sample | ||
| 735 | * period will be updated and the lockup detectors will be enabled | ||
| 736 | * or disabled 'on the fly'. | ||
| 737 | */ | ||
| 738 | if (watchdog_enabled && watchdog_thresh) | ||
| 739 | err = watchdog_enable_all_cpus(); | ||
| 740 | else | ||
| 741 | watchdog_disable_all_cpus(); | ||
| 742 | |||
| 743 | return err; | ||
| 744 | |||
| 745 | } | ||
| 746 | |||
| 747 | static DEFINE_MUTEX(watchdog_proc_mutex); | ||
| 748 | |||
| 749 | /* | ||
| 750 | * common function for watchdog, nmi_watchdog and soft_watchdog parameter | ||
| 751 | * | ||
| 752 | * caller | table->data points to | 'which' contains the flag(s) | ||
| 753 | * -------------------|-----------------------|----------------------------- | ||
| 754 | * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed | ||
| 755 | * | | with SOFT_WATCHDOG_ENABLED | ||
| 756 | * -------------------|-----------------------|----------------------------- | ||
| 757 | * proc_nmi_watchdog | nmi_watchdog_enabled | NMI_WATCHDOG_ENABLED | ||
| 758 | * -------------------|-----------------------|----------------------------- | ||
| 759 | * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED | ||
| 760 | */ | ||
| 761 | static int proc_watchdog_common(int which, struct ctl_table *table, int write, | ||
| 762 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 662 | { | 763 | { | 
| 663 | int err, old_thresh, old_enabled; | 764 | int err, old, new; | 
| 664 | bool old_hardlockup; | 765 | int *watchdog_param = (int *)table->data; | 
| 665 | static DEFINE_MUTEX(watchdog_proc_mutex); | ||
| 666 | 766 | ||
| 667 | mutex_lock(&watchdog_proc_mutex); | 767 | mutex_lock(&watchdog_proc_mutex); | 
| 668 | old_thresh = ACCESS_ONCE(watchdog_thresh); | ||
| 669 | old_enabled = ACCESS_ONCE(watchdog_user_enabled); | ||
| 670 | old_hardlockup = watchdog_hardlockup_detector_is_enabled(); | ||
| 671 | 768 | ||
| 672 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
| 673 | if (err || !write) | ||
| 674 | goto out; | ||
| 675 | |||
| 676 | set_sample_period(); | ||
| 677 | /* | 769 | /* | 
| 678 | * Watchdog threads shouldn't be enabled if they are | 770 | * If the parameter is being read return the state of the corresponding | 
| 679 | * disabled. The 'watchdog_running' variable check in | 771 | * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the | 
| 680 | * watchdog_*_all_cpus() function takes care of this. | 772 | * run state of the lockup detectors. | 
| 681 | */ | 773 | */ | 
| 682 | if (watchdog_user_enabled && watchdog_thresh) { | 774 | if (!write) { | 
| 775 | *watchdog_param = (watchdog_enabled & which) != 0; | ||
| 776 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
| 777 | } else { | ||
| 778 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
| 779 | if (err) | ||
| 780 | goto out; | ||
| 781 | |||
| 683 | /* | 782 | /* | 
| 684 | * Prevent a change in watchdog_thresh accidentally overriding | 783 | * There is a race window between fetching the current value | 
| 685 | * the enablement of the hardlockup detector. | 784 | * from 'watchdog_enabled' and storing the new value. During | 
| 785 | * this race window, watchdog_nmi_enable() can sneak in and | ||
| 786 | * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'. | ||
| 787 | * The 'cmpxchg' detects this race and the loop retries. | ||
| 686 | */ | 788 | */ | 
| 687 | if (watchdog_user_enabled != old_enabled) | 789 | do { | 
| 688 | watchdog_enable_hardlockup_detector(true); | 790 | old = watchdog_enabled; | 
| 689 | err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh); | 791 | /* | 
| 690 | } else | 792 | * If the parameter value is not zero set the | 
| 691 | watchdog_disable_all_cpus(); | 793 | * corresponding bit(s), else clear it(them). | 
| 794 | */ | ||
| 795 | if (*watchdog_param) | ||
| 796 | new = old | which; | ||
| 797 | else | ||
| 798 | new = old & ~which; | ||
| 799 | } while (cmpxchg(&watchdog_enabled, old, new) != old); | ||
| 692 | 800 | ||
| 693 | /* Restore old values on failure */ | 801 | /* | 
| 694 | if (err) { | 802 | * Update the run state of the lockup detectors. | 
| 695 | watchdog_thresh = old_thresh; | 803 | * Restore 'watchdog_enabled' on failure. | 
| 696 | watchdog_user_enabled = old_enabled; | 804 | */ | 
| 697 | watchdog_enable_hardlockup_detector(old_hardlockup); | 805 | err = proc_watchdog_update(); | 
| 806 | if (err) | ||
| 807 | watchdog_enabled = old; | ||
| 698 | } | 808 | } | 
| 699 | out: | 809 | out: | 
| 700 | mutex_unlock(&watchdog_proc_mutex); | 810 | mutex_unlock(&watchdog_proc_mutex); | 
| 701 | return err; | 811 | return err; | 
| 702 | } | 812 | } | 
| 813 | |||
| 814 | /* | ||
| 815 | * /proc/sys/kernel/watchdog | ||
| 816 | */ | ||
| 817 | int proc_watchdog(struct ctl_table *table, int write, | ||
| 818 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 819 | { | ||
| 820 | return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED, | ||
| 821 | table, write, buffer, lenp, ppos); | ||
| 822 | } | ||
| 823 | |||
| 824 | /* | ||
| 825 | * /proc/sys/kernel/nmi_watchdog | ||
| 826 | */ | ||
| 827 | int proc_nmi_watchdog(struct ctl_table *table, int write, | ||
| 828 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 829 | { | ||
| 830 | return proc_watchdog_common(NMI_WATCHDOG_ENABLED, | ||
| 831 | table, write, buffer, lenp, ppos); | ||
| 832 | } | ||
| 833 | |||
| 834 | /* | ||
| 835 | * /proc/sys/kernel/soft_watchdog | ||
| 836 | */ | ||
| 837 | int proc_soft_watchdog(struct ctl_table *table, int write, | ||
| 838 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 839 | { | ||
| 840 | return proc_watchdog_common(SOFT_WATCHDOG_ENABLED, | ||
| 841 | table, write, buffer, lenp, ppos); | ||
| 842 | } | ||
| 843 | |||
| 844 | /* | ||
| 845 | * /proc/sys/kernel/watchdog_thresh | ||
| 846 | */ | ||
| 847 | int proc_watchdog_thresh(struct ctl_table *table, int write, | ||
| 848 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 849 | { | ||
| 850 | int err, old; | ||
| 851 | |||
| 852 | mutex_lock(&watchdog_proc_mutex); | ||
| 853 | |||
| 854 | old = ACCESS_ONCE(watchdog_thresh); | ||
| 855 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
| 856 | |||
| 857 | if (err || !write) | ||
| 858 | goto out; | ||
| 859 | |||
| 860 | /* | ||
| 861 | * Update the sample period. | ||
| 862 | * Restore 'watchdog_thresh' on failure. | ||
| 863 | */ | ||
| 864 | set_sample_period(); | ||
| 865 | err = proc_watchdog_update(); | ||
| 866 | if (err) | ||
| 867 | watchdog_thresh = old; | ||
| 868 | out: | ||
| 869 | mutex_unlock(&watchdog_proc_mutex); | ||
| 870 | return err; | ||
| 871 | } | ||
| 703 | #endif /* CONFIG_SYSCTL */ | 872 | #endif /* CONFIG_SYSCTL */ | 
| 704 | 873 | ||
| 705 | void __init lockup_detector_init(void) | 874 | void __init lockup_detector_init(void) | 
| 706 | { | 875 | { | 
| 707 | set_sample_period(); | 876 | set_sample_period(); | 
| 708 | 877 | ||
| 709 | if (watchdog_user_enabled) | 878 | if (watchdog_enabled) | 
| 710 | watchdog_enable_all_cpus(false); | 879 | watchdog_enable_all_cpus(); | 
| 711 | } | 880 | } | 
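The net effect of the proc handlers added above is a set of separate knobs under /proc/sys/kernel, with the paths documented in the handler comments. A small user-space probe can read them back (writes require root, and a file only exists when the corresponding detector is configured into the kernel):

#include <stdio.h>

/* Read the lockup-detector sysctls handled by proc_watchdog(),
 * proc_nmi_watchdog(), proc_soft_watchdog() and proc_watchdog_thresh().
 * Availability depends on the kernel configuration, e.g. nmi_watchdog
 * needs CONFIG_HARDLOCKUP_DETECTOR. */
int main(void)
{
	const char *knobs[] = {
		"/proc/sys/kernel/watchdog",
		"/proc/sys/kernel/nmi_watchdog",
		"/proc/sys/kernel/soft_watchdog",
		"/proc/sys/kernel/watchdog_thresh",
	};

	for (unsigned i = 0; i < sizeof(knobs) / sizeof(knobs[0]); i++) {
		FILE *f = fopen(knobs[i], "r");
		int val;

		if (!f) {
			printf("%s: not available\n", knobs[i]);
			continue;
		}
		if (fscanf(f, "%d", &val) == 1)
			printf("%s = %d\n", knobs[i], val);
		fclose(f);
	}
	return 0;
}

Writing 0 or 1 to these files goes through proc_watchdog_common(), which sets or clears the corresponding bit(s) in watchdog_enabled and then updates the run state via proc_watchdog_update().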
| diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 41ff75b478c6..586ad91300b0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -159,6 +159,7 @@ struct worker_pool { | |||
| 159 | 159 | ||
| 160 | /* see manage_workers() for details on the two manager mutexes */ | 160 | /* see manage_workers() for details on the two manager mutexes */ | 
| 161 | struct mutex manager_arb; /* manager arbitration */ | 161 | struct mutex manager_arb; /* manager arbitration */ | 
| 162 | struct worker *manager; /* L: purely informational */ | ||
| 162 | struct mutex attach_mutex; /* attach/detach exclusion */ | 163 | struct mutex attach_mutex; /* attach/detach exclusion */ | 
| 163 | struct list_head workers; /* A: attached workers */ | 164 | struct list_head workers; /* A: attached workers */ | 
| 164 | struct completion *detach_completion; /* all workers detached */ | 165 | struct completion *detach_completion; /* all workers detached */ | 
| @@ -230,7 +231,7 @@ struct wq_device; | |||
| 230 | */ | 231 | */ | 
| 231 | struct workqueue_struct { | 232 | struct workqueue_struct { | 
| 232 | struct list_head pwqs; /* WR: all pwqs of this wq */ | 233 | struct list_head pwqs; /* WR: all pwqs of this wq */ | 
| 233 | struct list_head list; /* PL: list of all workqueues */ | 234 | struct list_head list; /* PR: list of all workqueues */ | 
| 234 | 235 | ||
| 235 | struct mutex mutex; /* protects this wq */ | 236 | struct mutex mutex; /* protects this wq */ | 
| 236 | int work_color; /* WQ: current work color */ | 237 | int work_color; /* WQ: current work color */ | 
| @@ -257,6 +258,13 @@ struct workqueue_struct { | |||
| 257 | #endif | 258 | #endif | 
| 258 | char name[WQ_NAME_LEN]; /* I: workqueue name */ | 259 | char name[WQ_NAME_LEN]; /* I: workqueue name */ | 
| 259 | 260 | ||
| 261 | /* | ||
| 262 | * Destruction of workqueue_struct is sched-RCU protected to allow | ||
| 263 | * walking the workqueues list without grabbing wq_pool_mutex. | ||
| 264 | * This is used to dump all workqueues from sysrq. | ||
| 265 | */ | ||
| 266 | struct rcu_head rcu; | ||
| 267 | |||
| 260 | /* hot fields used during command issue, aligned to cacheline */ | 268 | /* hot fields used during command issue, aligned to cacheline */ | 
| 261 | unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ | 269 | unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ | 
| 262 | struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ | 270 | struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ | 
| @@ -288,7 +296,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; | |||
| 288 | static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ | 296 | static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ | 
| 289 | static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ | 297 | static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ | 
| 290 | 298 | ||
| 291 | static LIST_HEAD(workqueues); /* PL: list of all workqueues */ | 299 | static LIST_HEAD(workqueues); /* PR: list of all workqueues */ | 
| 292 | static bool workqueue_freezing; /* PL: have wqs started freezing? */ | 300 | static bool workqueue_freezing; /* PL: have wqs started freezing? */ | 
| 293 | 301 | ||
| 294 | /* the per-cpu worker pools */ | 302 | /* the per-cpu worker pools */ | 
| @@ -324,6 +332,7 @@ EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); | |||
| 324 | static int worker_thread(void *__worker); | 332 | static int worker_thread(void *__worker); | 
| 325 | static void copy_workqueue_attrs(struct workqueue_attrs *to, | 333 | static void copy_workqueue_attrs(struct workqueue_attrs *to, | 
| 326 | const struct workqueue_attrs *from); | 334 | const struct workqueue_attrs *from); | 
| 335 | static void workqueue_sysfs_unregister(struct workqueue_struct *wq); | ||
| 327 | 336 | ||
| 328 | #define CREATE_TRACE_POINTS | 337 | #define CREATE_TRACE_POINTS | 
| 329 | #include <trace/events/workqueue.h> | 338 | #include <trace/events/workqueue.h> | 
| @@ -1911,9 +1920,11 @@ static bool manage_workers(struct worker *worker) | |||
| 1911 | */ | 1920 | */ | 
| 1912 | if (!mutex_trylock(&pool->manager_arb)) | 1921 | if (!mutex_trylock(&pool->manager_arb)) | 
| 1913 | return false; | 1922 | return false; | 
| 1923 | pool->manager = worker; | ||
| 1914 | 1924 | ||
| 1915 | maybe_create_worker(pool); | 1925 | maybe_create_worker(pool); | 
| 1916 | 1926 | ||
| 1927 | pool->manager = NULL; | ||
| 1917 | mutex_unlock(&pool->manager_arb); | 1928 | mutex_unlock(&pool->manager_arb); | 
| 1918 | return true; | 1929 | return true; | 
| 1919 | } | 1930 | } | 
| @@ -2303,6 +2314,7 @@ repeat: | |||
| 2303 | struct wq_barrier { | 2314 | struct wq_barrier { | 
| 2304 | struct work_struct work; | 2315 | struct work_struct work; | 
| 2305 | struct completion done; | 2316 | struct completion done; | 
| 2317 | struct task_struct *task; /* purely informational */ | ||
| 2306 | }; | 2318 | }; | 
| 2307 | 2319 | ||
| 2308 | static void wq_barrier_func(struct work_struct *work) | 2320 | static void wq_barrier_func(struct work_struct *work) | 
| @@ -2351,6 +2363,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, | |||
| 2351 | INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); | 2363 | INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); | 
| 2352 | __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); | 2364 | __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); | 
| 2353 | init_completion(&barr->done); | 2365 | init_completion(&barr->done); | 
| 2366 | barr->task = current; | ||
| 2354 | 2367 | ||
| 2355 | /* | 2368 | /* | 
| 2356 | * If @target is currently being executed, schedule the | 2369 | * If @target is currently being executed, schedule the | 
| @@ -2989,323 +3002,6 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew) | |||
| 2989 | } | 3002 | } | 
| 2990 | EXPORT_SYMBOL_GPL(execute_in_process_context); | 3003 | EXPORT_SYMBOL_GPL(execute_in_process_context); | 
| 2991 | 3004 | ||
| 2992 | #ifdef CONFIG_SYSFS | ||
| 2993 | /* | ||
| 2994 | * Workqueues with WQ_SYSFS flag set is visible to userland via | ||
| 2995 | * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the | ||
| 2996 | * following attributes. | ||
| 2997 | * | ||
| 2998 | * per_cpu RO bool : whether the workqueue is per-cpu or unbound | ||
| 2999 | * max_active RW int : maximum number of in-flight work items | ||
| 3000 | * | ||
| 3001 | * Unbound workqueues have the following extra attributes. | ||
| 3002 | * | ||
| 3003 | * id RO int : the associated pool ID | ||
| 3004 | * nice RW int : nice value of the workers | ||
| 3005 | * cpumask RW mask : bitmask of allowed CPUs for the workers | ||
| 3006 | */ | ||
| 3007 | struct wq_device { | ||
| 3008 | struct workqueue_struct *wq; | ||
| 3009 | struct device dev; | ||
| 3010 | }; | ||
| 3011 | |||
| 3012 | static struct workqueue_struct *dev_to_wq(struct device *dev) | ||
| 3013 | { | ||
| 3014 | struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); | ||
| 3015 | |||
| 3016 | return wq_dev->wq; | ||
| 3017 | } | ||
| 3018 | |||
| 3019 | static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, | ||
| 3020 | char *buf) | ||
| 3021 | { | ||
| 3022 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 3023 | |||
| 3024 | return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); | ||
| 3025 | } | ||
| 3026 | static DEVICE_ATTR_RO(per_cpu); | ||
| 3027 | |||
| 3028 | static ssize_t max_active_show(struct device *dev, | ||
| 3029 | struct device_attribute *attr, char *buf) | ||
| 3030 | { | ||
| 3031 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 3032 | |||
| 3033 | return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); | ||
| 3034 | } | ||
| 3035 | |||
| 3036 | static ssize_t max_active_store(struct device *dev, | ||
| 3037 | struct device_attribute *attr, const char *buf, | ||
| 3038 | size_t count) | ||
| 3039 | { | ||
| 3040 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 3041 | int val; | ||
| 3042 | |||
| 3043 | if (sscanf(buf, "%d", &val) != 1 || val <= 0) | ||
| 3044 | return -EINVAL; | ||
| 3045 | |||
| 3046 | workqueue_set_max_active(wq, val); | ||
| 3047 | return count; | ||
| 3048 | } | ||
| 3049 | static DEVICE_ATTR_RW(max_active); | ||
| 3050 | |||
| 3051 | static struct attribute *wq_sysfs_attrs[] = { | ||
| 3052 | &dev_attr_per_cpu.attr, | ||
| 3053 | &dev_attr_max_active.attr, | ||
| 3054 | NULL, | ||
| 3055 | }; | ||
| 3056 | ATTRIBUTE_GROUPS(wq_sysfs); | ||
| 3057 | |||
| 3058 | static ssize_t wq_pool_ids_show(struct device *dev, | ||
| 3059 | struct device_attribute *attr, char *buf) | ||
| 3060 | { | ||
| 3061 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 3062 | const char *delim = ""; | ||
| 3063 | int node, written = 0; | ||
| 3064 | |||
| 3065 | rcu_read_lock_sched(); | ||
| 3066 | for_each_node(node) { | ||
| 3067 | written += scnprintf(buf + written, PAGE_SIZE - written, | ||
| 3068 | "%s%d:%d", delim, node, | ||
| 3069 | unbound_pwq_by_node(wq, node)->pool->id); | ||
| 3070 | delim = " "; | ||
| 3071 | } | ||
| 3072 | written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); | ||
| 3073 | rcu_read_unlock_sched(); | ||
| 3074 | |||
| 3075 | return written; | ||
| 3076 | } | ||
| 3077 | |||
| 3078 | static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, | ||
| 3079 | char *buf) | ||
| 3080 | { | ||
| 3081 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 3082 | int written; | ||
| 3083 | |||
| 3084 | mutex_lock(&wq->mutex); | ||
| 3085 | written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); | ||
| 3086 | mutex_unlock(&wq->mutex); | ||
| 3087 | |||
| 3088 | return written; | ||
| 3089 | } | ||
| 3090 | |||
| 3091 | /* prepare workqueue_attrs for sysfs store operations */ | ||
| 3092 | static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) | ||
| 3093 | { | ||
| 3094 | struct workqueue_attrs *attrs; | ||
| 3095 | |||
| 3096 | attrs = alloc_workqueue_attrs(GFP_KERNEL); | ||
| 3097 | if (!attrs) | ||
| 3098 | return NULL; | ||
| 3099 | |||
| 3100 | mutex_lock(&wq->mutex); | ||
| 3101 | copy_workqueue_attrs(attrs, wq->unbound_attrs); | ||
| 3102 | mutex_unlock(&wq->mutex); | ||
| 3103 | return attrs; | ||
| 3104 | } | ||
| 3105 | |||
| 3106 | static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, | ||
| 3107 | const char *buf, size_t count) | ||
| 3108 | { | ||
| 3109 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 3110 | struct workqueue_attrs *attrs; | ||
| 3111 | int ret; | ||
| 3112 | |||
| 3113 | attrs = wq_sysfs_prep_attrs(wq); | ||
| 3114 | if (!attrs) | ||
| 3115 | return -ENOMEM; | ||
| 3116 | |||
| 3117 | if (sscanf(buf, "%d", &attrs->nice) == 1 && | ||
| 3118 | attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) | ||
| 3119 | ret = apply_workqueue_attrs(wq, attrs); | ||
| 3120 | else | ||
| 3121 | ret = -EINVAL; | ||
| 3122 | |||
| 3123 | free_workqueue_attrs(attrs); | ||
| 3124 | return ret ?: count; | ||
| 3125 | } | ||
| 3126 | |||
| 3127 | static ssize_t wq_cpumask_show(struct device *dev, | ||
| 3128 | struct device_attribute *attr, char *buf) | ||
| 3129 | { | ||
| 3130 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 3131 | int written; | ||
| 3132 | |||
| 3133 | mutex_lock(&wq->mutex); | ||
| 3134 | written = scnprintf(buf, PAGE_SIZE, "%*pb\n", | ||
| 3135 | cpumask_pr_args(wq->unbound_attrs->cpumask)); | ||
| 3136 | mutex_unlock(&wq->mutex); | ||
| 3137 | return written; | ||
| 3138 | } | ||
| 3139 | |||
| 3140 | static ssize_t wq_cpumask_store(struct device *dev, | ||
| 3141 | struct device_attribute *attr, | ||
| 3142 | const char *buf, size_t count) | ||
| 3143 | { | ||
| 3144 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 3145 | struct workqueue_attrs *attrs; | ||
| 3146 | int ret; | ||
| 3147 | |||
| 3148 | attrs = wq_sysfs_prep_attrs(wq); | ||
| 3149 | if (!attrs) | ||
| 3150 | return -ENOMEM; | ||
| 3151 | |||
| 3152 | ret = cpumask_parse(buf, attrs->cpumask); | ||
| 3153 | if (!ret) | ||
| 3154 | ret = apply_workqueue_attrs(wq, attrs); | ||
| 3155 | |||
| 3156 | free_workqueue_attrs(attrs); | ||
| 3157 | return ret ?: count; | ||
| 3158 | } | ||
| 3159 | |||
| 3160 | static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, | ||
| 3161 | char *buf) | ||
| 3162 | { | ||
| 3163 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 3164 | int written; | ||
| 3165 | |||
| 3166 | mutex_lock(&wq->mutex); | ||
| 3167 | written = scnprintf(buf, PAGE_SIZE, "%d\n", | ||
| 3168 | !wq->unbound_attrs->no_numa); | ||
| 3169 | mutex_unlock(&wq->mutex); | ||
| 3170 | |||
| 3171 | return written; | ||
| 3172 | } | ||
| 3173 | |||
| 3174 | static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, | ||
| 3175 | const char *buf, size_t count) | ||
| 3176 | { | ||
| 3177 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 3178 | struct workqueue_attrs *attrs; | ||
| 3179 | int v, ret; | ||
| 3180 | |||
| 3181 | attrs = wq_sysfs_prep_attrs(wq); | ||
| 3182 | if (!attrs) | ||
| 3183 | return -ENOMEM; | ||
| 3184 | |||
| 3185 | ret = -EINVAL; | ||
| 3186 | if (sscanf(buf, "%d", &v) == 1) { | ||
| 3187 | attrs->no_numa = !v; | ||
| 3188 | ret = apply_workqueue_attrs(wq, attrs); | ||
| 3189 | } | ||
| 3190 | |||
| 3191 | free_workqueue_attrs(attrs); | ||
| 3192 | return ret ?: count; | ||
| 3193 | } | ||
| 3194 | |||
| 3195 | static struct device_attribute wq_sysfs_unbound_attrs[] = { | ||
| 3196 | __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), | ||
| 3197 | __ATTR(nice, 0644, wq_nice_show, wq_nice_store), | ||
| 3198 | __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), | ||
| 3199 | __ATTR(numa, 0644, wq_numa_show, wq_numa_store), | ||
| 3200 | __ATTR_NULL, | ||
| 3201 | }; | ||
| 3202 | |||
| 3203 | static struct bus_type wq_subsys = { | ||
| 3204 | .name = "workqueue", | ||
| 3205 | .dev_groups = wq_sysfs_groups, | ||
| 3206 | }; | ||
| 3207 | |||
| 3208 | static int __init wq_sysfs_init(void) | ||
| 3209 | { | ||
| 3210 | return subsys_virtual_register(&wq_subsys, NULL); | ||
| 3211 | } | ||
| 3212 | core_initcall(wq_sysfs_init); | ||
| 3213 | |||
| 3214 | static void wq_device_release(struct device *dev) | ||
| 3215 | { | ||
| 3216 | struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); | ||
| 3217 | |||
| 3218 | kfree(wq_dev); | ||
| 3219 | } | ||
| 3220 | |||
| 3221 | /** | ||
| 3222 | * workqueue_sysfs_register - make a workqueue visible in sysfs | ||
| 3223 | * @wq: the workqueue to register | ||
| 3224 | * | ||
| 3225 | * Expose @wq in sysfs under /sys/bus/workqueue/devices. | ||
| 3226 | * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set | ||
| 3227 | * which is the preferred method. | ||
| 3228 | * | ||
| 3229 | * Workqueue user should use this function directly iff it wants to apply | ||
| 3230 | * workqueue_attrs before making the workqueue visible in sysfs; otherwise, | ||
| 3231 | * apply_workqueue_attrs() may race against userland updating the | ||
| 3232 | * attributes. | ||
| 3233 | * | ||
| 3234 | * Return: 0 on success, -errno on failure. | ||
| 3235 | */ | ||
| 3236 | int workqueue_sysfs_register(struct workqueue_struct *wq) | ||
| 3237 | { | ||
| 3238 | struct wq_device *wq_dev; | ||
| 3239 | int ret; | ||
| 3240 | |||
| 3241 | /* | ||
| 3242 | * Adjusting max_active or creating new pwqs by applyting | ||
| 3243 | * attributes breaks ordering guarantee. Disallow exposing ordered | ||
| 3244 | * workqueues. | ||
| 3245 | */ | ||
| 3246 | if (WARN_ON(wq->flags & __WQ_ORDERED)) | ||
| 3247 | return -EINVAL; | ||
| 3248 | |||
| 3249 | wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); | ||
| 3250 | if (!wq_dev) | ||
| 3251 | return -ENOMEM; | ||
| 3252 | |||
| 3253 | wq_dev->wq = wq; | ||
| 3254 | wq_dev->dev.bus = &wq_subsys; | ||
| 3255 | wq_dev->dev.init_name = wq->name; | ||
| 3256 | wq_dev->dev.release = wq_device_release; | ||
| 3257 | |||
| 3258 | /* | ||
| 3259 | * unbound_attrs are created separately. Suppress uevent until | ||
| 3260 | * everything is ready. | ||
| 3261 | */ | ||
| 3262 | dev_set_uevent_suppress(&wq_dev->dev, true); | ||
| 3263 | |||
| 3264 | ret = device_register(&wq_dev->dev); | ||
| 3265 | if (ret) { | ||
| 3266 | kfree(wq_dev); | ||
| 3267 | wq->wq_dev = NULL; | ||
| 3268 | return ret; | ||
| 3269 | } | ||
| 3270 | |||
| 3271 | if (wq->flags & WQ_UNBOUND) { | ||
| 3272 | struct device_attribute *attr; | ||
| 3273 | |||
| 3274 | for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { | ||
| 3275 | ret = device_create_file(&wq_dev->dev, attr); | ||
| 3276 | if (ret) { | ||
| 3277 | device_unregister(&wq_dev->dev); | ||
| 3278 | wq->wq_dev = NULL; | ||
| 3279 | return ret; | ||
| 3280 | } | ||
| 3281 | } | ||
| 3282 | } | ||
| 3283 | |||
| 3284 | dev_set_uevent_suppress(&wq_dev->dev, false); | ||
| 3285 | kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); | ||
| 3286 | return 0; | ||
| 3287 | } | ||
| 3288 | |||
| 3289 | /** | ||
| 3290 | * workqueue_sysfs_unregister - undo workqueue_sysfs_register() | ||
| 3291 | * @wq: the workqueue to unregister | ||
| 3292 | * | ||
| 3293 | * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. | ||
| 3294 | */ | ||
| 3295 | static void workqueue_sysfs_unregister(struct workqueue_struct *wq) | ||
| 3296 | { | ||
| 3297 | struct wq_device *wq_dev = wq->wq_dev; | ||
| 3298 | |||
| 3299 | if (!wq->wq_dev) | ||
| 3300 | return; | ||
| 3301 | |||
| 3302 | wq->wq_dev = NULL; | ||
| 3303 | device_unregister(&wq_dev->dev); | ||
| 3304 | } | ||
| 3305 | #else /* CONFIG_SYSFS */ | ||
| 3306 | static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } | ||
| 3307 | #endif /* CONFIG_SYSFS */ | ||
| 3308 | |||
| 3309 | /** | 3005 | /** | 
| 3310 | * free_workqueue_attrs - free a workqueue_attrs | 3006 | * free_workqueue_attrs - free a workqueue_attrs | 
| 3311 | * @attrs: workqueue_attrs to free | 3007 | * @attrs: workqueue_attrs to free | 
| @@ -3424,6 +3120,20 @@ static int init_worker_pool(struct worker_pool *pool) | |||
| 3424 | return 0; | 3120 | return 0; | 
| 3425 | } | 3121 | } | 
| 3426 | 3122 | ||
| 3123 | static void rcu_free_wq(struct rcu_head *rcu) | ||
| 3124 | { | ||
| 3125 | struct workqueue_struct *wq = | ||
| 3126 | container_of(rcu, struct workqueue_struct, rcu); | ||
| 3127 | |||
| 3128 | if (!(wq->flags & WQ_UNBOUND)) | ||
| 3129 | free_percpu(wq->cpu_pwqs); | ||
| 3130 | else | ||
| 3131 | free_workqueue_attrs(wq->unbound_attrs); | ||
| 3132 | |||
| 3133 | kfree(wq->rescuer); | ||
| 3134 | kfree(wq); | ||
| 3135 | } | ||
| 3136 | |||
| 3427 | static void rcu_free_pool(struct rcu_head *rcu) | 3137 | static void rcu_free_pool(struct rcu_head *rcu) | 
| 3428 | { | 3138 | { | 
| 3429 | struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); | 3139 | struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); | 
| @@ -3601,12 +3311,10 @@ static void pwq_unbound_release_workfn(struct work_struct *work) | |||
| 3601 | 3311 | ||
| 3602 | /* | 3312 | /* | 
| 3603 | * If we're the last pwq going away, @wq is already dead and no one | 3313 | * If we're the last pwq going away, @wq is already dead and no one | 
| 3604 | * is gonna access it anymore. Free it. | 3314 | * is gonna access it anymore. Schedule RCU free. | 
| 3605 | */ | 3315 | */ | 
| 3606 | if (is_last) { | 3316 | if (is_last) | 
| 3607 | free_workqueue_attrs(wq->unbound_attrs); | 3317 | call_rcu_sched(&wq->rcu, rcu_free_wq); | 
| 3608 | kfree(wq); | ||
| 3609 | } | ||
| 3610 | } | 3318 | } | 
| 3611 | 3319 | ||
| 3612 | /** | 3320 | /** | 
| @@ -4143,7 +3851,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, | |||
| 4143 | pwq_adjust_max_active(pwq); | 3851 | pwq_adjust_max_active(pwq); | 
| 4144 | mutex_unlock(&wq->mutex); | 3852 | mutex_unlock(&wq->mutex); | 
| 4145 | 3853 | ||
| 4146 | list_add(&wq->list, &workqueues); | 3854 | list_add_tail_rcu(&wq->list, &workqueues); | 
| 4147 | 3855 | ||
| 4148 | mutex_unlock(&wq_pool_mutex); | 3856 | mutex_unlock(&wq_pool_mutex); | 
| 4149 | 3857 | ||
| @@ -4199,24 +3907,20 @@ void destroy_workqueue(struct workqueue_struct *wq) | |||
| 4199 | * flushing is complete in case freeze races us. | 3907 | * flushing is complete in case freeze races us. | 
| 4200 | */ | 3908 | */ | 
| 4201 | mutex_lock(&wq_pool_mutex); | 3909 | mutex_lock(&wq_pool_mutex); | 
| 4202 | list_del_init(&wq->list); | 3910 | list_del_rcu(&wq->list); | 
| 4203 | mutex_unlock(&wq_pool_mutex); | 3911 | mutex_unlock(&wq_pool_mutex); | 
| 4204 | 3912 | ||
| 4205 | workqueue_sysfs_unregister(wq); | 3913 | workqueue_sysfs_unregister(wq); | 
| 4206 | 3914 | ||
| 4207 | if (wq->rescuer) { | 3915 | if (wq->rescuer) | 
| 4208 | kthread_stop(wq->rescuer->task); | 3916 | kthread_stop(wq->rescuer->task); | 
| 4209 | kfree(wq->rescuer); | ||
| 4210 | wq->rescuer = NULL; | ||
| 4211 | } | ||
| 4212 | 3917 | ||
| 4213 | if (!(wq->flags & WQ_UNBOUND)) { | 3918 | if (!(wq->flags & WQ_UNBOUND)) { | 
| 4214 | /* | 3919 | /* | 
| 4215 | * The base ref is never dropped on per-cpu pwqs. Directly | 3920 | * The base ref is never dropped on per-cpu pwqs. Directly | 
| 4216 | * free the pwqs and wq. | 3921 | * schedule RCU free. | 
| 4217 | */ | 3922 | */ | 
| 4218 | free_percpu(wq->cpu_pwqs); | 3923 | call_rcu_sched(&wq->rcu, rcu_free_wq); | 
| 4219 | kfree(wq); | ||
| 4220 | } else { | 3924 | } else { | 
| 4221 | /* | 3925 | /* | 
| 4222 | * We're the sole accessor of @wq at this point. Directly | 3926 | * We're the sole accessor of @wq at this point. Directly | 
| @@ -4437,6 +4141,166 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) | |||
| 4437 | } | 4141 | } | 
| 4438 | } | 4142 | } | 
| 4439 | 4143 | ||
| 4144 | static void pr_cont_pool_info(struct worker_pool *pool) | ||
| 4145 | { | ||
| 4146 | pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask); | ||
| 4147 | if (pool->node != NUMA_NO_NODE) | ||
| 4148 | pr_cont(" node=%d", pool->node); | ||
| 4149 | pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice); | ||
| 4150 | } | ||
| 4151 | |||
| 4152 | static void pr_cont_work(bool comma, struct work_struct *work) | ||
| 4153 | { | ||
| 4154 | if (work->func == wq_barrier_func) { | ||
| 4155 | struct wq_barrier *barr; | ||
| 4156 | |||
| 4157 | barr = container_of(work, struct wq_barrier, work); | ||
| 4158 | |||
| 4159 | pr_cont("%s BAR(%d)", comma ? "," : "", | ||
| 4160 | task_pid_nr(barr->task)); | ||
| 4161 | } else { | ||
| 4162 | pr_cont("%s %pf", comma ? "," : "", work->func); | ||
| 4163 | } | ||
| 4164 | } | ||
| 4165 | |||
| 4166 | static void show_pwq(struct pool_workqueue *pwq) | ||
| 4167 | { | ||
| 4168 | struct worker_pool *pool = pwq->pool; | ||
| 4169 | struct work_struct *work; | ||
| 4170 | struct worker *worker; | ||
| 4171 | bool has_in_flight = false, has_pending = false; | ||
| 4172 | int bkt; | ||
| 4173 | |||
| 4174 | pr_info(" pwq %d:", pool->id); | ||
| 4175 | pr_cont_pool_info(pool); | ||
| 4176 | |||
| 4177 | pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active, | ||
| 4178 | !list_empty(&pwq->mayday_node) ? " MAYDAY" : ""); | ||
| 4179 | |||
| 4180 | hash_for_each(pool->busy_hash, bkt, worker, hentry) { | ||
| 4181 | if (worker->current_pwq == pwq) { | ||
| 4182 | has_in_flight = true; | ||
| 4183 | break; | ||
| 4184 | } | ||
| 4185 | } | ||
| 4186 | if (has_in_flight) { | ||
| 4187 | bool comma = false; | ||
| 4188 | |||
| 4189 | pr_info(" in-flight:"); | ||
| 4190 | hash_for_each(pool->busy_hash, bkt, worker, hentry) { | ||
| 4191 | if (worker->current_pwq != pwq) | ||
| 4192 | continue; | ||
| 4193 | |||
| 4194 | pr_cont("%s %d%s:%pf", comma ? "," : "", | ||
| 4195 | task_pid_nr(worker->task), | ||
| 4196 | worker == pwq->wq->rescuer ? "(RESCUER)" : "", | ||
| 4197 | worker->current_func); | ||
| 4198 | list_for_each_entry(work, &worker->scheduled, entry) | ||
| 4199 | pr_cont_work(false, work); | ||
| 4200 | comma = true; | ||
| 4201 | } | ||
| 4202 | pr_cont("\n"); | ||
| 4203 | } | ||
| 4204 | |||
| 4205 | list_for_each_entry(work, &pool->worklist, entry) { | ||
| 4206 | if (get_work_pwq(work) == pwq) { | ||
| 4207 | has_pending = true; | ||
| 4208 | break; | ||
| 4209 | } | ||
| 4210 | } | ||
| 4211 | if (has_pending) { | ||
| 4212 | bool comma = false; | ||
| 4213 | |||
| 4214 | pr_info(" pending:"); | ||
| 4215 | list_for_each_entry(work, &pool->worklist, entry) { | ||
| 4216 | if (get_work_pwq(work) != pwq) | ||
| 4217 | continue; | ||
| 4218 | |||
| 4219 | pr_cont_work(comma, work); | ||
| 4220 | comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); | ||
| 4221 | } | ||
| 4222 | pr_cont("\n"); | ||
| 4223 | } | ||
| 4224 | |||
| 4225 | if (!list_empty(&pwq->delayed_works)) { | ||
| 4226 | bool comma = false; | ||
| 4227 | |||
| 4228 | pr_info(" delayed:"); | ||
| 4229 | list_for_each_entry(work, &pwq->delayed_works, entry) { | ||
| 4230 | pr_cont_work(comma, work); | ||
| 4231 | comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); | ||
| 4232 | } | ||
| 4233 | pr_cont("\n"); | ||
| 4234 | } | ||
| 4235 | } | ||
| 4236 | |||
| 4237 | /** | ||
| 4238 | * show_workqueue_state - dump workqueue state | ||
| 4239 | * | ||
| 4240 | * Called from a sysrq handler and prints out all busy workqueues and | ||
| 4241 | * pools. | ||
| 4242 | */ | ||
| 4243 | void show_workqueue_state(void) | ||
| 4244 | { | ||
| 4245 | struct workqueue_struct *wq; | ||
| 4246 | struct worker_pool *pool; | ||
| 4247 | unsigned long flags; | ||
| 4248 | int pi; | ||
| 4249 | |||
| 4250 | rcu_read_lock_sched(); | ||
| 4251 | |||
| 4252 | pr_info("Showing busy workqueues and worker pools:\n"); | ||
| 4253 | |||
| 4254 | list_for_each_entry_rcu(wq, &workqueues, list) { | ||
| 4255 | struct pool_workqueue *pwq; | ||
| 4256 | bool idle = true; | ||
| 4257 | |||
| 4258 | for_each_pwq(pwq, wq) { | ||
| 4259 | if (pwq->nr_active || !list_empty(&pwq->delayed_works)) { | ||
| 4260 | idle = false; | ||
| 4261 | break; | ||
| 4262 | } | ||
| 4263 | } | ||
| 4264 | if (idle) | ||
| 4265 | continue; | ||
| 4266 | |||
| 4267 | pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags); | ||
| 4268 | |||
| 4269 | for_each_pwq(pwq, wq) { | ||
| 4270 | spin_lock_irqsave(&pwq->pool->lock, flags); | ||
| 4271 | if (pwq->nr_active || !list_empty(&pwq->delayed_works)) | ||
| 4272 | show_pwq(pwq); | ||
| 4273 | spin_unlock_irqrestore(&pwq->pool->lock, flags); | ||
| 4274 | } | ||
| 4275 | } | ||
| 4276 | |||
| 4277 | for_each_pool(pool, pi) { | ||
| 4278 | struct worker *worker; | ||
| 4279 | bool first = true; | ||
| 4280 | |||
| 4281 | spin_lock_irqsave(&pool->lock, flags); | ||
| 4282 | if (pool->nr_workers == pool->nr_idle) | ||
| 4283 | goto next_pool; | ||
| 4284 | |||
| 4285 | pr_info("pool %d:", pool->id); | ||
| 4286 | pr_cont_pool_info(pool); | ||
| 4287 | pr_cont(" workers=%d", pool->nr_workers); | ||
| 4288 | if (pool->manager) | ||
| 4289 | pr_cont(" manager: %d", | ||
| 4290 | task_pid_nr(pool->manager->task)); | ||
| 4291 | list_for_each_entry(worker, &pool->idle_list, entry) { | ||
| 4292 | pr_cont(" %s%d", first ? "idle: " : "", | ||
| 4293 | task_pid_nr(worker->task)); | ||
| 4294 | first = false; | ||
| 4295 | } | ||
| 4296 | pr_cont("\n"); | ||
| 4297 | next_pool: | ||
| 4298 | spin_unlock_irqrestore(&pool->lock, flags); | ||
| 4299 | } | ||
| 4300 | |||
| 4301 | rcu_read_unlock_sched(); | ||
| 4302 | } | ||
| 4303 | |||
| 4440 | /* | 4304 | /* | 
| 4441 | * CPU hotplug. | 4305 | * CPU hotplug. | 
| 4442 | * | 4306 | * | 
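In the dump path above, pr_cont_work() recovers the enclosing wq_barrier from its embedded work_struct via container_of() so that flush barriers print as "BAR(<pid>)". The stand-alone sketch below shows that pattern with simplified local types and a local container_of definition (needed outside the kernel); it is an illustration of the technique, not the workqueue code itself.

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct work_struct {
	void (*func)(struct work_struct *work);
};

/* Simplified stand-in for struct wq_barrier: a work item plus the pid of
 * the task waiting on it, which the dump prints as "BAR(<pid>)". */
struct wq_barrier {
	struct work_struct work;
	int task_pid;
};

static void wq_barrier_func(struct work_struct *work)
{
	struct wq_barrier *barr = container_of(work, struct wq_barrier, work);

	printf("BAR(%d)\n", barr->task_pid);
}

int main(void)
{
	struct wq_barrier barr = {
		.work = { .func = wq_barrier_func },
		.task_pid = 1234,
	};

	/* The dump path only has the work_struct pointer... */
	struct work_struct *work = &barr.work;

	/* ...and climbs back to the containing wq_barrier. */
	work->func(work);
	return 0;
}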
| @@ -4834,6 +4698,323 @@ out_unlock: | |||
| 4834 | } | 4698 | } | 
| 4835 | #endif /* CONFIG_FREEZER */ | 4699 | #endif /* CONFIG_FREEZER */ | 
| 4836 | 4700 | ||
| 4701 | #ifdef CONFIG_SYSFS | ||
| 4702 | /* | ||
| 4703 | * Workqueues with WQ_SYSFS flag set is visible to userland via | ||
| 4704 | * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the | ||
| 4705 | * following attributes. | ||
| 4706 | * | ||
| 4707 | * per_cpu RO bool : whether the workqueue is per-cpu or unbound | ||
| 4708 | * max_active RW int : maximum number of in-flight work items | ||
| 4709 | * | ||
| 4710 | * Unbound workqueues have the following extra attributes. | ||
| 4711 | * | ||
| 4712 | * id RO int : the associated pool ID | ||
| 4713 | * nice RW int : nice value of the workers | ||
| 4714 | * cpumask RW mask : bitmask of allowed CPUs for the workers | ||
| 4715 | */ | ||
| 4716 | struct wq_device { | ||
| 4717 | struct workqueue_struct *wq; | ||
| 4718 | struct device dev; | ||
| 4719 | }; | ||
| 4720 | |||
| 4721 | static struct workqueue_struct *dev_to_wq(struct device *dev) | ||
| 4722 | { | ||
| 4723 | struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); | ||
| 4724 | |||
| 4725 | return wq_dev->wq; | ||
| 4726 | } | ||
| 4727 | |||
| 4728 | static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, | ||
| 4729 | char *buf) | ||
| 4730 | { | ||
| 4731 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 4732 | |||
| 4733 | return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); | ||
| 4734 | } | ||
| 4735 | static DEVICE_ATTR_RO(per_cpu); | ||
| 4736 | |||
| 4737 | static ssize_t max_active_show(struct device *dev, | ||
| 4738 | struct device_attribute *attr, char *buf) | ||
| 4739 | { | ||
| 4740 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 4741 | |||
| 4742 | return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); | ||
| 4743 | } | ||
| 4744 | |||
| 4745 | static ssize_t max_active_store(struct device *dev, | ||
| 4746 | struct device_attribute *attr, const char *buf, | ||
| 4747 | size_t count) | ||
| 4748 | { | ||
| 4749 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 4750 | int val; | ||
| 4751 | |||
| 4752 | if (sscanf(buf, "%d", &val) != 1 || val <= 0) | ||
| 4753 | return -EINVAL; | ||
| 4754 | |||
| 4755 | workqueue_set_max_active(wq, val); | ||
| 4756 | return count; | ||
| 4757 | } | ||
| 4758 | static DEVICE_ATTR_RW(max_active); | ||
| 4759 | |||
| 4760 | static struct attribute *wq_sysfs_attrs[] = { | ||
| 4761 | &dev_attr_per_cpu.attr, | ||
| 4762 | &dev_attr_max_active.attr, | ||
| 4763 | NULL, | ||
| 4764 | }; | ||
| 4765 | ATTRIBUTE_GROUPS(wq_sysfs); | ||
| 4766 | |||
| 4767 | static ssize_t wq_pool_ids_show(struct device *dev, | ||
| 4768 | struct device_attribute *attr, char *buf) | ||
| 4769 | { | ||
| 4770 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 4771 | const char *delim = ""; | ||
| 4772 | int node, written = 0; | ||
| 4773 | |||
| 4774 | rcu_read_lock_sched(); | ||
| 4775 | for_each_node(node) { | ||
| 4776 | written += scnprintf(buf + written, PAGE_SIZE - written, | ||
| 4777 | "%s%d:%d", delim, node, | ||
| 4778 | unbound_pwq_by_node(wq, node)->pool->id); | ||
| 4779 | delim = " "; | ||
| 4780 | } | ||
| 4781 | written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); | ||
| 4782 | rcu_read_unlock_sched(); | ||
| 4783 | |||
| 4784 | return written; | ||
| 4785 | } | ||
| 4786 | |||
| 4787 | static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, | ||
| 4788 | char *buf) | ||
| 4789 | { | ||
| 4790 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 4791 | int written; | ||
| 4792 | |||
| 4793 | mutex_lock(&wq->mutex); | ||
| 4794 | written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); | ||
| 4795 | mutex_unlock(&wq->mutex); | ||
| 4796 | |||
| 4797 | return written; | ||
| 4798 | } | ||
| 4799 | |||
| 4800 | /* prepare workqueue_attrs for sysfs store operations */ | ||
| 4801 | static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) | ||
| 4802 | { | ||
| 4803 | struct workqueue_attrs *attrs; | ||
| 4804 | |||
| 4805 | attrs = alloc_workqueue_attrs(GFP_KERNEL); | ||
| 4806 | if (!attrs) | ||
| 4807 | return NULL; | ||
| 4808 | |||
| 4809 | mutex_lock(&wq->mutex); | ||
| 4810 | copy_workqueue_attrs(attrs, wq->unbound_attrs); | ||
| 4811 | mutex_unlock(&wq->mutex); | ||
| 4812 | return attrs; | ||
| 4813 | } | ||
| 4814 | |||
| 4815 | static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, | ||
| 4816 | const char *buf, size_t count) | ||
| 4817 | { | ||
| 4818 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 4819 | struct workqueue_attrs *attrs; | ||
| 4820 | int ret; | ||
| 4821 | |||
| 4822 | attrs = wq_sysfs_prep_attrs(wq); | ||
| 4823 | if (!attrs) | ||
| 4824 | return -ENOMEM; | ||
| 4825 | |||
| 4826 | if (sscanf(buf, "%d", &attrs->nice) == 1 && | ||
| 4827 | attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) | ||
| 4828 | ret = apply_workqueue_attrs(wq, attrs); | ||
| 4829 | else | ||
| 4830 | ret = -EINVAL; | ||
| 4831 | |||
| 4832 | free_workqueue_attrs(attrs); | ||
| 4833 | return ret ?: count; | ||
| 4834 | } | ||
| 4835 | |||
| 4836 | static ssize_t wq_cpumask_show(struct device *dev, | ||
| 4837 | struct device_attribute *attr, char *buf) | ||
| 4838 | { | ||
| 4839 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 4840 | int written; | ||
| 4841 | |||
| 4842 | mutex_lock(&wq->mutex); | ||
| 4843 | written = scnprintf(buf, PAGE_SIZE, "%*pb\n", | ||
| 4844 | cpumask_pr_args(wq->unbound_attrs->cpumask)); | ||
| 4845 | mutex_unlock(&wq->mutex); | ||
| 4846 | return written; | ||
| 4847 | } | ||
| 4848 | |||
| 4849 | static ssize_t wq_cpumask_store(struct device *dev, | ||
| 4850 | struct device_attribute *attr, | ||
| 4851 | const char *buf, size_t count) | ||
| 4852 | { | ||
| 4853 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 4854 | struct workqueue_attrs *attrs; | ||
| 4855 | int ret; | ||
| 4856 | |||
| 4857 | attrs = wq_sysfs_prep_attrs(wq); | ||
| 4858 | if (!attrs) | ||
| 4859 | return -ENOMEM; | ||
| 4860 | |||
| 4861 | ret = cpumask_parse(buf, attrs->cpumask); | ||
| 4862 | if (!ret) | ||
| 4863 | ret = apply_workqueue_attrs(wq, attrs); | ||
| 4864 | |||
| 4865 | free_workqueue_attrs(attrs); | ||
| 4866 | return ret ?: count; | ||
| 4867 | } | ||
| 4868 | |||
| 4869 | static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, | ||
| 4870 | char *buf) | ||
| 4871 | { | ||
| 4872 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 4873 | int written; | ||
| 4874 | |||
| 4875 | mutex_lock(&wq->mutex); | ||
| 4876 | written = scnprintf(buf, PAGE_SIZE, "%d\n", | ||
| 4877 | !wq->unbound_attrs->no_numa); | ||
| 4878 | mutex_unlock(&wq->mutex); | ||
| 4879 | |||
| 4880 | return written; | ||
| 4881 | } | ||
| 4882 | |||
| 4883 | static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, | ||
| 4884 | const char *buf, size_t count) | ||
| 4885 | { | ||
| 4886 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 4887 | struct workqueue_attrs *attrs; | ||
| 4888 | int v, ret; | ||
| 4889 | |||
| 4890 | attrs = wq_sysfs_prep_attrs(wq); | ||
| 4891 | if (!attrs) | ||
| 4892 | return -ENOMEM; | ||
| 4893 | |||
| 4894 | ret = -EINVAL; | ||
| 4895 | if (sscanf(buf, "%d", &v) == 1) { | ||
| 4896 | attrs->no_numa = !v; | ||
| 4897 | ret = apply_workqueue_attrs(wq, attrs); | ||
| 4898 | } | ||
| 4899 | |||
| 4900 | free_workqueue_attrs(attrs); | ||
| 4901 | return ret ?: count; | ||
| 4902 | } | ||
| 4903 | |||
| 4904 | static struct device_attribute wq_sysfs_unbound_attrs[] = { | ||
| 4905 | __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), | ||
| 4906 | __ATTR(nice, 0644, wq_nice_show, wq_nice_store), | ||
| 4907 | __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), | ||
| 4908 | __ATTR(numa, 0644, wq_numa_show, wq_numa_store), | ||
| 4909 | __ATTR_NULL, | ||
| 4910 | }; | ||
| 4911 | |||
| 4912 | static struct bus_type wq_subsys = { | ||
| 4913 | .name = "workqueue", | ||
| 4914 | .dev_groups = wq_sysfs_groups, | ||
| 4915 | }; | ||
| 4916 | |||
| 4917 | static int __init wq_sysfs_init(void) | ||
| 4918 | { | ||
| 4919 | return subsys_virtual_register(&wq_subsys, NULL); | ||
| 4920 | } | ||
| 4921 | core_initcall(wq_sysfs_init); | ||
| 4922 | |||
| 4923 | static void wq_device_release(struct device *dev) | ||
| 4924 | { | ||
| 4925 | struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); | ||
| 4926 | |||
| 4927 | kfree(wq_dev); | ||
| 4928 | } | ||
| 4929 | |||
| 4930 | /** | ||
| 4931 | * workqueue_sysfs_register - make a workqueue visible in sysfs | ||
| 4932 | * @wq: the workqueue to register | ||
| 4933 | * | ||
| 4934 | * Expose @wq in sysfs under /sys/bus/workqueue/devices. | ||
| 4935 | * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set | ||
| 4936 | * which is the preferred method. | ||
| 4937 | * | ||
| 4938 | * Workqueue user should use this function directly iff it wants to apply | ||
| 4939 | * workqueue_attrs before making the workqueue visible in sysfs; otherwise, | ||
| 4940 | * apply_workqueue_attrs() may race against userland updating the | ||
| 4941 | * attributes. | ||
| 4942 | * | ||
| 4943 | * Return: 0 on success, -errno on failure. | ||
| 4944 | */ | ||
| 4945 | int workqueue_sysfs_register(struct workqueue_struct *wq) | ||
| 4946 | { | ||
| 4947 | struct wq_device *wq_dev; | ||
| 4948 | int ret; | ||
| 4949 | |||
| 4950 | /* | ||
| 4951 | * Adjusting max_active or creating new pwqs by applyting | ||
| 4952 | * attributes breaks ordering guarantee. Disallow exposing ordered | ||
| 4953 | * workqueues. | ||
| 4954 | */ | ||
| 4955 | if (WARN_ON(wq->flags & __WQ_ORDERED)) | ||
| 4956 | return -EINVAL; | ||
| 4957 | |||
| 4958 | wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); | ||
| 4959 | if (!wq_dev) | ||
| 4960 | return -ENOMEM; | ||
| 4961 | |||
| 4962 | wq_dev->wq = wq; | ||
| 4963 | wq_dev->dev.bus = &wq_subsys; | ||
| 4964 | wq_dev->dev.init_name = wq->name; | ||
| 4965 | wq_dev->dev.release = wq_device_release; | ||
| 4966 | |||
| 4967 | /* | ||
| 4968 | * unbound_attrs are created separately. Suppress uevent until | ||
| 4969 | * everything is ready. | ||
| 4970 | */ | ||
| 4971 | dev_set_uevent_suppress(&wq_dev->dev, true); | ||
| 4972 | |||
| 4973 | ret = device_register(&wq_dev->dev); | ||
| 4974 | if (ret) { | ||
| 4975 | kfree(wq_dev); | ||
| 4976 | wq->wq_dev = NULL; | ||
| 4977 | return ret; | ||
| 4978 | } | ||
| 4979 | |||
| 4980 | if (wq->flags & WQ_UNBOUND) { | ||
| 4981 | struct device_attribute *attr; | ||
| 4982 | |||
| 4983 | for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { | ||
| 4984 | ret = device_create_file(&wq_dev->dev, attr); | ||
| 4985 | if (ret) { | ||
| 4986 | device_unregister(&wq_dev->dev); | ||
| 4987 | wq->wq_dev = NULL; | ||
| 4988 | return ret; | ||
| 4989 | } | ||
| 4990 | } | ||
| 4991 | } | ||
| 4992 | |||
| 4993 | dev_set_uevent_suppress(&wq_dev->dev, false); | ||
| 4994 | kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); | ||
| 4995 | return 0; | ||
| 4996 | } | ||
| 4997 | |||
| 4998 | /** | ||
| 4999 | * workqueue_sysfs_unregister - undo workqueue_sysfs_register() | ||
| 5000 | * @wq: the workqueue to unregister | ||
| 5001 | * | ||
| 5002 | * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. | ||
| 5003 | */ | ||
| 5004 | static void workqueue_sysfs_unregister(struct workqueue_struct *wq) | ||
| 5005 | { | ||
| 5006 | struct wq_device *wq_dev = wq->wq_dev; | ||
| 5007 | |||
| 5008 | if (!wq->wq_dev) | ||
| 5009 | return; | ||
| 5010 | |||
| 5011 | wq->wq_dev = NULL; | ||
| 5012 | device_unregister(&wq_dev->dev); | ||
| 5013 | } | ||
| 5014 | #else /* CONFIG_SYSFS */ | ||
| 5015 | static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } | ||
| 5016 | #endif /* CONFIG_SYSFS */ | ||
| 5017 | |||
| 4837 | static void __init wq_numa_init(void) | 5018 | static void __init wq_numa_init(void) | 
| 4838 | { | 5019 | { | 
| 4839 | cpumask_var_t *tbl; | 5020 | cpumask_var_t *tbl; | 
