path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit.c | 17
-rw-r--r--  kernel/audit.h | 4
-rw-r--r--  kernel/auditsc.c | 8
-rw-r--r--  kernel/bpf/arraymap.c | 163
-rw-r--r--  kernel/bpf/core.c | 9
-rw-r--r--  kernel/bpf/helpers.c | 2
-rw-r--r--  kernel/bpf/inode.c | 4
-rw-r--r--  kernel/bpf/stackmap.c | 2
-rw-r--r--  kernel/bpf/syscall.c | 66
-rw-r--r--  kernel/bpf/verifier.c | 67
-rw-r--r--  kernel/cgroup.c | 214
-rw-r--r--  kernel/cgroup_pids.c | 34
-rw-r--r--  kernel/cpu.c | 68
-rw-r--r--  kernel/cpuset.c | 9
-rw-r--r--  kernel/cred.c | 2
-rw-r--r--  kernel/events/callchain.c | 14
-rw-r--r--  kernel/events/core.c | 309
-rw-r--r--  kernel/events/internal.h | 25
-rw-r--r--  kernel/exit.c | 84
-rw-r--r--  kernel/fork.c | 70
-rw-r--r--  kernel/freezer.c | 2
-rw-r--r--  kernel/futex.c | 14
-rw-r--r--  kernel/gcov/gcc_4_7.c | 2
-rw-r--r--  kernel/irq/Makefile | 1
-rw-r--r--  kernel/irq/affinity.c | 61
-rw-r--r--  kernel/irq/chip.c | 83
-rw-r--r--  kernel/irq/handle.c | 18
-rw-r--r--  kernel/irq/internals.h | 4
-rw-r--r--  kernel/irq/ipi.c | 4
-rw-r--r--  kernel/irq/irqdesc.c | 63
-rw-r--r--  kernel/irq/irqdomain.c | 94
-rw-r--r--  kernel/irq/manage.c | 73
-rw-r--r--  kernel/irq/msi.c | 12
-rw-r--r--  kernel/irq/proc.c | 11
-rw-r--r--  kernel/jump_label.c | 38
-rw-r--r--  kernel/kcov.c | 7
-rw-r--r--  kernel/locking/lockdep.c | 13
-rw-r--r--  kernel/locking/mutex-debug.c | 12
-rw-r--r--  kernel/locking/mutex-debug.h | 8
-rw-r--r--  kernel/locking/mutex.c | 15
-rw-r--r--  kernel/locking/mutex.h | 12
-rw-r--r--  kernel/locking/qrwlock.c | 2
-rw-r--r--  kernel/locking/qspinlock.c | 146
-rw-r--r--  kernel/locking/qspinlock_paravirt.h | 4
-rw-r--r--  kernel/locking/rtmutex.c | 2
-rw-r--r--  kernel/locking/rwsem-xadd.c | 194
-rw-r--r--  kernel/locking/rwsem.c | 8
-rw-r--r--  kernel/locking/rwsem.h | 52
-rw-r--r--  kernel/memremap.c | 14
-rw-r--r--  kernel/power/Makefile | 2
-rw-r--r--  kernel/power/console.c | 8
-rw-r--r--  kernel/power/hibernate.c | 107
-rw-r--r--  kernel/power/main.c | 11
-rw-r--r--  kernel/power/power.h | 11
-rw-r--r--  kernel/power/process.c | 15
-rw-r--r--  kernel/power/snapshot.c | 950
-rw-r--r--  kernel/power/suspend.c | 10
-rw-r--r--  kernel/power/swap.c | 39
-rw-r--r--  kernel/power/user.c | 14
-rw-r--r--  kernel/printk/printk.c | 5
-rw-r--r--  kernel/profile.c | 181
-rw-r--r--  kernel/rcu/rcuperf.c | 25
-rw-r--r--  kernel/rcu/rcutorture.c | 9
-rw-r--r--  kernel/rcu/tree.c | 691
-rw-r--r--  kernel/rcu/tree.h | 15
-rw-r--r--  kernel/rcu/tree_exp.h | 655
-rw-r--r--  kernel/rcu/tree_plugin.h | 95
-rw-r--r--  kernel/rcu/update.c | 7
-rw-r--r--  kernel/relay.c | 1
-rw-r--r--  kernel/sched/core.c | 170
-rw-r--r--  kernel/sched/cpuacct.c | 114
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 74
-rw-r--r--  kernel/sched/cputime.c | 181
-rw-r--r--  kernel/sched/debug.c | 17
-rw-r--r--  kernel/sched/fair.c | 319
-rw-r--r--  kernel/sched/idle.c | 6
-rw-r--r--  kernel/sched/loadavg.c | 8
-rw-r--r--  kernel/sched/sched.h | 23
-rw-r--r--  kernel/sched/stats.h | 3
-rw-r--r--  kernel/signal.c | 24
-rw-r--r--  kernel/smp.c | 81
-rw-r--r--  kernel/sysctl.c | 15
-rw-r--r--  kernel/task_work.c | 1
-rw-r--r--  kernel/time/alarmtimer.c | 1
-rw-r--r--  kernel/time/clockevents.c | 2
-rw-r--r--  kernel/time/clocksource.c | 8
-rw-r--r--  kernel/time/hrtimer.c | 42
-rw-r--r--  kernel/time/posix-cpu-timers.c | 1
-rw-r--r--  kernel/time/test_udelay.c | 16
-rw-r--r--  kernel/time/tick-broadcast-hrtimer.c | 1
-rw-r--r--  kernel/time/tick-internal.h | 1
-rw-r--r--  kernel/time/tick-sched.c | 98
-rw-r--r--  kernel/time/timeconv.c | 11
-rw-r--r--  kernel/time/timekeeping.c | 11
-rw-r--r--  kernel/time/timer.c | 1132
-rw-r--r--  kernel/time/timer_stats.c | 6
-rw-r--r--  kernel/torture.c | 176
-rw-r--r--  kernel/trace/Kconfig | 1
-rw-r--r--  kernel/trace/blktrace.c | 83
-rw-r--r--  kernel/trace/bpf_trace.c | 166
-rw-r--r--  kernel/trace/ftrace.c | 313
-rw-r--r--  kernel/trace/trace.c | 358
-rw-r--r--  kernel/trace/trace.h | 48
-rw-r--r--  kernel/trace/trace_entries.h | 4
-rw-r--r--  kernel/trace/trace_events.c | 219
-rw-r--r--  kernel/trace/trace_functions.c | 2
-rw-r--r--  kernel/trace/trace_functions_graph.c | 19
-rw-r--r--  kernel/trace/trace_kprobe.c | 1
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 10
-rw-r--r--  kernel/trace/trace_printk.c | 7
-rw-r--r--  kernel/trace/trace_probe.c | 33
-rw-r--r--  kernel/trace/trace_probe.h | 10
-rw-r--r--  kernel/user_namespace.c | 14
-rw-r--r--  kernel/workqueue.c | 118
114 files changed, 5349 insertions, 3590 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 22bb4f24f071..8d528f9930da 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1883,6 +1883,23 @@ out_null:
 	audit_log_format(ab, " exe=(null)");
 }
 
+struct tty_struct *audit_get_tty(struct task_struct *tsk)
+{
+	struct tty_struct *tty = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&tsk->sighand->siglock, flags);
+	if (tsk->signal)
+		tty = tty_kref_get(tsk->signal->tty);
+	spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
+	return tty;
+}
+
+void audit_put_tty(struct tty_struct *tty)
+{
+	tty_kref_put(tty);
+}
+
 void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
 {
 	const struct cred *cred;
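The new audit_get_tty()/audit_put_tty() pair centralizes the kref-safe tty lookup that used to be open-coded in auditsc.c. A minimal sketch of the intended get/put pairing (hypothetical caller, for illustration only; tty_name() and the NULL-tolerant tty_kref_put() are existing tty-layer helpers):

/* Hypothetical caller: take the tty kref, log its name, drop the kref. */
static void example_log_tty(struct audit_buffer *ab, struct task_struct *tsk)
{
        struct tty_struct *tty = audit_get_tty(tsk);    /* kref or NULL */

        audit_log_format(ab, " tty=%s", tty ? tty_name(tty) : "(none)");
        audit_put_tty(tty);     /* tty_kref_put() accepts NULL, no check needed */
}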
diff --git a/kernel/audit.h b/kernel/audit.h
index cbbe6bb6496e..a492f4c4e710 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -23,6 +23,7 @@
 #include <linux/audit.h>
 #include <linux/skbuff.h>
 #include <uapi/linux/mqueue.h>
+#include <linux/tty.h>
 
 /* AUDIT_NAMES is the number of slots we reserve in the audit_context
  * for saving names from getname(). If we get more names we will allocate
@@ -262,6 +263,9 @@ extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
 extern void audit_log_d_path_exe(struct audit_buffer *ab,
 				 struct mm_struct *mm);
 
+extern struct tty_struct *audit_get_tty(struct task_struct *tsk);
+extern void audit_put_tty(struct tty_struct *tty);
+
 /* audit watch functions */
 #ifdef CONFIG_AUDIT_WATCH
 extern void audit_put_watch(struct audit_watch *watch);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 62ab53d7619c..2672d105cffc 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -63,7 +63,6 @@
 #include <asm/unistd.h>
 #include <linux/security.h>
 #include <linux/list.h>
-#include <linux/tty.h>
 #include <linux/binfmts.h>
 #include <linux/highmem.h>
 #include <linux/syscalls.h>
@@ -1985,14 +1984,15 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
 	if (!audit_enabled)
 		return;
 
+	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
+	if (!ab)
+		return;
+
 	uid = from_kuid(&init_user_ns, task_uid(current));
 	oldloginuid = from_kuid(&init_user_ns, koldloginuid);
 	loginuid = from_kuid(&init_user_ns, kloginuid),
 	tty = audit_get_tty(current);
 
-	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
-	if (!ab)
-		return;
 	audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid);
 	audit_log_task_context(ab);
 	audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d",
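The reordering in audit_log_set_loginuid() exists so that the early return on a failed audit_log_start() happens before audit_get_tty() takes its tty reference. Condensed, the fixed flow looks like the following (the matching audit_put_tty() sits later in the function, outside this hunk):

        ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
        if (!ab)
                return;                 /* nothing acquired yet, nothing to drop */

        tty = audit_get_tty(current);   /* take the tty kref only once logging is certain */
        /* ... format and emit the record ... */
        audit_put_tty(tty);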
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 76d5a794e426..633a650d7aeb 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -328,8 +328,8 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
 }
 
 /* only called from syscall */
-static int fd_array_map_update_elem(struct bpf_map *map, void *key,
-				    void *value, u64 map_flags)
+int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
+				 void *key, void *value, u64 map_flags)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	void *new_ptr, *old_ptr;
@@ -342,7 +342,7 @@ static int fd_array_map_update_elem(struct bpf_map *map, void *key,
 		return -E2BIG;
 
 	ufd = *(u32 *)value;
-	new_ptr = map->ops->map_fd_get_ptr(map, ufd);
+	new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
 	if (IS_ERR(new_ptr))
 		return PTR_ERR(new_ptr);
 
@@ -371,10 +371,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
 	}
 }
 
-static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
+static void *prog_fd_array_get_ptr(struct bpf_map *map,
+				   struct file *map_file, int fd)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	struct bpf_prog *prog = bpf_prog_get(fd);
+
 	if (IS_ERR(prog))
 		return prog;
 
@@ -382,14 +384,13 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
 		bpf_prog_put(prog);
 		return ERR_PTR(-EINVAL);
 	}
+
 	return prog;
 }
 
 static void prog_fd_array_put_ptr(void *ptr)
 {
-	struct bpf_prog *prog = ptr;
-
-	bpf_prog_put_rcu(prog);
+	bpf_prog_put(ptr);
 }
 
 /* decrement refcnt of all bpf_progs that are stored in this map */
@@ -407,7 +408,6 @@ static const struct bpf_map_ops prog_array_ops = {
 	.map_free = fd_array_map_free,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
-	.map_update_elem = fd_array_map_update_elem,
 	.map_delete_elem = fd_array_map_delete_elem,
 	.map_fd_get_ptr = prog_fd_array_get_ptr,
 	.map_fd_put_ptr = prog_fd_array_put_ptr,
@@ -425,59 +425,105 @@ static int __init register_prog_array_map(void)
 }
 late_initcall(register_prog_array_map);
 
-static void perf_event_array_map_free(struct bpf_map *map)
+static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
+						   struct file *map_file)
 {
-	bpf_fd_array_map_clear(map);
-	fd_array_map_free(map);
+	struct bpf_event_entry *ee;
+
+	ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
+	if (ee) {
+		ee->event = perf_file->private_data;
+		ee->perf_file = perf_file;
+		ee->map_file = map_file;
+	}
+
+	return ee;
 }
 
-static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
+static void __bpf_event_entry_free(struct rcu_head *rcu)
 {
-	struct perf_event *event;
-	const struct perf_event_attr *attr;
-	struct file *file;
+	struct bpf_event_entry *ee;
 
-	file = perf_event_get(fd);
-	if (IS_ERR(file))
-		return file;
+	ee = container_of(rcu, struct bpf_event_entry, rcu);
+	fput(ee->perf_file);
+	kfree(ee);
+}
 
-	event = file->private_data;
+static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
+{
+	call_rcu(&ee->rcu, __bpf_event_entry_free);
+}
 
-	attr = perf_event_attrs(event);
-	if (IS_ERR(attr))
-		goto err;
+static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
+					 struct file *map_file, int fd)
+{
+	const struct perf_event_attr *attr;
+	struct bpf_event_entry *ee;
+	struct perf_event *event;
+	struct file *perf_file;
 
-	if (attr->inherit)
-		goto err;
+	perf_file = perf_event_get(fd);
+	if (IS_ERR(perf_file))
+		return perf_file;
 
-	if (attr->type == PERF_TYPE_RAW)
-		return file;
+	event = perf_file->private_data;
+	ee = ERR_PTR(-EINVAL);
 
-	if (attr->type == PERF_TYPE_HARDWARE)
-		return file;
+	attr = perf_event_attrs(event);
+	if (IS_ERR(attr) || attr->inherit)
+		goto err_out;
+
+	switch (attr->type) {
+	case PERF_TYPE_SOFTWARE:
+		if (attr->config != PERF_COUNT_SW_BPF_OUTPUT)
+			goto err_out;
+		/* fall-through */
+	case PERF_TYPE_RAW:
+	case PERF_TYPE_HARDWARE:
+		ee = bpf_event_entry_gen(perf_file, map_file);
+		if (ee)
+			return ee;
+		ee = ERR_PTR(-ENOMEM);
+		/* fall-through */
+	default:
+		break;
+	}
 
-	if (attr->type == PERF_TYPE_SOFTWARE &&
-	    attr->config == PERF_COUNT_SW_BPF_OUTPUT)
-		return file;
-err:
-	fput(file);
-	return ERR_PTR(-EINVAL);
+err_out:
+	fput(perf_file);
+	return ee;
 }
 
 static void perf_event_fd_array_put_ptr(void *ptr)
 {
-	fput((struct file *)ptr);
+	bpf_event_entry_free_rcu(ptr);
+}
+
+static void perf_event_fd_array_release(struct bpf_map *map,
+					struct file *map_file)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct bpf_event_entry *ee;
+	int i;
+
+	rcu_read_lock();
+	for (i = 0; i < array->map.max_entries; i++) {
+		ee = READ_ONCE(array->ptrs[i]);
+		if (ee && ee->map_file == map_file)
+			fd_array_map_delete_elem(map, &i);
+	}
+	rcu_read_unlock();
 }
 
 static const struct bpf_map_ops perf_event_array_ops = {
 	.map_alloc = fd_array_map_alloc,
-	.map_free = perf_event_array_map_free,
+	.map_free = fd_array_map_free,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
-	.map_update_elem = fd_array_map_update_elem,
 	.map_delete_elem = fd_array_map_delete_elem,
 	.map_fd_get_ptr = perf_event_fd_array_get_ptr,
 	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
+	.map_release = perf_event_fd_array_release,
 };
 
 static struct bpf_map_type_list perf_event_array_type __read_mostly = {
@@ -491,3 +537,46 @@ static int __init register_perf_event_array_map(void)
 	return 0;
 }
 late_initcall(register_perf_event_array_map);
+
+#ifdef CONFIG_SOCK_CGROUP_DATA
+static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
+				     struct file *map_file /* not used */,
+				     int fd)
+{
+	return cgroup_get_from_fd(fd);
+}
+
+static void cgroup_fd_array_put_ptr(void *ptr)
+{
+	/* cgroup_put free cgrp after a rcu grace period */
+	cgroup_put(ptr);
+}
+
+static void cgroup_fd_array_free(struct bpf_map *map)
+{
+	bpf_fd_array_map_clear(map);
+	fd_array_map_free(map);
+}
+
+static const struct bpf_map_ops cgroup_array_ops = {
+	.map_alloc = fd_array_map_alloc,
+	.map_free = cgroup_fd_array_free,
+	.map_get_next_key = array_map_get_next_key,
+	.map_lookup_elem = fd_array_map_lookup_elem,
+	.map_delete_elem = fd_array_map_delete_elem,
+	.map_fd_get_ptr = cgroup_fd_array_get_ptr,
+	.map_fd_put_ptr = cgroup_fd_array_put_ptr,
+};
+
+static struct bpf_map_type_list cgroup_array_type __read_mostly = {
+	.ops = &cgroup_array_ops,
+	.type = BPF_MAP_TYPE_CGROUP_ARRAY,
+};
+
+static int __init register_cgroup_array_map(void)
+{
+	bpf_register_map_type(&cgroup_array_type);
+	return 0;
+}
+late_initcall(register_cgroup_array_map);
+#endif
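The bpf_event_entry type used throughout the perf-event-array rework above is declared in include/linux/bpf.h, which is outside this kernel/-only diffstat. As a reading aid (not part of this diff), it is roughly:

struct bpf_event_entry {
        struct perf_event *event;       /* perf_file->private_data, cached for fast access */
        struct file *perf_file;         /* holds the reference dropped in __bpf_event_entry_free() */
        struct file *map_file;          /* records which map fd installed the entry */
        struct rcu_head rcu;            /* defers the fput() past RCU readers of array->ptrs[] */
};

Recording map_file is what allows perf_event_fd_array_release() to flush only the entries installed through the map fd that is being closed.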
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b94a36550591..03fd23d4d587 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -719,14 +719,13 @@ select_insn:
 
 		if (unlikely(index >= array->map.max_entries))
 			goto out;
-
 		if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
 			goto out;
 
 		tail_call_cnt++;
 
 		prog = READ_ONCE(array->ptrs[index]);
-		if (unlikely(!prog))
+		if (!prog)
 			goto out;
 
 		/* ARG1 at this point is guaranteed to point to CTX from
@@ -1055,9 +1054,11 @@ const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
 	return NULL;
 }
 
-const struct bpf_func_proto * __weak bpf_get_event_output_proto(void)
+u64 __weak
+bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
+		 void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
 {
-	return NULL;
+	return -ENOTSUPP;
 }
 
 /* Always built-in helper functions. */
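bpf_event_output() replaces the old bpf_get_event_output_proto() hook, and the __weak stub keeps the BPF core buildable when the tracing side that supplies the real implementation (kernel/trace/bpf_trace.c, also touched in this series) is not compiled in. A standalone sketch of the weak-symbol override pattern being relied on here (plain C, not kernel code):

/* fallback.c: weak default, used only when no strong definition is linked in */
long __attribute__((weak)) backend_op(void)
{
        return -1;      /* "operation not supported" */
}

/* backend.c: strong definition; if this object is linked, it replaces the weak one */
long backend_op(void)
{
        return 42;
}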
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index ad7a0573f71b..1ea3afba1a4f 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -101,7 +101,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = {
 
 static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 {
-	return raw_smp_processor_id();
+	return smp_processor_id();
 }
 
 const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
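The switch to smp_processor_id() is safe because BPF programs execute with preemption disabled; with CONFIG_DEBUG_PREEMPT the checked variant will now also warn if that assumption is ever violated. Nothing changes on the program side; a hypothetical, libbpf-style user of the helper might look like:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("socket")
int count_cpu(struct __sk_buff *skb)
{
        __u32 cpu = bpf_get_smp_processor_id();   /* backed by the kernel helper above */

        bpf_printk("packet seen on cpu %u", cpu);
        return 0;
}

char _license[] SEC("license") = "GPL";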
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 318858edb1cd..5967b870a895 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -11,7 +11,7 @@
  * version 2 as published by the Free Software Foundation.
  */
 
-#include <linux/module.h>
+#include <linux/init.h>
 #include <linux/magic.h>
 #include <linux/major.h>
 #include <linux/mount.h>
@@ -367,8 +367,6 @@ static struct file_system_type bpf_fs_type = {
 	.kill_sb = kill_litter_super,
 };
 
-MODULE_ALIAS_FS("bpf");
-
 static int __init bpf_init(void)
 {
 	int ret;
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 080a2dfb5800..bf4495fcd25d 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -99,7 +99,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	if (err)
 		goto free_smap;
 
-	err = get_callchain_buffers();
+	err = get_callchain_buffers(sysctl_perf_event_max_stack);
 	if (err)
 		goto free_smap;
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 46ecce4b79ed..228f962447a5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -124,7 +124,12 @@ void bpf_map_put_with_uref(struct bpf_map *map)
124 124
125static int bpf_map_release(struct inode *inode, struct file *filp) 125static int bpf_map_release(struct inode *inode, struct file *filp)
126{ 126{
127 bpf_map_put_with_uref(filp->private_data); 127 struct bpf_map *map = filp->private_data;
128
129 if (map->ops->map_release)
130 map->ops->map_release(map, filp);
131
132 bpf_map_put_with_uref(map);
128 return 0; 133 return 0;
129} 134}
130 135
@@ -387,6 +392,13 @@ static int map_update_elem(union bpf_attr *attr)
387 err = bpf_percpu_hash_update(map, key, value, attr->flags); 392 err = bpf_percpu_hash_update(map, key, value, attr->flags);
388 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { 393 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
389 err = bpf_percpu_array_update(map, key, value, attr->flags); 394 err = bpf_percpu_array_update(map, key, value, attr->flags);
395 } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY ||
396 map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
397 map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) {
398 rcu_read_lock();
399 err = bpf_fd_array_map_update_elem(map, f.file, key, value,
400 attr->flags);
401 rcu_read_unlock();
390 } else { 402 } else {
391 rcu_read_lock(); 403 rcu_read_lock();
392 err = map->ops->map_update_elem(map, key, value, attr->flags); 404 err = map->ops->map_update_elem(map, key, value, attr->flags);
@@ -612,7 +624,7 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
612 free_uid(user); 624 free_uid(user);
613} 625}
614 626
615static void __prog_put_common(struct rcu_head *rcu) 627static void __bpf_prog_put_rcu(struct rcu_head *rcu)
616{ 628{
617 struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); 629 struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
618 630
@@ -621,17 +633,10 @@ static void __prog_put_common(struct rcu_head *rcu)
621 bpf_prog_free(aux->prog); 633 bpf_prog_free(aux->prog);
622} 634}
623 635
624/* version of bpf_prog_put() that is called after a grace period */
625void bpf_prog_put_rcu(struct bpf_prog *prog)
626{
627 if (atomic_dec_and_test(&prog->aux->refcnt))
628 call_rcu(&prog->aux->rcu, __prog_put_common);
629}
630
631void bpf_prog_put(struct bpf_prog *prog) 636void bpf_prog_put(struct bpf_prog *prog)
632{ 637{
633 if (atomic_dec_and_test(&prog->aux->refcnt)) 638 if (atomic_dec_and_test(&prog->aux->refcnt))
634 __prog_put_common(&prog->aux->rcu); 639 call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
635} 640}
636EXPORT_SYMBOL_GPL(bpf_prog_put); 641EXPORT_SYMBOL_GPL(bpf_prog_put);
637 642
@@ -639,7 +644,7 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
639{ 644{
640 struct bpf_prog *prog = filp->private_data; 645 struct bpf_prog *prog = filp->private_data;
641 646
642 bpf_prog_put_rcu(prog); 647 bpf_prog_put(prog);
643 return 0; 648 return 0;
644} 649}
645 650
@@ -653,7 +658,7 @@ int bpf_prog_new_fd(struct bpf_prog *prog)
653 O_RDWR | O_CLOEXEC); 658 O_RDWR | O_CLOEXEC);
654} 659}
655 660
656static struct bpf_prog *__bpf_prog_get(struct fd f) 661static struct bpf_prog *____bpf_prog_get(struct fd f)
657{ 662{
658 if (!f.file) 663 if (!f.file)
659 return ERR_PTR(-EBADF); 664 return ERR_PTR(-EBADF);
@@ -665,33 +670,50 @@ static struct bpf_prog *__bpf_prog_get(struct fd f)
665 return f.file->private_data; 670 return f.file->private_data;
666} 671}
667 672
668struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) 673struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
669{ 674{
670 if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) { 675 if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) {
671 atomic_dec(&prog->aux->refcnt); 676 atomic_sub(i, &prog->aux->refcnt);
672 return ERR_PTR(-EBUSY); 677 return ERR_PTR(-EBUSY);
673 } 678 }
674 return prog; 679 return prog;
675} 680}
681EXPORT_SYMBOL_GPL(bpf_prog_add);
676 682
677/* called by sockets/tracing/seccomp before attaching program to an event 683struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
678 * pairs with bpf_prog_put() 684{
679 */ 685 return bpf_prog_add(prog, 1);
680struct bpf_prog *bpf_prog_get(u32 ufd) 686}
687
688static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
681{ 689{
682 struct fd f = fdget(ufd); 690 struct fd f = fdget(ufd);
683 struct bpf_prog *prog; 691 struct bpf_prog *prog;
684 692
685 prog = __bpf_prog_get(f); 693 prog = ____bpf_prog_get(f);
686 if (IS_ERR(prog)) 694 if (IS_ERR(prog))
687 return prog; 695 return prog;
696 if (type && prog->type != *type) {
697 prog = ERR_PTR(-EINVAL);
698 goto out;
699 }
688 700
689 prog = bpf_prog_inc(prog); 701 prog = bpf_prog_inc(prog);
702out:
690 fdput(f); 703 fdput(f);
691
692 return prog; 704 return prog;
693} 705}
694EXPORT_SYMBOL_GPL(bpf_prog_get); 706
707struct bpf_prog *bpf_prog_get(u32 ufd)
708{
709 return __bpf_prog_get(ufd, NULL);
710}
711
712struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
713{
714 return __bpf_prog_get(ufd, &type);
715}
716EXPORT_SYMBOL_GPL(bpf_prog_get_type);
695 717
696/* last field in 'union bpf_attr' used by this command */ 718/* last field in 'union bpf_attr' used by this command */
697#define BPF_PROG_LOAD_LAST_FIELD kern_version 719#define BPF_PROG_LOAD_LAST_FIELD kern_version
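bpf_prog_get_type() folds the "fetch the fd, then check the program type" pattern into one call, and bpf_prog_add() lets a caller take several references at once. A hypothetical consumer of the new helpers (for example, a driver attaching one program across several hardware queues; not part of this patch) might look like:

/* Hypothetical consumer, illustrating the intended reference handling. */
static struct bpf_prog *example_get_prog(u32 ufd, int nr_queues)
{
        struct bpf_prog *prog;

        /* rejects programs of the wrong type with -EINVAL */
        prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_XDP);
        if (IS_ERR(prog))
                return prog;

        /* one reference is already held; take nr_queues - 1 more, one per queue */
        if (nr_queues > 1 && IS_ERR(bpf_prog_add(prog, nr_queues - 1))) {
                bpf_prog_put(prog);     /* drop the reference from bpf_prog_get_type() */
                return ERR_PTR(-EBUSY);
        }
        return prog;
}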
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 668e07903c8f..f72f23b8fdab 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -126,31 +126,6 @@
126 * are set to NOT_INIT to indicate that they are no longer readable. 126 * are set to NOT_INIT to indicate that they are no longer readable.
127 */ 127 */
128 128
129/* types of values stored in eBPF registers */
130enum bpf_reg_type {
131 NOT_INIT = 0, /* nothing was written into register */
132 UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */
133 PTR_TO_CTX, /* reg points to bpf_context */
134 CONST_PTR_TO_MAP, /* reg points to struct bpf_map */
135 PTR_TO_MAP_VALUE, /* reg points to map element value */
136 PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
137 FRAME_PTR, /* reg == frame_pointer */
138 PTR_TO_STACK, /* reg == frame_pointer + imm */
139 CONST_IMM, /* constant integer value */
140
141 /* PTR_TO_PACKET represents:
142 * skb->data
143 * skb->data + imm
144 * skb->data + (u16) var
145 * skb->data + (u16) var + imm
146 * if (range > 0) then [ptr, ptr + range - off) is safe to access
147 * if (id > 0) means that some 'var' was added
148 * if (off > 0) menas that 'imm' was added
149 */
150 PTR_TO_PACKET,
151 PTR_TO_PACKET_END, /* skb->data + headlen */
152};
153
154struct reg_state { 129struct reg_state {
155 enum bpf_reg_type type; 130 enum bpf_reg_type type;
156 union { 131 union {
@@ -678,6 +653,16 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off,
678 653
679#define MAX_PACKET_OFF 0xffff 654#define MAX_PACKET_OFF 0xffff
680 655
656static bool may_write_pkt_data(enum bpf_prog_type type)
657{
658 switch (type) {
659 case BPF_PROG_TYPE_XDP:
660 return true;
661 default:
662 return false;
663 }
664}
665
681static int check_packet_access(struct verifier_env *env, u32 regno, int off, 666static int check_packet_access(struct verifier_env *env, u32 regno, int off,
682 int size) 667 int size)
683{ 668{
@@ -695,10 +680,10 @@ static int check_packet_access(struct verifier_env *env, u32 regno, int off,
695 680
696/* check access to 'struct bpf_context' fields */ 681/* check access to 'struct bpf_context' fields */
697static int check_ctx_access(struct verifier_env *env, int off, int size, 682static int check_ctx_access(struct verifier_env *env, int off, int size,
698 enum bpf_access_type t) 683 enum bpf_access_type t, enum bpf_reg_type *reg_type)
699{ 684{
700 if (env->prog->aux->ops->is_valid_access && 685 if (env->prog->aux->ops->is_valid_access &&
701 env->prog->aux->ops->is_valid_access(off, size, t)) { 686 env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) {
702 /* remember the offset of last byte accessed in ctx */ 687 /* remember the offset of last byte accessed in ctx */
703 if (env->prog->aux->max_ctx_offset < off + size) 688 if (env->prog->aux->max_ctx_offset < off + size)
704 env->prog->aux->max_ctx_offset = off + size; 689 env->prog->aux->max_ctx_offset = off + size;
@@ -738,6 +723,7 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg,
738 switch (env->prog->type) { 723 switch (env->prog->type) {
739 case BPF_PROG_TYPE_SCHED_CLS: 724 case BPF_PROG_TYPE_SCHED_CLS:
740 case BPF_PROG_TYPE_SCHED_ACT: 725 case BPF_PROG_TYPE_SCHED_ACT:
726 case BPF_PROG_TYPE_XDP:
741 break; 727 break;
742 default: 728 default:
743 verbose("verifier is misconfigured\n"); 729 verbose("verifier is misconfigured\n");
@@ -798,21 +784,19 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
798 mark_reg_unknown_value(state->regs, value_regno); 784 mark_reg_unknown_value(state->regs, value_regno);
799 785
800 } else if (reg->type == PTR_TO_CTX) { 786 } else if (reg->type == PTR_TO_CTX) {
787 enum bpf_reg_type reg_type = UNKNOWN_VALUE;
788
801 if (t == BPF_WRITE && value_regno >= 0 && 789 if (t == BPF_WRITE && value_regno >= 0 &&
802 is_pointer_value(env, value_regno)) { 790 is_pointer_value(env, value_regno)) {
803 verbose("R%d leaks addr into ctx\n", value_regno); 791 verbose("R%d leaks addr into ctx\n", value_regno);
804 return -EACCES; 792 return -EACCES;
805 } 793 }
806 err = check_ctx_access(env, off, size, t); 794 err = check_ctx_access(env, off, size, t, &reg_type);
807 if (!err && t == BPF_READ && value_regno >= 0) { 795 if (!err && t == BPF_READ && value_regno >= 0) {
808 mark_reg_unknown_value(state->regs, value_regno); 796 mark_reg_unknown_value(state->regs, value_regno);
809 if (off == offsetof(struct __sk_buff, data) && 797 if (env->allow_ptr_leaks)
810 env->allow_ptr_leaks)
811 /* note that reg.[id|off|range] == 0 */ 798 /* note that reg.[id|off|range] == 0 */
812 state->regs[value_regno].type = PTR_TO_PACKET; 799 state->regs[value_regno].type = reg_type;
813 else if (off == offsetof(struct __sk_buff, data_end) &&
814 env->allow_ptr_leaks)
815 state->regs[value_regno].type = PTR_TO_PACKET_END;
816 } 800 }
817 801
818 } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { 802 } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) {
@@ -832,10 +816,15 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
832 err = check_stack_read(state, off, size, value_regno); 816 err = check_stack_read(state, off, size, value_regno);
833 } 817 }
834 } else if (state->regs[regno].type == PTR_TO_PACKET) { 818 } else if (state->regs[regno].type == PTR_TO_PACKET) {
835 if (t == BPF_WRITE) { 819 if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) {
836 verbose("cannot write into packet\n"); 820 verbose("cannot write into packet\n");
837 return -EACCES; 821 return -EACCES;
838 } 822 }
823 if (t == BPF_WRITE && value_regno >= 0 &&
824 is_pointer_value(env, value_regno)) {
825 verbose("R%d leaks addr into packet\n", value_regno);
826 return -EACCES;
827 }
839 err = check_packet_access(env, regno, off, size); 828 err = check_packet_access(env, regno, off, size);
840 if (!err && t == BPF_READ && value_regno >= 0) 829 if (!err && t == BPF_READ && value_regno >= 0)
841 mark_reg_unknown_value(state->regs, value_regno); 830 mark_reg_unknown_value(state->regs, value_regno);
@@ -1062,6 +1051,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
1062 if (func_id != BPF_FUNC_get_stackid) 1051 if (func_id != BPF_FUNC_get_stackid)
1063 goto error; 1052 goto error;
1064 break; 1053 break;
1054 case BPF_MAP_TYPE_CGROUP_ARRAY:
1055 if (func_id != BPF_FUNC_skb_in_cgroup)
1056 goto error;
1057 break;
1065 default: 1058 default:
1066 break; 1059 break;
1067 } 1060 }
@@ -1081,6 +1074,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
1081 if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) 1074 if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
1082 goto error; 1075 goto error;
1083 break; 1076 break;
1077 case BPF_FUNC_skb_in_cgroup:
1078 if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
1079 goto error;
1080 break;
1084 default: 1081 default:
1085 break; 1082 break;
1086 } 1083 }
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 86cb5c6e8932..d1c51b7f5221 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -61,7 +61,7 @@
61#include <linux/cpuset.h> 61#include <linux/cpuset.h>
62#include <linux/proc_ns.h> 62#include <linux/proc_ns.h>
63#include <linux/nsproxy.h> 63#include <linux/nsproxy.h>
64#include <linux/proc_ns.h> 64#include <linux/file.h>
65#include <net/sock.h> 65#include <net/sock.h>
66 66
67/* 67/*
@@ -837,6 +837,8 @@ static void put_css_set_locked(struct css_set *cset)
837 837
838static void put_css_set(struct css_set *cset) 838static void put_css_set(struct css_set *cset)
839{ 839{
840 unsigned long flags;
841
840 /* 842 /*
841 * Ensure that the refcount doesn't hit zero while any readers 843 * Ensure that the refcount doesn't hit zero while any readers
842 * can see it. Similar to atomic_dec_and_lock(), but for an 844 * can see it. Similar to atomic_dec_and_lock(), but for an
@@ -845,9 +847,9 @@ static void put_css_set(struct css_set *cset)
845 if (atomic_add_unless(&cset->refcount, -1, 1)) 847 if (atomic_add_unless(&cset->refcount, -1, 1))
846 return; 848 return;
847 849
848 spin_lock_bh(&css_set_lock); 850 spin_lock_irqsave(&css_set_lock, flags);
849 put_css_set_locked(cset); 851 put_css_set_locked(cset);
850 spin_unlock_bh(&css_set_lock); 852 spin_unlock_irqrestore(&css_set_lock, flags);
851} 853}
852 854
853/* 855/*
@@ -1070,11 +1072,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
1070 1072
1071 /* First see if we already have a cgroup group that matches 1073 /* First see if we already have a cgroup group that matches
1072 * the desired set */ 1074 * the desired set */
1073 spin_lock_bh(&css_set_lock); 1075 spin_lock_irq(&css_set_lock);
1074 cset = find_existing_css_set(old_cset, cgrp, template); 1076 cset = find_existing_css_set(old_cset, cgrp, template);
1075 if (cset) 1077 if (cset)
1076 get_css_set(cset); 1078 get_css_set(cset);
1077 spin_unlock_bh(&css_set_lock); 1079 spin_unlock_irq(&css_set_lock);
1078 1080
1079 if (cset) 1081 if (cset)
1080 return cset; 1082 return cset;
@@ -1102,7 +1104,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
1102 * find_existing_css_set() */ 1104 * find_existing_css_set() */
1103 memcpy(cset->subsys, template, sizeof(cset->subsys)); 1105 memcpy(cset->subsys, template, sizeof(cset->subsys));
1104 1106
1105 spin_lock_bh(&css_set_lock); 1107 spin_lock_irq(&css_set_lock);
1106 /* Add reference counts and links from the new css_set. */ 1108 /* Add reference counts and links from the new css_set. */
1107 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { 1109 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
1108 struct cgroup *c = link->cgrp; 1110 struct cgroup *c = link->cgrp;
@@ -1128,7 +1130,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
1128 css_get(css); 1130 css_get(css);
1129 } 1131 }
1130 1132
1131 spin_unlock_bh(&css_set_lock); 1133 spin_unlock_irq(&css_set_lock);
1132 1134
1133 return cset; 1135 return cset;
1134} 1136}
@@ -1158,18 +1160,12 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
1158{ 1160{
1159 lockdep_assert_held(&cgroup_mutex); 1161 lockdep_assert_held(&cgroup_mutex);
1160 1162
1161 if (root->hierarchy_id) { 1163 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
1162 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
1163 root->hierarchy_id = 0;
1164 }
1165} 1164}
1166 1165
1167static void cgroup_free_root(struct cgroup_root *root) 1166static void cgroup_free_root(struct cgroup_root *root)
1168{ 1167{
1169 if (root) { 1168 if (root) {
1170 /* hierarchy ID should already have been released */
1171 WARN_ON_ONCE(root->hierarchy_id);
1172
1173 idr_destroy(&root->cgroup_idr); 1169 idr_destroy(&root->cgroup_idr);
1174 kfree(root); 1170 kfree(root);
1175 } 1171 }
@@ -1192,7 +1188,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
1192 * Release all the links from cset_links to this hierarchy's 1188 * Release all the links from cset_links to this hierarchy's
1193 * root cgroup 1189 * root cgroup
1194 */ 1190 */
1195 spin_lock_bh(&css_set_lock); 1191 spin_lock_irq(&css_set_lock);
1196 1192
1197 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { 1193 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
1198 list_del(&link->cset_link); 1194 list_del(&link->cset_link);
@@ -1200,7 +1196,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
1200 kfree(link); 1196 kfree(link);
1201 } 1197 }
1202 1198
1203 spin_unlock_bh(&css_set_lock); 1199 spin_unlock_irq(&css_set_lock);
1204 1200
1205 if (!list_empty(&root->root_list)) { 1201 if (!list_empty(&root->root_list)) {
1206 list_del(&root->root_list); 1202 list_del(&root->root_list);
@@ -1600,11 +1596,11 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
1600 ss->root = dst_root; 1596 ss->root = dst_root;
1601 css->cgroup = dcgrp; 1597 css->cgroup = dcgrp;
1602 1598
1603 spin_lock_bh(&css_set_lock); 1599 spin_lock_irq(&css_set_lock);
1604 hash_for_each(css_set_table, i, cset, hlist) 1600 hash_for_each(css_set_table, i, cset, hlist)
1605 list_move_tail(&cset->e_cset_node[ss->id], 1601 list_move_tail(&cset->e_cset_node[ss->id],
1606 &dcgrp->e_csets[ss->id]); 1602 &dcgrp->e_csets[ss->id]);
1607 spin_unlock_bh(&css_set_lock); 1603 spin_unlock_irq(&css_set_lock);
1608 1604
1609 /* default hierarchy doesn't enable controllers by default */ 1605 /* default hierarchy doesn't enable controllers by default */
1610 dst_root->subsys_mask |= 1 << ssid; 1606 dst_root->subsys_mask |= 1 << ssid;
@@ -1640,10 +1636,10 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
1640 if (!buf) 1636 if (!buf)
1641 return -ENOMEM; 1637 return -ENOMEM;
1642 1638
1643 spin_lock_bh(&css_set_lock); 1639 spin_lock_irq(&css_set_lock);
1644 ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot); 1640 ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
1645 len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); 1641 len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
1646 spin_unlock_bh(&css_set_lock); 1642 spin_unlock_irq(&css_set_lock);
1647 1643
1648 if (len >= PATH_MAX) 1644 if (len >= PATH_MAX)
1649 len = -ERANGE; 1645 len = -ERANGE;
@@ -1897,7 +1893,7 @@ static void cgroup_enable_task_cg_lists(void)
1897{ 1893{
1898 struct task_struct *p, *g; 1894 struct task_struct *p, *g;
1899 1895
1900 spin_lock_bh(&css_set_lock); 1896 spin_lock_irq(&css_set_lock);
1901 1897
1902 if (use_task_css_set_links) 1898 if (use_task_css_set_links)
1903 goto out_unlock; 1899 goto out_unlock;
@@ -1922,8 +1918,12 @@ static void cgroup_enable_task_cg_lists(void)
1922 * entry won't be deleted though the process has exited. 1918 * entry won't be deleted though the process has exited.
1923 * Do it while holding siglock so that we don't end up 1919 * Do it while holding siglock so that we don't end up
1924 * racing against cgroup_exit(). 1920 * racing against cgroup_exit().
1921 *
1922 * Interrupts were already disabled while acquiring
1923 * the css_set_lock, so we do not need to disable it
1924 * again when acquiring the sighand->siglock here.
1925 */ 1925 */
1926 spin_lock_irq(&p->sighand->siglock); 1926 spin_lock(&p->sighand->siglock);
1927 if (!(p->flags & PF_EXITING)) { 1927 if (!(p->flags & PF_EXITING)) {
1928 struct css_set *cset = task_css_set(p); 1928 struct css_set *cset = task_css_set(p);
1929 1929
@@ -1932,11 +1932,11 @@ static void cgroup_enable_task_cg_lists(void)
1932 list_add_tail(&p->cg_list, &cset->tasks); 1932 list_add_tail(&p->cg_list, &cset->tasks);
1933 get_css_set(cset); 1933 get_css_set(cset);
1934 } 1934 }
1935 spin_unlock_irq(&p->sighand->siglock); 1935 spin_unlock(&p->sighand->siglock);
1936 } while_each_thread(g, p); 1936 } while_each_thread(g, p);
1937 read_unlock(&tasklist_lock); 1937 read_unlock(&tasklist_lock);
1938out_unlock: 1938out_unlock:
1939 spin_unlock_bh(&css_set_lock); 1939 spin_unlock_irq(&css_set_lock);
1940} 1940}
1941 1941
1942static void init_cgroup_housekeeping(struct cgroup *cgrp) 1942static void init_cgroup_housekeeping(struct cgroup *cgrp)
@@ -2043,13 +2043,13 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
2043 * Link the root cgroup in this hierarchy into all the css_set 2043 * Link the root cgroup in this hierarchy into all the css_set
2044 * objects. 2044 * objects.
2045 */ 2045 */
2046 spin_lock_bh(&css_set_lock); 2046 spin_lock_irq(&css_set_lock);
2047 hash_for_each(css_set_table, i, cset, hlist) { 2047 hash_for_each(css_set_table, i, cset, hlist) {
2048 link_css_set(&tmp_links, cset, root_cgrp); 2048 link_css_set(&tmp_links, cset, root_cgrp);
2049 if (css_set_populated(cset)) 2049 if (css_set_populated(cset))
2050 cgroup_update_populated(root_cgrp, true); 2050 cgroup_update_populated(root_cgrp, true);
2051 } 2051 }
2052 spin_unlock_bh(&css_set_lock); 2052 spin_unlock_irq(&css_set_lock);
2053 2053
2054 BUG_ON(!list_empty(&root_cgrp->self.children)); 2054 BUG_ON(!list_empty(&root_cgrp->self.children));
2055 BUG_ON(atomic_read(&root->nr_cgrps) != 1); 2055 BUG_ON(atomic_read(&root->nr_cgrps) != 1);
@@ -2209,12 +2209,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
2209 goto out_unlock; 2209 goto out_unlock;
2210 } 2210 }
2211 2211
2212 /* 2212 /* Hierarchies may only be created in the initial cgroup namespace. */
2213 * We know this subsystem has not yet been bound. Users in a non-init 2213 if (ns != &init_cgroup_ns) {
2214 * user namespace may only mount hierarchies with no bound subsystems,
2215 * i.e. 'none,name=user1'
2216 */
2217 if (!opts.none && !capable(CAP_SYS_ADMIN)) {
2218 ret = -EPERM; 2214 ret = -EPERM;
2219 goto out_unlock; 2215 goto out_unlock;
2220 } 2216 }
@@ -2256,11 +2252,11 @@ out_mount:
2256 struct cgroup *cgrp; 2252 struct cgroup *cgrp;
2257 2253
2258 mutex_lock(&cgroup_mutex); 2254 mutex_lock(&cgroup_mutex);
2259 spin_lock_bh(&css_set_lock); 2255 spin_lock_irq(&css_set_lock);
2260 2256
2261 cgrp = cset_cgroup_from_root(ns->root_cset, root); 2257 cgrp = cset_cgroup_from_root(ns->root_cset, root);
2262 2258
2263 spin_unlock_bh(&css_set_lock); 2259 spin_unlock_irq(&css_set_lock);
2264 mutex_unlock(&cgroup_mutex); 2260 mutex_unlock(&cgroup_mutex);
2265 2261
2266 nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); 2262 nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
@@ -2337,11 +2333,11 @@ char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
2337 char *ret; 2333 char *ret;
2338 2334
2339 mutex_lock(&cgroup_mutex); 2335 mutex_lock(&cgroup_mutex);
2340 spin_lock_bh(&css_set_lock); 2336 spin_lock_irq(&css_set_lock);
2341 2337
2342 ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns); 2338 ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
2343 2339
2344 spin_unlock_bh(&css_set_lock); 2340 spin_unlock_irq(&css_set_lock);
2345 mutex_unlock(&cgroup_mutex); 2341 mutex_unlock(&cgroup_mutex);
2346 2342
2347 return ret; 2343 return ret;
@@ -2369,7 +2365,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
2369 char *path = NULL; 2365 char *path = NULL;
2370 2366
2371 mutex_lock(&cgroup_mutex); 2367 mutex_lock(&cgroup_mutex);
2372 spin_lock_bh(&css_set_lock); 2368 spin_lock_irq(&css_set_lock);
2373 2369
2374 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); 2370 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
2375 2371
@@ -2382,7 +2378,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
2382 path = buf; 2378 path = buf;
2383 } 2379 }
2384 2380
2385 spin_unlock_bh(&css_set_lock); 2381 spin_unlock_irq(&css_set_lock);
2386 mutex_unlock(&cgroup_mutex); 2382 mutex_unlock(&cgroup_mutex);
2387 return path; 2383 return path;
2388} 2384}
@@ -2557,7 +2553,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
2557 * the new cgroup. There are no failure cases after here, so this 2553 * the new cgroup. There are no failure cases after here, so this
2558 * is the commit point. 2554 * is the commit point.
2559 */ 2555 */
2560 spin_lock_bh(&css_set_lock); 2556 spin_lock_irq(&css_set_lock);
2561 list_for_each_entry(cset, &tset->src_csets, mg_node) { 2557 list_for_each_entry(cset, &tset->src_csets, mg_node) {
2562 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) { 2558 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
2563 struct css_set *from_cset = task_css_set(task); 2559 struct css_set *from_cset = task_css_set(task);
@@ -2568,7 +2564,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
2568 put_css_set_locked(from_cset); 2564 put_css_set_locked(from_cset);
2569 } 2565 }
2570 } 2566 }
2571 spin_unlock_bh(&css_set_lock); 2567 spin_unlock_irq(&css_set_lock);
2572 2568
2573 /* 2569 /*
2574 * Migration is committed, all target tasks are now on dst_csets. 2570 * Migration is committed, all target tasks are now on dst_csets.
@@ -2597,13 +2593,13 @@ out_cancel_attach:
2597 } 2593 }
2598 } while_each_subsys_mask(); 2594 } while_each_subsys_mask();
2599out_release_tset: 2595out_release_tset:
2600 spin_lock_bh(&css_set_lock); 2596 spin_lock_irq(&css_set_lock);
2601 list_splice_init(&tset->dst_csets, &tset->src_csets); 2597 list_splice_init(&tset->dst_csets, &tset->src_csets);
2602 list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) { 2598 list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
2603 list_splice_tail_init(&cset->mg_tasks, &cset->tasks); 2599 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2604 list_del_init(&cset->mg_node); 2600 list_del_init(&cset->mg_node);
2605 } 2601 }
2606 spin_unlock_bh(&css_set_lock); 2602 spin_unlock_irq(&css_set_lock);
2607 return ret; 2603 return ret;
2608} 2604}
2609 2605
@@ -2634,7 +2630,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
2634 2630
2635 lockdep_assert_held(&cgroup_mutex); 2631 lockdep_assert_held(&cgroup_mutex);
2636 2632
2637 spin_lock_bh(&css_set_lock); 2633 spin_lock_irq(&css_set_lock);
2638 list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { 2634 list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
2639 cset->mg_src_cgrp = NULL; 2635 cset->mg_src_cgrp = NULL;
2640 cset->mg_dst_cgrp = NULL; 2636 cset->mg_dst_cgrp = NULL;
@@ -2642,7 +2638,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
2642 list_del_init(&cset->mg_preload_node); 2638 list_del_init(&cset->mg_preload_node);
2643 put_css_set_locked(cset); 2639 put_css_set_locked(cset);
2644 } 2640 }
2645 spin_unlock_bh(&css_set_lock); 2641 spin_unlock_irq(&css_set_lock);
2646} 2642}
2647 2643
2648/** 2644/**
@@ -2783,7 +2779,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
2783 * already PF_EXITING could be freed from underneath us unless we 2779 * already PF_EXITING could be freed from underneath us unless we
2784 * take an rcu_read_lock. 2780 * take an rcu_read_lock.
2785 */ 2781 */
2786 spin_lock_bh(&css_set_lock); 2782 spin_lock_irq(&css_set_lock);
2787 rcu_read_lock(); 2783 rcu_read_lock();
2788 task = leader; 2784 task = leader;
2789 do { 2785 do {
@@ -2792,7 +2788,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
2792 break; 2788 break;
2793 } while_each_thread(leader, task); 2789 } while_each_thread(leader, task);
2794 rcu_read_unlock(); 2790 rcu_read_unlock();
2795 spin_unlock_bh(&css_set_lock); 2791 spin_unlock_irq(&css_set_lock);
2796 2792
2797 return cgroup_taskset_migrate(&tset, root); 2793 return cgroup_taskset_migrate(&tset, root);
2798} 2794}
@@ -2816,7 +2812,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
2816 return -EBUSY; 2812 return -EBUSY;
2817 2813
2818 /* look up all src csets */ 2814 /* look up all src csets */
2819 spin_lock_bh(&css_set_lock); 2815 spin_lock_irq(&css_set_lock);
2820 rcu_read_lock(); 2816 rcu_read_lock();
2821 task = leader; 2817 task = leader;
2822 do { 2818 do {
@@ -2826,7 +2822,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
2826 break; 2822 break;
2827 } while_each_thread(leader, task); 2823 } while_each_thread(leader, task);
2828 rcu_read_unlock(); 2824 rcu_read_unlock();
2829 spin_unlock_bh(&css_set_lock); 2825 spin_unlock_irq(&css_set_lock);
2830 2826
2831 /* prepare dst csets and commit */ 2827 /* prepare dst csets and commit */
2832 ret = cgroup_migrate_prepare_dst(&preloaded_csets); 2828 ret = cgroup_migrate_prepare_dst(&preloaded_csets);
@@ -2859,9 +2855,9 @@ static int cgroup_procs_write_permission(struct task_struct *task,
2859 struct cgroup *cgrp; 2855 struct cgroup *cgrp;
2860 struct inode *inode; 2856 struct inode *inode;
2861 2857
2862 spin_lock_bh(&css_set_lock); 2858 spin_lock_irq(&css_set_lock);
2863 cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); 2859 cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
2864 spin_unlock_bh(&css_set_lock); 2860 spin_unlock_irq(&css_set_lock);
2865 2861
2866 while (!cgroup_is_descendant(dst_cgrp, cgrp)) 2862 while (!cgroup_is_descendant(dst_cgrp, cgrp))
2867 cgrp = cgroup_parent(cgrp); 2863 cgrp = cgroup_parent(cgrp);
@@ -2956,20 +2952,22 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2956 int retval = 0; 2952 int retval = 0;
2957 2953
2958 mutex_lock(&cgroup_mutex); 2954 mutex_lock(&cgroup_mutex);
2955 percpu_down_write(&cgroup_threadgroup_rwsem);
2959 for_each_root(root) { 2956 for_each_root(root) {
2960 struct cgroup *from_cgrp; 2957 struct cgroup *from_cgrp;
2961 2958
2962 if (root == &cgrp_dfl_root) 2959 if (root == &cgrp_dfl_root)
2963 continue; 2960 continue;
2964 2961
2965 spin_lock_bh(&css_set_lock); 2962 spin_lock_irq(&css_set_lock);
2966 from_cgrp = task_cgroup_from_root(from, root); 2963 from_cgrp = task_cgroup_from_root(from, root);
2967 spin_unlock_bh(&css_set_lock); 2964 spin_unlock_irq(&css_set_lock);
2968 2965
2969 retval = cgroup_attach_task(from_cgrp, tsk, false); 2966 retval = cgroup_attach_task(from_cgrp, tsk, false);
2970 if (retval) 2967 if (retval)
2971 break; 2968 break;
2972 } 2969 }
2970 percpu_up_write(&cgroup_threadgroup_rwsem);
2973 mutex_unlock(&cgroup_mutex); 2971 mutex_unlock(&cgroup_mutex);
2974 2972
2975 return retval; 2973 return retval;
@@ -3080,7 +3078,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
3080 percpu_down_write(&cgroup_threadgroup_rwsem); 3078 percpu_down_write(&cgroup_threadgroup_rwsem);
3081 3079
3082 /* look up all csses currently attached to @cgrp's subtree */ 3080 /* look up all csses currently attached to @cgrp's subtree */
3083 spin_lock_bh(&css_set_lock); 3081 spin_lock_irq(&css_set_lock);
3084 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { 3082 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
3085 struct cgrp_cset_link *link; 3083 struct cgrp_cset_link *link;
3086 3084
@@ -3088,14 +3086,14 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
3088 cgroup_migrate_add_src(link->cset, dsct, 3086 cgroup_migrate_add_src(link->cset, dsct,
3089 &preloaded_csets); 3087 &preloaded_csets);
3090 } 3088 }
3091 spin_unlock_bh(&css_set_lock); 3089 spin_unlock_irq(&css_set_lock);
3092 3090
3093 /* NULL dst indicates self on default hierarchy */ 3091 /* NULL dst indicates self on default hierarchy */
3094 ret = cgroup_migrate_prepare_dst(&preloaded_csets); 3092 ret = cgroup_migrate_prepare_dst(&preloaded_csets);
3095 if (ret) 3093 if (ret)
3096 goto out_finish; 3094 goto out_finish;
3097 3095
3098 spin_lock_bh(&css_set_lock); 3096 spin_lock_irq(&css_set_lock);
3099 list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { 3097 list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
3100 struct task_struct *task, *ntask; 3098 struct task_struct *task, *ntask;
3101 3099
@@ -3107,7 +3105,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
3107 list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) 3105 list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
3108 cgroup_taskset_add(task, &tset); 3106 cgroup_taskset_add(task, &tset);
3109 } 3107 }
3110 spin_unlock_bh(&css_set_lock); 3108 spin_unlock_irq(&css_set_lock);
3111 3109
3112 ret = cgroup_taskset_migrate(&tset, cgrp->root); 3110 ret = cgroup_taskset_migrate(&tset, cgrp->root);
3113out_finish: 3111out_finish:
@@ -3908,10 +3906,10 @@ static int cgroup_task_count(const struct cgroup *cgrp)
3908 int count = 0; 3906 int count = 0;
3909 struct cgrp_cset_link *link; 3907 struct cgrp_cset_link *link;
3910 3908
3911 spin_lock_bh(&css_set_lock); 3909 spin_lock_irq(&css_set_lock);
3912 list_for_each_entry(link, &cgrp->cset_links, cset_link) 3910 list_for_each_entry(link, &cgrp->cset_links, cset_link)
3913 count += atomic_read(&link->cset->refcount); 3911 count += atomic_read(&link->cset->refcount);
3914 spin_unlock_bh(&css_set_lock); 3912 spin_unlock_irq(&css_set_lock);
3915 return count; 3913 return count;
3916} 3914}
3917 3915
@@ -4249,7 +4247,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
4249 4247
4250 memset(it, 0, sizeof(*it)); 4248 memset(it, 0, sizeof(*it));
4251 4249
4252 spin_lock_bh(&css_set_lock); 4250 spin_lock_irq(&css_set_lock);
4253 4251
4254 it->ss = css->ss; 4252 it->ss = css->ss;
4255 4253
@@ -4262,7 +4260,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
4262 4260
4263 css_task_iter_advance_css_set(it); 4261 css_task_iter_advance_css_set(it);
4264 4262
4265 spin_unlock_bh(&css_set_lock); 4263 spin_unlock_irq(&css_set_lock);
4266} 4264}
4267 4265
4268/** 4266/**
@@ -4280,7 +4278,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
4280 it->cur_task = NULL; 4278 it->cur_task = NULL;
4281 } 4279 }
4282 4280
4283 spin_lock_bh(&css_set_lock); 4281 spin_lock_irq(&css_set_lock);
4284 4282
4285 if (it->task_pos) { 4283 if (it->task_pos) {
4286 it->cur_task = list_entry(it->task_pos, struct task_struct, 4284 it->cur_task = list_entry(it->task_pos, struct task_struct,
@@ -4289,7 +4287,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
4289 css_task_iter_advance(it); 4287 css_task_iter_advance(it);
4290 } 4288 }
4291 4289
4292 spin_unlock_bh(&css_set_lock); 4290 spin_unlock_irq(&css_set_lock);
4293 4291
4294 return it->cur_task; 4292 return it->cur_task;
4295} 4293}
@@ -4303,10 +4301,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
4303void css_task_iter_end(struct css_task_iter *it) 4301void css_task_iter_end(struct css_task_iter *it)
4304{ 4302{
4305 if (it->cur_cset) { 4303 if (it->cur_cset) {
4306 spin_lock_bh(&css_set_lock); 4304 spin_lock_irq(&css_set_lock);
4307 list_del(&it->iters_node); 4305 list_del(&it->iters_node);
4308 put_css_set_locked(it->cur_cset); 4306 put_css_set_locked(it->cur_cset);
4309 spin_unlock_bh(&css_set_lock); 4307 spin_unlock_irq(&css_set_lock);
4310 } 4308 }
4311 4309
4312 if (it->cur_task) 4310 if (it->cur_task)
@@ -4337,11 +4335,13 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
4337 4335
4338 mutex_lock(&cgroup_mutex); 4336 mutex_lock(&cgroup_mutex);
4339 4337
4338 percpu_down_write(&cgroup_threadgroup_rwsem);
4339
4340 /* all tasks in @from are being moved, all csets are source */ 4340 /* all tasks in @from are being moved, all csets are source */
4341 spin_lock_bh(&css_set_lock); 4341 spin_lock_irq(&css_set_lock);
4342 list_for_each_entry(link, &from->cset_links, cset_link) 4342 list_for_each_entry(link, &from->cset_links, cset_link)
4343 cgroup_migrate_add_src(link->cset, to, &preloaded_csets); 4343 cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
4344 spin_unlock_bh(&css_set_lock); 4344 spin_unlock_irq(&css_set_lock);
4345 4345
4346 ret = cgroup_migrate_prepare_dst(&preloaded_csets); 4346 ret = cgroup_migrate_prepare_dst(&preloaded_csets);
4347 if (ret) 4347 if (ret)
@@ -4365,6 +4365,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
4365 } while (task && !ret); 4365 } while (task && !ret);
4366out_err: 4366out_err:
4367 cgroup_migrate_finish(&preloaded_csets); 4367 cgroup_migrate_finish(&preloaded_csets);
4368 percpu_up_write(&cgroup_threadgroup_rwsem);
4368 mutex_unlock(&cgroup_mutex); 4369 mutex_unlock(&cgroup_mutex);
4369 return ret; 4370 return ret;
4370} 4371}
@@ -5063,6 +5064,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
5063 memset(css, 0, sizeof(*css)); 5064 memset(css, 0, sizeof(*css));
5064 css->cgroup = cgrp; 5065 css->cgroup = cgrp;
5065 css->ss = ss; 5066 css->ss = ss;
5067 css->id = -1;
5066 INIT_LIST_HEAD(&css->sibling); 5068 INIT_LIST_HEAD(&css->sibling);
5067 INIT_LIST_HEAD(&css->children); 5069 INIT_LIST_HEAD(&css->children);
5068 css->serial_nr = css_serial_nr_next++; 5070 css->serial_nr = css_serial_nr_next++;
@@ -5139,6 +5141,8 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
5139 lockdep_assert_held(&cgroup_mutex); 5141 lockdep_assert_held(&cgroup_mutex);
5140 5142
5141 css = ss->css_alloc(parent_css); 5143 css = ss->css_alloc(parent_css);
5144 if (!css)
5145 css = ERR_PTR(-ENOMEM);
5142 if (IS_ERR(css)) 5146 if (IS_ERR(css))
5143 return css; 5147 return css;
5144 5148
@@ -5150,7 +5154,7 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
5150 5154
5151 err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL); 5155 err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
5152 if (err < 0) 5156 if (err < 0)
5153 goto err_free_percpu_ref; 5157 goto err_free_css;
5154 css->id = err; 5158 css->id = err;
5155 5159
5156 /* @css is ready to be brought online now, make it visible */ 5160 /* @css is ready to be brought online now, make it visible */
@@ -5174,9 +5178,6 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
5174 5178
5175err_list_del: 5179err_list_del:
5176 list_del_rcu(&css->sibling); 5180 list_del_rcu(&css->sibling);
5177 cgroup_idr_remove(&ss->css_idr, css->id);
5178err_free_percpu_ref:
5179 percpu_ref_exit(&css->refcnt);
5180err_free_css: 5181err_free_css:
5181 call_rcu(&css->rcu_head, css_free_rcu_fn); 5182 call_rcu(&css->rcu_head, css_free_rcu_fn);
5182 return ERR_PTR(err); 5183 return ERR_PTR(err);
@@ -5451,10 +5452,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
5451 */ 5452 */
5452 cgrp->self.flags &= ~CSS_ONLINE; 5453 cgrp->self.flags &= ~CSS_ONLINE;
5453 5454
5454 spin_lock_bh(&css_set_lock); 5455 spin_lock_irq(&css_set_lock);
5455 list_for_each_entry(link, &cgrp->cset_links, cset_link) 5456 list_for_each_entry(link, &cgrp->cset_links, cset_link)
5456 link->cset->dead = true; 5457 link->cset->dead = true;
5457 spin_unlock_bh(&css_set_lock); 5458 spin_unlock_irq(&css_set_lock);
5458 5459
5459 /* initiate massacre of all css's */ 5460 /* initiate massacre of all css's */
5460 for_each_css(css, ssid, cgrp) 5461 for_each_css(css, ssid, cgrp)
@@ -5725,7 +5726,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
5725 goto out; 5726 goto out;
5726 5727
5727 mutex_lock(&cgroup_mutex); 5728 mutex_lock(&cgroup_mutex);
5728 spin_lock_bh(&css_set_lock); 5729 spin_lock_irq(&css_set_lock);
5729 5730
5730 for_each_root(root) { 5731 for_each_root(root) {
5731 struct cgroup_subsys *ss; 5732 struct cgroup_subsys *ss;
@@ -5778,7 +5779,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
5778 5779
5779 retval = 0; 5780 retval = 0;
5780out_unlock: 5781out_unlock:
5781 spin_unlock_bh(&css_set_lock); 5782 spin_unlock_irq(&css_set_lock);
5782 mutex_unlock(&cgroup_mutex); 5783 mutex_unlock(&cgroup_mutex);
5783 kfree(buf); 5784 kfree(buf);
5784out: 5785out:
@@ -5923,13 +5924,13 @@ void cgroup_post_fork(struct task_struct *child)
5923 if (use_task_css_set_links) { 5924 if (use_task_css_set_links) {
5924 struct css_set *cset; 5925 struct css_set *cset;
5925 5926
5926 spin_lock_bh(&css_set_lock); 5927 spin_lock_irq(&css_set_lock);
5927 cset = task_css_set(current); 5928 cset = task_css_set(current);
5928 if (list_empty(&child->cg_list)) { 5929 if (list_empty(&child->cg_list)) {
5929 get_css_set(cset); 5930 get_css_set(cset);
5930 css_set_move_task(child, NULL, cset, false); 5931 css_set_move_task(child, NULL, cset, false);
5931 } 5932 }
5932 spin_unlock_bh(&css_set_lock); 5933 spin_unlock_irq(&css_set_lock);
5933 } 5934 }
5934 5935
5935 /* 5936 /*
@@ -5974,9 +5975,9 @@ void cgroup_exit(struct task_struct *tsk)
5974 cset = task_css_set(tsk); 5975 cset = task_css_set(tsk);
5975 5976
5976 if (!list_empty(&tsk->cg_list)) { 5977 if (!list_empty(&tsk->cg_list)) {
5977 spin_lock_bh(&css_set_lock); 5978 spin_lock_irq(&css_set_lock);
5978 css_set_move_task(tsk, cset, NULL, false); 5979 css_set_move_task(tsk, cset, NULL, false);
5979 spin_unlock_bh(&css_set_lock); 5980 spin_unlock_irq(&css_set_lock);
5980 } else { 5981 } else {
5981 get_css_set(cset); 5982 get_css_set(cset);
5982 } 5983 }
@@ -6044,9 +6045,9 @@ static void cgroup_release_agent(struct work_struct *work)
6044 if (!pathbuf || !agentbuf) 6045 if (!pathbuf || !agentbuf)
6045 goto out; 6046 goto out;
6046 6047
6047 spin_lock_bh(&css_set_lock); 6048 spin_lock_irq(&css_set_lock);
6048 path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); 6049 path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
6049 spin_unlock_bh(&css_set_lock); 6050 spin_unlock_irq(&css_set_lock);
6050 if (!path) 6051 if (!path)
6051 goto out; 6052 goto out;
6052 6053
@@ -6168,7 +6169,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
6168struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) 6169struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
6169{ 6170{
6170 WARN_ON_ONCE(!rcu_read_lock_held()); 6171 WARN_ON_ONCE(!rcu_read_lock_held());
6171 return id > 0 ? idr_find(&ss->css_idr, id) : NULL; 6172 return idr_find(&ss->css_idr, id);
6172} 6173}
6173 6174
6174/** 6175/**
@@ -6205,6 +6206,40 @@ struct cgroup *cgroup_get_from_path(const char *path)
6205} 6206}
6206EXPORT_SYMBOL_GPL(cgroup_get_from_path); 6207EXPORT_SYMBOL_GPL(cgroup_get_from_path);
6207 6208
6209/**
6210 * cgroup_get_from_fd - get a cgroup pointer from a fd
6211 * @fd: fd obtained by open(cgroup2_dir)
6212 *
6213 * Find the cgroup from a fd which should be obtained
6214 * by opening a cgroup directory. Returns a pointer to the
6215 * cgroup on success. ERR_PTR is returned if the cgroup
6216 * cannot be found.
6217 */
6218struct cgroup *cgroup_get_from_fd(int fd)
6219{
6220 struct cgroup_subsys_state *css;
6221 struct cgroup *cgrp;
6222 struct file *f;
6223
6224 f = fget_raw(fd);
6225 if (!f)
6226 return ERR_PTR(-EBADF);
6227
6228 css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
6229 fput(f);
6230 if (IS_ERR(css))
6231 return ERR_CAST(css);
6232
6233 cgrp = css->cgroup;
6234 if (!cgroup_on_dfl(cgrp)) {
6235 cgroup_put(cgrp);
6236 return ERR_PTR(-EBADF);
6237 }
6238
6239 return cgrp;
6240}
6241EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
6242
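A minimal caller sketch (hypothetical, not from this patch) for the new helper; the reference it returns must be dropped with cgroup_put():

static int use_cgroup_fd(int fd)
{
	struct cgroup *cgrp;

	cgrp = cgroup_get_from_fd(fd);	/* fd from open() on a cgroup2 dir */
	if (IS_ERR(cgrp))
		return PTR_ERR(cgrp);

	/* ... use cgrp ... */

	cgroup_put(cgrp);		/* drop the reference taken above */
	return 0;
}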
6208/* 6243/*
6209 * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data 6244 * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data
6210 * definition in cgroup-defs.h. 6245 * definition in cgroup-defs.h.
@@ -6305,14 +6340,11 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
6305 if (!ns_capable(user_ns, CAP_SYS_ADMIN)) 6340 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
6306 return ERR_PTR(-EPERM); 6341 return ERR_PTR(-EPERM);
6307 6342
6308 mutex_lock(&cgroup_mutex); 6343 /* It is not safe to take cgroup_mutex here */
6309 spin_lock_bh(&css_set_lock); 6344 spin_lock_irq(&css_set_lock);
6310
6311 cset = task_css_set(current); 6345 cset = task_css_set(current);
6312 get_css_set(cset); 6346 get_css_set(cset);
6313 6347 spin_unlock_irq(&css_set_lock);
6314 spin_unlock_bh(&css_set_lock);
6315 mutex_unlock(&cgroup_mutex);
6316 6348
6317 new_ns = alloc_cgroup_ns(); 6349 new_ns = alloc_cgroup_ns();
6318 if (IS_ERR(new_ns)) { 6350 if (IS_ERR(new_ns)) {
@@ -6435,7 +6467,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
6435 if (!name_buf) 6467 if (!name_buf)
6436 return -ENOMEM; 6468 return -ENOMEM;
6437 6469
6438 spin_lock_bh(&css_set_lock); 6470 spin_lock_irq(&css_set_lock);
6439 rcu_read_lock(); 6471 rcu_read_lock();
6440 cset = rcu_dereference(current->cgroups); 6472 cset = rcu_dereference(current->cgroups);
6441 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { 6473 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
@@ -6446,7 +6478,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
6446 c->root->hierarchy_id, name_buf); 6478 c->root->hierarchy_id, name_buf);
6447 } 6479 }
6448 rcu_read_unlock(); 6480 rcu_read_unlock();
6449 spin_unlock_bh(&css_set_lock); 6481 spin_unlock_irq(&css_set_lock);
6450 kfree(name_buf); 6482 kfree(name_buf);
6451 return 0; 6483 return 0;
6452} 6484}
@@ -6457,7 +6489,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
6457 struct cgroup_subsys_state *css = seq_css(seq); 6489 struct cgroup_subsys_state *css = seq_css(seq);
6458 struct cgrp_cset_link *link; 6490 struct cgrp_cset_link *link;
6459 6491
6460 spin_lock_bh(&css_set_lock); 6492 spin_lock_irq(&css_set_lock);
6461 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { 6493 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
6462 struct css_set *cset = link->cset; 6494 struct css_set *cset = link->cset;
6463 struct task_struct *task; 6495 struct task_struct *task;
@@ -6480,7 +6512,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
6480 overflow: 6512 overflow:
6481 seq_puts(seq, " ...\n"); 6513 seq_puts(seq, " ...\n");
6482 } 6514 }
6483 spin_unlock_bh(&css_set_lock); 6515 spin_unlock_irq(&css_set_lock);
6484 return 0; 6516 return 0;
6485} 6517}
6486 6518
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
index 303097b37429..2bd673783f1a 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup_pids.c
@@ -49,6 +49,12 @@ struct pids_cgroup {
49 */ 49 */
50 atomic64_t counter; 50 atomic64_t counter;
51 int64_t limit; 51 int64_t limit;
52
53 /* Handle for "pids.events" */
54 struct cgroup_file events_file;
55
56 /* Number of times fork failed because limit was hit. */
57 atomic64_t events_limit;
52}; 58};
53 59
54static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css) 60static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
@@ -72,6 +78,7 @@ pids_css_alloc(struct cgroup_subsys_state *parent)
72 78
73 pids->limit = PIDS_MAX; 79 pids->limit = PIDS_MAX;
74 atomic64_set(&pids->counter, 0); 80 atomic64_set(&pids->counter, 0);
81 atomic64_set(&pids->events_limit, 0);
75 return &pids->css; 82 return &pids->css;
76} 83}
77 84
@@ -213,10 +220,21 @@ static int pids_can_fork(struct task_struct *task)
213{ 220{
214 struct cgroup_subsys_state *css; 221 struct cgroup_subsys_state *css;
215 struct pids_cgroup *pids; 222 struct pids_cgroup *pids;
223 int err;
216 224
217 css = task_css_check(current, pids_cgrp_id, true); 225 css = task_css_check(current, pids_cgrp_id, true);
218 pids = css_pids(css); 226 pids = css_pids(css);
219 return pids_try_charge(pids, 1); 227 err = pids_try_charge(pids, 1);
228 if (err) {
229 /* Only log the first time events_limit is incremented. */
230 if (atomic64_inc_return(&pids->events_limit) == 1) {
231 pr_info("cgroup: fork rejected by pids controller in ");
232 pr_cont_cgroup_path(task_cgroup(current, pids_cgrp_id));
233 pr_cont("\n");
234 }
235 cgroup_file_notify(&pids->events_file);
236 }
237 return err;
220} 238}
221 239
222static void pids_cancel_fork(struct task_struct *task) 240static void pids_cancel_fork(struct task_struct *task)
@@ -288,6 +306,14 @@ static s64 pids_current_read(struct cgroup_subsys_state *css,
288 return atomic64_read(&pids->counter); 306 return atomic64_read(&pids->counter);
289} 307}
290 308
309static int pids_events_show(struct seq_file *sf, void *v)
310{
311 struct pids_cgroup *pids = css_pids(seq_css(sf));
312
313 seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit));
314 return 0;
315}
316
291static struct cftype pids_files[] = { 317static struct cftype pids_files[] = {
292 { 318 {
293 .name = "max", 319 .name = "max",
@@ -300,6 +326,12 @@ static struct cftype pids_files[] = {
300 .read_s64 = pids_current_read, 326 .read_s64 = pids_current_read,
301 .flags = CFTYPE_NOT_ON_ROOT, 327 .flags = CFTYPE_NOT_ON_ROOT,
302 }, 328 },
329 {
330 .name = "events",
331 .seq_show = pids_events_show,
332 .file_offset = offsetof(struct pids_cgroup, events_file),
333 .flags = CFTYPE_NOT_ON_ROOT,
334 },
303 { } /* terminate */ 335 { } /* terminate */
304}; 336};
305 337
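As a rough userspace illustration (assuming a cgroup2 mount at /sys/fs/cgroup and a child group named "mygroup"), the new pids.events file reports how often fork() was rejected because the limit was hit:

#include <stdio.h>

int main(void)
{
	char line[64];
	FILE *f = fopen("/sys/fs/cgroup/mygroup/pids.events", "r");

	if (!f)
		return 1;
	if (fgets(line, sizeof(line), f))
		printf("%s", line);	/* e.g. "max 3" */
	fclose(f);
	return 0;
}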
diff --git a/kernel/cpu.c b/kernel/cpu.c
index d948e44c471e..341bf80f80bd 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -517,6 +517,13 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state,
517 if (!cpu_online(cpu)) 517 if (!cpu_online(cpu))
518 return 0; 518 return 0;
519 519
520 /*
521 * If we are up and running, use the hotplug thread. For early calls
522 * we invoke the thread function directly.
523 */
524 if (!st->thread)
525 return cpuhp_invoke_callback(cpu, state, cb);
526
520 st->cb_state = state; 527 st->cb_state = state;
521 st->cb = cb; 528 st->cb = cb;
522 /* 529 /*
@@ -1173,6 +1180,31 @@ static struct cpuhp_step cpuhp_bp_states[] = {
1173 .teardown = NULL, 1180 .teardown = NULL,
1174 .cant_stop = true, 1181 .cant_stop = true,
1175 }, 1182 },
1183 [CPUHP_PERF_PREPARE] = {
1184 .name = "perf prepare",
1185 .startup = perf_event_init_cpu,
1186 .teardown = perf_event_exit_cpu,
1187 },
1188 [CPUHP_WORKQUEUE_PREP] = {
1189 .name = "workqueue prepare",
1190 .startup = workqueue_prepare_cpu,
1191 .teardown = NULL,
1192 },
1193 [CPUHP_HRTIMERS_PREPARE] = {
1194 .name = "hrtimers prepare",
1195 .startup = hrtimers_prepare_cpu,
1196 .teardown = hrtimers_dead_cpu,
1197 },
1198 [CPUHP_SMPCFD_PREPARE] = {
1199 .name = "SMPCFD prepare",
1200 .startup = smpcfd_prepare_cpu,
1201 .teardown = smpcfd_dead_cpu,
1202 },
1203 [CPUHP_RCUTREE_PREP] = {
1204 .name = "RCU-tree prepare",
1205 .startup = rcutree_prepare_cpu,
1206 .teardown = rcutree_dead_cpu,
1207 },
1176 /* 1208 /*
1177 * Preparatory and dead notifiers. Will be replaced once the notifiers 1209 * Preparatory and dead notifiers. Will be replaced once the notifiers
1178 * are converted to states. 1210 * are converted to states.
@@ -1184,6 +1216,16 @@ static struct cpuhp_step cpuhp_bp_states[] = {
1184 .skip_onerr = true, 1216 .skip_onerr = true,
1185 .cant_stop = true, 1217 .cant_stop = true,
1186 }, 1218 },
1219 /*
1220 * On the tear-down path, timers_dead_cpu() must be invoked
1221 * before blk_mq_queue_reinit_notify() from notify_dead(),
1222 * otherwise a RCU stall occurs.
1223 */
1224 [CPUHP_TIMERS_DEAD] = {
1225 .name = "timers dead",
1226 .startup = NULL,
1227 .teardown = timers_dead_cpu,
1228 },
1187 /* Kicks the plugged cpu into life */ 1229 /* Kicks the plugged cpu into life */
1188 [CPUHP_BRINGUP_CPU] = { 1230 [CPUHP_BRINGUP_CPU] = {
1189 .name = "cpu:bringup", 1231 .name = "cpu:bringup",
@@ -1191,6 +1233,10 @@ static struct cpuhp_step cpuhp_bp_states[] = {
1191 .teardown = NULL, 1233 .teardown = NULL,
1192 .cant_stop = true, 1234 .cant_stop = true,
1193 }, 1235 },
1236 [CPUHP_AP_SMPCFD_DYING] = {
1237 .startup = NULL,
1238 .teardown = smpcfd_dying_cpu,
1239 },
1194 /* 1240 /*
1195 * Handled on control processor until the plugged processor manages 1241
1196 * this itself. 1242 * this itself.
@@ -1201,6 +1247,8 @@ static struct cpuhp_step cpuhp_bp_states[] = {
1201 .teardown = takedown_cpu, 1247 .teardown = takedown_cpu,
1202 .cant_stop = true, 1248 .cant_stop = true,
1203 }, 1249 },
1250#else
1251 [CPUHP_BRINGUP_CPU] = { },
1204#endif 1252#endif
1205}; 1253};
1206 1254
@@ -1225,6 +1273,10 @@ static struct cpuhp_step cpuhp_ap_states[] = {
1225 .startup = sched_cpu_starting, 1273 .startup = sched_cpu_starting,
1226 .teardown = sched_cpu_dying, 1274 .teardown = sched_cpu_dying,
1227 }, 1275 },
1276 [CPUHP_AP_RCUTREE_DYING] = {
1277 .startup = NULL,
1278 .teardown = rcutree_dying_cpu,
1279 },
1228 /* 1280 /*
1229 * Low level startup/teardown notifiers. Run with interrupts 1281 * Low level startup/teardown notifiers. Run with interrupts
1230 * disabled. Will be removed once the notifiers are converted to 1282 * disabled. Will be removed once the notifiers are converted to
@@ -1248,6 +1300,22 @@ static struct cpuhp_step cpuhp_ap_states[] = {
1248 .startup = smpboot_unpark_threads, 1300 .startup = smpboot_unpark_threads,
1249 .teardown = NULL, 1301 .teardown = NULL,
1250 }, 1302 },
1303 [CPUHP_AP_PERF_ONLINE] = {
1304 .name = "perf online",
1305 .startup = perf_event_init_cpu,
1306 .teardown = perf_event_exit_cpu,
1307 },
1308 [CPUHP_AP_WORKQUEUE_ONLINE] = {
1309 .name = "workqueue online",
1310 .startup = workqueue_online_cpu,
1311 .teardown = workqueue_offline_cpu,
1312 },
1313 [CPUHP_AP_RCUTREE_ONLINE] = {
1314 .name = "RCU-tree online",
1315 .startup = rcutree_online_cpu,
1316 .teardown = rcutree_offline_cpu,
1317 },
1318
1251 /* 1319 /*
1252 * Online/down_prepare notifiers. Will be removed once the notifiers 1320 * Online/down_prepare notifiers. Will be removed once the notifiers
1253 * are converted to states. 1321 * are converted to states.
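For comparison, subsystems that do not need a fixed slot in these tables can register a dynamic hotplug state instead; a sketch under the assumption of hypothetical foo_* callbacks:

static int foo_online_cpu(unsigned int cpu)
{
	/* bring up per-cpu resources for @cpu */
	return 0;
}

static int foo_offline_cpu(unsigned int cpu)
{
	/* tear down per-cpu resources of @cpu */
	return 0;
}

static int __init foo_init(void)
{
	int ret;

	/* dynamic states return the allocated state number on success */
	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "foo:online",
				foo_online_cpu, foo_offline_cpu);
	return ret < 0 ? ret : 0;
}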
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 73e93e53884d..c7fd2778ed50 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1034,15 +1034,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1034{ 1034{
1035 bool need_loop; 1035 bool need_loop;
1036 1036
1037 /*
1038 * Allow tasks that have access to memory reserves because they have
1039 * been OOM killed to get memory anywhere.
1040 */
1041 if (unlikely(test_thread_flag(TIF_MEMDIE)))
1042 return;
1043 if (current->flags & PF_EXITING) /* Let dying task have memory */
1044 return;
1045
1046 task_lock(tsk); 1037 task_lock(tsk);
1047 /* 1038 /*
1048 * Determine if a loop is necessary if another thread is doing 1039 * Determine if a loop is necessary if another thread is doing
diff --git a/kernel/cred.c b/kernel/cred.c
index 0c0cd8a62285..5f264fb5737d 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -689,6 +689,8 @@ EXPORT_SYMBOL(set_security_override_from_ctx);
689 */ 689 */
690int set_create_files_as(struct cred *new, struct inode *inode) 690int set_create_files_as(struct cred *new, struct inode *inode)
691{ 691{
692 if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
693 return -EINVAL;
692 new->fsuid = inode->i_uid; 694 new->fsuid = inode->i_uid;
693 new->fsgid = inode->i_gid; 695 new->fsgid = inode->i_gid;
694 return security_kernel_create_files_as(new, inode); 696 return security_kernel_create_files_as(new, inode);
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 179ef4640964..e9fdb5203de5 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -104,7 +104,7 @@ fail:
104 return -ENOMEM; 104 return -ENOMEM;
105} 105}
106 106
107int get_callchain_buffers(void) 107int get_callchain_buffers(int event_max_stack)
108{ 108{
109 int err = 0; 109 int err = 0;
110 int count; 110 int count;
@@ -121,6 +121,15 @@ int get_callchain_buffers(void)
121 /* If the allocation failed, give up */ 121 /* If the allocation failed, give up */
122 if (!callchain_cpus_entries) 122 if (!callchain_cpus_entries)
123 err = -ENOMEM; 123 err = -ENOMEM;
124 /*
125 * If requesting per event more than the global cap,
126 * return a different error to help userspace figure
127 * this out.
128 *
129 * And also do it here so that we have &callchain_mutex held.
130 */
131 if (event_max_stack > sysctl_perf_event_max_stack)
132 err = -EOVERFLOW;
124 goto exit; 133 goto exit;
125 } 134 }
126 135
@@ -174,11 +183,12 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
174 bool user = !event->attr.exclude_callchain_user; 183 bool user = !event->attr.exclude_callchain_user;
175 /* Disallow cross-task user callchains. */ 184 /* Disallow cross-task user callchains. */
176 bool crosstask = event->ctx->task && event->ctx->task != current; 185 bool crosstask = event->ctx->task && event->ctx->task != current;
186 const u32 max_stack = event->attr.sample_max_stack;
177 187
178 if (!kernel && !user) 188 if (!kernel && !user)
179 return NULL; 189 return NULL;
180 190
181 return get_perf_callchain(regs, 0, kernel, user, sysctl_perf_event_max_stack, crosstask, true); 191 return get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true);
182} 192}
183 193
184struct perf_callchain_entry * 194struct perf_callchain_entry *
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 274450efea90..356a6c7cb52a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -335,6 +335,7 @@ static atomic_t perf_sched_count;
335 335
336static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); 336static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
337static DEFINE_PER_CPU(int, perf_sched_cb_usages); 337static DEFINE_PER_CPU(int, perf_sched_cb_usages);
338static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
338 339
339static atomic_t nr_mmap_events __read_mostly; 340static atomic_t nr_mmap_events __read_mostly;
340static atomic_t nr_comm_events __read_mostly; 341static atomic_t nr_comm_events __read_mostly;
@@ -396,6 +397,13 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
396 if (ret || !write) 397 if (ret || !write)
397 return ret; 398 return ret;
398 399
400 /*
401 * If throttling is disabled don't allow the write:
402 */
403 if (sysctl_perf_cpu_time_max_percent == 100 ||
404 sysctl_perf_cpu_time_max_percent == 0)
405 return -EINVAL;
406
399 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); 407 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
400 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; 408 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
401 update_perf_cpu_limits(); 409 update_perf_cpu_limits();
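A small userspace probe of the new guard (using the standard procfs sysctl path): with perf_cpu_time_max_percent set to 0 or 100, writing the sample rate is now refused with EINVAL instead of being silently accepted:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/perf_event_max_sample_rate", "w");

	if (!f)
		return 1;
	if (fprintf(f, "50000\n") < 0 || fclose(f) != 0)
		perror("write");	/* expect EINVAL if throttling is disabled */
	return 0;
}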
@@ -1678,12 +1686,33 @@ static bool is_orphaned_event(struct perf_event *event)
1678 return event->state == PERF_EVENT_STATE_DEAD; 1686 return event->state == PERF_EVENT_STATE_DEAD;
1679} 1687}
1680 1688
1681static inline int pmu_filter_match(struct perf_event *event) 1689static inline int __pmu_filter_match(struct perf_event *event)
1682{ 1690{
1683 struct pmu *pmu = event->pmu; 1691 struct pmu *pmu = event->pmu;
1684 return pmu->filter_match ? pmu->filter_match(event) : 1; 1692 return pmu->filter_match ? pmu->filter_match(event) : 1;
1685} 1693}
1686 1694
1695/*
1696 * Check whether we should attempt to schedule an event group based on
1697 * PMU-specific filtering. An event group can consist of HW and SW events,
1698 * potentially with a SW leader, so we must check all the filters, to
1699 * determine whether a group is schedulable:
1700 */
1701static inline int pmu_filter_match(struct perf_event *event)
1702{
1703 struct perf_event *child;
1704
1705 if (!__pmu_filter_match(event))
1706 return 0;
1707
1708 list_for_each_entry(child, &event->sibling_list, group_entry) {
1709 if (!__pmu_filter_match(child))
1710 return 0;
1711 }
1712
1713 return 1;
1714}
1715
1687static inline int 1716static inline int
1688event_filter_match(struct perf_event *event) 1717event_filter_match(struct perf_event *event)
1689{ 1718{
@@ -3665,6 +3694,39 @@ static void free_event_rcu(struct rcu_head *head)
3665static void ring_buffer_attach(struct perf_event *event, 3694static void ring_buffer_attach(struct perf_event *event,
3666 struct ring_buffer *rb); 3695 struct ring_buffer *rb);
3667 3696
3697static void detach_sb_event(struct perf_event *event)
3698{
3699 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
3700
3701 raw_spin_lock(&pel->lock);
3702 list_del_rcu(&event->sb_list);
3703 raw_spin_unlock(&pel->lock);
3704}
3705
3706static bool is_sb_event(struct perf_event *event)
3707{
3708 struct perf_event_attr *attr = &event->attr;
3709
3710 if (event->parent)
3711 return false;
3712
3713 if (event->attach_state & PERF_ATTACH_TASK)
3714 return false;
3715
3716 if (attr->mmap || attr->mmap_data || attr->mmap2 ||
3717 attr->comm || attr->comm_exec ||
3718 attr->task ||
3719 attr->context_switch)
3720 return true;
3721 return false;
3722}
3723
3724static void unaccount_pmu_sb_event(struct perf_event *event)
3725{
3726 if (is_sb_event(event))
3727 detach_sb_event(event);
3728}
3729
3668static void unaccount_event_cpu(struct perf_event *event, int cpu) 3730static void unaccount_event_cpu(struct perf_event *event, int cpu)
3669{ 3731{
3670 if (event->parent) 3732 if (event->parent)
@@ -3728,6 +3790,8 @@ static void unaccount_event(struct perf_event *event)
3728 } 3790 }
3729 3791
3730 unaccount_event_cpu(event, event->cpu); 3792 unaccount_event_cpu(event, event->cpu);
3793
3794 unaccount_pmu_sb_event(event);
3731} 3795}
3732 3796
3733static void perf_sched_delayed(struct work_struct *work) 3797static void perf_sched_delayed(struct work_struct *work)
@@ -3862,10 +3926,8 @@ static void _free_event(struct perf_event *event)
3862 if (event->ctx) 3926 if (event->ctx)
3863 put_ctx(event->ctx); 3927 put_ctx(event->ctx);
3864 3928
3865 if (event->pmu) { 3929 exclusive_event_destroy(event);
3866 exclusive_event_destroy(event); 3930 module_put(event->pmu->module);
3867 module_put(event->pmu->module);
3868 }
3869 3931
3870 call_rcu(&event->rcu_head, free_event_rcu); 3932 call_rcu(&event->rcu_head, free_event_rcu);
3871} 3933}
@@ -5555,16 +5617,26 @@ void perf_output_sample(struct perf_output_handle *handle,
5555 } 5617 }
5556 5618
5557 if (sample_type & PERF_SAMPLE_RAW) { 5619 if (sample_type & PERF_SAMPLE_RAW) {
5558 if (data->raw) { 5620 struct perf_raw_record *raw = data->raw;
5559 u32 raw_size = data->raw->size; 5621
5560 u32 real_size = round_up(raw_size + sizeof(u32), 5622 if (raw) {
5561 sizeof(u64)) - sizeof(u32); 5623 struct perf_raw_frag *frag = &raw->frag;
5562 u64 zero = 0; 5624
5563 5625 perf_output_put(handle, raw->size);
5564 perf_output_put(handle, real_size); 5626 do {
5565 __output_copy(handle, data->raw->data, raw_size); 5627 if (frag->copy) {
5566 if (real_size - raw_size) 5628 __output_custom(handle, frag->copy,
5567 __output_copy(handle, &zero, real_size - raw_size); 5629 frag->data, frag->size);
5630 } else {
5631 __output_copy(handle, frag->data,
5632 frag->size);
5633 }
5634 if (perf_raw_frag_last(frag))
5635 break;
5636 frag = frag->next;
5637 } while (1);
5638 if (frag->pad)
5639 __output_skip(handle, NULL, frag->pad);
5568 } else { 5640 } else {
5569 struct { 5641 struct {
5570 u32 size; 5642 u32 size;
@@ -5689,14 +5761,28 @@ void perf_prepare_sample(struct perf_event_header *header,
5689 } 5761 }
5690 5762
5691 if (sample_type & PERF_SAMPLE_RAW) { 5763 if (sample_type & PERF_SAMPLE_RAW) {
5692 int size = sizeof(u32); 5764 struct perf_raw_record *raw = data->raw;
5693 5765 int size;
5694 if (data->raw) 5766
5695 size += data->raw->size; 5767 if (raw) {
5696 else 5768 struct perf_raw_frag *frag = &raw->frag;
5697 size += sizeof(u32); 5769 u32 sum = 0;
5770
5771 do {
5772 sum += frag->size;
5773 if (perf_raw_frag_last(frag))
5774 break;
5775 frag = frag->next;
5776 } while (1);
5777
5778 size = round_up(sum + sizeof(u32), sizeof(u64));
5779 raw->size = size - sizeof(u32);
5780 frag->pad = raw->size - sum;
5781 } else {
5782 size = sizeof(u64);
5783 }
5698 5784
5699 header->size += round_up(size, sizeof(u64)); 5785 header->size += size;
5700 } 5786 }
5701 5787
5702 if (sample_type & PERF_SAMPLE_BRANCH_STACK) { 5788 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
@@ -5856,11 +5942,11 @@ perf_event_read_event(struct perf_event *event,
5856 perf_output_end(&handle); 5942 perf_output_end(&handle);
5857} 5943}
5858 5944
5859typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); 5945typedef void (perf_iterate_f)(struct perf_event *event, void *data);
5860 5946
5861static void 5947static void
5862perf_event_aux_ctx(struct perf_event_context *ctx, 5948perf_iterate_ctx(struct perf_event_context *ctx,
5863 perf_event_aux_output_cb output, 5949 perf_iterate_f output,
5864 void *data, bool all) 5950 void *data, bool all)
5865{ 5951{
5866 struct perf_event *event; 5952 struct perf_event *event;
@@ -5877,52 +5963,55 @@ perf_event_aux_ctx(struct perf_event_context *ctx,
5877 } 5963 }
5878} 5964}
5879 5965
5880static void 5966static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
5881perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data,
5882 struct perf_event_context *task_ctx)
5883{ 5967{
5884 rcu_read_lock(); 5968 struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
5885 preempt_disable(); 5969 struct perf_event *event;
5886 perf_event_aux_ctx(task_ctx, output, data, false); 5970
5887 preempt_enable(); 5971 list_for_each_entry_rcu(event, &pel->list, sb_list) {
5888 rcu_read_unlock(); 5972 if (event->state < PERF_EVENT_STATE_INACTIVE)
5973 continue;
5974 if (!event_filter_match(event))
5975 continue;
5976 output(event, data);
5977 }
5889} 5978}
5890 5979
5980/*
5981 * Iterate all events that need to receive side-band events.
5982 *
 5983 * For new callers, ensure that account_pmu_sb_event() includes
5984 * your event, otherwise it might not get delivered.
5985 */
5891static void 5986static void
5892perf_event_aux(perf_event_aux_output_cb output, void *data, 5987perf_iterate_sb(perf_iterate_f output, void *data,
5893 struct perf_event_context *task_ctx) 5988 struct perf_event_context *task_ctx)
5894{ 5989{
5895 struct perf_cpu_context *cpuctx;
5896 struct perf_event_context *ctx; 5990 struct perf_event_context *ctx;
5897 struct pmu *pmu;
5898 int ctxn; 5991 int ctxn;
5899 5992
5993 rcu_read_lock();
5994 preempt_disable();
5995
5900 /* 5996 /*
5901 * If we have task_ctx != NULL we only notify 5997 * If we have task_ctx != NULL we only notify the task context itself.
5902 * the task context itself. The task_ctx is set 5998 * The task_ctx is set only for EXIT events before releasing task
5903 * only for EXIT events before releasing task
5904 * context. 5999 * context.
5905 */ 6000 */
5906 if (task_ctx) { 6001 if (task_ctx) {
5907 perf_event_aux_task_ctx(output, data, task_ctx); 6002 perf_iterate_ctx(task_ctx, output, data, false);
5908 return; 6003 goto done;
5909 } 6004 }
5910 6005
5911 rcu_read_lock(); 6006 perf_iterate_sb_cpu(output, data);
5912 list_for_each_entry_rcu(pmu, &pmus, entry) { 6007
5913 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 6008 for_each_task_context_nr(ctxn) {
5914 if (cpuctx->unique_pmu != pmu)
5915 goto next;
5916 perf_event_aux_ctx(&cpuctx->ctx, output, data, false);
5917 ctxn = pmu->task_ctx_nr;
5918 if (ctxn < 0)
5919 goto next;
5920 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); 6009 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
5921 if (ctx) 6010 if (ctx)
5922 perf_event_aux_ctx(ctx, output, data, false); 6011 perf_iterate_ctx(ctx, output, data, false);
5923next:
5924 put_cpu_ptr(pmu->pmu_cpu_context);
5925 } 6012 }
6013done:
6014 preempt_enable();
5926 rcu_read_unlock(); 6015 rcu_read_unlock();
5927} 6016}
5928 6017
@@ -5971,7 +6060,7 @@ void perf_event_exec(void)
5971 6060
5972 perf_event_enable_on_exec(ctxn); 6061 perf_event_enable_on_exec(ctxn);
5973 6062
5974 perf_event_aux_ctx(ctx, perf_event_addr_filters_exec, NULL, 6063 perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
5975 true); 6064 true);
5976 } 6065 }
5977 rcu_read_unlock(); 6066 rcu_read_unlock();
@@ -6015,9 +6104,9 @@ static int __perf_pmu_output_stop(void *info)
6015 }; 6104 };
6016 6105
6017 rcu_read_lock(); 6106 rcu_read_lock();
6018 perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); 6107 perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
6019 if (cpuctx->task_ctx) 6108 if (cpuctx->task_ctx)
6020 perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop, 6109 perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
6021 &ro, false); 6110 &ro, false);
6022 rcu_read_unlock(); 6111 rcu_read_unlock();
6023 6112
@@ -6146,7 +6235,7 @@ static void perf_event_task(struct task_struct *task,
6146 }, 6235 },
6147 }; 6236 };
6148 6237
6149 perf_event_aux(perf_event_task_output, 6238 perf_iterate_sb(perf_event_task_output,
6150 &task_event, 6239 &task_event,
6151 task_ctx); 6240 task_ctx);
6152} 6241}
@@ -6225,7 +6314,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
6225 6314
6226 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 6315 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
6227 6316
6228 perf_event_aux(perf_event_comm_output, 6317 perf_iterate_sb(perf_event_comm_output,
6229 comm_event, 6318 comm_event,
6230 NULL); 6319 NULL);
6231} 6320}
@@ -6456,7 +6545,7 @@ got_name:
6456 6545
6457 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 6546 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
6458 6547
6459 perf_event_aux(perf_event_mmap_output, 6548 perf_iterate_sb(perf_event_mmap_output,
6460 mmap_event, 6549 mmap_event,
6461 NULL); 6550 NULL);
6462 6551
@@ -6539,7 +6628,7 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma)
6539 if (!ctx) 6628 if (!ctx)
6540 continue; 6629 continue;
6541 6630
6542 perf_event_aux_ctx(ctx, __perf_addr_filters_adjust, vma, true); 6631 perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
6543 } 6632 }
6544 rcu_read_unlock(); 6633 rcu_read_unlock();
6545} 6634}
@@ -6726,7 +6815,7 @@ static void perf_event_switch(struct task_struct *task,
6726 }, 6815 },
6727 }; 6816 };
6728 6817
6729 perf_event_aux(perf_event_switch_output, 6818 perf_iterate_sb(perf_event_switch_output,
6730 &switch_event, 6819 &switch_event,
6731 NULL); 6820 NULL);
6732} 6821}
@@ -7333,7 +7422,7 @@ static struct pmu perf_swevent = {
7333static int perf_tp_filter_match(struct perf_event *event, 7422static int perf_tp_filter_match(struct perf_event *event,
7334 struct perf_sample_data *data) 7423 struct perf_sample_data *data)
7335{ 7424{
7336 void *record = data->raw->data; 7425 void *record = data->raw->frag.data;
7337 7426
7338 /* only top level events have filters set */ 7427 /* only top level events have filters set */
7339 if (event->parent) 7428 if (event->parent)
@@ -7389,8 +7478,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
7389 struct perf_event *event; 7478 struct perf_event *event;
7390 7479
7391 struct perf_raw_record raw = { 7480 struct perf_raw_record raw = {
7392 .size = entry_size, 7481 .frag = {
7393 .data = record, 7482 .size = entry_size,
7483 .data = record,
7484 },
7394 }; 7485 };
7395 7486
7396 perf_sample_data_init(&data, 0, 0); 7487 perf_sample_data_init(&data, 0, 0);
@@ -8648,6 +8739,28 @@ unlock:
8648 return pmu; 8739 return pmu;
8649} 8740}
8650 8741
8742static void attach_sb_event(struct perf_event *event)
8743{
8744 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
8745
8746 raw_spin_lock(&pel->lock);
8747 list_add_rcu(&event->sb_list, &pel->list);
8748 raw_spin_unlock(&pel->lock);
8749}
8750
8751/*
8752 * We keep a list of all !task (and therefore per-cpu) events
8753 * that need to receive side-band records.
8754 *
8755 * This avoids having to scan all the various PMU per-cpu contexts
8756 * looking for them.
8757 */
8758static void account_pmu_sb_event(struct perf_event *event)
8759{
8760 if (is_sb_event(event))
8761 attach_sb_event(event);
8762}
8763
8651static void account_event_cpu(struct perf_event *event, int cpu) 8764static void account_event_cpu(struct perf_event *event, int cpu)
8652{ 8765{
8653 if (event->parent) 8766 if (event->parent)
@@ -8728,6 +8841,8 @@ static void account_event(struct perf_event *event)
8728enabled: 8841enabled:
8729 8842
8730 account_event_cpu(event, event->cpu); 8843 account_event_cpu(event, event->cpu);
8844
8845 account_pmu_sb_event(event);
8731} 8846}
8732 8847
8733/* 8848/*
@@ -8876,7 +8991,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
8876 8991
8877 if (!event->parent) { 8992 if (!event->parent) {
8878 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { 8993 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
8879 err = get_callchain_buffers(); 8994 err = get_callchain_buffers(attr->sample_max_stack);
8880 if (err) 8995 if (err)
8881 goto err_addr_filters; 8996 goto err_addr_filters;
8882 } 8997 }
@@ -9198,6 +9313,9 @@ SYSCALL_DEFINE5(perf_event_open,
9198 return -EINVAL; 9313 return -EINVAL;
9199 } 9314 }
9200 9315
9316 if (!attr.sample_max_stack)
9317 attr.sample_max_stack = sysctl_perf_event_max_stack;
9318
9201 /* 9319 /*
9202 * In cgroup mode, the pid argument is used to pass the fd 9320 * In cgroup mode, the pid argument is used to pass the fd
9203 * opened to the cgroup directory in cgroupfs. The cpu argument 9321 * opened to the cgroup directory in cgroupfs. The cpu argument
@@ -9271,7 +9389,7 @@ SYSCALL_DEFINE5(perf_event_open,
9271 9389
9272 if (is_sampling_event(event)) { 9390 if (is_sampling_event(event)) {
9273 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { 9391 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
9274 err = -ENOTSUPP; 9392 err = -EOPNOTSUPP;
9275 goto err_alloc; 9393 goto err_alloc;
9276 } 9394 }
9277 } 9395 }
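To illustrate the userspace side (a sketch assuming the updated uapi perf_event_attr with the new sample_max_stack field), a sampling event can now cap its own callchain depth, with 0 meaning "use the sysctl default":

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int open_cycles_with_callchain(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.size = sizeof(attr);
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_CALLCHAIN;
	attr.sample_max_stack = 32;	/* per-event cap; 0 falls back to the sysctl */

	/* monitor the calling thread on any CPU */
	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}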
@@ -10233,10 +10351,13 @@ static void __init perf_event_init_all_cpus(void)
10233 swhash = &per_cpu(swevent_htable, cpu); 10351 swhash = &per_cpu(swevent_htable, cpu);
10234 mutex_init(&swhash->hlist_mutex); 10352 mutex_init(&swhash->hlist_mutex);
10235 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); 10353 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
10354
10355 INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
10356 raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
10236 } 10357 }
10237} 10358}
10238 10359
10239static void perf_event_init_cpu(int cpu) 10360int perf_event_init_cpu(unsigned int cpu)
10240{ 10361{
10241 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); 10362 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
10242 10363
@@ -10249,6 +10370,7 @@ static void perf_event_init_cpu(int cpu)
10249 rcu_assign_pointer(swhash->swevent_hlist, hlist); 10370 rcu_assign_pointer(swhash->swevent_hlist, hlist);
10250 } 10371 }
10251 mutex_unlock(&swhash->hlist_mutex); 10372 mutex_unlock(&swhash->hlist_mutex);
10373 return 0;
10252} 10374}
10253 10375
10254#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE 10376#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
@@ -10280,14 +10402,17 @@ static void perf_event_exit_cpu_context(int cpu)
10280 } 10402 }
10281 srcu_read_unlock(&pmus_srcu, idx); 10403 srcu_read_unlock(&pmus_srcu, idx);
10282} 10404}
10405#else
10406
10407static void perf_event_exit_cpu_context(int cpu) { }
10283 10408
10284static void perf_event_exit_cpu(int cpu) 10409#endif
10410
10411int perf_event_exit_cpu(unsigned int cpu)
10285{ 10412{
10286 perf_event_exit_cpu_context(cpu); 10413 perf_event_exit_cpu_context(cpu);
10414 return 0;
10287} 10415}
10288#else
10289static inline void perf_event_exit_cpu(int cpu) { }
10290#endif
10291 10416
10292static int 10417static int
10293perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) 10418perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
@@ -10309,46 +10434,6 @@ static struct notifier_block perf_reboot_notifier = {
10309 .priority = INT_MIN, 10434 .priority = INT_MIN,
10310}; 10435};
10311 10436
10312static int
10313perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
10314{
10315 unsigned int cpu = (long)hcpu;
10316
10317 switch (action & ~CPU_TASKS_FROZEN) {
10318
10319 case CPU_UP_PREPARE:
10320 /*
10321 * This must be done before the CPU comes alive, because the
10322 * moment we can run tasks we can encounter (software) events.
10323 *
10324 * Specifically, someone can have inherited events on kthreadd
10325 * or a pre-existing worker thread that gets re-bound.
10326 */
10327 perf_event_init_cpu(cpu);
10328 break;
10329
10330 case CPU_DOWN_PREPARE:
10331 /*
10332 * This must be done before the CPU dies because after that an
10333 * active event might want to IPI the CPU and that'll not work
10334 * so great for dead CPUs.
10335 *
10336 * XXX smp_call_function_single() return -ENXIO without a warn
10337 * so we could possibly deal with this.
10338 *
10339 * This is safe against new events arriving because
10340 * sys_perf_event_open() serializes against hotplug using
10341 * get_online_cpus().
10342 */
10343 perf_event_exit_cpu(cpu);
10344 break;
10345 default:
10346 break;
10347 }
10348
10349 return NOTIFY_OK;
10350}
10351
10352void __init perf_event_init(void) 10437void __init perf_event_init(void)
10353{ 10438{
10354 int ret; 10439 int ret;
@@ -10361,7 +10446,7 @@ void __init perf_event_init(void)
10361 perf_pmu_register(&perf_cpu_clock, NULL, -1); 10446 perf_pmu_register(&perf_cpu_clock, NULL, -1);
10362 perf_pmu_register(&perf_task_clock, NULL, -1); 10447 perf_pmu_register(&perf_task_clock, NULL, -1);
10363 perf_tp_register(); 10448 perf_tp_register();
10364 perf_cpu_notifier(perf_cpu_notify); 10449 perf_event_init_cpu(smp_processor_id());
10365 register_reboot_notifier(&perf_reboot_notifier); 10450 register_reboot_notifier(&perf_reboot_notifier);
10366 10451
10367 ret = init_hw_breakpoint(); 10452 ret = init_hw_breakpoint();
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 05f9f6d626df..486fd78eb8d5 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -123,21 +123,19 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb)
123 return rb->aux_nr_pages << PAGE_SHIFT; 123 return rb->aux_nr_pages << PAGE_SHIFT;
124} 124}
125 125
126#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ 126#define __DEFINE_OUTPUT_COPY_BODY(advance_buf, memcpy_func, ...) \
127static inline unsigned long \
128func_name(struct perf_output_handle *handle, \
129 const void *buf, unsigned long len) \
130{ \ 127{ \
131 unsigned long size, written; \ 128 unsigned long size, written; \
132 \ 129 \
133 do { \ 130 do { \
134 size = min(handle->size, len); \ 131 size = min(handle->size, len); \
135 written = memcpy_func(handle->addr, buf, size); \ 132 written = memcpy_func(__VA_ARGS__); \
136 written = size - written; \ 133 written = size - written; \
137 \ 134 \
138 len -= written; \ 135 len -= written; \
139 handle->addr += written; \ 136 handle->addr += written; \
140 buf += written; \ 137 if (advance_buf) \
138 buf += written; \
141 handle->size -= written; \ 139 handle->size -= written; \
142 if (!handle->size) { \ 140 if (!handle->size) { \
143 struct ring_buffer *rb = handle->rb; \ 141 struct ring_buffer *rb = handle->rb; \
@@ -152,6 +150,21 @@ func_name(struct perf_output_handle *handle, \
152 return len; \ 150 return len; \
153} 151}
154 152
153#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
154static inline unsigned long \
155func_name(struct perf_output_handle *handle, \
156 const void *buf, unsigned long len) \
157__DEFINE_OUTPUT_COPY_BODY(true, memcpy_func, handle->addr, buf, size)
158
159static inline unsigned long
160__output_custom(struct perf_output_handle *handle, perf_copy_f copy_func,
161 const void *buf, unsigned long len)
162{
163 unsigned long orig_len = len;
164 __DEFINE_OUTPUT_COPY_BODY(false, copy_func, handle->addr, buf,
165 orig_len - len, size)
166}
167
155static inline unsigned long 168static inline unsigned long
156memcpy_common(void *dst, const void *src, unsigned long n) 169memcpy_common(void *dst, const void *src, unsigned long n)
157{ 170{
diff --git a/kernel/exit.c b/kernel/exit.c
index 9e6e1356e6bb..84ae830234f8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -211,6 +211,82 @@ repeat:
211} 211}
212 212
213/* 213/*
214 * Note that if this function returns a valid task_struct pointer (!NULL)
215 * task->usage must remain >0 for the duration of the RCU critical section.
216 */
217struct task_struct *task_rcu_dereference(struct task_struct **ptask)
218{
219 struct sighand_struct *sighand;
220 struct task_struct *task;
221
222 /*
223 * We need to verify that release_task() was not called and thus
224 * delayed_put_task_struct() can't run and drop the last reference
225 * before rcu_read_unlock(). We check task->sighand != NULL,
226 * but we can read the already freed and reused memory.
227 */
228retry:
229 task = rcu_dereference(*ptask);
230 if (!task)
231 return NULL;
232
233 probe_kernel_address(&task->sighand, sighand);
234
235 /*
236 * Pairs with atomic_dec_and_test() in put_task_struct(). If this task
237 * was already freed we can not miss the preceding update of this
238 * pointer.
239 */
240 smp_rmb();
241 if (unlikely(task != READ_ONCE(*ptask)))
242 goto retry;
243
244 /*
245 * We've re-checked that "task == *ptask", now we have two different
246 * cases:
247 *
248 * 1. This is actually the same task/task_struct. In this case
249 * sighand != NULL tells us it is still alive.
250 *
251 * 2. This is another task which got the same memory for task_struct.
252 * We can't know this of course, and we can not trust
253 * sighand != NULL.
254 *
255 * In this case we actually return a random value, but this is
256 * correct.
257 *
258 * If we return NULL - we can pretend that we actually noticed that
259 * *ptask was updated when the previous task has exited. Or pretend
 260 * that probe_kernel_address(&sighand) reads NULL.
261 *
262 * If we return the new task (because sighand is not NULL for any
263 * reason) - this is fine too. This (new) task can't go away before
264 * another gp pass.
265 *
266 * And note: We could even eliminate the false positive if re-read
267 * task->sighand once again to avoid the falsely NULL. But this case
268 * is very unlikely so we don't care.
269 */
270 if (!sighand)
271 return NULL;
272
273 return task;
274}
275
276struct task_struct *try_get_task_struct(struct task_struct **ptask)
277{
278 struct task_struct *task;
279
280 rcu_read_lock();
281 task = task_rcu_dereference(ptask);
282 if (task)
283 get_task_struct(task);
284 rcu_read_unlock();
285
286 return task;
287}
288
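A minimal usage sketch for the new helper (owner_slot is a hypothetical, stable "struct task_struct *" location, not from this patch); the caller either gets a counted reference or NULL, and never dereferences freed memory:

	struct task_struct *p;

	p = try_get_task_struct(&owner_slot);
	if (p) {
		/* counted reference held: p is safe to use outside rcu_read_lock() */
		put_task_struct(p);
	}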
289/*
214 * Determine if a process group is "orphaned", according to the POSIX 290 * Determine if a process group is "orphaned", according to the POSIX
215 * definition in 2.2.2.52. Orphaned process groups are not to be affected 291 * definition in 2.2.2.52. Orphaned process groups are not to be affected
216 * by terminal-generated stop signals. Newly orphaned process groups are 292 * by terminal-generated stop signals. Newly orphaned process groups are
@@ -700,10 +776,14 @@ void do_exit(long code)
700 776
701 exit_signals(tsk); /* sets PF_EXITING */ 777 exit_signals(tsk); /* sets PF_EXITING */
702 /* 778 /*
703 * tsk->flags are checked in the futex code to protect against 779 * Ensure that all new tsk->pi_lock acquisitions must observe
704 * an exiting task cleaning up the robust pi futexes. 780 * PF_EXITING. Serializes against futex.c:attach_to_pi_owner().
705 */ 781 */
706 smp_mb(); 782 smp_mb();
783 /*
784 * Ensure that we must observe the pi_state in exit_mm() ->
785 * mm_release() -> exit_pi_state_list().
786 */
707 raw_spin_unlock_wait(&tsk->pi_lock); 787 raw_spin_unlock_wait(&tsk->pi_lock);
708 788
709 if (unlikely(in_atomic())) { 789 if (unlikely(in_atomic())) {
diff --git a/kernel/fork.c b/kernel/fork.c
index 5c2c355aa97f..52e725d4a866 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -148,57 +148,49 @@ static inline void free_task_struct(struct task_struct *tsk)
148} 148}
149#endif 149#endif
150 150
151void __weak arch_release_thread_info(struct thread_info *ti) 151void __weak arch_release_thread_stack(unsigned long *stack)
152{ 152{
153} 153}
154 154
155#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR 155#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
156 156
157/* 157/*
158 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a 158 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
159 * kmemcache based allocator. 159 * kmemcache based allocator.
160 */ 160 */
161# if THREAD_SIZE >= PAGE_SIZE 161# if THREAD_SIZE >= PAGE_SIZE
162static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, 162static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
163 int node) 163 int node)
164{ 164{
165 struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, 165 struct page *page = alloc_pages_node(node, THREADINFO_GFP,
166 THREAD_SIZE_ORDER); 166 THREAD_SIZE_ORDER);
167
168 if (page)
169 memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
170 1 << THREAD_SIZE_ORDER);
171 167
172 return page ? page_address(page) : NULL; 168 return page ? page_address(page) : NULL;
173} 169}
174 170
175static inline void free_thread_info(struct thread_info *ti) 171static inline void free_thread_stack(unsigned long *stack)
176{ 172{
177 struct page *page = virt_to_page(ti); 173 __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);
178
179 memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
180 -(1 << THREAD_SIZE_ORDER));
181 __free_kmem_pages(page, THREAD_SIZE_ORDER);
182} 174}
183# else 175# else
184static struct kmem_cache *thread_info_cache; 176static struct kmem_cache *thread_stack_cache;
185 177
186static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, 178static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
187 int node) 179 int node)
188{ 180{
189 return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node); 181 return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
190} 182}
191 183
192static void free_thread_info(struct thread_info *ti) 184static void free_thread_stack(unsigned long *stack)
193{ 185{
194 kmem_cache_free(thread_info_cache, ti); 186 kmem_cache_free(thread_stack_cache, stack);
195} 187}
196 188
197void thread_info_cache_init(void) 189void thread_stack_cache_init(void)
198{ 190{
199 thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE, 191 thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE,
200 THREAD_SIZE, 0, NULL); 192 THREAD_SIZE, 0, NULL);
201 BUG_ON(thread_info_cache == NULL); 193 BUG_ON(thread_stack_cache == NULL);
202} 194}
203# endif 195# endif
204#endif 196#endif
@@ -221,18 +213,24 @@ struct kmem_cache *vm_area_cachep;
221/* SLAB cache for mm_struct structures (tsk->mm) */ 213/* SLAB cache for mm_struct structures (tsk->mm) */
222static struct kmem_cache *mm_cachep; 214static struct kmem_cache *mm_cachep;
223 215
224static void account_kernel_stack(struct thread_info *ti, int account) 216static void account_kernel_stack(unsigned long *stack, int account)
225{ 217{
226 struct zone *zone = page_zone(virt_to_page(ti)); 218 /* All stack pages are in the same zone and belong to the same memcg. */
219 struct page *first_page = virt_to_page(stack);
220
221 mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
222 THREAD_SIZE / 1024 * account);
227 223
228 mod_zone_page_state(zone, NR_KERNEL_STACK, account); 224 memcg_kmem_update_page_stat(
225 first_page, MEMCG_KERNEL_STACK_KB,
226 account * (THREAD_SIZE / 1024));
229} 227}
230 228
231void free_task(struct task_struct *tsk) 229void free_task(struct task_struct *tsk)
232{ 230{
233 account_kernel_stack(tsk->stack, -1); 231 account_kernel_stack(tsk->stack, -1);
234 arch_release_thread_info(tsk->stack); 232 arch_release_thread_stack(tsk->stack);
235 free_thread_info(tsk->stack); 233 free_thread_stack(tsk->stack);
236 rt_mutex_debug_task_free(tsk); 234 rt_mutex_debug_task_free(tsk);
237 ftrace_graph_exit_task(tsk); 235 ftrace_graph_exit_task(tsk);
238 put_seccomp_filter(tsk); 236 put_seccomp_filter(tsk);
@@ -343,7 +341,7 @@ void set_task_stack_end_magic(struct task_struct *tsk)
343static struct task_struct *dup_task_struct(struct task_struct *orig, int node) 341static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
344{ 342{
345 struct task_struct *tsk; 343 struct task_struct *tsk;
346 struct thread_info *ti; 344 unsigned long *stack;
347 int err; 345 int err;
348 346
349 if (node == NUMA_NO_NODE) 347 if (node == NUMA_NO_NODE)
@@ -352,15 +350,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
352 if (!tsk) 350 if (!tsk)
353 return NULL; 351 return NULL;
354 352
355 ti = alloc_thread_info_node(tsk, node); 353 stack = alloc_thread_stack_node(tsk, node);
356 if (!ti) 354 if (!stack)
357 goto free_tsk; 355 goto free_tsk;
358 356
359 err = arch_dup_task_struct(tsk, orig); 357 err = arch_dup_task_struct(tsk, orig);
360 if (err) 358 if (err)
361 goto free_ti; 359 goto free_stack;
362 360
363 tsk->stack = ti; 361 tsk->stack = stack;
364#ifdef CONFIG_SECCOMP 362#ifdef CONFIG_SECCOMP
365 /* 363 /*
366 * We must handle setting up seccomp filters once we're under 364 * We must handle setting up seccomp filters once we're under
@@ -392,14 +390,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
392 tsk->task_frag.page = NULL; 390 tsk->task_frag.page = NULL;
393 tsk->wake_q.next = NULL; 391 tsk->wake_q.next = NULL;
394 392
395 account_kernel_stack(ti, 1); 393 account_kernel_stack(stack, 1);
396 394
397 kcov_task_init(tsk); 395 kcov_task_init(tsk);
398 396
399 return tsk; 397 return tsk;
400 398
401free_ti: 399free_stack:
402 free_thread_info(ti); 400 free_thread_stack(stack);
403free_tsk: 401free_tsk:
404 free_task_struct(tsk); 402 free_task_struct(tsk);
405 return NULL; 403 return NULL;
diff --git a/kernel/freezer.c b/kernel/freezer.c
index a8900a3bc27a..6f56a9e219fa 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -42,7 +42,7 @@ bool freezing_slow_path(struct task_struct *p)
42 if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) 42 if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
43 return false; 43 return false;
44 44
45 if (test_thread_flag(TIF_MEMDIE)) 45 if (test_tsk_thread_flag(p, TIF_MEMDIE))
46 return false; 46 return false;
47 47
48 if (pm_nosig_freezing || cgroup_freezing(p)) 48 if (pm_nosig_freezing || cgroup_freezing(p))
diff --git a/kernel/futex.c b/kernel/futex.c
index ee25f5ba4aca..33664f70e2d2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -469,7 +469,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
469{ 469{
470 unsigned long address = (unsigned long)uaddr; 470 unsigned long address = (unsigned long)uaddr;
471 struct mm_struct *mm = current->mm; 471 struct mm_struct *mm = current->mm;
472 struct page *page; 472 struct page *page, *tail;
473 struct address_space *mapping; 473 struct address_space *mapping;
474 int err, ro = 0; 474 int err, ro = 0;
475 475
@@ -530,7 +530,15 @@ again:
530 * considered here and page lock forces unnecessary serialization 530
531 * From this point on, mapping will be re-verified if necessary and 531 * From this point on, mapping will be re-verified if necessary and
532 * page lock will be acquired only if it is unavoidable 532 * page lock will be acquired only if it is unavoidable
533 */ 533 *
534 * Mapping checks require the head page for any compound page so the
 535 * head page and mapping are looked up now. For anonymous pages, it
536 * does not matter if the page splits in the future as the key is
537 * based on the address. For filesystem-backed pages, the tail is
538 * required as the index of the page determines the key. For
539 * base pages, there is no tail page and tail == page.
540 */
541 tail = page;
534 page = compound_head(page); 542 page = compound_head(page);
535 mapping = READ_ONCE(page->mapping); 543 mapping = READ_ONCE(page->mapping);
536 544
@@ -654,7 +662,7 @@ again:
654 662
655 key->both.offset |= FUT_OFF_INODE; /* inode-based key */ 663 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
656 key->shared.inode = inode; 664 key->shared.inode = inode;
657 key->shared.pgoff = basepage_index(page); 665 key->shared.pgoff = basepage_index(tail);
658 rcu_read_unlock(); 666 rcu_read_unlock();
659 } 667 }
660 668
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index e25e92fb44fa..6a5c239c7669 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -18,7 +18,7 @@
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include "gcov.h" 19#include "gcov.h"
20 20
21#if __GNUC__ == 5 && __GNUC_MINOR__ >= 1 21#if (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1)
22#define GCOV_COUNTERS 10 22#define GCOV_COUNTERS 10
23#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9 23#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9
24#define GCOV_COUNTERS 9 24#define GCOV_COUNTERS 9
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 2ee42e95a3ce..1d3ee3169202 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -9,3 +9,4 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
9obj-$(CONFIG_PM_SLEEP) += pm.o 9obj-$(CONFIG_PM_SLEEP) += pm.o
10obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o 10obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
11obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o 11obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
12obj-$(CONFIG_SMP) += affinity.o
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
new file mode 100644
index 000000000000..f68959341c0f
--- /dev/null
+++ b/kernel/irq/affinity.c
@@ -0,0 +1,61 @@
1
2#include <linux/interrupt.h>
3#include <linux/kernel.h>
4#include <linux/slab.h>
5#include <linux/cpu.h>
6
7static int get_first_sibling(unsigned int cpu)
8{
9 unsigned int ret;
10
11 ret = cpumask_first(topology_sibling_cpumask(cpu));
12 if (ret < nr_cpu_ids)
13 return ret;
14 return cpu;
15}
16
17/*
18 * Take a map of online CPUs and the number of available interrupt vectors
19 * and generate an output cpumask suitable for spreading MSI/MSI-X vectors
 20 * so that they are distributed as well as possible around the CPUs. If
21 * more vectors than CPUs are available we'll map one to each CPU,
22 * otherwise we map one to the first sibling of each socket.
23 *
24 * If there are more vectors than CPUs we will still only have one bit
25 * set per CPU, but interrupt code will keep on assigning the vectors from
26 * the start of the bitmap until we run out of vectors.
27 */
28struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs)
29{
30 struct cpumask *affinity_mask;
31 unsigned int max_vecs = *nr_vecs;
32
33 if (max_vecs == 1)
34 return NULL;
35
36 affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL);
37 if (!affinity_mask) {
38 *nr_vecs = 1;
39 return NULL;
40 }
41
42 if (max_vecs >= num_online_cpus()) {
43 cpumask_copy(affinity_mask, cpu_online_mask);
44 *nr_vecs = num_online_cpus();
45 } else {
46 unsigned int vecs = 0, cpu;
47
48 for_each_online_cpu(cpu) {
49 if (cpu == get_first_sibling(cpu)) {
50 cpumask_set_cpu(cpu, affinity_mask);
51 vecs++;
52 }
53
54 if (--max_vecs == 0)
55 break;
56 }
57 *nr_vecs = vecs;
58 }
59
60 return affinity_mask;
61}
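A small userspace sketch of the spreading policy implemented above, using a plain bitmask in place of struct cpumask and a made-up sibling table (all names here are hypothetical): with enough vectors every CPU gets a bit; otherwise only the first sibling of each group is marked and *nr_vecs is trimmed accordingly.

#include <stdio.h>

#define NCPUS 8

/* Hypothetical topology: CPUs 0/1, 2/3, 4/5, 6/7 are sibling pairs. */
static const unsigned int first_sibling[NCPUS] = { 0, 0, 2, 2, 4, 4, 6, 6 };

static unsigned long create_affinity_mask(unsigned int *nr_vecs)
{
	unsigned int max_vecs = *nr_vecs, vecs = 0, cpu;
	unsigned long mask = 0;

	if (max_vecs == 1)
		return 0;			/* nothing to spread */

	if (max_vecs >= NCPUS) {
		*nr_vecs = NCPUS;		/* one vector per CPU */
		return (1UL << NCPUS) - 1;
	}

	for (cpu = 0; cpu < NCPUS; cpu++) {
		if (cpu == first_sibling[cpu]) {
			mask |= 1UL << cpu;	/* first sibling only */
			vecs++;
		}
		if (--max_vecs == 0)
			break;			/* ran out of vectors */
	}
	*nr_vecs = vecs;
	return mask;
}

int main(void)
{
	unsigned int nvecs = 3;
	unsigned long mask = create_affinity_mask(&nvecs);

	printf("mask=%#lx, vectors used=%u\n", mask, nvecs);	/* 0x5, 2 */
	return 0;
}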
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 2f9f2b0e79f2..b4c1bc7c9ca2 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -426,6 +426,49 @@ out_unlock:
426} 426}
427EXPORT_SYMBOL_GPL(handle_simple_irq); 427EXPORT_SYMBOL_GPL(handle_simple_irq);
428 428
429/**
430 * handle_untracked_irq - Simple and software-decoded IRQs.
431 * @desc: the interrupt description structure for this irq
432 *
433 * Untracked interrupts are sent from a demultiplexing interrupt
434 * handler when the demultiplexer does not know which device in its
435 * multiplexed irq domain generated the interrupt. IRQs handled
436 * through here are not subjected to stats tracking, randomness, or
437 * spurious interrupt detection.
438 *
439 * Note: Like handle_simple_irq, the caller is expected to handle
440 * the ack, clear, mask and unmask issues if necessary.
441 */
442void handle_untracked_irq(struct irq_desc *desc)
443{
444 unsigned int flags = 0;
445
446 raw_spin_lock(&desc->lock);
447
448 if (!irq_may_run(desc))
449 goto out_unlock;
450
451 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
452
453 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
454 desc->istate |= IRQS_PENDING;
455 goto out_unlock;
456 }
457
458 desc->istate &= ~IRQS_PENDING;
459 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
460 raw_spin_unlock(&desc->lock);
461
462 __handle_irq_event_percpu(desc, &flags);
463
464 raw_spin_lock(&desc->lock);
465 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
466
467out_unlock:
468 raw_spin_unlock(&desc->lock);
469}
470EXPORT_SYMBOL_GPL(handle_untracked_irq);
471
429/* 472/*
430 * Called unconditionally from handle_level_irq() and only for oneshot 473 * Called unconditionally from handle_level_irq() and only for oneshot
431 * interrupts from handle_fasteoi_irq() 474 * interrupts from handle_fasteoi_irq()
@@ -1093,3 +1136,43 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
1093 1136
1094 return 0; 1137 return 0;
1095} 1138}
1139
1140/**
1141 * irq_chip_pm_get - Enable power for an IRQ chip
1142 * @data: Pointer to interrupt specific data
1143 *
1144 * Enable the power to the IRQ chip referenced by the interrupt data
1145 * structure.
1146 */
1147int irq_chip_pm_get(struct irq_data *data)
1148{
1149 int retval;
1150
1151 if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) {
1152 retval = pm_runtime_get_sync(data->chip->parent_device);
1153 if (retval < 0) {
1154 pm_runtime_put_noidle(data->chip->parent_device);
1155 return retval;
1156 }
1157 }
1158
1159 return 0;
1160}
1161
1162/**
1163 * irq_chip_pm_put - Disable power for an IRQ chip
1164 * @data: Pointer to interrupt specific data
1165 *
1166 * Disable the power to the IRQ chip referenced by the interrupt data
1167 * structure. Note that power will only be disabled once this
1168 * function has been called for all IRQs that have called irq_chip_pm_get().
1169 */
1170int irq_chip_pm_put(struct irq_data *data)
1171{
1172 int retval = 0;
1173
1174 if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device)
1175 retval = pm_runtime_put(data->chip->parent_device);
1176
1177 return (retval < 0) ? retval : 0;
1178}
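The get/put pair above leans on runtime-PM reference counting: power is switched on by the first get and switched off again only when every get has been matched by a put. A toy model of that counting behaviour (illustrative only, nothing here is the real runtime-PM API):

#include <stdio.h>

static int pm_refcount;		/* how many users currently need power */

static void chip_pm_get(void)
{
	if (pm_refcount++ == 0)
		printf("power on\n");	/* first user powers the chip up */
}

static void chip_pm_put(void)
{
	if (--pm_refcount == 0)
		printf("power off\n");	/* last user powers it down */
}

int main(void)
{
	chip_pm_get();		/* request_irq() for IRQ A */
	chip_pm_get();		/* request_irq() for IRQ B */
	chip_pm_put();		/* free_irq() for IRQ A: still powered */
	chip_pm_put();		/* free_irq() for IRQ B: now powered off */
	return 0;
}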
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a15b5485b446..d3f24905852c 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -132,10 +132,10 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
132 wake_up_process(action->thread); 132 wake_up_process(action->thread);
133} 133}
134 134
135irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) 135irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags)
136{ 136{
137 irqreturn_t retval = IRQ_NONE; 137 irqreturn_t retval = IRQ_NONE;
138 unsigned int flags = 0, irq = desc->irq_data.irq; 138 unsigned int irq = desc->irq_data.irq;
139 struct irqaction *action; 139 struct irqaction *action;
140 140
141 for_each_action_of_desc(desc, action) { 141 for_each_action_of_desc(desc, action) {
@@ -164,7 +164,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
164 164
165 /* Fall through to add to randomness */ 165 /* Fall through to add to randomness */
166 case IRQ_HANDLED: 166 case IRQ_HANDLED:
167 flags |= action->flags; 167 *flags |= action->flags;
168 break; 168 break;
169 169
170 default: 170 default:
@@ -174,7 +174,17 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
174 retval |= res; 174 retval |= res;
175 } 175 }
176 176
177 add_interrupt_randomness(irq, flags); 177 return retval;
178}
179
180irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
181{
182 irqreturn_t retval;
183 unsigned int flags = 0;
184
185 retval = __handle_irq_event_percpu(desc, &flags);
186
187 add_interrupt_randomness(desc->irq_data.irq, flags);
178 188
179 if (!noirqdebug) 189 if (!noirqdebug)
180 note_interrupt(desc, retval); 190 note_interrupt(desc, retval);
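The refactoring above follows a common pattern: the core loop reports the accumulated action flags through an out-parameter so that a new caller (handle_untracked_irq) can skip the bookkeeping the original wrapper performs. A stripped-down illustration of the same shape, with hypothetical names:

#include <stdio.h>

/* Core worker: does the real work and accumulates flags for the caller. */
static int do_event_core(unsigned int *flags)
{
	*flags |= 0x2;			/* an action flag, value made up */
	return 1;			/* handled */
}

/* Tracked path: adds the statistics/randomness style bookkeeping. */
static int do_event_tracked(void)
{
	unsigned int flags = 0;
	int ret = do_event_core(&flags);

	printf("bookkeeping with flags=%#x\n", flags);
	return ret;
}

/* Untracked path: calls the core directly, no bookkeeping. */
static int do_event_untracked(void)
{
	unsigned int flags = 0;

	return do_event_core(&flags);
}

int main(void)
{
	printf("tracked=%d untracked=%d\n", do_event_tracked(), do_event_untracked());
	return 0;
}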
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 09be2c903c6d..bc226e783bd2 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -7,6 +7,7 @@
7 */ 7 */
8#include <linux/irqdesc.h> 8#include <linux/irqdesc.h>
9#include <linux/kernel_stat.h> 9#include <linux/kernel_stat.h>
10#include <linux/pm_runtime.h>
10 11
11#ifdef CONFIG_SPARSE_IRQ 12#ifdef CONFIG_SPARSE_IRQ
12# define IRQ_BITMAP_BITS (NR_IRQS + 8196) 13# define IRQ_BITMAP_BITS (NR_IRQS + 8196)
@@ -83,6 +84,7 @@ extern void irq_mark_irq(unsigned int irq);
83 84
84extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 85extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
85 86
87irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags);
86irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); 88irqreturn_t handle_irq_event_percpu(struct irq_desc *desc);
87irqreturn_t handle_irq_event(struct irq_desc *desc); 89irqreturn_t handle_irq_event(struct irq_desc *desc);
88 90
@@ -105,6 +107,8 @@ static inline void unregister_handler_proc(unsigned int irq,
105 struct irqaction *action) { } 107 struct irqaction *action) { }
106#endif 108#endif
107 109
110extern bool irq_can_set_affinity_usr(unsigned int irq);
111
108extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); 112extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
109 113
110extern void irq_set_thread_affinity(struct irq_desc *desc); 114extern void irq_set_thread_affinity(struct irq_desc *desc);
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index 89b49f6773f0..1a9abc1c8ea0 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -76,14 +76,14 @@ int irq_reserve_ipi(struct irq_domain *domain,
76 } 76 }
77 } 77 }
78 78
79 virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE); 79 virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE, NULL);
80 if (virq <= 0) { 80 if (virq <= 0) {
81 pr_warn("Can't reserve IPI, failed to alloc descs\n"); 81 pr_warn("Can't reserve IPI, failed to alloc descs\n");
82 return -ENOMEM; 82 return -ENOMEM;
83 } 83 }
84 84
85 virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE, 85 virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE,
86 (void *) dest, true); 86 (void *) dest, true, NULL);
87 87
88 if (virq <= 0) { 88 if (virq <= 0) {
89 pr_warn("Can't reserve IPI, failed to alloc hw irqs\n"); 89 pr_warn("Can't reserve IPI, failed to alloc hw irqs\n");
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 8731e1c5d1e7..a623b44f2d4b 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -68,9 +68,13 @@ static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)
68 return 0; 68 return 0;
69} 69}
70 70
71static void desc_smp_init(struct irq_desc *desc, int node) 71static void desc_smp_init(struct irq_desc *desc, int node,
72 const struct cpumask *affinity)
72{ 73{
73 cpumask_copy(desc->irq_common_data.affinity, irq_default_affinity); 74 if (!affinity)
75 affinity = irq_default_affinity;
76 cpumask_copy(desc->irq_common_data.affinity, affinity);
77
74#ifdef CONFIG_GENERIC_PENDING_IRQ 78#ifdef CONFIG_GENERIC_PENDING_IRQ
75 cpumask_clear(desc->pending_mask); 79 cpumask_clear(desc->pending_mask);
76#endif 80#endif
@@ -82,11 +86,12 @@ static void desc_smp_init(struct irq_desc *desc, int node)
82#else 86#else
83static inline int 87static inline int
84alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; } 88alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; }
85static inline void desc_smp_init(struct irq_desc *desc, int node) { } 89static inline void
90desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { }
86#endif 91#endif
87 92
88static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, 93static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
89 struct module *owner) 94 const struct cpumask *affinity, struct module *owner)
90{ 95{
91 int cpu; 96 int cpu;
92 97
@@ -107,7 +112,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
107 desc->owner = owner; 112 desc->owner = owner;
108 for_each_possible_cpu(cpu) 113 for_each_possible_cpu(cpu)
109 *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; 114 *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
110 desc_smp_init(desc, node); 115 desc_smp_init(desc, node, affinity);
111} 116}
112 117
113int nr_irqs = NR_IRQS; 118int nr_irqs = NR_IRQS;
@@ -158,7 +163,9 @@ void irq_unlock_sparse(void)
158 mutex_unlock(&sparse_irq_lock); 163 mutex_unlock(&sparse_irq_lock);
159} 164}
160 165
161static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) 166static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
167 const struct cpumask *affinity,
168 struct module *owner)
162{ 169{
163 struct irq_desc *desc; 170 struct irq_desc *desc;
164 gfp_t gfp = GFP_KERNEL; 171 gfp_t gfp = GFP_KERNEL;
@@ -178,7 +185,8 @@ static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
178 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 185 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
179 init_rcu_head(&desc->rcu); 186 init_rcu_head(&desc->rcu);
180 187
181 desc_set_defaults(irq, desc, node, owner); 188 desc_set_defaults(irq, desc, node, affinity, owner);
189 irqd_set(&desc->irq_data, flags);
182 190
183 return desc; 191 return desc;
184 192
@@ -223,13 +231,32 @@ static void free_desc(unsigned int irq)
223} 231}
224 232
225static int alloc_descs(unsigned int start, unsigned int cnt, int node, 233static int alloc_descs(unsigned int start, unsigned int cnt, int node,
226 struct module *owner) 234 const struct cpumask *affinity, struct module *owner)
227{ 235{
236 const struct cpumask *mask = NULL;
228 struct irq_desc *desc; 237 struct irq_desc *desc;
229 int i; 238 unsigned int flags;
239 int i, cpu = -1;
240
241 if (affinity && cpumask_empty(affinity))
242 return -EINVAL;
243
244 flags = affinity ? IRQD_AFFINITY_MANAGED : 0;
230 245
231 for (i = 0; i < cnt; i++) { 246 for (i = 0; i < cnt; i++) {
232 desc = alloc_desc(start + i, node, owner); 247 if (affinity) {
248 cpu = cpumask_next(cpu, affinity);
249 if (cpu >= nr_cpu_ids)
250 cpu = cpumask_first(affinity);
251 node = cpu_to_node(cpu);
252
253 /*
254 * For single allocations we use the caller provided
255 * mask otherwise we use the mask of the target cpu
256 */
257 mask = cnt == 1 ? affinity : cpumask_of(cpu);
258 }
259 desc = alloc_desc(start + i, node, flags, mask, owner);
233 if (!desc) 260 if (!desc)
234 goto err; 261 goto err;
235 mutex_lock(&sparse_irq_lock); 262 mutex_lock(&sparse_irq_lock);
@@ -277,7 +304,7 @@ int __init early_irq_init(void)
277 nr_irqs = initcnt; 304 nr_irqs = initcnt;
278 305
279 for (i = 0; i < initcnt; i++) { 306 for (i = 0; i < initcnt; i++) {
280 desc = alloc_desc(i, node, NULL); 307 desc = alloc_desc(i, node, 0, NULL, NULL);
281 set_bit(i, allocated_irqs); 308 set_bit(i, allocated_irqs);
282 irq_insert_desc(i, desc); 309 irq_insert_desc(i, desc);
283 } 310 }
@@ -311,7 +338,7 @@ int __init early_irq_init(void)
311 alloc_masks(&desc[i], GFP_KERNEL, node); 338 alloc_masks(&desc[i], GFP_KERNEL, node);
312 raw_spin_lock_init(&desc[i].lock); 339 raw_spin_lock_init(&desc[i].lock);
313 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 340 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
314 desc_set_defaults(i, &desc[i], node, NULL); 341 desc_set_defaults(i, &desc[i], node, NULL, NULL);
315 } 342 }
316 return arch_early_irq_init(); 343 return arch_early_irq_init();
317} 344}
@@ -328,11 +355,12 @@ static void free_desc(unsigned int irq)
328 unsigned long flags; 355 unsigned long flags;
329 356
330 raw_spin_lock_irqsave(&desc->lock, flags); 357 raw_spin_lock_irqsave(&desc->lock, flags);
331 desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL); 358 desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL);
332 raw_spin_unlock_irqrestore(&desc->lock, flags); 359 raw_spin_unlock_irqrestore(&desc->lock, flags);
333} 360}
334 361
335static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, 362static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
363 const struct cpumask *affinity,
336 struct module *owner) 364 struct module *owner)
337{ 365{
338 u32 i; 366 u32 i;
@@ -453,12 +481,15 @@ EXPORT_SYMBOL_GPL(irq_free_descs);
453 * @cnt: Number of consecutive irqs to allocate. 481 * @cnt: Number of consecutive irqs to allocate.
454 * @node: Preferred node on which the irq descriptor should be allocated 482 * @node: Preferred node on which the irq descriptor should be allocated
455 * @owner: Owning module (can be NULL) 483 * @owner: Owning module (can be NULL)
484 * @affinity: Optional pointer to an affinity mask which hints where the
485 * irq descriptors should be allocated and which default
486 * affinities to use
456 * 487 *
457 * Returns the first irq number or error code 488 * Returns the first irq number or error code
458 */ 489 */
459int __ref 490int __ref
460__irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, 491__irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
461 struct module *owner) 492 struct module *owner, const struct cpumask *affinity)
462{ 493{
463 int start, ret; 494 int start, ret;
464 495
@@ -494,7 +525,7 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
494 525
495 bitmap_set(allocated_irqs, start, cnt); 526 bitmap_set(allocated_irqs, start, cnt);
496 mutex_unlock(&sparse_irq_lock); 527 mutex_unlock(&sparse_irq_lock);
497 return alloc_descs(start, cnt, node, owner); 528 return alloc_descs(start, cnt, node, affinity, owner);
498 529
499err: 530err:
500 mutex_unlock(&sparse_irq_lock); 531 mutex_unlock(&sparse_irq_lock);
@@ -512,7 +543,7 @@ EXPORT_SYMBOL_GPL(__irq_alloc_descs);
512 */ 543 */
513unsigned int irq_alloc_hwirqs(int cnt, int node) 544unsigned int irq_alloc_hwirqs(int cnt, int node)
514{ 545{
515 int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL); 546 int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL, NULL);
516 547
517 if (irq < 0) 548 if (irq < 0)
518 return 0; 549 return 0;
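In the alloc_descs() change above, descriptors of a managed multi-vector allocation are spread across the CPUs of the affinity mask, and each descriptor's node follows the CPU it landed on. A simplified userspace walk of that selection, with a bitmask in place of struct cpumask and a made-up cpu_to_node table:

#include <stdio.h>

#define NCPUS 8

/* Hypothetical NUMA layout: CPUs 0-3 on node 0, CPUs 4-7 on node 1. */
static int cpu_to_node(unsigned int cpu) { return cpu < 4 ? 0 : 1; }

int main(void)
{
	unsigned long affinity = 0x32;		/* CPUs 1, 4, 5 */
	unsigned int cnt = 5, i, cpu = NCPUS - 1;

	for (i = 0; i < cnt; i++) {
		/* cpumask_next()-style search, wrapping around the mask */
		do {
			cpu = (cpu + 1) % NCPUS;
		} while (!(affinity & (1UL << cpu)));

		printf("desc %u -> cpu %u, node %d\n", i, cpu, cpu_to_node(cpu));
	}
	return 0;
}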
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 8798b6c9e945..4752b43662e0 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -481,7 +481,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
481 } 481 }
482 482
483 /* Allocate a virtual interrupt number */ 483 /* Allocate a virtual interrupt number */
484 virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node)); 484 virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), NULL);
485 if (virq <= 0) { 485 if (virq <= 0) {
486 pr_debug("-> virq allocation failed\n"); 486 pr_debug("-> virq allocation failed\n");
487 return 0; 487 return 0;
@@ -567,6 +567,7 @@ static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data,
567unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) 567unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
568{ 568{
569 struct irq_domain *domain; 569 struct irq_domain *domain;
570 struct irq_data *irq_data;
570 irq_hw_number_t hwirq; 571 irq_hw_number_t hwirq;
571 unsigned int type = IRQ_TYPE_NONE; 572 unsigned int type = IRQ_TYPE_NONE;
572 int virq; 573 int virq;
@@ -588,15 +589,46 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
588 if (irq_domain_translate(domain, fwspec, &hwirq, &type)) 589 if (irq_domain_translate(domain, fwspec, &hwirq, &type))
589 return 0; 590 return 0;
590 591
591 if (irq_domain_is_hierarchy(domain)) { 592 /*
593 * WARN if the irqchip returns a type with bits
594 * outside the sense mask set and clear these bits.
595 */
596 if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK))
597 type &= IRQ_TYPE_SENSE_MASK;
598
599 /*
600 * If we've already configured this interrupt,
601 * don't do it again, or hell will break loose.
602 */
603 virq = irq_find_mapping(domain, hwirq);
604 if (virq) {
605 /*
606 * If the trigger type is not specified or matches the
607 * current trigger type then we are done so return the
608 * interrupt number.
609 */
610 if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq))
611 return virq;
612
592 /* 613 /*
593 * If we've already configured this interrupt, 614 * If the trigger type has not been set yet, then set
594 * don't do it again, or hell will break loose. 615 * it now and return the interrupt number.
595 */ 616 */
596 virq = irq_find_mapping(domain, hwirq); 617 if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) {
597 if (virq) 618 irq_data = irq_get_irq_data(virq);
619 if (!irq_data)
620 return 0;
621
622 irqd_set_trigger_type(irq_data, type);
598 return virq; 623 return virq;
624 }
599 625
626 pr_warn("type mismatch, failed to map hwirq-%lu for %s!\n",
627 hwirq, of_node_full_name(to_of_node(fwspec->fwnode)));
628 return 0;
629 }
630
631 if (irq_domain_is_hierarchy(domain)) {
600 virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec); 632 virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec);
601 if (virq <= 0) 633 if (virq <= 0)
602 return 0; 634 return 0;
@@ -607,10 +639,18 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
607 return virq; 639 return virq;
608 } 640 }
609 641
610 /* Set type if specified and different than the current one */ 642 irq_data = irq_get_irq_data(virq);
611 if (type != IRQ_TYPE_NONE && 643 if (!irq_data) {
612 type != irq_get_trigger_type(virq)) 644 if (irq_domain_is_hierarchy(domain))
613 irq_set_irq_type(virq, type); 645 irq_domain_free_irqs(virq, 1);
646 else
647 irq_dispose_mapping(virq);
648 return 0;
649 }
650
651 /* Store trigger type */
652 irqd_set_trigger_type(irq_data, type);
653
614 return virq; 654 return virq;
615} 655}
616EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping); 656EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping);
@@ -640,8 +680,12 @@ void irq_dispose_mapping(unsigned int virq)
640 if (WARN_ON(domain == NULL)) 680 if (WARN_ON(domain == NULL))
641 return; 681 return;
642 682
643 irq_domain_disassociate(domain, virq); 683 if (irq_domain_is_hierarchy(domain)) {
644 irq_free_desc(virq); 684 irq_domain_free_irqs(virq, 1);
685 } else {
686 irq_domain_disassociate(domain, virq);
687 irq_free_desc(virq);
688 }
645} 689}
646EXPORT_SYMBOL_GPL(irq_dispose_mapping); 690EXPORT_SYMBOL_GPL(irq_dispose_mapping);
647 691
@@ -835,19 +879,23 @@ const struct irq_domain_ops irq_domain_simple_ops = {
835EXPORT_SYMBOL_GPL(irq_domain_simple_ops); 879EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
836 880
837int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, 881int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq,
838 int node) 882 int node, const struct cpumask *affinity)
839{ 883{
840 unsigned int hint; 884 unsigned int hint;
841 885
842 if (virq >= 0) { 886 if (virq >= 0) {
843 virq = irq_alloc_descs(virq, virq, cnt, node); 887 virq = __irq_alloc_descs(virq, virq, cnt, node, THIS_MODULE,
888 affinity);
844 } else { 889 } else {
845 hint = hwirq % nr_irqs; 890 hint = hwirq % nr_irqs;
846 if (hint == 0) 891 if (hint == 0)
847 hint++; 892 hint++;
848 virq = irq_alloc_descs_from(hint, cnt, node); 893 virq = __irq_alloc_descs(-1, hint, cnt, node, THIS_MODULE,
849 if (virq <= 0 && hint > 1) 894 affinity);
850 virq = irq_alloc_descs_from(1, cnt, node); 895 if (virq <= 0 && hint > 1) {
896 virq = __irq_alloc_descs(-1, 1, cnt, node, THIS_MODULE,
897 affinity);
898 }
851 } 899 }
852 900
853 return virq; 901 return virq;
@@ -1144,8 +1192,10 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
1144 if (recursive) 1192 if (recursive)
1145 ret = irq_domain_alloc_irqs_recursive(parent, irq_base, 1193 ret = irq_domain_alloc_irqs_recursive(parent, irq_base,
1146 nr_irqs, arg); 1194 nr_irqs, arg);
1147 if (ret >= 0) 1195 if (ret < 0)
1148 ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg); 1196 return ret;
1197
1198 ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg);
1149 if (ret < 0 && recursive) 1199 if (ret < 0 && recursive)
1150 irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs); 1200 irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs);
1151 1201
@@ -1160,6 +1210,7 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
1160 * @node: NUMA node id for memory allocation 1210 * @node: NUMA node id for memory allocation
1161 * @arg: domain specific argument 1211 * @arg: domain specific argument
1162 * @realloc: IRQ descriptors have already been allocated if true 1212 * @realloc: IRQ descriptors have already been allocated if true
1213 * @affinity: Optional irq affinity mask for multiqueue devices
1163 * 1214 *
1164 * Allocate IRQ numbers and initialize all data structures to support 1215 * Allocate IRQ numbers and initialize all data structures to support
1165 * hierarchy IRQ domains. 1216 * hierarchy IRQ domains.
@@ -1175,7 +1226,7 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
1175 */ 1226 */
1176int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, 1227int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
1177 unsigned int nr_irqs, int node, void *arg, 1228 unsigned int nr_irqs, int node, void *arg,
1178 bool realloc) 1229 bool realloc, const struct cpumask *affinity)
1179{ 1230{
1180 int i, ret, virq; 1231 int i, ret, virq;
1181 1232
@@ -1193,7 +1244,8 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
1193 if (realloc && irq_base >= 0) { 1244 if (realloc && irq_base >= 0) {
1194 virq = irq_base; 1245 virq = irq_base;
1195 } else { 1246 } else {
1196 virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node); 1247 virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node,
1248 affinity);
1197 if (virq < 0) { 1249 if (virq < 0) {
1198 pr_debug("cannot allocate IRQ(base %d, count %d)\n", 1250 pr_debug("cannot allocate IRQ(base %d, count %d)\n",
1199 irq_base, nr_irqs); 1251 irq_base, nr_irqs);
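The new early-return block in irq_create_fwspec_mapping() boils down to a small decision table for an already-mapped hwirq: reuse the mapping when the requested trigger matches (or none is requested), adopt the requested trigger when none is set yet, and refuse on a mismatch. A compact sketch of just that decision, with hypothetical constants:

#include <stdio.h>

enum { TYPE_NONE = 0, TYPE_EDGE = 1, TYPE_LEVEL = 4 };	/* illustrative values */

/* Returns the virq to reuse, or 0 to reject the mapping request. */
static int reuse_mapping(int virq, int current_type, int requested_type)
{
	if (requested_type == TYPE_NONE || requested_type == current_type)
		return virq;			/* nothing to change */
	if (current_type == TYPE_NONE)
		return virq;			/* adopt the requested trigger */
	return 0;				/* type mismatch: refuse */
}

int main(void)
{
	printf("%d\n", reuse_mapping(10, TYPE_EDGE, TYPE_NONE));	/* 10 */
	printf("%d\n", reuse_mapping(10, TYPE_NONE, TYPE_LEVEL));	/* 10, trigger adopted */
	printf("%d\n", reuse_mapping(10, TYPE_EDGE, TYPE_LEVEL));	/* 0, mismatch */
	return 0;
}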
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ef0bc02c3a70..73a2b786b5e9 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -115,12 +115,12 @@ EXPORT_SYMBOL(synchronize_irq);
115#ifdef CONFIG_SMP 115#ifdef CONFIG_SMP
116cpumask_var_t irq_default_affinity; 116cpumask_var_t irq_default_affinity;
117 117
118static int __irq_can_set_affinity(struct irq_desc *desc) 118static bool __irq_can_set_affinity(struct irq_desc *desc)
119{ 119{
120 if (!desc || !irqd_can_balance(&desc->irq_data) || 120 if (!desc || !irqd_can_balance(&desc->irq_data) ||
121 !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) 121 !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
122 return 0; 122 return false;
123 return 1; 123 return true;
124} 124}
125 125
126/** 126/**
@@ -134,6 +134,21 @@ int irq_can_set_affinity(unsigned int irq)
134} 134}
135 135
136/** 136/**
137 * irq_can_set_affinity_usr - Check if affinity of an irq can be set from user space
138 * @irq: Interrupt to check
139 *
140 * Like irq_can_set_affinity() above, but additionally checks for the
141 * AFFINITY_MANAGED flag.
142 */
143bool irq_can_set_affinity_usr(unsigned int irq)
144{
145 struct irq_desc *desc = irq_to_desc(irq);
146
147 return __irq_can_set_affinity(desc) &&
148 !irqd_affinity_is_managed(&desc->irq_data);
149}
150
151/**
137 * irq_set_thread_affinity - Notify irq threads to adjust affinity 152 * irq_set_thread_affinity - Notify irq threads to adjust affinity
138 * @desc: irq descriptor which has affinity changed 153 * @desc: irq descriptor which has affinity changed
139 * 154 *
@@ -338,10 +353,11 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask)
338 return 0; 353 return 0;
339 354
340 /* 355 /*
341 * Preserve a userspace affinity setup, but make sure that 356 * Preserve the managed affinity setting and a userspace affinity
342 * one of the targets is online. 357 * setup, but make sure that one of the targets is online.
343 */ 358 */
344 if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { 359 if (irqd_affinity_is_managed(&desc->irq_data) ||
360 irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {
345 if (cpumask_intersects(desc->irq_common_data.affinity, 361 if (cpumask_intersects(desc->irq_common_data.affinity,
346 cpu_online_mask)) 362 cpu_online_mask))
347 set = desc->irq_common_data.affinity; 363 set = desc->irq_common_data.affinity;
@@ -1117,6 +1133,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1117 new->irq = irq; 1133 new->irq = irq;
1118 1134
1119 /* 1135 /*
1136 * If the trigger type is not specified by the caller,
1137 * then use the default for this interrupt.
1138 */
1139 if (!(new->flags & IRQF_TRIGGER_MASK))
1140 new->flags |= irqd_get_trigger_type(&desc->irq_data);
1141
1142 /*
1120 * Check whether the interrupt nests into another interrupt 1143 * Check whether the interrupt nests into another interrupt
1121 * thread. 1144 * thread.
1122 */ 1145 */
@@ -1409,10 +1432,18 @@ int setup_irq(unsigned int irq, struct irqaction *act)
1409 1432
1410 if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) 1433 if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1411 return -EINVAL; 1434 return -EINVAL;
1435
1436 retval = irq_chip_pm_get(&desc->irq_data);
1437 if (retval < 0)
1438 return retval;
1439
1412 chip_bus_lock(desc); 1440 chip_bus_lock(desc);
1413 retval = __setup_irq(irq, desc, act); 1441 retval = __setup_irq(irq, desc, act);
1414 chip_bus_sync_unlock(desc); 1442 chip_bus_sync_unlock(desc);
1415 1443
1444 if (retval)
1445 irq_chip_pm_put(&desc->irq_data);
1446
1416 return retval; 1447 return retval;
1417} 1448}
1418EXPORT_SYMBOL_GPL(setup_irq); 1449EXPORT_SYMBOL_GPL(setup_irq);
@@ -1506,6 +1537,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1506 } 1537 }
1507 } 1538 }
1508 1539
1540 irq_chip_pm_put(&desc->irq_data);
1509 module_put(desc->owner); 1541 module_put(desc->owner);
1510 kfree(action->secondary); 1542 kfree(action->secondary);
1511 return action; 1543 return action;
@@ -1648,11 +1680,16 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1648 action->name = devname; 1680 action->name = devname;
1649 action->dev_id = dev_id; 1681 action->dev_id = dev_id;
1650 1682
1683 retval = irq_chip_pm_get(&desc->irq_data);
1684 if (retval < 0)
1685 return retval;
1686
1651 chip_bus_lock(desc); 1687 chip_bus_lock(desc);
1652 retval = __setup_irq(irq, desc, action); 1688 retval = __setup_irq(irq, desc, action);
1653 chip_bus_sync_unlock(desc); 1689 chip_bus_sync_unlock(desc);
1654 1690
1655 if (retval) { 1691 if (retval) {
1692 irq_chip_pm_put(&desc->irq_data);
1656 kfree(action->secondary); 1693 kfree(action->secondary);
1657 kfree(action); 1694 kfree(action);
1658 } 1695 }
@@ -1730,7 +1767,14 @@ void enable_percpu_irq(unsigned int irq, unsigned int type)
1730 if (!desc) 1767 if (!desc)
1731 return; 1768 return;
1732 1769
1770 /*
1771 * If the trigger type is not specified by the caller, then
1772 * use the default for this interrupt.
1773 */
1733 type &= IRQ_TYPE_SENSE_MASK; 1774 type &= IRQ_TYPE_SENSE_MASK;
1775 if (type == IRQ_TYPE_NONE)
1776 type = irqd_get_trigger_type(&desc->irq_data);
1777
1734 if (type != IRQ_TYPE_NONE) { 1778 if (type != IRQ_TYPE_NONE) {
1735 int ret; 1779 int ret;
1736 1780
@@ -1822,6 +1866,7 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_
1822 1866
1823 unregister_handler_proc(irq, action); 1867 unregister_handler_proc(irq, action);
1824 1868
1869 irq_chip_pm_put(&desc->irq_data);
1825 module_put(desc->owner); 1870 module_put(desc->owner);
1826 return action; 1871 return action;
1827 1872
@@ -1884,10 +1929,18 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
1884 1929
1885 if (!desc || !irq_settings_is_per_cpu_devid(desc)) 1930 if (!desc || !irq_settings_is_per_cpu_devid(desc))
1886 return -EINVAL; 1931 return -EINVAL;
1932
1933 retval = irq_chip_pm_get(&desc->irq_data);
1934 if (retval < 0)
1935 return retval;
1936
1887 chip_bus_lock(desc); 1937 chip_bus_lock(desc);
1888 retval = __setup_irq(irq, desc, act); 1938 retval = __setup_irq(irq, desc, act);
1889 chip_bus_sync_unlock(desc); 1939 chip_bus_sync_unlock(desc);
1890 1940
1941 if (retval)
1942 irq_chip_pm_put(&desc->irq_data);
1943
1891 return retval; 1944 return retval;
1892} 1945}
1893 1946
@@ -1931,12 +1984,18 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
1931 action->name = devname; 1984 action->name = devname;
1932 action->percpu_dev_id = dev_id; 1985 action->percpu_dev_id = dev_id;
1933 1986
1987 retval = irq_chip_pm_get(&desc->irq_data);
1988 if (retval < 0)
1989 return retval;
1990
1934 chip_bus_lock(desc); 1991 chip_bus_lock(desc);
1935 retval = __setup_irq(irq, desc, action); 1992 retval = __setup_irq(irq, desc, action);
1936 chip_bus_sync_unlock(desc); 1993 chip_bus_sync_unlock(desc);
1937 1994
1938 if (retval) 1995 if (retval) {
1996 irq_chip_pm_put(&desc->irq_data);
1939 kfree(action); 1997 kfree(action);
1998 }
1940 1999
1941 return retval; 2000 return retval;
1942} 2001}
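Two of the hunks above implement the same fallback rule: if a caller does not pass a trigger type, inherit whatever the descriptor already carries (typically set from the firmware mapping). A tiny sketch of the rule, with made-up flag values:

#include <stdio.h>

#define TRIGGER_MASK	0x0f		/* stand-in for IRQF_TRIGGER_MASK */

/* Pick the effective trigger: caller's request, or the stored default. */
static unsigned int effective_trigger(unsigned int requested, unsigned int stored)
{
	if (!(requested & TRIGGER_MASK))
		requested |= stored & TRIGGER_MASK;
	return requested & TRIGGER_MASK;
}

int main(void)
{
	printf("%#x\n", effective_trigger(0x0, 0x8));	/* falls back to 0x8 */
	printf("%#x\n", effective_trigger(0x2, 0x8));	/* caller wins: 0x2 */
	return 0;
}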
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 38e89ce7b071..54999350162c 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -324,7 +324,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
324 struct msi_domain_ops *ops = info->ops; 324 struct msi_domain_ops *ops = info->ops;
325 msi_alloc_info_t arg; 325 msi_alloc_info_t arg;
326 struct msi_desc *desc; 326 struct msi_desc *desc;
327 int i, ret, virq = -1; 327 int i, ret, virq;
328 328
329 ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); 329 ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
330 if (ret) 330 if (ret)
@@ -332,13 +332,10 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
332 332
333 for_each_msi_entry(desc, dev) { 333 for_each_msi_entry(desc, dev) {
334 ops->set_desc(&arg, desc); 334 ops->set_desc(&arg, desc);
335 if (info->flags & MSI_FLAG_IDENTITY_MAP)
336 virq = (int)ops->get_hwirq(info, &arg);
337 else
338 virq = -1;
339 335
340 virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used, 336 virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used,
341 dev_to_node(dev), &arg, false); 337 dev_to_node(dev), &arg, false,
338 desc->affinity);
342 if (virq < 0) { 339 if (virq < 0) {
343 ret = -ENOSPC; 340 ret = -ENOSPC;
344 if (ops->handle_error) 341 if (ops->handle_error)
@@ -356,6 +353,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
356 ops->msi_finish(&arg, 0); 353 ops->msi_finish(&arg, 0);
357 354
358 for_each_msi_entry(desc, dev) { 355 for_each_msi_entry(desc, dev) {
356 virq = desc->irq;
359 if (desc->nvec_used == 1) 357 if (desc->nvec_used == 1)
360 dev_dbg(dev, "irq %d for MSI\n", virq); 358 dev_dbg(dev, "irq %d for MSI\n", virq);
361 else 359 else
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 4e1b94726818..feaa813b84a9 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -96,7 +96,7 @@ static ssize_t write_irq_affinity(int type, struct file *file,
96 cpumask_var_t new_value; 96 cpumask_var_t new_value;
97 int err; 97 int err;
98 98
99 if (!irq_can_set_affinity(irq) || no_irq_affinity) 99 if (!irq_can_set_affinity_usr(irq) || no_irq_affinity)
100 return -EIO; 100 return -EIO;
101 101
102 if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) 102 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
@@ -311,7 +311,6 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
311 !name_unique(irq, action)) 311 !name_unique(irq, action))
312 return; 312 return;
313 313
314 memset(name, 0, MAX_NAMELEN);
315 snprintf(name, MAX_NAMELEN, "%s", action->name); 314 snprintf(name, MAX_NAMELEN, "%s", action->name);
316 315
317 /* create /proc/irq/1234/handler/ */ 316 /* create /proc/irq/1234/handler/ */
@@ -340,7 +339,6 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
340 if (desc->dir) 339 if (desc->dir)
341 goto out_unlock; 340 goto out_unlock;
342 341
343 memset(name, 0, MAX_NAMELEN);
344 sprintf(name, "%d", irq); 342 sprintf(name, "%d", irq);
345 343
346 /* create /proc/irq/1234 */ 344 /* create /proc/irq/1234 */
@@ -386,7 +384,6 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
386#endif 384#endif
387 remove_proc_entry("spurious", desc->dir); 385 remove_proc_entry("spurious", desc->dir);
388 386
389 memset(name, 0, MAX_NAMELEN);
390 sprintf(name, "%u", irq); 387 sprintf(name, "%u", irq);
391 remove_proc_entry(name, root_irq_dir); 388 remove_proc_entry(name, root_irq_dir);
392} 389}
@@ -421,12 +418,8 @@ void init_irq_proc(void)
421 /* 418 /*
422 * Create entries for all existing IRQs. 419 * Create entries for all existing IRQs.
423 */ 420 */
424 for_each_irq_desc(irq, desc) { 421 for_each_irq_desc(irq, desc)
425 if (!desc)
426 continue;
427
428 register_irq_proc(irq, desc); 422 register_irq_proc(irq, desc);
429 }
430} 423}
431 424
432#ifdef CONFIG_GENERIC_IRQ_SHOW 425#ifdef CONFIG_GENERIC_IRQ_SHOW
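The removed memset() calls above were redundant: snprintf() and sprintf() always NUL-terminate the buffer they write (snprintf even when it truncates), so pre-zeroing the array buys nothing. A quick userspace demonstration:

#include <stdio.h>

int main(void)
{
	char name[8];

	/* No memset needed: snprintf terminates within the given size. */
	snprintf(name, sizeof(name), "%s", "a-rather-long-action-name");
	printf("'%s' (length capped at %zu)\n", name, sizeof(name) - 1);

	sprintf(name, "%d", 1234);	/* sprintf terminates as well */
	printf("'%s'\n", name);
	return 0;
}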
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 05254eeb4b4e..0dbea887d625 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -58,13 +58,36 @@ static void jump_label_update(struct static_key *key);
58 58
59void static_key_slow_inc(struct static_key *key) 59void static_key_slow_inc(struct static_key *key)
60{ 60{
61 int v, v1;
62
61 STATIC_KEY_CHECK_USE(); 63 STATIC_KEY_CHECK_USE();
62 if (atomic_inc_not_zero(&key->enabled)) 64
63 return; 65 /*
66 * Careful if we get concurrent static_key_slow_inc() calls;
67 * later calls must wait for the first one to _finish_ the
68 * jump_label_update() process. At the same time, however,
69 * the jump_label_update() call below wants to see
70 * static_key_enabled(&key) for jumps to be updated properly.
71 *
72 * So give a special meaning to negative key->enabled: it sends
73 * static_key_slow_inc() down the slow path, and it is non-zero
74 * so it counts as "enabled" in jump_label_update(). Note that
75 * atomic_inc_unless_negative() checks >= 0, so roll our own.
76 */
77 for (v = atomic_read(&key->enabled); v > 0; v = v1) {
78 v1 = atomic_cmpxchg(&key->enabled, v, v + 1);
79 if (likely(v1 == v))
80 return;
81 }
64 82
65 jump_label_lock(); 83 jump_label_lock();
66 if (atomic_inc_return(&key->enabled) == 1) 84 if (atomic_read(&key->enabled) == 0) {
85 atomic_set(&key->enabled, -1);
67 jump_label_update(key); 86 jump_label_update(key);
87 atomic_set(&key->enabled, 1);
88 } else {
89 atomic_inc(&key->enabled);
90 }
68 jump_label_unlock(); 91 jump_label_unlock();
69} 92}
70EXPORT_SYMBOL_GPL(static_key_slow_inc); 93EXPORT_SYMBOL_GPL(static_key_slow_inc);
@@ -72,6 +95,13 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc);
72static void __static_key_slow_dec(struct static_key *key, 95static void __static_key_slow_dec(struct static_key *key,
73 unsigned long rate_limit, struct delayed_work *work) 96 unsigned long rate_limit, struct delayed_work *work)
74{ 97{
98 /*
99 * The negative count check is valid even when a negative
100 * key->enabled is in use by static_key_slow_inc(); a
101 * __static_key_slow_dec() before the first static_key_slow_inc()
102 * returns is unbalanced, because all other static_key_slow_inc()
103 * instances block while the update is in progress.
104 */
75 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { 105 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
76 WARN(atomic_read(&key->enabled) < 0, 106 WARN(atomic_read(&key->enabled) < 0,
77 "jump label: negative count!\n"); 107 "jump label: negative count!\n");
@@ -422,7 +452,7 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
422 return notifier_from_errno(ret); 452 return notifier_from_errno(ret);
423} 453}
424 454
425struct notifier_block jump_label_module_nb = { 455static struct notifier_block jump_label_module_nb = {
426 .notifier_call = jump_label_module_notify, 456 .notifier_call = jump_label_module_notify,
427 .priority = 1, /* higher than tracepoints */ 457 .priority = 1, /* higher than tracepoints */
428}; 458};
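A userspace model of the static_key_slow_inc() protocol above, using C11 atomics in place of the kernel's atomic_t and a pthread mutex in place of jump_label_lock(); the code-patching step is just a printf here. The point is the same: the fast path only increments strictly positive counts, and the first enabler parks the count at -1 so that concurrent incrementers fall into the locked slow path until the update is complete.

#include <stdio.h>
#include <stdatomic.h>
#include <pthread.h>

static atomic_int enabled;			/* the kernel's key->enabled */
static pthread_mutex_t update_lock = PTHREAD_MUTEX_INITIALIZER;

static void key_slow_inc(void)
{
	int v, v1;

	/* Fast path: only bump counts that are already strictly positive. */
	for (v = atomic_load(&enabled); v > 0; v = v1) {
		v1 = v;
		if (atomic_compare_exchange_strong(&enabled, &v1, v + 1))
			return;
		/* v1 now holds the value we raced with; retry with it */
	}

	pthread_mutex_lock(&update_lock);
	if (atomic_load(&enabled) == 0) {
		atomic_store(&enabled, -1);	/* send others to the slow path */
		printf("patching code for the key\n");
		atomic_store(&enabled, 1);	/* update finished */
	} else {
		atomic_fetch_add(&enabled, 1);
	}
	pthread_mutex_unlock(&update_lock);
}

int main(void)
{
	key_slow_inc();		/* does the "patching" */
	key_slow_inc();		/* plain increment on the fast path */
	printf("enabled = %d\n", atomic_load(&enabled));	/* 2 */
	return 0;
}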
diff --git a/kernel/kcov.c b/kernel/kcov.c
index a02f2dddd1d7..8d44b3fea9d0 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -264,7 +264,12 @@ static const struct file_operations kcov_fops = {
264 264
265static int __init kcov_init(void) 265static int __init kcov_init(void)
266{ 266{
267 if (!debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops)) { 267 /*
268 * The kcov debugfs file won't ever get removed and thus,
269 * there is no need to protect it against removal races. The
270 * use of debugfs_create_file_unsafe() is actually safe here.
271 */
272 if (!debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops)) {
268 pr_err("failed to create kcov in debugfs\n"); 273 pr_err("failed to create kcov in debugfs\n");
269 return -ENOMEM; 274 return -ENOMEM;
270 } 275 }
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 81f1a7107c0e..589d763a49b3 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -46,6 +46,7 @@
46#include <linux/gfp.h> 46#include <linux/gfp.h>
47#include <linux/kmemcheck.h> 47#include <linux/kmemcheck.h>
48#include <linux/random.h> 48#include <linux/random.h>
49#include <linux/jhash.h>
49 50
50#include <asm/sections.h> 51#include <asm/sections.h>
51 52
@@ -309,10 +310,14 @@ static struct hlist_head chainhash_table[CHAINHASH_SIZE];
309 * It's a 64-bit hash, because it's important for the keys to be 310 * It's a 64-bit hash, because it's important for the keys to be
310 * unique. 311 * unique.
311 */ 312 */
312#define iterate_chain_key(key1, key2) \ 313static inline u64 iterate_chain_key(u64 key, u32 idx)
313 (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \ 314{
314 ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \ 315 u32 k0 = key, k1 = key >> 32;
315 (key2)) 316
317 __jhash_mix(idx, k0, k1); /* Macro that modifies arguments! */
318
319 return k0 | (u64)k1 << 32;
320}
316 321
317void lockdep_off(void) 322void lockdep_off(void)
318{ 323{
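The chain-key change above swaps an ad-hoc shift/xor for a hash-style mixing step so that different lock sequences are far less likely to collide on the same 64-bit key. A userspace sketch of the same shape, splitting the key into two 32-bit halves and mixing in the class index; the mix function here is a simple stand-in, not the kernel's __jhash_mix:

#include <stdio.h>
#include <stdint.h>

static uint32_t rol32(uint32_t w, unsigned int s)
{
	return (w << s) | (w >> (32 - s));
}

/* Stand-in mixing step: stirs idx into both halves of the key. */
static uint64_t iterate_chain_key(uint64_t key, uint32_t idx)
{
	uint32_t k0 = (uint32_t)key, k1 = (uint32_t)(key >> 32);

	k0 ^= rol32(idx, 7) + k1;
	k1 ^= rol32(k0, 13) + idx;
	k0 += rol32(k1, 19);

	return (uint64_t)k0 | ((uint64_t)k1 << 32);
}

int main(void)
{
	uint64_t key = 0;
	uint32_t classes[] = { 3, 5, 7 };	/* lock class indices taken in order */

	for (unsigned int i = 0; i < 3; i++) {
		key = iterate_chain_key(key, classes[i]);
		printf("after class %u: key=%#llx\n", classes[i],
		       (unsigned long long)key);
	}
	return 0;
}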
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 3ef3736002d8..9c951fade415 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -49,21 +49,21 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter)
49} 49}
50 50
51void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, 51void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
52 struct thread_info *ti) 52 struct task_struct *task)
53{ 53{
54 SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); 54 SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
55 55
56 /* Mark the current thread as blocked on the lock: */ 56 /* Mark the current thread as blocked on the lock: */
57 ti->task->blocked_on = waiter; 57 task->blocked_on = waiter;
58} 58}
59 59
60void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, 60void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
61 struct thread_info *ti) 61 struct task_struct *task)
62{ 62{
63 DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); 63 DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
64 DEBUG_LOCKS_WARN_ON(waiter->task != ti->task); 64 DEBUG_LOCKS_WARN_ON(waiter->task != task);
65 DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter); 65 DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter);
66 ti->task->blocked_on = NULL; 66 task->blocked_on = NULL;
67 67
68 list_del_init(&waiter->list); 68 list_del_init(&waiter->list);
69 waiter->task = NULL; 69 waiter->task = NULL;
diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h
index 0799fd3e4cfa..57a871ae3c81 100644
--- a/kernel/locking/mutex-debug.h
+++ b/kernel/locking/mutex-debug.h
@@ -20,21 +20,21 @@ extern void debug_mutex_wake_waiter(struct mutex *lock,
20extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); 20extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
21extern void debug_mutex_add_waiter(struct mutex *lock, 21extern void debug_mutex_add_waiter(struct mutex *lock,
22 struct mutex_waiter *waiter, 22 struct mutex_waiter *waiter,
23 struct thread_info *ti); 23 struct task_struct *task);
24extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, 24extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
25 struct thread_info *ti); 25 struct task_struct *task);
26extern void debug_mutex_unlock(struct mutex *lock); 26extern void debug_mutex_unlock(struct mutex *lock);
27extern void debug_mutex_init(struct mutex *lock, const char *name, 27extern void debug_mutex_init(struct mutex *lock, const char *name,
28 struct lock_class_key *key); 28 struct lock_class_key *key);
29 29
30static inline void mutex_set_owner(struct mutex *lock) 30static inline void mutex_set_owner(struct mutex *lock)
31{ 31{
32 lock->owner = current; 32 WRITE_ONCE(lock->owner, current);
33} 33}
34 34
35static inline void mutex_clear_owner(struct mutex *lock) 35static inline void mutex_clear_owner(struct mutex *lock)
36{ 36{
37 lock->owner = NULL; 37 WRITE_ONCE(lock->owner, NULL);
38} 38}
39 39
40#define spin_lock_mutex(lock, flags) \ 40#define spin_lock_mutex(lock, flags) \
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index e364b424b019..a70b90db3909 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -486,9 +486,6 @@ __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
486 if (!hold_ctx) 486 if (!hold_ctx)
487 return 0; 487 return 0;
488 488
489 if (unlikely(ctx == hold_ctx))
490 return -EALREADY;
491
492 if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && 489 if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
493 (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { 490 (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
494#ifdef CONFIG_DEBUG_MUTEXES 491#ifdef CONFIG_DEBUG_MUTEXES
@@ -514,6 +511,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
514 unsigned long flags; 511 unsigned long flags;
515 int ret; 512 int ret;
516 513
514 if (use_ww_ctx) {
515 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
516 if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
517 return -EALREADY;
518 }
519
517 preempt_disable(); 520 preempt_disable();
518 mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); 521 mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
519 522
@@ -534,7 +537,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
534 goto skip_wait; 537 goto skip_wait;
535 538
536 debug_mutex_lock_common(lock, &waiter); 539 debug_mutex_lock_common(lock, &waiter);
537 debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); 540 debug_mutex_add_waiter(lock, &waiter, task);
538 541
539 /* add waiting tasks to the end of the waitqueue (FIFO): */ 542 /* add waiting tasks to the end of the waitqueue (FIFO): */
540 list_add_tail(&waiter.list, &lock->wait_list); 543 list_add_tail(&waiter.list, &lock->wait_list);
@@ -581,7 +584,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
581 } 584 }
582 __set_task_state(task, TASK_RUNNING); 585 __set_task_state(task, TASK_RUNNING);
583 586
584 mutex_remove_waiter(lock, &waiter, current_thread_info()); 587 mutex_remove_waiter(lock, &waiter, task);
585 /* set it to 0 if there are no waiters left: */ 588 /* set it to 0 if there are no waiters left: */
586 if (likely(list_empty(&lock->wait_list))) 589 if (likely(list_empty(&lock->wait_list)))
587 atomic_set(&lock->count, 0); 590 atomic_set(&lock->count, 0);
@@ -602,7 +605,7 @@ skip_wait:
602 return 0; 605 return 0;
603 606
604err: 607err:
605 mutex_remove_waiter(lock, &waiter, task_thread_info(task)); 608 mutex_remove_waiter(lock, &waiter, task);
606 spin_unlock_mutex(&lock->wait_lock, flags); 609 spin_unlock_mutex(&lock->wait_lock, flags);
607 debug_mutex_free_waiter(&waiter); 610 debug_mutex_free_waiter(&waiter);
608 mutex_release(&lock->dep_map, 1, ip); 611 mutex_release(&lock->dep_map, 1, ip);
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 5cda397607f2..6cd6b8e9efd7 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -13,18 +13,24 @@
13 do { spin_lock(lock); (void)(flags); } while (0) 13 do { spin_lock(lock); (void)(flags); } while (0)
14#define spin_unlock_mutex(lock, flags) \ 14#define spin_unlock_mutex(lock, flags) \
15 do { spin_unlock(lock); (void)(flags); } while (0) 15 do { spin_unlock(lock); (void)(flags); } while (0)
16#define mutex_remove_waiter(lock, waiter, ti) \ 16#define mutex_remove_waiter(lock, waiter, task) \
17 __list_del((waiter)->list.prev, (waiter)->list.next) 17 __list_del((waiter)->list.prev, (waiter)->list.next)
18 18
19#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 19#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
20/*
21 * The mutex owner can get read and written to locklessly.
22 * We should use WRITE_ONCE when writing the owner value to
23 * avoid store tearing, otherwise, a thread could potentially
24 * read a partially written and incomplete owner value.
25 */
20static inline void mutex_set_owner(struct mutex *lock) 26static inline void mutex_set_owner(struct mutex *lock)
21{ 27{
22 lock->owner = current; 28 WRITE_ONCE(lock->owner, current);
23} 29}
24 30
25static inline void mutex_clear_owner(struct mutex *lock) 31static inline void mutex_clear_owner(struct mutex *lock)
26{ 32{
27 lock->owner = NULL; 33 WRITE_ONCE(lock->owner, NULL);
28} 34}
29#else 35#else
30static inline void mutex_set_owner(struct mutex *lock) 36static inline void mutex_set_owner(struct mutex *lock)
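The mutex owner is read locklessly by the optimistic-spinning code, which is why the stores above go through WRITE_ONCE(). A userspace stand-in for that pattern; these macros are simplified approximations of the kernel's READ_ONCE/WRITE_ONCE, not the real definitions:

#include <stdio.h>

/* Simplified stand-ins: a volatile access forces one full-width store or
 * load and keeps the compiler from tearing or caching the value. */
#define WRITE_ONCE(x, val)	(*(volatile __typeof__(x) *)&(x) = (val))
#define READ_ONCE(x)		(*(volatile __typeof__(x) *)&(x))

struct task { const char *name; };
struct fake_mutex { struct task *owner; };

static void set_owner(struct fake_mutex *lock, struct task *task)
{
	WRITE_ONCE(lock->owner, task);		/* single, untorn store */
}

int main(void)
{
	struct task me = { "me" };
	struct fake_mutex lock = { 0 };

	set_owner(&lock, &me);
	printf("owner: %s\n", READ_ONCE(lock.owner)->name);
	return 0;
}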
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index fec082338668..19248ddf37ce 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -93,7 +93,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
93 * that accesses can't leak upwards out of our subsequent critical 93 * that accesses can't leak upwards out of our subsequent critical
94 * section in the case that the lock is currently held for write. 94 * section in the case that the lock is currently held for write.
95 */ 95 */
96 cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS; 96 cnts = atomic_fetch_add_acquire(_QR_BIAS, &lock->cnts);
97 rspin_until_writer_unlock(lock, cnts); 97 rspin_until_writer_unlock(lock, cnts);
98 98
99 /* 99 /*
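The qrwlock change above replaces an add-and-return-new-value primitive (followed by subtracting the bias back out) with a fetch-and-add that returns the old value directly. The equivalence, checked with C11 atomics and an illustrative bias value:

#include <stdio.h>
#include <stdatomic.h>

#define QR_BIAS 0x100		/* illustrative reader-bias increment */

int main(void)
{
	atomic_uint cnts = 0x1;	/* some pre-existing lock state */
	unsigned int old, newv;

	/* fetch_add returns the value *before* the addition ... */
	old = atomic_fetch_add(&cnts, QR_BIAS);

	/* ... which equals "new value minus the bias". */
	newv = atomic_load(&cnts);
	printf("old=%#x new-bias=%#x equal=%d\n",
	       old, newv - QR_BIAS, old == newv - QR_BIAS);
	return 0;
}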
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index ce2f75e32ae1..b2caec7315af 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -90,7 +90,7 @@ static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
90 * therefore increment the cpu number by one. 90 * therefore increment the cpu number by one.
91 */ 91 */
92 92
93static inline u32 encode_tail(int cpu, int idx) 93static inline __pure u32 encode_tail(int cpu, int idx)
94{ 94{
95 u32 tail; 95 u32 tail;
96 96
@@ -103,7 +103,7 @@ static inline u32 encode_tail(int cpu, int idx)
103 return tail; 103 return tail;
104} 104}
105 105
106static inline struct mcs_spinlock *decode_tail(u32 tail) 106static inline __pure struct mcs_spinlock *decode_tail(u32 tail)
107{ 107{
108 int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; 108 int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
109 int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; 109 int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
@@ -267,6 +267,123 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
267#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath 267#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
268#endif 268#endif
269 269
270/*
271 * Various notes on spin_is_locked() and spin_unlock_wait(), which are
272 * 'interesting' functions:
273 *
274 * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE
275 * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64,
276 * PPC). Also qspinlock has a similar issue per construction, the setting of
277 * the locked byte can be unordered with respect to acquiring the lock proper.
278 *
279 * This gets to be 'interesting' in the following cases, where the /should/s
280 * end up false because of this issue.
281 *
282 *
283 * CASE 1:
284 *
285 * So the spin_is_locked() correctness issue comes from something like:
286 *
287 * CPU0 CPU1
288 *
289 * global_lock(); local_lock(i)
290 * spin_lock(&G) spin_lock(&L[i])
291 * for (i) if (!spin_is_locked(&G)) {
292 * spin_unlock_wait(&L[i]); smp_acquire__after_ctrl_dep();
293 * return;
294 * }
295 * // deal with fail
296 *
297 * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such
298 * that there is exclusion between the two critical sections.
299 *
300 * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from
301 * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i])
302 * /should/ be constrained by the ACQUIRE from spin_lock(&G).
303 *
304 * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB.
305 *
306 *
307 * CASE 2:
308 *
309 * For spin_unlock_wait() there is a second correctness issue, namely:
310 *
311 * CPU0 CPU1
312 *
313 * flag = set;
314 * smp_mb(); spin_lock(&l)
315 * spin_unlock_wait(&l); if (!flag)
316 * // add to lockless list
317 * spin_unlock(&l);
318 * // iterate lockless list
319 *
320 * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0
321 * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE
322 * semantics etc..)
323 *
324 * Where flag /should/ be ordered against the locked store of l.
325 */
326
327/*
328 * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before
329 * issuing an _unordered_ store to set _Q_LOCKED_VAL.
330 *
331 * This means that the store can be delayed, but no later than the
332 * store-release from the unlock. This means that simply observing
333 * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired.
334 *
335 * There are two paths that can issue the unordered store:
336 *
337 * (1) clear_pending_set_locked(): *,1,0 -> *,0,1
338 *
339 * (2) set_locked(): t,0,0 -> t,0,1 ; t != 0
340 * atomic_cmpxchg_relaxed(): t,0,0 -> 0,0,1
341 *
342 * However, in both cases we have other !0 state we've set before to queue
343 * ourselves:
344 *
345 * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our
346 * load is constrained by that ACQUIRE to not pass before that, and thus must
347 * observe the store.
348 *
349 * For (2) we have a more interesting scenario. We enqueue ourselves using
350 * xchg_tail(), which ends up being a RELEASE. This in itself is not
351 * sufficient, however that is followed by an smp_cond_acquire() on the same
352 * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and
353 * guarantees we must observe that store.
354 *
355 * Therefore both cases have other !0 state that is observable before the
356 * unordered locked byte store comes through. This means we can use that to
357 * wait for the lock store, and then wait for an unlock.
358 */
359#ifndef queued_spin_unlock_wait
360void queued_spin_unlock_wait(struct qspinlock *lock)
361{
362 u32 val;
363
364 for (;;) {
365 val = atomic_read(&lock->val);
366
367 if (!val) /* not locked, we're done */
368 goto done;
369
370 if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */
371 break;
372
373 /* not locked, but pending, wait until we observe the lock */
374 cpu_relax();
375 }
376
377 /* any unlock is good */
378 while (atomic_read(&lock->val) & _Q_LOCKED_MASK)
379 cpu_relax();
380
381done:
382 smp_acquire__after_ctrl_dep();
383}
384EXPORT_SYMBOL(queued_spin_unlock_wait);
385#endif
386
270#endif /* _GEN_PV_LOCK_SLOWPATH */ 387#endif /* _GEN_PV_LOCK_SLOWPATH */
271 388
272/** 389/**
@@ -358,7 +475,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
358 * sequentiality; this is because not all clear_pending_set_locked() 475 * sequentiality; this is because not all clear_pending_set_locked()
359 * implementations imply full barriers. 476 * implementations imply full barriers.
360 */ 477 */
361 smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK)); 478 smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK));
362 479
363 /* 480 /*
364 * take ownership and clear the pending bit. 481 * take ownership and clear the pending bit.
@@ -395,6 +512,8 @@ queue:
395 * pending stuff. 512 * pending stuff.
396 * 513 *
397 * p,*,* -> n,*,* 514 * p,*,* -> n,*,*
515 *
516 * RELEASE, such that the stores to @node must be complete.
398 */ 517 */
399 old = xchg_tail(lock, tail); 518 old = xchg_tail(lock, tail);
400 next = NULL; 519 next = NULL;
@@ -405,6 +524,15 @@ queue:
405 */ 524 */
406 if (old & _Q_TAIL_MASK) { 525 if (old & _Q_TAIL_MASK) {
407 prev = decode_tail(old); 526 prev = decode_tail(old);
527 /*
528 * The above xchg_tail() is also a load of @lock which generates,
529 * through decode_tail(), a pointer.
530 *
531 * The address dependency matches the RELEASE of xchg_tail()
532 * such that the access to @prev must happen after.
533 */
534 smp_read_barrier_depends();
535
408 WRITE_ONCE(prev->next, node); 536 WRITE_ONCE(prev->next, node);
409 537
410 pv_wait_node(node, prev); 538 pv_wait_node(node, prev);
@@ -434,7 +562,7 @@ queue:
434 * 562 *
435 * The PV pv_wait_head_or_lock function, if active, will acquire 563 * The PV pv_wait_head_or_lock function, if active, will acquire
436 * the lock and return a non-zero value. So we have to skip the 564 * the lock and return a non-zero value. So we have to skip the
437 * smp_cond_acquire() call. As the next PV queue head hasn't been 565 * smp_cond_load_acquire() call. As the next PV queue head hasn't been
438 * designated yet, there is no way for the locked value to become 566 * designated yet, there is no way for the locked value to become
439 * _Q_SLOW_VAL. So both the set_locked() and the 567 * _Q_SLOW_VAL. So both the set_locked() and the
440 * atomic_cmpxchg_relaxed() calls will be safe. 568 * atomic_cmpxchg_relaxed() calls will be safe.
@@ -445,7 +573,7 @@ queue:
445 if ((val = pv_wait_head_or_lock(lock, node))) 573 if ((val = pv_wait_head_or_lock(lock, node)))
446 goto locked; 574 goto locked;
447 575
448 smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK)); 576 val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK));
449 577
450locked: 578locked:
451 /* 579 /*
@@ -465,9 +593,9 @@ locked:
465 break; 593 break;
466 } 594 }
467 /* 595 /*
468 * The smp_cond_acquire() call above has provided the necessary 596 * The smp_cond_load_acquire() call above has provided the
469 * acquire semantics required for locking. At most two 597 * necessary acquire semantics required for locking. At most
470 * iterations of this loop may be run. 598 * two iterations of this loop may be run.
471 */ 599 */
472 old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL); 600 old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
473 if (old == val) 601 if (old == val)
@@ -491,7 +619,7 @@ release:
491 /* 619 /*
492 * release the node 620 * release the node
493 */ 621 */
494 this_cpu_dec(mcs_nodes[0].count); 622 __this_cpu_dec(mcs_nodes[0].count);
495} 623}
496EXPORT_SYMBOL(queued_spin_lock_slowpath); 624EXPORT_SYMBOL(queued_spin_lock_slowpath);
497 625
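The hunks above swap smp_cond_acquire() for smp_cond_load_acquire(), which spins on one variable until a predicate over the loaded value (exposed as VAL) holds, and gives the satisfying load acquire semantics. A minimal userspace analogue using C11 atomics is sketched below; the macro name, the acquire-on-every-iteration shortcut, and the demo values are illustrative, not the kernel implementation.

/* cond_load_acquire_demo.c -- illustrative sketch, not kernel code. */
#include <stdatomic.h>
#include <stdio.h>

/*
 * Spin until cond_expr (written in terms of VAL) is true. Loading with
 * acquire on every iteration is stronger than the kernel primitive needs,
 * but keeps the sketch simple: whatever was published before the matching
 * release store is visible once the loop exits.
 */
#define cond_load_acquire(ptr, cond_expr)                               \
({                                                                      \
        unsigned int VAL;                                               \
        do {                                                            \
                VAL = atomic_load_explicit(ptr, memory_order_acquire);  \
        } while (!(cond_expr));                                         \
        VAL;                                                            \
})

static atomic_uint lockword = 1;        /* bit 0 stands in for _Q_LOCKED_MASK */

int main(void)
{
        /* Pretend another CPU released the lock with a release store. */
        atomic_store_explicit(&lockword, 0, memory_order_release);

        unsigned int v = cond_load_acquire(&lockword, !(VAL & 1));
        printf("observed lock word: %u\n", v);
        return 0;
}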
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 21ede57f68b3..37649e69056c 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -112,12 +112,12 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
112#else /* _Q_PENDING_BITS == 8 */ 112#else /* _Q_PENDING_BITS == 8 */
113static __always_inline void set_pending(struct qspinlock *lock) 113static __always_inline void set_pending(struct qspinlock *lock)
114{ 114{
115 atomic_set_mask(_Q_PENDING_VAL, &lock->val); 115 atomic_or(_Q_PENDING_VAL, &lock->val);
116} 116}
117 117
118static __always_inline void clear_pending(struct qspinlock *lock) 118static __always_inline void clear_pending(struct qspinlock *lock)
119{ 119{
120 atomic_clear_mask(_Q_PENDING_VAL, &lock->val); 120 atomic_andnot(_Q_PENDING_VAL, &lock->val);
121} 121}
122 122
123static __always_inline int trylock_clear_pending(struct qspinlock *lock) 123static __always_inline int trylock_clear_pending(struct qspinlock *lock)
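For reference, the set_pending()/clear_pending() conversion above is a straight move to fetch-or / fetch-and-with-complement; a standalone C11 sketch of the same idea (the bit value is an assumption made for illustration):

/* pending_bit_demo.c -- illustrative only; mirrors atomic_or()/atomic_andnot(). */
#include <stdatomic.h>
#include <stdio.h>

#define Q_PENDING_VAL (1u << 8)         /* assumed bit position */

static atomic_uint lockval;

static void set_pending(void)
{
        atomic_fetch_or_explicit(&lockval, Q_PENDING_VAL, memory_order_relaxed);
}

static void clear_pending(void)
{
        atomic_fetch_and_explicit(&lockval, ~Q_PENDING_VAL, memory_order_relaxed);
}

int main(void)
{
        set_pending();
        printf("after set:   0x%x\n", atomic_load(&lockval));
        clear_pending();
        printf("after clear: 0x%x\n", atomic_load(&lockval));
        return 0;
}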
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 3e746607abe5..1ec0f48962b3 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1478,7 +1478,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
1478 */ 1478 */
1479int __sched rt_mutex_trylock(struct rt_mutex *lock) 1479int __sched rt_mutex_trylock(struct rt_mutex *lock)
1480{ 1480{
1481 if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq())) 1481 if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
1482 return 0; 1482 return 0;
1483 1483
1484 return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); 1484 return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 09e30c6225e5..447e08de1fab 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -80,7 +80,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
80 debug_check_no_locks_freed((void *)sem, sizeof(*sem)); 80 debug_check_no_locks_freed((void *)sem, sizeof(*sem));
81 lockdep_init_map(&sem->dep_map, name, key, 0); 81 lockdep_init_map(&sem->dep_map, name, key, 0);
82#endif 82#endif
83 sem->count = RWSEM_UNLOCKED_VALUE; 83 atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
84 raw_spin_lock_init(&sem->wait_lock); 84 raw_spin_lock_init(&sem->wait_lock);
85 INIT_LIST_HEAD(&sem->wait_list); 85 INIT_LIST_HEAD(&sem->wait_list);
86#ifdef CONFIG_RWSEM_SPIN_ON_OWNER 86#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
@@ -114,12 +114,16 @@ enum rwsem_wake_type {
114 * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) 114 * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed)
115 * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) 115 * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so)
116 * - there must be someone on the queue 116 * - there must be someone on the queue
117 * - the spinlock must be held by the caller 117 * - the wait_lock must be held by the caller
118 * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
119 * to actually wake up the blocked task(s) and drop the reference count,
120 * preferably when the wait_lock is released
118 * - woken process blocks are discarded from the list after having task zeroed 121 * - woken process blocks are discarded from the list after having task zeroed
119 * - writers are only woken if downgrading is false 122 * - writers are only marked woken if downgrading is false
120 */ 123 */
121static struct rw_semaphore * 124static struct rw_semaphore *
122__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) 125__rwsem_mark_wake(struct rw_semaphore *sem,
126 enum rwsem_wake_type wake_type, struct wake_q_head *wake_q)
123{ 127{
124 struct rwsem_waiter *waiter; 128 struct rwsem_waiter *waiter;
125 struct task_struct *tsk; 129 struct task_struct *tsk;
@@ -128,13 +132,16 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
128 132
129 waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); 133 waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
130 if (waiter->type == RWSEM_WAITING_FOR_WRITE) { 134 if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
131 if (wake_type == RWSEM_WAKE_ANY) 135 if (wake_type == RWSEM_WAKE_ANY) {
132 /* Wake writer at the front of the queue, but do not 136 /*
133 * grant it the lock yet as we want other writers 137 * Mark writer at the front of the queue for wakeup.
134 * to be able to steal it. Readers, on the other hand, 138 * Until the task is actually awoken later by
135 * will block as they will notice the queued writer. 139 * the caller, other writers are able to steal it.
140 * Readers, on the other hand, will block as they
141 * will notice the queued writer.
136 */ 142 */
137 wake_up_process(waiter->task); 143 wake_q_add(wake_q, waiter->task);
144 }
138 goto out; 145 goto out;
139 } 146 }
140 147
@@ -146,15 +153,27 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
146 if (wake_type != RWSEM_WAKE_READ_OWNED) { 153 if (wake_type != RWSEM_WAKE_READ_OWNED) {
147 adjustment = RWSEM_ACTIVE_READ_BIAS; 154 adjustment = RWSEM_ACTIVE_READ_BIAS;
148 try_reader_grant: 155 try_reader_grant:
149 oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; 156 oldcount = atomic_long_fetch_add(adjustment, &sem->count);
157
150 if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { 158 if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
151 /* A writer stole the lock. Undo our reader grant. */ 159 /*
152 if (rwsem_atomic_update(-adjustment, sem) & 160 * If the count is still less than RWSEM_WAITING_BIAS
153 RWSEM_ACTIVE_MASK) 161 * after removing the adjustment, it is assumed that
162 * a writer has stolen the lock. We have to undo our
163 * reader grant.
164 */
165 if (atomic_long_add_return(-adjustment, &sem->count) <
166 RWSEM_WAITING_BIAS)
154 goto out; 167 goto out;
155 /* Last active locker left. Retry waking readers. */ 168 /* Last active locker left. Retry waking readers. */
156 goto try_reader_grant; 169 goto try_reader_grant;
157 } 170 }
171 /*
172 * It is not really necessary to set it to reader-owned here,
173 * but it gives the spinners an early indication that the
174 * readers now have the lock.
175 */
176 rwsem_set_reader_owned(sem);
158 } 177 }
159 178
160 /* Grant an infinite number of read locks to the readers at the front 179 /* Grant an infinite number of read locks to the readers at the front
@@ -179,7 +198,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
179 adjustment -= RWSEM_WAITING_BIAS; 198 adjustment -= RWSEM_WAITING_BIAS;
180 199
181 if (adjustment) 200 if (adjustment)
182 rwsem_atomic_add(adjustment, sem); 201 atomic_long_add(adjustment, &sem->count);
183 202
184 next = sem->wait_list.next; 203 next = sem->wait_list.next;
185 loop = woken; 204 loop = woken;
@@ -187,17 +206,15 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
187 waiter = list_entry(next, struct rwsem_waiter, list); 206 waiter = list_entry(next, struct rwsem_waiter, list);
188 next = waiter->list.next; 207 next = waiter->list.next;
189 tsk = waiter->task; 208 tsk = waiter->task;
209
210 wake_q_add(wake_q, tsk);
190 /* 211 /*
191 * Make sure we do not wakeup the next reader before 212 * Ensure that the last operation is setting the reader
192 * setting the nil condition to grant the next reader; 213 * waiter to nil such that rwsem_down_read_failed() cannot
193 * otherwise we could miss the wakeup on the other 214 * race with do_exit() by always holding a reference count
194 * side and end up sleeping again. See the pairing 215 * to the task to wakeup.
195 * in rwsem_down_read_failed().
196 */ 216 */
197 smp_mb(); 217 smp_store_release(&waiter->task, NULL);
198 waiter->task = NULL;
199 wake_up_process(tsk);
200 put_task_struct(tsk);
201 } while (--loop); 218 } while (--loop);
202 219
203 sem->wait_list.next = next; 220 sem->wait_list.next = next;
@@ -216,11 +233,11 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
216 long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; 233 long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
217 struct rwsem_waiter waiter; 234 struct rwsem_waiter waiter;
218 struct task_struct *tsk = current; 235 struct task_struct *tsk = current;
236 WAKE_Q(wake_q);
219 237
220 /* set up my own style of waitqueue */ 238 /* set up my own style of waitqueue */
221 waiter.task = tsk; 239 waiter.task = tsk;
222 waiter.type = RWSEM_WAITING_FOR_READ; 240 waiter.type = RWSEM_WAITING_FOR_READ;
223 get_task_struct(tsk);
224 241
225 raw_spin_lock_irq(&sem->wait_lock); 242 raw_spin_lock_irq(&sem->wait_lock);
226 if (list_empty(&sem->wait_list)) 243 if (list_empty(&sem->wait_list))
@@ -228,7 +245,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
228 list_add_tail(&waiter.list, &sem->wait_list); 245 list_add_tail(&waiter.list, &sem->wait_list);
229 246
230 /* we're now waiting on the lock, but no longer actively locking */ 247 /* we're now waiting on the lock, but no longer actively locking */
231 count = rwsem_atomic_update(adjustment, sem); 248 count = atomic_long_add_return(adjustment, &sem->count);
232 249
233 /* If there are no active locks, wake the front queued process(es). 250 /* If there are no active locks, wake the front queued process(es).
234 * 251 *
@@ -238,9 +255,10 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
238 if (count == RWSEM_WAITING_BIAS || 255 if (count == RWSEM_WAITING_BIAS ||
239 (count > RWSEM_WAITING_BIAS && 256 (count > RWSEM_WAITING_BIAS &&
240 adjustment != -RWSEM_ACTIVE_READ_BIAS)) 257 adjustment != -RWSEM_ACTIVE_READ_BIAS))
241 sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); 258 sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
242 259
243 raw_spin_unlock_irq(&sem->wait_lock); 260 raw_spin_unlock_irq(&sem->wait_lock);
261 wake_up_q(&wake_q);
244 262
245 /* wait to be given the lock */ 263 /* wait to be given the lock */
246 while (true) { 264 while (true) {
@@ -255,17 +273,29 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
255} 273}
256EXPORT_SYMBOL(rwsem_down_read_failed); 274EXPORT_SYMBOL(rwsem_down_read_failed);
257 275
276/*
277 * This function must be called with the sem->wait_lock held to prevent
278 * race conditions between checking the rwsem wait list and setting the
279 * sem->count accordingly.
280 */
258static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) 281static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
259{ 282{
260 /* 283 /*
261 * Try acquiring the write lock. Check count first in order 284 * Avoid trying to acquire write lock if count isn't RWSEM_WAITING_BIAS.
262 * to reduce unnecessary expensive cmpxchg() operations.
263 */ 285 */
264 if (count == RWSEM_WAITING_BIAS && 286 if (count != RWSEM_WAITING_BIAS)
265 cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, 287 return false;
266 RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { 288
267 if (!list_is_singular(&sem->wait_list)) 289 /*
268 rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); 290 * Acquire the lock by trying to set it to ACTIVE_WRITE_BIAS. If there
291 * are other tasks on the wait list, we need to add on WAITING_BIAS.
292 */
293 count = list_is_singular(&sem->wait_list) ?
294 RWSEM_ACTIVE_WRITE_BIAS :
295 RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS;
296
297 if (atomic_long_cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count)
298 == RWSEM_WAITING_BIAS) {
269 rwsem_set_owner(sem); 299 rwsem_set_owner(sem);
270 return true; 300 return true;
271 } 301 }
@@ -279,13 +309,13 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
279 */ 309 */
280static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) 310static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
281{ 311{
282 long old, count = READ_ONCE(sem->count); 312 long old, count = atomic_long_read(&sem->count);
283 313
284 while (true) { 314 while (true) {
285 if (!(count == 0 || count == RWSEM_WAITING_BIAS)) 315 if (!(count == 0 || count == RWSEM_WAITING_BIAS))
286 return false; 316 return false;
287 317
288 old = cmpxchg_acquire(&sem->count, count, 318 old = atomic_long_cmpxchg_acquire(&sem->count, count,
289 count + RWSEM_ACTIVE_WRITE_BIAS); 319 count + RWSEM_ACTIVE_WRITE_BIAS);
290 if (old == count) { 320 if (old == count) {
291 rwsem_set_owner(sem); 321 rwsem_set_owner(sem);
@@ -306,16 +336,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
306 336
307 rcu_read_lock(); 337 rcu_read_lock();
308 owner = READ_ONCE(sem->owner); 338 owner = READ_ONCE(sem->owner);
309 if (!owner) { 339 if (!rwsem_owner_is_writer(owner)) {
310 long count = READ_ONCE(sem->count);
311 /* 340 /*
312 * If sem->owner is not set, yet we have just recently entered the 341 * Don't spin if the rwsem is readers owned.
313 * slowpath with the lock being active, then there is a possibility
314 * reader(s) may have the lock. To be safe, bail spinning in these
315 * situations.
316 */ 342 */
317 if (count & RWSEM_ACTIVE_MASK) 343 ret = !rwsem_owner_is_reader(owner);
318 ret = false;
319 goto done; 344 goto done;
320 } 345 }
321 346
@@ -325,10 +350,15 @@ done:
325 return ret; 350 return ret;
326} 351}
327 352
328static noinline 353/*
329bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) 354 * Return true only if we can still spin on the owner field of the rwsem.
355 */
356static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
330{ 357{
331 long count; 358 struct task_struct *owner = READ_ONCE(sem->owner);
359
360 if (!rwsem_owner_is_writer(owner))
361 goto out;
332 362
333 rcu_read_lock(); 363 rcu_read_lock();
334 while (sem->owner == owner) { 364 while (sem->owner == owner) {
@@ -349,22 +379,16 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
349 cpu_relax_lowlatency(); 379 cpu_relax_lowlatency();
350 } 380 }
351 rcu_read_unlock(); 381 rcu_read_unlock();
352 382out:
353 if (READ_ONCE(sem->owner))
354 return true; /* new owner, continue spinning */
355
356 /* 383 /*
357 * When the owner is not set, the lock could be free or 384 * If there is a new owner or the owner is not set, we continue
358 * held by readers. Check the counter to verify the 385 * spinning.
359 * state.
360 */ 386 */
361 count = READ_ONCE(sem->count); 387 return !rwsem_owner_is_reader(READ_ONCE(sem->owner));
362 return (count == 0 || count == RWSEM_WAITING_BIAS);
363} 388}
364 389
365static bool rwsem_optimistic_spin(struct rw_semaphore *sem) 390static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
366{ 391{
367 struct task_struct *owner;
368 bool taken = false; 392 bool taken = false;
369 393
370 preempt_disable(); 394 preempt_disable();
@@ -376,12 +400,17 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
376 if (!osq_lock(&sem->osq)) 400 if (!osq_lock(&sem->osq))
377 goto done; 401 goto done;
378 402
379 while (true) { 403 /*
380 owner = READ_ONCE(sem->owner); 404 * Optimistically spin on the owner field and attempt to acquire the
381 if (owner && !rwsem_spin_on_owner(sem, owner)) 405 * lock whenever the owner changes. Spinning will be stopped when:
382 break; 406 * 1) the owning writer isn't running; or
383 407 * 2) readers own the lock as we can't determine if they are
384 /* wait_lock will be acquired if write_lock is obtained */ 408 * actively running or not.
409 */
410 while (rwsem_spin_on_owner(sem)) {
411 /*
412 * Try to acquire the lock
413 */
385 if (rwsem_try_write_lock_unqueued(sem)) { 414 if (rwsem_try_write_lock_unqueued(sem)) {
386 taken = true; 415 taken = true;
387 break; 416 break;
@@ -393,7 +422,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
393 * we're an RT task that will live-lock because we won't let 422 * we're an RT task that will live-lock because we won't let
394 * the owner complete. 423 * the owner complete.
395 */ 424 */
396 if (!owner && (need_resched() || rt_task(current))) 425 if (!sem->owner && (need_resched() || rt_task(current)))
397 break; 426 break;
398 427
399 /* 428 /*
@@ -440,9 +469,10 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
440 bool waiting = true; /* any queued threads before us */ 469 bool waiting = true; /* any queued threads before us */
441 struct rwsem_waiter waiter; 470 struct rwsem_waiter waiter;
442 struct rw_semaphore *ret = sem; 471 struct rw_semaphore *ret = sem;
472 WAKE_Q(wake_q);
443 473
444 /* undo write bias from down_write operation, stop active locking */ 474 /* undo write bias from down_write operation, stop active locking */
445 count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); 475 count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count);
446 476
447 /* do optimistic spinning and steal lock if possible */ 477 /* do optimistic spinning and steal lock if possible */
448 if (rwsem_optimistic_spin(sem)) 478 if (rwsem_optimistic_spin(sem))
@@ -465,18 +495,29 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
465 495
466 /* we're now waiting on the lock, but no longer actively locking */ 496 /* we're now waiting on the lock, but no longer actively locking */
467 if (waiting) { 497 if (waiting) {
468 count = READ_ONCE(sem->count); 498 count = atomic_long_read(&sem->count);
469 499
470 /* 500 /*
471 * If there were already threads queued before us and there are 501 * If there were already threads queued before us and there are
472 * no active writers, the lock must be read owned; so we try to 502 * no active writers, the lock must be read owned; so we try to
473 * wake any read locks that were queued ahead of us. 503 * wake any read locks that were queued ahead of us.
474 */ 504 */
475 if (count > RWSEM_WAITING_BIAS) 505 if (count > RWSEM_WAITING_BIAS) {
476 sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); 506 WAKE_Q(wake_q);
507
508 sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q);
509 /*
510 * The wakeup is normally called _after_ the wait_lock
511 * is released, but given that we are proactively waking
512 * readers we can deal with the wake_q overhead as it is
513 * similar to releasing and taking the wait_lock again
514 * for attempting rwsem_try_write_lock().
515 */
516 wake_up_q(&wake_q);
517 }
477 518
478 } else 519 } else
479 count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); 520 count = atomic_long_add_return(RWSEM_WAITING_BIAS, &sem->count);
480 521
481 /* wait until we successfully acquire the lock */ 522 /* wait until we successfully acquire the lock */
482 set_current_state(state); 523 set_current_state(state);
@@ -492,7 +533,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
492 533
493 schedule(); 534 schedule();
494 set_current_state(state); 535 set_current_state(state);
495 } while ((count = sem->count) & RWSEM_ACTIVE_MASK); 536 } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK);
496 537
497 raw_spin_lock_irq(&sem->wait_lock); 538 raw_spin_lock_irq(&sem->wait_lock);
498 } 539 }
@@ -507,10 +548,11 @@ out_nolock:
507 raw_spin_lock_irq(&sem->wait_lock); 548 raw_spin_lock_irq(&sem->wait_lock);
508 list_del(&waiter.list); 549 list_del(&waiter.list);
509 if (list_empty(&sem->wait_list)) 550 if (list_empty(&sem->wait_list))
510 rwsem_atomic_update(-RWSEM_WAITING_BIAS, sem); 551 atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
511 else 552 else
512 __rwsem_do_wake(sem, RWSEM_WAKE_ANY); 553 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
513 raw_spin_unlock_irq(&sem->wait_lock); 554 raw_spin_unlock_irq(&sem->wait_lock);
555 wake_up_q(&wake_q);
514 556
515 return ERR_PTR(-EINTR); 557 return ERR_PTR(-EINTR);
516} 558}
@@ -537,6 +579,7 @@ __visible
537struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) 579struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
538{ 580{
539 unsigned long flags; 581 unsigned long flags;
582 WAKE_Q(wake_q);
540 583
541 /* 584 /*
542 * If a spinner is present, it is not necessary to do the wakeup. 585 * If a spinner is present, it is not necessary to do the wakeup.
@@ -573,9 +616,10 @@ locked:
573 616
574 /* do nothing if list empty */ 617 /* do nothing if list empty */
575 if (!list_empty(&sem->wait_list)) 618 if (!list_empty(&sem->wait_list))
576 sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); 619 sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
577 620
578 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 621 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
622 wake_up_q(&wake_q);
579 623
580 return sem; 624 return sem;
581} 625}
@@ -590,14 +634,16 @@ __visible
590struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) 634struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
591{ 635{
592 unsigned long flags; 636 unsigned long flags;
637 WAKE_Q(wake_q);
593 638
594 raw_spin_lock_irqsave(&sem->wait_lock, flags); 639 raw_spin_lock_irqsave(&sem->wait_lock, flags);
595 640
596 /* do nothing if list empty */ 641 /* do nothing if list empty */
597 if (!list_empty(&sem->wait_list)) 642 if (!list_empty(&sem->wait_list))
598 sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); 643 sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
599 644
600 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 645 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
646 wake_up_q(&wake_q);
601 647
602 return sem; 648 return sem;
603} 649}
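The rwsem-xadd.c changes convert __rwsem_do_wake() into __rwsem_mark_wake(): waiters are only collected on a wake_q while sem->wait_lock is held, and the actual wakeups happen via wake_up_q() after the lock is dropped, which shortens the critical section. A rough userspace analogue of that pattern is sketched below; the names mirror the kernel API but the code uses POSIX primitives and is not the kernel implementation.

/* wake_q_demo.c -- deferred-wakeup sketch. Build with: cc wake_q_demo.c -lpthread */
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

struct waiter {
        sem_t sleep;                    /* what the blocked "task" sleeps on */
        struct waiter *next;
};

struct wake_q {
        struct waiter *head;
};

static pthread_mutex_t wait_lock = PTHREAD_MUTEX_INITIALIZER;
static struct waiter *wait_list;        /* protected by wait_lock */

static void wake_q_add(struct wake_q *q, struct waiter *w)
{
        w->next = q->head;
        q->head = w;
}

static void wake_up_q(struct wake_q *q)
{
        for (struct waiter *w = q->head; w; w = w->next)
                sem_post(&w->sleep);    /* the real wakeup, issued after unlock */
}

static void *waiter_fn(void *arg)
{
        struct waiter *w = arg;

        sem_wait(&w->sleep);
        puts("waiter: woken up");
        return NULL;
}

int main(void)
{
        struct waiter w;
        struct wake_q q = { NULL };
        pthread_t t;

        sem_init(&w.sleep, 0, 0);

        pthread_mutex_lock(&wait_lock);
        w.next = wait_list;
        wait_list = &w;                 /* enqueue, as the slowpaths do */
        pthread_mutex_unlock(&wait_lock);

        pthread_create(&t, NULL, waiter_fn, &w);

        /* "Unlock" path: mark waiters under the lock, wake them afterwards. */
        pthread_mutex_lock(&wait_lock);
        wake_q_add(&q, wait_list);
        wait_list = NULL;
        pthread_mutex_unlock(&wait_lock);
        wake_up_q(&q);                  /* no wait_lock held while waking */

        pthread_join(t, NULL);
        sem_destroy(&w.sleep);
        return 0;
}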
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 2e853ad93a3a..45ba475d4be3 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -22,6 +22,7 @@ void __sched down_read(struct rw_semaphore *sem)
22 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); 22 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
23 23
24 LOCK_CONTENDED(sem, __down_read_trylock, __down_read); 24 LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
25 rwsem_set_reader_owned(sem);
25} 26}
26 27
27EXPORT_SYMBOL(down_read); 28EXPORT_SYMBOL(down_read);
@@ -33,8 +34,10 @@ int down_read_trylock(struct rw_semaphore *sem)
33{ 34{
34 int ret = __down_read_trylock(sem); 35 int ret = __down_read_trylock(sem);
35 36
36 if (ret == 1) 37 if (ret == 1) {
37 rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); 38 rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
39 rwsem_set_reader_owned(sem);
40 }
38 return ret; 41 return ret;
39} 42}
40 43
@@ -124,7 +127,7 @@ void downgrade_write(struct rw_semaphore *sem)
124 * lockdep: a downgraded write will live on as a write 127 * lockdep: a downgraded write will live on as a write
125 * dependency. 128 * dependency.
126 */ 129 */
127 rwsem_clear_owner(sem); 130 rwsem_set_reader_owned(sem);
128 __downgrade_write(sem); 131 __downgrade_write(sem);
129} 132}
130 133
@@ -138,6 +141,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
138 rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); 141 rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
139 142
140 LOCK_CONTENDED(sem, __down_read_trylock, __down_read); 143 LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
144 rwsem_set_reader_owned(sem);
141} 145}
142 146
143EXPORT_SYMBOL(down_read_nested); 147EXPORT_SYMBOL(down_read_nested);
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index 870ed9a5b426..a699f4048ba1 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -1,14 +1,58 @@
1/*
2 * The owner field of the rw_semaphore structure will be set to
3 * RWSEM_READER_OWNED when a reader grabs the lock. A writer will clear
4 * the owner field when it unlocks. A reader, on the other hand, will
5 * not touch the owner field when it unlocks.
6 *
7 * In essence, the owner field now has the following 3 states:
8 * 1) 0
9 * - lock is free or the owner hasn't set the field yet
10 * 2) RWSEM_READER_OWNED
11 * - lock is currently or previously owned by readers (lock is free
12 * or not set by owner yet)
13 * 3) Other non-zero value
14 * - a writer owns the lock
15 */
16#define RWSEM_READER_OWNED ((struct task_struct *)1UL)
17
1#ifdef CONFIG_RWSEM_SPIN_ON_OWNER 18#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
19/*
20 * All writes to owner are protected by WRITE_ONCE() to make sure that
21 * store tearing can't happen as optimistic spinners may read and use
22 * the owner value concurrently without lock. Read from owner, however,
23 * may not need READ_ONCE() as long as the pointer value is only used
24 * for comparison and isn't being dereferenced.
25 */
2static inline void rwsem_set_owner(struct rw_semaphore *sem) 26static inline void rwsem_set_owner(struct rw_semaphore *sem)
3{ 27{
4 sem->owner = current; 28 WRITE_ONCE(sem->owner, current);
5} 29}
6 30
7static inline void rwsem_clear_owner(struct rw_semaphore *sem) 31static inline void rwsem_clear_owner(struct rw_semaphore *sem)
8{ 32{
9 sem->owner = NULL; 33 WRITE_ONCE(sem->owner, NULL);
34}
35
36static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
37{
38 /*
39 * We check the owner value first to make sure that we will only
40 * do a write to the rwsem cacheline when it is really necessary
41 * to minimize cacheline contention.
42 */
43 if (sem->owner != RWSEM_READER_OWNED)
44 WRITE_ONCE(sem->owner, RWSEM_READER_OWNED);
45}
46
47static inline bool rwsem_owner_is_writer(struct task_struct *owner)
48{
49 return owner && owner != RWSEM_READER_OWNED;
10} 50}
11 51
52static inline bool rwsem_owner_is_reader(struct task_struct *owner)
53{
54 return owner == RWSEM_READER_OWNED;
55}
12#else 56#else
13static inline void rwsem_set_owner(struct rw_semaphore *sem) 57static inline void rwsem_set_owner(struct rw_semaphore *sem)
14{ 58{
@@ -17,4 +61,8 @@ static inline void rwsem_set_owner(struct rw_semaphore *sem)
17static inline void rwsem_clear_owner(struct rw_semaphore *sem) 61static inline void rwsem_clear_owner(struct rw_semaphore *sem)
18{ 62{
19} 63}
64
65static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
66{
67}
20#endif 68#endif
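The new rwsem.h encodes three owner states in one pointer by reserving address 1 as a "reader owned" sentinel that is compared against but never dereferenced. A small standalone sketch of the same encoding, with hypothetical type and helper names:

/* owner_state_demo.c -- illustrative sketch of the 3-state owner pointer. */
#include <stdio.h>

struct task { const char *name; };

#define READER_OWNED ((struct task *)1UL)       /* sentinel, never dereferenced */

static int owner_is_writer(struct task *owner)
{
        return owner && owner != READER_OWNED;
}

static int owner_is_reader(struct task *owner)
{
        return owner == READER_OWNED;
}

int main(void)
{
        struct task writer = { "writer" };
        struct task *owner;

        owner = NULL;                   /* free, or owner not recorded yet */
        printf("NULL:   writer=%d reader=%d\n",
               owner_is_writer(owner), owner_is_reader(owner));

        owner = READER_OWNED;           /* readers hold (or recently held) it */
        printf("reader: writer=%d reader=%d\n",
               owner_is_writer(owner), owner_is_reader(owner));

        owner = &writer;                /* a writer owns the lock */
        printf("writer: writer=%d reader=%d\n",
               owner_is_writer(owner), owner_is_reader(owner));
        return 0;
}

Optimistic spinners only ever compare the pointer, which is why the sentinel is safe to store without any associated task.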
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 017532193fb1..251d16b4cb41 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -169,12 +169,6 @@ void devm_memunmap(struct device *dev, void *addr)
169} 169}
170EXPORT_SYMBOL(devm_memunmap); 170EXPORT_SYMBOL(devm_memunmap);
171 171
172pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags)
173{
174 return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags);
175}
176EXPORT_SYMBOL(phys_to_pfn_t);
177
178#ifdef CONFIG_ZONE_DEVICE 172#ifdef CONFIG_ZONE_DEVICE
179static DEFINE_MUTEX(pgmap_lock); 173static DEFINE_MUTEX(pgmap_lock);
180static RADIX_TREE(pgmap_radix, GFP_KERNEL); 174static RADIX_TREE(pgmap_radix, GFP_KERNEL);
@@ -308,12 +302,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
308 if (is_ram == REGION_INTERSECTS) 302 if (is_ram == REGION_INTERSECTS)
309 return __va(res->start); 303 return __va(res->start);
310 304
311 if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) {
312 dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n",
313 __func__);
314 return ERR_PTR(-ENXIO);
315 }
316
317 if (!ref) 305 if (!ref)
318 return ERR_PTR(-EINVAL); 306 return ERR_PTR(-EINVAL);
319 307
@@ -401,7 +389,6 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
401 altmap->alloc -= nr_pfns; 389 altmap->alloc -= nr_pfns;
402} 390}
403 391
404#ifdef CONFIG_SPARSEMEM_VMEMMAP
405struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) 392struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
406{ 393{
407 /* 394 /*
@@ -427,5 +414,4 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
427 414
428 return pgmap ? pgmap->altmap : NULL; 415 return pgmap ? pgmap->altmap : NULL;
429} 416}
430#endif /* CONFIG_SPARSEMEM_VMEMMAP */
431#endif /* CONFIG_ZONE_DEVICE */ 417#endif /* CONFIG_ZONE_DEVICE */
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index cb880a14cc39..eb4f717705ba 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,6 +1,8 @@
1 1
2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG 2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
3 3
4KASAN_SANITIZE_snapshot.o := n
5
4obj-y += qos.o 6obj-y += qos.o
5obj-$(CONFIG_PM) += main.o 7obj-$(CONFIG_PM) += main.o
6obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o 8obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o
diff --git a/kernel/power/console.c b/kernel/power/console.c
index aba9c545a0e3..0e781798b0b3 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -126,17 +126,17 @@ out:
126 return ret; 126 return ret;
127} 127}
128 128
129int pm_prepare_console(void) 129void pm_prepare_console(void)
130{ 130{
131 if (!pm_vt_switch()) 131 if (!pm_vt_switch())
132 return 0; 132 return;
133 133
134 orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1); 134 orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
135 if (orig_fgconsole < 0) 135 if (orig_fgconsole < 0)
136 return 1; 136 return;
137 137
138 orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE); 138 orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE);
139 return 0; 139 return;
140} 140}
141 141
142void pm_restore_console(void) 142void pm_restore_console(void)
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index fca9254280ee..a881c6a7ba74 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -52,6 +52,7 @@ enum {
52#ifdef CONFIG_SUSPEND 52#ifdef CONFIG_SUSPEND
53 HIBERNATION_SUSPEND, 53 HIBERNATION_SUSPEND,
54#endif 54#endif
55 HIBERNATION_TEST_RESUME,
55 /* keep last */ 56 /* keep last */
56 __HIBERNATION_AFTER_LAST 57 __HIBERNATION_AFTER_LAST
57}; 58};
@@ -409,6 +410,11 @@ int hibernation_snapshot(int platform_mode)
409 goto Close; 410 goto Close;
410} 411}
411 412
413int __weak hibernate_resume_nonboot_cpu_disable(void)
414{
415 return disable_nonboot_cpus();
416}
417
412/** 418/**
413 * resume_target_kernel - Restore system state from a hibernation image. 419 * resume_target_kernel - Restore system state from a hibernation image.
414 * @platform_mode: Whether or not to use the platform driver. 420 * @platform_mode: Whether or not to use the platform driver.
@@ -433,7 +439,7 @@ static int resume_target_kernel(bool platform_mode)
433 if (error) 439 if (error)
434 goto Cleanup; 440 goto Cleanup;
435 441
436 error = disable_nonboot_cpus(); 442 error = hibernate_resume_nonboot_cpu_disable();
437 if (error) 443 if (error)
438 goto Enable_cpus; 444 goto Enable_cpus;
439 445
@@ -642,12 +648,39 @@ static void power_down(void)
642 cpu_relax(); 648 cpu_relax();
643} 649}
644 650
651static int load_image_and_restore(void)
652{
653 int error;
654 unsigned int flags;
655
656 pr_debug("PM: Loading hibernation image.\n");
657
658 lock_device_hotplug();
659 error = create_basic_memory_bitmaps();
660 if (error)
661 goto Unlock;
662
663 error = swsusp_read(&flags);
664 swsusp_close(FMODE_READ);
665 if (!error)
666 hibernation_restore(flags & SF_PLATFORM_MODE);
667
668 printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
669 swsusp_free();
670 free_basic_memory_bitmaps();
671 Unlock:
672 unlock_device_hotplug();
673
674 return error;
675}
676
645/** 677/**
646 * hibernate - Carry out system hibernation, including saving the image. 678 * hibernate - Carry out system hibernation, including saving the image.
647 */ 679 */
648int hibernate(void) 680int hibernate(void)
649{ 681{
650 int error; 682 int error, nr_calls = 0;
683 bool snapshot_test = false;
651 684
652 if (!hibernation_available()) { 685 if (!hibernation_available()) {
653 pr_debug("PM: Hibernation not available.\n"); 686 pr_debug("PM: Hibernation not available.\n");
@@ -662,9 +695,11 @@ int hibernate(void)
662 } 695 }
663 696
664 pm_prepare_console(); 697 pm_prepare_console();
665 error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); 698 error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
666 if (error) 699 if (error) {
700 nr_calls--;
667 goto Exit; 701 goto Exit;
702 }
668 703
669 printk(KERN_INFO "PM: Syncing filesystems ... "); 704 printk(KERN_INFO "PM: Syncing filesystems ... ");
670 sys_sync(); 705 sys_sync();
@@ -697,8 +732,12 @@ int hibernate(void)
697 pr_debug("PM: writing image.\n"); 732 pr_debug("PM: writing image.\n");
698 error = swsusp_write(flags); 733 error = swsusp_write(flags);
699 swsusp_free(); 734 swsusp_free();
700 if (!error) 735 if (!error) {
701 power_down(); 736 if (hibernation_mode == HIBERNATION_TEST_RESUME)
737 snapshot_test = true;
738 else
739 power_down();
740 }
702 in_suspend = 0; 741 in_suspend = 0;
703 pm_restore_gfp_mask(); 742 pm_restore_gfp_mask();
704 } else { 743 } else {
@@ -709,12 +748,18 @@ int hibernate(void)
709 free_basic_memory_bitmaps(); 748 free_basic_memory_bitmaps();
710 Thaw: 749 Thaw:
711 unlock_device_hotplug(); 750 unlock_device_hotplug();
751 if (snapshot_test) {
752 pr_debug("PM: Checking hibernation image\n");
753 error = swsusp_check();
754 if (!error)
755 error = load_image_and_restore();
756 }
712 thaw_processes(); 757 thaw_processes();
713 758
714 /* Don't bother checking whether freezer_test_done is true */ 759 /* Don't bother checking whether freezer_test_done is true */
715 freezer_test_done = false; 760 freezer_test_done = false;
716 Exit: 761 Exit:
717 pm_notifier_call_chain(PM_POST_HIBERNATION); 762 __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL);
718 pm_restore_console(); 763 pm_restore_console();
719 atomic_inc(&snapshot_device_available); 764 atomic_inc(&snapshot_device_available);
720 Unlock: 765 Unlock:
@@ -740,8 +785,7 @@ int hibernate(void)
740 */ 785 */
741static int software_resume(void) 786static int software_resume(void)
742{ 787{
743 int error; 788 int error, nr_calls = 0;
744 unsigned int flags;
745 789
746 /* 790 /*
747 * If the user said "noresume".. bail out early. 791 * If the user said "noresume".. bail out early.
@@ -827,35 +871,20 @@ static int software_resume(void)
827 } 871 }
828 872
829 pm_prepare_console(); 873 pm_prepare_console();
830 error = pm_notifier_call_chain(PM_RESTORE_PREPARE); 874 error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
831 if (error) 875 if (error) {
876 nr_calls--;
832 goto Close_Finish; 877 goto Close_Finish;
878 }
833 879
834 pr_debug("PM: Preparing processes for restore.\n"); 880 pr_debug("PM: Preparing processes for restore.\n");
835 error = freeze_processes(); 881 error = freeze_processes();
836 if (error) 882 if (error)
837 goto Close_Finish; 883 goto Close_Finish;
838 884 error = load_image_and_restore();
839 pr_debug("PM: Loading hibernation image.\n");
840
841 lock_device_hotplug();
842 error = create_basic_memory_bitmaps();
843 if (error)
844 goto Thaw;
845
846 error = swsusp_read(&flags);
847 swsusp_close(FMODE_READ);
848 if (!error)
849 hibernation_restore(flags & SF_PLATFORM_MODE);
850
851 printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
852 swsusp_free();
853 free_basic_memory_bitmaps();
854 Thaw:
855 unlock_device_hotplug();
856 thaw_processes(); 885 thaw_processes();
857 Finish: 886 Finish:
858 pm_notifier_call_chain(PM_POST_RESTORE); 887 __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
859 pm_restore_console(); 888 pm_restore_console();
860 atomic_inc(&snapshot_device_available); 889 atomic_inc(&snapshot_device_available);
861 /* For success case, the suspend path will release the lock */ 890 /* For success case, the suspend path will release the lock */
@@ -878,6 +907,7 @@ static const char * const hibernation_modes[] = {
878#ifdef CONFIG_SUSPEND 907#ifdef CONFIG_SUSPEND
879 [HIBERNATION_SUSPEND] = "suspend", 908 [HIBERNATION_SUSPEND] = "suspend",
880#endif 909#endif
910 [HIBERNATION_TEST_RESUME] = "test_resume",
881}; 911};
882 912
883/* 913/*
@@ -924,6 +954,7 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
924#ifdef CONFIG_SUSPEND 954#ifdef CONFIG_SUSPEND
925 case HIBERNATION_SUSPEND: 955 case HIBERNATION_SUSPEND:
926#endif 956#endif
957 case HIBERNATION_TEST_RESUME:
927 break; 958 break;
928 case HIBERNATION_PLATFORM: 959 case HIBERNATION_PLATFORM:
929 if (hibernation_ops) 960 if (hibernation_ops)
@@ -970,6 +1001,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
970#ifdef CONFIG_SUSPEND 1001#ifdef CONFIG_SUSPEND
971 case HIBERNATION_SUSPEND: 1002 case HIBERNATION_SUSPEND:
972#endif 1003#endif
1004 case HIBERNATION_TEST_RESUME:
973 hibernation_mode = mode; 1005 hibernation_mode = mode;
974 break; 1006 break;
975 case HIBERNATION_PLATFORM: 1007 case HIBERNATION_PLATFORM:
@@ -1115,13 +1147,16 @@ static int __init resume_offset_setup(char *str)
1115 1147
1116static int __init hibernate_setup(char *str) 1148static int __init hibernate_setup(char *str)
1117{ 1149{
1118 if (!strncmp(str, "noresume", 8)) 1150 if (!strncmp(str, "noresume", 8)) {
1119 noresume = 1; 1151 noresume = 1;
1120 else if (!strncmp(str, "nocompress", 10)) 1152 } else if (!strncmp(str, "nocompress", 10)) {
1121 nocompress = 1; 1153 nocompress = 1;
1122 else if (!strncmp(str, "no", 2)) { 1154 } else if (!strncmp(str, "no", 2)) {
1123 noresume = 1; 1155 noresume = 1;
1124 nohibernate = 1; 1156 nohibernate = 1;
1157 } else if (IS_ENABLED(CONFIG_DEBUG_RODATA)
1158 && !strncmp(str, "protect_image", 13)) {
1159 enable_restore_image_protection();
1125 } 1160 }
1126 return 1; 1161 return 1;
1127} 1162}
@@ -1154,11 +1189,6 @@ static int __init nohibernate_setup(char *str)
1154 return 1; 1189 return 1;
1155} 1190}
1156 1191
1157static int __init kaslr_nohibernate_setup(char *str)
1158{
1159 return nohibernate_setup(str);
1160}
1161
1162static int __init page_poison_nohibernate_setup(char *str) 1192static int __init page_poison_nohibernate_setup(char *str)
1163{ 1193{
1164#ifdef CONFIG_PAGE_POISONING_ZERO 1194#ifdef CONFIG_PAGE_POISONING_ZERO
@@ -1182,5 +1212,4 @@ __setup("hibernate=", hibernate_setup);
1182__setup("resumewait", resumewait_setup); 1212__setup("resumewait", resumewait_setup);
1183__setup("resumedelay=", resumedelay_setup); 1213__setup("resumedelay=", resumedelay_setup);
1184__setup("nohibernate", nohibernate_setup); 1214__setup("nohibernate", nohibernate_setup);
1185__setup("kaslr", kaslr_nohibernate_setup);
1186__setup("page_poison=", page_poison_nohibernate_setup); 1215__setup("page_poison=", page_poison_nohibernate_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 27946975eff0..5ea50b1b7595 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -38,12 +38,19 @@ int unregister_pm_notifier(struct notifier_block *nb)
38} 38}
39EXPORT_SYMBOL_GPL(unregister_pm_notifier); 39EXPORT_SYMBOL_GPL(unregister_pm_notifier);
40 40
41int pm_notifier_call_chain(unsigned long val) 41int __pm_notifier_call_chain(unsigned long val, int nr_to_call, int *nr_calls)
42{ 42{
43 int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL); 43 int ret;
44
45 ret = __blocking_notifier_call_chain(&pm_chain_head, val, NULL,
46 nr_to_call, nr_calls);
44 47
45 return notifier_to_errno(ret); 48 return notifier_to_errno(ret);
46} 49}
50int pm_notifier_call_chain(unsigned long val)
51{
52 return __pm_notifier_call_chain(val, -1, NULL);
53}
47 54
48/* If set, devices may be suspended and resumed asynchronously. */ 55/* If set, devices may be suspended and resumed asynchronously. */
49int pm_async_enabled = 1; 56int pm_async_enabled = 1;
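__pm_notifier_call_chain() above exists so that callers can learn how many notifiers actually ran for a PREPARE event; on failure they decrement that count and send the matching POST event only to the callbacks that succeeded. The standalone sketch below mimics that bookkeeping with a hypothetical callback table; it is not the kernel notifier API.

/* notifier_rollback_demo.c -- nr_calls-style partial rollback sketch. */
#include <stdio.h>

enum { PM_PREPARE, PM_POST };

typedef int (*notifier_fn)(int event);

static int cb_a(int event) { printf("A: event %d\n", event); return 0; }
static int cb_b(int event) { printf("B: event %d\n", event); return -1; }  /* fails */
static int cb_c(int event) { printf("C: event %d\n", event); return 0; }

static notifier_fn chain[] = { cb_a, cb_b, cb_c };
#define NR_NOTIFIERS ((int)(sizeof(chain) / sizeof(chain[0])))

/* Call up to nr_to_call notifiers (-1 means all); report how many ran. */
static int call_chain(int event, int nr_to_call, int *nr_calls)
{
        int ret = 0;

        for (int i = 0; i < NR_NOTIFIERS && nr_to_call; i++, nr_to_call--) {
                if (nr_calls)
                        (*nr_calls)++;
                ret = chain[i](event);
                if (ret)
                        break;
        }
        return ret;
}

int main(void)
{
        int nr_calls = 0;

        if (call_chain(PM_PREPARE, -1, &nr_calls)) {
                /* The callback that failed must not see the POST event. */
                nr_calls--;
        }
        call_chain(PM_POST, nr_calls, NULL);
        return 0;
}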
diff --git a/kernel/power/power.h b/kernel/power/power.h
index efe1b3b17c88..242d8b827dd5 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -38,6 +38,8 @@ static inline char *check_image_kernel(struct swsusp_info *info)
38} 38}
39#endif /* CONFIG_ARCH_HIBERNATION_HEADER */ 39#endif /* CONFIG_ARCH_HIBERNATION_HEADER */
40 40
41extern int hibernate_resume_nonboot_cpu_disable(void);
42
41/* 43/*
42 * Keep some memory free so that I/O operations can succeed without paging 44 * Keep some memory free so that I/O operations can succeed without paging
43 * [Might this be more than 4 MB?] 45 * [Might this be more than 4 MB?]
@@ -59,6 +61,13 @@ extern int hibernation_snapshot(int platform_mode);
59extern int hibernation_restore(int platform_mode); 61extern int hibernation_restore(int platform_mode);
60extern int hibernation_platform_enter(void); 62extern int hibernation_platform_enter(void);
61 63
64#ifdef CONFIG_DEBUG_RODATA
65/* kernel/power/snapshot.c */
66extern void enable_restore_image_protection(void);
67#else
68static inline void enable_restore_image_protection(void) {}
69#endif /* CONFIG_DEBUG_RODATA */
70
62#else /* !CONFIG_HIBERNATION */ 71#else /* !CONFIG_HIBERNATION */
63 72
64static inline void hibernate_reserved_size_init(void) {} 73static inline void hibernate_reserved_size_init(void) {}
@@ -200,6 +209,8 @@ static inline void suspend_test_finish(const char *label) {}
200 209
201#ifdef CONFIG_PM_SLEEP 210#ifdef CONFIG_PM_SLEEP
202/* kernel/power/main.c */ 211/* kernel/power/main.c */
212extern int __pm_notifier_call_chain(unsigned long val, int nr_to_call,
213 int *nr_calls);
203extern int pm_notifier_call_chain(unsigned long val); 214extern int pm_notifier_call_chain(unsigned long val);
204#endif 215#endif
205 216
diff --git a/kernel/power/process.c b/kernel/power/process.c
index df058bed53ce..8f27d5a8adf6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -89,6 +89,9 @@ static int try_to_freeze_tasks(bool user_only)
89 elapsed_msecs / 1000, elapsed_msecs % 1000, 89 elapsed_msecs / 1000, elapsed_msecs % 1000,
90 todo - wq_busy, wq_busy); 90 todo - wq_busy, wq_busy);
91 91
92 if (wq_busy)
93 show_workqueue_state();
94
92 if (!wakeup) { 95 if (!wakeup) {
93 read_lock(&tasklist_lock); 96 read_lock(&tasklist_lock);
94 for_each_process_thread(g, p) { 97 for_each_process_thread(g, p) {
@@ -146,6 +149,18 @@ int freeze_processes(void)
146 if (!error && !oom_killer_disable()) 149 if (!error && !oom_killer_disable())
147 error = -EBUSY; 150 error = -EBUSY;
148 151
152 /*
153 * There is a hard to fix race between oom_reaper kernel thread
154 * and oom_killer_disable. oom_reaper calls exit_oom_victim
155 * before the victim reaches exit_mm so try to freeze all the tasks
156 * again and catch such a left over task.
157 */
158 if (!error) {
159 pr_info("Double checking all user space processes after OOM killer disable... ");
160 error = try_to_freeze_tasks(true);
161 pr_cont("\n");
162 }
163
149 if (error) 164 if (error)
150 thaw_processes(); 165 thaw_processes();
151 return error; 166 return error;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 3a970604308f..9a0178c2ac1d 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -38,6 +38,43 @@
38 38
39#include "power.h" 39#include "power.h"
40 40
41#ifdef CONFIG_DEBUG_RODATA
42static bool hibernate_restore_protection;
43static bool hibernate_restore_protection_active;
44
45void enable_restore_image_protection(void)
46{
47 hibernate_restore_protection = true;
48}
49
50static inline void hibernate_restore_protection_begin(void)
51{
52 hibernate_restore_protection_active = hibernate_restore_protection;
53}
54
55static inline void hibernate_restore_protection_end(void)
56{
57 hibernate_restore_protection_active = false;
58}
59
60static inline void hibernate_restore_protect_page(void *page_address)
61{
62 if (hibernate_restore_protection_active)
63 set_memory_ro((unsigned long)page_address, 1);
64}
65
66static inline void hibernate_restore_unprotect_page(void *page_address)
67{
68 if (hibernate_restore_protection_active)
69 set_memory_rw((unsigned long)page_address, 1);
70}
71#else
72static inline void hibernate_restore_protection_begin(void) {}
73static inline void hibernate_restore_protection_end(void) {}
74static inline void hibernate_restore_protect_page(void *page_address) {}
75static inline void hibernate_restore_unprotect_page(void *page_address) {}
76#endif /* CONFIG_DEBUG_RODATA */
77
41static int swsusp_page_is_free(struct page *); 78static int swsusp_page_is_free(struct page *);
42static void swsusp_set_page_forbidden(struct page *); 79static void swsusp_set_page_forbidden(struct page *);
43static void swsusp_unset_page_forbidden(struct page *); 80static void swsusp_unset_page_forbidden(struct page *);
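The hibernate_restore_protect_page()/hibernate_restore_unprotect_page() helpers added above flip restored image pages to read-only while the restore is in flight, relying on set_memory_ro()/set_memory_rw(). As a rough userspace analogue of the same idea, mprotect() can toggle a page the same way; the sketch below is illustrative only.

/* restore_protect_demo.c -- page-protection toggle sketch using mprotect(). */
#define _DEFAULT_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        char *page = mmap(NULL, psz, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (page == MAP_FAILED)
                return 1;

        strcpy(page, "restored image data");

        /* roughly: hibernate_restore_protect_page(page) */
        mprotect(page, psz, PROT_READ);
        printf("page is now read-only: %s\n", page);

        /* roughly: hibernate_restore_unprotect_page(page) before reuse */
        mprotect(page, psz, PROT_READ | PROT_WRITE);
        page[0] = 'R';
        printf("writable again: %s\n", page);

        munmap(page, psz);
        return 0;
}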
@@ -67,25 +104,32 @@ void __init hibernate_image_size_init(void)
67 image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; 104 image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE;
68} 105}
69 106
70/* List of PBEs needed for restoring the pages that were allocated before 107/*
108 * List of PBEs needed for restoring the pages that were allocated before
71 * the suspend and included in the suspend image, but have also been 109 * the suspend and included in the suspend image, but have also been
72 * allocated by the "resume" kernel, so their contents cannot be written 110 * allocated by the "resume" kernel, so their contents cannot be written
73 * directly to their "original" page frames. 111 * directly to their "original" page frames.
74 */ 112 */
75struct pbe *restore_pblist; 113struct pbe *restore_pblist;
76 114
77/* Pointer to an auxiliary buffer (1 page) */ 115/* struct linked_page is used to build chains of pages */
78static void *buffer;
79 116
80/** 117#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *))
81 * @safe_needed - on resume, for storing the PBE list and the image, 118
82 * we can only use memory pages that do not conflict with the pages 119struct linked_page {
83 * used before suspend. The unsafe pages have PageNosaveFree set 120 struct linked_page *next;
84 * and we count them using unsafe_pages. 121 char data[LINKED_PAGE_DATA_SIZE];
85 * 122} __packed;
86 * Each allocated image page is marked as PageNosave and PageNosaveFree 123
87 * so that swsusp_free() can release it. 124/*
125 * List of "safe" pages (ie. pages that were not used by the image kernel
126 * before hibernation) that may be used as temporary storage for image kernel
127 * memory contents.
88 */ 128 */
129static struct linked_page *safe_pages_list;
130
131/* Pointer to an auxiliary buffer (1 page) */
132static void *buffer;
89 133
90#define PG_ANY 0 134#define PG_ANY 0
91#define PG_SAFE 1 135#define PG_SAFE 1
@@ -94,6 +138,19 @@ static void *buffer;
94 138
95static unsigned int allocated_unsafe_pages; 139static unsigned int allocated_unsafe_pages;
96 140
141/**
142 * get_image_page - Allocate a page for a hibernation image.
143 * @gfp_mask: GFP mask for the allocation.
144 * @safe_needed: Get pages that were not used before hibernation (restore only)
145 *
146 * During image restoration, for storing the PBE list and the image data, we can
147 * only use memory pages that do not conflict with the pages used before
148 * hibernation. The "unsafe" pages have PageNosaveFree set and we count them
149 * using allocated_unsafe_pages.
150 *
151 * Each allocated image page is marked as PageNosave and PageNosaveFree so that
152 * swsusp_free() can release it.
153 */
97static void *get_image_page(gfp_t gfp_mask, int safe_needed) 154static void *get_image_page(gfp_t gfp_mask, int safe_needed)
98{ 155{
99 void *res; 156 void *res;
@@ -113,9 +170,21 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed)
113 return res; 170 return res;
114} 171}
115 172
173static void *__get_safe_page(gfp_t gfp_mask)
174{
175 if (safe_pages_list) {
176 void *ret = safe_pages_list;
177
178 safe_pages_list = safe_pages_list->next;
179 memset(ret, 0, PAGE_SIZE);
180 return ret;
181 }
182 return get_image_page(gfp_mask, PG_SAFE);
183}
184
116unsigned long get_safe_page(gfp_t gfp_mask) 185unsigned long get_safe_page(gfp_t gfp_mask)
117{ 186{
118 return (unsigned long)get_image_page(gfp_mask, PG_SAFE); 187 return (unsigned long)__get_safe_page(gfp_mask);
119} 188}
120 189
121static struct page *alloc_image_page(gfp_t gfp_mask) 190static struct page *alloc_image_page(gfp_t gfp_mask)
@@ -130,11 +199,22 @@ static struct page *alloc_image_page(gfp_t gfp_mask)
130 return page; 199 return page;
131} 200}
132 201
202static void recycle_safe_page(void *page_address)
203{
204 struct linked_page *lp = page_address;
205
206 lp->next = safe_pages_list;
207 safe_pages_list = lp;
208}
209
133/** 210/**
134 * free_image_page - free page represented by @addr, allocated with 211 * free_image_page - Free a page allocated for hibernation image.
135 * get_image_page (page flags set by it must be cleared) 212 * @addr: Address of the page to free.
213 * @clear_nosave_free: If set, clear the PageNosaveFree bit for the page.
214 *
215 * The page to free should have been allocated by get_image_page() (page flags
216 * set by it are affected).
136 */ 217 */
137
138static inline void free_image_page(void *addr, int clear_nosave_free) 218static inline void free_image_page(void *addr, int clear_nosave_free)
139{ 219{
140 struct page *page; 220 struct page *page;
@@ -150,17 +230,8 @@ static inline void free_image_page(void *addr, int clear_nosave_free)
150 __free_page(page); 230 __free_page(page);
151} 231}
152 232
153/* struct linked_page is used to build chains of pages */ 233static inline void free_list_of_pages(struct linked_page *list,
154 234 int clear_page_nosave)
155#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *))
156
157struct linked_page {
158 struct linked_page *next;
159 char data[LINKED_PAGE_DATA_SIZE];
160} __packed;
161
162static inline void
163free_list_of_pages(struct linked_page *list, int clear_page_nosave)
164{ 235{
165 while (list) { 236 while (list) {
166 struct linked_page *lp = list->next; 237 struct linked_page *lp = list->next;
@@ -170,30 +241,28 @@ free_list_of_pages(struct linked_page *list, int clear_page_nosave)
170 } 241 }
171} 242}
172 243
173/** 244/*
174 * struct chain_allocator is used for allocating small objects out of 245 * struct chain_allocator is used for allocating small objects out of
175 * a linked list of pages called 'the chain'. 246 * a linked list of pages called 'the chain'.
176 * 247 *
177 * The chain grows each time there is no room for a new object in 248 * The chain grows each time there is no room for a new object in
178 * the current page. The allocated objects cannot be freed individually. 249 * the current page. The allocated objects cannot be freed individually.
179 * It is only possible to free them all at once, by freeing the entire 250 * It is only possible to free them all at once, by freeing the entire
180 * chain. 251 * chain.
181 * 252 *
182 * NOTE: The chain allocator may be inefficient if the allocated objects 253 * NOTE: The chain allocator may be inefficient if the allocated objects
183 * are not much smaller than PAGE_SIZE. 254 * are not much smaller than PAGE_SIZE.
184 */ 255 */
185
186struct chain_allocator { 256struct chain_allocator {
187 struct linked_page *chain; /* the chain */ 257 struct linked_page *chain; /* the chain */
188 unsigned int used_space; /* total size of objects allocated out 258 unsigned int used_space; /* total size of objects allocated out
189 * of the current page 259 of the current page */
190 */
191 gfp_t gfp_mask; /* mask for allocating pages */ 260 gfp_t gfp_mask; /* mask for allocating pages */
192 int safe_needed; /* if set, only "safe" pages are allocated */ 261 int safe_needed; /* if set, only "safe" pages are allocated */
193}; 262};
194 263
195static void 264static void chain_init(struct chain_allocator *ca, gfp_t gfp_mask,
196chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed) 265 int safe_needed)
197{ 266{
198 ca->chain = NULL; 267 ca->chain = NULL;
199 ca->used_space = LINKED_PAGE_DATA_SIZE; 268 ca->used_space = LINKED_PAGE_DATA_SIZE;
@@ -208,7 +277,8 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
208 if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { 277 if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
209 struct linked_page *lp; 278 struct linked_page *lp;
210 279
211 lp = get_image_page(ca->gfp_mask, ca->safe_needed); 280 lp = ca->safe_needed ? __get_safe_page(ca->gfp_mask) :
281 get_image_page(ca->gfp_mask, PG_ANY);
212 if (!lp) 282 if (!lp)
213 return NULL; 283 return NULL;
214 284
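The chain allocator described in the comment block above carves small objects out of the current linked page and grabs a fresh page whenever a request no longer fits; objects can only be released by freeing the whole chain at once. A self-contained userspace sketch of that behaviour follows, with malloc() standing in for the image-page allocator; it is a simplification, not the kernel code.

/* chain_alloc_demo.c -- bump allocation out of a chain of pages. */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096
#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *))

struct linked_page {
        struct linked_page *next;
        char data[LINKED_PAGE_DATA_SIZE];
};

struct chain_allocator {
        struct linked_page *chain;      /* the chain of pages */
        size_t used_space;              /* bytes used in the current page */
};

static void *chain_alloc(struct chain_allocator *ca, size_t size)
{
        void *ret;

        if (!ca->chain || LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
                struct linked_page *lp = malloc(sizeof(*lp));

                if (!lp)
                        return NULL;
                lp->next = ca->chain;   /* grow the chain by one page */
                ca->chain = lp;
                ca->used_space = 0;
        }
        ret = ca->chain->data + ca->used_space;
        ca->used_space += size;
        return ret;
}

static void chain_free_all(struct chain_allocator *ca)
{
        while (ca->chain) {
                struct linked_page *next = ca->chain->next;

                free(ca->chain);
                ca->chain = next;
        }
}

int main(void)
{
        struct chain_allocator ca = { NULL, 0 };

        for (int i = 0; i < 1000; i++) {
                char *obj = chain_alloc(&ca, 64);

                if (obj)
                        snprintf(obj, 64, "object %d", i);
        }
        puts("allocated 1000 small objects, freeing the whole chain");
        chain_free_all(&ca);
        return 0;
}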
@@ -222,44 +292,44 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
222} 292}
223 293
224/** 294/**
225 * Data types related to memory bitmaps. 295 * Data types related to memory bitmaps.
226 * 296 *
227 * Memory bitmap is a structure consisting of many linked lists of 297 * Memory bitmap is a structure consisting of many linked lists of
228 * objects. The main list's elements are of type struct zone_bitmap 298 * objects. The main list's elements are of type struct zone_bitmap
229 * and each of them corresponds to one zone. For each zone bitmap 299 * and each of them corresponds to one zone. For each zone bitmap
230 * object there is a list of objects of type struct bm_block that 300 * object there is a list of objects of type struct bm_block that
231 * represent each block of the bitmap in which information is stored. 301 * represent each block of the bitmap in which information is stored.
232 * 302 *
233 * struct memory_bitmap contains a pointer to the main list of zone 303 * struct memory_bitmap contains a pointer to the main list of zone
234 * bitmap objects, a struct bm_position used for browsing the bitmap, 304 * bitmap objects, a struct bm_position used for browsing the bitmap,
235 * and a pointer to the list of pages used for allocating all of the 305 * and a pointer to the list of pages used for allocating all of the
236 * zone bitmap objects and bitmap block objects. 306 * zone bitmap objects and bitmap block objects.
237 * 307 *
238 * NOTE: It has to be possible to lay out the bitmap in memory 308 * NOTE: It has to be possible to lay out the bitmap in memory
239 * using only allocations of order 0. Additionally, the bitmap is 309 * using only allocations of order 0. Additionally, the bitmap is
240 * designed to work with arbitrary number of zones (this is over the 310 * designed to work with arbitrary number of zones (this is over the
241 * top for now, but let's avoid making unnecessary assumptions ;-). 311 * top for now, but let's avoid making unnecessary assumptions ;-).
242 * 312 *
243 * struct zone_bitmap contains a pointer to a list of bitmap block 313 * struct zone_bitmap contains a pointer to a list of bitmap block
244 * objects and a pointer to the bitmap block object that has been 314 * objects and a pointer to the bitmap block object that has been
245 * most recently used for setting bits. Additionally, it contains the 315 * most recently used for setting bits. Additionally, it contains the
246 * pfns that correspond to the start and end of the represented zone. 316 * PFNs that correspond to the start and end of the represented zone.
247 * 317 *
248 * struct bm_block contains a pointer to the memory page in which 318 * struct bm_block contains a pointer to the memory page in which
249 * information is stored (in the form of a block of bitmap) 319 * information is stored (in the form of a block of bitmap)
250 * It also contains the pfns that correspond to the start and end of 320 * It also contains the pfns that correspond to the start and end of
251 * the represented memory area. 321 * the represented memory area.
252 * 322 *
253 * The memory bitmap is organized as a radix tree to guarantee fast random 323 * The memory bitmap is organized as a radix tree to guarantee fast random
254 * access to the bits. There is one radix tree for each zone (as returned 324 * access to the bits. There is one radix tree for each zone (as returned
255 * from create_mem_extents). 325 * from create_mem_extents).
256 * 326 *
257 * One radix tree is represented by one struct mem_zone_bm_rtree. There are 327 * One radix tree is represented by one struct mem_zone_bm_rtree. There are
258 * two linked lists for the nodes of the tree, one for the inner nodes and 328 * two linked lists for the nodes of the tree, one for the inner nodes and
259 * one for the leaf nodes. The linked leaf nodes are used for fast linear 329 * one for the leaf nodes. The linked leaf nodes are used for fast linear
260 * access of the memory bitmap. 330 * access of the memory bitmap.
261 * 331 *
262 * The struct rtree_node represents one node of the radix tree. 332 * The struct rtree_node represents one node of the radix tree.
263 */ 333 */
264 334
265#define BM_END_OF_MAP (~0UL) 335#define BM_END_OF_MAP (~0UL)
@@ -305,9 +375,8 @@ struct bm_position {
305struct memory_bitmap { 375struct memory_bitmap {
306 struct list_head zones; 376 struct list_head zones;
307 struct linked_page *p_list; /* list of pages used to store zone 377 struct linked_page *p_list; /* list of pages used to store zone
308 * bitmap objects and bitmap block 378 bitmap objects and bitmap block
309 * objects 379 objects */
310 */
311 struct bm_position cur; /* most recently used bit position */ 380 struct bm_position cur; /* most recently used bit position */
312}; 381};
313 382
@@ -321,12 +390,12 @@ struct memory_bitmap {
321#endif 390#endif
322#define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1) 391#define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1)
323 392
324/* 393/**
325 * alloc_rtree_node - Allocate a new node and add it to the radix tree. 394 * alloc_rtree_node - Allocate a new node and add it to the radix tree.
326 * 395 *
327 * This function is used to allocate inner nodes as well as the 396 * This function is used to allocate inner nodes as well as the
328 * leaf nodes of the radix tree. It also adds the node to the 397 * leaf nodes of the radix tree. It also adds the node to the
329 * corresponding linked list passed in by the *list parameter. 398 * corresponding linked list passed in by the *list parameter.
330 */ 399 */
331static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed, 400static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed,
332 struct chain_allocator *ca, 401 struct chain_allocator *ca,
@@ -347,12 +416,12 @@ static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed,
347 return node; 416 return node;
348} 417}
349 418
350/* 419/**
351 * add_rtree_block - Add a new leave node to the radix tree 420 * add_rtree_block - Add a new leaf node to the radix tree.
352 * 421 *
353 * The leaf nodes need to be allocated in order to keep the leaves 422 * The leaf nodes need to be allocated in order to keep the leaves
354 * linked list in order. This is guaranteed by the zone->blocks 423 * linked list in order. This is guaranteed by the zone->blocks
355 * counter. 424 * counter.
356 */ 425 */
357static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask, 426static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask,
358 int safe_needed, struct chain_allocator *ca) 427 int safe_needed, struct chain_allocator *ca)
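As a rough sketch of how the zone->blocks counter drives the tree depth, the number of radix-tree levels needed to index leaf block N grows by one for every BM_RTREE_LEVEL_SHIFT bits of N; something along the lines of the following illustrative helper (not part of the file):

    /* Illustrative only: levels needed to index leaf block 'block_nr'. */
    static int rtree_levels_needed(unsigned long block_nr)
    {
            int levels = 0;

            while (block_nr) {
                    block_nr >>= BM_RTREE_LEVEL_SHIFT;  /* one level consumes this many index bits */
                    levels++;
            }
            return levels;
    }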
@@ -417,17 +486,18 @@ static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask,
417static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, 486static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
418 int clear_nosave_free); 487 int clear_nosave_free);
419 488
420/* 489/**
421 * create_zone_bm_rtree - create a radix tree for one zone 490 * create_zone_bm_rtree - Create a radix tree for one zone.
422 * 491 *
423 * Allocates the mem_zone_bm_rtree structure and initializes it. 492 * Allocates the mem_zone_bm_rtree structure and initializes it.
424 * This function also allocates and builds the radix tree for the 493 * This function also allocates and builds the radix tree for the
425 * zone. 494 * zone.
426 */ 495 */
427static struct mem_zone_bm_rtree * 496static struct mem_zone_bm_rtree *create_zone_bm_rtree(gfp_t gfp_mask,
428create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed, 497 int safe_needed,
429 struct chain_allocator *ca, 498 struct chain_allocator *ca,
430 unsigned long start, unsigned long end) 499 unsigned long start,
500 unsigned long end)
431{ 501{
432 struct mem_zone_bm_rtree *zone; 502 struct mem_zone_bm_rtree *zone;
433 unsigned int i, nr_blocks; 503 unsigned int i, nr_blocks;
@@ -454,12 +524,12 @@ create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed,
454 return zone; 524 return zone;
455} 525}
456 526
457/* 527/**
458 * free_zone_bm_rtree - Free the memory of the radix tree 528 * free_zone_bm_rtree - Free the memory of the radix tree.
459 * 529 *
460 * Free all node pages of the radix tree. The mem_zone_bm_rtree 530 * Free all node pages of the radix tree. The mem_zone_bm_rtree
461 * structure itself is not freed here nor are the rtree_node 531 * structure itself is not freed here nor are the rtree_node
462 * structs. 532 * structs.
463 */ 533 */
464static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, 534static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
465 int clear_nosave_free) 535 int clear_nosave_free)
@@ -492,8 +562,8 @@ struct mem_extent {
492}; 562};
493 563
494/** 564/**
495 * free_mem_extents - free a list of memory extents 565 * free_mem_extents - Free a list of memory extents.
496 * @list - list of extents to empty 566 * @list: List of extents to free.
497 */ 567 */
498static void free_mem_extents(struct list_head *list) 568static void free_mem_extents(struct list_head *list)
499{ 569{
@@ -506,10 +576,11 @@ static void free_mem_extents(struct list_head *list)
506} 576}
507 577
508/** 578/**
509 * create_mem_extents - create a list of memory extents representing 579 * create_mem_extents - Create a list of memory extents.
510 * contiguous ranges of PFNs 580 * @list: List to put the extents into.
511 * @list - list to put the extents into 581 * @gfp_mask: Mask to use for memory allocations.
512 * @gfp_mask - mask to use for memory allocations 582 *
583 * The extents represent contiguous ranges of PFNs.
513 */ 584 */
514static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) 585static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
515{ 586{
@@ -565,10 +636,10 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
565} 636}
566 637
567/** 638/**
568 * memory_bm_create - allocate memory for a memory bitmap 639 * memory_bm_create - Allocate memory for a memory bitmap.
569 */ 640 */
570static int 641static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask,
571memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) 642 int safe_needed)
572{ 643{
573 struct chain_allocator ca; 644 struct chain_allocator ca;
574 struct list_head mem_extents; 645 struct list_head mem_extents;
@@ -607,8 +678,9 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
607} 678}
608 679
609/** 680/**
610 * memory_bm_free - free memory occupied by the memory bitmap @bm 681 * memory_bm_free - Free memory occupied by the memory bitmap.
611 */ 682 * @bm: Memory bitmap.
683 */
612static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) 684static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
613{ 685{
614 struct mem_zone_bm_rtree *zone; 686 struct mem_zone_bm_rtree *zone;
@@ -622,14 +694,13 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
622} 694}
623 695
624/** 696/**
625 * memory_bm_find_bit - Find the bit for pfn in the memory 697 * memory_bm_find_bit - Find the bit for a given PFN in a memory bitmap.
626 * bitmap
627 * 698 *
628 * Find the bit in the bitmap @bm that corresponds to given pfn. 699 * Find the bit in memory bitmap @bm that corresponds to the given PFN.
629 * The cur.zone, cur.block and cur.node_pfn member of @bm are 700 * The cur.zone, cur.block and cur.node_pfn members of @bm are updated.
630 * updated. 701 *
631 * It walks the radix tree to find the page which contains the bit for 702 * Walk the radix tree to find the page containing the bit that represents @pfn
632 * pfn and returns the bit position in **addr and *bit_nr. 703 * and return the position of the bit in @addr and @bit_nr.
633 */ 704 */
634static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, 705static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
635 void **addr, unsigned int *bit_nr) 706 void **addr, unsigned int *bit_nr)
@@ -658,10 +729,9 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
658 729
659zone_found: 730zone_found:
660 /* 731 /*
661 * We have a zone. Now walk the radix tree to find the leave 732 * We have found the zone. Now walk the radix tree to find the leaf node
662 * node for our pfn. 733 * for our PFN.
663 */ 734 */
664
665 node = bm->cur.node; 735 node = bm->cur.node;
666 if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) 736 if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
667 goto node_found; 737 goto node_found;
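The descent itself consumes BM_RTREE_LEVEL_SHIFT index bits of the block number per level; condensed, the walk looks roughly like this sketch (variable names as in the surrounding function):

    /* Condensed sketch of the level-by-level descent. */
    block_nr = (pfn - zone->start_pfn) >> BM_BLOCK_SHIFT;
    for (i = zone->levels; i > 0; i--) {
            int index = (block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT)) & BM_RTREE_LEVEL_MASK;

            node = (struct rtree_node *)node->data[index];  /* descend one level */
    }
    /* 'node' now points at the leaf page that holds the bit for 'pfn'. */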
@@ -754,14 +824,14 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
754} 824}
755 825
756/* 826/*
757 * rtree_next_node - Jumps to the next leave node 827 * rtree_next_node - Jump to the next leaf node.
758 * 828 *
759 * Sets the position to the beginning of the next node in the 829 * Set the position to the beginning of the next node in the
760 * memory bitmap. This is either the next node in the current 830 * memory bitmap. This is either the next node in the current
761 * zone's radix tree or the first node in the radix tree of the 831 * zone's radix tree or the first node in the radix tree of the
762 * next zone. 832 * next zone.
763 * 833 *
764 * Returns true if there is a next node, false otherwise. 834 * Return true if there is a next node, false otherwise.
765 */ 835 */
766static bool rtree_next_node(struct memory_bitmap *bm) 836static bool rtree_next_node(struct memory_bitmap *bm)
767{ 837{
@@ -790,14 +860,15 @@ static bool rtree_next_node(struct memory_bitmap *bm)
790} 860}
791 861
792/** 862/**
793 * memory_bm_rtree_next_pfn - Find the next set bit in the bitmap @bm 863 * memory_bm_rtree_next_pfn - Find the next set bit in a memory bitmap.
864 * @bm: Memory bitmap.
794 * 865 *
795 * Starting from the last returned position this function searches 866 * Starting from the last returned position this function searches for the next
796 * for the next set bit in the memory bitmap and returns its 867 * set bit in @bm and returns the PFN represented by it. If no more bits are
797 * number. If no more bit is set BM_END_OF_MAP is returned. 868 * set, BM_END_OF_MAP is returned.
798 * 869 *
799 * It is required to run memory_bm_position_reset() before the 870 * It is required to run memory_bm_position_reset() before the first call to
800 * first call to this function. 871 * this function for the given memory bitmap.
801 */ 872 */
802static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) 873static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
803{ 874{
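A typical consumer of this iterator therefore looks like the following sketch (handle_pfn() is a hypothetical stand-in for the per-PFN work):

    /* Sketch of the iteration protocol described above. */
    memory_bm_position_reset(bm);
    for (pfn = memory_bm_next_pfn(bm); pfn != BM_END_OF_MAP; pfn = memory_bm_next_pfn(bm))
            handle_pfn(pfn);  /* hypothetical per-PFN work */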
@@ -819,11 +890,10 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
819 return BM_END_OF_MAP; 890 return BM_END_OF_MAP;
820} 891}
821 892
822/** 893/*
823 * This structure represents a range of page frames the contents of which 894 * This structure represents a range of page frames the contents of which
824 * should not be saved during the suspend. 895 * should not be saved during hibernation.
825 */ 896 */
826
827struct nosave_region { 897struct nosave_region {
828 struct list_head list; 898 struct list_head list;
829 unsigned long start_pfn; 899 unsigned long start_pfn;
@@ -832,15 +902,42 @@ struct nosave_region {
832 902
833static LIST_HEAD(nosave_regions); 903static LIST_HEAD(nosave_regions);
834 904
905static void recycle_zone_bm_rtree(struct mem_zone_bm_rtree *zone)
906{
907 struct rtree_node *node;
908
909 list_for_each_entry(node, &zone->nodes, list)
910 recycle_safe_page(node->data);
911
912 list_for_each_entry(node, &zone->leaves, list)
913 recycle_safe_page(node->data);
914}
915
916static void memory_bm_recycle(struct memory_bitmap *bm)
917{
918 struct mem_zone_bm_rtree *zone;
919 struct linked_page *p_list;
920
921 list_for_each_entry(zone, &bm->zones, list)
922 recycle_zone_bm_rtree(zone);
923
924 p_list = bm->p_list;
925 while (p_list) {
926 struct linked_page *lp = p_list;
927
928 p_list = lp->next;
929 recycle_safe_page(lp);
930 }
931}
932
835/** 933/**
836 * register_nosave_region - register a range of page frames the contents 934 * register_nosave_region - Register a region of unsaveable memory.
837 * of which should not be saved during the suspend (to be used in the early 935 *
838 * initialization code) 936 * Register a range of page frames the contents of which should not be saved
937 * during hibernation (to be used in the early initialization code).
839 */ 938 */
840 939void __init __register_nosave_region(unsigned long start_pfn,
841void __init 940 unsigned long end_pfn, int use_kmalloc)
842__register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
843 int use_kmalloc)
844{ 941{
845 struct nosave_region *region; 942 struct nosave_region *region;
846 943
@@ -857,12 +954,13 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
857 } 954 }
858 } 955 }
859 if (use_kmalloc) { 956 if (use_kmalloc) {
860 /* during init, this shouldn't fail */ 957 /* During init, this shouldn't fail */
861 region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL); 958 region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL);
862 BUG_ON(!region); 959 BUG_ON(!region);
863 } else 960 } else {
864 /* This allocation cannot fail */ 961 /* This allocation cannot fail */
865 region = memblock_virt_alloc(sizeof(struct nosave_region), 0); 962 region = memblock_virt_alloc(sizeof(struct nosave_region), 0);
963 }
866 region->start_pfn = start_pfn; 964 region->start_pfn = start_pfn;
867 region->end_pfn = end_pfn; 965 region->end_pfn = end_pfn;
868 list_add_tail(&region->list, &nosave_regions); 966 list_add_tail(&region->list, &nosave_regions);
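Callers normally go through the register_nosave_region()/register_nosave_region_late() wrappers in include/linux/suspend.h rather than calling this function directly; a hypothetical early-init call might look like the following (the range is purely illustrative):

    /* Illustrative only: exclude a firmware-reserved physical window from the image. */
    register_nosave_region(PFN_DOWN(fw_hole_start), PFN_UP(fw_hole_end));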
@@ -923,10 +1021,12 @@ static void swsusp_unset_page_forbidden(struct page *page)
923} 1021}
924 1022
925/** 1023/**
926 * mark_nosave_pages - set bits corresponding to the page frames the 1024 * mark_nosave_pages - Mark pages that should not be saved.
927 * contents of which should not be saved in a given bitmap. 1025 * @bm: Memory bitmap.
1026 *
1027 * Set the bits in @bm that correspond to the page frames the contents of which
1028 * should not be saved.
928 */ 1029 */
929
930static void mark_nosave_pages(struct memory_bitmap *bm) 1030static void mark_nosave_pages(struct memory_bitmap *bm)
931{ 1031{
932 struct nosave_region *region; 1032 struct nosave_region *region;
@@ -956,13 +1056,13 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
956} 1056}
957 1057
958/** 1058/**
959 * create_basic_memory_bitmaps - create bitmaps needed for marking page 1059 * create_basic_memory_bitmaps - Create bitmaps to hold basic page information.
960 * frames that should not be saved and free page frames. The pointers 1060 *
961 * forbidden_pages_map and free_pages_map are only modified if everything 1061 * Create bitmaps needed for marking page frames that should not be saved and
962 * goes well, because we don't want the bits to be used before both bitmaps 1062 * free page frames. The forbidden_pages_map and free_pages_map pointers are
963 * are set up. 1063 * only modified if everything goes well, because we don't want the bits to be
1064 * touched before both bitmaps are set up.
964 */ 1065 */
965
966int create_basic_memory_bitmaps(void) 1066int create_basic_memory_bitmaps(void)
967{ 1067{
968 struct memory_bitmap *bm1, *bm2; 1068 struct memory_bitmap *bm1, *bm2;
@@ -1007,12 +1107,12 @@ int create_basic_memory_bitmaps(void)
1007} 1107}
1008 1108
1009/** 1109/**
1010 * free_basic_memory_bitmaps - free memory bitmaps allocated by 1110 * free_basic_memory_bitmaps - Free memory bitmaps holding basic information.
1011 * create_basic_memory_bitmaps(). The auxiliary pointers are necessary 1111 *
1012 * so that the bitmaps themselves are not referred to while they are being 1112 * Free memory bitmaps allocated by create_basic_memory_bitmaps(). The
1013 * freed. 1113 * auxiliary pointers are necessary so that the bitmaps themselves are not
1114 * referred to while they are being freed.
1014 */ 1115 */
1015
1016void free_basic_memory_bitmaps(void) 1116void free_basic_memory_bitmaps(void)
1017{ 1117{
1018 struct memory_bitmap *bm1, *bm2; 1118 struct memory_bitmap *bm1, *bm2;
@@ -1033,11 +1133,13 @@ void free_basic_memory_bitmaps(void)
1033} 1133}
1034 1134
1035/** 1135/**
1036 * snapshot_additional_pages - estimate the number of additional pages 1136 * snapshot_additional_pages - Estimate the number of extra pages needed.
1037 * be needed for setting up the suspend image data structures for given 1137 * @zone: Memory zone to carry out the computation for.
1038 * zone (usually the returned value is greater than the exact number) 1138 *
1139 * Estimate the number of additional pages needed for setting up the hibernation
1140 * image data structures for @zone (usually, the returned value is greater than
1141 * the exact number).
1039 */ 1142 */
1040
1041unsigned int snapshot_additional_pages(struct zone *zone) 1143unsigned int snapshot_additional_pages(struct zone *zone)
1042{ 1144{
1043 unsigned int rtree, nodes; 1145 unsigned int rtree, nodes;
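For a sense of scale, a back-of-the-envelope example (illustrative figures, 4 KiB pages assumed; this is not the function's exact formula):

    /* Illustrative arithmetic only. */
    unsigned long zone_pfns = (1UL << 30) / 4096;            /* 1 GiB zone: 262144 page frames */
    unsigned long bits_per_block = 8 * 4096;                 /* one leaf page holds 32768 bits */
    unsigned long leaf_pages = DIV_ROUND_UP(zone_pfns, bits_per_block);  /* = 8 per bitmap */
    /* plus a handful of rtree_node pages on top, which is tiny compared with the zone itself */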
@@ -1055,10 +1157,10 @@ unsigned int snapshot_additional_pages(struct zone *zone)
1055 1157
1056#ifdef CONFIG_HIGHMEM 1158#ifdef CONFIG_HIGHMEM
1057/** 1159/**
1058 * count_free_highmem_pages - compute the total number of free highmem 1160 * count_free_highmem_pages - Compute the total number of free highmem pages.
1059 * pages, system-wide. 1161 *
1162 * The returned number is system-wide.
1060 */ 1163 */
1061
1062static unsigned int count_free_highmem_pages(void) 1164static unsigned int count_free_highmem_pages(void)
1063{ 1165{
1064 struct zone *zone; 1166 struct zone *zone;
@@ -1072,11 +1174,12 @@ static unsigned int count_free_highmem_pages(void)
1072} 1174}
1073 1175
1074/** 1176/**
1075 * saveable_highmem_page - Determine whether a highmem page should be 1177 * saveable_highmem_page - Check if a highmem page is saveable.
1076 * included in the suspend image.
1077 * 1178 *
1078 * We should save the page if it isn't Nosave or NosaveFree, or Reserved, 1179 * Determine whether a highmem page should be included in a hibernation image.
1079 * and it isn't a part of a free chunk of pages. 1180 *
1181 * We should save the page if it isn't Nosave or NosaveFree, or Reserved,
1182 * and it isn't part of a free chunk of pages.
1080 */ 1183 */
1081static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) 1184static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
1082{ 1185{
@@ -1102,10 +1205,8 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
1102} 1205}
1103 1206
1104/** 1207/**
1105 * count_highmem_pages - compute the total number of saveable highmem 1208 * count_highmem_pages - Compute the total number of saveable highmem pages.
1106 * pages.
1107 */ 1209 */
1108
1109static unsigned int count_highmem_pages(void) 1210static unsigned int count_highmem_pages(void)
1110{ 1211{
1111 struct zone *zone; 1212 struct zone *zone;
@@ -1133,12 +1234,14 @@ static inline void *saveable_highmem_page(struct zone *z, unsigned long p)
1133#endif /* CONFIG_HIGHMEM */ 1234#endif /* CONFIG_HIGHMEM */
1134 1235
1135/** 1236/**
1136 * saveable_page - Determine whether a non-highmem page should be included 1237 * saveable_page - Check if the given page is saveable.
1137 * in the suspend image.
1138 * 1238 *
1139 * We should save the page if it isn't Nosave, and is not in the range 1239 * Determine whether a non-highmem page should be included in a hibernation
1140 * of pages statically defined as 'unsaveable', and it isn't a part of 1240 * image.
1141 * a free chunk of pages. 1241 *
1242 * We should save the page if it isn't Nosave, and is not in the range
1243 * of pages statically defined as 'unsaveable', and it isn't part of
1244 * a free chunk of pages.
1142 */ 1245 */
1143static struct page *saveable_page(struct zone *zone, unsigned long pfn) 1246static struct page *saveable_page(struct zone *zone, unsigned long pfn)
1144{ 1247{
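Condensed, the checks described above boil down to roughly the following (the real function also validates the PFN and its zone; the helper names used here are the kernel's own):

    /* Sketch of the saveability checks; 'page' and 'pfn' assumed valid and within 'zone'. */
    if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page))
            return NULL;  /* Nosave, or part of a free chunk */
    if (PageReserved(page) && pfn_is_nosave(pfn))
            return NULL;  /* statically defined as 'unsaveable' */
    return page;          /* otherwise the page should be saved */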
@@ -1167,10 +1270,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
1167} 1270}
1168 1271
1169/** 1272/**
1170 * count_data_pages - compute the total number of saveable non-highmem 1273 * count_data_pages - Compute the total number of saveable non-highmem pages.
1171 * pages.
1172 */ 1274 */
1173
1174static unsigned int count_data_pages(void) 1275static unsigned int count_data_pages(void)
1175{ 1276{
1176 struct zone *zone; 1277 struct zone *zone;
@@ -1190,7 +1291,8 @@ static unsigned int count_data_pages(void)
1190 return n; 1291 return n;
1191} 1292}
1192 1293
1193/* This is needed, because copy_page and memcpy are not usable for copying 1294/*
1295 * This is needed, because copy_page and memcpy are not usable for copying
1194 * task structs. 1296 * task structs.
1195 */ 1297 */
1196static inline void do_copy_page(long *dst, long *src) 1298static inline void do_copy_page(long *dst, long *src)
@@ -1201,12 +1303,12 @@ static inline void do_copy_page(long *dst, long *src)
1201 *dst++ = *src++; 1303 *dst++ = *src++;
1202} 1304}
1203 1305
1204
1205/** 1306/**
1206 * safe_copy_page - check if the page we are going to copy is marked as 1307 * safe_copy_page - Copy a page in a safe way.
1207 * present in the kernel page tables (this always is the case if 1308 *
1208 * CONFIG_DEBUG_PAGEALLOC is not set and in that case 1309 * Check if the page we are going to copy is marked as present in the kernel
1209 * kernel_page_present() always returns 'true'). 1310 * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set
1311 * and in that case kernel_page_present() always returns 'true').
1210 */ 1312 */
1211static void safe_copy_page(void *dst, struct page *s_page) 1313static void safe_copy_page(void *dst, struct page *s_page)
1212{ 1314{
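In other words, roughly (a sketch assuming the CONFIG_DEBUG_PAGEALLOC kernel_map_pages() interface):

    /* Sketch: temporarily map the source page if it is not present in the kernel page tables. */
    if (kernel_page_present(s_page)) {
            do_copy_page(dst, page_address(s_page));
    } else {
            kernel_map_pages(s_page, 1, 1);  /* one page, enable the mapping */
            do_copy_page(dst, page_address(s_page));
            kernel_map_pages(s_page, 1, 0);  /* restore the unmapped state */
    }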
@@ -1219,10 +1321,8 @@ static void safe_copy_page(void *dst, struct page *s_page)
1219 } 1321 }
1220} 1322}
1221 1323
1222
1223#ifdef CONFIG_HIGHMEM 1324#ifdef CONFIG_HIGHMEM
1224static inline struct page * 1325static inline struct page *page_is_saveable(struct zone *zone, unsigned long pfn)
1225page_is_saveable(struct zone *zone, unsigned long pfn)
1226{ 1326{
1227 return is_highmem(zone) ? 1327 return is_highmem(zone) ?
1228 saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn); 1328 saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn);
@@ -1243,7 +1343,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
1243 kunmap_atomic(src); 1343 kunmap_atomic(src);
1244 } else { 1344 } else {
1245 if (PageHighMem(d_page)) { 1345 if (PageHighMem(d_page)) {
1246 /* Page pointed to by src may contain some kernel 1346 /*
1347 * The page pointed to by src may contain some kernel
1247 * data modified by kmap_atomic() 1348 * data modified by kmap_atomic()
1248 */ 1349 */
1249 safe_copy_page(buffer, s_page); 1350 safe_copy_page(buffer, s_page);
@@ -1265,8 +1366,8 @@ static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
1265} 1366}
1266#endif /* CONFIG_HIGHMEM */ 1367#endif /* CONFIG_HIGHMEM */
1267 1368
1268static void 1369static void copy_data_pages(struct memory_bitmap *copy_bm,
1269copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) 1370 struct memory_bitmap *orig_bm)
1270{ 1371{
1271 struct zone *zone; 1372 struct zone *zone;
1272 unsigned long pfn; 1373 unsigned long pfn;
@@ -1315,12 +1416,11 @@ static struct memory_bitmap orig_bm;
1315static struct memory_bitmap copy_bm; 1416static struct memory_bitmap copy_bm;
1316 1417
1317/** 1418/**
1318 * swsusp_free - free pages allocated for the suspend. 1419 * swsusp_free - Free pages allocated for hibernation image.
1319 * 1420 *
1320 * Suspend pages are alocated before the atomic copy is made, so we 1421 * Image pages are allocated before snapshot creation, so they need to be
1321 * need to release them after the resume. 1422 * released after resume.
1322 */ 1423 */
1323
1324void swsusp_free(void) 1424void swsusp_free(void)
1325{ 1425{
1326 unsigned long fb_pfn, fr_pfn; 1426 unsigned long fb_pfn, fr_pfn;
@@ -1351,6 +1451,7 @@ loop:
1351 1451
1352 memory_bm_clear_current(forbidden_pages_map); 1452 memory_bm_clear_current(forbidden_pages_map);
1353 memory_bm_clear_current(free_pages_map); 1453 memory_bm_clear_current(free_pages_map);
1454 hibernate_restore_unprotect_page(page_address(page));
1354 __free_page(page); 1455 __free_page(page);
1355 goto loop; 1456 goto loop;
1356 } 1457 }
@@ -1362,6 +1463,7 @@ out:
1362 buffer = NULL; 1463 buffer = NULL;
1363 alloc_normal = 0; 1464 alloc_normal = 0;
1364 alloc_highmem = 0; 1465 alloc_highmem = 0;
1466 hibernate_restore_protection_end();
1365} 1467}
1366 1468
1367/* Helper functions used for the shrinking of memory. */ 1469/* Helper functions used for the shrinking of memory. */
@@ -1369,7 +1471,7 @@ out:
1369#define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN) 1471#define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN)
1370 1472
1371/** 1473/**
1372 * preallocate_image_pages - Allocate a number of pages for hibernation image 1474 * preallocate_image_pages - Allocate a number of pages for hibernation image.
1373 * @nr_pages: Number of page frames to allocate. 1475 * @nr_pages: Number of page frames to allocate.
1374 * @mask: GFP flags to use for the allocation. 1476 * @mask: GFP flags to use for the allocation.
1375 * 1477 *
@@ -1419,7 +1521,7 @@ static unsigned long preallocate_image_highmem(unsigned long nr_pages)
1419} 1521}
1420 1522
1421/** 1523/**
1422 * __fraction - Compute (an approximation of) x * (multiplier / base) 1524 * __fraction - Compute (an approximation of) x * (multiplier / base).
1423 */ 1525 */
1424static unsigned long __fraction(u64 x, u64 multiplier, u64 base) 1526static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
1425{ 1527{
@@ -1429,8 +1531,8 @@ static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
1429} 1531}
1430 1532
1431static unsigned long preallocate_highmem_fraction(unsigned long nr_pages, 1533static unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1432 unsigned long highmem, 1534 unsigned long highmem,
1433 unsigned long total) 1535 unsigned long total)
1434{ 1536{
1435 unsigned long alloc = __fraction(nr_pages, highmem, total); 1537 unsigned long alloc = __fraction(nr_pages, highmem, total);
1436 1538
@@ -1443,15 +1545,15 @@ static inline unsigned long preallocate_image_highmem(unsigned long nr_pages)
1443} 1545}
1444 1546
1445static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, 1547static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1446 unsigned long highmem, 1548 unsigned long highmem,
1447 unsigned long total) 1549 unsigned long total)
1448{ 1550{
1449 return 0; 1551 return 0;
1450} 1552}
1451#endif /* CONFIG_HIGHMEM */ 1553#endif /* CONFIG_HIGHMEM */
1452 1554
1453/** 1555/**
1454 * free_unnecessary_pages - Release preallocated pages not needed for the image 1556 * free_unnecessary_pages - Release preallocated pages not needed for the image.
1455 */ 1557 */
1456static unsigned long free_unnecessary_pages(void) 1558static unsigned long free_unnecessary_pages(void)
1457{ 1559{
@@ -1505,7 +1607,7 @@ static unsigned long free_unnecessary_pages(void)
1505} 1607}
1506 1608
1507/** 1609/**
1508 * minimum_image_size - Estimate the minimum acceptable size of an image 1610 * minimum_image_size - Estimate the minimum acceptable size of an image.
1509 * @saveable: Number of saveable pages in the system. 1611 * @saveable: Number of saveable pages in the system.
1510 * 1612 *
1511 * We want to avoid attempting to free too much memory too hard, so estimate the 1613 * We want to avoid attempting to free too much memory too hard, so estimate the
@@ -1525,17 +1627,17 @@ static unsigned long minimum_image_size(unsigned long saveable)
1525 unsigned long size; 1627 unsigned long size;
1526 1628
1527 size = global_page_state(NR_SLAB_RECLAIMABLE) 1629 size = global_page_state(NR_SLAB_RECLAIMABLE)
1528 + global_page_state(NR_ACTIVE_ANON) 1630 + global_node_page_state(NR_ACTIVE_ANON)
1529 + global_page_state(NR_INACTIVE_ANON) 1631 + global_node_page_state(NR_INACTIVE_ANON)
1530 + global_page_state(NR_ACTIVE_FILE) 1632 + global_node_page_state(NR_ACTIVE_FILE)
1531 + global_page_state(NR_INACTIVE_FILE) 1633 + global_node_page_state(NR_INACTIVE_FILE)
1532 - global_page_state(NR_FILE_MAPPED); 1634 - global_node_page_state(NR_FILE_MAPPED);
1533 1635
1534 return saveable <= size ? 0 : saveable - size; 1636 return saveable <= size ? 0 : saveable - size;
1535} 1637}
1536 1638
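A quick numeric illustration of the estimate above (made-up figures):

    /* Illustrative figures only. */
    unsigned long saveable = 1000000;  /* saveable pages in the system */
    unsigned long size = 600000;       /* reclaimable slab plus anon/file LRU, minus NR_FILE_MAPPED */
    unsigned long min_image = saveable <= size ? 0 : saveable - size;  /* = 400000 pages */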
1537/** 1639/**
1538 * hibernate_preallocate_memory - Preallocate memory for hibernation image 1640 * hibernate_preallocate_memory - Preallocate memory for hibernation image.
1539 * 1641 *
1540 * To create a hibernation image it is necessary to make a copy of every page 1642 * To create a hibernation image it is necessary to make a copy of every page
1541 * frame in use. We also need a number of page frames to be free during 1643 * frame in use. We also need a number of page frames to be free during
@@ -1708,10 +1810,11 @@ int hibernate_preallocate_memory(void)
1708 1810
1709#ifdef CONFIG_HIGHMEM 1811#ifdef CONFIG_HIGHMEM
1710/** 1812/**
1711 * count_pages_for_highmem - compute the number of non-highmem pages 1813 * count_pages_for_highmem - Count non-highmem pages needed for copying highmem.
1712 * that will be necessary for creating copies of highmem pages. 1814 *
1713 */ 1815 * Compute the number of non-highmem pages that will be necessary for creating
1714 1816 * copies of highmem pages.
1817 */
1715static unsigned int count_pages_for_highmem(unsigned int nr_highmem) 1818static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
1716{ 1819{
1717 unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem; 1820 unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem;
@@ -1724,15 +1827,12 @@ static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
1724 return nr_highmem; 1827 return nr_highmem;
1725} 1828}
1726#else 1829#else
1727static unsigned int 1830static unsigned int count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
1728count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
1729#endif /* CONFIG_HIGHMEM */ 1831#endif /* CONFIG_HIGHMEM */
1730 1832
1731/** 1833/**
1732 * enough_free_mem - Make sure we have enough free memory for the 1834 * enough_free_mem - Check if there is enough free memory for the image.
1733 * snapshot image.
1734 */ 1835 */
1735
1736static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) 1836static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
1737{ 1837{
1738 struct zone *zone; 1838 struct zone *zone;
@@ -1751,10 +1851,11 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
1751 1851
1752#ifdef CONFIG_HIGHMEM 1852#ifdef CONFIG_HIGHMEM
1753/** 1853/**
1754 * get_highmem_buffer - if there are some highmem pages in the suspend 1854 * get_highmem_buffer - Allocate a buffer for highmem pages.
1755 * image, we may need the buffer to copy them and/or load their data. 1855 *
1856 * If there are some highmem pages in the hibernation image, we may need a
1857 * buffer to copy them and/or load their data.
1756 */ 1858 */
1757
1758static inline int get_highmem_buffer(int safe_needed) 1859static inline int get_highmem_buffer(int safe_needed)
1759{ 1860{
1760 buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed); 1861 buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
@@ -1762,13 +1863,13 @@ static inline int get_highmem_buffer(int safe_needed)
1762} 1863}
1763 1864
1764/** 1865/**
1765 * alloc_highmem_image_pages - allocate some highmem pages for the image. 1866 * alloc_highmem_image_pages - Allocate some highmem pages for the image.
1766 * Try to allocate as many pages as needed, but if the number of free 1867 *
1767 * highmem pages is lesser than that, allocate them all. 1868 * Try to allocate as many pages as needed, but if the number of free highmem
1869 * pages is less than that, allocate them all.
1768 */ 1870 */
1769 1871static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm,
1770static inline unsigned int 1872 unsigned int nr_highmem)
1771alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
1772{ 1873{
1773 unsigned int to_alloc = count_free_highmem_pages(); 1874 unsigned int to_alloc = count_free_highmem_pages();
1774 1875
@@ -1787,25 +1888,24 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
1787#else 1888#else
1788static inline int get_highmem_buffer(int safe_needed) { return 0; } 1889static inline int get_highmem_buffer(int safe_needed) { return 0; }
1789 1890
1790static inline unsigned int 1891static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm,
1791alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } 1892 unsigned int n) { return 0; }
1792#endif /* CONFIG_HIGHMEM */ 1893#endif /* CONFIG_HIGHMEM */
1793 1894
1794/** 1895/**
1795 * swsusp_alloc - allocate memory for the suspend image 1896 * swsusp_alloc - Allocate memory for hibernation image.
1796 * 1897 *
1797 * We first try to allocate as many highmem pages as there are 1898 * We first try to allocate as many highmem pages as there are
1798 * saveable highmem pages in the system. If that fails, we allocate 1899 * saveable highmem pages in the system. If that fails, we allocate
1799 * non-highmem pages for the copies of the remaining highmem ones. 1900 * non-highmem pages for the copies of the remaining highmem ones.
1800 * 1901 *
1801 * In this approach it is likely that the copies of highmem pages will 1902 * In this approach it is likely that the copies of highmem pages will
1802 * also be located in the high memory, because of the way in which 1903 * also be located in the high memory, because of the way in which
1803 * copy_data_pages() works. 1904 * copy_data_pages() works.
1804 */ 1905 */
1805 1906static int swsusp_alloc(struct memory_bitmap *orig_bm,
1806static int 1907 struct memory_bitmap *copy_bm,
1807swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, 1908 unsigned int nr_pages, unsigned int nr_highmem)
1808 unsigned int nr_pages, unsigned int nr_highmem)
1809{ 1909{
1810 if (nr_highmem > 0) { 1910 if (nr_highmem > 0) {
1811 if (get_highmem_buffer(PG_ANY)) 1911 if (get_highmem_buffer(PG_ANY))
@@ -1855,7 +1955,8 @@ asmlinkage __visible int swsusp_save(void)
1855 return -ENOMEM; 1955 return -ENOMEM;
1856 } 1956 }
1857 1957
1858 /* During allocating of suspend pagedir, new cold pages may appear. 1958 /*
1959 * During allocating of suspend pagedir, new cold pages may appear.
1859 * Kill them. 1960 * Kill them.
1860 */ 1961 */
1861 drain_local_pages(NULL); 1962 drain_local_pages(NULL);
@@ -1918,12 +2019,14 @@ static int init_header(struct swsusp_info *info)
1918} 2019}
1919 2020
1920/** 2021/**
1921 * pack_pfns - pfns corresponding to the set bits found in the bitmap @bm 2022 * pack_pfns - Prepare PFNs for saving.
1922 * are stored in the array @buf[] (1 page at a time) 2023 * @bm: Memory bitmap.
2024 * @buf: Memory buffer to store the PFNs in.
2025 *
2026 * PFNs corresponding to set bits in @bm are stored in the area of memory
2027 * pointed to by @buf (1 page at a time).
1923 */ 2028 */
1924 2029static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1925static inline void
1926pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1927{ 2030{
1928 int j; 2031 int j;
1929 2032
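The loop this comment describes is essentially the following sketch (the real function additionally records the s390 page key for each entry):

    /* Sketch of pack_pfns(): one page worth of PFNs per call. */
    for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
            buf[j] = memory_bm_next_pfn(bm);
            if (unlikely(buf[j] == BM_END_OF_MAP))
                    break;
    }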
@@ -1937,22 +2040,21 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1937} 2040}
1938 2041
1939/** 2042/**
1940 * snapshot_read_next - used for reading the system memory snapshot. 2043 * snapshot_read_next - Get the address to read the next image page from.
2044 * @handle: Snapshot handle to be used for the reading.
1941 * 2045 *
1942 * On the first call to it @handle should point to a zeroed 2046 * On the first call, @handle should point to a zeroed snapshot_handle
1943 * snapshot_handle structure. The structure gets updated and a pointer 2047 * structure. The structure is then populated and a pointer to it should be
1944 * to it should be passed to this function every next time. 2048 * passed to this function on every subsequent call.
1945 * 2049 *
1946 * On success the function returns a positive number. Then, the caller 2050 * On success, the function returns a positive number. Then, the caller
1947 * is allowed to read up to the returned number of bytes from the memory 2051 * is allowed to read up to the returned number of bytes from the memory
1948 * location computed by the data_of() macro. 2052 * location computed by the data_of() macro.
1949 * 2053 *
1950 * The function returns 0 to indicate the end of data stream condition, 2054 * The function returns 0 to indicate the end of the data stream condition,
1951 * and a negative number is returned on error. In such cases the 2055 * and negative numbers are returned on errors. If that happens, the structure
1952 * structure pointed to by @handle is not updated and should not be used 2056 * pointed to by @handle is not updated and should not be used any more.
1953 * any more.
1954 */ 2057 */
1955
1956int snapshot_read_next(struct snapshot_handle *handle) 2058int snapshot_read_next(struct snapshot_handle *handle)
1957{ 2059{
1958 if (handle->cur > nr_meta_pages + nr_copy_pages) 2060 if (handle->cur > nr_meta_pages + nr_copy_pages)
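Seen from the caller's side, the protocol described above amounts to something like the sketch below (write_out() is a hypothetical stand-in for whatever sink the image writer uses; error handling elided):

    /* Sketch of a snapshot reader loop. */
    struct snapshot_handle handle;
    int n;

    memset(&handle, 0, sizeof(handle));
    while ((n = snapshot_read_next(&handle)) > 0)
            write_out(data_of(handle), n);  /* up to n bytes may be read from data_of(handle) */
    /* n == 0: end of the image data; n < 0: error */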
@@ -1981,7 +2083,8 @@ int snapshot_read_next(struct snapshot_handle *handle)
1981 2083
1982 page = pfn_to_page(memory_bm_next_pfn(&copy_bm)); 2084 page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
1983 if (PageHighMem(page)) { 2085 if (PageHighMem(page)) {
1984 /* Highmem pages are copied to the buffer, 2086 /*
2087 * Highmem pages are copied to the buffer,
1985 * because we can't return with a kmapped 2088 * because we can't return with a kmapped
1986 * highmem page (we may not be called again). 2089 * highmem page (we may not be called again).
1987 */ 2090 */
@@ -1999,53 +2102,41 @@ int snapshot_read_next(struct snapshot_handle *handle)
1999 return PAGE_SIZE; 2102 return PAGE_SIZE;
2000} 2103}
2001 2104
2002/** 2105static void duplicate_memory_bitmap(struct memory_bitmap *dst,
2003 * mark_unsafe_pages - mark the pages that cannot be used for storing 2106 struct memory_bitmap *src)
2004 * the image during resume, because they conflict with the pages that
2005 * had been used before suspend
2006 */
2007
2008static int mark_unsafe_pages(struct memory_bitmap *bm)
2009{ 2107{
2010 struct zone *zone; 2108 unsigned long pfn;
2011 unsigned long pfn, max_zone_pfn;
2012 2109
2013 /* Clear page flags */ 2110 memory_bm_position_reset(src);
2014 for_each_populated_zone(zone) { 2111 pfn = memory_bm_next_pfn(src);
2015 max_zone_pfn = zone_end_pfn(zone); 2112 while (pfn != BM_END_OF_MAP) {
2016 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 2113 memory_bm_set_bit(dst, pfn);
2017 if (pfn_valid(pfn)) 2114 pfn = memory_bm_next_pfn(src);
2018 swsusp_unset_page_free(pfn_to_page(pfn));
2019 } 2115 }
2020
2021 /* Mark pages that correspond to the "original" pfns as "unsafe" */
2022 memory_bm_position_reset(bm);
2023 do {
2024 pfn = memory_bm_next_pfn(bm);
2025 if (likely(pfn != BM_END_OF_MAP)) {
2026 if (likely(pfn_valid(pfn)))
2027 swsusp_set_page_free(pfn_to_page(pfn));
2028 else
2029 return -EFAULT;
2030 }
2031 } while (pfn != BM_END_OF_MAP);
2032
2033 allocated_unsafe_pages = 0;
2034
2035 return 0;
2036} 2116}
2037 2117
2038static void 2118/**
2039duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src) 2119 * mark_unsafe_pages - Mark pages that were used before hibernation.
2120 *
2121 * Mark the pages that cannot be used for storing the image during restoration,
2122 * because they conflict with the pages that had been used before hibernation.
2123 */
2124static void mark_unsafe_pages(struct memory_bitmap *bm)
2040{ 2125{
2041 unsigned long pfn; 2126 unsigned long pfn;
2042 2127
2043 memory_bm_position_reset(src); 2128 /* Clear the "free"/"unsafe" bit for all PFNs */
2044 pfn = memory_bm_next_pfn(src); 2129 memory_bm_position_reset(free_pages_map);
2130 pfn = memory_bm_next_pfn(free_pages_map);
2045 while (pfn != BM_END_OF_MAP) { 2131 while (pfn != BM_END_OF_MAP) {
2046 memory_bm_set_bit(dst, pfn); 2132 memory_bm_clear_current(free_pages_map);
2047 pfn = memory_bm_next_pfn(src); 2133 pfn = memory_bm_next_pfn(free_pages_map);
2048 } 2134 }
2135
2136 /* Mark pages that correspond to the "original" PFNs as "unsafe" */
2137 duplicate_memory_bitmap(free_pages_map, bm);
2138
2139 allocated_unsafe_pages = 0;
2049} 2140}
2050 2141
2051static int check_header(struct swsusp_info *info) 2142static int check_header(struct swsusp_info *info)
@@ -2063,11 +2154,9 @@ static int check_header(struct swsusp_info *info)
2063} 2154}
2064 2155
2065/** 2156/**
2066 * load header - check the image header and copy data from it 2157 * load_header - Check the image header and copy the data from it.
2067 */ 2158 */
2068 2159static int load_header(struct swsusp_info *info)
2069static int
2070load_header(struct swsusp_info *info)
2071{ 2160{
2072 int error; 2161 int error;
2073 2162
@@ -2081,8 +2170,12 @@ load_header(struct swsusp_info *info)
2081} 2170}
2082 2171
2083/** 2172/**
2084 * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set 2173 * unpack_orig_pfns - Set bits corresponding to given PFNs in a memory bitmap.
2085 * the corresponding bit in the memory bitmap @bm 2174 * @bm: Memory bitmap.
2175 * @buf: Area of memory containing the PFNs.
2176 *
2177 * For each element of the array pointed to by @buf (1 page at a time), set the
2178 * corresponding bit in @bm.
2086 */ 2179 */
2087static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) 2180static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
2088{ 2181{
@@ -2095,7 +2188,7 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
2095 /* Extract and buffer page key for data page (s390 only). */ 2188 /* Extract and buffer page key for data page (s390 only). */
2096 page_key_memorize(buf + j); 2189 page_key_memorize(buf + j);
2097 2190
2098 if (memory_bm_pfn_present(bm, buf[j])) 2191 if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j]))
2099 memory_bm_set_bit(bm, buf[j]); 2192 memory_bm_set_bit(bm, buf[j]);
2100 else 2193 else
2101 return -EFAULT; 2194 return -EFAULT;
@@ -2104,13 +2197,9 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
2104 return 0; 2197 return 0;
2105} 2198}
2106 2199
2107/* List of "safe" pages that may be used to store data loaded from the suspend
2108 * image
2109 */
2110static struct linked_page *safe_pages_list;
2111
2112#ifdef CONFIG_HIGHMEM 2200#ifdef CONFIG_HIGHMEM
2113/* struct highmem_pbe is used for creating the list of highmem pages that 2201/*
2202 * struct highmem_pbe is used for creating the list of highmem pages that
2114 * should be restored atomically during the resume from disk, because the page 2203 * should be restored atomically during the resume from disk, because the page
2115 * frames they have occupied before the suspend are in use. 2204 * frames they have occupied before the suspend are in use.
2116 */ 2205 */
@@ -2120,7 +2209,8 @@ struct highmem_pbe {
2120 struct highmem_pbe *next; 2209 struct highmem_pbe *next;
2121}; 2210};
2122 2211
2123/* List of highmem PBEs needed for restoring the highmem pages that were 2212/*
2213 * List of highmem PBEs needed for restoring the highmem pages that were
2124 * allocated before the suspend and included in the suspend image, but have 2214 * allocated before the suspend and included in the suspend image, but have
2125 * also been allocated by the "resume" kernel, so their contents cannot be 2215 * also been allocated by the "resume" kernel, so their contents cannot be
2126 * written directly to their "original" page frames. 2216 * written directly to their "original" page frames.
@@ -2128,11 +2218,11 @@ struct highmem_pbe {
2128static struct highmem_pbe *highmem_pblist; 2218static struct highmem_pbe *highmem_pblist;
2129 2219
2130/** 2220/**
2131 * count_highmem_image_pages - compute the number of highmem pages in the 2221 * count_highmem_image_pages - Compute the number of highmem pages in the image.
2132 * suspend image. The bits in the memory bitmap @bm that correspond to the 2222 * @bm: Memory bitmap.
2133 * image pages are assumed to be set. 2223 *
2224 * The bits in @bm that correspond to image pages are assumed to be set.
2134 */ 2225 */
2135
2136static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) 2226static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
2137{ 2227{
2138 unsigned long pfn; 2228 unsigned long pfn;
@@ -2149,24 +2239,25 @@ static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
2149 return cnt; 2239 return cnt;
2150} 2240}
2151 2241
2152/**
2153 * prepare_highmem_image - try to allocate as many highmem pages as
2154 * there are highmem image pages (@nr_highmem_p points to the variable
2155 * containing the number of highmem image pages). The pages that are
2156 * "safe" (ie. will not be overwritten when the suspend image is
2157 * restored) have the corresponding bits set in @bm (it must be
2158 * unitialized).
2159 *
2160 * NOTE: This function should not be called if there are no highmem
2161 * image pages.
2162 */
2163
2164static unsigned int safe_highmem_pages; 2242static unsigned int safe_highmem_pages;
2165 2243
2166static struct memory_bitmap *safe_highmem_bm; 2244static struct memory_bitmap *safe_highmem_bm;
2167 2245
2168static int 2246/**
2169prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) 2247 * prepare_highmem_image - Allocate memory for loading highmem data from image.
2248 * @bm: Pointer to an uninitialized memory bitmap structure.
2249 * @nr_highmem_p: Pointer to the number of highmem image pages.
2250 *
2251 * Try to allocate as many highmem pages as there are highmem image pages
2252 * (@nr_highmem_p points to the variable containing the number of highmem image
2253 * pages). The pages that are "safe" (ie. will not be overwritten when the
2254 * hibernation image is restored entirely) have the corresponding bits set in
2255 * @bm (it must be unitialized).
2256 *
2257 * NOTE: This function should not be called if there are no highmem image pages.
2258 */
2259static int prepare_highmem_image(struct memory_bitmap *bm,
2260 unsigned int *nr_highmem_p)
2170{ 2261{
2171 unsigned int to_alloc; 2262 unsigned int to_alloc;
2172 2263
@@ -2201,39 +2292,42 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
2201 return 0; 2292 return 0;
2202} 2293}
2203 2294
2295static struct page *last_highmem_page;
2296
2204/** 2297/**
2205 * get_highmem_page_buffer - for given highmem image page find the buffer 2298 * get_highmem_page_buffer - Prepare a buffer to store a highmem image page.
2206 * that suspend_write_next() should set for its caller to write to.
2207 * 2299 *
2208 * If the page is to be saved to its "original" page frame or a copy of 2300 * For a given highmem image page get a buffer that suspend_write_next() should
2209 * the page is to be made in the highmem, @buffer is returned. Otherwise, 2301 * return to its caller to write to.
2210 * the copy of the page is to be made in normal memory, so the address of
2211 * the copy is returned.
2212 * 2302 *
2213 * If @buffer is returned, the caller of suspend_write_next() will write 2303 * If the page is to be saved to its "original" page frame or a copy of
2214 * the page's contents to @buffer, so they will have to be copied to the 2304 * the page is to be made in the highmem, @buffer is returned. Otherwise,
2215 * right location on the next call to suspend_write_next() and it is done 2305 * the copy of the page is to be made in normal memory, so the address of
2216 * with the help of copy_last_highmem_page(). For this purpose, if 2306 * the copy is returned.
2217 * @buffer is returned, @last_highmem page is set to the page to which 2307 *
2218 * the data will have to be copied from @buffer. 2308 * If @buffer is returned, the caller of suspend_write_next() will write
2309 * the page's contents to @buffer, so they will have to be copied to the
2310 * right location on the next call to suspend_write_next() and it is done
2311 * with the help of copy_last_highmem_page(). For this purpose, if
2312 * @buffer is returned, @last_highmem_page is set to the page to which
2313 * the data will have to be copied from @buffer.
2219 */ 2314 */
2220 2315static void *get_highmem_page_buffer(struct page *page,
2221static struct page *last_highmem_page; 2316 struct chain_allocator *ca)
2222
2223static void *
2224get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
2225{ 2317{
2226 struct highmem_pbe *pbe; 2318 struct highmem_pbe *pbe;
2227 void *kaddr; 2319 void *kaddr;
2228 2320
2229 if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) { 2321 if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) {
2230 /* We have allocated the "original" page frame and we can 2322 /*
2323 * We have allocated the "original" page frame and we can
2231 * use it directly to store the loaded page. 2324 * use it directly to store the loaded page.
2232 */ 2325 */
2233 last_highmem_page = page; 2326 last_highmem_page = page;
2234 return buffer; 2327 return buffer;
2235 } 2328 }
2236 /* The "original" page frame has not been allocated and we have to 2329 /*
2330 * The "original" page frame has not been allocated and we have to
2237 * use a "safe" page frame to store the loaded page. 2331 * use a "safe" page frame to store the loaded page.
2238 */ 2332 */
2239 pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); 2333 pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
@@ -2263,11 +2357,12 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
2263} 2357}
2264 2358
2265/** 2359/**
2266 * copy_last_highmem_page - copy the contents of a highmem image from 2360 * copy_last_highmem_page - Copy the most recent highmem image page.
2267 * @buffer, where the caller of snapshot_write_next() has place them, 2361 *
2268 * to the right location represented by @last_highmem_page . 2362 * Copy the contents of a highmem image from @buffer, where the caller of
2363 * snapshot_write_next() has stored them, to the right location represented by
2364 * @last_highmem_page.
2269 */ 2365 */
2270
2271static void copy_last_highmem_page(void) 2366static void copy_last_highmem_page(void)
2272{ 2367{
2273 if (last_highmem_page) { 2368 if (last_highmem_page) {
@@ -2294,17 +2389,13 @@ static inline void free_highmem_data(void)
2294 free_image_page(buffer, PG_UNSAFE_CLEAR); 2389 free_image_page(buffer, PG_UNSAFE_CLEAR);
2295} 2390}
2296#else 2391#else
2297static unsigned int 2392static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
2298count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
2299 2393
2300static inline int 2394static inline int prepare_highmem_image(struct memory_bitmap *bm,
2301prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) 2395 unsigned int *nr_highmem_p) { return 0; }
2302{
2303 return 0;
2304}
2305 2396
2306static inline void * 2397static inline void *get_highmem_page_buffer(struct page *page,
2307get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) 2398 struct chain_allocator *ca)
2308{ 2399{
2309 return ERR_PTR(-EINVAL); 2400 return ERR_PTR(-EINVAL);
2310} 2401}
@@ -2314,27 +2405,27 @@ static inline int last_highmem_page_copied(void) { return 1; }
2314static inline void free_highmem_data(void) {} 2405static inline void free_highmem_data(void) {}
2315#endif /* CONFIG_HIGHMEM */ 2406#endif /* CONFIG_HIGHMEM */
2316 2407
2408#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
2409
2317/** 2410/**
2318 * prepare_image - use the memory bitmap @bm to mark the pages that will 2411 * prepare_image - Make room for loading hibernation image.
2319 * be overwritten in the process of restoring the system memory state 2412 * @new_bm: Unitialized memory bitmap structure.
2320 * from the suspend image ("unsafe" pages) and allocate memory for the 2413 * @bm: Memory bitmap with unsafe pages marked.
2321 * image. 2414 *
2415 * Use @bm to mark the pages that will be overwritten in the process of
2416 * restoring the system memory state from the suspend image ("unsafe" pages)
2417 * and allocate memory for the image.
2322 * 2418 *
2323 * The idea is to allocate a new memory bitmap first and then allocate 2419 * The idea is to allocate a new memory bitmap first and then allocate
2324 * as many pages as needed for the image data, but not to assign these 2420 * as many pages as needed for image data, but without specifying what those
2325 * pages to specific tasks initially. Instead, we just mark them as 2421 * pages will be used for just yet. Instead, we mark them all as allocated and
2326 * allocated and create a lists of "safe" pages that will be used 2422 * create a lists of "safe" pages to be used later. On systems with high
2327 * later. On systems with high memory a list of "safe" highmem pages is 2423 * memory a list of "safe" highmem pages is created too.
2328 * also created.
2329 */ 2424 */
2330 2425static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
2331#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
2332
2333static int
2334prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
2335{ 2426{
2336 unsigned int nr_pages, nr_highmem; 2427 unsigned int nr_pages, nr_highmem;
2337 struct linked_page *sp_list, *lp; 2428 struct linked_page *lp;
2338 int error; 2429 int error;
2339 2430
2340 /* If there is no highmem, the buffer will not be necessary */ 2431 /* If there is no highmem, the buffer will not be necessary */
@@ -2342,9 +2433,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
2342 buffer = NULL; 2433 buffer = NULL;
2343 2434
2344 nr_highmem = count_highmem_image_pages(bm); 2435 nr_highmem = count_highmem_image_pages(bm);
2345 error = mark_unsafe_pages(bm); 2436 mark_unsafe_pages(bm);
2346 if (error)
2347 goto Free;
2348 2437
2349 error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE); 2438 error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE);
2350 if (error) 2439 if (error)
@@ -2357,14 +2446,15 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
2357 if (error) 2446 if (error)
2358 goto Free; 2447 goto Free;
2359 } 2448 }
2360 /* Reserve some safe pages for potential later use. 2449 /*
2450 * Reserve some safe pages for potential later use.
2361 * 2451 *
2362 * NOTE: This way we make sure there will be enough safe pages for the 2452 * NOTE: This way we make sure there will be enough safe pages for the
2363 * chain_alloc() in get_buffer(). It is a bit wasteful, but 2453 * chain_alloc() in get_buffer(). It is a bit wasteful, but
2364 * nr_copy_pages cannot be greater than 50% of the memory anyway. 2454 * nr_copy_pages cannot be greater than 50% of the memory anyway.
2455 *
2456 * nr_copy_pages cannot be less than allocated_unsafe_pages either.
2365 */ 2457 */
2366 sp_list = NULL;
2367 /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */
2368 nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; 2458 nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
2369 nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); 2459 nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
2370 while (nr_pages > 0) { 2460 while (nr_pages > 0) {
@@ -2373,12 +2463,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
2373 error = -ENOMEM; 2463 error = -ENOMEM;
2374 goto Free; 2464 goto Free;
2375 } 2465 }
2376 lp->next = sp_list; 2466 lp->next = safe_pages_list;
2377 sp_list = lp; 2467 safe_pages_list = lp;
2378 nr_pages--; 2468 nr_pages--;
2379 } 2469 }
2380 /* Preallocate memory for the image */ 2470 /* Preallocate memory for the image */
2381 safe_pages_list = NULL;
2382 nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; 2471 nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
2383 while (nr_pages > 0) { 2472 while (nr_pages > 0) {
2384 lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); 2473 lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
@@ -2396,12 +2485,6 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
2396 swsusp_set_page_free(virt_to_page(lp)); 2485 swsusp_set_page_free(virt_to_page(lp));
2397 nr_pages--; 2486 nr_pages--;
2398 } 2487 }
2399 /* Free the reserved safe pages so that chain_alloc() can use them */
2400 while (sp_list) {
2401 lp = sp_list->next;
2402 free_image_page(sp_list, PG_UNSAFE_CLEAR);
2403 sp_list = lp;
2404 }
2405 return 0; 2488 return 0;
2406 2489
2407 Free: 2490 Free:
@@ -2410,10 +2493,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
2410} 2493}
2411 2494
2412/** 2495/**
2413 * get_buffer - compute the address that snapshot_write_next() should 2496 * get_buffer - Get the address to store the next image data page.
2414 * set for its caller to write to. 2497 *
2498 * Get the address that snapshot_write_next() should return to its caller to
2499 * write to.
2415 */ 2500 */
2416
2417static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) 2501static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
2418{ 2502{
2419 struct pbe *pbe; 2503 struct pbe *pbe;
@@ -2428,12 +2512,14 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
2428 return get_highmem_page_buffer(page, ca); 2512 return get_highmem_page_buffer(page, ca);
2429 2513
2430 if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) 2514 if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page))
2431 /* We have allocated the "original" page frame and we can 2515 /*
2516 * We have allocated the "original" page frame and we can
2432 * use it directly to store the loaded page. 2517 * use it directly to store the loaded page.
2433 */ 2518 */
2434 return page_address(page); 2519 return page_address(page);
2435 2520
2436 /* The "original" page frame has not been allocated and we have to 2521 /*
2522 * The "original" page frame has not been allocated and we have to
2437 * use a "safe" page frame to store the loaded page. 2523 * use a "safe" page frame to store the loaded page.
2438 */ 2524 */
2439 pbe = chain_alloc(ca, sizeof(struct pbe)); 2525 pbe = chain_alloc(ca, sizeof(struct pbe));
@@ -2450,22 +2536,21 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
2450} 2536}
2451 2537
2452/** 2538/**
2453 * snapshot_write_next - used for writing the system memory snapshot. 2539 * snapshot_write_next - Get the address to store the next image page.
2540 * @handle: Snapshot handle structure to guide the writing.
2454 * 2541 *
2455 * On the first call to it @handle should point to a zeroed 2542 * On the first call, @handle should point to a zeroed snapshot_handle
2456 * snapshot_handle structure. The structure gets updated and a pointer 2543 * structure. The structure gets populated then and a pointer to it should be
2457 * to it should be passed to this function every next time. 2544 * passed to this function every next time.
2458 * 2545 *
2459 * On success the function returns a positive number. Then, the caller 2546 * On success, the function returns a positive number. Then, the caller
2460 * is allowed to write up to the returned number of bytes to the memory 2547 * is allowed to write up to the returned number of bytes to the memory
2461 * location computed by the data_of() macro. 2548 * location computed by the data_of() macro.
2462 * 2549 *
2463 * The function returns 0 to indicate the "end of file" condition, 2550 * The function returns 0 to indicate the "end of file" condition. Negative
2464 * and a negative number is returned on error. In such cases the 2551 * numbers are returned on errors, in which cases the structure pointed to by
2465 * structure pointed to by @handle is not updated and should not be used 2552 * @handle is not updated and should not be used any more.
2466 * any more.
2467 */ 2553 */
2468
2469int snapshot_write_next(struct snapshot_handle *handle) 2554int snapshot_write_next(struct snapshot_handle *handle)
2470{ 2555{
2471 static struct chain_allocator ca; 2556 static struct chain_allocator ca;
@@ -2491,6 +2576,8 @@ int snapshot_write_next(struct snapshot_handle *handle)
2491 if (error) 2576 if (error)
2492 return error; 2577 return error;
2493 2578
2579 safe_pages_list = NULL;
2580
2494 error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY); 2581 error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
2495 if (error) 2582 if (error)
2496 return error; 2583 return error;
@@ -2500,6 +2587,7 @@ int snapshot_write_next(struct snapshot_handle *handle)
2500 if (error) 2587 if (error)
2501 return error; 2588 return error;
2502 2589
2590 hibernate_restore_protection_begin();
2503 } else if (handle->cur <= nr_meta_pages + 1) { 2591 } else if (handle->cur <= nr_meta_pages + 1) {
2504 error = unpack_orig_pfns(buffer, &copy_bm); 2592 error = unpack_orig_pfns(buffer, &copy_bm);
2505 if (error) 2593 if (error)
@@ -2522,6 +2610,7 @@ int snapshot_write_next(struct snapshot_handle *handle)
2522 copy_last_highmem_page(); 2610 copy_last_highmem_page();
2523 /* Restore page key for data page (s390 only). */ 2611 /* Restore page key for data page (s390 only). */
2524 page_key_write(handle->buffer); 2612 page_key_write(handle->buffer);
2613 hibernate_restore_protect_page(handle->buffer);
2525 handle->buffer = get_buffer(&orig_bm, &ca); 2614 handle->buffer = get_buffer(&orig_bm, &ca);
2526 if (IS_ERR(handle->buffer)) 2615 if (IS_ERR(handle->buffer))
2527 return PTR_ERR(handle->buffer); 2616 return PTR_ERR(handle->buffer);
@@ -2533,22 +2622,23 @@ int snapshot_write_next(struct snapshot_handle *handle)
2533} 2622}
2534 2623
2535/** 2624/**
2536 * snapshot_write_finalize - must be called after the last call to 2625 * snapshot_write_finalize - Complete the loading of a hibernation image.
2537 * snapshot_write_next() in case the last page in the image happens 2626 *
2538 * to be a highmem page and its contents should be stored in the 2627 * Must be called after the last call to snapshot_write_next() in case the last
2539 * highmem. Additionally, it releases the memory that will not be 2628 * page in the image happens to be a highmem page and its contents should be
2540 * used any more. 2629 * stored in highmem. Additionally, it recycles bitmap memory that's not
2630 * necessary any more.
2541 */ 2631 */
2542
2543void snapshot_write_finalize(struct snapshot_handle *handle) 2632void snapshot_write_finalize(struct snapshot_handle *handle)
2544{ 2633{
2545 copy_last_highmem_page(); 2634 copy_last_highmem_page();
2546 /* Restore page key for data page (s390 only). */ 2635 /* Restore page key for data page (s390 only). */
2547 page_key_write(handle->buffer); 2636 page_key_write(handle->buffer);
2548 page_key_free(); 2637 page_key_free();
2549 /* Free only if we have loaded the image entirely */ 2638 hibernate_restore_protect_page(handle->buffer);
2639 /* Do that only if we have loaded the image entirely */
2550 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { 2640 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
2551 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); 2641 memory_bm_recycle(&orig_bm);
2552 free_highmem_data(); 2642 free_highmem_data();
2553 } 2643 }
2554} 2644}
@@ -2561,8 +2651,8 @@ int snapshot_image_loaded(struct snapshot_handle *handle)
2561 2651
2562#ifdef CONFIG_HIGHMEM 2652#ifdef CONFIG_HIGHMEM
2563/* Assumes that @buf is ready and points to a "safe" page */ 2653/* Assumes that @buf is ready and points to a "safe" page */
2564static inline void 2654static inline void swap_two_pages_data(struct page *p1, struct page *p2,
2565swap_two_pages_data(struct page *p1, struct page *p2, void *buf) 2655 void *buf)
2566{ 2656{
2567 void *kaddr1, *kaddr2; 2657 void *kaddr1, *kaddr2;
2568 2658
@@ -2576,15 +2666,15 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
2576} 2666}
2577 2667
2578/** 2668/**
2579 * restore_highmem - for each highmem page that was allocated before 2669 * restore_highmem - Put highmem image pages into their original locations.
2580 * the suspend and included in the suspend image, and also has been 2670 *
2581 * allocated by the "resume" kernel swap its current (ie. "before 2671 * For each highmem page that was in use before hibernation and is included in
2582 * resume") contents with the previous (ie. "before suspend") one. 2672 * the image, and also has been allocated by the "restore" kernel, swap its
2673 * current contents with the previous (ie. "before hibernation") ones.
2583 * 2674 *
2584 * If the resume eventually fails, we can call this function once 2675 * If the restore eventually fails, we can call this function once again and
2585 * again and restore the "before resume" highmem state. 2676 * restore the highmem state as seen by the restore kernel.
2586 */ 2677 */
2587
2588int restore_highmem(void) 2678int restore_highmem(void)
2589{ 2679{
2590 struct highmem_pbe *pbe = highmem_pblist; 2680 struct highmem_pbe *pbe = highmem_pblist;
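Note on the prepare_image() hunks above: the temporary sp_list is gone and the reserved linked pages are threaded directly onto safe_pages_list, which snapshot_write_next() now resets before loading an image. A minimal user-space model of that reservation step is sketched below; linked_page, PBES_PER_LINKED_PAGE and the page size are simplified stand-ins, not the kernel definitions.

#include <stdio.h>
#include <stdlib.h>

#define PAGE_BYTES		4096	/* stand-in page size */
#define PBES_PER_LINKED_PAGE	128	/* stand-in value, not the kernel's */
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

struct linked_page {
	struct linked_page *next;
	char data[PAGE_BYTES - sizeof(struct linked_page *)];
};

static struct linked_page *safe_pages_list;

/*
 * Reserve enough linked pages up front that later chain allocations of
 * nr_pbes page-backup entries cannot fail; each reserved page goes
 * straight onto safe_pages_list, mirroring the loop in prepare_image().
 */
static int reserve_safe_pages(unsigned long nr_pbes)
{
	unsigned long nr_pages = DIV_ROUND_UP(nr_pbes, PBES_PER_LINKED_PAGE);
	struct linked_page *lp;

	while (nr_pages > 0) {
		lp = calloc(1, sizeof(*lp));
		if (!lp)
			return -1;
		lp->next = safe_pages_list;
		safe_pages_list = lp;
		nr_pages--;
	}
	return 0;
}

int main(void)
{
	struct linked_page *lp;

	if (reserve_safe_pages(1000))
		return 1;
	for (lp = safe_pages_list; lp; lp = lp->next)
		printf("reserved linked page at %p\n", (void *)lp);
	return 0;
}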
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 5b70d64b871e..0acab9d7f96f 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -266,16 +266,18 @@ static int suspend_test(int level)
266 */ 266 */
267static int suspend_prepare(suspend_state_t state) 267static int suspend_prepare(suspend_state_t state)
268{ 268{
269 int error; 269 int error, nr_calls = 0;
270 270
271 if (!sleep_state_supported(state)) 271 if (!sleep_state_supported(state))
272 return -EPERM; 272 return -EPERM;
273 273
274 pm_prepare_console(); 274 pm_prepare_console();
275 275
276 error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); 276 error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls);
277 if (error) 277 if (error) {
278 nr_calls--;
278 goto Finish; 279 goto Finish;
280 }
279 281
280 trace_suspend_resume(TPS("freeze_processes"), 0, true); 282 trace_suspend_resume(TPS("freeze_processes"), 0, true);
281 error = suspend_freeze_processes(); 283 error = suspend_freeze_processes();
@@ -286,7 +288,7 @@ static int suspend_prepare(suspend_state_t state)
286 suspend_stats.failed_freeze++; 288 suspend_stats.failed_freeze++;
287 dpm_save_failed_step(SUSPEND_FREEZE); 289 dpm_save_failed_step(SUSPEND_FREEZE);
288 Finish: 290 Finish:
289 pm_notifier_call_chain(PM_POST_SUSPEND); 291 __pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL);
290 pm_restore_console(); 292 pm_restore_console();
291 return error; 293 return error;
292} 294}
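Note on the suspend_prepare() hunk above: switching to __pm_notifier_call_chain() lets the failure path hand PM_POST_SUSPEND only to the nr_calls notifiers that actually ran (minus the failing one, which cleans up after itself). A stand-alone model of that partial-rollback pattern, using plain function pointers instead of the kernel's notifier blocks:

#include <stdio.h>

#define PREPARE 0
#define CLEANUP 1

typedef int (*notifier_fn)(int event);

static int fs_notifier(int event)   { printf("fs: event %d\n", event); return 0; }
static int net_notifier(int event)  { printf("net: event %d\n", event); return 0; }
static int fail_notifier(int event) { return event == PREPARE ? -1 : 0; }

static notifier_fn chain[] = { fs_notifier, net_notifier, fail_notifier };
#define CHAIN_LEN (int)(sizeof(chain) / sizeof(chain[0]))

/* Call up to nr_to_call notifiers (-1 means all); report how many ran. */
static int call_chain(int event, int nr_to_call, int *nr_calls)
{
	int i, ret = 0;

	for (i = 0; i < CHAIN_LEN && nr_to_call != 0; i++, nr_to_call--) {
		if (nr_calls)
			(*nr_calls)++;
		ret = chain[i](event);
		if (ret)
			break;
	}
	return ret;
}

int main(void)
{
	int nr_calls = 0;

	if (call_chain(PREPARE, -1, &nr_calls))
		nr_calls--;	/* the failing notifier cleans up itself */
	/* Undo only what ran successfully, as suspend_prepare() now does. */
	call_chain(CLEANUP, nr_calls, NULL);
	return 0;
}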
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 160e1006640d..a3b1e617bcdc 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -261,7 +261,7 @@ static void hib_end_io(struct bio *bio)
261 bio_put(bio); 261 bio_put(bio);
262} 262}
263 263
264static int hib_submit_io(int rw, pgoff_t page_off, void *addr, 264static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
265 struct hib_bio_batch *hb) 265 struct hib_bio_batch *hb)
266{ 266{
267 struct page *page = virt_to_page(addr); 267 struct page *page = virt_to_page(addr);
@@ -271,6 +271,7 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
271 bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1); 271 bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1);
272 bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); 272 bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
273 bio->bi_bdev = hib_resume_bdev; 273 bio->bi_bdev = hib_resume_bdev;
274 bio_set_op_attrs(bio, op, op_flags);
274 275
275 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { 276 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
276 printk(KERN_ERR "PM: Adding page to bio failed at %llu\n", 277 printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
@@ -283,9 +284,9 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
283 bio->bi_end_io = hib_end_io; 284 bio->bi_end_io = hib_end_io;
284 bio->bi_private = hb; 285 bio->bi_private = hb;
285 atomic_inc(&hb->count); 286 atomic_inc(&hb->count);
286 submit_bio(rw, bio); 287 submit_bio(bio);
287 } else { 288 } else {
288 error = submit_bio_wait(rw, bio); 289 error = submit_bio_wait(bio);
289 bio_put(bio); 290 bio_put(bio);
290 } 291 }
291 292
@@ -306,7 +307,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
306{ 307{
307 int error; 308 int error;
308 309
309 hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL); 310 hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block,
311 swsusp_header, NULL);
310 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || 312 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
311 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { 313 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
312 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); 314 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
@@ -315,8 +317,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
315 swsusp_header->flags = flags; 317 swsusp_header->flags = flags;
316 if (flags & SF_CRC32_MODE) 318 if (flags & SF_CRC32_MODE)
317 swsusp_header->crc32 = handle->crc32; 319 swsusp_header->crc32 = handle->crc32;
318 error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, 320 error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC,
319 swsusp_header, NULL); 321 swsusp_resume_block, swsusp_header, NULL);
320 } else { 322 } else {
321 printk(KERN_ERR "PM: Swap header not found!\n"); 323 printk(KERN_ERR "PM: Swap header not found!\n");
322 error = -ENODEV; 324 error = -ENODEV;
@@ -348,6 +350,12 @@ static int swsusp_swap_check(void)
348 if (res < 0) 350 if (res < 0)
349 blkdev_put(hib_resume_bdev, FMODE_WRITE); 351 blkdev_put(hib_resume_bdev, FMODE_WRITE);
350 352
353 /*
354 * Update the resume device to the one actually used,
355 * so the test_resume mode can use it in case it is
356 * invoked from hibernate() to test the snapshot.
357 */
358 swsusp_resume_device = hib_resume_bdev->bd_dev;
351 return res; 359 return res;
352} 360}
353 361
@@ -389,7 +397,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
389 } else { 397 } else {
390 src = buf; 398 src = buf;
391 } 399 }
392 return hib_submit_io(WRITE_SYNC, offset, src, hb); 400 return hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, offset, src, hb);
393} 401}
394 402
395static void release_swap_writer(struct swap_map_handle *handle) 403static void release_swap_writer(struct swap_map_handle *handle)
@@ -992,7 +1000,8 @@ static int get_swap_reader(struct swap_map_handle *handle,
992 return -ENOMEM; 1000 return -ENOMEM;
993 } 1001 }
994 1002
995 error = hib_submit_io(READ_SYNC, offset, tmp->map, NULL); 1003 error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset,
1004 tmp->map, NULL);
996 if (error) { 1005 if (error) {
997 release_swap_reader(handle); 1006 release_swap_reader(handle);
998 return error; 1007 return error;
@@ -1016,7 +1025,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
1016 offset = handle->cur->entries[handle->k]; 1025 offset = handle->cur->entries[handle->k];
1017 if (!offset) 1026 if (!offset)
1018 return -EFAULT; 1027 return -EFAULT;
1019 error = hib_submit_io(READ_SYNC, offset, buf, hb); 1028 error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, buf, hb);
1020 if (error) 1029 if (error)
1021 return error; 1030 return error;
1022 if (++handle->k >= MAP_PAGE_ENTRIES) { 1031 if (++handle->k >= MAP_PAGE_ENTRIES) {
@@ -1525,7 +1534,8 @@ int swsusp_check(void)
1525 if (!IS_ERR(hib_resume_bdev)) { 1534 if (!IS_ERR(hib_resume_bdev)) {
1526 set_blocksize(hib_resume_bdev, PAGE_SIZE); 1535 set_blocksize(hib_resume_bdev, PAGE_SIZE);
1527 clear_page(swsusp_header); 1536 clear_page(swsusp_header);
1528 error = hib_submit_io(READ_SYNC, swsusp_resume_block, 1537 error = hib_submit_io(REQ_OP_READ, READ_SYNC,
1538 swsusp_resume_block,
1529 swsusp_header, NULL); 1539 swsusp_header, NULL);
1530 if (error) 1540 if (error)
1531 goto put; 1541 goto put;
@@ -1533,7 +1543,8 @@ int swsusp_check(void)
1533 if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { 1543 if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
1534 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); 1544 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
1535 /* Reset swap signature now */ 1545 /* Reset swap signature now */
1536 error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, 1546 error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC,
1547 swsusp_resume_block,
1537 swsusp_header, NULL); 1548 swsusp_header, NULL);
1538 } else { 1549 } else {
1539 error = -EINVAL; 1550 error = -EINVAL;
@@ -1577,10 +1588,12 @@ int swsusp_unmark(void)
1577{ 1588{
1578 int error; 1589 int error;
1579 1590
1580 hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL); 1591 hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block,
1592 swsusp_header, NULL);
1581 if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { 1593 if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
1582 memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); 1594 memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
1583 error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, 1595 error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC,
1596 swsusp_resume_block,
1584 swsusp_header, NULL); 1597 swsusp_header, NULL);
1585 } else { 1598 } else {
1586 printk(KERN_ERR "PM: Cannot find swsusp signature!\n"); 1599 printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
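Note on the hib_submit_io() hunks above: they follow the block layer's split of the old combined rw argument into a request operation (REQ_OP_READ/REQ_OP_WRITE) plus modifier flags, recorded on the bio via bio_set_op_attrs() before submit_bio(). A tiny model of why the call sites change shape; the names below mimic, but are not, the kernel's:

#include <stdio.h>

enum req_op { REQ_OP_READ, REQ_OP_WRITE };	/* illustrative only */
#define SYNC_FLAG 0x1

struct fake_bio {
	enum req_op op;
	unsigned int op_flags;
	unsigned long sector;
};

/* Store the operation and its modifier flags separately on the request. */
static void bio_set_op_attrs(struct fake_bio *bio, enum req_op op,
			     unsigned int op_flags)
{
	bio->op = op;
	bio->op_flags = op_flags;
}

static void submit(struct fake_bio *bio)
{
	printf("%s sector %lu%s\n",
	       bio->op == REQ_OP_WRITE ? "write" : "read",
	       bio->sector, (bio->op_flags & SYNC_FLAG) ? " (sync)" : "");
}

int main(void)
{
	struct fake_bio bio = { .sector = 42 };

	/* Old call sites encoded both in one int; new ones pass them apart. */
	bio_set_op_attrs(&bio, REQ_OP_READ, SYNC_FLAG);
	submit(&bio);
	return 0;
}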
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 526e8911460a..35310b627388 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -47,7 +47,7 @@ atomic_t snapshot_device_available = ATOMIC_INIT(1);
47static int snapshot_open(struct inode *inode, struct file *filp) 47static int snapshot_open(struct inode *inode, struct file *filp)
48{ 48{
49 struct snapshot_data *data; 49 struct snapshot_data *data;
50 int error; 50 int error, nr_calls = 0;
51 51
52 if (!hibernation_available()) 52 if (!hibernation_available())
53 return -EPERM; 53 return -EPERM;
@@ -74,9 +74,9 @@ static int snapshot_open(struct inode *inode, struct file *filp)
74 swap_type_of(swsusp_resume_device, 0, NULL) : -1; 74 swap_type_of(swsusp_resume_device, 0, NULL) : -1;
75 data->mode = O_RDONLY; 75 data->mode = O_RDONLY;
76 data->free_bitmaps = false; 76 data->free_bitmaps = false;
77 error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); 77 error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
78 if (error) 78 if (error)
79 pm_notifier_call_chain(PM_POST_HIBERNATION); 79 __pm_notifier_call_chain(PM_POST_HIBERNATION, --nr_calls, NULL);
80 } else { 80 } else {
81 /* 81 /*
82 * Resuming. We may need to wait for the image device to 82 * Resuming. We may need to wait for the image device to
@@ -86,13 +86,15 @@ static int snapshot_open(struct inode *inode, struct file *filp)
86 86
87 data->swap = -1; 87 data->swap = -1;
88 data->mode = O_WRONLY; 88 data->mode = O_WRONLY;
89 error = pm_notifier_call_chain(PM_RESTORE_PREPARE); 89 error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
90 if (!error) { 90 if (!error) {
91 error = create_basic_memory_bitmaps(); 91 error = create_basic_memory_bitmaps();
92 data->free_bitmaps = !error; 92 data->free_bitmaps = !error;
93 } 93 } else
94 nr_calls--;
95
94 if (error) 96 if (error)
95 pm_notifier_call_chain(PM_POST_RESTORE); 97 __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
96 } 98 }
97 if (error) 99 if (error)
98 atomic_inc(&snapshot_device_available); 100 atomic_inc(&snapshot_device_available);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 60cdf6386763..d4de33934dac 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3177,9 +3177,8 @@ void show_regs_print_info(const char *log_lvl)
3177{ 3177{
3178 dump_stack_print_info(log_lvl); 3178 dump_stack_print_info(log_lvl);
3179 3179
3180 printk("%stask: %p ti: %p task.ti: %p\n", 3180 printk("%stask: %p task.stack: %p\n",
3181 log_lvl, current, current_thread_info(), 3181 log_lvl, current, task_stack_page(current));
3182 task_thread_info(current));
3183} 3182}
3184 3183
3185#endif 3184#endif
diff --git a/kernel/profile.c b/kernel/profile.c
index c2199e9901c9..2dbccf2d806c 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -328,68 +328,57 @@ out:
328 put_cpu(); 328 put_cpu();
329} 329}
330 330
331static int profile_cpu_callback(struct notifier_block *info, 331static int profile_dead_cpu(unsigned int cpu)
332 unsigned long action, void *__cpu)
333{ 332{
334 int node, cpu = (unsigned long)__cpu;
335 struct page *page; 333 struct page *page;
334 int i;
336 335
337 switch (action) { 336 if (prof_cpu_mask != NULL)
338 case CPU_UP_PREPARE: 337 cpumask_clear_cpu(cpu, prof_cpu_mask);
339 case CPU_UP_PREPARE_FROZEN: 338
340 node = cpu_to_mem(cpu); 339 for (i = 0; i < 2; i++) {
341 per_cpu(cpu_profile_flip, cpu) = 0; 340 if (per_cpu(cpu_profile_hits, cpu)[i]) {
342 if (!per_cpu(cpu_profile_hits, cpu)[1]) { 341 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]);
343 page = __alloc_pages_node(node, 342 per_cpu(cpu_profile_hits, cpu)[i] = NULL;
344 GFP_KERNEL | __GFP_ZERO,
345 0);
346 if (!page)
347 return notifier_from_errno(-ENOMEM);
348 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
349 }
350 if (!per_cpu(cpu_profile_hits, cpu)[0]) {
351 page = __alloc_pages_node(node,
352 GFP_KERNEL | __GFP_ZERO,
353 0);
354 if (!page)
355 goto out_free;
356 per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
357 }
358 break;
359out_free:
360 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
361 per_cpu(cpu_profile_hits, cpu)[1] = NULL;
362 __free_page(page);
363 return notifier_from_errno(-ENOMEM);
364 case CPU_ONLINE:
365 case CPU_ONLINE_FROZEN:
366 if (prof_cpu_mask != NULL)
367 cpumask_set_cpu(cpu, prof_cpu_mask);
368 break;
369 case CPU_UP_CANCELED:
370 case CPU_UP_CANCELED_FROZEN:
371 case CPU_DEAD:
372 case CPU_DEAD_FROZEN:
373 if (prof_cpu_mask != NULL)
374 cpumask_clear_cpu(cpu, prof_cpu_mask);
375 if (per_cpu(cpu_profile_hits, cpu)[0]) {
376 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
377 per_cpu(cpu_profile_hits, cpu)[0] = NULL;
378 __free_page(page); 343 __free_page(page);
379 } 344 }
380 if (per_cpu(cpu_profile_hits, cpu)[1]) { 345 }
381 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); 346 return 0;
382 per_cpu(cpu_profile_hits, cpu)[1] = NULL; 347}
383 __free_page(page); 348
349static int profile_prepare_cpu(unsigned int cpu)
350{
351 int i, node = cpu_to_mem(cpu);
352 struct page *page;
353
354 per_cpu(cpu_profile_flip, cpu) = 0;
355
356 for (i = 0; i < 2; i++) {
357 if (per_cpu(cpu_profile_hits, cpu)[i])
358 continue;
359
360 page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
361 if (!page) {
362 profile_dead_cpu(cpu);
363 return -ENOMEM;
384 } 364 }
385 break; 365 per_cpu(cpu_profile_hits, cpu)[i] = page_address(page);
366
386 } 367 }
387 return NOTIFY_OK; 368 return 0;
369}
370
371static int profile_online_cpu(unsigned int cpu)
372{
373 if (prof_cpu_mask != NULL)
374 cpumask_set_cpu(cpu, prof_cpu_mask);
375
376 return 0;
388} 377}
378
389#else /* !CONFIG_SMP */ 379#else /* !CONFIG_SMP */
390#define profile_flip_buffers() do { } while (0) 380#define profile_flip_buffers() do { } while (0)
391#define profile_discard_flip_buffers() do { } while (0) 381#define profile_discard_flip_buffers() do { } while (0)
392#define profile_cpu_callback NULL
393 382
394static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) 383static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
395{ 384{
@@ -531,83 +520,43 @@ static const struct file_operations proc_profile_operations = {
531 .llseek = default_llseek, 520 .llseek = default_llseek,
532}; 521};
533 522
534#ifdef CONFIG_SMP 523int __ref create_proc_profile(void)
535static void profile_nop(void *unused)
536{
537}
538
539static int create_hash_tables(void)
540{ 524{
541 int cpu; 525 struct proc_dir_entry *entry;
542 526#ifdef CONFIG_SMP
543 for_each_online_cpu(cpu) { 527 enum cpuhp_state online_state;
544 int node = cpu_to_mem(cpu);
545 struct page *page;
546
547 page = __alloc_pages_node(node,
548 GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
549 0);
550 if (!page)
551 goto out_cleanup;
552 per_cpu(cpu_profile_hits, cpu)[1]
553 = (struct profile_hit *)page_address(page);
554 page = __alloc_pages_node(node,
555 GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
556 0);
557 if (!page)
558 goto out_cleanup;
559 per_cpu(cpu_profile_hits, cpu)[0]
560 = (struct profile_hit *)page_address(page);
561 }
562 return 0;
563out_cleanup:
564 prof_on = 0;
565 smp_mb();
566 on_each_cpu(profile_nop, NULL, 1);
567 for_each_online_cpu(cpu) {
568 struct page *page;
569
570 if (per_cpu(cpu_profile_hits, cpu)[0]) {
571 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
572 per_cpu(cpu_profile_hits, cpu)[0] = NULL;
573 __free_page(page);
574 }
575 if (per_cpu(cpu_profile_hits, cpu)[1]) {
576 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
577 per_cpu(cpu_profile_hits, cpu)[1] = NULL;
578 __free_page(page);
579 }
580 }
581 return -1;
582}
583#else
584#define create_hash_tables() ({ 0; })
585#endif 528#endif
586 529
587int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */
588{
589 struct proc_dir_entry *entry;
590 int err = 0; 530 int err = 0;
591 531
592 if (!prof_on) 532 if (!prof_on)
593 return 0; 533 return 0;
594 534#ifdef CONFIG_SMP
595 cpu_notifier_register_begin(); 535 err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE",
596 536 profile_prepare_cpu, profile_dead_cpu);
597 if (create_hash_tables()) { 537 if (err)
598 err = -ENOMEM; 538 return err;
599 goto out; 539
600 } 540 err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE",
601 541 profile_online_cpu, NULL);
542 if (err < 0)
543 goto err_state_prep;
544 online_state = err;
545 err = 0;
546#endif
602 entry = proc_create("profile", S_IWUSR | S_IRUGO, 547 entry = proc_create("profile", S_IWUSR | S_IRUGO,
603 NULL, &proc_profile_operations); 548 NULL, &proc_profile_operations);
604 if (!entry) 549 if (!entry)
605 goto out; 550 goto err_state_onl;
606 proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); 551 proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
607 __hotcpu_notifier(profile_cpu_callback, 0);
608 552
609out: 553 return err;
610 cpu_notifier_register_done(); 554err_state_onl:
555#ifdef CONFIG_SMP
556 cpuhp_remove_state(online_state);
557err_state_prep:
558 cpuhp_remove_state(CPUHP_PROFILE_PREPARE);
559#endif
611 return err; 560 return err;
612} 561}
613subsys_initcall(create_proc_profile); 562subsys_initcall(create_proc_profile);
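Note on the profile.c conversion above: create_proc_profile() now installs a prepare/dead pair and a dynamic online callback through cpuhp_setup_state() instead of a CPU-notifier switch statement. The essential contract, symmetric per-CPU setup/teardown with unwinding of states already brought up when a later one fails, can be modelled in user space like this (a sketch, not the kernel's cpuhp implementation):

#include <stdio.h>

struct hp_state {
	const char *name;
	int (*startup)(unsigned int cpu);
	int (*teardown)(unsigned int cpu);
};

static int prep(unsigned int cpu) { printf("cpu%u: prepare\n", cpu); return 0; }
static int dead(unsigned int cpu) { printf("cpu%u: dead\n", cpu); return 0; }
static int onl(unsigned int cpu)  { printf("cpu%u: online\n", cpu); return 0; }

static struct hp_state states[] = {
	{ "PROFILE_PREPARE",   prep, dead },
	{ "AP_PROFILE_ONLINE", onl,  NULL },
};
#define NR_STATES (int)(sizeof(states) / sizeof(states[0]))

/*
 * Bring a CPU up: run every startup callback in order; if one fails,
 * unwind by running the teardown callbacks of the states already done.
 */
static int cpu_up(unsigned int cpu)
{
	int i, ret;

	for (i = 0; i < NR_STATES; i++) {
		if (!states[i].startup)
			continue;
		ret = states[i].startup(cpu);
		if (ret) {
			while (--i >= 0)
				if (states[i].teardown)
					states[i].teardown(cpu);
			return ret;
		}
	}
	return 0;
}

/* Take a CPU down: run the teardown callbacks in reverse order. */
static void cpu_down(unsigned int cpu)
{
	int i;

	for (i = NR_STATES - 1; i >= 0; i--)
		if (states[i].teardown)
			states[i].teardown(cpu);
}

int main(void)
{
	cpu_up(0);
	cpu_down(0);
	return 0;
}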
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 3cee0d8393ed..d38ab08a3fe7 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -58,7 +58,7 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
58#define VERBOSE_PERFOUT_ERRSTRING(s) \ 58#define VERBOSE_PERFOUT_ERRSTRING(s) \
59 do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) 59 do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0)
60 60
61torture_param(bool, gp_exp, true, "Use expedited GP wait primitives"); 61torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
62torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); 62torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
63torture_param(int, nreaders, -1, "Number of RCU reader threads"); 63torture_param(int, nreaders, -1, "Number of RCU reader threads");
64torture_param(int, nwriters, -1, "Number of RCU updater threads"); 64torture_param(int, nwriters, -1, "Number of RCU updater threads");
@@ -96,12 +96,7 @@ static int rcu_perf_writer_state;
96#define MAX_MEAS 10000 96#define MAX_MEAS 10000
97#define MIN_MEAS 100 97#define MIN_MEAS 100
98 98
99#if defined(MODULE) || defined(CONFIG_RCU_PERF_TEST_RUNNABLE) 99static int perf_runnable = IS_ENABLED(MODULE);
100#define RCUPERF_RUNNABLE_INIT 1
101#else
102#define RCUPERF_RUNNABLE_INIT 0
103#endif
104static int perf_runnable = RCUPERF_RUNNABLE_INIT;
105module_param(perf_runnable, int, 0444); 100module_param(perf_runnable, int, 0444);
106MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot"); 101MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot");
107 102
@@ -363,8 +358,6 @@ rcu_perf_writer(void *arg)
363 u64 *wdpp = writer_durations[me]; 358 u64 *wdpp = writer_durations[me];
364 359
365 VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); 360 VERBOSE_PERFOUT_STRING("rcu_perf_writer task started");
366 WARN_ON(rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp);
367 WARN_ON(rcu_gp_is_normal() && gp_exp);
368 WARN_ON(!wdpp); 361 WARN_ON(!wdpp);
369 set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); 362 set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
370 sp.sched_priority = 1; 363 sp.sched_priority = 1;
@@ -631,12 +624,24 @@ rcu_perf_init(void)
631 firsterr = -ENOMEM; 624 firsterr = -ENOMEM;
632 goto unwind; 625 goto unwind;
633 } 626 }
627 if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) {
628 VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
629 firsterr = -EINVAL;
630 goto unwind;
631 }
632 if (rcu_gp_is_normal() && gp_exp) {
633 VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
634 firsterr = -EINVAL;
635 goto unwind;
636 }
634 for (i = 0; i < nrealwriters; i++) { 637 for (i = 0; i < nrealwriters; i++) {
635 writer_durations[i] = 638 writer_durations[i] =
636 kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), 639 kcalloc(MAX_MEAS, sizeof(*writer_durations[i]),
637 GFP_KERNEL); 640 GFP_KERNEL);
638 if (!writer_durations[i]) 641 if (!writer_durations[i]) {
642 firsterr = -ENOMEM;
639 goto unwind; 643 goto unwind;
644 }
640 firsterr = torture_create_kthread(rcu_perf_writer, (void *)i, 645 firsterr = torture_create_kthread(rcu_perf_writer, (void *)i,
641 writer_tasks[i]); 646 writer_tasks[i]);
642 if (firsterr) 647 if (firsterr)
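Note on the rcuperf hunks above: the #if defined(MODULE) || defined(CONFIG_...) block collapses into IS_ENABLED(MODULE), an ordinary C expression rather than preprocessor branching, and the gp_exp sanity checks move from the writer kthreads into rcu_perf_init() so the test fails early with -EINVAL. The preprocessor trick behind IS_ENABLED() can be reproduced outside the kernel as follows; this is a simplified re-implementation for illustration, the real macros live in include/linux/kconfig.h:

#include <stdio.h>

/*
 * Evaluates to 1 if the option macro is defined to 1 and to 0 otherwise,
 * with no #ifdef at the point of use.
 */
#define __ARG_PLACEHOLDER_1			0,
#define __take_second_arg(ignored, val, ...)	val
#define ____is_defined(arg1_or_junk)		__take_second_arg(arg1_or_junk 1, 0)
#define ___is_defined(val)			____is_defined(__ARG_PLACEHOLDER_##val)
#define IS_ENABLED(option)			___is_defined(option)

#define CONFIG_FOO 1
/* CONFIG_BAR intentionally not defined */

static int foo_runnable = IS_ENABLED(CONFIG_FOO);	/* evaluates to 1 */
static int bar_runnable = IS_ENABLED(CONFIG_BAR);	/* evaluates to 0 */

int main(void)
{
	printf("foo=%d bar=%d\n", foo_runnable, bar_runnable);
	return 0;
}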
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 084a28a732eb..971e2b138063 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -182,12 +182,7 @@ static const char *rcu_torture_writer_state_getname(void)
182 return rcu_torture_writer_state_names[i]; 182 return rcu_torture_writer_state_names[i];
183} 183}
184 184
185#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) 185static int torture_runnable = IS_ENABLED(MODULE);
186#define RCUTORTURE_RUNNABLE_INIT 1
187#else
188#define RCUTORTURE_RUNNABLE_INIT 0
189#endif
190static int torture_runnable = RCUTORTURE_RUNNABLE_INIT;
191module_param(torture_runnable, int, 0444); 186module_param(torture_runnable, int, 0444);
192MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot"); 187MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot");
193 188
@@ -1476,7 +1471,7 @@ static int rcu_torture_barrier_cbs(void *arg)
1476 break; 1471 break;
1477 /* 1472 /*
1478 * The above smp_load_acquire() ensures barrier_phase load 1473 * The above smp_load_acquire() ensures barrier_phase load
1479 * is ordered before the folloiwng ->call(). 1474 * is ordered before the following ->call().
1480 */ 1475 */
1481 local_irq_disable(); /* Just to test no-irq call_rcu(). */ 1476 local_irq_disable(); /* Just to test no-irq call_rcu(). */
1482 cur_ops->call(&rcu, rcu_torture_barrier_cbf); 1477 cur_ops->call(&rcu, rcu_torture_barrier_cbf);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index c7f1bc4f817c..5d80925e7fc8 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -125,12 +125,14 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
125/* Number of rcu_nodes at specified level. */ 125/* Number of rcu_nodes at specified level. */
126static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; 126static int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
127int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ 127int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
128/* panic() on RCU Stall sysctl. */
129int sysctl_panic_on_rcu_stall __read_mostly;
128 130
129/* 131/*
130 * The rcu_scheduler_active variable transitions from zero to one just 132 * The rcu_scheduler_active variable transitions from zero to one just
131 * before the first task is spawned. So when this variable is zero, RCU 133 * before the first task is spawned. So when this variable is zero, RCU
132 * can assume that there is but one task, allowing RCU to (for example) 134 * can assume that there is but one task, allowing RCU to (for example)
133 * optimize synchronize_sched() to a simple barrier(). When this variable 135 * optimize synchronize_rcu() to a simple barrier(). When this variable
134 * is one, RCU must actually do all the hard work required to detect real 136 * is one, RCU must actually do all the hard work required to detect real
135 * grace periods. This variable is also used to suppress boot-time false 137 * grace periods. This variable is also used to suppress boot-time false
136 * positives from lockdep-RCU error checking. 138 * positives from lockdep-RCU error checking.
@@ -159,6 +161,7 @@ static void invoke_rcu_core(void);
159static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); 161static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
160static void rcu_report_exp_rdp(struct rcu_state *rsp, 162static void rcu_report_exp_rdp(struct rcu_state *rsp,
161 struct rcu_data *rdp, bool wake); 163 struct rcu_data *rdp, bool wake);
164static void sync_sched_exp_online_cleanup(int cpu);
162 165
163/* rcuc/rcub kthread realtime priority */ 166/* rcuc/rcub kthread realtime priority */
164#ifdef CONFIG_RCU_KTHREAD_PRIO 167#ifdef CONFIG_RCU_KTHREAD_PRIO
@@ -1070,11 +1073,11 @@ EXPORT_SYMBOL_GPL(rcu_is_watching);
1070 * offline to continue to use RCU for one jiffy after marking itself 1073 * offline to continue to use RCU for one jiffy after marking itself
1071 * offline in the cpu_online_mask. This leniency is necessary given the 1074 * offline in the cpu_online_mask. This leniency is necessary given the
1072 * non-atomic nature of the online and offline processing, for example, 1075 * non-atomic nature of the online and offline processing, for example,
1073 * the fact that a CPU enters the scheduler after completing the CPU_DYING 1076 * the fact that a CPU enters the scheduler after completing the teardown
1074 * notifiers. 1077 * of the CPU.
1075 * 1078 *
 1076 * This is also why RCU internally marks CPUs online during the 1079 * This is also why RCU internally marks CPUs online during the
1077 * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase. 1080 * preparation phase and offline after the CPU has been taken down.
1078 * 1081 *
1079 * Disable checking if in an NMI handler because we cannot safely report 1082 * Disable checking if in an NMI handler because we cannot safely report
1080 * errors from NMI handlers anyway. 1083 * errors from NMI handlers anyway.
@@ -1284,9 +1287,9 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
1284 rcu_for_each_leaf_node(rsp, rnp) { 1287 rcu_for_each_leaf_node(rsp, rnp) {
1285 raw_spin_lock_irqsave_rcu_node(rnp, flags); 1288 raw_spin_lock_irqsave_rcu_node(rnp, flags);
1286 if (rnp->qsmask != 0) { 1289 if (rnp->qsmask != 0) {
1287 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 1290 for_each_leaf_node_possible_cpu(rnp, cpu)
1288 if (rnp->qsmask & (1UL << cpu)) 1291 if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu))
1289 dump_cpu_task(rnp->grplo + cpu); 1292 dump_cpu_task(cpu);
1290 } 1293 }
1291 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1294 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1292 } 1295 }
@@ -1311,6 +1314,12 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp)
1311 } 1314 }
1312} 1315}
1313 1316
1317static inline void panic_on_rcu_stall(void)
1318{
1319 if (sysctl_panic_on_rcu_stall)
1320 panic("RCU Stall\n");
1321}
1322
1314static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) 1323static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
1315{ 1324{
1316 int cpu; 1325 int cpu;
@@ -1351,10 +1360,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
1351 raw_spin_lock_irqsave_rcu_node(rnp, flags); 1360 raw_spin_lock_irqsave_rcu_node(rnp, flags);
1352 ndetected += rcu_print_task_stall(rnp); 1361 ndetected += rcu_print_task_stall(rnp);
1353 if (rnp->qsmask != 0) { 1362 if (rnp->qsmask != 0) {
1354 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 1363 for_each_leaf_node_possible_cpu(rnp, cpu)
1355 if (rnp->qsmask & (1UL << cpu)) { 1364 if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
1356 print_cpu_stall_info(rsp, 1365 print_cpu_stall_info(rsp, cpu);
1357 rnp->grplo + cpu);
1358 ndetected++; 1366 ndetected++;
1359 } 1367 }
1360 } 1368 }
@@ -1390,6 +1398,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
1390 1398
1391 rcu_check_gp_kthread_starvation(rsp); 1399 rcu_check_gp_kthread_starvation(rsp);
1392 1400
1401 panic_on_rcu_stall();
1402
1393 force_quiescent_state(rsp); /* Kick them all. */ 1403 force_quiescent_state(rsp); /* Kick them all. */
1394} 1404}
1395 1405
@@ -1430,6 +1440,8 @@ static void print_cpu_stall(struct rcu_state *rsp)
1430 jiffies + 3 * rcu_jiffies_till_stall_check() + 3); 1440 jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
1431 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1441 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1432 1442
1443 panic_on_rcu_stall();
1444
1433 /* 1445 /*
1434 * Attempt to revive the RCU machinery by forcing a context switch. 1446 * Attempt to revive the RCU machinery by forcing a context switch.
1435 * 1447 *
@@ -1989,8 +2001,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
1989 * of the tree within the rsp->node[] array. Note that other CPUs 2001 * of the tree within the rsp->node[] array. Note that other CPUs
1990 * will access only the leaves of the hierarchy, thus seeing that no 2002 * will access only the leaves of the hierarchy, thus seeing that no
1991 * grace period is in progress, at least until the corresponding 2003 * grace period is in progress, at least until the corresponding
1992 * leaf node has been initialized. In addition, we have excluded 2004 * leaf node has been initialized.
1993 * CPU-hotplug operations.
1994 * 2005 *
1995 * The grace period cannot complete until the initialization 2006 * The grace period cannot complete until the initialization
1996 * process finishes, because this kthread handles both. 2007 * process finishes, because this kthread handles both.
@@ -2872,7 +2883,6 @@ static void force_qs_rnp(struct rcu_state *rsp,
2872 unsigned long *maxj), 2883 unsigned long *maxj),
2873 bool *isidle, unsigned long *maxj) 2884 bool *isidle, unsigned long *maxj)
2874{ 2885{
2875 unsigned long bit;
2876 int cpu; 2886 int cpu;
2877 unsigned long flags; 2887 unsigned long flags;
2878 unsigned long mask; 2888 unsigned long mask;
@@ -2907,9 +2917,8 @@ static void force_qs_rnp(struct rcu_state *rsp,
2907 continue; 2917 continue;
2908 } 2918 }
2909 } 2919 }
2910 cpu = rnp->grplo; 2920 for_each_leaf_node_possible_cpu(rnp, cpu) {
2911 bit = 1; 2921 unsigned long bit = leaf_node_cpu_bit(rnp, cpu);
2912 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
2913 if ((rnp->qsmask & bit) != 0) { 2922 if ((rnp->qsmask & bit) != 0) {
2914 if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) 2923 if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
2915 mask |= bit; 2924 mask |= bit;
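Note on the hunks above: force_qs_rnp() and the stall printers stop open-coding the per-leaf bit walk and use for_each_leaf_node_possible_cpu() and leaf_node_cpu_bit() instead. Those helpers are added in kernel/rcu/tree.h (not shown in this section); the runnable model below uses definitions consistent with how they are used here, with a simplified rcu_node and no possible-CPU mask filtering:

#include <stdio.h>

/* Simplified stand-in for a leaf rcu_node: it covers CPUs grplo..grphi. */
struct rcu_node {
	int grplo;
	int grphi;
	unsigned long qsmask;
};

/* Bit for @cpu inside @rnp's masks, as used by the hunks above. */
#define leaf_node_cpu_bit(rnp, cpu)	(1UL << ((cpu) - (rnp)->grplo))

/*
 * Walk every CPU covered by the leaf node. The kernel variant also
 * skips CPUs that are absent from cpu_possible_mask.
 */
#define for_each_leaf_node_possible_cpu(rnp, cpu) \
	for ((cpu) = (rnp)->grplo; (cpu) <= (rnp)->grphi; (cpu)++)

int main(void)
{
	struct rcu_node rnp = { .grplo = 4, .grphi = 7, .qsmask = 0x5 };
	int cpu;

	for_each_leaf_node_possible_cpu(&rnp, cpu)
		if (rnp.qsmask & leaf_node_cpu_bit(&rnp, cpu))
			printf("CPU %d still owes a quiescent state\n", cpu);
	return 0;
}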
@@ -3448,549 +3457,6 @@ static bool rcu_seq_done(unsigned long *sp, unsigned long s)
3448 return ULONG_CMP_GE(READ_ONCE(*sp), s); 3457 return ULONG_CMP_GE(READ_ONCE(*sp), s);
3449} 3458}
3450 3459
3451/* Wrapper functions for expedited grace periods. */
3452static void rcu_exp_gp_seq_start(struct rcu_state *rsp)
3453{
3454 rcu_seq_start(&rsp->expedited_sequence);
3455}
3456static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
3457{
3458 rcu_seq_end(&rsp->expedited_sequence);
3459 smp_mb(); /* Ensure that consecutive grace periods serialize. */
3460}
3461static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
3462{
3463 unsigned long s;
3464
3465 smp_mb(); /* Caller's modifications seen first by other CPUs. */
3466 s = rcu_seq_snap(&rsp->expedited_sequence);
3467 trace_rcu_exp_grace_period(rsp->name, s, TPS("snap"));
3468 return s;
3469}
3470static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
3471{
3472 return rcu_seq_done(&rsp->expedited_sequence, s);
3473}
3474
3475/*
3476 * Reset the ->expmaskinit values in the rcu_node tree to reflect any
3477 * recent CPU-online activity. Note that these masks are not cleared
3478 * when CPUs go offline, so they reflect the union of all CPUs that have
3479 * ever been online. This means that this function normally takes its
3480 * no-work-to-do fastpath.
3481 */
3482static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
3483{
3484 bool done;
3485 unsigned long flags;
3486 unsigned long mask;
3487 unsigned long oldmask;
3488 int ncpus = READ_ONCE(rsp->ncpus);
3489 struct rcu_node *rnp;
3490 struct rcu_node *rnp_up;
3491
3492 /* If no new CPUs onlined since last time, nothing to do. */
3493 if (likely(ncpus == rsp->ncpus_snap))
3494 return;
3495 rsp->ncpus_snap = ncpus;
3496
3497 /*
3498 * Each pass through the following loop propagates newly onlined
3499 * CPUs for the current rcu_node structure up the rcu_node tree.
3500 */
3501 rcu_for_each_leaf_node(rsp, rnp) {
3502 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3503 if (rnp->expmaskinit == rnp->expmaskinitnext) {
3504 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3505 continue; /* No new CPUs, nothing to do. */
3506 }
3507
3508 /* Update this node's mask, track old value for propagation. */
3509 oldmask = rnp->expmaskinit;
3510 rnp->expmaskinit = rnp->expmaskinitnext;
3511 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3512
3513 /* If was already nonzero, nothing to propagate. */
3514 if (oldmask)
3515 continue;
3516
3517 /* Propagate the new CPU up the tree. */
3518 mask = rnp->grpmask;
3519 rnp_up = rnp->parent;
3520 done = false;
3521 while (rnp_up) {
3522 raw_spin_lock_irqsave_rcu_node(rnp_up, flags);
3523 if (rnp_up->expmaskinit)
3524 done = true;
3525 rnp_up->expmaskinit |= mask;
3526 raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags);
3527 if (done)
3528 break;
3529 mask = rnp_up->grpmask;
3530 rnp_up = rnp_up->parent;
3531 }
3532 }
3533}
3534
3535/*
3536 * Reset the ->expmask values in the rcu_node tree in preparation for
3537 * a new expedited grace period.
3538 */
3539static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
3540{
3541 unsigned long flags;
3542 struct rcu_node *rnp;
3543
3544 sync_exp_reset_tree_hotplug(rsp);
3545 rcu_for_each_node_breadth_first(rsp, rnp) {
3546 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3547 WARN_ON_ONCE(rnp->expmask);
3548 rnp->expmask = rnp->expmaskinit;
3549 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3550 }
3551}
3552
3553/*
3554 * Return non-zero if there is no RCU expedited grace period in progress
3555 * for the specified rcu_node structure, in other words, if all CPUs and
3556 * tasks covered by the specified rcu_node structure have done their bit
3557 * for the current expedited grace period. Works only for preemptible
3558 * RCU -- other RCU implementation use other means.
3559 *
3560 * Caller must hold the rcu_state's exp_mutex.
3561 */
3562static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
3563{
3564 return rnp->exp_tasks == NULL &&
3565 READ_ONCE(rnp->expmask) == 0;
3566}
3567
3568/*
3569 * Report the exit from RCU read-side critical section for the last task
3570 * that queued itself during or before the current expedited preemptible-RCU
3571 * grace period. This event is reported either to the rcu_node structure on
3572 * which the task was queued or to one of that rcu_node structure's ancestors,
3573 * recursively up the tree. (Calm down, calm down, we do the recursion
3574 * iteratively!)
3575 *
3576 * Caller must hold the rcu_state's exp_mutex and the specified rcu_node
3577 * structure's ->lock.
3578 */
3579static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
3580 bool wake, unsigned long flags)
3581 __releases(rnp->lock)
3582{
3583 unsigned long mask;
3584
3585 for (;;) {
3586 if (!sync_rcu_preempt_exp_done(rnp)) {
3587 if (!rnp->expmask)
3588 rcu_initiate_boost(rnp, flags);
3589 else
3590 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3591 break;
3592 }
3593 if (rnp->parent == NULL) {
3594 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3595 if (wake) {
3596 smp_mb(); /* EGP done before wake_up(). */
3597 swake_up(&rsp->expedited_wq);
3598 }
3599 break;
3600 }
3601 mask = rnp->grpmask;
3602 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */
3603 rnp = rnp->parent;
3604 raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
3605 WARN_ON_ONCE(!(rnp->expmask & mask));
3606 rnp->expmask &= ~mask;
3607 }
3608}
3609
3610/*
3611 * Report expedited quiescent state for specified node. This is a
3612 * lock-acquisition wrapper function for __rcu_report_exp_rnp().
3613 *
3614 * Caller must hold the rcu_state's exp_mutex.
3615 */
3616static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
3617 struct rcu_node *rnp, bool wake)
3618{
3619 unsigned long flags;
3620
3621 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3622 __rcu_report_exp_rnp(rsp, rnp, wake, flags);
3623}
3624
3625/*
3626 * Report expedited quiescent state for multiple CPUs, all covered by the
3627 * specified leaf rcu_node structure. Caller must hold the rcu_state's
3628 * exp_mutex.
3629 */
3630static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
3631 unsigned long mask, bool wake)
3632{
3633 unsigned long flags;
3634
3635 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3636 if (!(rnp->expmask & mask)) {
3637 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3638 return;
3639 }
3640 rnp->expmask &= ~mask;
3641 __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */
3642}
3643
3644/*
3645 * Report expedited quiescent state for specified rcu_data (CPU).
3646 */
3647static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
3648 bool wake)
3649{
3650 rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake);
3651}
3652
3653/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
3654static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat,
3655 unsigned long s)
3656{
3657 if (rcu_exp_gp_seq_done(rsp, s)) {
3658 trace_rcu_exp_grace_period(rsp->name, s, TPS("done"));
3659 /* Ensure test happens before caller kfree(). */
3660 smp_mb__before_atomic(); /* ^^^ */
3661 atomic_long_inc(stat);
3662 return true;
3663 }
3664 return false;
3665}
3666
3667/*
3668 * Funnel-lock acquisition for expedited grace periods. Returns true
3669 * if some other task completed an expedited grace period that this task
3670 * can piggy-back on, and with no mutex held. Otherwise, returns false
3671 * with the mutex held, indicating that the caller must actually do the
3672 * expedited grace period.
3673 */
3674static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
3675{
3676 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
3677 struct rcu_node *rnp = rdp->mynode;
3678 struct rcu_node *rnp_root = rcu_get_root(rsp);
3679
3680 /* Low-contention fastpath. */
3681 if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) &&
3682 (rnp == rnp_root ||
3683 ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) &&
3684 !mutex_is_locked(&rsp->exp_mutex) &&
3685 mutex_trylock(&rsp->exp_mutex))
3686 goto fastpath;
3687
3688 /*
3689 * Each pass through the following loop works its way up
3690 * the rcu_node tree, returning if others have done the work or
3691 * otherwise falls through to acquire rsp->exp_mutex. The mapping
3692 * from CPU to rcu_node structure can be inexact, as it is just
3693 * promoting locality and is not strictly needed for correctness.
3694 */
3695 for (; rnp != NULL; rnp = rnp->parent) {
3696 if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s))
3697 return true;
3698
3699 /* Work not done, either wait here or go up. */
3700 spin_lock(&rnp->exp_lock);
3701 if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) {
3702
3703 /* Someone else doing GP, so wait for them. */
3704 spin_unlock(&rnp->exp_lock);
3705 trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
3706 rnp->grplo, rnp->grphi,
3707 TPS("wait"));
3708 wait_event(rnp->exp_wq[(s >> 1) & 0x3],
3709 sync_exp_work_done(rsp,
3710 &rdp->exp_workdone2, s));
3711 return true;
3712 }
3713 rnp->exp_seq_rq = s; /* Followers can wait on us. */
3714 spin_unlock(&rnp->exp_lock);
3715 trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo,
3716 rnp->grphi, TPS("nxtlvl"));
3717 }
3718 mutex_lock(&rsp->exp_mutex);
3719fastpath:
3720 if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) {
3721 mutex_unlock(&rsp->exp_mutex);
3722 return true;
3723 }
3724 rcu_exp_gp_seq_start(rsp);
3725 trace_rcu_exp_grace_period(rsp->name, s, TPS("start"));
3726 return false;
3727}
3728
3729/* Invoked on each online non-idle CPU for expedited quiescent state. */
3730static void sync_sched_exp_handler(void *data)
3731{
3732 struct rcu_data *rdp;
3733 struct rcu_node *rnp;
3734 struct rcu_state *rsp = data;
3735
3736 rdp = this_cpu_ptr(rsp->rda);
3737 rnp = rdp->mynode;
3738 if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
3739 __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
3740 return;
3741 if (rcu_is_cpu_rrupt_from_idle()) {
3742 rcu_report_exp_rdp(&rcu_sched_state,
3743 this_cpu_ptr(&rcu_sched_data), true);
3744 return;
3745 }
3746 __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
3747 resched_cpu(smp_processor_id());
3748}
3749
3750/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
3751static void sync_sched_exp_online_cleanup(int cpu)
3752{
3753 struct rcu_data *rdp;
3754 int ret;
3755 struct rcu_node *rnp;
3756 struct rcu_state *rsp = &rcu_sched_state;
3757
3758 rdp = per_cpu_ptr(rsp->rda, cpu);
3759 rnp = rdp->mynode;
3760 if (!(READ_ONCE(rnp->expmask) & rdp->grpmask))
3761 return;
3762 ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0);
3763 WARN_ON_ONCE(ret);
3764}
3765
3766/*
3767 * Select the nodes that the upcoming expedited grace period needs
3768 * to wait for.
3769 */
3770static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
3771 smp_call_func_t func)
3772{
3773 int cpu;
3774 unsigned long flags;
3775 unsigned long mask;
3776 unsigned long mask_ofl_test;
3777 unsigned long mask_ofl_ipi;
3778 int ret;
3779 struct rcu_node *rnp;
3780
3781 sync_exp_reset_tree(rsp);
3782 rcu_for_each_leaf_node(rsp, rnp) {
3783 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3784
3785 /* Each pass checks a CPU for identity, offline, and idle. */
3786 mask_ofl_test = 0;
3787 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
3788 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
3789 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
3790
3791 if (raw_smp_processor_id() == cpu ||
3792 !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
3793 mask_ofl_test |= rdp->grpmask;
3794 }
3795 mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
3796
3797 /*
3798 * Need to wait for any blocked tasks as well. Note that
3799 * additional blocking tasks will also block the expedited
3800 * GP until such time as the ->expmask bits are cleared.
3801 */
3802 if (rcu_preempt_has_tasks(rnp))
3803 rnp->exp_tasks = rnp->blkd_tasks.next;
3804 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3805
3806 /* IPI the remaining CPUs for expedited quiescent state. */
3807 mask = 1;
3808 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
3809 if (!(mask_ofl_ipi & mask))
3810 continue;
3811retry_ipi:
3812 ret = smp_call_function_single(cpu, func, rsp, 0);
3813 if (!ret) {
3814 mask_ofl_ipi &= ~mask;
3815 continue;
3816 }
3817 /* Failed, raced with offline. */
3818 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3819 if (cpu_online(cpu) &&
3820 (rnp->expmask & mask)) {
3821 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3822 schedule_timeout_uninterruptible(1);
3823 if (cpu_online(cpu) &&
3824 (rnp->expmask & mask))
3825 goto retry_ipi;
3826 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3827 }
3828 if (!(rnp->expmask & mask))
3829 mask_ofl_ipi &= ~mask;
3830 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3831 }
3832 /* Report quiescent states for those that went offline. */
3833 mask_ofl_test |= mask_ofl_ipi;
3834 if (mask_ofl_test)
3835 rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
3836 }
3837}
3838
3839static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
3840{
3841 int cpu;
3842 unsigned long jiffies_stall;
3843 unsigned long jiffies_start;
3844 unsigned long mask;
3845 int ndetected;
3846 struct rcu_node *rnp;
3847 struct rcu_node *rnp_root = rcu_get_root(rsp);
3848 int ret;
3849
3850 jiffies_stall = rcu_jiffies_till_stall_check();
3851 jiffies_start = jiffies;
3852
3853 for (;;) {
3854 ret = swait_event_timeout(
3855 rsp->expedited_wq,
3856 sync_rcu_preempt_exp_done(rnp_root),
3857 jiffies_stall);
3858 if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root))
3859 return;
3860 if (ret < 0) {
3861 /* Hit a signal, disable CPU stall warnings. */
3862 swait_event(rsp->expedited_wq,
3863 sync_rcu_preempt_exp_done(rnp_root));
3864 return;
3865 }
3866 pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
3867 rsp->name);
3868 ndetected = 0;
3869 rcu_for_each_leaf_node(rsp, rnp) {
3870 ndetected += rcu_print_task_exp_stall(rnp);
3871 mask = 1;
3872 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
3873 struct rcu_data *rdp;
3874
3875 if (!(rnp->expmask & mask))
3876 continue;
3877 ndetected++;
3878 rdp = per_cpu_ptr(rsp->rda, cpu);
3879 pr_cont(" %d-%c%c%c", cpu,
3880 "O."[!!cpu_online(cpu)],
3881 "o."[!!(rdp->grpmask & rnp->expmaskinit)],
3882 "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
3883 }
3884 mask <<= 1;
3885 }
3886 pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
3887 jiffies - jiffies_start, rsp->expedited_sequence,
3888 rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]);
3889 if (ndetected) {
3890 pr_err("blocking rcu_node structures:");
3891 rcu_for_each_node_breadth_first(rsp, rnp) {
3892 if (rnp == rnp_root)
3893 continue; /* printed unconditionally */
3894 if (sync_rcu_preempt_exp_done(rnp))
3895 continue;
3896 pr_cont(" l=%u:%d-%d:%#lx/%c",
3897 rnp->level, rnp->grplo, rnp->grphi,
3898 rnp->expmask,
3899 ".T"[!!rnp->exp_tasks]);
3900 }
3901 pr_cont("\n");
3902 }
3903 rcu_for_each_leaf_node(rsp, rnp) {
3904 mask = 1;
3905 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
3906 if (!(rnp->expmask & mask))
3907 continue;
3908 dump_cpu_task(cpu);
3909 }
3910 }
3911 jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
3912 }
3913}
3914
3915/*
3916 * Wait for the current expedited grace period to complete, and then
3917 * wake up everyone who piggybacked on the just-completed expedited
3918 * grace period. Also update all the ->exp_seq_rq counters as needed
3919 * in order to avoid counter-wrap problems.
3920 */
3921static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
3922{
3923 struct rcu_node *rnp;
3924
3925 synchronize_sched_expedited_wait(rsp);
3926 rcu_exp_gp_seq_end(rsp);
3927 trace_rcu_exp_grace_period(rsp->name, s, TPS("end"));
3928
3929 /*
3930 * Switch over to wakeup mode, allowing the next GP, but -only- the
3931 * next GP, to proceed.
3932 */
3933 mutex_lock(&rsp->exp_wake_mutex);
3934 mutex_unlock(&rsp->exp_mutex);
3935
3936 rcu_for_each_node_breadth_first(rsp, rnp) {
3937 if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) {
3938 spin_lock(&rnp->exp_lock);
3939 /* Recheck, avoid hang in case someone just arrived. */
3940 if (ULONG_CMP_LT(rnp->exp_seq_rq, s))
3941 rnp->exp_seq_rq = s;
3942 spin_unlock(&rnp->exp_lock);
3943 }
3944 wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]);
3945 }
3946 trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake"));
3947 mutex_unlock(&rsp->exp_wake_mutex);
3948}
3949
3950/**
3951 * synchronize_sched_expedited - Brute-force RCU-sched grace period
3952 *
3953 * Wait for an RCU-sched grace period to elapse, but use a "big hammer"
3954 * approach to force the grace period to end quickly. This consumes
3955 * significant time on all CPUs and is unfriendly to real-time workloads,
3956 * so is thus not recommended for any sort of common-case code. In fact,
3957 * if you are using synchronize_sched_expedited() in a loop, please
3958 * restructure your code to batch your updates, and then use a single
3959 * synchronize_sched() instead.
3960 *
3961 * This implementation can be thought of as an application of sequence
3962 * locking to expedited grace periods, but using the sequence counter to
3963 * determine when someone else has already done the work instead of for
3964 * retrying readers.
3965 */
3966void synchronize_sched_expedited(void)
3967{
3968 unsigned long s;
3969 struct rcu_state *rsp = &rcu_sched_state;
3970
3971 /* If only one CPU, this is automatically a grace period. */
3972 if (rcu_blocking_is_gp())
3973 return;
3974
3975 /* If expedited grace periods are prohibited, fall back to normal. */
3976 if (rcu_gp_is_normal()) {
3977 wait_rcu_gp(call_rcu_sched);
3978 return;
3979 }
3980
3981 /* Take a snapshot of the sequence number. */
3982 s = rcu_exp_gp_seq_snap(rsp);
3983 if (exp_funnel_lock(rsp, s))
3984 return; /* Someone else did our work for us. */
3985
3986 /* Initialize the rcu_node tree in preparation for the wait. */
3987 sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler);
3988
3989 /* Wait and clean up, including waking everyone. */
3990 rcu_exp_wait_wake(rsp, s);
3991}
3992EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
3993
3994/* 3460/*
3995 * Check to see if there is any immediate RCU-related work to be done 3461 * Check to see if there is any immediate RCU-related work to be done
3996 * by the current CPU, for the specified type of RCU, returning 1 if so. 3462 * by the current CPU, for the specified type of RCU, returning 1 if so.
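Note on the large removal above: the expedited grace-period machinery, from the sequence-counter wrappers through synchronize_sched_expedited(), is not deleted outright; per the diffstat it moves into the new kernel/rcu/tree_exp.h. The heart of the "someone else already did my grace period" check is the even/odd sequence counter; a stand-alone model of its semantics:

#include <stdio.h>
#include <assert.h>

/* Even = no expedited GP in flight, odd = one in progress. */
static unsigned long exp_seq;

static void exp_gp_start(void) { exp_seq++; }	/* now odd */
static void exp_gp_end(void)   { exp_seq++; }	/* now even */

/*
 * Snapshot: the smallest even counter value whose being reached means a
 * full grace period has elapsed after this call. If a GP is already in
 * progress it may have started before us, so wait out the next one too.
 */
static unsigned long exp_gp_snap(void)
{
	return (exp_seq + 3) & ~1UL;
}

static int exp_gp_done(unsigned long s)
{
	return exp_seq >= s;	/* the kernel uses ULONG_CMP_GE for wrap safety */
}

int main(void)
{
	unsigned long s;

	exp_gp_start();			/* a GP started before our updates */
	s = exp_gp_snap();
	exp_gp_end();
	assert(!exp_gp_done(s));	/* that GP alone is not enough */
	exp_gp_start();
	exp_gp_end();
	assert(exp_gp_done(s));		/* one full GP after the snapshot */
	printf("sequence-counter model OK\n");
	return 0;
}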
@@ -4281,7 +3747,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
4281 3747
4282 /* Set up local state, ensuring consistent view of global state. */ 3748 /* Set up local state, ensuring consistent view of global state. */
4283 raw_spin_lock_irqsave_rcu_node(rnp, flags); 3749 raw_spin_lock_irqsave_rcu_node(rnp, flags);
4284 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); 3750 rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
4285 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 3751 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
4286 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); 3752 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
4287 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); 3753 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
@@ -4340,12 +3806,58 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
4340 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 3806 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
4341} 3807}
4342 3808
4343static void rcu_prepare_cpu(int cpu) 3809int rcutree_prepare_cpu(unsigned int cpu)
4344{ 3810{
4345 struct rcu_state *rsp; 3811 struct rcu_state *rsp;
4346 3812
4347 for_each_rcu_flavor(rsp) 3813 for_each_rcu_flavor(rsp)
4348 rcu_init_percpu_data(cpu, rsp); 3814 rcu_init_percpu_data(cpu, rsp);
3815
3816 rcu_prepare_kthreads(cpu);
3817 rcu_spawn_all_nocb_kthreads(cpu);
3818
3819 return 0;
3820}
3821
3822static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
3823{
3824 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
3825
3826 rcu_boost_kthread_setaffinity(rdp->mynode, outgoing);
3827}
3828
3829int rcutree_online_cpu(unsigned int cpu)
3830{
3831 sync_sched_exp_online_cleanup(cpu);
3832 rcutree_affinity_setting(cpu, -1);
3833 return 0;
3834}
3835
3836int rcutree_offline_cpu(unsigned int cpu)
3837{
3838 rcutree_affinity_setting(cpu, cpu);
3839 return 0;
3840}
3841
3842
3843int rcutree_dying_cpu(unsigned int cpu)
3844{
3845 struct rcu_state *rsp;
3846
3847 for_each_rcu_flavor(rsp)
3848 rcu_cleanup_dying_cpu(rsp);
3849 return 0;
3850}
3851
3852int rcutree_dead_cpu(unsigned int cpu)
3853{
3854 struct rcu_state *rsp;
3855
3856 for_each_rcu_flavor(rsp) {
3857 rcu_cleanup_dead_cpu(cpu, rsp);
3858 do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu));
3859 }
3860 return 0;
4349} 3861}
4350 3862
4351#ifdef CONFIG_HOTPLUG_CPU 3863#ifdef CONFIG_HOTPLUG_CPU
@@ -4364,9 +3876,6 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
4364 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 3876 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
4365 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ 3877 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
4366 3878
4367 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
4368 return;
4369
4370 /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ 3879 /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
4371 mask = rdp->grpmask; 3880 mask = rdp->grpmask;
4372 raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ 3881 raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
@@ -4388,52 +3897,6 @@ void rcu_report_dead(unsigned int cpu)
4388} 3897}
4389#endif 3898#endif
4390 3899
4391/*
4392 * Handle CPU online/offline notification events.
4393 */
4394int rcu_cpu_notify(struct notifier_block *self,
4395 unsigned long action, void *hcpu)
4396{
4397 long cpu = (long)hcpu;
4398 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
4399 struct rcu_node *rnp = rdp->mynode;
4400 struct rcu_state *rsp;
4401
4402 switch (action) {
4403 case CPU_UP_PREPARE:
4404 case CPU_UP_PREPARE_FROZEN:
4405 rcu_prepare_cpu(cpu);
4406 rcu_prepare_kthreads(cpu);
4407 rcu_spawn_all_nocb_kthreads(cpu);
4408 break;
4409 case CPU_ONLINE:
4410 case CPU_DOWN_FAILED:
4411 sync_sched_exp_online_cleanup(cpu);
4412 rcu_boost_kthread_setaffinity(rnp, -1);
4413 break;
4414 case CPU_DOWN_PREPARE:
4415 rcu_boost_kthread_setaffinity(rnp, cpu);
4416 break;
4417 case CPU_DYING:
4418 case CPU_DYING_FROZEN:
4419 for_each_rcu_flavor(rsp)
4420 rcu_cleanup_dying_cpu(rsp);
4421 break;
4422 case CPU_DEAD:
4423 case CPU_DEAD_FROZEN:
4424 case CPU_UP_CANCELED:
4425 case CPU_UP_CANCELED_FROZEN:
4426 for_each_rcu_flavor(rsp) {
4427 rcu_cleanup_dead_cpu(cpu, rsp);
4428 do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu));
4429 }
4430 break;
4431 default:
4432 break;
4433 }
4434 return NOTIFY_OK;
4435}
4436
4437static int rcu_pm_notify(struct notifier_block *self, 3900static int rcu_pm_notify(struct notifier_block *self,
4438 unsigned long action, void *hcpu) 3901 unsigned long action, void *hcpu)
4439{ 3902{
@@ -4745,10 +4208,10 @@ void __init rcu_init(void)
4745 * this is called early in boot, before either interrupts 4208 * this is called early in boot, before either interrupts
4746 * or the scheduler are operational. 4209 * or the scheduler are operational.
4747 */ 4210 */
4748 cpu_notifier(rcu_cpu_notify, 0);
4749 pm_notifier(rcu_pm_notify, 0); 4211 pm_notifier(rcu_pm_notify, 0);
4750 for_each_online_cpu(cpu) 4212 for_each_online_cpu(cpu)
4751 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 4213 rcutree_prepare_cpu(cpu);
4752} 4214}
4753 4215
4216#include "tree_exp.h"
4754#include "tree_plugin.h" 4217#include "tree_plugin.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index e3959f5e6ddf..f714f873bf9d 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -254,6 +254,13 @@ struct rcu_node {
254} ____cacheline_internodealigned_in_smp; 254} ____cacheline_internodealigned_in_smp;
255 255
256/* 256/*
257 * Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and
258 * are indexed relative to this interval rather than the global CPU ID space.
259 * This generates the bit for a CPU in node-local masks.
260 */
261#define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo))
262
263/*
257 * Do a full breadth-first scan of the rcu_node structures for the 264 * Do a full breadth-first scan of the rcu_node structures for the
258 * specified rcu_state structure. 265 * specified rcu_state structure.
259 */ 266 */
@@ -281,6 +288,14 @@ struct rcu_node {
281 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) 288 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
282 289
283/* 290/*
291 * Iterate over all possible CPUs in a leaf RCU node.
292 */
293#define for_each_leaf_node_possible_cpu(rnp, cpu) \
294 for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \
295 cpu <= rnp->grphi; \
296 cpu = cpumask_next((cpu), cpu_possible_mask))
297
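As a quick illustration of how the two new helpers cooperate, here is a minimal userspace model, not kernel code: the struct, CPU range, and possible-CPU bitmap are made up. The point is only that node-local masks are indexed from grplo, and iteration runs up to grphi while skipping CPUs that are not possible.

#include <stdint.h>
#include <stdio.h>

struct leaf {			/* stands in for the rcu_node fields used here */
	int grplo, grphi;	/* global CPU IDs covered by this leaf */
};

/* Node-local bit for a global CPU ID, as in leaf_node_cpu_bit(). */
static uint64_t leaf_cpu_bit(const struct leaf *l, int cpu)
{
	return 1ULL << (cpu - l->grplo);
}

int main(void)
{
	/* Pretend CPUs 16..31 hang off this leaf and all of them are possible. */
	struct leaf l = { .grplo = 16, .grphi = 31 };
	uint64_t possible = 0xffffULL << 16;	/* global possible-CPU bitmap */
	uint64_t mask = 0;

	for (int cpu = l.grplo; cpu <= l.grphi; cpu++) {
		if (!(possible & (1ULL << cpu)))
			continue;	/* for_each_leaf_node_possible_cpu() skips these */
		mask |= leaf_cpu_bit(&l, cpu);
	}
	printf("node-local mask = %#llx\n", (unsigned long long)mask);	/* 0xffff */
	return 0;
}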
298/*
284 * Union to allow "aggregate OR" operation on the need for a quiescent 299 * Union to allow "aggregate OR" operation on the need for a quiescent
285 * state by the normal and expedited grace periods. 300 * state by the normal and expedited grace periods.
286 */ 301 */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
new file mode 100644
index 000000000000..6d86ab6ec2c9
--- /dev/null
+++ b/kernel/rcu/tree_exp.h
@@ -0,0 +1,655 @@
1/*
2 * RCU expedited grace periods
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, you can access it online at
16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 *
18 * Copyright IBM Corporation, 2016
19 *
20 * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21 */
22
23/* Wrapper functions for expedited grace periods. */
24static void rcu_exp_gp_seq_start(struct rcu_state *rsp)
25{
26 rcu_seq_start(&rsp->expedited_sequence);
27}
28static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
29{
30 rcu_seq_end(&rsp->expedited_sequence);
31 smp_mb(); /* Ensure that consecutive grace periods serialize. */
32}
33static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
34{
35 unsigned long s;
36
37 smp_mb(); /* Caller's modifications seen first by other CPUs. */
38 s = rcu_seq_snap(&rsp->expedited_sequence);
39 trace_rcu_exp_grace_period(rsp->name, s, TPS("snap"));
40 return s;
41}
42static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
43{
44 return rcu_seq_done(&rsp->expedited_sequence, s);
45}
46
47/*
48 * Reset the ->expmaskinit values in the rcu_node tree to reflect any
49 * recent CPU-online activity. Note that these masks are not cleared
50 * when CPUs go offline, so they reflect the union of all CPUs that have
51 * ever been online. This means that this function normally takes its
52 * no-work-to-do fastpath.
53 */
54static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
55{
56 bool done;
57 unsigned long flags;
58 unsigned long mask;
59 unsigned long oldmask;
60 int ncpus = READ_ONCE(rsp->ncpus);
61 struct rcu_node *rnp;
62 struct rcu_node *rnp_up;
63
64 /* If no new CPUs onlined since last time, nothing to do. */
65 if (likely(ncpus == rsp->ncpus_snap))
66 return;
67 rsp->ncpus_snap = ncpus;
68
69 /*
70 * Each pass through the following loop propagates newly onlined
71 * CPUs for the current rcu_node structure up the rcu_node tree.
72 */
73 rcu_for_each_leaf_node(rsp, rnp) {
74 raw_spin_lock_irqsave_rcu_node(rnp, flags);
75 if (rnp->expmaskinit == rnp->expmaskinitnext) {
76 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
77 continue; /* No new CPUs, nothing to do. */
78 }
79
80 /* Update this node's mask, track old value for propagation. */
81 oldmask = rnp->expmaskinit;
82 rnp->expmaskinit = rnp->expmaskinitnext;
83 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
84
85 /* If was already nonzero, nothing to propagate. */
86 if (oldmask)
87 continue;
88
89 /* Propagate the new CPU up the tree. */
90 mask = rnp->grpmask;
91 rnp_up = rnp->parent;
92 done = false;
93 while (rnp_up) {
94 raw_spin_lock_irqsave_rcu_node(rnp_up, flags);
95 if (rnp_up->expmaskinit)
96 done = true;
97 rnp_up->expmaskinit |= mask;
98 raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags);
99 if (done)
100 break;
101 mask = rnp_up->grpmask;
102 rnp_up = rnp_up->parent;
103 }
104 }
105}
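The propagation loop above is compact, so here is a standalone userspace toy of the one idea that matters (field names are borrowed, everything else, including the lack of locking, is a simplification): push the child's bit into each ancestor, and stop once an ancestor already had something recorded, because everything above it was populated on an earlier pass.

#include <stdio.h>

struct node {
	struct node *parent;
	unsigned long expmaskinit;	/* which children have ever had online CPUs */
	unsigned long grpmask;		/* this node's bit in its parent's mask */
};

static void propagate_online(struct node *leaf)
{
	unsigned long mask = leaf->grpmask;

	for (struct node *up = leaf->parent; up; up = up->parent) {
		int done = up->expmaskinit != 0;	/* ancestors already set? */

		up->expmaskinit |= mask;
		if (done)
			break;
		mask = up->grpmask;
	}
}

int main(void)
{
	struct node root = { .parent = NULL };
	struct node mid  = { .parent = &root, .grpmask = 0x1 };
	struct node leaf = { .parent = &mid,  .grpmask = 0x4 };

	propagate_online(&leaf);
	printf("mid=%#lx root=%#lx\n", mid.expmaskinit, root.expmaskinit);
	/* prints mid=0x4 root=0x1 */
	return 0;
}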
106
107/*
108 * Reset the ->expmask values in the rcu_node tree in preparation for
109 * a new expedited grace period.
110 */
111static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
112{
113 unsigned long flags;
114 struct rcu_node *rnp;
115
116 sync_exp_reset_tree_hotplug(rsp);
117 rcu_for_each_node_breadth_first(rsp, rnp) {
118 raw_spin_lock_irqsave_rcu_node(rnp, flags);
119 WARN_ON_ONCE(rnp->expmask);
120 rnp->expmask = rnp->expmaskinit;
121 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
122 }
123}
124
125/*
126 * Return non-zero if there is no RCU expedited grace period in progress
127 * for the specified rcu_node structure, in other words, if all CPUs and
128 * tasks covered by the specified rcu_node structure have done their bit
129 * for the current expedited grace period. Works only for preemptible
130 * RCU -- other RCU implementations use other means.

131 *
132 * Caller must hold the rcu_state's exp_mutex.
133 */
134static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
135{
136 return rnp->exp_tasks == NULL &&
137 READ_ONCE(rnp->expmask) == 0;
138}
139
140/*
141 * Report the exit from RCU read-side critical section for the last task
142 * that queued itself during or before the current expedited preemptible-RCU
143 * grace period. This event is reported either to the rcu_node structure on
144 * which the task was queued or to one of that rcu_node structure's ancestors,
145 * recursively up the tree. (Calm down, calm down, we do the recursion
146 * iteratively!)
147 *
148 * Caller must hold the rcu_state's exp_mutex and the specified rcu_node
149 * structure's ->lock.
150 */
151static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
152 bool wake, unsigned long flags)
153 __releases(rnp->lock)
154{
155 unsigned long mask;
156
157 for (;;) {
158 if (!sync_rcu_preempt_exp_done(rnp)) {
159 if (!rnp->expmask)
160 rcu_initiate_boost(rnp, flags);
161 else
162 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
163 break;
164 }
165 if (rnp->parent == NULL) {
166 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
167 if (wake) {
168 smp_mb(); /* EGP done before wake_up(). */
169 swake_up(&rsp->expedited_wq);
170 }
171 break;
172 }
173 mask = rnp->grpmask;
174 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */
175 rnp = rnp->parent;
176 raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
177 WARN_ON_ONCE(!(rnp->expmask & mask));
178 rnp->expmask &= ~mask;
179 }
180}
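To go with the comment above, a tiny userspace model (invented structures, no locks, no boosting or wakeups) of how a quiescent-state report climbs the tree: a level is left alone while it still has blockers, and only the reporter that clears the last bit keeps climbing until it reaches the root.

#include <stdio.h>

struct node {
	struct node *parent;
	unsigned long expmask;	/* children still blocking the expedited GP */
	unsigned long grpmask;	/* this node's bit in its parent's expmask */
	int exp_tasks;		/* nonzero if blocked readers remain (model) */
};

static int exp_done(const struct node *n)
{
	return !n->exp_tasks && !n->expmask;
}

/* Returns 1 when the root finished, i.e. the whole expedited GP is done. */
static int report_exp(struct node *n)
{
	for (;;) {
		if (!exp_done(n))
			return 0;		/* someone below still owes a QS */
		if (!n->parent)
			return 1;		/* root done: time to wake the waiter */
		n->parent->expmask &= ~n->grpmask;
		n = n->parent;
	}
}

int main(void)
{
	struct node root = { .expmask = 0x3 };
	struct node a = { .parent = &root, .grpmask = 0x1, .expmask = 0 };
	struct node b = { .parent = &root, .grpmask = 0x2, .expmask = 0 };

	printf("%d\n", report_exp(&a));	/* 0: b's subtree is still outstanding */
	printf("%d\n", report_exp(&b));	/* 1: the last reporter reaches the root */
	return 0;
}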
181
182/*
183 * Report expedited quiescent state for specified node. This is a
184 * lock-acquisition wrapper function for __rcu_report_exp_rnp().
185 *
186 * Caller must hold the rcu_state's exp_mutex.
187 */
188static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
189 struct rcu_node *rnp, bool wake)
190{
191 unsigned long flags;
192
193 raw_spin_lock_irqsave_rcu_node(rnp, flags);
194 __rcu_report_exp_rnp(rsp, rnp, wake, flags);
195}
196
197/*
198 * Report expedited quiescent state for multiple CPUs, all covered by the
199 * specified leaf rcu_node structure. Caller must hold the rcu_state's
200 * exp_mutex.
201 */
202static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
203 unsigned long mask, bool wake)
204{
205 unsigned long flags;
206
207 raw_spin_lock_irqsave_rcu_node(rnp, flags);
208 if (!(rnp->expmask & mask)) {
209 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
210 return;
211 }
212 rnp->expmask &= ~mask;
213 __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */
214}
215
216/*
217 * Report expedited quiescent state for specified rcu_data (CPU).
218 */
219static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
220 bool wake)
221{
222 rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake);
223}
224
225/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
226static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat,
227 unsigned long s)
228{
229 if (rcu_exp_gp_seq_done(rsp, s)) {
230 trace_rcu_exp_grace_period(rsp->name, s, TPS("done"));
231 /* Ensure test happens before caller kfree(). */
232 smp_mb__before_atomic(); /* ^^^ */
233 atomic_long_inc(stat);
234 return true;
235 }
236 return false;
237}
238
239/*
240 * Funnel-lock acquisition for expedited grace periods. Returns true
241 * if some other task completed an expedited grace period that this task
242 * can piggy-back on, and with no mutex held. Otherwise, returns false
243 * with the mutex held, indicating that the caller must actually do the
244 * expedited grace period.
245 */
246static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
247{
248 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
249 struct rcu_node *rnp = rdp->mynode;
250 struct rcu_node *rnp_root = rcu_get_root(rsp);
251
252 /* Low-contention fastpath. */
253 if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) &&
254 (rnp == rnp_root ||
255 ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) &&
256 mutex_trylock(&rsp->exp_mutex))
257 goto fastpath;
258
259 /*
260 * Each pass through the following loop works its way up
261 * the rcu_node tree, returning if others have done the work or
262 * otherwise falls through to acquire rsp->exp_mutex. The mapping
263 * from CPU to rcu_node structure can be inexact, as it is just
264 * promoting locality and is not strictly needed for correctness.
265 */
266 for (; rnp != NULL; rnp = rnp->parent) {
267 if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s))
268 return true;
269
270 /* Work not done, either wait here or go up. */
271 spin_lock(&rnp->exp_lock);
272 if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) {
273
274 /* Someone else doing GP, so wait for them. */
275 spin_unlock(&rnp->exp_lock);
276 trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
277 rnp->grplo, rnp->grphi,
278 TPS("wait"));
279 wait_event(rnp->exp_wq[(s >> 1) & 0x3],
280 sync_exp_work_done(rsp,
281 &rdp->exp_workdone2, s));
282 return true;
283 }
284 rnp->exp_seq_rq = s; /* Followers can wait on us. */
285 spin_unlock(&rnp->exp_lock);
286 trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo,
287 rnp->grphi, TPS("nxtlvl"));
288 }
289 mutex_lock(&rsp->exp_mutex);
290fastpath:
291 if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) {
292 mutex_unlock(&rsp->exp_mutex);
293 return true;
294 }
295 rcu_exp_gp_seq_start(rsp);
296 trace_rcu_exp_grace_period(rsp->name, s, TPS("start"));
297 return false;
298}
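The funnel's "someone else did our work" checks all reduce to sequence-counter arithmetic. Below is a simplified, single-threaded userspace model of that arithmetic; the helper names are mine, and the wrap-safe comparisons and memory barriers of the real rcu_seq_*() helpers are omitted. A snapshot is the counter value that any grace period starting after the snapshot will have reached, so a caller whose snapshot has already been passed can simply return.

#include <stdio.h>

static unsigned long expedited_sequence;	/* low bit set while a GP is in flight */

static unsigned long seq_snap(void)
{
	/* Next even value at least one full start/end cycle ahead. */
	return (expedited_sequence + 3) & ~0x1UL;
}

static int seq_done(unsigned long s)
{
	return expedited_sequence >= s;	/* kernel uses a wrap-safe comparison */
}

static void gp_start(void) { expedited_sequence++; }	/* counter becomes odd */
static void gp_end(void)   { expedited_sequence++; }	/* even again: GP complete */

int main(void)
{
	unsigned long s = seq_snap();	/* caller needs a GP after this point */

	printf("done before any GP? %d\n", seq_done(s));	/* 0 */
	gp_start();
	gp_end();			/* some other task ran the GP for us */
	printf("done after one GP?  %d\n", seq_done(s));	/* 1: piggy-back and return */
	return 0;
}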
299
300/* Invoked on each online non-idle CPU for expedited quiescent state. */
301static void sync_sched_exp_handler(void *data)
302{
303 struct rcu_data *rdp;
304 struct rcu_node *rnp;
305 struct rcu_state *rsp = data;
306
307 rdp = this_cpu_ptr(rsp->rda);
308 rnp = rdp->mynode;
309 if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
310 __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
311 return;
312 if (rcu_is_cpu_rrupt_from_idle()) {
313 rcu_report_exp_rdp(&rcu_sched_state,
314 this_cpu_ptr(&rcu_sched_data), true);
315 return;
316 }
317 __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
318 resched_cpu(smp_processor_id());
319}
320
321/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
322static void sync_sched_exp_online_cleanup(int cpu)
323{
324 struct rcu_data *rdp;
325 int ret;
326 struct rcu_node *rnp;
327 struct rcu_state *rsp = &rcu_sched_state;
328
329 rdp = per_cpu_ptr(rsp->rda, cpu);
330 rnp = rdp->mynode;
331 if (!(READ_ONCE(rnp->expmask) & rdp->grpmask))
332 return;
333 ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0);
334 WARN_ON_ONCE(ret);
335}
336
337/*
338 * Select the nodes that the upcoming expedited grace period needs
339 * to wait for.
340 */
341static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
342 smp_call_func_t func)
343{
344 int cpu;
345 unsigned long flags;
346 unsigned long mask_ofl_test;
347 unsigned long mask_ofl_ipi;
348 int ret;
349 struct rcu_node *rnp;
350
351 sync_exp_reset_tree(rsp);
352 rcu_for_each_leaf_node(rsp, rnp) {
353 raw_spin_lock_irqsave_rcu_node(rnp, flags);
354
355 /* Each pass checks a CPU for identity, offline, and idle. */
356 mask_ofl_test = 0;
357 for_each_leaf_node_possible_cpu(rnp, cpu) {
358 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
359 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
360
361 if (raw_smp_processor_id() == cpu ||
362 !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
363 mask_ofl_test |= rdp->grpmask;
364 }
365 mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
366
367 /*
368 * Need to wait for any blocked tasks as well. Note that
369 * additional blocking tasks will also block the expedited
370 * GP until such time as the ->expmask bits are cleared.
371 */
372 if (rcu_preempt_has_tasks(rnp))
373 rnp->exp_tasks = rnp->blkd_tasks.next;
374 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
375
376 /* IPI the remaining CPUs for expedited quiescent state. */
377 for_each_leaf_node_possible_cpu(rnp, cpu) {
378 unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
379 if (!(mask_ofl_ipi & mask))
380 continue;
381retry_ipi:
382 ret = smp_call_function_single(cpu, func, rsp, 0);
383 if (!ret) {
384 mask_ofl_ipi &= ~mask;
385 continue;
386 }
387 /* Failed, raced with offline. */
388 raw_spin_lock_irqsave_rcu_node(rnp, flags);
389 if (cpu_online(cpu) &&
390 (rnp->expmask & mask)) {
391 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
392 schedule_timeout_uninterruptible(1);
393 if (cpu_online(cpu) &&
394 (rnp->expmask & mask))
395 goto retry_ipi;
396 raw_spin_lock_irqsave_rcu_node(rnp, flags);
397 }
398 if (!(rnp->expmask & mask))
399 mask_ofl_ipi &= ~mask;
400 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
401 }
402 /* Report quiescent states for those that went offline. */
403 mask_ofl_test |= mask_ofl_ipi;
404 if (mask_ofl_test)
405 rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
406 }
407}
408
409static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
410{
411 int cpu;
412 unsigned long jiffies_stall;
413 unsigned long jiffies_start;
414 unsigned long mask;
415 int ndetected;
416 struct rcu_node *rnp;
417 struct rcu_node *rnp_root = rcu_get_root(rsp);
418 int ret;
419
420 jiffies_stall = rcu_jiffies_till_stall_check();
421 jiffies_start = jiffies;
422
423 for (;;) {
424 ret = swait_event_timeout(
425 rsp->expedited_wq,
426 sync_rcu_preempt_exp_done(rnp_root),
427 jiffies_stall);
428 if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root))
429 return;
430 if (ret < 0) {
431 /* Hit a signal, disable CPU stall warnings. */
432 swait_event(rsp->expedited_wq,
433 sync_rcu_preempt_exp_done(rnp_root));
434 return;
435 }
436 pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
437 rsp->name);
438 ndetected = 0;
439 rcu_for_each_leaf_node(rsp, rnp) {
440 ndetected += rcu_print_task_exp_stall(rnp);
441 for_each_leaf_node_possible_cpu(rnp, cpu) {
442 struct rcu_data *rdp;
443
444 mask = leaf_node_cpu_bit(rnp, cpu);
445 if (!(rnp->expmask & mask))
446 continue;
447 ndetected++;
448 rdp = per_cpu_ptr(rsp->rda, cpu);
449 pr_cont(" %d-%c%c%c", cpu,
450 "O."[!!cpu_online(cpu)],
451 "o."[!!(rdp->grpmask & rnp->expmaskinit)],
452 "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
453 }
454 }
455 pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
456 jiffies - jiffies_start, rsp->expedited_sequence,
457 rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]);
458 if (ndetected) {
459 pr_err("blocking rcu_node structures:");
460 rcu_for_each_node_breadth_first(rsp, rnp) {
461 if (rnp == rnp_root)
462 continue; /* printed unconditionally */
463 if (sync_rcu_preempt_exp_done(rnp))
464 continue;
465 pr_cont(" l=%u:%d-%d:%#lx/%c",
466 rnp->level, rnp->grplo, rnp->grphi,
467 rnp->expmask,
468 ".T"[!!rnp->exp_tasks]);
469 }
470 pr_cont("\n");
471 }
472 rcu_for_each_leaf_node(rsp, rnp) {
473 for_each_leaf_node_possible_cpu(rnp, cpu) {
474 mask = leaf_node_cpu_bit(rnp, cpu);
475 if (!(rnp->expmask & mask))
476 continue;
477 dump_cpu_task(cpu);
478 }
479 }
480 jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
481 }
482}
483
484/*
485 * Wait for the current expedited grace period to complete, and then
486 * wake up everyone who piggybacked on the just-completed expedited
487 * grace period. Also update all the ->exp_seq_rq counters as needed
488 * in order to avoid counter-wrap problems.
489 */
490static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
491{
492 struct rcu_node *rnp;
493
494 synchronize_sched_expedited_wait(rsp);
495 rcu_exp_gp_seq_end(rsp);
496 trace_rcu_exp_grace_period(rsp->name, s, TPS("end"));
497
498 /*
499 * Switch over to wakeup mode, allowing the next GP, but -only- the
500 * next GP, to proceed.
501 */
502 mutex_lock(&rsp->exp_wake_mutex);
503 mutex_unlock(&rsp->exp_mutex);
504
505 rcu_for_each_node_breadth_first(rsp, rnp) {
506 if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) {
507 spin_lock(&rnp->exp_lock);
508 /* Recheck, avoid hang in case someone just arrived. */
509 if (ULONG_CMP_LT(rnp->exp_seq_rq, s))
510 rnp->exp_seq_rq = s;
511 spin_unlock(&rnp->exp_lock);
512 }
513 wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]);
514 }
515 trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake"));
516 mutex_unlock(&rsp->exp_wake_mutex);
517}
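The wake_up_all() above indexes one of four wait queues, and the arithmetic is worth spelling out: the counter's low bit only flags an in-flight grace period, so shifting it away counts completed grace periods, and masking with 0x3 rotates successive grace periods across four queues so that waiters already parked for later grace periods are not woken early. A short userspace demo of the mapping (purely illustrative):

#include <stdio.h>

int main(void)
{
	for (unsigned long seq = 0; seq <= 14; seq += 2)
		printf("expedited_sequence=%2lu -> wake queue %lu\n",
		       seq, (seq >> 1) & 0x3);
	/* queues cycle 0, 1, 2, 3, 0, 1, 2, 3 as grace periods complete */
	return 0;
}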
518
519/**
520 * synchronize_sched_expedited - Brute-force RCU-sched grace period
521 *
522 * Wait for an RCU-sched grace period to elapse, but use a "big hammer"
523 * approach to force the grace period to end quickly. This consumes
524 * significant time on all CPUs and is unfriendly to real-time workloads,
525 * so is thus not recommended for any sort of common-case code. In fact,
526 * if you are using synchronize_sched_expedited() in a loop, please
527 * restructure your code to batch your updates, and then use a single
528 * synchronize_sched() instead.
529 *
530 * This implementation can be thought of as an application of sequence
531 * locking to expedited grace periods, but using the sequence counter to
532 * determine when someone else has already done the work instead of for
533 * retrying readers.
534 */
535void synchronize_sched_expedited(void)
536{
537 unsigned long s;
538 struct rcu_state *rsp = &rcu_sched_state;
539
540 /* If only one CPU, this is automatically a grace period. */
541 if (rcu_blocking_is_gp())
542 return;
543
544 /* If expedited grace periods are prohibited, fall back to normal. */
545 if (rcu_gp_is_normal()) {
546 wait_rcu_gp(call_rcu_sched);
547 return;
548 }
549
550 /* Take a snapshot of the sequence number. */
551 s = rcu_exp_gp_seq_snap(rsp);
552 if (exp_funnel_lock(rsp, s))
553 return; /* Someone else did our work for us. */
554
555 /* Initialize the rcu_node tree in preparation for the wait. */
556 sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler);
557
558 /* Wait and clean up, including waking everyone. */
559 rcu_exp_wait_wake(rsp, s);
560}
561EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
562
563#ifdef CONFIG_PREEMPT_RCU
564
565/*
566 * Remote handler for smp_call_function_single(). If there is an
567 * RCU read-side critical section in effect, request that the
568 * next rcu_read_unlock() record the quiescent state up the
569 * ->expmask fields in the rcu_node tree. Otherwise, immediately
570 * report the quiescent state.
571 */
572static void sync_rcu_exp_handler(void *info)
573{
574 struct rcu_data *rdp;
575 struct rcu_state *rsp = info;
576 struct task_struct *t = current;
577
578 /*
579 * Within an RCU read-side critical section, request that the next
 580 * rcu_read_unlock() report the quiescent state, unless this read-side critical
581 * section has already blocked, in which case it is already set
582 * up for the expedited grace period to wait on it.
583 */
584 if (t->rcu_read_lock_nesting > 0 &&
585 !t->rcu_read_unlock_special.b.blocked) {
586 t->rcu_read_unlock_special.b.exp_need_qs = true;
587 return;
588 }
589
590 /*
591 * We are either exiting an RCU read-side critical section (negative
592 * values of t->rcu_read_lock_nesting) or are not in one at all
593 * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU
594 * read-side critical section that blocked before this expedited
595 * grace period started. Either way, we can immediately report
596 * the quiescent state.
597 */
598 rdp = this_cpu_ptr(rsp->rda);
599 rcu_report_exp_rdp(rsp, rdp, true);
600}
601
602/**
603 * synchronize_rcu_expedited - Brute-force RCU grace period
604 *
605 * Wait for an RCU-preempt grace period, but expedite it. The basic
606 * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler
607 * checks whether the CPU is in an RCU-preempt critical section, and
608 * if so, it sets a flag that causes the outermost rcu_read_unlock()
609 * to report the quiescent state. On the other hand, if the CPU is
610 * not in an RCU read-side critical section, the IPI handler reports
611 * the quiescent state immediately.
612 *
 613 * Although this is a great improvement over previous expedited
614 * implementations, it is still unfriendly to real-time workloads, so is
615 * thus not recommended for any sort of common-case code. In fact, if
616 * you are using synchronize_rcu_expedited() in a loop, please restructure
 617 * your code to batch your updates, and then use a single synchronize_rcu()
618 * instead.
619 */
620void synchronize_rcu_expedited(void)
621{
622 struct rcu_state *rsp = rcu_state_p;
623 unsigned long s;
624
625 /* If expedited grace periods are prohibited, fall back to normal. */
626 if (rcu_gp_is_normal()) {
627 wait_rcu_gp(call_rcu);
628 return;
629 }
630
631 s = rcu_exp_gp_seq_snap(rsp);
632 if (exp_funnel_lock(rsp, s))
633 return; /* Someone else did our work for us. */
634
635 /* Initialize the rcu_node tree in preparation for the wait. */
636 sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler);
637
638 /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */
639 rcu_exp_wait_wake(rsp, s);
640}
641EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
642
643#else /* #ifdef CONFIG_PREEMPT_RCU */
644
645/*
646 * Wait for an rcu-preempt grace period, but make it happen quickly.
647 * But because preemptible RCU does not exist, map to rcu-sched.
648 */
649void synchronize_rcu_expedited(void)
650{
651 synchronize_sched_expedited();
652}
653EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
654
655#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index ff1cd4e1188d..0082fce402a0 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -79,8 +79,6 @@ static void __init rcu_bootup_announce_oddness(void)
79 pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); 79 pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
80 if (IS_ENABLED(CONFIG_PROVE_RCU)) 80 if (IS_ENABLED(CONFIG_PROVE_RCU))
81 pr_info("\tRCU lockdep checking is enabled.\n"); 81 pr_info("\tRCU lockdep checking is enabled.\n");
82 if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE))
83 pr_info("\tRCU torture testing starts during boot.\n");
84 if (RCU_NUM_LVLS >= 4) 82 if (RCU_NUM_LVLS >= 4)
85 pr_info("\tFour(or more)-level hierarchy is enabled.\n"); 83 pr_info("\tFour(or more)-level hierarchy is enabled.\n");
86 if (RCU_FANOUT_LEAF != 16) 84 if (RCU_FANOUT_LEAF != 16)
@@ -681,84 +679,6 @@ void synchronize_rcu(void)
681} 679}
682EXPORT_SYMBOL_GPL(synchronize_rcu); 680EXPORT_SYMBOL_GPL(synchronize_rcu);
683 681
684/*
685 * Remote handler for smp_call_function_single(). If there is an
686 * RCU read-side critical section in effect, request that the
687 * next rcu_read_unlock() record the quiescent state up the
688 * ->expmask fields in the rcu_node tree. Otherwise, immediately
689 * report the quiescent state.
690 */
691static void sync_rcu_exp_handler(void *info)
692{
693 struct rcu_data *rdp;
694 struct rcu_state *rsp = info;
695 struct task_struct *t = current;
696
697 /*
698 * Within an RCU read-side critical section, request that the next
699 * rcu_read_unlock() report. Unless this RCU read-side critical
700 * section has already blocked, in which case it is already set
701 * up for the expedited grace period to wait on it.
702 */
703 if (t->rcu_read_lock_nesting > 0 &&
704 !t->rcu_read_unlock_special.b.blocked) {
705 t->rcu_read_unlock_special.b.exp_need_qs = true;
706 return;
707 }
708
709 /*
710 * We are either exiting an RCU read-side critical section (negative
711 * values of t->rcu_read_lock_nesting) or are not in one at all
712 * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU
713 * read-side critical section that blocked before this expedited
714 * grace period started. Either way, we can immediately report
715 * the quiescent state.
716 */
717 rdp = this_cpu_ptr(rsp->rda);
718 rcu_report_exp_rdp(rsp, rdp, true);
719}
720
721/**
722 * synchronize_rcu_expedited - Brute-force RCU grace period
723 *
724 * Wait for an RCU-preempt grace period, but expedite it. The basic
725 * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler
726 * checks whether the CPU is in an RCU-preempt critical section, and
727 * if so, it sets a flag that causes the outermost rcu_read_unlock()
728 * to report the quiescent state. On the other hand, if the CPU is
729 * not in an RCU read-side critical section, the IPI handler reports
730 * the quiescent state immediately.
731 *
 732 * Although this is a great improvement over previous expedited
733 * implementations, it is still unfriendly to real-time workloads, so is
734 * thus not recommended for any sort of common-case code. In fact, if
735 * you are using synchronize_rcu_expedited() in a loop, please restructure
 736 * your code to batch your updates, and then use a single synchronize_rcu()
737 * instead.
738 */
739void synchronize_rcu_expedited(void)
740{
741 struct rcu_state *rsp = rcu_state_p;
742 unsigned long s;
743
744 /* If expedited grace periods are prohibited, fall back to normal. */
745 if (rcu_gp_is_normal()) {
746 wait_rcu_gp(call_rcu);
747 return;
748 }
749
750 s = rcu_exp_gp_seq_snap(rsp);
751 if (exp_funnel_lock(rsp, s))
752 return; /* Someone else did our work for us. */
753
754 /* Initialize the rcu_node tree in preparation for the wait. */
755 sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler);
756
757 /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */
758 rcu_exp_wait_wake(rsp, s);
759}
760EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
761
762/** 682/**
763 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. 683 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
764 * 684 *
@@ -883,16 +803,6 @@ static void rcu_preempt_check_callbacks(void)
883} 803}
884 804
885/* 805/*
886 * Wait for an rcu-preempt grace period, but make it happen quickly.
887 * But because preemptible RCU does not exist, map to rcu-sched.
888 */
889void synchronize_rcu_expedited(void)
890{
891 synchronize_sched_expedited();
892}
893EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
894
895/*
896 * Because preemptible RCU does not exist, rcu_barrier() is just 806 * Because preemptible RCU does not exist, rcu_barrier() is just
897 * another name for rcu_barrier_sched(). 807 * another name for rcu_barrier_sched().
898 */ 808 */
@@ -1254,8 +1164,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1254 return; 1164 return;
1255 if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) 1165 if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
1256 return; 1166 return;
1257 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) 1167 for_each_leaf_node_possible_cpu(rnp, cpu)
1258 if ((mask & 0x1) && cpu != outgoingcpu) 1168 if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
1169 cpu != outgoingcpu)
1259 cpumask_set_cpu(cpu, cm); 1170 cpumask_set_cpu(cpu, cm);
1260 if (cpumask_weight(cm) == 0) 1171 if (cpumask_weight(cm) == 0)
1261 cpumask_setall(cm); 1172 cpumask_setall(cm);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 3e888cd5a594..f0d8322bc3ec 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -528,6 +528,7 @@ static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10;
528module_param(rcu_task_stall_timeout, int, 0644); 528module_param(rcu_task_stall_timeout, int, 0644);
529 529
530static void rcu_spawn_tasks_kthread(void); 530static void rcu_spawn_tasks_kthread(void);
531static struct task_struct *rcu_tasks_kthread_ptr;
531 532
532/* 533/*
533 * Post an RCU-tasks callback. First call must be from process context 534 * Post an RCU-tasks callback. First call must be from process context
@@ -537,6 +538,7 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
537{ 538{
538 unsigned long flags; 539 unsigned long flags;
539 bool needwake; 540 bool needwake;
541 bool havetask = READ_ONCE(rcu_tasks_kthread_ptr);
540 542
541 rhp->next = NULL; 543 rhp->next = NULL;
542 rhp->func = func; 544 rhp->func = func;
@@ -545,7 +547,9 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
545 *rcu_tasks_cbs_tail = rhp; 547 *rcu_tasks_cbs_tail = rhp;
546 rcu_tasks_cbs_tail = &rhp->next; 548 rcu_tasks_cbs_tail = &rhp->next;
547 raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); 549 raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
548 if (needwake) { 550 /* We can't create the thread unless interrupts are enabled. */
551 if ((needwake && havetask) ||
552 (!havetask && !irqs_disabled_flags(flags))) {
549 rcu_spawn_tasks_kthread(); 553 rcu_spawn_tasks_kthread();
550 wake_up(&rcu_tasks_cbs_wq); 554 wake_up(&rcu_tasks_cbs_wq);
551 } 555 }
@@ -790,7 +794,6 @@ static int __noreturn rcu_tasks_kthread(void *arg)
790static void rcu_spawn_tasks_kthread(void) 794static void rcu_spawn_tasks_kthread(void)
791{ 795{
792 static DEFINE_MUTEX(rcu_tasks_kthread_mutex); 796 static DEFINE_MUTEX(rcu_tasks_kthread_mutex);
793 static struct task_struct *rcu_tasks_kthread_ptr;
794 struct task_struct *t; 797 struct task_struct *t;
795 798
796 if (READ_ONCE(rcu_tasks_kthread_ptr)) { 799 if (READ_ONCE(rcu_tasks_kthread_ptr)) {
diff --git a/kernel/relay.c b/kernel/relay.c
index 074994bcfa9b..04d7cf3ef8cf 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -614,6 +614,7 @@ free_bufs:
614 614
615 kref_put(&chan->kref, relay_destroy_channel); 615 kref_put(&chan->kref, relay_destroy_channel);
616 mutex_unlock(&relay_channels_mutex); 616 mutex_unlock(&relay_channels_mutex);
617 kfree(chan);
617 return NULL; 618 return NULL;
618} 619}
619EXPORT_SYMBOL_GPL(relay_open); 620EXPORT_SYMBOL_GPL(relay_open);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7f2cae4620c7..5c883fe8e440 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1536,7 +1536,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
1536 for (;;) { 1536 for (;;) {
1537 /* Any allowed, online CPU? */ 1537 /* Any allowed, online CPU? */
1538 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { 1538 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
1539 if (!cpu_active(dest_cpu)) 1539 if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu))
1540 continue;
1541 if (!cpu_online(dest_cpu))
1540 continue; 1542 continue;
1541 goto out; 1543 goto out;
1542 } 1544 }
@@ -1935,7 +1937,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
1935 * chain to provide order. Instead we do: 1937 * chain to provide order. Instead we do:
1936 * 1938 *
1937 * 1) smp_store_release(X->on_cpu, 0) 1939 * 1) smp_store_release(X->on_cpu, 0)
1938 * 2) smp_cond_acquire(!X->on_cpu) 1940 * 2) smp_cond_load_acquire(!X->on_cpu)
1939 * 1941 *
1940 * Example: 1942 * Example:
1941 * 1943 *
@@ -1946,7 +1948,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
1946 * sched-out X 1948 * sched-out X
1947 * smp_store_release(X->on_cpu, 0); 1949 * smp_store_release(X->on_cpu, 0);
1948 * 1950 *
1949 * smp_cond_acquire(!X->on_cpu); 1951 * smp_cond_load_acquire(&X->on_cpu, !VAL);
1950 * X->state = WAKING 1952 * X->state = WAKING
1951 * set_task_cpu(X,2) 1953 * set_task_cpu(X,2)
1952 * 1954 *
@@ -1972,7 +1974,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
1972 * This means that any means of doing remote wakeups must order the CPU doing 1974 * This means that any means of doing remote wakeups must order the CPU doing
1973 * the wakeup against the CPU the task is going to end up running on. This, 1975 * the wakeup against the CPU the task is going to end up running on. This,
1974 * however, is already required for the regular Program-Order guarantee above, 1976 * however, is already required for the regular Program-Order guarantee above,
1975 * since the waking CPU is the one issuing the ACQUIRE (smp_cond_acquire). 1977 * since the waking CPU is the one issuing the ACQUIRE (smp_cond_load_acquire).
1976 * 1978 *
1977 */ 1979 */
1978 1980
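The ordering argument above maps directly onto C11 release/acquire semantics. This is a userspace analogue rather than kernel code: on_cpu and task_state stand in for p->on_cpu and the task's scheduler state, and the bare busy-wait replaces smp_cond_load_acquire()'s arch-tuned spin.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static _Atomic int on_cpu = 1;
static int task_state;		/* plain data handed over by the release/acquire pair */

static void *sched_out(void *arg)
{
	(void)arg;
	task_state = 42;	/* last write done while "running" on the old CPU */
	atomic_store_explicit(&on_cpu, 0, memory_order_release);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, sched_out, NULL);

	/* smp_cond_load_acquire(&p->on_cpu, !VAL): spin until zero, then acquire. */
	while (atomic_load_explicit(&on_cpu, memory_order_acquire))
		;
	printf("task_state=%d\n", task_state);	/* guaranteed to observe 42 */

	pthread_join(t, NULL);
	return 0;
}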
@@ -2045,7 +2047,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2045 * This ensures that tasks getting woken will be fully ordered against 2047 * This ensures that tasks getting woken will be fully ordered against
2046 * their previous state and preserve Program Order. 2048 * their previous state and preserve Program Order.
2047 */ 2049 */
2048 smp_cond_acquire(!p->on_cpu); 2050 smp_cond_load_acquire(&p->on_cpu, !VAL);
2049 2051
2050 p->sched_contributes_to_load = !!task_contributes_to_load(p); 2052 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2051 p->state = TASK_WAKING; 2053 p->state = TASK_WAKING;
@@ -2253,9 +2255,11 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
2253#endif 2255#endif
2254#endif 2256#endif
2255 2257
2258#ifdef CONFIG_SCHEDSTATS
2259
2256DEFINE_STATIC_KEY_FALSE(sched_schedstats); 2260DEFINE_STATIC_KEY_FALSE(sched_schedstats);
2261static bool __initdata __sched_schedstats = false;
2257 2262
2258#ifdef CONFIG_SCHEDSTATS
2259static void set_schedstats(bool enabled) 2263static void set_schedstats(bool enabled)
2260{ 2264{
2261 if (enabled) 2265 if (enabled)
@@ -2278,11 +2282,16 @@ static int __init setup_schedstats(char *str)
2278 if (!str) 2282 if (!str)
2279 goto out; 2283 goto out;
2280 2284
2285 /*
2286 * This code is called before jump labels have been set up, so we can't
2287 * change the static branch directly just yet. Instead set a temporary
2288 * variable so init_schedstats() can do it later.
2289 */
2281 if (!strcmp(str, "enable")) { 2290 if (!strcmp(str, "enable")) {
2282 set_schedstats(true); 2291 __sched_schedstats = true;
2283 ret = 1; 2292 ret = 1;
2284 } else if (!strcmp(str, "disable")) { 2293 } else if (!strcmp(str, "disable")) {
2285 set_schedstats(false); 2294 __sched_schedstats = false;
2286 ret = 1; 2295 ret = 1;
2287 } 2296 }
2288out: 2297out:
@@ -2293,6 +2302,11 @@ out:
2293} 2302}
2294__setup("schedstats=", setup_schedstats); 2303__setup("schedstats=", setup_schedstats);
2295 2304
2305static void __init init_schedstats(void)
2306{
2307 set_schedstats(__sched_schedstats);
2308}
2309
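The comment in setup_schedstats() describes a common boot-time pattern: the early parameter parser runs before the machinery it wants to poke (here, jump labels) is ready, so it only records the request and a later init call applies it. A trivial userspace model of the same shape, with all names invented:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool schedstats_enabled;		/* stands in for the static key */
static bool requested;			/* stands in for __sched_schedstats */

static void parse_early_param(const char *str)
{
	if (!strcmp(str, "enable"))
		requested = true;	/* too early to flip the real switch */
	else if (!strcmp(str, "disable"))
		requested = false;
}

static void init_schedstats(void)
{
	schedstats_enabled = requested;	/* infrastructure is up; apply it now */
}

int main(void)
{
	parse_early_param("enable");	/* boot command line, very early */
	init_schedstats();		/* later, from sched_init() */
	printf("schedstats: %s\n", schedstats_enabled ? "on" : "off");
	return 0;
}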
2296#ifdef CONFIG_PROC_SYSCTL 2310#ifdef CONFIG_PROC_SYSCTL
2297int sysctl_schedstats(struct ctl_table *table, int write, 2311int sysctl_schedstats(struct ctl_table *table, int write,
2298 void __user *buffer, size_t *lenp, loff_t *ppos) 2312 void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -2313,8 +2327,10 @@ int sysctl_schedstats(struct ctl_table *table, int write,
2313 set_schedstats(state); 2327 set_schedstats(state);
2314 return err; 2328 return err;
2315} 2329}
2316#endif 2330#endif /* CONFIG_PROC_SYSCTL */
2317#endif 2331#else /* !CONFIG_SCHEDSTATS */
2332static inline void init_schedstats(void) {}
2333#endif /* CONFIG_SCHEDSTATS */
2318 2334
2319/* 2335/*
2320 * fork()/clone()-time setup: 2336 * fork()/clone()-time setup:
@@ -2326,11 +2342,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
2326 2342
2327 __sched_fork(clone_flags, p); 2343 __sched_fork(clone_flags, p);
2328 /* 2344 /*
2329 * We mark the process as running here. This guarantees that 2345 * We mark the process as NEW here. This guarantees that
2330 * nobody will actually run it, and a signal or other external 2346 * nobody will actually run it, and a signal or other external
2331 * event cannot wake it up and insert it on the runqueue either. 2347 * event cannot wake it up and insert it on the runqueue either.
2332 */ 2348 */
2333 p->state = TASK_RUNNING; 2349 p->state = TASK_NEW;
2334 2350
2335 /* 2351 /*
2336 * Make sure we do not leak PI boosting priority to the child. 2352 * Make sure we do not leak PI boosting priority to the child.
@@ -2367,8 +2383,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
2367 p->sched_class = &fair_sched_class; 2383 p->sched_class = &fair_sched_class;
2368 } 2384 }
2369 2385
2370 if (p->sched_class->task_fork) 2386 init_entity_runnable_average(&p->se);
2371 p->sched_class->task_fork(p);
2372 2387
2373 /* 2388 /*
2374 * The child is not yet in the pid-hash so no cgroup attach races, 2389 * The child is not yet in the pid-hash so no cgroup attach races,
@@ -2378,7 +2393,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
2378 * Silence PROVE_RCU. 2393 * Silence PROVE_RCU.
2379 */ 2394 */
2380 raw_spin_lock_irqsave(&p->pi_lock, flags); 2395 raw_spin_lock_irqsave(&p->pi_lock, flags);
2381 set_task_cpu(p, cpu); 2396 /*
2397 * We're setting the cpu for the first time, we don't migrate,
2398 * so use __set_task_cpu().
2399 */
2400 __set_task_cpu(p, cpu);
2401 if (p->sched_class->task_fork)
2402 p->sched_class->task_fork(p);
2382 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2403 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2383 2404
2384#ifdef CONFIG_SCHED_INFO 2405#ifdef CONFIG_SCHED_INFO
@@ -2510,21 +2531,22 @@ void wake_up_new_task(struct task_struct *p)
2510 struct rq_flags rf; 2531 struct rq_flags rf;
2511 struct rq *rq; 2532 struct rq *rq;
2512 2533
2513 /* Initialize new task's runnable average */
2514 init_entity_runnable_average(&p->se);
2515 raw_spin_lock_irqsave(&p->pi_lock, rf.flags); 2534 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
2535 p->state = TASK_RUNNING;
2516#ifdef CONFIG_SMP 2536#ifdef CONFIG_SMP
2517 /* 2537 /*
2518 * Fork balancing, do it here and not earlier because: 2538 * Fork balancing, do it here and not earlier because:
2519 * - cpus_allowed can change in the fork path 2539 * - cpus_allowed can change in the fork path
2520 * - any previously selected cpu might disappear through hotplug 2540 * - any previously selected cpu might disappear through hotplug
2541 *
2542 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
2543 * as we're not fully set-up yet.
2521 */ 2544 */
2522 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); 2545 __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
2523#endif 2546#endif
2524 /* Post initialize new task's util average when its cfs_rq is set */ 2547 rq = __task_rq_lock(p, &rf);
2525 post_init_entity_util_avg(&p->se); 2548 post_init_entity_util_avg(&p->se);
2526 2549
2527 rq = __task_rq_lock(p, &rf);
2528 activate_task(rq, p, 0); 2550 activate_task(rq, p, 0);
2529 p->on_rq = TASK_ON_RQ_QUEUED; 2551 p->on_rq = TASK_ON_RQ_QUEUED;
2530 trace_sched_wakeup_new(p); 2552 trace_sched_wakeup_new(p);
@@ -3146,6 +3168,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
3146 pr_cont("\n"); 3168 pr_cont("\n");
3147 } 3169 }
3148#endif 3170#endif
3171 if (panic_on_warn)
3172 panic("scheduling while atomic\n");
3173
3149 dump_stack(); 3174 dump_stack();
3150 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 3175 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
3151} 3176}
@@ -3156,7 +3181,8 @@ static noinline void __schedule_bug(struct task_struct *prev)
3156static inline void schedule_debug(struct task_struct *prev) 3181static inline void schedule_debug(struct task_struct *prev)
3157{ 3182{
3158#ifdef CONFIG_SCHED_STACK_END_CHECK 3183#ifdef CONFIG_SCHED_STACK_END_CHECK
3159 BUG_ON(task_stack_end_corrupted(prev)); 3184 if (task_stack_end_corrupted(prev))
3185 panic("corrupted stack end detected inside scheduler\n");
3160#endif 3186#endif
3161 3187
3162 if (unlikely(in_atomic_preempt_off())) { 3188 if (unlikely(in_atomic_preempt_off())) {
@@ -4736,7 +4762,8 @@ out_unlock:
4736 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4762 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4737 * @user_mask_ptr: user-space pointer to hold the current cpu mask 4763 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4738 * 4764 *
4739 * Return: 0 on success. An error code otherwise. 4765 * Return: size of CPU mask copied to user_mask_ptr on success. An
4766 * error code otherwise.
4740 */ 4767 */
4741SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 4768SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4742 unsigned long __user *, user_mask_ptr) 4769 unsigned long __user *, user_mask_ptr)
@@ -5133,14 +5160,16 @@ void show_state_filter(unsigned long state_filter)
5133 /* 5160 /*
5134 * reset the NMI-timeout, listing all files on a slow 5161 * reset the NMI-timeout, listing all files on a slow
5135 * console might take a lot of time: 5162 * console might take a lot of time:
5163 * Also, reset softlockup watchdogs on all CPUs, because
5164 * another CPU might be blocked waiting for us to process
5165 * an IPI.
5136 */ 5166 */
5137 touch_nmi_watchdog(); 5167 touch_nmi_watchdog();
5168 touch_all_softlockup_watchdogs();
5138 if (!state_filter || (p->state & state_filter)) 5169 if (!state_filter || (p->state & state_filter))
5139 sched_show_task(p); 5170 sched_show_task(p);
5140 } 5171 }
5141 5172
5142 touch_all_softlockup_watchdogs();
5143
5144#ifdef CONFIG_SCHED_DEBUG 5173#ifdef CONFIG_SCHED_DEBUG
5145 if (!state_filter) 5174 if (!state_filter)
5146 sysrq_sched_debug_show(); 5175 sysrq_sched_debug_show();
@@ -5376,13 +5405,15 @@ void idle_task_exit(void)
5376/* 5405/*
5377 * Since this CPU is going 'away' for a while, fold any nr_active delta 5406 * Since this CPU is going 'away' for a while, fold any nr_active delta
5378 * we might have. Assumes we're called after migrate_tasks() so that the 5407 * we might have. Assumes we're called after migrate_tasks() so that the
5379 * nr_active count is stable. 5408 * nr_active count is stable. We need to take the teardown thread which
5409 * is calling this into account, so we hand in adjust = 1 to the load
5410 * calculation.
5380 * 5411 *
5381 * Also see the comment "Global load-average calculations". 5412 * Also see the comment "Global load-average calculations".
5382 */ 5413 */
5383static void calc_load_migrate(struct rq *rq) 5414static void calc_load_migrate(struct rq *rq)
5384{ 5415{
5385 long delta = calc_load_fold_active(rq); 5416 long delta = calc_load_fold_active(rq, 1);
5386 if (delta) 5417 if (delta)
5387 atomic_long_add(delta, &calc_load_tasks); 5418 atomic_long_add(delta, &calc_load_tasks);
5388} 5419}
@@ -7213,7 +7244,6 @@ static void sched_rq_cpu_starting(unsigned int cpu)
7213 struct rq *rq = cpu_rq(cpu); 7244 struct rq *rq = cpu_rq(cpu);
7214 7245
7215 rq->calc_load_update = calc_load_update; 7246 rq->calc_load_update = calc_load_update;
7216 account_reset_rq(rq);
7217 update_max_interval(); 7247 update_max_interval();
7218} 7248}
7219 7249
@@ -7487,6 +7517,8 @@ void __init sched_init(void)
7487#endif 7517#endif
7488 init_sched_fair_class(); 7518 init_sched_fair_class();
7489 7519
7520 init_schedstats();
7521
7490 scheduler_running = 1; 7522 scheduler_running = 1;
7491} 7523}
7492 7524
@@ -7691,6 +7723,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
7691 INIT_LIST_HEAD(&tg->children); 7723 INIT_LIST_HEAD(&tg->children);
7692 list_add_rcu(&tg->siblings, &parent->children); 7724 list_add_rcu(&tg->siblings, &parent->children);
7693 spin_unlock_irqrestore(&task_group_lock, flags); 7725 spin_unlock_irqrestore(&task_group_lock, flags);
7726
7727 online_fair_sched_group(tg);
7694} 7728}
7695 7729
7696/* rcu callback to free various structures associated with a task group */ 7730/* rcu callback to free various structures associated with a task group */
@@ -7719,27 +7753,9 @@ void sched_offline_group(struct task_group *tg)
7719 spin_unlock_irqrestore(&task_group_lock, flags); 7753 spin_unlock_irqrestore(&task_group_lock, flags);
7720} 7754}
7721 7755
7722/* change task's runqueue when it moves between groups. 7756static void sched_change_group(struct task_struct *tsk, int type)
7723 * The caller of this function should have put the task in its new group
7724 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
7725 * reflect its new group.
7726 */
7727void sched_move_task(struct task_struct *tsk)
7728{ 7757{
7729 struct task_group *tg; 7758 struct task_group *tg;
7730 int queued, running;
7731 struct rq_flags rf;
7732 struct rq *rq;
7733
7734 rq = task_rq_lock(tsk, &rf);
7735
7736 running = task_current(rq, tsk);
7737 queued = task_on_rq_queued(tsk);
7738
7739 if (queued)
7740 dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
7741 if (unlikely(running))
7742 put_prev_task(rq, tsk);
7743 7759
7744 /* 7760 /*
7745 * All callers are synchronized by task_rq_lock(); we do not use RCU 7761 * All callers are synchronized by task_rq_lock(); we do not use RCU
@@ -7752,11 +7768,37 @@ void sched_move_task(struct task_struct *tsk)
7752 tsk->sched_task_group = tg; 7768 tsk->sched_task_group = tg;
7753 7769
7754#ifdef CONFIG_FAIR_GROUP_SCHED 7770#ifdef CONFIG_FAIR_GROUP_SCHED
7755 if (tsk->sched_class->task_move_group) 7771 if (tsk->sched_class->task_change_group)
7756 tsk->sched_class->task_move_group(tsk); 7772 tsk->sched_class->task_change_group(tsk, type);
7757 else 7773 else
7758#endif 7774#endif
7759 set_task_rq(tsk, task_cpu(tsk)); 7775 set_task_rq(tsk, task_cpu(tsk));
7776}
7777
7778/*
7779 * Change task's runqueue when it moves between groups.
7780 *
7781 * The caller of this function should have put the task in its new group by
7782 * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
7783 * its new group.
7784 */
7785void sched_move_task(struct task_struct *tsk)
7786{
7787 int queued, running;
7788 struct rq_flags rf;
7789 struct rq *rq;
7790
7791 rq = task_rq_lock(tsk, &rf);
7792
7793 running = task_current(rq, tsk);
7794 queued = task_on_rq_queued(tsk);
7795
7796 if (queued)
7797 dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
7798 if (unlikely(running))
7799 put_prev_task(rq, tsk);
7800
7801 sched_change_group(tsk, TASK_MOVE_GROUP);
7760 7802
7761 if (unlikely(running)) 7803 if (unlikely(running))
7762 tsk->sched_class->set_curr_task(rq); 7804 tsk->sched_class->set_curr_task(rq);
@@ -8184,15 +8226,27 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
8184 sched_free_group(tg); 8226 sched_free_group(tg);
8185} 8227}
8186 8228
8229/*
8230 * This is called before wake_up_new_task(), therefore we really only
8231 * have to set its group bits, all the other stuff does not apply.
8232 */
8187static void cpu_cgroup_fork(struct task_struct *task) 8233static void cpu_cgroup_fork(struct task_struct *task)
8188{ 8234{
8189 sched_move_task(task); 8235 struct rq_flags rf;
8236 struct rq *rq;
8237
8238 rq = task_rq_lock(task, &rf);
8239
8240 sched_change_group(task, TASK_SET_GROUP);
8241
8242 task_rq_unlock(rq, task, &rf);
8190} 8243}
8191 8244
8192static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) 8245static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
8193{ 8246{
8194 struct task_struct *task; 8247 struct task_struct *task;
8195 struct cgroup_subsys_state *css; 8248 struct cgroup_subsys_state *css;
8249 int ret = 0;
8196 8250
8197 cgroup_taskset_for_each(task, css, tset) { 8251 cgroup_taskset_for_each(task, css, tset) {
8198#ifdef CONFIG_RT_GROUP_SCHED 8252#ifdef CONFIG_RT_GROUP_SCHED
@@ -8203,8 +8257,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
8203 if (task->sched_class != &fair_sched_class) 8257 if (task->sched_class != &fair_sched_class)
8204 return -EINVAL; 8258 return -EINVAL;
8205#endif 8259#endif
8260 /*
8261 * Serialize against wake_up_new_task() such that if its
8262 * running, we're sure to observe its full state.
8263 */
8264 raw_spin_lock_irq(&task->pi_lock);
8265 /*
8266 * Avoid calling sched_move_task() before wake_up_new_task()
8267 * has happened. This would lead to problems with PELT, due to
8268 * move wanting to detach+attach while we're not attached yet.
8269 */
8270 if (task->state == TASK_NEW)
8271 ret = -EINVAL;
8272 raw_spin_unlock_irq(&task->pi_lock);
8273
8274 if (ret)
8275 break;
8206 } 8276 }
8207 return 0; 8277 return ret;
8208} 8278}
8209 8279
8210static void cpu_cgroup_attach(struct cgroup_taskset *tset) 8280static void cpu_cgroup_attach(struct cgroup_taskset *tset)
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 41f85c4d0938..bc0b309c3f19 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -25,15 +25,13 @@ enum cpuacct_stat_index {
25 CPUACCT_STAT_NSTATS, 25 CPUACCT_STAT_NSTATS,
26}; 26};
27 27
28enum cpuacct_usage_index { 28static const char * const cpuacct_stat_desc[] = {
29 CPUACCT_USAGE_USER, /* ... user mode */ 29 [CPUACCT_STAT_USER] = "user",
30 CPUACCT_USAGE_SYSTEM, /* ... kernel mode */ 30 [CPUACCT_STAT_SYSTEM] = "system",
31
32 CPUACCT_USAGE_NRUSAGE,
33}; 31};
34 32
35struct cpuacct_usage { 33struct cpuacct_usage {
36 u64 usages[CPUACCT_USAGE_NRUSAGE]; 34 u64 usages[CPUACCT_STAT_NSTATS];
37}; 35};
38 36
39/* track cpu usage of a group of tasks and its child groups */ 37/* track cpu usage of a group of tasks and its child groups */
@@ -108,16 +106,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
108} 106}
109 107
110static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, 108static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
111 enum cpuacct_usage_index index) 109 enum cpuacct_stat_index index)
112{ 110{
113 struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 111 struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
114 u64 data; 112 u64 data;
115 113
116 /* 114 /*
117 * We allow index == CPUACCT_USAGE_NRUSAGE here to read 115 * We allow index == CPUACCT_STAT_NSTATS here to read
118 * the sum of usages. 116 * the sum of usages.
119 */ 117 */
120 BUG_ON(index > CPUACCT_USAGE_NRUSAGE); 118 BUG_ON(index > CPUACCT_STAT_NSTATS);
121 119
122#ifndef CONFIG_64BIT 120#ifndef CONFIG_64BIT
123 /* 121 /*
@@ -126,11 +124,11 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
126 raw_spin_lock_irq(&cpu_rq(cpu)->lock); 124 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
127#endif 125#endif
128 126
129 if (index == CPUACCT_USAGE_NRUSAGE) { 127 if (index == CPUACCT_STAT_NSTATS) {
130 int i = 0; 128 int i = 0;
131 129
132 data = 0; 130 data = 0;
133 for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) 131 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
134 data += cpuusage->usages[i]; 132 data += cpuusage->usages[i];
135 } else { 133 } else {
136 data = cpuusage->usages[index]; 134 data = cpuusage->usages[index];
@@ -155,7 +153,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
155 raw_spin_lock_irq(&cpu_rq(cpu)->lock); 153 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
156#endif 154#endif
157 155
158 for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) 156 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
159 cpuusage->usages[i] = val; 157 cpuusage->usages[i] = val;
160 158
161#ifndef CONFIG_64BIT 159#ifndef CONFIG_64BIT
@@ -165,7 +163,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
165 163
166/* return total cpu usage (in nanoseconds) of a group */ 164/* return total cpu usage (in nanoseconds) of a group */
167static u64 __cpuusage_read(struct cgroup_subsys_state *css, 165static u64 __cpuusage_read(struct cgroup_subsys_state *css,
168 enum cpuacct_usage_index index) 166 enum cpuacct_stat_index index)
169{ 167{
170 struct cpuacct *ca = css_ca(css); 168 struct cpuacct *ca = css_ca(css);
171 u64 totalcpuusage = 0; 169 u64 totalcpuusage = 0;
@@ -180,18 +178,18 @@ static u64 __cpuusage_read(struct cgroup_subsys_state *css,
180static u64 cpuusage_user_read(struct cgroup_subsys_state *css, 178static u64 cpuusage_user_read(struct cgroup_subsys_state *css,
181 struct cftype *cft) 179 struct cftype *cft)
182{ 180{
183 return __cpuusage_read(css, CPUACCT_USAGE_USER); 181 return __cpuusage_read(css, CPUACCT_STAT_USER);
184} 182}
185 183
186static u64 cpuusage_sys_read(struct cgroup_subsys_state *css, 184static u64 cpuusage_sys_read(struct cgroup_subsys_state *css,
187 struct cftype *cft) 185 struct cftype *cft)
188{ 186{
189 return __cpuusage_read(css, CPUACCT_USAGE_SYSTEM); 187 return __cpuusage_read(css, CPUACCT_STAT_SYSTEM);
190} 188}
191 189
192static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) 190static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
193{ 191{
194 return __cpuusage_read(css, CPUACCT_USAGE_NRUSAGE); 192 return __cpuusage_read(css, CPUACCT_STAT_NSTATS);
195} 193}
196 194
197static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, 195static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
@@ -213,7 +211,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
213} 211}
214 212
215static int __cpuacct_percpu_seq_show(struct seq_file *m, 213static int __cpuacct_percpu_seq_show(struct seq_file *m,
216 enum cpuacct_usage_index index) 214 enum cpuacct_stat_index index)
217{ 215{
218 struct cpuacct *ca = css_ca(seq_css(m)); 216 struct cpuacct *ca = css_ca(seq_css(m));
219 u64 percpu; 217 u64 percpu;
@@ -229,48 +227,78 @@ static int __cpuacct_percpu_seq_show(struct seq_file *m,
229 227
230static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V) 228static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V)
231{ 229{
232 return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_USER); 230 return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_USER);
233} 231}
234 232
235static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V) 233static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V)
236{ 234{
237 return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_SYSTEM); 235 return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_SYSTEM);
238} 236}
239 237
240static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) 238static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
241{ 239{
242 return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE); 240 return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_NSTATS);
243} 241}
244 242
245static const char * const cpuacct_stat_desc[] = { 243static int cpuacct_all_seq_show(struct seq_file *m, void *V)
246 [CPUACCT_STAT_USER] = "user", 244{
247 [CPUACCT_STAT_SYSTEM] = "system", 245 struct cpuacct *ca = css_ca(seq_css(m));
248}; 246 int index;
247 int cpu;
248
249 seq_puts(m, "cpu");
250 for (index = 0; index < CPUACCT_STAT_NSTATS; index++)
251 seq_printf(m, " %s", cpuacct_stat_desc[index]);
252 seq_puts(m, "\n");
253
254 for_each_possible_cpu(cpu) {
255 struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
256
257 seq_printf(m, "%d", cpu);
258
259 for (index = 0; index < CPUACCT_STAT_NSTATS; index++) {
260#ifndef CONFIG_64BIT
261 /*
262 * Take rq->lock to make 64-bit read safe on 32-bit
263 * platforms.
264 */
265 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
266#endif
267
268 seq_printf(m, " %llu", cpuusage->usages[index]);
269
270#ifndef CONFIG_64BIT
271 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
272#endif
273 }
274 seq_puts(m, "\n");
275 }
276 return 0;
277}
249 278
250static int cpuacct_stats_show(struct seq_file *sf, void *v) 279static int cpuacct_stats_show(struct seq_file *sf, void *v)
251{ 280{
252 struct cpuacct *ca = css_ca(seq_css(sf)); 281 struct cpuacct *ca = css_ca(seq_css(sf));
282 s64 val[CPUACCT_STAT_NSTATS];
253 int cpu; 283 int cpu;
254 s64 val = 0; 284 int stat;
255 285
286 memset(val, 0, sizeof(val));
256 for_each_possible_cpu(cpu) { 287 for_each_possible_cpu(cpu) {
257 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); 288 u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
258 val += kcpustat->cpustat[CPUTIME_USER];
259 val += kcpustat->cpustat[CPUTIME_NICE];
260 }
261 val = cputime64_to_clock_t(val);
262 seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
263 289
264 val = 0; 290 val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER];
265 for_each_possible_cpu(cpu) { 291 val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE];
266 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); 292 val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
267 val += kcpustat->cpustat[CPUTIME_SYSTEM]; 293 val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
268 val += kcpustat->cpustat[CPUTIME_IRQ]; 294 val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
269 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
270 } 295 }
271 296
272 val = cputime64_to_clock_t(val); 297 for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
273 seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); 298 seq_printf(sf, "%s %lld\n",
299 cpuacct_stat_desc[stat],
300 cputime64_to_clock_t(val[stat]));
301 }
274 302
275 return 0; 303 return 0;
276} 304}
@@ -302,6 +330,10 @@ static struct cftype files[] = {
302 .seq_show = cpuacct_percpu_sys_seq_show, 330 .seq_show = cpuacct_percpu_sys_seq_show,
303 }, 331 },
304 { 332 {
333 .name = "usage_all",
334 .seq_show = cpuacct_all_seq_show,
335 },
336 {
305 .name = "stat", 337 .name = "stat",
306 .seq_show = cpuacct_stats_show, 338 .seq_show = cpuacct_stats_show,
307 }, 339 },
@@ -316,11 +348,11 @@ static struct cftype files[] = {
316void cpuacct_charge(struct task_struct *tsk, u64 cputime) 348void cpuacct_charge(struct task_struct *tsk, u64 cputime)
317{ 349{
318 struct cpuacct *ca; 350 struct cpuacct *ca;
319 int index = CPUACCT_USAGE_SYSTEM; 351 int index = CPUACCT_STAT_SYSTEM;
320 struct pt_regs *regs = task_pt_regs(tsk); 352 struct pt_regs *regs = task_pt_regs(tsk);
321 353
322 if (regs && user_mode(regs)) 354 if (regs && user_mode(regs))
323 index = CPUACCT_USAGE_USER; 355 index = CPUACCT_STAT_USER;
324 356
325 rcu_read_lock(); 357 rcu_read_lock();
326 358
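The new cpuacct.usage_all file prints a header row built from cpuacct_stat_desc[] followed by one row per possible CPU, each holding that CPU's user and system usage in nanoseconds. An illustrative read (the numbers are invented; only the layout follows the seq_printf() calls above):

cpu user system
0 52418509483 3081503626
1 49372919374 2958210057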
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 14c4aa25cc45..a84641b222c1 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -47,6 +47,8 @@ struct sugov_cpu {
47 struct update_util_data update_util; 47 struct update_util_data update_util;
48 struct sugov_policy *sg_policy; 48 struct sugov_policy *sg_policy;
49 49
50 unsigned int cached_raw_freq;
51
50 /* The fields below are only needed when sharing a policy. */ 52 /* The fields below are only needed when sharing a policy. */
51 unsigned long util; 53 unsigned long util;
52 unsigned long max; 54 unsigned long max;
@@ -106,7 +108,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
106 108
107/** 109/**
108 * get_next_freq - Compute a new frequency for a given cpufreq policy. 110 * get_next_freq - Compute a new frequency for a given cpufreq policy.
109 * @policy: cpufreq policy object to compute the new frequency for. 111 * @sg_cpu: schedutil cpu object to compute the new frequency for.
110 * @util: Current CPU utilization. 112 * @util: Current CPU utilization.
111 * @max: CPU capacity. 113 * @max: CPU capacity.
112 * 114 *
@@ -121,14 +123,25 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
121 * next_freq = C * curr_freq * util_raw / max 123 * next_freq = C * curr_freq * util_raw / max
122 * 124 *
123 * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8. 125 * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
126 *
127 * The lowest driver-supported frequency which is equal or greater than the raw
128 * next_freq (as calculated above) is returned, subject to policy min/max and
129 * cpufreq driver limitations.
124 */ 130 */
125static unsigned int get_next_freq(struct cpufreq_policy *policy, 131static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util,
126 unsigned long util, unsigned long max) 132 unsigned long max)
127{ 133{
134 struct sugov_policy *sg_policy = sg_cpu->sg_policy;
135 struct cpufreq_policy *policy = sg_policy->policy;
128 unsigned int freq = arch_scale_freq_invariant() ? 136 unsigned int freq = arch_scale_freq_invariant() ?
129 policy->cpuinfo.max_freq : policy->cur; 137 policy->cpuinfo.max_freq : policy->cur;
130 138
131 return (freq + (freq >> 2)) * util / max; 139 freq = (freq + (freq >> 2)) * util / max;
140
141 if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
142 return sg_policy->next_freq;
143 sg_cpu->cached_raw_freq = freq;
144 return cpufreq_driver_resolve_freq(policy, freq);
132} 145}
133 146
134static void sugov_update_single(struct update_util_data *hook, u64 time, 147static void sugov_update_single(struct update_util_data *hook, u64 time,
@@ -143,13 +156,14 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
143 return; 156 return;
144 157
145 next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq : 158 next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq :
146 get_next_freq(policy, util, max); 159 get_next_freq(sg_cpu, util, max);
147 sugov_update_commit(sg_policy, time, next_f); 160 sugov_update_commit(sg_policy, time, next_f);
148} 161}
149 162
150static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy, 163static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
151 unsigned long util, unsigned long max) 164 unsigned long util, unsigned long max)
152{ 165{
166 struct sugov_policy *sg_policy = sg_cpu->sg_policy;
153 struct cpufreq_policy *policy = sg_policy->policy; 167 struct cpufreq_policy *policy = sg_policy->policy;
154 unsigned int max_f = policy->cpuinfo.max_freq; 168 unsigned int max_f = policy->cpuinfo.max_freq;
155 u64 last_freq_update_time = sg_policy->last_freq_update_time; 169 u64 last_freq_update_time = sg_policy->last_freq_update_time;
@@ -189,7 +203,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy,
189 } 203 }
190 } 204 }
191 205
192 return get_next_freq(policy, util, max); 206 return get_next_freq(sg_cpu, util, max);
193} 207}
194 208
195static void sugov_update_shared(struct update_util_data *hook, u64 time, 209static void sugov_update_shared(struct update_util_data *hook, u64 time,
@@ -206,7 +220,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
206 sg_cpu->last_update = time; 220 sg_cpu->last_update = time;
207 221
208 if (sugov_should_update_freq(sg_policy, time)) { 222 if (sugov_should_update_freq(sg_policy, time)) {
209 next_f = sugov_next_freq_shared(sg_policy, util, max); 223 next_f = sugov_next_freq_shared(sg_cpu, util, max);
210 sugov_update_commit(sg_policy, time, next_f); 224 sugov_update_commit(sg_policy, time, next_f);
211 } 225 }
212 226
@@ -394,7 +408,7 @@ static int sugov_init(struct cpufreq_policy *policy)
394 return ret; 408 return ret;
395} 409}
396 410
397static int sugov_exit(struct cpufreq_policy *policy) 411static void sugov_exit(struct cpufreq_policy *policy)
398{ 412{
399 struct sugov_policy *sg_policy = policy->governor_data; 413 struct sugov_policy *sg_policy = policy->governor_data;
400 struct sugov_tunables *tunables = sg_policy->tunables; 414 struct sugov_tunables *tunables = sg_policy->tunables;
@@ -412,7 +426,6 @@ static int sugov_exit(struct cpufreq_policy *policy)
412 mutex_unlock(&global_tunables_lock); 426 mutex_unlock(&global_tunables_lock);
413 427
414 sugov_policy_free(sg_policy); 428 sugov_policy_free(sg_policy);
415 return 0;
416} 429}
417 430
418static int sugov_start(struct cpufreq_policy *policy) 431static int sugov_start(struct cpufreq_policy *policy)
@@ -434,6 +447,7 @@ static int sugov_start(struct cpufreq_policy *policy)
434 sg_cpu->util = ULONG_MAX; 447 sg_cpu->util = ULONG_MAX;
435 sg_cpu->max = 0; 448 sg_cpu->max = 0;
436 sg_cpu->last_update = 0; 449 sg_cpu->last_update = 0;
450 sg_cpu->cached_raw_freq = 0;
437 cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, 451 cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
438 sugov_update_shared); 452 sugov_update_shared);
439 } else { 453 } else {
@@ -444,7 +458,7 @@ static int sugov_start(struct cpufreq_policy *policy)
444 return 0; 458 return 0;
445} 459}
446 460
447static int sugov_stop(struct cpufreq_policy *policy) 461static void sugov_stop(struct cpufreq_policy *policy)
448{ 462{
449 struct sugov_policy *sg_policy = policy->governor_data; 463 struct sugov_policy *sg_policy = policy->governor_data;
450 unsigned int cpu; 464 unsigned int cpu;
@@ -456,53 +470,29 @@ static int sugov_stop(struct cpufreq_policy *policy)
456 470
457 irq_work_sync(&sg_policy->irq_work); 471 irq_work_sync(&sg_policy->irq_work);
458 cancel_work_sync(&sg_policy->work); 472 cancel_work_sync(&sg_policy->work);
459 return 0;
460} 473}
461 474
462static int sugov_limits(struct cpufreq_policy *policy) 475static void sugov_limits(struct cpufreq_policy *policy)
463{ 476{
464 struct sugov_policy *sg_policy = policy->governor_data; 477 struct sugov_policy *sg_policy = policy->governor_data;
465 478
466 if (!policy->fast_switch_enabled) { 479 if (!policy->fast_switch_enabled) {
467 mutex_lock(&sg_policy->work_lock); 480 mutex_lock(&sg_policy->work_lock);
468 481 cpufreq_policy_apply_limits(policy);
469 if (policy->max < policy->cur)
470 __cpufreq_driver_target(policy, policy->max,
471 CPUFREQ_RELATION_H);
472 else if (policy->min > policy->cur)
473 __cpufreq_driver_target(policy, policy->min,
474 CPUFREQ_RELATION_L);
475
476 mutex_unlock(&sg_policy->work_lock); 482 mutex_unlock(&sg_policy->work_lock);
477 } 483 }
478 484
479 sg_policy->need_freq_update = true; 485 sg_policy->need_freq_update = true;
480 return 0;
481}
482
483int sugov_governor(struct cpufreq_policy *policy, unsigned int event)
484{
485 if (event == CPUFREQ_GOV_POLICY_INIT) {
486 return sugov_init(policy);
487 } else if (policy->governor_data) {
488 switch (event) {
489 case CPUFREQ_GOV_POLICY_EXIT:
490 return sugov_exit(policy);
491 case CPUFREQ_GOV_START:
492 return sugov_start(policy);
493 case CPUFREQ_GOV_STOP:
494 return sugov_stop(policy);
495 case CPUFREQ_GOV_LIMITS:
496 return sugov_limits(policy);
497 }
498 }
499 return -EINVAL;
500} 486}
501 487
502static struct cpufreq_governor schedutil_gov = { 488static struct cpufreq_governor schedutil_gov = {
503 .name = "schedutil", 489 .name = "schedutil",
504 .governor = sugov_governor,
505 .owner = THIS_MODULE, 490 .owner = THIS_MODULE,
491 .init = sugov_init,
492 .exit = sugov_exit,
493 .start = sugov_start,
494 .stop = sugov_stop,
495 .limits = sugov_limits,
506}; 496};
507 497
508static int __init sugov_module_init(void) 498static int __init sugov_module_init(void)
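To make the reworked get_next_freq() concrete: the raw frequency is still next_freq = 1.25 * freq * util / max, but it is now cached and passed through cpufreq_driver_resolve_freq(), which picks the lowest driver-supported frequency at or above it. A self-contained sketch of that arithmetic (frequencies in kHz; the table and values are invented, and resolve_freq() merely imitates what the cpufreq driver would do):

#include <stdio.h>

/* Hypothetical table of driver-supported frequencies, ascending, in kHz. */
static const unsigned int freq_table[] = { 800000, 1200000, 1600000, 2000000 };
#define NR_FREQS (sizeof(freq_table) / sizeof(freq_table[0]))

/* Imitation of cpufreq_driver_resolve_freq(): lowest entry >= raw. */
static unsigned int resolve_freq(unsigned int raw)
{
	for (size_t i = 0; i < NR_FREQS; i++)
		if (freq_table[i] >= raw)
			return freq_table[i];
	return freq_table[NR_FREQS - 1];
}

int main(void)
{
	unsigned int max_freq = 2000000;	/* used when freq-invariant */
	unsigned long util = 60, max = 100;	/* util / max = 0.6 */

	/* next_freq = 1.25 * freq * util / max, as in the comment above */
	unsigned int raw = (max_freq + (max_freq >> 2)) * util / max;

	printf("raw %u kHz -> resolved %u kHz\n", raw, resolve_freq(raw));
	/* prints: raw 1500000 kHz -> resolved 1600000 kHz */
	return 0;
}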
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 75f98c5498d5..1934f658c036 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -49,15 +49,12 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq);
49 */ 49 */
50void irqtime_account_irq(struct task_struct *curr) 50void irqtime_account_irq(struct task_struct *curr)
51{ 51{
52 unsigned long flags;
53 s64 delta; 52 s64 delta;
54 int cpu; 53 int cpu;
55 54
56 if (!sched_clock_irqtime) 55 if (!sched_clock_irqtime)
57 return; 56 return;
58 57
59 local_irq_save(flags);
60
61 cpu = smp_processor_id(); 58 cpu = smp_processor_id();
62 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); 59 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
63 __this_cpu_add(irq_start_time, delta); 60 __this_cpu_add(irq_start_time, delta);
@@ -75,44 +72,53 @@ void irqtime_account_irq(struct task_struct *curr)
75 __this_cpu_add(cpu_softirq_time, delta); 72 __this_cpu_add(cpu_softirq_time, delta);
76 73
77 irq_time_write_end(); 74 irq_time_write_end();
78 local_irq_restore(flags);
79} 75}
80EXPORT_SYMBOL_GPL(irqtime_account_irq); 76EXPORT_SYMBOL_GPL(irqtime_account_irq);
81 77
82static int irqtime_account_hi_update(void) 78static cputime_t irqtime_account_hi_update(cputime_t maxtime)
83{ 79{
84 u64 *cpustat = kcpustat_this_cpu->cpustat; 80 u64 *cpustat = kcpustat_this_cpu->cpustat;
85 unsigned long flags; 81 unsigned long flags;
86 u64 latest_ns; 82 cputime_t irq_cputime;
87 int ret = 0;
88 83
89 local_irq_save(flags); 84 local_irq_save(flags);
90 latest_ns = this_cpu_read(cpu_hardirq_time); 85 irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) -
91 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) 86 cpustat[CPUTIME_IRQ];
92 ret = 1; 87 irq_cputime = min(irq_cputime, maxtime);
88 cpustat[CPUTIME_IRQ] += irq_cputime;
93 local_irq_restore(flags); 89 local_irq_restore(flags);
94 return ret; 90 return irq_cputime;
95} 91}
96 92
97static int irqtime_account_si_update(void) 93static cputime_t irqtime_account_si_update(cputime_t maxtime)
98{ 94{
99 u64 *cpustat = kcpustat_this_cpu->cpustat; 95 u64 *cpustat = kcpustat_this_cpu->cpustat;
100 unsigned long flags; 96 unsigned long flags;
101 u64 latest_ns; 97 cputime_t softirq_cputime;
102 int ret = 0;
103 98
104 local_irq_save(flags); 99 local_irq_save(flags);
105 latest_ns = this_cpu_read(cpu_softirq_time); 100 softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) -
106 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) 101 cpustat[CPUTIME_SOFTIRQ];
107 ret = 1; 102 softirq_cputime = min(softirq_cputime, maxtime);
103 cpustat[CPUTIME_SOFTIRQ] += softirq_cputime;
108 local_irq_restore(flags); 104 local_irq_restore(flags);
109 return ret; 105 return softirq_cputime;
110} 106}
111 107
112#else /* CONFIG_IRQ_TIME_ACCOUNTING */ 108#else /* CONFIG_IRQ_TIME_ACCOUNTING */
113 109
114#define sched_clock_irqtime (0) 110#define sched_clock_irqtime (0)
115 111
112static cputime_t irqtime_account_hi_update(cputime_t dummy)
113{
114 return 0;
115}
116
117static cputime_t irqtime_account_si_update(cputime_t dummy)
118{
119 return 0;
120}
121
116#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ 122#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
117 123
118static inline void task_group_account_field(struct task_struct *p, int index, 124static inline void task_group_account_field(struct task_struct *p, int index,
@@ -257,29 +263,42 @@ void account_idle_time(cputime_t cputime)
257 cpustat[CPUTIME_IDLE] += (__force u64) cputime; 263 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
258} 264}
259 265
260static __always_inline bool steal_account_process_tick(void) 266static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
261{ 267{
262#ifdef CONFIG_PARAVIRT 268#ifdef CONFIG_PARAVIRT
263 if (static_key_false(&paravirt_steal_enabled)) { 269 if (static_key_false(&paravirt_steal_enabled)) {
270 cputime_t steal_cputime;
264 u64 steal; 271 u64 steal;
265 unsigned long steal_jiffies;
266 272
267 steal = paravirt_steal_clock(smp_processor_id()); 273 steal = paravirt_steal_clock(smp_processor_id());
268 steal -= this_rq()->prev_steal_time; 274 steal -= this_rq()->prev_steal_time;
269 275
270 /* 276 steal_cputime = min(nsecs_to_cputime(steal), maxtime);
271 * steal is in nsecs but our caller is expecting steal 277 account_steal_time(steal_cputime);
272 * time in jiffies. Lets cast the result to jiffies 278 this_rq()->prev_steal_time += cputime_to_nsecs(steal_cputime);
273 * granularity and account the rest on the next rounds.
274 */
275 steal_jiffies = nsecs_to_jiffies(steal);
276 this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
277 279
278 account_steal_time(jiffies_to_cputime(steal_jiffies)); 280 return steal_cputime;
279 return steal_jiffies;
280 } 281 }
281#endif 282#endif
282 return false; 283 return 0;
284}
285
286/*
287 * Account how much elapsed time was spent in steal, irq, or softirq time.
288 */
289static inline cputime_t account_other_time(cputime_t max)
290{
291 cputime_t accounted;
292
293 accounted = steal_account_process_time(max);
294
295 if (accounted < max)
296 accounted += irqtime_account_hi_update(max - accounted);
297
298 if (accounted < max)
299 accounted += irqtime_account_si_update(max - accounted);
300
301 return accounted;
283} 302}
284 303
285/* 304/*
@@ -342,21 +361,23 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
342static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 361static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
343 struct rq *rq, int ticks) 362 struct rq *rq, int ticks)
344{ 363{
345 cputime_t scaled = cputime_to_scaled(cputime_one_jiffy); 364 u64 cputime = (__force u64) cputime_one_jiffy * ticks;
346 u64 cputime = (__force u64) cputime_one_jiffy; 365 cputime_t scaled, other;
347 u64 *cpustat = kcpustat_this_cpu->cpustat;
348 366
349 if (steal_account_process_tick()) 367 /*
368 * When returning from idle, many ticks can get accounted at
369 * once, including some ticks of steal, irq, and softirq time.
370 * Subtract those ticks from the amount of time accounted to
371 * idle, or potentially user or system time. Due to rounding,
372 * other time can exceed ticks occasionally.
373 */
374 other = account_other_time(cputime);
375 if (other >= cputime)
350 return; 376 return;
377 cputime -= other;
378 scaled = cputime_to_scaled(cputime);
351 379
352 cputime *= ticks; 380 if (this_cpu_ksoftirqd() == p) {
353 scaled *= ticks;
354
355 if (irqtime_account_hi_update()) {
356 cpustat[CPUTIME_IRQ] += cputime;
357 } else if (irqtime_account_si_update()) {
358 cpustat[CPUTIME_SOFTIRQ] += cputime;
359 } else if (this_cpu_ksoftirqd() == p) {
360 /* 381 /*
 361 * ksoftirqd time does not get accounted in cpu_softirq_time. 382 * ksoftirqd time does not get accounted in cpu_softirq_time.
362 * So, we have to handle it separately here. 383 * So, we have to handle it separately here.
@@ -406,6 +427,10 @@ void vtime_common_task_switch(struct task_struct *prev)
406} 427}
407#endif 428#endif
408 429
430#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
431
432
433#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
409/* 434/*
410 * Archs that account the whole time spent in the idle task 435 * Archs that account the whole time spent in the idle task
411 * (outside irq) as idle time can rely on this and just implement 436 * (outside irq) as idle time can rely on this and just implement
@@ -415,33 +440,16 @@ void vtime_common_task_switch(struct task_struct *prev)
415 * vtime_account(). 440 * vtime_account().
416 */ 441 */
417#ifndef __ARCH_HAS_VTIME_ACCOUNT 442#ifndef __ARCH_HAS_VTIME_ACCOUNT
418void vtime_common_account_irq_enter(struct task_struct *tsk) 443void vtime_account_irq_enter(struct task_struct *tsk)
419{ 444{
420 if (!in_interrupt()) { 445 if (!in_interrupt() && is_idle_task(tsk))
421 /* 446 vtime_account_idle(tsk);
422 * If we interrupted user, context_tracking_in_user() 447 else
423 * is 1 because the context tracking don't hook 448 vtime_account_system(tsk);
424 * on irq entry/exit. This way we know if
425 * we need to flush user time on kernel entry.
426 */
427 if (context_tracking_in_user()) {
428 vtime_account_user(tsk);
429 return;
430 }
431
432 if (is_idle_task(tsk)) {
433 vtime_account_idle(tsk);
434 return;
435 }
436 }
437 vtime_account_system(tsk);
438} 449}
439EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter); 450EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
440#endif /* __ARCH_HAS_VTIME_ACCOUNT */ 451#endif /* __ARCH_HAS_VTIME_ACCOUNT */
441#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
442 452
443
444#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
445void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) 453void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
446{ 454{
447 *ut = p->utime; 455 *ut = p->utime;
@@ -466,7 +474,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
466 */ 474 */
467void account_process_tick(struct task_struct *p, int user_tick) 475void account_process_tick(struct task_struct *p, int user_tick)
468{ 476{
469 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 477 cputime_t cputime, scaled, steal;
470 struct rq *rq = this_rq(); 478 struct rq *rq = this_rq();
471 479
472 if (vtime_accounting_cpu_enabled()) 480 if (vtime_accounting_cpu_enabled())
@@ -477,26 +485,21 @@ void account_process_tick(struct task_struct *p, int user_tick)
477 return; 485 return;
478 } 486 }
479 487
480 if (steal_account_process_tick()) 488 cputime = cputime_one_jiffy;
489 steal = steal_account_process_time(cputime);
490
491 if (steal >= cputime)
481 return; 492 return;
482 493
494 cputime -= steal;
495 scaled = cputime_to_scaled(cputime);
496
483 if (user_tick) 497 if (user_tick)
484 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 498 account_user_time(p, cputime, scaled);
485 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 499 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
486 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, 500 account_system_time(p, HARDIRQ_OFFSET, cputime, scaled);
487 one_jiffy_scaled);
488 else 501 else
489 account_idle_time(cputime_one_jiffy); 502 account_idle_time(cputime);
490}
491
492/*
493 * Account multiple ticks of steal time.
494 * @p: the process from which the cpu time has been stolen
495 * @ticks: number of stolen ticks
496 */
497void account_steal_ticks(unsigned long ticks)
498{
499 account_steal_time(jiffies_to_cputime(ticks));
500} 503}
501 504
502/* 505/*
@@ -681,12 +684,14 @@ static cputime_t vtime_delta(struct task_struct *tsk)
681static cputime_t get_vtime_delta(struct task_struct *tsk) 684static cputime_t get_vtime_delta(struct task_struct *tsk)
682{ 685{
683 unsigned long now = READ_ONCE(jiffies); 686 unsigned long now = READ_ONCE(jiffies);
684 unsigned long delta = now - tsk->vtime_snap; 687 cputime_t delta, other;
685 688
689 delta = jiffies_to_cputime(now - tsk->vtime_snap);
690 other = account_other_time(delta);
686 WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); 691 WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
687 tsk->vtime_snap = now; 692 tsk->vtime_snap = now;
688 693
689 return jiffies_to_cputime(delta); 694 return delta - other;
690} 695}
691 696
692static void __vtime_account_system(struct task_struct *tsk) 697static void __vtime_account_system(struct task_struct *tsk)
@@ -706,16 +711,6 @@ void vtime_account_system(struct task_struct *tsk)
706 write_seqcount_end(&tsk->vtime_seqcount); 711 write_seqcount_end(&tsk->vtime_seqcount);
707} 712}
708 713
709void vtime_gen_account_irq_exit(struct task_struct *tsk)
710{
711 write_seqcount_begin(&tsk->vtime_seqcount);
712 if (vtime_delta(tsk))
713 __vtime_account_system(tsk);
714 if (context_tracking_in_user())
715 tsk->vtime_snap_whence = VTIME_USER;
716 write_seqcount_end(&tsk->vtime_seqcount);
717}
718
719void vtime_account_user(struct task_struct *tsk) 714void vtime_account_user(struct task_struct *tsk)
720{ 715{
721 cputime_t delta_cpu; 716 cputime_t delta_cpu;
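The cputime.c changes above replace the old "skip the tick if any steal/irq/softirq happened" checks with account_other_time(), which hands out a bounded budget to steal, hardirq and softirq time in that order and leaves the remainder for user/system/idle. A toy, userspace-only sketch of that clamping (the pending_* values are invented nanosecond counts, not kernel state):

#include <stdio.h>

typedef unsigned long long u64;

static u64 min_u64(u64 a, u64 b) { return a < b ? a : b; }

/* Invented amounts of not-yet-accounted "other" time. */
static u64 pending_steal = 300, pending_irq = 500, pending_softirq = 400;

/* Mirrors account_other_time(): consume at most @max, steal first. */
static u64 toy_account_other_time(u64 max)
{
	u64 accounted = min_u64(pending_steal, max);

	if (accounted < max)
		accounted += min_u64(pending_irq, max - accounted);
	if (accounted < max)
		accounted += min_u64(pending_softirq, max - accounted);

	return accounted;
}

int main(void)
{
	u64 tick = 1000;	/* elapsed time to distribute */
	u64 other = toy_account_other_time(tick);

	/* 300 steal + 500 irq + 200 softirq (clamped from 400) = 1000 */
	printf("other = %llu, left for user/system/idle = %llu\n",
	       other, tick - other);
	return 0;
}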
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index cf905f655ba1..2a0a9995256d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -427,19 +427,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
427 SPLIT_NS(p->se.vruntime), 427 SPLIT_NS(p->se.vruntime),
428 (long long)(p->nvcsw + p->nivcsw), 428 (long long)(p->nvcsw + p->nivcsw),
429 p->prio); 429 p->prio);
430#ifdef CONFIG_SCHEDSTATS 430
431 if (schedstat_enabled()) {
432 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
433 SPLIT_NS(p->se.statistics.wait_sum),
434 SPLIT_NS(p->se.sum_exec_runtime),
435 SPLIT_NS(p->se.statistics.sum_sleep_runtime));
436 }
437#else
438 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", 431 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
439 0LL, 0L, 432 SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)),
440 SPLIT_NS(p->se.sum_exec_runtime), 433 SPLIT_NS(p->se.sum_exec_runtime),
441 0LL, 0L); 434 SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime)));
442#endif 435
443#ifdef CONFIG_NUMA_BALANCING 436#ifdef CONFIG_NUMA_BALANCING
444 SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); 437 SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
445#endif 438#endif
@@ -886,9 +879,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
886 879
887 nr_switches = p->nvcsw + p->nivcsw; 880 nr_switches = p->nvcsw + p->nivcsw;
888 881
889#ifdef CONFIG_SCHEDSTATS
890 P(se.nr_migrations); 882 P(se.nr_migrations);
891 883
884#ifdef CONFIG_SCHEDSTATS
892 if (schedstat_enabled()) { 885 if (schedstat_enabled()) {
893 u64 avg_atom, avg_per_cpu; 886 u64 avg_atom, avg_per_cpu;
894 887
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 218f8e83db73..4088eedea763 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -690,6 +690,11 @@ void init_entity_runnable_average(struct sched_entity *se)
690 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ 690 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
691} 691}
692 692
693static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
694static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
695static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force);
696static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
697
693/* 698/*
694 * With new tasks being created, their initial util_avgs are extrapolated 699 * With new tasks being created, their initial util_avgs are extrapolated
695 * based on the cfs_rq's current util_avg: 700 * based on the cfs_rq's current util_avg:
@@ -720,6 +725,8 @@ void post_init_entity_util_avg(struct sched_entity *se)
720 struct cfs_rq *cfs_rq = cfs_rq_of(se); 725 struct cfs_rq *cfs_rq = cfs_rq_of(se);
721 struct sched_avg *sa = &se->avg; 726 struct sched_avg *sa = &se->avg;
722 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; 727 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
728 u64 now = cfs_rq_clock_task(cfs_rq);
729 int tg_update;
723 730
724 if (cap > 0) { 731 if (cap > 0) {
725 if (cfs_rq->avg.util_avg != 0) { 732 if (cfs_rq->avg.util_avg != 0) {
@@ -733,18 +740,42 @@ void post_init_entity_util_avg(struct sched_entity *se)
733 } 740 }
734 sa->util_sum = sa->util_avg * LOAD_AVG_MAX; 741 sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
735 } 742 }
743
744 if (entity_is_task(se)) {
745 struct task_struct *p = task_of(se);
746 if (p->sched_class != &fair_sched_class) {
747 /*
748 * For !fair tasks do:
749 *
750 update_cfs_rq_load_avg(now, cfs_rq, false);
751 attach_entity_load_avg(cfs_rq, se);
752 switched_from_fair(rq, p);
753 *
754 * such that the next switched_to_fair() has the
755 * expected state.
756 */
757 se->avg.last_update_time = now;
758 return;
759 }
760 }
761
762 tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
763 attach_entity_load_avg(cfs_rq, se);
764 if (tg_update)
765 update_tg_load_avg(cfs_rq, false);
736} 766}
737 767
738static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); 768#else /* !CONFIG_SMP */
739static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
740#else
741void init_entity_runnable_average(struct sched_entity *se) 769void init_entity_runnable_average(struct sched_entity *se)
742{ 770{
743} 771}
744void post_init_entity_util_avg(struct sched_entity *se) 772void post_init_entity_util_avg(struct sched_entity *se)
745{ 773{
746} 774}
747#endif 775static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
776{
777}
778#endif /* CONFIG_SMP */
748 779
749/* 780/*
750 * Update the current task's runtime statistics. 781 * Update the current task's runtime statistics.
@@ -1305,6 +1336,8 @@ static void task_numa_assign(struct task_numa_env *env,
1305{ 1336{
1306 if (env->best_task) 1337 if (env->best_task)
1307 put_task_struct(env->best_task); 1338 put_task_struct(env->best_task);
1339 if (p)
1340 get_task_struct(p);
1308 1341
1309 env->best_task = p; 1342 env->best_task = p;
1310 env->best_imp = imp; 1343 env->best_imp = imp;
@@ -1372,31 +1405,11 @@ static void task_numa_compare(struct task_numa_env *env,
1372 long imp = env->p->numa_group ? groupimp : taskimp; 1405 long imp = env->p->numa_group ? groupimp : taskimp;
1373 long moveimp = imp; 1406 long moveimp = imp;
1374 int dist = env->dist; 1407 int dist = env->dist;
1375 bool assigned = false;
1376 1408
1377 rcu_read_lock(); 1409 rcu_read_lock();
1378 1410 cur = task_rcu_dereference(&dst_rq->curr);
1379 raw_spin_lock_irq(&dst_rq->lock); 1411 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
1380 cur = dst_rq->curr;
1381 /*
1382 * No need to move the exiting task or idle task.
1383 */
1384 if ((cur->flags & PF_EXITING) || is_idle_task(cur))
1385 cur = NULL; 1412 cur = NULL;
1386 else {
1387 /*
1388 * The task_struct must be protected here to protect the
1389 * p->numa_faults access in the task_weight since the
1390 * numa_faults could already be freed in the following path:
1391 * finish_task_switch()
1392 * --> put_task_struct()
1393 * --> __put_task_struct()
1394 * --> task_numa_free()
1395 */
1396 get_task_struct(cur);
1397 }
1398
1399 raw_spin_unlock_irq(&dst_rq->lock);
1400 1413
1401 /* 1414 /*
1402 * Because we have preemption enabled we can get migrated around and 1415 * Because we have preemption enabled we can get migrated around and
@@ -1479,7 +1492,6 @@ balance:
1479 */ 1492 */
1480 if (!load_too_imbalanced(src_load, dst_load, env)) { 1493 if (!load_too_imbalanced(src_load, dst_load, env)) {
1481 imp = moveimp - 1; 1494 imp = moveimp - 1;
1482 put_task_struct(cur);
1483 cur = NULL; 1495 cur = NULL;
1484 goto assign; 1496 goto assign;
1485 } 1497 }
@@ -1505,16 +1517,9 @@ balance:
1505 env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); 1517 env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
1506 1518
1507assign: 1519assign:
1508 assigned = true;
1509 task_numa_assign(env, cur, imp); 1520 task_numa_assign(env, cur, imp);
1510unlock: 1521unlock:
1511 rcu_read_unlock(); 1522 rcu_read_unlock();
1512 /*
1513 * The dst_rq->curr isn't assigned. The protection for task_struct is
1514 * finished.
1515 */
1516 if (cur && !assigned)
1517 put_task_struct(cur);
1518} 1523}
1519 1524
1520static void task_numa_find_cpu(struct task_numa_env *env, 1525static void task_numa_find_cpu(struct task_numa_env *env,
@@ -2499,28 +2504,22 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2499 2504
2500#ifdef CONFIG_FAIR_GROUP_SCHED 2505#ifdef CONFIG_FAIR_GROUP_SCHED
2501# ifdef CONFIG_SMP 2506# ifdef CONFIG_SMP
2502static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) 2507static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2503{ 2508{
2504 long tg_weight; 2509 long tg_weight, load, shares;
2505 2510
2506 /* 2511 /*
2507 * Use this CPU's real-time load instead of the last load contribution 2512 * This really should be: cfs_rq->avg.load_avg, but instead we use
2508 * as the updating of the contribution is delayed, and we will use the 2513 * cfs_rq->load.weight, which is its upper bound. This helps ramp up
2509 * the real-time load to calc the share. See update_tg_load_avg(). 2514 * the shares for small weight interactive tasks.
2510 */ 2515 */
2511 tg_weight = atomic_long_read(&tg->load_avg); 2516 load = scale_load_down(cfs_rq->load.weight);
2512 tg_weight -= cfs_rq->tg_load_avg_contrib;
2513 tg_weight += cfs_rq->load.weight;
2514 2517
2515 return tg_weight; 2518 tg_weight = atomic_long_read(&tg->load_avg);
2516}
2517
2518static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2519{
2520 long tg_weight, load, shares;
2521 2519
2522 tg_weight = calc_tg_weight(tg, cfs_rq); 2520 /* Ensure tg_weight >= load */
2523 load = cfs_rq->load.weight; 2521 tg_weight -= cfs_rq->tg_load_avg_contrib;
2522 tg_weight += load;
2524 2523
2525 shares = (tg->shares * load); 2524 shares = (tg->shares * load);
2526 if (tg_weight) 2525 if (tg_weight)
@@ -2539,6 +2538,7 @@ static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2539 return tg->shares; 2538 return tg->shares;
2540} 2539}
2541# endif /* CONFIG_SMP */ 2540# endif /* CONFIG_SMP */
2541
2542static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 2542static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2543 unsigned long weight) 2543 unsigned long weight)
2544{ 2544{
@@ -2873,8 +2873,6 @@ void set_task_rq_fair(struct sched_entity *se,
2873static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} 2873static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
2874#endif /* CONFIG_FAIR_GROUP_SCHED */ 2874#endif /* CONFIG_FAIR_GROUP_SCHED */
2875 2875
2876static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
2877
2878static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) 2876static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
2879{ 2877{
2880 struct rq *rq = rq_of(cfs_rq); 2878 struct rq *rq = rq_of(cfs_rq);
@@ -2904,7 +2902,40 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
2904 } 2902 }
2905} 2903}
2906 2904
2907/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ 2905/*
2906 * Unsigned subtract and clamp on underflow.
2907 *
2908 * Explicitly do a load-store to ensure the intermediate value never hits
2909 * memory. This allows lockless observations without ever seeing the negative
2910 * values.
2911 */
2912#define sub_positive(_ptr, _val) do { \
2913 typeof(_ptr) ptr = (_ptr); \
2914 typeof(*ptr) val = (_val); \
2915 typeof(*ptr) res, var = READ_ONCE(*ptr); \
2916 res = var - val; \
2917 if (res > var) \
2918 res = 0; \
2919 WRITE_ONCE(*ptr, res); \
2920} while (0)
2921
2922/**
2923 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
2924 * @now: current time, as per cfs_rq_clock_task()
2925 * @cfs_rq: cfs_rq to update
2926 * @update_freq: should we call cfs_rq_util_change() or will the call do so
2927 *
2928 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
2929 * avg. The immediate corollary is that all (fair) tasks must be attached, see
2930 * post_init_entity_util_avg().
2931 *
2932 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
2933 *
2934 * Returns true if the load decayed or we removed utilization. It is expected
2935 * that one calls update_tg_load_avg() on this condition, but after you've
2936 * modified the cfs_rq avg (attach/detach), such that we propagate the new
2937 * avg up.
2938 */
2908static inline int 2939static inline int
2909update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) 2940update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
2910{ 2941{
@@ -2913,15 +2944,15 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
2913 2944
2914 if (atomic_long_read(&cfs_rq->removed_load_avg)) { 2945 if (atomic_long_read(&cfs_rq->removed_load_avg)) {
2915 s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); 2946 s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
2916 sa->load_avg = max_t(long, sa->load_avg - r, 0); 2947 sub_positive(&sa->load_avg, r);
2917 sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); 2948 sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
2918 removed_load = 1; 2949 removed_load = 1;
2919 } 2950 }
2920 2951
2921 if (atomic_long_read(&cfs_rq->removed_util_avg)) { 2952 if (atomic_long_read(&cfs_rq->removed_util_avg)) {
2922 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); 2953 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
2923 sa->util_avg = max_t(long, sa->util_avg - r, 0); 2954 sub_positive(&sa->util_avg, r);
2924 sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0); 2955 sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
2925 removed_util = 1; 2956 removed_util = 1;
2926 } 2957 }
2927 2958
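The sub_positive() helper used above does the subtraction on a local copy, clamps unsigned underflow to zero and writes the result back once, so a lockless reader never observes a wrapped-around intermediate value. A standalone sketch of the clamping behaviour (GCC's typeof extension, as in the kernel macro, but without READ_ONCE()/WRITE_ONCE(), so only the arithmetic is shown):

#include <stdio.h>

#define sub_positive(_ptr, _val) do {			\
	typeof(_ptr) ptr = (_ptr);			\
	typeof(*ptr) val = (_val);			\
	typeof(*ptr) res, var = *ptr;			\
	res = var - val;				\
	if (res > var)		/* unsigned underflow */\
		res = 0;				\
	*ptr = res;					\
} while (0)

int main(void)
{
	unsigned long load_avg = 5;

	sub_positive(&load_avg, 3UL);	/* 5 - 3 = 2 */
	printf("%lu\n", load_avg);

	sub_positive(&load_avg, 9UL);	/* would underflow; clamps to 0 */
	printf("%lu\n", load_avg);
	return 0;
}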
@@ -2959,6 +2990,14 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
2959 update_tg_load_avg(cfs_rq, 0); 2990 update_tg_load_avg(cfs_rq, 0);
2960} 2991}
2961 2992
2993/**
2994 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
2995 * @cfs_rq: cfs_rq to attach to
2996 * @se: sched_entity to attach
2997 *
2998 * Must call update_cfs_rq_load_avg() before this, since we rely on
2999 * cfs_rq->avg.last_update_time being current.
3000 */
2962static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3001static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2963{ 3002{
2964 if (!sched_feat(ATTACH_AGE_LOAD)) 3003 if (!sched_feat(ATTACH_AGE_LOAD))
@@ -2967,6 +3006,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
2967 /* 3006 /*
2968 * If we got migrated (either between CPUs or between cgroups) we'll 3007 * If we got migrated (either between CPUs or between cgroups) we'll
2969 * have aged the average right before clearing @last_update_time. 3008 * have aged the average right before clearing @last_update_time.
3009 *
3010 * Or we're fresh through post_init_entity_util_avg().
2970 */ 3011 */
2971 if (se->avg.last_update_time) { 3012 if (se->avg.last_update_time) {
2972 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), 3013 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
@@ -2988,16 +3029,24 @@ skip_aging:
2988 cfs_rq_util_change(cfs_rq); 3029 cfs_rq_util_change(cfs_rq);
2989} 3030}
2990 3031
3032/**
3033 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
3034 * @cfs_rq: cfs_rq to detach from
3035 * @se: sched_entity to detach
3036 *
3037 * Must call update_cfs_rq_load_avg() before this, since we rely on
3038 * cfs_rq->avg.last_update_time being current.
3039 */
2991static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3040static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2992{ 3041{
2993 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), 3042 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
2994 &se->avg, se->on_rq * scale_load_down(se->load.weight), 3043 &se->avg, se->on_rq * scale_load_down(se->load.weight),
2995 cfs_rq->curr == se, NULL); 3044 cfs_rq->curr == se, NULL);
2996 3045
2997 cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); 3046 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
2998 cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); 3047 sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
2999 cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); 3048 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3000 cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); 3049 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3001 3050
3002 cfs_rq_util_change(cfs_rq); 3051 cfs_rq_util_change(cfs_rq);
3003} 3052}
@@ -3072,11 +3121,14 @@ void remove_entity_load_avg(struct sched_entity *se)
3072 u64 last_update_time; 3121 u64 last_update_time;
3073 3122
3074 /* 3123 /*
3075 * Newly created task or never used group entity should not be removed 3124 * tasks cannot exit without having gone through wake_up_new_task() ->
3076 * from its (source) cfs_rq 3125 * post_init_entity_util_avg() which will have added things to the
3126 * cfs_rq, so we can remove unconditionally.
3127 *
3128 * Similarly for groups, they will have passed through
3129 * post_init_entity_util_avg() before unregister_sched_fair_group()
3130 * calls this.
3077 */ 3131 */
3078 if (se->avg.last_update_time == 0)
3079 return;
3080 3132
3081 last_update_time = cfs_rq_last_update_time(cfs_rq); 3133 last_update_time = cfs_rq_last_update_time(cfs_rq);
3082 3134
@@ -3099,6 +3151,12 @@ static int idle_balance(struct rq *this_rq);
3099 3151
3100#else /* CONFIG_SMP */ 3152#else /* CONFIG_SMP */
3101 3153
3154static inline int
3155update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
3156{
3157 return 0;
3158}
3159
3102static inline void update_load_avg(struct sched_entity *se, int not_used) 3160static inline void update_load_avg(struct sched_entity *se, int not_used)
3103{ 3161{
3104 struct cfs_rq *cfs_rq = cfs_rq_of(se); 3162 struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -3246,7 +3304,7 @@ static inline void check_schedstat_required(void)
3246 trace_sched_stat_iowait_enabled() || 3304 trace_sched_stat_iowait_enabled() ||
3247 trace_sched_stat_blocked_enabled() || 3305 trace_sched_stat_blocked_enabled() ||
3248 trace_sched_stat_runtime_enabled()) { 3306 trace_sched_stat_runtime_enabled()) {
3249 pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, " 3307 printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
3250 "stat_blocked and stat_runtime require the " 3308 "stat_blocked and stat_runtime require the "
3251 "kernel parameter schedstats=enabled or " 3309 "kernel parameter schedstats=enabled or "
3252 "kernel.sched_schedstats=1\n"); 3310 "kernel.sched_schedstats=1\n");
@@ -3688,7 +3746,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3688static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) 3746static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3689{ 3747{
3690 if (unlikely(cfs_rq->throttle_count)) 3748 if (unlikely(cfs_rq->throttle_count))
3691 return cfs_rq->throttled_clock_task; 3749 return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
3692 3750
3693 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; 3751 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
3694} 3752}
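With illustrative numbers for the cfs_rq_clock_task() fix above: suppose rq_clock_task() read 1000 when the cfs_rq was throttled (so throttled_clock_task = 1000) and 300 units of throttled time had already accumulated (throttled_clock_task_time = 300). Just before throttling the function returned 1000 - 300 = 700; the old throttled path then returned the raw 1000, a forward jump of 300, whereas the fixed path returns 1000 - 300 = 700, so the task clock simply freezes while throttled and resumes monotonically on unthrottle.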
@@ -3826,13 +3884,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
3826 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; 3884 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3827 3885
3828 cfs_rq->throttle_count--; 3886 cfs_rq->throttle_count--;
3829#ifdef CONFIG_SMP
3830 if (!cfs_rq->throttle_count) { 3887 if (!cfs_rq->throttle_count) {
3831 /* adjust cfs_rq_clock_task() */ 3888 /* adjust cfs_rq_clock_task() */
3832 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - 3889 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
3833 cfs_rq->throttled_clock_task; 3890 cfs_rq->throttled_clock_task;
3834 } 3891 }
3835#endif
3836 3892
3837 return 0; 3893 return 0;
3838} 3894}
@@ -4199,6 +4255,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
4199 throttle_cfs_rq(cfs_rq); 4255 throttle_cfs_rq(cfs_rq);
4200} 4256}
4201 4257
4258static void sync_throttle(struct task_group *tg, int cpu)
4259{
4260 struct cfs_rq *pcfs_rq, *cfs_rq;
4261
4262 if (!cfs_bandwidth_used())
4263 return;
4264
4265 if (!tg->parent)
4266 return;
4267
4268 cfs_rq = tg->cfs_rq[cpu];
4269 pcfs_rq = tg->parent->cfs_rq[cpu];
4270
4271 cfs_rq->throttle_count = pcfs_rq->throttle_count;
4272 pcfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
4273}
4274
4202/* conditionally throttle active cfs_rq's from put_prev_entity() */ 4275/* conditionally throttle active cfs_rq's from put_prev_entity() */
4203static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) 4276static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4204{ 4277{
@@ -4338,6 +4411,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4338static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} 4411static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
4339static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } 4412static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
4340static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 4413static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
4414static inline void sync_throttle(struct task_group *tg, int cpu) {}
4341static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 4415static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4342 4416
4343static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) 4417static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
@@ -4446,7 +4520,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4446 * 4520 *
4447 * note: in the case of encountering a throttled cfs_rq we will 4521 * note: in the case of encountering a throttled cfs_rq we will
4448 * post the final h_nr_running increment below. 4522 * post the final h_nr_running increment below.
4449 */ 4523 */
4450 if (cfs_rq_throttled(cfs_rq)) 4524 if (cfs_rq_throttled(cfs_rq))
4451 break; 4525 break;
4452 cfs_rq->h_nr_running++; 4526 cfs_rq->h_nr_running++;
@@ -4500,15 +4574,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4500 4574
4501 /* Don't dequeue parent if it has other entities besides us */ 4575 /* Don't dequeue parent if it has other entities besides us */
4502 if (cfs_rq->load.weight) { 4576 if (cfs_rq->load.weight) {
4577 /* Avoid re-evaluating load for this entity: */
4578 se = parent_entity(se);
4503 /* 4579 /*
4504 * Bias pick_next to pick a task from this cfs_rq, as 4580 * Bias pick_next to pick a task from this cfs_rq, as
4505 * p is sleeping when it is within its sched_slice. 4581 * p is sleeping when it is within its sched_slice.
4506 */ 4582 */
4507 if (task_sleep && parent_entity(se)) 4583 if (task_sleep && se && !throttled_hierarchy(cfs_rq))
4508 set_next_buddy(parent_entity(se)); 4584 set_next_buddy(se);
4509
4510 /* avoid re-evaluating load for this entity */
4511 se = parent_entity(se);
4512 break; 4585 break;
4513 } 4586 }
4514 flags |= DEQUEUE_SLEEP; 4587 flags |= DEQUEUE_SLEEP;
@@ -4910,19 +4983,24 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4910 return wl; 4983 return wl;
4911 4984
4912 for_each_sched_entity(se) { 4985 for_each_sched_entity(se) {
4913 long w, W; 4986 struct cfs_rq *cfs_rq = se->my_q;
4987 long W, w = cfs_rq_load_avg(cfs_rq);
4914 4988
4915 tg = se->my_q->tg; 4989 tg = cfs_rq->tg;
4916 4990
4917 /* 4991 /*
4918 * W = @wg + \Sum rw_j 4992 * W = @wg + \Sum rw_j
4919 */ 4993 */
4920 W = wg + calc_tg_weight(tg, se->my_q); 4994 W = wg + atomic_long_read(&tg->load_avg);
4995
4996 /* Ensure \Sum rw_j >= rw_i */
4997 W -= cfs_rq->tg_load_avg_contrib;
4998 W += w;
4921 4999
4922 /* 5000 /*
4923 * w = rw_i + @wl 5001 * w = rw_i + @wl
4924 */ 5002 */
4925 w = cfs_rq_load_avg(se->my_q) + wl; 5003 w += wl;
4926 5004
4927 /* 5005 /*
4928 * wl = S * s'_i; see (2) 5006 * wl = S * s'_i; see (2)
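In the effective_load() hunk above, the group weight is now built from the same smoothed load as w: W = wg + tg->load_avg - cfs_rq->tg_load_avg_contrib + cfs_rq_load_avg(cfs_rq). With invented numbers: if tg->load_avg is 2048, this cfs_rq's last published contribution is a stale 512 and its current load average is 1024, the \Sum rw_j term becomes 2048 - 512 + 1024 = 2560, which by construction contains at least this cfs_rq's own rw_i of 1024, matching the "Ensure \Sum rw_j >= rw_i" comment.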
@@ -8283,31 +8361,17 @@ static void task_fork_fair(struct task_struct *p)
8283{ 8361{
8284 struct cfs_rq *cfs_rq; 8362 struct cfs_rq *cfs_rq;
8285 struct sched_entity *se = &p->se, *curr; 8363 struct sched_entity *se = &p->se, *curr;
8286 int this_cpu = smp_processor_id();
8287 struct rq *rq = this_rq(); 8364 struct rq *rq = this_rq();
8288 unsigned long flags;
8289
8290 raw_spin_lock_irqsave(&rq->lock, flags);
8291 8365
8366 raw_spin_lock(&rq->lock);
8292 update_rq_clock(rq); 8367 update_rq_clock(rq);
8293 8368
8294 cfs_rq = task_cfs_rq(current); 8369 cfs_rq = task_cfs_rq(current);
8295 curr = cfs_rq->curr; 8370 curr = cfs_rq->curr;
8296 8371 if (curr) {
8297 /* 8372 update_curr(cfs_rq);
8298 * Not only the cpu but also the task_group of the parent might have
8299 * been changed after parent->se.parent,cfs_rq were copied to
8300 * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
8301 * of child point to valid ones.
8302 */
8303 rcu_read_lock();
8304 __set_task_cpu(p, this_cpu);
8305 rcu_read_unlock();
8306
8307 update_curr(cfs_rq);
8308
8309 if (curr)
8310 se->vruntime = curr->vruntime; 8373 se->vruntime = curr->vruntime;
8374 }
8311 place_entity(cfs_rq, se, 1); 8375 place_entity(cfs_rq, se, 1);
8312 8376
8313 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { 8377 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
@@ -8320,8 +8384,7 @@ static void task_fork_fair(struct task_struct *p)
8320 } 8384 }
8321 8385
8322 se->vruntime -= cfs_rq->min_vruntime; 8386 se->vruntime -= cfs_rq->min_vruntime;
8323 8387 raw_spin_unlock(&rq->lock);
8324 raw_spin_unlock_irqrestore(&rq->lock, flags);
8325} 8388}
8326 8389
8327/* 8390/*
@@ -8377,6 +8440,8 @@ static void detach_task_cfs_rq(struct task_struct *p)
8377{ 8440{
8378 struct sched_entity *se = &p->se; 8441 struct sched_entity *se = &p->se;
8379 struct cfs_rq *cfs_rq = cfs_rq_of(se); 8442 struct cfs_rq *cfs_rq = cfs_rq_of(se);
8443 u64 now = cfs_rq_clock_task(cfs_rq);
8444 int tg_update;
8380 8445
8381 if (!vruntime_normalized(p)) { 8446 if (!vruntime_normalized(p)) {
8382 /* 8447 /*
@@ -8388,13 +8453,18 @@ static void detach_task_cfs_rq(struct task_struct *p)
8388 } 8453 }
8389 8454
8390 /* Catch up with the cfs_rq and remove our load when we leave */ 8455 /* Catch up with the cfs_rq and remove our load when we leave */
8456 tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
8391 detach_entity_load_avg(cfs_rq, se); 8457 detach_entity_load_avg(cfs_rq, se);
8458 if (tg_update)
8459 update_tg_load_avg(cfs_rq, false);
8392} 8460}
8393 8461
8394static void attach_task_cfs_rq(struct task_struct *p) 8462static void attach_task_cfs_rq(struct task_struct *p)
8395{ 8463{
8396 struct sched_entity *se = &p->se; 8464 struct sched_entity *se = &p->se;
8397 struct cfs_rq *cfs_rq = cfs_rq_of(se); 8465 struct cfs_rq *cfs_rq = cfs_rq_of(se);
8466 u64 now = cfs_rq_clock_task(cfs_rq);
8467 int tg_update;
8398 8468
8399#ifdef CONFIG_FAIR_GROUP_SCHED 8469#ifdef CONFIG_FAIR_GROUP_SCHED
8400 /* 8470 /*
@@ -8405,7 +8475,10 @@ static void attach_task_cfs_rq(struct task_struct *p)
8405#endif 8475#endif
8406 8476
8407 /* Synchronize task with its cfs_rq */ 8477 /* Synchronize task with its cfs_rq */
8478 tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
8408 attach_entity_load_avg(cfs_rq, se); 8479 attach_entity_load_avg(cfs_rq, se);
8480 if (tg_update)
8481 update_tg_load_avg(cfs_rq, false);
8409 8482
8410 if (!vruntime_normalized(p)) 8483 if (!vruntime_normalized(p))
8411 se->vruntime += cfs_rq->min_vruntime; 8484 se->vruntime += cfs_rq->min_vruntime;
@@ -8465,6 +8538,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
8465} 8538}
8466 8539
8467#ifdef CONFIG_FAIR_GROUP_SCHED 8540#ifdef CONFIG_FAIR_GROUP_SCHED
8541static void task_set_group_fair(struct task_struct *p)
8542{
8543 struct sched_entity *se = &p->se;
8544
8545 set_task_rq(p, task_cpu(p));
8546 se->depth = se->parent ? se->parent->depth + 1 : 0;
8547}
8548
8468static void task_move_group_fair(struct task_struct *p) 8549static void task_move_group_fair(struct task_struct *p)
8469{ 8550{
8470 detach_task_cfs_rq(p); 8551 detach_task_cfs_rq(p);
@@ -8477,6 +8558,19 @@ static void task_move_group_fair(struct task_struct *p)
8477 attach_task_cfs_rq(p); 8558 attach_task_cfs_rq(p);
8478} 8559}
8479 8560
8561static void task_change_group_fair(struct task_struct *p, int type)
8562{
8563 switch (type) {
8564 case TASK_SET_GROUP:
8565 task_set_group_fair(p);
8566 break;
8567
8568 case TASK_MOVE_GROUP:
8569 task_move_group_fair(p);
8570 break;
8571 }
8572}
8573
8480void free_fair_sched_group(struct task_group *tg) 8574void free_fair_sched_group(struct task_group *tg)
8481{ 8575{
8482 int i; 8576 int i;
@@ -8496,8 +8590,9 @@ void free_fair_sched_group(struct task_group *tg)
8496 8590
8497int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 8591int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8498{ 8592{
8499 struct cfs_rq *cfs_rq;
8500 struct sched_entity *se; 8593 struct sched_entity *se;
8594 struct cfs_rq *cfs_rq;
8595 struct rq *rq;
8501 int i; 8596 int i;
8502 8597
8503 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 8598 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8512,6 +8607,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8512 init_cfs_bandwidth(tg_cfs_bandwidth(tg)); 8607 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8513 8608
8514 for_each_possible_cpu(i) { 8609 for_each_possible_cpu(i) {
8610 rq = cpu_rq(i);
8611
8515 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8612 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8516 GFP_KERNEL, cpu_to_node(i)); 8613 GFP_KERNEL, cpu_to_node(i));
8517 if (!cfs_rq) 8614 if (!cfs_rq)
@@ -8525,7 +8622,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8525 init_cfs_rq(cfs_rq); 8622 init_cfs_rq(cfs_rq);
8526 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); 8623 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8527 init_entity_runnable_average(se); 8624 init_entity_runnable_average(se);
8528 post_init_entity_util_avg(se);
8529 } 8625 }
8530 8626
8531 return 1; 8627 return 1;
@@ -8536,6 +8632,23 @@ err:
8536 return 0; 8632 return 0;
8537} 8633}
8538 8634
8635void online_fair_sched_group(struct task_group *tg)
8636{
8637 struct sched_entity *se;
8638 struct rq *rq;
8639 int i;
8640
8641 for_each_possible_cpu(i) {
8642 rq = cpu_rq(i);
8643 se = tg->se[i];
8644
8645 raw_spin_lock_irq(&rq->lock);
8646 post_init_entity_util_avg(se);
8647 sync_throttle(tg, i);
8648 raw_spin_unlock_irq(&rq->lock);
8649 }
8650}
8651
8539void unregister_fair_sched_group(struct task_group *tg) 8652void unregister_fair_sched_group(struct task_group *tg)
8540{ 8653{
8541 unsigned long flags; 8654 unsigned long flags;
@@ -8640,6 +8753,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8640 return 1; 8753 return 1;
8641} 8754}
8642 8755
8756void online_fair_sched_group(struct task_group *tg) { }
8757
8643void unregister_fair_sched_group(struct task_group *tg) { } 8758void unregister_fair_sched_group(struct task_group *tg) { }
8644 8759
8645#endif /* CONFIG_FAIR_GROUP_SCHED */ 8760#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -8699,7 +8814,7 @@ const struct sched_class fair_sched_class = {
8699 .update_curr = update_curr_fair, 8814 .update_curr = update_curr_fair,
8700 8815
8701#ifdef CONFIG_FAIR_GROUP_SCHED 8816#ifdef CONFIG_FAIR_GROUP_SCHED
8702 .task_move_group = task_move_group_fair, 8817 .task_change_group = task_change_group_fair,
8703#endif 8818#endif
8704}; 8819};
8705 8820
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index bd12c6c714ec..9fb873cfc75c 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -127,7 +127,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
127 */ 127 */
128static void cpuidle_idle_call(void) 128static void cpuidle_idle_call(void)
129{ 129{
130 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); 130 struct cpuidle_device *dev = cpuidle_get_device();
131 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); 131 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
132 int next_state, entered_state; 132 int next_state, entered_state;
133 133
@@ -201,6 +201,8 @@ exit_idle:
201 */ 201 */
202static void cpu_idle_loop(void) 202static void cpu_idle_loop(void)
203{ 203{
204 int cpu = smp_processor_id();
205
204 while (1) { 206 while (1) {
205 /* 207 /*
206 * If the arch has a polling bit, we maintain an invariant: 208 * If the arch has a polling bit, we maintain an invariant:
@@ -219,7 +221,7 @@ static void cpu_idle_loop(void)
219 check_pgt_cache(); 221 check_pgt_cache();
220 rmb(); 222 rmb();
221 223
222 if (cpu_is_offline(smp_processor_id())) { 224 if (cpu_is_offline(cpu)) {
223 cpuhp_report_idle_dead(); 225 cpuhp_report_idle_dead();
224 arch_cpu_idle_dead(); 226 arch_cpu_idle_dead();
225 } 227 }
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index b0b93fd33af9..a2d6eb71f06b 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -78,11 +78,11 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
78 loads[2] = (avenrun[2] + offset) << shift; 78 loads[2] = (avenrun[2] + offset) << shift;
79} 79}
80 80
81long calc_load_fold_active(struct rq *this_rq) 81long calc_load_fold_active(struct rq *this_rq, long adjust)
82{ 82{
83 long nr_active, delta = 0; 83 long nr_active, delta = 0;
84 84
85 nr_active = this_rq->nr_running; 85 nr_active = this_rq->nr_running - adjust;
86 nr_active += (long)this_rq->nr_uninterruptible; 86 nr_active += (long)this_rq->nr_uninterruptible;
87 87
88 if (nr_active != this_rq->calc_load_active) { 88 if (nr_active != this_rq->calc_load_active) {
@@ -188,7 +188,7 @@ void calc_load_enter_idle(void)
188 * We're going into NOHZ mode, if there's any pending delta, fold it 188 * We're going into NOHZ mode, if there's any pending delta, fold it
189 * into the pending idle delta. 189 * into the pending idle delta.
190 */ 190 */
191 delta = calc_load_fold_active(this_rq); 191 delta = calc_load_fold_active(this_rq, 0);
192 if (delta) { 192 if (delta) {
193 int idx = calc_load_write_idx(); 193 int idx = calc_load_write_idx();
194 194
@@ -389,7 +389,7 @@ void calc_global_load_tick(struct rq *this_rq)
389 if (time_before(jiffies, this_rq->calc_load_update)) 389 if (time_before(jiffies, this_rq->calc_load_update))
390 return; 390 return;
391 391
392 delta = calc_load_fold_active(this_rq); 392 delta = calc_load_fold_active(this_rq, 0);
393 if (delta) 393 if (delta)
394 atomic_long_add(delta, &calc_load_tasks); 394 atomic_long_add(delta, &calc_load_tasks);
395 395
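
calc_load_fold_active() now takes an `adjust` count that is subtracted from nr_running before the fold, so a caller can leave out tasks that are still technically on the runqueue but should not be sampled. The call site that passes a non-zero adjust is not part of this hunk; the snippet below is only a hedged sketch of how a CPU-offline path might use it to discount the task performing the teardown:

/* Sketch (assumption, not from this diff): fold the outgoing CPU's load
 * while ignoring the one task that is still running the hotplug work. */
static void fold_load_for_dead_cpu(struct rq *rq)
{
	long delta = calc_load_fold_active(rq, 1);

	if (delta)
		atomic_long_add(delta, &calc_load_tasks);
}
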
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 72f1f3087b04..c64fc5114004 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -28,7 +28,7 @@ extern unsigned long calc_load_update;
28extern atomic_long_t calc_load_tasks; 28extern atomic_long_t calc_load_tasks;
29 29
30extern void calc_global_load_tick(struct rq *this_rq); 30extern void calc_global_load_tick(struct rq *this_rq);
31extern long calc_load_fold_active(struct rq *this_rq); 31extern long calc_load_fold_active(struct rq *this_rq, long adjust);
32 32
33#ifdef CONFIG_SMP 33#ifdef CONFIG_SMP
34extern void cpu_load_update_active(struct rq *this_rq); 34extern void cpu_load_update_active(struct rq *this_rq);
@@ -321,6 +321,7 @@ extern int tg_nop(struct task_group *tg, void *data);
321 321
322extern void free_fair_sched_group(struct task_group *tg); 322extern void free_fair_sched_group(struct task_group *tg);
323extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); 323extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
324extern void online_fair_sched_group(struct task_group *tg);
324extern void unregister_fair_sched_group(struct task_group *tg); 325extern void unregister_fair_sched_group(struct task_group *tg);
325extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 326extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
326 struct sched_entity *se, int cpu, 327 struct sched_entity *se, int cpu,
@@ -1113,7 +1114,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1113 * In particular, the load of prev->state in finish_task_switch() must 1114 * In particular, the load of prev->state in finish_task_switch() must
1114 * happen before this. 1115 * happen before this.
1115 * 1116 *
1116 * Pairs with the smp_cond_acquire() in try_to_wake_up(). 1117 * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
1117 */ 1118 */
1118 smp_store_release(&prev->on_cpu, 0); 1119 smp_store_release(&prev->on_cpu, 0);
1119#endif 1120#endif
@@ -1246,8 +1247,11 @@ struct sched_class {
1246 1247
1247 void (*update_curr) (struct rq *rq); 1248 void (*update_curr) (struct rq *rq);
1248 1249
1250#define TASK_SET_GROUP 0
1251#define TASK_MOVE_GROUP 1
1252
1249#ifdef CONFIG_FAIR_GROUP_SCHED 1253#ifdef CONFIG_FAIR_GROUP_SCHED
1250 void (*task_move_group) (struct task_struct *p); 1254 void (*task_change_group) (struct task_struct *p, int type);
1251#endif 1255#endif
1252}; 1256};
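
The single task_move_group hook becomes a typed task_change_group callback, with TASK_SET_GROUP used when a task is first placed into its group and TASK_MOVE_GROUP for cgroup migration. A minimal sketch of how the scheduler core can drive it; the wrapper name and the fallback branch are assumptions, since the core.c side of the change is not shown in this hunk:

/* Assumed caller: use the class hook if present, else just re-home the task. */
static void sched_change_group(struct task_struct *tsk, int type)
{
	if (tsk->sched_class->task_change_group)
		tsk->sched_class->task_change_group(tsk, type);
	else
		set_task_rq(tsk, task_cpu(tsk));
}
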
1253 1257
@@ -1809,16 +1813,3 @@ static inline void cpufreq_trigger_update(u64 time) {}
1809#else /* arch_scale_freq_capacity */ 1813#else /* arch_scale_freq_capacity */
1810#define arch_scale_freq_invariant() (false) 1814#define arch_scale_freq_invariant() (false)
1811#endif 1815#endif
1812
1813static inline void account_reset_rq(struct rq *rq)
1814{
1815#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1816 rq->prev_irq_time = 0;
1817#endif
1818#ifdef CONFIG_PARAVIRT
1819 rq->prev_steal_time = 0;
1820#endif
1821#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
1822 rq->prev_steal_time_rq = 0;
1823#endif
1824}
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 70b3b6a20fb0..78955cbea31c 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -33,6 +33,8 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
33# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) 33# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0)
34# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) 34# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0)
35# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) 35# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
36# define schedstat_val(rq, field) ((schedstat_enabled()) ? (rq)->field : 0)
37
36#else /* !CONFIG_SCHEDSTATS */ 38#else /* !CONFIG_SCHEDSTATS */
37static inline void 39static inline void
38rq_sched_info_arrive(struct rq *rq, unsigned long long delta) 40rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
@@ -47,6 +49,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
47# define schedstat_inc(rq, field) do { } while (0) 49# define schedstat_inc(rq, field) do { } while (0)
48# define schedstat_add(rq, field, amt) do { } while (0) 50# define schedstat_add(rq, field, amt) do { } while (0)
49# define schedstat_set(var, val) do { } while (0) 51# define schedstat_set(var, val) do { } while (0)
52# define schedstat_val(rq, field) 0
50#endif 53#endif
51 54
52#ifdef CONFIG_SCHED_INFO 55#ifdef CONFIG_SCHED_INFO
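
schedstat_val() evaluates to the field when CONFIG_SCHEDSTATS is enabled and to 0 otherwise, so callers can read a statistic without wrapping it in #ifdefs. A tiny illustrative sketch; the field used here is only an example:

/* Illustrative only: read a per-rq schedstat, 0 when schedstats are off. */
static inline unsigned long rq_yield_count(struct rq *rq)
{
	return schedstat_val(rq, yld_count);
}
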
diff --git a/kernel/signal.c b/kernel/signal.c
index 96e9bc40667f..af21afc00d08 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2751,23 +2751,18 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
2751 * @ts: upper bound on process time suspension 2751 * @ts: upper bound on process time suspension
2752 */ 2752 */
2753int do_sigtimedwait(const sigset_t *which, siginfo_t *info, 2753int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
2754 const struct timespec *ts) 2754 const struct timespec *ts)
2755{ 2755{
2756 ktime_t *to = NULL, timeout = { .tv64 = KTIME_MAX };
2756 struct task_struct *tsk = current; 2757 struct task_struct *tsk = current;
2757 long timeout = MAX_SCHEDULE_TIMEOUT;
2758 sigset_t mask = *which; 2758 sigset_t mask = *which;
2759 int sig; 2759 int sig, ret = 0;
2760 2760
2761 if (ts) { 2761 if (ts) {
2762 if (!timespec_valid(ts)) 2762 if (!timespec_valid(ts))
2763 return -EINVAL; 2763 return -EINVAL;
2764 timeout = timespec_to_jiffies(ts); 2764 timeout = timespec_to_ktime(*ts);
2765 /* 2765 to = &timeout;
2766 * We can be close to the next tick, add another one
2767 * to ensure we will wait at least the time asked for.
2768 */
2769 if (ts->tv_sec || ts->tv_nsec)
2770 timeout++;
2771 } 2766 }
2772 2767
2773 /* 2768 /*
@@ -2778,7 +2773,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
2778 2773
2779 spin_lock_irq(&tsk->sighand->siglock); 2774 spin_lock_irq(&tsk->sighand->siglock);
2780 sig = dequeue_signal(tsk, &mask, info); 2775 sig = dequeue_signal(tsk, &mask, info);
2781 if (!sig && timeout) { 2776 if (!sig && timeout.tv64) {
2782 /* 2777 /*
2783 * None ready, temporarily unblock those we're interested 2778 * None ready, temporarily unblock those we're interested
2784 * while we are sleeping in so that we'll be awakened when 2779 * while we are sleeping in so that we'll be awakened when
@@ -2790,8 +2785,9 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
2790 recalc_sigpending(); 2785 recalc_sigpending();
2791 spin_unlock_irq(&tsk->sighand->siglock); 2786 spin_unlock_irq(&tsk->sighand->siglock);
2792 2787
2793 timeout = freezable_schedule_timeout_interruptible(timeout); 2788 __set_current_state(TASK_INTERRUPTIBLE);
2794 2789 ret = freezable_schedule_hrtimeout_range(to, tsk->timer_slack_ns,
2790 HRTIMER_MODE_REL);
2795 spin_lock_irq(&tsk->sighand->siglock); 2791 spin_lock_irq(&tsk->sighand->siglock);
2796 __set_task_blocked(tsk, &tsk->real_blocked); 2792 __set_task_blocked(tsk, &tsk->real_blocked);
2797 sigemptyset(&tsk->real_blocked); 2793 sigemptyset(&tsk->real_blocked);
@@ -2801,7 +2797,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
2801 2797
2802 if (sig) 2798 if (sig)
2803 return sig; 2799 return sig;
2804 return timeout ? -EINTR : -EAGAIN; 2800 return ret ? -EINTR : -EAGAIN;
2805} 2801}
2806 2802
2807/** 2803/**
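
The jiffies-based sleep is replaced by an hrtimer sleep that honours the task's timer_slack_ns, which is why the tail becomes `return ret ? -EINTR : -EAGAIN`: the schedule_hrtimeout_range() family returns 0 when the timeout expired and -EINTR when the task was woken earlier. A distilled sketch of that idiom, not a drop-in replacement for the function above:

/* Sketch of the hrtimer-based bounded sleep: 0 = timed out, -EINTR = woken. */
static long sleep_upto(const struct timespec *ts)
{
	ktime_t *to = NULL, timeout = { .tv64 = KTIME_MAX };

	if (ts) {
		if (!timespec_valid(ts))
			return -EINVAL;
		timeout = timespec_to_ktime(*ts);
		to = &timeout;
	}

	__set_current_state(TASK_INTERRUPTIBLE);
	return freezable_schedule_hrtimeout_range(to, current->timer_slack_ns,
						  HRTIMER_MODE_REL);
}
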
diff --git a/kernel/smp.c b/kernel/smp.c
index 74165443c240..3aa642d39c03 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -33,69 +33,54 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
33 33
34static void flush_smp_call_function_queue(bool warn_cpu_offline); 34static void flush_smp_call_function_queue(bool warn_cpu_offline);
35 35
36static int 36int smpcfd_prepare_cpu(unsigned int cpu)
37hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
38{ 37{
39 long cpu = (long)hcpu;
40 struct call_function_data *cfd = &per_cpu(cfd_data, cpu); 38 struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
41 39
42 switch (action) { 40 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
43 case CPU_UP_PREPARE: 41 cpu_to_node(cpu)))
44 case CPU_UP_PREPARE_FROZEN: 42 return -ENOMEM;
45 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, 43 cfd->csd = alloc_percpu(struct call_single_data);
46 cpu_to_node(cpu))) 44 if (!cfd->csd) {
47 return notifier_from_errno(-ENOMEM);
48 cfd->csd = alloc_percpu(struct call_single_data);
49 if (!cfd->csd) {
50 free_cpumask_var(cfd->cpumask);
51 return notifier_from_errno(-ENOMEM);
52 }
53 break;
54
55#ifdef CONFIG_HOTPLUG_CPU
56 case CPU_UP_CANCELED:
57 case CPU_UP_CANCELED_FROZEN:
58 /* Fall-through to the CPU_DEAD[_FROZEN] case. */
59
60 case CPU_DEAD:
61 case CPU_DEAD_FROZEN:
62 free_cpumask_var(cfd->cpumask); 45 free_cpumask_var(cfd->cpumask);
63 free_percpu(cfd->csd); 46 return -ENOMEM;
64 break; 47 }
65 48
66 case CPU_DYING: 49 return 0;
67 case CPU_DYING_FROZEN: 50}
68 /* 51
69 * The IPIs for the smp-call-function callbacks queued by other 52int smpcfd_dead_cpu(unsigned int cpu)
70 * CPUs might arrive late, either due to hardware latencies or 53{
71 * because this CPU disabled interrupts (inside stop-machine) 54 struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
72 * before the IPIs were sent. So flush out any pending callbacks
73 * explicitly (without waiting for the IPIs to arrive), to
74 * ensure that the outgoing CPU doesn't go offline with work
75 * still pending.
76 */
77 flush_smp_call_function_queue(false);
78 break;
79#endif
80 };
81 55
82 return NOTIFY_OK; 56 free_cpumask_var(cfd->cpumask);
57 free_percpu(cfd->csd);
58 return 0;
83} 59}
84 60
85static struct notifier_block hotplug_cfd_notifier = { 61int smpcfd_dying_cpu(unsigned int cpu)
86 .notifier_call = hotplug_cfd, 62{
87}; 63 /*
64 * The IPIs for the smp-call-function callbacks queued by other
65 * CPUs might arrive late, either due to hardware latencies or
66 * because this CPU disabled interrupts (inside stop-machine)
67 * before the IPIs were sent. So flush out any pending callbacks
68 * explicitly (without waiting for the IPIs to arrive), to
69 * ensure that the outgoing CPU doesn't go offline with work
70 * still pending.
71 */
72 flush_smp_call_function_queue(false);
73 return 0;
74}
88 75
89void __init call_function_init(void) 76void __init call_function_init(void)
90{ 77{
91 void *cpu = (void *)(long)smp_processor_id();
92 int i; 78 int i;
93 79
94 for_each_possible_cpu(i) 80 for_each_possible_cpu(i)
95 init_llist_head(&per_cpu(call_single_queue, i)); 81 init_llist_head(&per_cpu(call_single_queue, i));
96 82
97 hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); 83 smpcfd_prepare_cpu(smp_processor_id());
98 register_cpu_notifier(&hotplug_cfd_notifier);
99} 84}
100 85
101/* 86/*
@@ -107,7 +92,7 @@ void __init call_function_init(void)
107 */ 92 */
108static __always_inline void csd_lock_wait(struct call_single_data *csd) 93static __always_inline void csd_lock_wait(struct call_single_data *csd)
109{ 94{
110 smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK)); 95 smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));
111} 96}
112 97
113static __always_inline void csd_lock(struct call_single_data *csd) 98static __always_inline void csd_lock(struct call_single_data *csd)
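
The hotplug notifier is split into three plain callbacks that the CPU hotplug state machine can invoke directly. How they are wired up is not visible in this hunk; the sketch below is an assumption (the real registration lives in the hotplug core's state tables and the state names may differ):

/* Assumed wiring, for illustration only. */
static int __init smpcfd_hotplug_init(void)
{
	/* smpcfd_dying_cpu() is invoked from the dying step on the outgoing
	 * CPU; only the prepare/dead pair is shown being registered here. */
	return cpuhp_setup_state_nocalls(CPUHP_SMPCFD_PREPARE, "smpcfd:prepare",
					 smpcfd_prepare_cpu, smpcfd_dead_cpu);
}
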
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 87b2fc38398b..53954631a4e1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1205,6 +1205,17 @@ static struct ctl_table kern_table[] = {
1205 .extra2 = &one, 1205 .extra2 = &one,
1206 }, 1206 },
1207#endif 1207#endif
1208#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
1209 {
1210 .procname = "panic_on_rcu_stall",
1211 .data = &sysctl_panic_on_rcu_stall,
1212 .maxlen = sizeof(sysctl_panic_on_rcu_stall),
1213 .mode = 0644,
1214 .proc_handler = proc_dointvec_minmax,
1215 .extra1 = &zero,
1216 .extra2 = &one,
1217 },
1218#endif
1208 { } 1219 { }
1209}; 1220};
1210 1221
@@ -1497,8 +1508,8 @@ static struct ctl_table vm_table[] = {
1497#ifdef CONFIG_NUMA 1508#ifdef CONFIG_NUMA
1498 { 1509 {
1499 .procname = "zone_reclaim_mode", 1510 .procname = "zone_reclaim_mode",
1500 .data = &zone_reclaim_mode, 1511 .data = &node_reclaim_mode,
1501 .maxlen = sizeof(zone_reclaim_mode), 1512 .maxlen = sizeof(node_reclaim_mode),
1502 .mode = 0644, 1513 .mode = 0644,
1503 .proc_handler = proc_dointvec, 1514 .proc_handler = proc_dointvec,
1504 .extra1 = &zero, 1515 .extra1 = &zero,
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 53fa971d000d..6ab4842b00e8 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -108,7 +108,6 @@ void task_work_run(void)
108 * fail, but it can play with *work and other entries. 108 * fail, but it can play with *work and other entries.
109 */ 109 */
110 raw_spin_unlock_wait(&task->pi_lock); 110 raw_spin_unlock_wait(&task->pi_lock);
111 smp_mb();
112 111
113 do { 112 do {
114 next = work->next; 113 next = work->next;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index e840ed867a5d..c3aad685bbc0 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -30,7 +30,6 @@
30 * struct alarm_base - Alarm timer bases 30 * struct alarm_base - Alarm timer bases
 31 * @lock: Lock for synchronized access to the base 31 * @lock: Lock for synchronized access to the base
32 * @timerqueue: Timerqueue head managing the list of events 32 * @timerqueue: Timerqueue head managing the list of events
33 * @timer: hrtimer used to schedule events while running
34 * @gettime: Function to read the time correlating to the base 33 * @gettime: Function to read the time correlating to the base
35 * @base_clockid: clockid for the base 34 * @base_clockid: clockid for the base
36 */ 35 */
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index a9b76a40319e..2c5bc77c0bb0 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -645,7 +645,7 @@ void tick_cleanup_dead_cpu(int cpu)
645#endif 645#endif
646 646
647#ifdef CONFIG_SYSFS 647#ifdef CONFIG_SYSFS
648struct bus_type clockevents_subsys = { 648static struct bus_type clockevents_subsys = {
649 .name = "clockevents", 649 .name = "clockevents",
650 .dev_name = "clockevent", 650 .dev_name = "clockevent",
651}; 651};
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 56ece145a814..6a5a310a1a53 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -669,10 +669,12 @@ static void clocksource_enqueue(struct clocksource *cs)
669 struct list_head *entry = &clocksource_list; 669 struct list_head *entry = &clocksource_list;
670 struct clocksource *tmp; 670 struct clocksource *tmp;
671 671
672 list_for_each_entry(tmp, &clocksource_list, list) 672 list_for_each_entry(tmp, &clocksource_list, list) {
673 /* Keep track of the place, where to insert */ 673 /* Keep track of the place, where to insert */
674 if (tmp->rating >= cs->rating) 674 if (tmp->rating < cs->rating)
675 entry = &tmp->list; 675 break;
676 entry = &tmp->list;
677 }
676 list_add(&cs->list, entry); 678 list_add(&cs->list, entry);
677} 679}
678 680
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index e99df0ff1d42..9ba7c820fc23 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -177,7 +177,7 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
177#endif 177#endif
178} 178}
179 179
180#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) 180#ifdef CONFIG_NO_HZ_COMMON
181static inline 181static inline
182struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, 182struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
183 int pinned) 183 int pinned)
@@ -1590,7 +1590,7 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
1590/* 1590/*
1591 * Functions related to boot-time initialization: 1591 * Functions related to boot-time initialization:
1592 */ 1592 */
1593static void init_hrtimers_cpu(int cpu) 1593int hrtimers_prepare_cpu(unsigned int cpu)
1594{ 1594{
1595 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 1595 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
1596 int i; 1596 int i;
@@ -1602,6 +1602,7 @@ static void init_hrtimers_cpu(int cpu)
1602 1602
1603 cpu_base->cpu = cpu; 1603 cpu_base->cpu = cpu;
1604 hrtimer_init_hres(cpu_base); 1604 hrtimer_init_hres(cpu_base);
1605 return 0;
1605} 1606}
1606 1607
1607#ifdef CONFIG_HOTPLUG_CPU 1608#ifdef CONFIG_HOTPLUG_CPU
@@ -1636,7 +1637,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1636 } 1637 }
1637} 1638}
1638 1639
1639static void migrate_hrtimers(int scpu) 1640int hrtimers_dead_cpu(unsigned int scpu)
1640{ 1641{
1641 struct hrtimer_cpu_base *old_base, *new_base; 1642 struct hrtimer_cpu_base *old_base, *new_base;
1642 int i; 1643 int i;
@@ -1665,45 +1666,14 @@ static void migrate_hrtimers(int scpu)
1665 /* Check, if we got expired work to do */ 1666 /* Check, if we got expired work to do */
1666 __hrtimer_peek_ahead_timers(); 1667 __hrtimer_peek_ahead_timers();
1667 local_irq_enable(); 1668 local_irq_enable();
1669 return 0;
1668} 1670}
1669 1671
1670#endif /* CONFIG_HOTPLUG_CPU */ 1672#endif /* CONFIG_HOTPLUG_CPU */
1671 1673
1672static int hrtimer_cpu_notify(struct notifier_block *self,
1673 unsigned long action, void *hcpu)
1674{
1675 int scpu = (long)hcpu;
1676
1677 switch (action) {
1678
1679 case CPU_UP_PREPARE:
1680 case CPU_UP_PREPARE_FROZEN:
1681 init_hrtimers_cpu(scpu);
1682 break;
1683
1684#ifdef CONFIG_HOTPLUG_CPU
1685 case CPU_DEAD:
1686 case CPU_DEAD_FROZEN:
1687 migrate_hrtimers(scpu);
1688 break;
1689#endif
1690
1691 default:
1692 break;
1693 }
1694
1695 return NOTIFY_OK;
1696}
1697
1698static struct notifier_block hrtimers_nb = {
1699 .notifier_call = hrtimer_cpu_notify,
1700};
1701
1702void __init hrtimers_init(void) 1674void __init hrtimers_init(void)
1703{ 1675{
1704 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, 1676 hrtimers_prepare_cpu(smp_processor_id());
1705 (void *)(long)smp_processor_id());
1706 register_cpu_notifier(&hrtimers_nb);
1707} 1677}
1708 1678
1709/** 1679/**
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 1cafba860b08..39008d78927a 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -777,6 +777,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
777 timer->it.cpu.expires = 0; 777 timer->it.cpu.expires = 0;
778 sample_to_timespec(timer->it_clock, timer->it.cpu.expires, 778 sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
779 &itp->it_value); 779 &itp->it_value);
780 return;
780 } else { 781 } else {
781 cpu_timer_sample_group(timer->it_clock, p, &now); 782 cpu_timer_sample_group(timer->it_clock, p, &now);
782 unlock_task_sighand(p, &flags); 783 unlock_task_sighand(p, &flags);
diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c
index e622ba365a13..b0928ab3270f 100644
--- a/kernel/time/test_udelay.c
+++ b/kernel/time/test_udelay.c
@@ -43,13 +43,13 @@ static int udelay_test_single(struct seq_file *s, int usecs, uint32_t iters)
43 int allowed_error_ns = usecs * 5; 43 int allowed_error_ns = usecs * 5;
44 44
45 for (i = 0; i < iters; ++i) { 45 for (i = 0; i < iters; ++i) {
46 struct timespec ts1, ts2; 46 s64 kt1, kt2;
47 int time_passed; 47 int time_passed;
48 48
49 ktime_get_ts(&ts1); 49 kt1 = ktime_get_ns();
50 udelay(usecs); 50 udelay(usecs);
51 ktime_get_ts(&ts2); 51 kt2 = ktime_get_ns();
52 time_passed = timespec_to_ns(&ts2) - timespec_to_ns(&ts1); 52 time_passed = kt2 - kt1;
53 53
54 if (i == 0 || time_passed < min) 54 if (i == 0 || time_passed < min)
55 min = time_passed; 55 min = time_passed;
@@ -87,11 +87,11 @@ static int udelay_test_show(struct seq_file *s, void *v)
87 if (usecs > 0 && iters > 0) { 87 if (usecs > 0 && iters > 0) {
88 return udelay_test_single(s, usecs, iters); 88 return udelay_test_single(s, usecs, iters);
89 } else if (usecs == 0) { 89 } else if (usecs == 0) {
90 struct timespec ts; 90 struct timespec64 ts;
91 91
92 ktime_get_ts(&ts); 92 ktime_get_ts64(&ts);
93 seq_printf(s, "udelay() test (lpj=%ld kt=%ld.%09ld)\n", 93 seq_printf(s, "udelay() test (lpj=%ld kt=%lld.%09ld)\n",
94 loops_per_jiffy, ts.tv_sec, ts.tv_nsec); 94 loops_per_jiffy, (s64)ts.tv_sec, ts.tv_nsec);
95 seq_puts(s, "usage:\n"); 95 seq_puts(s, "usage:\n");
96 seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n"); 96 seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n");
97 seq_puts(s, "cat " DEBUGFS_FILENAME "\n"); 97 seq_puts(s, "cat " DEBUGFS_FILENAME "\n");
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index 53d7184da0be..690b797f522e 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -75,6 +75,7 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
75} 75}
76 76
77static struct clock_event_device ce_broadcast_hrtimer = { 77static struct clock_event_device ce_broadcast_hrtimer = {
78 .name = "bc_hrtimer",
78 .set_state_shutdown = bc_shutdown, 79 .set_state_shutdown = bc_shutdown,
79 .set_next_ktime = bc_set_next, 80 .set_next_ktime = bc_set_next,
80 .features = CLOCK_EVT_FEAT_ONESHOT | 81 .features = CLOCK_EVT_FEAT_ONESHOT |
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 966a5a6fdd0a..f738251000fe 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -164,3 +164,4 @@ static inline void timers_update_migration(bool update_nohz) { }
164DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); 164DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
165 165
166extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); 166extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
167void timer_clear_idle(void);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 536ada80f6dd..204fdc86863d 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -31,7 +31,7 @@
31#include <trace/events/timer.h> 31#include <trace/events/timer.h>
32 32
33/* 33/*
34 * Per cpu nohz control structure 34 * Per-CPU nohz control structure
35 */ 35 */
36static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); 36static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
37 37
@@ -61,7 +61,7 @@ static void tick_do_update_jiffies64(ktime_t now)
61 if (delta.tv64 < tick_period.tv64) 61 if (delta.tv64 < tick_period.tv64)
62 return; 62 return;
63 63
64 /* Reevalute with jiffies_lock held */ 64 /* Reevaluate with jiffies_lock held */
65 write_seqlock(&jiffies_lock); 65 write_seqlock(&jiffies_lock);
66 66
67 delta = ktime_sub(now, last_jiffies_update); 67 delta = ktime_sub(now, last_jiffies_update);
@@ -116,8 +116,8 @@ static void tick_sched_do_timer(ktime_t now)
116#ifdef CONFIG_NO_HZ_COMMON 116#ifdef CONFIG_NO_HZ_COMMON
117 /* 117 /*
118 * Check if the do_timer duty was dropped. We don't care about 118 * Check if the do_timer duty was dropped. We don't care about
119 * concurrency: This happens only when the cpu in charge went 119 * concurrency: This happens only when the CPU in charge went
120 * into a long sleep. If two cpus happen to assign themself to 120 * into a long sleep. If two CPUs happen to assign themselves to
121 * this duty, then the jiffies update is still serialized by 121 * this duty, then the jiffies update is still serialized by
122 * jiffies_lock. 122 * jiffies_lock.
123 */ 123 */
@@ -349,7 +349,7 @@ void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bi
349/* 349/*
350 * Re-evaluate the need for the tick as we switch the current task. 350 * Re-evaluate the need for the tick as we switch the current task.
351 * It might need the tick due to per task/process properties: 351 * It might need the tick due to per task/process properties:
352 * perf events, posix cpu timers, ... 352 * perf events, posix CPU timers, ...
353 */ 353 */
354void __tick_nohz_task_switch(void) 354void __tick_nohz_task_switch(void)
355{ 355{
@@ -509,8 +509,8 @@ int tick_nohz_tick_stopped(void)
509 * 509 *
510 * In case the sched_tick was stopped on this CPU, we have to check if jiffies 510 * In case the sched_tick was stopped on this CPU, we have to check if jiffies
511 * must be updated. Otherwise an interrupt handler could use a stale jiffy 511 * must be updated. Otherwise an interrupt handler could use a stale jiffy
512 * value. We do this unconditionally on any cpu, as we don't know whether the 512 * value. We do this unconditionally on any CPU, as we don't know whether the
513 * cpu, which has the update task assigned is in a long sleep. 513 * CPU, which has the update task assigned is in a long sleep.
514 */ 514 */
515static void tick_nohz_update_jiffies(ktime_t now) 515static void tick_nohz_update_jiffies(ktime_t now)
516{ 516{
@@ -526,7 +526,7 @@ static void tick_nohz_update_jiffies(ktime_t now)
526} 526}
527 527
528/* 528/*
529 * Updates the per cpu time idle statistics counters 529 * Updates the per-CPU time idle statistics counters
530 */ 530 */
531static void 531static void
532update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) 532update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
@@ -566,12 +566,12 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
566} 566}
567 567
568/** 568/**
569 * get_cpu_idle_time_us - get the total idle time of a cpu 569 * get_cpu_idle_time_us - get the total idle time of a CPU
570 * @cpu: CPU number to query 570 * @cpu: CPU number to query
571 * @last_update_time: variable to store update time in. Do not update 571 * @last_update_time: variable to store update time in. Do not update
572 * counters if NULL. 572 * counters if NULL.
573 * 573 *
574 * Return the cummulative idle time (since boot) for a given 574 * Return the cumulative idle time (since boot) for a given
575 * CPU, in microseconds. 575 * CPU, in microseconds.
576 * 576 *
577 * This time is measured via accounting rather than sampling, 577 * This time is measured via accounting rather than sampling,
@@ -607,12 +607,12 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
607EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); 607EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
608 608
609/** 609/**
610 * get_cpu_iowait_time_us - get the total iowait time of a cpu 610 * get_cpu_iowait_time_us - get the total iowait time of a CPU
611 * @cpu: CPU number to query 611 * @cpu: CPU number to query
612 * @last_update_time: variable to store update time in. Do not update 612 * @last_update_time: variable to store update time in. Do not update
613 * counters if NULL. 613 * counters if NULL.
614 * 614 *
615 * Return the cummulative iowait time (since boot) for a given 615 * Return the cumulative iowait time (since boot) for a given
616 * CPU, in microseconds. 616 * CPU, in microseconds.
617 * 617 *
618 * This time is measured via accounting rather than sampling, 618 * This time is measured via accounting rather than sampling,
@@ -700,6 +700,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
700 delta = next_tick - basemono; 700 delta = next_tick - basemono;
701 if (delta <= (u64)TICK_NSEC) { 701 if (delta <= (u64)TICK_NSEC) {
702 tick.tv64 = 0; 702 tick.tv64 = 0;
703
704 /*
705 * Tell the timer code that the base is not idle, i.e. undo
706 * the effect of get_next_timer_interrupt():
707 */
708 timer_clear_idle();
703 /* 709 /*
704 * We've not stopped the tick yet, and there's a timer in the 710 * We've not stopped the tick yet, and there's a timer in the
705 * next period, so no point in stopping it either, bail. 711 * next period, so no point in stopping it either, bail.
@@ -726,14 +732,14 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
726 } 732 }
727 733
728 /* 734 /*
729 * If this cpu is the one which updates jiffies, then give up 735 * If this CPU is the one which updates jiffies, then give up
730 * the assignment and let it be taken by the cpu which runs 736 * the assignment and let it be taken by the CPU which runs
731 * the tick timer next, which might be this cpu as well. If we 737 * the tick timer next, which might be this CPU as well. If we
732 * don't drop this here the jiffies might be stale and 738 * don't drop this here the jiffies might be stale and
733 * do_timer() never invoked. Keep track of the fact that it 739 * do_timer() never invoked. Keep track of the fact that it
734 * was the one which had the do_timer() duty last. If this cpu 740 * was the one which had the do_timer() duty last. If this CPU
735 * is the one which had the do_timer() duty last, we limit the 741 * is the one which had the do_timer() duty last, we limit the
736 * sleep time to the timekeeping max_deferement value. 742 * sleep time to the timekeeping max_deferment value.
737 * Otherwise we can sleep as long as we want. 743 * Otherwise we can sleep as long as we want.
738 */ 744 */
739 delta = timekeeping_max_deferment(); 745 delta = timekeeping_max_deferment();
@@ -809,6 +815,12 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
809 tick_do_update_jiffies64(now); 815 tick_do_update_jiffies64(now);
810 cpu_load_update_nohz_stop(); 816 cpu_load_update_nohz_stop();
811 817
818 /*
819 * Clear the timer idle flag, so we avoid IPIs on remote queueing and
820 * the clock forward checks in the enqueue path:
821 */
822 timer_clear_idle();
823
812 calc_load_exit_idle(); 824 calc_load_exit_idle();
813 touch_softlockup_watchdog_sched(); 825 touch_softlockup_watchdog_sched();
814 /* 826 /*
@@ -841,9 +853,9 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
841static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) 853static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
842{ 854{
843 /* 855 /*
844 * If this cpu is offline and it is the one which updates 856 * If this CPU is offline and it is the one which updates
845 * jiffies, then give up the assignment and let it be taken by 857 * jiffies, then give up the assignment and let it be taken by
846 * the cpu which runs the tick timer next. If we don't drop 858 * the CPU which runs the tick timer next. If we don't drop
847 * this here the jiffies might be stale and do_timer() never 859 * this here the jiffies might be stale and do_timer() never
848 * invoked. 860 * invoked.
849 */ 861 */
@@ -896,11 +908,10 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts)
896 ktime_t now, expires; 908 ktime_t now, expires;
897 int cpu = smp_processor_id(); 909 int cpu = smp_processor_id();
898 910
899 now = tick_nohz_start_idle(ts);
900
901 if (can_stop_idle_tick(cpu, ts)) { 911 if (can_stop_idle_tick(cpu, ts)) {
902 int was_stopped = ts->tick_stopped; 912 int was_stopped = ts->tick_stopped;
903 913
914 now = tick_nohz_start_idle(ts);
904 ts->idle_calls++; 915 ts->idle_calls++;
905 916
906 expires = tick_nohz_stop_sched_tick(ts, now, cpu); 917 expires = tick_nohz_stop_sched_tick(ts, now, cpu);
@@ -933,11 +944,11 @@ void tick_nohz_idle_enter(void)
933 WARN_ON_ONCE(irqs_disabled()); 944 WARN_ON_ONCE(irqs_disabled());
934 945
935 /* 946 /*
936 * Update the idle state in the scheduler domain hierarchy 947 * Update the idle state in the scheduler domain hierarchy
937 * when tick_nohz_stop_sched_tick() is called from the idle loop. 948 * when tick_nohz_stop_sched_tick() is called from the idle loop.
938 * State will be updated to busy during the first busy tick after 949 * State will be updated to busy during the first busy tick after
939 * exiting idle. 950 * exiting idle.
940 */ 951 */
941 set_cpu_sd_state_idle(); 952 set_cpu_sd_state_idle();
942 953
943 local_irq_disable(); 954 local_irq_disable();
@@ -1092,35 +1103,6 @@ static void tick_nohz_switch_to_nohz(void)
1092 tick_nohz_activate(ts, NOHZ_MODE_LOWRES); 1103 tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
1093} 1104}
1094 1105
1095/*
1096 * When NOHZ is enabled and the tick is stopped, we need to kick the
1097 * tick timer from irq_enter() so that the jiffies update is kept
1098 * alive during long running softirqs. That's ugly as hell, but
1099 * correctness is key even if we need to fix the offending softirq in
1100 * the first place.
1101 *
1102 * Note, this is different to tick_nohz_restart. We just kick the
1103 * timer and do not touch the other magic bits which need to be done
1104 * when idle is left.
1105 */
1106static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now)
1107{
1108#if 0
1109 /* Switch back to 2.6.27 behaviour */
1110 ktime_t delta;
1111
1112 /*
1113 * Do not touch the tick device, when the next expiry is either
1114 * already reached or less/equal than the tick period.
1115 */
1116 delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
1117 if (delta.tv64 <= tick_period.tv64)
1118 return;
1119
1120 tick_nohz_restart(ts, now);
1121#endif
1122}
1123
1124static inline void tick_nohz_irq_enter(void) 1106static inline void tick_nohz_irq_enter(void)
1125{ 1107{
1126 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 1108 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
@@ -1131,10 +1113,8 @@ static inline void tick_nohz_irq_enter(void)
1131 now = ktime_get(); 1113 now = ktime_get();
1132 if (ts->idle_active) 1114 if (ts->idle_active)
1133 tick_nohz_stop_idle(ts, now); 1115 tick_nohz_stop_idle(ts, now);
1134 if (ts->tick_stopped) { 1116 if (ts->tick_stopped)
1135 tick_nohz_update_jiffies(now); 1117 tick_nohz_update_jiffies(now);
1136 tick_nohz_kick_tick(ts, now);
1137 }
1138} 1118}
1139 1119
1140#else 1120#else
@@ -1211,7 +1191,7 @@ void tick_setup_sched_timer(void)
1211 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 1191 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1212 ts->sched_timer.function = tick_sched_timer; 1192 ts->sched_timer.function = tick_sched_timer;
1213 1193
1214 /* Get the next period (per cpu) */ 1194 /* Get the next period (per-CPU) */
1215 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); 1195 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
1216 1196
1217 /* Offset the tick to avert jiffies_lock contention. */ 1197 /* Offset the tick to avert jiffies_lock contention. */
diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c
index 86628e755f38..7142580ad94f 100644
--- a/kernel/time/timeconv.c
+++ b/kernel/time/timeconv.c
@@ -67,20 +67,21 @@ static const unsigned short __mon_yday[2][13] = {
67#define SECS_PER_DAY (SECS_PER_HOUR * 24) 67#define SECS_PER_DAY (SECS_PER_HOUR * 24)
68 68
69/** 69/**
70 * time_to_tm - converts the calendar time to local broken-down time 70 * time64_to_tm - converts the calendar time to local broken-down time
71 * 71 *
72 * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, 72 * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970,
73 * Coordinated Universal Time (UTC). 73 * Coordinated Universal Time (UTC).
74 * @offset offset seconds adding to totalsecs. 74 * @offset offset seconds adding to totalsecs.
75 * @result pointer to struct tm variable to receive broken-down time 75 * @result pointer to struct tm variable to receive broken-down time
76 */ 76 */
77void time_to_tm(time_t totalsecs, int offset, struct tm *result) 77void time64_to_tm(time64_t totalsecs, int offset, struct tm *result)
78{ 78{
79 long days, rem, y; 79 long days, rem, y;
80 int remainder;
80 const unsigned short *ip; 81 const unsigned short *ip;
81 82
82 days = totalsecs / SECS_PER_DAY; 83 days = div_s64_rem(totalsecs, SECS_PER_DAY, &remainder);
83 rem = totalsecs % SECS_PER_DAY; 84 rem = remainder;
84 rem += offset; 85 rem += offset;
85 while (rem < 0) { 86 while (rem < 0) {
86 rem += SECS_PER_DAY; 87 rem += SECS_PER_DAY;
@@ -124,4 +125,4 @@ void time_to_tm(time_t totalsecs, int offset, struct tm *result)
124 result->tm_mon = y; 125 result->tm_mon = y;
125 result->tm_mday = days + 1; 126 result->tm_mday = days + 1;
126} 127}
127EXPORT_SYMBOL(time_to_tm); 128EXPORT_SYMBOL(time64_to_tm);
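
The open-coded 64-bit '/' and '%' are replaced by div_s64_rem(), which also works on 32-bit architectures and, like C division, truncates toward zero; the existing while loop then normalizes a negative remainder. A small self-contained userspace check of that split, folding the two steps together (plain C, no kernel headers):

#include <stdio.h>

#define SECS_PER_DAY (24 * 60 * 60)

/* Same day/second split as above: quotient truncated toward zero, then the
 * remainder is normalized into [0, SECS_PER_DAY). */
static long long split_days(long long totalsecs, int *remainder)
{
	long long days = totalsecs / SECS_PER_DAY;
	int rem = (int)(totalsecs % SECS_PER_DAY);

	if (rem < 0) {
		rem += SECS_PER_DAY;
		days--;
	}
	*remainder = rem;
	return days;
}

int main(void)
{
	int rem;
	long long days = split_days(-1, &rem);

	printf("days=%lld rem=%d\n", days, rem);	/* days=-1 rem=86399 */
	return 0;
}
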
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 479d25cd3d4f..3b65746c7f15 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -480,10 +480,12 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
480 * users are removed, this can be killed. 480 * users are removed, this can be killed.
481 */ 481 */
482 remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1); 482 remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1);
483 tk->tkr_mono.xtime_nsec -= remainder; 483 if (remainder != 0) {
484 tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift; 484 tk->tkr_mono.xtime_nsec -= remainder;
485 tk->ntp_error += remainder << tk->ntp_error_shift; 485 tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift;
486 tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift; 486 tk->ntp_error += remainder << tk->ntp_error_shift;
487 tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift;
488 }
487} 489}
488#else 490#else
489#define old_vsyscall_fixup(tk) 491#define old_vsyscall_fixup(tk)
@@ -2186,6 +2188,7 @@ struct timespec64 get_monotonic_coarse64(void)
2186 2188
2187 return now; 2189 return now;
2188} 2190}
2191EXPORT_SYMBOL(get_monotonic_coarse64);
2189 2192
2190/* 2193/*
2191 * Must hold jiffies_lock 2194 * Must hold jiffies_lock
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 3a95f9728778..555670a5143c 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -59,43 +59,153 @@ __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
59EXPORT_SYMBOL(jiffies_64); 59EXPORT_SYMBOL(jiffies_64);
60 60
61/* 61/*
62 * per-CPU timer vector definitions: 62 * The timer wheel has LVL_DEPTH array levels. Each level provides an array of
 63 * LVL_SIZE buckets. Each level is driven by its own clock and therefore each
64 * level has a different granularity.
65 *
66 * The level granularity is: LVL_CLK_DIV ^ lvl
67 * The level clock frequency is: HZ / (LVL_CLK_DIV ^ level)
68 *
69 * The array level of a newly armed timer depends on the relative expiry
70 * time. The farther the expiry time is away the higher the array level and
71 * therefor the granularity becomes.
72 *
73 * Contrary to the original timer wheel implementation, which aims for 'exact'
74 * expiry of the timers, this implementation removes the need for recascading
75 * the timers into the lower array levels. The previous 'classic' timer wheel
76 * implementation of the kernel already violated the 'exact' expiry by adding
77 * slack to the expiry time to provide batched expiration. The granularity
78 * levels provide implicit batching.
79 *
80 * This is an optimization of the original timer wheel implementation for the
81 * majority of the timer wheel use cases: timeouts. The vast majority of
82 * timeout timers (networking, disk I/O ...) are canceled before expiry. If
83 * the timeout expires it indicates that normal operation is disturbed, so it
84 * does not matter much whether the timeout comes with a slight delay.
85 *
 86 * The only exceptions to this are networking timers with a small expiry
87 * time. They rely on the granularity. Those fit into the first wheel level,
88 * which has HZ granularity.
89 *
 90 * We don't have cascading anymore. Timers with an expiry time above the
91 * capacity of the last wheel level are force expired at the maximum timeout
92 * value of the last wheel level. From data sampling we know that the maximum
93 * value observed is 5 days (network connection tracking), so this should not
94 * be an issue.
95 *
 96 * The currently chosen array constants are a good compromise between
97 * array size and granularity.
98 *
99 * This results in the following granularity and range levels:
100 *
101 * HZ 1000 steps
102 * Level Offset Granularity Range
103 * 0 0 1 ms 0 ms - 63 ms
104 * 1 64 8 ms 64 ms - 511 ms
105 * 2 128 64 ms 512 ms - 4095 ms (512ms - ~4s)
106 * 3 192 512 ms 4096 ms - 32767 ms (~4s - ~32s)
107 * 4 256 4096 ms (~4s) 32768 ms - 262143 ms (~32s - ~4m)
108 * 5 320 32768 ms (~32s) 262144 ms - 2097151 ms (~4m - ~34m)
109 * 6 384 262144 ms (~4m) 2097152 ms - 16777215 ms (~34m - ~4h)
110 * 7 448 2097152 ms (~34m) 16777216 ms - 134217727 ms (~4h - ~1d)
111 * 8 512 16777216 ms (~4h) 134217728 ms - 1073741822 ms (~1d - ~12d)
112 *
113 * HZ 300
114 * Level Offset Granularity Range
115 * 0 0 3 ms 0 ms - 210 ms
116 * 1 64 26 ms 213 ms - 1703 ms (213ms - ~1s)
117 * 2 128 213 ms 1706 ms - 13650 ms (~1s - ~13s)
118 * 3 192 1706 ms (~1s) 13653 ms - 109223 ms (~13s - ~1m)
119 * 4 256 13653 ms (~13s) 109226 ms - 873810 ms (~1m - ~14m)
120 * 5 320 109226 ms (~1m) 873813 ms - 6990503 ms (~14m - ~1h)
121 * 6 384 873813 ms (~14m) 6990506 ms - 55924050 ms (~1h - ~15h)
122 * 7 448 6990506 ms (~1h) 55924053 ms - 447392423 ms (~15h - ~5d)
123 * 8 512 55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d)
124 *
125 * HZ 250
126 * Level Offset Granularity Range
127 * 0 0 4 ms 0 ms - 255 ms
128 * 1 64 32 ms 256 ms - 2047 ms (256ms - ~2s)
129 * 2 128 256 ms 2048 ms - 16383 ms (~2s - ~16s)
130 * 3 192 2048 ms (~2s) 16384 ms - 131071 ms (~16s - ~2m)
131 * 4 256 16384 ms (~16s) 131072 ms - 1048575 ms (~2m - ~17m)
132 * 5 320 131072 ms (~2m) 1048576 ms - 8388607 ms (~17m - ~2h)
133 * 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h)
134 * 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d)
135 * 8 512 67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d)
136 *
137 * HZ 100
138 * Level Offset Granularity Range
139 * 0 0 10 ms 0 ms - 630 ms
140 * 1 64 80 ms 640 ms - 5110 ms (640ms - ~5s)
141 * 2 128 640 ms 5120 ms - 40950 ms (~5s - ~40s)
142 * 3 192 5120 ms (~5s) 40960 ms - 327670 ms (~40s - ~5m)
143 * 4 256 40960 ms (~40s) 327680 ms - 2621430 ms (~5m - ~43m)
144 * 5 320 327680 ms (~5m) 2621440 ms - 20971510 ms (~43m - ~5h)
145 * 6 384 2621440 ms (~43m) 20971520 ms - 167772150 ms (~5h - ~1d)
146 * 7 448 20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d)
63 */ 147 */
64#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
65#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
66#define TVN_SIZE (1 << TVN_BITS)
67#define TVR_SIZE (1 << TVR_BITS)
68#define TVN_MASK (TVN_SIZE - 1)
69#define TVR_MASK (TVR_SIZE - 1)
70#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))
71
72struct tvec {
73 struct hlist_head vec[TVN_SIZE];
74};
75 148
76struct tvec_root { 149/* Clock divisor for the next level */
77 struct hlist_head vec[TVR_SIZE]; 150#define LVL_CLK_SHIFT 3
78}; 151#define LVL_CLK_DIV (1UL << LVL_CLK_SHIFT)
152#define LVL_CLK_MASK (LVL_CLK_DIV - 1)
153#define LVL_SHIFT(n) ((n) * LVL_CLK_SHIFT)
154#define LVL_GRAN(n) (1UL << LVL_SHIFT(n))
79 155
80struct tvec_base { 156/*
81 spinlock_t lock; 157 * The time start value for each level to select the bucket at enqueue
82 struct timer_list *running_timer; 158 * time.
83 unsigned long timer_jiffies; 159 */
84 unsigned long next_timer; 160#define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
85 unsigned long active_timers; 161
86 unsigned long all_timers; 162/* Size of each clock level */
87 int cpu; 163#define LVL_BITS 6
88 bool migration_enabled; 164#define LVL_SIZE (1UL << LVL_BITS)
89 bool nohz_active; 165#define LVL_MASK (LVL_SIZE - 1)
90 struct tvec_root tv1; 166#define LVL_OFFS(n) ((n) * LVL_SIZE)
91 struct tvec tv2; 167
92 struct tvec tv3; 168/* Level depth */
93 struct tvec tv4; 169#if HZ > 100
94 struct tvec tv5; 170# define LVL_DEPTH 9
95} ____cacheline_aligned; 171# else
172# define LVL_DEPTH 8
173#endif
174
175/* The cutoff (max. capacity of the wheel) */
176#define WHEEL_TIMEOUT_CUTOFF (LVL_START(LVL_DEPTH))
177#define WHEEL_TIMEOUT_MAX (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1))
178
179/*
180 * The resulting wheel size. If NOHZ is configured we allocate two
181 * wheels so we have a separate storage for the deferrable timers.
182 */
183#define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH)
184
185#ifdef CONFIG_NO_HZ_COMMON
186# define NR_BASES 2
187# define BASE_STD 0
188# define BASE_DEF 1
189#else
190# define NR_BASES 1
191# define BASE_STD 0
192# define BASE_DEF 0
193#endif
96 194
195struct timer_base {
196 spinlock_t lock;
197 struct timer_list *running_timer;
198 unsigned long clk;
199 unsigned long next_expiry;
200 unsigned int cpu;
201 bool migration_enabled;
202 bool nohz_active;
203 bool is_idle;
204 DECLARE_BITMAP(pending_map, WHEEL_SIZE);
205 struct hlist_head vectors[WHEEL_SIZE];
206} ____cacheline_aligned;
97 207
98static DEFINE_PER_CPU(struct tvec_base, tvec_bases); 208static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
99 209
100#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) 210#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
101unsigned int sysctl_timer_migration = 1; 211unsigned int sysctl_timer_migration = 1;
@@ -106,15 +216,17 @@ void timers_update_migration(bool update_nohz)
106 unsigned int cpu; 216 unsigned int cpu;
107 217
108 /* Avoid the loop, if nothing to update */ 218 /* Avoid the loop, if nothing to update */
109 if (this_cpu_read(tvec_bases.migration_enabled) == on) 219 if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on)
110 return; 220 return;
111 221
112 for_each_possible_cpu(cpu) { 222 for_each_possible_cpu(cpu) {
113 per_cpu(tvec_bases.migration_enabled, cpu) = on; 223 per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on;
224 per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on;
114 per_cpu(hrtimer_bases.migration_enabled, cpu) = on; 225 per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
115 if (!update_nohz) 226 if (!update_nohz)
116 continue; 227 continue;
117 per_cpu(tvec_bases.nohz_active, cpu) = true; 228 per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true;
229 per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true;
118 per_cpu(hrtimer_bases.nohz_active, cpu) = true; 230 per_cpu(hrtimer_bases.nohz_active, cpu) = true;
119 } 231 }
120} 232}
@@ -133,20 +245,6 @@ int timer_migration_handler(struct ctl_table *table, int write,
133 mutex_unlock(&mutex); 245 mutex_unlock(&mutex);
134 return ret; 246 return ret;
135} 247}
136
137static inline struct tvec_base *get_target_base(struct tvec_base *base,
138 int pinned)
139{
140 if (pinned || !base->migration_enabled)
141 return this_cpu_ptr(&tvec_bases);
142 return per_cpu_ptr(&tvec_bases, get_nohz_timer_target());
143}
144#else
145static inline struct tvec_base *get_target_base(struct tvec_base *base,
146 int pinned)
147{
148 return this_cpu_ptr(&tvec_bases);
149}
150#endif 248#endif
151 249
152static unsigned long round_jiffies_common(unsigned long j, int cpu, 250static unsigned long round_jiffies_common(unsigned long j, int cpu,
@@ -351,101 +449,126 @@ unsigned long round_jiffies_up_relative(unsigned long j)
351} 449}
352EXPORT_SYMBOL_GPL(round_jiffies_up_relative); 450EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
353 451
354/** 452
355 * set_timer_slack - set the allowed slack for a timer 453static inline unsigned int timer_get_idx(struct timer_list *timer)
356 * @timer: the timer to be modified
357 * @slack_hz: the amount of time (in jiffies) allowed for rounding
358 *
359 * Set the amount of time, in jiffies, that a certain timer has
360 * in terms of slack. By setting this value, the timer subsystem
361 * will schedule the actual timer somewhere between
362 * the time mod_timer() asks for, and that time plus the slack.
363 *
364 * By setting the slack to -1, a percentage of the delay is used
365 * instead.
366 */
367void set_timer_slack(struct timer_list *timer, int slack_hz)
368{ 454{
369 timer->slack = slack_hz; 455 return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT;
370} 456}
371EXPORT_SYMBOL_GPL(set_timer_slack);
372 457
373static void 458static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
374__internal_add_timer(struct tvec_base *base, struct timer_list *timer)
375{ 459{
376 unsigned long expires = timer->expires; 460 timer->flags = (timer->flags & ~TIMER_ARRAYMASK) |
377 unsigned long idx = expires - base->timer_jiffies; 461 idx << TIMER_ARRAYSHIFT;
378 struct hlist_head *vec; 462}
379 463
380 if (idx < TVR_SIZE) { 464/*
381 int i = expires & TVR_MASK; 465 * Helper function to calculate the array index for a given expiry
382 vec = base->tv1.vec + i; 466 * time.
383 } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { 467 */
384 int i = (expires >> TVR_BITS) & TVN_MASK; 468static inline unsigned calc_index(unsigned expires, unsigned lvl)
385 vec = base->tv2.vec + i; 469{
386 } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { 470 expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
387 int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; 471 return LVL_OFFS(lvl) + (expires & LVL_MASK);
388 vec = base->tv3.vec + i; 472}
389 } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { 473
390 int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; 474static int calc_wheel_index(unsigned long expires, unsigned long clk)
391 vec = base->tv4.vec + i; 475{
392 } else if ((signed long) idx < 0) { 476 unsigned long delta = expires - clk;
393 /* 477 unsigned int idx;
394 * Can happen if you add a timer with expires == jiffies, 478
395 * or you set a timer to go off in the past 479 if (delta < LVL_START(1)) {
396 */ 480 idx = calc_index(expires, 0);
397 vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); 481 } else if (delta < LVL_START(2)) {
482 idx = calc_index(expires, 1);
483 } else if (delta < LVL_START(3)) {
484 idx = calc_index(expires, 2);
485 } else if (delta < LVL_START(4)) {
486 idx = calc_index(expires, 3);
487 } else if (delta < LVL_START(5)) {
488 idx = calc_index(expires, 4);
489 } else if (delta < LVL_START(6)) {
490 idx = calc_index(expires, 5);
491 } else if (delta < LVL_START(7)) {
492 idx = calc_index(expires, 6);
493 } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) {
494 idx = calc_index(expires, 7);
495 } else if ((long) delta < 0) {
496 idx = clk & LVL_MASK;
398 } else { 497 } else {
399 int i; 498 /*
400 /* If the timeout is larger than MAX_TVAL (on 64-bit 499 * Force expire obscene large timeouts to expire at the
401 * architectures or with CONFIG_BASE_SMALL=1) then we 500 * capacity limit of the wheel.
402 * use the maximum timeout.
403 */ 501 */
404 if (idx > MAX_TVAL) { 502 if (expires >= WHEEL_TIMEOUT_CUTOFF)
405 idx = MAX_TVAL; 503 expires = WHEEL_TIMEOUT_MAX;
406 expires = idx + base->timer_jiffies; 504
407 } 505 idx = calc_index(expires, LVL_DEPTH - 1);
408 i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
409 vec = base->tv5.vec + i;
410 } 506 }
507 return idx;
508}
509
510/*
511 * Enqueue the timer into the hash bucket, mark it pending in
512 * the bitmap and store the index in the timer flags.
513 */
514static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
515 unsigned int idx)
516{
517 hlist_add_head(&timer->entry, base->vectors + idx);
518 __set_bit(idx, base->pending_map);
519 timer_set_idx(timer, idx);
520}
521
522static void
523__internal_add_timer(struct timer_base *base, struct timer_list *timer)
524{
525 unsigned int idx;
411 526
412 hlist_add_head(&timer->entry, vec); 527 idx = calc_wheel_index(timer->expires, base->clk);
528 enqueue_timer(base, timer, idx);
413} 529}
414 530
415static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) 531static void
532trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
416{ 533{
417 /* Advance base->jiffies, if the base is empty */ 534 if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
418 if (!base->all_timers++) 535 return;
419 base->timer_jiffies = jiffies;
420 536
421 __internal_add_timer(base, timer);
422 /* 537 /*
423 * Update base->active_timers and base->next_timer 538 * TODO: This wants some optimizing similar to the code below, but we
539 * will do that when we switch from push to pull for deferrable timers.
424 */ 540 */
425 if (!(timer->flags & TIMER_DEFERRABLE)) { 541 if (timer->flags & TIMER_DEFERRABLE) {
426 if (!base->active_timers++ || 542 if (tick_nohz_full_cpu(base->cpu))
427 time_before(timer->expires, base->next_timer)) 543 wake_up_nohz_cpu(base->cpu);
428 base->next_timer = timer->expires; 544 return;
429 } 545 }
430 546
431 /* 547 /*
432 * Check whether the other CPU is in dynticks mode and needs 548 * We might have to IPI the remote CPU if the base is idle and the
433 * to be triggered to reevaluate the timer wheel. 549 * timer is not deferrable. If the other CPU is on the way to idle
434 * We are protected against the other CPU fiddling 550 * then it can't set base->is_idle as we hold the base lock:
435 * with the timer by holding the timer base lock. This also
436 * makes sure that a CPU on the way to stop its tick can not
437 * evaluate the timer wheel.
438 *
439 * Spare the IPI for deferrable timers on idle targets though.
440 * The next busy ticks will take care of it. Except full dynticks
441 * require special care against races with idle_cpu(), lets deal
442 * with that later.
443 */ 551 */
444 if (base->nohz_active) { 552 if (!base->is_idle)
445 if (!(timer->flags & TIMER_DEFERRABLE) || 553 return;
446 tick_nohz_full_cpu(base->cpu)) 554
447 wake_up_nohz_cpu(base->cpu); 555 /* Check whether this is the new first expiring timer: */
448 } 556 if (time_after_eq(timer->expires, base->next_expiry))
557 return;
558
559 /*
560 * Set the next expiry time and kick the CPU so it can reevaluate the
561 * wheel:
562 */
563 base->next_expiry = timer->expires;
564 wake_up_nohz_cpu(base->cpu);
565}
566
567static void
568internal_add_timer(struct timer_base *base, struct timer_list *timer)
569{
570 __internal_add_timer(base, timer);
571 trigger_dyntick_cpu(base, timer);
449} 572}
450 573
451#ifdef CONFIG_TIMER_STATS 574#ifdef CONFIG_TIMER_STATS
@@ -666,7 +789,6 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,
666{ 789{
667 timer->entry.pprev = NULL; 790 timer->entry.pprev = NULL;
668 timer->flags = flags | raw_smp_processor_id(); 791 timer->flags = flags | raw_smp_processor_id();
669 timer->slack = -1;
670#ifdef CONFIG_TIMER_STATS 792#ifdef CONFIG_TIMER_STATS
671 timer->start_site = NULL; 793 timer->start_site = NULL;
672 timer->start_pid = -1; 794 timer->start_pid = -1;
@@ -706,54 +828,125 @@ static inline void detach_timer(struct timer_list *timer, bool clear_pending)
706 entry->next = LIST_POISON2; 828 entry->next = LIST_POISON2;
707} 829}
708 830
709static inline void 831static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
710detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
711{
712 detach_timer(timer, true);
713 if (!(timer->flags & TIMER_DEFERRABLE))
714 base->active_timers--;
715 base->all_timers--;
716}
717
718static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
719 bool clear_pending) 832 bool clear_pending)
720{ 833{
834 unsigned idx = timer_get_idx(timer);
835
721 if (!timer_pending(timer)) 836 if (!timer_pending(timer))
722 return 0; 837 return 0;
723 838
839 if (hlist_is_singular_node(&timer->entry, base->vectors + idx))
840 __clear_bit(idx, base->pending_map);
841
724 detach_timer(timer, clear_pending); 842 detach_timer(timer, clear_pending);
725 if (!(timer->flags & TIMER_DEFERRABLE)) {
726 base->active_timers--;
727 if (timer->expires == base->next_timer)
728 base->next_timer = base->timer_jiffies;
729 }
730 /* If this was the last timer, advance base->jiffies */
731 if (!--base->all_timers)
732 base->timer_jiffies = jiffies;
733 return 1; 843 return 1;
734} 844}
735 845
846static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
847{
848 struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
849
850 /*
851 * If the timer is deferrable and nohz is active then we need to use
852 * the deferrable base.
853 */
854 if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
855 (tflags & TIMER_DEFERRABLE))
856 base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
857 return base;
858}
859
860static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
861{
862 struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
863
864 /*
865 * If the timer is deferrable and nohz is active then we need to use
866 * the deferrable base.
867 */
868 if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
869 (tflags & TIMER_DEFERRABLE))
870 base = this_cpu_ptr(&timer_bases[BASE_DEF]);
871 return base;
872}
873
874static inline struct timer_base *get_timer_base(u32 tflags)
875{
876 return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
877}
878
879#ifdef CONFIG_NO_HZ_COMMON
880static inline struct timer_base *
881__get_target_base(struct timer_base *base, unsigned tflags)
882{
883#ifdef CONFIG_SMP
884 if ((tflags & TIMER_PINNED) || !base->migration_enabled)
885 return get_timer_this_cpu_base(tflags);
886 return get_timer_cpu_base(tflags, get_nohz_timer_target());
887#else
888 return get_timer_this_cpu_base(tflags);
889#endif
890}
891
892static inline void forward_timer_base(struct timer_base *base)
893{
894 /*
895 * We only forward the base when it's idle and we have a delta between
896 * base clock and jiffies.
897 */
898 if (!base->is_idle || (long) (jiffies - base->clk) < 2)
899 return;
900
901 /*
902 * If the next expiry value is > jiffies, then we fast forward to
903 * jiffies otherwise we forward to the next expiry value.
904 */
905 if (time_after(base->next_expiry, jiffies))
906 base->clk = jiffies;
907 else
908 base->clk = base->next_expiry;
909}
910#else
911static inline struct timer_base *
912__get_target_base(struct timer_base *base, unsigned tflags)
913{
914 return get_timer_this_cpu_base(tflags);
915}
916
917static inline void forward_timer_base(struct timer_base *base) { }
918#endif
919
920static inline struct timer_base *
921get_target_base(struct timer_base *base, unsigned tflags)
922{
923 struct timer_base *target = __get_target_base(base, tflags);
924
925 forward_timer_base(target);
926 return target;
927}
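
The forwarding rule above is easy to model in userspace: only an idle base that has fallen at least two jiffies behind is forwarded, and never past its own next_expiry, because otherwise calc_wheel_index() could see a negative delta for a still-pending timer. A minimal sketch, with time_after() replaced by a plain comparison (so jiffies wraparound is ignored):

#include <stdio.h>
#include <stdbool.h>

struct toy_base {
    unsigned long clk;
    unsigned long next_expiry;
    bool is_idle;
};

static void forward_timer_base(struct toy_base *base, unsigned long jiffies)
{
    /* Not idle, or less than two jiffies behind: nothing to do. */
    if (!base->is_idle || (long)(jiffies - base->clk) < 2)
        return;

    /* time_after(next_expiry, jiffies) in the kernel version. */
    if (jiffies < base->next_expiry)
        base->clk = jiffies;
    else
        base->clk = base->next_expiry;
}

int main(void)
{
    struct toy_base b = { .clk = 1000, .next_expiry = 1050, .is_idle = true };

    forward_timer_base(&b, 1040);   /* next_expiry still ahead */
    printf("clk = %lu\n", b.clk);   /* 1040 */

    forward_timer_base(&b, 1100);   /* next_expiry already passed */
    printf("clk = %lu\n", b.clk);   /* 1050 */
    return 0;
}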
928
736/* 929/*
737 * We are using hashed locking: holding per_cpu(tvec_bases).lock 930 * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
738 * means that all timers which are tied to this base via timer->base are 931 * that all timers which are tied to this base are locked, and the base itself
739 * locked, and the base itself is locked too. 932 * is locked too.
740 * 933 *
741 * So __run_timers/migrate_timers can safely modify all timers which could 934 * So __run_timers/migrate_timers can safely modify all timers which could
742 * be found on ->tvX lists. 935 * be found in the base->vectors array.
743 * 936 *
744 * When the timer's base is locked and removed from the list, the 937 * When a timer is migrating then the TIMER_MIGRATING flag is set and we need
745 * TIMER_MIGRATING flag is set, FIXME 938 * to wait until the migration is done.
746 */ 939 */
747static struct tvec_base *lock_timer_base(struct timer_list *timer, 940static struct timer_base *lock_timer_base(struct timer_list *timer,
748 unsigned long *flags) 941 unsigned long *flags)
749 __acquires(timer->base->lock) 942 __acquires(timer->base->lock)
750{ 943{
751 for (;;) { 944 for (;;) {
945 struct timer_base *base;
752 u32 tf = timer->flags; 946 u32 tf = timer->flags;
753 struct tvec_base *base;
754 947
755 if (!(tf & TIMER_MIGRATING)) { 948 if (!(tf & TIMER_MIGRATING)) {
756 base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK); 949 base = get_timer_base(tf);
757 spin_lock_irqsave(&base->lock, *flags); 950 spin_lock_irqsave(&base->lock, *flags);
758 if (timer->flags == tf) 951 if (timer->flags == tf)
759 return base; 952 return base;
@@ -764,13 +957,41 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
764} 957}
765 958
766static inline int 959static inline int
767__mod_timer(struct timer_list *timer, unsigned long expires, 960__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
768 bool pending_only, int pinned)
769{ 961{
770 struct tvec_base *base, *new_base; 962 struct timer_base *base, *new_base;
771 unsigned long flags; 963 unsigned int idx = UINT_MAX;
964 unsigned long clk = 0, flags;
772 int ret = 0; 965 int ret = 0;
773 966
967 /*
968 * This is a common optimization triggered by the networking code - if
969 * the timer is re-modified to have the same timeout or ends up in the
970 * same array bucket then just return:
971 */
972 if (timer_pending(timer)) {
973 if (timer->expires == expires)
974 return 1;
975 /*
976 * Take the current timer_jiffies of base, but without holding
977 * the lock!
978 */
979 base = get_timer_base(timer->flags);
980 clk = base->clk;
981
982 idx = calc_wheel_index(expires, clk);
983
984 /*
985 * Retrieve and compare the array index of the pending
986 * timer. If it matches set the expiry to the new value so a
987 * subsequent call will exit in the expires check above.
988 */
989 if (idx == timer_get_idx(timer)) {
990 timer->expires = expires;
991 return 1;
992 }
993 }
994
774 timer_stats_timer_set_start_info(timer); 995 timer_stats_timer_set_start_info(timer);
775 BUG_ON(!timer->function); 996 BUG_ON(!timer->function);
776 997
@@ -782,15 +1003,15 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
782 1003
783 debug_activate(timer, expires); 1004 debug_activate(timer, expires);
784 1005
785 new_base = get_target_base(base, pinned); 1006 new_base = get_target_base(base, timer->flags);
786 1007
787 if (base != new_base) { 1008 if (base != new_base) {
788 /* 1009 /*
789 * We are trying to schedule the timer on the local CPU. 1010 * We are trying to schedule the timer on the new base.
790 * However we can't change timer's base while it is running, 1011 * However we can't change timer's base while it is running,
791 * otherwise del_timer_sync() can't detect that the timer's 1012 * otherwise del_timer_sync() can't detect that the timer's
792 * handler has not yet finished. This also guarantees that 1013 * handler has not yet finished. This also guarantees that the
792 * handler has not yet finished. This also guarantees that 1013 * handler has not yet finished. This also guarantees that the
793 * the timer is serialized wrt itself. 1014 * timer is serialized wrt itself.
794 */ 1015 */
795 if (likely(base->running_timer != timer)) { 1016 if (likely(base->running_timer != timer)) {
796 /* See the comment in lock_timer_base() */ 1017 /* See the comment in lock_timer_base() */
@@ -805,7 +1026,18 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
805 } 1026 }
806 1027
807 timer->expires = expires; 1028 timer->expires = expires;
808 internal_add_timer(base, timer); 1029 /*
1030 * If 'idx' was calculated above and the base time did not advance
1031 * between calculating 'idx' and taking the lock, only enqueue_timer()
1032 * and trigger_dyntick_cpu() is required. Otherwise we need to
1033 * (re)calculate the wheel index via internal_add_timer().
1034 */
1035 if (idx != UINT_MAX && clk == base->clk) {
1036 enqueue_timer(base, timer, idx);
1037 trigger_dyntick_cpu(base, timer);
1038 } else {
1039 internal_add_timer(base, timer);
1040 }
809 1041
810out_unlock: 1042out_unlock:
811 spin_unlock_irqrestore(&base->lock, flags); 1043 spin_unlock_irqrestore(&base->lock, flags);
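
To make the "same array bucket" shortcut in __mod_timer() concrete: with the assumed constants from the earlier sketch, a timeout roughly 5000 jiffies out sits in a level whose granularity is 512 jiffies, so re-arming it a few hundred jiffies later leaves timer_get_idx() unchanged and only timer->expires has to be updated. A tiny standalone check (the level is hard-coded here; calc_wheel_index() would pick it from the delta):

#include <stdio.h>

#define LVL_CLK_SHIFT   3
#define LVL_SHIFT(n)    ((n) * LVL_CLK_SHIFT)
#define LVL_GRAN(n)     (1UL << LVL_SHIFT(n))
#define LVL_SIZE        64UL
#define LVL_MASK        (LVL_SIZE - 1)
#define LVL_OFFS(n)     ((n) * LVL_SIZE)

/* Same rounding as calc_index() in the earlier model. */
static unsigned int bucket(unsigned long expires, unsigned int lvl)
{
    expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
    return LVL_OFFS(lvl) + (expires & LVL_MASK);
}

int main(void)
{
    unsigned long clk = 100000;

    /*
     * A ~5000 jiffy delta lands in level 3 under the assumed layout
     * (granularity 512 jiffies), so nudging the timeout by 300
     * jiffies does not move the bucket and the locked requeue can
     * be skipped.
     */
    printf("%u %u\n", bucket(clk + 5000, 3), bucket(clk + 5300, 3));
    return 0;
}

The locked path still rechecks that base->clk did not advance between the unlocked index calculation and taking the lock; if it did, internal_add_timer() recomputes the index.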
@@ -825,49 +1057,10 @@ out_unlock:
825 */ 1057 */
826int mod_timer_pending(struct timer_list *timer, unsigned long expires) 1058int mod_timer_pending(struct timer_list *timer, unsigned long expires)
827{ 1059{
828 return __mod_timer(timer, expires, true, TIMER_NOT_PINNED); 1060 return __mod_timer(timer, expires, true);
829} 1061}
830EXPORT_SYMBOL(mod_timer_pending); 1062EXPORT_SYMBOL(mod_timer_pending);
831 1063
832/*
833 * Decide where to put the timer while taking the slack into account
834 *
835 * Algorithm:
836 * 1) calculate the maximum (absolute) time
837 * 2) calculate the highest bit where the expires and new max are different
838 * 3) use this bit to make a mask
839 * 4) use the bitmask to round down the maximum time, so that all last
840 * bits are zeros
841 */
842static inline
843unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
844{
845 unsigned long expires_limit, mask;
846 int bit;
847
848 if (timer->slack >= 0) {
849 expires_limit = expires + timer->slack;
850 } else {
851 long delta = expires - jiffies;
852
853 if (delta < 256)
854 return expires;
855
856 expires_limit = expires + delta / 256;
857 }
858 mask = expires ^ expires_limit;
859 if (mask == 0)
860 return expires;
861
862 bit = __fls(mask);
863
864 mask = (1UL << bit) - 1;
865
866 expires_limit = expires_limit & ~(mask);
867
868 return expires_limit;
869}
870
871/** 1064/**
872 * mod_timer - modify a timer's timeout 1065 * mod_timer - modify a timer's timeout
873 * @timer: the timer to be modified 1066 * @timer: the timer to be modified
@@ -890,49 +1083,11 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
890 */ 1083 */
891int mod_timer(struct timer_list *timer, unsigned long expires) 1084int mod_timer(struct timer_list *timer, unsigned long expires)
892{ 1085{
893 expires = apply_slack(timer, expires); 1086 return __mod_timer(timer, expires, false);
894
895 /*
896 * This is a common optimization triggered by the
897 * networking code - if the timer is re-modified
898 * to be the same thing then just return:
899 */
900 if (timer_pending(timer) && timer->expires == expires)
901 return 1;
902
903 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
904} 1087}
905EXPORT_SYMBOL(mod_timer); 1088EXPORT_SYMBOL(mod_timer);
906 1089
907/** 1090/**
908 * mod_timer_pinned - modify a timer's timeout
909 * @timer: the timer to be modified
910 * @expires: new timeout in jiffies
911 *
912 * mod_timer_pinned() is a way to update the expire field of an
913 * active timer (if the timer is inactive it will be activated)
914 * and to ensure that the timer is scheduled on the current CPU.
915 *
916 * Note that this does not prevent the timer from being migrated
917 * when the current CPU goes offline. If this is a problem for
918 * you, use CPU-hotplug notifiers to handle it correctly, for
919 * example, cancelling the timer when the corresponding CPU goes
920 * offline.
921 *
922 * mod_timer_pinned(timer, expires) is equivalent to:
923 *
924 * del_timer(timer); timer->expires = expires; add_timer(timer);
925 */
926int mod_timer_pinned(struct timer_list *timer, unsigned long expires)
927{
928 if (timer->expires == expires && timer_pending(timer))
929 return 1;
930
931 return __mod_timer(timer, expires, false, TIMER_PINNED);
932}
933EXPORT_SYMBOL(mod_timer_pinned);
934
935/**
936 * add_timer - start a timer 1091 * add_timer - start a timer
937 * @timer: the timer to be added 1092 * @timer: the timer to be added
938 * 1093 *
@@ -962,13 +1117,14 @@ EXPORT_SYMBOL(add_timer);
962 */ 1117 */
963void add_timer_on(struct timer_list *timer, int cpu) 1118void add_timer_on(struct timer_list *timer, int cpu)
964{ 1119{
965 struct tvec_base *new_base = per_cpu_ptr(&tvec_bases, cpu); 1120 struct timer_base *new_base, *base;
966 struct tvec_base *base;
967 unsigned long flags; 1121 unsigned long flags;
968 1122
969 timer_stats_timer_set_start_info(timer); 1123 timer_stats_timer_set_start_info(timer);
970 BUG_ON(timer_pending(timer) || !timer->function); 1124 BUG_ON(timer_pending(timer) || !timer->function);
971 1125
1126 new_base = get_timer_cpu_base(timer->flags, cpu);
1127
972 /* 1128 /*
973 * If @timer was on a different CPU, it should be migrated with the 1129 * If @timer was on a different CPU, it should be migrated with the
974 * old base locked to prevent other operations proceeding with the 1130 * old base locked to prevent other operations proceeding with the
@@ -1004,7 +1160,7 @@ EXPORT_SYMBOL_GPL(add_timer_on);
1004 */ 1160 */
1005int del_timer(struct timer_list *timer) 1161int del_timer(struct timer_list *timer)
1006{ 1162{
1007 struct tvec_base *base; 1163 struct timer_base *base;
1008 unsigned long flags; 1164 unsigned long flags;
1009 int ret = 0; 1165 int ret = 0;
1010 1166
@@ -1030,7 +1186,7 @@ EXPORT_SYMBOL(del_timer);
1030 */ 1186 */
1031int try_to_del_timer_sync(struct timer_list *timer) 1187int try_to_del_timer_sync(struct timer_list *timer)
1032{ 1188{
1033 struct tvec_base *base; 1189 struct timer_base *base;
1034 unsigned long flags; 1190 unsigned long flags;
1035 int ret = -1; 1191 int ret = -1;
1036 1192
@@ -1114,27 +1270,6 @@ int del_timer_sync(struct timer_list *timer)
1114EXPORT_SYMBOL(del_timer_sync); 1270EXPORT_SYMBOL(del_timer_sync);
1115#endif 1271#endif
1116 1272
1117static int cascade(struct tvec_base *base, struct tvec *tv, int index)
1118{
1119 /* cascade all the timers from tv up one level */
1120 struct timer_list *timer;
1121 struct hlist_node *tmp;
1122 struct hlist_head tv_list;
1123
1124 hlist_move_list(tv->vec + index, &tv_list);
1125
1126 /*
1127 * We are removing _all_ timers from the list, so we
1128 * don't have to detach them individually.
1129 */
1130 hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) {
1131 /* No accounting, while moving them */
1132 __internal_add_timer(base, timer);
1133 }
1134
1135 return index;
1136}
1137
1138static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), 1273static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1139 unsigned long data) 1274 unsigned long data)
1140{ 1275{
@@ -1178,147 +1313,141 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1178 } 1313 }
1179} 1314}
1180 1315
1181#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) 1316static void expire_timers(struct timer_base *base, struct hlist_head *head)
1182
1183/**
1184 * __run_timers - run all expired timers (if any) on this CPU.
1185 * @base: the timer vector to be processed.
1186 *
1187 * This function cascades all vectors and executes all expired timer
1188 * vectors.
1189 */
1190static inline void __run_timers(struct tvec_base *base)
1191{ 1317{
1192 struct timer_list *timer; 1318 while (!hlist_empty(head)) {
1319 struct timer_list *timer;
1320 void (*fn)(unsigned long);
1321 unsigned long data;
1193 1322
1194 spin_lock_irq(&base->lock); 1323 timer = hlist_entry(head->first, struct timer_list, entry);
1324 timer_stats_account_timer(timer);
1195 1325
1196 while (time_after_eq(jiffies, base->timer_jiffies)) { 1326 base->running_timer = timer;
1197 struct hlist_head work_list; 1327 detach_timer(timer, true);
1198 struct hlist_head *head = &work_list;
1199 int index;
1200 1328
1201 if (!base->all_timers) { 1329 fn = timer->function;
1202 base->timer_jiffies = jiffies; 1330 data = timer->data;
1203 break; 1331
1332 if (timer->flags & TIMER_IRQSAFE) {
1333 spin_unlock(&base->lock);
1334 call_timer_fn(timer, fn, data);
1335 spin_lock(&base->lock);
1336 } else {
1337 spin_unlock_irq(&base->lock);
1338 call_timer_fn(timer, fn, data);
1339 spin_lock_irq(&base->lock);
1204 } 1340 }
1341 }
1342}
1205 1343
1206 index = base->timer_jiffies & TVR_MASK; 1344static int __collect_expired_timers(struct timer_base *base,
1345 struct hlist_head *heads)
1346{
1347 unsigned long clk = base->clk;
1348 struct hlist_head *vec;
1349 int i, levels = 0;
1350 unsigned int idx;
1207 1351
1208 /* 1352 for (i = 0; i < LVL_DEPTH; i++) {
1209 * Cascade timers: 1353 idx = (clk & LVL_MASK) + i * LVL_SIZE;
1210 */ 1354
1211 if (!index && 1355 if (__test_and_clear_bit(idx, base->pending_map)) {
1212 (!cascade(base, &base->tv2, INDEX(0))) && 1356 vec = base->vectors + idx;
1213 (!cascade(base, &base->tv3, INDEX(1))) && 1357 hlist_move_list(vec, heads++);
1214 !cascade(base, &base->tv4, INDEX(2))) 1358 levels++;
1215 cascade(base, &base->tv5, INDEX(3));
1216 ++base->timer_jiffies;
1217 hlist_move_list(base->tv1.vec + index, head);
1218 while (!hlist_empty(head)) {
1219 void (*fn)(unsigned long);
1220 unsigned long data;
1221 bool irqsafe;
1222
1223 timer = hlist_entry(head->first, struct timer_list, entry);
1224 fn = timer->function;
1225 data = timer->data;
1226 irqsafe = timer->flags & TIMER_IRQSAFE;
1227
1228 timer_stats_account_timer(timer);
1229
1230 base->running_timer = timer;
1231 detach_expired_timer(timer, base);
1232
1233 if (irqsafe) {
1234 spin_unlock(&base->lock);
1235 call_timer_fn(timer, fn, data);
1236 spin_lock(&base->lock);
1237 } else {
1238 spin_unlock_irq(&base->lock);
1239 call_timer_fn(timer, fn, data);
1240 spin_lock_irq(&base->lock);
1241 }
1242 } 1359 }
1360 /* Is it time to look at the next level? */
1361 if (clk & LVL_CLK_MASK)
1362 break;
1363 /* Shift clock for the next level granularity */
1364 clk >>= LVL_CLK_SHIFT;
1243 } 1365 }
1244 base->running_timer = NULL; 1366 return levels;
1245 spin_unlock_irq(&base->lock);
1246} 1367}
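
The per-tick cost of __collect_expired_timers() comes from the "is it time to look at the next level?" cut-off: with LVL_CLK_SHIFT = 3 as assumed earlier, level N is only inspected when the lower 3*N bits of base->clk are zero, i.e. every 8^N ticks. A short userspace sketch of just that cut-off:

#include <stdio.h>

#define LVL_CLK_SHIFT   3
#define LVL_CLK_MASK    ((1UL << LVL_CLK_SHIFT) - 1)
#define LVL_DEPTH       9

/* How many wheel levels does a single tick at 'clk' have to look at? */
static int levels_scanned(unsigned long clk)
{
    int levels = 0;

    for (int i = 0; i < LVL_DEPTH; i++) {
        levels++;                   /* this level is checked          */
        if (clk & LVL_CLK_MASK)     /* lower bits set: stop here      */
            break;
        clk >>= LVL_CLK_SHIFT;      /* on to the next, coarser level  */
    }
    return levels;
}

int main(void)
{
    unsigned long clks[] = { 1, 8, 64, 512, 4096 };

    for (int i = 0; i < 5; i++)
        printf("clk %4lu -> %d level(s)\n", clks[i], levels_scanned(clks[i]));
    return 0;
}

Most ticks touch only level 0; the coarser levels are visited exponentially less often, which is what replaces the old cascade work.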
1247 1368
1248#ifdef CONFIG_NO_HZ_COMMON 1369#ifdef CONFIG_NO_HZ_COMMON
1249/* 1370/*
1250 * Find out when the next timer event is due to happen. This 1371 * Find the next pending bucket of a level. Search from level start (@offset)
1251 * is used on S/390 to stop all activity when a CPU is idle. 1372 * + @clk upwards and if nothing there, search from start of the level
1252 * This function needs to be called with interrupts disabled. 1373 * (@offset) up to @offset + clk.
1374 */
1375static int next_pending_bucket(struct timer_base *base, unsigned offset,
1376 unsigned clk)
1377{
1378 unsigned pos, start = offset + clk;
1379 unsigned end = offset + LVL_SIZE;
1380
1381 pos = find_next_bit(base->pending_map, end, start);
1382 if (pos < end)
1383 return pos - start;
1384
1385 pos = find_next_bit(base->pending_map, start, offset);
1386 return pos < start ? pos + LVL_SIZE - start : -1;
1387}
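
next_pending_bucket() is a wraparound search within one level: first from the current slot to the end of the level, then from the start of the level back up to the current slot, returning the distance in buckets or -1 if the level is empty. A userspace model, with a trivial loop standing in for find_next_bit():

#include <stdio.h>
#include <stdbool.h>

#define LVL_SIZE 64

/* Toy find_next_bit(): first set slot in [start, end), or 'end'. */
static int find_next_set(const bool *map, int end, int start)
{
    for (int i = start; i < end; i++)
        if (map[i])
            return i;
    return end;
}

static int next_pending_bucket(const bool *pending, int offset, int clk)
{
    int start = offset + clk, end = offset + LVL_SIZE, pos;

    pos = find_next_set(pending, end, start);
    if (pos < end)
        return pos - start;

    pos = find_next_set(pending, start, offset);
    return pos < start ? pos + LVL_SIZE - start : -1;
}

int main(void)
{
    bool pending[LVL_SIZE] = { false };

    pending[5] = true;      /* one armed bucket at slot 5 */
    printf("%d\n", next_pending_bucket(pending, 0, 10));   /* wraps: 59 */
    printf("%d\n", next_pending_bucket(pending, 0, 3));    /* ahead:  2 */
    return 0;
}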
1388
1389/*
1390 * Search the first expiring timer in the various clock levels. Caller must
1391 * hold base->lock.
1253 */ 1392 */
1254static unsigned long __next_timer_interrupt(struct tvec_base *base) 1393static unsigned long __next_timer_interrupt(struct timer_base *base)
1255{ 1394{
1256 unsigned long timer_jiffies = base->timer_jiffies; 1395 unsigned long clk, next, adj;
1257 unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; 1396 unsigned lvl, offset = 0;
1258 int index, slot, array, found = 0; 1397
1259 struct timer_list *nte; 1398 next = base->clk + NEXT_TIMER_MAX_DELTA;
1260 struct tvec *varray[4]; 1399 clk = base->clk;
1261 1400 for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
1262 /* Look for timer events in tv1. */ 1401 int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
1263 index = slot = timer_jiffies & TVR_MASK; 1402
1264 do { 1403 if (pos >= 0) {
1265 hlist_for_each_entry(nte, base->tv1.vec + slot, entry) { 1404 unsigned long tmp = clk + (unsigned long) pos;
1266 if (nte->flags & TIMER_DEFERRABLE) 1405
1267 continue; 1406 tmp <<= LVL_SHIFT(lvl);
1268 1407 if (time_before(tmp, next))
1269 found = 1; 1408 next = tmp;
1270 expires = nte->expires;
1271 /* Look at the cascade bucket(s)? */
1272 if (!index || slot < index)
1273 goto cascade;
1274 return expires;
1275 } 1409 }
1276 slot = (slot + 1) & TVR_MASK; 1410 /*
1277 } while (slot != index); 1411 * Clock for the next level. If the current level clock lower
1278 1412 * bits are zero, we look at the next level as is. If not we
1279cascade: 1413 * need to advance it by one because that's going to be the
1280 /* Calculate the next cascade event */ 1414 * next expiring bucket in that level. base->clk is the next
1281 if (index) 1415 * expiring jiffie. So in case of:
1282 timer_jiffies += TVR_SIZE - index; 1416 *
1283 timer_jiffies >>= TVR_BITS; 1417 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
1284 1418 * 0 0 0 0 0 0
1285 /* Check tv2-tv5. */ 1419 *
1286 varray[0] = &base->tv2; 1420 * we have to look at all levels @index 0. With
1287 varray[1] = &base->tv3; 1421 *
1288 varray[2] = &base->tv4; 1422 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
1289 varray[3] = &base->tv5; 1423 * 0 0 0 0 0 2
1290 1424 *
1291 for (array = 0; array < 4; array++) { 1425 * LVL0 has the next expiring bucket @index 2. The upper
1292 struct tvec *varp = varray[array]; 1426 * levels have the next expiring bucket @index 1.
1293 1427 *
1294 index = slot = timer_jiffies & TVN_MASK; 1428 * In case that the propagation wraps the next level the same
1295 do { 1429 * rules apply:
1296 hlist_for_each_entry(nte, varp->vec + slot, entry) { 1430 *
1297 if (nte->flags & TIMER_DEFERRABLE) 1431 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
1298 continue; 1432 * 0 0 0 0 F 2
1299 1433 *
1300 found = 1; 1434 * So after looking at LVL0 we get:
1301 if (time_before(nte->expires, expires)) 1435 *
1302 expires = nte->expires; 1436 * LVL5 LVL4 LVL3 LVL2 LVL1
1303 } 1437 * 0 0 0 1 0
1304 /* 1438 *
1305 * Do we still search for the first timer or are 1439 * So no propagation from LVL1 to LVL2 because that happened
1306 * we looking up the cascade buckets ? 1440 * with the add already, but then we need to propagate further
1307 */ 1441 * from LVL2 to LVL3.
1308 if (found) { 1442 *
1309 /* Look at the cascade bucket(s)? */ 1443 * So the simple check whether the lower bits of the current
1310 if (!index || slot < index) 1444 * level are 0 or not is sufficient for all cases.
1311 break; 1445 */
1312 return expires; 1446 adj = clk & LVL_CLK_MASK ? 1 : 0;
1313 } 1447 clk >>= LVL_CLK_SHIFT;
1314 slot = (slot + 1) & TVN_MASK; 1448 clk += adj;
1315 } while (slot != index);
1316
1317 if (index)
1318 timer_jiffies += TVN_SIZE - index;
1319 timer_jiffies >>= TVN_BITS;
1320 } 1449 }
1321 return expires; 1450 return next;
1322} 1451}
1323 1452
1324/* 1453/*
@@ -1364,7 +1493,7 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
1364 */ 1493 */
1365u64 get_next_timer_interrupt(unsigned long basej, u64 basem) 1494u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
1366{ 1495{
1367 struct tvec_base *base = this_cpu_ptr(&tvec_bases); 1496 struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
1368 u64 expires = KTIME_MAX; 1497 u64 expires = KTIME_MAX;
1369 unsigned long nextevt; 1498 unsigned long nextevt;
1370 1499
@@ -1376,19 +1505,80 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
1376 return expires; 1505 return expires;
1377 1506
1378 spin_lock(&base->lock); 1507 spin_lock(&base->lock);
1379 if (base->active_timers) { 1508 nextevt = __next_timer_interrupt(base);
1380 if (time_before_eq(base->next_timer, base->timer_jiffies)) 1509 base->next_expiry = nextevt;
1381 base->next_timer = __next_timer_interrupt(base); 1510 /*
1382 nextevt = base->next_timer; 1511 * We have a fresh next event. Check whether we can forward the base:
1383 if (time_before_eq(nextevt, basej)) 1512 */
1384 expires = basem; 1513 if (time_after(nextevt, jiffies))
1385 else 1514 base->clk = jiffies;
1386 expires = basem + (nextevt - basej) * TICK_NSEC; 1515 else if (time_after(nextevt, base->clk))
1516 base->clk = nextevt;
1517
1518 if (time_before_eq(nextevt, basej)) {
1519 expires = basem;
1520 base->is_idle = false;
1521 } else {
1522 expires = basem + (nextevt - basej) * TICK_NSEC;
1523 /*
1524 * If we expect to sleep more than a tick, mark the base idle:
1525 */
1526 if ((expires - basem) > TICK_NSEC)
1527 base->is_idle = true;
1387 } 1528 }
1388 spin_unlock(&base->lock); 1529 spin_unlock(&base->lock);
1389 1530
1390 return cmp_next_hrtimer_event(basem, expires); 1531 return cmp_next_hrtimer_event(basem, expires);
1391} 1532}
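
The tail of get_next_timer_interrupt() converts the next expiry (in jiffies) into an absolute nanosecond deadline and decides whether the base may be marked idle. A worked example as a userspace sketch; TICK_NSEC below assumes HZ=1000, and jiffies wraparound handling via time_before_eq() is reduced to a plain comparison:

#include <stdio.h>

#define TICK_NSEC 1000000ULL    /* 1 ms tick, i.e. HZ=1000 (assumed) */

static unsigned long long next_event_ns(unsigned long nextevt,
                                        unsigned long basej,
                                        unsigned long long basem,
                                        int *is_idle)
{
    if (nextevt <= basej) {     /* next timer already (over)due */
        *is_idle = 0;
        return basem;
    }

    unsigned long long expires =
        basem + (unsigned long long)(nextevt - basej) * TICK_NSEC;

    /* Only flag idle for sleeps longer than one tick; shorter sleeps
     * leave the flag alone, as the kernel code does. */
    if (expires - basem > TICK_NSEC)
        *is_idle = 1;
    return expires;
}

int main(void)
{
    int idle = 0;
    unsigned long long e = next_event_ns(1010, 1000, 5000000000ULL, &idle);

    printf("deadline %llu ns, is_idle=%d\n", e, idle);  /* +10 ms, idle */
    return 0;
}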
1533
1534/**
1535 * timer_clear_idle - Clear the idle state of the timer base
1536 *
1537 * Called with interrupts disabled
1538 */
1539void timer_clear_idle(void)
1540{
1541 struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
1542
1543 /*
1544 * We do this unlocked. The worst outcome is a remote enqueue sending
1545 * a pointless IPI, but taking the lock would just make the window for
1546 * sending the IPI a few instructions smaller for the cost of taking
1547 * the lock in the exit from idle path.
1548 */
1549 base->is_idle = false;
1550}
1551
1552static int collect_expired_timers(struct timer_base *base,
1553 struct hlist_head *heads)
1554{
1555 /*
1556 * NOHZ optimization. After a long idle sleep we need to forward the
1557 * base to current jiffies. Avoid a loop by searching the bitfield for
1558 * the next expiring timer.
1559 */
1560 if ((long)(jiffies - base->clk) > 2) {
1561 unsigned long next = __next_timer_interrupt(base);
1562
1563 /*
1564 * If the next timer is ahead of time forward to current
1565 * jiffies, otherwise forward to the next expiry time:
1566 */
1567 if (time_after(next, jiffies)) {
1568 /* The call site will increment clock! */
1569 base->clk = jiffies - 1;
1570 return 0;
1571 }
1572 base->clk = next;
1573 }
1574 return __collect_expired_timers(base, heads);
1575}
1576#else
1577static inline int collect_expired_timers(struct timer_base *base,
1578 struct hlist_head *heads)
1579{
1580 return __collect_expired_timers(base, heads);
1581}
1392#endif 1582#endif
1393 1583
1394/* 1584/*
@@ -1411,15 +1601,42 @@ void update_process_times(int user_tick)
1411 run_posix_cpu_timers(p); 1601 run_posix_cpu_timers(p);
1412} 1602}
1413 1603
1604/**
1605 * __run_timers - run all expired timers (if any) on this CPU.
1606 * @base: the timer vector to be processed.
1607 */
1608static inline void __run_timers(struct timer_base *base)
1609{
1610 struct hlist_head heads[LVL_DEPTH];
1611 int levels;
1612
1613 if (!time_after_eq(jiffies, base->clk))
1614 return;
1615
1616 spin_lock_irq(&base->lock);
1617
1618 while (time_after_eq(jiffies, base->clk)) {
1619
1620 levels = collect_expired_timers(base, heads);
1621 base->clk++;
1622
1623 while (levels--)
1624 expire_timers(base, heads + levels);
1625 }
1626 base->running_timer = NULL;
1627 spin_unlock_irq(&base->lock);
1628}
1629
1414/* 1630/*
1415 * This function runs timers and the timer-tq in bottom half context. 1631 * This function runs timers and the timer-tq in bottom half context.
1416 */ 1632 */
1417static void run_timer_softirq(struct softirq_action *h) 1633static void run_timer_softirq(struct softirq_action *h)
1418{ 1634{
1419 struct tvec_base *base = this_cpu_ptr(&tvec_bases); 1635 struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
1420 1636
1421 if (time_after_eq(jiffies, base->timer_jiffies)) 1637 __run_timers(base);
1422 __run_timers(base); 1638 if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
1639 __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
1423} 1640}
1424 1641
1425/* 1642/*
@@ -1427,7 +1644,18 @@ static void run_timer_softirq(struct softirq_action *h)
1427 */ 1644 */
1428void run_local_timers(void) 1645void run_local_timers(void)
1429{ 1646{
1647 struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
1648
1430 hrtimer_run_queues(); 1649 hrtimer_run_queues();
1650 /* Raise the softirq only if required. */
1651 if (time_before(jiffies, base->clk)) {
1652 if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
1653 return;
1654 /* CPU is awake, so check the deferrable base. */
1655 base++;
1656 if (time_before(jiffies, base->clk))
1657 return;
1658 }
1431 raise_softirq(TIMER_SOFTIRQ); 1659 raise_softirq(TIMER_SOFTIRQ);
1432} 1660}
1433 1661
@@ -1512,7 +1740,7 @@ signed long __sched schedule_timeout(signed long timeout)
1512 expire = timeout + jiffies; 1740 expire = timeout + jiffies;
1513 1741
1514 setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); 1742 setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
1515 __mod_timer(&timer, expire, false, TIMER_NOT_PINNED); 1743 __mod_timer(&timer, expire, false);
1516 schedule(); 1744 schedule();
1517 del_singleshot_timer_sync(&timer); 1745 del_singleshot_timer_sync(&timer);
1518 1746
@@ -1563,87 +1791,62 @@ signed long __sched schedule_timeout_idle(signed long timeout)
1563EXPORT_SYMBOL(schedule_timeout_idle); 1791EXPORT_SYMBOL(schedule_timeout_idle);
1564 1792
1565#ifdef CONFIG_HOTPLUG_CPU 1793#ifdef CONFIG_HOTPLUG_CPU
1566static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) 1794static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head)
1567{ 1795{
1568 struct timer_list *timer; 1796 struct timer_list *timer;
1569 int cpu = new_base->cpu; 1797 int cpu = new_base->cpu;
1570 1798
1571 while (!hlist_empty(head)) { 1799 while (!hlist_empty(head)) {
1572 timer = hlist_entry(head->first, struct timer_list, entry); 1800 timer = hlist_entry(head->first, struct timer_list, entry);
1573 /* We ignore the accounting on the dying cpu */
1574 detach_timer(timer, false); 1801 detach_timer(timer, false);
1575 timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; 1802 timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
1576 internal_add_timer(new_base, timer); 1803 internal_add_timer(new_base, timer);
1577 } 1804 }
1578} 1805}
1579 1806
1580static void migrate_timers(int cpu) 1807int timers_dead_cpu(unsigned int cpu)
1581{ 1808{
1582 struct tvec_base *old_base; 1809 struct timer_base *old_base;
1583 struct tvec_base *new_base; 1810 struct timer_base *new_base;
1584 int i; 1811 int b, i;
1585 1812
1586 BUG_ON(cpu_online(cpu)); 1813 BUG_ON(cpu_online(cpu));
1587 old_base = per_cpu_ptr(&tvec_bases, cpu);
1588 new_base = get_cpu_ptr(&tvec_bases);
1589 /*
1590 * The caller is globally serialized and nobody else
1591 * takes two locks at once, deadlock is not possible.
1592 */
1593 spin_lock_irq(&new_base->lock);
1594 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1595
1596 BUG_ON(old_base->running_timer);
1597
1598 for (i = 0; i < TVR_SIZE; i++)
1599 migrate_timer_list(new_base, old_base->tv1.vec + i);
1600 for (i = 0; i < TVN_SIZE; i++) {
1601 migrate_timer_list(new_base, old_base->tv2.vec + i);
1602 migrate_timer_list(new_base, old_base->tv3.vec + i);
1603 migrate_timer_list(new_base, old_base->tv4.vec + i);
1604 migrate_timer_list(new_base, old_base->tv5.vec + i);
1605 }
1606 1814
1607 old_base->active_timers = 0; 1815 for (b = 0; b < NR_BASES; b++) {
1608 old_base->all_timers = 0; 1816 old_base = per_cpu_ptr(&timer_bases[b], cpu);
1817 new_base = get_cpu_ptr(&timer_bases[b]);
1818 /*
1819 * The caller is globally serialized and nobody else
1820 * takes two locks at once, deadlock is not possible.
1821 */
1822 spin_lock_irq(&new_base->lock);
1823 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1609 1824
1610 spin_unlock(&old_base->lock); 1825 BUG_ON(old_base->running_timer);
1611 spin_unlock_irq(&new_base->lock);
1612 put_cpu_ptr(&tvec_bases);
1613}
1614 1826
1615static int timer_cpu_notify(struct notifier_block *self, 1827 for (i = 0; i < WHEEL_SIZE; i++)
1616 unsigned long action, void *hcpu) 1828 migrate_timer_list(new_base, old_base->vectors + i);
1617{
1618 switch (action) {
1619 case CPU_DEAD:
1620 case CPU_DEAD_FROZEN:
1621 migrate_timers((long)hcpu);
1622 break;
1623 default:
1624 break;
1625 }
1626 1829
1627 return NOTIFY_OK; 1830 spin_unlock(&old_base->lock);
1831 spin_unlock_irq(&new_base->lock);
1832 put_cpu_ptr(&timer_bases);
1833 }
1834 return 0;
1628} 1835}
1629 1836
1630static inline void timer_register_cpu_notifier(void)
1631{
1632 cpu_notifier(timer_cpu_notify, 0);
1633}
1634#else
1635static inline void timer_register_cpu_notifier(void) { }
1636#endif /* CONFIG_HOTPLUG_CPU */ 1837#endif /* CONFIG_HOTPLUG_CPU */
1637 1838
1638static void __init init_timer_cpu(int cpu) 1839static void __init init_timer_cpu(int cpu)
1639{ 1840{
1640 struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu); 1841 struct timer_base *base;
1641 1842 int i;
1642 base->cpu = cpu;
1643 spin_lock_init(&base->lock);
1644 1843
1645 base->timer_jiffies = jiffies; 1844 for (i = 0; i < NR_BASES; i++) {
1646 base->next_timer = base->timer_jiffies; 1845 base = per_cpu_ptr(&timer_bases[i], cpu);
1846 base->cpu = cpu;
1847 spin_lock_init(&base->lock);
1848 base->clk = jiffies;
1849 }
1647} 1850}
1648 1851
1649static void __init init_timer_cpus(void) 1852static void __init init_timer_cpus(void)
@@ -1658,7 +1861,6 @@ void __init init_timers(void)
1658{ 1861{
1659 init_timer_cpus(); 1862 init_timer_cpus();
1660 init_timer_stats(); 1863 init_timer_stats();
1661 timer_register_cpu_notifier();
1662 open_softirq(TIMER_SOFTIRQ, run_timer_softirq); 1864 open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
1663} 1865}
1664 1866
@@ -1702,9 +1904,15 @@ static void __sched do_usleep_range(unsigned long min, unsigned long max)
1702} 1904}
1703 1905
1704/** 1906/**
1705 * usleep_range - Drop in replacement for udelay where wakeup is flexible 1907 * usleep_range - Sleep for an approximate time
1706 * @min: Minimum time in usecs to sleep 1908 * @min: Minimum time in usecs to sleep
1707 * @max: Maximum time in usecs to sleep 1909 * @max: Maximum time in usecs to sleep
1910 *
1911 * In non-atomic context where the exact wakeup time is flexible, use
1912 * usleep_range() instead of udelay(). The sleep improves responsiveness
1913 * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces
1914 * power usage by allowing hrtimers to take advantage of an already-
1915 * scheduled interrupt instead of scheduling a new one just for this sleep.
1708 */ 1916 */
1709void __sched usleep_range(unsigned long min, unsigned long max) 1917void __sched usleep_range(unsigned long min, unsigned long max)
1710{ 1918{
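
A driver-style usage sketch of the updated kernel-doc: poll a status bit from sleepable context with a flexible wakeup window instead of busy-waiting in udelay(). Everything except usleep_range() itself is hypothetical here (the ready() callback stands in for reading some device register).

#include <linux/types.h>
#include <linux/delay.h>
#include <linux/errno.h>

/* 'ready' is a stand-in for polling a hypothetical device status bit. */
static int foo_wait_ready(bool (*ready)(void *ctx), void *ctx)
{
    int tries = 50;

    while (tries--) {
        if (ready(ctx))
            return 0;
        /*
         * Sleepable context: any wakeup between 100 and 200 usecs is
         * acceptable, so let hrtimers batch it with an already
         * scheduled interrupt instead of hogging the CPU in udelay().
         */
        usleep_range(100, 200);
    }
    return -ETIMEDOUT;
}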
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 1adecb4b87c8..087204c733eb 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -279,7 +279,7 @@ static void print_name_offset(struct seq_file *m, unsigned long addr)
279 279
280static int tstats_show(struct seq_file *m, void *v) 280static int tstats_show(struct seq_file *m, void *v)
281{ 281{
282 struct timespec period; 282 struct timespec64 period;
283 struct entry *entry; 283 struct entry *entry;
284 unsigned long ms; 284 unsigned long ms;
285 long events = 0; 285 long events = 0;
@@ -295,11 +295,11 @@ static int tstats_show(struct seq_file *m, void *v)
295 295
296 time = ktime_sub(time_stop, time_start); 296 time = ktime_sub(time_stop, time_start);
297 297
298 period = ktime_to_timespec(time); 298 period = ktime_to_timespec64(time);
299 ms = period.tv_nsec / 1000000; 299 ms = period.tv_nsec / 1000000;
300 300
301 seq_puts(m, "Timer Stats Version: v0.3\n"); 301 seq_puts(m, "Timer Stats Version: v0.3\n");
302 seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); 302 seq_printf(m, "Sample period: %ld.%03ld s\n", (long)period.tv_sec, ms);
303 if (atomic_read(&overflow_count)) 303 if (atomic_read(&overflow_count))
304 seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count)); 304 seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count));
305 seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive"); 305 seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive");
diff --git a/kernel/torture.c b/kernel/torture.c
index fa0bdeee17ac..75961b3decfe 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -82,6 +82,104 @@ static int min_online = -1;
82static int max_online; 82static int max_online;
83 83
84/* 84/*
85 * Attempt to take a CPU offline. Return false if the CPU is already
86 * offline or if it is not subject to CPU-hotplug operations. The
87 * caller can detect other failures by looking at the statistics.
88 */
89bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
90 unsigned long *sum_offl, int *min_offl, int *max_offl)
91{
92 unsigned long delta;
93 int ret;
94 unsigned long starttime;
95
96 if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu))
97 return false;
98
99 if (verbose)
100 pr_alert("%s" TORTURE_FLAG
101 "torture_onoff task: offlining %d\n",
102 torture_type, cpu);
103 starttime = jiffies;
104 (*n_offl_attempts)++;
105 ret = cpu_down(cpu);
106 if (ret) {
107 if (verbose)
108 pr_alert("%s" TORTURE_FLAG
109 "torture_onoff task: offline %d failed: errno %d\n",
110 torture_type, cpu, ret);
111 } else {
112 if (verbose)
113 pr_alert("%s" TORTURE_FLAG
114 "torture_onoff task: offlined %d\n",
115 torture_type, cpu);
116 (*n_offl_successes)++;
117 delta = jiffies - starttime;
118 *sum_offl += delta;
119 if (*min_offl < 0) {
120 *min_offl = delta;
121 *max_offl = delta;
122 }
123 if (*min_offl > delta)
124 *min_offl = delta;
125 if (*max_offl < delta)
126 *max_offl = delta;
127 }
128
129 return true;
130}
131EXPORT_SYMBOL_GPL(torture_offline);
132
133/*
134 * Attempt to bring a CPU online. Return false if the CPU is already
135 * online or if it is not subject to CPU-hotplug operations. The
136 * caller can detect other failures by looking at the statistics.
137 */
138bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes,
139 unsigned long *sum_onl, int *min_onl, int *max_onl)
140{
141 unsigned long delta;
142 int ret;
143 unsigned long starttime;
144
145 if (cpu_online(cpu) || !cpu_is_hotpluggable(cpu))
146 return false;
147
148 if (verbose)
149 pr_alert("%s" TORTURE_FLAG
150 "torture_onoff task: onlining %d\n",
151 torture_type, cpu);
152 starttime = jiffies;
153 (*n_onl_attempts)++;
154 ret = cpu_up(cpu);
155 if (ret) {
156 if (verbose)
157 pr_alert("%s" TORTURE_FLAG
158 "torture_onoff task: online %d failed: errno %d\n",
159 torture_type, cpu, ret);
160 } else {
161 if (verbose)
162 pr_alert("%s" TORTURE_FLAG
163 "torture_onoff task: onlined %d\n",
164 torture_type, cpu);
165 (*n_onl_successes)++;
166 delta = jiffies - starttime;
167 *sum_onl += delta;
168 if (*min_onl < 0) {
169 *min_onl = delta;
170 *max_onl = delta;
171 }
172 if (*min_onl > delta)
173 *min_onl = delta;
174 if (*max_onl < delta)
175 *max_onl = delta;
176 }
177
178 return true;
179}
180EXPORT_SYMBOL_GPL(torture_online);
181
182/*
85 * Execute random CPU-hotplug operations at the interval specified 183 * Execute random CPU-hotplug operations at the interval specified
86 * by the onoff_interval. 184 * by the onoff_interval.
87 */ 185 */
@@ -89,16 +187,19 @@ static int
89torture_onoff(void *arg) 187torture_onoff(void *arg)
90{ 188{
91 int cpu; 189 int cpu;
92 unsigned long delta;
93 int maxcpu = -1; 190 int maxcpu = -1;
94 DEFINE_TORTURE_RANDOM(rand); 191 DEFINE_TORTURE_RANDOM(rand);
95 int ret;
96 unsigned long starttime;
97 192
98 VERBOSE_TOROUT_STRING("torture_onoff task started"); 193 VERBOSE_TOROUT_STRING("torture_onoff task started");
99 for_each_online_cpu(cpu) 194 for_each_online_cpu(cpu)
100 maxcpu = cpu; 195 maxcpu = cpu;
101 WARN_ON(maxcpu < 0); 196 WARN_ON(maxcpu < 0);
197
198 if (maxcpu == 0) {
199 VERBOSE_TOROUT_STRING("Only one CPU, so CPU-hotplug testing is disabled");
200 goto stop;
201 }
202
102 if (onoff_holdoff > 0) { 203 if (onoff_holdoff > 0) {
103 VERBOSE_TOROUT_STRING("torture_onoff begin holdoff"); 204 VERBOSE_TOROUT_STRING("torture_onoff begin holdoff");
104 schedule_timeout_interruptible(onoff_holdoff); 205 schedule_timeout_interruptible(onoff_holdoff);
@@ -106,69 +207,16 @@ torture_onoff(void *arg)
106 } 207 }
107 while (!torture_must_stop()) { 208 while (!torture_must_stop()) {
108 cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); 209 cpu = (torture_random(&rand) >> 4) % (maxcpu + 1);
109 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { 210 if (!torture_offline(cpu,
110 if (verbose) 211 &n_offline_attempts, &n_offline_successes,
111 pr_alert("%s" TORTURE_FLAG 212 &sum_offline, &min_offline, &max_offline))
112 "torture_onoff task: offlining %d\n", 213 torture_online(cpu,
113 torture_type, cpu); 214 &n_online_attempts, &n_online_successes,
114 starttime = jiffies; 215 &sum_online, &min_online, &max_online);
115 n_offline_attempts++;
116 ret = cpu_down(cpu);
117 if (ret) {
118 if (verbose)
119 pr_alert("%s" TORTURE_FLAG
120 "torture_onoff task: offline %d failed: errno %d\n",
121 torture_type, cpu, ret);
122 } else {
123 if (verbose)
124 pr_alert("%s" TORTURE_FLAG
125 "torture_onoff task: offlined %d\n",
126 torture_type, cpu);
127 n_offline_successes++;
128 delta = jiffies - starttime;
129 sum_offline += delta;
130 if (min_offline < 0) {
131 min_offline = delta;
132 max_offline = delta;
133 }
134 if (min_offline > delta)
135 min_offline = delta;
136 if (max_offline < delta)
137 max_offline = delta;
138 }
139 } else if (cpu_is_hotpluggable(cpu)) {
140 if (verbose)
141 pr_alert("%s" TORTURE_FLAG
142 "torture_onoff task: onlining %d\n",
143 torture_type, cpu);
144 starttime = jiffies;
145 n_online_attempts++;
146 ret = cpu_up(cpu);
147 if (ret) {
148 if (verbose)
149 pr_alert("%s" TORTURE_FLAG
150 "torture_onoff task: online %d failed: errno %d\n",
151 torture_type, cpu, ret);
152 } else {
153 if (verbose)
154 pr_alert("%s" TORTURE_FLAG
155 "torture_onoff task: onlined %d\n",
156 torture_type, cpu);
157 n_online_successes++;
158 delta = jiffies - starttime;
159 sum_online += delta;
160 if (min_online < 0) {
161 min_online = delta;
162 max_online = delta;
163 }
164 if (min_online > delta)
165 min_online = delta;
166 if (max_online < delta)
167 max_online = delta;
168 }
169 }
170 schedule_timeout_interruptible(onoff_interval); 216 schedule_timeout_interruptible(onoff_interval);
171 } 217 }
218
219stop:
172 torture_kthread_stopping("torture_onoff"); 220 torture_kthread_stopping("torture_onoff");
173 return 0; 221 return 0;
174} 222}
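
With the hotplug attempt factored out, other torture tests can reuse torture_offline()/torture_online() with their own statistics. A minimal caller sketch, assuming the prototypes are made visible through include/linux/torture.h as the EXPORT_SYMBOL_GPL lines suggest; all names other than the two helpers are hypothetical.

#include <linux/types.h>
#include <linux/torture.h>

/* Caller-owned statistics for a hypothetical hotplug stress loop. */
static long n_offl_attempts, n_offl_successes;
static long n_onl_attempts, n_onl_successes;
static unsigned long sum_offl, sum_onl;
static int min_offl = -1, max_offl, min_onl = -1, max_onl;

static void stress_one_cpu(int cpu)
{
    /*
     * Try to take the CPU down; if it is already offline (or not
     * hotpluggable) torture_offline() returns false and we try to
     * bring it up instead. All counters stay with the caller.
     */
    if (!torture_offline(cpu, &n_offl_attempts, &n_offl_successes,
                         &sum_offl, &min_offl, &max_offl))
        torture_online(cpu, &n_onl_attempts, &n_onl_successes,
                       &sum_onl, &min_onl, &max_onl);
}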
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index fafeaf803bd0..f4b86e8ca1e7 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -542,6 +542,7 @@ config HIST_TRIGGERS
542 bool "Histogram triggers" 542 bool "Histogram triggers"
543 depends on ARCH_HAVE_NMI_SAFE_CMPXCHG 543 depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
544 select TRACING_MAP 544 select TRACING_MAP
545 select TRACING
545 default n 546 default n
546 help 547 help
547 Hist triggers allow one or more arbitrary trace event fields 548 Hist triggers allow one or more arbitrary trace event fields
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 9aef8654e90d..fb345cd11883 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -127,12 +127,13 @@ static void trace_note_tsk(struct task_struct *tsk)
127 127
128static void trace_note_time(struct blk_trace *bt) 128static void trace_note_time(struct blk_trace *bt)
129{ 129{
130 struct timespec now; 130 struct timespec64 now;
131 unsigned long flags; 131 unsigned long flags;
132 u32 words[2]; 132 u32 words[2];
133 133
134 getnstimeofday(&now); 134 /* need to check user space to see if this breaks in y2038 or y2106 */
135 words[0] = now.tv_sec; 135 ktime_get_real_ts64(&now);
136 words[0] = (u32)now.tv_sec;
136 words[1] = now.tv_nsec; 137 words[1] = now.tv_nsec;
137 138
138 local_irq_save(flags); 139 local_irq_save(flags);
@@ -189,6 +190,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
189 BLK_TC_ACT(BLK_TC_WRITE) }; 190 BLK_TC_ACT(BLK_TC_WRITE) };
190 191
191#define BLK_TC_RAHEAD BLK_TC_AHEAD 192#define BLK_TC_RAHEAD BLK_TC_AHEAD
193#define BLK_TC_PREFLUSH BLK_TC_FLUSH
192 194
193/* The ilog2() calls fall out because they're constant */ 195/* The ilog2() calls fall out because they're constant */
194#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \ 196#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \
@@ -199,7 +201,8 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
199 * blk_io_trace structure and places it in a per-cpu subbuffer. 201 * blk_io_trace structure and places it in a per-cpu subbuffer.
200 */ 202 */
201static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, 203static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
202 int rw, u32 what, int error, int pdu_len, void *pdu_data) 204 int op, int op_flags, u32 what, int error, int pdu_len,
205 void *pdu_data)
203{ 206{
204 struct task_struct *tsk = current; 207 struct task_struct *tsk = current;
205 struct ring_buffer_event *event = NULL; 208 struct ring_buffer_event *event = NULL;
@@ -214,13 +217,16 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
214 if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) 217 if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
215 return; 218 return;
216 219
217 what |= ddir_act[rw & WRITE]; 220 what |= ddir_act[op_is_write(op) ? WRITE : READ];
218 what |= MASK_TC_BIT(rw, SYNC); 221 what |= MASK_TC_BIT(op_flags, SYNC);
219 what |= MASK_TC_BIT(rw, RAHEAD); 222 what |= MASK_TC_BIT(op_flags, RAHEAD);
220 what |= MASK_TC_BIT(rw, META); 223 what |= MASK_TC_BIT(op_flags, META);
221 what |= MASK_TC_BIT(rw, DISCARD); 224 what |= MASK_TC_BIT(op_flags, PREFLUSH);
222 what |= MASK_TC_BIT(rw, FLUSH); 225 what |= MASK_TC_BIT(op_flags, FUA);
223 what |= MASK_TC_BIT(rw, FUA); 226 if (op == REQ_OP_DISCARD)
227 what |= BLK_TC_ACT(BLK_TC_DISCARD);
228 if (op == REQ_OP_FLUSH)
229 what |= BLK_TC_ACT(BLK_TC_FLUSH);
224 230
225 pid = tsk->pid; 231 pid = tsk->pid;
226 if (act_log_check(bt, what, sector, pid)) 232 if (act_log_check(bt, what, sector, pid))
@@ -708,11 +714,11 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
708 714
709 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 715 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
710 what |= BLK_TC_ACT(BLK_TC_PC); 716 what |= BLK_TC_ACT(BLK_TC_PC);
711 __blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags, 717 __blk_add_trace(bt, 0, nr_bytes, req_op(rq), rq->cmd_flags,
712 what, rq->errors, rq->cmd_len, rq->cmd); 718 what, rq->errors, rq->cmd_len, rq->cmd);
713 } else { 719 } else {
714 what |= BLK_TC_ACT(BLK_TC_FS); 720 what |= BLK_TC_ACT(BLK_TC_FS);
715 __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, 721 __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, req_op(rq),
716 rq->cmd_flags, what, rq->errors, 0, NULL); 722 rq->cmd_flags, what, rq->errors, 0, NULL);
717 } 723 }
718} 724}
@@ -770,7 +776,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
770 return; 776 return;
771 777
772 __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, 778 __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
773 bio->bi_rw, what, error, 0, NULL); 779 bio_op(bio), bio->bi_rw, what, error, 0, NULL);
774} 780}
775 781
776static void blk_add_trace_bio_bounce(void *ignore, 782static void blk_add_trace_bio_bounce(void *ignore,
@@ -818,7 +824,8 @@ static void blk_add_trace_getrq(void *ignore,
818 struct blk_trace *bt = q->blk_trace; 824 struct blk_trace *bt = q->blk_trace;
819 825
820 if (bt) 826 if (bt)
821 __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL); 827 __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
828 NULL);
822 } 829 }
823} 830}
824 831
@@ -833,7 +840,7 @@ static void blk_add_trace_sleeprq(void *ignore,
833 struct blk_trace *bt = q->blk_trace; 840 struct blk_trace *bt = q->blk_trace;
834 841
835 if (bt) 842 if (bt)
836 __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, 843 __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
837 0, 0, NULL); 844 0, 0, NULL);
838 } 845 }
839} 846}
@@ -843,7 +850,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
843 struct blk_trace *bt = q->blk_trace; 850 struct blk_trace *bt = q->blk_trace;
844 851
845 if (bt) 852 if (bt)
846 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); 853 __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
847} 854}
848 855
849static void blk_add_trace_unplug(void *ignore, struct request_queue *q, 856static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
@@ -860,7 +867,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
860 else 867 else
861 what = BLK_TA_UNPLUG_TIMER; 868 what = BLK_TA_UNPLUG_TIMER;
862 869
863 __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); 870 __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
864 } 871 }
865} 872}
866 873
@@ -874,8 +881,9 @@ static void blk_add_trace_split(void *ignore,
874 __be64 rpdu = cpu_to_be64(pdu); 881 __be64 rpdu = cpu_to_be64(pdu);
875 882
876 __blk_add_trace(bt, bio->bi_iter.bi_sector, 883 __blk_add_trace(bt, bio->bi_iter.bi_sector,
877 bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT, 884 bio->bi_iter.bi_size, bio_op(bio), bio->bi_rw,
878 bio->bi_error, sizeof(rpdu), &rpdu); 885 BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu),
886 &rpdu);
879 } 887 }
880} 888}
881 889
@@ -907,7 +915,7 @@ static void blk_add_trace_bio_remap(void *ignore,
907 r.sector_from = cpu_to_be64(from); 915 r.sector_from = cpu_to_be64(from);
908 916
909 __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, 917 __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
910 bio->bi_rw, BLK_TA_REMAP, bio->bi_error, 918 bio_op(bio), bio->bi_rw, BLK_TA_REMAP, bio->bi_error,
911 sizeof(r), &r); 919 sizeof(r), &r);
912} 920}
913 921
@@ -940,7 +948,7 @@ static void blk_add_trace_rq_remap(void *ignore,
940 r.sector_from = cpu_to_be64(from); 948 r.sector_from = cpu_to_be64(from);
941 949
942 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 950 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
943 rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors, 951 rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors,
944 sizeof(r), &r); 952 sizeof(r), &r);
945} 953}
946 954
@@ -965,10 +973,10 @@ void blk_add_driver_data(struct request_queue *q,
965 return; 973 return;
966 974
967 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) 975 if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
968 __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 976 __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 0,
969 BLK_TA_DRV_DATA, rq->errors, len, data); 977 BLK_TA_DRV_DATA, rq->errors, len, data);
970 else 978 else
971 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, 979 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, 0,
972 BLK_TA_DRV_DATA, rq->errors, len, data); 980 BLK_TA_DRV_DATA, rq->errors, len, data);
973} 981}
974EXPORT_SYMBOL_GPL(blk_add_driver_data); 982EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -1769,21 +1777,34 @@ void blk_dump_cmd(char *buf, struct request *rq)
1769 } 1777 }
1770} 1778}
1771 1779
1772void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) 1780void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes)
1773{ 1781{
1774 int i = 0; 1782 int i = 0;
1775 1783
1776 if (rw & REQ_FLUSH) 1784 if (rw & REQ_PREFLUSH)
1777 rwbs[i++] = 'F'; 1785 rwbs[i++] = 'F';
1778 1786
1779 if (rw & WRITE) 1787 switch (op) {
1788 case REQ_OP_WRITE:
1789 case REQ_OP_WRITE_SAME:
1780 rwbs[i++] = 'W'; 1790 rwbs[i++] = 'W';
1781 else if (rw & REQ_DISCARD) 1791 break;
1792 case REQ_OP_DISCARD:
1793 rwbs[i++] = 'D';
1794 break;
1795 case REQ_OP_SECURE_ERASE:
1782 rwbs[i++] = 'D'; 1796 rwbs[i++] = 'D';
1783 else if (bytes) 1797 rwbs[i++] = 'E';
1798 break;
1799 case REQ_OP_FLUSH:
1800 rwbs[i++] = 'F';
1801 break;
1802 case REQ_OP_READ:
1784 rwbs[i++] = 'R'; 1803 rwbs[i++] = 'R';
1785 else 1804 break;
1805 default:
1786 rwbs[i++] = 'N'; 1806 rwbs[i++] = 'N';
1807 }
1787 1808
1788 if (rw & REQ_FUA) 1809 if (rw & REQ_FUA)
1789 rwbs[i++] = 'F'; 1810 rwbs[i++] = 'F';
@@ -1793,8 +1814,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1793 rwbs[i++] = 'S'; 1814 rwbs[i++] = 'S';
1794 if (rw & REQ_META) 1815 if (rw & REQ_META)
1795 rwbs[i++] = 'M'; 1816 rwbs[i++] = 'M';
1796 if (rw & REQ_SECURE)
1797 rwbs[i++] = 'E';
1798 1817
1799 rwbs[i] = '\0'; 1818 rwbs[i] = '\0';
1800} 1819}
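
The reworked blk_fill_rwbs() derives the data-direction letter from the operation code and the modifier letters from the flag word. A userspace model of that mapping; the toy op and flag values below are stand-ins for REQ_OP_* and REQ_*, not the kernel's actual bit layout.

#include <stdio.h>

enum toy_op { OP_READ, OP_WRITE, OP_DISCARD, OP_SECURE_ERASE, OP_FLUSH,
              OP_WRITE_SAME };
#define F_PREFLUSH  (1u << 0)
#define F_FUA       (1u << 1)
#define F_RAHEAD    (1u << 2)
#define F_SYNC      (1u << 3)
#define F_META      (1u << 4)

static void fill_rwbs(char *rwbs, enum toy_op op, unsigned int flags)
{
    int i = 0;

    if (flags & F_PREFLUSH)
        rwbs[i++] = 'F';

    switch (op) {                   /* direction comes from the op */
    case OP_WRITE:
    case OP_WRITE_SAME:
        rwbs[i++] = 'W';
        break;
    case OP_DISCARD:
        rwbs[i++] = 'D';
        break;
    case OP_SECURE_ERASE:
        rwbs[i++] = 'D';
        rwbs[i++] = 'E';
        break;
    case OP_FLUSH:
        rwbs[i++] = 'F';
        break;
    case OP_READ:
        rwbs[i++] = 'R';
        break;
    default:
        rwbs[i++] = 'N';
    }

    if (flags & F_FUA)              /* modifiers come from the flags */
        rwbs[i++] = 'F';
    if (flags & F_RAHEAD)
        rwbs[i++] = 'A';
    if (flags & F_SYNC)
        rwbs[i++] = 'S';
    if (flags & F_META)
        rwbs[i++] = 'M';
    rwbs[i] = '\0';
}

int main(void)
{
    char rwbs[8];

    fill_rwbs(rwbs, OP_WRITE, F_SYNC | F_FUA);
    printf("%s\n", rwbs);           /* prints "WFS" */
    return 0;
}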
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 780bcbe1d4de..b20438fdb029 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -81,6 +81,49 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
81 .arg3_type = ARG_ANYTHING, 81 .arg3_type = ARG_ANYTHING,
82}; 82};
83 83
84static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
85{
86 void *unsafe_ptr = (void *) (long) r1;
87 void *src = (void *) (long) r2;
88 int size = (int) r3;
89
90 /*
91 * Ensure we're in user context which is safe for the helper to
92 * run. This helper has no business in a kthread.
93 *
94 * access_ok() should prevent writing to non-user memory, but in
95 * some situations (nommu, temporary switch, etc) access_ok() does
96 * not provide enough validation, hence the check on KERNEL_DS.
97 */
98
99 if (unlikely(in_interrupt() ||
100 current->flags & (PF_KTHREAD | PF_EXITING)))
101 return -EPERM;
102 if (unlikely(segment_eq(get_fs(), KERNEL_DS)))
103 return -EPERM;
104 if (!access_ok(VERIFY_WRITE, unsafe_ptr, size))
105 return -EPERM;
106
107 return probe_kernel_write(unsafe_ptr, src, size);
108}
109
110static const struct bpf_func_proto bpf_probe_write_user_proto = {
111 .func = bpf_probe_write_user,
112 .gpl_only = true,
113 .ret_type = RET_INTEGER,
114 .arg1_type = ARG_ANYTHING,
115 .arg2_type = ARG_PTR_TO_STACK,
116 .arg3_type = ARG_CONST_STACK_SIZE,
117};
118
119static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
120{
121 pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!",
122 current->comm, task_pid_nr(current));
123
124 return &bpf_probe_write_user_proto;
125}
126
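
A hypothetical BPF-side sketch of the new bpf_probe_write_user() helper follows. The helper itself is what this hunk adds; the SEC() macro, bpf_helpers.h, PT_REGS_PARM2() and the kprobe attach point are assumptions in the samples/bpf style, and the program as a whole is illustrative, not part of this patch. Note the helper is GPL-only, refuses interrupt/kthread/KERNEL_DS contexts with -EPERM, and loading any program that requests it triggers the ratelimited warning above.

/* Illustrative sketch only; headers and attach point are assumptions. */
#include <uapi/linux/bpf.h>
#include <uapi/linux/ptrace.h>
#include "bpf_helpers.h"

SEC("kprobe/sys_openat")
int rewrite_path(struct pt_regs *ctx)
{
        /* second syscall argument: userspace pointer to the pathname */
        void *upath = (void *)PT_REGS_PARM2(ctx);
        char newpath[] = "/tmp/redirected";

        /* Returns -EPERM when called from an unsafe context or when the
         * destination fails access_ok(). */
        bpf_probe_write_user(upath, newpath, sizeof(newpath));
        return 0;
}

char _license[] SEC("license") = "GPL";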
84/* 127/*
85 * limited trace_printk() 128 * limited trace_printk()
86 * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed 129 * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
@@ -188,25 +231,33 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
188 return &bpf_trace_printk_proto; 231 return &bpf_trace_printk_proto;
189} 232}
190 233
191static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) 234static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5)
192{ 235{
193 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; 236 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
194 struct bpf_array *array = container_of(map, struct bpf_array, map); 237 struct bpf_array *array = container_of(map, struct bpf_array, map);
238 unsigned int cpu = smp_processor_id();
239 u64 index = flags & BPF_F_INDEX_MASK;
240 struct bpf_event_entry *ee;
195 struct perf_event *event; 241 struct perf_event *event;
196 struct file *file;
197 242
243 if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
244 return -EINVAL;
245 if (index == BPF_F_CURRENT_CPU)
246 index = cpu;
198 if (unlikely(index >= array->map.max_entries)) 247 if (unlikely(index >= array->map.max_entries))
199 return -E2BIG; 248 return -E2BIG;
200 249
201 file = (struct file *)array->ptrs[index]; 250 ee = READ_ONCE(array->ptrs[index]);
202 if (unlikely(!file)) 251 if (!ee)
203 return -ENOENT; 252 return -ENOENT;
204 253
205 event = file->private_data; 254 event = ee->event;
255 if (unlikely(event->attr.type != PERF_TYPE_HARDWARE &&
256 event->attr.type != PERF_TYPE_RAW))
257 return -EINVAL;
206 258
207 /* make sure event is local and doesn't have pmu::count */ 259 /* make sure event is local and doesn't have pmu::count */
208 if (event->oncpu != smp_processor_id() || 260 if (unlikely(event->oncpu != cpu || event->pmu->count))
209 event->pmu->count)
210 return -EINVAL; 261 return -EINVAL;
211 262
212 /* 263 /*
@@ -225,47 +276,58 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
225 .arg2_type = ARG_ANYTHING, 276 .arg2_type = ARG_ANYTHING,
226}; 277};
227 278
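
With this change bpf_perf_event_read() takes a flags word instead of a raw index, so BPF_F_CURRENT_CPU can select the current CPU's slot, and the event must be a local hardware or raw event without a pmu->count override. The sketch below is illustrative only: the map definition style, SEC() macro and attach point follow samples/bpf conventions and are assumptions, not part of the patch.

/* Illustrative sketch; map/SEC conventions are assumptions. */
#include <uapi/linux/bpf.h>
#include <uapi/linux/ptrace.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") cycles_map = {
        .type        = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
        .key_size    = sizeof(int),
        .value_size  = sizeof(__u32),
        .max_entries = 64,      /* should cover all possible CPUs */
};

SEC("kprobe/finish_task_switch")
int on_switch(struct pt_regs *ctx)
{
        /* BPF_F_CURRENT_CPU picks this CPU's slot; raw indices still work */
        __s64 count = bpf_perf_event_read(&cycles_map, BPF_F_CURRENT_CPU);

        if (count < 0)          /* -EINVAL / -ENOENT / -E2BIG */
                return 0;
        /* ... accumulate 'count' in another map ... */
        return 0;
}

char _license[] SEC("license") = "GPL";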
228static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) 279static __always_inline u64
280__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
281 u64 flags, struct perf_raw_record *raw)
229{ 282{
230 struct pt_regs *regs = (struct pt_regs *) (long) r1;
231 struct bpf_map *map = (struct bpf_map *) (long) r2;
232 struct bpf_array *array = container_of(map, struct bpf_array, map); 283 struct bpf_array *array = container_of(map, struct bpf_array, map);
284 unsigned int cpu = smp_processor_id();
233 u64 index = flags & BPF_F_INDEX_MASK; 285 u64 index = flags & BPF_F_INDEX_MASK;
234 void *data = (void *) (long) r4;
235 struct perf_sample_data sample_data; 286 struct perf_sample_data sample_data;
287 struct bpf_event_entry *ee;
236 struct perf_event *event; 288 struct perf_event *event;
237 struct file *file;
238 struct perf_raw_record raw = {
239 .size = size,
240 .data = data,
241 };
242 289
243 if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
244 return -EINVAL;
245 if (index == BPF_F_CURRENT_CPU) 290 if (index == BPF_F_CURRENT_CPU)
246 index = raw_smp_processor_id(); 291 index = cpu;
247 if (unlikely(index >= array->map.max_entries)) 292 if (unlikely(index >= array->map.max_entries))
248 return -E2BIG; 293 return -E2BIG;
249 294
250 file = (struct file *)array->ptrs[index]; 295 ee = READ_ONCE(array->ptrs[index]);
251 if (unlikely(!file)) 296 if (!ee)
252 return -ENOENT; 297 return -ENOENT;
253 298
254 event = file->private_data; 299 event = ee->event;
255
256 if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || 300 if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
257 event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) 301 event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
258 return -EINVAL; 302 return -EINVAL;
259 303
260 if (unlikely(event->oncpu != smp_processor_id())) 304 if (unlikely(event->oncpu != cpu))
261 return -EOPNOTSUPP; 305 return -EOPNOTSUPP;
262 306
263 perf_sample_data_init(&sample_data, 0, 0); 307 perf_sample_data_init(&sample_data, 0, 0);
264 sample_data.raw = &raw; 308 sample_data.raw = raw;
265 perf_event_output(event, &sample_data, regs); 309 perf_event_output(event, &sample_data, regs);
266 return 0; 310 return 0;
267} 311}
268 312
313static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
314{
315 struct pt_regs *regs = (struct pt_regs *)(long) r1;
316 struct bpf_map *map = (struct bpf_map *)(long) r2;
317 void *data = (void *)(long) r4;
318 struct perf_raw_record raw = {
319 .frag = {
320 .size = size,
321 .data = data,
322 },
323 };
324
325 if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
326 return -EINVAL;
327
328 return __bpf_perf_event_output(regs, map, flags, &raw);
329}
330
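
The program-facing signature of bpf_perf_event_output() is unchanged by this refactor; what moves into __bpf_perf_event_output() is the perf_raw_record setup, so that other callers can pass a pre-built record with a chained fragment. A hypothetical caller might look like the sketch below; the map definition, struct layout, SEC() macro and attach point are samples/bpf-style assumptions, not part of the patch.

/* Illustrative sketch; names and conventions are assumptions. */
#include <uapi/linux/bpf.h>
#include <uapi/linux/ptrace.h>
#include "bpf_helpers.h"

struct event {
        __u32 pid;
        __u64 ts;
};

struct bpf_map_def SEC("maps") events = {
        .type        = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
        .key_size    = sizeof(int),
        .value_size  = sizeof(__u32),
        .max_entries = 64,
};

SEC("kprobe/sys_write")
int emit_event(struct pt_regs *ctx)
{
        struct event e = {
                .pid = bpf_get_current_pid_tgid() >> 32,
                .ts  = bpf_ktime_get_ns(),
        };

        /* 'e' becomes the meta fragment; the skb/xdp flavours of the
         * helper chain a second fragment for payload via bpf_event_output(). */
        bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &e, sizeof(e));
        return 0;
}

char _license[] SEC("license") = "GPL";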
269static const struct bpf_func_proto bpf_perf_event_output_proto = { 331static const struct bpf_func_proto bpf_perf_event_output_proto = {
270 .func = bpf_perf_event_output, 332 .func = bpf_perf_event_output,
271 .gpl_only = true, 333 .gpl_only = true,
@@ -279,31 +341,41 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
279 341
280static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); 342static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
281 343
282static u64 bpf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) 344u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
345 void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
283{ 346{
284 struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); 347 struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
348 struct perf_raw_frag frag = {
349 .copy = ctx_copy,
350 .size = ctx_size,
351 .data = ctx,
352 };
353 struct perf_raw_record raw = {
354 .frag = {
355 {
356 .next = ctx_size ? &frag : NULL,
357 },
358 .size = meta_size,
359 .data = meta,
360 },
361 };
285 362
286 perf_fetch_caller_regs(regs); 363 perf_fetch_caller_regs(regs);
287 364
288 return bpf_perf_event_output((long)regs, r2, flags, r4, size); 365 return __bpf_perf_event_output(regs, map, flags, &raw);
289} 366}
290 367
291static const struct bpf_func_proto bpf_event_output_proto = { 368static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
292 .func = bpf_event_output, 369{
370 return (long) current;
371}
372
373static const struct bpf_func_proto bpf_get_current_task_proto = {
374 .func = bpf_get_current_task,
293 .gpl_only = true, 375 .gpl_only = true,
294 .ret_type = RET_INTEGER, 376 .ret_type = RET_INTEGER,
295 .arg1_type = ARG_PTR_TO_CTX,
296 .arg2_type = ARG_CONST_MAP_PTR,
297 .arg3_type = ARG_ANYTHING,
298 .arg4_type = ARG_PTR_TO_STACK,
299 .arg5_type = ARG_CONST_STACK_SIZE,
300}; 377};
301 378
302const struct bpf_func_proto *bpf_get_event_output_proto(void)
303{
304 return &bpf_event_output_proto;
305}
306
307static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) 379static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
308{ 380{
309 switch (func_id) { 381 switch (func_id) {
@@ -321,6 +393,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
321 return &bpf_tail_call_proto; 393 return &bpf_tail_call_proto;
322 case BPF_FUNC_get_current_pid_tgid: 394 case BPF_FUNC_get_current_pid_tgid:
323 return &bpf_get_current_pid_tgid_proto; 395 return &bpf_get_current_pid_tgid_proto;
396 case BPF_FUNC_get_current_task:
397 return &bpf_get_current_task_proto;
324 case BPF_FUNC_get_current_uid_gid: 398 case BPF_FUNC_get_current_uid_gid:
325 return &bpf_get_current_uid_gid_proto; 399 return &bpf_get_current_uid_gid_proto;
326 case BPF_FUNC_get_current_comm: 400 case BPF_FUNC_get_current_comm:
@@ -331,6 +405,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
331 return &bpf_get_smp_processor_id_proto; 405 return &bpf_get_smp_processor_id_proto;
332 case BPF_FUNC_perf_event_read: 406 case BPF_FUNC_perf_event_read:
333 return &bpf_perf_event_read_proto; 407 return &bpf_perf_event_read_proto;
408 case BPF_FUNC_probe_write_user:
409 return bpf_get_probe_write_proto();
334 default: 410 default:
335 return NULL; 411 return NULL;
336 } 412 }
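
The new bpf_get_current_task() helper simply returns the address of the current task_struct as an integer; any field access still has to go through bpf_probe_read(). A minimal, hypothetical sketch follows; the SEC() macro, bpf_helpers.h, the kernel-header include and the attach point are samples/bpf-style assumptions.

/* Illustrative sketch; headers and attach point are assumptions. */
#include <uapi/linux/bpf.h>
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
#include "bpf_helpers.h"

SEC("kprobe/finish_task_switch")
int snapshot_comm(struct pt_regs *ctx)
{
        struct task_struct *task = (struct task_struct *)bpf_get_current_task();
        char comm[TASK_COMM_LEN];

        /* The returned value is only a kernel address; fields must be
         * fetched with bpf_probe_read(). */
        bpf_probe_read(&comm, sizeof(comm), task->comm);
        /* ... use comm as a map key, filter on it, etc. ... */
        return 0;
}

char _license[] SEC("license") = "GPL";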
@@ -349,20 +425,15 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
349} 425}
350 426
351/* bpf+kprobe programs can access fields of 'struct pt_regs' */ 427/* bpf+kprobe programs can access fields of 'struct pt_regs' */
352static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type) 428static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
429 enum bpf_reg_type *reg_type)
353{ 430{
354 /* check bounds */
355 if (off < 0 || off >= sizeof(struct pt_regs)) 431 if (off < 0 || off >= sizeof(struct pt_regs))
356 return false; 432 return false;
357
358 /* only read is allowed */
359 if (type != BPF_READ) 433 if (type != BPF_READ)
360 return false; 434 return false;
361
362 /* disallow misaligned access */
363 if (off % size != 0) 435 if (off % size != 0)
364 return false; 436 return false;
365
366 return true; 437 return true;
367} 438}
368 439
@@ -427,7 +498,8 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
427 } 498 }
428} 499}
429 500
430static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type) 501static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
502 enum bpf_reg_type *reg_type)
431{ 503{
432 if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) 504 if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
433 return false; 505 return false;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 900dbb1efff2..84752c8e28b5 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -89,16 +89,16 @@ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
89/* What to set function_trace_op to */ 89/* What to set function_trace_op to */
90static struct ftrace_ops *set_function_trace_op; 90static struct ftrace_ops *set_function_trace_op;
91 91
92/* List for set_ftrace_pid's pids. */ 92static bool ftrace_pids_enabled(struct ftrace_ops *ops)
93LIST_HEAD(ftrace_pids);
94struct ftrace_pid {
95 struct list_head list;
96 struct pid *pid;
97};
98
99static bool ftrace_pids_enabled(void)
100{ 93{
101 return !list_empty(&ftrace_pids); 94 struct trace_array *tr;
95
96 if (!(ops->flags & FTRACE_OPS_FL_PID) || !ops->private)
97 return false;
98
99 tr = ops->private;
100
101 return tr->function_pids != NULL;
102} 102}
103 103
104static void ftrace_update_trampoline(struct ftrace_ops *ops); 104static void ftrace_update_trampoline(struct ftrace_ops *ops);
@@ -179,7 +179,9 @@ int ftrace_nr_registered_ops(void)
179static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, 179static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
180 struct ftrace_ops *op, struct pt_regs *regs) 180 struct ftrace_ops *op, struct pt_regs *regs)
181{ 181{
182 if (!test_tsk_trace_trace(current)) 182 struct trace_array *tr = op->private;
183
184 if (tr && this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid))
183 return; 185 return;
184 186
185 op->saved_func(ip, parent_ip, op, regs); 187 op->saved_func(ip, parent_ip, op, regs);
@@ -417,7 +419,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
417 /* Always save the function, and reset at unregistering */ 419 /* Always save the function, and reset at unregistering */
418 ops->saved_func = ops->func; 420 ops->saved_func = ops->func;
419 421
420 if (ops->flags & FTRACE_OPS_FL_PID && ftrace_pids_enabled()) 422 if (ftrace_pids_enabled(ops))
421 ops->func = ftrace_pid_func; 423 ops->func = ftrace_pid_func;
422 424
423 ftrace_update_trampoline(ops); 425 ftrace_update_trampoline(ops);
@@ -450,7 +452,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
450 452
451static void ftrace_update_pid_func(void) 453static void ftrace_update_pid_func(void)
452{ 454{
453 bool enabled = ftrace_pids_enabled();
454 struct ftrace_ops *op; 455 struct ftrace_ops *op;
455 456
456 /* Only do something if we are tracing something */ 457 /* Only do something if we are tracing something */
@@ -459,8 +460,8 @@ static void ftrace_update_pid_func(void)
459 460
460 do_for_each_ftrace_op(op, ftrace_ops_list) { 461 do_for_each_ftrace_op(op, ftrace_ops_list) {
461 if (op->flags & FTRACE_OPS_FL_PID) { 462 if (op->flags & FTRACE_OPS_FL_PID) {
462 op->func = enabled ? ftrace_pid_func : 463 op->func = ftrace_pids_enabled(op) ?
463 op->saved_func; 464 ftrace_pid_func : op->saved_func;
464 ftrace_update_trampoline(op); 465 ftrace_update_trampoline(op);
465 } 466 }
466 } while_for_each_ftrace_op(op); 467 } while_for_each_ftrace_op(op);
@@ -5324,179 +5325,99 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
5324 return ops->func; 5325 return ops->func;
5325} 5326}
5326 5327
5327static void clear_ftrace_swapper(void) 5328static void
5329ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
5330 struct task_struct *prev, struct task_struct *next)
5328{ 5331{
5329 struct task_struct *p; 5332 struct trace_array *tr = data;
5330 int cpu; 5333 struct trace_pid_list *pid_list;
5331 5334
5332 get_online_cpus(); 5335 pid_list = rcu_dereference_sched(tr->function_pids);
5333 for_each_online_cpu(cpu) {
5334 p = idle_task(cpu);
5335 clear_tsk_trace_trace(p);
5336 }
5337 put_online_cpus();
5338}
5339
5340static void set_ftrace_swapper(void)
5341{
5342 struct task_struct *p;
5343 int cpu;
5344 5336
5345 get_online_cpus(); 5337 this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid,
5346 for_each_online_cpu(cpu) { 5338 trace_ignore_this_task(pid_list, next));
5347 p = idle_task(cpu);
5348 set_tsk_trace_trace(p);
5349 }
5350 put_online_cpus();
5351} 5339}
5352 5340
5353static void clear_ftrace_pid(struct pid *pid) 5341static void clear_ftrace_pids(struct trace_array *tr)
5354{ 5342{
5355 struct task_struct *p; 5343 struct trace_pid_list *pid_list;
5344 int cpu;
5356 5345
5357 rcu_read_lock(); 5346 pid_list = rcu_dereference_protected(tr->function_pids,
5358 do_each_pid_task(pid, PIDTYPE_PID, p) { 5347 lockdep_is_held(&ftrace_lock));
5359 clear_tsk_trace_trace(p); 5348 if (!pid_list)
5360 } while_each_pid_task(pid, PIDTYPE_PID, p); 5349 return;
5361 rcu_read_unlock();
5362 5350
5363 put_pid(pid); 5351 unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr);
5364}
5365 5352
5366static void set_ftrace_pid(struct pid *pid) 5353 for_each_possible_cpu(cpu)
5367{ 5354 per_cpu_ptr(tr->trace_buffer.data, cpu)->ftrace_ignore_pid = false;
5368 struct task_struct *p;
5369 5355
5370 rcu_read_lock(); 5356 rcu_assign_pointer(tr->function_pids, NULL);
5371 do_each_pid_task(pid, PIDTYPE_PID, p) {
5372 set_tsk_trace_trace(p);
5373 } while_each_pid_task(pid, PIDTYPE_PID, p);
5374 rcu_read_unlock();
5375}
5376 5357
5377static void clear_ftrace_pid_task(struct pid *pid) 5358 /* Wait till all users are no longer using pid filtering */
5378{ 5359 synchronize_sched();
5379 if (pid == ftrace_swapper_pid)
5380 clear_ftrace_swapper();
5381 else
5382 clear_ftrace_pid(pid);
5383}
5384 5360
5385static void set_ftrace_pid_task(struct pid *pid) 5361 trace_free_pid_list(pid_list);
5386{
5387 if (pid == ftrace_swapper_pid)
5388 set_ftrace_swapper();
5389 else
5390 set_ftrace_pid(pid);
5391} 5362}
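
clear_ftrace_pids() follows the usual RCU retire pattern: unpublish the pointer, wait for existing readers, then free. The ftrace code uses synchronize_sched() because its readers run in sched/preempt-disabled context. The fragment below is a generic kernel-style sketch of the same pattern with illustrative names (my_cfg, cfg_lock) and synchronize_rcu(); it is not taken from this patch.

/* Kernel-style sketch of publish/retire; names are illustrative. */
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/errno.h>

struct cfg { int val; };

static struct cfg __rcu *my_cfg;
static DEFINE_MUTEX(cfg_lock);

static int update_cfg(int val)
{
        struct cfg *new, *old;

        new = kmalloc(sizeof(*new), GFP_KERNEL);
        if (!new)
                return -ENOMEM;
        new->val = val;

        mutex_lock(&cfg_lock);
        old = rcu_dereference_protected(my_cfg, lockdep_is_held(&cfg_lock));
        rcu_assign_pointer(my_cfg, new);        /* publish the new list */
        mutex_unlock(&cfg_lock);

        /* No new readers can see 'old'; wait out the existing ones. */
        synchronize_rcu();
        kfree(old);
        return 0;
}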
5392 5363
5393static int ftrace_pid_add(int p) 5364static void ftrace_pid_reset(struct trace_array *tr)
5394{ 5365{
5395 struct pid *pid;
5396 struct ftrace_pid *fpid;
5397 int ret = -EINVAL;
5398
5399 mutex_lock(&ftrace_lock); 5366 mutex_lock(&ftrace_lock);
5400 5367 clear_ftrace_pids(tr);
5401 if (!p)
5402 pid = ftrace_swapper_pid;
5403 else
5404 pid = find_get_pid(p);
5405
5406 if (!pid)
5407 goto out;
5408
5409 ret = 0;
5410
5411 list_for_each_entry(fpid, &ftrace_pids, list)
5412 if (fpid->pid == pid)
5413 goto out_put;
5414
5415 ret = -ENOMEM;
5416
5417 fpid = kmalloc(sizeof(*fpid), GFP_KERNEL);
5418 if (!fpid)
5419 goto out_put;
5420
5421 list_add(&fpid->list, &ftrace_pids);
5422 fpid->pid = pid;
5423
5424 set_ftrace_pid_task(pid);
5425 5368
5426 ftrace_update_pid_func(); 5369 ftrace_update_pid_func();
5427
5428 ftrace_startup_all(0); 5370 ftrace_startup_all(0);
5429 5371
5430 mutex_unlock(&ftrace_lock); 5372 mutex_unlock(&ftrace_lock);
5431 return 0;
5432
5433out_put:
5434 if (pid != ftrace_swapper_pid)
5435 put_pid(pid);
5436
5437out:
5438 mutex_unlock(&ftrace_lock);
5439 return ret;
5440} 5373}
5441 5374
5442static void ftrace_pid_reset(void) 5375/* Greater than any max PID */
5443{ 5376#define FTRACE_NO_PIDS (void *)(PID_MAX_LIMIT + 1)
5444 struct ftrace_pid *fpid, *safe;
5445
5446 mutex_lock(&ftrace_lock);
5447 list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) {
5448 struct pid *pid = fpid->pid;
5449
5450 clear_ftrace_pid_task(pid);
5451
5452 list_del(&fpid->list);
5453 kfree(fpid);
5454 }
5455
5456 ftrace_update_pid_func();
5457 ftrace_startup_all(0);
5458
5459 mutex_unlock(&ftrace_lock);
5460}
5461 5377
5462static void *fpid_start(struct seq_file *m, loff_t *pos) 5378static void *fpid_start(struct seq_file *m, loff_t *pos)
5379 __acquires(RCU)
5463{ 5380{
5381 struct trace_pid_list *pid_list;
5382 struct trace_array *tr = m->private;
5383
5464 mutex_lock(&ftrace_lock); 5384 mutex_lock(&ftrace_lock);
5385 rcu_read_lock_sched();
5465 5386
5466 if (!ftrace_pids_enabled() && (!*pos)) 5387 pid_list = rcu_dereference_sched(tr->function_pids);
5467 return (void *) 1;
5468 5388
5469 return seq_list_start(&ftrace_pids, *pos); 5389 if (!pid_list)
5390 return !(*pos) ? FTRACE_NO_PIDS : NULL;
5391
5392 return trace_pid_start(pid_list, pos);
5470} 5393}
5471 5394
5472static void *fpid_next(struct seq_file *m, void *v, loff_t *pos) 5395static void *fpid_next(struct seq_file *m, void *v, loff_t *pos)
5473{ 5396{
5474 if (v == (void *)1) 5397 struct trace_array *tr = m->private;
5398 struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_pids);
5399
5400 if (v == FTRACE_NO_PIDS)
5475 return NULL; 5401 return NULL;
5476 5402
5477 return seq_list_next(v, &ftrace_pids, pos); 5403 return trace_pid_next(pid_list, v, pos);
5478} 5404}
5479 5405
5480static void fpid_stop(struct seq_file *m, void *p) 5406static void fpid_stop(struct seq_file *m, void *p)
5407 __releases(RCU)
5481{ 5408{
5409 rcu_read_unlock_sched();
5482 mutex_unlock(&ftrace_lock); 5410 mutex_unlock(&ftrace_lock);
5483} 5411}
5484 5412
5485static int fpid_show(struct seq_file *m, void *v) 5413static int fpid_show(struct seq_file *m, void *v)
5486{ 5414{
5487 const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); 5415 if (v == FTRACE_NO_PIDS) {
5488
5489 if (v == (void *)1) {
5490 seq_puts(m, "no pid\n"); 5416 seq_puts(m, "no pid\n");
5491 return 0; 5417 return 0;
5492 } 5418 }
5493 5419
5494 if (fpid->pid == ftrace_swapper_pid) 5420 return trace_pid_show(m, v);
5495 seq_puts(m, "swapper tasks\n");
5496 else
5497 seq_printf(m, "%u\n", pid_vnr(fpid->pid));
5498
5499 return 0;
5500} 5421}
5501 5422
5502static const struct seq_operations ftrace_pid_sops = { 5423static const struct seq_operations ftrace_pid_sops = {
@@ -5509,58 +5430,103 @@ static const struct seq_operations ftrace_pid_sops = {
5509static int 5430static int
5510ftrace_pid_open(struct inode *inode, struct file *file) 5431ftrace_pid_open(struct inode *inode, struct file *file)
5511{ 5432{
5433 struct trace_array *tr = inode->i_private;
5434 struct seq_file *m;
5512 int ret = 0; 5435 int ret = 0;
5513 5436
5437 if (trace_array_get(tr) < 0)
5438 return -ENODEV;
5439
5514 if ((file->f_mode & FMODE_WRITE) && 5440 if ((file->f_mode & FMODE_WRITE) &&
5515 (file->f_flags & O_TRUNC)) 5441 (file->f_flags & O_TRUNC))
5516 ftrace_pid_reset(); 5442 ftrace_pid_reset(tr);
5517 5443
5518 if (file->f_mode & FMODE_READ) 5444 ret = seq_open(file, &ftrace_pid_sops);
5519 ret = seq_open(file, &ftrace_pid_sops); 5445 if (ret < 0) {
5446 trace_array_put(tr);
5447 } else {
5448 m = file->private_data;
5449 /* copy tr over to seq ops */
5450 m->private = tr;
5451 }
5520 5452
5521 return ret; 5453 return ret;
5522} 5454}
5523 5455
5456static void ignore_task_cpu(void *data)
5457{
5458 struct trace_array *tr = data;
5459 struct trace_pid_list *pid_list;
5460
5461 /*
5462 * This function is called by on_each_cpu() while the
5463 * event_mutex is held.
5464 */
5465 pid_list = rcu_dereference_protected(tr->function_pids,
5466 mutex_is_locked(&ftrace_lock));
5467
5468 this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid,
5469 trace_ignore_this_task(pid_list, current));
5470}
5471
5524static ssize_t 5472static ssize_t
5525ftrace_pid_write(struct file *filp, const char __user *ubuf, 5473ftrace_pid_write(struct file *filp, const char __user *ubuf,
5526 size_t cnt, loff_t *ppos) 5474 size_t cnt, loff_t *ppos)
5527{ 5475{
5528 char buf[64], *tmp; 5476 struct seq_file *m = filp->private_data;
5529 long val; 5477 struct trace_array *tr = m->private;
5530 int ret; 5478 struct trace_pid_list *filtered_pids = NULL;
5479 struct trace_pid_list *pid_list;
5480 ssize_t ret;
5531 5481
5532 if (cnt >= sizeof(buf)) 5482 if (!cnt)
5533 return -EINVAL; 5483 return 0;
5484
5485 mutex_lock(&ftrace_lock);
5486
5487 filtered_pids = rcu_dereference_protected(tr->function_pids,
5488 lockdep_is_held(&ftrace_lock));
5489
5490 ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
5491 if (ret < 0)
5492 goto out;
5534 5493
5535 if (copy_from_user(&buf, ubuf, cnt)) 5494 rcu_assign_pointer(tr->function_pids, pid_list);
5536 return -EFAULT;
5537 5495
5538 buf[cnt] = 0; 5496 if (filtered_pids) {
5497 synchronize_sched();
5498 trace_free_pid_list(filtered_pids);
5499 } else if (pid_list) {
5500 /* Register a probe to set whether to ignore the tracing of a task */
5501 register_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr);
5502 }
5539 5503
5540 /* 5504 /*
5541 * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid" 5505 * Ignoring of pids is done at task switch. But we have to
5542 * to clean the filter quietly. 5506 * check for those tasks that are currently running.
5507 * Always do this in case a pid was appended or removed.
5543 */ 5508 */
5544 tmp = strstrip(buf); 5509 on_each_cpu(ignore_task_cpu, tr, 1);
5545 if (strlen(tmp) == 0)
5546 return 1;
5547 5510
5548 ret = kstrtol(tmp, 10, &val); 5511 ftrace_update_pid_func();
5549 if (ret < 0) 5512 ftrace_startup_all(0);
5550 return ret; 5513 out:
5514 mutex_unlock(&ftrace_lock);
5551 5515
5552 ret = ftrace_pid_add(val); 5516 if (ret > 0)
5517 *ppos += ret;
5553 5518
5554 return ret ? ret : cnt; 5519 return ret;
5555} 5520}
5556 5521
5557static int 5522static int
5558ftrace_pid_release(struct inode *inode, struct file *file) 5523ftrace_pid_release(struct inode *inode, struct file *file)
5559{ 5524{
5560 if (file->f_mode & FMODE_READ) 5525 struct trace_array *tr = inode->i_private;
5561 seq_release(inode, file);
5562 5526
5563 return 0; 5527 trace_array_put(tr);
5528
5529 return seq_release(inode, file);
5564} 5530}
5565 5531
5566static const struct file_operations ftrace_pid_fops = { 5532static const struct file_operations ftrace_pid_fops = {
@@ -5571,24 +5537,21 @@ static const struct file_operations ftrace_pid_fops = {
5571 .release = ftrace_pid_release, 5537 .release = ftrace_pid_release,
5572}; 5538};
5573 5539
5574static __init int ftrace_init_tracefs(void) 5540void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer)
5575{ 5541{
5576 struct dentry *d_tracer; 5542 trace_create_file("set_ftrace_pid", 0644, d_tracer,
5543 tr, &ftrace_pid_fops);
5544}
5577 5545
5578 d_tracer = tracing_init_dentry(); 5546void __init ftrace_init_tracefs_toplevel(struct trace_array *tr,
5579 if (IS_ERR(d_tracer)) 5547 struct dentry *d_tracer)
5580 return 0; 5548{
5549 /* Only the top level directory has the dyn_tracefs and profile */
5550 WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL));
5581 5551
5582 ftrace_init_dyn_tracefs(d_tracer); 5552 ftrace_init_dyn_tracefs(d_tracer);
5583
5584 trace_create_file("set_ftrace_pid", 0644, d_tracer,
5585 NULL, &ftrace_pid_fops);
5586
5587 ftrace_profile_tracefs(d_tracer); 5553 ftrace_profile_tracefs(d_tracer);
5588
5589 return 0;
5590} 5554}
5591fs_initcall(ftrace_init_tracefs);
5592 5555
5593/** 5556/**
5594 * ftrace_kill - kill ftrace 5557 * ftrace_kill - kill ftrace
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8a4bd6b68a0b..dade4c9559cc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -25,7 +25,7 @@
25#include <linux/hardirq.h> 25#include <linux/hardirq.h>
26#include <linux/linkage.h> 26#include <linux/linkage.h>
27#include <linux/uaccess.h> 27#include <linux/uaccess.h>
28#include <linux/kprobes.h> 28#include <linux/vmalloc.h>
29#include <linux/ftrace.h> 29#include <linux/ftrace.h>
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/percpu.h> 31#include <linux/percpu.h>
@@ -319,6 +319,258 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec,
319 return 0; 319 return 0;
320} 320}
321 321
322void trace_free_pid_list(struct trace_pid_list *pid_list)
323{
324 vfree(pid_list->pids);
325 kfree(pid_list);
326}
327
328/**
329 * trace_find_filtered_pid - check if a pid exists in a filtered_pid list
330 * @filtered_pids: The list of pids to check
331 * @search_pid: The PID to find in @filtered_pids
332 *
 333 * Returns true if @search_pid is found in @filtered_pids, and false otherwise.
334 */
335bool
336trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
337{
338 /*
339 * If pid_max changed after filtered_pids was created, we
340 * by default ignore all pids greater than the previous pid_max.
341 */
342 if (search_pid >= filtered_pids->pid_max)
343 return false;
344
345 return test_bit(search_pid, filtered_pids->pids);
346}
347
348/**
349 * trace_ignore_this_task - should a task be ignored for tracing
350 * @filtered_pids: The list of pids to check
351 * @task: The task that should be ignored if not filtered
352 *
353 * Checks if @task should be traced or not from @filtered_pids.
354 * Returns true if @task should *NOT* be traced.
355 * Returns false if @task should be traced.
356 */
357bool
358trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
359{
360 /*
361 * Return false, because if filtered_pids does not exist,
362 * all pids are good to trace.
363 */
364 if (!filtered_pids)
365 return false;
366
367 return !trace_find_filtered_pid(filtered_pids, task->pid);
368}
369
370/**
371 * trace_pid_filter_add_remove - Add or remove a task from a pid_list
372 * @pid_list: The list to modify
373 * @self: The current task for fork or NULL for exit
374 * @task: The task to add or remove
375 *
376 * If adding a task, if @self is defined, the task is only added if @self
377 * is also included in @pid_list. This happens on fork and tasks should
378 * only be added when the parent is listed. If @self is NULL, then the
379 * @task pid will be removed from the list, which would happen on exit
380 * of a task.
381 */
382void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
383 struct task_struct *self,
384 struct task_struct *task)
385{
386 if (!pid_list)
387 return;
388
389 /* For forks, we only add if the forking task is listed */
390 if (self) {
391 if (!trace_find_filtered_pid(pid_list, self->pid))
392 return;
393 }
394
395 /* Sorry, but we don't support pid_max changing after setting */
396 if (task->pid >= pid_list->pid_max)
397 return;
398
399 /* "self" is set for forks, and NULL for exits */
400 if (self)
401 set_bit(task->pid, pid_list->pids);
402 else
403 clear_bit(task->pid, pid_list->pids);
404}
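
The fork/exit semantics above are easy to see in isolation: a forked child is added only when its parent is already filtered, and an exiting task is simply cleared. The user-space sketch below mirrors that behaviour with a flat bitmap; it uses an integer sentinel (-1) instead of a NULL task pointer and is illustrative only.

/* User-space sketch of trace_filter_add_remove_task() semantics. */
#include <stdio.h>
#include <stdbool.h>

#define PID_MAX 1024
static unsigned char pids[PID_MAX / 8];

static bool pid_test(int pid)  { return pids[pid / 8] & (1 << (pid % 8)); }
static void pid_set(int pid)   { pids[pid / 8] |= 1 << (pid % 8); }
static void pid_clear(int pid) { pids[pid / 8] &= ~(1 << (pid % 8)); }

/* self >= 0 means "self forked task"; self == -1 means "task exited" */
static void add_remove(int self, int task)
{
        if (task >= PID_MAX)
                return;
        if (self >= 0) {
                if (!pid_test(self))
                        return;         /* parent not filtered: ignore child */
                pid_set(task);
        } else {
                pid_clear(task);
        }
}

int main(void)
{
        pid_set(100);                   /* user asked to trace pid 100 */
        add_remove(100, 101);           /* fork: child of 100 is now traced */
        add_remove(200, 201);           /* fork of an unfiltered task: no-op */
        add_remove(-1, 100);            /* pid 100 exits: dropped from filter */
        printf("%d %d %d\n", pid_test(100), pid_test(101), pid_test(201));
        /* prints: 0 1 0 */
        return 0;
}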
405
406/**
407 * trace_pid_next - Used for seq_file to get to the next pid of a pid_list
408 * @pid_list: The pid list to show
409 * @v: The last pid that was shown (+1 the actual pid to let zero be displayed)
410 * @pos: The position of the file
411 *
412 * This is used by the seq_file "next" operation to iterate the pids
413 * listed in a trace_pid_list structure.
414 *
415 * Returns the pid+1 as we want to display pid of zero, but NULL would
416 * stop the iteration.
417 */
418void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos)
419{
420 unsigned long pid = (unsigned long)v;
421
422 (*pos)++;
423
 424 * pid already is +1 of the actual previous bit */
425 pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
426
427 /* Return pid + 1 to allow zero to be represented */
428 if (pid < pid_list->pid_max)
429 return (void *)(pid + 1);
430
431 return NULL;
432}
433
434/**
435 * trace_pid_start - Used for seq_file to start reading pid lists
436 * @pid_list: The pid list to show
437 * @pos: The position of the file
438 *
439 * This is used by seq_file "start" operation to start the iteration
440 * of listing pids.
441 *
442 * Returns the pid+1 as we want to display pid of zero, but NULL would
443 * stop the iteration.
444 */
445void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos)
446{
447 unsigned long pid;
448 loff_t l = 0;
449
450 pid = find_first_bit(pid_list->pids, pid_list->pid_max);
451 if (pid >= pid_list->pid_max)
452 return NULL;
453
454 /* Return pid + 1 so that zero can be the exit value */
455 for (pid++; pid && l < *pos;
456 pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l))
457 ;
458 return (void *)pid;
459}
460
461/**
462 * trace_pid_show - show the current pid in seq_file processing
463 * @m: The seq_file structure to write into
464 * @v: A void pointer of the pid (+1) value to display
465 *
466 * Can be directly used by seq_file operations to display the current
467 * pid value.
468 */
469int trace_pid_show(struct seq_file *m, void *v)
470{
471 unsigned long pid = (unsigned long)v - 1;
472
473 seq_printf(m, "%lu\n", pid);
474 return 0;
475}
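
The pid+1 encoding exists because seq_file treats a NULL iterator value as end-of-iteration, so pid 0 could never be returned directly. The user-space sketch below (illustrative, with find_next_bit() replaced by a trivial scan) shows the same offset trick used by trace_pid_start()/trace_pid_next()/trace_pid_show().

/* User-space sketch of the pid+1 iterator encoding. */
#include <stdio.h>

#define PID_MAX 64
static unsigned char pids[PID_MAX / 8];

static void set_pid(int pid)  { pids[pid / 8] |= 1 << (pid % 8); }
static int  test_pid(int pid) { return pids[pid / 8] & (1 << (pid % 8)); }

static unsigned long next_bit(unsigned long from)
{
        for (; from < PID_MAX; from++)
                if (test_pid(from))
                        return from;
        return PID_MAX;
}

/* Mirrors trace_pid_next(): 'v' is the previous pid already offset by +1 */
static void *pid_next(void *v)
{
        unsigned long pid = next_bit((unsigned long)v);

        return pid < PID_MAX ? (void *)(pid + 1) : NULL;
}

int main(void)
{
        void *v;

        set_pid(0);
        set_pid(42);
        for (v = pid_next((void *)0); v; v = pid_next(v))
                printf("%lu\n", (unsigned long)v - 1);  /* prints 0, then 42 */
        return 0;
}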
476
477/* 128 should be much more than enough */
478#define PID_BUF_SIZE 127
479
480int trace_pid_write(struct trace_pid_list *filtered_pids,
481 struct trace_pid_list **new_pid_list,
482 const char __user *ubuf, size_t cnt)
483{
484 struct trace_pid_list *pid_list;
485 struct trace_parser parser;
486 unsigned long val;
487 int nr_pids = 0;
488 ssize_t read = 0;
489 ssize_t ret = 0;
490 loff_t pos;
491 pid_t pid;
492
493 if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1))
494 return -ENOMEM;
495
496 /*
497 * Always recreate a new array. The write is an all or nothing
498 * operation. Always create a new array when adding new pids by
499 * the user. If the operation fails, then the current list is
500 * not modified.
501 */
502 pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
503 if (!pid_list)
504 return -ENOMEM;
505
506 pid_list->pid_max = READ_ONCE(pid_max);
507
508 /* Only truncating will shrink pid_max */
509 if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max)
510 pid_list->pid_max = filtered_pids->pid_max;
511
512 pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
513 if (!pid_list->pids) {
514 kfree(pid_list);
515 return -ENOMEM;
516 }
517
518 if (filtered_pids) {
519 /* copy the current bits to the new max */
520 for_each_set_bit(pid, filtered_pids->pids,
521 filtered_pids->pid_max) {
522 set_bit(pid, pid_list->pids);
523 nr_pids++;
524 }
525 }
526
527 while (cnt > 0) {
528
529 pos = 0;
530
531 ret = trace_get_user(&parser, ubuf, cnt, &pos);
532 if (ret < 0 || !trace_parser_loaded(&parser))
533 break;
534
535 read += ret;
536 ubuf += ret;
537 cnt -= ret;
538
539 parser.buffer[parser.idx] = 0;
540
541 ret = -EINVAL;
542 if (kstrtoul(parser.buffer, 0, &val))
543 break;
544 if (val >= pid_list->pid_max)
545 break;
546
547 pid = (pid_t)val;
548
549 set_bit(pid, pid_list->pids);
550 nr_pids++;
551
552 trace_parser_clear(&parser);
553 ret = 0;
554 }
555 trace_parser_put(&parser);
556
557 if (ret < 0) {
558 trace_free_pid_list(pid_list);
559 return ret;
560 }
561
562 if (!nr_pids) {
563 /* Cleared the list of pids */
564 trace_free_pid_list(pid_list);
565 read = ret;
566 pid_list = NULL;
567 }
568
569 *new_pid_list = pid_list;
570
571 return read;
572}
573
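
trace_pid_write() is deliberately "all or nothing": it always builds a brand new bitmap seeded from the old one, and the caller only swaps it in if the whole parse succeeded. The user-space sketch below shows the same shape; parsing, sizing and the clear-on-empty case are simplified, and all names are illustrative.

/* User-space sketch of the all-or-nothing pid list rebuild. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

#define PID_MAX 32768

struct pid_list {
        unsigned long max;
        unsigned char *bits;    /* one bit per pid */
};

static struct pid_list *pid_list_parse(const struct pid_list *old, const char *buf)
{
        struct pid_list *newlist = malloc(sizeof(*newlist));

        if (!newlist)
                return NULL;
        newlist->max = PID_MAX;                 /* sizes kept equal for simplicity */
        newlist->bits = calloc((newlist->max + 7) / 8, 1);
        if (!newlist->bits) {
                free(newlist);
                return NULL;
        }
        if (old)        /* start from a copy of the current filter */
                memcpy(newlist->bits, old->bits, (old->max + 7) / 8);

        for (const char *p = buf; *p; ) {
                char *end;
                unsigned long pid = strtoul(p, &end, 10);

                if (end == p || pid >= newlist->max) {  /* error: caller keeps old list */
                        free(newlist->bits);
                        free(newlist);
                        errno = EINVAL;
                        return NULL;
                }
                newlist->bits[pid / 8] |= 1 << (pid % 8);
                p = end + strspn(end, " \n");
        }
        return newlist;
}

int main(void)
{
        struct pid_list *pl = pid_list_parse(NULL, "1 42 100\n");

        if (pl)
                printf("pid 42 filtered: %d\n",
                       !!(pl->bits[42 / 8] & (1 << (42 % 8))));
        return 0;
}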
322static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) 574static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
323{ 575{
324 u64 ts; 576 u64 ts;
@@ -1862,7 +2114,17 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
1862{ 2114{
1863 __buffer_unlock_commit(buffer, event); 2115 __buffer_unlock_commit(buffer, event);
1864 2116
1865 ftrace_trace_stack(tr, buffer, flags, 0, pc, regs); 2117 /*
2118 * If regs is not set, then skip the following callers:
2119 * trace_buffer_unlock_commit_regs
2120 * event_trigger_unlock_commit
2121 * trace_event_buffer_commit
2122 * trace_event_raw_event_sched_switch
2123 * Note, we can still get here via blktrace, wakeup tracer
2124 * and mmiotrace, but that's ok if they lose a function or
 2125 * two. They are not that meaningful.
2126 */
2127 ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs);
1866 ftrace_trace_userstack(buffer, flags, pc); 2128 ftrace_trace_userstack(buffer, flags, pc);
1867} 2129}
1868 2130
@@ -1913,6 +2175,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
1913 trace.skip = skip; 2175 trace.skip = skip;
1914 2176
1915 /* 2177 /*
2178 * Add two, for this function and the call to save_stack_trace()
2179 * If regs is set, then these functions will not be in the way.
2180 */
2181 if (!regs)
2182 trace.skip += 2;
2183
2184 /*
1916 * Since events can happen in NMIs there's no safe way to 2185 * Since events can happen in NMIs there's no safe way to
1917 * use the per cpu ftrace_stacks. We reserve it and if an interrupt 2186 * use the per cpu ftrace_stacks. We reserve it and if an interrupt
1918 * or NMI comes in, it will just have to use the default 2187 * or NMI comes in, it will just have to use the default
@@ -2083,83 +2352,41 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
2083 2352
2084/* created for use with alloc_percpu */ 2353/* created for use with alloc_percpu */
2085struct trace_buffer_struct { 2354struct trace_buffer_struct {
2086 char buffer[TRACE_BUF_SIZE]; 2355 int nesting;
2356 char buffer[4][TRACE_BUF_SIZE];
2087}; 2357};
2088 2358
2089static struct trace_buffer_struct *trace_percpu_buffer; 2359static struct trace_buffer_struct *trace_percpu_buffer;
2090static struct trace_buffer_struct *trace_percpu_sirq_buffer;
2091static struct trace_buffer_struct *trace_percpu_irq_buffer;
2092static struct trace_buffer_struct *trace_percpu_nmi_buffer;
2093 2360
2094/* 2361/*
2095 * The buffer used is dependent on the context. There is a per cpu 2362 * This allows for lockless recording. If we're nested too deeply, then
2096 * buffer for normal context, softirq contex, hard irq context and 2363 * this returns NULL.
2097 * for NMI context. Thise allows for lockless recording.
2098 *
2099 * Note, if the buffers failed to be allocated, then this returns NULL
2100 */ 2364 */
2101static char *get_trace_buf(void) 2365static char *get_trace_buf(void)
2102{ 2366{
2103 struct trace_buffer_struct *percpu_buffer; 2367 struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer);
2104
2105 /*
2106 * If we have allocated per cpu buffers, then we do not
2107 * need to do any locking.
2108 */
2109 if (in_nmi())
2110 percpu_buffer = trace_percpu_nmi_buffer;
2111 else if (in_irq())
2112 percpu_buffer = trace_percpu_irq_buffer;
2113 else if (in_softirq())
2114 percpu_buffer = trace_percpu_sirq_buffer;
2115 else
2116 percpu_buffer = trace_percpu_buffer;
2117 2368
2118 if (!percpu_buffer) 2369 if (!buffer || buffer->nesting >= 4)
2119 return NULL; 2370 return NULL;
2120 2371
2121 return this_cpu_ptr(&percpu_buffer->buffer[0]); 2372 return &buffer->buffer[buffer->nesting++][0];
2373}
2374
2375static void put_trace_buf(void)
2376{
2377 this_cpu_dec(trace_percpu_buffer->nesting);
2122} 2378}
2123 2379
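
The rework replaces the four per-context buffers (normal, softirq, irq, NMI) with a single per-cpu array indexed by an explicit nesting counter: each context that interrupts another simply takes the next slot and releases it in LIFO order. The user-space sketch below is illustrative only; per-cpu placement and preemption handling are omitted.

/* User-space sketch of the nesting-count buffer scheme. */
#include <stdio.h>

#define TRACE_BUF_SIZE 64

static struct {
        int  nesting;
        char buffer[4][TRACE_BUF_SIZE];
} percpu_buf;                   /* one of these per CPU in the real code */

static char *get_buf(void)
{
        if (percpu_buf.nesting >= 4)
                return NULL;    /* nested too deeply: drop the message */
        return percpu_buf.buffer[percpu_buf.nesting++];
}

static void put_buf(void)
{
        percpu_buf.nesting--;
}

int main(void)
{
        char *outer = get_buf();        /* level 0: normal context */
        char *inner = get_buf();        /* level 1: e.g. an irq interrupting it */

        snprintf(inner, TRACE_BUF_SIZE, "irq message");
        put_buf();                      /* irq returns, outer buffer untouched */
        snprintf(outer, TRACE_BUF_SIZE, "task message");
        put_buf();
        printf("%s / %s\n", outer, inner);
        return 0;
}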
2124static int alloc_percpu_trace_buffer(void) 2380static int alloc_percpu_trace_buffer(void)
2125{ 2381{
2126 struct trace_buffer_struct *buffers; 2382 struct trace_buffer_struct *buffers;
2127 struct trace_buffer_struct *sirq_buffers;
2128 struct trace_buffer_struct *irq_buffers;
2129 struct trace_buffer_struct *nmi_buffers;
2130 2383
2131 buffers = alloc_percpu(struct trace_buffer_struct); 2384 buffers = alloc_percpu(struct trace_buffer_struct);
2132 if (!buffers) 2385 if (WARN(!buffers, "Could not allocate percpu trace_printk buffer"))
2133 goto err_warn; 2386 return -ENOMEM;
2134
2135 sirq_buffers = alloc_percpu(struct trace_buffer_struct);
2136 if (!sirq_buffers)
2137 goto err_sirq;
2138
2139 irq_buffers = alloc_percpu(struct trace_buffer_struct);
2140 if (!irq_buffers)
2141 goto err_irq;
2142
2143 nmi_buffers = alloc_percpu(struct trace_buffer_struct);
2144 if (!nmi_buffers)
2145 goto err_nmi;
2146 2387
2147 trace_percpu_buffer = buffers; 2388 trace_percpu_buffer = buffers;
2148 trace_percpu_sirq_buffer = sirq_buffers;
2149 trace_percpu_irq_buffer = irq_buffers;
2150 trace_percpu_nmi_buffer = nmi_buffers;
2151
2152 return 0; 2389 return 0;
2153
2154 err_nmi:
2155 free_percpu(irq_buffers);
2156 err_irq:
2157 free_percpu(sirq_buffers);
2158 err_sirq:
2159 free_percpu(buffers);
2160 err_warn:
2161 WARN(1, "Could not allocate percpu trace_printk buffer");
2162 return -ENOMEM;
2163} 2390}
2164 2391
2165static int buffers_allocated; 2392static int buffers_allocated;
@@ -2250,7 +2477,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
2250 tbuffer = get_trace_buf(); 2477 tbuffer = get_trace_buf();
2251 if (!tbuffer) { 2478 if (!tbuffer) {
2252 len = 0; 2479 len = 0;
2253 goto out; 2480 goto out_nobuffer;
2254 } 2481 }
2255 2482
2256 len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); 2483 len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args);
@@ -2276,6 +2503,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
2276 } 2503 }
2277 2504
2278out: 2505out:
2506 put_trace_buf();
2507
2508out_nobuffer:
2279 preempt_enable_notrace(); 2509 preempt_enable_notrace();
2280 unpause_graph_tracing(); 2510 unpause_graph_tracing();
2281 2511
@@ -2307,7 +2537,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
2307 tbuffer = get_trace_buf(); 2537 tbuffer = get_trace_buf();
2308 if (!tbuffer) { 2538 if (!tbuffer) {
2309 len = 0; 2539 len = 0;
2310 goto out; 2540 goto out_nobuffer;
2311 } 2541 }
2312 2542
2313 len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); 2543 len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
@@ -2326,7 +2556,11 @@ __trace_array_vprintk(struct ring_buffer *buffer,
2326 __buffer_unlock_commit(buffer, event); 2556 __buffer_unlock_commit(buffer, event);
2327 ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL); 2557 ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL);
2328 } 2558 }
2329 out: 2559
2560out:
2561 put_trace_buf();
2562
2563out_nobuffer:
2330 preempt_enable_notrace(); 2564 preempt_enable_notrace();
2331 unpause_graph_tracing(); 2565 unpause_graph_tracing();
2332 2566
@@ -6977,6 +7211,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
6977 for_each_tracing_cpu(cpu) 7211 for_each_tracing_cpu(cpu)
6978 tracing_init_tracefs_percpu(tr, cpu); 7212 tracing_init_tracefs_percpu(tr, cpu);
6979 7213
7214 ftrace_init_tracefs(tr, d_tracer);
6980} 7215}
6981 7216
6982static struct vfsmount *trace_automount(void *ingore) 7217static struct vfsmount *trace_automount(void *ingore)
@@ -7130,6 +7365,7 @@ static __init int tracer_init_tracefs(void)
7130 return 0; 7365 return 0;
7131 7366
7132 init_tracer_tracefs(&global_trace, d_tracer); 7367 init_tracer_tracefs(&global_trace, d_tracer);
7368 ftrace_init_tracefs_toplevel(&global_trace, d_tracer);
7133 7369
7134 trace_create_file("tracing_thresh", 0644, d_tracer, 7370 trace_create_file("tracing_thresh", 0644, d_tracer,
7135 &global_trace, &tracing_thresh_fops); 7371 &global_trace, &tracing_thresh_fops);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5167c366d6b7..f783df416726 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -80,6 +80,12 @@ enum trace_type {
80 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ 80 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
81 filter) 81 filter)
82 82
83#undef FTRACE_ENTRY_PACKED
84#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print, \
85 filter) \
86 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
87 filter) __packed
88
83#include "trace_entries.h" 89#include "trace_entries.h"
84 90
85/* 91/*
@@ -156,6 +162,9 @@ struct trace_array_cpu {
156 char comm[TASK_COMM_LEN]; 162 char comm[TASK_COMM_LEN];
157 163
158 bool ignore_pid; 164 bool ignore_pid;
165#ifdef CONFIG_FUNCTION_TRACER
166 bool ftrace_ignore_pid;
167#endif
159}; 168};
160 169
161struct tracer; 170struct tracer;
@@ -247,6 +256,7 @@ struct trace_array {
247 int ref; 256 int ref;
248#ifdef CONFIG_FUNCTION_TRACER 257#ifdef CONFIG_FUNCTION_TRACER
249 struct ftrace_ops *ops; 258 struct ftrace_ops *ops;
259 struct trace_pid_list __rcu *function_pids;
250 /* function tracing enabled */ 260 /* function tracing enabled */
251 int function_enabled; 261 int function_enabled;
252#endif 262#endif
@@ -628,6 +638,25 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs);
628 638
629extern unsigned long tracing_thresh; 639extern unsigned long tracing_thresh;
630 640
641/* PID filtering */
642
643extern int pid_max;
644
645bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
646 pid_t search_pid);
647bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
648 struct task_struct *task);
649void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
650 struct task_struct *self,
651 struct task_struct *task);
652void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos);
653void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos);
654int trace_pid_show(struct seq_file *m, void *v);
655void trace_free_pid_list(struct trace_pid_list *pid_list);
656int trace_pid_write(struct trace_pid_list *filtered_pids,
657 struct trace_pid_list **new_pid_list,
658 const char __user *ubuf, size_t cnt);
659
631#ifdef CONFIG_TRACER_MAX_TRACE 660#ifdef CONFIG_TRACER_MAX_TRACE
632void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); 661void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
633void update_max_tr_single(struct trace_array *tr, 662void update_max_tr_single(struct trace_array *tr,
@@ -821,12 +850,9 @@ extern struct list_head ftrace_pids;
821 850
822#ifdef CONFIG_FUNCTION_TRACER 851#ifdef CONFIG_FUNCTION_TRACER
823extern bool ftrace_filter_param __initdata; 852extern bool ftrace_filter_param __initdata;
824static inline int ftrace_trace_task(struct task_struct *task) 853static inline int ftrace_trace_task(struct trace_array *tr)
825{ 854{
826 if (list_empty(&ftrace_pids)) 855 return !this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid);
827 return 1;
828
829 return test_tsk_trace_trace(task);
830} 856}
831extern int ftrace_is_dead(void); 857extern int ftrace_is_dead(void);
832int ftrace_create_function_files(struct trace_array *tr, 858int ftrace_create_function_files(struct trace_array *tr,
@@ -836,8 +862,11 @@ void ftrace_init_global_array_ops(struct trace_array *tr);
836void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); 862void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func);
837void ftrace_reset_array_ops(struct trace_array *tr); 863void ftrace_reset_array_ops(struct trace_array *tr);
838int using_ftrace_ops_list_func(void); 864int using_ftrace_ops_list_func(void);
865void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer);
866void ftrace_init_tracefs_toplevel(struct trace_array *tr,
867 struct dentry *d_tracer);
839#else 868#else
840static inline int ftrace_trace_task(struct task_struct *task) 869static inline int ftrace_trace_task(struct trace_array *tr)
841{ 870{
842 return 1; 871 return 1;
843} 872}
@@ -852,6 +881,8 @@ static inline void ftrace_destroy_function_files(struct trace_array *tr) { }
852static inline __init void 881static inline __init void
853ftrace_init_global_array_ops(struct trace_array *tr) { } 882ftrace_init_global_array_ops(struct trace_array *tr) { }
854static inline void ftrace_reset_array_ops(struct trace_array *tr) { } 883static inline void ftrace_reset_array_ops(struct trace_array *tr) { }
884static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { }
885static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { }
855/* ftace_func_t type is not defined, use macro instead of static inline */ 886/* ftace_func_t type is not defined, use macro instead of static inline */
856#define ftrace_init_array_ops(tr, func) do { } while (0) 887#define ftrace_init_array_ops(tr, func) do { } while (0)
857#endif /* CONFIG_FUNCTION_TRACER */ 888#endif /* CONFIG_FUNCTION_TRACER */
@@ -1600,6 +1631,11 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
1600#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ 1631#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \
1601 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ 1632 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
1602 filter) 1633 filter)
1634#undef FTRACE_ENTRY_PACKED
1635#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print, filter) \
1636 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
1637 filter)
1638
1603#include "trace_entries.h" 1639#include "trace_entries.h"
1604 1640
1605#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER) 1641#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER)
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index ee7b94a4810a..5c30efcda5e6 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -72,7 +72,7 @@ FTRACE_ENTRY_REG(function, ftrace_entry,
72); 72);
73 73
74/* Function call entry */ 74/* Function call entry */
75FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, 75FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,
76 76
77 TRACE_GRAPH_ENT, 77 TRACE_GRAPH_ENT,
78 78
@@ -88,7 +88,7 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
88); 88);
89 89
90/* Function return entry */ 90/* Function return entry */
91FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, 91FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
92 92
93 TRACE_GRAPH_RET, 93 TRACE_GRAPH_RET,
94 94
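
The point of FTRACE_ENTRY_PACKED is that __packed stops the compiler from padding the funcgraph entries out to their natural alignment, saving ring-buffer space per event. The stand-in structs below (not the real ftrace_graph_ent_entry layout) illustrate the effect on a typical 64-bit build.

/* Illustration of __packed; the structs are stand-ins, not ftrace's. */
#include <stdio.h>
#include <stdint.h>

struct demo_ent {
        uint64_t func;
        int      depth;
};

struct demo_ent_packed {
        uint64_t func;
        int      depth;
} __attribute__((packed));

int main(void)
{
        printf("%zu vs %zu\n", sizeof(struct demo_ent),
               sizeof(struct demo_ent_packed));   /* typically 16 vs 12 */
        return 0;
}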
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3d4155892a1e..03c0a48c3ac4 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -15,7 +15,6 @@
15#include <linux/kthread.h> 15#include <linux/kthread.h>
16#include <linux/tracefs.h> 16#include <linux/tracefs.h>
17#include <linux/uaccess.h> 17#include <linux/uaccess.h>
18#include <linux/vmalloc.h>
19#include <linux/module.h> 18#include <linux/module.h>
20#include <linux/ctype.h> 19#include <linux/ctype.h>
21#include <linux/sort.h> 20#include <linux/sort.h>
@@ -262,6 +261,14 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
262 261
263 local_save_flags(fbuffer->flags); 262 local_save_flags(fbuffer->flags);
264 fbuffer->pc = preempt_count(); 263 fbuffer->pc = preempt_count();
264 /*
265 * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables
266 * preemption (adding one to the preempt_count). Since we are
267 * interested in the preempt_count at the time the tracepoint was
268 * hit, we need to subtract one to offset the increment.
269 */
270 if (IS_ENABLED(CONFIG_PREEMPT))
271 fbuffer->pc--;
265 fbuffer->trace_file = trace_file; 272 fbuffer->trace_file = trace_file;
266 273
267 fbuffer->event = 274 fbuffer->event =
@@ -499,60 +506,6 @@ static void ftrace_clear_events(struct trace_array *tr)
499 mutex_unlock(&event_mutex); 506 mutex_unlock(&event_mutex);
500} 507}
501 508
502/* Shouldn't this be in a header? */
503extern int pid_max;
504
505/* Returns true if found in filter */
506static bool
507find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
508{
509 /*
510 * If pid_max changed after filtered_pids was created, we
511 * by default ignore all pids greater than the previous pid_max.
512 */
513 if (search_pid >= filtered_pids->pid_max)
514 return false;
515
516 return test_bit(search_pid, filtered_pids->pids);
517}
518
519static bool
520ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
521{
522 /*
523 * Return false, because if filtered_pids does not exist,
524 * all pids are good to trace.
525 */
526 if (!filtered_pids)
527 return false;
528
529 return !find_filtered_pid(filtered_pids, task->pid);
530}
531
532static void filter_add_remove_task(struct trace_pid_list *pid_list,
533 struct task_struct *self,
534 struct task_struct *task)
535{
536 if (!pid_list)
537 return;
538
539 /* For forks, we only add if the forking task is listed */
540 if (self) {
541 if (!find_filtered_pid(pid_list, self->pid))
542 return;
543 }
544
545 /* Sorry, but we don't support pid_max changing after setting */
546 if (task->pid >= pid_list->pid_max)
547 return;
548
549 /* "self" is set for forks, and NULL for exits */
550 if (self)
551 set_bit(task->pid, pid_list->pids);
552 else
553 clear_bit(task->pid, pid_list->pids);
554}
555
556static void 509static void
557event_filter_pid_sched_process_exit(void *data, struct task_struct *task) 510event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
558{ 511{
@@ -560,7 +513,7 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
560 struct trace_array *tr = data; 513 struct trace_array *tr = data;
561 514
562 pid_list = rcu_dereference_sched(tr->filtered_pids); 515 pid_list = rcu_dereference_sched(tr->filtered_pids);
563 filter_add_remove_task(pid_list, NULL, task); 516 trace_filter_add_remove_task(pid_list, NULL, task);
564} 517}
565 518
566static void 519static void
@@ -572,7 +525,7 @@ event_filter_pid_sched_process_fork(void *data,
572 struct trace_array *tr = data; 525 struct trace_array *tr = data;
573 526
574 pid_list = rcu_dereference_sched(tr->filtered_pids); 527 pid_list = rcu_dereference_sched(tr->filtered_pids);
575 filter_add_remove_task(pid_list, self, task); 528 trace_filter_add_remove_task(pid_list, self, task);
576} 529}
577 530
578void trace_event_follow_fork(struct trace_array *tr, bool enable) 531void trace_event_follow_fork(struct trace_array *tr, bool enable)
@@ -600,8 +553,8 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
600 pid_list = rcu_dereference_sched(tr->filtered_pids); 553 pid_list = rcu_dereference_sched(tr->filtered_pids);
601 554
602 this_cpu_write(tr->trace_buffer.data->ignore_pid, 555 this_cpu_write(tr->trace_buffer.data->ignore_pid,
603 ignore_this_task(pid_list, prev) && 556 trace_ignore_this_task(pid_list, prev) &&
604 ignore_this_task(pid_list, next)); 557 trace_ignore_this_task(pid_list, next));
605} 558}
606 559
607static void 560static void
@@ -614,7 +567,7 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
614 pid_list = rcu_dereference_sched(tr->filtered_pids); 567 pid_list = rcu_dereference_sched(tr->filtered_pids);
615 568
616 this_cpu_write(tr->trace_buffer.data->ignore_pid, 569 this_cpu_write(tr->trace_buffer.data->ignore_pid,
617 ignore_this_task(pid_list, next)); 570 trace_ignore_this_task(pid_list, next));
618} 571}
619 572
620static void 573static void
@@ -630,7 +583,7 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task)
630 pid_list = rcu_dereference_sched(tr->filtered_pids); 583 pid_list = rcu_dereference_sched(tr->filtered_pids);
631 584
632 this_cpu_write(tr->trace_buffer.data->ignore_pid, 585 this_cpu_write(tr->trace_buffer.data->ignore_pid,
633 ignore_this_task(pid_list, task)); 586 trace_ignore_this_task(pid_list, task));
634} 587}
635 588
636static void 589static void
@@ -647,7 +600,7 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task)
647 600
648 /* Set tracing if current is enabled */ 601 /* Set tracing if current is enabled */
649 this_cpu_write(tr->trace_buffer.data->ignore_pid, 602 this_cpu_write(tr->trace_buffer.data->ignore_pid,
650 ignore_this_task(pid_list, current)); 603 trace_ignore_this_task(pid_list, current));
651} 604}
652 605
653static void __ftrace_clear_event_pids(struct trace_array *tr) 606static void __ftrace_clear_event_pids(struct trace_array *tr)
@@ -685,8 +638,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr)
685 /* Wait till all users are no longer using pid filtering */ 638 /* Wait till all users are no longer using pid filtering */
686 synchronize_sched(); 639 synchronize_sched();
687 640
688 vfree(pid_list->pids); 641 trace_free_pid_list(pid_list);
689 kfree(pid_list);
690} 642}
691 643
692static void ftrace_clear_event_pids(struct trace_array *tr) 644static void ftrace_clear_event_pids(struct trace_array *tr)
@@ -1034,18 +986,8 @@ p_next(struct seq_file *m, void *v, loff_t *pos)
1034{ 986{
1035 struct trace_array *tr = m->private; 987 struct trace_array *tr = m->private;
1036 struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids); 988 struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids);
1037 unsigned long pid = (unsigned long)v;
1038
1039 (*pos)++;
1040
1041 /* pid already is +1 of the actual prevous bit */
1042 pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
1043 989
1044 /* Return pid + 1 to allow zero to be represented */ 990 return trace_pid_next(pid_list, v, pos);
1045 if (pid < pid_list->pid_max)
1046 return (void *)(pid + 1);
1047
1048 return NULL;
1049} 991}
1050 992
1051static void *p_start(struct seq_file *m, loff_t *pos) 993static void *p_start(struct seq_file *m, loff_t *pos)
@@ -1053,8 +995,6 @@ static void *p_start(struct seq_file *m, loff_t *pos)
1053{ 995{
1054 struct trace_pid_list *pid_list; 996 struct trace_pid_list *pid_list;
1055 struct trace_array *tr = m->private; 997 struct trace_array *tr = m->private;
1056 unsigned long pid;
1057 loff_t l = 0;
1058 998
1059 /* 999 /*
1060 * Grab the mutex, to keep calls to p_next() having the same 1000 * Grab the mutex, to keep calls to p_next() having the same
@@ -1070,15 +1010,7 @@ static void *p_start(struct seq_file *m, loff_t *pos)
1070 if (!pid_list) 1010 if (!pid_list)
1071 return NULL; 1011 return NULL;
1072 1012
1073 pid = find_first_bit(pid_list->pids, pid_list->pid_max); 1013 return trace_pid_start(pid_list, pos);
1074 if (pid >= pid_list->pid_max)
1075 return NULL;
1076
1077 /* Return pid + 1 so that zero can be the exit value */
1078 for (pid++; pid && l < *pos;
1079 pid = (unsigned long)p_next(m, (void *)pid, &l))
1080 ;
1081 return (void *)pid;
1082} 1014}
1083 1015
1084static void p_stop(struct seq_file *m, void *p) 1016static void p_stop(struct seq_file *m, void *p)
@@ -1088,14 +1020,6 @@ static void p_stop(struct seq_file *m, void *p)
1088 mutex_unlock(&event_mutex); 1020 mutex_unlock(&event_mutex);
1089} 1021}
1090 1022
1091static int p_show(struct seq_file *m, void *v)
1092{
1093 unsigned long pid = (unsigned long)v - 1;
1094
1095 seq_printf(m, "%lu\n", pid);
1096 return 0;
1097}
1098
1099static ssize_t 1023static ssize_t
1100event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, 1024event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
1101 loff_t *ppos) 1025 loff_t *ppos)
@@ -1654,7 +1578,7 @@ static void ignore_task_cpu(void *data)
1654 mutex_is_locked(&event_mutex)); 1578 mutex_is_locked(&event_mutex));
1655 1579
1656 this_cpu_write(tr->trace_buffer.data->ignore_pid, 1580 this_cpu_write(tr->trace_buffer.data->ignore_pid,
1657 ignore_this_task(pid_list, current)); 1581 trace_ignore_this_task(pid_list, current));
1658} 1582}
1659 1583
1660static ssize_t 1584static ssize_t
@@ -1666,13 +1590,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
1666 struct trace_pid_list *filtered_pids = NULL; 1590 struct trace_pid_list *filtered_pids = NULL;
1667 struct trace_pid_list *pid_list; 1591 struct trace_pid_list *pid_list;
1668 struct trace_event_file *file; 1592 struct trace_event_file *file;
1669 struct trace_parser parser; 1593 ssize_t ret;
1670 unsigned long val;
1671 loff_t this_pos;
1672 ssize_t read = 0;
1673 ssize_t ret = 0;
1674 pid_t pid;
1675 int nr_pids = 0;
1676 1594
1677 if (!cnt) 1595 if (!cnt)
1678 return 0; 1596 return 0;
@@ -1681,93 +1599,15 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
1681 if (ret < 0) 1599 if (ret < 0)
1682 return ret; 1600 return ret;
1683 1601
1684 if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1))
1685 return -ENOMEM;
1686
1687 mutex_lock(&event_mutex); 1602 mutex_lock(&event_mutex);
1603
1688 filtered_pids = rcu_dereference_protected(tr->filtered_pids, 1604 filtered_pids = rcu_dereference_protected(tr->filtered_pids,
1689 lockdep_is_held(&event_mutex)); 1605 lockdep_is_held(&event_mutex));
1690 1606
1691 /* 1607 ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
1692 * Always recreate a new array. The write is an all or nothing 1608 if (ret < 0)
1693 * operation. Always create a new array when adding new pids by
1694 * the user. If the operation fails, then the current list is
1695 * not modified.
1696 */
1697 pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
1698 if (!pid_list) {
1699 read = -ENOMEM;
1700 goto out;
1701 }
1702 pid_list->pid_max = READ_ONCE(pid_max);
1703 /* Only truncating will shrink pid_max */
1704 if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max)
1705 pid_list->pid_max = filtered_pids->pid_max;
1706 pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
1707 if (!pid_list->pids) {
1708 kfree(pid_list);
1709 read = -ENOMEM;
1710 goto out;
1711 }
1712 if (filtered_pids) {
1713 /* copy the current bits to the new max */
1714 pid = find_first_bit(filtered_pids->pids,
1715 filtered_pids->pid_max);
1716 while (pid < filtered_pids->pid_max) {
1717 set_bit(pid, pid_list->pids);
1718 pid = find_next_bit(filtered_pids->pids,
1719 filtered_pids->pid_max,
1720 pid + 1);
1721 nr_pids++;
1722 }
1723 }
1724
1725 while (cnt > 0) {
1726
1727 this_pos = 0;
1728
1729 ret = trace_get_user(&parser, ubuf, cnt, &this_pos);
1730 if (ret < 0 || !trace_parser_loaded(&parser))
1731 break;
1732
1733 read += ret;
1734 ubuf += ret;
1735 cnt -= ret;
1736
1737 parser.buffer[parser.idx] = 0;
1738
1739 ret = -EINVAL;
1740 if (kstrtoul(parser.buffer, 0, &val))
1741 break;
1742 if (val >= pid_list->pid_max)
1743 break;
1744
1745 pid = (pid_t)val;
1746
1747 set_bit(pid, pid_list->pids);
1748 nr_pids++;
1749
1750 trace_parser_clear(&parser);
1751 ret = 0;
1752 }
1753 trace_parser_put(&parser);
1754
1755 if (ret < 0) {
1756 vfree(pid_list->pids);
1757 kfree(pid_list);
1758 read = ret;
1759 goto out; 1609 goto out;
1760 }
1761 1610
1762 if (!nr_pids) {
1763 /* Cleared the list of pids */
1764 vfree(pid_list->pids);
1765 kfree(pid_list);
1766 read = ret;
1767 if (!filtered_pids)
1768 goto out;
1769 pid_list = NULL;
1770 }
1771 rcu_assign_pointer(tr->filtered_pids, pid_list); 1611 rcu_assign_pointer(tr->filtered_pids, pid_list);
1772 1612
1773 list_for_each_entry(file, &tr->events, list) { 1613 list_for_each_entry(file, &tr->events, list) {
@@ -1776,10 +1616,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
1776 1616
1777 if (filtered_pids) { 1617 if (filtered_pids) {
1778 synchronize_sched(); 1618 synchronize_sched();
1779 1619 trace_free_pid_list(filtered_pids);
1780 vfree(filtered_pids->pids); 1620 } else if (pid_list) {
1781 kfree(filtered_pids);
1782 } else {
1783 /* 1621 /*
1784 * Register a probe that is called before all other probes 1622 * Register a probe that is called before all other probes
1785 * to set ignore_pid if next or prev do not match. 1623 * to set ignore_pid if next or prev do not match.
@@ -1817,9 +1655,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
1817 out: 1655 out:
1818 mutex_unlock(&event_mutex); 1656 mutex_unlock(&event_mutex);
1819 1657
1820 ret = read; 1658 if (ret > 0)
1821 if (read > 0) 1659 *ppos += ret;
1822 *ppos += read;
1823 1660
1824 return ret; 1661 return ret;
1825} 1662}
@@ -1846,7 +1683,7 @@ static const struct seq_operations show_set_event_seq_ops = {
1846static const struct seq_operations show_set_pid_seq_ops = { 1683static const struct seq_operations show_set_pid_seq_ops = {
1847 .start = p_start, 1684 .start = p_start,
1848 .next = p_next, 1685 .next = p_next,
1849 .show = p_show, 1686 .show = trace_pid_show,
1850 .stop = p_stop, 1687 .stop = p_stop,
1851}; 1688};
1852 1689
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 5a095c2e4b69..0efa00d80623 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -43,7 +43,7 @@ static int allocate_ftrace_ops(struct trace_array *tr)
43 43
44	/* Currently only the non-stack version is supported */ 44
45 ops->func = function_trace_call; 45 ops->func = function_trace_call;
46 ops->flags = FTRACE_OPS_FL_RECURSION_SAFE; 46 ops->flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_PID;
47 47
48 tr->ops = ops; 48 tr->ops = ops;
49 ops->private = tr; 49 ops->private = tr;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 3a0244ff7ea8..7363ccf79512 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -319,7 +319,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
319 int cpu; 319 int cpu;
320 int pc; 320 int pc;
321 321
322 if (!ftrace_trace_task(current)) 322 if (!ftrace_trace_task(tr))
323 return 0; 323 return 0;
324 324
325 /* trace it when it is-nested-in or is a function enabled. */ 325 /* trace it when it is-nested-in or is a function enabled. */
@@ -338,6 +338,13 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
338 if (ftrace_graph_notrace_addr(trace->func)) 338 if (ftrace_graph_notrace_addr(trace->func))
339 return 1; 339 return 1;
340 340
341 /*
342	 * Stop here if tracing_thresh is set. We only write function return
343 * events to the ring buffer.
344 */
345 if (tracing_thresh)
346 return 1;
347
341 local_irq_save(flags); 348 local_irq_save(flags);
342 cpu = raw_smp_processor_id(); 349 cpu = raw_smp_processor_id();
343 data = per_cpu_ptr(tr->trace_buffer.data, cpu); 350 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
@@ -355,14 +362,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
355 return ret; 362 return ret;
356} 363}
357 364
358static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
359{
360 if (tracing_thresh)
361 return 1;
362 else
363 return trace_graph_entry(trace);
364}
365
366static void 365static void
367__trace_graph_function(struct trace_array *tr, 366__trace_graph_function(struct trace_array *tr,
368 unsigned long ip, unsigned long flags, int pc) 367 unsigned long ip, unsigned long flags, int pc)
@@ -457,7 +456,7 @@ static int graph_trace_init(struct trace_array *tr)
457 set_graph_array(tr); 456 set_graph_array(tr);
458 if (tracing_thresh) 457 if (tracing_thresh)
459 ret = register_ftrace_graph(&trace_graph_thresh_return, 458 ret = register_ftrace_graph(&trace_graph_thresh_return,
460 &trace_graph_thresh_entry); 459 &trace_graph_entry);
461 else 460 else
462 ret = register_ftrace_graph(&trace_graph_return, 461 ret = register_ftrace_graph(&trace_graph_return,
463 &trace_graph_entry); 462 &trace_graph_entry);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5546eec0505f..9aedb0b06683 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -587,6 +587,7 @@ static int create_trace_kprobe(int argc, char **argv)
587 * $retval : fetch return value 587 * $retval : fetch return value
588 * $stack : fetch stack address 588 * $stack : fetch stack address
589 * $stackN : fetch Nth of stack (N:0-) 589 * $stackN : fetch Nth of stack (N:0-)
590 * $comm : fetch current task comm
590 * @ADDR : fetch memory at ADDR (ADDR should be in kernel) 591 * @ADDR : fetch memory at ADDR (ADDR should be in kernel)
591 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) 592 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
592 * %REG : fetch register REG 593 * %REG : fetch register REG
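With this addition a kprobe event definition can record the current task's comm as a string argument via $comm. As a rough usage sketch only, the program below appends such a definition through tracefs; the mount point, event name and probed symbol (do_sys_open) are assumptions and not part of this change.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Path, event name and probe target are illustrative assumptions. */
	const char *path = "/sys/kernel/debug/tracing/kprobe_events";
	const char *probe = "p:myopen do_sys_open $comm\n";
	int fd = open(path, O_WRONLY | O_APPEND);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, probe, strlen(probe)) < 0)
		perror("write");
	close(fd);
	return 0;
}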
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 68f376ca6d3f..cd7480d0a201 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -68,19 +68,15 @@ static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
68 trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", 68 trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
69 dev->bus->number, dev->devfn, 69 dev->bus->number, dev->devfn,
70 dev->vendor, dev->device, dev->irq); 70 dev->vendor, dev->device, dev->irq);
71 /*
72 * XXX: is pci_resource_to_user() appropriate, since we are
73 * supposed to interpret the __ioremap() phys_addr argument based on
74 * these printed values?
75 */
76 for (i = 0; i < 7; i++) { 71 for (i = 0; i < 7; i++) {
77 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); 72 start = dev->resource[i].start;
78 trace_seq_printf(s, " %llx", 73 trace_seq_printf(s, " %llx",
79 (unsigned long long)(start | 74 (unsigned long long)(start |
80 (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); 75 (dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
81 } 76 }
82 for (i = 0; i < 7; i++) { 77 for (i = 0; i < 7; i++) {
83 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); 78 start = dev->resource[i].start;
79 end = dev->resource[i].end;
84 trace_seq_printf(s, " %llx", 80 trace_seq_printf(s, " %llx",
85 dev->resource[i].start < dev->resource[i].end ? 81 dev->resource[i].start < dev->resource[i].end ?
86 (unsigned long long)(end - start) + 1 : 0); 82 (unsigned long long)(end - start) + 1 : 0);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index f96f0383f6c6..ad1d6164e946 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -36,6 +36,10 @@ struct trace_bprintk_fmt {
36static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) 36static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
37{ 37{
38 struct trace_bprintk_fmt *pos; 38 struct trace_bprintk_fmt *pos;
39
40 if (!fmt)
41 return ERR_PTR(-EINVAL);
42
39 list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { 43 list_for_each_entry(pos, &trace_bprintk_fmt_list, list) {
40 if (!strcmp(pos->fmt, fmt)) 44 if (!strcmp(pos->fmt, fmt))
41 return pos; 45 return pos;
@@ -57,7 +61,8 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
57 for (iter = start; iter < end; iter++) { 61 for (iter = start; iter < end; iter++) {
58 struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); 62 struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);
59 if (tb_fmt) { 63 if (tb_fmt) {
60 *iter = tb_fmt->fmt; 64 if (!IS_ERR(tb_fmt))
65 *iter = tb_fmt->fmt;
61 continue; 66 continue;
62 } 67 }
63 68
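lookup_format() now reports a NULL format pointer as ERR_PTR(-EINVAL) instead of treating it like any other miss, and the caller above only dereferences tb_fmt->fmt after an IS_ERR() check. The standalone sketch below restates that error-pointer idiom, with the kernel macros re-declared locally for illustration.

#include <stdio.h>

/* Local re-statements of the kernel's error-pointer helpers, for illustration. */
#define MAX_ERRNO	4095
#define ERR_PTR(err)	((void *)(long)(err))
#define PTR_ERR(ptr)	((long)(ptr))
#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

static const char *lookup(const char *fmt)
{
	if (!fmt)
		return ERR_PTR(-22);	/* -EINVAL: caller passed no format */
	/* A real lookup would search a list here; pretend nothing matched. */
	return NULL;			/* NULL still means "not found, go allocate" */
}

int main(void)
{
	const char *hit = lookup(NULL);

	if (IS_ERR(hit))
		printf("error %ld, do not dereference\n", PTR_ERR(hit));
	else if (!hit)
		printf("not found\n");
	else
		printf("found %s\n", hit);
	return 0;
}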
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 1d372fa6fefb..74e80a582c28 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -218,6 +218,28 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data)
218 kfree(data); 218 kfree(data);
219} 219}
220 220
221void FETCH_FUNC_NAME(comm, string)(struct pt_regs *regs,
222 void *data, void *dest)
223{
224 int maxlen = get_rloc_len(*(u32 *)dest);
225 u8 *dst = get_rloc_data(dest);
226 long ret;
227
228 if (!maxlen)
229 return;
230
231 ret = strlcpy(dst, current->comm, maxlen);
232 *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest));
233}
234NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string));
235
236void FETCH_FUNC_NAME(comm, string_size)(struct pt_regs *regs,
237 void *data, void *dest)
238{
239 *(u32 *)dest = strlen(current->comm) + 1;
240}
241NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string_size));
242
221static const struct fetch_type *find_fetch_type(const char *type, 243static const struct fetch_type *find_fetch_type(const char *type,
222 const struct fetch_type *ftbl) 244 const struct fetch_type *ftbl)
223{ 245{
@@ -348,6 +370,11 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
348 } 370 }
349 } else 371 } else
350 ret = -EINVAL; 372 ret = -EINVAL;
373 } else if (strcmp(arg, "comm") == 0) {
374 if (strcmp(t->name, "string") != 0 &&
375 strcmp(t->name, "string_size") != 0)
376 return -EINVAL;
377 f->fn = t->fetch[FETCH_MTD_comm];
351 } else 378 } else
352 ret = -EINVAL; 379 ret = -EINVAL;
353 380
@@ -522,6 +549,12 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
522 arg[t - parg->comm] = '\0'; 549 arg[t - parg->comm] = '\0';
523 t++; 550 t++;
524 } 551 }
552 /*
553 * The default type of $comm should be "string", and it can't be
554 * dereferenced.
555 */
556 if (!t && strcmp(arg, "$comm") == 0)
557 t = "string";
525 parg->type = find_fetch_type(t, ftbl); 558 parg->type = find_fetch_type(t, ftbl);
526 if (!parg->type) { 559 if (!parg->type) {
527 pr_info("Unsupported type: %s\n", t); 560 pr_info("Unsupported type: %s\n", t);
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index f6398db09114..45400ca5ded1 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -102,6 +102,7 @@ enum {
102 FETCH_MTD_reg = 0, 102 FETCH_MTD_reg = 0,
103 FETCH_MTD_stack, 103 FETCH_MTD_stack,
104 FETCH_MTD_retval, 104 FETCH_MTD_retval,
105 FETCH_MTD_comm,
105 FETCH_MTD_memory, 106 FETCH_MTD_memory,
106 FETCH_MTD_symbol, 107 FETCH_MTD_symbol,
107 FETCH_MTD_deref, 108 FETCH_MTD_deref,
@@ -183,6 +184,14 @@ DECLARE_BASIC_FETCH_FUNCS(bitfield);
183#define fetch_bitfield_string NULL 184#define fetch_bitfield_string NULL
184#define fetch_bitfield_string_size NULL 185#define fetch_bitfield_string_size NULL
185 186
187/* comm only makes sense as a string */
188#define fetch_comm_u8 NULL
189#define fetch_comm_u16 NULL
190#define fetch_comm_u32 NULL
191#define fetch_comm_u64 NULL
192DECLARE_FETCH_FUNC(comm, string);
193DECLARE_FETCH_FUNC(comm, string_size);
194
186/* 195/*
187 * Define macro for basic types - we don't need to define s* types, because 196 * Define macro for basic types - we don't need to define s* types, because
188 * we have to care only about bitwidth at recording time. 197 * we have to care only about bitwidth at recording time.
@@ -213,6 +222,7 @@ DEFINE_FETCH_##method(u64)
213ASSIGN_FETCH_FUNC(reg, ftype), \ 222ASSIGN_FETCH_FUNC(reg, ftype), \
214ASSIGN_FETCH_FUNC(stack, ftype), \ 223ASSIGN_FETCH_FUNC(stack, ftype), \
215ASSIGN_FETCH_FUNC(retval, ftype), \ 224ASSIGN_FETCH_FUNC(retval, ftype), \
225ASSIGN_FETCH_FUNC(comm, ftype), \
216ASSIGN_FETCH_FUNC(memory, ftype), \ 226ASSIGN_FETCH_FUNC(memory, ftype), \
217ASSIGN_FETCH_FUNC(symbol, ftype), \ 227ASSIGN_FETCH_FUNC(symbol, ftype), \
218ASSIGN_FETCH_FUNC(deref, ftype), \ 228ASSIGN_FETCH_FUNC(deref, ftype), \
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 9bafc211930c..68f594212759 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -938,6 +938,20 @@ bool userns_may_setgroups(const struct user_namespace *ns)
938 return allowed; 938 return allowed;
939} 939}
940 940
941/*
942 * Returns true if the caller's user namespace is the same as or a
943 * descendant of @target_ns.
944 */
945bool current_in_userns(const struct user_namespace *target_ns)
946{
947 struct user_namespace *ns;
948 for (ns = current_user_ns(); ns; ns = ns->parent) {
949 if (ns == target_ns)
950 return true;
951 }
952 return false;
953}
954
941static inline struct user_namespace *to_user_ns(struct ns_common *ns) 955static inline struct user_namespace *to_user_ns(struct ns_common *ns)
942{ 956{
943 return container_of(ns, struct user_namespace, ns); 957 return container_of(ns, struct user_namespace, ns);
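current_in_userns() walks from the caller's user namespace up through the parent chain and reports whether @target_ns is reached, i.e. whether the caller sits in @target_ns or in a namespace nested beneath it. A small userspace model of that walk follows; the structure and names are illustrative only, not the kernel's.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative model of a namespace hierarchy; not the kernel's struct. */
struct ns {
	struct ns *parent;
	const char *name;
};

/* Same shape as current_in_userns(): walk from @start up to the root. */
static bool in_ns(struct ns *start, const struct ns *target)
{
	for (struct ns *ns = start; ns; ns = ns->parent) {
		if (ns == target)
			return true;
	}
	return false;
}

int main(void)
{
	struct ns init_ns = { .parent = NULL,     .name = "init" };
	struct ns child   = { .parent = &init_ns, .name = "child" };
	struct ns grand   = { .parent = &child,   .name = "grandchild" };

	printf("%d\n", in_ns(&grand, &init_ns));	/* 1: descendant */
	printf("%d\n", in_ns(&init_ns, &child));	/* 0: ancestor, not descendant */
	return 0;
}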
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e1c0e996b5ae..ef071ca73fc3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4369,8 +4369,8 @@ static void show_pwq(struct pool_workqueue *pwq)
4369/** 4369/**
4370 * show_workqueue_state - dump workqueue state 4370 * show_workqueue_state - dump workqueue state
4371 * 4371 *
4372 * Called from a sysrq handler and prints out all busy workqueues and 4372 * Called from a sysrq handler or try_to_freeze_tasks() and prints out
4373 * pools. 4373 * all busy workqueues and pools.
4374 */ 4374 */
4375void show_workqueue_state(void) 4375void show_workqueue_state(void)
4376{ 4376{
@@ -4600,95 +4600,72 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
4600 if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) 4600 if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
4601 return; 4601 return;
4602 4602
4603 /* is @cpu the only online CPU? */
4604 cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask); 4603 cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask);
4605 if (cpumask_weight(&cpumask) != 1)
4606 return;
4607 4604
4608 /* as we're called from CPU_ONLINE, the following shouldn't fail */ 4605 /* as we're called from CPU_ONLINE, the following shouldn't fail */
4609 for_each_pool_worker(worker, pool) 4606 for_each_pool_worker(worker, pool)
4610 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, 4607 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0);
4611 pool->attrs->cpumask) < 0);
4612} 4608}
4613 4609
4614/* 4610int workqueue_prepare_cpu(unsigned int cpu)
4615 * Workqueues should be brought up before normal priority CPU notifiers. 4611{
4616 * This will be registered high priority CPU notifier. 4612 struct worker_pool *pool;
4617 */ 4613
4618static int workqueue_cpu_up_callback(struct notifier_block *nfb, 4614 for_each_cpu_worker_pool(pool, cpu) {
4619 unsigned long action, 4615 if (pool->nr_workers)
4620 void *hcpu) 4616 continue;
4617 if (!create_worker(pool))
4618 return -ENOMEM;
4619 }
4620 return 0;
4621}
4622
4623int workqueue_online_cpu(unsigned int cpu)
4621{ 4624{
4622 int cpu = (unsigned long)hcpu;
4623 struct worker_pool *pool; 4625 struct worker_pool *pool;
4624 struct workqueue_struct *wq; 4626 struct workqueue_struct *wq;
4625 int pi; 4627 int pi;
4626 4628
4627 switch (action & ~CPU_TASKS_FROZEN) { 4629 mutex_lock(&wq_pool_mutex);
4628 case CPU_UP_PREPARE:
4629 for_each_cpu_worker_pool(pool, cpu) {
4630 if (pool->nr_workers)
4631 continue;
4632 if (!create_worker(pool))
4633 return NOTIFY_BAD;
4634 }
4635 break;
4636
4637 case CPU_DOWN_FAILED:
4638 case CPU_ONLINE:
4639 mutex_lock(&wq_pool_mutex);
4640 4630
4641 for_each_pool(pool, pi) { 4631 for_each_pool(pool, pi) {
4642 mutex_lock(&pool->attach_mutex); 4632 mutex_lock(&pool->attach_mutex);
4643 4633
4644 if (pool->cpu == cpu) 4634 if (pool->cpu == cpu)
4645 rebind_workers(pool); 4635 rebind_workers(pool);
4646 else if (pool->cpu < 0) 4636 else if (pool->cpu < 0)
4647 restore_unbound_workers_cpumask(pool, cpu); 4637 restore_unbound_workers_cpumask(pool, cpu);
4648 4638
4649 mutex_unlock(&pool->attach_mutex); 4639 mutex_unlock(&pool->attach_mutex);
4650 } 4640 }
4651 4641
4652 /* update NUMA affinity of unbound workqueues */ 4642 /* update NUMA affinity of unbound workqueues */
4653 list_for_each_entry(wq, &workqueues, list) 4643 list_for_each_entry(wq, &workqueues, list)
4654 wq_update_unbound_numa(wq, cpu, true); 4644 wq_update_unbound_numa(wq, cpu, true);
4655 4645
4656 mutex_unlock(&wq_pool_mutex); 4646 mutex_unlock(&wq_pool_mutex);
4657 break; 4647 return 0;
4658 }
4659 return NOTIFY_OK;
4660} 4648}
4661 4649
4662/* 4650int workqueue_offline_cpu(unsigned int cpu)
4663 * Workqueues should be brought down after normal priority CPU notifiers.
4664 * This will be registered as low priority CPU notifier.
4665 */
4666static int workqueue_cpu_down_callback(struct notifier_block *nfb,
4667 unsigned long action,
4668 void *hcpu)
4669{ 4651{
4670 int cpu = (unsigned long)hcpu;
4671 struct work_struct unbind_work; 4652 struct work_struct unbind_work;
4672 struct workqueue_struct *wq; 4653 struct workqueue_struct *wq;
4673 4654
4674 switch (action & ~CPU_TASKS_FROZEN) { 4655 /* unbinding per-cpu workers should happen on the local CPU */
4675 case CPU_DOWN_PREPARE: 4656 INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
4676 /* unbinding per-cpu workers should happen on the local CPU */ 4657 queue_work_on(cpu, system_highpri_wq, &unbind_work);
4677 INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); 4658
4678 queue_work_on(cpu, system_highpri_wq, &unbind_work); 4659 /* update NUMA affinity of unbound workqueues */
4679 4660 mutex_lock(&wq_pool_mutex);
4680 /* update NUMA affinity of unbound workqueues */ 4661 list_for_each_entry(wq, &workqueues, list)
4681 mutex_lock(&wq_pool_mutex); 4662 wq_update_unbound_numa(wq, cpu, false);
4682 list_for_each_entry(wq, &workqueues, list) 4663 mutex_unlock(&wq_pool_mutex);
4683 wq_update_unbound_numa(wq, cpu, false); 4664
4684 mutex_unlock(&wq_pool_mutex); 4665 /* wait for per-cpu unbinding to finish */
4685 4666 flush_work(&unbind_work);
4686 /* wait for per-cpu unbinding to finish */ 4667 destroy_work_on_stack(&unbind_work);
4687 flush_work(&unbind_work); 4668 return 0;
4688 destroy_work_on_stack(&unbind_work);
4689 break;
4690 }
4691 return NOTIFY_OK;
4692} 4669}
4693 4670
4694#ifdef CONFIG_SMP 4671#ifdef CONFIG_SMP
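The old up/down notifiers become three plain hotplug callbacks that return 0 or a negative errno instead of NOTIFY_* values: workqueue_prepare_cpu() creates per-cpu workers, workqueue_online_cpu() rebinds workers and refreshes unbound NUMA affinity, and workqueue_offline_cpu() unbinds per-cpu workers. They are presumably wired to fixed hotplug states elsewhere in this series (not shown here). As a generic illustration of the same callback style, a module can hook a dynamic state as sketched below; this assumes a kernel that already provides cpuhp_setup_state(), and all names are illustrative.

#include <linux/cpuhotplug.h>
#include <linux/module.h>

static enum cpuhp_state hp_state;

static int demo_online(unsigned int cpu)
{
	pr_info("demo: cpu%u online\n", cpu);
	return 0;			/* a negative errno would fail the bring-up */
}

static int demo_offline(unsigned int cpu)
{
	pr_info("demo: cpu%u going down\n", cpu);
	return 0;
}

static int __init demo_init(void)
{
	int ret;

	/* Dynamic state: the core picks a slot and runs demo_online() on each CPU. */
	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "demo:online",
				demo_online, demo_offline);
	if (ret < 0)
		return ret;
	hp_state = ret;
	return 0;
}

static void __exit demo_exit(void)
{
	cpuhp_remove_state(hp_state);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");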
@@ -5490,9 +5467,6 @@ static int __init init_workqueues(void)
5490 5467
5491 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); 5468 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
5492 5469
5493 cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
5494 hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
5495
5496 wq_numa_init(); 5470 wq_numa_init();
5497 5471
5498 /* initialize CPU pools */ 5472 /* initialize CPU pools */