Diffstat (limited to 'kernel'): 126 files changed, 6575 insertions(+), 2605 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index e2ec54e2b952..eb26e12c6c2a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
| @@ -9,7 +9,7 @@ obj-y = fork.o exec_domain.o panic.o \ | |||
| 9 | extable.o params.o \ | 9 | extable.o params.o \ |
| 10 | kthread.o sys_ni.o nsproxy.o \ | 10 | kthread.o sys_ni.o nsproxy.o \ |
| 11 | notifier.o ksysfs.o cred.o reboot.o \ | 11 | notifier.o ksysfs.o cred.o reboot.o \ |
| 12 | async.o range.o smpboot.o | 12 | async.o range.o smpboot.o ucount.o |
| 13 | 13 | ||
| 14 | obj-$(CONFIG_MULTIUSER) += groups.o | 14 | obj-$(CONFIG_MULTIUSER) += groups.o |
| 15 | 15 | ||
diff --git a/kernel/audit.c b/kernel/audit.c
index a8a91bd2b2a9..f1ca11613379 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
| @@ -877,6 +877,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 877 | return err; | 877 | return err; |
| 878 | } | 878 | } |
| 879 | if (s.mask & AUDIT_STATUS_PID) { | 879 | if (s.mask & AUDIT_STATUS_PID) { |
| 880 | /* NOTE: we are using task_tgid_vnr() below because | ||
| 881 | * the s.pid value is relative to the namespace | ||
| 882 | * of the caller; at present this doesn't matter | ||
| 883 | * much since you can really only run auditd | ||
| 884 | * from the initial pid namespace, but something | ||
| 885 | * to keep in mind if this changes */ | ||
| 880 | int new_pid = s.pid; | 886 | int new_pid = s.pid; |
| 881 | pid_t requesting_pid = task_tgid_vnr(current); | 887 | pid_t requesting_pid = task_tgid_vnr(current); |
| 882 | 888 | ||
| @@ -1917,7 +1923,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) | |||
| 1917 | " euid=%u suid=%u fsuid=%u" | 1923 | " euid=%u suid=%u fsuid=%u" |
| 1918 | " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", | 1924 | " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", |
| 1919 | task_ppid_nr(tsk), | 1925 | task_ppid_nr(tsk), |
| 1920 | task_pid_nr(tsk), | 1926 | task_tgid_nr(tsk), |
| 1921 | from_kuid(&init_user_ns, audit_get_loginuid(tsk)), | 1927 | from_kuid(&init_user_ns, audit_get_loginuid(tsk)), |
| 1922 | from_kuid(&init_user_ns, cred->uid), | 1928 | from_kuid(&init_user_ns, cred->uid), |
| 1923 | from_kgid(&init_user_ns, cred->gid), | 1929 | from_kgid(&init_user_ns, cred->gid), |
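The audit.c hunk above (and several auditsc.c hunks below) switch the logged pid fields from task_pid_nr() to task_tgid_nr(), so a multithreaded process is reported by its thread-group (process) id instead of whichever thread happened to trigger the record. A minimal userspace sketch of the distinction, not part of the patch (the program and its names are made up for illustration):

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Every thread shares one thread-group id (what getpid() returns and what
 * task_tgid_nr() reports in the kernel) but has its own thread id (what
 * gettid()/task_pid_nr() report), so logging the tgid identifies the
 * process regardless of which thread raised the event.
 */
static void *worker(void *arg)
{
	(void)arg;
	printf("worker: tgid=%d tid=%ld\n", (int)getpid(), syscall(SYS_gettid));
	return NULL;
}

int main(void)
{
	pthread_t t;

	printf("main:   tgid=%d tid=%ld\n", (int)getpid(), syscall(SYS_gettid));
	pthread_create(&t, NULL, worker, NULL);
	pthread_join(t, NULL);
	return 0;
}

Built with cc -pthread, both lines print the same tgid but different tids.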
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index d6709eb70970..0d302a87f21b 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
| @@ -19,6 +19,7 @@ | |||
| 19 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 19 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
| 20 | */ | 20 | */ |
| 21 | 21 | ||
| 22 | #include <linux/file.h> | ||
| 22 | #include <linux/kernel.h> | 23 | #include <linux/kernel.h> |
| 23 | #include <linux/audit.h> | 24 | #include <linux/audit.h> |
| 24 | #include <linux/kthread.h> | 25 | #include <linux/kthread.h> |
| @@ -544,10 +545,11 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) | |||
| 544 | unsigned long ino; | 545 | unsigned long ino; |
| 545 | dev_t dev; | 546 | dev_t dev; |
| 546 | 547 | ||
| 547 | rcu_read_lock(); | 548 | exe_file = get_task_exe_file(tsk); |
| 548 | exe_file = rcu_dereference(tsk->mm->exe_file); | 549 | if (!exe_file) |
| 550 | return 0; | ||
| 549 | ino = exe_file->f_inode->i_ino; | 551 | ino = exe_file->f_inode->i_ino; |
| 550 | dev = exe_file->f_inode->i_sb->s_dev; | 552 | dev = exe_file->f_inode->i_sb->s_dev; |
| 551 | rcu_read_unlock(); | 553 | fput(exe_file); |
| 552 | return audit_mark_compare(mark, ino, dev); | 554 | return audit_mark_compare(mark, ino, dev); |
| 553 | } | 555 | } |
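The audit_watch.c change replaces a bare rcu_dereference() of tsk->mm->exe_file, which can race with the task tearing down its mm, with get_task_exe_file(), which returns a referenced struct file (or NULL) and must be paired with fput(). A sketch of that take/use/put pattern as a kernel-style helper (assumes <linux/file.h>, <linux/mm.h> and the 4.x in-kernel API; task_exe_ino_dev() is a made-up name):

/* Sketch: look up the inode number and device of a task's executable.
 * get_task_exe_file() takes a reference on the struct file (or returns
 * NULL for kernel threads / exiting tasks), so f_inode can be read
 * safely and the reference must be dropped with fput() afterwards.
 */
static int task_exe_ino_dev(struct task_struct *tsk,
			    unsigned long *ino, dev_t *dev)
{
	struct file *exe_file;

	exe_file = get_task_exe_file(tsk);
	if (!exe_file)
		return -ENOENT;

	*ino = exe_file->f_inode->i_ino;
	*dev = exe_file->f_inode->i_sb->s_dev;
	fput(exe_file);
	return 0;
}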
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 5abf1dc1f91c..2cd5256dbff7 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
| @@ -457,7 +457,7 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
| 457 | 457 | ||
| 458 | switch (f->type) { | 458 | switch (f->type) { |
| 459 | case AUDIT_PID: | 459 | case AUDIT_PID: |
| 460 | pid = task_pid_nr(tsk); | 460 | pid = task_tgid_nr(tsk); |
| 461 | result = audit_comparator(pid, f->op, f->val); | 461 | result = audit_comparator(pid, f->op, f->val); |
| 462 | break; | 462 | break; |
| 463 | case AUDIT_PPID: | 463 | case AUDIT_PPID: |
| @@ -1993,7 +1993,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, | |||
| 1993 | loginuid = from_kuid(&init_user_ns, kloginuid), | 1993 | loginuid = from_kuid(&init_user_ns, kloginuid), |
| 1994 | tty = audit_get_tty(current); | 1994 | tty = audit_get_tty(current); |
| 1995 | 1995 | ||
| 1996 | audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid); | 1996 | audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); |
| 1997 | audit_log_task_context(ab); | 1997 | audit_log_task_context(ab); |
| 1998 | audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", | 1998 | audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", |
| 1999 | oldloginuid, loginuid, tty ? tty_name(tty) : "(none)", | 1999 | oldloginuid, loginuid, tty ? tty_name(tty) : "(none)", |
| @@ -2220,7 +2220,7 @@ void __audit_ptrace(struct task_struct *t) | |||
| 2220 | { | 2220 | { |
| 2221 | struct audit_context *context = current->audit_context; | 2221 | struct audit_context *context = current->audit_context; |
| 2222 | 2222 | ||
| 2223 | context->target_pid = task_pid_nr(t); | 2223 | context->target_pid = task_tgid_nr(t); |
| 2224 | context->target_auid = audit_get_loginuid(t); | 2224 | context->target_auid = audit_get_loginuid(t); |
| 2225 | context->target_uid = task_uid(t); | 2225 | context->target_uid = task_uid(t); |
| 2226 | context->target_sessionid = audit_get_sessionid(t); | 2226 | context->target_sessionid = audit_get_sessionid(t); |
| @@ -2245,7 +2245,7 @@ int __audit_signal_info(int sig, struct task_struct *t) | |||
| 2245 | 2245 | ||
| 2246 | if (audit_pid && t->tgid == audit_pid) { | 2246 | if (audit_pid && t->tgid == audit_pid) { |
| 2247 | if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { | 2247 | if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { |
| 2248 | audit_sig_pid = task_pid_nr(tsk); | 2248 | audit_sig_pid = task_tgid_nr(tsk); |
| 2249 | if (uid_valid(tsk->loginuid)) | 2249 | if (uid_valid(tsk->loginuid)) |
| 2250 | audit_sig_uid = tsk->loginuid; | 2250 | audit_sig_uid = tsk->loginuid; |
| 2251 | else | 2251 | else |
| @@ -2345,7 +2345,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, | |||
| 2345 | void __audit_log_capset(const struct cred *new, const struct cred *old) | 2345 | void __audit_log_capset(const struct cred *new, const struct cred *old) |
| 2346 | { | 2346 | { |
| 2347 | struct audit_context *context = current->audit_context; | 2347 | struct audit_context *context = current->audit_context; |
| 2348 | context->capset.pid = task_pid_nr(current); | 2348 | context->capset.pid = task_tgid_nr(current); |
| 2349 | context->capset.cap.effective = new->cap_effective; | 2349 | context->capset.cap.effective = new->cap_effective; |
| 2350 | context->capset.cap.inheritable = new->cap_effective; | 2350 | context->capset.cap.inheritable = new->cap_effective; |
| 2351 | context->capset.cap.permitted = new->cap_permitted; | 2351 | context->capset.cap.permitted = new->cap_permitted; |
| @@ -2377,7 +2377,7 @@ static void audit_log_task(struct audit_buffer *ab) | |||
| 2377 | from_kgid(&init_user_ns, gid), | 2377 | from_kgid(&init_user_ns, gid), |
| 2378 | sessionid); | 2378 | sessionid); |
| 2379 | audit_log_task_context(ab); | 2379 | audit_log_task_context(ab); |
| 2380 | audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); | 2380 | audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current)); |
| 2381 | audit_log_untrustedstring(ab, get_task_comm(comm, current)); | 2381 | audit_log_untrustedstring(ab, get_task_comm(comm, current)); |
| 2382 | audit_log_d_path_exe(ab, current->mm); | 2382 | audit_log_d_path_exe(ab, current->mm); |
| 2383 | } | 2383 | } |
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 633a650d7aeb..a2ac051c342f 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
| @@ -538,7 +538,7 @@ static int __init register_perf_event_array_map(void) | |||
| 538 | } | 538 | } |
| 539 | late_initcall(register_perf_event_array_map); | 539 | late_initcall(register_perf_event_array_map); |
| 540 | 540 | ||
| 541 | #ifdef CONFIG_SOCK_CGROUP_DATA | 541 | #ifdef CONFIG_CGROUPS |
| 542 | static void *cgroup_fd_array_get_ptr(struct bpf_map *map, | 542 | static void *cgroup_fd_array_get_ptr(struct bpf_map *map, |
| 543 | struct file *map_file /* not used */, | 543 | struct file *map_file /* not used */, |
| 544 | int fd) | 544 | int fd) |
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 03fd23d4d587..aa6d98154106 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
| @@ -1018,7 +1018,7 @@ void bpf_user_rnd_init_once(void) | |||
| 1018 | prandom_init_once(&bpf_user_rnd_state); | 1018 | prandom_init_once(&bpf_user_rnd_state); |
| 1019 | } | 1019 | } |
| 1020 | 1020 | ||
| 1021 | u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 1021 | BPF_CALL_0(bpf_user_rnd_u32) |
| 1022 | { | 1022 | { |
| 1023 | /* Should someone ever have the rather unwise idea to use some | 1023 | /* Should someone ever have the rather unwise idea to use some |
| 1024 | * of the registers passed into this function, then note that | 1024 | * of the registers passed into this function, then note that |
| @@ -1031,7 +1031,7 @@ u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | |||
| 1031 | 1031 | ||
| 1032 | state = &get_cpu_var(bpf_user_rnd_state); | 1032 | state = &get_cpu_var(bpf_user_rnd_state); |
| 1033 | res = prandom_u32_state(state); | 1033 | res = prandom_u32_state(state); |
| 1034 | put_cpu_var(state); | 1034 | put_cpu_var(bpf_user_rnd_state); |
| 1035 | 1035 | ||
| 1036 | return res; | 1036 | return res; |
| 1037 | } | 1037 | } |
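The second core.c hunk fixes the get_cpu_var()/put_cpu_var() pairing: both macros take the per-cpu variable itself, so the put must name bpf_user_rnd_state rather than the local pointer. A small sketch of the idiom under the same assumptions (<linux/percpu.h>, <linux/random.h>; demo_rnd_state and demo_next_random are made-up names):

/* Sketch of the get_cpu_var()/put_cpu_var() idiom: both macros name the
 * per-cpu variable (not a pointer to it).  get_cpu_var() disables
 * preemption and yields this CPU's instance; put_cpu_var() re-enables
 * preemption for that same variable.
 */
static DEFINE_PER_CPU(struct rnd_state, demo_rnd_state);

static u32 demo_next_random(void)
{
	struct rnd_state *state;
	u32 res;

	state = &get_cpu_var(demo_rnd_state);
	res = prandom_u32_state(state);
	put_cpu_var(demo_rnd_state);

	return res;
}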
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index fff3650d52fc..570eeca7bdfa 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
| @@ -26,11 +26,18 @@ struct bpf_htab { | |||
| 26 | struct bucket *buckets; | 26 | struct bucket *buckets; |
| 27 | void *elems; | 27 | void *elems; |
| 28 | struct pcpu_freelist freelist; | 28 | struct pcpu_freelist freelist; |
| 29 | void __percpu *extra_elems; | ||
| 29 | atomic_t count; /* number of elements in this hashtable */ | 30 | atomic_t count; /* number of elements in this hashtable */ |
| 30 | u32 n_buckets; /* number of hash buckets */ | 31 | u32 n_buckets; /* number of hash buckets */ |
| 31 | u32 elem_size; /* size of each element in bytes */ | 32 | u32 elem_size; /* size of each element in bytes */ |
| 32 | }; | 33 | }; |
| 33 | 34 | ||
| 35 | enum extra_elem_state { | ||
| 36 | HTAB_NOT_AN_EXTRA_ELEM = 0, | ||
| 37 | HTAB_EXTRA_ELEM_FREE, | ||
| 38 | HTAB_EXTRA_ELEM_USED | ||
| 39 | }; | ||
| 40 | |||
| 34 | /* each htab element is struct htab_elem + key + value */ | 41 | /* each htab element is struct htab_elem + key + value */ |
| 35 | struct htab_elem { | 42 | struct htab_elem { |
| 36 | union { | 43 | union { |
| @@ -38,7 +45,10 @@ struct htab_elem { | |||
| 38 | struct bpf_htab *htab; | 45 | struct bpf_htab *htab; |
| 39 | struct pcpu_freelist_node fnode; | 46 | struct pcpu_freelist_node fnode; |
| 40 | }; | 47 | }; |
| 41 | struct rcu_head rcu; | 48 | union { |
| 49 | struct rcu_head rcu; | ||
| 50 | enum extra_elem_state state; | ||
| 51 | }; | ||
| 42 | u32 hash; | 52 | u32 hash; |
| 43 | char key[0] __aligned(8); | 53 | char key[0] __aligned(8); |
| 44 | }; | 54 | }; |
| @@ -113,6 +123,23 @@ free_elems: | |||
| 113 | return err; | 123 | return err; |
| 114 | } | 124 | } |
| 115 | 125 | ||
| 126 | static int alloc_extra_elems(struct bpf_htab *htab) | ||
| 127 | { | ||
| 128 | void __percpu *pptr; | ||
| 129 | int cpu; | ||
| 130 | |||
| 131 | pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN); | ||
| 132 | if (!pptr) | ||
| 133 | return -ENOMEM; | ||
| 134 | |||
| 135 | for_each_possible_cpu(cpu) { | ||
| 136 | ((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state = | ||
| 137 | HTAB_EXTRA_ELEM_FREE; | ||
| 138 | } | ||
| 139 | htab->extra_elems = pptr; | ||
| 140 | return 0; | ||
| 141 | } | ||
| 142 | |||
| 116 | /* Called from syscall */ | 143 | /* Called from syscall */ |
| 117 | static struct bpf_map *htab_map_alloc(union bpf_attr *attr) | 144 | static struct bpf_map *htab_map_alloc(union bpf_attr *attr) |
| 118 | { | 145 | { |
| @@ -185,6 +212,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) | |||
| 185 | if (percpu) | 212 | if (percpu) |
| 186 | cost += (u64) round_up(htab->map.value_size, 8) * | 213 | cost += (u64) round_up(htab->map.value_size, 8) * |
| 187 | num_possible_cpus() * htab->map.max_entries; | 214 | num_possible_cpus() * htab->map.max_entries; |
| 215 | else | ||
| 216 | cost += (u64) htab->elem_size * num_possible_cpus(); | ||
| 188 | 217 | ||
| 189 | if (cost >= U32_MAX - PAGE_SIZE) | 218 | if (cost >= U32_MAX - PAGE_SIZE) |
| 190 | /* make sure page count doesn't overflow */ | 219 | /* make sure page count doesn't overflow */ |
| @@ -212,14 +241,22 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) | |||
| 212 | raw_spin_lock_init(&htab->buckets[i].lock); | 241 | raw_spin_lock_init(&htab->buckets[i].lock); |
| 213 | } | 242 | } |
| 214 | 243 | ||
| 244 | if (!percpu) { | ||
| 245 | err = alloc_extra_elems(htab); | ||
| 246 | if (err) | ||
| 247 | goto free_buckets; | ||
| 248 | } | ||
| 249 | |||
| 215 | if (!(attr->map_flags & BPF_F_NO_PREALLOC)) { | 250 | if (!(attr->map_flags & BPF_F_NO_PREALLOC)) { |
| 216 | err = prealloc_elems_and_freelist(htab); | 251 | err = prealloc_elems_and_freelist(htab); |
| 217 | if (err) | 252 | if (err) |
| 218 | goto free_buckets; | 253 | goto free_extra_elems; |
| 219 | } | 254 | } |
| 220 | 255 | ||
| 221 | return &htab->map; | 256 | return &htab->map; |
| 222 | 257 | ||
| 258 | free_extra_elems: | ||
| 259 | free_percpu(htab->extra_elems); | ||
| 223 | free_buckets: | 260 | free_buckets: |
| 224 | kvfree(htab->buckets); | 261 | kvfree(htab->buckets); |
| 225 | free_htab: | 262 | free_htab: |
| @@ -349,7 +386,6 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) | |||
| 349 | if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) | 386 | if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) |
| 350 | free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); | 387 | free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); |
| 351 | kfree(l); | 388 | kfree(l); |
| 352 | |||
| 353 | } | 389 | } |
| 354 | 390 | ||
| 355 | static void htab_elem_free_rcu(struct rcu_head *head) | 391 | static void htab_elem_free_rcu(struct rcu_head *head) |
| @@ -370,6 +406,11 @@ static void htab_elem_free_rcu(struct rcu_head *head) | |||
| 370 | 406 | ||
| 371 | static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) | 407 | static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) |
| 372 | { | 408 | { |
| 409 | if (l->state == HTAB_EXTRA_ELEM_USED) { | ||
| 410 | l->state = HTAB_EXTRA_ELEM_FREE; | ||
| 411 | return; | ||
| 412 | } | ||
| 413 | |||
| 373 | if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) { | 414 | if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) { |
| 374 | pcpu_freelist_push(&htab->freelist, &l->fnode); | 415 | pcpu_freelist_push(&htab->freelist, &l->fnode); |
| 375 | } else { | 416 | } else { |
| @@ -381,25 +422,44 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) | |||
| 381 | 422 | ||
| 382 | static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, | 423 | static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, |
| 383 | void *value, u32 key_size, u32 hash, | 424 | void *value, u32 key_size, u32 hash, |
| 384 | bool percpu, bool onallcpus) | 425 | bool percpu, bool onallcpus, |
| 426 | bool old_elem_exists) | ||
| 385 | { | 427 | { |
| 386 | u32 size = htab->map.value_size; | 428 | u32 size = htab->map.value_size; |
| 387 | bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC); | 429 | bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC); |
| 388 | struct htab_elem *l_new; | 430 | struct htab_elem *l_new; |
| 389 | void __percpu *pptr; | 431 | void __percpu *pptr; |
| 432 | int err = 0; | ||
| 390 | 433 | ||
| 391 | if (prealloc) { | 434 | if (prealloc) { |
| 392 | l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist); | 435 | l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist); |
| 393 | if (!l_new) | 436 | if (!l_new) |
| 394 | return ERR_PTR(-E2BIG); | 437 | err = -E2BIG; |
| 395 | } else { | 438 | } else { |
| 396 | if (atomic_inc_return(&htab->count) > htab->map.max_entries) { | 439 | if (atomic_inc_return(&htab->count) > htab->map.max_entries) { |
| 397 | atomic_dec(&htab->count); | 440 | atomic_dec(&htab->count); |
| 398 | return ERR_PTR(-E2BIG); | 441 | err = -E2BIG; |
| 442 | } else { | ||
| 443 | l_new = kmalloc(htab->elem_size, | ||
| 444 | GFP_ATOMIC | __GFP_NOWARN); | ||
| 445 | if (!l_new) | ||
| 446 | return ERR_PTR(-ENOMEM); | ||
| 399 | } | 447 | } |
| 400 | l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); | 448 | } |
| 401 | if (!l_new) | 449 | |
| 402 | return ERR_PTR(-ENOMEM); | 450 | if (err) { |
| 451 | if (!old_elem_exists) | ||
| 452 | return ERR_PTR(err); | ||
| 453 | |||
| 454 | /* if we're updating the existing element and the hash table | ||
| 455 | * is full, use per-cpu extra elems | ||
| 456 | */ | ||
| 457 | l_new = this_cpu_ptr(htab->extra_elems); | ||
| 458 | if (l_new->state != HTAB_EXTRA_ELEM_FREE) | ||
| 459 | return ERR_PTR(-E2BIG); | ||
| 460 | l_new->state = HTAB_EXTRA_ELEM_USED; | ||
| 461 | } else { | ||
| 462 | l_new->state = HTAB_NOT_AN_EXTRA_ELEM; | ||
| 403 | } | 463 | } |
| 404 | 464 | ||
| 405 | memcpy(l_new->key, key, key_size); | 465 | memcpy(l_new->key, key, key_size); |
| @@ -489,7 +549,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, | |||
| 489 | if (ret) | 549 | if (ret) |
| 490 | goto err; | 550 | goto err; |
| 491 | 551 | ||
| 492 | l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false); | 552 | l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, |
| 553 | !!l_old); | ||
| 493 | if (IS_ERR(l_new)) { | 554 | if (IS_ERR(l_new)) { |
| 494 | /* all pre-allocated elements are in use or memory exhausted */ | 555 | /* all pre-allocated elements are in use or memory exhausted */ |
| 495 | ret = PTR_ERR(l_new); | 556 | ret = PTR_ERR(l_new); |
| @@ -563,7 +624,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, | |||
| 563 | } | 624 | } |
| 564 | } else { | 625 | } else { |
| 565 | l_new = alloc_htab_elem(htab, key, value, key_size, | 626 | l_new = alloc_htab_elem(htab, key, value, key_size, |
| 566 | hash, true, onallcpus); | 627 | hash, true, onallcpus, false); |
| 567 | if (IS_ERR(l_new)) { | 628 | if (IS_ERR(l_new)) { |
| 568 | ret = PTR_ERR(l_new); | 629 | ret = PTR_ERR(l_new); |
| 569 | goto err; | 630 | goto err; |
| @@ -652,6 +713,7 @@ static void htab_map_free(struct bpf_map *map) | |||
| 652 | htab_free_elems(htab); | 713 | htab_free_elems(htab); |
| 653 | pcpu_freelist_destroy(&htab->freelist); | 714 | pcpu_freelist_destroy(&htab->freelist); |
| 654 | } | 715 | } |
| 716 | free_percpu(htab->extra_elems); | ||
| 655 | kvfree(htab->buckets); | 717 | kvfree(htab->buckets); |
| 656 | kfree(htab); | 718 | kfree(htab); |
| 657 | } | 719 | } |
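The hashtab.c changes add one pre-allocated extra element per CPU so that updating an existing key still succeeds when the map is already at max_entries; the replaced element is freed right after the new one is linked in, so the element count does not grow. A simplified sketch of that allocation policy for the pre-allocated case, following the names used in the diff rather than reproducing the exact kernel code:

/* Simplified sketch of the element-allocation policy for a pre-allocated
 * hash map: when the freelist is empty the map is full, but an update
 * that replaces an existing key may borrow this CPU's extra element
 * instead of failing with -E2BIG, because the replaced element is freed
 * right after the new one is inserted.
 */
static struct htab_elem *pick_new_elem(struct bpf_htab *htab,
				       bool old_elem_exists)
{
	struct htab_elem *l_new;

	l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist);
	if (l_new) {
		l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
		return l_new;
	}

	if (!old_elem_exists)
		return ERR_PTR(-E2BIG);

	l_new = this_cpu_ptr(htab->extra_elems);
	if (l_new->state != HTAB_EXTRA_ELEM_FREE)
		return ERR_PTR(-E2BIG);
	l_new->state = HTAB_EXTRA_ELEM_USED;
	return l_new;
}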
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1ea3afba1a4f..39918402e6e9 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
| @@ -16,6 +16,7 @@ | |||
| 16 | #include <linux/ktime.h> | 16 | #include <linux/ktime.h> |
| 17 | #include <linux/sched.h> | 17 | #include <linux/sched.h> |
| 18 | #include <linux/uidgid.h> | 18 | #include <linux/uidgid.h> |
| 19 | #include <linux/filter.h> | ||
| 19 | 20 | ||
| 20 | /* If kernel subsystem is allowing eBPF programs to call this function, | 21 | /* If kernel subsystem is allowing eBPF programs to call this function, |
| 21 | * inside its own verifier_ops->get_func_proto() callback it should return | 22 | * inside its own verifier_ops->get_func_proto() callback it should return |
| @@ -26,48 +27,32 @@ | |||
| 26 | * if program is allowed to access maps, so check rcu_read_lock_held in | 27 | * if program is allowed to access maps, so check rcu_read_lock_held in |
| 27 | * all three functions. | 28 | * all three functions. |
| 28 | */ | 29 | */ |
| 29 | static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 30 | BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) |
| 30 | { | 31 | { |
| 31 | /* verifier checked that R1 contains a valid pointer to bpf_map | ||
| 32 | * and R2 points to a program stack and map->key_size bytes were | ||
| 33 | * initialized | ||
| 34 | */ | ||
| 35 | struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; | ||
| 36 | void *key = (void *) (unsigned long) r2; | ||
| 37 | void *value; | ||
| 38 | |||
| 39 | WARN_ON_ONCE(!rcu_read_lock_held()); | 32 | WARN_ON_ONCE(!rcu_read_lock_held()); |
| 40 | 33 | return (unsigned long) map->ops->map_lookup_elem(map, key); | |
| 41 | value = map->ops->map_lookup_elem(map, key); | ||
| 42 | |||
| 43 | /* lookup() returns either pointer to element value or NULL | ||
| 44 | * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type | ||
| 45 | */ | ||
| 46 | return (unsigned long) value; | ||
| 47 | } | 34 | } |
| 48 | 35 | ||
| 49 | const struct bpf_func_proto bpf_map_lookup_elem_proto = { | 36 | const struct bpf_func_proto bpf_map_lookup_elem_proto = { |
| 50 | .func = bpf_map_lookup_elem, | 37 | .func = bpf_map_lookup_elem, |
| 51 | .gpl_only = false, | 38 | .gpl_only = false, |
| 39 | .pkt_access = true, | ||
| 52 | .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, | 40 | .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, |
| 53 | .arg1_type = ARG_CONST_MAP_PTR, | 41 | .arg1_type = ARG_CONST_MAP_PTR, |
| 54 | .arg2_type = ARG_PTR_TO_MAP_KEY, | 42 | .arg2_type = ARG_PTR_TO_MAP_KEY, |
| 55 | }; | 43 | }; |
| 56 | 44 | ||
| 57 | static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 45 | BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, |
| 46 | void *, value, u64, flags) | ||
| 58 | { | 47 | { |
| 59 | struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; | ||
| 60 | void *key = (void *) (unsigned long) r2; | ||
| 61 | void *value = (void *) (unsigned long) r3; | ||
| 62 | |||
| 63 | WARN_ON_ONCE(!rcu_read_lock_held()); | 48 | WARN_ON_ONCE(!rcu_read_lock_held()); |
| 64 | 49 | return map->ops->map_update_elem(map, key, value, flags); | |
| 65 | return map->ops->map_update_elem(map, key, value, r4); | ||
| 66 | } | 50 | } |
| 67 | 51 | ||
| 68 | const struct bpf_func_proto bpf_map_update_elem_proto = { | 52 | const struct bpf_func_proto bpf_map_update_elem_proto = { |
| 69 | .func = bpf_map_update_elem, | 53 | .func = bpf_map_update_elem, |
| 70 | .gpl_only = false, | 54 | .gpl_only = false, |
| 55 | .pkt_access = true, | ||
| 71 | .ret_type = RET_INTEGER, | 56 | .ret_type = RET_INTEGER, |
| 72 | .arg1_type = ARG_CONST_MAP_PTR, | 57 | .arg1_type = ARG_CONST_MAP_PTR, |
| 73 | .arg2_type = ARG_PTR_TO_MAP_KEY, | 58 | .arg2_type = ARG_PTR_TO_MAP_KEY, |
| @@ -75,19 +60,16 @@ const struct bpf_func_proto bpf_map_update_elem_proto = { | |||
| 75 | .arg4_type = ARG_ANYTHING, | 60 | .arg4_type = ARG_ANYTHING, |
| 76 | }; | 61 | }; |
| 77 | 62 | ||
| 78 | static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 63 | BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) |
| 79 | { | 64 | { |
| 80 | struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; | ||
| 81 | void *key = (void *) (unsigned long) r2; | ||
| 82 | |||
| 83 | WARN_ON_ONCE(!rcu_read_lock_held()); | 65 | WARN_ON_ONCE(!rcu_read_lock_held()); |
| 84 | |||
| 85 | return map->ops->map_delete_elem(map, key); | 66 | return map->ops->map_delete_elem(map, key); |
| 86 | } | 67 | } |
| 87 | 68 | ||
| 88 | const struct bpf_func_proto bpf_map_delete_elem_proto = { | 69 | const struct bpf_func_proto bpf_map_delete_elem_proto = { |
| 89 | .func = bpf_map_delete_elem, | 70 | .func = bpf_map_delete_elem, |
| 90 | .gpl_only = false, | 71 | .gpl_only = false, |
| 72 | .pkt_access = true, | ||
| 91 | .ret_type = RET_INTEGER, | 73 | .ret_type = RET_INTEGER, |
| 92 | .arg1_type = ARG_CONST_MAP_PTR, | 74 | .arg1_type = ARG_CONST_MAP_PTR, |
| 93 | .arg2_type = ARG_PTR_TO_MAP_KEY, | 75 | .arg2_type = ARG_PTR_TO_MAP_KEY, |
| @@ -99,7 +81,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = { | |||
| 99 | .ret_type = RET_INTEGER, | 81 | .ret_type = RET_INTEGER, |
| 100 | }; | 82 | }; |
| 101 | 83 | ||
| 102 | static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 84 | BPF_CALL_0(bpf_get_smp_processor_id) |
| 103 | { | 85 | { |
| 104 | return smp_processor_id(); | 86 | return smp_processor_id(); |
| 105 | } | 87 | } |
| @@ -110,7 +92,7 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = { | |||
| 110 | .ret_type = RET_INTEGER, | 92 | .ret_type = RET_INTEGER, |
| 111 | }; | 93 | }; |
| 112 | 94 | ||
| 113 | static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 95 | BPF_CALL_0(bpf_ktime_get_ns) |
| 114 | { | 96 | { |
| 115 | /* NMI safe access to clock monotonic */ | 97 | /* NMI safe access to clock monotonic */ |
| 116 | return ktime_get_mono_fast_ns(); | 98 | return ktime_get_mono_fast_ns(); |
| @@ -122,11 +104,11 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto = { | |||
| 122 | .ret_type = RET_INTEGER, | 104 | .ret_type = RET_INTEGER, |
| 123 | }; | 105 | }; |
| 124 | 106 | ||
| 125 | static u64 bpf_get_current_pid_tgid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 107 | BPF_CALL_0(bpf_get_current_pid_tgid) |
| 126 | { | 108 | { |
| 127 | struct task_struct *task = current; | 109 | struct task_struct *task = current; |
| 128 | 110 | ||
| 129 | if (!task) | 111 | if (unlikely(!task)) |
| 130 | return -EINVAL; | 112 | return -EINVAL; |
| 131 | 113 | ||
| 132 | return (u64) task->tgid << 32 | task->pid; | 114 | return (u64) task->tgid << 32 | task->pid; |
| @@ -138,18 +120,18 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto = { | |||
| 138 | .ret_type = RET_INTEGER, | 120 | .ret_type = RET_INTEGER, |
| 139 | }; | 121 | }; |
| 140 | 122 | ||
| 141 | static u64 bpf_get_current_uid_gid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 123 | BPF_CALL_0(bpf_get_current_uid_gid) |
| 142 | { | 124 | { |
| 143 | struct task_struct *task = current; | 125 | struct task_struct *task = current; |
| 144 | kuid_t uid; | 126 | kuid_t uid; |
| 145 | kgid_t gid; | 127 | kgid_t gid; |
| 146 | 128 | ||
| 147 | if (!task) | 129 | if (unlikely(!task)) |
| 148 | return -EINVAL; | 130 | return -EINVAL; |
| 149 | 131 | ||
| 150 | current_uid_gid(&uid, &gid); | 132 | current_uid_gid(&uid, &gid); |
| 151 | return (u64) from_kgid(&init_user_ns, gid) << 32 | | 133 | return (u64) from_kgid(&init_user_ns, gid) << 32 | |
| 152 | from_kuid(&init_user_ns, uid); | 134 | from_kuid(&init_user_ns, uid); |
| 153 | } | 135 | } |
| 154 | 136 | ||
| 155 | const struct bpf_func_proto bpf_get_current_uid_gid_proto = { | 137 | const struct bpf_func_proto bpf_get_current_uid_gid_proto = { |
| @@ -158,10 +140,9 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto = { | |||
| 158 | .ret_type = RET_INTEGER, | 140 | .ret_type = RET_INTEGER, |
| 159 | }; | 141 | }; |
| 160 | 142 | ||
| 161 | static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5) | 143 | BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size) |
| 162 | { | 144 | { |
| 163 | struct task_struct *task = current; | 145 | struct task_struct *task = current; |
| 164 | char *buf = (char *) (long) r1; | ||
| 165 | 146 | ||
| 166 | if (unlikely(!task)) | 147 | if (unlikely(!task)) |
| 167 | goto err_clear; | 148 | goto err_clear; |
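The helpers.c rewrite converts the helpers from the raw u64 fn(u64 r1, ..., u64 r5) calling convention to the BPF_CALL_n() macros from <linux/filter.h>, which generate that wrapper and hand the body properly typed arguments. A sketch of how a helper is declared in this style (bpf_demo_lookup and its proto are made-up names; the body mirrors bpf_map_lookup_elem above):

/* Sketch: a two-argument helper declared with BPF_CALL_2().  The macro
 * expands to the u64 fn(u64, ..., u64) wrapper the interpreter and JITs
 * expect and casts the register values to the declared parameter types.
 */
BPF_CALL_2(bpf_demo_lookup, struct bpf_map *, map, void *, key)
{
	WARN_ON_ONCE(!rcu_read_lock_held());
	return (unsigned long) map->ops->map_lookup_elem(map, key);
}

static const struct bpf_func_proto bpf_demo_lookup_proto = {
	.func		= bpf_demo_lookup,
	.gpl_only	= false,
	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_PTR_TO_MAP_KEY,
};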
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 5967b870a895..1ed8473ec537 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
| @@ -97,7 +97,7 @@ static struct inode *bpf_get_inode(struct super_block *sb, | |||
| 97 | return ERR_PTR(-ENOSPC); | 97 | return ERR_PTR(-ENOSPC); |
| 98 | 98 | ||
| 99 | inode->i_ino = get_next_ino(); | 99 | inode->i_ino = get_next_ino(); |
| 100 | inode->i_atime = CURRENT_TIME; | 100 | inode->i_atime = current_time(inode); |
| 101 | inode->i_mtime = inode->i_atime; | 101 | inode->i_mtime = inode->i_atime; |
| 102 | inode->i_ctime = inode->i_atime; | 102 | inode->i_ctime = inode->i_atime; |
| 103 | 103 | ||
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index bf4495fcd25d..732ae16d12b7 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
| @@ -116,10 +116,9 @@ free_smap: | |||
| 116 | return ERR_PTR(err); | 116 | return ERR_PTR(err); |
| 117 | } | 117 | } |
| 118 | 118 | ||
| 119 | u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) | 119 | BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, |
| 120 | u64, flags) | ||
| 120 | { | 121 | { |
| 121 | struct pt_regs *regs = (struct pt_regs *) (long) r1; | ||
| 122 | struct bpf_map *map = (struct bpf_map *) (long) r2; | ||
| 123 | struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); | 122 | struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); |
| 124 | struct perf_callchain_entry *trace; | 123 | struct perf_callchain_entry *trace; |
| 125 | struct stack_map_bucket *bucket, *new_bucket, *old_bucket; | 124 | struct stack_map_bucket *bucket, *new_bucket, *old_bucket; |
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f72f23b8fdab..99a7e5b388f2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
| @@ -14,6 +14,7 @@ | |||
| 14 | #include <linux/types.h> | 14 | #include <linux/types.h> |
| 15 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
| 16 | #include <linux/bpf.h> | 16 | #include <linux/bpf.h> |
| 17 | #include <linux/bpf_verifier.h> | ||
| 17 | #include <linux/filter.h> | 18 | #include <linux/filter.h> |
| 18 | #include <net/netlink.h> | 19 | #include <net/netlink.h> |
| 19 | #include <linux/file.h> | 20 | #include <linux/file.h> |
| @@ -126,75 +127,16 @@ | |||
| 126 | * are set to NOT_INIT to indicate that they are no longer readable. | 127 | * are set to NOT_INIT to indicate that they are no longer readable. |
| 127 | */ | 128 | */ |
| 128 | 129 | ||
| 129 | struct reg_state { | ||
| 130 | enum bpf_reg_type type; | ||
| 131 | union { | ||
| 132 | /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */ | ||
| 133 | s64 imm; | ||
| 134 | |||
| 135 | /* valid when type == PTR_TO_PACKET* */ | ||
| 136 | struct { | ||
| 137 | u32 id; | ||
| 138 | u16 off; | ||
| 139 | u16 range; | ||
| 140 | }; | ||
| 141 | |||
| 142 | /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | | ||
| 143 | * PTR_TO_MAP_VALUE_OR_NULL | ||
| 144 | */ | ||
| 145 | struct bpf_map *map_ptr; | ||
| 146 | }; | ||
| 147 | }; | ||
| 148 | |||
| 149 | enum bpf_stack_slot_type { | ||
| 150 | STACK_INVALID, /* nothing was stored in this stack slot */ | ||
| 151 | STACK_SPILL, /* register spilled into stack */ | ||
| 152 | STACK_MISC /* BPF program wrote some data into this slot */ | ||
| 153 | }; | ||
| 154 | |||
| 155 | #define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ | ||
| 156 | |||
| 157 | /* state of the program: | ||
| 158 | * type of all registers and stack info | ||
| 159 | */ | ||
| 160 | struct verifier_state { | ||
| 161 | struct reg_state regs[MAX_BPF_REG]; | ||
| 162 | u8 stack_slot_type[MAX_BPF_STACK]; | ||
| 163 | struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE]; | ||
| 164 | }; | ||
| 165 | |||
| 166 | /* linked list of verifier states used to prune search */ | ||
| 167 | struct verifier_state_list { | ||
| 168 | struct verifier_state state; | ||
| 169 | struct verifier_state_list *next; | ||
| 170 | }; | ||
| 171 | |||
| 172 | /* verifier_state + insn_idx are pushed to stack when branch is encountered */ | 130 | /* verifier_state + insn_idx are pushed to stack when branch is encountered */ |
| 173 | struct verifier_stack_elem { | 131 | struct bpf_verifier_stack_elem { |
| 174 | /* verifer state is 'st' | 132 | /* verifer state is 'st' |
| 175 | * before processing instruction 'insn_idx' | 133 | * before processing instruction 'insn_idx' |
| 176 | * and after processing instruction 'prev_insn_idx' | 134 | * and after processing instruction 'prev_insn_idx' |
| 177 | */ | 135 | */ |
| 178 | struct verifier_state st; | 136 | struct bpf_verifier_state st; |
| 179 | int insn_idx; | 137 | int insn_idx; |
| 180 | int prev_insn_idx; | 138 | int prev_insn_idx; |
| 181 | struct verifier_stack_elem *next; | 139 | struct bpf_verifier_stack_elem *next; |
| 182 | }; | ||
| 183 | |||
| 184 | #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ | ||
| 185 | |||
| 186 | /* single container for all structs | ||
| 187 | * one verifier_env per bpf_check() call | ||
| 188 | */ | ||
| 189 | struct verifier_env { | ||
| 190 | struct bpf_prog *prog; /* eBPF program being verified */ | ||
| 191 | struct verifier_stack_elem *head; /* stack of verifier states to be processed */ | ||
| 192 | int stack_size; /* number of states to be processed */ | ||
| 193 | struct verifier_state cur_state; /* current verifier state */ | ||
| 194 | struct verifier_state_list **explored_states; /* search pruning optimization */ | ||
| 195 | struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ | ||
| 196 | u32 used_map_cnt; /* number of used maps */ | ||
| 197 | bool allow_ptr_leaks; | ||
| 198 | }; | 140 | }; |
| 199 | 141 | ||
| 200 | #define BPF_COMPLEXITY_LIMIT_INSNS 65536 | 142 | #define BPF_COMPLEXITY_LIMIT_INSNS 65536 |
| @@ -203,6 +145,7 @@ struct verifier_env { | |||
| 203 | struct bpf_call_arg_meta { | 145 | struct bpf_call_arg_meta { |
| 204 | struct bpf_map *map_ptr; | 146 | struct bpf_map *map_ptr; |
| 205 | bool raw_mode; | 147 | bool raw_mode; |
| 148 | bool pkt_access; | ||
| 206 | int regno; | 149 | int regno; |
| 207 | int access_size; | 150 | int access_size; |
| 208 | }; | 151 | }; |
| @@ -239,6 +182,7 @@ static const char * const reg_type_str[] = { | |||
| 239 | [CONST_PTR_TO_MAP] = "map_ptr", | 182 | [CONST_PTR_TO_MAP] = "map_ptr", |
| 240 | [PTR_TO_MAP_VALUE] = "map_value", | 183 | [PTR_TO_MAP_VALUE] = "map_value", |
| 241 | [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", | 184 | [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", |
| 185 | [PTR_TO_MAP_VALUE_ADJ] = "map_value_adj", | ||
| 242 | [FRAME_PTR] = "fp", | 186 | [FRAME_PTR] = "fp", |
| 243 | [PTR_TO_STACK] = "fp", | 187 | [PTR_TO_STACK] = "fp", |
| 244 | [CONST_IMM] = "imm", | 188 | [CONST_IMM] = "imm", |
| @@ -246,9 +190,9 @@ static const char * const reg_type_str[] = { | |||
| 246 | [PTR_TO_PACKET_END] = "pkt_end", | 190 | [PTR_TO_PACKET_END] = "pkt_end", |
| 247 | }; | 191 | }; |
| 248 | 192 | ||
| 249 | static void print_verifier_state(struct verifier_state *state) | 193 | static void print_verifier_state(struct bpf_verifier_state *state) |
| 250 | { | 194 | { |
| 251 | struct reg_state *reg; | 195 | struct bpf_reg_state *reg; |
| 252 | enum bpf_reg_type t; | 196 | enum bpf_reg_type t; |
| 253 | int i; | 197 | int i; |
| 254 | 198 | ||
| @@ -266,10 +210,17 @@ static void print_verifier_state(struct verifier_state *state) | |||
| 266 | else if (t == UNKNOWN_VALUE && reg->imm) | 210 | else if (t == UNKNOWN_VALUE && reg->imm) |
| 267 | verbose("%lld", reg->imm); | 211 | verbose("%lld", reg->imm); |
| 268 | else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || | 212 | else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || |
| 269 | t == PTR_TO_MAP_VALUE_OR_NULL) | 213 | t == PTR_TO_MAP_VALUE_OR_NULL || |
| 214 | t == PTR_TO_MAP_VALUE_ADJ) | ||
| 270 | verbose("(ks=%d,vs=%d)", | 215 | verbose("(ks=%d,vs=%d)", |
| 271 | reg->map_ptr->key_size, | 216 | reg->map_ptr->key_size, |
| 272 | reg->map_ptr->value_size); | 217 | reg->map_ptr->value_size); |
| 218 | if (reg->min_value != BPF_REGISTER_MIN_RANGE) | ||
| 219 | verbose(",min_value=%llu", | ||
| 220 | (unsigned long long)reg->min_value); | ||
| 221 | if (reg->max_value != BPF_REGISTER_MAX_RANGE) | ||
| 222 | verbose(",max_value=%llu", | ||
| 223 | (unsigned long long)reg->max_value); | ||
| 273 | } | 224 | } |
| 274 | for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { | 225 | for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { |
| 275 | if (state->stack_slot_type[i] == STACK_SPILL) | 226 | if (state->stack_slot_type[i] == STACK_SPILL) |
| @@ -424,9 +375,9 @@ static void print_bpf_insn(struct bpf_insn *insn) | |||
| 424 | } | 375 | } |
| 425 | } | 376 | } |
| 426 | 377 | ||
| 427 | static int pop_stack(struct verifier_env *env, int *prev_insn_idx) | 378 | static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx) |
| 428 | { | 379 | { |
| 429 | struct verifier_stack_elem *elem; | 380 | struct bpf_verifier_stack_elem *elem; |
| 430 | int insn_idx; | 381 | int insn_idx; |
| 431 | 382 | ||
| 432 | if (env->head == NULL) | 383 | if (env->head == NULL) |
| @@ -443,12 +394,12 @@ static int pop_stack(struct verifier_env *env, int *prev_insn_idx) | |||
| 443 | return insn_idx; | 394 | return insn_idx; |
| 444 | } | 395 | } |
| 445 | 396 | ||
| 446 | static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx, | 397 | static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, |
| 447 | int prev_insn_idx) | 398 | int insn_idx, int prev_insn_idx) |
| 448 | { | 399 | { |
| 449 | struct verifier_stack_elem *elem; | 400 | struct bpf_verifier_stack_elem *elem; |
| 450 | 401 | ||
| 451 | elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL); | 402 | elem = kmalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); |
| 452 | if (!elem) | 403 | if (!elem) |
| 453 | goto err; | 404 | goto err; |
| 454 | 405 | ||
| @@ -474,13 +425,15 @@ static const int caller_saved[CALLER_SAVED_REGS] = { | |||
| 474 | BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 | 425 | BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 |
| 475 | }; | 426 | }; |
| 476 | 427 | ||
| 477 | static void init_reg_state(struct reg_state *regs) | 428 | static void init_reg_state(struct bpf_reg_state *regs) |
| 478 | { | 429 | { |
| 479 | int i; | 430 | int i; |
| 480 | 431 | ||
| 481 | for (i = 0; i < MAX_BPF_REG; i++) { | 432 | for (i = 0; i < MAX_BPF_REG; i++) { |
| 482 | regs[i].type = NOT_INIT; | 433 | regs[i].type = NOT_INIT; |
| 483 | regs[i].imm = 0; | 434 | regs[i].imm = 0; |
| 435 | regs[i].min_value = BPF_REGISTER_MIN_RANGE; | ||
| 436 | regs[i].max_value = BPF_REGISTER_MAX_RANGE; | ||
| 484 | } | 437 | } |
| 485 | 438 | ||
| 486 | /* frame pointer */ | 439 | /* frame pointer */ |
| @@ -490,20 +443,26 @@ static void init_reg_state(struct reg_state *regs) | |||
| 490 | regs[BPF_REG_1].type = PTR_TO_CTX; | 443 | regs[BPF_REG_1].type = PTR_TO_CTX; |
| 491 | } | 444 | } |
| 492 | 445 | ||
| 493 | static void mark_reg_unknown_value(struct reg_state *regs, u32 regno) | 446 | static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) |
| 494 | { | 447 | { |
| 495 | BUG_ON(regno >= MAX_BPF_REG); | 448 | BUG_ON(regno >= MAX_BPF_REG); |
| 496 | regs[regno].type = UNKNOWN_VALUE; | 449 | regs[regno].type = UNKNOWN_VALUE; |
| 497 | regs[regno].imm = 0; | 450 | regs[regno].imm = 0; |
| 498 | } | 451 | } |
| 499 | 452 | ||
| 453 | static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) | ||
| 454 | { | ||
| 455 | regs[regno].min_value = BPF_REGISTER_MIN_RANGE; | ||
| 456 | regs[regno].max_value = BPF_REGISTER_MAX_RANGE; | ||
| 457 | } | ||
| 458 | |||
| 500 | enum reg_arg_type { | 459 | enum reg_arg_type { |
| 501 | SRC_OP, /* register is used as source operand */ | 460 | SRC_OP, /* register is used as source operand */ |
| 502 | DST_OP, /* register is used as destination operand */ | 461 | DST_OP, /* register is used as destination operand */ |
| 503 | DST_OP_NO_MARK /* same as above, check only, don't mark */ | 462 | DST_OP_NO_MARK /* same as above, check only, don't mark */ |
| 504 | }; | 463 | }; |
| 505 | 464 | ||
| 506 | static int check_reg_arg(struct reg_state *regs, u32 regno, | 465 | static int check_reg_arg(struct bpf_reg_state *regs, u32 regno, |
| 507 | enum reg_arg_type t) | 466 | enum reg_arg_type t) |
| 508 | { | 467 | { |
| 509 | if (regno >= MAX_BPF_REG) { | 468 | if (regno >= MAX_BPF_REG) { |
| @@ -563,8 +522,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) | |||
| 563 | /* check_stack_read/write functions track spill/fill of registers, | 522 | /* check_stack_read/write functions track spill/fill of registers, |
| 564 | * stack boundary and alignment are checked in check_mem_access() | 523 | * stack boundary and alignment are checked in check_mem_access() |
| 565 | */ | 524 | */ |
| 566 | static int check_stack_write(struct verifier_state *state, int off, int size, | 525 | static int check_stack_write(struct bpf_verifier_state *state, int off, |
| 567 | int value_regno) | 526 | int size, int value_regno) |
| 568 | { | 527 | { |
| 569 | int i; | 528 | int i; |
| 570 | /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, | 529 | /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, |
| @@ -589,7 +548,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size, | |||
| 589 | } else { | 548 | } else { |
| 590 | /* regular write of data into stack */ | 549 | /* regular write of data into stack */ |
| 591 | state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = | 550 | state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = |
| 592 | (struct reg_state) {}; | 551 | (struct bpf_reg_state) {}; |
| 593 | 552 | ||
| 594 | for (i = 0; i < size; i++) | 553 | for (i = 0; i < size; i++) |
| 595 | state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; | 554 | state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; |
| @@ -597,7 +556,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size, | |||
| 597 | return 0; | 556 | return 0; |
| 598 | } | 557 | } |
| 599 | 558 | ||
| 600 | static int check_stack_read(struct verifier_state *state, int off, int size, | 559 | static int check_stack_read(struct bpf_verifier_state *state, int off, int size, |
| 601 | int value_regno) | 560 | int value_regno) |
| 602 | { | 561 | { |
| 603 | u8 *slot_type; | 562 | u8 *slot_type; |
| @@ -638,7 +597,7 @@ static int check_stack_read(struct verifier_state *state, int off, int size, | |||
| 638 | } | 597 | } |
| 639 | 598 | ||
| 640 | /* check read/write into map element returned by bpf_map_lookup_elem() */ | 599 | /* check read/write into map element returned by bpf_map_lookup_elem() */ |
| 641 | static int check_map_access(struct verifier_env *env, u32 regno, int off, | 600 | static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, |
| 642 | int size) | 601 | int size) |
| 643 | { | 602 | { |
| 644 | struct bpf_map *map = env->cur_state.regs[regno].map_ptr; | 603 | struct bpf_map *map = env->cur_state.regs[regno].map_ptr; |
| @@ -653,24 +612,31 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off, | |||
| 653 | 612 | ||
| 654 | #define MAX_PACKET_OFF 0xffff | 613 | #define MAX_PACKET_OFF 0xffff |
| 655 | 614 | ||
| 656 | static bool may_write_pkt_data(enum bpf_prog_type type) | 615 | static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, |
| 616 | const struct bpf_call_arg_meta *meta) | ||
| 657 | { | 617 | { |
| 658 | switch (type) { | 618 | switch (env->prog->type) { |
| 619 | case BPF_PROG_TYPE_SCHED_CLS: | ||
| 620 | case BPF_PROG_TYPE_SCHED_ACT: | ||
| 659 | case BPF_PROG_TYPE_XDP: | 621 | case BPF_PROG_TYPE_XDP: |
| 622 | if (meta) | ||
| 623 | return meta->pkt_access; | ||
| 624 | |||
| 625 | env->seen_direct_write = true; | ||
| 660 | return true; | 626 | return true; |
| 661 | default: | 627 | default: |
| 662 | return false; | 628 | return false; |
| 663 | } | 629 | } |
| 664 | } | 630 | } |
| 665 | 631 | ||
| 666 | static int check_packet_access(struct verifier_env *env, u32 regno, int off, | 632 | static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, |
| 667 | int size) | 633 | int size) |
| 668 | { | 634 | { |
| 669 | struct reg_state *regs = env->cur_state.regs; | 635 | struct bpf_reg_state *regs = env->cur_state.regs; |
| 670 | struct reg_state *reg = ®s[regno]; | 636 | struct bpf_reg_state *reg = ®s[regno]; |
| 671 | 637 | ||
| 672 | off += reg->off; | 638 | off += reg->off; |
| 673 | if (off < 0 || off + size > reg->range) { | 639 | if (off < 0 || size <= 0 || off + size > reg->range) { |
| 674 | verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", | 640 | verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", |
| 675 | off, size, regno, reg->id, reg->off, reg->range); | 641 | off, size, regno, reg->id, reg->off, reg->range); |
| 676 | return -EACCES; | 642 | return -EACCES; |
| @@ -679,9 +645,13 @@ static int check_packet_access(struct verifier_env *env, u32 regno, int off, | |||
| 679 | } | 645 | } |
| 680 | 646 | ||
| 681 | /* check access to 'struct bpf_context' fields */ | 647 | /* check access to 'struct bpf_context' fields */ |
| 682 | static int check_ctx_access(struct verifier_env *env, int off, int size, | 648 | static int check_ctx_access(struct bpf_verifier_env *env, int off, int size, |
| 683 | enum bpf_access_type t, enum bpf_reg_type *reg_type) | 649 | enum bpf_access_type t, enum bpf_reg_type *reg_type) |
| 684 | { | 650 | { |
| 651 | /* for analyzer ctx accesses are already validated and converted */ | ||
| 652 | if (env->analyzer_ops) | ||
| 653 | return 0; | ||
| 654 | |||
| 685 | if (env->prog->aux->ops->is_valid_access && | 655 | if (env->prog->aux->ops->is_valid_access && |
| 686 | env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) { | 656 | env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) { |
| 687 | /* remember the offset of last byte accessed in ctx */ | 657 | /* remember the offset of last byte accessed in ctx */ |
| @@ -694,7 +664,7 @@ static int check_ctx_access(struct verifier_env *env, int off, int size, | |||
| 694 | return -EACCES; | 664 | return -EACCES; |
| 695 | } | 665 | } |
| 696 | 666 | ||
| 697 | static bool is_pointer_value(struct verifier_env *env, int regno) | 667 | static bool is_pointer_value(struct bpf_verifier_env *env, int regno) |
| 698 | { | 668 | { |
| 699 | if (env->allow_ptr_leaks) | 669 | if (env->allow_ptr_leaks) |
| 700 | return false; | 670 | return false; |
| @@ -708,28 +678,19 @@ static bool is_pointer_value(struct verifier_env *env, int regno) | |||
| 708 | } | 678 | } |
| 709 | } | 679 | } |
| 710 | 680 | ||
| 711 | static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, | 681 | static int check_ptr_alignment(struct bpf_verifier_env *env, |
| 712 | int off, int size) | 682 | struct bpf_reg_state *reg, int off, int size) |
| 713 | { | 683 | { |
| 714 | if (reg->type != PTR_TO_PACKET) { | 684 | if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_MAP_VALUE_ADJ) { |
| 715 | if (off % size != 0) { | 685 | if (off % size != 0) { |
| 716 | verbose("misaligned access off %d size %d\n", off, size); | 686 | verbose("misaligned access off %d size %d\n", |
| 687 | off, size); | ||
| 717 | return -EACCES; | 688 | return -EACCES; |
| 718 | } else { | 689 | } else { |
| 719 | return 0; | 690 | return 0; |
| 720 | } | 691 | } |
| 721 | } | 692 | } |
| 722 | 693 | ||
| 723 | switch (env->prog->type) { | ||
| 724 | case BPF_PROG_TYPE_SCHED_CLS: | ||
| 725 | case BPF_PROG_TYPE_SCHED_ACT: | ||
| 726 | case BPF_PROG_TYPE_XDP: | ||
| 727 | break; | ||
| 728 | default: | ||
| 729 | verbose("verifier is misconfigured\n"); | ||
| 730 | return -EACCES; | ||
| 731 | } | ||
| 732 | |||
| 733 | if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) | 694 | if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) |
| 734 | /* misaligned access to packet is ok on x86,arm,arm64 */ | 695 | /* misaligned access to packet is ok on x86,arm,arm64 */ |
| 735 | return 0; | 696 | return 0; |
| @@ -740,7 +701,8 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, | |||
| 740 | } | 701 | } |
| 741 | 702 | ||
| 742 | /* skb->data is NET_IP_ALIGN-ed */ | 703 | /* skb->data is NET_IP_ALIGN-ed */ |
| 743 | if ((NET_IP_ALIGN + reg->off + off) % size != 0) { | 704 | if (reg->type == PTR_TO_PACKET && |
| 705 | (NET_IP_ALIGN + reg->off + off) % size != 0) { | ||
| 744 | verbose("misaligned packet access off %d+%d+%d size %d\n", | 706 | verbose("misaligned packet access off %d+%d+%d size %d\n", |
| 745 | NET_IP_ALIGN, reg->off, off, size); | 707 | NET_IP_ALIGN, reg->off, off, size); |
| 746 | return -EACCES; | 708 | return -EACCES; |
| @@ -754,12 +716,12 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, | |||
| 754 | * if t==write && value_regno==-1, some unknown value is stored into memory | 716 | * if t==write && value_regno==-1, some unknown value is stored into memory |
| 755 | * if t==read && value_regno==-1, don't care what we read from memory | 717 | * if t==read && value_regno==-1, don't care what we read from memory |
| 756 | */ | 718 | */ |
| 757 | static int check_mem_access(struct verifier_env *env, u32 regno, int off, | 719 | static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, |
| 758 | int bpf_size, enum bpf_access_type t, | 720 | int bpf_size, enum bpf_access_type t, |
| 759 | int value_regno) | 721 | int value_regno) |
| 760 | { | 722 | { |
| 761 | struct verifier_state *state = &env->cur_state; | 723 | struct bpf_verifier_state *state = &env->cur_state; |
| 762 | struct reg_state *reg = &state->regs[regno]; | 724 | struct bpf_reg_state *reg = &state->regs[regno]; |
| 763 | int size, err = 0; | 725 | int size, err = 0; |
| 764 | 726 | ||
| 765 | if (reg->type == PTR_TO_STACK) | 727 | if (reg->type == PTR_TO_STACK) |
| @@ -773,12 +735,52 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, | |||
| 773 | if (err) | 735 | if (err) |
| 774 | return err; | 736 | return err; |
| 775 | 737 | ||
| 776 | if (reg->type == PTR_TO_MAP_VALUE) { | 738 | if (reg->type == PTR_TO_MAP_VALUE || |
| 739 | reg->type == PTR_TO_MAP_VALUE_ADJ) { | ||
| 777 | if (t == BPF_WRITE && value_regno >= 0 && | 740 | if (t == BPF_WRITE && value_regno >= 0 && |
| 778 | is_pointer_value(env, value_regno)) { | 741 | is_pointer_value(env, value_regno)) { |
| 779 | verbose("R%d leaks addr into map\n", value_regno); | 742 | verbose("R%d leaks addr into map\n", value_regno); |
| 780 | return -EACCES; | 743 | return -EACCES; |
| 781 | } | 744 | } |
| 745 | |||
| 746 | /* If we adjusted the register to this map value at all then we | ||
| 747 | * need to change off and size to min_value and max_value | ||
| 748 | * respectively to make sure our theoretical access will be | ||
| 749 | * safe. | ||
| 750 | */ | ||
| 751 | if (reg->type == PTR_TO_MAP_VALUE_ADJ) { | ||
| 752 | if (log_level) | ||
| 753 | print_verifier_state(state); | ||
| 754 | env->varlen_map_value_access = true; | ||
| 755 | /* The minimum value is only important with signed | ||
| 756 | * comparisons where we can't assume the floor of a | ||
| 757 | * value is 0. If we are using signed variables for our | ||
| 758 | * index'es we need to make sure that whatever we use | ||
| 759 | * will have a set floor within our range. | ||
| 760 | */ | ||
| 761 | if ((s64)reg->min_value < 0) { | ||
| 762 | verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", | ||
| 763 | regno); | ||
| 764 | return -EACCES; | ||
| 765 | } | ||
| 766 | err = check_map_access(env, regno, reg->min_value + off, | ||
| 767 | size); | ||
| 768 | if (err) { | ||
| 769 | verbose("R%d min value is outside of the array range\n", | ||
| 770 | regno); | ||
| 771 | return err; | ||
| 772 | } | ||
| 773 | |||
| 774 | /* If we haven't set a max value then we need to bail | ||
| 775 | * since we can't be sure we won't do bad things. | ||
| 776 | */ | ||
| 777 | if (reg->max_value == BPF_REGISTER_MAX_RANGE) { | ||
| 778 | verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", | ||
| 779 | regno); | ||
| 780 | return -EACCES; | ||
| 781 | } | ||
| 782 | off += reg->max_value; | ||
| 783 | } | ||
| 782 | err = check_map_access(env, regno, off, size); | 784 | err = check_map_access(env, regno, off, size); |
| 783 | if (!err && t == BPF_READ && value_regno >= 0) | 785 | if (!err && t == BPF_READ && value_regno >= 0) |
| 784 | mark_reg_unknown_value(state->regs, value_regno); | 786 | mark_reg_unknown_value(state->regs, value_regno); |
| @@ -794,9 +796,8 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, | |||
| 794 | err = check_ctx_access(env, off, size, t, ®_type); | 796 | err = check_ctx_access(env, off, size, t, ®_type); |
| 795 | if (!err && t == BPF_READ && value_regno >= 0) { | 797 | if (!err && t == BPF_READ && value_regno >= 0) { |
| 796 | mark_reg_unknown_value(state->regs, value_regno); | 798 | mark_reg_unknown_value(state->regs, value_regno); |
| 797 | if (env->allow_ptr_leaks) | 799 | /* note that reg.[id|off|range] == 0 */ |
| 798 | /* note that reg.[id|off|range] == 0 */ | 800 | state->regs[value_regno].type = reg_type; |
| 799 | state->regs[value_regno].type = reg_type; | ||
| 800 | } | 801 | } |
| 801 | 802 | ||
| 802 | } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { | 803 | } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { |
| @@ -816,7 +817,7 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, | |||
| 816 | err = check_stack_read(state, off, size, value_regno); | 817 | err = check_stack_read(state, off, size, value_regno); |
| 817 | } | 818 | } |
| 818 | } else if (state->regs[regno].type == PTR_TO_PACKET) { | 819 | } else if (state->regs[regno].type == PTR_TO_PACKET) { |
| 819 | if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) { | 820 | if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) { |
| 820 | verbose("cannot write into packet\n"); | 821 | verbose("cannot write into packet\n"); |
| 821 | return -EACCES; | 822 | return -EACCES; |
| 822 | } | 823 | } |
| @@ -845,9 +846,9 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, | |||
| 845 | return err; | 846 | return err; |
| 846 | } | 847 | } |
| 847 | 848 | ||
| 848 | static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) | 849 | static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn) |
| 849 | { | 850 | { |
| 850 | struct reg_state *regs = env->cur_state.regs; | 851 | struct bpf_reg_state *regs = env->cur_state.regs; |
| 851 | int err; | 852 | int err; |
| 852 | 853 | ||
| 853 | if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || | 854 | if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || |
| @@ -881,12 +882,12 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) | |||
| 881 | * bytes from that pointer, make sure that it's within stack boundary | 882 | * bytes from that pointer, make sure that it's within stack boundary |
| 882 | * and all elements of stack are initialized | 883 | * and all elements of stack are initialized |
| 883 | */ | 884 | */ |
| 884 | static int check_stack_boundary(struct verifier_env *env, int regno, | 885 | static int check_stack_boundary(struct bpf_verifier_env *env, int regno, |
| 885 | int access_size, bool zero_size_allowed, | 886 | int access_size, bool zero_size_allowed, |
| 886 | struct bpf_call_arg_meta *meta) | 887 | struct bpf_call_arg_meta *meta) |
| 887 | { | 888 | { |
| 888 | struct verifier_state *state = &env->cur_state; | 889 | struct bpf_verifier_state *state = &env->cur_state; |
| 889 | struct reg_state *regs = state->regs; | 890 | struct bpf_reg_state *regs = state->regs; |
| 890 | int off, i; | 891 | int off, i; |
| 891 | 892 | ||
| 892 | if (regs[regno].type != PTR_TO_STACK) { | 893 | if (regs[regno].type != PTR_TO_STACK) { |
| @@ -925,18 +926,18 @@ static int check_stack_boundary(struct verifier_env *env, int regno, | |||
| 925 | return 0; | 926 | return 0; |
| 926 | } | 927 | } |
| 927 | 928 | ||
| 928 | static int check_func_arg(struct verifier_env *env, u32 regno, | 929 | static int check_func_arg(struct bpf_verifier_env *env, u32 regno, |
| 929 | enum bpf_arg_type arg_type, | 930 | enum bpf_arg_type arg_type, |
| 930 | struct bpf_call_arg_meta *meta) | 931 | struct bpf_call_arg_meta *meta) |
| 931 | { | 932 | { |
| 932 | struct reg_state *reg = env->cur_state.regs + regno; | 933 | struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno]; |
| 933 | enum bpf_reg_type expected_type; | 934 | enum bpf_reg_type expected_type, type = reg->type; |
| 934 | int err = 0; | 935 | int err = 0; |
| 935 | 936 | ||
| 936 | if (arg_type == ARG_DONTCARE) | 937 | if (arg_type == ARG_DONTCARE) |
| 937 | return 0; | 938 | return 0; |
| 938 | 939 | ||
| 939 | if (reg->type == NOT_INIT) { | 940 | if (type == NOT_INIT) { |
| 940 | verbose("R%d !read_ok\n", regno); | 941 | verbose("R%d !read_ok\n", regno); |
| 941 | return -EACCES; | 942 | return -EACCES; |
| 942 | } | 943 | } |
| @@ -949,16 +950,29 @@ static int check_func_arg(struct verifier_env *env, u32 regno, | |||
| 949 | return 0; | 950 | return 0; |
| 950 | } | 951 | } |
| 951 | 952 | ||
| 953 | if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) { | ||
| 954 | verbose("helper access to the packet is not allowed\n"); | ||
| 955 | return -EACCES; | ||
| 956 | } | ||
| 957 | |||
| 952 | if (arg_type == ARG_PTR_TO_MAP_KEY || | 958 | if (arg_type == ARG_PTR_TO_MAP_KEY || |
| 953 | arg_type == ARG_PTR_TO_MAP_VALUE) { | 959 | arg_type == ARG_PTR_TO_MAP_VALUE) { |
| 954 | expected_type = PTR_TO_STACK; | 960 | expected_type = PTR_TO_STACK; |
| 961 | if (type != PTR_TO_PACKET && type != expected_type) | ||
| 962 | goto err_type; | ||
| 955 | } else if (arg_type == ARG_CONST_STACK_SIZE || | 963 | } else if (arg_type == ARG_CONST_STACK_SIZE || |
| 956 | arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { | 964 | arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { |
| 957 | expected_type = CONST_IMM; | 965 | expected_type = CONST_IMM; |
| 966 | if (type != expected_type) | ||
| 967 | goto err_type; | ||
| 958 | } else if (arg_type == ARG_CONST_MAP_PTR) { | 968 | } else if (arg_type == ARG_CONST_MAP_PTR) { |
| 959 | expected_type = CONST_PTR_TO_MAP; | 969 | expected_type = CONST_PTR_TO_MAP; |
| 970 | if (type != expected_type) | ||
| 971 | goto err_type; | ||
| 960 | } else if (arg_type == ARG_PTR_TO_CTX) { | 972 | } else if (arg_type == ARG_PTR_TO_CTX) { |
| 961 | expected_type = PTR_TO_CTX; | 973 | expected_type = PTR_TO_CTX; |
| 974 | if (type != expected_type) | ||
| 975 | goto err_type; | ||
| 962 | } else if (arg_type == ARG_PTR_TO_STACK || | 976 | } else if (arg_type == ARG_PTR_TO_STACK || |
| 963 | arg_type == ARG_PTR_TO_RAW_STACK) { | 977 | arg_type == ARG_PTR_TO_RAW_STACK) { |
| 964 | expected_type = PTR_TO_STACK; | 978 | expected_type = PTR_TO_STACK; |
| @@ -966,20 +980,16 @@ static int check_func_arg(struct verifier_env *env, u32 regno, | |||
| 966 | * passed in as argument, it's a CONST_IMM type. Final test | 980 | * passed in as argument, it's a CONST_IMM type. Final test |
| 967 | * happens during stack boundary checking. | 981 | * happens during stack boundary checking. |
| 968 | */ | 982 | */ |
| 969 | if (reg->type == CONST_IMM && reg->imm == 0) | 983 | if (type == CONST_IMM && reg->imm == 0) |
| 970 | expected_type = CONST_IMM; | 984 | /* final test in check_stack_boundary() */; |
| 985 | else if (type != PTR_TO_PACKET && type != expected_type) | ||
| 986 | goto err_type; | ||
| 971 | meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK; | 987 | meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK; |
| 972 | } else { | 988 | } else { |
| 973 | verbose("unsupported arg_type %d\n", arg_type); | 989 | verbose("unsupported arg_type %d\n", arg_type); |
| 974 | return -EFAULT; | 990 | return -EFAULT; |
| 975 | } | 991 | } |
| 976 | 992 | ||
| 977 | if (reg->type != expected_type) { | ||
| 978 | verbose("R%d type=%s expected=%s\n", regno, | ||
| 979 | reg_type_str[reg->type], reg_type_str[expected_type]); | ||
| 980 | return -EACCES; | ||
| 981 | } | ||
| 982 | |||
| 983 | if (arg_type == ARG_CONST_MAP_PTR) { | 993 | if (arg_type == ARG_CONST_MAP_PTR) { |
| 984 | /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ | 994 | /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ |
| 985 | meta->map_ptr = reg->map_ptr; | 995 | meta->map_ptr = reg->map_ptr; |
| @@ -997,8 +1007,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno, | |||
| 997 | verbose("invalid map_ptr to access map->key\n"); | 1007 | verbose("invalid map_ptr to access map->key\n"); |
| 998 | return -EACCES; | 1008 | return -EACCES; |
| 999 | } | 1009 | } |
| 1000 | err = check_stack_boundary(env, regno, meta->map_ptr->key_size, | 1010 | if (type == PTR_TO_PACKET) |
| 1001 | false, NULL); | 1011 | err = check_packet_access(env, regno, 0, |
| 1012 | meta->map_ptr->key_size); | ||
| 1013 | else | ||
| 1014 | err = check_stack_boundary(env, regno, | ||
| 1015 | meta->map_ptr->key_size, | ||
| 1016 | false, NULL); | ||
| 1002 | } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { | 1017 | } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { |
| 1003 | /* bpf_map_xxx(..., map_ptr, ..., value) call: | 1018 | /* bpf_map_xxx(..., map_ptr, ..., value) call: |
| 1004 | * check [value, value + map->value_size) validity | 1019 | * check [value, value + map->value_size) validity |
| @@ -1008,9 +1023,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno, | |||
| 1008 | verbose("invalid map_ptr to access map->value\n"); | 1023 | verbose("invalid map_ptr to access map->value\n"); |
| 1009 | return -EACCES; | 1024 | return -EACCES; |
| 1010 | } | 1025 | } |
| 1011 | err = check_stack_boundary(env, regno, | 1026 | if (type == PTR_TO_PACKET) |
| 1012 | meta->map_ptr->value_size, | 1027 | err = check_packet_access(env, regno, 0, |
| 1013 | false, NULL); | 1028 | meta->map_ptr->value_size); |
| 1029 | else | ||
| 1030 | err = check_stack_boundary(env, regno, | ||
| 1031 | meta->map_ptr->value_size, | ||
| 1032 | false, NULL); | ||
| 1014 | } else if (arg_type == ARG_CONST_STACK_SIZE || | 1033 | } else if (arg_type == ARG_CONST_STACK_SIZE || |
| 1015 | arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { | 1034 | arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { |
| 1016 | bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO); | 1035 | bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO); |
| @@ -1024,11 +1043,18 @@ static int check_func_arg(struct verifier_env *env, u32 regno, | |||
| 1024 | verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); | 1043 | verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); |
| 1025 | return -EACCES; | 1044 | return -EACCES; |
| 1026 | } | 1045 | } |
| 1027 | err = check_stack_boundary(env, regno - 1, reg->imm, | 1046 | if (regs[regno - 1].type == PTR_TO_PACKET) |
| 1028 | zero_size_allowed, meta); | 1047 | err = check_packet_access(env, regno - 1, 0, reg->imm); |
| 1048 | else | ||
| 1049 | err = check_stack_boundary(env, regno - 1, reg->imm, | ||
| 1050 | zero_size_allowed, meta); | ||
| 1029 | } | 1051 | } |
| 1030 | 1052 | ||
| 1031 | return err; | 1053 | return err; |
| 1054 | err_type: | ||
| 1055 | verbose("R%d type=%s expected=%s\n", regno, | ||
| 1056 | reg_type_str[type], reg_type_str[expected_type]); | ||
| 1057 | return -EACCES; | ||
| 1032 | } | 1058 | } |
| 1033 | 1059 | ||
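For illustration: the err_type rework together with the new PTR_TO_PACKET acceptance for map-key, map-value and stack arguments is what lets a helper take a verified packet pointer where a stack pointer used to be required. A minimal sketch, assuming a tc classifier with direct packet access, a map named flow_map with an 8-byte key, and that the lookup helper is flagged pkt_access as in this series (all of these names are illustrative, not from the patch):

    /* needs <linux/bpf.h>, <linux/pkt_cls.h> and the usual bpf_helpers.h */
    SEC("classifier")
    int pkt_key_lookup(struct __sk_buff *skb)
    {
        void *data     = (void *)(long)skb->data;
        void *data_end = (void *)(long)skb->data_end;
        __u64 *key = data + 14;                 /* skip the Ethernet header */
        long *value;

        /* this check gives the key pointer a PTR_TO_PACKET range of 8 bytes */
        if ((void *)(key + 1) > data_end)
            return TC_ACT_OK;

        /* key points into the packet, not the stack; routed through
         * check_packet_access() instead of check_stack_boundary() */
        value = bpf_map_lookup_elem(&flow_map, key);
        return value ? TC_ACT_SHOT : TC_ACT_OK;
    }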
| 1034 | static int check_map_func_compatibility(struct bpf_map *map, int func_id) | 1060 | static int check_map_func_compatibility(struct bpf_map *map, int func_id) |
| @@ -1052,7 +1078,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) | |||
| 1052 | goto error; | 1078 | goto error; |
| 1053 | break; | 1079 | break; |
| 1054 | case BPF_MAP_TYPE_CGROUP_ARRAY: | 1080 | case BPF_MAP_TYPE_CGROUP_ARRAY: |
| 1055 | if (func_id != BPF_FUNC_skb_in_cgroup) | 1081 | if (func_id != BPF_FUNC_skb_under_cgroup && |
| 1082 | func_id != BPF_FUNC_current_task_under_cgroup) | ||
| 1056 | goto error; | 1083 | goto error; |
| 1057 | break; | 1084 | break; |
| 1058 | default: | 1085 | default: |
| @@ -1074,7 +1101,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) | |||
| 1074 | if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) | 1101 | if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) |
| 1075 | goto error; | 1102 | goto error; |
| 1076 | break; | 1103 | break; |
| 1077 | case BPF_FUNC_skb_in_cgroup: | 1104 | case BPF_FUNC_current_task_under_cgroup: |
| 1105 | case BPF_FUNC_skb_under_cgroup: | ||
| 1078 | if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) | 1106 | if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) |
| 1079 | goto error; | 1107 | goto error; |
| 1080 | break; | 1108 | break; |
| @@ -1107,10 +1135,10 @@ static int check_raw_mode(const struct bpf_func_proto *fn) | |||
| 1107 | return count > 1 ? -EINVAL : 0; | 1135 | return count > 1 ? -EINVAL : 0; |
| 1108 | } | 1136 | } |
| 1109 | 1137 | ||
| 1110 | static void clear_all_pkt_pointers(struct verifier_env *env) | 1138 | static void clear_all_pkt_pointers(struct bpf_verifier_env *env) |
| 1111 | { | 1139 | { |
| 1112 | struct verifier_state *state = &env->cur_state; | 1140 | struct bpf_verifier_state *state = &env->cur_state; |
| 1113 | struct reg_state *regs = state->regs, *reg; | 1141 | struct bpf_reg_state *regs = state->regs, *reg; |
| 1114 | int i; | 1142 | int i; |
| 1115 | 1143 | ||
| 1116 | for (i = 0; i < MAX_BPF_REG; i++) | 1144 | for (i = 0; i < MAX_BPF_REG; i++) |
| @@ -1130,12 +1158,12 @@ static void clear_all_pkt_pointers(struct verifier_env *env) | |||
| 1130 | } | 1158 | } |
| 1131 | } | 1159 | } |
| 1132 | 1160 | ||
| 1133 | static int check_call(struct verifier_env *env, int func_id) | 1161 | static int check_call(struct bpf_verifier_env *env, int func_id) |
| 1134 | { | 1162 | { |
| 1135 | struct verifier_state *state = &env->cur_state; | 1163 | struct bpf_verifier_state *state = &env->cur_state; |
| 1136 | const struct bpf_func_proto *fn = NULL; | 1164 | const struct bpf_func_proto *fn = NULL; |
| 1137 | struct reg_state *regs = state->regs; | 1165 | struct bpf_reg_state *regs = state->regs; |
| 1138 | struct reg_state *reg; | 1166 | struct bpf_reg_state *reg; |
| 1139 | struct bpf_call_arg_meta meta; | 1167 | struct bpf_call_arg_meta meta; |
| 1140 | bool changes_data; | 1168 | bool changes_data; |
| 1141 | int i, err; | 1169 | int i, err; |
| @@ -1163,6 +1191,7 @@ static int check_call(struct verifier_env *env, int func_id) | |||
| 1163 | changes_data = bpf_helper_changes_skb_data(fn->func); | 1191 | changes_data = bpf_helper_changes_skb_data(fn->func); |
| 1164 | 1192 | ||
| 1165 | memset(&meta, 0, sizeof(meta)); | 1193 | memset(&meta, 0, sizeof(meta)); |
| 1194 | meta.pkt_access = fn->pkt_access; | ||
| 1166 | 1195 | ||
| 1167 | /* We only support one arg being in raw mode at the moment, which | 1196 | /* We only support one arg being in raw mode at the moment, which |
| 1168 | * is sufficient for the helper functions we have right now. | 1197 | * is sufficient for the helper functions we have right now. |
| @@ -1213,6 +1242,7 @@ static int check_call(struct verifier_env *env, int func_id) | |||
| 1213 | regs[BPF_REG_0].type = NOT_INIT; | 1242 | regs[BPF_REG_0].type = NOT_INIT; |
| 1214 | } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { | 1243 | } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { |
| 1215 | regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; | 1244 | regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; |
| 1245 | regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0; | ||
| 1216 | /* remember map_ptr, so that check_map_access() | 1246 | /* remember map_ptr, so that check_map_access() |
| 1217 | * can check 'value_size' boundary of memory access | 1247 | * can check 'value_size' boundary of memory access |
| 1218 | * to map element returned from bpf_map_lookup_elem() | 1248 | * to map element returned from bpf_map_lookup_elem() |
| @@ -1237,12 +1267,13 @@ static int check_call(struct verifier_env *env, int func_id) | |||
| 1237 | return 0; | 1267 | return 0; |
| 1238 | } | 1268 | } |
| 1239 | 1269 | ||
| 1240 | static int check_packet_ptr_add(struct verifier_env *env, struct bpf_insn *insn) | 1270 | static int check_packet_ptr_add(struct bpf_verifier_env *env, |
| 1271 | struct bpf_insn *insn) | ||
| 1241 | { | 1272 | { |
| 1242 | struct reg_state *regs = env->cur_state.regs; | 1273 | struct bpf_reg_state *regs = env->cur_state.regs; |
| 1243 | struct reg_state *dst_reg = &regs[insn->dst_reg]; | 1274 | struct bpf_reg_state *dst_reg = &regs[insn->dst_reg]; |
| 1244 | struct reg_state *src_reg = &regs[insn->src_reg]; | 1275 | struct bpf_reg_state *src_reg = &regs[insn->src_reg]; |
| 1245 | struct reg_state tmp_reg; | 1276 | struct bpf_reg_state tmp_reg; |
| 1246 | s32 imm; | 1277 | s32 imm; |
| 1247 | 1278 | ||
| 1248 | if (BPF_SRC(insn->code) == BPF_K) { | 1279 | if (BPF_SRC(insn->code) == BPF_K) { |
| @@ -1301,7 +1332,7 @@ add_imm: | |||
| 1301 | /* dst_reg stays as pkt_ptr type and since some positive | 1332 | /* dst_reg stays as pkt_ptr type and since some positive |
| 1302 | * integer value was added to the pointer, increment its 'id' | 1333 | * integer value was added to the pointer, increment its 'id' |
| 1303 | */ | 1334 | */ |
| 1304 | dst_reg->id++; | 1335 | dst_reg->id = ++env->id_gen; |
| 1305 | 1336 | ||
| 1306 | /* something was added to pkt_ptr, set range and off to zero */ | 1337 | /* something was added to pkt_ptr, set range and off to zero */ |
| 1307 | dst_reg->off = 0; | 1338 | dst_reg->off = 0; |
| @@ -1310,10 +1341,10 @@ add_imm: | |||
| 1310 | return 0; | 1341 | return 0; |
| 1311 | } | 1342 | } |
| 1312 | 1343 | ||
| 1313 | static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) | 1344 | static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn) |
| 1314 | { | 1345 | { |
| 1315 | struct reg_state *regs = env->cur_state.regs; | 1346 | struct bpf_reg_state *regs = env->cur_state.regs; |
| 1316 | struct reg_state *dst_reg = &regs[insn->dst_reg]; | 1347 | struct bpf_reg_state *dst_reg = &regs[insn->dst_reg]; |
| 1317 | u8 opcode = BPF_OP(insn->code); | 1348 | u8 opcode = BPF_OP(insn->code); |
| 1318 | s64 imm_log2; | 1349 | s64 imm_log2; |
| 1319 | 1350 | ||
| @@ -1323,7 +1354,7 @@ static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) | |||
| 1323 | */ | 1354 | */ |
| 1324 | 1355 | ||
| 1325 | if (BPF_SRC(insn->code) == BPF_X) { | 1356 | if (BPF_SRC(insn->code) == BPF_X) { |
| 1326 | struct reg_state *src_reg = &regs[insn->src_reg]; | 1357 | struct bpf_reg_state *src_reg = &regs[insn->src_reg]; |
| 1327 | 1358 | ||
| 1328 | if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 && | 1359 | if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 && |
| 1329 | dst_reg->imm && opcode == BPF_ADD) { | 1360 | dst_reg->imm && opcode == BPF_ADD) { |
| @@ -1412,11 +1443,12 @@ static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) | |||
| 1412 | return 0; | 1443 | return 0; |
| 1413 | } | 1444 | } |
| 1414 | 1445 | ||
| 1415 | static int evaluate_reg_imm_alu(struct verifier_env *env, struct bpf_insn *insn) | 1446 | static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, |
| 1447 | struct bpf_insn *insn) | ||
| 1416 | { | 1448 | { |
| 1417 | struct reg_state *regs = env->cur_state.regs; | 1449 | struct bpf_reg_state *regs = env->cur_state.regs; |
| 1418 | struct reg_state *dst_reg = &regs[insn->dst_reg]; | 1450 | struct bpf_reg_state *dst_reg = &regs[insn->dst_reg]; |
| 1419 | struct reg_state *src_reg = &regs[insn->src_reg]; | 1451 | struct bpf_reg_state *src_reg = &regs[insn->src_reg]; |
| 1420 | u8 opcode = BPF_OP(insn->code); | 1452 | u8 opcode = BPF_OP(insn->code); |
| 1421 | 1453 | ||
| 1422 | /* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn. | 1454 | /* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn. |
| @@ -1432,10 +1464,110 @@ static int evaluate_reg_imm_alu(struct verifier_env *env, struct bpf_insn *insn) | |||
| 1432 | return 0; | 1464 | return 0; |
| 1433 | } | 1465 | } |
| 1434 | 1466 | ||
| 1467 | static void check_reg_overflow(struct bpf_reg_state *reg) | ||
| 1468 | { | ||
| 1469 | if (reg->max_value > BPF_REGISTER_MAX_RANGE) | ||
| 1470 | reg->max_value = BPF_REGISTER_MAX_RANGE; | ||
| 1471 | if ((s64)reg->min_value < BPF_REGISTER_MIN_RANGE) | ||
| 1472 | reg->min_value = BPF_REGISTER_MIN_RANGE; | ||
| 1473 | } | ||
| 1474 | |||
| 1475 | static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, | ||
| 1476 | struct bpf_insn *insn) | ||
| 1477 | { | ||
| 1478 | struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; | ||
| 1479 | u64 min_val = BPF_REGISTER_MIN_RANGE, max_val = BPF_REGISTER_MAX_RANGE; | ||
| 1480 | bool min_set = false, max_set = false; | ||
| 1481 | u8 opcode = BPF_OP(insn->code); | ||
| 1482 | |||
| 1483 | dst_reg = &regs[insn->dst_reg]; | ||
| 1484 | if (BPF_SRC(insn->code) == BPF_X) { | ||
| 1485 | check_reg_overflow(&regs[insn->src_reg]); | ||
| 1486 | min_val = regs[insn->src_reg].min_value; | ||
| 1487 | max_val = regs[insn->src_reg].max_value; | ||
| 1488 | |||
| 1489 | /* If the source register is a random pointer then the | ||
| 1490 | * min_value/max_value values represent the range of the known | ||
| 1491 | * accesses into that value, not the actual min/max value of the | ||
| 1492 | * register itself. In this case we have to reset the reg range | ||
| 1493 | * values so we know it is not safe to look at. | ||
| 1494 | */ | ||
| 1495 | if (regs[insn->src_reg].type != CONST_IMM && | ||
| 1496 | regs[insn->src_reg].type != UNKNOWN_VALUE) { | ||
| 1497 | min_val = BPF_REGISTER_MIN_RANGE; | ||
| 1498 | max_val = BPF_REGISTER_MAX_RANGE; | ||
| 1499 | } | ||
| 1500 | } else if (insn->imm < BPF_REGISTER_MAX_RANGE && | ||
| 1501 | (s64)insn->imm > BPF_REGISTER_MIN_RANGE) { | ||
| 1502 | min_val = max_val = insn->imm; | ||
| 1503 | min_set = max_set = true; | ||
| 1504 | } | ||
| 1505 | |||
| 1506 | /* We don't know anything about what was done to this register, mark it | ||
| 1507 | * as unknown. | ||
| 1508 | */ | ||
| 1509 | if (min_val == BPF_REGISTER_MIN_RANGE && | ||
| 1510 | max_val == BPF_REGISTER_MAX_RANGE) { | ||
| 1511 | reset_reg_range_values(regs, insn->dst_reg); | ||
| 1512 | return; | ||
| 1513 | } | ||
| 1514 | |||
| 1515 | switch (opcode) { | ||
| 1516 | case BPF_ADD: | ||
| 1517 | dst_reg->min_value += min_val; | ||
| 1518 | dst_reg->max_value += max_val; | ||
| 1519 | break; | ||
| 1520 | case BPF_SUB: | ||
| 1521 | dst_reg->min_value -= min_val; | ||
| 1522 | dst_reg->max_value -= max_val; | ||
| 1523 | break; | ||
| 1524 | case BPF_MUL: | ||
| 1525 | dst_reg->min_value *= min_val; | ||
| 1526 | dst_reg->max_value *= max_val; | ||
| 1527 | break; | ||
| 1528 | case BPF_AND: | ||
| 1529 | /* & is special since it could end up with 0 bits set. */ | ||
| 1530 | dst_reg->min_value &= min_val; | ||
| 1531 | dst_reg->max_value = max_val; | ||
| 1532 | break; | ||
| 1533 | case BPF_LSH: | ||
| 1534 | /* Gotta have special overflow logic here, if we're shifting | ||
| 1535 | * more than MAX_RANGE then just assume we have an invalid | ||
| 1536 | * range. | ||
| 1537 | */ | ||
| 1538 | if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) | ||
| 1539 | dst_reg->min_value = BPF_REGISTER_MIN_RANGE; | ||
| 1540 | else | ||
| 1541 | dst_reg->min_value <<= min_val; | ||
| 1542 | |||
| 1543 | if (max_val > ilog2(BPF_REGISTER_MAX_RANGE)) | ||
| 1544 | dst_reg->max_value = BPF_REGISTER_MAX_RANGE; | ||
| 1545 | else | ||
| 1546 | dst_reg->max_value <<= max_val; | ||
| 1547 | break; | ||
| 1548 | case BPF_RSH: | ||
| 1549 | dst_reg->min_value >>= min_val; | ||
| 1550 | dst_reg->max_value >>= max_val; | ||
| 1551 | break; | ||
| 1552 | case BPF_MOD: | ||
| 1553 | /* % is special since it is an unsigned modulus, so the floor | ||
| 1554 | * will always be 0. | ||
| 1555 | */ | ||
| 1556 | dst_reg->min_value = 0; | ||
| 1557 | dst_reg->max_value = max_val - 1; | ||
| 1558 | break; | ||
| 1559 | default: | ||
| 1560 | reset_reg_range_values(regs, insn->dst_reg); | ||
| 1561 | break; | ||
| 1562 | } | ||
| 1563 | |||
| 1564 | check_reg_overflow(dst_reg); | ||
| 1565 | } | ||
| 1566 | |||
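To make the bookkeeping above concrete, here is how the tracked bounds evolve over a short pseudo-instruction sequence; the register names and the assumed range of r2 are purely illustrative:

    r1 = 16;        /* CONST_IMM:                 min = 16, max = 16  */
    r1 += r2;       /* r2 known to be in [0, 64]: min = 16, max = 80  */
    r1 <<= 2;       /* BPF_LSH by constant 2:     min = 64, max = 320 */
    r1 %= 100;      /* BPF_MOD (unsigned):        min = 0,  max = 99  */

A source operand that is neither CONST_IMM nor UNKNOWN_VALUE, or an opcode not handled in the switch, resets the destination's range via reset_reg_range_values().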
| 1435 | /* check validity of 32-bit and 64-bit arithmetic operations */ | 1567 | /* check validity of 32-bit and 64-bit arithmetic operations */ |
| 1436 | static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) | 1568 | static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) |
| 1437 | { | 1569 | { |
| 1438 | struct reg_state *regs = env->cur_state.regs, *dst_reg; | 1570 | struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; |
| 1439 | u8 opcode = BPF_OP(insn->code); | 1571 | u8 opcode = BPF_OP(insn->code); |
| 1440 | int err; | 1572 | int err; |
| 1441 | 1573 | ||
| @@ -1495,6 +1627,11 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) | |||
| 1495 | if (err) | 1627 | if (err) |
| 1496 | return err; | 1628 | return err; |
| 1497 | 1629 | ||
| 1630 | /* we are setting our register to something new, we need to | ||
| 1631 | * reset its range values. | ||
| 1632 | */ | ||
| 1633 | reset_reg_range_values(regs, insn->dst_reg); | ||
| 1634 | |||
| 1498 | if (BPF_SRC(insn->code) == BPF_X) { | 1635 | if (BPF_SRC(insn->code) == BPF_X) { |
| 1499 | if (BPF_CLASS(insn->code) == BPF_ALU64) { | 1636 | if (BPF_CLASS(insn->code) == BPF_ALU64) { |
| 1500 | /* case: R1 = R2 | 1637 | /* case: R1 = R2 |
| @@ -1516,6 +1653,8 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) | |||
| 1516 | */ | 1653 | */ |
| 1517 | regs[insn->dst_reg].type = CONST_IMM; | 1654 | regs[insn->dst_reg].type = CONST_IMM; |
| 1518 | regs[insn->dst_reg].imm = insn->imm; | 1655 | regs[insn->dst_reg].imm = insn->imm; |
| 1656 | regs[insn->dst_reg].max_value = insn->imm; | ||
| 1657 | regs[insn->dst_reg].min_value = insn->imm; | ||
| 1519 | } | 1658 | } |
| 1520 | 1659 | ||
| 1521 | } else if (opcode > BPF_END) { | 1660 | } else if (opcode > BPF_END) { |
| @@ -1568,6 +1707,9 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) | |||
| 1568 | 1707 | ||
| 1569 | dst_reg = ®s[insn->dst_reg]; | 1708 | dst_reg = ®s[insn->dst_reg]; |
| 1570 | 1709 | ||
| 1710 | /* first we want to adjust our ranges. */ | ||
| 1711 | adjust_reg_min_max_vals(env, insn); | ||
| 1712 | |||
| 1571 | /* pattern match 'bpf_add Rx, imm' instruction */ | 1713 | /* pattern match 'bpf_add Rx, imm' instruction */ |
| 1572 | if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && | 1714 | if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && |
| 1573 | dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) { | 1715 | dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) { |
| @@ -1602,28 +1744,58 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) | |||
| 1602 | return -EACCES; | 1744 | return -EACCES; |
| 1603 | } | 1745 | } |
| 1604 | 1746 | ||
| 1605 | /* mark dest operand */ | 1747 | /* If we did pointer math on a map value then just set it to our |
| 1606 | mark_reg_unknown_value(regs, insn->dst_reg); | 1748 | * PTR_TO_MAP_VALUE_ADJ type so we can deal with any stores or |
| 1749 | * loads to this register appropriately, otherwise just mark the | ||
| 1750 | * register as unknown. | ||
| 1751 | */ | ||
| 1752 | if (env->allow_ptr_leaks && | ||
| 1753 | (dst_reg->type == PTR_TO_MAP_VALUE || | ||
| 1754 | dst_reg->type == PTR_TO_MAP_VALUE_ADJ)) | ||
| 1755 | dst_reg->type = PTR_TO_MAP_VALUE_ADJ; | ||
| 1756 | else | ||
| 1757 | mark_reg_unknown_value(regs, insn->dst_reg); | ||
| 1607 | } | 1758 | } |
| 1608 | 1759 | ||
| 1609 | return 0; | 1760 | return 0; |
| 1610 | } | 1761 | } |
| 1611 | 1762 | ||
| 1612 | static void find_good_pkt_pointers(struct verifier_env *env, | 1763 | static void find_good_pkt_pointers(struct bpf_verifier_state *state, |
| 1613 | struct reg_state *dst_reg) | 1764 | struct bpf_reg_state *dst_reg) |
| 1614 | { | 1765 | { |
| 1615 | struct verifier_state *state = &env->cur_state; | 1766 | struct bpf_reg_state *regs = state->regs, *reg; |
| 1616 | struct reg_state *regs = state->regs, *reg; | ||
| 1617 | int i; | 1767 | int i; |
| 1618 | /* r2 = r3; | 1768 | |
| 1619 | * r2 += 8 | 1769 | /* LLVM can generate two kind of checks: |
| 1620 | * if (r2 > pkt_end) goto somewhere | 1770 | * |
| 1621 | * r2 == dst_reg, pkt_end == src_reg, | 1771 | * Type 1: |
| 1622 | * r2=pkt(id=n,off=8,r=0) | 1772 | * |
| 1623 | * r3=pkt(id=n,off=0,r=0) | 1773 | * r2 = r3; |
| 1624 | * find register r3 and mark its range as r3=pkt(id=n,off=0,r=8) | 1774 | * r2 += 8; |
| 1625 | * so that range of bytes [r3, r3 + 8) is safe to access | 1775 | * if (r2 > pkt_end) goto <handle exception> |
| 1776 | * <access okay> | ||
| 1777 | * | ||
| 1778 | * Where: | ||
| 1779 | * r2 == dst_reg, pkt_end == src_reg | ||
| 1780 | * r2=pkt(id=n,off=8,r=0) | ||
| 1781 | * r3=pkt(id=n,off=0,r=0) | ||
| 1782 | * | ||
| 1783 | * Type 2: | ||
| 1784 | * | ||
| 1785 | * r2 = r3; | ||
| 1786 | * r2 += 8; | ||
| 1787 | * if (pkt_end >= r2) goto <access okay> | ||
| 1788 | * <handle exception> | ||
| 1789 | * | ||
| 1790 | * Where: | ||
| 1791 | * pkt_end == dst_reg, r2 == src_reg | ||
| 1792 | * r2=pkt(id=n,off=8,r=0) | ||
| 1793 | * r3=pkt(id=n,off=0,r=0) | ||
| 1794 | * | ||
| 1795 | * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8) | ||
| 1796 | * so that range of bytes [r3, r3 + 8) is safe to access. | ||
| 1626 | */ | 1797 | */ |
| 1798 | |||
| 1627 | for (i = 0; i < MAX_BPF_REG; i++) | 1799 | for (i = 0; i < MAX_BPF_REG; i++) |
| 1628 | if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) | 1800 | if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) |
| 1629 | regs[i].range = dst_reg->off; | 1801 | regs[i].range = dst_reg->off; |
| @@ -1637,11 +1809,109 @@ static void find_good_pkt_pointers(struct verifier_env *env, | |||
| 1637 | } | 1809 | } |
| 1638 | } | 1810 | } |
| 1639 | 1811 | ||
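In restricted C the two shapes normally come from one and the same source-level bounds check; roughly, inside a sched_cls program body like the earlier sketch (Ethernet header only, error handling trimmed):

    void *data     = (void *)(long)skb->data;
    void *data_end = (void *)(long)skb->data_end;
    struct ethhdr *eth = data;

    if (data + sizeof(*eth) > data_end)   /* LLVM emits this as Type 1 or Type 2 */
        return TC_ACT_OK;

    /* [data, data + 14) is now known to be in bounds, so this load is accepted */
    __u16 proto = eth->h_proto;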
| 1640 | static int check_cond_jmp_op(struct verifier_env *env, | 1812 | /* Adjusts the register min/max values in the case that the dst_reg is the |
| 1813 | * variable register that we are working on, and src_reg is a constant or we're | ||
| 1814 | * simply doing a BPF_K check. | ||
| 1815 | */ | ||
| 1816 | static void reg_set_min_max(struct bpf_reg_state *true_reg, | ||
| 1817 | struct bpf_reg_state *false_reg, u64 val, | ||
| 1818 | u8 opcode) | ||
| 1819 | { | ||
| 1820 | switch (opcode) { | ||
| 1821 | case BPF_JEQ: | ||
| 1822 | /* If this is false then we know nothing Jon Snow, but if it is | ||
| 1823 | * true then we know for sure. | ||
| 1824 | */ | ||
| 1825 | true_reg->max_value = true_reg->min_value = val; | ||
| 1826 | break; | ||
| 1827 | case BPF_JNE: | ||
| 1828 | /* If this is true we know nothing Jon Snow, but if it is false | ||
| 1829 | * we know the value for sure; | ||
| 1830 | */ | ||
| 1831 | false_reg->max_value = false_reg->min_value = val; | ||
| 1832 | break; | ||
| 1833 | case BPF_JGT: | ||
| 1834 | /* Unsigned comparison, the minimum value is 0. */ | ||
| 1835 | false_reg->min_value = 0; | ||
| 1836 | case BPF_JSGT: | ||
| 1837 | /* If this is false then we know the maximum val is val, | ||
| 1838 | * otherwise we know the min val is val+1. | ||
| 1839 | */ | ||
| 1840 | false_reg->max_value = val; | ||
| 1841 | true_reg->min_value = val + 1; | ||
| 1842 | break; | ||
| 1843 | case BPF_JGE: | ||
| 1844 | /* Unsigned comparison, the minimum value is 0. */ | ||
| 1845 | false_reg->min_value = 0; | ||
| 1846 | case BPF_JSGE: | ||
| 1847 | /* If this is false then we know the maximum value is val - 1, | ||
| 1848 | * otherwise we know the mimimum value is val. | ||
| 1849 | */ | ||
| 1850 | false_reg->max_value = val - 1; | ||
| 1851 | true_reg->min_value = val; | ||
| 1852 | break; | ||
| 1853 | default: | ||
| 1854 | break; | ||
| 1855 | } | ||
| 1856 | |||
| 1857 | check_reg_overflow(false_reg); | ||
| 1858 | check_reg_overflow(true_reg); | ||
| 1859 | } | ||
| 1860 | |||
| 1861 | /* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg | ||
| 1862 | * is the variable reg. | ||
| 1863 | */ | ||
| 1864 | static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, | ||
| 1865 | struct bpf_reg_state *false_reg, u64 val, | ||
| 1866 | u8 opcode) | ||
| 1867 | { | ||
| 1868 | switch (opcode) { | ||
| 1869 | case BPF_JEQ: | ||
| 1870 | /* If this is false then we know nothing Jon Snow, but if it is | ||
| 1871 | * true then we know for sure. | ||
| 1872 | */ | ||
| 1873 | true_reg->max_value = true_reg->min_value = val; | ||
| 1874 | break; | ||
| 1875 | case BPF_JNE: | ||
| 1876 | /* If this is true we know nothing Jon Snow, but if it is false | ||
| 1877 | * we know the value for sure; | ||
| 1878 | */ | ||
| 1879 | false_reg->max_value = false_reg->min_value = val; | ||
| 1880 | break; | ||
| 1881 | case BPF_JGT: | ||
| 1882 | /* Unsigned comparison, the minimum value is 0. */ | ||
| 1883 | true_reg->min_value = 0; | ||
| 1884 | case BPF_JSGT: | ||
| 1885 | /* | ||
| 1886 | * If this is false, then the val is <= the register, if it is | ||
| 1887 | * true the register <= to the val. | ||
| 1888 | */ | ||
| 1889 | false_reg->min_value = val; | ||
| 1890 | true_reg->max_value = val - 1; | ||
| 1891 | break; | ||
| 1892 | case BPF_JGE: | ||
| 1893 | /* Unsigned comparison, the minimum value is 0. */ | ||
| 1894 | true_reg->min_value = 0; | ||
| 1895 | case BPF_JSGE: | ||
| 1896 | /* If this is false then constant < register, if it is true then | ||
| 1897 | * the register < constant. | ||
| 1898 | */ | ||
| 1899 | false_reg->min_value = val + 1; | ||
| 1900 | true_reg->max_value = val; | ||
| 1901 | break; | ||
| 1902 | default: | ||
| 1903 | break; | ||
| 1904 | } | ||
| 1905 | |||
| 1906 | check_reg_overflow(false_reg); | ||
| 1907 | check_reg_overflow(true_reg); | ||
| 1908 | } | ||
| 1909 | |||
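As a worked case for the two helpers above: a branch like `if (r1 > 42) goto Lbig;`, with r1 as the variable register, calls reg_set_min_max() with val = 42 and opcode = BPF_JGT and splits the bounds as follows (register name arbitrary):

    /* taken branch (r1 >  42): true_reg->min_value  = 43                  */
    /* fallthrough  (r1 <= 42): false_reg->max_value = 42 and, because the */
    /*                          compare is unsigned,  false_reg->min_value = 0 */

reg_set_min_max_inv() covers the mirrored `if (42 > r1)` form, where the constant sits in dst_reg and r1 is the source register.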
| 1910 | static int check_cond_jmp_op(struct bpf_verifier_env *env, | ||
| 1641 | struct bpf_insn *insn, int *insn_idx) | 1911 | struct bpf_insn *insn, int *insn_idx) |
| 1642 | { | 1912 | { |
| 1643 | struct reg_state *regs = env->cur_state.regs, *dst_reg; | 1913 | struct bpf_verifier_state *other_branch, *this_branch = &env->cur_state; |
| 1644 | struct verifier_state *other_branch; | 1914 | struct bpf_reg_state *regs = this_branch->regs, *dst_reg; |
| 1645 | u8 opcode = BPF_OP(insn->code); | 1915 | u8 opcode = BPF_OP(insn->code); |
| 1646 | int err; | 1916 | int err; |
| 1647 | 1917 | ||
| @@ -1703,7 +1973,24 @@ static int check_cond_jmp_op(struct verifier_env *env, | |||
| 1703 | if (!other_branch) | 1973 | if (!other_branch) |
| 1704 | return -EFAULT; | 1974 | return -EFAULT; |
| 1705 | 1975 | ||
| 1706 | /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */ | 1976 | /* detect if we are comparing against a constant value so we can adjust |
| 1977 | * our min/max values for our dst register. | ||
| 1978 | */ | ||
| 1979 | if (BPF_SRC(insn->code) == BPF_X) { | ||
| 1980 | if (regs[insn->src_reg].type == CONST_IMM) | ||
| 1981 | reg_set_min_max(&other_branch->regs[insn->dst_reg], | ||
| 1982 | dst_reg, regs[insn->src_reg].imm, | ||
| 1983 | opcode); | ||
| 1984 | else if (dst_reg->type == CONST_IMM) | ||
| 1985 | reg_set_min_max_inv(&other_branch->regs[insn->src_reg], | ||
| 1986 | &regs[insn->src_reg], dst_reg->imm, | ||
| 1987 | opcode); | ||
| 1988 | } else { | ||
| 1989 | reg_set_min_max(&other_branch->regs[insn->dst_reg], | ||
| 1990 | dst_reg, insn->imm, opcode); | ||
| 1991 | } | ||
| 1992 | |||
| 1993 | /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */ | ||
| 1707 | if (BPF_SRC(insn->code) == BPF_K && | 1994 | if (BPF_SRC(insn->code) == BPF_K && |
| 1708 | insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && | 1995 | insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && |
| 1709 | dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { | 1996 | dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { |
| @@ -1722,13 +2009,17 @@ static int check_cond_jmp_op(struct verifier_env *env, | |||
| 1722 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && | 2009 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && |
| 1723 | dst_reg->type == PTR_TO_PACKET && | 2010 | dst_reg->type == PTR_TO_PACKET && |
| 1724 | regs[insn->src_reg].type == PTR_TO_PACKET_END) { | 2011 | regs[insn->src_reg].type == PTR_TO_PACKET_END) { |
| 1725 | find_good_pkt_pointers(env, dst_reg); | 2012 | find_good_pkt_pointers(this_branch, dst_reg); |
| 2013 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && | ||
| 2014 | dst_reg->type == PTR_TO_PACKET_END && | ||
| 2015 | regs[insn->src_reg].type == PTR_TO_PACKET) { | ||
| 2016 | find_good_pkt_pointers(other_branch, &regs[insn->src_reg]); | ||
| 1726 | } else if (is_pointer_value(env, insn->dst_reg)) { | 2017 | } else if (is_pointer_value(env, insn->dst_reg)) { |
| 1727 | verbose("R%d pointer comparison prohibited\n", insn->dst_reg); | 2018 | verbose("R%d pointer comparison prohibited\n", insn->dst_reg); |
| 1728 | return -EACCES; | 2019 | return -EACCES; |
| 1729 | } | 2020 | } |
| 1730 | if (log_level) | 2021 | if (log_level) |
| 1731 | print_verifier_state(&env->cur_state); | 2022 | print_verifier_state(this_branch); |
| 1732 | return 0; | 2023 | return 0; |
| 1733 | } | 2024 | } |
| 1734 | 2025 | ||
| @@ -1741,9 +2032,9 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) | |||
| 1741 | } | 2032 | } |
| 1742 | 2033 | ||
| 1743 | /* verify BPF_LD_IMM64 instruction */ | 2034 | /* verify BPF_LD_IMM64 instruction */ |
| 1744 | static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) | 2035 | static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) |
| 1745 | { | 2036 | { |
| 1746 | struct reg_state *regs = env->cur_state.regs; | 2037 | struct bpf_reg_state *regs = env->cur_state.regs; |
| 1747 | int err; | 2038 | int err; |
| 1748 | 2039 | ||
| 1749 | if (BPF_SIZE(insn->code) != BPF_DW) { | 2040 | if (BPF_SIZE(insn->code) != BPF_DW) { |
| @@ -1759,9 +2050,19 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) | |||
| 1759 | if (err) | 2050 | if (err) |
| 1760 | return err; | 2051 | return err; |
| 1761 | 2052 | ||
| 1762 | if (insn->src_reg == 0) | 2053 | if (insn->src_reg == 0) { |
| 1763 | /* generic move 64-bit immediate into a register */ | 2054 | /* generic move 64-bit immediate into a register, |
| 2055 | * only analyzer needs to collect the ld_imm value. | ||
| 2056 | */ | ||
| 2057 | u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; | ||
| 2058 | |||
| 2059 | if (!env->analyzer_ops) | ||
| 2060 | return 0; | ||
| 2061 | |||
| 2062 | regs[insn->dst_reg].type = CONST_IMM; | ||
| 2063 | regs[insn->dst_reg].imm = imm; | ||
| 1764 | return 0; | 2064 | return 0; |
| 2065 | } | ||
| 1765 | 2066 | ||
| 1766 | /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ | 2067 | /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ |
| 1767 | BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); | 2068 | BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); |
| @@ -1798,11 +2099,11 @@ static bool may_access_skb(enum bpf_prog_type type) | |||
| 1798 | * Output: | 2099 | * Output: |
| 1799 | * R0 - 8/16/32-bit skb data converted to cpu endianness | 2100 | * R0 - 8/16/32-bit skb data converted to cpu endianness |
| 1800 | */ | 2101 | */ |
| 1801 | static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) | 2102 | static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) |
| 1802 | { | 2103 | { |
| 1803 | struct reg_state *regs = env->cur_state.regs; | 2104 | struct bpf_reg_state *regs = env->cur_state.regs; |
| 1804 | u8 mode = BPF_MODE(insn->code); | 2105 | u8 mode = BPF_MODE(insn->code); |
| 1805 | struct reg_state *reg; | 2106 | struct bpf_reg_state *reg; |
| 1806 | int i, err; | 2107 | int i, err; |
| 1807 | 2108 | ||
| 1808 | if (!may_access_skb(env->prog->type)) { | 2109 | if (!may_access_skb(env->prog->type)) { |
| @@ -1888,7 +2189,7 @@ enum { | |||
| 1888 | BRANCH = 2, | 2189 | BRANCH = 2, |
| 1889 | }; | 2190 | }; |
| 1890 | 2191 | ||
| 1891 | #define STATE_LIST_MARK ((struct verifier_state_list *) -1L) | 2192 | #define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L) |
| 1892 | 2193 | ||
| 1893 | static int *insn_stack; /* stack of insns to process */ | 2194 | static int *insn_stack; /* stack of insns to process */ |
| 1894 | static int cur_stack; /* current stack index */ | 2195 | static int cur_stack; /* current stack index */ |
| @@ -1899,7 +2200,7 @@ static int *insn_state; | |||
| 1899 | * w - next instruction | 2200 | * w - next instruction |
| 1900 | * e - edge | 2201 | * e - edge |
| 1901 | */ | 2202 | */ |
| 1902 | static int push_insn(int t, int w, int e, struct verifier_env *env) | 2203 | static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) |
| 1903 | { | 2204 | { |
| 1904 | if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) | 2205 | if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) |
| 1905 | return 0; | 2206 | return 0; |
| @@ -1940,7 +2241,7 @@ static int push_insn(int t, int w, int e, struct verifier_env *env) | |||
| 1940 | /* non-recursive depth-first-search to detect loops in BPF program | 2241 | /* non-recursive depth-first-search to detect loops in BPF program |
| 1941 | * loop == back-edge in directed graph | 2242 | * loop == back-edge in directed graph |
| 1942 | */ | 2243 | */ |
| 1943 | static int check_cfg(struct verifier_env *env) | 2244 | static int check_cfg(struct bpf_verifier_env *env) |
| 1944 | { | 2245 | { |
| 1945 | struct bpf_insn *insns = env->prog->insnsi; | 2246 | struct bpf_insn *insns = env->prog->insnsi; |
| 1946 | int insn_cnt = env->prog->len; | 2247 | int insn_cnt = env->prog->len; |
| @@ -2049,7 +2350,8 @@ err_free: | |||
| 2049 | /* the following conditions reduce the number of explored insns | 2350 | /* the following conditions reduce the number of explored insns |
| 2050 | * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet | 2351 | * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet |
| 2051 | */ | 2352 | */ |
| 2052 | static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur) | 2353 | static bool compare_ptrs_to_packet(struct bpf_reg_state *old, |
| 2354 | struct bpf_reg_state *cur) | ||
| 2053 | { | 2355 | { |
| 2054 | if (old->id != cur->id) | 2356 | if (old->id != cur->id) |
| 2055 | return false; | 2357 | return false; |
| @@ -2124,9 +2426,11 @@ static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur) | |||
| 2124 | * whereas register type in current state is meaningful, it means that | 2426 | * whereas register type in current state is meaningful, it means that |
| 2125 | * the current state will reach 'bpf_exit' instruction safely | 2427 | * the current state will reach 'bpf_exit' instruction safely |
| 2126 | */ | 2428 | */ |
| 2127 | static bool states_equal(struct verifier_state *old, struct verifier_state *cur) | 2429 | static bool states_equal(struct bpf_verifier_env *env, |
| 2430 | struct bpf_verifier_state *old, | ||
| 2431 | struct bpf_verifier_state *cur) | ||
| 2128 | { | 2432 | { |
| 2129 | struct reg_state *rold, *rcur; | 2433 | struct bpf_reg_state *rold, *rcur; |
| 2130 | int i; | 2434 | int i; |
| 2131 | 2435 | ||
| 2132 | for (i = 0; i < MAX_BPF_REG; i++) { | 2436 | for (i = 0; i < MAX_BPF_REG; i++) { |
| @@ -2136,6 +2440,13 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) | |||
| 2136 | if (memcmp(rold, rcur, sizeof(*rold)) == 0) | 2440 | if (memcmp(rold, rcur, sizeof(*rold)) == 0) |
| 2137 | continue; | 2441 | continue; |
| 2138 | 2442 | ||
| 2443 | /* If the ranges were not the same, but everything else was and | ||
| 2444 | * we didn't do a variable access into a map then we are a-ok. | ||
| 2445 | */ | ||
| 2446 | if (!env->varlen_map_value_access && | ||
| 2447 | rold->type == rcur->type && rold->imm == rcur->imm) | ||
| 2448 | continue; | ||
| 2449 | |||
| 2139 | if (rold->type == NOT_INIT || | 2450 | if (rold->type == NOT_INIT || |
| 2140 | (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT)) | 2451 | (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT)) |
| 2141 | continue; | 2452 | continue; |
| @@ -2166,9 +2477,9 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) | |||
| 2166 | * the same, check that stored pointers types | 2477 | * the same, check that stored pointers types |
| 2167 | * are the same as well. | 2478 | * are the same as well. |
| 2168 | * Ex: explored safe path could have stored | 2479 | * Ex: explored safe path could have stored |
| 2169 | * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8} | 2480 | * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -8} |
| 2170 | * but current path has stored: | 2481 | * but current path has stored: |
| 2171 | * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16} | 2482 | * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -16} |
| 2172 | * such verifier states are not equivalent. | 2483 | * such verifier states are not equivalent. |
| 2173 | * return false to continue verification of this path | 2484 | * return false to continue verification of this path |
| 2174 | */ | 2485 | */ |
| @@ -2179,10 +2490,10 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) | |||
| 2179 | return true; | 2490 | return true; |
| 2180 | } | 2491 | } |
| 2181 | 2492 | ||
| 2182 | static int is_state_visited(struct verifier_env *env, int insn_idx) | 2493 | static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) |
| 2183 | { | 2494 | { |
| 2184 | struct verifier_state_list *new_sl; | 2495 | struct bpf_verifier_state_list *new_sl; |
| 2185 | struct verifier_state_list *sl; | 2496 | struct bpf_verifier_state_list *sl; |
| 2186 | 2497 | ||
| 2187 | sl = env->explored_states[insn_idx]; | 2498 | sl = env->explored_states[insn_idx]; |
| 2188 | if (!sl) | 2499 | if (!sl) |
| @@ -2192,7 +2503,7 @@ static int is_state_visited(struct verifier_env *env, int insn_idx) | |||
| 2192 | return 0; | 2503 | return 0; |
| 2193 | 2504 | ||
| 2194 | while (sl != STATE_LIST_MARK) { | 2505 | while (sl != STATE_LIST_MARK) { |
| 2195 | if (states_equal(&sl->state, &env->cur_state)) | 2506 | if (states_equal(env, &sl->state, &env->cur_state)) |
| 2196 | /* reached equivalent register/stack state, | 2507 | /* reached equivalent register/stack state, |
| 2197 | * prune the search | 2508 | * prune the search |
| 2198 | */ | 2509 | */ |
| @@ -2206,7 +2517,7 @@ static int is_state_visited(struct verifier_env *env, int insn_idx) | |||
| 2206 | * it will be rejected. Since there are no loops, we won't be | 2517 | * it will be rejected. Since there are no loops, we won't be |
| 2207 | * seeing this 'insn_idx' instruction again on the way to bpf_exit | 2518 | * seeing this 'insn_idx' instruction again on the way to bpf_exit |
| 2208 | */ | 2519 | */ |
| 2209 | new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER); | 2520 | new_sl = kmalloc(sizeof(struct bpf_verifier_state_list), GFP_USER); |
| 2210 | if (!new_sl) | 2521 | if (!new_sl) |
| 2211 | return -ENOMEM; | 2522 | return -ENOMEM; |
| 2212 | 2523 | ||
| @@ -2217,11 +2528,20 @@ static int is_state_visited(struct verifier_env *env, int insn_idx) | |||
| 2217 | return 0; | 2528 | return 0; |
| 2218 | } | 2529 | } |
| 2219 | 2530 | ||
| 2220 | static int do_check(struct verifier_env *env) | 2531 | static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, |
| 2532 | int insn_idx, int prev_insn_idx) | ||
| 2533 | { | ||
| 2534 | if (!env->analyzer_ops || !env->analyzer_ops->insn_hook) | ||
| 2535 | return 0; | ||
| 2536 | |||
| 2537 | return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx); | ||
| 2538 | } | ||
| 2539 | |||
| 2540 | static int do_check(struct bpf_verifier_env *env) | ||
| 2221 | { | 2541 | { |
| 2222 | struct verifier_state *state = &env->cur_state; | 2542 | struct bpf_verifier_state *state = &env->cur_state; |
| 2223 | struct bpf_insn *insns = env->prog->insnsi; | 2543 | struct bpf_insn *insns = env->prog->insnsi; |
| 2224 | struct reg_state *regs = state->regs; | 2544 | struct bpf_reg_state *regs = state->regs; |
| 2225 | int insn_cnt = env->prog->len; | 2545 | int insn_cnt = env->prog->len; |
| 2226 | int insn_idx, prev_insn_idx = 0; | 2546 | int insn_idx, prev_insn_idx = 0; |
| 2227 | int insn_processed = 0; | 2547 | int insn_processed = 0; |
| @@ -2229,6 +2549,7 @@ static int do_check(struct verifier_env *env) | |||
| 2229 | 2549 | ||
| 2230 | init_reg_state(regs); | 2550 | init_reg_state(regs); |
| 2231 | insn_idx = 0; | 2551 | insn_idx = 0; |
| 2552 | env->varlen_map_value_access = false; | ||
| 2232 | for (;;) { | 2553 | for (;;) { |
| 2233 | struct bpf_insn *insn; | 2554 | struct bpf_insn *insn; |
| 2234 | u8 class; | 2555 | u8 class; |
| @@ -2275,13 +2596,17 @@ static int do_check(struct verifier_env *env) | |||
| 2275 | print_bpf_insn(insn); | 2596 | print_bpf_insn(insn); |
| 2276 | } | 2597 | } |
| 2277 | 2598 | ||
| 2599 | err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); | ||
| 2600 | if (err) | ||
| 2601 | return err; | ||
| 2602 | |||
| 2278 | if (class == BPF_ALU || class == BPF_ALU64) { | 2603 | if (class == BPF_ALU || class == BPF_ALU64) { |
| 2279 | err = check_alu_op(env, insn); | 2604 | err = check_alu_op(env, insn); |
| 2280 | if (err) | 2605 | if (err) |
| 2281 | return err; | 2606 | return err; |
| 2282 | 2607 | ||
| 2283 | } else if (class == BPF_LDX) { | 2608 | } else if (class == BPF_LDX) { |
| 2284 | enum bpf_reg_type src_reg_type; | 2609 | enum bpf_reg_type *prev_src_type, src_reg_type; |
| 2285 | 2610 | ||
| 2286 | /* check for reserved fields is already done */ | 2611 | /* check for reserved fields is already done */ |
| 2287 | 2612 | ||
| @@ -2305,21 +2630,25 @@ static int do_check(struct verifier_env *env) | |||
| 2305 | if (err) | 2630 | if (err) |
| 2306 | return err; | 2631 | return err; |
| 2307 | 2632 | ||
| 2308 | if (BPF_SIZE(insn->code) != BPF_W) { | 2633 | reset_reg_range_values(regs, insn->dst_reg); |
| 2634 | if (BPF_SIZE(insn->code) != BPF_W && | ||
| 2635 | BPF_SIZE(insn->code) != BPF_DW) { | ||
| 2309 | insn_idx++; | 2636 | insn_idx++; |
| 2310 | continue; | 2637 | continue; |
| 2311 | } | 2638 | } |
| 2312 | 2639 | ||
| 2313 | if (insn->imm == 0) { | 2640 | prev_src_type = &env->insn_aux_data[insn_idx].ptr_type; |
| 2641 | |||
| 2642 | if (*prev_src_type == NOT_INIT) { | ||
| 2314 | /* saw a valid insn | 2643 | /* saw a valid insn |
| 2315 | * dst_reg = *(u32 *)(src_reg + off) | 2644 | * dst_reg = *(u32 *)(src_reg + off) |
| 2316 | * use reserved 'imm' field to mark this insn | 2645 | * save type to validate intersecting paths |
| 2317 | */ | 2646 | */ |
| 2318 | insn->imm = src_reg_type; | 2647 | *prev_src_type = src_reg_type; |
| 2319 | 2648 | ||
| 2320 | } else if (src_reg_type != insn->imm && | 2649 | } else if (src_reg_type != *prev_src_type && |
| 2321 | (src_reg_type == PTR_TO_CTX || | 2650 | (src_reg_type == PTR_TO_CTX || |
| 2322 | insn->imm == PTR_TO_CTX)) { | 2651 | *prev_src_type == PTR_TO_CTX)) { |
| 2323 | /* ABuser program is trying to use the same insn | 2652 | /* ABuser program is trying to use the same insn |
| 2324 | * dst_reg = *(u32*) (src_reg + off) | 2653 | * dst_reg = *(u32*) (src_reg + off) |
| 2325 | * with different pointer types: | 2654 | * with different pointer types: |
| @@ -2332,7 +2661,7 @@ static int do_check(struct verifier_env *env) | |||
| 2332 | } | 2661 | } |
| 2333 | 2662 | ||
| 2334 | } else if (class == BPF_STX) { | 2663 | } else if (class == BPF_STX) { |
| 2335 | enum bpf_reg_type dst_reg_type; | 2664 | enum bpf_reg_type *prev_dst_type, dst_reg_type; |
| 2336 | 2665 | ||
| 2337 | if (BPF_MODE(insn->code) == BPF_XADD) { | 2666 | if (BPF_MODE(insn->code) == BPF_XADD) { |
| 2338 | err = check_xadd(env, insn); | 2667 | err = check_xadd(env, insn); |
| @@ -2360,11 +2689,13 @@ static int do_check(struct verifier_env *env) | |||
| 2360 | if (err) | 2689 | if (err) |
| 2361 | return err; | 2690 | return err; |
| 2362 | 2691 | ||
| 2363 | if (insn->imm == 0) { | 2692 | prev_dst_type = &env->insn_aux_data[insn_idx].ptr_type; |
| 2364 | insn->imm = dst_reg_type; | 2693 | |
| 2365 | } else if (dst_reg_type != insn->imm && | 2694 | if (*prev_dst_type == NOT_INIT) { |
| 2695 | *prev_dst_type = dst_reg_type; | ||
| 2696 | } else if (dst_reg_type != *prev_dst_type && | ||
| 2366 | (dst_reg_type == PTR_TO_CTX || | 2697 | (dst_reg_type == PTR_TO_CTX || |
| 2367 | insn->imm == PTR_TO_CTX)) { | 2698 | *prev_dst_type == PTR_TO_CTX)) { |
| 2368 | verbose("same insn cannot be used with different pointers\n"); | 2699 | verbose("same insn cannot be used with different pointers\n"); |
| 2369 | return -EINVAL; | 2700 | return -EINVAL; |
| 2370 | } | 2701 | } |
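The prev_src_type/prev_dst_type pointers above index the new per-instruction side table, so the verifier no longer has to stash its marker in the reserved imm field of the load/store. Reduced to its use here (a sketch; the real definition presumably lives alongside the other bpf_verifier_* structures in a shared header), the aux data is just:

    struct bpf_insn_aux_data {
        enum bpf_reg_type ptr_type;   /* pointer type seen by this load/store */
    };

    /* one entry per instruction, allocated in bpf_check(); see the
     * vzalloc(sizeof(struct bpf_insn_aux_data) * prog->len) further down */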
| @@ -2470,6 +2801,7 @@ process_bpf_exit: | |||
| 2470 | verbose("invalid BPF_LD mode\n"); | 2801 | verbose("invalid BPF_LD mode\n"); |
| 2471 | return -EINVAL; | 2802 | return -EINVAL; |
| 2472 | } | 2803 | } |
| 2804 | reset_reg_range_values(regs, insn->dst_reg); | ||
| 2473 | } else { | 2805 | } else { |
| 2474 | verbose("unknown insn class %d\n", class); | 2806 | verbose("unknown insn class %d\n", class); |
| 2475 | return -EINVAL; | 2807 | return -EINVAL; |
| @@ -2482,14 +2814,28 @@ process_bpf_exit: | |||
| 2482 | return 0; | 2814 | return 0; |
| 2483 | } | 2815 | } |
| 2484 | 2816 | ||
| 2817 | static int check_map_prog_compatibility(struct bpf_map *map, | ||
| 2818 | struct bpf_prog *prog) | ||
| 2819 | |||
| 2820 | { | ||
| 2821 | if (prog->type == BPF_PROG_TYPE_PERF_EVENT && | ||
| 2822 | (map->map_type == BPF_MAP_TYPE_HASH || | ||
| 2823 | map->map_type == BPF_MAP_TYPE_PERCPU_HASH) && | ||
| 2824 | (map->map_flags & BPF_F_NO_PREALLOC)) { | ||
| 2825 | verbose("perf_event programs can only use preallocated hash map\n"); | ||
| 2826 | return -EINVAL; | ||
| 2827 | } | ||
| 2828 | return 0; | ||
| 2829 | } | ||
| 2830 | |||
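Concretely, the new compatibility check refuses to load, for a BPF_PROG_TYPE_PERF_EVENT program, a map definition along these lines (sketched in the samples/bpf style; the map name and sizes are made up):

    struct bpf_map_def SEC("maps") lat_hash = {
        .type        = BPF_MAP_TYPE_HASH,
        .key_size    = sizeof(__u64),
        .value_size  = sizeof(__u64),
        .max_entries = 4096,
        .map_flags   = BPF_F_NO_PREALLOC,   /* rejected for perf_event programs */
    };

Dropping BPF_F_NO_PREALLOC (i.e. using a preallocated hash map) satisfies the check.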
| 2485 | /* look for pseudo eBPF instructions that access map FDs and | 2831 | /* look for pseudo eBPF instructions that access map FDs and |
| 2486 | * replace them with actual map pointers | 2832 | * replace them with actual map pointers |
| 2487 | */ | 2833 | */ |
| 2488 | static int replace_map_fd_with_map_ptr(struct verifier_env *env) | 2834 | static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) |
| 2489 | { | 2835 | { |
| 2490 | struct bpf_insn *insn = env->prog->insnsi; | 2836 | struct bpf_insn *insn = env->prog->insnsi; |
| 2491 | int insn_cnt = env->prog->len; | 2837 | int insn_cnt = env->prog->len; |
| 2492 | int i, j; | 2838 | int i, j, err; |
| 2493 | 2839 | ||
| 2494 | for (i = 0; i < insn_cnt; i++, insn++) { | 2840 | for (i = 0; i < insn_cnt; i++, insn++) { |
| 2495 | if (BPF_CLASS(insn->code) == BPF_LDX && | 2841 | if (BPF_CLASS(insn->code) == BPF_LDX && |
| @@ -2533,6 +2879,12 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) | |||
| 2533 | return PTR_ERR(map); | 2879 | return PTR_ERR(map); |
| 2534 | } | 2880 | } |
| 2535 | 2881 | ||
| 2882 | err = check_map_prog_compatibility(map, env->prog); | ||
| 2883 | if (err) { | ||
| 2884 | fdput(f); | ||
| 2885 | return err; | ||
| 2886 | } | ||
| 2887 | |||
| 2536 | /* store map pointer inside BPF_LD_IMM64 instruction */ | 2888 | /* store map pointer inside BPF_LD_IMM64 instruction */ |
| 2537 | insn[0].imm = (u32) (unsigned long) map; | 2889 | insn[0].imm = (u32) (unsigned long) map; |
| 2538 | insn[1].imm = ((u64) (unsigned long) map) >> 32; | 2890 | insn[1].imm = ((u64) (unsigned long) map) >> 32; |
| @@ -2576,7 +2928,7 @@ next_insn: | |||
| 2576 | } | 2928 | } |
| 2577 | 2929 | ||
| 2578 | /* drop refcnt of maps used by the rejected program */ | 2930 | /* drop refcnt of maps used by the rejected program */ |
| 2579 | static void release_maps(struct verifier_env *env) | 2931 | static void release_maps(struct bpf_verifier_env *env) |
| 2580 | { | 2932 | { |
| 2581 | int i; | 2933 | int i; |
| 2582 | 2934 | ||
| @@ -2585,7 +2937,7 @@ static void release_maps(struct verifier_env *env) | |||
| 2585 | } | 2937 | } |
| 2586 | 2938 | ||
| 2587 | /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ | 2939 | /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ |
| 2588 | static void convert_pseudo_ld_imm64(struct verifier_env *env) | 2940 | static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) |
| 2589 | { | 2941 | { |
| 2590 | struct bpf_insn *insn = env->prog->insnsi; | 2942 | struct bpf_insn *insn = env->prog->insnsi; |
| 2591 | int insn_cnt = env->prog->len; | 2943 | int insn_cnt = env->prog->len; |
| @@ -2599,62 +2951,74 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) | |||
| 2599 | /* convert load instructions that access fields of 'struct __sk_buff' | 2951 | /* convert load instructions that access fields of 'struct __sk_buff' |
| 2600 | * into sequence of instructions that access fields of 'struct sk_buff' | 2952 | * into sequence of instructions that access fields of 'struct sk_buff' |
| 2601 | */ | 2953 | */ |
| 2602 | static int convert_ctx_accesses(struct verifier_env *env) | 2954 | static int convert_ctx_accesses(struct bpf_verifier_env *env) |
| 2603 | { | 2955 | { |
| 2604 | struct bpf_insn *insn = env->prog->insnsi; | 2956 | const struct bpf_verifier_ops *ops = env->prog->aux->ops; |
| 2605 | int insn_cnt = env->prog->len; | 2957 | const int insn_cnt = env->prog->len; |
| 2606 | struct bpf_insn insn_buf[16]; | 2958 | struct bpf_insn insn_buf[16], *insn; |
| 2607 | struct bpf_prog *new_prog; | 2959 | struct bpf_prog *new_prog; |
| 2608 | enum bpf_access_type type; | 2960 | enum bpf_access_type type; |
| 2609 | int i; | 2961 | int i, cnt, delta = 0; |
| 2610 | 2962 | ||
| 2611 | if (!env->prog->aux->ops->convert_ctx_access) | 2963 | if (ops->gen_prologue) { |
| 2964 | cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, | ||
| 2965 | env->prog); | ||
| 2966 | if (cnt >= ARRAY_SIZE(insn_buf)) { | ||
| 2967 | verbose("bpf verifier is misconfigured\n"); | ||
| 2968 | return -EINVAL; | ||
| 2969 | } else if (cnt) { | ||
| 2970 | new_prog = bpf_patch_insn_single(env->prog, 0, | ||
| 2971 | insn_buf, cnt); | ||
| 2972 | if (!new_prog) | ||
| 2973 | return -ENOMEM; | ||
| 2974 | env->prog = new_prog; | ||
| 2975 | delta += cnt - 1; | ||
| 2976 | } | ||
| 2977 | } | ||
| 2978 | |||
| 2979 | if (!ops->convert_ctx_access) | ||
| 2612 | return 0; | 2980 | return 0; |
| 2613 | 2981 | ||
| 2614 | for (i = 0; i < insn_cnt; i++, insn++) { | 2982 | insn = env->prog->insnsi + delta; |
| 2615 | u32 insn_delta, cnt; | ||
| 2616 | 2983 | ||
| 2617 | if (insn->code == (BPF_LDX | BPF_MEM | BPF_W)) | 2984 | for (i = 0; i < insn_cnt; i++, insn++) { |
| 2985 | if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) || | ||
| 2986 | insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) | ||
| 2618 | type = BPF_READ; | 2987 | type = BPF_READ; |
| 2619 | else if (insn->code == (BPF_STX | BPF_MEM | BPF_W)) | 2988 | else if (insn->code == (BPF_STX | BPF_MEM | BPF_W) || |
| 2989 | insn->code == (BPF_STX | BPF_MEM | BPF_DW)) | ||
| 2620 | type = BPF_WRITE; | 2990 | type = BPF_WRITE; |
| 2621 | else | 2991 | else |
| 2622 | continue; | 2992 | continue; |
| 2623 | 2993 | ||
| 2624 | if (insn->imm != PTR_TO_CTX) { | 2994 | if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX) |
| 2625 | /* clear internal mark */ | ||
| 2626 | insn->imm = 0; | ||
| 2627 | continue; | 2995 | continue; |
| 2628 | } | ||
| 2629 | 2996 | ||
| 2630 | cnt = env->prog->aux->ops-> | 2997 | cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg, |
| 2631 | convert_ctx_access(type, insn->dst_reg, insn->src_reg, | 2998 | insn->off, insn_buf, env->prog); |
| 2632 | insn->off, insn_buf, env->prog); | ||
| 2633 | if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { | 2999 | if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { |
| 2634 | verbose("bpf verifier is misconfigured\n"); | 3000 | verbose("bpf verifier is misconfigured\n"); |
| 2635 | return -EINVAL; | 3001 | return -EINVAL; |
| 2636 | } | 3002 | } |
| 2637 | 3003 | ||
| 2638 | new_prog = bpf_patch_insn_single(env->prog, i, insn_buf, cnt); | 3004 | new_prog = bpf_patch_insn_single(env->prog, i + delta, insn_buf, |
| 3005 | cnt); | ||
| 2639 | if (!new_prog) | 3006 | if (!new_prog) |
| 2640 | return -ENOMEM; | 3007 | return -ENOMEM; |
| 2641 | 3008 | ||
| 2642 | insn_delta = cnt - 1; | 3009 | delta += cnt - 1; |
| 2643 | 3010 | ||
| 2644 | /* keep walking new program and skip insns we just inserted */ | 3011 | /* keep walking new program and skip insns we just inserted */ |
| 2645 | env->prog = new_prog; | 3012 | env->prog = new_prog; |
| 2646 | insn = new_prog->insnsi + i + insn_delta; | 3013 | insn = new_prog->insnsi + i + delta; |
| 2647 | |||
| 2648 | insn_cnt += insn_delta; | ||
| 2649 | i += insn_delta; | ||
| 2650 | } | 3014 | } |
| 2651 | 3015 | ||
| 2652 | return 0; | 3016 | return 0; |
| 2653 | } | 3017 | } |
| 2654 | 3018 | ||
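One detail of the rewritten loop that is easy to miss: i keeps counting instructions of the original program and indexing insn_aux_data[], while delta accumulates how many instructions the prologue and earlier context rewrites have already inserted, so the live instruction always sits at i + delta in the patched program. Schematically, for a two-instruction prologue (cnt = 2, so delta = 1):

    /* invariant inside the loop, shown for i = 5, delta = 1:
     *   insn             == env->prog->insnsi + i + delta   (patched index 6)
     *   insn_aux_data[i]  still refers to the original instruction 5
     */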
| 2655 | static void free_states(struct verifier_env *env) | 3019 | static void free_states(struct bpf_verifier_env *env) |
| 2656 | { | 3020 | { |
| 2657 | struct verifier_state_list *sl, *sln; | 3021 | struct bpf_verifier_state_list *sl, *sln; |
| 2658 | int i; | 3022 | int i; |
| 2659 | 3023 | ||
| 2660 | if (!env->explored_states) | 3024 | if (!env->explored_states) |
| @@ -2677,19 +3041,24 @@ static void free_states(struct verifier_env *env) | |||
| 2677 | int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) | 3041 | int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) |
| 2678 | { | 3042 | { |
| 2679 | char __user *log_ubuf = NULL; | 3043 | char __user *log_ubuf = NULL; |
| 2680 | struct verifier_env *env; | 3044 | struct bpf_verifier_env *env; |
| 2681 | int ret = -EINVAL; | 3045 | int ret = -EINVAL; |
| 2682 | 3046 | ||
| 2683 | if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) | 3047 | if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) |
| 2684 | return -E2BIG; | 3048 | return -E2BIG; |
| 2685 | 3049 | ||
| 2686 | /* 'struct verifier_env' can be global, but since it's not small, | 3050 | /* 'struct bpf_verifier_env' can be global, but since it's not small, |
| 2687 | * allocate/free it every time bpf_check() is called | 3051 | * allocate/free it every time bpf_check() is called |
| 2688 | */ | 3052 | */ |
| 2689 | env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL); | 3053 | env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); |
| 2690 | if (!env) | 3054 | if (!env) |
| 2691 | return -ENOMEM; | 3055 | return -ENOMEM; |
| 2692 | 3056 | ||
| 3057 | env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * | ||
| 3058 | (*prog)->len); | ||
| 3059 | ret = -ENOMEM; | ||
| 3060 | if (!env->insn_aux_data) | ||
| 3061 | goto err_free_env; | ||
| 2693 | env->prog = *prog; | 3062 | env->prog = *prog; |
| 2694 | 3063 | ||
| 2695 | /* grab the mutex to protect few globals used by verifier */ | 3064 | /* grab the mutex to protect few globals used by verifier */ |
| @@ -2708,12 +3077,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) | |||
| 2708 | /* log_* values have to be sane */ | 3077 | /* log_* values have to be sane */ |
| 2709 | if (log_size < 128 || log_size > UINT_MAX >> 8 || | 3078 | if (log_size < 128 || log_size > UINT_MAX >> 8 || |
| 2710 | log_level == 0 || log_ubuf == NULL) | 3079 | log_level == 0 || log_ubuf == NULL) |
| 2711 | goto free_env; | 3080 | goto err_unlock; |
| 2712 | 3081 | ||
| 2713 | ret = -ENOMEM; | 3082 | ret = -ENOMEM; |
| 2714 | log_buf = vmalloc(log_size); | 3083 | log_buf = vmalloc(log_size); |
| 2715 | if (!log_buf) | 3084 | if (!log_buf) |
| 2716 | goto free_env; | 3085 | goto err_unlock; |
| 2717 | } else { | 3086 | } else { |
| 2718 | log_level = 0; | 3087 | log_level = 0; |
| 2719 | } | 3088 | } |
| @@ -2723,7 +3092,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) | |||
| 2723 | goto skip_full_check; | 3092 | goto skip_full_check; |
| 2724 | 3093 | ||
| 2725 | env->explored_states = kcalloc(env->prog->len, | 3094 | env->explored_states = kcalloc(env->prog->len, |
| 2726 | sizeof(struct verifier_state_list *), | 3095 | sizeof(struct bpf_verifier_state_list *), |
| 2727 | GFP_USER); | 3096 | GFP_USER); |
| 2728 | ret = -ENOMEM; | 3097 | ret = -ENOMEM; |
| 2729 | if (!env->explored_states) | 3098 | if (!env->explored_states) |
| @@ -2782,14 +3151,67 @@ skip_full_check: | |||
| 2782 | free_log_buf: | 3151 | free_log_buf: |
| 2783 | if (log_level) | 3152 | if (log_level) |
| 2784 | vfree(log_buf); | 3153 | vfree(log_buf); |
| 2785 | free_env: | ||
| 2786 | if (!env->prog->aux->used_maps) | 3154 | if (!env->prog->aux->used_maps) |
| 2787 | /* if we didn't copy map pointers into bpf_prog_info, release | 3155 | /* if we didn't copy map pointers into bpf_prog_info, release |
| 2788 | * them now. Otherwise free_bpf_prog_info() will release them. | 3156 | * them now. Otherwise free_bpf_prog_info() will release them. |
| 2789 | */ | 3157 | */ |
| 2790 | release_maps(env); | 3158 | release_maps(env); |
| 2791 | *prog = env->prog; | 3159 | *prog = env->prog; |
| 3160 | err_unlock: | ||
| 3161 | mutex_unlock(&bpf_verifier_lock); | ||
| 3162 | vfree(env->insn_aux_data); | ||
| 3163 | err_free_env: | ||
| 2792 | kfree(env); | 3164 | kfree(env); |
| 3165 | return ret; | ||
| 3166 | } | ||
| 3167 | |||
| 3168 | int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, | ||
| 3169 | void *priv) | ||
| 3170 | { | ||
| 3171 | struct bpf_verifier_env *env; | ||
| 3172 | int ret; | ||
| 3173 | |||
| 3174 | env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); | ||
| 3175 | if (!env) | ||
| 3176 | return -ENOMEM; | ||
| 3177 | |||
| 3178 | env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * | ||
| 3179 | prog->len); | ||
| 3180 | ret = -ENOMEM; | ||
| 3181 | if (!env->insn_aux_data) | ||
| 3182 | goto err_free_env; | ||
| 3183 | env->prog = prog; | ||
| 3184 | env->analyzer_ops = ops; | ||
| 3185 | env->analyzer_priv = priv; | ||
| 3186 | |||
| 3187 | /* grab the mutex to protect few globals used by verifier */ | ||
| 3188 | mutex_lock(&bpf_verifier_lock); | ||
| 3189 | |||
| 3190 | log_level = 0; | ||
| 3191 | |||
| 3192 | env->explored_states = kcalloc(env->prog->len, | ||
| 3193 | sizeof(struct bpf_verifier_state_list *), | ||
| 3194 | GFP_KERNEL); | ||
| 3195 | ret = -ENOMEM; | ||
| 3196 | if (!env->explored_states) | ||
| 3197 | goto skip_full_check; | ||
| 3198 | |||
| 3199 | ret = check_cfg(env); | ||
| 3200 | if (ret < 0) | ||
| 3201 | goto skip_full_check; | ||
| 3202 | |||
| 3203 | env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); | ||
| 3204 | |||
| 3205 | ret = do_check(env); | ||
| 3206 | |||
| 3207 | skip_full_check: | ||
| 3208 | while (pop_stack(env, NULL) >= 0); | ||
| 3209 | free_states(env); | ||
| 3210 | |||
| 2793 | mutex_unlock(&bpf_verifier_lock); | 3211 | mutex_unlock(&bpf_verifier_lock); |
| 3212 | vfree(env->insn_aux_data); | ||
| 3213 | err_free_env: | ||
| 3214 | kfree(env); | ||
| 2794 | return ret; | 3215 | return ret; |
| 2795 | } | 3216 | } |
| 3217 | EXPORT_SYMBOL_GPL(bpf_analyzer); | ||
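bpf_analyzer() reruns the CFG check and do_check() without a log buffer or context rewriting, so an offload driver can watch the verifier walk a program it intends to translate. A hedged sketch of a caller follows; the .insn_hook member of struct bpf_ext_analyzer_ops is assumed from the in-tree offload user of this hook, and every my_* name is invented.

#include <linux/bpf.h>
#include <linux/bpf_verifier.h>

/* Called for every instruction the verifier visits (assumed callback). */
static int my_insn_hook(struct bpf_verifier_env *env,
			int insn_idx, int prev_insn_idx)
{
	/* inspect the verifier state reached at insn_idx here */
	return 0;	/* a non-zero return aborts the analysis */
}

static const struct bpf_ext_analyzer_ops my_analyzer_ops = {
	.insn_hook	= my_insn_hook,
};

static int my_offload_verify(struct bpf_prog *prog, void *priv)
{
	/* priv comes back through env->analyzer_priv in the callback */
	return bpf_analyzer(prog, &my_analyzer_ops, priv);
}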
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d1c51b7f5221..85bc9beb046d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
| @@ -64,6 +64,9 @@ | |||
| 64 | #include <linux/file.h> | 64 | #include <linux/file.h> |
| 65 | #include <net/sock.h> | 65 | #include <net/sock.h> |
| 66 | 66 | ||
| 67 | #define CREATE_TRACE_POINTS | ||
| 68 | #include <trace/events/cgroup.h> | ||
| 69 | |||
| 67 | /* | 70 | /* |
| 68 | * pidlists linger the following amount before being destroyed. The goal | 71 | * pidlists linger the following amount before being destroyed. The goal |
| 69 | * is avoiding frequent destruction in the middle of consecutive read calls | 72 | * is avoiding frequent destruction in the middle of consecutive read calls |
| @@ -1176,6 +1179,8 @@ static void cgroup_destroy_root(struct cgroup_root *root) | |||
| 1176 | struct cgroup *cgrp = &root->cgrp; | 1179 | struct cgroup *cgrp = &root->cgrp; |
| 1177 | struct cgrp_cset_link *link, *tmp_link; | 1180 | struct cgrp_cset_link *link, *tmp_link; |
| 1178 | 1181 | ||
| 1182 | trace_cgroup_destroy_root(root); | ||
| 1183 | |||
| 1179 | cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); | 1184 | cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); |
| 1180 | 1185 | ||
| 1181 | BUG_ON(atomic_read(&root->nr_cgrps)); | 1186 | BUG_ON(atomic_read(&root->nr_cgrps)); |
| @@ -1874,6 +1879,9 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) | |||
| 1874 | strcpy(root->release_agent_path, opts.release_agent); | 1879 | strcpy(root->release_agent_path, opts.release_agent); |
| 1875 | spin_unlock(&release_agent_path_lock); | 1880 | spin_unlock(&release_agent_path_lock); |
| 1876 | } | 1881 | } |
| 1882 | |||
| 1883 | trace_cgroup_remount(root); | ||
| 1884 | |||
| 1877 | out_unlock: | 1885 | out_unlock: |
| 1878 | kfree(opts.release_agent); | 1886 | kfree(opts.release_agent); |
| 1879 | kfree(opts.name); | 1887 | kfree(opts.name); |
| @@ -2031,6 +2039,8 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) | |||
| 2031 | if (ret) | 2039 | if (ret) |
| 2032 | goto destroy_root; | 2040 | goto destroy_root; |
| 2033 | 2041 | ||
| 2042 | trace_cgroup_setup_root(root); | ||
| 2043 | |||
| 2034 | /* | 2044 | /* |
| 2035 | * There must be no failure case after here, since rebinding takes | 2045 | * There must be no failure case after here, since rebinding takes |
| 2036 | * care of subsystems' refcounts, which are explicitly dropped in | 2046 | * care of subsystems' refcounts, which are explicitly dropped in |
| @@ -2315,22 +2325,18 @@ static struct file_system_type cgroup2_fs_type = { | |||
| 2315 | .fs_flags = FS_USERNS_MOUNT, | 2325 | .fs_flags = FS_USERNS_MOUNT, |
| 2316 | }; | 2326 | }; |
| 2317 | 2327 | ||
| 2318 | static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, | 2328 | static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, |
| 2319 | struct cgroup_namespace *ns) | 2329 | struct cgroup_namespace *ns) |
| 2320 | { | 2330 | { |
| 2321 | struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); | 2331 | struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); |
| 2322 | int ret; | ||
| 2323 | 2332 | ||
| 2324 | ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); | 2333 | return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); |
| 2325 | if (ret < 0 || ret >= buflen) | ||
| 2326 | return NULL; | ||
| 2327 | return buf; | ||
| 2328 | } | 2334 | } |
| 2329 | 2335 | ||
| 2330 | char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, | 2336 | int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, |
| 2331 | struct cgroup_namespace *ns) | 2337 | struct cgroup_namespace *ns) |
| 2332 | { | 2338 | { |
| 2333 | char *ret; | 2339 | int ret; |
| 2334 | 2340 | ||
| 2335 | mutex_lock(&cgroup_mutex); | 2341 | mutex_lock(&cgroup_mutex); |
| 2336 | spin_lock_irq(&css_set_lock); | 2342 | spin_lock_irq(&css_set_lock); |
| @@ -2357,12 +2363,12 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); | |||
| 2357 | * | 2363 | * |
| 2358 | * Return value is the same as kernfs_path(). | 2364 | * Return value is the same as kernfs_path(). |
| 2359 | */ | 2365 | */ |
| 2360 | char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) | 2366 | int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) |
| 2361 | { | 2367 | { |
| 2362 | struct cgroup_root *root; | 2368 | struct cgroup_root *root; |
| 2363 | struct cgroup *cgrp; | 2369 | struct cgroup *cgrp; |
| 2364 | int hierarchy_id = 1; | 2370 | int hierarchy_id = 1; |
| 2365 | char *path = NULL; | 2371 | int ret; |
| 2366 | 2372 | ||
| 2367 | mutex_lock(&cgroup_mutex); | 2373 | mutex_lock(&cgroup_mutex); |
| 2368 | spin_lock_irq(&css_set_lock); | 2374 | spin_lock_irq(&css_set_lock); |
| @@ -2371,16 +2377,15 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) | |||
| 2371 | 2377 | ||
| 2372 | if (root) { | 2378 | if (root) { |
| 2373 | cgrp = task_cgroup_from_root(task, root); | 2379 | cgrp = task_cgroup_from_root(task, root); |
| 2374 | path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); | 2380 | ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); |
| 2375 | } else { | 2381 | } else { |
| 2376 | /* if no hierarchy exists, everyone is in "/" */ | 2382 | /* if no hierarchy exists, everyone is in "/" */ |
| 2377 | if (strlcpy(buf, "/", buflen) < buflen) | 2383 | ret = strlcpy(buf, "/", buflen); |
| 2378 | path = buf; | ||
| 2379 | } | 2384 | } |
| 2380 | 2385 | ||
| 2381 | spin_unlock_irq(&css_set_lock); | 2386 | spin_unlock_irq(&css_set_lock); |
| 2382 | mutex_unlock(&cgroup_mutex); | 2387 | mutex_unlock(&cgroup_mutex); |
| 2383 | return path; | 2388 | return ret; |
| 2384 | } | 2389 | } |
| 2385 | EXPORT_SYMBOL_GPL(task_cgroup_path); | 2390 | EXPORT_SYMBOL_GPL(task_cgroup_path); |
| 2386 | 2391 | ||
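With the switch from returning a buffer pointer to a kernfs_path()-style length, callers of cgroup_path_ns() and task_cgroup_path() must treat both a negative return and a value >= buflen as failure. A sketch of the adjusted calling pattern; the surrounding function, task pointer and logging are illustrative only:

#include <linux/cgroup.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/printk.h>
#include <linux/errno.h>
#include <linux/limits.h>

static int log_task_cgroup(struct task_struct *tsk)
{
	char *buf;
	int ret;

	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	ret = task_cgroup_path(tsk, buf, PATH_MAX);
	if (ret >= PATH_MAX)		/* the path did not fit */
		ret = -ENAMETOOLONG;
	if (ret >= 0) {
		pr_info("%s: cgroup %s\n", tsk->comm, buf);
		ret = 0;
	}
	kfree(buf);
	return ret;
}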
| @@ -2830,6 +2835,10 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, | |||
| 2830 | ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); | 2835 | ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); |
| 2831 | 2836 | ||
| 2832 | cgroup_migrate_finish(&preloaded_csets); | 2837 | cgroup_migrate_finish(&preloaded_csets); |
| 2838 | |||
| 2839 | if (!ret) | ||
| 2840 | trace_cgroup_attach_task(dst_cgrp, leader, threadgroup); | ||
| 2841 | |||
| 2833 | return ret; | 2842 | return ret; |
| 2834 | } | 2843 | } |
| 2835 | 2844 | ||
| @@ -3446,9 +3455,28 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, | |||
| 3446 | * Except for the root, subtree_control must be zero for a cgroup | 3455 | * Except for the root, subtree_control must be zero for a cgroup |
| 3447 | * with tasks so that child cgroups don't compete against tasks. | 3456 | * with tasks so that child cgroups don't compete against tasks. |
| 3448 | */ | 3457 | */ |
| 3449 | if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { | 3458 | if (enable && cgroup_parent(cgrp)) { |
| 3450 | ret = -EBUSY; | 3459 | struct cgrp_cset_link *link; |
| 3451 | goto out_unlock; | 3460 | |
| 3461 | /* | ||
| 3462 | * Because namespaces pin csets too, @cgrp->cset_links | ||
| 3463 | * might not be empty even when @cgrp is empty. Walk and | ||
| 3464 | * verify each cset. | ||
| 3465 | */ | ||
| 3466 | spin_lock_irq(&css_set_lock); | ||
| 3467 | |||
| 3468 | ret = 0; | ||
| 3469 | list_for_each_entry(link, &cgrp->cset_links, cset_link) { | ||
| 3470 | if (css_set_populated(link->cset)) { | ||
| 3471 | ret = -EBUSY; | ||
| 3472 | break; | ||
| 3473 | } | ||
| 3474 | } | ||
| 3475 | |||
| 3476 | spin_unlock_irq(&css_set_lock); | ||
| 3477 | |||
| 3478 | if (ret) | ||
| 3479 | goto out_unlock; | ||
| 3452 | } | 3480 | } |
| 3453 | 3481 | ||
| 3454 | /* save and update control masks and prepare csses */ | 3482 | /* save and update control masks and prepare csses */ |
| @@ -3592,6 +3620,8 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, | |||
| 3592 | mutex_lock(&cgroup_mutex); | 3620 | mutex_lock(&cgroup_mutex); |
| 3593 | 3621 | ||
| 3594 | ret = kernfs_rename(kn, new_parent, new_name_str); | 3622 | ret = kernfs_rename(kn, new_parent, new_name_str); |
| 3623 | if (!ret) | ||
| 3624 | trace_cgroup_rename(cgrp); | ||
| 3595 | 3625 | ||
| 3596 | mutex_unlock(&cgroup_mutex); | 3626 | mutex_unlock(&cgroup_mutex); |
| 3597 | 3627 | ||
| @@ -3899,7 +3929,9 @@ void cgroup_file_notify(struct cgroup_file *cfile) | |||
| 3899 | * cgroup_task_count - count the number of tasks in a cgroup. | 3929 | * cgroup_task_count - count the number of tasks in a cgroup. |
| 3900 | * @cgrp: the cgroup in question | 3930 | * @cgrp: the cgroup in question |
| 3901 | * | 3931 | * |
| 3902 | * Return the number of tasks in the cgroup. | 3932 | * Return the number of tasks in the cgroup. The returned number can be |
| 3933 | * higher than the actual number of tasks due to css_set references from | ||
| 3934 | * namespace roots and temporary usages. | ||
| 3903 | */ | 3935 | */ |
| 3904 | static int cgroup_task_count(const struct cgroup *cgrp) | 3936 | static int cgroup_task_count(const struct cgroup *cgrp) |
| 3905 | { | 3937 | { |
| @@ -4360,6 +4392,8 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) | |||
| 4360 | 4392 | ||
| 4361 | if (task) { | 4393 | if (task) { |
| 4362 | ret = cgroup_migrate(task, false, to->root); | 4394 | ret = cgroup_migrate(task, false, to->root); |
| 4395 | if (!ret) | ||
| 4396 | trace_cgroup_transfer_tasks(to, task, false); | ||
| 4363 | put_task_struct(task); | 4397 | put_task_struct(task); |
| 4364 | } | 4398 | } |
| 4365 | } while (task && !ret); | 4399 | } while (task && !ret); |
| @@ -5025,6 +5059,8 @@ static void css_release_work_fn(struct work_struct *work) | |||
| 5025 | ss->css_released(css); | 5059 | ss->css_released(css); |
| 5026 | } else { | 5060 | } else { |
| 5027 | /* cgroup release path */ | 5061 | /* cgroup release path */ |
| 5062 | trace_cgroup_release(cgrp); | ||
| 5063 | |||
| 5028 | cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); | 5064 | cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); |
| 5029 | cgrp->id = -1; | 5065 | cgrp->id = -1; |
| 5030 | 5066 | ||
| @@ -5311,6 +5347,8 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, | |||
| 5311 | if (ret) | 5347 | if (ret) |
| 5312 | goto out_destroy; | 5348 | goto out_destroy; |
| 5313 | 5349 | ||
| 5350 | trace_cgroup_mkdir(cgrp); | ||
| 5351 | |||
| 5314 | /* let's create and online css's */ | 5352 | /* let's create and online css's */ |
| 5315 | kernfs_activate(kn); | 5353 | kernfs_activate(kn); |
| 5316 | 5354 | ||
| @@ -5486,6 +5524,9 @@ static int cgroup_rmdir(struct kernfs_node *kn) | |||
| 5486 | 5524 | ||
| 5487 | ret = cgroup_destroy_locked(cgrp); | 5525 | ret = cgroup_destroy_locked(cgrp); |
| 5488 | 5526 | ||
| 5527 | if (!ret) | ||
| 5528 | trace_cgroup_rmdir(cgrp); | ||
| 5529 | |||
| 5489 | cgroup_kn_unlock(kn); | 5530 | cgroup_kn_unlock(kn); |
| 5490 | return ret; | 5531 | return ret; |
| 5491 | } | 5532 | } |
| @@ -5606,6 +5647,12 @@ int __init cgroup_init(void) | |||
| 5606 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); | 5647 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); |
| 5607 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); | 5648 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); |
| 5608 | 5649 | ||
| 5650 | /* | ||
| 5651 | * The latency of the synchronize_sched() is too high for cgroups, | ||
| 5652 | * avoid it at the cost of forcing all readers into the slow path. | ||
| 5653 | */ | ||
| 5654 | rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); | ||
| 5655 | |||
| 5609 | get_user_ns(init_cgroup_ns.user_ns); | 5656 | get_user_ns(init_cgroup_ns.user_ns); |
| 5610 | 5657 | ||
| 5611 | mutex_lock(&cgroup_mutex); | 5658 | mutex_lock(&cgroup_mutex); |
| @@ -5716,7 +5763,7 @@ core_initcall(cgroup_wq_init); | |||
| 5716 | int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, | 5763 | int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, |
| 5717 | struct pid *pid, struct task_struct *tsk) | 5764 | struct pid *pid, struct task_struct *tsk) |
| 5718 | { | 5765 | { |
| 5719 | char *buf, *path; | 5766 | char *buf; |
| 5720 | int retval; | 5767 | int retval; |
| 5721 | struct cgroup_root *root; | 5768 | struct cgroup_root *root; |
| 5722 | 5769 | ||
| @@ -5759,18 +5806,18 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, | |||
| 5759 | * " (deleted)" is appended to the cgroup path. | 5806 | * " (deleted)" is appended to the cgroup path. |
| 5760 | */ | 5807 | */ |
| 5761 | if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { | 5808 | if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { |
| 5762 | path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, | 5809 | retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, |
| 5763 | current->nsproxy->cgroup_ns); | 5810 | current->nsproxy->cgroup_ns); |
| 5764 | if (!path) { | 5811 | if (retval >= PATH_MAX) |
| 5765 | retval = -ENAMETOOLONG; | 5812 | retval = -ENAMETOOLONG; |
| 5813 | if (retval < 0) | ||
| 5766 | goto out_unlock; | 5814 | goto out_unlock; |
| 5767 | } | 5815 | |
| 5816 | seq_puts(m, buf); | ||
| 5768 | } else { | 5817 | } else { |
| 5769 | path = "/"; | 5818 | seq_puts(m, "/"); |
| 5770 | } | 5819 | } |
| 5771 | 5820 | ||
| 5772 | seq_puts(m, path); | ||
| 5773 | |||
| 5774 | if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp)) | 5821 | if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp)) |
| 5775 | seq_puts(m, " (deleted)\n"); | 5822 | seq_puts(m, " (deleted)\n"); |
| 5776 | else | 5823 | else |
| @@ -6035,8 +6082,9 @@ static void cgroup_release_agent(struct work_struct *work) | |||
| 6035 | { | 6082 | { |
| 6036 | struct cgroup *cgrp = | 6083 | struct cgroup *cgrp = |
| 6037 | container_of(work, struct cgroup, release_agent_work); | 6084 | container_of(work, struct cgroup, release_agent_work); |
| 6038 | char *pathbuf = NULL, *agentbuf = NULL, *path; | 6085 | char *pathbuf = NULL, *agentbuf = NULL; |
| 6039 | char *argv[3], *envp[3]; | 6086 | char *argv[3], *envp[3]; |
| 6087 | int ret; | ||
| 6040 | 6088 | ||
| 6041 | mutex_lock(&cgroup_mutex); | 6089 | mutex_lock(&cgroup_mutex); |
| 6042 | 6090 | ||
| @@ -6046,13 +6094,13 @@ static void cgroup_release_agent(struct work_struct *work) | |||
| 6046 | goto out; | 6094 | goto out; |
| 6047 | 6095 | ||
| 6048 | spin_lock_irq(&css_set_lock); | 6096 | spin_lock_irq(&css_set_lock); |
| 6049 | path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); | 6097 | ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); |
| 6050 | spin_unlock_irq(&css_set_lock); | 6098 | spin_unlock_irq(&css_set_lock); |
| 6051 | if (!path) | 6099 | if (ret < 0 || ret >= PATH_MAX) |
| 6052 | goto out; | 6100 | goto out; |
| 6053 | 6101 | ||
| 6054 | argv[0] = agentbuf; | 6102 | argv[0] = agentbuf; |
| 6055 | argv[1] = path; | 6103 | argv[1] = pathbuf; |
| 6056 | argv[2] = NULL; | 6104 | argv[2] = NULL; |
| 6057 | 6105 | ||
| 6058 | /* minimal command environment */ | 6106 | /* minimal command environment */ |
| @@ -6270,6 +6318,12 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) | |||
| 6270 | if (cgroup_sk_alloc_disabled) | 6318 | if (cgroup_sk_alloc_disabled) |
| 6271 | return; | 6319 | return; |
| 6272 | 6320 | ||
| 6321 | /* Socket clone path */ | ||
| 6322 | if (skcd->val) { | ||
| 6323 | cgroup_get(sock_cgroup_ptr(skcd)); | ||
| 6324 | return; | ||
| 6325 | } | ||
| 6326 | |||
| 6273 | rcu_read_lock(); | 6327 | rcu_read_lock(); |
| 6274 | 6328 | ||
| 6275 | while (true) { | 6329 | while (true) { |
| @@ -6295,6 +6349,16 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) | |||
| 6295 | 6349 | ||
| 6296 | /* cgroup namespaces */ | 6350 | /* cgroup namespaces */ |
| 6297 | 6351 | ||
| 6352 | static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns) | ||
| 6353 | { | ||
| 6354 | return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES); | ||
| 6355 | } | ||
| 6356 | |||
| 6357 | static void dec_cgroup_namespaces(struct ucounts *ucounts) | ||
| 6358 | { | ||
| 6359 | dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES); | ||
| 6360 | } | ||
| 6361 | |||
| 6298 | static struct cgroup_namespace *alloc_cgroup_ns(void) | 6362 | static struct cgroup_namespace *alloc_cgroup_ns(void) |
| 6299 | { | 6363 | { |
| 6300 | struct cgroup_namespace *new_ns; | 6364 | struct cgroup_namespace *new_ns; |
| @@ -6316,6 +6380,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) | |||
| 6316 | void free_cgroup_ns(struct cgroup_namespace *ns) | 6380 | void free_cgroup_ns(struct cgroup_namespace *ns) |
| 6317 | { | 6381 | { |
| 6318 | put_css_set(ns->root_cset); | 6382 | put_css_set(ns->root_cset); |
| 6383 | dec_cgroup_namespaces(ns->ucounts); | ||
| 6319 | put_user_ns(ns->user_ns); | 6384 | put_user_ns(ns->user_ns); |
| 6320 | ns_free_inum(&ns->ns); | 6385 | ns_free_inum(&ns->ns); |
| 6321 | kfree(ns); | 6386 | kfree(ns); |
| @@ -6327,6 +6392,7 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, | |||
| 6327 | struct cgroup_namespace *old_ns) | 6392 | struct cgroup_namespace *old_ns) |
| 6328 | { | 6393 | { |
| 6329 | struct cgroup_namespace *new_ns; | 6394 | struct cgroup_namespace *new_ns; |
| 6395 | struct ucounts *ucounts; | ||
| 6330 | struct css_set *cset; | 6396 | struct css_set *cset; |
| 6331 | 6397 | ||
| 6332 | BUG_ON(!old_ns); | 6398 | BUG_ON(!old_ns); |
| @@ -6340,6 +6406,10 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, | |||
| 6340 | if (!ns_capable(user_ns, CAP_SYS_ADMIN)) | 6406 | if (!ns_capable(user_ns, CAP_SYS_ADMIN)) |
| 6341 | return ERR_PTR(-EPERM); | 6407 | return ERR_PTR(-EPERM); |
| 6342 | 6408 | ||
| 6409 | ucounts = inc_cgroup_namespaces(user_ns); | ||
| 6410 | if (!ucounts) | ||
| 6411 | return ERR_PTR(-ENOSPC); | ||
| 6412 | |||
| 6343 | /* It is not safe to take cgroup_mutex here */ | 6413 | /* It is not safe to take cgroup_mutex here */ |
| 6344 | spin_lock_irq(&css_set_lock); | 6414 | spin_lock_irq(&css_set_lock); |
| 6345 | cset = task_css_set(current); | 6415 | cset = task_css_set(current); |
| @@ -6349,10 +6419,12 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, | |||
| 6349 | new_ns = alloc_cgroup_ns(); | 6419 | new_ns = alloc_cgroup_ns(); |
| 6350 | if (IS_ERR(new_ns)) { | 6420 | if (IS_ERR(new_ns)) { |
| 6351 | put_css_set(cset); | 6421 | put_css_set(cset); |
| 6422 | dec_cgroup_namespaces(ucounts); | ||
| 6352 | return new_ns; | 6423 | return new_ns; |
| 6353 | } | 6424 | } |
| 6354 | 6425 | ||
| 6355 | new_ns->user_ns = get_user_ns(user_ns); | 6426 | new_ns->user_ns = get_user_ns(user_ns); |
| 6427 | new_ns->ucounts = ucounts; | ||
| 6356 | new_ns->root_cset = cset; | 6428 | new_ns->root_cset = cset; |
| 6357 | 6429 | ||
| 6358 | return new_ns; | 6430 | return new_ns; |
| @@ -6403,12 +6475,18 @@ static void cgroupns_put(struct ns_common *ns) | |||
| 6403 | put_cgroup_ns(to_cg_ns(ns)); | 6475 | put_cgroup_ns(to_cg_ns(ns)); |
| 6404 | } | 6476 | } |
| 6405 | 6477 | ||
| 6478 | static struct user_namespace *cgroupns_owner(struct ns_common *ns) | ||
| 6479 | { | ||
| 6480 | return to_cg_ns(ns)->user_ns; | ||
| 6481 | } | ||
| 6482 | |||
| 6406 | const struct proc_ns_operations cgroupns_operations = { | 6483 | const struct proc_ns_operations cgroupns_operations = { |
| 6407 | .name = "cgroup", | 6484 | .name = "cgroup", |
| 6408 | .type = CLONE_NEWCGROUP, | 6485 | .type = CLONE_NEWCGROUP, |
| 6409 | .get = cgroupns_get, | 6486 | .get = cgroupns_get, |
| 6410 | .put = cgroupns_put, | 6487 | .put = cgroupns_put, |
| 6411 | .install = cgroupns_install, | 6488 | .install = cgroupns_install, |
| 6489 | .owner = cgroupns_owner, | ||
| 6412 | }; | 6490 | }; |
| 6413 | 6491 | ||
| 6414 | static __init int cgroup_namespaces_init(void) | 6492 | static __init int cgroup_namespaces_init(void) |
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index 9f748ed7bea8..1a8f34f63601 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config | |||
| @@ -11,7 +11,6 @@ CONFIG_ANDROID_LOW_MEMORY_KILLER=y | |||
| 11 | CONFIG_ARMV8_DEPRECATED=y | 11 | CONFIG_ARMV8_DEPRECATED=y |
| 12 | CONFIG_ASHMEM=y | 12 | CONFIG_ASHMEM=y |
| 13 | CONFIG_AUDIT=y | 13 | CONFIG_AUDIT=y |
| 14 | CONFIG_BLK_DEV_DM=y | ||
| 15 | CONFIG_BLK_DEV_INITRD=y | 14 | CONFIG_BLK_DEV_INITRD=y |
| 16 | CONFIG_CGROUPS=y | 15 | CONFIG_CGROUPS=y |
| 17 | CONFIG_CGROUP_CPUACCT=y | 16 | CONFIG_CGROUP_CPUACCT=y |
| @@ -19,9 +18,7 @@ CONFIG_CGROUP_DEBUG=y | |||
| 19 | CONFIG_CGROUP_FREEZER=y | 18 | CONFIG_CGROUP_FREEZER=y |
| 20 | CONFIG_CGROUP_SCHED=y | 19 | CONFIG_CGROUP_SCHED=y |
| 21 | CONFIG_CP15_BARRIER_EMULATION=y | 20 | CONFIG_CP15_BARRIER_EMULATION=y |
| 22 | CONFIG_DM_CRYPT=y | 21 | CONFIG_DEFAULT_SECURITY_SELINUX=y |
| 23 | CONFIG_DM_VERITY=y | ||
| 24 | CONFIG_DM_VERITY_FEC=y | ||
| 25 | CONFIG_EMBEDDED=y | 22 | CONFIG_EMBEDDED=y |
| 26 | CONFIG_FB=y | 23 | CONFIG_FB=y |
| 27 | CONFIG_HIGH_RES_TIMERS=y | 24 | CONFIG_HIGH_RES_TIMERS=y |
| @@ -41,7 +38,6 @@ CONFIG_IPV6=y | |||
| 41 | CONFIG_IPV6_MIP6=y | 38 | CONFIG_IPV6_MIP6=y |
| 42 | CONFIG_IPV6_MULTIPLE_TABLES=y | 39 | CONFIG_IPV6_MULTIPLE_TABLES=y |
| 43 | CONFIG_IPV6_OPTIMISTIC_DAD=y | 40 | CONFIG_IPV6_OPTIMISTIC_DAD=y |
| 44 | CONFIG_IPV6_PRIVACY=y | ||
| 45 | CONFIG_IPV6_ROUTER_PREF=y | 41 | CONFIG_IPV6_ROUTER_PREF=y |
| 46 | CONFIG_IPV6_ROUTE_INFO=y | 42 | CONFIG_IPV6_ROUTE_INFO=y |
| 47 | CONFIG_IP_ADVANCED_ROUTER=y | 43 | CONFIG_IP_ADVANCED_ROUTER=y |
| @@ -135,6 +131,7 @@ CONFIG_PREEMPT=y | |||
| 135 | CONFIG_QUOTA=y | 131 | CONFIG_QUOTA=y |
| 136 | CONFIG_RTC_CLASS=y | 132 | CONFIG_RTC_CLASS=y |
| 137 | CONFIG_RT_GROUP_SCHED=y | 133 | CONFIG_RT_GROUP_SCHED=y |
| 134 | CONFIG_SECCOMP=y | ||
| 138 | CONFIG_SECURITY=y | 135 | CONFIG_SECURITY=y |
| 139 | CONFIG_SECURITY_NETWORK=y | 136 | CONFIG_SECURITY_NETWORK=y |
| 140 | CONFIG_SECURITY_SELINUX=y | 137 | CONFIG_SECURITY_SELINUX=y |
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index e3b953e966d2..297756be369c 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config | |||
| @@ -6,12 +6,16 @@ | |||
| 6 | # CONFIG_PM_WAKELOCKS_GC is not set | 6 | # CONFIG_PM_WAKELOCKS_GC is not set |
| 7 | # CONFIG_VT is not set | 7 | # CONFIG_VT is not set |
| 8 | CONFIG_BACKLIGHT_LCD_SUPPORT=y | 8 | CONFIG_BACKLIGHT_LCD_SUPPORT=y |
| 9 | CONFIG_BLK_DEV_DM=y | ||
| 9 | CONFIG_BLK_DEV_LOOP=y | 10 | CONFIG_BLK_DEV_LOOP=y |
| 10 | CONFIG_BLK_DEV_RAM=y | 11 | CONFIG_BLK_DEV_RAM=y |
| 11 | CONFIG_BLK_DEV_RAM_SIZE=8192 | 12 | CONFIG_BLK_DEV_RAM_SIZE=8192 |
| 12 | CONFIG_COMPACTION=y | 13 | CONFIG_COMPACTION=y |
| 13 | CONFIG_DEBUG_RODATA=y | 14 | CONFIG_DEBUG_RODATA=y |
| 15 | CONFIG_DM_CRYPT=y | ||
| 14 | CONFIG_DM_UEVENT=y | 16 | CONFIG_DM_UEVENT=y |
| 17 | CONFIG_DM_VERITY=y | ||
| 18 | CONFIG_DM_VERITY_FEC=y | ||
| 15 | CONFIG_DRAGONRISE_FF=y | 19 | CONFIG_DRAGONRISE_FF=y |
| 16 | CONFIG_ENABLE_DEFAULT_TRACERS=y | 20 | CONFIG_ENABLE_DEFAULT_TRACERS=y |
| 17 | CONFIG_EXT4_FS=y | 21 | CONFIG_EXT4_FS=y |
diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config new file mode 100644 index 000000000000..8d9643767142 --- /dev/null +++ b/kernel/configs/kvm_guest.config | |||
| @@ -0,0 +1,32 @@ | |||
| 1 | CONFIG_NET=y | ||
| 2 | CONFIG_NET_CORE=y | ||
| 3 | CONFIG_NETDEVICES=y | ||
| 4 | CONFIG_BLOCK=y | ||
| 5 | CONFIG_BLK_DEV=y | ||
| 6 | CONFIG_NETWORK_FILESYSTEMS=y | ||
| 7 | CONFIG_INET=y | ||
| 8 | CONFIG_TTY=y | ||
| 9 | CONFIG_SERIAL_8250=y | ||
| 10 | CONFIG_SERIAL_8250_CONSOLE=y | ||
| 11 | CONFIG_IP_PNP=y | ||
| 12 | CONFIG_IP_PNP_DHCP=y | ||
| 13 | CONFIG_BINFMT_ELF=y | ||
| 14 | CONFIG_PCI=y | ||
| 15 | CONFIG_PCI_MSI=y | ||
| 16 | CONFIG_DEBUG_KERNEL=y | ||
| 17 | CONFIG_VIRTUALIZATION=y | ||
| 18 | CONFIG_HYPERVISOR_GUEST=y | ||
| 19 | CONFIG_PARAVIRT=y | ||
| 20 | CONFIG_KVM_GUEST=y | ||
| 21 | CONFIG_VIRTIO=y | ||
| 22 | CONFIG_VIRTIO_PCI=y | ||
| 23 | CONFIG_VIRTIO_BLK=y | ||
| 24 | CONFIG_VIRTIO_CONSOLE=y | ||
| 25 | CONFIG_VIRTIO_NET=y | ||
| 26 | CONFIG_9P_FS=y | ||
| 27 | CONFIG_NET_9P=y | ||
| 28 | CONFIG_NET_9P_VIRTIO=y | ||
| 29 | CONFIG_SCSI_LOWLEVEL=y | ||
| 30 | CONFIG_SCSI_VIRTIO=y | ||
| 31 | CONFIG_VIRTIO_INPUT=y | ||
| 32 | CONFIG_DRM_VIRTIO_GPU=y | ||
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index c2de56ab0fce..7fa0c4ae6394 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config | |||
| @@ -1,4 +1,12 @@ | |||
| 1 | # CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set | ||
| 1 | CONFIG_CC_OPTIMIZE_FOR_SIZE=y | 2 | CONFIG_CC_OPTIMIZE_FOR_SIZE=y |
| 3 | # CONFIG_KERNEL_GZIP is not set | ||
| 4 | # CONFIG_KERNEL_BZIP2 is not set | ||
| 5 | # CONFIG_KERNEL_LZMA is not set | ||
| 2 | CONFIG_KERNEL_XZ=y | 6 | CONFIG_KERNEL_XZ=y |
| 7 | # CONFIG_KERNEL_LZO is not set | ||
| 8 | # CONFIG_KERNEL_LZ4 is not set | ||
| 3 | CONFIG_OPTIMIZE_INLINING=y | 9 | CONFIG_OPTIMIZE_INLINING=y |
| 10 | # CONFIG_SLAB is not set | ||
| 11 | # CONFIG_SLUB is not set | ||
| 4 | CONFIG_SLOB=y | 12 | CONFIG_SLOB=y |
diff --git a/kernel/cpu.c b/kernel/cpu.c index 341bf80f80bd..29de1a9352c0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
| @@ -23,6 +23,8 @@ | |||
| 23 | #include <linux/tick.h> | 23 | #include <linux/tick.h> |
| 24 | #include <linux/irq.h> | 24 | #include <linux/irq.h> |
| 25 | #include <linux/smpboot.h> | 25 | #include <linux/smpboot.h> |
| 26 | #include <linux/relay.h> | ||
| 27 | #include <linux/slab.h> | ||
| 26 | 28 | ||
| 27 | #include <trace/events/power.h> | 29 | #include <trace/events/power.h> |
| 28 | #define CREATE_TRACE_POINTS | 30 | #define CREATE_TRACE_POINTS |
| @@ -37,8 +39,9 @@ | |||
| 37 | * @thread: Pointer to the hotplug thread | 39 | * @thread: Pointer to the hotplug thread |
| 38 | * @should_run: Thread should execute | 40 | * @should_run: Thread should execute |
| 39 | * @rollback: Perform a rollback | 41 | * @rollback: Perform a rollback |
| 40 | * @cb_stat: The state for a single callback (install/uninstall) | 42 | * @single: Single callback invocation |
| 41 | * @cb: Single callback function (install/uninstall) | 43 | * @bringup: Single callback bringup or teardown selector |
| 44 | * @cb_state: The state for a single callback (install/uninstall) | ||
| 42 | * @result: Result of the operation | 45 | * @result: Result of the operation |
| 43 | * @done: Signal completion to the issuer of the task | 46 | * @done: Signal completion to the issuer of the task |
| 44 | */ | 47 | */ |
| @@ -49,8 +52,10 @@ struct cpuhp_cpu_state { | |||
| 49 | struct task_struct *thread; | 52 | struct task_struct *thread; |
| 50 | bool should_run; | 53 | bool should_run; |
| 51 | bool rollback; | 54 | bool rollback; |
| 55 | bool single; | ||
| 56 | bool bringup; | ||
| 57 | struct hlist_node *node; | ||
| 52 | enum cpuhp_state cb_state; | 58 | enum cpuhp_state cb_state; |
| 53 | int (*cb)(unsigned int cpu); | ||
| 54 | int result; | 59 | int result; |
| 55 | struct completion done; | 60 | struct completion done; |
| 56 | #endif | 61 | #endif |
| @@ -68,35 +73,103 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); | |||
| 68 | * @cant_stop: Bringup/teardown can't be stopped at this step | 73 | * @cant_stop: Bringup/teardown can't be stopped at this step |
| 69 | */ | 74 | */ |
| 70 | struct cpuhp_step { | 75 | struct cpuhp_step { |
| 71 | const char *name; | 76 | const char *name; |
| 72 | int (*startup)(unsigned int cpu); | 77 | union { |
| 73 | int (*teardown)(unsigned int cpu); | 78 | int (*single)(unsigned int cpu); |
| 74 | bool skip_onerr; | 79 | int (*multi)(unsigned int cpu, |
| 75 | bool cant_stop; | 80 | struct hlist_node *node); |
| 81 | } startup; | ||
| 82 | union { | ||
| 83 | int (*single)(unsigned int cpu); | ||
| 84 | int (*multi)(unsigned int cpu, | ||
| 85 | struct hlist_node *node); | ||
| 86 | } teardown; | ||
| 87 | struct hlist_head list; | ||
| 88 | bool skip_onerr; | ||
| 89 | bool cant_stop; | ||
| 90 | bool multi_instance; | ||
| 76 | }; | 91 | }; |
| 77 | 92 | ||
| 78 | static DEFINE_MUTEX(cpuhp_state_mutex); | 93 | static DEFINE_MUTEX(cpuhp_state_mutex); |
| 79 | static struct cpuhp_step cpuhp_bp_states[]; | 94 | static struct cpuhp_step cpuhp_bp_states[]; |
| 80 | static struct cpuhp_step cpuhp_ap_states[]; | 95 | static struct cpuhp_step cpuhp_ap_states[]; |
| 81 | 96 | ||
| 97 | static bool cpuhp_is_ap_state(enum cpuhp_state state) | ||
| 98 | { | ||
| 99 | /* | ||
| 100 | * The extra check for CPUHP_TEARDOWN_CPU is only for documentation | ||
| 101 | * purposes as that state is handled explicitly in cpu_down. | ||
| 102 | */ | ||
| 103 | return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; | ||
| 104 | } | ||
| 105 | |||
| 106 | static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) | ||
| 107 | { | ||
| 108 | struct cpuhp_step *sp; | ||
| 109 | |||
| 110 | sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states; | ||
| 111 | return sp + state; | ||
| 112 | } | ||
| 113 | |||
| 82 | /** | 114 | /** |
| 83 | * cpuhp_invoke_callback _ Invoke the callbacks for a given state | 115 | * cpuhp_invoke_callback _ Invoke the callbacks for a given state |
| 84 | * @cpu: The cpu for which the callback should be invoked | 116 | * @cpu: The cpu for which the callback should be invoked |
| 85 | * @step: The step in the state machine | 117 | * @step: The step in the state machine |
| 86 | * @cb: The callback function to invoke | 118 | * @bringup: True if the bringup callback should be invoked |
| 87 | * | 119 | * |
| 88 | * Called from cpu hotplug and from the state register machinery | 120 | * Called from cpu hotplug and from the state register machinery. |
| 89 | */ | 121 | */ |
| 90 | static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state step, | 122 | static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, |
| 91 | int (*cb)(unsigned int)) | 123 | bool bringup, struct hlist_node *node) |
| 92 | { | 124 | { |
| 93 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); | 125 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); |
| 94 | int ret = 0; | 126 | struct cpuhp_step *step = cpuhp_get_step(state); |
| 95 | 127 | int (*cbm)(unsigned int cpu, struct hlist_node *node); | |
| 96 | if (cb) { | 128 | int (*cb)(unsigned int cpu); |
| 97 | trace_cpuhp_enter(cpu, st->target, step, cb); | 129 | int ret, cnt; |
| 130 | |||
| 131 | if (!step->multi_instance) { | ||
| 132 | cb = bringup ? step->startup.single : step->teardown.single; | ||
| 133 | if (!cb) | ||
| 134 | return 0; | ||
| 135 | trace_cpuhp_enter(cpu, st->target, state, cb); | ||
| 98 | ret = cb(cpu); | 136 | ret = cb(cpu); |
| 99 | trace_cpuhp_exit(cpu, st->state, step, ret); | 137 | trace_cpuhp_exit(cpu, st->state, state, ret); |
| 138 | return ret; | ||
| 139 | } | ||
| 140 | cbm = bringup ? step->startup.multi : step->teardown.multi; | ||
| 141 | if (!cbm) | ||
| 142 | return 0; | ||
| 143 | |||
| 144 | /* Single invocation for instance add/remove */ | ||
| 145 | if (node) { | ||
| 146 | trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); | ||
| 147 | ret = cbm(cpu, node); | ||
| 148 | trace_cpuhp_exit(cpu, st->state, state, ret); | ||
| 149 | return ret; | ||
| 150 | } | ||
| 151 | |||
| 152 | /* State transition. Invoke on all instances */ | ||
| 153 | cnt = 0; | ||
| 154 | hlist_for_each(node, &step->list) { | ||
| 155 | trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); | ||
| 156 | ret = cbm(cpu, node); | ||
| 157 | trace_cpuhp_exit(cpu, st->state, state, ret); | ||
| 158 | if (ret) | ||
| 159 | goto err; | ||
| 160 | cnt++; | ||
| 161 | } | ||
| 162 | return 0; | ||
| 163 | err: | ||
| 164 | /* Rollback the instances if one failed */ | ||
| 165 | cbm = !bringup ? step->startup.multi : step->teardown.multi; | ||
| 166 | if (!cbm) | ||
| 167 | return ret; | ||
| 168 | |||
| 169 | hlist_for_each(node, &step->list) { | ||
| 170 | if (!cnt--) | ||
| 171 | break; | ||
| 172 | cbm(cpu, node); | ||
| 100 | } | 173 | } |
| 101 | return ret; | 174 | return ret; |
| 102 | } | 175 | } |
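The startup/teardown unions plus the per-state hlist are what back multi-instance hotplug states: a subsystem registers the state once, then attaches one hlist_node per device instance, and cpuhp_invoke_callback() above iterates (and rolls back) those instances on every state transition. A sketch under the assumption that the cpuhp_setup_state_multi()/cpuhp_state_add_instance() helpers accompany this rework; all my_* names are invented.

#include <linux/cpuhotplug.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/printk.h>

struct my_dev {
	struct hlist_node node;		/* links the instance into the state */
	/* per-device data would live here */
};

static enum cpuhp_state my_online_state;

static int my_dev_online(unsigned int cpu, struct hlist_node *node)
{
	struct my_dev *dev = container_of(node, struct my_dev, node);

	pr_debug("instance %p: cpu %u came online\n", dev, cpu);
	return 0;
}

static int my_dev_prepare_down(unsigned int cpu, struct hlist_node *node)
{
	/* undo my_dev_online() for this cpu/instance pair */
	return 0;
}

static int __init my_driver_init(void)
{
	int ret;

	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "mydrv:online",
				      my_dev_online, my_dev_prepare_down);
	if (ret < 0)
		return ret;
	my_online_state = ret;	/* dynamic states return the allocated id */
	return 0;
}

static int my_dev_register(struct my_dev *dev)
{
	/* runs my_dev_online() on every online CPU for this instance */
	return cpuhp_state_add_instance(my_online_state, &dev->node);
}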
| @@ -155,7 +228,7 @@ static struct { | |||
| 155 | .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq), | 228 | .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq), |
| 156 | .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), | 229 | .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), |
| 157 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 230 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
| 158 | .dep_map = {.name = "cpu_hotplug.lock" }, | 231 | .dep_map = STATIC_LOCKDEP_MAP_INIT("cpu_hotplug.dep_map", &cpu_hotplug.dep_map), |
| 159 | #endif | 232 | #endif |
| 160 | }; | 233 | }; |
| 161 | 234 | ||
| @@ -260,10 +333,17 @@ void cpu_hotplug_disable(void) | |||
| 260 | } | 333 | } |
| 261 | EXPORT_SYMBOL_GPL(cpu_hotplug_disable); | 334 | EXPORT_SYMBOL_GPL(cpu_hotplug_disable); |
| 262 | 335 | ||
| 336 | static void __cpu_hotplug_enable(void) | ||
| 337 | { | ||
| 338 | if (WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n")) | ||
| 339 | return; | ||
| 340 | cpu_hotplug_disabled--; | ||
| 341 | } | ||
| 342 | |||
| 263 | void cpu_hotplug_enable(void) | 343 | void cpu_hotplug_enable(void) |
| 264 | { | 344 | { |
| 265 | cpu_maps_update_begin(); | 345 | cpu_maps_update_begin(); |
| 266 | WARN_ON(--cpu_hotplug_disabled < 0); | 346 | __cpu_hotplug_enable(); |
| 267 | cpu_maps_update_done(); | 347 | cpu_maps_update_done(); |
| 268 | } | 348 | } |
| 269 | EXPORT_SYMBOL_GPL(cpu_hotplug_enable); | 349 | EXPORT_SYMBOL_GPL(cpu_hotplug_enable); |
| @@ -330,12 +410,6 @@ static int notify_online(unsigned int cpu) | |||
| 330 | return 0; | 410 | return 0; |
| 331 | } | 411 | } |
| 332 | 412 | ||
| 333 | static int notify_starting(unsigned int cpu) | ||
| 334 | { | ||
| 335 | cpu_notify(CPU_STARTING, cpu); | ||
| 336 | return 0; | ||
| 337 | } | ||
| 338 | |||
| 339 | static int bringup_wait_for_ap(unsigned int cpu) | 413 | static int bringup_wait_for_ap(unsigned int cpu) |
| 340 | { | 414 | { |
| 341 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); | 415 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); |
| @@ -349,8 +423,16 @@ static int bringup_cpu(unsigned int cpu) | |||
| 349 | struct task_struct *idle = idle_thread_get(cpu); | 423 | struct task_struct *idle = idle_thread_get(cpu); |
| 350 | int ret; | 424 | int ret; |
| 351 | 425 | ||
| 426 | /* | ||
| 427 | * Some architectures have to walk the irq descriptors to | ||
| 428 | * setup the vector space for the cpu which comes online. | ||
| 429 | * Prevent irq alloc/free across the bringup. | ||
| 430 | */ | ||
| 431 | irq_lock_sparse(); | ||
| 432 | |||
| 352 | /* Arch-specific enabling code. */ | 433 | /* Arch-specific enabling code. */ |
| 353 | ret = __cpu_up(cpu, idle); | 434 | ret = __cpu_up(cpu, idle); |
| 435 | irq_unlock_sparse(); | ||
| 354 | if (ret) { | 436 | if (ret) { |
| 355 | cpu_notify(CPU_UP_CANCELED, cpu); | 437 | cpu_notify(CPU_UP_CANCELED, cpu); |
| 356 | return ret; | 438 | return ret; |
| @@ -363,62 +445,55 @@ static int bringup_cpu(unsigned int cpu) | |||
| 363 | /* | 445 | /* |
| 364 | * Hotplug state machine related functions | 446 | * Hotplug state machine related functions |
| 365 | */ | 447 | */ |
| 366 | static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st, | 448 | static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) |
| 367 | struct cpuhp_step *steps) | ||
| 368 | { | 449 | { |
| 369 | for (st->state++; st->state < st->target; st->state++) { | 450 | for (st->state++; st->state < st->target; st->state++) { |
| 370 | struct cpuhp_step *step = steps + st->state; | 451 | struct cpuhp_step *step = cpuhp_get_step(st->state); |
| 371 | 452 | ||
| 372 | if (!step->skip_onerr) | 453 | if (!step->skip_onerr) |
| 373 | cpuhp_invoke_callback(cpu, st->state, step->startup); | 454 | cpuhp_invoke_callback(cpu, st->state, true, NULL); |
| 374 | } | 455 | } |
| 375 | } | 456 | } |
| 376 | 457 | ||
| 377 | static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, | 458 | static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, |
| 378 | struct cpuhp_step *steps, enum cpuhp_state target) | 459 | enum cpuhp_state target) |
| 379 | { | 460 | { |
| 380 | enum cpuhp_state prev_state = st->state; | 461 | enum cpuhp_state prev_state = st->state; |
| 381 | int ret = 0; | 462 | int ret = 0; |
| 382 | 463 | ||
| 383 | for (; st->state > target; st->state--) { | 464 | for (; st->state > target; st->state--) { |
| 384 | struct cpuhp_step *step = steps + st->state; | 465 | ret = cpuhp_invoke_callback(cpu, st->state, false, NULL); |
| 385 | |||
| 386 | ret = cpuhp_invoke_callback(cpu, st->state, step->teardown); | ||
| 387 | if (ret) { | 466 | if (ret) { |
| 388 | st->target = prev_state; | 467 | st->target = prev_state; |
| 389 | undo_cpu_down(cpu, st, steps); | 468 | undo_cpu_down(cpu, st); |
| 390 | break; | 469 | break; |
| 391 | } | 470 | } |
| 392 | } | 471 | } |
| 393 | return ret; | 472 | return ret; |
| 394 | } | 473 | } |
| 395 | 474 | ||
| 396 | static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st, | 475 | static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) |
| 397 | struct cpuhp_step *steps) | ||
| 398 | { | 476 | { |
| 399 | for (st->state--; st->state > st->target; st->state--) { | 477 | for (st->state--; st->state > st->target; st->state--) { |
| 400 | struct cpuhp_step *step = steps + st->state; | 478 | struct cpuhp_step *step = cpuhp_get_step(st->state); |
| 401 | 479 | ||
| 402 | if (!step->skip_onerr) | 480 | if (!step->skip_onerr) |
| 403 | cpuhp_invoke_callback(cpu, st->state, step->teardown); | 481 | cpuhp_invoke_callback(cpu, st->state, false, NULL); |
| 404 | } | 482 | } |
| 405 | } | 483 | } |
| 406 | 484 | ||
| 407 | static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, | 485 | static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, |
| 408 | struct cpuhp_step *steps, enum cpuhp_state target) | 486 | enum cpuhp_state target) |
| 409 | { | 487 | { |
| 410 | enum cpuhp_state prev_state = st->state; | 488 | enum cpuhp_state prev_state = st->state; |
| 411 | int ret = 0; | 489 | int ret = 0; |
| 412 | 490 | ||
| 413 | while (st->state < target) { | 491 | while (st->state < target) { |
| 414 | struct cpuhp_step *step; | ||
| 415 | |||
| 416 | st->state++; | 492 | st->state++; |
| 417 | step = steps + st->state; | 493 | ret = cpuhp_invoke_callback(cpu, st->state, true, NULL); |
| 418 | ret = cpuhp_invoke_callback(cpu, st->state, step->startup); | ||
| 419 | if (ret) { | 494 | if (ret) { |
| 420 | st->target = prev_state; | 495 | st->target = prev_state; |
| 421 | undo_cpu_up(cpu, st, steps); | 496 | undo_cpu_up(cpu, st); |
| 422 | break; | 497 | break; |
| 423 | } | 498 | } |
| 424 | } | 499 | } |
| @@ -447,13 +522,13 @@ static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st) | |||
| 447 | { | 522 | { |
| 448 | enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU); | 523 | enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU); |
| 449 | 524 | ||
| 450 | return cpuhp_down_callbacks(cpu, st, cpuhp_ap_states, target); | 525 | return cpuhp_down_callbacks(cpu, st, target); |
| 451 | } | 526 | } |
| 452 | 527 | ||
| 453 | /* Execute the online startup callbacks. Used to be CPU_ONLINE */ | 528 | /* Execute the online startup callbacks. Used to be CPU_ONLINE */ |
| 454 | static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st) | 529 | static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st) |
| 455 | { | 530 | { |
| 456 | return cpuhp_up_callbacks(cpu, st, cpuhp_ap_states, st->target); | 531 | return cpuhp_up_callbacks(cpu, st, st->target); |
| 457 | } | 532 | } |
| 458 | 533 | ||
| 459 | /* | 534 | /* |
| @@ -476,18 +551,20 @@ static void cpuhp_thread_fun(unsigned int cpu) | |||
| 476 | st->should_run = false; | 551 | st->should_run = false; |
| 477 | 552 | ||
| 478 | /* Single callback invocation for [un]install ? */ | 553 | /* Single callback invocation for [un]install ? */ |
| 479 | if (st->cb) { | 554 | if (st->single) { |
| 480 | if (st->cb_state < CPUHP_AP_ONLINE) { | 555 | if (st->cb_state < CPUHP_AP_ONLINE) { |
| 481 | local_irq_disable(); | 556 | local_irq_disable(); |
| 482 | ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); | 557 | ret = cpuhp_invoke_callback(cpu, st->cb_state, |
| 558 | st->bringup, st->node); | ||
| 483 | local_irq_enable(); | 559 | local_irq_enable(); |
| 484 | } else { | 560 | } else { |
| 485 | ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); | 561 | ret = cpuhp_invoke_callback(cpu, st->cb_state, |
| 562 | st->bringup, st->node); | ||
| 486 | } | 563 | } |
| 487 | } else if (st->rollback) { | 564 | } else if (st->rollback) { |
| 488 | BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); | 565 | BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); |
| 489 | 566 | ||
| 490 | undo_cpu_down(cpu, st, cpuhp_ap_states); | 567 | undo_cpu_down(cpu, st); |
| 491 | /* | 568 | /* |
| 492 | * This is a momentary workaround to keep the notifier users | 569 | * This is a momentary workaround to keep the notifier users |
| 493 | * happy. Will go away once we got rid of the notifiers. | 570 | * happy. Will go away once we got rid of the notifiers. |
| @@ -509,8 +586,9 @@ static void cpuhp_thread_fun(unsigned int cpu) | |||
| 509 | } | 586 | } |
| 510 | 587 | ||
| 511 | /* Invoke a single callback on a remote cpu */ | 588 | /* Invoke a single callback on a remote cpu */ |
| 512 | static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, | 589 | static int |
| 513 | int (*cb)(unsigned int)) | 590 | cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, |
| 591 | struct hlist_node *node) | ||
| 514 | { | 592 | { |
| 515 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); | 593 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); |
| 516 | 594 | ||
| @@ -522,10 +600,13 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, | |||
| 522 | * we invoke the thread function directly. | 600 | * we invoke the thread function directly. |
| 523 | */ | 601 | */ |
| 524 | if (!st->thread) | 602 | if (!st->thread) |
| 525 | return cpuhp_invoke_callback(cpu, state, cb); | 603 | return cpuhp_invoke_callback(cpu, state, bringup, node); |
| 526 | 604 | ||
| 527 | st->cb_state = state; | 605 | st->cb_state = state; |
| 528 | st->cb = cb; | 606 | st->single = true; |
| 607 | st->bringup = bringup; | ||
| 608 | st->node = node; | ||
| 609 | |||
| 529 | /* | 610 | /* |
| 530 | * Make sure the above stores are visible before should_run becomes | 611 | * Make sure the above stores are visible before should_run becomes |
| 531 | * true. Paired with the mb() above in cpuhp_thread_fun() | 612 | * true. Paired with the mb() above in cpuhp_thread_fun() |
| @@ -541,7 +622,7 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, | |||
| 541 | static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st) | 622 | static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st) |
| 542 | { | 623 | { |
| 543 | st->result = 0; | 624 | st->result = 0; |
| 544 | st->cb = NULL; | 625 | st->single = false; |
| 545 | /* | 626 | /* |
| 546 | * Make sure the above stores are visible before should_run becomes | 627 | * Make sure the above stores are visible before should_run becomes |
| 547 | * true. Paired with the mb() above in cpuhp_thread_fun() | 628 | * true. Paired with the mb() above in cpuhp_thread_fun() |
| @@ -674,12 +755,6 @@ static int notify_down_prepare(unsigned int cpu) | |||
| 674 | return err; | 755 | return err; |
| 675 | } | 756 | } |
| 676 | 757 | ||
| 677 | static int notify_dying(unsigned int cpu) | ||
| 678 | { | ||
| 679 | cpu_notify(CPU_DYING, cpu); | ||
| 680 | return 0; | ||
| 681 | } | ||
| 682 | |||
| 683 | /* Take this CPU down. */ | 758 | /* Take this CPU down. */ |
| 684 | static int take_cpu_down(void *_param) | 759 | static int take_cpu_down(void *_param) |
| 685 | { | 760 | { |
| @@ -692,12 +767,16 @@ static int take_cpu_down(void *_param) | |||
| 692 | if (err < 0) | 767 | if (err < 0) |
| 693 | return err; | 768 | return err; |
| 694 | 769 | ||
| 770 | /* | ||
| 771 | * We get here while we are in CPUHP_TEARDOWN_CPU state and we must not | ||
| 772 | * do this step again. | ||
| 773 | */ | ||
| 774 | WARN_ON(st->state != CPUHP_TEARDOWN_CPU); | ||
| 775 | st->state--; | ||
| 695 | /* Invoke the former CPU_DYING callbacks */ | 776 | /* Invoke the former CPU_DYING callbacks */ |
| 696 | for (; st->state > target; st->state--) { | 777 | for (; st->state > target; st->state--) |
| 697 | struct cpuhp_step *step = cpuhp_ap_states + st->state; | 778 | cpuhp_invoke_callback(cpu, st->state, false, NULL); |
| 698 | 779 | ||
| 699 | cpuhp_invoke_callback(cpu, st->state, step->teardown); | ||
| 700 | } | ||
| 701 | /* Give up timekeeping duties */ | 780 | /* Give up timekeeping duties */ |
| 702 | tick_handover_do_timer(); | 781 | tick_handover_do_timer(); |
| 703 | /* Park the stopper thread */ | 782 | /* Park the stopper thread */ |
| @@ -734,7 +813,7 @@ static int takedown_cpu(unsigned int cpu) | |||
| 734 | BUG_ON(cpu_online(cpu)); | 813 | BUG_ON(cpu_online(cpu)); |
| 735 | 814 | ||
| 736 | /* | 815 | /* |
| 737 | * The migration_call() CPU_DYING callback will have removed all | 816 | * The CPUHP_AP_SCHED_MIGRATE_DYING callback will have removed all |
| 738 | * runnable tasks from the cpu, there's only the idle task left now | 817 | * runnable tasks from the cpu, there's only the idle task left now |
| 739 | * that the migration thread is done doing the stop_machine thing. | 818 | * that the migration thread is done doing the stop_machine thing. |
| 740 | * | 819 | * |
| @@ -787,7 +866,6 @@ void cpuhp_report_idle_dead(void) | |||
| 787 | #define notify_down_prepare NULL | 866 | #define notify_down_prepare NULL |
| 788 | #define takedown_cpu NULL | 867 | #define takedown_cpu NULL |
| 789 | #define notify_dead NULL | 868 | #define notify_dead NULL |
| 790 | #define notify_dying NULL | ||
| 791 | #endif | 869 | #endif |
| 792 | 870 | ||
| 793 | #ifdef CONFIG_HOTPLUG_CPU | 871 | #ifdef CONFIG_HOTPLUG_CPU |
| @@ -836,7 +914,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, | |||
| 836 | * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need | 914 | * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need |
| 837 | * to do the further cleanups. | 915 | * to do the further cleanups. |
| 838 | */ | 916 | */ |
| 839 | ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target); | 917 | ret = cpuhp_down_callbacks(cpu, st, target); |
| 840 | if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) { | 918 | if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) { |
| 841 | st->target = prev_state; | 919 | st->target = prev_state; |
| 842 | st->rollback = true; | 920 | st->rollback = true; |
| @@ -877,10 +955,9 @@ EXPORT_SYMBOL(cpu_down); | |||
| 877 | #endif /*CONFIG_HOTPLUG_CPU*/ | 955 | #endif /*CONFIG_HOTPLUG_CPU*/ |
| 878 | 956 | ||
| 879 | /** | 957 | /** |
| 880 | * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers | 958 | * notify_cpu_starting(cpu) - Invoke the callbacks on the starting CPU |
| 881 | * @cpu: cpu that just started | 959 | * @cpu: cpu that just started |
| 882 | * | 960 | * |
| 883 | * This function calls the cpu_chain notifiers with CPU_STARTING. | ||
| 884 | * It must be called by the arch code on the new cpu, before the new cpu | 961 | * It must be called by the arch code on the new cpu, before the new cpu |
| 885 | * enables interrupts and before the "boot" cpu returns from __cpu_up(). | 962 | * enables interrupts and before the "boot" cpu returns from __cpu_up(). |
| 886 | */ | 963 | */ |
| @@ -889,12 +966,10 @@ void notify_cpu_starting(unsigned int cpu) | |||
| 889 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); | 966 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); |
| 890 | enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); | 967 | enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); |
| 891 | 968 | ||
| 969 | rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ | ||
| 892 | while (st->state < target) { | 970 | while (st->state < target) { |
| 893 | struct cpuhp_step *step; | ||
| 894 | |||
| 895 | st->state++; | 971 | st->state++; |
| 896 | step = cpuhp_ap_states + st->state; | 972 | cpuhp_invoke_callback(cpu, st->state, true, NULL); |
| 897 | cpuhp_invoke_callback(cpu, st->state, step->startup); | ||
| 898 | } | 973 | } |
| 899 | } | 974 | } |
| 900 | 975 | ||
| @@ -979,7 +1054,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) | |||
| 979 | * responsible for bringing it up to the target state. | 1054 | * responsible for bringing it up to the target state. |
| 980 | */ | 1055 | */ |
| 981 | target = min((int)target, CPUHP_BRINGUP_CPU); | 1056 | target = min((int)target, CPUHP_BRINGUP_CPU); |
| 982 | ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target); | 1057 | ret = cpuhp_up_callbacks(cpu, st, target); |
| 983 | out: | 1058 | out: |
| 984 | cpu_hotplug_done(); | 1059 | cpu_hotplug_done(); |
| 985 | return ret; | 1060 | return ret; |
| @@ -1024,12 +1099,13 @@ EXPORT_SYMBOL_GPL(cpu_up); | |||
| 1024 | #ifdef CONFIG_PM_SLEEP_SMP | 1099 | #ifdef CONFIG_PM_SLEEP_SMP |
| 1025 | static cpumask_var_t frozen_cpus; | 1100 | static cpumask_var_t frozen_cpus; |
| 1026 | 1101 | ||
| 1027 | int disable_nonboot_cpus(void) | 1102 | int freeze_secondary_cpus(int primary) |
| 1028 | { | 1103 | { |
| 1029 | int cpu, first_cpu, error = 0; | 1104 | int cpu, error = 0; |
| 1030 | 1105 | ||
| 1031 | cpu_maps_update_begin(); | 1106 | cpu_maps_update_begin(); |
| 1032 | first_cpu = cpumask_first(cpu_online_mask); | 1107 | if (!cpu_online(primary)) |
| 1108 | primary = cpumask_first(cpu_online_mask); | ||
| 1033 | /* | 1109 | /* |
| 1034 | * We take down all of the non-boot CPUs in one shot to avoid races | 1110 | * We take down all of the non-boot CPUs in one shot to avoid races |
| 1035 | * with the userspace trying to use the CPU hotplug at the same time | 1111 | * with the userspace trying to use the CPU hotplug at the same time |
| @@ -1038,7 +1114,7 @@ int disable_nonboot_cpus(void) | |||
| 1038 | 1114 | ||
| 1039 | pr_info("Disabling non-boot CPUs ...\n"); | 1115 | pr_info("Disabling non-boot CPUs ...\n"); |
| 1040 | for_each_online_cpu(cpu) { | 1116 | for_each_online_cpu(cpu) { |
| 1041 | if (cpu == first_cpu) | 1117 | if (cpu == primary) |
| 1042 | continue; | 1118 | continue; |
| 1043 | trace_suspend_resume(TPS("CPU_OFF"), cpu, true); | 1119 | trace_suspend_resume(TPS("CPU_OFF"), cpu, true); |
| 1044 | error = _cpu_down(cpu, 1, CPUHP_OFFLINE); | 1120 | error = _cpu_down(cpu, 1, CPUHP_OFFLINE); |
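
The rename lets the suspend path nominate which CPU survives; if the requested primary is already offline it falls back to the first online CPU, which matches the old behaviour. Presumably the old entry point is kept as a thin wrapper along these lines (a sketch, not the actual header change):

/* Hedged sketch: disable_nonboot_cpus() keeps CPU 0, i.e. the old
 * "first online CPU stays up" behaviour. */
static inline int disable_nonboot_cpus(void)
{
        return freeze_secondary_cpus(0);
}
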
| @@ -1081,7 +1157,7 @@ void enable_nonboot_cpus(void) | |||
| 1081 | 1157 | ||
| 1082 | /* Allow everyone to use the CPU hotplug again */ | 1158 | /* Allow everyone to use the CPU hotplug again */ |
| 1083 | cpu_maps_update_begin(); | 1159 | cpu_maps_update_begin(); |
| 1084 | WARN_ON(--cpu_hotplug_disabled < 0); | 1160 | __cpu_hotplug_enable(); |
| 1085 | if (cpumask_empty(frozen_cpus)) | 1161 | if (cpumask_empty(frozen_cpus)) |
| 1086 | goto out; | 1162 | goto out; |
| 1087 | 1163 | ||
| @@ -1170,40 +1246,50 @@ core_initcall(cpu_hotplug_pm_sync_init); | |||
| 1170 | static struct cpuhp_step cpuhp_bp_states[] = { | 1246 | static struct cpuhp_step cpuhp_bp_states[] = { |
| 1171 | [CPUHP_OFFLINE] = { | 1247 | [CPUHP_OFFLINE] = { |
| 1172 | .name = "offline", | 1248 | .name = "offline", |
| 1173 | .startup = NULL, | 1249 | .startup.single = NULL, |
| 1174 | .teardown = NULL, | 1250 | .teardown.single = NULL, |
| 1175 | }, | 1251 | }, |
| 1176 | #ifdef CONFIG_SMP | 1252 | #ifdef CONFIG_SMP |
| 1177 | [CPUHP_CREATE_THREADS]= { | 1253 | [CPUHP_CREATE_THREADS]= { |
| 1178 | .name = "threads:create", | 1254 | .name = "threads:prepare", |
| 1179 | .startup = smpboot_create_threads, | 1255 | .startup.single = smpboot_create_threads, |
| 1180 | .teardown = NULL, | 1256 | .teardown.single = NULL, |
| 1181 | .cant_stop = true, | 1257 | .cant_stop = true, |
| 1182 | }, | 1258 | }, |
| 1183 | [CPUHP_PERF_PREPARE] = { | 1259 | [CPUHP_PERF_PREPARE] = { |
| 1184 | .name = "perf prepare", | 1260 | .name = "perf:prepare", |
| 1185 | .startup = perf_event_init_cpu, | 1261 | .startup.single = perf_event_init_cpu, |
| 1186 | .teardown = perf_event_exit_cpu, | 1262 | .teardown.single = perf_event_exit_cpu, |
| 1187 | }, | 1263 | }, |
| 1188 | [CPUHP_WORKQUEUE_PREP] = { | 1264 | [CPUHP_WORKQUEUE_PREP] = { |
| 1189 | .name = "workqueue prepare", | 1265 | .name = "workqueue:prepare", |
| 1190 | .startup = workqueue_prepare_cpu, | 1266 | .startup.single = workqueue_prepare_cpu, |
| 1191 | .teardown = NULL, | 1267 | .teardown.single = NULL, |
| 1192 | }, | 1268 | }, |
| 1193 | [CPUHP_HRTIMERS_PREPARE] = { | 1269 | [CPUHP_HRTIMERS_PREPARE] = { |
| 1194 | .name = "hrtimers prepare", | 1270 | .name = "hrtimers:prepare", |
| 1195 | .startup = hrtimers_prepare_cpu, | 1271 | .startup.single = hrtimers_prepare_cpu, |
| 1196 | .teardown = hrtimers_dead_cpu, | 1272 | .teardown.single = hrtimers_dead_cpu, |
| 1197 | }, | 1273 | }, |
| 1198 | [CPUHP_SMPCFD_PREPARE] = { | 1274 | [CPUHP_SMPCFD_PREPARE] = { |
| 1199 | .name = "SMPCFD prepare", | 1275 | .name = "smpcfd:prepare", |
| 1200 | .startup = smpcfd_prepare_cpu, | 1276 | .startup.single = smpcfd_prepare_cpu, |
| 1201 | .teardown = smpcfd_dead_cpu, | 1277 | .teardown.single = smpcfd_dead_cpu, |
| 1278 | }, | ||
| 1279 | [CPUHP_RELAY_PREPARE] = { | ||
| 1280 | .name = "relay:prepare", | ||
| 1281 | .startup.single = relay_prepare_cpu, | ||
| 1282 | .teardown.single = NULL, | ||
| 1283 | }, | ||
| 1284 | [CPUHP_SLAB_PREPARE] = { | ||
| 1285 | .name = "slab:prepare", | ||
| 1286 | .startup.single = slab_prepare_cpu, | ||
| 1287 | .teardown.single = slab_dead_cpu, | ||
| 1202 | }, | 1288 | }, |
| 1203 | [CPUHP_RCUTREE_PREP] = { | 1289 | [CPUHP_RCUTREE_PREP] = { |
| 1204 | .name = "RCU-tree prepare", | 1290 | .name = "RCU/tree:prepare", |
| 1205 | .startup = rcutree_prepare_cpu, | 1291 | .startup.single = rcutree_prepare_cpu, |
| 1206 | .teardown = rcutree_dead_cpu, | 1292 | .teardown.single = rcutree_dead_cpu, |
| 1207 | }, | 1293 | }, |
| 1208 | /* | 1294 | /* |
| 1209 | * Preparatory and dead notifiers. Will be replaced once the notifiers | 1295 | * Preparatory and dead notifiers. Will be replaced once the notifiers |
| @@ -1211,8 +1297,8 @@ static struct cpuhp_step cpuhp_bp_states[] = { | |||
| 1211 | */ | 1297 | */ |
| 1212 | [CPUHP_NOTIFY_PREPARE] = { | 1298 | [CPUHP_NOTIFY_PREPARE] = { |
| 1213 | .name = "notify:prepare", | 1299 | .name = "notify:prepare", |
| 1214 | .startup = notify_prepare, | 1300 | .startup.single = notify_prepare, |
| 1215 | .teardown = notify_dead, | 1301 | .teardown.single = notify_dead, |
| 1216 | .skip_onerr = true, | 1302 | .skip_onerr = true, |
| 1217 | .cant_stop = true, | 1303 | .cant_stop = true, |
| 1218 | }, | 1304 | }, |
| @@ -1222,20 +1308,21 @@ static struct cpuhp_step cpuhp_bp_states[] = { | |||
| 1222 | * otherwise a RCU stall occurs. | 1308 | * otherwise a RCU stall occurs. |
| 1223 | */ | 1309 | */ |
| 1224 | [CPUHP_TIMERS_DEAD] = { | 1310 | [CPUHP_TIMERS_DEAD] = { |
| 1225 | .name = "timers dead", | 1311 | .name = "timers:dead", |
| 1226 | .startup = NULL, | 1312 | .startup.single = NULL, |
| 1227 | .teardown = timers_dead_cpu, | 1313 | .teardown.single = timers_dead_cpu, |
| 1228 | }, | 1314 | }, |
| 1229 | /* Kicks the plugged cpu into life */ | 1315 | /* Kicks the plugged cpu into life */ |
| 1230 | [CPUHP_BRINGUP_CPU] = { | 1316 | [CPUHP_BRINGUP_CPU] = { |
| 1231 | .name = "cpu:bringup", | 1317 | .name = "cpu:bringup", |
| 1232 | .startup = bringup_cpu, | 1318 | .startup.single = bringup_cpu, |
| 1233 | .teardown = NULL, | 1319 | .teardown.single = NULL, |
| 1234 | .cant_stop = true, | 1320 | .cant_stop = true, |
| 1235 | }, | 1321 | }, |
| 1236 | [CPUHP_AP_SMPCFD_DYING] = { | 1322 | [CPUHP_AP_SMPCFD_DYING] = { |
| 1237 | .startup = NULL, | 1323 | .name = "smpcfd:dying", |
| 1238 | .teardown = smpcfd_dying_cpu, | 1324 | .startup.single = NULL, |
| 1325 | .teardown.single = smpcfd_dying_cpu, | ||
| 1239 | }, | 1326 | }, |
| 1240 | /* | 1327 | /* |
| 1241 | * Handled on control processor until the plugged processor manages | 1328 |
| @@ -1243,8 +1330,8 @@ static struct cpuhp_step cpuhp_bp_states[] = { | |||
| 1243 | */ | 1330 | */ |
| 1244 | [CPUHP_TEARDOWN_CPU] = { | 1331 | [CPUHP_TEARDOWN_CPU] = { |
| 1245 | .name = "cpu:teardown", | 1332 | .name = "cpu:teardown", |
| 1246 | .startup = NULL, | 1333 | .startup.single = NULL, |
| 1247 | .teardown = takedown_cpu, | 1334 | .teardown.single = takedown_cpu, |
| 1248 | .cant_stop = true, | 1335 | .cant_stop = true, |
| 1249 | }, | 1336 | }, |
| 1250 | #else | 1337 | #else |
| @@ -1270,24 +1357,13 @@ static struct cpuhp_step cpuhp_ap_states[] = { | |||
| 1270 | /* First state is scheduler control. Interrupts are disabled */ | 1357 | /* First state is scheduler control. Interrupts are disabled */ |
| 1271 | [CPUHP_AP_SCHED_STARTING] = { | 1358 | [CPUHP_AP_SCHED_STARTING] = { |
| 1272 | .name = "sched:starting", | 1359 | .name = "sched:starting", |
| 1273 | .startup = sched_cpu_starting, | 1360 | .startup.single = sched_cpu_starting, |
| 1274 | .teardown = sched_cpu_dying, | 1361 | .teardown.single = sched_cpu_dying, |
| 1275 | }, | 1362 | }, |
| 1276 | [CPUHP_AP_RCUTREE_DYING] = { | 1363 | [CPUHP_AP_RCUTREE_DYING] = { |
| 1277 | .startup = NULL, | 1364 | .name = "RCU/tree:dying", |
| 1278 | .teardown = rcutree_dying_cpu, | 1365 | .startup.single = NULL, |
| 1279 | }, | 1366 | .teardown.single = rcutree_dying_cpu, |
| 1280 | /* | ||
| 1281 | * Low level startup/teardown notifiers. Run with interrupts | ||
| 1282 | * disabled. Will be removed once the notifiers are converted to | ||
| 1283 | * states. | ||
| 1284 | */ | ||
| 1285 | [CPUHP_AP_NOTIFY_STARTING] = { | ||
| 1286 | .name = "notify:starting", | ||
| 1287 | .startup = notify_starting, | ||
| 1288 | .teardown = notify_dying, | ||
| 1289 | .skip_onerr = true, | ||
| 1290 | .cant_stop = true, | ||
| 1291 | }, | 1367 | }, |
| 1292 | /* Entry state on starting. Interrupts enabled from here on. Transient | 1368 | /* Entry state on starting. Interrupts enabled from here on. Transient |
| 1293 | * state for synchronization */ | 1369 | * state for synchronization */ |
| @@ -1296,24 +1372,24 @@ static struct cpuhp_step cpuhp_ap_states[] = { | |||
| 1296 | }, | 1372 | }, |
| 1297 | /* Handle smpboot threads park/unpark */ | 1373 | /* Handle smpboot threads park/unpark */ |
| 1298 | [CPUHP_AP_SMPBOOT_THREADS] = { | 1374 | [CPUHP_AP_SMPBOOT_THREADS] = { |
| 1299 | .name = "smpboot:threads", | 1375 | .name = "smpboot/threads:online", |
| 1300 | .startup = smpboot_unpark_threads, | 1376 | .startup.single = smpboot_unpark_threads, |
| 1301 | .teardown = NULL, | 1377 | .teardown.single = NULL, |
| 1302 | }, | 1378 | }, |
| 1303 | [CPUHP_AP_PERF_ONLINE] = { | 1379 | [CPUHP_AP_PERF_ONLINE] = { |
| 1304 | .name = "perf online", | 1380 | .name = "perf:online", |
| 1305 | .startup = perf_event_init_cpu, | 1381 | .startup.single = perf_event_init_cpu, |
| 1306 | .teardown = perf_event_exit_cpu, | 1382 | .teardown.single = perf_event_exit_cpu, |
| 1307 | }, | 1383 | }, |
| 1308 | [CPUHP_AP_WORKQUEUE_ONLINE] = { | 1384 | [CPUHP_AP_WORKQUEUE_ONLINE] = { |
| 1309 | .name = "workqueue online", | 1385 | .name = "workqueue:online", |
| 1310 | .startup = workqueue_online_cpu, | 1386 | .startup.single = workqueue_online_cpu, |
| 1311 | .teardown = workqueue_offline_cpu, | 1387 | .teardown.single = workqueue_offline_cpu, |
| 1312 | }, | 1388 | }, |
| 1313 | [CPUHP_AP_RCUTREE_ONLINE] = { | 1389 | [CPUHP_AP_RCUTREE_ONLINE] = { |
| 1314 | .name = "RCU-tree online", | 1390 | .name = "RCU/tree:online", |
| 1315 | .startup = rcutree_online_cpu, | 1391 | .startup.single = rcutree_online_cpu, |
| 1316 | .teardown = rcutree_offline_cpu, | 1392 | .teardown.single = rcutree_offline_cpu, |
| 1317 | }, | 1393 | }, |
| 1318 | 1394 | ||
| 1319 | /* | 1395 | /* |
| @@ -1322,8 +1398,8 @@ static struct cpuhp_step cpuhp_ap_states[] = { | |||
| 1322 | */ | 1398 | */ |
| 1323 | [CPUHP_AP_NOTIFY_ONLINE] = { | 1399 | [CPUHP_AP_NOTIFY_ONLINE] = { |
| 1324 | .name = "notify:online", | 1400 | .name = "notify:online", |
| 1325 | .startup = notify_online, | 1401 | .startup.single = notify_online, |
| 1326 | .teardown = notify_down_prepare, | 1402 | .teardown.single = notify_down_prepare, |
| 1327 | .skip_onerr = true, | 1403 | .skip_onerr = true, |
| 1328 | }, | 1404 | }, |
| 1329 | #endif | 1405 | #endif |
| @@ -1335,16 +1411,16 @@ static struct cpuhp_step cpuhp_ap_states[] = { | |||
| 1335 | /* Last state is scheduler control setting the cpu active */ | 1411 | /* Last state is scheduler control setting the cpu active */ |
| 1336 | [CPUHP_AP_ACTIVE] = { | 1412 | [CPUHP_AP_ACTIVE] = { |
| 1337 | .name = "sched:active", | 1413 | .name = "sched:active", |
| 1338 | .startup = sched_cpu_activate, | 1414 | .startup.single = sched_cpu_activate, |
| 1339 | .teardown = sched_cpu_deactivate, | 1415 | .teardown.single = sched_cpu_deactivate, |
| 1340 | }, | 1416 | }, |
| 1341 | #endif | 1417 | #endif |
| 1342 | 1418 | ||
| 1343 | /* CPU is fully up and running. */ | 1419 | /* CPU is fully up and running. */ |
| 1344 | [CPUHP_ONLINE] = { | 1420 | [CPUHP_ONLINE] = { |
| 1345 | .name = "online", | 1421 | .name = "online", |
| 1346 | .startup = NULL, | 1422 | .startup.single = NULL, |
| 1347 | .teardown = NULL, | 1423 | .teardown.single = NULL, |
| 1348 | }, | 1424 | }, |
| 1349 | }; | 1425 | }; |
| 1350 | 1426 | ||
| @@ -1356,54 +1432,42 @@ static int cpuhp_cb_check(enum cpuhp_state state) | |||
| 1356 | return 0; | 1432 | return 0; |
| 1357 | } | 1433 | } |
| 1358 | 1434 | ||
| 1359 | static bool cpuhp_is_ap_state(enum cpuhp_state state) | ||
| 1360 | { | ||
| 1361 | /* | ||
| 1362 | * The extra check for CPUHP_TEARDOWN_CPU is only for documentation | ||
| 1363 | * purposes as that state is handled explicitely in cpu_down. | ||
| 1364 | */ | ||
| 1365 | return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; | ||
| 1366 | } | ||
| 1367 | |||
| 1368 | static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) | ||
| 1369 | { | ||
| 1370 | struct cpuhp_step *sp; | ||
| 1371 | |||
| 1372 | sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states; | ||
| 1373 | return sp + state; | ||
| 1374 | } | ||
| 1375 | |||
| 1376 | static void cpuhp_store_callbacks(enum cpuhp_state state, | 1435 | static void cpuhp_store_callbacks(enum cpuhp_state state, |
| 1377 | const char *name, | 1436 | const char *name, |
| 1378 | int (*startup)(unsigned int cpu), | 1437 | int (*startup)(unsigned int cpu), |
| 1379 | int (*teardown)(unsigned int cpu)) | 1438 | int (*teardown)(unsigned int cpu), |
| 1439 | bool multi_instance) | ||
| 1380 | { | 1440 | { |
| 1381 | /* (Un)Install the callbacks for further cpu hotplug operations */ | 1441 | /* (Un)Install the callbacks for further cpu hotplug operations */ |
| 1382 | struct cpuhp_step *sp; | 1442 | struct cpuhp_step *sp; |
| 1383 | 1443 | ||
| 1384 | mutex_lock(&cpuhp_state_mutex); | 1444 | mutex_lock(&cpuhp_state_mutex); |
| 1385 | sp = cpuhp_get_step(state); | 1445 | sp = cpuhp_get_step(state); |
| 1386 | sp->startup = startup; | 1446 | sp->startup.single = startup; |
| 1387 | sp->teardown = teardown; | 1447 | sp->teardown.single = teardown; |
| 1388 | sp->name = name; | 1448 | sp->name = name; |
| 1449 | sp->multi_instance = multi_instance; | ||
| 1450 | INIT_HLIST_HEAD(&sp->list); | ||
| 1389 | mutex_unlock(&cpuhp_state_mutex); | 1451 | mutex_unlock(&cpuhp_state_mutex); |
| 1390 | } | 1452 | } |
| 1391 | 1453 | ||
| 1392 | static void *cpuhp_get_teardown_cb(enum cpuhp_state state) | 1454 | static void *cpuhp_get_teardown_cb(enum cpuhp_state state) |
| 1393 | { | 1455 | { |
| 1394 | return cpuhp_get_step(state)->teardown; | 1456 | return cpuhp_get_step(state)->teardown.single; |
| 1395 | } | 1457 | } |
| 1396 | 1458 | ||
| 1397 | /* | 1459 | /* |
| 1398 | * Call the startup/teardown function for a step either on the AP or | 1460 | * Call the startup/teardown function for a step either on the AP or |
| 1399 | * on the current CPU. | 1461 | * on the current CPU. |
| 1400 | */ | 1462 | */ |
| 1401 | static int cpuhp_issue_call(int cpu, enum cpuhp_state state, | 1463 | static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup, |
| 1402 | int (*cb)(unsigned int), bool bringup) | 1464 | struct hlist_node *node) |
| 1403 | { | 1465 | { |
| 1466 | struct cpuhp_step *sp = cpuhp_get_step(state); | ||
| 1404 | int ret; | 1467 | int ret; |
| 1405 | 1468 | ||
| 1406 | if (!cb) | 1469 | if ((bringup && !sp->startup.single) || |
| 1470 | (!bringup && !sp->teardown.single)) | ||
| 1407 | return 0; | 1471 | return 0; |
| 1408 | /* | 1472 | /* |
| 1409 | * The non AP bound callbacks can fail on bringup. On teardown | 1473 | * The non AP bound callbacks can fail on bringup. On teardown |
| @@ -1411,11 +1475,11 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, | |||
| 1411 | */ | 1475 | */ |
| 1412 | #ifdef CONFIG_SMP | 1476 | #ifdef CONFIG_SMP |
| 1413 | if (cpuhp_is_ap_state(state)) | 1477 | if (cpuhp_is_ap_state(state)) |
| 1414 | ret = cpuhp_invoke_ap_callback(cpu, state, cb); | 1478 | ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node); |
| 1415 | else | 1479 | else |
| 1416 | ret = cpuhp_invoke_callback(cpu, state, cb); | 1480 | ret = cpuhp_invoke_callback(cpu, state, bringup, node); |
| 1417 | #else | 1481 | #else |
| 1418 | ret = cpuhp_invoke_callback(cpu, state, cb); | 1482 | ret = cpuhp_invoke_callback(cpu, state, bringup, node); |
| 1419 | #endif | 1483 | #endif |
| 1420 | BUG_ON(ret && !bringup); | 1484 | BUG_ON(ret && !bringup); |
| 1421 | return ret; | 1485 | return ret; |
| @@ -1427,13 +1491,10 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, | |||
| 1427 | * Note: The teardown callbacks for rollback are not allowed to fail! | 1491 | * Note: The teardown callbacks for rollback are not allowed to fail! |
| 1428 | */ | 1492 | */ |
| 1429 | static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, | 1493 | static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, |
| 1430 | int (*teardown)(unsigned int cpu)) | 1494 | struct hlist_node *node) |
| 1431 | { | 1495 | { |
| 1432 | int cpu; | 1496 | int cpu; |
| 1433 | 1497 | ||
| 1434 | if (!teardown) | ||
| 1435 | return; | ||
| 1436 | |||
| 1437 | /* Roll back the already executed steps on the other cpus */ | 1498 | /* Roll back the already executed steps on the other cpus */ |
| 1438 | for_each_present_cpu(cpu) { | 1499 | for_each_present_cpu(cpu) { |
| 1439 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); | 1500 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); |
| @@ -1444,7 +1505,7 @@ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, | |||
| 1444 | 1505 | ||
| 1445 | /* Did we invoke the startup call on that cpu ? */ | 1506 | /* Did we invoke the startup call on that cpu ? */ |
| 1446 | if (cpustate >= state) | 1507 | if (cpustate >= state) |
| 1447 | cpuhp_issue_call(cpu, state, teardown, false); | 1508 | cpuhp_issue_call(cpu, state, false, node); |
| 1448 | } | 1509 | } |
| 1449 | } | 1510 | } |
| 1450 | 1511 | ||
| @@ -1471,6 +1532,52 @@ static int cpuhp_reserve_state(enum cpuhp_state state) | |||
| 1471 | return -ENOSPC; | 1532 | return -ENOSPC; |
| 1472 | } | 1533 | } |
| 1473 | 1534 | ||
| 1535 | int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, | ||
| 1536 | bool invoke) | ||
| 1537 | { | ||
| 1538 | struct cpuhp_step *sp; | ||
| 1539 | int cpu; | ||
| 1540 | int ret; | ||
| 1541 | |||
| 1542 | sp = cpuhp_get_step(state); | ||
| 1543 | if (sp->multi_instance == false) | ||
| 1544 | return -EINVAL; | ||
| 1545 | |||
| 1546 | get_online_cpus(); | ||
| 1547 | |||
| 1548 | if (!invoke || !sp->startup.multi) | ||
| 1549 | goto add_node; | ||
| 1550 | |||
| 1551 | /* | ||
| 1552 | * Try to call the startup callback for each present cpu | ||
| 1553 | * depending on the hotplug state of the cpu. | ||
| 1554 | */ | ||
| 1555 | for_each_present_cpu(cpu) { | ||
| 1556 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); | ||
| 1557 | int cpustate = st->state; | ||
| 1558 | |||
| 1559 | if (cpustate < state) | ||
| 1560 | continue; | ||
| 1561 | |||
| 1562 | ret = cpuhp_issue_call(cpu, state, true, node); | ||
| 1563 | if (ret) { | ||
| 1564 | if (sp->teardown.multi) | ||
| 1565 | cpuhp_rollback_install(cpu, state, node); | ||
| 1566 | goto err; | ||
| 1567 | } | ||
| 1568 | } | ||
| 1569 | add_node: | ||
| 1570 | ret = 0; | ||
| 1571 | mutex_lock(&cpuhp_state_mutex); | ||
| 1572 | hlist_add_head(node, &sp->list); | ||
| 1573 | mutex_unlock(&cpuhp_state_mutex); | ||
| 1574 | |||
| 1575 | err: | ||
| 1576 | put_online_cpus(); | ||
| 1577 | return ret; | ||
| 1578 | } | ||
| 1579 | EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); | ||
| 1580 | |||
| 1474 | /** | 1581 | /** |
| 1475 | * __cpuhp_setup_state - Set up the callbacks for a hotplug machine state | 1582 |
| 1476 | * @state: The state to setup | 1583 | * @state: The state to setup |
| @@ -1484,7 +1591,8 @@ static int cpuhp_reserve_state(enum cpuhp_state state) | |||
| 1484 | int __cpuhp_setup_state(enum cpuhp_state state, | 1591 | int __cpuhp_setup_state(enum cpuhp_state state, |
| 1485 | const char *name, bool invoke, | 1592 | const char *name, bool invoke, |
| 1486 | int (*startup)(unsigned int cpu), | 1593 | int (*startup)(unsigned int cpu), |
| 1487 | int (*teardown)(unsigned int cpu)) | 1594 | int (*teardown)(unsigned int cpu), |
| 1595 | bool multi_instance) | ||
| 1488 | { | 1596 | { |
| 1489 | int cpu, ret = 0; | 1597 | int cpu, ret = 0; |
| 1490 | int dyn_state = 0; | 1598 | int dyn_state = 0; |
| @@ -1503,7 +1611,7 @@ int __cpuhp_setup_state(enum cpuhp_state state, | |||
| 1503 | state = ret; | 1611 | state = ret; |
| 1504 | } | 1612 | } |
| 1505 | 1613 | ||
| 1506 | cpuhp_store_callbacks(state, name, startup, teardown); | 1614 | cpuhp_store_callbacks(state, name, startup, teardown, multi_instance); |
| 1507 | 1615 | ||
| 1508 | if (!invoke || !startup) | 1616 | if (!invoke || !startup) |
| 1509 | goto out; | 1617 | goto out; |
| @@ -1519,10 +1627,11 @@ int __cpuhp_setup_state(enum cpuhp_state state, | |||
| 1519 | if (cpustate < state) | 1627 | if (cpustate < state) |
| 1520 | continue; | 1628 | continue; |
| 1521 | 1629 | ||
| 1522 | ret = cpuhp_issue_call(cpu, state, startup, true); | 1630 | ret = cpuhp_issue_call(cpu, state, true, NULL); |
| 1523 | if (ret) { | 1631 | if (ret) { |
| 1524 | cpuhp_rollback_install(cpu, state, teardown); | 1632 | if (teardown) |
| 1525 | cpuhp_store_callbacks(state, NULL, NULL, NULL); | 1633 | cpuhp_rollback_install(cpu, state, NULL); |
| 1634 | cpuhp_store_callbacks(state, NULL, NULL, NULL, false); | ||
| 1526 | goto out; | 1635 | goto out; |
| 1527 | } | 1636 | } |
| 1528 | } | 1637 | } |
| @@ -1534,6 +1643,42 @@ out: | |||
| 1534 | } | 1643 | } |
| 1535 | EXPORT_SYMBOL(__cpuhp_setup_state); | 1644 | EXPORT_SYMBOL(__cpuhp_setup_state); |
| 1536 | 1645 | ||
| 1646 | int __cpuhp_state_remove_instance(enum cpuhp_state state, | ||
| 1647 | struct hlist_node *node, bool invoke) | ||
| 1648 | { | ||
| 1649 | struct cpuhp_step *sp = cpuhp_get_step(state); | ||
| 1650 | int cpu; | ||
| 1651 | |||
| 1652 | BUG_ON(cpuhp_cb_check(state)); | ||
| 1653 | |||
| 1654 | if (!sp->multi_instance) | ||
| 1655 | return -EINVAL; | ||
| 1656 | |||
| 1657 | get_online_cpus(); | ||
| 1658 | if (!invoke || !cpuhp_get_teardown_cb(state)) | ||
| 1659 | goto remove; | ||
| 1660 | /* | ||
| 1661 | * Call the teardown callback for each present cpu depending | ||
| 1662 | * on the hotplug state of the cpu. This function is not | ||
| 1663 | * allowed to fail currently! | ||
| 1664 | */ | ||
| 1665 | for_each_present_cpu(cpu) { | ||
| 1666 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); | ||
| 1667 | int cpustate = st->state; | ||
| 1668 | |||
| 1669 | if (cpustate >= state) | ||
| 1670 | cpuhp_issue_call(cpu, state, false, node); | ||
| 1671 | } | ||
| 1672 | |||
| 1673 | remove: | ||
| 1674 | mutex_lock(&cpuhp_state_mutex); | ||
| 1675 | hlist_del(node); | ||
| 1676 | mutex_unlock(&cpuhp_state_mutex); | ||
| 1677 | put_online_cpus(); | ||
| 1678 | |||
| 1679 | return 0; | ||
| 1680 | } | ||
| 1681 | EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance); | ||
| 1537 | /** | 1682 | /** |
| 1538 | * __cpuhp_remove_state - Remove the callbacks for a hotplug machine state | 1683 |
| 1539 | * @state: The state to remove | 1684 | * @state: The state to remove |
| @@ -1545,14 +1690,21 @@ EXPORT_SYMBOL(__cpuhp_setup_state); | |||
| 1545 | */ | 1690 | */ |
| 1546 | void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) | 1691 | void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) |
| 1547 | { | 1692 | { |
| 1548 | int (*teardown)(unsigned int cpu) = cpuhp_get_teardown_cb(state); | 1693 | struct cpuhp_step *sp = cpuhp_get_step(state); |
| 1549 | int cpu; | 1694 | int cpu; |
| 1550 | 1695 | ||
| 1551 | BUG_ON(cpuhp_cb_check(state)); | 1696 | BUG_ON(cpuhp_cb_check(state)); |
| 1552 | 1697 | ||
| 1553 | get_online_cpus(); | 1698 | get_online_cpus(); |
| 1554 | 1699 | ||
| 1555 | if (!invoke || !teardown) | 1700 | if (sp->multi_instance) { |
| 1701 | WARN(!hlist_empty(&sp->list), | ||
| 1702 | "Error: Removing state %d which has instances left.\n", | ||
| 1703 | state); | ||
| 1704 | goto remove; | ||
| 1705 | } | ||
| 1706 | |||
| 1707 | if (!invoke || !cpuhp_get_teardown_cb(state)) | ||
| 1556 | goto remove; | 1708 | goto remove; |
| 1557 | 1709 | ||
| 1558 | /* | 1710 | /* |
| @@ -1565,10 +1717,10 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) | |||
| 1565 | int cpustate = st->state; | 1717 | int cpustate = st->state; |
| 1566 | 1718 | ||
| 1567 | if (cpustate >= state) | 1719 | if (cpustate >= state) |
| 1568 | cpuhp_issue_call(cpu, state, teardown, false); | 1720 | cpuhp_issue_call(cpu, state, false, NULL); |
| 1569 | } | 1721 | } |
| 1570 | remove: | 1722 | remove: |
| 1571 | cpuhp_store_callbacks(state, NULL, NULL, NULL); | 1723 | cpuhp_store_callbacks(state, NULL, NULL, NULL, false); |
| 1572 | put_online_cpus(); | 1724 | put_online_cpus(); |
| 1573 | } | 1725 | } |
| 1574 | EXPORT_SYMBOL(__cpuhp_remove_state); | 1726 | EXPORT_SYMBOL(__cpuhp_remove_state); |
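
The new __cpuhp_state_add_instance()/__cpuhp_state_remove_instance() exports let multiple instances (typically per-device contexts) share one hotplug state through an embedded hlist_node, with the multi variants of the callbacks invoked once per instance. A sketch of the intended usage, assuming the cpuhp_setup_state_multi()/cpuhp_state_add_instance() convenience wrappers that accompany these exports; the "mydev" names are made up:

/* Hedged sketch of the multi-instance interface. */
struct mydev {
        struct hlist_node node;         /* handed to the cpuhp core */
        /* ... driver state ... */
};

static enum cpuhp_state mydev_online_state;

static int mydev_cpu_online(unsigned int cpu, struct hlist_node *node)
{
        struct mydev *dev = hlist_entry(node, struct mydev, node);

        /* bring this instance up for @cpu */
        (void)dev;
        return 0;
}

static int mydev_cpu_offline(unsigned int cpu, struct hlist_node *node)
{
        return 0;
}

static int __init mydev_driver_init(void)
{
        int ret;

        ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "mydev:online",
                                      mydev_cpu_online, mydev_cpu_offline);
        if (ret < 0)
                return ret;
        mydev_online_state = ret;
        return 0;
}

/* per device probe/remove:
 *      cpuhp_state_add_instance(mydev_online_state, &dev->node);
 *      cpuhp_state_remove_instance(mydev_online_state, &dev->node);
 */
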
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c7fd2778ed50..29f815d2ef7e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
| @@ -325,8 +325,7 @@ static struct file_system_type cpuset_fs_type = { | |||
| 325 | /* | 325 | /* |
| 326 | * Return in pmask the portion of a cpusets's cpus_allowed that | 326 | * Return in pmask the portion of a cpusets's cpus_allowed that |
| 327 | * are online. If none are online, walk up the cpuset hierarchy | 327 | * are online. If none are online, walk up the cpuset hierarchy |
| 328 | * until we find one that does have some online cpus. The top | 328 | * until we find one that does have some online cpus. |
| 329 | * cpuset always has some cpus online. | ||
| 330 | * | 329 | * |
| 331 | * One way or another, we guarantee to return some non-empty subset | 330 | * One way or another, we guarantee to return some non-empty subset |
| 332 | * of cpu_online_mask. | 331 | * of cpu_online_mask. |
| @@ -335,8 +334,20 @@ static struct file_system_type cpuset_fs_type = { | |||
| 335 | */ | 334 | */ |
| 336 | static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) | 335 | static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) |
| 337 | { | 336 | { |
| 338 | while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) | 337 | while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) { |
| 339 | cs = parent_cs(cs); | 338 | cs = parent_cs(cs); |
| 339 | if (unlikely(!cs)) { | ||
| 340 | /* | ||
| 341 | * The top cpuset doesn't have any online cpu as a | ||
| 342 | * consequence of a race between cpuset_hotplug_work | ||
| 343 | * and cpu hotplug notifier. But we know the top | ||
| 344 | * cpuset's effective_cpus is on its way to be | ||
| 345 | * identical to cpu_online_mask. | ||
| 346 | */ | ||
| 347 | cpumask_copy(pmask, cpu_online_mask); | ||
| 348 | return; | ||
| 349 | } | ||
| 350 | } | ||
| 340 | cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); | 351 | cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); |
| 341 | } | 352 | } |
| 342 | 353 | ||
| @@ -2069,6 +2080,20 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) | |||
| 2069 | mutex_unlock(&cpuset_mutex); | 2080 | mutex_unlock(&cpuset_mutex); |
| 2070 | } | 2081 | } |
| 2071 | 2082 | ||
| 2083 | /* | ||
| 2084 | * Make sure the new task conform to the current state of its parent, | ||
| 2085 | * which could have been changed by cpuset just after it inherits the | ||
| 2086 | * state from the parent and before it sits on the cgroup's task list. | ||
| 2087 | */ | ||
| 2088 | static void cpuset_fork(struct task_struct *task) | ||
| 2089 | { | ||
| 2090 | if (task_css_is_root(task, cpuset_cgrp_id)) | ||
| 2091 | return; | ||
| 2092 | |||
| 2093 | set_cpus_allowed_ptr(task, ¤t->cpus_allowed); | ||
| 2094 | task->mems_allowed = current->mems_allowed; | ||
| 2095 | } | ||
| 2096 | |||
| 2072 | struct cgroup_subsys cpuset_cgrp_subsys = { | 2097 | struct cgroup_subsys cpuset_cgrp_subsys = { |
| 2073 | .css_alloc = cpuset_css_alloc, | 2098 | .css_alloc = cpuset_css_alloc, |
| 2074 | .css_online = cpuset_css_online, | 2099 | .css_online = cpuset_css_online, |
| @@ -2079,6 +2104,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = { | |||
| 2079 | .attach = cpuset_attach, | 2104 | .attach = cpuset_attach, |
| 2080 | .post_attach = cpuset_post_attach, | 2105 | .post_attach = cpuset_post_attach, |
| 2081 | .bind = cpuset_bind, | 2106 | .bind = cpuset_bind, |
| 2107 | .fork = cpuset_fork, | ||
| 2082 | .legacy_cftypes = files, | 2108 | .legacy_cftypes = files, |
| 2083 | .early_init = true, | 2109 | .early_init = true, |
| 2084 | }; | 2110 | }; |
| @@ -2689,7 +2715,7 @@ void __cpuset_memory_pressure_bump(void) | |||
| 2689 | int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, | 2715 | int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, |
| 2690 | struct pid *pid, struct task_struct *tsk) | 2716 | struct pid *pid, struct task_struct *tsk) |
| 2691 | { | 2717 | { |
| 2692 | char *buf, *p; | 2718 | char *buf; |
| 2693 | struct cgroup_subsys_state *css; | 2719 | struct cgroup_subsys_state *css; |
| 2694 | int retval; | 2720 | int retval; |
| 2695 | 2721 | ||
| @@ -2698,14 +2724,15 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, | |||
| 2698 | if (!buf) | 2724 | if (!buf) |
| 2699 | goto out; | 2725 | goto out; |
| 2700 | 2726 | ||
| 2701 | retval = -ENAMETOOLONG; | ||
| 2702 | css = task_get_css(tsk, cpuset_cgrp_id); | 2727 | css = task_get_css(tsk, cpuset_cgrp_id); |
| 2703 | p = cgroup_path_ns(css->cgroup, buf, PATH_MAX, | 2728 | retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, |
| 2704 | current->nsproxy->cgroup_ns); | 2729 | current->nsproxy->cgroup_ns); |
| 2705 | css_put(css); | 2730 | css_put(css); |
| 2706 | if (!p) | 2731 | if (retval >= PATH_MAX) |
| 2732 | retval = -ENAMETOOLONG; | ||
| 2733 | if (retval < 0) | ||
| 2707 | goto out_free; | 2734 | goto out_free; |
| 2708 | seq_puts(m, p); | 2735 | seq_puts(m, buf); |
| 2709 | seq_putc(m, '\n'); | 2736 | seq_putc(m, '\n'); |
| 2710 | retval = 0; | 2737 | retval = 0; |
| 2711 | out_free: | 2738 | out_free: |
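
proc_cpuset_show() now depends on cgroup_path_ns() returning an int length in kernfs style rather than a pointer, so truncation is detected by comparing the result against the buffer size. A condensed sketch of that convention as this caller uses it; the wrapper function is hypothetical:

/* Hedged sketch of the kernfs-style return convention checked above. */
static int show_cpuset_path(struct seq_file *m, struct cgroup *cgrp,
                            struct cgroup_namespace *ns)
{
        char *buf = kmalloc(PATH_MAX, GFP_KERNEL);
        int len;

        if (!buf)
                return -ENOMEM;

        len = cgroup_path_ns(cgrp, buf, PATH_MAX, ns);  /* full path length or -errno */
        if (len >= PATH_MAX)
                len = -ENAMETOOLONG;                    /* buffer too small: truncated */
        if (len < 0)
                goto out;

        seq_puts(m, buf);
        seq_putc(m, '\n');
        len = 0;
out:
        kfree(buf);
        return len;
}
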
diff --git a/kernel/events/core.c b/kernel/events/core.c index 1903b8f3a705..c6e47e97b33f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
| @@ -242,18 +242,6 @@ unlock: | |||
| 242 | return ret; | 242 | return ret; |
| 243 | } | 243 | } |
| 244 | 244 | ||
| 245 | static void event_function_local(struct perf_event *event, event_f func, void *data) | ||
| 246 | { | ||
| 247 | struct event_function_struct efs = { | ||
| 248 | .event = event, | ||
| 249 | .func = func, | ||
| 250 | .data = data, | ||
| 251 | }; | ||
| 252 | |||
| 253 | int ret = event_function(&efs); | ||
| 254 | WARN_ON_ONCE(ret); | ||
| 255 | } | ||
| 256 | |||
| 257 | static void event_function_call(struct perf_event *event, event_f func, void *data) | 245 | static void event_function_call(struct perf_event *event, event_f func, void *data) |
| 258 | { | 246 | { |
| 259 | struct perf_event_context *ctx = event->ctx; | 247 | struct perf_event_context *ctx = event->ctx; |
| @@ -303,6 +291,54 @@ again: | |||
| 303 | raw_spin_unlock_irq(&ctx->lock); | 291 | raw_spin_unlock_irq(&ctx->lock); |
| 304 | } | 292 | } |
| 305 | 293 | ||
| 294 | /* | ||
| 295 | * Similar to event_function_call() + event_function(), but hard assumes IRQs | ||
| 296 | * are already disabled and we're on the right CPU. | ||
| 297 | */ | ||
| 298 | static void event_function_local(struct perf_event *event, event_f func, void *data) | ||
| 299 | { | ||
| 300 | struct perf_event_context *ctx = event->ctx; | ||
| 301 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
| 302 | struct task_struct *task = READ_ONCE(ctx->task); | ||
| 303 | struct perf_event_context *task_ctx = NULL; | ||
| 304 | |||
| 305 | WARN_ON_ONCE(!irqs_disabled()); | ||
| 306 | |||
| 307 | if (task) { | ||
| 308 | if (task == TASK_TOMBSTONE) | ||
| 309 | return; | ||
| 310 | |||
| 311 | task_ctx = ctx; | ||
| 312 | } | ||
| 313 | |||
| 314 | perf_ctx_lock(cpuctx, task_ctx); | ||
| 315 | |||
| 316 | task = ctx->task; | ||
| 317 | if (task == TASK_TOMBSTONE) | ||
| 318 | goto unlock; | ||
| 319 | |||
| 320 | if (task) { | ||
| 321 | /* | ||
| 322 | * We must be either inactive or active and the right task, | ||
| 323 | * otherwise we're screwed, since we cannot IPI to somewhere | ||
| 324 | * else. | ||
| 325 | */ | ||
| 326 | if (ctx->is_active) { | ||
| 327 | if (WARN_ON_ONCE(task != current)) | ||
| 328 | goto unlock; | ||
| 329 | |||
| 330 | if (WARN_ON_ONCE(cpuctx->task_ctx != ctx)) | ||
| 331 | goto unlock; | ||
| 332 | } | ||
| 333 | } else { | ||
| 334 | WARN_ON_ONCE(&cpuctx->ctx != ctx); | ||
| 335 | } | ||
| 336 | |||
| 337 | func(event, cpuctx, ctx, data); | ||
| 338 | unlock: | ||
| 339 | perf_ctx_unlock(cpuctx, task_ctx); | ||
| 340 | } | ||
| 341 | |||
| 306 | #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ | 342 | #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ |
| 307 | PERF_FLAG_FD_OUTPUT |\ | 343 | PERF_FLAG_FD_OUTPUT |\ |
| 308 | PERF_FLAG_PID_CGROUP |\ | 344 | PERF_FLAG_PID_CGROUP |\ |
| @@ -1439,8 +1475,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
| 1439 | if (event->group_leader == event) { | 1475 | if (event->group_leader == event) { |
| 1440 | struct list_head *list; | 1476 | struct list_head *list; |
| 1441 | 1477 | ||
| 1442 | if (is_software_event(event)) | 1478 | event->group_caps = event->event_caps; |
| 1443 | event->group_flags |= PERF_GROUP_SOFTWARE; | ||
| 1444 | 1479 | ||
| 1445 | list = ctx_group_list(event, ctx); | 1480 | list = ctx_group_list(event, ctx); |
| 1446 | list_add_tail(&event->group_entry, list); | 1481 | list_add_tail(&event->group_entry, list); |
| @@ -1594,9 +1629,7 @@ static void perf_group_attach(struct perf_event *event) | |||
| 1594 | 1629 | ||
| 1595 | WARN_ON_ONCE(group_leader->ctx != event->ctx); | 1630 | WARN_ON_ONCE(group_leader->ctx != event->ctx); |
| 1596 | 1631 | ||
| 1597 | if (group_leader->group_flags & PERF_GROUP_SOFTWARE && | 1632 | group_leader->group_caps &= event->event_caps; |
| 1598 | !is_software_event(event)) | ||
| 1599 | group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; | ||
| 1600 | 1633 | ||
| 1601 | list_add_tail(&event->group_entry, &group_leader->sibling_list); | 1634 | list_add_tail(&event->group_entry, &group_leader->sibling_list); |
| 1602 | group_leader->nr_siblings++; | 1635 | group_leader->nr_siblings++; |
| @@ -1687,7 +1720,7 @@ static void perf_group_detach(struct perf_event *event) | |||
| 1687 | sibling->group_leader = sibling; | 1720 | sibling->group_leader = sibling; |
| 1688 | 1721 | ||
| 1689 | /* Inherit group flags from the previous leader */ | 1722 | /* Inherit group flags from the previous leader */ |
| 1690 | sibling->group_flags = event->group_flags; | 1723 | sibling->group_caps = event->group_caps; |
| 1691 | 1724 | ||
| 1692 | WARN_ON_ONCE(sibling->ctx != event->ctx); | 1725 | WARN_ON_ONCE(sibling->ctx != event->ctx); |
| 1693 | } | 1726 | } |
| @@ -1796,6 +1829,8 @@ group_sched_out(struct perf_event *group_event, | |||
| 1796 | struct perf_event *event; | 1829 | struct perf_event *event; |
| 1797 | int state = group_event->state; | 1830 | int state = group_event->state; |
| 1798 | 1831 | ||
| 1832 | perf_pmu_disable(ctx->pmu); | ||
| 1833 | |||
| 1799 | event_sched_out(group_event, cpuctx, ctx); | 1834 | event_sched_out(group_event, cpuctx, ctx); |
| 1800 | 1835 | ||
| 1801 | /* | 1836 | /* |
| @@ -1804,6 +1839,8 @@ group_sched_out(struct perf_event *group_event, | |||
| 1804 | list_for_each_entry(event, &group_event->sibling_list, group_entry) | 1839 | list_for_each_entry(event, &group_event->sibling_list, group_entry) |
| 1805 | event_sched_out(event, cpuctx, ctx); | 1840 | event_sched_out(event, cpuctx, ctx); |
| 1806 | 1841 | ||
| 1842 | perf_pmu_enable(ctx->pmu); | ||
| 1843 | |||
| 1807 | if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) | 1844 | if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) |
| 1808 | cpuctx->exclusive = 0; | 1845 | cpuctx->exclusive = 0; |
| 1809 | } | 1846 | } |
| @@ -2109,7 +2146,7 @@ static int group_can_go_on(struct perf_event *event, | |||
| 2109 | /* | 2146 | /* |
| 2110 | * Groups consisting entirely of software events can always go on. | 2147 | * Groups consisting entirely of software events can always go on. |
| 2111 | */ | 2148 | */ |
| 2112 | if (event->group_flags & PERF_GROUP_SOFTWARE) | 2149 | if (event->group_caps & PERF_EV_CAP_SOFTWARE) |
| 2113 | return 1; | 2150 | return 1; |
| 2114 | /* | 2151 | /* |
| 2115 | * If an exclusive group is already on, no other hardware | 2152 | * If an exclusive group is already on, no other hardware |
| @@ -2455,16 +2492,16 @@ static int __perf_event_stop(void *info) | |||
| 2455 | * while restarting. | 2492 | * while restarting. |
| 2456 | */ | 2493 | */ |
| 2457 | if (sd->restart) | 2494 | if (sd->restart) |
| 2458 | event->pmu->start(event, PERF_EF_START); | 2495 | event->pmu->start(event, 0); |
| 2459 | 2496 | ||
| 2460 | return 0; | 2497 | return 0; |
| 2461 | } | 2498 | } |
| 2462 | 2499 | ||
| 2463 | static int perf_event_restart(struct perf_event *event) | 2500 | static int perf_event_stop(struct perf_event *event, int restart) |
| 2464 | { | 2501 | { |
| 2465 | struct stop_event_data sd = { | 2502 | struct stop_event_data sd = { |
| 2466 | .event = event, | 2503 | .event = event, |
| 2467 | .restart = 1, | 2504 | .restart = restart, |
| 2468 | }; | 2505 | }; |
| 2469 | int ret = 0; | 2506 | int ret = 0; |
| 2470 | 2507 | ||
| @@ -2801,19 +2838,36 @@ unlock: | |||
| 2801 | } | 2838 | } |
| 2802 | } | 2839 | } |
| 2803 | 2840 | ||
| 2841 | static DEFINE_PER_CPU(struct list_head, sched_cb_list); | ||
| 2842 | |||
| 2804 | void perf_sched_cb_dec(struct pmu *pmu) | 2843 | void perf_sched_cb_dec(struct pmu *pmu) |
| 2805 | { | 2844 | { |
| 2845 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
| 2846 | |||
| 2806 | this_cpu_dec(perf_sched_cb_usages); | 2847 | this_cpu_dec(perf_sched_cb_usages); |
| 2848 | |||
| 2849 | if (!--cpuctx->sched_cb_usage) | ||
| 2850 | list_del(&cpuctx->sched_cb_entry); | ||
| 2807 | } | 2851 | } |
| 2808 | 2852 | ||
| 2853 | |||
| 2809 | void perf_sched_cb_inc(struct pmu *pmu) | 2854 | void perf_sched_cb_inc(struct pmu *pmu) |
| 2810 | { | 2855 | { |
| 2856 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
| 2857 | |||
| 2858 | if (!cpuctx->sched_cb_usage++) | ||
| 2859 | list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); | ||
| 2860 | |||
| 2811 | this_cpu_inc(perf_sched_cb_usages); | 2861 | this_cpu_inc(perf_sched_cb_usages); |
| 2812 | } | 2862 | } |
| 2813 | 2863 | ||
| 2814 | /* | 2864 | /* |
| 2815 | * This function provides the context switch callback to the lower code | 2865 | * This function provides the context switch callback to the lower code |
| 2816 | * layer. It is invoked ONLY when the context switch callback is enabled. | 2866 | * layer. It is invoked ONLY when the context switch callback is enabled. |
| 2867 | * | ||
| 2868 | * This callback is relevant even to per-cpu events; for example multi event | ||
| 2869 | * PEBS requires this to provide PID/TID information. This requires we flush | ||
| 2870 | * all queued PEBS records before we context switch to a new task. | ||
| 2817 | */ | 2871 | */ |
| 2818 | static void perf_pmu_sched_task(struct task_struct *prev, | 2872 | static void perf_pmu_sched_task(struct task_struct *prev, |
| 2819 | struct task_struct *next, | 2873 | struct task_struct *next, |
| @@ -2821,34 +2875,24 @@ static void perf_pmu_sched_task(struct task_struct *prev, | |||
| 2821 | { | 2875 | { |
| 2822 | struct perf_cpu_context *cpuctx; | 2876 | struct perf_cpu_context *cpuctx; |
| 2823 | struct pmu *pmu; | 2877 | struct pmu *pmu; |
| 2824 | unsigned long flags; | ||
| 2825 | 2878 | ||
| 2826 | if (prev == next) | 2879 | if (prev == next) |
| 2827 | return; | 2880 | return; |
| 2828 | 2881 | ||
| 2829 | local_irq_save(flags); | 2882 | list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { |
| 2830 | 2883 | pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */ | |
| 2831 | rcu_read_lock(); | ||
| 2832 | |||
| 2833 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
| 2834 | if (pmu->sched_task) { | ||
| 2835 | cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
| 2836 | 2884 | ||
| 2837 | perf_ctx_lock(cpuctx, cpuctx->task_ctx); | 2885 | if (WARN_ON_ONCE(!pmu->sched_task)) |
| 2838 | 2886 | continue; | |
| 2839 | perf_pmu_disable(pmu); | ||
| 2840 | 2887 | ||
| 2841 | pmu->sched_task(cpuctx->task_ctx, sched_in); | 2888 | perf_ctx_lock(cpuctx, cpuctx->task_ctx); |
| 2889 | perf_pmu_disable(pmu); | ||
| 2842 | 2890 | ||
| 2843 | perf_pmu_enable(pmu); | 2891 | pmu->sched_task(cpuctx->task_ctx, sched_in); |
| 2844 | 2892 | ||
| 2845 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); | 2893 | perf_pmu_enable(pmu); |
| 2846 | } | 2894 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); |
| 2847 | } | 2895 | } |
| 2848 | |||
| 2849 | rcu_read_unlock(); | ||
| 2850 | |||
| 2851 | local_irq_restore(flags); | ||
| 2852 | } | 2896 | } |
| 2853 | 2897 | ||
| 2854 | static void perf_event_switch(struct task_struct *task, | 2898 | static void perf_event_switch(struct task_struct *task, |
| @@ -3380,6 +3424,22 @@ struct perf_read_data { | |||
| 3380 | int ret; | 3424 | int ret; |
| 3381 | }; | 3425 | }; |
| 3382 | 3426 | ||
| 3427 | static int find_cpu_to_read(struct perf_event *event, int local_cpu) | ||
| 3428 | { | ||
| 3429 | int event_cpu = event->oncpu; | ||
| 3430 | u16 local_pkg, event_pkg; | ||
| 3431 | |||
| 3432 | if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { | ||
| 3433 | event_pkg = topology_physical_package_id(event_cpu); | ||
| 3434 | local_pkg = topology_physical_package_id(local_cpu); | ||
| 3435 | |||
| 3436 | if (event_pkg == local_pkg) | ||
| 3437 | return local_cpu; | ||
| 3438 | } | ||
| 3439 | |||
| 3440 | return event_cpu; | ||
| 3441 | } | ||
| 3442 | |||
| 3383 | /* | 3443 | /* |
| 3384 | * Cross CPU call to read the hardware event | 3444 | * Cross CPU call to read the hardware event |
| 3385 | */ | 3445 | */ |
| @@ -3501,7 +3561,7 @@ u64 perf_event_read_local(struct perf_event *event) | |||
| 3501 | 3561 | ||
| 3502 | static int perf_event_read(struct perf_event *event, bool group) | 3562 | static int perf_event_read(struct perf_event *event, bool group) |
| 3503 | { | 3563 | { |
| 3504 | int ret = 0; | 3564 | int ret = 0, cpu_to_read, local_cpu; |
| 3505 | 3565 | ||
| 3506 | /* | 3566 | /* |
| 3507 | * If event is enabled and currently active on a CPU, update the | 3567 | * If event is enabled and currently active on a CPU, update the |
| @@ -3513,8 +3573,22 @@ static int perf_event_read(struct perf_event *event, bool group) | |||
| 3513 | .group = group, | 3573 | .group = group, |
| 3514 | .ret = 0, | 3574 | .ret = 0, |
| 3515 | }; | 3575 | }; |
| 3516 | smp_call_function_single(event->oncpu, | 3576 | |
| 3517 | __perf_event_read, &data, 1); | 3577 | local_cpu = get_cpu(); |
| 3578 | cpu_to_read = find_cpu_to_read(event, local_cpu); | ||
| 3579 | put_cpu(); | ||
| 3580 | |||
| 3581 | /* | ||
| 3582 | * Purposely ignore the smp_call_function_single() return | ||
| 3583 | * value. | ||
| 3584 | * | ||
| 3585 | * If event->oncpu isn't a valid CPU it means the event got | ||
| 3586 | * scheduled out and that will have updated the event count. | ||
| 3587 | * | ||
| 3588 | * Therefore, either way, we'll have an up-to-date event count | ||
| 3589 | * after this. | ||
| 3590 | */ | ||
| 3591 | (void)smp_call_function_single(cpu_to_read, __perf_event_read, &data, 1); | ||
| 3518 | ret = data.ret; | 3592 | ret = data.ret; |
| 3519 | } else if (event->state == PERF_EVENT_STATE_INACTIVE) { | 3593 | } else if (event->state == PERF_EVENT_STATE_INACTIVE) { |
| 3520 | struct perf_event_context *ctx = event->ctx; | 3594 | struct perf_event_context *ctx = event->ctx; |
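
find_cpu_to_read() allows a read to stay on the local CPU whenever it sits in the same physical package as event->oncpu, saving a cross-package IPI for package-scoped counters. A PMU opts in by setting the new capability bit, roughly as below (the init function is hypothetical):

/* Hedged sketch: a package-wide (e.g. uncore-style) PMU opting in. */
static int my_uncore_event_init(struct perf_event *event)
{
        /* counts are per package, so any CPU in that package may read them */
        event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
        return 0;
}
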
| @@ -3884,7 +3958,7 @@ static void exclusive_event_destroy(struct perf_event *event) | |||
| 3884 | 3958 | ||
| 3885 | static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) | 3959 | static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) |
| 3886 | { | 3960 | { |
| 3887 | if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && | 3961 | if ((e1->pmu == e2->pmu) && |
| 3888 | (e1->cpu == e2->cpu || | 3962 | (e1->cpu == e2->cpu || |
| 3889 | e1->cpu == -1 || | 3963 | e1->cpu == -1 || |
| 3890 | e2->cpu == -1)) | 3964 | e2->cpu == -1)) |
| @@ -4800,6 +4874,19 @@ static void ring_buffer_attach(struct perf_event *event, | |||
| 4800 | spin_unlock_irqrestore(&rb->event_lock, flags); | 4874 | spin_unlock_irqrestore(&rb->event_lock, flags); |
| 4801 | } | 4875 | } |
| 4802 | 4876 | ||
| 4877 | /* | ||
| 4878 | * Avoid racing with perf_mmap_close(AUX): stop the event | ||
| 4879 | * before swizzling the event::rb pointer; if it's getting | ||
| 4880 | * unmapped, its aux_mmap_count will be 0 and it won't | ||
| 4881 | * restart. See the comment in __perf_pmu_output_stop(). | ||
| 4882 | * | ||
| 4883 | * Data will inevitably be lost when set_output is done in | ||
| 4884 | * mid-air, but then again, whoever does it like this is | ||
| 4885 | * not in for the data anyway. | ||
| 4886 | */ | ||
| 4887 | if (has_aux(event)) | ||
| 4888 | perf_event_stop(event, 0); | ||
| 4889 | |||
| 4803 | rcu_assign_pointer(event->rb, rb); | 4890 | rcu_assign_pointer(event->rb, rb); |
| 4804 | 4891 | ||
| 4805 | if (old_rb) { | 4892 | if (old_rb) { |
| @@ -5292,9 +5379,10 @@ perf_output_sample_regs(struct perf_output_handle *handle, | |||
| 5292 | struct pt_regs *regs, u64 mask) | 5379 | struct pt_regs *regs, u64 mask) |
| 5293 | { | 5380 | { |
| 5294 | int bit; | 5381 | int bit; |
| 5382 | DECLARE_BITMAP(_mask, 64); | ||
| 5295 | 5383 | ||
| 5296 | for_each_set_bit(bit, (const unsigned long *) &mask, | 5384 | bitmap_from_u64(_mask, mask); |
| 5297 | sizeof(mask) * BITS_PER_BYTE) { | 5385 | for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) { |
| 5298 | u64 val; | 5386 | u64 val; |
| 5299 | 5387 | ||
| 5300 | val = perf_reg_value(regs, bit); | 5388 | val = perf_reg_value(regs, bit); |
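
The DECLARE_BITMAP()/bitmap_from_u64() conversion is a word-order fix: casting the address of a u64 to unsigned long * makes for_each_set_bit() walk the 32-bit words in the wrong order on big-endian 32-bit kernels. A small illustration of the hazard:

/* Hedged illustration of the portability issue being fixed. */
static void sample_regs_mask_example(void)
{
        u64 mask = 0x00000001ULL;       /* only bit 0 set */
        DECLARE_BITMAP(_mask, 64);
        int bit;

        /* On a 32-bit big-endian kernel, (unsigned long *)&mask points at the
         * high word first, so the old loop reported bit 32 instead of bit 0.
         * bitmap_from_u64() fills the bitmap in its native word order, so the
         * walk below sees bit 0 on every configuration. */
        bitmap_from_u64(_mask, mask);
        for_each_set_bit(bit, _mask, 64)
                pr_info("set bit %d\n", bit);
}
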
| @@ -6075,7 +6163,7 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data) | |||
| 6075 | raw_spin_unlock_irqrestore(&ifh->lock, flags); | 6163 | raw_spin_unlock_irqrestore(&ifh->lock, flags); |
| 6076 | 6164 | ||
| 6077 | if (restart) | 6165 | if (restart) |
| 6078 | perf_event_restart(event); | 6166 | perf_event_stop(event, 1); |
| 6079 | } | 6167 | } |
| 6080 | 6168 | ||
| 6081 | void perf_event_exec(void) | 6169 | void perf_event_exec(void) |
| @@ -6119,7 +6207,13 @@ static void __perf_event_output_stop(struct perf_event *event, void *data) | |||
| 6119 | 6207 | ||
| 6120 | /* | 6208 | /* |
| 6121 | * In case of inheritance, it will be the parent that links to the | 6209 | * In case of inheritance, it will be the parent that links to the |
| 6122 | * ring-buffer, but it will be the child that's actually using it: | 6210 | * ring-buffer, but it will be the child that's actually using it. |
| 6211 | * | ||
| 6212 | * We are using event::rb to determine if the event should be stopped, | ||
| 6213 | * however this may race with ring_buffer_attach() (through set_output), | ||
| 6214 | * which will make us skip the event that actually needs to be stopped. | ||
| 6215 | * So ring_buffer_attach() has to stop an aux event before re-assigning | ||
| 6216 | * its rb pointer. | ||
| 6123 | */ | 6217 | */ |
| 6124 | if (rcu_dereference(parent->rb) == rb) | 6218 | if (rcu_dereference(parent->rb) == rb) |
| 6125 | ro->err = __perf_event_stop(&sd); | 6219 | ro->err = __perf_event_stop(&sd); |
| @@ -6129,7 +6223,7 @@ static int __perf_pmu_output_stop(void *info) | |||
| 6129 | { | 6223 | { |
| 6130 | struct perf_event *event = info; | 6224 | struct perf_event *event = info; |
| 6131 | struct pmu *pmu = event->pmu; | 6225 | struct pmu *pmu = event->pmu; |
| 6132 | struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); | 6226 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); |
| 6133 | struct remote_output ro = { | 6227 | struct remote_output ro = { |
| 6134 | .rb = event->rb, | 6228 | .rb = event->rb, |
| 6135 | }; | 6229 | }; |
| @@ -6584,15 +6678,6 @@ got_name: | |||
| 6584 | } | 6678 | } |
| 6585 | 6679 | ||
| 6586 | /* | 6680 | /* |
| 6587 | * Whether this @filter depends on a dynamic object which is not loaded | ||
| 6588 | * yet or its load addresses are not known. | ||
| 6589 | */ | ||
| 6590 | static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter) | ||
| 6591 | { | ||
| 6592 | return filter->filter && filter->inode; | ||
| 6593 | } | ||
| 6594 | |||
| 6595 | /* | ||
| 6596 | * Check whether inode and address range match filter criteria. | 6681 | * Check whether inode and address range match filter criteria. |
| 6597 | */ | 6682 | */ |
| 6598 | static bool perf_addr_filter_match(struct perf_addr_filter *filter, | 6683 | static bool perf_addr_filter_match(struct perf_addr_filter *filter, |
| @@ -6642,7 +6727,7 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data) | |||
| 6642 | raw_spin_unlock_irqrestore(&ifh->lock, flags); | 6727 | raw_spin_unlock_irqrestore(&ifh->lock, flags); |
| 6643 | 6728 | ||
| 6644 | if (restart) | 6729 | if (restart) |
| 6645 | perf_event_restart(event); | 6730 | perf_event_stop(event, 1); |
| 6646 | } | 6731 | } |
| 6647 | 6732 | ||
| 6648 | /* | 6733 | /* |
| @@ -6653,6 +6738,13 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma) | |||
| 6653 | struct perf_event_context *ctx; | 6738 | struct perf_event_context *ctx; |
| 6654 | int ctxn; | 6739 | int ctxn; |
| 6655 | 6740 | ||
| 6741 | /* | ||
| 6742 | * Data tracing isn't supported yet and as such there is no need | ||
| 6743 | * to keep track of anything that isn't related to executable code: | ||
| 6744 | */ | ||
| 6745 | if (!(vma->vm_flags & VM_EXEC)) | ||
| 6746 | return; | ||
| 6747 | |||
| 6656 | rcu_read_lock(); | 6748 | rcu_read_lock(); |
| 6657 | for_each_task_context_nr(ctxn) { | 6749 | for_each_task_context_nr(ctxn) { |
| 6658 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | 6750 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); |
| @@ -6987,7 +7079,7 @@ static int __perf_event_overflow(struct perf_event *event, | |||
| 6987 | irq_work_queue(&event->pending); | 7079 | irq_work_queue(&event->pending); |
| 6988 | } | 7080 | } |
| 6989 | 7081 | ||
| 6990 | event->overflow_handler(event, data, regs); | 7082 | READ_ONCE(event->overflow_handler)(event, data, regs); |
| 6991 | 7083 | ||
| 6992 | if (*perf_event_fasync(event) && event->pending_kill) { | 7084 | if (*perf_event_fasync(event) && event->pending_kill) { |
| 6993 | event->pending_wakeup = 1; | 7085 | event->pending_wakeup = 1; |
| @@ -7602,11 +7694,83 @@ static void perf_event_free_filter(struct perf_event *event) | |||
| 7602 | ftrace_profile_free_filter(event); | 7694 | ftrace_profile_free_filter(event); |
| 7603 | } | 7695 | } |
| 7604 | 7696 | ||
| 7697 | #ifdef CONFIG_BPF_SYSCALL | ||
| 7698 | static void bpf_overflow_handler(struct perf_event *event, | ||
| 7699 | struct perf_sample_data *data, | ||
| 7700 | struct pt_regs *regs) | ||
| 7701 | { | ||
| 7702 | struct bpf_perf_event_data_kern ctx = { | ||
| 7703 | .data = data, | ||
| 7704 | .regs = regs, | ||
| 7705 | }; | ||
| 7706 | int ret = 0; | ||
| 7707 | |||
| 7708 | preempt_disable(); | ||
| 7709 | if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) | ||
| 7710 | goto out; | ||
| 7711 | rcu_read_lock(); | ||
| 7712 | ret = BPF_PROG_RUN(event->prog, (void *)&ctx); | ||
| 7713 | rcu_read_unlock(); | ||
| 7714 | out: | ||
| 7715 | __this_cpu_dec(bpf_prog_active); | ||
| 7716 | preempt_enable(); | ||
| 7717 | if (!ret) | ||
| 7718 | return; | ||
| 7719 | |||
| 7720 | event->orig_overflow_handler(event, data, regs); | ||
| 7721 | } | ||
| 7722 | |||
| 7723 | static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) | ||
| 7724 | { | ||
| 7725 | struct bpf_prog *prog; | ||
| 7726 | |||
| 7727 | if (event->overflow_handler_context) | ||
| 7728 | /* hw breakpoint or kernel counter */ | ||
| 7729 | return -EINVAL; | ||
| 7730 | |||
| 7731 | if (event->prog) | ||
| 7732 | return -EEXIST; | ||
| 7733 | |||
| 7734 | prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT); | ||
| 7735 | if (IS_ERR(prog)) | ||
| 7736 | return PTR_ERR(prog); | ||
| 7737 | |||
| 7738 | event->prog = prog; | ||
| 7739 | event->orig_overflow_handler = READ_ONCE(event->overflow_handler); | ||
| 7740 | WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); | ||
| 7741 | return 0; | ||
| 7742 | } | ||
| 7743 | |||
| 7744 | static void perf_event_free_bpf_handler(struct perf_event *event) | ||
| 7745 | { | ||
| 7746 | struct bpf_prog *prog = event->prog; | ||
| 7747 | |||
| 7748 | if (!prog) | ||
| 7749 | return; | ||
| 7750 | |||
| 7751 | WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler); | ||
| 7752 | event->prog = NULL; | ||
| 7753 | bpf_prog_put(prog); | ||
| 7754 | } | ||
| 7755 | #else | ||
| 7756 | static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) | ||
| 7757 | { | ||
| 7758 | return -EOPNOTSUPP; | ||
| 7759 | } | ||
| 7760 | static void perf_event_free_bpf_handler(struct perf_event *event) | ||
| 7761 | { | ||
| 7762 | } | ||
| 7763 | #endif | ||
| 7764 | |||
| 7605 | static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) | 7765 | static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) |
| 7606 | { | 7766 | { |
| 7607 | bool is_kprobe, is_tracepoint; | 7767 | bool is_kprobe, is_tracepoint; |
| 7608 | struct bpf_prog *prog; | 7768 | struct bpf_prog *prog; |
| 7609 | 7769 | ||
| 7770 | if (event->attr.type == PERF_TYPE_HARDWARE || | ||
| 7771 | event->attr.type == PERF_TYPE_SOFTWARE) | ||
| 7772 | return perf_event_set_bpf_handler(event, prog_fd); | ||
| 7773 | |||
| 7610 | if (event->attr.type != PERF_TYPE_TRACEPOINT) | 7774 | if (event->attr.type != PERF_TYPE_TRACEPOINT) |
| 7611 | return -EINVAL; | 7775 | return -EINVAL; |
| 7612 | 7776 | ||
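
With bpf_overflow_handler() wired up, a BPF_PROG_TYPE_PERF_EVENT program can be attached to a hardware or software sampling event; when the program returns 0 the sample is dropped before the original overflow handler runs, otherwise it falls through. A hedged user-space sketch of the attach path; loading of the program itself is elided and assumed to yield prog_fd:

/* Hedged user-space sketch; prog_fd is assumed to be a loaded
 * BPF_PROG_TYPE_PERF_EVENT program obtained via bpf(BPF_PROG_LOAD, ...). */
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int attach_bpf_filter(int prog_fd)
{
        struct perf_event_attr attr = {
                .type           = PERF_TYPE_HARDWARE,
                .size           = sizeof(attr),
                .config         = PERF_COUNT_HW_CPU_CYCLES,
                .sample_period  = 100000,
        };
        int event_fd;

        event_fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
        if (event_fd < 0)
                return -1;

        /* the kernel side rejects non HW/SW/tracepoint types with -EINVAL
         * and a second attach attempt with -EEXIST */
        return ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
}
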
| @@ -7647,6 +7811,8 @@ static void perf_event_free_bpf_prog(struct perf_event *event) | |||
| 7647 | { | 7811 | { |
| 7648 | struct bpf_prog *prog; | 7812 | struct bpf_prog *prog; |
| 7649 | 7813 | ||
| 7814 | perf_event_free_bpf_handler(event); | ||
| 7815 | |||
| 7650 | if (!event->tp_event) | 7816 | if (!event->tp_event) |
| 7651 | return; | 7817 | return; |
| 7652 | 7818 | ||
| @@ -7805,7 +7971,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event) | |||
| 7805 | list_for_each_entry(filter, &ifh->list, entry) { | 7971 | list_for_each_entry(filter, &ifh->list, entry) { |
| 7806 | event->addr_filters_offs[count] = 0; | 7972 | event->addr_filters_offs[count] = 0; |
| 7807 | 7973 | ||
| 7808 | if (perf_addr_filter_needs_mmap(filter)) | 7974 | /* |
| 7975 | * Adjust base offset if the filter is associated to a binary | ||
| 7976 | * that needs to be mapped: | ||
| 7977 | */ | ||
| 7978 | if (filter->inode) | ||
| 7809 | event->addr_filters_offs[count] = | 7979 | event->addr_filters_offs[count] = |
| 7810 | perf_addr_filter_apply(filter, mm); | 7980 | perf_addr_filter_apply(filter, mm); |
| 7811 | 7981 | ||
| @@ -7820,7 +7990,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event) | |||
| 7820 | mmput(mm); | 7990 | mmput(mm); |
| 7821 | 7991 | ||
| 7822 | restart: | 7992 | restart: |
| 7823 | perf_event_restart(event); | 7993 | perf_event_stop(event, 1); |
| 7824 | } | 7994 | } |
| 7825 | 7995 | ||
| 7826 | /* | 7996 | /* |
| @@ -7936,8 +8106,10 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, | |||
| 7936 | goto fail; | 8106 | goto fail; |
| 7937 | } | 8107 | } |
| 7938 | 8108 | ||
| 7939 | if (token == IF_SRC_FILE) { | 8109 | if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { |
| 7940 | filename = match_strdup(&args[2]); | 8110 | int fpos = filter->range ? 2 : 1; |
| 8111 | |||
| 8112 | filename = match_strdup(&args[fpos]); | ||
| 7941 | if (!filename) { | 8113 | if (!filename) { |
| 7942 | ret = -ENOMEM; | 8114 | ret = -ENOMEM; |
| 7943 | goto fail; | 8115 | goto fail; |
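
The parser fix reflects that the file name lands in a different match_token slot depending on whether the spec carries a size: "filter <addr>/<size>@<file>" puts it in the third argument, while the newly handled "start <addr>@<file>" and "stop <addr>@<file>" forms put it in the second. Hedged examples of such filter strings; the paths are illustrative:

/* Hedged sketch: filter strings whose filename position differs. */
const char *range_filter = "filter 0x1000/0x2000@/usr/bin/myapp"; /* IF_SRC_FILE: file is args[2] */
const char *start_filter = "start 0x4000@/usr/bin/myapp";         /* IF_SRC_FILEADDR: file is args[1] */
const char *stop_filter  = "stop 0x5000@/usr/bin/myapp";

/* e.g.: ioctl(event_fd, PERF_EVENT_IOC_SET_FILTER, range_filter); */
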
| @@ -8957,6 +9129,19 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
| 8957 | if (!overflow_handler && parent_event) { | 9129 | if (!overflow_handler && parent_event) { |
| 8958 | overflow_handler = parent_event->overflow_handler; | 9130 | overflow_handler = parent_event->overflow_handler; |
| 8959 | context = parent_event->overflow_handler_context; | 9131 | context = parent_event->overflow_handler_context; |
| 9132 | #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) | ||
| 9133 | if (overflow_handler == bpf_overflow_handler) { | ||
| 9134 | struct bpf_prog *prog = bpf_prog_inc(parent_event->prog); | ||
| 9135 | |||
| 9136 | if (IS_ERR(prog)) { | ||
| 9137 | err = PTR_ERR(prog); | ||
| 9138 | goto err_ns; | ||
| 9139 | } | ||
| 9140 | event->prog = prog; | ||
| 9141 | event->orig_overflow_handler = | ||
| 9142 | parent_event->orig_overflow_handler; | ||
| 9143 | } | ||
| 9144 | #endif | ||
| 8960 | } | 9145 | } |
| 8961 | 9146 | ||
| 8962 | if (overflow_handler) { | 9147 | if (overflow_handler) { |
| @@ -9437,6 +9622,9 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 9437 | goto err_alloc; | 9622 | goto err_alloc; |
| 9438 | } | 9623 | } |
| 9439 | 9624 | ||
| 9625 | if (pmu->task_ctx_nr == perf_sw_context) | ||
| 9626 | event->event_caps |= PERF_EV_CAP_SOFTWARE; | ||
| 9627 | |||
| 9440 | if (group_leader && | 9628 | if (group_leader && |
| 9441 | (is_software_event(event) != is_software_event(group_leader))) { | 9629 | (is_software_event(event) != is_software_event(group_leader))) { |
| 9442 | if (is_software_event(event)) { | 9630 | if (is_software_event(event)) { |
| @@ -9450,7 +9638,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 9450 | */ | 9638 | */ |
| 9451 | pmu = group_leader->pmu; | 9639 | pmu = group_leader->pmu; |
| 9452 | } else if (is_software_event(group_leader) && | 9640 | } else if (is_software_event(group_leader) && |
| 9453 | (group_leader->group_flags & PERF_GROUP_SOFTWARE)) { | 9641 | (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { |
| 9454 | /* | 9642 | /* |
| 9455 | * In case the group is a pure software group, and we | 9643 | * In case the group is a pure software group, and we |
| 9456 | * try to add a hardware event, move the whole group to | 9644 | * try to add a hardware event, move the whole group to |
| @@ -10385,6 +10573,8 @@ static void __init perf_event_init_all_cpus(void) | |||
| 10385 | 10573 | ||
| 10386 | INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); | 10574 | INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); |
| 10387 | raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); | 10575 | raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); |
| 10576 | |||
| 10577 | INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); | ||
| 10388 | } | 10578 | } |
| 10389 | } | 10579 | } |
| 10390 | 10580 | ||
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index ae9b90dc9a5a..257fa460b846 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c | |||
| @@ -330,15 +330,22 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, | |||
| 330 | if (!rb) | 330 | if (!rb) |
| 331 | return NULL; | 331 | return NULL; |
| 332 | 332 | ||
| 333 | if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount)) | 333 | if (!rb_has_aux(rb)) |
| 334 | goto err; | 334 | goto err; |
| 335 | 335 | ||
| 336 | /* | 336 | /* |
| 337 | * If rb::aux_mmap_count is zero (and rb_has_aux() above went through), | 337 | * If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(), |
| 338 | * the aux buffer is in perf_mmap_close(), about to get freed. | 338 | * about to get freed, so we leave immediately. |
| 339 | * | ||
| 340 | * Checking rb::aux_mmap_count and rb::refcount has to be done in | ||
| 341 | * the same order, see perf_mmap_close. Otherwise we end up freeing | ||
| 342 | * aux pages in this path, which is a bug, because in_atomic(). | ||
| 339 | */ | 343 | */ |
| 340 | if (!atomic_read(&rb->aux_mmap_count)) | 344 | if (!atomic_read(&rb->aux_mmap_count)) |
| 341 | goto err_put; | 345 | goto err; |
| 346 | |||
| 347 | if (!atomic_inc_not_zero(&rb->aux_refcount)) | ||
| 348 | goto err; | ||
| 342 | 349 | ||
| 343 | /* | 350 | /* |
| 344 | * Nesting is not supported for AUX area, make sure nested | 351 | * Nesting is not supported for AUX area, make sure nested |
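
The reordering in perf_aux_output_begin() above checks rb->aux_mmap_count before trying to pin rb->aux_refcount, in the same order perf_mmap_close() uses on the teardown side, so this (potentially atomic) path can never end up being the one that frees the AUX pages. A compact userspace sketch of the same check-then-pin pattern, with C11 atomics standing in for the kernel's atomic_t helpers:

	#include <stdatomic.h>
	#include <stdbool.h>

	struct rb_like {
		atomic_int aux_mmap_count;	/* owned by the mmap/munmap path */
		atomic_int aux_refcount;	/* pins the AUX pages for writers */
	};

	/* atomic_inc_not_zero() equivalent */
	static bool inc_not_zero(atomic_int *v)
	{
		int old = atomic_load(v);

		while (old != 0) {
			if (atomic_compare_exchange_weak(v, &old, old + 1))
				return true;
		}
		return false;
	}

	/* Mirrors the patched perf_aux_output_begin(): bail out while the
	 * buffer is being unmapped *before* taking a reference. */
	static bool aux_begin(struct rb_like *rb)
	{
		if (atomic_load(&rb->aux_mmap_count) == 0)
			return false;		/* perf_mmap_close() in progress */

		return inc_not_zero(&rb->aux_refcount);
	}

With the old order a caller could take the last aux_refcount reference while the unmap was already in flight and end up doing the final free from a context that must not sleep; the new order avoids that by refusing to take a reference once the unmap has started.
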
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index b7a525ab2083..f9ec9add2164 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
| @@ -150,7 +150,7 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) | |||
| 150 | * Returns 0 on success, -EFAULT on failure. | 150 | * Returns 0 on success, -EFAULT on failure. |
| 151 | */ | 151 | */ |
| 152 | static int __replace_page(struct vm_area_struct *vma, unsigned long addr, | 152 | static int __replace_page(struct vm_area_struct *vma, unsigned long addr, |
| 153 | struct page *page, struct page *kpage) | 153 | struct page *old_page, struct page *new_page) |
| 154 | { | 154 | { |
| 155 | struct mm_struct *mm = vma->vm_mm; | 155 | struct mm_struct *mm = vma->vm_mm; |
| 156 | spinlock_t *ptl; | 156 | spinlock_t *ptl; |
| @@ -161,48 +161,49 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, | |||
| 161 | const unsigned long mmun_end = addr + PAGE_SIZE; | 161 | const unsigned long mmun_end = addr + PAGE_SIZE; |
| 162 | struct mem_cgroup *memcg; | 162 | struct mem_cgroup *memcg; |
| 163 | 163 | ||
| 164 | err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg, | 164 | err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg, |
| 165 | false); | 165 | false); |
| 166 | if (err) | 166 | if (err) |
| 167 | return err; | 167 | return err; |
| 168 | 168 | ||
| 169 | /* For try_to_free_swap() and munlock_vma_page() below */ | 169 | /* For try_to_free_swap() and munlock_vma_page() below */ |
| 170 | lock_page(page); | 170 | lock_page(old_page); |
| 171 | 171 | ||
| 172 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 172 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
| 173 | err = -EAGAIN; | 173 | err = -EAGAIN; |
| 174 | ptep = page_check_address(page, mm, addr, &ptl, 0); | 174 | ptep = page_check_address(old_page, mm, addr, &ptl, 0); |
| 175 | if (!ptep) | 175 | if (!ptep) { |
| 176 | mem_cgroup_cancel_charge(new_page, memcg, false); | ||
| 176 | goto unlock; | 177 | goto unlock; |
| 178 | } | ||
| 177 | 179 | ||
| 178 | get_page(kpage); | 180 | get_page(new_page); |
| 179 | page_add_new_anon_rmap(kpage, vma, addr, false); | 181 | page_add_new_anon_rmap(new_page, vma, addr, false); |
| 180 | mem_cgroup_commit_charge(kpage, memcg, false, false); | 182 | mem_cgroup_commit_charge(new_page, memcg, false, false); |
| 181 | lru_cache_add_active_or_unevictable(kpage, vma); | 183 | lru_cache_add_active_or_unevictable(new_page, vma); |
| 182 | 184 | ||
| 183 | if (!PageAnon(page)) { | 185 | if (!PageAnon(old_page)) { |
| 184 | dec_mm_counter(mm, mm_counter_file(page)); | 186 | dec_mm_counter(mm, mm_counter_file(old_page)); |
| 185 | inc_mm_counter(mm, MM_ANONPAGES); | 187 | inc_mm_counter(mm, MM_ANONPAGES); |
| 186 | } | 188 | } |
| 187 | 189 | ||
| 188 | flush_cache_page(vma, addr, pte_pfn(*ptep)); | 190 | flush_cache_page(vma, addr, pte_pfn(*ptep)); |
| 189 | ptep_clear_flush_notify(vma, addr, ptep); | 191 | ptep_clear_flush_notify(vma, addr, ptep); |
| 190 | set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); | 192 | set_pte_at_notify(mm, addr, ptep, mk_pte(new_page, vma->vm_page_prot)); |
| 191 | 193 | ||
| 192 | page_remove_rmap(page, false); | 194 | page_remove_rmap(old_page, false); |
| 193 | if (!page_mapped(page)) | 195 | if (!page_mapped(old_page)) |
| 194 | try_to_free_swap(page); | 196 | try_to_free_swap(old_page); |
| 195 | pte_unmap_unlock(ptep, ptl); | 197 | pte_unmap_unlock(ptep, ptl); |
| 196 | 198 | ||
| 197 | if (vma->vm_flags & VM_LOCKED) | 199 | if (vma->vm_flags & VM_LOCKED) |
| 198 | munlock_vma_page(page); | 200 | munlock_vma_page(old_page); |
| 199 | put_page(page); | 201 | put_page(old_page); |
| 200 | 202 | ||
| 201 | err = 0; | 203 | err = 0; |
| 202 | unlock: | 204 | unlock: |
| 203 | mem_cgroup_cancel_charge(kpage, memcg, false); | ||
| 204 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 205 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
| 205 | unlock_page(page); | 206 | unlock_page(old_page); |
| 206 | return err; | 207 | return err; |
| 207 | } | 208 | } |
| 208 | 209 | ||
| @@ -299,7 +300,8 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, | |||
| 299 | 300 | ||
| 300 | retry: | 301 | retry: |
| 301 | /* Read the page with vaddr into memory */ | 302 | /* Read the page with vaddr into memory */ |
| 302 | ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); | 303 | ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, |
| 304 | &vma); | ||
| 303 | if (ret <= 0) | 305 | if (ret <= 0) |
| 304 | return ret; | 306 | return ret; |
| 305 | 307 | ||
| @@ -1709,7 +1711,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) | |||
| 1709 | * but we treat this as a 'remote' access since it is | 1711 | * but we treat this as a 'remote' access since it is |
| 1710 | * essentially a kernel access to the memory. | 1712 | * essentially a kernel access to the memory. |
| 1711 | */ | 1713 | */ |
| 1712 | result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL); | 1714 | result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, |
| 1715 | NULL); | ||
| 1713 | if (result < 0) | 1716 | if (result < 0) |
| 1714 | return result; | 1717 | return result; |
| 1715 | 1718 | ||
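
Both get_user_pages_remote() call sites above are adapted to the new calling convention in which the separate write/force ints collapse into a single gup_flags word; the old (0, 1) pair becomes FOLL_FORCE. A sketch of that translation; the FOLL_* values are mirrored from include/linux/mm.h only so the snippet stands alone:

	#define FOLL_WRITE	0x01	/* intend to write to the page */
	#define FOLL_FORCE	0x10	/* override access permissions */

	static unsigned int gup_flags_from_legacy(int write, int force)
	{
		unsigned int gup_flags = 0;

		if (write)
			gup_flags |= FOLL_WRITE;
		if (force)
			gup_flags |= FOLL_FORCE;

		return gup_flags;	/* uprobes: write=0, force=1 -> FOLL_FORCE */
	}
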
diff --git a/kernel/exit.c b/kernel/exit.c index 2f974ae042a6..9d68c45ebbe3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -511,7 +511,7 @@ static void exit_mm(struct task_struct *tsk) | |||
| 511 | mm_update_next_owner(mm); | 511 | mm_update_next_owner(mm); |
| 512 | mmput(mm); | 512 | mmput(mm); |
| 513 | if (test_thread_flag(TIF_MEMDIE)) | 513 | if (test_thread_flag(TIF_MEMDIE)) |
| 514 | exit_oom_victim(tsk); | 514 | exit_oom_victim(); |
| 515 | } | 515 | } |
| 516 | 516 | ||
| 517 | static struct task_struct *find_alive_thread(struct task_struct *p) | 517 | static struct task_struct *find_alive_thread(struct task_struct *p) |
| @@ -725,7 +725,7 @@ static void check_stack_usage(void) | |||
| 725 | static inline void check_stack_usage(void) {} | 725 | static inline void check_stack_usage(void) {} |
| 726 | #endif | 726 | #endif |
| 727 | 727 | ||
| 728 | void do_exit(long code) | 728 | void __noreturn do_exit(long code) |
| 729 | { | 729 | { |
| 730 | struct task_struct *tsk = current; | 730 | struct task_struct *tsk = current; |
| 731 | int group_dead; | 731 | int group_dead; |
| @@ -848,12 +848,7 @@ void do_exit(long code) | |||
| 848 | TASKS_RCU(preempt_enable()); | 848 | TASKS_RCU(preempt_enable()); |
| 849 | exit_notify(tsk, group_dead); | 849 | exit_notify(tsk, group_dead); |
| 850 | proc_exit_connector(tsk); | 850 | proc_exit_connector(tsk); |
| 851 | #ifdef CONFIG_NUMA | 851 | mpol_put_task_policy(tsk); |
| 852 | task_lock(tsk); | ||
| 853 | mpol_put(tsk->mempolicy); | ||
| 854 | tsk->mempolicy = NULL; | ||
| 855 | task_unlock(tsk); | ||
| 856 | #endif | ||
| 857 | #ifdef CONFIG_FUTEX | 852 | #ifdef CONFIG_FUTEX |
| 858 | if (unlikely(current->pi_state_cache)) | 853 | if (unlikely(current->pi_state_cache)) |
| 859 | kfree(current->pi_state_cache); | 854 | kfree(current->pi_state_cache); |
| @@ -887,29 +882,7 @@ void do_exit(long code) | |||
| 887 | exit_rcu(); | 882 | exit_rcu(); |
| 888 | TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); | 883 | TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); |
| 889 | 884 | ||
| 890 | /* | 885 | do_task_dead(); |
| 891 | * The setting of TASK_RUNNING by try_to_wake_up() may be delayed | ||
| 892 | * when the following two conditions become true. | ||
| 893 | * - There is race condition of mmap_sem (It is acquired by | ||
| 894 | * exit_mm()), and | ||
| 895 | * - SMI occurs before setting TASK_RUNINNG. | ||
| 896 | * (or hypervisor of virtual machine switches to other guest) | ||
| 897 | * As a result, we may become TASK_RUNNING after becoming TASK_DEAD | ||
| 898 | * | ||
| 899 | * To avoid it, we have to wait for releasing tsk->pi_lock which | ||
| 900 | * is held by try_to_wake_up() | ||
| 901 | */ | ||
| 902 | smp_mb(); | ||
| 903 | raw_spin_unlock_wait(&tsk->pi_lock); | ||
| 904 | |||
| 905 | /* causes final put_task_struct in finish_task_switch(). */ | ||
| 906 | tsk->state = TASK_DEAD; | ||
| 907 | tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ | ||
| 908 | schedule(); | ||
| 909 | BUG(); | ||
| 910 | /* Avoid "noreturn function does return". */ | ||
| 911 | for (;;) | ||
| 912 | cpu_relax(); /* For when BUG is null */ | ||
| 913 | } | 886 | } |
| 914 | EXPORT_SYMBOL_GPL(do_exit); | 887 | EXPORT_SYMBOL_GPL(do_exit); |
| 915 | 888 | ||
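
The tail of do_exit() is folded into a new do_task_dead() helper, and do_exit() itself gains __noreturn. Reconstructed purely from the lines deleted above (the in-tree helper lives in the scheduler code and may differ in detail), the final sequence has to cover roughly the following:

	void __noreturn do_task_dead_sketch(void)
	{
		/* Wait for a concurrent try_to_wake_up() to drop pi_lock so a
		 * late TASK_RUNNING store cannot overwrite TASK_DEAD. */
		smp_mb();
		raw_spin_unlock_wait(&current->pi_lock);

		/* Final put_task_struct() happens in finish_task_switch(). */
		current->state = TASK_DEAD;
		current->flags |= PF_NOFREEZE;	/* tell the freezer to ignore us */

		schedule();
		BUG();

		/* Avoid "noreturn function does return" if BUG() is a no-op. */
		for (;;)
			cpu_relax();
	}
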
diff --git a/kernel/fork.c b/kernel/fork.c index 52e725d4a866..623259fc794d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -158,19 +158,83 @@ void __weak arch_release_thread_stack(unsigned long *stack) | |||
| 158 | * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a | 158 | * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a |
| 159 | * kmemcache based allocator. | 159 | * kmemcache based allocator. |
| 160 | */ | 160 | */ |
| 161 | # if THREAD_SIZE >= PAGE_SIZE | 161 | # if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) |
| 162 | static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, | 162 | |
| 163 | int node) | 163 | #ifdef CONFIG_VMAP_STACK |
| 164 | /* | ||
| 165 | * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB | ||
| 166 | * flush. Try to minimize the number of calls by caching stacks. | ||
| 167 | */ | ||
| 168 | #define NR_CACHED_STACKS 2 | ||
| 169 | static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); | ||
| 170 | #endif | ||
| 171 | |||
| 172 | static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) | ||
| 164 | { | 173 | { |
| 174 | #ifdef CONFIG_VMAP_STACK | ||
| 175 | void *stack; | ||
| 176 | int i; | ||
| 177 | |||
| 178 | local_irq_disable(); | ||
| 179 | for (i = 0; i < NR_CACHED_STACKS; i++) { | ||
| 180 | struct vm_struct *s = this_cpu_read(cached_stacks[i]); | ||
| 181 | |||
| 182 | if (!s) | ||
| 183 | continue; | ||
| 184 | this_cpu_write(cached_stacks[i], NULL); | ||
| 185 | |||
| 186 | tsk->stack_vm_area = s; | ||
| 187 | local_irq_enable(); | ||
| 188 | return s->addr; | ||
| 189 | } | ||
| 190 | local_irq_enable(); | ||
| 191 | |||
| 192 | stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, | ||
| 193 | VMALLOC_START, VMALLOC_END, | ||
| 194 | THREADINFO_GFP | __GFP_HIGHMEM, | ||
| 195 | PAGE_KERNEL, | ||
| 196 | 0, node, __builtin_return_address(0)); | ||
| 197 | |||
| 198 | /* | ||
| 199 | * We can't call find_vm_area() in interrupt context, and | ||
| 200 | * free_thread_stack() can be called in interrupt context, | ||
| 201 | * so cache the vm_struct. | ||
| 202 | */ | ||
| 203 | if (stack) | ||
| 204 | tsk->stack_vm_area = find_vm_area(stack); | ||
| 205 | return stack; | ||
| 206 | #else | ||
| 165 | struct page *page = alloc_pages_node(node, THREADINFO_GFP, | 207 | struct page *page = alloc_pages_node(node, THREADINFO_GFP, |
| 166 | THREAD_SIZE_ORDER); | 208 | THREAD_SIZE_ORDER); |
| 167 | 209 | ||
| 168 | return page ? page_address(page) : NULL; | 210 | return page ? page_address(page) : NULL; |
| 211 | #endif | ||
| 169 | } | 212 | } |
| 170 | 213 | ||
| 171 | static inline void free_thread_stack(unsigned long *stack) | 214 | static inline void free_thread_stack(struct task_struct *tsk) |
| 172 | { | 215 | { |
| 173 | __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER); | 216 | #ifdef CONFIG_VMAP_STACK |
| 217 | if (task_stack_vm_area(tsk)) { | ||
| 218 | unsigned long flags; | ||
| 219 | int i; | ||
| 220 | |||
| 221 | local_irq_save(flags); | ||
| 222 | for (i = 0; i < NR_CACHED_STACKS; i++) { | ||
| 223 | if (this_cpu_read(cached_stacks[i])) | ||
| 224 | continue; | ||
| 225 | |||
| 226 | this_cpu_write(cached_stacks[i], tsk->stack_vm_area); | ||
| 227 | local_irq_restore(flags); | ||
| 228 | return; | ||
| 229 | } | ||
| 230 | local_irq_restore(flags); | ||
| 231 | |||
| 232 | vfree(tsk->stack); | ||
| 233 | return; | ||
| 234 | } | ||
| 235 | #endif | ||
| 236 | |||
| 237 | __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER); | ||
| 174 | } | 238 | } |
| 175 | # else | 239 | # else |
| 176 | static struct kmem_cache *thread_stack_cache; | 240 | static struct kmem_cache *thread_stack_cache; |
| @@ -181,9 +245,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, | |||
| 181 | return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); | 245 | return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); |
| 182 | } | 246 | } |
| 183 | 247 | ||
| 184 | static void free_thread_stack(unsigned long *stack) | 248 | static void free_thread_stack(struct task_struct *tsk) |
| 185 | { | 249 | { |
| 186 | kmem_cache_free(thread_stack_cache, stack); | 250 | kmem_cache_free(thread_stack_cache, tsk->stack); |
| 187 | } | 251 | } |
| 188 | 252 | ||
| 189 | void thread_stack_cache_init(void) | 253 | void thread_stack_cache_init(void) |
| @@ -213,24 +277,76 @@ struct kmem_cache *vm_area_cachep; | |||
| 213 | /* SLAB cache for mm_struct structures (tsk->mm) */ | 277 | /* SLAB cache for mm_struct structures (tsk->mm) */ |
| 214 | static struct kmem_cache *mm_cachep; | 278 | static struct kmem_cache *mm_cachep; |
| 215 | 279 | ||
| 216 | static void account_kernel_stack(unsigned long *stack, int account) | 280 | static void account_kernel_stack(struct task_struct *tsk, int account) |
| 217 | { | 281 | { |
| 218 | /* All stack pages are in the same zone and belong to the same memcg. */ | 282 | void *stack = task_stack_page(tsk); |
| 219 | struct page *first_page = virt_to_page(stack); | 283 | struct vm_struct *vm = task_stack_vm_area(tsk); |
| 284 | |||
| 285 | BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); | ||
| 286 | |||
| 287 | if (vm) { | ||
| 288 | int i; | ||
| 220 | 289 | ||
| 221 | mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, | 290 | BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); |
| 222 | THREAD_SIZE / 1024 * account); | ||
| 223 | 291 | ||
| 224 | memcg_kmem_update_page_stat( | 292 | for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { |
| 225 | first_page, MEMCG_KERNEL_STACK_KB, | 293 | mod_zone_page_state(page_zone(vm->pages[i]), |
| 226 | account * (THREAD_SIZE / 1024)); | 294 | NR_KERNEL_STACK_KB, |
| 295 | PAGE_SIZE / 1024 * account); | ||
| 296 | } | ||
| 297 | |||
| 298 | /* All stack pages belong to the same memcg. */ | ||
| 299 | memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB, | ||
| 300 | account * (THREAD_SIZE / 1024)); | ||
| 301 | } else { | ||
| 302 | /* | ||
| 303 | * All stack pages are in the same zone and belong to the | ||
| 304 | * same memcg. | ||
| 305 | */ | ||
| 306 | struct page *first_page = virt_to_page(stack); | ||
| 307 | |||
| 308 | mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, | ||
| 309 | THREAD_SIZE / 1024 * account); | ||
| 310 | |||
| 311 | memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB, | ||
| 312 | account * (THREAD_SIZE / 1024)); | ||
| 313 | } | ||
| 227 | } | 314 | } |
| 228 | 315 | ||
| 229 | void free_task(struct task_struct *tsk) | 316 | static void release_task_stack(struct task_struct *tsk) |
| 230 | { | 317 | { |
| 231 | account_kernel_stack(tsk->stack, -1); | 318 | account_kernel_stack(tsk, -1); |
| 232 | arch_release_thread_stack(tsk->stack); | 319 | arch_release_thread_stack(tsk->stack); |
| 233 | free_thread_stack(tsk->stack); | 320 | free_thread_stack(tsk); |
| 321 | tsk->stack = NULL; | ||
| 322 | #ifdef CONFIG_VMAP_STACK | ||
| 323 | tsk->stack_vm_area = NULL; | ||
| 324 | #endif | ||
| 325 | } | ||
| 326 | |||
| 327 | #ifdef CONFIG_THREAD_INFO_IN_TASK | ||
| 328 | void put_task_stack(struct task_struct *tsk) | ||
| 329 | { | ||
| 330 | if (atomic_dec_and_test(&tsk->stack_refcount)) | ||
| 331 | release_task_stack(tsk); | ||
| 332 | } | ||
| 333 | #endif | ||
| 334 | |||
| 335 | void free_task(struct task_struct *tsk) | ||
| 336 | { | ||
| 337 | #ifndef CONFIG_THREAD_INFO_IN_TASK | ||
| 338 | /* | ||
| 339 | * The task is finally done with both the stack and thread_info, | ||
| 340 | * so free both. | ||
| 341 | */ | ||
| 342 | release_task_stack(tsk); | ||
| 343 | #else | ||
| 344 | /* | ||
| 345 | * If the task had a separate stack allocation, it should be gone | ||
| 346 | * by now. | ||
| 347 | */ | ||
| 348 | WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0); | ||
| 349 | #endif | ||
| 234 | rt_mutex_debug_task_free(tsk); | 350 | rt_mutex_debug_task_free(tsk); |
| 235 | ftrace_graph_exit_task(tsk); | 351 | ftrace_graph_exit_task(tsk); |
| 236 | put_seccomp_filter(tsk); | 352 | put_seccomp_filter(tsk); |
| @@ -243,6 +359,12 @@ static inline void free_signal_struct(struct signal_struct *sig) | |||
| 243 | { | 359 | { |
| 244 | taskstats_tgid_free(sig); | 360 | taskstats_tgid_free(sig); |
| 245 | sched_autogroup_exit(sig); | 361 | sched_autogroup_exit(sig); |
| 362 | /* | ||
| 363 | * __mmdrop is not safe to call from softirq context on x86 due to | ||
| 364 | * pgd_dtor so postpone it to the async context | ||
| 365 | */ | ||
| 366 | if (sig->oom_mm) | ||
| 367 | mmdrop_async(sig->oom_mm); | ||
| 246 | kmem_cache_free(signal_cachep, sig); | 368 | kmem_cache_free(signal_cachep, sig); |
| 247 | } | 369 | } |
| 248 | 370 | ||
| @@ -302,6 +424,7 @@ int arch_task_struct_size __read_mostly; | |||
| 302 | 424 | ||
| 303 | void __init fork_init(void) | 425 | void __init fork_init(void) |
| 304 | { | 426 | { |
| 427 | int i; | ||
| 305 | #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR | 428 | #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR |
| 306 | #ifndef ARCH_MIN_TASKALIGN | 429 | #ifndef ARCH_MIN_TASKALIGN |
| 307 | #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES | 430 | #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES |
| @@ -321,6 +444,10 @@ void __init fork_init(void) | |||
| 321 | init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; | 444 | init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; |
| 322 | init_task.signal->rlim[RLIMIT_SIGPENDING] = | 445 | init_task.signal->rlim[RLIMIT_SIGPENDING] = |
| 323 | init_task.signal->rlim[RLIMIT_NPROC]; | 446 | init_task.signal->rlim[RLIMIT_NPROC]; |
| 447 | |||
| 448 | for (i = 0; i < UCOUNT_COUNTS; i++) { | ||
| 449 | init_user_ns.ucount_max[i] = max_threads/2; | ||
| 450 | } | ||
| 324 | } | 451 | } |
| 325 | 452 | ||
| 326 | int __weak arch_dup_task_struct(struct task_struct *dst, | 453 | int __weak arch_dup_task_struct(struct task_struct *dst, |
| @@ -342,6 +469,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) | |||
| 342 | { | 469 | { |
| 343 | struct task_struct *tsk; | 470 | struct task_struct *tsk; |
| 344 | unsigned long *stack; | 471 | unsigned long *stack; |
| 472 | struct vm_struct *stack_vm_area; | ||
| 345 | int err; | 473 | int err; |
| 346 | 474 | ||
| 347 | if (node == NUMA_NO_NODE) | 475 | if (node == NUMA_NO_NODE) |
| @@ -354,11 +482,26 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) | |||
| 354 | if (!stack) | 482 | if (!stack) |
| 355 | goto free_tsk; | 483 | goto free_tsk; |
| 356 | 484 | ||
| 485 | stack_vm_area = task_stack_vm_area(tsk); | ||
| 486 | |||
| 357 | err = arch_dup_task_struct(tsk, orig); | 487 | err = arch_dup_task_struct(tsk, orig); |
| 488 | |||
| 489 | /* | ||
| 490 | * arch_dup_task_struct() clobbers the stack-related fields. Make | ||
| 491 | * sure they're properly initialized before using any stack-related | ||
| 492 | * functions again. | ||
| 493 | */ | ||
| 494 | tsk->stack = stack; | ||
| 495 | #ifdef CONFIG_VMAP_STACK | ||
| 496 | tsk->stack_vm_area = stack_vm_area; | ||
| 497 | #endif | ||
| 498 | #ifdef CONFIG_THREAD_INFO_IN_TASK | ||
| 499 | atomic_set(&tsk->stack_refcount, 1); | ||
| 500 | #endif | ||
| 501 | |||
| 358 | if (err) | 502 | if (err) |
| 359 | goto free_stack; | 503 | goto free_stack; |
| 360 | 504 | ||
| 361 | tsk->stack = stack; | ||
| 362 | #ifdef CONFIG_SECCOMP | 505 | #ifdef CONFIG_SECCOMP |
| 363 | /* | 506 | /* |
| 364 | * We must handle setting up seccomp filters once we're under | 507 | * We must handle setting up seccomp filters once we're under |
| @@ -390,21 +533,22 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) | |||
| 390 | tsk->task_frag.page = NULL; | 533 | tsk->task_frag.page = NULL; |
| 391 | tsk->wake_q.next = NULL; | 534 | tsk->wake_q.next = NULL; |
| 392 | 535 | ||
| 393 | account_kernel_stack(stack, 1); | 536 | account_kernel_stack(tsk, 1); |
| 394 | 537 | ||
| 395 | kcov_task_init(tsk); | 538 | kcov_task_init(tsk); |
| 396 | 539 | ||
| 397 | return tsk; | 540 | return tsk; |
| 398 | 541 | ||
| 399 | free_stack: | 542 | free_stack: |
| 400 | free_thread_stack(stack); | 543 | free_thread_stack(tsk); |
| 401 | free_tsk: | 544 | free_tsk: |
| 402 | free_task_struct(tsk); | 545 | free_task_struct(tsk); |
| 403 | return NULL; | 546 | return NULL; |
| 404 | } | 547 | } |
| 405 | 548 | ||
| 406 | #ifdef CONFIG_MMU | 549 | #ifdef CONFIG_MMU |
| 407 | static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | 550 | static __latent_entropy int dup_mmap(struct mm_struct *mm, |
| 551 | struct mm_struct *oldmm) | ||
| 408 | { | 552 | { |
| 409 | struct vm_area_struct *mpnt, *tmp, *prev, **pprev; | 553 | struct vm_area_struct *mpnt, *tmp, *prev, **pprev; |
| 410 | struct rb_node **rb_link, *rb_parent; | 554 | struct rb_node **rb_link, *rb_parent; |
| @@ -711,6 +855,7 @@ static inline void __mmput(struct mm_struct *mm) | |||
| 711 | ksm_exit(mm); | 855 | ksm_exit(mm); |
| 712 | khugepaged_exit(mm); /* must run before exit_mmap */ | 856 | khugepaged_exit(mm); /* must run before exit_mmap */ |
| 713 | exit_mmap(mm); | 857 | exit_mmap(mm); |
| 858 | mm_put_huge_zero_page(mm); | ||
| 714 | set_mm_exe_file(mm, NULL); | 859 | set_mm_exe_file(mm, NULL); |
| 715 | if (!list_empty(&mm->mmlist)) { | 860 | if (!list_empty(&mm->mmlist)) { |
| 716 | spin_lock(&mmlist_lock); | 861 | spin_lock(&mmlist_lock); |
| @@ -719,6 +864,7 @@ static inline void __mmput(struct mm_struct *mm) | |||
| 719 | } | 864 | } |
| 720 | if (mm->binfmt) | 865 | if (mm->binfmt) |
| 721 | module_put(mm->binfmt->module); | 866 | module_put(mm->binfmt->module); |
| 867 | set_bit(MMF_OOM_SKIP, &mm->flags); | ||
| 722 | mmdrop(mm); | 868 | mmdrop(mm); |
| 723 | } | 869 | } |
| 724 | 870 | ||
| @@ -799,6 +945,29 @@ struct file *get_mm_exe_file(struct mm_struct *mm) | |||
| 799 | EXPORT_SYMBOL(get_mm_exe_file); | 945 | EXPORT_SYMBOL(get_mm_exe_file); |
| 800 | 946 | ||
| 801 | /** | 947 | /** |
| 948 | * get_task_exe_file - acquire a reference to the task's executable file | ||
| 949 | * | ||
| 950 | * Returns %NULL if task's mm (if any) has no associated executable file or | ||
| 951 | * this is a kernel thread with borrowed mm (see the comment above get_task_mm). | ||
| 952 | * User must release file via fput(). | ||
| 953 | */ | ||
| 954 | struct file *get_task_exe_file(struct task_struct *task) | ||
| 955 | { | ||
| 956 | struct file *exe_file = NULL; | ||
| 957 | struct mm_struct *mm; | ||
| 958 | |||
| 959 | task_lock(task); | ||
| 960 | mm = task->mm; | ||
| 961 | if (mm) { | ||
| 962 | if (!(task->flags & PF_KTHREAD)) | ||
| 963 | exe_file = get_mm_exe_file(mm); | ||
| 964 | } | ||
| 965 | task_unlock(task); | ||
| 966 | return exe_file; | ||
| 967 | } | ||
| 968 | EXPORT_SYMBOL(get_task_exe_file); | ||
| 969 | |||
| 970 | /** | ||
| 802 | * get_task_mm - acquire a reference to the task's mm | 971 | * get_task_mm - acquire a reference to the task's mm |
| 803 | * | 972 | * |
| 804 | * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning | 973 | * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning |
| @@ -913,14 +1082,12 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) | |||
| 913 | deactivate_mm(tsk, mm); | 1082 | deactivate_mm(tsk, mm); |
| 914 | 1083 | ||
| 915 | /* | 1084 | /* |
| 916 | * If we're exiting normally, clear a user-space tid field if | 1085 | * Signal userspace if we're not exiting with a core dump |
| 917 | * requested. We leave this alone when dying by signal, to leave | 1086 | * because we want to leave the value intact for debugging |
| 918 | * the value intact in a core dump, and to save the unnecessary | 1087 | * purposes. |
| 919 | * trouble, say, a killed vfork parent shouldn't touch this mm. | ||
| 920 | * Userland only wants this done for a sys_exit. | ||
| 921 | */ | 1088 | */ |
| 922 | if (tsk->clear_child_tid) { | 1089 | if (tsk->clear_child_tid) { |
| 923 | if (!(tsk->flags & PF_SIGNALED) && | 1090 | if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) && |
| 924 | atomic_read(&mm->mm_users) > 1) { | 1091 | atomic_read(&mm->mm_users) > 1) { |
| 925 | /* | 1092 | /* |
| 926 | * We don't check the error code - if userspace has | 1093 | * We don't check the error code - if userspace has |
| @@ -1275,7 +1442,8 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) | |||
| 1275 | * parts of the process environment (as per the clone | 1442 | * parts of the process environment (as per the clone |
| 1276 | * flags). The actual kick-off is left to the caller. | 1443 | * flags). The actual kick-off is left to the caller. |
| 1277 | */ | 1444 | */ |
| 1278 | static struct task_struct *copy_process(unsigned long clone_flags, | 1445 | static __latent_entropy struct task_struct *copy_process( |
| 1446 | unsigned long clone_flags, | ||
| 1279 | unsigned long stack_start, | 1447 | unsigned long stack_start, |
| 1280 | unsigned long stack_size, | 1448 | unsigned long stack_size, |
| 1281 | int __user *child_tidptr, | 1449 | int __user *child_tidptr, |
| @@ -1404,7 +1572,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1404 | p->real_start_time = ktime_get_boot_ns(); | 1572 | p->real_start_time = ktime_get_boot_ns(); |
| 1405 | p->io_context = NULL; | 1573 | p->io_context = NULL; |
| 1406 | p->audit_context = NULL; | 1574 | p->audit_context = NULL; |
| 1407 | threadgroup_change_begin(current); | ||
| 1408 | cgroup_fork(p); | 1575 | cgroup_fork(p); |
| 1409 | #ifdef CONFIG_NUMA | 1576 | #ifdef CONFIG_NUMA |
| 1410 | p->mempolicy = mpol_dup(p->mempolicy); | 1577 | p->mempolicy = mpol_dup(p->mempolicy); |
| @@ -1556,6 +1723,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1556 | INIT_LIST_HEAD(&p->thread_group); | 1723 | INIT_LIST_HEAD(&p->thread_group); |
| 1557 | p->task_works = NULL; | 1724 | p->task_works = NULL; |
| 1558 | 1725 | ||
| 1726 | threadgroup_change_begin(current); | ||
| 1559 | /* | 1727 | /* |
| 1560 | * Ensure that the cgroup subsystem policies allow the new process to be | 1728 | * Ensure that the cgroup subsystem policies allow the new process to be |
| 1561 | * forked. It should be noted that the new process's css_set can be changed | 1729 |
| @@ -1656,6 +1824,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1656 | bad_fork_cancel_cgroup: | 1824 | bad_fork_cancel_cgroup: |
| 1657 | cgroup_cancel_fork(p); | 1825 | cgroup_cancel_fork(p); |
| 1658 | bad_fork_free_pid: | 1826 | bad_fork_free_pid: |
| 1827 | threadgroup_change_end(current); | ||
| 1659 | if (pid != &init_struct_pid) | 1828 | if (pid != &init_struct_pid) |
| 1660 | free_pid(pid); | 1829 | free_pid(pid); |
| 1661 | bad_fork_cleanup_thread: | 1830 | bad_fork_cleanup_thread: |
| @@ -1688,12 +1857,12 @@ bad_fork_cleanup_policy: | |||
| 1688 | mpol_put(p->mempolicy); | 1857 | mpol_put(p->mempolicy); |
| 1689 | bad_fork_cleanup_threadgroup_lock: | 1858 | bad_fork_cleanup_threadgroup_lock: |
| 1690 | #endif | 1859 | #endif |
| 1691 | threadgroup_change_end(current); | ||
| 1692 | delayacct_tsk_free(p); | 1860 | delayacct_tsk_free(p); |
| 1693 | bad_fork_cleanup_count: | 1861 | bad_fork_cleanup_count: |
| 1694 | atomic_dec(&p->cred->user->processes); | 1862 | atomic_dec(&p->cred->user->processes); |
| 1695 | exit_creds(p); | 1863 | exit_creds(p); |
| 1696 | bad_fork_free: | 1864 | bad_fork_free: |
| 1865 | put_task_stack(p); | ||
| 1697 | free_task(p); | 1866 | free_task(p); |
| 1698 | fork_out: | 1867 | fork_out: |
| 1699 | return ERR_PTR(retval); | 1868 | return ERR_PTR(retval); |
| @@ -1759,6 +1928,7 @@ long _do_fork(unsigned long clone_flags, | |||
| 1759 | 1928 | ||
| 1760 | p = copy_process(clone_flags, stack_start, stack_size, | 1929 | p = copy_process(clone_flags, stack_start, stack_size, |
| 1761 | child_tidptr, NULL, trace, tls, NUMA_NO_NODE); | 1930 | child_tidptr, NULL, trace, tls, NUMA_NO_NODE); |
| 1931 | add_latent_entropy(); | ||
| 1762 | /* | 1932 | /* |
| 1763 | * Do this prior waking up the new thread - the thread pointer | 1933 | * Do this prior waking up the new thread - the thread pointer |
| 1764 | * might get invalid after that point, if the thread exits quickly. | 1934 | * might get invalid after that point, if the thread exits quickly. |
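
Most of the fork.c churn above is the CONFIG_VMAP_STACK support: thread stacks may now live in vmalloc space, and because vfree() is comparatively expensive (it can force a TLB flush), up to NR_CACHED_STACKS recently freed stacks are parked in a per-CPU cache and handed straight to the next fork. A stripped-down userspace sketch of that cache-or-allocate pattern, with a single global array instead of per-CPU storage, malloc()/free() instead of the vmap allocator, and no interrupt masking:

	#include <stdlib.h>

	#define NR_CACHED_STACKS 2
	#define STACK_SIZE	 (16 * 1024)

	static void *cached_stacks[NR_CACHED_STACKS];

	static void *alloc_stack(void)
	{
		for (int i = 0; i < NR_CACHED_STACKS; i++) {
			if (cached_stacks[i]) {
				void *s = cached_stacks[i];

				cached_stacks[i] = NULL;
				return s;		/* reuse, no allocator call */
			}
		}
		return malloc(STACK_SIZE);		/* slow path */
	}

	static void free_stack(void *stack)
	{
		for (int i = 0; i < NR_CACHED_STACKS; i++) {
			if (!cached_stacks[i]) {
				cached_stacks[i] = stack;	/* park for reuse */
				return;
			}
		}
		free(stack);				/* cache full, really free */
	}

The same hunks also add get_task_exe_file(), a task_lock()-protected way to take a reference on a task's executable file, and (under CONFIG_THREAD_INFO_IN_TASK) a stack_refcount so the stack stays alive until the last put_task_stack().
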
diff --git a/kernel/futex.c b/kernel/futex.c index 46cb3a301bc1..2c4be467fecd 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
| @@ -381,8 +381,12 @@ static inline int hb_waiters_pending(struct futex_hash_bucket *hb) | |||
| 381 | #endif | 381 | #endif |
| 382 | } | 382 | } |
| 383 | 383 | ||
| 384 | /* | 384 | /** |
| 385 | * We hash on the keys returned from get_futex_key (see below). | 385 | * hash_futex - Return the hash bucket in the global hash |
| 386 | * @key: Pointer to the futex key for which the hash is calculated | ||
| 387 | * | ||
| 388 | * We hash on the keys returned from get_futex_key (see below) and return the | ||
| 389 | * corresponding hash bucket in the global hash. | ||
| 386 | */ | 390 | */ |
| 387 | static struct futex_hash_bucket *hash_futex(union futex_key *key) | 391 | static struct futex_hash_bucket *hash_futex(union futex_key *key) |
| 388 | { | 392 | { |
| @@ -392,7 +396,12 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key) | |||
| 392 | return &futex_queues[hash & (futex_hashsize - 1)]; | 396 | return &futex_queues[hash & (futex_hashsize - 1)]; |
| 393 | } | 397 | } |
| 394 | 398 | ||
| 395 | /* | 399 | |
| 400 | /** | ||
| 401 | * match_futex - Check whether two futex keys are equal | ||
| 402 | * @key1: Pointer to key1 | ||
| 403 | * @key2: Pointer to key2 | ||
| 404 | * | ||
| 396 | * Return 1 if two futex_keys are equal, 0 otherwise. | 405 | * Return 1 if two futex_keys are equal, 0 otherwise. |
| 397 | */ | 406 | */ |
| 398 | static inline int match_futex(union futex_key *key1, union futex_key *key2) | 407 | static inline int match_futex(union futex_key *key1, union futex_key *key2) |
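
The futex.c hunks only add kerneldoc, but the function being documented shows the standard power-of-two bucket trick: because the global table is sized to a power of two, hash & (futex_hashsize - 1) selects a bucket without a modulo. A self-contained illustration; jhash2() is replaced by a toy mixer purely so the example compiles on its own:

	#include <stdint.h>
	#include <stdio.h>

	#define HASH_SIZE 256	/* must stay a power of two for the mask trick */

	static uint32_t toy_mix(uint32_t a, uint32_t b, uint32_t c)
	{
		/* stand-in for jhash2(); not a real hash function */
		return a * 2654435761u ^ b * 40503u ^ c;
	}

	static unsigned int bucket_of(uint32_t word, uint32_t offset, uint32_t seed)
	{
		return toy_mix(word, offset, seed) & (HASH_SIZE - 1);
	}

	int main(void)
	{
		printf("bucket %u\n", bucket_of(0x1234, 0x10, 0xdeadbeef));
		return 0;
	}
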
diff --git a/kernel/groups.c b/kernel/groups.c index 74d431d25251..2fcadd66a8fd 100644 --- a/kernel/groups.c +++ b/kernel/groups.c | |||
| @@ -7,55 +7,31 @@ | |||
| 7 | #include <linux/security.h> | 7 | #include <linux/security.h> |
| 8 | #include <linux/syscalls.h> | 8 | #include <linux/syscalls.h> |
| 9 | #include <linux/user_namespace.h> | 9 | #include <linux/user_namespace.h> |
| 10 | #include <linux/vmalloc.h> | ||
| 10 | #include <asm/uaccess.h> | 11 | #include <asm/uaccess.h> |
| 11 | 12 | ||
| 12 | struct group_info *groups_alloc(int gidsetsize) | 13 | struct group_info *groups_alloc(int gidsetsize) |
| 13 | { | 14 | { |
| 14 | struct group_info *group_info; | 15 | struct group_info *gi; |
| 15 | int nblocks; | 16 | unsigned int len; |
| 16 | int i; | 17 | |
| 17 | 18 | len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; | |
| 18 | nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK; | 19 | gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); |
| 19 | /* Make sure we always allocate at least one indirect block pointer */ | 20 | if (!gi) |
| 20 | nblocks = nblocks ? : 1; | 21 | gi = __vmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_HIGHMEM, PAGE_KERNEL); |
| 21 | group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER); | 22 | if (!gi) |
| 22 | if (!group_info) | ||
| 23 | return NULL; | 23 | return NULL; |
| 24 | group_info->ngroups = gidsetsize; | ||
| 25 | group_info->nblocks = nblocks; | ||
| 26 | atomic_set(&group_info->usage, 1); | ||
| 27 | |||
| 28 | if (gidsetsize <= NGROUPS_SMALL) | ||
| 29 | group_info->blocks[0] = group_info->small_block; | ||
| 30 | else { | ||
| 31 | for (i = 0; i < nblocks; i++) { | ||
| 32 | kgid_t *b; | ||
| 33 | b = (void *)__get_free_page(GFP_USER); | ||
| 34 | if (!b) | ||
| 35 | goto out_undo_partial_alloc; | ||
| 36 | group_info->blocks[i] = b; | ||
| 37 | } | ||
| 38 | } | ||
| 39 | return group_info; | ||
| 40 | 24 | ||
| 41 | out_undo_partial_alloc: | 25 | atomic_set(&gi->usage, 1); |
| 42 | while (--i >= 0) { | 26 | gi->ngroups = gidsetsize; |
| 43 | free_page((unsigned long)group_info->blocks[i]); | 27 | return gi; |
| 44 | } | ||
| 45 | kfree(group_info); | ||
| 46 | return NULL; | ||
| 47 | } | 28 | } |
| 48 | 29 | ||
| 49 | EXPORT_SYMBOL(groups_alloc); | 30 | EXPORT_SYMBOL(groups_alloc); |
| 50 | 31 | ||
| 51 | void groups_free(struct group_info *group_info) | 32 | void groups_free(struct group_info *group_info) |
| 52 | { | 33 | { |
| 53 | if (group_info->blocks[0] != group_info->small_block) { | 34 | kvfree(group_info); |
| 54 | int i; | ||
| 55 | for (i = 0; i < group_info->nblocks; i++) | ||
| 56 | free_page((unsigned long)group_info->blocks[i]); | ||
| 57 | } | ||
| 58 | kfree(group_info); | ||
| 59 | } | 35 | } |
| 60 | 36 | ||
| 61 | EXPORT_SYMBOL(groups_free); | 37 | EXPORT_SYMBOL(groups_free); |
| @@ -70,7 +46,7 @@ static int groups_to_user(gid_t __user *grouplist, | |||
| 70 | 46 | ||
| 71 | for (i = 0; i < count; i++) { | 47 | for (i = 0; i < count; i++) { |
| 72 | gid_t gid; | 48 | gid_t gid; |
| 73 | gid = from_kgid_munged(user_ns, GROUP_AT(group_info, i)); | 49 | gid = from_kgid_munged(user_ns, group_info->gid[i]); |
| 74 | if (put_user(gid, grouplist+i)) | 50 | if (put_user(gid, grouplist+i)) |
| 75 | return -EFAULT; | 51 | return -EFAULT; |
| 76 | } | 52 | } |
| @@ -95,7 +71,7 @@ static int groups_from_user(struct group_info *group_info, | |||
| 95 | if (!gid_valid(kgid)) | 71 | if (!gid_valid(kgid)) |
| 96 | return -EINVAL; | 72 | return -EINVAL; |
| 97 | 73 | ||
| 98 | GROUP_AT(group_info, i) = kgid; | 74 | group_info->gid[i] = kgid; |
| 99 | } | 75 | } |
| 100 | return 0; | 76 | return 0; |
| 101 | } | 77 | } |
| @@ -115,15 +91,14 @@ static void groups_sort(struct group_info *group_info) | |||
| 115 | for (base = 0; base < max; base++) { | 91 | for (base = 0; base < max; base++) { |
| 116 | int left = base; | 92 | int left = base; |
| 117 | int right = left + stride; | 93 | int right = left + stride; |
| 118 | kgid_t tmp = GROUP_AT(group_info, right); | 94 | kgid_t tmp = group_info->gid[right]; |
| 119 | 95 | ||
| 120 | while (left >= 0 && gid_gt(GROUP_AT(group_info, left), tmp)) { | 96 | while (left >= 0 && gid_gt(group_info->gid[left], tmp)) { |
| 121 | GROUP_AT(group_info, right) = | 97 | group_info->gid[right] = group_info->gid[left]; |
| 122 | GROUP_AT(group_info, left); | ||
| 123 | right = left; | 98 | right = left; |
| 124 | left -= stride; | 99 | left -= stride; |
| 125 | } | 100 | } |
| 126 | GROUP_AT(group_info, right) = tmp; | 101 | group_info->gid[right] = tmp; |
| 127 | } | 102 | } |
| 128 | stride /= 3; | 103 | stride /= 3; |
| 129 | } | 104 | } |
| @@ -141,9 +116,9 @@ int groups_search(const struct group_info *group_info, kgid_t grp) | |||
| 141 | right = group_info->ngroups; | 116 | right = group_info->ngroups; |
| 142 | while (left < right) { | 117 | while (left < right) { |
| 143 | unsigned int mid = (left+right)/2; | 118 | unsigned int mid = (left+right)/2; |
| 144 | if (gid_gt(grp, GROUP_AT(group_info, mid))) | 119 | if (gid_gt(grp, group_info->gid[mid])) |
| 145 | left = mid + 1; | 120 | left = mid + 1; |
| 146 | else if (gid_lt(grp, GROUP_AT(group_info, mid))) | 121 | else if (gid_lt(grp, group_info->gid[mid])) |
| 147 | right = mid; | 122 | right = mid; |
| 148 | else | 123 | else |
| 149 | return 1; | 124 | return 1; |
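
groups.c drops the old two-level block array in favour of a flat gid[] array at the end of struct group_info, which is why every GROUP_AT(gi, i) access above becomes gi->gid[i], why the allocation becomes a single kmalloc-with-vmalloc-fallback, and why groups_free() collapses to kvfree(). The sort that flat array feeds is a plain Shell sort with a 3x+1 gap sequence; here is the same algorithm over an int array so it can be compiled and poked at outside the kernel:

	#include <stdio.h>

	static void shell_sort(int *a, int n)
	{
		int stride;

		/* grow the gap: 1, 4, 13, 40, ... (3x+1 sequence) */
		for (stride = 1; stride < n; stride = 3 * stride + 1)
			;

		for (stride /= 3; stride > 0; stride /= 3) {
			/* gapped insertion sort for this stride */
			for (int base = 0; base + stride < n; base++) {
				int tmp = a[base + stride];
				int left = base;

				while (left >= 0 && a[left] > tmp) {
					a[left + stride] = a[left];
					left -= stride;
				}
				a[left + stride] = tmp;
			}
		}
	}

	int main(void)
	{
		int a[] = { 42, 7, 19, 7, 3, 100, 1 };
		int n = sizeof(a) / sizeof(a[0]);

		shell_sort(a, n);
		for (int i = 0; i < n; i++)
			printf("%d ", a[i]);
		printf("\n");
		return 0;
	}
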
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index d234022805dc..2b59c82cc3e1 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
| @@ -98,26 +98,26 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) | |||
| 98 | 98 | ||
| 99 | trace_sched_process_hang(t); | 99 | trace_sched_process_hang(t); |
| 100 | 100 | ||
| 101 | if (!sysctl_hung_task_warnings) | 101 | if (!sysctl_hung_task_warnings && !sysctl_hung_task_panic) |
| 102 | return; | 102 | return; |
| 103 | 103 | ||
| 104 | if (sysctl_hung_task_warnings > 0) | ||
| 105 | sysctl_hung_task_warnings--; | ||
| 106 | |||
| 107 | /* | 104 | /* |
| 108 | * Ok, the task did not get scheduled for more than 2 minutes, | 105 | * Ok, the task did not get scheduled for more than 2 minutes, |
| 109 | * complain: | 106 | * complain: |
| 110 | */ | 107 | */ |
| 111 | pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", | 108 | if (sysctl_hung_task_warnings) { |
| 112 | t->comm, t->pid, timeout); | 109 | sysctl_hung_task_warnings--; |
| 113 | pr_err(" %s %s %.*s\n", | 110 | pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", |
| 114 | print_tainted(), init_utsname()->release, | 111 | t->comm, t->pid, timeout); |
| 115 | (int)strcspn(init_utsname()->version, " "), | 112 | pr_err(" %s %s %.*s\n", |
| 116 | init_utsname()->version); | 113 | print_tainted(), init_utsname()->release, |
| 117 | pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" | 114 | (int)strcspn(init_utsname()->version, " "), |
| 118 | " disables this message.\n"); | 115 | init_utsname()->version); |
| 119 | sched_show_task(t); | 116 | pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" |
| 120 | debug_show_held_locks(t); | 117 | " disables this message.\n"); |
| 118 | sched_show_task(t); | ||
| 119 | debug_show_all_locks(); | ||
| 120 | } | ||
| 121 | 121 | ||
| 122 | touch_nmi_watchdog(); | 122 | touch_nmi_watchdog(); |
| 123 | 123 | ||
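
For configurations that enable hung_task_panic but keep a finite warning budget, the restructuring above changes behaviour: the early return now fires only when both knobs are zero, so the panic handling later in check_hung_task() still runs after the warnings are used up (previously the function bailed out as soon as sysctl_hung_task_warnings reached zero). It also switches the report from the task's held locks to debug_show_all_locks(). In outline, with plain ints standing in for the sysctls and the report/panic bodies elided:

	static int warnings_left = 10;	/* sysctl_hung_task_warnings */
	static int panic_on_hang = 1;	/* sysctl_hung_task_panic */

	static void check_hung_sketch(void)
	{
		if (!warnings_left && !panic_on_hang)
			return;			/* nothing left to report or do */

		if (warnings_left) {
			warnings_left--;
			/* print the "blocked for more than N seconds" report */
		}

		/* the panic path that follows in the real function is now
		 * reachable even after the warning budget is exhausted */
	}
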
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index f68959341c0f..17f51d63da56 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c | |||
| @@ -4,58 +4,151 @@ | |||
| 4 | #include <linux/slab.h> | 4 | #include <linux/slab.h> |
| 5 | #include <linux/cpu.h> | 5 | #include <linux/cpu.h> |
| 6 | 6 | ||
| 7 | static int get_first_sibling(unsigned int cpu) | 7 | static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, |
| 8 | int cpus_per_vec) | ||
| 8 | { | 9 | { |
| 9 | unsigned int ret; | 10 | const struct cpumask *siblmsk; |
| 11 | int cpu, sibl; | ||
| 10 | 12 | ||
| 11 | ret = cpumask_first(topology_sibling_cpumask(cpu)); | 13 | for ( ; cpus_per_vec > 0; ) { |
| 12 | if (ret < nr_cpu_ids) | 14 | cpu = cpumask_first(nmsk); |
| 13 | return ret; | 15 | |
| 14 | return cpu; | 16 | /* Should not happen, but I'm too lazy to think about it */ |
| 17 | if (cpu >= nr_cpu_ids) | ||
| 18 | return; | ||
| 19 | |||
| 20 | cpumask_clear_cpu(cpu, nmsk); | ||
| 21 | cpumask_set_cpu(cpu, irqmsk); | ||
| 22 | cpus_per_vec--; | ||
| 23 | |||
| 24 | /* If the cpu has siblings, use them first */ | ||
| 25 | siblmsk = topology_sibling_cpumask(cpu); | ||
| 26 | for (sibl = -1; cpus_per_vec > 0; ) { | ||
| 27 | sibl = cpumask_next(sibl, siblmsk); | ||
| 28 | if (sibl >= nr_cpu_ids) | ||
| 29 | break; | ||
| 30 | if (!cpumask_test_and_clear_cpu(sibl, nmsk)) | ||
| 31 | continue; | ||
| 32 | cpumask_set_cpu(sibl, irqmsk); | ||
| 33 | cpus_per_vec--; | ||
| 34 | } | ||
| 35 | } | ||
| 36 | } | ||
| 37 | |||
| 38 | static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk) | ||
| 39 | { | ||
| 40 | int n, nodes; | ||
| 41 | |||
| 42 | /* Calculate the number of nodes in the supplied affinity mask */ | ||
| 43 | for (n = 0, nodes = 0; n < num_online_nodes(); n++) { | ||
| 44 | if (cpumask_intersects(mask, cpumask_of_node(n))) { | ||
| 45 | node_set(n, *nodemsk); | ||
| 46 | nodes++; | ||
| 47 | } | ||
| 48 | } | ||
| 49 | return nodes; | ||
| 15 | } | 50 | } |
| 16 | 51 | ||
| 17 | /* | 52 | /** |
| 18 | * Take a map of online CPUs and the number of available interrupt vectors | 53 | * irq_create_affinity_masks - Create affinity masks for multiqueue spreading |
| 19 | * and generate an output cpumask suitable for spreading MSI/MSI-X vectors | 54 | * @affinity: The affinity mask to spread. If NULL cpu_online_mask |
| 20 | * so that they are distributed as good as possible around the CPUs. If | 55 | * is used |
| 21 | * more vectors than CPUs are available we'll map one to each CPU, | 56 | * @nvecs: The number of vectors |
| 22 | * otherwise we map one to the first sibling of each socket. | ||
| 23 | * | 57 | * |
| 24 | * If there are more vectors than CPUs we will still only have one bit | 58 | * Returns the masks pointer or NULL if allocation failed. |
| 25 | * set per CPU, but interrupt code will keep on assigning the vectors from | ||
| 26 | * the start of the bitmap until we run out of vectors. | ||
| 27 | */ | 59 | */ |
| 28 | struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs) | 60 | struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity, |
| 61 | int nvec) | ||
| 29 | { | 62 | { |
| 30 | struct cpumask *affinity_mask; | 63 | int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec = 0; |
| 31 | unsigned int max_vecs = *nr_vecs; | 64 | nodemask_t nodemsk = NODE_MASK_NONE; |
| 65 | struct cpumask *masks; | ||
| 66 | cpumask_var_t nmsk; | ||
| 32 | 67 | ||
| 33 | if (max_vecs == 1) | 68 | if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) |
| 34 | return NULL; | 69 | return NULL; |
| 35 | 70 | ||
| 36 | affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL); | 71 | masks = kzalloc(nvec * sizeof(*masks), GFP_KERNEL); |
| 37 | if (!affinity_mask) { | 72 | if (!masks) |
| 38 | *nr_vecs = 1; | 73 | goto out; |
| 39 | return NULL; | ||
| 40 | } | ||
| 41 | 74 | ||
| 42 | if (max_vecs >= num_online_cpus()) { | 75 | /* Stabilize the cpumasks */ |
| 43 | cpumask_copy(affinity_mask, cpu_online_mask); | 76 | get_online_cpus(); |
| 44 | *nr_vecs = num_online_cpus(); | 77 | /* If the supplied affinity mask is NULL, use cpu online mask */ |
| 45 | } else { | 78 | if (!affinity) |
| 46 | unsigned int vecs = 0, cpu; | 79 | affinity = cpu_online_mask; |
| 47 | 80 | ||
| 48 | for_each_online_cpu(cpu) { | 81 | nodes = get_nodes_in_cpumask(affinity, &nodemsk); |
| 49 | if (cpu == get_first_sibling(cpu)) { | ||
| 50 | cpumask_set_cpu(cpu, affinity_mask); | ||
| 51 | vecs++; | ||
| 52 | } | ||
| 53 | 82 | ||
| 54 | if (--max_vecs == 0) | 83 | /* |
| 84 | * If the number of nodes in the mask is less than or equal the | ||
| 85 | * number of vectors we just spread the vectors across the nodes. | ||
| 86 | */ | ||
| 87 | if (nvec <= nodes) { | ||
| 88 | for_each_node_mask(n, nodemsk) { | ||
| 89 | cpumask_copy(masks + curvec, cpumask_of_node(n)); | ||
| 90 | if (++curvec == nvec) | ||
| 55 | break; | 91 | break; |
| 56 | } | 92 | } |
| 57 | *nr_vecs = vecs; | 93 | goto outonl; |
| 94 | } | ||
| 95 | |||
| 96 | /* Spread the vectors per node */ | ||
| 97 | vecs_per_node = nvec / nodes; | ||
| 98 | /* Account for rounding errors */ | ||
| 99 | extra_vecs = nvec - (nodes * vecs_per_node); | ||
| 100 | |||
| 101 | for_each_node_mask(n, nodemsk) { | ||
| 102 | int ncpus, v, vecs_to_assign = vecs_per_node; | ||
| 103 | |||
| 104 | /* Get the cpus on this node which are in the mask */ | ||
| 105 | cpumask_and(nmsk, affinity, cpumask_of_node(n)); | ||
| 106 | |||
| 107 | /* Calculate the number of cpus per vector */ | ||
| 108 | ncpus = cpumask_weight(nmsk); | ||
| 109 | |||
| 110 | for (v = 0; curvec < nvec && v < vecs_to_assign; curvec++, v++) { | ||
| 111 | cpus_per_vec = ncpus / vecs_to_assign; | ||
| 112 | |||
| 113 | /* Account for extra vectors to compensate rounding errors */ | ||
| 114 | if (extra_vecs) { | ||
| 115 | cpus_per_vec++; | ||
| 116 | if (!--extra_vecs) | ||
| 117 | vecs_per_node++; | ||
| 118 | } | ||
| 119 | irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec); | ||
| 120 | } | ||
| 121 | |||
| 122 | if (curvec >= nvec) | ||
| 123 | break; | ||
| 58 | } | 124 | } |
| 59 | 125 | ||
| 60 | return affinity_mask; | 126 | outonl: |
| 127 | put_online_cpus(); | ||
| 128 | out: | ||
| 129 | free_cpumask_var(nmsk); | ||
| 130 | return masks; | ||
| 131 | } | ||
| 132 | |||
| 133 | /** | ||
| 134 | * irq_calc_affinity_vectors - Calculate to optimal number of vectors for a given affinity mask | ||
| 135 | * @affinity: The affinity mask to spread. If NULL cpu_online_mask | ||
| 136 | * is used | ||
| 137 | * @maxvec: The maximum number of vectors available | ||
| 138 | */ | ||
| 139 | int irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec) | ||
| 140 | { | ||
| 141 | int cpus, ret; | ||
| 142 | |||
| 143 | /* Stabilize the cpumasks */ | ||
| 144 | get_online_cpus(); | ||
| 145 | /* If the supplied affinity mask is NULL, use cpu online mask */ | ||
| 146 | if (!affinity) | ||
| 147 | affinity = cpu_online_mask; | ||
| 148 | |||
| 149 | cpus = cpumask_weight(affinity); | ||
| 150 | ret = (cpus < maxvec) ? cpus : maxvec; | ||
| 151 | |||
| 152 | put_online_cpus(); | ||
| 153 | return ret; | ||
| 61 | } | 154 | } |
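
irq_create_affinity_masks() replaces the old one-bit-per-first-sibling mask with a two-level split: when there are at least as many vectors as NUMA nodes, the vectors are first divided across the nodes and then each node's CPUs are divided across that node's vectors, with irq_spread_init_one() keeping hyperthread siblings on the same vector; with fewer vectors than nodes, each vector simply gets a whole node's mask. The arithmetic, reduced to a standalone program with made-up topology numbers (the in-tree loop also folds the extra_vecs leftovers into a vecs_per_node bump, which is skipped here for brevity):

	#include <stdio.h>

	int main(void)
	{
		int nvec = 4;			/* vectors to spread (assumed) */
		int node_cpus[] = { 6, 6 };	/* CPUs per NUMA node (assumed) */
		int nodes = 2;
		int curvec = 0;

		int vecs_per_node = nvec / nodes;		/* 2 */
		int extra_vecs = nvec - nodes * vecs_per_node;	/* 0 here */

		for (int n = 0; n < nodes && curvec < nvec; n++) {
			int vecs_to_assign = vecs_per_node;

			for (int v = 0; v < vecs_to_assign && curvec < nvec;
			     v++, curvec++) {
				int cpus_per_vec = node_cpus[n] / vecs_to_assign;

				/* the first extra_vecs vectors soak up the
				 * rounding remainder with one extra CPU each */
				if (extra_vecs) {
					cpus_per_vec++;
					extra_vecs--;
				}
				printf("vector %d: node %d, %d CPUs\n",
				       curvec, n, cpus_per_vec);
			}
		}
		return 0;
	}

With 4 vectors over two 6-CPU nodes this prints two 3-CPU vectors per node; irq_calc_affinity_vectors() is the matching helper that caps the requested vector count at the number of CPUs in the affinity mask.
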
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b4c1bc7c9ca2..be3c34e4f2ac 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
| @@ -76,7 +76,6 @@ int irq_set_irq_type(unsigned int irq, unsigned int type) | |||
| 76 | if (!desc) | 76 | if (!desc) |
| 77 | return -EINVAL; | 77 | return -EINVAL; |
| 78 | 78 | ||
| 79 | type &= IRQ_TYPE_SENSE_MASK; | ||
| 80 | ret = __irq_set_trigger(desc, type); | 79 | ret = __irq_set_trigger(desc, type); |
| 81 | irq_put_desc_busunlock(desc, flags); | 80 | irq_put_desc_busunlock(desc, flags); |
| 82 | return ret; | 81 | return ret; |
| @@ -756,7 +755,6 @@ void handle_percpu_devid_irq(struct irq_desc *desc) | |||
| 756 | { | 755 | { |
| 757 | struct irq_chip *chip = irq_desc_get_chip(desc); | 756 | struct irq_chip *chip = irq_desc_get_chip(desc); |
| 758 | struct irqaction *action = desc->action; | 757 | struct irqaction *action = desc->action; |
| 759 | void *dev_id = raw_cpu_ptr(action->percpu_dev_id); | ||
| 760 | unsigned int irq = irq_desc_get_irq(desc); | 758 | unsigned int irq = irq_desc_get_irq(desc); |
| 761 | irqreturn_t res; | 759 | irqreturn_t res; |
| 762 | 760 | ||
| @@ -765,15 +763,26 @@ void handle_percpu_devid_irq(struct irq_desc *desc) | |||
| 765 | if (chip->irq_ack) | 763 | if (chip->irq_ack) |
| 766 | chip->irq_ack(&desc->irq_data); | 764 | chip->irq_ack(&desc->irq_data); |
| 767 | 765 | ||
| 768 | trace_irq_handler_entry(irq, action); | 766 | if (likely(action)) { |
| 769 | res = action->handler(irq, dev_id); | 767 | trace_irq_handler_entry(irq, action); |
| 770 | trace_irq_handler_exit(irq, action, res); | 768 | res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); |
| 769 | trace_irq_handler_exit(irq, action, res); | ||
| 770 | } else { | ||
| 771 | unsigned int cpu = smp_processor_id(); | ||
| 772 | bool enabled = cpumask_test_cpu(cpu, desc->percpu_enabled); | ||
| 773 | |||
| 774 | if (enabled) | ||
| 775 | irq_percpu_disable(desc, cpu); | ||
| 776 | |||
| 777 | pr_err_once("Spurious%s percpu IRQ%u on CPU%u\n", | ||
| 778 | enabled ? " and unmasked" : "", irq, cpu); | ||
| 779 | } | ||
| 771 | 780 | ||
| 772 | if (chip->irq_eoi) | 781 | if (chip->irq_eoi) |
| 773 | chip->irq_eoi(&desc->irq_data); | 782 | chip->irq_eoi(&desc->irq_data); |
| 774 | } | 783 | } |
| 775 | 784 | ||
| 776 | void | 785 | static void |
| 777 | __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, | 786 | __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, |
| 778 | int is_chained, const char *name) | 787 | int is_chained, const char *name) |
| 779 | { | 788 | { |
| @@ -820,6 +829,21 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, | |||
| 820 | desc->name = name; | 829 | desc->name = name; |
| 821 | 830 | ||
| 822 | if (handle != handle_bad_irq && is_chained) { | 831 | if (handle != handle_bad_irq && is_chained) { |
| 832 | unsigned int type = irqd_get_trigger_type(&desc->irq_data); | ||
| 833 | |||
| 834 | /* | ||
| 835 | * We're about to start this interrupt immediately, | ||
| 836 | * hence the need to set the trigger configuration. | ||
| 837 | * But the .set_type callback may have overridden the | ||
| 838 | * flow handler, ignoring that we're dealing with a | ||
| 839 | * chained interrupt. Reset it immediately because we | ||
| 840 | * do know better. | ||
| 841 | */ | ||
| 842 | if (type != IRQ_TYPE_NONE) { | ||
| 843 | __irq_set_trigger(desc, type); | ||
| 844 | desc->handle_irq = handle; | ||
| 845 | } | ||
| 846 | |||
| 823 | irq_settings_set_noprobe(desc); | 847 | irq_settings_set_noprobe(desc); |
| 824 | irq_settings_set_norequest(desc); | 848 | irq_settings_set_norequest(desc); |
| 825 | irq_settings_set_nothread(desc); | 849 | irq_settings_set_nothread(desc); |
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index abd286afbd27..ee32870079c9 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c | |||
| @@ -260,9 +260,9 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) | |||
| 260 | } | 260 | } |
| 261 | 261 | ||
| 262 | /** | 262 | /** |
| 263 | * irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain | 263 | * __irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain |
| 264 | * @d: irq domain for which to allocate chips | 264 | * @d: irq domain for which to allocate chips |
| 265 | * @irqs_per_chip: Number of interrupts each chip handles | 265 | * @irqs_per_chip: Number of interrupts each chip handles (max 32) |
| 266 | * @num_ct: Number of irq_chip_type instances associated with this | 266 | * @num_ct: Number of irq_chip_type instances associated with this |
| 267 | * @name: Name of the irq chip | 267 | * @name: Name of the irq chip |
| 268 | * @handler: Default flow handler associated with these chips | 268 | * @handler: Default flow handler associated with these chips |
| @@ -270,11 +270,11 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) | |||
| 270 | * @set: IRQ_* bits to set in the mapping function | 270 | * @set: IRQ_* bits to set in the mapping function |
| 271 | * @gcflags: Generic chip specific setup flags | 271 | * @gcflags: Generic chip specific setup flags |
| 272 | */ | 272 | */ |
| 273 | int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, | 273 | int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, |
| 274 | int num_ct, const char *name, | 274 | int num_ct, const char *name, |
| 275 | irq_flow_handler_t handler, | 275 | irq_flow_handler_t handler, |
| 276 | unsigned int clr, unsigned int set, | 276 | unsigned int clr, unsigned int set, |
| 277 | enum irq_gc_flags gcflags) | 277 | enum irq_gc_flags gcflags) |
| 278 | { | 278 | { |
| 279 | struct irq_domain_chip_generic *dgc; | 279 | struct irq_domain_chip_generic *dgc; |
| 280 | struct irq_chip_generic *gc; | 280 | struct irq_chip_generic *gc; |
| @@ -326,7 +326,21 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, | |||
| 326 | d->name = name; | 326 | d->name = name; |
| 327 | return 0; | 327 | return 0; |
| 328 | } | 328 | } |
| 329 | EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); | 329 | EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips); |
| 330 | |||
| 331 | static struct irq_chip_generic * | ||
| 332 | __irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) | ||
| 333 | { | ||
| 334 | struct irq_domain_chip_generic *dgc = d->gc; | ||
| 335 | int idx; | ||
| 336 | |||
| 337 | if (!dgc) | ||
| 338 | return ERR_PTR(-ENODEV); | ||
| 339 | idx = hw_irq / dgc->irqs_per_chip; | ||
| 340 | if (idx >= dgc->num_chips) | ||
| 341 | return ERR_PTR(-EINVAL); | ||
| 342 | return dgc->gc[idx]; | ||
| 343 | } | ||
| 330 | 344 | ||
| 331 | /** | 345 | /** |
| 332 | * irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq | 346 | * irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq |
| @@ -336,15 +350,9 @@ EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); | |||
| 336 | struct irq_chip_generic * | 350 | struct irq_chip_generic * |
| 337 | irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) | 351 | irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) |
| 338 | { | 352 | { |
| 339 | struct irq_domain_chip_generic *dgc = d->gc; | 353 | struct irq_chip_generic *gc = __irq_get_domain_generic_chip(d, hw_irq); |
| 340 | int idx; | ||
| 341 | 354 | ||
| 342 | if (!dgc) | 355 | return !IS_ERR(gc) ? gc : NULL; |
| 343 | return NULL; | ||
| 344 | idx = hw_irq / dgc->irqs_per_chip; | ||
| 345 | if (idx >= dgc->num_chips) | ||
| 346 | return NULL; | ||
| 347 | return dgc->gc[idx]; | ||
| 348 | } | 356 | } |
| 349 | EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip); | 357 | EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip); |
| 350 | 358 | ||
| @@ -368,13 +376,9 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, | |||
| 368 | unsigned long flags; | 376 | unsigned long flags; |
| 369 | int idx; | 377 | int idx; |
| 370 | 378 | ||
| 371 | if (!d->gc) | 379 | gc = __irq_get_domain_generic_chip(d, hw_irq); |
| 372 | return -ENODEV; | 380 | if (IS_ERR(gc)) |
| 373 | 381 | return PTR_ERR(gc); | |
| 374 | idx = hw_irq / dgc->irqs_per_chip; | ||
| 375 | if (idx >= dgc->num_chips) | ||
| 376 | return -EINVAL; | ||
| 377 | gc = dgc->gc[idx]; | ||
| 378 | 382 | ||
| 379 | idx = hw_irq % dgc->irqs_per_chip; | 383 | idx = hw_irq % dgc->irqs_per_chip; |
| 380 | 384 | ||
| @@ -409,10 +413,30 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, | |||
| 409 | irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); | 413 | irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); |
| 410 | return 0; | 414 | return 0; |
| 411 | } | 415 | } |
| 412 | EXPORT_SYMBOL_GPL(irq_map_generic_chip); | 416 | |
| 417 | static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq) | ||
| 418 | { | ||
| 419 | struct irq_data *data = irq_domain_get_irq_data(d, virq); | ||
| 420 | struct irq_domain_chip_generic *dgc = d->gc; | ||
| 421 | unsigned int hw_irq = data->hwirq; | ||
| 422 | struct irq_chip_generic *gc; | ||
| 423 | int irq_idx; | ||
| 424 | |||
| 425 | gc = irq_get_domain_generic_chip(d, hw_irq); | ||
| 426 | if (!gc) | ||
| 427 | return; | ||
| 428 | |||
| 429 | irq_idx = hw_irq % dgc->irqs_per_chip; | ||
| 430 | |||
| 431 | clear_bit(irq_idx, &gc->installed); | ||
| 432 | irq_domain_set_info(d, virq, hw_irq, &no_irq_chip, NULL, NULL, NULL, | ||
| 433 | NULL); | ||
| 434 | |||
| 435 | } | ||
| 413 | 436 | ||
| 414 | struct irq_domain_ops irq_generic_chip_ops = { | 437 | struct irq_domain_ops irq_generic_chip_ops = { |
| 415 | .map = irq_map_generic_chip, | 438 | .map = irq_map_generic_chip, |
| 439 | .unmap = irq_unmap_generic_chip, | ||
| 416 | .xlate = irq_domain_xlate_onetwocell, | 440 | .xlate = irq_domain_xlate_onetwocell, |
| 417 | }; | 441 | }; |
| 418 | EXPORT_SYMBOL_GPL(irq_generic_chip_ops); | 442 | EXPORT_SYMBOL_GPL(irq_generic_chip_ops); |
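For context, the sketch below shows how a driver typically wires up a generic-chip domain so that the refactored lookup helper and the new .unmap callback above are exercised through irq_generic_chip_ops. It is only a sketch: the controller name, register offsets and the single 32-interrupt chip are assumptions, and it assumes the usual irq_alloc_domain_generic_chips() wrapper in <linux/irq.h> now forwards to the renamed __irq_alloc_domain_generic_chips().

#include <linux/io.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

static int __init example_intc_init(struct device_node *np, void __iomem *base)
{
        struct irq_domain *domain;
        struct irq_chip_generic *gc;
        int ret;

        /* One linear domain, 32 hwirqs, mapped/unmapped via irq_generic_chip_ops */
        domain = irq_domain_add_linear(np, 32, &irq_generic_chip_ops, NULL);
        if (!domain)
                return -ENOMEM;

        /* One generic chip with a single chip type covering all 32 interrupts */
        ret = irq_alloc_domain_generic_chips(domain, 32, 1, "example-intc",
                                             handle_level_irq, 0, 0,
                                             IRQ_GC_INIT_MASK_CACHE);
        if (ret) {
                irq_domain_remove(domain);
                return ret;
        }

        gc = irq_get_domain_generic_chip(domain, 0);
        gc->reg_base = base;
        gc->chip_types[0].regs.mask = 0x04;     /* assumed register offsets */
        gc->chip_types[0].regs.ack = 0x08;
        gc->chip_types[0].chip.irq_mask = irq_gc_mask_set_bit;
        gc->chip_types[0].chip.irq_unmask = irq_gc_mask_clr_bit;
        gc->chip_types[0].chip.irq_ack = irq_gc_ack_set_bit;

        /*
         * Mappings created through this domain are now also torn down by the
         * new irq_unmap_generic_chip() callback when they are disposed.
         */
        return 0;
}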
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index a623b44f2d4b..00bb0aeea1d0 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
| @@ -15,6 +15,7 @@ | |||
| 15 | #include <linux/radix-tree.h> | 15 | #include <linux/radix-tree.h> |
| 16 | #include <linux/bitmap.h> | 16 | #include <linux/bitmap.h> |
| 17 | #include <linux/irqdomain.h> | 17 | #include <linux/irqdomain.h> |
| 18 | #include <linux/sysfs.h> | ||
| 18 | 19 | ||
| 19 | #include "internals.h" | 20 | #include "internals.h" |
| 20 | 21 | ||
| @@ -123,6 +124,181 @@ static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS); | |||
| 123 | 124 | ||
| 124 | #ifdef CONFIG_SPARSE_IRQ | 125 | #ifdef CONFIG_SPARSE_IRQ |
| 125 | 126 | ||
| 127 | static void irq_kobj_release(struct kobject *kobj); | ||
| 128 | |||
| 129 | #ifdef CONFIG_SYSFS | ||
| 130 | static struct kobject *irq_kobj_base; | ||
| 131 | |||
| 132 | #define IRQ_ATTR_RO(_name) \ | ||
| 133 | static struct kobj_attribute _name##_attr = __ATTR_RO(_name) | ||
| 134 | |||
| 135 | static ssize_t per_cpu_count_show(struct kobject *kobj, | ||
| 136 | struct kobj_attribute *attr, char *buf) | ||
| 137 | { | ||
| 138 | struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); | ||
| 139 | int cpu, irq = desc->irq_data.irq; | ||
| 140 | ssize_t ret = 0; | ||
| 141 | char *p = ""; | ||
| 142 | |||
| 143 | for_each_possible_cpu(cpu) { | ||
| 144 | unsigned int c = kstat_irqs_cpu(irq, cpu); | ||
| 145 | |||
| 146 | ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%u", p, c); | ||
| 147 | p = ","; | ||
| 148 | } | ||
| 149 | |||
| 150 | ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); | ||
| 151 | return ret; | ||
| 152 | } | ||
| 153 | IRQ_ATTR_RO(per_cpu_count); | ||
| 154 | |||
| 155 | static ssize_t chip_name_show(struct kobject *kobj, | ||
| 156 | struct kobj_attribute *attr, char *buf) | ||
| 157 | { | ||
| 158 | struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); | ||
| 159 | ssize_t ret = 0; | ||
| 160 | |||
| 161 | raw_spin_lock_irq(&desc->lock); | ||
| 162 | if (desc->irq_data.chip && desc->irq_data.chip->name) { | ||
| 163 | ret = scnprintf(buf, PAGE_SIZE, "%s\n", | ||
| 164 | desc->irq_data.chip->name); | ||
| 165 | } | ||
| 166 | raw_spin_unlock_irq(&desc->lock); | ||
| 167 | |||
| 168 | return ret; | ||
| 169 | } | ||
| 170 | IRQ_ATTR_RO(chip_name); | ||
| 171 | |||
| 172 | static ssize_t hwirq_show(struct kobject *kobj, | ||
| 173 | struct kobj_attribute *attr, char *buf) | ||
| 174 | { | ||
| 175 | struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); | ||
| 176 | ssize_t ret = 0; | ||
| 177 | |||
| 178 | raw_spin_lock_irq(&desc->lock); | ||
| 179 | if (desc->irq_data.domain) | ||
| 180 | ret = sprintf(buf, "%d\n", (int)desc->irq_data.hwirq); | ||
| 181 | raw_spin_unlock_irq(&desc->lock); | ||
| 182 | |||
| 183 | return ret; | ||
| 184 | } | ||
| 185 | IRQ_ATTR_RO(hwirq); | ||
| 186 | |||
| 187 | static ssize_t type_show(struct kobject *kobj, | ||
| 188 | struct kobj_attribute *attr, char *buf) | ||
| 189 | { | ||
| 190 | struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); | ||
| 191 | ssize_t ret = 0; | ||
| 192 | |||
| 193 | raw_spin_lock_irq(&desc->lock); | ||
| 194 | ret = sprintf(buf, "%s\n", | ||
| 195 | irqd_is_level_type(&desc->irq_data) ? "level" : "edge"); | ||
| 196 | raw_spin_unlock_irq(&desc->lock); | ||
| 197 | |||
| 198 | return ret; | ||
| 199 | |||
| 200 | } | ||
| 201 | IRQ_ATTR_RO(type); | ||
| 202 | |||
| 203 | static ssize_t name_show(struct kobject *kobj, | ||
| 204 | struct kobj_attribute *attr, char *buf) | ||
| 205 | { | ||
| 206 | struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); | ||
| 207 | ssize_t ret = 0; | ||
| 208 | |||
| 209 | raw_spin_lock_irq(&desc->lock); | ||
| 210 | if (desc->name) | ||
| 211 | ret = scnprintf(buf, PAGE_SIZE, "%s\n", desc->name); | ||
| 212 | raw_spin_unlock_irq(&desc->lock); | ||
| 213 | |||
| 214 | return ret; | ||
| 215 | } | ||
| 216 | IRQ_ATTR_RO(name); | ||
| 217 | |||
| 218 | static ssize_t actions_show(struct kobject *kobj, | ||
| 219 | struct kobj_attribute *attr, char *buf) | ||
| 220 | { | ||
| 221 | struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); | ||
| 222 | struct irqaction *action; | ||
| 223 | ssize_t ret = 0; | ||
| 224 | char *p = ""; | ||
| 225 | |||
| 226 | raw_spin_lock_irq(&desc->lock); | ||
| 227 | for (action = desc->action; action != NULL; action = action->next) { | ||
| 228 | ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", | ||
| 229 | p, action->name); | ||
| 230 | p = ","; | ||
| 231 | } | ||
| 232 | raw_spin_unlock_irq(&desc->lock); | ||
| 233 | |||
| 234 | if (ret) | ||
| 235 | ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); | ||
| 236 | |||
| 237 | return ret; | ||
| 238 | } | ||
| 239 | IRQ_ATTR_RO(actions); | ||
| 240 | |||
| 241 | static struct attribute *irq_attrs[] = { | ||
| 242 | &per_cpu_count_attr.attr, | ||
| 243 | &chip_name_attr.attr, | ||
| 244 | &hwirq_attr.attr, | ||
| 245 | &type_attr.attr, | ||
| 246 | &name_attr.attr, | ||
| 247 | &actions_attr.attr, | ||
| 248 | NULL | ||
| 249 | }; | ||
| 250 | |||
| 251 | static struct kobj_type irq_kobj_type = { | ||
| 252 | .release = irq_kobj_release, | ||
| 253 | .sysfs_ops = &kobj_sysfs_ops, | ||
| 254 | .default_attrs = irq_attrs, | ||
| 255 | }; | ||
| 256 | |||
| 257 | static void irq_sysfs_add(int irq, struct irq_desc *desc) | ||
| 258 | { | ||
| 259 | if (irq_kobj_base) { | ||
| 260 | /* | ||
| 261 | * Continue even in case of failure, as this is not | ||
| 262 | * crucial. | ||
| 263 | */ | ||
| 264 | if (kobject_add(&desc->kobj, irq_kobj_base, "%d", irq)) | ||
| 265 | pr_warn("Failed to add kobject for irq %d\n", irq); | ||
| 266 | } | ||
| 267 | } | ||
| 268 | |||
| 269 | static int __init irq_sysfs_init(void) | ||
| 270 | { | ||
| 271 | struct irq_desc *desc; | ||
| 272 | int irq; | ||
| 273 | |||
| 274 | /* Prevent concurrent irq alloc/free */ | ||
| 275 | irq_lock_sparse(); | ||
| 276 | |||
| 277 | irq_kobj_base = kobject_create_and_add("irq", kernel_kobj); | ||
| 278 | if (!irq_kobj_base) { | ||
| 279 | irq_unlock_sparse(); | ||
| 280 | return -ENOMEM; | ||
| 281 | } | ||
| 282 | |||
| 283 | /* Add the already allocated interrupts */ | ||
| 284 | for_each_irq_desc(irq, desc) | ||
| 285 | irq_sysfs_add(irq, desc); | ||
| 286 | irq_unlock_sparse(); | ||
| 287 | |||
| 288 | return 0; | ||
| 289 | } | ||
| 290 | postcore_initcall(irq_sysfs_init); | ||
| 291 | |||
| 292 | #else /* !CONFIG_SYSFS */ | ||
| 293 | |||
| 294 | static struct kobj_type irq_kobj_type = { | ||
| 295 | .release = irq_kobj_release, | ||
| 296 | }; | ||
| 297 | |||
| 298 | static void irq_sysfs_add(int irq, struct irq_desc *desc) {} | ||
| 299 | |||
| 300 | #endif /* CONFIG_SYSFS */ | ||
| 301 | |||
| 126 | static RADIX_TREE(irq_desc_tree, GFP_KERNEL); | 302 | static RADIX_TREE(irq_desc_tree, GFP_KERNEL); |
| 127 | 303 | ||
| 128 | static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) | 304 | static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) |
| @@ -187,6 +363,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, | |||
| 187 | 363 | ||
| 188 | desc_set_defaults(irq, desc, node, affinity, owner); | 364 | desc_set_defaults(irq, desc, node, affinity, owner); |
| 189 | irqd_set(&desc->irq_data, flags); | 365 | irqd_set(&desc->irq_data, flags); |
| 366 | kobject_init(&desc->kobj, &irq_kobj_type); | ||
| 190 | 367 | ||
| 191 | return desc; | 368 | return desc; |
| 192 | 369 | ||
| @@ -197,15 +374,22 @@ err_desc: | |||
| 197 | return NULL; | 374 | return NULL; |
| 198 | } | 375 | } |
| 199 | 376 | ||
| 200 | static void delayed_free_desc(struct rcu_head *rhp) | 377 | static void irq_kobj_release(struct kobject *kobj) |
| 201 | { | 378 | { |
| 202 | struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu); | 379 | struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); |
| 203 | 380 | ||
| 204 | free_masks(desc); | 381 | free_masks(desc); |
| 205 | free_percpu(desc->kstat_irqs); | 382 | free_percpu(desc->kstat_irqs); |
| 206 | kfree(desc); | 383 | kfree(desc); |
| 207 | } | 384 | } |
| 208 | 385 | ||
| 386 | static void delayed_free_desc(struct rcu_head *rhp) | ||
| 387 | { | ||
| 388 | struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu); | ||
| 389 | |||
| 390 | kobject_put(&desc->kobj); | ||
| 391 | } | ||
| 392 | |||
| 209 | static void free_desc(unsigned int irq) | 393 | static void free_desc(unsigned int irq) |
| 210 | { | 394 | { |
| 211 | struct irq_desc *desc = irq_to_desc(irq); | 395 | struct irq_desc *desc = irq_to_desc(irq); |
| @@ -217,8 +401,12 @@ static void free_desc(unsigned int irq) | |||
| 217 | * kstat_irq_usr(). Once we deleted the descriptor from the | 401 | * kstat_irq_usr(). Once we deleted the descriptor from the |
| 218 | * sparse tree we can free it. Access in proc will fail to | 402 | * sparse tree we can free it. Access in proc will fail to |
| 219 | * lookup the descriptor. | 403 | * lookup the descriptor. |
| 404 | * | ||
| 405 | * The sysfs entry must be serialized against a concurrent | ||
| 406 | * irq_sysfs_init() as well. | ||
| 220 | */ | 407 | */ |
| 221 | mutex_lock(&sparse_irq_lock); | 408 | mutex_lock(&sparse_irq_lock); |
| 409 | kobject_del(&desc->kobj); | ||
| 222 | delete_irq_desc(irq); | 410 | delete_irq_desc(irq); |
| 223 | mutex_unlock(&sparse_irq_lock); | 411 | mutex_unlock(&sparse_irq_lock); |
| 224 | 412 | ||
| @@ -236,31 +424,31 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, | |||
| 236 | const struct cpumask *mask = NULL; | 424 | const struct cpumask *mask = NULL; |
| 237 | struct irq_desc *desc; | 425 | struct irq_desc *desc; |
| 238 | unsigned int flags; | 426 | unsigned int flags; |
| 239 | int i, cpu = -1; | 427 | int i; |
| 240 | 428 | ||
| 241 | if (affinity && cpumask_empty(affinity)) | 429 | /* Validate affinity mask(s) */ |
| 242 | return -EINVAL; | 430 | if (affinity) { |
| 431 | for (i = 0, mask = affinity; i < cnt; i++, mask++) { | ||
| 432 | if (cpumask_empty(mask)) | ||
| 433 | return -EINVAL; | ||
| 434 | } | ||
| 435 | } | ||
| 243 | 436 | ||
| 244 | flags = affinity ? IRQD_AFFINITY_MANAGED : 0; | 437 | flags = affinity ? IRQD_AFFINITY_MANAGED : 0; |
| 438 | mask = NULL; | ||
| 245 | 439 | ||
| 246 | for (i = 0; i < cnt; i++) { | 440 | for (i = 0; i < cnt; i++) { |
| 247 | if (affinity) { | 441 | if (affinity) { |
| 248 | cpu = cpumask_next(cpu, affinity); | 442 | node = cpu_to_node(cpumask_first(affinity)); |
| 249 | if (cpu >= nr_cpu_ids) | 443 | mask = affinity; |
| 250 | cpu = cpumask_first(affinity); | 444 | affinity++; |
| 251 | node = cpu_to_node(cpu); | ||
| 252 | |||
| 253 | /* | ||
| 254 | * For single allocations we use the caller provided | ||
| 255 | * mask otherwise we use the mask of the target cpu | ||
| 256 | */ | ||
| 257 | mask = cnt == 1 ? affinity : cpumask_of(cpu); | ||
| 258 | } | 445 | } |
| 259 | desc = alloc_desc(start + i, node, flags, mask, owner); | 446 | desc = alloc_desc(start + i, node, flags, mask, owner); |
| 260 | if (!desc) | 447 | if (!desc) |
| 261 | goto err; | 448 | goto err; |
| 262 | mutex_lock(&sparse_irq_lock); | 449 | mutex_lock(&sparse_irq_lock); |
| 263 | irq_insert_desc(start + i, desc); | 450 | irq_insert_desc(start + i, desc); |
| 451 | irq_sysfs_add(start + i, desc); | ||
| 264 | mutex_unlock(&sparse_irq_lock); | 452 | mutex_unlock(&sparse_irq_lock); |
| 265 | } | 453 | } |
| 266 | return start; | 454 | return start; |
| @@ -481,9 +669,9 @@ EXPORT_SYMBOL_GPL(irq_free_descs); | |||
| 481 | * @cnt: Number of consecutive irqs to allocate. | 669 | * @cnt: Number of consecutive irqs to allocate. |
| 482 | * @node: Preferred node on which the irq descriptor should be allocated | 670 | * @node: Preferred node on which the irq descriptor should be allocated |
| 483 | * @owner: Owning module (can be NULL) | 671 | * @owner: Owning module (can be NULL) |
| 484 | * @affinity: Optional pointer to an affinity mask which hints where the | 672 | * @affinity: Optional pointer to an affinity mask array of size @cnt which |
| 485 | * irq descriptors should be allocated and which default | 673 | * hints where the irq descriptors should be allocated and which |
| 486 | * affinities to use | 674 | * default affinities to use |
| 487 | * | 675 | * |
| 488 | * Returns the first irq number or error code | 676 | * Returns the first irq number or error code |
| 489 | */ | 677 | */ |
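The attributes added above are exposed under /sys/kernel/irq/<irq>/ (the kobjects hang off kernel_kobj). Below is a hedged userspace sketch that dumps them for a single interrupt; only the attribute names come from the irq_attrs[] array in this hunk, the rest of the tool is illustrative.

#include <stdio.h>

int main(int argc, char **argv)
{
        static const char * const attrs[] = {
                "chip_name", "hwirq", "type", "name", "actions", "per_cpu_count",
        };
        char path[128], line[4096];
        unsigned int i;

        if (argc < 2) {
                fprintf(stderr, "usage: %s <irq>\n", argv[0]);
                return 1;
        }

        for (i = 0; i < sizeof(attrs) / sizeof(attrs[0]); i++) {
                FILE *f;

                snprintf(path, sizeof(path), "/sys/kernel/irq/%s/%s",
                         argv[1], attrs[i]);
                f = fopen(path, "r");
                if (!f)
                        continue;
                /* Empty attributes (e.g. no name set) simply print nothing. */
                if (fgets(line, sizeof(line), f))
                        printf("%-14s %s", attrs[i], line);
                fclose(f);
        }
        return 0;
}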
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 4752b43662e0..8c0a0ae43521 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
| @@ -80,7 +80,7 @@ EXPORT_SYMBOL_GPL(irq_domain_free_fwnode); | |||
| 80 | 80 | ||
| 81 | /** | 81 | /** |
| 82 | * __irq_domain_add() - Allocate a new irq_domain data structure | 82 | * __irq_domain_add() - Allocate a new irq_domain data structure |
| 83 | * @of_node: optional device-tree node of the interrupt controller | 83 | * @fwnode: firmware node for the interrupt controller |
| 84 | * @size: Size of linear map; 0 for radix mapping only | 84 | * @size: Size of linear map; 0 for radix mapping only |
| 85 | * @hwirq_max: Maximum number of interrupts supported by controller | 85 | * @hwirq_max: Maximum number of interrupts supported by controller |
| 86 | * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no | 86 | * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no |
| @@ -96,10 +96,8 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, | |||
| 96 | const struct irq_domain_ops *ops, | 96 | const struct irq_domain_ops *ops, |
| 97 | void *host_data) | 97 | void *host_data) |
| 98 | { | 98 | { |
| 99 | struct device_node *of_node = to_of_node(fwnode); | ||
| 99 | struct irq_domain *domain; | 100 | struct irq_domain *domain; |
| 100 | struct device_node *of_node; | ||
| 101 | |||
| 102 | of_node = to_of_node(fwnode); | ||
| 103 | 101 | ||
| 104 | domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), | 102 | domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), |
| 105 | GFP_KERNEL, of_node_to_nid(of_node)); | 103 | GFP_KERNEL, of_node_to_nid(of_node)); |
| @@ -868,7 +866,10 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d, | |||
| 868 | if (WARN_ON(intsize < 1)) | 866 | if (WARN_ON(intsize < 1)) |
| 869 | return -EINVAL; | 867 | return -EINVAL; |
| 870 | *out_hwirq = intspec[0]; | 868 | *out_hwirq = intspec[0]; |
| 871 | *out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE; | 869 | if (intsize > 1) |
| 870 | *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; | ||
| 871 | else | ||
| 872 | *out_type = IRQ_TYPE_NONE; | ||
| 872 | return 0; | 873 | return 0; |
| 873 | } | 874 | } |
| 874 | EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); | 875 | EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); |
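A minimal sketch of a driver relying on irq_domain_xlate_onetwocell(), assuming a hypothetical two-cell binding; with the change above, any bits in the second cell outside IRQ_TYPE_SENSE_MASK are now stripped before the trigger type is applied.

#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

static int example_map(struct irq_domain *d, unsigned int virq,
                       irq_hw_number_t hwirq)
{
        irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
        return 0;
}

static const struct irq_domain_ops example_ops = {
        .map    = example_map,
        .xlate  = irq_domain_xlate_onetwocell,
};

static struct irq_domain * __init example_domain_init(struct device_node *np)
{
        /* Cell 0 of "interrupts" is the hwirq, optional cell 1 the trigger type. */
        return irq_domain_add_linear(np, 32, &example_ops, NULL);
}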
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 73a2b786b5e9..0c5f1a5db654 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
| @@ -669,8 +669,6 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) | |||
| 669 | return 0; | 669 | return 0; |
| 670 | } | 670 | } |
| 671 | 671 | ||
| 672 | flags &= IRQ_TYPE_SENSE_MASK; | ||
| 673 | |||
| 674 | if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { | 672 | if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { |
| 675 | if (!irqd_irq_masked(&desc->irq_data)) | 673 | if (!irqd_irq_masked(&desc->irq_data)) |
| 676 | mask_irq(desc); | 674 | mask_irq(desc); |
| @@ -678,7 +676,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) | |||
| 678 | unmask = 1; | 676 | unmask = 1; |
| 679 | } | 677 | } |
| 680 | 678 | ||
| 681 | /* caller masked out all except trigger mode flags */ | 679 | /* Mask all flags except trigger mode */ |
| 680 | flags &= IRQ_TYPE_SENSE_MASK; | ||
| 682 | ret = chip->irq_set_type(&desc->irq_data, flags); | 681 | ret = chip->irq_set_type(&desc->irq_data, flags); |
| 683 | 682 | ||
| 684 | switch (ret) { | 683 | switch (ret) { |
| @@ -1681,8 +1680,10 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, | |||
| 1681 | action->dev_id = dev_id; | 1680 | action->dev_id = dev_id; |
| 1682 | 1681 | ||
| 1683 | retval = irq_chip_pm_get(&desc->irq_data); | 1682 | retval = irq_chip_pm_get(&desc->irq_data); |
| 1684 | if (retval < 0) | 1683 | if (retval < 0) { |
| 1684 | kfree(action); | ||
| 1685 | return retval; | 1685 | return retval; |
| 1686 | } | ||
| 1686 | 1687 | ||
| 1687 | chip_bus_lock(desc); | 1688 | chip_bus_lock(desc); |
| 1688 | retval = __setup_irq(irq, desc, action); | 1689 | retval = __setup_irq(irq, desc, action); |
| @@ -1985,8 +1986,10 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, | |||
| 1985 | action->percpu_dev_id = dev_id; | 1986 | action->percpu_dev_id = dev_id; |
| 1986 | 1987 | ||
| 1987 | retval = irq_chip_pm_get(&desc->irq_data); | 1988 | retval = irq_chip_pm_get(&desc->irq_data); |
| 1988 | if (retval < 0) | 1989 | if (retval < 0) { |
| 1990 | kfree(action); | ||
| 1989 | return retval; | 1991 | return retval; |
| 1992 | } | ||
| 1990 | 1993 | ||
| 1991 | chip_bus_lock(desc); | 1994 | chip_bus_lock(desc); |
| 1992 | retval = __setup_irq(irq, desc, action); | 1995 | retval = __setup_irq(irq, desc, action); |
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 19e9dfbe97fa..8a3e872798f3 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c | |||
| @@ -18,20 +18,42 @@ | |||
| 18 | /* Temporary solution for building, will be removed later */ | 18 | /* Temporary solution for building, will be removed later */ |
| 19 | #include <linux/pci.h> | 19 | #include <linux/pci.h> |
| 20 | 20 | ||
| 21 | struct msi_desc *alloc_msi_entry(struct device *dev) | 21 | /** |
| 22 | * alloc_msi_entry - Allocate and initialize an msi_entry | ||
| 23 | * @dev: Pointer to the device for which this is allocated | ||
| 24 | * @nvec: The number of vectors used in this entry | ||
| 25 | * @affinity: Optional pointer to an affinity mask array of size @nvec | ||
| 26 | * | ||
| 27 | * If @affinity is not NULL then an affinity array[@nvec] is allocated | ||
| 28 | * and the affinity masks from @affinity are copied. | ||
| 29 | */ | ||
| 30 | struct msi_desc * | ||
| 31 | alloc_msi_entry(struct device *dev, int nvec, const struct cpumask *affinity) | ||
| 22 | { | 32 | { |
| 23 | struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL); | 33 | struct msi_desc *desc; |
| 34 | |||
| 35 | desc = kzalloc(sizeof(*desc), GFP_KERNEL); | ||
| 24 | if (!desc) | 36 | if (!desc) |
| 25 | return NULL; | 37 | return NULL; |
| 26 | 38 | ||
| 27 | INIT_LIST_HEAD(&desc->list); | 39 | INIT_LIST_HEAD(&desc->list); |
| 28 | desc->dev = dev; | 40 | desc->dev = dev; |
| 41 | desc->nvec_used = nvec; | ||
| 42 | if (affinity) { | ||
| 43 | desc->affinity = kmemdup(affinity, | ||
| 44 | nvec * sizeof(*desc->affinity), GFP_KERNEL); | ||
| 45 | if (!desc->affinity) { | ||
| 46 | kfree(desc); | ||
| 47 | return NULL; | ||
| 48 | } | ||
| 49 | } | ||
| 29 | 50 | ||
| 30 | return desc; | 51 | return desc; |
| 31 | } | 52 | } |
| 32 | 53 | ||
| 33 | void free_msi_entry(struct msi_desc *entry) | 54 | void free_msi_entry(struct msi_desc *entry) |
| 34 | { | 55 | { |
| 56 | kfree(entry->affinity); | ||
| 35 | kfree(entry); | 57 | kfree(entry); |
| 36 | } | 58 | } |
| 37 | 59 | ||
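A hedged sketch of how a caller might use the extended alloc_msi_entry(): the helper name and the single multi-vector entry are assumptions, only the new (dev, nvec, affinity) signature and the internal duplication of the affinity array come from the patch.

#include <linux/device.h>
#include <linux/msi.h>

/*
 * Hypothetical helper: describe nvec vectors with one msi_desc and hand the
 * caller-provided per-vector affinity hints to alloc_msi_entry(), which
 * kmemdup()s the array internally.
 */
static int example_add_msi_desc(struct device *dev, int nvec,
                                const struct cpumask *affinity)
{
        struct msi_desc *desc;

        desc = alloc_msi_entry(dev, nvec, affinity);
        if (!desc)
                return -ENOMEM;

        list_add_tail(&desc->list, dev_to_msi_list(dev));
        return 0;
}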
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 503bc2d348e5..037c321c5618 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c | |||
| @@ -887,7 +887,10 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min, | |||
| 887 | return 0; | 887 | return 0; |
| 888 | out: | 888 | out: |
| 889 | vfree(pi->sechdrs); | 889 | vfree(pi->sechdrs); |
| 890 | pi->sechdrs = NULL; | ||
| 891 | |||
| 890 | vfree(pi->purgatory_buf); | 892 | vfree(pi->purgatory_buf); |
| 893 | pi->purgatory_buf = NULL; | ||
| 891 | return ret; | 894 | return ret; |
| 892 | } | 895 | } |
| 893 | 896 | ||
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d10ab6b9b5e0..d63095472ea9 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
| @@ -49,7 +49,7 @@ | |||
| 49 | #include <linux/cpu.h> | 49 | #include <linux/cpu.h> |
| 50 | #include <linux/jump_label.h> | 50 | #include <linux/jump_label.h> |
| 51 | 51 | ||
| 52 | #include <asm-generic/sections.h> | 52 | #include <asm/sections.h> |
| 53 | #include <asm/cacheflush.h> | 53 | #include <asm/cacheflush.h> |
| 54 | #include <asm/errno.h> | 54 | #include <asm/errno.h> |
| 55 | #include <asm/uaccess.h> | 55 | #include <asm/uaccess.h> |
diff --git a/kernel/kthread.c b/kernel/kthread.c index 9ff173dca1ae..be2cc1f9dd57 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
| @@ -64,7 +64,7 @@ static inline struct kthread *to_kthread(struct task_struct *k) | |||
| 64 | static struct kthread *to_live_kthread(struct task_struct *k) | 64 | static struct kthread *to_live_kthread(struct task_struct *k) |
| 65 | { | 65 | { |
| 66 | struct completion *vfork = ACCESS_ONCE(k->vfork_done); | 66 | struct completion *vfork = ACCESS_ONCE(k->vfork_done); |
| 67 | if (likely(vfork)) | 67 | if (likely(vfork) && try_get_task_stack(k)) |
| 68 | return __to_kthread(vfork); | 68 | return __to_kthread(vfork); |
| 69 | return NULL; | 69 | return NULL; |
| 70 | } | 70 | } |
| @@ -138,7 +138,7 @@ void *kthread_data(struct task_struct *task) | |||
| 138 | } | 138 | } |
| 139 | 139 | ||
| 140 | /** | 140 | /** |
| 141 | * probe_kthread_data - speculative version of kthread_data() | 141 | * kthread_probe_data - speculative version of kthread_data() |
| 142 | * @task: possible kthread task in question | 142 | * @task: possible kthread task in question |
| 143 | * | 143 | * |
| 144 | * @task could be a kthread task. Return the data value specified when it | 144 | * @task could be a kthread task. Return the data value specified when it |
| @@ -146,7 +146,7 @@ void *kthread_data(struct task_struct *task) | |||
| 146 | * inaccessible for any reason, %NULL is returned. This function requires | 146 | * inaccessible for any reason, %NULL is returned. This function requires |
| 147 | * that @task itself is safe to dereference. | 147 | * that @task itself is safe to dereference. |
| 148 | */ | 148 | */ |
| 149 | void *probe_kthread_data(struct task_struct *task) | 149 | void *kthread_probe_data(struct task_struct *task) |
| 150 | { | 150 | { |
| 151 | struct kthread *kthread = to_kthread(task); | 151 | struct kthread *kthread = to_kthread(task); |
| 152 | void *data = NULL; | 152 | void *data = NULL; |
| @@ -244,33 +244,10 @@ static void create_kthread(struct kthread_create_info *create) | |||
| 244 | } | 244 | } |
| 245 | } | 245 | } |
| 246 | 246 | ||
| 247 | /** | 247 | static struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), |
| 248 | * kthread_create_on_node - create a kthread. | 248 | void *data, int node, |
| 249 | * @threadfn: the function to run until signal_pending(current). | 249 | const char namefmt[], |
| 250 | * @data: data ptr for @threadfn. | 250 | va_list args) |
| 251 | * @node: task and thread structures for the thread are allocated on this node | ||
| 252 | * @namefmt: printf-style name for the thread. | ||
| 253 | * | ||
| 254 | * Description: This helper function creates and names a kernel | ||
| 255 | * thread. The thread will be stopped: use wake_up_process() to start | ||
| 256 | * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and | ||
| 257 | * is affine to all CPUs. | ||
| 258 | * | ||
| 259 | * If thread is going to be bound on a particular cpu, give its node | ||
| 260 | * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. | ||
| 261 | * When woken, the thread will run @threadfn() with @data as its | ||
| 262 | * argument. @threadfn() can either call do_exit() directly if it is a | ||
| 263 | * standalone thread for which no one will call kthread_stop(), or | ||
| 264 | * return when 'kthread_should_stop()' is true (which means | ||
| 265 | * kthread_stop() has been called). The return value should be zero | ||
| 266 | * or a negative error number; it will be passed to kthread_stop(). | ||
| 267 | * | ||
| 268 | * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). | ||
| 269 | */ | ||
| 270 | struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), | ||
| 271 | void *data, int node, | ||
| 272 | const char namefmt[], | ||
| 273 | ...) | ||
| 274 | { | 251 | { |
| 275 | DECLARE_COMPLETION_ONSTACK(done); | 252 | DECLARE_COMPLETION_ONSTACK(done); |
| 276 | struct task_struct *task; | 253 | struct task_struct *task; |
| @@ -311,11 +288,8 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), | |||
| 311 | task = create->result; | 288 | task = create->result; |
| 312 | if (!IS_ERR(task)) { | 289 | if (!IS_ERR(task)) { |
| 313 | static const struct sched_param param = { .sched_priority = 0 }; | 290 | static const struct sched_param param = { .sched_priority = 0 }; |
| 314 | va_list args; | ||
| 315 | 291 | ||
| 316 | va_start(args, namefmt); | ||
| 317 | vsnprintf(task->comm, sizeof(task->comm), namefmt, args); | 292 | vsnprintf(task->comm, sizeof(task->comm), namefmt, args); |
| 318 | va_end(args); | ||
| 319 | /* | 293 | /* |
| 320 | * root may have changed our (kthreadd's) priority or CPU mask. | 294 | * root may have changed our (kthreadd's) priority or CPU mask. |
| 321 | * The kernel thread should not inherit these properties. | 295 | * The kernel thread should not inherit these properties. |
| @@ -326,6 +300,44 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), | |||
| 326 | kfree(create); | 300 | kfree(create); |
| 327 | return task; | 301 | return task; |
| 328 | } | 302 | } |
| 303 | |||
| 304 | /** | ||
| 305 | * kthread_create_on_node - create a kthread. | ||
| 306 | * @threadfn: the function to run until signal_pending(current). | ||
| 307 | * @data: data ptr for @threadfn. | ||
| 308 | * @node: task and thread structures for the thread are allocated on this node | ||
| 309 | * @namefmt: printf-style name for the thread. | ||
| 310 | * | ||
| 311 | * Description: This helper function creates and names a kernel | ||
| 312 | * thread. The thread will be stopped: use wake_up_process() to start | ||
| 313 | * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and | ||
| 314 | * is affine to all CPUs. | ||
| 315 | * | ||
| 316 | * If thread is going to be bound on a particular cpu, give its node | ||
| 317 | * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. | ||
| 318 | * When woken, the thread will run @threadfn() with @data as its | ||
| 319 | * argument. @threadfn() can either call do_exit() directly if it is a | ||
| 320 | * standalone thread for which no one will call kthread_stop(), or | ||
| 321 | * return when 'kthread_should_stop()' is true (which means | ||
| 322 | * kthread_stop() has been called). The return value should be zero | ||
| 323 | * or a negative error number; it will be passed to kthread_stop(). | ||
| 324 | * | ||
| 325 | * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). | ||
| 326 | */ | ||
| 327 | struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), | ||
| 328 | void *data, int node, | ||
| 329 | const char namefmt[], | ||
| 330 | ...) | ||
| 331 | { | ||
| 332 | struct task_struct *task; | ||
| 333 | va_list args; | ||
| 334 | |||
| 335 | va_start(args, namefmt); | ||
| 336 | task = __kthread_create_on_node(threadfn, data, node, namefmt, args); | ||
| 337 | va_end(args); | ||
| 338 | |||
| 339 | return task; | ||
| 340 | } | ||
| 329 | EXPORT_SYMBOL(kthread_create_on_node); | 341 | EXPORT_SYMBOL(kthread_create_on_node); |
| 330 | 342 | ||
| 331 | static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state) | 343 | static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state) |
| @@ -390,10 +402,10 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), | |||
| 390 | cpu); | 402 | cpu); |
| 391 | if (IS_ERR(p)) | 403 | if (IS_ERR(p)) |
| 392 | return p; | 404 | return p; |
| 405 | kthread_bind(p, cpu); | ||
| 406 | /* CPU hotplug need to bind once again when unparking the thread. */ | ||
| 393 | set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); | 407 | set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); |
| 394 | to_kthread(p)->cpu = cpu; | 408 | to_kthread(p)->cpu = cpu; |
| 395 | /* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */ | ||
| 396 | kthread_park(p); | ||
| 397 | return p; | 409 | return p; |
| 398 | } | 410 | } |
| 399 | 411 | ||
| @@ -407,6 +419,10 @@ static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) | |||
| 407 | * which might be about to be cleared. | 419 | * which might be about to be cleared. |
| 408 | */ | 420 | */ |
| 409 | if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { | 421 | if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { |
| 422 | /* | ||
| 423 | * Newly created kthread was parked when the CPU was offline. | ||
| 424 | * The binding was lost and we need to set it again. | ||
| 425 | */ | ||
| 410 | if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) | 426 | if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) |
| 411 | __kthread_bind(k, kthread->cpu, TASK_PARKED); | 427 | __kthread_bind(k, kthread->cpu, TASK_PARKED); |
| 412 | wake_up_state(k, TASK_PARKED); | 428 | wake_up_state(k, TASK_PARKED); |
| @@ -425,8 +441,10 @@ void kthread_unpark(struct task_struct *k) | |||
| 425 | { | 441 | { |
| 426 | struct kthread *kthread = to_live_kthread(k); | 442 | struct kthread *kthread = to_live_kthread(k); |
| 427 | 443 | ||
| 428 | if (kthread) | 444 | if (kthread) { |
| 429 | __kthread_unpark(k, kthread); | 445 | __kthread_unpark(k, kthread); |
| 446 | put_task_stack(k); | ||
| 447 | } | ||
| 430 | } | 448 | } |
| 431 | EXPORT_SYMBOL_GPL(kthread_unpark); | 449 | EXPORT_SYMBOL_GPL(kthread_unpark); |
| 432 | 450 | ||
| @@ -455,6 +473,7 @@ int kthread_park(struct task_struct *k) | |||
| 455 | wait_for_completion(&kthread->parked); | 473 | wait_for_completion(&kthread->parked); |
| 456 | } | 474 | } |
| 457 | } | 475 | } |
| 476 | put_task_stack(k); | ||
| 458 | ret = 0; | 477 | ret = 0; |
| 459 | } | 478 | } |
| 460 | return ret; | 479 | return ret; |
| @@ -490,6 +509,7 @@ int kthread_stop(struct task_struct *k) | |||
| 490 | __kthread_unpark(k, kthread); | 509 | __kthread_unpark(k, kthread); |
| 491 | wake_up_process(k); | 510 | wake_up_process(k); |
| 492 | wait_for_completion(&kthread->exited); | 511 | wait_for_completion(&kthread->exited); |
| 512 | put_task_stack(k); | ||
| 493 | } | 513 | } |
| 494 | ret = k->exit_code; | 514 | ret = k->exit_code; |
| 495 | put_task_struct(k); | 515 | put_task_struct(k); |
| @@ -536,39 +556,48 @@ int kthreadd(void *unused) | |||
| 536 | return 0; | 556 | return 0; |
| 537 | } | 557 | } |
| 538 | 558 | ||
| 539 | void __init_kthread_worker(struct kthread_worker *worker, | 559 | void __kthread_init_worker(struct kthread_worker *worker, |
| 540 | const char *name, | 560 | const char *name, |
| 541 | struct lock_class_key *key) | 561 | struct lock_class_key *key) |
| 542 | { | 562 | { |
| 563 | memset(worker, 0, sizeof(struct kthread_worker)); | ||
| 543 | spin_lock_init(&worker->lock); | 564 | spin_lock_init(&worker->lock); |
| 544 | lockdep_set_class_and_name(&worker->lock, key, name); | 565 | lockdep_set_class_and_name(&worker->lock, key, name); |
| 545 | INIT_LIST_HEAD(&worker->work_list); | 566 | INIT_LIST_HEAD(&worker->work_list); |
| 546 | worker->task = NULL; | 567 | INIT_LIST_HEAD(&worker->delayed_work_list); |
| 547 | } | 568 | } |
| 548 | EXPORT_SYMBOL_GPL(__init_kthread_worker); | 569 | EXPORT_SYMBOL_GPL(__kthread_init_worker); |
| 549 | 570 | ||
| 550 | /** | 571 | /** |
| 551 | * kthread_worker_fn - kthread function to process kthread_worker | 572 | * kthread_worker_fn - kthread function to process kthread_worker |
| 552 | * @worker_ptr: pointer to initialized kthread_worker | 573 | * @worker_ptr: pointer to initialized kthread_worker |
| 553 | * | 574 | * |
| 554 | * This function can be used as @threadfn to kthread_create() or | 575 | * This function implements the main cycle of kthread worker. It processes |
| 555 | * kthread_run() with @worker_ptr argument pointing to an initialized | 576 | * work_list until it is stopped with kthread_stop(). It sleeps when the queue |
| 556 | * kthread_worker. The started kthread will process work_list until | 577 | * is empty. |
| 557 | * the it is stopped with kthread_stop(). A kthread can also call | ||
| 558 | * this function directly after extra initialization. | ||
| 559 | * | 578 | * |
| 560 | * Different kthreads can be used for the same kthread_worker as long | 579 | * The works must not keep any locks held or leave preemption or interrupts |
| 561 | * as there's only one kthread attached to it at any given time. A | 580 | * disabled when they finish. A safe point for freezing is defined after one |
| 562 | * kthread_worker without an attached kthread simply collects queued | 581 | * work finishes and before a new one is started. |
| 563 | * kthread_works. | 582 | * |
| 583 | * Also, a work must not be handled by more than one worker at the same time; | ||
| 584 | * see kthread_queue_work(). | ||
| 564 | */ | 585 | */ |
| 565 | int kthread_worker_fn(void *worker_ptr) | 586 | int kthread_worker_fn(void *worker_ptr) |
| 566 | { | 587 | { |
| 567 | struct kthread_worker *worker = worker_ptr; | 588 | struct kthread_worker *worker = worker_ptr; |
| 568 | struct kthread_work *work; | 589 | struct kthread_work *work; |
| 569 | 590 | ||
| 570 | WARN_ON(worker->task); | 591 | /* |
| 592 | * FIXME: Update the check and remove the assignment when all kthread | ||
| 593 | * worker users are created using kthread_create_worker*() functions. | ||
| 594 | */ | ||
| 595 | WARN_ON(worker->task && worker->task != current); | ||
| 571 | worker->task = current; | 596 | worker->task = current; |
| 597 | |||
| 598 | if (worker->flags & KTW_FREEZABLE) | ||
| 599 | set_freezable(); | ||
| 600 | |||
| 572 | repeat: | 601 | repeat: |
| 573 | set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ | 602 | set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ |
| 574 | 603 | ||
| @@ -601,13 +630,132 @@ repeat: | |||
| 601 | } | 630 | } |
| 602 | EXPORT_SYMBOL_GPL(kthread_worker_fn); | 631 | EXPORT_SYMBOL_GPL(kthread_worker_fn); |
| 603 | 632 | ||
| 604 | /* insert @work before @pos in @worker */ | 633 | static struct kthread_worker * |
| 605 | static void insert_kthread_work(struct kthread_worker *worker, | 634 | __kthread_create_worker(int cpu, unsigned int flags, |
| 606 | struct kthread_work *work, | 635 | const char namefmt[], va_list args) |
| 607 | struct list_head *pos) | 636 | { |
| 637 | struct kthread_worker *worker; | ||
| 638 | struct task_struct *task; | ||
| 639 | |||
| 640 | worker = kzalloc(sizeof(*worker), GFP_KERNEL); | ||
| 641 | if (!worker) | ||
| 642 | return ERR_PTR(-ENOMEM); | ||
| 643 | |||
| 644 | kthread_init_worker(worker); | ||
| 645 | |||
| 646 | if (cpu >= 0) { | ||
| 647 | char name[TASK_COMM_LEN]; | ||
| 648 | |||
| 649 | /* | ||
| 650 | * kthread_create_worker_on_cpu() allows passing a generic | ||
| 651 | * namefmt, unlike kthread_create_on_cpu(). We need | ||
| 652 | * to format it here. | ||
| 653 | */ | ||
| 654 | vsnprintf(name, sizeof(name), namefmt, args); | ||
| 655 | task = kthread_create_on_cpu(kthread_worker_fn, worker, | ||
| 656 | cpu, name); | ||
| 657 | } else { | ||
| 658 | task = __kthread_create_on_node(kthread_worker_fn, worker, | ||
| 659 | -1, namefmt, args); | ||
| 660 | } | ||
| 661 | |||
| 662 | if (IS_ERR(task)) | ||
| 663 | goto fail_task; | ||
| 664 | |||
| 665 | worker->flags = flags; | ||
| 666 | worker->task = task; | ||
| 667 | wake_up_process(task); | ||
| 668 | return worker; | ||
| 669 | |||
| 670 | fail_task: | ||
| 671 | kfree(worker); | ||
| 672 | return ERR_CAST(task); | ||
| 673 | } | ||
| 674 | |||
| 675 | /** | ||
| 676 | * kthread_create_worker - create a kthread worker | ||
| 677 | * @flags: flags modifying the default behavior of the worker | ||
| 678 | * @namefmt: printf-style name for the kthread worker (task). | ||
| 679 | * | ||
| 680 | * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) | ||
| 681 | * when the needed structures could not get allocated, and ERR_PTR(-EINTR) | ||
| 682 | * when the worker was SIGKILLed. | ||
| 683 | */ | ||
| 684 | struct kthread_worker * | ||
| 685 | kthread_create_worker(unsigned int flags, const char namefmt[], ...) | ||
| 686 | { | ||
| 687 | struct kthread_worker *worker; | ||
| 688 | va_list args; | ||
| 689 | |||
| 690 | va_start(args, namefmt); | ||
| 691 | worker = __kthread_create_worker(-1, flags, namefmt, args); | ||
| 692 | va_end(args); | ||
| 693 | |||
| 694 | return worker; | ||
| 695 | } | ||
| 696 | EXPORT_SYMBOL(kthread_create_worker); | ||
| 697 | |||
| 698 | /** | ||
| 699 | * kthread_create_worker_on_cpu - create a kthread worker and bind it | ||
| 700 | * to a given CPU and the associated NUMA node. | ||
| 701 | * @cpu: CPU number | ||
| 702 | * @flags: flags modifying the default behavior of the worker | ||
| 703 | * @namefmt: printf-style name for the kthread worker (task). | ||
| 704 | * | ||
| 705 | * Use a valid CPU number if you want to bind the kthread worker | ||
| 706 | * to the given CPU and the associated NUMA node. | ||
| 707 | * | ||
| 708 | * A good practice is to also include the cpu number in the worker name. | ||
| 709 | * For example, use kthread_create_worker_on_cpu(cpu, 0, "helper/%d", cpu). | ||
| 710 | * | ||
| 711 | * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) | ||
| 712 | * when the needed structures could not get allocated, and ERR_PTR(-EINTR) | ||
| 713 | * when the worker was SIGKILLed. | ||
| 714 | */ | ||
| 715 | struct kthread_worker * | ||
| 716 | kthread_create_worker_on_cpu(int cpu, unsigned int flags, | ||
| 717 | const char namefmt[], ...) | ||
| 718 | { | ||
| 719 | struct kthread_worker *worker; | ||
| 720 | va_list args; | ||
| 721 | |||
| 722 | va_start(args, namefmt); | ||
| 723 | worker = __kthread_create_worker(cpu, flags, namefmt, args); | ||
| 724 | va_end(args); | ||
| 725 | |||
| 726 | return worker; | ||
| 727 | } | ||
| 728 | EXPORT_SYMBOL(kthread_create_worker_on_cpu); | ||
| 729 | |||
| 730 | /* | ||
| 731 | * Returns true when the work could not be queued at the moment. | ||
| 732 | * It happens when it is already pending in a worker list | ||
| 733 | * or when it is being cancelled. | ||
| 734 | */ | ||
| 735 | static inline bool queuing_blocked(struct kthread_worker *worker, | ||
| 736 | struct kthread_work *work) | ||
| 608 | { | 737 | { |
| 609 | lockdep_assert_held(&worker->lock); | 738 | lockdep_assert_held(&worker->lock); |
| 610 | 739 | ||
| 740 | return !list_empty(&work->node) || work->canceling; | ||
| 741 | } | ||
| 742 | |||
| 743 | static void kthread_insert_work_sanity_check(struct kthread_worker *worker, | ||
| 744 | struct kthread_work *work) | ||
| 745 | { | ||
| 746 | lockdep_assert_held(&worker->lock); | ||
| 747 | WARN_ON_ONCE(!list_empty(&work->node)); | ||
| 748 | /* Do not use a work with >1 worker, see kthread_queue_work() */ | ||
| 749 | WARN_ON_ONCE(work->worker && work->worker != worker); | ||
| 750 | } | ||
| 751 | |||
| 752 | /* insert @work before @pos in @worker */ | ||
| 753 | static void kthread_insert_work(struct kthread_worker *worker, | ||
| 754 | struct kthread_work *work, | ||
| 755 | struct list_head *pos) | ||
| 756 | { | ||
| 757 | kthread_insert_work_sanity_check(worker, work); | ||
| 758 | |||
| 611 | list_add_tail(&work->node, pos); | 759 | list_add_tail(&work->node, pos); |
| 612 | work->worker = worker; | 760 | work->worker = worker; |
| 613 | if (!worker->current_work && likely(worker->task)) | 761 | if (!worker->current_work && likely(worker->task)) |
| @@ -615,29 +763,133 @@ static void insert_kthread_work(struct kthread_worker *worker, | |||
| 615 | } | 763 | } |
| 616 | 764 | ||
| 617 | /** | 765 | /** |
| 618 | * queue_kthread_work - queue a kthread_work | 766 | * kthread_queue_work - queue a kthread_work |
| 619 | * @worker: target kthread_worker | 767 | * @worker: target kthread_worker |
| 620 | * @work: kthread_work to queue | 768 | * @work: kthread_work to queue |
| 621 | * | 769 | * |
| 622 | * Queue @work to work processor @task for async execution. @task | 770 | * Queue @work to work processor @task for async execution. @task |
| 623 | * must have been created with kthread_worker_create(). Returns %true | 771 | * must have been created with kthread_worker_create(). Returns %true |
| 624 | * if @work was successfully queued, %false if it was already pending. | 772 | * if @work was successfully queued, %false if it was already pending. |
| 773 | * | ||
| 774 | * Reinitialize the work if it needs to be used by another worker. | ||
| 775 | * For example, when the worker was stopped and started again. | ||
| 625 | */ | 776 | */ |
| 626 | bool queue_kthread_work(struct kthread_worker *worker, | 777 | bool kthread_queue_work(struct kthread_worker *worker, |
| 627 | struct kthread_work *work) | 778 | struct kthread_work *work) |
| 628 | { | 779 | { |
| 629 | bool ret = false; | 780 | bool ret = false; |
| 630 | unsigned long flags; | 781 | unsigned long flags; |
| 631 | 782 | ||
| 632 | spin_lock_irqsave(&worker->lock, flags); | 783 | spin_lock_irqsave(&worker->lock, flags); |
| 633 | if (list_empty(&work->node)) { | 784 | if (!queuing_blocked(worker, work)) { |
| 634 | insert_kthread_work(worker, work, &worker->work_list); | 785 | kthread_insert_work(worker, work, &worker->work_list); |
| 786 | ret = true; | ||
| 787 | } | ||
| 788 | spin_unlock_irqrestore(&worker->lock, flags); | ||
| 789 | return ret; | ||
| 790 | } | ||
| 791 | EXPORT_SYMBOL_GPL(kthread_queue_work); | ||
| 792 | |||
| 793 | /** | ||
| 794 | * kthread_delayed_work_timer_fn - callback that queues the associated kthread | ||
| 795 | * delayed work when the timer expires. | ||
| 796 | * @__data: pointer to the data associated with the timer | ||
| 797 | * | ||
| 798 | * The format of the function is defined by struct timer_list. | ||
| 799 | * It is expected to be called from an irq-safe timer with interrupts already disabled. | ||
| 800 | */ | ||
| 801 | void kthread_delayed_work_timer_fn(unsigned long __data) | ||
| 802 | { | ||
| 803 | struct kthread_delayed_work *dwork = | ||
| 804 | (struct kthread_delayed_work *)__data; | ||
| 805 | struct kthread_work *work = &dwork->work; | ||
| 806 | struct kthread_worker *worker = work->worker; | ||
| 807 | |||
| 808 | /* | ||
| 809 | * This might happen when a pending work is reinitialized. | ||
| 810 | * It means that it is being used the wrong way. | ||
| 811 | */ | ||
| 812 | if (WARN_ON_ONCE(!worker)) | ||
| 813 | return; | ||
| 814 | |||
| 815 | spin_lock(&worker->lock); | ||
| 816 | /* Work must not be used with >1 worker, see kthread_queue_work(). */ | ||
| 817 | WARN_ON_ONCE(work->worker != worker); | ||
| 818 | |||
| 819 | /* Move the work from worker->delayed_work_list. */ | ||
| 820 | WARN_ON_ONCE(list_empty(&work->node)); | ||
| 821 | list_del_init(&work->node); | ||
| 822 | kthread_insert_work(worker, work, &worker->work_list); | ||
| 823 | |||
| 824 | spin_unlock(&worker->lock); | ||
| 825 | } | ||
| 826 | EXPORT_SYMBOL(kthread_delayed_work_timer_fn); | ||
| 827 | |||
| 828 | void __kthread_queue_delayed_work(struct kthread_worker *worker, | ||
| 829 | struct kthread_delayed_work *dwork, | ||
| 830 | unsigned long delay) | ||
| 831 | { | ||
| 832 | struct timer_list *timer = &dwork->timer; | ||
| 833 | struct kthread_work *work = &dwork->work; | ||
| 834 | |||
| 835 | WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn || | ||
| 836 | timer->data != (unsigned long)dwork); | ||
| 837 | |||
| 838 | /* | ||
| 839 | * If @delay is 0, queue @dwork->work immediately. This is for | ||
| 840 | * both optimization and correctness. The earliest @timer can | ||
| 841 | * expire is on the closest next tick, and delayed_work users depend | ||
| 842 | * on there being no such delay when @delay is 0. | ||
| 843 | */ | ||
| 844 | if (!delay) { | ||
| 845 | kthread_insert_work(worker, work, &worker->work_list); | ||
| 846 | return; | ||
| 847 | } | ||
| 848 | |||
| 849 | /* Be paranoid and try to detect possible races already now. */ | ||
| 850 | kthread_insert_work_sanity_check(worker, work); | ||
| 851 | |||
| 852 | list_add(&work->node, &worker->delayed_work_list); | ||
| 853 | work->worker = worker; | ||
| 854 | timer_stats_timer_set_start_info(&dwork->timer); | ||
| 855 | timer->expires = jiffies + delay; | ||
| 856 | add_timer(timer); | ||
| 857 | } | ||
| 858 | |||
| 859 | /** | ||
| 860 | * kthread_queue_delayed_work - queue the associated kthread work | ||
| 861 | * after a delay. | ||
| 862 | * @worker: target kthread_worker | ||
| 863 | * @dwork: kthread_delayed_work to queue | ||
| 864 | * @delay: number of jiffies to wait before queuing | ||
| 865 | * | ||
| 866 | * If the work has not been pending it starts a timer that will queue | ||
| 867 | * the work after the given @delay. If @delay is zero, it queues the | ||
| 868 | * work immediately. | ||
| 869 | * | ||
| 870 | * Return: %false if @work was already pending, meaning that either | ||
| 871 | * the timer was running or the work was queued. Returns %true | ||
| 872 | * otherwise. | ||
| 873 | */ | ||
| 874 | bool kthread_queue_delayed_work(struct kthread_worker *worker, | ||
| 875 | struct kthread_delayed_work *dwork, | ||
| 876 | unsigned long delay) | ||
| 877 | { | ||
| 878 | struct kthread_work *work = &dwork->work; | ||
| 879 | unsigned long flags; | ||
| 880 | bool ret = false; | ||
| 881 | |||
| 882 | spin_lock_irqsave(&worker->lock, flags); | ||
| 883 | |||
| 884 | if (!queuing_blocked(worker, work)) { | ||
| 885 | __kthread_queue_delayed_work(worker, dwork, delay); | ||
| 635 | ret = true; | 886 | ret = true; |
| 636 | } | 887 | } |
| 888 | |||
| 637 | spin_unlock_irqrestore(&worker->lock, flags); | 889 | spin_unlock_irqrestore(&worker->lock, flags); |
| 638 | return ret; | 890 | return ret; |
| 639 | } | 891 | } |
| 640 | EXPORT_SYMBOL_GPL(queue_kthread_work); | 892 | EXPORT_SYMBOL_GPL(kthread_queue_delayed_work); |
| 641 | 893 | ||
| 642 | struct kthread_flush_work { | 894 | struct kthread_flush_work { |
| 643 | struct kthread_work work; | 895 | struct kthread_work work; |
| @@ -652,12 +904,12 @@ static void kthread_flush_work_fn(struct kthread_work *work) | |||
| 652 | } | 904 | } |
| 653 | 905 | ||
| 654 | /** | 906 | /** |
| 655 | * flush_kthread_work - flush a kthread_work | 907 | * kthread_flush_work - flush a kthread_work |
| 656 | * @work: work to flush | 908 | * @work: work to flush |
| 657 | * | 909 | * |
| 658 | * If @work is queued or executing, wait for it to finish execution. | 910 | * If @work is queued or executing, wait for it to finish execution. |
| 659 | */ | 911 | */ |
| 660 | void flush_kthread_work(struct kthread_work *work) | 912 | void kthread_flush_work(struct kthread_work *work) |
| 661 | { | 913 | { |
| 662 | struct kthread_flush_work fwork = { | 914 | struct kthread_flush_work fwork = { |
| 663 | KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), | 915 | KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), |
| @@ -666,21 +918,19 @@ void flush_kthread_work(struct kthread_work *work) | |||
| 666 | struct kthread_worker *worker; | 918 | struct kthread_worker *worker; |
| 667 | bool noop = false; | 919 | bool noop = false; |
| 668 | 920 | ||
| 669 | retry: | ||
| 670 | worker = work->worker; | 921 | worker = work->worker; |
| 671 | if (!worker) | 922 | if (!worker) |
| 672 | return; | 923 | return; |
| 673 | 924 | ||
| 674 | spin_lock_irq(&worker->lock); | 925 | spin_lock_irq(&worker->lock); |
| 675 | if (work->worker != worker) { | 926 | /* Work must not be used with >1 worker, see kthread_queue_work(). */ |
| 676 | spin_unlock_irq(&worker->lock); | 927 | WARN_ON_ONCE(work->worker != worker); |
| 677 | goto retry; | ||
| 678 | } | ||
| 679 | 928 | ||
| 680 | if (!list_empty(&work->node)) | 929 | if (!list_empty(&work->node)) |
| 681 | insert_kthread_work(worker, &fwork.work, work->node.next); | 930 | kthread_insert_work(worker, &fwork.work, work->node.next); |
| 682 | else if (worker->current_work == work) | 931 | else if (worker->current_work == work) |
| 683 | insert_kthread_work(worker, &fwork.work, worker->work_list.next); | 932 | kthread_insert_work(worker, &fwork.work, |
| 933 | worker->work_list.next); | ||
| 684 | else | 934 | else |
| 685 | noop = true; | 935 | noop = true; |
| 686 | 936 | ||
| @@ -689,23 +939,214 @@ retry: | |||
| 689 | if (!noop) | 939 | if (!noop) |
| 690 | wait_for_completion(&fwork.done); | 940 | wait_for_completion(&fwork.done); |
| 691 | } | 941 | } |
| 692 | EXPORT_SYMBOL_GPL(flush_kthread_work); | 942 | EXPORT_SYMBOL_GPL(kthread_flush_work); |
| 943 | |||
| 944 | /* | ||
| 945 | * This function removes the work from the worker queue. Also it makes sure | ||
| 946 | * that it won't get queued later via the delayed work's timer. | ||
| 947 | * | ||
| 948 | * The work might still be in use when this function finishes. See the | ||
| 949 | * current_work processed by the worker. | ||
| 950 | * | ||
| 951 | * Return: %true if @work was pending and successfully canceled, | ||
| 952 | * %false if @work was not pending | ||
| 953 | */ | ||
| 954 | static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork, | ||
| 955 | unsigned long *flags) | ||
| 956 | { | ||
| 957 | /* Try to cancel the timer if exists. */ | ||
| 958 | if (is_dwork) { | ||
| 959 | struct kthread_delayed_work *dwork = | ||
| 960 | container_of(work, struct kthread_delayed_work, work); | ||
| 961 | struct kthread_worker *worker = work->worker; | ||
| 962 | |||
| 963 | /* | ||
| 964 | * del_timer_sync() must be called to make sure that the timer | ||
| 965 | * callback is not running. The lock must be temporarily released | ||
| 966 | * to avoid a deadlock with the callback. In the meantime, | ||
| 967 | * any queuing is blocked by setting the canceling counter. | ||
| 968 | */ | ||
| 969 | work->canceling++; | ||
| 970 | spin_unlock_irqrestore(&worker->lock, *flags); | ||
| 971 | del_timer_sync(&dwork->timer); | ||
| 972 | spin_lock_irqsave(&worker->lock, *flags); | ||
| 973 | work->canceling--; | ||
| 974 | } | ||
| 975 | |||
| 976 | /* | ||
| 977 | * Try to remove the work from a worker list. It might either | ||
| 978 | * be from worker->work_list or from worker->delayed_work_list. | ||
| 979 | */ | ||
| 980 | if (!list_empty(&work->node)) { | ||
| 981 | list_del_init(&work->node); | ||
| 982 | return true; | ||
| 983 | } | ||
| 984 | |||
| 985 | return false; | ||
| 986 | } | ||
| 693 | 987 | ||
| 694 | /** | 988 | /** |
| 695 | * flush_kthread_worker - flush all current works on a kthread_worker | 989 | * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work |
| 990 | * @worker: kthread worker to use | ||
| 991 | * @dwork: kthread delayed work to queue | ||
| 992 | * @delay: number of jiffies to wait before queuing | ||
| 993 | * | ||
| 994 | * If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise, | ||
| 995 | * modify @dwork's timer so that it expires after @delay. If @delay is zero, | ||
| 996 | * @work is guaranteed to be queued immediately. | ||
| 997 | * | ||
| 998 | * Return: %true if @dwork was pending and its timer was modified, | ||
| 999 | * %false otherwise. | ||
| 1000 | * | ||
| 1001 | * A special case is when the work is being canceled in parallel. | ||
| 1002 | * It might be caused either by the real kthread_cancel_delayed_work_sync() | ||
| 1003 | * or yet another kthread_mod_delayed_work() call. We let the other command | ||
| 1004 | * win and return %false here. The caller is supposed to synchronize these | ||
| 1005 | * operations in a reasonable way. | ||
| 1006 | * | ||
| 1007 | * This function is safe to call from any context including IRQ handler. | ||
| 1008 | * See __kthread_cancel_work() and kthread_delayed_work_timer_fn() | ||
| 1009 | * for details. | ||
| 1010 | */ | ||
| 1011 | bool kthread_mod_delayed_work(struct kthread_worker *worker, | ||
| 1012 | struct kthread_delayed_work *dwork, | ||
| 1013 | unsigned long delay) | ||
| 1014 | { | ||
| 1015 | struct kthread_work *work = &dwork->work; | ||
| 1016 | unsigned long flags; | ||
| 1017 | int ret = false; | ||
| 1018 | |||
| 1019 | spin_lock_irqsave(&worker->lock, flags); | ||
| 1020 | |||
| 1021 | /* Do not bother with canceling when never queued. */ | ||
| 1022 | if (!work->worker) | ||
| 1023 | goto fast_queue; | ||
| 1024 | |||
| 1025 | /* Work must not be used with >1 worker, see kthread_queue_work() */ | ||
| 1026 | WARN_ON_ONCE(work->worker != worker); | ||
| 1027 | |||
| 1028 | /* Do not fight with another command that is canceling this work. */ | ||
| 1029 | if (work->canceling) | ||
| 1030 | goto out; | ||
| 1031 | |||
| 1032 | ret = __kthread_cancel_work(work, true, &flags); | ||
| 1033 | fast_queue: | ||
| 1034 | __kthread_queue_delayed_work(worker, dwork, delay); | ||
| 1035 | out: | ||
| 1036 | spin_unlock_irqrestore(&worker->lock, flags); | ||
| 1037 | return ret; | ||
| 1038 | } | ||
| 1039 | EXPORT_SYMBOL_GPL(kthread_mod_delayed_work); | ||
| 1040 | |||
| 1041 | static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) | ||
| 1042 | { | ||
| 1043 | struct kthread_worker *worker = work->worker; | ||
| 1044 | unsigned long flags; | ||
| 1045 | int ret = false; | ||
| 1046 | |||
| 1047 | if (!worker) | ||
| 1048 | goto out; | ||
| 1049 | |||
| 1050 | spin_lock_irqsave(&worker->lock, flags); | ||
| 1051 | /* Work must not be used with >1 worker, see kthread_queue_work(). */ | ||
| 1052 | WARN_ON_ONCE(work->worker != worker); | ||
| 1053 | |||
| 1054 | ret = __kthread_cancel_work(work, is_dwork, &flags); | ||
| 1055 | |||
| 1056 | if (worker->current_work != work) | ||
| 1057 | goto out_fast; | ||
| 1058 | |||
| 1059 | /* | ||
| 1060 | * The work is in progress and we need to wait with the lock released. | ||
| 1061 | * In the meantime, block any queuing by setting the canceling counter. | ||
| 1062 | */ | ||
| 1063 | work->canceling++; | ||
| 1064 | spin_unlock_irqrestore(&worker->lock, flags); | ||
| 1065 | kthread_flush_work(work); | ||
| 1066 | spin_lock_irqsave(&worker->lock, flags); | ||
| 1067 | work->canceling--; | ||
| 1068 | |||
| 1069 | out_fast: | ||
| 1070 | spin_unlock_irqrestore(&worker->lock, flags); | ||
| 1071 | out: | ||
| 1072 | return ret; | ||
| 1073 | } | ||
| 1074 | |||
| 1075 | /** | ||
| 1076 | * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish | ||
| 1077 | * @work: the kthread work to cancel | ||
| 1078 | * | ||
| 1079 | * Cancel @work and wait for its execution to finish. This function | ||
| 1080 | * can be used even if the work re-queues itself. On return from this | ||
| 1081 | * function, @work is guaranteed to be not pending or executing on any CPU. | ||
| 1082 | * | ||
| 1083 | * kthread_cancel_work_sync(&delayed_work->work) must not be used for | ||
| 1084 | * delayed works. Use kthread_cancel_delayed_work_sync() instead. | ||
| 1085 | * | ||
| 1086 | * The caller must ensure that the worker on which @work was last | ||
| 1087 | * queued can't be destroyed before this function returns. | ||
| 1088 | * | ||
| 1089 | * Return: %true if @work was pending, %false otherwise. | ||
| 1090 | */ | ||
| 1091 | bool kthread_cancel_work_sync(struct kthread_work *work) | ||
| 1092 | { | ||
| 1093 | return __kthread_cancel_work_sync(work, false); | ||
| 1094 | } | ||
| 1095 | EXPORT_SYMBOL_GPL(kthread_cancel_work_sync); | ||
| 1096 | |||
| 1097 | /** | ||
| 1098 | * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and | ||
| 1099 | * wait for it to finish. | ||
| 1100 | * @dwork: the kthread delayed work to cancel | ||
| 1101 | * | ||
| 1102 | * This is kthread_cancel_work_sync() for delayed works. | ||
| 1103 | * | ||
| 1104 | * Return: %true if @dwork was pending, %false otherwise. | ||
| 1105 | */ | ||
| 1106 | bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork) | ||
| 1107 | { | ||
| 1108 | return __kthread_cancel_work_sync(&dwork->work, true); | ||
| 1109 | } | ||
| 1110 | EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync); | ||
| 1111 | |||
| 1112 | /** | ||
| 1113 | * kthread_flush_worker - flush all current works on a kthread_worker | ||
| 696 | * @worker: worker to flush | 1114 | * @worker: worker to flush |
| 697 | * | 1115 | * |
| 698 | * Wait until all currently executing or pending works on @worker are | 1116 | * Wait until all currently executing or pending works on @worker are |
| 699 | * finished. | 1117 | * finished. |
| 700 | */ | 1118 | */ |
| 701 | void flush_kthread_worker(struct kthread_worker *worker) | 1119 | void kthread_flush_worker(struct kthread_worker *worker) |
| 702 | { | 1120 | { |
| 703 | struct kthread_flush_work fwork = { | 1121 | struct kthread_flush_work fwork = { |
| 704 | KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), | 1122 | KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), |
| 705 | COMPLETION_INITIALIZER_ONSTACK(fwork.done), | 1123 | COMPLETION_INITIALIZER_ONSTACK(fwork.done), |
| 706 | }; | 1124 | }; |
| 707 | 1125 | ||
| 708 | queue_kthread_work(worker, &fwork.work); | 1126 | kthread_queue_work(worker, &fwork.work); |
| 709 | wait_for_completion(&fwork.done); | 1127 | wait_for_completion(&fwork.done); |
| 710 | } | 1128 | } |
| 711 | EXPORT_SYMBOL_GPL(flush_kthread_worker); | 1129 | EXPORT_SYMBOL_GPL(kthread_flush_worker); |
| 1130 | |||
| 1131 | /** | ||
| 1132 | * kthread_destroy_worker - destroy a kthread worker | ||
| 1133 | * @worker: worker to be destroyed | ||
| 1134 | * | ||
| 1135 | * Flush and destroy @worker. The simple flush is enough because the kthread | ||
| 1136 | * worker API is used only in trivial scenarios. There are no multi-step state | ||
| 1137 | * machines needed. | ||
| 1138 | */ | ||
| 1139 | void kthread_destroy_worker(struct kthread_worker *worker) | ||
| 1140 | { | ||
| 1141 | struct task_struct *task; | ||
| 1142 | |||
| 1143 | task = worker->task; | ||
| 1144 | if (WARN_ON(!task)) | ||
| 1145 | return; | ||
| 1146 | |||
| 1147 | kthread_flush_worker(worker); | ||
| 1148 | kthread_stop(task); | ||
| 1149 | WARN_ON(!list_empty(&worker->work_list)); | ||
| 1150 | kfree(worker); | ||
| 1151 | } | ||
| 1152 | EXPORT_SYMBOL(kthread_destroy_worker); | ||
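The kernel/kthread.c hunks above complete the kthread-worker rework: flush_kthread_worker()/queue_kthread_work() become kthread_flush_worker()/kthread_queue_work(), and kthread_mod_delayed_work(), the *_cancel_*_sync() helpers and kthread_destroy_worker() are new. A minimal usage sketch; it assumes the kthread_create_worker() and kthread_init_delayed_work() helpers added elsewhere in this series, and my_poll_fn() plus the one-second period are made up for illustration:

#include <linux/err.h>
#include <linux/jiffies.h>
#include <linux/kthread.h>

static struct kthread_worker *poll_worker;
static struct kthread_delayed_work poll_work;

static void my_poll_fn(struct kthread_work *work)
{
	/* do the periodic work, then re-arm ourselves */
	kthread_queue_delayed_work(poll_worker, &poll_work, HZ);
}

static int my_start(void)
{
	poll_worker = kthread_create_worker(0, "my-poller");
	if (IS_ERR(poll_worker))
		return PTR_ERR(poll_worker);

	kthread_init_delayed_work(&poll_work, my_poll_fn);
	kthread_queue_delayed_work(poll_worker, &poll_work, HZ);
	return 0;
}

static void my_stop(void)
{
	/* safe even though the work re-queues itself */
	kthread_cancel_delayed_work_sync(&poll_work);
	kthread_destroy_worker(poll_worker);
}

Note that kthread_destroy_worker() flushes the worker, stops its task and frees it, so it is meant for workers allocated by kthread_create_worker().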
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 8bbe50704621..af4643873e71 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c | |||
| @@ -274,7 +274,6 @@ static int klp_write_object_relocations(struct module *pmod, | |||
| 274 | 274 | ||
| 275 | objname = klp_is_module(obj) ? obj->name : "vmlinux"; | 275 | objname = klp_is_module(obj) ? obj->name : "vmlinux"; |
| 276 | 276 | ||
| 277 | module_disable_ro(pmod); | ||
| 278 | /* For each klp relocation section */ | 277 | /* For each klp relocation section */ |
| 279 | for (i = 1; i < pmod->klp_info->hdr.e_shnum; i++) { | 278 | for (i = 1; i < pmod->klp_info->hdr.e_shnum; i++) { |
| 280 | sec = pmod->klp_info->sechdrs + i; | 279 | sec = pmod->klp_info->sechdrs + i; |
| @@ -309,7 +308,6 @@ static int klp_write_object_relocations(struct module *pmod, | |||
| 309 | break; | 308 | break; |
| 310 | } | 309 | } |
| 311 | 310 | ||
| 312 | module_enable_ro(pmod, true); | ||
| 313 | return ret; | 311 | return ret; |
| 314 | } | 312 | } |
| 315 | 313 | ||
| @@ -547,9 +545,6 @@ static int __klp_enable_patch(struct klp_patch *patch) | |||
| 547 | list_prev_entry(patch, list)->state == KLP_DISABLED) | 545 | list_prev_entry(patch, list)->state == KLP_DISABLED) |
| 548 | return -EBUSY; | 546 | return -EBUSY; |
| 549 | 547 | ||
| 550 | pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n"); | ||
| 551 | add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK); | ||
| 552 | |||
| 553 | pr_notice("enabling patch '%s'\n", patch->mod->name); | 548 | pr_notice("enabling patch '%s'\n", patch->mod->name); |
| 554 | 549 | ||
| 555 | klp_for_each_object(patch, obj) { | 550 | klp_for_each_object(patch, obj) { |
| @@ -763,6 +758,12 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) | |||
| 763 | func->old_sympos ? func->old_sympos : 1); | 758 | func->old_sympos ? func->old_sympos : 1); |
| 764 | } | 759 | } |
| 765 | 760 | ||
| 761 | /* Arches may override this to finish any remaining arch-specific tasks */ | ||
| 762 | void __weak arch_klp_init_object_loaded(struct klp_patch *patch, | ||
| 763 | struct klp_object *obj) | ||
| 764 | { | ||
| 765 | } | ||
| 766 | |||
| 766 | /* parts of the initialization that is done only when the object is loaded */ | 767 | /* parts of the initialization that is done only when the object is loaded */ |
| 767 | static int klp_init_object_loaded(struct klp_patch *patch, | 768 | static int klp_init_object_loaded(struct klp_patch *patch, |
| 768 | struct klp_object *obj) | 769 | struct klp_object *obj) |
| @@ -770,9 +771,15 @@ static int klp_init_object_loaded(struct klp_patch *patch, | |||
| 770 | struct klp_func *func; | 771 | struct klp_func *func; |
| 771 | int ret; | 772 | int ret; |
| 772 | 773 | ||
| 774 | module_disable_ro(patch->mod); | ||
| 773 | ret = klp_write_object_relocations(patch->mod, obj); | 775 | ret = klp_write_object_relocations(patch->mod, obj); |
| 774 | if (ret) | 776 | if (ret) { |
| 777 | module_enable_ro(patch->mod, true); | ||
| 775 | return ret; | 778 | return ret; |
| 779 | } | ||
| 780 | |||
| 781 | arch_klp_init_object_loaded(patch, obj); | ||
| 782 | module_enable_ro(patch->mod, true); | ||
| 776 | 783 | ||
| 777 | klp_for_each_func(obj, func) { | 784 | klp_for_each_func(obj, func) { |
| 778 | ret = klp_find_object_symbol(obj->name, func->old_name, | 785 | ret = klp_find_object_symbol(obj->name, func->old_name, |
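Net effect of the kernel/livepatch/core.c changes: module_disable_ro()/module_enable_ro() move out of klp_write_object_relocations() and now bracket both the relocation pass and a new __weak arch_klp_init_object_loaded() hook in klp_init_object_loaded(), and the unconditional TAINT_LIVEPATCH in __klp_enable_patch() is dropped in favour of tainting at module load time (see the kernel/module.c hunk further down). A sketch of what an architecture-side override of the weak hook could look like; the body is illustrative only and not taken from any real port:

/* Runs after the klp relocations have been written and while patch->mod
 * is still writable, so arch code can fix up extra per-object data here. */
void arch_klp_init_object_loaded(struct klp_patch *patch,
				 struct klp_object *obj)
{
	struct klp_func *func;

	klp_for_each_func(obj, func)
		pr_debug("livepatch: prepared %s:%s\n",
			 obj->name ? obj->name : "vmlinux", func->old_name);
}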
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 31322a4275cd..6f88e352cd4f 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile | |||
| @@ -18,7 +18,6 @@ obj-$(CONFIG_LOCKDEP) += lockdep_proc.o | |||
| 18 | endif | 18 | endif |
| 19 | obj-$(CONFIG_SMP) += spinlock.o | 19 | obj-$(CONFIG_SMP) += spinlock.o |
| 20 | obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o | 20 | obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o |
| 21 | obj-$(CONFIG_SMP) += lglock.o | ||
| 22 | obj-$(CONFIG_PROVE_LOCKING) += spinlock.o | 21 | obj-$(CONFIG_PROVE_LOCKING) += spinlock.o |
| 23 | obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o | 22 | obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o |
| 24 | obj-$(CONFIG_RT_MUTEXES) += rtmutex.o | 23 | obj-$(CONFIG_RT_MUTEXES) += rtmutex.o |
diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c deleted file mode 100644 index 951cfcd10b4a..000000000000 --- a/kernel/locking/lglock.c +++ /dev/null | |||
| @@ -1,111 +0,0 @@ | |||
| 1 | /* See include/linux/lglock.h for description */ | ||
| 2 | #include <linux/module.h> | ||
| 3 | #include <linux/lglock.h> | ||
| 4 | #include <linux/cpu.h> | ||
| 5 | #include <linux/string.h> | ||
| 6 | |||
| 7 | /* | ||
| 8 | * Note there is no uninit, so lglocks cannot be defined in | ||
| 9 | * modules (but it's fine to use them from there) | ||
| 10 | * Could be added though, just undo lg_lock_init | ||
| 11 | */ | ||
| 12 | |||
| 13 | void lg_lock_init(struct lglock *lg, char *name) | ||
| 14 | { | ||
| 15 | LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0); | ||
| 16 | } | ||
| 17 | EXPORT_SYMBOL(lg_lock_init); | ||
| 18 | |||
| 19 | void lg_local_lock(struct lglock *lg) | ||
| 20 | { | ||
| 21 | arch_spinlock_t *lock; | ||
| 22 | |||
| 23 | preempt_disable(); | ||
| 24 | lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); | ||
| 25 | lock = this_cpu_ptr(lg->lock); | ||
| 26 | arch_spin_lock(lock); | ||
| 27 | } | ||
| 28 | EXPORT_SYMBOL(lg_local_lock); | ||
| 29 | |||
| 30 | void lg_local_unlock(struct lglock *lg) | ||
| 31 | { | ||
| 32 | arch_spinlock_t *lock; | ||
| 33 | |||
| 34 | lock_release(&lg->lock_dep_map, 1, _RET_IP_); | ||
| 35 | lock = this_cpu_ptr(lg->lock); | ||
| 36 | arch_spin_unlock(lock); | ||
| 37 | preempt_enable(); | ||
| 38 | } | ||
| 39 | EXPORT_SYMBOL(lg_local_unlock); | ||
| 40 | |||
| 41 | void lg_local_lock_cpu(struct lglock *lg, int cpu) | ||
| 42 | { | ||
| 43 | arch_spinlock_t *lock; | ||
| 44 | |||
| 45 | preempt_disable(); | ||
| 46 | lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); | ||
| 47 | lock = per_cpu_ptr(lg->lock, cpu); | ||
| 48 | arch_spin_lock(lock); | ||
| 49 | } | ||
| 50 | EXPORT_SYMBOL(lg_local_lock_cpu); | ||
| 51 | |||
| 52 | void lg_local_unlock_cpu(struct lglock *lg, int cpu) | ||
| 53 | { | ||
| 54 | arch_spinlock_t *lock; | ||
| 55 | |||
| 56 | lock_release(&lg->lock_dep_map, 1, _RET_IP_); | ||
| 57 | lock = per_cpu_ptr(lg->lock, cpu); | ||
| 58 | arch_spin_unlock(lock); | ||
| 59 | preempt_enable(); | ||
| 60 | } | ||
| 61 | EXPORT_SYMBOL(lg_local_unlock_cpu); | ||
| 62 | |||
| 63 | void lg_double_lock(struct lglock *lg, int cpu1, int cpu2) | ||
| 64 | { | ||
| 65 | BUG_ON(cpu1 == cpu2); | ||
| 66 | |||
| 67 | /* lock in cpu order, just like lg_global_lock */ | ||
| 68 | if (cpu2 < cpu1) | ||
| 69 | swap(cpu1, cpu2); | ||
| 70 | |||
| 71 | preempt_disable(); | ||
| 72 | lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); | ||
| 73 | arch_spin_lock(per_cpu_ptr(lg->lock, cpu1)); | ||
| 74 | arch_spin_lock(per_cpu_ptr(lg->lock, cpu2)); | ||
| 75 | } | ||
| 76 | |||
| 77 | void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2) | ||
| 78 | { | ||
| 79 | lock_release(&lg->lock_dep_map, 1, _RET_IP_); | ||
| 80 | arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1)); | ||
| 81 | arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2)); | ||
| 82 | preempt_enable(); | ||
| 83 | } | ||
| 84 | |||
| 85 | void lg_global_lock(struct lglock *lg) | ||
| 86 | { | ||
| 87 | int i; | ||
| 88 | |||
| 89 | preempt_disable(); | ||
| 90 | lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); | ||
| 91 | for_each_possible_cpu(i) { | ||
| 92 | arch_spinlock_t *lock; | ||
| 93 | lock = per_cpu_ptr(lg->lock, i); | ||
| 94 | arch_spin_lock(lock); | ||
| 95 | } | ||
| 96 | } | ||
| 97 | EXPORT_SYMBOL(lg_global_lock); | ||
| 98 | |||
| 99 | void lg_global_unlock(struct lglock *lg) | ||
| 100 | { | ||
| 101 | int i; | ||
| 102 | |||
| 103 | lock_release(&lg->lock_dep_map, 1, _RET_IP_); | ||
| 104 | for_each_possible_cpu(i) { | ||
| 105 | arch_spinlock_t *lock; | ||
| 106 | lock = per_cpu_ptr(lg->lock, i); | ||
| 107 | arch_spin_unlock(lock); | ||
| 108 | } | ||
| 109 | preempt_enable(); | ||
| 110 | } | ||
| 111 | EXPORT_SYMBOL(lg_global_unlock); | ||
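The now-unused lglock implementation is deleted outright (its Makefile entry was removed above). As rough orientation only, this is how the shape of an lglock user maps onto the percpu-rwsem reworked in the next file; it is not semantics-preserving (percpu_down_read() may sleep, while lg_local_lock() merely disabled preemption), and my_lock is a hypothetical example:

#include <linux/percpu-rwsem.h>

static DEFINE_STATIC_PERCPU_RWSEM(my_lock);

static void frequent_local_path(void)
{
	percpu_down_read(&my_lock);	/* roughly lg_local_lock() */
	/* per-CPU / read-side work */
	percpu_up_read(&my_lock);	/* roughly lg_local_unlock() */
}

static void rare_global_path(void)
{
	percpu_down_write(&my_lock);	/* roughly lg_global_lock() */
	/* exclusive work against all CPUs */
	percpu_up_write(&my_lock);	/* roughly lg_global_unlock() */
}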
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index bec0b647f9cc..ce182599cf2e 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c | |||
| @@ -8,152 +8,186 @@ | |||
| 8 | #include <linux/sched.h> | 8 | #include <linux/sched.h> |
| 9 | #include <linux/errno.h> | 9 | #include <linux/errno.h> |
| 10 | 10 | ||
| 11 | int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, | 11 | int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, |
| 12 | const char *name, struct lock_class_key *rwsem_key) | 12 | const char *name, struct lock_class_key *rwsem_key) |
| 13 | { | 13 | { |
| 14 | brw->fast_read_ctr = alloc_percpu(int); | 14 | sem->read_count = alloc_percpu(int); |
| 15 | if (unlikely(!brw->fast_read_ctr)) | 15 | if (unlikely(!sem->read_count)) |
| 16 | return -ENOMEM; | 16 | return -ENOMEM; |
| 17 | 17 | ||
| 18 | /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ | 18 | /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ |
| 19 | __init_rwsem(&brw->rw_sem, name, rwsem_key); | 19 | rcu_sync_init(&sem->rss, RCU_SCHED_SYNC); |
| 20 | rcu_sync_init(&brw->rss, RCU_SCHED_SYNC); | 20 | __init_rwsem(&sem->rw_sem, name, rwsem_key); |
| 21 | atomic_set(&brw->slow_read_ctr, 0); | 21 | init_waitqueue_head(&sem->writer); |
| 22 | init_waitqueue_head(&brw->write_waitq); | 22 | sem->readers_block = 0; |
| 23 | return 0; | 23 | return 0; |
| 24 | } | 24 | } |
| 25 | EXPORT_SYMBOL_GPL(__percpu_init_rwsem); | 25 | EXPORT_SYMBOL_GPL(__percpu_init_rwsem); |
| 26 | 26 | ||
| 27 | void percpu_free_rwsem(struct percpu_rw_semaphore *brw) | 27 | void percpu_free_rwsem(struct percpu_rw_semaphore *sem) |
| 28 | { | 28 | { |
| 29 | /* | 29 | /* |
| 30 | * XXX: temporary kludge. The error path in alloc_super() | 30 | * XXX: temporary kludge. The error path in alloc_super() |
| 31 | * assumes that percpu_free_rwsem() is safe after kzalloc(). | 31 | * assumes that percpu_free_rwsem() is safe after kzalloc(). |
| 32 | */ | 32 | */ |
| 33 | if (!brw->fast_read_ctr) | 33 | if (!sem->read_count) |
| 34 | return; | 34 | return; |
| 35 | 35 | ||
| 36 | rcu_sync_dtor(&brw->rss); | 36 | rcu_sync_dtor(&sem->rss); |
| 37 | free_percpu(brw->fast_read_ctr); | 37 | free_percpu(sem->read_count); |
| 38 | brw->fast_read_ctr = NULL; /* catch use after free bugs */ | 38 | sem->read_count = NULL; /* catch use after free bugs */ |
| 39 | } | 39 | } |
| 40 | EXPORT_SYMBOL_GPL(percpu_free_rwsem); | 40 | EXPORT_SYMBOL_GPL(percpu_free_rwsem); |
| 41 | 41 | ||
| 42 | /* | 42 | int __percpu_down_read(struct percpu_rw_semaphore *sem, int try) |
| 43 | * This is the fast-path for down_read/up_read. If it succeeds we rely | ||
| 44 | * on the barriers provided by rcu_sync_enter/exit; see the comments in | ||
| 45 | * percpu_down_write() and percpu_up_write(). | ||
| 46 | * | ||
| 47 | * If this helper fails the callers rely on the normal rw_semaphore and | ||
| 48 | * atomic_dec_and_test(), so in this case we have the necessary barriers. | ||
| 49 | */ | ||
| 50 | static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) | ||
| 51 | { | 43 | { |
| 52 | bool success; | 44 | /* |
| 45 | * Due to having preemption disabled the decrement happens on | ||
| 46 | * the same CPU as the increment, avoiding the | ||
| 47 | * increment-on-one-CPU-and-decrement-on-another problem. | ||
| 48 | * | ||
| 49 | * If the reader misses the writer's assignment of readers_block, then | ||
| 50 | * the writer is guaranteed to see the reader's increment. | ||
| 51 | * | ||
| 52 | * Conversely, any readers that increment their sem->read_count after | ||
| 53 | * the writer looks are guaranteed to see the readers_block value, | ||
| 54 | * which in turn means that they are guaranteed to immediately | ||
| 55 | * decrement their sem->read_count, so that it doesn't matter that the | ||
| 56 | * writer missed them. | ||
| 57 | */ | ||
| 53 | 58 | ||
| 54 | preempt_disable(); | 59 | smp_mb(); /* A matches D */ |
| 55 | success = rcu_sync_is_idle(&brw->rss); | ||
| 56 | if (likely(success)) | ||
| 57 | __this_cpu_add(*brw->fast_read_ctr, val); | ||
| 58 | preempt_enable(); | ||
| 59 | 60 | ||
| 60 | return success; | 61 | /* |
| 61 | } | 62 | * If !readers_block the critical section starts here, matched by the |
| 63 | * release in percpu_up_write(). | ||
| 64 | */ | ||
| 65 | if (likely(!smp_load_acquire(&sem->readers_block))) | ||
| 66 | return 1; | ||
| 62 | 67 | ||
| 63 | /* | 68 | /* |
| 64 | * Like the normal down_read() this is not recursive, the writer can | 69 | * Per the above comment; we still have preemption disabled and |
| 65 | * come after the first percpu_down_read() and create the deadlock. | 70 | * will thus decrement on the same CPU as we incremented. |
| 66 | * | 71 | */ |
| 67 | * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep, | 72 | __percpu_up_read(sem); |
| 68 | * percpu_up_read() does rwsem_release(). This pairs with the usage | ||
| 69 | * of ->rw_sem in percpu_down/up_write(). | ||
| 70 | */ | ||
| 71 | void percpu_down_read(struct percpu_rw_semaphore *brw) | ||
| 72 | { | ||
| 73 | might_sleep(); | ||
| 74 | rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); | ||
| 75 | 73 | ||
| 76 | if (likely(update_fast_ctr(brw, +1))) | 74 | if (try) |
| 77 | return; | 75 | return 0; |
| 78 | 76 | ||
| 79 | /* Avoid rwsem_acquire_read() and rwsem_release() */ | 77 | /* |
| 80 | __down_read(&brw->rw_sem); | 78 | * We either call schedule() in the wait, or we'll fall through |
| 81 | atomic_inc(&brw->slow_read_ctr); | 79 | * and reschedule on the preempt_enable() in percpu_down_read(). |
| 82 | __up_read(&brw->rw_sem); | 80 | */ |
| 83 | } | 81 | preempt_enable_no_resched(); |
| 84 | EXPORT_SYMBOL_GPL(percpu_down_read); | ||
| 85 | 82 | ||
| 86 | int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) | 83 | /* |
| 87 | { | 84 | * Avoid lockdep for the down/up_read() we already have them. |
| 88 | if (unlikely(!update_fast_ctr(brw, +1))) { | 85 | */ |
| 89 | if (!__down_read_trylock(&brw->rw_sem)) | 86 | __down_read(&sem->rw_sem); |
| 90 | return 0; | 87 | this_cpu_inc(*sem->read_count); |
| 91 | atomic_inc(&brw->slow_read_ctr); | 88 | __up_read(&sem->rw_sem); |
| 92 | __up_read(&brw->rw_sem); | 89 | |
| 93 | } | 90 | preempt_disable(); |
| 94 | |||
| 95 | rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_); | ||
| 96 | return 1; | 91 | return 1; |
| 97 | } | 92 | } |
| 93 | EXPORT_SYMBOL_GPL(__percpu_down_read); | ||
| 98 | 94 | ||
| 99 | void percpu_up_read(struct percpu_rw_semaphore *brw) | 95 | void __percpu_up_read(struct percpu_rw_semaphore *sem) |
| 100 | { | 96 | { |
| 101 | rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); | 97 | smp_mb(); /* B matches C */ |
| 102 | 98 | /* | |
| 103 | if (likely(update_fast_ctr(brw, -1))) | 99 | * In other words, if they see our decrement (presumably to aggregate |
| 104 | return; | 100 | * zero, as that is the only time it matters) they will also see our |
| 101 | * critical section. | ||
| 102 | */ | ||
| 103 | __this_cpu_dec(*sem->read_count); | ||
| 105 | 104 | ||
| 106 | /* false-positive is possible but harmless */ | 105 | /* Prod writer to recheck readers_active */ |
| 107 | if (atomic_dec_and_test(&brw->slow_read_ctr)) | 106 | wake_up(&sem->writer); |
| 108 | wake_up_all(&brw->write_waitq); | ||
| 109 | } | 107 | } |
| 110 | EXPORT_SYMBOL_GPL(percpu_up_read); | 108 | EXPORT_SYMBOL_GPL(__percpu_up_read); |
| 109 | |||
| 110 | #define per_cpu_sum(var) \ | ||
| 111 | ({ \ | ||
| 112 | typeof(var) __sum = 0; \ | ||
| 113 | int cpu; \ | ||
| 114 | compiletime_assert_atomic_type(__sum); \ | ||
| 115 | for_each_possible_cpu(cpu) \ | ||
| 116 | __sum += per_cpu(var, cpu); \ | ||
| 117 | __sum; \ | ||
| 118 | }) | ||
| 111 | 119 | ||
| 112 | static int clear_fast_ctr(struct percpu_rw_semaphore *brw) | 120 | /* |
| 121 | * Return true if the modular sum of the sem->read_count per-CPU variable is | ||
| 122 | * zero. If this sum is zero, then it is stable due to the fact that if any | ||
| 123 | * newly arriving readers increment a given counter, they will immediately | ||
| 124 | * decrement that same counter. | ||
| 125 | */ | ||
| 126 | static bool readers_active_check(struct percpu_rw_semaphore *sem) | ||
| 113 | { | 127 | { |
| 114 | unsigned int sum = 0; | 128 | if (per_cpu_sum(*sem->read_count) != 0) |
| 115 | int cpu; | 129 | return false; |
| 130 | |||
| 131 | /* | ||
| 132 | * If we observed the decrement; ensure we see the entire critical | ||
| 133 | * section. | ||
| 134 | */ | ||
| 116 | 135 | ||
| 117 | for_each_possible_cpu(cpu) { | 136 | smp_mb(); /* C matches B */ |
| 118 | sum += per_cpu(*brw->fast_read_ctr, cpu); | ||
| 119 | per_cpu(*brw->fast_read_ctr, cpu) = 0; | ||
| 120 | } | ||
| 121 | 137 | ||
| 122 | return sum; | 138 | return true; |
| 123 | } | 139 | } |
| 124 | 140 | ||
| 125 | void percpu_down_write(struct percpu_rw_semaphore *brw) | 141 | void percpu_down_write(struct percpu_rw_semaphore *sem) |
| 126 | { | 142 | { |
| 143 | /* Notify readers to take the slow path. */ | ||
| 144 | rcu_sync_enter(&sem->rss); | ||
| 145 | |||
| 146 | down_write(&sem->rw_sem); | ||
| 147 | |||
| 127 | /* | 148 | /* |
| 128 | * Make rcu_sync_is_idle() == F and thus disable the fast-path in | 149 | * Notify new readers to block; up until now, and thus throughout the |
| 129 | * percpu_down_read() and percpu_up_read(), and wait for gp pass. | 150 | * longish rcu_sync_enter() above, new readers could still come in. |
| 130 | * | ||
| 131 | * The latter synchronises us with the preceding readers which used | ||
| 132 | * the fast-path, so we cannot miss the result of __this_cpu_add() | ||
| 133 | * or anything else inside their critical sections. | ||
| 134 | */ | 151 | */ |
| 135 | rcu_sync_enter(&brw->rss); | 152 | WRITE_ONCE(sem->readers_block, 1); |
| 136 | 153 | ||
| 137 | /* exclude other writers, and block the new readers completely */ | 154 | smp_mb(); /* D matches A */ |
| 138 | down_write(&brw->rw_sem); | ||
| 139 | 155 | ||
| 140 | /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */ | 156 | /* |
| 141 | atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr); | 157 | * If they don't see our writer of readers_block, then we are |
| 158 | * guaranteed to see their sem->read_count increment, and therefore | ||
| 159 | * will wait for them. | ||
| 160 | */ | ||
| 142 | 161 | ||
| 143 | /* wait for all readers to complete their percpu_up_read() */ | 162 | /* Wait for all now active readers to complete. */ |
| 144 | wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); | 163 | wait_event(sem->writer, readers_active_check(sem)); |
| 145 | } | 164 | } |
| 146 | EXPORT_SYMBOL_GPL(percpu_down_write); | 165 | EXPORT_SYMBOL_GPL(percpu_down_write); |
| 147 | 166 | ||
| 148 | void percpu_up_write(struct percpu_rw_semaphore *brw) | 167 | void percpu_up_write(struct percpu_rw_semaphore *sem) |
| 149 | { | 168 | { |
| 150 | /* release the lock, but the readers can't use the fast-path */ | ||
| 151 | up_write(&brw->rw_sem); | ||
| 152 | /* | 169 | /* |
| 153 | * Enable the fast-path in percpu_down_read() and percpu_up_read() | 170 | * Signal the writer is done, no fast path yet. |
| 154 | * but only after another gp pass; this adds the necessary barrier | 171 | * |
| 155 | * to ensure the reader can't miss the changes done by us. | 172 | * One reason that we cannot just immediately flip to readers_fast is |
| 173 | * that new readers might fail to see the results of this writer's | ||
| 174 | * critical section. | ||
| 175 | * | ||
| 176 | * Therefore we force it through the slow path which guarantees an | ||
| 177 | * acquire and thereby guarantees the critical section's consistency. | ||
| 178 | */ | ||
| 179 | smp_store_release(&sem->readers_block, 0); | ||
| 180 | |||
| 181 | /* | ||
| 182 | * Release the write lock, this will allow readers back in the game. | ||
| 183 | */ | ||
| 184 | up_write(&sem->rw_sem); | ||
| 185 | |||
| 186 | /* | ||
| 187 | * Once this completes (at least one RCU-sched grace period hence) the | ||
| 188 | * reader fast path will be available again. Safe to use outside the | ||
| 189 | * exclusive write lock because it's counting. | ||
| 156 | */ | 190 | */ |
| 157 | rcu_sync_exit(&brw->rss); | 191 | rcu_sync_exit(&sem->rss); |
| 158 | } | 192 | } |
| 159 | EXPORT_SYMBOL_GPL(percpu_up_write); | 193 | EXPORT_SYMBOL_GPL(percpu_up_write); |
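The percpu-rwsem rewrite above replaces the fast_read_ctr/slow_read_ctr pair with a single per-CPU read_count plus a readers_block flag; the labelled barriers pair up as A with D (reader entry vs. the writer setting readers_block) and B with C (reader exit vs. the writer summing the counters). For context, a condensed sketch of the header-side fast paths that end up calling __percpu_down_read()/__percpu_up_read() above; the real inlines live in <linux/percpu-rwsem.h> and also carry the lockdep annotations omitted here:

static inline void my_percpu_down_read(struct percpu_rw_semaphore *sem)
{
	might_sleep();
	preempt_disable();
	__this_cpu_inc(*sem->read_count);
	if (unlikely(!rcu_sync_is_idle(&sem->rss)))
		__percpu_down_read(sem, false);	/* full barrier, may sleep */
	preempt_enable();
}

static inline void my_percpu_up_read(struct percpu_rw_semaphore *sem)
{
	preempt_disable();
	if (likely(rcu_sync_is_idle(&sem->rss)))
		__this_cpu_dec(*sem->read_count);
	else
		__percpu_up_read(sem);		/* smp_mb() plus writer wakeup */
	preempt_enable();
}

While the writer holds the rcu_sync state out of idle, every reader takes the slow path, which is exactly what percpu_down_write() relies on.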
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 8a99abf58080..e3b5520005db 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h | |||
| @@ -70,11 +70,14 @@ struct pv_node { | |||
| 70 | static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) | 70 | static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) |
| 71 | { | 71 | { |
| 72 | struct __qspinlock *l = (void *)lock; | 72 | struct __qspinlock *l = (void *)lock; |
| 73 | int ret = !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && | ||
| 74 | (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0); | ||
| 75 | 73 | ||
| 76 | qstat_inc(qstat_pv_lock_stealing, ret); | 74 | if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && |
| 77 | return ret; | 75 | (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) { |
| 76 | qstat_inc(qstat_pv_lock_stealing, true); | ||
| 77 | return true; | ||
| 78 | } | ||
| 79 | |||
| 80 | return false; | ||
| 78 | } | 81 | } |
| 79 | 82 | ||
| 80 | /* | 83 | /* |
| @@ -257,7 +260,6 @@ static struct pv_node *pv_unhash(struct qspinlock *lock) | |||
| 257 | static inline bool | 260 | static inline bool |
| 258 | pv_wait_early(struct pv_node *prev, int loop) | 261 | pv_wait_early(struct pv_node *prev, int loop) |
| 259 | { | 262 | { |
| 260 | |||
| 261 | if ((loop & PV_PREV_CHECK_MASK) != 0) | 263 | if ((loop & PV_PREV_CHECK_MASK) != 0) |
| 262 | return false; | 264 | return false; |
| 263 | 265 | ||
| @@ -286,12 +288,10 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) | |||
| 286 | { | 288 | { |
| 287 | struct pv_node *pn = (struct pv_node *)node; | 289 | struct pv_node *pn = (struct pv_node *)node; |
| 288 | struct pv_node *pp = (struct pv_node *)prev; | 290 | struct pv_node *pp = (struct pv_node *)prev; |
| 289 | int waitcnt = 0; | ||
| 290 | int loop; | 291 | int loop; |
| 291 | bool wait_early; | 292 | bool wait_early; |
| 292 | 293 | ||
| 293 | /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */ | 294 | for (;;) { |
| 294 | for (;; waitcnt++) { | ||
| 295 | for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) { | 295 | for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) { |
| 296 | if (READ_ONCE(node->locked)) | 296 | if (READ_ONCE(node->locked)) |
| 297 | return; | 297 | return; |
| @@ -315,7 +315,6 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) | |||
| 315 | 315 | ||
| 316 | if (!READ_ONCE(node->locked)) { | 316 | if (!READ_ONCE(node->locked)) { |
| 317 | qstat_inc(qstat_pv_wait_node, true); | 317 | qstat_inc(qstat_pv_wait_node, true); |
| 318 | qstat_inc(qstat_pv_wait_again, waitcnt); | ||
| 319 | qstat_inc(qstat_pv_wait_early, wait_early); | 318 | qstat_inc(qstat_pv_wait_early, wait_early); |
| 320 | pv_wait(&pn->state, vcpu_halted); | 319 | pv_wait(&pn->state, vcpu_halted); |
| 321 | } | 320 | } |
| @@ -456,12 +455,9 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) | |||
| 456 | pv_wait(&l->locked, _Q_SLOW_VAL); | 455 | pv_wait(&l->locked, _Q_SLOW_VAL); |
| 457 | 456 | ||
| 458 | /* | 457 | /* |
| 459 | * The unlocker should have freed the lock before kicking the | 458 | * Because of lock stealing, the queue head vCPU may not be |
| 460 | * CPU. So if the lock is still not free, it is a spurious | 459 | * able to acquire the lock before it has to wait again. |
| 461 | * wakeup or another vCPU has stolen the lock. The current | ||
| 462 | * vCPU should spin again. | ||
| 463 | */ | 460 | */ |
| 464 | qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked)); | ||
| 465 | } | 461 | } |
| 466 | 462 | ||
| 467 | /* | 463 | /* |
| @@ -544,7 +540,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) | |||
| 544 | * unhash. Otherwise it would be possible to have multiple @lock | 540 | * unhash. Otherwise it would be possible to have multiple @lock |
| 545 | * entries, which would be BAD. | 541 | * entries, which would be BAD. |
| 546 | */ | 542 | */ |
| 547 | locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); | 543 | locked = cmpxchg_release(&l->locked, _Q_LOCKED_VAL, 0); |
| 548 | if (likely(locked == _Q_LOCKED_VAL)) | 544 | if (likely(locked == _Q_LOCKED_VAL)) |
| 549 | return; | 545 | return; |
| 550 | 546 | ||
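Two small qspinlock_paravirt changes: lock stealing is only counted when the steal actually succeeds, and the unlock-side cmpxchg() is relaxed to cmpxchg_release(), since releasing the lock only has to order the critical section before the store that clears ->locked (the acquire half comes from the next locker). The steal path itself is the usual read-then-cmpxchg trylock shape; a generic illustration, not tied to the qspinlock internals:

/* Illustrative trylock: cheap read first, a single cmpxchg only when the
 * lock looks free, so contended CPUs do not keep dirtying the cache line. */
static inline bool try_grab(atomic_t *lock_word)
{
	return atomic_read(lock_word) == 0 &&
	       atomic_cmpxchg_acquire(lock_word, 0, 1) == 0;
}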
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index b9d031516254..eb0a599fcf58 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h | |||
| @@ -24,8 +24,8 @@ | |||
| 24 | * pv_latency_wake - average latency (ns) from vCPU kick to wakeup | 24 | * pv_latency_wake - average latency (ns) from vCPU kick to wakeup |
| 25 | * pv_lock_slowpath - # of locking operations via the slowpath | 25 | * pv_lock_slowpath - # of locking operations via the slowpath |
| 26 | * pv_lock_stealing - # of lock stealing operations | 26 | * pv_lock_stealing - # of lock stealing operations |
| 27 | * pv_spurious_wakeup - # of spurious wakeups | 27 | * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs |
| 28 | * pv_wait_again - # of vCPU wait's that happened after a vCPU kick | 28 | * pv_wait_again - # of wait's after a queue head vCPU kick |
| 29 | * pv_wait_early - # of early vCPU wait's | 29 | * pv_wait_early - # of early vCPU wait's |
| 30 | * pv_wait_head - # of vCPU wait's at the queue head | 30 | * pv_wait_head - # of vCPU wait's at the queue head |
| 31 | * pv_wait_node - # of vCPU wait's at a non-head queue node | 31 | * pv_wait_node - # of vCPU wait's at a non-head queue node |
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 447e08de1fab..2337b4bb2366 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c | |||
| @@ -121,16 +121,19 @@ enum rwsem_wake_type { | |||
| 121 | * - woken process blocks are discarded from the list after having task zeroed | 121 | * - woken process blocks are discarded from the list after having task zeroed |
| 122 | * - writers are only marked woken if downgrading is false | 122 | * - writers are only marked woken if downgrading is false |
| 123 | */ | 123 | */ |
| 124 | static struct rw_semaphore * | 124 | static void __rwsem_mark_wake(struct rw_semaphore *sem, |
| 125 | __rwsem_mark_wake(struct rw_semaphore *sem, | 125 | enum rwsem_wake_type wake_type, |
| 126 | enum rwsem_wake_type wake_type, struct wake_q_head *wake_q) | 126 | struct wake_q_head *wake_q) |
| 127 | { | 127 | { |
| 128 | struct rwsem_waiter *waiter; | 128 | struct rwsem_waiter *waiter, *tmp; |
| 129 | struct task_struct *tsk; | 129 | long oldcount, woken = 0, adjustment = 0; |
| 130 | struct list_head *next; | 130 | |
| 131 | long oldcount, woken, loop, adjustment; | 131 | /* |
| 132 | * Take a peek at the queue head waiter such that we can determine | ||
| 133 | * the wakeup(s) to perform. | ||
| 134 | */ | ||
| 135 | waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list); | ||
| 132 | 136 | ||
| 133 | waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); | ||
| 134 | if (waiter->type == RWSEM_WAITING_FOR_WRITE) { | 137 | if (waiter->type == RWSEM_WAITING_FOR_WRITE) { |
| 135 | if (wake_type == RWSEM_WAKE_ANY) { | 138 | if (wake_type == RWSEM_WAKE_ANY) { |
| 136 | /* | 139 | /* |
| @@ -142,19 +145,19 @@ __rwsem_mark_wake(struct rw_semaphore *sem, | |||
| 142 | */ | 145 | */ |
| 143 | wake_q_add(wake_q, waiter->task); | 146 | wake_q_add(wake_q, waiter->task); |
| 144 | } | 147 | } |
| 145 | goto out; | 148 | |
| 149 | return; | ||
| 146 | } | 150 | } |
| 147 | 151 | ||
| 148 | /* Writers might steal the lock before we grant it to the next reader. | 152 | /* |
| 153 | * Writers might steal the lock before we grant it to the next reader. | ||
| 149 | * We prefer to do the first reader grant before counting readers | 154 | * We prefer to do the first reader grant before counting readers |
| 150 | * so we can bail out early if a writer stole the lock. | 155 | * so we can bail out early if a writer stole the lock. |
| 151 | */ | 156 | */ |
| 152 | adjustment = 0; | ||
| 153 | if (wake_type != RWSEM_WAKE_READ_OWNED) { | 157 | if (wake_type != RWSEM_WAKE_READ_OWNED) { |
| 154 | adjustment = RWSEM_ACTIVE_READ_BIAS; | 158 | adjustment = RWSEM_ACTIVE_READ_BIAS; |
| 155 | try_reader_grant: | 159 | try_reader_grant: |
| 156 | oldcount = atomic_long_fetch_add(adjustment, &sem->count); | 160 | oldcount = atomic_long_fetch_add(adjustment, &sem->count); |
| 157 | |||
| 158 | if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { | 161 | if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { |
| 159 | /* | 162 | /* |
| 160 | * If the count is still less than RWSEM_WAITING_BIAS | 163 | * If the count is still less than RWSEM_WAITING_BIAS |
| @@ -164,7 +167,8 @@ __rwsem_mark_wake(struct rw_semaphore *sem, | |||
| 164 | */ | 167 | */ |
| 165 | if (atomic_long_add_return(-adjustment, &sem->count) < | 168 | if (atomic_long_add_return(-adjustment, &sem->count) < |
| 166 | RWSEM_WAITING_BIAS) | 169 | RWSEM_WAITING_BIAS) |
| 167 | goto out; | 170 | return; |
| 171 | |||
| 168 | /* Last active locker left. Retry waking readers. */ | 172 | /* Last active locker left. Retry waking readers. */ |
| 169 | goto try_reader_grant; | 173 | goto try_reader_grant; |
| 170 | } | 174 | } |
| @@ -176,38 +180,23 @@ __rwsem_mark_wake(struct rw_semaphore *sem, | |||
| 176 | rwsem_set_reader_owned(sem); | 180 | rwsem_set_reader_owned(sem); |
| 177 | } | 181 | } |
| 178 | 182 | ||
| 179 | /* Grant an infinite number of read locks to the readers at the front | 183 | /* |
| 180 | * of the queue. Note we increment the 'active part' of the count by | 184 | * Grant an infinite number of read locks to the readers at the front |
| 181 | * the number of readers before waking any processes up. | 185 | * of the queue. We know that woken will be at least 1 as we accounted |
| 186 | * for above. Note we increment the 'active part' of the count by the | ||
| 187 | * number of readers before waking any processes up. | ||
| 182 | */ | 188 | */ |
| 183 | woken = 0; | 189 | list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { |
| 184 | do { | 190 | struct task_struct *tsk; |
| 185 | woken++; | ||
| 186 | 191 | ||
| 187 | if (waiter->list.next == &sem->wait_list) | 192 | if (waiter->type == RWSEM_WAITING_FOR_WRITE) |
| 188 | break; | 193 | break; |
| 189 | 194 | ||
| 190 | waiter = list_entry(waiter->list.next, | 195 | woken++; |
| 191 | struct rwsem_waiter, list); | ||
| 192 | |||
| 193 | } while (waiter->type != RWSEM_WAITING_FOR_WRITE); | ||
| 194 | |||
| 195 | adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; | ||
| 196 | if (waiter->type != RWSEM_WAITING_FOR_WRITE) | ||
| 197 | /* hit end of list above */ | ||
| 198 | adjustment -= RWSEM_WAITING_BIAS; | ||
| 199 | |||
| 200 | if (adjustment) | ||
| 201 | atomic_long_add(adjustment, &sem->count); | ||
| 202 | |||
| 203 | next = sem->wait_list.next; | ||
| 204 | loop = woken; | ||
| 205 | do { | ||
| 206 | waiter = list_entry(next, struct rwsem_waiter, list); | ||
| 207 | next = waiter->list.next; | ||
| 208 | tsk = waiter->task; | 196 | tsk = waiter->task; |
| 209 | 197 | ||
| 210 | wake_q_add(wake_q, tsk); | 198 | wake_q_add(wake_q, tsk); |
| 199 | list_del(&waiter->list); | ||
| 211 | /* | 200 | /* |
| 212 | * Ensure that the last operation is setting the reader | 201 | * Ensure that the last operation is setting the reader |
| 213 | * waiter to nil such that rwsem_down_read_failed() cannot | 202 | * waiter to nil such that rwsem_down_read_failed() cannot |
| @@ -215,13 +204,16 @@ __rwsem_mark_wake(struct rw_semaphore *sem, | |||
| 215 | * to the task to wakeup. | 204 | * to the task to wakeup. |
| 216 | */ | 205 | */ |
| 217 | smp_store_release(&waiter->task, NULL); | 206 | smp_store_release(&waiter->task, NULL); |
| 218 | } while (--loop); | 207 | } |
| 219 | 208 | ||
| 220 | sem->wait_list.next = next; | 209 | adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; |
| 221 | next->prev = &sem->wait_list; | 210 | if (list_empty(&sem->wait_list)) { |
| 211 | /* hit end of list above */ | ||
| 212 | adjustment -= RWSEM_WAITING_BIAS; | ||
| 213 | } | ||
| 222 | 214 | ||
| 223 | out: | 215 | if (adjustment) |
| 224 | return sem; | 216 | atomic_long_add(adjustment, &sem->count); |
| 225 | } | 217 | } |
| 226 | 218 | ||
| 227 | /* | 219 | /* |
| @@ -235,7 +227,6 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) | |||
| 235 | struct task_struct *tsk = current; | 227 | struct task_struct *tsk = current; |
| 236 | WAKE_Q(wake_q); | 228 | WAKE_Q(wake_q); |
| 237 | 229 | ||
| 238 | /* set up my own style of waitqueue */ | ||
| 239 | waiter.task = tsk; | 230 | waiter.task = tsk; |
| 240 | waiter.type = RWSEM_WAITING_FOR_READ; | 231 | waiter.type = RWSEM_WAITING_FOR_READ; |
| 241 | 232 | ||
| @@ -247,7 +238,8 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) | |||
| 247 | /* we're now waiting on the lock, but no longer actively locking */ | 238 | /* we're now waiting on the lock, but no longer actively locking */ |
| 248 | count = atomic_long_add_return(adjustment, &sem->count); | 239 | count = atomic_long_add_return(adjustment, &sem->count); |
| 249 | 240 | ||
| 250 | /* If there are no active locks, wake the front queued process(es). | 241 | /* |
| 242 | * If there are no active locks, wake the front queued process(es). | ||
| 251 | * | 243 | * |
| 252 | * If there are no writers and we are first in the queue, | 244 | * If there are no writers and we are first in the queue, |
| 253 | * wake our own waiter to join the existing active readers ! | 245 | * wake our own waiter to join the existing active readers ! |
| @@ -255,7 +247,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) | |||
| 255 | if (count == RWSEM_WAITING_BIAS || | 247 | if (count == RWSEM_WAITING_BIAS || |
| 256 | (count > RWSEM_WAITING_BIAS && | 248 | (count > RWSEM_WAITING_BIAS && |
| 257 | adjustment != -RWSEM_ACTIVE_READ_BIAS)) | 249 | adjustment != -RWSEM_ACTIVE_READ_BIAS)) |
| 258 | sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); | 250 | __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); |
| 259 | 251 | ||
| 260 | raw_spin_unlock_irq(&sem->wait_lock); | 252 | raw_spin_unlock_irq(&sem->wait_lock); |
| 261 | wake_up_q(&wake_q); | 253 | wake_up_q(&wake_q); |
| @@ -505,7 +497,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) | |||
| 505 | if (count > RWSEM_WAITING_BIAS) { | 497 | if (count > RWSEM_WAITING_BIAS) { |
| 506 | WAKE_Q(wake_q); | 498 | WAKE_Q(wake_q); |
| 507 | 499 | ||
| 508 | sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); | 500 | __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); |
| 509 | /* | 501 | /* |
| 510 | * The wakeup is normally called _after_ the wait_lock | 502 | * The wakeup is normally called _after_ the wait_lock |
| 511 | * is released, but given that we are proactively waking | 503 | * is released, but given that we are proactively waking |
| @@ -614,9 +606,8 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) | |||
| 614 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | 606 | raw_spin_lock_irqsave(&sem->wait_lock, flags); |
| 615 | locked: | 607 | locked: |
| 616 | 608 | ||
| 617 | /* do nothing if list empty */ | ||
| 618 | if (!list_empty(&sem->wait_list)) | 609 | if (!list_empty(&sem->wait_list)) |
| 619 | sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); | 610 | __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); |
| 620 | 611 | ||
| 621 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | 612 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); |
| 622 | wake_up_q(&wake_q); | 613 | wake_up_q(&wake_q); |
| @@ -638,9 +629,8 @@ struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) | |||
| 638 | 629 | ||
| 639 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | 630 | raw_spin_lock_irqsave(&sem->wait_lock, flags); |
| 640 | 631 | ||
| 641 | /* do nothing if list empty */ | ||
| 642 | if (!list_empty(&sem->wait_list)) | 632 | if (!list_empty(&sem->wait_list)) |
| 643 | sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); | 633 | __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); |
| 644 | 634 | ||
| 645 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | 635 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); |
| 646 | wake_up_q(&wake_q); | 636 | wake_up_q(&wake_q); |
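__rwsem_mark_wake() now walks the wait list with list_for_each_entry_safe(), deletes each woken reader as it goes, applies the count adjustment once after the walk, and no longer returns the semaphore. Callers keep the wake_q pattern: wakeups are queued while the wait_lock is held and issued only after it is dropped. A generic illustration of that pattern, with hypothetical my_* names:

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

struct my_waiter {
	struct list_head list;
	struct task_struct *task;
};

static void my_wake_all(raw_spinlock_t *lock, struct list_head *waiters)
{
	struct my_waiter *w, *tmp;
	WAKE_Q(wake_q);

	raw_spin_lock(lock);
	list_for_each_entry_safe(w, tmp, waiters, list) {
		wake_q_add(&wake_q, w->task);
		list_del(&w->list);
	}
	raw_spin_unlock(lock);

	wake_up_q(&wake_q);	/* the actual wakeups happen outside the lock */
}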
diff --git a/kernel/memremap.c b/kernel/memremap.c index 251d16b4cb41..b501e390bb34 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c | |||
| @@ -247,6 +247,7 @@ static void devm_memremap_pages_release(struct device *dev, void *data) | |||
| 247 | align_start = res->start & ~(SECTION_SIZE - 1); | 247 | align_start = res->start & ~(SECTION_SIZE - 1); |
| 248 | align_size = ALIGN(resource_size(res), SECTION_SIZE); | 248 | align_size = ALIGN(resource_size(res), SECTION_SIZE); |
| 249 | arch_remove_memory(align_start, align_size); | 249 | arch_remove_memory(align_start, align_size); |
| 250 | untrack_pfn(NULL, PHYS_PFN(align_start), align_size); | ||
| 250 | pgmap_radix_release(res); | 251 | pgmap_radix_release(res); |
| 251 | dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, | 252 | dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, |
| 252 | "%s: failed to free all reserved pages\n", __func__); | 253 | "%s: failed to free all reserved pages\n", __func__); |
| @@ -282,6 +283,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, | |||
| 282 | struct percpu_ref *ref, struct vmem_altmap *altmap) | 283 | struct percpu_ref *ref, struct vmem_altmap *altmap) |
| 283 | { | 284 | { |
| 284 | resource_size_t key, align_start, align_size, align_end; | 285 | resource_size_t key, align_start, align_size, align_end; |
| 286 | pgprot_t pgprot = PAGE_KERNEL; | ||
| 285 | struct dev_pagemap *pgmap; | 287 | struct dev_pagemap *pgmap; |
| 286 | struct page_map *page_map; | 288 | struct page_map *page_map; |
| 287 | int error, nid, is_ram; | 289 | int error, nid, is_ram; |
| @@ -351,6 +353,11 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, | |||
| 351 | if (nid < 0) | 353 | if (nid < 0) |
| 352 | nid = numa_mem_id(); | 354 | nid = numa_mem_id(); |
| 353 | 355 | ||
| 356 | error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(align_start), 0, | ||
| 357 | align_size); | ||
| 358 | if (error) | ||
| 359 | goto err_pfn_remap; | ||
| 360 | |||
| 354 | error = arch_add_memory(nid, align_start, align_size, true); | 361 | error = arch_add_memory(nid, align_start, align_size, true); |
| 355 | if (error) | 362 | if (error) |
| 356 | goto err_add_memory; | 363 | goto err_add_memory; |
| @@ -371,6 +378,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, | |||
| 371 | return __va(res->start); | 378 | return __va(res->start); |
| 372 | 379 | ||
| 373 | err_add_memory: | 380 | err_add_memory: |
| 381 | untrack_pfn(NULL, PHYS_PFN(align_start), align_size); | ||
| 382 | err_pfn_remap: | ||
| 374 | err_radix: | 383 | err_radix: |
| 375 | pgmap_radix_release(res); | 384 | pgmap_radix_release(res); |
| 376 | devres_free(page_map); | 385 | devres_free(page_map); |
diff --git a/kernel/module.c b/kernel/module.c index 529efae9f481..f57dd63186e6 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -1149,6 +1149,8 @@ static size_t module_flags_taint(struct module *mod, char *buf) | |||
| 1149 | buf[l++] = 'C'; | 1149 | buf[l++] = 'C'; |
| 1150 | if (mod->taints & (1 << TAINT_UNSIGNED_MODULE)) | 1150 | if (mod->taints & (1 << TAINT_UNSIGNED_MODULE)) |
| 1151 | buf[l++] = 'E'; | 1151 | buf[l++] = 'E'; |
| 1152 | if (mod->taints & (1 << TAINT_LIVEPATCH)) | ||
| 1153 | buf[l++] = 'K'; | ||
| 1152 | /* | 1154 | /* |
| 1153 | * TAINT_FORCED_RMMOD: could be added. | 1155 | * TAINT_FORCED_RMMOD: could be added. |
| 1154 | * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't | 1156 | * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't |
| @@ -2792,14 +2794,17 @@ static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned l | |||
| 2792 | } | 2794 | } |
| 2793 | 2795 | ||
| 2794 | #ifdef CONFIG_LIVEPATCH | 2796 | #ifdef CONFIG_LIVEPATCH |
| 2795 | static int find_livepatch_modinfo(struct module *mod, struct load_info *info) | 2797 | static int check_modinfo_livepatch(struct module *mod, struct load_info *info) |
| 2796 | { | 2798 | { |
| 2797 | mod->klp = get_modinfo(info, "livepatch") ? true : false; | 2799 | if (get_modinfo(info, "livepatch")) { |
| 2800 | mod->klp = true; | ||
| 2801 | add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK); | ||
| 2802 | } | ||
| 2798 | 2803 | ||
| 2799 | return 0; | 2804 | return 0; |
| 2800 | } | 2805 | } |
| 2801 | #else /* !CONFIG_LIVEPATCH */ | 2806 | #else /* !CONFIG_LIVEPATCH */ |
| 2802 | static int find_livepatch_modinfo(struct module *mod, struct load_info *info) | 2807 | static int check_modinfo_livepatch(struct module *mod, struct load_info *info) |
| 2803 | { | 2808 | { |
| 2804 | if (get_modinfo(info, "livepatch")) { | 2809 | if (get_modinfo(info, "livepatch")) { |
| 2805 | pr_err("%s: module is marked as livepatch module, but livepatch support is disabled", | 2810 | pr_err("%s: module is marked as livepatch module, but livepatch support is disabled", |
| @@ -2969,7 +2974,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) | |||
| 2969 | "is unknown, you have been warned.\n", mod->name); | 2974 | "is unknown, you have been warned.\n", mod->name); |
| 2970 | } | 2975 | } |
| 2971 | 2976 | ||
| 2972 | err = find_livepatch_modinfo(mod, info); | 2977 | err = check_modinfo_livepatch(mod, info); |
| 2973 | if (err) | 2978 | if (err) |
| 2974 | return err; | 2979 | return err; |
| 2975 | 2980 | ||
diff --git a/kernel/padata.c b/kernel/padata.c index 993278895ccc..7848f0566403 100644 --- a/kernel/padata.c +++ b/kernel/padata.c | |||
| @@ -30,6 +30,7 @@ | |||
| 30 | #include <linux/slab.h> | 30 | #include <linux/slab.h> |
| 31 | #include <linux/sysfs.h> | 31 | #include <linux/sysfs.h> |
| 32 | #include <linux/rcupdate.h> | 32 | #include <linux/rcupdate.h> |
| 33 | #include <linux/module.h> | ||
| 33 | 34 | ||
| 34 | #define MAX_OBJ_NUM 1000 | 35 | #define MAX_OBJ_NUM 1000 |
| 35 | 36 | ||
| @@ -769,52 +770,43 @@ static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu) | |||
| 769 | cpumask_test_cpu(cpu, pinst->cpumask.cbcpu); | 770 | cpumask_test_cpu(cpu, pinst->cpumask.cbcpu); |
| 770 | } | 771 | } |
| 771 | 772 | ||
| 772 | 773 | static int padata_cpu_online(unsigned int cpu, struct hlist_node *node) | |
| 773 | static int padata_cpu_callback(struct notifier_block *nfb, | ||
| 774 | unsigned long action, void *hcpu) | ||
| 775 | { | 774 | { |
| 776 | int err; | ||
| 777 | struct padata_instance *pinst; | 775 | struct padata_instance *pinst; |
| 778 | int cpu = (unsigned long)hcpu; | 776 | int ret; |
| 779 | 777 | ||
| 780 | pinst = container_of(nfb, struct padata_instance, cpu_notifier); | 778 | pinst = hlist_entry_safe(node, struct padata_instance, node); |
| 779 | if (!pinst_has_cpu(pinst, cpu)) | ||
| 780 | return 0; | ||
| 781 | 781 | ||
| 782 | switch (action) { | 782 | mutex_lock(&pinst->lock); |
| 783 | case CPU_ONLINE: | 783 | ret = __padata_add_cpu(pinst, cpu); |
| 784 | case CPU_ONLINE_FROZEN: | 784 | mutex_unlock(&pinst->lock); |
| 785 | case CPU_DOWN_FAILED: | 785 | return ret; |
| 786 | case CPU_DOWN_FAILED_FROZEN: | 786 | } |
| 787 | if (!pinst_has_cpu(pinst, cpu)) | ||
| 788 | break; | ||
| 789 | mutex_lock(&pinst->lock); | ||
| 790 | err = __padata_add_cpu(pinst, cpu); | ||
| 791 | mutex_unlock(&pinst->lock); | ||
| 792 | if (err) | ||
| 793 | return notifier_from_errno(err); | ||
| 794 | break; | ||
| 795 | 787 | ||
| 796 | case CPU_DOWN_PREPARE: | 788 | static int padata_cpu_prep_down(unsigned int cpu, struct hlist_node *node) |
| 797 | case CPU_DOWN_PREPARE_FROZEN: | 789 | { |
| 798 | case CPU_UP_CANCELED: | 790 | struct padata_instance *pinst; |
| 799 | case CPU_UP_CANCELED_FROZEN: | 791 | int ret; |
| 800 | if (!pinst_has_cpu(pinst, cpu)) | 792 | |
| 801 | break; | 793 | pinst = hlist_entry_safe(node, struct padata_instance, node); |
| 802 | mutex_lock(&pinst->lock); | 794 | if (!pinst_has_cpu(pinst, cpu)) |
| 803 | err = __padata_remove_cpu(pinst, cpu); | 795 | return 0; |
| 804 | mutex_unlock(&pinst->lock); | ||
| 805 | if (err) | ||
| 806 | return notifier_from_errno(err); | ||
| 807 | break; | ||
| 808 | } | ||
| 809 | 796 | ||
| 810 | return NOTIFY_OK; | 797 | mutex_lock(&pinst->lock); |
| 798 | ret = __padata_remove_cpu(pinst, cpu); | ||
| 799 | mutex_unlock(&pinst->lock); | ||
| 800 | return ret; | ||
| 811 | } | 801 | } |
| 802 | |||
| 803 | static enum cpuhp_state hp_online; | ||
| 812 | #endif | 804 | #endif |
| 813 | 805 | ||
| 814 | static void __padata_free(struct padata_instance *pinst) | 806 | static void __padata_free(struct padata_instance *pinst) |
| 815 | { | 807 | { |
| 816 | #ifdef CONFIG_HOTPLUG_CPU | 808 | #ifdef CONFIG_HOTPLUG_CPU |
| 817 | unregister_hotcpu_notifier(&pinst->cpu_notifier); | 809 | cpuhp_state_remove_instance_nocalls(hp_online, &pinst->node); |
| 818 | #endif | 810 | #endif |
| 819 | 811 | ||
| 820 | padata_stop(pinst); | 812 | padata_stop(pinst); |
| @@ -1012,11 +1004,8 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq, | |||
| 1012 | mutex_init(&pinst->lock); | 1004 | mutex_init(&pinst->lock); |
| 1013 | 1005 | ||
| 1014 | #ifdef CONFIG_HOTPLUG_CPU | 1006 | #ifdef CONFIG_HOTPLUG_CPU |
| 1015 | pinst->cpu_notifier.notifier_call = padata_cpu_callback; | 1007 | cpuhp_state_add_instance_nocalls(hp_online, &pinst->node); |
| 1016 | pinst->cpu_notifier.priority = 0; | ||
| 1017 | register_hotcpu_notifier(&pinst->cpu_notifier); | ||
| 1018 | #endif | 1008 | #endif |
| 1019 | |||
| 1020 | return pinst; | 1009 | return pinst; |
| 1021 | 1010 | ||
| 1022 | err_free_masks: | 1011 | err_free_masks: |
| @@ -1039,3 +1028,26 @@ void padata_free(struct padata_instance *pinst) | |||
| 1039 | kobject_put(&pinst->kobj); | 1028 | kobject_put(&pinst->kobj); |
| 1040 | } | 1029 | } |
| 1041 | EXPORT_SYMBOL(padata_free); | 1030 | EXPORT_SYMBOL(padata_free); |
| 1031 | |||
| 1032 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 1033 | |||
| 1034 | static __init int padata_driver_init(void) | ||
| 1035 | { | ||
| 1036 | int ret; | ||
| 1037 | |||
| 1038 | ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online", | ||
| 1039 | padata_cpu_online, | ||
| 1040 | padata_cpu_prep_down); | ||
| 1041 | if (ret < 0) | ||
| 1042 | return ret; | ||
| 1043 | hp_online = ret; | ||
| 1044 | return 0; | ||
| 1045 | } | ||
| 1046 | module_init(padata_driver_init); | ||
| 1047 | |||
| 1048 | static __exit void padata_driver_exit(void) | ||
| 1049 | { | ||
| 1050 | cpuhp_remove_multi_state(hp_online); | ||
| 1051 | } | ||
| 1052 | module_exit(padata_driver_exit); | ||
| 1053 | #endif | ||
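kernel/padata.c drops its CPU notifier in favour of the hotplug state machine's multi-instance API: one dynamic state is set up at init, and every padata_instance hooks its hlist_node into that state with cpuhp_state_add_instance_nocalls(). The generic shape of such a conversion, using hypothetical mydrv names:

#include <linux/cpuhotplug.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/list.h>

static enum cpuhp_state mydrv_hp_online;

struct mydrv_instance {
	struct hlist_node node;		/* linked into the cpuhp state */
	/* per-instance data ... */
};

static int mydrv_cpu_online(unsigned int cpu, struct hlist_node *node)
{
	struct mydrv_instance *inst =
		hlist_entry_safe(node, struct mydrv_instance, node);

	if (!inst)
		return -EINVAL;
	/* bring @cpu into service for this instance */
	return 0;
}

static int mydrv_cpu_prep_down(unsigned int cpu, struct hlist_node *node)
{
	/* quiesce @cpu for this instance before it goes down */
	return 0;
}

static int __init mydrv_init(void)
{
	int ret;

	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "mydrv:online",
				      mydrv_cpu_online, mydrv_cpu_prep_down);
	if (ret < 0)
		return ret;
	mydrv_hp_online = ret;	/* dynamically allocated state */
	return 0;
}

/* per instance, typically in the allocation and teardown paths:
 *	cpuhp_state_add_instance_nocalls(mydrv_hp_online, &inst->node);
 *	cpuhp_state_remove_instance_nocalls(mydrv_hp_online, &inst->node);
 */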
diff --git a/kernel/panic.c b/kernel/panic.c index ca8cea1ef673..e6480e20379e 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
| @@ -71,6 +71,32 @@ void __weak nmi_panic_self_stop(struct pt_regs *regs) | |||
| 71 | panic_smp_self_stop(); | 71 | panic_smp_self_stop(); |
| 72 | } | 72 | } |
| 73 | 73 | ||
| 74 | /* | ||
| 75 | * Stop other CPUs in panic. Architecture dependent code may override this | ||
| 76 | * with more suitable version. For example, if the architecture supports | ||
| 77 | * crash dump, it should save registers of each stopped CPU and disable | ||
| 78 | * per-CPU features such as virtualization extensions. | ||
| 79 | */ | ||
| 80 | void __weak crash_smp_send_stop(void) | ||
| 81 | { | ||
| 82 | static int cpus_stopped; | ||
| 83 | |||
| 84 | /* | ||
| 85 | * This function can be called twice in the panic path, but obviously | ||
| 86 | * we execute this only once. | ||
| 87 | */ | ||
| 88 | if (cpus_stopped) | ||
| 89 | return; | ||
| 90 | |||
| 91 | /* | ||
| 92 | * Note smp_send_stop is the usual smp shutdown function, which | ||
| 93 | * unfortunately means it may not be hardened to work in a panic | ||
| 94 | * situation. | ||
| 95 | */ | ||
| 96 | smp_send_stop(); | ||
| 97 | cpus_stopped = 1; | ||
| 98 | } | ||
| 99 | |||
| 74 | atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); | 100 | atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); |
| 75 | 101 | ||
| 76 | /* | 102 | /* |
| @@ -164,14 +190,21 @@ void panic(const char *fmt, ...) | |||
| 164 | if (!_crash_kexec_post_notifiers) { | 190 | if (!_crash_kexec_post_notifiers) { |
| 165 | printk_nmi_flush_on_panic(); | 191 | printk_nmi_flush_on_panic(); |
| 166 | __crash_kexec(NULL); | 192 | __crash_kexec(NULL); |
| 167 | } | ||
| 168 | 193 | ||
| 169 | /* | 194 | /* |
| 170 | * Note smp_send_stop is the usual smp shutdown function, which | 195 | * Note smp_send_stop is the usual smp shutdown function, which |
| 171 | * unfortunately means it may not be hardened to work in a panic | 196 | * unfortunately means it may not be hardened to work in a |
| 172 | * situation. | 197 | * panic situation. |
| 173 | */ | 198 | */ |
| 174 | smp_send_stop(); | 199 | smp_send_stop(); |
| 200 | } else { | ||
| 201 | /* | ||
| 202 | * If we want to do crash dump after notifier calls and | ||
| 203 | * kmsg_dump, we will need architecture dependent extra | ||
| 204 | * works in addition to stopping other CPUs. | ||
| 205 | */ | ||
| 206 | crash_smp_send_stop(); | ||
| 207 | } | ||
| 175 | 208 | ||
| 176 | /* | 209 | /* |
| 177 | * Run any panic handlers, including those that might need to | 210 | * Run any panic handlers, including those that might need to |
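The new __weak crash_smp_send_stop() gives the crash_kexec_post_notifiers path a dump-friendly way to park the other CPUs instead of the plain smp_send_stop(). A hypothetical arch-side override might look like the following; smp_send_crash_ipi() is an invented placeholder for whatever mechanism the port uses to make each CPU save its registers and halt:

void crash_smp_send_stop(void)
{
	static int cpus_stopped;

	/* may be reached twice on the panic path; only act once */
	if (cpus_stopped)
		return;

	smp_send_crash_ipi();	/* hypothetical arch helper */
	cpus_stopped = 1;
}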
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a65ba137fd15..df9e8e9e0be7 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
| @@ -79,23 +79,36 @@ static void proc_cleanup_work(struct work_struct *work) | |||
| 79 | /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ | 79 | /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ |
| 80 | #define MAX_PID_NS_LEVEL 32 | 80 | #define MAX_PID_NS_LEVEL 32 |
| 81 | 81 | ||
| 82 | static struct ucounts *inc_pid_namespaces(struct user_namespace *ns) | ||
| 83 | { | ||
| 84 | return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES); | ||
| 85 | } | ||
| 86 | |||
| 87 | static void dec_pid_namespaces(struct ucounts *ucounts) | ||
| 88 | { | ||
| 89 | dec_ucount(ucounts, UCOUNT_PID_NAMESPACES); | ||
| 90 | } | ||
| 91 | |||
| 82 | static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, | 92 | static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, |
| 83 | struct pid_namespace *parent_pid_ns) | 93 | struct pid_namespace *parent_pid_ns) |
| 84 | { | 94 | { |
| 85 | struct pid_namespace *ns; | 95 | struct pid_namespace *ns; |
| 86 | unsigned int level = parent_pid_ns->level + 1; | 96 | unsigned int level = parent_pid_ns->level + 1; |
| 97 | struct ucounts *ucounts; | ||
| 87 | int i; | 98 | int i; |
| 88 | int err; | 99 | int err; |
| 89 | 100 | ||
| 90 | if (level > MAX_PID_NS_LEVEL) { | 101 | err = -ENOSPC; |
| 91 | err = -EINVAL; | 102 | if (level > MAX_PID_NS_LEVEL) |
| 103 | goto out; | ||
| 104 | ucounts = inc_pid_namespaces(user_ns); | ||
| 105 | if (!ucounts) | ||
| 92 | goto out; | 106 | goto out; |
| 93 | } | ||
| 94 | 107 | ||
| 95 | err = -ENOMEM; | 108 | err = -ENOMEM; |
| 96 | ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); | 109 | ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); |
| 97 | if (ns == NULL) | 110 | if (ns == NULL) |
| 98 | goto out; | 111 | goto out_dec; |
| 99 | 112 | ||
| 100 | ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); | 113 | ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); |
| 101 | if (!ns->pidmap[0].page) | 114 | if (!ns->pidmap[0].page) |
| @@ -114,6 +127,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns | |||
| 114 | ns->level = level; | 127 | ns->level = level; |
| 115 | ns->parent = get_pid_ns(parent_pid_ns); | 128 | ns->parent = get_pid_ns(parent_pid_ns); |
| 116 | ns->user_ns = get_user_ns(user_ns); | 129 | ns->user_ns = get_user_ns(user_ns); |
| 130 | ns->ucounts = ucounts; | ||
| 117 | ns->nr_hashed = PIDNS_HASH_ADDING; | 131 | ns->nr_hashed = PIDNS_HASH_ADDING; |
| 118 | INIT_WORK(&ns->proc_work, proc_cleanup_work); | 132 | INIT_WORK(&ns->proc_work, proc_cleanup_work); |
| 119 | 133 | ||
| @@ -129,6 +143,8 @@ out_free_map: | |||
| 129 | kfree(ns->pidmap[0].page); | 143 | kfree(ns->pidmap[0].page); |
| 130 | out_free: | 144 | out_free: |
| 131 | kmem_cache_free(pid_ns_cachep, ns); | 145 | kmem_cache_free(pid_ns_cachep, ns); |
| 146 | out_dec: | ||
| 147 | dec_pid_namespaces(ucounts); | ||
| 132 | out: | 148 | out: |
| 133 | return ERR_PTR(err); | 149 | return ERR_PTR(err); |
| 134 | } | 150 | } |
| @@ -146,6 +162,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns) | |||
| 146 | ns_free_inum(&ns->ns); | 162 | ns_free_inum(&ns->ns); |
| 147 | for (i = 0; i < PIDMAP_ENTRIES; i++) | 163 | for (i = 0; i < PIDMAP_ENTRIES; i++) |
| 148 | kfree(ns->pidmap[i].page); | 164 | kfree(ns->pidmap[i].page); |
| 165 | dec_pid_namespaces(ns->ucounts); | ||
| 149 | put_user_ns(ns->user_ns); | 166 | put_user_ns(ns->user_ns); |
| 150 | call_rcu(&ns->rcu, delayed_free_pidns); | 167 | call_rcu(&ns->rcu, delayed_free_pidns); |
| 151 | } | 168 | } |
| @@ -388,12 +405,37 @@ static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) | |||
| 388 | return 0; | 405 | return 0; |
| 389 | } | 406 | } |
| 390 | 407 | ||
| 408 | static struct ns_common *pidns_get_parent(struct ns_common *ns) | ||
| 409 | { | ||
| 410 | struct pid_namespace *active = task_active_pid_ns(current); | ||
| 411 | struct pid_namespace *pid_ns, *p; | ||
| 412 | |||
| 413 | /* See if the parent is in the current namespace */ | ||
| 414 | pid_ns = p = to_pid_ns(ns)->parent; | ||
| 415 | for (;;) { | ||
| 416 | if (!p) | ||
| 417 | return ERR_PTR(-EPERM); | ||
| 418 | if (p == active) | ||
| 419 | break; | ||
| 420 | p = p->parent; | ||
| 421 | } | ||
| 422 | |||
| 423 | return &get_pid_ns(pid_ns)->ns; | ||
| 424 | } | ||
| 425 | |||
| 426 | static struct user_namespace *pidns_owner(struct ns_common *ns) | ||
| 427 | { | ||
| 428 | return to_pid_ns(ns)->user_ns; | ||
| 429 | } | ||
| 430 | |||
| 391 | const struct proc_ns_operations pidns_operations = { | 431 | const struct proc_ns_operations pidns_operations = { |
| 392 | .name = "pid", | 432 | .name = "pid", |
| 393 | .type = CLONE_NEWPID, | 433 | .type = CLONE_NEWPID, |
| 394 | .get = pidns_get, | 434 | .get = pidns_get, |
| 395 | .put = pidns_put, | 435 | .put = pidns_put, |
| 396 | .install = pidns_install, | 436 | .install = pidns_install, |
| 437 | .owner = pidns_owner, | ||
| 438 | .get_parent = pidns_get_parent, | ||
| 397 | }; | 439 | }; |
| 398 | 440 | ||
| 399 | static __init int pid_namespaces_init(void) | 441 | static __init int pid_namespaces_init(void) |
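The pid_namespace hunks above wire the new ucount infrastructure into namespace creation: the charge is taken before any allocation, remembered in ns->ucounts, and released both on the error path and in destroy_pid_namespace(). A condensed sketch of that pairing, with the allocation details elided (labels and helpers as in the hunks; the ucount internals live in kernel/ucount.c and are assumed here):

	static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
							  struct pid_namespace *parent_pid_ns)
	{
		struct pid_namespace *ns;
		struct ucounts *ucounts;
		int err = -ENOSPC;

		if (parent_pid_ns->level + 1 > MAX_PID_NS_LEVEL)
			goto out;
		ucounts = inc_ucount(user_ns, current_euid(), UCOUNT_PID_NAMESPACES);
		if (!ucounts)			/* per-user limit reached */
			goto out;

		err = -ENOMEM;
		ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
		if (!ns)
			goto out_dec;

		ns->ucounts = ucounts;		/* destroy_pid_namespace() uncharges this */
		/* ... pidmap setup, get_pid_ns(parent), get_user_ns(), etc. ... */
		return ns;

	out_dec:
		dec_ucount(ucounts, UCOUNT_PID_NAMESPACES);
	out:
		return ERR_PTR(err);
	}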
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 68d3ebc12601..e8517b63eb37 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
| @@ -186,7 +186,7 @@ config PM_SLEEP_DEBUG | |||
| 186 | 186 | ||
| 187 | config DPM_WATCHDOG | 187 | config DPM_WATCHDOG |
| 188 | bool "Device suspend/resume watchdog" | 188 | bool "Device suspend/resume watchdog" |
| 189 | depends on PM_DEBUG && PSTORE | 189 | depends on PM_DEBUG && PSTORE && EXPERT |
| 190 | ---help--- | 190 | ---help--- |
| 191 | Sets up a watchdog timer to capture drivers that are | 191 | Sets up a watchdog timer to capture drivers that are |
| 192 | locked up attempting to suspend/resume a device. | 192 | locked up attempting to suspend/resume a device. |
| @@ -197,7 +197,7 @@ config DPM_WATCHDOG | |||
| 197 | config DPM_WATCHDOG_TIMEOUT | 197 | config DPM_WATCHDOG_TIMEOUT |
| 198 | int "Watchdog timeout in seconds" | 198 | int "Watchdog timeout in seconds" |
| 199 | range 1 120 | 199 | range 1 120 |
| 200 | default 60 | 200 | default 120 |
| 201 | depends on DPM_WATCHDOG | 201 | depends on DPM_WATCHDOG |
| 202 | 202 | ||
| 203 | config PM_TRACE | 203 | config PM_TRACE |
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 33c79b6105c5..b26dbc48c75b 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
| @@ -306,8 +306,10 @@ static int create_image(int platform_mode) | |||
| 306 | if (error) | 306 | if (error) |
| 307 | printk(KERN_ERR "PM: Error %d creating hibernation image\n", | 307 | printk(KERN_ERR "PM: Error %d creating hibernation image\n", |
| 308 | error); | 308 | error); |
| 309 | if (!in_suspend) | 309 | if (!in_suspend) { |
| 310 | events_check_enabled = false; | 310 | events_check_enabled = false; |
| 311 | clear_free_pages(); | ||
| 312 | } | ||
| 311 | 313 | ||
| 312 | platform_leave(platform_mode); | 314 | platform_leave(platform_mode); |
| 313 | 315 | ||
| @@ -1189,22 +1191,6 @@ static int __init nohibernate_setup(char *str) | |||
| 1189 | return 1; | 1191 | return 1; |
| 1190 | } | 1192 | } |
| 1191 | 1193 | ||
| 1192 | static int __init page_poison_nohibernate_setup(char *str) | ||
| 1193 | { | ||
| 1194 | #ifdef CONFIG_PAGE_POISONING_ZERO | ||
| 1195 | /* | ||
| 1196 | * The zeroing option for page poison skips the checks on alloc. | ||
| 1197 | * since hibernation doesn't save free pages there's no way to | ||
| 1198 | * guarantee the pages will still be zeroed. | ||
| 1199 | */ | ||
| 1200 | if (!strcmp(str, "on")) { | ||
| 1201 | pr_info("Disabling hibernation due to page poisoning\n"); | ||
| 1202 | return nohibernate_setup(str); | ||
| 1203 | } | ||
| 1204 | #endif | ||
| 1205 | return 1; | ||
| 1206 | } | ||
| 1207 | |||
| 1208 | __setup("noresume", noresume_setup); | 1194 | __setup("noresume", noresume_setup); |
| 1209 | __setup("resume_offset=", resume_offset_setup); | 1195 | __setup("resume_offset=", resume_offset_setup); |
| 1210 | __setup("resume=", resume_setup); | 1196 | __setup("resume=", resume_setup); |
| @@ -1212,4 +1198,3 @@ __setup("hibernate=", hibernate_setup); | |||
| 1212 | __setup("resumewait", resumewait_setup); | 1198 | __setup("resumewait", resumewait_setup); |
| 1213 | __setup("resumedelay=", resumedelay_setup); | 1199 | __setup("resumedelay=", resumedelay_setup); |
| 1214 | __setup("nohibernate", nohibernate_setup); | 1200 | __setup("nohibernate", nohibernate_setup); |
| 1215 | __setup("page_poison=", page_poison_nohibernate_setup); | ||
diff --git a/kernel/power/main.c b/kernel/power/main.c index 5ea50b1b7595..281a697fd458 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
| @@ -644,6 +644,7 @@ static int __init pm_init(void) | |||
| 644 | return error; | 644 | return error; |
| 645 | hibernate_image_size_init(); | 645 | hibernate_image_size_init(); |
| 646 | hibernate_reserved_size_init(); | 646 | hibernate_reserved_size_init(); |
| 647 | pm_states_init(); | ||
| 647 | power_kobj = kobject_create_and_add("power", NULL); | 648 | power_kobj = kobject_create_and_add("power", NULL); |
| 648 | if (!power_kobj) | 649 | if (!power_kobj) |
| 649 | return -ENOMEM; | 650 | return -ENOMEM; |
diff --git a/kernel/power/power.h b/kernel/power/power.h index 242d8b827dd5..56d1d0dedf76 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
| @@ -110,6 +110,8 @@ extern int create_basic_memory_bitmaps(void); | |||
| 110 | extern void free_basic_memory_bitmaps(void); | 110 | extern void free_basic_memory_bitmaps(void); |
| 111 | extern int hibernate_preallocate_memory(void); | 111 | extern int hibernate_preallocate_memory(void); |
| 112 | 112 | ||
| 113 | extern void clear_free_pages(void); | ||
| 114 | |||
| 113 | /** | 115 | /** |
| 114 | * Auxiliary structure used for reading the snapshot image data and | 116 | * Auxiliary structure used for reading the snapshot image data and |
| 115 | * metadata from and writing them to the list of page backup entries | 117 | * metadata from and writing them to the list of page backup entries |
diff --git a/kernel/power/process.c b/kernel/power/process.c index 8f27d5a8adf6..2fba066e125f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
| @@ -144,23 +144,12 @@ int freeze_processes(void) | |||
| 144 | /* | 144 | /* |
| 145 | * Now that the whole userspace is frozen we need to disbale | 145 | * Now that the whole userspace is frozen we need to disbale |
| 146 | * the OOM killer to disallow any further interference with | 146 | * the OOM killer to disallow any further interference with |
| 147 | * killable tasks. | 147 | * killable tasks. There is no guarantee oom victims will |
| 148 | * ever reach a point they go away we have to wait with a timeout. | ||
| 148 | */ | 149 | */ |
| 149 | if (!error && !oom_killer_disable()) | 150 | if (!error && !oom_killer_disable(msecs_to_jiffies(freeze_timeout_msecs))) |
| 150 | error = -EBUSY; | 151 | error = -EBUSY; |
| 151 | 152 | ||
| 152 | /* | ||
| 153 | * There is a hard to fix race between oom_reaper kernel thread | ||
| 154 | * and oom_killer_disable. oom_reaper calls exit_oom_victim | ||
| 155 | * before the victim reaches exit_mm so try to freeze all the tasks | ||
| 156 | * again and catch such a left over task. | ||
| 157 | */ | ||
| 158 | if (!error) { | ||
| 159 | pr_info("Double checking all user space processes after OOM killer disable... "); | ||
| 160 | error = try_to_freeze_tasks(true); | ||
| 161 | pr_cont("\n"); | ||
| 162 | } | ||
| 163 | |||
| 164 | if (error) | 153 | if (error) |
| 165 | thaw_processes(); | 154 | thaw_processes(); |
| 166 | return error; | 155 | return error; |
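freeze_processes() now passes oom_killer_disable() a timeout (freeze_timeout_msecs converted to jiffies) instead of re-freezing all tasks afterwards to catch leftover OOM victims. The mm side is not part of this diff; a hedged sketch of what the new call site assumes the function does:

	/* Assumed shape of oom_killer_disable(timeout) in mm/oom_kill.c: forbid
	 * further OOM kills, then wait - bounded by 'timeout' jiffies - for
	 * already-selected victims to go away, re-enabling on failure. */
	bool oom_killer_disable(signed long timeout)
	{
		signed long ret;

		if (mutex_lock_killable(&oom_lock))
			return false;
		oom_killer_disabled = true;
		mutex_unlock(&oom_lock);

		ret = wait_event_interruptible_timeout(oom_victims_wait,
						       !atomic_read(&oom_victims),
						       timeout);
		if (ret <= 0) {
			oom_killer_enable();	/* timed out or interrupted */
			return false;
		}
		return true;
	}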
diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 97b0df71303e..168ff442ebde 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c | |||
| @@ -482,7 +482,16 @@ void pm_qos_update_request(struct pm_qos_request *req, | |||
| 482 | return; | 482 | return; |
| 483 | } | 483 | } |
| 484 | 484 | ||
| 485 | cancel_delayed_work_sync(&req->work); | 485 | /* |
| 486 | * This function may be called very early during boot, for example, | ||
| 487 | * from of_clk_init(), where irq needs to stay disabled. | ||
| 488 | * cancel_delayed_work_sync() assumes that irq is enabled on | ||
| 489 | * invocation and re-enables it on return. Avoid calling it until | ||
| 490 | * workqueue is initialized. | ||
| 491 | */ | ||
| 492 | if (keventd_up()) | ||
| 493 | cancel_delayed_work_sync(&req->work); | ||
| 494 | |||
| 486 | __pm_qos_update_request(req, new_value); | 495 | __pm_qos_update_request(req, new_value); |
| 487 | } | 496 | } |
| 488 | EXPORT_SYMBOL_GPL(pm_qos_update_request); | 497 | EXPORT_SYMBOL_GPL(pm_qos_update_request); |
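The guard above only matters for very early callers; once workqueues exist the behaviour is unchanged, and before they exist nothing can have been queued on req->work anyway. Condensed, the update path now looks like this (a sketch, with the early-return sanity checks abridged):

	void pm_qos_update_request(struct pm_qos_request *req, s32 new_value)
	{
		if (!req)
			return;		/* bogus request, ignore as before */

		/* cancel_delayed_work_sync() assumes IRQs are enabled and re-enables
		 * them on return; skip it until the workqueue subsystem is up, e.g.
		 * when called from of_clk_init() during early boot. */
		if (keventd_up())
			cancel_delayed_work_sync(&req->work);

		__pm_qos_update_request(req, new_value);
	}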
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 9a0178c2ac1d..4f0f0604f1c4 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
| @@ -835,9 +835,9 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) | |||
| 835 | */ | 835 | */ |
| 836 | static bool rtree_next_node(struct memory_bitmap *bm) | 836 | static bool rtree_next_node(struct memory_bitmap *bm) |
| 837 | { | 837 | { |
| 838 | bm->cur.node = list_entry(bm->cur.node->list.next, | 838 | if (!list_is_last(&bm->cur.node->list, &bm->cur.zone->leaves)) { |
| 839 | struct rtree_node, list); | 839 | bm->cur.node = list_entry(bm->cur.node->list.next, |
| 840 | if (&bm->cur.node->list != &bm->cur.zone->leaves) { | 840 | struct rtree_node, list); |
| 841 | bm->cur.node_pfn += BM_BITS_PER_BLOCK; | 841 | bm->cur.node_pfn += BM_BITS_PER_BLOCK; |
| 842 | bm->cur.node_bit = 0; | 842 | bm->cur.node_bit = 0; |
| 843 | touch_softlockup_watchdog(); | 843 | touch_softlockup_watchdog(); |
| @@ -845,9 +845,9 @@ static bool rtree_next_node(struct memory_bitmap *bm) | |||
| 845 | } | 845 | } |
| 846 | 846 | ||
| 847 | /* No more nodes, goto next zone */ | 847 | /* No more nodes, goto next zone */ |
| 848 | bm->cur.zone = list_entry(bm->cur.zone->list.next, | 848 | if (!list_is_last(&bm->cur.zone->list, &bm->zones)) { |
| 849 | bm->cur.zone = list_entry(bm->cur.zone->list.next, | ||
| 849 | struct mem_zone_bm_rtree, list); | 850 | struct mem_zone_bm_rtree, list); |
| 850 | if (&bm->cur.zone->list != &bm->zones) { | ||
| 851 | bm->cur.node = list_entry(bm->cur.zone->leaves.next, | 851 | bm->cur.node = list_entry(bm->cur.zone->leaves.next, |
| 852 | struct rtree_node, list); | 852 | struct rtree_node, list); |
| 853 | bm->cur.node_pfn = 0; | 853 | bm->cur.node_pfn = 0; |
| @@ -1132,6 +1132,28 @@ void free_basic_memory_bitmaps(void) | |||
| 1132 | pr_debug("PM: Basic memory bitmaps freed\n"); | 1132 | pr_debug("PM: Basic memory bitmaps freed\n"); |
| 1133 | } | 1133 | } |
| 1134 | 1134 | ||
| 1135 | void clear_free_pages(void) | ||
| 1136 | { | ||
| 1137 | #ifdef CONFIG_PAGE_POISONING_ZERO | ||
| 1138 | struct memory_bitmap *bm = free_pages_map; | ||
| 1139 | unsigned long pfn; | ||
| 1140 | |||
| 1141 | if (WARN_ON(!(free_pages_map))) | ||
| 1142 | return; | ||
| 1143 | |||
| 1144 | memory_bm_position_reset(bm); | ||
| 1145 | pfn = memory_bm_next_pfn(bm); | ||
| 1146 | while (pfn != BM_END_OF_MAP) { | ||
| 1147 | if (pfn_valid(pfn)) | ||
| 1148 | clear_highpage(pfn_to_page(pfn)); | ||
| 1149 | |||
| 1150 | pfn = memory_bm_next_pfn(bm); | ||
| 1151 | } | ||
| 1152 | memory_bm_position_reset(bm); | ||
| 1153 | pr_info("PM: free pages cleared after restore\n"); | ||
| 1154 | #endif /* PAGE_POISONING_ZERO */ | ||
| 1155 | } | ||
| 1156 | |||
| 1135 | /** | 1157 | /** |
| 1136 | * snapshot_additional_pages - Estimate the number of extra pages needed. | 1158 | * snapshot_additional_pages - Estimate the number of extra pages needed. |
| 1137 | * @zone: Memory zone to carry out the computation for. | 1159 | * @zone: Memory zone to carry out the computation for. |
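Two independent things happen in snapshot.c. First, clear_free_pages() restores the "free pages are zeroed" guarantee that CONFIG_PAGE_POISONING_ZERO relies on, which is why the page_poison= hibernate hook could be deleted above. Second, rtree_next_node() now tests for the end of a list before converting the next element; the old code converted first and compared afterwards, leaving the cursor pointing at a bogus container_of() of the list head when the walk was already at the end. The traversal fix boils down to this pattern (sketch of the leaf-node branch):

	/* Ask list_is_last() first; only convert list.next when a real next
	 * element exists, so bm->cur.node never points at the list head. */
	if (!list_is_last(&bm->cur.node->list, &bm->cur.zone->leaves)) {
		bm->cur.node = list_entry(bm->cur.node->list.next,
					  struct rtree_node, list);
		bm->cur.node_pfn += BM_BITS_PER_BLOCK;
		bm->cur.node_bit = 0;
		touch_softlockup_watchdog();
		return true;
	}
	/* otherwise fall through and try the next zone in the same way */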
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 0acab9d7f96f..1e7f5da648d9 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
| @@ -118,10 +118,18 @@ static bool valid_state(suspend_state_t state) | |||
| 118 | */ | 118 | */ |
| 119 | static bool relative_states; | 119 | static bool relative_states; |
| 120 | 120 | ||
| 121 | void __init pm_states_init(void) | ||
| 122 | { | ||
| 123 | /* | ||
| 124 | * freeze state should be supported even without any suspend_ops, | ||
| 125 | * initialize pm_states accordingly here | ||
| 126 | */ | ||
| 127 | pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2]; | ||
| 128 | } | ||
| 129 | |||
| 121 | static int __init sleep_states_setup(char *str) | 130 | static int __init sleep_states_setup(char *str) |
| 122 | { | 131 | { |
| 123 | relative_states = !strncmp(str, "1", 1); | 132 | relative_states = !strncmp(str, "1", 1); |
| 124 | pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2]; | ||
| 125 | return 1; | 133 | return 1; |
| 126 | } | 134 | } |
| 127 | 135 | ||
| @@ -211,7 +219,7 @@ static int platform_suspend_begin(suspend_state_t state) | |||
| 211 | { | 219 | { |
| 212 | if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) | 220 | if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) |
| 213 | return freeze_ops->begin(); | 221 | return freeze_ops->begin(); |
| 214 | else if (suspend_ops->begin) | 222 | else if (suspend_ops && suspend_ops->begin) |
| 215 | return suspend_ops->begin(state); | 223 | return suspend_ops->begin(state); |
| 216 | else | 224 | else |
| 217 | return 0; | 225 | return 0; |
| @@ -221,7 +229,7 @@ static void platform_resume_end(suspend_state_t state) | |||
| 221 | { | 229 | { |
| 222 | if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) | 230 | if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) |
| 223 | freeze_ops->end(); | 231 | freeze_ops->end(); |
| 224 | else if (suspend_ops->end) | 232 | else if (suspend_ops && suspend_ops->end) |
| 225 | suspend_ops->end(); | 233 | suspend_ops->end(); |
| 226 | } | 234 | } |
| 227 | 235 | ||
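Initializing the "freeze" label moves out of the __setup() hook, which only runs when relative_sleep_states= is given on the command line, into an unconditional call made from pm_init() (see the main.c hunk above). Because pm_init() is an initcall, it still runs after early parameter parsing and therefore still honours relative_sleep_states=. Sketch of the resulting split:

	/* "freeze" must be a valid sleep state even with no suspend_ops
	 * registered, so its label is installed unconditionally at boot. */
	void __init pm_states_init(void)
	{
		pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2];
	}

	static int __init sleep_states_setup(char *str)
	{
		relative_states = !strncmp(str, "1", 1);
		return 1;	/* label selection now happens in pm_states_init() */
	}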
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c index 276762f3a460..d5760c42f042 100644 --- a/kernel/printk/braille.c +++ b/kernel/printk/braille.c | |||
| @@ -9,10 +9,10 @@ | |||
| 9 | 9 | ||
| 10 | char *_braille_console_setup(char **str, char **brl_options) | 10 | char *_braille_console_setup(char **str, char **brl_options) |
| 11 | { | 11 | { |
| 12 | if (!memcmp(*str, "brl,", 4)) { | 12 | if (!strncmp(*str, "brl,", 4)) { |
| 13 | *brl_options = ""; | 13 | *brl_options = ""; |
| 14 | *str += 4; | 14 | *str += 4; |
| 15 | } else if (!memcmp(str, "brl=", 4)) { | 15 | } else if (!strncmp(*str, "brl=", 4)) { |
| 16 | *brl_options = *str + 4; | 16 | *brl_options = *str + 4; |
| 17 | *str = strchr(*brl_options, ','); | 17 | *str = strchr(*brl_options, ','); |
| 18 | if (!*str) | 18 | if (!*str) |
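The second branch compared the bytes of the char ** itself (memcmp(str, ...)) rather than the string it points to, so a console=brl=... option was never recognized; switching to strncmp(*str, ...) also stops the comparison at a terminating NUL instead of reading past a short string. Abridged, the fixed parser looks like this (the error handling for a missing port name is elided):

	char *_braille_console_setup(char **str, char **brl_options)
	{
		if (!strncmp(*str, "brl,", 4)) {
			*brl_options = "";		/* braille console with default options */
			*str += 4;
		} else if (!strncmp(*str, "brl=", 4)) {
			*brl_options = *str + 4;	/* options run up to the next ',' */
			*str = strchr(*brl_options, ',');
			/* ... complain and bail out if no port name follows ... */
		}
		return *str;
	}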
diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index b69eb8a2876f..16bab471c7e2 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c | |||
| @@ -99,27 +99,33 @@ again: | |||
| 99 | return add; | 99 | return add; |
| 100 | } | 100 | } |
| 101 | 101 | ||
| 102 | /* | 102 | static void printk_nmi_flush_line(const char *text, int len) |
| 103 | * printk one line from the temporary buffer from @start index until | ||
| 104 | * and including the @end index. | ||
| 105 | */ | ||
| 106 | static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end) | ||
| 107 | { | 103 | { |
| 108 | const char *buf = s->buffer + start; | ||
| 109 | |||
| 110 | /* | 104 | /* |
| 111 | * The buffers are flushed in NMI only on panic. The messages must | 105 | * The buffers are flushed in NMI only on panic. The messages must |
| 112 | * go only into the ring buffer at this stage. Consoles will get | 106 | * go only into the ring buffer at this stage. Consoles will get |
| 113 | * explicitly called later when a crashdump is not generated. | 107 | * explicitly called later when a crashdump is not generated. |
| 114 | */ | 108 | */ |
| 115 | if (in_nmi()) | 109 | if (in_nmi()) |
| 116 | printk_deferred("%.*s", (end - start) + 1, buf); | 110 | printk_deferred("%.*s", len, text); |
| 117 | else | 111 | else |
| 118 | printk("%.*s", (end - start) + 1, buf); | 112 | printk("%.*s", len, text); |
| 119 | 113 | ||
| 120 | } | 114 | } |
| 121 | 115 | ||
| 122 | /* | 116 | /* |
| 117 | * printk one line from the temporary buffer from @start index until | ||
| 118 | * and including the @end index. | ||
| 119 | */ | ||
| 120 | static void printk_nmi_flush_seq_line(struct nmi_seq_buf *s, | ||
| 121 | int start, int end) | ||
| 122 | { | ||
| 123 | const char *buf = s->buffer + start; | ||
| 124 | |||
| 125 | printk_nmi_flush_line(buf, (end - start) + 1); | ||
| 126 | } | ||
| 127 | |||
| 128 | /* | ||
| 123 | * Flush data from the associated per_CPU buffer. The function | 129 | * Flush data from the associated per_CPU buffer. The function |
| 124 | * can be called either via IRQ work or independently. | 130 | * can be called either via IRQ work or independently. |
| 125 | */ | 131 | */ |
| @@ -150,9 +156,11 @@ more: | |||
| 150 | * the buffer an unexpected way. If we printed something then | 156 | * the buffer an unexpected way. If we printed something then |
| 151 | * @len must only increase. | 157 | * @len must only increase. |
| 152 | */ | 158 | */ |
| 153 | if (i && i >= len) | 159 | if (i && i >= len) { |
| 154 | pr_err("printk_nmi_flush: internal error: i=%d >= len=%zu\n", | 160 | const char *msg = "printk_nmi_flush: internal error\n"; |
| 155 | i, len); | 161 | |
| 162 | printk_nmi_flush_line(msg, strlen(msg)); | ||
| 163 | } | ||
| 156 | 164 | ||
| 157 | if (!len) | 165 | if (!len) |
| 158 | goto out; /* Someone else has already flushed the buffer. */ | 166 | goto out; /* Someone else has already flushed the buffer. */ |
| @@ -166,14 +174,14 @@ more: | |||
| 166 | /* Print line by line. */ | 174 | /* Print line by line. */ |
| 167 | for (; i < size; i++) { | 175 | for (; i < size; i++) { |
| 168 | if (s->buffer[i] == '\n') { | 176 | if (s->buffer[i] == '\n') { |
| 169 | print_nmi_seq_line(s, last_i, i); | 177 | printk_nmi_flush_seq_line(s, last_i, i); |
| 170 | last_i = i + 1; | 178 | last_i = i + 1; |
| 171 | } | 179 | } |
| 172 | } | 180 | } |
| 173 | /* Check if there was a partial line. */ | 181 | /* Check if there was a partial line. */ |
| 174 | if (last_i < size) { | 182 | if (last_i < size) { |
| 175 | print_nmi_seq_line(s, last_i, size - 1); | 183 | printk_nmi_flush_seq_line(s, last_i, size - 1); |
| 176 | pr_cont("\n"); | 184 | printk_nmi_flush_line("\n", strlen("\n")); |
| 177 | } | 185 | } |
| 178 | 186 | ||
| 179 | /* | 187 | /* |
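Splitting out printk_nmi_flush_line() means the same NMI-safety decision (printk_deferred() versus printk()) is applied both to buffered log lines and to ad-hoc strings such as the internal-error message and the trailing newline, which previously went through pr_err()/pr_cont() and could hit the consoles from NMI context. The whole decision now lives in one small helper (sketch of the pair as introduced above):

	static void printk_nmi_flush_line(const char *text, int len)
	{
		/* When flushing from NMI (panic), messages may only go into the
		 * ring buffer; consoles are driven later if no crashdump is made. */
		if (in_nmi())
			printk_deferred("%.*s", len, text);
		else
			printk("%.*s", len, text);
	}

	/* Print one line of the per-CPU buffer, indexes [start, end] inclusive. */
	static void printk_nmi_flush_seq_line(struct nmi_seq_buf *s, int start, int end)
	{
		printk_nmi_flush_line(s->buffer + start, (end - start) + 1);
	}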
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index eea6dbc2d8cf..de08fc90baaf 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
| @@ -253,6 +253,17 @@ static int preferred_console = -1; | |||
| 253 | int console_set_on_cmdline; | 253 | int console_set_on_cmdline; |
| 254 | EXPORT_SYMBOL(console_set_on_cmdline); | 254 | EXPORT_SYMBOL(console_set_on_cmdline); |
| 255 | 255 | ||
| 256 | #ifdef CONFIG_OF | ||
| 257 | static bool of_specified_console; | ||
| 258 | |||
| 259 | void console_set_by_of(void) | ||
| 260 | { | ||
| 261 | of_specified_console = true; | ||
| 262 | } | ||
| 263 | #else | ||
| 264 | # define of_specified_console false | ||
| 265 | #endif | ||
| 266 | |||
| 256 | /* Flag: console code may call schedule() */ | 267 | /* Flag: console code may call schedule() */ |
| 257 | static int console_may_schedule; | 268 | static int console_may_schedule; |
| 258 | 269 | ||
| @@ -655,11 +666,8 @@ static ssize_t msg_print_ext_header(char *buf, size_t size, | |||
| 655 | * better readable output. 'c' in the record flags mark the first | 666 | * better readable output. 'c' in the record flags mark the first |
| 656 | * fragment of a line, '+' the following. | 667 | * fragment of a line, '+' the following. |
| 657 | */ | 668 | */ |
| 658 | if (msg->flags & LOG_CONT && !(prev_flags & LOG_CONT)) | 669 | if (msg->flags & LOG_CONT) |
| 659 | cont = 'c'; | 670 | cont = (prev_flags & LOG_CONT) ? '+' : 'c'; |
| 660 | else if ((msg->flags & LOG_CONT) || | ||
| 661 | ((prev_flags & LOG_CONT) && !(msg->flags & LOG_PREFIX))) | ||
| 662 | cont = '+'; | ||
| 663 | 671 | ||
| 664 | return scnprintf(buf, size, "%u,%llu,%llu,%c;", | 672 | return scnprintf(buf, size, "%u,%llu,%llu,%c;", |
| 665 | (msg->facility << 3) | msg->level, seq, ts_usec, cont); | 673 | (msg->facility << 3) | msg->level, seq, ts_usec, cont); |
| @@ -786,6 +794,8 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) | |||
| 786 | return ret; | 794 | return ret; |
| 787 | } | 795 | } |
| 788 | 796 | ||
| 797 | static void cont_flush(void); | ||
| 798 | |||
| 789 | static ssize_t devkmsg_read(struct file *file, char __user *buf, | 799 | static ssize_t devkmsg_read(struct file *file, char __user *buf, |
| 790 | size_t count, loff_t *ppos) | 800 | size_t count, loff_t *ppos) |
| 791 | { | 801 | { |
| @@ -801,6 +811,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, | |||
| 801 | if (ret) | 811 | if (ret) |
| 802 | return ret; | 812 | return ret; |
| 803 | raw_spin_lock_irq(&logbuf_lock); | 813 | raw_spin_lock_irq(&logbuf_lock); |
| 814 | cont_flush(); | ||
| 804 | while (user->seq == log_next_seq) { | 815 | while (user->seq == log_next_seq) { |
| 805 | if (file->f_flags & O_NONBLOCK) { | 816 | if (file->f_flags & O_NONBLOCK) { |
| 806 | ret = -EAGAIN; | 817 | ret = -EAGAIN; |
| @@ -863,6 +874,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) | |||
| 863 | return -ESPIPE; | 874 | return -ESPIPE; |
| 864 | 875 | ||
| 865 | raw_spin_lock_irq(&logbuf_lock); | 876 | raw_spin_lock_irq(&logbuf_lock); |
| 877 | cont_flush(); | ||
| 866 | switch (whence) { | 878 | switch (whence) { |
| 867 | case SEEK_SET: | 879 | case SEEK_SET: |
| 868 | /* the first record */ | 880 | /* the first record */ |
| @@ -901,6 +913,7 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) | |||
| 901 | poll_wait(file, &log_wait, wait); | 913 | poll_wait(file, &log_wait, wait); |
| 902 | 914 | ||
| 903 | raw_spin_lock_irq(&logbuf_lock); | 915 | raw_spin_lock_irq(&logbuf_lock); |
| 916 | cont_flush(); | ||
| 904 | if (user->seq < log_next_seq) { | 917 | if (user->seq < log_next_seq) { |
| 905 | /* return error when data has vanished underneath us */ | 918 | /* return error when data has vanished underneath us */ |
| 906 | if (user->seq < log_first_seq) | 919 | if (user->seq < log_first_seq) |
| @@ -1287,6 +1300,7 @@ static int syslog_print(char __user *buf, int size) | |||
| 1287 | size_t skip; | 1300 | size_t skip; |
| 1288 | 1301 | ||
| 1289 | raw_spin_lock_irq(&logbuf_lock); | 1302 | raw_spin_lock_irq(&logbuf_lock); |
| 1303 | cont_flush(); | ||
| 1290 | if (syslog_seq < log_first_seq) { | 1304 | if (syslog_seq < log_first_seq) { |
| 1291 | /* messages are gone, move to first one */ | 1305 | /* messages are gone, move to first one */ |
| 1292 | syslog_seq = log_first_seq; | 1306 | syslog_seq = log_first_seq; |
| @@ -1346,6 +1360,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) | |||
| 1346 | return -ENOMEM; | 1360 | return -ENOMEM; |
| 1347 | 1361 | ||
| 1348 | raw_spin_lock_irq(&logbuf_lock); | 1362 | raw_spin_lock_irq(&logbuf_lock); |
| 1363 | cont_flush(); | ||
| 1349 | if (buf) { | 1364 | if (buf) { |
| 1350 | u64 next_seq; | 1365 | u64 next_seq; |
| 1351 | u64 seq; | 1366 | u64 seq; |
| @@ -1507,6 +1522,7 @@ int do_syslog(int type, char __user *buf, int len, int source) | |||
| 1507 | /* Number of chars in the log buffer */ | 1522 | /* Number of chars in the log buffer */ |
| 1508 | case SYSLOG_ACTION_SIZE_UNREAD: | 1523 | case SYSLOG_ACTION_SIZE_UNREAD: |
| 1509 | raw_spin_lock_irq(&logbuf_lock); | 1524 | raw_spin_lock_irq(&logbuf_lock); |
| 1525 | cont_flush(); | ||
| 1510 | if (syslog_seq < log_first_seq) { | 1526 | if (syslog_seq < log_first_seq) { |
| 1511 | /* messages are gone, move to first one */ | 1527 | /* messages are gone, move to first one */ |
| 1512 | syslog_seq = log_first_seq; | 1528 | syslog_seq = log_first_seq; |
| @@ -1643,35 +1659,33 @@ static struct cont { | |||
| 1643 | bool flushed:1; /* buffer sealed and committed */ | 1659 | bool flushed:1; /* buffer sealed and committed */ |
| 1644 | } cont; | 1660 | } cont; |
| 1645 | 1661 | ||
| 1646 | static void cont_flush(enum log_flags flags) | 1662 | static void cont_flush(void) |
| 1647 | { | 1663 | { |
| 1648 | if (cont.flushed) | 1664 | if (cont.flushed) |
| 1649 | return; | 1665 | return; |
| 1650 | if (cont.len == 0) | 1666 | if (cont.len == 0) |
| 1651 | return; | 1667 | return; |
| 1652 | |||
| 1653 | if (cont.cons) { | 1668 | if (cont.cons) { |
| 1654 | /* | 1669 | /* |
| 1655 | * If a fragment of this line was directly flushed to the | 1670 | * If a fragment of this line was directly flushed to the |
| 1656 | * console; wait for the console to pick up the rest of the | 1671 | * console; wait for the console to pick up the rest of the |
| 1657 | * line. LOG_NOCONS suppresses a duplicated output. | 1672 | * line. LOG_NOCONS suppresses a duplicated output. |
| 1658 | */ | 1673 | */ |
| 1659 | log_store(cont.facility, cont.level, flags | LOG_NOCONS, | 1674 | log_store(cont.facility, cont.level, cont.flags | LOG_NOCONS, |
| 1660 | cont.ts_nsec, NULL, 0, cont.buf, cont.len); | 1675 | cont.ts_nsec, NULL, 0, cont.buf, cont.len); |
| 1661 | cont.flags = flags; | ||
| 1662 | cont.flushed = true; | 1676 | cont.flushed = true; |
| 1663 | } else { | 1677 | } else { |
| 1664 | /* | 1678 | /* |
| 1665 | * If no fragment of this line ever reached the console, | 1679 | * If no fragment of this line ever reached the console, |
| 1666 | * just submit it to the store and free the buffer. | 1680 | * just submit it to the store and free the buffer. |
| 1667 | */ | 1681 | */ |
| 1668 | log_store(cont.facility, cont.level, flags, 0, | 1682 | log_store(cont.facility, cont.level, cont.flags, 0, |
| 1669 | NULL, 0, cont.buf, cont.len); | 1683 | NULL, 0, cont.buf, cont.len); |
| 1670 | cont.len = 0; | 1684 | cont.len = 0; |
| 1671 | } | 1685 | } |
| 1672 | } | 1686 | } |
| 1673 | 1687 | ||
| 1674 | static bool cont_add(int facility, int level, const char *text, size_t len) | 1688 | static bool cont_add(int facility, int level, enum log_flags flags, const char *text, size_t len) |
| 1675 | { | 1689 | { |
| 1676 | if (cont.len && cont.flushed) | 1690 | if (cont.len && cont.flushed) |
| 1677 | return false; | 1691 | return false; |
| @@ -1682,7 +1696,7 @@ static bool cont_add(int facility, int level, const char *text, size_t len) | |||
| 1682 | * the line gets too long, split it up in separate records. | 1696 | * the line gets too long, split it up in separate records. |
| 1683 | */ | 1697 | */ |
| 1684 | if (nr_ext_console_drivers || cont.len + len > sizeof(cont.buf)) { | 1698 | if (nr_ext_console_drivers || cont.len + len > sizeof(cont.buf)) { |
| 1685 | cont_flush(LOG_CONT); | 1699 | cont_flush(); |
| 1686 | return false; | 1700 | return false; |
| 1687 | } | 1701 | } |
| 1688 | 1702 | ||
| @@ -1691,7 +1705,7 @@ static bool cont_add(int facility, int level, const char *text, size_t len) | |||
| 1691 | cont.level = level; | 1705 | cont.level = level; |
| 1692 | cont.owner = current; | 1706 | cont.owner = current; |
| 1693 | cont.ts_nsec = local_clock(); | 1707 | cont.ts_nsec = local_clock(); |
| 1694 | cont.flags = 0; | 1708 | cont.flags = flags; |
| 1695 | cont.cons = 0; | 1709 | cont.cons = 0; |
| 1696 | cont.flushed = false; | 1710 | cont.flushed = false; |
| 1697 | } | 1711 | } |
| @@ -1699,8 +1713,15 @@ static bool cont_add(int facility, int level, const char *text, size_t len) | |||
| 1699 | memcpy(cont.buf + cont.len, text, len); | 1713 | memcpy(cont.buf + cont.len, text, len); |
| 1700 | cont.len += len; | 1714 | cont.len += len; |
| 1701 | 1715 | ||
| 1716 | // The original flags come from the first line, | ||
| 1717 | // but later continuations can add a newline. | ||
| 1718 | if (flags & LOG_NEWLINE) { | ||
| 1719 | cont.flags |= LOG_NEWLINE; | ||
| 1720 | cont_flush(); | ||
| 1721 | } | ||
| 1722 | |||
| 1702 | if (cont.len > (sizeof(cont.buf) * 80) / 100) | 1723 | if (cont.len > (sizeof(cont.buf) * 80) / 100) |
| 1703 | cont_flush(LOG_CONT); | 1724 | cont_flush(); |
| 1704 | 1725 | ||
| 1705 | return true; | 1726 | return true; |
| 1706 | } | 1727 | } |
| @@ -1733,6 +1754,35 @@ static size_t cont_print_text(char *text, size_t size) | |||
| 1733 | return textlen; | 1754 | return textlen; |
| 1734 | } | 1755 | } |
| 1735 | 1756 | ||
| 1757 | static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len) | ||
| 1758 | { | ||
| 1759 | /* | ||
| 1760 | * If an earlier line was buffered, and we're a continuation | ||
| 1761 | * write from the same process, try to add it to the buffer. | ||
| 1762 | */ | ||
| 1763 | if (cont.len) { | ||
| 1764 | if (cont.owner == current && (lflags & LOG_CONT)) { | ||
| 1765 | if (cont_add(facility, level, lflags, text, text_len)) | ||
| 1766 | return text_len; | ||
| 1767 | } | ||
| 1768 | /* Otherwise, make sure it's flushed */ | ||
| 1769 | cont_flush(); | ||
| 1770 | } | ||
| 1771 | |||
| 1772 | /* Skip empty continuation lines that couldn't be added - they just flush */ | ||
| 1773 | if (!text_len && (lflags & LOG_CONT)) | ||
| 1774 | return 0; | ||
| 1775 | |||
| 1776 | /* If it doesn't end in a newline, try to buffer the current line */ | ||
| 1777 | if (!(lflags & LOG_NEWLINE)) { | ||
| 1778 | if (cont_add(facility, level, lflags, text, text_len)) | ||
| 1779 | return text_len; | ||
| 1780 | } | ||
| 1781 | |||
| 1782 | /* Store it in the record log */ | ||
| 1783 | return log_store(facility, level, lflags, 0, dict, dictlen, text, text_len); | ||
| 1784 | } | ||
| 1785 | |||
| 1736 | asmlinkage int vprintk_emit(int facility, int level, | 1786 | asmlinkage int vprintk_emit(int facility, int level, |
| 1737 | const char *dict, size_t dictlen, | 1787 | const char *dict, size_t dictlen, |
| 1738 | const char *fmt, va_list args) | 1788 | const char *fmt, va_list args) |
| @@ -1819,10 +1869,9 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1819 | 1869 | ||
| 1820 | /* strip kernel syslog prefix and extract log level or control flags */ | 1870 | /* strip kernel syslog prefix and extract log level or control flags */ |
| 1821 | if (facility == 0) { | 1871 | if (facility == 0) { |
| 1822 | int kern_level = printk_get_level(text); | 1872 | int kern_level; |
| 1823 | 1873 | ||
| 1824 | if (kern_level) { | 1874 | while ((kern_level = printk_get_level(text)) != 0) { |
| 1825 | const char *end_of_header = printk_skip_level(text); | ||
| 1826 | switch (kern_level) { | 1875 | switch (kern_level) { |
| 1827 | case '0' ... '7': | 1876 | case '0' ... '7': |
| 1828 | if (level == LOGLEVEL_DEFAULT) | 1877 | if (level == LOGLEVEL_DEFAULT) |
| @@ -1830,14 +1879,13 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1830 | /* fallthrough */ | 1879 | /* fallthrough */ |
| 1831 | case 'd': /* KERN_DEFAULT */ | 1880 | case 'd': /* KERN_DEFAULT */ |
| 1832 | lflags |= LOG_PREFIX; | 1881 | lflags |= LOG_PREFIX; |
| 1882 | break; | ||
| 1883 | case 'c': /* KERN_CONT */ | ||
| 1884 | lflags |= LOG_CONT; | ||
| 1833 | } | 1885 | } |
| 1834 | /* | 1886 | |
| 1835 | * No need to check length here because vscnprintf | 1887 | text_len -= 2; |
| 1836 | * put '\0' at the end of the string. Only valid and | 1888 | text += 2; |
| 1837 | * newly printed level is detected. | ||
| 1838 | */ | ||
| 1839 | text_len -= end_of_header - text; | ||
| 1840 | text = (char *)end_of_header; | ||
| 1841 | } | 1889 | } |
| 1842 | } | 1890 | } |
| 1843 | 1891 | ||
| @@ -1847,45 +1895,7 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1847 | if (dict) | 1895 | if (dict) |
| 1848 | lflags |= LOG_PREFIX|LOG_NEWLINE; | 1896 | lflags |= LOG_PREFIX|LOG_NEWLINE; |
| 1849 | 1897 | ||
| 1850 | if (!(lflags & LOG_NEWLINE)) { | 1898 | printed_len += log_output(facility, level, lflags, dict, dictlen, text, text_len); |
| 1851 | /* | ||
| 1852 | * Flush the conflicting buffer. An earlier newline was missing, | ||
| 1853 | * or another task also prints continuation lines. | ||
| 1854 | */ | ||
| 1855 | if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) | ||
| 1856 | cont_flush(LOG_NEWLINE); | ||
| 1857 | |||
| 1858 | /* buffer line if possible, otherwise store it right away */ | ||
| 1859 | if (cont_add(facility, level, text, text_len)) | ||
| 1860 | printed_len += text_len; | ||
| 1861 | else | ||
| 1862 | printed_len += log_store(facility, level, | ||
| 1863 | lflags | LOG_CONT, 0, | ||
| 1864 | dict, dictlen, text, text_len); | ||
| 1865 | } else { | ||
| 1866 | bool stored = false; | ||
| 1867 | |||
| 1868 | /* | ||
| 1869 | * If an earlier newline was missing and it was the same task, | ||
| 1870 | * either merge it with the current buffer and flush, or if | ||
| 1871 | * there was a race with interrupts (prefix == true) then just | ||
| 1872 | * flush it out and store this line separately. | ||
| 1873 | * If the preceding printk was from a different task and missed | ||
| 1874 | * a newline, flush and append the newline. | ||
| 1875 | */ | ||
| 1876 | if (cont.len) { | ||
| 1877 | if (cont.owner == current && !(lflags & LOG_PREFIX)) | ||
| 1878 | stored = cont_add(facility, level, text, | ||
| 1879 | text_len); | ||
| 1880 | cont_flush(LOG_NEWLINE); | ||
| 1881 | } | ||
| 1882 | |||
| 1883 | if (stored) | ||
| 1884 | printed_len += text_len; | ||
| 1885 | else | ||
| 1886 | printed_len += log_store(facility, level, lflags, 0, | ||
| 1887 | dict, dictlen, text, text_len); | ||
| 1888 | } | ||
| 1889 | 1899 | ||
| 1890 | logbuf_cpu = UINT_MAX; | 1900 | logbuf_cpu = UINT_MAX; |
| 1891 | raw_spin_unlock(&logbuf_lock); | 1901 | raw_spin_unlock(&logbuf_lock); |
| @@ -2647,7 +2657,7 @@ void register_console(struct console *newcon) | |||
| 2647 | * didn't select a console we take the first one | 2657 | * didn't select a console we take the first one |
| 2648 | * that registers here. | 2658 | * that registers here. |
| 2649 | */ | 2659 | */ |
| 2650 | if (preferred_console < 0) { | 2660 | if (preferred_console < 0 && !of_specified_console) { |
| 2651 | if (newcon->index < 0) | 2661 | if (newcon->index < 0) |
| 2652 | newcon->index = 0; | 2662 | newcon->index = 0; |
| 2653 | if (newcon->setup == NULL || | 2663 | if (newcon->setup == NULL || |
| @@ -3029,6 +3039,7 @@ void kmsg_dump(enum kmsg_dump_reason reason) | |||
| 3029 | dumper->active = true; | 3039 | dumper->active = true; |
| 3030 | 3040 | ||
| 3031 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 3041 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
| 3042 | cont_flush(); | ||
| 3032 | dumper->cur_seq = clear_seq; | 3043 | dumper->cur_seq = clear_seq; |
| 3033 | dumper->cur_idx = clear_idx; | 3044 | dumper->cur_idx = clear_idx; |
| 3034 | dumper->next_seq = log_next_seq; | 3045 | dumper->next_seq = log_next_seq; |
| @@ -3119,6 +3130,7 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, | |||
| 3119 | bool ret; | 3130 | bool ret; |
| 3120 | 3131 | ||
| 3121 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 3132 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
| 3133 | cont_flush(); | ||
| 3122 | ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); | 3134 | ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); |
| 3123 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 3135 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
| 3124 | 3136 | ||
| @@ -3161,6 +3173,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, | |||
| 3161 | goto out; | 3173 | goto out; |
| 3162 | 3174 | ||
| 3163 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 3175 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
| 3176 | cont_flush(); | ||
| 3164 | if (dumper->cur_seq < log_first_seq) { | 3177 | if (dumper->cur_seq < log_first_seq) { |
| 3165 | /* messages are gone, move to first available one */ | 3178 | /* messages are gone, move to first available one */ |
| 3166 | dumper->cur_seq = log_first_seq; | 3179 | dumper->cur_seq = log_first_seq; |
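From a caller's point of view the printk() API is unchanged; what the rework above changes is when fragments are merged and when pending fragments become visible. KERN_CONT now sets LOG_CONT on the message, log_output() appends same-task continuations to the cont buffer (flushing on a newline), and every reader path - /dev/kmsg, syslog, kmsg_dump - calls cont_flush() under logbuf_lock so a half-written line is not held back indefinitely. A caller-side illustration (the device name and variable are made up):

	/* Fragments from the same task are coalesced into a single record ... */
	pr_info("probing widget at %#x", base);	/* no newline: buffered in cont */
	printk(KERN_CONT " ... ok\n");		/* LOG_CONT: appended, then the
						 * newline triggers cont_flush() */

	/* ... while a message from another task, or one carrying LOG_PREFIX,
	 * forces the buffered fragment out as its own record first. */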
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1d3b7665d0be..e6474f7272ec 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
| @@ -73,6 +73,8 @@ void __ptrace_unlink(struct task_struct *child) | |||
| 73 | { | 73 | { |
| 74 | BUG_ON(!child->ptrace); | 74 | BUG_ON(!child->ptrace); |
| 75 | 75 | ||
| 76 | clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); | ||
| 77 | |||
| 76 | child->parent = child->real_parent; | 78 | child->parent = child->real_parent; |
| 77 | list_del_init(&child->ptrace_entry); | 79 | list_del_init(&child->ptrace_entry); |
| 78 | 80 | ||
| @@ -489,7 +491,6 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) | |||
| 489 | 491 | ||
| 490 | /* Architecture-specific hardware disable .. */ | 492 | /* Architecture-specific hardware disable .. */ |
| 491 | ptrace_disable(child); | 493 | ptrace_disable(child); |
| 492 | clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); | ||
| 493 | 494 | ||
| 494 | write_lock_irq(&tasklist_lock); | 495 | write_lock_irq(&tasklist_lock); |
| 495 | /* | 496 | /* |
| @@ -536,7 +537,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst | |||
| 536 | int this_len, retval; | 537 | int this_len, retval; |
| 537 | 538 | ||
| 538 | this_len = (len > sizeof(buf)) ? sizeof(buf) : len; | 539 | this_len = (len > sizeof(buf)) ? sizeof(buf) : len; |
| 539 | retval = access_process_vm(tsk, src, buf, this_len, 0); | 540 | retval = access_process_vm(tsk, src, buf, this_len, FOLL_FORCE); |
| 540 | if (!retval) { | 541 | if (!retval) { |
| 541 | if (copied) | 542 | if (copied) |
| 542 | break; | 543 | break; |
| @@ -563,7 +564,8 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds | |||
| 563 | this_len = (len > sizeof(buf)) ? sizeof(buf) : len; | 564 | this_len = (len > sizeof(buf)) ? sizeof(buf) : len; |
| 564 | if (copy_from_user(buf, src, this_len)) | 565 | if (copy_from_user(buf, src, this_len)) |
| 565 | return -EFAULT; | 566 | return -EFAULT; |
| 566 | retval = access_process_vm(tsk, dst, buf, this_len, 1); | 567 | retval = access_process_vm(tsk, dst, buf, this_len, |
| 568 | FOLL_FORCE | FOLL_WRITE); | ||
| 567 | if (!retval) { | 569 | if (!retval) { |
| 568 | if (copied) | 570 | if (copied) |
| 569 | break; | 571 | break; |
| @@ -1126,7 +1128,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, | |||
| 1126 | unsigned long tmp; | 1128 | unsigned long tmp; |
| 1127 | int copied; | 1129 | int copied; |
| 1128 | 1130 | ||
| 1129 | copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0); | 1131 | copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE); |
| 1130 | if (copied != sizeof(tmp)) | 1132 | if (copied != sizeof(tmp)) |
| 1131 | return -EIO; | 1133 | return -EIO; |
| 1132 | return put_user(tmp, (unsigned long __user *)data); | 1134 | return put_user(tmp, (unsigned long __user *)data); |
| @@ -1137,7 +1139,8 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, | |||
| 1137 | { | 1139 | { |
| 1138 | int copied; | 1140 | int copied; |
| 1139 | 1141 | ||
| 1140 | copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); | 1142 | copied = access_process_vm(tsk, addr, &data, sizeof(data), |
| 1143 | FOLL_FORCE | FOLL_WRITE); | ||
| 1141 | return (copied == sizeof(data)) ? 0 : -EIO; | 1144 | return (copied == sizeof(data)) ? 0 : -EIO; |
| 1142 | } | 1145 | } |
| 1143 | 1146 | ||
| @@ -1154,7 +1157,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, | |||
| 1154 | switch (request) { | 1157 | switch (request) { |
| 1155 | case PTRACE_PEEKTEXT: | 1158 | case PTRACE_PEEKTEXT: |
| 1156 | case PTRACE_PEEKDATA: | 1159 | case PTRACE_PEEKDATA: |
| 1157 | ret = access_process_vm(child, addr, &word, sizeof(word), 0); | 1160 | ret = access_process_vm(child, addr, &word, sizeof(word), |
| 1161 | FOLL_FORCE); | ||
| 1158 | if (ret != sizeof(word)) | 1162 | if (ret != sizeof(word)) |
| 1159 | ret = -EIO; | 1163 | ret = -EIO; |
| 1160 | else | 1164 | else |
| @@ -1163,7 +1167,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, | |||
| 1163 | 1167 | ||
| 1164 | case PTRACE_POKETEXT: | 1168 | case PTRACE_POKETEXT: |
| 1165 | case PTRACE_POKEDATA: | 1169 | case PTRACE_POKEDATA: |
| 1166 | ret = access_process_vm(child, addr, &data, sizeof(data), 1); | 1170 | ret = access_process_vm(child, addr, &data, sizeof(data), |
| 1171 | FOLL_FORCE | FOLL_WRITE); | ||
| 1167 | ret = (ret != sizeof(data) ? -EIO : 0); | 1172 | ret = (ret != sizeof(data) ? -EIO : 0); |
| 1168 | break; | 1173 | break; |
| 1169 | 1174 | ||
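All the ptrace call sites switch from access_process_vm()'s old 0/1 "write" argument to an explicit gup_flags mask, so the forced access that ptrace has always relied on is now spelled out at each call. The two idioms used above:

	/* Peek: forced read from the tracee's address space. */
	copied = access_process_vm(child, addr, &tmp, sizeof(tmp), FOLL_FORCE);
	if (copied != sizeof(tmp))
		return -EIO;		/* short access means the address was bad */

	/* Poke: forced write, which may break COW on read-only mappings. */
	copied = access_process_vm(child, addr, &data, sizeof(data),
				   FOLL_FORCE | FOLL_WRITE);
	return (copied == sizeof(data)) ? 0 : -EIO;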
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index d38ab08a3fe7..123ccbd22449 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c | |||
| @@ -52,7 +52,7 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); | |||
| 52 | 52 | ||
| 53 | #define PERF_FLAG "-perf:" | 53 | #define PERF_FLAG "-perf:" |
| 54 | #define PERFOUT_STRING(s) \ | 54 | #define PERFOUT_STRING(s) \ |
| 55 | pr_alert("%s" PERF_FLAG s "\n", perf_type) | 55 | pr_alert("%s" PERF_FLAG " %s\n", perf_type, s) |
| 56 | #define VERBOSE_PERFOUT_STRING(s) \ | 56 | #define VERBOSE_PERFOUT_STRING(s) \ |
| 57 | do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0) | 57 | do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0) |
| 58 | #define VERBOSE_PERFOUT_ERRSTRING(s) \ | 58 | #define VERBOSE_PERFOUT_ERRSTRING(s) \ |
| @@ -400,9 +400,8 @@ rcu_perf_writer(void *arg) | |||
| 400 | sp.sched_priority = 0; | 400 | sp.sched_priority = 0; |
| 401 | sched_setscheduler_nocheck(current, | 401 | sched_setscheduler_nocheck(current, |
| 402 | SCHED_NORMAL, &sp); | 402 | SCHED_NORMAL, &sp); |
| 403 | pr_alert("%s" PERF_FLAG | 403 | pr_alert("%s%s rcu_perf_writer %ld has %d measurements\n", |
| 404 | "rcu_perf_writer %ld has %d measurements\n", | 404 | perf_type, PERF_FLAG, me, MIN_MEAS); |
| 405 | perf_type, me, MIN_MEAS); | ||
| 406 | if (atomic_inc_return(&n_rcu_perf_writer_finished) >= | 405 | if (atomic_inc_return(&n_rcu_perf_writer_finished) >= |
| 407 | nrealwriters) { | 406 | nrealwriters) { |
| 408 | schedule_timeout_interruptible(10); | 407 | schedule_timeout_interruptible(10); |
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 971e2b138063..bf08fee53dc7 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c | |||
| @@ -1238,6 +1238,7 @@ rcu_torture_stats_print(void) | |||
| 1238 | long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | 1238 | long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; |
| 1239 | long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | 1239 | long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; |
| 1240 | static unsigned long rtcv_snap = ULONG_MAX; | 1240 | static unsigned long rtcv_snap = ULONG_MAX; |
| 1241 | struct task_struct *wtp; | ||
| 1241 | 1242 | ||
| 1242 | for_each_possible_cpu(cpu) { | 1243 | for_each_possible_cpu(cpu) { |
| 1243 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | 1244 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { |
| @@ -1258,8 +1259,9 @@ rcu_torture_stats_print(void) | |||
| 1258 | atomic_read(&n_rcu_torture_alloc), | 1259 | atomic_read(&n_rcu_torture_alloc), |
| 1259 | atomic_read(&n_rcu_torture_alloc_fail), | 1260 | atomic_read(&n_rcu_torture_alloc_fail), |
| 1260 | atomic_read(&n_rcu_torture_free)); | 1261 | atomic_read(&n_rcu_torture_free)); |
| 1261 | pr_cont("rtmbe: %d rtbke: %ld rtbre: %ld ", | 1262 | pr_cont("rtmbe: %d rtbe: %ld rtbke: %ld rtbre: %ld ", |
| 1262 | atomic_read(&n_rcu_torture_mberror), | 1263 | atomic_read(&n_rcu_torture_mberror), |
| 1264 | n_rcu_torture_barrier_error, | ||
| 1263 | n_rcu_torture_boost_ktrerror, | 1265 | n_rcu_torture_boost_ktrerror, |
| 1264 | n_rcu_torture_boost_rterror); | 1266 | n_rcu_torture_boost_rterror); |
| 1265 | pr_cont("rtbf: %ld rtb: %ld nt: %ld ", | 1267 | pr_cont("rtbf: %ld rtb: %ld nt: %ld ", |
| @@ -1312,10 +1314,12 @@ rcu_torture_stats_print(void) | |||
| 1312 | 1314 | ||
| 1313 | rcutorture_get_gp_data(cur_ops->ttype, | 1315 | rcutorture_get_gp_data(cur_ops->ttype, |
| 1314 | &flags, &gpnum, &completed); | 1316 | &flags, &gpnum, &completed); |
| 1315 | pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x\n", | 1317 | wtp = READ_ONCE(writer_task); |
| 1318 | pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n", | ||
| 1316 | rcu_torture_writer_state_getname(), | 1319 | rcu_torture_writer_state_getname(), |
| 1317 | rcu_torture_writer_state, | 1320 | rcu_torture_writer_state, |
| 1318 | gpnum, completed, flags); | 1321 | gpnum, completed, flags, |
| 1322 | wtp == NULL ? ~0UL : wtp->state); | ||
| 1319 | show_rcu_gp_kthreads(); | 1323 | show_rcu_gp_kthreads(); |
| 1320 | rcu_ftrace_dump(DUMP_ALL); | 1324 | rcu_ftrace_dump(DUMP_ALL); |
| 1321 | } | 1325 | } |
| @@ -1362,12 +1366,12 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) | |||
| 1362 | onoff_interval, onoff_holdoff); | 1366 | onoff_interval, onoff_holdoff); |
| 1363 | } | 1367 | } |
| 1364 | 1368 | ||
| 1365 | static void rcutorture_booster_cleanup(int cpu) | 1369 | static int rcutorture_booster_cleanup(unsigned int cpu) |
| 1366 | { | 1370 | { |
| 1367 | struct task_struct *t; | 1371 | struct task_struct *t; |
| 1368 | 1372 | ||
| 1369 | if (boost_tasks[cpu] == NULL) | 1373 | if (boost_tasks[cpu] == NULL) |
| 1370 | return; | 1374 | return 0; |
| 1371 | mutex_lock(&boost_mutex); | 1375 | mutex_lock(&boost_mutex); |
| 1372 | t = boost_tasks[cpu]; | 1376 | t = boost_tasks[cpu]; |
| 1373 | boost_tasks[cpu] = NULL; | 1377 | boost_tasks[cpu] = NULL; |
| @@ -1375,9 +1379,10 @@ static void rcutorture_booster_cleanup(int cpu) | |||
| 1375 | 1379 | ||
| 1376 | /* This must be outside of the mutex, otherwise deadlock! */ | 1380 | /* This must be outside of the mutex, otherwise deadlock! */ |
| 1377 | torture_stop_kthread(rcu_torture_boost, t); | 1381 | torture_stop_kthread(rcu_torture_boost, t); |
| 1382 | return 0; | ||
| 1378 | } | 1383 | } |
| 1379 | 1384 | ||
| 1380 | static int rcutorture_booster_init(int cpu) | 1385 | static int rcutorture_booster_init(unsigned int cpu) |
| 1381 | { | 1386 | { |
| 1382 | int retval; | 1387 | int retval; |
| 1383 | 1388 | ||
| @@ -1577,28 +1582,7 @@ static void rcu_torture_barrier_cleanup(void) | |||
| 1577 | } | 1582 | } |
| 1578 | } | 1583 | } |
| 1579 | 1584 | ||
| 1580 | static int rcutorture_cpu_notify(struct notifier_block *self, | 1585 | static enum cpuhp_state rcutor_hp; |
| 1581 | unsigned long action, void *hcpu) | ||
| 1582 | { | ||
| 1583 | long cpu = (long)hcpu; | ||
| 1584 | |||
| 1585 | switch (action & ~CPU_TASKS_FROZEN) { | ||
| 1586 | case CPU_ONLINE: | ||
| 1587 | case CPU_DOWN_FAILED: | ||
| 1588 | (void)rcutorture_booster_init(cpu); | ||
| 1589 | break; | ||
| 1590 | case CPU_DOWN_PREPARE: | ||
| 1591 | rcutorture_booster_cleanup(cpu); | ||
| 1592 | break; | ||
| 1593 | default: | ||
| 1594 | break; | ||
| 1595 | } | ||
| 1596 | return NOTIFY_OK; | ||
| 1597 | } | ||
| 1598 | |||
| 1599 | static struct notifier_block rcutorture_cpu_nb = { | ||
| 1600 | .notifier_call = rcutorture_cpu_notify, | ||
| 1601 | }; | ||
| 1602 | 1586 | ||
| 1603 | static void | 1587 | static void |
| 1604 | rcu_torture_cleanup(void) | 1588 | rcu_torture_cleanup(void) |
| @@ -1638,11 +1622,8 @@ rcu_torture_cleanup(void) | |||
| 1638 | for (i = 0; i < ncbflooders; i++) | 1622 | for (i = 0; i < ncbflooders; i++) |
| 1639 | torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]); | 1623 | torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]); |
| 1640 | if ((test_boost == 1 && cur_ops->can_boost) || | 1624 | if ((test_boost == 1 && cur_ops->can_boost) || |
| 1641 | test_boost == 2) { | 1625 | test_boost == 2) |
| 1642 | unregister_cpu_notifier(&rcutorture_cpu_nb); | 1626 | cpuhp_remove_state(rcutor_hp); |
| 1643 | for_each_possible_cpu(i) | ||
| 1644 | rcutorture_booster_cleanup(i); | ||
| 1645 | } | ||
| 1646 | 1627 | ||
| 1647 | /* | 1628 | /* |
| 1648 | * Wait for all RCU callbacks to fire, then do flavor-specific | 1629 | * Wait for all RCU callbacks to fire, then do flavor-specific |
| @@ -1869,14 +1850,13 @@ rcu_torture_init(void) | |||
| 1869 | test_boost == 2) { | 1850 | test_boost == 2) { |
| 1870 | 1851 | ||
| 1871 | boost_starttime = jiffies + test_boost_interval * HZ; | 1852 | boost_starttime = jiffies + test_boost_interval * HZ; |
| 1872 | register_cpu_notifier(&rcutorture_cpu_nb); | 1853 | |
| 1873 | for_each_possible_cpu(i) { | 1854 | firsterr = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "RCU_TORTURE", |
| 1874 | if (cpu_is_offline(i)) | 1855 | rcutorture_booster_init, |
| 1875 | continue; /* Heuristic: CPU can go offline. */ | 1856 | rcutorture_booster_cleanup); |
| 1876 | firsterr = rcutorture_booster_init(i); | 1857 | if (firsterr < 0) |
| 1877 | if (firsterr) | 1858 | goto unwind; |
| 1878 | goto unwind; | 1859 | rcutor_hp = firsterr; |
| 1879 | } | ||
| 1880 | } | 1860 | } |
| 1881 | firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); | 1861 | firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); |
| 1882 | if (firsterr) | 1862 | if (firsterr) |
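The open-coded CPU notifier plus for_each_possible_cpu() loop becomes a single dynamic hotplug state: cpuhp_setup_state() registers the online/offline callbacks and immediately runs the online callback on every CPU that is already up, and cpuhp_remove_state() undoes both at cleanup. Sketch of the registration as used above (callbacks now take an unsigned int cpu and return int):

	static enum cpuhp_state rcutor_hp;

	/* setup: a CPUHP_AP_ONLINE_DYN request returns the allocated state */
	firsterr = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "RCU_TORTURE",
				     rcutorture_booster_init,
				     rcutorture_booster_cleanup);
	if (firsterr < 0)
		goto unwind;
	rcutor_hp = firsterr;

	/* cleanup: runs rcutorture_booster_cleanup() on each online CPU and
	 * then releases the dynamic state. */
	cpuhp_remove_state(rcutor_hp);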
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index be922c9f3d37..50d1861f7759 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c | |||
| @@ -68,6 +68,8 @@ void rcu_sync_lockdep_assert(struct rcu_sync *rsp) | |||
| 68 | RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), | 68 | RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), |
| 69 | "suspicious rcu_sync_is_idle() usage"); | 69 | "suspicious rcu_sync_is_idle() usage"); |
| 70 | } | 70 | } |
| 71 | |||
| 72 | EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert); | ||
| 71 | #endif | 73 | #endif |
| 72 | 74 | ||
| 73 | /** | 75 | /** |
| @@ -83,6 +85,18 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) | |||
| 83 | } | 85 | } |
| 84 | 86 | ||
| 85 | /** | 87 | /** |
| 88 | * Must be called after rcu_sync_init() and before first use. | ||
| 89 | * | ||
| 90 | * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}() | ||
| 91 | * pairs turn into NO-OPs. | ||
| 92 | */ | ||
| 93 | void rcu_sync_enter_start(struct rcu_sync *rsp) | ||
| 94 | { | ||
| 95 | rsp->gp_count++; | ||
| 96 | rsp->gp_state = GP_PASSED; | ||
| 97 | } | ||
| 98 | |||
| 99 | /** | ||
| 86 | * rcu_sync_enter() - Force readers onto slowpath | 100 | * rcu_sync_enter() - Force readers onto slowpath |
| 87 | * @rsp: Pointer to rcu_sync structure to use for synchronization | 101 | * @rsp: Pointer to rcu_sync structure to use for synchronization |
| 88 | * | 102 | * |
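rcu_sync_enter_start() is a lighter-weight cousin of rcu_sync_enter() for callers that can guarantee there are no concurrent readers yet, typically during early boot before the protected object is published: it bumps gp_count and jumps straight to GP_PASSED without waiting for a grace period. A hedged usage sketch; the consumer shown is an assumption, not part of this diff:

	struct rcu_sync my_sync;		/* hypothetical user */

	rcu_sync_init(&my_sync, RCU_SCHED_SYNC);
	rcu_sync_enter_start(&my_sync);		/* readers now see !rcu_sync_is_idle() */
	/* ... publish the structure; subsequent rcu_sync_enter()/exit() pairs
	 * are effectively no-ops until a final rcu_sync_exit() drops the
	 * count taken here ... */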
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 944b1b491ed8..1898559e6b60 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c | |||
| @@ -170,7 +170,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
| 170 | false)); | 170 | false)); |
| 171 | } | 171 | } |
| 172 | 172 | ||
| 173 | static void rcu_process_callbacks(struct softirq_action *unused) | 173 | static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) |
| 174 | { | 174 | { |
| 175 | __rcu_process_callbacks(&rcu_sched_ctrlblk); | 175 | __rcu_process_callbacks(&rcu_sched_ctrlblk); |
| 176 | __rcu_process_callbacks(&rcu_bh_ctrlblk); | 176 | __rcu_process_callbacks(&rcu_bh_ctrlblk); |
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5d80925e7fc8..69a5611a7e7c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
| @@ -41,7 +41,6 @@ | |||
| 41 | #include <linux/export.h> | 41 | #include <linux/export.h> |
| 42 | #include <linux/completion.h> | 42 | #include <linux/completion.h> |
| 43 | #include <linux/moduleparam.h> | 43 | #include <linux/moduleparam.h> |
| 44 | #include <linux/module.h> | ||
| 45 | #include <linux/percpu.h> | 44 | #include <linux/percpu.h> |
| 46 | #include <linux/notifier.h> | 45 | #include <linux/notifier.h> |
| 47 | #include <linux/cpu.h> | 46 | #include <linux/cpu.h> |
| @@ -60,7 +59,6 @@ | |||
| 60 | #include "tree.h" | 59 | #include "tree.h" |
| 61 | #include "rcu.h" | 60 | #include "rcu.h" |
| 62 | 61 | ||
| 63 | MODULE_ALIAS("rcutree"); | ||
| 64 | #ifdef MODULE_PARAM_PREFIX | 62 | #ifdef MODULE_PARAM_PREFIX |
| 65 | #undef MODULE_PARAM_PREFIX | 63 | #undef MODULE_PARAM_PREFIX |
| 66 | #endif | 64 | #endif |
| @@ -1848,6 +1846,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 1848 | struct rcu_data *rdp) | 1846 | struct rcu_data *rdp) |
| 1849 | { | 1847 | { |
| 1850 | bool ret; | 1848 | bool ret; |
| 1849 | bool need_gp; | ||
| 1851 | 1850 | ||
| 1852 | /* Handle the ends of any preceding grace periods first. */ | 1851 | /* Handle the ends of any preceding grace periods first. */ |
| 1853 | if (rdp->completed == rnp->completed && | 1852 | if (rdp->completed == rnp->completed && |
| @@ -1874,9 +1873,10 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 1874 | */ | 1873 | */ |
| 1875 | rdp->gpnum = rnp->gpnum; | 1874 | rdp->gpnum = rnp->gpnum; |
| 1876 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); | 1875 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); |
| 1877 | rdp->cpu_no_qs.b.norm = true; | 1876 | need_gp = !!(rnp->qsmask & rdp->grpmask); |
| 1877 | rdp->cpu_no_qs.b.norm = need_gp; | ||
| 1878 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); | 1878 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); |
| 1879 | rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask); | 1879 | rdp->core_needs_qs = need_gp; |
| 1880 | zero_cpu_stall_ticks(rdp); | 1880 | zero_cpu_stall_ticks(rdp); |
| 1881 | WRITE_ONCE(rdp->gpwrap, false); | 1881 | WRITE_ONCE(rdp->gpwrap, false); |
| 1882 | } | 1882 | } |
| @@ -2344,7 +2344,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | |||
| 2344 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); | 2344 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); |
| 2345 | WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); | 2345 | WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); |
| 2346 | raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); | 2346 | raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); |
| 2347 | swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ | 2347 | rcu_gp_kthread_wake(rsp); |
| 2348 | } | 2348 | } |
| 2349 | 2349 | ||
| 2350 | /* | 2350 | /* |
| @@ -2970,7 +2970,7 @@ static void force_quiescent_state(struct rcu_state *rsp) | |||
| 2970 | } | 2970 | } |
| 2971 | WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); | 2971 | WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); |
| 2972 | raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); | 2972 | raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); |
| 2973 | swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ | 2973 | rcu_gp_kthread_wake(rsp); |
| 2974 | } | 2974 | } |
| 2975 | 2975 | ||
| 2976 | /* | 2976 | /* |
| @@ -3013,7 +3013,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) | |||
| 3013 | /* | 3013 | /* |
| 3014 | * Do RCU core processing for the current CPU. | 3014 | * Do RCU core processing for the current CPU. |
| 3015 | */ | 3015 | */ |
| 3016 | static void rcu_process_callbacks(struct softirq_action *unused) | 3016 | static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) |
| 3017 | { | 3017 | { |
| 3018 | struct rcu_state *rsp; | 3018 | struct rcu_state *rsp; |
| 3019 | 3019 | ||
| @@ -3792,8 +3792,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
| 3792 | rnp = rdp->mynode; | 3792 | rnp = rdp->mynode; |
| 3793 | mask = rdp->grpmask; | 3793 | mask = rdp->grpmask; |
| 3794 | raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ | 3794 | raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ |
| 3795 | rnp->qsmaskinitnext |= mask; | ||
| 3796 | rnp->expmaskinitnext |= mask; | ||
| 3797 | if (!rdp->beenonline) | 3795 | if (!rdp->beenonline) |
| 3798 | WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); | 3796 | WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); |
| 3799 | rdp->beenonline = true; /* We have now been online. */ | 3797 | rdp->beenonline = true; /* We have now been online. */ |
| @@ -3860,6 +3858,32 @@ int rcutree_dead_cpu(unsigned int cpu) | |||
| 3860 | return 0; | 3858 | return 0; |
| 3861 | } | 3859 | } |
| 3862 | 3860 | ||
| 3861 | /* | ||
| 3862 | * Mark the specified CPU as being online so that subsequent grace periods | ||
| 3863 | * (both expedited and normal) will wait on it. Note that this means that | ||
| 3864 | * incoming CPUs are not allowed to use RCU read-side critical sections | ||
| 3865 | * until this function is called. Failing to observe this restriction | ||
| 3866 | * will result in lockdep splats. | ||
| 3867 | */ | ||
| 3868 | void rcu_cpu_starting(unsigned int cpu) | ||
| 3869 | { | ||
| 3870 | unsigned long flags; | ||
| 3871 | unsigned long mask; | ||
| 3872 | struct rcu_data *rdp; | ||
| 3873 | struct rcu_node *rnp; | ||
| 3874 | struct rcu_state *rsp; | ||
| 3875 | |||
| 3876 | for_each_rcu_flavor(rsp) { | ||
| 3877 | rdp = this_cpu_ptr(rsp->rda); | ||
| 3878 | rnp = rdp->mynode; | ||
| 3879 | mask = rdp->grpmask; | ||
| 3880 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
| 3881 | rnp->qsmaskinitnext |= mask; | ||
| 3882 | rnp->expmaskinitnext |= mask; | ||
| 3883 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
| 3884 | } | ||
| 3885 | } | ||
| 3886 | |||
| 3863 | #ifdef CONFIG_HOTPLUG_CPU | 3887 | #ifdef CONFIG_HOTPLUG_CPU |
| 3864 | /* | 3888 | /* |
| 3865 | * The CPU is exiting the idle loop into the arch_cpu_idle_dead() | 3889 | * The CPU is exiting the idle loop into the arch_cpu_idle_dead() |
| @@ -4209,8 +4233,10 @@ void __init rcu_init(void) | |||
| 4209 | * or the scheduler are operational. | 4233 | * or the scheduler are operational. |
| 4210 | */ | 4234 | */ |
| 4211 | pm_notifier(rcu_pm_notify, 0); | 4235 | pm_notifier(rcu_pm_notify, 0); |
| 4212 | for_each_online_cpu(cpu) | 4236 | for_each_online_cpu(cpu) { |
| 4213 | rcutree_prepare_cpu(cpu); | 4237 | rcutree_prepare_cpu(cpu); |
| 4238 | rcu_cpu_starting(cpu); | ||
| 4239 | } | ||
| 4214 | } | 4240 | } |
| 4215 | 4241 | ||
| 4216 | #include "tree_exp.h" | 4242 | #include "tree_exp.h" |
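For context on the rcu_cpu_starting() hunk above: the patch moves the qsmaskinitnext/expmaskinitnext updates out of rcu_init_percpu_data() so that an incoming CPU is marked online for both normal and expedited grace periods before it ever enters an RCU read-side critical section. The following is only a minimal sketch of the intended ordering on the incoming CPU; example_secondary_start() and the header choice are illustrative assumptions, not part of the patch.

#include <linux/rcupdate.h>

void example_secondary_start(unsigned int cpu)
{
        /* Tell RCU this CPU is online; must precede any rcu_read_lock(). */
        rcu_cpu_starting(cpu);

        /* Only from here on are read-side critical sections legal. */
        rcu_read_lock();
        /* ... early bringup work that dereferences RCU-protected data ... */
        rcu_read_unlock();
}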
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f714f873bf9d..e99a5234d9ed 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
| @@ -400,6 +400,7 @@ struct rcu_data { | |||
| 400 | #ifdef CONFIG_RCU_FAST_NO_HZ | 400 | #ifdef CONFIG_RCU_FAST_NO_HZ |
| 401 | struct rcu_head oom_head; | 401 | struct rcu_head oom_head; |
| 402 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | 402 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ |
| 403 | atomic_long_t exp_workdone0; /* # done by workqueue. */ | ||
| 403 | atomic_long_t exp_workdone1; /* # done by others #1. */ | 404 | atomic_long_t exp_workdone1; /* # done by others #1. */ |
| 404 | atomic_long_t exp_workdone2; /* # done by others #2. */ | 405 | atomic_long_t exp_workdone2; /* # done by others #2. */ |
| 405 | atomic_long_t exp_workdone3; /* # done by others #3. */ | 406 | atomic_long_t exp_workdone3; /* # done by others #3. */ |
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6d86ab6ec2c9..24343eb87b58 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h | |||
| @@ -359,7 +359,8 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, | |||
| 359 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | 359 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
| 360 | 360 | ||
| 361 | if (raw_smp_processor_id() == cpu || | 361 | if (raw_smp_processor_id() == cpu || |
| 362 | !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) | 362 | !(atomic_add_return(0, &rdtp->dynticks) & 0x1) || |
| 363 | !(rnp->qsmaskinitnext & rdp->grpmask)) | ||
| 363 | mask_ofl_test |= rdp->grpmask; | 364 | mask_ofl_test |= rdp->grpmask; |
| 364 | } | 365 | } |
| 365 | mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; | 366 | mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; |
| @@ -384,17 +385,16 @@ retry_ipi: | |||
| 384 | mask_ofl_ipi &= ~mask; | 385 | mask_ofl_ipi &= ~mask; |
| 385 | continue; | 386 | continue; |
| 386 | } | 387 | } |
| 387 | /* Failed, raced with offline. */ | 388 | /* Failed, raced with CPU hotplug operation. */ |
| 388 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | 389 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
| 389 | if (cpu_online(cpu) && | 390 | if ((rnp->qsmaskinitnext & mask) && |
| 390 | (rnp->expmask & mask)) { | 391 | (rnp->expmask & mask)) { |
| 392 | /* Online, so delay for a bit and try again. */ | ||
| 391 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 393 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
| 392 | schedule_timeout_uninterruptible(1); | 394 | schedule_timeout_uninterruptible(1); |
| 393 | if (cpu_online(cpu) && | 395 | goto retry_ipi; |
| 394 | (rnp->expmask & mask)) | ||
| 395 | goto retry_ipi; | ||
| 396 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
| 397 | } | 396 | } |
| 397 | /* CPU really is offline, so we can ignore it. */ | ||
| 398 | if (!(rnp->expmask & mask)) | 398 | if (!(rnp->expmask & mask)) |
| 399 | mask_ofl_ipi &= ~mask; | 399 | mask_ofl_ipi &= ~mask; |
| 400 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 400 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
| @@ -427,12 +427,10 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) | |||
| 427 | jiffies_stall); | 427 | jiffies_stall); |
| 428 | if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) | 428 | if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) |
| 429 | return; | 429 | return; |
| 430 | if (ret < 0) { | 430 | WARN_ON(ret < 0); /* workqueues should not be signaled. */ |
| 431 | /* Hit a signal, disable CPU stall warnings. */ | 431 | if (rcu_cpu_stall_suppress) |
| 432 | swait_event(rsp->expedited_wq, | 432 | continue; |
| 433 | sync_rcu_preempt_exp_done(rnp_root)); | 433 | panic_on_rcu_stall(); |
| 434 | return; | ||
| 435 | } | ||
| 436 | pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", | 434 | pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", |
| 437 | rsp->name); | 435 | rsp->name); |
| 438 | ndetected = 0; | 436 | ndetected = 0; |
| @@ -500,7 +498,6 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) | |||
| 500 | * next GP, to proceed. | 498 | * next GP, to proceed. |
| 501 | */ | 499 | */ |
| 502 | mutex_lock(&rsp->exp_wake_mutex); | 500 | mutex_lock(&rsp->exp_wake_mutex); |
| 503 | mutex_unlock(&rsp->exp_mutex); | ||
| 504 | 501 | ||
| 505 | rcu_for_each_node_breadth_first(rsp, rnp) { | 502 | rcu_for_each_node_breadth_first(rsp, rnp) { |
| 506 | if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { | 503 | if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { |
| @@ -516,6 +513,70 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) | |||
| 516 | mutex_unlock(&rsp->exp_wake_mutex); | 513 | mutex_unlock(&rsp->exp_wake_mutex); |
| 517 | } | 514 | } |
| 518 | 515 | ||
| 516 | /* Let the workqueue handler know what it is supposed to do. */ | ||
| 517 | struct rcu_exp_work { | ||
| 518 | smp_call_func_t rew_func; | ||
| 519 | struct rcu_state *rew_rsp; | ||
| 520 | unsigned long rew_s; | ||
| 521 | struct work_struct rew_work; | ||
| 522 | }; | ||
| 523 | |||
| 524 | /* | ||
| 525 | * Work-queue handler to drive an expedited grace period forward. | ||
| 526 | */ | ||
| 527 | static void wait_rcu_exp_gp(struct work_struct *wp) | ||
| 528 | { | ||
| 529 | struct rcu_exp_work *rewp; | ||
| 530 | |||
| 531 | /* Initialize the rcu_node tree in preparation for the wait. */ | ||
| 532 | rewp = container_of(wp, struct rcu_exp_work, rew_work); | ||
| 533 | sync_rcu_exp_select_cpus(rewp->rew_rsp, rewp->rew_func); | ||
| 534 | |||
| 535 | /* Wait and clean up, including waking everyone. */ | ||
| 536 | rcu_exp_wait_wake(rewp->rew_rsp, rewp->rew_s); | ||
| 537 | } | ||
| 538 | |||
| 539 | /* | ||
| 540 | * Given an rcu_state pointer and a smp_call_function() handler, kick | ||
| 541 | * off the specified flavor of expedited grace period. | ||
| 542 | */ | ||
| 543 | static void _synchronize_rcu_expedited(struct rcu_state *rsp, | ||
| 544 | smp_call_func_t func) | ||
| 545 | { | ||
| 546 | struct rcu_data *rdp; | ||
| 547 | struct rcu_exp_work rew; | ||
| 548 | struct rcu_node *rnp; | ||
| 549 | unsigned long s; | ||
| 550 | |||
| 551 | /* If expedited grace periods are prohibited, fall back to normal. */ | ||
| 552 | if (rcu_gp_is_normal()) { | ||
| 553 | wait_rcu_gp(rsp->call); | ||
| 554 | return; | ||
| 555 | } | ||
| 556 | |||
| 557 | /* Take a snapshot of the sequence number. */ | ||
| 558 | s = rcu_exp_gp_seq_snap(rsp); | ||
| 559 | if (exp_funnel_lock(rsp, s)) | ||
| 560 | return; /* Someone else did our work for us. */ | ||
| 561 | |||
| 562 | /* Marshall arguments and schedule the expedited grace period. */ | ||
| 563 | rew.rew_func = func; | ||
| 564 | rew.rew_rsp = rsp; | ||
| 565 | rew.rew_s = s; | ||
| 566 | INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); | ||
| 567 | schedule_work(&rew.rew_work); | ||
| 568 | |||
| 569 | /* Wait for expedited grace period to complete. */ | ||
| 570 | rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); | ||
| 571 | rnp = rcu_get_root(rsp); | ||
| 572 | wait_event(rnp->exp_wq[(s >> 1) & 0x3], | ||
| 573 | sync_exp_work_done(rsp, | ||
| 574 | &rdp->exp_workdone0, s)); | ||
| 575 | |||
| 576 | /* Let the next expedited grace period start. */ | ||
| 577 | mutex_unlock(&rsp->exp_mutex); | ||
| 578 | } | ||
| 579 | |||
| 519 | /** | 580 | /** |
| 520 | * synchronize_sched_expedited - Brute-force RCU-sched grace period | 581 | * synchronize_sched_expedited - Brute-force RCU-sched grace period |
| 521 | * | 582 | * |
| @@ -534,29 +595,13 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) | |||
| 534 | */ | 595 | */ |
| 535 | void synchronize_sched_expedited(void) | 596 | void synchronize_sched_expedited(void) |
| 536 | { | 597 | { |
| 537 | unsigned long s; | ||
| 538 | struct rcu_state *rsp = &rcu_sched_state; | 598 | struct rcu_state *rsp = &rcu_sched_state; |
| 539 | 599 | ||
| 540 | /* If only one CPU, this is automatically a grace period. */ | 600 | /* If only one CPU, this is automatically a grace period. */ |
| 541 | if (rcu_blocking_is_gp()) | 601 | if (rcu_blocking_is_gp()) |
| 542 | return; | 602 | return; |
| 543 | 603 | ||
| 544 | /* If expedited grace periods are prohibited, fall back to normal. */ | 604 | _synchronize_rcu_expedited(rsp, sync_sched_exp_handler); |
| 545 | if (rcu_gp_is_normal()) { | ||
| 546 | wait_rcu_gp(call_rcu_sched); | ||
| 547 | return; | ||
| 548 | } | ||
| 549 | |||
| 550 | /* Take a snapshot of the sequence number. */ | ||
| 551 | s = rcu_exp_gp_seq_snap(rsp); | ||
| 552 | if (exp_funnel_lock(rsp, s)) | ||
| 553 | return; /* Someone else did our work for us. */ | ||
| 554 | |||
| 555 | /* Initialize the rcu_node tree in preparation for the wait. */ | ||
| 556 | sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); | ||
| 557 | |||
| 558 | /* Wait and clean up, including waking everyone. */ | ||
| 559 | rcu_exp_wait_wake(rsp, s); | ||
| 560 | } | 605 | } |
| 561 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | 606 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); |
| 562 | 607 | ||
| @@ -620,23 +665,8 @@ static void sync_rcu_exp_handler(void *info) | |||
| 620 | void synchronize_rcu_expedited(void) | 665 | void synchronize_rcu_expedited(void) |
| 621 | { | 666 | { |
| 622 | struct rcu_state *rsp = rcu_state_p; | 667 | struct rcu_state *rsp = rcu_state_p; |
| 623 | unsigned long s; | ||
| 624 | |||
| 625 | /* If expedited grace periods are prohibited, fall back to normal. */ | ||
| 626 | if (rcu_gp_is_normal()) { | ||
| 627 | wait_rcu_gp(call_rcu); | ||
| 628 | return; | ||
| 629 | } | ||
| 630 | |||
| 631 | s = rcu_exp_gp_seq_snap(rsp); | ||
| 632 | if (exp_funnel_lock(rsp, s)) | ||
| 633 | return; /* Someone else did our work for us. */ | ||
| 634 | |||
| 635 | /* Initialize the rcu_node tree in preparation for the wait. */ | ||
| 636 | sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); | ||
| 637 | 668 | ||
| 638 | /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ | 669 | _synchronize_rcu_expedited(rsp, sync_rcu_exp_handler); |
| 639 | rcu_exp_wait_wake(rsp, s); | ||
| 640 | } | 670 | } |
| 641 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | 671 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); |
| 642 | 672 | ||
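The new _synchronize_rcu_expedited() above drives the expedited grace period from a workqueue: it marshals its arguments into an on-stack work item, schedules it, and then waits for completion. Below is a hedged sketch of that on-stack work-item pattern with hypothetical names (struct my_req, my_handler, kick_and_wait); the patch itself waits with wait_event() on rnp->exp_wq rather than flush_work().

#include <linux/workqueue.h>

struct my_req {
        struct work_struct work;
        int arg;                        /* request payload */
};

static void my_handler(struct work_struct *w)
{
        struct my_req *req = container_of(w, struct my_req, work);

        /* Heavy lifting runs in process context, using req->arg. */
        (void)req;
}

static void kick_and_wait(int arg)
{
        struct my_req req = { .arg = arg };

        INIT_WORK_ONSTACK(&req.work, my_handler);
        schedule_work(&req.work);
        flush_work(&req.work);          /* simplest wait; the patch uses wait_event() */
        destroy_work_on_stack(&req.work);
}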
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0082fce402a0..85c5a883c6e3 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
| @@ -2173,6 +2173,7 @@ static int rcu_nocb_kthread(void *arg) | |||
| 2173 | cl++; | 2173 | cl++; |
| 2174 | c++; | 2174 | c++; |
| 2175 | local_bh_enable(); | 2175 | local_bh_enable(); |
| 2176 | cond_resched_rcu_qs(); | ||
| 2176 | list = next; | 2177 | list = next; |
| 2177 | } | 2178 | } |
| 2178 | trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); | 2179 | trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); |
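The one-line tree_plugin.h change has the no-CBs kthread call cond_resched_rcu_qs() inside its callback-invocation loop, so a long-running pass both yields the CPU when needed and reports an RCU quiescent state. A hedged sketch of the idiom in a generic kthread loop follows; have_work() and process_one_item() are hypothetical helpers.

#include <linux/kthread.h>
#include <linux/rcupdate.h>

extern bool have_work(void);
extern void process_one_item(void);

static int example_kthread(void *arg)
{
        while (!kthread_should_stop()) {
                if (have_work())
                        process_one_item();
                /* Yield if needed and report a quiescent state to RCU. */
                cond_resched_rcu_qs();
        }
        return 0;
}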
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 86782f9a4604..b1f28972872c 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c | |||
| @@ -185,16 +185,17 @@ static int show_rcuexp(struct seq_file *m, void *v) | |||
| 185 | int cpu; | 185 | int cpu; |
| 186 | struct rcu_state *rsp = (struct rcu_state *)m->private; | 186 | struct rcu_state *rsp = (struct rcu_state *)m->private; |
| 187 | struct rcu_data *rdp; | 187 | struct rcu_data *rdp; |
| 188 | unsigned long s1 = 0, s2 = 0, s3 = 0; | 188 | unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; |
| 189 | 189 | ||
| 190 | for_each_possible_cpu(cpu) { | 190 | for_each_possible_cpu(cpu) { |
| 191 | rdp = per_cpu_ptr(rsp->rda, cpu); | 191 | rdp = per_cpu_ptr(rsp->rda, cpu); |
| 192 | s0 += atomic_long_read(&rdp->exp_workdone0); | ||
| 192 | s1 += atomic_long_read(&rdp->exp_workdone1); | 193 | s1 += atomic_long_read(&rdp->exp_workdone1); |
| 193 | s2 += atomic_long_read(&rdp->exp_workdone2); | 194 | s2 += atomic_long_read(&rdp->exp_workdone2); |
| 194 | s3 += atomic_long_read(&rdp->exp_workdone3); | 195 | s3 += atomic_long_read(&rdp->exp_workdone3); |
| 195 | } | 196 | } |
| 196 | seq_printf(m, "s=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", | 197 | seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", |
| 197 | rsp->expedited_sequence, s1, s2, s3, | 198 | rsp->expedited_sequence, s0, s1, s2, s3, |
| 198 | atomic_long_read(&rsp->expedited_normal), | 199 | atomic_long_read(&rsp->expedited_normal), |
| 199 | atomic_read(&rsp->expedited_need_qs), | 200 | atomic_read(&rsp->expedited_need_qs), |
| 200 | rsp->expedited_sequence / 2); | 201 | rsp->expedited_sequence / 2); |
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index f0d8322bc3ec..f19271dce0a9 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c | |||
| @@ -46,7 +46,7 @@ | |||
| 46 | #include <linux/export.h> | 46 | #include <linux/export.h> |
| 47 | #include <linux/hardirq.h> | 47 | #include <linux/hardirq.h> |
| 48 | #include <linux/delay.h> | 48 | #include <linux/delay.h> |
| 49 | #include <linux/module.h> | 49 | #include <linux/moduleparam.h> |
| 50 | #include <linux/kthread.h> | 50 | #include <linux/kthread.h> |
| 51 | #include <linux/tick.h> | 51 | #include <linux/tick.h> |
| 52 | 52 | ||
| @@ -54,7 +54,6 @@ | |||
| 54 | 54 | ||
| 55 | #include "rcu.h" | 55 | #include "rcu.h" |
| 56 | 56 | ||
| 57 | MODULE_ALIAS("rcupdate"); | ||
| 58 | #ifdef MODULE_PARAM_PREFIX | 57 | #ifdef MODULE_PARAM_PREFIX |
| 59 | #undef MODULE_PARAM_PREFIX | 58 | #undef MODULE_PARAM_PREFIX |
| 60 | #endif | 59 | #endif |
diff --git a/kernel/relay.c b/kernel/relay.c index d797502140b9..da79a109dbeb 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
| @@ -214,7 +214,7 @@ static void relay_destroy_buf(struct rchan_buf *buf) | |||
| 214 | __free_page(buf->page_array[i]); | 214 | __free_page(buf->page_array[i]); |
| 215 | relay_free_page_array(buf->page_array); | 215 | relay_free_page_array(buf->page_array); |
| 216 | } | 216 | } |
| 217 | chan->buf[buf->cpu] = NULL; | 217 | *per_cpu_ptr(chan->buf, buf->cpu) = NULL; |
| 218 | kfree(buf->padding); | 218 | kfree(buf->padding); |
| 219 | kfree(buf); | 219 | kfree(buf); |
| 220 | kref_put(&chan->kref, relay_destroy_channel); | 220 | kref_put(&chan->kref, relay_destroy_channel); |
| @@ -328,13 +328,15 @@ static struct rchan_callbacks default_channel_callbacks = { | |||
| 328 | 328 | ||
| 329 | /** | 329 | /** |
| 330 | * wakeup_readers - wake up readers waiting on a channel | 330 | * wakeup_readers - wake up readers waiting on a channel |
| 331 | * @data: contains the channel buffer | 331 | * @work: contains the channel buffer |
| 332 | * | 332 | * |
| 333 | * This is the timer function used to defer reader waking. | 333 | * This is the function used to defer reader waking |
| 334 | */ | 334 | */ |
| 335 | static void wakeup_readers(unsigned long data) | 335 | static void wakeup_readers(struct irq_work *work) |
| 336 | { | 336 | { |
| 337 | struct rchan_buf *buf = (struct rchan_buf *)data; | 337 | struct rchan_buf *buf; |
| 338 | |||
| 339 | buf = container_of(work, struct rchan_buf, wakeup_work); | ||
| 338 | wake_up_interruptible(&buf->read_wait); | 340 | wake_up_interruptible(&buf->read_wait); |
| 339 | } | 341 | } |
| 340 | 342 | ||
| @@ -352,9 +354,10 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) | |||
| 352 | if (init) { | 354 | if (init) { |
| 353 | init_waitqueue_head(&buf->read_wait); | 355 | init_waitqueue_head(&buf->read_wait); |
| 354 | kref_init(&buf->kref); | 356 | kref_init(&buf->kref); |
| 355 | setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf); | 357 | init_irq_work(&buf->wakeup_work, wakeup_readers); |
| 356 | } else | 358 | } else { |
| 357 | del_timer_sync(&buf->timer); | 359 | irq_work_sync(&buf->wakeup_work); |
| 360 | } | ||
| 358 | 361 | ||
| 359 | buf->subbufs_produced = 0; | 362 | buf->subbufs_produced = 0; |
| 360 | buf->subbufs_consumed = 0; | 363 | buf->subbufs_consumed = 0; |
| @@ -382,20 +385,21 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) | |||
| 382 | */ | 385 | */ |
| 383 | void relay_reset(struct rchan *chan) | 386 | void relay_reset(struct rchan *chan) |
| 384 | { | 387 | { |
| 388 | struct rchan_buf *buf; | ||
| 385 | unsigned int i; | 389 | unsigned int i; |
| 386 | 390 | ||
| 387 | if (!chan) | 391 | if (!chan) |
| 388 | return; | 392 | return; |
| 389 | 393 | ||
| 390 | if (chan->is_global && chan->buf[0]) { | 394 | if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) { |
| 391 | __relay_reset(chan->buf[0], 0); | 395 | __relay_reset(buf, 0); |
| 392 | return; | 396 | return; |
| 393 | } | 397 | } |
| 394 | 398 | ||
| 395 | mutex_lock(&relay_channels_mutex); | 399 | mutex_lock(&relay_channels_mutex); |
| 396 | for_each_possible_cpu(i) | 400 | for_each_possible_cpu(i) |
| 397 | if (chan->buf[i]) | 401 | if ((buf = *per_cpu_ptr(chan->buf, i))) |
| 398 | __relay_reset(chan->buf[i], 0); | 402 | __relay_reset(buf, 0); |
| 399 | mutex_unlock(&relay_channels_mutex); | 403 | mutex_unlock(&relay_channels_mutex); |
| 400 | } | 404 | } |
| 401 | EXPORT_SYMBOL_GPL(relay_reset); | 405 | EXPORT_SYMBOL_GPL(relay_reset); |
| @@ -440,7 +444,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) | |||
| 440 | struct dentry *dentry; | 444 | struct dentry *dentry; |
| 441 | 445 | ||
| 442 | if (chan->is_global) | 446 | if (chan->is_global) |
| 443 | return chan->buf[0]; | 447 | return *per_cpu_ptr(chan->buf, 0); |
| 444 | 448 | ||
| 445 | buf = relay_create_buf(chan); | 449 | buf = relay_create_buf(chan); |
| 446 | if (!buf) | 450 | if (!buf) |
| @@ -464,7 +468,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) | |||
| 464 | __relay_reset(buf, 1); | 468 | __relay_reset(buf, 1); |
| 465 | 469 | ||
| 466 | if(chan->is_global) { | 470 | if(chan->is_global) { |
| 467 | chan->buf[0] = buf; | 471 | *per_cpu_ptr(chan->buf, 0) = buf; |
| 468 | buf->cpu = 0; | 472 | buf->cpu = 0; |
| 469 | } | 473 | } |
| 470 | 474 | ||
| @@ -486,7 +490,7 @@ free_buf: | |||
| 486 | static void relay_close_buf(struct rchan_buf *buf) | 490 | static void relay_close_buf(struct rchan_buf *buf) |
| 487 | { | 491 | { |
| 488 | buf->finalized = 1; | 492 | buf->finalized = 1; |
| 489 | del_timer_sync(&buf->timer); | 493 | irq_work_sync(&buf->wakeup_work); |
| 490 | buf->chan->cb->remove_buf_file(buf->dentry); | 494 | buf->chan->cb->remove_buf_file(buf->dentry); |
| 491 | kref_put(&buf->kref, relay_remove_buf); | 495 | kref_put(&buf->kref, relay_remove_buf); |
| 492 | } | 496 | } |
| @@ -512,46 +516,25 @@ static void setup_callbacks(struct rchan *chan, | |||
| 512 | chan->cb = cb; | 516 | chan->cb = cb; |
| 513 | } | 517 | } |
| 514 | 518 | ||
| 515 | /** | 519 | int relay_prepare_cpu(unsigned int cpu) |
| 516 | * relay_hotcpu_callback - CPU hotplug callback | ||
| 517 | * @nb: notifier block | ||
| 518 | * @action: hotplug action to take | ||
| 519 | * @hcpu: CPU number | ||
| 520 | * | ||
| 521 | * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD) | ||
| 522 | */ | ||
| 523 | static int relay_hotcpu_callback(struct notifier_block *nb, | ||
| 524 | unsigned long action, | ||
| 525 | void *hcpu) | ||
| 526 | { | 520 | { |
| 527 | unsigned int hotcpu = (unsigned long)hcpu; | ||
| 528 | struct rchan *chan; | 521 | struct rchan *chan; |
| 522 | struct rchan_buf *buf; | ||
| 529 | 523 | ||
| 530 | switch(action) { | 524 | mutex_lock(&relay_channels_mutex); |
| 531 | case CPU_UP_PREPARE: | 525 | list_for_each_entry(chan, &relay_channels, list) { |
| 532 | case CPU_UP_PREPARE_FROZEN: | 526 | if ((buf = *per_cpu_ptr(chan->buf, cpu))) |
| 533 | mutex_lock(&relay_channels_mutex); | 527 | continue; |
| 534 | list_for_each_entry(chan, &relay_channels, list) { | 528 | buf = relay_open_buf(chan, cpu); |
| 535 | if (chan->buf[hotcpu]) | 529 | if (!buf) { |
| 536 | continue; | 530 | pr_err("relay: cpu %d buffer creation failed\n", cpu); |
| 537 | chan->buf[hotcpu] = relay_open_buf(chan, hotcpu); | 531 | mutex_unlock(&relay_channels_mutex); |
| 538 | if(!chan->buf[hotcpu]) { | 532 | return -ENOMEM; |
| 539 | printk(KERN_ERR | ||
| 540 | "relay_hotcpu_callback: cpu %d buffer " | ||
| 541 | "creation failed\n", hotcpu); | ||
| 542 | mutex_unlock(&relay_channels_mutex); | ||
| 543 | return notifier_from_errno(-ENOMEM); | ||
| 544 | } | ||
| 545 | } | 533 | } |
| 546 | mutex_unlock(&relay_channels_mutex); | 534 | *per_cpu_ptr(chan->buf, cpu) = buf; |
| 547 | break; | ||
| 548 | case CPU_DEAD: | ||
| 549 | case CPU_DEAD_FROZEN: | ||
| 550 | /* No need to flush the cpu : will be flushed upon | ||
| 551 | * final relay_flush() call. */ | ||
| 552 | break; | ||
| 553 | } | 535 | } |
| 554 | return NOTIFY_OK; | 536 | mutex_unlock(&relay_channels_mutex); |
| 537 | return 0; | ||
| 555 | } | 538 | } |
| 556 | 539 | ||
| 557 | /** | 540 | /** |
| @@ -583,6 +566,7 @@ struct rchan *relay_open(const char *base_filename, | |||
| 583 | { | 566 | { |
| 584 | unsigned int i; | 567 | unsigned int i; |
| 585 | struct rchan *chan; | 568 | struct rchan *chan; |
| 569 | struct rchan_buf *buf; | ||
| 586 | 570 | ||
| 587 | if (!(subbuf_size && n_subbufs)) | 571 | if (!(subbuf_size && n_subbufs)) |
| 588 | return NULL; | 572 | return NULL; |
| @@ -593,6 +577,7 @@ struct rchan *relay_open(const char *base_filename, | |||
| 593 | if (!chan) | 577 | if (!chan) |
| 594 | return NULL; | 578 | return NULL; |
| 595 | 579 | ||
| 580 | chan->buf = alloc_percpu(struct rchan_buf *); | ||
| 596 | chan->version = RELAYFS_CHANNEL_VERSION; | 581 | chan->version = RELAYFS_CHANNEL_VERSION; |
| 597 | chan->n_subbufs = n_subbufs; | 582 | chan->n_subbufs = n_subbufs; |
| 598 | chan->subbuf_size = subbuf_size; | 583 | chan->subbuf_size = subbuf_size; |
| @@ -608,9 +593,10 @@ struct rchan *relay_open(const char *base_filename, | |||
| 608 | 593 | ||
| 609 | mutex_lock(&relay_channels_mutex); | 594 | mutex_lock(&relay_channels_mutex); |
| 610 | for_each_online_cpu(i) { | 595 | for_each_online_cpu(i) { |
| 611 | chan->buf[i] = relay_open_buf(chan, i); | 596 | buf = relay_open_buf(chan, i); |
| 612 | if (!chan->buf[i]) | 597 | if (!buf) |
| 613 | goto free_bufs; | 598 | goto free_bufs; |
| 599 | *per_cpu_ptr(chan->buf, i) = buf; | ||
| 614 | } | 600 | } |
| 615 | list_add(&chan->list, &relay_channels); | 601 | list_add(&chan->list, &relay_channels); |
| 616 | mutex_unlock(&relay_channels_mutex); | 602 | mutex_unlock(&relay_channels_mutex); |
| @@ -619,8 +605,8 @@ struct rchan *relay_open(const char *base_filename, | |||
| 619 | 605 | ||
| 620 | free_bufs: | 606 | free_bufs: |
| 621 | for_each_possible_cpu(i) { | 607 | for_each_possible_cpu(i) { |
| 622 | if (chan->buf[i]) | 608 | if ((buf = *per_cpu_ptr(chan->buf, i))) |
| 623 | relay_close_buf(chan->buf[i]); | 609 | relay_close_buf(buf); |
| 624 | } | 610 | } |
| 625 | 611 | ||
| 626 | kref_put(&chan->kref, relay_destroy_channel); | 612 | kref_put(&chan->kref, relay_destroy_channel); |
| @@ -666,6 +652,7 @@ int relay_late_setup_files(struct rchan *chan, | |||
| 666 | unsigned int i, curr_cpu; | 652 | unsigned int i, curr_cpu; |
| 667 | unsigned long flags; | 653 | unsigned long flags; |
| 668 | struct dentry *dentry; | 654 | struct dentry *dentry; |
| 655 | struct rchan_buf *buf; | ||
| 669 | struct rchan_percpu_buf_dispatcher disp; | 656 | struct rchan_percpu_buf_dispatcher disp; |
| 670 | 657 | ||
| 671 | if (!chan || !base_filename) | 658 | if (!chan || !base_filename) |
| @@ -684,10 +671,11 @@ int relay_late_setup_files(struct rchan *chan, | |||
| 684 | 671 | ||
| 685 | if (chan->is_global) { | 672 | if (chan->is_global) { |
| 686 | err = -EINVAL; | 673 | err = -EINVAL; |
| 687 | if (!WARN_ON_ONCE(!chan->buf[0])) { | 674 | buf = *per_cpu_ptr(chan->buf, 0); |
| 688 | dentry = relay_create_buf_file(chan, chan->buf[0], 0); | 675 | if (!WARN_ON_ONCE(!buf)) { |
| 676 | dentry = relay_create_buf_file(chan, buf, 0); | ||
| 689 | if (dentry && !WARN_ON_ONCE(!chan->is_global)) { | 677 | if (dentry && !WARN_ON_ONCE(!chan->is_global)) { |
| 690 | relay_set_buf_dentry(chan->buf[0], dentry); | 678 | relay_set_buf_dentry(buf, dentry); |
| 691 | err = 0; | 679 | err = 0; |
| 692 | } | 680 | } |
| 693 | } | 681 | } |
| @@ -702,13 +690,14 @@ int relay_late_setup_files(struct rchan *chan, | |||
| 702 | * on all currently online CPUs. | 690 | * on all currently online CPUs. |
| 703 | */ | 691 | */ |
| 704 | for_each_online_cpu(i) { | 692 | for_each_online_cpu(i) { |
| 705 | if (unlikely(!chan->buf[i])) { | 693 | buf = *per_cpu_ptr(chan->buf, i); |
| 694 | if (unlikely(!buf)) { | ||
| 706 | WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n"); | 695 | WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n"); |
| 707 | err = -EINVAL; | 696 | err = -EINVAL; |
| 708 | break; | 697 | break; |
| 709 | } | 698 | } |
| 710 | 699 | ||
| 711 | dentry = relay_create_buf_file(chan, chan->buf[i], i); | 700 | dentry = relay_create_buf_file(chan, buf, i); |
| 712 | if (unlikely(!dentry)) { | 701 | if (unlikely(!dentry)) { |
| 713 | err = -EINVAL; | 702 | err = -EINVAL; |
| 714 | break; | 703 | break; |
| @@ -716,10 +705,10 @@ int relay_late_setup_files(struct rchan *chan, | |||
| 716 | 705 | ||
| 717 | if (curr_cpu == i) { | 706 | if (curr_cpu == i) { |
| 718 | local_irq_save(flags); | 707 | local_irq_save(flags); |
| 719 | relay_set_buf_dentry(chan->buf[i], dentry); | 708 | relay_set_buf_dentry(buf, dentry); |
| 720 | local_irq_restore(flags); | 709 | local_irq_restore(flags); |
| 721 | } else { | 710 | } else { |
| 722 | disp.buf = chan->buf[i]; | 711 | disp.buf = buf; |
| 723 | disp.dentry = dentry; | 712 | disp.dentry = dentry; |
| 724 | smp_mb(); | 713 | smp_mb(); |
| 725 | /* relay_channels_mutex must be held, so wait. */ | 714 | /* relay_channels_mutex must be held, so wait. */ |
| @@ -768,14 +757,15 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) | |||
| 768 | buf->early_bytes += buf->chan->subbuf_size - | 757 | buf->early_bytes += buf->chan->subbuf_size - |
| 769 | buf->padding[old_subbuf]; | 758 | buf->padding[old_subbuf]; |
| 770 | smp_mb(); | 759 | smp_mb(); |
| 771 | if (waitqueue_active(&buf->read_wait)) | 760 | if (waitqueue_active(&buf->read_wait)) { |
| 772 | /* | 761 | /* |
| 773 | * Calling wake_up_interruptible() from here | 762 | * Calling wake_up_interruptible() from here |
| 774 | * will deadlock if we happen to be logging | 763 | * will deadlock if we happen to be logging |
| 775 | * from the scheduler (trying to re-grab | 764 | * from the scheduler (trying to re-grab |
| 776 | * rq->lock), so defer it. | 765 | * rq->lock), so defer it. |
| 777 | */ | 766 | */ |
| 778 | mod_timer(&buf->timer, jiffies + 1); | 767 | irq_work_queue(&buf->wakeup_work); |
| 768 | } | ||
| 779 | } | 769 | } |
| 780 | 770 | ||
| 781 | old = buf->data; | 771 | old = buf->data; |
| @@ -822,11 +812,10 @@ void relay_subbufs_consumed(struct rchan *chan, | |||
| 822 | if (!chan) | 812 | if (!chan) |
| 823 | return; | 813 | return; |
| 824 | 814 | ||
| 825 | if (cpu >= NR_CPUS || !chan->buf[cpu] || | 815 | buf = *per_cpu_ptr(chan->buf, cpu); |
| 826 | subbufs_consumed > chan->n_subbufs) | 816 | if (cpu >= NR_CPUS || !buf || subbufs_consumed > chan->n_subbufs) |
| 827 | return; | 817 | return; |
| 828 | 818 | ||
| 829 | buf = chan->buf[cpu]; | ||
| 830 | if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed) | 819 | if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed) |
| 831 | buf->subbufs_consumed = buf->subbufs_produced; | 820 | buf->subbufs_consumed = buf->subbufs_produced; |
| 832 | else | 821 | else |
| @@ -842,18 +831,19 @@ EXPORT_SYMBOL_GPL(relay_subbufs_consumed); | |||
| 842 | */ | 831 | */ |
| 843 | void relay_close(struct rchan *chan) | 832 | void relay_close(struct rchan *chan) |
| 844 | { | 833 | { |
| 834 | struct rchan_buf *buf; | ||
| 845 | unsigned int i; | 835 | unsigned int i; |
| 846 | 836 | ||
| 847 | if (!chan) | 837 | if (!chan) |
| 848 | return; | 838 | return; |
| 849 | 839 | ||
| 850 | mutex_lock(&relay_channels_mutex); | 840 | mutex_lock(&relay_channels_mutex); |
| 851 | if (chan->is_global && chan->buf[0]) | 841 | if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) |
| 852 | relay_close_buf(chan->buf[0]); | 842 | relay_close_buf(buf); |
| 853 | else | 843 | else |
| 854 | for_each_possible_cpu(i) | 844 | for_each_possible_cpu(i) |
| 855 | if (chan->buf[i]) | 845 | if ((buf = *per_cpu_ptr(chan->buf, i))) |
| 856 | relay_close_buf(chan->buf[i]); | 846 | relay_close_buf(buf); |
| 857 | 847 | ||
| 858 | if (chan->last_toobig) | 848 | if (chan->last_toobig) |
| 859 | printk(KERN_WARNING "relay: one or more items not logged " | 849 | printk(KERN_WARNING "relay: one or more items not logged " |
| @@ -874,20 +864,21 @@ EXPORT_SYMBOL_GPL(relay_close); | |||
| 874 | */ | 864 | */ |
| 875 | void relay_flush(struct rchan *chan) | 865 | void relay_flush(struct rchan *chan) |
| 876 | { | 866 | { |
| 867 | struct rchan_buf *buf; | ||
| 877 | unsigned int i; | 868 | unsigned int i; |
| 878 | 869 | ||
| 879 | if (!chan) | 870 | if (!chan) |
| 880 | return; | 871 | return; |
| 881 | 872 | ||
| 882 | if (chan->is_global && chan->buf[0]) { | 873 | if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) { |
| 883 | relay_switch_subbuf(chan->buf[0], 0); | 874 | relay_switch_subbuf(buf, 0); |
| 884 | return; | 875 | return; |
| 885 | } | 876 | } |
| 886 | 877 | ||
| 887 | mutex_lock(&relay_channels_mutex); | 878 | mutex_lock(&relay_channels_mutex); |
| 888 | for_each_possible_cpu(i) | 879 | for_each_possible_cpu(i) |
| 889 | if (chan->buf[i]) | 880 | if ((buf = *per_cpu_ptr(chan->buf, i))) |
| 890 | relay_switch_subbuf(chan->buf[i], 0); | 881 | relay_switch_subbuf(buf, 0); |
| 891 | mutex_unlock(&relay_channels_mutex); | 882 | mutex_unlock(&relay_channels_mutex); |
| 892 | } | 883 | } |
| 893 | EXPORT_SYMBOL_GPL(relay_flush); | 884 | EXPORT_SYMBOL_GPL(relay_flush); |
| @@ -1121,51 +1112,23 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf, | |||
| 1121 | return end_pos; | 1112 | return end_pos; |
| 1122 | } | 1113 | } |
| 1123 | 1114 | ||
| 1124 | /* | 1115 | static ssize_t relay_file_read(struct file *filp, |
| 1125 | * subbuf_read_actor - read up to one subbuf's worth of data | 1116 | char __user *buffer, |
| 1126 | */ | 1117 | size_t count, |
| 1127 | static int subbuf_read_actor(size_t read_start, | 1118 | loff_t *ppos) |
| 1128 | struct rchan_buf *buf, | ||
| 1129 | size_t avail, | ||
| 1130 | read_descriptor_t *desc) | ||
| 1131 | { | ||
| 1132 | void *from; | ||
| 1133 | int ret = 0; | ||
| 1134 | |||
| 1135 | from = buf->start + read_start; | ||
| 1136 | ret = avail; | ||
| 1137 | if (copy_to_user(desc->arg.buf, from, avail)) { | ||
| 1138 | desc->error = -EFAULT; | ||
| 1139 | ret = 0; | ||
| 1140 | } | ||
| 1141 | desc->arg.data += ret; | ||
| 1142 | desc->written += ret; | ||
| 1143 | desc->count -= ret; | ||
| 1144 | |||
| 1145 | return ret; | ||
| 1146 | } | ||
| 1147 | |||
| 1148 | typedef int (*subbuf_actor_t) (size_t read_start, | ||
| 1149 | struct rchan_buf *buf, | ||
| 1150 | size_t avail, | ||
| 1151 | read_descriptor_t *desc); | ||
| 1152 | |||
| 1153 | /* | ||
| 1154 | * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries | ||
| 1155 | */ | ||
| 1156 | static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, | ||
| 1157 | subbuf_actor_t subbuf_actor, | ||
| 1158 | read_descriptor_t *desc) | ||
| 1159 | { | 1119 | { |
| 1160 | struct rchan_buf *buf = filp->private_data; | 1120 | struct rchan_buf *buf = filp->private_data; |
| 1161 | size_t read_start, avail; | 1121 | size_t read_start, avail; |
| 1122 | size_t written = 0; | ||
| 1162 | int ret; | 1123 | int ret; |
| 1163 | 1124 | ||
| 1164 | if (!desc->count) | 1125 | if (!count) |
| 1165 | return 0; | 1126 | return 0; |
| 1166 | 1127 | ||
| 1167 | inode_lock(file_inode(filp)); | 1128 | inode_lock(file_inode(filp)); |
| 1168 | do { | 1129 | do { |
| 1130 | void *from; | ||
| 1131 | |||
| 1169 | if (!relay_file_read_avail(buf, *ppos)) | 1132 | if (!relay_file_read_avail(buf, *ppos)) |
| 1170 | break; | 1133 | break; |
| 1171 | 1134 | ||
| @@ -1174,32 +1137,22 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, | |||
| 1174 | if (!avail) | 1137 | if (!avail) |
| 1175 | break; | 1138 | break; |
| 1176 | 1139 | ||
| 1177 | avail = min(desc->count, avail); | 1140 | avail = min(count, avail); |
| 1178 | ret = subbuf_actor(read_start, buf, avail, desc); | 1141 | from = buf->start + read_start; |
| 1179 | if (desc->error < 0) | 1142 | ret = avail; |
| 1143 | if (copy_to_user(buffer, from, avail)) | ||
| 1180 | break; | 1144 | break; |
| 1181 | 1145 | ||
| 1182 | if (ret) { | 1146 | buffer += ret; |
| 1183 | relay_file_read_consume(buf, read_start, ret); | 1147 | written += ret; |
| 1184 | *ppos = relay_file_read_end_pos(buf, read_start, ret); | 1148 | count -= ret; |
| 1185 | } | ||
| 1186 | } while (desc->count && ret); | ||
| 1187 | inode_unlock(file_inode(filp)); | ||
| 1188 | 1149 | ||
| 1189 | return desc->written; | 1150 | relay_file_read_consume(buf, read_start, ret); |
| 1190 | } | 1151 | *ppos = relay_file_read_end_pos(buf, read_start, ret); |
| 1152 | } while (count); | ||
| 1153 | inode_unlock(file_inode(filp)); | ||
| 1191 | 1154 | ||
| 1192 | static ssize_t relay_file_read(struct file *filp, | 1155 | return written; |
| 1193 | char __user *buffer, | ||
| 1194 | size_t count, | ||
| 1195 | loff_t *ppos) | ||
| 1196 | { | ||
| 1197 | read_descriptor_t desc; | ||
| 1198 | desc.written = 0; | ||
| 1199 | desc.count = count; | ||
| 1200 | desc.arg.buf = buffer; | ||
| 1201 | desc.error = 0; | ||
| 1202 | return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, &desc); | ||
| 1203 | } | 1156 | } |
| 1204 | 1157 | ||
| 1205 | static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) | 1158 | static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) |
| @@ -1377,12 +1330,3 @@ const struct file_operations relay_file_operations = { | |||
| 1377 | .splice_read = relay_file_splice_read, | 1330 | .splice_read = relay_file_splice_read, |
| 1378 | }; | 1331 | }; |
| 1379 | EXPORT_SYMBOL_GPL(relay_file_operations); | 1332 | EXPORT_SYMBOL_GPL(relay_file_operations); |
| 1380 | |||
| 1381 | static __init int relay_init(void) | ||
| 1382 | { | ||
| 1383 | |||
| 1384 | hotcpu_notifier(relay_hotcpu_callback, 0); | ||
| 1385 | return 0; | ||
| 1386 | } | ||
| 1387 | |||
| 1388 | early_initcall(relay_init); | ||
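The relay.c conversion above replaces the NR_CPUS-sized chan->buf[] array with a pointer allocated by alloc_percpu() and accessed through per_cpu_ptr(), and swaps the deferred-wakeup timer for irq_work. A hedged sketch of the per-CPU pointer-slot idiom follows, with my_chan/my_buf as hypothetical stand-ins (the real code keeps this inside struct rchan):

#include <linux/percpu.h>
#include <linux/errno.h>

struct my_buf {
        unsigned int cpu;
};

struct my_chan {
        struct my_buf * __percpu *buf;  /* one pointer slot per CPU */
};

static int my_chan_init(struct my_chan *chan)
{
        chan->buf = alloc_percpu(struct my_buf *);
        return chan->buf ? 0 : -ENOMEM;
}

static void my_chan_set(struct my_chan *chan, unsigned int cpu, struct my_buf *buf)
{
        *per_cpu_ptr(chan->buf, cpu) = buf;     /* install that CPU's buffer */
}

static struct my_buf *my_chan_get(struct my_chan *chan, unsigned int cpu)
{
        return *per_cpu_ptr(chan->buf, cpu);    /* NULL until installed */
}

Compared with a fixed array indexed up to NR_CPUS, the per-CPU allocation sizes itself to the CPUs actually present, which appears to be the point of the conversion.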
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2a906f20fba7..94732d1ab00a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
| @@ -581,6 +581,8 @@ static bool wake_up_full_nohz_cpu(int cpu) | |||
| 581 | * If needed we can still optimize that later with an | 581 | * If needed we can still optimize that later with an |
| 582 | * empty IRQ. | 582 | * empty IRQ. |
| 583 | */ | 583 | */ |
| 584 | if (cpu_is_offline(cpu)) | ||
| 585 | return true; /* Don't try to wake offline CPUs. */ | ||
| 584 | if (tick_nohz_full_cpu(cpu)) { | 586 | if (tick_nohz_full_cpu(cpu)) { |
| 585 | if (cpu != smp_processor_id() || | 587 | if (cpu != smp_processor_id() || |
| 586 | tick_nohz_tick_stopped()) | 588 | tick_nohz_tick_stopped()) |
| @@ -591,6 +593,11 @@ static bool wake_up_full_nohz_cpu(int cpu) | |||
| 591 | return false; | 593 | return false; |
| 592 | } | 594 | } |
| 593 | 595 | ||
| 596 | /* | ||
| 597 | * Wake up the specified CPU. If the CPU is going offline, it is the | ||
| 598 | * caller's responsibility to deal with the lost wakeup, for example, | ||
| 599 | * by hooking into the CPU_DEAD notifier like timers and hrtimers do. | ||
| 600 | */ | ||
| 594 | void wake_up_nohz_cpu(int cpu) | 601 | void wake_up_nohz_cpu(int cpu) |
| 595 | { | 602 | { |
| 596 | if (!wake_up_full_nohz_cpu(cpu)) | 603 | if (!wake_up_full_nohz_cpu(cpu)) |
| @@ -1063,8 +1070,12 @@ static int migration_cpu_stop(void *data) | |||
| 1063 | * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because | 1070 | * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because |
| 1064 | * we're holding p->pi_lock. | 1071 | * we're holding p->pi_lock. |
| 1065 | */ | 1072 | */ |
| 1066 | if (task_rq(p) == rq && task_on_rq_queued(p)) | 1073 | if (task_rq(p) == rq) { |
| 1067 | rq = __migrate_task(rq, p, arg->dest_cpu); | 1074 | if (task_on_rq_queued(p)) |
| 1075 | rq = __migrate_task(rq, p, arg->dest_cpu); | ||
| 1076 | else | ||
| 1077 | p->wake_cpu = arg->dest_cpu; | ||
| 1078 | } | ||
| 1068 | raw_spin_unlock(&rq->lock); | 1079 | raw_spin_unlock(&rq->lock); |
| 1069 | raw_spin_unlock(&p->pi_lock); | 1080 | raw_spin_unlock(&p->pi_lock); |
| 1070 | 1081 | ||
| @@ -1105,10 +1116,10 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | |||
| 1105 | 1116 | ||
| 1106 | p->sched_class->set_cpus_allowed(p, new_mask); | 1117 | p->sched_class->set_cpus_allowed(p, new_mask); |
| 1107 | 1118 | ||
| 1108 | if (running) | ||
| 1109 | p->sched_class->set_curr_task(rq); | ||
| 1110 | if (queued) | 1119 | if (queued) |
| 1111 | enqueue_task(rq, p, ENQUEUE_RESTORE); | 1120 | enqueue_task(rq, p, ENQUEUE_RESTORE); |
| 1121 | if (running) | ||
| 1122 | set_curr_task(rq, p); | ||
| 1112 | } | 1123 | } |
| 1113 | 1124 | ||
| 1114 | /* | 1125 | /* |
| @@ -1265,7 +1276,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) | |||
| 1265 | /* | 1276 | /* |
| 1266 | * Task isn't running anymore; make it appear like we migrated | 1277 | * Task isn't running anymore; make it appear like we migrated |
| 1267 | * it before it went to sleep. This means on wakeup we make the | 1278 | * it before it went to sleep. This means on wakeup we make the |
| 1268 | * previous cpu our targer instead of where it really is. | 1279 | * previous cpu our target instead of where it really is. |
| 1269 | */ | 1280 | */ |
| 1270 | p->wake_cpu = cpu; | 1281 | p->wake_cpu = cpu; |
| 1271 | } | 1282 | } |
| @@ -1629,23 +1640,25 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p, | |||
| 1629 | static void | 1640 | static void |
| 1630 | ttwu_stat(struct task_struct *p, int cpu, int wake_flags) | 1641 | ttwu_stat(struct task_struct *p, int cpu, int wake_flags) |
| 1631 | { | 1642 | { |
| 1632 | #ifdef CONFIG_SCHEDSTATS | 1643 | struct rq *rq; |
| 1633 | struct rq *rq = this_rq(); | ||
| 1634 | 1644 | ||
| 1635 | #ifdef CONFIG_SMP | 1645 | if (!schedstat_enabled()) |
| 1636 | int this_cpu = smp_processor_id(); | 1646 | return; |
| 1637 | 1647 | ||
| 1638 | if (cpu == this_cpu) { | 1648 | rq = this_rq(); |
| 1639 | schedstat_inc(rq, ttwu_local); | 1649 | |
| 1640 | schedstat_inc(p, se.statistics.nr_wakeups_local); | 1650 | #ifdef CONFIG_SMP |
| 1651 | if (cpu == rq->cpu) { | ||
| 1652 | schedstat_inc(rq->ttwu_local); | ||
| 1653 | schedstat_inc(p->se.statistics.nr_wakeups_local); | ||
| 1641 | } else { | 1654 | } else { |
| 1642 | struct sched_domain *sd; | 1655 | struct sched_domain *sd; |
| 1643 | 1656 | ||
| 1644 | schedstat_inc(p, se.statistics.nr_wakeups_remote); | 1657 | schedstat_inc(p->se.statistics.nr_wakeups_remote); |
| 1645 | rcu_read_lock(); | 1658 | rcu_read_lock(); |
| 1646 | for_each_domain(this_cpu, sd) { | 1659 | for_each_domain(rq->cpu, sd) { |
| 1647 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { | 1660 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { |
| 1648 | schedstat_inc(sd, ttwu_wake_remote); | 1661 | schedstat_inc(sd->ttwu_wake_remote); |
| 1649 | break; | 1662 | break; |
| 1650 | } | 1663 | } |
| 1651 | } | 1664 | } |
| @@ -1653,17 +1666,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) | |||
| 1653 | } | 1666 | } |
| 1654 | 1667 | ||
| 1655 | if (wake_flags & WF_MIGRATED) | 1668 | if (wake_flags & WF_MIGRATED) |
| 1656 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); | 1669 | schedstat_inc(p->se.statistics.nr_wakeups_migrate); |
| 1657 | |||
| 1658 | #endif /* CONFIG_SMP */ | 1670 | #endif /* CONFIG_SMP */ |
| 1659 | 1671 | ||
| 1660 | schedstat_inc(rq, ttwu_count); | 1672 | schedstat_inc(rq->ttwu_count); |
| 1661 | schedstat_inc(p, se.statistics.nr_wakeups); | 1673 | schedstat_inc(p->se.statistics.nr_wakeups); |
| 1662 | 1674 | ||
| 1663 | if (wake_flags & WF_SYNC) | 1675 | if (wake_flags & WF_SYNC) |
| 1664 | schedstat_inc(p, se.statistics.nr_wakeups_sync); | 1676 | schedstat_inc(p->se.statistics.nr_wakeups_sync); |
| 1665 | |||
| 1666 | #endif /* CONFIG_SCHEDSTATS */ | ||
| 1667 | } | 1677 | } |
| 1668 | 1678 | ||
| 1669 | static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) | 1679 | static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) |
| @@ -2016,6 +2026,28 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
| 2016 | success = 1; /* we're going to change ->state */ | 2026 | success = 1; /* we're going to change ->state */ |
| 2017 | cpu = task_cpu(p); | 2027 | cpu = task_cpu(p); |
| 2018 | 2028 | ||
| 2029 | /* | ||
| 2030 | * Ensure we load p->on_rq _after_ p->state, otherwise it would | ||
| 2031 | * be possible to, falsely, observe p->on_rq == 0 and get stuck | ||
| 2032 | * in smp_cond_load_acquire() below. | ||
| 2033 | * | ||
| 2034 | * sched_ttwu_pending() try_to_wake_up() | ||
| 2035 | * [S] p->on_rq = 1; [L] p->state | ||
| 2036 | * UNLOCK rq->lock -----. | ||
| 2037 | * \ | ||
| 2038 | * +--- RMB | ||
| 2039 | * schedule() / | ||
| 2040 | * LOCK rq->lock -----' | ||
| 2041 | * UNLOCK rq->lock | ||
| 2042 | * | ||
| 2043 | * [task p] | ||
| 2044 | * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq | ||
| 2045 | * | ||
| 2046 | * Pairs with the UNLOCK+LOCK on rq->lock from the | ||
| 2047 | * last wakeup of our task and the schedule that got our task | ||
| 2048 | * current. | ||
| 2049 | */ | ||
| 2050 | smp_rmb(); | ||
| 2019 | if (p->on_rq && ttwu_remote(p, wake_flags)) | 2051 | if (p->on_rq && ttwu_remote(p, wake_flags)) |
| 2020 | goto stat; | 2052 | goto stat; |
| 2021 | 2053 | ||
| @@ -2062,8 +2094,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
| 2062 | 2094 | ||
| 2063 | ttwu_queue(p, cpu, wake_flags); | 2095 | ttwu_queue(p, cpu, wake_flags); |
| 2064 | stat: | 2096 | stat: |
| 2065 | if (schedstat_enabled()) | 2097 | ttwu_stat(p, cpu, wake_flags); |
| 2066 | ttwu_stat(p, cpu, wake_flags); | ||
| 2067 | out: | 2098 | out: |
| 2068 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 2099 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
| 2069 | 2100 | ||
| @@ -2073,6 +2104,7 @@ out: | |||
| 2073 | /** | 2104 | /** |
| 2074 | * try_to_wake_up_local - try to wake up a local task with rq lock held | 2105 | * try_to_wake_up_local - try to wake up a local task with rq lock held |
| 2075 | * @p: the thread to be awakened | 2106 | * @p: the thread to be awakened |
| 2107 | * @cookie: context's cookie for pinning | ||
| 2076 | * | 2108 | * |
| 2077 | * Put @p on the run-queue if it's not already there. The caller must | 2109 | * Put @p on the run-queue if it's not already there. The caller must |
| 2078 | * ensure that this_rq() is locked, @p is bound to this_rq() and not | 2110 | * ensure that this_rq() is locked, @p is bound to this_rq() and not |
| @@ -2111,8 +2143,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie | |||
| 2111 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); | 2143 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); |
| 2112 | 2144 | ||
| 2113 | ttwu_do_wakeup(rq, p, 0, cookie); | 2145 | ttwu_do_wakeup(rq, p, 0, cookie); |
| 2114 | if (schedstat_enabled()) | 2146 | ttwu_stat(p, smp_processor_id(), 0); |
| 2115 | ttwu_stat(p, smp_processor_id(), 0); | ||
| 2116 | out: | 2147 | out: |
| 2117 | raw_spin_unlock(&p->pi_lock); | 2148 | raw_spin_unlock(&p->pi_lock); |
| 2118 | } | 2149 | } |
| @@ -2750,6 +2781,10 @@ static struct rq *finish_task_switch(struct task_struct *prev) | |||
| 2750 | * task and put them back on the free list. | 2781 | * task and put them back on the free list. |
| 2751 | */ | 2782 | */ |
| 2752 | kprobe_flush_task(prev); | 2783 | kprobe_flush_task(prev); |
| 2784 | |||
| 2785 | /* Task is done with its stack. */ | ||
| 2786 | put_task_stack(prev); | ||
| 2787 | |||
| 2753 | put_task_struct(prev); | 2788 | put_task_struct(prev); |
| 2754 | } | 2789 | } |
| 2755 | 2790 | ||
| @@ -3170,6 +3205,9 @@ static inline void preempt_latency_stop(int val) { } | |||
| 3170 | */ | 3205 | */ |
| 3171 | static noinline void __schedule_bug(struct task_struct *prev) | 3206 | static noinline void __schedule_bug(struct task_struct *prev) |
| 3172 | { | 3207 | { |
| 3208 | /* Save this before calling printk(), since that will clobber it */ | ||
| 3209 | unsigned long preempt_disable_ip = get_preempt_disable_ip(current); | ||
| 3210 | |||
| 3173 | if (oops_in_progress) | 3211 | if (oops_in_progress) |
| 3174 | return; | 3212 | return; |
| 3175 | 3213 | ||
| @@ -3180,13 +3218,12 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
| 3180 | print_modules(); | 3218 | print_modules(); |
| 3181 | if (irqs_disabled()) | 3219 | if (irqs_disabled()) |
| 3182 | print_irqtrace_events(prev); | 3220 | print_irqtrace_events(prev); |
| 3183 | #ifdef CONFIG_DEBUG_PREEMPT | 3221 | if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) |
| 3184 | if (in_atomic_preempt_off()) { | 3222 | && in_atomic_preempt_off()) { |
| 3185 | pr_err("Preemption disabled at:"); | 3223 | pr_err("Preemption disabled at:"); |
| 3186 | print_ip_sym(current->preempt_disable_ip); | 3224 | print_ip_sym(preempt_disable_ip); |
| 3187 | pr_cont("\n"); | 3225 | pr_cont("\n"); |
| 3188 | } | 3226 | } |
| 3189 | #endif | ||
| 3190 | if (panic_on_warn) | 3227 | if (panic_on_warn) |
| 3191 | panic("scheduling while atomic\n"); | 3228 | panic("scheduling while atomic\n"); |
| 3192 | 3229 | ||
| @@ -3212,7 +3249,7 @@ static inline void schedule_debug(struct task_struct *prev) | |||
| 3212 | 3249 | ||
| 3213 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 3250 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
| 3214 | 3251 | ||
| 3215 | schedstat_inc(this_rq(), sched_count); | 3252 | schedstat_inc(this_rq()->sched_count); |
| 3216 | } | 3253 | } |
| 3217 | 3254 | ||
| 3218 | /* | 3255 | /* |
| @@ -3305,17 +3342,6 @@ static void __sched notrace __schedule(bool preempt) | |||
| 3305 | rq = cpu_rq(cpu); | 3342 | rq = cpu_rq(cpu); |
| 3306 | prev = rq->curr; | 3343 | prev = rq->curr; |
| 3307 | 3344 | ||
| 3308 | /* | ||
| 3309 | * do_exit() calls schedule() with preemption disabled as an exception; | ||
| 3310 | * however we must fix that up, otherwise the next task will see an | ||
| 3311 | * inconsistent (higher) preempt count. | ||
| 3312 | * | ||
| 3313 | * It also avoids the below schedule_debug() test from complaining | ||
| 3314 | * about this. | ||
| 3315 | */ | ||
| 3316 | if (unlikely(prev->state == TASK_DEAD)) | ||
| 3317 | preempt_enable_no_resched_notrace(); | ||
| 3318 | |||
| 3319 | schedule_debug(prev); | 3345 | schedule_debug(prev); |
| 3320 | 3346 | ||
| 3321 | if (sched_feat(HRTICK)) | 3347 | if (sched_feat(HRTICK)) |
| @@ -3381,7 +3407,33 @@ static void __sched notrace __schedule(bool preempt) | |||
| 3381 | 3407 | ||
| 3382 | balance_callback(rq); | 3408 | balance_callback(rq); |
| 3383 | } | 3409 | } |
| 3384 | STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */ | 3410 | |
| 3411 | void __noreturn do_task_dead(void) | ||
| 3412 | { | ||
| 3413 | /* | ||
| 3414 | * The setting of TASK_RUNNING by try_to_wake_up() may be delayed | ||
| 3415 | * when the following two conditions become true. | ||
| 3416 | * - There is a race condition on mmap_sem (it is acquired by | ||
| 3417 | * exit_mm()), and | ||
| 3418 | * - SMI occurs before setting TASK_RUNNING. | ||
| 3419 | * (or hypervisor of virtual machine switches to other guest) | ||
| 3420 | * As a result, we may become TASK_RUNNING after becoming TASK_DEAD | ||
| 3421 | * | ||
| 3422 | * To avoid it, we have to wait until tsk->pi_lock, which is held | ||
| 3423 | * by try_to_wake_up(), is released. | ||
| 3424 | */ | ||
| 3425 | smp_mb(); | ||
| 3426 | raw_spin_unlock_wait(&current->pi_lock); | ||
| 3427 | |||
| 3428 | /* causes final put_task_struct in finish_task_switch(). */ | ||
| 3429 | __set_current_state(TASK_DEAD); | ||
| 3430 | current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ | ||
| 3431 | __schedule(false); | ||
| 3432 | BUG(); | ||
| 3433 | /* Avoid "noreturn function does return". */ | ||
| 3434 | for (;;) | ||
| 3435 | cpu_relax(); /* For when BUG is null */ | ||
| 3436 | } | ||
| 3385 | 3437 | ||
| 3386 | static inline void sched_submit_work(struct task_struct *tsk) | 3438 | static inline void sched_submit_work(struct task_struct *tsk) |
| 3387 | { | 3439 | { |
| @@ -3665,10 +3717,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 3665 | 3717 | ||
| 3666 | p->prio = prio; | 3718 | p->prio = prio; |
| 3667 | 3719 | ||
| 3668 | if (running) | ||
| 3669 | p->sched_class->set_curr_task(rq); | ||
| 3670 | if (queued) | 3720 | if (queued) |
| 3671 | enqueue_task(rq, p, queue_flag); | 3721 | enqueue_task(rq, p, queue_flag); |
| 3722 | if (running) | ||
| 3723 | set_curr_task(rq, p); | ||
| 3672 | 3724 | ||
| 3673 | check_class_changed(rq, p, prev_class, oldprio); | 3725 | check_class_changed(rq, p, prev_class, oldprio); |
| 3674 | out_unlock: | 3726 | out_unlock: |
| @@ -3682,7 +3734,8 @@ out_unlock: | |||
| 3682 | 3734 | ||
| 3683 | void set_user_nice(struct task_struct *p, long nice) | 3735 | void set_user_nice(struct task_struct *p, long nice) |
| 3684 | { | 3736 | { |
| 3685 | int old_prio, delta, queued; | 3737 | bool queued, running; |
| 3738 | int old_prio, delta; | ||
| 3686 | struct rq_flags rf; | 3739 | struct rq_flags rf; |
| 3687 | struct rq *rq; | 3740 | struct rq *rq; |
| 3688 | 3741 | ||
| @@ -3704,8 +3757,11 @@ void set_user_nice(struct task_struct *p, long nice) | |||
| 3704 | goto out_unlock; | 3757 | goto out_unlock; |
| 3705 | } | 3758 | } |
| 3706 | queued = task_on_rq_queued(p); | 3759 | queued = task_on_rq_queued(p); |
| 3760 | running = task_current(rq, p); | ||
| 3707 | if (queued) | 3761 | if (queued) |
| 3708 | dequeue_task(rq, p, DEQUEUE_SAVE); | 3762 | dequeue_task(rq, p, DEQUEUE_SAVE); |
| 3763 | if (running) | ||
| 3764 | put_prev_task(rq, p); | ||
| 3709 | 3765 | ||
| 3710 | p->static_prio = NICE_TO_PRIO(nice); | 3766 | p->static_prio = NICE_TO_PRIO(nice); |
| 3711 | set_load_weight(p); | 3767 | set_load_weight(p); |
| @@ -3722,6 +3778,8 @@ void set_user_nice(struct task_struct *p, long nice) | |||
| 3722 | if (delta < 0 || (delta > 0 && task_running(rq, p))) | 3778 | if (delta < 0 || (delta > 0 && task_running(rq, p))) |
| 3723 | resched_curr(rq); | 3779 | resched_curr(rq); |
| 3724 | } | 3780 | } |
| 3781 | if (running) | ||
| 3782 | set_curr_task(rq, p); | ||
| 3725 | out_unlock: | 3783 | out_unlock: |
| 3726 | task_rq_unlock(rq, p, &rf); | 3784 | task_rq_unlock(rq, p, &rf); |
| 3727 | } | 3785 | } |
| @@ -4221,8 +4279,6 @@ change: | |||
| 4221 | prev_class = p->sched_class; | 4279 | prev_class = p->sched_class; |
| 4222 | __setscheduler(rq, p, attr, pi); | 4280 | __setscheduler(rq, p, attr, pi); |
| 4223 | 4281 | ||
| 4224 | if (running) | ||
| 4225 | p->sched_class->set_curr_task(rq); | ||
| 4226 | if (queued) { | 4282 | if (queued) { |
| 4227 | /* | 4283 | /* |
| 4228 | * We enqueue to tail when the priority of a task is | 4284 | * We enqueue to tail when the priority of a task is |
| @@ -4233,6 +4289,8 @@ change: | |||
| 4233 | 4289 | ||
| 4234 | enqueue_task(rq, p, queue_flags); | 4290 | enqueue_task(rq, p, queue_flags); |
| 4235 | } | 4291 | } |
| 4292 | if (running) | ||
| 4293 | set_curr_task(rq, p); | ||
| 4236 | 4294 | ||
| 4237 | check_class_changed(rq, p, prev_class, oldprio); | 4295 | check_class_changed(rq, p, prev_class, oldprio); |
| 4238 | preempt_disable(); /* avoid rq from going away on us */ | 4296 | preempt_disable(); /* avoid rq from going away on us */ |
| @@ -4824,7 +4882,7 @@ SYSCALL_DEFINE0(sched_yield) | |||
| 4824 | { | 4882 | { |
| 4825 | struct rq *rq = this_rq_lock(); | 4883 | struct rq *rq = this_rq_lock(); |
| 4826 | 4884 | ||
| 4827 | schedstat_inc(rq, yld_count); | 4885 | schedstat_inc(rq->yld_count); |
| 4828 | current->sched_class->yield_task(rq); | 4886 | current->sched_class->yield_task(rq); |
| 4829 | 4887 | ||
| 4830 | /* | 4888 | /* |
| @@ -4841,6 +4899,7 @@ SYSCALL_DEFINE0(sched_yield) | |||
| 4841 | return 0; | 4899 | return 0; |
| 4842 | } | 4900 | } |
| 4843 | 4901 | ||
| 4902 | #ifndef CONFIG_PREEMPT | ||
| 4844 | int __sched _cond_resched(void) | 4903 | int __sched _cond_resched(void) |
| 4845 | { | 4904 | { |
| 4846 | if (should_resched(0)) { | 4905 | if (should_resched(0)) { |
| @@ -4850,6 +4909,7 @@ int __sched _cond_resched(void) | |||
| 4850 | return 0; | 4909 | return 0; |
| 4851 | } | 4910 | } |
| 4852 | EXPORT_SYMBOL(_cond_resched); | 4911 | EXPORT_SYMBOL(_cond_resched); |
| 4912 | #endif | ||
| 4853 | 4913 | ||
| 4854 | /* | 4914 | /* |
| 4855 | * __cond_resched_lock() - if a reschedule is pending, drop the given lock, | 4915 | * __cond_resched_lock() - if a reschedule is pending, drop the given lock, |
| @@ -4975,7 +5035,7 @@ again: | |||
| 4975 | 5035 | ||
| 4976 | yielded = curr->sched_class->yield_to_task(rq, p, preempt); | 5036 | yielded = curr->sched_class->yield_to_task(rq, p, preempt); |
| 4977 | if (yielded) { | 5037 | if (yielded) { |
| 4978 | schedstat_inc(rq, yld_count); | 5038 | schedstat_inc(rq->yld_count); |
| 4979 | /* | 5039 | /* |
| 4980 | * Make p's CPU reschedule; pick_next_entity takes care of | 5040 | * Make p's CPU reschedule; pick_next_entity takes care of |
| 4981 | * fairness. | 5041 | * fairness. |
| @@ -5395,10 +5455,10 @@ void sched_setnuma(struct task_struct *p, int nid) | |||
| 5395 | 5455 | ||
| 5396 | p->numa_preferred_nid = nid; | 5456 | p->numa_preferred_nid = nid; |
| 5397 | 5457 | ||
| 5398 | if (running) | ||
| 5399 | p->sched_class->set_curr_task(rq); | ||
| 5400 | if (queued) | 5458 | if (queued) |
| 5401 | enqueue_task(rq, p, ENQUEUE_RESTORE); | 5459 | enqueue_task(rq, p, ENQUEUE_RESTORE); |
| 5460 | if (running) | ||
| 5461 | set_curr_task(rq, p); | ||
| 5402 | task_rq_unlock(rq, p, &rf); | 5462 | task_rq_unlock(rq, p, &rf); |
| 5403 | } | 5463 | } |
| 5404 | #endif /* CONFIG_NUMA_BALANCING */ | 5464 | #endif /* CONFIG_NUMA_BALANCING */ |
| @@ -5695,6 +5755,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
| 5695 | } | 5755 | } |
| 5696 | } | 5756 | } |
| 5697 | #else /* !CONFIG_SCHED_DEBUG */ | 5757 | #else /* !CONFIG_SCHED_DEBUG */ |
| 5758 | |||
| 5759 | # define sched_debug_enabled 0 | ||
| 5698 | # define sched_domain_debug(sd, cpu) do { } while (0) | 5760 | # define sched_domain_debug(sd, cpu) do { } while (0) |
| 5699 | static inline bool sched_debug(void) | 5761 | static inline bool sched_debug(void) |
| 5700 | { | 5762 | { |
| @@ -5713,6 +5775,7 @@ static int sd_degenerate(struct sched_domain *sd) | |||
| 5713 | SD_BALANCE_FORK | | 5775 | SD_BALANCE_FORK | |
| 5714 | SD_BALANCE_EXEC | | 5776 | SD_BALANCE_EXEC | |
| 5715 | SD_SHARE_CPUCAPACITY | | 5777 | SD_SHARE_CPUCAPACITY | |
| 5778 | SD_ASYM_CPUCAPACITY | | ||
| 5716 | SD_SHARE_PKG_RESOURCES | | 5779 | SD_SHARE_PKG_RESOURCES | |
| 5717 | SD_SHARE_POWERDOMAIN)) { | 5780 | SD_SHARE_POWERDOMAIN)) { |
| 5718 | if (sd->groups != sd->groups->next) | 5781 | if (sd->groups != sd->groups->next) |
| @@ -5743,6 +5806,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
| 5743 | SD_BALANCE_NEWIDLE | | 5806 | SD_BALANCE_NEWIDLE | |
| 5744 | SD_BALANCE_FORK | | 5807 | SD_BALANCE_FORK | |
| 5745 | SD_BALANCE_EXEC | | 5808 | SD_BALANCE_EXEC | |
| 5809 | SD_ASYM_CPUCAPACITY | | ||
| 5746 | SD_SHARE_CPUCAPACITY | | 5810 | SD_SHARE_CPUCAPACITY | |
| 5747 | SD_SHARE_PKG_RESOURCES | | 5811 | SD_SHARE_PKG_RESOURCES | |
| 5748 | SD_PREFER_SIBLING | | 5812 | SD_PREFER_SIBLING | |
| @@ -5887,10 +5951,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc) | |||
| 5887 | } while (sg != first); | 5951 | } while (sg != first); |
| 5888 | } | 5952 | } |
| 5889 | 5953 | ||
| 5890 | static void free_sched_domain(struct rcu_head *rcu) | 5954 | static void destroy_sched_domain(struct sched_domain *sd) |
| 5891 | { | 5955 | { |
| 5892 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); | ||
| 5893 | |||
| 5894 | /* | 5956 | /* |
| 5895 | * If its an overlapping domain it has private groups, iterate and | 5957 | * If its an overlapping domain it has private groups, iterate and |
| 5896 | * nuke them all. | 5958 | * nuke them all. |
| @@ -5901,18 +5963,26 @@ static void free_sched_domain(struct rcu_head *rcu) | |||
| 5901 | kfree(sd->groups->sgc); | 5963 | kfree(sd->groups->sgc); |
| 5902 | kfree(sd->groups); | 5964 | kfree(sd->groups); |
| 5903 | } | 5965 | } |
| 5966 | if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) | ||
| 5967 | kfree(sd->shared); | ||
| 5904 | kfree(sd); | 5968 | kfree(sd); |
| 5905 | } | 5969 | } |
| 5906 | 5970 | ||
| 5907 | static void destroy_sched_domain(struct sched_domain *sd, int cpu) | 5971 | static void destroy_sched_domains_rcu(struct rcu_head *rcu) |
| 5908 | { | 5972 | { |
| 5909 | call_rcu(&sd->rcu, free_sched_domain); | 5973 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); |
| 5974 | |||
| 5975 | while (sd) { | ||
| 5976 | struct sched_domain *parent = sd->parent; | ||
| 5977 | destroy_sched_domain(sd); | ||
| 5978 | sd = parent; | ||
| 5979 | } | ||
| 5910 | } | 5980 | } |
| 5911 | 5981 | ||
| 5912 | static void destroy_sched_domains(struct sched_domain *sd, int cpu) | 5982 | static void destroy_sched_domains(struct sched_domain *sd) |
| 5913 | { | 5983 | { |
| 5914 | for (; sd; sd = sd->parent) | 5984 | if (sd) |
| 5915 | destroy_sched_domain(sd, cpu); | 5985 | call_rcu(&sd->rcu, destroy_sched_domains_rcu); |
| 5916 | } | 5986 | } |
| 5917 | 5987 | ||
| 5918 | /* | 5988 | /* |
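Instead of queueing one RCU callback per domain level, the new code queues a single callback for the lowest domain and lets that callback walk the ->parent chain, freeing each level and dropping a reference on the new shared structure as it goes. A user-space model of "one deferred callback tears down the whole chain" follows; the RCU grace period is simulated by a direct call and the names are illustrative.

    /* Model of a single deferred callback tearing down a parent chain and
     * dropping a refcount on shared state. The grace period is simulated by
     * simply invoking the callback. */
    #include <stdio.h>
    #include <stdlib.h>

    struct shared { int ref; int nr_busy; };

    struct domain {
        struct domain *parent;
        struct shared *shared;   /* may be shared by several domains */
    };

    static void destroy_domain(struct domain *d)
    {
        if (d->shared && --d->shared->ref == 0)
            free(d->shared);
        free(d);
    }

    /* The "RCU callback": walk and free the whole chain in one go. */
    static void destroy_domains_cb(struct domain *d)
    {
        while (d) {
            struct domain *parent = d->parent;
            destroy_domain(d);
            d = parent;
        }
    }

    int main(void)
    {
        struct shared *s  = malloc(sizeof(*s));
        struct domain *llc = malloc(sizeof(*llc));
        struct domain *pkg = malloc(sizeof(*pkg));

        s->ref = 1; s->nr_busy = 0;
        llc->parent = pkg;  llc->shared = s;
        pkg->parent = NULL; pkg->shared = NULL;

        destroy_domains_cb(llc);   /* stands in for call_rcu(&sd->rcu, ...) */
        printf("chain torn down with one callback\n");
        return 0;
    }

One callback per chain also keeps the parent pointers valid for the walk, since nothing is freed until the whole chain is processed.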
| @@ -5927,14 +5997,14 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) | |||
| 5927 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); | 5997 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); |
| 5928 | DEFINE_PER_CPU(int, sd_llc_size); | 5998 | DEFINE_PER_CPU(int, sd_llc_size); |
| 5929 | DEFINE_PER_CPU(int, sd_llc_id); | 5999 | DEFINE_PER_CPU(int, sd_llc_id); |
| 6000 | DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); | ||
| 5930 | DEFINE_PER_CPU(struct sched_domain *, sd_numa); | 6001 | DEFINE_PER_CPU(struct sched_domain *, sd_numa); |
| 5931 | DEFINE_PER_CPU(struct sched_domain *, sd_busy); | ||
| 5932 | DEFINE_PER_CPU(struct sched_domain *, sd_asym); | 6002 | DEFINE_PER_CPU(struct sched_domain *, sd_asym); |
| 5933 | 6003 | ||
| 5934 | static void update_top_cache_domain(int cpu) | 6004 | static void update_top_cache_domain(int cpu) |
| 5935 | { | 6005 | { |
| 6006 | struct sched_domain_shared *sds = NULL; | ||
| 5936 | struct sched_domain *sd; | 6007 | struct sched_domain *sd; |
| 5937 | struct sched_domain *busy_sd = NULL; | ||
| 5938 | int id = cpu; | 6008 | int id = cpu; |
| 5939 | int size = 1; | 6009 | int size = 1; |
| 5940 | 6010 | ||
| @@ -5942,13 +6012,13 @@ static void update_top_cache_domain(int cpu) | |||
| 5942 | if (sd) { | 6012 | if (sd) { |
| 5943 | id = cpumask_first(sched_domain_span(sd)); | 6013 | id = cpumask_first(sched_domain_span(sd)); |
| 5944 | size = cpumask_weight(sched_domain_span(sd)); | 6014 | size = cpumask_weight(sched_domain_span(sd)); |
| 5945 | busy_sd = sd->parent; /* sd_busy */ | 6015 | sds = sd->shared; |
| 5946 | } | 6016 | } |
| 5947 | rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd); | ||
| 5948 | 6017 | ||
| 5949 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); | 6018 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); |
| 5950 | per_cpu(sd_llc_size, cpu) = size; | 6019 | per_cpu(sd_llc_size, cpu) = size; |
| 5951 | per_cpu(sd_llc_id, cpu) = id; | 6020 | per_cpu(sd_llc_id, cpu) = id; |
| 6021 | rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); | ||
| 5952 | 6022 | ||
| 5953 | sd = lowest_flag_domain(cpu, SD_NUMA); | 6023 | sd = lowest_flag_domain(cpu, SD_NUMA); |
| 5954 | rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); | 6024 | rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); |
| @@ -5984,7 +6054,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
| 5984 | */ | 6054 | */ |
| 5985 | if (parent->flags & SD_PREFER_SIBLING) | 6055 | if (parent->flags & SD_PREFER_SIBLING) |
| 5986 | tmp->flags |= SD_PREFER_SIBLING; | 6056 | tmp->flags |= SD_PREFER_SIBLING; |
| 5987 | destroy_sched_domain(parent, cpu); | 6057 | destroy_sched_domain(parent); |
| 5988 | } else | 6058 | } else |
| 5989 | tmp = tmp->parent; | 6059 | tmp = tmp->parent; |
| 5990 | } | 6060 | } |
| @@ -5992,7 +6062,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
| 5992 | if (sd && sd_degenerate(sd)) { | 6062 | if (sd && sd_degenerate(sd)) { |
| 5993 | tmp = sd; | 6063 | tmp = sd; |
| 5994 | sd = sd->parent; | 6064 | sd = sd->parent; |
| 5995 | destroy_sched_domain(tmp, cpu); | 6065 | destroy_sched_domain(tmp); |
| 5996 | if (sd) | 6066 | if (sd) |
| 5997 | sd->child = NULL; | 6067 | sd->child = NULL; |
| 5998 | } | 6068 | } |
| @@ -6002,7 +6072,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
| 6002 | rq_attach_root(rq, rd); | 6072 | rq_attach_root(rq, rd); |
| 6003 | tmp = rq->sd; | 6073 | tmp = rq->sd; |
| 6004 | rcu_assign_pointer(rq->sd, sd); | 6074 | rcu_assign_pointer(rq->sd, sd); |
| 6005 | destroy_sched_domains(tmp, cpu); | 6075 | destroy_sched_domains(tmp); |
| 6006 | 6076 | ||
| 6007 | update_top_cache_domain(cpu); | 6077 | update_top_cache_domain(cpu); |
| 6008 | } | 6078 | } |
| @@ -6245,7 +6315,6 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) | |||
| 6245 | return; | 6315 | return; |
| 6246 | 6316 | ||
| 6247 | update_group_capacity(sd, cpu); | 6317 | update_group_capacity(sd, cpu); |
| 6248 | atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); | ||
| 6249 | } | 6318 | } |
| 6250 | 6319 | ||
| 6251 | /* | 6320 | /* |
| @@ -6333,6 +6402,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd) | |||
| 6333 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); | 6402 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); |
| 6334 | *per_cpu_ptr(sdd->sd, cpu) = NULL; | 6403 | *per_cpu_ptr(sdd->sd, cpu) = NULL; |
| 6335 | 6404 | ||
| 6405 | if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) | ||
| 6406 | *per_cpu_ptr(sdd->sds, cpu) = NULL; | ||
| 6407 | |||
| 6336 | if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) | 6408 | if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) |
| 6337 | *per_cpu_ptr(sdd->sg, cpu) = NULL; | 6409 | *per_cpu_ptr(sdd->sg, cpu) = NULL; |
| 6338 | 6410 | ||
| @@ -6352,26 +6424,37 @@ static int sched_domains_curr_level; | |||
| 6352 | /* | 6424 | /* |
| 6353 | * SD_flags allowed in topology descriptions. | 6425 | * SD_flags allowed in topology descriptions. |
| 6354 | * | 6426 | * |
| 6355 | * SD_SHARE_CPUCAPACITY - describes SMT topologies | 6427 | * These flags are purely descriptive of the topology and do not prescribe |
| 6356 | * SD_SHARE_PKG_RESOURCES - describes shared caches | 6428 | * behaviour. Behaviour is artificial and mapped in the below sd_init() |
| 6357 | * SD_NUMA - describes NUMA topologies | 6429 | * function: |
| 6358 | * SD_SHARE_POWERDOMAIN - describes shared power domain | 6430 | * |
| 6431 | * SD_SHARE_CPUCAPACITY - describes SMT topologies | ||
| 6432 | * SD_SHARE_PKG_RESOURCES - describes shared caches | ||
| 6433 | * SD_NUMA - describes NUMA topologies | ||
| 6434 | * SD_SHARE_POWERDOMAIN - describes shared power domain | ||
| 6435 | * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies | ||
| 6359 | * | 6436 | * |
| 6360 | * Odd one out: | 6437 | * Odd one out, which beside describing the topology has a quirk also |
| 6361 | * SD_ASYM_PACKING - describes SMT quirks | 6438 | * prescribes the desired behaviour that goes along with it: |
| 6439 | * | ||
| 6440 | * SD_ASYM_PACKING - describes SMT quirks | ||
| 6362 | */ | 6441 | */ |
| 6363 | #define TOPOLOGY_SD_FLAGS \ | 6442 | #define TOPOLOGY_SD_FLAGS \ |
| 6364 | (SD_SHARE_CPUCAPACITY | \ | 6443 | (SD_SHARE_CPUCAPACITY | \ |
| 6365 | SD_SHARE_PKG_RESOURCES | \ | 6444 | SD_SHARE_PKG_RESOURCES | \ |
| 6366 | SD_NUMA | \ | 6445 | SD_NUMA | \ |
| 6367 | SD_ASYM_PACKING | \ | 6446 | SD_ASYM_PACKING | \ |
| 6447 | SD_ASYM_CPUCAPACITY | \ | ||
| 6368 | SD_SHARE_POWERDOMAIN) | 6448 | SD_SHARE_POWERDOMAIN) |
| 6369 | 6449 | ||
| 6370 | static struct sched_domain * | 6450 | static struct sched_domain * |
| 6371 | sd_init(struct sched_domain_topology_level *tl, int cpu) | 6451 | sd_init(struct sched_domain_topology_level *tl, |
| 6452 | const struct cpumask *cpu_map, | ||
| 6453 | struct sched_domain *child, int cpu) | ||
| 6372 | { | 6454 | { |
| 6373 | struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); | 6455 | struct sd_data *sdd = &tl->data; |
| 6374 | int sd_weight, sd_flags = 0; | 6456 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); |
| 6457 | int sd_id, sd_weight, sd_flags = 0; | ||
| 6375 | 6458 | ||
| 6376 | #ifdef CONFIG_NUMA | 6459 | #ifdef CONFIG_NUMA |
| 6377 | /* | 6460 | /* |
| @@ -6420,15 +6503,26 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) | |||
| 6420 | .smt_gain = 0, | 6503 | .smt_gain = 0, |
| 6421 | .max_newidle_lb_cost = 0, | 6504 | .max_newidle_lb_cost = 0, |
| 6422 | .next_decay_max_lb_cost = jiffies, | 6505 | .next_decay_max_lb_cost = jiffies, |
| 6506 | .child = child, | ||
| 6423 | #ifdef CONFIG_SCHED_DEBUG | 6507 | #ifdef CONFIG_SCHED_DEBUG |
| 6424 | .name = tl->name, | 6508 | .name = tl->name, |
| 6425 | #endif | 6509 | #endif |
| 6426 | }; | 6510 | }; |
| 6427 | 6511 | ||
| 6512 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | ||
| 6513 | sd_id = cpumask_first(sched_domain_span(sd)); | ||
| 6514 | |||
| 6428 | /* | 6515 | /* |
| 6429 | * Convert topological properties into behaviour. | 6516 | * Convert topological properties into behaviour. |
| 6430 | */ | 6517 | */ |
| 6431 | 6518 | ||
| 6519 | if (sd->flags & SD_ASYM_CPUCAPACITY) { | ||
| 6520 | struct sched_domain *t = sd; | ||
| 6521 | |||
| 6522 | for_each_lower_domain(t) | ||
| 6523 | t->flags |= SD_BALANCE_WAKE; | ||
| 6524 | } | ||
| 6525 | |||
| 6432 | if (sd->flags & SD_SHARE_CPUCAPACITY) { | 6526 | if (sd->flags & SD_SHARE_CPUCAPACITY) { |
| 6433 | sd->flags |= SD_PREFER_SIBLING; | 6527 | sd->flags |= SD_PREFER_SIBLING; |
| 6434 | sd->imbalance_pct = 110; | 6528 | sd->imbalance_pct = 110; |
| @@ -6460,7 +6554,17 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) | |||
| 6460 | sd->idle_idx = 1; | 6554 | sd->idle_idx = 1; |
| 6461 | } | 6555 | } |
| 6462 | 6556 | ||
| 6463 | sd->private = &tl->data; | 6557 | /* |
| 6558 | * For all levels sharing cache; connect a sched_domain_shared | ||
| 6559 | * instance. | ||
| 6560 | */ | ||
| 6561 | if (sd->flags & SD_SHARE_PKG_RESOURCES) { | ||
| 6562 | sd->shared = *per_cpu_ptr(sdd->sds, sd_id); | ||
| 6563 | atomic_inc(&sd->shared->ref); | ||
| 6564 | atomic_set(&sd->shared->nr_busy_cpus, sd_weight); | ||
| 6565 | } | ||
| 6566 | |||
| 6567 | sd->private = sdd; | ||
| 6464 | 6568 | ||
| 6465 | return sd; | 6569 | return sd; |
| 6466 | } | 6570 | } |
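Two behaviours now fall out of sd_init(): when a level carries SD_ASYM_CPUCAPACITY, wake-time balancing is switched on for every lower level, and each cache-sharing level attaches the per-LLC sched_domain_shared instance (keyed by the first CPU in the span) while bumping its refcount. The sketch below models only the flag-propagation part; the flag values and the two-level hierarchy are illustrative.

    /* Model of propagating a behaviour flag down a domain hierarchy when an
     * upper level is marked asymmetric. Flag values are illustrative. */
    #include <stdio.h>

    #define SD_ASYM_CPUCAPACITY  0x1
    #define SD_BALANCE_WAKE      0x2

    struct domain { struct domain *child; unsigned int flags; const char *name; };

    static void propagate_wake_balance(struct domain *sd)
    {
        if (sd->flags & SD_ASYM_CPUCAPACITY) {
            struct domain *t;

            /* equivalent of for_each_lower_domain(t): t, t->child, ... */
            for (t = sd; t; t = t->child)
                t->flags |= SD_BALANCE_WAKE;
        }
    }

    int main(void)
    {
        struct domain smt = { .child = NULL, .flags = 0, .name = "SMT" };
        struct domain die = { .child = &smt, .flags = SD_ASYM_CPUCAPACITY,
                              .name = "DIE" };

        propagate_wake_balance(&die);
        printf("%s wake-balance enabled: %d\n", smt.name,
               !!(smt.flags & SD_BALANCE_WAKE));
        return 0;
    }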
| @@ -6487,6 +6591,9 @@ static struct sched_domain_topology_level *sched_domain_topology = | |||
| 6487 | 6591 | ||
| 6488 | void set_sched_topology(struct sched_domain_topology_level *tl) | 6592 | void set_sched_topology(struct sched_domain_topology_level *tl) |
| 6489 | { | 6593 | { |
| 6594 | if (WARN_ON_ONCE(sched_smp_initialized)) | ||
| 6595 | return; | ||
| 6596 | |||
| 6490 | sched_domain_topology = tl; | 6597 | sched_domain_topology = tl; |
| 6491 | } | 6598 | } |
| 6492 | 6599 | ||
| @@ -6767,6 +6874,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
| 6767 | if (!sdd->sd) | 6874 | if (!sdd->sd) |
| 6768 | return -ENOMEM; | 6875 | return -ENOMEM; |
| 6769 | 6876 | ||
| 6877 | sdd->sds = alloc_percpu(struct sched_domain_shared *); | ||
| 6878 | if (!sdd->sds) | ||
| 6879 | return -ENOMEM; | ||
| 6880 | |||
| 6770 | sdd->sg = alloc_percpu(struct sched_group *); | 6881 | sdd->sg = alloc_percpu(struct sched_group *); |
| 6771 | if (!sdd->sg) | 6882 | if (!sdd->sg) |
| 6772 | return -ENOMEM; | 6883 | return -ENOMEM; |
| @@ -6777,6 +6888,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
| 6777 | 6888 | ||
| 6778 | for_each_cpu(j, cpu_map) { | 6889 | for_each_cpu(j, cpu_map) { |
| 6779 | struct sched_domain *sd; | 6890 | struct sched_domain *sd; |
| 6891 | struct sched_domain_shared *sds; | ||
| 6780 | struct sched_group *sg; | 6892 | struct sched_group *sg; |
| 6781 | struct sched_group_capacity *sgc; | 6893 | struct sched_group_capacity *sgc; |
| 6782 | 6894 | ||
| @@ -6787,6 +6899,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
| 6787 | 6899 | ||
| 6788 | *per_cpu_ptr(sdd->sd, j) = sd; | 6900 | *per_cpu_ptr(sdd->sd, j) = sd; |
| 6789 | 6901 | ||
| 6902 | sds = kzalloc_node(sizeof(struct sched_domain_shared), | ||
| 6903 | GFP_KERNEL, cpu_to_node(j)); | ||
| 6904 | if (!sds) | ||
| 6905 | return -ENOMEM; | ||
| 6906 | |||
| 6907 | *per_cpu_ptr(sdd->sds, j) = sds; | ||
| 6908 | |||
| 6790 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | 6909 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), |
| 6791 | GFP_KERNEL, cpu_to_node(j)); | 6910 | GFP_KERNEL, cpu_to_node(j)); |
| 6792 | if (!sg) | 6911 | if (!sg) |
| @@ -6826,6 +6945,8 @@ static void __sdt_free(const struct cpumask *cpu_map) | |||
| 6826 | kfree(*per_cpu_ptr(sdd->sd, j)); | 6945 | kfree(*per_cpu_ptr(sdd->sd, j)); |
| 6827 | } | 6946 | } |
| 6828 | 6947 | ||
| 6948 | if (sdd->sds) | ||
| 6949 | kfree(*per_cpu_ptr(sdd->sds, j)); | ||
| 6829 | if (sdd->sg) | 6950 | if (sdd->sg) |
| 6830 | kfree(*per_cpu_ptr(sdd->sg, j)); | 6951 | kfree(*per_cpu_ptr(sdd->sg, j)); |
| 6831 | if (sdd->sgc) | 6952 | if (sdd->sgc) |
| @@ -6833,6 +6954,8 @@ static void __sdt_free(const struct cpumask *cpu_map) | |||
| 6833 | } | 6954 | } |
| 6834 | free_percpu(sdd->sd); | 6955 | free_percpu(sdd->sd); |
| 6835 | sdd->sd = NULL; | 6956 | sdd->sd = NULL; |
| 6957 | free_percpu(sdd->sds); | ||
| 6958 | sdd->sds = NULL; | ||
| 6836 | free_percpu(sdd->sg); | 6959 | free_percpu(sdd->sg); |
| 6837 | sdd->sg = NULL; | 6960 | sdd->sg = NULL; |
| 6838 | free_percpu(sdd->sgc); | 6961 | free_percpu(sdd->sgc); |
| @@ -6844,16 +6967,12 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | |||
| 6844 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 6967 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, |
| 6845 | struct sched_domain *child, int cpu) | 6968 | struct sched_domain *child, int cpu) |
| 6846 | { | 6969 | { |
| 6847 | struct sched_domain *sd = sd_init(tl, cpu); | 6970 | struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); |
| 6848 | if (!sd) | ||
| 6849 | return child; | ||
| 6850 | 6971 | ||
| 6851 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | ||
| 6852 | if (child) { | 6972 | if (child) { |
| 6853 | sd->level = child->level + 1; | 6973 | sd->level = child->level + 1; |
| 6854 | sched_domain_level_max = max(sched_domain_level_max, sd->level); | 6974 | sched_domain_level_max = max(sched_domain_level_max, sd->level); |
| 6855 | child->parent = sd; | 6975 | child->parent = sd; |
| 6856 | sd->child = child; | ||
| 6857 | 6976 | ||
| 6858 | if (!cpumask_subset(sched_domain_span(child), | 6977 | if (!cpumask_subset(sched_domain_span(child), |
| 6859 | sched_domain_span(sd))) { | 6978 | sched_domain_span(sd))) { |
| @@ -6884,6 +7003,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, | |||
| 6884 | enum s_alloc alloc_state; | 7003 | enum s_alloc alloc_state; |
| 6885 | struct sched_domain *sd; | 7004 | struct sched_domain *sd; |
| 6886 | struct s_data d; | 7005 | struct s_data d; |
| 7006 | struct rq *rq = NULL; | ||
| 6887 | int i, ret = -ENOMEM; | 7007 | int i, ret = -ENOMEM; |
| 6888 | 7008 | ||
| 6889 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); | 7009 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); |
| @@ -6934,11 +7054,22 @@ static int build_sched_domains(const struct cpumask *cpu_map, | |||
| 6934 | /* Attach the domains */ | 7054 | /* Attach the domains */ |
| 6935 | rcu_read_lock(); | 7055 | rcu_read_lock(); |
| 6936 | for_each_cpu(i, cpu_map) { | 7056 | for_each_cpu(i, cpu_map) { |
| 7057 | rq = cpu_rq(i); | ||
| 6937 | sd = *per_cpu_ptr(d.sd, i); | 7058 | sd = *per_cpu_ptr(d.sd, i); |
| 7059 | |||
| 7060 | /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ | ||
| 7061 | if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) | ||
| 7062 | WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); | ||
| 7063 | |||
| 6938 | cpu_attach_domain(sd, d.rd, i); | 7064 | cpu_attach_domain(sd, d.rd, i); |
| 6939 | } | 7065 | } |
| 6940 | rcu_read_unlock(); | 7066 | rcu_read_unlock(); |
| 6941 | 7067 | ||
| 7068 | if (rq && sched_debug_enabled) { | ||
| 7069 | pr_info("span: %*pbl (max cpu_capacity = %lu)\n", | ||
| 7070 | cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); | ||
| 7071 | } | ||
| 7072 | |||
| 6942 | ret = 0; | 7073 | ret = 0; |
| 6943 | error: | 7074 | error: |
| 6944 | __free_domain_allocs(&d, alloc_state, cpu_map); | 7075 | __free_domain_allocs(&d, alloc_state, cpu_map); |
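The root domain's max_cpu_capacity is updated with READ_ONCE()/WRITE_ONCE() so concurrent readers never observe a torn value on architectures where plain accesses could be split. Outside the kernel, C11 relaxed atomics give the same no-tearing guarantee; the sketch below models the "track the maximum" update with them (the capacity values are arbitrary examples).

    /* Model of a "track the maximum without tearing" update, using C11 relaxed
     * atomics in place of READ_ONCE()/WRITE_ONCE(). */
    #include <stdatomic.h>
    #include <stdio.h>

    static _Atomic unsigned long max_cpu_capacity;

    static void note_capacity(unsigned long cap)
    {
        unsigned long cur = atomic_load_explicit(&max_cpu_capacity,
                                                 memory_order_relaxed);
        /* The kernel path has a single writer, so compare-then-store is
         * enough there; with multiple writers a CAS loop would be needed. */
        if (cap > cur)
            atomic_store_explicit(&max_cpu_capacity, cap,
                                  memory_order_relaxed);
    }

    int main(void)
    {
        unsigned long caps[] = { 446, 1024, 871 };   /* e.g. big.LITTLE-style */
        for (unsigned i = 0; i < 3; i++)
            note_capacity(caps[i]);
        printf("max cpu_capacity = %lu\n",
               atomic_load_explicit(&max_cpu_capacity, memory_order_relaxed));
        return 0;
    }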
| @@ -7297,6 +7428,22 @@ int sched_cpu_dying(unsigned int cpu) | |||
| 7297 | } | 7428 | } |
| 7298 | #endif | 7429 | #endif |
| 7299 | 7430 | ||
| 7431 | #ifdef CONFIG_SCHED_SMT | ||
| 7432 | DEFINE_STATIC_KEY_FALSE(sched_smt_present); | ||
| 7433 | |||
| 7434 | static void sched_init_smt(void) | ||
| 7435 | { | ||
| 7436 | /* | ||
| 7437 | * We've enumerated all CPUs and will assume that if any CPU | ||
| 7438 | * has SMT siblings, CPU0 will too. | ||
| 7439 | */ | ||
| 7440 | if (cpumask_weight(cpu_smt_mask(0)) > 1) | ||
| 7441 | static_branch_enable(&sched_smt_present); | ||
| 7442 | } | ||
| 7443 | #else | ||
| 7444 | static inline void sched_init_smt(void) { } | ||
| 7445 | #endif | ||
| 7446 | |||
| 7300 | void __init sched_init_smp(void) | 7447 | void __init sched_init_smp(void) |
| 7301 | { | 7448 | { |
| 7302 | cpumask_var_t non_isolated_cpus; | 7449 | cpumask_var_t non_isolated_cpus; |
| @@ -7326,6 +7473,9 @@ void __init sched_init_smp(void) | |||
| 7326 | 7473 | ||
| 7327 | init_sched_rt_class(); | 7474 | init_sched_rt_class(); |
| 7328 | init_sched_dl_class(); | 7475 | init_sched_dl_class(); |
| 7476 | |||
| 7477 | sched_init_smt(); | ||
| 7478 | |||
| 7329 | sched_smp_initialized = true; | 7479 | sched_smp_initialized = true; |
| 7330 | } | 7480 | } |
| 7331 | 7481 | ||
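sched_init_smt() flips a static branch once, after all CPUs have been enumerated, so SMT-only fast paths cost effectively nothing on machines without SMT siblings. The closest cheap user-space analogue is a write-once flag checked on hot paths; the sketch below models the enable-once usage only, and the sibling-count helper is a stand-in (the real static_branch machinery is jump-label based and not reproduced here).

    /* User-space stand-in for a "static key": set once at init, read on hot
     * paths. The actual jump-label patching is not modelled. */
    #include <stdbool.h>
    #include <stdio.h>

    static bool sched_smt_present;   /* models DEFINE_STATIC_KEY_FALSE(...) */

    static unsigned int smt_siblings_of_cpu0(void) { return 2; }  /* stand-in */

    static void sched_init_smt(void)
    {
        /* Enable only if CPU0 has SMT siblings, mirroring the patch's check. */
        if (smt_siblings_of_cpu0() > 1)
            sched_smt_present = true;   /* models static_branch_enable() */
    }

    static void idle_search_fastpath(void)
    {
        if (sched_smt_present)          /* models the static-branch test */
            printf("scan SMT siblings for an idle core\n");
        else
            printf("skip the SMT scan entirely\n");
    }

    int main(void)
    {
        sched_init_smt();
        idle_search_fastpath();
        return 0;
    }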
| @@ -7363,6 +7513,7 @@ static struct kmem_cache *task_group_cache __read_mostly; | |||
| 7363 | #endif | 7513 | #endif |
| 7364 | 7514 | ||
| 7365 | DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); | 7515 | DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); |
| 7516 | DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); | ||
| 7366 | 7517 | ||
| 7367 | void __init sched_init(void) | 7518 | void __init sched_init(void) |
| 7368 | { | 7519 | { |
| @@ -7399,6 +7550,8 @@ void __init sched_init(void) | |||
| 7399 | for_each_possible_cpu(i) { | 7550 | for_each_possible_cpu(i) { |
| 7400 | per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( | 7551 | per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( |
| 7401 | cpumask_size(), GFP_KERNEL, cpu_to_node(i)); | 7552 | cpumask_size(), GFP_KERNEL, cpu_to_node(i)); |
| 7553 | per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node( | ||
| 7554 | cpumask_size(), GFP_KERNEL, cpu_to_node(i)); | ||
| 7402 | } | 7555 | } |
| 7403 | #endif /* CONFIG_CPUMASK_OFFSTACK */ | 7556 | #endif /* CONFIG_CPUMASK_OFFSTACK */ |
| 7404 | 7557 | ||
| @@ -7501,10 +7654,6 @@ void __init sched_init(void) | |||
| 7501 | 7654 | ||
| 7502 | set_load_weight(&init_task); | 7655 | set_load_weight(&init_task); |
| 7503 | 7656 | ||
| 7504 | #ifdef CONFIG_PREEMPT_NOTIFIERS | ||
| 7505 | INIT_HLIST_HEAD(&init_task.preempt_notifiers); | ||
| 7506 | #endif | ||
| 7507 | |||
| 7508 | /* | 7657 | /* |
| 7509 | * The boot idle thread does lazy MMU switching as well: | 7658 | * The boot idle thread does lazy MMU switching as well: |
| 7510 | */ | 7659 | */ |
| @@ -7512,11 +7661,6 @@ void __init sched_init(void) | |||
| 7512 | enter_lazy_tlb(&init_mm, current); | 7661 | enter_lazy_tlb(&init_mm, current); |
| 7513 | 7662 | ||
| 7514 | /* | 7663 | /* |
| 7515 | * During early bootup we pretend to be a normal task: | ||
| 7516 | */ | ||
| 7517 | current->sched_class = &fair_sched_class; | ||
| 7518 | |||
| 7519 | /* | ||
| 7520 | * Make us the idle thread. Technically, schedule() should not be | 7664 | * Make us the idle thread. Technically, schedule() should not be |
| 7521 | * called from this thread, however somewhere below it might be, | 7665 | * called from this thread, however somewhere below it might be, |
| 7522 | * but because we are the idle thread, we just pick up running again | 7666 | * but because we are the idle thread, we just pick up running again |
| @@ -7570,6 +7714,7 @@ EXPORT_SYMBOL(__might_sleep); | |||
| 7570 | void ___might_sleep(const char *file, int line, int preempt_offset) | 7714 | void ___might_sleep(const char *file, int line, int preempt_offset) |
| 7571 | { | 7715 | { |
| 7572 | static unsigned long prev_jiffy; /* ratelimiting */ | 7716 | static unsigned long prev_jiffy; /* ratelimiting */ |
| 7717 | unsigned long preempt_disable_ip; | ||
| 7573 | 7718 | ||
| 7574 | rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ | 7719 | rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ |
| 7575 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && | 7720 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && |
| @@ -7580,6 +7725,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset) | |||
| 7580 | return; | 7725 | return; |
| 7581 | prev_jiffy = jiffies; | 7726 | prev_jiffy = jiffies; |
| 7582 | 7727 | ||
| 7728 | /* Save this before calling printk(), since that will clobber it */ | ||
| 7729 | preempt_disable_ip = get_preempt_disable_ip(current); | ||
| 7730 | |||
| 7583 | printk(KERN_ERR | 7731 | printk(KERN_ERR |
| 7584 | "BUG: sleeping function called from invalid context at %s:%d\n", | 7732 | "BUG: sleeping function called from invalid context at %s:%d\n", |
| 7585 | file, line); | 7733 | file, line); |
| @@ -7594,14 +7742,14 @@ void ___might_sleep(const char *file, int line, int preempt_offset) | |||
| 7594 | debug_show_held_locks(current); | 7742 | debug_show_held_locks(current); |
| 7595 | if (irqs_disabled()) | 7743 | if (irqs_disabled()) |
| 7596 | print_irqtrace_events(current); | 7744 | print_irqtrace_events(current); |
| 7597 | #ifdef CONFIG_DEBUG_PREEMPT | 7745 | if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) |
| 7598 | if (!preempt_count_equals(preempt_offset)) { | 7746 | && !preempt_count_equals(preempt_offset)) { |
| 7599 | pr_err("Preemption disabled at:"); | 7747 | pr_err("Preemption disabled at:"); |
| 7600 | print_ip_sym(current->preempt_disable_ip); | 7748 | print_ip_sym(preempt_disable_ip); |
| 7601 | pr_cont("\n"); | 7749 | pr_cont("\n"); |
| 7602 | } | 7750 | } |
| 7603 | #endif | ||
| 7604 | dump_stack(); | 7751 | dump_stack(); |
| 7752 | add_taint(TAINT_WARN, LOCKDEP_STILL_OK); | ||
| 7605 | } | 7753 | } |
| 7606 | EXPORT_SYMBOL(___might_sleep); | 7754 | EXPORT_SYMBOL(___might_sleep); |
| 7607 | #endif | 7755 | #endif |
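Two small robustness points in the ___might_sleep() hunk: the instruction pointer recorded at preempt_disable time is saved before the first printk() (which can clobber it), and the #ifdef block becomes an IS_ENABLED() condition, so the code is always parsed and type-checked even when the config option is off. The sketch below illustrates the IS_ENABLED() idiom with a trivial stand-in macro; it is not the kconfig implementation.

    /* Illustration of preferring an always-compiled IS_ENABLED() check over
     * #ifdef. The IS_ENABLED() stand-in here is a plain constant. */
    #include <stdio.h>

    #define CONFIG_DEBUG_PREEMPT_ON 0      /* flip to 1 to enable */
    #define IS_ENABLED(x) (x)              /* stand-in for the kconfig macro */

    static int preempt_count_ok(void) { return 0; }   /* stand-in */

    static void report(unsigned long preempt_disable_ip)
    {
        /* Dead-code elimination removes the body when the option is off, but
         * the compiler still sees and checks it -- unlike an #ifdef block. */
        if (IS_ENABLED(CONFIG_DEBUG_PREEMPT_ON) && !preempt_count_ok())
            printf("Preemption disabled at: %#lx\n", preempt_disable_ip);
    }

    int main(void)
    {
        unsigned long ip = 0xffffffff81000000UL;  /* captured before printing */
        report(ip);
        return 0;
    }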
| @@ -7622,12 +7770,10 @@ void normalize_rt_tasks(void) | |||
| 7622 | if (p->flags & PF_KTHREAD) | 7770 | if (p->flags & PF_KTHREAD) |
| 7623 | continue; | 7771 | continue; |
| 7624 | 7772 | ||
| 7625 | p->se.exec_start = 0; | 7773 | p->se.exec_start = 0; |
| 7626 | #ifdef CONFIG_SCHEDSTATS | 7774 | schedstat_set(p->se.statistics.wait_start, 0); |
| 7627 | p->se.statistics.wait_start = 0; | 7775 | schedstat_set(p->se.statistics.sleep_start, 0); |
| 7628 | p->se.statistics.sleep_start = 0; | 7776 | schedstat_set(p->se.statistics.block_start, 0); |
| 7629 | p->se.statistics.block_start = 0; | ||
| 7630 | #endif | ||
| 7631 | 7777 | ||
| 7632 | if (!dl_task(p) && !rt_task(p)) { | 7778 | if (!dl_task(p) && !rt_task(p)) { |
| 7633 | /* | 7779 | /* |
| @@ -7688,7 +7834,7 @@ struct task_struct *curr_task(int cpu) | |||
| 7688 | * | 7834 | * |
| 7689 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | 7835 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! |
| 7690 | */ | 7836 | */ |
| 7691 | void set_curr_task(int cpu, struct task_struct *p) | 7837 | void ia64_set_curr_task(int cpu, struct task_struct *p) |
| 7692 | { | 7838 | { |
| 7693 | cpu_curr(cpu) = p; | 7839 | cpu_curr(cpu) = p; |
| 7694 | } | 7840 | } |
| @@ -7819,10 +7965,10 @@ void sched_move_task(struct task_struct *tsk) | |||
| 7819 | 7965 | ||
| 7820 | sched_change_group(tsk, TASK_MOVE_GROUP); | 7966 | sched_change_group(tsk, TASK_MOVE_GROUP); |
| 7821 | 7967 | ||
| 7822 | if (unlikely(running)) | ||
| 7823 | tsk->sched_class->set_curr_task(rq); | ||
| 7824 | if (queued) | 7968 | if (queued) |
| 7825 | enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); | 7969 | enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); |
| 7970 | if (unlikely(running)) | ||
| 7971 | set_curr_task(rq, tsk); | ||
| 7826 | 7972 | ||
| 7827 | task_rq_unlock(rq, tsk, &rf); | 7973 | task_rq_unlock(rq, tsk, &rf); |
| 7828 | } | 7974 | } |
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index d4184498c9f5..e73119013c53 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c | |||
| @@ -31,56 +31,81 @@ static inline int right_child(int i) | |||
| 31 | return (i << 1) + 2; | 31 | return (i << 1) + 2; |
| 32 | } | 32 | } |
| 33 | 33 | ||
| 34 | static void cpudl_exchange(struct cpudl *cp, int a, int b) | 34 | static void cpudl_heapify_down(struct cpudl *cp, int idx) |
| 35 | { | 35 | { |
| 36 | int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; | 36 | int l, r, largest; |
| 37 | 37 | ||
| 38 | swap(cp->elements[a].cpu, cp->elements[b].cpu); | 38 | int orig_cpu = cp->elements[idx].cpu; |
| 39 | swap(cp->elements[a].dl , cp->elements[b].dl ); | 39 | u64 orig_dl = cp->elements[idx].dl; |
| 40 | 40 | ||
| 41 | swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx); | 41 | if (left_child(idx) >= cp->size) |
| 42 | } | 42 | return; |
| 43 | |||
| 44 | static void cpudl_heapify(struct cpudl *cp, int idx) | ||
| 45 | { | ||
| 46 | int l, r, largest; | ||
| 47 | 43 | ||
| 48 | /* adapted from lib/prio_heap.c */ | 44 | /* adapted from lib/prio_heap.c */ |
| 49 | while(1) { | 45 | while(1) { |
| 46 | u64 largest_dl; | ||
| 50 | l = left_child(idx); | 47 | l = left_child(idx); |
| 51 | r = right_child(idx); | 48 | r = right_child(idx); |
| 52 | largest = idx; | 49 | largest = idx; |
| 50 | largest_dl = orig_dl; | ||
| 53 | 51 | ||
| 54 | if ((l < cp->size) && dl_time_before(cp->elements[idx].dl, | 52 | if ((l < cp->size) && dl_time_before(orig_dl, |
| 55 | cp->elements[l].dl)) | 53 | cp->elements[l].dl)) { |
| 56 | largest = l; | 54 | largest = l; |
| 57 | if ((r < cp->size) && dl_time_before(cp->elements[largest].dl, | 55 | largest_dl = cp->elements[l].dl; |
| 58 | cp->elements[r].dl)) | 56 | } |
| 57 | if ((r < cp->size) && dl_time_before(largest_dl, | ||
| 58 | cp->elements[r].dl)) | ||
| 59 | largest = r; | 59 | largest = r; |
| 60 | |||
| 60 | if (largest == idx) | 61 | if (largest == idx) |
| 61 | break; | 62 | break; |
| 62 | 63 | ||
| 63 | /* Push idx down the heap one level and bump one up */ | 64 | /* pull largest child onto idx */ |
| 64 | cpudl_exchange(cp, largest, idx); | 65 | cp->elements[idx].cpu = cp->elements[largest].cpu; |
| 66 | cp->elements[idx].dl = cp->elements[largest].dl; | ||
| 67 | cp->elements[cp->elements[idx].cpu].idx = idx; | ||
| 65 | idx = largest; | 68 | idx = largest; |
| 66 | } | 69 | } |
| 70 | /* actual push down of saved original values orig_* */ | ||
| 71 | cp->elements[idx].cpu = orig_cpu; | ||
| 72 | cp->elements[idx].dl = orig_dl; | ||
| 73 | cp->elements[cp->elements[idx].cpu].idx = idx; | ||
| 67 | } | 74 | } |
| 68 | 75 | ||
| 69 | static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) | 76 | static void cpudl_heapify_up(struct cpudl *cp, int idx) |
| 70 | { | 77 | { |
| 71 | WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); | 78 | int p; |
| 72 | 79 | ||
| 73 | if (dl_time_before(new_dl, cp->elements[idx].dl)) { | 80 | int orig_cpu = cp->elements[idx].cpu; |
| 74 | cp->elements[idx].dl = new_dl; | 81 | u64 orig_dl = cp->elements[idx].dl; |
| 75 | cpudl_heapify(cp, idx); | 82 | |
| 76 | } else { | 83 | if (idx == 0) |
| 77 | cp->elements[idx].dl = new_dl; | 84 | return; |
| 78 | while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, | 85 | |
| 79 | cp->elements[idx].dl)) { | 86 | do { |
| 80 | cpudl_exchange(cp, idx, parent(idx)); | 87 | p = parent(idx); |
| 81 | idx = parent(idx); | 88 | if (dl_time_before(orig_dl, cp->elements[p].dl)) |
| 82 | } | 89 | break; |
| 83 | } | 90 | /* pull parent onto idx */ |
| 91 | cp->elements[idx].cpu = cp->elements[p].cpu; | ||
| 92 | cp->elements[idx].dl = cp->elements[p].dl; | ||
| 93 | cp->elements[cp->elements[idx].cpu].idx = idx; | ||
| 94 | idx = p; | ||
| 95 | } while (idx != 0); | ||
| 96 | /* actual push up of saved original values orig_* */ | ||
| 97 | cp->elements[idx].cpu = orig_cpu; | ||
| 98 | cp->elements[idx].dl = orig_dl; | ||
| 99 | cp->elements[cp->elements[idx].cpu].idx = idx; | ||
| 100 | } | ||
| 101 | |||
| 102 | static void cpudl_heapify(struct cpudl *cp, int idx) | ||
| 103 | { | ||
| 104 | if (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, | ||
| 105 | cp->elements[idx].dl)) | ||
| 106 | cpudl_heapify_up(cp, idx); | ||
| 107 | else | ||
| 108 | cpudl_heapify_down(cp, idx); | ||
| 84 | } | 109 | } |
| 85 | 110 | ||
| 86 | static inline int cpudl_maximum(struct cpudl *cp) | 111 | static inline int cpudl_maximum(struct cpudl *cp) |
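The rewritten cpudl_heapify_down()/cpudl_heapify_up() avoid a three-way swap at every level: the displaced element's cpu and deadline are saved once, larger children (or parents) are pulled over it, and the saved values are written back at the final slot, cutting the stores per level roughly in half. The same idea in a plain standalone max-heap, keyed on deadlines only and without the cpu-to-index back-map:

    /* Standalone sift-down that saves the displaced key once and writes it
     * back at its final position, instead of swapping at every level. */
    #include <stdio.h>

    static void heapify_down(unsigned long h[], int size, int idx)
    {
        unsigned long orig = h[idx];

        while (1) {
            int l = 2 * idx + 1, r = 2 * idx + 2, largest = idx;
            unsigned long largest_key = orig;

            if (l < size && h[l] > largest_key) { largest = l; largest_key = h[l]; }
            if (r < size && h[r] > largest_key)   largest = r;
            if (largest == idx)
                break;

            h[idx] = h[largest];   /* pull the larger child up one level */
            idx = largest;
        }
        h[idx] = orig;             /* single final store of the saved key */
    }

    int main(void)
    {
        /* root deadline shrank; restore the max-heap property from index 0 */
        unsigned long heap[] = { 10, 90, 80, 40, 30, 70 };
        int n = sizeof(heap) / sizeof(heap[0]);

        heapify_down(heap, n, 0);
        for (int i = 0; i < n; i++)
            printf("%lu ", heap[i]);
        printf("\n");
        return 0;
    }

The kernel version additionally keeps cp->elements[cpu].idx in sync on every move, which is why each pull updates the back-map as well.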
| @@ -120,16 +145,15 @@ out: | |||
| 120 | } | 145 | } |
| 121 | 146 | ||
| 122 | /* | 147 | /* |
| 123 | * cpudl_set - update the cpudl max-heap | 148 | * cpudl_clear - remove a cpu from the cpudl max-heap |
| 124 | * @cp: the cpudl max-heap context | 149 | * @cp: the cpudl max-heap context |
| 125 | * @cpu: the target cpu | 150 | * @cpu: the target cpu |
| 126 | * @dl: the new earliest deadline for this cpu | ||
| 127 | * | 151 | * |
| 128 | * Notes: assumes cpu_rq(cpu)->lock is locked | 152 | * Notes: assumes cpu_rq(cpu)->lock is locked |
| 129 | * | 153 | * |
| 130 | * Returns: (void) | 154 | * Returns: (void) |
| 131 | */ | 155 | */ |
| 132 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) | 156 | void cpudl_clear(struct cpudl *cp, int cpu) |
| 133 | { | 157 | { |
| 134 | int old_idx, new_cpu; | 158 | int old_idx, new_cpu; |
| 135 | unsigned long flags; | 159 | unsigned long flags; |
| @@ -137,47 +161,60 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) | |||
| 137 | WARN_ON(!cpu_present(cpu)); | 161 | WARN_ON(!cpu_present(cpu)); |
| 138 | 162 | ||
| 139 | raw_spin_lock_irqsave(&cp->lock, flags); | 163 | raw_spin_lock_irqsave(&cp->lock, flags); |
| 164 | |||
| 140 | old_idx = cp->elements[cpu].idx; | 165 | old_idx = cp->elements[cpu].idx; |
| 141 | if (!is_valid) { | 166 | if (old_idx == IDX_INVALID) { |
| 142 | /* remove item */ | 167 | /* |
| 143 | if (old_idx == IDX_INVALID) { | 168 | * Nothing to remove if old_idx was invalid. |
| 144 | /* | 169 | * This could happen if a rq_offline_dl is |
| 145 | * Nothing to remove if old_idx was invalid. | 170 | * called for a CPU without -dl tasks running. |
| 146 | * This could happen if a rq_offline_dl is | 171 | */ |
| 147 | * called for a CPU without -dl tasks running. | 172 | } else { |
| 148 | */ | ||
| 149 | goto out; | ||
| 150 | } | ||
| 151 | new_cpu = cp->elements[cp->size - 1].cpu; | 173 | new_cpu = cp->elements[cp->size - 1].cpu; |
| 152 | cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; | 174 | cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; |
| 153 | cp->elements[old_idx].cpu = new_cpu; | 175 | cp->elements[old_idx].cpu = new_cpu; |
| 154 | cp->size--; | 176 | cp->size--; |
| 155 | cp->elements[new_cpu].idx = old_idx; | 177 | cp->elements[new_cpu].idx = old_idx; |
| 156 | cp->elements[cpu].idx = IDX_INVALID; | 178 | cp->elements[cpu].idx = IDX_INVALID; |
| 157 | while (old_idx > 0 && dl_time_before( | 179 | cpudl_heapify(cp, old_idx); |
| 158 | cp->elements[parent(old_idx)].dl, | ||
| 159 | cp->elements[old_idx].dl)) { | ||
| 160 | cpudl_exchange(cp, old_idx, parent(old_idx)); | ||
| 161 | old_idx = parent(old_idx); | ||
| 162 | } | ||
| 163 | cpumask_set_cpu(cpu, cp->free_cpus); | ||
| 164 | cpudl_heapify(cp, old_idx); | ||
| 165 | 180 | ||
| 166 | goto out; | 181 | cpumask_set_cpu(cpu, cp->free_cpus); |
| 167 | } | 182 | } |
| 183 | raw_spin_unlock_irqrestore(&cp->lock, flags); | ||
| 184 | } | ||
| 185 | |||
| 186 | /* | ||
| 187 | * cpudl_set - update the cpudl max-heap | ||
| 188 | * @cp: the cpudl max-heap context | ||
| 189 | * @cpu: the target cpu | ||
| 190 | * @dl: the new earliest deadline for this cpu | ||
| 191 | * | ||
| 192 | * Notes: assumes cpu_rq(cpu)->lock is locked | ||
| 193 | * | ||
| 194 | * Returns: (void) | ||
| 195 | */ | ||
| 196 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl) | ||
| 197 | { | ||
| 198 | int old_idx; | ||
| 199 | unsigned long flags; | ||
| 168 | 200 | ||
| 201 | WARN_ON(!cpu_present(cpu)); | ||
| 202 | |||
| 203 | raw_spin_lock_irqsave(&cp->lock, flags); | ||
| 204 | |||
| 205 | old_idx = cp->elements[cpu].idx; | ||
| 169 | if (old_idx == IDX_INVALID) { | 206 | if (old_idx == IDX_INVALID) { |
| 170 | cp->size++; | 207 | int new_idx = cp->size++; |
| 171 | cp->elements[cp->size - 1].dl = dl; | 208 | cp->elements[new_idx].dl = dl; |
| 172 | cp->elements[cp->size - 1].cpu = cpu; | 209 | cp->elements[new_idx].cpu = cpu; |
| 173 | cp->elements[cpu].idx = cp->size - 1; | 210 | cp->elements[cpu].idx = new_idx; |
| 174 | cpudl_change_key(cp, cp->size - 1, dl); | 211 | cpudl_heapify_up(cp, new_idx); |
| 175 | cpumask_clear_cpu(cpu, cp->free_cpus); | 212 | cpumask_clear_cpu(cpu, cp->free_cpus); |
| 176 | } else { | 213 | } else { |
| 177 | cpudl_change_key(cp, old_idx, dl); | 214 | cp->elements[old_idx].dl = dl; |
| 215 | cpudl_heapify(cp, old_idx); | ||
| 178 | } | 216 | } |
| 179 | 217 | ||
| 180 | out: | ||
| 181 | raw_spin_unlock_irqrestore(&cp->lock, flags); | 218 | raw_spin_unlock_irqrestore(&cp->lock, flags); |
| 182 | } | 219 | } |
| 183 | 220 | ||
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index fcbdf83fed7e..f7da8c55bba0 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h | |||
| @@ -23,7 +23,8 @@ struct cpudl { | |||
| 23 | #ifdef CONFIG_SMP | 23 | #ifdef CONFIG_SMP |
| 24 | int cpudl_find(struct cpudl *cp, struct task_struct *p, | 24 | int cpudl_find(struct cpudl *cp, struct task_struct *p, |
| 25 | struct cpumask *later_mask); | 25 | struct cpumask *later_mask); |
| 26 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); | 26 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl); |
| 27 | void cpudl_clear(struct cpudl *cp, int cpu); | ||
| 27 | int cpudl_init(struct cpudl *cp); | 28 | int cpudl_init(struct cpudl *cp); |
| 28 | void cpudl_set_freecpu(struct cpudl *cp, int cpu); | 29 | void cpudl_set_freecpu(struct cpudl *cp, int cpu); |
| 29 | void cpudl_clear_freecpu(struct cpudl *cp, int cpu); | 30 | void cpudl_clear_freecpu(struct cpudl *cp, int cpu); |
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 1141954e73b4..dbc51442ecbc 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c | |||
| @@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); | |||
| 33 | */ | 33 | */ |
| 34 | void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, | 34 | void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, |
| 35 | void (*func)(struct update_util_data *data, u64 time, | 35 | void (*func)(struct update_util_data *data, u64 time, |
| 36 | unsigned long util, unsigned long max)) | 36 | unsigned int flags)) |
| 37 | { | 37 | { |
| 38 | if (WARN_ON(!data || !func)) | 38 | if (WARN_ON(!data || !func)) |
| 39 | return; | 39 | return; |
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index a84641b222c1..69e06898997d 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c | |||
| @@ -12,7 +12,6 @@ | |||
| 12 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | 12 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
| 13 | 13 | ||
| 14 | #include <linux/cpufreq.h> | 14 | #include <linux/cpufreq.h> |
| 15 | #include <linux/module.h> | ||
| 16 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
| 17 | #include <trace/events/power.h> | 16 | #include <trace/events/power.h> |
| 18 | 17 | ||
| @@ -48,11 +47,14 @@ struct sugov_cpu { | |||
| 48 | struct sugov_policy *sg_policy; | 47 | struct sugov_policy *sg_policy; |
| 49 | 48 | ||
| 50 | unsigned int cached_raw_freq; | 49 | unsigned int cached_raw_freq; |
| 50 | unsigned long iowait_boost; | ||
| 51 | unsigned long iowait_boost_max; | ||
| 52 | u64 last_update; | ||
| 51 | 53 | ||
| 52 | /* The fields below are only needed when sharing a policy. */ | 54 | /* The fields below are only needed when sharing a policy. */ |
| 53 | unsigned long util; | 55 | unsigned long util; |
| 54 | unsigned long max; | 56 | unsigned long max; |
| 55 | u64 last_update; | 57 | unsigned int flags; |
| 56 | }; | 58 | }; |
| 57 | 59 | ||
| 58 | static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); | 60 | static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); |
| @@ -144,24 +146,75 @@ static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, | |||
| 144 | return cpufreq_driver_resolve_freq(policy, freq); | 146 | return cpufreq_driver_resolve_freq(policy, freq); |
| 145 | } | 147 | } |
| 146 | 148 | ||
| 149 | static void sugov_get_util(unsigned long *util, unsigned long *max) | ||
| 150 | { | ||
| 151 | struct rq *rq = this_rq(); | ||
| 152 | unsigned long cfs_max; | ||
| 153 | |||
| 154 | cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id()); | ||
| 155 | |||
| 156 | *util = min(rq->cfs.avg.util_avg, cfs_max); | ||
| 157 | *max = cfs_max; | ||
| 158 | } | ||
| 159 | |||
| 160 | static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, | ||
| 161 | unsigned int flags) | ||
| 162 | { | ||
| 163 | if (flags & SCHED_CPUFREQ_IOWAIT) { | ||
| 164 | sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; | ||
| 165 | } else if (sg_cpu->iowait_boost) { | ||
| 166 | s64 delta_ns = time - sg_cpu->last_update; | ||
| 167 | |||
| 168 | /* Clear iowait_boost if the CPU appears to have been idle. */ | ||
| 169 | if (delta_ns > TICK_NSEC) | ||
| 170 | sg_cpu->iowait_boost = 0; | ||
| 171 | } | ||
| 172 | } | ||
| 173 | |||
| 174 | static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, | ||
| 175 | unsigned long *max) | ||
| 176 | { | ||
| 177 | unsigned long boost_util = sg_cpu->iowait_boost; | ||
| 178 | unsigned long boost_max = sg_cpu->iowait_boost_max; | ||
| 179 | |||
| 180 | if (!boost_util) | ||
| 181 | return; | ||
| 182 | |||
| 183 | if (*util * boost_max < *max * boost_util) { | ||
| 184 | *util = boost_util; | ||
| 185 | *max = boost_max; | ||
| 186 | } | ||
| 187 | sg_cpu->iowait_boost >>= 1; | ||
| 188 | } | ||
| 189 | |||
| 147 | static void sugov_update_single(struct update_util_data *hook, u64 time, | 190 | static void sugov_update_single(struct update_util_data *hook, u64 time, |
| 148 | unsigned long util, unsigned long max) | 191 | unsigned int flags) |
| 149 | { | 192 | { |
| 150 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); | 193 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); |
| 151 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; | 194 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; |
| 152 | struct cpufreq_policy *policy = sg_policy->policy; | 195 | struct cpufreq_policy *policy = sg_policy->policy; |
| 196 | unsigned long util, max; | ||
| 153 | unsigned int next_f; | 197 | unsigned int next_f; |
| 154 | 198 | ||
| 199 | sugov_set_iowait_boost(sg_cpu, time, flags); | ||
| 200 | sg_cpu->last_update = time; | ||
| 201 | |||
| 155 | if (!sugov_should_update_freq(sg_policy, time)) | 202 | if (!sugov_should_update_freq(sg_policy, time)) |
| 156 | return; | 203 | return; |
| 157 | 204 | ||
| 158 | next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq : | 205 | if (flags & SCHED_CPUFREQ_RT_DL) { |
| 159 | get_next_freq(sg_cpu, util, max); | 206 | next_f = policy->cpuinfo.max_freq; |
| 207 | } else { | ||
| 208 | sugov_get_util(&util, &max); | ||
| 209 | sugov_iowait_boost(sg_cpu, &util, &max); | ||
| 210 | next_f = get_next_freq(sg_cpu, util, max); | ||
| 211 | } | ||
| 160 | sugov_update_commit(sg_policy, time, next_f); | 212 | sugov_update_commit(sg_policy, time, next_f); |
| 161 | } | 213 | } |
| 162 | 214 | ||
| 163 | static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, | 215 | static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, |
| 164 | unsigned long util, unsigned long max) | 216 | unsigned long util, unsigned long max, |
| 217 | unsigned int flags) | ||
| 165 | { | 218 | { |
| 166 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; | 219 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; |
| 167 | struct cpufreq_policy *policy = sg_policy->policy; | 220 | struct cpufreq_policy *policy = sg_policy->policy; |
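The schedutil hunks add an "iowait boost": on a wakeup flagged as coming from I/O the boost jumps to its maximum, and every later update halves it; when choosing a frequency, the boosted ratio replaces the utilization ratio only if it is larger, compared by cross-multiplication so no division is needed. A compact standalone model of that decay and comparison is below; the numeric values are arbitrary and the struct is illustrative.

    /* Model of schedutil's iowait boost: set to max on an I/O wakeup, halved
     * on every later update, applied via a cross-multiplied ratio compare. */
    #include <stdio.h>

    struct boost { unsigned long cur, max; };

    static void note_update(struct boost *b, int iowait_wakeup)
    {
        if (iowait_wakeup)
            b->cur = b->max;      /* full boost right after an I/O wakeup */
        else
            b->cur >>= 1;         /* decay by half on every other update */
    }

    /* Use the boost if cur/max exceeds util/max_cap, i.e. if
     * util * boost_max < max_cap * boost_cur, avoiding any division. */
    static void apply_boost(const struct boost *b,
                            unsigned long *util, unsigned long *max)
    {
        if (!b->cur)
            return;
        if (*util * b->max < *max * b->cur) {
            *util = b->cur;
            *max  = b->max;
        }
    }

    int main(void)
    {
        struct boost b = { .cur = 0, .max = 1800000 };  /* kHz-style ceiling */
        unsigned long util = 200, max = 1024;

        note_update(&b, 1);                   /* wakeup from I/O */
        for (int i = 0; i < 4; i++) {
            unsigned long u = util, m = max;
            apply_boost(&b, &u, &m);
            printf("update %d: util/max = %lu/%lu\n", i, u, m);
            note_update(&b, 0);               /* decay on later updates */
        }
        return 0;
    }

The halving means the boost fades within a handful of updates once the I/O burst stops, so it lifts frequency for bursty I/O without pinning it high.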
| @@ -169,9 +222,11 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, | |||
| 169 | u64 last_freq_update_time = sg_policy->last_freq_update_time; | 222 | u64 last_freq_update_time = sg_policy->last_freq_update_time; |
| 170 | unsigned int j; | 223 | unsigned int j; |
| 171 | 224 | ||
| 172 | if (util == ULONG_MAX) | 225 | if (flags & SCHED_CPUFREQ_RT_DL) |
| 173 | return max_f; | 226 | return max_f; |
| 174 | 227 | ||
| 228 | sugov_iowait_boost(sg_cpu, &util, &max); | ||
| 229 | |||
| 175 | for_each_cpu(j, policy->cpus) { | 230 | for_each_cpu(j, policy->cpus) { |
| 176 | struct sugov_cpu *j_sg_cpu; | 231 | struct sugov_cpu *j_sg_cpu; |
| 177 | unsigned long j_util, j_max; | 232 | unsigned long j_util, j_max; |
| @@ -186,41 +241,50 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, | |||
| 186 | * frequency update and the time elapsed between the last update | 241 | * frequency update and the time elapsed between the last update |
| 187 | * of the CPU utilization and the last frequency update is long | 242 | * of the CPU utilization and the last frequency update is long |
| 188 | * enough, don't take the CPU into account as it probably is | 243 | * enough, don't take the CPU into account as it probably is |
| 189 | * idle now. | 244 | * idle now (and clear iowait_boost for it). |
| 190 | */ | 245 | */ |
| 191 | delta_ns = last_freq_update_time - j_sg_cpu->last_update; | 246 | delta_ns = last_freq_update_time - j_sg_cpu->last_update; |
| 192 | if (delta_ns > TICK_NSEC) | 247 | if (delta_ns > TICK_NSEC) { |
| 248 | j_sg_cpu->iowait_boost = 0; | ||
| 193 | continue; | 249 | continue; |
| 194 | 250 | } | |
| 195 | j_util = j_sg_cpu->util; | 251 | if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) |
| 196 | if (j_util == ULONG_MAX) | ||
| 197 | return max_f; | 252 | return max_f; |
| 198 | 253 | ||
| 254 | j_util = j_sg_cpu->util; | ||
| 199 | j_max = j_sg_cpu->max; | 255 | j_max = j_sg_cpu->max; |
| 200 | if (j_util * max > j_max * util) { | 256 | if (j_util * max > j_max * util) { |
| 201 | util = j_util; | 257 | util = j_util; |
| 202 | max = j_max; | 258 | max = j_max; |
| 203 | } | 259 | } |
| 260 | |||
| 261 | sugov_iowait_boost(j_sg_cpu, &util, &max); | ||
| 204 | } | 262 | } |
| 205 | 263 | ||
| 206 | return get_next_freq(sg_cpu, util, max); | 264 | return get_next_freq(sg_cpu, util, max); |
| 207 | } | 265 | } |
| 208 | 266 | ||
| 209 | static void sugov_update_shared(struct update_util_data *hook, u64 time, | 267 | static void sugov_update_shared(struct update_util_data *hook, u64 time, |
| 210 | unsigned long util, unsigned long max) | 268 | unsigned int flags) |
| 211 | { | 269 | { |
| 212 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); | 270 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); |
| 213 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; | 271 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; |
| 272 | unsigned long util, max; | ||
| 214 | unsigned int next_f; | 273 | unsigned int next_f; |
| 215 | 274 | ||
| 275 | sugov_get_util(&util, &max); | ||
| 276 | |||
| 216 | raw_spin_lock(&sg_policy->update_lock); | 277 | raw_spin_lock(&sg_policy->update_lock); |
| 217 | 278 | ||
| 218 | sg_cpu->util = util; | 279 | sg_cpu->util = util; |
| 219 | sg_cpu->max = max; | 280 | sg_cpu->max = max; |
| 281 | sg_cpu->flags = flags; | ||
| 282 | |||
| 283 | sugov_set_iowait_boost(sg_cpu, time, flags); | ||
| 220 | sg_cpu->last_update = time; | 284 | sg_cpu->last_update = time; |
| 221 | 285 | ||
| 222 | if (sugov_should_update_freq(sg_policy, time)) { | 286 | if (sugov_should_update_freq(sg_policy, time)) { |
| 223 | next_f = sugov_next_freq_shared(sg_cpu, util, max); | 287 | next_f = sugov_next_freq_shared(sg_cpu, util, max, flags); |
| 224 | sugov_update_commit(sg_policy, time, next_f); | 288 | sugov_update_commit(sg_policy, time, next_f); |
| 225 | } | 289 | } |
| 226 | 290 | ||
| @@ -444,10 +508,13 @@ static int sugov_start(struct cpufreq_policy *policy) | |||
| 444 | 508 | ||
| 445 | sg_cpu->sg_policy = sg_policy; | 509 | sg_cpu->sg_policy = sg_policy; |
| 446 | if (policy_is_shared(policy)) { | 510 | if (policy_is_shared(policy)) { |
| 447 | sg_cpu->util = ULONG_MAX; | 511 | sg_cpu->util = 0; |
| 448 | sg_cpu->max = 0; | 512 | sg_cpu->max = 0; |
| 513 | sg_cpu->flags = SCHED_CPUFREQ_RT; | ||
| 449 | sg_cpu->last_update = 0; | 514 | sg_cpu->last_update = 0; |
| 450 | sg_cpu->cached_raw_freq = 0; | 515 | sg_cpu->cached_raw_freq = 0; |
| 516 | sg_cpu->iowait_boost = 0; | ||
| 517 | sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; | ||
| 451 | cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, | 518 | cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, |
| 452 | sugov_update_shared); | 519 | sugov_update_shared); |
| 453 | } else { | 520 | } else { |
| @@ -495,28 +562,15 @@ static struct cpufreq_governor schedutil_gov = { | |||
| 495 | .limits = sugov_limits, | 562 | .limits = sugov_limits, |
| 496 | }; | 563 | }; |
| 497 | 564 | ||
| 498 | static int __init sugov_module_init(void) | ||
| 499 | { | ||
| 500 | return cpufreq_register_governor(&schedutil_gov); | ||
| 501 | } | ||
| 502 | |||
| 503 | static void __exit sugov_module_exit(void) | ||
| 504 | { | ||
| 505 | cpufreq_unregister_governor(&schedutil_gov); | ||
| 506 | } | ||
| 507 | |||
| 508 | MODULE_AUTHOR("Rafael J. Wysocki <rafael.j.wysocki@intel.com>"); | ||
| 509 | MODULE_DESCRIPTION("Utilization-based CPU frequency selection"); | ||
| 510 | MODULE_LICENSE("GPL"); | ||
| 511 | |||
| 512 | #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL | 565 | #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL |
| 513 | struct cpufreq_governor *cpufreq_default_governor(void) | 566 | struct cpufreq_governor *cpufreq_default_governor(void) |
| 514 | { | 567 | { |
| 515 | return &schedutil_gov; | 568 | return &schedutil_gov; |
| 516 | } | 569 | } |
| 517 | |||
| 518 | fs_initcall(sugov_module_init); | ||
| 519 | #else | ||
| 520 | module_init(sugov_module_init); | ||
| 521 | #endif | 570 | #endif |
| 522 | module_exit(sugov_module_exit); | 571 | |
| 572 | static int __init sugov_register(void) | ||
| 573 | { | ||
| 574 | return cpufreq_register_governor(&schedutil_gov); | ||
| 575 | } | ||
| 576 | fs_initcall(sugov_register); | ||
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 9858266fb0b3..5ebee3164e64 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
| @@ -23,10 +23,8 @@ | |||
| 23 | * task when irq is in progress while we read rq->clock. That is a worthy | 23 | * task when irq is in progress while we read rq->clock. That is a worthy |
| 24 | * compromise in place of having locks on each irq in account_system_time. | 24 | * compromise in place of having locks on each irq in account_system_time. |
| 25 | */ | 25 | */ |
| 26 | DEFINE_PER_CPU(u64, cpu_hardirq_time); | 26 | DEFINE_PER_CPU(struct irqtime, cpu_irqtime); |
| 27 | DEFINE_PER_CPU(u64, cpu_softirq_time); | ||
| 28 | 27 | ||
| 29 | static DEFINE_PER_CPU(u64, irq_start_time); | ||
| 30 | static int sched_clock_irqtime; | 28 | static int sched_clock_irqtime; |
| 31 | 29 | ||
| 32 | void enable_sched_clock_irqtime(void) | 30 | void enable_sched_clock_irqtime(void) |
| @@ -39,16 +37,13 @@ void disable_sched_clock_irqtime(void) | |||
| 39 | sched_clock_irqtime = 0; | 37 | sched_clock_irqtime = 0; |
| 40 | } | 38 | } |
| 41 | 39 | ||
| 42 | #ifndef CONFIG_64BIT | ||
| 43 | DEFINE_PER_CPU(seqcount_t, irq_time_seq); | ||
| 44 | #endif /* CONFIG_64BIT */ | ||
| 45 | |||
| 46 | /* | 40 | /* |
| 47 | * Called before incrementing preempt_count on {soft,}irq_enter | 41 | * Called before incrementing preempt_count on {soft,}irq_enter |
| 48 | * and before decrementing preempt_count on {soft,}irq_exit. | 42 | * and before decrementing preempt_count on {soft,}irq_exit. |
| 49 | */ | 43 | */ |
| 50 | void irqtime_account_irq(struct task_struct *curr) | 44 | void irqtime_account_irq(struct task_struct *curr) |
| 51 | { | 45 | { |
| 46 | struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); | ||
| 52 | s64 delta; | 47 | s64 delta; |
| 53 | int cpu; | 48 | int cpu; |
| 54 | 49 | ||
| @@ -56,10 +51,10 @@ void irqtime_account_irq(struct task_struct *curr) | |||
| 56 | return; | 51 | return; |
| 57 | 52 | ||
| 58 | cpu = smp_processor_id(); | 53 | cpu = smp_processor_id(); |
| 59 | delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); | 54 | delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; |
| 60 | __this_cpu_add(irq_start_time, delta); | 55 | irqtime->irq_start_time += delta; |
| 61 | 56 | ||
| 62 | irq_time_write_begin(); | 57 | u64_stats_update_begin(&irqtime->sync); |
| 63 | /* | 58 | /* |
| 64 | * We do not account for softirq time from ksoftirqd here. | 59 | * We do not account for softirq time from ksoftirqd here. |
| 65 | * We want to continue accounting softirq time to ksoftirqd thread | 60 | * We want to continue accounting softirq time to ksoftirqd thread |
| @@ -67,42 +62,36 @@ void irqtime_account_irq(struct task_struct *curr) | |||
| 67 | * that do not consume any time, but still wants to run. | 62 | * that do not consume any time, but still wants to run. |
| 68 | */ | 63 | */ |
| 69 | if (hardirq_count()) | 64 | if (hardirq_count()) |
| 70 | __this_cpu_add(cpu_hardirq_time, delta); | 65 | irqtime->hardirq_time += delta; |
| 71 | else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) | 66 | else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) |
| 72 | __this_cpu_add(cpu_softirq_time, delta); | 67 | irqtime->softirq_time += delta; |
| 73 | 68 | ||
| 74 | irq_time_write_end(); | 69 | u64_stats_update_end(&irqtime->sync); |
| 75 | } | 70 | } |
| 76 | EXPORT_SYMBOL_GPL(irqtime_account_irq); | 71 | EXPORT_SYMBOL_GPL(irqtime_account_irq); |
| 77 | 72 | ||
| 78 | static cputime_t irqtime_account_hi_update(cputime_t maxtime) | 73 | static cputime_t irqtime_account_update(u64 irqtime, int idx, cputime_t maxtime) |
| 79 | { | 74 | { |
| 80 | u64 *cpustat = kcpustat_this_cpu->cpustat; | 75 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
| 81 | unsigned long flags; | ||
| 82 | cputime_t irq_cputime; | 76 | cputime_t irq_cputime; |
| 83 | 77 | ||
| 84 | local_irq_save(flags); | 78 | irq_cputime = nsecs_to_cputime64(irqtime) - cpustat[idx]; |
| 85 | irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) - | ||
| 86 | cpustat[CPUTIME_IRQ]; | ||
| 87 | irq_cputime = min(irq_cputime, maxtime); | 79 | irq_cputime = min(irq_cputime, maxtime); |
| 88 | cpustat[CPUTIME_IRQ] += irq_cputime; | 80 | cpustat[idx] += irq_cputime; |
| 89 | local_irq_restore(flags); | 81 | |
| 90 | return irq_cputime; | 82 | return irq_cputime; |
| 91 | } | 83 | } |
| 92 | 84 | ||
| 93 | static cputime_t irqtime_account_si_update(cputime_t maxtime) | 85 | static cputime_t irqtime_account_hi_update(cputime_t maxtime) |
| 94 | { | 86 | { |
| 95 | u64 *cpustat = kcpustat_this_cpu->cpustat; | 87 | return irqtime_account_update(__this_cpu_read(cpu_irqtime.hardirq_time), |
| 96 | unsigned long flags; | 88 | CPUTIME_IRQ, maxtime); |
| 97 | cputime_t softirq_cputime; | 89 | } |
| 98 | 90 | ||
| 99 | local_irq_save(flags); | 91 | static cputime_t irqtime_account_si_update(cputime_t maxtime) |
| 100 | softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) - | 92 | { |
| 101 | cpustat[CPUTIME_SOFTIRQ]; | 93 | return irqtime_account_update(__this_cpu_read(cpu_irqtime.softirq_time), |
| 102 | softirq_cputime = min(softirq_cputime, maxtime); | 94 | CPUTIME_SOFTIRQ, maxtime); |
| 103 | cpustat[CPUTIME_SOFTIRQ] += softirq_cputime; | ||
| 104 | local_irq_restore(flags); | ||
| 105 | return softirq_cputime; | ||
| 106 | } | 95 | } |
| 107 | 96 | ||
| 108 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | 97 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
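Grouping the per-CPU irq time counters into one struct lets them be protected by a u64_stats-style sequence counter, so 32-bit readers can retry around a concurrent update instead of the writer taking an irq-save lock. The sketch below is a user-space model of that writer/reader protocol with a bare sequence counter standing in for the u64_stats helpers; it assumes a single writer per structure, as the per-CPU usage does.

    /* Model of a seqcount-protected pair of 64-bit counters: the writer bumps
     * the sequence around updates, readers retry if they raced with a write. */
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    struct irqtime {
        uint64_t hardirq_time;
        uint64_t softirq_time;
        _Atomic unsigned seq;
    };

    static void account(struct irqtime *it, uint64_t delta, int hardirq)
    {
        atomic_store_explicit(&it->seq, it->seq + 1, memory_order_release);
        if (hardirq)
            it->hardirq_time += delta;
        else
            it->softirq_time += delta;
        atomic_store_explicit(&it->seq, it->seq + 1, memory_order_release);
    }

    static uint64_t read_total(struct irqtime *it)
    {
        unsigned s;
        uint64_t sum;

        do {   /* retry while a write is in progress or raced with us */
            s = atomic_load_explicit(&it->seq, memory_order_acquire);
            sum = it->hardirq_time + it->softirq_time;
        } while ((s & 1) ||
                 s != atomic_load_explicit(&it->seq, memory_order_acquire));
        return sum;
    }

    int main(void)
    {
        struct irqtime it = { 0, 0, 0 };
        account(&it, 1500, 1);
        account(&it, 700, 0);
        printf("irq time total: %llu ns\n",
               (unsigned long long)read_total(&it));
        return 0;
    }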
| @@ -263,6 +252,11 @@ void account_idle_time(cputime_t cputime) | |||
| 263 | cpustat[CPUTIME_IDLE] += (__force u64) cputime; | 252 | cpustat[CPUTIME_IDLE] += (__force u64) cputime; |
| 264 | } | 253 | } |
| 265 | 254 | ||
| 255 | /* | ||
| 256 | * When a guest is interrupted for a longer amount of time, missed clock | ||
| 257 | * ticks are not redelivered later. Due to that, this function may on | ||
| 258 | * occasion account more time than the calling functions think elapsed. | ||
| 259 | */ | ||
| 266 | static __always_inline cputime_t steal_account_process_time(cputime_t maxtime) | 260 | static __always_inline cputime_t steal_account_process_time(cputime_t maxtime) |
| 267 | { | 261 | { |
| 268 | #ifdef CONFIG_PARAVIRT | 262 | #ifdef CONFIG_PARAVIRT |
| @@ -290,6 +284,9 @@ static inline cputime_t account_other_time(cputime_t max) | |||
| 290 | { | 284 | { |
| 291 | cputime_t accounted; | 285 | cputime_t accounted; |
| 292 | 286 | ||
| 287 | /* Shall be converted to a lockdep-enabled lightweight check */ | ||
| 288 | WARN_ON_ONCE(!irqs_disabled()); | ||
| 289 | |||
| 293 | accounted = steal_account_process_time(max); | 290 | accounted = steal_account_process_time(max); |
| 294 | 291 | ||
| 295 | if (accounted < max) | 292 | if (accounted < max) |
| @@ -301,6 +298,26 @@ static inline cputime_t account_other_time(cputime_t max) | |||
| 301 | return accounted; | 298 | return accounted; |
| 302 | } | 299 | } |
| 303 | 300 | ||
| 301 | #ifdef CONFIG_64BIT | ||
| 302 | static inline u64 read_sum_exec_runtime(struct task_struct *t) | ||
| 303 | { | ||
| 304 | return t->se.sum_exec_runtime; | ||
| 305 | } | ||
| 306 | #else | ||
| 307 | static u64 read_sum_exec_runtime(struct task_struct *t) | ||
| 308 | { | ||
| 309 | u64 ns; | ||
| 310 | struct rq_flags rf; | ||
| 311 | struct rq *rq; | ||
| 312 | |||
| 313 | rq = task_rq_lock(t, &rf); | ||
| 314 | ns = t->se.sum_exec_runtime; | ||
| 315 | task_rq_unlock(rq, t, &rf); | ||
| 316 | |||
| 317 | return ns; | ||
| 318 | } | ||
| 319 | #endif | ||
| 320 | |||
| 304 | /* | 321 | /* |
| 305 | * Accumulate raw cputime values of dead tasks (sig->[us]time) and live | 322 | * Accumulate raw cputime values of dead tasks (sig->[us]time) and live |
| 306 | * tasks (sum on group iteration) belonging to @tsk's group. | 323 | * tasks (sum on group iteration) belonging to @tsk's group. |
| @@ -313,6 +330,17 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
| 313 | unsigned int seq, nextseq; | 330 | unsigned int seq, nextseq; |
| 314 | unsigned long flags; | 331 | unsigned long flags; |
| 315 | 332 | ||
| 333 | /* | ||
| 334 | * Update current task runtime to account pending time since last | ||
| 335 | * scheduler action or thread_group_cputime() call. This thread group | ||
| 336 | * might have other running tasks on different CPUs, but updating | ||
| 337 | * their runtime can affect syscall performance, so we skip accounting | ||
| 338 | * those pending times and rely only on values updated on tick or | ||
| 339 | * other scheduler action. | ||
| 340 | */ | ||
| 341 | if (same_thread_group(current, tsk)) | ||
| 342 | (void) task_sched_runtime(current); | ||
| 343 | |||
| 316 | rcu_read_lock(); | 344 | rcu_read_lock(); |
| 317 | /* Attempt a lockless read on the first round. */ | 345 | /* Attempt a lockless read on the first round. */ |
| 318 | nextseq = 0; | 346 | nextseq = 0; |
| @@ -327,7 +355,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
| 327 | task_cputime(t, &utime, &stime); | 355 | task_cputime(t, &utime, &stime); |
| 328 | times->utime += utime; | 356 | times->utime += utime; |
| 329 | times->stime += stime; | 357 | times->stime += stime; |
| 330 | times->sum_exec_runtime += task_sched_runtime(t); | 358 | times->sum_exec_runtime += read_sum_exec_runtime(t); |
| 331 | } | 359 | } |
| 332 | /* If lockless access failed, take the lock. */ | 360 | /* If lockless access failed, take the lock. */ |
| 333 | nextseq = 1; | 361 | nextseq = 1; |
| @@ -371,7 +399,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | |||
| 371 | * idle, or potentially user or system time. Due to rounding, | 399 | * idle, or potentially user or system time. Due to rounding, |
| 372 | * other time can exceed ticks occasionally. | 400 | * other time can exceed ticks occasionally. |
| 373 | */ | 401 | */ |
| 374 | other = account_other_time(cputime); | 402 | other = account_other_time(ULONG_MAX); |
| 375 | if (other >= cputime) | 403 | if (other >= cputime) |
| 376 | return; | 404 | return; |
| 377 | cputime -= other; | 405 | cputime -= other; |
| @@ -486,7 +514,7 @@ void account_process_tick(struct task_struct *p, int user_tick) | |||
| 486 | } | 514 | } |
| 487 | 515 | ||
| 488 | cputime = cputime_one_jiffy; | 516 | cputime = cputime_one_jiffy; |
| 489 | steal = steal_account_process_time(cputime); | 517 | steal = steal_account_process_time(ULONG_MAX); |
| 490 | 518 | ||
| 491 | if (steal >= cputime) | 519 | if (steal >= cputime) |
| 492 | return; | 520 | return; |
| @@ -516,7 +544,7 @@ void account_idle_ticks(unsigned long ticks) | |||
| 516 | } | 544 | } |
| 517 | 545 | ||
| 518 | cputime = jiffies_to_cputime(ticks); | 546 | cputime = jiffies_to_cputime(ticks); |
| 519 | steal = steal_account_process_time(cputime); | 547 | steal = steal_account_process_time(ULONG_MAX); |
| 520 | 548 | ||
| 521 | if (steal >= cputime) | 549 | if (steal >= cputime) |
| 522 | return; | 550 | return; |
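Both tick hunks above stop clamping the steal query to one tick: steal_account_process_time(ULONG_MAX) drains everything that is pending, and the tick is charged to the task only if the stolen time did not already cover it. A hedged sketch of that control flow (pending_other_time is an illustrative stand-in for the paravirt steal counter):

#include <limits.h>

static unsigned long pending_other_time;	/* pretend steal/irq backlog */

static unsigned long account_other_time_model(unsigned long max)
{
	unsigned long other = pending_other_time;

	if (other > max)
		other = max;
	pending_other_time -= other;
	return other;
}

/* Charge one tick: "other" time is drained in full (ULONG_MAX), and the
 * tick goes to user/system time only if something is left of it. */
static unsigned long charge_tick_model(unsigned long one_tick)
{
	unsigned long other = account_other_time_model(ULONG_MAX);

	if (other >= one_tick)
		return 0;		/* tick fully covered by other time */
	return one_tick - other;	/* remainder is charged to the task */
}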
| @@ -614,19 +642,25 @@ static void cputime_adjust(struct task_cputime *curr, | |||
| 614 | stime = curr->stime; | 642 | stime = curr->stime; |
| 615 | utime = curr->utime; | 643 | utime = curr->utime; |
| 616 | 644 | ||
| 617 | if (utime == 0) { | 645 | /* |
| 618 | stime = rtime; | 646 | * If either stime or both stime and utime are 0, assume all runtime is |
| 647 | * userspace. Once a task gets some ticks, the monotonicity code at | ||
| 648 | * 'update' will ensure things converge to the observed ratio. | ||
| 649 | */ | ||
| 650 | if (stime == 0) { | ||
| 651 | utime = rtime; | ||
| 619 | goto update; | 652 | goto update; |
| 620 | } | 653 | } |
| 621 | 654 | ||
| 622 | if (stime == 0) { | 655 | if (utime == 0) { |
| 623 | utime = rtime; | 656 | stime = rtime; |
| 624 | goto update; | 657 | goto update; |
| 625 | } | 658 | } |
| 626 | 659 | ||
| 627 | stime = scale_stime((__force u64)stime, (__force u64)rtime, | 660 | stime = scale_stime((__force u64)stime, (__force u64)rtime, |
| 628 | (__force u64)(stime + utime)); | 661 | (__force u64)(stime + utime)); |
| 629 | 662 | ||
| 663 | update: | ||
| 630 | /* | 664 | /* |
| 631 | * Make sure stime doesn't go backwards; this preserves monotonicity | 665 | * Make sure stime doesn't go backwards; this preserves monotonicity |
| 632 | * for utime because rtime is monotonic. | 666 | * for utime because rtime is monotonic. |
| @@ -649,7 +683,6 @@ static void cputime_adjust(struct task_cputime *curr, | |||
| 649 | stime = rtime - utime; | 683 | stime = rtime - utime; |
| 650 | } | 684 | } |
| 651 | 685 | ||
| 652 | update: | ||
| 653 | prev->stime = stime; | 686 | prev->stime = stime; |
| 654 | prev->utime = utime; | 687 | prev->utime = utime; |
| 655 | out: | 688 | out: |
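The reordered cputime_adjust() above first handles the degenerate cases (no system ticks, then no user ticks), then splits the monotonic rtime in the observed stime:utime ratio, and finally clamps both values so neither report ever goes backwards. A user-space rendition of that arithmetic, with prev_stime/prev_utime standing in for prev->stime/prev->utime and a gcc/clang __int128 doing the widening multiply of scale_stime():

#include <stdint.h>

static uint64_t scale_stime_model(uint64_t stime, uint64_t rtime,
				  uint64_t total)
{
	return (uint64_t)(((unsigned __int128)stime * rtime) / total);
}

static void cputime_adjust_model(uint64_t rtime, uint64_t stime, uint64_t utime,
				 uint64_t *prev_stime, uint64_t *prev_utime)
{
	if (stime == 0) {		/* no system ticks yet: all user time */
		utime = rtime;
		goto update;
	}
	if (utime == 0) {		/* no user ticks yet: all system time */
		stime = rtime;
		goto update;
	}

	stime = scale_stime_model(stime, rtime, stime + utime);

update:
	if (stime < *prev_stime)	/* stime must not go backwards */
		stime = *prev_stime;
	utime = rtime - stime;
	if (utime < *prev_utime) {	/* neither must utime */
		utime = *prev_utime;
		stime = rtime - utime;
	}
	*prev_stime = stime;
	*prev_utime = utime;
}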
| @@ -694,6 +727,13 @@ static cputime_t get_vtime_delta(struct task_struct *tsk) | |||
| 694 | unsigned long now = READ_ONCE(jiffies); | 727 | unsigned long now = READ_ONCE(jiffies); |
| 695 | cputime_t delta, other; | 728 | cputime_t delta, other; |
| 696 | 729 | ||
| 730 | /* | ||
| 731 | * Unlike tick based timing, vtime based timing never has lost | ||
| 732 | * ticks, and there is no need for steal time accounting to make up for | ||
| 733 | * lost ticks. Vtime accounts a rounded version of actual | ||
| 734 | * elapsed time. Limit account_other_time to prevent rounding | ||
| 735 | * errors from causing elapsed vtime to go negative. | ||
| 736 | */ | ||
| 697 | delta = jiffies_to_cputime(now - tsk->vtime_snap); | 737 | delta = jiffies_to_cputime(now - tsk->vtime_snap); |
| 698 | other = account_other_time(delta); | 738 | other = account_other_time(delta); |
| 699 | WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); | 739 | WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); |
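The new comment in get_vtime_delta() explains why this path, unlike the tick paths above, still bounds account_other_time() by the locally elapsed delta: vtime never loses ticks, so any steal/irq time in excess of the delta can only be a rounding artifact and would drive the remaining vtime negative. A small sketch of that clamp, with illustrative names:

#include <stdint.h>

/* Elapsed jiffies since the last snapshot, minus whatever of it was
 * really steal/irq time -- never negative by construction. */
static uint64_t get_vtime_delta_model(uint64_t now_jiffies,
				      uint64_t *vtime_snap,
				      uint64_t other_available)
{
	uint64_t delta = now_jiffies - *vtime_snap;
	uint64_t other = other_available < delta ? other_available : delta;

	*vtime_snap = now_jiffies;
	return delta - other;
}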
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 1ce8867283dc..37e2449186c4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
| @@ -243,10 +243,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); | |||
| 243 | static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p) | 243 | static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p) |
| 244 | { | 244 | { |
| 245 | struct rq *later_rq = NULL; | 245 | struct rq *later_rq = NULL; |
| 246 | bool fallback = false; | ||
| 247 | 246 | ||
| 248 | later_rq = find_lock_later_rq(p, rq); | 247 | later_rq = find_lock_later_rq(p, rq); |
| 249 | |||
| 250 | if (!later_rq) { | 248 | if (!later_rq) { |
| 251 | int cpu; | 249 | int cpu; |
| 252 | 250 | ||
| @@ -254,7 +252,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p | |||
| 254 | * If we cannot preempt any rq, fall back to pick any | 252 | * If we cannot preempt any rq, fall back to pick any |
| 255 | * online cpu. | 253 | * online cpu. |
| 256 | */ | 254 | */ |
| 257 | fallback = true; | ||
| 258 | cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); | 255 | cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); |
| 259 | if (cpu >= nr_cpu_ids) { | 256 | if (cpu >= nr_cpu_ids) { |
| 260 | /* | 257 | /* |
| @@ -274,16 +271,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p | |||
| 274 | double_lock_balance(rq, later_rq); | 271 | double_lock_balance(rq, later_rq); |
| 275 | } | 272 | } |
| 276 | 273 | ||
| 277 | /* | ||
| 278 | * By now the task is replenished and enqueued; migrate it. | ||
| 279 | */ | ||
| 280 | deactivate_task(rq, p, 0); | ||
| 281 | set_task_cpu(p, later_rq->cpu); | 274 | set_task_cpu(p, later_rq->cpu); |
| 282 | activate_task(later_rq, p, 0); | ||
| 283 | |||
| 284 | if (!fallback) | ||
| 285 | resched_curr(later_rq); | ||
| 286 | |||
| 287 | double_unlock_balance(later_rq, rq); | 275 | double_unlock_balance(later_rq, rq); |
| 288 | 276 | ||
| 289 | return later_rq; | 277 | return later_rq; |
| @@ -346,12 +334,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, | |||
| 346 | * one, and to (try to!) reconcile itself with its own scheduling | 334 | * one, and to (try to!) reconcile itself with its own scheduling |
| 347 | * parameters. | 335 | * parameters. |
| 348 | */ | 336 | */ |
| 349 | static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, | 337 | static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) |
| 350 | struct sched_dl_entity *pi_se) | ||
| 351 | { | 338 | { |
| 352 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | 339 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); |
| 353 | struct rq *rq = rq_of_dl_rq(dl_rq); | 340 | struct rq *rq = rq_of_dl_rq(dl_rq); |
| 354 | 341 | ||
| 342 | WARN_ON(dl_se->dl_boosted); | ||
| 355 | WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); | 343 | WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); |
| 356 | 344 | ||
| 357 | /* | 345 | /* |
| @@ -367,8 +355,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, | |||
| 367 | * future; in fact, we must consider execution overheads (time | 355 | * future; in fact, we must consider execution overheads (time |
| 368 | * spent on hardirq context, etc.). | 356 | * spent on hardirq context, etc.). |
| 369 | */ | 357 | */ |
| 370 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | 358 | dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline; |
| 371 | dl_se->runtime = pi_se->dl_runtime; | 359 | dl_se->runtime = dl_se->dl_runtime; |
| 372 | } | 360 | } |
| 373 | 361 | ||
| 374 | /* | 362 | /* |
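With the pi_se argument gone, setup_new_dl_entity() above replenishes a deadline entity purely from its own parameters: the absolute deadline becomes rq_clock(rq) plus the relative deadline, and the runtime budget is refilled. A stripped-down model of that replenishment (nanosecond clock, field names mirroring sched_dl_entity):

#include <stdint.h>

struct dl_entity_model {
	/* static parameters set via sched_setattr() */
	uint64_t dl_runtime;	/* budget per period */
	uint64_t dl_deadline;	/* relative deadline */
	/* dynamic state */
	uint64_t deadline;	/* absolute deadline */
	int64_t  runtime;	/* remaining budget */
};

static void setup_new_dl_entity_model(struct dl_entity_model *dl_se,
				      uint64_t rq_clock_now)
{
	/* The new parameters take effect from now on. */
	dl_se->deadline = rq_clock_now + dl_se->dl_deadline;
	dl_se->runtime  = dl_se->dl_runtime;
}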
| @@ -641,29 +629,31 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
| 641 | goto unlock; | 629 | goto unlock; |
| 642 | } | 630 | } |
| 643 | 631 | ||
| 644 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); | ||
| 645 | if (dl_task(rq->curr)) | ||
| 646 | check_preempt_curr_dl(rq, p, 0); | ||
| 647 | else | ||
| 648 | resched_curr(rq); | ||
| 649 | |||
| 650 | #ifdef CONFIG_SMP | 632 | #ifdef CONFIG_SMP |
| 651 | /* | ||
| 652 | * Perform balancing operations here; after the replenishments. We | ||
| 653 | * cannot drop rq->lock before this, otherwise the assertion in | ||
| 654 | * start_dl_timer() about not missing updates is not true. | ||
| 655 | * | ||
| 656 | * If we find that the rq the task was on is no longer available, we | ||
| 657 | * need to select a new rq. | ||
| 658 | * | ||
| 659 | * XXX figure out if select_task_rq_dl() deals with offline cpus. | ||
| 660 | */ | ||
| 661 | if (unlikely(!rq->online)) { | 633 | if (unlikely(!rq->online)) { |
| 634 | /* | ||
| 635 | * If the runqueue is no longer available, migrate the | ||
| 636 | * task elsewhere. This necessarily changes rq. | ||
| 637 | */ | ||
| 662 | lockdep_unpin_lock(&rq->lock, rf.cookie); | 638 | lockdep_unpin_lock(&rq->lock, rf.cookie); |
| 663 | rq = dl_task_offline_migration(rq, p); | 639 | rq = dl_task_offline_migration(rq, p); |
| 664 | rf.cookie = lockdep_pin_lock(&rq->lock); | 640 | rf.cookie = lockdep_pin_lock(&rq->lock); |
| 641 | |||
| 642 | /* | ||
| 643 | * Now that the task has been migrated to the new RQ and we | ||
| 644 | * have that locked, proceed as normal and enqueue the task | ||
| 645 | * there. | ||
| 646 | */ | ||
| 665 | } | 647 | } |
| 648 | #endif | ||
| 649 | |||
| 650 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); | ||
| 651 | if (dl_task(rq->curr)) | ||
| 652 | check_preempt_curr_dl(rq, p, 0); | ||
| 653 | else | ||
| 654 | resched_curr(rq); | ||
| 666 | 655 | ||
| 656 | #ifdef CONFIG_SMP | ||
| 667 | /* | 657 | /* |
| 668 | * Queueing this task back might have overloaded rq, check if we need | 658 | * Queueing this task back might have overloaded rq, check if we need |
| 669 | * to kick someone away. | 659 | * to kick someone away. |
| @@ -735,9 +725,8 @@ static void update_curr_dl(struct rq *rq) | |||
| 735 | return; | 725 | return; |
| 736 | } | 726 | } |
| 737 | 727 | ||
| 738 | /* kick cpufreq (see the comment in linux/cpufreq.h). */ | 728 | /* kick cpufreq (see the comment in kernel/sched/sched.h). */ |
| 739 | if (cpu_of(rq) == smp_processor_id()) | 729 | cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL); |
| 740 | cpufreq_trigger_update(rq_clock(rq)); | ||
| 741 | 730 | ||
| 742 | schedstat_set(curr->se.statistics.exec_max, | 731 | schedstat_set(curr->se.statistics.exec_max, |
| 743 | max(curr->se.statistics.exec_max, delta_exec)); | 732 | max(curr->se.statistics.exec_max, delta_exec)); |
| @@ -798,7 +787,7 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) | |||
| 798 | if (dl_rq->earliest_dl.curr == 0 || | 787 | if (dl_rq->earliest_dl.curr == 0 || |
| 799 | dl_time_before(deadline, dl_rq->earliest_dl.curr)) { | 788 | dl_time_before(deadline, dl_rq->earliest_dl.curr)) { |
| 800 | dl_rq->earliest_dl.curr = deadline; | 789 | dl_rq->earliest_dl.curr = deadline; |
| 801 | cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); | 790 | cpudl_set(&rq->rd->cpudl, rq->cpu, deadline); |
| 802 | } | 791 | } |
| 803 | } | 792 | } |
| 804 | 793 | ||
| @@ -813,14 +802,14 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) | |||
| 813 | if (!dl_rq->dl_nr_running) { | 802 | if (!dl_rq->dl_nr_running) { |
| 814 | dl_rq->earliest_dl.curr = 0; | 803 | dl_rq->earliest_dl.curr = 0; |
| 815 | dl_rq->earliest_dl.next = 0; | 804 | dl_rq->earliest_dl.next = 0; |
| 816 | cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); | 805 | cpudl_clear(&rq->rd->cpudl, rq->cpu); |
| 817 | } else { | 806 | } else { |
| 818 | struct rb_node *leftmost = dl_rq->rb_leftmost; | 807 | struct rb_node *leftmost = dl_rq->rb_leftmost; |
| 819 | struct sched_dl_entity *entry; | 808 | struct sched_dl_entity *entry; |
| 820 | 809 | ||
| 821 | entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); | 810 | entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); |
| 822 | dl_rq->earliest_dl.curr = entry->deadline; | 811 | dl_rq->earliest_dl.curr = entry->deadline; |
| 823 | cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); | 812 | cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline); |
| 824 | } | 813 | } |
| 825 | } | 814 | } |
| 826 | 815 | ||
| @@ -1671,7 +1660,7 @@ static void rq_online_dl(struct rq *rq) | |||
| 1671 | 1660 | ||
| 1672 | cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); | 1661 | cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); |
| 1673 | if (rq->dl.dl_nr_running > 0) | 1662 | if (rq->dl.dl_nr_running > 0) |
| 1674 | cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); | 1663 | cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr); |
| 1675 | } | 1664 | } |
| 1676 | 1665 | ||
| 1677 | /* Assumes rq->lock is held */ | 1666 | /* Assumes rq->lock is held */ |
| @@ -1680,7 +1669,7 @@ static void rq_offline_dl(struct rq *rq) | |||
| 1680 | if (rq->dl.overloaded) | 1669 | if (rq->dl.overloaded) |
| 1681 | dl_clear_overload(rq); | 1670 | dl_clear_overload(rq); |
| 1682 | 1671 | ||
| 1683 | cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); | 1672 | cpudl_clear(&rq->rd->cpudl, rq->cpu); |
| 1684 | cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); | 1673 | cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); |
| 1685 | } | 1674 | } |
| 1686 | 1675 | ||
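The cpudl changes above split the old cpudl_set(cp, cpu, deadline, is_valid) convention into two entry points, so callers no longer pass a 0/0 sentinel to take a CPU out of the heap. The real cpudl is a max-heap keyed by deadline; the sketch below only models the interface split with a flat per-CPU table:

#include <stdint.h>
#include <stdbool.h>

#define NR_CPUS_MODEL 8

struct cpudl_model {
	uint64_t deadline[NR_CPUS_MODEL];
	bool     has_dl[NR_CPUS_MODEL];
};

/* New-style API: installing and clearing a deadline are separate calls. */
static void cpudl_set_model(struct cpudl_model *cp, int cpu, uint64_t dl)
{
	cp->deadline[cpu] = dl;
	cp->has_dl[cpu] = true;
}

static void cpudl_clear_model(struct cpudl_model *cp, int cpu)
{
	cp->has_dl[cpu] = false;
	cp->deadline[cpu] = 0;
}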
| @@ -1723,10 +1712,20 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) | |||
| 1723 | */ | 1712 | */ |
| 1724 | static void switched_to_dl(struct rq *rq, struct task_struct *p) | 1713 | static void switched_to_dl(struct rq *rq, struct task_struct *p) |
| 1725 | { | 1714 | { |
| 1715 | |||
| 1716 | /* If p is not queued we will update its parameters at next wakeup. */ | ||
| 1717 | if (!task_on_rq_queued(p)) | ||
| 1718 | return; | ||
| 1719 | |||
| 1720 | /* | ||
| 1721 | * If p is boosted we already updated its params in | ||
| 1722 | * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH), | ||
| 1723 | * p's deadline being now already after rq_clock(rq). | ||
| 1724 | */ | ||
| 1726 | if (dl_time_before(p->dl.deadline, rq_clock(rq))) | 1725 | if (dl_time_before(p->dl.deadline, rq_clock(rq))) |
| 1727 | setup_new_dl_entity(&p->dl, &p->dl); | 1726 | setup_new_dl_entity(&p->dl); |
| 1728 | 1727 | ||
| 1729 | if (task_on_rq_queued(p) && rq->curr != p) { | 1728 | if (rq->curr != p) { |
| 1730 | #ifdef CONFIG_SMP | 1729 | #ifdef CONFIG_SMP |
| 1731 | if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) | 1730 | if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) |
| 1732 | queue_push_tasks(rq); | 1731 | queue_push_tasks(rq); |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2a0a9995256d..fa178b62ea79 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
| @@ -369,8 +369,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
| 369 | 369 | ||
| 370 | #define P(F) \ | 370 | #define P(F) \ |
| 371 | SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) | 371 | SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) |
| 372 | #define P_SCHEDSTAT(F) \ | ||
| 373 | SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) | ||
| 372 | #define PN(F) \ | 374 | #define PN(F) \ |
| 373 | SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) | 375 | SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) |
| 376 | #define PN_SCHEDSTAT(F) \ | ||
| 377 | SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) | ||
| 374 | 378 | ||
| 375 | if (!se) | 379 | if (!se) |
| 376 | return; | 380 | return; |
| @@ -378,26 +382,27 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
| 378 | PN(se->exec_start); | 382 | PN(se->exec_start); |
| 379 | PN(se->vruntime); | 383 | PN(se->vruntime); |
| 380 | PN(se->sum_exec_runtime); | 384 | PN(se->sum_exec_runtime); |
| 381 | #ifdef CONFIG_SCHEDSTATS | ||
| 382 | if (schedstat_enabled()) { | 385 | if (schedstat_enabled()) { |
| 383 | PN(se->statistics.wait_start); | 386 | PN_SCHEDSTAT(se->statistics.wait_start); |
| 384 | PN(se->statistics.sleep_start); | 387 | PN_SCHEDSTAT(se->statistics.sleep_start); |
| 385 | PN(se->statistics.block_start); | 388 | PN_SCHEDSTAT(se->statistics.block_start); |
| 386 | PN(se->statistics.sleep_max); | 389 | PN_SCHEDSTAT(se->statistics.sleep_max); |
| 387 | PN(se->statistics.block_max); | 390 | PN_SCHEDSTAT(se->statistics.block_max); |
| 388 | PN(se->statistics.exec_max); | 391 | PN_SCHEDSTAT(se->statistics.exec_max); |
| 389 | PN(se->statistics.slice_max); | 392 | PN_SCHEDSTAT(se->statistics.slice_max); |
| 390 | PN(se->statistics.wait_max); | 393 | PN_SCHEDSTAT(se->statistics.wait_max); |
| 391 | PN(se->statistics.wait_sum); | 394 | PN_SCHEDSTAT(se->statistics.wait_sum); |
| 392 | P(se->statistics.wait_count); | 395 | P_SCHEDSTAT(se->statistics.wait_count); |
| 393 | } | 396 | } |
| 394 | #endif | ||
| 395 | P(se->load.weight); | 397 | P(se->load.weight); |
| 396 | #ifdef CONFIG_SMP | 398 | #ifdef CONFIG_SMP |
| 397 | P(se->avg.load_avg); | 399 | P(se->avg.load_avg); |
| 398 | P(se->avg.util_avg); | 400 | P(se->avg.util_avg); |
| 399 | #endif | 401 | #endif |
| 402 | |||
| 403 | #undef PN_SCHEDSTAT | ||
| 400 | #undef PN | 404 | #undef PN |
| 405 | #undef P_SCHEDSTAT | ||
| 401 | #undef P | 406 | #undef P |
| 402 | } | 407 | } |
| 403 | #endif | 408 | #endif |
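The new P_SCHEDSTAT()/PN_SCHEDSTAT() printers above rely on schedstat_val(), which is expected to evaluate to the field when CONFIG_SCHEDSTATS is built in and to a constant 0 otherwise, which is what lets the surrounding #ifdef go away. A user-space approximation of that macro arrangement (SCHEDSTATS_ENABLED plays the role of CONFIG_SCHEDSTATS):

#include <stdio.h>

#define SCHEDSTATS_ENABLED 1

struct stats_model { long long wait_max; };
struct entity_model {
#if SCHEDSTATS_ENABLED
	struct stats_model statistics;
#endif
};

#if SCHEDSTATS_ENABLED
#define schedstat_val_model(var)	(var)
#else
#define schedstat_val_model(var)	0	/* field reference drops out */
#endif

#define P_SCHEDSTAT_MODEL(F) \
	printf("  .%-30s: %lld\n", #F, (long long)schedstat_val_model(F))

int main(void)
{
	struct entity_model se = { .statistics = { .wait_max = 123456 } };

	P_SCHEDSTAT_MODEL(se.statistics.wait_max);
	return 0;
}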
| @@ -410,7 +415,8 @@ static char *task_group_path(struct task_group *tg) | |||
| 410 | if (autogroup_path(tg, group_path, PATH_MAX)) | 415 | if (autogroup_path(tg, group_path, PATH_MAX)) |
| 411 | return group_path; | 416 | return group_path; |
| 412 | 417 | ||
| 413 | return cgroup_path(tg->css.cgroup, group_path, PATH_MAX); | 418 | cgroup_path(tg->css.cgroup, group_path, PATH_MAX); |
| 419 | return group_path; | ||
| 414 | } | 420 | } |
| 415 | #endif | 421 | #endif |
| 416 | 422 | ||
| @@ -429,9 +435,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
| 429 | p->prio); | 435 | p->prio); |
| 430 | 436 | ||
| 431 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", | 437 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", |
| 432 | SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)), | 438 | SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)), |
| 433 | SPLIT_NS(p->se.sum_exec_runtime), | 439 | SPLIT_NS(p->se.sum_exec_runtime), |
| 434 | SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime))); | 440 | SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime))); |
| 435 | 441 | ||
| 436 | #ifdef CONFIG_NUMA_BALANCING | 442 | #ifdef CONFIG_NUMA_BALANCING |
| 437 | SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); | 443 | SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); |
| @@ -626,9 +632,7 @@ do { \ | |||
| 626 | #undef P64 | 632 | #undef P64 |
| 627 | #endif | 633 | #endif |
| 628 | 634 | ||
| 629 | #ifdef CONFIG_SCHEDSTATS | 635 | #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n)); |
| 630 | #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); | ||
| 631 | |||
| 632 | if (schedstat_enabled()) { | 636 | if (schedstat_enabled()) { |
| 633 | P(yld_count); | 637 | P(yld_count); |
| 634 | P(sched_count); | 638 | P(sched_count); |
| @@ -636,9 +640,8 @@ do { \ | |||
| 636 | P(ttwu_count); | 640 | P(ttwu_count); |
| 637 | P(ttwu_local); | 641 | P(ttwu_local); |
| 638 | } | 642 | } |
| 639 | |||
| 640 | #undef P | 643 | #undef P |
| 641 | #endif | 644 | |
| 642 | spin_lock_irqsave(&sched_debug_lock, flags); | 645 | spin_lock_irqsave(&sched_debug_lock, flags); |
| 643 | print_cfs_stats(m, cpu); | 646 | print_cfs_stats(m, cpu); |
| 644 | print_rt_stats(m, cpu); | 647 | print_rt_stats(m, cpu); |
| @@ -868,10 +871,14 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
| 868 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) | 871 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) |
| 869 | #define P(F) \ | 872 | #define P(F) \ |
| 870 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) | 873 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) |
| 874 | #define P_SCHEDSTAT(F) \ | ||
| 875 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F)) | ||
| 871 | #define __PN(F) \ | 876 | #define __PN(F) \ |
| 872 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) | 877 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) |
| 873 | #define PN(F) \ | 878 | #define PN(F) \ |
| 874 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) | 879 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) |
| 880 | #define PN_SCHEDSTAT(F) \ | ||
| 881 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F))) | ||
| 875 | 882 | ||
| 876 | PN(se.exec_start); | 883 | PN(se.exec_start); |
| 877 | PN(se.vruntime); | 884 | PN(se.vruntime); |
| @@ -881,37 +888,36 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
| 881 | 888 | ||
| 882 | P(se.nr_migrations); | 889 | P(se.nr_migrations); |
| 883 | 890 | ||
| 884 | #ifdef CONFIG_SCHEDSTATS | ||
| 885 | if (schedstat_enabled()) { | 891 | if (schedstat_enabled()) { |
| 886 | u64 avg_atom, avg_per_cpu; | 892 | u64 avg_atom, avg_per_cpu; |
| 887 | 893 | ||
| 888 | PN(se.statistics.sum_sleep_runtime); | 894 | PN_SCHEDSTAT(se.statistics.sum_sleep_runtime); |
| 889 | PN(se.statistics.wait_start); | 895 | PN_SCHEDSTAT(se.statistics.wait_start); |
| 890 | PN(se.statistics.sleep_start); | 896 | PN_SCHEDSTAT(se.statistics.sleep_start); |
| 891 | PN(se.statistics.block_start); | 897 | PN_SCHEDSTAT(se.statistics.block_start); |
| 892 | PN(se.statistics.sleep_max); | 898 | PN_SCHEDSTAT(se.statistics.sleep_max); |
| 893 | PN(se.statistics.block_max); | 899 | PN_SCHEDSTAT(se.statistics.block_max); |
| 894 | PN(se.statistics.exec_max); | 900 | PN_SCHEDSTAT(se.statistics.exec_max); |
| 895 | PN(se.statistics.slice_max); | 901 | PN_SCHEDSTAT(se.statistics.slice_max); |
| 896 | PN(se.statistics.wait_max); | 902 | PN_SCHEDSTAT(se.statistics.wait_max); |
| 897 | PN(se.statistics.wait_sum); | 903 | PN_SCHEDSTAT(se.statistics.wait_sum); |
| 898 | P(se.statistics.wait_count); | 904 | P_SCHEDSTAT(se.statistics.wait_count); |
| 899 | PN(se.statistics.iowait_sum); | 905 | PN_SCHEDSTAT(se.statistics.iowait_sum); |
| 900 | P(se.statistics.iowait_count); | 906 | P_SCHEDSTAT(se.statistics.iowait_count); |
| 901 | P(se.statistics.nr_migrations_cold); | 907 | P_SCHEDSTAT(se.statistics.nr_migrations_cold); |
| 902 | P(se.statistics.nr_failed_migrations_affine); | 908 | P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine); |
| 903 | P(se.statistics.nr_failed_migrations_running); | 909 | P_SCHEDSTAT(se.statistics.nr_failed_migrations_running); |
| 904 | P(se.statistics.nr_failed_migrations_hot); | 910 | P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot); |
| 905 | P(se.statistics.nr_forced_migrations); | 911 | P_SCHEDSTAT(se.statistics.nr_forced_migrations); |
| 906 | P(se.statistics.nr_wakeups); | 912 | P_SCHEDSTAT(se.statistics.nr_wakeups); |
| 907 | P(se.statistics.nr_wakeups_sync); | 913 | P_SCHEDSTAT(se.statistics.nr_wakeups_sync); |
| 908 | P(se.statistics.nr_wakeups_migrate); | 914 | P_SCHEDSTAT(se.statistics.nr_wakeups_migrate); |
| 909 | P(se.statistics.nr_wakeups_local); | 915 | P_SCHEDSTAT(se.statistics.nr_wakeups_local); |
| 910 | P(se.statistics.nr_wakeups_remote); | 916 | P_SCHEDSTAT(se.statistics.nr_wakeups_remote); |
| 911 | P(se.statistics.nr_wakeups_affine); | 917 | P_SCHEDSTAT(se.statistics.nr_wakeups_affine); |
| 912 | P(se.statistics.nr_wakeups_affine_attempts); | 918 | P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts); |
| 913 | P(se.statistics.nr_wakeups_passive); | 919 | P_SCHEDSTAT(se.statistics.nr_wakeups_passive); |
| 914 | P(se.statistics.nr_wakeups_idle); | 920 | P_SCHEDSTAT(se.statistics.nr_wakeups_idle); |
| 915 | 921 | ||
| 916 | avg_atom = p->se.sum_exec_runtime; | 922 | avg_atom = p->se.sum_exec_runtime; |
| 917 | if (nr_switches) | 923 | if (nr_switches) |
| @@ -930,7 +936,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
| 930 | __PN(avg_atom); | 936 | __PN(avg_atom); |
| 931 | __PN(avg_per_cpu); | 937 | __PN(avg_per_cpu); |
| 932 | } | 938 | } |
| 933 | #endif | 939 | |
| 934 | __P(nr_switches); | 940 | __P(nr_switches); |
| 935 | SEQ_printf(m, "%-45s:%21Ld\n", | 941 | SEQ_printf(m, "%-45s:%21Ld\n", |
| 936 | "nr_voluntary_switches", (long long)p->nvcsw); | 942 | "nr_voluntary_switches", (long long)p->nvcsw); |
| @@ -947,8 +953,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
| 947 | #endif | 953 | #endif |
| 948 | P(policy); | 954 | P(policy); |
| 949 | P(prio); | 955 | P(prio); |
| 956 | #undef PN_SCHEDSTAT | ||
| 950 | #undef PN | 957 | #undef PN |
| 951 | #undef __PN | 958 | #undef __PN |
| 959 | #undef P_SCHEDSTAT | ||
| 952 | #undef P | 960 | #undef P |
| 953 | #undef __P | 961 | #undef __P |
| 954 | 962 | ||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 039de34f1521..d941c97dfbc3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -114,6 +114,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; | |||
| 114 | unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; | 114 | unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; |
| 115 | #endif | 115 | #endif |
| 116 | 116 | ||
| 117 | /* | ||
| 118 | * The margin used when comparing utilization with CPU capacity: | ||
| 119 | * util * 1024 < capacity * margin | ||
| 120 | */ | ||
| 121 | unsigned int capacity_margin = 1280; /* ~20% */ | ||
| 122 | |||
| 117 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | 123 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) |
| 118 | { | 124 | { |
| 119 | lw->weight += inc; | 125 | lw->weight += inc; |
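capacity_margin is a fixed-point value against SCHED_CAPACITY_SCALE (1024), so 1280 corresponds to 1280/1024 = 1.25, i.e. roughly 20% of the CPU's capacity is kept as headroom. Which side of the inequality the margin lands on depends on the caller and is not part of this hunk; the sketch below just works the arithmetic under the "utilization must stay about 20% below capacity" reading, so treat the helper name and the comparison direction as assumptions:

#include <stdbool.h>

#define SCHED_CAPACITY_SCALE_MODEL	1024UL
static unsigned long capacity_margin_model = 1280;	/* ~20% */

/* True if 'util' leaves the configured margin of 'capacity' free. */
static bool fits_capacity_model(unsigned long util, unsigned long capacity)
{
	/* util * 1280 < capacity * 1024  <=>  util < ~0.8 * capacity */
	return util * capacity_margin_model <
	       capacity * SCHED_CAPACITY_SCALE_MODEL;
}

/* Worked example for capacity == 1024: 819 * 1280 = 1048320 < 1048576,
 * so util 819 still fits, while 820 * 1280 = 1049600 does not. */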
| @@ -256,9 +262,7 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | |||
| 256 | 262 | ||
| 257 | static inline struct task_struct *task_of(struct sched_entity *se) | 263 | static inline struct task_struct *task_of(struct sched_entity *se) |
| 258 | { | 264 | { |
| 259 | #ifdef CONFIG_SCHED_DEBUG | 265 | SCHED_WARN_ON(!entity_is_task(se)); |
| 260 | WARN_ON_ONCE(!entity_is_task(se)); | ||
| 261 | #endif | ||
| 262 | return container_of(se, struct task_struct, se); | 266 | return container_of(se, struct task_struct, se); |
| 263 | } | 267 | } |
| 264 | 268 | ||
| @@ -456,17 +460,23 @@ static inline int entity_before(struct sched_entity *a, | |||
| 456 | 460 | ||
| 457 | static void update_min_vruntime(struct cfs_rq *cfs_rq) | 461 | static void update_min_vruntime(struct cfs_rq *cfs_rq) |
| 458 | { | 462 | { |
| 463 | struct sched_entity *curr = cfs_rq->curr; | ||
| 464 | |||
| 459 | u64 vruntime = cfs_rq->min_vruntime; | 465 | u64 vruntime = cfs_rq->min_vruntime; |
| 460 | 466 | ||
| 461 | if (cfs_rq->curr) | 467 | if (curr) { |
| 462 | vruntime = cfs_rq->curr->vruntime; | 468 | if (curr->on_rq) |
| 469 | vruntime = curr->vruntime; | ||
| 470 | else | ||
| 471 | curr = NULL; | ||
| 472 | } | ||
| 463 | 473 | ||
| 464 | if (cfs_rq->rb_leftmost) { | 474 | if (cfs_rq->rb_leftmost) { |
| 465 | struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, | 475 | struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, |
| 466 | struct sched_entity, | 476 | struct sched_entity, |
| 467 | run_node); | 477 | run_node); |
| 468 | 478 | ||
| 469 | if (!cfs_rq->curr) | 479 | if (!curr) |
| 470 | vruntime = se->vruntime; | 480 | vruntime = se->vruntime; |
| 471 | else | 481 | else |
| 472 | vruntime = min_vruntime(vruntime, se->vruntime); | 482 | vruntime = min_vruntime(vruntime, se->vruntime); |
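The reworked update_min_vruntime() above only lets cfs_rq->curr contribute while it is still on the runqueue, otherwise it falls back to the leftmost queued entity alone; the function then folds the result into cfs_rq->min_vruntime monotonically (that final step is outside the lines shown). A compact model of the candidate selection:

#include <stdint.h>
#include <stdbool.h>

struct se_model { uint64_t vruntime; bool on_rq; };

static uint64_t pick_min_vruntime_model(uint64_t cur_min,
					const struct se_model *curr,
					const struct se_model *leftmost)
{
	uint64_t vruntime = cur_min;

	if (curr && !curr->on_rq)	/* curr being dequeued: ignore it */
		curr = NULL;
	if (curr)
		vruntime = curr->vruntime;

	if (leftmost) {
		if (!curr)
			vruntime = leftmost->vruntime;
		else if (leftmost->vruntime < vruntime)
			vruntime = leftmost->vruntime;
	}
	return vruntime;
}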
| @@ -656,7 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 656 | } | 666 | } |
| 657 | 667 | ||
| 658 | #ifdef CONFIG_SMP | 668 | #ifdef CONFIG_SMP |
| 659 | static int select_idle_sibling(struct task_struct *p, int cpu); | 669 | static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); |
| 660 | static unsigned long task_h_load(struct task_struct *p); | 670 | static unsigned long task_h_load(struct task_struct *p); |
| 661 | 671 | ||
| 662 | /* | 672 | /* |
| @@ -680,7 +690,14 @@ void init_entity_runnable_average(struct sched_entity *se) | |||
| 680 | * will definitely be updated (after enqueue). | 690 | * will definitely be updated (after enqueue). |
| 681 | */ | 691 | */ |
| 682 | sa->period_contrib = 1023; | 692 | sa->period_contrib = 1023; |
| 683 | sa->load_avg = scale_load_down(se->load.weight); | 693 | /* |
| 694 | * Tasks are initialized with full load to be seen as heavy tasks until | ||
| 695 | * they get a chance to stabilize to their real load level. | ||
| 696 | * Group entities are initialized with zero load to reflect the fact that | ||
| 697 | * nothing has been attached to the task group yet. | ||
| 698 | */ | ||
| 699 | if (entity_is_task(se)) | ||
| 700 | sa->load_avg = scale_load_down(se->load.weight); | ||
| 684 | sa->load_sum = sa->load_avg * LOAD_AVG_MAX; | 701 | sa->load_sum = sa->load_avg * LOAD_AVG_MAX; |
| 685 | /* | 702 | /* |
| 686 | * At this point, util_avg won't be used in select_task_rq_fair anyway | 703 | * At this point, util_avg won't be used in select_task_rq_fair anyway |
| @@ -726,7 +743,6 @@ void post_init_entity_util_avg(struct sched_entity *se) | |||
| 726 | struct sched_avg *sa = &se->avg; | 743 | struct sched_avg *sa = &se->avg; |
| 727 | long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; | 744 | long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; |
| 728 | u64 now = cfs_rq_clock_task(cfs_rq); | 745 | u64 now = cfs_rq_clock_task(cfs_rq); |
| 729 | int tg_update; | ||
| 730 | 746 | ||
| 731 | if (cap > 0) { | 747 | if (cap > 0) { |
| 732 | if (cfs_rq->avg.util_avg != 0) { | 748 | if (cfs_rq->avg.util_avg != 0) { |
| @@ -759,10 +775,9 @@ void post_init_entity_util_avg(struct sched_entity *se) | |||
| 759 | } | 775 | } |
| 760 | } | 776 | } |
| 761 | 777 | ||
| 762 | tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); | 778 | update_cfs_rq_load_avg(now, cfs_rq, false); |
| 763 | attach_entity_load_avg(cfs_rq, se); | 779 | attach_entity_load_avg(cfs_rq, se); |
| 764 | if (tg_update) | 780 | update_tg_load_avg(cfs_rq, false); |
| 765 | update_tg_load_avg(cfs_rq, false); | ||
| 766 | } | 781 | } |
| 767 | 782 | ||
| 768 | #else /* !CONFIG_SMP */ | 783 | #else /* !CONFIG_SMP */ |
| @@ -799,7 +814,7 @@ static void update_curr(struct cfs_rq *cfs_rq) | |||
| 799 | max(delta_exec, curr->statistics.exec_max)); | 814 | max(delta_exec, curr->statistics.exec_max)); |
| 800 | 815 | ||
| 801 | curr->sum_exec_runtime += delta_exec; | 816 | curr->sum_exec_runtime += delta_exec; |
| 802 | schedstat_add(cfs_rq, exec_clock, delta_exec); | 817 | schedstat_add(cfs_rq->exec_clock, delta_exec); |
| 803 | 818 | ||
| 804 | curr->vruntime += calc_delta_fair(delta_exec, curr); | 819 | curr->vruntime += calc_delta_fair(delta_exec, curr); |
| 805 | update_min_vruntime(cfs_rq); | 820 | update_min_vruntime(cfs_rq); |
| @@ -820,26 +835,34 @@ static void update_curr_fair(struct rq *rq) | |||
| 820 | update_curr(cfs_rq_of(&rq->curr->se)); | 835 | update_curr(cfs_rq_of(&rq->curr->se)); |
| 821 | } | 836 | } |
| 822 | 837 | ||
| 823 | #ifdef CONFIG_SCHEDSTATS | ||
| 824 | static inline void | 838 | static inline void |
| 825 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | 839 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 826 | { | 840 | { |
| 827 | u64 wait_start = rq_clock(rq_of(cfs_rq)); | 841 | u64 wait_start, prev_wait_start; |
| 842 | |||
| 843 | if (!schedstat_enabled()) | ||
| 844 | return; | ||
| 845 | |||
| 846 | wait_start = rq_clock(rq_of(cfs_rq)); | ||
| 847 | prev_wait_start = schedstat_val(se->statistics.wait_start); | ||
| 828 | 848 | ||
| 829 | if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && | 849 | if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && |
| 830 | likely(wait_start > se->statistics.wait_start)) | 850 | likely(wait_start > prev_wait_start)) |
| 831 | wait_start -= se->statistics.wait_start; | 851 | wait_start -= prev_wait_start; |
| 832 | 852 | ||
| 833 | se->statistics.wait_start = wait_start; | 853 | schedstat_set(se->statistics.wait_start, wait_start); |
| 834 | } | 854 | } |
| 835 | 855 | ||
| 836 | static void | 856 | static inline void |
| 837 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | 857 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 838 | { | 858 | { |
| 839 | struct task_struct *p; | 859 | struct task_struct *p; |
| 840 | u64 delta; | 860 | u64 delta; |
| 841 | 861 | ||
| 842 | delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; | 862 | if (!schedstat_enabled()) |
| 863 | return; | ||
| 864 | |||
| 865 | delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); | ||
| 843 | 866 | ||
| 844 | if (entity_is_task(se)) { | 867 | if (entity_is_task(se)) { |
| 845 | p = task_of(se); | 868 | p = task_of(se); |
| @@ -849,35 +872,114 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 849 | * time stamp can be adjusted to accumulate wait time | 872 | * time stamp can be adjusted to accumulate wait time |
| 850 | * prior to migration. | 873 | * prior to migration. |
| 851 | */ | 874 | */ |
| 852 | se->statistics.wait_start = delta; | 875 | schedstat_set(se->statistics.wait_start, delta); |
| 853 | return; | 876 | return; |
| 854 | } | 877 | } |
| 855 | trace_sched_stat_wait(p, delta); | 878 | trace_sched_stat_wait(p, delta); |
| 856 | } | 879 | } |
| 857 | 880 | ||
| 858 | se->statistics.wait_max = max(se->statistics.wait_max, delta); | 881 | schedstat_set(se->statistics.wait_max, |
| 859 | se->statistics.wait_count++; | 882 | max(schedstat_val(se->statistics.wait_max), delta)); |
| 860 | se->statistics.wait_sum += delta; | 883 | schedstat_inc(se->statistics.wait_count); |
| 861 | se->statistics.wait_start = 0; | 884 | schedstat_add(se->statistics.wait_sum, delta); |
| 885 | schedstat_set(se->statistics.wait_start, 0); | ||
| 886 | } | ||
| 887 | |||
| 888 | static inline void | ||
| 889 | update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 890 | { | ||
| 891 | struct task_struct *tsk = NULL; | ||
| 892 | u64 sleep_start, block_start; | ||
| 893 | |||
| 894 | if (!schedstat_enabled()) | ||
| 895 | return; | ||
| 896 | |||
| 897 | sleep_start = schedstat_val(se->statistics.sleep_start); | ||
| 898 | block_start = schedstat_val(se->statistics.block_start); | ||
| 899 | |||
| 900 | if (entity_is_task(se)) | ||
| 901 | tsk = task_of(se); | ||
| 902 | |||
| 903 | if (sleep_start) { | ||
| 904 | u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start; | ||
| 905 | |||
| 906 | if ((s64)delta < 0) | ||
| 907 | delta = 0; | ||
| 908 | |||
| 909 | if (unlikely(delta > schedstat_val(se->statistics.sleep_max))) | ||
| 910 | schedstat_set(se->statistics.sleep_max, delta); | ||
| 911 | |||
| 912 | schedstat_set(se->statistics.sleep_start, 0); | ||
| 913 | schedstat_add(se->statistics.sum_sleep_runtime, delta); | ||
| 914 | |||
| 915 | if (tsk) { | ||
| 916 | account_scheduler_latency(tsk, delta >> 10, 1); | ||
| 917 | trace_sched_stat_sleep(tsk, delta); | ||
| 918 | } | ||
| 919 | } | ||
| 920 | if (block_start) { | ||
| 921 | u64 delta = rq_clock(rq_of(cfs_rq)) - block_start; | ||
| 922 | |||
| 923 | if ((s64)delta < 0) | ||
| 924 | delta = 0; | ||
| 925 | |||
| 926 | if (unlikely(delta > schedstat_val(se->statistics.block_max))) | ||
| 927 | schedstat_set(se->statistics.block_max, delta); | ||
| 928 | |||
| 929 | schedstat_set(se->statistics.block_start, 0); | ||
| 930 | schedstat_add(se->statistics.sum_sleep_runtime, delta); | ||
| 931 | |||
| 932 | if (tsk) { | ||
| 933 | if (tsk->in_iowait) { | ||
| 934 | schedstat_add(se->statistics.iowait_sum, delta); | ||
| 935 | schedstat_inc(se->statistics.iowait_count); | ||
| 936 | trace_sched_stat_iowait(tsk, delta); | ||
| 937 | } | ||
| 938 | |||
| 939 | trace_sched_stat_blocked(tsk, delta); | ||
| 940 | |||
| 941 | /* | ||
| 942 | * Blocking time is in units of nanosecs, so shift by | ||
| 943 | * 20 to get a milliseconds-range estimation of the | ||
| 944 | * amount of time that the task spent sleeping: | ||
| 945 | */ | ||
| 946 | if (unlikely(prof_on == SLEEP_PROFILING)) { | ||
| 947 | profile_hits(SLEEP_PROFILING, | ||
| 948 | (void *)get_wchan(tsk), | ||
| 949 | delta >> 20); | ||
| 950 | } | ||
| 951 | account_scheduler_latency(tsk, delta >> 10, 0); | ||
| 952 | } | ||
| 953 | } | ||
| 862 | } | 954 | } |
| 863 | 955 | ||
| 864 | /* | 956 | /* |
| 865 | * Task is being enqueued - update stats: | 957 | * Task is being enqueued - update stats: |
| 866 | */ | 958 | */ |
| 867 | static inline void | 959 | static inline void |
| 868 | update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 960 | update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
| 869 | { | 961 | { |
| 962 | if (!schedstat_enabled()) | ||
| 963 | return; | ||
| 964 | |||
| 870 | /* | 965 | /* |
| 871 | * Are we enqueueing a waiting task? (for current tasks | 966 | * Are we enqueueing a waiting task? (for current tasks |
| 872 | * a dequeue/enqueue event is a NOP) | 967 | * a dequeue/enqueue event is a NOP) |
| 873 | */ | 968 | */ |
| 874 | if (se != cfs_rq->curr) | 969 | if (se != cfs_rq->curr) |
| 875 | update_stats_wait_start(cfs_rq, se); | 970 | update_stats_wait_start(cfs_rq, se); |
| 971 | |||
| 972 | if (flags & ENQUEUE_WAKEUP) | ||
| 973 | update_stats_enqueue_sleeper(cfs_rq, se); | ||
| 876 | } | 974 | } |
| 877 | 975 | ||
| 878 | static inline void | 976 | static inline void |
| 879 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | 977 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
| 880 | { | 978 | { |
| 979 | |||
| 980 | if (!schedstat_enabled()) | ||
| 981 | return; | ||
| 982 | |||
| 881 | /* | 983 | /* |
| 882 | * Mark the end of the wait period if dequeueing a | 984 | * Mark the end of the wait period if dequeueing a |
| 883 | * waiting task: | 985 | * waiting task: |
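The pattern running through this fair.c rework is visible above: instead of compiling the statistics helpers away under #ifdef CONFIG_SCHEDSTATS, each helper starts with an early "if (!schedstat_enabled()) return;" and every field access goes through the schedstat_*() wrappers, so a single function body serves both configurations. A small stand-alone illustration of that guard style (the static-branch machinery behind schedstat_enabled() is reduced to a plain flag):

#include <stdbool.h>
#include <stdint.h>

static bool schedstats_enabled_flag;	/* stand-in for the static key */

struct wait_stats_model { uint64_t wait_start, wait_sum; };

static void update_stats_wait_start_model(struct wait_stats_model *st,
					  uint64_t now)
{
	if (!schedstats_enabled_flag)	/* early out replaces the #ifdef */
		return;
	st->wait_start = now;
}

static void update_stats_wait_end_model(struct wait_stats_model *st,
					uint64_t now)
{
	if (!schedstats_enabled_flag)
		return;
	st->wait_sum += now - st->wait_start;
	st->wait_start = 0;
}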
| @@ -885,40 +987,18 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
| 885 | if (se != cfs_rq->curr) | 987 | if (se != cfs_rq->curr) |
| 886 | update_stats_wait_end(cfs_rq, se); | 988 | update_stats_wait_end(cfs_rq, se); |
| 887 | 989 | ||
| 888 | if (flags & DEQUEUE_SLEEP) { | 990 | if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { |
| 889 | if (entity_is_task(se)) { | 991 | struct task_struct *tsk = task_of(se); |
| 890 | struct task_struct *tsk = task_of(se); | ||
| 891 | 992 | ||
| 892 | if (tsk->state & TASK_INTERRUPTIBLE) | 993 | if (tsk->state & TASK_INTERRUPTIBLE) |
| 893 | se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); | 994 | schedstat_set(se->statistics.sleep_start, |
| 894 | if (tsk->state & TASK_UNINTERRUPTIBLE) | 995 | rq_clock(rq_of(cfs_rq))); |
| 895 | se->statistics.block_start = rq_clock(rq_of(cfs_rq)); | 996 | if (tsk->state & TASK_UNINTERRUPTIBLE) |
| 896 | } | 997 | schedstat_set(se->statistics.block_start, |
| 998 | rq_clock(rq_of(cfs_rq))); | ||
| 897 | } | 999 | } |
| 898 | |||
| 899 | } | ||
| 900 | #else | ||
| 901 | static inline void | ||
| 902 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 903 | { | ||
| 904 | } | 1000 | } |
| 905 | 1001 | ||
| 906 | static inline void | ||
| 907 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 908 | { | ||
| 909 | } | ||
| 910 | |||
| 911 | static inline void | ||
| 912 | update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 913 | { | ||
| 914 | } | ||
| 915 | |||
| 916 | static inline void | ||
| 917 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | ||
| 918 | { | ||
| 919 | } | ||
| 920 | #endif | ||
| 921 | |||
| 922 | /* | 1002 | /* |
| 923 | * We are picking a new current task - update its stats: | 1003 | * We are picking a new current task - update its stats: |
| 924 | */ | 1004 | */ |
| @@ -1513,8 +1593,16 @@ balance: | |||
| 1513 | * One idle CPU per node is evaluated for a task numa move. | 1593 | * One idle CPU per node is evaluated for a task numa move. |
| 1514 | * Call select_idle_sibling to maybe find a better one. | 1594 | * Call select_idle_sibling to maybe find a better one. |
| 1515 | */ | 1595 | */ |
| 1516 | if (!cur) | 1596 | if (!cur) { |
| 1517 | env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); | 1597 | /* |
| 1598 | * select_idle_siblings() uses a per-cpu cpumask that | ||
| 1599 | * can be used from IRQ context. | ||
| 1600 | */ | ||
| 1601 | local_irq_disable(); | ||
| 1602 | env->dst_cpu = select_idle_sibling(env->p, env->src_cpu, | ||
| 1603 | env->dst_cpu); | ||
| 1604 | local_irq_enable(); | ||
| 1605 | } | ||
| 1518 | 1606 | ||
| 1519 | assign: | 1607 | assign: |
| 1520 | task_numa_assign(env, cur, imp); | 1608 | task_numa_assign(env, cur, imp); |
| @@ -2292,7 +2380,7 @@ void task_numa_work(struct callback_head *work) | |||
| 2292 | unsigned long nr_pte_updates = 0; | 2380 | unsigned long nr_pte_updates = 0; |
| 2293 | long pages, virtpages; | 2381 | long pages, virtpages; |
| 2294 | 2382 | ||
| 2295 | WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); | 2383 | SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); |
| 2296 | 2384 | ||
| 2297 | work->next = work; /* protect against double add */ | 2385 | work->next = work; /* protect against double add */ |
| 2298 | /* | 2386 | /* |
| @@ -2803,9 +2891,21 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, | |||
| 2803 | } | 2891 | } |
| 2804 | 2892 | ||
| 2805 | #ifdef CONFIG_FAIR_GROUP_SCHED | 2893 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 2806 | /* | 2894 | /** |
| 2807 | * Updating tg's load_avg is necessary before update_cfs_share (which is done) | 2895 | * update_tg_load_avg - update the tg's load avg |
| 2808 | * and effective_load (which is not done because it is too costly). | 2896 | * @cfs_rq: the cfs_rq whose avg changed |
| 2897 | * @force: update regardless of how small the difference | ||
| 2898 | * | ||
| 2899 | * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. | ||
| 2900 | * However, because tg->load_avg is a global value there are performance | ||
| 2901 | * considerations. | ||
| 2902 | * | ||
| 2903 | * In order to avoid having to look at the other cfs_rq's, we use a | ||
| 2904 | * differential update where we store the last value we propagated. This in | ||
| 2905 | * turn allows skipping updates if the differential is 'small'. | ||
| 2906 | * | ||
| 2907 | * Updating tg's load_avg is necessary before update_cfs_share() (which is | ||
| 2908 | * done) and effective_load() (which is not done because it is too costly). | ||
| 2809 | */ | 2909 | */ |
| 2810 | static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) | 2910 | static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) |
| 2811 | { | 2911 | { |
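The kernel-doc added above describes a differential update: the shared tg->load_avg is only written when this cfs_rq's contribution has drifted far enough from the last value it propagated, keeping cross-CPU traffic on the global counter rare. A user-space sketch of that scheme; the 1/64 threshold matches the check in the function body, which is not shown in this hunk, so treat it as an assumption, and the kernel uses atomic_long_add() where the model does a plain add:

#include <stdlib.h>

struct cfs_rq_model {
	long load_avg;			/* this cfs_rq's current average */
	long tg_load_avg_contrib;	/* last value propagated to the tg */
};

struct tg_model { long load_avg; };	/* shared, group-wide sum */

static void update_tg_load_avg_model(struct tg_model *tg,
				     struct cfs_rq_model *cfs_rq, int force)
{
	long delta = cfs_rq->load_avg - cfs_rq->tg_load_avg_contrib;

	/* Only propagate when the drift is "large enough" (assumed ~1/64). */
	if (force || labs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
		tg->load_avg += delta;
		cfs_rq->tg_load_avg_contrib = cfs_rq->load_avg;
	}
}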
| @@ -2875,12 +2975,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} | |||
| 2875 | 2975 | ||
| 2876 | static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | 2976 | static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) |
| 2877 | { | 2977 | { |
| 2878 | struct rq *rq = rq_of(cfs_rq); | 2978 | if (&this_rq()->cfs == cfs_rq) { |
| 2879 | int cpu = cpu_of(rq); | ||
| 2880 | |||
| 2881 | if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { | ||
| 2882 | unsigned long max = rq->cpu_capacity_orig; | ||
| 2883 | |||
| 2884 | /* | 2979 | /* |
| 2885 | * There are a few boundary cases this might miss but it should | 2980 | * There are a few boundary cases this might miss but it should |
| 2886 | * get called often enough that that should (hopefully) not be | 2981 | * get called often enough that that should (hopefully) not be |
| @@ -2897,8 +2992,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | |||
| 2897 | * | 2992 | * |
| 2898 | * See cpu_util(). | 2993 | * See cpu_util(). |
| 2899 | */ | 2994 | */ |
| 2900 | cpufreq_update_util(rq_clock(rq), | 2995 | cpufreq_update_util(rq_of(cfs_rq), 0); |
| 2901 | min(cfs_rq->avg.util_avg, max), max); | ||
| 2902 | } | 2996 | } |
| 2903 | } | 2997 | } |
| 2904 | 2998 | ||
| @@ -2931,10 +3025,10 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | |||
| 2931 | * | 3025 | * |
| 2932 | * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. | 3026 | * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. |
| 2933 | * | 3027 | * |
| 2934 | * Returns true if the load decayed or we removed utilization. It is expected | 3028 | * Returns true if the load decayed or we removed load. |
| 2935 | * that one calls update_tg_load_avg() on this condition, but after you've | 3029 | * |
| 2936 | * modified the cfs_rq avg (attach/detach), such that we propagate the new | 3030 | * Since both these conditions indicate a changed cfs_rq->avg.load we should |
| 2937 | * avg up. | 3031 | * call update_tg_load_avg() when this function returns true. |
| 2938 | */ | 3032 | */ |
| 2939 | static inline int | 3033 | static inline int |
| 2940 | update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) | 3034 | update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) |
| @@ -3159,10 +3253,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) | |||
| 3159 | 3253 | ||
| 3160 | static inline void update_load_avg(struct sched_entity *se, int not_used) | 3254 | static inline void update_load_avg(struct sched_entity *se, int not_used) |
| 3161 | { | 3255 | { |
| 3162 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 3256 | cpufreq_update_util(rq_of(cfs_rq_of(se)), 0); |
| 3163 | struct rq *rq = rq_of(cfs_rq); | ||
| 3164 | |||
| 3165 | cpufreq_trigger_update(rq_clock(rq)); | ||
| 3166 | } | 3257 | } |
| 3167 | 3258 | ||
| 3168 | static inline void | 3259 | static inline void |
| @@ -3183,68 +3274,6 @@ static inline int idle_balance(struct rq *rq) | |||
| 3183 | 3274 | ||
| 3184 | #endif /* CONFIG_SMP */ | 3275 | #endif /* CONFIG_SMP */ |
| 3185 | 3276 | ||
| 3186 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 3187 | { | ||
| 3188 | #ifdef CONFIG_SCHEDSTATS | ||
| 3189 | struct task_struct *tsk = NULL; | ||
| 3190 | |||
| 3191 | if (entity_is_task(se)) | ||
| 3192 | tsk = task_of(se); | ||
| 3193 | |||
| 3194 | if (se->statistics.sleep_start) { | ||
| 3195 | u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start; | ||
| 3196 | |||
| 3197 | if ((s64)delta < 0) | ||
| 3198 | delta = 0; | ||
| 3199 | |||
| 3200 | if (unlikely(delta > se->statistics.sleep_max)) | ||
| 3201 | se->statistics.sleep_max = delta; | ||
| 3202 | |||
| 3203 | se->statistics.sleep_start = 0; | ||
| 3204 | se->statistics.sum_sleep_runtime += delta; | ||
| 3205 | |||
| 3206 | if (tsk) { | ||
| 3207 | account_scheduler_latency(tsk, delta >> 10, 1); | ||
| 3208 | trace_sched_stat_sleep(tsk, delta); | ||
| 3209 | } | ||
| 3210 | } | ||
| 3211 | if (se->statistics.block_start) { | ||
| 3212 | u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start; | ||
| 3213 | |||
| 3214 | if ((s64)delta < 0) | ||
| 3215 | delta = 0; | ||
| 3216 | |||
| 3217 | if (unlikely(delta > se->statistics.block_max)) | ||
| 3218 | se->statistics.block_max = delta; | ||
| 3219 | |||
| 3220 | se->statistics.block_start = 0; | ||
| 3221 | se->statistics.sum_sleep_runtime += delta; | ||
| 3222 | |||
| 3223 | if (tsk) { | ||
| 3224 | if (tsk->in_iowait) { | ||
| 3225 | se->statistics.iowait_sum += delta; | ||
| 3226 | se->statistics.iowait_count++; | ||
| 3227 | trace_sched_stat_iowait(tsk, delta); | ||
| 3228 | } | ||
| 3229 | |||
| 3230 | trace_sched_stat_blocked(tsk, delta); | ||
| 3231 | |||
| 3232 | /* | ||
| 3233 | * Blocking time is in units of nanosecs, so shift by | ||
| 3234 | * 20 to get a milliseconds-range estimation of the | ||
| 3235 | * amount of time that the task spent sleeping: | ||
| 3236 | */ | ||
| 3237 | if (unlikely(prof_on == SLEEP_PROFILING)) { | ||
| 3238 | profile_hits(SLEEP_PROFILING, | ||
| 3239 | (void *)get_wchan(tsk), | ||
| 3240 | delta >> 20); | ||
| 3241 | } | ||
| 3242 | account_scheduler_latency(tsk, delta >> 10, 0); | ||
| 3243 | } | ||
| 3244 | } | ||
| 3245 | #endif | ||
| 3246 | } | ||
| 3247 | |||
| 3248 | static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) | 3277 | static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 3249 | { | 3278 | { |
| 3250 | #ifdef CONFIG_SCHED_DEBUG | 3279 | #ifdef CONFIG_SCHED_DEBUG |
| @@ -3254,7 +3283,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 3254 | d = -d; | 3283 | d = -d; |
| 3255 | 3284 | ||
| 3256 | if (d > 3*sysctl_sched_latency) | 3285 | if (d > 3*sysctl_sched_latency) |
| 3257 | schedstat_inc(cfs_rq, nr_spread_over); | 3286 | schedstat_inc(cfs_rq->nr_spread_over); |
| 3258 | #endif | 3287 | #endif |
| 3259 | } | 3288 | } |
| 3260 | 3289 | ||
| @@ -3371,17 +3400,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
| 3371 | account_entity_enqueue(cfs_rq, se); | 3400 | account_entity_enqueue(cfs_rq, se); |
| 3372 | update_cfs_shares(cfs_rq); | 3401 | update_cfs_shares(cfs_rq); |
| 3373 | 3402 | ||
| 3374 | if (flags & ENQUEUE_WAKEUP) { | 3403 | if (flags & ENQUEUE_WAKEUP) |
| 3375 | place_entity(cfs_rq, se, 0); | 3404 | place_entity(cfs_rq, se, 0); |
| 3376 | if (schedstat_enabled()) | ||
| 3377 | enqueue_sleeper(cfs_rq, se); | ||
| 3378 | } | ||
| 3379 | 3405 | ||
| 3380 | check_schedstat_required(); | 3406 | check_schedstat_required(); |
| 3381 | if (schedstat_enabled()) { | 3407 | update_stats_enqueue(cfs_rq, se, flags); |
| 3382 | update_stats_enqueue(cfs_rq, se); | 3408 | check_spread(cfs_rq, se); |
| 3383 | check_spread(cfs_rq, se); | ||
| 3384 | } | ||
| 3385 | if (!curr) | 3409 | if (!curr) |
| 3386 | __enqueue_entity(cfs_rq, se); | 3410 | __enqueue_entity(cfs_rq, se); |
| 3387 | se->on_rq = 1; | 3411 | se->on_rq = 1; |
| @@ -3448,8 +3472,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
| 3448 | update_curr(cfs_rq); | 3472 | update_curr(cfs_rq); |
| 3449 | dequeue_entity_load_avg(cfs_rq, se); | 3473 | dequeue_entity_load_avg(cfs_rq, se); |
| 3450 | 3474 | ||
| 3451 | if (schedstat_enabled()) | 3475 | update_stats_dequeue(cfs_rq, se, flags); |
| 3452 | update_stats_dequeue(cfs_rq, se, flags); | ||
| 3453 | 3476 | ||
| 3454 | clear_buddies(cfs_rq, se); | 3477 | clear_buddies(cfs_rq, se); |
| 3455 | 3478 | ||
| @@ -3459,9 +3482,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
| 3459 | account_entity_dequeue(cfs_rq, se); | 3482 | account_entity_dequeue(cfs_rq, se); |
| 3460 | 3483 | ||
| 3461 | /* | 3484 | /* |
| 3462 | * Normalize the entity after updating the min_vruntime because the | 3485 | * Normalize after update_curr(); which will also have moved |
| 3463 | * update can refer to the ->curr item and we need to reflect this | 3486 | * min_vruntime if @se is the one holding it back. But before doing |
| 3464 | * movement in our normalized position. | 3487 | * update_min_vruntime() again, which will discount @se's position and |
| 3488 | * can move min_vruntime forward still more. | ||
| 3465 | */ | 3489 | */ |
| 3466 | if (!(flags & DEQUEUE_SLEEP)) | 3490 | if (!(flags & DEQUEUE_SLEEP)) |
| 3467 | se->vruntime -= cfs_rq->min_vruntime; | 3491 | se->vruntime -= cfs_rq->min_vruntime; |
| @@ -3469,8 +3493,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
| 3469 | /* return excess runtime on last dequeue */ | 3493 | /* return excess runtime on last dequeue */ |
| 3470 | return_cfs_rq_runtime(cfs_rq); | 3494 | return_cfs_rq_runtime(cfs_rq); |
| 3471 | 3495 | ||
| 3472 | update_min_vruntime(cfs_rq); | ||
| 3473 | update_cfs_shares(cfs_rq); | 3496 | update_cfs_shares(cfs_rq); |
| 3497 | |||
| 3498 | /* | ||
| 3499 | * Now advance min_vruntime if @se was the entity holding it back, | ||
| 3500 | * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be | ||
| 3501 | * put back on, and if we advance min_vruntime, we'll be placed back | ||
| 3502 | * further than we started -- ie. we'll be penalized. | ||
| 3503 | */ | ||
| 3504 | if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) | ||
| 3505 | update_min_vruntime(cfs_rq); | ||
| 3474 | } | 3506 | } |
| 3475 | 3507 | ||
| 3476 | /* | 3508 | /* |
| @@ -3523,25 +3555,25 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 3523 | * a CPU. So account for the time it spent waiting on the | 3555 | * a CPU. So account for the time it spent waiting on the |
| 3524 | * runqueue. | 3556 | * runqueue. |
| 3525 | */ | 3557 | */ |
| 3526 | if (schedstat_enabled()) | 3558 | update_stats_wait_end(cfs_rq, se); |
| 3527 | update_stats_wait_end(cfs_rq, se); | ||
| 3528 | __dequeue_entity(cfs_rq, se); | 3559 | __dequeue_entity(cfs_rq, se); |
| 3529 | update_load_avg(se, 1); | 3560 | update_load_avg(se, 1); |
| 3530 | } | 3561 | } |
| 3531 | 3562 | ||
| 3532 | update_stats_curr_start(cfs_rq, se); | 3563 | update_stats_curr_start(cfs_rq, se); |
| 3533 | cfs_rq->curr = se; | 3564 | cfs_rq->curr = se; |
| 3534 | #ifdef CONFIG_SCHEDSTATS | 3565 | |
| 3535 | /* | 3566 | /* |
| 3536 | * Track our maximum slice length, if the CPU's load is at | 3567 | * Track our maximum slice length, if the CPU's load is at |
| 3537 | * least twice that of our own weight (i.e. dont track it | 3568 | * least twice that of our own weight (i.e. dont track it |
| 3538 | * when there are only lesser-weight tasks around): | 3569 | * when there are only lesser-weight tasks around): |
| 3539 | */ | 3570 | */ |
| 3540 | if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { | 3571 | if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { |
| 3541 | se->statistics.slice_max = max(se->statistics.slice_max, | 3572 | schedstat_set(se->statistics.slice_max, |
| 3542 | se->sum_exec_runtime - se->prev_sum_exec_runtime); | 3573 | max((u64)schedstat_val(se->statistics.slice_max), |
| 3574 | se->sum_exec_runtime - se->prev_sum_exec_runtime)); | ||
| 3543 | } | 3575 | } |
| 3544 | #endif | 3576 | |
| 3545 | se->prev_sum_exec_runtime = se->sum_exec_runtime; | 3577 | se->prev_sum_exec_runtime = se->sum_exec_runtime; |
| 3546 | } | 3578 | } |
| 3547 | 3579 | ||
| @@ -3620,13 +3652,10 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | |||
| 3620 | /* throttle cfs_rqs exceeding runtime */ | 3652 | /* throttle cfs_rqs exceeding runtime */ |
| 3621 | check_cfs_rq_runtime(cfs_rq); | 3653 | check_cfs_rq_runtime(cfs_rq); |
| 3622 | 3654 | ||
| 3623 | if (schedstat_enabled()) { | 3655 | check_spread(cfs_rq, prev); |
| 3624 | check_spread(cfs_rq, prev); | ||
| 3625 | if (prev->on_rq) | ||
| 3626 | update_stats_wait_start(cfs_rq, prev); | ||
| 3627 | } | ||
| 3628 | 3656 | ||
| 3629 | if (prev->on_rq) { | 3657 | if (prev->on_rq) { |
| 3658 | update_stats_wait_start(cfs_rq, prev); | ||
| 3630 | /* Put 'current' back into the tree. */ | 3659 | /* Put 'current' back into the tree. */ |
| 3631 | __enqueue_entity(cfs_rq, prev); | 3660 | __enqueue_entity(cfs_rq, prev); |
| 3632 | /* in !on_rq case, update occurred at dequeue */ | 3661 | /* in !on_rq case, update occurred at dequeue */ |
| @@ -4456,9 +4485,9 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | |||
| 4456 | struct sched_entity *se = &p->se; | 4485 | struct sched_entity *se = &p->se; |
| 4457 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 4486 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
| 4458 | 4487 | ||
| 4459 | WARN_ON(task_rq(p) != rq); | 4488 | SCHED_WARN_ON(task_rq(p) != rq); |
| 4460 | 4489 | ||
| 4461 | if (cfs_rq->nr_running > 1) { | 4490 | if (rq->cfs.h_nr_running > 1) { |
| 4462 | u64 slice = sched_slice(cfs_rq, se); | 4491 | u64 slice = sched_slice(cfs_rq, se); |
| 4463 | u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; | 4492 | u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; |
| 4464 | s64 delta = slice - ran; | 4493 | s64 delta = slice - ran; |
| @@ -4509,6 +4538,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
| 4509 | struct cfs_rq *cfs_rq; | 4538 | struct cfs_rq *cfs_rq; |
| 4510 | struct sched_entity *se = &p->se; | 4539 | struct sched_entity *se = &p->se; |
| 4511 | 4540 | ||
| 4541 | /* | ||
| 4542 | * If in_iowait is set, the code below may not trigger any cpufreq | ||
| 4543 | * utilization updates, so do it here explicitly with the IOWAIT flag | ||
| 4544 | * passed. | ||
| 4545 | */ | ||
| 4546 | if (p->in_iowait) | ||
| 4547 | cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT); | ||
| 4548 | |||
| 4512 | for_each_sched_entity(se) { | 4549 | for_each_sched_entity(se) { |
| 4513 | if (se->on_rq) | 4550 | if (se->on_rq) |
| 4514 | break; | 4551 | break; |
| @@ -4605,6 +4642,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
| 4605 | } | 4642 | } |
| 4606 | 4643 | ||
| 4607 | #ifdef CONFIG_SMP | 4644 | #ifdef CONFIG_SMP |
| 4645 | |||
| 4646 | /* Working cpumask for: load_balance, load_balance_newidle. */ | ||
| 4647 | DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); | ||
| 4648 | DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); | ||
| 4649 | |||
| 4608 | #ifdef CONFIG_NO_HZ_COMMON | 4650 | #ifdef CONFIG_NO_HZ_COMMON |
| 4609 | /* | 4651 | /* |
| 4610 | * per rq 'load' array crap; XXX kill this. | 4652 | * per rq 'load' array crap; XXX kill this. |
| @@ -5006,9 +5048,9 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
| 5006 | * wl = S * s'_i; see (2) | 5048 | * wl = S * s'_i; see (2) |
| 5007 | */ | 5049 | */ |
| 5008 | if (W > 0 && w < W) | 5050 | if (W > 0 && w < W) |
| 5009 | wl = (w * (long)tg->shares) / W; | 5051 | wl = (w * (long)scale_load_down(tg->shares)) / W; |
| 5010 | else | 5052 | else |
| 5011 | wl = tg->shares; | 5053 | wl = scale_load_down(tg->shares); |
| 5012 | 5054 | ||
| 5013 | /* | 5055 | /* |
| 5014 | * Per the above, wl is the new se->load.weight value; since | 5056 | * Per the above, wl is the new se->load.weight value; since |
| @@ -5091,18 +5133,18 @@ static int wake_wide(struct task_struct *p) | |||
| 5091 | return 1; | 5133 | return 1; |
| 5092 | } | 5134 | } |
| 5093 | 5135 | ||
| 5094 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | 5136 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, |
| 5137 | int prev_cpu, int sync) | ||
| 5095 | { | 5138 | { |
| 5096 | s64 this_load, load; | 5139 | s64 this_load, load; |
| 5097 | s64 this_eff_load, prev_eff_load; | 5140 | s64 this_eff_load, prev_eff_load; |
| 5098 | int idx, this_cpu, prev_cpu; | 5141 | int idx, this_cpu; |
| 5099 | struct task_group *tg; | 5142 | struct task_group *tg; |
| 5100 | unsigned long weight; | 5143 | unsigned long weight; |
| 5101 | int balanced; | 5144 | int balanced; |
| 5102 | 5145 | ||
| 5103 | idx = sd->wake_idx; | 5146 | idx = sd->wake_idx; |
| 5104 | this_cpu = smp_processor_id(); | 5147 | this_cpu = smp_processor_id(); |
| 5105 | prev_cpu = task_cpu(p); | ||
| 5106 | load = source_load(prev_cpu, idx); | 5148 | load = source_load(prev_cpu, idx); |
| 5107 | this_load = target_load(this_cpu, idx); | 5149 | this_load = target_load(this_cpu, idx); |
| 5108 | 5150 | ||
| @@ -5146,13 +5188,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
| 5146 | 5188 | ||
| 5147 | balanced = this_eff_load <= prev_eff_load; | 5189 | balanced = this_eff_load <= prev_eff_load; |
| 5148 | 5190 | ||
| 5149 | schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); | 5191 | schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); |
| 5150 | 5192 | ||
| 5151 | if (!balanced) | 5193 | if (!balanced) |
| 5152 | return 0; | 5194 | return 0; |
| 5153 | 5195 | ||
| 5154 | schedstat_inc(sd, ttwu_move_affine); | 5196 | schedstat_inc(sd->ttwu_move_affine); |
| 5155 | schedstat_inc(p, se.statistics.nr_wakeups_affine); | 5197 | schedstat_inc(p->se.statistics.nr_wakeups_affine); |
| 5156 | 5198 | ||
| 5157 | return 1; | 5199 | return 1; |
| 5158 | } | 5200 | } |
| @@ -5228,6 +5270,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
| 5228 | int shallowest_idle_cpu = -1; | 5270 | int shallowest_idle_cpu = -1; |
| 5229 | int i; | 5271 | int i; |
| 5230 | 5272 | ||
| 5273 | /* Check if we have any choice: */ | ||
| 5274 | if (group->group_weight == 1) | ||
| 5275 | return cpumask_first(sched_group_cpus(group)); | ||
| 5276 | |||
| 5231 | /* Traverse only the allowed CPUs */ | 5277 | /* Traverse only the allowed CPUs */ |
| 5232 | for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { | 5278 | for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { |
| 5233 | if (idle_cpu(i)) { | 5279 | if (idle_cpu(i)) { |
| @@ -5265,64 +5311,242 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
| 5265 | } | 5311 | } |
| 5266 | 5312 | ||
| 5267 | /* | 5313 | /* |
| 5268 | * Try and locate an idle CPU in the sched_domain. | 5314 | * Implement a for_each_cpu() variant that starts the scan at a given cpu |
| 5315 | * (@start), and wraps around. | ||
| 5316 | * | ||
| 5317 | * This is used to scan for idle CPUs; such that not all CPUs looking for an | ||
| 5318 | * idle CPU find the same CPU. The down-side is that tasks tend to cycle | ||
| 5319 | * through the LLC domain. | ||
| 5320 | * | ||
| 5321 | * tbench in particular is found to be sensitive to this. | ||
| 5269 | */ | 5322 | */ |
| 5270 | static int select_idle_sibling(struct task_struct *p, int target) | 5323 | |
| 5324 | static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped) | ||
| 5325 | { | ||
| 5326 | int next; | ||
| 5327 | |||
| 5328 | again: | ||
| 5329 | next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1); | ||
| 5330 | |||
| 5331 | if (*wrapped) { | ||
| 5332 | if (next >= start) | ||
| 5333 | return nr_cpumask_bits; | ||
| 5334 | } else { | ||
| 5335 | if (next >= nr_cpumask_bits) { | ||
| 5336 | *wrapped = 1; | ||
| 5337 | n = -1; | ||
| 5338 | goto again; | ||
| 5339 | } | ||
| 5340 | } | ||
| 5341 | |||
| 5342 | return next; | ||
| 5343 | } | ||
| 5344 | |||
| 5345 | #define for_each_cpu_wrap(cpu, mask, start, wrap) \ | ||
| 5346 | for ((wrap) = 0, (cpu) = (start)-1; \ | ||
| 5347 | (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \ | ||
| 5348 | (cpu) < nr_cpumask_bits; ) | ||
| 5349 | |||
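A quick way to see the wrap-around behaviour of for_each_cpu_wrap() is a small userspace model of cpumask_next_wrap(); the cpumask is replaced by a plain 64-bit word and NR_BITS stands in for nr_cpumask_bits, so everything below is illustrative rather than kernel API.

	/* Userspace sketch of the wrap-around scan behind for_each_cpu_wrap(). */
	#include <stdio.h>

	#define NR_BITS 64

	/* find_next_bit() stand-in over a plain 64-bit mask */
	static int next_set_bit(unsigned long long mask, int n)
	{
		for (int i = n; i < NR_BITS; i++)
			if (mask & (1ULL << i))
				return i;
		return NR_BITS;
	}

	static int next_wrap(int n, unsigned long long mask, int start, int *wrapped)
	{
		int next;
	again:
		next = next_set_bit(mask, n + 1);
		if (*wrapped) {
			if (next >= start)
				return NR_BITS;		/* scanned all the way around */
		} else if (next >= NR_BITS) {
			*wrapped = 1;			/* fell off the end: restart at bit 0 */
			n = -1;
			goto again;
		}
		return next;
	}

	int main(void)
	{
		unsigned long long mask = 0xf0f;	/* "CPUs" 0-3 and 8-11 */
		int start = 9, wrap = 0;

		for (int cpu = start - 1;
		     (cpu = next_wrap(cpu, mask, start, &wrap)) < NR_BITS; )
			printf("%d ", cpu);		/* prints: 9 10 11 0 1 2 3 8 */
		printf("\n");
		return 0;
	}

Starting each waker at a different CPU spreads the idle-CPU search across the LLC instead of piling every wakeup onto the first idle CPU in the mask.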
| 5350 | #ifdef CONFIG_SCHED_SMT | ||
| 5351 | |||
| 5352 | static inline void set_idle_cores(int cpu, int val) | ||
| 5353 | { | ||
| 5354 | struct sched_domain_shared *sds; | ||
| 5355 | |||
| 5356 | sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); | ||
| 5357 | if (sds) | ||
| 5358 | WRITE_ONCE(sds->has_idle_cores, val); | ||
| 5359 | } | ||
| 5360 | |||
| 5361 | static inline bool test_idle_cores(int cpu, bool def) | ||
| 5362 | { | ||
| 5363 | struct sched_domain_shared *sds; | ||
| 5364 | |||
| 5365 | sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); | ||
| 5366 | if (sds) | ||
| 5367 | return READ_ONCE(sds->has_idle_cores); | ||
| 5368 | |||
| 5369 | return def; | ||
| 5370 | } | ||
| 5371 | |||
| 5372 | /* | ||
| 5373 | * Scans the local SMT mask to see if the entire core is idle, and records this | ||
| 5374 | * information in sd_llc_shared->has_idle_cores. | ||
| 5375 | * | ||
| 5376 | * Since SMT siblings share all cache levels, inspecting this limited remote | ||
| 5377 | * state should be fairly cheap. | ||
| 5378 | */ | ||
| 5379 | void __update_idle_core(struct rq *rq) | ||
| 5380 | { | ||
| 5381 | int core = cpu_of(rq); | ||
| 5382 | int cpu; | ||
| 5383 | |||
| 5384 | rcu_read_lock(); | ||
| 5385 | if (test_idle_cores(core, true)) | ||
| 5386 | goto unlock; | ||
| 5387 | |||
| 5388 | for_each_cpu(cpu, cpu_smt_mask(core)) { | ||
| 5389 | if (cpu == core) | ||
| 5390 | continue; | ||
| 5391 | |||
| 5392 | if (!idle_cpu(cpu)) | ||
| 5393 | goto unlock; | ||
| 5394 | } | ||
| 5395 | |||
| 5396 | set_idle_cores(core, 1); | ||
| 5397 | unlock: | ||
| 5398 | rcu_read_unlock(); | ||
| 5399 | } | ||
| 5400 | |||
| 5401 | /* | ||
| 5402 | * Scan the entire LLC domain for idle cores; this dynamically switches off if | ||
| 5403 | * there are no idle cores left in the system; tracked through | ||
| 5404 | * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above. | ||
| 5405 | */ | ||
| 5406 | static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) | ||
| 5407 | { | ||
| 5408 | struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); | ||
| 5409 | int core, cpu, wrap; | ||
| 5410 | |||
| 5411 | if (!static_branch_likely(&sched_smt_present)) | ||
| 5412 | return -1; | ||
| 5413 | |||
| 5414 | if (!test_idle_cores(target, false)) | ||
| 5415 | return -1; | ||
| 5416 | |||
| 5417 | cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p)); | ||
| 5418 | |||
| 5419 | for_each_cpu_wrap(core, cpus, target, wrap) { | ||
| 5420 | bool idle = true; | ||
| 5421 | |||
| 5422 | for_each_cpu(cpu, cpu_smt_mask(core)) { | ||
| 5423 | cpumask_clear_cpu(cpu, cpus); | ||
| 5424 | if (!idle_cpu(cpu)) | ||
| 5425 | idle = false; | ||
| 5426 | } | ||
| 5427 | |||
| 5428 | if (idle) | ||
| 5429 | return core; | ||
| 5430 | } | ||
| 5431 | |||
| 5432 | /* | ||
| 5433 | * Failed to find an idle core; stop looking for one. | ||
| 5434 | */ | ||
| 5435 | set_idle_cores(target, 0); | ||
| 5436 | |||
| 5437 | return -1; | ||
| 5438 | } | ||
| 5439 | |||
| 5440 | /* | ||
| 5441 | * Scan the local SMT mask for idle CPUs. | ||
| 5442 | */ | ||
| 5443 | static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) | ||
| 5444 | { | ||
| 5445 | int cpu; | ||
| 5446 | |||
| 5447 | if (!static_branch_likely(&sched_smt_present)) | ||
| 5448 | return -1; | ||
| 5449 | |||
| 5450 | for_each_cpu(cpu, cpu_smt_mask(target)) { | ||
| 5451 | if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) | ||
| 5452 | continue; | ||
| 5453 | if (idle_cpu(cpu)) | ||
| 5454 | return cpu; | ||
| 5455 | } | ||
| 5456 | |||
| 5457 | return -1; | ||
| 5458 | } | ||
| 5459 | |||
| 5460 | #else /* CONFIG_SCHED_SMT */ | ||
| 5461 | |||
| 5462 | static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) | ||
| 5463 | { | ||
| 5464 | return -1; | ||
| 5465 | } | ||
| 5466 | |||
| 5467 | static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) | ||
| 5468 | { | ||
| 5469 | return -1; | ||
| 5470 | } | ||
| 5471 | |||
| 5472 | #endif /* CONFIG_SCHED_SMT */ | ||
| 5473 | |||
| 5474 | /* | ||
| 5475 | * Scan the LLC domain for idle CPUs; this is dynamically regulated by | ||
| 5476 | * comparing the average scan cost (tracked in sd->avg_scan_cost) against the | ||
| 5477 | * average idle time for this rq (as found in rq->avg_idle). | ||
| 5478 | */ | ||
| 5479 | static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) | ||
| 5480 | { | ||
| 5481 | struct sched_domain *this_sd; | ||
| 5482 | u64 avg_cost, avg_idle = this_rq()->avg_idle; | ||
| 5483 | u64 time, cost; | ||
| 5484 | s64 delta; | ||
| 5485 | int cpu, wrap; | ||
| 5486 | |||
| 5487 | this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); | ||
| 5488 | if (!this_sd) | ||
| 5489 | return -1; | ||
| 5490 | |||
| 5491 | avg_cost = this_sd->avg_scan_cost; | ||
| 5492 | |||
| 5493 | /* | ||
| 5494 | * Due to large variance we need a large fuzz factor; hackbench in | ||
| 5495 | * particular is sensitive here. | ||
| 5496 | */ | ||
| 5497 | if ((avg_idle / 512) < avg_cost) | ||
| 5498 | return -1; | ||
| 5499 | |||
| 5500 | time = local_clock(); | ||
| 5501 | |||
| 5502 | for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) { | ||
| 5503 | if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) | ||
| 5504 | continue; | ||
| 5505 | if (idle_cpu(cpu)) | ||
| 5506 | break; | ||
| 5507 | } | ||
| 5508 | |||
| 5509 | time = local_clock() - time; | ||
| 5510 | cost = this_sd->avg_scan_cost; | ||
| 5511 | delta = (s64)(time - cost) / 8; | ||
| 5512 | this_sd->avg_scan_cost += delta; | ||
| 5513 | |||
| 5514 | return cpu; | ||
| 5515 | } | ||
| 5516 | |||
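The avg_idle/512 gate in select_idle_cpu() is easier to read with numbers plugged in. The values below are invented, but both quantities are in nanoseconds (rq->avg_idle and sd->avg_scan_cost come from the scheduler clock), and avg_scan_cost itself decays as an EWMA with delta/8 as in the code above.

	/* Illustrative check of the select_idle_cpu() scan throttle (values invented). */
	#include <stdio.h>

	int main(void)
	{
		unsigned long long avg_idle = 500000;	/* ns of average idle time on this rq */
		unsigned long long avg_cost = 2000;	/* ns, EWMA of previous LLC scan cost */

		/* 500000 / 512 ~= 976 ns < 2000 ns: the scan is considered too expensive */
		printf("scan %s\n", (avg_idle / 512) < avg_cost ? "skipped" : "performed");
		return 0;
	}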
| 5517 | /* | ||
| 5518 | * Try and locate an idle core/thread in the LLC cache domain. | ||
| 5519 | */ | ||
| 5520 | static int select_idle_sibling(struct task_struct *p, int prev, int target) | ||
| 5271 | { | 5521 | { |
| 5272 | struct sched_domain *sd; | 5522 | struct sched_domain *sd; |
| 5273 | struct sched_group *sg; | 5523 | int i; |
| 5274 | int i = task_cpu(p); | ||
| 5275 | 5524 | ||
| 5276 | if (idle_cpu(target)) | 5525 | if (idle_cpu(target)) |
| 5277 | return target; | 5526 | return target; |
| 5278 | 5527 | ||
| 5279 | /* | 5528 | /* |
| 5280 | * If the prevous cpu is cache affine and idle, don't be stupid. | 5529 | * If the previous cpu is cache affine and idle, don't be stupid. |
| 5281 | */ | 5530 | */ |
| 5282 | if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) | 5531 | if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) |
| 5283 | return i; | 5532 | return prev; |
| 5284 | 5533 | ||
| 5285 | /* | ||
| 5286 | * Otherwise, iterate the domains and find an eligible idle cpu. | ||
| 5287 | * | ||
| 5288 | * A completely idle sched group at higher domains is more | ||
| 5289 | * desirable than an idle group at a lower level, because lower | ||
| 5290 | * domains have smaller groups and usually share hardware | ||
| 5291 | * resources which causes tasks to contend on them, e.g. x86 | ||
| 5292 | * hyperthread siblings in the lowest domain (SMT) can contend | ||
| 5293 | * on the shared cpu pipeline. | ||
| 5294 | * | ||
| 5295 | * However, while we prefer idle groups at higher domains | ||
| 5296 | * finding an idle cpu at the lowest domain is still better than | ||
| 5297 | * returning 'target', which we've already established, isn't | ||
| 5298 | * idle. | ||
| 5299 | */ | ||
| 5300 | sd = rcu_dereference(per_cpu(sd_llc, target)); | 5534 | sd = rcu_dereference(per_cpu(sd_llc, target)); |
| 5301 | for_each_lower_domain(sd) { | 5535 | if (!sd) |
| 5302 | sg = sd->groups; | 5536 | return target; |
| 5303 | do { | 5537 | |
| 5304 | if (!cpumask_intersects(sched_group_cpus(sg), | 5538 | i = select_idle_core(p, sd, target); |
| 5305 | tsk_cpus_allowed(p))) | 5539 | if ((unsigned)i < nr_cpumask_bits) |
| 5306 | goto next; | 5540 | return i; |
| 5307 | 5541 | ||
| 5308 | /* Ensure the entire group is idle */ | 5542 | i = select_idle_cpu(p, sd, target); |
| 5309 | for_each_cpu(i, sched_group_cpus(sg)) { | 5543 | if ((unsigned)i < nr_cpumask_bits) |
| 5310 | if (i == target || !idle_cpu(i)) | 5544 | return i; |
| 5311 | goto next; | 5545 | |
| 5312 | } | 5546 | i = select_idle_smt(p, sd, target); |
| 5547 | if ((unsigned)i < nr_cpumask_bits) | ||
| 5548 | return i; | ||
| 5313 | 5549 | ||
| 5314 | /* | ||
| 5315 | * It doesn't matter which cpu we pick, the | ||
| 5316 | * whole group is idle. | ||
| 5317 | */ | ||
| 5318 | target = cpumask_first_and(sched_group_cpus(sg), | ||
| 5319 | tsk_cpus_allowed(p)); | ||
| 5320 | goto done; | ||
| 5321 | next: | ||
| 5322 | sg = sg->next; | ||
| 5323 | } while (sg != sd->groups); | ||
| 5324 | } | ||
| 5325 | done: | ||
| 5326 | return target; | 5550 | return target; |
| 5327 | } | 5551 | } |
| 5328 | 5552 | ||
| @@ -5360,6 +5584,32 @@ static int cpu_util(int cpu) | |||
| 5360 | return (util >= capacity) ? capacity : util; | 5584 | return (util >= capacity) ? capacity : util; |
| 5361 | } | 5585 | } |
| 5362 | 5586 | ||
| 5587 | static inline int task_util(struct task_struct *p) | ||
| 5588 | { | ||
| 5589 | return p->se.avg.util_avg; | ||
| 5590 | } | ||
| 5591 | |||
| 5592 | /* | ||
| 5593 | * Disable WAKE_AFFINE in the case where task @p doesn't fit in the | ||
| 5594 | * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. | ||
| 5595 | * | ||
| 5596 | * In that case WAKE_AFFINE doesn't make sense and we'll let | ||
| 5597 | * BALANCE_WAKE sort things out. | ||
| 5598 | */ | ||
| 5599 | static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) | ||
| 5600 | { | ||
| 5601 | long min_cap, max_cap; | ||
| 5602 | |||
| 5603 | min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); | ||
| 5604 | max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; | ||
| 5605 | |||
| 5606 | /* Minimum capacity is close to max, no need to abort wake_affine */ | ||
| 5607 | if (max_cap - min_cap < max_cap >> 3) | ||
| 5608 | return 0; | ||
| 5609 | |||
| 5610 | return min_cap * 1024 < task_util(p) * capacity_margin; | ||
| 5611 | } | ||
| 5612 | |||
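A userspace sketch of the wake_cap() decision, which matters mostly on asymmetric (big.LITTLE-style) systems; the capacities and utilizations are invented, and capacity_margin is assumed to be the 1280 (~20% headroom) value used elsewhere in fair.c.

	/* Model of wake_cap(): non-zero means "skip wake_affine, the task may not
	 * fit the smaller CPU"; all numbers below are illustrative. */
	#include <stdio.h>

	#define CAPACITY_MARGIN 1280	/* assumed value of capacity_margin (~20%) */

	static int wake_cap_model(long min_cap, long max_cap, long task_util)
	{
		/* Capacities nearly symmetric (gap < max/8): never abort wake_affine. */
		if (max_cap - min_cap < max_cap >> 3)
			return 0;

		/* Task fits only if its utilization plus margin stays within the smaller CPU. */
		return min_cap * 1024 < task_util * CAPACITY_MARGIN;
	}

	int main(void)
	{
		/* little CPU capacity 430, big CPU capacity 1024 */
		printf("%d\n", wake_cap_model(430, 1024, 100));	/* 0: light task fits */
		printf("%d\n", wake_cap_model(430, 1024, 400));	/* 1: too big, fall back to BALANCE_WAKE */
		return 0;
	}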
| 5363 | /* | 5613 | /* |
| 5364 | * select_task_rq_fair: Select target runqueue for the waking task in domains | 5614 | * select_task_rq_fair: Select target runqueue for the waking task in domains |
| 5365 | * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, | 5615 | * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, |
| @@ -5383,7 +5633,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
| 5383 | 5633 | ||
| 5384 | if (sd_flag & SD_BALANCE_WAKE) { | 5634 | if (sd_flag & SD_BALANCE_WAKE) { |
| 5385 | record_wakee(p); | 5635 | record_wakee(p); |
| 5386 | want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); | 5636 | want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) |
| 5637 | && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); | ||
| 5387 | } | 5638 | } |
| 5388 | 5639 | ||
| 5389 | rcu_read_lock(); | 5640 | rcu_read_lock(); |
| @@ -5409,13 +5660,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
| 5409 | 5660 | ||
| 5410 | if (affine_sd) { | 5661 | if (affine_sd) { |
| 5411 | sd = NULL; /* Prefer wake_affine over balance flags */ | 5662 | sd = NULL; /* Prefer wake_affine over balance flags */ |
| 5412 | if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) | 5663 | if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync)) |
| 5413 | new_cpu = cpu; | 5664 | new_cpu = cpu; |
| 5414 | } | 5665 | } |
| 5415 | 5666 | ||
| 5416 | if (!sd) { | 5667 | if (!sd) { |
| 5417 | if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ | 5668 | if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ |
| 5418 | new_cpu = select_idle_sibling(p, new_cpu); | 5669 | new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); |
| 5419 | 5670 | ||
| 5420 | } else while (sd) { | 5671 | } else while (sd) { |
| 5421 | struct sched_group *group; | 5672 | struct sched_group *group; |
| @@ -5939,7 +6190,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
| 5939 | * | 6190 | * |
| 5940 | * The adjacency matrix of the resulting graph is given by: | 6191 | * The adjacency matrix of the resulting graph is given by: |
| 5941 | * | 6192 | * |
| 5942 | * log_2 n | 6193 | * log_2 n |
| 5943 | * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) | 6194 | * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) |
| 5944 | * k = 0 | 6195 | * k = 0 |
| 5945 | * | 6196 | * |
| @@ -5985,7 +6236,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
| 5985 | * | 6236 | * |
| 5986 | * [XXX write more on how we solve this.. _after_ merging pjt's patches that | 6237 | * [XXX write more on how we solve this.. _after_ merging pjt's patches that |
| 5987 | * rewrite all of this once again.] | 6238 | * rewrite all of this once again.] |
| 5988 | */ | 6239 | */ |
| 5989 | 6240 | ||
| 5990 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; | 6241 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; |
| 5991 | 6242 | ||
| @@ -6133,7 +6384,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
| 6133 | if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { | 6384 | if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { |
| 6134 | int cpu; | 6385 | int cpu; |
| 6135 | 6386 | ||
| 6136 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); | 6387 | schedstat_inc(p->se.statistics.nr_failed_migrations_affine); |
| 6137 | 6388 | ||
| 6138 | env->flags |= LBF_SOME_PINNED; | 6389 | env->flags |= LBF_SOME_PINNED; |
| 6139 | 6390 | ||
| @@ -6164,7 +6415,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
| 6164 | env->flags &= ~LBF_ALL_PINNED; | 6415 | env->flags &= ~LBF_ALL_PINNED; |
| 6165 | 6416 | ||
| 6166 | if (task_running(env->src_rq, p)) { | 6417 | if (task_running(env->src_rq, p)) { |
| 6167 | schedstat_inc(p, se.statistics.nr_failed_migrations_running); | 6418 | schedstat_inc(p->se.statistics.nr_failed_migrations_running); |
| 6168 | return 0; | 6419 | return 0; |
| 6169 | } | 6420 | } |
| 6170 | 6421 | ||
| @@ -6181,13 +6432,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
| 6181 | if (tsk_cache_hot <= 0 || | 6432 | if (tsk_cache_hot <= 0 || |
| 6182 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { | 6433 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { |
| 6183 | if (tsk_cache_hot == 1) { | 6434 | if (tsk_cache_hot == 1) { |
| 6184 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); | 6435 | schedstat_inc(env->sd->lb_hot_gained[env->idle]); |
| 6185 | schedstat_inc(p, se.statistics.nr_forced_migrations); | 6436 | schedstat_inc(p->se.statistics.nr_forced_migrations); |
| 6186 | } | 6437 | } |
| 6187 | return 1; | 6438 | return 1; |
| 6188 | } | 6439 | } |
| 6189 | 6440 | ||
| 6190 | schedstat_inc(p, se.statistics.nr_failed_migrations_hot); | 6441 | schedstat_inc(p->se.statistics.nr_failed_migrations_hot); |
| 6191 | return 0; | 6442 | return 0; |
| 6192 | } | 6443 | } |
| 6193 | 6444 | ||
| @@ -6227,7 +6478,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) | |||
| 6227 | * so we can safely collect stats here rather than | 6478 | * so we can safely collect stats here rather than |
| 6228 | * inside detach_tasks(). | 6479 | * inside detach_tasks(). |
| 6229 | */ | 6480 | */ |
| 6230 | schedstat_inc(env->sd, lb_gained[env->idle]); | 6481 | schedstat_inc(env->sd->lb_gained[env->idle]); |
| 6231 | return p; | 6482 | return p; |
| 6232 | } | 6483 | } |
| 6233 | return NULL; | 6484 | return NULL; |
| @@ -6319,7 +6570,7 @@ next: | |||
| 6319 | * so we can safely collect detach_one_task() stats here rather | 6570 | * so we can safely collect detach_one_task() stats here rather |
| 6320 | * than inside detach_one_task(). | 6571 | * than inside detach_one_task(). |
| 6321 | */ | 6572 | */ |
| 6322 | schedstat_add(env->sd, lb_gained[env->idle], detached); | 6573 | schedstat_add(env->sd->lb_gained[env->idle], detached); |
| 6323 | 6574 | ||
| 6324 | return detached; | 6575 | return detached; |
| 6325 | } | 6576 | } |
| @@ -6647,7 +6898,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
| 6647 | /* | 6898 | /* |
| 6648 | * !SD_OVERLAP domains can assume that child groups | 6899 | * !SD_OVERLAP domains can assume that child groups |
| 6649 | * span the current group. | 6900 | * span the current group. |
| 6650 | */ | 6901 | */ |
| 6651 | 6902 | ||
| 6652 | group = child->groups; | 6903 | group = child->groups; |
| 6653 | do { | 6904 | do { |
| @@ -7147,7 +7398,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 7147 | load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; | 7398 | load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; |
| 7148 | if (load_above_capacity > busiest->group_capacity) { | 7399 | if (load_above_capacity > busiest->group_capacity) { |
| 7149 | load_above_capacity -= busiest->group_capacity; | 7400 | load_above_capacity -= busiest->group_capacity; |
| 7150 | load_above_capacity *= NICE_0_LOAD; | 7401 | load_above_capacity *= scale_load_down(NICE_0_LOAD); |
| 7151 | load_above_capacity /= busiest->group_capacity; | 7402 | load_above_capacity /= busiest->group_capacity; |
| 7152 | } else | 7403 | } else |
| 7153 | load_above_capacity = ~0UL; | 7404 | load_above_capacity = ~0UL; |
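The scale_load_down() change above (like the matching one in effective_load()) only makes a difference on 64-bit kernels, where group-scheduling load carries an extra SCHED_FIXEDPOINT_SHIFT of resolution. The snippet below just illustrates the arithmetic with invented numbers, assuming that shift is 10.

	/* Illustration of the scale_load_down(NICE_0_LOAD) fix with made-up numbers. */
	#include <stdio.h>

	#define SCHED_FIXEDPOINT_SHIFT	10
	#define SCHED_CAPACITY_SCALE	1024UL
	#define NICE_0_LOAD		(1024UL << SCHED_FIXEDPOINT_SHIFT)	/* high-res 64-bit load */
	#define scale_load_down(w)	((w) >> SCHED_FIXEDPOINT_SHIFT)		/* back down to 1024 */

	int main(void)
	{
		unsigned long sum_nr_running = 6, group_capacity = 2048;	/* invented */
		unsigned long load_above = sum_nr_running * SCHED_CAPACITY_SCALE;

		load_above -= group_capacity;
		load_above *= scale_load_down(NICE_0_LOAD);	/* multiply by 1024, not 1048576 */
		load_above /= group_capacity;

		printf("load_above_capacity = %lu\n", load_above);	/* 2048 */
		return 0;
	}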
| @@ -7354,9 +7605,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
| 7354 | */ | 7605 | */ |
| 7355 | #define MAX_PINNED_INTERVAL 512 | 7606 | #define MAX_PINNED_INTERVAL 512 |
| 7356 | 7607 | ||
| 7357 | /* Working cpumask for load_balance and load_balance_newidle. */ | ||
| 7358 | DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); | ||
| 7359 | |||
| 7360 | static int need_active_balance(struct lb_env *env) | 7608 | static int need_active_balance(struct lb_env *env) |
| 7361 | { | 7609 | { |
| 7362 | struct sched_domain *sd = env->sd; | 7610 | struct sched_domain *sd = env->sd; |
| @@ -7460,7 +7708,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 7460 | 7708 | ||
| 7461 | cpumask_copy(cpus, cpu_active_mask); | 7709 | cpumask_copy(cpus, cpu_active_mask); |
| 7462 | 7710 | ||
| 7463 | schedstat_inc(sd, lb_count[idle]); | 7711 | schedstat_inc(sd->lb_count[idle]); |
| 7464 | 7712 | ||
| 7465 | redo: | 7713 | redo: |
| 7466 | if (!should_we_balance(&env)) { | 7714 | if (!should_we_balance(&env)) { |
| @@ -7470,19 +7718,19 @@ redo: | |||
| 7470 | 7718 | ||
| 7471 | group = find_busiest_group(&env); | 7719 | group = find_busiest_group(&env); |
| 7472 | if (!group) { | 7720 | if (!group) { |
| 7473 | schedstat_inc(sd, lb_nobusyg[idle]); | 7721 | schedstat_inc(sd->lb_nobusyg[idle]); |
| 7474 | goto out_balanced; | 7722 | goto out_balanced; |
| 7475 | } | 7723 | } |
| 7476 | 7724 | ||
| 7477 | busiest = find_busiest_queue(&env, group); | 7725 | busiest = find_busiest_queue(&env, group); |
| 7478 | if (!busiest) { | 7726 | if (!busiest) { |
| 7479 | schedstat_inc(sd, lb_nobusyq[idle]); | 7727 | schedstat_inc(sd->lb_nobusyq[idle]); |
| 7480 | goto out_balanced; | 7728 | goto out_balanced; |
| 7481 | } | 7729 | } |
| 7482 | 7730 | ||
| 7483 | BUG_ON(busiest == env.dst_rq); | 7731 | BUG_ON(busiest == env.dst_rq); |
| 7484 | 7732 | ||
| 7485 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); | 7733 | schedstat_add(sd->lb_imbalance[idle], env.imbalance); |
| 7486 | 7734 | ||
| 7487 | env.src_cpu = busiest->cpu; | 7735 | env.src_cpu = busiest->cpu; |
| 7488 | env.src_rq = busiest; | 7736 | env.src_rq = busiest; |
| @@ -7589,7 +7837,7 @@ more_balance: | |||
| 7589 | } | 7837 | } |
| 7590 | 7838 | ||
| 7591 | if (!ld_moved) { | 7839 | if (!ld_moved) { |
| 7592 | schedstat_inc(sd, lb_failed[idle]); | 7840 | schedstat_inc(sd->lb_failed[idle]); |
| 7593 | /* | 7841 | /* |
| 7594 | * Increment the failure counter only on periodic balance. | 7842 | * Increment the failure counter only on periodic balance. |
| 7595 | * We do not want newidle balance, which can be very | 7843 | * We do not want newidle balance, which can be very |
| @@ -7672,7 +7920,7 @@ out_all_pinned: | |||
| 7672 | * we can't migrate them. Let the imbalance flag set so parent level | 7920 | * we can't migrate them. Let the imbalance flag set so parent level |
| 7673 | * can try to migrate them. | 7921 | * can try to migrate them. |
| 7674 | */ | 7922 | */ |
| 7675 | schedstat_inc(sd, lb_balanced[idle]); | 7923 | schedstat_inc(sd->lb_balanced[idle]); |
| 7676 | 7924 | ||
| 7677 | sd->nr_balance_failed = 0; | 7925 | sd->nr_balance_failed = 0; |
| 7678 | 7926 | ||
| @@ -7704,11 +7952,12 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) | |||
| 7704 | } | 7952 | } |
| 7705 | 7953 | ||
| 7706 | static inline void | 7954 | static inline void |
| 7707 | update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) | 7955 | update_next_balance(struct sched_domain *sd, unsigned long *next_balance) |
| 7708 | { | 7956 | { |
| 7709 | unsigned long interval, next; | 7957 | unsigned long interval, next; |
| 7710 | 7958 | ||
| 7711 | interval = get_sd_balance_interval(sd, cpu_busy); | 7959 | /* used by idle balance, so cpu_busy = 0 */ |
| 7960 | interval = get_sd_balance_interval(sd, 0); | ||
| 7712 | next = sd->last_balance + interval; | 7961 | next = sd->last_balance + interval; |
| 7713 | 7962 | ||
| 7714 | if (time_after(*next_balance, next)) | 7963 | if (time_after(*next_balance, next)) |
| @@ -7738,7 +7987,7 @@ static int idle_balance(struct rq *this_rq) | |||
| 7738 | rcu_read_lock(); | 7987 | rcu_read_lock(); |
| 7739 | sd = rcu_dereference_check_sched_domain(this_rq->sd); | 7988 | sd = rcu_dereference_check_sched_domain(this_rq->sd); |
| 7740 | if (sd) | 7989 | if (sd) |
| 7741 | update_next_balance(sd, 0, &next_balance); | 7990 | update_next_balance(sd, &next_balance); |
| 7742 | rcu_read_unlock(); | 7991 | rcu_read_unlock(); |
| 7743 | 7992 | ||
| 7744 | goto out; | 7993 | goto out; |
| @@ -7756,7 +8005,7 @@ static int idle_balance(struct rq *this_rq) | |||
| 7756 | continue; | 8005 | continue; |
| 7757 | 8006 | ||
| 7758 | if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { | 8007 | if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { |
| 7759 | update_next_balance(sd, 0, &next_balance); | 8008 | update_next_balance(sd, &next_balance); |
| 7760 | break; | 8009 | break; |
| 7761 | } | 8010 | } |
| 7762 | 8011 | ||
| @@ -7774,7 +8023,7 @@ static int idle_balance(struct rq *this_rq) | |||
| 7774 | curr_cost += domain_cost; | 8023 | curr_cost += domain_cost; |
| 7775 | } | 8024 | } |
| 7776 | 8025 | ||
| 7777 | update_next_balance(sd, 0, &next_balance); | 8026 | update_next_balance(sd, &next_balance); |
| 7778 | 8027 | ||
| 7779 | /* | 8028 | /* |
| 7780 | * Stop searching for tasks to pull if there are | 8029 | * Stop searching for tasks to pull if there are |
| @@ -7864,15 +8113,15 @@ static int active_load_balance_cpu_stop(void *data) | |||
| 7864 | .idle = CPU_IDLE, | 8113 | .idle = CPU_IDLE, |
| 7865 | }; | 8114 | }; |
| 7866 | 8115 | ||
| 7867 | schedstat_inc(sd, alb_count); | 8116 | schedstat_inc(sd->alb_count); |
| 7868 | 8117 | ||
| 7869 | p = detach_one_task(&env); | 8118 | p = detach_one_task(&env); |
| 7870 | if (p) { | 8119 | if (p) { |
| 7871 | schedstat_inc(sd, alb_pushed); | 8120 | schedstat_inc(sd->alb_pushed); |
| 7872 | /* Active balancing done, reset the failure counter. */ | 8121 | /* Active balancing done, reset the failure counter. */ |
| 7873 | sd->nr_balance_failed = 0; | 8122 | sd->nr_balance_failed = 0; |
| 7874 | } else { | 8123 | } else { |
| 7875 | schedstat_inc(sd, alb_failed); | 8124 | schedstat_inc(sd->alb_failed); |
| 7876 | } | 8125 | } |
| 7877 | } | 8126 | } |
| 7878 | rcu_read_unlock(); | 8127 | rcu_read_unlock(); |
| @@ -7964,13 +8213,13 @@ static inline void set_cpu_sd_state_busy(void) | |||
| 7964 | int cpu = smp_processor_id(); | 8213 | int cpu = smp_processor_id(); |
| 7965 | 8214 | ||
| 7966 | rcu_read_lock(); | 8215 | rcu_read_lock(); |
| 7967 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); | 8216 | sd = rcu_dereference(per_cpu(sd_llc, cpu)); |
| 7968 | 8217 | ||
| 7969 | if (!sd || !sd->nohz_idle) | 8218 | if (!sd || !sd->nohz_idle) |
| 7970 | goto unlock; | 8219 | goto unlock; |
| 7971 | sd->nohz_idle = 0; | 8220 | sd->nohz_idle = 0; |
| 7972 | 8221 | ||
| 7973 | atomic_inc(&sd->groups->sgc->nr_busy_cpus); | 8222 | atomic_inc(&sd->shared->nr_busy_cpus); |
| 7974 | unlock: | 8223 | unlock: |
| 7975 | rcu_read_unlock(); | 8224 | rcu_read_unlock(); |
| 7976 | } | 8225 | } |
| @@ -7981,13 +8230,13 @@ void set_cpu_sd_state_idle(void) | |||
| 7981 | int cpu = smp_processor_id(); | 8230 | int cpu = smp_processor_id(); |
| 7982 | 8231 | ||
| 7983 | rcu_read_lock(); | 8232 | rcu_read_lock(); |
| 7984 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); | 8233 | sd = rcu_dereference(per_cpu(sd_llc, cpu)); |
| 7985 | 8234 | ||
| 7986 | if (!sd || sd->nohz_idle) | 8235 | if (!sd || sd->nohz_idle) |
| 7987 | goto unlock; | 8236 | goto unlock; |
| 7988 | sd->nohz_idle = 1; | 8237 | sd->nohz_idle = 1; |
| 7989 | 8238 | ||
| 7990 | atomic_dec(&sd->groups->sgc->nr_busy_cpus); | 8239 | atomic_dec(&sd->shared->nr_busy_cpus); |
| 7991 | unlock: | 8240 | unlock: |
| 7992 | rcu_read_unlock(); | 8241 | rcu_read_unlock(); |
| 7993 | } | 8242 | } |
| @@ -8214,8 +8463,8 @@ end: | |||
| 8214 | static inline bool nohz_kick_needed(struct rq *rq) | 8463 | static inline bool nohz_kick_needed(struct rq *rq) |
| 8215 | { | 8464 | { |
| 8216 | unsigned long now = jiffies; | 8465 | unsigned long now = jiffies; |
| 8466 | struct sched_domain_shared *sds; | ||
| 8217 | struct sched_domain *sd; | 8467 | struct sched_domain *sd; |
| 8218 | struct sched_group_capacity *sgc; | ||
| 8219 | int nr_busy, cpu = rq->cpu; | 8468 | int nr_busy, cpu = rq->cpu; |
| 8220 | bool kick = false; | 8469 | bool kick = false; |
| 8221 | 8470 | ||
| @@ -8243,11 +8492,13 @@ static inline bool nohz_kick_needed(struct rq *rq) | |||
| 8243 | return true; | 8492 | return true; |
| 8244 | 8493 | ||
| 8245 | rcu_read_lock(); | 8494 | rcu_read_lock(); |
| 8246 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); | 8495 | sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); |
| 8247 | if (sd) { | 8496 | if (sds) { |
| 8248 | sgc = sd->groups->sgc; | 8497 | /* |
| 8249 | nr_busy = atomic_read(&sgc->nr_busy_cpus); | 8498 | * XXX: write a coherent comment on why we do this. |
| 8250 | 8499 | * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com | |
| 8500 | */ | ||
| 8501 | nr_busy = atomic_read(&sds->nr_busy_cpus); | ||
| 8251 | if (nr_busy > 1) { | 8502 | if (nr_busy > 1) { |
| 8252 | kick = true; | 8503 | kick = true; |
| 8253 | goto unlock; | 8504 | goto unlock; |
| @@ -8283,7 +8534,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } | |||
| 8283 | * run_rebalance_domains is triggered when needed from the scheduler tick. | 8534 | * run_rebalance_domains is triggered when needed from the scheduler tick. |
| 8284 | * Also triggered for nohz idle balancing (with nohz_balancing_kick set). | 8535 | * Also triggered for nohz idle balancing (with nohz_balancing_kick set). |
| 8285 | */ | 8536 | */ |
| 8286 | static void run_rebalance_domains(struct softirq_action *h) | 8537 | static __latent_entropy void run_rebalance_domains(struct softirq_action *h) |
| 8287 | { | 8538 | { |
| 8288 | struct rq *this_rq = this_rq(); | 8539 | struct rq *this_rq = this_rq(); |
| 8289 | enum cpu_idle_type idle = this_rq->idle_balance ? | 8540 | enum cpu_idle_type idle = this_rq->idle_balance ? |
| @@ -8441,7 +8692,6 @@ static void detach_task_cfs_rq(struct task_struct *p) | |||
| 8441 | struct sched_entity *se = &p->se; | 8692 | struct sched_entity *se = &p->se; |
| 8442 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 8693 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
| 8443 | u64 now = cfs_rq_clock_task(cfs_rq); | 8694 | u64 now = cfs_rq_clock_task(cfs_rq); |
| 8444 | int tg_update; | ||
| 8445 | 8695 | ||
| 8446 | if (!vruntime_normalized(p)) { | 8696 | if (!vruntime_normalized(p)) { |
| 8447 | /* | 8697 | /* |
| @@ -8453,10 +8703,9 @@ static void detach_task_cfs_rq(struct task_struct *p) | |||
| 8453 | } | 8703 | } |
| 8454 | 8704 | ||
| 8455 | /* Catch up with the cfs_rq and remove our load when we leave */ | 8705 | /* Catch up with the cfs_rq and remove our load when we leave */ |
| 8456 | tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); | 8706 | update_cfs_rq_load_avg(now, cfs_rq, false); |
| 8457 | detach_entity_load_avg(cfs_rq, se); | 8707 | detach_entity_load_avg(cfs_rq, se); |
| 8458 | if (tg_update) | 8708 | update_tg_load_avg(cfs_rq, false); |
| 8459 | update_tg_load_avg(cfs_rq, false); | ||
| 8460 | } | 8709 | } |
| 8461 | 8710 | ||
| 8462 | static void attach_task_cfs_rq(struct task_struct *p) | 8711 | static void attach_task_cfs_rq(struct task_struct *p) |
| @@ -8464,7 +8713,6 @@ static void attach_task_cfs_rq(struct task_struct *p) | |||
| 8464 | struct sched_entity *se = &p->se; | 8713 | struct sched_entity *se = &p->se; |
| 8465 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 8714 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
| 8466 | u64 now = cfs_rq_clock_task(cfs_rq); | 8715 | u64 now = cfs_rq_clock_task(cfs_rq); |
| 8467 | int tg_update; | ||
| 8468 | 8716 | ||
| 8469 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8717 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 8470 | /* | 8718 | /* |
| @@ -8475,10 +8723,9 @@ static void attach_task_cfs_rq(struct task_struct *p) | |||
| 8475 | #endif | 8723 | #endif |
| 8476 | 8724 | ||
| 8477 | /* Synchronize task with its cfs_rq */ | 8725 | /* Synchronize task with its cfs_rq */ |
| 8478 | tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); | 8726 | update_cfs_rq_load_avg(now, cfs_rq, false); |
| 8479 | attach_entity_load_avg(cfs_rq, se); | 8727 | attach_entity_load_avg(cfs_rq, se); |
| 8480 | if (tg_update) | 8728 | update_tg_load_avg(cfs_rq, false); |
| 8481 | update_tg_load_avg(cfs_rq, false); | ||
| 8482 | 8729 | ||
| 8483 | if (!vruntime_normalized(p)) | 8730 | if (!vruntime_normalized(p)) |
| 8484 | se->vruntime += cfs_rq->min_vruntime; | 8731 | se->vruntime += cfs_rq->min_vruntime; |
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 9fb873cfc75c..1d8718d5300d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
| @@ -16,6 +16,9 @@ | |||
| 16 | 16 | ||
| 17 | #include "sched.h" | 17 | #include "sched.h" |
| 18 | 18 | ||
| 19 | /* Linker adds these: start and end of __cpuidle functions */ | ||
| 20 | extern char __cpuidle_text_start[], __cpuidle_text_end[]; | ||
| 21 | |||
| 19 | /** | 22 | /** |
| 20 | * sched_idle_set_state - Record idle state for the current CPU. | 23 | * sched_idle_set_state - Record idle state for the current CPU. |
| 21 | * @idle_state: State to record. | 24 | * @idle_state: State to record. |
| @@ -53,7 +56,7 @@ static int __init cpu_idle_nopoll_setup(char *__unused) | |||
| 53 | __setup("hlt", cpu_idle_nopoll_setup); | 56 | __setup("hlt", cpu_idle_nopoll_setup); |
| 54 | #endif | 57 | #endif |
| 55 | 58 | ||
| 56 | static inline int cpu_idle_poll(void) | 59 | static noinline int __cpuidle cpu_idle_poll(void) |
| 57 | { | 60 | { |
| 58 | rcu_idle_enter(); | 61 | rcu_idle_enter(); |
| 59 | trace_cpu_idle_rcuidle(0, smp_processor_id()); | 62 | trace_cpu_idle_rcuidle(0, smp_processor_id()); |
| @@ -84,7 +87,7 @@ void __weak arch_cpu_idle(void) | |||
| 84 | * | 87 | * |
| 85 | * To use when the cpuidle framework cannot be used. | 88 | * To use when the cpuidle framework cannot be used. |
| 86 | */ | 89 | */ |
| 87 | void default_idle_call(void) | 90 | void __cpuidle default_idle_call(void) |
| 88 | { | 91 | { |
| 89 | if (current_clr_polling_and_test()) { | 92 | if (current_clr_polling_and_test()) { |
| 90 | local_irq_enable(); | 93 | local_irq_enable(); |
| @@ -271,6 +274,12 @@ static void cpu_idle_loop(void) | |||
| 271 | } | 274 | } |
| 272 | } | 275 | } |
| 273 | 276 | ||
| 277 | bool cpu_in_idle(unsigned long pc) | ||
| 278 | { | ||
| 279 | return pc >= (unsigned long)__cpuidle_text_start && | ||
| 280 | pc < (unsigned long)__cpuidle_text_end; | ||
| 281 | } | ||
| 282 | |||
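cpu_in_idle() relies on the idle functions being grouped into one contiguous text range. The two boundary symbols are expected to come from a linker-script helper along the lines of the assumed, simplified sketch below; the real definition would live in include/asm-generic/vmlinux.lds.h, not in this file.

	/* Assumed shape of the linker-script helper that collects __cpuidle functions
	 * and emits the boundary symbols consumed by cpu_in_idle() above. */
	#define CPUIDLE_TEXT						\
			ALIGN_FUNCTION();				\
			VMLINUX_SYMBOL(__cpuidle_text_start) = .;	\
			*(.cpuidle.text)				\
			VMLINUX_SYMBOL(__cpuidle_text_end) = .;

Marking a function __cpuidle then presumably just places it in .cpuidle.text via a section attribute, which is what the noinline __cpuidle annotations above rely on.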
| 274 | void cpu_startup_entry(enum cpuhp_state state) | 283 | void cpu_startup_entry(enum cpuhp_state state) |
| 275 | { | 284 | { |
| 276 | /* | 285 | /* |
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 2ce5458bbe1d..5405d3feb112 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
| @@ -27,8 +27,8 @@ static struct task_struct * | |||
| 27 | pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) | 27 | pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) |
| 28 | { | 28 | { |
| 29 | put_prev_task(rq, prev); | 29 | put_prev_task(rq, prev); |
| 30 | 30 | update_idle_core(rq); | |
| 31 | schedstat_inc(rq, sched_goidle); | 31 | schedstat_inc(rq->sched_goidle); |
| 32 | return rq->idle; | 32 | return rq->idle; |
| 33 | } | 33 | } |
| 34 | 34 | ||
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index d5690b722691..2516b8df6dbb 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
| @@ -957,9 +957,8 @@ static void update_curr_rt(struct rq *rq) | |||
| 957 | if (unlikely((s64)delta_exec <= 0)) | 957 | if (unlikely((s64)delta_exec <= 0)) |
| 958 | return; | 958 | return; |
| 959 | 959 | ||
| 960 | /* Kick cpufreq (see the comment in linux/cpufreq.h). */ | 960 | /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ |
| 961 | if (cpu_of(rq) == smp_processor_id()) | 961 | cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT); |
| 962 | cpufreq_trigger_update(rq_clock(rq)); | ||
| 963 | 962 | ||
| 964 | schedstat_set(curr->se.statistics.exec_max, | 963 | schedstat_set(curr->se.statistics.exec_max, |
| 965 | max(curr->se.statistics.exec_max, delta_exec)); | 964 | max(curr->se.statistics.exec_max, delta_exec)); |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c64fc5114004..055f935d4421 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -2,6 +2,7 @@ | |||
| 2 | #include <linux/sched.h> | 2 | #include <linux/sched.h> |
| 3 | #include <linux/sched/sysctl.h> | 3 | #include <linux/sched/sysctl.h> |
| 4 | #include <linux/sched/rt.h> | 4 | #include <linux/sched/rt.h> |
| 5 | #include <linux/u64_stats_sync.h> | ||
| 5 | #include <linux/sched/deadline.h> | 6 | #include <linux/sched/deadline.h> |
| 6 | #include <linux/binfmts.h> | 7 | #include <linux/binfmts.h> |
| 7 | #include <linux/mutex.h> | 8 | #include <linux/mutex.h> |
| @@ -15,6 +16,12 @@ | |||
| 15 | #include "cpudeadline.h" | 16 | #include "cpudeadline.h" |
| 16 | #include "cpuacct.h" | 17 | #include "cpuacct.h" |
| 17 | 18 | ||
| 19 | #ifdef CONFIG_SCHED_DEBUG | ||
| 20 | #define SCHED_WARN_ON(x) WARN_ONCE(x, #x) | ||
| 21 | #else | ||
| 22 | #define SCHED_WARN_ON(x) ((void)(x)) | ||
| 23 | #endif | ||
| 24 | |||
| 18 | struct rq; | 25 | struct rq; |
| 19 | struct cpuidle_state; | 26 | struct cpuidle_state; |
| 20 | 27 | ||
| @@ -565,6 +572,8 @@ struct root_domain { | |||
| 565 | */ | 572 | */ |
| 566 | cpumask_var_t rto_mask; | 573 | cpumask_var_t rto_mask; |
| 567 | struct cpupri cpupri; | 574 | struct cpupri cpupri; |
| 575 | |||
| 576 | unsigned long max_cpu_capacity; | ||
| 568 | }; | 577 | }; |
| 569 | 578 | ||
| 570 | extern struct root_domain def_root_domain; | 579 | extern struct root_domain def_root_domain; |
| @@ -597,7 +606,6 @@ struct rq { | |||
| 597 | #ifdef CONFIG_SMP | 606 | #ifdef CONFIG_SMP |
| 598 | unsigned long last_load_update_tick; | 607 | unsigned long last_load_update_tick; |
| 599 | #endif /* CONFIG_SMP */ | 608 | #endif /* CONFIG_SMP */ |
| 600 | u64 nohz_stamp; | ||
| 601 | unsigned long nohz_flags; | 609 | unsigned long nohz_flags; |
| 602 | #endif /* CONFIG_NO_HZ_COMMON */ | 610 | #endif /* CONFIG_NO_HZ_COMMON */ |
| 603 | #ifdef CONFIG_NO_HZ_FULL | 611 | #ifdef CONFIG_NO_HZ_FULL |
| @@ -723,6 +731,23 @@ static inline int cpu_of(struct rq *rq) | |||
| 723 | #endif | 731 | #endif |
| 724 | } | 732 | } |
| 725 | 733 | ||
| 734 | |||
| 735 | #ifdef CONFIG_SCHED_SMT | ||
| 736 | |||
| 737 | extern struct static_key_false sched_smt_present; | ||
| 738 | |||
| 739 | extern void __update_idle_core(struct rq *rq); | ||
| 740 | |||
| 741 | static inline void update_idle_core(struct rq *rq) | ||
| 742 | { | ||
| 743 | if (static_branch_unlikely(&sched_smt_present)) | ||
| 744 | __update_idle_core(rq); | ||
| 745 | } | ||
| 746 | |||
| 747 | #else | ||
| 748 | static inline void update_idle_core(struct rq *rq) { } | ||
| 749 | #endif | ||
| 750 | |||
| 726 | DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 751 | DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
| 727 | 752 | ||
| 728 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | 753 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) |
| @@ -857,8 +882,8 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | |||
| 857 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); | 882 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); |
| 858 | DECLARE_PER_CPU(int, sd_llc_size); | 883 | DECLARE_PER_CPU(int, sd_llc_size); |
| 859 | DECLARE_PER_CPU(int, sd_llc_id); | 884 | DECLARE_PER_CPU(int, sd_llc_id); |
| 885 | DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); | ||
| 860 | DECLARE_PER_CPU(struct sched_domain *, sd_numa); | 886 | DECLARE_PER_CPU(struct sched_domain *, sd_numa); |
| 861 | DECLARE_PER_CPU(struct sched_domain *, sd_busy); | ||
| 862 | DECLARE_PER_CPU(struct sched_domain *, sd_asym); | 887 | DECLARE_PER_CPU(struct sched_domain *, sd_asym); |
| 863 | 888 | ||
| 864 | struct sched_group_capacity { | 889 | struct sched_group_capacity { |
| @@ -870,10 +895,6 @@ struct sched_group_capacity { | |||
| 870 | unsigned int capacity; | 895 | unsigned int capacity; |
| 871 | unsigned long next_update; | 896 | unsigned long next_update; |
| 872 | int imbalance; /* XXX unrelated to capacity but shared group state */ | 897 | int imbalance; /* XXX unrelated to capacity but shared group state */ |
| 873 | /* | ||
| 874 | * Number of busy cpus in this group. | ||
| 875 | */ | ||
| 876 | atomic_t nr_busy_cpus; | ||
| 877 | 898 | ||
| 878 | unsigned long cpumask[0]; /* iteration mask */ | 899 | unsigned long cpumask[0]; /* iteration mask */ |
| 879 | }; | 900 | }; |
| @@ -1000,7 +1021,11 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
| 1000 | * per-task data have been completed by this moment. | 1021 | * per-task data have been completed by this moment. |
| 1001 | */ | 1022 | */ |
| 1002 | smp_wmb(); | 1023 | smp_wmb(); |
| 1024 | #ifdef CONFIG_THREAD_INFO_IN_TASK | ||
| 1025 | p->cpu = cpu; | ||
| 1026 | #else | ||
| 1003 | task_thread_info(p)->cpu = cpu; | 1027 | task_thread_info(p)->cpu = cpu; |
| 1028 | #endif | ||
| 1004 | p->wake_cpu = cpu; | 1029 | p->wake_cpu = cpu; |
| 1005 | #endif | 1030 | #endif |
| 1006 | } | 1031 | } |
| @@ -1260,6 +1285,11 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev) | |||
| 1260 | prev->sched_class->put_prev_task(rq, prev); | 1285 | prev->sched_class->put_prev_task(rq, prev); |
| 1261 | } | 1286 | } |
| 1262 | 1287 | ||
| 1288 | static inline void set_curr_task(struct rq *rq, struct task_struct *curr) | ||
| 1289 | { | ||
| 1290 | curr->sched_class->set_curr_task(rq); | ||
| 1291 | } | ||
| 1292 | |||
| 1263 | #define sched_class_highest (&stop_sched_class) | 1293 | #define sched_class_highest (&stop_sched_class) |
| 1264 | #define for_each_class(class) \ | 1294 | #define for_each_class(class) \ |
| 1265 | for (class = sched_class_highest; class; class = class->next) | 1295 | for (class = sched_class_highest; class; class = class->next) |
| @@ -1290,7 +1320,7 @@ static inline void idle_set_state(struct rq *rq, | |||
| 1290 | 1320 | ||
| 1291 | static inline struct cpuidle_state *idle_get_state(struct rq *rq) | 1321 | static inline struct cpuidle_state *idle_get_state(struct rq *rq) |
| 1292 | { | 1322 | { |
| 1293 | WARN_ON(!rcu_read_lock_held()); | 1323 | SCHED_WARN_ON(!rcu_read_lock_held()); |
| 1294 | return rq->idle_state; | 1324 | return rq->idle_state; |
| 1295 | } | 1325 | } |
| 1296 | #else | 1326 | #else |
| @@ -1710,52 +1740,28 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { } | |||
| 1710 | #endif | 1740 | #endif |
| 1711 | 1741 | ||
| 1712 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 1742 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
| 1743 | struct irqtime { | ||
| 1744 | u64 hardirq_time; | ||
| 1745 | u64 softirq_time; | ||
| 1746 | u64 irq_start_time; | ||
| 1747 | struct u64_stats_sync sync; | ||
| 1748 | }; | ||
| 1713 | 1749 | ||
| 1714 | DECLARE_PER_CPU(u64, cpu_hardirq_time); | 1750 | DECLARE_PER_CPU(struct irqtime, cpu_irqtime); |
| 1715 | DECLARE_PER_CPU(u64, cpu_softirq_time); | ||
| 1716 | |||
| 1717 | #ifndef CONFIG_64BIT | ||
| 1718 | DECLARE_PER_CPU(seqcount_t, irq_time_seq); | ||
| 1719 | |||
| 1720 | static inline void irq_time_write_begin(void) | ||
| 1721 | { | ||
| 1722 | __this_cpu_inc(irq_time_seq.sequence); | ||
| 1723 | smp_wmb(); | ||
| 1724 | } | ||
| 1725 | |||
| 1726 | static inline void irq_time_write_end(void) | ||
| 1727 | { | ||
| 1728 | smp_wmb(); | ||
| 1729 | __this_cpu_inc(irq_time_seq.sequence); | ||
| 1730 | } | ||
| 1731 | 1751 | ||
| 1732 | static inline u64 irq_time_read(int cpu) | 1752 | static inline u64 irq_time_read(int cpu) |
| 1733 | { | 1753 | { |
| 1734 | u64 irq_time; | 1754 | struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); |
| 1735 | unsigned seq; | 1755 | unsigned int seq; |
| 1756 | u64 total; | ||
| 1736 | 1757 | ||
| 1737 | do { | 1758 | do { |
| 1738 | seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); | 1759 | seq = __u64_stats_fetch_begin(&irqtime->sync); |
| 1739 | irq_time = per_cpu(cpu_softirq_time, cpu) + | 1760 | total = irqtime->softirq_time + irqtime->hardirq_time; |
| 1740 | per_cpu(cpu_hardirq_time, cpu); | 1761 | } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); |
| 1741 | } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); | ||
| 1742 | |||
| 1743 | return irq_time; | ||
| 1744 | } | ||
| 1745 | #else /* CONFIG_64BIT */ | ||
| 1746 | static inline void irq_time_write_begin(void) | ||
| 1747 | { | ||
| 1748 | } | ||
| 1749 | |||
| 1750 | static inline void irq_time_write_end(void) | ||
| 1751 | { | ||
| 1752 | } | ||
| 1753 | 1762 | ||
| 1754 | static inline u64 irq_time_read(int cpu) | 1763 | return total; |
| 1755 | { | ||
| 1756 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); | ||
| 1757 | } | 1764 | } |
| 1758 | #endif /* CONFIG_64BIT */ | ||
| 1759 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | 1765 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ |
| 1760 | 1766 | ||
| 1761 | #ifdef CONFIG_CPU_FREQ | 1767 | #ifdef CONFIG_CPU_FREQ |
| @@ -1763,27 +1769,13 @@ DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); | |||
| 1763 | 1769 | ||
| 1764 | /** | 1770 | /** |
| 1765 | * cpufreq_update_util - Take a note about CPU utilization changes. | 1771 | * cpufreq_update_util - Take a note about CPU utilization changes. |
| 1766 | * @time: Current time. | 1772 | * @rq: Runqueue to carry out the update for. |
| 1767 | * @util: Current utilization. | 1773 | * @flags: Update reason flags. |
| 1768 | * @max: Utilization ceiling. | ||
| 1769 | * | 1774 | * |
| 1770 | * This function is called by the scheduler on every invocation of | 1775 | * This function is called by the scheduler on the CPU whose utilization is |
| 1771 | * update_load_avg() on the CPU whose utilization is being updated. | 1776 | * being updated. |
| 1772 | * | 1777 | * |
| 1773 | * It can only be called from RCU-sched read-side critical sections. | 1778 | * It can only be called from RCU-sched read-side critical sections. |
| 1774 | */ | ||
| 1775 | static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) | ||
| 1776 | { | ||
| 1777 | struct update_util_data *data; | ||
| 1778 | |||
| 1779 | data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); | ||
| 1780 | if (data) | ||
| 1781 | data->func(data, time, util, max); | ||
| 1782 | } | ||
| 1783 | |||
| 1784 | /** | ||
| 1785 | * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed. | ||
| 1786 | * @time: Current time. | ||
| 1787 | * | 1779 | * |
| 1788 | * The way cpufreq is currently arranged requires it to evaluate the CPU | 1780 | * The way cpufreq is currently arranged requires it to evaluate the CPU |
| 1789 | * performance state (frequency/voltage) on a regular basis to prevent it from | 1781 | * performance state (frequency/voltage) on a regular basis to prevent it from |
| @@ -1797,13 +1789,23 @@ static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned lo | |||
| 1797 | * but that really is a band-aid. Going forward it should be replaced with | 1789 | * but that really is a band-aid. Going forward it should be replaced with |
| 1798 | * solutions targeted more specifically at RT and DL tasks. | 1790 | * solutions targeted more specifically at RT and DL tasks. |
| 1799 | */ | 1791 | */ |
| 1800 | static inline void cpufreq_trigger_update(u64 time) | 1792 | static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) |
| 1793 | { | ||
| 1794 | struct update_util_data *data; | ||
| 1795 | |||
| 1796 | data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); | ||
| 1797 | if (data) | ||
| 1798 | data->func(data, rq_clock(rq), flags); | ||
| 1799 | } | ||
| 1800 | |||
| 1801 | static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) | ||
| 1801 | { | 1802 | { |
| 1802 | cpufreq_update_util(time, ULONG_MAX, 0); | 1803 | if (cpu_of(rq) == smp_processor_id()) |
| 1804 | cpufreq_update_util(rq, flags); | ||
| 1803 | } | 1805 | } |
| 1804 | #else | 1806 | #else |
| 1805 | static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {} | 1807 | static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} |
| 1806 | static inline void cpufreq_trigger_update(u64 time) {} | 1808 | static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {} |
| 1807 | #endif /* CONFIG_CPU_FREQ */ | 1809 | #endif /* CONFIG_CPU_FREQ */ |
| 1808 | 1810 | ||
| 1809 | #ifdef arch_scale_freq_capacity | 1811 | #ifdef arch_scale_freq_capacity |
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 78955cbea31c..34659a853505 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h | |||
| @@ -29,11 +29,12 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) | |||
| 29 | if (rq) | 29 | if (rq) |
| 30 | rq->rq_sched_info.run_delay += delta; | 30 | rq->rq_sched_info.run_delay += delta; |
| 31 | } | 31 | } |
| 32 | # define schedstat_enabled() static_branch_unlikely(&sched_schedstats) | 32 | #define schedstat_enabled() static_branch_unlikely(&sched_schedstats) |
| 33 | # define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) | 33 | #define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) |
| 34 | # define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) | 34 | #define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) |
| 35 | # define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) | 35 | #define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) |
| 36 | # define schedstat_val(rq, field) ((schedstat_enabled()) ? (rq)->field : 0) | 36 | #define schedstat_val(var) (var) |
| 37 | #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) | ||
| 37 | 38 | ||
| 38 | #else /* !CONFIG_SCHEDSTATS */ | 39 | #else /* !CONFIG_SCHEDSTATS */ |
| 39 | static inline void | 40 | static inline void |
| @@ -45,12 +46,13 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) | |||
| 45 | static inline void | 46 | static inline void |
| 46 | rq_sched_info_depart(struct rq *rq, unsigned long long delta) | 47 | rq_sched_info_depart(struct rq *rq, unsigned long long delta) |
| 47 | {} | 48 | {} |
| 48 | # define schedstat_enabled() 0 | 49 | #define schedstat_enabled() 0 |
| 49 | # define schedstat_inc(rq, field) do { } while (0) | 50 | #define schedstat_inc(var) do { } while (0) |
| 50 | # define schedstat_add(rq, field, amt) do { } while (0) | 51 | #define schedstat_add(var, amt) do { } while (0) |
| 51 | # define schedstat_set(var, val) do { } while (0) | 52 | #define schedstat_set(var, val) do { } while (0) |
| 52 | # define schedstat_val(rq, field) 0 | 53 | #define schedstat_val(var) 0 |
| 53 | #endif | 54 | #define schedstat_val_or_zero(var) 0 |
| 55 | #endif /* CONFIG_SCHEDSTATS */ | ||
| 54 | 56 | ||
| 55 | #ifdef CONFIG_SCHED_INFO | 57 | #ifdef CONFIG_SCHED_INFO |
| 56 | static inline void sched_info_reset_dequeued(struct task_struct *t) | 58 | static inline void sched_info_reset_dequeued(struct task_struct *t) |
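The stats.h hunk switches the schedstat macros from an (rq, field) pair to a plain variable and adds schedstat_val_or_zero(). Below is a small userspace model of the new macro shapes, assuming an ordinary boolean in place of the sched_schedstats static key; wait_count and wait_sum are made-up example counters.

        /* sketch of the new-style schedstat macros; userspace model */
        #include <stdio.h>
        #include <stdbool.h>

        static bool schedstats_on = true;       /* stands in for the static key */

        #define schedstat_enabled()        (schedstats_on)
        #define schedstat_inc(var)         do { if (schedstat_enabled()) { (var)++; } } while (0)
        #define schedstat_add(var, amt)    do { if (schedstat_enabled()) { (var) += (amt); } } while (0)
        #define schedstat_set(var, val)    do { if (schedstat_enabled()) { (var) = (val); } } while (0)
        #define schedstat_val(var)         (var)
        #define schedstat_val_or_zero(var) (schedstat_enabled() ? (var) : 0)

        int main(void)
        {
                unsigned long wait_count = 0, wait_sum = 0;

                schedstat_inc(wait_count);
                schedstat_add(wait_sum, 125);
                printf("count=%lu sum=%lu (or-zero=%lu)\n",
                       schedstat_val(wait_count),
                       schedstat_val(wait_sum),
                       schedstat_val_or_zero(wait_sum));
                return 0;
        }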
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index f15d6b6a538a..4f7053579fe3 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c | |||
| @@ -196,27 +196,48 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) | |||
| 196 | } | 196 | } |
| 197 | EXPORT_SYMBOL(prepare_to_wait_exclusive); | 197 | EXPORT_SYMBOL(prepare_to_wait_exclusive); |
| 198 | 198 | ||
| 199 | long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) | 199 | void init_wait_entry(wait_queue_t *wait, int flags) |
| 200 | { | 200 | { |
| 201 | unsigned long flags; | 201 | wait->flags = flags; |
| 202 | |||
| 203 | if (signal_pending_state(state, current)) | ||
| 204 | return -ERESTARTSYS; | ||
| 205 | |||
| 206 | wait->private = current; | 202 | wait->private = current; |
| 207 | wait->func = autoremove_wake_function; | 203 | wait->func = autoremove_wake_function; |
| 204 | INIT_LIST_HEAD(&wait->task_list); | ||
| 205 | } | ||
| 206 | EXPORT_SYMBOL(init_wait_entry); | ||
| 207 | |||
| 208 | long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) | ||
| 209 | { | ||
| 210 | unsigned long flags; | ||
| 211 | long ret = 0; | ||
| 208 | 212 | ||
| 209 | spin_lock_irqsave(&q->lock, flags); | 213 | spin_lock_irqsave(&q->lock, flags); |
| 210 | if (list_empty(&wait->task_list)) { | 214 | if (unlikely(signal_pending_state(state, current))) { |
| 211 | if (wait->flags & WQ_FLAG_EXCLUSIVE) | 215 | /* |
| 212 | __add_wait_queue_tail(q, wait); | 216 | * Exclusive waiter must not fail if it was selected by wakeup, |
| 213 | else | 217 | * it should "consume" the condition we were waiting for. |
| 214 | __add_wait_queue(q, wait); | 218 | * |
| 219 | * The caller will recheck the condition and return success if | ||
| 220 | * we were already woken up, we can not miss the event because | ||
| 221 | * wakeup locks/unlocks the same q->lock. | ||
| 222 | * | ||
| 223 | * But we need to ensure that set-condition + wakeup after that | ||
| 224 | * can't see us, it should wake up another exclusive waiter if | ||
| 225 | * we fail. | ||
| 226 | */ | ||
| 227 | list_del_init(&wait->task_list); | ||
| 228 | ret = -ERESTARTSYS; | ||
| 229 | } else { | ||
| 230 | if (list_empty(&wait->task_list)) { | ||
| 231 | if (wait->flags & WQ_FLAG_EXCLUSIVE) | ||
| 232 | __add_wait_queue_tail(q, wait); | ||
| 233 | else | ||
| 234 | __add_wait_queue(q, wait); | ||
| 235 | } | ||
| 236 | set_current_state(state); | ||
| 215 | } | 237 | } |
| 216 | set_current_state(state); | ||
| 217 | spin_unlock_irqrestore(&q->lock, flags); | 238 | spin_unlock_irqrestore(&q->lock, flags); |
| 218 | 239 | ||
| 219 | return 0; | 240 | return ret; |
| 220 | } | 241 | } |
| 221 | EXPORT_SYMBOL(prepare_to_wait_event); | 242 | EXPORT_SYMBOL(prepare_to_wait_event); |
| 222 | 243 | ||
| @@ -255,39 +276,6 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) | |||
| 255 | } | 276 | } |
| 256 | EXPORT_SYMBOL(finish_wait); | 277 | EXPORT_SYMBOL(finish_wait); |
| 257 | 278 | ||
| 258 | /** | ||
| 259 | * abort_exclusive_wait - abort exclusive waiting in a queue | ||
| 260 | * @q: waitqueue waited on | ||
| 261 | * @wait: wait descriptor | ||
| 262 | * @mode: runstate of the waiter to be woken | ||
| 263 | * @key: key to identify a wait bit queue or %NULL | ||
| 264 | * | ||
| 265 | * Sets current thread back to running state and removes | ||
| 266 | * the wait descriptor from the given waitqueue if still | ||
| 267 | * queued. | ||
| 268 | * | ||
| 269 | * Wakes up the next waiter if the caller is concurrently | ||
| 270 | * woken up through the queue. | ||
| 271 | * | ||
| 272 | * This prevents waiter starvation where an exclusive waiter | ||
| 273 | * aborts and is woken up concurrently and no one wakes up | ||
| 274 | * the next waiter. | ||
| 275 | */ | ||
| 276 | void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, | ||
| 277 | unsigned int mode, void *key) | ||
| 278 | { | ||
| 279 | unsigned long flags; | ||
| 280 | |||
| 281 | __set_current_state(TASK_RUNNING); | ||
| 282 | spin_lock_irqsave(&q->lock, flags); | ||
| 283 | if (!list_empty(&wait->task_list)) | ||
| 284 | list_del_init(&wait->task_list); | ||
| 285 | else if (waitqueue_active(q)) | ||
| 286 | __wake_up_locked_key(q, mode, key); | ||
| 287 | spin_unlock_irqrestore(&q->lock, flags); | ||
| 288 | } | ||
| 289 | EXPORT_SYMBOL(abort_exclusive_wait); | ||
| 290 | |||
| 291 | int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) | 279 | int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) |
| 292 | { | 280 | { |
| 293 | int ret = default_wake_function(wait, mode, sync, key); | 281 | int ret = default_wake_function(wait, mode, sync, key); |
| @@ -425,20 +413,29 @@ int __sched | |||
| 425 | __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, | 413 | __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, |
| 426 | wait_bit_action_f *action, unsigned mode) | 414 | wait_bit_action_f *action, unsigned mode) |
| 427 | { | 415 | { |
| 428 | do { | 416 | int ret = 0; |
| 429 | int ret; | ||
| 430 | 417 | ||
| 418 | for (;;) { | ||
| 431 | prepare_to_wait_exclusive(wq, &q->wait, mode); | 419 | prepare_to_wait_exclusive(wq, &q->wait, mode); |
| 432 | if (!test_bit(q->key.bit_nr, q->key.flags)) | 420 | if (test_bit(q->key.bit_nr, q->key.flags)) { |
| 433 | continue; | 421 | ret = action(&q->key, mode); |
| 434 | ret = action(&q->key, mode); | 422 | /* |
| 435 | if (!ret) | 423 | * See the comment in prepare_to_wait_event(). |
| 436 | continue; | 424 | * finish_wait() does not necessarily take wq->lock, |
| 437 | abort_exclusive_wait(wq, &q->wait, mode, &q->key); | 425 | * but test_and_set_bit() implies mb() which pairs with |
| 438 | return ret; | 426 | * smp_mb__after_atomic() before wake_up_page(). |
| 439 | } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); | 427 | */ |
| 440 | finish_wait(wq, &q->wait); | 428 | if (ret) |
| 441 | return 0; | 429 | finish_wait(wq, &q->wait); |
| 430 | } | ||
| 431 | if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) { | ||
| 432 | if (!ret) | ||
| 433 | finish_wait(wq, &q->wait); | ||
| 434 | return 0; | ||
| 435 | } else if (ret) { | ||
| 436 | return ret; | ||
| 437 | } | ||
| 438 | } | ||
| 442 | } | 439 | } |
| 443 | EXPORT_SYMBOL(__wait_on_bit_lock); | 440 | EXPORT_SYMBOL(__wait_on_bit_lock); |
| 444 | 441 | ||
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index ef6c6c3f9d8a..0db7c8a2afe2 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
| @@ -605,12 +605,16 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, | |||
| 605 | ptrace_event(PTRACE_EVENT_SECCOMP, data); | 605 | ptrace_event(PTRACE_EVENT_SECCOMP, data); |
| 606 | /* | 606 | /* |
| 607 | * The delivery of a fatal signal during event | 607 | * The delivery of a fatal signal during event |
| 608 | * notification may silently skip tracer notification. | 608 | * notification may silently skip tracer notification, |
| 609 | * Terminating the task now avoids executing a system | 609 | * which could leave us with a potentially unmodified |
| 610 | * call that may not be intended. | 610 | * syscall that the tracer would have liked to have |
| 611 | * changed. Since the process is about to die, we just | ||
| 612 | * force the syscall to be skipped and let the signal | ||
| 613 | * kill the process and correctly handle any tracer exit | ||
| 614 | * notifications. | ||
| 611 | */ | 615 | */ |
| 612 | if (fatal_signal_pending(current)) | 616 | if (fatal_signal_pending(current)) |
| 613 | do_exit(SIGSYS); | 617 | goto skip; |
| 614 | /* Check if the tracer forced the syscall to be skipped. */ | 618 | /* Check if the tracer forced the syscall to be skipped. */ |
| 615 | this_syscall = syscall_get_nr(current, task_pt_regs(current)); | 619 | this_syscall = syscall_get_nr(current, task_pt_regs(current)); |
| 616 | if (this_syscall < 0) | 620 | if (this_syscall < 0) |
diff --git a/kernel/signal.c b/kernel/signal.c index af21afc00d08..75761acc77cf 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -3044,6 +3044,11 @@ void kernel_sigaction(int sig, __sighandler_t action) | |||
| 3044 | } | 3044 | } |
| 3045 | EXPORT_SYMBOL(kernel_sigaction); | 3045 | EXPORT_SYMBOL(kernel_sigaction); |
| 3046 | 3046 | ||
| 3047 | void __weak sigaction_compat_abi(struct k_sigaction *act, | ||
| 3048 | struct k_sigaction *oact) | ||
| 3049 | { | ||
| 3050 | } | ||
| 3051 | |||
| 3047 | int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) | 3052 | int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) |
| 3048 | { | 3053 | { |
| 3049 | struct task_struct *p = current, *t; | 3054 | struct task_struct *p = current, *t; |
| @@ -3059,6 +3064,8 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) | |||
| 3059 | if (oact) | 3064 | if (oact) |
| 3060 | *oact = *k; | 3065 | *oact = *k; |
| 3061 | 3066 | ||
| 3067 | sigaction_compat_abi(act, oact); | ||
| 3068 | |||
| 3062 | if (act) { | 3069 | if (act) { |
| 3063 | sigdelsetmask(&act->sa.sa_mask, | 3070 | sigdelsetmask(&act->sa.sa_mask, |
| 3064 | sigmask(SIGKILL) | sigmask(SIGSTOP)); | 3071 | sigmask(SIGKILL) | sigmask(SIGSTOP)); |
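do_sigaction() now calls a __weak sigaction_compat_abi() stub that an architecture can replace with a strong definition. The userspace sketch below demonstrates only the weak-symbol mechanism with a hypothetical compat_abi_fixup(); it is not the kernel hook itself.

        /* sketch: the __weak default-plus-optional-override pattern */
        #include <stdio.h>

        /* Default no-op, analogous to the weak sigaction_compat_abi() stub.
         * A separate object file could supply a strong definition and the
         * linker would pick that one instead of this fallback. */
        __attribute__((weak)) void compat_abi_fixup(void)
        {
                printf("weak default: no compat fixups needed\n");
        }

        int main(void)
        {
                compat_abi_fixup();
                return 0;
        }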
diff --git a/kernel/smp.c b/kernel/smp.c index 3aa642d39c03..bba3b201668d 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
| @@ -14,6 +14,7 @@ | |||
| 14 | #include <linux/smp.h> | 14 | #include <linux/smp.h> |
| 15 | #include <linux/cpu.h> | 15 | #include <linux/cpu.h> |
| 16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
| 17 | #include <linux/hypervisor.h> | ||
| 17 | 18 | ||
| 18 | #include "smpboot.h" | 19 | #include "smpboot.h" |
| 19 | 20 | ||
| @@ -724,3 +725,54 @@ void wake_up_all_idle_cpus(void) | |||
| 724 | preempt_enable(); | 725 | preempt_enable(); |
| 725 | } | 726 | } |
| 726 | EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); | 727 | EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); |
| 728 | |||
| 729 | /** | ||
| 730 | * smp_call_on_cpu - Call a function on a specific cpu | ||
| 731 | * | ||
| 732 | * Used to call a function on a specific cpu and wait for it to return. | ||
| 733 | * Optionally make sure the call is done on a specified physical cpu via vcpu | ||
| 734 | * pinning in order to support virtualized environments. | ||
| 735 | */ | ||
| 736 | struct smp_call_on_cpu_struct { | ||
| 737 | struct work_struct work; | ||
| 738 | struct completion done; | ||
| 739 | int (*func)(void *); | ||
| 740 | void *data; | ||
| 741 | int ret; | ||
| 742 | int cpu; | ||
| 743 | }; | ||
| 744 | |||
| 745 | static void smp_call_on_cpu_callback(struct work_struct *work) | ||
| 746 | { | ||
| 747 | struct smp_call_on_cpu_struct *sscs; | ||
| 748 | |||
| 749 | sscs = container_of(work, struct smp_call_on_cpu_struct, work); | ||
| 750 | if (sscs->cpu >= 0) | ||
| 751 | hypervisor_pin_vcpu(sscs->cpu); | ||
| 752 | sscs->ret = sscs->func(sscs->data); | ||
| 753 | if (sscs->cpu >= 0) | ||
| 754 | hypervisor_pin_vcpu(-1); | ||
| 755 | |||
| 756 | complete(&sscs->done); | ||
| 757 | } | ||
| 758 | |||
| 759 | int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) | ||
| 760 | { | ||
| 761 | struct smp_call_on_cpu_struct sscs = { | ||
| 762 | .done = COMPLETION_INITIALIZER_ONSTACK(sscs.done), | ||
| 763 | .func = func, | ||
| 764 | .data = par, | ||
| 765 | .cpu = phys ? cpu : -1, | ||
| 766 | }; | ||
| 767 | |||
| 768 | INIT_WORK_ONSTACK(&sscs.work, smp_call_on_cpu_callback); | ||
| 769 | |||
| 770 | if (cpu >= nr_cpu_ids || !cpu_online(cpu)) | ||
| 771 | return -ENXIO; | ||
| 772 | |||
| 773 | queue_work_on(cpu, system_wq, &sscs.work); | ||
| 774 | wait_for_completion(&sscs.done); | ||
| 775 | |||
| 776 | return sscs.ret; | ||
| 777 | } | ||
| 778 | EXPORT_SYMBOL_GPL(smp_call_on_cpu); | ||
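smp_call_on_cpu() above queues a work item on the target CPU's workqueue and blocks on an on-stack completion until the callback has run. The pthread sketch below models just that "hand a function to a worker, wait for completion" shape in userspace (build with -pthread); there is no CPU or vCPU pinning here, and call_on_thread()/say_hello() are invented names.

        /* sketch: queue a callback on a worker and wait for its result */
        #include <pthread.h>
        #include <stdio.h>

        struct call_on_thread {
                int (*func)(void *);
                void *data;
                int ret;
                pthread_mutex_t lock;
                pthread_cond_t done;
                int finished;
        };

        static void *worker(void *arg)
        {
                struct call_on_thread *c = arg;

                c->ret = c->func(c->data);      /* run the callback "remotely" */
                pthread_mutex_lock(&c->lock);
                c->finished = 1;                /* complete(&done) equivalent */
                pthread_cond_signal(&c->done);
                pthread_mutex_unlock(&c->lock);
                return NULL;
        }

        static int call_on_thread(int (*func)(void *), void *data)
        {
                struct call_on_thread c = {
                        .func = func, .data = data,
                        .lock = PTHREAD_MUTEX_INITIALIZER,
                        .done = PTHREAD_COND_INITIALIZER,
                };
                pthread_t t;

                pthread_create(&t, NULL, worker, &c);
                pthread_mutex_lock(&c.lock);
                while (!c.finished)             /* wait_for_completion() */
                        pthread_cond_wait(&c.done, &c.lock);
                pthread_mutex_unlock(&c.lock);
                pthread_join(t, NULL);
                return c.ret;
        }

        static int say_hello(void *data)
        {
                printf("running with \"%s\" on the worker\n", (const char *)data);
                return 42;
        }

        int main(void)
        {
                printf("ret = %d\n", call_on_thread(say_hello, "hello"));
                return 0;
        }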
diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 13bc43d1fb22..4a5c6e73ecd4 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c | |||
| @@ -186,6 +186,11 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu) | |||
| 186 | kfree(td); | 186 | kfree(td); |
| 187 | return PTR_ERR(tsk); | 187 | return PTR_ERR(tsk); |
| 188 | } | 188 | } |
| 189 | /* | ||
| 190 | * Park the thread so that it could start right on the CPU | ||
| 191 | * when it is available. | ||
| 192 | */ | ||
| 193 | kthread_park(tsk); | ||
| 189 | get_task_struct(tsk); | 194 | get_task_struct(tsk); |
| 190 | *per_cpu_ptr(ht->store, cpu) = tsk; | 195 | *per_cpu_ptr(ht->store, cpu) = tsk; |
| 191 | if (ht->create) { | 196 | if (ht->create) { |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 17caf4b63342..1bf81ef91375 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
| @@ -78,6 +78,17 @@ static void wakeup_softirqd(void) | |||
| 78 | } | 78 | } |
| 79 | 79 | ||
| 80 | /* | 80 | /* |
| 81 | * If ksoftirqd is scheduled, we do not want to process pending softirqs | ||
| 82 | * right now. Let ksoftirqd handle this at its own rate, to get fairness. | ||
| 83 | */ | ||
| 84 | static bool ksoftirqd_running(void) | ||
| 85 | { | ||
| 86 | struct task_struct *tsk = __this_cpu_read(ksoftirqd); | ||
| 87 | |||
| 88 | return tsk && (tsk->state == TASK_RUNNING); | ||
| 89 | } | ||
| 90 | |||
| 91 | /* | ||
| 81 | * preempt_count and SOFTIRQ_OFFSET usage: | 92 | * preempt_count and SOFTIRQ_OFFSET usage: |
| 82 | * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving | 93 | * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving |
| 83 | * softirq processing. | 94 | * softirq processing. |
| @@ -313,7 +324,7 @@ asmlinkage __visible void do_softirq(void) | |||
| 313 | 324 | ||
| 314 | pending = local_softirq_pending(); | 325 | pending = local_softirq_pending(); |
| 315 | 326 | ||
| 316 | if (pending) | 327 | if (pending && !ksoftirqd_running()) |
| 317 | do_softirq_own_stack(); | 328 | do_softirq_own_stack(); |
| 318 | 329 | ||
| 319 | local_irq_restore(flags); | 330 | local_irq_restore(flags); |
| @@ -340,6 +351,9 @@ void irq_enter(void) | |||
| 340 | 351 | ||
| 341 | static inline void invoke_softirq(void) | 352 | static inline void invoke_softirq(void) |
| 342 | { | 353 | { |
| 354 | if (ksoftirqd_running()) | ||
| 355 | return; | ||
| 356 | |||
| 343 | if (!force_irqthreads) { | 357 | if (!force_irqthreads) { |
| 344 | #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK | 358 | #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK |
| 345 | /* | 359 | /* |
| @@ -482,7 +496,7 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t) | |||
| 482 | } | 496 | } |
| 483 | EXPORT_SYMBOL(__tasklet_hi_schedule_first); | 497 | EXPORT_SYMBOL(__tasklet_hi_schedule_first); |
| 484 | 498 | ||
| 485 | static void tasklet_action(struct softirq_action *a) | 499 | static __latent_entropy void tasklet_action(struct softirq_action *a) |
| 486 | { | 500 | { |
| 487 | struct tasklet_struct *list; | 501 | struct tasklet_struct *list; |
| 488 | 502 | ||
| @@ -518,7 +532,7 @@ static void tasklet_action(struct softirq_action *a) | |||
| 518 | } | 532 | } |
| 519 | } | 533 | } |
| 520 | 534 | ||
| 521 | static void tasklet_hi_action(struct softirq_action *a) | 535 | static __latent_entropy void tasklet_hi_action(struct softirq_action *a) |
| 522 | { | 536 | { |
| 523 | struct tasklet_struct *list; | 537 | struct tasklet_struct *list; |
| 524 | 538 | ||
| @@ -700,7 +714,7 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) | |||
| 700 | BUG(); | 714 | BUG(); |
| 701 | } | 715 | } |
| 702 | 716 | ||
| 703 | static void takeover_tasklets(unsigned int cpu) | 717 | static int takeover_tasklets(unsigned int cpu) |
| 704 | { | 718 | { |
| 705 | /* CPU is dead, so no lock needed. */ | 719 | /* CPU is dead, so no lock needed. */ |
| 706 | local_irq_disable(); | 720 | local_irq_disable(); |
| @@ -723,27 +737,12 @@ static void takeover_tasklets(unsigned int cpu) | |||
| 723 | raise_softirq_irqoff(HI_SOFTIRQ); | 737 | raise_softirq_irqoff(HI_SOFTIRQ); |
| 724 | 738 | ||
| 725 | local_irq_enable(); | 739 | local_irq_enable(); |
| 740 | return 0; | ||
| 726 | } | 741 | } |
| 742 | #else | ||
| 743 | #define takeover_tasklets NULL | ||
| 727 | #endif /* CONFIG_HOTPLUG_CPU */ | 744 | #endif /* CONFIG_HOTPLUG_CPU */ |
| 728 | 745 | ||
| 729 | static int cpu_callback(struct notifier_block *nfb, unsigned long action, | ||
| 730 | void *hcpu) | ||
| 731 | { | ||
| 732 | switch (action) { | ||
| 733 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 734 | case CPU_DEAD: | ||
| 735 | case CPU_DEAD_FROZEN: | ||
| 736 | takeover_tasklets((unsigned long)hcpu); | ||
| 737 | break; | ||
| 738 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
| 739 | } | ||
| 740 | return NOTIFY_OK; | ||
| 741 | } | ||
| 742 | |||
| 743 | static struct notifier_block cpu_nfb = { | ||
| 744 | .notifier_call = cpu_callback | ||
| 745 | }; | ||
| 746 | |||
| 747 | static struct smp_hotplug_thread softirq_threads = { | 746 | static struct smp_hotplug_thread softirq_threads = { |
| 748 | .store = &ksoftirqd, | 747 | .store = &ksoftirqd, |
| 749 | .thread_should_run = ksoftirqd_should_run, | 748 | .thread_should_run = ksoftirqd_should_run, |
| @@ -753,8 +752,8 @@ static struct smp_hotplug_thread softirq_threads = { | |||
| 753 | 752 | ||
| 754 | static __init int spawn_ksoftirqd(void) | 753 | static __init int spawn_ksoftirqd(void) |
| 755 | { | 754 | { |
| 756 | register_cpu_notifier(&cpu_nfb); | 755 | cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL, |
| 757 | 756 | takeover_tasklets); | |
| 758 | BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); | 757 | BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); |
| 759 | 758 | ||
| 760 | return 0; | 759 | return 0; |
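The softirq changes gate inline softirq processing on ksoftirqd not already being runnable, so a busy ksoftirqd keeps draining pending work at its own pace instead of being bypassed. A toy C model of that gate follows; struct cpu and the state enum are illustrative stand-ins, not kernel structures.

        /* sketch: defer to the per-cpu thread if it is already runnable */
        #include <stdbool.h>
        #include <stdio.h>

        enum thread_state { SLEEPING, RUNNING };

        struct cpu {
                enum thread_state ksoftirqd_state;
                unsigned int pending;
        };

        static bool ksoftirqd_running(const struct cpu *c)
        {
                return c->ksoftirqd_state == RUNNING;
        }

        static void invoke_softirq(struct cpu *c)
        {
                if (ksoftirqd_running(c)) {
                        /* the thread will drain `pending` at its own pace,
                         * keeping fairness with normal tasks */
                        printf("deferred to ksoftirqd\n");
                        return;
                }
                printf("processing %u pending softirqs inline\n", c->pending);
                c->pending = 0;
        }

        int main(void)
        {
                struct cpu c = { .ksoftirqd_state = RUNNING, .pending = 3 };

                invoke_softirq(&c);             /* deferred */
                c.ksoftirqd_state = SLEEPING;
                invoke_softirq(&c);             /* handled inline */
                return 0;
        }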
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 4a1ca5f6da7e..ec9ab2f01489 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
| @@ -20,7 +20,6 @@ | |||
| 20 | #include <linux/kallsyms.h> | 20 | #include <linux/kallsyms.h> |
| 21 | #include <linux/smpboot.h> | 21 | #include <linux/smpboot.h> |
| 22 | #include <linux/atomic.h> | 22 | #include <linux/atomic.h> |
| 23 | #include <linux/lglock.h> | ||
| 24 | #include <linux/nmi.h> | 23 | #include <linux/nmi.h> |
| 25 | 24 | ||
| 26 | /* | 25 | /* |
| @@ -47,13 +46,9 @@ struct cpu_stopper { | |||
| 47 | static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); | 46 | static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); |
| 48 | static bool stop_machine_initialized = false; | 47 | static bool stop_machine_initialized = false; |
| 49 | 48 | ||
| 50 | /* | 49 | /* static data for stop_cpus */ |
| 51 | * Avoids a race between stop_two_cpus and global stop_cpus, where | 50 | static DEFINE_MUTEX(stop_cpus_mutex); |
| 52 | * the stoppers could get queued up in reverse order, leading to | 51 | static bool stop_cpus_in_progress; |
| 53 | * system deadlock. Using an lglock means stop_two_cpus remains | ||
| 54 | * relatively cheap. | ||
| 55 | */ | ||
| 56 | DEFINE_STATIC_LGLOCK(stop_cpus_lock); | ||
| 57 | 52 | ||
| 58 | static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) | 53 | static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) |
| 59 | { | 54 | { |
| @@ -126,6 +121,11 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) | |||
| 126 | cpu_stop_init_done(&done, 1); | 121 | cpu_stop_init_done(&done, 1); |
| 127 | if (!cpu_stop_queue_work(cpu, &work)) | 122 | if (!cpu_stop_queue_work(cpu, &work)) |
| 128 | return -ENOENT; | 123 | return -ENOENT; |
| 124 | /* | ||
| 125 | * In case @cpu == smp_proccessor_id() we can avoid a sleep+wakeup | ||
| 126 | * cycle by doing a preemption: | ||
| 127 | */ | ||
| 128 | cond_resched(); | ||
| 129 | wait_for_completion(&done.completion); | 129 | wait_for_completion(&done.completion); |
| 130 | return done.ret; | 130 | return done.ret; |
| 131 | } | 131 | } |
| @@ -230,14 +230,26 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, | |||
| 230 | struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1); | 230 | struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1); |
| 231 | struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); | 231 | struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); |
| 232 | int err; | 232 | int err; |
| 233 | 233 | retry: | |
| 234 | lg_double_lock(&stop_cpus_lock, cpu1, cpu2); | ||
| 235 | spin_lock_irq(&stopper1->lock); | 234 | spin_lock_irq(&stopper1->lock); |
| 236 | spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); | 235 | spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); |
| 237 | 236 | ||
| 238 | err = -ENOENT; | 237 | err = -ENOENT; |
| 239 | if (!stopper1->enabled || !stopper2->enabled) | 238 | if (!stopper1->enabled || !stopper2->enabled) |
| 240 | goto unlock; | 239 | goto unlock; |
| 240 | /* | ||
| 241 | * Ensure that if we race with __stop_cpus() the stoppers won't get | ||
| 242 | * queued up in reverse order leading to system deadlock. | ||
| 243 | * | ||
| 244 | * We can't miss stop_cpus_in_progress if queue_stop_cpus_work() has | ||
| 245 | * queued a work on cpu1 but not on cpu2, we hold both locks. | ||
| 246 | * | ||
| 247 | * It can be falsely true but it is safe to spin until it is cleared, | ||
| 248 | * queue_stop_cpus_work() does everything under preempt_disable(). | ||
| 249 | */ | ||
| 250 | err = -EDEADLK; | ||
| 251 | if (unlikely(stop_cpus_in_progress)) | ||
| 252 | goto unlock; | ||
| 241 | 253 | ||
| 242 | err = 0; | 254 | err = 0; |
| 243 | __cpu_stop_queue_work(stopper1, work1); | 255 | __cpu_stop_queue_work(stopper1, work1); |
| @@ -245,8 +257,12 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, | |||
| 245 | unlock: | 257 | unlock: |
| 246 | spin_unlock(&stopper2->lock); | 258 | spin_unlock(&stopper2->lock); |
| 247 | spin_unlock_irq(&stopper1->lock); | 259 | spin_unlock_irq(&stopper1->lock); |
| 248 | lg_double_unlock(&stop_cpus_lock, cpu1, cpu2); | ||
| 249 | 260 | ||
| 261 | if (unlikely(err == -EDEADLK)) { | ||
| 262 | while (stop_cpus_in_progress) | ||
| 263 | cpu_relax(); | ||
| 264 | goto retry; | ||
| 265 | } | ||
| 250 | return err; | 266 | return err; |
| 251 | } | 267 | } |
| 252 | /** | 268 | /** |
| @@ -316,9 +332,6 @@ bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, | |||
| 316 | return cpu_stop_queue_work(cpu, work_buf); | 332 | return cpu_stop_queue_work(cpu, work_buf); |
| 317 | } | 333 | } |
| 318 | 334 | ||
| 319 | /* static data for stop_cpus */ | ||
| 320 | static DEFINE_MUTEX(stop_cpus_mutex); | ||
| 321 | |||
| 322 | static bool queue_stop_cpus_work(const struct cpumask *cpumask, | 335 | static bool queue_stop_cpus_work(const struct cpumask *cpumask, |
| 323 | cpu_stop_fn_t fn, void *arg, | 336 | cpu_stop_fn_t fn, void *arg, |
| 324 | struct cpu_stop_done *done) | 337 | struct cpu_stop_done *done) |
| @@ -332,7 +345,8 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask, | |||
| 332 | * preempted by a stopper which might wait for other stoppers | 345 | * preempted by a stopper which might wait for other stoppers |
| 333 | * to enter @fn which can lead to deadlock. | 346 | * to enter @fn which can lead to deadlock. |
| 334 | */ | 347 | */ |
| 335 | lg_global_lock(&stop_cpus_lock); | 348 | preempt_disable(); |
| 349 | stop_cpus_in_progress = true; | ||
| 336 | for_each_cpu(cpu, cpumask) { | 350 | for_each_cpu(cpu, cpumask) { |
| 337 | work = &per_cpu(cpu_stopper.stop_work, cpu); | 351 | work = &per_cpu(cpu_stopper.stop_work, cpu); |
| 338 | work->fn = fn; | 352 | work->fn = fn; |
| @@ -341,7 +355,8 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask, | |||
| 341 | if (cpu_stop_queue_work(cpu, work)) | 355 | if (cpu_stop_queue_work(cpu, work)) |
| 342 | queued = true; | 356 | queued = true; |
| 343 | } | 357 | } |
| 344 | lg_global_unlock(&stop_cpus_lock); | 358 | stop_cpus_in_progress = false; |
| 359 | preempt_enable(); | ||
| 345 | 360 | ||
| 346 | return queued; | 361 | return queued; |
| 347 | } | 362 | } |
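With the lglock gone, stop_two_cpus() avoids racing a global stop_cpus() pass by checking a stop_cpus_in_progress flag while holding the two per-CPU locks, and retrying after a cpu_relax() spin if the flag is set. The C11-atomics sketch below shows only that flag-and-retry shape (single-threaded, no real per-CPU locks); queue_all() and queue_two() are made-up names.

        /* sketch: back off and retry while a global queuing pass runs */
        #include <stdatomic.h>
        #include <stdbool.h>
        #include <stdio.h>

        static atomic_bool stop_cpus_in_progress;

        /* global pass: sets the flag, queues work everywhere, clears it */
        static void queue_all(void)
        {
                atomic_store(&stop_cpus_in_progress, true);
                /* ... queue per-cpu work in cpu order ... */
                atomic_store(&stop_cpus_in_progress, false);
        }

        /* two-cpu pass: must not interleave with a global pass, or the
         * works could be queued in the opposite order and deadlock */
        static int queue_two(void)
        {
                for (;;) {
                        /* the real code takes both per-cpu locks here */
                        if (!atomic_load(&stop_cpus_in_progress)) {
                                /* ... queue work on the two cpus ... */
                                return 0;
                        }
                        /* drop the locks, spin until the global pass ends */
                        while (atomic_load(&stop_cpus_in_progress))
                                ;               /* cpu_relax() in the kernel */
                }
        }

        int main(void)
        {
                queue_all();
                printf("queue_two -> %d\n", queue_two());
                return 0;
        }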
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 2c5e3a8e00d7..635482e60ca3 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
| @@ -250,3 +250,8 @@ cond_syscall(sys_execveat); | |||
| 250 | 250 | ||
| 251 | /* membarrier */ | 251 | /* membarrier */ |
| 252 | cond_syscall(sys_membarrier); | 252 | cond_syscall(sys_membarrier); |
| 253 | |||
| 254 | /* memory protection keys */ | ||
| 255 | cond_syscall(sys_pkey_mprotect); | ||
| 256 | cond_syscall(sys_pkey_alloc); | ||
| 257 | cond_syscall(sys_pkey_free); | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b43d0b27c1fe..706309f9ed84 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -65,6 +65,7 @@ | |||
| 65 | #include <linux/sched/sysctl.h> | 65 | #include <linux/sched/sysctl.h> |
| 66 | #include <linux/kexec.h> | 66 | #include <linux/kexec.h> |
| 67 | #include <linux/bpf.h> | 67 | #include <linux/bpf.h> |
| 68 | #include <linux/mount.h> | ||
| 68 | 69 | ||
| 69 | #include <asm/uaccess.h> | 70 | #include <asm/uaccess.h> |
| 70 | #include <asm/processor.h> | 71 | #include <asm/processor.h> |
| @@ -106,9 +107,8 @@ extern unsigned int core_pipe_limit; | |||
| 106 | extern int pid_max; | 107 | extern int pid_max; |
| 107 | extern int pid_max_min, pid_max_max; | 108 | extern int pid_max_min, pid_max_max; |
| 108 | extern int percpu_pagelist_fraction; | 109 | extern int percpu_pagelist_fraction; |
| 109 | extern int compat_log; | ||
| 110 | extern int latencytop_enabled; | 110 | extern int latencytop_enabled; |
| 111 | extern int sysctl_nr_open_min, sysctl_nr_open_max; | 111 | extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max; |
| 112 | #ifndef CONFIG_MMU | 112 | #ifndef CONFIG_MMU |
| 113 | extern int sysctl_nr_trim_pages; | 113 | extern int sysctl_nr_trim_pages; |
| 114 | #endif | 114 | #endif |
| @@ -1084,15 +1084,6 @@ static struct ctl_table kern_table[] = { | |||
| 1084 | .extra1 = &neg_one, | 1084 | .extra1 = &neg_one, |
| 1085 | }, | 1085 | }, |
| 1086 | #endif | 1086 | #endif |
| 1087 | #ifdef CONFIG_COMPAT | ||
| 1088 | { | ||
| 1089 | .procname = "compat-log", | ||
| 1090 | .data = &compat_log, | ||
| 1091 | .maxlen = sizeof (int), | ||
| 1092 | .mode = 0644, | ||
| 1093 | .proc_handler = proc_dointvec, | ||
| 1094 | }, | ||
| 1095 | #endif | ||
| 1096 | #ifdef CONFIG_RT_MUTEXES | 1087 | #ifdef CONFIG_RT_MUTEXES |
| 1097 | { | 1088 | { |
| 1098 | .procname = "max_lock_depth", | 1089 | .procname = "max_lock_depth", |
| @@ -1692,7 +1683,7 @@ static struct ctl_table fs_table[] = { | |||
| 1692 | { | 1683 | { |
| 1693 | .procname = "nr_open", | 1684 | .procname = "nr_open", |
| 1694 | .data = &sysctl_nr_open, | 1685 | .data = &sysctl_nr_open, |
| 1695 | .maxlen = sizeof(int), | 1686 | .maxlen = sizeof(unsigned int), |
| 1696 | .mode = 0644, | 1687 | .mode = 0644, |
| 1697 | .proc_handler = proc_dointvec_minmax, | 1688 | .proc_handler = proc_dointvec_minmax, |
| 1698 | .extra1 = &sysctl_nr_open_min, | 1689 | .extra1 = &sysctl_nr_open_min, |
| @@ -1838,6 +1829,14 @@ static struct ctl_table fs_table[] = { | |||
| 1838 | .mode = 0644, | 1829 | .mode = 0644, |
| 1839 | .proc_handler = proc_doulongvec_minmax, | 1830 | .proc_handler = proc_doulongvec_minmax, |
| 1840 | }, | 1831 | }, |
| 1832 | { | ||
| 1833 | .procname = "mount-max", | ||
| 1834 | .data = &sysctl_mount_max, | ||
| 1835 | .maxlen = sizeof(unsigned int), | ||
| 1836 | .mode = 0644, | ||
| 1837 | .proc_handler = proc_dointvec_minmax, | ||
| 1838 | .extra1 = &one, | ||
| 1839 | }, | ||
| 1841 | { } | 1840 | { } |
| 1842 | }; | 1841 | }; |
| 1843 | 1842 | ||
| @@ -2140,6 +2139,21 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, | |||
| 2140 | return 0; | 2139 | return 0; |
| 2141 | } | 2140 | } |
| 2142 | 2141 | ||
| 2142 | static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp, | ||
| 2143 | int *valp, | ||
| 2144 | int write, void *data) | ||
| 2145 | { | ||
| 2146 | if (write) { | ||
| 2147 | if (*negp) | ||
| 2148 | return -EINVAL; | ||
| 2149 | *valp = *lvalp; | ||
| 2150 | } else { | ||
| 2151 | unsigned int val = *valp; | ||
| 2152 | *lvalp = (unsigned long)val; | ||
| 2153 | } | ||
| 2154 | return 0; | ||
| 2155 | } | ||
| 2156 | |||
| 2143 | static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; | 2157 | static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; |
| 2144 | 2158 | ||
| 2145 | static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, | 2159 | static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, |
| @@ -2259,8 +2273,27 @@ static int do_proc_dointvec(struct ctl_table *table, int write, | |||
| 2259 | int proc_dointvec(struct ctl_table *table, int write, | 2273 | int proc_dointvec(struct ctl_table *table, int write, |
| 2260 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2274 | void __user *buffer, size_t *lenp, loff_t *ppos) |
| 2261 | { | 2275 | { |
| 2262 | return do_proc_dointvec(table,write,buffer,lenp,ppos, | 2276 | return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); |
| 2263 | NULL,NULL); | 2277 | } |
| 2278 | |||
| 2279 | /** | ||
| 2280 | * proc_douintvec - read a vector of unsigned integers | ||
| 2281 | * @table: the sysctl table | ||
| 2282 | * @write: %TRUE if this is a write to the sysctl file | ||
| 2283 | * @buffer: the user buffer | ||
| 2284 | * @lenp: the size of the user buffer | ||
| 2285 | * @ppos: file position | ||
| 2286 | * | ||
| 2287 | * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer | ||
| 2288 | * values from/to the user buffer, treated as an ASCII string. | ||
| 2289 | * | ||
| 2290 | * Returns 0 on success. | ||
| 2291 | */ | ||
| 2292 | int proc_douintvec(struct ctl_table *table, int write, | ||
| 2293 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 2294 | { | ||
| 2295 | return do_proc_dointvec(table, write, buffer, lenp, ppos, | ||
| 2296 | do_proc_douintvec_conv, NULL); | ||
| 2264 | } | 2297 | } |
| 2265 | 2298 | ||
| 2266 | /* | 2299 | /* |
| @@ -2858,6 +2891,12 @@ int proc_dointvec(struct ctl_table *table, int write, | |||
| 2858 | return -ENOSYS; | 2891 | return -ENOSYS; |
| 2859 | } | 2892 | } |
| 2860 | 2893 | ||
| 2894 | int proc_douintvec(struct ctl_table *table, int write, | ||
| 2895 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 2896 | { | ||
| 2897 | return -ENOSYS; | ||
| 2898 | } | ||
| 2899 | |||
| 2861 | int proc_dointvec_minmax(struct ctl_table *table, int write, | 2900 | int proc_dointvec_minmax(struct ctl_table *table, int write, |
| 2862 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2901 | void __user *buffer, size_t *lenp, loff_t *ppos) |
| 2863 | { | 2902 | { |
| @@ -2903,6 +2942,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, | |||
| 2903 | * exception granted :-) | 2942 | * exception granted :-) |
| 2904 | */ | 2943 | */ |
| 2905 | EXPORT_SYMBOL(proc_dointvec); | 2944 | EXPORT_SYMBOL(proc_dointvec); |
| 2945 | EXPORT_SYMBOL(proc_douintvec); | ||
| 2906 | EXPORT_SYMBOL(proc_dointvec_jiffies); | 2946 | EXPORT_SYMBOL(proc_dointvec_jiffies); |
| 2907 | EXPORT_SYMBOL(proc_dointvec_minmax); | 2947 | EXPORT_SYMBOL(proc_dointvec_minmax); |
| 2908 | EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); | 2948 | EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); |
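proc_douintvec() reuses the generic do_proc_dointvec() machinery with a conversion callback that rejects negative input on writes and widens the unsigned value for reads. The userspace sketch below reproduces that conversion logic with a simplified signature; douintvec_conv() is not the kernel helper and its argument order differs.

        /* sketch of the unsigned conversion step behind proc_douintvec() */
        #include <stdbool.h>
        #include <stdio.h>
        #include <errno.h>

        static int douintvec_conv(bool negp, unsigned long lval,
                                  unsigned int *valp, int write,
                                  unsigned long *lvalp)
        {
                if (write) {
                        if (negp)               /* "-1" etc. is invalid */
                                return -EINVAL;
                        *valp = lval;
                } else {
                        *lvalp = (unsigned long)*valp;  /* widen for printing */
                }
                return 0;
        }

        int main(void)
        {
                unsigned int val = 0;
                unsigned long out;

                printf("write 4000000000 -> %d, val=%u\n",
                       douintvec_conv(false, 4000000000UL, &val, 1, NULL), val);
                printf("write -1         -> %d\n",
                       douintvec_conv(true, 1, &val, 1, NULL));
                douintvec_conv(false, 0, &val, 0, &out);
                printf("read             -> %lu\n", out);
                return 0;
        }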
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index c3aad685bbc0..12dd190634ab 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
| @@ -542,7 +542,6 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) | |||
| 542 | static int alarm_timer_create(struct k_itimer *new_timer) | 542 | static int alarm_timer_create(struct k_itimer *new_timer) |
| 543 | { | 543 | { |
| 544 | enum alarmtimer_type type; | 544 | enum alarmtimer_type type; |
| 545 | struct alarm_base *base; | ||
| 546 | 545 | ||
| 547 | if (!alarmtimer_get_rtcdev()) | 546 | if (!alarmtimer_get_rtcdev()) |
| 548 | return -ENOTSUPP; | 547 | return -ENOTSUPP; |
| @@ -551,7 +550,6 @@ static int alarm_timer_create(struct k_itimer *new_timer) | |||
| 551 | return -EPERM; | 550 | return -EPERM; |
| 552 | 551 | ||
| 553 | type = clock2alarm(new_timer->it_clock); | 552 | type = clock2alarm(new_timer->it_clock); |
| 554 | base = &alarm_bases[type]; | ||
| 555 | alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer); | 553 | alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer); |
| 556 | return 0; | 554 | return 0; |
| 557 | } | 555 | } |
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 6a5a310a1a53..7e4fad75acaa 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
| @@ -600,9 +600,18 @@ static void __clocksource_select(bool skipcur) | |||
| 600 | */ | 600 | */ |
| 601 | if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) { | 601 | if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) { |
| 602 | /* Override clocksource cannot be used. */ | 602 | /* Override clocksource cannot be used. */ |
| 603 | pr_warn("Override clocksource %s is not HRT compatible - cannot switch while in HRT/NOHZ mode\n", | 603 | if (cs->flags & CLOCK_SOURCE_UNSTABLE) { |
| 604 | cs->name); | 604 | pr_warn("Override clocksource %s is unstable and not HRT compatible - cannot switch while in HRT/NOHZ mode\n", |
| 605 | override_name[0] = 0; | 605 | cs->name); |
| 606 | override_name[0] = 0; | ||
| 607 | } else { | ||
| 608 | /* | ||
| 609 | * The override cannot be currently verified. | ||
| 610 | * Deferring to let the watchdog check. | ||
| 611 | */ | ||
| 612 | pr_info("Override clocksource %s is not currently HRT compatible - deferring\n", | ||
| 613 | cs->name); | ||
| 614 | } | ||
| 606 | } else | 615 | } else |
| 607 | /* Override clocksource can be used. */ | 616 | /* Override clocksource can be used. */ |
| 608 | best = cs; | 617 | best = cs; |
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 9ba7c820fc23..bb5ec425dfe0 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c | |||
| @@ -307,7 +307,7 @@ EXPORT_SYMBOL_GPL(__ktime_divns); | |||
| 307 | */ | 307 | */ |
| 308 | ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) | 308 | ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) |
| 309 | { | 309 | { |
| 310 | ktime_t res = ktime_add(lhs, rhs); | 310 | ktime_t res = ktime_add_unsafe(lhs, rhs); |
| 311 | 311 | ||
| 312 | /* | 312 | /* |
| 313 | * We use KTIME_SEC_MAX here, the maximum timeout which we can | 313 | * We use KTIME_SEC_MAX here, the maximum timeout which we can |
| @@ -703,7 +703,7 @@ static void clock_was_set_work(struct work_struct *work) | |||
| 703 | static DECLARE_WORK(hrtimer_work, clock_was_set_work); | 703 | static DECLARE_WORK(hrtimer_work, clock_was_set_work); |
| 704 | 704 | ||
| 705 | /* | 705 | /* |
| 706 | * Called from timekeeping and resume code to reprogramm the hrtimer | 706 | * Called from timekeeping and resume code to reprogram the hrtimer |
| 707 | * interrupt device on all cpus. | 707 | * interrupt device on all cpus. |
| 708 | */ | 708 | */ |
| 709 | void clock_was_set_delayed(void) | 709 | void clock_was_set_delayed(void) |
| @@ -1241,7 +1241,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, | |||
| 1241 | 1241 | ||
| 1242 | /* | 1242 | /* |
| 1243 | * Note: We clear the running state after enqueue_hrtimer and | 1243 | * Note: We clear the running state after enqueue_hrtimer and |
| 1244 | * we do not reprogramm the event hardware. Happens either in | 1244 | * we do not reprogram the event hardware. Happens either in |
| 1245 | * hrtimer_start_range_ns() or in hrtimer_interrupt() | 1245 | * hrtimer_start_range_ns() or in hrtimer_interrupt() |
| 1246 | * | 1246 | * |
| 1247 | * Note: Because we dropped the cpu_base->lock above, | 1247 | * Note: Because we dropped the cpu_base->lock above, |
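ktime_add_safe() now performs the raw addition via ktime_add_unsafe() and then clamps the result to the maximum if it wrapped. A small standalone C version of that saturating-add idea is sketched below; ktime_add_safe_demo() and the _DEMO constants are local to the example.

        /* sketch: add first (as wrapping unsigned math), then clamp */
        #include <stdint.h>
        #include <stdio.h>

        #define NSEC_PER_SEC 1000000000LL
        #define KTIME_SEC_MAX_DEMO (INT64_MAX / NSEC_PER_SEC)
        #define KTIME_MAX_DEMO (KTIME_SEC_MAX_DEMO * NSEC_PER_SEC)

        static int64_t ktime_add_safe_demo(int64_t lhs, int64_t rhs)
        {
                /* plain wrapping add (the ktime_add_unsafe() step) */
                int64_t res = (int64_t)((uint64_t)lhs + (uint64_t)rhs);

                /* a sum smaller than either operand means it overflowed */
                if (res < 0 || res < lhs || res < rhs)
                        res = KTIME_MAX_DEMO;
                return res;
        }

        int main(void)
        {
                printf("%lld\n", (long long)ktime_add_safe_demo(1, 2));
                printf("%lld\n", (long long)ktime_add_safe_demo(INT64_MAX - 5, 100));
                return 0;
        }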
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 204fdc86863d..3bcb61b52f6c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -186,10 +186,13 @@ static bool check_tick_dependency(atomic_t *dep) | |||
| 186 | return false; | 186 | return false; |
| 187 | } | 187 | } |
| 188 | 188 | ||
| 189 | static bool can_stop_full_tick(struct tick_sched *ts) | 189 | static bool can_stop_full_tick(int cpu, struct tick_sched *ts) |
| 190 | { | 190 | { |
| 191 | WARN_ON_ONCE(!irqs_disabled()); | 191 | WARN_ON_ONCE(!irqs_disabled()); |
| 192 | 192 | ||
| 193 | if (unlikely(!cpu_online(cpu))) | ||
| 194 | return false; | ||
| 195 | |||
| 193 | if (check_tick_dependency(&tick_dep_mask)) | 196 | if (check_tick_dependency(&tick_dep_mask)) |
| 194 | return false; | 197 | return false; |
| 195 | 198 | ||
| @@ -843,7 +846,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) | |||
| 843 | if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) | 846 | if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) |
| 844 | return; | 847 | return; |
| 845 | 848 | ||
| 846 | if (can_stop_full_tick(ts)) | 849 | if (can_stop_full_tick(cpu, ts)) |
| 847 | tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); | 850 | tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); |
| 848 | else if (ts->tick_stopped) | 851 | else if (ts->tick_stopped) |
| 849 | tick_nohz_restart_sched_tick(ts, ktime_get()); | 852 | tick_nohz_restart_sched_tick(ts, ktime_get()); |
| @@ -908,10 +911,11 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) | |||
| 908 | ktime_t now, expires; | 911 | ktime_t now, expires; |
| 909 | int cpu = smp_processor_id(); | 912 | int cpu = smp_processor_id(); |
| 910 | 913 | ||
| 914 | now = tick_nohz_start_idle(ts); | ||
| 915 | |||
| 911 | if (can_stop_idle_tick(cpu, ts)) { | 916 | if (can_stop_idle_tick(cpu, ts)) { |
| 912 | int was_stopped = ts->tick_stopped; | 917 | int was_stopped = ts->tick_stopped; |
| 913 | 918 | ||
| 914 | now = tick_nohz_start_idle(ts); | ||
| 915 | ts->idle_calls++; | 919 | ts->idle_calls++; |
| 916 | 920 | ||
| 917 | expires = tick_nohz_stop_sched_tick(ts, now, cpu); | 921 | expires = tick_nohz_stop_sched_tick(ts, now, cpu); |
diff --git a/kernel/time/time.c b/kernel/time/time.c index 667b9335f5d6..bd62fb8e8e77 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c | |||
| @@ -780,7 +780,7 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, | |||
| 780 | { | 780 | { |
| 781 | struct timespec64 res; | 781 | struct timespec64 res; |
| 782 | 782 | ||
| 783 | set_normalized_timespec64(&res, lhs.tv_sec + rhs.tv_sec, | 783 | set_normalized_timespec64(&res, (timeu64_t) lhs.tv_sec + rhs.tv_sec, |
| 784 | lhs.tv_nsec + rhs.tv_nsec); | 784 | lhs.tv_nsec + rhs.tv_nsec); |
| 785 | 785 | ||
| 786 | if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) { | 786 | if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) { |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3b65746c7f15..37dec7e3db43 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -401,7 +401,13 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) | |||
| 401 | do { | 401 | do { |
| 402 | seq = raw_read_seqcount_latch(&tkf->seq); | 402 | seq = raw_read_seqcount_latch(&tkf->seq); |
| 403 | tkr = tkf->base + (seq & 0x01); | 403 | tkr = tkf->base + (seq & 0x01); |
| 404 | now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr); | 404 | now = ktime_to_ns(tkr->base); |
| 405 | |||
| 406 | now += timekeeping_delta_to_ns(tkr, | ||
| 407 | clocksource_delta( | ||
| 408 | tkr->read(tkr->clock), | ||
| 409 | tkr->cycle_last, | ||
| 410 | tkr->mask)); | ||
| 405 | } while (read_seqcount_retry(&tkf->seq, seq)); | 411 | } while (read_seqcount_retry(&tkf->seq, seq)); |
| 406 | 412 | ||
| 407 | return now; | 413 | return now; |
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index f6bd65236712..ca9fb800336b 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c | |||
| @@ -23,7 +23,9 @@ | |||
| 23 | 23 | ||
| 24 | #include "timekeeping_internal.h" | 24 | #include "timekeeping_internal.h" |
| 25 | 25 | ||
| 26 | static unsigned int sleep_time_bin[32] = {0}; | 26 | #define NUM_BINS 32 |
| 27 | |||
| 28 | static unsigned int sleep_time_bin[NUM_BINS] = {0}; | ||
| 27 | 29 | ||
| 28 | static int tk_debug_show_sleep_time(struct seq_file *s, void *data) | 30 | static int tk_debug_show_sleep_time(struct seq_file *s, void *data) |
| 29 | { | 31 | { |
| @@ -69,6 +71,11 @@ late_initcall(tk_debug_sleep_time_init); | |||
| 69 | 71 | ||
| 70 | void tk_debug_account_sleep_time(struct timespec64 *t) | 72 | void tk_debug_account_sleep_time(struct timespec64 *t) |
| 71 | { | 73 | { |
| 72 | sleep_time_bin[fls(t->tv_sec)]++; | 74 | /* Cap bin index so we don't overflow the array */ |
| 75 | int bin = min(fls(t->tv_sec), NUM_BINS-1); | ||
| 76 | |||
| 77 | sleep_time_bin[bin]++; | ||
| 78 | pr_info("Suspended for %lld.%03lu seconds\n", (s64)t->tv_sec, | ||
| 79 | t->tv_nsec / NSEC_PER_MSEC); | ||
| 73 | } | 80 | } |
| 74 | 81 | ||
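The timekeeping_debug fix indexes the sleep-time histogram by fls() of the suspended seconds and clamps the index so a very long (or corrupted) sleep time cannot write past the 32-entry array. The sketch below shows the same clamp in userspace, using __builtin_clzll() as a stand-in for the kernel's fls().

        /* sketch: fls()-based histogram bin with the added clamp */
        #include <stdio.h>

        #define NUM_BINS 32

        static unsigned int sleep_time_bin[NUM_BINS];

        static int fls_demo(unsigned long long v)  /* last set bit, 1-based */
        {
                return v ? 64 - __builtin_clzll(v) : 0;
        }

        static void account_sleep_time(long long tv_sec)
        {
                int bin = fls_demo(tv_sec);

                if (bin > NUM_BINS - 1)         /* the added clamp */
                        bin = NUM_BINS - 1;
                sleep_time_bin[bin]++;
        }

        int main(void)
        {
                account_sleep_time(3);          /* lands in bin 2 */
                account_sleep_time(1LL << 40);  /* bin 41 without the clamp */
                printf("bin[2]=%u bin[31]=%u\n",
                       sleep_time_bin[2], sleep_time_bin[31]);
                return 0;
        }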
diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 32bf6f75a8fe..2d47980a1bc4 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c | |||
| @@ -1633,7 +1633,7 @@ static inline void __run_timers(struct timer_base *base) | |||
| 1633 | /* | 1633 | /* |
| 1634 | * This function runs timers and the timer-tq in bottom half context. | 1634 | * This function runs timers and the timer-tq in bottom half context. |
| 1635 | */ | 1635 | */ |
| 1636 | static void run_timer_softirq(struct softirq_action *h) | 1636 | static __latent_entropy void run_timer_softirq(struct softirq_action *h) |
| 1637 | { | 1637 | { |
| 1638 | struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); | 1638 | struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); |
| 1639 | 1639 | ||
diff --git a/kernel/torture.c b/kernel/torture.c index 75961b3decfe..0d887eb62856 100644 --- a/kernel/torture.c +++ b/kernel/torture.c | |||
| @@ -43,6 +43,7 @@ | |||
| 43 | #include <linux/stat.h> | 43 | #include <linux/stat.h> |
| 44 | #include <linux/slab.h> | 44 | #include <linux/slab.h> |
| 45 | #include <linux/trace_clock.h> | 45 | #include <linux/trace_clock.h> |
| 46 | #include <linux/ktime.h> | ||
| 46 | #include <asm/byteorder.h> | 47 | #include <asm/byteorder.h> |
| 47 | #include <linux/torture.h> | 48 | #include <linux/torture.h> |
| 48 | 49 | ||
| @@ -446,9 +447,8 @@ EXPORT_SYMBOL_GPL(torture_shuffle_cleanup); | |||
| 446 | * Variables for auto-shutdown. This allows "lights out" torture runs | 447 | * Variables for auto-shutdown. This allows "lights out" torture runs |
| 447 | * to be fully scripted. | 448 | * to be fully scripted. |
| 448 | */ | 449 | */ |
| 449 | static int shutdown_secs; /* desired test duration in seconds. */ | ||
| 450 | static struct task_struct *shutdown_task; | 450 | static struct task_struct *shutdown_task; |
| 451 | static unsigned long shutdown_time; /* jiffies to system shutdown. */ | 451 | static ktime_t shutdown_time; /* time to system shutdown. */ |
| 452 | static void (*torture_shutdown_hook)(void); | 452 | static void (*torture_shutdown_hook)(void); |
| 453 | 453 | ||
| 454 | /* | 454 | /* |
| @@ -471,20 +471,20 @@ EXPORT_SYMBOL_GPL(torture_shutdown_absorb); | |||
| 471 | */ | 471 | */ |
| 472 | static int torture_shutdown(void *arg) | 472 | static int torture_shutdown(void *arg) |
| 473 | { | 473 | { |
| 474 | long delta; | 474 | ktime_t ktime_snap; |
| 475 | unsigned long jiffies_snap; | ||
| 476 | 475 | ||
| 477 | VERBOSE_TOROUT_STRING("torture_shutdown task started"); | 476 | VERBOSE_TOROUT_STRING("torture_shutdown task started"); |
| 478 | jiffies_snap = jiffies; | 477 | ktime_snap = ktime_get(); |
| 479 | while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && | 478 | while (ktime_before(ktime_snap, shutdown_time) && |
| 480 | !torture_must_stop()) { | 479 | !torture_must_stop()) { |
| 481 | delta = shutdown_time - jiffies_snap; | ||
| 482 | if (verbose) | 480 | if (verbose) |
| 483 | pr_alert("%s" TORTURE_FLAG | 481 | pr_alert("%s" TORTURE_FLAG |
| 484 | "torture_shutdown task: %lu jiffies remaining\n", | 482 | "torture_shutdown task: %llu ms remaining\n", |
| 485 | torture_type, delta); | 483 | torture_type, |
| 486 | schedule_timeout_interruptible(delta); | 484 | ktime_ms_delta(shutdown_time, ktime_snap)); |
| 487 | jiffies_snap = jiffies; | 485 | set_current_state(TASK_INTERRUPTIBLE); |
| 486 | schedule_hrtimeout(&shutdown_time, HRTIMER_MODE_ABS); | ||
| 487 | ktime_snap = ktime_get(); | ||
| 488 | } | 488 | } |
| 489 | if (torture_must_stop()) { | 489 | if (torture_must_stop()) { |
| 490 | torture_kthread_stopping("torture_shutdown"); | 490 | torture_kthread_stopping("torture_shutdown"); |
| @@ -511,10 +511,9 @@ int torture_shutdown_init(int ssecs, void (*cleanup)(void)) | |||
| 511 | { | 511 | { |
| 512 | int ret = 0; | 512 | int ret = 0; |
| 513 | 513 | ||
| 514 | shutdown_secs = ssecs; | ||
| 515 | torture_shutdown_hook = cleanup; | 514 | torture_shutdown_hook = cleanup; |
| 516 | if (shutdown_secs > 0) { | 515 | if (ssecs > 0) { |
| 517 | shutdown_time = jiffies + shutdown_secs * HZ; | 516 | shutdown_time = ktime_add(ktime_get(), ktime_set(ssecs, 0)); |
| 518 | ret = torture_create_kthread(torture_shutdown, NULL, | 517 | ret = torture_create_kthread(torture_shutdown, NULL, |
| 519 | shutdown_task); | 518 | shutdown_task); |
| 520 | } | 519 | } |
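The torture shutdown thread now tracks an absolute ktime_t deadline and sleeps with schedule_hrtimeout(..., HRTIMER_MODE_ABS) instead of recomputing relative jiffies on every pass. The POSIX sketch below shows the equivalent userspace pattern with clock_nanosleep() and TIMER_ABSTIME; the two-second deadline is an arbitrary example value.

        /* sketch: wait for an absolute deadline rather than chaining
         * relative sleeps, so interruptions and drift do not accumulate */
        #define _POSIX_C_SOURCE 200809L
        #include <stdio.h>
        #include <time.h>

        int main(void)
        {
                struct timespec deadline;

                clock_gettime(CLOCK_MONOTONIC, &deadline);
                deadline.tv_sec += 2;           /* "shutdown" two seconds out */

                /* retry with the same absolute deadline if interrupted */
                while (clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME,
                                       &deadline, NULL) != 0)
                        ;

                printf("deadline reached\n");
                return 0;
        }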
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f4b86e8ca1e7..2a96b063d659 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
| @@ -24,11 +24,6 @@ config HAVE_FUNCTION_GRAPH_TRACER | |||
| 24 | help | 24 | help |
| 25 | See Documentation/trace/ftrace-design.txt | 25 | See Documentation/trace/ftrace-design.txt |
| 26 | 26 | ||
| 27 | config HAVE_FUNCTION_GRAPH_FP_TEST | ||
| 28 | bool | ||
| 29 | help | ||
| 30 | See Documentation/trace/ftrace-design.txt | ||
| 31 | |||
| 32 | config HAVE_DYNAMIC_FTRACE | 27 | config HAVE_DYNAMIC_FTRACE |
| 33 | bool | 28 | bool |
| 34 | help | 29 | help |
| @@ -221,6 +216,41 @@ config SCHED_TRACER | |||
| 221 | This tracer tracks the latency of the highest priority task | 216 | This tracer tracks the latency of the highest priority task |
| 222 | to be scheduled in, starting from the point it has woken up. | 217 | to be scheduled in, starting from the point it has woken up. |
| 223 | 218 | ||
| 219 | config HWLAT_TRACER | ||
| 220 | bool "Tracer to detect hardware latencies (like SMIs)" | ||
| 221 | select GENERIC_TRACER | ||
| 222 | help | ||
| 223 | This tracer, when enabled, will create one or more kernel threads, | ||
| 224 | depending on what the cpumask file is set to, with each thread | ||
| 225 | spinning in a loop looking for interruptions caused by | ||
| 226 | something other than the kernel. For example, if a | ||
| 227 | System Management Interrupt (SMI) takes a noticeable amount of | ||
| 228 | time, this tracer will detect it. This is useful for testing | ||
| 229 | if a system is reliable for Real Time tasks. | ||
| 230 | |||
| 231 | Some files are created in the tracing directory when this | ||
| 232 | is enabled: | ||
| 233 | |||
| 234 | hwlat_detector/width - time in usecs for how long to spin for | ||
| 235 | hwlat_detector/window - time in usecs between the start of each | ||
| 236 | iteration | ||
| 237 | |||
| 238 | A kernel thread is created that will spin with interrupts disabled | ||
| 239 | for "width" microseconds in every "window" cycle. It will not spin | ||
| 240 | for "window - width" microseconds, where the system can | ||
| 241 | continue to operate. | ||
| 242 | |||
| 243 | The output will appear in the trace and trace_pipe files. | ||
| 244 | |||
| 245 | When the tracer is not running, it has no effect on the system, | ||
| 246 | but when it is running, it can cause the system to be | ||
| 247 | periodically non-responsive. Do not run this tracer on a | ||
| 248 | production system. | ||
| 249 | |||
| 250 | To enable this tracer, echo "hwlat" into the current_tracer | ||
| 251 | file. Every time a latency is greater than tracing_thresh, it will | ||
| 252 | be recorded into the ring buffer. | ||
| 253 | |||
| 224 | config ENABLE_DEFAULT_TRACERS | 254 | config ENABLE_DEFAULT_TRACERS |
| 225 | bool "Trace process context switches and events" | 255 | bool "Trace process context switches and events" |
| 226 | depends on !GENERIC_TRACER | 256 | depends on !GENERIC_TRACER |
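As the HWLAT_TRACER help text above describes, the detector spins with interrupts disabled for "width" microseconds per "window" and records any gap between consecutive timestamp reads that exceeds the threshold. The userspace sketch below only mimics the measurement loop (it cannot disable interrupts, so it will also report ordinary preemption); the 0.5 s width and 100 us threshold are arbitrary example values.

        /* sketch: busy-spin reading the clock, report large gaps */
        #define _POSIX_C_SOURCE 200809L
        #include <stdio.h>
        #include <time.h>

        static long long now_ns(void)
        {
                struct timespec ts;

                clock_gettime(CLOCK_MONOTONIC, &ts);
                return ts.tv_sec * 1000000000LL + ts.tv_nsec;
        }

        int main(void)
        {
                const long long width_ns = 500000000LL;  /* spin for 0.5 s */
                const long long thresh_ns = 100000;      /* 100 us */
                long long start = now_ns(), last = start, t;

                while ((t = now_ns()) - start < width_ns) {
                        if (t - last > thresh_ns)
                                printf("gap of %lld us\n", (t - last) / 1000);
                        last = t;
                }
                return 0;
        }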
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index d0a1617b52b4..e57980845549 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
| @@ -1,8 +1,4 @@ | |||
| 1 | 1 | ||
| 2 | # We are fully aware of the dangers of __builtin_return_address() | ||
| 3 | FRAME_CFLAGS := $(call cc-disable-warning,frame-address) | ||
| 4 | KBUILD_CFLAGS += $(FRAME_CFLAGS) | ||
| 5 | |||
| 6 | # Do not instrument the tracer itself: | 2 | # Do not instrument the tracer itself: |
| 7 | 3 | ||
| 8 | ifdef CONFIG_FUNCTION_TRACER | 4 | ifdef CONFIG_FUNCTION_TRACER |
| @@ -41,6 +37,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o | |||
| 41 | obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o | 37 | obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o |
| 42 | obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o | 38 | obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o |
| 43 | obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o | 39 | obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o |
| 40 | obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o | ||
| 44 | obj-$(CONFIG_NOP_TRACER) += trace_nop.o | 41 | obj-$(CONFIG_NOP_TRACER) += trace_nop.o |
| 45 | obj-$(CONFIG_STACK_TRACER) += trace_stack.o | 42 | obj-$(CONFIG_STACK_TRACER) += trace_stack.o |
| 46 | obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o | 43 | obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7598e6ca817a..dbafc5df03f3 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
| @@ -223,7 +223,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, | |||
| 223 | what |= MASK_TC_BIT(op_flags, META); | 223 | what |= MASK_TC_BIT(op_flags, META); |
| 224 | what |= MASK_TC_BIT(op_flags, PREFLUSH); | 224 | what |= MASK_TC_BIT(op_flags, PREFLUSH); |
| 225 | what |= MASK_TC_BIT(op_flags, FUA); | 225 | what |= MASK_TC_BIT(op_flags, FUA); |
| 226 | if (op == REQ_OP_DISCARD) | 226 | if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE) |
| 227 | what |= BLK_TC_ACT(BLK_TC_DISCARD); | 227 | what |= BLK_TC_ACT(BLK_TC_DISCARD); |
| 228 | if (op == REQ_OP_FLUSH) | 228 | if (op == REQ_OP_FLUSH) |
| 229 | what |= BLK_TC_ACT(BLK_TC_FLUSH); | 229 | what |= BLK_TC_ACT(BLK_TC_FLUSH); |
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b20438fdb029..5dcb99281259 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c | |||
| @@ -1,4 +1,5 @@ | |||
| 1 | /* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com | 1 | /* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com |
| 2 | * Copyright (c) 2016 Facebook | ||
| 2 | * | 3 | * |
| 3 | * This program is free software; you can redistribute it and/or | 4 | * This program is free software; you can redistribute it and/or |
| 4 | * modify it under the terms of version 2 of the GNU General Public | 5 | * modify it under the terms of version 2 of the GNU General Public |
| @@ -8,6 +9,7 @@ | |||
| 8 | #include <linux/types.h> | 9 | #include <linux/types.h> |
| 9 | #include <linux/slab.h> | 10 | #include <linux/slab.h> |
| 10 | #include <linux/bpf.h> | 11 | #include <linux/bpf.h> |
| 12 | #include <linux/bpf_perf_event.h> | ||
| 11 | #include <linux/filter.h> | 13 | #include <linux/filter.h> |
| 12 | #include <linux/uaccess.h> | 14 | #include <linux/uaccess.h> |
| 13 | #include <linux/ctype.h> | 15 | #include <linux/ctype.h> |
| @@ -59,11 +61,9 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) | |||
| 59 | } | 61 | } |
| 60 | EXPORT_SYMBOL_GPL(trace_call_bpf); | 62 | EXPORT_SYMBOL_GPL(trace_call_bpf); |
| 61 | 63 | ||
| 62 | static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 64 | BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) |
| 63 | { | 65 | { |
| 64 | void *dst = (void *) (long) r1; | 66 | int ret; |
| 65 | int ret, size = (int) r2; | ||
| 66 | void *unsafe_ptr = (void *) (long) r3; | ||
| 67 | 67 | ||
| 68 | ret = probe_kernel_read(dst, unsafe_ptr, size); | 68 | ret = probe_kernel_read(dst, unsafe_ptr, size); |
| 69 | if (unlikely(ret < 0)) | 69 | if (unlikely(ret < 0)) |
| @@ -81,12 +81,9 @@ static const struct bpf_func_proto bpf_probe_read_proto = { | |||
| 81 | .arg3_type = ARG_ANYTHING, | 81 | .arg3_type = ARG_ANYTHING, |
| 82 | }; | 82 | }; |
| 83 | 83 | ||
| 84 | static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 84 | BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, |
| 85 | u32, size) | ||
| 85 | { | 86 | { |
| 86 | void *unsafe_ptr = (void *) (long) r1; | ||
| 87 | void *src = (void *) (long) r2; | ||
| 88 | int size = (int) r3; | ||
| 89 | |||
| 90 | /* | 87 | /* |
| 91 | * Ensure we're in user context which is safe for the helper to | 88 | * Ensure we're in user context which is safe for the helper to |
| 92 | * run. This helper has no business in a kthread. | 89 | * run. This helper has no business in a kthread. |
| @@ -128,9 +125,9 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void) | |||
| 128 | * limited trace_printk() | 125 | * limited trace_printk() |
| 129 | * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed | 126 | * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed |
| 130 | */ | 127 | */ |
| 131 | static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) | 128 | BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, |
| 129 | u64, arg2, u64, arg3) | ||
| 132 | { | 130 | { |
| 133 | char *fmt = (char *) (long) r1; | ||
| 134 | bool str_seen = false; | 131 | bool str_seen = false; |
| 135 | int mod[3] = {}; | 132 | int mod[3] = {}; |
| 136 | int fmt_cnt = 0; | 133 | int fmt_cnt = 0; |
| @@ -176,16 +173,16 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) | |||
| 176 | 173 | ||
| 177 | switch (fmt_cnt) { | 174 | switch (fmt_cnt) { |
| 178 | case 1: | 175 | case 1: |
| 179 | unsafe_addr = r3; | 176 | unsafe_addr = arg1; |
| 180 | r3 = (long) buf; | 177 | arg1 = (long) buf; |
| 181 | break; | 178 | break; |
| 182 | case 2: | 179 | case 2: |
| 183 | unsafe_addr = r4; | 180 | unsafe_addr = arg2; |
| 184 | r4 = (long) buf; | 181 | arg2 = (long) buf; |
| 185 | break; | 182 | break; |
| 186 | case 3: | 183 | case 3: |
| 187 | unsafe_addr = r5; | 184 | unsafe_addr = arg3; |
| 188 | r5 = (long) buf; | 185 | arg3 = (long) buf; |
| 189 | break; | 186 | break; |
| 190 | } | 187 | } |
| 191 | buf[0] = 0; | 188 | buf[0] = 0; |
| @@ -207,9 +204,9 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) | |||
| 207 | } | 204 | } |
| 208 | 205 | ||
| 209 | return __trace_printk(1/* fake ip will not be printed */, fmt, | 206 | return __trace_printk(1/* fake ip will not be printed */, fmt, |
| 210 | mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3, | 207 | mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1, |
| 211 | mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4, | 208 | mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2, |
| 212 | mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5); | 209 | mod[2] == 2 ? arg3 : mod[2] == 1 ? (long) arg3 : (u32) arg3); |
| 213 | } | 210 | } |
| 214 | 211 | ||
| 215 | static const struct bpf_func_proto bpf_trace_printk_proto = { | 212 | static const struct bpf_func_proto bpf_trace_printk_proto = { |
| @@ -231,9 +228,8 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) | |||
| 231 | return &bpf_trace_printk_proto; | 228 | return &bpf_trace_printk_proto; |
| 232 | } | 229 | } |
| 233 | 230 | ||
| 234 | static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5) | 231 | BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) |
| 235 | { | 232 | { |
| 236 | struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; | ||
| 237 | struct bpf_array *array = container_of(map, struct bpf_array, map); | 233 | struct bpf_array *array = container_of(map, struct bpf_array, map); |
| 238 | unsigned int cpu = smp_processor_id(); | 234 | unsigned int cpu = smp_processor_id(); |
| 239 | u64 index = flags & BPF_F_INDEX_MASK; | 235 | u64 index = flags & BPF_F_INDEX_MASK; |
| @@ -310,11 +306,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, | |||
| 310 | return 0; | 306 | return 0; |
| 311 | } | 307 | } |
| 312 | 308 | ||
| 313 | static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) | 309 | BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, |
| 310 | u64, flags, void *, data, u64, size) | ||
| 314 | { | 311 | { |
| 315 | struct pt_regs *regs = (struct pt_regs *)(long) r1; | ||
| 316 | struct bpf_map *map = (struct bpf_map *)(long) r2; | ||
| 317 | void *data = (void *)(long) r4; | ||
| 318 | struct perf_raw_record raw = { | 312 | struct perf_raw_record raw = { |
| 319 | .frag = { | 313 | .frag = { |
| 320 | .size = size, | 314 | .size = size, |
| @@ -365,7 +359,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, | |||
| 365 | return __bpf_perf_event_output(regs, map, flags, &raw); | 359 | return __bpf_perf_event_output(regs, map, flags, &raw); |
| 366 | } | 360 | } |
| 367 | 361 | ||
| 368 | static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 362 | BPF_CALL_0(bpf_get_current_task) |
| 369 | { | 363 | { |
| 370 | return (long) current; | 364 | return (long) current; |
| 371 | } | 365 | } |
| @@ -376,6 +370,31 @@ static const struct bpf_func_proto bpf_get_current_task_proto = { | |||
| 376 | .ret_type = RET_INTEGER, | 370 | .ret_type = RET_INTEGER, |
| 377 | }; | 371 | }; |
| 378 | 372 | ||
| 373 | BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) | ||
| 374 | { | ||
| 375 | struct bpf_array *array = container_of(map, struct bpf_array, map); | ||
| 376 | struct cgroup *cgrp; | ||
| 377 | |||
| 378 | if (unlikely(in_interrupt())) | ||
| 379 | return -EINVAL; | ||
| 380 | if (unlikely(idx >= array->map.max_entries)) | ||
| 381 | return -E2BIG; | ||
| 382 | |||
| 383 | cgrp = READ_ONCE(array->ptrs[idx]); | ||
| 384 | if (unlikely(!cgrp)) | ||
| 385 | return -EAGAIN; | ||
| 386 | |||
| 387 | return task_under_cgroup_hierarchy(current, cgrp); | ||
| 388 | } | ||
| 389 | |||
| 390 | static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { | ||
| 391 | .func = bpf_current_task_under_cgroup, | ||
| 392 | .gpl_only = false, | ||
| 393 | .ret_type = RET_INTEGER, | ||
| 394 | .arg1_type = ARG_CONST_MAP_PTR, | ||
| 395 | .arg2_type = ARG_ANYTHING, | ||
| 396 | }; | ||
| 397 | |||
| 379 | static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) | 398 | static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) |
| 380 | { | 399 | { |
| 381 | switch (func_id) { | 400 | switch (func_id) { |
| @@ -407,6 +426,10 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) | |||
| 407 | return &bpf_perf_event_read_proto; | 426 | return &bpf_perf_event_read_proto; |
| 408 | case BPF_FUNC_probe_write_user: | 427 | case BPF_FUNC_probe_write_user: |
| 409 | return bpf_get_probe_write_proto(); | 428 | return bpf_get_probe_write_proto(); |
| 429 | case BPF_FUNC_current_task_under_cgroup: | ||
| 430 | return &bpf_current_task_under_cgroup_proto; | ||
| 431 | case BPF_FUNC_get_prandom_u32: | ||
| 432 | return &bpf_get_prandom_u32_proto; | ||
| 410 | default: | 433 | default: |
| 411 | return NULL; | 434 | return NULL; |
| 412 | } | 435 | } |
| @@ -447,16 +470,17 @@ static struct bpf_prog_type_list kprobe_tl = { | |||
| 447 | .type = BPF_PROG_TYPE_KPROBE, | 470 | .type = BPF_PROG_TYPE_KPROBE, |
| 448 | }; | 471 | }; |
| 449 | 472 | ||
| 450 | static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size) | 473 | BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, |
| 474 | u64, flags, void *, data, u64, size) | ||
| 451 | { | 475 | { |
| 476 | struct pt_regs *regs = *(struct pt_regs **)tp_buff; | ||
| 477 | |||
| 452 | /* | 478 | /* |
| 453 | * r1 points to perf tracepoint buffer where first 8 bytes are hidden | 479 | * r1 points to perf tracepoint buffer where first 8 bytes are hidden |
| 454 | * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it | 480 | * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it |
| 455 | * from there and call the same bpf_perf_event_output() helper | 481 | * from there and call the same bpf_perf_event_output() helper inline. |
| 456 | */ | 482 | */ |
| 457 | u64 ctx = *(long *)(uintptr_t)r1; | 483 | return ____bpf_perf_event_output(regs, map, flags, data, size); |
| 458 | |||
| 459 | return bpf_perf_event_output(ctx, r2, index, r4, size); | ||
| 460 | } | 484 | } |
| 461 | 485 | ||
| 462 | static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { | 486 | static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { |
| @@ -470,11 +494,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { | |||
| 470 | .arg5_type = ARG_CONST_STACK_SIZE, | 494 | .arg5_type = ARG_CONST_STACK_SIZE, |
| 471 | }; | 495 | }; |
| 472 | 496 | ||
| 473 | static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 497 | BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map, |
| 498 | u64, flags) | ||
| 474 | { | 499 | { |
| 475 | u64 ctx = *(long *)(uintptr_t)r1; | 500 | struct pt_regs *regs = *(struct pt_regs **)tp_buff; |
| 476 | 501 | ||
| 477 | return bpf_get_stackid(ctx, r2, r3, r4, r5); | 502 | /* |
| 503 | * Same comment as in bpf_perf_event_output_tp(), only that this time | ||
| 504 | * the other helper's function body cannot be inlined due to being | ||
| 505 | * external, thus we need to call raw helper function. | ||
| 506 | */ | ||
| 507 | return bpf_get_stackid((unsigned long) regs, (unsigned long) map, | ||
| 508 | flags, 0, 0); | ||
| 478 | } | 509 | } |
| 479 | 510 | ||
| 480 | static const struct bpf_func_proto bpf_get_stackid_proto_tp = { | 511 | static const struct bpf_func_proto bpf_get_stackid_proto_tp = { |
| @@ -520,10 +551,69 @@ static struct bpf_prog_type_list tracepoint_tl = { | |||
| 520 | .type = BPF_PROG_TYPE_TRACEPOINT, | 551 | .type = BPF_PROG_TYPE_TRACEPOINT, |
| 521 | }; | 552 | }; |
| 522 | 553 | ||
| 554 | static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, | ||
| 555 | enum bpf_reg_type *reg_type) | ||
| 556 | { | ||
| 557 | if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) | ||
| 558 | return false; | ||
| 559 | if (type != BPF_READ) | ||
| 560 | return false; | ||
| 561 | if (off % size != 0) | ||
| 562 | return false; | ||
| 563 | if (off == offsetof(struct bpf_perf_event_data, sample_period)) { | ||
| 564 | if (size != sizeof(u64)) | ||
| 565 | return false; | ||
| 566 | } else { | ||
| 567 | if (size != sizeof(long)) | ||
| 568 | return false; | ||
| 569 | } | ||
| 570 | return true; | ||
| 571 | } | ||
| 572 | |||
| 573 | static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg, | ||
| 574 | int src_reg, int ctx_off, | ||
| 575 | struct bpf_insn *insn_buf, | ||
| 576 | struct bpf_prog *prog) | ||
| 577 | { | ||
| 578 | struct bpf_insn *insn = insn_buf; | ||
| 579 | |||
| 580 | switch (ctx_off) { | ||
| 581 | case offsetof(struct bpf_perf_event_data, sample_period): | ||
| 582 | BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); | ||
| 583 | |||
| 584 | *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, | ||
| 585 | data), dst_reg, src_reg, | ||
| 586 | offsetof(struct bpf_perf_event_data_kern, data)); | ||
| 587 | *insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg, | ||
| 588 | offsetof(struct perf_sample_data, period)); | ||
| 589 | break; | ||
| 590 | default: | ||
| 591 | *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, | ||
| 592 | regs), dst_reg, src_reg, | ||
| 593 | offsetof(struct bpf_perf_event_data_kern, regs)); | ||
| 594 | *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off); | ||
| 595 | break; | ||
| 596 | } | ||
| 597 | |||
| 598 | return insn - insn_buf; | ||
| 599 | } | ||
| 600 | |||
| 601 | static const struct bpf_verifier_ops perf_event_prog_ops = { | ||
| 602 | .get_func_proto = tp_prog_func_proto, | ||
| 603 | .is_valid_access = pe_prog_is_valid_access, | ||
| 604 | .convert_ctx_access = pe_prog_convert_ctx_access, | ||
| 605 | }; | ||
| 606 | |||
| 607 | static struct bpf_prog_type_list perf_event_tl = { | ||
| 608 | .ops = &perf_event_prog_ops, | ||
| 609 | .type = BPF_PROG_TYPE_PERF_EVENT, | ||
| 610 | }; | ||
| 611 | |||
| 523 | static int __init register_kprobe_prog_ops(void) | 612 | static int __init register_kprobe_prog_ops(void) |
| 524 | { | 613 | { |
| 525 | bpf_register_prog_type(&kprobe_tl); | 614 | bpf_register_prog_type(&kprobe_tl); |
| 526 | bpf_register_prog_type(&tracepoint_tl); | 615 | bpf_register_prog_type(&tracepoint_tl); |
| 616 | bpf_register_prog_type(&perf_event_tl); | ||
| 527 | return 0; | 617 | return 0; |
| 528 | } | 618 | } |
| 529 | late_initcall(register_kprobe_prog_ops); | 619 | late_initcall(register_kprobe_prog_ops); |
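The bpf_trace.c changes above convert each tracing helper from an open-coded function taking five raw u64 registers to the BPF_CALL_n() wrappers, which generate the register shim automatically and leave a typed, inlinable body with a "____" prefix (this is why bpf_perf_event_output_tp() can call ____bpf_perf_event_output() directly). A minimal sketch of the pattern, under the assumption that it mirrors what the macro expands to; the helper name my_map_flags and its body are made up for illustration:

/* Sketch only: typed body plus u64-register shim, roughly what
 * BPF_CALL_2(my_map_flags, struct bpf_map *, map, u64, flags) provides. */
static __always_inline u64 ____my_map_flags(struct bpf_map *map, u64 flags)
{
	/* Typed arguments: no manual (void *)(unsigned long) casts needed. */
	return flags & BPF_F_INDEX_MASK;
}

static u64 my_map_flags(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
	/* The shim keeps the five-u64 calling convention the BPF core
	 * expects and forwards to the typed body above. */
	return ____my_map_flags((struct bpf_map *)(unsigned long) r1, r2);
}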
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 84752c8e28b5..2050a7652a86 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
| @@ -872,7 +872,13 @@ function_profile_call(unsigned long ip, unsigned long parent_ip, | |||
| 872 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 872 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
| 873 | static int profile_graph_entry(struct ftrace_graph_ent *trace) | 873 | static int profile_graph_entry(struct ftrace_graph_ent *trace) |
| 874 | { | 874 | { |
| 875 | int index = trace->depth; | ||
| 876 | |||
| 875 | function_profile_call(trace->func, 0, NULL, NULL); | 877 | function_profile_call(trace->func, 0, NULL, NULL); |
| 878 | |||
| 879 | if (index >= 0 && index < FTRACE_RETFUNC_DEPTH) | ||
| 880 | current->ret_stack[index].subtime = 0; | ||
| 881 | |||
| 876 | return 1; | 882 | return 1; |
| 877 | } | 883 | } |
| 878 | 884 | ||
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dade4c9559cc..8696ce6bf2f6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -1047,7 +1047,7 @@ void disable_trace_on_warning(void) | |||
| 1047 | * | 1047 | * |
| 1048 | * Shows real state of the ring buffer if it is enabled or not. | 1048 | * Shows real state of the ring buffer if it is enabled or not. |
| 1049 | */ | 1049 | */ |
| 1050 | static int tracer_tracing_is_on(struct trace_array *tr) | 1050 | int tracer_tracing_is_on(struct trace_array *tr) |
| 1051 | { | 1051 | { |
| 1052 | if (tr->trace_buffer.buffer) | 1052 | if (tr->trace_buffer.buffer) |
| 1053 | return ring_buffer_record_is_on(tr->trace_buffer.buffer); | 1053 | return ring_buffer_record_is_on(tr->trace_buffer.buffer); |
| @@ -4123,6 +4123,30 @@ static const char readme_msg[] = | |||
| 4123 | "\t\t\t traces\n" | 4123 | "\t\t\t traces\n" |
| 4124 | #endif | 4124 | #endif |
| 4125 | #endif /* CONFIG_STACK_TRACER */ | 4125 | #endif /* CONFIG_STACK_TRACER */ |
| 4126 | #ifdef CONFIG_KPROBE_EVENT | ||
| 4127 | " kprobe_events\t\t- Add/remove/show the kernel dynamic events\n" | ||
| 4128 | "\t\t\t Write into this file to define/undefine new trace events.\n" | ||
| 4129 | #endif | ||
| 4130 | #ifdef CONFIG_UPROBE_EVENT | ||
| 4131 | " uprobe_events\t\t- Add/remove/show the userspace dynamic events\n" | ||
| 4132 | "\t\t\t Write into this file to define/undefine new trace events.\n" | ||
| 4133 | #endif | ||
| 4134 | #if defined(CONFIG_KPROBE_EVENT) || defined(CONFIG_UPROBE_EVENT) | ||
| 4135 | "\t accepts: event-definitions (one definition per line)\n" | ||
| 4136 | "\t Format: p|r[:[<group>/]<event>] <place> [<args>]\n" | ||
| 4137 | "\t -:[<group>/]<event>\n" | ||
| 4138 | #ifdef CONFIG_KPROBE_EVENT | ||
| 4139 | "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" | ||
| 4140 | #endif | ||
| 4141 | #ifdef CONFIG_UPROBE_EVENT | ||
| 4142 | "\t place: <path>:<offset>\n" | ||
| 4143 | #endif | ||
| 4144 | "\t args: <name>=fetcharg[:type]\n" | ||
| 4145 | "\t fetcharg: %<register>, @<address>, @<symbol>[+|-<offset>],\n" | ||
| 4146 | "\t $stack<index>, $stack, $retval, $comm\n" | ||
| 4147 | "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string,\n" | ||
| 4148 | "\t b<bit-width>@<bit-offset>/<container-size>\n" | ||
| 4149 | #endif | ||
| 4126 | " events/\t\t- Directory containing all trace event subsystems:\n" | 4150 | " events/\t\t- Directory containing all trace event subsystems:\n" |
| 4127 | " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" | 4151 | " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" |
| 4128 | " events/<system>/\t- Directory containing all trace events for <system>:\n" | 4152 | " events/<system>/\t- Directory containing all trace events for <system>:\n" |
| @@ -4945,7 +4969,7 @@ out: | |||
| 4945 | return ret; | 4969 | return ret; |
| 4946 | } | 4970 | } |
| 4947 | 4971 | ||
| 4948 | #ifdef CONFIG_TRACER_MAX_TRACE | 4972 | #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) |
| 4949 | 4973 | ||
| 4950 | static ssize_t | 4974 | static ssize_t |
| 4951 | tracing_max_lat_read(struct file *filp, char __user *ubuf, | 4975 | tracing_max_lat_read(struct file *filp, char __user *ubuf, |
| @@ -5124,19 +5148,20 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, | |||
| 5124 | struct trace_iterator *iter = filp->private_data; | 5148 | struct trace_iterator *iter = filp->private_data; |
| 5125 | ssize_t sret; | 5149 | ssize_t sret; |
| 5126 | 5150 | ||
| 5127 | /* return any leftover data */ | ||
| 5128 | sret = trace_seq_to_user(&iter->seq, ubuf, cnt); | ||
| 5129 | if (sret != -EBUSY) | ||
| 5130 | return sret; | ||
| 5131 | |||
| 5132 | trace_seq_init(&iter->seq); | ||
| 5133 | |||
| 5134 | /* | 5151 | /* |
| 5135 | * Avoid more than one consumer on a single file descriptor | 5152 | * Avoid more than one consumer on a single file descriptor |
| 5136 | * This is just a matter of traces coherency, the ring buffer itself | 5153 | * This is just a matter of traces coherency, the ring buffer itself |
| 5137 | * is protected. | 5154 | * is protected. |
| 5138 | */ | 5155 | */ |
| 5139 | mutex_lock(&iter->mutex); | 5156 | mutex_lock(&iter->mutex); |
| 5157 | |||
| 5158 | /* return any leftover data */ | ||
| 5159 | sret = trace_seq_to_user(&iter->seq, ubuf, cnt); | ||
| 5160 | if (sret != -EBUSY) | ||
| 5161 | goto out; | ||
| 5162 | |||
| 5163 | trace_seq_init(&iter->seq); | ||
| 5164 | |||
| 5140 | if (iter->trace->read) { | 5165 | if (iter->trace->read) { |
| 5141 | sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); | 5166 | sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); |
| 5142 | if (sret) | 5167 | if (sret) |
| @@ -5867,7 +5892,7 @@ static const struct file_operations tracing_thresh_fops = { | |||
| 5867 | .llseek = generic_file_llseek, | 5892 | .llseek = generic_file_llseek, |
| 5868 | }; | 5893 | }; |
| 5869 | 5894 | ||
| 5870 | #ifdef CONFIG_TRACER_MAX_TRACE | 5895 | #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) |
| 5871 | static const struct file_operations tracing_max_lat_fops = { | 5896 | static const struct file_operations tracing_max_lat_fops = { |
| 5872 | .open = tracing_open_generic, | 5897 | .open = tracing_open_generic, |
| 5873 | .read = tracing_max_lat_read, | 5898 | .read = tracing_max_lat_read, |
| @@ -6163,9 +6188,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
| 6163 | return -EBUSY; | 6188 | return -EBUSY; |
| 6164 | #endif | 6189 | #endif |
| 6165 | 6190 | ||
| 6166 | if (splice_grow_spd(pipe, &spd)) | ||
| 6167 | return -ENOMEM; | ||
| 6168 | |||
| 6169 | if (*ppos & (PAGE_SIZE - 1)) | 6191 | if (*ppos & (PAGE_SIZE - 1)) |
| 6170 | return -EINVAL; | 6192 | return -EINVAL; |
| 6171 | 6193 | ||
| @@ -6175,6 +6197,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
| 6175 | len &= PAGE_MASK; | 6197 | len &= PAGE_MASK; |
| 6176 | } | 6198 | } |
| 6177 | 6199 | ||
| 6200 | if (splice_grow_spd(pipe, &spd)) | ||
| 6201 | return -ENOMEM; | ||
| 6202 | |||
| 6178 | again: | 6203 | again: |
| 6179 | trace_access_lock(iter->cpu_file); | 6204 | trace_access_lock(iter->cpu_file); |
| 6180 | entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); | 6205 | entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); |
| @@ -6232,19 +6257,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
| 6232 | /* did we read anything? */ | 6257 | /* did we read anything? */ |
| 6233 | if (!spd.nr_pages) { | 6258 | if (!spd.nr_pages) { |
| 6234 | if (ret) | 6259 | if (ret) |
| 6235 | return ret; | 6260 | goto out; |
| 6236 | 6261 | ||
| 6262 | ret = -EAGAIN; | ||
| 6237 | if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) | 6263 | if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) |
| 6238 | return -EAGAIN; | 6264 | goto out; |
| 6239 | 6265 | ||
| 6240 | ret = wait_on_pipe(iter, true); | 6266 | ret = wait_on_pipe(iter, true); |
| 6241 | if (ret) | 6267 | if (ret) |
| 6242 | return ret; | 6268 | goto out; |
| 6243 | 6269 | ||
| 6244 | goto again; | 6270 | goto again; |
| 6245 | } | 6271 | } |
| 6246 | 6272 | ||
| 6247 | ret = splice_to_pipe(pipe, &spd); | 6273 | ret = splice_to_pipe(pipe, &spd); |
| 6274 | out: | ||
| 6248 | splice_shrink_spd(&spd); | 6275 | splice_shrink_spd(&spd); |
| 6249 | 6276 | ||
| 6250 | return ret; | 6277 | return ret; |
| @@ -7195,7 +7222,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) | |||
| 7195 | 7222 | ||
| 7196 | create_trace_options_dir(tr); | 7223 | create_trace_options_dir(tr); |
| 7197 | 7224 | ||
| 7198 | #ifdef CONFIG_TRACER_MAX_TRACE | 7225 | #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) |
| 7199 | trace_create_file("tracing_max_latency", 0644, d_tracer, | 7226 | trace_create_file("tracing_max_latency", 0644, d_tracer, |
| 7200 | &tr->max_latency, &tracing_max_lat_fops); | 7227 | &tr->max_latency, &tracing_max_lat_fops); |
| 7201 | #endif | 7228 | #endif |
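The readme_msg lines added above document the dynamic-event syntax (p|r[:[<group>/]<event>] <place> [<args>]) directly in the README file. As a hedged usage sketch, a definition in that format could be installed from user space as below; the probed symbol, group/event names, register fetcharg, and tracefs mount point are illustrative assumptions, and the x64 fetch type is the hexadecimal alias introduced later in this series in trace_probe.c:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Hypothetical probe: attach to do_sys_open and record its first
	 * argument (x86-64 %di) using the hexadecimal x64 fetch type. */
	const char *def = "p:mygrp/open_entry do_sys_open dfd=%di:x64\n";
	int fd = open("/sys/kernel/debug/tracing/kprobe_events",
		      O_WRONLY | O_APPEND);

	if (fd < 0) {
		perror("kprobe_events");
		return 1;
	}
	if (write(fd, def, strlen(def)) < 0)
		perror("write");
	close(fd);
	return 0;
}

Reading the trace buffer afterwards would then show the recorded dfd values in hex, per the "0x%x" print format added in trace_probe.c.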
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f783df416726..fd24b1f9ac43 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
| @@ -38,6 +38,7 @@ enum trace_type { | |||
| 38 | TRACE_USER_STACK, | 38 | TRACE_USER_STACK, |
| 39 | TRACE_BLK, | 39 | TRACE_BLK, |
| 40 | TRACE_BPUTS, | 40 | TRACE_BPUTS, |
| 41 | TRACE_HWLAT, | ||
| 41 | 42 | ||
| 42 | __TRACE_LAST_TYPE, | 43 | __TRACE_LAST_TYPE, |
| 43 | }; | 44 | }; |
| @@ -213,6 +214,8 @@ struct trace_array { | |||
| 213 | */ | 214 | */ |
| 214 | struct trace_buffer max_buffer; | 215 | struct trace_buffer max_buffer; |
| 215 | bool allocated_snapshot; | 216 | bool allocated_snapshot; |
| 217 | #endif | ||
| 218 | #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) | ||
| 216 | unsigned long max_latency; | 219 | unsigned long max_latency; |
| 217 | #endif | 220 | #endif |
| 218 | struct trace_pid_list __rcu *filtered_pids; | 221 | struct trace_pid_list __rcu *filtered_pids; |
| @@ -326,6 +329,7 @@ extern void __ftrace_bad_type(void); | |||
| 326 | IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ | 329 | IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ |
| 327 | IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ | 330 | IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ |
| 328 | IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ | 331 | IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ |
| 332 | IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT); \ | ||
| 329 | IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ | 333 | IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ |
| 330 | TRACE_MMIO_RW); \ | 334 | TRACE_MMIO_RW); \ |
| 331 | IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ | 335 | IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ |
| @@ -571,6 +575,7 @@ void tracing_reset_current(int cpu); | |||
| 571 | void tracing_reset_all_online_cpus(void); | 575 | void tracing_reset_all_online_cpus(void); |
| 572 | int tracing_open_generic(struct inode *inode, struct file *filp); | 576 | int tracing_open_generic(struct inode *inode, struct file *filp); |
| 573 | bool tracing_is_disabled(void); | 577 | bool tracing_is_disabled(void); |
| 578 | int tracer_tracing_is_on(struct trace_array *tr); | ||
| 574 | struct dentry *trace_create_file(const char *name, | 579 | struct dentry *trace_create_file(const char *name, |
| 575 | umode_t mode, | 580 | umode_t mode, |
| 576 | struct dentry *parent, | 581 | struct dentry *parent, |
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 5c30efcda5e6..d1cc37e78f99 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h | |||
| @@ -322,3 +322,30 @@ FTRACE_ENTRY(branch, trace_branch, | |||
| 322 | FILTER_OTHER | 322 | FILTER_OTHER |
| 323 | ); | 323 | ); |
| 324 | 324 | ||
| 325 | |||
| 326 | FTRACE_ENTRY(hwlat, hwlat_entry, | ||
| 327 | |||
| 328 | TRACE_HWLAT, | ||
| 329 | |||
| 330 | F_STRUCT( | ||
| 331 | __field( u64, duration ) | ||
| 332 | __field( u64, outer_duration ) | ||
| 333 | __field( u64, nmi_total_ts ) | ||
| 334 | __field_struct( struct timespec, timestamp ) | ||
| 335 | __field_desc( long, timestamp, tv_sec ) | ||
| 336 | __field_desc( long, timestamp, tv_nsec ) | ||
| 337 | __field( unsigned int, nmi_count ) | ||
| 338 | __field( unsigned int, seqnum ) | ||
| 339 | ), | ||
| 340 | |||
| 341 | F_printk("cnt:%u\tts:%010lu.%010lu\tinner:%llu\touter:%llunmi-ts:%llu\tnmi-count:%u\n", | ||
| 342 | __entry->seqnum, | ||
| 343 | __entry->tv_sec, | ||
| 344 | __entry->tv_nsec, | ||
| 345 | __entry->duration, | ||
| 346 | __entry->outer_duration, | ||
| 347 | __entry->nmi_total_ts, | ||
| 348 | __entry->nmi_count), | ||
| 349 | |||
| 350 | FILTER_OTHER | ||
| 351 | ); | ||
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index a975571cde24..6721a1e89f39 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c | |||
| @@ -1028,6 +1028,7 @@ static struct event_command trigger_traceon_cmd = { | |||
| 1028 | static struct event_command trigger_traceoff_cmd = { | 1028 | static struct event_command trigger_traceoff_cmd = { |
| 1029 | .name = "traceoff", | 1029 | .name = "traceoff", |
| 1030 | .trigger_type = ETT_TRACE_ONOFF, | 1030 | .trigger_type = ETT_TRACE_ONOFF, |
| 1031 | .flags = EVENT_CMD_FL_POST_TRIGGER, | ||
| 1031 | .func = event_trigger_callback, | 1032 | .func = event_trigger_callback, |
| 1032 | .reg = register_trigger, | 1033 | .reg = register_trigger, |
| 1033 | .unreg = unregister_trigger, | 1034 | .unreg = unregister_trigger, |
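The one-line flag addition above marks the traceoff trigger as a post-trigger command, so the event that fires the trigger can itself be recorded before tracing is switched off. For context, such a trigger is armed through the event's trigger file; a hedged sketch, where the sched/sched_switch event path and the one-shot count are illustrative assumptions:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* Arm a one-shot traceoff trigger on sched/sched_switch. */
int arm_traceoff_trigger(void)
{
	const char *cmd = "traceoff:1\n";
	int fd = open("/sys/kernel/debug/tracing/events/sched/sched_switch/trigger",
		      O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, cmd, strlen(cmd)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}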
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 7363ccf79512..4e480e870474 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
| @@ -119,7 +119,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, | |||
| 119 | /* Add a function return address to the trace stack on thread info.*/ | 119 | /* Add a function return address to the trace stack on thread info.*/ |
| 120 | int | 120 | int |
| 121 | ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, | 121 | ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, |
| 122 | unsigned long frame_pointer) | 122 | unsigned long frame_pointer, unsigned long *retp) |
| 123 | { | 123 | { |
| 124 | unsigned long long calltime; | 124 | unsigned long long calltime; |
| 125 | int index; | 125 | int index; |
| @@ -170,8 +170,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, | |||
| 170 | current->ret_stack[index].ret = ret; | 170 | current->ret_stack[index].ret = ret; |
| 171 | current->ret_stack[index].func = func; | 171 | current->ret_stack[index].func = func; |
| 172 | current->ret_stack[index].calltime = calltime; | 172 | current->ret_stack[index].calltime = calltime; |
| 173 | current->ret_stack[index].subtime = 0; | 173 | #ifdef HAVE_FUNCTION_GRAPH_FP_TEST |
| 174 | current->ret_stack[index].fp = frame_pointer; | 174 | current->ret_stack[index].fp = frame_pointer; |
| 175 | #endif | ||
| 176 | #ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR | ||
| 177 | current->ret_stack[index].retp = retp; | ||
| 178 | #endif | ||
| 175 | *depth = current->curr_ret_stack; | 179 | *depth = current->curr_ret_stack; |
| 176 | 180 | ||
| 177 | return 0; | 181 | return 0; |
| @@ -204,7 +208,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, | |||
| 204 | return; | 208 | return; |
| 205 | } | 209 | } |
| 206 | 210 | ||
| 207 | #if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY) | 211 | #ifdef HAVE_FUNCTION_GRAPH_FP_TEST |
| 208 | /* | 212 | /* |
| 209 | * The arch may choose to record the frame pointer used | 213 | * The arch may choose to record the frame pointer used |
| 210 | * and check it here to make sure that it is what we expect it | 214 | * and check it here to make sure that it is what we expect it |
| @@ -279,6 +283,64 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) | |||
| 279 | return ret; | 283 | return ret; |
| 280 | } | 284 | } |
| 281 | 285 | ||
| 286 | /** | ||
| 287 | * ftrace_graph_ret_addr - convert a potentially modified stack return address | ||
| 288 | * to its original value | ||
| 289 | * | ||
| 290 | * This function can be called by stack unwinding code to convert a found stack | ||
| 291 | * return address ('ret') to its original value, in case the function graph | ||
| 292 | * tracer has modified it to be 'return_to_handler'. If the address hasn't | ||
| 293 | * been modified, the unchanged value of 'ret' is returned. | ||
| 294 | * | ||
| 295 | * 'idx' is a state variable which should be initialized by the caller to zero | ||
| 296 | * before the first call. | ||
| 297 | * | ||
| 298 | * 'retp' is a pointer to the return address on the stack. It's ignored if | ||
| 299 | * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined. | ||
| 300 | */ | ||
| 301 | #ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR | ||
| 302 | unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, | ||
| 303 | unsigned long ret, unsigned long *retp) | ||
| 304 | { | ||
| 305 | int index = task->curr_ret_stack; | ||
| 306 | int i; | ||
| 307 | |||
| 308 | if (ret != (unsigned long)return_to_handler) | ||
| 309 | return ret; | ||
| 310 | |||
| 311 | if (index < -1) | ||
| 312 | index += FTRACE_NOTRACE_DEPTH; | ||
| 313 | |||
| 314 | if (index < 0) | ||
| 315 | return ret; | ||
| 316 | |||
| 317 | for (i = 0; i <= index; i++) | ||
| 318 | if (task->ret_stack[i].retp == retp) | ||
| 319 | return task->ret_stack[i].ret; | ||
| 320 | |||
| 321 | return ret; | ||
| 322 | } | ||
| 323 | #else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ | ||
| 324 | unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, | ||
| 325 | unsigned long ret, unsigned long *retp) | ||
| 326 | { | ||
| 327 | int task_idx; | ||
| 328 | |||
| 329 | if (ret != (unsigned long)return_to_handler) | ||
| 330 | return ret; | ||
| 331 | |||
| 332 | task_idx = task->curr_ret_stack; | ||
| 333 | |||
| 334 | if (!task->ret_stack || task_idx < *idx) | ||
| 335 | return ret; | ||
| 336 | |||
| 337 | task_idx -= *idx; | ||
| 338 | (*idx)++; | ||
| 339 | |||
| 340 | return task->ret_stack[task_idx].ret; | ||
| 341 | } | ||
| 342 | #endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ | ||
| 343 | |||
| 282 | int __trace_graph_entry(struct trace_array *tr, | 344 | int __trace_graph_entry(struct trace_array *tr, |
| 283 | struct ftrace_graph_ent *trace, | 345 | struct ftrace_graph_ent *trace, |
| 284 | unsigned long flags, | 346 | unsigned long flags, |
| @@ -1120,6 +1182,11 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, | |||
| 1120 | trace_seq_puts(s, "/* "); | 1182 | trace_seq_puts(s, "/* "); |
| 1121 | 1183 | ||
| 1122 | switch (iter->ent->type) { | 1184 | switch (iter->ent->type) { |
| 1185 | case TRACE_BPUTS: | ||
| 1186 | ret = trace_print_bputs_msg_only(iter); | ||
| 1187 | if (ret != TRACE_TYPE_HANDLED) | ||
| 1188 | return ret; | ||
| 1189 | break; | ||
| 1123 | case TRACE_BPRINT: | 1190 | case TRACE_BPRINT: |
| 1124 | ret = trace_print_bprintk_msg_only(iter); | 1191 | ret = trace_print_bprintk_msg_only(iter); |
| 1125 | if (ret != TRACE_TYPE_HANDLED) | 1192 | if (ret != TRACE_TYPE_HANDLED) |
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c new file mode 100644 index 000000000000..b97286c48735 --- /dev/null +++ b/kernel/trace/trace_hwlat.c | |||
| @@ -0,0 +1,633 @@ | |||
| 1 | /* | ||
| 2 | * trace_hwlatdetect.c - A simple Hardware Latency detector. | ||
| 3 | * | ||
| 4 | * Use this tracer to detect large system latencies induced by the behavior of | ||
| 5 | * certain underlying system hardware or firmware, independent of Linux itself. | ||
| 6 | * The code was developed originally to detect the presence of SMIs on Intel | ||
| 7 | * and AMD systems, although there is no dependency upon x86 herein. | ||
| 8 | * | ||
| 9 | * The classical example usage of this tracer is in detecting the presence of | ||
| 10 | * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a | ||
| 11 | * somewhat special form of hardware interrupt spawned from earlier CPU debug | ||
| 12 | * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge | ||
| 13 | * LPC (or other device) to generate a special interrupt under certain | ||
| 14 | * circumstances, for example, upon expiration of a special SMI timer device, | ||
| 15 | * due to certain external thermal readings, on certain I/O address accesses, | ||
| 16 | * and other situations. An SMI hits a special CPU pin, triggers a special | ||
| 17 | * SMI mode (complete with special memory map), and the OS is unaware. | ||
| 18 | * | ||
| 19 | * Although certain hardware-inducing latencies are necessary (for example, | ||
| 20 | * a modern system often requires an SMI handler for correct thermal control | ||
| 21 | * and remote management) they can wreak havoc upon any OS-level performance | ||
| 22 | * guarantees toward low-latency, especially when the OS is not even made | ||
| 23 | * aware of the presence of these interrupts. For this reason, we need a | ||
| 24 | * somewhat brute force mechanism to detect these interrupts. In this case, | ||
| 25 | * we do it by hogging all of the CPU(s) for configurable timer intervals, | ||
| 26 | * sampling the built-in CPU timer, looking for discontiguous readings. | ||
| 27 | * | ||
| 28 | * WARNING: This implementation necessarily introduces latencies. Therefore, | ||
| 29 | * you should NEVER use this tracer while running in a production | ||
| 30 | * environment requiring any kind of low-latency performance | ||
| 31 | * guarantee(s). | ||
| 32 | * | ||
| 33 | * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com> | ||
| 34 | * Copyright (C) 2013-2016 Steven Rostedt, Red Hat, Inc. <srostedt@redhat.com> | ||
| 35 | * | ||
| 36 | * Includes useful feedback from Clark Williams <clark@redhat.com> | ||
| 37 | * | ||
| 38 | * This file is licensed under the terms of the GNU General Public | ||
| 39 | * License version 2. This program is licensed "as is" without any | ||
| 40 | * warranty of any kind, whether express or implied. | ||
| 41 | */ | ||
| 42 | #include <linux/kthread.h> | ||
| 43 | #include <linux/tracefs.h> | ||
| 44 | #include <linux/uaccess.h> | ||
| 45 | #include <linux/cpumask.h> | ||
| 46 | #include <linux/delay.h> | ||
| 47 | #include "trace.h" | ||
| 48 | |||
| 49 | static struct trace_array *hwlat_trace; | ||
| 50 | |||
| 51 | #define U64STR_SIZE 22 /* 20 digits max */ | ||
| 52 | |||
| 53 | #define BANNER "hwlat_detector: " | ||
| 54 | #define DEFAULT_SAMPLE_WINDOW 1000000 /* 1s */ | ||
| 55 | #define DEFAULT_SAMPLE_WIDTH 500000 /* 0.5s */ | ||
| 56 | #define DEFAULT_LAT_THRESHOLD 10 /* 10us */ | ||
| 57 | |||
| 58 | /* sampling thread */ | ||
| 59 | static struct task_struct *hwlat_kthread; | ||
| 60 | |||
| 61 | static struct dentry *hwlat_sample_width; /* sample width us */ | ||
| 62 | static struct dentry *hwlat_sample_window; /* sample window us */ | ||
| 63 | |||
| 64 | /* Save the previous tracing_thresh value */ | ||
| 65 | static unsigned long save_tracing_thresh; | ||
| 66 | |||
| 67 | /* NMI timestamp counters */ | ||
| 68 | static u64 nmi_ts_start; | ||
| 69 | static u64 nmi_total_ts; | ||
| 70 | static int nmi_count; | ||
| 71 | static int nmi_cpu; | ||
| 72 | |||
| 73 | /* Tells NMIs to call back to the hwlat tracer to record timestamps */ | ||
| 74 | bool trace_hwlat_callback_enabled; | ||
| 75 | |||
| 76 | /* If the user changed threshold, remember it */ | ||
| 77 | static u64 last_tracing_thresh = DEFAULT_LAT_THRESHOLD * NSEC_PER_USEC; | ||
| 78 | |||
| 79 | /* Individual latency samples are stored here when detected. */ | ||
| 80 | struct hwlat_sample { | ||
| 81 | u64 seqnum; /* unique sequence */ | ||
| 82 | u64 duration; /* delta */ | ||
| 83 | u64 outer_duration; /* delta (outer loop) */ | ||
| 84 | u64 nmi_total_ts; /* Total time spent in NMIs */ | ||
| 85 | struct timespec timestamp; /* wall time */ | ||
| 86 | int nmi_count; /* # NMIs during this sample */ | ||
| 87 | }; | ||
| 88 | |||
| 89 | /* keep the global state somewhere. */ | ||
| 90 | static struct hwlat_data { | ||
| 91 | |||
| 92 | struct mutex lock; /* protect changes */ | ||
| 93 | |||
| 94 | u64 count; /* total since reset */ | ||
| 95 | |||
| 96 | u64 sample_window; /* total sampling window (on+off) */ | ||
| 97 | u64 sample_width; /* active sampling portion of window */ | ||
| 98 | |||
| 99 | } hwlat_data = { | ||
| 100 | .sample_window = DEFAULT_SAMPLE_WINDOW, | ||
| 101 | .sample_width = DEFAULT_SAMPLE_WIDTH, | ||
| 102 | }; | ||
| 103 | |||
| 104 | static void trace_hwlat_sample(struct hwlat_sample *sample) | ||
| 105 | { | ||
| 106 | struct trace_array *tr = hwlat_trace; | ||
| 107 | struct trace_event_call *call = &event_hwlat; | ||
| 108 | struct ring_buffer *buffer = tr->trace_buffer.buffer; | ||
| 109 | struct ring_buffer_event *event; | ||
| 110 | struct hwlat_entry *entry; | ||
| 111 | unsigned long flags; | ||
| 112 | int pc; | ||
| 113 | |||
| 114 | pc = preempt_count(); | ||
| 115 | local_save_flags(flags); | ||
| 116 | |||
| 117 | event = trace_buffer_lock_reserve(buffer, TRACE_HWLAT, sizeof(*entry), | ||
| 118 | flags, pc); | ||
| 119 | if (!event) | ||
| 120 | return; | ||
| 121 | entry = ring_buffer_event_data(event); | ||
| 122 | entry->seqnum = sample->seqnum; | ||
| 123 | entry->duration = sample->duration; | ||
| 124 | entry->outer_duration = sample->outer_duration; | ||
| 125 | entry->timestamp = sample->timestamp; | ||
| 126 | entry->nmi_total_ts = sample->nmi_total_ts; | ||
| 127 | entry->nmi_count = sample->nmi_count; | ||
| 128 | |||
| 129 | if (!call_filter_check_discard(call, entry, buffer, event)) | ||
| 130 | __buffer_unlock_commit(buffer, event); | ||
| 131 | } | ||
| 132 | |||
| 133 | /* Macros to encapsulate the time capturing infrastructure */ | ||
| 134 | #define time_type u64 | ||
| 135 | #define time_get() trace_clock_local() | ||
| 136 | #define time_to_us(x) div_u64(x, 1000) | ||
| 137 | #define time_sub(a, b) ((a) - (b)) | ||
| 138 | #define init_time(a, b) (a = b) | ||
| 139 | #define time_u64(a) a | ||
| 140 | |||
| 141 | void trace_hwlat_callback(bool enter) | ||
| 142 | { | ||
| 143 | if (smp_processor_id() != nmi_cpu) | ||
| 144 | return; | ||
| 145 | |||
| 146 | /* | ||
| 147 | * Currently trace_clock_local() calls sched_clock() and the | ||
| 148 | * generic version is not NMI safe. | ||
| 149 | */ | ||
| 150 | if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) { | ||
| 151 | if (enter) | ||
| 152 | nmi_ts_start = time_get(); | ||
| 153 | else | ||
| 154 | nmi_total_ts = time_get() - nmi_ts_start; | ||
| 155 | } | ||
| 156 | |||
| 157 | if (enter) | ||
| 158 | nmi_count++; | ||
| 159 | } | ||
| 160 | |||
| 161 | /** | ||
| 162 | * get_sample - sample the CPU TSC and look for likely hardware latencies | ||
| 163 | * | ||
| 164 | * Used to repeatedly capture the CPU TSC (or similar), looking for potential | ||
| 165 | * hardware-induced latency. Called with interrupts disabled and with | ||
| 166 | * hwlat_data.lock held. | ||
| 167 | */ | ||
| 168 | static int get_sample(void) | ||
| 169 | { | ||
| 170 | struct trace_array *tr = hwlat_trace; | ||
| 171 | time_type start, t1, t2, last_t2; | ||
| 172 | s64 diff, total, last_total = 0; | ||
| 173 | u64 sample = 0; | ||
| 174 | u64 thresh = tracing_thresh; | ||
| 175 | u64 outer_sample = 0; | ||
| 176 | int ret = -1; | ||
| 177 | |||
| 178 | do_div(thresh, NSEC_PER_USEC); /* modifies thresh value */ | ||
| 179 | |||
| 180 | nmi_cpu = smp_processor_id(); | ||
| 181 | nmi_total_ts = 0; | ||
| 182 | nmi_count = 0; | ||
| 183 | /* Make sure NMIs see this first */ | ||
| 184 | barrier(); | ||
| 185 | |||
| 186 | trace_hwlat_callback_enabled = true; | ||
| 187 | |||
| 188 | init_time(last_t2, 0); | ||
| 189 | start = time_get(); /* start timestamp */ | ||
| 190 | |||
| 191 | do { | ||
| 192 | |||
| 193 | t1 = time_get(); /* we'll look for a discontinuity */ | ||
| 194 | t2 = time_get(); | ||
| 195 | |||
| 196 | if (time_u64(last_t2)) { | ||
| 197 | /* Check the delta from outer loop (t2 to next t1) */ | ||
| 198 | diff = time_to_us(time_sub(t1, last_t2)); | ||
| 199 | /* This shouldn't happen */ | ||
| 200 | if (diff < 0) { | ||
| 201 | pr_err(BANNER "time running backwards\n"); | ||
| 202 | goto out; | ||
| 203 | } | ||
| 204 | if (diff > outer_sample) | ||
| 205 | outer_sample = diff; | ||
| 206 | } | ||
| 207 | last_t2 = t2; | ||
| 208 | |||
| 209 | total = time_to_us(time_sub(t2, start)); /* sample width */ | ||
| 210 | |||
| 211 | /* Check for possible overflows */ | ||
| 212 | if (total < last_total) { | ||
| 213 | pr_err("Time total overflowed\n"); | ||
| 214 | break; | ||
| 215 | } | ||
| 216 | last_total = total; | ||
| 217 | |||
| 218 | /* This checks the inner loop (t1 to t2) */ | ||
| 219 | diff = time_to_us(time_sub(t2, t1)); /* current diff */ | ||
| 220 | |||
| 221 | /* This shouldn't happen */ | ||
| 222 | if (diff < 0) { | ||
| 223 | pr_err(BANNER "time running backwards\n"); | ||
| 224 | goto out; | ||
| 225 | } | ||
| 226 | |||
| 227 | if (diff > sample) | ||
| 228 | sample = diff; /* only want highest value */ | ||
| 229 | |||
| 230 | } while (total <= hwlat_data.sample_width); | ||
| 231 | |||
| 232 | barrier(); /* finish the above in the view for NMIs */ | ||
| 233 | trace_hwlat_callback_enabled = false; | ||
| 234 | barrier(); /* Make sure nmi_total_ts is no longer updated */ | ||
| 235 | |||
| 236 | ret = 0; | ||
| 237 | |||
| 238 | /* If we exceed the threshold value, we have found a hardware latency */ | ||
| 239 | if (sample > thresh || outer_sample > thresh) { | ||
| 240 | struct hwlat_sample s; | ||
| 241 | |||
| 242 | ret = 1; | ||
| 243 | |||
| 244 | /* We read in microseconds */ | ||
| 245 | if (nmi_total_ts) | ||
| 246 | do_div(nmi_total_ts, NSEC_PER_USEC); | ||
| 247 | |||
| 248 | hwlat_data.count++; | ||
| 249 | s.seqnum = hwlat_data.count; | ||
| 250 | s.duration = sample; | ||
| 251 | s.outer_duration = outer_sample; | ||
| 252 | s.timestamp = CURRENT_TIME; | ||
| 253 | s.nmi_total_ts = nmi_total_ts; | ||
| 254 | s.nmi_count = nmi_count; | ||
| 255 | trace_hwlat_sample(&s); | ||
| 256 | |||
| 257 | /* Keep a running maximum ever recorded hardware latency */ | ||
| 258 | if (sample > tr->max_latency) | ||
| 259 | tr->max_latency = sample; | ||
| 260 | } | ||
| 261 | |||
| 262 | out: | ||
| 263 | return ret; | ||
| 264 | } | ||
| 265 | |||
| 266 | static struct cpumask save_cpumask; | ||
| 267 | static bool disable_migrate; | ||
| 268 | |||
| 269 | static void move_to_next_cpu(void) | ||
| 270 | { | ||
| 271 | static struct cpumask *current_mask; | ||
| 272 | int next_cpu; | ||
| 273 | |||
| 274 | if (disable_migrate) | ||
| 275 | return; | ||
| 276 | |||
| 277 | /* Just pick the first CPU on first iteration */ | ||
| 278 | if (!current_mask) { | ||
| 279 | current_mask = &save_cpumask; | ||
| 280 | get_online_cpus(); | ||
| 281 | cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); | ||
| 282 | put_online_cpus(); | ||
| 283 | next_cpu = cpumask_first(current_mask); | ||
| 284 | goto set_affinity; | ||
| 285 | } | ||
| 286 | |||
| 287 | /* | ||
| 288 | * If for some reason the user modifies the CPU affinity | ||
| 289 | * of this thread, then stop migrating for the duration | ||
| 290 | * of the current test. | ||
| 291 | */ | ||
| 292 | if (!cpumask_equal(current_mask, &current->cpus_allowed)) | ||
| 293 | goto disable; | ||
| 294 | |||
| 295 | get_online_cpus(); | ||
| 296 | cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); | ||
| 297 | next_cpu = cpumask_next(smp_processor_id(), current_mask); | ||
| 298 | put_online_cpus(); | ||
| 299 | |||
| 300 | if (next_cpu >= nr_cpu_ids) | ||
| 301 | next_cpu = cpumask_first(current_mask); | ||
| 302 | |||
| 303 | set_affinity: | ||
| 304 | if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */ | ||
| 305 | goto disable; | ||
| 306 | |||
| 307 | cpumask_clear(current_mask); | ||
| 308 | cpumask_set_cpu(next_cpu, current_mask); | ||
| 309 | |||
| 310 | sched_setaffinity(0, current_mask); | ||
| 311 | return; | ||
| 312 | |||
| 313 | disable: | ||
| 314 | disable_migrate = true; | ||
| 315 | } | ||
| 316 | |||
| 317 | /* | ||
| 318 | * kthread_fn - The CPU time sampling/hardware latency detection kernel thread | ||
| 319 | * | ||
| 320 | * Used to periodically sample the CPU TSC via a call to get_sample. We | ||
| 321 | * disable interrupts, which does (intentionally) introduce latency since we | ||
| 322 | * need to ensure nothing else might be running (and thus preempting). | ||
| 323 | * Obviously this should never be used in production environments. | ||
| 324 | * | ||
| 325 | * Currently this runs on whichever CPU it was scheduled on, even though | ||
| 326 | * most real-world hardware latency situations occur across several CPUs; | ||
| 327 | * we might later generalize this if we find there are any actual | ||
| 328 | * systems with alternate SMI delivery or other hardware latencies. | ||
| 329 | */ | ||
| 330 | static int kthread_fn(void *data) | ||
| 331 | { | ||
| 332 | u64 interval; | ||
| 333 | |||
| 334 | while (!kthread_should_stop()) { | ||
| 335 | |||
| 336 | move_to_next_cpu(); | ||
| 337 | |||
| 338 | local_irq_disable(); | ||
| 339 | get_sample(); | ||
| 340 | local_irq_enable(); | ||
| 341 | |||
| 342 | mutex_lock(&hwlat_data.lock); | ||
| 343 | interval = hwlat_data.sample_window - hwlat_data.sample_width; | ||
| 344 | mutex_unlock(&hwlat_data.lock); | ||
| 345 | |||
| 346 | do_div(interval, USEC_PER_MSEC); /* modifies interval value */ | ||
| 347 | |||
| 348 | /* Always sleep for at least 1ms */ | ||
| 349 | if (interval < 1) | ||
| 350 | interval = 1; | ||
| 351 | |||
| 352 | if (msleep_interruptible(interval)) | ||
| 353 | break; | ||
| 354 | } | ||
| 355 | |||
| 356 | return 0; | ||
| 357 | } | ||
| 358 | |||
| 359 | /** | ||
| 360 | * start_kthread - Kick off the hardware latency sampling/detector kthread | ||
| 361 | * | ||
| 362 | * This starts the kernel thread that will sit and sample the CPU timestamp | ||
| 363 | * counter (TSC or similar) and look for potential hardware latencies. | ||
| 364 | */ | ||
| 365 | static int start_kthread(struct trace_array *tr) | ||
| 366 | { | ||
| 367 | struct task_struct *kthread; | ||
| 368 | |||
| 369 | kthread = kthread_create(kthread_fn, NULL, "hwlatd"); | ||
| 370 | if (IS_ERR(kthread)) { | ||
| 371 | pr_err(BANNER "could not start sampling thread\n"); | ||
| 372 | return -ENOMEM; | ||
| 373 | } | ||
| 374 | hwlat_kthread = kthread; | ||
| 375 | wake_up_process(kthread); | ||
| 376 | |||
| 377 | return 0; | ||
| 378 | } | ||
| 379 | |||
| 380 | /** | ||
| 381 | * stop_kthread - Inform the hardware latency sampling/detector kthread to stop | ||
| 382 | * | ||
| 383 | * This kicks the running hardware latency sampling/detector kernel thread and | ||
| 384 | * tells it to stop sampling now. Use this on unload and at system shutdown. | ||
| 385 | */ | ||
| 386 | static void stop_kthread(void) | ||
| 387 | { | ||
| 388 | if (!hwlat_kthread) | ||
| 389 | return; | ||
| 390 | kthread_stop(hwlat_kthread); | ||
| 391 | hwlat_kthread = NULL; | ||
| 392 | } | ||
| 393 | |||
| 394 | /* | ||
| 395 | * hwlat_read - Wrapper read function for reading both window and width | ||
| 396 | * @filp: The active open file structure | ||
| 397 | * @ubuf: The userspace provided buffer to read value into | ||
| 398 | * @cnt: The maximum number of bytes to read | ||
| 399 | * @ppos: The current "file" position | ||
| 400 | * | ||
| 401 | * This function provides a generic read implementation for the global state | ||
| 402 | * "hwlat_data" structure filesystem entries. | ||
| 403 | */ | ||
| 404 | static ssize_t hwlat_read(struct file *filp, char __user *ubuf, | ||
| 405 | size_t cnt, loff_t *ppos) | ||
| 406 | { | ||
| 407 | char buf[U64STR_SIZE]; | ||
| 408 | u64 *entry = filp->private_data; | ||
| 409 | u64 val; | ||
| 410 | int len; | ||
| 411 | |||
| 412 | if (!entry) | ||
| 413 | return -EFAULT; | ||
| 414 | |||
| 415 | if (cnt > sizeof(buf)) | ||
| 416 | cnt = sizeof(buf); | ||
| 417 | |||
| 418 | val = *entry; | ||
| 419 | |||
| 420 | len = snprintf(buf, sizeof(buf), "%llu\n", val); | ||
| 421 | |||
| 422 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); | ||
| 423 | } | ||
| 424 | |||
| 425 | /** | ||
| 426 | * hwlat_width_write - Write function for "width" entry | ||
| 427 | * @filp: The active open file structure | ||
| 428 | * @ubuf: The user buffer that contains the value to write | ||
| 429 | * @cnt: The maximum number of bytes to write to "file" | ||
| 430 | * @ppos: The current position in @file | ||
| 431 | * | ||
| 432 | * This function provides a write implementation for the "width" interface | ||
| 433 | * to the hardware latency detector. It can be used to configure | ||
| 434 | * how many us of the total window we will actively sample for any | ||
| 435 | * hardware-induced latency periods. Obviously, it is not possible to | ||
| 436 | * sample constantly and have the system respond to a sample reader, or, | ||
| 437 | * worse, without having the system appear to have gone out to lunch. It | ||
| 438 | * is enforced that width is less than the total window size. | ||
| 439 | */ | ||
| 440 | static ssize_t | ||
| 441 | hwlat_width_write(struct file *filp, const char __user *ubuf, | ||
| 442 | size_t cnt, loff_t *ppos) | ||
| 443 | { | ||
| 444 | u64 val; | ||
| 445 | int err; | ||
| 446 | |||
| 447 | err = kstrtoull_from_user(ubuf, cnt, 10, &val); | ||
| 448 | if (err) | ||
| 449 | return err; | ||
| 450 | |||
| 451 | mutex_lock(&hwlat_data.lock); | ||
| 452 | if (val < hwlat_data.sample_window) | ||
| 453 | hwlat_data.sample_width = val; | ||
| 454 | else | ||
| 455 | err = -EINVAL; | ||
| 456 | mutex_unlock(&hwlat_data.lock); | ||
| 457 | |||
| 458 | if (err) | ||
| 459 | return err; | ||
| 460 | |||
| 461 | return cnt; | ||
| 462 | } | ||
| 463 | |||
| 464 | /** | ||
| 465 | * hwlat_window_write - Write function for "window" entry | ||
| 466 | * @filp: The active open file structure | ||
| 467 | * @ubuf: The user buffer that contains the value to write | ||
| 468 | * @cnt: The maximum number of bytes to write to "file" | ||
| 469 | * @ppos: The current position in @file | ||
| 470 | * | ||
| 471 | * This function provides a write implementation for the "window" interface | ||
| 472 | * to the hardware latency detector. The window is the total time | ||
| 473 | * in us that will be considered one sample period. Conceptually, windows | ||
| 474 | * occur back-to-back and contain a sample width period during which | ||
| 475 | * actual sampling occurs. Can be used to write a new total window size. It | ||
| 476 | * is enforced that any value written must be greater than the sample width | ||
| 477 | * size, or an error results. | ||
| 478 | */ | ||
| 479 | static ssize_t | ||
| 480 | hwlat_window_write(struct file *filp, const char __user *ubuf, | ||
| 481 | size_t cnt, loff_t *ppos) | ||
| 482 | { | ||
| 483 | u64 val; | ||
| 484 | int err; | ||
| 485 | |||
| 486 | err = kstrtoull_from_user(ubuf, cnt, 10, &val); | ||
| 487 | if (err) | ||
| 488 | return err; | ||
| 489 | |||
| 490 | mutex_lock(&hwlat_data.lock); | ||
| 491 | if (hwlat_data.sample_width < val) | ||
| 492 | hwlat_data.sample_window = val; | ||
| 493 | else | ||
| 494 | err = -EINVAL; | ||
| 495 | mutex_unlock(&hwlat_data.lock); | ||
| 496 | |||
| 497 | if (err) | ||
| 498 | return err; | ||
| 499 | |||
| 500 | return cnt; | ||
| 501 | } | ||
| 502 | |||
| 503 | static const struct file_operations width_fops = { | ||
| 504 | .open = tracing_open_generic, | ||
| 505 | .read = hwlat_read, | ||
| 506 | .write = hwlat_width_write, | ||
| 507 | }; | ||
| 508 | |||
| 509 | static const struct file_operations window_fops = { | ||
| 510 | .open = tracing_open_generic, | ||
| 511 | .read = hwlat_read, | ||
| 512 | .write = hwlat_window_write, | ||
| 513 | }; | ||
| 514 | |||
| 515 | /** | ||
| 516 | * init_tracefs - A function to initialize the tracefs interface files | ||
| 517 | * | ||
| 518 | * This function creates entries in tracefs for "hwlat_detector". | ||
| 519 | * It creates the hwlat_detector directory in the tracing directory, | ||
| 520 | * and within that directory is the count, width and window files to | ||
| 521 | * change and view those values. | ||
| 522 | */ | ||
| 523 | static int init_tracefs(void) | ||
| 524 | { | ||
| 525 | struct dentry *d_tracer; | ||
| 526 | struct dentry *top_dir; | ||
| 527 | |||
| 528 | d_tracer = tracing_init_dentry(); | ||
| 529 | if (IS_ERR(d_tracer)) | ||
| 530 | return -ENOMEM; | ||
| 531 | |||
| 532 | top_dir = tracefs_create_dir("hwlat_detector", d_tracer); | ||
| 533 | if (!top_dir) | ||
| 534 | return -ENOMEM; | ||
| 535 | |||
| 536 | hwlat_sample_window = tracefs_create_file("window", 0640, | ||
| 537 | top_dir, | ||
| 538 | &hwlat_data.sample_window, | ||
| 539 | &window_fops); | ||
| 540 | if (!hwlat_sample_window) | ||
| 541 | goto err; | ||
| 542 | |||
| 543 | hwlat_sample_width = tracefs_create_file("width", 0644, | ||
| 544 | top_dir, | ||
| 545 | &hwlat_data.sample_width, | ||
| 546 | &width_fops); | ||
| 547 | if (!hwlat_sample_width) | ||
| 548 | goto err; | ||
| 549 | |||
| 550 | return 0; | ||
| 551 | |||
| 552 | err: | ||
| 553 | tracefs_remove_recursive(top_dir); | ||
| 554 | return -ENOMEM; | ||
| 555 | } | ||
| 556 | |||
| 557 | static void hwlat_tracer_start(struct trace_array *tr) | ||
| 558 | { | ||
| 559 | int err; | ||
| 560 | |||
| 561 | err = start_kthread(tr); | ||
| 562 | if (err) | ||
| 563 | pr_err(BANNER "Cannot start hwlat kthread\n"); | ||
| 564 | } | ||
| 565 | |||
| 566 | static void hwlat_tracer_stop(struct trace_array *tr) | ||
| 567 | { | ||
| 568 | stop_kthread(); | ||
| 569 | } | ||
| 570 | |||
| 571 | static bool hwlat_busy; | ||
| 572 | |||
| 573 | static int hwlat_tracer_init(struct trace_array *tr) | ||
| 574 | { | ||
| 575 | /* Only allow one instance to enable this */ | ||
| 576 | if (hwlat_busy) | ||
| 577 | return -EBUSY; | ||
| 578 | |||
| 579 | hwlat_trace = tr; | ||
| 580 | |||
| 581 | disable_migrate = false; | ||
| 582 | hwlat_data.count = 0; | ||
| 583 | tr->max_latency = 0; | ||
| 584 | save_tracing_thresh = tracing_thresh; | ||
| 585 | |||
| 586 | /* tracing_thresh is in nsecs, we speak in usecs */ | ||
| 587 | if (!tracing_thresh) | ||
| 588 | tracing_thresh = last_tracing_thresh; | ||
| 589 | |||
| 590 | if (tracer_tracing_is_on(tr)) | ||
| 591 | hwlat_tracer_start(tr); | ||
| 592 | |||
| 593 | hwlat_busy = true; | ||
| 594 | |||
| 595 | return 0; | ||
| 596 | } | ||
| 597 | |||
| 598 | static void hwlat_tracer_reset(struct trace_array *tr) | ||
| 599 | { | ||
| 600 | stop_kthread(); | ||
| 601 | |||
| 602 | /* the tracing threshold is static between runs */ | ||
| 603 | last_tracing_thresh = tracing_thresh; | ||
| 604 | |||
| 605 | tracing_thresh = save_tracing_thresh; | ||
| 606 | hwlat_busy = false; | ||
| 607 | } | ||
| 608 | |||
| 609 | static struct tracer hwlat_tracer __read_mostly = | ||
| 610 | { | ||
| 611 | .name = "hwlat", | ||
| 612 | .init = hwlat_tracer_init, | ||
| 613 | .reset = hwlat_tracer_reset, | ||
| 614 | .start = hwlat_tracer_start, | ||
| 615 | .stop = hwlat_tracer_stop, | ||
| 616 | .allow_instances = true, | ||
| 617 | }; | ||
| 618 | |||
| 619 | __init static int init_hwlat_tracer(void) | ||
| 620 | { | ||
| 621 | int ret; | ||
| 622 | |||
| 623 | mutex_init(&hwlat_data.lock); | ||
| 624 | |||
| 625 | ret = register_tracer(&hwlat_tracer); | ||
| 626 | if (ret) | ||
| 627 | return ret; | ||
| 628 | |||
| 629 | init_tracefs(); | ||
| 630 | |||
| 631 | return 0; | ||
| 632 | } | ||
| 633 | late_initcall(init_hwlat_tracer); | ||
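Taken together, the new hwlat tracer is driven entirely through tracefs: init_tracefs() creates hwlat_detector/width and hwlat_detector/window (both in microseconds), and selecting "hwlat" as the current tracer starts the sampling kthread (provided the ring buffer is on, per hwlat_tracer_init()). A hedged user-space sketch; the tracefs mount point is an assumption, and leaving tracing_thresh at zero lets the tracer fall back to its 10us default:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

#define TRACEFS "/sys/kernel/debug/tracing/"

static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	/* Defaults shown explicitly: 0.5s of sampling per 1s window. */
	write_str(TRACEFS "hwlat_detector/width", "500000\n");
	write_str(TRACEFS "hwlat_detector/window", "1000000\n");
	write_str(TRACEFS "current_tracer", "hwlat\n");
	return 0;
}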
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 9aedb0b06683..eb6c9f1d3a93 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
| @@ -253,6 +253,10 @@ static const struct fetch_type kprobes_fetch_type_table[] = { | |||
| 253 | ASSIGN_FETCH_TYPE(s16, u16, 1), | 253 | ASSIGN_FETCH_TYPE(s16, u16, 1), |
| 254 | ASSIGN_FETCH_TYPE(s32, u32, 1), | 254 | ASSIGN_FETCH_TYPE(s32, u32, 1), |
| 255 | ASSIGN_FETCH_TYPE(s64, u64, 1), | 255 | ASSIGN_FETCH_TYPE(s64, u64, 1), |
| 256 | ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0), | ||
| 257 | ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), | ||
| 258 | ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), | ||
| 259 | ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0), | ||
| 256 | 260 | ||
| 257 | ASSIGN_FETCH_TYPE_END | 261 | ASSIGN_FETCH_TYPE_END |
| 258 | }; | 262 | }; |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0bb9cf2d53e6..3fc20422c166 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
| @@ -1098,6 +1098,71 @@ static struct trace_event trace_user_stack_event = { | |||
| 1098 | .funcs = &trace_user_stack_funcs, | 1098 | .funcs = &trace_user_stack_funcs, |
| 1099 | }; | 1099 | }; |
| 1100 | 1100 | ||
| 1101 | /* TRACE_HWLAT */ | ||
| 1102 | static enum print_line_t | ||
| 1103 | trace_hwlat_print(struct trace_iterator *iter, int flags, | ||
| 1104 | struct trace_event *event) | ||
| 1105 | { | ||
| 1106 | struct trace_entry *entry = iter->ent; | ||
| 1107 | struct trace_seq *s = &iter->seq; | ||
| 1108 | struct hwlat_entry *field; | ||
| 1109 | |||
| 1110 | trace_assign_type(field, entry); | ||
| 1111 | |||
| 1112 | trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ld.%09ld", | ||
| 1113 | field->seqnum, | ||
| 1114 | field->duration, | ||
| 1115 | field->outer_duration, | ||
| 1116 | field->timestamp.tv_sec, | ||
| 1117 | field->timestamp.tv_nsec); | ||
| 1118 | |||
| 1119 | if (field->nmi_count) { | ||
| 1120 | /* | ||
| 1121 | * The generic sched_clock() is not NMI safe, thus | ||
| 1122 | * we only record the count and not the time. | ||
| 1123 | */ | ||
| 1124 | if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) | ||
| 1125 | trace_seq_printf(s, " nmi-total:%llu", | ||
| 1126 | field->nmi_total_ts); | ||
| 1127 | trace_seq_printf(s, " nmi-count:%u", | ||
| 1128 | field->nmi_count); | ||
| 1129 | } | ||
| 1130 | |||
| 1131 | trace_seq_putc(s, '\n'); | ||
| 1132 | |||
| 1133 | return trace_handle_return(s); | ||
| 1134 | } | ||
| 1135 | |||
| 1136 | |||
| 1137 | static enum print_line_t | ||
| 1138 | trace_hwlat_raw(struct trace_iterator *iter, int flags, | ||
| 1139 | struct trace_event *event) | ||
| 1140 | { | ||
| 1141 | struct hwlat_entry *field; | ||
| 1142 | struct trace_seq *s = &iter->seq; | ||
| 1143 | |||
| 1144 | trace_assign_type(field, iter->ent); | ||
| 1145 | |||
| 1146 | trace_seq_printf(s, "%llu %lld %ld %09ld %u\n", | ||
| 1147 | field->duration, | ||
| 1148 | field->outer_duration, | ||
| 1149 | field->timestamp.tv_sec, | ||
| 1150 | field->timestamp.tv_nsec, | ||
| 1151 | field->seqnum); | ||
| 1152 | |||
| 1153 | return trace_handle_return(s); | ||
| 1154 | } | ||
| 1155 | |||
| 1156 | static struct trace_event_functions trace_hwlat_funcs = { | ||
| 1157 | .trace = trace_hwlat_print, | ||
| 1158 | .raw = trace_hwlat_raw, | ||
| 1159 | }; | ||
| 1160 | |||
| 1161 | static struct trace_event trace_hwlat_event = { | ||
| 1162 | .type = TRACE_HWLAT, | ||
| 1163 | .funcs = &trace_hwlat_funcs, | ||
| 1164 | }; | ||
| 1165 | |||
| 1101 | /* TRACE_BPUTS */ | 1166 | /* TRACE_BPUTS */ |
| 1102 | static enum print_line_t | 1167 | static enum print_line_t |
| 1103 | trace_bputs_print(struct trace_iterator *iter, int flags, | 1168 | trace_bputs_print(struct trace_iterator *iter, int flags, |
| @@ -1233,6 +1298,7 @@ static struct trace_event *events[] __initdata = { | |||
| 1233 | &trace_bputs_event, | 1298 | &trace_bputs_event, |
| 1234 | &trace_bprint_event, | 1299 | &trace_bprint_event, |
| 1235 | &trace_print_event, | 1300 | &trace_print_event, |
| 1301 | &trace_hwlat_event, | ||
| 1236 | NULL | 1302 | NULL |
| 1237 | }; | 1303 | }; |
| 1238 | 1304 | ||
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 74e80a582c28..8c0553d9afd3 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c | |||
| @@ -36,24 +36,28 @@ const char *reserved_field_names[] = { | |||
| 36 | }; | 36 | }; |
| 37 | 37 | ||
| 38 | /* Printing in basic type function template */ | 38 | /* Printing in basic type function template */ |
| 39 | #define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \ | 39 | #define DEFINE_BASIC_PRINT_TYPE_FUNC(tname, type, fmt) \ |
| 40 | int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ | 40 | int PRINT_TYPE_FUNC_NAME(tname)(struct trace_seq *s, const char *name, \ |
| 41 | void *data, void *ent) \ | 41 | void *data, void *ent) \ |
| 42 | { \ | 42 | { \ |
| 43 | trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ | 43 | trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ |
| 44 | return !trace_seq_has_overflowed(s); \ | 44 | return !trace_seq_has_overflowed(s); \ |
| 45 | } \ | 45 | } \ |
| 46 | const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \ | 46 | const char PRINT_TYPE_FMT_NAME(tname)[] = fmt; \ |
| 47 | NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type)); | 47 | NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(tname)); |
| 48 | 48 | ||
| 49 | DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x") | 49 | DEFINE_BASIC_PRINT_TYPE_FUNC(u8, u8, "%u") |
| 50 | DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x") | 50 | DEFINE_BASIC_PRINT_TYPE_FUNC(u16, u16, "%u") |
| 51 | DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "0x%x") | 51 | DEFINE_BASIC_PRINT_TYPE_FUNC(u32, u32, "%u") |
| 52 | DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "0x%Lx") | 52 | DEFINE_BASIC_PRINT_TYPE_FUNC(u64, u64, "%Lu") |
| 53 | DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d") | 53 | DEFINE_BASIC_PRINT_TYPE_FUNC(s8, s8, "%d") |
| 54 | DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d") | 54 | DEFINE_BASIC_PRINT_TYPE_FUNC(s16, s16, "%d") |
| 55 | DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d") | 55 | DEFINE_BASIC_PRINT_TYPE_FUNC(s32, s32, "%d") |
| 56 | DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld") | 56 | DEFINE_BASIC_PRINT_TYPE_FUNC(s64, s64, "%Ld") |
| 57 | DEFINE_BASIC_PRINT_TYPE_FUNC(x8, u8, "0x%x") | ||
| 58 | DEFINE_BASIC_PRINT_TYPE_FUNC(x16, u16, "0x%x") | ||
| 59 | DEFINE_BASIC_PRINT_TYPE_FUNC(x32, u32, "0x%x") | ||
| 60 | DEFINE_BASIC_PRINT_TYPE_FUNC(x64, u64, "0x%Lx") | ||
| 57 | 61 | ||
| 58 | /* Print type function for string type */ | 62 | /* Print type function for string type */ |
| 59 | int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, | 63 | int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, |
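
The extra "tname" parameter lets the x-types reuse the unsigned fetch machinery while printing a different format: u32 now prints decimal and x32 keeps the old hex output. A simplified userspace rendering of the macro, with printf standing in for trace_seq_printf:

    /* Simplified stand-in for DEFINE_BASIC_PRINT_TYPE_FUNC: tname names the
     * function, type is the C type actually read, fmt is the output format. */
    #include <stdio.h>
    #include <stdint.h>

    typedef uint32_t u32;

    #define DEFINE_BASIC_PRINT_TYPE_FUNC(tname, type, fmt)              \
    static void print_type_##tname(const char *name, void *data)        \
    {                                                                    \
            printf(" %s=" fmt, name, *(type *)data);                     \
    }

    DEFINE_BASIC_PRINT_TYPE_FUNC(u32, u32, "%u")    /* decimal after this patch */
    DEFINE_BASIC_PRINT_TYPE_FUNC(x32, u32, "0x%x")  /* hex alias of u32 */

    int main(void)
    {
            u32 v = 48879;

            print_type_u32("val", &v);   /* " val=48879" */
            print_type_x32("val", &v);   /* " val=0xbeef" */
            printf("\n");
            return 0;
    }
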
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 45400ca5ded1..0c0ae54d44c6 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h | |||
| @@ -149,6 +149,11 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(s8); | |||
| 149 | DECLARE_BASIC_PRINT_TYPE_FUNC(s16); | 149 | DECLARE_BASIC_PRINT_TYPE_FUNC(s16); |
| 150 | DECLARE_BASIC_PRINT_TYPE_FUNC(s32); | 150 | DECLARE_BASIC_PRINT_TYPE_FUNC(s32); |
| 151 | DECLARE_BASIC_PRINT_TYPE_FUNC(s64); | 151 | DECLARE_BASIC_PRINT_TYPE_FUNC(s64); |
| 152 | DECLARE_BASIC_PRINT_TYPE_FUNC(x8); | ||
| 153 | DECLARE_BASIC_PRINT_TYPE_FUNC(x16); | ||
| 154 | DECLARE_BASIC_PRINT_TYPE_FUNC(x32); | ||
| 155 | DECLARE_BASIC_PRINT_TYPE_FUNC(x64); | ||
| 156 | |||
| 152 | DECLARE_BASIC_PRINT_TYPE_FUNC(string); | 157 | DECLARE_BASIC_PRINT_TYPE_FUNC(string); |
| 153 | 158 | ||
| 154 | #define FETCH_FUNC_NAME(method, type) fetch_##method##_##type | 159 | #define FETCH_FUNC_NAME(method, type) fetch_##method##_##type |
| @@ -203,7 +208,7 @@ DEFINE_FETCH_##method(u32) \ | |||
| 203 | DEFINE_FETCH_##method(u64) | 208 | DEFINE_FETCH_##method(u64) |
| 204 | 209 | ||
| 205 | /* Default (unsigned long) fetch type */ | 210 | /* Default (unsigned long) fetch type */ |
| 206 | #define __DEFAULT_FETCH_TYPE(t) u##t | 211 | #define __DEFAULT_FETCH_TYPE(t) x##t |
| 207 | #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) | 212 | #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) |
| 208 | #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) | 213 | #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) |
| 209 | #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) | 214 | #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) |
| @@ -234,6 +239,10 @@ ASSIGN_FETCH_FUNC(file_offset, ftype), \ | |||
| 234 | #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ | 239 | #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ |
| 235 | __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) | 240 | __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) |
| 236 | 241 | ||
| 242 | /* If ptype is an alias of atype, use this macro (show atype in format) */ | ||
| 243 | #define ASSIGN_FETCH_TYPE_ALIAS(ptype, atype, ftype, sign) \ | ||
| 244 | __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #atype) | ||
| 245 | |||
| 237 | #define ASSIGN_FETCH_TYPE_END {} | 246 | #define ASSIGN_FETCH_TYPE_END {} |
| 238 | 247 | ||
| 239 | #define FETCH_TYPE_STRING 0 | 248 | #define FETCH_TYPE_STRING 0 |
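
Switching __DEFAULT_FETCH_TYPE from u##t to x##t means an argument with no explicit type annotation now defaults to the hexadecimal alias: on a 64-bit kernel DEFAULT_FETCH_TYPE_STR becomes "x64" instead of "u64". A small sketch of the token pasting and stringification, with BITS_PER_LONG hard-coded as an assumption:

    /* Sketch of how DEFAULT_FETCH_TYPE_STR is built; BITS_PER_LONG assumed 64. */
    #include <stdio.h>

    #define BITS_PER_LONG 64

    #define __stringify_1(x) #x
    #define __stringify(x)   __stringify_1(x)

    #define __DEFAULT_FETCH_TYPE(t) x##t           /* was u##t before this patch */
    #define _DEFAULT_FETCH_TYPE(t)  __DEFAULT_FETCH_TYPE(t)
    #define DEFAULT_FETCH_TYPE      _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
    #define DEFAULT_FETCH_TYPE_STR  __stringify(DEFAULT_FETCH_TYPE)

    int main(void)
    {
            printf("%s\n", DEFAULT_FETCH_TYPE_STR);  /* prints "x64" */
            return 0;
    }
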
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index b2b6efc083a4..5e10395da88e 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
| @@ -610,8 +610,7 @@ static int perf_sysenter_enable(struct trace_event_call *call) | |||
| 610 | if (!sys_perf_refcount_enter) | 610 | if (!sys_perf_refcount_enter) |
| 611 | ret = register_trace_sys_enter(perf_syscall_enter, NULL); | 611 | ret = register_trace_sys_enter(perf_syscall_enter, NULL); |
| 612 | if (ret) { | 612 | if (ret) { |
| 613 | pr_info("event trace: Could not activate" | 613 | pr_info("event trace: Could not activate syscall entry trace point"); |
| 614 | "syscall entry trace point"); | ||
| 615 | } else { | 614 | } else { |
| 616 | set_bit(num, enabled_perf_enter_syscalls); | 615 | set_bit(num, enabled_perf_enter_syscalls); |
| 617 | sys_perf_refcount_enter++; | 616 | sys_perf_refcount_enter++; |
| @@ -682,8 +681,7 @@ static int perf_sysexit_enable(struct trace_event_call *call) | |||
| 682 | if (!sys_perf_refcount_exit) | 681 | if (!sys_perf_refcount_exit) |
| 683 | ret = register_trace_sys_exit(perf_syscall_exit, NULL); | 682 | ret = register_trace_sys_exit(perf_syscall_exit, NULL); |
| 684 | if (ret) { | 683 | if (ret) { |
| 685 | pr_info("event trace: Could not activate" | 684 | pr_info("event trace: Could not activate syscall exit trace point"); |
| 686 | "syscall exit trace point"); | ||
| 687 | } else { | 685 | } else { |
| 688 | set_bit(num, enabled_perf_exit_syscalls); | 686 | set_bit(num, enabled_perf_exit_syscalls); |
| 689 | sys_perf_refcount_exit++; | 687 | sys_perf_refcount_exit++; |
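
The two pr_info() fixes above address a classic adjacent-string-literal bug: splitting the message across two literals without a trailing space glues the words together. A tiny illustration:

    /* Adjacent string literals concatenate verbatim; without a trailing space
     * the original message read "...activatesyscall entry trace point". */
    #include <stdio.h>

    int main(void)
    {
            printf("event trace: Could not activate"
                   "syscall entry trace point\n");   /* ...activatesyscall... */
            printf("event trace: Could not activate syscall entry trace point\n");
            return 0;
    }
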
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c53485441c88..0913693caf6e 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
| @@ -211,6 +211,10 @@ static const struct fetch_type uprobes_fetch_type_table[] = { | |||
| 211 | ASSIGN_FETCH_TYPE(s16, u16, 1), | 211 | ASSIGN_FETCH_TYPE(s16, u16, 1), |
| 212 | ASSIGN_FETCH_TYPE(s32, u32, 1), | 212 | ASSIGN_FETCH_TYPE(s32, u32, 1), |
| 213 | ASSIGN_FETCH_TYPE(s64, u64, 1), | 213 | ASSIGN_FETCH_TYPE(s64, u64, 1), |
| 214 | ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0), | ||
| 215 | ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), | ||
| 216 | ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), | ||
| 217 | ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0), | ||
| 214 | 218 | ||
| 215 | ASSIGN_FETCH_TYPE_END | 219 | ASSIGN_FETCH_TYPE_END |
| 216 | }; | 220 | }; |
| @@ -427,10 +431,6 @@ static int create_trace_uprobe(int argc, char **argv) | |||
| 427 | pr_info("Probe point is not specified.\n"); | 431 | pr_info("Probe point is not specified.\n"); |
| 428 | return -EINVAL; | 432 | return -EINVAL; |
| 429 | } | 433 | } |
| 430 | if (isdigit(argv[1][0])) { | ||
| 431 | pr_info("probe point must be have a filename.\n"); | ||
| 432 | return -EINVAL; | ||
| 433 | } | ||
| 434 | arg = strchr(argv[1], ':'); | 434 | arg = strchr(argv[1], ':'); |
| 435 | if (!arg) { | 435 | if (!arg) { |
| 436 | ret = -EINVAL; | 436 | ret = -EINVAL; |
diff --git a/kernel/ucount.c b/kernel/ucount.c new file mode 100644 index 000000000000..9d20d5dd298a --- /dev/null +++ b/kernel/ucount.c | |||
| @@ -0,0 +1,235 @@ | |||
| 1 | /* | ||
| 2 | * This program is free software; you can redistribute it and/or | ||
| 3 | * modify it under the terms of the GNU General Public License as | ||
| 4 | * published by the Free Software Foundation, version 2 of the | ||
| 5 | * License. | ||
| 6 | */ | ||
| 7 | |||
| 8 | #include <linux/stat.h> | ||
| 9 | #include <linux/sysctl.h> | ||
| 10 | #include <linux/slab.h> | ||
| 11 | #include <linux/hash.h> | ||
| 12 | #include <linux/user_namespace.h> | ||
| 13 | |||
| 14 | #define UCOUNTS_HASHTABLE_BITS 10 | ||
| 15 | static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)]; | ||
| 16 | static DEFINE_SPINLOCK(ucounts_lock); | ||
| 17 | |||
| 18 | #define ucounts_hashfn(ns, uid) \ | ||
| 19 | hash_long((unsigned long)__kuid_val(uid) + (unsigned long)(ns), \ | ||
| 20 | UCOUNTS_HASHTABLE_BITS) | ||
| 21 | #define ucounts_hashentry(ns, uid) \ | ||
| 22 | (ucounts_hashtable + ucounts_hashfn(ns, uid)) | ||
| 23 | |||
| 24 | |||
| 25 | #ifdef CONFIG_SYSCTL | ||
| 26 | static struct ctl_table_set * | ||
| 27 | set_lookup(struct ctl_table_root *root) | ||
| 28 | { | ||
| 29 | return ¤t_user_ns()->set; | ||
| 30 | } | ||
| 31 | |||
| 32 | static int set_is_seen(struct ctl_table_set *set) | ||
| 33 | { | ||
| 34 | return ¤t_user_ns()->set == set; | ||
| 35 | } | ||
| 36 | |||
| 37 | static int set_permissions(struct ctl_table_header *head, | ||
| 38 | struct ctl_table *table) | ||
| 39 | { | ||
| 40 | struct user_namespace *user_ns = | ||
| 41 | container_of(head->set, struct user_namespace, set); | ||
| 42 | int mode; | ||
| 43 | |||
| 44 | /* Allow users with CAP_SYS_RESOURCE unrestrained access */ | ||
| 45 | if (ns_capable(user_ns, CAP_SYS_RESOURCE)) | ||
| 46 | mode = (table->mode & S_IRWXU) >> 6; | ||
| 47 | else | ||
| 48 | /* Allow all others at most read-only access */ | ||
| 49 | mode = table->mode & S_IROTH; | ||
| 50 | return (mode << 6) | (mode << 3) | mode; | ||
| 51 | } | ||
| 52 | |||
| 53 | static struct ctl_table_root set_root = { | ||
| 54 | .lookup = set_lookup, | ||
| 55 | .permissions = set_permissions, | ||
| 56 | }; | ||
| 57 | |||
| 58 | static int zero = 0; | ||
| 59 | static int int_max = INT_MAX; | ||
| 60 | #define UCOUNT_ENTRY(name) \ | ||
| 61 | { \ | ||
| 62 | .procname = name, \ | ||
| 63 | .maxlen = sizeof(int), \ | ||
| 64 | .mode = 0644, \ | ||
| 65 | .proc_handler = proc_dointvec_minmax, \ | ||
| 66 | .extra1 = &zero, \ | ||
| 67 | .extra2 = &int_max, \ | ||
| 68 | } | ||
| 69 | static struct ctl_table user_table[] = { | ||
| 70 | UCOUNT_ENTRY("max_user_namespaces"), | ||
| 71 | UCOUNT_ENTRY("max_pid_namespaces"), | ||
| 72 | UCOUNT_ENTRY("max_uts_namespaces"), | ||
| 73 | UCOUNT_ENTRY("max_ipc_namespaces"), | ||
| 74 | UCOUNT_ENTRY("max_net_namespaces"), | ||
| 75 | UCOUNT_ENTRY("max_mnt_namespaces"), | ||
| 76 | UCOUNT_ENTRY("max_cgroup_namespaces"), | ||
| 77 | { } | ||
| 78 | }; | ||
| 79 | #endif /* CONFIG_SYSCTL */ | ||
| 80 | |||
| 81 | bool setup_userns_sysctls(struct user_namespace *ns) | ||
| 82 | { | ||
| 83 | #ifdef CONFIG_SYSCTL | ||
| 84 | struct ctl_table *tbl; | ||
| 85 | setup_sysctl_set(&ns->set, &set_root, set_is_seen); | ||
| 86 | tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL); | ||
| 87 | if (tbl) { | ||
| 88 | int i; | ||
| 89 | for (i = 0; i < UCOUNT_COUNTS; i++) { | ||
| 90 | tbl[i].data = &ns->ucount_max[i]; | ||
| 91 | } | ||
| 92 | ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl); | ||
| 93 | } | ||
| 94 | if (!ns->sysctls) { | ||
| 95 | kfree(tbl); | ||
| 96 | retire_sysctl_set(&ns->set); | ||
| 97 | return false; | ||
| 98 | } | ||
| 99 | #endif | ||
| 100 | return true; | ||
| 101 | } | ||
| 102 | |||
| 103 | void retire_userns_sysctls(struct user_namespace *ns) | ||
| 104 | { | ||
| 105 | #ifdef CONFIG_SYSCTL | ||
| 106 | struct ctl_table *tbl; | ||
| 107 | |||
| 108 | tbl = ns->sysctls->ctl_table_arg; | ||
| 109 | unregister_sysctl_table(ns->sysctls); | ||
| 110 | retire_sysctl_set(&ns->set); | ||
| 111 | kfree(tbl); | ||
| 112 | #endif | ||
| 113 | } | ||
| 114 | |||
| 115 | static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent) | ||
| 116 | { | ||
| 117 | struct ucounts *ucounts; | ||
| 118 | |||
| 119 | hlist_for_each_entry(ucounts, hashent, node) { | ||
| 120 | if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns)) | ||
| 121 | return ucounts; | ||
| 122 | } | ||
| 123 | return NULL; | ||
| 124 | } | ||
| 125 | |||
| 126 | static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) | ||
| 127 | { | ||
| 128 | struct hlist_head *hashent = ucounts_hashentry(ns, uid); | ||
| 129 | struct ucounts *ucounts, *new; | ||
| 130 | |||
| 131 | spin_lock(&ucounts_lock); | ||
| 132 | ucounts = find_ucounts(ns, uid, hashent); | ||
| 133 | if (!ucounts) { | ||
| 134 | spin_unlock(&ucounts_lock); | ||
| 135 | |||
| 136 | new = kzalloc(sizeof(*new), GFP_KERNEL); | ||
| 137 | if (!new) | ||
| 138 | return NULL; | ||
| 139 | |||
| 140 | new->ns = ns; | ||
| 141 | new->uid = uid; | ||
| 142 | atomic_set(&new->count, 0); | ||
| 143 | |||
| 144 | spin_lock(&ucounts_lock); | ||
| 145 | ucounts = find_ucounts(ns, uid, hashent); | ||
| 146 | if (ucounts) { | ||
| 147 | kfree(new); | ||
| 148 | } else { | ||
| 149 | hlist_add_head(&new->node, hashent); | ||
| 150 | ucounts = new; | ||
| 151 | } | ||
| 152 | } | ||
| 153 | if (!atomic_add_unless(&ucounts->count, 1, INT_MAX)) | ||
| 154 | ucounts = NULL; | ||
| 155 | spin_unlock(&ucounts_lock); | ||
| 156 | return ucounts; | ||
| 157 | } | ||
| 158 | |||
| 159 | static void put_ucounts(struct ucounts *ucounts) | ||
| 160 | { | ||
| 161 | if (atomic_dec_and_test(&ucounts->count)) { | ||
| 162 | spin_lock(&ucounts_lock); | ||
| 163 | hlist_del_init(&ucounts->node); | ||
| 164 | spin_unlock(&ucounts_lock); | ||
| 165 | |||
| 166 | kfree(ucounts); | ||
| 167 | } | ||
| 168 | } | ||
| 169 | |||
| 170 | static inline bool atomic_inc_below(atomic_t *v, int u) | ||
| 171 | { | ||
| 172 | int c, old; | ||
| 173 | c = atomic_read(v); | ||
| 174 | for (;;) { | ||
| 175 | if (unlikely(c >= u)) | ||
| 176 | return false; | ||
| 177 | old = atomic_cmpxchg(v, c, c+1); | ||
| 178 | if (likely(old == c)) | ||
| 179 | return true; | ||
| 180 | c = old; | ||
| 181 | } | ||
| 182 | } | ||
| 183 | |||
| 184 | struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, | ||
| 185 | enum ucount_type type) | ||
| 186 | { | ||
| 187 | struct ucounts *ucounts, *iter, *bad; | ||
| 188 | struct user_namespace *tns; | ||
| 189 | ucounts = get_ucounts(ns, uid); | ||
| 190 | for (iter = ucounts; iter; iter = tns->ucounts) { | ||
| 191 | int max; | ||
| 192 | tns = iter->ns; | ||
| 193 | max = READ_ONCE(tns->ucount_max[type]); | ||
| 194 | if (!atomic_inc_below(&iter->ucount[type], max)) | ||
| 195 | goto fail; | ||
| 196 | } | ||
| 197 | return ucounts; | ||
| 198 | fail: | ||
| 199 | bad = iter; | ||
| 200 | for (iter = ucounts; iter != bad; iter = iter->ns->ucounts) | ||
| 201 | atomic_dec(&iter->ucount[type]); | ||
| 202 | |||
| 203 | put_ucounts(ucounts); | ||
| 204 | return NULL; | ||
| 205 | } | ||
| 206 | |||
| 207 | void dec_ucount(struct ucounts *ucounts, enum ucount_type type) | ||
| 208 | { | ||
| 209 | struct ucounts *iter; | ||
| 210 | for (iter = ucounts; iter; iter = iter->ns->ucounts) { | ||
| 211 | int dec = atomic_dec_if_positive(&iter->ucount[type]); | ||
| 212 | WARN_ON_ONCE(dec < 0); | ||
| 213 | } | ||
| 214 | put_ucounts(ucounts); | ||
| 215 | } | ||
| 216 | |||
| 217 | static __init int user_namespace_sysctl_init(void) | ||
| 218 | { | ||
| 219 | #ifdef CONFIG_SYSCTL | ||
| 220 | static struct ctl_table_header *user_header; | ||
| 221 | static struct ctl_table empty[1]; | ||
| 222 | /* | ||
| 223 | * It is necessary to register the user directory in the | ||
| 224 | * default set so that registrations in the child sets work | ||
| 225 | * properly. | ||
| 226 | */ | ||
| 227 | user_header = register_sysctl("user", empty); | ||
| 228 | BUG_ON(!user_header); | ||
| 229 | BUG_ON(!setup_userns_sysctls(&init_user_ns)); | ||
| 230 | #endif | ||
| 231 | return 0; | ||
| 232 | } | ||
| 233 | subsys_initcall(user_namespace_sysctl_init); | ||
| 234 | |||
| 235 | |||
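
The per-namespace limits in ucount.c hinge on atomic_inc_below(): a compare-and-swap loop that increments a counter only while it stays below the maximum, so inc_ucount() can charge a whole hierarchy of ucounts without holding a lock. A userspace sketch of the same loop using C11 atomics; the names mirror the kernel function but the program is illustrative:

    /* Userspace rendition of atomic_inc_below(): increment *v only if the
     * current value is still below the limit u. */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static bool atomic_inc_below(atomic_int *v, int u)
    {
            int c = atomic_load(v);

            for (;;) {
                    if (c >= u)
                            return false;
                    /* On failure, c is reloaded with the current value. */
                    if (atomic_compare_exchange_weak(v, &c, c + 1))
                            return true;
            }
    }

    int main(void)
    {
            atomic_int count = 2;

            printf("%d\n", atomic_inc_below(&count, 3));   /* 1: 2 -> 3 */
            printf("%d\n", atomic_inc_below(&count, 3));   /* 0: limit reached */
            return 0;
    }
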
diff --git a/kernel/uid16.c b/kernel/uid16.c index d58cc4d8f0d1..cc40793464e3 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c | |||
| @@ -117,7 +117,7 @@ static int groups16_to_user(old_gid_t __user *grouplist, | |||
| 117 | kgid_t kgid; | 117 | kgid_t kgid; |
| 118 | 118 | ||
| 119 | for (i = 0; i < group_info->ngroups; i++) { | 119 | for (i = 0; i < group_info->ngroups; i++) { |
| 120 | kgid = GROUP_AT(group_info, i); | 120 | kgid = group_info->gid[i]; |
| 121 | group = high2lowgid(from_kgid_munged(user_ns, kgid)); | 121 | group = high2lowgid(from_kgid_munged(user_ns, kgid)); |
| 122 | if (put_user(group, grouplist+i)) | 122 | if (put_user(group, grouplist+i)) |
| 123 | return -EFAULT; | 123 | return -EFAULT; |
| @@ -142,7 +142,7 @@ static int groups16_from_user(struct group_info *group_info, | |||
| 142 | if (!gid_valid(kgid)) | 142 | if (!gid_valid(kgid)) |
| 143 | return -EINVAL; | 143 | return -EINVAL; |
| 144 | 144 | ||
| 145 | GROUP_AT(group_info, i) = kgid; | 145 | group_info->gid[i] = kgid; |
| 146 | } | 146 | } |
| 147 | 147 | ||
| 148 | return 0; | 148 | return 0; |
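
The GROUP_AT() accessor disappears here because, elsewhere in this series, group_info stores its GIDs in a single flexible array instead of two-level indirect blocks, so callers index ->gid[] directly. A reduced userspace model of the new layout; kgid_t is a stand-in typedef and the struct is an assumption based on that change:

    /* Reduced model of the post-patch struct group_info: one allocation,
     * direct indexing, no GROUP_AT() macro. */
    #include <stdio.h>
    #include <stdlib.h>

    typedef unsigned int kgid_t;

    struct group_info {
            int ngroups;
            kgid_t gid[];           /* flexible array member */
    };

    int main(void)
    {
            int i, n = 3;
            struct group_info *gi = malloc(sizeof(*gi) + n * sizeof(gi->gid[0]));

            if (!gi)
                    return 1;
            gi->ngroups = n;
            for (i = 0; i < n; i++)
                    gi->gid[i] = 1000 + i;      /* was GROUP_AT(gi, i) = ... */
            for (i = 0; i < n; i++)
                    printf("%u\n", gi->gid[i]);
            free(gi);
            return 0;
    }
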
diff --git a/kernel/up.c b/kernel/up.c index 1760bf3d1463..ee81ac9af4ca 100644 --- a/kernel/up.c +++ b/kernel/up.c | |||
| @@ -6,6 +6,7 @@ | |||
| 6 | #include <linux/kernel.h> | 6 | #include <linux/kernel.h> |
| 7 | #include <linux/export.h> | 7 | #include <linux/export.h> |
| 8 | #include <linux/smp.h> | 8 | #include <linux/smp.h> |
| 9 | #include <linux/hypervisor.h> | ||
| 9 | 10 | ||
| 10 | int smp_call_function_single(int cpu, void (*func) (void *info), void *info, | 11 | int smp_call_function_single(int cpu, void (*func) (void *info), void *info, |
| 11 | int wait) | 12 | int wait) |
| @@ -82,3 +83,20 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), | |||
| 82 | preempt_enable(); | 83 | preempt_enable(); |
| 83 | } | 84 | } |
| 84 | EXPORT_SYMBOL(on_each_cpu_cond); | 85 | EXPORT_SYMBOL(on_each_cpu_cond); |
| 86 | |||
| 87 | int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) | ||
| 88 | { | ||
| 89 | int ret; | ||
| 90 | |||
| 91 | if (cpu != 0) | ||
| 92 | return -ENXIO; | ||
| 93 | |||
| 94 | if (phys) | ||
| 95 | hypervisor_pin_vcpu(0); | ||
| 96 | ret = func(par); | ||
| 97 | if (phys) | ||
| 98 | hypervisor_pin_vcpu(-1); | ||
| 99 | |||
| 100 | return ret; | ||
| 101 | } | ||
| 102 | EXPORT_SYMBOL_GPL(smp_call_on_cpu); | ||
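
The UP stub gives smp_call_on_cpu() the same contract as the SMP version: run func(par) on the requested CPU (only CPU 0 exists here) and, when phys is true, pin the vCPU to its physical CPU around the call. A hedged kernel-style usage sketch; the callback and its payload are invented for illustration:

    /* Hypothetical caller: run a sampling function on CPU 0 without
     * physical pinning.  read_counter() is made up. */
    #include <linux/jiffies.h>
    #include <linux/smp.h>

    static int read_counter(void *arg)
    {
            unsigned long *out = arg;

            *out = jiffies;                 /* placeholder work done on CPU 0 */
            return 0;
    }

    static int sample_on_cpu0(unsigned long *out)
    {
            /* phys=false: no hypervisor_pin_vcpu() round trip */
            return smp_call_on_cpu(0, read_counter, out, false);
    }
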
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 68f594212759..86b7854fec8e 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
| @@ -29,6 +29,17 @@ static DEFINE_MUTEX(userns_state_mutex); | |||
| 29 | static bool new_idmap_permitted(const struct file *file, | 29 | static bool new_idmap_permitted(const struct file *file, |
| 30 | struct user_namespace *ns, int cap_setid, | 30 | struct user_namespace *ns, int cap_setid, |
| 31 | struct uid_gid_map *map); | 31 | struct uid_gid_map *map); |
| 32 | static void free_user_ns(struct work_struct *work); | ||
| 33 | |||
| 34 | static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid) | ||
| 35 | { | ||
| 36 | return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES); | ||
| 37 | } | ||
| 38 | |||
| 39 | static void dec_user_namespaces(struct ucounts *ucounts) | ||
| 40 | { | ||
| 41 | return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES); | ||
| 42 | } | ||
| 32 | 43 | ||
| 33 | static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) | 44 | static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) |
| 34 | { | 45 | { |
| @@ -62,10 +73,16 @@ int create_user_ns(struct cred *new) | |||
| 62 | struct user_namespace *ns, *parent_ns = new->user_ns; | 73 | struct user_namespace *ns, *parent_ns = new->user_ns; |
| 63 | kuid_t owner = new->euid; | 74 | kuid_t owner = new->euid; |
| 64 | kgid_t group = new->egid; | 75 | kgid_t group = new->egid; |
| 65 | int ret; | 76 | struct ucounts *ucounts; |
| 77 | int ret, i; | ||
| 66 | 78 | ||
| 79 | ret = -ENOSPC; | ||
| 67 | if (parent_ns->level > 32) | 80 | if (parent_ns->level > 32) |
| 68 | return -EUSERS; | 81 | goto fail; |
| 82 | |||
| 83 | ucounts = inc_user_namespaces(parent_ns, owner); | ||
| 84 | if (!ucounts) | ||
| 85 | goto fail; | ||
| 69 | 86 | ||
| 70 | /* | 87 | /* |
| 71 | * Verify that we can not violate the policy of which files | 88 | * Verify that we can not violate the policy of which files |
| @@ -73,26 +90,27 @@ int create_user_ns(struct cred *new) | |||
| 73 | * by verifing that the root directory is at the root of the | 90 | * by verifing that the root directory is at the root of the |
| 74 | * mount namespace which allows all files to be accessed. | 91 | * mount namespace which allows all files to be accessed. |
| 75 | */ | 92 | */ |
| 93 | ret = -EPERM; | ||
| 76 | if (current_chrooted()) | 94 | if (current_chrooted()) |
| 77 | return -EPERM; | 95 | goto fail_dec; |
| 78 | 96 | ||
| 79 | /* The creator needs a mapping in the parent user namespace | 97 | /* The creator needs a mapping in the parent user namespace |
| 80 | * or else we won't be able to reasonably tell userspace who | 98 | * or else we won't be able to reasonably tell userspace who |
| 81 | * created a user_namespace. | 99 | * created a user_namespace. |
| 82 | */ | 100 | */ |
| 101 | ret = -EPERM; | ||
| 83 | if (!kuid_has_mapping(parent_ns, owner) || | 102 | if (!kuid_has_mapping(parent_ns, owner) || |
| 84 | !kgid_has_mapping(parent_ns, group)) | 103 | !kgid_has_mapping(parent_ns, group)) |
| 85 | return -EPERM; | 104 | goto fail_dec; |
| 86 | 105 | ||
| 106 | ret = -ENOMEM; | ||
| 87 | ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); | 107 | ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); |
| 88 | if (!ns) | 108 | if (!ns) |
| 89 | return -ENOMEM; | 109 | goto fail_dec; |
| 90 | 110 | ||
| 91 | ret = ns_alloc_inum(&ns->ns); | 111 | ret = ns_alloc_inum(&ns->ns); |
| 92 | if (ret) { | 112 | if (ret) |
| 93 | kmem_cache_free(user_ns_cachep, ns); | 113 | goto fail_free; |
| 94 | return ret; | ||
| 95 | } | ||
| 96 | ns->ns.ops = &userns_operations; | 114 | ns->ns.ops = &userns_operations; |
| 97 | 115 | ||
| 98 | atomic_set(&ns->count, 1); | 116 | atomic_set(&ns->count, 1); |
| @@ -101,18 +119,37 @@ int create_user_ns(struct cred *new) | |||
| 101 | ns->level = parent_ns->level + 1; | 119 | ns->level = parent_ns->level + 1; |
| 102 | ns->owner = owner; | 120 | ns->owner = owner; |
| 103 | ns->group = group; | 121 | ns->group = group; |
| 122 | INIT_WORK(&ns->work, free_user_ns); | ||
| 123 | for (i = 0; i < UCOUNT_COUNTS; i++) { | ||
| 124 | ns->ucount_max[i] = INT_MAX; | ||
| 125 | } | ||
| 126 | ns->ucounts = ucounts; | ||
| 104 | 127 | ||
| 105 | /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ | 128 | /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ |
| 106 | mutex_lock(&userns_state_mutex); | 129 | mutex_lock(&userns_state_mutex); |
| 107 | ns->flags = parent_ns->flags; | 130 | ns->flags = parent_ns->flags; |
| 108 | mutex_unlock(&userns_state_mutex); | 131 | mutex_unlock(&userns_state_mutex); |
| 109 | 132 | ||
| 110 | set_cred_user_ns(new, ns); | ||
| 111 | |||
| 112 | #ifdef CONFIG_PERSISTENT_KEYRINGS | 133 | #ifdef CONFIG_PERSISTENT_KEYRINGS |
| 113 | init_rwsem(&ns->persistent_keyring_register_sem); | 134 | init_rwsem(&ns->persistent_keyring_register_sem); |
| 114 | #endif | 135 | #endif |
| 136 | ret = -ENOMEM; | ||
| 137 | if (!setup_userns_sysctls(ns)) | ||
| 138 | goto fail_keyring; | ||
| 139 | |||
| 140 | set_cred_user_ns(new, ns); | ||
| 115 | return 0; | 141 | return 0; |
| 142 | fail_keyring: | ||
| 143 | #ifdef CONFIG_PERSISTENT_KEYRINGS | ||
| 144 | key_put(ns->persistent_keyring_register); | ||
| 145 | #endif | ||
| 146 | ns_free_inum(&ns->ns); | ||
| 147 | fail_free: | ||
| 148 | kmem_cache_free(user_ns_cachep, ns); | ||
| 149 | fail_dec: | ||
| 150 | dec_user_namespaces(ucounts); | ||
| 151 | fail: | ||
| 152 | return ret; | ||
| 116 | } | 153 | } |
| 117 | 154 | ||
| 118 | int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) | 155 | int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) |
| @@ -135,21 +172,30 @@ int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) | |||
| 135 | return err; | 172 | return err; |
| 136 | } | 173 | } |
| 137 | 174 | ||
| 138 | void free_user_ns(struct user_namespace *ns) | 175 | static void free_user_ns(struct work_struct *work) |
| 139 | { | 176 | { |
| 140 | struct user_namespace *parent; | 177 | struct user_namespace *parent, *ns = |
| 178 | container_of(work, struct user_namespace, work); | ||
| 141 | 179 | ||
| 142 | do { | 180 | do { |
| 181 | struct ucounts *ucounts = ns->ucounts; | ||
| 143 | parent = ns->parent; | 182 | parent = ns->parent; |
| 183 | retire_userns_sysctls(ns); | ||
| 144 | #ifdef CONFIG_PERSISTENT_KEYRINGS | 184 | #ifdef CONFIG_PERSISTENT_KEYRINGS |
| 145 | key_put(ns->persistent_keyring_register); | 185 | key_put(ns->persistent_keyring_register); |
| 146 | #endif | 186 | #endif |
| 147 | ns_free_inum(&ns->ns); | 187 | ns_free_inum(&ns->ns); |
| 148 | kmem_cache_free(user_ns_cachep, ns); | 188 | kmem_cache_free(user_ns_cachep, ns); |
| 189 | dec_user_namespaces(ucounts); | ||
| 149 | ns = parent; | 190 | ns = parent; |
| 150 | } while (atomic_dec_and_test(&parent->count)); | 191 | } while (atomic_dec_and_test(&parent->count)); |
| 151 | } | 192 | } |
| 152 | EXPORT_SYMBOL(free_user_ns); | 193 | |
| 194 | void __put_user_ns(struct user_namespace *ns) | ||
| 195 | { | ||
| 196 | schedule_work(&ns->work); | ||
| 197 | } | ||
| 198 | EXPORT_SYMBOL(__put_user_ns); | ||
| 153 | 199 | ||
| 154 | static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) | 200 | static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) |
| 155 | { | 201 | { |
| @@ -1004,12 +1050,37 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) | |||
| 1004 | return commit_creds(cred); | 1050 | return commit_creds(cred); |
| 1005 | } | 1051 | } |
| 1006 | 1052 | ||
| 1053 | struct ns_common *ns_get_owner(struct ns_common *ns) | ||
| 1054 | { | ||
| 1055 | struct user_namespace *my_user_ns = current_user_ns(); | ||
| 1056 | struct user_namespace *owner, *p; | ||
| 1057 | |||
| 1058 | /* See if the owner is in the current user namespace */ | ||
| 1059 | owner = p = ns->ops->owner(ns); | ||
| 1060 | for (;;) { | ||
| 1061 | if (!p) | ||
| 1062 | return ERR_PTR(-EPERM); | ||
| 1063 | if (p == my_user_ns) | ||
| 1064 | break; | ||
| 1065 | p = p->parent; | ||
| 1066 | } | ||
| 1067 | |||
| 1068 | return &get_user_ns(owner)->ns; | ||
| 1069 | } | ||
| 1070 | |||
| 1071 | static struct user_namespace *userns_owner(struct ns_common *ns) | ||
| 1072 | { | ||
| 1073 | return to_user_ns(ns)->parent; | ||
| 1074 | } | ||
| 1075 | |||
| 1007 | const struct proc_ns_operations userns_operations = { | 1076 | const struct proc_ns_operations userns_operations = { |
| 1008 | .name = "user", | 1077 | .name = "user", |
| 1009 | .type = CLONE_NEWUSER, | 1078 | .type = CLONE_NEWUSER, |
| 1010 | .get = userns_get, | 1079 | .get = userns_get, |
| 1011 | .put = userns_put, | 1080 | .put = userns_put, |
| 1012 | .install = userns_install, | 1081 | .install = userns_install, |
| 1082 | .owner = userns_owner, | ||
| 1083 | .get_parent = ns_get_owner, | ||
| 1013 | }; | 1084 | }; |
| 1014 | 1085 | ||
| 1015 | static __init int user_namespaces_init(void) | 1086 | static __init int user_namespaces_init(void) |
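
With the ucount hook in place, create_user_ns() now fails with ENOSPC once the caller's user namespace has exhausted /proc/sys/user/max_user_namespaces (the nesting-depth check was folded into the same error). A userspace sketch that observes the limit; lowering the sysctl needs privilege, so this is illustrative only:

    /* Illustrative: set max_user_namespaces to 0, then watch
     * unshare(CLONE_NEWUSER) fail with ENOSPC. */
    #define _GNU_SOURCE
    #include <errno.h>
    #include <fcntl.h>
    #include <sched.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/proc/sys/user/max_user_namespaces", O_WRONLY);

            if (fd >= 0) {
                    write(fd, "0", 1);      /* requires privilege */
                    close(fd);
            }
            if (unshare(CLONE_NEWUSER) < 0)
                    printf("unshare: %s\n", strerror(errno)); /* ENOSPC once limited */
            return 0;
    }
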
diff --git a/kernel/utsname.c b/kernel/utsname.c index 831ea7108232..6976cd47dcf6 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
| @@ -17,6 +17,16 @@ | |||
| 17 | #include <linux/user_namespace.h> | 17 | #include <linux/user_namespace.h> |
| 18 | #include <linux/proc_ns.h> | 18 | #include <linux/proc_ns.h> |
| 19 | 19 | ||
| 20 | static struct ucounts *inc_uts_namespaces(struct user_namespace *ns) | ||
| 21 | { | ||
| 22 | return inc_ucount(ns, current_euid(), UCOUNT_UTS_NAMESPACES); | ||
| 23 | } | ||
| 24 | |||
| 25 | static void dec_uts_namespaces(struct ucounts *ucounts) | ||
| 26 | { | ||
| 27 | dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES); | ||
| 28 | } | ||
| 29 | |||
| 20 | static struct uts_namespace *create_uts_ns(void) | 30 | static struct uts_namespace *create_uts_ns(void) |
| 21 | { | 31 | { |
| 22 | struct uts_namespace *uts_ns; | 32 | struct uts_namespace *uts_ns; |
| @@ -36,18 +46,24 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, | |||
| 36 | struct uts_namespace *old_ns) | 46 | struct uts_namespace *old_ns) |
| 37 | { | 47 | { |
| 38 | struct uts_namespace *ns; | 48 | struct uts_namespace *ns; |
| 49 | struct ucounts *ucounts; | ||
| 39 | int err; | 50 | int err; |
| 40 | 51 | ||
| 52 | err = -ENOSPC; | ||
| 53 | ucounts = inc_uts_namespaces(user_ns); | ||
| 54 | if (!ucounts) | ||
| 55 | goto fail; | ||
| 56 | |||
| 57 | err = -ENOMEM; | ||
| 41 | ns = create_uts_ns(); | 58 | ns = create_uts_ns(); |
| 42 | if (!ns) | 59 | if (!ns) |
| 43 | return ERR_PTR(-ENOMEM); | 60 | goto fail_dec; |
| 44 | 61 | ||
| 45 | err = ns_alloc_inum(&ns->ns); | 62 | err = ns_alloc_inum(&ns->ns); |
| 46 | if (err) { | 63 | if (err) |
| 47 | kfree(ns); | 64 | goto fail_free; |
| 48 | return ERR_PTR(err); | ||
| 49 | } | ||
| 50 | 65 | ||
| 66 | ns->ucounts = ucounts; | ||
| 51 | ns->ns.ops = &utsns_operations; | 67 | ns->ns.ops = &utsns_operations; |
| 52 | 68 | ||
| 53 | down_read(&uts_sem); | 69 | down_read(&uts_sem); |
| @@ -55,6 +71,13 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, | |||
| 55 | ns->user_ns = get_user_ns(user_ns); | 71 | ns->user_ns = get_user_ns(user_ns); |
| 56 | up_read(&uts_sem); | 72 | up_read(&uts_sem); |
| 57 | return ns; | 73 | return ns; |
| 74 | |||
| 75 | fail_free: | ||
| 76 | kfree(ns); | ||
| 77 | fail_dec: | ||
| 78 | dec_uts_namespaces(ucounts); | ||
| 79 | fail: | ||
| 80 | return ERR_PTR(err); | ||
| 58 | } | 81 | } |
| 59 | 82 | ||
| 60 | /* | 83 | /* |
| @@ -85,6 +108,7 @@ void free_uts_ns(struct kref *kref) | |||
| 85 | struct uts_namespace *ns; | 108 | struct uts_namespace *ns; |
| 86 | 109 | ||
| 87 | ns = container_of(kref, struct uts_namespace, kref); | 110 | ns = container_of(kref, struct uts_namespace, kref); |
| 111 | dec_uts_namespaces(ns->ucounts); | ||
| 88 | put_user_ns(ns->user_ns); | 112 | put_user_ns(ns->user_ns); |
| 89 | ns_free_inum(&ns->ns); | 113 | ns_free_inum(&ns->ns); |
| 90 | kfree(ns); | 114 | kfree(ns); |
| @@ -130,10 +154,16 @@ static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new) | |||
| 130 | return 0; | 154 | return 0; |
| 131 | } | 155 | } |
| 132 | 156 | ||
| 157 | static struct user_namespace *utsns_owner(struct ns_common *ns) | ||
| 158 | { | ||
| 159 | return to_uts_ns(ns)->user_ns; | ||
| 160 | } | ||
| 161 | |||
| 133 | const struct proc_ns_operations utsns_operations = { | 162 | const struct proc_ns_operations utsns_operations = { |
| 134 | .name = "uts", | 163 | .name = "uts", |
| 135 | .type = CLONE_NEWUTS, | 164 | .type = CLONE_NEWUTS, |
| 136 | .get = utsns_get, | 165 | .get = utsns_get, |
| 137 | .put = utsns_put, | 166 | .put = utsns_put, |
| 138 | .install = utsns_install, | 167 | .install = utsns_install, |
| 168 | .owner = utsns_owner, | ||
| 139 | }; | 169 | }; |
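
The new .owner callbacks (userns_owner and utsns_owner above) are consumed by the nsfs NS_GET_USERNS ioctl added elsewhere in this series, letting userspace open the user namespace that owns a given namespace file. A minimal sketch, assuming that ioctl is available on the running kernel:

    /* Minimal sketch: ask nsfs for the user namespace owning our UTS namespace.
     * NS_GET_USERNS comes from the nsfs side of this series (not shown here). */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/nsfs.h>

    int main(void)
    {
            int ns = open("/proc/self/ns/uts", O_RDONLY);
            int owner;

            if (ns < 0)
                    return 1;
            owner = ioctl(ns, NS_GET_USERNS);   /* fd of the owning user ns */
            if (owner < 0)
                    perror("NS_GET_USERNS");
            else
                    close(owner);
            close(ns);
            return 0;
    }
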
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ef071ca73fc3..479d840db286 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -2974,6 +2974,31 @@ bool flush_delayed_work(struct delayed_work *dwork) | |||
| 2974 | } | 2974 | } |
| 2975 | EXPORT_SYMBOL(flush_delayed_work); | 2975 | EXPORT_SYMBOL(flush_delayed_work); |
| 2976 | 2976 | ||
| 2977 | static bool __cancel_work(struct work_struct *work, bool is_dwork) | ||
| 2978 | { | ||
| 2979 | unsigned long flags; | ||
| 2980 | int ret; | ||
| 2981 | |||
| 2982 | do { | ||
| 2983 | ret = try_to_grab_pending(work, is_dwork, &flags); | ||
| 2984 | } while (unlikely(ret == -EAGAIN)); | ||
| 2985 | |||
| 2986 | if (unlikely(ret < 0)) | ||
| 2987 | return false; | ||
| 2988 | |||
| 2989 | set_work_pool_and_clear_pending(work, get_work_pool_id(work)); | ||
| 2990 | local_irq_restore(flags); | ||
| 2991 | return ret; | ||
| 2992 | } | ||
| 2993 | |||
| 2994 | /* | ||
| 2995 | * See cancel_delayed_work() | ||
| 2996 | */ | ||
| 2997 | bool cancel_work(struct work_struct *work) | ||
| 2998 | { | ||
| 2999 | return __cancel_work(work, false); | ||
| 3000 | } | ||
| 3001 | |||
| 2977 | /** | 3002 | /** |
| 2978 | * cancel_delayed_work - cancel a delayed work | 3003 | * cancel_delayed_work - cancel a delayed work |
| 2979 | * @dwork: delayed_work to cancel | 3004 | * @dwork: delayed_work to cancel |
| @@ -2992,20 +3017,7 @@ EXPORT_SYMBOL(flush_delayed_work); | |||
| 2992 | */ | 3017 | */ |
| 2993 | bool cancel_delayed_work(struct delayed_work *dwork) | 3018 | bool cancel_delayed_work(struct delayed_work *dwork) |
| 2994 | { | 3019 | { |
| 2995 | unsigned long flags; | 3020 | return __cancel_work(&dwork->work, true); |
| 2996 | int ret; | ||
| 2997 | |||
| 2998 | do { | ||
| 2999 | ret = try_to_grab_pending(&dwork->work, true, &flags); | ||
| 3000 | } while (unlikely(ret == -EAGAIN)); | ||
| 3001 | |||
| 3002 | if (unlikely(ret < 0)) | ||
| 3003 | return false; | ||
| 3004 | |||
| 3005 | set_work_pool_and_clear_pending(&dwork->work, | ||
| 3006 | get_work_pool_id(&dwork->work)); | ||
| 3007 | local_irq_restore(flags); | ||
| 3008 | return ret; | ||
| 3009 | } | 3021 | } |
| 3010 | EXPORT_SYMBOL(cancel_delayed_work); | 3022 | EXPORT_SYMBOL(cancel_delayed_work); |
| 3011 | 3023 | ||
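
Factoring the grab-and-clear sequence into __cancel_work() gives plain work items the same non-sleeping cancel that delayed work already had: cancel_work() returns true if it removed a pending item and false if the work was idle, and unlike cancel_work_sync() it may return while the handler is still executing. A hedged usage sketch; the work item and call sites are invented:

    /* Hypothetical user: drop a queued-but-not-yet-running work item from a
     * context that must not sleep; use cancel_work_sync() at teardown. */
    #include <linux/printk.h>
    #include <linux/workqueue.h>

    static void my_handler(struct work_struct *work)
    {
            /* ... */
    }
    static DECLARE_WORK(my_work, my_handler);

    static void my_fast_path(void)
    {
            if (!cancel_work(&my_work))     /* non-blocking; handler may still run */
                    pr_debug("my_work was idle or already running\n");
    }

    static void my_teardown(void)
    {
            cancel_work_sync(&my_work);     /* guarantees the handler has finished */
    }
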
| @@ -4249,7 +4261,7 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) | |||
| 4249 | * This function is called without any synchronization and @task | 4261 | * This function is called without any synchronization and @task |
| 4250 | * could be in any state. Be careful with dereferences. | 4262 | * could be in any state. Be careful with dereferences. |
| 4251 | */ | 4263 | */ |
| 4252 | worker = probe_kthread_data(task); | 4264 | worker = kthread_probe_data(task); |
| 4253 | 4265 | ||
| 4254 | /* | 4266 | /* |
| 4255 | * Carefully copy the associated workqueue's workfn and name. Keep | 4267 | * Carefully copy the associated workqueue's workfn and name. Keep |
