path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/.gitignore | 1
-rw-r--r--  kernel/audit.c | 48
-rw-r--r--  kernel/audit.h | 2
-rw-r--r--  kernel/audit_tree.c | 6
-rw-r--r--  kernel/auditfilter.c | 14
-rw-r--r--  kernel/bpf/Makefile | 4
-rw-r--r--  kernel/bpf/arraymap.c | 26
-rw-r--r--  kernel/bpf/core.c | 33
-rw-r--r--  kernel/bpf/hashtab.c | 18
-rw-r--r--  kernel/bpf/helpers.c | 7
-rw-r--r--  kernel/bpf/inode.c | 387
-rw-r--r--  kernel/bpf/syscall.c | 174
-rw-r--r--  kernel/bpf/verifier.c | 116
-rw-r--r--  kernel/cgroup.c | 1195
-rw-r--r--  kernel/cgroup_pids.c | 8
-rw-r--r--  kernel/context_tracking.c | 80
-rw-r--r--  kernel/cpu.c | 23
-rw-r--r--  kernel/cpuset.c | 86
-rw-r--r--  kernel/events/core.c | 361
-rw-r--r--  kernel/events/ring_buffer.c | 2
-rw-r--r--  kernel/exit.c | 6
-rw-r--r--  kernel/fork.c | 6
-rw-r--r--  kernel/futex.c | 17
-rw-r--r--  kernel/irq/Kconfig | 4
-rw-r--r--  kernel/irq/Makefile | 1
-rw-r--r--  kernel/irq/chip.c | 28
-rw-r--r--  kernel/irq/cpuhotplug.c | 82
-rw-r--r--  kernel/irq/handle.c | 9
-rw-r--r--  kernel/irq/internals.h | 4
-rw-r--r--  kernel/irq/irqdomain.c | 177
-rw-r--r--  kernel/irq/manage.c | 230
-rw-r--r--  kernel/irq/msi.c | 14
-rw-r--r--  kernel/irq/pm.c | 2
-rw-r--r--  kernel/irq/proc.c | 21
-rw-r--r--  kernel/irq/settings.h | 12
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/kexec_core.c | 10
-rw-r--r--  kernel/kexec_file.c | 2
-rw-r--r--  kernel/kmod.c | 8
-rw-r--r--  kernel/locking/lockdep.c | 12
-rw-r--r--  kernel/locking/locktorture.c | 164
-rw-r--r--  kernel/locking/mcs_spinlock.h | 4
-rw-r--r--  kernel/locking/mutex.c | 9
-rw-r--r--  kernel/locking/osq_lock.c | 11
-rw-r--r--  kernel/locking/percpu-rwsem.c | 90
-rw-r--r--  kernel/locking/qrwlock.c | 8
-rw-r--r--  kernel/locking/qspinlock_paravirt.h | 6
-rw-r--r--  kernel/locking/rtmutex.c | 33
-rw-r--r--  kernel/locking/rwsem-xadd.c | 5
-rw-r--r--  kernel/memremap.c | 30
-rw-r--r--  kernel/module.c | 8
-rw-r--r--  kernel/module_signing.c | 1
-rw-r--r--  kernel/panic.c | 10
-rw-r--r--  kernel/params.c | 20
-rw-r--r--  kernel/power/hibernate.c | 2
-rw-r--r--  kernel/power/main.c | 17
-rw-r--r--  kernel/power/snapshot.c | 2
-rw-r--r--  kernel/power/suspend.c | 4
-rw-r--r--  kernel/power/swap.c | 16
-rw-r--r--  kernel/printk/printk.c | 14
-rw-r--r--  kernel/ptrace.c | 5
-rw-r--r--  kernel/rcu/Makefile | 2
-rw-r--r--  kernel/rcu/rcutorture.c | 16
-rw-r--r--  kernel/rcu/srcu.c | 4
-rw-r--r--  kernel/rcu/sync.c | 223
-rw-r--r--  kernel/rcu/tiny.c | 8
-rw-r--r--  kernel/rcu/tree.c | 507
-rw-r--r--  kernel/rcu/tree.h | 69
-rw-r--r--  kernel/rcu/tree_plugin.h | 437
-rw-r--r--  kernel/rcu/tree_trace.c | 10
-rw-r--r--  kernel/rcu/update.c | 2
-rw-r--r--  kernel/sched/core.c | 248
-rw-r--r--  kernel/sched/cpudeadline.c | 5
-rw-r--r--  kernel/sched/cpudeadline.h | 1
-rw-r--r--  kernel/sched/cputime.c | 2
-rw-r--r--  kernel/sched/deadline.c | 17
-rw-r--r--  kernel/sched/fair.c | 428
-rw-r--r--  kernel/sched/features.h | 21
-rw-r--r--  kernel/sched/idle.c | 2
-rw-r--r--  kernel/sched/rt.c | 22
-rw-r--r--  kernel/sched/sched.h | 60
-rw-r--r--  kernel/sched/wait.c | 7
-rw-r--r--  kernel/seccomp.c | 78
-rw-r--r--  kernel/signal.c | 53
-rw-r--r--  kernel/smp.c | 2
-rw-r--r--  kernel/smpboot.c | 5
-rw-r--r--  kernel/stop_machine.c | 90
-rw-r--r--  kernel/sys.c | 4
-rw-r--r--  kernel/sys_ni.c | 1
-rw-r--r--  kernel/sysctl.c | 33
-rw-r--r--  kernel/time/clocksource.c | 9
-rw-r--r--  kernel/time/hrtimer.c | 2
-rw-r--r--  kernel/time/ntp.c | 16
-rw-r--r--  kernel/time/ntp_internal.h | 2
-rw-r--r--  kernel/time/posix-cpu-timers.c | 63
-rw-r--r--  kernel/time/timeconst.bc | 2
-rw-r--r--  kernel/time/timekeeping.c | 22
-rw-r--r--  kernel/time/timer.c | 13
-rw-r--r--  kernel/torture.c | 1
-rw-r--r--  kernel/trace/Kconfig | 2
-rw-r--r--  kernel/trace/blktrace.c | 16
-rw-r--r--  kernel/trace/bpf_trace.c | 55
-rw-r--r--  kernel/trace/ftrace.c | 2
-rw-r--r--  kernel/trace/trace_events.c | 4
-rw-r--r--  kernel/trace/trace_sched_switch.c | 3
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 2
-rw-r--r--  kernel/trace/trace_stack.c | 11
-rw-r--r--  kernel/watchdog.c | 121
-rw-r--r--  kernel/workqueue.c | 34
109 files changed, 4399 insertions, 2035 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index 790d83c7d160..b3097bde4e9c 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -5,4 +5,3 @@ config_data.h
5config_data.gz 5config_data.gz
6timeconst.h 6timeconst.h
7hz.bc 7hz.bc
8x509_certificate_list
diff --git a/kernel/audit.c b/kernel/audit.c
index 662c007635fb..5ffcbd354a52 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -407,16 +407,33 @@ static void audit_printk_skb(struct sk_buff *skb)
407static void kauditd_send_skb(struct sk_buff *skb) 407static void kauditd_send_skb(struct sk_buff *skb)
408{ 408{
409 int err; 409 int err;
410 int attempts = 0;
411#define AUDITD_RETRIES 5
412
413restart:
410 /* take a reference in case we can't send it and we want to hold it */ 414 /* take a reference in case we can't send it and we want to hold it */
411 skb_get(skb); 415 skb_get(skb);
412 err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); 416 err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
413 if (err < 0) { 417 if (err < 0) {
414 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ 418 pr_err("netlink_unicast sending to audit_pid=%d returned error: %d\n",
419 audit_pid, err);
415 if (audit_pid) { 420 if (audit_pid) {
416 pr_err("*NO* daemon at audit_pid=%d\n", audit_pid); 421 if (err == -ECONNREFUSED || err == -EPERM
417 audit_log_lost("auditd disappeared"); 422 || ++attempts >= AUDITD_RETRIES) {
418 audit_pid = 0; 423 char s[32];
419 audit_sock = NULL; 424
425 snprintf(s, sizeof(s), "audit_pid=%d reset", audit_pid);
426 audit_log_lost(s);
427 audit_pid = 0;
428 audit_sock = NULL;
429 } else {
430 pr_warn("re-scheduling(#%d) write to audit_pid=%d\n",
431 attempts, audit_pid);
432 set_current_state(TASK_INTERRUPTIBLE);
433 schedule();
434 __set_current_state(TASK_RUNNING);
435 goto restart;
436 }
420 } 437 }
421 /* we might get lucky and get this in the next auditd */ 438 /* we might get lucky and get this in the next auditd */
422 audit_hold_skb(skb); 439 audit_hold_skb(skb);
@@ -684,25 +701,22 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
684 return err; 701 return err;
685} 702}
686 703
687static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) 704static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
688{ 705{
689 int rc = 0;
690 uid_t uid = from_kuid(&init_user_ns, current_uid()); 706 uid_t uid = from_kuid(&init_user_ns, current_uid());
691 pid_t pid = task_tgid_nr(current); 707 pid_t pid = task_tgid_nr(current);
692 708
693 if (!audit_enabled && msg_type != AUDIT_USER_AVC) { 709 if (!audit_enabled && msg_type != AUDIT_USER_AVC) {
694 *ab = NULL; 710 *ab = NULL;
695 return rc; 711 return;
696 } 712 }
697 713
698 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); 714 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
699 if (unlikely(!*ab)) 715 if (unlikely(!*ab))
700 return rc; 716 return;
701 audit_log_format(*ab, "pid=%d uid=%u", pid, uid); 717 audit_log_format(*ab, "pid=%d uid=%u", pid, uid);
702 audit_log_session_info(*ab); 718 audit_log_session_info(*ab);
703 audit_log_task_context(*ab); 719 audit_log_task_context(*ab);
704
705 return rc;
706} 720}
707 721
708int is_audit_feature_set(int i) 722int is_audit_feature_set(int i)
@@ -1357,16 +1371,16 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
1357 if (unlikely(audit_filter_type(type))) 1371 if (unlikely(audit_filter_type(type)))
1358 return NULL; 1372 return NULL;
1359 1373
1360 if (gfp_mask & __GFP_WAIT) { 1374 if (gfp_mask & __GFP_DIRECT_RECLAIM) {
1361 if (audit_pid && audit_pid == current->pid) 1375 if (audit_pid && audit_pid == current->pid)
1362 gfp_mask &= ~__GFP_WAIT; 1376 gfp_mask &= ~__GFP_DIRECT_RECLAIM;
1363 else 1377 else
1364 reserve = 0; 1378 reserve = 0;
1365 } 1379 }
1366 1380
1367 while (audit_backlog_limit 1381 while (audit_backlog_limit
1368 && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { 1382 && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
1369 if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) { 1383 if (gfp_mask & __GFP_DIRECT_RECLAIM && audit_backlog_wait_time) {
1370 long sleep_time; 1384 long sleep_time;
1371 1385
1372 sleep_time = timeout_start + audit_backlog_wait_time - jiffies; 1386 sleep_time = timeout_start + audit_backlog_wait_time - jiffies;
@@ -1566,14 +1580,14 @@ void audit_log_n_string(struct audit_buffer *ab, const char *string,
1566 * @string: string to be checked 1580 * @string: string to be checked
1567 * @len: max length of the string to check 1581 * @len: max length of the string to check
1568 */ 1582 */
1569int audit_string_contains_control(const char *string, size_t len) 1583bool audit_string_contains_control(const char *string, size_t len)
1570{ 1584{
1571 const unsigned char *p; 1585 const unsigned char *p;
1572 for (p = string; p < (const unsigned char *)string + len; p++) { 1586 for (p = string; p < (const unsigned char *)string + len; p++) {
1573 if (*p == '"' || *p < 0x21 || *p > 0x7e) 1587 if (*p == '"' || *p < 0x21 || *p > 0x7e)
1574 return 1; 1588 return true;
1575 } 1589 }
1576 return 0; 1590 return false;
1577} 1591}
1578 1592
1579/** 1593/**
diff --git a/kernel/audit.h b/kernel/audit.h
index dadf86a0e59e..de6cbb7cf547 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -301,7 +301,7 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark
301#ifdef CONFIG_AUDIT_TREE 301#ifdef CONFIG_AUDIT_TREE
302extern struct audit_chunk *audit_tree_lookup(const struct inode *); 302extern struct audit_chunk *audit_tree_lookup(const struct inode *);
303extern void audit_put_chunk(struct audit_chunk *); 303extern void audit_put_chunk(struct audit_chunk *);
304extern int audit_tree_match(struct audit_chunk *, struct audit_tree *); 304extern bool audit_tree_match(struct audit_chunk *, struct audit_tree *);
305extern int audit_make_tree(struct audit_krule *, char *, u32); 305extern int audit_make_tree(struct audit_krule *, char *, u32);
306extern int audit_add_tree_rule(struct audit_krule *); 306extern int audit_add_tree_rule(struct audit_krule *);
307extern int audit_remove_tree_rule(struct audit_krule *); 307extern int audit_remove_tree_rule(struct audit_krule *);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 94ecdabda8e6..5efe9b299a12 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -197,13 +197,13 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
197 return NULL; 197 return NULL;
198} 198}
199 199
200int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree) 200bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)
201{ 201{
202 int n; 202 int n;
203 for (n = 0; n < chunk->count; n++) 203 for (n = 0; n < chunk->count; n++)
204 if (chunk->owners[n].owner == tree) 204 if (chunk->owners[n].owner == tree)
205 return 1; 205 return true;
206 return 0; 206 return false;
207} 207}
208 208
209/* tagging and untagging inodes with trees */ 209/* tagging and untagging inodes with trees */
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 7714d93edb85..b8ff9e193753 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -39,13 +39,13 @@
39 * Locking model: 39 * Locking model:
40 * 40 *
41 * audit_filter_mutex: 41 * audit_filter_mutex:
42 * Synchronizes writes and blocking reads of audit's filterlist 42 * Synchronizes writes and blocking reads of audit's filterlist
43 * data. Rcu is used to traverse the filterlist and access 43 * data. Rcu is used to traverse the filterlist and access
44 * contents of structs audit_entry, audit_watch and opaque 44 * contents of structs audit_entry, audit_watch and opaque
45 * LSM rules during filtering. If modified, these structures 45 * LSM rules during filtering. If modified, these structures
46 * must be copied and replace their counterparts in the filterlist. 46 * must be copied and replace their counterparts in the filterlist.
47 * An audit_parent struct is not accessed during filtering, so may 47 * An audit_parent struct is not accessed during filtering, so may
48 * be written directly provided audit_filter_mutex is held. 48 * be written directly provided audit_filter_mutex is held.
49 */ 49 */
50 50
51/* Audit filter lists, defined in <linux/audit.h> */ 51/* Audit filter lists, defined in <linux/audit.h> */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e6983be12bd3..13272582eee0 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,2 +1,4 @@
1obj-y := core.o 1obj-y := core.o
2obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o 2
3obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
4obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 29ace107f236..3f4c99e06c6b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -15,6 +15,7 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/filter.h> 17#include <linux/filter.h>
18#include <linux/perf_event.h>
18 19
19/* Called from syscall */ 20/* Called from syscall */
20static struct bpf_map *array_map_alloc(union bpf_attr *attr) 21static struct bpf_map *array_map_alloc(union bpf_attr *attr)
@@ -48,7 +49,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
48 array->map.key_size = attr->key_size; 49 array->map.key_size = attr->key_size;
49 array->map.value_size = attr->value_size; 50 array->map.value_size = attr->value_size;
50 array->map.max_entries = attr->max_entries; 51 array->map.max_entries = attr->max_entries;
51 52 array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;
52 array->elem_size = elem_size; 53 array->elem_size = elem_size;
53 54
54 return &array->map; 55 return &array->map;
@@ -291,14 +292,23 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
291 292
292 attr = perf_event_attrs(event); 293 attr = perf_event_attrs(event);
293 if (IS_ERR(attr)) 294 if (IS_ERR(attr))
294 return (void *)attr; 295 goto err;
295 296
296 if (attr->type != PERF_TYPE_RAW && 297 if (attr->inherit)
297 attr->type != PERF_TYPE_HARDWARE) { 298 goto err;
298 perf_event_release_kernel(event); 299
299 return ERR_PTR(-EINVAL); 300 if (attr->type == PERF_TYPE_RAW)
300 } 301 return event;
301 return event; 302
303 if (attr->type == PERF_TYPE_HARDWARE)
304 return event;
305
306 if (attr->type == PERF_TYPE_SOFTWARE &&
307 attr->config == PERF_COUNT_SW_BPF_OUTPUT)
308 return event;
309err:
310 perf_event_release_kernel(event);
311 return ERR_PTR(-EINVAL);
302} 312}
303 313
304static void perf_event_fd_array_put_ptr(void *ptr) 314static void perf_event_fd_array_put_ptr(void *ptr)
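
The updated perf_event_fd_array_get_ptr() above now accepts raw, hardware, and software BPF-output events, and rejects any event with attr->inherit set. A minimal user-space sketch of opening an event that passes this filter is shown below; it is illustrative only, assumes uapi headers that already define PERF_COUNT_SW_BPF_OUTPUT, and omits the map update that would actually store the returned fd.

/* Hedged sketch: open a software BPF-output perf event whose fd is
 * acceptable to perf_event_fd_array_get_ptr().  attr.inherit stays 0,
 * since inherited events are now rejected.
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

static int open_bpf_output_event(int cpu)
{
        struct perf_event_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_SOFTWARE;
        attr.config = PERF_COUNT_SW_BPF_OUTPUT;
        attr.sample_type = PERF_SAMPLE_RAW;

        /* pid = -1, group_fd = -1: a plain per-CPU event, no flags */
        return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
}

int main(void)
{
        int fd = open_bpf_output_event(0);

        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }
        printf("fd %d can be placed in a BPF_MAP_TYPE_PERF_EVENT_ARRAY slot\n", fd);
        return 0;
}
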
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 67c380cfa9ca..334b1bdd572c 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -82,6 +82,8 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
82 if (fp == NULL) 82 if (fp == NULL)
83 return NULL; 83 return NULL;
84 84
85 kmemcheck_annotate_bitfield(fp, meta);
86
85 aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); 87 aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
86 if (aux == NULL) { 88 if (aux == NULL) {
87 vfree(fp); 89 vfree(fp);
@@ -90,6 +92,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
90 92
91 fp->pages = size / PAGE_SIZE; 93 fp->pages = size / PAGE_SIZE;
92 fp->aux = aux; 94 fp->aux = aux;
95 fp->aux->prog = fp;
93 96
94 return fp; 97 return fp;
95} 98}
@@ -110,8 +113,11 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
110 113
111 fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); 114 fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
112 if (fp != NULL) { 115 if (fp != NULL) {
116 kmemcheck_annotate_bitfield(fp, meta);
117
113 memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); 118 memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
114 fp->pages = size / PAGE_SIZE; 119 fp->pages = size / PAGE_SIZE;
120 fp->aux->prog = fp;
115 121
116 /* We keep fp->aux from fp_old around in the new 122 /* We keep fp->aux from fp_old around in the new
117 * reallocated structure. 123 * reallocated structure.
@@ -722,11 +728,36 @@ void bpf_prog_free(struct bpf_prog *fp)
722 struct bpf_prog_aux *aux = fp->aux; 728 struct bpf_prog_aux *aux = fp->aux;
723 729
724 INIT_WORK(&aux->work, bpf_prog_free_deferred); 730 INIT_WORK(&aux->work, bpf_prog_free_deferred);
725 aux->prog = fp;
726 schedule_work(&aux->work); 731 schedule_work(&aux->work);
727} 732}
728EXPORT_SYMBOL_GPL(bpf_prog_free); 733EXPORT_SYMBOL_GPL(bpf_prog_free);
729 734
 735/* RNG for unprivileged user space with separated state from prandom_u32(). */
736static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state);
737
738void bpf_user_rnd_init_once(void)
739{
740 prandom_init_once(&bpf_user_rnd_state);
741}
742
743u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
744{
745 /* Should someone ever have the rather unwise idea to use some
746 * of the registers passed into this function, then note that
747 * this function is called from native eBPF and classic-to-eBPF
748 * transformations. Register assignments from both sides are
749 * different, f.e. classic always sets fn(ctx, A, X) here.
750 */
751 struct rnd_state *state;
752 u32 res;
753
754 state = &get_cpu_var(bpf_user_rnd_state);
755 res = prandom_u32_state(state);
756 put_cpu_var(state);
757
758 return res;
759}
760
730/* Weak definitions of helper functions in case we don't have bpf syscall. */ 761/* Weak definitions of helper functions in case we don't have bpf syscall. */
731const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; 762const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
732const struct bpf_func_proto bpf_map_update_elem_proto __weak; 763const struct bpf_func_proto bpf_map_update_elem_proto __weak;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 83c209d9b17a..19909b22b4f8 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -17,7 +17,7 @@
17struct bpf_htab { 17struct bpf_htab {
18 struct bpf_map map; 18 struct bpf_map map;
19 struct hlist_head *buckets; 19 struct hlist_head *buckets;
20 spinlock_t lock; 20 raw_spinlock_t lock;
21 u32 count; /* number of elements in this hashtable */ 21 u32 count; /* number of elements in this hashtable */
22 u32 n_buckets; /* number of hash buckets */ 22 u32 n_buckets; /* number of hash buckets */
23 u32 elem_size; /* size of each element in bytes */ 23 u32 elem_size; /* size of each element in bytes */
@@ -82,12 +82,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
82 for (i = 0; i < htab->n_buckets; i++) 82 for (i = 0; i < htab->n_buckets; i++)
83 INIT_HLIST_HEAD(&htab->buckets[i]); 83 INIT_HLIST_HEAD(&htab->buckets[i]);
84 84
85 spin_lock_init(&htab->lock); 85 raw_spin_lock_init(&htab->lock);
86 htab->count = 0; 86 htab->count = 0;
87 87
88 htab->elem_size = sizeof(struct htab_elem) + 88 htab->elem_size = sizeof(struct htab_elem) +
89 round_up(htab->map.key_size, 8) + 89 round_up(htab->map.key_size, 8) +
90 htab->map.value_size; 90 htab->map.value_size;
91
92 htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) +
93 htab->elem_size * htab->map.max_entries,
94 PAGE_SIZE) >> PAGE_SHIFT;
91 return &htab->map; 95 return &htab->map;
92 96
93free_htab: 97free_htab:
@@ -230,7 +234,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
230 l_new->hash = htab_map_hash(l_new->key, key_size); 234 l_new->hash = htab_map_hash(l_new->key, key_size);
231 235
232 /* bpf_map_update_elem() can be called in_irq() */ 236 /* bpf_map_update_elem() can be called in_irq() */
233 spin_lock_irqsave(&htab->lock, flags); 237 raw_spin_lock_irqsave(&htab->lock, flags);
234 238
235 head = select_bucket(htab, l_new->hash); 239 head = select_bucket(htab, l_new->hash);
236 240
@@ -266,11 +270,11 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
266 } else { 270 } else {
267 htab->count++; 271 htab->count++;
268 } 272 }
269 spin_unlock_irqrestore(&htab->lock, flags); 273 raw_spin_unlock_irqrestore(&htab->lock, flags);
270 274
271 return 0; 275 return 0;
272err: 276err:
273 spin_unlock_irqrestore(&htab->lock, flags); 277 raw_spin_unlock_irqrestore(&htab->lock, flags);
274 kfree(l_new); 278 kfree(l_new);
275 return ret; 279 return ret;
276} 280}
@@ -291,7 +295,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
291 295
292 hash = htab_map_hash(key, key_size); 296 hash = htab_map_hash(key, key_size);
293 297
294 spin_lock_irqsave(&htab->lock, flags); 298 raw_spin_lock_irqsave(&htab->lock, flags);
295 299
296 head = select_bucket(htab, hash); 300 head = select_bucket(htab, hash);
297 301
@@ -304,7 +308,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
304 ret = 0; 308 ret = 0;
305 } 309 }
306 310
307 spin_unlock_irqrestore(&htab->lock, flags); 311 raw_spin_unlock_irqrestore(&htab->lock, flags);
308 return ret; 312 return ret;
309} 313}
310 314
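
Besides switching to a raw spinlock, the hunks above start charging hash maps against RLIMIT_MEMLOCK by precomputing map.pages from the bucket array plus a worst-case elem_size * max_entries. A stand-alone back-of-the-envelope sketch of that arithmetic follows; the per-element header and bucket-head sizes are illustrative assumptions, not the kernel's exact structure sizes.

/* Hedged arithmetic sketch of the new map.pages accounting.  The overheads
 * below are assumptions for illustration only (struct htab_elem and
 * struct hlist_head sizes are not part of the uapi).
 */
#include <stdio.h>

#define PAGE_SZ 4096UL
#define ROUND_UP(x, a) ((((x) + (a) - 1) / (a)) * (a))

int main(void)
{
        unsigned long key_size = 8, value_size = 16, max_entries = 1024;
        unsigned long n_buckets = 1024;      /* assumed: max_entries rounded up to a power of two */
        unsigned long hlist_head_sz = 8;     /* assumed: one pointer on 64-bit */
        unsigned long elem_hdr = 24;         /* assumed struct htab_elem overhead */

        unsigned long elem_size = elem_hdr + ROUND_UP(key_size, 8) + value_size;
        unsigned long bytes = n_buckets * hlist_head_sz + elem_size * max_entries;

        /* 14 pages for this configuration under the assumptions above */
        printf("charged pages: %lu\n", ROUND_UP(bytes, PAGE_SZ) / PAGE_SZ);
        return 0;
}
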
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1447ec09421e..4504ca66118d 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -93,13 +93,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto = {
93 .arg2_type = ARG_PTR_TO_MAP_KEY, 93 .arg2_type = ARG_PTR_TO_MAP_KEY,
94}; 94};
95 95
96static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
97{
98 return prandom_u32();
99}
100
101const struct bpf_func_proto bpf_get_prandom_u32_proto = { 96const struct bpf_func_proto bpf_get_prandom_u32_proto = {
102 .func = bpf_get_prandom_u32, 97 .func = bpf_user_rnd_u32,
103 .gpl_only = false, 98 .gpl_only = false,
104 .ret_type = RET_INTEGER, 99 .ret_type = RET_INTEGER,
105}; 100};
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
new file mode 100644
index 000000000000..be6d726e31c9
--- /dev/null
+++ b/kernel/bpf/inode.c
@@ -0,0 +1,387 @@
+/*
+ * Minimal file system backend for holding eBPF maps and programs,
+ * used by bpf(2) object pinning.
+ *
+ * Authors:
+ *
+ * Daniel Borkmann <daniel@iogearbox.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/magic.h>
+#include <linux/major.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/fs.h>
+#include <linux/kdev_t.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+
+enum bpf_type {
+        BPF_TYPE_UNSPEC = 0,
+        BPF_TYPE_PROG,
+        BPF_TYPE_MAP,
+};
+
+static void *bpf_any_get(void *raw, enum bpf_type type)
+{
+        switch (type) {
+        case BPF_TYPE_PROG:
+                atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt);
+                break;
+        case BPF_TYPE_MAP:
+                atomic_inc(&((struct bpf_map *)raw)->refcnt);
+                break;
+        default:
+                WARN_ON_ONCE(1);
+                break;
+        }
+
+        return raw;
+}
+
+static void bpf_any_put(void *raw, enum bpf_type type)
+{
+        switch (type) {
+        case BPF_TYPE_PROG:
+                bpf_prog_put(raw);
+                break;
+        case BPF_TYPE_MAP:
+                bpf_map_put(raw);
+                break;
+        default:
+                WARN_ON_ONCE(1);
+                break;
+        }
+}
+
+static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type)
+{
+        void *raw;
+
+        *type = BPF_TYPE_MAP;
+        raw = bpf_map_get(ufd);
+        if (IS_ERR(raw)) {
+                *type = BPF_TYPE_PROG;
+                raw = bpf_prog_get(ufd);
+        }
+
+        return raw;
+}
+
+static const struct inode_operations bpf_dir_iops;
+
+static const struct inode_operations bpf_prog_iops = { };
+static const struct inode_operations bpf_map_iops = { };
+
+static struct inode *bpf_get_inode(struct super_block *sb,
+                                   const struct inode *dir,
+                                   umode_t mode)
+{
+        struct inode *inode;
+
+        switch (mode & S_IFMT) {
+        case S_IFDIR:
+        case S_IFREG:
+                break;
+        default:
+                return ERR_PTR(-EINVAL);
+        }
+
+        inode = new_inode(sb);
+        if (!inode)
+                return ERR_PTR(-ENOSPC);
+
+        inode->i_ino = get_next_ino();
+        inode->i_atime = CURRENT_TIME;
+        inode->i_mtime = inode->i_atime;
+        inode->i_ctime = inode->i_atime;
+
+        inode_init_owner(inode, dir, mode);
+
+        return inode;
+}
+
+static int bpf_inode_type(const struct inode *inode, enum bpf_type *type)
+{
+        *type = BPF_TYPE_UNSPEC;
+        if (inode->i_op == &bpf_prog_iops)
+                *type = BPF_TYPE_PROG;
+        else if (inode->i_op == &bpf_map_iops)
+                *type = BPF_TYPE_MAP;
+        else
+                return -EACCES;
+
+        return 0;
+}
+
+static bool bpf_dname_reserved(const struct dentry *dentry)
+{
+        return strchr(dentry->d_name.name, '.');
+}
+
+static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+        struct inode *inode;
+
+        if (bpf_dname_reserved(dentry))
+                return -EPERM;
+
+        inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR);
+        if (IS_ERR(inode))
+                return PTR_ERR(inode);
+
+        inode->i_op = &bpf_dir_iops;
+        inode->i_fop = &simple_dir_operations;
+
+        inc_nlink(inode);
+        inc_nlink(dir);
+
+        d_instantiate(dentry, inode);
+        dget(dentry);
+
+        return 0;
+}
+
+static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry,
+                         umode_t mode, const struct inode_operations *iops)
+{
+        struct inode *inode;
+
+        if (bpf_dname_reserved(dentry))
+                return -EPERM;
+
+        inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG);
+        if (IS_ERR(inode))
+                return PTR_ERR(inode);
+
+        inode->i_op = iops;
+        inode->i_private = dentry->d_fsdata;
+
+        d_instantiate(dentry, inode);
+        dget(dentry);
+
+        return 0;
+}
+
+static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode,
+                     dev_t devt)
+{
+        enum bpf_type type = MINOR(devt);
+
+        if (MAJOR(devt) != UNNAMED_MAJOR || !S_ISREG(mode) ||
+            dentry->d_fsdata == NULL)
+                return -EPERM;
+
+        switch (type) {
+        case BPF_TYPE_PROG:
+                return bpf_mkobj_ops(dir, dentry, mode, &bpf_prog_iops);
+        case BPF_TYPE_MAP:
+                return bpf_mkobj_ops(dir, dentry, mode, &bpf_map_iops);
+        default:
+                return -EPERM;
+        }
+}
+
+static const struct inode_operations bpf_dir_iops = {
+        .lookup = simple_lookup,
+        .mknod = bpf_mkobj,
+        .mkdir = bpf_mkdir,
+        .rmdir = simple_rmdir,
+        .unlink = simple_unlink,
+};
+
+static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
+                          enum bpf_type type)
+{
+        struct dentry *dentry;
+        struct inode *dir;
+        struct path path;
+        umode_t mode;
+        dev_t devt;
+        int ret;
+
+        dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0);
+        if (IS_ERR(dentry))
+                return PTR_ERR(dentry);
+
+        mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
+        devt = MKDEV(UNNAMED_MAJOR, type);
+
+        ret = security_path_mknod(&path, dentry, mode, devt);
+        if (ret)
+                goto out;
+
+        dir = d_inode(path.dentry);
+        if (dir->i_op != &bpf_dir_iops) {
+                ret = -EPERM;
+                goto out;
+        }
+
+        dentry->d_fsdata = raw;
+        ret = vfs_mknod(dir, dentry, mode, devt);
+        dentry->d_fsdata = NULL;
+out:
+        done_path_create(&path, dentry);
+        return ret;
+}
+
+int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
+{
+        struct filename *pname;
+        enum bpf_type type;
+        void *raw;
+        int ret;
+
+        pname = getname(pathname);
+        if (IS_ERR(pname))
+                return PTR_ERR(pname);
+
+        raw = bpf_fd_probe_obj(ufd, &type);
+        if (IS_ERR(raw)) {
+                ret = PTR_ERR(raw);
+                goto out;
+        }
+
+        ret = bpf_obj_do_pin(pname, raw, type);
+        if (ret != 0)
+                bpf_any_put(raw, type);
+out:
+        putname(pname);
+        return ret;
+}
+
+static void *bpf_obj_do_get(const struct filename *pathname,
+                            enum bpf_type *type)
+{
+        struct inode *inode;
+        struct path path;
+        void *raw;
+        int ret;
+
+        ret = kern_path(pathname->name, LOOKUP_FOLLOW, &path);
+        if (ret)
+                return ERR_PTR(ret);
+
+        inode = d_backing_inode(path.dentry);
+        ret = inode_permission(inode, MAY_WRITE);
+        if (ret)
+                goto out;
+
+        ret = bpf_inode_type(inode, type);
+        if (ret)
+                goto out;
+
+        raw = bpf_any_get(inode->i_private, *type);
+        touch_atime(&path);
+
+        path_put(&path);
+        return raw;
+out:
+        path_put(&path);
+        return ERR_PTR(ret);
+}
+
+int bpf_obj_get_user(const char __user *pathname)
+{
+        enum bpf_type type = BPF_TYPE_UNSPEC;
+        struct filename *pname;
+        int ret = -ENOENT;
+        void *raw;
+
+        pname = getname(pathname);
+        if (IS_ERR(pname))
+                return PTR_ERR(pname);
+
+        raw = bpf_obj_do_get(pname, &type);
+        if (IS_ERR(raw)) {
+                ret = PTR_ERR(raw);
+                goto out;
+        }
+
+        if (type == BPF_TYPE_PROG)
+                ret = bpf_prog_new_fd(raw);
+        else if (type == BPF_TYPE_MAP)
+                ret = bpf_map_new_fd(raw);
+        else
+                goto out;
+
+        if (ret < 0)
+                bpf_any_put(raw, type);
+out:
+        putname(pname);
+        return ret;
+}
+
+static void bpf_evict_inode(struct inode *inode)
+{
+        enum bpf_type type;
+
+        truncate_inode_pages_final(&inode->i_data);
+        clear_inode(inode);
+
+        if (!bpf_inode_type(inode, &type))
+                bpf_any_put(inode->i_private, type);
+}
+
+static const struct super_operations bpf_super_ops = {
+        .statfs = simple_statfs,
+        .drop_inode = generic_delete_inode,
+        .evict_inode = bpf_evict_inode,
+};
+
+static int bpf_fill_super(struct super_block *sb, void *data, int silent)
+{
+        static struct tree_descr bpf_rfiles[] = { { "" } };
+        struct inode *inode;
+        int ret;
+
+        ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
+        if (ret)
+                return ret;
+
+        sb->s_op = &bpf_super_ops;
+
+        inode = sb->s_root->d_inode;
+        inode->i_op = &bpf_dir_iops;
+        inode->i_mode &= ~S_IALLUGO;
+        inode->i_mode |= S_ISVTX | S_IRWXUGO;
+
+        return 0;
+}
+
+static struct dentry *bpf_mount(struct file_system_type *type, int flags,
+                                const char *dev_name, void *data)
+{
+        return mount_ns(type, flags, current->nsproxy->mnt_ns, bpf_fill_super);
+}
+
+static struct file_system_type bpf_fs_type = {
+        .owner = THIS_MODULE,
+        .name = "bpf",
+        .mount = bpf_mount,
+        .kill_sb = kill_litter_super,
+        .fs_flags = FS_USERNS_MOUNT,
+};
+
+MODULE_ALIAS_FS("bpf");
+
+static int __init bpf_init(void)
+{
+        int ret;
+
+        ret = sysfs_create_mount_point(fs_kobj, "bpf");
+        if (ret)
+                return ret;
+
+        ret = register_filesystem(&bpf_fs_type);
+        if (ret)
+                sysfs_remove_mount_point(fs_kobj, "bpf");
+
+        return ret;
+}
+fs_initcall(bpf_init);
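
The backend above only becomes reachable once the "bpf" filesystem is mounted. A hedged user-space sketch of mounting it on the /sys/fs/bpf mount point registered by bpf_init() follows; the target directory is just the conventional location, any empty directory would do, and error handling is minimal.

/* Illustrative sketch: mount the new "bpf" pseudo filesystem so pinned
 * programs and maps show up as files.  Requires a suitably privileged caller.
 */
#include <sys/mount.h>
#include <stdio.h>

int main(void)
{
        if (mount("bpf", "/sys/fs/bpf", "bpf", 0, NULL)) {
                perror("mount bpf");
                return 1;
        }
        printf("bpf filesystem mounted on /sys/fs/bpf\n");
        return 0;
}
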
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 35bac8e8b071..0d3313d02a7e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -18,6 +18,8 @@
18#include <linux/filter.h> 18#include <linux/filter.h>
19#include <linux/version.h> 19#include <linux/version.h>
20 20
21int sysctl_unprivileged_bpf_disabled __read_mostly;
22
21static LIST_HEAD(bpf_map_types); 23static LIST_HEAD(bpf_map_types);
22 24
23static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) 25static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
@@ -44,11 +46,38 @@ void bpf_register_map_type(struct bpf_map_type_list *tl)
44 list_add(&tl->list_node, &bpf_map_types); 46 list_add(&tl->list_node, &bpf_map_types);
45} 47}
46 48
49static int bpf_map_charge_memlock(struct bpf_map *map)
50{
51 struct user_struct *user = get_current_user();
52 unsigned long memlock_limit;
53
54 memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
55
56 atomic_long_add(map->pages, &user->locked_vm);
57
58 if (atomic_long_read(&user->locked_vm) > memlock_limit) {
59 atomic_long_sub(map->pages, &user->locked_vm);
60 free_uid(user);
61 return -EPERM;
62 }
63 map->user = user;
64 return 0;
65}
66
67static void bpf_map_uncharge_memlock(struct bpf_map *map)
68{
69 struct user_struct *user = map->user;
70
71 atomic_long_sub(map->pages, &user->locked_vm);
72 free_uid(user);
73}
74
47/* called from workqueue */ 75/* called from workqueue */
48static void bpf_map_free_deferred(struct work_struct *work) 76static void bpf_map_free_deferred(struct work_struct *work)
49{ 77{
50 struct bpf_map *map = container_of(work, struct bpf_map, work); 78 struct bpf_map *map = container_of(work, struct bpf_map, work);
51 79
80 bpf_map_uncharge_memlock(map);
52 /* implementation dependent freeing */ 81 /* implementation dependent freeing */
53 map->ops->map_free(map); 82 map->ops->map_free(map);
54} 83}
@@ -82,6 +111,12 @@ static const struct file_operations bpf_map_fops = {
82 .release = bpf_map_release, 111 .release = bpf_map_release,
83}; 112};
84 113
114int bpf_map_new_fd(struct bpf_map *map)
115{
116 return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
117 O_RDWR | O_CLOEXEC);
118}
119
85/* helper macro to check that unused fields 'union bpf_attr' are zero */ 120/* helper macro to check that unused fields 'union bpf_attr' are zero */
86#define CHECK_ATTR(CMD) \ 121#define CHECK_ATTR(CMD) \
87 memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ 122 memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
@@ -108,8 +143,11 @@ static int map_create(union bpf_attr *attr)
108 143
109 atomic_set(&map->refcnt, 1); 144 atomic_set(&map->refcnt, 1);
110 145
111 err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC); 146 err = bpf_map_charge_memlock(map);
147 if (err)
148 goto free_map;
112 149
150 err = bpf_map_new_fd(map);
113 if (err < 0) 151 if (err < 0)
114 /* failed to allocate fd */ 152 /* failed to allocate fd */
115 goto free_map; 153 goto free_map;
@@ -124,19 +162,29 @@ free_map:
124/* if error is returned, fd is released. 162/* if error is returned, fd is released.
125 * On success caller should complete fd access with matching fdput() 163 * On success caller should complete fd access with matching fdput()
126 */ 164 */
127struct bpf_map *bpf_map_get(struct fd f) 165struct bpf_map *__bpf_map_get(struct fd f)
128{ 166{
129 struct bpf_map *map;
130
131 if (!f.file) 167 if (!f.file)
132 return ERR_PTR(-EBADF); 168 return ERR_PTR(-EBADF);
133
134 if (f.file->f_op != &bpf_map_fops) { 169 if (f.file->f_op != &bpf_map_fops) {
135 fdput(f); 170 fdput(f);
136 return ERR_PTR(-EINVAL); 171 return ERR_PTR(-EINVAL);
137 } 172 }
138 173
139 map = f.file->private_data; 174 return f.file->private_data;
175}
176
177struct bpf_map *bpf_map_get(u32 ufd)
178{
179 struct fd f = fdget(ufd);
180 struct bpf_map *map;
181
182 map = __bpf_map_get(f);
183 if (IS_ERR(map))
184 return map;
185
186 atomic_inc(&map->refcnt);
187 fdput(f);
140 188
141 return map; 189 return map;
142} 190}
@@ -164,7 +212,7 @@ static int map_lookup_elem(union bpf_attr *attr)
164 return -EINVAL; 212 return -EINVAL;
165 213
166 f = fdget(ufd); 214 f = fdget(ufd);
167 map = bpf_map_get(f); 215 map = __bpf_map_get(f);
168 if (IS_ERR(map)) 216 if (IS_ERR(map))
169 return PTR_ERR(map); 217 return PTR_ERR(map);
170 218
@@ -223,7 +271,7 @@ static int map_update_elem(union bpf_attr *attr)
223 return -EINVAL; 271 return -EINVAL;
224 272
225 f = fdget(ufd); 273 f = fdget(ufd);
226 map = bpf_map_get(f); 274 map = __bpf_map_get(f);
227 if (IS_ERR(map)) 275 if (IS_ERR(map))
228 return PTR_ERR(map); 276 return PTR_ERR(map);
229 277
@@ -276,7 +324,7 @@ static int map_delete_elem(union bpf_attr *attr)
276 return -EINVAL; 324 return -EINVAL;
277 325
278 f = fdget(ufd); 326 f = fdget(ufd);
279 map = bpf_map_get(f); 327 map = __bpf_map_get(f);
280 if (IS_ERR(map)) 328 if (IS_ERR(map))
281 return PTR_ERR(map); 329 return PTR_ERR(map);
282 330
@@ -317,7 +365,7 @@ static int map_get_next_key(union bpf_attr *attr)
317 return -EINVAL; 365 return -EINVAL;
318 366
319 f = fdget(ufd); 367 f = fdget(ufd);
320 map = bpf_map_get(f); 368 map = __bpf_map_get(f);
321 if (IS_ERR(map)) 369 if (IS_ERR(map))
322 return PTR_ERR(map); 370 return PTR_ERR(map);
323 371
@@ -402,6 +450,10 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
402 */ 450 */
403 BUG_ON(!prog->aux->ops->get_func_proto); 451 BUG_ON(!prog->aux->ops->get_func_proto);
404 452
453 if (insn->imm == BPF_FUNC_get_route_realm)
454 prog->dst_needed = 1;
455 if (insn->imm == BPF_FUNC_get_prandom_u32)
456 bpf_user_rnd_init_once();
405 if (insn->imm == BPF_FUNC_tail_call) { 457 if (insn->imm == BPF_FUNC_tail_call) {
406 /* mark bpf_tail_call as different opcode 458 /* mark bpf_tail_call as different opcode
407 * to avoid conditional branch in 459 * to avoid conditional branch in
@@ -436,29 +488,51 @@ static void free_used_maps(struct bpf_prog_aux *aux)
436 kfree(aux->used_maps); 488 kfree(aux->used_maps);
437} 489}
438 490
439static void __prog_put_rcu(struct rcu_head *rcu) 491static int bpf_prog_charge_memlock(struct bpf_prog *prog)
492{
493 struct user_struct *user = get_current_user();
494 unsigned long memlock_limit;
495
496 memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
497
498 atomic_long_add(prog->pages, &user->locked_vm);
499 if (atomic_long_read(&user->locked_vm) > memlock_limit) {
500 atomic_long_sub(prog->pages, &user->locked_vm);
501 free_uid(user);
502 return -EPERM;
503 }
504 prog->aux->user = user;
505 return 0;
506}
507
508static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
509{
510 struct user_struct *user = prog->aux->user;
511
512 atomic_long_sub(prog->pages, &user->locked_vm);
513 free_uid(user);
514}
515
516static void __prog_put_common(struct rcu_head *rcu)
440{ 517{
441 struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); 518 struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
442 519
443 free_used_maps(aux); 520 free_used_maps(aux);
521 bpf_prog_uncharge_memlock(aux->prog);
444 bpf_prog_free(aux->prog); 522 bpf_prog_free(aux->prog);
445} 523}
446 524
447/* version of bpf_prog_put() that is called after a grace period */ 525/* version of bpf_prog_put() that is called after a grace period */
448void bpf_prog_put_rcu(struct bpf_prog *prog) 526void bpf_prog_put_rcu(struct bpf_prog *prog)
449{ 527{
450 if (atomic_dec_and_test(&prog->aux->refcnt)) { 528 if (atomic_dec_and_test(&prog->aux->refcnt))
451 prog->aux->prog = prog; 529 call_rcu(&prog->aux->rcu, __prog_put_common);
452 call_rcu(&prog->aux->rcu, __prog_put_rcu);
453 }
454} 530}
455 531
456void bpf_prog_put(struct bpf_prog *prog) 532void bpf_prog_put(struct bpf_prog *prog)
457{ 533{
458 if (atomic_dec_and_test(&prog->aux->refcnt)) { 534 if (atomic_dec_and_test(&prog->aux->refcnt))
459 free_used_maps(prog->aux); 535 __prog_put_common(&prog->aux->rcu);
460 bpf_prog_free(prog);
461 }
462} 536}
463EXPORT_SYMBOL_GPL(bpf_prog_put); 537EXPORT_SYMBOL_GPL(bpf_prog_put);
464 538
@@ -474,21 +548,22 @@ static const struct file_operations bpf_prog_fops = {
474 .release = bpf_prog_release, 548 .release = bpf_prog_release,
475}; 549};
476 550
477static struct bpf_prog *get_prog(struct fd f) 551int bpf_prog_new_fd(struct bpf_prog *prog)
478{ 552{
479 struct bpf_prog *prog; 553 return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
554 O_RDWR | O_CLOEXEC);
555}
480 556
557static struct bpf_prog *__bpf_prog_get(struct fd f)
558{
481 if (!f.file) 559 if (!f.file)
482 return ERR_PTR(-EBADF); 560 return ERR_PTR(-EBADF);
483
484 if (f.file->f_op != &bpf_prog_fops) { 561 if (f.file->f_op != &bpf_prog_fops) {
485 fdput(f); 562 fdput(f);
486 return ERR_PTR(-EINVAL); 563 return ERR_PTR(-EINVAL);
487 } 564 }
488 565
489 prog = f.file->private_data; 566 return f.file->private_data;
490
491 return prog;
492} 567}
493 568
494/* called by sockets/tracing/seccomp before attaching program to an event 569/* called by sockets/tracing/seccomp before attaching program to an event
@@ -499,13 +574,13 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
499 struct fd f = fdget(ufd); 574 struct fd f = fdget(ufd);
500 struct bpf_prog *prog; 575 struct bpf_prog *prog;
501 576
502 prog = get_prog(f); 577 prog = __bpf_prog_get(f);
503
504 if (IS_ERR(prog)) 578 if (IS_ERR(prog))
505 return prog; 579 return prog;
506 580
507 atomic_inc(&prog->aux->refcnt); 581 atomic_inc(&prog->aux->refcnt);
508 fdput(f); 582 fdput(f);
583
509 return prog; 584 return prog;
510} 585}
511EXPORT_SYMBOL_GPL(bpf_prog_get); 586EXPORT_SYMBOL_GPL(bpf_prog_get);
@@ -540,11 +615,18 @@ static int bpf_prog_load(union bpf_attr *attr)
540 attr->kern_version != LINUX_VERSION_CODE) 615 attr->kern_version != LINUX_VERSION_CODE)
541 return -EINVAL; 616 return -EINVAL;
542 617
618 if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN))
619 return -EPERM;
620
543 /* plain bpf_prog allocation */ 621 /* plain bpf_prog allocation */
544 prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); 622 prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
545 if (!prog) 623 if (!prog)
546 return -ENOMEM; 624 return -ENOMEM;
547 625
626 err = bpf_prog_charge_memlock(prog);
627 if (err)
628 goto free_prog_nouncharge;
629
548 prog->len = attr->insn_cnt; 630 prog->len = attr->insn_cnt;
549 631
550 err = -EFAULT; 632 err = -EFAULT;
@@ -553,10 +635,10 @@ static int bpf_prog_load(union bpf_attr *attr)
553 goto free_prog; 635 goto free_prog;
554 636
555 prog->orig_prog = NULL; 637 prog->orig_prog = NULL;
556 prog->jited = false; 638 prog->jited = 0;
557 639
558 atomic_set(&prog->aux->refcnt, 1); 640 atomic_set(&prog->aux->refcnt, 1);
559 prog->gpl_compatible = is_gpl; 641 prog->gpl_compatible = is_gpl ? 1 : 0;
560 642
561 /* find program type: socket_filter vs tracing_filter */ 643 /* find program type: socket_filter vs tracing_filter */
562 err = find_prog_type(type, prog); 644 err = find_prog_type(type, prog);
@@ -576,7 +658,7 @@ static int bpf_prog_load(union bpf_attr *attr)
576 if (err < 0) 658 if (err < 0)
577 goto free_used_maps; 659 goto free_used_maps;
578 660
579 err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); 661 err = bpf_prog_new_fd(prog);
580 if (err < 0) 662 if (err < 0)
581 /* failed to allocate fd */ 663 /* failed to allocate fd */
582 goto free_used_maps; 664 goto free_used_maps;
@@ -586,20 +668,36 @@ static int bpf_prog_load(union bpf_attr *attr)
586free_used_maps: 668free_used_maps:
587 free_used_maps(prog->aux); 669 free_used_maps(prog->aux);
588free_prog: 670free_prog:
671 bpf_prog_uncharge_memlock(prog);
672free_prog_nouncharge:
589 bpf_prog_free(prog); 673 bpf_prog_free(prog);
590 return err; 674 return err;
591} 675}
592 676
677#define BPF_OBJ_LAST_FIELD bpf_fd
678
679static int bpf_obj_pin(const union bpf_attr *attr)
680{
681 if (CHECK_ATTR(BPF_OBJ))
682 return -EINVAL;
683
684 return bpf_obj_pin_user(attr->bpf_fd, u64_to_ptr(attr->pathname));
685}
686
687static int bpf_obj_get(const union bpf_attr *attr)
688{
689 if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0)
690 return -EINVAL;
691
692 return bpf_obj_get_user(u64_to_ptr(attr->pathname));
693}
694
593SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) 695SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
594{ 696{
595 union bpf_attr attr = {}; 697 union bpf_attr attr = {};
596 int err; 698 int err;
597 699
598 /* the syscall is limited to root temporarily. This restriction will be 700 if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled)
599 * lifted when security audit is clean. Note that eBPF+tracing must have
600 * this restriction, since it may pass kernel data to user space
601 */
602 if (!capable(CAP_SYS_ADMIN))
603 return -EPERM; 701 return -EPERM;
604 702
605 if (!access_ok(VERIFY_READ, uattr, 1)) 703 if (!access_ok(VERIFY_READ, uattr, 1))
@@ -654,6 +752,12 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
654 case BPF_PROG_LOAD: 752 case BPF_PROG_LOAD:
655 err = bpf_prog_load(&attr); 753 err = bpf_prog_load(&attr);
656 break; 754 break;
755 case BPF_OBJ_PIN:
756 err = bpf_obj_pin(&attr);
757 break;
758 case BPF_OBJ_GET:
759 err = bpf_obj_get(&attr);
760 break;
657 default: 761 default:
658 err = -EINVAL; 762 err = -EINVAL;
659 break; 763 break;
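
BPF_OBJ_PIN and BPF_OBJ_GET are driven through the existing bpf(2) multiplexer using only the new pathname and bpf_fd attributes. The sketch below is illustrative: it assumes uapi headers that already carry these commands, issues raw syscalls since no wrapper library is assumed, and the path under /sys/fs/bpf is only an example location.

/* Hedged sketch: create an array map, pin it, then re-open it by path. */
#include <linux/bpf.h>
#include <sys/syscall.h>
#include <string.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static uint64_t ptr_to_u64(const void *ptr)
{
        return (uint64_t)(unsigned long)ptr;
}

static int sys_bpf(int cmd, union bpf_attr *attr)
{
        return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

int main(void)
{
        union bpf_attr attr;
        int map_fd, pinned_fd;

        memset(&attr, 0, sizeof(attr));
        attr.map_type = BPF_MAP_TYPE_ARRAY;
        attr.key_size = 4;
        attr.value_size = 8;
        attr.max_entries = 16;
        map_fd = sys_bpf(BPF_MAP_CREATE, &attr);
        if (map_fd < 0) {
                perror("BPF_MAP_CREATE");
                return 1;
        }

        memset(&attr, 0, sizeof(attr));
        attr.pathname = ptr_to_u64("/sys/fs/bpf/example_map");  /* example path */
        attr.bpf_fd = map_fd;
        if (sys_bpf(BPF_OBJ_PIN, &attr)) {
                perror("BPF_OBJ_PIN");
                return 1;
        }

        memset(&attr, 0, sizeof(attr));
        attr.pathname = ptr_to_u64("/sys/fs/bpf/example_map");
        pinned_fd = sys_bpf(BPF_OBJ_GET, &attr);
        if (pinned_fd < 0) {
                perror("BPF_OBJ_GET");
                return 1;
        }

        printf("map pinned and re-opened as fd %d\n", pinned_fd);
        return 0;
}

In practice a second process would perform only the BPF_OBJ_GET step to obtain a fresh fd for the already-pinned object, which is the point of the new commands.
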
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b074b23000d6..c6073056badf 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -199,6 +199,7 @@ struct verifier_env {
199 struct verifier_state_list **explored_states; /* search pruning optimization */ 199 struct verifier_state_list **explored_states; /* search pruning optimization */
200 struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ 200 struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
201 u32 used_map_cnt; /* number of used maps */ 201 u32 used_map_cnt; /* number of used maps */
202 bool allow_ptr_leaks;
202}; 203};
203 204
204/* verbose verifier prints what it's seeing 205/* verbose verifier prints what it's seeing
@@ -213,7 +214,7 @@ static DEFINE_MUTEX(bpf_verifier_lock);
213 * verbose() is used to dump the verification trace to the log, so the user 214 * verbose() is used to dump the verification trace to the log, so the user
214 * can figure out what's wrong with the program 215 * can figure out what's wrong with the program
215 */ 216 */
216static void verbose(const char *fmt, ...) 217static __printf(1, 2) void verbose(const char *fmt, ...)
217{ 218{
218 va_list args; 219 va_list args;
219 220
@@ -244,6 +245,7 @@ static const struct {
244} func_limit[] = { 245} func_limit[] = {
245 {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, 246 {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
246 {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, 247 {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
248 {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output},
247}; 249};
248 250
249static void print_verifier_state(struct verifier_env *env) 251static void print_verifier_state(struct verifier_env *env)
@@ -538,6 +540,21 @@ static int bpf_size_to_bytes(int bpf_size)
538 return -EINVAL; 540 return -EINVAL;
539} 541}
540 542
543static bool is_spillable_regtype(enum bpf_reg_type type)
544{
545 switch (type) {
546 case PTR_TO_MAP_VALUE:
547 case PTR_TO_MAP_VALUE_OR_NULL:
548 case PTR_TO_STACK:
549 case PTR_TO_CTX:
550 case FRAME_PTR:
551 case CONST_PTR_TO_MAP:
552 return true;
553 default:
554 return false;
555 }
556}
557
541/* check_stack_read/write functions track spill/fill of registers, 558/* check_stack_read/write functions track spill/fill of registers,
542 * stack boundary and alignment are checked in check_mem_access() 559 * stack boundary and alignment are checked in check_mem_access()
543 */ 560 */
@@ -550,9 +567,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
550 */ 567 */
551 568
552 if (value_regno >= 0 && 569 if (value_regno >= 0 &&
553 (state->regs[value_regno].type == PTR_TO_MAP_VALUE || 570 is_spillable_regtype(state->regs[value_regno].type)) {
554 state->regs[value_regno].type == PTR_TO_STACK ||
555 state->regs[value_regno].type == PTR_TO_CTX)) {
556 571
557 /* register containing pointer is being spilled into stack */ 572 /* register containing pointer is being spilled into stack */
558 if (size != BPF_REG_SIZE) { 573 if (size != BPF_REG_SIZE) {
@@ -643,6 +658,20 @@ static int check_ctx_access(struct verifier_env *env, int off, int size,
643 return -EACCES; 658 return -EACCES;
644} 659}
645 660
661static bool is_pointer_value(struct verifier_env *env, int regno)
662{
663 if (env->allow_ptr_leaks)
664 return false;
665
666 switch (env->cur_state.regs[regno].type) {
667 case UNKNOWN_VALUE:
668 case CONST_IMM:
669 return false;
670 default:
671 return true;
672 }
673}
674
646/* check whether memory at (regno + off) is accessible for t = (read | write) 675/* check whether memory at (regno + off) is accessible for t = (read | write)
647 * if t==write, value_regno is a register which value is stored into memory 676 * if t==write, value_regno is a register which value is stored into memory
648 * if t==read, value_regno is a register which will receive the value from memory 677 * if t==read, value_regno is a register which will receive the value from memory
@@ -669,11 +698,21 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
669 } 698 }
670 699
671 if (state->regs[regno].type == PTR_TO_MAP_VALUE) { 700 if (state->regs[regno].type == PTR_TO_MAP_VALUE) {
701 if (t == BPF_WRITE && value_regno >= 0 &&
702 is_pointer_value(env, value_regno)) {
703 verbose("R%d leaks addr into map\n", value_regno);
704 return -EACCES;
705 }
672 err = check_map_access(env, regno, off, size); 706 err = check_map_access(env, regno, off, size);
673 if (!err && t == BPF_READ && value_regno >= 0) 707 if (!err && t == BPF_READ && value_regno >= 0)
674 mark_reg_unknown_value(state->regs, value_regno); 708 mark_reg_unknown_value(state->regs, value_regno);
675 709
676 } else if (state->regs[regno].type == PTR_TO_CTX) { 710 } else if (state->regs[regno].type == PTR_TO_CTX) {
711 if (t == BPF_WRITE && value_regno >= 0 &&
712 is_pointer_value(env, value_regno)) {
713 verbose("R%d leaks addr into ctx\n", value_regno);
714 return -EACCES;
715 }
677 err = check_ctx_access(env, off, size, t); 716 err = check_ctx_access(env, off, size, t);
678 if (!err && t == BPF_READ && value_regno >= 0) 717 if (!err && t == BPF_READ && value_regno >= 0)
679 mark_reg_unknown_value(state->regs, value_regno); 718 mark_reg_unknown_value(state->regs, value_regno);
@@ -684,10 +723,17 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
684 verbose("invalid stack off=%d size=%d\n", off, size); 723 verbose("invalid stack off=%d size=%d\n", off, size);
685 return -EACCES; 724 return -EACCES;
686 } 725 }
687 if (t == BPF_WRITE) 726 if (t == BPF_WRITE) {
727 if (!env->allow_ptr_leaks &&
728 state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
729 size != BPF_REG_SIZE) {
730 verbose("attempt to corrupt spilled pointer on stack\n");
731 return -EACCES;
732 }
688 err = check_stack_write(state, off, size, value_regno); 733 err = check_stack_write(state, off, size, value_regno);
689 else 734 } else {
690 err = check_stack_read(state, off, size, value_regno); 735 err = check_stack_read(state, off, size, value_regno);
736 }
691 } else { 737 } else {
692 verbose("R%d invalid mem access '%s'\n", 738 verbose("R%d invalid mem access '%s'\n",
693 regno, reg_type_str[state->regs[regno].type]); 739 regno, reg_type_str[state->regs[regno].type]);
@@ -775,8 +821,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
775 return -EACCES; 821 return -EACCES;
776 } 822 }
777 823
778 if (arg_type == ARG_ANYTHING) 824 if (arg_type == ARG_ANYTHING) {
825 if (is_pointer_value(env, regno)) {
826 verbose("R%d leaks addr into helper function\n", regno);
827 return -EACCES;
828 }
779 return 0; 829 return 0;
830 }
780 831
781 if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || 832 if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
782 arg_type == ARG_PTR_TO_MAP_VALUE) { 833 arg_type == ARG_PTR_TO_MAP_VALUE) {
@@ -860,7 +911,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
860 * don't allow any other map type to be passed into 911 * don't allow any other map type to be passed into
861 * the special func; 912 * the special func;
862 */ 913 */
863 if (bool_map != bool_func) 914 if (bool_func && bool_map != bool_func)
864 return -EINVAL; 915 return -EINVAL;
865 } 916 }
866 917
@@ -950,8 +1001,9 @@ static int check_call(struct verifier_env *env, int func_id)
950} 1001}
951 1002
952/* check validity of 32-bit and 64-bit arithmetic operations */ 1003/* check validity of 32-bit and 64-bit arithmetic operations */
953static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) 1004static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
954{ 1005{
1006 struct reg_state *regs = env->cur_state.regs;
955 u8 opcode = BPF_OP(insn->code); 1007 u8 opcode = BPF_OP(insn->code);
956 int err; 1008 int err;
957 1009
@@ -976,6 +1028,12 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
976 if (err) 1028 if (err)
977 return err; 1029 return err;
978 1030
1031 if (is_pointer_value(env, insn->dst_reg)) {
1032 verbose("R%d pointer arithmetic prohibited\n",
1033 insn->dst_reg);
1034 return -EACCES;
1035 }
1036
979 /* check dest operand */ 1037 /* check dest operand */
980 err = check_reg_arg(regs, insn->dst_reg, DST_OP); 1038 err = check_reg_arg(regs, insn->dst_reg, DST_OP);
981 if (err) 1039 if (err)
@@ -1012,6 +1070,11 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
1012 */ 1070 */
1013 regs[insn->dst_reg] = regs[insn->src_reg]; 1071 regs[insn->dst_reg] = regs[insn->src_reg];
1014 } else { 1072 } else {
1073 if (is_pointer_value(env, insn->src_reg)) {
1074 verbose("R%d partial copy of pointer\n",
1075 insn->src_reg);
1076 return -EACCES;
1077 }
1015 regs[insn->dst_reg].type = UNKNOWN_VALUE; 1078 regs[insn->dst_reg].type = UNKNOWN_VALUE;
1016 regs[insn->dst_reg].map_ptr = NULL; 1079 regs[insn->dst_reg].map_ptr = NULL;
1017 } 1080 }
@@ -1061,8 +1124,18 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
1061 /* pattern match 'bpf_add Rx, imm' instruction */ 1124 /* pattern match 'bpf_add Rx, imm' instruction */
1062 if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && 1125 if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
1063 regs[insn->dst_reg].type == FRAME_PTR && 1126 regs[insn->dst_reg].type == FRAME_PTR &&
1064 BPF_SRC(insn->code) == BPF_K) 1127 BPF_SRC(insn->code) == BPF_K) {
1065 stack_relative = true; 1128 stack_relative = true;
1129 } else if (is_pointer_value(env, insn->dst_reg)) {
1130 verbose("R%d pointer arithmetic prohibited\n",
1131 insn->dst_reg);
1132 return -EACCES;
1133 } else if (BPF_SRC(insn->code) == BPF_X &&
1134 is_pointer_value(env, insn->src_reg)) {
1135 verbose("R%d pointer arithmetic prohibited\n",
1136 insn->src_reg);
1137 return -EACCES;
1138 }
1066 1139
1067 /* check dest operand */ 1140 /* check dest operand */
1068 err = check_reg_arg(regs, insn->dst_reg, DST_OP); 1141 err = check_reg_arg(regs, insn->dst_reg, DST_OP);
@@ -1101,6 +1174,12 @@ static int check_cond_jmp_op(struct verifier_env *env,
1101 err = check_reg_arg(regs, insn->src_reg, SRC_OP); 1174 err = check_reg_arg(regs, insn->src_reg, SRC_OP);
1102 if (err) 1175 if (err)
1103 return err; 1176 return err;
1177
1178 if (is_pointer_value(env, insn->src_reg)) {
1179 verbose("R%d pointer comparison prohibited\n",
1180 insn->src_reg);
1181 return -EACCES;
1182 }
1104 } else { 1183 } else {
1105 if (insn->src_reg != BPF_REG_0) { 1184 if (insn->src_reg != BPF_REG_0) {
1106 verbose("BPF_JMP uses reserved fields\n"); 1185 verbose("BPF_JMP uses reserved fields\n");
@@ -1155,6 +1234,9 @@ static int check_cond_jmp_op(struct verifier_env *env,
1155 regs[insn->dst_reg].type = CONST_IMM; 1234 regs[insn->dst_reg].type = CONST_IMM;
1156 regs[insn->dst_reg].imm = 0; 1235 regs[insn->dst_reg].imm = 0;
1157 } 1236 }
1237 } else if (is_pointer_value(env, insn->dst_reg)) {
1238 verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
1239 return -EACCES;
1158 } else if (BPF_SRC(insn->code) == BPF_K && 1240 } else if (BPF_SRC(insn->code) == BPF_K &&
1159 (opcode == BPF_JEQ || opcode == BPF_JNE)) { 1241 (opcode == BPF_JEQ || opcode == BPF_JNE)) {
1160 1242
@@ -1658,7 +1740,7 @@ static int do_check(struct verifier_env *env)
1658 } 1740 }
1659 1741
1660 if (class == BPF_ALU || class == BPF_ALU64) { 1742 if (class == BPF_ALU || class == BPF_ALU64) {
1661 err = check_alu_op(regs, insn); 1743 err = check_alu_op(env, insn);
1662 if (err) 1744 if (err)
1663 return err; 1745 return err;
1664 1746
@@ -1816,6 +1898,11 @@ static int do_check(struct verifier_env *env)
1816 if (err) 1898 if (err)
1817 return err; 1899 return err;
1818 1900
1901 if (is_pointer_value(env, BPF_REG_0)) {
1902 verbose("R0 leaks addr as return value\n");
1903 return -EACCES;
1904 }
1905
1819process_bpf_exit: 1906process_bpf_exit:
1820 insn_idx = pop_stack(env, &prev_insn_idx); 1907 insn_idx = pop_stack(env, &prev_insn_idx);
1821 if (insn_idx < 0) { 1908 if (insn_idx < 0) {
@@ -1902,8 +1989,7 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
1902 } 1989 }
1903 1990
1904 f = fdget(insn->imm); 1991 f = fdget(insn->imm);
1905 1992 map = __bpf_map_get(f);
1906 map = bpf_map_get(f);
1907 if (IS_ERR(map)) { 1993 if (IS_ERR(map)) {
1908 verbose("fd %d is not pointing to valid bpf_map\n", 1994 verbose("fd %d is not pointing to valid bpf_map\n",
1909 insn->imm); 1995 insn->imm);
@@ -2024,7 +2110,7 @@ static int convert_ctx_accesses(struct verifier_env *env)
2024 2110
2025 cnt = env->prog->aux->ops-> 2111 cnt = env->prog->aux->ops->
2026 convert_ctx_access(type, insn->dst_reg, insn->src_reg, 2112 convert_ctx_access(type, insn->dst_reg, insn->src_reg,
2027 insn->off, insn_buf); 2113 insn->off, insn_buf, env->prog);
2028 if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { 2114 if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
2029 verbose("bpf verifier is misconfigured\n"); 2115 verbose("bpf verifier is misconfigured\n");
2030 return -EINVAL; 2116 return -EINVAL;
@@ -2144,6 +2230,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
2144 if (ret < 0) 2230 if (ret < 0)
2145 goto skip_full_check; 2231 goto skip_full_check;
2146 2232
2233 env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
2234
2147 ret = do_check(env); 2235 ret = do_check(env);
2148 2236
2149skip_full_check: 2237skip_full_check:
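
The verifier hunks above all share one shape: bpf_check() captures a single allow_ptr_leaks flag from capable(CAP_SYS_ADMIN), and every per-instruction check that could expose a kernel address (pointer arithmetic, pointer comparison, returning a pointer in R0) consults it and rejects the program with -EACCES when the loader is unprivileged. Below is a minimal user-space sketch of that gating shape; all toy_* names are hypothetical and this is not the BPF verifier itself.

#include <stdbool.h>
#include <stdio.h>

/* Toy model of the privilege gate: one flag decided at verification start,
 * consulted by every check that could leak a kernel address. */
enum toy_reg_type { TOY_SCALAR, TOY_POINTER };

struct toy_env {
	bool allow_ptr_leaks;	/* capable(CAP_SYS_ADMIN) in the real code */
};

static bool is_pointer_value(const struct toy_env *env, enum toy_reg_type t)
{
	return !env->allow_ptr_leaks && t == TOY_POINTER;
}

static int check_cond_jmp(const struct toy_env *env, enum toy_reg_type src)
{
	if (is_pointer_value(env, src)) {
		fprintf(stderr, "pointer comparison prohibited\n");
		return -1;	/* stands in for -EACCES */
	}
	return 0;
}

int main(void)
{
	struct toy_env unpriv = { .allow_ptr_leaks = false };
	struct toy_env admin  = { .allow_ptr_leaks = true  };

	printf("unprivileged: %d\n", check_cond_jmp(&unpriv, TOY_POINTER)); /* -1 */
	printf("privileged:   %d\n", check_cond_jmp(&admin, TOY_POINTER));  /* 0 */
	return 0;
}

The real check_alu_op()/check_cond_jmp_op() additionally track per-register types across instructions; the sketch only models the privilege gate itself.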
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2cf0f79f1fc9..f1603c153890 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -45,7 +45,6 @@
45#include <linux/sched.h> 45#include <linux/sched.h>
46#include <linux/slab.h> 46#include <linux/slab.h>
47#include <linux/spinlock.h> 47#include <linux/spinlock.h>
48#include <linux/rwsem.h>
49#include <linux/percpu-rwsem.h> 48#include <linux/percpu-rwsem.h>
50#include <linux/string.h> 49#include <linux/string.h>
51#include <linux/sort.h> 50#include <linux/sort.h>
@@ -76,7 +75,7 @@
76 * cgroup_mutex is the master lock. Any modification to cgroup or its 75 * cgroup_mutex is the master lock. Any modification to cgroup or its
77 * hierarchy must be performed while holding it. 76 * hierarchy must be performed while holding it.
78 * 77 *
79 * css_set_rwsem protects task->cgroups pointer, the list of css_set 78 * css_set_lock protects task->cgroups pointer, the list of css_set
80 * objects, and the chain of tasks off each css_set. 79 * objects, and the chain of tasks off each css_set.
81 * 80 *
82 * These locks are exported if CONFIG_PROVE_RCU so that accessors in 81 * These locks are exported if CONFIG_PROVE_RCU so that accessors in
@@ -84,12 +83,12 @@
84 */ 83 */
85#ifdef CONFIG_PROVE_RCU 84#ifdef CONFIG_PROVE_RCU
86DEFINE_MUTEX(cgroup_mutex); 85DEFINE_MUTEX(cgroup_mutex);
87DECLARE_RWSEM(css_set_rwsem); 86DEFINE_SPINLOCK(css_set_lock);
88EXPORT_SYMBOL_GPL(cgroup_mutex); 87EXPORT_SYMBOL_GPL(cgroup_mutex);
89EXPORT_SYMBOL_GPL(css_set_rwsem); 88EXPORT_SYMBOL_GPL(css_set_lock);
90#else 89#else
91static DEFINE_MUTEX(cgroup_mutex); 90static DEFINE_MUTEX(cgroup_mutex);
92static DECLARE_RWSEM(css_set_rwsem); 91static DEFINE_SPINLOCK(css_set_lock);
93#endif 92#endif
94 93
95/* 94/*
@@ -139,6 +138,27 @@ static const char *cgroup_subsys_name[] = {
139}; 138};
140#undef SUBSYS 139#undef SUBSYS
141 140
141/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
142#define SUBSYS(_x) \
143 DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \
144 DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \
145 EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \
146 EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
147#include <linux/cgroup_subsys.h>
148#undef SUBSYS
149
150#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
151static struct static_key_true *cgroup_subsys_enabled_key[] = {
152#include <linux/cgroup_subsys.h>
153};
154#undef SUBSYS
155
156#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
157static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
158#include <linux/cgroup_subsys.h>
159};
160#undef SUBSYS
161
142/* 162/*
143 * The default hierarchy, reserved for the subsystems that are otherwise 163 * The default hierarchy, reserved for the subsystems that are otherwise
144 * unattached - it never has more than a single cgroup, and all tasks are 164 * unattached - it never has more than a single cgroup, and all tasks are
@@ -153,12 +173,6 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
153 */ 173 */
154static bool cgrp_dfl_root_visible; 174static bool cgrp_dfl_root_visible;
155 175
156/*
157 * Set by the boot param of the same name and makes subsystems with NULL
158 * ->dfl_files to use ->legacy_files on the default hierarchy.
159 */
160static bool cgroup_legacy_files_on_dfl;
161
162/* some controllers are not supported in the default hierarchy */ 176/* some controllers are not supported in the default hierarchy */
163static unsigned long cgrp_dfl_root_inhibit_ss_mask; 177static unsigned long cgrp_dfl_root_inhibit_ss_mask;
164 178
@@ -186,6 +200,7 @@ static u64 css_serial_nr_next = 1;
186 */ 200 */
187static unsigned long have_fork_callback __read_mostly; 201static unsigned long have_fork_callback __read_mostly;
188static unsigned long have_exit_callback __read_mostly; 202static unsigned long have_exit_callback __read_mostly;
203static unsigned long have_free_callback __read_mostly;
189 204
190/* Ditto for the can_fork callback. */ 205/* Ditto for the can_fork callback. */
191static unsigned long have_canfork_callback __read_mostly; 206static unsigned long have_canfork_callback __read_mostly;
@@ -195,14 +210,87 @@ static struct cftype cgroup_legacy_base_files[];
195 210
196static int rebind_subsystems(struct cgroup_root *dst_root, 211static int rebind_subsystems(struct cgroup_root *dst_root,
197 unsigned long ss_mask); 212 unsigned long ss_mask);
213static void css_task_iter_advance(struct css_task_iter *it);
198static int cgroup_destroy_locked(struct cgroup *cgrp); 214static int cgroup_destroy_locked(struct cgroup *cgrp);
199static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, 215static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
200 bool visible); 216 bool visible);
201static void css_release(struct percpu_ref *ref); 217static void css_release(struct percpu_ref *ref);
202static void kill_css(struct cgroup_subsys_state *css); 218static void kill_css(struct cgroup_subsys_state *css);
203static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 219static int cgroup_addrm_files(struct cgroup_subsys_state *css,
220 struct cgroup *cgrp, struct cftype cfts[],
204 bool is_add); 221 bool is_add);
205 222
223/**
224 * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
225 * @ssid: subsys ID of interest
226 *
227 * cgroup_subsys_enabled() can only be used with literal subsys names, which
228 * is fine for individual subsystems but unsuitable for cgroup core. This
229 * is a slower static_key_enabled() based test indexed by @ssid.
230 */
231static bool cgroup_ssid_enabled(int ssid)
232{
233 return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
234}
235
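
The SUBSYS() x-macro block above stamps out one static key per controller and collects pointers to them in arrays indexed by subsystem ID, which is what lets cgroup_ssid_enabled() test by numeric @ssid while controllers keep the fast literal-name form. A small self-contained sketch of the same x-macro-plus-array idiom, using plain bools in place of static keys and hypothetical names:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for <linux/cgroup_subsys.h>: the list of controllers. */
#define TOY_SUBSYS_LIST(X) X(cpu) X(memory) X(io)

/* One flag per controller, like the per-subsystem static keys. */
#define X(name) static bool name##_enabled = true;
TOY_SUBSYS_LIST(X)
#undef X

/* IDs and an array of pointers indexed by ID, like cgroup_subsys_enabled_key[]. */
#define X(name) name##_id,
enum toy_subsys_id { TOY_SUBSYS_LIST(X) TOY_SUBSYS_COUNT };
#undef X

#define X(name) [name##_id] = &name##_enabled,
static bool *toy_subsys_enabled[] = { TOY_SUBSYS_LIST(X) };
#undef X

/* The slower, ID-indexed test used by "core" code. */
static bool toy_ssid_enabled(int ssid)
{
	return *toy_subsys_enabled[ssid];
}

int main(void)
{
	memory_enabled = false;
	for (int i = 0; i < TOY_SUBSYS_COUNT; i++)
		printf("subsys %d enabled: %d\n", i, toy_ssid_enabled(i));
	return 0;
}

The kernel builds the list by re-#including <linux/cgroup_subsys.h> under different SUBSYS() definitions; the sketch inlines the list for brevity.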
236/**
237 * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
238 * @cgrp: the cgroup of interest
239 *
240 * The default hierarchy is the v2 interface of cgroup and this function
241 * can be used to test whether a cgroup is on the default hierarchy for
242 * cases where a subsystem should behave differently depending on the
243 * interface version.
244 *
245 * The set of behaviors which change on the default hierarchy is still
246 * being determined and the mount option is prefixed with __DEVEL__.
247 *
248 * List of changed behaviors:
249 *
250 * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
251 * and "name" are disallowed.
252 *
253 * - When mounting an existing superblock, mount options should match.
254 *
255 * - Remount is disallowed.
256 *
257 * - rename(2) is disallowed.
258 *
259 * - "tasks" is removed. Everything should be at process granularity. Use
260 * "cgroup.procs" instead.
261 *
262 * - "cgroup.procs" is not sorted. pids will be unique unless they got
263 * recycled in between reads.
264 *
265 * - "release_agent" and "notify_on_release" are removed. Replacement
266 * notification mechanism will be implemented.
267 *
268 * - "cgroup.clone_children" is removed.
269 *
270 * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup
271 * and its descendants contain no task; otherwise, 1. The file also
272 * generates kernfs notification which can be monitored through poll and
273 * [di]notify when the value of the file changes.
274 *
275 * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
276 * take masks of ancestors with non-empty cpus/mems, instead of being
277 * moved to an ancestor.
278 *
279 * - cpuset: a task can be moved into an empty cpuset, and again it takes
280 * masks of ancestors.
281 *
282 * - memcg: use_hierarchy is on by default and the cgroup file for the flag
283 * is not created.
284 *
285 * - blkcg: blk-throttle becomes properly hierarchical.
286 *
287 * - debug: disallowed on the default hierarchy.
288 */
289static bool cgroup_on_dfl(const struct cgroup *cgrp)
290{
291 return cgrp->root == &cgrp_dfl_root;
292}
293
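
Everything in the list above hangs off the one-line predicate at the end: a cgroup is on the v2 (default) hierarchy iff its root is cgrp_dfl_root, and controllers branch on that test to pick interface-version-specific behaviour. A toy illustration of such a branch follows; the names are hypothetical and not taken from this patch.

#include <stdbool.h>
#include <stdio.h>

struct toy_root { const char *name; };
struct toy_cgroup { struct toy_root *root; };

static struct toy_root toy_dfl_root = { "default (v2)" };
static struct toy_root toy_legacy_root = { "named legacy (v1)" };

static bool toy_on_dfl(const struct toy_cgroup *cgrp)
{
	return cgrp->root == &toy_dfl_root;	/* mirrors cgrp->root == &cgrp_dfl_root */
}

/* A controller picking version-specific behaviour. */
static const char *toy_notify_mode(const struct toy_cgroup *cgrp)
{
	return toy_on_dfl(cgrp) ? "cgroup.events + kernfs notify"
				: "release_agent + notify_on_release";
}

int main(void)
{
	struct toy_cgroup a = { &toy_dfl_root }, b = { &toy_legacy_root };

	printf("%s -> %s\n", a.root->name, toy_notify_mode(&a));
	printf("%s -> %s\n", b.root->name, toy_notify_mode(&b));
	return 0;
}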
206/* IDR wrappers which synchronize using cgroup_idr_lock */ 294/* IDR wrappers which synchronize using cgroup_idr_lock */
207static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, 295static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
208 gfp_t gfp_mask) 296 gfp_t gfp_mask)
@@ -211,7 +299,7 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
211 299
212 idr_preload(gfp_mask); 300 idr_preload(gfp_mask);
213 spin_lock_bh(&cgroup_idr_lock); 301 spin_lock_bh(&cgroup_idr_lock);
214 ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT); 302 ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
215 spin_unlock_bh(&cgroup_idr_lock); 303 spin_unlock_bh(&cgroup_idr_lock);
216 idr_preload_end(); 304 idr_preload_end();
217 return ret; 305 return ret;
@@ -335,6 +423,22 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp)
335 return !(cgrp->self.flags & CSS_ONLINE); 423 return !(cgrp->self.flags & CSS_ONLINE);
336} 424}
337 425
426static void cgroup_get(struct cgroup *cgrp)
427{
428 WARN_ON_ONCE(cgroup_is_dead(cgrp));
429 css_get(&cgrp->self);
430}
431
432static bool cgroup_tryget(struct cgroup *cgrp)
433{
434 return css_tryget(&cgrp->self);
435}
436
437static void cgroup_put(struct cgroup *cgrp)
438{
439 css_put(&cgrp->self);
440}
441
338struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) 442struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
339{ 443{
340 struct cgroup *cgrp = of->kn->parent->priv; 444 struct cgroup *cgrp = of->kn->parent->priv;
@@ -484,19 +588,31 @@ struct css_set init_css_set = {
484 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), 588 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
485 .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), 589 .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
486 .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), 590 .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
591 .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
487}; 592};
488 593
489static int css_set_count = 1; /* 1 for init_css_set */ 594static int css_set_count = 1; /* 1 for init_css_set */
490 595
491/** 596/**
597 * css_set_populated - does a css_set contain any tasks?
598 * @cset: target css_set
599 */
600static bool css_set_populated(struct css_set *cset)
601{
602 lockdep_assert_held(&css_set_lock);
603
604 return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
605}
606
607/**
492 * cgroup_update_populated - update populated count of a cgroup 608
493 * @cgrp: the target cgroup 609 * @cgrp: the target cgroup
494 * @populated: inc or dec populated count 610 * @populated: inc or dec populated count
495 * 611 *
496 * @cgrp is either getting the first task (css_set) or losing the last. 612 * One of the css_sets associated with @cgrp is either getting its first
497 * Update @cgrp->populated_cnt accordingly. The count is propagated 613 * task or losing the last. Update @cgrp->populated_cnt accordingly. The
498 * towards root so that a given cgroup's populated_cnt is zero iff the 614 * count is propagated towards root so that a given cgroup's populated_cnt
499 * cgroup and all its descendants are empty. 615 * is zero iff the cgroup and all its descendants don't contain any tasks.
500 * 616 *
501 * @cgrp's interface file "cgroup.populated" is zero if 617 * @cgrp's interface file "cgroup.populated" is zero if
502 * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt 618 * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
@@ -506,7 +622,7 @@ static int css_set_count = 1; /* 1 for init_css_set */
506 */ 622 */
507static void cgroup_update_populated(struct cgroup *cgrp, bool populated) 623static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
508{ 624{
509 lockdep_assert_held(&css_set_rwsem); 625 lockdep_assert_held(&css_set_lock);
510 626
511 do { 627 do {
512 bool trigger; 628 bool trigger;
@@ -519,12 +635,93 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
519 if (!trigger) 635 if (!trigger)
520 break; 636 break;
521 637
522 if (cgrp->populated_kn) 638 check_for_release(cgrp);
523 kernfs_notify(cgrp->populated_kn); 639 cgroup_file_notify(&cgrp->events_file);
640
524 cgrp = cgroup_parent(cgrp); 641 cgrp = cgroup_parent(cgrp);
525 } while (cgrp); 642 } while (cgrp);
526} 643}
527 644
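
The propagation loop above walks toward the root only for as long as a 0<->nonzero transition keeps happening; the first ancestor whose populated_cnt does not flip stops the walk, and only flips generate the check_for_release()/cgroup.events notifications. A standalone sketch of that loop, with hypothetical toy_* names:

#include <stdbool.h>
#include <stdio.h>

struct toy_cgroup {
	struct toy_cgroup *parent;
	int populated_cnt;
	const char *name;
};

/* Mirrors the propagation loop: stop once an ancestor's state doesn't flip. */
static void toy_update_populated(struct toy_cgroup *cgrp, bool populated)
{
	do {
		bool trigger;

		if (populated)
			trigger = !cgrp->populated_cnt++;
		else
			trigger = !--cgrp->populated_cnt;

		if (!trigger)
			break;

		printf("%s flipped to %s -> notify\n",
		       cgrp->name, populated ? "populated" : "empty");
		cgrp = cgrp->parent;
	} while (cgrp);
}

int main(void)
{
	struct toy_cgroup root = { NULL, 0, "root" };
	struct toy_cgroup mid  = { &root, 0, "mid" };
	struct toy_cgroup leaf = { &mid, 0, "leaf" };

	toy_update_populated(&leaf, true);	/* leaf, mid and root all flip */
	toy_update_populated(&leaf, true);	/* no flip, walk stops at leaf */
	toy_update_populated(&leaf, false);	/* one charge left: no flip */
	toy_update_populated(&leaf, false);	/* everything flips back to empty */
	return 0;
}

Running it shows the second charge and the first uncharge stopping at the leaf, which is exactly the "zero iff the cgroup and all its descendants contain no tasks" invariant described above.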
645/**
646 * css_set_update_populated - update populated state of a css_set
647 * @cset: target css_set
648 * @populated: whether @cset is populated or depopulated
649 *
650 * @cset is either getting the first task or losing the last. Update the
651 * ->populated_cnt of all associated cgroups accordingly.
652 */
653static void css_set_update_populated(struct css_set *cset, bool populated)
654{
655 struct cgrp_cset_link *link;
656
657 lockdep_assert_held(&css_set_lock);
658
659 list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
660 cgroup_update_populated(link->cgrp, populated);
661}
662
663/**
664 * css_set_move_task - move a task from one css_set to another
665 * @task: task being moved
666 * @from_cset: css_set @task currently belongs to (may be NULL)
667 * @to_cset: new css_set @task is being moved to (may be NULL)
668 * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
669 *
670 * Move @task from @from_cset to @to_cset. If @task didn't belong to any
671 * css_set, @from_cset can be NULL. If @task is being disassociated
672 * instead of moved, @to_cset can be NULL.
673 *
674 * This function automatically handles populated_cnt updates and
675 * css_task_iter adjustments but the caller is responsible for managing
676 * @from_cset and @to_cset's reference counts.
677 */
678static void css_set_move_task(struct task_struct *task,
679 struct css_set *from_cset, struct css_set *to_cset,
680 bool use_mg_tasks)
681{
682 lockdep_assert_held(&css_set_lock);
683
684 if (from_cset) {
685 struct css_task_iter *it, *pos;
686
687 WARN_ON_ONCE(list_empty(&task->cg_list));
688
689 /*
690 * @task is leaving, advance task iterators which are
691 * pointing to it so that they can resume at the next
692 * position. Advancing an iterator might remove it from
693 * the list, use safe walk. See css_task_iter_advance*()
694 * for details.
695 */
696 list_for_each_entry_safe(it, pos, &from_cset->task_iters,
697 iters_node)
698 if (it->task_pos == &task->cg_list)
699 css_task_iter_advance(it);
700
701 list_del_init(&task->cg_list);
702 if (!css_set_populated(from_cset))
703 css_set_update_populated(from_cset, false);
704 } else {
705 WARN_ON_ONCE(!list_empty(&task->cg_list));
706 }
707
708 if (to_cset) {
709 /*
710 * We are synchronized through cgroup_threadgroup_rwsem
711 * against PF_EXITING setting such that we can't race
712 * against cgroup_exit() changing the css_set to
713 * init_css_set and dropping the old one.
714 */
715 WARN_ON_ONCE(task->flags & PF_EXITING);
716
717 if (!css_set_populated(to_cset))
718 css_set_update_populated(to_cset, true);
719 rcu_assign_pointer(task->cgroups, to_cset);
720 list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
721 &to_cset->tasks);
722 }
723}
724
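
css_set_move_task() folds several previously open-coded steps into one place; the subtle one is advancing any css_task_iter that is currently parked on the departing task before unlinking it. A minimal sketch of that iterator fix-up idiom on a doubly linked intrusive list (hypothetical types, not the kernel's list_head or css_task_iter):

#include <stdio.h>

/* Before unlinking a node, advance any iterator currently parked on it. */
struct node { struct node *prev, *next; int id; };
struct iter { struct node *pos; struct node *head; };

static void list_init(struct node *head) { head->prev = head->next = head; }

static void list_add_tail(struct node *n, struct node *head)
{
	n->prev = head->prev; n->next = head;
	head->prev->next = n; head->prev = n;
}

static void iter_advance(struct iter *it)
{
	it->pos = it->pos->next;
	if (it->pos == it->head)
		it->pos = NULL;		/* iteration finished */
}

static void list_del_fixup(struct node *n, struct iter *it)
{
	if (it->pos == n)		/* the css_task_iter case */
		iter_advance(it);
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

int main(void)
{
	struct node head, a = {0, 0, 1}, b = {0, 0, 2}, c = {0, 0, 3};
	struct iter it;

	list_init(&head);
	list_add_tail(&a, &head); list_add_tail(&b, &head); list_add_tail(&c, &head);

	it.head = &head; it.pos = &b;		/* iterator parked on b */
	list_del_fixup(&b, &it);		/* b leaves; iterator moves to c */
	printf("iterator now at id %d\n", it.pos ? it.pos->id : -1);	/* 3 */
	return 0;
}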
528/* 725/*
529 * hash table for cgroup groups. This improves the performance to find 726 * hash table for cgroup groups. This improves the performance to find
530 * an existing css_set. This hash doesn't (currently) take into 727 * an existing css_set. This hash doesn't (currently) take into
@@ -552,7 +749,7 @@ static void put_css_set_locked(struct css_set *cset)
552 struct cgroup_subsys *ss; 749 struct cgroup_subsys *ss;
553 int ssid; 750 int ssid;
554 751
555 lockdep_assert_held(&css_set_rwsem); 752 lockdep_assert_held(&css_set_lock);
556 753
557 if (!atomic_dec_and_test(&cset->refcount)) 754 if (!atomic_dec_and_test(&cset->refcount))
558 return; 755 return;
@@ -564,17 +761,10 @@ static void put_css_set_locked(struct css_set *cset)
564 css_set_count--; 761 css_set_count--;
565 762
566 list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) { 763 list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
567 struct cgroup *cgrp = link->cgrp;
568
569 list_del(&link->cset_link); 764 list_del(&link->cset_link);
570 list_del(&link->cgrp_link); 765 list_del(&link->cgrp_link);
571 766 if (cgroup_parent(link->cgrp))
572 /* @cgrp can't go away while we're holding css_set_rwsem */ 767 cgroup_put(link->cgrp);
573 if (list_empty(&cgrp->cset_links)) {
574 cgroup_update_populated(cgrp, false);
575 check_for_release(cgrp);
576 }
577
578 kfree(link); 768 kfree(link);
579 } 769 }
580 770
@@ -591,9 +781,9 @@ static void put_css_set(struct css_set *cset)
591 if (atomic_add_unless(&cset->refcount, -1, 1)) 781 if (atomic_add_unless(&cset->refcount, -1, 1))
592 return; 782 return;
593 783
594 down_write(&css_set_rwsem); 784 spin_lock_bh(&css_set_lock);
595 put_css_set_locked(cset); 785 put_css_set_locked(cset);
596 up_write(&css_set_rwsem); 786 spin_unlock_bh(&css_set_lock);
597} 787}
598 788
599/* 789/*
@@ -782,15 +972,15 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
782 link->cset = cset; 972 link->cset = cset;
783 link->cgrp = cgrp; 973 link->cgrp = cgrp;
784 974
785 if (list_empty(&cgrp->cset_links))
786 cgroup_update_populated(cgrp, true);
787 list_move(&link->cset_link, &cgrp->cset_links);
788
789 /* 975 /*
790 * Always add links to the tail of the list so that the list 976 * Always add links to the tail of the lists so that the lists are
791 * is sorted by order of hierarchy creation 977 * in choronological order.
792 */ 978 */
979 list_move_tail(&link->cset_link, &cgrp->cset_links);
793 list_add_tail(&link->cgrp_link, &cset->cgrp_links); 980 list_add_tail(&link->cgrp_link, &cset->cgrp_links);
981
982 if (cgroup_parent(cgrp))
983 cgroup_get(cgrp);
794} 984}
795 985
796/** 986/**
@@ -816,11 +1006,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
816 1006
817 /* First see if we already have a cgroup group that matches 1007 /* First see if we already have a cgroup group that matches
818 * the desired set */ 1008 * the desired set */
819 down_read(&css_set_rwsem); 1009 spin_lock_bh(&css_set_lock);
820 cset = find_existing_css_set(old_cset, cgrp, template); 1010 cset = find_existing_css_set(old_cset, cgrp, template);
821 if (cset) 1011 if (cset)
822 get_css_set(cset); 1012 get_css_set(cset);
823 up_read(&css_set_rwsem); 1013 spin_unlock_bh(&css_set_lock);
824 1014
825 if (cset) 1015 if (cset)
826 return cset; 1016 return cset;
@@ -841,13 +1031,14 @@ static struct css_set *find_css_set(struct css_set *old_cset,
841 INIT_LIST_HEAD(&cset->mg_tasks); 1031 INIT_LIST_HEAD(&cset->mg_tasks);
842 INIT_LIST_HEAD(&cset->mg_preload_node); 1032 INIT_LIST_HEAD(&cset->mg_preload_node);
843 INIT_LIST_HEAD(&cset->mg_node); 1033 INIT_LIST_HEAD(&cset->mg_node);
1034 INIT_LIST_HEAD(&cset->task_iters);
844 INIT_HLIST_NODE(&cset->hlist); 1035 INIT_HLIST_NODE(&cset->hlist);
845 1036
846 /* Copy the set of subsystem state objects generated in 1037 /* Copy the set of subsystem state objects generated in
847 * find_existing_css_set() */ 1038 * find_existing_css_set() */
848 memcpy(cset->subsys, template, sizeof(cset->subsys)); 1039 memcpy(cset->subsys, template, sizeof(cset->subsys));
849 1040
850 down_write(&css_set_rwsem); 1041 spin_lock_bh(&css_set_lock);
851 /* Add reference counts and links from the new css_set. */ 1042 /* Add reference counts and links from the new css_set. */
852 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { 1043 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
853 struct cgroup *c = link->cgrp; 1044 struct cgroup *c = link->cgrp;
@@ -869,7 +1060,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
869 list_add_tail(&cset->e_cset_node[ssid], 1060 list_add_tail(&cset->e_cset_node[ssid],
870 &cset->subsys[ssid]->cgroup->e_csets[ssid]); 1061 &cset->subsys[ssid]->cgroup->e_csets[ssid]);
871 1062
872 up_write(&css_set_rwsem); 1063 spin_unlock_bh(&css_set_lock);
873 1064
874 return cset; 1065 return cset;
875} 1066}
@@ -933,14 +1124,15 @@ static void cgroup_destroy_root(struct cgroup_root *root)
933 * Release all the links from cset_links to this hierarchy's 1124 * Release all the links from cset_links to this hierarchy's
934 * root cgroup 1125 * root cgroup
935 */ 1126 */
936 down_write(&css_set_rwsem); 1127 spin_lock_bh(&css_set_lock);
937 1128
938 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { 1129 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
939 list_del(&link->cset_link); 1130 list_del(&link->cset_link);
940 list_del(&link->cgrp_link); 1131 list_del(&link->cgrp_link);
941 kfree(link); 1132 kfree(link);
942 } 1133 }
943 up_write(&css_set_rwsem); 1134
1135 spin_unlock_bh(&css_set_lock);
944 1136
945 if (!list_empty(&root->root_list)) { 1137 if (!list_empty(&root->root_list)) {
946 list_del(&root->root_list); 1138 list_del(&root->root_list);
@@ -962,7 +1154,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
962 struct cgroup *res = NULL; 1154 struct cgroup *res = NULL;
963 1155
964 lockdep_assert_held(&cgroup_mutex); 1156 lockdep_assert_held(&cgroup_mutex);
965 lockdep_assert_held(&css_set_rwsem); 1157 lockdep_assert_held(&css_set_lock);
966 1158
967 if (cset == &init_css_set) { 1159 if (cset == &init_css_set) {
968 res = &root->cgrp; 1160 res = &root->cgrp;
@@ -985,7 +1177,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
985 1177
986/* 1178/*
987 * Return the cgroup for "task" from the given hierarchy. Must be 1179 * Return the cgroup for "task" from the given hierarchy. Must be
988 * called with cgroup_mutex and css_set_rwsem held. 1180 * called with cgroup_mutex and css_set_lock held.
989 */ 1181 */
990static struct cgroup *task_cgroup_from_root(struct task_struct *task, 1182static struct cgroup *task_cgroup_from_root(struct task_struct *task,
991 struct cgroup_root *root) 1183 struct cgroup_root *root)
@@ -1024,7 +1216,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
1024 * update of a tasks cgroup pointer by cgroup_attach_task() 1216 * update of a tasks cgroup pointer by cgroup_attach_task()
1025 */ 1217 */
1026 1218
1027static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
1028static struct kernfs_syscall_ops cgroup_kf_syscall_ops; 1219static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
1029static const struct file_operations proc_cgroupstats_operations; 1220static const struct file_operations proc_cgroupstats_operations;
1030 1221
@@ -1047,43 +1238,25 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
1047 * cgroup_file_mode - deduce file mode of a control file 1238 * cgroup_file_mode - deduce file mode of a control file
1048 * @cft: the control file in question 1239 * @cft: the control file in question
1049 * 1240 *
1050 * returns cft->mode if ->mode is not 0 1241 * S_IRUGO for read, S_IWUSR for write.
1051 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
1052 * returns S_IRUGO if it has only a read handler
1053 * returns S_IWUSR if it has only a write handler
1054 */ 1242 */
1055static umode_t cgroup_file_mode(const struct cftype *cft) 1243static umode_t cgroup_file_mode(const struct cftype *cft)
1056{ 1244{
1057 umode_t mode = 0; 1245 umode_t mode = 0;
1058 1246
1059 if (cft->mode)
1060 return cft->mode;
1061
1062 if (cft->read_u64 || cft->read_s64 || cft->seq_show) 1247 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
1063 mode |= S_IRUGO; 1248 mode |= S_IRUGO;
1064 1249
1065 if (cft->write_u64 || cft->write_s64 || cft->write) 1250 if (cft->write_u64 || cft->write_s64 || cft->write) {
1066 mode |= S_IWUSR; 1251 if (cft->flags & CFTYPE_WORLD_WRITABLE)
1252 mode |= S_IWUGO;
1253 else
1254 mode |= S_IWUSR;
1255 }
1067 1256
1068 return mode; 1257 return mode;
1069} 1258}
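
The rewritten cgroup_file_mode() derives the mode purely from the handlers: any read handler contributes S_IRUGO (0444); a write handler contributes S_IWUSR (0200), or S_IWUGO (0222) when CFTYPE_WORLD_WRITABLE is set. Worked out in octal, assuming the standard S_I* values:

#include <stdio.h>

int main(void)
{
	unsigned read_write        = 0444 | 0200;	/* seq_show + write            -> 0644 */
	unsigned world_writable_rw = 0444 | 0222;	/* ... + CFTYPE_WORLD_WRITABLE -> 0666 */
	unsigned read_only         = 0444;		/* seq_show only               -> 0444 */

	printf("%o %o %o\n", read_write, world_writable_rw, read_only);
	return 0;
}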
1070 1259
1071static void cgroup_get(struct cgroup *cgrp)
1072{
1073 WARN_ON_ONCE(cgroup_is_dead(cgrp));
1074 css_get(&cgrp->self);
1075}
1076
1077static bool cgroup_tryget(struct cgroup *cgrp)
1078{
1079 return css_tryget(&cgrp->self);
1080}
1081
1082static void cgroup_put(struct cgroup *cgrp)
1083{
1084 css_put(&cgrp->self);
1085}
1086
1087/** 1260/**
1088 * cgroup_calc_child_subsys_mask - calculate child_subsys_mask 1261 * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
1089 * @cgrp: the target cgroup 1262 * @cgrp: the target cgroup
@@ -1224,28 +1397,64 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
1224} 1397}
1225 1398
1226/** 1399/**
1227 * cgroup_clear_dir - remove subsys files in a cgroup directory 1400 * css_clear_dir - remove subsys files in a cgroup directory
1228 * @cgrp: target cgroup 1401 * @css: target css
1229 * @subsys_mask: mask of the subsystem ids whose files should be removed 1402 * @cgrp_override: specify if target cgroup is different from css->cgroup
1230 */ 1403 */
1231static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) 1404static void css_clear_dir(struct cgroup_subsys_state *css,
1405 struct cgroup *cgrp_override)
1232{ 1406{
1233 struct cgroup_subsys *ss; 1407 struct cgroup *cgrp = cgrp_override ?: css->cgroup;
1234 int i; 1408 struct cftype *cfts;
1235 1409
1236 for_each_subsys(ss, i) { 1410 list_for_each_entry(cfts, &css->ss->cfts, node)
1237 struct cftype *cfts; 1411 cgroup_addrm_files(css, cgrp, cfts, false);
1412}
1238 1413
1239 if (!(subsys_mask & (1 << i))) 1414/**
1240 continue; 1415 * css_populate_dir - create subsys files in a cgroup directory
1241 list_for_each_entry(cfts, &ss->cfts, node) 1416 * @css: target css
1242 cgroup_addrm_files(cgrp, cfts, false); 1417 * @cgrp_override: specify if target cgroup is different from css->cgroup
1418 *
1419 * On failure, no file is added.
1420 */
1421static int css_populate_dir(struct cgroup_subsys_state *css,
1422 struct cgroup *cgrp_override)
1423{
1424 struct cgroup *cgrp = cgrp_override ?: css->cgroup;
1425 struct cftype *cfts, *failed_cfts;
1426 int ret;
1427
1428 if (!css->ss) {
1429 if (cgroup_on_dfl(cgrp))
1430 cfts = cgroup_dfl_base_files;
1431 else
1432 cfts = cgroup_legacy_base_files;
1433
1434 return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
1243 } 1435 }
1436
1437 list_for_each_entry(cfts, &css->ss->cfts, node) {
1438 ret = cgroup_addrm_files(css, cgrp, cfts, true);
1439 if (ret < 0) {
1440 failed_cfts = cfts;
1441 goto err;
1442 }
1443 }
1444 return 0;
1445err:
1446 list_for_each_entry(cfts, &css->ss->cfts, node) {
1447 if (cfts == failed_cfts)
1448 break;
1449 cgroup_addrm_files(css, cgrp, cfts, false);
1450 }
1451 return ret;
1244} 1452}
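
css_populate_dir() keeps the promise "on failure, no file is added" by remembering which cftype array failed and walking the same list a second time, removing only the arrays that were added before the failing one. The same partial-rollback idiom in isolation, with hypothetical names:

#include <stdio.h>

static int apply_one(int i)
{
	if (i == 3)
		return -1;	/* simulate failure on the 4th entry */
	printf("added entry %d\n", i);
	return 0;
}

static void remove_one(int i)
{
	printf("removed entry %d\n", i);
}

static int apply_all(int n)
{
	int i, failed = -1, ret = 0;

	for (i = 0; i < n; i++) {
		ret = apply_one(i);
		if (ret < 0) {
			failed = i;
			break;
		}
	}
	if (failed < 0)
		return 0;

	/* roll back everything added before the failing entry */
	for (i = 0; i < failed; i++)
		remove_one(i);
	return ret;
}

int main(void)
{
	return apply_all(5) ? 1 : 0;	/* adds 0..2, fails at 3, removes 0..2 */
}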
1245 1453
1246static int rebind_subsystems(struct cgroup_root *dst_root, 1454static int rebind_subsystems(struct cgroup_root *dst_root,
1247 unsigned long ss_mask) 1455 unsigned long ss_mask)
1248{ 1456{
1457 struct cgroup *dcgrp = &dst_root->cgrp;
1249 struct cgroup_subsys *ss; 1458 struct cgroup_subsys *ss;
1250 unsigned long tmp_ss_mask; 1459 unsigned long tmp_ss_mask;
1251 int ssid, i, ret; 1460 int ssid, i, ret;
@@ -1267,10 +1476,13 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
1267 if (dst_root == &cgrp_dfl_root) 1476 if (dst_root == &cgrp_dfl_root)
1268 tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; 1477 tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
1269 1478
1270 ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask); 1479 for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
1271 if (ret) { 1480 struct cgroup *scgrp = &ss->root->cgrp;
1272 if (dst_root != &cgrp_dfl_root) 1481 int tssid;
1273 return ret; 1482
1483 ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp);
1484 if (!ret)
1485 continue;
1274 1486
1275 /* 1487 /*
1276 * Rebinding back to the default root is not allowed to 1488 * Rebinding back to the default root is not allowed to
@@ -1278,57 +1490,67 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
1278 * be rare. Moving subsystems back and forth even more so. 1490 * be rare. Moving subsystems back and forth even more so.
1279 * Just warn about it and continue. 1491 * Just warn about it and continue.
1280 */ 1492 */
1281 if (cgrp_dfl_root_visible) { 1493 if (dst_root == &cgrp_dfl_root) {
1282 pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", 1494 if (cgrp_dfl_root_visible) {
1283 ret, ss_mask); 1495 pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
1284 pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); 1496 ret, ss_mask);
1497 pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
1498 }
1499 continue;
1500 }
1501
1502 for_each_subsys_which(ss, tssid, &tmp_ss_mask) {
1503 if (tssid == ssid)
1504 break;
1505 css_clear_dir(cgroup_css(scgrp, ss), dcgrp);
1285 } 1506 }
1507 return ret;
1286 } 1508 }
1287 1509
1288 /* 1510 /*
1289 * Nothing can fail from this point on. Remove files for the 1511 * Nothing can fail from this point on. Remove files for the
1290 * removed subsystems and rebind each subsystem. 1512 * removed subsystems and rebind each subsystem.
1291 */ 1513 */
1292 for_each_subsys_which(ss, ssid, &ss_mask)
1293 cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
1294
1295 for_each_subsys_which(ss, ssid, &ss_mask) { 1514 for_each_subsys_which(ss, ssid, &ss_mask) {
1296 struct cgroup_root *src_root; 1515 struct cgroup_root *src_root = ss->root;
1297 struct cgroup_subsys_state *css; 1516 struct cgroup *scgrp = &src_root->cgrp;
1517 struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
1298 struct css_set *cset; 1518 struct css_set *cset;
1299 1519
1300 src_root = ss->root; 1520 WARN_ON(!css || cgroup_css(dcgrp, ss));
1301 css = cgroup_css(&src_root->cgrp, ss);
1302 1521
1303 WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss)); 1522 css_clear_dir(css, NULL);
1304 1523
1305 RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL); 1524 RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
1306 rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css); 1525 rcu_assign_pointer(dcgrp->subsys[ssid], css);
1307 ss->root = dst_root; 1526 ss->root = dst_root;
1308 css->cgroup = &dst_root->cgrp; 1527 css->cgroup = dcgrp;
1309 1528
1310 down_write(&css_set_rwsem); 1529 spin_lock_bh(&css_set_lock);
1311 hash_for_each(css_set_table, i, cset, hlist) 1530 hash_for_each(css_set_table, i, cset, hlist)
1312 list_move_tail(&cset->e_cset_node[ss->id], 1531 list_move_tail(&cset->e_cset_node[ss->id],
1313 &dst_root->cgrp.e_csets[ss->id]); 1532 &dcgrp->e_csets[ss->id]);
1314 up_write(&css_set_rwsem); 1533 spin_unlock_bh(&css_set_lock);
1315 1534
1316 src_root->subsys_mask &= ~(1 << ssid); 1535 src_root->subsys_mask &= ~(1 << ssid);
1317 src_root->cgrp.subtree_control &= ~(1 << ssid); 1536 scgrp->subtree_control &= ~(1 << ssid);
1318 cgroup_refresh_child_subsys_mask(&src_root->cgrp); 1537 cgroup_refresh_child_subsys_mask(scgrp);
1319 1538
1320 /* default hierarchy doesn't enable controllers by default */ 1539 /* default hierarchy doesn't enable controllers by default */
1321 dst_root->subsys_mask |= 1 << ssid; 1540 dst_root->subsys_mask |= 1 << ssid;
1322 if (dst_root != &cgrp_dfl_root) { 1541 if (dst_root == &cgrp_dfl_root) {
1323 dst_root->cgrp.subtree_control |= 1 << ssid; 1542 static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
1324 cgroup_refresh_child_subsys_mask(&dst_root->cgrp); 1543 } else {
1544 dcgrp->subtree_control |= 1 << ssid;
1545 cgroup_refresh_child_subsys_mask(dcgrp);
1546 static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
1325 } 1547 }
1326 1548
1327 if (ss->bind) 1549 if (ss->bind)
1328 ss->bind(css); 1550 ss->bind(css);
1329 } 1551 }
1330 1552
1331 kernfs_activate(dst_root->cgrp.kn); 1553 kernfs_activate(dcgrp->kn);
1332 return 0; 1554 return 0;
1333} 1555}
1334 1556
@@ -1458,7 +1680,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1458 for_each_subsys(ss, i) { 1680 for_each_subsys(ss, i) {
1459 if (strcmp(token, ss->legacy_name)) 1681 if (strcmp(token, ss->legacy_name))
1460 continue; 1682 continue;
1461 if (ss->disabled) 1683 if (!cgroup_ssid_enabled(i))
1462 continue; 1684 continue;
1463 1685
1464 /* Mutually exclusive option 'all' + subsystem name */ 1686 /* Mutually exclusive option 'all' + subsystem name */
@@ -1489,7 +1711,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1489 */ 1711 */
1490 if (all_ss || (!one_ss && !opts->none && !opts->name)) 1712 if (all_ss || (!one_ss && !opts->none && !opts->name))
1491 for_each_subsys(ss, i) 1713 for_each_subsys(ss, i)
1492 if (!ss->disabled) 1714 if (cgroup_ssid_enabled(i))
1493 opts->subsys_mask |= (1 << i); 1715 opts->subsys_mask |= (1 << i);
1494 1716
1495 /* 1717 /*
@@ -1585,7 +1807,7 @@ static void cgroup_enable_task_cg_lists(void)
1585{ 1807{
1586 struct task_struct *p, *g; 1808 struct task_struct *p, *g;
1587 1809
1588 down_write(&css_set_rwsem); 1810 spin_lock_bh(&css_set_lock);
1589 1811
1590 if (use_task_css_set_links) 1812 if (use_task_css_set_links)
1591 goto out_unlock; 1813 goto out_unlock;
@@ -1615,14 +1837,16 @@ static void cgroup_enable_task_cg_lists(void)
1615 if (!(p->flags & PF_EXITING)) { 1837 if (!(p->flags & PF_EXITING)) {
1616 struct css_set *cset = task_css_set(p); 1838 struct css_set *cset = task_css_set(p);
1617 1839
1618 list_add(&p->cg_list, &cset->tasks); 1840 if (!css_set_populated(cset))
1841 css_set_update_populated(cset, true);
1842 list_add_tail(&p->cg_list, &cset->tasks);
1619 get_css_set(cset); 1843 get_css_set(cset);
1620 } 1844 }
1621 spin_unlock_irq(&p->sighand->siglock); 1845 spin_unlock_irq(&p->sighand->siglock);
1622 } while_each_thread(g, p); 1846 } while_each_thread(g, p);
1623 read_unlock(&tasklist_lock); 1847 read_unlock(&tasklist_lock);
1624out_unlock: 1848out_unlock:
1625 up_write(&css_set_rwsem); 1849 spin_unlock_bh(&css_set_lock);
1626} 1850}
1627 1851
1628static void init_cgroup_housekeeping(struct cgroup *cgrp) 1852static void init_cgroup_housekeeping(struct cgroup *cgrp)
@@ -1632,6 +1856,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1632 1856
1633 INIT_LIST_HEAD(&cgrp->self.sibling); 1857 INIT_LIST_HEAD(&cgrp->self.sibling);
1634 INIT_LIST_HEAD(&cgrp->self.children); 1858 INIT_LIST_HEAD(&cgrp->self.children);
1859 INIT_LIST_HEAD(&cgrp->self.files);
1635 INIT_LIST_HEAD(&cgrp->cset_links); 1860 INIT_LIST_HEAD(&cgrp->cset_links);
1636 INIT_LIST_HEAD(&cgrp->pidlists); 1861 INIT_LIST_HEAD(&cgrp->pidlists);
1637 mutex_init(&cgrp->pidlist_mutex); 1862 mutex_init(&cgrp->pidlist_mutex);
@@ -1669,7 +1894,6 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1669{ 1894{
1670 LIST_HEAD(tmp_links); 1895 LIST_HEAD(tmp_links);
1671 struct cgroup *root_cgrp = &root->cgrp; 1896 struct cgroup *root_cgrp = &root->cgrp;
1672 struct cftype *base_files;
1673 struct css_set *cset; 1897 struct css_set *cset;
1674 int i, ret; 1898 int i, ret;
1675 1899
@@ -1686,7 +1910,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1686 goto out; 1910 goto out;
1687 1911
1688 /* 1912 /*
1689 * We're accessing css_set_count without locking css_set_rwsem here, 1913 * We're accessing css_set_count without locking css_set_lock here,
1690 * but that's OK - it can only be increased by someone holding 1914 * but that's OK - it can only be increased by someone holding
1691 * cgroup_lock, and that's us. The worst that can happen is that we 1915 * cgroup_lock, and that's us. The worst that can happen is that we
1692 * have some link structures left over 1916 * have some link structures left over
@@ -1708,12 +1932,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1708 } 1932 }
1709 root_cgrp->kn = root->kf_root->kn; 1933 root_cgrp->kn = root->kf_root->kn;
1710 1934
1711 if (root == &cgrp_dfl_root) 1935 ret = css_populate_dir(&root_cgrp->self, NULL);
1712 base_files = cgroup_dfl_base_files;
1713 else
1714 base_files = cgroup_legacy_base_files;
1715
1716 ret = cgroup_addrm_files(root_cgrp, base_files, true);
1717 if (ret) 1936 if (ret)
1718 goto destroy_root; 1937 goto destroy_root;
1719 1938
@@ -1733,10 +1952,13 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1733 * Link the root cgroup in this hierarchy into all the css_set 1952 * Link the root cgroup in this hierarchy into all the css_set
1734 * objects. 1953 * objects.
1735 */ 1954 */
1736 down_write(&css_set_rwsem); 1955 spin_lock_bh(&css_set_lock);
1737 hash_for_each(css_set_table, i, cset, hlist) 1956 hash_for_each(css_set_table, i, cset, hlist) {
1738 link_css_set(&tmp_links, cset, root_cgrp); 1957 link_css_set(&tmp_links, cset, root_cgrp);
1739 up_write(&css_set_rwsem); 1958 if (css_set_populated(cset))
1959 cgroup_update_populated(root_cgrp, true);
1960 }
1961 spin_unlock_bh(&css_set_lock);
1740 1962
1741 BUG_ON(!list_empty(&root_cgrp->self.children)); 1963 BUG_ON(!list_empty(&root_cgrp->self.children));
1742 BUG_ON(atomic_read(&root->nr_cgrps) != 1); 1964 BUG_ON(atomic_read(&root->nr_cgrps) != 1);
@@ -1969,7 +2191,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
1969 char *path = NULL; 2191 char *path = NULL;
1970 2192
1971 mutex_lock(&cgroup_mutex); 2193 mutex_lock(&cgroup_mutex);
1972 down_read(&css_set_rwsem); 2194 spin_lock_bh(&css_set_lock);
1973 2195
1974 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); 2196 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
1975 2197
@@ -1982,7 +2204,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
1982 path = buf; 2204 path = buf;
1983 } 2205 }
1984 2206
1985 up_read(&css_set_rwsem); 2207 spin_unlock_bh(&css_set_lock);
1986 mutex_unlock(&cgroup_mutex); 2208 mutex_unlock(&cgroup_mutex);
1987 return path; 2209 return path;
1988} 2210}
@@ -2010,6 +2232,49 @@ struct cgroup_taskset {
2010 struct task_struct *cur_task; 2232 struct task_struct *cur_task;
2011}; 2233};
2012 2234
2235#define CGROUP_TASKSET_INIT(tset) (struct cgroup_taskset){ \
2236 .src_csets = LIST_HEAD_INIT(tset.src_csets), \
2237 .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \
2238 .csets = &tset.src_csets, \
2239}
2240
2241/**
2242 * cgroup_taskset_add - try to add a migration target task to a taskset
2243 * @task: target task
2244 * @tset: target taskset
2245 *
2246 * Add @task, which is a migration target, to @tset. This function becomes
2247 * noop if @task doesn't need to be migrated. @task's css_set should have
2248 * been added as a migration source and @task->cg_list will be moved from
2249 * the css_set's tasks list to mg_tasks one.
2250 */
2251static void cgroup_taskset_add(struct task_struct *task,
2252 struct cgroup_taskset *tset)
2253{
2254 struct css_set *cset;
2255
2256 lockdep_assert_held(&css_set_lock);
2257
2258 /* @task either already exited or can't exit until the end */
2259 if (task->flags & PF_EXITING)
2260 return;
2261
2262 /* leave @task alone if post_fork() hasn't linked it yet */
2263 if (list_empty(&task->cg_list))
2264 return;
2265
2266 cset = task_css_set(task);
2267 if (!cset->mg_src_cgrp)
2268 return;
2269
2270 list_move_tail(&task->cg_list, &cset->mg_tasks);
2271 if (list_empty(&cset->mg_node))
2272 list_add_tail(&cset->mg_node, &tset->src_csets);
2273 if (list_empty(&cset->mg_dst_cset->mg_node))
2274 list_move_tail(&cset->mg_dst_cset->mg_node,
2275 &tset->dst_csets);
2276}
2277
2013/** 2278/**
2014 * cgroup_taskset_first - reset taskset and return the first task 2279 * cgroup_taskset_first - reset taskset and return the first task
2015 * @tset: taskset of interest 2280 * @tset: taskset of interest
@@ -2057,47 +2322,86 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
2057} 2322}
2058 2323
2059/** 2324/**
2060 * cgroup_task_migrate - move a task from one cgroup to another. 2325 * cgroup_taskset_migrate - migrate a taskset to a cgroup
2061 * @old_cgrp: the cgroup @tsk is being migrated from 2326 * @tset: target taskset
2062 * @tsk: the task being migrated 2327 * @dst_cgrp: destination cgroup
2063 * @new_cset: the new css_set @tsk is being attached to
2064 * 2328 *
2065 * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked. 2329 * Migrate tasks in @tset to @dst_cgrp. This function fails iff one of the
2330 * ->can_attach callbacks fails and guarantees that either all or none of
2331 * the tasks in @tset are migrated. @tset is consumed regardless of
2332 * success.
2066 */ 2333 */
2067static void cgroup_task_migrate(struct cgroup *old_cgrp, 2334static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
2068 struct task_struct *tsk, 2335 struct cgroup *dst_cgrp)
2069 struct css_set *new_cset)
2070{ 2336{
2071 struct css_set *old_cset; 2337 struct cgroup_subsys_state *css, *failed_css = NULL;
2072 2338 struct task_struct *task, *tmp_task;
2073 lockdep_assert_held(&cgroup_mutex); 2339 struct css_set *cset, *tmp_cset;
2074 lockdep_assert_held(&css_set_rwsem); 2340 int i, ret;
2075 2341
2076 /* 2342 /* methods shouldn't be called if no task is actually migrating */
2077 * We are synchronized through cgroup_threadgroup_rwsem against 2343 if (list_empty(&tset->src_csets))
2078 * PF_EXITING setting such that we can't race against cgroup_exit() 2344 return 0;
2079 * changing the css_set to init_css_set and dropping the old one.
2080 */
2081 WARN_ON_ONCE(tsk->flags & PF_EXITING);
2082 old_cset = task_css_set(tsk);
2083 2345
2084 get_css_set(new_cset); 2346 /* check that we can legitimately attach to the cgroup */
2085 rcu_assign_pointer(tsk->cgroups, new_cset); 2347 for_each_e_css(css, i, dst_cgrp) {
2348 if (css->ss->can_attach) {
2349 ret = css->ss->can_attach(css, tset);
2350 if (ret) {
2351 failed_css = css;
2352 goto out_cancel_attach;
2353 }
2354 }
2355 }
2086 2356
2087 /* 2357 /*
2088 * Use move_tail so that cgroup_taskset_first() still returns the 2358 * Now that we're guaranteed success, proceed to move all tasks to
2089 * leader after migration. This works because cgroup_migrate() 2359 * the new cgroup. There are no failure cases after here, so this
2090 * ensures that the dst_cset of the leader is the first on the 2360 * is the commit point.
2091 * tset's dst_csets list.
2092 */ 2361 */
2093 list_move_tail(&tsk->cg_list, &new_cset->mg_tasks); 2362 spin_lock_bh(&css_set_lock);
2363 list_for_each_entry(cset, &tset->src_csets, mg_node) {
2364 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
2365 struct css_set *from_cset = task_css_set(task);
2366 struct css_set *to_cset = cset->mg_dst_cset;
2367
2368 get_css_set(to_cset);
2369 css_set_move_task(task, from_cset, to_cset, true);
2370 put_css_set_locked(from_cset);
2371 }
2372 }
2373 spin_unlock_bh(&css_set_lock);
2094 2374
2095 /* 2375 /*
2096 * We just gained a reference on old_cset by taking it from the 2376 * Migration is committed, all target tasks are now on dst_csets.
2097 * task. As trading it for new_cset is protected by cgroup_mutex, 2377 * Nothing is sensitive to fork() after this point. Notify
2098 * we're safe to drop it here; it will be freed under RCU. 2378 * controllers that migration is complete.
2099 */ 2379 */
2100 put_css_set_locked(old_cset); 2380 tset->csets = &tset->dst_csets;
2381
2382 for_each_e_css(css, i, dst_cgrp)
2383 if (css->ss->attach)
2384 css->ss->attach(css, tset);
2385
2386 ret = 0;
2387 goto out_release_tset;
2388
2389out_cancel_attach:
2390 for_each_e_css(css, i, dst_cgrp) {
2391 if (css == failed_css)
2392 break;
2393 if (css->ss->cancel_attach)
2394 css->ss->cancel_attach(css, tset);
2395 }
2396out_release_tset:
2397 spin_lock_bh(&css_set_lock);
2398 list_splice_init(&tset->dst_csets, &tset->src_csets);
2399 list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
2400 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2401 list_del_init(&cset->mg_node);
2402 }
2403 spin_unlock_bh(&css_set_lock);
2404 return ret;
2101} 2405}
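
cgroup_taskset_migrate() is a two-phase, all-or-nothing commit: every ->can_attach() gets a veto before any task is moved; a veto unwinds only the controllers that had already agreed via ->cancel_attach(); past the commit point the tasks move and the ->attach() notifications cannot fail. A compact sketch of that shape with hypothetical callbacks, not the cgroup API itself:

#include <stdio.h>

struct toy_controller {
	const char *name;
	int (*can_attach)(void);	/* may veto, must not change state */
	void (*attach)(void);		/* commit notification, cannot fail */
	void (*cancel_attach)(void);	/* undo a prior can_attach */
};

static int migrate(struct toy_controller *cs, int n)
{
	int i, ret;

	for (i = 0; i < n; i++) {		/* phase 1: ask everyone */
		if (cs[i].can_attach && (ret = cs[i].can_attach())) {
			while (i-- > 0)		/* unwind those that agreed */
				if (cs[i].cancel_attach)
					cs[i].cancel_attach();
			return ret;
		}
	}

	/* ... commit point: actually move the tasks here ... */

	for (i = 0; i < n; i++)			/* phase 2: notify everyone */
		if (cs[i].attach)
			cs[i].attach();
	return 0;
}

static int ok(void) { return 0; }
static int veto(void) { return -1; }
static void note(void) { puts("attached"); }
static void undo(void) { puts("cancelled"); }

int main(void)
{
	struct toy_controller good[] = { {"a", ok, note, undo}, {"b", ok, note, undo} };
	struct toy_controller bad[]  = { {"a", ok, note, undo}, {"b", veto, note, undo} };

	printf("good: %d\n", migrate(good, 2));	/* prints "attached" twice, 0 */
	printf("bad:  %d\n", migrate(bad, 2));	/* prints "cancelled" once, -1 */
	return 0;
}

With this helper in place, cgroup_migrate() and cgroup_update_dfl_csses() further down both just build a CGROUP_TASKSET_INIT taskset and hand it over, instead of duplicating the veto/unwind logic.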
2102 2406
2103/** 2407/**
@@ -2113,14 +2417,14 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
2113 2417
2114 lockdep_assert_held(&cgroup_mutex); 2418 lockdep_assert_held(&cgroup_mutex);
2115 2419
2116 down_write(&css_set_rwsem); 2420 spin_lock_bh(&css_set_lock);
2117 list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { 2421 list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
2118 cset->mg_src_cgrp = NULL; 2422 cset->mg_src_cgrp = NULL;
2119 cset->mg_dst_cset = NULL; 2423 cset->mg_dst_cset = NULL;
2120 list_del_init(&cset->mg_preload_node); 2424 list_del_init(&cset->mg_preload_node);
2121 put_css_set_locked(cset); 2425 put_css_set_locked(cset);
2122 } 2426 }
2123 up_write(&css_set_rwsem); 2427 spin_unlock_bh(&css_set_lock);
2124} 2428}
2125 2429
2126/** 2430/**
@@ -2146,7 +2450,7 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
2146 struct cgroup *src_cgrp; 2450 struct cgroup *src_cgrp;
2147 2451
2148 lockdep_assert_held(&cgroup_mutex); 2452 lockdep_assert_held(&cgroup_mutex);
2149 lockdep_assert_held(&css_set_rwsem); 2453 lockdep_assert_held(&css_set_lock);
2150 2454
2151 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); 2455 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
2152 2456
@@ -2235,9 +2539,9 @@ err:
2235 2539
2236/** 2540/**
2237 * cgroup_migrate - migrate a process or task to a cgroup 2541 * cgroup_migrate - migrate a process or task to a cgroup
2238 * @cgrp: the destination cgroup
2239 * @leader: the leader of the process or the task to migrate 2542 * @leader: the leader of the process or the task to migrate
2240 * @threadgroup: whether @leader points to the whole process or a single task 2543 * @threadgroup: whether @leader points to the whole process or a single task
2544 * @cgrp: the destination cgroup
2241 * 2545 *
2242 * Migrate a process or task denoted by @leader to @cgrp. If migrating a 2546 * Migrate a process or task denoted by @leader to @cgrp. If migrating a
2243 * process, the caller must be holding cgroup_threadgroup_rwsem. The 2547 * process, the caller must be holding cgroup_threadgroup_rwsem. The
@@ -2251,115 +2555,29 @@ err:
2251 * decided for all targets by invoking group_migrate_prepare_dst() before 2555 * decided for all targets by invoking group_migrate_prepare_dst() before
2252 * actually starting migrating. 2556 * actually starting migrating.
2253 */ 2557 */
2254static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, 2558static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
2255 bool threadgroup) 2559 struct cgroup *cgrp)
2256{ 2560{
2257 struct cgroup_taskset tset = { 2561 struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
2258 .src_csets = LIST_HEAD_INIT(tset.src_csets), 2562 struct task_struct *task;
2259 .dst_csets = LIST_HEAD_INIT(tset.dst_csets),
2260 .csets = &tset.src_csets,
2261 };
2262 struct cgroup_subsys_state *css, *failed_css = NULL;
2263 struct css_set *cset, *tmp_cset;
2264 struct task_struct *task, *tmp_task;
2265 int i, ret;
2266 2563
2267 /* 2564 /*
2268 * Prevent freeing of tasks while we take a snapshot. Tasks that are 2565 * Prevent freeing of tasks while we take a snapshot. Tasks that are
2269 * already PF_EXITING could be freed from underneath us unless we 2566 * already PF_EXITING could be freed from underneath us unless we
2270 * take an rcu_read_lock. 2567 * take an rcu_read_lock.
2271 */ 2568 */
2272 down_write(&css_set_rwsem); 2569 spin_lock_bh(&css_set_lock);
2273 rcu_read_lock(); 2570 rcu_read_lock();
2274 task = leader; 2571 task = leader;
2275 do { 2572 do {
2276 /* @task either already exited or can't exit until the end */ 2573 cgroup_taskset_add(task, &tset);
2277 if (task->flags & PF_EXITING)
2278 goto next;
2279
2280 /* leave @task alone if post_fork() hasn't linked it yet */
2281 if (list_empty(&task->cg_list))
2282 goto next;
2283
2284 cset = task_css_set(task);
2285 if (!cset->mg_src_cgrp)
2286 goto next;
2287
2288 /*
2289 * cgroup_taskset_first() must always return the leader.
2290 * Take care to avoid disturbing the ordering.
2291 */
2292 list_move_tail(&task->cg_list, &cset->mg_tasks);
2293 if (list_empty(&cset->mg_node))
2294 list_add_tail(&cset->mg_node, &tset.src_csets);
2295 if (list_empty(&cset->mg_dst_cset->mg_node))
2296 list_move_tail(&cset->mg_dst_cset->mg_node,
2297 &tset.dst_csets);
2298 next:
2299 if (!threadgroup) 2574 if (!threadgroup)
2300 break; 2575 break;
2301 } while_each_thread(leader, task); 2576 } while_each_thread(leader, task);
2302 rcu_read_unlock(); 2577 rcu_read_unlock();
2303 up_write(&css_set_rwsem); 2578 spin_unlock_bh(&css_set_lock);
2304
2305 /* methods shouldn't be called if no task is actually migrating */
2306 if (list_empty(&tset.src_csets))
2307 return 0;
2308
2309 /* check that we can legitimately attach to the cgroup */
2310 for_each_e_css(css, i, cgrp) {
2311 if (css->ss->can_attach) {
2312 ret = css->ss->can_attach(css, &tset);
2313 if (ret) {
2314 failed_css = css;
2315 goto out_cancel_attach;
2316 }
2317 }
2318 }
2319
2320 /*
2321 * Now that we're guaranteed success, proceed to move all tasks to
2322 * the new cgroup. There are no failure cases after here, so this
2323 * is the commit point.
2324 */
2325 down_write(&css_set_rwsem);
2326 list_for_each_entry(cset, &tset.src_csets, mg_node) {
2327 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
2328 cgroup_task_migrate(cset->mg_src_cgrp, task,
2329 cset->mg_dst_cset);
2330 }
2331 up_write(&css_set_rwsem);
2332
2333 /*
2334 * Migration is committed, all target tasks are now on dst_csets.
2335 * Nothing is sensitive to fork() after this point. Notify
2336 * controllers that migration is complete.
2337 */
2338 tset.csets = &tset.dst_csets;
2339 2579
2340 for_each_e_css(css, i, cgrp) 2580 return cgroup_taskset_migrate(&tset, cgrp);
2341 if (css->ss->attach)
2342 css->ss->attach(css, &tset);
2343
2344 ret = 0;
2345 goto out_release_tset;
2346
2347out_cancel_attach:
2348 for_each_e_css(css, i, cgrp) {
2349 if (css == failed_css)
2350 break;
2351 if (css->ss->cancel_attach)
2352 css->ss->cancel_attach(css, &tset);
2353 }
2354out_release_tset:
2355 down_write(&css_set_rwsem);
2356 list_splice_init(&tset.dst_csets, &tset.src_csets);
2357 list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
2358 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2359 list_del_init(&cset->mg_node);
2360 }
2361 up_write(&css_set_rwsem);
2362 return ret;
2363} 2581}
2364 2582
2365/** 2583/**
@@ -2378,7 +2596,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
2378 int ret; 2596 int ret;
2379 2597
2380 /* look up all src csets */ 2598 /* look up all src csets */
2381 down_read(&css_set_rwsem); 2599 spin_lock_bh(&css_set_lock);
2382 rcu_read_lock(); 2600 rcu_read_lock();
2383 task = leader; 2601 task = leader;
2384 do { 2602 do {
@@ -2388,12 +2606,12 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
2388 break; 2606 break;
2389 } while_each_thread(leader, task); 2607 } while_each_thread(leader, task);
2390 rcu_read_unlock(); 2608 rcu_read_unlock();
2391 up_read(&css_set_rwsem); 2609 spin_unlock_bh(&css_set_lock);
2392 2610
2393 /* prepare dst csets and commit */ 2611 /* prepare dst csets and commit */
2394 ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); 2612 ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
2395 if (!ret) 2613 if (!ret)
2396 ret = cgroup_migrate(dst_cgrp, leader, threadgroup); 2614 ret = cgroup_migrate(leader, threadgroup, dst_cgrp);
2397 2615
2398 cgroup_migrate_finish(&preloaded_csets); 2616 cgroup_migrate_finish(&preloaded_csets);
2399 return ret; 2617 return ret;
@@ -2421,15 +2639,15 @@ static int cgroup_procs_write_permission(struct task_struct *task,
2421 struct cgroup *cgrp; 2639 struct cgroup *cgrp;
2422 struct inode *inode; 2640 struct inode *inode;
2423 2641
2424 down_read(&css_set_rwsem); 2642 spin_lock_bh(&css_set_lock);
2425 cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); 2643 cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
2426 up_read(&css_set_rwsem); 2644 spin_unlock_bh(&css_set_lock);
2427 2645
2428 while (!cgroup_is_descendant(dst_cgrp, cgrp)) 2646 while (!cgroup_is_descendant(dst_cgrp, cgrp))
2429 cgrp = cgroup_parent(cgrp); 2647 cgrp = cgroup_parent(cgrp);
2430 2648
2431 ret = -ENOMEM; 2649 ret = -ENOMEM;
2432 inode = kernfs_get_inode(sb, cgrp->procs_kn); 2650 inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
2433 if (inode) { 2651 if (inode) {
2434 ret = inode_permission(inode, MAY_WRITE); 2652 ret = inode_permission(inode, MAY_WRITE);
2435 iput(inode); 2653 iput(inode);
@@ -2520,9 +2738,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2520 if (root == &cgrp_dfl_root) 2738 if (root == &cgrp_dfl_root)
2521 continue; 2739 continue;
2522 2740
2523 down_read(&css_set_rwsem); 2741 spin_lock_bh(&css_set_lock);
2524 from_cgrp = task_cgroup_from_root(from, root); 2742 from_cgrp = task_cgroup_from_root(from, root);
2525 up_read(&css_set_rwsem); 2743 spin_unlock_bh(&css_set_lock);
2526 2744
2527 retval = cgroup_attach_task(from_cgrp, tsk, false); 2745 retval = cgroup_attach_task(from_cgrp, tsk, false);
2528 if (retval) 2746 if (retval)
@@ -2637,6 +2855,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2637static int cgroup_update_dfl_csses(struct cgroup *cgrp) 2855static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2638{ 2856{
2639 LIST_HEAD(preloaded_csets); 2857 LIST_HEAD(preloaded_csets);
2858 struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
2640 struct cgroup_subsys_state *css; 2859 struct cgroup_subsys_state *css;
2641 struct css_set *src_cset; 2860 struct css_set *src_cset;
2642 int ret; 2861 int ret;
@@ -2646,7 +2865,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2646 percpu_down_write(&cgroup_threadgroup_rwsem); 2865 percpu_down_write(&cgroup_threadgroup_rwsem);
2647 2866
2648 /* look up all csses currently attached to @cgrp's subtree */ 2867 /* look up all csses currently attached to @cgrp's subtree */
2649 down_read(&css_set_rwsem); 2868 spin_lock_bh(&css_set_lock);
2650 css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { 2869 css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
2651 struct cgrp_cset_link *link; 2870 struct cgrp_cset_link *link;
2652 2871
@@ -2658,57 +2877,28 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2658 cgroup_migrate_add_src(link->cset, cgrp, 2877 cgroup_migrate_add_src(link->cset, cgrp,
2659 &preloaded_csets); 2878 &preloaded_csets);
2660 } 2879 }
2661 up_read(&css_set_rwsem); 2880 spin_unlock_bh(&css_set_lock);
2662 2881
2663 /* NULL dst indicates self on default hierarchy */ 2882 /* NULL dst indicates self on default hierarchy */
2664 ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets); 2883 ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
2665 if (ret) 2884 if (ret)
2666 goto out_finish; 2885 goto out_finish;
2667 2886
2887 spin_lock_bh(&css_set_lock);
2668 list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { 2888 list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
2669 struct task_struct *last_task = NULL, *task; 2889 struct task_struct *task, *ntask;
2670 2890
2671 /* src_csets precede dst_csets, break on the first dst_cset */ 2891 /* src_csets precede dst_csets, break on the first dst_cset */
2672 if (!src_cset->mg_src_cgrp) 2892 if (!src_cset->mg_src_cgrp)
2673 break; 2893 break;
2674 2894
2675 /* 2895 /* all tasks in src_csets need to be migrated */
2676 * All tasks in src_cset need to be migrated to the 2896 list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
2677 * matching dst_cset. Empty it process by process. We 2897 cgroup_taskset_add(task, &tset);
2678 * walk tasks but migrate processes. The leader might even
2679 * belong to a different cset but such src_cset would also
2680 * be among the target src_csets because the default
2681 * hierarchy enforces per-process membership.
2682 */
2683 while (true) {
2684 down_read(&css_set_rwsem);
2685 task = list_first_entry_or_null(&src_cset->tasks,
2686 struct task_struct, cg_list);
2687 if (task) {
2688 task = task->group_leader;
2689 WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
2690 get_task_struct(task);
2691 }
2692 up_read(&css_set_rwsem);
2693
2694 if (!task)
2695 break;
2696
2697 /* guard against possible infinite loop */
2698 if (WARN(last_task == task,
2699 "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
2700 goto out_finish;
2701 last_task = task;
2702
2703 ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
2704
2705 put_task_struct(task);
2706
2707 if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
2708 goto out_finish;
2709 }
2710 } 2898 }
2899 spin_unlock_bh(&css_set_lock);
2711 2900
2901 ret = cgroup_taskset_migrate(&tset, cgrp);
2712out_finish: 2902out_finish:
2713 cgroup_migrate_finish(&preloaded_csets); 2903 cgroup_migrate_finish(&preloaded_csets);
2714 percpu_up_write(&cgroup_threadgroup_rwsem); 2904 percpu_up_write(&cgroup_threadgroup_rwsem);
@@ -2738,7 +2928,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2738 if (tok[0] == '\0') 2928 if (tok[0] == '\0')
2739 continue; 2929 continue;
2740 for_each_subsys_which(ss, ssid, &tmp_ss_mask) { 2930 for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
2741 if (ss->disabled || strcmp(tok + 1, ss->name)) 2931 if (!cgroup_ssid_enabled(ssid) ||
2932 strcmp(tok + 1, ss->name))
2742 continue; 2933 continue;
2743 2934
2744 if (*tok == '+') { 2935 if (*tok == '+') {
@@ -2862,7 +3053,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2862 ret = create_css(child, ss, 3053 ret = create_css(child, ss,
2863 cgrp->subtree_control & (1 << ssid)); 3054 cgrp->subtree_control & (1 << ssid));
2864 else 3055 else
2865 ret = cgroup_populate_dir(child, 1 << ssid); 3056 ret = css_populate_dir(cgroup_css(child, ss),
3057 NULL);
2866 if (ret) 3058 if (ret)
2867 goto err_undo_css; 3059 goto err_undo_css;
2868 } 3060 }
@@ -2895,7 +3087,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2895 if (css_disable & (1 << ssid)) { 3087 if (css_disable & (1 << ssid)) {
2896 kill_css(css); 3088 kill_css(css);
2897 } else { 3089 } else {
2898 cgroup_clear_dir(child, 1 << ssid); 3090 css_clear_dir(css, NULL);
2899 if (ss->css_reset) 3091 if (ss->css_reset)
2900 ss->css_reset(css); 3092 ss->css_reset(css);
2901 } 3093 }
@@ -2943,15 +3135,16 @@ err_undo_css:
2943 if (css_enable & (1 << ssid)) 3135 if (css_enable & (1 << ssid))
2944 kill_css(css); 3136 kill_css(css);
2945 else 3137 else
2946 cgroup_clear_dir(child, 1 << ssid); 3138 css_clear_dir(css, NULL);
2947 } 3139 }
2948 } 3140 }
2949 goto out_unlock; 3141 goto out_unlock;
2950} 3142}
2951 3143
2952static int cgroup_populated_show(struct seq_file *seq, void *v) 3144static int cgroup_events_show(struct seq_file *seq, void *v)
2953{ 3145{
2954 seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt); 3146 seq_printf(seq, "populated %d\n",
3147 cgroup_is_populated(seq_css(seq)->cgroup));
2955 return 0; 3148 return 0;
2956} 3149}
2957 3150
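Side note on the hunk above: the old cgroup.populated handler becomes cgroup_events_show(), which now emits a "populated <0|1>" key/value line instead of a bare integer (the file itself is renamed to cgroup.events further down in this patch), presumably so more event keys can be added later. As a hedged userspace-side sketch, not part of the patch, reading the new format might look like:

/* Hedged sketch: parse "populated <0|1>" from a cgroup.events file. */
#include <stdio.h>
#include <string.h>

static int cgroup_events_populated(const char *events_path)
{
	char key[32];
	int val = -1;
	FILE *f = fopen(events_path, "r");

	if (!f)
		return -1;
	while (fscanf(f, "%31s %d", key, &val) == 2) {
		if (!strcmp(key, "populated"))
			break;
		val = -1;
	}
	fclose(f);
	return val;	/* 1, 0, or -1 if the key was missing */
}
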
@@ -3094,7 +3287,8 @@ static int cgroup_kn_set_ugid(struct kernfs_node *kn)
3094 return kernfs_setattr(kn, &iattr); 3287 return kernfs_setattr(kn, &iattr);
3095} 3288}
3096 3289
3097static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) 3290static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
3291 struct cftype *cft)
3098{ 3292{
3099 char name[CGROUP_FILE_NAME_MAX]; 3293 char name[CGROUP_FILE_NAME_MAX];
3100 struct kernfs_node *kn; 3294 struct kernfs_node *kn;
@@ -3116,33 +3310,38 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
3116 return ret; 3310 return ret;
3117 } 3311 }
3118 3312
3119 if (cft->write == cgroup_procs_write) 3313 if (cft->file_offset) {
3120 cgrp->procs_kn = kn; 3314 struct cgroup_file *cfile = (void *)css + cft->file_offset;
3121 else if (cft->seq_show == cgroup_populated_show) 3315
3122 cgrp->populated_kn = kn; 3316 kernfs_get(kn);
3317 cfile->kn = kn;
3318 list_add(&cfile->node, &css->files);
3319 }
3320
3123 return 0; 3321 return 0;
3124} 3322}
3125 3323
3126/** 3324/**
3127 * cgroup_addrm_files - add or remove files to a cgroup directory 3325 * cgroup_addrm_files - add or remove files to a cgroup directory
3128 * @cgrp: the target cgroup 3326 * @css: the target css
3327 * @cgrp: the target cgroup (usually css->cgroup)
3129 * @cfts: array of cftypes to be added 3328 * @cfts: array of cftypes to be added
3130 * @is_add: whether to add or remove 3329 * @is_add: whether to add or remove
3131 * 3330 *
3132 * Depending on @is_add, add or remove files defined by @cfts on @cgrp. 3331 * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
3133 * For removals, this function never fails. If addition fails, this 3332 * For removals, this function never fails.
3134 * function doesn't remove files already added. The caller is responsible
3135 * for cleaning up.
3136 */ 3333 */
3137static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 3334static int cgroup_addrm_files(struct cgroup_subsys_state *css,
3335 struct cgroup *cgrp, struct cftype cfts[],
3138 bool is_add) 3336 bool is_add)
3139{ 3337{
3140 struct cftype *cft; 3338 struct cftype *cft, *cft_end = NULL;
3141 int ret; 3339 int ret;
3142 3340
3143 lockdep_assert_held(&cgroup_mutex); 3341 lockdep_assert_held(&cgroup_mutex);
3144 3342
3145 for (cft = cfts; cft->name[0] != '\0'; cft++) { 3343restart:
3344 for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
3146 /* does cft->flags tell us to skip this file on @cgrp? */ 3345 /* does cft->flags tell us to skip this file on @cgrp? */
3147 if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) 3346 if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
3148 continue; 3347 continue;
@@ -3154,11 +3353,13 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
3154 continue; 3353 continue;
3155 3354
3156 if (is_add) { 3355 if (is_add) {
3157 ret = cgroup_add_file(cgrp, cft); 3356 ret = cgroup_add_file(css, cgrp, cft);
3158 if (ret) { 3357 if (ret) {
3159 pr_warn("%s: failed to add %s, err=%d\n", 3358 pr_warn("%s: failed to add %s, err=%d\n",
3160 __func__, cft->name, ret); 3359 __func__, cft->name, ret);
3161 return ret; 3360 cft_end = cft;
3361 is_add = false;
3362 goto restart;
3162 } 3363 }
3163 } else { 3364 } else {
3164 cgroup_rm_file(cgrp, cft); 3365 cgroup_rm_file(cgrp, cft);
@@ -3184,7 +3385,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
3184 if (cgroup_is_dead(cgrp)) 3385 if (cgroup_is_dead(cgrp))
3185 continue; 3386 continue;
3186 3387
3187 ret = cgroup_addrm_files(cgrp, cfts, is_add); 3388 ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
3188 if (ret) 3389 if (ret)
3189 break; 3390 break;
3190 } 3391 }
@@ -3296,7 +3497,7 @@ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3296{ 3497{
3297 int ret; 3498 int ret;
3298 3499
3299 if (ss->disabled) 3500 if (!cgroup_ssid_enabled(ss->id))
3300 return 0; 3501 return 0;
3301 3502
3302 if (!cfts || cfts[0].name[0] == '\0') 3503 if (!cfts || cfts[0].name[0] == '\0')
@@ -3346,17 +3547,8 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3346{ 3547{
3347 struct cftype *cft; 3548 struct cftype *cft;
3348 3549
3349 /* 3550 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3350 * If legacy_flies_on_dfl, we want to show the legacy files on the 3551 cft->flags |= __CFTYPE_NOT_ON_DFL;
3351 * dfl hierarchy but iff the target subsystem hasn't been updated
3352 * for the dfl hierarchy yet.
3353 */
3354 if (!cgroup_legacy_files_on_dfl ||
3355 ss->dfl_cftypes != ss->legacy_cftypes) {
3356 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3357 cft->flags |= __CFTYPE_NOT_ON_DFL;
3358 }
3359
3360 return cgroup_add_cftypes(ss, cfts); 3552 return cgroup_add_cftypes(ss, cfts);
3361} 3553}
3362 3554
@@ -3371,10 +3563,10 @@ static int cgroup_task_count(const struct cgroup *cgrp)
3371 int count = 0; 3563 int count = 0;
3372 struct cgrp_cset_link *link; 3564 struct cgrp_cset_link *link;
3373 3565
3374 down_read(&css_set_rwsem); 3566 spin_lock_bh(&css_set_lock);
3375 list_for_each_entry(link, &cgrp->cset_links, cset_link) 3567 list_for_each_entry(link, &cgrp->cset_links, cset_link)
3376 count += atomic_read(&link->cset->refcount); 3568 count += atomic_read(&link->cset->refcount);
3377 up_read(&css_set_rwsem); 3569 spin_unlock_bh(&css_set_lock);
3378 return count; 3570 return count;
3379} 3571}
3380 3572
@@ -3606,22 +3798,25 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
3606} 3798}
3607 3799
3608/** 3800/**
3609 * css_advance_task_iter - advance a task iterator to the next css_set 3801 * css_task_iter_advance_css_set - advance a task iterator to the next css_set
3610 * @it: the iterator to advance 3802 * @it: the iterator to advance
3611 * 3803 *
3612 * Advance @it to the next css_set to walk. 3804 * Advance @it to the next css_set to walk.
3613 */ 3805 */
3614static void css_advance_task_iter(struct css_task_iter *it) 3806static void css_task_iter_advance_css_set(struct css_task_iter *it)
3615{ 3807{
3616 struct list_head *l = it->cset_pos; 3808 struct list_head *l = it->cset_pos;
3617 struct cgrp_cset_link *link; 3809 struct cgrp_cset_link *link;
3618 struct css_set *cset; 3810 struct css_set *cset;
3619 3811
3812 lockdep_assert_held(&css_set_lock);
3813
3620 /* Advance to the next non-empty css_set */ 3814 /* Advance to the next non-empty css_set */
3621 do { 3815 do {
3622 l = l->next; 3816 l = l->next;
3623 if (l == it->cset_head) { 3817 if (l == it->cset_head) {
3624 it->cset_pos = NULL; 3818 it->cset_pos = NULL;
3819 it->task_pos = NULL;
3625 return; 3820 return;
3626 } 3821 }
3627 3822
@@ -3632,7 +3827,7 @@ static void css_advance_task_iter(struct css_task_iter *it)
3632 link = list_entry(l, struct cgrp_cset_link, cset_link); 3827 link = list_entry(l, struct cgrp_cset_link, cset_link);
3633 cset = link->cset; 3828 cset = link->cset;
3634 } 3829 }
3635 } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); 3830 } while (!css_set_populated(cset));
3636 3831
3637 it->cset_pos = l; 3832 it->cset_pos = l;
3638 3833
@@ -3643,6 +3838,52 @@ static void css_advance_task_iter(struct css_task_iter *it)
3643 3838
3644 it->tasks_head = &cset->tasks; 3839 it->tasks_head = &cset->tasks;
3645 it->mg_tasks_head = &cset->mg_tasks; 3840 it->mg_tasks_head = &cset->mg_tasks;
3841
3842 /*
3843 * We don't keep css_sets locked across iteration steps and thus
3844 * need to take steps to ensure that iteration can be resumed after
3845 * the lock is re-acquired. Iteration is performed at two levels -
3846 * css_sets and tasks in them.
3847 *
3848 * Once created, a css_set never leaves its cgroup lists, so a
3849 * pinned css_set is guaranteed to stay put and we can resume
3850 * iteration afterwards.
3851 *
3852 * Tasks may leave @cset across iteration steps. This is resolved
3853 * by registering each iterator with the css_set currently being
3854 * walked and making css_set_move_task() advance iterators whose
3855 * next task is leaving.
3856 */
3857 if (it->cur_cset) {
3858 list_del(&it->iters_node);
3859 put_css_set_locked(it->cur_cset);
3860 }
3861 get_css_set(cset);
3862 it->cur_cset = cset;
3863 list_add(&it->iters_node, &cset->task_iters);
3864}
3865
3866static void css_task_iter_advance(struct css_task_iter *it)
3867{
3868 struct list_head *l = it->task_pos;
3869
3870 lockdep_assert_held(&css_set_lock);
3871 WARN_ON_ONCE(!l);
3872
3873 /*
3874 * Advance iterator to find next entry. cset->tasks is consumed
3875 * first and then ->mg_tasks. After ->mg_tasks, we move onto the
3876 * next cset.
3877 */
3878 l = l->next;
3879
3880 if (l == it->tasks_head)
3881 l = it->mg_tasks_head->next;
3882
3883 if (l == it->mg_tasks_head)
3884 css_task_iter_advance_css_set(it);
3885 else
3886 it->task_pos = l;
3646} 3887}
3647 3888
3648/** 3889/**
@@ -3654,19 +3895,16 @@ static void css_advance_task_iter(struct css_task_iter *it)
3654 * css_task_iter_next() to walk through the tasks until the function 3895 * css_task_iter_next() to walk through the tasks until the function
3655 * returns NULL. On completion of iteration, css_task_iter_end() must be 3896 * returns NULL. On completion of iteration, css_task_iter_end() must be
3656 * called. 3897 * called.
3657 *
3658 * Note that this function acquires a lock which is released when the
3659 * iteration finishes. The caller can't sleep while iteration is in
3660 * progress.
3661 */ 3898 */
3662void css_task_iter_start(struct cgroup_subsys_state *css, 3899void css_task_iter_start(struct cgroup_subsys_state *css,
3663 struct css_task_iter *it) 3900 struct css_task_iter *it)
3664 __acquires(css_set_rwsem)
3665{ 3901{
3666 /* no one should try to iterate before mounting cgroups */ 3902 /* no one should try to iterate before mounting cgroups */
3667 WARN_ON_ONCE(!use_task_css_set_links); 3903 WARN_ON_ONCE(!use_task_css_set_links);
3668 3904
3669 down_read(&css_set_rwsem); 3905 memset(it, 0, sizeof(*it));
3906
3907 spin_lock_bh(&css_set_lock);
3670 3908
3671 it->ss = css->ss; 3909 it->ss = css->ss;
3672 3910
@@ -3677,7 +3915,9 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
3677 3915
3678 it->cset_head = it->cset_pos; 3916 it->cset_head = it->cset_pos;
3679 3917
3680 css_advance_task_iter(it); 3918 css_task_iter_advance_css_set(it);
3919
3920 spin_unlock_bh(&css_set_lock);
3681} 3921}
3682 3922
3683/** 3923/**
@@ -3690,30 +3930,23 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
3690 */ 3930 */
3691struct task_struct *css_task_iter_next(struct css_task_iter *it) 3931struct task_struct *css_task_iter_next(struct css_task_iter *it)
3692{ 3932{
3693 struct task_struct *res; 3933 if (it->cur_task) {
3694 struct list_head *l = it->task_pos; 3934 put_task_struct(it->cur_task);
3935 it->cur_task = NULL;
3936 }
3695 3937
3696 /* If the iterator cg is NULL, we have no tasks */ 3938 spin_lock_bh(&css_set_lock);
3697 if (!it->cset_pos)
3698 return NULL;
3699 res = list_entry(l, struct task_struct, cg_list);
3700 3939
3701 /* 3940 if (it->task_pos) {
3702 * Advance iterator to find next entry. cset->tasks is consumed 3941 it->cur_task = list_entry(it->task_pos, struct task_struct,
3703 * first and then ->mg_tasks. After ->mg_tasks, we move onto the 3942 cg_list);
3704 * next cset. 3943 get_task_struct(it->cur_task);
3705 */ 3944 css_task_iter_advance(it);
3706 l = l->next; 3945 }
3707 3946
3708 if (l == it->tasks_head) 3947 spin_unlock_bh(&css_set_lock);
3709 l = it->mg_tasks_head->next;
3710 3948
3711 if (l == it->mg_tasks_head) 3949 return it->cur_task;
3712 css_advance_task_iter(it);
3713 else
3714 it->task_pos = l;
3715
3716 return res;
3717} 3950}
3718 3951
3719/** 3952/**
@@ -3723,9 +3956,16 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
3723 * Finish task iteration started by css_task_iter_start(). 3956 * Finish task iteration started by css_task_iter_start().
3724 */ 3957 */
3725void css_task_iter_end(struct css_task_iter *it) 3958void css_task_iter_end(struct css_task_iter *it)
3726 __releases(css_set_rwsem)
3727{ 3959{
3728 up_read(&css_set_rwsem); 3960 if (it->cur_cset) {
3961 spin_lock_bh(&css_set_lock);
3962 list_del(&it->iters_node);
3963 put_css_set_locked(it->cur_cset);
3964 spin_unlock_bh(&css_set_lock);
3965 }
3966
3967 if (it->cur_task)
3968 put_task_struct(it->cur_task);
3729} 3969}
3730 3970
3731/** 3971/**
@@ -3750,10 +3990,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3750 mutex_lock(&cgroup_mutex); 3990 mutex_lock(&cgroup_mutex);
3751 3991
3752 /* all tasks in @from are being moved, all csets are source */ 3992 /* all tasks in @from are being moved, all csets are source */
3753 down_read(&css_set_rwsem); 3993 spin_lock_bh(&css_set_lock);
3754 list_for_each_entry(link, &from->cset_links, cset_link) 3994 list_for_each_entry(link, &from->cset_links, cset_link)
3755 cgroup_migrate_add_src(link->cset, to, &preloaded_csets); 3995 cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
3756 up_read(&css_set_rwsem); 3996 spin_unlock_bh(&css_set_lock);
3757 3997
3758 ret = cgroup_migrate_prepare_dst(to, &preloaded_csets); 3998 ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
3759 if (ret) 3999 if (ret)
@@ -3771,7 +4011,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3771 css_task_iter_end(&it); 4011 css_task_iter_end(&it);
3772 4012
3773 if (task) { 4013 if (task) {
3774 ret = cgroup_migrate(to, task, false); 4014 ret = cgroup_migrate(task, false, to);
3775 put_task_struct(task); 4015 put_task_struct(task);
3776 } 4016 }
3777 } while (task && !ret); 4017 } while (task && !ret);
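The iterator rework above replaces the css_set_rwsem read lock held across the whole walk with per-step css_set_lock acquisition; the iterator now pins the current css_set and task so the walk can resume after the lock is dropped. A minimal caller-side sketch under the new scheme (illustrative only, not taken from the patch):

/*
 * Hedged sketch: walking the tasks of a css with the reworked iterator.
 * css_task_iter_next() returns a task pinned by the iterator, and since
 * no lock is held across steps the loop body may sleep.
 */
static void example_walk_css_tasks(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it)))
		pr_info("pid %d\n", task_pid_nr(task));
	css_task_iter_end(&it);
}
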
@@ -4268,13 +4508,13 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
4268static struct cftype cgroup_dfl_base_files[] = { 4508static struct cftype cgroup_dfl_base_files[] = {
4269 { 4509 {
4270 .name = "cgroup.procs", 4510 .name = "cgroup.procs",
4511 .file_offset = offsetof(struct cgroup, procs_file),
4271 .seq_start = cgroup_pidlist_start, 4512 .seq_start = cgroup_pidlist_start,
4272 .seq_next = cgroup_pidlist_next, 4513 .seq_next = cgroup_pidlist_next,
4273 .seq_stop = cgroup_pidlist_stop, 4514 .seq_stop = cgroup_pidlist_stop,
4274 .seq_show = cgroup_pidlist_show, 4515 .seq_show = cgroup_pidlist_show,
4275 .private = CGROUP_FILE_PROCS, 4516 .private = CGROUP_FILE_PROCS,
4276 .write = cgroup_procs_write, 4517 .write = cgroup_procs_write,
4277 .mode = S_IRUGO | S_IWUSR,
4278 }, 4518 },
4279 { 4519 {
4280 .name = "cgroup.controllers", 4520 .name = "cgroup.controllers",
@@ -4292,9 +4532,10 @@ static struct cftype cgroup_dfl_base_files[] = {
4292 .write = cgroup_subtree_control_write, 4532 .write = cgroup_subtree_control_write,
4293 }, 4533 },
4294 { 4534 {
4295 .name = "cgroup.populated", 4535 .name = "cgroup.events",
4296 .flags = CFTYPE_NOT_ON_ROOT, 4536 .flags = CFTYPE_NOT_ON_ROOT,
4297 .seq_show = cgroup_populated_show, 4537 .file_offset = offsetof(struct cgroup, events_file),
4538 .seq_show = cgroup_events_show,
4298 }, 4539 },
4299 { } /* terminate */ 4540 { } /* terminate */
4300}; 4541};
@@ -4309,7 +4550,6 @@ static struct cftype cgroup_legacy_base_files[] = {
4309 .seq_show = cgroup_pidlist_show, 4550 .seq_show = cgroup_pidlist_show,
4310 .private = CGROUP_FILE_PROCS, 4551 .private = CGROUP_FILE_PROCS,
4311 .write = cgroup_procs_write, 4552 .write = cgroup_procs_write,
4312 .mode = S_IRUGO | S_IWUSR,
4313 }, 4553 },
4314 { 4554 {
4315 .name = "cgroup.clone_children", 4555 .name = "cgroup.clone_children",
@@ -4329,7 +4569,6 @@ static struct cftype cgroup_legacy_base_files[] = {
4329 .seq_show = cgroup_pidlist_show, 4569 .seq_show = cgroup_pidlist_show,
4330 .private = CGROUP_FILE_TASKS, 4570 .private = CGROUP_FILE_TASKS,
4331 .write = cgroup_tasks_write, 4571 .write = cgroup_tasks_write,
4332 .mode = S_IRUGO | S_IWUSR,
4333 }, 4572 },
4334 { 4573 {
4335 .name = "notify_on_release", 4574 .name = "notify_on_release",
@@ -4346,37 +4585,6 @@ static struct cftype cgroup_legacy_base_files[] = {
4346 { } /* terminate */ 4585 { } /* terminate */
4347}; 4586};
4348 4587
4349/**
4350 * cgroup_populate_dir - create subsys files in a cgroup directory
4351 * @cgrp: target cgroup
4352 * @subsys_mask: mask of the subsystem ids whose files should be added
4353 *
4354 * On failure, no file is added.
4355 */
4356static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
4357{
4358 struct cgroup_subsys *ss;
4359 int i, ret = 0;
4360
4361 /* process cftsets of each subsystem */
4362 for_each_subsys(ss, i) {
4363 struct cftype *cfts;
4364
4365 if (!(subsys_mask & (1 << i)))
4366 continue;
4367
4368 list_for_each_entry(cfts, &ss->cfts, node) {
4369 ret = cgroup_addrm_files(cgrp, cfts, true);
4370 if (ret < 0)
4371 goto err;
4372 }
4373 }
4374 return 0;
4375err:
4376 cgroup_clear_dir(cgrp, subsys_mask);
4377 return ret;
4378}
4379
4380/* 4588/*
4381 * css destruction is four-stage process. 4589 * css destruction is four-stage process.
4382 * 4590 *
@@ -4405,9 +4613,13 @@ static void css_free_work_fn(struct work_struct *work)
4405 container_of(work, struct cgroup_subsys_state, destroy_work); 4613 container_of(work, struct cgroup_subsys_state, destroy_work);
4406 struct cgroup_subsys *ss = css->ss; 4614 struct cgroup_subsys *ss = css->ss;
4407 struct cgroup *cgrp = css->cgroup; 4615 struct cgroup *cgrp = css->cgroup;
4616 struct cgroup_file *cfile;
4408 4617
4409 percpu_ref_exit(&css->refcnt); 4618 percpu_ref_exit(&css->refcnt);
4410 4619
4620 list_for_each_entry(cfile, &css->files, node)
4621 kernfs_put(cfile->kn);
4622
4411 if (ss) { 4623 if (ss) {
4412 /* css free path */ 4624 /* css free path */
4413 int id = css->id; 4625 int id = css->id;
@@ -4512,6 +4724,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
4512 css->ss = ss; 4724 css->ss = ss;
4513 INIT_LIST_HEAD(&css->sibling); 4725 INIT_LIST_HEAD(&css->sibling);
4514 INIT_LIST_HEAD(&css->children); 4726 INIT_LIST_HEAD(&css->children);
4727 INIT_LIST_HEAD(&css->files);
4515 css->serial_nr = css_serial_nr_next++; 4728 css->serial_nr = css_serial_nr_next++;
4516 4729
4517 if (cgroup_parent(cgrp)) { 4730 if (cgroup_parent(cgrp)) {
@@ -4594,7 +4807,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
4594 css->id = err; 4807 css->id = err;
4595 4808
4596 if (visible) { 4809 if (visible) {
4597 err = cgroup_populate_dir(cgrp, 1 << ss->id); 4810 err = css_populate_dir(css, NULL);
4598 if (err) 4811 if (err)
4599 goto err_free_id; 4812 goto err_free_id;
4600 } 4813 }
@@ -4620,7 +4833,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
4620 4833
4621err_list_del: 4834err_list_del:
4622 list_del_rcu(&css->sibling); 4835 list_del_rcu(&css->sibling);
4623 cgroup_clear_dir(css->cgroup, 1 << css->ss->id); 4836 css_clear_dir(css, NULL);
4624err_free_id: 4837err_free_id:
4625 cgroup_idr_remove(&ss->css_idr, css->id); 4838 cgroup_idr_remove(&ss->css_idr, css->id);
4626err_free_percpu_ref: 4839err_free_percpu_ref:
@@ -4637,7 +4850,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4637 struct cgroup_root *root; 4850 struct cgroup_root *root;
4638 struct cgroup_subsys *ss; 4851 struct cgroup_subsys *ss;
4639 struct kernfs_node *kn; 4852 struct kernfs_node *kn;
4640 struct cftype *base_files;
4641 int ssid, ret; 4853 int ssid, ret;
4642 4854
4643 /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable. 4855 /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
@@ -4713,12 +4925,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4713 if (ret) 4925 if (ret)
4714 goto out_destroy; 4926 goto out_destroy;
4715 4927
4716 if (cgroup_on_dfl(cgrp)) 4928 ret = css_populate_dir(&cgrp->self, NULL);
4717 base_files = cgroup_dfl_base_files;
4718 else
4719 base_files = cgroup_legacy_base_files;
4720
4721 ret = cgroup_addrm_files(cgrp, base_files, true);
4722 if (ret) 4929 if (ret)
4723 goto out_destroy; 4930 goto out_destroy;
4724 4931
@@ -4805,7 +5012,7 @@ static void kill_css(struct cgroup_subsys_state *css)
4805 * This must happen before css is disassociated with its cgroup. 5012 * This must happen before css is disassociated with its cgroup.
4806 * See seq_css() for details. 5013 * See seq_css() for details.
4807 */ 5014 */
4808 cgroup_clear_dir(css->cgroup, 1 << css->ss->id); 5015 css_clear_dir(css, NULL);
4809 5016
4810 /* 5017 /*
4811 * Killing would put the base ref, but we need to keep it alive 5018 * Killing would put the base ref, but we need to keep it alive
@@ -4854,19 +5061,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4854 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 5061 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4855{ 5062{
4856 struct cgroup_subsys_state *css; 5063 struct cgroup_subsys_state *css;
4857 bool empty;
4858 int ssid; 5064 int ssid;
4859 5065
4860 lockdep_assert_held(&cgroup_mutex); 5066 lockdep_assert_held(&cgroup_mutex);
4861 5067
4862 /* 5068 /*
4863 * css_set_rwsem synchronizes access to ->cset_links and prevents 5069 * Only migration can raise populated from zero and we're already
4864 * @cgrp from being removed while put_css_set() is in progress. 5070 * holding cgroup_mutex.
4865 */ 5071 */
4866 down_read(&css_set_rwsem); 5072 if (cgroup_is_populated(cgrp))
4867 empty = list_empty(&cgrp->cset_links);
4868 up_read(&css_set_rwsem);
4869 if (!empty)
4870 return -EBUSY; 5073 return -EBUSY;
4871 5074
4872 /* 5075 /*
@@ -4964,6 +5167,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
4964 5167
4965 have_fork_callback |= (bool)ss->fork << ss->id; 5168 have_fork_callback |= (bool)ss->fork << ss->id;
4966 have_exit_callback |= (bool)ss->exit << ss->id; 5169 have_exit_callback |= (bool)ss->exit << ss->id;
5170 have_free_callback |= (bool)ss->free << ss->id;
4967 have_canfork_callback |= (bool)ss->can_fork << ss->id; 5171 have_canfork_callback |= (bool)ss->can_fork << ss->id;
4968 5172
4969 /* At system boot, before all subsystems have been 5173 /* At system boot, before all subsystems have been
@@ -5012,6 +5216,8 @@ int __init cgroup_init_early(void)
5012 return 0; 5216 return 0;
5013} 5217}
5014 5218
5219static unsigned long cgroup_disable_mask __initdata;
5220
5015/** 5221/**
5016 * cgroup_init - cgroup initialization 5222 * cgroup_init - cgroup initialization
5017 * 5223 *
@@ -5022,7 +5228,7 @@ int __init cgroup_init(void)
5022{ 5228{
5023 struct cgroup_subsys *ss; 5229 struct cgroup_subsys *ss;
5024 unsigned long key; 5230 unsigned long key;
5025 int ssid, err; 5231 int ssid;
5026 5232
5027 BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); 5233 BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
5028 BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); 5234 BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
@@ -5058,14 +5264,15 @@ int __init cgroup_init(void)
5058 * disabled flag and cftype registration needs kmalloc, 5264 * disabled flag and cftype registration needs kmalloc,
5059 * both of which aren't available during early_init. 5265 * both of which aren't available during early_init.
5060 */ 5266 */
5061 if (ss->disabled) 5267 if (cgroup_disable_mask & (1 << ssid)) {
5268 static_branch_disable(cgroup_subsys_enabled_key[ssid]);
5269 printk(KERN_INFO "Disabling %s control group subsystem\n",
5270 ss->name);
5062 continue; 5271 continue;
5272 }
5063 5273
5064 cgrp_dfl_root.subsys_mask |= 1 << ss->id; 5274 cgrp_dfl_root.subsys_mask |= 1 << ss->id;
5065 5275
5066 if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes)
5067 ss->dfl_cftypes = ss->legacy_cftypes;
5068
5069 if (!ss->dfl_cftypes) 5276 if (!ss->dfl_cftypes)
5070 cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id; 5277 cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
5071 5278
@@ -5080,17 +5287,10 @@ int __init cgroup_init(void)
5080 ss->bind(init_css_set.subsys[ssid]); 5287 ss->bind(init_css_set.subsys[ssid]);
5081 } 5288 }
5082 5289
5083 err = sysfs_create_mount_point(fs_kobj, "cgroup"); 5290 WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
5084 if (err) 5291 WARN_ON(register_filesystem(&cgroup_fs_type));
5085 return err; 5292 WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
5086
5087 err = register_filesystem(&cgroup_fs_type);
5088 if (err < 0) {
5089 sysfs_remove_mount_point(fs_kobj, "cgroup");
5090 return err;
5091 }
5092 5293
5093 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
5094 return 0; 5294 return 0;
5095} 5295}
5096 5296
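On the enable/disable side, the boot-time ss->disabled flag gives way to cgroup_disable_mask plus a per-subsystem static key that cgroup_init() patches with static_branch_disable(). The test helper cgroup_ssid_enabled() used throughout the patch is defined outside the hunks shown here; as a hedged guess it reduces to a thin wrapper like:

/*
 * Hedged sketch (assumption): cgroup_ssid_enabled() is expected to just
 * test the per-subsystem static key patched in cgroup_init() above.
 */
static bool cgroup_ssid_enabled(int ssid)
{
	return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
}
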
@@ -5137,7 +5337,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
5137 goto out; 5337 goto out;
5138 5338
5139 mutex_lock(&cgroup_mutex); 5339 mutex_lock(&cgroup_mutex);
5140 down_read(&css_set_rwsem); 5340 spin_lock_bh(&css_set_lock);
5141 5341
5142 for_each_root(root) { 5342 for_each_root(root) {
5143 struct cgroup_subsys *ss; 5343 struct cgroup_subsys *ss;
@@ -5157,19 +5357,39 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
5157 seq_printf(m, "%sname=%s", count ? "," : "", 5357 seq_printf(m, "%sname=%s", count ? "," : "",
5158 root->name); 5358 root->name);
5159 seq_putc(m, ':'); 5359 seq_putc(m, ':');
5360
5160 cgrp = task_cgroup_from_root(tsk, root); 5361 cgrp = task_cgroup_from_root(tsk, root);
5161 path = cgroup_path(cgrp, buf, PATH_MAX); 5362
5162 if (!path) { 5363 /*
5163 retval = -ENAMETOOLONG; 5364 * On traditional hierarchies, all zombie tasks show up as
5164 goto out_unlock; 5365 * belonging to the root cgroup. On the default hierarchy,
5366 * while a zombie doesn't show up in "cgroup.procs" and
5367 * thus can't be migrated, its /proc/PID/cgroup keeps
5368 * reporting the cgroup it belonged to before exiting. If
5369 * the cgroup is removed before the zombie is reaped,
5370 * " (deleted)" is appended to the cgroup path.
5371 */
5372 if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
5373 path = cgroup_path(cgrp, buf, PATH_MAX);
5374 if (!path) {
5375 retval = -ENAMETOOLONG;
5376 goto out_unlock;
5377 }
5378 } else {
5379 path = "/";
5165 } 5380 }
5381
5166 seq_puts(m, path); 5382 seq_puts(m, path);
5167 seq_putc(m, '\n'); 5383
5384 if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
5385 seq_puts(m, " (deleted)\n");
5386 else
5387 seq_putc(m, '\n');
5168 } 5388 }
5169 5389
5170 retval = 0; 5390 retval = 0;
5171out_unlock: 5391out_unlock:
5172 up_read(&css_set_rwsem); 5392 spin_unlock_bh(&css_set_lock);
5173 mutex_unlock(&cgroup_mutex); 5393 mutex_unlock(&cgroup_mutex);
5174 kfree(buf); 5394 kfree(buf);
5175out: 5395out:
@@ -5193,7 +5413,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
5193 for_each_subsys(ss, i) 5413 for_each_subsys(ss, i)
5194 seq_printf(m, "%s\t%d\t%d\t%d\n", 5414 seq_printf(m, "%s\t%d\t%d\t%d\n",
5195 ss->legacy_name, ss->root->hierarchy_id, 5415 ss->legacy_name, ss->root->hierarchy_id,
5196 atomic_read(&ss->root->nr_cgrps), !ss->disabled); 5416 atomic_read(&ss->root->nr_cgrps),
5417 cgroup_ssid_enabled(i));
5197 5418
5198 mutex_unlock(&cgroup_mutex); 5419 mutex_unlock(&cgroup_mutex);
5199 return 0; 5420 return 0;
@@ -5314,7 +5535,7 @@ void cgroup_post_fork(struct task_struct *child,
5314 * @child during its iteration. 5535 * @child during its iteration.
5315 * 5536 *
5316 * If we won the race, @child is associated with %current's 5537 * If we won the race, @child is associated with %current's
5317 * css_set. Grabbing css_set_rwsem guarantees both that the 5538 * css_set. Grabbing css_set_lock guarantees both that the
5318 * association is stable, and, on completion of the parent's 5539 * association is stable, and, on completion of the parent's
5319 * migration, @child is visible in the source of migration or 5540 * migration, @child is visible in the source of migration or
5320 * already in the destination cgroup. This guarantee is necessary 5541 * already in the destination cgroup. This guarantee is necessary
@@ -5329,14 +5550,13 @@ void cgroup_post_fork(struct task_struct *child,
5329 if (use_task_css_set_links) { 5550 if (use_task_css_set_links) {
5330 struct css_set *cset; 5551 struct css_set *cset;
5331 5552
5332 down_write(&css_set_rwsem); 5553 spin_lock_bh(&css_set_lock);
5333 cset = task_css_set(current); 5554 cset = task_css_set(current);
5334 if (list_empty(&child->cg_list)) { 5555 if (list_empty(&child->cg_list)) {
5335 rcu_assign_pointer(child->cgroups, cset);
5336 list_add(&child->cg_list, &cset->tasks);
5337 get_css_set(cset); 5556 get_css_set(cset);
5557 css_set_move_task(child, NULL, cset, false);
5338 } 5558 }
5339 up_write(&css_set_rwsem); 5559 spin_unlock_bh(&css_set_lock);
5340 } 5560 }
5341 5561
5342 /* 5562 /*
@@ -5371,39 +5591,42 @@ void cgroup_exit(struct task_struct *tsk)
5371{ 5591{
5372 struct cgroup_subsys *ss; 5592 struct cgroup_subsys *ss;
5373 struct css_set *cset; 5593 struct css_set *cset;
5374 bool put_cset = false;
5375 int i; 5594 int i;
5376 5595
5377 /* 5596 /*
5378 * Unlink @tsk from its css_set. As migration path can't race 5597 * Unlink @tsk from its css_set. As migration path can't race
5379 * with us, we can check cg_list without grabbing css_set_rwsem. 5598 * with us, we can check css_set and cg_list without synchronization.
5380 */ 5599 */
5600 cset = task_css_set(tsk);
5601
5381 if (!list_empty(&tsk->cg_list)) { 5602 if (!list_empty(&tsk->cg_list)) {
5382 down_write(&css_set_rwsem); 5603 spin_lock_bh(&css_set_lock);
5383 list_del_init(&tsk->cg_list); 5604 css_set_move_task(tsk, cset, NULL, false);
5384 up_write(&css_set_rwsem); 5605 spin_unlock_bh(&css_set_lock);
5385 put_cset = true; 5606 } else {
5607 get_css_set(cset);
5386 } 5608 }
5387 5609
5388 /* Reassign the task to the init_css_set. */
5389 cset = task_css_set(tsk);
5390 RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
5391
5392 /* see cgroup_post_fork() for details */ 5610 /* see cgroup_post_fork() for details */
5393 for_each_subsys_which(ss, i, &have_exit_callback) { 5611 for_each_subsys_which(ss, i, &have_exit_callback)
5394 struct cgroup_subsys_state *old_css = cset->subsys[i]; 5612 ss->exit(tsk);
5395 struct cgroup_subsys_state *css = task_css(tsk, i); 5613}
5396 5614
5397 ss->exit(css, old_css, tsk); 5615void cgroup_free(struct task_struct *task)
5398 } 5616{
5617 struct css_set *cset = task_css_set(task);
5618 struct cgroup_subsys *ss;
5619 int ssid;
5399 5620
5400 if (put_cset) 5621 for_each_subsys_which(ss, ssid, &have_free_callback)
5401 put_css_set(cset); 5622 ss->free(task);
5623
5624 put_css_set(cset);
5402} 5625}
5403 5626
5404static void check_for_release(struct cgroup *cgrp) 5627static void check_for_release(struct cgroup *cgrp)
5405{ 5628{
5406 if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) && 5629 if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
5407 !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) 5630 !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
5408 schedule_work(&cgrp->release_agent_work); 5631 schedule_work(&cgrp->release_agent_work);
5409} 5632}
@@ -5482,25 +5705,13 @@ static int __init cgroup_disable(char *str)
5482 if (strcmp(token, ss->name) && 5705 if (strcmp(token, ss->name) &&
5483 strcmp(token, ss->legacy_name)) 5706 strcmp(token, ss->legacy_name))
5484 continue; 5707 continue;
5485 5708 cgroup_disable_mask |= 1 << i;
5486 ss->disabled = 1;
5487 printk(KERN_INFO "Disabling %s control group subsystem\n",
5488 ss->name);
5489 break;
5490 } 5709 }
5491 } 5710 }
5492 return 1; 5711 return 1;
5493} 5712}
5494__setup("cgroup_disable=", cgroup_disable); 5713__setup("cgroup_disable=", cgroup_disable);
5495 5714
5496static int __init cgroup_set_legacy_files_on_dfl(char *str)
5497{
5498 printk("cgroup: using legacy files on the default hierarchy\n");
5499 cgroup_legacy_files_on_dfl = true;
5500 return 0;
5501}
5502__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl);
5503
5504/** 5715/**
5505 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry 5716 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
5506 * @dentry: directory dentry of interest 5717 * @dentry: directory dentry of interest
@@ -5604,7 +5815,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
5604 if (!name_buf) 5815 if (!name_buf)
5605 return -ENOMEM; 5816 return -ENOMEM;
5606 5817
5607 down_read(&css_set_rwsem); 5818 spin_lock_bh(&css_set_lock);
5608 rcu_read_lock(); 5819 rcu_read_lock();
5609 cset = rcu_dereference(current->cgroups); 5820 cset = rcu_dereference(current->cgroups);
5610 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { 5821 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
@@ -5615,7 +5826,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
5615 c->root->hierarchy_id, name_buf); 5826 c->root->hierarchy_id, name_buf);
5616 } 5827 }
5617 rcu_read_unlock(); 5828 rcu_read_unlock();
5618 up_read(&css_set_rwsem); 5829 spin_unlock_bh(&css_set_lock);
5619 kfree(name_buf); 5830 kfree(name_buf);
5620 return 0; 5831 return 0;
5621} 5832}
@@ -5626,7 +5837,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
5626 struct cgroup_subsys_state *css = seq_css(seq); 5837 struct cgroup_subsys_state *css = seq_css(seq);
5627 struct cgrp_cset_link *link; 5838 struct cgrp_cset_link *link;
5628 5839
5629 down_read(&css_set_rwsem); 5840 spin_lock_bh(&css_set_lock);
5630 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { 5841 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
5631 struct css_set *cset = link->cset; 5842 struct css_set *cset = link->cset;
5632 struct task_struct *task; 5843 struct task_struct *task;
@@ -5649,13 +5860,13 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
5649 overflow: 5860 overflow:
5650 seq_puts(seq, " ...\n"); 5861 seq_puts(seq, " ...\n");
5651 } 5862 }
5652 up_read(&css_set_rwsem); 5863 spin_unlock_bh(&css_set_lock);
5653 return 0; 5864 return 0;
5654} 5865}
5655 5866
5656static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) 5867static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
5657{ 5868{
5658 return (!cgroup_has_tasks(css->cgroup) && 5869 return (!cgroup_is_populated(css->cgroup) &&
5659 !css_has_online_children(&css->cgroup->self)); 5870 !css_has_online_children(&css->cgroup->self));
5660} 5871}
5661 5872
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
index 806cd7693ac8..cdd8df4e991c 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup_pids.c
@@ -266,11 +266,9 @@ static void pids_fork(struct task_struct *task, void *priv)
266 css_put(old_css); 266 css_put(old_css);
267} 267}
268 268
269static void pids_exit(struct cgroup_subsys_state *css, 269static void pids_free(struct task_struct *task)
270 struct cgroup_subsys_state *old_css,
271 struct task_struct *task)
272{ 270{
273 struct pids_cgroup *pids = css_pids(old_css); 271 struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));
274 272
275 pids_uncharge(pids, 1); 273 pids_uncharge(pids, 1);
276} 274}
@@ -349,7 +347,7 @@ struct cgroup_subsys pids_cgrp_subsys = {
349 .can_fork = pids_can_fork, 347 .can_fork = pids_can_fork,
350 .cancel_fork = pids_cancel_fork, 348 .cancel_fork = pids_cancel_fork,
351 .fork = pids_fork, 349 .fork = pids_fork,
352 .exit = pids_exit, 350 .free = pids_free,
353 .legacy_cftypes = pids_files, 351 .legacy_cftypes = pids_files,
354 .dfl_cftypes = pids_files, 352 .dfl_cftypes = pids_files,
355}; 353};
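pids is converted from the old three-argument ->exit() to the new ->free() callback, which runs from cgroup_free() (shown in the cgroup.c hunk earlier) before the task's css_set reference is dropped, so task_css() is still valid inside it. A hedged sketch of the callback shape for some other controller; everything except the signature and task_css() is hypothetical:

/* Hedged sketch: per-task cleanup moved to the new ->free() callback. */
static void example_free(struct task_struct *task)
{
	struct cgroup_subsys_state *css = task_css(task, example_cgrp_id);

	/* css is still valid: cgroup_free() puts the css_set only afterwards */
	example_uncharge(css, 1);
}

struct cgroup_subsys example_cgrp_subsys = {
	.free	= example_free,
};
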
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 0a495ab35bc7..d8560ee3bab7 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -58,36 +58,13 @@ static void context_tracking_recursion_exit(void)
58 * instructions to execute won't use any RCU read side critical section 58 * instructions to execute won't use any RCU read side critical section
59 * because this function sets RCU in extended quiescent state. 59 * because this function sets RCU in extended quiescent state.
60 */ 60 */
61void context_tracking_enter(enum ctx_state state) 61void __context_tracking_enter(enum ctx_state state)
62{ 62{
63 unsigned long flags;
64
65 /*
66 * Repeat the user_enter() check here because some archs may be calling
67 * this from asm and if no CPU needs context tracking, they shouldn't
68 * go further. Repeat the check here until they support the inline static
69 * key check.
70 */
71 if (!context_tracking_is_enabled())
72 return;
73
74 /*
75 * Some contexts may involve an exception occurring in an irq,
76 * leading to that nesting:
77 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
78 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
79 * helpers are enough to protect RCU uses inside the exception. So
80 * just return immediately if we detect we are in an IRQ.
81 */
82 if (in_interrupt())
83 return;
84
85 /* Kernel threads aren't supposed to go to userspace */ 63 /* Kernel threads aren't supposed to go to userspace */
86 WARN_ON_ONCE(!current->mm); 64 WARN_ON_ONCE(!current->mm);
87 65
88 local_irq_save(flags);
89 if (!context_tracking_recursion_enter()) 66 if (!context_tracking_recursion_enter())
90 goto out_irq_restore; 67 return;
91 68
92 if ( __this_cpu_read(context_tracking.state) != state) { 69 if ( __this_cpu_read(context_tracking.state) != state) {
93 if (__this_cpu_read(context_tracking.active)) { 70 if (__this_cpu_read(context_tracking.active)) {
@@ -120,7 +97,27 @@ void context_tracking_enter(enum ctx_state state)
120 __this_cpu_write(context_tracking.state, state); 97 __this_cpu_write(context_tracking.state, state);
121 } 98 }
122 context_tracking_recursion_exit(); 99 context_tracking_recursion_exit();
123out_irq_restore: 100}
101NOKPROBE_SYMBOL(__context_tracking_enter);
102EXPORT_SYMBOL_GPL(__context_tracking_enter);
103
104void context_tracking_enter(enum ctx_state state)
105{
106 unsigned long flags;
107
108 /*
109 * Some contexts may involve an exception occurring in an irq,
110 * leading to that nesting:
111 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
112 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
113 * helpers are enough to protect RCU uses inside the exception. So
114 * just return immediately if we detect we are in an IRQ.
115 */
116 if (in_interrupt())
117 return;
118
119 local_irq_save(flags);
120 __context_tracking_enter(state);
124 local_irq_restore(flags); 121 local_irq_restore(flags);
125} 122}
126NOKPROBE_SYMBOL(context_tracking_enter); 123NOKPROBE_SYMBOL(context_tracking_enter);
@@ -128,7 +125,7 @@ EXPORT_SYMBOL_GPL(context_tracking_enter);
128 125
129void context_tracking_user_enter(void) 126void context_tracking_user_enter(void)
130{ 127{
131 context_tracking_enter(CONTEXT_USER); 128 user_enter();
132} 129}
133NOKPROBE_SYMBOL(context_tracking_user_enter); 130NOKPROBE_SYMBOL(context_tracking_user_enter);
134 131
@@ -144,19 +141,10 @@ NOKPROBE_SYMBOL(context_tracking_user_enter);
144 * This call supports re-entrancy. This way it can be called from any exception 141 * This call supports re-entrancy. This way it can be called from any exception
145 * handler without needing to know if we came from userspace or not. 142 * handler without needing to know if we came from userspace or not.
146 */ 143 */
147void context_tracking_exit(enum ctx_state state) 144void __context_tracking_exit(enum ctx_state state)
148{ 145{
149 unsigned long flags;
150
151 if (!context_tracking_is_enabled())
152 return;
153
154 if (in_interrupt())
155 return;
156
157 local_irq_save(flags);
158 if (!context_tracking_recursion_enter()) 146 if (!context_tracking_recursion_enter())
159 goto out_irq_restore; 147 return;
160 148
161 if (__this_cpu_read(context_tracking.state) == state) { 149 if (__this_cpu_read(context_tracking.state) == state) {
162 if (__this_cpu_read(context_tracking.active)) { 150 if (__this_cpu_read(context_tracking.active)) {
@@ -173,7 +161,19 @@ void context_tracking_exit(enum ctx_state state)
173 __this_cpu_write(context_tracking.state, CONTEXT_KERNEL); 161 __this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
174 } 162 }
175 context_tracking_recursion_exit(); 163 context_tracking_recursion_exit();
176out_irq_restore: 164}
165NOKPROBE_SYMBOL(__context_tracking_exit);
166EXPORT_SYMBOL_GPL(__context_tracking_exit);
167
168void context_tracking_exit(enum ctx_state state)
169{
170 unsigned long flags;
171
172 if (in_interrupt())
173 return;
174
175 local_irq_save(flags);
176 __context_tracking_exit(state);
177 local_irq_restore(flags); 177 local_irq_restore(flags);
178} 178}
179NOKPROBE_SYMBOL(context_tracking_exit); 179NOKPROBE_SYMBOL(context_tracking_exit);
@@ -181,7 +181,7 @@ EXPORT_SYMBOL_GPL(context_tracking_exit);
181 181
182void context_tracking_user_exit(void) 182void context_tracking_user_exit(void)
183{ 183{
184 context_tracking_exit(CONTEXT_USER); 184 user_exit();
185} 185}
186NOKPROBE_SYMBOL(context_tracking_user_exit); 186NOKPROBE_SYMBOL(context_tracking_user_exit);
187 187
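The split above leaves context_tracking_enter()/exit() as wrappers that keep the in_interrupt() check and the irq save/restore, while the newly exported __context_tracking_enter()/__context_tracking_exit() assume the caller already runs with interrupts disabled and has already performed the enabled check. A hedged sketch of such a caller (assumed, not part of this diff):

/*
 * Hedged sketch: an arch return-to-user path that already has IRQs off
 * can call the inner helper directly and skip the wrapper's overhead.
 */
static void example_prepare_exit_to_user(void)
{
	WARN_ON_ONCE(!irqs_disabled());
	__context_tracking_enter(CONTEXT_USER);
}
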
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 82cf9dff4295..85ff5e26e23b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -102,19 +102,6 @@ void get_online_cpus(void)
102} 102}
103EXPORT_SYMBOL_GPL(get_online_cpus); 103EXPORT_SYMBOL_GPL(get_online_cpus);
104 104
105bool try_get_online_cpus(void)
106{
107 if (cpu_hotplug.active_writer == current)
108 return true;
109 if (!mutex_trylock(&cpu_hotplug.lock))
110 return false;
111 cpuhp_lock_acquire_tryread();
112 atomic_inc(&cpu_hotplug.refcount);
113 mutex_unlock(&cpu_hotplug.lock);
114 return true;
115}
116EXPORT_SYMBOL_GPL(try_get_online_cpus);
117
118void put_online_cpus(void) 105void put_online_cpus(void)
119{ 106{
120 int refcount; 107 int refcount;
@@ -304,8 +291,8 @@ static inline void check_for_tasks(int dead_cpu)
304{ 291{
305 struct task_struct *g, *p; 292 struct task_struct *g, *p;
306 293
307 read_lock_irq(&tasklist_lock); 294 read_lock(&tasklist_lock);
308 do_each_thread(g, p) { 295 for_each_process_thread(g, p) {
309 if (!p->on_rq) 296 if (!p->on_rq)
310 continue; 297 continue;
311 /* 298 /*
@@ -320,8 +307,8 @@ static inline void check_for_tasks(int dead_cpu)
320 307
321 pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n", 308 pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
322 p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags); 309 p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
323 } while_each_thread(g, p); 310 }
324 read_unlock_irq(&tasklist_lock); 311 read_unlock(&tasklist_lock);
325} 312}
326 313
327struct take_cpu_down_param { 314struct take_cpu_down_param {
@@ -344,7 +331,7 @@ static int take_cpu_down(void *_param)
344 /* Give up timekeeping duties */ 331 /* Give up timekeeping duties */
345 tick_handover_do_timer(); 332 tick_handover_do_timer();
346 /* Park the stopper thread */ 333 /* Park the stopper thread */
347 kthread_park(current); 334 stop_machine_park((long)param->hcpu);
348 return 0; 335 return 0;
349} 336}
350 337
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f0acff0f66c9..10ae73611d80 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -473,7 +473,8 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
473 473
474 /* On legacy hierarchy, we must be a subset of our parent cpuset. */ 474 /* On legacy hierarchy, we must be a subset of our parent cpuset. */
475 ret = -EACCES; 475 ret = -EACCES;
476 if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par)) 476 if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
477 !is_cpuset_subset(trial, par))
477 goto out; 478 goto out;
478 479
479 /* 480 /*
@@ -497,7 +498,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
497 * be changed to have empty cpus_allowed or mems_allowed. 498 * be changed to have empty cpus_allowed or mems_allowed.
498 */ 499 */
499 ret = -ENOSPC; 500 ret = -ENOSPC;
500 if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) { 501 if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
501 if (!cpumask_empty(cur->cpus_allowed) && 502 if (!cpumask_empty(cur->cpus_allowed) &&
502 cpumask_empty(trial->cpus_allowed)) 503 cpumask_empty(trial->cpus_allowed))
503 goto out; 504 goto out;
@@ -879,7 +880,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
879 * If it becomes empty, inherit the effective mask of the 880 * If it becomes empty, inherit the effective mask of the
880 * parent, which is guaranteed to have some CPUs. 881 * parent, which is guaranteed to have some CPUs.
881 */ 882 */
882 if (cgroup_on_dfl(cp->css.cgroup) && cpumask_empty(new_cpus)) 883 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
884 cpumask_empty(new_cpus))
883 cpumask_copy(new_cpus, parent->effective_cpus); 885 cpumask_copy(new_cpus, parent->effective_cpus);
884 886
885 /* Skip the whole subtree if the cpumask remains the same. */ 887 /* Skip the whole subtree if the cpumask remains the same. */
@@ -896,7 +898,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
896 cpumask_copy(cp->effective_cpus, new_cpus); 898 cpumask_copy(cp->effective_cpus, new_cpus);
897 spin_unlock_irq(&callback_lock); 899 spin_unlock_irq(&callback_lock);
898 900
899 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && 901 WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
900 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); 902 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
901 903
902 update_tasks_cpumask(cp); 904 update_tasks_cpumask(cp);
@@ -1135,7 +1137,8 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1135 * If it becomes empty, inherit the effective mask of the 1137 * If it becomes empty, inherit the effective mask of the
1136 * parent, which is guaranteed to have some MEMs. 1138 * parent, which is guaranteed to have some MEMs.
1137 */ 1139 */
1138 if (cgroup_on_dfl(cp->css.cgroup) && nodes_empty(*new_mems)) 1140 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
1141 nodes_empty(*new_mems))
1139 *new_mems = parent->effective_mems; 1142 *new_mems = parent->effective_mems;
1140 1143
1141 /* Skip the whole subtree if the nodemask remains the same. */ 1144 /* Skip the whole subtree if the nodemask remains the same. */
@@ -1152,7 +1155,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1152 cp->effective_mems = *new_mems; 1155 cp->effective_mems = *new_mems;
1153 spin_unlock_irq(&callback_lock); 1156 spin_unlock_irq(&callback_lock);
1154 1157
1155 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && 1158 WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
1156 !nodes_equal(cp->mems_allowed, cp->effective_mems)); 1159 !nodes_equal(cp->mems_allowed, cp->effective_mems));
1157 1160
1158 update_tasks_nodemask(cp); 1161 update_tasks_nodemask(cp);
@@ -1440,7 +1443,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
1440 1443
1441 /* allow moving tasks into an empty cpuset if on default hierarchy */ 1444 /* allow moving tasks into an empty cpuset if on default hierarchy */
1442 ret = -ENOSPC; 1445 ret = -ENOSPC;
1443 if (!cgroup_on_dfl(css->cgroup) && 1446 if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
1444 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) 1447 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1445 goto out_unlock; 1448 goto out_unlock;
1446 1449
@@ -1484,9 +1487,8 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1484{ 1487{
1485 /* static buf protected by cpuset_mutex */ 1488 /* static buf protected by cpuset_mutex */
1486 static nodemask_t cpuset_attach_nodemask_to; 1489 static nodemask_t cpuset_attach_nodemask_to;
1487 struct mm_struct *mm;
1488 struct task_struct *task; 1490 struct task_struct *task;
1489 struct task_struct *leader = cgroup_taskset_first(tset); 1491 struct task_struct *leader;
1490 struct cpuset *cs = css_cs(css); 1492 struct cpuset *cs = css_cs(css);
1491 struct cpuset *oldcs = cpuset_attach_old_cs; 1493 struct cpuset *oldcs = cpuset_attach_old_cs;
1492 1494
@@ -1512,26 +1514,30 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1512 } 1514 }
1513 1515
1514 /* 1516 /*
1515 * Change mm, possibly for multiple threads in a threadgroup. This is 1517 * Change mm for all threadgroup leaders. This is expensive and may
1516 * expensive and may sleep. 1518 * sleep and should be moved outside migration path proper.
1517 */ 1519 */
1518 cpuset_attach_nodemask_to = cs->effective_mems; 1520 cpuset_attach_nodemask_to = cs->effective_mems;
1519 mm = get_task_mm(leader); 1521 cgroup_taskset_for_each_leader(leader, tset) {
1520 if (mm) { 1522 struct mm_struct *mm = get_task_mm(leader);
1521 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); 1523
1522 1524 if (mm) {
1523 /* 1525 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1524 * old_mems_allowed is the same with mems_allowed here, except 1526
1525 * if this task is being moved automatically due to hotplug. 1527 /*
1526 * In that case @mems_allowed has been updated and is empty, 1528 * old_mems_allowed is the same with mems_allowed
1527 * so @old_mems_allowed is the right nodesets that we migrate 1529 * here, except if this task is being moved
1528 * mm from. 1530 * automatically due to hotplug. In that case
1529 */ 1531 * @mems_allowed has been updated and is empty, so
1530 if (is_memory_migrate(cs)) { 1532 * @old_mems_allowed is the right nodesets that we
1531 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, 1533 * migrate mm from.
1532 &cpuset_attach_nodemask_to); 1534 */
1535 if (is_memory_migrate(cs)) {
1536 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
1537 &cpuset_attach_nodemask_to);
1538 }
1539 mmput(mm);
1533 } 1540 }
1534 mmput(mm);
1535 } 1541 }
1536 1542
1537 cs->old_mems_allowed = cpuset_attach_nodemask_to; 1543 cs->old_mems_allowed = cpuset_attach_nodemask_to;
@@ -1594,9 +1600,6 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
1594 case FILE_MEMORY_PRESSURE_ENABLED: 1600 case FILE_MEMORY_PRESSURE_ENABLED:
1595 cpuset_memory_pressure_enabled = !!val; 1601 cpuset_memory_pressure_enabled = !!val;
1596 break; 1602 break;
1597 case FILE_MEMORY_PRESSURE:
1598 retval = -EACCES;
1599 break;
1600 case FILE_SPREAD_PAGE: 1603 case FILE_SPREAD_PAGE:
1601 retval = update_flag(CS_SPREAD_PAGE, cs, val); 1604 retval = update_flag(CS_SPREAD_PAGE, cs, val);
1602 break; 1605 break;
@@ -1863,9 +1866,6 @@ static struct cftype files[] = {
1863 { 1866 {
1864 .name = "memory_pressure", 1867 .name = "memory_pressure",
1865 .read_u64 = cpuset_read_u64, 1868 .read_u64 = cpuset_read_u64,
1866 .write_u64 = cpuset_write_u64,
1867 .private = FILE_MEMORY_PRESSURE,
1868 .mode = S_IRUGO,
1869 }, 1869 },
1870 1870
1871 { 1871 {
@@ -1952,7 +1952,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
1952 cpuset_inc(); 1952 cpuset_inc();
1953 1953
1954 spin_lock_irq(&callback_lock); 1954 spin_lock_irq(&callback_lock);
1955 if (cgroup_on_dfl(cs->css.cgroup)) { 1955 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
1956 cpumask_copy(cs->effective_cpus, parent->effective_cpus); 1956 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
1957 cs->effective_mems = parent->effective_mems; 1957 cs->effective_mems = parent->effective_mems;
1958 } 1958 }
@@ -2029,7 +2029,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
2029 mutex_lock(&cpuset_mutex); 2029 mutex_lock(&cpuset_mutex);
2030 spin_lock_irq(&callback_lock); 2030 spin_lock_irq(&callback_lock);
2031 2031
2032 if (cgroup_on_dfl(root_css->cgroup)) { 2032 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
2033 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); 2033 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
2034 top_cpuset.mems_allowed = node_possible_map; 2034 top_cpuset.mems_allowed = node_possible_map;
2035 } else { 2035 } else {
@@ -2210,7 +2210,7 @@ retry:
2210 cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); 2210 cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
2211 mems_updated = !nodes_equal(new_mems, cs->effective_mems); 2211 mems_updated = !nodes_equal(new_mems, cs->effective_mems);
2212 2212
2213 if (cgroup_on_dfl(cs->css.cgroup)) 2213 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
2214 hotplug_update_tasks(cs, &new_cpus, &new_mems, 2214 hotplug_update_tasks(cs, &new_cpus, &new_mems,
2215 cpus_updated, mems_updated); 2215 cpus_updated, mems_updated);
2216 else 2216 else
@@ -2241,7 +2241,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2241 static cpumask_t new_cpus; 2241 static cpumask_t new_cpus;
2242 static nodemask_t new_mems; 2242 static nodemask_t new_mems;
2243 bool cpus_updated, mems_updated; 2243 bool cpus_updated, mems_updated;
2244 bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup); 2244 bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
2245 2245
2246 mutex_lock(&cpuset_mutex); 2246 mutex_lock(&cpuset_mutex);
2247 2247
@@ -2598,22 +2598,22 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2598} 2598}
2599 2599
2600/** 2600/**
2601 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed 2601 * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
2602 * @tsk: pointer to task_struct of some task.
2603 * 2602 *
2604 * Description: Prints @task's name, cpuset name, and cached copy of its 2603 * Description: Prints current's name, cpuset name, and cached copy of its
2605 * mems_allowed to the kernel log. 2604 * mems_allowed to the kernel log.
2606 */ 2605 */
2607void cpuset_print_task_mems_allowed(struct task_struct *tsk) 2606void cpuset_print_current_mems_allowed(void)
2608{ 2607{
2609 struct cgroup *cgrp; 2608 struct cgroup *cgrp;
2610 2609
2611 rcu_read_lock(); 2610 rcu_read_lock();
2612 2611
2613 cgrp = task_cs(tsk)->css.cgroup; 2612 cgrp = task_cs(current)->css.cgroup;
2614 pr_info("%s cpuset=", tsk->comm); 2613 pr_info("%s cpuset=", current->comm);
2615 pr_cont_cgroup_name(cgrp); 2614 pr_cont_cgroup_name(cgrp);
2616 pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed)); 2615 pr_cont(" mems_allowed=%*pbl\n",
2616 nodemask_pr_args(&current->mems_allowed));
2617 2617
2618 rcu_read_unlock(); 2618 rcu_read_unlock();
2619} 2619}
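cpuset_attach() above switches from cgroup_taskset_first() to the new cgroup_taskset_for_each_leader() iterator so the expensive mm rebinding runs once per thread-group leader in the set rather than once overall or once per task. A hedged illustration of the same pattern in some other ->attach() implementation; names other than the iterator macro and the mm helpers are hypothetical:

/* Hedged sketch: per-process work in ->attach() via the leader iterator. */
static void example_attach(struct cgroup_subsys_state *css,
			   struct cgroup_taskset *tset)
{
	struct task_struct *leader;

	cgroup_taskset_for_each_leader(leader, tset) {
		struct mm_struct *mm = get_task_mm(leader);

		if (!mm)
			continue;	/* kernel thread or exiting task */
		/* expensive per-process work goes here */
		mmput(mm);
	}
}
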
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f548f69c4299..1a734e0adfa7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -196,7 +196,7 @@ static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
196static int perf_sample_allowed_ns __read_mostly = 196static int perf_sample_allowed_ns __read_mostly =
197 DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100; 197 DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
198 198
199void update_perf_cpu_limits(void) 199static void update_perf_cpu_limits(void)
200{ 200{
201 u64 tmp = perf_sample_period_ns; 201 u64 tmp = perf_sample_period_ns;
202 202
@@ -472,7 +472,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
472 * mode SWOUT : schedule out everything 472 * mode SWOUT : schedule out everything
473 * mode SWIN : schedule in based on cgroup for next 473 * mode SWIN : schedule in based on cgroup for next
474 */ 474 */
475void perf_cgroup_switch(struct task_struct *task, int mode) 475static void perf_cgroup_switch(struct task_struct *task, int mode)
476{ 476{
477 struct perf_cpu_context *cpuctx; 477 struct perf_cpu_context *cpuctx;
478 struct pmu *pmu; 478 struct pmu *pmu;
@@ -1243,11 +1243,7 @@ static inline void perf_event__state_init(struct perf_event *event)
1243 PERF_EVENT_STATE_INACTIVE; 1243 PERF_EVENT_STATE_INACTIVE;
1244} 1244}
1245 1245
1246/* 1246static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
1247 * Called at perf_event creation and when events are attached/detached from a
1248 * group.
1249 */
1250static void perf_event__read_size(struct perf_event *event)
1251{ 1247{
1252 int entry = sizeof(u64); /* value */ 1248 int entry = sizeof(u64); /* value */
1253 int size = 0; 1249 int size = 0;
@@ -1263,7 +1259,7 @@ static void perf_event__read_size(struct perf_event *event)
1263 entry += sizeof(u64); 1259 entry += sizeof(u64);
1264 1260
1265 if (event->attr.read_format & PERF_FORMAT_GROUP) { 1261 if (event->attr.read_format & PERF_FORMAT_GROUP) {
1266 nr += event->group_leader->nr_siblings; 1262 nr += nr_siblings;
1267 size += sizeof(u64); 1263 size += sizeof(u64);
1268 } 1264 }
1269 1265
@@ -1271,14 +1267,11 @@ static void perf_event__read_size(struct perf_event *event)
1271 event->read_size = size; 1267 event->read_size = size;
1272} 1268}
1273 1269
1274static void perf_event__header_size(struct perf_event *event) 1270static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1275{ 1271{
1276 struct perf_sample_data *data; 1272 struct perf_sample_data *data;
1277 u64 sample_type = event->attr.sample_type;
1278 u16 size = 0; 1273 u16 size = 0;
1279 1274
1280 perf_event__read_size(event);
1281
1282 if (sample_type & PERF_SAMPLE_IP) 1275 if (sample_type & PERF_SAMPLE_IP)
1283 size += sizeof(data->ip); 1276 size += sizeof(data->ip);
1284 1277
@@ -1303,6 +1296,17 @@ static void perf_event__header_size(struct perf_event *event)
1303 event->header_size = size; 1296 event->header_size = size;
1304} 1297}
1305 1298
1299/*
1300 * Called at perf_event creation and when events are attached/detached from a
1301 * group.
1302 */
1303static void perf_event__header_size(struct perf_event *event)
1304{
1305 __perf_event_read_size(event,
1306 event->group_leader->nr_siblings);
1307 __perf_event_header_size(event, event->attr.sample_type);
1308}
1309
1306static void perf_event__id_header_size(struct perf_event *event) 1310static void perf_event__id_header_size(struct perf_event *event)
1307{ 1311{
1308 struct perf_sample_data *data; 1312 struct perf_sample_data *data;
@@ -1330,6 +1334,27 @@ static void perf_event__id_header_size(struct perf_event *event)
1330 event->id_header_size = size; 1334 event->id_header_size = size;
1331} 1335}
1332 1336
1337static bool perf_event_validate_size(struct perf_event *event)
1338{
1339 /*
1340 * The values computed here will be over-written when we actually
1341 * attach the event.
1342 */
1343 __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1344 __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1345 perf_event__id_header_size(event);
1346
1347 /*
1348 * Sum the lot; should not exceed the 64k limit we have on records.
1349 * Conservative limit to allow for callchains and other variable fields.
1350 */
1351 if (event->read_size + event->header_size +
1352 event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1353 return false;
1354
1355 return true;
1356}
1357
1333static void perf_group_attach(struct perf_event *event) 1358static void perf_group_attach(struct perf_event *event)
1334{ 1359{
1335 struct perf_event *group_leader = event->group_leader, *pos; 1360 struct perf_event *group_leader = event->group_leader, *pos;
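To put the new perf_event_validate_size() check in concrete terms: with read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID, every group member contributes a 16-byte {value, id} pair, so read_size grows as roughly 8 + 16 * nr_members bytes. A group on the order of a thousand events therefore trips the conservative 16 KiB cap on read_size alone, and perf_event_open() for the extra member fails with -E2BIG before anything is installed (see the err_locked path added further down).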
@@ -1914,7 +1939,7 @@ group_sched_in(struct perf_event *group_event,
1914 if (group_event->state == PERF_EVENT_STATE_OFF) 1939 if (group_event->state == PERF_EVENT_STATE_OFF)
1915 return 0; 1940 return 0;
1916 1941
1917 pmu->start_txn(pmu); 1942 pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
1918 1943
1919 if (event_sched_in(group_event, cpuctx, ctx)) { 1944 if (event_sched_in(group_event, cpuctx, ctx)) {
1920 pmu->cancel_txn(pmu); 1945 pmu->cancel_txn(pmu);
@@ -3184,14 +3209,22 @@ void perf_event_exec(void)
3184 rcu_read_unlock(); 3209 rcu_read_unlock();
3185} 3210}
3186 3211
3212struct perf_read_data {
3213 struct perf_event *event;
3214 bool group;
3215 int ret;
3216};
3217
3187/* 3218/*
3188 * Cross CPU call to read the hardware event 3219 * Cross CPU call to read the hardware event
3189 */ 3220 */
3190static void __perf_event_read(void *info) 3221static void __perf_event_read(void *info)
3191{ 3222{
3192 struct perf_event *event = info; 3223 struct perf_read_data *data = info;
3224 struct perf_event *sub, *event = data->event;
3193 struct perf_event_context *ctx = event->ctx; 3225 struct perf_event_context *ctx = event->ctx;
3194 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 3226 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3227 struct pmu *pmu = event->pmu;
3195 3228
3196 /* 3229 /*
3197 * If this is a task context, we need to check whether it is 3230 * If this is a task context, we need to check whether it is
@@ -3208,9 +3241,35 @@ static void __perf_event_read(void *info)
3208 update_context_time(ctx); 3241 update_context_time(ctx);
3209 update_cgrp_time_from_event(event); 3242 update_cgrp_time_from_event(event);
3210 } 3243 }
3244
3211 update_event_times(event); 3245 update_event_times(event);
3212 if (event->state == PERF_EVENT_STATE_ACTIVE) 3246 if (event->state != PERF_EVENT_STATE_ACTIVE)
3213 event->pmu->read(event); 3247 goto unlock;
3248
3249 if (!data->group) {
3250 pmu->read(event);
3251 data->ret = 0;
3252 goto unlock;
3253 }
3254
3255 pmu->start_txn(pmu, PERF_PMU_TXN_READ);
3256
3257 pmu->read(event);
3258
3259 list_for_each_entry(sub, &event->sibling_list, group_entry) {
3260 update_event_times(sub);
3261 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
3262 /*
3263 * Use sibling's PMU rather than @event's since
3264 * sibling could be on different (eg: software) PMU.
3265 */
3266 sub->pmu->read(sub);
3267 }
3268 }
3269
3270 data->ret = pmu->commit_txn(pmu);
3271
3272unlock:
3214 raw_spin_unlock(&ctx->lock); 3273 raw_spin_unlock(&ctx->lock);
3215} 3274}
3216 3275
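The group branch above wraps the sibling reads in a PERF_PMU_TXN_READ transaction, which lets a PMU driver turn N per-event reads into one batched hardware or hypervisor access. A driver opting in would follow a pattern along these lines; struct foo_pmu, the foo_* names and the batching helpers are placeholders, not an API defined by this patch:

struct foo_pmu {
	struct pmu	pmu;
	unsigned int	txn_flags;
	/* device-specific state for queued counter reads would live here */
};

#define to_foo_pmu(p)	container_of((p), struct foo_pmu, pmu)

static void foo_pmu_start_txn(struct pmu *pmu, unsigned int flags)
{
	struct foo_pmu *fp = to_foo_pmu(pmu);

	fp->txn_flags = flags;
	if (flags & PERF_PMU_TXN_READ)
		foo_begin_read_batch(fp);	/* placeholder: start queueing reads */
}

static void foo_pmu_read(struct perf_event *event)
{
	struct foo_pmu *fp = to_foo_pmu(event->pmu);

	if (fp->txn_flags & PERF_PMU_TXN_READ)
		foo_queue_read(fp, event);	/* deferred; counts updated in commit_txn */
	else
		foo_update_count(event);	/* immediate single-counter read */
}

static int foo_pmu_commit_txn(struct pmu *pmu)
{
	struct foo_pmu *fp = to_foo_pmu(pmu);

	if (fp->txn_flags & PERF_PMU_TXN_READ)
		return foo_flush_read_batch(fp);	/* one access for all queued events */

	return 0;	/* TXN_ADD handling elided */
}

Drivers that never look at the flags keep working: the perf_pmu_nop_txn()/perf_pmu_start_txn() fallbacks changed later in this file only disable and re-enable the PMU for PERF_PMU_TXN_ADD and treat every other transaction type as a no-op.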
@@ -3275,15 +3334,23 @@ u64 perf_event_read_local(struct perf_event *event)
3275 return val; 3334 return val;
3276} 3335}
3277 3336
3278static u64 perf_event_read(struct perf_event *event) 3337static int perf_event_read(struct perf_event *event, bool group)
3279{ 3338{
3339 int ret = 0;
3340
3280 /* 3341 /*
3281 * If event is enabled and currently active on a CPU, update the 3342 * If event is enabled and currently active on a CPU, update the
3282 * value in the event structure: 3343 * value in the event structure:
3283 */ 3344 */
3284 if (event->state == PERF_EVENT_STATE_ACTIVE) { 3345 if (event->state == PERF_EVENT_STATE_ACTIVE) {
3346 struct perf_read_data data = {
3347 .event = event,
3348 .group = group,
3349 .ret = 0,
3350 };
3285 smp_call_function_single(event->oncpu, 3351 smp_call_function_single(event->oncpu,
3286 __perf_event_read, event, 1); 3352 __perf_event_read, &data, 1);
3353 ret = data.ret;
3287 } else if (event->state == PERF_EVENT_STATE_INACTIVE) { 3354 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
3288 struct perf_event_context *ctx = event->ctx; 3355 struct perf_event_context *ctx = event->ctx;
3289 unsigned long flags; 3356 unsigned long flags;
@@ -3298,11 +3365,14 @@ static u64 perf_event_read(struct perf_event *event)
3298 update_context_time(ctx); 3365 update_context_time(ctx);
3299 update_cgrp_time_from_event(event); 3366 update_cgrp_time_from_event(event);
3300 } 3367 }
3301 update_event_times(event); 3368 if (group)
3369 update_group_times(event);
3370 else
3371 update_event_times(event);
3302 raw_spin_unlock_irqrestore(&ctx->lock, flags); 3372 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3303 } 3373 }
3304 3374
3305 return perf_event_count(event); 3375 return ret;
3306} 3376}
3307 3377
3308/* 3378/*
@@ -3744,7 +3814,7 @@ static void put_event(struct perf_event *event)
3744 * see the comment there. 3814 * see the comment there.
3745 * 3815 *
3746 * 2) there is a lock-inversion with mmap_sem through 3816 * 2) there is a lock-inversion with mmap_sem through
3747 * perf_event_read_group(), which takes faults while 3817 * perf_read_group(), which takes faults while
3748 * holding ctx->mutex, however this is called after 3818 * holding ctx->mutex, however this is called after
3749 * the last filedesc died, so there is no possibility 3819 * the last filedesc died, so there is no possibility
3750 * to trigger the AB-BA case. 3820 * to trigger the AB-BA case.
@@ -3818,14 +3888,18 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
3818 *running = 0; 3888 *running = 0;
3819 3889
3820 mutex_lock(&event->child_mutex); 3890 mutex_lock(&event->child_mutex);
3821 total += perf_event_read(event); 3891
3892 (void)perf_event_read(event, false);
3893 total += perf_event_count(event);
3894
3822 *enabled += event->total_time_enabled + 3895 *enabled += event->total_time_enabled +
3823 atomic64_read(&event->child_total_time_enabled); 3896 atomic64_read(&event->child_total_time_enabled);
3824 *running += event->total_time_running + 3897 *running += event->total_time_running +
3825 atomic64_read(&event->child_total_time_running); 3898 atomic64_read(&event->child_total_time_running);
3826 3899
3827 list_for_each_entry(child, &event->child_list, child_list) { 3900 list_for_each_entry(child, &event->child_list, child_list) {
3828 total += perf_event_read(child); 3901 (void)perf_event_read(child, false);
3902 total += perf_event_count(child);
3829 *enabled += child->total_time_enabled; 3903 *enabled += child->total_time_enabled;
3830 *running += child->total_time_running; 3904 *running += child->total_time_running;
3831 } 3905 }
@@ -3835,55 +3909,95 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
3835} 3909}
3836EXPORT_SYMBOL_GPL(perf_event_read_value); 3910EXPORT_SYMBOL_GPL(perf_event_read_value);
3837 3911
3838static int perf_event_read_group(struct perf_event *event, 3912static int __perf_read_group_add(struct perf_event *leader,
3839 u64 read_format, char __user *buf) 3913 u64 read_format, u64 *values)
3840{ 3914{
3841 struct perf_event *leader = event->group_leader, *sub; 3915 struct perf_event *sub;
3842 struct perf_event_context *ctx = leader->ctx; 3916 int n = 1; /* skip @nr */
3843 int n = 0, size = 0, ret; 3917 int ret;
3844 u64 count, enabled, running;
3845 u64 values[5];
3846 3918
3847 lockdep_assert_held(&ctx->mutex); 3919 ret = perf_event_read(leader, true);
3920 if (ret)
3921 return ret;
3922
3923 /*
3924 * Since we co-schedule groups, {enabled,running} times of siblings
3925 * will be identical to those of the leader, so we only publish one
3926 * set.
3927 */
3928 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3929 values[n++] += leader->total_time_enabled +
3930 atomic64_read(&leader->child_total_time_enabled);
3931 }
3848 3932
3849 count = perf_event_read_value(leader, &enabled, &running); 3933 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3934 values[n++] += leader->total_time_running +
3935 atomic64_read(&leader->child_total_time_running);
3936 }
3850 3937
3851 values[n++] = 1 + leader->nr_siblings; 3938 /*
3852 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 3939 * Write {count,id} tuples for every sibling.
3853 values[n++] = enabled; 3940 */
3854 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 3941 values[n++] += perf_event_count(leader);
3855 values[n++] = running;
3856 values[n++] = count;
3857 if (read_format & PERF_FORMAT_ID) 3942 if (read_format & PERF_FORMAT_ID)
3858 values[n++] = primary_event_id(leader); 3943 values[n++] = primary_event_id(leader);
3859 3944
3860 size = n * sizeof(u64); 3945 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3946 values[n++] += perf_event_count(sub);
3947 if (read_format & PERF_FORMAT_ID)
3948 values[n++] = primary_event_id(sub);
3949 }
3861 3950
3862 if (copy_to_user(buf, values, size)) 3951 return 0;
3863 return -EFAULT; 3952}
3864 3953
3865 ret = size; 3954static int perf_read_group(struct perf_event *event,
3955 u64 read_format, char __user *buf)
3956{
3957 struct perf_event *leader = event->group_leader, *child;
3958 struct perf_event_context *ctx = leader->ctx;
3959 int ret;
3960 u64 *values;
3866 3961
3867 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 3962 lockdep_assert_held(&ctx->mutex);
3868 n = 0;
3869 3963
3870 values[n++] = perf_event_read_value(sub, &enabled, &running); 3964 values = kzalloc(event->read_size, GFP_KERNEL);
3871 if (read_format & PERF_FORMAT_ID) 3965 if (!values)
3872 values[n++] = primary_event_id(sub); 3966 return -ENOMEM;
3873 3967
3874 size = n * sizeof(u64); 3968 values[0] = 1 + leader->nr_siblings;
3875 3969
3876 if (copy_to_user(buf + ret, values, size)) { 3970 /*
3877 return -EFAULT; 3971 * By locking the child_mutex of the leader we effectively
3878 } 3972 * lock the child list of all siblings.. XXX explain how.
3973 */
3974 mutex_lock(&leader->child_mutex);
3879 3975
3880 ret += size; 3976 ret = __perf_read_group_add(leader, read_format, values);
3977 if (ret)
3978 goto unlock;
3979
3980 list_for_each_entry(child, &leader->child_list, child_list) {
3981 ret = __perf_read_group_add(child, read_format, values);
3982 if (ret)
3983 goto unlock;
3881 } 3984 }
3882 3985
3986 mutex_unlock(&leader->child_mutex);
3987
3988 ret = event->read_size;
3989 if (copy_to_user(buf, values, event->read_size))
3990 ret = -EFAULT;
3991 goto out;
3992
3993unlock:
3994 mutex_unlock(&leader->child_mutex);
3995out:
3996 kfree(values);
3883 return ret; 3997 return ret;
3884} 3998}
3885 3999
3886static int perf_event_read_one(struct perf_event *event, 4000static int perf_read_one(struct perf_event *event,
3887 u64 read_format, char __user *buf) 4001 u64 read_format, char __user *buf)
3888{ 4002{
3889 u64 enabled, running; 4003 u64 enabled, running;
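perf_read_group() now assembles the whole reply in one kzalloc'ed buffer of event->read_size bytes and folds child counts into the same slots, but the byte layout seen by user space is the documented PERF_FORMAT_GROUP one: nr, the optional time_enabled/time_running pair (published once per group, as the comment above notes), then a {value[, id]} tuple per member. A minimal reader, assuming all four format bits are set and with error handling trimmed:

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

/*
 * group_fd: perf_event_open() fd of the group leader, opened with
 * read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID |
 *               PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING
 */
static void print_group(int group_fd)
{
	uint64_t buf[512];	/* large enough for this example's group sizes */
	ssize_t n = read(group_fd, buf, sizeof(buf));
	uint64_t i, nr;

	if (n <= 0)
		return;

	nr = buf[0];		/* number of members, leader included */
	/* buf[1] = time_enabled, buf[2] = time_running, shared by the group */
	for (i = 0; i < nr; i++)
		printf("id %llu = %llu\n",
		       (unsigned long long)buf[4 + 2 * i],	/* id    */
		       (unsigned long long)buf[3 + 2 * i]);	/* value */
}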
@@ -3921,7 +4035,7 @@ static bool is_event_hup(struct perf_event *event)
3921 * Read the performance event - simple non blocking version for now 4035 * Read the performance event - simple non blocking version for now
3922 */ 4036 */
3923static ssize_t 4037static ssize_t
3924perf_read_hw(struct perf_event *event, char __user *buf, size_t count) 4038__perf_read(struct perf_event *event, char __user *buf, size_t count)
3925{ 4039{
3926 u64 read_format = event->attr.read_format; 4040 u64 read_format = event->attr.read_format;
3927 int ret; 4041 int ret;
@@ -3939,9 +4053,9 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
3939 4053
3940 WARN_ON_ONCE(event->ctx->parent_ctx); 4054 WARN_ON_ONCE(event->ctx->parent_ctx);
3941 if (read_format & PERF_FORMAT_GROUP) 4055 if (read_format & PERF_FORMAT_GROUP)
3942 ret = perf_event_read_group(event, read_format, buf); 4056 ret = perf_read_group(event, read_format, buf);
3943 else 4057 else
3944 ret = perf_event_read_one(event, read_format, buf); 4058 ret = perf_read_one(event, read_format, buf);
3945 4059
3946 return ret; 4060 return ret;
3947} 4061}
@@ -3954,7 +4068,7 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3954 int ret; 4068 int ret;
3955 4069
3956 ctx = perf_event_ctx_lock(event); 4070 ctx = perf_event_ctx_lock(event);
3957 ret = perf_read_hw(event, buf, count); 4071 ret = __perf_read(event, buf, count);
3958 perf_event_ctx_unlock(event, ctx); 4072 perf_event_ctx_unlock(event, ctx);
3959 4073
3960 return ret; 4074 return ret;
@@ -3985,7 +4099,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
3985 4099
3986static void _perf_event_reset(struct perf_event *event) 4100static void _perf_event_reset(struct perf_event *event)
3987{ 4101{
3988 (void)perf_event_read(event); 4102 (void)perf_event_read(event, false);
3989 local64_set(&event->count, 0); 4103 local64_set(&event->count, 0);
3990 perf_event_update_userpage(event); 4104 perf_event_update_userpage(event);
3991} 4105}
@@ -5261,9 +5375,15 @@ void perf_output_sample(struct perf_output_handle *handle,
5261 5375
5262 if (sample_type & PERF_SAMPLE_RAW) { 5376 if (sample_type & PERF_SAMPLE_RAW) {
5263 if (data->raw) { 5377 if (data->raw) {
5264 perf_output_put(handle, data->raw->size); 5378 u32 raw_size = data->raw->size;
5265 __output_copy(handle, data->raw->data, 5379 u32 real_size = round_up(raw_size + sizeof(u32),
5266 data->raw->size); 5380 sizeof(u64)) - sizeof(u32);
5381 u64 zero = 0;
5382
5383 perf_output_put(handle, real_size);
5384 __output_copy(handle, data->raw->data, raw_size);
5385 if (real_size - raw_size)
5386 __output_copy(handle, &zero, real_size - raw_size);
5267 } else { 5387 } else {
5268 struct { 5388 struct {
5269 u32 size; 5389 u32 size;
@@ -5395,8 +5515,7 @@ void perf_prepare_sample(struct perf_event_header *header,
5395 else 5515 else
5396 size += sizeof(u32); 5516 size += sizeof(u32);
5397 5517
5398 WARN_ON_ONCE(size & (sizeof(u64)-1)); 5518 header->size += round_up(size, sizeof(u64));
5399 header->size += size;
5400 } 5519 }
5401 5520
5402 if (sample_type & PERF_SAMPLE_BRANCH_STACK) { 5521 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
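Both alignment hunks enforce the same invariant: every chunk of a sample record stays a multiple of 8 bytes. A worked example for PERF_SAMPLE_RAW with raw_size = 6: real_size = round_up(6 + 4, 8) - 4 = 12, so the u32 size field announces 12 bytes, the 6 raw bytes are copied and 6 bytes of zero padding follow, giving 16 bytes on the wire including the size field. perf_prepare_sample() now reserves the matching round_up(6 + 4, 8) = 16 bytes instead of warning when the caller's raw blob is not already u64-aligned.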
@@ -7267,24 +7386,49 @@ static void perf_pmu_nop_void(struct pmu *pmu)
7267{ 7386{
7268} 7387}
7269 7388
7389static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
7390{
7391}
7392
7270static int perf_pmu_nop_int(struct pmu *pmu) 7393static int perf_pmu_nop_int(struct pmu *pmu)
7271{ 7394{
7272 return 0; 7395 return 0;
7273} 7396}
7274 7397
7275static void perf_pmu_start_txn(struct pmu *pmu) 7398static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
7399
7400static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
7276{ 7401{
7402 __this_cpu_write(nop_txn_flags, flags);
7403
7404 if (flags & ~PERF_PMU_TXN_ADD)
7405 return;
7406
7277 perf_pmu_disable(pmu); 7407 perf_pmu_disable(pmu);
7278} 7408}
7279 7409
7280static int perf_pmu_commit_txn(struct pmu *pmu) 7410static int perf_pmu_commit_txn(struct pmu *pmu)
7281{ 7411{
7412 unsigned int flags = __this_cpu_read(nop_txn_flags);
7413
7414 __this_cpu_write(nop_txn_flags, 0);
7415
7416 if (flags & ~PERF_PMU_TXN_ADD)
7417 return 0;
7418
7282 perf_pmu_enable(pmu); 7419 perf_pmu_enable(pmu);
7283 return 0; 7420 return 0;
7284} 7421}
7285 7422
7286static void perf_pmu_cancel_txn(struct pmu *pmu) 7423static void perf_pmu_cancel_txn(struct pmu *pmu)
7287{ 7424{
7425 unsigned int flags = __this_cpu_read(nop_txn_flags);
7426
7427 __this_cpu_write(nop_txn_flags, 0);
7428
7429 if (flags & ~PERF_PMU_TXN_ADD)
7430 return;
7431
7288 perf_pmu_enable(pmu); 7432 perf_pmu_enable(pmu);
7289} 7433}
7290 7434
@@ -7523,7 +7667,7 @@ got_cpu_context:
7523 pmu->commit_txn = perf_pmu_commit_txn; 7667 pmu->commit_txn = perf_pmu_commit_txn;
7524 pmu->cancel_txn = perf_pmu_cancel_txn; 7668 pmu->cancel_txn = perf_pmu_cancel_txn;
7525 } else { 7669 } else {
7526 pmu->start_txn = perf_pmu_nop_void; 7670 pmu->start_txn = perf_pmu_nop_txn;
7527 pmu->commit_txn = perf_pmu_nop_int; 7671 pmu->commit_txn = perf_pmu_nop_int;
7528 pmu->cancel_txn = perf_pmu_nop_void; 7672 pmu->cancel_txn = perf_pmu_nop_void;
7529 } 7673 }
@@ -7611,7 +7755,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
7611 return ret; 7755 return ret;
7612} 7756}
7613 7757
7614struct pmu *perf_init_event(struct perf_event *event) 7758static struct pmu *perf_init_event(struct perf_event *event)
7615{ 7759{
7616 struct pmu *pmu = NULL; 7760 struct pmu *pmu = NULL;
7617 int idx; 7761 int idx;
@@ -8297,13 +8441,35 @@ SYSCALL_DEFINE5(perf_event_open,
8297 8441
8298 if (move_group) { 8442 if (move_group) {
8299 gctx = group_leader->ctx; 8443 gctx = group_leader->ctx;
8444 mutex_lock_double(&gctx->mutex, &ctx->mutex);
8445 } else {
8446 mutex_lock(&ctx->mutex);
8447 }
8448
8449 if (!perf_event_validate_size(event)) {
8450 err = -E2BIG;
8451 goto err_locked;
8452 }
8453
8454 /*
8455 * Must be under the same ctx::mutex as perf_install_in_context(),
8456 * because we need to serialize with concurrent event creation.
8457 */
8458 if (!exclusive_event_installable(event, ctx)) {
8459 /* exclusive and group stuff are assumed mutually exclusive */
8460 WARN_ON_ONCE(move_group);
8300 8461
8462 err = -EBUSY;
8463 goto err_locked;
8464 }
8465
8466 WARN_ON_ONCE(ctx->parent_ctx);
8467
8468 if (move_group) {
8301 /* 8469 /*
8302 * See perf_event_ctx_lock() for comments on the details 8470 * See perf_event_ctx_lock() for comments on the details
8303 * of swizzling perf_event::ctx. 8471 * of swizzling perf_event::ctx.
8304 */ 8472 */
8305 mutex_lock_double(&gctx->mutex, &ctx->mutex);
8306
8307 perf_remove_from_context(group_leader, false); 8473 perf_remove_from_context(group_leader, false);
8308 8474
8309 list_for_each_entry(sibling, &group_leader->sibling_list, 8475 list_for_each_entry(sibling, &group_leader->sibling_list,
@@ -8311,13 +8477,7 @@ SYSCALL_DEFINE5(perf_event_open,
8311 perf_remove_from_context(sibling, false); 8477 perf_remove_from_context(sibling, false);
8312 put_ctx(gctx); 8478 put_ctx(gctx);
8313 } 8479 }
8314 } else {
8315 mutex_lock(&ctx->mutex);
8316 }
8317 8480
8318 WARN_ON_ONCE(ctx->parent_ctx);
8319
8320 if (move_group) {
8321 /* 8481 /*
8322 * Wait for everybody to stop referencing the events through 8482 * Wait for everybody to stop referencing the events through
8323 * the old lists, before installing it on new lists. 8483 * the old lists, before installing it on new lists.
@@ -8349,22 +8509,29 @@ SYSCALL_DEFINE5(perf_event_open,
8349 perf_event__state_init(group_leader); 8509 perf_event__state_init(group_leader);
8350 perf_install_in_context(ctx, group_leader, group_leader->cpu); 8510 perf_install_in_context(ctx, group_leader, group_leader->cpu);
8351 get_ctx(ctx); 8511 get_ctx(ctx);
8352 }
8353 8512
8354 if (!exclusive_event_installable(event, ctx)) { 8513 /*
8355 err = -EBUSY; 8514 * Now that all events are installed in @ctx, nothing
8356 mutex_unlock(&ctx->mutex); 8515 * references @gctx anymore, so drop the last reference we have
8357 fput(event_file); 8516 * on it.
8358 goto err_context; 8517 */
8518 put_ctx(gctx);
8359 } 8519 }
8360 8520
8521 /*
8522 * Precalculate sample_data sizes; do while holding ctx::mutex such
8523 * that we're serialized against further additions and before
8524 * perf_install_in_context() which is the point the event is active and
8525 * can use these values.
8526 */
8527 perf_event__header_size(event);
8528 perf_event__id_header_size(event);
8529
8361 perf_install_in_context(ctx, event, event->cpu); 8530 perf_install_in_context(ctx, event, event->cpu);
8362 perf_unpin_context(ctx); 8531 perf_unpin_context(ctx);
8363 8532
8364 if (move_group) { 8533 if (move_group)
8365 mutex_unlock(&gctx->mutex); 8534 mutex_unlock(&gctx->mutex);
8366 put_ctx(gctx);
8367 }
8368 mutex_unlock(&ctx->mutex); 8535 mutex_unlock(&ctx->mutex);
8369 8536
8370 put_online_cpus(); 8537 put_online_cpus();
@@ -8376,12 +8543,6 @@ SYSCALL_DEFINE5(perf_event_open,
8376 mutex_unlock(&current->perf_event_mutex); 8543 mutex_unlock(&current->perf_event_mutex);
8377 8544
8378 /* 8545 /*
8379 * Precalculate sample_data sizes
8380 */
8381 perf_event__header_size(event);
8382 perf_event__id_header_size(event);
8383
8384 /*
8385 * Drop the reference on the group_event after placing the 8546 * Drop the reference on the group_event after placing the
8386 * new event on the sibling_list. This ensures destruction 8547 * new event on the sibling_list. This ensures destruction
8387 * of the group leader will find the pointer to itself in 8548 * of the group leader will find the pointer to itself in
@@ -8391,6 +8552,12 @@ SYSCALL_DEFINE5(perf_event_open,
8391 fd_install(event_fd, event_file); 8552 fd_install(event_fd, event_file);
8392 return event_fd; 8553 return event_fd;
8393 8554
8555err_locked:
8556 if (move_group)
8557 mutex_unlock(&gctx->mutex);
8558 mutex_unlock(&ctx->mutex);
8559/* err_file: */
8560 fput(event_file);
8394err_context: 8561err_context:
8395 perf_unpin_context(ctx); 8562 perf_unpin_context(ctx);
8396 put_ctx(ctx); 8563 put_ctx(ctx);
@@ -9293,25 +9460,9 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css,
9293 task_function_call(task, __perf_cgroup_move, task); 9460 task_function_call(task, __perf_cgroup_move, task);
9294} 9461}
9295 9462
9296static void perf_cgroup_exit(struct cgroup_subsys_state *css,
9297 struct cgroup_subsys_state *old_css,
9298 struct task_struct *task)
9299{
9300 /*
9301 * cgroup_exit() is called in the copy_process() failure path.
9302 * Ignore this case since the task hasn't ran yet, this avoids
9303 * trying to poke a half freed task state from generic code.
9304 */
9305 if (!(task->flags & PF_EXITING))
9306 return;
9307
9308 task_function_call(task, __perf_cgroup_move, task);
9309}
9310
9311struct cgroup_subsys perf_event_cgrp_subsys = { 9463struct cgroup_subsys perf_event_cgrp_subsys = {
9312 .css_alloc = perf_cgroup_css_alloc, 9464 .css_alloc = perf_cgroup_css_alloc,
9313 .css_free = perf_cgroup_css_free, 9465 .css_free = perf_cgroup_css_free,
9314 .exit = perf_cgroup_exit,
9315 .attach = perf_cgroup_attach, 9466 .attach = perf_cgroup_attach,
9316}; 9467};
9317#endif /* CONFIG_CGROUP_PERF */ 9468#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 182bc30899d5..b5d1ea79c595 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -141,7 +141,7 @@ int perf_output_begin(struct perf_output_handle *handle,
141 perf_output_get_handle(handle); 141 perf_output_get_handle(handle);
142 142
143 do { 143 do {
144 tail = READ_ONCE_CTRL(rb->user_page->data_tail); 144 tail = READ_ONCE(rb->user_page->data_tail);
145 offset = head = local_read(&rb->head); 145 offset = head = local_read(&rb->head);
146 if (!rb->overwrite && 146 if (!rb->overwrite &&
147 unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) 147 unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
diff --git a/kernel/exit.c b/kernel/exit.c
index ea95ee1b5ef7..07110c6020a0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -706,10 +706,12 @@ void do_exit(long code)
706 smp_mb(); 706 smp_mb();
707 raw_spin_unlock_wait(&tsk->pi_lock); 707 raw_spin_unlock_wait(&tsk->pi_lock);
708 708
709 if (unlikely(in_atomic())) 709 if (unlikely(in_atomic())) {
710 pr_info("note: %s[%d] exited with preempt_count %d\n", 710 pr_info("note: %s[%d] exited with preempt_count %d\n",
711 current->comm, task_pid_nr(current), 711 current->comm, task_pid_nr(current),
712 preempt_count()); 712 preempt_count());
713 preempt_count_set(PREEMPT_ENABLED);
714 }
713 715
714 /* sync mm's RSS info before statistics gathering */ 716 /* sync mm's RSS info before statistics gathering */
715 if (tsk->mm) 717 if (tsk->mm)
@@ -761,7 +763,9 @@ void do_exit(long code)
761 */ 763 */
762 flush_ptrace_hw_breakpoint(tsk); 764 flush_ptrace_hw_breakpoint(tsk);
763 765
766 TASKS_RCU(preempt_disable());
764 TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu)); 767 TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
768 TASKS_RCU(preempt_enable());
765 exit_notify(tsk, group_dead); 769 exit_notify(tsk, group_dead);
766 proc_exit_connector(tsk); 770 proc_exit_connector(tsk);
767#ifdef CONFIG_NUMA 771#ifdef CONFIG_NUMA
diff --git a/kernel/fork.c b/kernel/fork.c
index 7d5f0f118a63..f97f2c449f5c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -251,6 +251,7 @@ void __put_task_struct(struct task_struct *tsk)
251 WARN_ON(atomic_read(&tsk->usage)); 251 WARN_ON(atomic_read(&tsk->usage));
252 WARN_ON(tsk == current); 252 WARN_ON(tsk == current);
253 253
254 cgroup_free(tsk);
254 task_numa_free(tsk); 255 task_numa_free(tsk);
255 security_task_free(tsk); 256 security_task_free(tsk);
256 exit_creds(tsk); 257 exit_creds(tsk);
@@ -454,7 +455,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
454 tmp->vm_mm = mm; 455 tmp->vm_mm = mm;
455 if (anon_vma_fork(tmp, mpnt)) 456 if (anon_vma_fork(tmp, mpnt))
456 goto fail_nomem_anon_vma_fork; 457 goto fail_nomem_anon_vma_fork;
457 tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP); 458 tmp->vm_flags &=
459 ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP);
458 tmp->vm_next = tmp->vm_prev = NULL; 460 tmp->vm_next = tmp->vm_prev = NULL;
459 tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; 461 tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
460 file = tmp->vm_file; 462 file = tmp->vm_file;
@@ -1101,7 +1103,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
1101 cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); 1103 cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1102 if (cpu_limit != RLIM_INFINITY) { 1104 if (cpu_limit != RLIM_INFINITY) {
1103 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); 1105 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
1104 sig->cputimer.running = 1; 1106 sig->cputimer.running = true;
1105 } 1107 }
1106 1108
1107 /* The timer lists. */ 1109 /* The timer lists. */
diff --git a/kernel/futex.c b/kernel/futex.c
index 6e443efc65f4..684d7549825a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -255,9 +255,18 @@ struct futex_hash_bucket {
255 struct plist_head chain; 255 struct plist_head chain;
256} ____cacheline_aligned_in_smp; 256} ____cacheline_aligned_in_smp;
257 257
258static unsigned long __read_mostly futex_hashsize; 258/*
259 * The base of the bucket array and its size are always used together
260 * (after initialization only in hash_futex()), so ensure that they
261 * reside in the same cacheline.
262 */
263static struct {
264 struct futex_hash_bucket *queues;
265 unsigned long hashsize;
266} __futex_data __read_mostly __aligned(2*sizeof(long));
267#define futex_queues (__futex_data.queues)
268#define futex_hashsize (__futex_data.hashsize)
259 269
260static struct futex_hash_bucket *futex_queues;
261 270
262/* 271/*
263 * Fault injections for futexes. 272 * Fault injections for futexes.
@@ -267,10 +276,10 @@ static struct futex_hash_bucket *futex_queues;
267static struct { 276static struct {
268 struct fault_attr attr; 277 struct fault_attr attr;
269 278
270 u32 ignore_private; 279 bool ignore_private;
271} fail_futex = { 280} fail_futex = {
272 .attr = FAULT_ATTR_INITIALIZER, 281 .attr = FAULT_ATTR_INITIALIZER,
273 .ignore_private = 0, 282 .ignore_private = false,
274}; 283};
275 284
276static int __init setup_fail_futex(char *str) 285static int __init setup_fail_futex(char *str)
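The futex change is purely a data-layout optimisation: the bucket array pointer and the hash size are only ever used together in the hash-to-bucket lookup, so packing them into one __aligned(2*sizeof(long)) struct keeps that lookup within a single cache line, and the #define aliases keep every existing reference compiling unchanged. A sketch of the consumer, assuming the hash value is already computed (the real hash_futex() derives it from the futex key):

static inline struct futex_hash_bucket *futex_bucket(u32 hash)
{
	/*
	 * futex_queues and futex_hashsize are now macros over __futex_data,
	 * so both loads below hit the same cache line.
	 */
	return &futex_queues[hash & (futex_hashsize - 1)];
}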
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 9a76e3beda54..3b48dab80164 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -30,6 +30,10 @@ config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
30config GENERIC_PENDING_IRQ 30config GENERIC_PENDING_IRQ
31 bool 31 bool
32 32
33# Support for generic irq migrating off cpu before the cpu is offline.
34config GENERIC_IRQ_MIGRATION
35 bool
36
33# Alpha specific irq affinity mechanism 37# Alpha specific irq affinity mechanism
34config AUTO_IRQ_AFFINITY 38config AUTO_IRQ_AFFINITY
35 bool 39 bool
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index d12123526e2b..2fc9cbdf35b6 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -5,5 +5,6 @@ obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
5obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o 5obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
6obj-$(CONFIG_PROC_FS) += proc.o 6obj-$(CONFIG_PROC_FS) += proc.o
7obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 7obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
8obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
8obj-$(CONFIG_PM_SLEEP) += pm.o 9obj-$(CONFIG_PM_SLEEP) += pm.o
9obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o 10obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index e28169dd1c36..15206453b12a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -21,6 +21,20 @@
21 21
22#include "internals.h" 22#include "internals.h"
23 23
24static irqreturn_t bad_chained_irq(int irq, void *dev_id)
25{
26 WARN_ONCE(1, "Chained irq %d should not call an action\n", irq);
27 return IRQ_NONE;
28}
29
30/*
31 * Chained handlers should never call action on their IRQ. This default
32 * action will emit a warning if such a thing happens.
33 */
34struct irqaction chained_action = {
35 .handler = bad_chained_irq,
36};
37
24/** 38/**
25 * irq_set_chip - set the irq chip for an irq 39 * irq_set_chip - set the irq chip for an irq
26 * @irq: irq number 40 * @irq: irq number
@@ -227,6 +241,13 @@ void irq_enable(struct irq_desc *desc)
227 * disabled. If an interrupt happens, then the interrupt flow 241 * disabled. If an interrupt happens, then the interrupt flow
228 * handler masks the line at the hardware level and marks it 242 * handler masks the line at the hardware level and marks it
229 * pending. 243 * pending.
244 *
245 * If the interrupt chip does not implement the irq_disable callback,
246 * a driver can disable the lazy approach for a particular irq line by
247 * calling 'irq_set_status_flags(irq, IRQ_DISABLE_UNLAZY)'. This can
248 * be used for devices which cannot disable the interrupt at the
249 * device level under certain circumstances and have to use
250 * disable_irq[_nosync] instead.
230 */ 251 */
231void irq_disable(struct irq_desc *desc) 252void irq_disable(struct irq_desc *desc)
232{ 253{
@@ -234,6 +255,8 @@ void irq_disable(struct irq_desc *desc)
234 if (desc->irq_data.chip->irq_disable) { 255 if (desc->irq_data.chip->irq_disable) {
235 desc->irq_data.chip->irq_disable(&desc->irq_data); 256 desc->irq_data.chip->irq_disable(&desc->irq_data);
236 irq_state_set_masked(desc); 257 irq_state_set_masked(desc);
258 } else if (irq_settings_disable_unlazy(desc)) {
259 mask_irq(desc);
237 } 260 }
238} 261}
239 262
@@ -669,7 +692,7 @@ void handle_percpu_irq(struct irq_desc *desc)
669 if (chip->irq_ack) 692 if (chip->irq_ack)
670 chip->irq_ack(&desc->irq_data); 693 chip->irq_ack(&desc->irq_data);
671 694
672 handle_irq_event_percpu(desc, desc->action); 695 handle_irq_event_percpu(desc);
673 696
674 if (chip->irq_eoi) 697 if (chip->irq_eoi)
675 chip->irq_eoi(&desc->irq_data); 698 chip->irq_eoi(&desc->irq_data);
@@ -746,6 +769,8 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
746 if (desc->irq_data.chip != &no_irq_chip) 769 if (desc->irq_data.chip != &no_irq_chip)
747 mask_ack_irq(desc); 770 mask_ack_irq(desc);
748 irq_state_set_disabled(desc); 771 irq_state_set_disabled(desc);
772 if (is_chained)
773 desc->action = NULL;
749 desc->depth = 1; 774 desc->depth = 1;
750 } 775 }
751 desc->handle_irq = handle; 776 desc->handle_irq = handle;
@@ -755,6 +780,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
755 irq_settings_set_noprobe(desc); 780 irq_settings_set_noprobe(desc);
756 irq_settings_set_norequest(desc); 781 irq_settings_set_norequest(desc);
757 irq_settings_set_nothread(desc); 782 irq_settings_set_nothread(desc);
783 desc->action = &chained_action;
758 irq_startup(desc, true); 784 irq_startup(desc, true);
759 } 785 }
760} 786}
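The expanded irq_disable() comment and the new irq_settings_disable_unlazy() branch give drivers an opt-out from lazy disabling. A driver whose device cannot gate the interrupt at the device level sets the flag once, after which disable_irq()/disable_irq_nosync() mask at the irq chip immediately. A minimal sketch; foo_probe(), foo_handler() and the irq index are placeholders:

static irqreturn_t foo_handler(int irq, void *dev_id)
{
	/* device-specific handling elided */
	return IRQ_HANDLED;
}

static int foo_probe(struct platform_device *pdev)
{
	int irq = platform_get_irq(pdev, 0);

	if (irq < 0)
		return irq;

	/*
	 * The device cannot gate this interrupt itself, so opt out of the
	 * lazy scheme: disable_irq[_nosync]() will mask at the irq chip
	 * right away instead of waiting for the next interrupt to arrive.
	 */
	irq_set_status_flags(irq, IRQ_DISABLE_UNLAZY);

	return devm_request_irq(&pdev->dev, irq, foo_handler, 0,
				dev_name(&pdev->dev), pdev);
}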
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
new file mode 100644
index 000000000000..011f8c4c63da
--- /dev/null
+++ b/kernel/irq/cpuhotplug.c
@@ -0,0 +1,82 @@
1/*
2 * Generic cpu hotunplug interrupt migration code copied from the
3 * arch/arm implementation
4 *
5 * Copyright (C) Russell King
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/interrupt.h>
12#include <linux/ratelimit.h>
13#include <linux/irq.h>
14
15#include "internals.h"
16
17static bool migrate_one_irq(struct irq_desc *desc)
18{
19 struct irq_data *d = irq_desc_get_irq_data(desc);
20 const struct cpumask *affinity = d->common->affinity;
21 struct irq_chip *c;
22 bool ret = false;
23
24 /*
25 * If this is a per-CPU interrupt, or the affinity does not
26 * include this CPU, then we have nothing to do.
27 */
28 if (irqd_is_per_cpu(d) ||
29 !cpumask_test_cpu(smp_processor_id(), affinity))
30 return false;
31
32 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
33 affinity = cpu_online_mask;
34 ret = true;
35 }
36
37 c = irq_data_get_irq_chip(d);
38 if (!c->irq_set_affinity) {
39 pr_debug("IRQ%u: unable to set affinity\n", d->irq);
40 } else {
41 int r = irq_do_set_affinity(d, affinity, false);
42 if (r)
43 pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n",
44 d->irq, r);
45 }
46
47 return ret;
48}
49
50/**
51 * irq_migrate_all_off_this_cpu - Migrate irqs away from offline cpu
52 *
53 * The current CPU has been marked offline. Migrate IRQs off this CPU.
54 * If the affinity settings do not allow other CPUs, force them onto any
55 * available CPU.
56 *
57 * Note: we must iterate over all IRQs, whether they have an attached
58 * action structure or not, as we need to get chained interrupts too.
59 */
60void irq_migrate_all_off_this_cpu(void)
61{
62 unsigned int irq;
63 struct irq_desc *desc;
64 unsigned long flags;
65
66 local_irq_save(flags);
67
68 for_each_active_irq(irq) {
69 bool affinity_broken;
70
71 desc = irq_to_desc(irq);
72 raw_spin_lock(&desc->lock);
73 affinity_broken = migrate_one_irq(desc);
74 raw_spin_unlock(&desc->lock);
75
76 if (affinity_broken)
77 pr_warn_ratelimited("IRQ%u no longer affine to CPU%u\n",
78 irq, smp_processor_id());
79 }
80
81 local_irq_restore(flags);
82}
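The new file is built only when an architecture selects GENERIC_IRQ_MIGRATION (see the Kconfig and Makefile hunks above); the intended caller is the architecture's CPU-offline hook, run on the CPU that is going down. Schematically, with arch_cpu_disable() standing in for the per-arch hook (__cpu_disable() on ARM, where this code originates):

int arch_cpu_disable(void)
{
	unsigned int cpu = smp_processor_id();

	/* Take the CPU out of the online mask so nothing new is routed here. */
	set_cpu_online(cpu, false);

	/*
	 * Push every IRQ whose affinity includes this CPU onto the remaining
	 * online CPUs, forcing a new affinity where the old mask allowed only
	 * CPUs that are now offline.
	 */
	irq_migrate_all_off_this_cpu();

	return 0;
}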
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index de41a68fc038..a302cf9a2126 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -22,7 +22,6 @@
22 22
23/** 23/**
24 * handle_bad_irq - handle spurious and unhandled irqs 24 * handle_bad_irq - handle spurious and unhandled irqs
25 * @irq: the interrupt number
26 * @desc: description of the interrupt 25 * @desc: description of the interrupt
27 * 26 *
28 * Handles spurious and unhandled IRQs. It also prints a debug message. 27 * Handles spurious and unhandled IRQs. It also prints a debug message.
@@ -35,6 +34,7 @@ void handle_bad_irq(struct irq_desc *desc)
35 kstat_incr_irqs_this_cpu(desc); 34 kstat_incr_irqs_this_cpu(desc);
36 ack_bad_irq(irq); 35 ack_bad_irq(irq);
37} 36}
37EXPORT_SYMBOL_GPL(handle_bad_irq);
38 38
39/* 39/*
40 * Special, empty irq handler: 40 * Special, empty irq handler:
@@ -132,11 +132,11 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
132 wake_up_process(action->thread); 132 wake_up_process(action->thread);
133} 133}
134 134
135irqreturn_t 135irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
136handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
137{ 136{
138 irqreturn_t retval = IRQ_NONE; 137 irqreturn_t retval = IRQ_NONE;
139 unsigned int flags = 0, irq = desc->irq_data.irq; 138 unsigned int flags = 0, irq = desc->irq_data.irq;
139 struct irqaction *action = desc->action;
140 140
141 do { 141 do {
142 irqreturn_t res; 142 irqreturn_t res;
@@ -184,14 +184,13 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
184 184
185irqreturn_t handle_irq_event(struct irq_desc *desc) 185irqreturn_t handle_irq_event(struct irq_desc *desc)
186{ 186{
187 struct irqaction *action = desc->action;
188 irqreturn_t ret; 187 irqreturn_t ret;
189 188
190 desc->istate &= ~IRQS_PENDING; 189 desc->istate &= ~IRQS_PENDING;
191 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); 190 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
192 raw_spin_unlock(&desc->lock); 191 raw_spin_unlock(&desc->lock);
193 192
194 ret = handle_irq_event_percpu(desc, action); 193 ret = handle_irq_event_percpu(desc);
195 194
196 raw_spin_lock(&desc->lock); 195 raw_spin_lock(&desc->lock);
197 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); 196 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 5ef0c2dbe930..05c2188271b8 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -18,6 +18,8 @@
18 18
19extern bool noirqdebug; 19extern bool noirqdebug;
20 20
21extern struct irqaction chained_action;
22
21/* 23/*
22 * Bits used by threaded handlers: 24 * Bits used by threaded handlers:
23 * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run 25 * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
@@ -81,7 +83,7 @@ extern void irq_mark_irq(unsigned int irq);
81 83
82extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 84extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
83 85
84irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action); 86irqreturn_t handle_irq_event_percpu(struct irq_desc *desc);
85irqreturn_t handle_irq_event(struct irq_desc *desc); 87irqreturn_t handle_irq_event(struct irq_desc *desc);
86 88
87/* Resending of interrupts :*/ 89/* Resending of interrupts :*/
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index dc9d27c0c158..22aa9612ef7c 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -27,6 +27,57 @@ static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs,
27 irq_hw_number_t hwirq, int node); 27 irq_hw_number_t hwirq, int node);
28static void irq_domain_check_hierarchy(struct irq_domain *domain); 28static void irq_domain_check_hierarchy(struct irq_domain *domain);
29 29
30struct irqchip_fwid {
31 struct fwnode_handle fwnode;
32 char *name;
33 void *data;
34};
35
36/**
37 * irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for
38 * identifying an irq domain
39 * @data: optional user-provided data
40 *
41 * Allocate a struct irqchip_fwid, and return a pointer to the embedded
42 * fwnode_handle (or NULL on failure).
43 */
44struct fwnode_handle *irq_domain_alloc_fwnode(void *data)
45{
46 struct irqchip_fwid *fwid;
47 char *name;
48
49 fwid = kzalloc(sizeof(*fwid), GFP_KERNEL);
50 name = kasprintf(GFP_KERNEL, "irqchip@%p", data);
51
52 if (!fwid || !name) {
53 kfree(fwid);
54 kfree(name);
55 return NULL;
56 }
57
58 fwid->name = name;
59 fwid->data = data;
60 fwid->fwnode.type = FWNODE_IRQCHIP;
61 return &fwid->fwnode;
62}
63
64/**
65 * irq_domain_free_fwnode - Free a non-OF-backed fwnode_handle
66 *
67 * Free a fwnode_handle allocated with irq_domain_alloc_fwnode.
68 */
69void irq_domain_free_fwnode(struct fwnode_handle *fwnode)
70{
71 struct irqchip_fwid *fwid;
72
73 if (WARN_ON(fwnode->type != FWNODE_IRQCHIP))
74 return;
75
76 fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
77 kfree(fwid->name);
78 kfree(fwid);
79}
80
30/** 81/**
31 * __irq_domain_add() - Allocate a new irq_domain data structure 82 * __irq_domain_add() - Allocate a new irq_domain data structure
32 * @of_node: optional device-tree node of the interrupt controller 83 * @of_node: optional device-tree node of the interrupt controller
@@ -40,23 +91,28 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain);
40 * Allocates and initializes an irq_domain structure. 91 * Allocates and initializes an irq_domain structure.
41 * Returns pointer to IRQ domain, or NULL on failure. 92 * Returns pointer to IRQ domain, or NULL on failure.
42 */ 93 */
43struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, 94struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
44 irq_hw_number_t hwirq_max, int direct_max, 95 irq_hw_number_t hwirq_max, int direct_max,
45 const struct irq_domain_ops *ops, 96 const struct irq_domain_ops *ops,
46 void *host_data) 97 void *host_data)
47{ 98{
48 struct irq_domain *domain; 99 struct irq_domain *domain;
100 struct device_node *of_node;
101
102 of_node = to_of_node(fwnode);
49 103
50 domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), 104 domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size),
51 GFP_KERNEL, of_node_to_nid(of_node)); 105 GFP_KERNEL, of_node_to_nid(of_node));
52 if (WARN_ON(!domain)) 106 if (WARN_ON(!domain))
53 return NULL; 107 return NULL;
54 108
109 of_node_get(of_node);
110
55 /* Fill structure */ 111 /* Fill structure */
56 INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); 112 INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
57 domain->ops = ops; 113 domain->ops = ops;
58 domain->host_data = host_data; 114 domain->host_data = host_data;
59 domain->of_node = of_node_get(of_node); 115 domain->fwnode = fwnode;
60 domain->hwirq_max = hwirq_max; 116 domain->hwirq_max = hwirq_max;
61 domain->revmap_size = size; 117 domain->revmap_size = size;
62 domain->revmap_direct_max_irq = direct_max; 118 domain->revmap_direct_max_irq = direct_max;
@@ -102,7 +158,7 @@ void irq_domain_remove(struct irq_domain *domain)
102 158
103 pr_debug("Removed domain %s\n", domain->name); 159 pr_debug("Removed domain %s\n", domain->name);
104 160
105 of_node_put(domain->of_node); 161 of_node_put(irq_domain_get_of_node(domain));
106 kfree(domain); 162 kfree(domain);
107} 163}
108EXPORT_SYMBOL_GPL(irq_domain_remove); 164EXPORT_SYMBOL_GPL(irq_domain_remove);
@@ -133,7 +189,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
133{ 189{
134 struct irq_domain *domain; 190 struct irq_domain *domain;
135 191
136 domain = __irq_domain_add(of_node, size, size, 0, ops, host_data); 192 domain = __irq_domain_add(of_node_to_fwnode(of_node), size, size, 0, ops, host_data);
137 if (!domain) 193 if (!domain)
138 return NULL; 194 return NULL;
139 195
@@ -177,7 +233,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
177{ 233{
178 struct irq_domain *domain; 234 struct irq_domain *domain;
179 235
180 domain = __irq_domain_add(of_node, first_hwirq + size, 236 domain = __irq_domain_add(of_node_to_fwnode(of_node), first_hwirq + size,
181 first_hwirq + size, 0, ops, host_data); 237 first_hwirq + size, 0, ops, host_data);
182 if (domain) 238 if (domain)
183 irq_domain_associate_many(domain, first_irq, first_hwirq, size); 239 irq_domain_associate_many(domain, first_irq, first_hwirq, size);
@@ -187,12 +243,12 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
187EXPORT_SYMBOL_GPL(irq_domain_add_legacy); 243EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
188 244
189/** 245/**
190 * irq_find_matching_host() - Locates a domain for a given device node 246 * irq_find_matching_fwnode() - Locates a domain for a given fwnode
191 * @node: device-tree node of the interrupt controller 247 * @fwnode: FW descriptor of the interrupt controller
192 * @bus_token: domain-specific data 248 * @bus_token: domain-specific data
193 */ 249 */
194struct irq_domain *irq_find_matching_host(struct device_node *node, 250struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode,
195 enum irq_domain_bus_token bus_token) 251 enum irq_domain_bus_token bus_token)
196{ 252{
197 struct irq_domain *h, *found = NULL; 253 struct irq_domain *h, *found = NULL;
198 int rc; 254 int rc;
@@ -209,9 +265,9 @@ struct irq_domain *irq_find_matching_host(struct device_node *node,
209 mutex_lock(&irq_domain_mutex); 265 mutex_lock(&irq_domain_mutex);
210 list_for_each_entry(h, &irq_domain_list, link) { 266 list_for_each_entry(h, &irq_domain_list, link) {
211 if (h->ops->match) 267 if (h->ops->match)
212 rc = h->ops->match(h, node, bus_token); 268 rc = h->ops->match(h, to_of_node(fwnode), bus_token);
213 else 269 else
214 rc = ((h->of_node != NULL) && (h->of_node == node) && 270 rc = ((fwnode != NULL) && (h->fwnode == fwnode) &&
215 ((bus_token == DOMAIN_BUS_ANY) || 271 ((bus_token == DOMAIN_BUS_ANY) ||
216 (h->bus_token == bus_token))); 272 (h->bus_token == bus_token)));
217 273
@@ -223,7 +279,7 @@ struct irq_domain *irq_find_matching_host(struct device_node *node,
223 mutex_unlock(&irq_domain_mutex); 279 mutex_unlock(&irq_domain_mutex);
224 return found; 280 return found;
225} 281}
226EXPORT_SYMBOL_GPL(irq_find_matching_host); 282EXPORT_SYMBOL_GPL(irq_find_matching_fwnode);
227 283
228/** 284/**
229 * irq_set_default_host() - Set a "default" irq domain 285 * irq_set_default_host() - Set a "default" irq domain
@@ -336,10 +392,12 @@ EXPORT_SYMBOL_GPL(irq_domain_associate);
336void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, 392void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
337 irq_hw_number_t hwirq_base, int count) 393 irq_hw_number_t hwirq_base, int count)
338{ 394{
395 struct device_node *of_node;
339 int i; 396 int i;
340 397
398 of_node = irq_domain_get_of_node(domain);
341 pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, 399 pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__,
342 of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count); 400 of_node_full_name(of_node), irq_base, (int)hwirq_base, count);
343 401
344 for (i = 0; i < count; i++) { 402 for (i = 0; i < count; i++) {
345 irq_domain_associate(domain, irq_base + i, hwirq_base + i); 403 irq_domain_associate(domain, irq_base + i, hwirq_base + i);
@@ -359,12 +417,14 @@ EXPORT_SYMBOL_GPL(irq_domain_associate_many);
359 */ 417 */
360unsigned int irq_create_direct_mapping(struct irq_domain *domain) 418unsigned int irq_create_direct_mapping(struct irq_domain *domain)
361{ 419{
420 struct device_node *of_node;
362 unsigned int virq; 421 unsigned int virq;
363 422
364 if (domain == NULL) 423 if (domain == NULL)
365 domain = irq_default_domain; 424 domain = irq_default_domain;
366 425
367 virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); 426 of_node = irq_domain_get_of_node(domain);
427 virq = irq_alloc_desc_from(1, of_node_to_nid(of_node));
368 if (!virq) { 428 if (!virq) {
369 pr_debug("create_direct virq allocation failed\n"); 429 pr_debug("create_direct virq allocation failed\n");
370 return 0; 430 return 0;
@@ -399,6 +459,7 @@ EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
399unsigned int irq_create_mapping(struct irq_domain *domain, 459unsigned int irq_create_mapping(struct irq_domain *domain,
400 irq_hw_number_t hwirq) 460 irq_hw_number_t hwirq)
401{ 461{
462 struct device_node *of_node;
402 int virq; 463 int virq;
403 464
404 pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); 465 pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
@@ -412,6 +473,8 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
412 } 473 }
413 pr_debug("-> using domain @%p\n", domain); 474 pr_debug("-> using domain @%p\n", domain);
414 475
476 of_node = irq_domain_get_of_node(domain);
477
415 /* Check if mapping already exists */ 478 /* Check if mapping already exists */
416 virq = irq_find_mapping(domain, hwirq); 479 virq = irq_find_mapping(domain, hwirq);
417 if (virq) { 480 if (virq) {
@@ -420,8 +483,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
420 } 483 }
421 484
422 /* Allocate a virtual interrupt number */ 485 /* Allocate a virtual interrupt number */
423 virq = irq_domain_alloc_descs(-1, 1, hwirq, 486 virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node));
424 of_node_to_nid(domain->of_node));
425 if (virq <= 0) { 487 if (virq <= 0) {
426 pr_debug("-> virq allocation failed\n"); 488 pr_debug("-> virq allocation failed\n");
427 return 0; 489 return 0;
@@ -433,7 +495,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
433 } 495 }
434 496
435 pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", 497 pr_debug("irq %lu on domain %s mapped to virtual irq %u\n",
436 hwirq, of_node_full_name(domain->of_node), virq); 498 hwirq, of_node_full_name(of_node), virq);
437 499
438 return virq; 500 return virq;
439} 501}
@@ -460,10 +522,12 @@ EXPORT_SYMBOL_GPL(irq_create_mapping);
460int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base, 522int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
461 irq_hw_number_t hwirq_base, int count) 523 irq_hw_number_t hwirq_base, int count)
462{ 524{
525 struct device_node *of_node;
463 int ret; 526 int ret;
464 527
528 of_node = irq_domain_get_of_node(domain);
465 ret = irq_alloc_descs(irq_base, irq_base, count, 529 ret = irq_alloc_descs(irq_base, irq_base, count,
466 of_node_to_nid(domain->of_node)); 530 of_node_to_nid(of_node));
467 if (unlikely(ret < 0)) 531 if (unlikely(ret < 0))
468 return ret; 532 return ret;
469 533
@@ -472,28 +536,56 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
472} 536}
473EXPORT_SYMBOL_GPL(irq_create_strict_mappings); 537EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
474 538
475unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) 539static int irq_domain_translate(struct irq_domain *d,
540 struct irq_fwspec *fwspec,
541 irq_hw_number_t *hwirq, unsigned int *type)
542{
543#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
544 if (d->ops->translate)
545 return d->ops->translate(d, fwspec, hwirq, type);
546#endif
547 if (d->ops->xlate)
548 return d->ops->xlate(d, to_of_node(fwspec->fwnode),
549 fwspec->param, fwspec->param_count,
550 hwirq, type);
551
552 /* If domain has no translation, then we assume interrupt line */
553 *hwirq = fwspec->param[0];
554 return 0;
555}
556
557static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data,
558 struct irq_fwspec *fwspec)
559{
560 int i;
561
562 fwspec->fwnode = irq_data->np ? &irq_data->np->fwnode : NULL;
563 fwspec->param_count = irq_data->args_count;
564
565 for (i = 0; i < irq_data->args_count; i++)
566 fwspec->param[i] = irq_data->args[i];
567}
568
569unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
476{ 570{
477 struct irq_domain *domain; 571 struct irq_domain *domain;
478 irq_hw_number_t hwirq; 572 irq_hw_number_t hwirq;
479 unsigned int type = IRQ_TYPE_NONE; 573 unsigned int type = IRQ_TYPE_NONE;
480 int virq; 574 int virq;
481 575
482 domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain; 576 if (fwspec->fwnode)
577 domain = irq_find_matching_fwnode(fwspec->fwnode, DOMAIN_BUS_ANY);
578 else
579 domain = irq_default_domain;
580
483 if (!domain) { 581 if (!domain) {
484 pr_warn("no irq domain found for %s !\n", 582 pr_warn("no irq domain found for %s !\n",
485 of_node_full_name(irq_data->np)); 583 of_node_full_name(to_of_node(fwspec->fwnode)));
486 return 0; 584 return 0;
487 } 585 }
488 586
489 /* If domain has no translation, then we assume interrupt line */ 587 if (irq_domain_translate(domain, fwspec, &hwirq, &type))
490 if (domain->ops->xlate == NULL) 588 return 0;
491 hwirq = irq_data->args[0];
492 else {
493 if (domain->ops->xlate(domain, irq_data->np, irq_data->args,
494 irq_data->args_count, &hwirq, &type))
495 return 0;
496 }
497 589
498 if (irq_domain_is_hierarchy(domain)) { 590 if (irq_domain_is_hierarchy(domain)) {
499 /* 591 /*
@@ -504,7 +596,7 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
504 if (virq) 596 if (virq)
505 return virq; 597 return virq;
506 598
507 virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, irq_data); 599 virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec);
508 if (virq <= 0) 600 if (virq <= 0)
509 return 0; 601 return 0;
510 } else { 602 } else {
@@ -520,6 +612,15 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
520 irq_set_irq_type(virq, type); 612 irq_set_irq_type(virq, type);
521 return virq; 613 return virq;
522} 614}
615EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping);
616
617unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
618{
619 struct irq_fwspec fwspec;
620
621 of_phandle_args_to_fwspec(irq_data, &fwspec);
622 return irq_create_fwspec_mapping(&fwspec);
623}
523EXPORT_SYMBOL_GPL(irq_create_of_mapping); 624EXPORT_SYMBOL_GPL(irq_create_of_mapping);
524 625
525/** 626/**
@@ -590,14 +691,16 @@ static int virq_debug_show(struct seq_file *m, void *private)
590 "name", "mapped", "linear-max", "direct-max", "devtree-node"); 691 "name", "mapped", "linear-max", "direct-max", "devtree-node");
591 mutex_lock(&irq_domain_mutex); 692 mutex_lock(&irq_domain_mutex);
592 list_for_each_entry(domain, &irq_domain_list, link) { 693 list_for_each_entry(domain, &irq_domain_list, link) {
694 struct device_node *of_node;
593 int count = 0; 695 int count = 0;
696 of_node = irq_domain_get_of_node(domain);
594 radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0) 697 radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0)
595 count++; 698 count++;
596 seq_printf(m, "%c%-16s %6u %10u %10u %s\n", 699 seq_printf(m, "%c%-16s %6u %10u %10u %s\n",
597 domain == irq_default_domain ? '*' : ' ', domain->name, 700 domain == irq_default_domain ? '*' : ' ', domain->name,
598 domain->revmap_size + count, domain->revmap_size, 701 domain->revmap_size + count, domain->revmap_size,
599 domain->revmap_direct_max_irq, 702 domain->revmap_direct_max_irq,
600 domain->of_node ? of_node_full_name(domain->of_node) : ""); 703 of_node ? of_node_full_name(of_node) : "");
601 } 704 }
602 mutex_unlock(&irq_domain_mutex); 705 mutex_unlock(&irq_domain_mutex);
603 706
@@ -751,11 +854,11 @@ static int irq_domain_alloc_descs(int virq, unsigned int cnt,
751 854
752#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY 855#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
753/** 856/**
754 * irq_domain_add_hierarchy - Add an irqdomain into the hierarchy 857 * irq_domain_create_hierarchy - Add an irqdomain into the hierarchy
755 * @parent: Parent irq domain to associate with the new domain 858 * @parent: Parent irq domain to associate with the new domain
756 * @flags: Irq domain flags associated to the domain 859 * @flags: Irq domain flags associated to the domain
757 * @size: Size of the domain. See below 860 * @size: Size of the domain. See below
758 * @node: Optional device-tree node of the interrupt controller 861 * @fwnode: Optional fwnode of the interrupt controller
759 * @ops: Pointer to the interrupt domain callbacks 862 * @ops: Pointer to the interrupt domain callbacks
760 * @host_data: Controller private data pointer 863 * @host_data: Controller private data pointer
761 * 864 *
@@ -765,19 +868,19 @@ static int irq_domain_alloc_descs(int virq, unsigned int cnt,
765 * domain flags are set. 868 * domain flags are set.
766 * Returns pointer to IRQ domain, or NULL on failure. 869 * Returns pointer to IRQ domain, or NULL on failure.
767 */ 870 */
768struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent, 871struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent,
769 unsigned int flags, 872 unsigned int flags,
770 unsigned int size, 873 unsigned int size,
771 struct device_node *node, 874 struct fwnode_handle *fwnode,
772 const struct irq_domain_ops *ops, 875 const struct irq_domain_ops *ops,
773 void *host_data) 876 void *host_data)
774{ 877{
775 struct irq_domain *domain; 878 struct irq_domain *domain;
776 879
777 if (size) 880 if (size)
778 domain = irq_domain_add_linear(node, size, ops, host_data); 881 domain = irq_domain_create_linear(fwnode, size, ops, host_data);
779 else 882 else
780 domain = irq_domain_add_tree(node, ops, host_data); 883 domain = irq_domain_create_tree(fwnode, ops, host_data);
781 if (domain) { 884 if (domain) {
782 domain->parent = parent; 885 domain->parent = parent;
783 domain->flags |= flags; 886 domain->flags |= flags;
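
[Editor's sketch, not part of the patch: with the fwnode-based variant, a stacked irqchip driver passes its fwnode_handle instead of a device_node when creating the child domain. The helper name and parameters below are placeholders.]

    #include <linux/irqdomain.h>

    /* Create a 32-entry child domain on top of an existing parent domain. */
    static struct irq_domain *example_create_child(struct irq_domain *parent,
                                                   struct fwnode_handle *fwnode,
                                                   const struct irq_domain_ops *ops,
                                                   void *priv)
    {
            /* size = 32 selects a linear revmap; size = 0 would pick a tree. */
            return irq_domain_create_hierarchy(parent, 0, 32, fwnode, ops, priv);
    }
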
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index f9a59f6cabd2..0eebaeef317b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -258,37 +258,6 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
258} 258}
259EXPORT_SYMBOL_GPL(irq_set_affinity_hint); 259EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
260 260
261/**
262 * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
263 * @irq: interrupt number to set affinity
264 * @vcpu_info: vCPU specific data
265 *
266 * This function uses the vCPU specific data to set the vCPU
267 * affinity for an irq. The vCPU specific data is passed from
268 * outside, such as KVM. One example code path is as below:
269 * KVM -> IOMMU -> irq_set_vcpu_affinity().
270 */
271int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info)
272{
273 unsigned long flags;
274 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
275 struct irq_data *data;
276 struct irq_chip *chip;
277 int ret = -ENOSYS;
278
279 if (!desc)
280 return -EINVAL;
281
282 data = irq_desc_get_irq_data(desc);
283 chip = irq_data_get_irq_chip(data);
284 if (chip && chip->irq_set_vcpu_affinity)
285 ret = chip->irq_set_vcpu_affinity(data, vcpu_info);
286 irq_put_desc_unlock(desc, flags);
287
288 return ret;
289}
290EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity);
291
292static void irq_affinity_notify(struct work_struct *work) 261static void irq_affinity_notify(struct work_struct *work)
293{ 262{
294 struct irq_affinity_notify *notify = 263 struct irq_affinity_notify *notify =
@@ -424,6 +393,37 @@ setup_affinity(struct irq_desc *desc, struct cpumask *mask)
424} 393}
425#endif 394#endif
426 395
396/**
397 * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
398 * @irq: interrupt number to set affinity
399 * @vcpu_info: vCPU specific data
400 *
401 * This function uses the vCPU specific data to set the vCPU
402 * affinity for an irq. The vCPU specific data is passed from
403 * outside, such as KVM. One example code path is as below:
404 * KVM -> IOMMU -> irq_set_vcpu_affinity().
405 */
406int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info)
407{
408 unsigned long flags;
409 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
410 struct irq_data *data;
411 struct irq_chip *chip;
412 int ret = -ENOSYS;
413
414 if (!desc)
415 return -EINVAL;
416
417 data = irq_desc_get_irq_data(desc);
418 chip = irq_data_get_irq_chip(data);
419 if (chip && chip->irq_set_vcpu_affinity)
420 ret = chip->irq_set_vcpu_affinity(data, vcpu_info);
421 irq_put_desc_unlock(desc, flags);
422
423 return ret;
424}
425EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity);
426
427void __disable_irq(struct irq_desc *desc) 427void __disable_irq(struct irq_desc *desc)
428{ 428{
429 if (!desc->depth++) 429 if (!desc->depth++)
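
[Editor's sketch, not part of the patch, since the hunk only moves irq_set_vcpu_affinity(): both ends of the interface. An irqchip advertises ->irq_set_vcpu_affinity and a hypervisor-side caller (the KVM -> IOMMU path named in the comment) hands it an opaque per-vCPU cookie. The chip callback body is hypothetical.]

    #include <linux/irq.h>
    #include <linux/interrupt.h>

    /* Chip side: accept the opaque vCPU cookie. */
    static int example_chip_set_vcpu_affinity(struct irq_data *d, void *vcpu_info)
    {
            /* program posted-interrupt / vAPIC routing from vcpu_info (hypothetical) */
            return 0;
    }

    static struct irq_chip example_chip = {
            .name                  = "example",
            .irq_set_vcpu_affinity = example_chip_set_vcpu_affinity,
    };

    /* Caller side: route the cookie through the generic helper. */
    static int example_post_to_vcpu(unsigned int irq, void *vcpu_cookie)
    {
            return irq_set_vcpu_affinity(irq, vcpu_cookie); /* -ENOSYS if unsupported */
    }
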
@@ -730,6 +730,12 @@ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id)
730 return IRQ_NONE; 730 return IRQ_NONE;
731} 731}
732 732
733static irqreturn_t irq_forced_secondary_handler(int irq, void *dev_id)
734{
735 WARN(1, "Secondary action handler called for irq %d\n", irq);
736 return IRQ_NONE;
737}
738
733static int irq_wait_for_interrupt(struct irqaction *action) 739static int irq_wait_for_interrupt(struct irqaction *action)
734{ 740{
735 set_current_state(TASK_INTERRUPTIBLE); 741 set_current_state(TASK_INTERRUPTIBLE);
@@ -756,7 +762,8 @@ static int irq_wait_for_interrupt(struct irqaction *action)
756static void irq_finalize_oneshot(struct irq_desc *desc, 762static void irq_finalize_oneshot(struct irq_desc *desc,
757 struct irqaction *action) 763 struct irqaction *action)
758{ 764{
759 if (!(desc->istate & IRQS_ONESHOT)) 765 if (!(desc->istate & IRQS_ONESHOT) ||
766 action->handler == irq_forced_secondary_handler)
760 return; 767 return;
761again: 768again:
762 chip_bus_lock(desc); 769 chip_bus_lock(desc);
@@ -910,6 +917,18 @@ static void irq_thread_dtor(struct callback_head *unused)
910 irq_finalize_oneshot(desc, action); 917 irq_finalize_oneshot(desc, action);
911} 918}
912 919
920static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action)
921{
922 struct irqaction *secondary = action->secondary;
923
924 if (WARN_ON_ONCE(!secondary))
925 return;
926
927 raw_spin_lock_irq(&desc->lock);
928 __irq_wake_thread(desc, secondary);
929 raw_spin_unlock_irq(&desc->lock);
930}
931
913/* 932/*
914 * Interrupt handler thread 933 * Interrupt handler thread
915 */ 934 */
@@ -940,6 +959,8 @@ static int irq_thread(void *data)
940 action_ret = handler_fn(desc, action); 959 action_ret = handler_fn(desc, action);
941 if (action_ret == IRQ_HANDLED) 960 if (action_ret == IRQ_HANDLED)
942 atomic_inc(&desc->threads_handled); 961 atomic_inc(&desc->threads_handled);
962 if (action_ret == IRQ_WAKE_THREAD)
963 irq_wake_secondary(desc, action);
943 964
944 wake_threads_waitq(desc); 965 wake_threads_waitq(desc);
945 } 966 }
@@ -984,20 +1005,36 @@ void irq_wake_thread(unsigned int irq, void *dev_id)
984} 1005}
985EXPORT_SYMBOL_GPL(irq_wake_thread); 1006EXPORT_SYMBOL_GPL(irq_wake_thread);
986 1007
987static void irq_setup_forced_threading(struct irqaction *new) 1008static int irq_setup_forced_threading(struct irqaction *new)
988{ 1009{
989 if (!force_irqthreads) 1010 if (!force_irqthreads)
990 return; 1011 return 0;
991 if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) 1012 if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT))
992 return; 1013 return 0;
993 1014
994 new->flags |= IRQF_ONESHOT; 1015 new->flags |= IRQF_ONESHOT;
995 1016
996 if (!new->thread_fn) { 1017 /*
997 set_bit(IRQTF_FORCED_THREAD, &new->thread_flags); 1018 * Handle the case where we have a real primary handler and a
 998 new->thread_fn = new->handler; 1019 * thread handler. We force-thread them as well by creating a
999 new->handler = irq_default_primary_handler; 1020 * secondary action.
1021 */
1022 if (new->handler != irq_default_primary_handler && new->thread_fn) {
1023 /* Allocate the secondary action */
1024 new->secondary = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
1025 if (!new->secondary)
1026 return -ENOMEM;
1027 new->secondary->handler = irq_forced_secondary_handler;
1028 new->secondary->thread_fn = new->thread_fn;
1029 new->secondary->dev_id = new->dev_id;
1030 new->secondary->irq = new->irq;
1031 new->secondary->name = new->name;
1000 } 1032 }
1033 /* Deal with the primary handler */
1034 set_bit(IRQTF_FORCED_THREAD, &new->thread_flags);
1035 new->thread_fn = new->handler;
1036 new->handler = irq_default_primary_handler;
1037 return 0;
1001} 1038}
1002 1039
1003static int irq_request_resources(struct irq_desc *desc) 1040static int irq_request_resources(struct irq_desc *desc)
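
[Editor's sketch, not part of the patch, to make the secondary-action path concrete from the request side: a driver registering both a real primary handler and a thread function. Without forced threading the primary runs in hard-irq context; with threadirqs on the command line, __setup_irq() now threads the primary as well and the thread_fn lands on the secondary action created above. Names and handlers are placeholders.]

    #include <linux/interrupt.h>

    static irqreturn_t example_quick_check(int irq, void *dev_id)
    {
            /* ack the device; decide whether the heavy work is needed */
            return IRQ_WAKE_THREAD;
    }

    static irqreturn_t example_slow_work(int irq, void *dev_id)
    {
            /* long-running processing, runs in the irq thread */
            return IRQ_HANDLED;
    }

    static int example_request(unsigned int irq, void *dev)
    {
            /* Both handlers supplied; under force_irqthreads they become the
             * primary and secondary irqactions behind the scenes. */
            return request_threaded_irq(irq, example_quick_check, example_slow_work,
                                        IRQF_ONESHOT, "example-dev", dev);
    }
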
@@ -1017,6 +1054,48 @@ static void irq_release_resources(struct irq_desc *desc)
1017 c->irq_release_resources(d); 1054 c->irq_release_resources(d);
1018} 1055}
1019 1056
1057static int
1058setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
1059{
1060 struct task_struct *t;
1061 struct sched_param param = {
1062 .sched_priority = MAX_USER_RT_PRIO/2,
1063 };
1064
1065 if (!secondary) {
1066 t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
1067 new->name);
1068 } else {
1069 t = kthread_create(irq_thread, new, "irq/%d-s-%s", irq,
1070 new->name);
1071 param.sched_priority -= 1;
1072 }
1073
1074 if (IS_ERR(t))
1075 return PTR_ERR(t);
1076
1077 sched_setscheduler_nocheck(t, SCHED_FIFO, &param);
1078
1079 /*
1080 * We keep the reference to the task struct even if
1081 * the thread dies to avoid that the interrupt code
1082 * references an already freed task_struct.
1083 */
1084 get_task_struct(t);
1085 new->thread = t;
1086 /*
1087 * Tell the thread to set its affinity. This is
1088 * important for shared interrupt handlers as we do
1089 * not invoke setup_affinity() for the secondary
1090 * handlers as everything is already set up. Even for
1091 * interrupts marked with IRQF_NO_BALANCE this is
1092 * correct as we want the thread to move to the cpu(s)
1093 * on which the requesting code placed the interrupt.
1094 */
1095 set_bit(IRQTF_AFFINITY, &new->thread_flags);
1096 return 0;
1097}
1098
1020/* 1099/*
1021 * Internal function to register an irqaction - typically used to 1100 * Internal function to register an irqaction - typically used to
1022 * allocate special interrupts that are part of the architecture. 1101 * allocate special interrupts that are part of the architecture.
@@ -1037,6 +1116,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1037 if (!try_module_get(desc->owner)) 1116 if (!try_module_get(desc->owner))
1038 return -ENODEV; 1117 return -ENODEV;
1039 1118
1119 new->irq = irq;
1120
1040 /* 1121 /*
1041 * Check whether the interrupt nests into another interrupt 1122 * Check whether the interrupt nests into another interrupt
1042 * thread. 1123 * thread.
@@ -1054,8 +1135,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1054 */ 1135 */
1055 new->handler = irq_nested_primary_handler; 1136 new->handler = irq_nested_primary_handler;
1056 } else { 1137 } else {
1057 if (irq_settings_can_thread(desc)) 1138 if (irq_settings_can_thread(desc)) {
1058 irq_setup_forced_threading(new); 1139 ret = irq_setup_forced_threading(new);
1140 if (ret)
1141 goto out_mput;
1142 }
1059 } 1143 }
1060 1144
1061 /* 1145 /*
@@ -1064,37 +1148,14 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1064 * thread. 1148 * thread.
1065 */ 1149 */
1066 if (new->thread_fn && !nested) { 1150 if (new->thread_fn && !nested) {
1067 struct task_struct *t; 1151 ret = setup_irq_thread(new, irq, false);
1068 static const struct sched_param param = { 1152 if (ret)
1069 .sched_priority = MAX_USER_RT_PRIO/2,
1070 };
1071
1072 t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
1073 new->name);
1074 if (IS_ERR(t)) {
1075 ret = PTR_ERR(t);
1076 goto out_mput; 1153 goto out_mput;
1154 if (new->secondary) {
1155 ret = setup_irq_thread(new->secondary, irq, true);
1156 if (ret)
1157 goto out_thread;
1077 } 1158 }
1078
1079 sched_setscheduler_nocheck(t, SCHED_FIFO, &param);
1080
1081 /*
1082 * We keep the reference to the task struct even if
1083 * the thread dies to avoid that the interrupt code
1084 * references an already freed task_struct.
1085 */
1086 get_task_struct(t);
1087 new->thread = t;
1088 /*
1089 * Tell the thread to set its affinity. This is
1090 * important for shared interrupt handlers as we do
1091 * not invoke setup_affinity() for the secondary
1092 * handlers as everything is already set up. Even for
1093 * interrupts marked with IRQF_NO_BALANCE this is
1094 * correct as we want the thread to move to the cpu(s)
1095 * on which the requesting code placed the interrupt.
1096 */
1097 set_bit(IRQTF_AFFINITY, &new->thread_flags);
1098 } 1159 }
1099 1160
1100 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { 1161 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
@@ -1267,7 +1328,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1267 irq, nmsk, omsk); 1328 irq, nmsk, omsk);
1268 } 1329 }
1269 1330
1270 new->irq = irq;
1271 *old_ptr = new; 1331 *old_ptr = new;
1272 1332
1273 irq_pm_install_action(desc, new); 1333 irq_pm_install_action(desc, new);
@@ -1293,6 +1353,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1293 */ 1353 */
1294 if (new->thread) 1354 if (new->thread)
1295 wake_up_process(new->thread); 1355 wake_up_process(new->thread);
1356 if (new->secondary)
1357 wake_up_process(new->secondary->thread);
1296 1358
1297 register_irq_proc(irq, desc); 1359 register_irq_proc(irq, desc);
1298 new->dir = NULL; 1360 new->dir = NULL;
@@ -1323,6 +1385,13 @@ out_thread:
1323 kthread_stop(t); 1385 kthread_stop(t);
1324 put_task_struct(t); 1386 put_task_struct(t);
1325 } 1387 }
1388 if (new->secondary && new->secondary->thread) {
1389 struct task_struct *t = new->secondary->thread;
1390
1391 new->secondary->thread = NULL;
1392 kthread_stop(t);
1393 put_task_struct(t);
1394 }
1326out_mput: 1395out_mput:
1327 module_put(desc->owner); 1396 module_put(desc->owner);
1328 return ret; 1397 return ret;
@@ -1394,6 +1463,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1394 1463
1395 /* If this was the last handler, shut down the IRQ line: */ 1464 /* If this was the last handler, shut down the IRQ line: */
1396 if (!desc->action) { 1465 if (!desc->action) {
1466 irq_settings_clr_disable_unlazy(desc);
1397 irq_shutdown(desc); 1467 irq_shutdown(desc);
1398 irq_release_resources(desc); 1468 irq_release_resources(desc);
1399 } 1469 }
@@ -1430,9 +1500,14 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1430 if (action->thread) { 1500 if (action->thread) {
1431 kthread_stop(action->thread); 1501 kthread_stop(action->thread);
1432 put_task_struct(action->thread); 1502 put_task_struct(action->thread);
1503 if (action->secondary && action->secondary->thread) {
1504 kthread_stop(action->secondary->thread);
1505 put_task_struct(action->secondary->thread);
1506 }
1433 } 1507 }
1434 1508
1435 module_put(desc->owner); 1509 module_put(desc->owner);
1510 kfree(action->secondary);
1436 return action; 1511 return action;
1437} 1512}
1438 1513
@@ -1576,8 +1651,10 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1576 retval = __setup_irq(irq, desc, action); 1651 retval = __setup_irq(irq, desc, action);
1577 chip_bus_sync_unlock(desc); 1652 chip_bus_sync_unlock(desc);
1578 1653
1579 if (retval) 1654 if (retval) {
1655 kfree(action->secondary);
1580 kfree(action); 1656 kfree(action);
1657 }
1581 1658
1582#ifdef CONFIG_DEBUG_SHIRQ_FIXME 1659#ifdef CONFIG_DEBUG_SHIRQ_FIXME
1583 if (!retval && (irqflags & IRQF_SHARED)) { 1660 if (!retval && (irqflags & IRQF_SHARED)) {
@@ -1761,6 +1838,7 @@ void free_percpu_irq(unsigned int irq, void __percpu *dev_id)
1761 kfree(__free_percpu_irq(irq, dev_id)); 1838 kfree(__free_percpu_irq(irq, dev_id));
1762 chip_bus_sync_unlock(desc); 1839 chip_bus_sync_unlock(desc);
1763} 1840}
1841EXPORT_SYMBOL_GPL(free_percpu_irq);
1764 1842
1765/** 1843/**
1766 * setup_percpu_irq - setup a per-cpu interrupt 1844 * setup_percpu_irq - setup a per-cpu interrupt
@@ -1790,9 +1868,10 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
1790 * @devname: An ascii name for the claiming device 1868 * @devname: An ascii name for the claiming device
1791 * @dev_id: A percpu cookie passed back to the handler function 1869 * @dev_id: A percpu cookie passed back to the handler function
1792 * 1870 *
1793 * This call allocates interrupt resources, but doesn't 1871 * This call allocates interrupt resources and enables the
1794 * automatically enable the interrupt. It has to be done on each 1872 * interrupt on the local CPU. If the interrupt is supposed to be
1795 * CPU using enable_percpu_irq(). 1873 * enabled on other CPUs, it has to be done on each CPU using
1874 * enable_percpu_irq().
1796 * 1875 *
1797 * Dev_id must be globally unique. It is a per-cpu variable, and 1876 * Dev_id must be globally unique. It is a per-cpu variable, and
1798 * the handler gets called with the interrupted CPU's instance of 1877 * the handler gets called with the interrupted CPU's instance of
@@ -1831,6 +1910,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
1831 1910
1832 return retval; 1911 return retval;
1833} 1912}
1913EXPORT_SYMBOL_GPL(request_percpu_irq);
1834 1914
1835/** 1915/**
1836 * irq_get_irqchip_state - returns the irqchip state of a interrupt. 1916 * irq_get_irqchip_state - returns the irqchip state of a interrupt.
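
[Editor's sketch, not part of the patch, matching the updated kernel-doc for request_percpu_irq(): the requesting CPU gets the interrupt enabled implicitly, every other CPU still opts in with enable_percpu_irq(). The per-cpu cookie and names are illustrative.]

    #include <linux/interrupt.h>
    #include <linux/irq.h>
    #include <linux/percpu.h>

    struct example_percpu_state { int count; };
    static DEFINE_PER_CPU(struct example_percpu_state, example_state);

    static irqreturn_t example_percpu_handler(int irq, void *dev_id)
    {
            struct example_percpu_state *st = dev_id; /* this CPU's instance */

            st->count++;
            return IRQ_HANDLED;
    }

    /* Request once: this also enables the interrupt on the local CPU. */
    static int example_setup(unsigned int irq)
    {
            return request_percpu_irq(irq, example_percpu_handler,
                                      "example-percpu", &example_state);
    }

    /* Run on each additional CPU, e.g. from its hotplug/online callback. */
    static void example_enable_this_cpu(unsigned int irq)
    {
            enable_percpu_irq(irq, IRQ_TYPE_NONE);
    }
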
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 7e6512b9dc1f..6b0c0b74a2a1 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -228,22 +228,18 @@ static void msi_domain_update_chip_ops(struct msi_domain_info *info)
228{ 228{
229 struct irq_chip *chip = info->chip; 229 struct irq_chip *chip = info->chip;
230 230
231 BUG_ON(!chip); 231 BUG_ON(!chip || !chip->irq_mask || !chip->irq_unmask);
232 if (!chip->irq_mask)
233 chip->irq_mask = pci_msi_mask_irq;
234 if (!chip->irq_unmask)
235 chip->irq_unmask = pci_msi_unmask_irq;
236 if (!chip->irq_set_affinity) 232 if (!chip->irq_set_affinity)
237 chip->irq_set_affinity = msi_domain_set_affinity; 233 chip->irq_set_affinity = msi_domain_set_affinity;
238} 234}
239 235
240/** 236/**
241 * msi_create_irq_domain - Create a MSI interrupt domain 237 * msi_create_irq_domain - Create a MSI interrupt domain
242 * @of_node: Optional device-tree node of the interrupt controller 238 * @fwnode: Optional fwnode of the interrupt controller
243 * @info: MSI domain info 239 * @info: MSI domain info
244 * @parent: Parent irq domain 240 * @parent: Parent irq domain
245 */ 241 */
246struct irq_domain *msi_create_irq_domain(struct device_node *node, 242struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode,
247 struct msi_domain_info *info, 243 struct msi_domain_info *info,
248 struct irq_domain *parent) 244 struct irq_domain *parent)
249{ 245{
@@ -252,8 +248,8 @@ struct irq_domain *msi_create_irq_domain(struct device_node *node,
252 if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) 248 if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
253 msi_domain_update_chip_ops(info); 249 msi_domain_update_chip_ops(info);
254 250
255 return irq_domain_add_hierarchy(parent, 0, 0, node, &msi_domain_ops, 251 return irq_domain_create_hierarchy(parent, 0, 0, fwnode,
256 info); 252 &msi_domain_ops, info);
257} 253}
258 254
259/** 255/**
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 21c62617a35a..e80c4400118a 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -21,7 +21,7 @@ bool irq_pm_check_wakeup(struct irq_desc *desc)
21 desc->istate |= IRQS_SUSPENDED | IRQS_PENDING; 21 desc->istate |= IRQS_SUSPENDED | IRQS_PENDING;
22 desc->depth++; 22 desc->depth++;
23 irq_disable(desc); 23 irq_disable(desc);
24 pm_system_wakeup(); 24 pm_system_irq_wakeup(irq_desc_get_irq(desc));
25 return true; 25 return true;
26 } 26 }
27 return false; 27 return false;
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index e3a8c9577ba6..a916cf144b65 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -12,6 +12,7 @@
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/kernel_stat.h> 14#include <linux/kernel_stat.h>
15#include <linux/mutex.h>
15 16
16#include "internals.h" 17#include "internals.h"
17 18
@@ -323,18 +324,29 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
323 324
324void register_irq_proc(unsigned int irq, struct irq_desc *desc) 325void register_irq_proc(unsigned int irq, struct irq_desc *desc)
325{ 326{
327 static DEFINE_MUTEX(register_lock);
326 char name [MAX_NAMELEN]; 328 char name [MAX_NAMELEN];
327 329
328 if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir) 330 if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip))
329 return; 331 return;
330 332
333 /*
334 * irq directories are registered only when a handler is
335 * added, not when the descriptor is created, so multiple
336 * tasks might try to register at the same time.
337 */
338 mutex_lock(&register_lock);
339
340 if (desc->dir)
341 goto out_unlock;
342
331 memset(name, 0, MAX_NAMELEN); 343 memset(name, 0, MAX_NAMELEN);
332 sprintf(name, "%d", irq); 344 sprintf(name, "%d", irq);
333 345
334 /* create /proc/irq/1234 */ 346 /* create /proc/irq/1234 */
335 desc->dir = proc_mkdir(name, root_irq_dir); 347 desc->dir = proc_mkdir(name, root_irq_dir);
336 if (!desc->dir) 348 if (!desc->dir)
337 return; 349 goto out_unlock;
338 350
339#ifdef CONFIG_SMP 351#ifdef CONFIG_SMP
340 /* create /proc/irq/<irq>/smp_affinity */ 352 /* create /proc/irq/<irq>/smp_affinity */
@@ -355,6 +367,9 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
355 367
356 proc_create_data("spurious", 0444, desc->dir, 368 proc_create_data("spurious", 0444, desc->dir,
357 &irq_spurious_proc_fops, (void *)(long)irq); 369 &irq_spurious_proc_fops, (void *)(long)irq);
370
371out_unlock:
372 mutex_unlock(&register_lock);
358} 373}
359 374
360void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) 375void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
@@ -460,7 +475,7 @@ int show_interrupts(struct seq_file *p, void *v)
460 for_each_online_cpu(j) 475 for_each_online_cpu(j)
461 any_count |= kstat_irqs_cpu(i, j); 476 any_count |= kstat_irqs_cpu(i, j);
462 action = desc->action; 477 action = desc->action;
463 if (!action && !any_count) 478 if ((!action || action == &chained_action) && !any_count)
464 goto out; 479 goto out;
465 480
466 seq_printf(p, "%*d: ", prec, i); 481 seq_printf(p, "%*d: ", prec, i);
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 3320b84cc60f..320579d89091 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -15,6 +15,7 @@ enum {
15 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, 15 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
16 _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, 16 _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
17 _IRQ_IS_POLLED = IRQ_IS_POLLED, 17 _IRQ_IS_POLLED = IRQ_IS_POLLED,
18 _IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY,
18 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, 19 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
19}; 20};
20 21
@@ -28,6 +29,7 @@ enum {
28#define IRQ_NESTED_THREAD GOT_YOU_MORON 29#define IRQ_NESTED_THREAD GOT_YOU_MORON
29#define IRQ_PER_CPU_DEVID GOT_YOU_MORON 30#define IRQ_PER_CPU_DEVID GOT_YOU_MORON
30#define IRQ_IS_POLLED GOT_YOU_MORON 31#define IRQ_IS_POLLED GOT_YOU_MORON
32#define IRQ_DISABLE_UNLAZY GOT_YOU_MORON
31#undef IRQF_MODIFY_MASK 33#undef IRQF_MODIFY_MASK
32#define IRQF_MODIFY_MASK GOT_YOU_MORON 34#define IRQF_MODIFY_MASK GOT_YOU_MORON
33 35
@@ -154,3 +156,13 @@ static inline bool irq_settings_is_polled(struct irq_desc *desc)
154{ 156{
155 return desc->status_use_accessors & _IRQ_IS_POLLED; 157 return desc->status_use_accessors & _IRQ_IS_POLLED;
156} 158}
159
160static inline bool irq_settings_disable_unlazy(struct irq_desc *desc)
161{
162 return desc->status_use_accessors & _IRQ_DISABLE_UNLAZY;
163}
164
165static inline void irq_settings_clr_disable_unlazy(struct irq_desc *desc)
166{
167 desc->status_use_accessors &= ~_IRQ_DISABLE_UNLAZY;
168}
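
[Editor's sketch, not part of the patch, for context on the new _IRQ_DISABLE_UNLAZY accessors: a driver is expected to set the flag with irq_set_status_flags() when it wants the line masked at the hardware immediately instead of via the lazy-disable optimisation; __free_irq() above clears it again when the last handler goes away.]

    #include <linux/irq.h>

    static void example_mark_unlazy(unsigned int irq)
    {
            /* Disable this line in hardware right away rather than waiting
             * for the next (possibly spurious) interrupt to mask it. */
            irq_set_status_flags(irq, IRQ_DISABLE_UNLAZY);
    }
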
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 4c5edc357923..d873b64fbddc 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -6,6 +6,8 @@
6 * Version 2. See the file COPYING for more details. 6 * Version 2. See the file COPYING for more details.
7 */ 7 */
8 8
9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10
9#include <linux/capability.h> 11#include <linux/capability.h>
10#include <linux/mm.h> 12#include <linux/mm.h>
11#include <linux/file.h> 13#include <linux/file.h>
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 201b45327804..11b64a63c0f8 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -6,7 +6,7 @@
6 * Version 2. See the file COPYING for more details. 6 * Version 2. See the file COPYING for more details.
7 */ 7 */
8 8
9#define pr_fmt(fmt) "kexec: " fmt 9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10 10
11#include <linux/capability.h> 11#include <linux/capability.h>
12#include <linux/mm.h> 12#include <linux/mm.h>
@@ -1027,7 +1027,7 @@ static int __init crash_notes_memory_init(void)
1027 1027
1028 crash_notes = __alloc_percpu(size, align); 1028 crash_notes = __alloc_percpu(size, align);
1029 if (!crash_notes) { 1029 if (!crash_notes) {
1030 pr_warn("Kexec: Memory allocation for saving cpu register states failed\n"); 1030 pr_warn("Memory allocation for saving cpu register states failed\n");
1031 return -ENOMEM; 1031 return -ENOMEM;
1032 } 1032 }
1033 return 0; 1033 return 0;
@@ -1149,7 +1149,7 @@ static int __init parse_crashkernel_simple(char *cmdline,
1149 if (*cur == '@') 1149 if (*cur == '@')
1150 *crash_base = memparse(cur+1, &cur); 1150 *crash_base = memparse(cur+1, &cur);
1151 else if (*cur != ' ' && *cur != '\0') { 1151 else if (*cur != ' ' && *cur != '\0') {
1152 pr_warn("crashkernel: unrecognized char\n"); 1152 pr_warn("crashkernel: unrecognized char: %c\n", *cur);
1153 return -EINVAL; 1153 return -EINVAL;
1154 } 1154 }
1155 1155
@@ -1186,12 +1186,12 @@ static int __init parse_crashkernel_suffix(char *cmdline,
1186 1186
1187 /* check with suffix */ 1187 /* check with suffix */
1188 if (strncmp(cur, suffix, strlen(suffix))) { 1188 if (strncmp(cur, suffix, strlen(suffix))) {
1189 pr_warn("crashkernel: unrecognized char\n"); 1189 pr_warn("crashkernel: unrecognized char: %c\n", *cur);
1190 return -EINVAL; 1190 return -EINVAL;
1191 } 1191 }
1192 cur += strlen(suffix); 1192 cur += strlen(suffix);
1193 if (*cur != ' ' && *cur != '\0') { 1193 if (*cur != ' ' && *cur != '\0') {
1194 pr_warn("crashkernel: unrecognized char\n"); 1194 pr_warn("crashkernel: unrecognized char: %c\n", *cur);
1195 return -EINVAL; 1195 return -EINVAL;
1196 } 1196 }
1197 1197
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 6a9a3f2a0e8e..b70ada0028d2 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -9,6 +9,8 @@
9 * Version 2. See the file COPYING for more details. 9 * Version 2. See the file COPYING for more details.
10 */ 10 */
11 11
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13
12#include <linux/capability.h> 14#include <linux/capability.h>
13#include <linux/mm.h> 15#include <linux/mm.h>
14#include <linux/file.h> 16#include <linux/file.h>
diff --git a/kernel/kmod.c b/kernel/kmod.c
index da98d0593de2..0277d1216f80 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -327,9 +327,13 @@ static void call_usermodehelper_exec_work(struct work_struct *work)
327 call_usermodehelper_exec_sync(sub_info); 327 call_usermodehelper_exec_sync(sub_info);
328 } else { 328 } else {
329 pid_t pid; 329 pid_t pid;
330 330 /*
331 * Use CLONE_PARENT to reparent it to kthreadd; we do not
332 * want to pollute current->children, and we need a parent
333 * that always ignores SIGCHLD to ensure auto-reaping.
334 */
331 pid = kernel_thread(call_usermodehelper_exec_async, sub_info, 335 pid = kernel_thread(call_usermodehelper_exec_async, sub_info,
332 SIGCHLD); 336 CLONE_PARENT | SIGCHLD);
333 if (pid < 0) { 337 if (pid < 0) {
334 sub_info->retval = pid; 338 sub_info->retval = pid;
335 umh_complete(sub_info); 339 umh_complete(sub_info);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 8acfbf773e06..deae3907ac1e 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2738,7 +2738,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
2738 return; 2738 return;
2739 2739
2740 /* no reclaim without waiting on it */ 2740 /* no reclaim without waiting on it */
2741 if (!(gfp_mask & __GFP_WAIT)) 2741 if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
2742 return; 2742 return;
2743 2743
2744 /* this guy won't enter reclaim */ 2744 /* this guy won't enter reclaim */
@@ -3068,7 +3068,7 @@ static int __lock_is_held(struct lockdep_map *lock);
3068static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, 3068static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3069 int trylock, int read, int check, int hardirqs_off, 3069 int trylock, int read, int check, int hardirqs_off,
3070 struct lockdep_map *nest_lock, unsigned long ip, 3070 struct lockdep_map *nest_lock, unsigned long ip,
3071 int references) 3071 int references, int pin_count)
3072{ 3072{
3073 struct task_struct *curr = current; 3073 struct task_struct *curr = current;
3074 struct lock_class *class = NULL; 3074 struct lock_class *class = NULL;
@@ -3157,7 +3157,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3157 hlock->waittime_stamp = 0; 3157 hlock->waittime_stamp = 0;
3158 hlock->holdtime_stamp = lockstat_clock(); 3158 hlock->holdtime_stamp = lockstat_clock();
3159#endif 3159#endif
3160 hlock->pin_count = 0; 3160 hlock->pin_count = pin_count;
3161 3161
3162 if (check && !mark_irqflags(curr, hlock)) 3162 if (check && !mark_irqflags(curr, hlock))
3163 return 0; 3163 return 0;
@@ -3343,7 +3343,7 @@ found_it:
3343 hlock_class(hlock)->subclass, hlock->trylock, 3343 hlock_class(hlock)->subclass, hlock->trylock,
3344 hlock->read, hlock->check, hlock->hardirqs_off, 3344 hlock->read, hlock->check, hlock->hardirqs_off,
3345 hlock->nest_lock, hlock->acquire_ip, 3345 hlock->nest_lock, hlock->acquire_ip,
3346 hlock->references)) 3346 hlock->references, hlock->pin_count))
3347 return 0; 3347 return 0;
3348 } 3348 }
3349 3349
@@ -3433,7 +3433,7 @@ found_it:
3433 hlock_class(hlock)->subclass, hlock->trylock, 3433 hlock_class(hlock)->subclass, hlock->trylock,
3434 hlock->read, hlock->check, hlock->hardirqs_off, 3434 hlock->read, hlock->check, hlock->hardirqs_off,
3435 hlock->nest_lock, hlock->acquire_ip, 3435 hlock->nest_lock, hlock->acquire_ip,
3436 hlock->references)) 3436 hlock->references, hlock->pin_count))
3437 return 0; 3437 return 0;
3438 } 3438 }
3439 3439
@@ -3583,7 +3583,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3583 current->lockdep_recursion = 1; 3583 current->lockdep_recursion = 1;
3584 trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); 3584 trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
3585 __lock_acquire(lock, subclass, trylock, read, check, 3585 __lock_acquire(lock, subclass, trylock, read, check,
3586 irqs_disabled_flags(flags), nest_lock, ip, 0); 3586 irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
3587 current->lockdep_recursion = 0; 3587 current->lockdep_recursion = 0;
3588 raw_local_irq_restore(flags); 3588 raw_local_irq_restore(flags);
3589} 3589}
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 32244186f1f2..8ef1919d63b2 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -17,12 +17,14 @@
17 * 17 *
18 * Copyright (C) IBM Corporation, 2014 18 * Copyright (C) IBM Corporation, 2014
19 * 19 *
20 * Author: Paul E. McKenney <paulmck@us.ibm.com> 20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 * Davidlohr Bueso <dave@stgolabs.net>
21 * Based on kernel/rcu/torture.c. 22 * Based on kernel/rcu/torture.c.
22 */ 23 */
23#include <linux/kernel.h> 24#include <linux/kernel.h>
24#include <linux/module.h> 25#include <linux/module.h>
25#include <linux/kthread.h> 26#include <linux/kthread.h>
27#include <linux/sched/rt.h>
26#include <linux/spinlock.h> 28#include <linux/spinlock.h>
27#include <linux/rwlock.h> 29#include <linux/rwlock.h>
28#include <linux/mutex.h> 30#include <linux/mutex.h>
@@ -34,6 +36,7 @@
34#include <linux/moduleparam.h> 36#include <linux/moduleparam.h>
35#include <linux/delay.h> 37#include <linux/delay.h>
36#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/percpu-rwsem.h>
37#include <linux/torture.h> 40#include <linux/torture.h>
38 41
39MODULE_LICENSE("GPL"); 42MODULE_LICENSE("GPL");
@@ -91,11 +94,13 @@ struct lock_torture_ops {
91 void (*init)(void); 94 void (*init)(void);
92 int (*writelock)(void); 95 int (*writelock)(void);
93 void (*write_delay)(struct torture_random_state *trsp); 96 void (*write_delay)(struct torture_random_state *trsp);
97 void (*task_boost)(struct torture_random_state *trsp);
94 void (*writeunlock)(void); 98 void (*writeunlock)(void);
95 int (*readlock)(void); 99 int (*readlock)(void);
96 void (*read_delay)(struct torture_random_state *trsp); 100 void (*read_delay)(struct torture_random_state *trsp);
97 void (*readunlock)(void); 101 void (*readunlock)(void);
98 unsigned long flags; 102
103 unsigned long flags; /* for irq spinlocks */
99 const char *name; 104 const char *name;
100}; 105};
101 106
@@ -139,9 +144,15 @@ static void torture_lock_busted_write_unlock(void)
139 /* BUGGY, do not use in real life!!! */ 144 /* BUGGY, do not use in real life!!! */
140} 145}
141 146
147static void torture_boost_dummy(struct torture_random_state *trsp)
148{
149 /* Only rtmutexes care about priority */
150}
151
142static struct lock_torture_ops lock_busted_ops = { 152static struct lock_torture_ops lock_busted_ops = {
143 .writelock = torture_lock_busted_write_lock, 153 .writelock = torture_lock_busted_write_lock,
144 .write_delay = torture_lock_busted_write_delay, 154 .write_delay = torture_lock_busted_write_delay,
155 .task_boost = torture_boost_dummy,
145 .writeunlock = torture_lock_busted_write_unlock, 156 .writeunlock = torture_lock_busted_write_unlock,
146 .readlock = NULL, 157 .readlock = NULL,
147 .read_delay = NULL, 158 .read_delay = NULL,
@@ -185,6 +196,7 @@ static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock)
185static struct lock_torture_ops spin_lock_ops = { 196static struct lock_torture_ops spin_lock_ops = {
186 .writelock = torture_spin_lock_write_lock, 197 .writelock = torture_spin_lock_write_lock,
187 .write_delay = torture_spin_lock_write_delay, 198 .write_delay = torture_spin_lock_write_delay,
199 .task_boost = torture_boost_dummy,
188 .writeunlock = torture_spin_lock_write_unlock, 200 .writeunlock = torture_spin_lock_write_unlock,
189 .readlock = NULL, 201 .readlock = NULL,
190 .read_delay = NULL, 202 .read_delay = NULL,
@@ -211,6 +223,7 @@ __releases(torture_spinlock)
211static struct lock_torture_ops spin_lock_irq_ops = { 223static struct lock_torture_ops spin_lock_irq_ops = {
212 .writelock = torture_spin_lock_write_lock_irq, 224 .writelock = torture_spin_lock_write_lock_irq,
213 .write_delay = torture_spin_lock_write_delay, 225 .write_delay = torture_spin_lock_write_delay,
226 .task_boost = torture_boost_dummy,
214 .writeunlock = torture_lock_spin_write_unlock_irq, 227 .writeunlock = torture_lock_spin_write_unlock_irq,
215 .readlock = NULL, 228 .readlock = NULL,
216 .read_delay = NULL, 229 .read_delay = NULL,
@@ -275,6 +288,7 @@ static void torture_rwlock_read_unlock(void) __releases(torture_rwlock)
275static struct lock_torture_ops rw_lock_ops = { 288static struct lock_torture_ops rw_lock_ops = {
276 .writelock = torture_rwlock_write_lock, 289 .writelock = torture_rwlock_write_lock,
277 .write_delay = torture_rwlock_write_delay, 290 .write_delay = torture_rwlock_write_delay,
291 .task_boost = torture_boost_dummy,
278 .writeunlock = torture_rwlock_write_unlock, 292 .writeunlock = torture_rwlock_write_unlock,
279 .readlock = torture_rwlock_read_lock, 293 .readlock = torture_rwlock_read_lock,
280 .read_delay = torture_rwlock_read_delay, 294 .read_delay = torture_rwlock_read_delay,
@@ -315,6 +329,7 @@ __releases(torture_rwlock)
315static struct lock_torture_ops rw_lock_irq_ops = { 329static struct lock_torture_ops rw_lock_irq_ops = {
316 .writelock = torture_rwlock_write_lock_irq, 330 .writelock = torture_rwlock_write_lock_irq,
317 .write_delay = torture_rwlock_write_delay, 331 .write_delay = torture_rwlock_write_delay,
332 .task_boost = torture_boost_dummy,
318 .writeunlock = torture_rwlock_write_unlock_irq, 333 .writeunlock = torture_rwlock_write_unlock_irq,
319 .readlock = torture_rwlock_read_lock_irq, 334 .readlock = torture_rwlock_read_lock_irq,
320 .read_delay = torture_rwlock_read_delay, 335 .read_delay = torture_rwlock_read_delay,
@@ -354,6 +369,7 @@ static void torture_mutex_unlock(void) __releases(torture_mutex)
354static struct lock_torture_ops mutex_lock_ops = { 369static struct lock_torture_ops mutex_lock_ops = {
355 .writelock = torture_mutex_lock, 370 .writelock = torture_mutex_lock,
356 .write_delay = torture_mutex_delay, 371 .write_delay = torture_mutex_delay,
372 .task_boost = torture_boost_dummy,
357 .writeunlock = torture_mutex_unlock, 373 .writeunlock = torture_mutex_unlock,
358 .readlock = NULL, 374 .readlock = NULL,
359 .read_delay = NULL, 375 .read_delay = NULL,
@@ -361,6 +377,90 @@ static struct lock_torture_ops mutex_lock_ops = {
361 .name = "mutex_lock" 377 .name = "mutex_lock"
362}; 378};
363 379
380#ifdef CONFIG_RT_MUTEXES
381static DEFINE_RT_MUTEX(torture_rtmutex);
382
383static int torture_rtmutex_lock(void) __acquires(torture_rtmutex)
384{
385 rt_mutex_lock(&torture_rtmutex);
386 return 0;
387}
388
389static void torture_rtmutex_boost(struct torture_random_state *trsp)
390{
391 int policy;
392 struct sched_param param;
393 const unsigned int factor = 50000; /* yes, quite arbitrary */
394
395 if (!rt_task(current)) {
396 /*
397 * (1) Boost priority once every ~50k operations. When the
 398 * task tries to take the lock, the rtmutex will account
399 * for the new priority, and do any corresponding pi-dance.
400 */
401 if (!(torture_random(trsp) %
402 (cxt.nrealwriters_stress * factor))) {
403 policy = SCHED_FIFO;
404 param.sched_priority = MAX_RT_PRIO - 1;
405 } else /* common case, do nothing */
406 return;
407 } else {
408 /*
409 * The task will remain boosted for another ~500k operations,
410 * then restored back to its original prio, and so forth.
411 *
412 * When @trsp is nil, we want to force-reset the task for
413 * stopping the kthread.
414 */
415 if (!trsp || !(torture_random(trsp) %
416 (cxt.nrealwriters_stress * factor * 2))) {
417 policy = SCHED_NORMAL;
418 param.sched_priority = 0;
419 } else /* common case, do nothing */
420 return;
421 }
422
423 sched_setscheduler_nocheck(current, policy, &param);
424}
425
426static void torture_rtmutex_delay(struct torture_random_state *trsp)
427{
428 const unsigned long shortdelay_us = 2;
429 const unsigned long longdelay_ms = 100;
430
431 /*
432 * We want a short delay mostly to emulate likely code, and
433 * we want a long delay occasionally to force massive contention.
434 */
435 if (!(torture_random(trsp) %
436 (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
437 mdelay(longdelay_ms);
438 if (!(torture_random(trsp) %
439 (cxt.nrealwriters_stress * 2 * shortdelay_us)))
440 udelay(shortdelay_us);
441#ifdef CONFIG_PREEMPT
442 if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
443 preempt_schedule(); /* Allow test to be preempted. */
444#endif
445}
446
447static void torture_rtmutex_unlock(void) __releases(torture_rtmutex)
448{
449 rt_mutex_unlock(&torture_rtmutex);
450}
451
452static struct lock_torture_ops rtmutex_lock_ops = {
453 .writelock = torture_rtmutex_lock,
454 .write_delay = torture_rtmutex_delay,
455 .task_boost = torture_rtmutex_boost,
456 .writeunlock = torture_rtmutex_unlock,
457 .readlock = NULL,
458 .read_delay = NULL,
459 .readunlock = NULL,
460 .name = "rtmutex_lock"
461};
462#endif
463
364static DECLARE_RWSEM(torture_rwsem); 464static DECLARE_RWSEM(torture_rwsem);
365static int torture_rwsem_down_write(void) __acquires(torture_rwsem) 465static int torture_rwsem_down_write(void) __acquires(torture_rwsem)
366{ 466{
@@ -419,6 +519,7 @@ static void torture_rwsem_up_read(void) __releases(torture_rwsem)
419static struct lock_torture_ops rwsem_lock_ops = { 519static struct lock_torture_ops rwsem_lock_ops = {
420 .writelock = torture_rwsem_down_write, 520 .writelock = torture_rwsem_down_write,
421 .write_delay = torture_rwsem_write_delay, 521 .write_delay = torture_rwsem_write_delay,
522 .task_boost = torture_boost_dummy,
422 .writeunlock = torture_rwsem_up_write, 523 .writeunlock = torture_rwsem_up_write,
423 .readlock = torture_rwsem_down_read, 524 .readlock = torture_rwsem_down_read,
424 .read_delay = torture_rwsem_read_delay, 525 .read_delay = torture_rwsem_read_delay,
@@ -426,6 +527,48 @@ static struct lock_torture_ops rwsem_lock_ops = {
426 .name = "rwsem_lock" 527 .name = "rwsem_lock"
427}; 528};
428 529
530#include <linux/percpu-rwsem.h>
531static struct percpu_rw_semaphore pcpu_rwsem;
532
533void torture_percpu_rwsem_init(void)
534{
535 BUG_ON(percpu_init_rwsem(&pcpu_rwsem));
536}
537
538static int torture_percpu_rwsem_down_write(void) __acquires(pcpu_rwsem)
539{
540 percpu_down_write(&pcpu_rwsem);
541 return 0;
542}
543
544static void torture_percpu_rwsem_up_write(void) __releases(pcpu_rwsem)
545{
546 percpu_up_write(&pcpu_rwsem);
547}
548
549static int torture_percpu_rwsem_down_read(void) __acquires(pcpu_rwsem)
550{
551 percpu_down_read(&pcpu_rwsem);
552 return 0;
553}
554
555static void torture_percpu_rwsem_up_read(void) __releases(pcpu_rwsem)
556{
557 percpu_up_read(&pcpu_rwsem);
558}
559
560static struct lock_torture_ops percpu_rwsem_lock_ops = {
561 .init = torture_percpu_rwsem_init,
562 .writelock = torture_percpu_rwsem_down_write,
563 .write_delay = torture_rwsem_write_delay,
564 .task_boost = torture_boost_dummy,
565 .writeunlock = torture_percpu_rwsem_up_write,
566 .readlock = torture_percpu_rwsem_down_read,
567 .read_delay = torture_rwsem_read_delay,
568 .readunlock = torture_percpu_rwsem_up_read,
569 .name = "percpu_rwsem_lock"
570};
571
429/* 572/*
430 * Lock torture writer kthread. Repeatedly acquires and releases 573 * Lock torture writer kthread. Repeatedly acquires and releases
431 * the lock, checking for duplicate acquisitions. 574 * the lock, checking for duplicate acquisitions.
@@ -442,6 +585,7 @@ static int lock_torture_writer(void *arg)
442 if ((torture_random(&rand) & 0xfffff) == 0) 585 if ((torture_random(&rand) & 0xfffff) == 0)
443 schedule_timeout_uninterruptible(1); 586 schedule_timeout_uninterruptible(1);
444 587
588 cxt.cur_ops->task_boost(&rand);
445 cxt.cur_ops->writelock(); 589 cxt.cur_ops->writelock();
446 if (WARN_ON_ONCE(lock_is_write_held)) 590 if (WARN_ON_ONCE(lock_is_write_held))
447 lwsp->n_lock_fail++; 591 lwsp->n_lock_fail++;
@@ -456,6 +600,8 @@ static int lock_torture_writer(void *arg)
456 600
457 stutter_wait("lock_torture_writer"); 601 stutter_wait("lock_torture_writer");
458 } while (!torture_must_stop()); 602 } while (!torture_must_stop());
603
604 cxt.cur_ops->task_boost(NULL); /* reset prio */
459 torture_kthread_stopping("lock_torture_writer"); 605 torture_kthread_stopping("lock_torture_writer");
460 return 0; 606 return 0;
461} 607}
@@ -642,7 +788,11 @@ static int __init lock_torture_init(void)
642 &spin_lock_ops, &spin_lock_irq_ops, 788 &spin_lock_ops, &spin_lock_irq_ops,
643 &rw_lock_ops, &rw_lock_irq_ops, 789 &rw_lock_ops, &rw_lock_irq_ops,
644 &mutex_lock_ops, 790 &mutex_lock_ops,
791#ifdef CONFIG_RT_MUTEXES
792 &rtmutex_lock_ops,
793#endif
645 &rwsem_lock_ops, 794 &rwsem_lock_ops,
795 &percpu_rwsem_lock_ops,
646 }; 796 };
647 797
648 if (!torture_init_begin(torture_type, verbose, &torture_runnable)) 798 if (!torture_init_begin(torture_type, verbose, &torture_runnable))
@@ -661,11 +811,11 @@ static int __init lock_torture_init(void)
661 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) 811 for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
662 pr_alert(" %s", torture_ops[i]->name); 812 pr_alert(" %s", torture_ops[i]->name);
663 pr_alert("\n"); 813 pr_alert("\n");
664 torture_init_end(); 814 firsterr = -EINVAL;
665 return -EINVAL; 815 goto unwind;
666 } 816 }
667 if (cxt.cur_ops->init) 817 if (cxt.cur_ops->init)
668 cxt.cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 818 cxt.cur_ops->init();
669 819
670 if (nwriters_stress >= 0) 820 if (nwriters_stress >= 0)
671 cxt.nrealwriters_stress = nwriters_stress; 821 cxt.nrealwriters_stress = nwriters_stress;
@@ -676,6 +826,10 @@ static int __init lock_torture_init(void)
676 if (strncmp(torture_type, "mutex", 5) == 0) 826 if (strncmp(torture_type, "mutex", 5) == 0)
677 cxt.debug_lock = true; 827 cxt.debug_lock = true;
678#endif 828#endif
829#ifdef CONFIG_DEBUG_RT_MUTEXES
830 if (strncmp(torture_type, "rtmutex", 7) == 0)
831 cxt.debug_lock = true;
832#endif
679#ifdef CONFIG_DEBUG_SPINLOCK 833#ifdef CONFIG_DEBUG_SPINLOCK
680 if ((strncmp(torture_type, "spin", 4) == 0) || 834 if ((strncmp(torture_type, "spin", 4) == 0) ||
681 (strncmp(torture_type, "rw_lock", 7) == 0)) 835 (strncmp(torture_type, "rw_lock", 7) == 0))
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index fd91aaa4554c..5b9102a47ea5 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -67,7 +67,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
67 node->locked = 0; 67 node->locked = 0;
68 node->next = NULL; 68 node->next = NULL;
69 69
70 prev = xchg(lock, node); 70 prev = xchg_acquire(lock, node);
71 if (likely(prev == NULL)) { 71 if (likely(prev == NULL)) {
72 /* 72 /*
73 * Lock acquired, don't need to set node->locked to 1. Threads 73 * Lock acquired, don't need to set node->locked to 1. Threads
@@ -98,7 +98,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
98 /* 98 /*
99 * Release the lock by setting it to NULL 99 * Release the lock by setting it to NULL
100 */ 100 */
101 if (likely(cmpxchg(lock, node, NULL) == node)) 101 if (likely(cmpxchg_release(lock, node, NULL) == node))
102 return; 102 return;
103 /* Wait until the next pointer is set */ 103 /* Wait until the next pointer is set */
104 while (!(next = READ_ONCE(node->next))) 104 while (!(next = READ_ONCE(node->next)))
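
[Editor's sketch, not part of the patch: the mcs_spinlock change is purely about memory ordering, since lock acquisition only needs ACQUIRE semantics and unlock only needs RELEASE, so the full-barrier xchg()/cmpxchg() can be relaxed. A self-contained illustration of the same pairing on a trivial test-and-set lock (not the MCS algorithm itself), using the acquire/release atomic variants the series relies on.]

    #include <linux/atomic.h>
    #include <linux/processor.h>

    struct tas_lock {
            atomic_t locked;        /* 0 = free, 1 = held */
    };

    static inline void tas_lock(struct tas_lock *l)
    {
            /* ACQUIRE: accesses in the critical section cannot move before this. */
            while (atomic_xchg_acquire(&l->locked, 1))
                    cpu_relax();
    }

    static inline void tas_unlock(struct tas_lock *l)
    {
            /* RELEASE: work done under the lock is visible to the next acquirer;
             * mirrors the cmpxchg_release() in mcs_spin_unlock(). */
            atomic_cmpxchg_release(&l->locked, 1, 0);
    }
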
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 4cccea6b8934..0551c219c40e 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -277,7 +277,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
277static inline bool mutex_try_to_acquire(struct mutex *lock) 277static inline bool mutex_try_to_acquire(struct mutex *lock)
278{ 278{
279 return !mutex_is_locked(lock) && 279 return !mutex_is_locked(lock) &&
280 (atomic_cmpxchg(&lock->count, 1, 0) == 1); 280 (atomic_cmpxchg_acquire(&lock->count, 1, 0) == 1);
281} 281}
282 282
283/* 283/*
@@ -529,7 +529,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
529 * Once more, try to acquire the lock. Only try-lock the mutex if 529 * Once more, try to acquire the lock. Only try-lock the mutex if
530 * it is unlocked to reduce unnecessary xchg() operations. 530 * it is unlocked to reduce unnecessary xchg() operations.
531 */ 531 */
532 if (!mutex_is_locked(lock) && (atomic_xchg(&lock->count, 0) == 1)) 532 if (!mutex_is_locked(lock) &&
533 (atomic_xchg_acquire(&lock->count, 0) == 1))
533 goto skip_wait; 534 goto skip_wait;
534 535
535 debug_mutex_lock_common(lock, &waiter); 536 debug_mutex_lock_common(lock, &waiter);
@@ -553,7 +554,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
553 * non-negative in order to avoid unnecessary xchg operations: 554 * non-negative in order to avoid unnecessary xchg operations:
554 */ 555 */
555 if (atomic_read(&lock->count) >= 0 && 556 if (atomic_read(&lock->count) >= 0 &&
556 (atomic_xchg(&lock->count, -1) == 1)) 557 (atomic_xchg_acquire(&lock->count, -1) == 1))
557 break; 558 break;
558 559
559 /* 560 /*
@@ -867,7 +868,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
867 868
868 spin_lock_mutex(&lock->wait_lock, flags); 869 spin_lock_mutex(&lock->wait_lock, flags);
869 870
870 prev = atomic_xchg(&lock->count, -1); 871 prev = atomic_xchg_acquire(&lock->count, -1);
871 if (likely(prev == 1)) { 872 if (likely(prev == 1)) {
872 mutex_set_owner(lock); 873 mutex_set_owner(lock);
873 mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); 874 mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index dc85ee23a26f..d092a0c9c2d4 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -50,7 +50,7 @@ osq_wait_next(struct optimistic_spin_queue *lock,
50 50
51 for (;;) { 51 for (;;) {
52 if (atomic_read(&lock->tail) == curr && 52 if (atomic_read(&lock->tail) == curr &&
53 atomic_cmpxchg(&lock->tail, curr, old) == curr) { 53 atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) {
54 /* 54 /*
55 * We were the last queued, we moved @lock back. @prev 55 * We were the last queued, we moved @lock back. @prev
56 * will now observe @lock and will complete its 56 * will now observe @lock and will complete its
@@ -92,7 +92,11 @@ bool osq_lock(struct optimistic_spin_queue *lock)
92 node->next = NULL; 92 node->next = NULL;
93 node->cpu = curr; 93 node->cpu = curr;
94 94
95 old = atomic_xchg(&lock->tail, curr); 95 /*
96 * ACQUIRE semantics, pairs with corresponding RELEASE
97 * in unlock() uncontended, or fastpath.
98 */
99 old = atomic_xchg_acquire(&lock->tail, curr);
96 if (old == OSQ_UNLOCKED_VAL) 100 if (old == OSQ_UNLOCKED_VAL)
97 return true; 101 return true;
98 102
@@ -184,7 +188,8 @@ void osq_unlock(struct optimistic_spin_queue *lock)
184 /* 188 /*
185 * Fast path for the uncontended case. 189 * Fast path for the uncontended case.
186 */ 190 */
187 if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr)) 191 if (likely(atomic_cmpxchg_release(&lock->tail, curr,
192 OSQ_UNLOCKED_VAL) == curr))
188 return; 193 return;
189 194
190 /* 195 /*
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index f32567254867..f231e0bb311c 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -17,50 +17,43 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *brw,
17 17
18 /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ 18 /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
19 __init_rwsem(&brw->rw_sem, name, rwsem_key); 19 __init_rwsem(&brw->rw_sem, name, rwsem_key);
20 atomic_set(&brw->write_ctr, 0); 20 rcu_sync_init(&brw->rss, RCU_SCHED_SYNC);
21 atomic_set(&brw->slow_read_ctr, 0); 21 atomic_set(&brw->slow_read_ctr, 0);
22 init_waitqueue_head(&brw->write_waitq); 22 init_waitqueue_head(&brw->write_waitq);
23 return 0; 23 return 0;
24} 24}
25EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
25 26
26void percpu_free_rwsem(struct percpu_rw_semaphore *brw) 27void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
27{ 28{
29 /*
30 * XXX: temporary kludge. The error path in alloc_super()
31 * assumes that percpu_free_rwsem() is safe after kzalloc().
32 */
33 if (!brw->fast_read_ctr)
34 return;
35
36 rcu_sync_dtor(&brw->rss);
28 free_percpu(brw->fast_read_ctr); 37 free_percpu(brw->fast_read_ctr);
29 brw->fast_read_ctr = NULL; /* catch use after free bugs */ 38 brw->fast_read_ctr = NULL; /* catch use after free bugs */
30} 39}
31 40
32/* 41/*
33 * This is the fast-path for down_read/up_read, it only needs to ensure 42 * This is the fast-path for down_read/up_read. If it succeeds we rely
34 * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the 43 * on the barriers provided by rcu_sync_enter/exit; see the comments in
35 * fast per-cpu counter. The writer uses synchronize_sched_expedited() to 44 * percpu_down_write() and percpu_up_write().
36 * serialize with the preempt-disabled section below.
37 *
38 * The nontrivial part is that we should guarantee acquire/release semantics
39 * in case when
40 *
41 * R_W: down_write() comes after up_read(), the writer should see all
42 * changes done by the reader
43 * or
44 * W_R: down_read() comes after up_write(), the reader should see all
45 * changes done by the writer
46 * 45 *
47 * If this helper fails the callers rely on the normal rw_semaphore and 46 * If this helper fails the callers rely on the normal rw_semaphore and
48 * atomic_dec_and_test(), so in this case we have the necessary barriers. 47 * atomic_dec_and_test(), so in this case we have the necessary barriers.
49 *
50 * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or
51 * __this_cpu_add() below can be reordered with any LOAD/STORE done by the
52 * reader inside the critical section. See the comments in down_write and
53 * up_write below.
54 */ 48 */
55static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) 49static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
56{ 50{
57 bool success = false; 51 bool success;
58 52
59 preempt_disable(); 53 preempt_disable();
60 if (likely(!atomic_read(&brw->write_ctr))) { 54 success = rcu_sync_is_idle(&brw->rss);
55 if (likely(success))
61 __this_cpu_add(*brw->fast_read_ctr, val); 56 __this_cpu_add(*brw->fast_read_ctr, val);
62 success = true;
63 }
64 preempt_enable(); 57 preempt_enable();
65 58
66 return success; 59 return success;
@@ -77,16 +70,17 @@ static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
77void percpu_down_read(struct percpu_rw_semaphore *brw) 70void percpu_down_read(struct percpu_rw_semaphore *brw)
78{ 71{
79 might_sleep(); 72 might_sleep();
80 if (likely(update_fast_ctr(brw, +1))) { 73 rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
81 rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); 74
75 if (likely(update_fast_ctr(brw, +1)))
82 return; 76 return;
83 }
84 77
85 down_read(&brw->rw_sem); 78 /* Avoid rwsem_acquire_read() and rwsem_release() */
79 __down_read(&brw->rw_sem);
86 atomic_inc(&brw->slow_read_ctr); 80 atomic_inc(&brw->slow_read_ctr);
87 /* avoid up_read()->rwsem_release() */
88 __up_read(&brw->rw_sem); 81 __up_read(&brw->rw_sem);
89} 82}
83EXPORT_SYMBOL_GPL(percpu_down_read);
90 84
91int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) 85int percpu_down_read_trylock(struct percpu_rw_semaphore *brw)
92{ 86{
@@ -112,6 +106,7 @@ void percpu_up_read(struct percpu_rw_semaphore *brw)
112 if (atomic_dec_and_test(&brw->slow_read_ctr)) 106 if (atomic_dec_and_test(&brw->slow_read_ctr))
113 wake_up_all(&brw->write_waitq); 107 wake_up_all(&brw->write_waitq);
114} 108}
109EXPORT_SYMBOL_GPL(percpu_up_read);
115 110
116static int clear_fast_ctr(struct percpu_rw_semaphore *brw) 111static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
117{ 112{
@@ -126,33 +121,17 @@ static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
126 return sum; 121 return sum;
127} 122}
128 123
129/*
130 * A writer increments ->write_ctr to force the readers to switch to the
131 * slow mode, note the atomic_read() check in update_fast_ctr().
132 *
133 * After that the readers can only inc/dec the slow ->slow_read_ctr counter,
134 * ->fast_read_ctr is stable. Once the writer moves its sum into the slow
135 * counter it represents the number of active readers.
136 *
137 * Finally the writer takes ->rw_sem for writing and blocks the new readers,
138 * then waits until the slow counter becomes zero.
139 */
140void percpu_down_write(struct percpu_rw_semaphore *brw) 124void percpu_down_write(struct percpu_rw_semaphore *brw)
141{ 125{
142 /* tell update_fast_ctr() there is a pending writer */
143 atomic_inc(&brw->write_ctr);
144 /* 126 /*
145 * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read 127 * Make rcu_sync_is_idle() == F and thus disable the fast-path in
146 * so that update_fast_ctr() can't succeed. 128 * percpu_down_read() and percpu_up_read(), and wait for gp pass.
147 *
148 * 2. Ensures we see the result of every previous this_cpu_add() in
149 * update_fast_ctr().
150 * 129 *
151 * 3. Ensures that if any reader has exited its critical section via 130 * The latter synchronises us with the preceding readers which used
 152 * fast-path, it executes a full memory barrier before we return. 131 * the fast-path, so we cannot miss the result of __this_cpu_add()
 153 * See R_W case in the comment above update_fast_ctr(). 132 * or anything else inside their critical sections.
154 */ 133 */
155 synchronize_sched_expedited(); 134 rcu_sync_enter(&brw->rss);
156 135
157 /* exclude other writers, and block the new readers completely */ 136 /* exclude other writers, and block the new readers completely */
158 down_write(&brw->rw_sem); 137 down_write(&brw->rw_sem);
@@ -163,16 +142,17 @@ void percpu_down_write(struct percpu_rw_semaphore *brw)
163 /* wait for all readers to complete their percpu_up_read() */ 142 /* wait for all readers to complete their percpu_up_read() */
164 wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); 143 wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr));
165} 144}
145EXPORT_SYMBOL_GPL(percpu_down_write);
166 146
167void percpu_up_write(struct percpu_rw_semaphore *brw) 147void percpu_up_write(struct percpu_rw_semaphore *brw)
168{ 148{
169 /* release the lock, but the readers can't use the fast-path */ 149 /* release the lock, but the readers can't use the fast-path */
170 up_write(&brw->rw_sem); 150 up_write(&brw->rw_sem);
171 /* 151 /*
172 * Insert the barrier before the next fast-path in down_read, 152 * Enable the fast-path in percpu_down_read() and percpu_up_read()
173 * see W_R case in the comment above update_fast_ctr(). 153 * but only after another gp pass; this adds the necessary barrier
154 * to ensure the reader can't miss the changes done by us.
174 */ 155 */
175 synchronize_sched_expedited(); 156 rcu_sync_exit(&brw->rss);
176 /* the last writer unblocks update_fast_ctr() */
177 atomic_dec(&brw->write_ctr);
178} 157}
158EXPORT_SYMBOL_GPL(percpu_up_write);
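
The rewritten reader side above hinges on a single question: has a writer forced readers off the per-CPU fast path? In the kernel that question is answered by rcu_sync_is_idle() plus a grace period; the userspace sketch below replaces that machinery with one acquire-loaded flag and a per-thread counter, purely to show the fast/slow dispatch shape. All names here are illustrative, and the writer's folding of the fast counters into slow_read_ctr is omitted.

/* Toy model of the percpu_down_read()/percpu_up_read() split above.
 * "writer_pending" stands in for !rcu_sync_is_idle(); the real code
 * relies on RCU grace periods rather than this acquire/release flag,
 * and the writer additionally folds the per-CPU counters into
 * slow_read_ctr, which this sketch leaves out. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool writer_pending;            /* models the rcu_sync state  */
static _Thread_local long fast_read_ctr;      /* models *brw->fast_read_ctr */
static atomic_long slow_read_ctr;             /* models brw->slow_read_ctr  */

static bool update_fast_ctr_model(int val)
{
	if (atomic_load_explicit(&writer_pending, memory_order_acquire))
		return false;                 /* writer active: use slow path */
	fast_read_ctr += val;
	return true;
}

static void down_read_model(void)
{
	if (update_fast_ctr_model(+1))
		return;
	atomic_fetch_add(&slow_read_ctr, 1);  /* slow path the writer waits on */
}

static void up_read_model(void)
{
	if (update_fast_ctr_model(-1))
		return;
	atomic_fetch_sub(&slow_read_ctr, 1);
}

int main(void)
{
	down_read_model();                    /* fast path                   */
	up_read_model();
	atomic_store(&writer_pending, true);  /* rcu_sync_enter() analogue   */
	down_read_model();                    /* slow path                   */
	up_read_model();
	atomic_store(&writer_pending, false); /* rcu_sync_exit() analogue    */
	printf("slow_read_ctr = %ld\n", atomic_load(&slow_read_ctr));
	return 0;
}
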
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index f17a3e3b3550..fec082338668 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -86,7 +86,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
86 /* 86 /*
87 * Put the reader into the wait queue 87 * Put the reader into the wait queue
88 */ 88 */
89 arch_spin_lock(&lock->lock); 89 arch_spin_lock(&lock->wait_lock);
90 90
91 /* 91 /*
92 * The ACQUIRE semantics of the following spinning code ensure 92 * The ACQUIRE semantics of the following spinning code ensure
@@ -99,7 +99,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
99 /* 99 /*
100 * Signal the next one in queue to become queue head 100 * Signal the next one in queue to become queue head
101 */ 101 */
102 arch_spin_unlock(&lock->lock); 102 arch_spin_unlock(&lock->wait_lock);
103} 103}
104EXPORT_SYMBOL(queued_read_lock_slowpath); 104EXPORT_SYMBOL(queued_read_lock_slowpath);
105 105
@@ -112,7 +112,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
112 u32 cnts; 112 u32 cnts;
113 113
114 /* Put the writer into the wait queue */ 114 /* Put the writer into the wait queue */
115 arch_spin_lock(&lock->lock); 115 arch_spin_lock(&lock->wait_lock);
116 116
117 /* Try to acquire the lock directly if no reader is present */ 117 /* Try to acquire the lock directly if no reader is present */
118 if (!atomic_read(&lock->cnts) && 118 if (!atomic_read(&lock->cnts) &&
@@ -144,6 +144,6 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
144 cpu_relax_lowlatency(); 144 cpu_relax_lowlatency();
145 } 145 }
146unlock: 146unlock:
147 arch_spin_unlock(&lock->lock); 147 arch_spin_unlock(&lock->wait_lock);
148} 148}
149EXPORT_SYMBOL(queued_write_lock_slowpath); 149EXPORT_SYMBOL(queued_write_lock_slowpath);
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index c8e6e9a596f5..f0450ff4829b 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -267,7 +267,6 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
267 } 267 }
268 268
269 if (!lp) { /* ONCE */ 269 if (!lp) { /* ONCE */
270 WRITE_ONCE(pn->state, vcpu_hashed);
271 lp = pv_hash(lock, pn); 270 lp = pv_hash(lock, pn);
272 271
273 /* 272 /*
@@ -275,11 +274,9 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
275 * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock() 274 * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock()
276 * we'll be sure to be able to observe our hash entry. 275 * we'll be sure to be able to observe our hash entry.
277 * 276 *
278 * [S] pn->state
279 * [S] <hash> [Rmw] l->locked == _Q_SLOW_VAL 277 * [S] <hash> [Rmw] l->locked == _Q_SLOW_VAL
280 * MB RMB 278 * MB RMB
281 * [RmW] l->locked = _Q_SLOW_VAL [L] <unhash> 279 * [RmW] l->locked = _Q_SLOW_VAL [L] <unhash>
282 * [L] pn->state
283 * 280 *
284 * Matches the smp_rmb() in __pv_queued_spin_unlock(). 281 * Matches the smp_rmb() in __pv_queued_spin_unlock().
285 */ 282 */
@@ -364,8 +361,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
364 * vCPU is harmless other than the additional latency in completing 361 * vCPU is harmless other than the additional latency in completing
365 * the unlock. 362 * the unlock.
366 */ 363 */
367 if (READ_ONCE(node->state) == vcpu_hashed) 364 pv_kick(node->cpu);
368 pv_kick(node->cpu);
369} 365}
370/* 366/*
371 * Include the architecture specific callee-save thunk of the 367 * Include the architecture specific callee-save thunk of the
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 7781d801212f..8251e75dd9c0 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -74,14 +74,23 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
74 * set up. 74 * set up.
75 */ 75 */
76#ifndef CONFIG_DEBUG_RT_MUTEXES 76#ifndef CONFIG_DEBUG_RT_MUTEXES
77# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) 77# define rt_mutex_cmpxchg_relaxed(l,c,n) (cmpxchg_relaxed(&l->owner, c, n) == c)
78# define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c)
79# define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c)
80
81/*
82 * Callers must hold the ->wait_lock -- which is the whole purpose as we force
83 * all future threads that attempt to [Rmw] the lock to the slowpath. As such
84 * relaxed semantics suffice.
85 */
78static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) 86static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
79{ 87{
80 unsigned long owner, *p = (unsigned long *) &lock->owner; 88 unsigned long owner, *p = (unsigned long *) &lock->owner;
81 89
82 do { 90 do {
83 owner = *p; 91 owner = *p;
84 } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); 92 } while (cmpxchg_relaxed(p, owner,
93 owner | RT_MUTEX_HAS_WAITERS) != owner);
85} 94}
86 95
87/* 96/*
@@ -121,11 +130,14 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
121 * lock(wait_lock); 130 * lock(wait_lock);
122 * acquire(lock); 131 * acquire(lock);
123 */ 132 */
124 return rt_mutex_cmpxchg(lock, owner, NULL); 133 return rt_mutex_cmpxchg_release(lock, owner, NULL);
125} 134}
126 135
127#else 136#else
128# define rt_mutex_cmpxchg(l,c,n) (0) 137# define rt_mutex_cmpxchg_relaxed(l,c,n) (0)
138# define rt_mutex_cmpxchg_acquire(l,c,n) (0)
139# define rt_mutex_cmpxchg_release(l,c,n) (0)
140
129static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) 141static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
130{ 142{
131 lock->owner = (struct task_struct *) 143 lock->owner = (struct task_struct *)
@@ -158,7 +170,8 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left,
158 * then right waiter has a dl_prio() too. 170 * then right waiter has a dl_prio() too.
159 */ 171 */
160 if (dl_prio(left->prio)) 172 if (dl_prio(left->prio))
161 return (left->task->dl.deadline < right->task->dl.deadline); 173 return dl_time_before(left->task->dl.deadline,
174 right->task->dl.deadline);
162 175
163 return 0; 176 return 0;
164} 177}
@@ -1321,7 +1334,7 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state,
1321 struct hrtimer_sleeper *timeout, 1334 struct hrtimer_sleeper *timeout,
1322 enum rtmutex_chainwalk chwalk)) 1335 enum rtmutex_chainwalk chwalk))
1323{ 1336{
1324 if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { 1337 if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
1325 rt_mutex_deadlock_account_lock(lock, current); 1338 rt_mutex_deadlock_account_lock(lock, current);
1326 return 0; 1339 return 0;
1327 } else 1340 } else
@@ -1337,7 +1350,7 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
1337 enum rtmutex_chainwalk chwalk)) 1350 enum rtmutex_chainwalk chwalk))
1338{ 1351{
1339 if (chwalk == RT_MUTEX_MIN_CHAINWALK && 1352 if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
1340 likely(rt_mutex_cmpxchg(lock, NULL, current))) { 1353 likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
1341 rt_mutex_deadlock_account_lock(lock, current); 1354 rt_mutex_deadlock_account_lock(lock, current);
1342 return 0; 1355 return 0;
1343 } else 1356 } else
@@ -1348,7 +1361,7 @@ static inline int
1348rt_mutex_fasttrylock(struct rt_mutex *lock, 1361rt_mutex_fasttrylock(struct rt_mutex *lock,
1349 int (*slowfn)(struct rt_mutex *lock)) 1362 int (*slowfn)(struct rt_mutex *lock))
1350{ 1363{
1351 if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { 1364 if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
1352 rt_mutex_deadlock_account_lock(lock, current); 1365 rt_mutex_deadlock_account_lock(lock, current);
1353 return 1; 1366 return 1;
1354 } 1367 }
@@ -1362,7 +1375,7 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
1362{ 1375{
1363 WAKE_Q(wake_q); 1376 WAKE_Q(wake_q);
1364 1377
1365 if (likely(rt_mutex_cmpxchg(lock, current, NULL))) { 1378 if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
1366 rt_mutex_deadlock_account_unlock(current); 1379 rt_mutex_deadlock_account_unlock(current);
1367 1380
1368 } else { 1381 } else {
@@ -1484,7 +1497,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_unlock);
1484bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock, 1497bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
1485 struct wake_q_head *wqh) 1498 struct wake_q_head *wqh)
1486{ 1499{
1487 if (likely(rt_mutex_cmpxchg(lock, current, NULL))) { 1500 if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
1488 rt_mutex_deadlock_account_unlock(current); 1501 rt_mutex_deadlock_account_unlock(current);
1489 return false; 1502 return false;
1490 } 1503 }
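
The rtmutex hunks above replace the bare cmpxchg() with explicitly ordered variants: acquire when taking the lock, release when dropping it, and relaxed where ->wait_lock already provides the ordering. The C11 sketch below shows the same idiom on a bare owner pointer; it illustrates the ordering choice and is not the kernel's rt_mutex.

/* Acquire-on-lock / release-on-unlock cmpxchg idiom, mirroring
 * rt_mutex_cmpxchg_acquire()/rt_mutex_cmpxchg_release() above. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_lock {
	_Atomic(void *) owner;	/* NULL means "unlocked" */
};

static bool toy_trylock(struct toy_lock *l, void *me)
{
	void *expected = NULL;

	/* Acquire: the critical section cannot be reordered before this. */
	return atomic_compare_exchange_strong_explicit(&l->owner, &expected, me,
						       memory_order_acquire,
						       memory_order_relaxed);
}

static bool toy_tryunlock(struct toy_lock *l, void *me)
{
	void *expected = me;

	/* Release: everything done under the lock is visible to the next
	 * thread whose acquire-cmpxchg succeeds. */
	return atomic_compare_exchange_strong_explicit(&l->owner, &expected, NULL,
						       memory_order_release,
						       memory_order_relaxed);
}

int main(void)
{
	struct toy_lock l = { .owner = NULL };
	int me;

	printf("lock: %d, unlock: %d\n", toy_trylock(&l, &me), toy_tryunlock(&l, &me));
	return 0;
}
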
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 0f189714e457..a4d4de05b2d1 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -262,7 +262,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
262 * to reduce unnecessary expensive cmpxchg() operations. 262 * to reduce unnecessary expensive cmpxchg() operations.
263 */ 263 */
264 if (count == RWSEM_WAITING_BIAS && 264 if (count == RWSEM_WAITING_BIAS &&
265 cmpxchg(&sem->count, RWSEM_WAITING_BIAS, 265 cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS,
266 RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { 266 RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
267 if (!list_is_singular(&sem->wait_list)) 267 if (!list_is_singular(&sem->wait_list))
268 rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); 268 rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
@@ -285,7 +285,8 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
285 if (!(count == 0 || count == RWSEM_WAITING_BIAS)) 285 if (!(count == 0 || count == RWSEM_WAITING_BIAS))
286 return false; 286 return false;
287 287
288 old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); 288 old = cmpxchg_acquire(&sem->count, count,
289 count + RWSEM_ACTIVE_WRITE_BIAS);
289 if (old == count) { 290 if (old == count) {
290 rwsem_set_owner(sem); 291 rwsem_set_owner(sem);
291 return true; 292 return true;
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 72b0c66628b6..7658d32c5c78 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -24,6 +24,16 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
24} 24}
25#endif 25#endif
26 26
27static void *try_ram_remap(resource_size_t offset, size_t size)
28{
29 struct page *page = pfn_to_page(offset >> PAGE_SHIFT);
30
31 /* In the simple case just return the existing linear address */
32 if (!PageHighMem(page))
33 return __va(offset);
34 return NULL; /* fallback to ioremap_cache */
35}
36
27/** 37/**
28 * memremap() - remap an iomem_resource as cacheable memory 38 * memremap() - remap an iomem_resource as cacheable memory
29 * @offset: iomem resource start address 39 * @offset: iomem resource start address
@@ -66,8 +76,8 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
66 * the requested range is potentially in "System RAM" 76 * the requested range is potentially in "System RAM"
67 */ 77 */
68 if (is_ram == REGION_INTERSECTS) 78 if (is_ram == REGION_INTERSECTS)
69 addr = __va(offset); 79 addr = try_ram_remap(offset, size);
70 else 80 if (!addr)
71 addr = ioremap_cache(offset, size); 81 addr = ioremap_cache(offset, size);
72 } 82 }
73 83
@@ -114,9 +124,10 @@ void *devm_memremap(struct device *dev, resource_size_t offset,
114{ 124{
115 void **ptr, *addr; 125 void **ptr, *addr;
116 126
117 ptr = devres_alloc(devm_memremap_release, sizeof(*ptr), GFP_KERNEL); 127 ptr = devres_alloc_node(devm_memremap_release, sizeof(*ptr), GFP_KERNEL,
128 dev_to_node(dev));
118 if (!ptr) 129 if (!ptr)
119 return NULL; 130 return ERR_PTR(-ENOMEM);
120 131
121 addr = memremap(offset, size, flags); 132 addr = memremap(offset, size, flags);
122 if (addr) { 133 if (addr) {
@@ -131,9 +142,8 @@ EXPORT_SYMBOL(devm_memremap);
131 142
132void devm_memunmap(struct device *dev, void *addr) 143void devm_memunmap(struct device *dev, void *addr)
133{ 144{
134 WARN_ON(devres_destroy(dev, devm_memremap_release, devm_memremap_match, 145 WARN_ON(devres_release(dev, devm_memremap_release,
135 addr)); 146 devm_memremap_match, addr));
136 memunmap(addr);
137} 147}
138EXPORT_SYMBOL(devm_memunmap); 148EXPORT_SYMBOL(devm_memunmap);
139 149
@@ -166,8 +176,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res)
166 if (is_ram == REGION_INTERSECTS) 176 if (is_ram == REGION_INTERSECTS)
167 return __va(res->start); 177 return __va(res->start);
168 178
169 page_map = devres_alloc(devm_memremap_pages_release, 179 page_map = devres_alloc_node(devm_memremap_pages_release,
170 sizeof(*page_map), GFP_KERNEL); 180 sizeof(*page_map), GFP_KERNEL, dev_to_node(dev));
171 if (!page_map) 181 if (!page_map)
172 return ERR_PTR(-ENOMEM); 182 return ERR_PTR(-ENOMEM);
173 183
@@ -175,7 +185,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res)
175 185
176 nid = dev_to_node(dev); 186 nid = dev_to_node(dev);
177 if (nid < 0) 187 if (nid < 0)
178 nid = 0; 188 nid = numa_mem_id();
179 189
180 error = arch_add_memory(nid, res->start, resource_size(res), true); 190 error = arch_add_memory(nid, res->start, resource_size(res), true);
181 if (error) { 191 if (error) {
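
The try_ram_remap() helper above encodes a simple preference: if the range is ordinary lowmem "System RAM", reuse the kernel's existing linear mapping; otherwise fall back to ioremap_cache(). Below is a compact userspace sketch of that decide-then-fallback shape, with trivial stand-ins for the kernel predicates and mapping helpers (the addresses are fake and only serve the control flow).

/* Stand-ins for region_intersects()/PageHighMem()/__va()/ioremap_cache(). */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static bool is_system_ram(uint64_t off)  { return off < (256u << 20); }
static bool is_highmem(uint64_t off)     { return off >= (128u << 20); }
static void *linear_va(uint64_t off)     { return (void *)(uintptr_t)off; }
static void *remap_cache(uint64_t off, size_t sz) { (void)sz; return (void *)(uintptr_t)(off | 1); }

/* Same shape as the patched memremap(): prefer the linear map, fall back
 * to an explicit cacheable remap when the linear map cannot be used. */
static void *memremap_model(uint64_t off, size_t sz)
{
	void *addr = NULL;

	if (is_system_ram(off) && !is_highmem(off))
		addr = linear_va(off);
	if (!addr)
		addr = remap_cache(off, sz);
	return addr;
}

int main(void)
{
	printf("%p %p\n", memremap_model(4096, 4096),
	       memremap_model((uint64_t)200 << 20, 4096));
	return 0;
}
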
diff --git a/kernel/module.c b/kernel/module.c
index b86b7bf1be38..8f051a106676 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1063,11 +1063,15 @@ void symbol_put_addr(void *addr)
1063 if (core_kernel_text(a)) 1063 if (core_kernel_text(a))
1064 return; 1064 return;
1065 1065
1066 /* module_text_address is safe here: we're supposed to have reference 1066 /*
 1067 * to module from symbol_get, so it can't go away. */ 1067 * Even though we hold a reference on the module, we still need to
1068 * disable preemption in order to safely traverse the data structure.
1069 */
1070 preempt_disable();
1068 modaddr = __module_text_address(a); 1071 modaddr = __module_text_address(a);
1069 BUG_ON(!modaddr); 1072 BUG_ON(!modaddr);
1070 module_put(modaddr); 1073 module_put(modaddr);
1074 preempt_enable();
1071} 1075}
1072EXPORT_SYMBOL_GPL(symbol_put_addr); 1076EXPORT_SYMBOL_GPL(symbol_put_addr);
1073 1077
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index bd62f5cda746..6528a79d998d 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -10,6 +10,7 @@
10 */ 10 */
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/errno.h>
13#include <keys/system_keyring.h> 14#include <keys/system_keyring.h>
14#include <crypto/public_key.h> 15#include <crypto/public_key.h>
15#include "module-internal.h" 16#include "module-internal.h"
diff --git a/kernel/panic.c b/kernel/panic.c
index 04e91ff7560b..4579dbb7ed87 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -23,6 +23,7 @@
23#include <linux/sysrq.h> 23#include <linux/sysrq.h>
24#include <linux/init.h> 24#include <linux/init.h>
25#include <linux/nmi.h> 25#include <linux/nmi.h>
26#include <linux/console.h>
26 27
27#define PANIC_TIMER_STEP 100 28#define PANIC_TIMER_STEP 100
28#define PANIC_BLINK_SPD 18 29#define PANIC_BLINK_SPD 18
@@ -147,6 +148,15 @@ void panic(const char *fmt, ...)
147 148
148 bust_spinlocks(0); 149 bust_spinlocks(0);
149 150
151 /*
152 * We may have ended up stopping the CPU holding the lock (in
153 * smp_send_stop()) while still having some valuable data in the console
154 * buffer. Try to acquire the lock then release it regardless of the
155 * result. The release will also print the buffers out.
156 */
157 console_trylock();
158 console_unlock();
159
150 if (!panic_blink) 160 if (!panic_blink)
151 panic_blink = no_blink; 161 panic_blink = no_blink;
152 162
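
The console_trylock()/console_unlock() pair added above is there only for its side effect: console_unlock() drains the log buffer, and the trylock result is deliberately ignored because the CPU that owned the lock may have been stopped and panic() never returns. A rough userspace analogue using a POSIX semaphore follows; the kernel's console lock is semaphore-based, which is what makes the unconditional release tolerable in this one-way code path.

/* "Try to acquire the lock then release it regardless of the result;
 *  the release will also print the buffers out." */
#include <semaphore.h>
#include <stdio.h>

static sem_t console_sem;
static char log_buf[64] = "buffered message that would otherwise be lost\n";

/* Stand-in for console_unlock(): releasing the console drains the buffer. */
static void console_unlock_model(void)
{
	if (log_buf[0]) {
		fputs(log_buf, stdout);
		log_buf[0] = '\0';
	}
	sem_post(&console_sem);
}

static void panic_flush_model(void)
{
	sem_trywait(&console_sem);	/* result deliberately ignored */
	console_unlock_model();
}

int main(void)
{
	sem_init(&console_sem, 0, 0);	/* 0: pretend a stopped CPU holds it */
	panic_flush_model();
	return 0;
}
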
diff --git a/kernel/params.c b/kernel/params.c
index b6554aa71094..a6d6149c0fe6 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -223,7 +223,7 @@ char *parse_args(const char *doing,
223 int (*unknown)(char *param, char *val, 223 int (*unknown)(char *param, char *val,
224 const char *doing, void *arg)) 224 const char *doing, void *arg))
225{ 225{
226 char *param, *val; 226 char *param, *val, *err = NULL;
227 227
228 /* Chew leading spaces */ 228 /* Chew leading spaces */
229 args = skip_spaces(args); 229 args = skip_spaces(args);
@@ -238,7 +238,7 @@ char *parse_args(const char *doing,
238 args = next_arg(args, &param, &val); 238 args = next_arg(args, &param, &val);
239 /* Stop at -- */ 239 /* Stop at -- */
240 if (!val && strcmp(param, "--") == 0) 240 if (!val && strcmp(param, "--") == 0)
241 return args; 241 return err ?: args;
242 irq_was_disabled = irqs_disabled(); 242 irq_was_disabled = irqs_disabled();
243 ret = parse_one(param, val, doing, params, num, 243 ret = parse_one(param, val, doing, params, num,
244 min_level, max_level, arg, unknown); 244 min_level, max_level, arg, unknown);
@@ -247,24 +247,25 @@ char *parse_args(const char *doing,
247 doing, param); 247 doing, param);
248 248
249 switch (ret) { 249 switch (ret) {
250 case 0:
251 continue;
250 case -ENOENT: 252 case -ENOENT:
251 pr_err("%s: Unknown parameter `%s'\n", doing, param); 253 pr_err("%s: Unknown parameter `%s'\n", doing, param);
252 return ERR_PTR(ret); 254 break;
253 case -ENOSPC: 255 case -ENOSPC:
254 pr_err("%s: `%s' too large for parameter `%s'\n", 256 pr_err("%s: `%s' too large for parameter `%s'\n",
255 doing, val ?: "", param); 257 doing, val ?: "", param);
256 return ERR_PTR(ret);
257 case 0:
258 break; 258 break;
259 default: 259 default:
260 pr_err("%s: `%s' invalid for parameter `%s'\n", 260 pr_err("%s: `%s' invalid for parameter `%s'\n",
261 doing, val ?: "", param); 261 doing, val ?: "", param);
262 return ERR_PTR(ret); 262 break;
263 } 263 }
264
265 err = ERR_PTR(ret);
264 } 266 }
265 267
266 /* All parsed OK. */ 268 return err;
267 return NULL;
268} 269}
269 270
270/* Lazy bastard, eh? */ 271/* Lazy bastard, eh? */
@@ -325,10 +326,11 @@ int param_get_charp(char *buffer, const struct kernel_param *kp)
325} 326}
326EXPORT_SYMBOL(param_get_charp); 327EXPORT_SYMBOL(param_get_charp);
327 328
328static void param_free_charp(void *arg) 329void param_free_charp(void *arg)
329{ 330{
330 maybe_kfree_parameter(*((char **)arg)); 331 maybe_kfree_parameter(*((char **)arg));
331} 332}
333EXPORT_SYMBOL(param_free_charp);
332 334
333const struct kernel_param_ops param_ops_charp = { 335const struct kernel_param_ops param_ops_charp = {
334 .set = param_set_charp, 336 .set = param_set_charp,
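
The parse_args() rework above changes the error policy: instead of returning on the first bad parameter, it prints a message for each failure, records the error, and keeps parsing so later parameters are still applied; only at the end (or at "--") does the caller see the saved ERR_PTR. A minimal sketch of that report-all, fail-once control flow:

/* Report every bad argument but keep parsing; hand the caller a single
 * error code at the end, mirroring the reworked parse_args() above. */
#include <errno.h>
#include <stdio.h>

/* Stand-in for parse_one(): pretend negative values are invalid. */
static int parse_one_model(int v)
{
	return v < 0 ? -EINVAL : 0;
}

static int parse_args_model(const int *args, int n)
{
	int i, err = 0;

	for (i = 0; i < n; i++) {
		int ret = parse_one_model(args[i]);

		if (ret == 0)
			continue;
		fprintf(stderr, "argument %d invalid\n", i);
		err = ret;	/* remember the failure, keep going */
	}
	return err;		/* 0 only if everything parsed */
}

int main(void)
{
	int args[] = { 1, -2, 3, -4 };

	return parse_args_model(args, 4) ? 1 : 0;
}
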
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 690f78f210f2..b7342a24f559 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -733,7 +733,7 @@ int hibernate(void)
733 * contents of memory is restored from the saved image. 733 * contents of memory is restored from the saved image.
734 * 734 *
735 * If this is successful, control reappears in the restored target kernel in 735 * If this is successful, control reappears in the restored target kernel in
736 * hibernation_snaphot() which returns to hibernate(). Otherwise, the routine 736 * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine
737 * attempts to recover gracefully and make the kernel return to the normal mode 737 * attempts to recover gracefully and make the kernel return to the normal mode
738 * of operation. 738 * of operation.
739 */ 739 */
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 63d395b5df93..b2dd4d999900 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -272,6 +272,22 @@ static inline void pm_print_times_init(void)
272{ 272{
273 pm_print_times_enabled = !!initcall_debug; 273 pm_print_times_enabled = !!initcall_debug;
274} 274}
275
276static ssize_t pm_wakeup_irq_show(struct kobject *kobj,
277 struct kobj_attribute *attr,
278 char *buf)
279{
280 return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA;
281}
282
283static ssize_t pm_wakeup_irq_store(struct kobject *kobj,
284 struct kobj_attribute *attr,
285 const char *buf, size_t n)
286{
287 return -EINVAL;
288}
289power_attr(pm_wakeup_irq);
290
275#else /* !CONFIG_PM_SLEEP_DEBUG */ 291#else /* !CONFIG_PM_SLEEP_DEBUG */
276static inline void pm_print_times_init(void) {} 292static inline void pm_print_times_init(void) {}
277#endif /* CONFIG_PM_SLEEP_DEBUG */ 293#endif /* CONFIG_PM_SLEEP_DEBUG */
@@ -604,6 +620,7 @@ static struct attribute * g[] = {
604#endif 620#endif
605#ifdef CONFIG_PM_SLEEP_DEBUG 621#ifdef CONFIG_PM_SLEEP_DEBUG
606 &pm_print_times_attr.attr, 622 &pm_print_times_attr.attr,
623 &pm_wakeup_irq_attr.attr,
607#endif 624#endif
608#endif 625#endif
609#ifdef CONFIG_FREEZER 626#ifdef CONFIG_FREEZER
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 5235dd4e1e2f..3a970604308f 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1779,7 +1779,7 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
1779 while (to_alloc-- > 0) { 1779 while (to_alloc-- > 0) {
1780 struct page *page; 1780 struct page *page;
1781 1781
1782 page = alloc_image_page(__GFP_HIGHMEM); 1782 page = alloc_image_page(__GFP_HIGHMEM|__GFP_KSWAPD_RECLAIM);
1783 memory_bm_set_bit(bm, page_to_pfn(page)); 1783 memory_bm_set_bit(bm, page_to_pfn(page));
1784 } 1784 }
1785 return nr_highmem; 1785 return nr_highmem;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 7e4cda4a8dd9..f9fe133c13e2 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -35,6 +35,9 @@
35const char *pm_labels[] = { "mem", "standby", "freeze", NULL }; 35const char *pm_labels[] = { "mem", "standby", "freeze", NULL };
36const char *pm_states[PM_SUSPEND_MAX]; 36const char *pm_states[PM_SUSPEND_MAX];
37 37
38unsigned int pm_suspend_global_flags;
39EXPORT_SYMBOL_GPL(pm_suspend_global_flags);
40
38static const struct platform_suspend_ops *suspend_ops; 41static const struct platform_suspend_ops *suspend_ops;
39static const struct platform_freeze_ops *freeze_ops; 42static const struct platform_freeze_ops *freeze_ops;
40static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); 43static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
@@ -493,6 +496,7 @@ static int enter_state(suspend_state_t state)
493#endif 496#endif
494 497
495 pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]); 498 pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]);
499 pm_suspend_clear_flags();
496 error = suspend_prepare(state); 500 error = suspend_prepare(state);
497 if (error) 501 if (error)
498 goto Unlock; 502 goto Unlock;
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index b2066fb5b10f..12cd989dadf6 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -257,7 +257,7 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
257 struct bio *bio; 257 struct bio *bio;
258 int error = 0; 258 int error = 0;
259 259
260 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); 260 bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1);
261 bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); 261 bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
262 bio->bi_bdev = hib_resume_bdev; 262 bio->bi_bdev = hib_resume_bdev;
263 263
@@ -356,7 +356,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
356 return -ENOSPC; 356 return -ENOSPC;
357 357
358 if (hb) { 358 if (hb) {
359 src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN | 359 src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN |
360 __GFP_NORETRY); 360 __GFP_NORETRY);
361 if (src) { 361 if (src) {
362 copy_page(src, buf); 362 copy_page(src, buf);
@@ -364,7 +364,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
364 ret = hib_wait_io(hb); /* Free pages */ 364 ret = hib_wait_io(hb); /* Free pages */
365 if (ret) 365 if (ret)
366 return ret; 366 return ret;
367 src = (void *)__get_free_page(__GFP_WAIT | 367 src = (void *)__get_free_page(__GFP_RECLAIM |
368 __GFP_NOWARN | 368 __GFP_NOWARN |
369 __GFP_NORETRY); 369 __GFP_NORETRY);
370 if (src) { 370 if (src) {
@@ -672,7 +672,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
672 nr_threads = num_online_cpus() - 1; 672 nr_threads = num_online_cpus() - 1;
673 nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); 673 nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
674 674
675 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 675 page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH);
676 if (!page) { 676 if (!page) {
677 printk(KERN_ERR "PM: Failed to allocate LZO page\n"); 677 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
678 ret = -ENOMEM; 678 ret = -ENOMEM;
@@ -975,7 +975,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
975 last = tmp; 975 last = tmp;
976 976
977 tmp->map = (struct swap_map_page *) 977 tmp->map = (struct swap_map_page *)
978 __get_free_page(__GFP_WAIT | __GFP_HIGH); 978 __get_free_page(__GFP_RECLAIM | __GFP_HIGH);
979 if (!tmp->map) { 979 if (!tmp->map) {
980 release_swap_reader(handle); 980 release_swap_reader(handle);
981 return -ENOMEM; 981 return -ENOMEM;
@@ -1242,9 +1242,9 @@ static int load_image_lzo(struct swap_map_handle *handle,
1242 1242
1243 for (i = 0; i < read_pages; i++) { 1243 for (i = 0; i < read_pages; i++) {
1244 page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? 1244 page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
1245 __GFP_WAIT | __GFP_HIGH : 1245 __GFP_RECLAIM | __GFP_HIGH :
1246 __GFP_WAIT | __GFP_NOWARN | 1246 __GFP_RECLAIM | __GFP_NOWARN |
1247 __GFP_NORETRY); 1247 __GFP_NORETRY);
1248 1248
1249 if (!page[i]) { 1249 if (!page[i]) {
1250 if (i < LZO_CMP_PAGES) { 1250 if (i < LZO_CMP_PAGES) {
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 8f0324ef72ab..2ce8826f1053 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -269,6 +269,9 @@ static u32 clear_idx;
269#define PREFIX_MAX 32 269#define PREFIX_MAX 32
270#define LOG_LINE_MAX (1024 - PREFIX_MAX) 270#define LOG_LINE_MAX (1024 - PREFIX_MAX)
271 271
272#define LOG_LEVEL(v) ((v) & 0x07)
273#define LOG_FACILITY(v) ((v) >> 3 & 0xff)
274
272/* record buffer */ 275/* record buffer */
273#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) 276#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
274#define LOG_ALIGN 4 277#define LOG_ALIGN 4
@@ -517,6 +520,7 @@ int check_syslog_permissions(int type, int source)
517ok: 520ok:
518 return security_syslog(type); 521 return security_syslog(type);
519} 522}
523EXPORT_SYMBOL_GPL(check_syslog_permissions);
520 524
521static void append_char(char **pp, char *e, char c) 525static void append_char(char **pp, char *e, char c)
522{ 526{
@@ -611,7 +615,6 @@ struct devkmsg_user {
611static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) 615static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
612{ 616{
613 char *buf, *line; 617 char *buf, *line;
614 int i;
615 int level = default_message_loglevel; 618 int level = default_message_loglevel;
616 int facility = 1; /* LOG_USER */ 619 int facility = 1; /* LOG_USER */
617 size_t len = iov_iter_count(from); 620 size_t len = iov_iter_count(from);
@@ -641,12 +644,13 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
641 line = buf; 644 line = buf;
642 if (line[0] == '<') { 645 if (line[0] == '<') {
643 char *endp = NULL; 646 char *endp = NULL;
647 unsigned int u;
644 648
645 i = simple_strtoul(line+1, &endp, 10); 649 u = simple_strtoul(line + 1, &endp, 10);
646 if (endp && endp[0] == '>') { 650 if (endp && endp[0] == '>') {
647 level = i & 7; 651 level = LOG_LEVEL(u);
648 if (i >> 3) 652 if (LOG_FACILITY(u) != 0)
649 facility = i >> 3; 653 facility = LOG_FACILITY(u);
650 endp++; 654 endp++;
651 len -= endp - line; 655 len -= endp - line;
652 line = endp; 656 line = endp;
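
The new LOG_LEVEL()/LOG_FACILITY() macros and the switch from int to unsigned make the "<prival>" prefix handling in devkmsg_write() explicit: the low three bits are the level, the next eight bits the facility, and a facility of zero keeps the LOG_USER default. A userspace re-implementation of that decoding (the defaults here are illustrative):

#include <stdio.h>
#include <stdlib.h>

#define LOG_LEVEL(v)	((v) & 0x07)
#define LOG_FACILITY(v)	((v) >> 3 & 0xff)

static void parse_prefix(const char *line, int *level, int *facility)
{
	*level = 4;		/* stand-in for default_message_loglevel */
	*facility = 1;		/* LOG_USER */

	if (line[0] == '<') {
		char *endp = NULL;
		unsigned int u = (unsigned int)strtoul(line + 1, &endp, 10);

		if (endp && endp[0] == '>') {
			*level = LOG_LEVEL(u);
			if (LOG_FACILITY(u) != 0)
				*facility = LOG_FACILITY(u);
		}
	}
}

int main(void)
{
	int level, facility;

	parse_prefix("<30>hello", &level, &facility);
	printf("level=%d facility=%d\n", level, facility);	/* 6 and 3 */
	return 0;
}
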
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 787320de68e0..b760bae64cf1 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1016,6 +1016,11 @@ int ptrace_request(struct task_struct *child, long request,
1016 break; 1016 break;
1017 } 1017 }
1018#endif 1018#endif
1019
1020 case PTRACE_SECCOMP_GET_FILTER:
1021 ret = seccomp_get_filter(child, addr, datavp);
1022 break;
1023
1019 default: 1024 default:
1020 break; 1025 break;
1021 } 1026 }
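
The new PTRACE_SECCOMP_GET_FILTER request above lets a tracer dump a seccomp-attached classic BPF program from a stopped tracee: addr selects the filter (0 being the most recently installed one) and data points at a buffer of struct sock_filter; with a NULL data pointer the call returns only the instruction count. The sketch below assumes those 4.4-era semantics and a tracee that is already ptrace-stopped with a filter installed; the request constant is defined locally in case the libc headers predate it.

/* Hypothetical tracer-side usage of PTRACE_SECCOMP_GET_FILTER. */
#include <stdio.h>
#include <stdlib.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <linux/filter.h>		/* struct sock_filter */

#ifndef PTRACE_SECCOMP_GET_FILTER
#define PTRACE_SECCOMP_GET_FILTER 0x420c	/* value from <linux/ptrace.h> */
#endif

static int dump_filter(pid_t pid)
{
	long cnt, i;
	struct sock_filter *insns;

	/* addr = NULL selects filter index 0; data = NULL asks for the count. */
	cnt = ptrace(PTRACE_SECCOMP_GET_FILTER, pid, NULL, NULL);
	if (cnt < 0)
		return -1;

	insns = calloc((size_t)cnt, sizeof(*insns));
	if (!insns)
		return -1;

	if (ptrace(PTRACE_SECCOMP_GET_FILTER, pid, NULL, insns) < 0) {
		free(insns);
		return -1;
	}

	for (i = 0; i < cnt; i++)
		printf("%ld: code=%#x k=%#x\n", i, insns[i].code, insns[i].k);
	free(insns);
	return 0;
}
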
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 50a808424b06..61a16569ffbf 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,4 +1,4 @@
1obj-y += update.o 1obj-y += update.o sync.o
2obj-$(CONFIG_SRCU) += srcu.o 2obj-$(CONFIG_SRCU) += srcu.o
3obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 3obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
4obj-$(CONFIG_TREE_RCU) += tree.o 4obj-$(CONFIG_TREE_RCU) += tree.o
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 77192953dee5..d89328e260df 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -252,7 +252,7 @@ struct rcu_torture_ops {
252 void (*exp_sync)(void); 252 void (*exp_sync)(void);
253 unsigned long (*get_state)(void); 253 unsigned long (*get_state)(void);
254 void (*cond_sync)(unsigned long oldstate); 254 void (*cond_sync)(unsigned long oldstate);
255 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 255 call_rcu_func_t call;
256 void (*cb_barrier)(void); 256 void (*cb_barrier)(void);
257 void (*fqs)(void); 257 void (*fqs)(void);
258 void (*stats)(void); 258 void (*stats)(void);
@@ -448,7 +448,7 @@ static void synchronize_rcu_busted(void)
448} 448}
449 449
450static void 450static void
451call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 451call_rcu_busted(struct rcu_head *head, rcu_callback_t func)
452{ 452{
453 /* This is a deliberate bug for testing purposes only! */ 453 /* This is a deliberate bug for testing purposes only! */
454 func(head); 454 func(head);
@@ -523,7 +523,7 @@ static void srcu_torture_synchronize(void)
523} 523}
524 524
525static void srcu_torture_call(struct rcu_head *head, 525static void srcu_torture_call(struct rcu_head *head,
526 void (*func)(struct rcu_head *head)) 526 rcu_callback_t func)
527{ 527{
528 call_srcu(srcu_ctlp, head, func); 528 call_srcu(srcu_ctlp, head, func);
529} 529}
@@ -695,7 +695,7 @@ static bool __maybe_unused torturing_tasks(void)
695 695
696#define RCUTORTURE_TASKS_OPS 696#define RCUTORTURE_TASKS_OPS
697 697
698static bool torturing_tasks(void) 698static bool __maybe_unused torturing_tasks(void)
699{ 699{
700 return false; 700 return false;
701} 701}
@@ -768,7 +768,6 @@ static int rcu_torture_boost(void *arg)
768 } 768 }
769 call_rcu_time = jiffies; 769 call_rcu_time = jiffies;
770 } 770 }
771 cond_resched_rcu_qs();
772 stutter_wait("rcu_torture_boost"); 771 stutter_wait("rcu_torture_boost");
773 if (torture_must_stop()) 772 if (torture_must_stop())
774 goto checkwait; 773 goto checkwait;
@@ -1208,7 +1207,6 @@ rcu_torture_reader(void *arg)
1208 __this_cpu_inc(rcu_torture_batch[completed]); 1207 __this_cpu_inc(rcu_torture_batch[completed]);
1209 preempt_enable(); 1208 preempt_enable();
1210 cur_ops->readunlock(idx); 1209 cur_ops->readunlock(idx);
1211 cond_resched_rcu_qs();
1212 stutter_wait("rcu_torture_reader"); 1210 stutter_wait("rcu_torture_reader");
1213 } while (!torture_must_stop()); 1211 } while (!torture_must_stop());
1214 if (irqreader && cur_ops->irq_capable) { 1212 if (irqreader && cur_ops->irq_capable) {
@@ -1742,15 +1740,15 @@ rcu_torture_init(void)
1742 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) 1740 for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
1743 pr_alert(" %s", torture_ops[i]->name); 1741 pr_alert(" %s", torture_ops[i]->name);
1744 pr_alert("\n"); 1742 pr_alert("\n");
1745 torture_init_end(); 1743 firsterr = -EINVAL;
1746 return -EINVAL; 1744 goto unwind;
1747 } 1745 }
1748 if (cur_ops->fqs == NULL && fqs_duration != 0) { 1746 if (cur_ops->fqs == NULL && fqs_duration != 0) {
1749 pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); 1747 pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n");
1750 fqs_duration = 0; 1748 fqs_duration = 0;
1751 } 1749 }
1752 if (cur_ops->init) 1750 if (cur_ops->init)
1753 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 1751 cur_ops->init();
1754 1752
1755 if (nreaders >= 0) { 1753 if (nreaders >= 0) {
1756 nrealreaders = nreaders; 1754 nrealreaders = nreaders;
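
Several of the signature changes in these RCU hunks simply substitute the rcu_callback_t and call_rcu_func_t typedefs for the spelled-out function-pointer types. For reference, the equivalent declarations, re-stated here outside the kernel headers, are:

/* Equivalent to the typedefs from <linux/types.h> used in the hunks above. */
struct rcu_head;

typedef void (*rcu_callback_t)(struct rcu_head *head);
typedef void (*call_rcu_func_t)(struct rcu_head *head, rcu_callback_t func);
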
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index d3fcb2ec8536..a63a1ea5a41b 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -298,11 +298,9 @@ int __srcu_read_lock(struct srcu_struct *sp)
298 int idx; 298 int idx;
299 299
300 idx = READ_ONCE(sp->completed) & 0x1; 300 idx = READ_ONCE(sp->completed) & 0x1;
301 preempt_disable();
302 __this_cpu_inc(sp->per_cpu_ref->c[idx]); 301 __this_cpu_inc(sp->per_cpu_ref->c[idx]);
303 smp_mb(); /* B */ /* Avoid leaking the critical section. */ 302 smp_mb(); /* B */ /* Avoid leaking the critical section. */
304 __this_cpu_inc(sp->per_cpu_ref->seq[idx]); 303 __this_cpu_inc(sp->per_cpu_ref->seq[idx]);
305 preempt_enable();
306 return idx; 304 return idx;
307} 305}
308EXPORT_SYMBOL_GPL(__srcu_read_lock); 306EXPORT_SYMBOL_GPL(__srcu_read_lock);
@@ -387,7 +385,7 @@ static void srcu_flip(struct srcu_struct *sp)
387 * srcu_struct structure. 385 * srcu_struct structure.
388 */ 386 */
389void call_srcu(struct srcu_struct *sp, struct rcu_head *head, 387void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
390 void (*func)(struct rcu_head *head)) 388 rcu_callback_t func)
391{ 389{
392 unsigned long flags; 390 unsigned long flags;
393 391
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
new file mode 100644
index 000000000000..be922c9f3d37
--- /dev/null
+++ b/kernel/rcu/sync.c
@@ -0,0 +1,223 @@
1/*
2 * RCU-based infrastructure for lightweight reader-writer locking
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, you can access it online at
16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 *
18 * Copyright (c) 2015, Red Hat, Inc.
19 *
20 * Author: Oleg Nesterov <oleg@redhat.com>
21 */
22
23#include <linux/rcu_sync.h>
24#include <linux/sched.h>
25
26#ifdef CONFIG_PROVE_RCU
27#define __INIT_HELD(func) .held = func,
28#else
29#define __INIT_HELD(func)
30#endif
31
32static const struct {
33 void (*sync)(void);
34 void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
35 void (*wait)(void);
36#ifdef CONFIG_PROVE_RCU
37 int (*held)(void);
38#endif
39} gp_ops[] = {
40 [RCU_SYNC] = {
41 .sync = synchronize_rcu,
42 .call = call_rcu,
43 .wait = rcu_barrier,
44 __INIT_HELD(rcu_read_lock_held)
45 },
46 [RCU_SCHED_SYNC] = {
47 .sync = synchronize_sched,
48 .call = call_rcu_sched,
49 .wait = rcu_barrier_sched,
50 __INIT_HELD(rcu_read_lock_sched_held)
51 },
52 [RCU_BH_SYNC] = {
53 .sync = synchronize_rcu_bh,
54 .call = call_rcu_bh,
55 .wait = rcu_barrier_bh,
56 __INIT_HELD(rcu_read_lock_bh_held)
57 },
58};
59
60enum { GP_IDLE = 0, GP_PENDING, GP_PASSED };
61enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY };
62
63#define rss_lock gp_wait.lock
64
65#ifdef CONFIG_PROVE_RCU
66void rcu_sync_lockdep_assert(struct rcu_sync *rsp)
67{
68 RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(),
69 "suspicious rcu_sync_is_idle() usage");
70}
71#endif
72
73/**
74 * rcu_sync_init() - Initialize an rcu_sync structure
75 * @rsp: Pointer to rcu_sync structure to be initialized
76 * @type: Flavor of RCU with which to synchronize rcu_sync structure
77 */
78void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type)
79{
80 memset(rsp, 0, sizeof(*rsp));
81 init_waitqueue_head(&rsp->gp_wait);
82 rsp->gp_type = type;
83}
84
85/**
86 * rcu_sync_enter() - Force readers onto slowpath
87 * @rsp: Pointer to rcu_sync structure to use for synchronization
88 *
89 * This function is used by updaters who need readers to make use of
90 * a slowpath during the update. After this function returns, all
91 * subsequent calls to rcu_sync_is_idle() will return false, which
92 * tells readers to stay off their fastpaths. A later call to
 93 * rcu_sync_exit() re-enables reader fastpaths.
94 *
95 * When called in isolation, rcu_sync_enter() must wait for a grace
96 * period, however, closely spaced calls to rcu_sync_enter() can
97 * optimize away the grace-period wait via a state machine implemented
98 * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func().
99 */
100void rcu_sync_enter(struct rcu_sync *rsp)
101{
102 bool need_wait, need_sync;
103
104 spin_lock_irq(&rsp->rss_lock);
105 need_wait = rsp->gp_count++;
106 need_sync = rsp->gp_state == GP_IDLE;
107 if (need_sync)
108 rsp->gp_state = GP_PENDING;
109 spin_unlock_irq(&rsp->rss_lock);
110
111 BUG_ON(need_wait && need_sync);
112
113 if (need_sync) {
114 gp_ops[rsp->gp_type].sync();
115 rsp->gp_state = GP_PASSED;
116 wake_up_all(&rsp->gp_wait);
117 } else if (need_wait) {
118 wait_event(rsp->gp_wait, rsp->gp_state == GP_PASSED);
119 } else {
120 /*
121 * Possible when there's a pending CB from a rcu_sync_exit().
122 * Nobody has yet been allowed the 'fast' path and thus we can
123 * avoid doing any sync(). The callback will get 'dropped'.
124 */
125 BUG_ON(rsp->gp_state != GP_PASSED);
126 }
127}
128
129/**
130 * rcu_sync_func() - Callback function managing reader access to fastpath
131 * @rsp: Pointer to rcu_sync structure to use for synchronization
132 *
133 * This function is passed to one of the call_rcu() functions by
 134 * rcu_sync_exit(), so that it is invoked after a grace period following
 135 * that invocation of rcu_sync_exit(). It takes action based on events that
136 * have taken place in the meantime, so that closely spaced rcu_sync_enter()
137 * and rcu_sync_exit() pairs need not wait for a grace period.
138 *
139 * If another rcu_sync_enter() is invoked before the grace period
140 * ended, reset state to allow the next rcu_sync_exit() to let the
141 * readers back onto their fastpaths (after a grace period). If both
142 * another rcu_sync_enter() and its matching rcu_sync_exit() are invoked
143 * before the grace period ended, re-invoke call_rcu() on behalf of that
144 * rcu_sync_exit(). Otherwise, set all state back to idle so that readers
145 * can again use their fastpaths.
146 */
147static void rcu_sync_func(struct rcu_head *rcu)
148{
149 struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head);
150 unsigned long flags;
151
152 BUG_ON(rsp->gp_state != GP_PASSED);
153 BUG_ON(rsp->cb_state == CB_IDLE);
154
155 spin_lock_irqsave(&rsp->rss_lock, flags);
156 if (rsp->gp_count) {
157 /*
 158 * A new rcu_sync_enter() has happened; drop the callback.
159 */
160 rsp->cb_state = CB_IDLE;
161 } else if (rsp->cb_state == CB_REPLAY) {
162 /*
163 * A new rcu_sync_exit() has happened; requeue the callback
164 * to catch a later GP.
165 */
166 rsp->cb_state = CB_PENDING;
167 gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func);
168 } else {
169 /*
 170 * We're at least a GP after rcu_sync_exit(); everybody will now
 171 * have observed the write-side critical section. Let 'em rip!
172 */
173 rsp->cb_state = CB_IDLE;
174 rsp->gp_state = GP_IDLE;
175 }
176 spin_unlock_irqrestore(&rsp->rss_lock, flags);
177}
178
179/**
 180 * rcu_sync_exit() - Allow readers back onto fast path after grace period
181 * @rsp: Pointer to rcu_sync structure to use for synchronization
182 *
183 * This function is used by updaters who have completed, and can therefore
184 * now allow readers to make use of their fastpaths after a grace period
185 * has elapsed. After this grace period has completed, all subsequent
186 * calls to rcu_sync_is_idle() will return true, which tells readers that
187 * they can once again use their fastpaths.
188 */
189void rcu_sync_exit(struct rcu_sync *rsp)
190{
191 spin_lock_irq(&rsp->rss_lock);
192 if (!--rsp->gp_count) {
193 if (rsp->cb_state == CB_IDLE) {
194 rsp->cb_state = CB_PENDING;
195 gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func);
196 } else if (rsp->cb_state == CB_PENDING) {
197 rsp->cb_state = CB_REPLAY;
198 }
199 }
200 spin_unlock_irq(&rsp->rss_lock);
201}
202
203/**
204 * rcu_sync_dtor() - Clean up an rcu_sync structure
205 * @rsp: Pointer to rcu_sync structure to be cleaned up
206 */
207void rcu_sync_dtor(struct rcu_sync *rsp)
208{
209 int cb_state;
210
211 BUG_ON(rsp->gp_count);
212
213 spin_lock_irq(&rsp->rss_lock);
214 if (rsp->cb_state == CB_REPLAY)
215 rsp->cb_state = CB_PENDING;
216 cb_state = rsp->cb_state;
217 spin_unlock_irq(&rsp->rss_lock);
218
219 if (cb_state != CB_IDLE) {
220 gp_ops[rsp->gp_type].wait();
221 BUG_ON(rsp->cb_state != CB_IDLE);
222 }
223}
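
The new kernel/rcu/sync.c above is the machinery percpu-rwsem now builds on: a writer brackets its update with rcu_sync_enter()/rcu_sync_exit(), and readers poll rcu_sync_is_idle() to choose between fast and slow paths. The toy model below only demonstrates that enter/exit/is_idle contract with a mutex-protected counter; it deliberately omits the grace-period waits and the call_rcu() state machine that make the real readers lock-free, so "idle" flips back immediately on exit rather than one grace period later.

/* Toy model of the rcu_sync_enter()/rcu_sync_exit()/rcu_sync_is_idle()
 * contract; the grace-period handling of the real code is left out. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t rss_lock = PTHREAD_MUTEX_INITIALIZER;
static int gp_count;			/* writers currently inside enter/exit */

static bool toy_rcu_sync_is_idle(void)
{
	bool idle;

	pthread_mutex_lock(&rss_lock);
	idle = (gp_count == 0);
	pthread_mutex_unlock(&rss_lock);
	return idle;
}

static void toy_rcu_sync_enter(void)
{
	pthread_mutex_lock(&rss_lock);
	gp_count++;			/* the real code also waits for a GP */
	pthread_mutex_unlock(&rss_lock);
}

static void toy_rcu_sync_exit(void)
{
	pthread_mutex_lock(&rss_lock);
	gp_count--;			/* the real code defers this via call_rcu() */
	pthread_mutex_unlock(&rss_lock);
}

int main(void)
{
	printf("idle before enter: %d\n", toy_rcu_sync_is_idle());	/* 1 */
	toy_rcu_sync_enter();		/* readers must now take the slow path */
	printf("idle during update: %d\n", toy_rcu_sync_is_idle());	/* 0 */
	toy_rcu_sync_exit();		/* fast path allowed again */
	printf("idle after exit: %d\n", toy_rcu_sync_is_idle());	/* 1 */
	return 0;
}
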
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index d0471056d0af..944b1b491ed8 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -44,7 +44,7 @@ struct rcu_ctrlblk;
44static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); 44static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
45static void rcu_process_callbacks(struct softirq_action *unused); 45static void rcu_process_callbacks(struct softirq_action *unused);
46static void __call_rcu(struct rcu_head *head, 46static void __call_rcu(struct rcu_head *head,
47 void (*func)(struct rcu_head *rcu), 47 rcu_callback_t func,
48 struct rcu_ctrlblk *rcp); 48 struct rcu_ctrlblk *rcp);
49 49
50#include "tiny_plugin.h" 50#include "tiny_plugin.h"
@@ -203,7 +203,7 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
203 * Helper function for call_rcu() and call_rcu_bh(). 203 * Helper function for call_rcu() and call_rcu_bh().
204 */ 204 */
205static void __call_rcu(struct rcu_head *head, 205static void __call_rcu(struct rcu_head *head,
206 void (*func)(struct rcu_head *rcu), 206 rcu_callback_t func,
207 struct rcu_ctrlblk *rcp) 207 struct rcu_ctrlblk *rcp)
208{ 208{
209 unsigned long flags; 209 unsigned long flags;
@@ -229,7 +229,7 @@ static void __call_rcu(struct rcu_head *head,
229 * period. But since we have but one CPU, that would be after any 229 * period. But since we have but one CPU, that would be after any
230 * quiescent state. 230 * quiescent state.
231 */ 231 */
232void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 232void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
233{ 233{
234 __call_rcu(head, func, &rcu_sched_ctrlblk); 234 __call_rcu(head, func, &rcu_sched_ctrlblk);
235} 235}
@@ -239,7 +239,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
239 * Post an RCU bottom-half callback to be invoked after any subsequent 239 * Post an RCU bottom-half callback to be invoked after any subsequent
240 * quiescent state. 240 * quiescent state.
241 */ 241 */
242void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 242void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
243{ 243{
244 __call_rcu(head, func, &rcu_bh_ctrlblk); 244 __call_rcu(head, func, &rcu_bh_ctrlblk);
245} 245}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9f75f25cc5d9..f07343b54fe5 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -71,7 +71,6 @@ MODULE_ALIAS("rcutree");
71static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; 71static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
72static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; 72static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
73static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; 73static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
74static struct lock_class_key rcu_exp_sched_class[RCU_NUM_LVLS];
75 74
76/* 75/*
77 * In order to export the rcu_state name to the tracing tools, it 76 * In order to export the rcu_state name to the tracing tools, it
@@ -98,7 +97,7 @@ struct rcu_state sname##_state = { \
98 .level = { &sname##_state.node[0] }, \ 97 .level = { &sname##_state.node[0] }, \
99 .rda = &sname##_data, \ 98 .rda = &sname##_data, \
100 .call = cr, \ 99 .call = cr, \
101 .fqs_state = RCU_GP_IDLE, \ 100 .gp_state = RCU_GP_IDLE, \
102 .gpnum = 0UL - 300UL, \ 101 .gpnum = 0UL - 300UL, \
103 .completed = 0UL - 300UL, \ 102 .completed = 0UL - 300UL, \
104 .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ 103 .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
@@ -161,6 +160,8 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
161static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); 160static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
162static void invoke_rcu_core(void); 161static void invoke_rcu_core(void);
163static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); 162static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
163static void rcu_report_exp_rdp(struct rcu_state *rsp,
164 struct rcu_data *rdp, bool wake);
164 165
165/* rcuc/rcub kthread realtime priority */ 166/* rcuc/rcub kthread realtime priority */
166#ifdef CONFIG_RCU_KTHREAD_PRIO 167#ifdef CONFIG_RCU_KTHREAD_PRIO
@@ -245,21 +246,33 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
245 */ 246 */
246void rcu_sched_qs(void) 247void rcu_sched_qs(void)
247{ 248{
248 if (!__this_cpu_read(rcu_sched_data.passed_quiesce)) { 249 unsigned long flags;
250
251 if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) {
249 trace_rcu_grace_period(TPS("rcu_sched"), 252 trace_rcu_grace_period(TPS("rcu_sched"),
250 __this_cpu_read(rcu_sched_data.gpnum), 253 __this_cpu_read(rcu_sched_data.gpnum),
251 TPS("cpuqs")); 254 TPS("cpuqs"));
252 __this_cpu_write(rcu_sched_data.passed_quiesce, 1); 255 __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false);
256 if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
257 return;
258 local_irq_save(flags);
259 if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) {
260 __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false);
261 rcu_report_exp_rdp(&rcu_sched_state,
262 this_cpu_ptr(&rcu_sched_data),
263 true);
264 }
265 local_irq_restore(flags);
253 } 266 }
254} 267}
255 268
256void rcu_bh_qs(void) 269void rcu_bh_qs(void)
257{ 270{
258 if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) { 271 if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
259 trace_rcu_grace_period(TPS("rcu_bh"), 272 trace_rcu_grace_period(TPS("rcu_bh"),
260 __this_cpu_read(rcu_bh_data.gpnum), 273 __this_cpu_read(rcu_bh_data.gpnum),
261 TPS("cpuqs")); 274 TPS("cpuqs"));
262 __this_cpu_write(rcu_bh_data.passed_quiesce, 1); 275 __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
263 } 276 }
264} 277}
265 278
@@ -337,12 +350,14 @@ static void rcu_momentary_dyntick_idle(void)
337 */ 350 */
338void rcu_note_context_switch(void) 351void rcu_note_context_switch(void)
339{ 352{
353 barrier(); /* Avoid RCU read-side critical sections leaking down. */
340 trace_rcu_utilization(TPS("Start context switch")); 354 trace_rcu_utilization(TPS("Start context switch"));
341 rcu_sched_qs(); 355 rcu_sched_qs();
342 rcu_preempt_note_context_switch(); 356 rcu_preempt_note_context_switch();
343 if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) 357 if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
344 rcu_momentary_dyntick_idle(); 358 rcu_momentary_dyntick_idle();
345 trace_rcu_utilization(TPS("End context switch")); 359 trace_rcu_utilization(TPS("End context switch"));
360 barrier(); /* Avoid RCU read-side critical sections leaking up. */
346} 361}
347EXPORT_SYMBOL_GPL(rcu_note_context_switch); 362EXPORT_SYMBOL_GPL(rcu_note_context_switch);
348 363
@@ -353,12 +368,19 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
353 * RCU flavors in desperate need of a quiescent state, which will normally 368 * RCU flavors in desperate need of a quiescent state, which will normally
354 * be none of them). Either way, do a lightweight quiescent state for 369 * be none of them). Either way, do a lightweight quiescent state for
355 * all RCU flavors. 370 * all RCU flavors.
371 *
372 * The barrier() calls are redundant in the common case when this is
373 * called externally, but just in case this is called from within this
374 * file.
375 *
356 */ 376 */
357void rcu_all_qs(void) 377void rcu_all_qs(void)
358{ 378{
379 barrier(); /* Avoid RCU read-side critical sections leaking down. */
359 if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) 380 if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
360 rcu_momentary_dyntick_idle(); 381 rcu_momentary_dyntick_idle();
361 this_cpu_inc(rcu_qs_ctr); 382 this_cpu_inc(rcu_qs_ctr);
383 barrier(); /* Avoid RCU read-side critical sections leaking up. */
362} 384}
363EXPORT_SYMBOL_GPL(rcu_all_qs); 385EXPORT_SYMBOL_GPL(rcu_all_qs);
364 386
@@ -1744,9 +1766,9 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
1744 */ 1766 */
1745 rdp->gpnum = rnp->gpnum; 1767 rdp->gpnum = rnp->gpnum;
1746 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); 1768 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
1747 rdp->passed_quiesce = 0; 1769 rdp->cpu_no_qs.b.norm = true;
1748 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); 1770 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
1749 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); 1771 rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask);
1750 zero_cpu_stall_ticks(rdp); 1772 zero_cpu_stall_ticks(rdp);
1751 WRITE_ONCE(rdp->gpwrap, false); 1773 WRITE_ONCE(rdp->gpwrap, false);
1752 } 1774 }
@@ -1927,16 +1949,15 @@ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
1927/* 1949/*
1928 * Do one round of quiescent-state forcing. 1950 * Do one round of quiescent-state forcing.
1929 */ 1951 */
1930static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) 1952static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time)
1931{ 1953{
1932 int fqs_state = fqs_state_in;
1933 bool isidle = false; 1954 bool isidle = false;
1934 unsigned long maxj; 1955 unsigned long maxj;
1935 struct rcu_node *rnp = rcu_get_root(rsp); 1956 struct rcu_node *rnp = rcu_get_root(rsp);
1936 1957
1937 WRITE_ONCE(rsp->gp_activity, jiffies); 1958 WRITE_ONCE(rsp->gp_activity, jiffies);
1938 rsp->n_force_qs++; 1959 rsp->n_force_qs++;
1939 if (fqs_state == RCU_SAVE_DYNTICK) { 1960 if (first_time) {
1940 /* Collect dyntick-idle snapshots. */ 1961 /* Collect dyntick-idle snapshots. */
1941 if (is_sysidle_rcu_state(rsp)) { 1962 if (is_sysidle_rcu_state(rsp)) {
1942 isidle = true; 1963 isidle = true;
@@ -1945,7 +1966,6 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1945 force_qs_rnp(rsp, dyntick_save_progress_counter, 1966 force_qs_rnp(rsp, dyntick_save_progress_counter,
1946 &isidle, &maxj); 1967 &isidle, &maxj);
1947 rcu_sysidle_report_gp(rsp, isidle, maxj); 1968 rcu_sysidle_report_gp(rsp, isidle, maxj);
1948 fqs_state = RCU_FORCE_QS;
1949 } else { 1969 } else {
1950 /* Handle dyntick-idle and offline CPUs. */ 1970 /* Handle dyntick-idle and offline CPUs. */
1951 isidle = true; 1971 isidle = true;
@@ -1959,7 +1979,6 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1959 READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); 1979 READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS);
1960 raw_spin_unlock_irq(&rnp->lock); 1980 raw_spin_unlock_irq(&rnp->lock);
1961 } 1981 }
1962 return fqs_state;
1963} 1982}
1964 1983
1965/* 1984/*
@@ -2023,7 +2042,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
2023 /* Declare grace period done. */ 2042 /* Declare grace period done. */
2024 WRITE_ONCE(rsp->completed, rsp->gpnum); 2043 WRITE_ONCE(rsp->completed, rsp->gpnum);
2025 trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); 2044 trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
2026 rsp->fqs_state = RCU_GP_IDLE; 2045 rsp->gp_state = RCU_GP_IDLE;
2027 rdp = this_cpu_ptr(rsp->rda); 2046 rdp = this_cpu_ptr(rsp->rda);
2028 /* Advance CBs to reduce false positives below. */ 2047 /* Advance CBs to reduce false positives below. */
2029 needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp; 2048 needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp;
@@ -2041,7 +2060,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
2041 */ 2060 */
2042static int __noreturn rcu_gp_kthread(void *arg) 2061static int __noreturn rcu_gp_kthread(void *arg)
2043{ 2062{
2044 int fqs_state; 2063 bool first_gp_fqs;
2045 int gf; 2064 int gf;
2046 unsigned long j; 2065 unsigned long j;
2047 int ret; 2066 int ret;
@@ -2073,7 +2092,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
2073 } 2092 }
2074 2093
2075 /* Handle quiescent-state forcing. */ 2094 /* Handle quiescent-state forcing. */
2076 fqs_state = RCU_SAVE_DYNTICK; 2095 first_gp_fqs = true;
2077 j = jiffies_till_first_fqs; 2096 j = jiffies_till_first_fqs;
2078 if (j > HZ) { 2097 if (j > HZ) {
2079 j = HZ; 2098 j = HZ;
@@ -2101,7 +2120,8 @@ static int __noreturn rcu_gp_kthread(void *arg)
2101 trace_rcu_grace_period(rsp->name, 2120 trace_rcu_grace_period(rsp->name,
2102 READ_ONCE(rsp->gpnum), 2121 READ_ONCE(rsp->gpnum),
2103 TPS("fqsstart")); 2122 TPS("fqsstart"));
2104 fqs_state = rcu_gp_fqs(rsp, fqs_state); 2123 rcu_gp_fqs(rsp, first_gp_fqs);
2124 first_gp_fqs = false;
2105 trace_rcu_grace_period(rsp->name, 2125 trace_rcu_grace_period(rsp->name,
2106 READ_ONCE(rsp->gpnum), 2126 READ_ONCE(rsp->gpnum),
2107 TPS("fqsend")); 2127 TPS("fqsend"));
@@ -2337,7 +2357,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
2337 rnp = rdp->mynode; 2357 rnp = rdp->mynode;
2338 raw_spin_lock_irqsave(&rnp->lock, flags); 2358 raw_spin_lock_irqsave(&rnp->lock, flags);
2339 smp_mb__after_unlock_lock(); 2359 smp_mb__after_unlock_lock();
2340 if ((rdp->passed_quiesce == 0 && 2360 if ((rdp->cpu_no_qs.b.norm &&
2341 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || 2361 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
2342 rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || 2362 rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
2343 rdp->gpwrap) { 2363 rdp->gpwrap) {
@@ -2348,7 +2368,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
2348 * We will instead need a new quiescent state that lies 2368 * We will instead need a new quiescent state that lies
2349 * within the current grace period. 2369 * within the current grace period.
2350 */ 2370 */
2351 rdp->passed_quiesce = 0; /* need qs for new gp. */ 2371 rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */
2352 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); 2372 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
2353 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2373 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2354 return; 2374 return;
@@ -2357,7 +2377,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
2357 if ((rnp->qsmask & mask) == 0) { 2377 if ((rnp->qsmask & mask) == 0) {
2358 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2378 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2359 } else { 2379 } else {
2360 rdp->qs_pending = 0; 2380 rdp->core_needs_qs = 0;
2361 2381
2362 /* 2382 /*
2363 * This GP can't end until cpu checks in, so all of our 2383 * This GP can't end until cpu checks in, so all of our
@@ -2388,14 +2408,14 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
2388 * Does this CPU still need to do its part for current grace period? 2408 * Does this CPU still need to do its part for current grace period?
2389 * If no, return and let the other CPUs do their part as well. 2409 * If no, return and let the other CPUs do their part as well.
2390 */ 2410 */
2391 if (!rdp->qs_pending) 2411 if (!rdp->core_needs_qs)
2392 return; 2412 return;
2393 2413
2394 /* 2414 /*
2395 * Was there a quiescent state since the beginning of the grace 2415 * Was there a quiescent state since the beginning of the grace
2396 * period? If no, then exit and wait for the next call. 2416 * period? If no, then exit and wait for the next call.
2397 */ 2417 */
2398 if (!rdp->passed_quiesce && 2418 if (rdp->cpu_no_qs.b.norm &&
2399 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) 2419 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr))
2400 return; 2420 return;
2401 2421
@@ -3017,7 +3037,7 @@ static void rcu_leak_callback(struct rcu_head *rhp)
3017 * is expected to specify a CPU. 3037 * is expected to specify a CPU.
3018 */ 3038 */
3019static void 3039static void
3020__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), 3040__call_rcu(struct rcu_head *head, rcu_callback_t func,
3021 struct rcu_state *rsp, int cpu, bool lazy) 3041 struct rcu_state *rsp, int cpu, bool lazy)
3022{ 3042{
3023 unsigned long flags; 3043 unsigned long flags;
@@ -3088,7 +3108,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
3088/* 3108/*
3089 * Queue an RCU-sched callback for invocation after a grace period. 3109 * Queue an RCU-sched callback for invocation after a grace period.
3090 */ 3110 */
3091void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 3111void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
3092{ 3112{
3093 __call_rcu(head, func, &rcu_sched_state, -1, 0); 3113 __call_rcu(head, func, &rcu_sched_state, -1, 0);
3094} 3114}
@@ -3097,7 +3117,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
3097/* 3117/*
3098 * Queue an RCU callback for invocation after a quicker grace period. 3118 * Queue an RCU callback for invocation after a quicker grace period.
3099 */ 3119 */
3100void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 3120void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
3101{ 3121{
3102 __call_rcu(head, func, &rcu_bh_state, -1, 0); 3122 __call_rcu(head, func, &rcu_bh_state, -1, 0);
3103} 3123}
@@ -3111,7 +3131,7 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
3111 * function may only be called from __kfree_rcu(). 3131 * function may only be called from __kfree_rcu().
3112 */ 3132 */
3113void kfree_call_rcu(struct rcu_head *head, 3133void kfree_call_rcu(struct rcu_head *head,
3114 void (*func)(struct rcu_head *rcu)) 3134 rcu_callback_t func)
3115{ 3135{
3116 __call_rcu(head, func, rcu_state_p, -1, 1); 3136 __call_rcu(head, func, rcu_state_p, -1, 1);
3117} 3137}
@@ -3379,6 +3399,191 @@ static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
3379 return rcu_seq_done(&rsp->expedited_sequence, s); 3399 return rcu_seq_done(&rsp->expedited_sequence, s);
3380} 3400}
3381 3401
3402/*
3403 * Reset the ->expmaskinit values in the rcu_node tree to reflect any
3404 * recent CPU-online activity. Note that these masks are not cleared
3405 * when CPUs go offline, so they reflect the union of all CPUs that have
3406 * ever been online. This means that this function normally takes its
3407 * no-work-to-do fastpath.
3408 */
3409static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
3410{
3411 bool done;
3412 unsigned long flags;
3413 unsigned long mask;
3414 unsigned long oldmask;
3415 int ncpus = READ_ONCE(rsp->ncpus);
3416 struct rcu_node *rnp;
3417 struct rcu_node *rnp_up;
3418
3419 /* If no new CPUs onlined since last time, nothing to do. */
3420 if (likely(ncpus == rsp->ncpus_snap))
3421 return;
3422 rsp->ncpus_snap = ncpus;
3423
3424 /*
3425 * Each pass through the following loop propagates newly onlined
3426 * CPUs for the current rcu_node structure up the rcu_node tree.
3427 */
3428 rcu_for_each_leaf_node(rsp, rnp) {
3429 raw_spin_lock_irqsave(&rnp->lock, flags);
3430 smp_mb__after_unlock_lock();
3431 if (rnp->expmaskinit == rnp->expmaskinitnext) {
3432 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3433 continue; /* No new CPUs, nothing to do. */
3434 }
3435
3436 /* Update this node's mask, track old value for propagation. */
3437 oldmask = rnp->expmaskinit;
3438 rnp->expmaskinit = rnp->expmaskinitnext;
3439 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3440
 3441		/* If it was already nonzero, nothing to propagate. */
3442 if (oldmask)
3443 continue;
3444
3445 /* Propagate the new CPU up the tree. */
3446 mask = rnp->grpmask;
3447 rnp_up = rnp->parent;
3448 done = false;
3449 while (rnp_up) {
3450 raw_spin_lock_irqsave(&rnp_up->lock, flags);
3451 smp_mb__after_unlock_lock();
3452 if (rnp_up->expmaskinit)
3453 done = true;
3454 rnp_up->expmaskinit |= mask;
3455 raw_spin_unlock_irqrestore(&rnp_up->lock, flags);
3456 if (done)
3457 break;
3458 mask = rnp_up->grpmask;
3459 rnp_up = rnp_up->parent;
3460 }
3461 }
3462}
3463
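
The hot-plug reset above only climbs the tree until it finds an ancestor whose ->expmaskinit was already nonzero, since everything above that point is necessarily set as well. A minimal userspace sketch of that early-stopping walk, not part of the patch and using an invented toy_node type rather than the kernel's rcu_node, might look like this:

/*
 * Toy model of propagating a leaf's "has CPUs" bit up a tree, stopping
 * at the first ancestor that was already marked (its subtree is already
 * accounted for at all higher levels).  Illustrative only; the names
 * and layout are made up and do not match kernel/rcu/tree.c.
 */
#include <stdio.h>

struct toy_node {
	unsigned long initmask;  /* which children have ever had CPUs */
	unsigned long grpmask;   /* this node's bit in its parent     */
	struct toy_node *parent;
};

static void propagate_online(struct toy_node *leaf)
{
	unsigned long mask = leaf->grpmask;
	struct toy_node *up = leaf->parent;

	while (up) {
		int done = up->initmask != 0;  /* ancestor already marked? */

		up->initmask |= mask;
		if (done)
			break;                 /* higher levels already set */
		mask = up->grpmask;
		up = up->parent;
	}
}

int main(void)
{
	struct toy_node root  = { 0, 0, NULL };
	struct toy_node inner = { 0, 0x2, &root };
	struct toy_node leaf  = { 0x1, 0x4, &inner };

	propagate_online(&leaf);
	printf("inner=%#lx root=%#lx\n", inner.initmask, root.initmask);
	return 0;
}
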
3464/*
3465 * Reset the ->expmask values in the rcu_node tree in preparation for
3466 * a new expedited grace period.
3467 */
3468static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
3469{
3470 unsigned long flags;
3471 struct rcu_node *rnp;
3472
3473 sync_exp_reset_tree_hotplug(rsp);
3474 rcu_for_each_node_breadth_first(rsp, rnp) {
3475 raw_spin_lock_irqsave(&rnp->lock, flags);
3476 smp_mb__after_unlock_lock();
3477 WARN_ON_ONCE(rnp->expmask);
3478 rnp->expmask = rnp->expmaskinit;
3479 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3480 }
3481}
3482
3483/*
3484 * Return non-zero if there is no RCU expedited grace period in progress
3485 * for the specified rcu_node structure, in other words, if all CPUs and
3486 * tasks covered by the specified rcu_node structure have done their bit
3487 * for the current expedited grace period. Works only for preemptible
3488 * RCU -- other RCU implementation use other means.
3489 *
3490 * Caller must hold the root rcu_node's exp_funnel_mutex.
3491 */
3492static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
3493{
3494 return rnp->exp_tasks == NULL &&
3495 READ_ONCE(rnp->expmask) == 0;
3496}
3497
3498/*
3499 * Report the exit from RCU read-side critical section for the last task
3500 * that queued itself during or before the current expedited preemptible-RCU
3501 * grace period. This event is reported either to the rcu_node structure on
3502 * which the task was queued or to one of that rcu_node structure's ancestors,
3503 * recursively up the tree. (Calm down, calm down, we do the recursion
3504 * iteratively!)
3505 *
3506 * Caller must hold the root rcu_node's exp_funnel_mutex and the
3507 * specified rcu_node structure's ->lock.
3508 */
3509static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
3510 bool wake, unsigned long flags)
3511 __releases(rnp->lock)
3512{
3513 unsigned long mask;
3514
3515 for (;;) {
3516 if (!sync_rcu_preempt_exp_done(rnp)) {
3517 if (!rnp->expmask)
3518 rcu_initiate_boost(rnp, flags);
3519 else
3520 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3521 break;
3522 }
3523 if (rnp->parent == NULL) {
3524 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3525 if (wake) {
3526 smp_mb(); /* EGP done before wake_up(). */
3527 wake_up(&rsp->expedited_wq);
3528 }
3529 break;
3530 }
3531 mask = rnp->grpmask;
3532 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
3533 rnp = rnp->parent;
3534 raw_spin_lock(&rnp->lock); /* irqs already disabled */
3535 smp_mb__after_unlock_lock();
3536 WARN_ON_ONCE(!(rnp->expmask & mask));
3537 rnp->expmask &= ~mask;
3538 }
3539}
3540
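
__rcu_report_exp_rnp() above replaces recursion with an explicit climb: clear this subtree's bit in the parent, and keep going only while the node just left has nothing outstanding. A toy version of that walk, with locking, the blocked-task list, and the waitqueue omitted and all names invented, illustrates the shape of the loop:

/*
 * Toy sketch of the iterative "report completion up the tree" walk:
 * check this subtree in with its parent and keep climbing only while
 * the node just examined has nothing left outstanding.
 */
#include <stdio.h>

struct toy_node {
	unsigned long pending;   /* children/CPUs still to check in */
	unsigned long grpmask;   /* this node's bit in its parent   */
	struct toy_node *parent;
};

static void report_done(struct toy_node *np)
{
	for (;;) {
		if (np->pending)               /* someone still outstanding */
			return;
		if (!np->parent) {
			printf("expedited GP complete\n");  /* would wake waiter */
			return;
		}
		np->parent->pending &= ~np->grpmask;  /* check this subtree in */
		np = np->parent;
	}
}

int main(void)
{
	struct toy_node root  = { 0x3, 0,   NULL };
	struct toy_node left  = { 0x0, 0x1, &root };
	struct toy_node right = { 0x4, 0x2, &root };

	report_done(&left);    /* root still waits on the right subtree */
	right.pending = 0;
	report_done(&right);   /* now the whole tree has checked in */
	return 0;
}
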
3541/*
3542 * Report expedited quiescent state for specified node. This is a
3543 * lock-acquisition wrapper function for __rcu_report_exp_rnp().
3544 *
3545 * Caller must hold the root rcu_node's exp_funnel_mutex.
3546 */
3547static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
3548 struct rcu_node *rnp, bool wake)
3549{
3550 unsigned long flags;
3551
3552 raw_spin_lock_irqsave(&rnp->lock, flags);
3553 smp_mb__after_unlock_lock();
3554 __rcu_report_exp_rnp(rsp, rnp, wake, flags);
3555}
3556
3557/*
3558 * Report expedited quiescent state for multiple CPUs, all covered by the
3559 * specified leaf rcu_node structure. Caller must hold the root
3560 * rcu_node's exp_funnel_mutex.
3561 */
3562static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
3563 unsigned long mask, bool wake)
3564{
3565 unsigned long flags;
3566
3567 raw_spin_lock_irqsave(&rnp->lock, flags);
3568 smp_mb__after_unlock_lock();
3569 if (!(rnp->expmask & mask)) {
3570 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3571 return;
3572 }
3573 rnp->expmask &= ~mask;
3574 __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */
3575}
3576
3577/*
3578 * Report expedited quiescent state for specified rcu_data (CPU).
3579 * Caller must hold the root rcu_node's exp_funnel_mutex.
3580 */
3581static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
3582 bool wake)
3583{
3584 rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake);
3585}
3586
3382/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ 3587/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
3383static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, 3588static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp,
3384 struct rcu_data *rdp, 3589 struct rcu_data *rdp,
@@ -3455,16 +3660,111 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
3455} 3660}
3456 3661
3457/* Invoked on each online non-idle CPU for expedited quiescent state. */ 3662/* Invoked on each online non-idle CPU for expedited quiescent state. */
3458static int synchronize_sched_expedited_cpu_stop(void *data) 3663static void sync_sched_exp_handler(void *data)
3459{ 3664{
3460 struct rcu_data *rdp = data; 3665 struct rcu_data *rdp;
3461 struct rcu_state *rsp = rdp->rsp; 3666 struct rcu_node *rnp;
3667 struct rcu_state *rsp = data;
3462 3668
3463 /* We are here: If we are last, do the wakeup. */ 3669 rdp = this_cpu_ptr(rsp->rda);
3464 rdp->exp_done = true; 3670 rnp = rdp->mynode;
3465 if (atomic_dec_and_test(&rsp->expedited_need_qs)) 3671 if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
3466 wake_up(&rsp->expedited_wq); 3672 __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
3467 return 0; 3673 return;
3674 __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
3675 resched_cpu(smp_processor_id());
3676}
3677
3678/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
3679static void sync_sched_exp_online_cleanup(int cpu)
3680{
3681 struct rcu_data *rdp;
3682 int ret;
3683 struct rcu_node *rnp;
3684 struct rcu_state *rsp = &rcu_sched_state;
3685
3686 rdp = per_cpu_ptr(rsp->rda, cpu);
3687 rnp = rdp->mynode;
3688 if (!(READ_ONCE(rnp->expmask) & rdp->grpmask))
3689 return;
3690 ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0);
3691 WARN_ON_ONCE(ret);
3692}
3693
3694/*
3695 * Select the nodes that the upcoming expedited grace period needs
3696 * to wait for.
3697 */
3698static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
3699 smp_call_func_t func)
3700{
3701 int cpu;
3702 unsigned long flags;
3703 unsigned long mask;
3704 unsigned long mask_ofl_test;
3705 unsigned long mask_ofl_ipi;
3706 int ret;
3707 struct rcu_node *rnp;
3708
3709 sync_exp_reset_tree(rsp);
3710 rcu_for_each_leaf_node(rsp, rnp) {
3711 raw_spin_lock_irqsave(&rnp->lock, flags);
3712 smp_mb__after_unlock_lock();
3713
3714 /* Each pass checks a CPU for identity, offline, and idle. */
3715 mask_ofl_test = 0;
3716 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
3717 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
3718 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
3719
3720 if (raw_smp_processor_id() == cpu ||
3721 !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
3722 mask_ofl_test |= rdp->grpmask;
3723 }
3724 mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
3725
3726 /*
3727 * Need to wait for any blocked tasks as well. Note that
3728 * additional blocking tasks will also block the expedited
3729 * GP until such time as the ->expmask bits are cleared.
3730 */
3731 if (rcu_preempt_has_tasks(rnp))
3732 rnp->exp_tasks = rnp->blkd_tasks.next;
3733 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3734
3735 /* IPI the remaining CPUs for expedited quiescent state. */
3736 mask = 1;
3737 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
3738 if (!(mask_ofl_ipi & mask))
3739 continue;
3740retry_ipi:
3741 ret = smp_call_function_single(cpu, func, rsp, 0);
3742 if (!ret) {
3743 mask_ofl_ipi &= ~mask;
3744 } else {
3745 /* Failed, raced with offline. */
3746 raw_spin_lock_irqsave(&rnp->lock, flags);
3747 if (cpu_online(cpu) &&
3748 (rnp->expmask & mask)) {
3749 raw_spin_unlock_irqrestore(&rnp->lock,
3750 flags);
3751 schedule_timeout_uninterruptible(1);
3752 if (cpu_online(cpu) &&
3753 (rnp->expmask & mask))
3754 goto retry_ipi;
3755 raw_spin_lock_irqsave(&rnp->lock,
3756 flags);
3757 }
3758 if (!(rnp->expmask & mask))
3759 mask_ofl_ipi &= ~mask;
3760 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3761 }
3762 }
3763 /* Report quiescent states for those that went offline. */
3764 mask_ofl_test |= mask_ofl_ipi;
3765 if (mask_ofl_test)
3766 rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
3767 }
3468} 3768}
3469 3769
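
The idle test inside sync_rcu_exp_select_cpus() relies on the dynticks counter being odd while a CPU is non-idle and even while it is idle, with atomic_add_return(0, ...) serving as a fully ordered read. A rough userspace model of that convention, using C11 atomics purely for illustration and not the kernel's implementation, follows:

/*
 * Minimal model of the even/odd dyntick-idle convention: the counter is
 * incremented on every idle entry and exit, so an odd value means
 * "currently non-idle" (must be asked for a quiescent state) and an
 * even value means the CPU is idle and already counts as quiesced.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_long dynticks = 1;  /* odd: pretend the CPU starts non-idle */

static void idle_enter(void) { atomic_fetch_add(&dynticks, 1); }  /* -> even */
static void idle_exit(void)  { atomic_fetch_add(&dynticks, 1); }  /* -> odd  */

/* Fully ordered read, analogous to atomic_add_return(0, &rdtp->dynticks). */
static bool cpu_is_nonidle(void)
{
	return atomic_fetch_add(&dynticks, 0) & 0x1;
}

int main(void)
{
	printf("non-idle? %d\n", cpu_is_nonidle());  /* 1: would be IPIed  */
	idle_enter();
	printf("non-idle? %d\n", cpu_is_nonidle());  /* 0: skipped, no IPI */
	idle_exit();
	printf("non-idle? %d\n", cpu_is_nonidle());  /* 1 again            */
	return 0;
}
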
3470static void synchronize_sched_expedited_wait(struct rcu_state *rsp) 3770static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
@@ -3472,7 +3772,9 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
3472 int cpu; 3772 int cpu;
3473 unsigned long jiffies_stall; 3773 unsigned long jiffies_stall;
3474 unsigned long jiffies_start; 3774 unsigned long jiffies_start;
3475 struct rcu_data *rdp; 3775 unsigned long mask;
3776 struct rcu_node *rnp;
3777 struct rcu_node *rnp_root = rcu_get_root(rsp);
3476 int ret; 3778 int ret;
3477 3779
3478 jiffies_stall = rcu_jiffies_till_stall_check(); 3780 jiffies_stall = rcu_jiffies_till_stall_check();
@@ -3481,33 +3783,43 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
3481 for (;;) { 3783 for (;;) {
3482 ret = wait_event_interruptible_timeout( 3784 ret = wait_event_interruptible_timeout(
3483 rsp->expedited_wq, 3785 rsp->expedited_wq,
3484 !atomic_read(&rsp->expedited_need_qs), 3786 sync_rcu_preempt_exp_done(rnp_root),
3485 jiffies_stall); 3787 jiffies_stall);
3486 if (ret > 0) 3788 if (ret > 0)
3487 return; 3789 return;
3488 if (ret < 0) { 3790 if (ret < 0) {
3489 /* Hit a signal, disable CPU stall warnings. */ 3791 /* Hit a signal, disable CPU stall warnings. */
3490 wait_event(rsp->expedited_wq, 3792 wait_event(rsp->expedited_wq,
3491 !atomic_read(&rsp->expedited_need_qs)); 3793 sync_rcu_preempt_exp_done(rnp_root));
3492 return; 3794 return;
3493 } 3795 }
3494 pr_err("INFO: %s detected expedited stalls on CPUs: {", 3796 pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
3495 rsp->name); 3797 rsp->name);
3496 for_each_online_cpu(cpu) { 3798 rcu_for_each_leaf_node(rsp, rnp) {
3497 rdp = per_cpu_ptr(rsp->rda, cpu); 3799 (void)rcu_print_task_exp_stall(rnp);
3498 3800 mask = 1;
3499 if (rdp->exp_done) 3801 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
3500 continue; 3802 struct rcu_data *rdp;
3501 pr_cont(" %d", cpu); 3803
3804 if (!(rnp->expmask & mask))
3805 continue;
3806 rdp = per_cpu_ptr(rsp->rda, cpu);
3807 pr_cont(" %d-%c%c%c", cpu,
3808 "O."[cpu_online(cpu)],
3809 "o."[!!(rdp->grpmask & rnp->expmaskinit)],
3810 "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
3811 }
3812 mask <<= 1;
3502 } 3813 }
3503 pr_cont(" } %lu jiffies s: %lu\n", 3814 pr_cont(" } %lu jiffies s: %lu\n",
3504 jiffies - jiffies_start, rsp->expedited_sequence); 3815 jiffies - jiffies_start, rsp->expedited_sequence);
3505 for_each_online_cpu(cpu) { 3816 rcu_for_each_leaf_node(rsp, rnp) {
3506 rdp = per_cpu_ptr(rsp->rda, cpu); 3817 mask = 1;
3507 3818 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
3508 if (rdp->exp_done) 3819 if (!(rnp->expmask & mask))
3509 continue; 3820 continue;
3510 dump_cpu_task(cpu); 3821 dump_cpu_task(cpu);
3822 }
3511 } 3823 }
3512 jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; 3824 jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
3513 } 3825 }
@@ -3531,7 +3843,6 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
3531 */ 3843 */
3532void synchronize_sched_expedited(void) 3844void synchronize_sched_expedited(void)
3533{ 3845{
3534 int cpu;
3535 unsigned long s; 3846 unsigned long s;
3536 struct rcu_node *rnp; 3847 struct rcu_node *rnp;
3537 struct rcu_state *rsp = &rcu_sched_state; 3848 struct rcu_state *rsp = &rcu_sched_state;
@@ -3539,48 +3850,16 @@ void synchronize_sched_expedited(void)
3539 /* Take a snapshot of the sequence number. */ 3850 /* Take a snapshot of the sequence number. */
3540 s = rcu_exp_gp_seq_snap(rsp); 3851 s = rcu_exp_gp_seq_snap(rsp);
3541 3852
3542 if (!try_get_online_cpus()) {
3543 /* CPU hotplug operation in flight, fall back to normal GP. */
3544 wait_rcu_gp(call_rcu_sched);
3545 atomic_long_inc(&rsp->expedited_normal);
3546 return;
3547 }
3548 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
3549
3550 rnp = exp_funnel_lock(rsp, s); 3853 rnp = exp_funnel_lock(rsp, s);
3551 if (rnp == NULL) { 3854 if (rnp == NULL)
3552 put_online_cpus();
3553 return; /* Someone else did our work for us. */ 3855 return; /* Someone else did our work for us. */
3554 }
3555 3856
3556 rcu_exp_gp_seq_start(rsp); 3857 rcu_exp_gp_seq_start(rsp);
3557 3858 sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler);
3558 /* Stop each CPU that is online, non-idle, and not us. */ 3859 synchronize_sched_expedited_wait(rsp);
3559 init_waitqueue_head(&rsp->expedited_wq);
3560 atomic_set(&rsp->expedited_need_qs, 1); /* Extra count avoids race. */
3561 for_each_online_cpu(cpu) {
3562 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
3563 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
3564
3565 rdp->exp_done = false;
3566
3567 /* Skip our CPU and any idle CPUs. */
3568 if (raw_smp_processor_id() == cpu ||
3569 !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
3570 continue;
3571 atomic_inc(&rsp->expedited_need_qs);
3572 stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop,
3573 rdp, &rdp->exp_stop_work);
3574 }
3575
3576 /* Remove extra count and, if necessary, wait for CPUs to stop. */
3577 if (!atomic_dec_and_test(&rsp->expedited_need_qs))
3578 synchronize_sched_expedited_wait(rsp);
3579 3860
3580 rcu_exp_gp_seq_end(rsp); 3861 rcu_exp_gp_seq_end(rsp);
3581 mutex_unlock(&rnp->exp_funnel_mutex); 3862 mutex_unlock(&rnp->exp_funnel_mutex);
3582
3583 put_online_cpus();
3584} 3863}
3585EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 3864EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
3586 3865
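
With the stop-machine path gone, synchronize_sched_expedited() leans entirely on the sequence-number funnel: a caller snapshots the value the counter must reach, and if another caller drives a full expedited grace period past that value first, the work is already done. A much simplified model of that check follows; the kernel's rcu_seq helpers add memory barriers and other care omitted here, and the names are invented:

/*
 * Simplified model of the "someone else did our work" test: the counter
 * is bumped once when an expedited GP starts (odd) and once when it
 * ends (even).  A waiter snapshots the value its request must reach,
 * so concurrent callers can piggy-back on a single grace period.
 */
#include <stdio.h>

static unsigned long exp_seq;  /* even: idle, odd: GP in progress */

static void exp_gp_start(void) { exp_seq++; }  /* now odd  */
static void exp_gp_end(void)   { exp_seq++; }  /* now even */

/* Value exp_seq must reach before this caller's request is satisfied. */
static unsigned long exp_gp_snap(void)
{
	return (exp_seq + 3) & ~0x1UL;
}

static int exp_gp_done(unsigned long snap)
{
	return (long)(exp_seq - snap) >= 0;  /* wrap-safe comparison */
}

int main(void)
{
	unsigned long snap = exp_gp_snap();  /* needs exp_seq >= 2 */

	printf("done before any GP? %d\n", exp_gp_done(snap));  /* 0 */
	exp_gp_start();
	exp_gp_end();
	printf("done after one GP?  %d\n", exp_gp_done(snap));  /* 1 */
	return 0;
}
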
@@ -3606,11 +3885,11 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
3606 3885
3607 /* Is the RCU core waiting for a quiescent state from this CPU? */ 3886 /* Is the RCU core waiting for a quiescent state from this CPU? */
3608 if (rcu_scheduler_fully_active && 3887 if (rcu_scheduler_fully_active &&
3609 rdp->qs_pending && !rdp->passed_quiesce && 3888 rdp->core_needs_qs && rdp->cpu_no_qs.b.norm &&
3610 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { 3889 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) {
3611 rdp->n_rp_qs_pending++; 3890 rdp->n_rp_core_needs_qs++;
3612 } else if (rdp->qs_pending && 3891 } else if (rdp->core_needs_qs &&
3613 (rdp->passed_quiesce || 3892 (!rdp->cpu_no_qs.b.norm ||
3614 rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { 3893 rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) {
3615 rdp->n_rp_report_qs++; 3894 rdp->n_rp_report_qs++;
3616 return 1; 3895 return 1;
@@ -3901,7 +4180,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
3901 4180
3902 /* Set up local state, ensuring consistent view of global state. */ 4181 /* Set up local state, ensuring consistent view of global state. */
3903 raw_spin_lock_irqsave(&rnp->lock, flags); 4182 raw_spin_lock_irqsave(&rnp->lock, flags);
3904 rdp->beenonline = 1; /* We have now been online. */
3905 rdp->qlen_last_fqs_check = 0; 4183 rdp->qlen_last_fqs_check = 0;
3906 rdp->n_force_qs_snap = rsp->n_force_qs; 4184 rdp->n_force_qs_snap = rsp->n_force_qs;
3907 rdp->blimit = blimit; 4185 rdp->blimit = blimit;
@@ -3923,11 +4201,15 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
3923 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 4201 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
3924 smp_mb__after_unlock_lock(); 4202 smp_mb__after_unlock_lock();
3925 rnp->qsmaskinitnext |= mask; 4203 rnp->qsmaskinitnext |= mask;
4204 rnp->expmaskinitnext |= mask;
4205 if (!rdp->beenonline)
4206 WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1);
4207 rdp->beenonline = true; /* We have now been online. */
3926 rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ 4208 rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
3927 rdp->completed = rnp->completed; 4209 rdp->completed = rnp->completed;
3928 rdp->passed_quiesce = false; 4210 rdp->cpu_no_qs.b.norm = true;
3929 rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); 4211 rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu);
3930 rdp->qs_pending = false; 4212 rdp->core_needs_qs = false;
3931 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); 4213 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
3932 raw_spin_unlock_irqrestore(&rnp->lock, flags); 4214 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3933} 4215}
@@ -3960,6 +4242,7 @@ int rcu_cpu_notify(struct notifier_block *self,
3960 break; 4242 break;
3961 case CPU_ONLINE: 4243 case CPU_ONLINE:
3962 case CPU_DOWN_FAILED: 4244 case CPU_DOWN_FAILED:
4245 sync_sched_exp_online_cleanup(cpu);
3963 rcu_boost_kthread_setaffinity(rnp, -1); 4246 rcu_boost_kthread_setaffinity(rnp, -1);
3964 break; 4247 break;
3965 case CPU_DOWN_PREPARE: 4248 case CPU_DOWN_PREPARE:
@@ -3971,6 +4254,12 @@ int rcu_cpu_notify(struct notifier_block *self,
3971 rcu_cleanup_dying_cpu(rsp); 4254 rcu_cleanup_dying_cpu(rsp);
3972 break; 4255 break;
3973 case CPU_DYING_IDLE: 4256 case CPU_DYING_IDLE:
4257 /* QS for any half-done expedited RCU-sched GP. */
4258 preempt_disable();
4259 rcu_report_exp_rdp(&rcu_sched_state,
4260 this_cpu_ptr(rcu_sched_state.rda), true);
4261 preempt_enable();
4262
3974 for_each_rcu_flavor(rsp) { 4263 for_each_rcu_flavor(rsp) {
3975 rcu_cleanup_dying_idle_cpu(cpu, rsp); 4264 rcu_cleanup_dying_idle_cpu(cpu, rsp);
3976 } 4265 }
@@ -4102,7 +4391,6 @@ static void __init rcu_init_one(struct rcu_state *rsp,
4102 static const char * const buf[] = RCU_NODE_NAME_INIT; 4391 static const char * const buf[] = RCU_NODE_NAME_INIT;
4103 static const char * const fqs[] = RCU_FQS_NAME_INIT; 4392 static const char * const fqs[] = RCU_FQS_NAME_INIT;
4104 static const char * const exp[] = RCU_EXP_NAME_INIT; 4393 static const char * const exp[] = RCU_EXP_NAME_INIT;
4105 static const char * const exp_sched[] = RCU_EXP_SCHED_NAME_INIT;
4106 static u8 fl_mask = 0x1; 4394 static u8 fl_mask = 0x1;
4107 4395
4108 int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ 4396 int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */
@@ -4162,18 +4450,13 @@ static void __init rcu_init_one(struct rcu_state *rsp,
4162 INIT_LIST_HEAD(&rnp->blkd_tasks); 4450 INIT_LIST_HEAD(&rnp->blkd_tasks);
4163 rcu_init_one_nocb(rnp); 4451 rcu_init_one_nocb(rnp);
4164 mutex_init(&rnp->exp_funnel_mutex); 4452 mutex_init(&rnp->exp_funnel_mutex);
4165 if (rsp == &rcu_sched_state) 4453 lockdep_set_class_and_name(&rnp->exp_funnel_mutex,
4166 lockdep_set_class_and_name( 4454 &rcu_exp_class[i], exp[i]);
4167 &rnp->exp_funnel_mutex,
4168 &rcu_exp_sched_class[i], exp_sched[i]);
4169 else
4170 lockdep_set_class_and_name(
4171 &rnp->exp_funnel_mutex,
4172 &rcu_exp_class[i], exp[i]);
4173 } 4455 }
4174 } 4456 }
4175 4457
4176 init_waitqueue_head(&rsp->gp_wq); 4458 init_waitqueue_head(&rsp->gp_wq);
4459 init_waitqueue_head(&rsp->expedited_wq);
4177 rnp = rsp->level[rcu_num_lvls - 1]; 4460 rnp = rsp->level[rcu_num_lvls - 1];
4178 for_each_possible_cpu(i) { 4461 for_each_possible_cpu(i) {
4179 while (i > rnp->grphi) 4462 while (i > rnp->grphi)
@@ -4216,13 +4499,12 @@ static void __init rcu_init_geometry(void)
4216 rcu_fanout_leaf, nr_cpu_ids); 4499 rcu_fanout_leaf, nr_cpu_ids);
4217 4500
4218 /* 4501 /*
4219 * The boot-time rcu_fanout_leaf parameter is only permitted 4502 * The boot-time rcu_fanout_leaf parameter must be at least two
4220 * to increase the leaf-level fanout, not decrease it. Of course, 4503 * and cannot exceed the number of bits in the rcu_node masks.
4221 * the leaf-level fanout cannot exceed the number of bits in 4504 * Complain and fall back to the compile-time values if this
4222 * the rcu_node masks. Complain and fall back to the compile- 4505 * limit is exceeded.
4223 * time values if these limits are exceeded.
4224 */ 4506 */
4225 if (rcu_fanout_leaf < RCU_FANOUT_LEAF || 4507 if (rcu_fanout_leaf < 2 ||
4226 rcu_fanout_leaf > sizeof(unsigned long) * 8) { 4508 rcu_fanout_leaf > sizeof(unsigned long) * 8) {
4227 rcu_fanout_leaf = RCU_FANOUT_LEAF; 4509 rcu_fanout_leaf = RCU_FANOUT_LEAF;
4228 WARN_ON(1); 4510 WARN_ON(1);
@@ -4239,10 +4521,13 @@ static void __init rcu_init_geometry(void)
4239 4521
4240 /* 4522 /*
4241 * The tree must be able to accommodate the configured number of CPUs. 4523 * The tree must be able to accommodate the configured number of CPUs.
4242 * If this limit is exceeded than we have a serious problem elsewhere. 4524 * If this limit is exceeded, fall back to the compile-time values.
4243 */ 4525 */
4244 if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) 4526 if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) {
4245 panic("rcu_init_geometry: rcu_capacity[] is too small"); 4527 rcu_fanout_leaf = RCU_FANOUT_LEAF;
4528 WARN_ON(1);
4529 return;
4530 }
4246 4531
4247 /* Calculate the number of levels in the tree. */ 4532 /* Calculate the number of levels in the tree. */
4248 for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) { 4533 for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) {
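
The geometry code sizes the rcu_node tree from the leaf fanout and the interior fanout: each additional level multiplies capacity by the fanout, and the number of levels is the smallest one whose capacity covers nr_cpu_ids. A rough standalone sketch of that calculation, with illustrative constants that are not the kernel's CONFIG values:

/*
 * Sketch of the tree-geometry calculation: capacity[i] is the number of
 * CPUs a tree with i+1 levels can cover, and the required level count
 * is the first index whose capacity reaches nr_cpu_ids.
 */
#include <stdio.h>

#define MAX_LVLS 4

int main(void)
{
	int fanout_leaf = 16, fanout = 64, nr_cpu_ids = 4096;
	unsigned long capacity[MAX_LVLS];
	int i, levels;

	capacity[0] = fanout_leaf;
	for (i = 1; i < MAX_LVLS; i++)
		capacity[i] = capacity[i - 1] * fanout;

	if (nr_cpu_ids > capacity[MAX_LVLS - 1]) {
		printf("too many CPUs for this geometry\n");
		return 1;
	}
	for (i = 0; nr_cpu_ids > capacity[i]; i++)
		;
	levels = i + 1;
	printf("%d CPUs -> %d level(s), leaf capacity %lu\n",
	       nr_cpu_ids, levels, capacity[0]);
	return 0;
}
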
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 2e991f8361e4..9fb4e238d4dc 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -70,8 +70,6 @@
70# define RCU_NODE_NAME_INIT { "rcu_node_0" } 70# define RCU_NODE_NAME_INIT { "rcu_node_0" }
71# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } 71# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" }
72# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" } 72# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" }
73# define RCU_EXP_SCHED_NAME_INIT \
74 { "rcu_node_exp_sched_0" }
75#elif NR_CPUS <= RCU_FANOUT_2 73#elif NR_CPUS <= RCU_FANOUT_2
76# define RCU_NUM_LVLS 2 74# define RCU_NUM_LVLS 2
77# define NUM_RCU_LVL_0 1 75# define NUM_RCU_LVL_0 1
@@ -81,8 +79,6 @@
81# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } 79# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" }
82# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } 80# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" }
83# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" } 81# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" }
84# define RCU_EXP_SCHED_NAME_INIT \
85 { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1" }
86#elif NR_CPUS <= RCU_FANOUT_3 82#elif NR_CPUS <= RCU_FANOUT_3
87# define RCU_NUM_LVLS 3 83# define RCU_NUM_LVLS 3
88# define NUM_RCU_LVL_0 1 84# define NUM_RCU_LVL_0 1
@@ -93,8 +89,6 @@
93# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } 89# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
94# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } 90# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
95# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" } 91# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" }
96# define RCU_EXP_SCHED_NAME_INIT \
97 { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2" }
98#elif NR_CPUS <= RCU_FANOUT_4 92#elif NR_CPUS <= RCU_FANOUT_4
99# define RCU_NUM_LVLS 4 93# define RCU_NUM_LVLS 4
100# define NUM_RCU_LVL_0 1 94# define NUM_RCU_LVL_0 1
@@ -106,8 +100,6 @@
106# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } 100# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
107# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } 101# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
108# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" } 102# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" }
109# define RCU_EXP_SCHED_NAME_INIT \
110 { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2", "rcu_node_exp_sched_3" }
111#else 103#else
112# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" 104# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
113#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ 105#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
@@ -171,16 +163,21 @@ struct rcu_node {
171 /* an rcu_data structure, otherwise, each */ 163 /* an rcu_data structure, otherwise, each */
172 /* bit corresponds to a child rcu_node */ 164 /* bit corresponds to a child rcu_node */
173 /* structure. */ 165 /* structure. */
174 unsigned long expmask; /* Groups that have ->blkd_tasks */
175 /* elements that need to drain to allow the */
176 /* current expedited grace period to */
177 /* complete (only for PREEMPT_RCU). */
178 unsigned long qsmaskinit; 166 unsigned long qsmaskinit;
179 /* Per-GP initial value for qsmask & expmask. */ 167 /* Per-GP initial value for qsmask. */
180 /* Initialized from ->qsmaskinitnext at the */ 168 /* Initialized from ->qsmaskinitnext at the */
181 /* beginning of each grace period. */ 169 /* beginning of each grace period. */
182 unsigned long qsmaskinitnext; 170 unsigned long qsmaskinitnext;
183 /* Online CPUs for next grace period. */ 171 /* Online CPUs for next grace period. */
172 unsigned long expmask; /* CPUs or groups that need to check in */
173 /* to allow the current expedited GP */
174 /* to complete. */
175 unsigned long expmaskinit;
176 /* Per-GP initial values for expmask. */
177 /* Initialized from ->expmaskinitnext at the */
178 /* beginning of each expedited GP. */
179 unsigned long expmaskinitnext;
180 /* Online CPUs for next expedited GP. */
184 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 181 unsigned long grpmask; /* Mask to apply to parent qsmask. */
185 /* Only one bit will be set in this mask. */ 182 /* Only one bit will be set in this mask. */
186 int grplo; /* lowest-numbered CPU or group here. */ 183 int grplo; /* lowest-numbered CPU or group here. */
@@ -281,6 +278,18 @@ struct rcu_node {
281 for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ 278 for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
282 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) 279 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
283 280
281/*
282 * Union to allow "aggregate OR" operation on the need for a quiescent
283 * state by the normal and expedited grace periods.
284 */
285union rcu_noqs {
286 struct {
287 u8 norm;
288 u8 exp;
289 } b; /* Bits. */
290 u16 s; /* Set of bits, aggregate OR here. */
291};
292
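
The new rcu_noqs union lets the per-CPU "still need a quiescent state" flags for the normal and expedited grace periods be tested together: a single load of .s answers "is anything outstanding?". A standalone illustration that mirrors the layout, with toy names and fixed-width types standing in for the kernel's u8/u16:

/*
 * "Aggregate OR" trick: two one-byte flags share storage with a single
 * 16-bit field, so one load of .s tests both at once.
 */
#include <stdint.h>
#include <stdio.h>

union toy_noqs {
	struct {
		uint8_t norm;  /* normal GP still needs a QS from this CPU */
		uint8_t exp;   /* expedited GP still needs a QS            */
	} b;
	uint16_t s;            /* nonzero iff either flag is nonzero       */
};

int main(void)
{
	union toy_noqs nq = { .s = 0 };

	nq.b.exp = 1;                                /* expedited GP waiting */
	printf("any QS needed? %d\n", nq.s != 0);    /* 1 */
	nq.b.exp = 0;
	nq.b.norm = 0;
	printf("any QS needed? %d\n", nq.s != 0);    /* 0 */
	return 0;
}

Reading .s after writing the .b members is well-defined union type punning in C, which is what makes the one-load test safe here.
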
284/* Index values for nxttail array in struct rcu_data. */ 293/* Index values for nxttail array in struct rcu_data. */
285#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ 294#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
286#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ 295#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
@@ -297,8 +306,8 @@ struct rcu_data {
297 /* is aware of having started. */ 306 /* is aware of having started. */
298 unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ 307 unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */
299 /* for rcu_all_qs() invocations. */ 308 /* for rcu_all_qs() invocations. */
300 bool passed_quiesce; /* User-mode/idle loop etc. */ 309 union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */
301 bool qs_pending; /* Core waits for quiesc state. */ 310 bool core_needs_qs; /* Core waits for quiesc state. */
302 bool beenonline; /* CPU online at least once. */ 311 bool beenonline; /* CPU online at least once. */
303 bool gpwrap; /* Possible gpnum/completed wrap. */ 312 bool gpwrap; /* Possible gpnum/completed wrap. */
304 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 313 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
@@ -307,9 +316,6 @@ struct rcu_data {
307 /* ticks this CPU has handled */ 316 /* ticks this CPU has handled */
308 /* during and after the last grace */ 317 /* during and after the last grace */
309 /* period it is aware of. */ 318 /* period it is aware of. */
310 struct cpu_stop_work exp_stop_work;
311 /* Expedited grace-period control */
312 /* for CPU stopping. */
313 319
314 /* 2) batch handling */ 320 /* 2) batch handling */
315 /* 321 /*
@@ -363,7 +369,7 @@ struct rcu_data {
363 369
364 /* 5) __rcu_pending() statistics. */ 370 /* 5) __rcu_pending() statistics. */
365 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ 371 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
366 unsigned long n_rp_qs_pending; 372 unsigned long n_rp_core_needs_qs;
367 unsigned long n_rp_report_qs; 373 unsigned long n_rp_report_qs;
368 unsigned long n_rp_cb_ready; 374 unsigned long n_rp_cb_ready;
369 unsigned long n_rp_cpu_needs_gp; 375 unsigned long n_rp_cpu_needs_gp;
@@ -378,7 +384,6 @@ struct rcu_data {
378 struct rcu_head oom_head; 384 struct rcu_head oom_head;
379#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 385#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
380 struct mutex exp_funnel_mutex; 386 struct mutex exp_funnel_mutex;
381 bool exp_done; /* Expedited QS for this CPU? */
382 387
383 /* 7) Callback offloading. */ 388 /* 7) Callback offloading. */
384#ifdef CONFIG_RCU_NOCB_CPU 389#ifdef CONFIG_RCU_NOCB_CPU
@@ -412,13 +417,6 @@ struct rcu_data {
412 struct rcu_state *rsp; 417 struct rcu_state *rsp;
413}; 418};
414 419
415/* Values for fqs_state field in struct rcu_state. */
416#define RCU_GP_IDLE 0 /* No grace period in progress. */
417#define RCU_GP_INIT 1 /* Grace period being initialized. */
418#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
419#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
420#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
421
422/* Values for nocb_defer_wakeup field in struct rcu_data. */ 420/* Values for nocb_defer_wakeup field in struct rcu_data. */
423#define RCU_NOGP_WAKE_NOT 0 421#define RCU_NOGP_WAKE_NOT 0
424#define RCU_NOGP_WAKE 1 422#define RCU_NOGP_WAKE 1
@@ -464,14 +462,13 @@ struct rcu_state {
464 /* shut bogus gcc warning) */ 462 /* shut bogus gcc warning) */
465 u8 flavor_mask; /* bit in flavor mask. */ 463 u8 flavor_mask; /* bit in flavor mask. */
466 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ 464 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
467 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ 465 call_rcu_func_t call; /* call_rcu() flavor. */
468 void (*func)(struct rcu_head *head)); 466 int ncpus; /* # CPUs seen so far. */
469 467
470 /* The following fields are guarded by the root rcu_node's lock. */ 468 /* The following fields are guarded by the root rcu_node's lock. */
471 469
472 u8 fqs_state ____cacheline_internodealigned_in_smp; 470 u8 boost ____cacheline_internodealigned_in_smp;
473 /* Force QS state. */ 471 /* Subject to priority boost. */
474 u8 boost; /* Subject to priority boost. */
475 unsigned long gpnum; /* Current gp number. */ 472 unsigned long gpnum; /* Current gp number. */
476 unsigned long completed; /* # of last completed gp. */ 473 unsigned long completed; /* # of last completed gp. */
477 struct task_struct *gp_kthread; /* Task for grace periods. */ 474 struct task_struct *gp_kthread; /* Task for grace periods. */
@@ -508,6 +505,7 @@ struct rcu_state {
508 atomic_long_t expedited_normal; /* # fallbacks to normal. */ 505 atomic_long_t expedited_normal; /* # fallbacks to normal. */
509 atomic_t expedited_need_qs; /* # CPUs left to check in. */ 506 atomic_t expedited_need_qs; /* # CPUs left to check in. */
510 wait_queue_head_t expedited_wq; /* Wait for check-ins. */ 507 wait_queue_head_t expedited_wq; /* Wait for check-ins. */
508 int ncpus_snap; /* # CPUs seen last time. */
511 509
512 unsigned long jiffies_force_qs; /* Time at which to invoke */ 510 unsigned long jiffies_force_qs; /* Time at which to invoke */
513 /* force_quiescent_state(). */ 511 /* force_quiescent_state(). */
@@ -538,8 +536,8 @@ struct rcu_state {
538#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ 536#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */
539#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ 537#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */
540 538
541/* Values for rcu_state structure's gp_flags field. */ 539/* Values for rcu_state structure's gp_state field. */
542#define RCU_GP_WAIT_INIT 0 /* Initial state. */ 540#define RCU_GP_IDLE 0 /* Initial state and no GP in progress. */
543#define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ 541#define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */
544#define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */ 542#define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */
545#define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */ 543#define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */
@@ -582,9 +580,10 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
582#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 580#endif /* #ifdef CONFIG_HOTPLUG_CPU */
583static void rcu_print_detail_task_stall(struct rcu_state *rsp); 581static void rcu_print_detail_task_stall(struct rcu_state *rsp);
584static int rcu_print_task_stall(struct rcu_node *rnp); 582static int rcu_print_task_stall(struct rcu_node *rnp);
583static int rcu_print_task_exp_stall(struct rcu_node *rnp);
585static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 584static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
586static void rcu_preempt_check_callbacks(void); 585static void rcu_preempt_check_callbacks(void);
587void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 586void call_rcu(struct rcu_head *head, rcu_callback_t func);
588static void __init __rcu_init_preempt(void); 587static void __init __rcu_init_preempt(void);
589static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 588static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
590static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 589static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index b2bf3963a0ae..630c19772630 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -101,7 +101,6 @@ RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
101static struct rcu_state *const rcu_state_p = &rcu_preempt_state; 101static struct rcu_state *const rcu_state_p = &rcu_preempt_state;
102static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data; 102static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data;
103 103
104static int rcu_preempted_readers_exp(struct rcu_node *rnp);
105static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, 104static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
106 bool wake); 105 bool wake);
107 106
@@ -114,6 +113,147 @@ static void __init rcu_bootup_announce(void)
114 rcu_bootup_announce_oddness(); 113 rcu_bootup_announce_oddness();
115} 114}
116 115
116/* Flags for rcu_preempt_ctxt_queue() decision table. */
117#define RCU_GP_TASKS 0x8
118#define RCU_EXP_TASKS 0x4
119#define RCU_GP_BLKD 0x2
120#define RCU_EXP_BLKD 0x1
121
122/*
123 * Queues a task preempted within an RCU-preempt read-side critical
124 * section into the appropriate location within the ->blkd_tasks list,
125 * depending on the states of any ongoing normal and expedited grace
126 * periods. The ->gp_tasks pointer indicates which element the normal
127 * grace period is waiting on (NULL if none), and the ->exp_tasks pointer
128 * indicates which element the expedited grace period is waiting on (again,
129 * NULL if none). If a grace period is waiting on a given element in the
130 * ->blkd_tasks list, it also waits on all subsequent elements. Thus,
131 * adding a task to the tail of the list blocks any grace period that is
132 * already waiting on one of the elements. In contrast, adding a task
133 * to the head of the list won't block any grace period that is already
134 * waiting on one of the elements.
135 *
136 * This queuing is imprecise, and can sometimes make an ongoing grace
137 * period wait for a task that is not strictly speaking blocking it.
138 * Given the choice, we needlessly block a normal grace period rather than
139 * blocking an expedited grace period.
140 *
141 * Note that an endless sequence of expedited grace periods still cannot
142 * indefinitely postpone a normal grace period. Eventually, all of the
143 * fixed number of preempted tasks blocking the normal grace period that are
144 * not also blocking the expedited grace period will resume and complete
145 * their RCU read-side critical sections. At that point, the ->gp_tasks
146 * pointer will equal the ->exp_tasks pointer, at which point the end of
147 * the corresponding expedited grace period will also be the end of the
148 * normal grace period.
149 */
150static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp,
151 unsigned long flags) __releases(rnp->lock)
152{
153 int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) +
154 (rnp->exp_tasks ? RCU_EXP_TASKS : 0) +
155 (rnp->qsmask & rdp->grpmask ? RCU_GP_BLKD : 0) +
156 (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0);
157 struct task_struct *t = current;
158
159 /*
160 * Decide where to queue the newly blocked task. In theory,
161 * this could be an if-statement. In practice, when I tried
162 * that, it was quite messy.
163 */
164 switch (blkd_state) {
165 case 0:
166 case RCU_EXP_TASKS:
167 case RCU_EXP_TASKS + RCU_GP_BLKD:
168 case RCU_GP_TASKS:
169 case RCU_GP_TASKS + RCU_EXP_TASKS:
170
171 /*
172 * Blocking neither GP, or first task blocking the normal
173 * GP but not blocking the already-waiting expedited GP.
174 * Queue at the head of the list to avoid unnecessarily
175 * blocking the already-waiting GPs.
176 */
177 list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
178 break;
179
180 case RCU_EXP_BLKD:
181 case RCU_GP_BLKD:
182 case RCU_GP_BLKD + RCU_EXP_BLKD:
183 case RCU_GP_TASKS + RCU_EXP_BLKD:
184 case RCU_GP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
185 case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
186
187 /*
188 * First task arriving that blocks either GP, or first task
189 * arriving that blocks the expedited GP (with the normal
190 * GP already waiting), or a task arriving that blocks
191 * both GPs with both GPs already waiting. Queue at the
192 * tail of the list to avoid any GP waiting on any of the
193 * already queued tasks that are not blocking it.
194 */
195 list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks);
196 break;
197
198 case RCU_EXP_TASKS + RCU_EXP_BLKD:
199 case RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
200 case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_EXP_BLKD:
201
202 /*
203 * Second or subsequent task blocking the expedited GP.
204 * The task either does not block the normal GP, or is the
205 * first task blocking the normal GP. Queue just after
206 * the first task blocking the expedited GP.
207 */
208 list_add(&t->rcu_node_entry, rnp->exp_tasks);
209 break;
210
211 case RCU_GP_TASKS + RCU_GP_BLKD:
212 case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD:
213
214 /*
215 * Second or subsequent task blocking the normal GP.
216 * The task does not block the expedited GP. Queue just
217 * after the first task blocking the normal GP.
218 */
219 list_add(&t->rcu_node_entry, rnp->gp_tasks);
220 break;
221
222 default:
223
224 /* Yet another exercise in excessive paranoia. */
225 WARN_ON_ONCE(1);
226 break;
227 }
228
229 /*
230 * We have now queued the task. If it was the first one to
231 * block either grace period, update the ->gp_tasks and/or
232 * ->exp_tasks pointers, respectively, to reference the newly
233 * blocked tasks.
234 */
235 if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD))
236 rnp->gp_tasks = &t->rcu_node_entry;
237 if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
238 rnp->exp_tasks = &t->rcu_node_entry;
239 raw_spin_unlock(&rnp->lock);
240
241 /*
242 * Report the quiescent state for the expedited GP. This expedited
243 * GP should not be able to end until we report, so there should be
244 * no need to check for a subsequent expedited GP. (Though we are
245 * still in a quiescent state in any case.)
246 */
247 if (blkd_state & RCU_EXP_BLKD &&
248 t->rcu_read_unlock_special.b.exp_need_qs) {
249 t->rcu_read_unlock_special.b.exp_need_qs = false;
250 rcu_report_exp_rdp(rdp->rsp, rdp, true);
251 } else {
252 WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs);
253 }
254 local_irq_restore(flags);
255}
256
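
rcu_preempt_ctxt_queue() above packs four independent conditions into one small integer so that a single switch can cover every combination. A condensed sketch of that encoding follows; it is illustrative only, and the two "queue just after an existing blocker" cases plus the impossible combinations are collapsed into one branch for brevity:

/*
 * Decision-table-as-bitmask: four booleans become a 4-bit state, and
 * the switch groups the states by where the task should be queued.
 */
#include <stdio.h>

#define GP_TASKS  0x8  /* normal GP already waiting on queued tasks    */
#define EXP_TASKS 0x4  /* expedited GP already waiting on queued tasks */
#define GP_BLKD   0x2  /* this task blocks the normal GP               */
#define EXP_BLKD  0x1  /* this task blocks the expedited GP            */

static const char *queue_position(int state)
{
	switch (state) {
	case 0:
	case EXP_TASKS:
	case EXP_TASKS + GP_BLKD:
	case GP_TASKS:
	case GP_TASKS + EXP_TASKS:
		return "head";   /* blocks no already-waiting GP     */
	case EXP_BLKD:
	case GP_BLKD:
	case GP_BLKD + EXP_BLKD:
	case GP_TASKS + EXP_BLKD:
	case GP_TASKS + GP_BLKD + EXP_BLKD:
	case GP_TASKS + EXP_TASKS + GP_BLKD + EXP_BLKD:
		return "tail";   /* first blocker of some waiting GP */
	default:
		return "middle"; /* just after an existing blocker   */
	}
}

int main(void)
{
	int gp_tasks = 1, exp_tasks = 0, gp_blkd = 1, exp_blkd = 0;
	int state = (gp_tasks ? GP_TASKS : 0) | (exp_tasks ? EXP_TASKS : 0) |
		    (gp_blkd ? GP_BLKD : 0) | (exp_blkd ? EXP_BLKD : 0);

	printf("state %#x -> queue at %s\n", state, queue_position(state));
	return 0;
}
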
117/* 257/*
118 * Record a preemptible-RCU quiescent state for the specified CPU. Note 258 * Record a preemptible-RCU quiescent state for the specified CPU. Note
119 * that this just means that the task currently running on the CPU is 259 * that this just means that the task currently running on the CPU is
@@ -125,11 +265,11 @@ static void __init rcu_bootup_announce(void)
125 */ 265 */
126static void rcu_preempt_qs(void) 266static void rcu_preempt_qs(void)
127{ 267{
128 if (!__this_cpu_read(rcu_data_p->passed_quiesce)) { 268 if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) {
129 trace_rcu_grace_period(TPS("rcu_preempt"), 269 trace_rcu_grace_period(TPS("rcu_preempt"),
130 __this_cpu_read(rcu_data_p->gpnum), 270 __this_cpu_read(rcu_data_p->gpnum),
131 TPS("cpuqs")); 271 TPS("cpuqs"));
132 __this_cpu_write(rcu_data_p->passed_quiesce, 1); 272 __this_cpu_write(rcu_data_p->cpu_no_qs.b.norm, false);
133 barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */ 273 barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */
134 current->rcu_read_unlock_special.b.need_qs = false; 274 current->rcu_read_unlock_special.b.need_qs = false;
135 } 275 }
@@ -167,42 +307,18 @@ static void rcu_preempt_note_context_switch(void)
167 t->rcu_blocked_node = rnp; 307 t->rcu_blocked_node = rnp;
168 308
169 /* 309 /*
170 * If this CPU has already checked in, then this task 310 * Verify the CPU's sanity, trace the preemption, and
171 * will hold up the next grace period rather than the 311 * then queue the task as required based on the states
172 * current grace period. Queue the task accordingly. 312 * of any ongoing and expedited grace periods.
173 * If the task is queued for the current grace period
174 * (i.e., this CPU has not yet passed through a quiescent
175 * state for the current grace period), then as long
176 * as that task remains queued, the current grace period
177 * cannot end. Note that there is some uncertainty as
178 * to exactly when the current grace period started.
179 * We take a conservative approach, which can result
180 * in unnecessarily waiting on tasks that started very
181 * slightly after the current grace period began. C'est
182 * la vie!!!
183 *
184 * But first, note that the current CPU must still be
185 * on line!
186 */ 313 */
187 WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0); 314 WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0);
188 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); 315 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
189 if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
190 list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
191 rnp->gp_tasks = &t->rcu_node_entry;
192 if (IS_ENABLED(CONFIG_RCU_BOOST) &&
193 rnp->boost_tasks != NULL)
194 rnp->boost_tasks = rnp->gp_tasks;
195 } else {
196 list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
197 if (rnp->qsmask & rdp->grpmask)
198 rnp->gp_tasks = &t->rcu_node_entry;
199 }
200 trace_rcu_preempt_task(rdp->rsp->name, 316 trace_rcu_preempt_task(rdp->rsp->name,
201 t->pid, 317 t->pid,
202 (rnp->qsmask & rdp->grpmask) 318 (rnp->qsmask & rdp->grpmask)
203 ? rnp->gpnum 319 ? rnp->gpnum
204 : rnp->gpnum + 1); 320 : rnp->gpnum + 1);
205 raw_spin_unlock_irqrestore(&rnp->lock, flags); 321 rcu_preempt_ctxt_queue(rnp, rdp, flags);
206 } else if (t->rcu_read_lock_nesting < 0 && 322 } else if (t->rcu_read_lock_nesting < 0 &&
207 t->rcu_read_unlock_special.s) { 323 t->rcu_read_unlock_special.s) {
208 324
@@ -272,6 +388,7 @@ void rcu_read_unlock_special(struct task_struct *t)
272 unsigned long flags; 388 unsigned long flags;
273 struct list_head *np; 389 struct list_head *np;
274 bool drop_boost_mutex = false; 390 bool drop_boost_mutex = false;
391 struct rcu_data *rdp;
275 struct rcu_node *rnp; 392 struct rcu_node *rnp;
276 union rcu_special special; 393 union rcu_special special;
277 394
@@ -282,8 +399,8 @@ void rcu_read_unlock_special(struct task_struct *t)
282 local_irq_save(flags); 399 local_irq_save(flags);
283 400
284 /* 401 /*
285 * If RCU core is waiting for this CPU to exit critical section, 402 * If RCU core is waiting for this CPU to exit its critical section,
286 * let it know that we have done so. Because irqs are disabled, 403 * report the fact that it has exited. Because irqs are disabled,
287 * t->rcu_read_unlock_special cannot change. 404 * t->rcu_read_unlock_special cannot change.
288 */ 405 */
289 special = t->rcu_read_unlock_special; 406 special = t->rcu_read_unlock_special;
@@ -296,13 +413,32 @@ void rcu_read_unlock_special(struct task_struct *t)
296 } 413 }
297 } 414 }
298 415
416 /*
417 * Respond to a request for an expedited grace period, but only if
418 * we were not preempted, meaning that we were running on the same
419 * CPU throughout. If we were preempted, the exp_need_qs flag
420 * would have been cleared at the time of the first preemption,
421 * and the quiescent state would be reported when we were dequeued.
422 */
423 if (special.b.exp_need_qs) {
424 WARN_ON_ONCE(special.b.blocked);
425 t->rcu_read_unlock_special.b.exp_need_qs = false;
426 rdp = this_cpu_ptr(rcu_state_p->rda);
427 rcu_report_exp_rdp(rcu_state_p, rdp, true);
428 if (!t->rcu_read_unlock_special.s) {
429 local_irq_restore(flags);
430 return;
431 }
432 }
433
299 /* Hardware IRQ handlers cannot block, complain if they get here. */ 434 /* Hardware IRQ handlers cannot block, complain if they get here. */
300 if (in_irq() || in_serving_softirq()) { 435 if (in_irq() || in_serving_softirq()) {
301 lockdep_rcu_suspicious(__FILE__, __LINE__, 436 lockdep_rcu_suspicious(__FILE__, __LINE__,
302 "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n"); 437 "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
303 pr_alert("->rcu_read_unlock_special: %#x (b: %d, nq: %d)\n", 438 pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n",
304 t->rcu_read_unlock_special.s, 439 t->rcu_read_unlock_special.s,
305 t->rcu_read_unlock_special.b.blocked, 440 t->rcu_read_unlock_special.b.blocked,
441 t->rcu_read_unlock_special.b.exp_need_qs,
306 t->rcu_read_unlock_special.b.need_qs); 442 t->rcu_read_unlock_special.b.need_qs);
307 local_irq_restore(flags); 443 local_irq_restore(flags);
308 return; 444 return;
@@ -329,7 +465,7 @@ void rcu_read_unlock_special(struct task_struct *t)
329 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 465 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
330 } 466 }
331 empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); 467 empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
332 empty_exp = !rcu_preempted_readers_exp(rnp); 468 empty_exp = sync_rcu_preempt_exp_done(rnp);
333 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 469 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
334 np = rcu_next_node_entry(t, rnp); 470 np = rcu_next_node_entry(t, rnp);
335 list_del_init(&t->rcu_node_entry); 471 list_del_init(&t->rcu_node_entry);
@@ -353,7 +489,7 @@ void rcu_read_unlock_special(struct task_struct *t)
353 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, 489 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
354 * so we must take a snapshot of the expedited state. 490 * so we must take a snapshot of the expedited state.
355 */ 491 */
356 empty_exp_now = !rcu_preempted_readers_exp(rnp); 492 empty_exp_now = sync_rcu_preempt_exp_done(rnp);
357 if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { 493 if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
358 trace_rcu_quiescent_state_report(TPS("preempt_rcu"), 494 trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
359 rnp->gpnum, 495 rnp->gpnum,
@@ -450,6 +586,27 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
450} 586}
451 587
452/* 588/*
589 * Scan the current list of tasks blocked within RCU read-side critical
590 * sections, printing out the tid of each that is blocking the current
591 * expedited grace period.
592 */
593static int rcu_print_task_exp_stall(struct rcu_node *rnp)
594{
595 struct task_struct *t;
596 int ndetected = 0;
597
598 if (!rnp->exp_tasks)
599 return 0;
600 t = list_entry(rnp->exp_tasks->prev,
601 struct task_struct, rcu_node_entry);
602 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
603 pr_cont(" P%d", t->pid);
604 ndetected++;
605 }
606 return ndetected;
607}
608
609/*
453 * Check that the list of blocked tasks for the newly completed grace 610 * Check that the list of blocked tasks for the newly completed grace
454 * period is in fact empty. It is a serious bug to complete a grace 611 * period is in fact empty. It is a serious bug to complete a grace
455 * period that still has RCU readers blocked! This function must be 612 * period that still has RCU readers blocked! This function must be
@@ -483,8 +640,8 @@ static void rcu_preempt_check_callbacks(void)
483 return; 640 return;
484 } 641 }
485 if (t->rcu_read_lock_nesting > 0 && 642 if (t->rcu_read_lock_nesting > 0 &&
486 __this_cpu_read(rcu_data_p->qs_pending) && 643 __this_cpu_read(rcu_data_p->core_needs_qs) &&
487 !__this_cpu_read(rcu_data_p->passed_quiesce)) 644 __this_cpu_read(rcu_data_p->cpu_no_qs.b.norm))
488 t->rcu_read_unlock_special.b.need_qs = true; 645 t->rcu_read_unlock_special.b.need_qs = true;
489} 646}
490 647
@@ -500,7 +657,7 @@ static void rcu_preempt_do_callbacks(void)
500/* 657/*
501 * Queue a preemptible-RCU callback for invocation after a grace period. 658 * Queue a preemptible-RCU callback for invocation after a grace period.
502 */ 659 */
503void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 660void call_rcu(struct rcu_head *head, rcu_callback_t func)
504{ 661{
505 __call_rcu(head, func, rcu_state_p, -1, 0); 662 __call_rcu(head, func, rcu_state_p, -1, 0);
506} 663}
@@ -535,155 +692,41 @@ void synchronize_rcu(void)
535} 692}
536EXPORT_SYMBOL_GPL(synchronize_rcu); 693EXPORT_SYMBOL_GPL(synchronize_rcu);
537 694
538static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
539
540/*
541 * Return non-zero if there are any tasks in RCU read-side critical
542 * sections blocking the current preemptible-RCU expedited grace period.
543 * If there is no preemptible-RCU expedited grace period currently in
544 * progress, returns zero unconditionally.
545 */
546static int rcu_preempted_readers_exp(struct rcu_node *rnp)
547{
548 return rnp->exp_tasks != NULL;
549}
550
551/*
552 * return non-zero if there is no RCU expedited grace period in progress
553 * for the specified rcu_node structure, in other words, if all CPUs and
554 * tasks covered by the specified rcu_node structure have done their bit
555 * for the current expedited grace period. Works only for preemptible
556 * RCU -- other RCU implementation use other means.
557 *
558 * Caller must hold the root rcu_node's exp_funnel_mutex.
559 */
560static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
561{
562 return !rcu_preempted_readers_exp(rnp) &&
563 READ_ONCE(rnp->expmask) == 0;
564}
565
566/*
567 * Report the exit from RCU read-side critical section for the last task
568 * that queued itself during or before the current expedited preemptible-RCU
569 * grace period. This event is reported either to the rcu_node structure on
570 * which the task was queued or to one of that rcu_node structure's ancestors,
571 * recursively up the tree. (Calm down, calm down, we do the recursion
572 * iteratively!)
573 *
574 * Caller must hold the root rcu_node's exp_funnel_mutex.
575 */
576static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
577 bool wake)
578{
579 unsigned long flags;
580 unsigned long mask;
581
582 raw_spin_lock_irqsave(&rnp->lock, flags);
583 smp_mb__after_unlock_lock();
584 for (;;) {
585 if (!sync_rcu_preempt_exp_done(rnp)) {
586 raw_spin_unlock_irqrestore(&rnp->lock, flags);
587 break;
588 }
589 if (rnp->parent == NULL) {
590 raw_spin_unlock_irqrestore(&rnp->lock, flags);
591 if (wake) {
592 smp_mb(); /* EGP done before wake_up(). */
593 wake_up(&sync_rcu_preempt_exp_wq);
594 }
595 break;
596 }
597 mask = rnp->grpmask;
598 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
599 rnp = rnp->parent;
600 raw_spin_lock(&rnp->lock); /* irqs already disabled */
601 smp_mb__after_unlock_lock();
602 rnp->expmask &= ~mask;
603 }
604}
605
606/* 695/*
607 * Snapshot the tasks blocking the newly started preemptible-RCU expedited 696 * Remote handler for smp_call_function_single(). If there is an
608 * grace period for the specified rcu_node structure, phase 1. If there 697 * RCU read-side critical section in effect, request that the
609 * are such tasks, set the ->expmask bits up the rcu_node tree and also 698 * next rcu_read_unlock() record the quiescent state up the
610 * set the ->expmask bits on the leaf rcu_node structures to tell phase 2 699 * ->expmask fields in the rcu_node tree. Otherwise, immediately
611 * that work is needed here. 700 * report the quiescent state.
612 *
613 * Caller must hold the root rcu_node's exp_funnel_mutex.
614 */ 701 */
615static void 702static void sync_rcu_exp_handler(void *info)
616sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp)
617{ 703{
618 unsigned long flags; 704 struct rcu_data *rdp;
619 unsigned long mask; 705 struct rcu_state *rsp = info;
620 struct rcu_node *rnp_up; 706 struct task_struct *t = current;
621
622 raw_spin_lock_irqsave(&rnp->lock, flags);
623 smp_mb__after_unlock_lock();
624 WARN_ON_ONCE(rnp->expmask);
625 WARN_ON_ONCE(rnp->exp_tasks);
626 if (!rcu_preempt_has_tasks(rnp)) {
627 /* No blocked tasks, nothing to do. */
628 raw_spin_unlock_irqrestore(&rnp->lock, flags);
629 return;
630 }
631 /* Call for Phase 2 and propagate ->expmask bits up the tree. */
632 rnp->expmask = 1;
633 rnp_up = rnp;
634 while (rnp_up->parent) {
635 mask = rnp_up->grpmask;
636 rnp_up = rnp_up->parent;
637 if (rnp_up->expmask & mask)
638 break;
639 raw_spin_lock(&rnp_up->lock); /* irqs already off */
640 smp_mb__after_unlock_lock();
641 rnp_up->expmask |= mask;
642 raw_spin_unlock(&rnp_up->lock); /* irqs still off */
643 }
644 raw_spin_unlock_irqrestore(&rnp->lock, flags);
645}
646
647/*
648 * Snapshot the tasks blocking the newly started preemptible-RCU expedited
649 * grace period for the specified rcu_node structure, phase 2. If the
650 * leaf rcu_node structure has its ->expmask field set, check for tasks.
651 * If there are some, clear ->expmask and set ->exp_tasks accordingly,
652 * then initiate RCU priority boosting. Otherwise, clear ->expmask and
653 * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits,
654 * enabling rcu_read_unlock_special() to do the bit-clearing.
655 *
656 * Caller must hold the root rcu_node's exp_funnel_mutex.
657 */
658static void
659sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp)
660{
661 unsigned long flags;
662
663 raw_spin_lock_irqsave(&rnp->lock, flags);
664 smp_mb__after_unlock_lock();
665 if (!rnp->expmask) {
666 /* Phase 1 didn't do anything, so Phase 2 doesn't either. */
667 raw_spin_unlock_irqrestore(&rnp->lock, flags);
668 return;
669 }
670
671 /* Phase 1 is over. */
672 rnp->expmask = 0;
673 707
674 /* 708 /*
675 * If there are still blocked tasks, set up ->exp_tasks so that 709 * Within an RCU read-side critical section, request that the next
 676 * rcu_read_unlock_special() will wake us and then boost them. 710 * rcu_read_unlock() report the quiescent state. Unless this RCU read-side critical
711 * section has already blocked, in which case it is already set
712 * up for the expedited grace period to wait on it.
677 */ 713 */
678 if (rcu_preempt_has_tasks(rnp)) { 714 if (t->rcu_read_lock_nesting > 0 &&
679 rnp->exp_tasks = rnp->blkd_tasks.next; 715 !t->rcu_read_unlock_special.b.blocked) {
680 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ 716 t->rcu_read_unlock_special.b.exp_need_qs = true;
681 return; 717 return;
682 } 718 }
683 719
684 /* No longer any blocked tasks, so undo bit setting. */ 720 /*
685 raw_spin_unlock_irqrestore(&rnp->lock, flags); 721 * We are either exiting an RCU read-side critical section (negative
686 rcu_report_exp_rnp(rsp, rnp, false); 722 * values of t->rcu_read_lock_nesting) or are not in one at all
723 * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU
724 * read-side critical section that blocked before this expedited
725 * grace period started. Either way, we can immediately report
726 * the quiescent state.
727 */
728 rdp = this_cpu_ptr(rsp->rda);
729 rcu_report_exp_rdp(rsp, rdp, true);
687} 730}
688 731
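A hypothetical truth table for the handler above: only a CPU that is inside an RCU read-side critical section which has not yet blocked defers its report to the eventual rcu_read_unlock(); every other state reports the expedited quiescent state immediately.

#include <stdio.h>

/* demo stand-ins for t->rcu_read_lock_nesting and
 * t->rcu_read_unlock_special.b.blocked */
static const char *exp_handler_action(int nesting, int blocked)
{
        if (nesting > 0 && !blocked)
                return "set exp_need_qs, report at rcu_read_unlock()";
        return "report quiescent state immediately";
}

int main(void)
{
        printf("no reader:                %s\n", exp_handler_action(0, 0));
        printf("active reader:            %s\n", exp_handler_action(1, 0));
        printf("reader already blocked:   %s\n", exp_handler_action(1, 1));
        printf("reader currently exiting: %s\n", exp_handler_action(-1, 0));
        return 0;
}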
689/** 732/**
@@ -713,24 +756,12 @@ void synchronize_rcu_expedited(void)
713 756
714 rcu_exp_gp_seq_start(rsp); 757 rcu_exp_gp_seq_start(rsp);
715 758
716 /* force all RCU readers onto ->blkd_tasks lists. */ 759 /* Initialize the rcu_node tree in preparation for the wait. */
717 synchronize_sched_expedited(); 760 sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler);
718
719 /*
720 * Snapshot current state of ->blkd_tasks lists into ->expmask.
721 * Phase 1 sets bits and phase 2 permits rcu_read_unlock_special()
722 * to start clearing them. Doing this in one phase leads to
723 * strange races between setting and clearing bits, so just say "no"!
724 */
725 rcu_for_each_leaf_node(rsp, rnp)
726 sync_rcu_preempt_exp_init1(rsp, rnp);
727 rcu_for_each_leaf_node(rsp, rnp)
728 sync_rcu_preempt_exp_init2(rsp, rnp);
729 761
730 /* Wait for snapshotted ->blkd_tasks lists to drain. */ 762 /* Wait for snapshotted ->blkd_tasks lists to drain. */
731 rnp = rcu_get_root(rsp); 763 rnp = rcu_get_root(rsp);
732 wait_event(sync_rcu_preempt_exp_wq, 764 synchronize_sched_expedited_wait(rsp);
733 sync_rcu_preempt_exp_done(rnp));
734 765
735 /* Clean up and exit. */ 766 /* Clean up and exit. */
736 rcu_exp_gp_seq_end(rsp); 767 rcu_exp_gp_seq_end(rsp);
@@ -835,6 +866,16 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
835} 866}
836 867
837/* 868/*
869 * Because preemptible RCU does not exist, we never have to check for
870 * tasks blocked within RCU read-side critical sections that are
871 * blocking the current expedited grace period.
872 */
873static int rcu_print_task_exp_stall(struct rcu_node *rnp)
874{
875 return 0;
876}
877
878/*
838 * Because there is no preemptible RCU, there can be no readers blocked, 879 * Because there is no preemptible RCU, there can be no readers blocked,
839 * so there is no need to check for blocked tasks. So check only for 880 * so there is no need to check for blocked tasks. So check only for
840 * bogus qsmask values. 881 * bogus qsmask values.
@@ -1702,8 +1743,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
1702 ticks_value = rsp->gpnum - rdp->gpnum; 1743 ticks_value = rsp->gpnum - rdp->gpnum;
1703 } 1744 }
1704 print_cpu_stall_fast_no_hz(fast_no_hz, cpu); 1745 print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
1705 pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", 1746 pr_err("\t%d-%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
1706 cpu, ticks_value, ticks_title, 1747 cpu,
1748 "O."[!!cpu_online(cpu)],
1749 "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
1750 "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)],
1751 ticks_value, ticks_title,
1707 atomic_read(&rdtp->dynticks) & 0xfff, 1752 atomic_read(&rdtp->dynticks) & 0xfff,
1708 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, 1753 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
1709 rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), 1754 rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
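The reworked stall line picks each status character by indexing a two-character string with a boolean, e.g. "O."[!!cpu_online(cpu)]: index 0 (condition false) yields the flag letter, index 1 (condition true) yields a quiet '.'. A tiny standalone illustration with made-up per-CPU state:

#include <stdio.h>

int main(void)
{
        int online = 1, in_qsmaskinit = 0, in_qsmaskinitnext = 1;

        /* "X."[!!cond] is '.' when the condition holds, the letter when not */
        printf("3-%c%c%c:\n",
               "O."[!!online],             /* '.'  - CPU is online          */
               "o."[!!in_qsmaskinit],      /* 'o'  - not in ->qsmaskinit    */
               "N."[!!in_qsmaskinitnext]); /* '.'  - in ->qsmaskinitnext    */
        return 0;
}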
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 6fc4c5ff3bb5..ef7093cc9b5c 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -117,13 +117,13 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
117 117
118 if (!rdp->beenonline) 118 if (!rdp->beenonline)
119 return; 119 return;
120 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d", 120 seq_printf(m, "%3d%cc=%ld g=%ld cnq=%d/%d:%d",
121 rdp->cpu, 121 rdp->cpu,
122 cpu_is_offline(rdp->cpu) ? '!' : ' ', 122 cpu_is_offline(rdp->cpu) ? '!' : ' ',
123 ulong2long(rdp->completed), ulong2long(rdp->gpnum), 123 ulong2long(rdp->completed), ulong2long(rdp->gpnum),
124 rdp->passed_quiesce, 124 rdp->cpu_no_qs.b.norm,
125 rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), 125 rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu),
126 rdp->qs_pending); 126 rdp->core_needs_qs);
127 seq_printf(m, " dt=%d/%llx/%d df=%lu", 127 seq_printf(m, " dt=%d/%llx/%d df=%lu",
128 atomic_read(&rdp->dynticks->dynticks), 128 atomic_read(&rdp->dynticks->dynticks),
129 rdp->dynticks->dynticks_nesting, 129 rdp->dynticks->dynticks_nesting,
@@ -268,7 +268,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
268 gpnum = rsp->gpnum; 268 gpnum = rsp->gpnum;
269 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ", 269 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ",
270 ulong2long(rsp->completed), ulong2long(gpnum), 270 ulong2long(rsp->completed), ulong2long(gpnum),
271 rsp->fqs_state, 271 rsp->gp_state,
272 (long)(rsp->jiffies_force_qs - jiffies), 272 (long)(rsp->jiffies_force_qs - jiffies),
273 (int)(jiffies & 0xffff)); 273 (int)(jiffies & 0xffff));
274 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", 274 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
@@ -361,7 +361,7 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
361 cpu_is_offline(rdp->cpu) ? '!' : ' ', 361 cpu_is_offline(rdp->cpu) ? '!' : ' ',
362 rdp->n_rcu_pending); 362 rdp->n_rcu_pending);
363 seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ", 363 seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ",
364 rdp->n_rp_qs_pending, 364 rdp->n_rp_core_needs_qs,
365 rdp->n_rp_report_qs, 365 rdp->n_rp_report_qs,
366 rdp->n_rp_cb_ready, 366 rdp->n_rp_cb_ready,
367 rdp->n_rp_cpu_needs_gp); 367 rdp->n_rp_cpu_needs_gp);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 7a0b3bc7c5ed..5f748c5a40f0 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -534,7 +534,7 @@ static void rcu_spawn_tasks_kthread(void);
534 * Post an RCU-tasks callback. First call must be from process context 534 * Post an RCU-tasks callback. First call must be from process context
 535 * after the scheduler is fully operational. 535 * after the scheduler is fully operational.
536 */ 536 */
537void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp)) 537void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
538{ 538{
539 unsigned long flags; 539 unsigned long flags;
540 bool needwake; 540 bool needwake;
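call_rcu() and call_rcu_tasks() now take the rcu_callback_t typedef rather than spelling out the function-pointer type in every prototype. A hypothetical userspace sketch of the same kind of cleanup (demo names, not the kernel's):

#include <stdio.h>

struct rcu_head_demo { struct rcu_head_demo *next; };

/* analog of: typedef void (*rcu_callback_t)(struct rcu_head *head); */
typedef void (*rcu_callback_demo_t)(struct rcu_head_demo *head);

/* every call_rcu()-style prototype can now share one spelling of the type */
static void call_rcu_demo(struct rcu_head_demo *head, rcu_callback_demo_t func)
{
        func(head);     /* a real implementation would defer the invocation */
}

static void my_cb(struct rcu_head_demo *head)
{
        (void)head;
        printf("callback invoked after a (pretend) grace period\n");
}

int main(void)
{
        struct rcu_head_demo h = { NULL };

        call_rcu_demo(&h, my_cb);
        return 0;
}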
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2f9c92884817..4d568ac9319e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -817,7 +817,7 @@ static void set_load_weight(struct task_struct *p)
817 /* 817 /*
818 * SCHED_IDLE tasks get minimal weight: 818 * SCHED_IDLE tasks get minimal weight:
819 */ 819 */
820 if (p->policy == SCHED_IDLE) { 820 if (idle_policy(p->policy)) {
821 load->weight = scale_load(WEIGHT_IDLEPRIO); 821 load->weight = scale_load(WEIGHT_IDLEPRIO);
822 load->inv_weight = WMULT_IDLEPRIO; 822 load->inv_weight = WMULT_IDLEPRIO;
823 return; 823 return;
@@ -827,17 +827,19 @@ static void set_load_weight(struct task_struct *p)
827 load->inv_weight = prio_to_wmult[prio]; 827 load->inv_weight = prio_to_wmult[prio];
828} 828}
829 829
830static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 830static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
831{ 831{
832 update_rq_clock(rq); 832 update_rq_clock(rq);
833 sched_info_queued(rq, p); 833 if (!(flags & ENQUEUE_RESTORE))
834 sched_info_queued(rq, p);
834 p->sched_class->enqueue_task(rq, p, flags); 835 p->sched_class->enqueue_task(rq, p, flags);
835} 836}
836 837
837static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 838static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
838{ 839{
839 update_rq_clock(rq); 840 update_rq_clock(rq);
840 sched_info_dequeued(rq, p); 841 if (!(flags & DEQUEUE_SAVE))
842 sched_info_dequeued(rq, p);
841 p->sched_class->dequeue_task(rq, p, flags); 843 p->sched_class->dequeue_task(rq, p, flags);
842} 844}
843 845
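enqueue_task() and dequeue_task() now skip the schedstats hooks when the new ENQUEUE_RESTORE/DEQUEUE_SAVE flags mark a task that is only leaving the runqueue briefly while one of its attributes is changed. A hypothetical userspace sketch of that convention (flag values invented for the demo):

#include <stdio.h>

#define DEQUEUE_SAVE_DEMO       0x02    /* made-up flag values */
#define ENQUEUE_RESTORE_DEMO    0x02

static int queued_stats, dequeued_stats;        /* sched_info_*() stand-ins */

static void enqueue_task_demo(int flags)
{
        if (!(flags & ENQUEUE_RESTORE_DEMO))
                queued_stats++;         /* only "real" enqueues are accounted */
        /* p->sched_class->enqueue_task() would run here */
}

static void dequeue_task_demo(int flags)
{
        if (!(flags & DEQUEUE_SAVE_DEMO))
                dequeued_stats++;       /* only "real" dequeues are accounted */
        /* p->sched_class->dequeue_task() would run here */
}

int main(void)
{
        enqueue_task_demo(0);                           /* wakeup: counted        */
        dequeue_task_demo(DEQUEUE_SAVE_DEMO);           /* attribute change...    */
        enqueue_task_demo(ENQUEUE_RESTORE_DEMO);        /* ...neither leg counted */
        printf("queued=%d dequeued=%d\n", queued_stats, dequeued_stats);  /* 1 0 */
        return 0;
}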
@@ -1178,7 +1180,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
1178 * holding rq->lock. 1180 * holding rq->lock.
1179 */ 1181 */
1180 lockdep_assert_held(&rq->lock); 1182 lockdep_assert_held(&rq->lock);
1181 dequeue_task(rq, p, 0); 1183 dequeue_task(rq, p, DEQUEUE_SAVE);
1182 } 1184 }
1183 if (running) 1185 if (running)
1184 put_prev_task(rq, p); 1186 put_prev_task(rq, p);
@@ -1188,7 +1190,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
1188 if (running) 1190 if (running)
1189 p->sched_class->set_curr_task(rq); 1191 p->sched_class->set_curr_task(rq);
1190 if (queued) 1192 if (queued)
1191 enqueue_task(rq, p, 0); 1193 enqueue_task(rq, p, ENQUEUE_RESTORE);
1192} 1194}
1193 1195
1194/* 1196/*
@@ -1292,7 +1294,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1292 1294
1293 if (task_cpu(p) != new_cpu) { 1295 if (task_cpu(p) != new_cpu) {
1294 if (p->sched_class->migrate_task_rq) 1296 if (p->sched_class->migrate_task_rq)
1295 p->sched_class->migrate_task_rq(p, new_cpu); 1297 p->sched_class->migrate_task_rq(p);
1296 p->se.nr_migrations++; 1298 p->se.nr_migrations++;
1297 perf_event_task_migrate(p); 1299 perf_event_task_migrate(p);
1298 } 1300 }
@@ -1333,12 +1335,16 @@ static int migrate_swap_stop(void *data)
1333 struct rq *src_rq, *dst_rq; 1335 struct rq *src_rq, *dst_rq;
1334 int ret = -EAGAIN; 1336 int ret = -EAGAIN;
1335 1337
1338 if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
1339 return -EAGAIN;
1340
1336 src_rq = cpu_rq(arg->src_cpu); 1341 src_rq = cpu_rq(arg->src_cpu);
1337 dst_rq = cpu_rq(arg->dst_cpu); 1342 dst_rq = cpu_rq(arg->dst_cpu);
1338 1343
1339 double_raw_lock(&arg->src_task->pi_lock, 1344 double_raw_lock(&arg->src_task->pi_lock,
1340 &arg->dst_task->pi_lock); 1345 &arg->dst_task->pi_lock);
1341 double_rq_lock(src_rq, dst_rq); 1346 double_rq_lock(src_rq, dst_rq);
1347
1342 if (task_cpu(arg->dst_task) != arg->dst_cpu) 1348 if (task_cpu(arg->dst_task) != arg->dst_cpu)
1343 goto unlock; 1349 goto unlock;
1344 1350
@@ -1574,13 +1580,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
1574 goto out; 1580 goto out;
1575 } 1581 }
1576 1582
1583 /* No more Mr. Nice Guy. */
1577 switch (state) { 1584 switch (state) {
1578 case cpuset: 1585 case cpuset:
1579 /* No more Mr. Nice Guy. */ 1586 if (IS_ENABLED(CONFIG_CPUSETS)) {
1580 cpuset_cpus_allowed_fallback(p); 1587 cpuset_cpus_allowed_fallback(p);
1581 state = possible; 1588 state = possible;
1582 break; 1589 break;
1583 1590 }
1591 /* fall-through */
1584 case possible: 1592 case possible:
1585 do_set_cpus_allowed(p, cpu_possible_mask); 1593 do_set_cpus_allowed(p, cpu_possible_mask);
1586 state = fail; 1594 state = fail;
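Using if (IS_ENABLED(CONFIG_CPUSETS)) instead of an #ifdef keeps both branches visible to the compiler while letting the cpuset-less build fall through to the 'possible' state. A simplified stand-in for the idiom (the real macro in <linux/kconfig.h> is more elaborate and also understands =m options):

#include <stdio.h>

#define CONFIG_CPUSETS_DEMO     1       /* pretend the option is =y */
#define IS_ENABLED_DEMO(opt)    (opt)   /* simplified stand-in */

enum fallback_state { state_cpuset, state_possible, state_fail };

int main(void)
{
        enum fallback_state state = state_cpuset;

        switch (state) {
        case state_cpuset:
                if (IS_ENABLED_DEMO(CONFIG_CPUSETS_DEMO)) {
                        printf("fall back to the cpuset-allowed mask\n");
                        state = state_possible;
                        break;
                }
                /* fall through when cpusets are compiled out */
        case state_possible:
                printf("fall back to cpu_possible_mask\n");
                state = state_fail;
                break;
        default:
                break;
        }
        return 0;
}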
@@ -1692,7 +1700,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1692#endif /* CONFIG_SCHEDSTATS */ 1700#endif /* CONFIG_SCHEDSTATS */
1693} 1701}
1694 1702
1695static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) 1703static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1696{ 1704{
1697 activate_task(rq, p, en_flags); 1705 activate_task(rq, p, en_flags);
1698 p->on_rq = TASK_ON_RQ_QUEUED; 1706 p->on_rq = TASK_ON_RQ_QUEUED;
@@ -2114,23 +2122,17 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
2114#endif /* CONFIG_NUMA_BALANCING */ 2122#endif /* CONFIG_NUMA_BALANCING */
2115} 2123}
2116 2124
2125DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
2126
2117#ifdef CONFIG_NUMA_BALANCING 2127#ifdef CONFIG_NUMA_BALANCING
2118#ifdef CONFIG_SCHED_DEBUG 2128
2119void set_numabalancing_state(bool enabled) 2129void set_numabalancing_state(bool enabled)
2120{ 2130{
2121 if (enabled) 2131 if (enabled)
2122 sched_feat_set("NUMA"); 2132 static_branch_enable(&sched_numa_balancing);
2123 else 2133 else
2124 sched_feat_set("NO_NUMA"); 2134 static_branch_disable(&sched_numa_balancing);
2125} 2135}
2126#else
2127__read_mostly bool numabalancing_enabled;
2128
2129void set_numabalancing_state(bool enabled)
2130{
2131 numabalancing_enabled = enabled;
2132}
2133#endif /* CONFIG_SCHED_DEBUG */
2134 2136
2135#ifdef CONFIG_PROC_SYSCTL 2137#ifdef CONFIG_PROC_SYSCTL
2136int sysctl_numa_balancing(struct ctl_table *table, int write, 2138int sysctl_numa_balancing(struct ctl_table *table, int write,
@@ -2138,7 +2140,7 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
2138{ 2140{
2139 struct ctl_table t; 2141 struct ctl_table t;
2140 int err; 2142 int err;
2141 int state = numabalancing_enabled; 2143 int state = static_branch_likely(&sched_numa_balancing);
2142 2144
2143 if (write && !capable(CAP_SYS_ADMIN)) 2145 if (write && !capable(CAP_SYS_ADMIN))
2144 return -EPERM; 2146 return -EPERM;
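The hunk above replaces the numabalancing_enabled bool (and the sched_feat()-based toggle) with a static key that is cheap to test on the fault path. The hypothetical userspace analog below only mimics the shape of the API; the real DEFINE_STATIC_KEY_FALSE()/static_branch_likely() machinery patches the branch in the instruction stream instead of loading a variable:

#include <stdbool.h>
#include <stdio.h>

static bool sched_numa_balancing_demo;          /* defaults to false */

static void set_numabalancing_state_demo(bool enabled)
{
        sched_numa_balancing_demo = enabled;    /* static_branch_{en,dis}able() */
}

static void task_numa_fault_demo(void)
{
        if (!sched_numa_balancing_demo)         /* static_branch_likely() test */
                return;                         /* feature off: bail out early */
        printf("NUMA balancing work would run here\n");
}

int main(void)
{
        task_numa_fault_demo();                 /* off by default: no output */
        set_numabalancing_state_demo(true);
        task_numa_fault_demo();                 /* now the slow path runs */
        return 0;
}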
@@ -2349,6 +2351,8 @@ void wake_up_new_task(struct task_struct *p)
2349 struct rq *rq; 2351 struct rq *rq;
2350 2352
2351 raw_spin_lock_irqsave(&p->pi_lock, flags); 2353 raw_spin_lock_irqsave(&p->pi_lock, flags);
2354 /* Initialize new task's runnable average */
2355 init_entity_runnable_average(&p->se);
2352#ifdef CONFIG_SMP 2356#ifdef CONFIG_SMP
2353 /* 2357 /*
2354 * Fork balancing, do it here and not earlier because: 2358 * Fork balancing, do it here and not earlier because:
@@ -2358,16 +2362,21 @@ void wake_up_new_task(struct task_struct *p)
2358 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); 2362 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
2359#endif 2363#endif
2360 2364
2361 /* Initialize new task's runnable average */
2362 init_entity_runnable_average(&p->se);
2363 rq = __task_rq_lock(p); 2365 rq = __task_rq_lock(p);
2364 activate_task(rq, p, 0); 2366 activate_task(rq, p, 0);
2365 p->on_rq = TASK_ON_RQ_QUEUED; 2367 p->on_rq = TASK_ON_RQ_QUEUED;
2366 trace_sched_wakeup_new(p); 2368 trace_sched_wakeup_new(p);
2367 check_preempt_curr(rq, p, WF_FORK); 2369 check_preempt_curr(rq, p, WF_FORK);
2368#ifdef CONFIG_SMP 2370#ifdef CONFIG_SMP
2369 if (p->sched_class->task_woken) 2371 if (p->sched_class->task_woken) {
2372 /*
 2373 * Nothing relies on rq->lock after this, so it's fine to
2374 * drop it.
2375 */
2376 lockdep_unpin_lock(&rq->lock);
2370 p->sched_class->task_woken(rq, p); 2377 p->sched_class->task_woken(rq, p);
2378 lockdep_pin_lock(&rq->lock);
2379 }
2371#endif 2380#endif
2372 task_rq_unlock(rq, p, &flags); 2381 task_rq_unlock(rq, p, &flags);
2373} 2382}
@@ -2476,7 +2485,6 @@ static inline void
2476prepare_task_switch(struct rq *rq, struct task_struct *prev, 2485prepare_task_switch(struct rq *rq, struct task_struct *prev,
2477 struct task_struct *next) 2486 struct task_struct *next)
2478{ 2487{
2479 trace_sched_switch(prev, next);
2480 sched_info_switch(rq, prev, next); 2488 sched_info_switch(rq, prev, next);
2481 perf_event_task_sched_out(prev, next); 2489 perf_event_task_sched_out(prev, next);
2482 fire_sched_out_preempt_notifiers(prev, next); 2490 fire_sched_out_preempt_notifiers(prev, next);
@@ -2510,6 +2518,22 @@ static struct rq *finish_task_switch(struct task_struct *prev)
2510 struct mm_struct *mm = rq->prev_mm; 2518 struct mm_struct *mm = rq->prev_mm;
2511 long prev_state; 2519 long prev_state;
2512 2520
2521 /*
2522 * The previous task will have left us with a preempt_count of 2
2523 * because it left us after:
2524 *
2525 * schedule()
2526 * preempt_disable(); // 1
2527 * __schedule()
2528 * raw_spin_lock_irq(&rq->lock) // 2
2529 *
2530 * Also, see FORK_PREEMPT_COUNT.
2531 */
2532 if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
2533 "corrupted preempt_count: %s/%d/0x%x\n",
2534 current->comm, current->pid, preempt_count()))
2535 preempt_count_set(FORK_PREEMPT_COUNT);
2536
2513 rq->prev_mm = NULL; 2537 rq->prev_mm = NULL;
2514 2538
2515 /* 2539 /*
@@ -2517,11 +2541,11 @@ static struct rq *finish_task_switch(struct task_struct *prev)
2517 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 2541 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
2518 * schedule one last time. The schedule call will never return, and 2542 * schedule one last time. The schedule call will never return, and
2519 * the scheduled task must drop that reference. 2543 * the scheduled task must drop that reference.
2520 * The test for TASK_DEAD must occur while the runqueue locks are 2544 *
2521 * still held, otherwise prev could be scheduled on another cpu, die 2545 * We must observe prev->state before clearing prev->on_cpu (in
 2522 * there before we look at prev->state, and then the reference would 2546 * finish_lock_switch), otherwise a concurrent wakeup can get prev
 2523 * be dropped twice. 2547 * running on another CPU and we could race with its RUNNING -> DEAD
2524 * Manfred Spraul <manfred@colorfullife.com> 2548 * transition, resulting in a double drop.
2525 */ 2549 */
2526 prev_state = prev->state; 2550 prev_state = prev->state;
2527 vtime_task_switch(prev); 2551 vtime_task_switch(prev);
@@ -2594,8 +2618,15 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
2594{ 2618{
2595 struct rq *rq; 2619 struct rq *rq;
2596 2620
 2597 /* finish_task_switch() drops rq->lock and enables preemption */ 2621 /*
2598 preempt_disable(); 2622 * New tasks start with FORK_PREEMPT_COUNT, see there and
2623 * finish_task_switch() for details.
2624 *
2625 * finish_task_switch() will drop rq->lock() and lower preempt_count
2626 * and the preempt_enable() will end up enabling preemption (on
2627 * PREEMPT_COUNT kernels).
2628 */
2629
2599 rq = finish_task_switch(prev); 2630 rq = finish_task_switch(prev);
2600 balance_callback(rq); 2631 balance_callback(rq);
2601 preempt_enable(); 2632 preempt_enable();
@@ -2953,15 +2984,13 @@ static noinline void __schedule_bug(struct task_struct *prev)
2953static inline void schedule_debug(struct task_struct *prev) 2984static inline void schedule_debug(struct task_struct *prev)
2954{ 2985{
2955#ifdef CONFIG_SCHED_STACK_END_CHECK 2986#ifdef CONFIG_SCHED_STACK_END_CHECK
2956 BUG_ON(unlikely(task_stack_end_corrupted(prev))); 2987 BUG_ON(task_stack_end_corrupted(prev));
2957#endif 2988#endif
2958 /* 2989
2959 * Test if we are atomic. Since do_exit() needs to call into 2990 if (unlikely(in_atomic_preempt_off())) {
2960 * schedule() atomically, we ignore that path. Otherwise whine
2961 * if we are scheduling when we should not.
2962 */
2963 if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
2964 __schedule_bug(prev); 2991 __schedule_bug(prev);
2992 preempt_count_set(PREEMPT_DISABLED);
2993 }
2965 rcu_sleep_check(); 2994 rcu_sleep_check();
2966 2995
2967 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 2996 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -3047,7 +3076,7 @@ again:
3047 * 3076 *
3048 * WARNING: must be called with preemption disabled! 3077 * WARNING: must be called with preemption disabled!
3049 */ 3078 */
3050static void __sched __schedule(void) 3079static void __sched notrace __schedule(bool preempt)
3051{ 3080{
3052 struct task_struct *prev, *next; 3081 struct task_struct *prev, *next;
3053 unsigned long *switch_count; 3082 unsigned long *switch_count;
@@ -3059,6 +3088,17 @@ static void __sched __schedule(void)
3059 rcu_note_context_switch(); 3088 rcu_note_context_switch();
3060 prev = rq->curr; 3089 prev = rq->curr;
3061 3090
3091 /*
3092 * do_exit() calls schedule() with preemption disabled as an exception;
3093 * however we must fix that up, otherwise the next task will see an
3094 * inconsistent (higher) preempt count.
3095 *
3096 * It also avoids the below schedule_debug() test from complaining
3097 * about this.
3098 */
3099 if (unlikely(prev->state == TASK_DEAD))
3100 preempt_enable_no_resched_notrace();
3101
3062 schedule_debug(prev); 3102 schedule_debug(prev);
3063 3103
3064 if (sched_feat(HRTICK)) 3104 if (sched_feat(HRTICK))
@@ -3076,7 +3116,7 @@ static void __sched __schedule(void)
3076 rq->clock_skip_update <<= 1; /* promote REQ to ACT */ 3116 rq->clock_skip_update <<= 1; /* promote REQ to ACT */
3077 3117
3078 switch_count = &prev->nivcsw; 3118 switch_count = &prev->nivcsw;
3079 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3119 if (!preempt && prev->state) {
3080 if (unlikely(signal_pending_state(prev->state, prev))) { 3120 if (unlikely(signal_pending_state(prev->state, prev))) {
3081 prev->state = TASK_RUNNING; 3121 prev->state = TASK_RUNNING;
3082 } else { 3122 } else {
@@ -3112,6 +3152,7 @@ static void __sched __schedule(void)
3112 rq->curr = next; 3152 rq->curr = next;
3113 ++*switch_count; 3153 ++*switch_count;
3114 3154
3155 trace_sched_switch(preempt, prev, next);
3115 rq = context_switch(rq, prev, next); /* unlocks the rq */ 3156 rq = context_switch(rq, prev, next); /* unlocks the rq */
3116 cpu = cpu_of(rq); 3157 cpu = cpu_of(rq);
3117 } else { 3158 } else {
@@ -3141,7 +3182,7 @@ asmlinkage __visible void __sched schedule(void)
3141 sched_submit_work(tsk); 3182 sched_submit_work(tsk);
3142 do { 3183 do {
3143 preempt_disable(); 3184 preempt_disable();
3144 __schedule(); 3185 __schedule(false);
3145 sched_preempt_enable_no_resched(); 3186 sched_preempt_enable_no_resched();
3146 } while (need_resched()); 3187 } while (need_resched());
3147} 3188}
@@ -3181,9 +3222,9 @@ void __sched schedule_preempt_disabled(void)
3181static void __sched notrace preempt_schedule_common(void) 3222static void __sched notrace preempt_schedule_common(void)
3182{ 3223{
3183 do { 3224 do {
3184 preempt_active_enter(); 3225 preempt_disable_notrace();
3185 __schedule(); 3226 __schedule(true);
3186 preempt_active_exit(); 3227 preempt_enable_no_resched_notrace();
3187 3228
3188 /* 3229 /*
3189 * Check again in case we missed a preemption opportunity 3230 * Check again in case we missed a preemption opportunity
@@ -3234,24 +3275,17 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
3234 return; 3275 return;
3235 3276
3236 do { 3277 do {
3237 /* 3278 preempt_disable_notrace();
3238 * Use raw __prempt_count() ops that don't call function.
3239 * We can't call functions before disabling preemption which
3240 * disarm preemption tracing recursions.
3241 */
3242 __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
3243 barrier();
3244 /* 3279 /*
3245 * Needs preempt disabled in case user_exit() is traced 3280 * Needs preempt disabled in case user_exit() is traced
3246 * and the tracer calls preempt_enable_notrace() causing 3281 * and the tracer calls preempt_enable_notrace() causing
3247 * an infinite recursion. 3282 * an infinite recursion.
3248 */ 3283 */
3249 prev_ctx = exception_enter(); 3284 prev_ctx = exception_enter();
3250 __schedule(); 3285 __schedule(true);
3251 exception_exit(prev_ctx); 3286 exception_exit(prev_ctx);
3252 3287
3253 barrier(); 3288 preempt_enable_no_resched_notrace();
3254 __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
3255 } while (need_resched()); 3289 } while (need_resched());
3256} 3290}
3257EXPORT_SYMBOL_GPL(preempt_schedule_notrace); 3291EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
@@ -3274,11 +3308,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
3274 prev_state = exception_enter(); 3308 prev_state = exception_enter();
3275 3309
3276 do { 3310 do {
3277 preempt_active_enter(); 3311 preempt_disable();
3278 local_irq_enable(); 3312 local_irq_enable();
3279 __schedule(); 3313 __schedule(true);
3280 local_irq_disable(); 3314 local_irq_disable();
3281 preempt_active_exit(); 3315 sched_preempt_enable_no_resched();
3282 } while (need_resched()); 3316 } while (need_resched());
3283 3317
3284 exception_exit(prev_state); 3318 exception_exit(prev_state);
@@ -3306,7 +3340,7 @@ EXPORT_SYMBOL(default_wake_function);
3306 */ 3340 */
3307void rt_mutex_setprio(struct task_struct *p, int prio) 3341void rt_mutex_setprio(struct task_struct *p, int prio)
3308{ 3342{
3309 int oldprio, queued, running, enqueue_flag = 0; 3343 int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
3310 struct rq *rq; 3344 struct rq *rq;
3311 const struct sched_class *prev_class; 3345 const struct sched_class *prev_class;
3312 3346
@@ -3338,7 +3372,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3338 queued = task_on_rq_queued(p); 3372 queued = task_on_rq_queued(p);
3339 running = task_current(rq, p); 3373 running = task_current(rq, p);
3340 if (queued) 3374 if (queued)
3341 dequeue_task(rq, p, 0); 3375 dequeue_task(rq, p, DEQUEUE_SAVE);
3342 if (running) 3376 if (running)
3343 put_prev_task(rq, p); 3377 put_prev_task(rq, p);
3344 3378
@@ -3356,7 +3390,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3356 if (!dl_prio(p->normal_prio) || 3390 if (!dl_prio(p->normal_prio) ||
3357 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { 3391 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
3358 p->dl.dl_boosted = 1; 3392 p->dl.dl_boosted = 1;
3359 enqueue_flag = ENQUEUE_REPLENISH; 3393 enqueue_flag |= ENQUEUE_REPLENISH;
3360 } else 3394 } else
3361 p->dl.dl_boosted = 0; 3395 p->dl.dl_boosted = 0;
3362 p->sched_class = &dl_sched_class; 3396 p->sched_class = &dl_sched_class;
@@ -3364,7 +3398,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3364 if (dl_prio(oldprio)) 3398 if (dl_prio(oldprio))
3365 p->dl.dl_boosted = 0; 3399 p->dl.dl_boosted = 0;
3366 if (oldprio < prio) 3400 if (oldprio < prio)
3367 enqueue_flag = ENQUEUE_HEAD; 3401 enqueue_flag |= ENQUEUE_HEAD;
3368 p->sched_class = &rt_sched_class; 3402 p->sched_class = &rt_sched_class;
3369 } else { 3403 } else {
3370 if (dl_prio(oldprio)) 3404 if (dl_prio(oldprio))
@@ -3416,7 +3450,7 @@ void set_user_nice(struct task_struct *p, long nice)
3416 } 3450 }
3417 queued = task_on_rq_queued(p); 3451 queued = task_on_rq_queued(p);
3418 if (queued) 3452 if (queued)
3419 dequeue_task(rq, p, 0); 3453 dequeue_task(rq, p, DEQUEUE_SAVE);
3420 3454
3421 p->static_prio = NICE_TO_PRIO(nice); 3455 p->static_prio = NICE_TO_PRIO(nice);
3422 set_load_weight(p); 3456 set_load_weight(p);
@@ -3425,7 +3459,7 @@ void set_user_nice(struct task_struct *p, long nice)
3425 delta = p->prio - old_prio; 3459 delta = p->prio - old_prio;
3426 3460
3427 if (queued) { 3461 if (queued) {
3428 enqueue_task(rq, p, 0); 3462 enqueue_task(rq, p, ENQUEUE_RESTORE);
3429 /* 3463 /*
3430 * If the task increased its priority or is running and 3464 * If the task increased its priority or is running and
3431 * lowered its priority, then reschedule its CPU: 3465 * lowered its priority, then reschedule its CPU:
@@ -3746,10 +3780,7 @@ recheck:
3746 } else { 3780 } else {
3747 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); 3781 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
3748 3782
3749 if (policy != SCHED_DEADLINE && 3783 if (!valid_policy(policy))
3750 policy != SCHED_FIFO && policy != SCHED_RR &&
3751 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
3752 policy != SCHED_IDLE)
3753 return -EINVAL; 3784 return -EINVAL;
3754 } 3785 }
3755 3786
@@ -3805,7 +3836,7 @@ recheck:
3805 * Treat SCHED_IDLE as nice 20. Only allow a switch to 3836 * Treat SCHED_IDLE as nice 20. Only allow a switch to
3806 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 3837 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
3807 */ 3838 */
3808 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { 3839 if (idle_policy(p->policy) && !idle_policy(policy)) {
3809 if (!can_nice(p, task_nice(p))) 3840 if (!can_nice(p, task_nice(p)))
3810 return -EPERM; 3841 return -EPERM;
3811 } 3842 }
@@ -3930,7 +3961,7 @@ change:
3930 queued = task_on_rq_queued(p); 3961 queued = task_on_rq_queued(p);
3931 running = task_current(rq, p); 3962 running = task_current(rq, p);
3932 if (queued) 3963 if (queued)
3933 dequeue_task(rq, p, 0); 3964 dequeue_task(rq, p, DEQUEUE_SAVE);
3934 if (running) 3965 if (running)
3935 put_prev_task(rq, p); 3966 put_prev_task(rq, p);
3936 3967
@@ -3940,11 +3971,15 @@ change:
3940 if (running) 3971 if (running)
3941 p->sched_class->set_curr_task(rq); 3972 p->sched_class->set_curr_task(rq);
3942 if (queued) { 3973 if (queued) {
3974 int enqueue_flags = ENQUEUE_RESTORE;
3943 /* 3975 /*
3944 * We enqueue to tail when the priority of a task is 3976 * We enqueue to tail when the priority of a task is
3945 * increased (user space view). 3977 * increased (user space view).
3946 */ 3978 */
3947 enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0); 3979 if (oldprio <= p->prio)
3980 enqueue_flags |= ENQUEUE_HEAD;
3981
3982 enqueue_task(rq, p, enqueue_flags);
3948 } 3983 }
3949 3984
3950 check_class_changed(rq, p, prev_class, oldprio); 3985 check_class_changed(rq, p, prev_class, oldprio);
@@ -4022,6 +4057,7 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4022{ 4057{
4023 return _sched_setscheduler(p, policy, param, false); 4058 return _sched_setscheduler(p, policy, param, false);
4024} 4059}
4060EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
4025 4061
4026static int 4062static int
4027do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 4063do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
@@ -4934,7 +4970,15 @@ void init_idle(struct task_struct *idle, int cpu)
4934 idle->state = TASK_RUNNING; 4970 idle->state = TASK_RUNNING;
4935 idle->se.exec_start = sched_clock(); 4971 idle->se.exec_start = sched_clock();
4936 4972
4937 do_set_cpus_allowed(idle, cpumask_of(cpu)); 4973#ifdef CONFIG_SMP
4974 /*
 4975 * It's possible that init_idle() gets called multiple times on a task,
4976 * in that case do_set_cpus_allowed() will not do the right thing.
4977 *
4978 * And since this is boot we can forgo the serialization.
4979 */
4980 set_cpus_allowed_common(idle, cpumask_of(cpu));
4981#endif
4938 /* 4982 /*
4939 * We're having a chicken and egg problem, even though we are 4983 * We're having a chicken and egg problem, even though we are
4940 * holding rq->lock, the cpu isn't yet set to this cpu so the 4984 * holding rq->lock, the cpu isn't yet set to this cpu so the
@@ -4951,7 +4995,7 @@ void init_idle(struct task_struct *idle, int cpu)
4951 4995
4952 rq->curr = rq->idle = idle; 4996 rq->curr = rq->idle = idle;
4953 idle->on_rq = TASK_ON_RQ_QUEUED; 4997 idle->on_rq = TASK_ON_RQ_QUEUED;
4954#if defined(CONFIG_SMP) 4998#ifdef CONFIG_SMP
4955 idle->on_cpu = 1; 4999 idle->on_cpu = 1;
4956#endif 5000#endif
4957 raw_spin_unlock(&rq->lock); 5001 raw_spin_unlock(&rq->lock);
@@ -4966,7 +5010,7 @@ void init_idle(struct task_struct *idle, int cpu)
4966 idle->sched_class = &idle_sched_class; 5010 idle->sched_class = &idle_sched_class;
4967 ftrace_graph_init_idle_task(idle, cpu); 5011 ftrace_graph_init_idle_task(idle, cpu);
4968 vtime_init_idle(idle, cpu); 5012 vtime_init_idle(idle, cpu);
4969#if defined(CONFIG_SMP) 5013#ifdef CONFIG_SMP
4970 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); 5014 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
4971#endif 5015#endif
4972} 5016}
@@ -5085,7 +5129,7 @@ void sched_setnuma(struct task_struct *p, int nid)
5085 running = task_current(rq, p); 5129 running = task_current(rq, p);
5086 5130
5087 if (queued) 5131 if (queued)
5088 dequeue_task(rq, p, 0); 5132 dequeue_task(rq, p, DEQUEUE_SAVE);
5089 if (running) 5133 if (running)
5090 put_prev_task(rq, p); 5134 put_prev_task(rq, p);
5091 5135
@@ -5094,7 +5138,7 @@ void sched_setnuma(struct task_struct *p, int nid)
5094 if (running) 5138 if (running)
5095 p->sched_class->set_curr_task(rq); 5139 p->sched_class->set_curr_task(rq);
5096 if (queued) 5140 if (queued)
5097 enqueue_task(rq, p, 0); 5141 enqueue_task(rq, p, ENQUEUE_RESTORE);
5098 task_rq_unlock(rq, p, &flags); 5142 task_rq_unlock(rq, p, &flags);
5099} 5143}
5100#endif /* CONFIG_NUMA_BALANCING */ 5144#endif /* CONFIG_NUMA_BALANCING */
@@ -5515,21 +5559,27 @@ static void set_cpu_rq_start_time(void)
5515static int sched_cpu_active(struct notifier_block *nfb, 5559static int sched_cpu_active(struct notifier_block *nfb,
5516 unsigned long action, void *hcpu) 5560 unsigned long action, void *hcpu)
5517{ 5561{
5562 int cpu = (long)hcpu;
5563
5518 switch (action & ~CPU_TASKS_FROZEN) { 5564 switch (action & ~CPU_TASKS_FROZEN) {
5519 case CPU_STARTING: 5565 case CPU_STARTING:
5520 set_cpu_rq_start_time(); 5566 set_cpu_rq_start_time();
5521 return NOTIFY_OK; 5567 return NOTIFY_OK;
5568
5522 case CPU_ONLINE: 5569 case CPU_ONLINE:
5523 /* 5570 /*
5524 * At this point a starting CPU has marked itself as online via 5571 * At this point a starting CPU has marked itself as online via
5525 * set_cpu_online(). But it might not yet have marked itself 5572 * set_cpu_online(). But it might not yet have marked itself
5526 * as active, which is essential from here on. 5573 * as active, which is essential from here on.
5527 *
5528 * Thus, fall-through and help the starting CPU along.
5529 */ 5574 */
5575 set_cpu_active(cpu, true);
5576 stop_machine_unpark(cpu);
5577 return NOTIFY_OK;
5578
5530 case CPU_DOWN_FAILED: 5579 case CPU_DOWN_FAILED:
5531 set_cpu_active((long)hcpu, true); 5580 set_cpu_active(cpu, true);
5532 return NOTIFY_OK; 5581 return NOTIFY_OK;
5582
5533 default: 5583 default:
5534 return NOTIFY_DONE; 5584 return NOTIFY_DONE;
5535 } 5585 }
@@ -6461,7 +6511,8 @@ static struct sched_domain_topology_level default_topology[] = {
6461 { NULL, }, 6511 { NULL, },
6462}; 6512};
6463 6513
6464struct sched_domain_topology_level *sched_domain_topology = default_topology; 6514static struct sched_domain_topology_level *sched_domain_topology =
6515 default_topology;
6465 6516
6466#define for_each_sd_topology(tl) \ 6517#define for_each_sd_topology(tl) \
6467 for (tl = sched_domain_topology; tl->mask; tl++) 6518 for (tl = sched_domain_topology; tl->mask; tl++)
@@ -7230,9 +7281,6 @@ void __init sched_init_smp(void)
7230 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7281 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7231 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7282 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7232 7283
7233 /* nohz_full won't take effect without isolating the cpus. */
7234 tick_nohz_full_add_cpus_to(cpu_isolated_map);
7235
7236 sched_init_numa(); 7284 sched_init_numa();
7237 7285
7238 /* 7286 /*
@@ -7465,7 +7513,7 @@ void __init sched_init(void)
7465#ifdef CONFIG_DEBUG_ATOMIC_SLEEP 7513#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
7466static inline int preempt_count_equals(int preempt_offset) 7514static inline int preempt_count_equals(int preempt_offset)
7467{ 7515{
7468 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 7516 int nested = preempt_count() + rcu_preempt_depth();
7469 7517
7470 return (nested == preempt_offset); 7518 return (nested == preempt_offset);
7471} 7519}
@@ -7712,7 +7760,7 @@ void sched_move_task(struct task_struct *tsk)
7712 queued = task_on_rq_queued(tsk); 7760 queued = task_on_rq_queued(tsk);
7713 7761
7714 if (queued) 7762 if (queued)
7715 dequeue_task(rq, tsk, 0); 7763 dequeue_task(rq, tsk, DEQUEUE_SAVE);
7716 if (unlikely(running)) 7764 if (unlikely(running))
7717 put_prev_task(rq, tsk); 7765 put_prev_task(rq, tsk);
7718 7766
@@ -7728,7 +7776,7 @@ void sched_move_task(struct task_struct *tsk)
7728 7776
7729#ifdef CONFIG_FAIR_GROUP_SCHED 7777#ifdef CONFIG_FAIR_GROUP_SCHED
7730 if (tsk->sched_class->task_move_group) 7778 if (tsk->sched_class->task_move_group)
7731 tsk->sched_class->task_move_group(tsk, queued); 7779 tsk->sched_class->task_move_group(tsk);
7732 else 7780 else
7733#endif 7781#endif
7734 set_task_rq(tsk, task_cpu(tsk)); 7782 set_task_rq(tsk, task_cpu(tsk));
@@ -7736,7 +7784,7 @@ void sched_move_task(struct task_struct *tsk)
7736 if (unlikely(running)) 7784 if (unlikely(running))
7737 tsk->sched_class->set_curr_task(rq); 7785 tsk->sched_class->set_curr_task(rq);
7738 if (queued) 7786 if (queued)
7739 enqueue_task(rq, tsk, 0); 7787 enqueue_task(rq, tsk, ENQUEUE_RESTORE);
7740 7788
7741 task_rq_unlock(rq, tsk, &flags); 7789 task_rq_unlock(rq, tsk, &flags);
7742} 7790}
@@ -8196,21 +8244,6 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
8196 sched_move_task(task); 8244 sched_move_task(task);
8197} 8245}
8198 8246
8199static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
8200 struct cgroup_subsys_state *old_css,
8201 struct task_struct *task)
8202{
8203 /*
8204 * cgroup_exit() is called in the copy_process() failure path.
8205 * Ignore this case since the task hasn't ran yet, this avoids
8206 * trying to poke a half freed task state from generic code.
8207 */
8208 if (!(task->flags & PF_EXITING))
8209 return;
8210
8211 sched_move_task(task);
8212}
8213
8214#ifdef CONFIG_FAIR_GROUP_SCHED 8247#ifdef CONFIG_FAIR_GROUP_SCHED
8215static int cpu_shares_write_u64(struct cgroup_subsys_state *css, 8248static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
8216 struct cftype *cftype, u64 shareval) 8249 struct cftype *cftype, u64 shareval)
@@ -8542,7 +8575,6 @@ struct cgroup_subsys cpu_cgrp_subsys = {
8542 .fork = cpu_cgroup_fork, 8575 .fork = cpu_cgroup_fork,
8543 .can_attach = cpu_cgroup_can_attach, 8576 .can_attach = cpu_cgroup_can_attach,
8544 .attach = cpu_cgroup_attach, 8577 .attach = cpu_cgroup_attach,
8545 .exit = cpu_cgroup_exit,
8546 .legacy_cftypes = cpu_files, 8578 .legacy_cftypes = cpu_files,
8547 .early_init = 1, 8579 .early_init = 1,
8548}; 8580};
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index c6acb07466bb..5a75b08cfd85 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -31,11 +31,6 @@ static inline int right_child(int i)
31 return (i << 1) + 2; 31 return (i << 1) + 2;
32} 32}
33 33
34static inline int dl_time_before(u64 a, u64 b)
35{
36 return (s64)(a - b) < 0;
37}
38
39static void cpudl_exchange(struct cpudl *cp, int a, int b) 34static void cpudl_exchange(struct cpudl *cp, int a, int b)
40{ 35{
41 int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; 36 int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
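dl_time_before() moves into <linux/sched/deadline.h> so all deadline comparisons share one helper; its (s64)(a - b) < 0 form is the usual wrap-safe ordering test for free-running 64-bit clocks. A hypothetical standalone check of the wrap behaviour:

#include <stdio.h>
#include <stdint.h>

/* wrap-safe "a is earlier than b", as in dl_time_before() */
static int time_before64(uint64_t a, uint64_t b)
{
        return (int64_t)(a - b) < 0;
}

int main(void)
{
        uint64_t near_wrap = UINT64_MAX - 5;    /* clock about to wrap */
        uint64_t wrapped   = 10;                /* clock just after wrapping */

        printf("%d\n", time_before64(near_wrap, wrapped));      /* 1: earlier */
        printf("%d\n", time_before64(wrapped, near_wrap));      /* 0: later   */
        return 0;
}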
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 1a0a6ef2fbe1..fcbdf83fed7e 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -2,6 +2,7 @@
2#define _LINUX_CPUDL_H 2#define _LINUX_CPUDL_H
3 3
4#include <linux/sched.h> 4#include <linux/sched.h>
5#include <linux/sched/deadline.h>
5 6
6#define IDX_INVALID -1 7#define IDX_INVALID -1
7 8
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8cbc3db671df..26a54461bf59 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -444,6 +444,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
444 *ut = p->utime; 444 *ut = p->utime;
445 *st = p->stime; 445 *st = p->stime;
446} 446}
447EXPORT_SYMBOL_GPL(task_cputime_adjusted);
447 448
448void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) 449void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
449{ 450{
@@ -652,6 +653,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
652 task_cputime(p, &cputime.utime, &cputime.stime); 653 task_cputime(p, &cputime.utime, &cputime.stime);
653 cputime_adjust(&cputime, &p->prev_cputime, ut, st); 654 cputime_adjust(&cputime, &p->prev_cputime, ut, st);
654} 655}
656EXPORT_SYMBOL_GPL(task_cputime_adjusted);
655 657
656void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) 658void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
657{ 659{
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fc8f01083527..8b0a15e285f9 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -668,8 +668,15 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
668 * Queueing this task back might have overloaded rq, check if we need 668 * Queueing this task back might have overloaded rq, check if we need
669 * to kick someone away. 669 * to kick someone away.
670 */ 670 */
671 if (has_pushable_dl_tasks(rq)) 671 if (has_pushable_dl_tasks(rq)) {
672 /*
 673 * Nothing relies on rq->lock after this, so it's safe to drop
674 * rq->lock.
675 */
676 lockdep_unpin_lock(&rq->lock);
672 push_dl_task(rq); 677 push_dl_task(rq);
678 lockdep_pin_lock(&rq->lock);
679 }
673#endif 680#endif
674 681
675unlock: 682unlock:
@@ -1066,8 +1073,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
1066 int target = find_later_rq(p); 1073 int target = find_later_rq(p);
1067 1074
1068 if (target != -1 && 1075 if (target != -1 &&
1069 dl_time_before(p->dl.deadline, 1076 (dl_time_before(p->dl.deadline,
1070 cpu_rq(target)->dl.earliest_dl.curr)) 1077 cpu_rq(target)->dl.earliest_dl.curr) ||
1078 (cpu_rq(target)->dl.dl_nr_running == 0)))
1071 cpu = target; 1079 cpu = target;
1072 } 1080 }
1073 rcu_read_unlock(); 1081 rcu_read_unlock();
@@ -1417,7 +1425,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
1417 1425
1418 later_rq = cpu_rq(cpu); 1426 later_rq = cpu_rq(cpu);
1419 1427
1420 if (!dl_time_before(task->dl.deadline, 1428 if (later_rq->dl.dl_nr_running &&
1429 !dl_time_before(task->dl.deadline,
1421 later_rq->dl.earliest_dl.curr)) { 1430 later_rq->dl.earliest_dl.curr)) {
1422 /* 1431 /*
1423 * Target rq has tasks of equal or earlier deadline, 1432 * Target rq has tasks of equal or earlier deadline,
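The deadline changes above also treat a later_rq with dl.dl_nr_running == 0 as a valid target, because its ->earliest_dl.curr is stale when no deadline task is queued. A hypothetical sketch of that selection test:

#include <stdio.h>
#include <stdint.h>

struct dl_rq_demo {
        unsigned int dl_nr_running;
        uint64_t earliest_dl_curr;      /* only meaningful when dl_nr_running > 0 */
};

static int dl_time_before_demo(uint64_t a, uint64_t b)
{
        return (int64_t)(a - b) < 0;
}

/* accept the candidate if it has no deadline work at all, or if our
 * deadline beats the earliest one it is currently serving */
static int later_rq_suitable(const struct dl_rq_demo *later, uint64_t deadline)
{
        return later->dl_nr_running == 0 ||
               dl_time_before_demo(deadline, later->earliest_dl_curr);
}

int main(void)
{
        struct dl_rq_demo idle_rq = { 0, 0 };           /* stale earliest value */
        struct dl_rq_demo busy_rq = { 2, 1000 };

        printf("%d %d\n",
               later_rq_suitable(&idle_rq, 5000),       /* 1: no DL work there  */
               later_rq_suitable(&busy_rq, 5000));      /* 0: its deadline wins */
        return 0;
}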
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6e2e3483b1ec..824aa9f501a3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -661,11 +661,12 @@ static unsigned long task_h_load(struct task_struct *p);
661 661
662/* 662/*
663 * We choose a half-life close to 1 scheduling period. 663 * We choose a half-life close to 1 scheduling period.
664 * Note: The tables below are dependent on this value. 664 * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
665 * dependent on this value.
665 */ 666 */
666#define LOAD_AVG_PERIOD 32 667#define LOAD_AVG_PERIOD 32
667#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ 668#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
668#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ 669#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
669 670
670/* Give new sched_entity start runnable values to heavy its load in infant time */ 671/* Give new sched_entity start runnable values to heavy its load in infant time */
671void init_entity_runnable_average(struct sched_entity *se) 672void init_entity_runnable_average(struct sched_entity *se)
@@ -682,7 +683,7 @@ void init_entity_runnable_average(struct sched_entity *se)
682 sa->load_avg = scale_load_down(se->load.weight); 683 sa->load_avg = scale_load_down(se->load.weight);
683 sa->load_sum = sa->load_avg * LOAD_AVG_MAX; 684 sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
684 sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); 685 sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
685 sa->util_sum = LOAD_AVG_MAX; 686 sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
686 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ 687 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
687} 688}
688 689
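As a worked check on the constants referenced above (standalone arithmetic, not kernel code): with a half-life of LOAD_AVG_PERIOD = 32 periods the per-period decay is y = 0.5^(1/32), and the geometric series 1024 * (1 + y + y^2 + ...) converges near the LOAD_AVG_MAX value of 47742; the kernel constant is slightly lower because it is derived from the truncated integer tables. Link with -lm:

#include <math.h>
#include <stdio.h>

int main(void)
{
        double y = pow(0.5, 1.0 / 32.0);        /* y^32 == 0.5: the half-life */
        double limit = 1024.0 / (1.0 - y);      /* sum of 1024 * y^n, n >= 0  */

        printf("y           = %.6f\n", y);      /* ~0.978572 */
        printf("limit       = %.1f\n", limit);  /* ~47788, cf. LOAD_AVG_MAX   */
        printf("47742/limit = %.3f\n", 47742.0 / limit);
        return 0;
}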
@@ -2069,7 +2070,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2069 int local = !!(flags & TNF_FAULT_LOCAL); 2070 int local = !!(flags & TNF_FAULT_LOCAL);
2070 int priv; 2071 int priv;
2071 2072
2072 if (!numabalancing_enabled) 2073 if (!static_branch_likely(&sched_numa_balancing))
2073 return; 2074 return;
2074 2075
2075 /* for example, ksmd faulting in a user's mm */ 2076 /* for example, ksmd faulting in a user's mm */
@@ -2157,7 +2158,7 @@ void task_numa_work(struct callback_head *work)
2157 struct vm_area_struct *vma; 2158 struct vm_area_struct *vma;
2158 unsigned long start, end; 2159 unsigned long start, end;
2159 unsigned long nr_pte_updates = 0; 2160 unsigned long nr_pte_updates = 0;
2160 long pages; 2161 long pages, virtpages;
2161 2162
2162 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); 2163 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
2163 2164
@@ -2203,9 +2204,11 @@ void task_numa_work(struct callback_head *work)
2203 start = mm->numa_scan_offset; 2204 start = mm->numa_scan_offset;
2204 pages = sysctl_numa_balancing_scan_size; 2205 pages = sysctl_numa_balancing_scan_size;
2205 pages <<= 20 - PAGE_SHIFT; /* MB in pages */ 2206 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
2207 virtpages = pages * 8; /* Scan up to this much virtual space */
2206 if (!pages) 2208 if (!pages)
2207 return; 2209 return;
2208 2210
2211
2209 down_read(&mm->mmap_sem); 2212 down_read(&mm->mmap_sem);
2210 vma = find_vma(mm, start); 2213 vma = find_vma(mm, start);
2211 if (!vma) { 2214 if (!vma) {
@@ -2240,18 +2243,22 @@ void task_numa_work(struct callback_head *work)
2240 start = max(start, vma->vm_start); 2243 start = max(start, vma->vm_start);
2241 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); 2244 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2242 end = min(end, vma->vm_end); 2245 end = min(end, vma->vm_end);
2243 nr_pte_updates += change_prot_numa(vma, start, end); 2246 nr_pte_updates = change_prot_numa(vma, start, end);
2244 2247
2245 /* 2248 /*
 2246 * Scan sysctl_numa_balancing_scan_size but ensure that 2249 * Try to scan sysctl_numa_balancing_scan_size worth of
2247 * at least one PTE is updated so that unused virtual 2250 * hpages that have at least one present PTE that
2248 * address space is quickly skipped. 2251 * is not already pte-numa. If the VMA contains
2252 * areas that are unused or already full of prot_numa
2253 * PTEs, scan up to virtpages, to skip through those
2254 * areas faster.
2249 */ 2255 */
2250 if (nr_pte_updates) 2256 if (nr_pte_updates)
2251 pages -= (end - start) >> PAGE_SHIFT; 2257 pages -= (end - start) >> PAGE_SHIFT;
2258 virtpages -= (end - start) >> PAGE_SHIFT;
2252 2259
2253 start = end; 2260 start = end;
2254 if (pages <= 0) 2261 if (pages <= 0 || virtpages <= 0)
2255 goto out; 2262 goto out;
2256 2263
2257 cond_resched(); 2264 cond_resched();
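task_numa_work() now runs with two budgets: pages, the amount it actually wants to mark for NUMA hinting faults, and virtpages (8x larger), a cap on how much virtual address space it will walk when the PTE updates are not landing. A hypothetical sketch of the dual-budget loop:

#include <stdio.h>

int main(void)
{
        long pages = 64;                /* useful-work budget (pages marked) */
        long virtpages = pages * 8;     /* virtual-address-space budget      */
        long chunk = 16;                /* pages covered per iteration       */

        for (int i = 0; pages > 0 && virtpages > 0; i++) {
                int updated = (i % 2 == 0);     /* pretend every other range
                                                 * actually had PTEs to mark */
                if (updated)
                        pages -= chunk;         /* charge only useful work   */
                virtpages -= chunk;             /* always charge the walk    */
        }
        printf("remaining: pages=%ld virtpages=%ld\n", pages, virtpages);
        return 0;
}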
@@ -2363,7 +2370,7 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
2363 */ 2370 */
2364 tg_weight = atomic_long_read(&tg->load_avg); 2371 tg_weight = atomic_long_read(&tg->load_avg);
2365 tg_weight -= cfs_rq->tg_load_avg_contrib; 2372 tg_weight -= cfs_rq->tg_load_avg_contrib;
2366 tg_weight += cfs_rq_load_avg(cfs_rq); 2373 tg_weight += cfs_rq->load.weight;
2367 2374
2368 return tg_weight; 2375 return tg_weight;
2369} 2376}
@@ -2373,7 +2380,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2373 long tg_weight, load, shares; 2380 long tg_weight, load, shares;
2374 2381
2375 tg_weight = calc_tg_weight(tg, cfs_rq); 2382 tg_weight = calc_tg_weight(tg, cfs_rq);
2376 load = cfs_rq_load_avg(cfs_rq); 2383 load = cfs_rq->load.weight;
2377 2384
2378 shares = (tg->shares * load); 2385 shares = (tg->shares * load);
2379 if (tg_weight) 2386 if (tg_weight)
@@ -2515,6 +2522,12 @@ static u32 __compute_runnable_contrib(u64 n)
2515 return contrib + runnable_avg_yN_sum[n]; 2522 return contrib + runnable_avg_yN_sum[n];
2516} 2523}
2517 2524
2525#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
2526#error "load tracking assumes 2^10 as unit"
2527#endif
2528
2529#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
2530
2518/* 2531/*
2519 * We can represent the historical contribution to runnable average as the 2532 * We can represent the historical contribution to runnable average as the
2520 * coefficients of a geometric series. To do this we sub-divide our runnable 2533 * coefficients of a geometric series. To do this we sub-divide our runnable
@@ -2547,10 +2560,10 @@ static __always_inline int
2547__update_load_avg(u64 now, int cpu, struct sched_avg *sa, 2560__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2548 unsigned long weight, int running, struct cfs_rq *cfs_rq) 2561 unsigned long weight, int running, struct cfs_rq *cfs_rq)
2549{ 2562{
2550 u64 delta, periods; 2563 u64 delta, scaled_delta, periods;
2551 u32 contrib; 2564 u32 contrib;
2552 int delta_w, decayed = 0; 2565 unsigned int delta_w, scaled_delta_w, decayed = 0;
2553 unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); 2566 unsigned long scale_freq, scale_cpu;
2554 2567
2555 delta = now - sa->last_update_time; 2568 delta = now - sa->last_update_time;
2556 /* 2569 /*
@@ -2571,6 +2584,9 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2571 return 0; 2584 return 0;
2572 sa->last_update_time = now; 2585 sa->last_update_time = now;
2573 2586
2587 scale_freq = arch_scale_freq_capacity(NULL, cpu);
2588 scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
2589
2574 /* delta_w is the amount already accumulated against our next period */ 2590 /* delta_w is the amount already accumulated against our next period */
2575 delta_w = sa->period_contrib; 2591 delta_w = sa->period_contrib;
2576 if (delta + delta_w >= 1024) { 2592 if (delta + delta_w >= 1024) {
@@ -2585,13 +2601,16 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2585 * period and accrue it. 2601 * period and accrue it.
2586 */ 2602 */
2587 delta_w = 1024 - delta_w; 2603 delta_w = 1024 - delta_w;
2604 scaled_delta_w = cap_scale(delta_w, scale_freq);
2588 if (weight) { 2605 if (weight) {
2589 sa->load_sum += weight * delta_w; 2606 sa->load_sum += weight * scaled_delta_w;
2590 if (cfs_rq) 2607 if (cfs_rq) {
2591 cfs_rq->runnable_load_sum += weight * delta_w; 2608 cfs_rq->runnable_load_sum +=
2609 weight * scaled_delta_w;
2610 }
2592 } 2611 }
2593 if (running) 2612 if (running)
2594 sa->util_sum += delta_w * scale_freq >> SCHED_CAPACITY_SHIFT; 2613 sa->util_sum += scaled_delta_w * scale_cpu;
2595 2614
2596 delta -= delta_w; 2615 delta -= delta_w;
2597 2616
@@ -2608,23 +2627,25 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2608 2627
2609 /* Efficiently calculate \sum (1..n_period) 1024*y^i */ 2628 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
2610 contrib = __compute_runnable_contrib(periods); 2629 contrib = __compute_runnable_contrib(periods);
2630 contrib = cap_scale(contrib, scale_freq);
2611 if (weight) { 2631 if (weight) {
2612 sa->load_sum += weight * contrib; 2632 sa->load_sum += weight * contrib;
2613 if (cfs_rq) 2633 if (cfs_rq)
2614 cfs_rq->runnable_load_sum += weight * contrib; 2634 cfs_rq->runnable_load_sum += weight * contrib;
2615 } 2635 }
2616 if (running) 2636 if (running)
2617 sa->util_sum += contrib * scale_freq >> SCHED_CAPACITY_SHIFT; 2637 sa->util_sum += contrib * scale_cpu;
2618 } 2638 }
2619 2639
2620 /* Remainder of delta accrued against u_0` */ 2640 /* Remainder of delta accrued against u_0` */
2641 scaled_delta = cap_scale(delta, scale_freq);
2621 if (weight) { 2642 if (weight) {
2622 sa->load_sum += weight * delta; 2643 sa->load_sum += weight * scaled_delta;
2623 if (cfs_rq) 2644 if (cfs_rq)
2624 cfs_rq->runnable_load_sum += weight * delta; 2645 cfs_rq->runnable_load_sum += weight * scaled_delta;
2625 } 2646 }
2626 if (running) 2647 if (running)
2627 sa->util_sum += delta * scale_freq >> SCHED_CAPACITY_SHIFT; 2648 sa->util_sum += scaled_delta * scale_cpu;
2628 2649
2629 sa->period_contrib += delta; 2650 sa->period_contrib += delta;
2630 2651
@@ -2634,7 +2655,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2634 cfs_rq->runnable_load_avg = 2655 cfs_rq->runnable_load_avg =
2635 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); 2656 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
2636 } 2657 }
2637 sa->util_avg = (sa->util_sum << SCHED_LOAD_SHIFT) / LOAD_AVG_MAX; 2658 sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
2638 } 2659 }
2639 2660
2640 return decayed; 2661 return decayed;
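The rewritten __update_load_avg() above scales every accrued delta by the CPU's current frequency capacity, and the utilization sum additionally by its microarchitectural capacity, via cap_scale(), i.e. (v * s) >> SCHED_CAPACITY_SHIFT with 1024 meaning full capacity. A small standalone illustration with made-up capacity values:

#include <stdio.h>
#include <stdint.h>

#define SCHED_CAPACITY_SHIFT    10              /* 1024 == full capacity */
#define cap_scale(v, s)         ((v) * (s) >> SCHED_CAPACITY_SHIFT)

int main(void)
{
        uint64_t delta = 1000;                  /* time accrued this step    */
        unsigned long scale_freq = 512;         /* CPU running at half clock */
        unsigned long scale_cpu  = 768;         /* little core, 75% capacity */

        printf("freq-scaled delta: %llu\n",
               (unsigned long long)cap_scale(delta, scale_freq));       /* 500 */
        printf("cpu-scaled delta : %llu\n",
               (unsigned long long)cap_scale(delta, scale_cpu));        /* 750 */
        return 0;
}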
@@ -2664,20 +2685,20 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
2664/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ 2685/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
2665static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) 2686static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
2666{ 2687{
2667 int decayed;
2668 struct sched_avg *sa = &cfs_rq->avg; 2688 struct sched_avg *sa = &cfs_rq->avg;
2689 int decayed, removed = 0;
2669 2690
2670 if (atomic_long_read(&cfs_rq->removed_load_avg)) { 2691 if (atomic_long_read(&cfs_rq->removed_load_avg)) {
2671 long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); 2692 long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
2672 sa->load_avg = max_t(long, sa->load_avg - r, 0); 2693 sa->load_avg = max_t(long, sa->load_avg - r, 0);
2673 sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); 2694 sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
2695 removed = 1;
2674 } 2696 }
2675 2697
2676 if (atomic_long_read(&cfs_rq->removed_util_avg)) { 2698 if (atomic_long_read(&cfs_rq->removed_util_avg)) {
2677 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); 2699 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
2678 sa->util_avg = max_t(long, sa->util_avg - r, 0); 2700 sa->util_avg = max_t(long, sa->util_avg - r, 0);
2679 sa->util_sum = max_t(s32, sa->util_sum - 2701 sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
2680 ((r * LOAD_AVG_MAX) >> SCHED_LOAD_SHIFT), 0);
2681 } 2702 }
2682 2703
2683 decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, 2704 decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -2688,40 +2709,77 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
2688 cfs_rq->load_last_update_time_copy = sa->last_update_time; 2709 cfs_rq->load_last_update_time_copy = sa->last_update_time;
2689#endif 2710#endif
2690 2711
2691 return decayed; 2712 return decayed || removed;
2692} 2713}
2693 2714
2694/* Update task and its cfs_rq load average */ 2715/* Update task and its cfs_rq load average */
2695static inline void update_load_avg(struct sched_entity *se, int update_tg) 2716static inline void update_load_avg(struct sched_entity *se, int update_tg)
2696{ 2717{
2697 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2718 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2698 int cpu = cpu_of(rq_of(cfs_rq));
2699 u64 now = cfs_rq_clock_task(cfs_rq); 2719 u64 now = cfs_rq_clock_task(cfs_rq);
2720 int cpu = cpu_of(rq_of(cfs_rq));
2700 2721
2701 /* 2722 /*
2702 * Track task load average for carrying it to new CPU after migrated, and 2723 * Track task load average for carrying it to new CPU after migrated, and
2703 * track group sched_entity load average for task_h_load calc in migration 2724 * track group sched_entity load average for task_h_load calc in migration
2704 */ 2725 */
2705 __update_load_avg(now, cpu, &se->avg, 2726 __update_load_avg(now, cpu, &se->avg,
2706 se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); 2727 se->on_rq * scale_load_down(se->load.weight),
2728 cfs_rq->curr == se, NULL);
2707 2729
2708 if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) 2730 if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
2709 update_tg_load_avg(cfs_rq, 0); 2731 update_tg_load_avg(cfs_rq, 0);
2710} 2732}
2711 2733
2734static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2735{
2736 if (!sched_feat(ATTACH_AGE_LOAD))
2737 goto skip_aging;
2738
2739 /*
2740 * If we got migrated (either between CPUs or between cgroups) we'll
2741 * have aged the average right before clearing @last_update_time.
2742 */
2743 if (se->avg.last_update_time) {
2744 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
2745 &se->avg, 0, 0, NULL);
2746
2747 /*
2748 * XXX: we could have just aged the entire load away if we've been
2749 * absent from the fair class for too long.
2750 */
2751 }
2752
2753skip_aging:
2754 se->avg.last_update_time = cfs_rq->avg.last_update_time;
2755 cfs_rq->avg.load_avg += se->avg.load_avg;
2756 cfs_rq->avg.load_sum += se->avg.load_sum;
2757 cfs_rq->avg.util_avg += se->avg.util_avg;
2758 cfs_rq->avg.util_sum += se->avg.util_sum;
2759}
2760
2761static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2762{
2763 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
2764 &se->avg, se->on_rq * scale_load_down(se->load.weight),
2765 cfs_rq->curr == se, NULL);
2766
2767 cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
2768 cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
2769 cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
2770 cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
2771}
2772
2712/* Add the load generated by se into cfs_rq's load average */ 2773/* Add the load generated by se into cfs_rq's load average */
2713static inline void 2774static inline void
2714enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 2775enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2715{ 2776{
2716 struct sched_avg *sa = &se->avg; 2777 struct sched_avg *sa = &se->avg;
2717 u64 now = cfs_rq_clock_task(cfs_rq); 2778 u64 now = cfs_rq_clock_task(cfs_rq);
2718 int migrated = 0, decayed; 2779 int migrated, decayed;
2719 2780
2720 if (sa->last_update_time == 0) { 2781 migrated = !sa->last_update_time;
2721 sa->last_update_time = now; 2782 if (!migrated) {
2722 migrated = 1;
2723 }
2724 else {
2725 __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, 2783 __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
2726 se->on_rq * scale_load_down(se->load.weight), 2784 se->on_rq * scale_load_down(se->load.weight),
2727 cfs_rq->curr == se, NULL); 2785 cfs_rq->curr == se, NULL);
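
attach_entity_load_avg() and detach_entity_load_avg() above add and remove an entity's contribution to the cfs_rq sums, clamping at zero with max_t() because the per-entity and per-runqueue averages decay independently and a plain subtraction could underflow. A minimal userspace sketch of that saturating update pattern; the struct and values are illustrative only.

    #include <stdio.h>

    struct avg { long load_avg; long util_avg; };

    static long sub_clamped(long a, long b)
    {
        /* Same idea as max_t(long, a - b, 0): never go negative even if
         * the two averages have decayed out of sync. */
        return (a - b) > 0 ? (a - b) : 0;
    }

    static void attach(struct avg *rq, const struct avg *se)
    {
        rq->load_avg += se->load_avg;
        rq->util_avg += se->util_avg;
    }

    static void detach(struct avg *rq, const struct avg *se)
    {
        rq->load_avg = sub_clamped(rq->load_avg, se->load_avg);
        rq->util_avg = sub_clamped(rq->util_avg, se->util_avg);
    }

    int main(void)
    {
        struct avg rq = { .load_avg = 300, .util_avg = 150 };
        struct avg se = { .load_avg = 400, .util_avg = 100 };

        /* The entity decayed more slowly than the runqueue, so an unclamped
         * detach would go negative. */
        detach(&rq, &se);
        printf("after detach: load %ld util %ld\n", rq.load_avg, rq.util_avg);

        attach(&rq, &se);
        printf("after attach: load %ld util %ld\n", rq.load_avg, rq.util_avg);
        return 0;
    }
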
@@ -2732,12 +2790,8 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2732 cfs_rq->runnable_load_avg += sa->load_avg; 2790 cfs_rq->runnable_load_avg += sa->load_avg;
2733 cfs_rq->runnable_load_sum += sa->load_sum; 2791 cfs_rq->runnable_load_sum += sa->load_sum;
2734 2792
2735 if (migrated) { 2793 if (migrated)
2736 cfs_rq->avg.load_avg += sa->load_avg; 2794 attach_entity_load_avg(cfs_rq, se);
2737 cfs_rq->avg.load_sum += sa->load_sum;
2738 cfs_rq->avg.util_avg += sa->util_avg;
2739 cfs_rq->avg.util_sum += sa->util_sum;
2740 }
2741 2795
2742 if (decayed || migrated) 2796 if (decayed || migrated)
2743 update_tg_load_avg(cfs_rq, 0); 2797 update_tg_load_avg(cfs_rq, 0);
@@ -2752,7 +2806,7 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2752 cfs_rq->runnable_load_avg = 2806 cfs_rq->runnable_load_avg =
2753 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); 2807 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
2754 cfs_rq->runnable_load_sum = 2808 cfs_rq->runnable_load_sum =
2755 max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); 2809 max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
2756} 2810}
2757 2811
2758/* 2812/*
@@ -2820,6 +2874,11 @@ static inline void
2820dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} 2874dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2821static inline void remove_entity_load_avg(struct sched_entity *se) {} 2875static inline void remove_entity_load_avg(struct sched_entity *se) {}
2822 2876
2877static inline void
2878attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2879static inline void
2880detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2881
2823static inline int idle_balance(struct rq *rq) 2882static inline int idle_balance(struct rq *rq)
2824{ 2883{
2825 return 0; 2884 return 0;
@@ -4816,32 +4875,39 @@ next:
4816done: 4875done:
4817 return target; 4876 return target;
4818} 4877}
4878
4819/* 4879/*
4820 * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS 4880 * cpu_util returns the amount of capacity of a CPU that is used by CFS
4821 * tasks. The unit of the return value must be the one of capacity so we can 4881 * tasks. The unit of the return value must be the one of capacity so we can
4822 * compare the usage with the capacity of the CPU that is available for CFS 4882 * compare the utilization with the capacity of the CPU that is available for
4823 * task (ie cpu_capacity). 4883 * CFS task (ie cpu_capacity).
4824 * cfs.avg.util_avg is the sum of running time of runnable tasks on a 4884 *
4825 * CPU. It represents the amount of utilization of a CPU in the range 4885 * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
4826 * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full 4886 * recent utilization of currently non-runnable tasks on a CPU. It represents
4827 * capacity of the CPU because it's about the running time on this CPU. 4887 * the amount of utilization of a CPU in the range [0..capacity_orig] where
4828 * Nevertheless, cfs.avg.util_avg can be higher than SCHED_LOAD_SCALE 4888 * capacity_orig is the cpu_capacity available at the highest frequency
4829 * because of unfortunate rounding in util_avg or just 4889 * (arch_scale_freq_capacity()).
4830 * after migrating tasks until the average stabilizes with the new running 4890 * The utilization of a CPU converges towards a sum equal to or less than the
4831 * time. So we need to check that the usage stays into the range 4891 * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
4832 * [0..cpu_capacity_orig] and cap if necessary. 4892 * the running time on this CPU scaled by capacity_curr.
4833 * Without capping the usage, a group could be seen as overloaded (CPU0 usage 4893 *
4834 * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity 4894 * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
4895 * higher than capacity_orig because of unfortunate rounding in
4896 * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
4897 * the average stabilizes with the new running time. We need to check that the
4898 * utilization stays within the range of [0..capacity_orig] and cap it if
4899 * necessary. Without utilization capping, a group could be seen as overloaded
4900 * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
4901 * available capacity. We allow utilization to overshoot capacity_curr (but not
 4902 * capacity_orig) as it is useful for predicting the capacity required after task
4903 * migrations (scheduler-driven DVFS).
4835 */ 4904 */
4836static int get_cpu_usage(int cpu) 4905static int cpu_util(int cpu)
4837{ 4906{
4838 unsigned long usage = cpu_rq(cpu)->cfs.avg.util_avg; 4907 unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
4839 unsigned long capacity = capacity_orig_of(cpu); 4908 unsigned long capacity = capacity_orig_of(cpu);
4840 4909
4841 if (usage >= SCHED_LOAD_SCALE) 4910 return (util >= capacity) ? capacity : util;
4842 return capacity;
4843
4844 return (usage * capacity) >> SCHED_LOAD_SHIFT;
4845} 4911}
4846 4912
4847/* 4913/*
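
With utilization and capacity now expressed in the same unit, cpu_util() above reduces the old rescaling to a simple clamp at capacity_orig. A small standalone sketch of the capping and of the 121% + 80% case mentioned in the comment; the capacity of 1024 and the sample utilizations are illustrative.

    #include <stdio.h>

    #define CAPACITY_ORIG 1024UL   /* assumed capacity at the highest frequency */

    static unsigned long cpu_util_sketch(unsigned long util_avg)
    {
        return util_avg >= CAPACITY_ORIG ? CAPACITY_ORIG : util_avg;
    }

    int main(void)
    {
        unsigned long cpu0 = 1239;  /* ~121% of capacity, transient overshoot */
        unsigned long cpu1 = 819;   /* ~80% of capacity */

        /* Without capping, the pair would look overloaded (121% + 80%)
         * even though CPU1 still has ~20% of its capacity free. */
        printf("uncapped group util: %lu\n", cpu0 + cpu1);
        printf("capped group util  : %lu\n",
               cpu_util_sketch(cpu0) + cpu_util_sketch(cpu1));
        return 0;
    }
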
@@ -4944,7 +5010,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4944 * previous cpu. However, the caller only guarantees p->pi_lock is held; no 5010 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
4945 * other assumptions, including the state of rq->lock, should be made. 5011 * other assumptions, including the state of rq->lock, should be made.
4946 */ 5012 */
4947static void migrate_task_rq_fair(struct task_struct *p, int next_cpu) 5013static void migrate_task_rq_fair(struct task_struct *p)
4948{ 5014{
4949 /* 5015 /*
4950 * We are supposed to update the task to "current" time, then its up to date 5016 * We are supposed to update the task to "current" time, then its up to date
@@ -5524,10 +5590,10 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5524 unsigned long src_faults, dst_faults; 5590 unsigned long src_faults, dst_faults;
5525 int src_nid, dst_nid; 5591 int src_nid, dst_nid;
5526 5592
5527 if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) 5593 if (!static_branch_likely(&sched_numa_balancing))
5528 return -1; 5594 return -1;
5529 5595
5530 if (!sched_feat(NUMA)) 5596 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
5531 return -1; 5597 return -1;
5532 5598
5533 src_nid = cpu_to_node(env->src_cpu); 5599 src_nid = cpu_to_node(env->src_cpu);
@@ -5933,7 +5999,7 @@ struct sg_lb_stats {
5933 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 5999 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
5934 unsigned long load_per_task; 6000 unsigned long load_per_task;
5935 unsigned long group_capacity; 6001 unsigned long group_capacity;
5936 unsigned long group_usage; /* Total usage of the group */ 6002 unsigned long group_util; /* Total utilization of the group */
5937 unsigned int sum_nr_running; /* Nr tasks running in the group */ 6003 unsigned int sum_nr_running; /* Nr tasks running in the group */
5938 unsigned int idle_cpus; 6004 unsigned int idle_cpus;
5939 unsigned int group_weight; 6005 unsigned int group_weight;
@@ -6009,19 +6075,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
6009 return load_idx; 6075 return load_idx;
6010} 6076}
6011 6077
6012static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
6013{
6014 if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
6015 return sd->smt_gain / sd->span_weight;
6016
6017 return SCHED_CAPACITY_SCALE;
6018}
6019
6020unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
6021{
6022 return default_scale_cpu_capacity(sd, cpu);
6023}
6024
6025static unsigned long scale_rt_capacity(int cpu) 6078static unsigned long scale_rt_capacity(int cpu)
6026{ 6079{
6027 struct rq *rq = cpu_rq(cpu); 6080 struct rq *rq = cpu_rq(cpu);
@@ -6051,16 +6104,9 @@ static unsigned long scale_rt_capacity(int cpu)
6051 6104
6052static void update_cpu_capacity(struct sched_domain *sd, int cpu) 6105static void update_cpu_capacity(struct sched_domain *sd, int cpu)
6053{ 6106{
6054 unsigned long capacity = SCHED_CAPACITY_SCALE; 6107 unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
6055 struct sched_group *sdg = sd->groups; 6108 struct sched_group *sdg = sd->groups;
6056 6109
6057 if (sched_feat(ARCH_CAPACITY))
6058 capacity *= arch_scale_cpu_capacity(sd, cpu);
6059 else
6060 capacity *= default_scale_cpu_capacity(sd, cpu);
6061
6062 capacity >>= SCHED_CAPACITY_SHIFT;
6063
6064 cpu_rq(cpu)->cpu_capacity_orig = capacity; 6110 cpu_rq(cpu)->cpu_capacity_orig = capacity;
6065 6111
6066 capacity *= scale_rt_capacity(cpu); 6112 capacity *= scale_rt_capacity(cpu);
@@ -6186,8 +6232,8 @@ static inline int sg_imbalanced(struct sched_group *group)
6186 * group_has_capacity returns true if the group has spare capacity that could 6232 * group_has_capacity returns true if the group has spare capacity that could
6187 * be used by some tasks. 6233 * be used by some tasks.
6188 * We consider that a group has spare capacity if the * number of task is 6234 * We consider that a group has spare capacity if the * number of task is
6189 * smaller than the number of CPUs or if the usage is lower than the available 6235 * smaller than the number of CPUs or if the utilization is lower than the
6190 * capacity for CFS tasks. 6236 * available capacity for CFS tasks.
6191 * For the latter, we use a threshold to stabilize the state, to take into 6237 * For the latter, we use a threshold to stabilize the state, to take into
6192 * account the variance of the tasks' load and to return true if the available 6238 * account the variance of the tasks' load and to return true if the available
6193 * capacity in meaningful for the load balancer. 6239 * capacity in meaningful for the load balancer.
@@ -6201,7 +6247,7 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
6201 return true; 6247 return true;
6202 6248
6203 if ((sgs->group_capacity * 100) > 6249 if ((sgs->group_capacity * 100) >
6204 (sgs->group_usage * env->sd->imbalance_pct)) 6250 (sgs->group_util * env->sd->imbalance_pct))
6205 return true; 6251 return true;
6206 6252
6207 return false; 6253 return false;
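
group_has_capacity() and group_is_overloaded() above compare group_capacity against group_util through the domain's imbalance_pct, which acts as a stabilizing margin. A worked userspace sketch of that comparison; imbalance_pct of 125 and a capacity of 1024 are illustrative, the kernel picks imbalance_pct per sched_domain.

    #include <stdio.h>

    static int has_spare_capacity(unsigned long capacity, unsigned long util,
                                  unsigned int imbalance_pct)
    {
        return capacity * 100 > util * imbalance_pct;
    }

    static int is_overloaded(unsigned long capacity, unsigned long util,
                             unsigned int imbalance_pct)
    {
        return capacity * 100 < util * imbalance_pct;
    }

    int main(void)
    {
        unsigned long capacity = 1024;
        unsigned int pct = 125;  /* a 25% margin */

        /* With a 25% margin the group only counts as having spare capacity
         * while utilization stays below 1024 * 100 / 125 = 819. */
        printf("util 700: spare=%d overloaded=%d\n",
               has_spare_capacity(capacity, 700, pct),
               is_overloaded(capacity, 700, pct));
        printf("util 900: spare=%d overloaded=%d\n",
               has_spare_capacity(capacity, 900, pct),
               is_overloaded(capacity, 900, pct));
        return 0;
    }
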
@@ -6222,15 +6268,15 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
6222 return false; 6268 return false;
6223 6269
6224 if ((sgs->group_capacity * 100) < 6270 if ((sgs->group_capacity * 100) <
6225 (sgs->group_usage * env->sd->imbalance_pct)) 6271 (sgs->group_util * env->sd->imbalance_pct))
6226 return true; 6272 return true;
6227 6273
6228 return false; 6274 return false;
6229} 6275}
6230 6276
6231static enum group_type group_classify(struct lb_env *env, 6277static inline enum
6232 struct sched_group *group, 6278group_type group_classify(struct sched_group *group,
6233 struct sg_lb_stats *sgs) 6279 struct sg_lb_stats *sgs)
6234{ 6280{
6235 if (sgs->group_no_capacity) 6281 if (sgs->group_no_capacity)
6236 return group_overloaded; 6282 return group_overloaded;
@@ -6270,7 +6316,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
6270 load = source_load(i, load_idx); 6316 load = source_load(i, load_idx);
6271 6317
6272 sgs->group_load += load; 6318 sgs->group_load += load;
6273 sgs->group_usage += get_cpu_usage(i); 6319 sgs->group_util += cpu_util(i);
6274 sgs->sum_nr_running += rq->cfs.h_nr_running; 6320 sgs->sum_nr_running += rq->cfs.h_nr_running;
6275 6321
6276 if (rq->nr_running > 1) 6322 if (rq->nr_running > 1)
@@ -6295,7 +6341,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
6295 sgs->group_weight = group->group_weight; 6341 sgs->group_weight = group->group_weight;
6296 6342
6297 sgs->group_no_capacity = group_is_overloaded(env, sgs); 6343 sgs->group_no_capacity = group_is_overloaded(env, sgs);
6298 sgs->group_type = group_classify(env, group, sgs); 6344 sgs->group_type = group_classify(group, sgs);
6299} 6345}
6300 6346
6301/** 6347/**
@@ -6429,7 +6475,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
6429 group_has_capacity(env, &sds->local_stat) && 6475 group_has_capacity(env, &sds->local_stat) &&
6430 (sgs->sum_nr_running > 1)) { 6476 (sgs->sum_nr_running > 1)) {
6431 sgs->group_no_capacity = 1; 6477 sgs->group_no_capacity = 1;
6432 sgs->group_type = group_overloaded; 6478 sgs->group_type = group_classify(sg, sgs);
6433 } 6479 }
6434 6480
6435 if (update_sd_pick_busiest(env, sds, sg, sgs)) { 6481 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
@@ -7609,8 +7655,22 @@ out:
7609 * When the cpu is attached to null domain for ex, it will not be 7655 * When the cpu is attached to null domain for ex, it will not be
7610 * updated. 7656 * updated.
7611 */ 7657 */
7612 if (likely(update_next_balance)) 7658 if (likely(update_next_balance)) {
7613 rq->next_balance = next_balance; 7659 rq->next_balance = next_balance;
7660
7661#ifdef CONFIG_NO_HZ_COMMON
7662 /*
 7663 * If this CPU has been elected to perform the nohz idle
 7664 * balance, the other idle CPUs have already rebalanced with
 7665 * nohz_idle_balance() and nohz.next_balance has been
 7666 * updated accordingly. This CPU is now running the idle load
 7667 * balance for itself and needs to update
 7668 * nohz.next_balance accordingly.
7669 */
7670 if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
7671 nohz.next_balance = rq->next_balance;
7672#endif
7673 }
7614} 7674}
7615 7675
7616#ifdef CONFIG_NO_HZ_COMMON 7676#ifdef CONFIG_NO_HZ_COMMON
@@ -7623,6 +7683,9 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
7623 int this_cpu = this_rq->cpu; 7683 int this_cpu = this_rq->cpu;
7624 struct rq *rq; 7684 struct rq *rq;
7625 int balance_cpu; 7685 int balance_cpu;
7686 /* Earliest time when we have to do rebalance again */
7687 unsigned long next_balance = jiffies + 60*HZ;
7688 int update_next_balance = 0;
7626 7689
7627 if (idle != CPU_IDLE || 7690 if (idle != CPU_IDLE ||
7628 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) 7691 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
@@ -7654,10 +7717,19 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
7654 rebalance_domains(rq, CPU_IDLE); 7717 rebalance_domains(rq, CPU_IDLE);
7655 } 7718 }
7656 7719
7657 if (time_after(this_rq->next_balance, rq->next_balance)) 7720 if (time_after(next_balance, rq->next_balance)) {
7658 this_rq->next_balance = rq->next_balance; 7721 next_balance = rq->next_balance;
7722 update_next_balance = 1;
7723 }
7659 } 7724 }
7660 nohz.next_balance = this_rq->next_balance; 7725
7726 /*
7727 * next_balance will be updated only when there is a need.
 7728 * When the CPU is attached to a null domain, for example, it will not be
7729 * updated.
7730 */
7731 if (likely(update_next_balance))
7732 nohz.next_balance = next_balance;
7661end: 7733end:
7662 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); 7734 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
7663} 7735}
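
Both next_balance updates above rely on time_after(), which stays correct when jiffies wraps because it compares via a signed difference rather than a plain '>'. A userspace sketch of that property; the helper mirrors the kernel's formulation and the sample values are arbitrary.

    #include <stdio.h>

    /* Same formulation as the kernel's time_after(a, b): true when a is
     * later than b, even across a counter wrap. */
    static int time_after(unsigned long a, unsigned long b)
    {
        return (long)(b - a) < 0;
    }

    int main(void)
    {
        unsigned long before_wrap = (unsigned long)-10;  /* just before wrapping */
        unsigned long after_wrap  = 5;                   /* shortly after wrapping */

        /* A plain comparison gets the wrapped case wrong ... */
        printf("plain '>'  : %d\n", after_wrap > before_wrap);
        /* ... while the signed-difference form still says "later". */
        printf("time_after : %d\n", time_after(after_wrap, before_wrap));
        return 0;
    }
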
@@ -7810,7 +7882,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
7810 entity_tick(cfs_rq, se, queued); 7882 entity_tick(cfs_rq, se, queued);
7811 } 7883 }
7812 7884
7813 if (numabalancing_enabled) 7885 if (static_branch_unlikely(&sched_numa_balancing))
7814 task_tick_numa(rq, curr); 7886 task_tick_numa(rq, curr);
7815} 7887}
7816 7888
@@ -7886,21 +7958,39 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
7886 check_preempt_curr(rq, p, 0); 7958 check_preempt_curr(rq, p, 0);
7887} 7959}
7888 7960
7889static void switched_from_fair(struct rq *rq, struct task_struct *p) 7961static inline bool vruntime_normalized(struct task_struct *p)
7890{ 7962{
7891 struct sched_entity *se = &p->se; 7963 struct sched_entity *se = &p->se;
7892 struct cfs_rq *cfs_rq = cfs_rq_of(se);
7893 7964
7894 /* 7965 /*
7895 * Ensure the task's vruntime is normalized, so that when it's 7966 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
7896 * switched back to the fair class the enqueue_entity(.flags=0) will 7967 * the dequeue_entity(.flags=0) will already have normalized the
7897 * do the right thing. 7968 * vruntime.
7969 */
7970 if (p->on_rq)
7971 return true;
7972
7973 /*
7974 * When !on_rq, vruntime of the task has usually NOT been normalized.
7975 * But there are some cases where it has already been normalized:
7898 * 7976 *
7899 * If it's queued, then the dequeue_entity(.flags=0) will already 7977 * - A forked child which is waiting for being woken up by
7900 * have normalized the vruntime, if it's !queued, then only when 7978 * wake_up_new_task().
7901 * the task is sleeping will it still have non-normalized vruntime. 7979 * - A task which has been woken up by try_to_wake_up() and
7980 * waiting for actually being woken up by sched_ttwu_pending().
7902 */ 7981 */
7903 if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) { 7982 if (!se->sum_exec_runtime || p->state == TASK_WAKING)
7983 return true;
7984
7985 return false;
7986}
7987
7988static void detach_task_cfs_rq(struct task_struct *p)
7989{
7990 struct sched_entity *se = &p->se;
7991 struct cfs_rq *cfs_rq = cfs_rq_of(se);
7992
7993 if (!vruntime_normalized(p)) {
7904 /* 7994 /*
7905 * Fix up our vruntime so that the current sleep doesn't 7995 * Fix up our vruntime so that the current sleep doesn't
7906 * cause 'unlimited' sleep bonus. 7996 * cause 'unlimited' sleep bonus.
@@ -7909,28 +7999,14 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
7909 se->vruntime -= cfs_rq->min_vruntime; 7999 se->vruntime -= cfs_rq->min_vruntime;
7910 } 8000 }
7911 8001
7912#ifdef CONFIG_SMP
7913 /* Catch up with the cfs_rq and remove our load when we leave */ 8002 /* Catch up with the cfs_rq and remove our load when we leave */
7914 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq), &se->avg, 8003 detach_entity_load_avg(cfs_rq, se);
7915 se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL);
7916
7917 cfs_rq->avg.load_avg =
7918 max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
7919 cfs_rq->avg.load_sum =
7920 max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
7921 cfs_rq->avg.util_avg =
7922 max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
7923 cfs_rq->avg.util_sum =
7924 max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
7925#endif
7926} 8004}
7927 8005
7928/* 8006static void attach_task_cfs_rq(struct task_struct *p)
7929 * We switched to the sched_fair class.
7930 */
7931static void switched_to_fair(struct rq *rq, struct task_struct *p)
7932{ 8007{
7933 struct sched_entity *se = &p->se; 8008 struct sched_entity *se = &p->se;
8009 struct cfs_rq *cfs_rq = cfs_rq_of(se);
7934 8010
7935#ifdef CONFIG_FAIR_GROUP_SCHED 8011#ifdef CONFIG_FAIR_GROUP_SCHED
7936 /* 8012 /*
@@ -7940,31 +8016,33 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
7940 se->depth = se->parent ? se->parent->depth + 1 : 0; 8016 se->depth = se->parent ? se->parent->depth + 1 : 0;
7941#endif 8017#endif
7942 8018
7943 if (!task_on_rq_queued(p)) { 8019 /* Synchronize task with its cfs_rq */
8020 attach_entity_load_avg(cfs_rq, se);
8021
8022 if (!vruntime_normalized(p))
8023 se->vruntime += cfs_rq->min_vruntime;
8024}
8025
8026static void switched_from_fair(struct rq *rq, struct task_struct *p)
8027{
8028 detach_task_cfs_rq(p);
8029}
8030
8031static void switched_to_fair(struct rq *rq, struct task_struct *p)
8032{
8033 attach_task_cfs_rq(p);
7944 8034
8035 if (task_on_rq_queued(p)) {
7945 /* 8036 /*
7946 * Ensure the task has a non-normalized vruntime when it is switched 8037 * We were most likely switched from sched_rt, so
7947 * back to the fair class with !queued, so that enqueue_entity() at 8038 * kick off the schedule if running, otherwise just see
7948 * wake-up time will do the right thing. 8039 * if we can still preempt the current task.
7949 *
7950 * If it's queued, then the enqueue_entity(.flags=0) makes the task
7951 * has non-normalized vruntime, if it's !queued, then it still has
7952 * normalized vruntime.
7953 */ 8040 */
7954 if (p->state != TASK_RUNNING) 8041 if (rq->curr == p)
7955 se->vruntime += cfs_rq_of(se)->min_vruntime; 8042 resched_curr(rq);
7956 return; 8043 else
8044 check_preempt_curr(rq, p, 0);
7957 } 8045 }
7958
7959 /*
7960 * We were most likely switched from sched_rt, so
7961 * kick off the schedule if running, otherwise just see
7962 * if we can still preempt the current task.
7963 */
7964 if (rq->curr == p)
7965 resched_curr(rq);
7966 else
7967 check_preempt_curr(rq, p, 0);
7968} 8046}
7969 8047
7970/* Account for a task changing its policy or group. 8048/* Account for a task changing its policy or group.
@@ -7999,56 +8077,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
7999} 8077}
8000 8078
8001#ifdef CONFIG_FAIR_GROUP_SCHED 8079#ifdef CONFIG_FAIR_GROUP_SCHED
8002static void task_move_group_fair(struct task_struct *p, int queued) 8080static void task_move_group_fair(struct task_struct *p)
8003{ 8081{
8004 struct sched_entity *se = &p->se; 8082 detach_task_cfs_rq(p);
8005 struct cfs_rq *cfs_rq;
8006
8007 /*
8008 * If the task was not on the rq at the time of this cgroup movement
8009 * it must have been asleep, sleeping tasks keep their ->vruntime
8010 * absolute on their old rq until wakeup (needed for the fair sleeper
8011 * bonus in place_entity()).
8012 *
8013 * If it was on the rq, we've just 'preempted' it, which does convert
8014 * ->vruntime to a relative base.
8015 *
8016 * Make sure both cases convert their relative position when migrating
8017 * to another cgroup's rq. This does somewhat interfere with the
8018 * fair sleeper stuff for the first placement, but who cares.
8019 */
8020 /*
8021 * When !queued, vruntime of the task has usually NOT been normalized.
8022 * But there are some cases where it has already been normalized:
8023 *
8024 * - Moving a forked child which is waiting for being woken up by
8025 * wake_up_new_task().
8026 * - Moving a task which has been woken up by try_to_wake_up() and
8027 * waiting for actually being woken up by sched_ttwu_pending().
8028 *
8029 * To prevent boost or penalty in the new cfs_rq caused by delta
8030 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
8031 */
8032 if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
8033 queued = 1;
8034
8035 if (!queued)
8036 se->vruntime -= cfs_rq_of(se)->min_vruntime;
8037 set_task_rq(p, task_cpu(p)); 8083 set_task_rq(p, task_cpu(p));
8038 se->depth = se->parent ? se->parent->depth + 1 : 0;
8039 if (!queued) {
8040 cfs_rq = cfs_rq_of(se);
8041 se->vruntime += cfs_rq->min_vruntime;
8042 8084
8043#ifdef CONFIG_SMP 8085#ifdef CONFIG_SMP
8044 /* Virtually synchronize task with its new cfs_rq */ 8086 /* Tell se's cfs_rq has been changed -- migrated */
8045 p->se.avg.last_update_time = cfs_rq->avg.last_update_time; 8087 p->se.avg.last_update_time = 0;
8046 cfs_rq->avg.load_avg += p->se.avg.load_avg;
8047 cfs_rq->avg.load_sum += p->se.avg.load_sum;
8048 cfs_rq->avg.util_avg += p->se.avg.util_avg;
8049 cfs_rq->avg.util_sum += p->se.avg.util_sum;
8050#endif 8088#endif
8051 } 8089 attach_task_cfs_rq(p);
8052} 8090}
8053 8091
8054void free_fair_sched_group(struct task_group *tg) 8092void free_fair_sched_group(struct task_group *tg)
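
The detach_task_cfs_rq()/attach_task_cfs_rq() pair introduced above keeps a task's vruntime relative while it is off a runqueue: detaching subtracts the old cfs_rq's min_vruntime and attaching adds the new one, so only the task's lag is carried across queues, not an absolute timestamp. A toy sketch of that re-basing with plain numbers; no scheduler structures are involved.

    #include <stdio.h>

    int main(void)
    {
        unsigned long long src_min_vruntime = 1000000;  /* old cfs_rq */
        unsigned long long dst_min_vruntime = 5000000;  /* new cfs_rq */
        unsigned long long vruntime = 1000200;          /* task, 200 ahead of the old queue */

        /* Detach: keep only the offset against the old queue. */
        unsigned long long lag = vruntime - src_min_vruntime;

        /* Attach: re-base the offset on the new queue's min_vruntime, so the
         * task is neither boosted nor penalized by the queues' different
         * absolute clocks. */
        unsigned long long new_vruntime = dst_min_vruntime + lag;

        printf("lag carried over: %llu, vruntime on new queue: %llu\n",
               lag, new_vruntime);
        return 0;
    }
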
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 83a50e7ca533..69631fa46c2f 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -36,11 +36,6 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
36 */ 36 */
37SCHED_FEAT(WAKEUP_PREEMPTION, true) 37SCHED_FEAT(WAKEUP_PREEMPTION, true)
38 38
39/*
40 * Use arch dependent cpu capacity functions
41 */
42SCHED_FEAT(ARCH_CAPACITY, true)
43
44SCHED_FEAT(HRTICK, false) 39SCHED_FEAT(HRTICK, false)
45SCHED_FEAT(DOUBLE_TICK, false) 40SCHED_FEAT(DOUBLE_TICK, false)
46SCHED_FEAT(LB_BIAS, true) 41SCHED_FEAT(LB_BIAS, true)
@@ -72,19 +67,5 @@ SCHED_FEAT(RT_PUSH_IPI, true)
72SCHED_FEAT(FORCE_SD_OVERLAP, false) 67SCHED_FEAT(FORCE_SD_OVERLAP, false)
73SCHED_FEAT(RT_RUNTIME_SHARE, true) 68SCHED_FEAT(RT_RUNTIME_SHARE, true)
74SCHED_FEAT(LB_MIN, false) 69SCHED_FEAT(LB_MIN, false)
70SCHED_FEAT(ATTACH_AGE_LOAD, true)
75 71
76/*
77 * Apply the automatic NUMA scheduling policy. Enabled automatically
78 * at runtime if running on a NUMA machine. Can be controlled via
79 * numa_balancing=
80 */
81#ifdef CONFIG_NUMA_BALANCING
82
83/*
84 * NUMA will favor moving tasks towards nodes where a higher number of
85 * hinting faults are recorded during active load balancing. It will
86 * resist moving tasks towards nodes where a lower number of hinting
87 * faults have been recorded.
88 */
89SCHED_FEAT(NUMA, true)
90#endif
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 8f177c73ae19..4a2ef5a02fd3 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -57,9 +57,11 @@ static inline int cpu_idle_poll(void)
57 rcu_idle_enter(); 57 rcu_idle_enter();
58 trace_cpu_idle_rcuidle(0, smp_processor_id()); 58 trace_cpu_idle_rcuidle(0, smp_processor_id());
59 local_irq_enable(); 59 local_irq_enable();
60 stop_critical_timings();
60 while (!tif_need_resched() && 61 while (!tif_need_resched() &&
61 (cpu_idle_force_poll || tick_check_broadcast_expired())) 62 (cpu_idle_force_poll || tick_check_broadcast_expired()))
62 cpu_relax(); 63 cpu_relax();
64 start_critical_timings();
63 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); 65 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
64 rcu_idle_exit(); 66 rcu_idle_exit();
65 return 1; 67 return 1;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d2ea59364a1c..e3cc16312046 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -635,11 +635,11 @@ bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
635/* 635/*
636 * We ran out of runtime, see if we can borrow some from our neighbours. 636 * We ran out of runtime, see if we can borrow some from our neighbours.
637 */ 637 */
638static int do_balance_runtime(struct rt_rq *rt_rq) 638static void do_balance_runtime(struct rt_rq *rt_rq)
639{ 639{
640 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 640 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
641 struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd; 641 struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
642 int i, weight, more = 0; 642 int i, weight;
643 u64 rt_period; 643 u64 rt_period;
644 644
645 weight = cpumask_weight(rd->span); 645 weight = cpumask_weight(rd->span);
@@ -673,7 +673,6 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
673 diff = rt_period - rt_rq->rt_runtime; 673 diff = rt_period - rt_rq->rt_runtime;
674 iter->rt_runtime -= diff; 674 iter->rt_runtime -= diff;
675 rt_rq->rt_runtime += diff; 675 rt_rq->rt_runtime += diff;
676 more = 1;
677 if (rt_rq->rt_runtime == rt_period) { 676 if (rt_rq->rt_runtime == rt_period) {
678 raw_spin_unlock(&iter->rt_runtime_lock); 677 raw_spin_unlock(&iter->rt_runtime_lock);
679 break; 678 break;
@@ -683,8 +682,6 @@ next:
683 raw_spin_unlock(&iter->rt_runtime_lock); 682 raw_spin_unlock(&iter->rt_runtime_lock);
684 } 683 }
685 raw_spin_unlock(&rt_b->rt_runtime_lock); 684 raw_spin_unlock(&rt_b->rt_runtime_lock);
686
687 return more;
688} 685}
689 686
690/* 687/*
@@ -796,26 +793,19 @@ static void __enable_runtime(struct rq *rq)
796 } 793 }
797} 794}
798 795
799static int balance_runtime(struct rt_rq *rt_rq) 796static void balance_runtime(struct rt_rq *rt_rq)
800{ 797{
801 int more = 0;
802
803 if (!sched_feat(RT_RUNTIME_SHARE)) 798 if (!sched_feat(RT_RUNTIME_SHARE))
804 return more; 799 return;
805 800
806 if (rt_rq->rt_time > rt_rq->rt_runtime) { 801 if (rt_rq->rt_time > rt_rq->rt_runtime) {
807 raw_spin_unlock(&rt_rq->rt_runtime_lock); 802 raw_spin_unlock(&rt_rq->rt_runtime_lock);
808 more = do_balance_runtime(rt_rq); 803 do_balance_runtime(rt_rq);
809 raw_spin_lock(&rt_rq->rt_runtime_lock); 804 raw_spin_lock(&rt_rq->rt_runtime_lock);
810 } 805 }
811
812 return more;
813} 806}
814#else /* !CONFIG_SMP */ 807#else /* !CONFIG_SMP */
815static inline int balance_runtime(struct rt_rq *rt_rq) 808static inline void balance_runtime(struct rt_rq *rt_rq) {}
816{
817 return 0;
818}
819#endif /* CONFIG_SMP */ 809#endif /* CONFIG_SMP */
820 810
821static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) 811static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 68cda117574c..efd3bfc7e347 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -84,6 +84,10 @@ static inline void update_cpu_load_active(struct rq *this_rq) { }
84 */ 84 */
85#define RUNTIME_INF ((u64)~0ULL) 85#define RUNTIME_INF ((u64)~0ULL)
86 86
87static inline int idle_policy(int policy)
88{
89 return policy == SCHED_IDLE;
90}
87static inline int fair_policy(int policy) 91static inline int fair_policy(int policy)
88{ 92{
89 return policy == SCHED_NORMAL || policy == SCHED_BATCH; 93 return policy == SCHED_NORMAL || policy == SCHED_BATCH;
@@ -98,6 +102,11 @@ static inline int dl_policy(int policy)
98{ 102{
99 return policy == SCHED_DEADLINE; 103 return policy == SCHED_DEADLINE;
100} 104}
105static inline bool valid_policy(int policy)
106{
107 return idle_policy(policy) || fair_policy(policy) ||
108 rt_policy(policy) || dl_policy(policy);
109}
101 110
102static inline int task_has_rt_policy(struct task_struct *p) 111static inline int task_has_rt_policy(struct task_struct *p)
103{ 112{
@@ -109,11 +118,6 @@ static inline int task_has_dl_policy(struct task_struct *p)
109 return dl_policy(p->policy); 118 return dl_policy(p->policy);
110} 119}
111 120
112static inline bool dl_time_before(u64 a, u64 b)
113{
114 return (s64)(a - b) < 0;
115}
116
117/* 121/*
118 * Tells if entity @a should preempt entity @b. 122 * Tells if entity @a should preempt entity @b.
119 */ 123 */
@@ -1003,17 +1007,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
1003#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) 1007#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
1004#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ 1008#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
1005 1009
1006#ifdef CONFIG_NUMA_BALANCING 1010extern struct static_key_false sched_numa_balancing;
1007#define sched_feat_numa(x) sched_feat(x)
1008#ifdef CONFIG_SCHED_DEBUG
1009#define numabalancing_enabled sched_feat_numa(NUMA)
1010#else
1011extern bool numabalancing_enabled;
1012#endif /* CONFIG_SCHED_DEBUG */
1013#else
1014#define sched_feat_numa(x) (0)
1015#define numabalancing_enabled (0)
1016#endif /* CONFIG_NUMA_BALANCING */
1017 1011
1018static inline u64 global_rt_period(void) 1012static inline u64 global_rt_period(void)
1019{ 1013{
@@ -1078,9 +1072,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1078 * After ->on_cpu is cleared, the task can be moved to a different CPU. 1072 * After ->on_cpu is cleared, the task can be moved to a different CPU.
1079 * We must ensure this doesn't happen until the switch is completely 1073 * We must ensure this doesn't happen until the switch is completely
1080 * finished. 1074 * finished.
1075 *
1076 * Pairs with the control dependency and rmb in try_to_wake_up().
1081 */ 1077 */
1082 smp_wmb(); 1078 smp_store_release(&prev->on_cpu, 0);
1083 prev->on_cpu = 0;
1084#endif 1079#endif
1085#ifdef CONFIG_DEBUG_SPINLOCK 1080#ifdef CONFIG_DEBUG_SPINLOCK
1086 /* this is a valid case when another task releases the spinlock */ 1081 /* this is a valid case when another task releases the spinlock */
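
The finish_lock_switch() hunk above replaces an smp_wmb() plus plain store with smp_store_release(), which pairs with the acquire/control dependency on the reader side so that everything written before clearing ->on_cpu is visible once the flag is seen clear. Below is a userspace illustration of the same release/acquire pairing with C11 atomics; it is the ordering idea only, not the kernel primitive.

    #include <stdatomic.h>
    #include <pthread.h>
    #include <stdio.h>

    static int payload;          /* "state written while still on the CPU" */
    static atomic_int on_cpu = 1;

    static void *writer(void *arg)
    {
        (void)arg;
        payload = 42;            /* work done before the hand-off */
        /* Release: all earlier stores are visible to whoever observes 0. */
        atomic_store_explicit(&on_cpu, 0, memory_order_release);
        return NULL;
    }

    static void *reader(void *arg)
    {
        (void)arg;
        /* Acquire: once we see 0, the payload store is guaranteed visible. */
        while (atomic_load_explicit(&on_cpu, memory_order_acquire))
            ;
        printf("payload = %d\n", payload);  /* always prints 42 */
        return NULL;
    }

    int main(void)                           /* build with: cc -pthread ... */
    {
        pthread_t w, r;

        pthread_create(&r, NULL, reader, NULL);
        pthread_create(&w, NULL, writer, NULL);
        pthread_join(w, NULL);
        pthread_join(r, NULL);
        return 0;
    }
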
@@ -1156,16 +1151,18 @@ static const u32 prio_to_wmult[40] = {
1156 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1151 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1157}; 1152};
1158 1153
1159#define ENQUEUE_WAKEUP 1 1154#define ENQUEUE_WAKEUP 0x01
1160#define ENQUEUE_HEAD 2 1155#define ENQUEUE_HEAD 0x02
1161#ifdef CONFIG_SMP 1156#ifdef CONFIG_SMP
1162#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ 1157#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */
1163#else 1158#else
1164#define ENQUEUE_WAKING 0 1159#define ENQUEUE_WAKING 0x00
1165#endif 1160#endif
1166#define ENQUEUE_REPLENISH 8 1161#define ENQUEUE_REPLENISH 0x08
1162#define ENQUEUE_RESTORE 0x10
1167 1163
1168#define DEQUEUE_SLEEP 1 1164#define DEQUEUE_SLEEP 0x01
1165#define DEQUEUE_SAVE 0x02
1169 1166
1170#define RETRY_TASK ((void *)-1UL) 1167#define RETRY_TASK ((void *)-1UL)
1171 1168
@@ -1193,7 +1190,7 @@ struct sched_class {
1193 1190
1194#ifdef CONFIG_SMP 1191#ifdef CONFIG_SMP
1195 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); 1192 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
1196 void (*migrate_task_rq)(struct task_struct *p, int next_cpu); 1193 void (*migrate_task_rq)(struct task_struct *p);
1197 1194
1198 void (*task_waking) (struct task_struct *task); 1195 void (*task_waking) (struct task_struct *task);
1199 void (*task_woken) (struct rq *this_rq, struct task_struct *task); 1196 void (*task_woken) (struct rq *this_rq, struct task_struct *task);
@@ -1226,7 +1223,7 @@ struct sched_class {
1226 void (*update_curr) (struct rq *rq); 1223 void (*update_curr) (struct rq *rq);
1227 1224
1228#ifdef CONFIG_FAIR_GROUP_SCHED 1225#ifdef CONFIG_FAIR_GROUP_SCHED
1229 void (*task_move_group) (struct task_struct *p, int on_rq); 1226 void (*task_move_group) (struct task_struct *p);
1230#endif 1227#endif
1231}; 1228};
1232 1229
@@ -1404,6 +1401,17 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
1404} 1401}
1405#endif 1402#endif
1406 1403
1404#ifndef arch_scale_cpu_capacity
1405static __always_inline
1406unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
1407{
1408 if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
1409 return sd->smt_gain / sd->span_weight;
1410
1411 return SCHED_CAPACITY_SCALE;
1412}
1413#endif
1414
1407static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1415static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1408{ 1416{
1409 rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); 1417 rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
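
The new default arch_scale_cpu_capacity() above folds in what default_scale_cpu_capacity() in fair.c used to do: at an SMT level the per-thread capacity is the core's smt_gain split across the siblings, otherwise it is the full SCHED_CAPACITY_SCALE. A quick standalone sketch of that arithmetic; 1178 is the usual smt_gain default but treat the numbers as illustrative.

    #include <stdio.h>

    #define SCHED_CAPACITY_SCALE 1024UL

    /* Mirrors the default above: split the SMT gain among hardware threads
     * sharing a core, otherwise report full capacity. */
    static unsigned long cpu_capacity(int share_cpucapacity,
                                      unsigned long smt_gain,
                                      unsigned int span_weight)
    {
        if (share_cpucapacity && span_weight > 1)
            return smt_gain / span_weight;

        return SCHED_CAPACITY_SCALE;
    }

    int main(void)
    {
        /* Two SMT siblings sharing a core each get 1178 / 2 = 589 ... */
        printf("SMT sibling : %lu\n", cpu_capacity(1, 1178, 2));
        /* ... while a level that does not share capacity reports 1024. */
        printf("whole core  : %lu\n", cpu_capacity(0, 1178, 1));
        return 0;
    }
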
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 272d9322bc5d..052e02672d12 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -106,10 +106,9 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
106} 106}
107EXPORT_SYMBOL_GPL(__wake_up_locked); 107EXPORT_SYMBOL_GPL(__wake_up_locked);
108 108
109void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr, 109void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
110 void *key)
111{ 110{
112 __wake_up_common(q, mode, nr, 0, key); 111 __wake_up_common(q, mode, 1, 0, key);
113} 112}
114EXPORT_SYMBOL_GPL(__wake_up_locked_key); 113EXPORT_SYMBOL_GPL(__wake_up_locked_key);
115 114
@@ -284,7 +283,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
284 if (!list_empty(&wait->task_list)) 283 if (!list_empty(&wait->task_list))
285 list_del_init(&wait->task_list); 284 list_del_init(&wait->task_list);
286 else if (waitqueue_active(q)) 285 else if (waitqueue_active(q))
287 __wake_up_locked_key(q, mode, 1, key); 286 __wake_up_locked_key(q, mode, key);
288 spin_unlock_irqrestore(&q->lock, flags); 287 spin_unlock_irqrestore(&q->lock, flags);
289} 288}
290EXPORT_SYMBOL(abort_exclusive_wait); 289EXPORT_SYMBOL(abort_exclusive_wait);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 5bd4779282df..580ac2d4024f 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -347,6 +347,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
347{ 347{
348 struct seccomp_filter *sfilter; 348 struct seccomp_filter *sfilter;
349 int ret; 349 int ret;
350 const bool save_orig = config_enabled(CONFIG_CHECKPOINT_RESTORE);
350 351
351 if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) 352 if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
352 return ERR_PTR(-EINVAL); 353 return ERR_PTR(-EINVAL);
@@ -370,7 +371,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
370 return ERR_PTR(-ENOMEM); 371 return ERR_PTR(-ENOMEM);
371 372
372 ret = bpf_prog_create_from_user(&sfilter->prog, fprog, 373 ret = bpf_prog_create_from_user(&sfilter->prog, fprog,
373 seccomp_check_filter); 374 seccomp_check_filter, save_orig);
374 if (ret < 0) { 375 if (ret < 0) {
375 kfree(sfilter); 376 kfree(sfilter);
376 return ERR_PTR(ret); 377 return ERR_PTR(ret);
@@ -469,7 +470,7 @@ void get_seccomp_filter(struct task_struct *tsk)
469static inline void seccomp_filter_free(struct seccomp_filter *filter) 470static inline void seccomp_filter_free(struct seccomp_filter *filter)
470{ 471{
471 if (filter) { 472 if (filter) {
472 bpf_prog_free(filter->prog); 473 bpf_prog_destroy(filter->prog);
473 kfree(filter); 474 kfree(filter);
474 } 475 }
475} 476}
@@ -867,3 +868,76 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
867 /* prctl interface doesn't have flags, so they are always zero. */ 868 /* prctl interface doesn't have flags, so they are always zero. */
868 return do_seccomp(op, 0, uargs); 869 return do_seccomp(op, 0, uargs);
869} 870}
871
872#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
873long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
874 void __user *data)
875{
876 struct seccomp_filter *filter;
877 struct sock_fprog_kern *fprog;
878 long ret;
879 unsigned long count = 0;
880
881 if (!capable(CAP_SYS_ADMIN) ||
882 current->seccomp.mode != SECCOMP_MODE_DISABLED) {
883 return -EACCES;
884 }
885
886 spin_lock_irq(&task->sighand->siglock);
887 if (task->seccomp.mode != SECCOMP_MODE_FILTER) {
888 ret = -EINVAL;
889 goto out;
890 }
891
892 filter = task->seccomp.filter;
893 while (filter) {
894 filter = filter->prev;
895 count++;
896 }
897
898 if (filter_off >= count) {
899 ret = -ENOENT;
900 goto out;
901 }
902 count -= filter_off;
903
904 filter = task->seccomp.filter;
905 while (filter && count > 1) {
906 filter = filter->prev;
907 count--;
908 }
909
910 if (WARN_ON(count != 1 || !filter)) {
911 /* The filter tree shouldn't shrink while we're using it. */
912 ret = -ENOENT;
913 goto out;
914 }
915
916 fprog = filter->prog->orig_prog;
917 if (!fprog) {
 918 /* This must be a new non-cBPF filter, since we save
 919 * every cBPF filter's orig_prog above when
920 * CONFIG_CHECKPOINT_RESTORE is enabled.
921 */
922 ret = -EMEDIUMTYPE;
923 goto out;
924 }
925
926 ret = fprog->len;
927 if (!data)
928 goto out;
929
930 get_seccomp_filter(task);
931 spin_unlock_irq(&task->sighand->siglock);
932
933 if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog)))
934 ret = -EFAULT;
935
936 put_seccomp_filter(task);
937 return ret;
938
939out:
940 spin_unlock_irq(&task->sighand->siglock);
941 return ret;
942}
943#endif
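
seccomp_get_filter() above first counts the stacked filters (newest first via ->prev) and then walks back down so that filter_off selects one of them; in the sketch below, which mirrors that walk with a made-up struct in place of struct seccomp_filter, offset 0 reaches the oldest entry and the highest valid offset the most recently installed one.

    #include <stdio.h>
    #include <stddef.h>

    /* Stand-in for struct seccomp_filter: a newest-first stack linked
     * through ->prev. */
    struct filter {
        const char *name;
        struct filter *prev;
    };

    static struct filter *nth_filter(struct filter *newest, unsigned long off)
    {
        unsigned long count = 0;
        struct filter *f;

        for (f = newest; f; f = f->prev)
            count++;                 /* total number of stacked filters */

        if (off >= count)
            return NULL;             /* the kernel returns -ENOENT here */

        count -= off;
        for (f = newest; f && count > 1; f = f->prev)
            count--;                 /* walk down to the requested one */

        return f;
    }

    int main(void)
    {
        struct filter first  = { "first installed", NULL };
        struct filter second = { "second installed", &first };
        struct filter third  = { "third installed", &second };

        printf("off 0 -> %s\n", nth_filter(&third, 0)->name);
        printf("off 2 -> %s\n", nth_filter(&third, 2)->name);
        printf("off 3 -> %s\n", nth_filter(&third, 3) ? "found" : "out of range");
        return 0;
    }
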
diff --git a/kernel/signal.c b/kernel/signal.c
index 0f6bbbe77b46..c0b01fe24bbd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -503,41 +503,6 @@ int unhandled_signal(struct task_struct *tsk, int sig)
503 return !tsk->ptrace; 503 return !tsk->ptrace;
504} 504}
505 505
506/*
507 * Notify the system that a driver wants to block all signals for this
508 * process, and wants to be notified if any signals at all were to be
509 * sent/acted upon. If the notifier routine returns non-zero, then the
510 * signal will be acted upon after all. If the notifier routine returns 0,
511 * then then signal will be blocked. Only one block per process is
512 * allowed. priv is a pointer to private data that the notifier routine
513 * can use to determine if the signal should be blocked or not.
514 */
515void
516block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask)
517{
518 unsigned long flags;
519
520 spin_lock_irqsave(&current->sighand->siglock, flags);
521 current->notifier_mask = mask;
522 current->notifier_data = priv;
523 current->notifier = notifier;
524 spin_unlock_irqrestore(&current->sighand->siglock, flags);
525}
526
527/* Notify the system that blocking has ended. */
528
529void
530unblock_all_signals(void)
531{
532 unsigned long flags;
533
534 spin_lock_irqsave(&current->sighand->siglock, flags);
535 current->notifier = NULL;
536 current->notifier_data = NULL;
537 recalc_sigpending();
538 spin_unlock_irqrestore(&current->sighand->siglock, flags);
539}
540
541static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) 506static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
542{ 507{
543 struct sigqueue *q, *first = NULL; 508 struct sigqueue *q, *first = NULL;
@@ -580,19 +545,8 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
580{ 545{
581 int sig = next_signal(pending, mask); 546 int sig = next_signal(pending, mask);
582 547
583 if (sig) { 548 if (sig)
584 if (current->notifier) {
585 if (sigismember(current->notifier_mask, sig)) {
586 if (!(current->notifier)(current->notifier_data)) {
587 clear_thread_flag(TIF_SIGPENDING);
588 return 0;
589 }
590 }
591 }
592
593 collect_signal(sig, pending, info); 549 collect_signal(sig, pending, info);
594 }
595
596 return sig; 550 return sig;
597} 551}
598 552
@@ -834,7 +788,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
834 sigset_t flush; 788 sigset_t flush;
835 789
836 if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { 790 if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) {
837 if (signal->flags & SIGNAL_GROUP_COREDUMP) 791 if (!(signal->flags & SIGNAL_GROUP_EXIT))
838 return sig == SIGKILL; 792 return sig == SIGKILL;
839 /* 793 /*
840 * The process is in the middle of dying, nothing to do. 794 * The process is in the middle of dying, nothing to do.
@@ -2483,9 +2437,6 @@ EXPORT_SYMBOL(force_sig);
2483EXPORT_SYMBOL(send_sig); 2437EXPORT_SYMBOL(send_sig);
2484EXPORT_SYMBOL(send_sig_info); 2438EXPORT_SYMBOL(send_sig_info);
2485EXPORT_SYMBOL(sigprocmask); 2439EXPORT_SYMBOL(sigprocmask);
2486EXPORT_SYMBOL(block_all_signals);
2487EXPORT_SYMBOL(unblock_all_signals);
2488
2489 2440
2490/* 2441/*
2491 * System call entry points. 2442 * System call entry points.
diff --git a/kernel/smp.c b/kernel/smp.c
index 07854477c164..d903c02223af 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -669,7 +669,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
669 cpumask_var_t cpus; 669 cpumask_var_t cpus;
670 int cpu, ret; 670 int cpu, ret;
671 671
672 might_sleep_if(gfp_flags & __GFP_WAIT); 672 might_sleep_if(gfpflags_allow_blocking(gfp_flags));
673 673
674 if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { 674 if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
675 preempt_disable(); 675 preempt_disable();
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index a818cbc73e14..d264f59bff56 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -222,9 +222,8 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp
222{ 222{
223 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); 223 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
224 224
225 if (ht->pre_unpark) 225 if (!ht->selfparking)
226 ht->pre_unpark(cpu); 226 kthread_unpark(tsk);
227 kthread_unpark(tsk);
228} 227}
229 228
230void smpboot_unpark_threads(unsigned int cpu) 229void smpboot_unpark_threads(unsigned int cpu)
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 12484e5d5c88..867bc20e1ef1 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -73,21 +73,24 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
73 } 73 }
74} 74}
75 75
76static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
77 struct cpu_stop_work *work)
78{
79 list_add_tail(&work->list, &stopper->works);
80 wake_up_process(stopper->thread);
81}
82
76/* queue @work to @stopper. if offline, @work is completed immediately */ 83/* queue @work to @stopper. if offline, @work is completed immediately */
77static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) 84static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
78{ 85{
79 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 86 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
80
81 unsigned long flags; 87 unsigned long flags;
82 88
83 spin_lock_irqsave(&stopper->lock, flags); 89 spin_lock_irqsave(&stopper->lock, flags);
84 90 if (stopper->enabled)
85 if (stopper->enabled) { 91 __cpu_stop_queue_work(stopper, work);
86 list_add_tail(&work->list, &stopper->works); 92 else
87 wake_up_process(stopper->thread);
88 } else
89 cpu_stop_signal_done(work->done, false); 93 cpu_stop_signal_done(work->done, false);
90
91 spin_unlock_irqrestore(&stopper->lock, flags); 94 spin_unlock_irqrestore(&stopper->lock, flags);
92} 95}
93 96
@@ -213,6 +216,31 @@ static int multi_cpu_stop(void *data)
213 return err; 216 return err;
214} 217}
215 218
219static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
220 int cpu2, struct cpu_stop_work *work2)
221{
222 struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
223 struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
224 int err;
225
226 lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
227 spin_lock_irq(&stopper1->lock);
228 spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
229
230 err = -ENOENT;
231 if (!stopper1->enabled || !stopper2->enabled)
232 goto unlock;
233
234 err = 0;
235 __cpu_stop_queue_work(stopper1, work1);
236 __cpu_stop_queue_work(stopper2, work2);
237unlock:
238 spin_unlock(&stopper2->lock);
239 spin_unlock_irq(&stopper1->lock);
240 lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
241
242 return err;
243}
216/** 244/**
217 * stop_two_cpus - stops two cpus 245 * stop_two_cpus - stops two cpus
218 * @cpu1: the cpu to stop 246 * @cpu1: the cpu to stop
@@ -247,24 +275,13 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
247 cpu_stop_init_done(&done, 2); 275 cpu_stop_init_done(&done, 2);
248 set_state(&msdata, MULTI_STOP_PREPARE); 276 set_state(&msdata, MULTI_STOP_PREPARE);
249 277
250 /* 278 if (cpu1 > cpu2)
251 * If we observe both CPUs active we know _cpu_down() cannot yet have 279 swap(cpu1, cpu2);
252 * queued its stop_machine works and therefore ours will get executed 280 if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) {
253 * first. Or its not either one of our CPUs that's getting unplugged,
254 * in which case we don't care.
255 *
256 * This relies on the stopper workqueues to be FIFO.
257 */
258 if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
259 preempt_enable(); 281 preempt_enable();
260 return -ENOENT; 282 return -ENOENT;
261 } 283 }
262 284
263 lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
264 cpu_stop_queue_work(cpu1, &work1);
265 cpu_stop_queue_work(cpu2, &work2);
266 lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
267
268 preempt_enable(); 285 preempt_enable();
269 286
270 wait_for_completion(&done.completion); 287 wait_for_completion(&done.completion);
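
cpu_stop_queue_two_works() together with the swap(cpu1, cpu2) in stop_two_cpus() above enforces one global order for taking the two per-CPU stopper locks, which is what makes the nested spin_lock safe against ABBA deadlock. A userspace sketch of the same rule with pthread mutexes; the CPU count and naming are arbitrary.

    #include <pthread.h>
    #include <stdio.h>

    #define NR_CPUS 4

    static pthread_mutex_t stopper_lock[NR_CPUS] = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
    };

    static void queue_two(int cpu1, int cpu2)
    {
        /* Fixed ordering: always lock the lower-numbered CPU first,
         * whatever order the caller passed them in. */
        if (cpu1 > cpu2) {
            int tmp = cpu1; cpu1 = cpu2; cpu2 = tmp;
        }
        pthread_mutex_lock(&stopper_lock[cpu1]);
        pthread_mutex_lock(&stopper_lock[cpu2]);   /* the "nested" acquisition */

        printf("queued work on CPUs %d and %d\n", cpu1, cpu2);

        pthread_mutex_unlock(&stopper_lock[cpu2]);
        pthread_mutex_unlock(&stopper_lock[cpu1]);
    }

    int main(void)
    {
        /* Both call orders take the locks in the same global order, so two
         * concurrent callers can never deadlock on each other. */
        queue_two(1, 3);
        queue_two(3, 1);
        return 0;
    }
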
@@ -452,6 +469,18 @@ repeat:
452 } 469 }
453} 470}
454 471
472void stop_machine_park(int cpu)
473{
474 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
475 /*
476 * Lockless. cpu_stopper_thread() will take stopper->lock and flush
 477 * the pending works before it parks; until then it is fine to queue
478 * the new works.
479 */
480 stopper->enabled = false;
481 kthread_park(stopper->thread);
482}
483
455extern void sched_set_stop_task(int cpu, struct task_struct *stop); 484extern void sched_set_stop_task(int cpu, struct task_struct *stop);
456 485
457static void cpu_stop_create(unsigned int cpu) 486static void cpu_stop_create(unsigned int cpu)
@@ -462,26 +491,16 @@ static void cpu_stop_create(unsigned int cpu)
462static void cpu_stop_park(unsigned int cpu) 491static void cpu_stop_park(unsigned int cpu)
463{ 492{
464 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 493 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
465 struct cpu_stop_work *work, *tmp;
466 unsigned long flags;
467 494
468 /* drain remaining works */ 495 WARN_ON(!list_empty(&stopper->works));
469 spin_lock_irqsave(&stopper->lock, flags);
470 list_for_each_entry_safe(work, tmp, &stopper->works, list) {
471 list_del_init(&work->list);
472 cpu_stop_signal_done(work->done, false);
473 }
474 stopper->enabled = false;
475 spin_unlock_irqrestore(&stopper->lock, flags);
476} 496}
477 497
478static void cpu_stop_unpark(unsigned int cpu) 498void stop_machine_unpark(int cpu)
479{ 499{
480 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 500 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
481 501
482 spin_lock_irq(&stopper->lock);
483 stopper->enabled = true; 502 stopper->enabled = true;
484 spin_unlock_irq(&stopper->lock); 503 kthread_unpark(stopper->thread);
485} 504}
486 505
487static struct smp_hotplug_thread cpu_stop_threads = { 506static struct smp_hotplug_thread cpu_stop_threads = {
@@ -490,9 +509,7 @@ static struct smp_hotplug_thread cpu_stop_threads = {
490 .thread_fn = cpu_stopper_thread, 509 .thread_fn = cpu_stopper_thread,
491 .thread_comm = "migration/%u", 510 .thread_comm = "migration/%u",
492 .create = cpu_stop_create, 511 .create = cpu_stop_create,
493 .setup = cpu_stop_unpark,
494 .park = cpu_stop_park, 512 .park = cpu_stop_park,
495 .pre_unpark = cpu_stop_unpark,
496 .selfparking = true, 513 .selfparking = true,
497}; 514};
498 515
@@ -508,6 +525,7 @@ static int __init cpu_stop_init(void)
508 } 525 }
509 526
510 BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads)); 527 BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
528 stop_machine_unpark(raw_smp_processor_id());
511 stop_machine_initialized = true; 529 stop_machine_initialized = true;
512 return 0; 530 return 0;
513} 531}
diff --git a/kernel/sys.c b/kernel/sys.c
index fa2f2f671a5c..6af9212ab5aa 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -222,7 +222,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
222 goto out_unlock; /* No processes for this user */ 222 goto out_unlock; /* No processes for this user */
223 } 223 }
224 do_each_thread(g, p) { 224 do_each_thread(g, p) {
225 if (uid_eq(task_uid(p), uid)) 225 if (uid_eq(task_uid(p), uid) && task_pid_vnr(p))
226 error = set_one_prio(p, niceval, error); 226 error = set_one_prio(p, niceval, error);
227 } while_each_thread(g, p); 227 } while_each_thread(g, p);
228 if (!uid_eq(uid, cred->uid)) 228 if (!uid_eq(uid, cred->uid))
@@ -290,7 +290,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
290 goto out_unlock; /* No processes for this user */ 290 goto out_unlock; /* No processes for this user */
291 } 291 }
292 do_each_thread(g, p) { 292 do_each_thread(g, p) {
293 if (uid_eq(task_uid(p), uid)) { 293 if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) {
294 niceval = nice_to_rlimit(task_nice(p)); 294 niceval = nice_to_rlimit(task_nice(p));
295 if (niceval > retval) 295 if (niceval > retval)
296 retval = niceval; 296 retval = niceval;
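
The sys.c hunks restrict the PRIO_USER walk in setpriority()/getpriority() to tasks that are visible in the caller's pid namespace (task_pid_vnr() != 0). For reference, a small userspace sketch of the syscall pair those hunks touch; the uid and the nice adjustment are arbitrary example values:

#include <sys/time.h>
#include <sys/resource.h>
#include <unistd.h>
#include <errno.h>
#include <stdio.h>

int main(void)
{
	uid_t uid = getuid();
	int prio;

	errno = 0;
	prio = getpriority(PRIO_USER, uid);   /* -1 is a valid return, so check errno */
	if (prio == -1 && errno) {
		perror("getpriority");
		return 1;
	}
	printf("highest priority (lowest nice) among uid %u's tasks: %d\n",
	       (unsigned)uid, prio);

	/* raise the nice value (lower the priority) of all of this user's tasks */
	if (setpriority(PRIO_USER, uid, prio + 1) == -1)
		perror("setpriority");
	return 0;
}
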
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index a02decf15583..0623787ec67a 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -194,6 +194,7 @@ cond_syscall(sys_mlock);
194cond_syscall(sys_munlock); 194cond_syscall(sys_munlock);
195cond_syscall(sys_mlockall); 195cond_syscall(sys_mlockall);
196cond_syscall(sys_munlockall); 196cond_syscall(sys_munlockall);
197cond_syscall(sys_mlock2);
197cond_syscall(sys_mincore); 198cond_syscall(sys_mincore);
198cond_syscall(sys_madvise); 199cond_syscall(sys_madvise);
199cond_syscall(sys_mremap); 200cond_syscall(sys_mremap);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e69201d8094e..dc6858d6639e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -64,6 +64,7 @@
64#include <linux/binfmts.h> 64#include <linux/binfmts.h>
65#include <linux/sched/sysctl.h> 65#include <linux/sched/sysctl.h>
66#include <linux/kexec.h> 66#include <linux/kexec.h>
67#include <linux/bpf.h>
67 68
68#include <asm/uaccess.h> 69#include <asm/uaccess.h>
69#include <asm/processor.h> 70#include <asm/processor.h>
@@ -887,6 +888,17 @@ static struct ctl_table kern_table[] = {
887 .extra1 = &zero, 888 .extra1 = &zero,
888 .extra2 = &one, 889 .extra2 = &one,
889 }, 890 },
891#ifdef CONFIG_HARDLOCKUP_DETECTOR
892 {
893 .procname = "hardlockup_panic",
894 .data = &hardlockup_panic,
895 .maxlen = sizeof(int),
896 .mode = 0644,
897 .proc_handler = proc_dointvec_minmax,
898 .extra1 = &zero,
899 .extra2 = &one,
900 },
901#endif
890#ifdef CONFIG_SMP 902#ifdef CONFIG_SMP
891 { 903 {
892 .procname = "softlockup_all_cpu_backtrace", 904 .procname = "softlockup_all_cpu_backtrace",
@@ -897,6 +909,15 @@ static struct ctl_table kern_table[] = {
897 .extra1 = &zero, 909 .extra1 = &zero,
898 .extra2 = &one, 910 .extra2 = &one,
899 }, 911 },
912 {
913 .procname = "hardlockup_all_cpu_backtrace",
914 .data = &sysctl_hardlockup_all_cpu_backtrace,
915 .maxlen = sizeof(int),
916 .mode = 0644,
917 .proc_handler = proc_dointvec_minmax,
918 .extra1 = &zero,
919 .extra2 = &one,
920 },
900#endif /* CONFIG_SMP */ 921#endif /* CONFIG_SMP */
901#endif 922#endif
902#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 923#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
@@ -1139,6 +1160,18 @@ static struct ctl_table kern_table[] = {
1139 .proc_handler = timer_migration_handler, 1160 .proc_handler = timer_migration_handler,
1140 }, 1161 },
1141#endif 1162#endif
1163#ifdef CONFIG_BPF_SYSCALL
1164 {
1165 .procname = "unprivileged_bpf_disabled",
1166 .data = &sysctl_unprivileged_bpf_disabled,
1167 .maxlen = sizeof(sysctl_unprivileged_bpf_disabled),
1168 .mode = 0644,
1169 /* only handle a transition from default "0" to "1" */
1170 .proc_handler = proc_dointvec_minmax,
1171 .extra1 = &one,
1172 .extra2 = &one,
1173 },
1174#endif
1142 { } 1175 { }
1143}; 1176};
1144 1177
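
The new kernel.unprivileged_bpf_disabled entry uses proc_dointvec_minmax with extra1 == extra2 == &one, so the only accepted write is "1": once flipped, unprivileged bpf() stays disabled until reboot. A trivial sketch of flipping it from userspace (needs root; the proc path only exists with CONFIG_BPF_SYSCALL):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/unprivileged_bpf_disabled", "w");

	if (!f) {
		perror("unprivileged_bpf_disabled");
		return 1;
	}
	if (fputs("1\n", f) == EOF)     /* any value other than 1 is rejected */
		perror("write");
	fclose(f);
	return 0;
}
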
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 841b72f720e8..1347882d131e 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -217,7 +217,7 @@ static void clocksource_watchdog(unsigned long data)
217 continue; 217 continue;
218 218
219 /* Check the deviation from the watchdog clocksource. */ 219 /* Check the deviation from the watchdog clocksource. */
220 if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { 220 if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
221 pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n", 221 pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n",
222 cs->name); 222 cs->name);
223 pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", 223 pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
@@ -479,7 +479,7 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
479 * return half the number of nanoseconds the hardware counter can technically 479 * return half the number of nanoseconds the hardware counter can technically
480 * cover. This is done so that we can potentially detect problems caused by 480 * cover. This is done so that we can potentially detect problems caused by
481 * delayed timers or bad hardware, which might result in time intervals that 481 * delayed timers or bad hardware, which might result in time intervals that
482 * are larger then what the math used can handle without overflows. 482 * are larger than what the math used can handle without overflows.
483 */ 483 */
484u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc) 484u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc)
485{ 485{
@@ -595,16 +595,15 @@ static void __clocksource_select(bool skipcur)
595 */ 595 */
596static void clocksource_select(void) 596static void clocksource_select(void)
597{ 597{
598 return __clocksource_select(false); 598 __clocksource_select(false);
599} 599}
600 600
601static void clocksource_select_fallback(void) 601static void clocksource_select_fallback(void)
602{ 602{
603 return __clocksource_select(true); 603 __clocksource_select(true);
604} 604}
605 605
606#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ 606#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */
607
608static inline void clocksource_select(void) { } 607static inline void clocksource_select(void) { }
609static inline void clocksource_select_fallback(void) { } 608static inline void clocksource_select_fallback(void) { }
610 609
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 457a373e2181..435b8850dd80 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -59,7 +59,7 @@
59/* 59/*
60 * The timer bases: 60 * The timer bases:
61 * 61 *
62 * There are more clockids then hrtimer bases. Thus, we index 62 * There are more clockids than hrtimer bases. Thus, we index
63 * into the timer bases by the hrtimer_base_type enum. When trying 63 * into the timer bases by the hrtimer_base_type enum. When trying
64 * to reach a base using a clockid, hrtimer_clockid_to_base() 64 * to reach a base using a clockid, hrtimer_clockid_to_base()
65 * is used to convert from clockid to the proper hrtimer_base_type. 65 * is used to convert from clockid to the proper hrtimer_base_type.
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index df68cb875248..149cc8086aea 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -99,7 +99,7 @@ static time64_t ntp_next_leap_sec = TIME64_MAX;
99static int pps_valid; /* signal watchdog counter */ 99static int pps_valid; /* signal watchdog counter */
100static long pps_tf[3]; /* phase median filter */ 100static long pps_tf[3]; /* phase median filter */
101static long pps_jitter; /* current jitter (ns) */ 101static long pps_jitter; /* current jitter (ns) */
102static struct timespec pps_fbase; /* beginning of the last freq interval */ 102static struct timespec64 pps_fbase; /* beginning of the last freq interval */
103static int pps_shift; /* current interval duration (s) (shift) */ 103static int pps_shift; /* current interval duration (s) (shift) */
104static int pps_intcnt; /* interval counter */ 104static int pps_intcnt; /* interval counter */
105static s64 pps_freq; /* frequency offset (scaled ns/s) */ 105static s64 pps_freq; /* frequency offset (scaled ns/s) */
@@ -509,7 +509,7 @@ static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
509static void sync_cmos_clock(struct work_struct *work) 509static void sync_cmos_clock(struct work_struct *work)
510{ 510{
511 struct timespec64 now; 511 struct timespec64 now;
512 struct timespec next; 512 struct timespec64 next;
513 int fail = 1; 513 int fail = 1;
514 514
515 /* 515 /*
@@ -559,7 +559,7 @@ static void sync_cmos_clock(struct work_struct *work)
559 next.tv_nsec -= NSEC_PER_SEC; 559 next.tv_nsec -= NSEC_PER_SEC;
560 } 560 }
561 queue_delayed_work(system_power_efficient_wq, 561 queue_delayed_work(system_power_efficient_wq,
562 &sync_cmos_work, timespec_to_jiffies(&next)); 562 &sync_cmos_work, timespec64_to_jiffies(&next));
563} 563}
564 564
565void ntp_notify_cmos_timer(void) 565void ntp_notify_cmos_timer(void)
@@ -773,13 +773,13 @@ int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai)
773 * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] 773 * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ]
774 * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */ 774 * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */
775struct pps_normtime { 775struct pps_normtime {
776 __kernel_time_t sec; /* seconds */ 776 s64 sec; /* seconds */
777 long nsec; /* nanoseconds */ 777 long nsec; /* nanoseconds */
778}; 778};
779 779
780/* normalize the timestamp so that nsec is in the 780/* normalize the timestamp so that nsec is in the
781 ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */ 781 ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */
782static inline struct pps_normtime pps_normalize_ts(struct timespec ts) 782static inline struct pps_normtime pps_normalize_ts(struct timespec64 ts)
783{ 783{
784 struct pps_normtime norm = { 784 struct pps_normtime norm = {
785 .sec = ts.tv_sec, 785 .sec = ts.tv_sec,
@@ -861,7 +861,7 @@ static long hardpps_update_freq(struct pps_normtime freq_norm)
861 pps_errcnt++; 861 pps_errcnt++;
862 pps_dec_freq_interval(); 862 pps_dec_freq_interval();
863 printk_deferred(KERN_ERR 863 printk_deferred(KERN_ERR
864 "hardpps: PPSERROR: interval too long - %ld s\n", 864 "hardpps: PPSERROR: interval too long - %lld s\n",
865 freq_norm.sec); 865 freq_norm.sec);
866 return 0; 866 return 0;
867 } 867 }
@@ -948,7 +948,7 @@ static void hardpps_update_phase(long error)
948 * This code is based on David Mills's reference nanokernel 948 * This code is based on David Mills's reference nanokernel
949 * implementation. It was mostly rewritten but keeps the same idea. 949 * implementation. It was mostly rewritten but keeps the same idea.
950 */ 950 */
951void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) 951void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
952{ 952{
953 struct pps_normtime pts_norm, freq_norm; 953 struct pps_normtime pts_norm, freq_norm;
954 954
@@ -969,7 +969,7 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
969 } 969 }
970 970
971 /* ok, now we have a base for frequency calculation */ 971 /* ok, now we have a base for frequency calculation */
972 freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase)); 972 freq_norm = pps_normalize_ts(timespec64_sub(*raw_ts, pps_fbase));
973 973
974 /* check that the signal is in the range 974 /* check that the signal is in the range
975 * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */ 975 * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */
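
The ntp.c hunks move the PPS bookkeeping from struct timespec to struct timespec64 and widen pps_normtime.sec to s64, so the arithmetic survives 2038 on 32-bit. A standalone sketch of the normalization pps_normalize_ts() performs on the widened type (names here are illustrative):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000L

struct normtime {
	int64_t sec;            /* 64-bit seconds, as in the patched struct */
	long    nsec;           /* kept in (-NSEC_PER_SEC/2, NSEC_PER_SEC/2] */
};

static struct normtime normalize(int64_t sec, long nsec)
{
	struct normtime n = { .sec = sec, .nsec = nsec };

	if (n.nsec > NSEC_PER_SEC / 2) {        /* fold the upper half second forward */
		n.nsec -= NSEC_PER_SEC;
		n.sec++;
	}
	return n;
}

int main(void)
{
	struct normtime n = normalize(10, 900000000L);

	printf("%lld s %ld ns\n", (long long)n.sec, n.nsec);  /* 11 s -100000000 ns */
	return 0;
}
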
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index 65430504ca26..af924470eac0 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -9,5 +9,5 @@ extern ktime_t ntp_get_next_leap(void);
9extern int second_overflow(unsigned long secs); 9extern int second_overflow(unsigned long secs);
10extern int ntp_validate_timex(struct timex *); 10extern int ntp_validate_timex(struct timex *);
11extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *); 11extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *);
12extern void __hardpps(const struct timespec *, const struct timespec *); 12extern void __hardpps(const struct timespec64 *, const struct timespec64 *);
13#endif /* _LINUX_NTP_INTERNAL_H */ 13#endif /* _LINUX_NTP_INTERNAL_H */
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 892e3dae0aac..f5e86d282d52 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -249,7 +249,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
249 * but barriers are not required because update_gt_cputime() 249 * but barriers are not required because update_gt_cputime()
250 * can handle concurrent updates. 250 * can handle concurrent updates.
251 */ 251 */
252 WRITE_ONCE(cputimer->running, 1); 252 WRITE_ONCE(cputimer->running, true);
253 } 253 }
254 sample_cputime_atomic(times, &cputimer->cputime_atomic); 254 sample_cputime_atomic(times, &cputimer->cputime_atomic);
255} 255}
@@ -864,6 +864,13 @@ static void check_thread_timers(struct task_struct *tsk,
864 unsigned long long expires; 864 unsigned long long expires;
865 unsigned long soft; 865 unsigned long soft;
866 866
867 /*
868 * If cputime_expires is zero, then there are no active
869 * per thread CPU timers.
870 */
871 if (task_cputime_zero(&tsk->cputime_expires))
872 return;
873
867 expires = check_timers_list(timers, firing, prof_ticks(tsk)); 874 expires = check_timers_list(timers, firing, prof_ticks(tsk));
868 tsk_expires->prof_exp = expires_to_cputime(expires); 875 tsk_expires->prof_exp = expires_to_cputime(expires);
869 876
@@ -911,7 +918,7 @@ static inline void stop_process_timers(struct signal_struct *sig)
911 struct thread_group_cputimer *cputimer = &sig->cputimer; 918 struct thread_group_cputimer *cputimer = &sig->cputimer;
912 919
913 /* Turn off cputimer->running. This is done without locking. */ 920 /* Turn off cputimer->running. This is done without locking. */
914 WRITE_ONCE(cputimer->running, 0); 921 WRITE_ONCE(cputimer->running, false);
915} 922}
916 923
917static u32 onecputick; 924static u32 onecputick;
@@ -962,6 +969,19 @@ static void check_process_timers(struct task_struct *tsk,
962 unsigned long soft; 969 unsigned long soft;
963 970
964 /* 971 /*
972 * If cputimer is not running, then there are no active
973 * process wide timers (POSIX 1.b, itimers, RLIMIT_CPU).
974 */
975 if (!READ_ONCE(tsk->signal->cputimer.running))
976 return;
977
978 /*
979 * Signify that a thread is checking for process timers.
980 * Write access to this field is protected by the sighand lock.
981 */
982 sig->cputimer.checking_timer = true;
983
984 /*
965 * Collect the current process totals. 985 * Collect the current process totals.
966 */ 986 */
967 thread_group_cputimer(tsk, &cputime); 987 thread_group_cputimer(tsk, &cputime);
@@ -1015,6 +1035,8 @@ static void check_process_timers(struct task_struct *tsk,
1015 sig->cputime_expires.sched_exp = sched_expires; 1035 sig->cputime_expires.sched_exp = sched_expires;
1016 if (task_cputime_zero(&sig->cputime_expires)) 1036 if (task_cputime_zero(&sig->cputime_expires))
1017 stop_process_timers(sig); 1037 stop_process_timers(sig);
1038
1039 sig->cputimer.checking_timer = false;
1018} 1040}
1019 1041
1020/* 1042/*
@@ -1117,24 +1139,33 @@ static inline int task_cputime_expired(const struct task_cputime *sample,
1117static inline int fastpath_timer_check(struct task_struct *tsk) 1139static inline int fastpath_timer_check(struct task_struct *tsk)
1118{ 1140{
1119 struct signal_struct *sig; 1141 struct signal_struct *sig;
1120 cputime_t utime, stime;
1121
1122 task_cputime(tsk, &utime, &stime);
1123 1142
1124 if (!task_cputime_zero(&tsk->cputime_expires)) { 1143 if (!task_cputime_zero(&tsk->cputime_expires)) {
1125 struct task_cputime task_sample = { 1144 struct task_cputime task_sample;
1126 .utime = utime,
1127 .stime = stime,
1128 .sum_exec_runtime = tsk->se.sum_exec_runtime
1129 };
1130 1145
1146 task_cputime(tsk, &task_sample.utime, &task_sample.stime);
1147 task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime;
1131 if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) 1148 if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
1132 return 1; 1149 return 1;
1133 } 1150 }
1134 1151
1135 sig = tsk->signal; 1152 sig = tsk->signal;
1136 /* Check if cputimer is running. This is accessed without locking. */ 1153 /*
1137 if (READ_ONCE(sig->cputimer.running)) { 1154 * Check if thread group timers expired when the cputimer is
1155 * running and no other thread in the group is already checking
1156 * for thread group cputimers. These fields are read without the
1157 * sighand lock. However, this is fine because this is meant to
1158 * be a fastpath heuristic to determine whether we should try to
1159 * acquire the sighand lock to check/handle timers.
1160 *
1161 * In the worst case scenario, if 'running' or 'checking_timer' gets
1162 * set but the current thread doesn't see the change yet, we'll wait
1163 * until the next thread in the group gets a scheduler interrupt to
1164 * handle the timer. This isn't an issue in practice because these
1165 * types of delays with signals actually getting sent are expected.
1166 */
1167 if (READ_ONCE(sig->cputimer.running) &&
1168 !READ_ONCE(sig->cputimer.checking_timer)) {
1138 struct task_cputime group_sample; 1169 struct task_cputime group_sample;
1139 1170
1140 sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic); 1171 sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic);
@@ -1174,12 +1205,8 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1174 * put them on the firing list. 1205 * put them on the firing list.
1175 */ 1206 */
1176 check_thread_timers(tsk, &firing); 1207 check_thread_timers(tsk, &firing);
1177 /* 1208
1178 * If there are any active process wide timers (POSIX 1.b, itimers, 1209 check_process_timers(tsk, &firing);
1179 * RLIMIT_CPU) cputimer must be running.
1180 */
1181 if (READ_ONCE(tsk->signal->cputimer.running))
1182 check_process_timers(tsk, &firing);
1183 1210
1184 /* 1211 /*
1185 * We must release these locks before taking any timer's lock. 1212 * We must release these locks before taking any timer's lock.
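
The fastpath above now keys off two lockless flags: cputimer.running says process-wide timers exist, and the new checking_timer says another thread is already handling them, so the rest of the group can skip taking sighand. A userspace analogue of that heuristic, with C11 atomics standing in for READ_ONCE()/WRITE_ONCE(); as the comment in the hunk explains, a stale read only delays the slow path by one tick:

#include <stdatomic.h>
#include <stdbool.h>
#include <pthread.h>
#include <stdio.h>

static atomic_bool running;
static atomic_bool checking_timer;
static pthread_mutex_t sighand_lock = PTHREAD_MUTEX_INITIALIZER;

static void check_timers(void)
{
	pthread_mutex_lock(&sighand_lock);      /* slow path, under the lock */
	atomic_store(&checking_timer, true);
	/* ... walk and expire group timers here ... */
	atomic_store(&checking_timer, false);
	pthread_mutex_unlock(&sighand_lock);
}

static void tick(void)
{
	/* fast path: lockless reads; skip if nothing is armed or someone
	 * else in the group is already checking */
	if (atomic_load(&running) && !atomic_load(&checking_timer))
		check_timers();
}

int main(void)
{
	atomic_store(&running, true);
	tick();
	return 0;
}
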
diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc
index c7388dee8635..c48688904f9f 100644
--- a/kernel/time/timeconst.bc
+++ b/kernel/time/timeconst.bc
@@ -39,7 +39,7 @@ define fmuls(b,n,d) {
39} 39}
40 40
41define timeconst(hz) { 41define timeconst(hz) {
42 print "/* Automatically generated by kernel/timeconst.bc */\n" 42 print "/* Automatically generated by kernel/time/timeconst.bc */\n"
43 print "/* Time conversion constants for HZ == ", hz, " */\n" 43 print "/* Time conversion constants for HZ == ", hz, " */\n"
44 print "\n" 44 print "\n"
45 45
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3739ac6aa473..d563c1960302 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -849,7 +849,7 @@ EXPORT_SYMBOL_GPL(ktime_get_real_seconds);
849#ifdef CONFIG_NTP_PPS 849#ifdef CONFIG_NTP_PPS
850 850
851/** 851/**
852 * getnstime_raw_and_real - get day and raw monotonic time in timespec format 852 * ktime_get_raw_and_real_ts64 - get day and raw monotonic time in timespec format
853 * @ts_raw: pointer to the timespec to be set to raw monotonic time 853 * @ts_raw: pointer to the timespec to be set to raw monotonic time
854 * @ts_real: pointer to the timespec to be set to the time of day 854 * @ts_real: pointer to the timespec to be set to the time of day
855 * 855 *
@@ -857,7 +857,7 @@ EXPORT_SYMBOL_GPL(ktime_get_real_seconds);
857 * same time atomically and stores the resulting timestamps in timespec 857 * same time atomically and stores the resulting timestamps in timespec
858 * format. 858 * format.
859 */ 859 */
860void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) 860void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, struct timespec64 *ts_real)
861{ 861{
862 struct timekeeper *tk = &tk_core.timekeeper; 862 struct timekeeper *tk = &tk_core.timekeeper;
863 unsigned long seq; 863 unsigned long seq;
@@ -868,7 +868,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
868 do { 868 do {
869 seq = read_seqcount_begin(&tk_core.seq); 869 seq = read_seqcount_begin(&tk_core.seq);
870 870
871 *ts_raw = timespec64_to_timespec(tk->raw_time); 871 *ts_raw = tk->raw_time;
872 ts_real->tv_sec = tk->xtime_sec; 872 ts_real->tv_sec = tk->xtime_sec;
873 ts_real->tv_nsec = 0; 873 ts_real->tv_nsec = 0;
874 874
@@ -877,10 +877,10 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
877 877
878 } while (read_seqcount_retry(&tk_core.seq, seq)); 878 } while (read_seqcount_retry(&tk_core.seq, seq));
879 879
880 timespec_add_ns(ts_raw, nsecs_raw); 880 timespec64_add_ns(ts_raw, nsecs_raw);
881 timespec_add_ns(ts_real, nsecs_real); 881 timespec64_add_ns(ts_real, nsecs_real);
882} 882}
883EXPORT_SYMBOL(getnstime_raw_and_real); 883EXPORT_SYMBOL(ktime_get_raw_and_real_ts64);
884 884
885#endif /* CONFIG_NTP_PPS */ 885#endif /* CONFIG_NTP_PPS */
886 886
@@ -1251,7 +1251,7 @@ void __init timekeeping_init(void)
1251 set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec); 1251 set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec);
1252 tk_set_wall_to_mono(tk, tmp); 1252 tk_set_wall_to_mono(tk, tmp);
1253 1253
1254 timekeeping_update(tk, TK_MIRROR); 1254 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
1255 1255
1256 write_seqcount_end(&tk_core.seq); 1256 write_seqcount_end(&tk_core.seq);
1257 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1257 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -1614,7 +1614,7 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk,
1614 negative = (tick_error < 0); 1614 negative = (tick_error < 0);
1615 1615
1616 /* Sort out the magnitude of the correction */ 1616 /* Sort out the magnitude of the correction */
1617 tick_error = abs64(tick_error); 1617 tick_error = abs(tick_error);
1618 for (adj = 0; tick_error > interval; adj++) 1618 for (adj = 0; tick_error > interval; adj++)
1619 tick_error >>= 1; 1619 tick_error >>= 1;
1620 1620
@@ -1674,7 +1674,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1674/** 1674/**
1675 * accumulate_nsecs_to_secs - Accumulates nsecs into secs 1675 * accumulate_nsecs_to_secs - Accumulates nsecs into secs
1676 * 1676 *
1677 * Helper function that accumulates a the nsecs greater then a second 1677 * Helper function that accumulates the nsecs greater than a second
1678 * from the xtime_nsec field to the xtime_secs field. 1678 * from the xtime_nsec field to the xtime_secs field.
1679 * It also calls into the NTP code to handle leapsecond processing. 1679 * It also calls into the NTP code to handle leapsecond processing.
1680 * 1680 *
@@ -1726,7 +1726,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1726 cycle_t interval = tk->cycle_interval << shift; 1726 cycle_t interval = tk->cycle_interval << shift;
1727 u64 raw_nsecs; 1727 u64 raw_nsecs;
1728 1728
1729 /* If the offset is smaller then a shifted interval, do nothing */ 1729 /* If the offset is smaller than a shifted interval, do nothing */
1730 if (offset < interval) 1730 if (offset < interval)
1731 return offset; 1731 return offset;
1732 1732
@@ -2025,7 +2025,7 @@ int do_adjtimex(struct timex *txc)
2025/** 2025/**
2026 * hardpps() - Accessor function to NTP __hardpps function 2026 * hardpps() - Accessor function to NTP __hardpps function
2027 */ 2027 */
2028void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) 2028void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
2029{ 2029{
2030 unsigned long flags; 2030 unsigned long flags;
2031 2031
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 84190f02b521..74591ba9474f 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -461,10 +461,17 @@ void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
461 461
462static void timer_stats_account_timer(struct timer_list *timer) 462static void timer_stats_account_timer(struct timer_list *timer)
463{ 463{
464 if (likely(!timer->start_site)) 464 void *site;
465
466 /*
467 * start_site can be concurrently reset by
468 * timer_stats_timer_clear_start_info()
469 */
470 site = READ_ONCE(timer->start_site);
471 if (likely(!site))
465 return; 472 return;
466 473
467 timer_stats_update_stats(timer, timer->start_pid, timer->start_site, 474 timer_stats_update_stats(timer, timer->start_pid, site,
468 timer->function, timer->start_comm, 475 timer->function, timer->start_comm,
469 timer->flags); 476 timer->flags);
470} 477}
@@ -867,7 +874,7 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
867 if (mask == 0) 874 if (mask == 0)
868 return expires; 875 return expires;
869 876
870 bit = find_last_bit(&mask, BITS_PER_LONG); 877 bit = __fls(mask);
871 878
872 mask = (1UL << bit) - 1; 879 mask = (1UL << bit) - 1;
873 880
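
timer_stats_account_timer() above now snapshots timer->start_site once with READ_ONCE(), so the NULL check and the value handed to timer_stats_update_stats() agree even if timer_stats_timer_clear_start_info() clears the field concurrently. A small sketch of that read-once pattern (userspace atomics, illustrative names):

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

static _Atomic(const char *) start_site;   /* may be reset to NULL concurrently */

static void account(void)
{
	const char *site = atomic_load(&start_site);  /* one load, like READ_ONCE() */

	if (!site)
		return;                 /* already cleared: nothing to account */
	printf("accounting for %s\n", site);          /* uses the same snapshot */
}

int main(void)
{
	atomic_store(&start_site, "timer_setup_site");
	account();
	return 0;
}
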
diff --git a/kernel/torture.c b/kernel/torture.c
index 3e4840633d3e..44aa462d033f 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -523,6 +523,7 @@ static int stutter;
523 */ 523 */
524void stutter_wait(const char *title) 524void stutter_wait(const char *title)
525{ 525{
526 cond_resched_rcu_qs();
526 while (READ_ONCE(stutter_pause_test) || 527 while (READ_ONCE(stutter_pause_test) ||
527 (torture_runnable && !READ_ONCE(*torture_runnable))) { 528 (torture_runnable && !READ_ONCE(*torture_runnable))) {
528 if (stutter_pause_test) 529 if (stutter_pause_test)
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8d6363f42169..e45db6b0d878 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -434,7 +434,7 @@ config UPROBE_EVENT
434 434
435config BPF_EVENTS 435config BPF_EVENTS
436 depends on BPF_SYSCALL 436 depends on BPF_SYSCALL
437 depends on KPROBE_EVENT || UPROBE_EVENT 437 depends on (KPROBE_EVENT || UPROBE_EVENT) && PERF_EVENTS
438 bool 438 bool
439 default y 439 default y
440 help 440 help
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b2fcf472774e..a990824c8604 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -437,7 +437,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
437 struct block_device *bdev, 437 struct block_device *bdev,
438 struct blk_user_trace_setup *buts) 438 struct blk_user_trace_setup *buts)
439{ 439{
440 struct blk_trace *old_bt, *bt = NULL; 440 struct blk_trace *bt = NULL;
441 struct dentry *dir = NULL; 441 struct dentry *dir = NULL;
442 int ret; 442 int ret;
443 443
@@ -519,11 +519,8 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
519 bt->trace_state = Blktrace_setup; 519 bt->trace_state = Blktrace_setup;
520 520
521 ret = -EBUSY; 521 ret = -EBUSY;
522 old_bt = xchg(&q->blk_trace, bt); 522 if (cmpxchg(&q->blk_trace, NULL, bt))
523 if (old_bt) {
524 (void) xchg(&q->blk_trace, old_bt);
525 goto err; 523 goto err;
526 }
527 524
528 if (atomic_inc_return(&blk_probes_ref) == 1) 525 if (atomic_inc_return(&blk_probes_ref) == 1)
529 blk_register_tracepoints(); 526 blk_register_tracepoints();
@@ -1482,7 +1479,7 @@ static int blk_trace_remove_queue(struct request_queue *q)
1482static int blk_trace_setup_queue(struct request_queue *q, 1479static int blk_trace_setup_queue(struct request_queue *q,
1483 struct block_device *bdev) 1480 struct block_device *bdev)
1484{ 1481{
1485 struct blk_trace *old_bt, *bt = NULL; 1482 struct blk_trace *bt = NULL;
1486 int ret = -ENOMEM; 1483 int ret = -ENOMEM;
1487 1484
1488 bt = kzalloc(sizeof(*bt), GFP_KERNEL); 1485 bt = kzalloc(sizeof(*bt), GFP_KERNEL);
@@ -1498,12 +1495,9 @@ static int blk_trace_setup_queue(struct request_queue *q,
1498 1495
1499 blk_trace_setup_lba(bt, bdev); 1496 blk_trace_setup_lba(bt, bdev);
1500 1497
1501 old_bt = xchg(&q->blk_trace, bt); 1498 ret = -EBUSY;
1502 if (old_bt != NULL) { 1499 if (cmpxchg(&q->blk_trace, NULL, bt))
1503 (void)xchg(&q->blk_trace, old_bt);
1504 ret = -EBUSY;
1505 goto free_bt; 1500 goto free_bt;
1506 }
1507 1501
1508 if (atomic_inc_return(&blk_probes_ref) == 1) 1502 if (atomic_inc_return(&blk_probes_ref) == 1)
1509 blk_register_tracepoints(); 1503 blk_register_tracepoints();
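
Both blktrace hunks replace the xchg()-then-undo dance with a single cmpxchg(&q->blk_trace, NULL, bt): the trace is installed only if the slot is still empty, and a concurrent setup can no longer see its pointer briefly swapped out and back. A userspace sketch of that install-if-unset pattern using C11 compare-and-swap (the stub type is illustrative):

#include <stdatomic.h>
#include <stdlib.h>
#include <stdio.h>

struct blk_trace_stub { int state; };      /* stand-in for struct blk_trace */

static _Atomic(struct blk_trace_stub *) trace_slot;

static int install(struct blk_trace_stub *bt)
{
	struct blk_trace_stub *expected = NULL;

	/* succeeds only if the slot is still empty, like cmpxchg(..., NULL, bt) */
	if (!atomic_compare_exchange_strong(&trace_slot, &expected, bt))
		return -1;              /* already set up: caller returns -EBUSY */
	return 0;
}

int main(void)
{
	struct blk_trace_stub *bt = calloc(1, sizeof(*bt));

	if (install(bt) == 0)
		printf("installed\n");
	else
		free(bt);               /* somebody else won the race */
	return 0;
}
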
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0fe96c7c8803..4228fd3682c3 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -199,6 +199,11 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
199 if (!event) 199 if (!event)
200 return -ENOENT; 200 return -ENOENT;
201 201
202 /* make sure event is local and doesn't have pmu::count */
203 if (event->oncpu != smp_processor_id() ||
204 event->pmu->count)
205 return -EINVAL;
206
202 /* 207 /*
203 * we don't know if the function is run successfully by the 208 * we don't know if the function is run successfully by the
204 * return value. It can be judged in other places, such as 209 * return value. It can be judged in other places, such as
@@ -207,14 +212,58 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
207 return perf_event_read_local(event); 212 return perf_event_read_local(event);
208} 213}
209 214
210const struct bpf_func_proto bpf_perf_event_read_proto = { 215static const struct bpf_func_proto bpf_perf_event_read_proto = {
211 .func = bpf_perf_event_read, 216 .func = bpf_perf_event_read,
212 .gpl_only = false, 217 .gpl_only = true,
213 .ret_type = RET_INTEGER, 218 .ret_type = RET_INTEGER,
214 .arg1_type = ARG_CONST_MAP_PTR, 219 .arg1_type = ARG_CONST_MAP_PTR,
215 .arg2_type = ARG_ANYTHING, 220 .arg2_type = ARG_ANYTHING,
216}; 221};
217 222
223static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
224{
225 struct pt_regs *regs = (struct pt_regs *) (long) r1;
226 struct bpf_map *map = (struct bpf_map *) (long) r2;
227 struct bpf_array *array = container_of(map, struct bpf_array, map);
228 void *data = (void *) (long) r4;
229 struct perf_sample_data sample_data;
230 struct perf_event *event;
231 struct perf_raw_record raw = {
232 .size = size,
233 .data = data,
234 };
235
236 if (unlikely(index >= array->map.max_entries))
237 return -E2BIG;
238
239 event = (struct perf_event *)array->ptrs[index];
240 if (unlikely(!event))
241 return -ENOENT;
242
243 if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
244 event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
245 return -EINVAL;
246
247 if (unlikely(event->oncpu != smp_processor_id()))
248 return -EOPNOTSUPP;
249
250 perf_sample_data_init(&sample_data, 0, 0);
251 sample_data.raw = &raw;
252 perf_event_output(event, &sample_data, regs);
253 return 0;
254}
255
256static const struct bpf_func_proto bpf_perf_event_output_proto = {
257 .func = bpf_perf_event_output,
258 .gpl_only = true,
259 .ret_type = RET_INTEGER,
260 .arg1_type = ARG_PTR_TO_CTX,
261 .arg2_type = ARG_CONST_MAP_PTR,
262 .arg3_type = ARG_ANYTHING,
263 .arg4_type = ARG_PTR_TO_STACK,
264 .arg5_type = ARG_CONST_STACK_SIZE,
265};
266
218static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) 267static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
219{ 268{
220 switch (func_id) { 269 switch (func_id) {
@@ -242,6 +291,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
242 return &bpf_get_smp_processor_id_proto; 291 return &bpf_get_smp_processor_id_proto;
243 case BPF_FUNC_perf_event_read: 292 case BPF_FUNC_perf_event_read:
244 return &bpf_perf_event_read_proto; 293 return &bpf_perf_event_read_proto;
294 case BPF_FUNC_perf_event_output:
295 return &bpf_perf_event_output_proto;
245 default: 296 default:
246 return NULL; 297 return NULL;
247 } 298 }
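
As a usage sketch (not part of this diff), a tracing program could feed data to user space through the new bpf_perf_event_output() helper roughly the way the samples/bpf tree does. SEC(), struct bpf_map_def and the helper wrappers are assumed to come from samples/bpf/bpf_helpers.h, and the loader plus the per-slot perf fd plumbing are omitted:

#include <linux/ptrace.h>
#include <linux/types.h>
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"          /* SEC(), bpf_map_def, helper wrappers */

struct bpf_map_def SEC("maps") events = {
	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	.key_size = sizeof(int),
	.value_size = sizeof(__u32),
	.max_entries = 2,             /* user space binds a perf fd per slot */
};

SEC("kprobe/sys_write")
int trace_write(struct pt_regs *ctx)
{
	struct {
		__u64 pid;
		__u64 cookie;
	} data = {
		.pid = bpf_get_current_pid_tgid(),
		.cookie = 0x12345678,
	};

	/* emit 'data' as a raw sample on the perf event stored at index 0;
	 * the helper rejects events that are not local software events */
	bpf_perf_event_output(ctx, &events, 0, &data, sizeof(data));
	return 0;
}

char _license[] SEC("license") = "GPL";   /* both new helpers are gpl_only */
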
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ea2725053771..3f743b147247 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5708,7 +5708,7 @@ free:
5708} 5708}
5709 5709
5710static void 5710static void
5711ftrace_graph_probe_sched_switch(void *ignore, 5711ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
5712 struct task_struct *prev, struct task_struct *next) 5712 struct task_struct *prev, struct task_struct *next)
5713{ 5713{
5714 unsigned long long timestamp; 5714 unsigned long long timestamp;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index bee1e1530052..6bbc5f652355 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -506,7 +506,7 @@ check_ignore_pid(struct trace_pid_list *filtered_pids, struct task_struct *task)
506} 506}
507 507
508static void 508static void
509event_filter_pid_sched_switch_probe_pre(void *data, 509event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
510 struct task_struct *prev, struct task_struct *next) 510 struct task_struct *prev, struct task_struct *next)
511{ 511{
512 struct trace_array *tr = data; 512 struct trace_array *tr = data;
@@ -520,7 +520,7 @@ event_filter_pid_sched_switch_probe_pre(void *data,
520} 520}
521 521
522static void 522static void
523event_filter_pid_sched_switch_probe_post(void *data, 523event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
524 struct task_struct *prev, struct task_struct *next) 524 struct task_struct *prev, struct task_struct *next)
525{ 525{
526 struct trace_array *tr = data; 526 struct trace_array *tr = data;
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index f270088e9929..4c896a0101bd 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -16,7 +16,8 @@ static int sched_ref;
16static DEFINE_MUTEX(sched_register_mutex); 16static DEFINE_MUTEX(sched_register_mutex);
17 17
18static void 18static void
19probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) 19probe_sched_switch(void *ignore, bool preempt,
20 struct task_struct *prev, struct task_struct *next)
20{ 21{
21 if (unlikely(!sched_ref)) 22 if (unlikely(!sched_ref))
22 return; 23 return;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 855c2c7612e8..9d4399b553a3 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -424,7 +424,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
424} 424}
425 425
426static void notrace 426static void notrace
427probe_wakeup_sched_switch(void *ignore, 427probe_wakeup_sched_switch(void *ignore, bool preempt,
428 struct task_struct *prev, struct task_struct *next) 428 struct task_struct *prev, struct task_struct *next)
429{ 429{
430 struct trace_array_cpu *data; 430 struct trace_array_cpu *data;
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 0bd212af406c..dda9e6742950 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -91,9 +91,19 @@ check_stack(unsigned long ip, unsigned long *stack)
91 if (!object_is_on_stack(stack)) 91 if (!object_is_on_stack(stack))
92 return; 92 return;
93 93
94 /* Can't do this from NMI context (can cause deadlocks) */
95 if (in_nmi())
96 return;
97
94 local_irq_save(flags); 98 local_irq_save(flags);
95 arch_spin_lock(&stack_trace_max_lock); 99 arch_spin_lock(&stack_trace_max_lock);
96 100
101 /*
102 * RCU may not be watching, make it see us.
103 * The stack trace code uses rcu_sched.
104 */
105 rcu_irq_enter();
106
97 /* In case another CPU set the tracer_frame on us */ 107 /* In case another CPU set the tracer_frame on us */
98 if (unlikely(!frame_size)) 108 if (unlikely(!frame_size))
99 this_size -= tracer_frame; 109 this_size -= tracer_frame;
@@ -175,6 +185,7 @@ check_stack(unsigned long ip, unsigned long *stack)
175 } 185 }
176 186
177 out: 187 out:
188 rcu_irq_exit();
178 arch_spin_unlock(&stack_trace_max_lock); 189 arch_spin_unlock(&stack_trace_max_lock);
179 local_irq_restore(flags); 190 local_irq_restore(flags);
180} 191}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 64ed1c37bd1f..18f34cf75f74 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -57,8 +57,10 @@ int __read_mostly watchdog_thresh = 10;
57 57
58#ifdef CONFIG_SMP 58#ifdef CONFIG_SMP
59int __read_mostly sysctl_softlockup_all_cpu_backtrace; 59int __read_mostly sysctl_softlockup_all_cpu_backtrace;
60int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
60#else 61#else
61#define sysctl_softlockup_all_cpu_backtrace 0 62#define sysctl_softlockup_all_cpu_backtrace 0
63#define sysctl_hardlockup_all_cpu_backtrace 0
62#endif 64#endif
63static struct cpumask watchdog_cpumask __read_mostly; 65static struct cpumask watchdog_cpumask __read_mostly;
64unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); 66unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
@@ -110,8 +112,9 @@ static unsigned long soft_lockup_nmi_warn;
110 * Should we panic when a soft-lockup or hard-lockup occurs: 112 * Should we panic when a soft-lockup or hard-lockup occurs:
111 */ 113 */
112#ifdef CONFIG_HARDLOCKUP_DETECTOR 114#ifdef CONFIG_HARDLOCKUP_DETECTOR
113static int hardlockup_panic = 115unsigned int __read_mostly hardlockup_panic =
114 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; 116 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
117static unsigned long hardlockup_allcpu_dumped;
115/* 118/*
116 * We may not want to enable hard lockup detection by default in all cases, 119 * We may not want to enable hard lockup detection by default in all cases,
117 * for example when running the kernel as a guest on a hypervisor. In these 120 * for example when running the kernel as a guest on a hypervisor. In these
@@ -173,6 +176,13 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str)
173 return 1; 176 return 1;
174} 177}
175__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); 178__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
179static int __init hardlockup_all_cpu_backtrace_setup(char *str)
180{
181 sysctl_hardlockup_all_cpu_backtrace =
182 !!simple_strtol(str, NULL, 0);
183 return 1;
184}
185__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
176#endif 186#endif
177 187
178/* 188/*
@@ -263,15 +273,15 @@ void touch_softlockup_watchdog_sync(void)
263 273
264#ifdef CONFIG_HARDLOCKUP_DETECTOR 274#ifdef CONFIG_HARDLOCKUP_DETECTOR
265/* watchdog detector functions */ 275/* watchdog detector functions */
266static int is_hardlockup(void) 276static bool is_hardlockup(void)
267{ 277{
268 unsigned long hrint = __this_cpu_read(hrtimer_interrupts); 278 unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
269 279
270 if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) 280 if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
271 return 1; 281 return true;
272 282
273 __this_cpu_write(hrtimer_interrupts_saved, hrint); 283 __this_cpu_write(hrtimer_interrupts_saved, hrint);
274 return 0; 284 return false;
275} 285}
276#endif 286#endif
277 287
@@ -279,7 +289,7 @@ static int is_softlockup(unsigned long touch_ts)
279{ 289{
280 unsigned long now = get_timestamp(); 290 unsigned long now = get_timestamp();
281 291
282 if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) { 292 if ((watchdog_enabled & SOFT_WATCHDOG_ENABLED) && watchdog_thresh){
283 /* Warn about unreasonable delays. */ 293 /* Warn about unreasonable delays. */
284 if (time_after(now, touch_ts + get_softlockup_thresh())) 294 if (time_after(now, touch_ts + get_softlockup_thresh()))
285 return now - touch_ts; 295 return now - touch_ts;
@@ -318,17 +328,30 @@ static void watchdog_overflow_callback(struct perf_event *event,
318 */ 328 */
319 if (is_hardlockup()) { 329 if (is_hardlockup()) {
320 int this_cpu = smp_processor_id(); 330 int this_cpu = smp_processor_id();
331 struct pt_regs *regs = get_irq_regs();
321 332
322 /* only print hardlockups once */ 333 /* only print hardlockups once */
323 if (__this_cpu_read(hard_watchdog_warn) == true) 334 if (__this_cpu_read(hard_watchdog_warn) == true)
324 return; 335 return;
325 336
326 if (hardlockup_panic) 337 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
327 panic("Watchdog detected hard LOCKUP on cpu %d", 338 print_modules();
328 this_cpu); 339 print_irqtrace_events(current);
340 if (regs)
341 show_regs(regs);
329 else 342 else
330 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", 343 dump_stack();
331 this_cpu); 344
345 /*
346 * Perform all-CPU dump only once to avoid multiple hardlockups
347 * generating interleaving traces
348 */
349 if (sysctl_hardlockup_all_cpu_backtrace &&
350 !test_and_set_bit(0, &hardlockup_allcpu_dumped))
351 trigger_allbutself_cpu_backtrace();
352
353 if (hardlockup_panic)
354 panic("Hard LOCKUP");
332 355
333 __this_cpu_write(hard_watchdog_warn, true); 356 __this_cpu_write(hard_watchdog_warn, true);
334 return; 357 return;
@@ -347,6 +370,9 @@ static void watchdog_interrupt_count(void)
347static int watchdog_nmi_enable(unsigned int cpu); 370static int watchdog_nmi_enable(unsigned int cpu);
348static void watchdog_nmi_disable(unsigned int cpu); 371static void watchdog_nmi_disable(unsigned int cpu);
349 372
373static int watchdog_enable_all_cpus(void);
374static void watchdog_disable_all_cpus(void);
375
350/* watchdog kicker functions */ 376/* watchdog kicker functions */
351static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) 377static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
352{ 378{
@@ -651,37 +677,41 @@ static struct smp_hotplug_thread watchdog_threads = {
651 677
652/* 678/*
653 * park all watchdog threads that are specified in 'watchdog_cpumask' 679 * park all watchdog threads that are specified in 'watchdog_cpumask'
680 *
681 * This function returns an error if kthread_park() of a watchdog thread
682 * fails. In this situation, the watchdog threads of some CPUs can already
683 * be parked and the watchdog threads of other CPUs can still be runnable.
684 * Callers are expected to handle this special condition as appropriate in
685 * their context.
686 *
687 * This function may only be called in a context that is protected against
688 * races with CPU hotplug - for example, via get_online_cpus().
654 */ 689 */
655static int watchdog_park_threads(void) 690static int watchdog_park_threads(void)
656{ 691{
657 int cpu, ret = 0; 692 int cpu, ret = 0;
658 693
659 get_online_cpus();
660 for_each_watchdog_cpu(cpu) { 694 for_each_watchdog_cpu(cpu) {
661 ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); 695 ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
662 if (ret) 696 if (ret)
663 break; 697 break;
664 } 698 }
665 if (ret) {
666 for_each_watchdog_cpu(cpu)
667 kthread_unpark(per_cpu(softlockup_watchdog, cpu));
668 }
669 put_online_cpus();
670 699
671 return ret; 700 return ret;
672} 701}
673 702
674/* 703/*
675 * unpark all watchdog threads that are specified in 'watchdog_cpumask' 704 * unpark all watchdog threads that are specified in 'watchdog_cpumask'
705 *
706 * This function may only be called in a context that is protected against
707 * races with CPU hotplug - for example, via get_online_cpus().
676 */ 708 */
677static void watchdog_unpark_threads(void) 709static void watchdog_unpark_threads(void)
678{ 710{
679 int cpu; 711 int cpu;
680 712
681 get_online_cpus();
682 for_each_watchdog_cpu(cpu) 713 for_each_watchdog_cpu(cpu)
683 kthread_unpark(per_cpu(softlockup_watchdog, cpu)); 714 kthread_unpark(per_cpu(softlockup_watchdog, cpu));
684 put_online_cpus();
685} 715}
686 716
687/* 717/*
@@ -691,6 +721,7 @@ int lockup_detector_suspend(void)
691{ 721{
692 int ret = 0; 722 int ret = 0;
693 723
724 get_online_cpus();
694 mutex_lock(&watchdog_proc_mutex); 725 mutex_lock(&watchdog_proc_mutex);
695 /* 726 /*
696 * Multiple suspend requests can be active in parallel (counted by 727 * Multiple suspend requests can be active in parallel (counted by
@@ -704,6 +735,11 @@ int lockup_detector_suspend(void)
704 735
705 if (ret == 0) 736 if (ret == 0)
706 watchdog_suspended++; 737 watchdog_suspended++;
738 else {
739 watchdog_disable_all_cpus();
740 pr_err("Failed to suspend lockup detectors, disabled\n");
741 watchdog_enabled = 0;
742 }
707 743
708 mutex_unlock(&watchdog_proc_mutex); 744 mutex_unlock(&watchdog_proc_mutex);
709 745
@@ -726,12 +762,20 @@ void lockup_detector_resume(void)
726 watchdog_unpark_threads(); 762 watchdog_unpark_threads();
727 763
728 mutex_unlock(&watchdog_proc_mutex); 764 mutex_unlock(&watchdog_proc_mutex);
765 put_online_cpus();
729} 766}
730 767
731static void update_watchdog_all_cpus(void) 768static int update_watchdog_all_cpus(void)
732{ 769{
733 watchdog_park_threads(); 770 int ret;
771
772 ret = watchdog_park_threads();
773 if (ret)
774 return ret;
775
734 watchdog_unpark_threads(); 776 watchdog_unpark_threads();
777
778 return 0;
735} 779}
736 780
737static int watchdog_enable_all_cpus(void) 781static int watchdog_enable_all_cpus(void)
@@ -750,15 +794,20 @@ static int watchdog_enable_all_cpus(void)
750 * Enable/disable the lockup detectors or 794 * Enable/disable the lockup detectors or
751 * change the sample period 'on the fly'. 795 * change the sample period 'on the fly'.
752 */ 796 */
753 update_watchdog_all_cpus(); 797 err = update_watchdog_all_cpus();
798
799 if (err) {
800 watchdog_disable_all_cpus();
801 pr_err("Failed to update lockup detectors, disabled\n");
802 }
754 } 803 }
755 804
805 if (err)
806 watchdog_enabled = 0;
807
756 return err; 808 return err;
757} 809}
758 810
759/* prepare/enable/disable routines */
760/* sysctl functions */
761#ifdef CONFIG_SYSCTL
762static void watchdog_disable_all_cpus(void) 811static void watchdog_disable_all_cpus(void)
763{ 812{
764 if (watchdog_running) { 813 if (watchdog_running) {
@@ -767,6 +816,8 @@ static void watchdog_disable_all_cpus(void)
767 } 816 }
768} 817}
769 818
819#ifdef CONFIG_SYSCTL
820
770/* 821/*
771 * Update the run state of the lockup detectors. 822 * Update the run state of the lockup detectors.
772 */ 823 */
@@ -808,6 +859,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
808 int err, old, new; 859 int err, old, new;
809 int *watchdog_param = (int *)table->data; 860 int *watchdog_param = (int *)table->data;
810 861
862 get_online_cpus();
811 mutex_lock(&watchdog_proc_mutex); 863 mutex_lock(&watchdog_proc_mutex);
812 864
813 if (watchdog_suspended) { 865 if (watchdog_suspended) {
@@ -849,15 +901,17 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
849 } while (cmpxchg(&watchdog_enabled, old, new) != old); 901 } while (cmpxchg(&watchdog_enabled, old, new) != old);
850 902
851 /* 903 /*
852 * Update the run state of the lockup detectors. 904 * Update the run state of the lockup detectors. There is _no_
853 * Restore 'watchdog_enabled' on failure. 905 * need to check the value returned by proc_watchdog_update()
906 * and to restore the previous value of 'watchdog_enabled' as
907 * both lockup detectors are disabled if proc_watchdog_update()
908 * returns an error.
854 */ 909 */
855 err = proc_watchdog_update(); 910 err = proc_watchdog_update();
856 if (err)
857 watchdog_enabled = old;
858 } 911 }
859out: 912out:
860 mutex_unlock(&watchdog_proc_mutex); 913 mutex_unlock(&watchdog_proc_mutex);
914 put_online_cpus();
861 return err; 915 return err;
862} 916}
863 917
@@ -899,6 +953,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
899{ 953{
900 int err, old; 954 int err, old;
901 955
956 get_online_cpus();
902 mutex_lock(&watchdog_proc_mutex); 957 mutex_lock(&watchdog_proc_mutex);
903 958
904 if (watchdog_suspended) { 959 if (watchdog_suspended) {
@@ -914,15 +969,17 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
914 goto out; 969 goto out;
915 970
916 /* 971 /*
917 * Update the sample period. 972 * Update the sample period. Restore on failure.
918 * Restore 'watchdog_thresh' on failure.
919 */ 973 */
920 set_sample_period(); 974 set_sample_period();
921 err = proc_watchdog_update(); 975 err = proc_watchdog_update();
922 if (err) 976 if (err) {
923 watchdog_thresh = old; 977 watchdog_thresh = old;
978 set_sample_period();
979 }
924out: 980out:
925 mutex_unlock(&watchdog_proc_mutex); 981 mutex_unlock(&watchdog_proc_mutex);
982 put_online_cpus();
926 return err; 983 return err;
927} 984}
928 985
@@ -937,6 +994,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
937{ 994{
938 int err; 995 int err;
939 996
997 get_online_cpus();
940 mutex_lock(&watchdog_proc_mutex); 998 mutex_lock(&watchdog_proc_mutex);
941 999
942 if (watchdog_suspended) { 1000 if (watchdog_suspended) {
@@ -964,6 +1022,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
964 } 1022 }
965out: 1023out:
966 mutex_unlock(&watchdog_proc_mutex); 1024 mutex_unlock(&watchdog_proc_mutex);
1025 put_online_cpus();
967 return err; 1026 return err;
968} 1027}
969 1028
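
In the watchdog hunks, the first CPU that detects a hard lockup now prints the full report and, if hardlockup_all_cpu_backtrace is set, wins a test_and_set_bit() race so the all-CPU backtrace is dumped exactly once instead of interleaving with later detections. A compact userspace sketch of that dump-once gate:

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag allcpu_dumped = ATOMIC_FLAG_INIT;

static void on_hard_lockup(int cpu)
{
	fprintf(stderr, "Watchdog detected hard LOCKUP on cpu %d\n", cpu);

	/* only the first detector triggers the expensive all-CPU dump */
	if (!atomic_flag_test_and_set(&allcpu_dumped))
		fprintf(stderr, "dumping backtraces on all other CPUs\n");
}

int main(void)
{
	on_hard_lockup(1);
	on_hard_lockup(3);      /* second lockup: no duplicate dump */
	return 0;
}
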
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ca71582fcfab..c579dbab2e36 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1458,13 +1458,13 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
1458 timer_stats_timer_set_start_info(&dwork->timer); 1458 timer_stats_timer_set_start_info(&dwork->timer);
1459 1459
1460 dwork->wq = wq; 1460 dwork->wq = wq;
1461 /* timer isn't guaranteed to run in this cpu, record earlier */
1462 if (cpu == WORK_CPU_UNBOUND)
1463 cpu = raw_smp_processor_id();
1461 dwork->cpu = cpu; 1464 dwork->cpu = cpu;
1462 timer->expires = jiffies + delay; 1465 timer->expires = jiffies + delay;
1463 1466
1464 if (unlikely(cpu != WORK_CPU_UNBOUND)) 1467 add_timer_on(timer, cpu);
1465 add_timer_on(timer, cpu);
1466 else
1467 add_timer(timer);
1468} 1468}
1469 1469
1470/** 1470/**
@@ -3199,6 +3199,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
3199 u32 hash = wqattrs_hash(attrs); 3199 u32 hash = wqattrs_hash(attrs);
3200 struct worker_pool *pool; 3200 struct worker_pool *pool;
3201 int node; 3201 int node;
3202 int target_node = NUMA_NO_NODE;
3202 3203
3203 lockdep_assert_held(&wq_pool_mutex); 3204 lockdep_assert_held(&wq_pool_mutex);
3204 3205
@@ -3210,13 +3211,25 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
3210 } 3211 }
3211 } 3212 }
3212 3213
3214 /* if cpumask is contained inside a NUMA node, we belong to that node */
3215 if (wq_numa_enabled) {
3216 for_each_node(node) {
3217 if (cpumask_subset(attrs->cpumask,
3218 wq_numa_possible_cpumask[node])) {
3219 target_node = node;
3220 break;
3221 }
3222 }
3223 }
3224
3213 /* nope, create a new one */ 3225 /* nope, create a new one */
3214 pool = kzalloc(sizeof(*pool), GFP_KERNEL); 3226 pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_node);
3215 if (!pool || init_worker_pool(pool) < 0) 3227 if (!pool || init_worker_pool(pool) < 0)
3216 goto fail; 3228 goto fail;
3217 3229
3218 lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ 3230 lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */
3219 copy_workqueue_attrs(pool->attrs, attrs); 3231 copy_workqueue_attrs(pool->attrs, attrs);
3232 pool->node = target_node;
3220 3233
3221 /* 3234 /*
3222 * no_numa isn't a worker_pool attribute, always clear it. See 3235 * no_numa isn't a worker_pool attribute, always clear it. See
@@ -3224,17 +3237,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
3224 */ 3237 */
3225 pool->attrs->no_numa = false; 3238 pool->attrs->no_numa = false;
3226 3239
3227 /* if cpumask is contained inside a NUMA node, we belong to that node */
3228 if (wq_numa_enabled) {
3229 for_each_node(node) {
3230 if (cpumask_subset(pool->attrs->cpumask,
3231 wq_numa_possible_cpumask[node])) {
3232 pool->node = node;
3233 break;
3234 }
3235 }
3236 }
3237
3238 if (worker_pool_assign_id(pool) < 0) 3240 if (worker_pool_assign_id(pool) < 0)
3239 goto fail; 3241 goto fail;
3240 3242
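
get_unbound_pool() above now works out which NUMA node the attrs' cpumask maps to before allocating, so the worker_pool itself is placed on that node via kzalloc_node(). A rough userspace analogue of "pick the home node, then allocate there", assuming libnuma is available (numa_node_of_cpu(), numa_alloc_onnode(); link with -lnuma):

#include <numa.h>
#include <stdio.h>

struct pool { int node; /* ... bookkeeping ... */ };

int main(void)
{
	int node;
	struct pool *p;

	if (numa_available() < 0) {
		fprintf(stderr, "no NUMA support\n");
		return 1;
	}

	node = numa_node_of_cpu(0);             /* node serving CPU 0 */
	p = numa_alloc_onnode(sizeof(*p), node);
	if (!p)
		return 1;

	p->node = node;                         /* remember the home node */
	printf("pool allocated on node %d\n", node);
	numa_free(p, sizeof(*p));
	return 0;
}
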