author     Mark Brown <broonie@kernel.org>   2015-10-12 13:09:27 -0400
committer  Mark Brown <broonie@kernel.org>   2015-10-12 13:09:27 -0400
commit     79828b4fa835f73cdaf4bffa48696abdcbea9d02 (patch)
tree       5e0fa7156acb75ba603022bc807df8f2fedb97a8 /kernel
parent     721b51fcf91898299d96f4b72cb9434cda29dce6 (diff)
parent     8c1a9d6323abf0fb1e5dad96cf3f1c783505ea5a (diff)
Merge remote-tracking branch 'asoc/fix/rt5645' into asoc-fix-rt5645
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 105
-rw-r--r--  kernel/audit.c | 2
-rw-r--r--  kernel/audit.h | 18
-rw-r--r--  kernel/audit_fsnotify.c | 216
-rw-r--r--  kernel/audit_tree.c | 2
-rw-r--r--  kernel/audit_watch.c | 56
-rw-r--r--  kernel/auditfilter.c | 83
-rw-r--r--  kernel/auditsc.c | 9
-rw-r--r--  kernel/bpf/arraymap.c | 137
-rw-r--r--  kernel/bpf/core.c | 9
-rw-r--r--  kernel/bpf/syscall.c | 14
-rw-r--r--  kernel/bpf/verifier.c | 58
-rw-r--r--  kernel/cgroup.c | 131
-rw-r--r--  kernel/cgroup_freezer.c | 2
-rw-r--r--  kernel/cgroup_pids.c | 355
-rw-r--r--  kernel/cpu.c | 61
-rw-r--r--  kernel/cpu_pm.c | 2
-rw-r--r--  kernel/cpuset.c | 2
-rw-r--r--  kernel/cred.c | 13
-rw-r--r--  kernel/events/core.c | 280
-rw-r--r--  kernel/events/ring_buffer.c | 15
-rw-r--r--  kernel/events/uprobes.c | 228
-rw-r--r--  kernel/exit.c | 2
-rw-r--r--  kernel/extable.c | 1
-rw-r--r--  kernel/fork.c | 70
-rw-r--r--  kernel/futex.c | 100
-rw-r--r--  kernel/irq/chip.c | 43
-rw-r--r--  kernel/irq/generic-chip.c | 6
-rw-r--r--  kernel/irq/handle.c | 4
-rw-r--r--  kernel/irq/internals.h | 11
-rw-r--r--  kernel/irq/irqdesc.c | 2
-rw-r--r--  kernel/irq/irqdomain.c | 18
-rw-r--r--  kernel/irq/manage.c | 64
-rw-r--r--  kernel/irq/msi.c | 17
-rw-r--r--  kernel/irq/pm.c | 12
-rw-r--r--  kernel/irq/resend.c | 22
-rw-r--r--  kernel/irq/spurious.c | 26
-rw-r--r--  kernel/jump_label.c | 158
-rw-r--r--  kernel/kexec.c | 2531
-rw-r--r--  kernel/kexec_core.c | 1534
-rw-r--r--  kernel/kexec_file.c | 1045
-rw-r--r--  kernel/kexec_internal.h | 22
-rw-r--r--  kernel/kmod.c | 100
-rw-r--r--  kernel/kprobes.c | 2
-rw-r--r--  kernel/ksysfs.c | 6
-rw-r--r--  kernel/kthread.c | 31
-rw-r--r--  kernel/livepatch/core.c | 6
-rw-r--r--  kernel/locking/Makefile | 4
-rw-r--r--  kernel/locking/percpu-rwsem.c | 13
-rw-r--r--  kernel/locking/qrwlock.c | 47
-rw-r--r--  kernel/locking/qspinlock.c | 6
-rw-r--r--  kernel/locking/qspinlock_paravirt.h | 101
-rw-r--r--  kernel/locking/rtmutex-tester.c | 420
-rw-r--r--  kernel/locking/rtmutex.c | 2
-rw-r--r--  kernel/locking/rtmutex_common.h | 22
-rw-r--r--  kernel/membarrier.c | 66
-rw-r--r--  kernel/memremap.c | 190
-rw-r--r--  kernel/module.c | 8
-rw-r--r--  kernel/module_signing.c | 213
-rw-r--r--  kernel/notifier.c | 2
-rw-r--r--  kernel/pid.c | 5
-rw-r--r--  kernel/power/Kconfig | 10
-rw-r--r--  kernel/power/suspend.c | 2
-rw-r--r--  kernel/power/swap.c | 12
-rw-r--r--  kernel/power/wakelock.c | 18
-rw-r--r--  kernel/printk/printk.c | 2
-rw-r--r--  kernel/profile.c | 8
-rw-r--r--  kernel/ptrace.c | 13
-rw-r--r--  kernel/rcu/rcutorture.c | 42
-rw-r--r--  kernel/rcu/srcu.c | 15
-rw-r--r--  kernel/rcu/tiny.c | 8
-rw-r--r--  kernel/rcu/tree.c | 681
-rw-r--r--  kernel/rcu/tree.h | 96
-rw-r--r--  kernel/rcu/tree_plugin.h | 130
-rw-r--r--  kernel/rcu/tree_trace.c | 19
-rw-r--r--  kernel/rcu/update.c | 90
-rw-r--r--  kernel/reboot.c | 2
-rw-r--r--  kernel/resource.c | 61
-rw-r--r--  kernel/sched/core.c | 127
-rw-r--r--  kernel/sched/cputime.c | 101
-rw-r--r--  kernel/sched/deadline.c | 40
-rw-r--r--  kernel/sched/debug.c | 48
-rw-r--r--  kernel/sched/fair.c | 939
-rw-r--r--  kernel/sched/features.h | 18
-rw-r--r--  kernel/sched/idle.c | 14
-rw-r--r--  kernel/sched/idle_task.c | 1
-rw-r--r--  kernel/sched/rt.c | 42
-rw-r--r--  kernel/sched/sched.h | 39
-rw-r--r--  kernel/sched/stop_task.c | 1
-rw-r--r--  kernel/sched/wait.c | 7
-rw-r--r--  kernel/seccomp.c | 17
-rw-r--r--  kernel/signal.c | 13
-rw-r--r--  kernel/smpboot.c | 27
-rw-r--r--  kernel/stop_machine.c | 44
-rw-r--r--  kernel/sys.c | 3
-rw-r--r--  kernel/sys_ni.c | 5
-rw-r--r--  kernel/sysctl.c | 12
-rw-r--r--  kernel/system_certificates.S | 20
-rw-r--r--  kernel/system_keyring.c | 106
-rw-r--r--  kernel/task_work.c | 12
-rw-r--r--  kernel/time/Kconfig | 2
-rw-r--r--  kernel/time/hrtimer.c | 36
-rw-r--r--  kernel/time/ntp.c | 5
-rw-r--r--  kernel/time/tick-broadcast-hrtimer.c | 49
-rw-r--r--  kernel/time/tick-broadcast.c | 1
-rw-r--r--  kernel/time/tick-common.c | 4
-rw-r--r--  kernel/time/tick-sched.c | 72
-rw-r--r--  kernel/time/time.c | 53
-rw-r--r--  kernel/time/timekeeping.c | 19
-rw-r--r--  kernel/time/timer.c | 4
-rw-r--r--  kernel/time/timer_list.c | 2
-rw-r--r--  kernel/trace/Kconfig | 2
-rw-r--r--  kernel/trace/blktrace.c | 10
-rw-r--r--  kernel/trace/bpf_trace.c | 63
-rw-r--r--  kernel/trace/ftrace.c | 61
-rw-r--r--  kernel/trace/ring_buffer.c | 764
-rw-r--r--  kernel/trace/trace.c | 4
-rw-r--r--  kernel/trace/trace.h | 1
-rw-r--r--  kernel/trace/trace_branch.c | 17
-rw-r--r--  kernel/trace/trace_events.c | 25
-rw-r--r--  kernel/trace/trace_events_filter.c | 54
-rw-r--r--  kernel/trace/trace_functions_graph.c | 4
-rw-r--r--  kernel/trace/trace_kprobe.c | 20
-rw-r--r--  kernel/trace/trace_output.c | 4
-rw-r--r--  kernel/trace/trace_sched_switch.c | 2
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 2
-rw-r--r--  kernel/trace/trace_stack.c | 68
-rw-r--r--  kernel/trace/trace_uprobe.c | 22
-rw-r--r--  kernel/user_namespace.c | 5
-rw-r--r--  kernel/watchdog.c | 189
-rw-r--r--  kernel/workqueue.c | 28
131 files changed, 7153 insertions, 6014 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 43c4c920f30a..53abf008ecb3 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -45,16 +45,18 @@ ifneq ($(CONFIG_SMP),y)
45obj-y += up.o 45obj-y += up.o
46endif 46endif
47obj-$(CONFIG_UID16) += uid16.o 47obj-$(CONFIG_UID16) += uid16.o
48obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o
49obj-$(CONFIG_MODULES) += module.o 48obj-$(CONFIG_MODULES) += module.o
50obj-$(CONFIG_MODULE_SIG) += module_signing.o 49obj-$(CONFIG_MODULE_SIG) += module_signing.o
51obj-$(CONFIG_KALLSYMS) += kallsyms.o 50obj-$(CONFIG_KALLSYMS) += kallsyms.o
52obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 51obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
52obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
53obj-$(CONFIG_KEXEC) += kexec.o 53obj-$(CONFIG_KEXEC) += kexec.o
54obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
54obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o 55obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
55obj-$(CONFIG_COMPAT) += compat.o 56obj-$(CONFIG_COMPAT) += compat.o
56obj-$(CONFIG_CGROUPS) += cgroup.o 57obj-$(CONFIG_CGROUPS) += cgroup.o
57obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o 58obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
59obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o
58obj-$(CONFIG_CPUSETS) += cpuset.o 60obj-$(CONFIG_CPUSETS) += cpuset.o
59obj-$(CONFIG_UTS_NS) += utsname.o 61obj-$(CONFIG_UTS_NS) += utsname.o
60obj-$(CONFIG_USER_NS) += user_namespace.o 62obj-$(CONFIG_USER_NS) += user_namespace.o
@@ -64,7 +66,7 @@ obj-$(CONFIG_SMP) += stop_machine.o
64obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 66obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
65obj-$(CONFIG_AUDIT) += audit.o auditfilter.o 67obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
66obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 68obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
67obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o 69obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o
68obj-$(CONFIG_AUDIT_TREE) += audit_tree.o 70obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
69obj-$(CONFIG_GCOV_KERNEL) += gcov/ 71obj-$(CONFIG_GCOV_KERNEL) += gcov/
70obj-$(CONFIG_KPROBES) += kprobes.o 72obj-$(CONFIG_KPROBES) += kprobes.o
@@ -98,6 +100,9 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
98obj-$(CONFIG_JUMP_LABEL) += jump_label.o 100obj-$(CONFIG_JUMP_LABEL) += jump_label.o
99obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o 101obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
100obj-$(CONFIG_TORTURE_TEST) += torture.o 102obj-$(CONFIG_TORTURE_TEST) += torture.o
103obj-$(CONFIG_MEMBARRIER) += membarrier.o
104
105obj-$(CONFIG_HAS_IOMEM) += memremap.o
101 106
102$(obj)/configs.o: $(obj)/config_data.h 107$(obj)/configs.o: $(obj)/config_data.h
103 108
@@ -111,99 +116,3 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
111targets += config_data.h 116targets += config_data.h
112$(obj)/config_data.h: $(obj)/config_data.gz FORCE 117$(obj)/config_data.h: $(obj)/config_data.gz FORCE
113 $(call filechk,ikconfiggz) 118 $(call filechk,ikconfiggz)
114
115###############################################################################
116#
117# Roll all the X.509 certificates that we can find together and pull them into
118# the kernel so that they get loaded into the system trusted keyring during
119# boot.
120#
121# We look in the source root and the build root for all files whose name ends
122# in ".x509". Unfortunately, this will generate duplicate filenames, so we
123# have make canonicalise the pathnames and then sort them to discard the
124# duplicates.
125#
126###############################################################################
127ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y)
128X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509)
129X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509
130X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \
131 $(or $(realpath $(CERT)),$(CERT))))
132X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw))
133
134ifeq ($(X509_CERTIFICATES),)
135$(warning *** No X.509 certificates found ***)
136endif
137
138ifneq ($(wildcard $(obj)/.x509.list),)
139ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES))
140$(warning X.509 certificate list changed to "$(X509_CERTIFICATES)" from "$(shell cat $(obj)/.x509.list)")
141$(shell rm $(obj)/.x509.list)
142endif
143endif
144
145kernel/system_certificates.o: $(obj)/x509_certificate_list
146
147quiet_cmd_x509certs = CERTS $@
148 cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; $(kecho) " - Including cert $(X509)")
149
150targets += $(obj)/x509_certificate_list
151$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
152 $(call if_changed,x509certs)
153
154targets += $(obj)/.x509.list
155$(obj)/.x509.list:
156 @echo $(X509_CERTIFICATES) >$@
157endif
158
159clean-files := x509_certificate_list .x509.list
160
161ifeq ($(CONFIG_MODULE_SIG),y)
162###############################################################################
163#
164# If module signing is requested, say by allyesconfig, but a key has not been
165# supplied, then one will need to be generated to make sure the build does not
166# fail and that the kernel may be used afterwards.
167#
168###############################################################################
169ifndef CONFIG_MODULE_SIG_HASH
170$(error Could not determine digest type to use from kernel config)
171endif
172
173signing_key.priv signing_key.x509: x509.genkey
174 @echo "###"
175 @echo "### Now generating an X.509 key pair to be used for signing modules."
176 @echo "###"
177 @echo "### If this takes a long time, you might wish to run rngd in the"
178 @echo "### background to keep the supply of entropy topped up. It"
179 @echo "### needs to be run as root, and uses a hardware random"
180 @echo "### number generator if one is available."
181 @echo "###"
182 openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
183 -batch -x509 -config x509.genkey \
184 -outform DER -out signing_key.x509 \
185 -keyout signing_key.priv 2>&1
186 @echo "###"
187 @echo "### Key pair generated."
188 @echo "###"
189
190x509.genkey:
191 @echo Generating X.509 key generation config
192 @echo >x509.genkey "[ req ]"
193 @echo >>x509.genkey "default_bits = 4096"
194 @echo >>x509.genkey "distinguished_name = req_distinguished_name"
195 @echo >>x509.genkey "prompt = no"
196 @echo >>x509.genkey "string_mask = utf8only"
197 @echo >>x509.genkey "x509_extensions = myexts"
198 @echo >>x509.genkey
199 @echo >>x509.genkey "[ req_distinguished_name ]"
200 @echo >>x509.genkey "#O = Unspecified company"
201 @echo >>x509.genkey "CN = Build time autogenerated kernel key"
202 @echo >>x509.genkey "#emailAddress = unspecified.user@unspecified.company"
203 @echo >>x509.genkey
204 @echo >>x509.genkey "[ myexts ]"
205 @echo >>x509.genkey "basicConstraints=critical,CA:FALSE"
206 @echo >>x509.genkey "keyUsage=digitalSignature"
207 @echo >>x509.genkey "subjectKeyIdentifier=hash"
208 @echo >>x509.genkey "authorityKeyIdentifier=keyid"
209endif
diff --git a/kernel/audit.c b/kernel/audit.c
index f9e6065346db..662c007635fb 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1761,7 +1761,7 @@ void audit_log_name(struct audit_context *context, struct audit_names *n,
1761 } else 1761 } else
1762 audit_log_format(ab, " name=(null)"); 1762 audit_log_format(ab, " name=(null)");
1763 1763
1764 if (n->ino != (unsigned long)-1) 1764 if (n->ino != AUDIT_INO_UNSET)
1765 audit_log_format(ab, " inode=%lu" 1765 audit_log_format(ab, " inode=%lu"
1766 " dev=%02x:%02x mode=%#ho" 1766 " dev=%02x:%02x mode=%#ho"
1767 " ouid=%u ogid=%u rdev=%02x:%02x", 1767 " ouid=%u ogid=%u rdev=%02x:%02x",
diff --git a/kernel/audit.h b/kernel/audit.h
index d641f9bb3ed0..dadf86a0e59e 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -50,6 +50,7 @@ enum audit_state {
50 50
51/* Rule lists */ 51/* Rule lists */
52struct audit_watch; 52struct audit_watch;
53struct audit_fsnotify_mark;
53struct audit_tree; 54struct audit_tree;
54struct audit_chunk; 55struct audit_chunk;
55 56
@@ -252,6 +253,7 @@ struct audit_net {
252extern int selinux_audit_rule_update(void); 253extern int selinux_audit_rule_update(void);
253 254
254extern struct mutex audit_filter_mutex; 255extern struct mutex audit_filter_mutex;
256extern int audit_del_rule(struct audit_entry *);
255extern void audit_free_rule_rcu(struct rcu_head *); 257extern void audit_free_rule_rcu(struct rcu_head *);
256extern struct list_head audit_filter_list[]; 258extern struct list_head audit_filter_list[];
257 259
@@ -269,6 +271,15 @@ extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
269extern void audit_remove_watch_rule(struct audit_krule *krule); 271extern void audit_remove_watch_rule(struct audit_krule *krule);
270extern char *audit_watch_path(struct audit_watch *watch); 272extern char *audit_watch_path(struct audit_watch *watch);
271extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev); 273extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev);
274
275extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len);
276extern char *audit_mark_path(struct audit_fsnotify_mark *mark);
277extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark);
278extern void audit_remove_mark_rule(struct audit_krule *krule);
279extern int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev);
280extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old);
281extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark);
282
272#else 283#else
273#define audit_put_watch(w) {} 284#define audit_put_watch(w) {}
274#define audit_get_watch(w) {} 285#define audit_get_watch(w) {}
@@ -278,6 +289,13 @@ extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev
278#define audit_watch_path(w) "" 289#define audit_watch_path(w) ""
279#define audit_watch_compare(w, i, d) 0 290#define audit_watch_compare(w, i, d) 0
280 291
292#define audit_alloc_mark(k, p, l) (ERR_PTR(-EINVAL))
293#define audit_mark_path(m) ""
294#define audit_remove_mark(m)
295#define audit_remove_mark_rule(k)
296#define audit_mark_compare(m, i, d) 0
297#define audit_exe_compare(t, m) (-EINVAL)
298#define audit_dupe_exe(n, o) (-EINVAL)
281#endif /* CONFIG_AUDIT_WATCH */ 299#endif /* CONFIG_AUDIT_WATCH */
282 300
283#ifdef CONFIG_AUDIT_TREE 301#ifdef CONFIG_AUDIT_TREE
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
new file mode 100644
index 000000000000..27c6046c2c3d
--- /dev/null
+++ b/kernel/audit_fsnotify.c
@@ -0,0 +1,216 @@
1/* audit_fsnotify.c -- tracking inodes
2 *
3 * Copyright 2003-2009,2014-2015 Red Hat, Inc.
4 * Copyright 2005 Hewlett-Packard Development Company, L.P.
5 * Copyright 2005 IBM Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 */
17
18#include <linux/kernel.h>
19#include <linux/audit.h>
20#include <linux/kthread.h>
21#include <linux/mutex.h>
22#include <linux/fs.h>
23#include <linux/fsnotify_backend.h>
24#include <linux/namei.h>
25#include <linux/netlink.h>
26#include <linux/sched.h>
27#include <linux/slab.h>
28#include <linux/security.h>
29#include "audit.h"
30
31/*
32 * this mark lives on the parent directory of the inode in question.
33 * but dev, ino, and path are about the child
34 */
35struct audit_fsnotify_mark {
36 dev_t dev; /* associated superblock device */
37 unsigned long ino; /* associated inode number */
38 char *path; /* insertion path */
39 struct fsnotify_mark mark; /* fsnotify mark on the inode */
40 struct audit_krule *rule;
41};
42
43/* fsnotify handle. */
44static struct fsnotify_group *audit_fsnotify_group;
45
46/* fsnotify events we care about. */
47#define AUDIT_FS_EVENTS (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
48 FS_MOVE_SELF | FS_EVENT_ON_CHILD)
49
50static void audit_fsnotify_mark_free(struct audit_fsnotify_mark *audit_mark)
51{
52 kfree(audit_mark->path);
53 kfree(audit_mark);
54}
55
56static void audit_fsnotify_free_mark(struct fsnotify_mark *mark)
57{
58 struct audit_fsnotify_mark *audit_mark;
59
60 audit_mark = container_of(mark, struct audit_fsnotify_mark, mark);
61 audit_fsnotify_mark_free(audit_mark);
62}
63
64char *audit_mark_path(struct audit_fsnotify_mark *mark)
65{
66 return mark->path;
67}
68
69int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev)
70{
71 if (mark->ino == AUDIT_INO_UNSET)
72 return 0;
73 return (mark->ino == ino) && (mark->dev == dev);
74}
75
76static void audit_update_mark(struct audit_fsnotify_mark *audit_mark,
77 struct inode *inode)
78{
79 audit_mark->dev = inode ? inode->i_sb->s_dev : AUDIT_DEV_UNSET;
80 audit_mark->ino = inode ? inode->i_ino : AUDIT_INO_UNSET;
81}
82
83struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len)
84{
85 struct audit_fsnotify_mark *audit_mark;
86 struct path path;
87 struct dentry *dentry;
88 struct inode *inode;
89 int ret;
90
91 if (pathname[0] != '/' || pathname[len-1] == '/')
92 return ERR_PTR(-EINVAL);
93
94 dentry = kern_path_locked(pathname, &path);
95 if (IS_ERR(dentry))
96 return (void *)dentry; /* returning an error */
97 inode = path.dentry->d_inode;
98 mutex_unlock(&inode->i_mutex);
99
100 audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL);
101 if (unlikely(!audit_mark)) {
102 audit_mark = ERR_PTR(-ENOMEM);
103 goto out;
104 }
105
106 fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_free_mark);
107 audit_mark->mark.mask = AUDIT_FS_EVENTS;
108 audit_mark->path = pathname;
109 audit_update_mark(audit_mark, dentry->d_inode);
110 audit_mark->rule = krule;
111
112 ret = fsnotify_add_mark(&audit_mark->mark, audit_fsnotify_group, inode, NULL, true);
113 if (ret < 0) {
114 audit_fsnotify_mark_free(audit_mark);
115 audit_mark = ERR_PTR(ret);
116 }
117out:
118 dput(dentry);
119 path_put(&path);
120 return audit_mark;
121}
122
123static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, char *op)
124{
125 struct audit_buffer *ab;
126 struct audit_krule *rule = audit_mark->rule;
127
128 if (!audit_enabled)
129 return;
130 ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
131 if (unlikely(!ab))
132 return;
133 audit_log_format(ab, "auid=%u ses=%u op=",
134 from_kuid(&init_user_ns, audit_get_loginuid(current)),
135 audit_get_sessionid(current));
136 audit_log_string(ab, op);
137 audit_log_format(ab, " path=");
138 audit_log_untrustedstring(ab, audit_mark->path);
139 audit_log_key(ab, rule->filterkey);
140 audit_log_format(ab, " list=%d res=1", rule->listnr);
141 audit_log_end(ab);
142}
143
144void audit_remove_mark(struct audit_fsnotify_mark *audit_mark)
145{
146 fsnotify_destroy_mark(&audit_mark->mark, audit_fsnotify_group);
147 fsnotify_put_mark(&audit_mark->mark);
148}
149
150void audit_remove_mark_rule(struct audit_krule *krule)
151{
152 struct audit_fsnotify_mark *mark = krule->exe;
153
154 audit_remove_mark(mark);
155}
156
157static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark)
158{
159 struct audit_krule *rule = audit_mark->rule;
160 struct audit_entry *entry = container_of(rule, struct audit_entry, rule);
161
162 audit_mark_log_rule_change(audit_mark, "autoremove_rule");
163 audit_del_rule(entry);
164}
165
166/* Update mark data in audit rules based on fsnotify events. */
167static int audit_mark_handle_event(struct fsnotify_group *group,
168 struct inode *to_tell,
169 struct fsnotify_mark *inode_mark,
170 struct fsnotify_mark *vfsmount_mark,
171 u32 mask, void *data, int data_type,
172 const unsigned char *dname, u32 cookie)
173{
174 struct audit_fsnotify_mark *audit_mark;
175 struct inode *inode = NULL;
176
177 audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark);
178
179 BUG_ON(group != audit_fsnotify_group);
180
181 switch (data_type) {
182 case (FSNOTIFY_EVENT_PATH):
183 inode = ((struct path *)data)->dentry->d_inode;
184 break;
185 case (FSNOTIFY_EVENT_INODE):
186 inode = (struct inode *)data;
187 break;
188 default:
189 BUG();
190 return 0;
191 };
192
193 if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) {
194 if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL))
195 return 0;
196 audit_update_mark(audit_mark, inode);
197 } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
198 audit_autoremove_mark_rule(audit_mark);
199
200 return 0;
201}
202
203static const struct fsnotify_ops audit_mark_fsnotify_ops = {
204 .handle_event = audit_mark_handle_event,
205};
206
207static int __init audit_fsnotify_init(void)
208{
209 audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops);
210 if (IS_ERR(audit_fsnotify_group)) {
211 audit_fsnotify_group = NULL;
212 audit_panic("cannot create audit fsnotify group");
213 }
214 return 0;
215}
216device_initcall(audit_fsnotify_init);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index b0f9877273fc..94ecdabda8e6 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -479,6 +479,8 @@ static void kill_rules(struct audit_tree *tree)
479 if (rule->tree) { 479 if (rule->tree) {
480 /* not a half-baked one */ 480 /* not a half-baked one */
481 audit_tree_log_remove_rule(rule); 481 audit_tree_log_remove_rule(rule);
482 if (entry->rule.exe)
483 audit_remove_mark(entry->rule.exe);
482 rule->tree = NULL; 484 rule->tree = NULL;
483 list_del_rcu(&entry->list); 485 list_del_rcu(&entry->list);
484 list_del(&entry->rule.list); 486 list_del(&entry->rule.list);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 6e30024d9aac..656c7e93ac0d 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -138,7 +138,7 @@ char *audit_watch_path(struct audit_watch *watch)
138 138
139int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) 139int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
140{ 140{
141 return (watch->ino != (unsigned long)-1) && 141 return (watch->ino != AUDIT_INO_UNSET) &&
142 (watch->ino == ino) && 142 (watch->ino == ino) &&
143 (watch->dev == dev); 143 (watch->dev == dev);
144} 144}
@@ -179,8 +179,8 @@ static struct audit_watch *audit_init_watch(char *path)
179 INIT_LIST_HEAD(&watch->rules); 179 INIT_LIST_HEAD(&watch->rules);
180 atomic_set(&watch->count, 1); 180 atomic_set(&watch->count, 1);
181 watch->path = path; 181 watch->path = path;
182 watch->dev = (dev_t)-1; 182 watch->dev = AUDIT_DEV_UNSET;
183 watch->ino = (unsigned long)-1; 183 watch->ino = AUDIT_INO_UNSET;
184 184
185 return watch; 185 return watch;
186} 186}
@@ -203,7 +203,6 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
203 if (IS_ERR(watch)) 203 if (IS_ERR(watch))
204 return PTR_ERR(watch); 204 return PTR_ERR(watch);
205 205
206 audit_get_watch(watch);
207 krule->watch = watch; 206 krule->watch = watch;
208 207
209 return 0; 208 return 0;
@@ -313,6 +312,8 @@ static void audit_update_watch(struct audit_parent *parent,
313 list_replace(&oentry->rule.list, 312 list_replace(&oentry->rule.list,
314 &nentry->rule.list); 313 &nentry->rule.list);
315 } 314 }
315 if (oentry->rule.exe)
316 audit_remove_mark(oentry->rule.exe);
316 317
317 audit_watch_log_rule_change(r, owatch, "updated_rules"); 318 audit_watch_log_rule_change(r, owatch, "updated_rules");
318 319
@@ -343,6 +344,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
343 list_for_each_entry_safe(r, nextr, &w->rules, rlist) { 344 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
344 e = container_of(r, struct audit_entry, rule); 345 e = container_of(r, struct audit_entry, rule);
345 audit_watch_log_rule_change(r, w, "remove_rule"); 346 audit_watch_log_rule_change(r, w, "remove_rule");
347 if (e->rule.exe)
348 audit_remove_mark(e->rule.exe);
346 list_del(&r->rlist); 349 list_del(&r->rlist);
347 list_del(&r->list); 350 list_del(&r->list);
348 list_del_rcu(&e->list); 351 list_del_rcu(&e->list);
@@ -387,19 +390,20 @@ static void audit_add_to_parent(struct audit_krule *krule,
387 390
388 watch_found = 1; 391 watch_found = 1;
389 392
390 /* put krule's and initial refs to temporary watch */ 393 /* put krule's ref to temporary watch */
391 audit_put_watch(watch);
392 audit_put_watch(watch); 394 audit_put_watch(watch);
393 395
394 audit_get_watch(w); 396 audit_get_watch(w);
395 krule->watch = watch = w; 397 krule->watch = watch = w;
398
399 audit_put_parent(parent);
396 break; 400 break;
397 } 401 }
398 402
399 if (!watch_found) { 403 if (!watch_found) {
400 audit_get_parent(parent);
401 watch->parent = parent; 404 watch->parent = parent;
402 405
406 audit_get_watch(watch);
403 list_add(&watch->wlist, &parent->watches); 407 list_add(&watch->wlist, &parent->watches);
404 } 408 }
405 list_add(&krule->rlist, &watch->rules); 409 list_add(&krule->rlist, &watch->rules);
@@ -437,9 +441,6 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
437 441
438 audit_add_to_parent(krule, parent); 442 audit_add_to_parent(krule, parent);
439 443
440 /* match get in audit_find_parent or audit_init_parent */
441 audit_put_parent(parent);
442
443 h = audit_hash_ino((u32)watch->ino); 444 h = audit_hash_ino((u32)watch->ino);
444 *list = &audit_inode_hash[h]; 445 *list = &audit_inode_hash[h];
445error: 446error:
@@ -496,7 +497,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
496 if (mask & (FS_CREATE|FS_MOVED_TO) && inode) 497 if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
497 audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); 498 audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
498 else if (mask & (FS_DELETE|FS_MOVED_FROM)) 499 else if (mask & (FS_DELETE|FS_MOVED_FROM))
499 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); 500 audit_update_watch(parent, dname, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1);
500 else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) 501 else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
501 audit_remove_parent_watches(parent); 502 audit_remove_parent_watches(parent);
502 503
@@ -517,3 +518,36 @@ static int __init audit_watch_init(void)
517 return 0; 518 return 0;
518} 519}
519device_initcall(audit_watch_init); 520device_initcall(audit_watch_init);
521
522int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old)
523{
524 struct audit_fsnotify_mark *audit_mark;
525 char *pathname;
526
527 pathname = kstrdup(audit_mark_path(old->exe), GFP_KERNEL);
528 if (!pathname)
529 return -ENOMEM;
530
531 audit_mark = audit_alloc_mark(new, pathname, strlen(pathname));
532 if (IS_ERR(audit_mark)) {
533 kfree(pathname);
534 return PTR_ERR(audit_mark);
535 }
536 new->exe = audit_mark;
537
538 return 0;
539}
540
541int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
542{
543 struct file *exe_file;
544 unsigned long ino;
545 dev_t dev;
546
547 rcu_read_lock();
548 exe_file = rcu_dereference(tsk->mm->exe_file);
549 ino = exe_file->f_inode->i_ino;
550 dev = exe_file->f_inode->i_sb->s_dev;
551 rcu_read_unlock();
552 return audit_mark_compare(mark, ino, dev);
553}
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 72e1660a79a3..7714d93edb85 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -405,6 +405,12 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
405 if (f->val > AUDIT_MAX_FIELD_COMPARE) 405 if (f->val > AUDIT_MAX_FIELD_COMPARE)
406 return -EINVAL; 406 return -EINVAL;
407 break; 407 break;
408 case AUDIT_EXE:
409 if (f->op != Audit_equal)
410 return -EINVAL;
411 if (entry->rule.listnr != AUDIT_FILTER_EXIT)
412 return -EINVAL;
413 break;
408 }; 414 };
409 return 0; 415 return 0;
410} 416}
@@ -419,6 +425,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
419 size_t remain = datasz - sizeof(struct audit_rule_data); 425 size_t remain = datasz - sizeof(struct audit_rule_data);
420 int i; 426 int i;
421 char *str; 427 char *str;
428 struct audit_fsnotify_mark *audit_mark;
422 429
423 entry = audit_to_entry_common(data); 430 entry = audit_to_entry_common(data);
424 if (IS_ERR(entry)) 431 if (IS_ERR(entry))
@@ -539,6 +546,24 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
539 entry->rule.buflen += f->val; 546 entry->rule.buflen += f->val;
540 entry->rule.filterkey = str; 547 entry->rule.filterkey = str;
541 break; 548 break;
549 case AUDIT_EXE:
550 if (entry->rule.exe || f->val > PATH_MAX)
551 goto exit_free;
552 str = audit_unpack_string(&bufp, &remain, f->val);
553 if (IS_ERR(str)) {
554 err = PTR_ERR(str);
555 goto exit_free;
556 }
557 entry->rule.buflen += f->val;
558
559 audit_mark = audit_alloc_mark(&entry->rule, str, f->val);
560 if (IS_ERR(audit_mark)) {
561 kfree(str);
562 err = PTR_ERR(audit_mark);
563 goto exit_free;
564 }
565 entry->rule.exe = audit_mark;
566 break;
542 } 567 }
543 } 568 }
544 569
@@ -549,10 +574,10 @@ exit_nofree:
549 return entry; 574 return entry;
550 575
551exit_free: 576exit_free:
552 if (entry->rule.watch)
553 audit_put_watch(entry->rule.watch); /* matches initial get */
554 if (entry->rule.tree) 577 if (entry->rule.tree)
555 audit_put_tree(entry->rule.tree); /* that's the temporary one */ 578 audit_put_tree(entry->rule.tree); /* that's the temporary one */
579 if (entry->rule.exe)
580 audit_remove_mark(entry->rule.exe); /* that's the template one */
556 audit_free_rule(entry); 581 audit_free_rule(entry);
557 return ERR_PTR(err); 582 return ERR_PTR(err);
558} 583}
@@ -617,6 +642,10 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
617 data->buflen += data->values[i] = 642 data->buflen += data->values[i] =
618 audit_pack_string(&bufp, krule->filterkey); 643 audit_pack_string(&bufp, krule->filterkey);
619 break; 644 break;
645 case AUDIT_EXE:
646 data->buflen += data->values[i] =
647 audit_pack_string(&bufp, audit_mark_path(krule->exe));
648 break;
620 case AUDIT_LOGINUID_SET: 649 case AUDIT_LOGINUID_SET:
621 if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) { 650 if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) {
622 data->fields[i] = AUDIT_LOGINUID; 651 data->fields[i] = AUDIT_LOGINUID;
@@ -680,6 +709,12 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
680 if (strcmp(a->filterkey, b->filterkey)) 709 if (strcmp(a->filterkey, b->filterkey))
681 return 1; 710 return 1;
682 break; 711 break;
712 case AUDIT_EXE:
713 /* both paths exist based on above type compare */
714 if (strcmp(audit_mark_path(a->exe),
715 audit_mark_path(b->exe)))
716 return 1;
717 break;
683 case AUDIT_UID: 718 case AUDIT_UID:
684 case AUDIT_EUID: 719 case AUDIT_EUID:
685 case AUDIT_SUID: 720 case AUDIT_SUID:
@@ -801,8 +836,14 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old)
801 err = -ENOMEM; 836 err = -ENOMEM;
802 else 837 else
803 new->filterkey = fk; 838 new->filterkey = fk;
839 break;
840 case AUDIT_EXE:
841 err = audit_dupe_exe(new, old);
842 break;
804 } 843 }
805 if (err) { 844 if (err) {
845 if (new->exe)
846 audit_remove_mark(new->exe);
806 audit_free_rule(entry); 847 audit_free_rule(entry);
807 return ERR_PTR(err); 848 return ERR_PTR(err);
808 } 849 }
@@ -863,7 +904,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
863 struct audit_watch *watch = entry->rule.watch; 904 struct audit_watch *watch = entry->rule.watch;
864 struct audit_tree *tree = entry->rule.tree; 905 struct audit_tree *tree = entry->rule.tree;
865 struct list_head *list; 906 struct list_head *list;
866 int err; 907 int err = 0;
867#ifdef CONFIG_AUDITSYSCALL 908#ifdef CONFIG_AUDITSYSCALL
868 int dont_count = 0; 909 int dont_count = 0;
869 910
@@ -881,7 +922,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
881 /* normally audit_add_tree_rule() will free it on failure */ 922 /* normally audit_add_tree_rule() will free it on failure */
882 if (tree) 923 if (tree)
883 audit_put_tree(tree); 924 audit_put_tree(tree);
884 goto error; 925 return err;
885 } 926 }
886 927
887 if (watch) { 928 if (watch) {
@@ -895,14 +936,14 @@ static inline int audit_add_rule(struct audit_entry *entry)
895 */ 936 */
896 if (tree) 937 if (tree)
897 audit_put_tree(tree); 938 audit_put_tree(tree);
898 goto error; 939 return err;
899 } 940 }
900 } 941 }
901 if (tree) { 942 if (tree) {
902 err = audit_add_tree_rule(&entry->rule); 943 err = audit_add_tree_rule(&entry->rule);
903 if (err) { 944 if (err) {
904 mutex_unlock(&audit_filter_mutex); 945 mutex_unlock(&audit_filter_mutex);
905 goto error; 946 return err;
906 } 947 }
907 } 948 }
908 949
@@ -933,19 +974,13 @@ static inline int audit_add_rule(struct audit_entry *entry)
933#endif 974#endif
934 mutex_unlock(&audit_filter_mutex); 975 mutex_unlock(&audit_filter_mutex);
935 976
936 return 0;
937
938error:
939 if (watch)
940 audit_put_watch(watch); /* tmp watch, matches initial get */
941 return err; 977 return err;
942} 978}
943 979
944/* Remove an existing rule from filterlist. */ 980/* Remove an existing rule from filterlist. */
945static inline int audit_del_rule(struct audit_entry *entry) 981int audit_del_rule(struct audit_entry *entry)
946{ 982{
947 struct audit_entry *e; 983 struct audit_entry *e;
948 struct audit_watch *watch = entry->rule.watch;
949 struct audit_tree *tree = entry->rule.tree; 984 struct audit_tree *tree = entry->rule.tree;
950 struct list_head *list; 985 struct list_head *list;
951 int ret = 0; 986 int ret = 0;
@@ -961,7 +996,6 @@ static inline int audit_del_rule(struct audit_entry *entry)
961 mutex_lock(&audit_filter_mutex); 996 mutex_lock(&audit_filter_mutex);
962 e = audit_find_rule(entry, &list); 997 e = audit_find_rule(entry, &list);
963 if (!e) { 998 if (!e) {
964 mutex_unlock(&audit_filter_mutex);
965 ret = -ENOENT; 999 ret = -ENOENT;
966 goto out; 1000 goto out;
967 } 1001 }
@@ -972,9 +1006,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
972 if (e->rule.tree) 1006 if (e->rule.tree)
973 audit_remove_tree_rule(&e->rule); 1007 audit_remove_tree_rule(&e->rule);
974 1008
975 list_del_rcu(&e->list); 1009 if (e->rule.exe)
976 list_del(&e->rule.list); 1010 audit_remove_mark_rule(&e->rule);
977 call_rcu(&e->rcu, audit_free_rule_rcu);
978 1011
979#ifdef CONFIG_AUDITSYSCALL 1012#ifdef CONFIG_AUDITSYSCALL
980 if (!dont_count) 1013 if (!dont_count)
@@ -983,11 +1016,14 @@ static inline int audit_del_rule(struct audit_entry *entry)
983 if (!audit_match_signal(entry)) 1016 if (!audit_match_signal(entry))
984 audit_signals--; 1017 audit_signals--;
985#endif 1018#endif
986 mutex_unlock(&audit_filter_mutex); 1019
1020 list_del_rcu(&e->list);
1021 list_del(&e->rule.list);
1022 call_rcu(&e->rcu, audit_free_rule_rcu);
987 1023
988out: 1024out:
989 if (watch) 1025 mutex_unlock(&audit_filter_mutex);
990 audit_put_watch(watch); /* match initial get */ 1026
991 if (tree) 1027 if (tree)
992 audit_put_tree(tree); /* that's the temporary one */ 1028 audit_put_tree(tree); /* that's the temporary one */
993 1029
@@ -1077,8 +1113,11 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data,
1077 WARN_ON(1); 1113 WARN_ON(1);
1078 } 1114 }
1079 1115
1080 if (err || type == AUDIT_DEL_RULE) 1116 if (err || type == AUDIT_DEL_RULE) {
1117 if (entry->rule.exe)
1118 audit_remove_mark(entry->rule.exe);
1081 audit_free_rule(entry); 1119 audit_free_rule(entry);
1120 }
1082 1121
1083 return err; 1122 return err;
1084} 1123}
@@ -1370,6 +1409,8 @@ static int update_lsm_rule(struct audit_krule *r)
1370 return 0; 1409 return 0;
1371 1410
1372 nentry = audit_dupe_rule(r); 1411 nentry = audit_dupe_rule(r);
1412 if (entry->rule.exe)
1413 audit_remove_mark(entry->rule.exe);
1373 if (IS_ERR(nentry)) { 1414 if (IS_ERR(nentry)) {
1374 /* save the first error encountered for the 1415 /* save the first error encountered for the
1375 * return value */ 1416 * return value */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e85bdfd15fed..b86cc04959de 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -180,7 +180,7 @@ static int audit_match_filetype(struct audit_context *ctx, int val)
180 return 0; 180 return 0;
181 181
182 list_for_each_entry(n, &ctx->names_list, list) { 182 list_for_each_entry(n, &ctx->names_list, list) {
183 if ((n->ino != -1) && 183 if ((n->ino != AUDIT_INO_UNSET) &&
184 ((n->mode & S_IFMT) == mode)) 184 ((n->mode & S_IFMT) == mode))
185 return 1; 185 return 1;
186 } 186 }
@@ -466,6 +466,9 @@ static int audit_filter_rules(struct task_struct *tsk,
466 result = audit_comparator(ctx->ppid, f->op, f->val); 466 result = audit_comparator(ctx->ppid, f->op, f->val);
467 } 467 }
468 break; 468 break;
469 case AUDIT_EXE:
470 result = audit_exe_compare(tsk, rule->exe);
471 break;
469 case AUDIT_UID: 472 case AUDIT_UID:
470 result = audit_uid_comparator(cred->uid, f->op, f->uid); 473 result = audit_uid_comparator(cred->uid, f->op, f->uid);
471 break; 474 break;
@@ -1680,7 +1683,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
1680 aname->should_free = true; 1683 aname->should_free = true;
1681 } 1684 }
1682 1685
1683 aname->ino = (unsigned long)-1; 1686 aname->ino = AUDIT_INO_UNSET;
1684 aname->type = type; 1687 aname->type = type;
1685 list_add_tail(&aname->list, &context->names_list); 1688 list_add_tail(&aname->list, &context->names_list);
1686 1689
@@ -1922,7 +1925,7 @@ void __audit_inode_child(const struct inode *parent,
1922 if (inode) 1925 if (inode)
1923 audit_copy_inode(found_child, dentry, inode); 1926 audit_copy_inode(found_child, dentry, inode);
1924 else 1927 else
1925 found_child->ino = (unsigned long)-1; 1928 found_child->ino = AUDIT_INO_UNSET;
1926} 1929}
1927EXPORT_SYMBOL_GPL(__audit_inode_child); 1930EXPORT_SYMBOL_GPL(__audit_inode_child);
1928 1931
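
Taken together, the audit changes above introduce the AUDIT_EXE rule field: an fsnotify mark tracks the named executable and syscall-exit events are matched against it via audit_exe_compare(). For orientation only, a userspace sketch of installing such a rule follows. It assumes libaudit's rule-building helpers (audit_rule_syscallbyname_data(), audit_rule_fieldpair_data(), audit_add_rule_data()) behave as auditctl uses them, an audit userspace new enough to recognise the "exe" field, and an arbitrary example path; verify the signatures against your libaudit before relying on it.

/* Hedged sketch: roughly "-a always,exit -S all -F exe=/usr/bin/passwd". */
#include <stdlib.h>
#include <libaudit.h>

int main(void)
{
	struct audit_rule_data *rule;
	int fd, rc;

	fd = audit_open();			/* NETLINK_AUDIT socket */
	if (fd < 0)
		return 1;

	rule = calloc(1, sizeof(*rule));
	if (!rule) {
		audit_close(fd);
		return 1;
	}

	audit_rule_syscallbyname_data(rule, "all");	/* -S all */
	/* AUDIT_EXE only accepts '==' and the exit filter list; see
	 * audit_field_valid() in the auditfilter.c hunk above. */
	rc = audit_rule_fieldpair_data(&rule, "exe=/usr/bin/passwd",
				       AUDIT_FILTER_EXIT);
	if (rc == 0)
		rc = audit_add_rule_data(fd, rule, AUDIT_FILTER_EXIT,
					 AUDIT_ALWAYS);

	free(rule);
	audit_close(fd);
	return rc < 0 ? 1 : 0;
}
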
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index cb31229a6fa4..29ace107f236 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -150,15 +150,15 @@ static int __init register_array_map(void)
150} 150}
151late_initcall(register_array_map); 151late_initcall(register_array_map);
152 152
153static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) 153static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
154{ 154{
155 /* only bpf_prog file descriptors can be stored in prog_array map */ 155 /* only file descriptors can be stored in this type of map */
156 if (attr->value_size != sizeof(u32)) 156 if (attr->value_size != sizeof(u32))
157 return ERR_PTR(-EINVAL); 157 return ERR_PTR(-EINVAL);
158 return array_map_alloc(attr); 158 return array_map_alloc(attr);
159} 159}
160 160
161static void prog_array_map_free(struct bpf_map *map) 161static void fd_array_map_free(struct bpf_map *map)
162{ 162{
163 struct bpf_array *array = container_of(map, struct bpf_array, map); 163 struct bpf_array *array = container_of(map, struct bpf_array, map);
164 int i; 164 int i;
@@ -167,21 +167,21 @@ static void prog_array_map_free(struct bpf_map *map)
167 167
168 /* make sure it's empty */ 168 /* make sure it's empty */
169 for (i = 0; i < array->map.max_entries; i++) 169 for (i = 0; i < array->map.max_entries; i++)
170 BUG_ON(array->prog[i] != NULL); 170 BUG_ON(array->ptrs[i] != NULL);
171 kvfree(array); 171 kvfree(array);
172} 172}
173 173
174static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key) 174static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
175{ 175{
176 return NULL; 176 return NULL;
177} 177}
178 178
179/* only called from syscall */ 179/* only called from syscall */
180static int prog_array_map_update_elem(struct bpf_map *map, void *key, 180static int fd_array_map_update_elem(struct bpf_map *map, void *key,
181 void *value, u64 map_flags) 181 void *value, u64 map_flags)
182{ 182{
183 struct bpf_array *array = container_of(map, struct bpf_array, map); 183 struct bpf_array *array = container_of(map, struct bpf_array, map);
184 struct bpf_prog *prog, *old_prog; 184 void *new_ptr, *old_ptr;
185 u32 index = *(u32 *)key, ufd; 185 u32 index = *(u32 *)key, ufd;
186 186
187 if (map_flags != BPF_ANY) 187 if (map_flags != BPF_ANY)
@@ -191,57 +191,75 @@ static int prog_array_map_update_elem(struct bpf_map *map, void *key,
191 return -E2BIG; 191 return -E2BIG;
192 192
193 ufd = *(u32 *)value; 193 ufd = *(u32 *)value;
194 prog = bpf_prog_get(ufd); 194 new_ptr = map->ops->map_fd_get_ptr(map, ufd);
195 if (IS_ERR(prog)) 195 if (IS_ERR(new_ptr))
196 return PTR_ERR(prog); 196 return PTR_ERR(new_ptr);
197
198 if (!bpf_prog_array_compatible(array, prog)) {
199 bpf_prog_put(prog);
200 return -EINVAL;
201 }
202 197
203 old_prog = xchg(array->prog + index, prog); 198 old_ptr = xchg(array->ptrs + index, new_ptr);
204 if (old_prog) 199 if (old_ptr)
205 bpf_prog_put_rcu(old_prog); 200 map->ops->map_fd_put_ptr(old_ptr);
206 201
207 return 0; 202 return 0;
208} 203}
209 204
210static int prog_array_map_delete_elem(struct bpf_map *map, void *key) 205static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
211{ 206{
212 struct bpf_array *array = container_of(map, struct bpf_array, map); 207 struct bpf_array *array = container_of(map, struct bpf_array, map);
213 struct bpf_prog *old_prog; 208 void *old_ptr;
214 u32 index = *(u32 *)key; 209 u32 index = *(u32 *)key;
215 210
216 if (index >= array->map.max_entries) 211 if (index >= array->map.max_entries)
217 return -E2BIG; 212 return -E2BIG;
218 213
219 old_prog = xchg(array->prog + index, NULL); 214 old_ptr = xchg(array->ptrs + index, NULL);
220 if (old_prog) { 215 if (old_ptr) {
221 bpf_prog_put_rcu(old_prog); 216 map->ops->map_fd_put_ptr(old_ptr);
222 return 0; 217 return 0;
223 } else { 218 } else {
224 return -ENOENT; 219 return -ENOENT;
225 } 220 }
226} 221}
227 222
223static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
224{
225 struct bpf_array *array = container_of(map, struct bpf_array, map);
226 struct bpf_prog *prog = bpf_prog_get(fd);
227 if (IS_ERR(prog))
228 return prog;
229
230 if (!bpf_prog_array_compatible(array, prog)) {
231 bpf_prog_put(prog);
232 return ERR_PTR(-EINVAL);
233 }
234 return prog;
235}
236
237static void prog_fd_array_put_ptr(void *ptr)
238{
239 struct bpf_prog *prog = ptr;
240
241 bpf_prog_put_rcu(prog);
242}
243
228/* decrement refcnt of all bpf_progs that are stored in this map */ 244/* decrement refcnt of all bpf_progs that are stored in this map */
229void bpf_prog_array_map_clear(struct bpf_map *map) 245void bpf_fd_array_map_clear(struct bpf_map *map)
230{ 246{
231 struct bpf_array *array = container_of(map, struct bpf_array, map); 247 struct bpf_array *array = container_of(map, struct bpf_array, map);
232 int i; 248 int i;
233 249
234 for (i = 0; i < array->map.max_entries; i++) 250 for (i = 0; i < array->map.max_entries; i++)
235 prog_array_map_delete_elem(map, &i); 251 fd_array_map_delete_elem(map, &i);
236} 252}
237 253
238static const struct bpf_map_ops prog_array_ops = { 254static const struct bpf_map_ops prog_array_ops = {
239 .map_alloc = prog_array_map_alloc, 255 .map_alloc = fd_array_map_alloc,
240 .map_free = prog_array_map_free, 256 .map_free = fd_array_map_free,
241 .map_get_next_key = array_map_get_next_key, 257 .map_get_next_key = array_map_get_next_key,
242 .map_lookup_elem = prog_array_map_lookup_elem, 258 .map_lookup_elem = fd_array_map_lookup_elem,
243 .map_update_elem = prog_array_map_update_elem, 259 .map_update_elem = fd_array_map_update_elem,
244 .map_delete_elem = prog_array_map_delete_elem, 260 .map_delete_elem = fd_array_map_delete_elem,
261 .map_fd_get_ptr = prog_fd_array_get_ptr,
262 .map_fd_put_ptr = prog_fd_array_put_ptr,
245}; 263};
246 264
247static struct bpf_map_type_list prog_array_type __read_mostly = { 265static struct bpf_map_type_list prog_array_type __read_mostly = {
@@ -255,3 +273,60 @@ static int __init register_prog_array_map(void)
255 return 0; 273 return 0;
256} 274}
257late_initcall(register_prog_array_map); 275late_initcall(register_prog_array_map);
276
277static void perf_event_array_map_free(struct bpf_map *map)
278{
279 bpf_fd_array_map_clear(map);
280 fd_array_map_free(map);
281}
282
283static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
284{
285 struct perf_event *event;
286 const struct perf_event_attr *attr;
287
288 event = perf_event_get(fd);
289 if (IS_ERR(event))
290 return event;
291
292 attr = perf_event_attrs(event);
293 if (IS_ERR(attr))
294 return (void *)attr;
295
296 if (attr->type != PERF_TYPE_RAW &&
297 attr->type != PERF_TYPE_HARDWARE) {
298 perf_event_release_kernel(event);
299 return ERR_PTR(-EINVAL);
300 }
301 return event;
302}
303
304static void perf_event_fd_array_put_ptr(void *ptr)
305{
306 struct perf_event *event = ptr;
307
308 perf_event_release_kernel(event);
309}
310
311static const struct bpf_map_ops perf_event_array_ops = {
312 .map_alloc = fd_array_map_alloc,
313 .map_free = perf_event_array_map_free,
314 .map_get_next_key = array_map_get_next_key,
315 .map_lookup_elem = fd_array_map_lookup_elem,
316 .map_update_elem = fd_array_map_update_elem,
317 .map_delete_elem = fd_array_map_delete_elem,
318 .map_fd_get_ptr = perf_event_fd_array_get_ptr,
319 .map_fd_put_ptr = perf_event_fd_array_put_ptr,
320};
321
322static struct bpf_map_type_list perf_event_array_type __read_mostly = {
323 .ops = &perf_event_array_ops,
324 .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
325};
326
327static int __init register_perf_event_array_map(void)
328{
329 bpf_register_map_type(&perf_event_array_type);
330 return 0;
331}
332late_initcall(register_perf_event_array_map);
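
The new BPF_MAP_TYPE_PERF_EVENT_ARRAY reuses the fd-array infrastructure above to hold perf event file descriptors, which a program can then sample with the bpf_perf_event_read() helper. A rough kernel-side sketch in the samples/bpf style is below; the map name, section name and sizes are illustrative, and the declarations from samples/bpf bpf_helpers.h are assumed. The loader is expected to open one hardware/raw perf event per CPU and store each fd at index == cpu.

#include <linux/ptrace.h>
#include <linux/version.h>
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") counter_map = {
	.type        = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	.key_size    = sizeof(int),
	.value_size  = sizeof(u32),
	.max_entries = 32,		/* >= number of possible CPUs */
};

SEC("kprobe/sys_write")
int count_sys_write(struct pt_regs *ctx)
{
	char fmt[] = "cpu %d counter %llu\n";
	u32 cpu = bpf_get_smp_processor_id();
	u64 count;

	/* Only legal because counter_map is a PERF_EVENT_ARRAY; the verifier
	 * rejects this helper for any other map type (see verifier.c below). */
	count = bpf_perf_event_read(&counter_map, cpu);
	bpf_trace_printk(fmt, sizeof(fmt), cpu, count);

	return 0;
}

char _license[] SEC("license") = "GPL";
u32 _version SEC("version") = LINUX_VERSION_CODE;
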
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c5bedc82bc1c..67c380cfa9ca 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -177,6 +177,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
177{ 177{
178 return 0; 178 return 0;
179} 179}
180EXPORT_SYMBOL_GPL(__bpf_call_base);
180 181
181/** 182/**
182 * __bpf_prog_run - run eBPF program on a given context 183 * __bpf_prog_run - run eBPF program on a given context
@@ -449,11 +450,15 @@ select_insn:
449 450
450 tail_call_cnt++; 451 tail_call_cnt++;
451 452
452 prog = READ_ONCE(array->prog[index]); 453 prog = READ_ONCE(array->ptrs[index]);
453 if (unlikely(!prog)) 454 if (unlikely(!prog))
454 goto out; 455 goto out;
455 456
456 ARG1 = BPF_R1; 457 /* ARG1 at this point is guaranteed to point to CTX from
458 * the verifier side due to the fact that the tail call is
 459 * handled like a helper, that is, bpf_tail_call_proto,
460 * where arg1_type is ARG_PTR_TO_CTX.
461 */
457 insn = prog->insnsi; 462 insn = prog->insnsi;
458 goto select_insn; 463 goto select_insn;
459out: 464out:
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a1b14d197a4f..35bac8e8b071 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -72,7 +72,7 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
72 /* prog_array stores refcnt-ed bpf_prog pointers 72 /* prog_array stores refcnt-ed bpf_prog pointers
73 * release them all when user space closes prog_array_fd 73 * release them all when user space closes prog_array_fd
74 */ 74 */
75 bpf_prog_array_map_clear(map); 75 bpf_fd_array_map_clear(map);
76 76
77 bpf_map_put(map); 77 bpf_map_put(map);
78 return 0; 78 return 0;
@@ -155,14 +155,15 @@ static int map_lookup_elem(union bpf_attr *attr)
155 void __user *ukey = u64_to_ptr(attr->key); 155 void __user *ukey = u64_to_ptr(attr->key);
156 void __user *uvalue = u64_to_ptr(attr->value); 156 void __user *uvalue = u64_to_ptr(attr->value);
157 int ufd = attr->map_fd; 157 int ufd = attr->map_fd;
158 struct fd f = fdget(ufd);
159 struct bpf_map *map; 158 struct bpf_map *map;
160 void *key, *value, *ptr; 159 void *key, *value, *ptr;
160 struct fd f;
161 int err; 161 int err;
162 162
163 if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) 163 if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
164 return -EINVAL; 164 return -EINVAL;
165 165
166 f = fdget(ufd);
166 map = bpf_map_get(f); 167 map = bpf_map_get(f);
167 if (IS_ERR(map)) 168 if (IS_ERR(map))
168 return PTR_ERR(map); 169 return PTR_ERR(map);
@@ -213,14 +214,15 @@ static int map_update_elem(union bpf_attr *attr)
213 void __user *ukey = u64_to_ptr(attr->key); 214 void __user *ukey = u64_to_ptr(attr->key);
214 void __user *uvalue = u64_to_ptr(attr->value); 215 void __user *uvalue = u64_to_ptr(attr->value);
215 int ufd = attr->map_fd; 216 int ufd = attr->map_fd;
216 struct fd f = fdget(ufd);
217 struct bpf_map *map; 217 struct bpf_map *map;
218 void *key, *value; 218 void *key, *value;
219 struct fd f;
219 int err; 220 int err;
220 221
221 if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) 222 if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
222 return -EINVAL; 223 return -EINVAL;
223 224
225 f = fdget(ufd);
224 map = bpf_map_get(f); 226 map = bpf_map_get(f);
225 if (IS_ERR(map)) 227 if (IS_ERR(map))
226 return PTR_ERR(map); 228 return PTR_ERR(map);
@@ -265,14 +267,15 @@ static int map_delete_elem(union bpf_attr *attr)
265{ 267{
266 void __user *ukey = u64_to_ptr(attr->key); 268 void __user *ukey = u64_to_ptr(attr->key);
267 int ufd = attr->map_fd; 269 int ufd = attr->map_fd;
268 struct fd f = fdget(ufd);
269 struct bpf_map *map; 270 struct bpf_map *map;
271 struct fd f;
270 void *key; 272 void *key;
271 int err; 273 int err;
272 274
273 if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) 275 if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
274 return -EINVAL; 276 return -EINVAL;
275 277
278 f = fdget(ufd);
276 map = bpf_map_get(f); 279 map = bpf_map_get(f);
277 if (IS_ERR(map)) 280 if (IS_ERR(map))
278 return PTR_ERR(map); 281 return PTR_ERR(map);
@@ -305,14 +308,15 @@ static int map_get_next_key(union bpf_attr *attr)
305 void __user *ukey = u64_to_ptr(attr->key); 308 void __user *ukey = u64_to_ptr(attr->key);
306 void __user *unext_key = u64_to_ptr(attr->next_key); 309 void __user *unext_key = u64_to_ptr(attr->next_key);
307 int ufd = attr->map_fd; 310 int ufd = attr->map_fd;
308 struct fd f = fdget(ufd);
309 struct bpf_map *map; 311 struct bpf_map *map;
310 void *key, *next_key; 312 void *key, *next_key;
313 struct fd f;
311 int err; 314 int err;
312 315
313 if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) 316 if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
314 return -EINVAL; 317 return -EINVAL;
315 318
319 f = fdget(ufd);
316 map = bpf_map_get(f); 320 map = bpf_map_get(f);
317 if (IS_ERR(map)) 321 if (IS_ERR(map))
318 return PTR_ERR(map); 322 return PTR_ERR(map);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 039d866fd36a..b074b23000d6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -238,6 +238,14 @@ static const char * const reg_type_str[] = {
238 [CONST_IMM] = "imm", 238 [CONST_IMM] = "imm",
239}; 239};
240 240
241static const struct {
242 int map_type;
243 int func_id;
244} func_limit[] = {
245 {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
246 {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
247};
248
241static void print_verifier_state(struct verifier_env *env) 249static void print_verifier_state(struct verifier_env *env)
242{ 250{
243 enum bpf_reg_type t; 251 enum bpf_reg_type t;
@@ -275,7 +283,7 @@ static const char *const bpf_class_string[] = {
275 [BPF_ALU64] = "alu64", 283 [BPF_ALU64] = "alu64",
276}; 284};
277 285
278static const char *const bpf_alu_string[] = { 286static const char *const bpf_alu_string[16] = {
279 [BPF_ADD >> 4] = "+=", 287 [BPF_ADD >> 4] = "+=",
280 [BPF_SUB >> 4] = "-=", 288 [BPF_SUB >> 4] = "-=",
281 [BPF_MUL >> 4] = "*=", 289 [BPF_MUL >> 4] = "*=",
@@ -299,7 +307,7 @@ static const char *const bpf_ldst_string[] = {
299 [BPF_DW >> 3] = "u64", 307 [BPF_DW >> 3] = "u64",
300}; 308};
301 309
302static const char *const bpf_jmp_string[] = { 310static const char *const bpf_jmp_string[16] = {
303 [BPF_JA >> 4] = "jmp", 311 [BPF_JA >> 4] = "jmp",
304 [BPF_JEQ >> 4] = "==", 312 [BPF_JEQ >> 4] = "==",
305 [BPF_JGT >> 4] = ">", 313 [BPF_JGT >> 4] = ">",
@@ -648,6 +656,9 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
648 struct verifier_state *state = &env->cur_state; 656 struct verifier_state *state = &env->cur_state;
649 int size, err = 0; 657 int size, err = 0;
650 658
659 if (state->regs[regno].type == PTR_TO_STACK)
660 off += state->regs[regno].imm;
661
651 size = bpf_size_to_bytes(bpf_size); 662 size = bpf_size_to_bytes(bpf_size);
652 if (size < 0) 663 if (size < 0)
653 return size; 664 return size;
@@ -667,7 +678,8 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
667 if (!err && t == BPF_READ && value_regno >= 0) 678 if (!err && t == BPF_READ && value_regno >= 0)
668 mark_reg_unknown_value(state->regs, value_regno); 679 mark_reg_unknown_value(state->regs, value_regno);
669 680
670 } else if (state->regs[regno].type == FRAME_PTR) { 681 } else if (state->regs[regno].type == FRAME_PTR ||
682 state->regs[regno].type == PTR_TO_STACK) {
671 if (off >= 0 || off < -MAX_BPF_STACK) { 683 if (off >= 0 || off < -MAX_BPF_STACK) {
672 verbose("invalid stack off=%d size=%d\n", off, size); 684 verbose("invalid stack off=%d size=%d\n", off, size);
673 return -EACCES; 685 return -EACCES;
@@ -833,6 +845,28 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
833 return err; 845 return err;
834} 846}
835 847
848static int check_map_func_compatibility(struct bpf_map *map, int func_id)
849{
850 bool bool_map, bool_func;
851 int i;
852
853 if (!map)
854 return 0;
855
856 for (i = 0; i < ARRAY_SIZE(func_limit); i++) {
857 bool_map = (map->map_type == func_limit[i].map_type);
858 bool_func = (func_id == func_limit[i].func_id);
 859 /* only when the map and func pair match can it continue.
860 * don't allow any other map type to be passed into
861 * the special func;
862 */
863 if (bool_map != bool_func)
864 return -EINVAL;
865 }
866
867 return 0;
868}
869
836static int check_call(struct verifier_env *env, int func_id) 870static int check_call(struct verifier_env *env, int func_id)
837{ 871{
838 struct verifier_state *state = &env->cur_state; 872 struct verifier_state *state = &env->cur_state;
@@ -908,21 +942,9 @@ static int check_call(struct verifier_env *env, int func_id)
908 return -EINVAL; 942 return -EINVAL;
909 } 943 }
910 944
911 if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY && 945 err = check_map_func_compatibility(map, func_id);
912 func_id != BPF_FUNC_tail_call) 946 if (err)
913 /* prog_array map type needs extra care: 947 return err;
914 * only allow to pass it into bpf_tail_call() for now.
915 * bpf_map_delete_elem() can be allowed in the future,
916 * while bpf_map_update_elem() must only be done via syscall
917 */
918 return -EINVAL;
919
920 if (func_id == BPF_FUNC_tail_call &&
921 map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
922 /* don't allow any other map type to be passed into
923 * bpf_tail_call()
924 */
925 return -EINVAL;
926 948
927 return 0; 949 return 0;
928} 950}
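
The func_limit[] table generalises the old prog-array restriction: a map type listed there may only be passed to its paired helper, and that helper only accepts that map type. A short sketch of the effect, again in the samples/bpf style with illustrative names and the bpf_helpers.h declarations assumed:

#include <linux/ptrace.h>
#include <linux/version.h>
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") jmp_table = {
	.type        = BPF_MAP_TYPE_PROG_ARRAY,
	.key_size    = sizeof(u32),
	.value_size  = sizeof(u32),
	.max_entries = 8,
};

SEC("kprobe/sys_open")
int dispatch(struct pt_regs *ctx)
{
	/* Allowed pairing: PROG_ARRAY with bpf_tail_call(). */
	bpf_tail_call(ctx, &jmp_table, 1);

	/* By contrast, bpf_perf_event_read(&jmp_table, 1) -- or passing a
	 * PERF_EVENT_ARRAY to bpf_tail_call() -- now fails verification in
	 * check_map_func_compatibility(). */
	return 0;
}

char _license[] SEC("license") = "GPL";
u32 _version SEC("version") = LINUX_VERSION_CODE;
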
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f89d9292eee6..2cf0f79f1fc9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -107,8 +107,8 @@ static DEFINE_SPINLOCK(release_agent_path_lock);
107struct percpu_rw_semaphore cgroup_threadgroup_rwsem; 107struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
108 108
109#define cgroup_assert_mutex_or_rcu_locked() \ 109#define cgroup_assert_mutex_or_rcu_locked() \
110 rcu_lockdep_assert(rcu_read_lock_held() || \ 110 RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
111 lockdep_is_held(&cgroup_mutex), \ 111 !lockdep_is_held(&cgroup_mutex), \
112 "cgroup_mutex or RCU read lock required"); 112 "cgroup_mutex or RCU read lock required");
113 113
114/* 114/*
@@ -145,6 +145,7 @@ static const char *cgroup_subsys_name[] = {
145 * part of that cgroup. 145 * part of that cgroup.
146 */ 146 */
147struct cgroup_root cgrp_dfl_root; 147struct cgroup_root cgrp_dfl_root;
148EXPORT_SYMBOL_GPL(cgrp_dfl_root);
148 149
149/* 150/*
150 * The default hierarchy always exists but is hidden until mounted for the 151 * The default hierarchy always exists but is hidden until mounted for the
@@ -186,6 +187,9 @@ static u64 css_serial_nr_next = 1;
186static unsigned long have_fork_callback __read_mostly; 187static unsigned long have_fork_callback __read_mostly;
187static unsigned long have_exit_callback __read_mostly; 188static unsigned long have_exit_callback __read_mostly;
188 189
190/* Ditto for the can_fork callback. */
191static unsigned long have_canfork_callback __read_mostly;
192
189static struct cftype cgroup_dfl_base_files[]; 193static struct cftype cgroup_dfl_base_files[];
190static struct cftype cgroup_legacy_base_files[]; 194static struct cftype cgroup_legacy_base_files[];
191 195
@@ -207,7 +211,7 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
207 211
208 idr_preload(gfp_mask); 212 idr_preload(gfp_mask);
209 spin_lock_bh(&cgroup_idr_lock); 213 spin_lock_bh(&cgroup_idr_lock);
210 ret = idr_alloc(idr, ptr, start, end, gfp_mask); 214 ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT);
211 spin_unlock_bh(&cgroup_idr_lock); 215 spin_unlock_bh(&cgroup_idr_lock);
212 idr_preload_end(); 216 idr_preload_end();
213 return ret; 217 return ret;
@@ -1027,10 +1031,13 @@ static const struct file_operations proc_cgroupstats_operations;
1027static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, 1031static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
1028 char *buf) 1032 char *buf)
1029{ 1033{
1034 struct cgroup_subsys *ss = cft->ss;
1035
1030 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && 1036 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
1031 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) 1037 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
1032 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s", 1038 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
1033 cft->ss->name, cft->name); 1039 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
1040 cft->name);
1034 else 1041 else
1035 strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); 1042 strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
1036 return buf; 1043 return buf;
@@ -1332,9 +1339,10 @@ static int cgroup_show_options(struct seq_file *seq,
1332 struct cgroup_subsys *ss; 1339 struct cgroup_subsys *ss;
1333 int ssid; 1340 int ssid;
1334 1341
1335 for_each_subsys(ss, ssid) 1342 if (root != &cgrp_dfl_root)
1336 if (root->subsys_mask & (1 << ssid)) 1343 for_each_subsys(ss, ssid)
1337 seq_printf(seq, ",%s", ss->name); 1344 if (root->subsys_mask & (1 << ssid))
1345 seq_show_option(seq, ss->legacy_name, NULL);
1338 if (root->flags & CGRP_ROOT_NOPREFIX) 1346 if (root->flags & CGRP_ROOT_NOPREFIX)
1339 seq_puts(seq, ",noprefix"); 1347 seq_puts(seq, ",noprefix");
1340 if (root->flags & CGRP_ROOT_XATTR) 1348 if (root->flags & CGRP_ROOT_XATTR)
@@ -1342,13 +1350,14 @@ static int cgroup_show_options(struct seq_file *seq,
1342 1350
1343 spin_lock(&release_agent_path_lock); 1351 spin_lock(&release_agent_path_lock);
1344 if (strlen(root->release_agent_path)) 1352 if (strlen(root->release_agent_path))
1345 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1353 seq_show_option(seq, "release_agent",
1354 root->release_agent_path);
1346 spin_unlock(&release_agent_path_lock); 1355 spin_unlock(&release_agent_path_lock);
1347 1356
1348 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) 1357 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
1349 seq_puts(seq, ",clone_children"); 1358 seq_puts(seq, ",clone_children");
1350 if (strlen(root->name)) 1359 if (strlen(root->name))
1351 seq_printf(seq, ",name=%s", root->name); 1360 seq_show_option(seq, "name", root->name);
1352 return 0; 1361 return 0;
1353} 1362}
1354 1363
@@ -1447,7 +1456,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1447 } 1456 }
1448 1457
1449 for_each_subsys(ss, i) { 1458 for_each_subsys(ss, i) {
1450 if (strcmp(token, ss->name)) 1459 if (strcmp(token, ss->legacy_name))
1451 continue; 1460 continue;
1452 if (ss->disabled) 1461 if (ss->disabled)
1453 continue; 1462 continue;
@@ -1666,7 +1675,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1666 1675
1667 lockdep_assert_held(&cgroup_mutex); 1676 lockdep_assert_held(&cgroup_mutex);
1668 1677
1669 ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT); 1678 ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
1670 if (ret < 0) 1679 if (ret < 0)
1671 goto out; 1680 goto out;
1672 root_cgrp->id = ret; 1681 root_cgrp->id = ret;
@@ -4579,7 +4588,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
4579 if (err) 4588 if (err)
4580 goto err_free_css; 4589 goto err_free_css;
4581 4590
4582 err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT); 4591 err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
4583 if (err < 0) 4592 if (err < 0)
4584 goto err_free_percpu_ref; 4593 goto err_free_percpu_ref;
4585 css->id = err; 4594 css->id = err;
@@ -4656,7 +4665,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4656 * Temporarily set the pointer to NULL, so idr_find() won't return 4665 * Temporarily set the pointer to NULL, so idr_find() won't return
4657 * a half-baked cgroup. 4666 * a half-baked cgroup.
4658 */ 4667 */
4659 cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT); 4668 cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
4660 if (cgrp->id < 0) { 4669 if (cgrp->id < 0) {
4661 ret = -ENOMEM; 4670 ret = -ENOMEM;
4662 goto out_cancel_ref; 4671 goto out_cancel_ref;
@@ -4955,6 +4964,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
4955 4964
4956 have_fork_callback |= (bool)ss->fork << ss->id; 4965 have_fork_callback |= (bool)ss->fork << ss->id;
4957 have_exit_callback |= (bool)ss->exit << ss->id; 4966 have_exit_callback |= (bool)ss->exit << ss->id;
4967 have_canfork_callback |= (bool)ss->can_fork << ss->id;
4958 4968
4959 /* At system boot, before all subsystems have been 4969 /* At system boot, before all subsystems have been
4960 * registered, no tasks have been forked, so we don't 4970 * registered, no tasks have been forked, so we don't
@@ -4993,6 +5003,8 @@ int __init cgroup_init_early(void)
4993 5003
4994 ss->id = i; 5004 ss->id = i;
4995 ss->name = cgroup_subsys_name[i]; 5005 ss->name = cgroup_subsys_name[i];
5006 if (!ss->legacy_name)
5007 ss->legacy_name = cgroup_subsys_name[i];
4996 5008
4997 if (ss->early_init) 5009 if (ss->early_init)
4998 cgroup_init_subsys(ss, true); 5010 cgroup_init_subsys(ss, true);
@@ -5136,9 +5148,11 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
5136 continue; 5148 continue;
5137 5149
5138 seq_printf(m, "%d:", root->hierarchy_id); 5150 seq_printf(m, "%d:", root->hierarchy_id);
5139 for_each_subsys(ss, ssid) 5151 if (root != &cgrp_dfl_root)
5140 if (root->subsys_mask & (1 << ssid)) 5152 for_each_subsys(ss, ssid)
5141 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 5153 if (root->subsys_mask & (1 << ssid))
5154 seq_printf(m, "%s%s", count++ ? "," : "",
5155 ss->legacy_name);
5142 if (strlen(root->name)) 5156 if (strlen(root->name))
5143 seq_printf(m, "%sname=%s", count ? "," : "", 5157 seq_printf(m, "%sname=%s", count ? "," : "",
5144 root->name); 5158 root->name);
@@ -5178,7 +5192,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
5178 5192
5179 for_each_subsys(ss, i) 5193 for_each_subsys(ss, i)
5180 seq_printf(m, "%s\t%d\t%d\t%d\n", 5194 seq_printf(m, "%s\t%d\t%d\t%d\n",
5181 ss->name, ss->root->hierarchy_id, 5195 ss->legacy_name, ss->root->hierarchy_id,
5182 atomic_read(&ss->root->nr_cgrps), !ss->disabled); 5196 atomic_read(&ss->root->nr_cgrps), !ss->disabled);
5183 5197
5184 mutex_unlock(&cgroup_mutex); 5198 mutex_unlock(&cgroup_mutex);
@@ -5197,6 +5211,19 @@ static const struct file_operations proc_cgroupstats_operations = {
5197 .release = single_release, 5211 .release = single_release,
5198}; 5212};
5199 5213
5214static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
5215{
5216 if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END)
5217 return &ss_priv[i - CGROUP_CANFORK_START];
5218 return NULL;
5219}
5220
5221static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
5222{
5223 void **private = subsys_canfork_priv_p(ss_priv, i);
5224 return private ? *private : NULL;
5225}
5226
5200/** 5227/**
5201 * cgroup_fork - initialize cgroup related fields during copy_process() 5228 * cgroup_fork - initialize cgroup related fields during copy_process()
5202 * @child: pointer to task_struct of forking parent process. 5229 * @child: pointer to task_struct of forking parent process.
@@ -5212,6 +5239,57 @@ void cgroup_fork(struct task_struct *child)
5212} 5239}
5213 5240
5214/** 5241/**
5242 * cgroup_can_fork - called on a new task before the process is exposed
5243 * @child: the task in question.
5244 *
5245 * This calls the subsystem can_fork() callbacks. If the can_fork() callback
 5246 * returns an error, the fork aborts with that error code. This allows
5247 * a cgroup subsystem to conditionally allow or deny new forks.
5248 */
5249int cgroup_can_fork(struct task_struct *child,
5250 void *ss_priv[CGROUP_CANFORK_COUNT])
5251{
5252 struct cgroup_subsys *ss;
5253 int i, j, ret;
5254
5255 for_each_subsys_which(ss, i, &have_canfork_callback) {
5256 ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i));
5257 if (ret)
5258 goto out_revert;
5259 }
5260
5261 return 0;
5262
5263out_revert:
5264 for_each_subsys(ss, j) {
5265 if (j >= i)
5266 break;
5267 if (ss->cancel_fork)
5268 ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j));
5269 }
5270
5271 return ret;
5272}
5273
5274/**
5275 * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
5276 * @child: the task in question
5277 *
5278 * This calls the cancel_fork() callbacks if a fork failed *after*
 5279 * cgroup_can_fork() succeeded.
5280 */
5281void cgroup_cancel_fork(struct task_struct *child,
5282 void *ss_priv[CGROUP_CANFORK_COUNT])
5283{
5284 struct cgroup_subsys *ss;
5285 int i;
5286
5287 for_each_subsys(ss, i)
5288 if (ss->cancel_fork)
5289 ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i));
5290}
5291
5292/**
5215 * cgroup_post_fork - called on a new task after adding it to the task list 5293 * cgroup_post_fork - called on a new task after adding it to the task list
5216 * @child: the task in question 5294 * @child: the task in question
5217 * 5295 *
@@ -5221,7 +5299,8 @@ void cgroup_fork(struct task_struct *child)
5221 * cgroup_task_iter_start() - to guarantee that the new task ends up on its 5299 * cgroup_task_iter_start() - to guarantee that the new task ends up on its
5222 * list. 5300 * list.
5223 */ 5301 */
5224void cgroup_post_fork(struct task_struct *child) 5302void cgroup_post_fork(struct task_struct *child,
5303 void *old_ss_priv[CGROUP_CANFORK_COUNT])
5225{ 5304{
5226 struct cgroup_subsys *ss; 5305 struct cgroup_subsys *ss;
5227 int i; 5306 int i;
@@ -5266,7 +5345,7 @@ void cgroup_post_fork(struct task_struct *child)
5266 * and addition to css_set. 5345 * and addition to css_set.
5267 */ 5346 */
5268 for_each_subsys_which(ss, i, &have_fork_callback) 5347 for_each_subsys_which(ss, i, &have_fork_callback)
5269 ss->fork(child); 5348 ss->fork(child, subsys_canfork_priv(old_ss_priv, i));
5270} 5349}
5271 5350
5272/** 5351/**
@@ -5400,12 +5479,14 @@ static int __init cgroup_disable(char *str)
5400 continue; 5479 continue;
5401 5480
5402 for_each_subsys(ss, i) { 5481 for_each_subsys(ss, i) {
5403 if (!strcmp(token, ss->name)) { 5482 if (strcmp(token, ss->name) &&
5404 ss->disabled = 1; 5483 strcmp(token, ss->legacy_name))
5405 printk(KERN_INFO "Disabling %s control group" 5484 continue;
5406 " subsystem\n", ss->name); 5485
5407 break; 5486 ss->disabled = 1;
5408 } 5487 printk(KERN_INFO "Disabling %s control group subsystem\n",
5488 ss->name);
5489 break;
5409 } 5490 }
5410 } 5491 }
5411 return 1; 5492 return 1;
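The cgroup.c hunks above route mount options, /proc/cgroups, /proc/<pid>/cgroup and the cgroup_disable= boot parameter through ss->legacy_name, falling back to the canonical name when a controller does not set one. A hypothetical controller declaration showing where that field would be set (the controller, its callbacks and both names are made up for illustration; only the legacy_name and legacy_cftypes fields come from the code above):

struct cgroup_subsys foo_cgrp_subsys = {
        .css_alloc      = foo_css_alloc,        /* hypothetical callbacks */
        .css_free       = foo_css_free,
        .legacy_cftypes = foo_files,
        /* listed as "foo" on the default hierarchy, but matched as
         * "foo_legacy" on legacy hierarchies and in cgroup_disable= */
        .legacy_name    = "foo_legacy",
};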
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 92b98cc0ee76..f1b30ad5dc6d 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -203,7 +203,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
203 * to do anything as freezer_attach() will put @task into the appropriate 203 * to do anything as freezer_attach() will put @task into the appropriate
204 * state. 204 * state.
205 */ 205 */
206static void freezer_fork(struct task_struct *task) 206static void freezer_fork(struct task_struct *task, void *private)
207{ 207{
208 struct freezer *freezer; 208 struct freezer *freezer;
209 209
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
new file mode 100644
index 000000000000..806cd7693ac8
--- /dev/null
+++ b/kernel/cgroup_pids.c
@@ -0,0 +1,355 @@
1/*
2 * Process number limiting controller for cgroups.
3 *
4 * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
5 * after a certain limit is reached.
6 *
 7 * Since it is trivial to hit the task limit without hitting any of the kmemcg limits
8 * in place, PIDs are a fundamental resource. As such, PID exhaustion must be
9 * preventable in the scope of a cgroup hierarchy by allowing resource limiting
10 * of the number of tasks in a cgroup.
11 *
12 * In order to use the `pids` controller, set the maximum number of tasks in
13 * pids.max (this is not available in the root cgroup for obvious reasons). The
14 * number of processes currently in the cgroup is given by pids.current.
15 * Organisational operations are not blocked by cgroup policies, so it is
16 * possible to have pids.current > pids.max. However, it is not possible to
17 * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
18 * would cause a cgroup policy to be violated.
19 *
20 * To set a cgroup to have no limit, set pids.max to "max". This is the default
21 * for all new cgroups (N.B. that PID limits are hierarchical, so the most
22 * stringent limit in the hierarchy is followed).
23 *
24 * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
25 * a superset of parent/child/pids.current.
26 *
27 * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
28 *
29 * This file is subject to the terms and conditions of version 2 of the GNU
30 * General Public License. See the file COPYING in the main directory of the
31 * Linux distribution for more details.
32 */
33
34#include <linux/kernel.h>
35#include <linux/threads.h>
36#include <linux/atomic.h>
37#include <linux/cgroup.h>
38#include <linux/slab.h>
39
40#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
41#define PIDS_MAX_STR "max"
42
43struct pids_cgroup {
44 struct cgroup_subsys_state css;
45
46 /*
47 * Use 64-bit types so that we can safely represent "max" as
48 * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
49 */
50 atomic64_t counter;
51 int64_t limit;
52};
53
54static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
55{
56 return container_of(css, struct pids_cgroup, css);
57}
58
59static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
60{
61 return css_pids(pids->css.parent);
62}
63
64static struct cgroup_subsys_state *
65pids_css_alloc(struct cgroup_subsys_state *parent)
66{
67 struct pids_cgroup *pids;
68
69 pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
70 if (!pids)
71 return ERR_PTR(-ENOMEM);
72
73 pids->limit = PIDS_MAX;
74 atomic64_set(&pids->counter, 0);
75 return &pids->css;
76}
77
78static void pids_css_free(struct cgroup_subsys_state *css)
79{
80 kfree(css_pids(css));
81}
82
83/**
84 * pids_cancel - uncharge the local pid count
85 * @pids: the pid cgroup state
86 * @num: the number of pids to cancel
87 *
88 * This function will WARN if the pid count goes under 0, because such a case is
89 * a bug in the pids controller proper.
90 */
91static void pids_cancel(struct pids_cgroup *pids, int num)
92{
93 /*
94 * A negative count (or overflow for that matter) is invalid,
95 * and indicates a bug in the `pids` controller proper.
96 */
97 WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
98}
99
100/**
101 * pids_uncharge - hierarchically uncharge the pid count
102 * @pids: the pid cgroup state
103 * @num: the number of pids to uncharge
104 */
105static void pids_uncharge(struct pids_cgroup *pids, int num)
106{
107 struct pids_cgroup *p;
108
109 for (p = pids; p; p = parent_pids(p))
110 pids_cancel(p, num);
111}
112
113/**
114 * pids_charge - hierarchically charge the pid count
115 * @pids: the pid cgroup state
116 * @num: the number of pids to charge
117 *
118 * This function does *not* follow the pid limit set. It cannot fail and the new
119 * pid count may exceed the limit. This is only used for reverting failed
120 * attaches, where there is no other way out than violating the limit.
121 */
122static void pids_charge(struct pids_cgroup *pids, int num)
123{
124 struct pids_cgroup *p;
125
126 for (p = pids; p; p = parent_pids(p))
127 atomic64_add(num, &p->counter);
128}
129
130/**
131 * pids_try_charge - hierarchically try to charge the pid count
132 * @pids: the pid cgroup state
133 * @num: the number of pids to charge
134 *
135 * This function follows the set limit. It will fail if the charge would cause
136 * the new value to exceed the hierarchical limit. Returns 0 if the charge
 137 * succeeded, otherwise -EAGAIN.
138 */
139static int pids_try_charge(struct pids_cgroup *pids, int num)
140{
141 struct pids_cgroup *p, *q;
142
143 for (p = pids; p; p = parent_pids(p)) {
144 int64_t new = atomic64_add_return(num, &p->counter);
145
146 /*
147 * Since new is capped to the maximum number of pid_t, if
148 * p->limit is %PIDS_MAX then we know that this test will never
149 * fail.
150 */
151 if (new > p->limit)
152 goto revert;
153 }
154
155 return 0;
156
157revert:
158 for (q = pids; q != p; q = parent_pids(q))
159 pids_cancel(q, num);
160 pids_cancel(p, num);
161
162 return -EAGAIN;
163}
164
165static int pids_can_attach(struct cgroup_subsys_state *css,
166 struct cgroup_taskset *tset)
167{
168 struct pids_cgroup *pids = css_pids(css);
169 struct task_struct *task;
170
171 cgroup_taskset_for_each(task, tset) {
172 struct cgroup_subsys_state *old_css;
173 struct pids_cgroup *old_pids;
174
175 /*
176 * No need to pin @old_css between here and cancel_attach()
177 * because cgroup core protects it from being freed before
178 * the migration completes or fails.
179 */
180 old_css = task_css(task, pids_cgrp_id);
181 old_pids = css_pids(old_css);
182
183 pids_charge(pids, 1);
184 pids_uncharge(old_pids, 1);
185 }
186
187 return 0;
188}
189
190static void pids_cancel_attach(struct cgroup_subsys_state *css,
191 struct cgroup_taskset *tset)
192{
193 struct pids_cgroup *pids = css_pids(css);
194 struct task_struct *task;
195
196 cgroup_taskset_for_each(task, tset) {
197 struct cgroup_subsys_state *old_css;
198 struct pids_cgroup *old_pids;
199
200 old_css = task_css(task, pids_cgrp_id);
201 old_pids = css_pids(old_css);
202
203 pids_charge(old_pids, 1);
204 pids_uncharge(pids, 1);
205 }
206}
207
208static int pids_can_fork(struct task_struct *task, void **priv_p)
209{
210 struct cgroup_subsys_state *css;
211 struct pids_cgroup *pids;
212 int err;
213
214 /*
215 * Use the "current" task_css for the pids subsystem as the tentative
216 * css. It is possible we will charge the wrong hierarchy, in which
217 * case we will forcefully revert/reapply the charge on the right
218 * hierarchy after it is committed to the task proper.
219 */
220 css = task_get_css(current, pids_cgrp_id);
221 pids = css_pids(css);
222
223 err = pids_try_charge(pids, 1);
224 if (err)
225 goto err_css_put;
226
227 *priv_p = css;
228 return 0;
229
230err_css_put:
231 css_put(css);
232 return err;
233}
234
235static void pids_cancel_fork(struct task_struct *task, void *priv)
236{
237 struct cgroup_subsys_state *css = priv;
238 struct pids_cgroup *pids = css_pids(css);
239
240 pids_uncharge(pids, 1);
241 css_put(css);
242}
243
244static void pids_fork(struct task_struct *task, void *priv)
245{
246 struct cgroup_subsys_state *css;
247 struct cgroup_subsys_state *old_css = priv;
248 struct pids_cgroup *pids;
249 struct pids_cgroup *old_pids = css_pids(old_css);
250
251 css = task_get_css(task, pids_cgrp_id);
252 pids = css_pids(css);
253
254 /*
255 * If the association has changed, we have to revert and reapply the
256 * charge/uncharge on the wrong hierarchy to the current one. Since
 257 * the association can only change due to an organisation event, it's
258 * okay for us to ignore the limit in this case.
259 */
260 if (pids != old_pids) {
261 pids_uncharge(old_pids, 1);
262 pids_charge(pids, 1);
263 }
264
265 css_put(css);
266 css_put(old_css);
267}
268
269static void pids_exit(struct cgroup_subsys_state *css,
270 struct cgroup_subsys_state *old_css,
271 struct task_struct *task)
272{
273 struct pids_cgroup *pids = css_pids(old_css);
274
275 pids_uncharge(pids, 1);
276}
277
278static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
279 size_t nbytes, loff_t off)
280{
281 struct cgroup_subsys_state *css = of_css(of);
282 struct pids_cgroup *pids = css_pids(css);
283 int64_t limit;
284 int err;
285
286 buf = strstrip(buf);
287 if (!strcmp(buf, PIDS_MAX_STR)) {
288 limit = PIDS_MAX;
289 goto set_limit;
290 }
291
292 err = kstrtoll(buf, 0, &limit);
293 if (err)
294 return err;
295
296 if (limit < 0 || limit >= PIDS_MAX)
297 return -EINVAL;
298
299set_limit:
300 /*
301 * Limit updates don't need to be mutex'd, since it isn't
302 * critical that any racing fork()s follow the new limit.
303 */
304 pids->limit = limit;
305 return nbytes;
306}
307
308static int pids_max_show(struct seq_file *sf, void *v)
309{
310 struct cgroup_subsys_state *css = seq_css(sf);
311 struct pids_cgroup *pids = css_pids(css);
312 int64_t limit = pids->limit;
313
314 if (limit >= PIDS_MAX)
315 seq_printf(sf, "%s\n", PIDS_MAX_STR);
316 else
317 seq_printf(sf, "%lld\n", limit);
318
319 return 0;
320}
321
322static s64 pids_current_read(struct cgroup_subsys_state *css,
323 struct cftype *cft)
324{
325 struct pids_cgroup *pids = css_pids(css);
326
327 return atomic64_read(&pids->counter);
328}
329
330static struct cftype pids_files[] = {
331 {
332 .name = "max",
333 .write = pids_max_write,
334 .seq_show = pids_max_show,
335 .flags = CFTYPE_NOT_ON_ROOT,
336 },
337 {
338 .name = "current",
339 .read_s64 = pids_current_read,
340 },
341 { } /* terminate */
342};
343
344struct cgroup_subsys pids_cgrp_subsys = {
345 .css_alloc = pids_css_alloc,
346 .css_free = pids_css_free,
347 .can_attach = pids_can_attach,
348 .cancel_attach = pids_cancel_attach,
349 .can_fork = pids_can_fork,
350 .cancel_fork = pids_cancel_fork,
351 .fork = pids_fork,
352 .exit = pids_exit,
353 .legacy_cftypes = pids_files,
354 .dfl_cftypes = pids_files,
355};
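The header comment of cgroup_pids.c describes the pids.max / pids.current interface in prose; a minimal user-space sketch of exercising it could look like the following (the mount point and the "demo" cgroup are assumptions for illustration, not part of the patch, and the pids controller must already be enabled for that cgroup):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[32];
        ssize_t n;
        int fd;

        /* Cap the cgroup at 64 tasks; writing "max" would remove the limit. */
        fd = open("/sys/fs/cgroup/demo/pids.max", O_WRONLY);
        if (fd < 0 || write(fd, "64", 2) != 2)
                perror("pids.max");
        if (fd >= 0)
                close(fd);

        /* Read back how many tasks are currently charged to the cgroup. */
        fd = open("/sys/fs/cgroup/demo/pids.current", O_RDONLY);
        if (fd >= 0) {
                n = read(fd, buf, sizeof(buf) - 1);
                if (n > 0) {
                        buf[n] = '\0';
                        printf("pids.current: %s", buf);
                }
                close(fd);
        }
        return 0;
}

Once the limit is reached, further fork()/clone() calls in that cgroup fail with -EAGAIN, as described in the header comment.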
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6a374544d495..82cf9dff4295 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -191,21 +191,22 @@ void cpu_hotplug_done(void)
191void cpu_hotplug_disable(void) 191void cpu_hotplug_disable(void)
192{ 192{
193 cpu_maps_update_begin(); 193 cpu_maps_update_begin();
194 cpu_hotplug_disabled = 1; 194 cpu_hotplug_disabled++;
195 cpu_maps_update_done(); 195 cpu_maps_update_done();
196} 196}
197EXPORT_SYMBOL_GPL(cpu_hotplug_disable);
197 198
198void cpu_hotplug_enable(void) 199void cpu_hotplug_enable(void)
199{ 200{
200 cpu_maps_update_begin(); 201 cpu_maps_update_begin();
201 cpu_hotplug_disabled = 0; 202 WARN_ON(--cpu_hotplug_disabled < 0);
202 cpu_maps_update_done(); 203 cpu_maps_update_done();
203} 204}
204 205EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
205#endif /* CONFIG_HOTPLUG_CPU */ 206#endif /* CONFIG_HOTPLUG_CPU */
206 207
207/* Need to know about CPUs going up/down? */ 208/* Need to know about CPUs going up/down? */
208int __ref register_cpu_notifier(struct notifier_block *nb) 209int register_cpu_notifier(struct notifier_block *nb)
209{ 210{
210 int ret; 211 int ret;
211 cpu_maps_update_begin(); 212 cpu_maps_update_begin();
@@ -214,7 +215,7 @@ int __ref register_cpu_notifier(struct notifier_block *nb)
214 return ret; 215 return ret;
215} 216}
216 217
217int __ref __register_cpu_notifier(struct notifier_block *nb) 218int __register_cpu_notifier(struct notifier_block *nb)
218{ 219{
219 return raw_notifier_chain_register(&cpu_chain, nb); 220 return raw_notifier_chain_register(&cpu_chain, nb);
220} 221}
@@ -244,7 +245,7 @@ static void cpu_notify_nofail(unsigned long val, void *v)
244EXPORT_SYMBOL(register_cpu_notifier); 245EXPORT_SYMBOL(register_cpu_notifier);
245EXPORT_SYMBOL(__register_cpu_notifier); 246EXPORT_SYMBOL(__register_cpu_notifier);
246 247
247void __ref unregister_cpu_notifier(struct notifier_block *nb) 248void unregister_cpu_notifier(struct notifier_block *nb)
248{ 249{
249 cpu_maps_update_begin(); 250 cpu_maps_update_begin();
250 raw_notifier_chain_unregister(&cpu_chain, nb); 251 raw_notifier_chain_unregister(&cpu_chain, nb);
@@ -252,7 +253,7 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb)
252} 253}
253EXPORT_SYMBOL(unregister_cpu_notifier); 254EXPORT_SYMBOL(unregister_cpu_notifier);
254 255
255void __ref __unregister_cpu_notifier(struct notifier_block *nb) 256void __unregister_cpu_notifier(struct notifier_block *nb)
256{ 257{
257 raw_notifier_chain_unregister(&cpu_chain, nb); 258 raw_notifier_chain_unregister(&cpu_chain, nb);
258} 259}
@@ -329,7 +330,7 @@ struct take_cpu_down_param {
329}; 330};
330 331
331/* Take this CPU down. */ 332/* Take this CPU down. */
332static int __ref take_cpu_down(void *_param) 333static int take_cpu_down(void *_param)
333{ 334{
334 struct take_cpu_down_param *param = _param; 335 struct take_cpu_down_param *param = _param;
335 int err; 336 int err;
@@ -348,7 +349,7 @@ static int __ref take_cpu_down(void *_param)
348} 349}
349 350
350/* Requires cpu_add_remove_lock to be held */ 351/* Requires cpu_add_remove_lock to be held */
351static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) 352static int _cpu_down(unsigned int cpu, int tasks_frozen)
352{ 353{
353 int err, nr_calls = 0; 354 int err, nr_calls = 0;
354 void *hcpu = (void *)(long)cpu; 355 void *hcpu = (void *)(long)cpu;
@@ -381,14 +382,14 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
381 * will observe it. 382 * will observe it.
382 * 383 *
383 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might 384 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
384 * not imply sync_sched(), so explicitly call both. 385 * not imply sync_sched(), so wait for both.
385 * 386 *
386 * Do sync before park smpboot threads to take care the rcu boost case. 387 * Do sync before park smpboot threads to take care the rcu boost case.
387 */ 388 */
388#ifdef CONFIG_PREEMPT 389 if (IS_ENABLED(CONFIG_PREEMPT))
389 synchronize_sched(); 390 synchronize_rcu_mult(call_rcu, call_rcu_sched);
390#endif 391 else
391 synchronize_rcu(); 392 synchronize_rcu();
392 393
393 smpboot_park_threads(cpu); 394 smpboot_park_threads(cpu);
394 395
@@ -401,7 +402,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
401 /* 402 /*
402 * So now all preempt/rcu users must observe !cpu_active(). 403 * So now all preempt/rcu users must observe !cpu_active().
403 */ 404 */
404 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 405 err = stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
405 if (err) { 406 if (err) {
406 /* CPU didn't die: tell everyone. Can't complain. */ 407 /* CPU didn't die: tell everyone. Can't complain. */
407 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); 408 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
@@ -442,7 +443,7 @@ out_release:
442 return err; 443 return err;
443} 444}
444 445
445int __ref cpu_down(unsigned int cpu) 446int cpu_down(unsigned int cpu)
446{ 447{
447 int err; 448 int err;
448 449
@@ -527,18 +528,9 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen)
527 goto out_notify; 528 goto out_notify;
528 } 529 }
529 530
530 /*
531 * Some architectures have to walk the irq descriptors to
532 * setup the vector space for the cpu which comes online.
533 * Prevent irq alloc/free across the bringup.
534 */
535 irq_lock_sparse();
536
537 /* Arch-specific enabling code. */ 531 /* Arch-specific enabling code. */
538 ret = __cpu_up(cpu, idle); 532 ret = __cpu_up(cpu, idle);
539 533
540 irq_unlock_sparse();
541
542 if (ret != 0) 534 if (ret != 0)
543 goto out_notify; 535 goto out_notify;
544 BUG_ON(!cpu_online(cpu)); 536 BUG_ON(!cpu_online(cpu));
@@ -617,13 +609,18 @@ int disable_nonboot_cpus(void)
617 } 609 }
618 } 610 }
619 611
620 if (!error) { 612 if (!error)
621 BUG_ON(num_online_cpus() > 1); 613 BUG_ON(num_online_cpus() > 1);
622 /* Make sure the CPUs won't be enabled by someone else */ 614 else
623 cpu_hotplug_disabled = 1;
624 } else {
625 pr_err("Non-boot CPUs are not disabled\n"); 615 pr_err("Non-boot CPUs are not disabled\n");
626 } 616
617 /*
618 * Make sure the CPUs won't be enabled by someone else. We need to do
619 * this even in case of failure as all disable_nonboot_cpus() users are
620 * supposed to do enable_nonboot_cpus() on the failure path.
621 */
622 cpu_hotplug_disabled++;
623
627 cpu_maps_update_done(); 624 cpu_maps_update_done();
628 return error; 625 return error;
629} 626}
@@ -636,13 +633,13 @@ void __weak arch_enable_nonboot_cpus_end(void)
636{ 633{
637} 634}
638 635
639void __ref enable_nonboot_cpus(void) 636void enable_nonboot_cpus(void)
640{ 637{
641 int cpu, error; 638 int cpu, error;
642 639
643 /* Allow everyone to use the CPU hotplug again */ 640 /* Allow everyone to use the CPU hotplug again */
644 cpu_maps_update_begin(); 641 cpu_maps_update_begin();
645 cpu_hotplug_disabled = 0; 642 WARN_ON(--cpu_hotplug_disabled < 0);
646 if (cpumask_empty(frozen_cpus)) 643 if (cpumask_empty(frozen_cpus))
647 goto out; 644 goto out;
648 645
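The cpu.c changes above turn cpu_hotplug_disabled from a boolean into a reference count, so cpu_hotplug_disable()/cpu_hotplug_enable() pairs may nest (and disable_nonboot_cpus() now always takes a reference, even on failure). A kernel-side sketch of the calling pattern this permits (the helpers and the starting counter value are illustrative only):

static void quiesce_and_do_work(void)
{
        cpu_hotplug_disable();          /* e.g. counter 0 -> 1 */
        do_outer_work();                /* hypothetical helper */

        cpu_hotplug_disable();          /* 1 -> 2: nested disable is now safe */
        do_inner_work();                /* hypothetical helper */
        cpu_hotplug_enable();           /* 2 -> 1: hotplug stays blocked */

        cpu_hotplug_enable();           /* 1 -> 0: hotplug allowed again */
}

With the old boolean, the inner cpu_hotplug_enable() would have re-enabled hotplug while the outer section still expected it to be blocked.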
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 9656a3c36503..009cc9a17d95 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -180,7 +180,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
180 * low power state that may have caused some blocks in the same power domain 180 * low power state that may have caused some blocks in the same power domain
181 * to reset. 181 * to reset.
182 * 182 *
183 * Must be called after cpu_pm_exit has been called on all cpus in the power 183 * Must be called after cpu_cluster_pm_enter has been called for the power
184 * domain, and before cpu_pm_exit has been called on any cpu in the power 184 * domain, and before cpu_pm_exit has been called on any cpu in the power
185 * domain. Notified drivers can include VFP co-processor, interrupt controller 185 * domain. Notified drivers can include VFP co-processor, interrupt controller
186 * and its PM extensions, local CPU timers context save/restore which 186 * and its PM extensions, local CPU timers context save/restore which
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ee14e3a35a29..f0acff0f66c9 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1223,7 +1223,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1223 spin_unlock_irq(&callback_lock); 1223 spin_unlock_irq(&callback_lock);
1224 1224
1225 /* use trialcs->mems_allowed as a temp variable */ 1225 /* use trialcs->mems_allowed as a temp variable */
1226 update_nodemasks_hier(cs, &cs->mems_allowed); 1226 update_nodemasks_hier(cs, &trialcs->mems_allowed);
1227done: 1227done:
1228 return retval; 1228 return retval;
1229} 1229}
diff --git a/kernel/cred.c b/kernel/cred.c
index ec1c07667ec1..71179a09c1d6 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -20,11 +20,16 @@
20#include <linux/cn_proc.h> 20#include <linux/cn_proc.h>
21 21
22#if 0 22#if 0
23#define kdebug(FMT, ...) \ 23#define kdebug(FMT, ...) \
24 printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) 24 printk("[%-5.5s%5u] " FMT "\n", \
25 current->comm, current->pid, ##__VA_ARGS__)
25#else 26#else
26#define kdebug(FMT, ...) \ 27#define kdebug(FMT, ...) \
27 no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) 28do { \
29 if (0) \
30 no_printk("[%-5.5s%5u] " FMT "\n", \
31 current->comm, current->pid, ##__VA_ARGS__); \
32} while (0)
28#endif 33#endif
29 34
30static struct kmem_cache *cred_jar; 35static struct kmem_cache *cred_jar;
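The cred.c hunk rewrites the disabled kdebug() as do { if (0) no_printk(...); } while (0), which keeps the format string and arguments visible to the compiler for type checking while ensuring they are never evaluated at run time. The same idiom in a stand-alone user-space form (DEBUG and dbg() are illustrative names, not taken from the patch):

#include <stdio.h>

#ifdef DEBUG
#define dbg(fmt, ...) printf("[dbg] " fmt "\n", ##__VA_ARGS__)
#else
#define dbg(fmt, ...) \
do { \
        if (0) \
                printf("[dbg] " fmt "\n", ##__VA_ARGS__); \
} while (0)
#endif

int main(void)
{
        int users = 3;

        /* Compiles and format-checks either way, prints only with -DDEBUG. */
        dbg("cred users=%d", users);
        return 0;
}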
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d3dae3419b99..f548f69c4299 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -163,6 +163,7 @@ static atomic_t nr_mmap_events __read_mostly;
163static atomic_t nr_comm_events __read_mostly; 163static atomic_t nr_comm_events __read_mostly;
164static atomic_t nr_task_events __read_mostly; 164static atomic_t nr_task_events __read_mostly;
165static atomic_t nr_freq_events __read_mostly; 165static atomic_t nr_freq_events __read_mostly;
166static atomic_t nr_switch_events __read_mostly;
166 167
167static LIST_HEAD(pmus); 168static LIST_HEAD(pmus);
168static DEFINE_MUTEX(pmus_lock); 169static DEFINE_MUTEX(pmus_lock);
@@ -1868,8 +1869,6 @@ event_sched_in(struct perf_event *event,
1868 1869
1869 perf_pmu_disable(event->pmu); 1870 perf_pmu_disable(event->pmu);
1870 1871
1871 event->tstamp_running += tstamp - event->tstamp_stopped;
1872
1873 perf_set_shadow_time(event, ctx, tstamp); 1872 perf_set_shadow_time(event, ctx, tstamp);
1874 1873
1875 perf_log_itrace_start(event); 1874 perf_log_itrace_start(event);
@@ -1881,6 +1880,8 @@ event_sched_in(struct perf_event *event,
1881 goto out; 1880 goto out;
1882 } 1881 }
1883 1882
1883 event->tstamp_running += tstamp - event->tstamp_stopped;
1884
1884 if (!is_software_event(event)) 1885 if (!is_software_event(event))
1885 cpuctx->active_oncpu++; 1886 cpuctx->active_oncpu++;
1886 if (!ctx->nr_active++) 1887 if (!ctx->nr_active++)
@@ -2619,6 +2620,9 @@ static void perf_pmu_sched_task(struct task_struct *prev,
2619 local_irq_restore(flags); 2620 local_irq_restore(flags);
2620} 2621}
2621 2622
2623static void perf_event_switch(struct task_struct *task,
2624 struct task_struct *next_prev, bool sched_in);
2625
2622#define for_each_task_context_nr(ctxn) \ 2626#define for_each_task_context_nr(ctxn) \
2623 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) 2627 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
2624 2628
@@ -2641,6 +2645,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
2641 if (__this_cpu_read(perf_sched_cb_usages)) 2645 if (__this_cpu_read(perf_sched_cb_usages))
2642 perf_pmu_sched_task(task, next, false); 2646 perf_pmu_sched_task(task, next, false);
2643 2647
2648 if (atomic_read(&nr_switch_events))
2649 perf_event_switch(task, next, false);
2650
2644 for_each_task_context_nr(ctxn) 2651 for_each_task_context_nr(ctxn)
2645 perf_event_context_sched_out(task, ctxn, next); 2652 perf_event_context_sched_out(task, ctxn, next);
2646 2653
@@ -2831,6 +2838,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
2831 if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) 2838 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2832 perf_cgroup_sched_in(prev, task); 2839 perf_cgroup_sched_in(prev, task);
2833 2840
2841 if (atomic_read(&nr_switch_events))
2842 perf_event_switch(task, prev, true);
2843
2834 if (__this_cpu_read(perf_sched_cb_usages)) 2844 if (__this_cpu_read(perf_sched_cb_usages))
2835 perf_pmu_sched_task(prev, task, true); 2845 perf_pmu_sched_task(prev, task, true);
2836} 2846}
@@ -3212,6 +3222,59 @@ static inline u64 perf_event_count(struct perf_event *event)
3212 return __perf_event_count(event); 3222 return __perf_event_count(event);
3213} 3223}
3214 3224
3225/*
3226 * NMI-safe method to read a local event, that is an event that
3227 * is:
3228 * - either for the current task, or for this CPU
 3229 * - does not have inherit set, because inherited task events
 3230 * will not be local and we cannot read them atomically
3231 * - must not have a pmu::count method
3232 */
3233u64 perf_event_read_local(struct perf_event *event)
3234{
3235 unsigned long flags;
3236 u64 val;
3237
3238 /*
3239 * Disabling interrupts avoids all counter scheduling (context
3240 * switches, timer based rotation and IPIs).
3241 */
3242 local_irq_save(flags);
3243
3244 /* If this is a per-task event, it must be for current */
3245 WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
3246 event->hw.target != current);
3247
3248 /* If this is a per-CPU event, it must be for this CPU */
3249 WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
3250 event->cpu != smp_processor_id());
3251
3252 /*
3253 * It must not be an event with inherit set, we cannot read
3254 * all child counters from atomic context.
3255 */
3256 WARN_ON_ONCE(event->attr.inherit);
3257
3258 /*
3259 * It must not have a pmu::count method, those are not
3260 * NMI safe.
3261 */
3262 WARN_ON_ONCE(event->pmu->count);
3263
3264 /*
3265 * If the event is currently on this CPU, its either a per-task event,
3266 * or local to this CPU. Furthermore it means its ACTIVE (otherwise
3267 * oncpu == -1).
3268 */
3269 if (event->oncpu == smp_processor_id())
3270 event->pmu->read(event);
3271
3272 val = local64_read(&event->count);
3273 local_irq_restore(flags);
3274
3275 return val;
3276}
3277
3215static u64 perf_event_read(struct perf_event *event) 3278static u64 perf_event_read(struct perf_event *event)
3216{ 3279{
3217 /* 3280 /*
@@ -3454,6 +3517,10 @@ static void unaccount_event(struct perf_event *event)
3454 atomic_dec(&nr_task_events); 3517 atomic_dec(&nr_task_events);
3455 if (event->attr.freq) 3518 if (event->attr.freq)
3456 atomic_dec(&nr_freq_events); 3519 atomic_dec(&nr_freq_events);
3520 if (event->attr.context_switch) {
3521 static_key_slow_dec_deferred(&perf_sched_events);
3522 atomic_dec(&nr_switch_events);
3523 }
3457 if (is_cgroup_event(event)) 3524 if (is_cgroup_event(event))
3458 static_key_slow_dec_deferred(&perf_sched_events); 3525 static_key_slow_dec_deferred(&perf_sched_events);
3459 if (has_branch_stack(event)) 3526 if (has_branch_stack(event))
@@ -3958,28 +4025,21 @@ static void perf_event_for_each(struct perf_event *event,
3958 perf_event_for_each_child(sibling, func); 4025 perf_event_for_each_child(sibling, func);
3959} 4026}
3960 4027
3961static int perf_event_period(struct perf_event *event, u64 __user *arg) 4028struct period_event {
3962{ 4029 struct perf_event *event;
3963 struct perf_event_context *ctx = event->ctx;
3964 int ret = 0, active;
3965 u64 value; 4030 u64 value;
4031};
3966 4032
3967 if (!is_sampling_event(event)) 4033static int __perf_event_period(void *info)
3968 return -EINVAL; 4034{
3969 4035 struct period_event *pe = info;
3970 if (copy_from_user(&value, arg, sizeof(value))) 4036 struct perf_event *event = pe->event;
3971 return -EFAULT; 4037 struct perf_event_context *ctx = event->ctx;
3972 4038 u64 value = pe->value;
3973 if (!value) 4039 bool active;
3974 return -EINVAL;
3975 4040
3976 raw_spin_lock_irq(&ctx->lock); 4041 raw_spin_lock(&ctx->lock);
3977 if (event->attr.freq) { 4042 if (event->attr.freq) {
3978 if (value > sysctl_perf_event_sample_rate) {
3979 ret = -EINVAL;
3980 goto unlock;
3981 }
3982
3983 event->attr.sample_freq = value; 4043 event->attr.sample_freq = value;
3984 } else { 4044 } else {
3985 event->attr.sample_period = value; 4045 event->attr.sample_period = value;
@@ -3998,11 +4058,53 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
3998 event->pmu->start(event, PERF_EF_RELOAD); 4058 event->pmu->start(event, PERF_EF_RELOAD);
3999 perf_pmu_enable(ctx->pmu); 4059 perf_pmu_enable(ctx->pmu);
4000 } 4060 }
4061 raw_spin_unlock(&ctx->lock);
4001 4062
4002unlock: 4063 return 0;
4064}
4065
4066static int perf_event_period(struct perf_event *event, u64 __user *arg)
4067{
4068 struct period_event pe = { .event = event, };
4069 struct perf_event_context *ctx = event->ctx;
4070 struct task_struct *task;
4071 u64 value;
4072
4073 if (!is_sampling_event(event))
4074 return -EINVAL;
4075
4076 if (copy_from_user(&value, arg, sizeof(value)))
4077 return -EFAULT;
4078
4079 if (!value)
4080 return -EINVAL;
4081
4082 if (event->attr.freq && value > sysctl_perf_event_sample_rate)
4083 return -EINVAL;
4084
4085 task = ctx->task;
4086 pe.value = value;
4087
4088 if (!task) {
4089 cpu_function_call(event->cpu, __perf_event_period, &pe);
4090 return 0;
4091 }
4092
4093retry:
4094 if (!task_function_call(task, __perf_event_period, &pe))
4095 return 0;
4096
4097 raw_spin_lock_irq(&ctx->lock);
4098 if (ctx->is_active) {
4099 raw_spin_unlock_irq(&ctx->lock);
4100 task = ctx->task;
4101 goto retry;
4102 }
4103
4104 __perf_event_period(&pe);
4003 raw_spin_unlock_irq(&ctx->lock); 4105 raw_spin_unlock_irq(&ctx->lock);
4004 4106
4005 return ret; 4107 return 0;
4006} 4108}
4007 4109
4008static const struct file_operations perf_fops; 4110static const struct file_operations perf_fops;
@@ -4740,12 +4842,20 @@ static const struct file_operations perf_fops = {
4740 * to user-space before waking everybody up. 4842 * to user-space before waking everybody up.
4741 */ 4843 */
4742 4844
4845static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
4846{
4847 /* only the parent has fasync state */
4848 if (event->parent)
4849 event = event->parent;
4850 return &event->fasync;
4851}
4852
4743void perf_event_wakeup(struct perf_event *event) 4853void perf_event_wakeup(struct perf_event *event)
4744{ 4854{
4745 ring_buffer_wakeup(event); 4855 ring_buffer_wakeup(event);
4746 4856
4747 if (event->pending_kill) { 4857 if (event->pending_kill) {
4748 kill_fasync(&event->fasync, SIGIO, event->pending_kill); 4858 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
4749 event->pending_kill = 0; 4859 event->pending_kill = 0;
4750 } 4860 }
4751} 4861}
@@ -5982,6 +6092,91 @@ void perf_log_lost_samples(struct perf_event *event, u64 lost)
5982} 6092}
5983 6093
5984/* 6094/*
6095 * context_switch tracking
6096 */
6097
6098struct perf_switch_event {
6099 struct task_struct *task;
6100 struct task_struct *next_prev;
6101
6102 struct {
6103 struct perf_event_header header;
6104 u32 next_prev_pid;
6105 u32 next_prev_tid;
6106 } event_id;
6107};
6108
6109static int perf_event_switch_match(struct perf_event *event)
6110{
6111 return event->attr.context_switch;
6112}
6113
6114static void perf_event_switch_output(struct perf_event *event, void *data)
6115{
6116 struct perf_switch_event *se = data;
6117 struct perf_output_handle handle;
6118 struct perf_sample_data sample;
6119 int ret;
6120
6121 if (!perf_event_switch_match(event))
6122 return;
6123
6124 /* Only CPU-wide events are allowed to see next/prev pid/tid */
6125 if (event->ctx->task) {
6126 se->event_id.header.type = PERF_RECORD_SWITCH;
6127 se->event_id.header.size = sizeof(se->event_id.header);
6128 } else {
6129 se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
6130 se->event_id.header.size = sizeof(se->event_id);
6131 se->event_id.next_prev_pid =
6132 perf_event_pid(event, se->next_prev);
6133 se->event_id.next_prev_tid =
6134 perf_event_tid(event, se->next_prev);
6135 }
6136
6137 perf_event_header__init_id(&se->event_id.header, &sample, event);
6138
6139 ret = perf_output_begin(&handle, event, se->event_id.header.size);
6140 if (ret)
6141 return;
6142
6143 if (event->ctx->task)
6144 perf_output_put(&handle, se->event_id.header);
6145 else
6146 perf_output_put(&handle, se->event_id);
6147
6148 perf_event__output_id_sample(event, &handle, &sample);
6149
6150 perf_output_end(&handle);
6151}
6152
6153static void perf_event_switch(struct task_struct *task,
6154 struct task_struct *next_prev, bool sched_in)
6155{
6156 struct perf_switch_event switch_event;
6157
6158 /* N.B. caller checks nr_switch_events != 0 */
6159
6160 switch_event = (struct perf_switch_event){
6161 .task = task,
6162 .next_prev = next_prev,
6163 .event_id = {
6164 .header = {
6165 /* .type */
6166 .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
6167 /* .size */
6168 },
6169 /* .next_prev_pid */
6170 /* .next_prev_tid */
6171 },
6172 };
6173
6174 perf_event_aux(perf_event_switch_output,
6175 &switch_event,
6176 NULL);
6177}
6178
6179/*
5985 * IRQ throttle logging 6180 * IRQ throttle logging
5986 */ 6181 */
5987 6182
@@ -6040,8 +6235,6 @@ static void perf_log_itrace_start(struct perf_event *event)
6040 event->hw.itrace_started) 6235 event->hw.itrace_started)
6041 return; 6236 return;
6042 6237
6043 event->hw.itrace_started = 1;
6044
6045 rec.header.type = PERF_RECORD_ITRACE_START; 6238 rec.header.type = PERF_RECORD_ITRACE_START;
6046 rec.header.misc = 0; 6239 rec.header.misc = 0;
6047 rec.header.size = sizeof(rec); 6240 rec.header.size = sizeof(rec);
@@ -6124,7 +6317,7 @@ static int __perf_event_overflow(struct perf_event *event,
6124 else 6317 else
6125 perf_event_output(event, data, regs); 6318 perf_event_output(event, data, regs);
6126 6319
6127 if (event->fasync && event->pending_kill) { 6320 if (*perf_event_fasync(event) && event->pending_kill) {
6128 event->pending_wakeup = 1; 6321 event->pending_wakeup = 1;
6129 irq_work_queue(&event->pending); 6322 irq_work_queue(&event->pending);
6130 } 6323 }
@@ -6749,8 +6942,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
6749 if (event->tp_event->prog) 6942 if (event->tp_event->prog)
6750 return -EEXIST; 6943 return -EEXIST;
6751 6944
6752 if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) 6945 if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE))
6753 /* bpf programs can only be attached to kprobes */ 6946 /* bpf programs can only be attached to u/kprobes */
6754 return -EINVAL; 6947 return -EINVAL;
6755 6948
6756 prog = bpf_prog_get(prog_fd); 6949 prog = bpf_prog_get(prog_fd);
@@ -7479,6 +7672,10 @@ static void account_event(struct perf_event *event)
7479 if (atomic_inc_return(&nr_freq_events) == 1) 7672 if (atomic_inc_return(&nr_freq_events) == 1)
7480 tick_nohz_full_kick_all(); 7673 tick_nohz_full_kick_all();
7481 } 7674 }
7675 if (event->attr.context_switch) {
7676 atomic_inc(&nr_switch_events);
7677 static_key_slow_inc(&perf_sched_events.key);
7678 }
7482 if (has_branch_stack(event)) 7679 if (has_branch_stack(event))
7483 static_key_slow_inc(&perf_sched_events.key); 7680 static_key_slow_inc(&perf_sched_events.key);
7484 if (is_cgroup_event(event)) 7681 if (is_cgroup_event(event))
@@ -8574,6 +8771,31 @@ void perf_event_delayed_put(struct task_struct *task)
8574 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); 8771 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
8575} 8772}
8576 8773
8774struct perf_event *perf_event_get(unsigned int fd)
8775{
8776 int err;
8777 struct fd f;
8778 struct perf_event *event;
8779
8780 err = perf_fget_light(fd, &f);
8781 if (err)
8782 return ERR_PTR(err);
8783
8784 event = f.file->private_data;
8785 atomic_long_inc(&event->refcount);
8786 fdput(f);
8787
8788 return event;
8789}
8790
8791const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
8792{
8793 if (!event)
8794 return ERR_PTR(-EINVAL);
8795
8796 return &event->attr;
8797}
8798
8577/* 8799/*
8578 * inherit a event from parent task to child task: 8800 * inherit a event from parent task to child task:
8579 */ 8801 */
@@ -8872,7 +9094,7 @@ static void perf_event_init_cpu(int cpu)
8872 mutex_unlock(&swhash->hlist_mutex); 9094 mutex_unlock(&swhash->hlist_mutex);
8873} 9095}
8874 9096
8875#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC 9097#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
8876static void __perf_event_exit_context(void *__info) 9098static void __perf_event_exit_context(void *__info)
8877{ 9099{
8878 struct remove_event re = { .detach_group = true }; 9100 struct remove_event re = { .detach_group = true };
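The events/core.c hunks above add an nr_switch_events count and emit PERF_RECORD_SWITCH (or PERF_RECORD_SWITCH_CPU_WIDE, which also carries next/prev pid/tid, for CPU-wide events) from the sched_out/sched_in paths. A minimal user-space sketch of requesting these records, assuming a kernel with the attr.context_switch bit used in the diff (error handling and the ring-buffer mmap are elided):

#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        struct perf_event_attr attr;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_SOFTWARE;
        attr.config = PERF_COUNT_SW_DUMMY;      /* no samples, side-band records only */
        attr.context_switch = 1;                /* ask for PERF_RECORD_SWITCH */

        /* pid = 0, cpu = -1: this task's context switches on any CPU */
        fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }
        /* ... mmap the fd and parse PERF_RECORD_SWITCH records here ... */
        close(fd);
        return 0;
}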
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index b2be01b1aa9d..182bc30899d5 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -437,7 +437,10 @@ static struct page *rb_alloc_aux_page(int node, int order)
437 437
438 if (page && order) { 438 if (page && order) {
439 /* 439 /*
440 * Communicate the allocation size to the driver 440 * Communicate the allocation size to the driver:
441 * if we managed to secure a high-order allocation,
442 * set its first page's private to this order;
443 * !PagePrivate(page) means it's just a normal page.
441 */ 444 */
442 split_page(page, order); 445 split_page(page, order);
443 SetPagePrivate(page); 446 SetPagePrivate(page);
@@ -559,11 +562,13 @@ static void __rb_free_aux(struct ring_buffer *rb)
559 rb->aux_priv = NULL; 562 rb->aux_priv = NULL;
560 } 563 }
561 564
562 for (pg = 0; pg < rb->aux_nr_pages; pg++) 565 if (rb->aux_nr_pages) {
563 rb_free_aux_page(rb, pg); 566 for (pg = 0; pg < rb->aux_nr_pages; pg++)
567 rb_free_aux_page(rb, pg);
564 568
565 kfree(rb->aux_pages); 569 kfree(rb->aux_pages);
566 rb->aux_nr_pages = 0; 570 rb->aux_nr_pages = 0;
571 }
567} 572}
568 573
569void rb_free_aux(struct ring_buffer *rb) 574void rb_free_aux(struct ring_buffer *rb)
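The expanded comment in rb_alloc_aux_page() spells out the convention for high-order AUX allocations: the first page of the chunk has PagePrivate set, with its private field recording the allocation order. A consumer-side sketch of that convention (illustrative only; assumes the usual <linux/mm.h> page helpers):

/* sketch: order of the AUX chunk starting at @page, 0 for a plain page */
static unsigned long aux_chunk_order(struct page *page)
{
        return PagePrivate(page) ? page_private(page) : 0;
}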
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index cb346f26a22d..4e5e9798aa0c 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -86,15 +86,6 @@ struct uprobe {
86 struct arch_uprobe arch; 86 struct arch_uprobe arch;
87}; 87};
88 88
89struct return_instance {
90 struct uprobe *uprobe;
91 unsigned long func;
92 unsigned long orig_ret_vaddr; /* original return address */
93 bool chained; /* true, if instance is nested */
94
95 struct return_instance *next; /* keep as stack */
96};
97
98/* 89/*
99 * Execute out of line area: anonymous executable mapping installed 90 * Execute out of line area: anonymous executable mapping installed
100 * by the probed task to execute the copy of the original instruction 91 * by the probed task to execute the copy of the original instruction
@@ -105,17 +96,18 @@ struct return_instance {
105 * allocated. 96 * allocated.
106 */ 97 */
107struct xol_area { 98struct xol_area {
108 wait_queue_head_t wq; /* if all slots are busy */ 99 wait_queue_head_t wq; /* if all slots are busy */
109 atomic_t slot_count; /* number of in-use slots */ 100 atomic_t slot_count; /* number of in-use slots */
110 unsigned long *bitmap; /* 0 = free slot */ 101 unsigned long *bitmap; /* 0 = free slot */
111 struct page *page;
112 102
103 struct vm_special_mapping xol_mapping;
104 struct page *pages[2];
113 /* 105 /*
114 * We keep the vma's vm_start rather than a pointer to the vma 106 * We keep the vma's vm_start rather than a pointer to the vma
115 * itself. The probed process or a naughty kernel module could make 107 * itself. The probed process or a naughty kernel module could make
116 * the vma go away, and we must handle that reasonably gracefully. 108 * the vma go away, and we must handle that reasonably gracefully.
117 */ 109 */
118 unsigned long vaddr; /* Page(s) of instruction slots */ 110 unsigned long vaddr; /* Page(s) of instruction slots */
119}; 111};
120 112
121/* 113/*
@@ -366,6 +358,18 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v
366 return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn); 358 return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn);
367} 359}
368 360
361static struct uprobe *get_uprobe(struct uprobe *uprobe)
362{
363 atomic_inc(&uprobe->ref);
364 return uprobe;
365}
366
367static void put_uprobe(struct uprobe *uprobe)
368{
369 if (atomic_dec_and_test(&uprobe->ref))
370 kfree(uprobe);
371}
372
369static int match_uprobe(struct uprobe *l, struct uprobe *r) 373static int match_uprobe(struct uprobe *l, struct uprobe *r)
370{ 374{
371 if (l->inode < r->inode) 375 if (l->inode < r->inode)
@@ -393,10 +397,8 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
393 while (n) { 397 while (n) {
394 uprobe = rb_entry(n, struct uprobe, rb_node); 398 uprobe = rb_entry(n, struct uprobe, rb_node);
395 match = match_uprobe(&u, uprobe); 399 match = match_uprobe(&u, uprobe);
396 if (!match) { 400 if (!match)
397 atomic_inc(&uprobe->ref); 401 return get_uprobe(uprobe);
398 return uprobe;
399 }
400 402
401 if (match < 0) 403 if (match < 0)
402 n = n->rb_left; 404 n = n->rb_left;
@@ -432,10 +434,8 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
432 parent = *p; 434 parent = *p;
433 u = rb_entry(parent, struct uprobe, rb_node); 435 u = rb_entry(parent, struct uprobe, rb_node);
434 match = match_uprobe(uprobe, u); 436 match = match_uprobe(uprobe, u);
435 if (!match) { 437 if (!match)
436 atomic_inc(&u->ref); 438 return get_uprobe(u);
437 return u;
438 }
439 439
440 if (match < 0) 440 if (match < 0)
441 p = &parent->rb_left; 441 p = &parent->rb_left;
@@ -472,12 +472,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
472 return u; 472 return u;
473} 473}
474 474
475static void put_uprobe(struct uprobe *uprobe)
476{
477 if (atomic_dec_and_test(&uprobe->ref))
478 kfree(uprobe);
479}
480
481static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) 475static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
482{ 476{
483 struct uprobe *uprobe, *cur_uprobe; 477 struct uprobe *uprobe, *cur_uprobe;
@@ -1039,14 +1033,14 @@ static void build_probe_list(struct inode *inode,
1039 if (u->inode != inode || u->offset < min) 1033 if (u->inode != inode || u->offset < min)
1040 break; 1034 break;
1041 list_add(&u->pending_list, head); 1035 list_add(&u->pending_list, head);
1042 atomic_inc(&u->ref); 1036 get_uprobe(u);
1043 } 1037 }
1044 for (t = n; (t = rb_next(t)); ) { 1038 for (t = n; (t = rb_next(t)); ) {
1045 u = rb_entry(t, struct uprobe, rb_node); 1039 u = rb_entry(t, struct uprobe, rb_node);
1046 if (u->inode != inode || u->offset > max) 1040 if (u->inode != inode || u->offset > max)
1047 break; 1041 break;
1048 list_add(&u->pending_list, head); 1042 list_add(&u->pending_list, head);
1049 atomic_inc(&u->ref); 1043 get_uprobe(u);
1050 } 1044 }
1051 } 1045 }
1052 spin_unlock(&uprobes_treelock); 1046 spin_unlock(&uprobes_treelock);
@@ -1132,11 +1126,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
1132/* Slot allocation for XOL */ 1126/* Slot allocation for XOL */
1133static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) 1127static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
1134{ 1128{
1135 int ret = -EALREADY; 1129 struct vm_area_struct *vma;
1130 int ret;
1136 1131
1137 down_write(&mm->mmap_sem); 1132 down_write(&mm->mmap_sem);
1138 if (mm->uprobes_state.xol_area) 1133 if (mm->uprobes_state.xol_area) {
1134 ret = -EALREADY;
1139 goto fail; 1135 goto fail;
1136 }
1140 1137
1141 if (!area->vaddr) { 1138 if (!area->vaddr) {
1142 /* Try to map as high as possible, this is only a hint. */ 1139 /* Try to map as high as possible, this is only a hint. */
@@ -1148,11 +1145,15 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
1148 } 1145 }
1149 } 1146 }
1150 1147
1151 ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, 1148 vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
1152 VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page); 1149 VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO,
1153 if (ret) 1150 &area->xol_mapping);
1151 if (IS_ERR(vma)) {
1152 ret = PTR_ERR(vma);
1154 goto fail; 1153 goto fail;
1154 }
1155 1155
1156 ret = 0;
1156 smp_wmb(); /* pairs with get_xol_area() */ 1157 smp_wmb(); /* pairs with get_xol_area() */
1157 mm->uprobes_state.xol_area = area; 1158 mm->uprobes_state.xol_area = area;
1158 fail: 1159 fail:
@@ -1175,21 +1176,24 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
1175 if (!area->bitmap) 1176 if (!area->bitmap)
1176 goto free_area; 1177 goto free_area;
1177 1178
1178 area->page = alloc_page(GFP_HIGHUSER); 1179 area->xol_mapping.name = "[uprobes]";
1179 if (!area->page) 1180 area->xol_mapping.pages = area->pages;
1181 area->pages[0] = alloc_page(GFP_HIGHUSER);
1182 if (!area->pages[0])
1180 goto free_bitmap; 1183 goto free_bitmap;
1184 area->pages[1] = NULL;
1181 1185
1182 area->vaddr = vaddr; 1186 area->vaddr = vaddr;
1183 init_waitqueue_head(&area->wq); 1187 init_waitqueue_head(&area->wq);
1184 /* Reserve the 1st slot for get_trampoline_vaddr() */ 1188 /* Reserve the 1st slot for get_trampoline_vaddr() */
1185 set_bit(0, area->bitmap); 1189 set_bit(0, area->bitmap);
1186 atomic_set(&area->slot_count, 1); 1190 atomic_set(&area->slot_count, 1);
1187 copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); 1191 copy_to_page(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE);
1188 1192
1189 if (!xol_add_vma(mm, area)) 1193 if (!xol_add_vma(mm, area))
1190 return area; 1194 return area;
1191 1195
1192 __free_page(area->page); 1196 __free_page(area->pages[0]);
1193 free_bitmap: 1197 free_bitmap:
1194 kfree(area->bitmap); 1198 kfree(area->bitmap);
1195 free_area: 1199 free_area:
@@ -1227,7 +1231,7 @@ void uprobe_clear_state(struct mm_struct *mm)
1227 if (!area) 1231 if (!area)
1228 return; 1232 return;
1229 1233
1230 put_page(area->page); 1234 put_page(area->pages[0]);
1231 kfree(area->bitmap); 1235 kfree(area->bitmap);
1232 kfree(area); 1236 kfree(area);
1233} 1237}
@@ -1296,7 +1300,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1296 if (unlikely(!xol_vaddr)) 1300 if (unlikely(!xol_vaddr))
1297 return 0; 1301 return 0;
1298 1302
1299 arch_uprobe_copy_ixol(area->page, xol_vaddr, 1303 arch_uprobe_copy_ixol(area->pages[0], xol_vaddr,
1300 &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); 1304 &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
1301 1305
1302 return xol_vaddr; 1306 return xol_vaddr;
@@ -1333,6 +1337,7 @@ static void xol_free_insn_slot(struct task_struct *tsk)
1333 1337
1334 clear_bit(slot_nr, area->bitmap); 1338 clear_bit(slot_nr, area->bitmap);
1335 atomic_dec(&area->slot_count); 1339 atomic_dec(&area->slot_count);
1340 smp_mb__after_atomic(); /* pairs with prepare_to_wait() */
1336 if (waitqueue_active(&area->wq)) 1341 if (waitqueue_active(&area->wq))
1337 wake_up(&area->wq); 1342 wake_up(&area->wq);
1338 1343
@@ -1376,6 +1381,14 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
1376 return instruction_pointer(regs); 1381 return instruction_pointer(regs);
1377} 1382}
1378 1383
1384static struct return_instance *free_ret_instance(struct return_instance *ri)
1385{
1386 struct return_instance *next = ri->next;
1387 put_uprobe(ri->uprobe);
1388 kfree(ri);
1389 return next;
1390}
1391
1379/* 1392/*
1380 * Called with no locks held. 1393 * Called with no locks held.
1381 * Called in context of an exiting or an exec-ing thread. 1394 * Called in context of an exiting or an exec-ing thread.
@@ -1383,7 +1396,7 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
1383void uprobe_free_utask(struct task_struct *t) 1396void uprobe_free_utask(struct task_struct *t)
1384{ 1397{
1385 struct uprobe_task *utask = t->utask; 1398 struct uprobe_task *utask = t->utask;
1386 struct return_instance *ri, *tmp; 1399 struct return_instance *ri;
1387 1400
1388 if (!utask) 1401 if (!utask)
1389 return; 1402 return;
@@ -1392,13 +1405,8 @@ void uprobe_free_utask(struct task_struct *t)
1392 put_uprobe(utask->active_uprobe); 1405 put_uprobe(utask->active_uprobe);
1393 1406
1394 ri = utask->return_instances; 1407 ri = utask->return_instances;
1395 while (ri) { 1408 while (ri)
1396 tmp = ri; 1409 ri = free_ret_instance(ri);
1397 ri = ri->next;
1398
1399 put_uprobe(tmp->uprobe);
1400 kfree(tmp);
1401 }
1402 1410
1403 xol_free_insn_slot(t); 1411 xol_free_insn_slot(t);
1404 kfree(utask); 1412 kfree(utask);
@@ -1437,7 +1445,7 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
1437 return -ENOMEM; 1445 return -ENOMEM;
1438 1446
1439 *n = *o; 1447 *n = *o;
1440 atomic_inc(&n->uprobe->ref); 1448 get_uprobe(n->uprobe);
1441 n->next = NULL; 1449 n->next = NULL;
1442 1450
1443 *p = n; 1451 *p = n;
@@ -1515,12 +1523,25 @@ static unsigned long get_trampoline_vaddr(void)
1515 return trampoline_vaddr; 1523 return trampoline_vaddr;
1516} 1524}
1517 1525
1526static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
1527 struct pt_regs *regs)
1528{
1529 struct return_instance *ri = utask->return_instances;
1530 enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL;
1531
1532 while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) {
1533 ri = free_ret_instance(ri);
1534 utask->depth--;
1535 }
1536 utask->return_instances = ri;
1537}
1538
1518static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) 1539static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
1519{ 1540{
1520 struct return_instance *ri; 1541 struct return_instance *ri;
1521 struct uprobe_task *utask; 1542 struct uprobe_task *utask;
1522 unsigned long orig_ret_vaddr, trampoline_vaddr; 1543 unsigned long orig_ret_vaddr, trampoline_vaddr;
1523 bool chained = false; 1544 bool chained;
1524 1545
1525 if (!get_xol_area()) 1546 if (!get_xol_area())
1526 return; 1547 return;
@@ -1536,49 +1557,47 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
1536 return; 1557 return;
1537 } 1558 }
1538 1559
1539 ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL); 1560 ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
1540 if (!ri) 1561 if (!ri)
1541 goto fail; 1562 return;
1542 1563
1543 trampoline_vaddr = get_trampoline_vaddr(); 1564 trampoline_vaddr = get_trampoline_vaddr();
1544 orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); 1565 orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
1545 if (orig_ret_vaddr == -1) 1566 if (orig_ret_vaddr == -1)
1546 goto fail; 1567 goto fail;
1547 1568
1569 /* drop the entries invalidated by longjmp() */
1570 chained = (orig_ret_vaddr == trampoline_vaddr);
1571 cleanup_return_instances(utask, chained, regs);
1572
1548 /* 1573 /*
1549 * We don't want to keep trampoline address in stack, rather keep the 1574 * We don't want to keep trampoline address in stack, rather keep the
1550 * original return address of first caller thru all the consequent 1575 * original return address of first caller thru all the consequent
1551 * instances. This also makes breakpoint unwrapping easier. 1576 * instances. This also makes breakpoint unwrapping easier.
1552 */ 1577 */
1553 if (orig_ret_vaddr == trampoline_vaddr) { 1578 if (chained) {
1554 if (!utask->return_instances) { 1579 if (!utask->return_instances) {
1555 /* 1580 /*
1556 * This situation is not possible. Likely we have an 1581 * This situation is not possible. Likely we have an
1557 * attack from user-space. 1582 * attack from user-space.
1558 */ 1583 */
1559 pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n", 1584 uprobe_warn(current, "handle tail call");
1560 current->pid, current->tgid);
1561 goto fail; 1585 goto fail;
1562 } 1586 }
1563
1564 chained = true;
1565 orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; 1587 orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
1566 } 1588 }
1567 1589
1568 atomic_inc(&uprobe->ref); 1590 ri->uprobe = get_uprobe(uprobe);
1569 ri->uprobe = uprobe;
1570 ri->func = instruction_pointer(regs); 1591 ri->func = instruction_pointer(regs);
1592 ri->stack = user_stack_pointer(regs);
1571 ri->orig_ret_vaddr = orig_ret_vaddr; 1593 ri->orig_ret_vaddr = orig_ret_vaddr;
1572 ri->chained = chained; 1594 ri->chained = chained;
1573 1595
1574 utask->depth++; 1596 utask->depth++;
1575
1576 /* add instance to the stack */
1577 ri->next = utask->return_instances; 1597 ri->next = utask->return_instances;
1578 utask->return_instances = ri; 1598 utask->return_instances = ri;
1579 1599
1580 return; 1600 return;
1581
1582 fail: 1601 fail:
1583 kfree(ri); 1602 kfree(ri);
1584} 1603}
@@ -1766,46 +1785,58 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
1766 up_read(&uprobe->register_rwsem); 1785 up_read(&uprobe->register_rwsem);
1767} 1786}
1768 1787
1769static bool handle_trampoline(struct pt_regs *regs) 1788static struct return_instance *find_next_ret_chain(struct return_instance *ri)
1770{ 1789{
1771 struct uprobe_task *utask;
1772 struct return_instance *ri, *tmp;
1773 bool chained; 1790 bool chained;
1774 1791
1792 do {
1793 chained = ri->chained;
1794 ri = ri->next; /* can't be NULL if chained */
1795 } while (chained);
1796
1797 return ri;
1798}
1799
1800static void handle_trampoline(struct pt_regs *regs)
1801{
1802 struct uprobe_task *utask;
1803 struct return_instance *ri, *next;
1804 bool valid;
1805
1775 utask = current->utask; 1806 utask = current->utask;
1776 if (!utask) 1807 if (!utask)
1777 return false; 1808 goto sigill;
1778 1809
1779 ri = utask->return_instances; 1810 ri = utask->return_instances;
1780 if (!ri) 1811 if (!ri)
1781 return false; 1812 goto sigill;
1782
1783 /*
1784 * TODO: we should throw out return_instance's invalidated by
1785 * longjmp(), currently we assume that the probed function always
1786 * returns.
1787 */
1788 instruction_pointer_set(regs, ri->orig_ret_vaddr);
1789
1790 for (;;) {
1791 handle_uretprobe_chain(ri, regs);
1792
1793 chained = ri->chained;
1794 put_uprobe(ri->uprobe);
1795
1796 tmp = ri;
1797 ri = ri->next;
1798 kfree(tmp);
1799 utask->depth--;
1800 1813
1801 if (!chained) 1814 do {
1802 break; 1815 /*
1803 BUG_ON(!ri); 1816 * We should throw out the frames invalidated by longjmp().
1804 } 1817 * If this chain is valid, then the next one should be alive
1818 * or NULL; the latter case means that nobody but ri->func
1819 * could hit this trampoline on return. TODO: sigaltstack().
1820 */
1821 next = find_next_ret_chain(ri);
1822 valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs);
1823
1824 instruction_pointer_set(regs, ri->orig_ret_vaddr);
1825 do {
1826 if (valid)
1827 handle_uretprobe_chain(ri, regs);
1828 ri = free_ret_instance(ri);
1829 utask->depth--;
1830 } while (ri != next);
1831 } while (!valid);
1805 1832
1806 utask->return_instances = ri; 1833 utask->return_instances = ri;
1834 return;
1835
1836 sigill:
1837 uprobe_warn(current, "handle uretprobe, sending SIGILL.");
1838 force_sig_info(SIGILL, SEND_SIG_FORCED, current);
1807 1839
1808 return true;
1809} 1840}
1810 1841
1811bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) 1842bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
@@ -1813,6 +1844,12 @@ bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
1813 return false; 1844 return false;
1814} 1845}
1815 1846
1847bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
1848 struct pt_regs *regs)
1849{
1850 return true;
1851}
1852
1816/* 1853/*
1817 * Run handler and ask thread to singlestep. 1854 * Run handler and ask thread to singlestep.
1818 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. 1855 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -1824,13 +1861,8 @@ static void handle_swbp(struct pt_regs *regs)
1824 int uninitialized_var(is_swbp); 1861 int uninitialized_var(is_swbp);
1825 1862
1826 bp_vaddr = uprobe_get_swbp_addr(regs); 1863 bp_vaddr = uprobe_get_swbp_addr(regs);
1827 if (bp_vaddr == get_trampoline_vaddr()) { 1864 if (bp_vaddr == get_trampoline_vaddr())
1828 if (handle_trampoline(regs)) 1865 return handle_trampoline(regs);
1829 return;
1830
1831 pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n",
1832 current->pid, current->tgid);
1833 }
1834 1866
1835 uprobe = find_active_uprobe(bp_vaddr, &is_swbp); 1867 uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
1836 if (!uprobe) { 1868 if (!uprobe) {
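
For illustration, the uprobes hunks above mostly fold open-coded refcount bumps and list teardown into small helpers (get_uprobe(), free_ret_instance()). A minimal stand-alone sketch of that helper pattern, using hypothetical names rather than the kernel's own types, could look like this:

#include <stdlib.h>

struct obj {
	int ref;			/* refcount; the kernel uses atomic_t */
};

static struct obj *get_obj(struct obj *o)
{
	o->ref++;			/* analogous to get_uprobe() bumping uprobe->ref */
	return o;			/* returning the pointer allows p = get_obj(o); */
}

static void put_obj(struct obj *o)
{
	if (--o->ref == 0)
		free(o);
}

struct frame {				/* stands in for struct return_instance */
	struct obj *owner;
	struct frame *next;
};

/* Free one frame and hand back its successor, like free_ret_instance(). */
static struct frame *free_frame(struct frame *f)
{
	struct frame *next = f->next;

	put_obj(f->owner);
	free(f);
	return next;
}

int main(void)
{
	struct obj *o = calloc(1, sizeof(*o));
	struct frame *stack = NULL;
	int i;

	o->ref = 1;
	for (i = 0; i < 3; i++) {	/* push three frames, each holding a reference */
		struct frame *f = malloc(sizeof(*f));

		f->owner = get_obj(o);
		f->next = stack;
		stack = f;
	}
	while (stack)			/* teardown mirrors uprobe_free_utask() */
		stack = free_frame(stack);
	put_obj(o);			/* drop the creation reference */
	return 0;
}

The free-and-return-next style is what lets uprobe_free_utask(), cleanup_return_instances() and handle_trampoline() above all walk and unwind the return-instance chain with a single loop.
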
diff --git a/kernel/exit.c b/kernel/exit.c
index 031325e9acf9..ea95ee1b5ef7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1471,7 +1471,7 @@ static long do_wait(struct wait_opts *wo)
1471 add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); 1471 add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1472repeat: 1472repeat:
1473 /* 1473 /*
1474 * If there is nothing that can match our critiera just get out. 1474 * If there is nothing that can match our criteria, just get out.
1475 * We will clear ->notask_error to zero if we see any child that 1475 * We will clear ->notask_error to zero if we see any child that
1476 * might later match our criteria, even if we are not able to reap 1476 * might later match our criteria, even if we are not able to reap
1477 * it yet. 1477 * it yet.
diff --git a/kernel/extable.c b/kernel/extable.c
index c98f926277a8..e820ccee9846 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -18,7 +18,6 @@
18#include <linux/ftrace.h> 18#include <linux/ftrace.h>
19#include <linux/memory.h> 19#include <linux/memory.h>
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/ftrace.h>
22#include <linux/mutex.h> 21#include <linux/mutex.h>
23#include <linux/init.h> 22#include <linux/init.h>
24 23
diff --git a/kernel/fork.c b/kernel/fork.c
index 1bfefc6f96a4..7d5f0f118a63 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -287,6 +287,11 @@ static void set_max_threads(unsigned int max_threads_suggested)
287 max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS); 287 max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
288} 288}
289 289
290#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
291/* Initialized by the architecture: */
292int arch_task_struct_size __read_mostly;
293#endif
294
290void __init fork_init(void) 295void __init fork_init(void)
291{ 296{
292#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR 297#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
@@ -295,7 +300,7 @@ void __init fork_init(void)
295#endif 300#endif
296 /* create a slab on which task_structs can be allocated */ 301 /* create a slab on which task_structs can be allocated */
297 task_struct_cachep = 302 task_struct_cachep =
298 kmem_cache_create("task_struct", sizeof(struct task_struct), 303 kmem_cache_create("task_struct", arch_task_struct_size,
299 ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); 304 ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
300#endif 305#endif
301 306
@@ -449,8 +454,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
449 tmp->vm_mm = mm; 454 tmp->vm_mm = mm;
450 if (anon_vma_fork(tmp, mpnt)) 455 if (anon_vma_fork(tmp, mpnt))
451 goto fail_nomem_anon_vma_fork; 456 goto fail_nomem_anon_vma_fork;
452 tmp->vm_flags &= ~VM_LOCKED; 457 tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP);
453 tmp->vm_next = tmp->vm_prev = NULL; 458 tmp->vm_next = tmp->vm_prev = NULL;
459 tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
454 file = tmp->vm_file; 460 file = tmp->vm_file;
455 if (file) { 461 if (file) {
456 struct inode *inode = file_inode(file); 462 struct inode *inode = file_inode(file);
@@ -1067,6 +1073,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
1067 rcu_assign_pointer(tsk->sighand, sig); 1073 rcu_assign_pointer(tsk->sighand, sig);
1068 if (!sig) 1074 if (!sig)
1069 return -ENOMEM; 1075 return -ENOMEM;
1076
1070 atomic_set(&sig->count, 1); 1077 atomic_set(&sig->count, 1);
1071 memcpy(sig->action, current->sighand->action, sizeof(sig->action)); 1078 memcpy(sig->action, current->sighand->action, sizeof(sig->action));
1072 return 0; 1079 return 0;
@@ -1128,6 +1135,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1128 init_sigpending(&sig->shared_pending); 1135 init_sigpending(&sig->shared_pending);
1129 INIT_LIST_HEAD(&sig->posix_timers); 1136 INIT_LIST_HEAD(&sig->posix_timers);
1130 seqlock_init(&sig->stats_lock); 1137 seqlock_init(&sig->stats_lock);
1138 prev_cputime_init(&sig->prev_cputime);
1131 1139
1132 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1140 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1133 sig->real_timer.function = it_real_fn; 1141 sig->real_timer.function = it_real_fn;
@@ -1239,6 +1247,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1239{ 1247{
1240 int retval; 1248 int retval;
1241 struct task_struct *p; 1249 struct task_struct *p;
1250 void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
1242 1251
1243 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) 1252 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
1244 return ERR_PTR(-EINVAL); 1253 return ERR_PTR(-EINVAL);
@@ -1273,10 +1282,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1273 1282
1274 /* 1283 /*
1275 * If the new process will be in a different pid or user namespace 1284 * If the new process will be in a different pid or user namespace
1276 * do not allow it to share a thread group or signal handlers or 1285 * do not allow it to share a thread group with the forking task.
1277 * parent with the forking task.
1278 */ 1286 */
1279 if (clone_flags & CLONE_SIGHAND) { 1287 if (clone_flags & CLONE_THREAD) {
1280 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || 1288 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
1281 (task_active_pid_ns(current) != 1289 (task_active_pid_ns(current) !=
1282 current->nsproxy->pid_ns_for_children)) 1290 current->nsproxy->pid_ns_for_children))
@@ -1335,9 +1343,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1335 1343
1336 p->utime = p->stime = p->gtime = 0; 1344 p->utime = p->stime = p->gtime = 0;
1337 p->utimescaled = p->stimescaled = 0; 1345 p->utimescaled = p->stimescaled = 0;
1338#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 1346 prev_cputime_init(&p->prev_cputime);
1339 p->prev_cputime.utime = p->prev_cputime.stime = 0; 1347
1340#endif
1341#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 1348#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
1342 seqlock_init(&p->vtime_seqlock); 1349 seqlock_init(&p->vtime_seqlock);
1343 p->vtime_snap = 0; 1350 p->vtime_snap = 0;
@@ -1513,6 +1520,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1513 p->task_works = NULL; 1520 p->task_works = NULL;
1514 1521
1515 /* 1522 /*
1523 * Ensure that the cgroup subsystem policies allow the new process to be
1524 * forked. It should be noted that the new process's css_set can be changed
1525 * between here and cgroup_post_fork() if an organisation operation is in
1526 * progress.
1527 */
1528 retval = cgroup_can_fork(p, cgrp_ss_priv);
1529 if (retval)
1530 goto bad_fork_free_pid;
1531
1532 /*
1516 * Make it visible to the rest of the system, but don't wake it up yet. 1533 * Make it visible to the rest of the system, but don't wake it up yet.
1517 * Need tasklist lock for parent etc handling! 1534 * Need tasklist lock for parent etc handling!
1518 */ 1535 */
@@ -1548,7 +1565,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1548 spin_unlock(&current->sighand->siglock); 1565 spin_unlock(&current->sighand->siglock);
1549 write_unlock_irq(&tasklist_lock); 1566 write_unlock_irq(&tasklist_lock);
1550 retval = -ERESTARTNOINTR; 1567 retval = -ERESTARTNOINTR;
1551 goto bad_fork_free_pid; 1568 goto bad_fork_cancel_cgroup;
1552 } 1569 }
1553 1570
1554 if (likely(p->pid)) { 1571 if (likely(p->pid)) {
@@ -1590,7 +1607,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1590 write_unlock_irq(&tasklist_lock); 1607 write_unlock_irq(&tasklist_lock);
1591 1608
1592 proc_fork_connector(p); 1609 proc_fork_connector(p);
1593 cgroup_post_fork(p); 1610 cgroup_post_fork(p, cgrp_ss_priv);
1594 if (clone_flags & CLONE_THREAD) 1611 if (clone_flags & CLONE_THREAD)
1595 threadgroup_change_end(current); 1612 threadgroup_change_end(current);
1596 perf_event_fork(p); 1613 perf_event_fork(p);
@@ -1600,6 +1617,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1600 1617
1601 return p; 1618 return p;
1602 1619
1620bad_fork_cancel_cgroup:
1621 cgroup_cancel_fork(p, cgrp_ss_priv);
1603bad_fork_free_pid: 1622bad_fork_free_pid:
1604 if (pid != &init_struct_pid) 1623 if (pid != &init_struct_pid)
1605 free_pid(pid); 1624 free_pid(pid);
@@ -1866,13 +1885,21 @@ static int check_unshare_flags(unsigned long unshare_flags)
1866 CLONE_NEWUSER|CLONE_NEWPID)) 1885 CLONE_NEWUSER|CLONE_NEWPID))
1867 return -EINVAL; 1886 return -EINVAL;
1868 /* 1887 /*
1869 * Not implemented, but pretend it works if there is nothing to 1888 * Not implemented, but pretend it works if there is nothing
1870 * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND 1889 * to unshare. Note that unsharing the address space or the
1871 * needs to unshare vm. 1890 * signal handlers also need to unshare the signal queues (aka
1891 * CLONE_THREAD).
1872 */ 1892 */
1873 if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) { 1893 if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
1874 /* FIXME: get_task_mm() increments ->mm_users */ 1894 if (!thread_group_empty(current))
1875 if (atomic_read(&current->mm->mm_users) > 1) 1895 return -EINVAL;
1896 }
1897 if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) {
1898 if (atomic_read(&current->sighand->count) > 1)
1899 return -EINVAL;
1900 }
1901 if (unshare_flags & CLONE_VM) {
1902 if (!current_is_single_threaded())
1876 return -EINVAL; 1903 return -EINVAL;
1877 } 1904 }
1878 1905
@@ -1936,21 +1963,22 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1936 int err; 1963 int err;
1937 1964
1938 /* 1965 /*
1939 * If unsharing a user namespace, must also unshare the thread. 1966 * If unsharing a user namespace, must also unshare the thread group
1967 * and unshare the filesystem root and working directories.
1940 */ 1968 */
1941 if (unshare_flags & CLONE_NEWUSER) 1969 if (unshare_flags & CLONE_NEWUSER)
1942 unshare_flags |= CLONE_THREAD | CLONE_FS; 1970 unshare_flags |= CLONE_THREAD | CLONE_FS;
1943 /* 1971 /*
1944 * If unsharing a thread from a thread group, must also unshare vm.
1945 */
1946 if (unshare_flags & CLONE_THREAD)
1947 unshare_flags |= CLONE_VM;
1948 /*
1949 * If unsharing vm, must also unshare signal handlers. 1972 * If unsharing vm, must also unshare signal handlers.
1950 */ 1973 */
1951 if (unshare_flags & CLONE_VM) 1974 if (unshare_flags & CLONE_VM)
1952 unshare_flags |= CLONE_SIGHAND; 1975 unshare_flags |= CLONE_SIGHAND;
1953 /* 1976 /*
1977 * If unsharing signal handlers, must also unshare the signal queues.
1978 */
1979 if (unshare_flags & CLONE_SIGHAND)
1980 unshare_flags |= CLONE_THREAD;
1981 /*
1954 * If unsharing namespace, must also unshare filesystem information. 1982 * If unsharing namespace, must also unshare filesystem information.
1955 */ 1983 */
1956 if (unshare_flags & CLONE_NEWNS) 1984 if (unshare_flags & CLONE_NEWNS)
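
For illustration, a minimal user-space sketch of what the tightened check_unshare_flags() rules above mean for callers; the behaviour described in the comment is an assumption drawn from the new flag implications (CLONE_NEWUSER pulling in CLONE_THREAD | CLONE_FS, CLONE_VM pulling in CLONE_SIGHAND, CLONE_SIGHAND pulling in CLONE_THREAD):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	/*
	 * With the implication chain above, this only succeeds for a caller
	 * that is single-threaded and does not share its VM or signal
	 * handlers with any other task; otherwise unshare() returns EINVAL.
	 */
	if (unshare(CLONE_NEWUSER) == -1)
		perror("unshare(CLONE_NEWUSER)");
	else
		printf("now running in a new user namespace\n");
	return 0;
}
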
diff --git a/kernel/futex.c b/kernel/futex.c
index c4a182f5357e..6e443efc65f4 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -64,6 +64,7 @@
64#include <linux/hugetlb.h> 64#include <linux/hugetlb.h>
65#include <linux/freezer.h> 65#include <linux/freezer.h>
66#include <linux/bootmem.h> 66#include <linux/bootmem.h>
67#include <linux/fault-inject.h>
67 68
68#include <asm/futex.h> 69#include <asm/futex.h>
69 70
@@ -258,6 +259,66 @@ static unsigned long __read_mostly futex_hashsize;
258 259
259static struct futex_hash_bucket *futex_queues; 260static struct futex_hash_bucket *futex_queues;
260 261
262/*
263 * Fault injections for futexes.
264 */
265#ifdef CONFIG_FAIL_FUTEX
266
267static struct {
268 struct fault_attr attr;
269
270 u32 ignore_private;
271} fail_futex = {
272 .attr = FAULT_ATTR_INITIALIZER,
273 .ignore_private = 0,
274};
275
276static int __init setup_fail_futex(char *str)
277{
278 return setup_fault_attr(&fail_futex.attr, str);
279}
280__setup("fail_futex=", setup_fail_futex);
281
282static bool should_fail_futex(bool fshared)
283{
284 if (fail_futex.ignore_private && !fshared)
285 return false;
286
287 return should_fail(&fail_futex.attr, 1);
288}
289
290#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
291
292static int __init fail_futex_debugfs(void)
293{
294 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
295 struct dentry *dir;
296
297 dir = fault_create_debugfs_attr("fail_futex", NULL,
298 &fail_futex.attr);
299 if (IS_ERR(dir))
300 return PTR_ERR(dir);
301
302 if (!debugfs_create_bool("ignore-private", mode, dir,
303 &fail_futex.ignore_private)) {
304 debugfs_remove_recursive(dir);
305 return -ENOMEM;
306 }
307
308 return 0;
309}
310
311late_initcall(fail_futex_debugfs);
312
313#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
314
315#else
316static inline bool should_fail_futex(bool fshared)
317{
318 return false;
319}
320#endif /* CONFIG_FAIL_FUTEX */
321
261static inline void futex_get_mm(union futex_key *key) 322static inline void futex_get_mm(union futex_key *key)
262{ 323{
263 atomic_inc(&key->private.mm->mm_count); 324 atomic_inc(&key->private.mm->mm_count);
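
For illustration, the new fail_futex fault-injection knobs are assumed to follow the generic fault-injection debugfs layout (probability, times, plus the ignore-private attribute created above); the paths below only exist when CONFIG_FAIL_FUTEX and CONFIG_FAULT_INJECTION_DEBUG_FS are enabled and are written here from a small, hypothetical helper program:

#include <stdio.h>

static int write_attr(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fputs(val, f);
	fclose(f);
	return 0;
}

int main(void)
{
	write_attr("/sys/kernel/debug/fail_futex/probability", "10");	/* fail roughly 10% of calls */
	write_attr("/sys/kernel/debug/fail_futex/times", "-1");		/* no cap on injected failures */
	write_attr("/sys/kernel/debug/fail_futex/ignore-private", "0");	/* also fail private futexes */
	return 0;
}

The fail_futex= boot parameter registered by setup_fail_futex() is parsed by the common setup_fault_attr() helper, so it takes the framework's usual comma-separated attribute string.
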
@@ -413,6 +474,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
413 if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) 474 if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
414 return -EFAULT; 475 return -EFAULT;
415 476
477 if (unlikely(should_fail_futex(fshared)))
478 return -EFAULT;
479
416 /* 480 /*
417 * PROCESS_PRIVATE futexes are fast. 481 * PROCESS_PRIVATE futexes are fast.
418 * As the mm cannot disappear under us and the 'key' only needs 482 * As the mm cannot disappear under us and the 'key' only needs
@@ -428,6 +492,10 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
428 } 492 }
429 493
430again: 494again:
495 /* Ignore any VERIFY_READ mapping (futex common case) */
496 if (unlikely(should_fail_futex(fshared)))
497 return -EFAULT;
498
431 err = get_user_pages_fast(address, 1, 1, &page); 499 err = get_user_pages_fast(address, 1, 1, &page);
432 /* 500 /*
433 * If write access is not required (eg. FUTEX_WAIT), try 501 * If write access is not required (eg. FUTEX_WAIT), try
@@ -516,7 +584,7 @@ again:
516 * A RO anonymous page will never change and thus doesn't make 584 * A RO anonymous page will never change and thus doesn't make
517 * sense for futex operations. 585 * sense for futex operations.
518 */ 586 */
519 if (ro) { 587 if (unlikely(should_fail_futex(fshared)) || ro) {
520 err = -EFAULT; 588 err = -EFAULT;
521 goto out; 589 goto out;
522 } 590 }
@@ -974,6 +1042,9 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
974{ 1042{
975 u32 uninitialized_var(curval); 1043 u32 uninitialized_var(curval);
976 1044
1045 if (unlikely(should_fail_futex(true)))
1046 return -EFAULT;
1047
977 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) 1048 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
978 return -EFAULT; 1049 return -EFAULT;
979 1050
@@ -1015,12 +1086,18 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
1015 if (get_futex_value_locked(&uval, uaddr)) 1086 if (get_futex_value_locked(&uval, uaddr))
1016 return -EFAULT; 1087 return -EFAULT;
1017 1088
1089 if (unlikely(should_fail_futex(true)))
1090 return -EFAULT;
1091
1018 /* 1092 /*
1019 * Detect deadlocks. 1093 * Detect deadlocks.
1020 */ 1094 */
1021 if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) 1095 if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
1022 return -EDEADLK; 1096 return -EDEADLK;
1023 1097
1098 if ((unlikely(should_fail_futex(true))))
1099 return -EDEADLK;
1100
1024 /* 1101 /*
1025 * Lookup existing state first. If it exists, try to attach to 1102 * Lookup existing state first. If it exists, try to attach to
1026 * its pi_state. 1103 * its pi_state.
@@ -1155,6 +1232,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
1155 */ 1232 */
1156 newval = FUTEX_WAITERS | task_pid_vnr(new_owner); 1233 newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
1157 1234
1235 if (unlikely(should_fail_futex(true)))
1236 ret = -EFAULT;
1237
1158 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) 1238 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1159 ret = -EFAULT; 1239 ret = -EFAULT;
1160 else if (curval != uval) 1240 else if (curval != uval)
@@ -1457,6 +1537,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1457 if (get_futex_value_locked(&curval, pifutex)) 1537 if (get_futex_value_locked(&curval, pifutex))
1458 return -EFAULT; 1538 return -EFAULT;
1459 1539
1540 if (unlikely(should_fail_futex(true)))
1541 return -EFAULT;
1542
1460 /* 1543 /*
1461 * Find the top_waiter and determine if there are additional waiters. 1544 * Find the top_waiter and determine if there are additional waiters.
1462 * If the caller intends to requeue more than 1 waiter to pifutex, 1545 * If the caller intends to requeue more than 1 waiter to pifutex,
@@ -2268,8 +2351,11 @@ static long futex_wait_restart(struct restart_block *restart)
2268/* 2351/*
2269 * Userspace tried a 0 -> TID atomic transition of the futex value 2352 * Userspace tried a 0 -> TID atomic transition of the futex value
2270 * and failed. The kernel side here does the whole locking operation: 2353 * and failed. The kernel side here does the whole locking operation:
2271 * if there are waiters then it will block, it does PI, etc. (Due to 2354 * if there are waiters then it will block as a consequence of relying
2272 * races the kernel might see a 0 value of the futex too.) 2355 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
2356 * a 0 value of the futex too.).
2357 *
2358 * Also serves as futex trylock_pi()'ing, and due semantics.
2273 */ 2359 */
2274static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, 2360static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
2275 ktime_t *time, int trylock) 2361 ktime_t *time, int trylock)
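
For illustration, a hedged sketch of the user-space half that the comment above refers to: the kernel path only runs when the 0 -> TID compare-and-swap fails. The function name is hypothetical; FUTEX_LOCK_PI and the futex() syscall convention are standard.

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdatomic.h>

static void pi_lock(atomic_uint *futex_word)
{
	unsigned int expected = 0;
	unsigned int tid = (unsigned int)syscall(SYS_gettid);

	/* Fast path: take the lock entirely in user space. */
	if (atomic_compare_exchange_strong(futex_word, &expected, tid))
		return;

	/* Contended: futex_lock_pi() blocks us and handles PI boosting. */
	syscall(SYS_futex, futex_word, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
}

Unlocking is symmetric: a TID -> 0 compare-and-swap in user space, falling back to FUTEX_UNLOCK_PI when waiters are present.
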
@@ -2300,6 +2386,10 @@ retry_private:
2300 2386
2301 ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); 2387 ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
2302 if (unlikely(ret)) { 2388 if (unlikely(ret)) {
2389 /*
2390 * Atomic work succeeded and we got the lock,
2391 * or failed. Either way, we do _not_ block.
2392 */
2303 switch (ret) { 2393 switch (ret) {
2304 case 1: 2394 case 1:
2305 /* We got the lock. */ 2395 /* We got the lock. */
@@ -2530,7 +2620,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2530 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 2620 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2531 * @uaddr: the futex we initially wait on (non-pi) 2621 * @uaddr: the futex we initially wait on (non-pi)
2532 * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be 2622 * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
2533 * the same type, no requeueing from private to shared, etc. 2623 * the same type, no requeueing from private to shared, etc.
2534 * @val: the expected value of uaddr 2624 * @val: the expected value of uaddr
2535 * @abs_time: absolute timeout 2625 * @abs_time: absolute timeout
2536 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all 2626 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
@@ -3005,6 +3095,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
3005 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || 3095 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
3006 cmd == FUTEX_WAIT_BITSET || 3096 cmd == FUTEX_WAIT_BITSET ||
3007 cmd == FUTEX_WAIT_REQUEUE_PI)) { 3097 cmd == FUTEX_WAIT_REQUEUE_PI)) {
3098 if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
3099 return -EFAULT;
3008 if (copy_from_user(&ts, utime, sizeof(ts)) != 0) 3100 if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
3009 return -EFAULT; 3101 return -EFAULT;
3010 if (!timespec_valid(&ts)) 3102 if (!timespec_valid(&ts))
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 27f4332c7f84..6e40a9539763 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -63,7 +63,7 @@ int irq_set_irq_type(unsigned int irq, unsigned int type)
63 return -EINVAL; 63 return -EINVAL;
64 64
65 type &= IRQ_TYPE_SENSE_MASK; 65 type &= IRQ_TYPE_SENSE_MASK;
66 ret = __irq_set_trigger(desc, irq, type); 66 ret = __irq_set_trigger(desc, type);
67 irq_put_desc_busunlock(desc, flags); 67 irq_put_desc_busunlock(desc, flags);
68 return ret; 68 return ret;
69} 69}
@@ -187,7 +187,7 @@ int irq_startup(struct irq_desc *desc, bool resend)
187 irq_enable(desc); 187 irq_enable(desc);
188 } 188 }
189 if (resend) 189 if (resend)
190 check_irq_resend(desc, desc->irq_data.irq); 190 check_irq_resend(desc);
191 return ret; 191 return ret;
192} 192}
193 193
@@ -315,7 +315,7 @@ void handle_nested_irq(unsigned int irq)
315 raw_spin_lock_irq(&desc->lock); 315 raw_spin_lock_irq(&desc->lock);
316 316
317 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 317 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
318 kstat_incr_irqs_this_cpu(irq, desc); 318 kstat_incr_irqs_this_cpu(desc);
319 319
320 action = desc->action; 320 action = desc->action;
321 if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { 321 if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) {
@@ -328,7 +328,7 @@ void handle_nested_irq(unsigned int irq)
328 328
329 action_ret = action->thread_fn(action->irq, action->dev_id); 329 action_ret = action->thread_fn(action->irq, action->dev_id);
330 if (!noirqdebug) 330 if (!noirqdebug)
331 note_interrupt(irq, desc, action_ret); 331 note_interrupt(desc, action_ret);
332 332
333 raw_spin_lock_irq(&desc->lock); 333 raw_spin_lock_irq(&desc->lock);
334 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); 334 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
@@ -391,7 +391,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
391 goto out_unlock; 391 goto out_unlock;
392 392
393 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 393 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
394 kstat_incr_irqs_this_cpu(irq, desc); 394 kstat_incr_irqs_this_cpu(desc);
395 395
396 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { 396 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
397 desc->istate |= IRQS_PENDING; 397 desc->istate |= IRQS_PENDING;
@@ -443,7 +443,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
443 goto out_unlock; 443 goto out_unlock;
444 444
445 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 445 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
446 kstat_incr_irqs_this_cpu(irq, desc); 446 kstat_incr_irqs_this_cpu(desc);
447 447
448 /* 448 /*
449 * If it's disabled or no action available 449 * If it's disabled or no action available
@@ -515,7 +515,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
515 goto out; 515 goto out;
516 516
517 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 517 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
518 kstat_incr_irqs_this_cpu(irq, desc); 518 kstat_incr_irqs_this_cpu(desc);
519 519
520 /* 520 /*
521 * If it's disabled or no action available 521 * If it's disabled or no action available
@@ -583,7 +583,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
583 goto out_unlock; 583 goto out_unlock;
584 } 584 }
585 585
586 kstat_incr_irqs_this_cpu(irq, desc); 586 kstat_incr_irqs_this_cpu(desc);
587 587
588 /* Start handling the irq */ 588 /* Start handling the irq */
589 desc->irq_data.chip->irq_ack(&desc->irq_data); 589 desc->irq_data.chip->irq_ack(&desc->irq_data);
@@ -646,7 +646,7 @@ void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc)
646 goto out_eoi; 646 goto out_eoi;
647 } 647 }
648 648
649 kstat_incr_irqs_this_cpu(irq, desc); 649 kstat_incr_irqs_this_cpu(desc);
650 650
651 do { 651 do {
652 if (unlikely(!desc->action)) 652 if (unlikely(!desc->action))
@@ -675,7 +675,7 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
675{ 675{
676 struct irq_chip *chip = irq_desc_get_chip(desc); 676 struct irq_chip *chip = irq_desc_get_chip(desc);
677 677
678 kstat_incr_irqs_this_cpu(irq, desc); 678 kstat_incr_irqs_this_cpu(desc);
679 679
680 if (chip->irq_ack) 680 if (chip->irq_ack)
681 chip->irq_ack(&desc->irq_data); 681 chip->irq_ack(&desc->irq_data);
@@ -705,7 +705,7 @@ void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc)
705 void *dev_id = raw_cpu_ptr(action->percpu_dev_id); 705 void *dev_id = raw_cpu_ptr(action->percpu_dev_id);
706 irqreturn_t res; 706 irqreturn_t res;
707 707
708 kstat_incr_irqs_this_cpu(irq, desc); 708 kstat_incr_irqs_this_cpu(desc);
709 709
710 if (chip->irq_ack) 710 if (chip->irq_ack)
711 chip->irq_ack(&desc->irq_data); 711 chip->irq_ack(&desc->irq_data);
@@ -985,6 +985,23 @@ int irq_chip_set_affinity_parent(struct irq_data *data,
985} 985}
986 986
987/** 987/**
988 * irq_chip_set_type_parent - Set IRQ type on the parent interrupt
989 * @data: Pointer to interrupt specific data
990 * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
991 *
992 * Conditional, as the underlying parent chip might not implement it.
993 */
994int irq_chip_set_type_parent(struct irq_data *data, unsigned int type)
995{
996 data = data->parent_data;
997
998 if (data->chip->irq_set_type)
999 return data->chip->irq_set_type(data, type);
1000
1001 return -ENOSYS;
1002}
1003
1004/**
988 * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware 1005 * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware
989 * @data: Pointer to interrupt specific data 1006 * @data: Pointer to interrupt specific data
990 * 1007 *
@@ -997,13 +1014,13 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data)
997 if (data->chip && data->chip->irq_retrigger) 1014 if (data->chip && data->chip->irq_retrigger)
998 return data->chip->irq_retrigger(data); 1015 return data->chip->irq_retrigger(data);
999 1016
1000 return -ENOSYS; 1017 return 0;
1001} 1018}
1002 1019
1003/** 1020/**
1004 * irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt 1021 * irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt
1005 * @data: Pointer to interrupt specific data 1022 * @data: Pointer to interrupt specific data
1006 * @dest: The vcpu affinity information 1023 * @vcpu_info: The vcpu affinity information
1007 */ 1024 */
1008int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info) 1025int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info)
1009{ 1026{
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 15b370daf234..abd286afbd27 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -553,6 +553,9 @@ static int irq_gc_suspend(void)
553 if (data) 553 if (data)
554 ct->chip.irq_suspend(data); 554 ct->chip.irq_suspend(data);
555 } 555 }
556
557 if (gc->suspend)
558 gc->suspend(gc);
556 } 559 }
557 return 0; 560 return 0;
558} 561}
@@ -564,6 +567,9 @@ static void irq_gc_resume(void)
564 list_for_each_entry(gc, &gc_list, list) { 567 list_for_each_entry(gc, &gc_list, list) {
565 struct irq_chip_type *ct = gc->chip_types; 568 struct irq_chip_type *ct = gc->chip_types;
566 569
570 if (gc->resume)
571 gc->resume(gc);
572
567 if (ct->chip.irq_resume) { 573 if (ct->chip.irq_resume) {
568 struct irq_data *data = irq_gc_get_irq_data(gc); 574 struct irq_data *data = irq_gc_get_irq_data(gc);
569 575
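
For illustration, a sketch of the per-chip hooks the two hunks above start honouring; the callbacks and their bodies are hypothetical, only the gc->suspend/gc->resume members are taken from the change itself:

#include <linux/irq.h>

static void example_gc_suspend(struct irq_chip_generic *gc)
{
	/* e.g. save wake/mask registers before the chip loses state */
}

static void example_gc_resume(struct irq_chip_generic *gc)
{
	/* e.g. restore whatever example_gc_suspend() saved */
}

static void example_gc_setup(struct irq_chip_generic *gc)
{
	gc->suspend = example_gc_suspend;
	gc->resume  = example_gc_resume;
}

irq_gc_suspend() calls the hook after any per-chip-type irq_suspend callbacks, and irq_gc_resume() calls it before the irq_resume callbacks, so a driver can keep whole-chip save/restore separate from per-type handling.
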
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 635480270858..b6eeea8a80c5 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -30,7 +30,7 @@
30void handle_bad_irq(unsigned int irq, struct irq_desc *desc) 30void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
31{ 31{
32 print_irq_desc(irq, desc); 32 print_irq_desc(irq, desc);
33 kstat_incr_irqs_this_cpu(irq, desc); 33 kstat_incr_irqs_this_cpu(desc);
34 ack_bad_irq(irq); 34 ack_bad_irq(irq);
35} 35}
36 36
@@ -176,7 +176,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
176 add_interrupt_randomness(irq, flags); 176 add_interrupt_randomness(irq, flags);
177 177
178 if (!noirqdebug) 178 if (!noirqdebug)
179 note_interrupt(irq, desc, retval); 179 note_interrupt(desc, retval);
180 return retval; 180 return retval;
181} 181}
182 182
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 61008b8433ab..eee4b385cffb 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -59,10 +59,9 @@ enum {
59#include "debug.h" 59#include "debug.h"
60#include "settings.h" 60#include "settings.h"
61 61
62extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 62extern int __irq_set_trigger(struct irq_desc *desc, unsigned long flags);
63 unsigned long flags); 63extern void __disable_irq(struct irq_desc *desc);
64extern void __disable_irq(struct irq_desc *desc, unsigned int irq); 64extern void __enable_irq(struct irq_desc *desc);
65extern void __enable_irq(struct irq_desc *desc, unsigned int irq);
66 65
67extern int irq_startup(struct irq_desc *desc, bool resend); 66extern int irq_startup(struct irq_desc *desc, bool resend);
68extern void irq_shutdown(struct irq_desc *desc); 67extern void irq_shutdown(struct irq_desc *desc);
@@ -86,7 +85,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *act
86irqreturn_t handle_irq_event(struct irq_desc *desc); 85irqreturn_t handle_irq_event(struct irq_desc *desc);
87 86
88/* Resending of interrupts :*/ 87/* Resending of interrupts :*/
89void check_irq_resend(struct irq_desc *desc, unsigned int irq); 88void check_irq_resend(struct irq_desc *desc);
90bool irq_wait_for_poll(struct irq_desc *desc); 89bool irq_wait_for_poll(struct irq_desc *desc);
91void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action); 90void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action);
92 91
@@ -187,7 +186,7 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
187 return __irqd_to_state(d) & mask; 186 return __irqd_to_state(d) & mask;
188} 187}
189 188
190static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc) 189static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
191{ 190{
192 __this_cpu_inc(*desc->kstat_irqs); 191 __this_cpu_inc(*desc->kstat_irqs);
193 __this_cpu_inc(kstat.irqs_sum); 192 __this_cpu_inc(kstat.irqs_sum);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 4afc457613dd..0a2a4b697bcb 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -582,7 +582,7 @@ int irq_set_percpu_devid(unsigned int irq)
582 582
583void kstat_incr_irq_this_cpu(unsigned int irq) 583void kstat_incr_irq_this_cpu(unsigned int irq)
584{ 584{
585 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); 585 kstat_incr_irqs_this_cpu(irq_to_desc(irq));
586} 586}
587 587
588/** 588/**
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 8c3577fef78c..79baaf8a7813 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -187,10 +187,12 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
187EXPORT_SYMBOL_GPL(irq_domain_add_legacy); 187EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
188 188
189/** 189/**
190 * irq_find_host() - Locates a domain for a given device node 190 * irq_find_matching_host() - Locates a domain for a given device node
191 * @node: device-tree node of the interrupt controller 191 * @node: device-tree node of the interrupt controller
192 * @bus_token: domain-specific data
192 */ 193 */
193struct irq_domain *irq_find_host(struct device_node *node) 194struct irq_domain *irq_find_matching_host(struct device_node *node,
195 enum irq_domain_bus_token bus_token)
194{ 196{
195 struct irq_domain *h, *found = NULL; 197 struct irq_domain *h, *found = NULL;
196 int rc; 198 int rc;
@@ -199,13 +201,19 @@ struct irq_domain *irq_find_host(struct device_node *node)
199 * it might potentially be set to match all interrupts in 201 * it might potentially be set to match all interrupts in
200 * the absence of a device node. This isn't a problem so far 202 * the absence of a device node. This isn't a problem so far
201 * yet though... 203 * yet though...
204 *
205 * bus_token == DOMAIN_BUS_ANY matches any domain, any other
206 * values must generate an exact match for the domain to be
207 * selected.
202 */ 208 */
203 mutex_lock(&irq_domain_mutex); 209 mutex_lock(&irq_domain_mutex);
204 list_for_each_entry(h, &irq_domain_list, link) { 210 list_for_each_entry(h, &irq_domain_list, link) {
205 if (h->ops->match) 211 if (h->ops->match)
206 rc = h->ops->match(h, node); 212 rc = h->ops->match(h, node, bus_token);
207 else 213 else
208 rc = (h->of_node != NULL) && (h->of_node == node); 214 rc = ((h->of_node != NULL) && (h->of_node == node) &&
215 ((bus_token == DOMAIN_BUS_ANY) ||
216 (h->bus_token == bus_token)));
209 217
210 if (rc) { 218 if (rc) {
211 found = h; 219 found = h;
@@ -215,7 +223,7 @@ struct irq_domain *irq_find_host(struct device_node *node)
215 mutex_unlock(&irq_domain_mutex); 223 mutex_unlock(&irq_domain_mutex);
216 return found; 224 return found;
217} 225}
218EXPORT_SYMBOL_GPL(irq_find_host); 226EXPORT_SYMBOL_GPL(irq_find_matching_host);
219 227
220/** 228/**
221 * irq_set_default_host() - Set a "default" irq domain 229 * irq_set_default_host() - Set a "default" irq domain
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index f9744853b656..ad1b064f94fe 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -115,6 +115,14 @@ EXPORT_SYMBOL(synchronize_irq);
115#ifdef CONFIG_SMP 115#ifdef CONFIG_SMP
116cpumask_var_t irq_default_affinity; 116cpumask_var_t irq_default_affinity;
117 117
118static int __irq_can_set_affinity(struct irq_desc *desc)
119{
120 if (!desc || !irqd_can_balance(&desc->irq_data) ||
121 !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
122 return 0;
123 return 1;
124}
125
118/** 126/**
119 * irq_can_set_affinity - Check if the affinity of a given irq can be set 127 * irq_can_set_affinity - Check if the affinity of a given irq can be set
120 * @irq: Interrupt to check 128 * @irq: Interrupt to check
@@ -122,13 +130,7 @@ cpumask_var_t irq_default_affinity;
122 */ 130 */
123int irq_can_set_affinity(unsigned int irq) 131int irq_can_set_affinity(unsigned int irq)
124{ 132{
125 struct irq_desc *desc = irq_to_desc(irq); 133 return __irq_can_set_affinity(irq_to_desc(irq));
126
127 if (!desc || !irqd_can_balance(&desc->irq_data) ||
128 !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
129 return 0;
130
131 return 1;
132} 134}
133 135
134/** 136/**
@@ -359,14 +361,13 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
359/* 361/*
360 * Generic version of the affinity autoselector. 362 * Generic version of the affinity autoselector.
361 */ 363 */
362static int 364static int setup_affinity(struct irq_desc *desc, struct cpumask *mask)
363setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
364{ 365{
365 struct cpumask *set = irq_default_affinity; 366 struct cpumask *set = irq_default_affinity;
366 int node = irq_desc_get_node(desc); 367 int node = irq_desc_get_node(desc);
367 368
368 /* Excludes PER_CPU and NO_BALANCE interrupts */ 369 /* Excludes PER_CPU and NO_BALANCE interrupts */
369 if (!irq_can_set_affinity(irq)) 370 if (!__irq_can_set_affinity(desc))
370 return 0; 371 return 0;
371 372
372 /* 373 /*
@@ -393,10 +394,10 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
393 return 0; 394 return 0;
394} 395}
395#else 396#else
396static inline int 397/* Wrapper for ALPHA specific affinity selector magic */
397setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask) 398static inline int setup_affinity(struct irq_desc *d, struct cpumask *mask)
398{ 399{
399 return irq_select_affinity(irq); 400 return irq_select_affinity(irq_desc_get_irq(d));
400} 401}
401#endif 402#endif
402 403
@@ -410,20 +411,20 @@ int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask)
410 int ret; 411 int ret;
411 412
412 raw_spin_lock_irqsave(&desc->lock, flags); 413 raw_spin_lock_irqsave(&desc->lock, flags);
413 ret = setup_affinity(irq, desc, mask); 414 ret = setup_affinity(desc, mask);
414 raw_spin_unlock_irqrestore(&desc->lock, flags); 415 raw_spin_unlock_irqrestore(&desc->lock, flags);
415 return ret; 416 return ret;
416} 417}
417 418
418#else 419#else
419static inline int 420static inline int
420setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) 421setup_affinity(struct irq_desc *desc, struct cpumask *mask)
421{ 422{
422 return 0; 423 return 0;
423} 424}
424#endif 425#endif
425 426
426void __disable_irq(struct irq_desc *desc, unsigned int irq) 427void __disable_irq(struct irq_desc *desc)
427{ 428{
428 if (!desc->depth++) 429 if (!desc->depth++)
429 irq_disable(desc); 430 irq_disable(desc);
@@ -436,7 +437,7 @@ static int __disable_irq_nosync(unsigned int irq)
436 437
437 if (!desc) 438 if (!desc)
438 return -EINVAL; 439 return -EINVAL;
439 __disable_irq(desc, irq); 440 __disable_irq(desc);
440 irq_put_desc_busunlock(desc, flags); 441 irq_put_desc_busunlock(desc, flags);
441 return 0; 442 return 0;
442} 443}
@@ -503,12 +504,13 @@ bool disable_hardirq(unsigned int irq)
503} 504}
504EXPORT_SYMBOL_GPL(disable_hardirq); 505EXPORT_SYMBOL_GPL(disable_hardirq);
505 506
506void __enable_irq(struct irq_desc *desc, unsigned int irq) 507void __enable_irq(struct irq_desc *desc)
507{ 508{
508 switch (desc->depth) { 509 switch (desc->depth) {
509 case 0: 510 case 0:
510 err_out: 511 err_out:
511 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); 512 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n",
513 irq_desc_get_irq(desc));
512 break; 514 break;
513 case 1: { 515 case 1: {
514 if (desc->istate & IRQS_SUSPENDED) 516 if (desc->istate & IRQS_SUSPENDED)
@@ -516,7 +518,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq)
516 /* Prevent probing on this irq: */ 518 /* Prevent probing on this irq: */
517 irq_settings_set_noprobe(desc); 519 irq_settings_set_noprobe(desc);
518 irq_enable(desc); 520 irq_enable(desc);
519 check_irq_resend(desc, irq); 521 check_irq_resend(desc);
520 /* fall-through */ 522 /* fall-through */
521 } 523 }
522 default: 524 default:
@@ -546,7 +548,7 @@ void enable_irq(unsigned int irq)
546 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) 548 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
547 goto out; 549 goto out;
548 550
549 __enable_irq(desc, irq); 551 __enable_irq(desc);
550out: 552out:
551 irq_put_desc_busunlock(desc, flags); 553 irq_put_desc_busunlock(desc, flags);
552} 554}
@@ -637,8 +639,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
637 return canrequest; 639 return canrequest;
638} 640}
639 641
640int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 642int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
641 unsigned long flags)
642{ 643{
643 struct irq_chip *chip = desc->irq_data.chip; 644 struct irq_chip *chip = desc->irq_data.chip;
644 int ret, unmask = 0; 645 int ret, unmask = 0;
@@ -648,7 +649,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
648 * IRQF_TRIGGER_* but the PIC does not support multiple 649 * IRQF_TRIGGER_* but the PIC does not support multiple
649 * flow-types? 650 * flow-types?
650 */ 651 */
651 pr_debug("No set_type function for IRQ %d (%s)\n", irq, 652 pr_debug("No set_type function for IRQ %d (%s)\n",
653 irq_desc_get_irq(desc),
652 chip ? (chip->name ? : "unknown") : "unknown"); 654 chip ? (chip->name ? : "unknown") : "unknown");
653 return 0; 655 return 0;
654 } 656 }
@@ -685,7 +687,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
685 break; 687 break;
686 default: 688 default:
687 pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", 689 pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n",
688 flags, irq, chip->irq_set_type); 690 flags, irq_desc_get_irq(desc), chip->irq_set_type);
689 } 691 }
690 if (unmask) 692 if (unmask)
691 unmask_irq(desc); 693 unmask_irq(desc);
@@ -1221,8 +1223,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1221 1223
1222 /* Setup the type (level, edge polarity) if configured: */ 1224 /* Setup the type (level, edge polarity) if configured: */
1223 if (new->flags & IRQF_TRIGGER_MASK) { 1225 if (new->flags & IRQF_TRIGGER_MASK) {
1224 ret = __irq_set_trigger(desc, irq, 1226 ret = __irq_set_trigger(desc,
1225 new->flags & IRQF_TRIGGER_MASK); 1227 new->flags & IRQF_TRIGGER_MASK);
1226 1228
1227 if (ret) 1229 if (ret)
1228 goto out_mask; 1230 goto out_mask;
@@ -1253,7 +1255,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1253 } 1255 }
1254 1256
1255 /* Set default affinity mask once everything is setup */ 1257 /* Set default affinity mask once everything is setup */
1256 setup_affinity(irq, desc, mask); 1258 setup_affinity(desc, mask);
1257 1259
1258 } else if (new->flags & IRQF_TRIGGER_MASK) { 1260 } else if (new->flags & IRQF_TRIGGER_MASK) {
1259 unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; 1261 unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
@@ -1280,7 +1282,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1280 */ 1282 */
1281 if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { 1283 if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
1282 desc->istate &= ~IRQS_SPURIOUS_DISABLED; 1284 desc->istate &= ~IRQS_SPURIOUS_DISABLED;
1283 __enable_irq(desc, irq); 1285 __enable_irq(desc);
1284 } 1286 }
1285 1287
1286 raw_spin_unlock_irqrestore(&desc->lock, flags); 1288 raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -1650,7 +1652,7 @@ void enable_percpu_irq(unsigned int irq, unsigned int type)
1650 if (type != IRQ_TYPE_NONE) { 1652 if (type != IRQ_TYPE_NONE) {
1651 int ret; 1653 int ret;
1652 1654
1653 ret = __irq_set_trigger(desc, irq, type); 1655 ret = __irq_set_trigger(desc, type);
1654 1656
1655 if (ret) { 1657 if (ret) {
1656 WARN(1, "failed to set type for IRQ%d\n", irq); 1658 WARN(1, "failed to set type for IRQ%d\n", irq);
@@ -1875,6 +1877,7 @@ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
1875 irq_put_desc_busunlock(desc, flags); 1877 irq_put_desc_busunlock(desc, flags);
1876 return err; 1878 return err;
1877} 1879}
1880EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
1878 1881
1879/** 1882/**
1880 * irq_set_irqchip_state - set the state of a forwarded interrupt. 1883 * irq_set_irqchip_state - set the state of a forwarded interrupt.
@@ -1920,3 +1923,4 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
1920 irq_put_desc_busunlock(desc, flags); 1923 irq_put_desc_busunlock(desc, flags);
1921 return err; 1924 return err;
1922} 1925}
1926EXPORT_SYMBOL_GPL(irq_set_irqchip_state);
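
For illustration, the two EXPORT_SYMBOL_GPL additions make the irqchip-state helpers usable from modular code; a hedged sketch of a driver checking and clearing a pending forwarded interrupt (the function and IRQ usage are made up, the helper signatures are the kernel's):

#include <linux/interrupt.h>

static void example_clear_pending(unsigned int irq)
{
	bool pending = false;

	if (!irq_get_irqchip_state(irq, IRQCHIP_STATE_PENDING, &pending) &&
	    pending)
		irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, false);
}
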
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 7bf1f1bbb7fa..7e6512b9dc1f 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -18,6 +18,23 @@
18/* Temporary solution for building, will be removed later */ 18/* Temporary solution for building, will be removed later */
19#include <linux/pci.h> 19#include <linux/pci.h>
20 20
21struct msi_desc *alloc_msi_entry(struct device *dev)
22{
23 struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
24 if (!desc)
25 return NULL;
26
27 INIT_LIST_HEAD(&desc->list);
28 desc->dev = dev;
29
30 return desc;
31}
32
33void free_msi_entry(struct msi_desc *entry)
34{
35 kfree(entry);
36}
37
21void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg) 38void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
22{ 39{
23 *msg = entry->msg; 40 *msg = entry->msg;
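
For illustration, a hedged sketch of how a bus driver might pair the new alloc_msi_entry()/free_msi_entry() constructors; the helper functions and the caller-owned list are assumptions, only the two kernel entry points come from the hunk above:

#include <linux/msi.h>
#include <linux/list.h>

static int example_add_desc(struct device *dev, struct list_head *msi_list)
{
	struct msi_desc *desc = alloc_msi_entry(dev);

	if (!desc)
		return -ENOMEM;
	/* alloc_msi_entry() zeroes the descriptor, sets ->dev and inits ->list */
	list_add_tail(&desc->list, msi_list);
	return 0;
}

static void example_free_descs(struct list_head *msi_list)
{
	struct msi_desc *desc, *tmp;

	list_for_each_entry_safe(desc, tmp, msi_list, list) {
		list_del(&desc->list);
		free_msi_entry(desc);
	}
}
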
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index d22786a6dbde..21c62617a35a 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -68,7 +68,7 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action)
68 desc->cond_suspend_depth--; 68 desc->cond_suspend_depth--;
69} 69}
70 70
71static bool suspend_device_irq(struct irq_desc *desc, int irq) 71static bool suspend_device_irq(struct irq_desc *desc)
72{ 72{
73 if (!desc->action || desc->no_suspend_depth) 73 if (!desc->action || desc->no_suspend_depth)
74 return false; 74 return false;
@@ -85,7 +85,7 @@ static bool suspend_device_irq(struct irq_desc *desc, int irq)
85 } 85 }
86 86
87 desc->istate |= IRQS_SUSPENDED; 87 desc->istate |= IRQS_SUSPENDED;
88 __disable_irq(desc, irq); 88 __disable_irq(desc);
89 89
90 /* 90 /*
91 * Hardware which has no wakeup source configuration facility 91 * Hardware which has no wakeup source configuration facility
@@ -126,7 +126,7 @@ void suspend_device_irqs(void)
126 if (irq_settings_is_nested_thread(desc)) 126 if (irq_settings_is_nested_thread(desc))
127 continue; 127 continue;
128 raw_spin_lock_irqsave(&desc->lock, flags); 128 raw_spin_lock_irqsave(&desc->lock, flags);
129 sync = suspend_device_irq(desc, irq); 129 sync = suspend_device_irq(desc);
130 raw_spin_unlock_irqrestore(&desc->lock, flags); 130 raw_spin_unlock_irqrestore(&desc->lock, flags);
131 131
132 if (sync) 132 if (sync)
@@ -135,7 +135,7 @@ void suspend_device_irqs(void)
135} 135}
136EXPORT_SYMBOL_GPL(suspend_device_irqs); 136EXPORT_SYMBOL_GPL(suspend_device_irqs);
137 137
138static void resume_irq(struct irq_desc *desc, int irq) 138static void resume_irq(struct irq_desc *desc)
139{ 139{
140 irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); 140 irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
141 141
@@ -150,7 +150,7 @@ static void resume_irq(struct irq_desc *desc, int irq)
150 desc->depth++; 150 desc->depth++;
151resume: 151resume:
152 desc->istate &= ~IRQS_SUSPENDED; 152 desc->istate &= ~IRQS_SUSPENDED;
153 __enable_irq(desc, irq); 153 __enable_irq(desc);
154} 154}
155 155
156static void resume_irqs(bool want_early) 156static void resume_irqs(bool want_early)
@@ -169,7 +169,7 @@ static void resume_irqs(bool want_early)
169 continue; 169 continue;
170 170
171 raw_spin_lock_irqsave(&desc->lock, flags); 171 raw_spin_lock_irqsave(&desc->lock, flags);
172 resume_irq(desc, irq); 172 resume_irq(desc);
173 raw_spin_unlock_irqrestore(&desc->lock, flags); 173 raw_spin_unlock_irqrestore(&desc->lock, flags);
174 } 174 }
175} 175}
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 9065107f083e..dd95f44f99b2 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -53,7 +53,7 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
53 * 53 *
54 * Is called with interrupts disabled and desc->lock held. 54 * Is called with interrupts disabled and desc->lock held.
55 */ 55 */
56void check_irq_resend(struct irq_desc *desc, unsigned int irq) 56void check_irq_resend(struct irq_desc *desc)
57{ 57{
58 /* 58 /*
59 * We do not resend level type interrupts. Level type 59 * We do not resend level type interrupts. Level type
@@ -74,14 +74,24 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
74 if (!desc->irq_data.chip->irq_retrigger || 74 if (!desc->irq_data.chip->irq_retrigger ||
75 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { 75 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
76#ifdef CONFIG_HARDIRQS_SW_RESEND 76#ifdef CONFIG_HARDIRQS_SW_RESEND
77 unsigned int irq = irq_desc_get_irq(desc);
78
77 /* 79 /*
78 * If the interrupt has a parent irq and runs 80 * If the interrupt is running in the thread
79 * in the thread context of the parent irq, 81 * context of the parent irq we need to be
80 * retrigger the parent. 82 * careful, because we cannot trigger it
83 * directly.
81 */ 84 */
82 if (desc->parent_irq && 85 if (irq_settings_is_nested_thread(desc)) {
83 irq_settings_is_nested_thread(desc)) 86 /*
87 * If the parent_irq is valid, we
88 * retrigger the parent, otherwise we
89 * do nothing.
90 */
91 if (!desc->parent_irq)
92 return;
84 irq = desc->parent_irq; 93 irq = desc->parent_irq;
94 }
85 /* Set it pending and activate the softirq: */ 95 /* Set it pending and activate the softirq: */
86 set_bit(irq, irqs_resend); 96 set_bit(irq, irqs_resend);
87 tasklet_schedule(&resend_tasklet); 97 tasklet_schedule(&resend_tasklet);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index e2514b0e439e..32144175458d 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -60,7 +60,7 @@ bool irq_wait_for_poll(struct irq_desc *desc)
60/* 60/*
61 * Recovery handler for misrouted interrupts. 61 * Recovery handler for misrouted interrupts.
62 */ 62 */
63static int try_one_irq(int irq, struct irq_desc *desc, bool force) 63static int try_one_irq(struct irq_desc *desc, bool force)
64{ 64{
65 irqreturn_t ret = IRQ_NONE; 65 irqreturn_t ret = IRQ_NONE;
66 struct irqaction *action; 66 struct irqaction *action;
@@ -133,7 +133,7 @@ static int misrouted_irq(int irq)
133 if (i == irq) /* Already tried */ 133 if (i == irq) /* Already tried */
134 continue; 134 continue;
135 135
136 if (try_one_irq(i, desc, false)) 136 if (try_one_irq(desc, false))
137 ok = 1; 137 ok = 1;
138 } 138 }
139out: 139out:
@@ -164,7 +164,7 @@ static void poll_spurious_irqs(unsigned long dummy)
164 continue; 164 continue;
165 165
166 local_irq_disable(); 166 local_irq_disable();
167 try_one_irq(i, desc, true); 167 try_one_irq(desc, true);
168 local_irq_enable(); 168 local_irq_enable();
169 } 169 }
170out: 170out:
@@ -188,10 +188,9 @@ static inline int bad_action_ret(irqreturn_t action_ret)
188 * (The other 100-of-100,000 interrupts may have been a correctly 188 * (The other 100-of-100,000 interrupts may have been a correctly
189 * functioning device sharing an IRQ with the failing one) 189 * functioning device sharing an IRQ with the failing one)
190 */ 190 */
191static void 191static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
192__report_bad_irq(unsigned int irq, struct irq_desc *desc,
193 irqreturn_t action_ret)
194{ 192{
193 unsigned int irq = irq_desc_get_irq(desc);
195 struct irqaction *action; 194 struct irqaction *action;
196 unsigned long flags; 195 unsigned long flags;
197 196
@@ -224,14 +223,13 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
224 raw_spin_unlock_irqrestore(&desc->lock, flags); 223 raw_spin_unlock_irqrestore(&desc->lock, flags);
225} 224}
226 225
227static void 226static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
228report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)
229{ 227{
230 static int count = 100; 228 static int count = 100;
231 229
232 if (count > 0) { 230 if (count > 0) {
233 count--; 231 count--;
234 __report_bad_irq(irq, desc, action_ret); 232 __report_bad_irq(desc, action_ret);
235 } 233 }
236} 234}
237 235
@@ -272,15 +270,16 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
272 270
273#define SPURIOUS_DEFERRED 0x80000000 271#define SPURIOUS_DEFERRED 0x80000000
274 272
275void note_interrupt(unsigned int irq, struct irq_desc *desc, 273void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
276 irqreturn_t action_ret)
277{ 274{
275 unsigned int irq;
276
278 if (desc->istate & IRQS_POLL_INPROGRESS || 277 if (desc->istate & IRQS_POLL_INPROGRESS ||
279 irq_settings_is_polled(desc)) 278 irq_settings_is_polled(desc))
280 return; 279 return;
281 280
282 if (bad_action_ret(action_ret)) { 281 if (bad_action_ret(action_ret)) {
283 report_bad_irq(irq, desc, action_ret); 282 report_bad_irq(desc, action_ret);
284 return; 283 return;
285 } 284 }
286 285
@@ -398,6 +397,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
398 desc->last_unhandled = jiffies; 397 desc->last_unhandled = jiffies;
399 } 398 }
400 399
400 irq = irq_desc_get_irq(desc);
401 if (unlikely(try_misrouted_irq(irq, desc, action_ret))) { 401 if (unlikely(try_misrouted_irq(irq, desc, action_ret))) {
402 int ok = misrouted_irq(irq); 402 int ok = misrouted_irq(irq);
403 if (action_ret == IRQ_NONE) 403 if (action_ret == IRQ_NONE)
@@ -413,7 +413,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
413 /* 413 /*
414 * The interrupt is stuck 414 * The interrupt is stuck
415 */ 415 */
416 __report_bad_irq(irq, desc, action_ret); 416 __report_bad_irq(desc, action_ret);
417 /* 417 /*
418 * Now kill the IRQ 418 * Now kill the IRQ
419 */ 419 */
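After this change note_interrupt() and the bad-IRQ reporters take only the descriptor; the Linux interrupt number is recovered internally with irq_desc_get_irq(), as the hunk shows. A hedged sketch of the caller side; the function here is illustrative (the real call sites live in the flow-handler code, which is not part of this hunk), and the noirqdebug check mirrors how the spurious-IRQ machinery is normally gated:

	static irqreturn_t example_handle_event(struct irq_desc *desc)
	{
		irqreturn_t retval = IRQ_NONE;

		/* ... run the irqaction chain and accumulate retval ... */

		if (!noirqdebug)
			note_interrupt(desc, retval);	/* previously note_interrupt(irq, desc, retval) */

		return retval;
	}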
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 52ebaca1b9fc..f7dd15d537f9 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -54,7 +54,7 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
54 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); 54 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
55} 55}
56 56
57static void jump_label_update(struct static_key *key, int enable); 57static void jump_label_update(struct static_key *key);
58 58
59void static_key_slow_inc(struct static_key *key) 59void static_key_slow_inc(struct static_key *key)
60{ 60{
@@ -63,13 +63,8 @@ void static_key_slow_inc(struct static_key *key)
63 return; 63 return;
64 64
65 jump_label_lock(); 65 jump_label_lock();
66 if (atomic_read(&key->enabled) == 0) { 66 if (atomic_inc_return(&key->enabled) == 1)
67 if (!jump_label_get_branch_default(key)) 67 jump_label_update(key);
68 jump_label_update(key, JUMP_LABEL_ENABLE);
69 else
70 jump_label_update(key, JUMP_LABEL_DISABLE);
71 }
72 atomic_inc(&key->enabled);
73 jump_label_unlock(); 68 jump_label_unlock();
74} 69}
75EXPORT_SYMBOL_GPL(static_key_slow_inc); 70EXPORT_SYMBOL_GPL(static_key_slow_inc);
@@ -87,10 +82,7 @@ static void __static_key_slow_dec(struct static_key *key,
87 atomic_inc(&key->enabled); 82 atomic_inc(&key->enabled);
88 schedule_delayed_work(work, rate_limit); 83 schedule_delayed_work(work, rate_limit);
89 } else { 84 } else {
90 if (!jump_label_get_branch_default(key)) 85 jump_label_update(key);
91 jump_label_update(key, JUMP_LABEL_DISABLE);
92 else
93 jump_label_update(key, JUMP_LABEL_ENABLE);
94 } 86 }
95 jump_label_unlock(); 87 jump_label_unlock();
96} 88}
@@ -149,7 +141,7 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
149 return 0; 141 return 0;
150} 142}
151 143
152/* 144/*
153 * Update code which is definitely not currently executing. 145 * Update code which is definitely not currently executing.
154 * Architectures which need heavyweight synchronization to modify 146 * Architectures which need heavyweight synchronization to modify
155 * running code can override this to make the non-live update case 147 * running code can override this to make the non-live update case
@@ -158,37 +150,54 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
158void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry, 150void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
159 enum jump_label_type type) 151 enum jump_label_type type)
160{ 152{
161 arch_jump_label_transform(entry, type); 153 arch_jump_label_transform(entry, type);
154}
155
156static inline struct jump_entry *static_key_entries(struct static_key *key)
157{
158 return (struct jump_entry *)((unsigned long)key->entries & ~JUMP_TYPE_MASK);
159}
160
161static inline bool static_key_type(struct static_key *key)
162{
163 return (unsigned long)key->entries & JUMP_TYPE_MASK;
164}
165
166static inline struct static_key *jump_entry_key(struct jump_entry *entry)
167{
168 return (struct static_key *)((unsigned long)entry->key & ~1UL);
169}
170
171static bool jump_entry_branch(struct jump_entry *entry)
172{
173 return (unsigned long)entry->key & 1UL;
174}
175
176static enum jump_label_type jump_label_type(struct jump_entry *entry)
177{
178 struct static_key *key = jump_entry_key(entry);
179 bool enabled = static_key_enabled(key);
180 bool branch = jump_entry_branch(entry);
181
182 /* See the comment in linux/jump_label.h */
183 return enabled ^ branch;
162} 184}
163 185
164static void __jump_label_update(struct static_key *key, 186static void __jump_label_update(struct static_key *key,
165 struct jump_entry *entry, 187 struct jump_entry *entry,
166 struct jump_entry *stop, int enable) 188 struct jump_entry *stop)
167{ 189{
168 for (; (entry < stop) && 190 for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) {
169 (entry->key == (jump_label_t)(unsigned long)key);
170 entry++) {
171 /* 191 /*
172 * entry->code set to 0 invalidates module init text sections 192 * entry->code set to 0 invalidates module init text sections
173 * kernel_text_address() verifies we are not in core kernel 193 * kernel_text_address() verifies we are not in core kernel
174 * init code, see jump_label_invalidate_module_init(). 194 * init code, see jump_label_invalidate_module_init().
175 */ 195 */
176 if (entry->code && kernel_text_address(entry->code)) 196 if (entry->code && kernel_text_address(entry->code))
177 arch_jump_label_transform(entry, enable); 197 arch_jump_label_transform(entry, jump_label_type(entry));
178 } 198 }
179} 199}
180 200
181static enum jump_label_type jump_label_type(struct static_key *key)
182{
183 bool true_branch = jump_label_get_branch_default(key);
184 bool state = static_key_enabled(key);
185
186 if ((!true_branch && state) || (true_branch && !state))
187 return JUMP_LABEL_ENABLE;
188
189 return JUMP_LABEL_DISABLE;
190}
191
192void __init jump_label_init(void) 201void __init jump_label_init(void)
193{ 202{
194 struct jump_entry *iter_start = __start___jump_table; 203 struct jump_entry *iter_start = __start___jump_table;
@@ -202,8 +211,11 @@ void __init jump_label_init(void)
202 for (iter = iter_start; iter < iter_stop; iter++) { 211 for (iter = iter_start; iter < iter_stop; iter++) {
203 struct static_key *iterk; 212 struct static_key *iterk;
204 213
205 iterk = (struct static_key *)(unsigned long)iter->key; 214 /* rewrite NOPs */
206 arch_jump_label_transform_static(iter, jump_label_type(iterk)); 215 if (jump_label_type(iter) == JUMP_LABEL_NOP)
216 arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
217
218 iterk = jump_entry_key(iter);
207 if (iterk == key) 219 if (iterk == key)
208 continue; 220 continue;
209 221
@@ -222,6 +234,16 @@ void __init jump_label_init(void)
222 234
223#ifdef CONFIG_MODULES 235#ifdef CONFIG_MODULES
224 236
237static enum jump_label_type jump_label_init_type(struct jump_entry *entry)
238{
239 struct static_key *key = jump_entry_key(entry);
240 bool type = static_key_type(key);
241 bool branch = jump_entry_branch(entry);
242
243 /* See the comment in linux/jump_label.h */
244 return type ^ branch;
245}
246
225struct static_key_mod { 247struct static_key_mod {
226 struct static_key_mod *next; 248 struct static_key_mod *next;
227 struct jump_entry *entries; 249 struct jump_entry *entries;
@@ -243,17 +265,15 @@ static int __jump_label_mod_text_reserved(void *start, void *end)
243 start, end); 265 start, end);
244} 266}
245 267
246static void __jump_label_mod_update(struct static_key *key, int enable) 268static void __jump_label_mod_update(struct static_key *key)
247{ 269{
248 struct static_key_mod *mod = key->next; 270 struct static_key_mod *mod;
249 271
250 while (mod) { 272 for (mod = key->next; mod; mod = mod->next) {
251 struct module *m = mod->mod; 273 struct module *m = mod->mod;
252 274
253 __jump_label_update(key, mod->entries, 275 __jump_label_update(key, mod->entries,
254 m->jump_entries + m->num_jump_entries, 276 m->jump_entries + m->num_jump_entries);
255 enable);
256 mod = mod->next;
257 } 277 }
258} 278}
259 279
@@ -276,7 +296,9 @@ void jump_label_apply_nops(struct module *mod)
276 return; 296 return;
277 297
278 for (iter = iter_start; iter < iter_stop; iter++) { 298 for (iter = iter_start; iter < iter_stop; iter++) {
279 arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); 299 /* Only write NOPs for arch_branch_static(). */
300 if (jump_label_init_type(iter) == JUMP_LABEL_NOP)
301 arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
280 } 302 }
281} 303}
282 304
@@ -297,7 +319,7 @@ static int jump_label_add_module(struct module *mod)
297 for (iter = iter_start; iter < iter_stop; iter++) { 319 for (iter = iter_start; iter < iter_stop; iter++) {
298 struct static_key *iterk; 320 struct static_key *iterk;
299 321
300 iterk = (struct static_key *)(unsigned long)iter->key; 322 iterk = jump_entry_key(iter);
301 if (iterk == key) 323 if (iterk == key)
302 continue; 324 continue;
303 325
@@ -318,8 +340,9 @@ static int jump_label_add_module(struct module *mod)
318 jlm->next = key->next; 340 jlm->next = key->next;
319 key->next = jlm; 341 key->next = jlm;
320 342
321 if (jump_label_type(key) == JUMP_LABEL_ENABLE) 343 /* Only update if we've changed from our initial state */
322 __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); 344 if (jump_label_type(iter) != jump_label_init_type(iter))
345 __jump_label_update(key, iter, iter_stop);
323 } 346 }
324 347
325 return 0; 348 return 0;
@@ -334,10 +357,10 @@ static void jump_label_del_module(struct module *mod)
334 struct static_key_mod *jlm, **prev; 357 struct static_key_mod *jlm, **prev;
335 358
336 for (iter = iter_start; iter < iter_stop; iter++) { 359 for (iter = iter_start; iter < iter_stop; iter++) {
337 if (iter->key == (jump_label_t)(unsigned long)key) 360 if (jump_entry_key(iter) == key)
338 continue; 361 continue;
339 362
340 key = (struct static_key *)(unsigned long)iter->key; 363 key = jump_entry_key(iter);
341 364
342 if (within_module(iter->key, mod)) 365 if (within_module(iter->key, mod))
343 continue; 366 continue;
@@ -439,14 +462,14 @@ int jump_label_text_reserved(void *start, void *end)
439 return ret; 462 return ret;
440} 463}
441 464
442static void jump_label_update(struct static_key *key, int enable) 465static void jump_label_update(struct static_key *key)
443{ 466{
444 struct jump_entry *stop = __stop___jump_table; 467 struct jump_entry *stop = __stop___jump_table;
445 struct jump_entry *entry = jump_label_get_entries(key); 468 struct jump_entry *entry = static_key_entries(key);
446#ifdef CONFIG_MODULES 469#ifdef CONFIG_MODULES
447 struct module *mod; 470 struct module *mod;
448 471
449 __jump_label_mod_update(key, enable); 472 __jump_label_mod_update(key);
450 473
451 preempt_disable(); 474 preempt_disable();
452 mod = __module_address((unsigned long)key); 475 mod = __module_address((unsigned long)key);
@@ -456,7 +479,44 @@ static void jump_label_update(struct static_key *key, int enable)
456#endif 479#endif
457 /* if there are no users, entry can be NULL */ 480 /* if there are no users, entry can be NULL */
458 if (entry) 481 if (entry)
459 __jump_label_update(key, entry, stop, enable); 482 __jump_label_update(key, entry, stop);
460} 483}
461 484
462#endif 485#ifdef CONFIG_STATIC_KEYS_SELFTEST
486static DEFINE_STATIC_KEY_TRUE(sk_true);
487static DEFINE_STATIC_KEY_FALSE(sk_false);
488
489static __init int jump_label_test(void)
490{
491 int i;
492
493 for (i = 0; i < 2; i++) {
494 WARN_ON(static_key_enabled(&sk_true.key) != true);
495 WARN_ON(static_key_enabled(&sk_false.key) != false);
496
497 WARN_ON(!static_branch_likely(&sk_true));
498 WARN_ON(!static_branch_unlikely(&sk_true));
499 WARN_ON(static_branch_likely(&sk_false));
500 WARN_ON(static_branch_unlikely(&sk_false));
501
502 static_branch_disable(&sk_true);
503 static_branch_enable(&sk_false);
504
505 WARN_ON(static_key_enabled(&sk_true.key) == true);
506 WARN_ON(static_key_enabled(&sk_false.key) == false);
507
508 WARN_ON(static_branch_likely(&sk_true));
509 WARN_ON(static_branch_unlikely(&sk_true));
510 WARN_ON(!static_branch_likely(&sk_false));
511 WARN_ON(!static_branch_unlikely(&sk_false));
512
513 static_branch_enable(&sk_true);
514 static_branch_disable(&sk_false);
515 }
516
517 return 0;
518}
519late_initcall(jump_label_test);
520#endif /* STATIC_KEYS_SELFTEST */
521
522#endif /* HAVE_JUMP_LABEL */
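The self-test above doubles as a usage guide for the reworked static-key code in this file. A condensed sketch of the same pattern, with a made-up key name, assuming <linux/jump_label.h> just as the self-test does:

	static DEFINE_STATIC_KEY_FALSE(use_fast_path);

	static void example_hot_path(void)
	{
		if (static_branch_unlikely(&use_fast_path)) {
			/* only patched in once example_enable() has run */
		}
	}

	static void example_enable(void)
	{
		static_branch_enable(&use_fast_path);	/* re-patches every site via jump_label_update() */
	}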
diff --git a/kernel/kexec.c b/kernel/kexec.c
index a785c1015e25..4c5edc357923 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1,156 +1,22 @@
1/* 1/*
2 * kexec.c - kexec system call 2 * kexec.c - kexec_load system call
3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> 3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
4 * 4 *
5 * This source code is licensed under the GNU General Public License, 5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details. 6 * Version 2. See the file COPYING for more details.
7 */ 7 */
8 8
9#define pr_fmt(fmt) "kexec: " fmt
10
11#include <linux/capability.h> 9#include <linux/capability.h>
12#include <linux/mm.h> 10#include <linux/mm.h>
13#include <linux/file.h> 11#include <linux/file.h>
14#include <linux/slab.h>
15#include <linux/fs.h>
16#include <linux/kexec.h> 12#include <linux/kexec.h>
17#include <linux/mutex.h> 13#include <linux/mutex.h>
18#include <linux/list.h> 14#include <linux/list.h>
19#include <linux/highmem.h>
20#include <linux/syscalls.h> 15#include <linux/syscalls.h>
21#include <linux/reboot.h>
22#include <linux/ioport.h>
23#include <linux/hardirq.h>
24#include <linux/elf.h>
25#include <linux/elfcore.h>
26#include <linux/utsname.h>
27#include <linux/numa.h>
28#include <linux/suspend.h>
29#include <linux/device.h>
30#include <linux/freezer.h>
31#include <linux/pm.h>
32#include <linux/cpu.h>
33#include <linux/console.h>
34#include <linux/vmalloc.h> 16#include <linux/vmalloc.h>
35#include <linux/swap.h> 17#include <linux/slab.h>
36#include <linux/syscore_ops.h>
37#include <linux/compiler.h>
38#include <linux/hugetlb.h>
39
40#include <asm/page.h>
41#include <asm/uaccess.h>
42#include <asm/io.h>
43#include <asm/sections.h>
44
45#include <crypto/hash.h>
46#include <crypto/sha.h>
47
48/* Per cpu memory for storing cpu states in case of system crash. */
49note_buf_t __percpu *crash_notes;
50
51/* vmcoreinfo stuff */
52static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
53u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
54size_t vmcoreinfo_size;
55size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
56
57/* Flag to indicate we are going to kexec a new kernel */
58bool kexec_in_progress = false;
59
60/*
61 * Declare these symbols weak so that if architecture provides a purgatory,
62 * these will be overridden.
63 */
64char __weak kexec_purgatory[0];
65size_t __weak kexec_purgatory_size = 0;
66
67#ifdef CONFIG_KEXEC_FILE
68static int kexec_calculate_store_digests(struct kimage *image);
69#endif
70
71/* Location of the reserved area for the crash kernel */
72struct resource crashk_res = {
73 .name = "Crash kernel",
74 .start = 0,
75 .end = 0,
76 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
77};
78struct resource crashk_low_res = {
79 .name = "Crash kernel",
80 .start = 0,
81 .end = 0,
82 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
83};
84
85int kexec_should_crash(struct task_struct *p)
86{
87 /*
88 * If crash_kexec_post_notifiers is enabled, don't run
89 * crash_kexec() here yet, which must be run after panic
90 * notifiers in panic().
91 */
92 if (crash_kexec_post_notifiers)
93 return 0;
94 /*
95 * There are 4 panic() calls in do_exit() path, each of which
96 * corresponds to each of these 4 conditions.
97 */
98 if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
99 return 1;
100 return 0;
101}
102
103/*
104 * When kexec transitions to the new kernel there is a one-to-one
105 * mapping between physical and virtual addresses. On processors
106 * where you can disable the MMU this is trivial, and easy. For
107 * others it is still a simple predictable page table to setup.
108 *
109 * In that environment kexec copies the new kernel to its final
110 * resting place. This means I can only support memory whose
111 * physical address can fit in an unsigned long. In particular
112 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
113 * If the assembly stub has more restrictive requirements
114 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
115 * defined more restrictively in <asm/kexec.h>.
116 *
117 * The code for the transition from the current kernel to
118 * the new kernel is placed in the control_code_buffer, whose size
119 * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
120 * page of memory is necessary, but some architectures require more.
121 * Because this memory must be identity mapped in the transition from
122 * virtual to physical addresses it must live in the range
123 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
124 * modifiable.
125 *
126 * The assembly stub in the control code buffer is passed a linked list
127 * of descriptor pages detailing the source pages of the new kernel,
128 * and the destination addresses of those source pages. As this data
129 * structure is not used in the context of the current OS, it must
130 * be self-contained.
131 *
132 * The code has been made to work with highmem pages and will use a
133 * destination page in its final resting place (if it happens
134 * to allocate it). The end product of this is that most of the
135 * physical address space, and most of RAM can be used.
136 *
137 * Future directions include:
138 * - allocating a page table with the control code buffer identity
139 * mapped, to simplify machine_kexec and make kexec_on_panic more
140 * reliable.
141 */
142
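/*
 * Rough illustration of the descriptor list described above; the addresses
 * are invented, the tags are the IND_* flags used by kimage_add_entry(),
 * kimage_set_destination(), kimage_add_page() and kimage_terminate() below:
 *
 *   0x20000000 | IND_DESTINATION   start copying to this physical address
 *   0x11000000 | IND_SOURCE        source page for 0x20000000
 *   0x11200000 | IND_SOURCE        source page for 0x20001000 (dest advances by PAGE_SIZE)
 *   0x13000000 | IND_INDIRECTION   continue reading entries from this page
 *   0          | IND_DONE          end of the list
 */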
143/*
144 * KIMAGE_NO_DEST is an impossible destination address..., for
145 * allocating pages whose destination address we do not care about.
146 */
147#define KIMAGE_NO_DEST (-1UL)
148 18
149static int kimage_is_destination_range(struct kimage *image, 19#include "kexec_internal.h"
150 unsigned long start, unsigned long end);
151static struct page *kimage_alloc_page(struct kimage *image,
152 gfp_t gfp_mask,
153 unsigned long dest);
154 20
155static int copy_user_segment_list(struct kimage *image, 21static int copy_user_segment_list(struct kimage *image,
156 unsigned long nr_segments, 22 unsigned long nr_segments,
@@ -169,125 +35,6 @@ static int copy_user_segment_list(struct kimage *image,
169 return ret; 35 return ret;
170} 36}
171 37
172static int sanity_check_segment_list(struct kimage *image)
173{
174 int result, i;
175 unsigned long nr_segments = image->nr_segments;
176
177 /*
178 * Verify we have good destination addresses. The caller is
179 * responsible for making certain we don't attempt to load
180 * the new image into invalid or reserved areas of RAM. This
181 * just verifies it is an address we can use.
182 *
183 * Since the kernel does everything in page size chunks ensure
184 * the destination addresses are page aligned. Too many
185 * special cases crop up when we don't do this. The most
186 * insidious is getting overlapping destination addresses
187 * simply because addresses are changed to page size
188 * granularity.
189 */
190 result = -EADDRNOTAVAIL;
191 for (i = 0; i < nr_segments; i++) {
192 unsigned long mstart, mend;
193
194 mstart = image->segment[i].mem;
195 mend = mstart + image->segment[i].memsz;
196 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
197 return result;
198 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
199 return result;
200 }
201
202 /* Verify our destination addresses do not overlap.
203 * If we allowed overlapping destination addresses
204 * through, very weird things can happen with no
205 * easy explanation as one segment stops on another.
206 */
207 result = -EINVAL;
208 for (i = 0; i < nr_segments; i++) {
209 unsigned long mstart, mend;
210 unsigned long j;
211
212 mstart = image->segment[i].mem;
213 mend = mstart + image->segment[i].memsz;
214 for (j = 0; j < i; j++) {
215 unsigned long pstart, pend;
216 pstart = image->segment[j].mem;
217 pend = pstart + image->segment[j].memsz;
218 /* Do the segments overlap ? */
219 if ((mend > pstart) && (mstart < pend))
220 return result;
221 }
222 }
223
224 /* Ensure our buffer sizes are strictly less than
225 * our memory sizes. This should always be the case,
226 * and it is easier to check up front than to be surprised
227 * later on.
228 */
229 result = -EINVAL;
230 for (i = 0; i < nr_segments; i++) {
231 if (image->segment[i].bufsz > image->segment[i].memsz)
232 return result;
233 }
234
235 /*
236 * Verify we have good destination addresses. Normally
237 * the caller is responsible for making certain we don't
238 * attempt to load the new image into invalid or reserved
239 * areas of RAM. But crash kernels are preloaded into a
240 * reserved area of ram. We must ensure the addresses
241 * are in the reserved area otherwise preloading the
242 * kernel could corrupt things.
243 */
244
245 if (image->type == KEXEC_TYPE_CRASH) {
246 result = -EADDRNOTAVAIL;
247 for (i = 0; i < nr_segments; i++) {
248 unsigned long mstart, mend;
249
250 mstart = image->segment[i].mem;
251 mend = mstart + image->segment[i].memsz - 1;
252 /* Ensure we are within the crash kernel limits */
253 if ((mstart < crashk_res.start) ||
254 (mend > crashk_res.end))
255 return result;
256 }
257 }
258
259 return 0;
260}
261
262static struct kimage *do_kimage_alloc_init(void)
263{
264 struct kimage *image;
265
266 /* Allocate a controlling structure */
267 image = kzalloc(sizeof(*image), GFP_KERNEL);
268 if (!image)
269 return NULL;
270
271 image->head = 0;
272 image->entry = &image->head;
273 image->last_entry = &image->head;
274 image->control_page = ~0; /* By default this does not apply */
275 image->type = KEXEC_TYPE_DEFAULT;
276
277 /* Initialize the list of control pages */
278 INIT_LIST_HEAD(&image->control_pages);
279
280 /* Initialize the list of destination pages */
281 INIT_LIST_HEAD(&image->dest_pages);
282
283 /* Initialize the list of unusable pages */
284 INIT_LIST_HEAD(&image->unusable_pages);
285
286 return image;
287}
288
289static void kimage_free_page_list(struct list_head *list);
290
291static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, 38static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
292 unsigned long nr_segments, 39 unsigned long nr_segments,
293 struct kexec_segment __user *segments, 40 struct kexec_segment __user *segments,
@@ -354,873 +101,6 @@ out_free_image:
354 return ret; 101 return ret;
355} 102}
356 103
357#ifdef CONFIG_KEXEC_FILE
358static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
359{
360 struct fd f = fdget(fd);
361 int ret;
362 struct kstat stat;
363 loff_t pos;
364 ssize_t bytes = 0;
365
366 if (!f.file)
367 return -EBADF;
368
369 ret = vfs_getattr(&f.file->f_path, &stat);
370 if (ret)
371 goto out;
372
373 if (stat.size > INT_MAX) {
374 ret = -EFBIG;
375 goto out;
376 }
377
378 /* Don't hand 0 to vmalloc, it whines. */
379 if (stat.size == 0) {
380 ret = -EINVAL;
381 goto out;
382 }
383
384 *buf = vmalloc(stat.size);
385 if (!*buf) {
386 ret = -ENOMEM;
387 goto out;
388 }
389
390 pos = 0;
391 while (pos < stat.size) {
392 bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
393 stat.size - pos);
394 if (bytes < 0) {
395 vfree(*buf);
396 ret = bytes;
397 goto out;
398 }
399
400 if (bytes == 0)
401 break;
402 pos += bytes;
403 }
404
405 if (pos != stat.size) {
406 ret = -EBADF;
407 vfree(*buf);
408 goto out;
409 }
410
411 *buf_len = pos;
412out:
413 fdput(f);
414 return ret;
415}
416
417/* Architectures can provide this probe function */
418int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
419 unsigned long buf_len)
420{
421 return -ENOEXEC;
422}
423
424void * __weak arch_kexec_kernel_image_load(struct kimage *image)
425{
426 return ERR_PTR(-ENOEXEC);
427}
428
429void __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
430{
431}
432
433int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
434 unsigned long buf_len)
435{
436 return -EKEYREJECTED;
437}
438
439/* Apply relocations of type RELA */
440int __weak
441arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
442 unsigned int relsec)
443{
444 pr_err("RELA relocation unsupported.\n");
445 return -ENOEXEC;
446}
447
448/* Apply relocations of type REL */
449int __weak
450arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
451 unsigned int relsec)
452{
453 pr_err("REL relocation unsupported.\n");
454 return -ENOEXEC;
455}
456
457/*
458 * Free up memory used by kernel, initrd, and command line. This is temporary
459 * memory allocation which is not needed any more after these buffers have
460 * been loaded into separate segments and have been copied elsewhere.
461 */
462static void kimage_file_post_load_cleanup(struct kimage *image)
463{
464 struct purgatory_info *pi = &image->purgatory_info;
465
466 vfree(image->kernel_buf);
467 image->kernel_buf = NULL;
468
469 vfree(image->initrd_buf);
470 image->initrd_buf = NULL;
471
472 kfree(image->cmdline_buf);
473 image->cmdline_buf = NULL;
474
475 vfree(pi->purgatory_buf);
476 pi->purgatory_buf = NULL;
477
478 vfree(pi->sechdrs);
479 pi->sechdrs = NULL;
480
481 /* See if architecture has anything to cleanup post load */
482 arch_kimage_file_post_load_cleanup(image);
483
484 /*
485 * Above call should have called into bootloader to free up
486 * any data stored in kimage->image_loader_data. It should
487 * be ok now to free it up.
488 */
489 kfree(image->image_loader_data);
490 image->image_loader_data = NULL;
491}
492
493/*
494 * In file mode list of segments is prepared by kernel. Copy relevant
495 * data from user space, do error checking, prepare segment list
496 */
497static int
498kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
499 const char __user *cmdline_ptr,
500 unsigned long cmdline_len, unsigned flags)
501{
502 int ret = 0;
503 void *ldata;
504
505 ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
506 &image->kernel_buf_len);
507 if (ret)
508 return ret;
509
510 /* Call arch image probe handlers */
511 ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
512 image->kernel_buf_len);
513
514 if (ret)
515 goto out;
516
517#ifdef CONFIG_KEXEC_VERIFY_SIG
518 ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
519 image->kernel_buf_len);
520 if (ret) {
521 pr_debug("kernel signature verification failed.\n");
522 goto out;
523 }
524 pr_debug("kernel signature verification successful.\n");
525#endif
526 /* It is possible that no initramfs is being loaded */
527 if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
528 ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
529 &image->initrd_buf_len);
530 if (ret)
531 goto out;
532 }
533
534 if (cmdline_len) {
535 image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
536 if (!image->cmdline_buf) {
537 ret = -ENOMEM;
538 goto out;
539 }
540
541 ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
542 cmdline_len);
543 if (ret) {
544 ret = -EFAULT;
545 goto out;
546 }
547
548 image->cmdline_buf_len = cmdline_len;
549
550 /* command line should be a string with last byte null */
551 if (image->cmdline_buf[cmdline_len - 1] != '\0') {
552 ret = -EINVAL;
553 goto out;
554 }
555 }
556
557 /* Call arch image load handlers */
558 ldata = arch_kexec_kernel_image_load(image);
559
560 if (IS_ERR(ldata)) {
561 ret = PTR_ERR(ldata);
562 goto out;
563 }
564
565 image->image_loader_data = ldata;
566out:
567 /* In case of error, free up all allocated memory in this function */
568 if (ret)
569 kimage_file_post_load_cleanup(image);
570 return ret;
571}
572
573static int
574kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
575 int initrd_fd, const char __user *cmdline_ptr,
576 unsigned long cmdline_len, unsigned long flags)
577{
578 int ret;
579 struct kimage *image;
580 bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
581
582 image = do_kimage_alloc_init();
583 if (!image)
584 return -ENOMEM;
585
586 image->file_mode = 1;
587
588 if (kexec_on_panic) {
589 /* Enable special crash kernel control page alloc policy. */
590 image->control_page = crashk_res.start;
591 image->type = KEXEC_TYPE_CRASH;
592 }
593
594 ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
595 cmdline_ptr, cmdline_len, flags);
596 if (ret)
597 goto out_free_image;
598
599 ret = sanity_check_segment_list(image);
600 if (ret)
601 goto out_free_post_load_bufs;
602
603 ret = -ENOMEM;
604 image->control_code_page = kimage_alloc_control_pages(image,
605 get_order(KEXEC_CONTROL_PAGE_SIZE));
606 if (!image->control_code_page) {
607 pr_err("Could not allocate control_code_buffer\n");
608 goto out_free_post_load_bufs;
609 }
610
611 if (!kexec_on_panic) {
612 image->swap_page = kimage_alloc_control_pages(image, 0);
613 if (!image->swap_page) {
614 pr_err("Could not allocate swap buffer\n");
615 goto out_free_control_pages;
616 }
617 }
618
619 *rimage = image;
620 return 0;
621out_free_control_pages:
622 kimage_free_page_list(&image->control_pages);
623out_free_post_load_bufs:
624 kimage_file_post_load_cleanup(image);
625out_free_image:
626 kfree(image);
627 return ret;
628}
629#else /* CONFIG_KEXEC_FILE */
630static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
631#endif /* CONFIG_KEXEC_FILE */
632
633static int kimage_is_destination_range(struct kimage *image,
634 unsigned long start,
635 unsigned long end)
636{
637 unsigned long i;
638
639 for (i = 0; i < image->nr_segments; i++) {
640 unsigned long mstart, mend;
641
642 mstart = image->segment[i].mem;
643 mend = mstart + image->segment[i].memsz;
644 if ((end > mstart) && (start < mend))
645 return 1;
646 }
647
648 return 0;
649}
650
651static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
652{
653 struct page *pages;
654
655 pages = alloc_pages(gfp_mask, order);
656 if (pages) {
657 unsigned int count, i;
658 pages->mapping = NULL;
659 set_page_private(pages, order);
660 count = 1 << order;
661 for (i = 0; i < count; i++)
662 SetPageReserved(pages + i);
663 }
664
665 return pages;
666}
667
668static void kimage_free_pages(struct page *page)
669{
670 unsigned int order, count, i;
671
672 order = page_private(page);
673 count = 1 << order;
674 for (i = 0; i < count; i++)
675 ClearPageReserved(page + i);
676 __free_pages(page, order);
677}
678
679static void kimage_free_page_list(struct list_head *list)
680{
681 struct list_head *pos, *next;
682
683 list_for_each_safe(pos, next, list) {
684 struct page *page;
685
686 page = list_entry(pos, struct page, lru);
687 list_del(&page->lru);
688 kimage_free_pages(page);
689 }
690}
691
692static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
693 unsigned int order)
694{
695 /* Control pages are special, they are the intermediaries
696 * that are needed while we copy the rest of the pages
697 * to their final resting place. As such they must
698 * not conflict with either the destination addresses
699 * or memory the kernel is already using.
700 *
701 * The only case where we really need more than one of
702 * these is for architectures where we cannot disable
703 * the MMU and must instead generate an identity mapped
704 * page table for all of the memory.
705 *
706 * At worst this runs in O(N) of the image size.
707 */
708 struct list_head extra_pages;
709 struct page *pages;
710 unsigned int count;
711
712 count = 1 << order;
713 INIT_LIST_HEAD(&extra_pages);
714
715 /* Loop while I can allocate a page and the page allocated
716 * is a destination page.
717 */
718 do {
719 unsigned long pfn, epfn, addr, eaddr;
720
721 pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
722 if (!pages)
723 break;
724 pfn = page_to_pfn(pages);
725 epfn = pfn + count;
726 addr = pfn << PAGE_SHIFT;
727 eaddr = epfn << PAGE_SHIFT;
728 if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
729 kimage_is_destination_range(image, addr, eaddr)) {
730 list_add(&pages->lru, &extra_pages);
731 pages = NULL;
732 }
733 } while (!pages);
734
735 if (pages) {
736 /* Remember the allocated page... */
737 list_add(&pages->lru, &image->control_pages);
738
739 /* Because the page is already in its destination
740 * location we will never allocate another page at
741 * that address. Therefore kimage_alloc_pages
742 * will not return it (again) and we don't need
743 * to give it an entry in image->segment[].
744 */
745 }
746 /* Deal with the destination pages I have inadvertently allocated.
747 *
748 * Ideally I would convert multi-page allocations into single
749 * page allocations, and add everything to image->dest_pages.
750 *
751 * For now it is simpler to just free the pages.
752 */
753 kimage_free_page_list(&extra_pages);
754
755 return pages;
756}
757
758static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
759 unsigned int order)
760{
761 /* Control pages are special, they are the intermediaries
762 * that are needed while we copy the rest of the pages
763 * to their final resting place. As such they must
764 * not conflict with either the destination addresses
765 * or memory the kernel is already using.
766 *
767 * Control pages are also the only pages we must allocate
768 * when loading a crash kernel. All of the other pages
769 * are specified by the segments and we just memcpy
770 * into them directly.
771 *
772 * The only case where we really need more than one of
773 * these is for architectures where we cannot disable
774 * the MMU and must instead generate an identity mapped
775 * page table for all of the memory.
776 *
777 * Given the low demand this implements a very simple
778 * allocator that finds the first hole of the appropriate
779 * size in the reserved memory region, and allocates all
780 * of the memory up to and including the hole.
781 */
782 unsigned long hole_start, hole_end, size;
783 struct page *pages;
784
785 pages = NULL;
786 size = (1 << order) << PAGE_SHIFT;
787 hole_start = (image->control_page + (size - 1)) & ~(size - 1);
788 hole_end = hole_start + size - 1;
789 while (hole_end <= crashk_res.end) {
790 unsigned long i;
791
792 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
793 break;
794 /* See if I overlap any of the segments */
795 for (i = 0; i < image->nr_segments; i++) {
796 unsigned long mstart, mend;
797
798 mstart = image->segment[i].mem;
799 mend = mstart + image->segment[i].memsz - 1;
800 if ((hole_end >= mstart) && (hole_start <= mend)) {
801 /* Advance the hole to the end of the segment */
802 hole_start = (mend + (size - 1)) & ~(size - 1);
803 hole_end = hole_start + size - 1;
804 break;
805 }
806 }
807 /* If I don't overlap any segments I have found my hole! */
808 if (i == image->nr_segments) {
809 pages = pfn_to_page(hole_start >> PAGE_SHIFT);
810 break;
811 }
812 }
813 if (pages)
814 image->control_page = hole_end;
815
816 return pages;
817}
818
819
820struct page *kimage_alloc_control_pages(struct kimage *image,
821 unsigned int order)
822{
823 struct page *pages = NULL;
824
825 switch (image->type) {
826 case KEXEC_TYPE_DEFAULT:
827 pages = kimage_alloc_normal_control_pages(image, order);
828 break;
829 case KEXEC_TYPE_CRASH:
830 pages = kimage_alloc_crash_control_pages(image, order);
831 break;
832 }
833
834 return pages;
835}
836
837static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
838{
839 if (*image->entry != 0)
840 image->entry++;
841
842 if (image->entry == image->last_entry) {
843 kimage_entry_t *ind_page;
844 struct page *page;
845
846 page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
847 if (!page)
848 return -ENOMEM;
849
850 ind_page = page_address(page);
851 *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
852 image->entry = ind_page;
853 image->last_entry = ind_page +
854 ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
855 }
856 *image->entry = entry;
857 image->entry++;
858 *image->entry = 0;
859
860 return 0;
861}
862
863static int kimage_set_destination(struct kimage *image,
864 unsigned long destination)
865{
866 int result;
867
868 destination &= PAGE_MASK;
869 result = kimage_add_entry(image, destination | IND_DESTINATION);
870
871 return result;
872}
873
874
875static int kimage_add_page(struct kimage *image, unsigned long page)
876{
877 int result;
878
879 page &= PAGE_MASK;
880 result = kimage_add_entry(image, page | IND_SOURCE);
881
882 return result;
883}
884
885
886static void kimage_free_extra_pages(struct kimage *image)
887{
888 /* Walk through and free any extra destination pages I may have */
889 kimage_free_page_list(&image->dest_pages);
890
891 /* Walk through and free any unusable pages I have cached */
892 kimage_free_page_list(&image->unusable_pages);
893
894}
895static void kimage_terminate(struct kimage *image)
896{
897 if (*image->entry != 0)
898 image->entry++;
899
900 *image->entry = IND_DONE;
901}
902
903#define for_each_kimage_entry(image, ptr, entry) \
904 for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
905 ptr = (entry & IND_INDIRECTION) ? \
906 phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
907
908static void kimage_free_entry(kimage_entry_t entry)
909{
910 struct page *page;
911
912 page = pfn_to_page(entry >> PAGE_SHIFT);
913 kimage_free_pages(page);
914}
915
916static void kimage_free(struct kimage *image)
917{
918 kimage_entry_t *ptr, entry;
919 kimage_entry_t ind = 0;
920
921 if (!image)
922 return;
923
924 kimage_free_extra_pages(image);
925 for_each_kimage_entry(image, ptr, entry) {
926 if (entry & IND_INDIRECTION) {
927 /* Free the previous indirection page */
928 if (ind & IND_INDIRECTION)
929 kimage_free_entry(ind);
930 /* Save this indirection page until we are
931 * done with it.
932 */
933 ind = entry;
934 } else if (entry & IND_SOURCE)
935 kimage_free_entry(entry);
936 }
937 /* Free the final indirection page */
938 if (ind & IND_INDIRECTION)
939 kimage_free_entry(ind);
940
941 /* Handle any machine specific cleanup */
942 machine_kexec_cleanup(image);
943
944 /* Free the kexec control pages... */
945 kimage_free_page_list(&image->control_pages);
946
947 /*
948 * Free up any temporary buffers allocated. This might hit if
949 * error occurred much later after buffer allocation.
950 */
951 if (image->file_mode)
952 kimage_file_post_load_cleanup(image);
953
954 kfree(image);
955}
956
957static kimage_entry_t *kimage_dst_used(struct kimage *image,
958 unsigned long page)
959{
960 kimage_entry_t *ptr, entry;
961 unsigned long destination = 0;
962
963 for_each_kimage_entry(image, ptr, entry) {
964 if (entry & IND_DESTINATION)
965 destination = entry & PAGE_MASK;
966 else if (entry & IND_SOURCE) {
967 if (page == destination)
968 return ptr;
969 destination += PAGE_SIZE;
970 }
971 }
972
973 return NULL;
974}
975
976static struct page *kimage_alloc_page(struct kimage *image,
977 gfp_t gfp_mask,
978 unsigned long destination)
979{
980 /*
981 * Here we implement safeguards to ensure that a source page
982 * is not copied to its destination page before the data on
983 * the destination page is no longer useful.
984 *
985 * To do this we maintain the invariant that a source page is
986 * either its own destination page, or it is not a
987 * destination page at all.
988 *
989 * That is slightly stronger than required, but the proof
990 * that no problems will occur is trivial, and the
991 * implementation is simple to verify.
992 *
993 * When allocating all pages normally this algorithm will run
994 * in O(N) time, but in the worst case it will run in O(N^2)
995 * time. If the runtime is a problem the data structures can
996 * be fixed.
997 */
998 struct page *page;
999 unsigned long addr;
1000
1001 /*
1002 * Walk through the list of destination pages, and see if I
1003 * have a match.
1004 */
1005 list_for_each_entry(page, &image->dest_pages, lru) {
1006 addr = page_to_pfn(page) << PAGE_SHIFT;
1007 if (addr == destination) {
1008 list_del(&page->lru);
1009 return page;
1010 }
1011 }
1012 page = NULL;
1013 while (1) {
1014 kimage_entry_t *old;
1015
1016 /* Allocate a page, if we run out of memory give up */
1017 page = kimage_alloc_pages(gfp_mask, 0);
1018 if (!page)
1019 return NULL;
1020 /* If the page cannot be used file it away */
1021 if (page_to_pfn(page) >
1022 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
1023 list_add(&page->lru, &image->unusable_pages);
1024 continue;
1025 }
1026 addr = page_to_pfn(page) << PAGE_SHIFT;
1027
1028 /* If it is the destination page we want, use it */
1029 if (addr == destination)
1030 break;
1031
1032 /* If the page is not a destination page use it */
1033 if (!kimage_is_destination_range(image, addr,
1034 addr + PAGE_SIZE))
1035 break;
1036
1037 /*
1038 * I know that the page is someone's destination page.
1039 * See if there is already a source page for this
1040 * destination page. And if so swap the source pages.
1041 */
1042 old = kimage_dst_used(image, addr);
1043 if (old) {
1044 /* If so move it */
1045 unsigned long old_addr;
1046 struct page *old_page;
1047
1048 old_addr = *old & PAGE_MASK;
1049 old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
1050 copy_highpage(page, old_page);
1051 *old = addr | (*old & ~PAGE_MASK);
1052
1053 /* The old page I have found cannot be a
1054 * destination page, so return it if its
1055 * gfp_flags honor the ones passed in.
1056 */
1057 if (!(gfp_mask & __GFP_HIGHMEM) &&
1058 PageHighMem(old_page)) {
1059 kimage_free_pages(old_page);
1060 continue;
1061 }
1062 addr = old_addr;
1063 page = old_page;
1064 break;
1065 } else {
1066 /* Place the page on the destination list; I
1067 * will use it later.
1068 */
1069 list_add(&page->lru, &image->dest_pages);
1070 }
1071 }
1072
1073 return page;
1074}
1075
1076static int kimage_load_normal_segment(struct kimage *image,
1077 struct kexec_segment *segment)
1078{
1079 unsigned long maddr;
1080 size_t ubytes, mbytes;
1081 int result;
1082 unsigned char __user *buf = NULL;
1083 unsigned char *kbuf = NULL;
1084
1085 result = 0;
1086 if (image->file_mode)
1087 kbuf = segment->kbuf;
1088 else
1089 buf = segment->buf;
1090 ubytes = segment->bufsz;
1091 mbytes = segment->memsz;
1092 maddr = segment->mem;
1093
1094 result = kimage_set_destination(image, maddr);
1095 if (result < 0)
1096 goto out;
1097
1098 while (mbytes) {
1099 struct page *page;
1100 char *ptr;
1101 size_t uchunk, mchunk;
1102
1103 page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
1104 if (!page) {
1105 result = -ENOMEM;
1106 goto out;
1107 }
1108 result = kimage_add_page(image, page_to_pfn(page)
1109 << PAGE_SHIFT);
1110 if (result < 0)
1111 goto out;
1112
1113 ptr = kmap(page);
1114 /* Start with a clear page */
1115 clear_page(ptr);
1116 ptr += maddr & ~PAGE_MASK;
1117 mchunk = min_t(size_t, mbytes,
1118 PAGE_SIZE - (maddr & ~PAGE_MASK));
1119 uchunk = min(ubytes, mchunk);
1120
1121 /* For file based kexec, source pages are in kernel memory */
1122 if (image->file_mode)
1123 memcpy(ptr, kbuf, uchunk);
1124 else
1125 result = copy_from_user(ptr, buf, uchunk);
1126 kunmap(page);
1127 if (result) {
1128 result = -EFAULT;
1129 goto out;
1130 }
1131 ubytes -= uchunk;
1132 maddr += mchunk;
1133 if (image->file_mode)
1134 kbuf += mchunk;
1135 else
1136 buf += mchunk;
1137 mbytes -= mchunk;
1138 }
1139out:
1140 return result;
1141}
1142
1143static int kimage_load_crash_segment(struct kimage *image,
1144 struct kexec_segment *segment)
1145{
1146 /* For crash dump kernels we simply copy the data from
1147 * user space to its destination.
1148 * We do things a page at a time for the sake of kmap.
1149 */
1150 unsigned long maddr;
1151 size_t ubytes, mbytes;
1152 int result;
1153 unsigned char __user *buf = NULL;
1154 unsigned char *kbuf = NULL;
1155
1156 result = 0;
1157 if (image->file_mode)
1158 kbuf = segment->kbuf;
1159 else
1160 buf = segment->buf;
1161 ubytes = segment->bufsz;
1162 mbytes = segment->memsz;
1163 maddr = segment->mem;
1164 while (mbytes) {
1165 struct page *page;
1166 char *ptr;
1167 size_t uchunk, mchunk;
1168
1169 page = pfn_to_page(maddr >> PAGE_SHIFT);
1170 if (!page) {
1171 result = -ENOMEM;
1172 goto out;
1173 }
1174 ptr = kmap(page);
1175 ptr += maddr & ~PAGE_MASK;
1176 mchunk = min_t(size_t, mbytes,
1177 PAGE_SIZE - (maddr & ~PAGE_MASK));
1178 uchunk = min(ubytes, mchunk);
1179 if (mchunk > uchunk) {
1180 /* Zero the trailing part of the page */
1181 memset(ptr + uchunk, 0, mchunk - uchunk);
1182 }
1183
1184 /* For file based kexec, source pages are in kernel memory */
1185 if (image->file_mode)
1186 memcpy(ptr, kbuf, uchunk);
1187 else
1188 result = copy_from_user(ptr, buf, uchunk);
1189 kexec_flush_icache_page(page);
1190 kunmap(page);
1191 if (result) {
1192 result = -EFAULT;
1193 goto out;
1194 }
1195 ubytes -= uchunk;
1196 maddr += mchunk;
1197 if (image->file_mode)
1198 kbuf += mchunk;
1199 else
1200 buf += mchunk;
1201 mbytes -= mchunk;
1202 }
1203out:
1204 return result;
1205}
1206
1207static int kimage_load_segment(struct kimage *image,
1208 struct kexec_segment *segment)
1209{
1210 int result = -ENOMEM;
1211
1212 switch (image->type) {
1213 case KEXEC_TYPE_DEFAULT:
1214 result = kimage_load_normal_segment(image, segment);
1215 break;
1216 case KEXEC_TYPE_CRASH:
1217 result = kimage_load_crash_segment(image, segment);
1218 break;
1219 }
1220
1221 return result;
1222}
1223
1224/* 104/*
1225 * Exec Kernel system call: for obvious reasons only root may call it. 105 * Exec Kernel system call: for obvious reasons only root may call it.
1226 * 106 *
@@ -1241,11 +121,6 @@ static int kimage_load_segment(struct kimage *image,
1241 * kexec does not sync, or unmount filesystems so if you need 121 * kexec does not sync, or unmount filesystems so if you need
1242 * that to happen you need to do that yourself. 122 * that to happen you need to do that yourself.
1243 */ 123 */
1244struct kimage *kexec_image;
1245struct kimage *kexec_crash_image;
1246int kexec_load_disabled;
1247
1248static DEFINE_MUTEX(kexec_mutex);
1249 124
1250SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, 125SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
1251 struct kexec_segment __user *, segments, unsigned long, flags) 126 struct kexec_segment __user *, segments, unsigned long, flags)
@@ -1340,18 +215,6 @@ out:
1340 return result; 215 return result;
1341} 216}
1342 217
1343/*
1344 * Add and remove page tables for crashkernel memory
1345 *
1346 * Provide an empty default implementation here -- architecture
1347 * code may override this
1348 */
1349void __weak crash_map_reserved_pages(void)
1350{}
1351
1352void __weak crash_unmap_reserved_pages(void)
1353{}
1354
1355#ifdef CONFIG_COMPAT 218#ifdef CONFIG_COMPAT
1356COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, 219COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
1357 compat_ulong_t, nr_segments, 220 compat_ulong_t, nr_segments,
@@ -1390,1391 +253,3 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
1390 return sys_kexec_load(entry, nr_segments, ksegments, flags); 253 return sys_kexec_load(entry, nr_segments, ksegments, flags);
1391} 254}
1392#endif 255#endif
1393
1394#ifdef CONFIG_KEXEC_FILE
1395SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
1396 unsigned long, cmdline_len, const char __user *, cmdline_ptr,
1397 unsigned long, flags)
1398{
1399 int ret = 0, i;
1400 struct kimage **dest_image, *image;
1401
1402 /* We only trust the superuser with rebooting the system. */
1403 if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
1404 return -EPERM;
1405
1406 /* Make sure we have a legal set of flags */
1407 if (flags != (flags & KEXEC_FILE_FLAGS))
1408 return -EINVAL;
1409
1410 image = NULL;
1411
1412 if (!mutex_trylock(&kexec_mutex))
1413 return -EBUSY;
1414
1415 dest_image = &kexec_image;
1416 if (flags & KEXEC_FILE_ON_CRASH)
1417 dest_image = &kexec_crash_image;
1418
1419 if (flags & KEXEC_FILE_UNLOAD)
1420 goto exchange;
1421
1422 /*
1423 * In case of crash, new kernel gets loaded in reserved region. It is
1424 * same memory where old crash kernel might be loaded. Free any
1425 * current crash dump kernel before we corrupt it.
1426 */
1427 if (flags & KEXEC_FILE_ON_CRASH)
1428 kimage_free(xchg(&kexec_crash_image, NULL));
1429
1430 ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
1431 cmdline_len, flags);
1432 if (ret)
1433 goto out;
1434
1435 ret = machine_kexec_prepare(image);
1436 if (ret)
1437 goto out;
1438
1439 ret = kexec_calculate_store_digests(image);
1440 if (ret)
1441 goto out;
1442
1443 for (i = 0; i < image->nr_segments; i++) {
1444 struct kexec_segment *ksegment;
1445
1446 ksegment = &image->segment[i];
1447 pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
1448 i, ksegment->buf, ksegment->bufsz, ksegment->mem,
1449 ksegment->memsz);
1450
1451 ret = kimage_load_segment(image, &image->segment[i]);
1452 if (ret)
1453 goto out;
1454 }
1455
1456 kimage_terminate(image);
1457
1458 /*
1459 * Free up any temporary buffers allocated which are not needed
1460 * after image has been loaded
1461 */
1462 kimage_file_post_load_cleanup(image);
1463exchange:
1464 image = xchg(dest_image, image);
1465out:
1466 mutex_unlock(&kexec_mutex);
1467 kimage_free(image);
1468 return ret;
1469}
1470
1471#endif /* CONFIG_KEXEC_FILE */
1472
1473void crash_kexec(struct pt_regs *regs)
1474{
1475 /* Take the kexec_mutex here to prevent sys_kexec_load
1476 * running on one cpu from replacing the crash kernel
1477 * we are using after a panic on a different cpu.
1478 *
1479 * If the crash kernel was not located in a fixed area
1480 * of memory the xchg(&kexec_crash_image) would be
1481 * sufficient. But since I reuse the memory...
1482 */
1483 if (mutex_trylock(&kexec_mutex)) {
1484 if (kexec_crash_image) {
1485 struct pt_regs fixed_regs;
1486
1487 crash_setup_regs(&fixed_regs, regs);
1488 crash_save_vmcoreinfo();
1489 machine_crash_shutdown(&fixed_regs);
1490 machine_kexec(kexec_crash_image);
1491 }
1492 mutex_unlock(&kexec_mutex);
1493 }
1494}
1495
1496size_t crash_get_memory_size(void)
1497{
1498 size_t size = 0;
1499 mutex_lock(&kexec_mutex);
1500 if (crashk_res.end != crashk_res.start)
1501 size = resource_size(&crashk_res);
1502 mutex_unlock(&kexec_mutex);
1503 return size;
1504}
1505
1506void __weak crash_free_reserved_phys_range(unsigned long begin,
1507 unsigned long end)
1508{
1509 unsigned long addr;
1510
1511 for (addr = begin; addr < end; addr += PAGE_SIZE)
1512 free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
1513}
1514
1515int crash_shrink_memory(unsigned long new_size)
1516{
1517 int ret = 0;
1518 unsigned long start, end;
1519 unsigned long old_size;
1520 struct resource *ram_res;
1521
1522 mutex_lock(&kexec_mutex);
1523
1524 if (kexec_crash_image) {
1525 ret = -ENOENT;
1526 goto unlock;
1527 }
1528 start = crashk_res.start;
1529 end = crashk_res.end;
1530 old_size = (end == 0) ? 0 : end - start + 1;
1531 if (new_size >= old_size) {
1532 ret = (new_size == old_size) ? 0 : -EINVAL;
1533 goto unlock;
1534 }
1535
1536 ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
1537 if (!ram_res) {
1538 ret = -ENOMEM;
1539 goto unlock;
1540 }
1541
1542 start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
1543 end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
1544
1545 crash_map_reserved_pages();
1546 crash_free_reserved_phys_range(end, crashk_res.end);
1547
1548 if ((start == end) && (crashk_res.parent != NULL))
1549 release_resource(&crashk_res);
1550
1551 ram_res->start = end;
1552 ram_res->end = crashk_res.end;
1553 ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
1554 ram_res->name = "System RAM";
1555
1556 crashk_res.end = end - 1;
1557
1558 insert_resource(&iomem_resource, ram_res);
1559 crash_unmap_reserved_pages();
1560
1561unlock:
1562 mutex_unlock(&kexec_mutex);
1563 return ret;
1564}
1565
1566static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
1567 size_t data_len)
1568{
1569 struct elf_note note;
1570
1571 note.n_namesz = strlen(name) + 1;
1572 note.n_descsz = data_len;
1573 note.n_type = type;
1574 memcpy(buf, &note, sizeof(note));
1575 buf += (sizeof(note) + 3)/4;
1576 memcpy(buf, name, note.n_namesz);
1577 buf += (note.n_namesz + 3)/4;
1578 memcpy(buf, data, note.n_descsz);
1579 buf += (note.n_descsz + 3)/4;
1580
1581 return buf;
1582}
1583
1584static void final_note(u32 *buf)
1585{
1586 struct elf_note note;
1587
1588 note.n_namesz = 0;
1589 note.n_descsz = 0;
1590 note.n_type = 0;
1591 memcpy(buf, &note, sizeof(note));
1592}
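
append_elf_note() lays each note down as a fixed header followed by the name and the descriptor, each padded to a 4-byte boundary; the (len + 3)/4 arithmetic is that padding expressed in u32 words. Below is a minimal user-space sketch of the same layout rule, using Elf64_Nhdr from <elf.h>; the buffer size and payload are illustrative, not taken from the kernel. final_note() then terminates the sequence with an all-zero header.

#include <elf.h>
#include <stdio.h>
#include <string.h>

/* Round a byte length up to a whole number of u32 words. */
static size_t words(size_t len) { return (len + 3) / 4; }

static unsigned int *append_note(unsigned int *buf, const char *name,
				 unsigned int type, const void *data,
				 size_t data_len)
{
	Elf64_Nhdr hdr = {
		.n_namesz = strlen(name) + 1,
		.n_descsz = data_len,
		.n_type = type,
	};

	memcpy(buf, &hdr, sizeof(hdr));
	buf += words(sizeof(hdr));
	memcpy(buf, name, hdr.n_namesz);
	buf += words(hdr.n_namesz);
	memcpy(buf, data, data_len);
	buf += words(data_len);
	return buf;
}

int main(void)
{
	unsigned int buf[64] = { 0 };
	unsigned int *end;
	char payload[] = "example payload";

	end = append_note(buf, "CORE", 1 /* NT_PRSTATUS */, payload,
			  sizeof(payload));
	printf("note occupies %zu words\n", (size_t)(end - buf));
	return 0;
}
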
1593
1594void crash_save_cpu(struct pt_regs *regs, int cpu)
1595{
1596 struct elf_prstatus prstatus;
1597 u32 *buf;
1598
1599 if ((cpu < 0) || (cpu >= nr_cpu_ids))
1600 return;
1601
1602 /* Using ELF notes here is opportunistic.
1603 * I need a well defined structure format
1604 * for the data I pass, and I need tags
1605 * on the data to indicate what information I have
1606 * squirrelled away. ELF notes happen to provide
1607 * all of that, so there is no need to invent something new.
1608 */
1609 buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
1610 if (!buf)
1611 return;
1612 memset(&prstatus, 0, sizeof(prstatus));
1613 prstatus.pr_pid = current->pid;
1614 elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
1615 buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
1616 &prstatus, sizeof(prstatus));
1617 final_note(buf);
1618}
1619
1620static int __init crash_notes_memory_init(void)
1621{
1622 /* Allocate memory for saving cpu registers. */
1623 crash_notes = alloc_percpu(note_buf_t);
1624 if (!crash_notes) {
1625 pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
1626 return -ENOMEM;
1627 }
1628 return 0;
1629}
1630subsys_initcall(crash_notes_memory_init);
1631
1632
1633/*
1634 * parsing the "crashkernel" commandline
1635 *
1636 * this code is intended to be called from architecture specific code
1637 */
1638
1639
1640/*
1641 * This function parses command lines in the format
1642 *
1643 * crashkernel=ramsize-range:size[,...][@offset]
1644 *
1645 * The function returns 0 on success and -EINVAL on failure.
1646 */
1647static int __init parse_crashkernel_mem(char *cmdline,
1648 unsigned long long system_ram,
1649 unsigned long long *crash_size,
1650 unsigned long long *crash_base)
1651{
1652 char *cur = cmdline, *tmp;
1653
1654 /* for each entry of the comma-separated list */
1655 do {
1656 unsigned long long start, end = ULLONG_MAX, size;
1657
1658 /* get the start of the range */
1659 start = memparse(cur, &tmp);
1660 if (cur == tmp) {
1661 pr_warn("crashkernel: Memory value expected\n");
1662 return -EINVAL;
1663 }
1664 cur = tmp;
1665 if (*cur != '-') {
1666 pr_warn("crashkernel: '-' expected\n");
1667 return -EINVAL;
1668 }
1669 cur++;
1670
1671		/* if no ':' is here, then we read the end */
1672 if (*cur != ':') {
1673 end = memparse(cur, &tmp);
1674 if (cur == tmp) {
1675 pr_warn("crashkernel: Memory value expected\n");
1676 return -EINVAL;
1677 }
1678 cur = tmp;
1679 if (end <= start) {
1680 pr_warn("crashkernel: end <= start\n");
1681 return -EINVAL;
1682 }
1683 }
1684
1685 if (*cur != ':') {
1686 pr_warn("crashkernel: ':' expected\n");
1687 return -EINVAL;
1688 }
1689 cur++;
1690
1691 size = memparse(cur, &tmp);
1692 if (cur == tmp) {
1693 pr_warn("Memory value expected\n");
1694 return -EINVAL;
1695 }
1696 cur = tmp;
1697 if (size >= system_ram) {
1698 pr_warn("crashkernel: invalid size\n");
1699 return -EINVAL;
1700 }
1701
1702 /* match ? */
1703 if (system_ram >= start && system_ram < end) {
1704 *crash_size = size;
1705 break;
1706 }
1707 } while (*cur++ == ',');
1708
1709 if (*crash_size > 0) {
1710 while (*cur && *cur != ' ' && *cur != '@')
1711 cur++;
1712 if (*cur == '@') {
1713 cur++;
1714 *crash_base = memparse(cur, &tmp);
1715 if (cur == tmp) {
1716 pr_warn("Memory value expected after '@'\n");
1717 return -EINVAL;
1718 }
1719 }
1720 }
1721
1722 return 0;
1723}
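
The extended syntax lets the reservation scale with installed memory: with a hypothetical crashkernel=512M-2G:64M,2G-:128M, a 1 GiB machine reserves 64 MiB while an 8 GiB machine reserves 128 MiB. A self-contained sketch of the range-matching rule applied above; the table values are illustrative, not taken from any real command line.

#include <stdio.h>

#define MIB (1024ULL * 1024)
#define GIB (1024 * MIB)

struct range_rule {
	unsigned long long start, end;	/* [start, end) of system RAM */
	unsigned long long size;	/* reservation for that range */
};

/* Mirrors the "first matching range wins" rule of parse_crashkernel_mem(). */
static unsigned long long pick_crash_size(unsigned long long system_ram,
					  const struct range_rule *rules,
					  int n)
{
	for (int i = 0; i < n; i++)
		if (system_ram >= rules[i].start && system_ram < rules[i].end)
			return rules[i].size;
	return 0;
}

int main(void)
{
	/* crashkernel=512M-2G:64M,2G-:128M */
	struct range_rule rules[] = {
		{ 512 * MIB, 2 * GIB, 64 * MIB },
		{ 2 * GIB, ~0ULL, 128 * MIB },
	};

	printf("1G RAM -> %llu MiB\n", pick_crash_size(1 * GIB, rules, 2) / MIB);
	printf("8G RAM -> %llu MiB\n", pick_crash_size(8 * GIB, rules, 2) / MIB);
	return 0;
}
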
1724
1725/*
1726 * This function parses "simple" (old) crashkernel command lines like
1727 *
1728 * crashkernel=size[@offset]
1729 *
1730 * It returns 0 on success and -EINVAL on failure.
1731 */
1732static int __init parse_crashkernel_simple(char *cmdline,
1733 unsigned long long *crash_size,
1734 unsigned long long *crash_base)
1735{
1736 char *cur = cmdline;
1737
1738 *crash_size = memparse(cmdline, &cur);
1739 if (cmdline == cur) {
1740 pr_warn("crashkernel: memory value expected\n");
1741 return -EINVAL;
1742 }
1743
1744 if (*cur == '@')
1745 *crash_base = memparse(cur+1, &cur);
1746 else if (*cur != ' ' && *cur != '\0') {
1747 pr_warn("crashkernel: unrecognized char\n");
1748 return -EINVAL;
1749 }
1750
1751 return 0;
1752}
1753
1754#define SUFFIX_HIGH 0
1755#define SUFFIX_LOW 1
1756#define SUFFIX_NULL 2
1757static __initdata char *suffix_tbl[] = {
1758 [SUFFIX_HIGH] = ",high",
1759 [SUFFIX_LOW] = ",low",
1760 [SUFFIX_NULL] = NULL,
1761};
1762
1763/*
1764 * This function parses "suffix" crashkernel command lines like
1765 *
1766 * crashkernel=size,[high|low]
1767 *
1768 * It returns 0 on success and -EINVAL on failure.
1769 */
1770static int __init parse_crashkernel_suffix(char *cmdline,
1771 unsigned long long *crash_size,
1772 const char *suffix)
1773{
1774 char *cur = cmdline;
1775
1776 *crash_size = memparse(cmdline, &cur);
1777 if (cmdline == cur) {
1778 pr_warn("crashkernel: memory value expected\n");
1779 return -EINVAL;
1780 }
1781
1782 /* check with suffix */
1783 if (strncmp(cur, suffix, strlen(suffix))) {
1784 pr_warn("crashkernel: unrecognized char\n");
1785 return -EINVAL;
1786 }
1787 cur += strlen(suffix);
1788 if (*cur != ' ' && *cur != '\0') {
1789 pr_warn("crashkernel: unrecognized char\n");
1790 return -EINVAL;
1791 }
1792
1793 return 0;
1794}
1795
1796static __init char *get_last_crashkernel(char *cmdline,
1797 const char *name,
1798 const char *suffix)
1799{
1800 char *p = cmdline, *ck_cmdline = NULL;
1801
1802 /* find crashkernel and use the last one if there are more */
1803 p = strstr(p, name);
1804 while (p) {
1805 char *end_p = strchr(p, ' ');
1806 char *q;
1807
1808 if (!end_p)
1809 end_p = p + strlen(p);
1810
1811 if (!suffix) {
1812 int i;
1813
1814 /* skip the one with any known suffix */
1815 for (i = 0; suffix_tbl[i]; i++) {
1816 q = end_p - strlen(suffix_tbl[i]);
1817 if (!strncmp(q, suffix_tbl[i],
1818 strlen(suffix_tbl[i])))
1819 goto next;
1820 }
1821 ck_cmdline = p;
1822 } else {
1823 q = end_p - strlen(suffix);
1824 if (!strncmp(q, suffix, strlen(suffix)))
1825 ck_cmdline = p;
1826 }
1827next:
1828 p = strstr(p+1, name);
1829 }
1830
1831 if (!ck_cmdline)
1832 return NULL;
1833
1834 return ck_cmdline;
1835}
1836
1837static int __init __parse_crashkernel(char *cmdline,
1838 unsigned long long system_ram,
1839 unsigned long long *crash_size,
1840 unsigned long long *crash_base,
1841 const char *name,
1842 const char *suffix)
1843{
1844 char *first_colon, *first_space;
1845 char *ck_cmdline;
1846
1847 BUG_ON(!crash_size || !crash_base);
1848 *crash_size = 0;
1849 *crash_base = 0;
1850
1851 ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
1852
1853 if (!ck_cmdline)
1854 return -EINVAL;
1855
1856 ck_cmdline += strlen(name);
1857
1858 if (suffix)
1859 return parse_crashkernel_suffix(ck_cmdline, crash_size,
1860 suffix);
1861 /*
1862 * if the commandline contains a ':', then that's the extended
1863 * syntax -- if not, it must be the classic syntax
1864 */
1865 first_colon = strchr(ck_cmdline, ':');
1866 first_space = strchr(ck_cmdline, ' ');
1867 if (first_colon && (!first_space || first_colon < first_space))
1868 return parse_crashkernel_mem(ck_cmdline, system_ram,
1869 crash_size, crash_base);
1870
1871 return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
1872}
1873
1874/*
1875 * This function is the entry point for command line parsing and should be
1876 * called from the arch-specific code.
1877 */
1878int __init parse_crashkernel(char *cmdline,
1879 unsigned long long system_ram,
1880 unsigned long long *crash_size,
1881 unsigned long long *crash_base)
1882{
1883 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1884 "crashkernel=", NULL);
1885}
1886
1887int __init parse_crashkernel_high(char *cmdline,
1888 unsigned long long system_ram,
1889 unsigned long long *crash_size,
1890 unsigned long long *crash_base)
1891{
1892 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1893 "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
1894}
1895
1896int __init parse_crashkernel_low(char *cmdline,
1897 unsigned long long system_ram,
1898 unsigned long long *crash_size,
1899 unsigned long long *crash_base)
1900{
1901 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1902 "crashkernel=", suffix_tbl[SUFFIX_LOW]);
1903}
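
For orientation, a hedged sketch of how architecture setup code might consume these entry points: pass the boot command line and the system RAM size, then reserve whatever the parser returns. reserve_region() below is a stand-in, not a real kernel API, and the crash_base handling is simplified.

/* Illustrative only -- reserve_region() is a placeholder for the
 * architecture's own memory reservation mechanism. */
static void __init example_reserve_crashkernel(char *boot_cmdline,
					       unsigned long long system_ram)
{
	unsigned long long crash_size, crash_base;
	int ret;

	ret = parse_crashkernel(boot_cmdline, system_ram,
				&crash_size, &crash_base);
	if (ret || !crash_size)
		return;		/* nothing requested on the command line */

	/* crash_base == 0 means no fixed offset was requested. */
	reserve_region(crash_base, crash_size);
}
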
1904
1905static void update_vmcoreinfo_note(void)
1906{
1907 u32 *buf = vmcoreinfo_note;
1908
1909 if (!vmcoreinfo_size)
1910 return;
1911 buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
1912 vmcoreinfo_size);
1913 final_note(buf);
1914}
1915
1916void crash_save_vmcoreinfo(void)
1917{
1918 vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
1919 update_vmcoreinfo_note();
1920}
1921
1922void vmcoreinfo_append_str(const char *fmt, ...)
1923{
1924 va_list args;
1925 char buf[0x50];
1926 size_t r;
1927
1928 va_start(args, fmt);
1929 r = vscnprintf(buf, sizeof(buf), fmt, args);
1930 va_end(args);
1931
1932 r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
1933
1934 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
1935
1936 vmcoreinfo_size += r;
1937}
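
vmcoreinfo_append_str() formats into a small stack buffer and then clamps the copy length against the space left in vmcoreinfo_data, so later appends are silently truncated rather than overflowing. A user-space sketch of the same clamping pattern; buffer sizes and names are illustrative, and where the kernel's vscnprintf already returns the truncated length, the sketch has to clamp by hand.

#include <stdarg.h>
#include <stdio.h>
#include <string.h>

static char info_data[128];
static size_t info_size;

static void info_append(const char *fmt, ...)
{
	char buf[0x50];
	va_list args;
	size_t r, room;

	va_start(args, fmt);
	r = vsnprintf(buf, sizeof(buf), fmt, args);
	va_end(args);

	if (r >= sizeof(buf))		/* vsnprintf reports the untruncated length */
		r = sizeof(buf) - 1;

	room = sizeof(info_data) - info_size;
	if (r > room)			/* clamp, as the kernel helper does */
		r = room;

	memcpy(info_data + info_size, buf, r);
	info_size += r;
}

int main(void)
{
	info_append("CRASHTIME=%ld\n", 1444666167L);
	info_append("OSRELEASE=%s\n", "4.3.0-rc4");
	printf("%zu bytes:\n%.*s", info_size, (int)info_size, info_data);
	return 0;
}
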
1938
1939/*
1940 * provide an empty default implementation here -- architecture
1941 * code may override this
1942 */
1943void __weak arch_crash_save_vmcoreinfo(void)
1944{}
1945
1946unsigned long __weak paddr_vmcoreinfo_note(void)
1947{
1948 return __pa((unsigned long)(char *)&vmcoreinfo_note);
1949}
1950
1951static int __init crash_save_vmcoreinfo_init(void)
1952{
1953 VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
1954 VMCOREINFO_PAGESIZE(PAGE_SIZE);
1955
1956 VMCOREINFO_SYMBOL(init_uts_ns);
1957 VMCOREINFO_SYMBOL(node_online_map);
1958#ifdef CONFIG_MMU
1959 VMCOREINFO_SYMBOL(swapper_pg_dir);
1960#endif
1961 VMCOREINFO_SYMBOL(_stext);
1962 VMCOREINFO_SYMBOL(vmap_area_list);
1963
1964#ifndef CONFIG_NEED_MULTIPLE_NODES
1965 VMCOREINFO_SYMBOL(mem_map);
1966 VMCOREINFO_SYMBOL(contig_page_data);
1967#endif
1968#ifdef CONFIG_SPARSEMEM
1969 VMCOREINFO_SYMBOL(mem_section);
1970 VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
1971 VMCOREINFO_STRUCT_SIZE(mem_section);
1972 VMCOREINFO_OFFSET(mem_section, section_mem_map);
1973#endif
1974 VMCOREINFO_STRUCT_SIZE(page);
1975 VMCOREINFO_STRUCT_SIZE(pglist_data);
1976 VMCOREINFO_STRUCT_SIZE(zone);
1977 VMCOREINFO_STRUCT_SIZE(free_area);
1978 VMCOREINFO_STRUCT_SIZE(list_head);
1979 VMCOREINFO_SIZE(nodemask_t);
1980 VMCOREINFO_OFFSET(page, flags);
1981 VMCOREINFO_OFFSET(page, _count);
1982 VMCOREINFO_OFFSET(page, mapping);
1983 VMCOREINFO_OFFSET(page, lru);
1984 VMCOREINFO_OFFSET(page, _mapcount);
1985 VMCOREINFO_OFFSET(page, private);
1986 VMCOREINFO_OFFSET(pglist_data, node_zones);
1987 VMCOREINFO_OFFSET(pglist_data, nr_zones);
1988#ifdef CONFIG_FLAT_NODE_MEM_MAP
1989 VMCOREINFO_OFFSET(pglist_data, node_mem_map);
1990#endif
1991 VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
1992 VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
1993 VMCOREINFO_OFFSET(pglist_data, node_id);
1994 VMCOREINFO_OFFSET(zone, free_area);
1995 VMCOREINFO_OFFSET(zone, vm_stat);
1996 VMCOREINFO_OFFSET(zone, spanned_pages);
1997 VMCOREINFO_OFFSET(free_area, free_list);
1998 VMCOREINFO_OFFSET(list_head, next);
1999 VMCOREINFO_OFFSET(list_head, prev);
2000 VMCOREINFO_OFFSET(vmap_area, va_start);
2001 VMCOREINFO_OFFSET(vmap_area, list);
2002 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
2003 log_buf_kexec_setup();
2004 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
2005 VMCOREINFO_NUMBER(NR_FREE_PAGES);
2006 VMCOREINFO_NUMBER(PG_lru);
2007 VMCOREINFO_NUMBER(PG_private);
2008 VMCOREINFO_NUMBER(PG_swapcache);
2009 VMCOREINFO_NUMBER(PG_slab);
2010#ifdef CONFIG_MEMORY_FAILURE
2011 VMCOREINFO_NUMBER(PG_hwpoison);
2012#endif
2013 VMCOREINFO_NUMBER(PG_head_mask);
2014 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
2015#ifdef CONFIG_HUGETLBFS
2016 VMCOREINFO_SYMBOL(free_huge_page);
2017#endif
2018
2019 arch_crash_save_vmcoreinfo();
2020 update_vmcoreinfo_note();
2021
2022 return 0;
2023}
2024
2025subsys_initcall(crash_save_vmcoreinfo_init);
2026
2027#ifdef CONFIG_KEXEC_FILE
2028static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
2029 struct kexec_buf *kbuf)
2030{
2031 struct kimage *image = kbuf->image;
2032 unsigned long temp_start, temp_end;
2033
2034 temp_end = min(end, kbuf->buf_max);
2035 temp_start = temp_end - kbuf->memsz;
2036
2037 do {
2038 /* align down start */
2039 temp_start = temp_start & (~(kbuf->buf_align - 1));
2040
2041 if (temp_start < start || temp_start < kbuf->buf_min)
2042 return 0;
2043
2044 temp_end = temp_start + kbuf->memsz - 1;
2045
2046 /*
2047		 * Make sure this does not conflict with any existing
2048 * segments
2049 */
2050 if (kimage_is_destination_range(image, temp_start, temp_end)) {
2051 temp_start = temp_start - PAGE_SIZE;
2052 continue;
2053 }
2054
2055 /* We found a suitable memory range */
2056 break;
2057 } while (1);
2058
2059 /* If we are here, we found a suitable memory range */
2060 kbuf->mem = temp_start;
2061
2062 /* Success, stop navigating through remaining System RAM ranges */
2063 return 1;
2064}
2065
2066static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
2067 struct kexec_buf *kbuf)
2068{
2069 struct kimage *image = kbuf->image;
2070 unsigned long temp_start, temp_end;
2071
2072 temp_start = max(start, kbuf->buf_min);
2073
2074 do {
2075 temp_start = ALIGN(temp_start, kbuf->buf_align);
2076 temp_end = temp_start + kbuf->memsz - 1;
2077
2078 if (temp_end > end || temp_end > kbuf->buf_max)
2079 return 0;
2080 /*
2081		 * Make sure this does not conflict with any existing
2082 * segments
2083 */
2084 if (kimage_is_destination_range(image, temp_start, temp_end)) {
2085 temp_start = temp_start + PAGE_SIZE;
2086 continue;
2087 }
2088
2089 /* We found a suitable memory range */
2090 break;
2091 } while (1);
2092
2093 /* If we are here, we found a suitable memory range */
2094 kbuf->mem = temp_start;
2095
2096 /* Success, stop navigating through remaining System RAM ranges */
2097 return 1;
2098}
2099
2100static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
2101{
2102 struct kexec_buf *kbuf = (struct kexec_buf *)arg;
2103 unsigned long sz = end - start + 1;
2104
2105	/* Returning 0 will take us to the next memory range */
2106 if (sz < kbuf->memsz)
2107 return 0;
2108
2109 if (end < kbuf->buf_min || start > kbuf->buf_max)
2110 return 0;
2111
2112 /*
2113	 * Allocate memory top down within the RAM range; otherwise
2114	 * allocate bottom up.
2115 */
2116 if (kbuf->top_down)
2117 return locate_mem_hole_top_down(start, end, kbuf);
2118 return locate_mem_hole_bottom_up(start, end, kbuf);
2119}
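
Both hole walkers reduce to two alignment idioms: the top-down path aligns a candidate start downwards with a power-of-two mask, the bottom-up path aligns upwards with ALIGN(), and each retries a page further on a conflict. A small sketch of just those two steps, assuming the alignment is a power of two; the addresses are illustrative.

#include <stdio.h>

/* Align down / up to a power-of-two boundary, as the hole walkers do. */
static unsigned long align_down(unsigned long x, unsigned long a)
{
	return x & ~(a - 1);
}

static unsigned long align_up(unsigned long x, unsigned long a)
{
	return (x + a - 1) & ~(a - 1);
}

int main(void)
{
	unsigned long align = 0x200000;	/* e.g. a 2 MiB buf_align */

	/* Top-down: candidate end minus memsz, then aligned down. */
	printf("top-down start:  %#lx\n", align_down(0x7fe54321, align));

	/* Bottom-up: candidate start aligned up. */
	printf("bottom-up start: %#lx\n", align_up(0x10000123, align));
	return 0;
}
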
2120
2121/*
2122 * Helper function for placing a buffer in a kexec segment. This assumes
2123 * that kexec_mutex is held.
2124 */
2125int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
2126 unsigned long memsz, unsigned long buf_align,
2127 unsigned long buf_min, unsigned long buf_max,
2128 bool top_down, unsigned long *load_addr)
2129{
2130
2131 struct kexec_segment *ksegment;
2132 struct kexec_buf buf, *kbuf;
2133 int ret;
2134
2135 /* Currently adding segment this way is allowed only in file mode */
2136 if (!image->file_mode)
2137 return -EINVAL;
2138
2139 if (image->nr_segments >= KEXEC_SEGMENT_MAX)
2140 return -EINVAL;
2141
2142 /*
2143	 * Make sure we are not trying to add a buffer after allocating
2144	 * control pages. All segments need to be placed before any
2145	 * control pages are allocated, as the control page allocation
2146	 * logic goes through the list of segments to make sure there
2147	 * are no destination overlaps.
2148 */
2149 if (!list_empty(&image->control_pages)) {
2150 WARN_ON(1);
2151 return -EINVAL;
2152 }
2153
2154 memset(&buf, 0, sizeof(struct kexec_buf));
2155 kbuf = &buf;
2156 kbuf->image = image;
2157 kbuf->buffer = buffer;
2158 kbuf->bufsz = bufsz;
2159
2160 kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
2161 kbuf->buf_align = max(buf_align, PAGE_SIZE);
2162 kbuf->buf_min = buf_min;
2163 kbuf->buf_max = buf_max;
2164 kbuf->top_down = top_down;
2165
2166 /* Walk the RAM ranges and allocate a suitable range for the buffer */
2167 if (image->type == KEXEC_TYPE_CRASH)
2168 ret = walk_iomem_res("Crash kernel",
2169 IORESOURCE_MEM | IORESOURCE_BUSY,
2170 crashk_res.start, crashk_res.end, kbuf,
2171 locate_mem_hole_callback);
2172 else
2173 ret = walk_system_ram_res(0, -1, kbuf,
2174 locate_mem_hole_callback);
2175 if (ret != 1) {
2176 /* A suitable memory range could not be found for buffer */
2177 return -EADDRNOTAVAIL;
2178 }
2179
2180 /* Found a suitable memory range */
2181 ksegment = &image->segment[image->nr_segments];
2182 ksegment->kbuf = kbuf->buffer;
2183 ksegment->bufsz = kbuf->bufsz;
2184 ksegment->mem = kbuf->mem;
2185 ksegment->memsz = kbuf->memsz;
2186 image->nr_segments++;
2187 *load_addr = ksegment->mem;
2188 return 0;
2189}
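
A hedged sketch of how a kexec_file image loader might call kexec_add_buffer() to place a kernel blob: the buffer lives in kernel memory and the helper returns the chosen physical load address. The 4 GiB limit and the function name here are illustrative, not taken from any particular architecture loader.

/* Illustrative only: place kernel_buf somewhere below 4 GiB,
 * page aligned, searching top-down. */
static int example_place_kernel(struct kimage *image,
				char *kernel_buf, unsigned long kernel_len)
{
	unsigned long kernel_load_addr;
	int ret;

	ret = kexec_add_buffer(image, kernel_buf, kernel_len, kernel_len,
			       PAGE_SIZE,		/* buf_align */
			       0,			/* buf_min */
			       0xffffffffUL,		/* buf_max (example) */
			       true,			/* top_down */
			       &kernel_load_addr);
	if (ret)
		return ret;

	pr_debug("kernel loaded at 0x%lx\n", kernel_load_addr);
	return 0;
}
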
2190
2191/* Calculate and store the digest of segments */
2192static int kexec_calculate_store_digests(struct kimage *image)
2193{
2194 struct crypto_shash *tfm;
2195 struct shash_desc *desc;
2196 int ret = 0, i, j, zero_buf_sz, sha_region_sz;
2197 size_t desc_size, nullsz;
2198 char *digest;
2199 void *zero_buf;
2200 struct kexec_sha_region *sha_regions;
2201 struct purgatory_info *pi = &image->purgatory_info;
2202
2203 zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
2204 zero_buf_sz = PAGE_SIZE;
2205
2206 tfm = crypto_alloc_shash("sha256", 0, 0);
2207 if (IS_ERR(tfm)) {
2208 ret = PTR_ERR(tfm);
2209 goto out;
2210 }
2211
2212 desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
2213 desc = kzalloc(desc_size, GFP_KERNEL);
2214 if (!desc) {
2215 ret = -ENOMEM;
2216 goto out_free_tfm;
2217 }
2218
2219 sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
2220 sha_regions = vzalloc(sha_region_sz);
2221 if (!sha_regions)
2222 goto out_free_desc;
2223
2224 desc->tfm = tfm;
2225 desc->flags = 0;
2226
2227 ret = crypto_shash_init(desc);
2228 if (ret < 0)
2229 goto out_free_sha_regions;
2230
2231 digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
2232 if (!digest) {
2233 ret = -ENOMEM;
2234 goto out_free_sha_regions;
2235 }
2236
2237 for (j = i = 0; i < image->nr_segments; i++) {
2238 struct kexec_segment *ksegment;
2239
2240 ksegment = &image->segment[i];
2241 /*
2242 * Skip purgatory as it will be modified once we put digest
2243 * info in purgatory.
2244 */
2245 if (ksegment->kbuf == pi->purgatory_buf)
2246 continue;
2247
2248 ret = crypto_shash_update(desc, ksegment->kbuf,
2249 ksegment->bufsz);
2250 if (ret)
2251 break;
2252
2253 /*
2254 * Assume rest of the buffer is filled with zero and
2255 * update digest accordingly.
2256 */
2257 nullsz = ksegment->memsz - ksegment->bufsz;
2258 while (nullsz) {
2259 unsigned long bytes = nullsz;
2260
2261 if (bytes > zero_buf_sz)
2262 bytes = zero_buf_sz;
2263 ret = crypto_shash_update(desc, zero_buf, bytes);
2264 if (ret)
2265 break;
2266 nullsz -= bytes;
2267 }
2268
2269 if (ret)
2270 break;
2271
2272 sha_regions[j].start = ksegment->mem;
2273 sha_regions[j].len = ksegment->memsz;
2274 j++;
2275 }
2276
2277 if (!ret) {
2278 ret = crypto_shash_final(desc, digest);
2279 if (ret)
2280 goto out_free_digest;
2281 ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
2282 sha_regions, sha_region_sz, 0);
2283 if (ret)
2284 goto out_free_digest;
2285
2286 ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
2287 digest, SHA256_DIGEST_SIZE, 0);
2288 if (ret)
2289 goto out_free_digest;
2290 }
2291
2292out_free_digest:
2293 kfree(digest);
2294out_free_sha_regions:
2295 vfree(sha_regions);
2296out_free_desc:
2297 kfree(desc);
2298out_free_tfm:
2299 kfree(tfm);
2300out:
2301 return ret;
2302}
2303
2304/* Actually load purgatory. Lot of code taken from kexec-tools */
2305static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
2306 unsigned long max, int top_down)
2307{
2308 struct purgatory_info *pi = &image->purgatory_info;
2309 unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
2310 unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
2311 unsigned char *buf_addr, *src;
2312 int i, ret = 0, entry_sidx = -1;
2313 const Elf_Shdr *sechdrs_c;
2314 Elf_Shdr *sechdrs = NULL;
2315 void *purgatory_buf = NULL;
2316
2317 /*
2318	 * sechdrs_c points to the section headers in purgatory and is
2319	 * read only. No modifications allowed.
2320 */
2321 sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
2322
2323 /*
2324 * We can not modify sechdrs_c[] and its fields. It is read only.
2325 * Copy it over to a local copy where one can store some temporary
2326 * data and free it at the end. We need to modify ->sh_addr and
2327 * ->sh_offset fields to keep track of permanent and temporary
2328 * locations of sections.
2329 */
2330 sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
2331 if (!sechdrs)
2332 return -ENOMEM;
2333
2334 memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
2335
2336 /*
2337	 * There are multiple copies of sections. The first copy is the one
2338	 * embedded in the kernel in a read only section. Some of these
2339	 * sections will be copied to a temporary buffer and relocated, and
2340	 * those sections will finally be copied to their final destination
2341	 * at segment load time.
2342 *
2343 * Use ->sh_offset to reflect section address in memory. It will
2344 * point to original read only copy if section is not allocatable.
2345 * Otherwise it will point to temporary copy which will be relocated.
2346 *
2347 * Use ->sh_addr to contain final address of the section where it
2348 * will go during execution time.
2349 */
2350 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2351 if (sechdrs[i].sh_type == SHT_NOBITS)
2352 continue;
2353
2354 sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
2355 sechdrs[i].sh_offset;
2356 }
2357
2358 /*
2359 * Identify entry point section and make entry relative to section
2360 * start.
2361 */
2362 entry = pi->ehdr->e_entry;
2363 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2364 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
2365 continue;
2366
2367 if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
2368 continue;
2369
2370 /* Make entry section relative */
2371 if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
2372 ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
2373 pi->ehdr->e_entry)) {
2374 entry_sidx = i;
2375 entry -= sechdrs[i].sh_addr;
2376 break;
2377 }
2378 }
2379
2380 /* Determine how much memory is needed to load relocatable object. */
2381 buf_align = 1;
2382 bss_align = 1;
2383 buf_sz = 0;
2384 bss_sz = 0;
2385
2386 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2387 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
2388 continue;
2389
2390 align = sechdrs[i].sh_addralign;
2391 if (sechdrs[i].sh_type != SHT_NOBITS) {
2392 if (buf_align < align)
2393 buf_align = align;
2394 buf_sz = ALIGN(buf_sz, align);
2395 buf_sz += sechdrs[i].sh_size;
2396 } else {
2397 /* bss section */
2398 if (bss_align < align)
2399 bss_align = align;
2400 bss_sz = ALIGN(bss_sz, align);
2401 bss_sz += sechdrs[i].sh_size;
2402 }
2403 }
2404
2405 /* Determine the bss padding required to align bss properly */
2406 bss_pad = 0;
2407 if (buf_sz & (bss_align - 1))
2408 bss_pad = bss_align - (buf_sz & (bss_align - 1));
2409
2410 memsz = buf_sz + bss_pad + bss_sz;
2411
2412 /* Allocate buffer for purgatory */
2413 purgatory_buf = vzalloc(buf_sz);
2414 if (!purgatory_buf) {
2415 ret = -ENOMEM;
2416 goto out;
2417 }
2418
2419 if (buf_align < bss_align)
2420 buf_align = bss_align;
2421
2422 /* Add buffer to segment list */
2423 ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
2424 buf_align, min, max, top_down,
2425 &pi->purgatory_load_addr);
2426 if (ret)
2427 goto out;
2428
2429 /* Load SHF_ALLOC sections */
2430 buf_addr = purgatory_buf;
2431 load_addr = curr_load_addr = pi->purgatory_load_addr;
2432 bss_addr = load_addr + buf_sz + bss_pad;
2433
2434 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2435 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
2436 continue;
2437
2438 align = sechdrs[i].sh_addralign;
2439 if (sechdrs[i].sh_type != SHT_NOBITS) {
2440 curr_load_addr = ALIGN(curr_load_addr, align);
2441 offset = curr_load_addr - load_addr;
2442			/* We already modified ->sh_offset to keep src addr */
2443 src = (char *) sechdrs[i].sh_offset;
2444 memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
2445
2446 /* Store load address and source address of section */
2447 sechdrs[i].sh_addr = curr_load_addr;
2448
2449 /*
2450 * This section got copied to temporary buffer. Update
2451 * ->sh_offset accordingly.
2452 */
2453 sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
2454
2455 /* Advance to the next address */
2456 curr_load_addr += sechdrs[i].sh_size;
2457 } else {
2458 bss_addr = ALIGN(bss_addr, align);
2459 sechdrs[i].sh_addr = bss_addr;
2460 bss_addr += sechdrs[i].sh_size;
2461 }
2462 }
2463
2464 /* Update entry point based on load address of text section */
2465 if (entry_sidx >= 0)
2466 entry += sechdrs[entry_sidx].sh_addr;
2467
2468 /* Make kernel jump to purgatory after shutdown */
2469 image->start = entry;
2470
2471 /* Used later to get/set symbol values */
2472 pi->sechdrs = sechdrs;
2473
2474 /*
2475 * Used later to identify which section is purgatory and skip it
2476 * from checksumming.
2477 */
2478 pi->purgatory_buf = purgatory_buf;
2479 return ret;
2480out:
2481 vfree(sechdrs);
2482 vfree(purgatory_buf);
2483 return ret;
2484}
2485
2486static int kexec_apply_relocations(struct kimage *image)
2487{
2488 int i, ret;
2489 struct purgatory_info *pi = &image->purgatory_info;
2490 Elf_Shdr *sechdrs = pi->sechdrs;
2491
2492 /* Apply relocations */
2493 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2494 Elf_Shdr *section, *symtab;
2495
2496 if (sechdrs[i].sh_type != SHT_RELA &&
2497 sechdrs[i].sh_type != SHT_REL)
2498 continue;
2499
2500 /*
2501 * For section of type SHT_RELA/SHT_REL,
2502		 * ->sh_link contains the section header index of the
2503		 * associated symbol table, and ->sh_info contains the section
2504		 * header index of the section to which the relocations apply.
2505 */
2506 if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
2507 sechdrs[i].sh_link >= pi->ehdr->e_shnum)
2508 return -ENOEXEC;
2509
2510 section = &sechdrs[sechdrs[i].sh_info];
2511 symtab = &sechdrs[sechdrs[i].sh_link];
2512
2513 if (!(section->sh_flags & SHF_ALLOC))
2514 continue;
2515
2516		 * symtab->sh_link contains the section header index of the
2517		 * associated string table.
2518 * string table.
2519 */
2520 if (symtab->sh_link >= pi->ehdr->e_shnum)
2521 /* Invalid section number? */
2522 continue;
2523
2524 /*
2525		 * The respective architecture needs to provide support for applying
2526 * relocations of type SHT_RELA/SHT_REL.
2527 */
2528 if (sechdrs[i].sh_type == SHT_RELA)
2529 ret = arch_kexec_apply_relocations_add(pi->ehdr,
2530 sechdrs, i);
2531 else if (sechdrs[i].sh_type == SHT_REL)
2532 ret = arch_kexec_apply_relocations(pi->ehdr,
2533 sechdrs, i);
2534 if (ret)
2535 return ret;
2536 }
2537
2538 return 0;
2539}
2540
2541/* Load relocatable purgatory object and relocate it appropriately */
2542int kexec_load_purgatory(struct kimage *image, unsigned long min,
2543 unsigned long max, int top_down,
2544 unsigned long *load_addr)
2545{
2546 struct purgatory_info *pi = &image->purgatory_info;
2547 int ret;
2548
2549 if (kexec_purgatory_size <= 0)
2550 return -EINVAL;
2551
2552 if (kexec_purgatory_size < sizeof(Elf_Ehdr))
2553 return -ENOEXEC;
2554
2555 pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
2556
2557 if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
2558 || pi->ehdr->e_type != ET_REL
2559 || !elf_check_arch(pi->ehdr)
2560 || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
2561 return -ENOEXEC;
2562
2563 if (pi->ehdr->e_shoff >= kexec_purgatory_size
2564 || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
2565 kexec_purgatory_size - pi->ehdr->e_shoff))
2566 return -ENOEXEC;
2567
2568 ret = __kexec_load_purgatory(image, min, max, top_down);
2569 if (ret)
2570 return ret;
2571
2572 ret = kexec_apply_relocations(image);
2573 if (ret)
2574 goto out;
2575
2576 *load_addr = pi->purgatory_load_addr;
2577 return 0;
2578out:
2579 vfree(pi->sechdrs);
2580 vfree(pi->purgatory_buf);
2581 return ret;
2582}
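
A hedged sketch of the usual sequence around purgatory: load the relocatable object into an allowed window, then patch one of its global symbols with kexec_purgatory_get_set_symbol() before handing control over. The window limits and the symbol name are illustrative.

/* Illustrative only: load purgatory below 4 GiB and patch one symbol. */
static int example_setup_purgatory(struct kimage *image,
				   unsigned long entry_addr)
{
	unsigned long purgatory_load_addr;
	int ret;

	ret = kexec_load_purgatory(image, 0, 0xffffffffUL, /* min, max */
				   true /* top_down */, &purgatory_load_addr);
	if (ret)
		return ret;

	/* "entry64_addr" is a hypothetical global defined by purgatory. */
	return kexec_purgatory_get_set_symbol(image, "entry64_addr",
					      &entry_addr, sizeof(entry_addr),
					      false /* set, don't get */);
}
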
2583
2584static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
2585 const char *name)
2586{
2587 Elf_Sym *syms;
2588 Elf_Shdr *sechdrs;
2589 Elf_Ehdr *ehdr;
2590 int i, k;
2591 const char *strtab;
2592
2593 if (!pi->sechdrs || !pi->ehdr)
2594 return NULL;
2595
2596 sechdrs = pi->sechdrs;
2597 ehdr = pi->ehdr;
2598
2599 for (i = 0; i < ehdr->e_shnum; i++) {
2600 if (sechdrs[i].sh_type != SHT_SYMTAB)
2601 continue;
2602
2603 if (sechdrs[i].sh_link >= ehdr->e_shnum)
2604 /* Invalid strtab section number */
2605 continue;
2606 strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
2607 syms = (Elf_Sym *)sechdrs[i].sh_offset;
2608
2609 /* Go through symbols for a match */
2610 for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
2611 if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
2612 continue;
2613
2614 if (strcmp(strtab + syms[k].st_name, name) != 0)
2615 continue;
2616
2617 if (syms[k].st_shndx == SHN_UNDEF ||
2618 syms[k].st_shndx >= ehdr->e_shnum) {
2619 pr_debug("Symbol: %s has bad section index %d.\n",
2620 name, syms[k].st_shndx);
2621 return NULL;
2622 }
2623
2624 /* Found the symbol we are looking for */
2625 return &syms[k];
2626 }
2627 }
2628
2629 return NULL;
2630}
2631
2632void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
2633{
2634 struct purgatory_info *pi = &image->purgatory_info;
2635 Elf_Sym *sym;
2636 Elf_Shdr *sechdr;
2637
2638 sym = kexec_purgatory_find_symbol(pi, name);
2639 if (!sym)
2640 return ERR_PTR(-EINVAL);
2641
2642 sechdr = &pi->sechdrs[sym->st_shndx];
2643
2644 /*
2645 * Returns the address where symbol will finally be loaded after
2646 * kexec_load_segment()
2647 */
2648 return (void *)(sechdr->sh_addr + sym->st_value);
2649}
2650
2651/*
2652 * Get or set the value of a symbol. If "get_value" is true, the symbol value
2653 * is returned in buf; otherwise the symbol value is set from the value in buf.
2654 */
2655int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
2656 void *buf, unsigned int size, bool get_value)
2657{
2658 Elf_Sym *sym;
2659 Elf_Shdr *sechdrs;
2660 struct purgatory_info *pi = &image->purgatory_info;
2661 char *sym_buf;
2662
2663 sym = kexec_purgatory_find_symbol(pi, name);
2664 if (!sym)
2665 return -EINVAL;
2666
2667 if (sym->st_size != size) {
2668 pr_err("symbol %s size mismatch: expected %lu actual %u\n",
2669 name, (unsigned long)sym->st_size, size);
2670 return -EINVAL;
2671 }
2672
2673 sechdrs = pi->sechdrs;
2674
2675 if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
2676 pr_err("symbol %s is in a bss section. Cannot %s\n", name,
2677 get_value ? "get" : "set");
2678 return -EINVAL;
2679 }
2680
2681 sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
2682 sym->st_value;
2683
2684 if (get_value)
2685 memcpy((void *)buf, sym_buf, size);
2686 else
2687 memcpy((void *)sym_buf, buf, size);
2688
2689 return 0;
2690}
2691#endif /* CONFIG_KEXEC_FILE */
2692
2693/*
2694 * Move into place and start executing a preloaded standalone
2695 * executable. If nothing was preloaded return an error.
2696 */
2697int kernel_kexec(void)
2698{
2699 int error = 0;
2700
2701 if (!mutex_trylock(&kexec_mutex))
2702 return -EBUSY;
2703 if (!kexec_image) {
2704 error = -EINVAL;
2705 goto Unlock;
2706 }
2707
2708#ifdef CONFIG_KEXEC_JUMP
2709 if (kexec_image->preserve_context) {
2710 lock_system_sleep();
2711 pm_prepare_console();
2712 error = freeze_processes();
2713 if (error) {
2714 error = -EBUSY;
2715 goto Restore_console;
2716 }
2717 suspend_console();
2718 error = dpm_suspend_start(PMSG_FREEZE);
2719 if (error)
2720 goto Resume_console;
2721 /* At this point, dpm_suspend_start() has been called,
2722 * but *not* dpm_suspend_end(). We *must* call
2723 * dpm_suspend_end() now. Otherwise, drivers for
2724 * some devices (e.g. interrupt controllers) become
2725 * desynchronized with the actual state of the
2726 * hardware at resume time, and evil weirdness ensues.
2727 */
2728 error = dpm_suspend_end(PMSG_FREEZE);
2729 if (error)
2730 goto Resume_devices;
2731 error = disable_nonboot_cpus();
2732 if (error)
2733 goto Enable_cpus;
2734 local_irq_disable();
2735 error = syscore_suspend();
2736 if (error)
2737 goto Enable_irqs;
2738 } else
2739#endif
2740 {
2741 kexec_in_progress = true;
2742 kernel_restart_prepare(NULL);
2743 migrate_to_reboot_cpu();
2744
2745 /*
2746 * migrate_to_reboot_cpu() disables CPU hotplug assuming that
2747 * no further code needs to use CPU hotplug (which is true in
2748 * the reboot case). However, the kexec path depends on using
2749 * CPU hotplug again; so re-enable it here.
2750 */
2751 cpu_hotplug_enable();
2752 pr_emerg("Starting new kernel\n");
2753 machine_shutdown();
2754 }
2755
2756 machine_kexec(kexec_image);
2757
2758#ifdef CONFIG_KEXEC_JUMP
2759 if (kexec_image->preserve_context) {
2760 syscore_resume();
2761 Enable_irqs:
2762 local_irq_enable();
2763 Enable_cpus:
2764 enable_nonboot_cpus();
2765 dpm_resume_start(PMSG_RESTORE);
2766 Resume_devices:
2767 dpm_resume_end(PMSG_RESTORE);
2768 Resume_console:
2769 resume_console();
2770 thaw_processes();
2771 Restore_console:
2772 pm_restore_console();
2773 unlock_system_sleep();
2774 }
2775#endif
2776
2777 Unlock:
2778 mutex_unlock(&kexec_mutex);
2779 return error;
2780}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
new file mode 100644
index 000000000000..201b45327804
--- /dev/null
+++ b/kernel/kexec_core.c
@@ -0,0 +1,1534 @@
1/*
2 * kexec.c - kexec system call core code.
3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */
8
9#define pr_fmt(fmt) "kexec: " fmt
10
11#include <linux/capability.h>
12#include <linux/mm.h>
13#include <linux/file.h>
14#include <linux/slab.h>
15#include <linux/fs.h>
16#include <linux/kexec.h>
17#include <linux/mutex.h>
18#include <linux/list.h>
19#include <linux/highmem.h>
20#include <linux/syscalls.h>
21#include <linux/reboot.h>
22#include <linux/ioport.h>
23#include <linux/hardirq.h>
24#include <linux/elf.h>
25#include <linux/elfcore.h>
26#include <linux/utsname.h>
27#include <linux/numa.h>
28#include <linux/suspend.h>
29#include <linux/device.h>
30#include <linux/freezer.h>
31#include <linux/pm.h>
32#include <linux/cpu.h>
33#include <linux/uaccess.h>
34#include <linux/io.h>
35#include <linux/console.h>
36#include <linux/vmalloc.h>
37#include <linux/swap.h>
38#include <linux/syscore_ops.h>
39#include <linux/compiler.h>
40#include <linux/hugetlb.h>
41
42#include <asm/page.h>
43#include <asm/sections.h>
44
45#include <crypto/hash.h>
46#include <crypto/sha.h>
47#include "kexec_internal.h"
48
49DEFINE_MUTEX(kexec_mutex);
50
51/* Per cpu memory for storing cpu states in case of system crash. */
52note_buf_t __percpu *crash_notes;
53
54/* vmcoreinfo stuff */
55static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
56u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
57size_t vmcoreinfo_size;
58size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
59
60/* Flag to indicate we are going to kexec a new kernel */
61bool kexec_in_progress = false;
62
63
64/* Location of the reserved area for the crash kernel */
65struct resource crashk_res = {
66 .name = "Crash kernel",
67 .start = 0,
68 .end = 0,
69 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
70};
71struct resource crashk_low_res = {
72 .name = "Crash kernel",
73 .start = 0,
74 .end = 0,
75 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
76};
77
78int kexec_should_crash(struct task_struct *p)
79{
80 /*
81 * If crash_kexec_post_notifiers is enabled, don't run
82 * crash_kexec() here yet, which must be run after panic
83 * notifiers in panic().
84 */
85 if (crash_kexec_post_notifiers)
86 return 0;
87 /*
88 * There are 4 panic() calls in do_exit() path, each of which
89 * corresponds to each of these 4 conditions.
90 */
91 if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
92 return 1;
93 return 0;
94}
95
96/*
97 * When kexec transitions to the new kernel there is a one-to-one
98 * mapping between physical and virtual addresses. On processors
99 * where you can disable the MMU this is trivial, and easy. For
100 * others it is still a simple predictable page table to setup.
101 *
102 * In that environment kexec copies the new kernel to its final
103 * resting place. This means I can only support memory whose
104 * physical address can fit in an unsigned long. In particular
105 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
106 * If the assembly stub has more restrictive requirements
107 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
108 * defined more restrictively in <asm/kexec.h>.
109 *
110 * The code for the transition from the current kernel to the
111 * new kernel is placed in the control_code_buffer, whose size
112 * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
113 * page of memory is necessary, but some architectures require more.
114 * Because this memory must be identity mapped in the transition from
115 * virtual to physical addresses it must live in the range
116 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
117 * modifiable.
118 *
119 * The assembly stub in the control code buffer is passed a linked list
120 * of descriptor pages detailing the source pages of the new kernel,
121 * and the destination addresses of those source pages. As this data
122 * structure is not used in the context of the current OS, it must
123 * be self-contained.
124 *
125 * The code has been made to work with highmem pages and will use a
126 * destination page in its final resting place (if it happens
127 * to allocate it). The end product of this is that most of the
128 * physical address space, and most of RAM can be used.
129 *
130 * Future directions include:
131 * - allocating a page table with the control code buffer identity
132 * mapped, to simplify machine_kexec and make kexec_on_panic more
133 * reliable.
134 */
135
136/*
137 * KIMAGE_NO_DEST is an impossible destination address..., for
138 * allocating pages whose destination address we do not care about.
139 */
140#define KIMAGE_NO_DEST (-1UL)
141
142static struct page *kimage_alloc_page(struct kimage *image,
143 gfp_t gfp_mask,
144 unsigned long dest);
145
146int sanity_check_segment_list(struct kimage *image)
147{
148 int result, i;
149 unsigned long nr_segments = image->nr_segments;
150
151 /*
152 * Verify we have good destination addresses. The caller is
153 * responsible for making certain we don't attempt to load
154 * the new image into invalid or reserved areas of RAM. This
155 * just verifies it is an address we can use.
156 *
157 * Since the kernel does everything in page size chunks ensure
158 * the destination addresses are page aligned. Too many
159	 * special cases crop up when we don't do this. The most
160 * insidious is getting overlapping destination addresses
161 * simply because addresses are changed to page size
162 * granularity.
163 */
164 result = -EADDRNOTAVAIL;
165 for (i = 0; i < nr_segments; i++) {
166 unsigned long mstart, mend;
167
168 mstart = image->segment[i].mem;
169 mend = mstart + image->segment[i].memsz;
170 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
171 return result;
172 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
173 return result;
174 }
175
176 /* Verify our destination addresses do not overlap.
177	 * If we allowed overlapping destination addresses
178	 * through, very weird things can happen with no
179	 * easy explanation as one segment stomps on another.
180 */
181 result = -EINVAL;
182 for (i = 0; i < nr_segments; i++) {
183 unsigned long mstart, mend;
184 unsigned long j;
185
186 mstart = image->segment[i].mem;
187 mend = mstart + image->segment[i].memsz;
188 for (j = 0; j < i; j++) {
189 unsigned long pstart, pend;
190
191 pstart = image->segment[j].mem;
192 pend = pstart + image->segment[j].memsz;
193 /* Do the segments overlap ? */
194 if ((mend > pstart) && (mstart < pend))
195 return result;
196 }
197 }
198
199 /* Ensure our buffer sizes are strictly less than
200 * our memory sizes. This should always be the case,
201 * and it is easier to check up front than to be surprised
202 * later on.
203 */
204 result = -EINVAL;
205 for (i = 0; i < nr_segments; i++) {
206 if (image->segment[i].bufsz > image->segment[i].memsz)
207 return result;
208 }
209
210 /*
211 * Verify we have good destination addresses. Normally
212 * the caller is responsible for making certain we don't
213 * attempt to load the new image into invalid or reserved
214 * areas of RAM. But crash kernels are preloaded into a
215	 * reserved area of RAM. We must ensure the addresses
216	 * are in the reserved area, otherwise preloading the
217 * kernel could corrupt things.
218 */
219
220 if (image->type == KEXEC_TYPE_CRASH) {
221 result = -EADDRNOTAVAIL;
222 for (i = 0; i < nr_segments; i++) {
223 unsigned long mstart, mend;
224
225 mstart = image->segment[i].mem;
226 mend = mstart + image->segment[i].memsz - 1;
227 /* Ensure we are within the crash kernel limits */
228 if ((mstart < crashk_res.start) ||
229 (mend > crashk_res.end))
230 return result;
231 }
232 }
233
234 return 0;
235}
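
The overlap test used in sanity_check_segment_list() and in kimage_is_destination_range() below is the standard interval check: two half-open ranges collide exactly when each one starts before the other ends. A tiny self-contained sketch:

#include <stdio.h>
#include <stdbool.h>

/* Half-open ranges [astart, aend) and [bstart, bend) overlap iff
 * each one starts before the other one ends. */
static bool ranges_overlap(unsigned long astart, unsigned long aend,
			   unsigned long bstart, unsigned long bend)
{
	return aend > bstart && astart < bend;
}

int main(void)
{
	printf("%d\n", ranges_overlap(0x1000, 0x3000, 0x2000, 0x4000)); /* 1 */
	printf("%d\n", ranges_overlap(0x1000, 0x2000, 0x2000, 0x3000)); /* 0 */
	return 0;
}
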
236
237struct kimage *do_kimage_alloc_init(void)
238{
239 struct kimage *image;
240
241 /* Allocate a controlling structure */
242 image = kzalloc(sizeof(*image), GFP_KERNEL);
243 if (!image)
244 return NULL;
245
246 image->head = 0;
247 image->entry = &image->head;
248 image->last_entry = &image->head;
249 image->control_page = ~0; /* By default this does not apply */
250 image->type = KEXEC_TYPE_DEFAULT;
251
252 /* Initialize the list of control pages */
253 INIT_LIST_HEAD(&image->control_pages);
254
255 /* Initialize the list of destination pages */
256 INIT_LIST_HEAD(&image->dest_pages);
257
258 /* Initialize the list of unusable pages */
259 INIT_LIST_HEAD(&image->unusable_pages);
260
261 return image;
262}
263
264int kimage_is_destination_range(struct kimage *image,
265 unsigned long start,
266 unsigned long end)
267{
268 unsigned long i;
269
270 for (i = 0; i < image->nr_segments; i++) {
271 unsigned long mstart, mend;
272
273 mstart = image->segment[i].mem;
274 mend = mstart + image->segment[i].memsz;
275 if ((end > mstart) && (start < mend))
276 return 1;
277 }
278
279 return 0;
280}
281
282static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
283{
284 struct page *pages;
285
286 pages = alloc_pages(gfp_mask, order);
287 if (pages) {
288 unsigned int count, i;
289
290 pages->mapping = NULL;
291 set_page_private(pages, order);
292 count = 1 << order;
293 for (i = 0; i < count; i++)
294 SetPageReserved(pages + i);
295 }
296
297 return pages;
298}
299
300static void kimage_free_pages(struct page *page)
301{
302 unsigned int order, count, i;
303
304 order = page_private(page);
305 count = 1 << order;
306 for (i = 0; i < count; i++)
307 ClearPageReserved(page + i);
308 __free_pages(page, order);
309}
310
311void kimage_free_page_list(struct list_head *list)
312{
313 struct list_head *pos, *next;
314
315 list_for_each_safe(pos, next, list) {
316 struct page *page;
317
318 page = list_entry(pos, struct page, lru);
319 list_del(&page->lru);
320 kimage_free_pages(page);
321 }
322}
323
324static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
325 unsigned int order)
326{
327	/* Control pages are special; they are the intermediaries
328 * that are needed while we copy the rest of the pages
329 * to their final resting place. As such they must
330 * not conflict with either the destination addresses
331 * or memory the kernel is already using.
332 *
333 * The only case where we really need more than one of
334 * these are for architectures where we cannot disable
335 * the MMU and must instead generate an identity mapped
336 * page table for all of the memory.
337 *
338 * At worst this runs in O(N) of the image size.
339 */
340 struct list_head extra_pages;
341 struct page *pages;
342 unsigned int count;
343
344 count = 1 << order;
345 INIT_LIST_HEAD(&extra_pages);
346
347 /* Loop while I can allocate a page and the page allocated
348 * is a destination page.
349 */
350 do {
351 unsigned long pfn, epfn, addr, eaddr;
352
353 pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
354 if (!pages)
355 break;
356 pfn = page_to_pfn(pages);
357 epfn = pfn + count;
358 addr = pfn << PAGE_SHIFT;
359 eaddr = epfn << PAGE_SHIFT;
360 if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
361 kimage_is_destination_range(image, addr, eaddr)) {
362 list_add(&pages->lru, &extra_pages);
363 pages = NULL;
364 }
365 } while (!pages);
366
367 if (pages) {
368 /* Remember the allocated page... */
369 list_add(&pages->lru, &image->control_pages);
370
371		/* Because the page is already in its destination
372 * location we will never allocate another page at
373 * that address. Therefore kimage_alloc_pages
374 * will not return it (again) and we don't need
375 * to give it an entry in image->segment[].
376 */
377 }
378 /* Deal with the destination pages I have inadvertently allocated.
379 *
380 * Ideally I would convert multi-page allocations into single
381 * page allocations, and add everything to image->dest_pages.
382 *
383 * For now it is simpler to just free the pages.
384 */
385 kimage_free_page_list(&extra_pages);
386
387 return pages;
388}
389
390static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
391 unsigned int order)
392{
393	/* Control pages are special; they are the intermediaries
394 * that are needed while we copy the rest of the pages
395 * to their final resting place. As such they must
396 * not conflict with either the destination addresses
397 * or memory the kernel is already using.
398 *
399	 * Control pages are also the only pages we must allocate
400 * when loading a crash kernel. All of the other pages
401 * are specified by the segments and we just memcpy
402 * into them directly.
403 *
404 * The only case where we really need more than one of
405 * these are for architectures where we cannot disable
406 * the MMU and must instead generate an identity mapped
407 * page table for all of the memory.
408 *
409 * Given the low demand this implements a very simple
410 * allocator that finds the first hole of the appropriate
411 * size in the reserved memory region, and allocates all
412 * of the memory up to and including the hole.
413 */
414 unsigned long hole_start, hole_end, size;
415 struct page *pages;
416
417 pages = NULL;
418 size = (1 << order) << PAGE_SHIFT;
419 hole_start = (image->control_page + (size - 1)) & ~(size - 1);
420 hole_end = hole_start + size - 1;
421 while (hole_end <= crashk_res.end) {
422 unsigned long i;
423
424 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
425 break;
426 /* See if I overlap any of the segments */
427 for (i = 0; i < image->nr_segments; i++) {
428 unsigned long mstart, mend;
429
430 mstart = image->segment[i].mem;
431 mend = mstart + image->segment[i].memsz - 1;
432 if ((hole_end >= mstart) && (hole_start <= mend)) {
433 /* Advance the hole to the end of the segment */
434 hole_start = (mend + (size - 1)) & ~(size - 1);
435 hole_end = hole_start + size - 1;
436 break;
437 }
438 }
439 /* If I don't overlap any segments I have found my hole! */
440 if (i == image->nr_segments) {
441 pages = pfn_to_page(hole_start >> PAGE_SHIFT);
442 image->control_page = hole_end;
443 break;
444 }
445 }
446
447 return pages;
448}
449
450
451struct page *kimage_alloc_control_pages(struct kimage *image,
452 unsigned int order)
453{
454 struct page *pages = NULL;
455
456 switch (image->type) {
457 case KEXEC_TYPE_DEFAULT:
458 pages = kimage_alloc_normal_control_pages(image, order);
459 break;
460 case KEXEC_TYPE_CRASH:
461 pages = kimage_alloc_crash_control_pages(image, order);
462 break;
463 }
464
465 return pages;
466}
467
468static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
469{
470 if (*image->entry != 0)
471 image->entry++;
472
473 if (image->entry == image->last_entry) {
474 kimage_entry_t *ind_page;
475 struct page *page;
476
477 page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
478 if (!page)
479 return -ENOMEM;
480
481 ind_page = page_address(page);
482 *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
483 image->entry = ind_page;
484 image->last_entry = ind_page +
485 ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
486 }
487 *image->entry = entry;
488 image->entry++;
489 *image->entry = 0;
490
491 return 0;
492}
493
494static int kimage_set_destination(struct kimage *image,
495 unsigned long destination)
496{
497 int result;
498
499 destination &= PAGE_MASK;
500 result = kimage_add_entry(image, destination | IND_DESTINATION);
501
502 return result;
503}
504
505
506static int kimage_add_page(struct kimage *image, unsigned long page)
507{
508 int result;
509
510 page &= PAGE_MASK;
511 result = kimage_add_entry(image, page | IND_SOURCE);
512
513 return result;
514}
515
516
517static void kimage_free_extra_pages(struct kimage *image)
518{
519 /* Walk through and free any extra destination pages I may have */
520 kimage_free_page_list(&image->dest_pages);
521
522 /* Walk through and free any unusable pages I have cached */
523 kimage_free_page_list(&image->unusable_pages);
524
525}
526void kimage_terminate(struct kimage *image)
527{
528 if (*image->entry != 0)
529 image->entry++;
530
531 *image->entry = IND_DONE;
532}
533
534#define for_each_kimage_entry(image, ptr, entry) \
535 for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
536 ptr = (entry & IND_INDIRECTION) ? \
537 phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
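
The entry list built by kimage_add_entry() is a compact encoding: an IND_DESTINATION entry sets the current destination address, each IND_SOURCE entry that follows names one source page to copy there (the destination then advances by a page), IND_INDIRECTION chains to the next page of entries, and IND_DONE ends the list. A minimal decoder sketch over a flat array, with local stand-in flag values and the indirection-page chaining left out:

#include <stdio.h>

/* Local stand-ins for the kimage entry flags (values illustrative). */
#define EX_DESTINATION	0x1UL
#define EX_SOURCE	0x2UL
#define EX_DONE		0x4UL
#define EX_PAGE_MASK	(~0xfffUL)
#define EX_PAGE_SIZE	0x1000UL

static void decode(const unsigned long *entry)
{
	unsigned long destination = 0;

	for (; !(*entry & EX_DONE); entry++) {
		if (*entry & EX_DESTINATION) {
			destination = *entry & EX_PAGE_MASK;
		} else if (*entry & EX_SOURCE) {
			printf("copy page %#lx -> %#lx\n",
			       *entry & EX_PAGE_MASK, destination);
			destination += EX_PAGE_SIZE;
		}
	}
}

int main(void)
{
	unsigned long list[] = {
		0x100000 | EX_DESTINATION,
		0x7f1000 | EX_SOURCE,	/* lands at 0x100000 */
		0x7f5000 | EX_SOURCE,	/* lands at 0x101000 */
		EX_DONE,
	};

	decode(list);
	return 0;
}
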
538
539static void kimage_free_entry(kimage_entry_t entry)
540{
541 struct page *page;
542
543 page = pfn_to_page(entry >> PAGE_SHIFT);
544 kimage_free_pages(page);
545}
546
547void kimage_free(struct kimage *image)
548{
549 kimage_entry_t *ptr, entry;
550 kimage_entry_t ind = 0;
551
552 if (!image)
553 return;
554
555 kimage_free_extra_pages(image);
556 for_each_kimage_entry(image, ptr, entry) {
557 if (entry & IND_INDIRECTION) {
558 /* Free the previous indirection page */
559 if (ind & IND_INDIRECTION)
560 kimage_free_entry(ind);
561 /* Save this indirection page until we are
562 * done with it.
563 */
564 ind = entry;
565 } else if (entry & IND_SOURCE)
566 kimage_free_entry(entry);
567 }
568 /* Free the final indirection page */
569 if (ind & IND_INDIRECTION)
570 kimage_free_entry(ind);
571
572 /* Handle any machine specific cleanup */
573 machine_kexec_cleanup(image);
574
575 /* Free the kexec control pages... */
576 kimage_free_page_list(&image->control_pages);
577
578 /*
579	 * Free up any temporary buffers allocated. This path might be hit
580	 * if an error occurred much later, after buffer allocation.
581 */
582 if (image->file_mode)
583 kimage_file_post_load_cleanup(image);
584
585 kfree(image);
586}
587
588static kimage_entry_t *kimage_dst_used(struct kimage *image,
589 unsigned long page)
590{
591 kimage_entry_t *ptr, entry;
592 unsigned long destination = 0;
593
594 for_each_kimage_entry(image, ptr, entry) {
595 if (entry & IND_DESTINATION)
596 destination = entry & PAGE_MASK;
597 else if (entry & IND_SOURCE) {
598 if (page == destination)
599 return ptr;
600 destination += PAGE_SIZE;
601 }
602 }
603
604 return NULL;
605}
606
607static struct page *kimage_alloc_page(struct kimage *image,
608 gfp_t gfp_mask,
609 unsigned long destination)
610{
611 /*
612 * Here we implement safeguards to ensure that a source page
613 * is not copied to its destination page before the data on
614 * the destination page is no longer useful.
615 *
616 * To do this we maintain the invariant that a source page is
617 * either its own destination page, or it is not a
618 * destination page at all.
619 *
620 * That is slightly stronger than required, but the proof
621	 * that no problems will occur is trivial, and the
622	 * implementation is simple to verify.
623 *
624 * When allocating all pages normally this algorithm will run
625 * in O(N) time, but in the worst case it will run in O(N^2)
626 * time. If the runtime is a problem the data structures can
627 * be fixed.
628 */
629 struct page *page;
630 unsigned long addr;
631
632 /*
633 * Walk through the list of destination pages, and see if I
634 * have a match.
635 */
636 list_for_each_entry(page, &image->dest_pages, lru) {
637 addr = page_to_pfn(page) << PAGE_SHIFT;
638 if (addr == destination) {
639 list_del(&page->lru);
640 return page;
641 }
642 }
643 page = NULL;
644 while (1) {
645 kimage_entry_t *old;
646
647 /* Allocate a page, if we run out of memory give up */
648 page = kimage_alloc_pages(gfp_mask, 0);
649 if (!page)
650 return NULL;
651 /* If the page cannot be used file it away */
652 if (page_to_pfn(page) >
653 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
654 list_add(&page->lru, &image->unusable_pages);
655 continue;
656 }
657 addr = page_to_pfn(page) << PAGE_SHIFT;
658
659		/* If it is the destination page we want, use it */
660 if (addr == destination)
661 break;
662
663 /* If the page is not a destination page use it */
664 if (!kimage_is_destination_range(image, addr,
665 addr + PAGE_SIZE))
666 break;
667
668 /*
669		 * I know that the page is someone's destination page.
670 * See if there is already a source page for this
671 * destination page. And if so swap the source pages.
672 */
673 old = kimage_dst_used(image, addr);
674 if (old) {
675 /* If so move it */
676 unsigned long old_addr;
677 struct page *old_page;
678
679 old_addr = *old & PAGE_MASK;
680 old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
681 copy_highpage(page, old_page);
682 *old = addr | (*old & ~PAGE_MASK);
683
684 /* The old page I have found cannot be a
685			 * destination page, so return it if its
686			 * gfp_flags honor the ones passed in.
687 */
688 if (!(gfp_mask & __GFP_HIGHMEM) &&
689 PageHighMem(old_page)) {
690 kimage_free_pages(old_page);
691 continue;
692 }
693 addr = old_addr;
694 page = old_page;
695 break;
696 }
697 /* Place the page on the destination list, to be used later */
698 list_add(&page->lru, &image->dest_pages);
699 }
700
701 return page;
702}
703
704static int kimage_load_normal_segment(struct kimage *image,
705 struct kexec_segment *segment)
706{
707 unsigned long maddr;
708 size_t ubytes, mbytes;
709 int result;
710 unsigned char __user *buf = NULL;
711 unsigned char *kbuf = NULL;
712
713 result = 0;
714 if (image->file_mode)
715 kbuf = segment->kbuf;
716 else
717 buf = segment->buf;
718 ubytes = segment->bufsz;
719 mbytes = segment->memsz;
720 maddr = segment->mem;
721
722 result = kimage_set_destination(image, maddr);
723 if (result < 0)
724 goto out;
725
726 while (mbytes) {
727 struct page *page;
728 char *ptr;
729 size_t uchunk, mchunk;
730
731 page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
732 if (!page) {
733 result = -ENOMEM;
734 goto out;
735 }
736 result = kimage_add_page(image, page_to_pfn(page)
737 << PAGE_SHIFT);
738 if (result < 0)
739 goto out;
740
741 ptr = kmap(page);
742 /* Start with a clear page */
743 clear_page(ptr);
744 ptr += maddr & ~PAGE_MASK;
745 mchunk = min_t(size_t, mbytes,
746 PAGE_SIZE - (maddr & ~PAGE_MASK));
747 uchunk = min(ubytes, mchunk);
748
749 /* For file based kexec, source pages are in kernel memory */
750 if (image->file_mode)
751 memcpy(ptr, kbuf, uchunk);
752 else
753 result = copy_from_user(ptr, buf, uchunk);
754 kunmap(page);
755 if (result) {
756 result = -EFAULT;
757 goto out;
758 }
759 ubytes -= uchunk;
760 maddr += mchunk;
761 if (image->file_mode)
762 kbuf += mchunk;
763 else
764 buf += mchunk;
765 mbytes -= mchunk;
766 }
767out:
768 return result;
769}
770
771static int kimage_load_crash_segment(struct kimage *image,
772 struct kexec_segment *segment)
773{
774	/* For crash dump kernels we simply copy the data from
775	 * user space to its destination.
776 * We do things a page at a time for the sake of kmap.
777 */
778 unsigned long maddr;
779 size_t ubytes, mbytes;
780 int result;
781 unsigned char __user *buf = NULL;
782 unsigned char *kbuf = NULL;
783
784 result = 0;
785 if (image->file_mode)
786 kbuf = segment->kbuf;
787 else
788 buf = segment->buf;
789 ubytes = segment->bufsz;
790 mbytes = segment->memsz;
791 maddr = segment->mem;
792 while (mbytes) {
793 struct page *page;
794 char *ptr;
795 size_t uchunk, mchunk;
796
797 page = pfn_to_page(maddr >> PAGE_SHIFT);
798 if (!page) {
799 result = -ENOMEM;
800 goto out;
801 }
802 ptr = kmap(page);
803 ptr += maddr & ~PAGE_MASK;
804 mchunk = min_t(size_t, mbytes,
805 PAGE_SIZE - (maddr & ~PAGE_MASK));
806 uchunk = min(ubytes, mchunk);
807 if (mchunk > uchunk) {
808 /* Zero the trailing part of the page */
809 memset(ptr + uchunk, 0, mchunk - uchunk);
810 }
811
812 /* For file based kexec, source pages are in kernel memory */
813 if (image->file_mode)
814 memcpy(ptr, kbuf, uchunk);
815 else
816 result = copy_from_user(ptr, buf, uchunk);
817 kexec_flush_icache_page(page);
818 kunmap(page);
819 if (result) {
820 result = -EFAULT;
821 goto out;
822 }
823 ubytes -= uchunk;
824 maddr += mchunk;
825 if (image->file_mode)
826 kbuf += mchunk;
827 else
828 buf += mchunk;
829 mbytes -= mchunk;
830 }
831out:
832 return result;
833}
834
835int kimage_load_segment(struct kimage *image,
836 struct kexec_segment *segment)
837{
838 int result = -ENOMEM;
839
840 switch (image->type) {
841 case KEXEC_TYPE_DEFAULT:
842 result = kimage_load_normal_segment(image, segment);
843 break;
844 case KEXEC_TYPE_CRASH:
845 result = kimage_load_crash_segment(image, segment);
846 break;
847 }
848
849 return result;
850}
851
852struct kimage *kexec_image;
853struct kimage *kexec_crash_image;
854int kexec_load_disabled;
855
856void crash_kexec(struct pt_regs *regs)
857{
858 /* Take the kexec_mutex here to prevent sys_kexec_load
859 * running on one cpu from replacing the crash kernel
860 * we are using after a panic on a different cpu.
861 *
862 * If the crash kernel was not located in a fixed area
863 * of memory the xchg(&kexec_crash_image) would be
864 * sufficient. But since I reuse the memory...
865 */
866 if (mutex_trylock(&kexec_mutex)) {
867 if (kexec_crash_image) {
868 struct pt_regs fixed_regs;
869
870 crash_setup_regs(&fixed_regs, regs);
871 crash_save_vmcoreinfo();
872 machine_crash_shutdown(&fixed_regs);
873 machine_kexec(kexec_crash_image);
874 }
875 mutex_unlock(&kexec_mutex);
876 }
877}
878
879size_t crash_get_memory_size(void)
880{
881 size_t size = 0;
882
883 mutex_lock(&kexec_mutex);
884 if (crashk_res.end != crashk_res.start)
885 size = resource_size(&crashk_res);
886 mutex_unlock(&kexec_mutex);
887 return size;
888}
889
890void __weak crash_free_reserved_phys_range(unsigned long begin,
891 unsigned long end)
892{
893 unsigned long addr;
894
895 for (addr = begin; addr < end; addr += PAGE_SIZE)
896 free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
897}
898
899int crash_shrink_memory(unsigned long new_size)
900{
901 int ret = 0;
902 unsigned long start, end;
903 unsigned long old_size;
904 struct resource *ram_res;
905
906 mutex_lock(&kexec_mutex);
907
908 if (kexec_crash_image) {
909 ret = -ENOENT;
910 goto unlock;
911 }
912 start = crashk_res.start;
913 end = crashk_res.end;
914 old_size = (end == 0) ? 0 : end - start + 1;
915 if (new_size >= old_size) {
916 ret = (new_size == old_size) ? 0 : -EINVAL;
917 goto unlock;
918 }
919
920 ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
921 if (!ram_res) {
922 ret = -ENOMEM;
923 goto unlock;
924 }
925
926 start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
927 end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
928
929 crash_map_reserved_pages();
930 crash_free_reserved_phys_range(end, crashk_res.end);
931
932 if ((start == end) && (crashk_res.parent != NULL))
933 release_resource(&crashk_res);
934
935 ram_res->start = end;
936 ram_res->end = crashk_res.end;
937 ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
938 ram_res->name = "System RAM";
939
940 crashk_res.end = end - 1;
941
942 insert_resource(&iomem_resource, ram_res);
943 crash_unmap_reserved_pages();
944
945unlock:
946 mutex_unlock(&kexec_mutex);
947 return ret;
948}
949
950static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
951 size_t data_len)
952{
953 struct elf_note note;
954
955 note.n_namesz = strlen(name) + 1;
956 note.n_descsz = data_len;
957 note.n_type = type;
958 memcpy(buf, &note, sizeof(note));
959 buf += (sizeof(note) + 3)/4;
960 memcpy(buf, name, note.n_namesz);
961 buf += (note.n_namesz + 3)/4;
962 memcpy(buf, data, note.n_descsz);
963 buf += (note.n_descsz + 3)/4;
964
965 return buf;
966}
967
968static void final_note(u32 *buf)
969{
970 struct elf_note note;
971
972 note.n_namesz = 0;
973 note.n_descsz = 0;
974 note.n_type = 0;
975 memcpy(buf, &note, sizeof(note));
976}
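
To make the word-rounding above concrete, here is a standalone userspace sketch (not part of this file) that mirrors append_elf_note(): the note header, the name and the descriptor are each padded to a 4-byte boundary inside the u32 buffer. The name "CORE", the 8-byte payload and the NT_PRSTATUS type value are illustrative only.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct elf_note {                       /* mirrors the kernel's struct elf_note */
        uint32_t n_namesz;
        uint32_t n_descsz;
        uint32_t n_type;
};

int main(void)
{
        const char *name = "CORE";
        char data[8] = "payload";       /* 7 chars + NUL = 8-byte descriptor */
        uint32_t buf[64] = { 0 }, *p = buf;
        struct elf_note note = {
                .n_namesz = strlen(name) + 1,
                .n_descsz = sizeof(data),
                .n_type   = 1,          /* NT_PRSTATUS */
        };

        memcpy(p, &note, sizeof(note));
        p += (sizeof(note) + 3) / 4;    /* 12 bytes -> 3 words */
        memcpy(p, name, note.n_namesz);
        p += (note.n_namesz + 3) / 4;   /* 5 bytes  -> 2 words */
        memcpy(p, data, note.n_descsz);
        p += (note.n_descsz + 3) / 4;   /* 8 bytes  -> 2 words */

        printf("note occupies %zu bytes\n", (p - buf) * sizeof(*p));    /* 28 */
        return 0;
}
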
977
978void crash_save_cpu(struct pt_regs *regs, int cpu)
979{
980 struct elf_prstatus prstatus;
981 u32 *buf;
982
983 if ((cpu < 0) || (cpu >= nr_cpu_ids))
984 return;
985
986 /* Using ELF notes here is opportunistic.
987 * I need a well-defined structure format
988 * for the data I pass, and I need tags
989 * on the data to indicate what information I have
990 * squirrelled away. ELF notes happen to provide
991 * all of that, so there is no need to invent something new.
992 */
993 buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
994 if (!buf)
995 return;
996 memset(&prstatus, 0, sizeof(prstatus));
997 prstatus.pr_pid = current->pid;
998 elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
999 buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
1000 &prstatus, sizeof(prstatus));
1001 final_note(buf);
1002}
1003
1004static int __init crash_notes_memory_init(void)
1005{
1006 /* Allocate memory for saving cpu registers. */
1007 size_t size, align;
1008
1009 /*
1010 * crash_notes could be allocated across 2 vmalloc pages when percpu
1011 * is vmalloc based. vmalloc doesn't guarantee that 2 contiguous vmalloc
1012 * pages are also on 2 contiguous physical pages. In this case the
1013 * 2nd part of crash_notes in the 2nd page could be lost since only the
1014 * starting address and size of crash_notes are exported through sysfs.
1015 * Here round up the size of crash_notes to the nearest power of two
1016 * and pass it to __alloc_percpu as align value. This can make sure
1017 * crash_notes is allocated inside one physical page.
1018 */
1019 size = sizeof(note_buf_t);
1020 align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE);
1021
1022 /*
1023 * Break the build if size is bigger than PAGE_SIZE, since crash_notes
1024 * would then definitely span 2 pages.
1025 */
1026 BUILD_BUG_ON(size > PAGE_SIZE);
1027
1028 crash_notes = __alloc_percpu(size, align);
1029 if (!crash_notes) {
1030 pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
1031 return -ENOMEM;
1032 }
1033 return 0;
1034}
1035subsys_initcall(crash_notes_memory_init);
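
A small userspace sketch of the alignment argument made above: an allocation whose alignment is a power of two no larger than the page size, and whose size does not exceed that alignment, cannot straddle a page boundary. The 1008-byte size is a made-up stand-in for sizeof(note_buf_t).

#include <stdio.h>

int main(void)
{
        unsigned long page = 4096, size = 1008, align = 1;

        while (align < size)            /* roundup_pow_of_two() */
                align <<= 1;

        /* Worst case: the block starts at the last aligned offset in a page. */
        printf("align=%lu, worst-case end offset=%lu (<= %lu)\n",
               align, page - align + size, page);
        return 0;
}
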
1036
1037
1038/*
1039 * parsing the "crashkernel" commandline
1040 *
1041 * this code is intended to be called from architecture specific code
1042 */
1043
1044
1045/*
1046 * This function parses command lines in the format
1047 *
1048 * crashkernel=ramsize-range:size[,...][@offset]
1049 *
1050 * The function returns 0 on success and -EINVAL on failure.
1051 */
1052static int __init parse_crashkernel_mem(char *cmdline,
1053 unsigned long long system_ram,
1054 unsigned long long *crash_size,
1055 unsigned long long *crash_base)
1056{
1057 char *cur = cmdline, *tmp;
1058
1059 /* for each entry of the comma-separated list */
1060 do {
1061 unsigned long long start, end = ULLONG_MAX, size;
1062
1063 /* get the start of the range */
1064 start = memparse(cur, &tmp);
1065 if (cur == tmp) {
1066 pr_warn("crashkernel: Memory value expected\n");
1067 return -EINVAL;
1068 }
1069 cur = tmp;
1070 if (*cur != '-') {
1071 pr_warn("crashkernel: '-' expected\n");
1072 return -EINVAL;
1073 }
1074 cur++;
1075
1076 /* if no ':' is here, then we read the end */
1077 if (*cur != ':') {
1078 end = memparse(cur, &tmp);
1079 if (cur == tmp) {
1080 pr_warn("crashkernel: Memory value expected\n");
1081 return -EINVAL;
1082 }
1083 cur = tmp;
1084 if (end <= start) {
1085 pr_warn("crashkernel: end <= start\n");
1086 return -EINVAL;
1087 }
1088 }
1089
1090 if (*cur != ':') {
1091 pr_warn("crashkernel: ':' expected\n");
1092 return -EINVAL;
1093 }
1094 cur++;
1095
1096 size = memparse(cur, &tmp);
1097 if (cur == tmp) {
1098 pr_warn("Memory value expected\n");
1099 return -EINVAL;
1100 }
1101 cur = tmp;
1102 if (size >= system_ram) {
1103 pr_warn("crashkernel: invalid size\n");
1104 return -EINVAL;
1105 }
1106
1107 /* match ? */
1108 if (system_ram >= start && system_ram < end) {
1109 *crash_size = size;
1110 break;
1111 }
1112 } while (*cur++ == ',');
1113
1114 if (*crash_size > 0) {
1115 while (*cur && *cur != ' ' && *cur != '@')
1116 cur++;
1117 if (*cur == '@') {
1118 cur++;
1119 *crash_base = memparse(cur, &tmp);
1120 if (cur == tmp) {
1121 pr_warn("Memory value expected after '@'\n");
1122 return -EINVAL;
1123 }
1124 }
1125 }
1126
1127 return 0;
1128}
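
A self-contained userspace sketch of the range-matching grammar handled above, with strtoull standing in for memparse; the command line and RAM size are example values only. With 1 GiB of system RAM, the 512M-2G range matches and 64M is chosen.

#include <stdio.h>
#include <stdlib.h>

static unsigned long long miniparse(const char *s, char **end)
{
        unsigned long long v = strtoull(s, end, 0);

        switch (**end) {
        case 'G': v <<= 10;     /* fall through */
        case 'M': v <<= 10;     /* fall through */
        case 'K': v <<= 10; (*end)++; break;
        }
        return v;
}

int main(void)
{
        char cmdline[] = "512M-2G:64M,2G-:128M";      /* example, assumed well-formed */
        unsigned long long ram = 1ULL << 30;          /* 1 GiB of system RAM */
        char *cur = cmdline;

        do {
                unsigned long long start, end = ~0ULL, size;

                start = miniparse(cur, &cur);
                cur++;                                /* skip '-' */
                if (*cur != ':')
                        end = miniparse(cur, &cur);
                cur++;                                /* skip ':' */
                size = miniparse(cur, &cur);

                if (ram >= start && ram < end) {
                        printf("crashkernel size: %lluM\n", size >> 20);
                        break;
                }
        } while (*cur++ == ',');
        return 0;
}
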
1129
1130/*
1131 * This function parses "simple" (old) crashkernel command lines like
1132 *
1133 * crashkernel=size[@offset]
1134 *
1135 * It returns 0 on success and -EINVAL on failure.
1136 */
1137static int __init parse_crashkernel_simple(char *cmdline,
1138 unsigned long long *crash_size,
1139 unsigned long long *crash_base)
1140{
1141 char *cur = cmdline;
1142
1143 *crash_size = memparse(cmdline, &cur);
1144 if (cmdline == cur) {
1145 pr_warn("crashkernel: memory value expected\n");
1146 return -EINVAL;
1147 }
1148
1149 if (*cur == '@')
1150 *crash_base = memparse(cur+1, &cur);
1151 else if (*cur != ' ' && *cur != '\0') {
1152 pr_warn("crashkernel: unrecognized char\n");
1153 return -EINVAL;
1154 }
1155
1156 return 0;
1157}
1158
1159#define SUFFIX_HIGH 0
1160#define SUFFIX_LOW 1
1161#define SUFFIX_NULL 2
1162static __initdata char *suffix_tbl[] = {
1163 [SUFFIX_HIGH] = ",high",
1164 [SUFFIX_LOW] = ",low",
1165 [SUFFIX_NULL] = NULL,
1166};
1167
1168/*
1169 * This function parses "suffix" crashkernel command lines like
1170 *
1171 * crashkernel=size,[high|low]
1172 *
1173 * It returns 0 on success and -EINVAL on failure.
1174 */
1175static int __init parse_crashkernel_suffix(char *cmdline,
1176 unsigned long long *crash_size,
1177 const char *suffix)
1178{
1179 char *cur = cmdline;
1180
1181 *crash_size = memparse(cmdline, &cur);
1182 if (cmdline == cur) {
1183 pr_warn("crashkernel: memory value expected\n");
1184 return -EINVAL;
1185 }
1186
1187 /* check with suffix */
1188 if (strncmp(cur, suffix, strlen(suffix))) {
1189 pr_warn("crashkernel: unrecognized char\n");
1190 return -EINVAL;
1191 }
1192 cur += strlen(suffix);
1193 if (*cur != ' ' && *cur != '\0') {
1194 pr_warn("crashkernel: unrecognized char\n");
1195 return -EINVAL;
1196 }
1197
1198 return 0;
1199}
1200
1201static __init char *get_last_crashkernel(char *cmdline,
1202 const char *name,
1203 const char *suffix)
1204{
1205 char *p = cmdline, *ck_cmdline = NULL;
1206
1207 /* find crashkernel and use the last one if there are more */
1208 p = strstr(p, name);
1209 while (p) {
1210 char *end_p = strchr(p, ' ');
1211 char *q;
1212
1213 if (!end_p)
1214 end_p = p + strlen(p);
1215
1216 if (!suffix) {
1217 int i;
1218
1219 /* skip the one with any known suffix */
1220 for (i = 0; suffix_tbl[i]; i++) {
1221 q = end_p - strlen(suffix_tbl[i]);
1222 if (!strncmp(q, suffix_tbl[i],
1223 strlen(suffix_tbl[i])))
1224 goto next;
1225 }
1226 ck_cmdline = p;
1227 } else {
1228 q = end_p - strlen(suffix);
1229 if (!strncmp(q, suffix, strlen(suffix)))
1230 ck_cmdline = p;
1231 }
1232next:
1233 p = strstr(p+1, name);
1234 }
1235
1236 if (!ck_cmdline)
1237 return NULL;
1238
1239 return ck_cmdline;
1240}
1241
1242static int __init __parse_crashkernel(char *cmdline,
1243 unsigned long long system_ram,
1244 unsigned long long *crash_size,
1245 unsigned long long *crash_base,
1246 const char *name,
1247 const char *suffix)
1248{
1249 char *first_colon, *first_space;
1250 char *ck_cmdline;
1251
1252 BUG_ON(!crash_size || !crash_base);
1253 *crash_size = 0;
1254 *crash_base = 0;
1255
1256 ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
1257
1258 if (!ck_cmdline)
1259 return -EINVAL;
1260
1261 ck_cmdline += strlen(name);
1262
1263 if (suffix)
1264 return parse_crashkernel_suffix(ck_cmdline, crash_size,
1265 suffix);
1266 /*
1267 * if the commandline contains a ':', then that's the extended
1268 * syntax -- if not, it must be the classic syntax
1269 */
1270 first_colon = strchr(ck_cmdline, ':');
1271 first_space = strchr(ck_cmdline, ' ');
1272 if (first_colon && (!first_space || first_colon < first_space))
1273 return parse_crashkernel_mem(ck_cmdline, system_ram,
1274 crash_size, crash_base);
1275
1276 return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
1277}
1278
1279/*
1280 * This function is the entry point for command line parsing and should be
1281 * called from the arch-specific code.
1282 */
1283int __init parse_crashkernel(char *cmdline,
1284 unsigned long long system_ram,
1285 unsigned long long *crash_size,
1286 unsigned long long *crash_base)
1287{
1288 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1289 "crashkernel=", NULL);
1290}
1291
1292int __init parse_crashkernel_high(char *cmdline,
1293 unsigned long long system_ram,
1294 unsigned long long *crash_size,
1295 unsigned long long *crash_base)
1296{
1297 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1298 "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
1299}
1300
1301int __init parse_crashkernel_low(char *cmdline,
1302 unsigned long long system_ram,
1303 unsigned long long *crash_size,
1304 unsigned long long *crash_base)
1305{
1306 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1307 "crashkernel=", suffix_tbl[SUFFIX_LOW]);
1308}
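
For orientation, a hedged sketch of how architecture setup code typically consumes these parsers and then reserves the region; reserve_crashkernel() and the 4 GiB search window are illustrative, not taken from any particular architecture.

static void __init reserve_crashkernel(void)
{
        unsigned long long crash_size, crash_base;
        int ret;

        ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
                                &crash_size, &crash_base);
        if (ret || !crash_size)
                return;

        if (!crash_base)        /* no @offset given, pick a spot ourselves */
                crash_base = memblock_find_in_range(0, SZ_4G, crash_size,
                                                    KEXEC_CRASH_MEM_ALIGN);
        if (!crash_base || memblock_reserve(crash_base, crash_size))
                return;

        crashk_res.start = crash_base;
        crashk_res.end   = crash_base + crash_size - 1;
}
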
1309
1310static void update_vmcoreinfo_note(void)
1311{
1312 u32 *buf = vmcoreinfo_note;
1313
1314 if (!vmcoreinfo_size)
1315 return;
1316 buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
1317 vmcoreinfo_size);
1318 final_note(buf);
1319}
1320
1321void crash_save_vmcoreinfo(void)
1322{
1323 vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
1324 update_vmcoreinfo_note();
1325}
1326
1327void vmcoreinfo_append_str(const char *fmt, ...)
1328{
1329 va_list args;
1330 char buf[0x50];
1331 size_t r;
1332
1333 va_start(args, fmt);
1334 r = vscnprintf(buf, sizeof(buf), fmt, args);
1335 va_end(args);
1336
1337 r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
1338
1339 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
1340
1341 vmcoreinfo_size += r;
1342}
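
For reference, the text accumulated here ends up as a flat list of key=value lines inside the vmcoreinfo ELF note, which tools such as makedumpfile and crash parse. The excerpt below is illustrative only; the addresses and numeric values are made up.

        OSRELEASE=4.3.0-rc1
        PAGESIZE=4096
        SYMBOL(init_uts_ns)=ffffffff81a42560
        SIZE(page)=64
        OFFSET(page.flags)=0
        LENGTH(zone.free_area)=11
        NUMBER(PG_lru)=5
        CRASHTIME=1444650000
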
1343
1344/*
1345 * provide an empty default implementation here -- architecture
1346 * code may override this
1347 */
1348void __weak arch_crash_save_vmcoreinfo(void)
1349{}
1350
1351unsigned long __weak paddr_vmcoreinfo_note(void)
1352{
1353 return __pa((unsigned long)(char *)&vmcoreinfo_note);
1354}
1355
1356static int __init crash_save_vmcoreinfo_init(void)
1357{
1358 VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
1359 VMCOREINFO_PAGESIZE(PAGE_SIZE);
1360
1361 VMCOREINFO_SYMBOL(init_uts_ns);
1362 VMCOREINFO_SYMBOL(node_online_map);
1363#ifdef CONFIG_MMU
1364 VMCOREINFO_SYMBOL(swapper_pg_dir);
1365#endif
1366 VMCOREINFO_SYMBOL(_stext);
1367 VMCOREINFO_SYMBOL(vmap_area_list);
1368
1369#ifndef CONFIG_NEED_MULTIPLE_NODES
1370 VMCOREINFO_SYMBOL(mem_map);
1371 VMCOREINFO_SYMBOL(contig_page_data);
1372#endif
1373#ifdef CONFIG_SPARSEMEM
1374 VMCOREINFO_SYMBOL(mem_section);
1375 VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
1376 VMCOREINFO_STRUCT_SIZE(mem_section);
1377 VMCOREINFO_OFFSET(mem_section, section_mem_map);
1378#endif
1379 VMCOREINFO_STRUCT_SIZE(page);
1380 VMCOREINFO_STRUCT_SIZE(pglist_data);
1381 VMCOREINFO_STRUCT_SIZE(zone);
1382 VMCOREINFO_STRUCT_SIZE(free_area);
1383 VMCOREINFO_STRUCT_SIZE(list_head);
1384 VMCOREINFO_SIZE(nodemask_t);
1385 VMCOREINFO_OFFSET(page, flags);
1386 VMCOREINFO_OFFSET(page, _count);
1387 VMCOREINFO_OFFSET(page, mapping);
1388 VMCOREINFO_OFFSET(page, lru);
1389 VMCOREINFO_OFFSET(page, _mapcount);
1390 VMCOREINFO_OFFSET(page, private);
1391 VMCOREINFO_OFFSET(pglist_data, node_zones);
1392 VMCOREINFO_OFFSET(pglist_data, nr_zones);
1393#ifdef CONFIG_FLAT_NODE_MEM_MAP
1394 VMCOREINFO_OFFSET(pglist_data, node_mem_map);
1395#endif
1396 VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
1397 VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
1398 VMCOREINFO_OFFSET(pglist_data, node_id);
1399 VMCOREINFO_OFFSET(zone, free_area);
1400 VMCOREINFO_OFFSET(zone, vm_stat);
1401 VMCOREINFO_OFFSET(zone, spanned_pages);
1402 VMCOREINFO_OFFSET(free_area, free_list);
1403 VMCOREINFO_OFFSET(list_head, next);
1404 VMCOREINFO_OFFSET(list_head, prev);
1405 VMCOREINFO_OFFSET(vmap_area, va_start);
1406 VMCOREINFO_OFFSET(vmap_area, list);
1407 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1408 log_buf_kexec_setup();
1409 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
1410 VMCOREINFO_NUMBER(NR_FREE_PAGES);
1411 VMCOREINFO_NUMBER(PG_lru);
1412 VMCOREINFO_NUMBER(PG_private);
1413 VMCOREINFO_NUMBER(PG_swapcache);
1414 VMCOREINFO_NUMBER(PG_slab);
1415#ifdef CONFIG_MEMORY_FAILURE
1416 VMCOREINFO_NUMBER(PG_hwpoison);
1417#endif
1418 VMCOREINFO_NUMBER(PG_head_mask);
1419 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
1420#ifdef CONFIG_X86
1421 VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
1422#endif
1423#ifdef CONFIG_HUGETLBFS
1424 VMCOREINFO_SYMBOL(free_huge_page);
1425#endif
1426
1427 arch_crash_save_vmcoreinfo();
1428 update_vmcoreinfo_note();
1429
1430 return 0;
1431}
1432
1433subsys_initcall(crash_save_vmcoreinfo_init);
1434
1435/*
1436 * Move into place and start executing a preloaded standalone
1437 * executable. If nothing was preloaded return an error.
1438 */
1439int kernel_kexec(void)
1440{
1441 int error = 0;
1442
1443 if (!mutex_trylock(&kexec_mutex))
1444 return -EBUSY;
1445 if (!kexec_image) {
1446 error = -EINVAL;
1447 goto Unlock;
1448 }
1449
1450#ifdef CONFIG_KEXEC_JUMP
1451 if (kexec_image->preserve_context) {
1452 lock_system_sleep();
1453 pm_prepare_console();
1454 error = freeze_processes();
1455 if (error) {
1456 error = -EBUSY;
1457 goto Restore_console;
1458 }
1459 suspend_console();
1460 error = dpm_suspend_start(PMSG_FREEZE);
1461 if (error)
1462 goto Resume_console;
1463 /* At this point, dpm_suspend_start() has been called,
1464 * but *not* dpm_suspend_end(). We *must* call
1465 * dpm_suspend_end() now. Otherwise, drivers for
1466 * some devices (e.g. interrupt controllers) become
1467 * desynchronized with the actual state of the
1468 * hardware at resume time, and evil weirdness ensues.
1469 */
1470 error = dpm_suspend_end(PMSG_FREEZE);
1471 if (error)
1472 goto Resume_devices;
1473 error = disable_nonboot_cpus();
1474 if (error)
1475 goto Enable_cpus;
1476 local_irq_disable();
1477 error = syscore_suspend();
1478 if (error)
1479 goto Enable_irqs;
1480 } else
1481#endif
1482 {
1483 kexec_in_progress = true;
1484 kernel_restart_prepare(NULL);
1485 migrate_to_reboot_cpu();
1486
1487 /*
1488 * migrate_to_reboot_cpu() disables CPU hotplug assuming that
1489 * no further code needs to use CPU hotplug (which is true in
1490 * the reboot case). However, the kexec path depends on using
1491 * CPU hotplug again; so re-enable it here.
1492 */
1493 cpu_hotplug_enable();
1494 pr_emerg("Starting new kernel\n");
1495 machine_shutdown();
1496 }
1497
1498 machine_kexec(kexec_image);
1499
1500#ifdef CONFIG_KEXEC_JUMP
1501 if (kexec_image->preserve_context) {
1502 syscore_resume();
1503 Enable_irqs:
1504 local_irq_enable();
1505 Enable_cpus:
1506 enable_nonboot_cpus();
1507 dpm_resume_start(PMSG_RESTORE);
1508 Resume_devices:
1509 dpm_resume_end(PMSG_RESTORE);
1510 Resume_console:
1511 resume_console();
1512 thaw_processes();
1513 Restore_console:
1514 pm_restore_console();
1515 unlock_system_sleep();
1516 }
1517#endif
1518
1519 Unlock:
1520 mutex_unlock(&kexec_mutex);
1521 return error;
1522}
1523
1524/*
1525 * Add and remove page tables for crashkernel memory
1526 *
1527 * Provide an empty default implementation here -- architecture
1528 * code may override this
1529 */
1530void __weak crash_map_reserved_pages(void)
1531{}
1532
1533void __weak crash_unmap_reserved_pages(void)
1534{}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
new file mode 100644
index 000000000000..6a9a3f2a0e8e
--- /dev/null
+++ b/kernel/kexec_file.c
@@ -0,0 +1,1045 @@
1/*
2 * kexec: kexec_file_load system call
3 *
4 * Copyright (C) 2014 Red Hat Inc.
5 * Authors:
6 * Vivek Goyal <vgoyal@redhat.com>
7 *
8 * This source code is licensed under the GNU General Public License,
9 * Version 2. See the file COPYING for more details.
10 */
11
12#include <linux/capability.h>
13#include <linux/mm.h>
14#include <linux/file.h>
15#include <linux/slab.h>
16#include <linux/kexec.h>
17#include <linux/mutex.h>
18#include <linux/list.h>
19#include <crypto/hash.h>
20#include <crypto/sha.h>
21#include <linux/syscalls.h>
22#include <linux/vmalloc.h>
23#include "kexec_internal.h"
24
25/*
26 * Declare these symbols weak so that if the architecture provides a purgatory,
27 * these will be overridden.
28 */
29char __weak kexec_purgatory[0];
30size_t __weak kexec_purgatory_size = 0;
31
32static int kexec_calculate_store_digests(struct kimage *image);
33
34static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
35{
36 struct fd f = fdget(fd);
37 int ret;
38 struct kstat stat;
39 loff_t pos;
40 ssize_t bytes = 0;
41
42 if (!f.file)
43 return -EBADF;
44
45 ret = vfs_getattr(&f.file->f_path, &stat);
46 if (ret)
47 goto out;
48
49 if (stat.size > INT_MAX) {
50 ret = -EFBIG;
51 goto out;
52 }
53
54 /* Don't hand 0 to vmalloc, it whines. */
55 if (stat.size == 0) {
56 ret = -EINVAL;
57 goto out;
58 }
59
60 *buf = vmalloc(stat.size);
61 if (!*buf) {
62 ret = -ENOMEM;
63 goto out;
64 }
65
66 pos = 0;
67 while (pos < stat.size) {
68 bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
69 stat.size - pos);
70 if (bytes < 0) {
71 vfree(*buf);
72 ret = bytes;
73 goto out;
74 }
75
76 if (bytes == 0)
77 break;
78 pos += bytes;
79 }
80
81 if (pos != stat.size) {
82 ret = -EBADF;
83 vfree(*buf);
84 goto out;
85 }
86
87 *buf_len = pos;
88out:
89 fdput(f);
90 return ret;
91}
92
93/* Architectures can provide this probe function */
94int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
95 unsigned long buf_len)
96{
97 return -ENOEXEC;
98}
99
100void * __weak arch_kexec_kernel_image_load(struct kimage *image)
101{
102 return ERR_PTR(-ENOEXEC);
103}
104
105int __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
106{
107 return -EINVAL;
108}
109
110int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
111 unsigned long buf_len)
112{
113 return -EKEYREJECTED;
114}
115
116/* Apply relocations of type RELA */
117int __weak
118arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
119 unsigned int relsec)
120{
121 pr_err("RELA relocation unsupported.\n");
122 return -ENOEXEC;
123}
124
125/* Apply relocations of type REL */
126int __weak
127arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
128 unsigned int relsec)
129{
130 pr_err("REL relocation unsupported.\n");
131 return -ENOEXEC;
132}
133
134/*
135 * Free up memory used by the kernel, initrd, and command line. These are
136 * temporary allocations which are not needed any more after these buffers have
137 * been loaded into separate segments and have been copied elsewhere.
138 */
139void kimage_file_post_load_cleanup(struct kimage *image)
140{
141 struct purgatory_info *pi = &image->purgatory_info;
142
143 vfree(image->kernel_buf);
144 image->kernel_buf = NULL;
145
146 vfree(image->initrd_buf);
147 image->initrd_buf = NULL;
148
149 kfree(image->cmdline_buf);
150 image->cmdline_buf = NULL;
151
152 vfree(pi->purgatory_buf);
153 pi->purgatory_buf = NULL;
154
155 vfree(pi->sechdrs);
156 pi->sechdrs = NULL;
157
158 /* See if architecture has anything to cleanup post load */
159 arch_kimage_file_post_load_cleanup(image);
160
161 /*
162 * The above call should have called into the bootloader to free up
163 * any data stored in kimage->image_loader_data. It should
164 * be ok now to free it up.
165 */
166 kfree(image->image_loader_data);
167 image->image_loader_data = NULL;
168}
169
170/*
171 * In file mode the list of segments is prepared by the kernel. Copy the
172 * relevant data from user space, do error checking, and prepare the segment list.
173 */
174static int
175kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
176 const char __user *cmdline_ptr,
177 unsigned long cmdline_len, unsigned flags)
178{
179 int ret = 0;
180 void *ldata;
181
182 ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
183 &image->kernel_buf_len);
184 if (ret)
185 return ret;
186
187 /* Call arch image probe handlers */
188 ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
189 image->kernel_buf_len);
190
191 if (ret)
192 goto out;
193
194#ifdef CONFIG_KEXEC_VERIFY_SIG
195 ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
196 image->kernel_buf_len);
197 if (ret) {
198 pr_debug("kernel signature verification failed.\n");
199 goto out;
200 }
201 pr_debug("kernel signature verification successful.\n");
202#endif
203 /* It is possible that no initramfs is being loaded */
204 if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
205 ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
206 &image->initrd_buf_len);
207 if (ret)
208 goto out;
209 }
210
211 if (cmdline_len) {
212 image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
213 if (!image->cmdline_buf) {
214 ret = -ENOMEM;
215 goto out;
216 }
217
218 ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
219 cmdline_len);
220 if (ret) {
221 ret = -EFAULT;
222 goto out;
223 }
224
225 image->cmdline_buf_len = cmdline_len;
226
227 /* command line should be a string with last byte null */
228 if (image->cmdline_buf[cmdline_len - 1] != '\0') {
229 ret = -EINVAL;
230 goto out;
231 }
232 }
233
234 /* Call arch image load handlers */
235 ldata = arch_kexec_kernel_image_load(image);
236
237 if (IS_ERR(ldata)) {
238 ret = PTR_ERR(ldata);
239 goto out;
240 }
241
242 image->image_loader_data = ldata;
243out:
244 /* In case of error, free up all allocated memory in this function */
245 if (ret)
246 kimage_file_post_load_cleanup(image);
247 return ret;
248}
249
250static int
251kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
252 int initrd_fd, const char __user *cmdline_ptr,
253 unsigned long cmdline_len, unsigned long flags)
254{
255 int ret;
256 struct kimage *image;
257 bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
258
259 image = do_kimage_alloc_init();
260 if (!image)
261 return -ENOMEM;
262
263 image->file_mode = 1;
264
265 if (kexec_on_panic) {
266 /* Enable special crash kernel control page alloc policy. */
267 image->control_page = crashk_res.start;
268 image->type = KEXEC_TYPE_CRASH;
269 }
270
271 ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
272 cmdline_ptr, cmdline_len, flags);
273 if (ret)
274 goto out_free_image;
275
276 ret = sanity_check_segment_list(image);
277 if (ret)
278 goto out_free_post_load_bufs;
279
280 ret = -ENOMEM;
281 image->control_code_page = kimage_alloc_control_pages(image,
282 get_order(KEXEC_CONTROL_PAGE_SIZE));
283 if (!image->control_code_page) {
284 pr_err("Could not allocate control_code_buffer\n");
285 goto out_free_post_load_bufs;
286 }
287
288 if (!kexec_on_panic) {
289 image->swap_page = kimage_alloc_control_pages(image, 0);
290 if (!image->swap_page) {
291 pr_err("Could not allocate swap buffer\n");
292 goto out_free_control_pages;
293 }
294 }
295
296 *rimage = image;
297 return 0;
298out_free_control_pages:
299 kimage_free_page_list(&image->control_pages);
300out_free_post_load_bufs:
301 kimage_file_post_load_cleanup(image);
302out_free_image:
303 kfree(image);
304 return ret;
305}
306
307SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
308 unsigned long, cmdline_len, const char __user *, cmdline_ptr,
309 unsigned long, flags)
310{
311 int ret = 0, i;
312 struct kimage **dest_image, *image;
313
314 /* We only trust the superuser with rebooting the system. */
315 if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
316 return -EPERM;
317
318 /* Make sure we have a legal set of flags */
319 if (flags != (flags & KEXEC_FILE_FLAGS))
320 return -EINVAL;
321
322 image = NULL;
323
324 if (!mutex_trylock(&kexec_mutex))
325 return -EBUSY;
326
327 dest_image = &kexec_image;
328 if (flags & KEXEC_FILE_ON_CRASH)
329 dest_image = &kexec_crash_image;
330
331 if (flags & KEXEC_FILE_UNLOAD)
332 goto exchange;
333
334 /*
335 * In case of crash, the new kernel gets loaded in the reserved region. It is
336 * the same memory where the old crash kernel might be loaded. Free any
337 * current crash dump kernel before we corrupt it.
338 */
339 if (flags & KEXEC_FILE_ON_CRASH)
340 kimage_free(xchg(&kexec_crash_image, NULL));
341
342 ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
343 cmdline_len, flags);
344 if (ret)
345 goto out;
346
347 ret = machine_kexec_prepare(image);
348 if (ret)
349 goto out;
350
351 ret = kexec_calculate_store_digests(image);
352 if (ret)
353 goto out;
354
355 for (i = 0; i < image->nr_segments; i++) {
356 struct kexec_segment *ksegment;
357
358 ksegment = &image->segment[i];
359 pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
360 i, ksegment->buf, ksegment->bufsz, ksegment->mem,
361 ksegment->memsz);
362
363 ret = kimage_load_segment(image, &image->segment[i]);
364 if (ret)
365 goto out;
366 }
367
368 kimage_terminate(image);
369
370 /*
371 * Free up any temporary buffers allocated which are not needed
372 * after image has been loaded
373 */
374 kimage_file_post_load_cleanup(image);
375exchange:
376 image = xchg(dest_image, image);
377out:
378 mutex_unlock(&kexec_mutex);
379 kimage_free(image);
380 return ret;
381}
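
Since this introduces a new system call, a minimal userspace sketch of invoking it directly may help. The file paths are placeholders, SYS_kexec_file_load needs libc headers recent enough to define it (the raw number is 320 on x86_64), and the caller needs CAP_SYS_BOOT.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        const char *cmdline = "root=/dev/sda1 console=ttyS0";
        int kernel_fd = open("/boot/vmlinuz", O_RDONLY);
        int initrd_fd = open("/boot/initrd.img", O_RDONLY);

        if (kernel_fd < 0 || initrd_fd < 0) {
                perror("open");
                return 1;
        }

        /* cmdline_len must include the trailing '\0'; the kernel checks it. */
        if (syscall(SYS_kexec_file_load, kernel_fd, initrd_fd,
                    strlen(cmdline) + 1, cmdline, 0UL)) {
                perror("kexec_file_load");
                return 1;
        }

        /* The image is now staged; a later "kexec -e" style reboot starts it. */
        return 0;
}
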
382
383static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
384 struct kexec_buf *kbuf)
385{
386 struct kimage *image = kbuf->image;
387 unsigned long temp_start, temp_end;
388
389 temp_end = min(end, kbuf->buf_max);
390 temp_start = temp_end - kbuf->memsz;
391
392 do {
393 /* align down start */
394 temp_start = temp_start & (~(kbuf->buf_align - 1));
395
396 if (temp_start < start || temp_start < kbuf->buf_min)
397 return 0;
398
399 temp_end = temp_start + kbuf->memsz - 1;
400
401 /*
402 * Make sure this does not conflict with any of existing
403 * segments
404 */
405 if (kimage_is_destination_range(image, temp_start, temp_end)) {
406 temp_start = temp_start - PAGE_SIZE;
407 continue;
408 }
409
410 /* We found a suitable memory range */
411 break;
412 } while (1);
413
414 /* If we are here, we found a suitable memory range */
415 kbuf->mem = temp_start;
416
417 /* Success, stop navigating through remaining System RAM ranges */
418 return 1;
419}
420
421static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
422 struct kexec_buf *kbuf)
423{
424 struct kimage *image = kbuf->image;
425 unsigned long temp_start, temp_end;
426
427 temp_start = max(start, kbuf->buf_min);
428
429 do {
430 temp_start = ALIGN(temp_start, kbuf->buf_align);
431 temp_end = temp_start + kbuf->memsz - 1;
432
433 if (temp_end > end || temp_end > kbuf->buf_max)
434 return 0;
435 /*
436 * Make sure this does not conflict with any of existing
437 * segments
438 */
439 if (kimage_is_destination_range(image, temp_start, temp_end)) {
440 temp_start = temp_start + PAGE_SIZE;
441 continue;
442 }
443
444 /* We found a suitable memory range */
445 break;
446 } while (1);
447
448 /* If we are here, we found a suitable memory range */
449 kbuf->mem = temp_start;
450
451 /* Success, stop navigating through remaining System RAM ranges */
452 return 1;
453}
454
455static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
456{
457 struct kexec_buf *kbuf = (struct kexec_buf *)arg;
458 unsigned long sz = end - start + 1;
459
460 /* Returning 0 will take us to the next memory range */
461 if (sz < kbuf->memsz)
462 return 0;
463
464 if (end < kbuf->buf_min || start > kbuf->buf_max)
465 return 0;
466
467 /*
468 * Allocate memory top down within the RAM range; otherwise allocate
469 * bottom up.
470 */
471 if (kbuf->top_down)
472 return locate_mem_hole_top_down(start, end, kbuf);
473 return locate_mem_hole_bottom_up(start, end, kbuf);
474}
475
476/*
477 * Helper function for placing a buffer in a kexec segment. This assumes
478 * that kexec_mutex is held.
479 */
480int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
481 unsigned long memsz, unsigned long buf_align,
482 unsigned long buf_min, unsigned long buf_max,
483 bool top_down, unsigned long *load_addr)
484{
485
486 struct kexec_segment *ksegment;
487 struct kexec_buf buf, *kbuf;
488 int ret;
489
490 /* Currently adding segment this way is allowed only in file mode */
491 if (!image->file_mode)
492 return -EINVAL;
493
494 if (image->nr_segments >= KEXEC_SEGMENT_MAX)
495 return -EINVAL;
496
497 /*
498 * Make sure we are not trying to add a buffer after allocating
499 * control pages. All segments need to be placed before any
500 * control pages are allocated, as the control page allocation
501 * logic goes through the list of segments to make sure there are
502 * no destination overlaps.
503 */
504 if (!list_empty(&image->control_pages)) {
505 WARN_ON(1);
506 return -EINVAL;
507 }
508
509 memset(&buf, 0, sizeof(struct kexec_buf));
510 kbuf = &buf;
511 kbuf->image = image;
512 kbuf->buffer = buffer;
513 kbuf->bufsz = bufsz;
514
515 kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
516 kbuf->buf_align = max(buf_align, PAGE_SIZE);
517 kbuf->buf_min = buf_min;
518 kbuf->buf_max = buf_max;
519 kbuf->top_down = top_down;
520
521 /* Walk the RAM ranges and allocate a suitable range for the buffer */
522 if (image->type == KEXEC_TYPE_CRASH)
523 ret = walk_iomem_res("Crash kernel",
524 IORESOURCE_MEM | IORESOURCE_BUSY,
525 crashk_res.start, crashk_res.end, kbuf,
526 locate_mem_hole_callback);
527 else
528 ret = walk_system_ram_res(0, -1, kbuf,
529 locate_mem_hole_callback);
530 if (ret != 1) {
531 /* A suitable memory range could not be found for buffer */
532 return -EADDRNOTAVAIL;
533 }
534
535 /* Found a suitable memory range */
536 ksegment = &image->segment[image->nr_segments];
537 ksegment->kbuf = kbuf->buffer;
538 ksegment->bufsz = kbuf->bufsz;
539 ksegment->mem = kbuf->mem;
540 ksegment->memsz = kbuf->memsz;
541 image->nr_segments++;
542 *load_addr = ksegment->mem;
543 return 0;
544}
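
An illustrative fragment (not lifted from any real loader) of how an arch_kexec_kernel_image_load() implementation might place the initrd with this helper; min_addr and max_addr are assumed locals of the loader.

        unsigned long initrd_load_addr;
        int ret;

        ret = kexec_add_buffer(image, image->initrd_buf, image->initrd_buf_len,
                               image->initrd_buf_len, PAGE_SIZE, min_addr,
                               max_addr, /* top_down */ false,
                               &initrd_load_addr);
        if (ret)
                return ERR_PTR(ret);
        pr_debug("initrd placed at 0x%lx\n", initrd_load_addr);
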
545
546/* Calculate and store the digest of segments */
547static int kexec_calculate_store_digests(struct kimage *image)
548{
549 struct crypto_shash *tfm;
550 struct shash_desc *desc;
551 int ret = 0, i, j, zero_buf_sz, sha_region_sz;
552 size_t desc_size, nullsz;
553 char *digest;
554 void *zero_buf;
555 struct kexec_sha_region *sha_regions;
556 struct purgatory_info *pi = &image->purgatory_info;
557
558 zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
559 zero_buf_sz = PAGE_SIZE;
560
561 tfm = crypto_alloc_shash("sha256", 0, 0);
562 if (IS_ERR(tfm)) {
563 ret = PTR_ERR(tfm);
564 goto out;
565 }
566
567 desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
568 desc = kzalloc(desc_size, GFP_KERNEL);
569 if (!desc) {
570 ret = -ENOMEM;
571 goto out_free_tfm;
572 }
573
574 sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
575 sha_regions = vzalloc(sha_region_sz);
576 if (!sha_regions)
577 goto out_free_desc;
578
579 desc->tfm = tfm;
580 desc->flags = 0;
581
582 ret = crypto_shash_init(desc);
583 if (ret < 0)
584 goto out_free_sha_regions;
585
586 digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
587 if (!digest) {
588 ret = -ENOMEM;
589 goto out_free_sha_regions;
590 }
591
592 for (j = i = 0; i < image->nr_segments; i++) {
593 struct kexec_segment *ksegment;
594
595 ksegment = &image->segment[i];
596 /*
597 * Skip purgatory as it will be modified once we put digest
598 * info in purgatory.
599 */
600 if (ksegment->kbuf == pi->purgatory_buf)
601 continue;
602
603 ret = crypto_shash_update(desc, ksegment->kbuf,
604 ksegment->bufsz);
605 if (ret)
606 break;
607
608 /*
609 * Assume rest of the buffer is filled with zero and
610 * update digest accordingly.
611 */
612 nullsz = ksegment->memsz - ksegment->bufsz;
613 while (nullsz) {
614 unsigned long bytes = nullsz;
615
616 if (bytes > zero_buf_sz)
617 bytes = zero_buf_sz;
618 ret = crypto_shash_update(desc, zero_buf, bytes);
619 if (ret)
620 break;
621 nullsz -= bytes;
622 }
623
624 if (ret)
625 break;
626
627 sha_regions[j].start = ksegment->mem;
628 sha_regions[j].len = ksegment->memsz;
629 j++;
630 }
631
632 if (!ret) {
633 ret = crypto_shash_final(desc, digest);
634 if (ret)
635 goto out_free_digest;
636 ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
637 sha_regions, sha_region_sz, 0);
638 if (ret)
639 goto out_free_digest;
640
641 ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
642 digest, SHA256_DIGEST_SIZE, 0);
643 if (ret)
644 goto out_free_digest;
645 }
646
647out_free_digest:
648 kfree(digest);
649out_free_sha_regions:
650 vfree(sha_regions);
651out_free_desc:
652 kfree(desc);
653out_free_tfm:
654 kfree(tfm);
655out:
656 return ret;
657}
658
659/* Actually load purgatory. A lot of this code is taken from kexec-tools */
660static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
661 unsigned long max, int top_down)
662{
663 struct purgatory_info *pi = &image->purgatory_info;
664 unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
665 unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
666 unsigned char *buf_addr, *src;
667 int i, ret = 0, entry_sidx = -1;
668 const Elf_Shdr *sechdrs_c;
669 Elf_Shdr *sechdrs = NULL;
670 void *purgatory_buf = NULL;
671
672 /*
673 * sechdrs_c points to the section headers in purgatory, which are
674 * read only. No modifications allowed.
675 */
676 sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
677
678 /*
679 * We cannot modify sechdrs_c[] and its fields. It is read only.
680 * Copy it over to a local copy where one can store some temporary
681 * data and free it at the end. We need to modify ->sh_addr and
682 * ->sh_offset fields to keep track of permanent and temporary
683 * locations of sections.
684 */
685 sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
686 if (!sechdrs)
687 return -ENOMEM;
688
689 memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
690
691 /*
692 * We seem to have multiple copies of sections. The first copy is the one
693 * embedded in the kernel in a read only section. Some of these sections
694 * will be copied to a temporary buffer and relocated. And these
695 * sections will finally be copied to their final destination at
696 * segment load time.
697 *
698 * Use ->sh_offset to reflect section address in memory. It will
699 * point to original read only copy if section is not allocatable.
700 * Otherwise it will point to temporary copy which will be relocated.
701 *
702 * Use ->sh_addr to contain final address of the section where it
703 * will go during execution time.
704 */
705 for (i = 0; i < pi->ehdr->e_shnum; i++) {
706 if (sechdrs[i].sh_type == SHT_NOBITS)
707 continue;
708
709 sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
710 sechdrs[i].sh_offset;
711 }
712
713 /*
714 * Identify entry point section and make entry relative to section
715 * start.
716 */
717 entry = pi->ehdr->e_entry;
718 for (i = 0; i < pi->ehdr->e_shnum; i++) {
719 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
720 continue;
721
722 if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
723 continue;
724
725 /* Make entry section relative */
726 if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
727 ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
728 pi->ehdr->e_entry)) {
729 entry_sidx = i;
730 entry -= sechdrs[i].sh_addr;
731 break;
732 }
733 }
734
735 /* Determine how much memory is needed to load relocatable object. */
736 buf_align = 1;
737 bss_align = 1;
738 buf_sz = 0;
739 bss_sz = 0;
740
741 for (i = 0; i < pi->ehdr->e_shnum; i++) {
742 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
743 continue;
744
745 align = sechdrs[i].sh_addralign;
746 if (sechdrs[i].sh_type != SHT_NOBITS) {
747 if (buf_align < align)
748 buf_align = align;
749 buf_sz = ALIGN(buf_sz, align);
750 buf_sz += sechdrs[i].sh_size;
751 } else {
752 /* bss section */
753 if (bss_align < align)
754 bss_align = align;
755 bss_sz = ALIGN(bss_sz, align);
756 bss_sz += sechdrs[i].sh_size;
757 }
758 }
759
760 /* Determine the bss padding required to align bss properly */
761 bss_pad = 0;
762 if (buf_sz & (bss_align - 1))
763 bss_pad = bss_align - (buf_sz & (bss_align - 1));
764
765 memsz = buf_sz + bss_pad + bss_sz;
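        /*
         * Worked example with illustrative numbers: if the progbits sections
         * add up to buf_sz = 0x2f40 and the largest bss alignment is
         * bss_align = 0x1000, then bss_pad = 0x1000 - 0xf40 = 0xc0, so the
         * bss starts page aligned at load_addr + 0x3000 and
         * memsz = 0x3000 + bss_sz.
         */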
766
767 /* Allocate buffer for purgatory */
768 purgatory_buf = vzalloc(buf_sz);
769 if (!purgatory_buf) {
770 ret = -ENOMEM;
771 goto out;
772 }
773
774 if (buf_align < bss_align)
775 buf_align = bss_align;
776
777 /* Add buffer to segment list */
778 ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
779 buf_align, min, max, top_down,
780 &pi->purgatory_load_addr);
781 if (ret)
782 goto out;
783
784 /* Load SHF_ALLOC sections */
785 buf_addr = purgatory_buf;
786 load_addr = curr_load_addr = pi->purgatory_load_addr;
787 bss_addr = load_addr + buf_sz + bss_pad;
788
789 for (i = 0; i < pi->ehdr->e_shnum; i++) {
790 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
791 continue;
792
793 align = sechdrs[i].sh_addralign;
794 if (sechdrs[i].sh_type != SHT_NOBITS) {
795 curr_load_addr = ALIGN(curr_load_addr, align);
796 offset = curr_load_addr - load_addr;
797 /* We already modified ->sh_offset to keep the src addr */
798 src = (char *) sechdrs[i].sh_offset;
799 memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
800
801 /* Store load address and source address of section */
802 sechdrs[i].sh_addr = curr_load_addr;
803
804 /*
805 * This section got copied to temporary buffer. Update
806 * ->sh_offset accordingly.
807 */
808 sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
809
810 /* Advance to the next address */
811 curr_load_addr += sechdrs[i].sh_size;
812 } else {
813 bss_addr = ALIGN(bss_addr, align);
814 sechdrs[i].sh_addr = bss_addr;
815 bss_addr += sechdrs[i].sh_size;
816 }
817 }
818
819 /* Update entry point based on load address of text section */
820 if (entry_sidx >= 0)
821 entry += sechdrs[entry_sidx].sh_addr;
822
823 /* Make kernel jump to purgatory after shutdown */
824 image->start = entry;
825
826 /* Used later to get/set symbol values */
827 pi->sechdrs = sechdrs;
828
829 /*
830 * Used later to identify which section is purgatory and skip it
831 * from checksumming.
832 */
833 pi->purgatory_buf = purgatory_buf;
834 return ret;
835out:
836 vfree(sechdrs);
837 vfree(purgatory_buf);
838 return ret;
839}
840
841static int kexec_apply_relocations(struct kimage *image)
842{
843 int i, ret;
844 struct purgatory_info *pi = &image->purgatory_info;
845 Elf_Shdr *sechdrs = pi->sechdrs;
846
847 /* Apply relocations */
848 for (i = 0; i < pi->ehdr->e_shnum; i++) {
849 Elf_Shdr *section, *symtab;
850
851 if (sechdrs[i].sh_type != SHT_RELA &&
852 sechdrs[i].sh_type != SHT_REL)
853 continue;
854
855 /*
856 * For a section of type SHT_RELA/SHT_REL,
857 * ->sh_link contains the section header index of the associated
858 * symbol table, and ->sh_info contains the section header
859 * index of the section to which the relocations apply.
860 */
861 if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
862 sechdrs[i].sh_link >= pi->ehdr->e_shnum)
863 return -ENOEXEC;
864
865 section = &sechdrs[sechdrs[i].sh_info];
866 symtab = &sechdrs[sechdrs[i].sh_link];
867
868 if (!(section->sh_flags & SHF_ALLOC))
869 continue;
870
871 /*
872 * symtab->sh_link contains the section header index of the associated
873 * string table.
874 */
875 if (symtab->sh_link >= pi->ehdr->e_shnum)
876 /* Invalid section number? */
877 continue;
878
879 /*
880 * The respective architecture needs to provide support for applying
881 * relocations of type SHT_RELA/SHT_REL.
882 */
883 if (sechdrs[i].sh_type == SHT_RELA)
884 ret = arch_kexec_apply_relocations_add(pi->ehdr,
885 sechdrs, i);
886 else if (sechdrs[i].sh_type == SHT_REL)
887 ret = arch_kexec_apply_relocations(pi->ehdr,
888 sechdrs, i);
889 if (ret)
890 return ret;
891 }
892
893 return 0;
894}
895
896/* Load relocatable purgatory object and relocate it appropriately */
897int kexec_load_purgatory(struct kimage *image, unsigned long min,
898 unsigned long max, int top_down,
899 unsigned long *load_addr)
900{
901 struct purgatory_info *pi = &image->purgatory_info;
902 int ret;
903
904 if (kexec_purgatory_size <= 0)
905 return -EINVAL;
906
907 if (kexec_purgatory_size < sizeof(Elf_Ehdr))
908 return -ENOEXEC;
909
910 pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
911
912 if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
913 || pi->ehdr->e_type != ET_REL
914 || !elf_check_arch(pi->ehdr)
915 || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
916 return -ENOEXEC;
917
918 if (pi->ehdr->e_shoff >= kexec_purgatory_size
919 || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
920 kexec_purgatory_size - pi->ehdr->e_shoff))
921 return -ENOEXEC;
922
923 ret = __kexec_load_purgatory(image, min, max, top_down);
924 if (ret)
925 return ret;
926
927 ret = kexec_apply_relocations(image);
928 if (ret)
929 goto out;
930
931 *load_addr = pi->purgatory_load_addr;
932 return 0;
933out:
934 vfree(pi->sechdrs);
935 vfree(pi->purgatory_buf);
936 return ret;
937}
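
A hedged sketch of the calling sequence an image loader is expected to follow with the purgatory helpers in this file; min_addr, max_addr and the "entry_point" symbol name are assumptions, not the actual x86 code.

        unsigned long purgatory_load_addr;
        unsigned long kernel_entry = 0x1000000;         /* example value */
        int ret;

        ret = kexec_load_purgatory(image, min_addr, max_addr,
                                   /* top_down */ 1, &purgatory_load_addr);
        if (ret)
                return ERR_PTR(ret);

        /* Patch a global inside purgatory so it knows where to jump next. */
        ret = kexec_purgatory_get_set_symbol(image, "entry_point", &kernel_entry,
                                             sizeof(kernel_entry), 0);
        if (ret)
                return ERR_PTR(ret);
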
938
939static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
940 const char *name)
941{
942 Elf_Sym *syms;
943 Elf_Shdr *sechdrs;
944 Elf_Ehdr *ehdr;
945 int i, k;
946 const char *strtab;
947
948 if (!pi->sechdrs || !pi->ehdr)
949 return NULL;
950
951 sechdrs = pi->sechdrs;
952 ehdr = pi->ehdr;
953
954 for (i = 0; i < ehdr->e_shnum; i++) {
955 if (sechdrs[i].sh_type != SHT_SYMTAB)
956 continue;
957
958 if (sechdrs[i].sh_link >= ehdr->e_shnum)
959 /* Invalid strtab section number */
960 continue;
961 strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
962 syms = (Elf_Sym *)sechdrs[i].sh_offset;
963
964 /* Go through symbols for a match */
965 for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
966 if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
967 continue;
968
969 if (strcmp(strtab + syms[k].st_name, name) != 0)
970 continue;
971
972 if (syms[k].st_shndx == SHN_UNDEF ||
973 syms[k].st_shndx >= ehdr->e_shnum) {
974 pr_debug("Symbol: %s has bad section index %d.\n",
975 name, syms[k].st_shndx);
976 return NULL;
977 }
978
979 /* Found the symbol we are looking for */
980 return &syms[k];
981 }
982 }
983
984 return NULL;
985}
986
987void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
988{
989 struct purgatory_info *pi = &image->purgatory_info;
990 Elf_Sym *sym;
991 Elf_Shdr *sechdr;
992
993 sym = kexec_purgatory_find_symbol(pi, name);
994 if (!sym)
995 return ERR_PTR(-EINVAL);
996
997 sechdr = &pi->sechdrs[sym->st_shndx];
998
999 /*
1000 * Returns the address where symbol will finally be loaded after
1001 * kexec_load_segment()
1002 */
1003 return (void *)(sechdr->sh_addr + sym->st_value);
1004}
1005
1006/*
1007 * Get or set the value of a symbol. If "get_value" is true, the symbol value
1008 * is returned in buf; otherwise the symbol value is set based on the value in buf.
1009 */
1010int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
1011 void *buf, unsigned int size, bool get_value)
1012{
1013 Elf_Sym *sym;
1014 Elf_Shdr *sechdrs;
1015 struct purgatory_info *pi = &image->purgatory_info;
1016 char *sym_buf;
1017
1018 sym = kexec_purgatory_find_symbol(pi, name);
1019 if (!sym)
1020 return -EINVAL;
1021
1022 if (sym->st_size != size) {
1023 pr_err("symbol %s size mismatch: expected %lu actual %u\n",
1024 name, (unsigned long)sym->st_size, size);
1025 return -EINVAL;
1026 }
1027
1028 sechdrs = pi->sechdrs;
1029
1030 if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
1031 pr_err("symbol %s is in a bss section. Cannot %s\n", name,
1032 get_value ? "get" : "set");
1033 return -EINVAL;
1034 }
1035
1036 sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
1037 sym->st_value;
1038
1039 if (get_value)
1040 memcpy((void *)buf, sym_buf, size);
1041 else
1042 memcpy((void *)sym_buf, buf, size);
1043
1044 return 0;
1045}
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
new file mode 100644
index 000000000000..e4392a698ad4
--- /dev/null
+++ b/kernel/kexec_internal.h
@@ -0,0 +1,22 @@
1#ifndef LINUX_KEXEC_INTERNAL_H
2#define LINUX_KEXEC_INTERNAL_H
3
4#include <linux/kexec.h>
5
6struct kimage *do_kimage_alloc_init(void);
7int sanity_check_segment_list(struct kimage *image);
8void kimage_free_page_list(struct list_head *list);
9void kimage_free(struct kimage *image);
10int kimage_load_segment(struct kimage *image, struct kexec_segment *segment);
11void kimage_terminate(struct kimage *image);
12int kimage_is_destination_range(struct kimage *image,
13 unsigned long start, unsigned long end);
14
15extern struct mutex kexec_mutex;
16
17#ifdef CONFIG_KEXEC_FILE
18void kimage_file_post_load_cleanup(struct kimage *image);
19#else /* CONFIG_KEXEC_FILE */
20static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
21#endif /* CONFIG_KEXEC_FILE */
22#endif /* LINUX_KEXEC_INTERNAL_H */
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 2777f40a9c7b..da98d0593de2 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -45,8 +45,6 @@
45 45
46extern int max_threads; 46extern int max_threads;
47 47
48static struct workqueue_struct *khelper_wq;
49
50#define CAP_BSET (void *)1 48#define CAP_BSET (void *)1
51#define CAP_PI (void *)2 49#define CAP_PI (void *)2
52 50
@@ -114,10 +112,11 @@ out:
114 * @...: arguments as specified in the format string 112 * @...: arguments as specified in the format string
115 * 113 *
116 * Load a module using the user mode module loader. The function returns 114 * Load a module using the user mode module loader. The function returns
117 * zero on success or a negative errno code on failure. Note that a 115 * zero on success or a negative errno code or positive exit code from
118 * successful module load does not mean the module did not then unload 116 * "modprobe" on failure. Note that a successful module load does not mean
119 * and exit on an error of its own. Callers must check that the service 117 * the module did not then unload and exit on an error of its own. Callers
120 * they requested is now available not blindly invoke it. 118 * must check that the service they requested is now available not blindly
119 * invoke it.
121 * 120 *
122 * If module auto-loading support is disabled then this function 121 * If module auto-loading support is disabled then this function
123 * becomes a no-operation. 122 * becomes a no-operation.
@@ -213,7 +212,7 @@ static void umh_complete(struct subprocess_info *sub_info)
213/* 212/*
214 * This is the task which runs the usermode application 213 * This is the task which runs the usermode application
215 */ 214 */
216static int ____call_usermodehelper(void *data) 215static int call_usermodehelper_exec_async(void *data)
217{ 216{
218 struct subprocess_info *sub_info = data; 217 struct subprocess_info *sub_info = data;
219 struct cred *new; 218 struct cred *new;
@@ -223,12 +222,9 @@ static int ____call_usermodehelper(void *data)
223 flush_signal_handlers(current, 1); 222 flush_signal_handlers(current, 1);
224 spin_unlock_irq(&current->sighand->siglock); 223 spin_unlock_irq(&current->sighand->siglock);
225 224
226 /* We can run anywhere, unlike our parent keventd(). */
227 set_cpus_allowed_ptr(current, cpu_all_mask);
228
229 /* 225 /*
230 * Our parent is keventd, which runs with elevated scheduling priority. 226 * Our parent (unbound workqueue) runs with elevated scheduling
231 * Avoid propagating that into the userspace child. 227 * priority. Avoid propagating that into the userspace child.
232 */ 228 */
233 set_user_nice(current, 0); 229 set_user_nice(current, 0);
234 230
@@ -258,7 +254,10 @@ static int ____call_usermodehelper(void *data)
258 (const char __user *const __user *)sub_info->envp); 254 (const char __user *const __user *)sub_info->envp);
259out: 255out:
260 sub_info->retval = retval; 256 sub_info->retval = retval;
261 /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */ 257 /*
258 * call_usermodehelper_exec_sync() will call umh_complete
259 * if UHM_WAIT_PROC.
260 */
262 if (!(sub_info->wait & UMH_WAIT_PROC)) 261 if (!(sub_info->wait & UMH_WAIT_PROC))
263 umh_complete(sub_info); 262 umh_complete(sub_info);
264 if (!retval) 263 if (!retval)
@@ -266,15 +265,14 @@ out:
266 do_exit(0); 265 do_exit(0);
267} 266}
268 267
269/* Keventd can't block, but this (a child) can. */ 268/* Handles UMH_WAIT_PROC. */
270static int wait_for_helper(void *data) 269static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
271{ 270{
272 struct subprocess_info *sub_info = data;
273 pid_t pid; 271 pid_t pid;
274 272
275 /* If SIGCLD is ignored sys_wait4 won't populate the status. */ 273 /* If SIGCLD is ignored sys_wait4 won't populate the status. */
276 kernel_sigaction(SIGCHLD, SIG_DFL); 274 kernel_sigaction(SIGCHLD, SIG_DFL);
277 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); 275 pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
278 if (pid < 0) { 276 if (pid < 0) {
279 sub_info->retval = pid; 277 sub_info->retval = pid;
280 } else { 278 } else {
@@ -282,44 +280,60 @@ static int wait_for_helper(void *data)
282 /* 280 /*
283 * Normally it is bogus to call wait4() from in-kernel because 281 * Normally it is bogus to call wait4() from in-kernel because
284 * wait4() wants to write the exit code to a userspace address. 282 * wait4() wants to write the exit code to a userspace address.
285 * But wait_for_helper() always runs as keventd, and put_user() 283 * But call_usermodehelper_exec_sync() always runs as kernel
286 * to a kernel address works OK for kernel threads, due to their 284 * thread (workqueue) and put_user() to a kernel address works
287 * having an mm_segment_t which spans the entire address space. 285 * OK for kernel threads, due to their having an mm_segment_t
286 * which spans the entire address space.
288 * 287 *
289 * Thus the __user pointer cast is valid here. 288 * Thus the __user pointer cast is valid here.
290 */ 289 */
291 sys_wait4(pid, (int __user *)&ret, 0, NULL); 290 sys_wait4(pid, (int __user *)&ret, 0, NULL);
292 291
293 /* 292 /*
294 * If ret is 0, either ____call_usermodehelper failed and the 293 * If ret is 0, either call_usermodehelper_exec_async failed and
295 * real error code is already in sub_info->retval or 294 * the real error code is already in sub_info->retval or
296 * sub_info->retval is 0 anyway, so don't mess with it then. 295 * sub_info->retval is 0 anyway, so don't mess with it then.
297 */ 296 */
298 if (ret) 297 if (ret)
299 sub_info->retval = ret; 298 sub_info->retval = ret;
300 } 299 }
301 300
301 /* Restore default kernel sig handler */
302 kernel_sigaction(SIGCHLD, SIG_IGN);
303
302 umh_complete(sub_info); 304 umh_complete(sub_info);
303 do_exit(0);
304} 305}
305 306
306/* This is run by khelper thread */ 307/*
307static void __call_usermodehelper(struct work_struct *work) 308 * We need to create the usermodehelper kernel thread from a task that is affine
309 * to an optimized set of CPUs (or nohz housekeeping ones) such that they
310 * inherit a widest affinity irrespective of call_usermodehelper() callers with
311 * possibly reduced affinity (eg: per-cpu workqueues). We don't want
312 * usermodehelper targets to contend a busy CPU.
313 *
314 * Unbound workqueues provide such wide affinity and allow to block on
315 * UMH_WAIT_PROC requests without blocking pending request (up to some limit).
316 *
317 * Besides, workqueues provide the privilege level that caller might not have
318 * to perform the usermodehelper request.
319 *
320 */
321static void call_usermodehelper_exec_work(struct work_struct *work)
308{ 322{
309 struct subprocess_info *sub_info = 323 struct subprocess_info *sub_info =
310 container_of(work, struct subprocess_info, work); 324 container_of(work, struct subprocess_info, work);
311 pid_t pid;
312 325
313 if (sub_info->wait & UMH_WAIT_PROC) 326 if (sub_info->wait & UMH_WAIT_PROC) {
314 pid = kernel_thread(wait_for_helper, sub_info, 327 call_usermodehelper_exec_sync(sub_info);
315 CLONE_FS | CLONE_FILES | SIGCHLD); 328 } else {
316 else 329 pid_t pid;
317 pid = kernel_thread(____call_usermodehelper, sub_info,
318 SIGCHLD);
319 330
320 if (pid < 0) { 331 pid = kernel_thread(call_usermodehelper_exec_async, sub_info,
321 sub_info->retval = pid; 332 SIGCHLD);
322 umh_complete(sub_info); 333 if (pid < 0) {
334 sub_info->retval = pid;
335 umh_complete(sub_info);
336 }
323 } 337 }
324} 338}
325 339
@@ -509,7 +523,7 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
509 if (!sub_info) 523 if (!sub_info)
510 goto out; 524 goto out;
511 525
512 INIT_WORK(&sub_info->work, __call_usermodehelper); 526 INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
513 sub_info->path = path; 527 sub_info->path = path;
514 sub_info->argv = argv; 528 sub_info->argv = argv;
515 sub_info->envp = envp; 529 sub_info->envp = envp;
@@ -531,8 +545,8 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
531 * from interrupt context. 545 * from interrupt context.
532 * 546 *
533 * Runs a user-space application. The application is started 547 * Runs a user-space application. The application is started
534 * asynchronously if wait is not set, and runs as a child of keventd. 548 * asynchronously if wait is not set, and runs as a child of system workqueues.
535 * (ie. it runs with full root capabilities). 549 * (ie. it runs with full root capabilities and optimized affinity).
536 */ 550 */
537int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) 551int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
538{ 552{
@@ -544,7 +558,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
544 return -EINVAL; 558 return -EINVAL;
545 } 559 }
546 helper_lock(); 560 helper_lock();
547 if (!khelper_wq || usermodehelper_disabled) { 561 if (usermodehelper_disabled) {
548 retval = -EBUSY; 562 retval = -EBUSY;
549 goto out; 563 goto out;
550 } 564 }
@@ -556,7 +570,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
556 sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done; 570 sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
557 sub_info->wait = wait; 571 sub_info->wait = wait;
558 572
559 queue_work(khelper_wq, &sub_info->work); 573 queue_work(system_unbound_wq, &sub_info->work);
560 if (wait == UMH_NO_WAIT) /* task has freed sub_info */ 574 if (wait == UMH_NO_WAIT) /* task has freed sub_info */
561 goto unlock; 575 goto unlock;
562 576
@@ -686,9 +700,3 @@ struct ctl_table usermodehelper_table[] = {
686 }, 700 },
687 { } 701 { }
688}; 702};
689
690void __init usermodehelper_init(void)
691{
692 khelper_wq = create_singlethread_workqueue("khelper");
693 BUG_ON(!khelper_wq);
694}
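
The khelper removal above is transparent to callers: the setup/exec API is unchanged, only the work item now runs on an unbound system workqueue. A minimal illustrative caller, not part of the patch (the helper path and wrapper function are hypothetical):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/kmod.h>

/* Run a hypothetical helper binary and wait for it to exit (UMH_WAIT_PROC). */
static int run_helper_example(void)
{
	char *argv[] = { "/sbin/example-helper", NULL };	/* hypothetical path */
	char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
	struct subprocess_info *info;

	info = call_usermodehelper_setup(argv[0], argv, envp, GFP_KERNEL,
					 NULL, NULL, NULL);
	if (!info)
		return -ENOMEM;

	/* After this patch the request is queued on system_unbound_wq. */
	return call_usermodehelper_exec(info, UMH_WAIT_PROC);
}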
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index c90e417bb963..d10ab6b9b5e0 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1332,7 +1332,7 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr)
1332 addr < (unsigned long)__kprobes_text_end; 1332 addr < (unsigned long)__kprobes_text_end;
1333} 1333}
1334 1334
1335static bool within_kprobe_blacklist(unsigned long addr) 1335bool within_kprobe_blacklist(unsigned long addr)
1336{ 1336{
1337 struct kprobe_blacklist_entry *ent; 1337 struct kprobe_blacklist_entry *ent;
1338 1338
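
Dropping the static qualifier from within_kprobe_blacklist() lets code outside kprobes.c filter addresses before attempting registration. A hedged sketch of such a caller, assuming the declaration is made visible through <linux/kprobes.h> (the wrapper name is hypothetical):

#include <linux/errno.h>
#include <linux/kprobes.h>

/* Hypothetical pre-check used by a probe-registration path. */
static int check_probe_addr(void *addr)
{
	if (within_kprobe_blacklist((unsigned long)addr))
		return -EINVAL;		/* refuse to probe blacklisted text */
	return 0;
}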
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 6683ccef9fff..e83b26464061 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -90,7 +90,7 @@ static ssize_t profiling_store(struct kobject *kobj,
90KERNEL_ATTR_RW(profiling); 90KERNEL_ATTR_RW(profiling);
91#endif 91#endif
92 92
93#ifdef CONFIG_KEXEC 93#ifdef CONFIG_KEXEC_CORE
94static ssize_t kexec_loaded_show(struct kobject *kobj, 94static ssize_t kexec_loaded_show(struct kobject *kobj,
95 struct kobj_attribute *attr, char *buf) 95 struct kobj_attribute *attr, char *buf)
96{ 96{
@@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
134} 134}
135KERNEL_ATTR_RO(vmcoreinfo); 135KERNEL_ATTR_RO(vmcoreinfo);
136 136
137#endif /* CONFIG_KEXEC */ 137#endif /* CONFIG_KEXEC_CORE */
138 138
139/* whether file capabilities are enabled */ 139/* whether file capabilities are enabled */
140static ssize_t fscaps_show(struct kobject *kobj, 140static ssize_t fscaps_show(struct kobject *kobj,
@@ -196,7 +196,7 @@ static struct attribute * kernel_attrs[] = {
196#ifdef CONFIG_PROFILING 196#ifdef CONFIG_PROFILING
197 &profiling_attr.attr, 197 &profiling_attr.attr,
198#endif 198#endif
199#ifdef CONFIG_KEXEC 199#ifdef CONFIG_KEXEC_CORE
200 &kexec_loaded_attr.attr, 200 &kexec_loaded_attr.attr,
201 &kexec_crash_loaded_attr.attr, 201 &kexec_crash_loaded_attr.attr,
202 &kexec_crash_size_attr.attr, 202 &kexec_crash_size_attr.attr,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 10e489c448fe..9ff173dca1ae 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -97,6 +97,7 @@ bool kthread_should_park(void)
97{ 97{
98 return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags); 98 return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags);
99} 99}
100EXPORT_SYMBOL_GPL(kthread_should_park);
100 101
101/** 102/**
102 * kthread_freezable_should_stop - should this freezable kthread return now? 103 * kthread_freezable_should_stop - should this freezable kthread return now?
@@ -171,6 +172,7 @@ void kthread_parkme(void)
171{ 172{
172 __kthread_parkme(to_kthread(current)); 173 __kthread_parkme(to_kthread(current));
173} 174}
175EXPORT_SYMBOL_GPL(kthread_parkme);
174 176
175static int kthread(void *_create) 177static int kthread(void *_create)
176{ 178{
@@ -246,15 +248,16 @@ static void create_kthread(struct kthread_create_info *create)
246 * kthread_create_on_node - create a kthread. 248 * kthread_create_on_node - create a kthread.
247 * @threadfn: the function to run until signal_pending(current). 249 * @threadfn: the function to run until signal_pending(current).
248 * @data: data ptr for @threadfn. 250 * @data: data ptr for @threadfn.
249 * @node: memory node number. 251 * @node: task and thread structures for the thread are allocated on this node
250 * @namefmt: printf-style name for the thread. 252 * @namefmt: printf-style name for the thread.
251 * 253 *
252 * Description: This helper function creates and names a kernel 254 * Description: This helper function creates and names a kernel
253 * thread. The thread will be stopped: use wake_up_process() to start 255 * thread. The thread will be stopped: use wake_up_process() to start
254 * it. See also kthread_run(). 256 * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and
257 * is affine to all CPUs.
255 * 258 *
256 * If thread is going to be bound on a particular cpu, give its node 259 * If thread is going to be bound on a particular cpu, give its node
257 * in @node, to get NUMA affinity for kthread stack, or else give -1. 260 * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
258 * When woken, the thread will run @threadfn() with @data as its 261 * When woken, the thread will run @threadfn() with @data as its
259 * argument. @threadfn() can either call do_exit() directly if it is a 262 * argument. @threadfn() can either call do_exit() directly if it is a
260 * standalone thread for which no one will call kthread_stop(), or 263 * standalone thread for which no one will call kthread_stop(), or
@@ -325,16 +328,30 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
325} 328}
326EXPORT_SYMBOL(kthread_create_on_node); 329EXPORT_SYMBOL(kthread_create_on_node);
327 330
328static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state) 331static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state)
329{ 332{
330 /* Must have done schedule() in kthread() before we set_task_cpu */ 333 unsigned long flags;
334
331 if (!wait_task_inactive(p, state)) { 335 if (!wait_task_inactive(p, state)) {
332 WARN_ON(1); 336 WARN_ON(1);
333 return; 337 return;
334 } 338 }
339
335 /* It's safe because the task is inactive. */ 340 /* It's safe because the task is inactive. */
336 do_set_cpus_allowed(p, cpumask_of(cpu)); 341 raw_spin_lock_irqsave(&p->pi_lock, flags);
342 do_set_cpus_allowed(p, mask);
337 p->flags |= PF_NO_SETAFFINITY; 343 p->flags |= PF_NO_SETAFFINITY;
344 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
345}
346
347static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
348{
349 __kthread_bind_mask(p, cpumask_of(cpu), state);
350}
351
352void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
353{
354 __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
338} 355}
339 356
340/** 357/**
@@ -411,6 +428,7 @@ void kthread_unpark(struct task_struct *k)
411 if (kthread) 428 if (kthread)
412 __kthread_unpark(k, kthread); 429 __kthread_unpark(k, kthread);
413} 430}
431EXPORT_SYMBOL_GPL(kthread_unpark);
414 432
415/** 433/**
416 * kthread_park - park a thread created by kthread_create(). 434 * kthread_park - park a thread created by kthread_create().
@@ -441,6 +459,7 @@ int kthread_park(struct task_struct *k)
441 } 459 }
442 return ret; 460 return ret;
443} 461}
462EXPORT_SYMBOL_GPL(kthread_park);
444 463
445/** 464/**
446 * kthread_stop - stop a thread created by kthread_create(). 465 * kthread_stop - stop a thread created by kthread_create().
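
kthread_bind_mask() mirrors kthread_bind() but pins a still-inactive kthread to an entire cpumask, now under pi_lock. A rough usage sketch, assuming the new helper is declared in <linux/kthread.h> (the wrapper name is illustrative):

#include <linux/cpumask.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

/* Create a kthread, restrict it to @mask, then let it run. */
static struct task_struct *start_masked_worker(int (*fn)(void *), void *data,
					       const struct cpumask *mask)
{
	struct task_struct *tsk;

	tsk = kthread_create_on_node(fn, data, NUMA_NO_NODE, "masked-worker");
	if (IS_ERR(tsk))
		return tsk;

	kthread_bind_mask(tsk, mask);	/* task has not been woken yet */
	wake_up_process(tsk);
	return tsk;
}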
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index c40ebcca0495..6e5344112419 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -348,8 +348,10 @@ static void klp_disable_func(struct klp_func *func)
348{ 348{
349 struct klp_ops *ops; 349 struct klp_ops *ops;
350 350
351 WARN_ON(func->state != KLP_ENABLED); 351 if (WARN_ON(func->state != KLP_ENABLED))
352 WARN_ON(!func->old_addr); 352 return;
353 if (WARN_ON(!func->old_addr))
354 return;
353 355
354 ops = klp_find_ops(func->old_addr); 356 ops = klp_find_ops(func->old_addr);
355 if (WARN_ON(!ops)) 357 if (WARN_ON(!ops))
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 7dd5c9918e4c..8e96f6cc2a4a 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,5 +1,5 @@
1 1
2obj-y += mutex.o semaphore.o rwsem.o 2obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
3 3
4ifdef CONFIG_FUNCTION_TRACER 4ifdef CONFIG_FUNCTION_TRACER
5CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) 5CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
@@ -20,11 +20,9 @@ obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
20obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o 20obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
21obj-$(CONFIG_RT_MUTEXES) += rtmutex.o 21obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
22obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o 22obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
23obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
24obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o 23obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
25obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o 24obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
26obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o 25obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
27obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o 26obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
28obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
29obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o 27obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
30obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o 28obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 652a8ee8efe9..f32567254867 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -88,6 +88,19 @@ void percpu_down_read(struct percpu_rw_semaphore *brw)
88 __up_read(&brw->rw_sem); 88 __up_read(&brw->rw_sem);
89} 89}
90 90
91int percpu_down_read_trylock(struct percpu_rw_semaphore *brw)
92{
93 if (unlikely(!update_fast_ctr(brw, +1))) {
94 if (!__down_read_trylock(&brw->rw_sem))
95 return 0;
96 atomic_inc(&brw->slow_read_ctr);
97 __up_read(&brw->rw_sem);
98 }
99
100 rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_);
101 return 1;
102}
103
91void percpu_up_read(struct percpu_rw_semaphore *brw) 104void percpu_up_read(struct percpu_rw_semaphore *brw)
92{ 105{
93 rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); 106 rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_);
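
percpu_down_read_trylock() gives readers a non-blocking entry point: the per-cpu fast path is tried first and the shared rwsem is only trylocked on the slow path. A short illustrative caller, assuming the declaration is added to <linux/percpu-rwsem.h> alongside this change (the function name is hypothetical):

#include <linux/errno.h>
#include <linux/percpu-rwsem.h>

static int read_side_or_busy(struct percpu_rw_semaphore *sem)
{
	if (!percpu_down_read_trylock(sem))
		return -EBUSY;		/* a writer holds or is taking it */

	/* ... read-side critical section ... */

	percpu_up_read(sem);
	return 0;
}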
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index 6c5da483966b..f17a3e3b3550 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -55,27 +55,29 @@ rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
55{ 55{
56 while ((cnts & _QW_WMASK) == _QW_LOCKED) { 56 while ((cnts & _QW_WMASK) == _QW_LOCKED) {
57 cpu_relax_lowlatency(); 57 cpu_relax_lowlatency();
58 cnts = smp_load_acquire((u32 *)&lock->cnts); 58 cnts = atomic_read_acquire(&lock->cnts);
59 } 59 }
60} 60}
61 61
62/** 62/**
63 * queue_read_lock_slowpath - acquire read lock of a queue rwlock 63 * queued_read_lock_slowpath - acquire read lock of a queue rwlock
64 * @lock: Pointer to queue rwlock structure 64 * @lock: Pointer to queue rwlock structure
65 * @cnts: Current qrwlock lock value
65 */ 66 */
66void queue_read_lock_slowpath(struct qrwlock *lock) 67void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
67{ 68{
68 u32 cnts;
69
70 /* 69 /*
71 * Readers come here when they cannot get the lock without waiting 70 * Readers come here when they cannot get the lock without waiting
72 */ 71 */
73 if (unlikely(in_interrupt())) { 72 if (unlikely(in_interrupt())) {
74 /* 73 /*
75 * Readers in interrupt context will spin until the lock is 74 * Readers in interrupt context will get the lock immediately
76 * available without waiting in the queue. 75 * if the writer is just waiting (not holding the lock yet).
76 * The rspin_until_writer_unlock() function returns immediately
77 * in this case. Otherwise, they will spin (with ACQUIRE
78 * semantics) until the lock is available without waiting in
79 * the queue.
77 */ 80 */
78 cnts = smp_load_acquire((u32 *)&lock->cnts);
79 rspin_until_writer_unlock(lock, cnts); 81 rspin_until_writer_unlock(lock, cnts);
80 return; 82 return;
81 } 83 }
@@ -87,16 +89,11 @@ void queue_read_lock_slowpath(struct qrwlock *lock)
87 arch_spin_lock(&lock->lock); 89 arch_spin_lock(&lock->lock);
88 90
89 /* 91 /*
90 * At the head of the wait queue now, wait until the writer state 92 * The ACQUIRE semantics of the following spinning code ensure
91 * goes to 0 and then try to increment the reader count and get 93 * that accesses can't leak upwards out of our subsequent critical
92 * the lock. It is possible that an incoming writer may steal the 94 * section in the case that the lock is currently held for write.
93 * lock in the interim, so it is necessary to check the writer byte
94 * to make sure that the write lock isn't taken.
95 */ 95 */
96 while (atomic_read(&lock->cnts) & _QW_WMASK) 96 cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS;
97 cpu_relax_lowlatency();
98
99 cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS;
100 rspin_until_writer_unlock(lock, cnts); 97 rspin_until_writer_unlock(lock, cnts);
101 98
102 /* 99 /*
@@ -104,13 +101,13 @@ void queue_read_lock_slowpath(struct qrwlock *lock)
104 */ 101 */
105 arch_spin_unlock(&lock->lock); 102 arch_spin_unlock(&lock->lock);
106} 103}
107EXPORT_SYMBOL(queue_read_lock_slowpath); 104EXPORT_SYMBOL(queued_read_lock_slowpath);
108 105
109/** 106/**
110 * queue_write_lock_slowpath - acquire write lock of a queue rwlock 107 * queued_write_lock_slowpath - acquire write lock of a queue rwlock
111 * @lock : Pointer to queue rwlock structure 108 * @lock : Pointer to queue rwlock structure
112 */ 109 */
113void queue_write_lock_slowpath(struct qrwlock *lock) 110void queued_write_lock_slowpath(struct qrwlock *lock)
114{ 111{
115 u32 cnts; 112 u32 cnts;
116 113
@@ -119,7 +116,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
119 116
120 /* Try to acquire the lock directly if no reader is present */ 117 /* Try to acquire the lock directly if no reader is present */
121 if (!atomic_read(&lock->cnts) && 118 if (!atomic_read(&lock->cnts) &&
122 (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0)) 119 (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0))
123 goto unlock; 120 goto unlock;
124 121
125 /* 122 /*
@@ -130,7 +127,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
130 struct __qrwlock *l = (struct __qrwlock *)lock; 127 struct __qrwlock *l = (struct __qrwlock *)lock;
131 128
132 if (!READ_ONCE(l->wmode) && 129 if (!READ_ONCE(l->wmode) &&
133 (cmpxchg(&l->wmode, 0, _QW_WAITING) == 0)) 130 (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0))
134 break; 131 break;
135 132
136 cpu_relax_lowlatency(); 133 cpu_relax_lowlatency();
@@ -140,8 +137,8 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
140 for (;;) { 137 for (;;) {
141 cnts = atomic_read(&lock->cnts); 138 cnts = atomic_read(&lock->cnts);
142 if ((cnts == _QW_WAITING) && 139 if ((cnts == _QW_WAITING) &&
143 (atomic_cmpxchg(&lock->cnts, _QW_WAITING, 140 (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING,
144 _QW_LOCKED) == _QW_WAITING)) 141 _QW_LOCKED) == _QW_WAITING))
145 break; 142 break;
146 143
147 cpu_relax_lowlatency(); 144 cpu_relax_lowlatency();
@@ -149,4 +146,4 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
149unlock: 146unlock:
150 arch_spin_unlock(&lock->lock); 147 arch_spin_unlock(&lock->lock);
151} 148}
152EXPORT_SYMBOL(queue_write_lock_slowpath); 149EXPORT_SYMBOL(queued_write_lock_slowpath);
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 38c49202d532..337c8818541d 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -239,8 +239,8 @@ static __always_inline void set_locked(struct qspinlock *lock)
239 239
240static __always_inline void __pv_init_node(struct mcs_spinlock *node) { } 240static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
241static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { } 241static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
242static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { } 242static __always_inline void __pv_kick_node(struct qspinlock *lock,
243 243 struct mcs_spinlock *node) { }
244static __always_inline void __pv_wait_head(struct qspinlock *lock, 244static __always_inline void __pv_wait_head(struct qspinlock *lock,
245 struct mcs_spinlock *node) { } 245 struct mcs_spinlock *node) { }
246 246
@@ -440,7 +440,7 @@ queue:
440 cpu_relax(); 440 cpu_relax();
441 441
442 arch_mcs_spin_unlock_contended(&next->locked); 442 arch_mcs_spin_unlock_contended(&next->locked);
443 pv_kick_node(next); 443 pv_kick_node(lock, next);
444 444
445release: 445release:
446 /* 446 /*
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 04ab18151cc8..c8e6e9a596f5 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -4,6 +4,7 @@
4 4
5#include <linux/hash.h> 5#include <linux/hash.h>
6#include <linux/bootmem.h> 6#include <linux/bootmem.h>
7#include <linux/debug_locks.h>
7 8
8/* 9/*
9 * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead 10 * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead
@@ -21,9 +22,14 @@
21 22
22#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET) 23#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET)
23 24
25/*
26 * Queue node uses: vcpu_running & vcpu_halted.
27 * Queue head uses: vcpu_running & vcpu_hashed.
28 */
24enum vcpu_state { 29enum vcpu_state {
25 vcpu_running = 0, 30 vcpu_running = 0,
26 vcpu_halted, 31 vcpu_halted, /* Used only in pv_wait_node */
32 vcpu_hashed, /* = pv_hash'ed + vcpu_halted */
27}; 33};
28 34
29struct pv_node { 35struct pv_node {
@@ -152,7 +158,8 @@ static void pv_init_node(struct mcs_spinlock *node)
152 158
153/* 159/*
154 * Wait for node->locked to become true, halt the vcpu after a short spin. 160 * Wait for node->locked to become true, halt the vcpu after a short spin.
155 * pv_kick_node() is used to wake the vcpu again. 161 * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
162 * behalf.
156 */ 163 */
157static void pv_wait_node(struct mcs_spinlock *node) 164static void pv_wait_node(struct mcs_spinlock *node)
158{ 165{
@@ -171,9 +178,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
171 * 178 *
172 * [S] pn->state = vcpu_halted [S] next->locked = 1 179 * [S] pn->state = vcpu_halted [S] next->locked = 1
173 * MB MB 180 * MB MB
174 * [L] pn->locked [RmW] pn->state = vcpu_running 181 * [L] pn->locked [RmW] pn->state = vcpu_hashed
175 * 182 *
176 * Matches the xchg() from pv_kick_node(). 183 * Matches the cmpxchg() from pv_kick_node().
177 */ 184 */
178 smp_store_mb(pn->state, vcpu_halted); 185 smp_store_mb(pn->state, vcpu_halted);
179 186
@@ -181,9 +188,10 @@ static void pv_wait_node(struct mcs_spinlock *node)
181 pv_wait(&pn->state, vcpu_halted); 188 pv_wait(&pn->state, vcpu_halted);
182 189
183 /* 190 /*
184 * Reset the vCPU state to avoid unncessary CPU kicking 191 * If pv_kick_node() changed us to vcpu_hashed, retain that value
192 * so that pv_wait_head() knows to not also try to hash this lock.
185 */ 193 */
186 WRITE_ONCE(pn->state, vcpu_running); 194 cmpxchg(&pn->state, vcpu_halted, vcpu_running);
187 195
188 /* 196 /*
189 * If the locked flag is still not set after wakeup, it is a 197 * If the locked flag is still not set after wakeup, it is a
@@ -193,6 +201,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
193 * MCS lock will be released soon. 201 * MCS lock will be released soon.
194 */ 202 */
195 } 203 }
204
196 /* 205 /*
197 * By now our node->locked should be 1 and our caller will not actually 206 * By now our node->locked should be 1 and our caller will not actually
198 * spin-wait for it. We do however rely on our caller to do a 207 * spin-wait for it. We do however rely on our caller to do a
@@ -201,24 +210,35 @@ static void pv_wait_node(struct mcs_spinlock *node)
201} 210}
202 211
203/* 212/*
204 * Called after setting next->locked = 1, used to wake those stuck in 213 * Called after setting next->locked = 1 when we're the lock owner.
205 * pv_wait_node(). 214 *
 215 * Instead of waking the waiters stuck in pv_wait_node(), advance their state
 216 * such that they're waiting in pv_wait_head(); this avoids a wake/sleep cycle.
206 */ 217 */
207static void pv_kick_node(struct mcs_spinlock *node) 218static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
208{ 219{
209 struct pv_node *pn = (struct pv_node *)node; 220 struct pv_node *pn = (struct pv_node *)node;
221 struct __qspinlock *l = (void *)lock;
210 222
211 /* 223 /*
212 * Note that because node->locked is already set, this actual 224 * If the vCPU is indeed halted, advance its state to match that of
213 * mcs_spinlock entry could be re-used already. 225 * pv_wait_node(). If OTOH this fails, the vCPU was running and will
226 * observe its next->locked value and advance itself.
214 * 227 *
215 * This should be fine however, kicking people for no reason is 228 * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
216 * harmless. 229 */
230 if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
231 return;
232
233 /*
234 * Put the lock into the hash table and set the _Q_SLOW_VAL.
217 * 235 *
218 * See the comment in pv_wait_node(). 236 * As this is the same vCPU that will check the _Q_SLOW_VAL value and
237 * the hash table later on at unlock time, no atomic instruction is
238 * needed.
219 */ 239 */
220 if (xchg(&pn->state, vcpu_running) == vcpu_halted) 240 WRITE_ONCE(l->locked, _Q_SLOW_VAL);
221 pv_kick(pn->cpu); 241 (void)pv_hash(lock, pn);
222} 242}
223 243
224/* 244/*
@@ -232,6 +252,13 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
232 struct qspinlock **lp = NULL; 252 struct qspinlock **lp = NULL;
233 int loop; 253 int loop;
234 254
255 /*
256 * If pv_kick_node() already advanced our state, we don't need to
257 * insert ourselves into the hash table anymore.
258 */
259 if (READ_ONCE(pn->state) == vcpu_hashed)
260 lp = (struct qspinlock **)1;
261
235 for (;;) { 262 for (;;) {
236 for (loop = SPIN_THRESHOLD; loop; loop--) { 263 for (loop = SPIN_THRESHOLD; loop; loop--) {
237 if (!READ_ONCE(l->locked)) 264 if (!READ_ONCE(l->locked))
@@ -239,17 +266,22 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
239 cpu_relax(); 266 cpu_relax();
240 } 267 }
241 268
242 WRITE_ONCE(pn->state, vcpu_halted);
243 if (!lp) { /* ONCE */ 269 if (!lp) { /* ONCE */
270 WRITE_ONCE(pn->state, vcpu_hashed);
244 lp = pv_hash(lock, pn); 271 lp = pv_hash(lock, pn);
272
245 /* 273 /*
246 * lp must be set before setting _Q_SLOW_VAL 274 * We must hash before setting _Q_SLOW_VAL, such that
275 * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock()
276 * we'll be sure to be able to observe our hash entry.
247 * 277 *
248 * [S] lp = lock [RmW] l = l->locked = 0 278 * [S] pn->state
249 * MB MB 279 * [S] <hash> [Rmw] l->locked == _Q_SLOW_VAL
250 * [S] l->locked = _Q_SLOW_VAL [L] lp 280 * MB RMB
281 * [RmW] l->locked = _Q_SLOW_VAL [L] <unhash>
282 * [L] pn->state
251 * 283 *
252 * Matches the cmpxchg() in __pv_queued_spin_unlock(). 284 * Matches the smp_rmb() in __pv_queued_spin_unlock().
253 */ 285 */
254 if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) { 286 if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
255 /* 287 /*
@@ -286,14 +318,32 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
286{ 318{
287 struct __qspinlock *l = (void *)lock; 319 struct __qspinlock *l = (void *)lock;
288 struct pv_node *node; 320 struct pv_node *node;
321 u8 locked;
289 322
290 /* 323 /*
291 * We must not unlock if SLOW, because in that case we must first 324 * We must not unlock if SLOW, because in that case we must first
292 * unhash. Otherwise it would be possible to have multiple @lock 325 * unhash. Otherwise it would be possible to have multiple @lock
293 * entries, which would be BAD. 326 * entries, which would be BAD.
294 */ 327 */
295 if (likely(cmpxchg(&l->locked, _Q_LOCKED_VAL, 0) == _Q_LOCKED_VAL)) 328 locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
329 if (likely(locked == _Q_LOCKED_VAL))
330 return;
331
332 if (unlikely(locked != _Q_SLOW_VAL)) {
333 WARN(!debug_locks_silent,
334 "pvqspinlock: lock 0x%lx has corrupted value 0x%x!\n",
335 (unsigned long)lock, atomic_read(&lock->val));
296 return; 336 return;
337 }
338
339 /*
340 * A failed cmpxchg doesn't provide any memory-ordering guarantees,
341 * so we need a barrier to order the read of the node data in
342 * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
343 *
344 * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL.
345 */
346 smp_rmb();
297 347
298 /* 348 /*
299 * Since the above failed to release, this must be the SLOW path. 349 * Since the above failed to release, this must be the SLOW path.
@@ -310,8 +360,11 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
310 /* 360 /*
311 * At this point the memory pointed at by lock can be freed/reused, 361 * At this point the memory pointed at by lock can be freed/reused,
312 * however we can still use the pv_node to kick the CPU. 362 * however we can still use the pv_node to kick the CPU.
363 * The other vCPU may not really be halted, but kicking an active
364 * vCPU is harmless other than the additional latency in completing
365 * the unlock.
313 */ 366 */
314 if (READ_ONCE(node->state) == vcpu_halted) 367 if (READ_ONCE(node->state) == vcpu_hashed)
315 pv_kick(node->cpu); 368 pv_kick(node->cpu);
316} 369}
317/* 370/*
diff --git a/kernel/locking/rtmutex-tester.c b/kernel/locking/rtmutex-tester.c
deleted file mode 100644
index 1d96dd0d93c1..000000000000
--- a/kernel/locking/rtmutex-tester.c
+++ /dev/null
@@ -1,420 +0,0 @@
1/*
2 * RT-Mutex-tester: scriptable tester for rt mutexes
3 *
4 * started by Thomas Gleixner:
5 *
6 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
7 *
8 */
9#include <linux/device.h>
10#include <linux/kthread.h>
11#include <linux/export.h>
12#include <linux/sched.h>
13#include <linux/sched/rt.h>
14#include <linux/spinlock.h>
15#include <linux/timer.h>
16#include <linux/freezer.h>
17#include <linux/stat.h>
18
19#include "rtmutex.h"
20
21#define MAX_RT_TEST_THREADS 8
22#define MAX_RT_TEST_MUTEXES 8
23
24static spinlock_t rttest_lock;
25static atomic_t rttest_event;
26
27struct test_thread_data {
28 int opcode;
29 int opdata;
30 int mutexes[MAX_RT_TEST_MUTEXES];
31 int event;
32 struct device dev;
33};
34
35static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
36static struct task_struct *threads[MAX_RT_TEST_THREADS];
37static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES];
38
39enum test_opcodes {
40 RTTEST_NOP = 0,
41 RTTEST_SCHEDOT, /* 1 Sched other, data = nice */
42 RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */
43 RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */
44 RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */
45 RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */
46 RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */
47 RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */
48 RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */
49 /* 9, 10 - reserved for BKL commemoration */
50 RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */
51 RTTEST_RESETEVENT = 98, /* 98 Reset event counter */
52 RTTEST_RESET = 99, /* 99 Reset all pending operations */
53};
54
55static int handle_op(struct test_thread_data *td, int lockwakeup)
56{
57 int i, id, ret = -EINVAL;
58
59 switch(td->opcode) {
60
61 case RTTEST_NOP:
62 return 0;
63
64 case RTTEST_LOCKCONT:
65 td->mutexes[td->opdata] = 1;
66 td->event = atomic_add_return(1, &rttest_event);
67 return 0;
68
69 case RTTEST_RESET:
70 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) {
71 if (td->mutexes[i] == 4) {
72 rt_mutex_unlock(&mutexes[i]);
73 td->mutexes[i] = 0;
74 }
75 }
76 return 0;
77
78 case RTTEST_RESETEVENT:
79 atomic_set(&rttest_event, 0);
80 return 0;
81
82 default:
83 if (lockwakeup)
84 return ret;
85 }
86
87 switch(td->opcode) {
88
89 case RTTEST_LOCK:
90 case RTTEST_LOCKNOWAIT:
91 id = td->opdata;
92 if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
93 return ret;
94
95 td->mutexes[id] = 1;
96 td->event = atomic_add_return(1, &rttest_event);
97 rt_mutex_lock(&mutexes[id]);
98 td->event = atomic_add_return(1, &rttest_event);
99 td->mutexes[id] = 4;
100 return 0;
101
102 case RTTEST_LOCKINT:
103 case RTTEST_LOCKINTNOWAIT:
104 id = td->opdata;
105 if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
106 return ret;
107
108 td->mutexes[id] = 1;
109 td->event = atomic_add_return(1, &rttest_event);
110 ret = rt_mutex_lock_interruptible(&mutexes[id], 0);
111 td->event = atomic_add_return(1, &rttest_event);
112 td->mutexes[id] = ret ? 0 : 4;
113 return ret ? -EINTR : 0;
114
115 case RTTEST_UNLOCK:
116 id = td->opdata;
117 if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4)
118 return ret;
119
120 td->event = atomic_add_return(1, &rttest_event);
121 rt_mutex_unlock(&mutexes[id]);
122 td->event = atomic_add_return(1, &rttest_event);
123 td->mutexes[id] = 0;
124 return 0;
125
126 default:
127 break;
128 }
129 return ret;
130}
131
132/*
133 * Schedule replacement for rtsem_down(). Only called for threads with
134 * PF_MUTEX_TESTER set.
135 *
136 * This allows us to have finegrained control over the event flow.
137 *
138 */
139void schedule_rt_mutex_test(struct rt_mutex *mutex)
140{
141 int tid, op, dat;
142 struct test_thread_data *td;
143
144 /* We have to lookup the task */
145 for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) {
146 if (threads[tid] == current)
147 break;
148 }
149
150 BUG_ON(tid == MAX_RT_TEST_THREADS);
151
152 td = &thread_data[tid];
153
154 op = td->opcode;
155 dat = td->opdata;
156
157 switch (op) {
158 case RTTEST_LOCK:
159 case RTTEST_LOCKINT:
160 case RTTEST_LOCKNOWAIT:
161 case RTTEST_LOCKINTNOWAIT:
162 if (mutex != &mutexes[dat])
163 break;
164
165 if (td->mutexes[dat] != 1)
166 break;
167
168 td->mutexes[dat] = 2;
169 td->event = atomic_add_return(1, &rttest_event);
170 break;
171
172 default:
173 break;
174 }
175
176 schedule();
177
178
179 switch (op) {
180 case RTTEST_LOCK:
181 case RTTEST_LOCKINT:
182 if (mutex != &mutexes[dat])
183 return;
184
185 if (td->mutexes[dat] != 2)
186 return;
187
188 td->mutexes[dat] = 3;
189 td->event = atomic_add_return(1, &rttest_event);
190 break;
191
192 case RTTEST_LOCKNOWAIT:
193 case RTTEST_LOCKINTNOWAIT:
194 if (mutex != &mutexes[dat])
195 return;
196
197 if (td->mutexes[dat] != 2)
198 return;
199
200 td->mutexes[dat] = 1;
201 td->event = atomic_add_return(1, &rttest_event);
202 return;
203
204 default:
205 return;
206 }
207
208 td->opcode = 0;
209
210 for (;;) {
211 set_current_state(TASK_INTERRUPTIBLE);
212
213 if (td->opcode > 0) {
214 int ret;
215
216 set_current_state(TASK_RUNNING);
217 ret = handle_op(td, 1);
218 set_current_state(TASK_INTERRUPTIBLE);
219 if (td->opcode == RTTEST_LOCKCONT)
220 break;
221 td->opcode = ret;
222 }
223
224 /* Wait for the next command to be executed */
225 schedule();
226 }
227
228 /* Restore previous command and data */
229 td->opcode = op;
230 td->opdata = dat;
231}
232
233static int test_func(void *data)
234{
235 struct test_thread_data *td = data;
236 int ret;
237
238 current->flags |= PF_MUTEX_TESTER;
239 set_freezable();
240 allow_signal(SIGHUP);
241
242 for(;;) {
243
244 set_current_state(TASK_INTERRUPTIBLE);
245
246 if (td->opcode > 0) {
247 set_current_state(TASK_RUNNING);
248 ret = handle_op(td, 0);
249 set_current_state(TASK_INTERRUPTIBLE);
250 td->opcode = ret;
251 }
252
253 /* Wait for the next command to be executed */
254 schedule();
255 try_to_freeze();
256
257 if (signal_pending(current))
258 flush_signals(current);
259
260 if(kthread_should_stop())
261 break;
262 }
263 return 0;
264}
265
266/**
267 * sysfs_test_command - interface for test commands
268 * @dev: thread reference
269 * @buf: command for actual step
270 * @count: length of buffer
271 *
272 * command syntax:
273 *
274 * opcode:data
275 */
276static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr,
277 const char *buf, size_t count)
278{
279 struct sched_param schedpar;
280 struct test_thread_data *td;
281 char cmdbuf[32];
282 int op, dat, tid, ret;
283
284 td = container_of(dev, struct test_thread_data, dev);
285 tid = td->dev.id;
286
287 /* strings from sysfs write are not 0 terminated! */
288 if (count >= sizeof(cmdbuf))
289 return -EINVAL;
290
291 /* strip of \n: */
292 if (buf[count-1] == '\n')
293 count--;
294 if (count < 1)
295 return -EINVAL;
296
297 memcpy(cmdbuf, buf, count);
298 cmdbuf[count] = 0;
299
300 if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2)
301 return -EINVAL;
302
303 switch (op) {
304 case RTTEST_SCHEDOT:
305 schedpar.sched_priority = 0;
306 ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar);
307 if (ret)
308 return ret;
309 set_user_nice(current, 0);
310 break;
311
312 case RTTEST_SCHEDRT:
313 schedpar.sched_priority = dat;
314 ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar);
315 if (ret)
316 return ret;
317 break;
318
319 case RTTEST_SIGNAL:
320 send_sig(SIGHUP, threads[tid], 0);
321 break;
322
323 default:
324 if (td->opcode > 0)
325 return -EBUSY;
326 td->opdata = dat;
327 td->opcode = op;
328 wake_up_process(threads[tid]);
329 }
330
331 return count;
332}
333
334/**
335 * sysfs_test_status - sysfs interface for rt tester
336 * @dev: thread to query
337 * @buf: char buffer to be filled with thread status info
338 */
339static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr,
340 char *buf)
341{
342 struct test_thread_data *td;
343 struct task_struct *tsk;
344 char *curr = buf;
345 int i;
346
347 td = container_of(dev, struct test_thread_data, dev);
348 tsk = threads[td->dev.id];
349
350 spin_lock(&rttest_lock);
351
352 curr += sprintf(curr,
353 "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:",
354 td->opcode, td->event, tsk->state,
355 (MAX_RT_PRIO - 1) - tsk->prio,
356 (MAX_RT_PRIO - 1) - tsk->normal_prio,
357 tsk->pi_blocked_on);
358
359 for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
360 curr += sprintf(curr, "%d", td->mutexes[i]);
361
362 spin_unlock(&rttest_lock);
363
364 curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
365 mutexes[td->dev.id].owner);
366
367 return curr - buf;
368}
369
370static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL);
371static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command);
372
373static struct bus_type rttest_subsys = {
374 .name = "rttest",
375 .dev_name = "rttest",
376};
377
378static int init_test_thread(int id)
379{
380 thread_data[id].dev.bus = &rttest_subsys;
381 thread_data[id].dev.id = id;
382
383 threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
384 if (IS_ERR(threads[id]))
385 return PTR_ERR(threads[id]);
386
387 return device_register(&thread_data[id].dev);
388}
389
390static int init_rttest(void)
391{
392 int ret, i;
393
394 spin_lock_init(&rttest_lock);
395
396 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
397 rt_mutex_init(&mutexes[i]);
398
399 ret = subsys_system_register(&rttest_subsys, NULL);
400 if (ret)
401 return ret;
402
403 for (i = 0; i < MAX_RT_TEST_THREADS; i++) {
404 ret = init_test_thread(i);
405 if (ret)
406 break;
407 ret = device_create_file(&thread_data[i].dev, &dev_attr_status);
408 if (ret)
409 break;
410 ret = device_create_file(&thread_data[i].dev, &dev_attr_command);
411 if (ret)
412 break;
413 }
414
415 printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" );
416
417 return ret;
418}
419
420device_initcall(init_rttest);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 5674b073473c..7781d801212f 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1120,7 +1120,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
1120 1120
1121 debug_rt_mutex_print_deadlock(waiter); 1121 debug_rt_mutex_print_deadlock(waiter);
1122 1122
1123 schedule_rt_mutex(lock); 1123 schedule();
1124 1124
1125 raw_spin_lock(&lock->wait_lock); 1125 raw_spin_lock(&lock->wait_lock);
1126 set_current_state(state); 1126 set_current_state(state);
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 7844f8f0e639..4f5f83c7d2d3 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -15,28 +15,6 @@
15#include <linux/rtmutex.h> 15#include <linux/rtmutex.h>
16 16
17/* 17/*
18 * The rtmutex in kernel tester is independent of rtmutex debugging. We
19 * call schedule_rt_mutex_test() instead of schedule() for the tasks which
20 * belong to the tester. That way we can delay the wakeup path of those
21 * threads to provoke lock stealing and testing of complex boosting scenarios.
22 */
23#ifdef CONFIG_RT_MUTEX_TESTER
24
25extern void schedule_rt_mutex_test(struct rt_mutex *lock);
26
27#define schedule_rt_mutex(_lock) \
28 do { \
29 if (!(current->flags & PF_MUTEX_TESTER)) \
30 schedule(); \
31 else \
32 schedule_rt_mutex_test(_lock); \
33 } while (0)
34
35#else
36# define schedule_rt_mutex(_lock) schedule()
37#endif
38
39/*
40 * This is the control structure for tasks blocked on a rt_mutex, 18 * This is the control structure for tasks blocked on a rt_mutex,
 41 * which is allocated on the kernel stack of the blocked task. 19 * which is allocated on the kernel stack of the blocked task.
42 * 20 *
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
new file mode 100644
index 000000000000..536c727a56e9
--- /dev/null
+++ b/kernel/membarrier.c
@@ -0,0 +1,66 @@
1/*
2 * Copyright (C) 2010, 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
3 *
4 * membarrier system call
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 */
16
17#include <linux/syscalls.h>
18#include <linux/membarrier.h>
19
20/*
 21 * Bitmask made from an "or" of all commands within enum membarrier_cmd,
22 * except MEMBARRIER_CMD_QUERY.
23 */
24#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED)
25
26/**
27 * sys_membarrier - issue memory barriers on a set of threads
28 * @cmd: Takes command values defined in enum membarrier_cmd.
29 * @flags: Currently needs to be 0. For future extensions.
30 *
31 * If this system call is not implemented, -ENOSYS is returned. If the
32 * command specified does not exist, or if the command argument is invalid,
33 * this system call returns -EINVAL. For a given command, with flags argument
34 * set to 0, this system call is guaranteed to always return the same value
35 * until reboot.
36 *
37 * All memory accesses performed in program order from each targeted thread
 38 * are guaranteed to be ordered with respect to sys_membarrier(). If we use
39 * the semantic "barrier()" to represent a compiler barrier forcing memory
40 * accesses to be performed in program order across the barrier, and
41 * smp_mb() to represent explicit memory barriers forcing full memory
42 * ordering across the barrier, we have the following ordering table for
43 * each pair of barrier(), sys_membarrier() and smp_mb():
44 *
45 * The pair ordering is detailed as (O: ordered, X: not ordered):
46 *
 47 *                        barrier()  smp_mb()  sys_membarrier()
 48 * barrier()                  X         X            O
 49 * smp_mb()                   X         O            O
 50 * sys_membarrier()           O         O            O
51 */
52SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
53{
54 if (unlikely(flags))
55 return -EINVAL;
56 switch (cmd) {
57 case MEMBARRIER_CMD_QUERY:
58 return MEMBARRIER_CMD_BITMASK;
59 case MEMBARRIER_CMD_SHARED:
60 if (num_online_cpus() > 1)
61 synchronize_sched();
62 return 0;
63 default:
64 return -EINVAL;
65 }
66}
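
From user space the new system call is reached through syscall(2). A hedged usage sketch, assuming uapi headers that already provide __NR_membarrier and <linux/membarrier.h> (the wrapper names are illustrative):

#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/membarrier.h>

static int membarrier(int cmd, int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

/* Query support once, then force a barrier across all running threads. */
static int issue_global_barrier(void)
{
	int mask = membarrier(MEMBARRIER_CMD_QUERY, 0);

	if (mask < 0 || !(mask & MEMBARRIER_CMD_SHARED))
		return -1;		/* ENOSYS or command unsupported */

	return membarrier(MEMBARRIER_CMD_SHARED, 0);
}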
diff --git a/kernel/memremap.c b/kernel/memremap.c
new file mode 100644
index 000000000000..72b0c66628b6
--- /dev/null
+++ b/kernel/memremap.c
@@ -0,0 +1,190 @@
1/*
2 * Copyright(c) 2015 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */
13#include <linux/device.h>
14#include <linux/types.h>
15#include <linux/io.h>
16#include <linux/mm.h>
17#include <linux/memory_hotplug.h>
18
19#ifndef ioremap_cache
20/* temporary while we convert existing ioremap_cache users to memremap */
21__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
22{
23 return ioremap(offset, size);
24}
25#endif
26
27/**
28 * memremap() - remap an iomem_resource as cacheable memory
29 * @offset: iomem resource start address
30 * @size: size of remap
31 * @flags: either MEMREMAP_WB or MEMREMAP_WT
32 *
33 * memremap() is "ioremap" for cases where it is known that the resource
34 * being mapped does not have i/o side effects and the __iomem
35 * annotation is not applicable.
36 *
37 * MEMREMAP_WB - matches the default mapping for "System RAM" on
38 * the architecture. This is usually a read-allocate write-back cache.
 39 * Moreover, if MEMREMAP_WB is specified and the requested remap region is RAM
40 * memremap() will bypass establishing a new mapping and instead return
41 * a pointer into the direct map.
42 *
43 * MEMREMAP_WT - establish a mapping whereby writes either bypass the
44 * cache or are written through to memory and never exist in a
45 * cache-dirty state with respect to program visibility. Attempts to
46 * map "System RAM" with this mapping type will fail.
47 */
48void *memremap(resource_size_t offset, size_t size, unsigned long flags)
49{
50 int is_ram = region_intersects(offset, size, "System RAM");
51 void *addr = NULL;
52
53 if (is_ram == REGION_MIXED) {
54 WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n",
55 &offset, (unsigned long) size);
56 return NULL;
57 }
58
59 /* Try all mapping types requested until one returns non-NULL */
60 if (flags & MEMREMAP_WB) {
61 flags &= ~MEMREMAP_WB;
62 /*
 63 * MEMREMAP_WB is special in that it can be satisfied
64 * from the direct map. Some archs depend on the
65 * capability of memremap() to autodetect cases where
66 * the requested range is potentially in "System RAM"
67 */
68 if (is_ram == REGION_INTERSECTS)
69 addr = __va(offset);
70 else
71 addr = ioremap_cache(offset, size);
72 }
73
74 /*
75 * If we don't have a mapping yet and more request flags are
76 * pending then we will be attempting to establish a new virtual
77 * address mapping. Enforce that this mapping is not aliasing
78 * "System RAM"
79 */
80 if (!addr && is_ram == REGION_INTERSECTS && flags) {
81 WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n",
82 &offset, (unsigned long) size);
83 return NULL;
84 }
85
86 if (!addr && (flags & MEMREMAP_WT)) {
87 flags &= ~MEMREMAP_WT;
88 addr = ioremap_wt(offset, size);
89 }
90
91 return addr;
92}
93EXPORT_SYMBOL(memremap);
94
95void memunmap(void *addr)
96{
97 if (is_vmalloc_addr(addr))
98 iounmap((void __iomem *) addr);
99}
100EXPORT_SYMBOL(memunmap);
101
102static void devm_memremap_release(struct device *dev, void *res)
103{
104 memunmap(res);
105}
106
107static int devm_memremap_match(struct device *dev, void *res, void *match_data)
108{
109 return *(void **)res == match_data;
110}
111
112void *devm_memremap(struct device *dev, resource_size_t offset,
113 size_t size, unsigned long flags)
114{
115 void **ptr, *addr;
116
117 ptr = devres_alloc(devm_memremap_release, sizeof(*ptr), GFP_KERNEL);
118 if (!ptr)
119 return NULL;
120
121 addr = memremap(offset, size, flags);
122 if (addr) {
123 *ptr = addr;
124 devres_add(dev, ptr);
125 } else
126 devres_free(ptr);
127
128 return addr;
129}
130EXPORT_SYMBOL(devm_memremap);
131
132void devm_memunmap(struct device *dev, void *addr)
133{
134 WARN_ON(devres_destroy(dev, devm_memremap_release, devm_memremap_match,
135 addr));
136 memunmap(addr);
137}
138EXPORT_SYMBOL(devm_memunmap);
139
140#ifdef CONFIG_ZONE_DEVICE
141struct page_map {
142 struct resource res;
143};
144
145static void devm_memremap_pages_release(struct device *dev, void *res)
146{
147 struct page_map *page_map = res;
148
149 /* pages are dead and unused, undo the arch mapping */
150 arch_remove_memory(page_map->res.start, resource_size(&page_map->res));
151}
152
153void *devm_memremap_pages(struct device *dev, struct resource *res)
154{
155 int is_ram = region_intersects(res->start, resource_size(res),
156 "System RAM");
157 struct page_map *page_map;
158 int error, nid;
159
160 if (is_ram == REGION_MIXED) {
161 WARN_ONCE(1, "%s attempted on mixed region %pr\n",
162 __func__, res);
163 return ERR_PTR(-ENXIO);
164 }
165
166 if (is_ram == REGION_INTERSECTS)
167 return __va(res->start);
168
169 page_map = devres_alloc(devm_memremap_pages_release,
170 sizeof(*page_map), GFP_KERNEL);
171 if (!page_map)
172 return ERR_PTR(-ENOMEM);
173
174 memcpy(&page_map->res, res, sizeof(*res));
175
176 nid = dev_to_node(dev);
177 if (nid < 0)
178 nid = 0;
179
180 error = arch_add_memory(nid, res->start, resource_size(res), true);
181 if (error) {
182 devres_free(page_map);
183 return ERR_PTR(error);
184 }
185
186 devres_add(dev, page_map);
187 return __va(res->start);
188}
189EXPORT_SYMBOL(devm_memremap_pages);
190#endif /* CONFIG_ZONE_DEVICE */
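
A driver that knows its range has no I/O side effects can now request a cacheable (or write-through) mapping without the __iomem annotation. A rough devm-managed sketch; the wrapper is hypothetical and assumes the declarations from this series are visible via <linux/io.h>:

#include <linux/device.h>
#include <linux/io.h>
#include <linux/ioport.h>

static void *map_side_effect_free_region(struct device *dev,
					 struct resource *res)
{
	/* Try WB first (may resolve to the direct map), then fall back to WT. */
	void *addr = devm_memremap(dev, res->start, resource_size(res),
				   MEMREMAP_WB | MEMREMAP_WT);

	if (!addr)
		dev_err(dev, "memremap of %pR failed\n", res);
	return addr;
}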
diff --git a/kernel/module.c b/kernel/module.c
index 4d2b82e610e2..b86b7bf1be38 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -602,13 +602,16 @@ const struct kernel_symbol *find_symbol(const char *name,
602} 602}
603EXPORT_SYMBOL_GPL(find_symbol); 603EXPORT_SYMBOL_GPL(find_symbol);
604 604
605/* Search for module by name: must hold module_mutex. */ 605/*
606 * Search for module by name: must hold module_mutex (or preempt disabled
607 * for read-only access).
608 */
606static struct module *find_module_all(const char *name, size_t len, 609static struct module *find_module_all(const char *name, size_t len,
607 bool even_unformed) 610 bool even_unformed)
608{ 611{
609 struct module *mod; 612 struct module *mod;
610 613
611 module_assert_mutex(); 614 module_assert_mutex_or_preempt();
612 615
613 list_for_each_entry(mod, &modules, list) { 616 list_for_each_entry(mod, &modules, list) {
614 if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) 617 if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
@@ -621,6 +624,7 @@ static struct module *find_module_all(const char *name, size_t len,
621 624
622struct module *find_module(const char *name) 625struct module *find_module(const char *name)
623{ 626{
627 module_assert_mutex();
624 return find_module_all(name, strlen(name), false); 628 return find_module_all(name, strlen(name), false);
625} 629}
626EXPORT_SYMBOL_GPL(find_module); 630EXPORT_SYMBOL_GPL(find_module);
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index be5b8fac4bd0..bd62f5cda746 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -10,11 +10,8 @@
10 */ 10 */
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/err.h>
14#include <crypto/public_key.h>
15#include <crypto/hash.h>
16#include <keys/asymmetric-type.h>
17#include <keys/system_keyring.h> 13#include <keys/system_keyring.h>
14#include <crypto/public_key.h>
18#include "module-internal.h" 15#include "module-internal.h"
19 16
20/* 17/*
@@ -28,170 +25,22 @@
28 * - Information block 25 * - Information block
29 */ 26 */
30struct module_signature { 27struct module_signature {
31 u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */ 28 u8 algo; /* Public-key crypto algorithm [0] */
32 u8 hash; /* Digest algorithm [enum hash_algo] */ 29 u8 hash; /* Digest algorithm [0] */
33 u8 id_type; /* Key identifier type [enum pkey_id_type] */ 30 u8 id_type; /* Key identifier type [PKEY_ID_PKCS7] */
34 u8 signer_len; /* Length of signer's name */ 31 u8 signer_len; /* Length of signer's name [0] */
35 u8 key_id_len; /* Length of key identifier */ 32 u8 key_id_len; /* Length of key identifier [0] */
36 u8 __pad[3]; 33 u8 __pad[3];
37 __be32 sig_len; /* Length of signature data */ 34 __be32 sig_len; /* Length of signature data */
38}; 35};
39 36
40/* 37/*
41 * Digest the module contents.
42 */
43static struct public_key_signature *mod_make_digest(enum hash_algo hash,
44 const void *mod,
45 unsigned long modlen)
46{
47 struct public_key_signature *pks;
48 struct crypto_shash *tfm;
49 struct shash_desc *desc;
50 size_t digest_size, desc_size;
51 int ret;
52
53 pr_devel("==>%s()\n", __func__);
54
55 /* Allocate the hashing algorithm we're going to need and find out how
56 * big the hash operational data will be.
57 */
58 tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0);
59 if (IS_ERR(tfm))
60 return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm);
61
62 desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
63 digest_size = crypto_shash_digestsize(tfm);
64
65 /* We allocate the hash operational data storage on the end of our
66 * context data and the digest output buffer on the end of that.
67 */
68 ret = -ENOMEM;
69 pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL);
70 if (!pks)
71 goto error_no_pks;
72
73 pks->pkey_hash_algo = hash;
74 pks->digest = (u8 *)pks + sizeof(*pks) + desc_size;
75 pks->digest_size = digest_size;
76
77 desc = (void *)pks + sizeof(*pks);
78 desc->tfm = tfm;
79 desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
80
81 ret = crypto_shash_init(desc);
82 if (ret < 0)
83 goto error;
84
85 ret = crypto_shash_finup(desc, mod, modlen, pks->digest);
86 if (ret < 0)
87 goto error;
88
89 crypto_free_shash(tfm);
90 pr_devel("<==%s() = ok\n", __func__);
91 return pks;
92
93error:
94 kfree(pks);
95error_no_pks:
96 crypto_free_shash(tfm);
97 pr_devel("<==%s() = %d\n", __func__, ret);
98 return ERR_PTR(ret);
99}
100
101/*
102 * Extract an MPI array from the signature data. This represents the actual
103 * signature. Each raw MPI is prefaced by a BE 2-byte value indicating the
104 * size of the MPI in bytes.
105 *
106 * RSA signatures only have one MPI, so currently we only read one.
107 */
108static int mod_extract_mpi_array(struct public_key_signature *pks,
109 const void *data, size_t len)
110{
111 size_t nbytes;
112 MPI mpi;
113
114 if (len < 3)
115 return -EBADMSG;
116 nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1];
117 data += 2;
118 len -= 2;
119 if (len != nbytes)
120 return -EBADMSG;
121
122 mpi = mpi_read_raw_data(data, nbytes);
123 if (!mpi)
124 return -ENOMEM;
125 pks->mpi[0] = mpi;
126 pks->nr_mpi = 1;
127 return 0;
128}
129
130/*
131 * Request an asymmetric key.
132 */
133static struct key *request_asymmetric_key(const char *signer, size_t signer_len,
134 const u8 *key_id, size_t key_id_len)
135{
136 key_ref_t key;
137 size_t i;
138 char *id, *q;
139
140 pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len);
141
142 /* Construct an identifier. */
143 id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL);
144 if (!id)
145 return ERR_PTR(-ENOKEY);
146
147 memcpy(id, signer, signer_len);
148
149 q = id + signer_len;
150 *q++ = ':';
151 *q++ = ' ';
152 for (i = 0; i < key_id_len; i++) {
153 *q++ = hex_asc[*key_id >> 4];
154 *q++ = hex_asc[*key_id++ & 0x0f];
155 }
156
157 *q = 0;
158
159 pr_debug("Look up: \"%s\"\n", id);
160
161 key = keyring_search(make_key_ref(system_trusted_keyring, 1),
162 &key_type_asymmetric, id);
163 if (IS_ERR(key))
164 pr_warn("Request for unknown module key '%s' err %ld\n",
165 id, PTR_ERR(key));
166 kfree(id);
167
168 if (IS_ERR(key)) {
169 switch (PTR_ERR(key)) {
170 /* Hide some search errors */
171 case -EACCES:
172 case -ENOTDIR:
173 case -EAGAIN:
174 return ERR_PTR(-ENOKEY);
175 default:
176 return ERR_CAST(key);
177 }
178 }
179
180 pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key)));
181 return key_ref_to_ptr(key);
182}
183
184/*
185 * Verify the signature on a module. 38 * Verify the signature on a module.
186 */ 39 */
187int mod_verify_sig(const void *mod, unsigned long *_modlen) 40int mod_verify_sig(const void *mod, unsigned long *_modlen)
188{ 41{
189 struct public_key_signature *pks;
190 struct module_signature ms; 42 struct module_signature ms;
191 struct key *key;
192 const void *sig;
193 size_t modlen = *_modlen, sig_len; 43 size_t modlen = *_modlen, sig_len;
194 int ret;
195 44
196 pr_devel("==>%s(,%zu)\n", __func__, modlen); 45 pr_devel("==>%s(,%zu)\n", __func__, modlen);
197 46
@@ -205,46 +54,24 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen)
205 if (sig_len >= modlen) 54 if (sig_len >= modlen)
206 return -EBADMSG; 55 return -EBADMSG;
207 modlen -= sig_len; 56 modlen -= sig_len;
208 if ((size_t)ms.signer_len + ms.key_id_len >= modlen)
209 return -EBADMSG;
210 modlen -= (size_t)ms.signer_len + ms.key_id_len;
211
212 *_modlen = modlen; 57 *_modlen = modlen;
213 sig = mod + modlen;
214
215 /* For the moment, only support RSA and X.509 identifiers */
216 if (ms.algo != PKEY_ALGO_RSA ||
217 ms.id_type != PKEY_ID_X509)
218 return -ENOPKG;
219 58
220 if (ms.hash >= PKEY_HASH__LAST || 59 if (ms.id_type != PKEY_ID_PKCS7) {
221 !hash_algo_name[ms.hash]) 60 pr_err("Module is not signed with expected PKCS#7 message\n");
222 return -ENOPKG; 61 return -ENOPKG;
223
224 key = request_asymmetric_key(sig, ms.signer_len,
225 sig + ms.signer_len, ms.key_id_len);
226 if (IS_ERR(key))
227 return PTR_ERR(key);
228
229 pks = mod_make_digest(ms.hash, mod, modlen);
230 if (IS_ERR(pks)) {
231 ret = PTR_ERR(pks);
232 goto error_put_key;
233 } 62 }
234 63
235 ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len, 64 if (ms.algo != 0 ||
236 sig_len); 65 ms.hash != 0 ||
237 if (ret < 0) 66 ms.signer_len != 0 ||
238 goto error_free_pks; 67 ms.key_id_len != 0 ||
239 68 ms.__pad[0] != 0 ||
240 ret = verify_signature(key, pks); 69 ms.__pad[1] != 0 ||
241 pr_devel("verify_signature() = %d\n", ret); 70 ms.__pad[2] != 0) {
71 pr_err("PKCS#7 signature info has unexpected non-zero params\n");
72 return -EBADMSG;
73 }
242 74
243error_free_pks: 75 return system_verify_data(mod, modlen, mod + modlen, sig_len,
244 mpi_free(pks->rsa.s); 76 VERIFYING_MODULE_SIGNATURE);
245 kfree(pks);
246error_put_key:
247 key_put(key);
248 pr_devel("<==%s() = %d\n", __func__, ret);
249 return ret;
250} 77}
diff --git a/kernel/notifier.c b/kernel/notifier.c
index ae9fc7cc360e..fd2c9acbcc19 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -544,6 +544,8 @@ int notrace notify_die(enum die_val val, const char *str,
544 .signr = sig, 544 .signr = sig,
545 545
546 }; 546 };
547 RCU_LOCKDEP_WARN(!rcu_is_watching(),
548 "notify_die called but RCU thinks we're quiescent");
547 return atomic_notifier_call_chain(&die_chain, val, &args); 549 return atomic_notifier_call_chain(&die_chain, val, &args);
548} 550}
549NOKPROBE_SYMBOL(notify_die); 551NOKPROBE_SYMBOL(notify_die);
diff --git a/kernel/pid.c b/kernel/pid.c
index 4fd07d5b7baf..ca368793808e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -451,9 +451,8 @@ EXPORT_SYMBOL(pid_task);
451 */ 451 */
452struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 452struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
453{ 453{
454 rcu_lockdep_assert(rcu_read_lock_held(), 454 RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
455 "find_task_by_pid_ns() needs rcu_read_lock()" 455 "find_task_by_pid_ns() needs rcu_read_lock() protection");
456 " protection");
457 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); 456 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
458} 457}
459 458
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 9e302315e33d..02e8dfaa1ce2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -18,6 +18,16 @@ config SUSPEND_FREEZER
18 18
19 Turning OFF this setting is NOT recommended! If in doubt, say Y. 19 Turning OFF this setting is NOT recommended! If in doubt, say Y.
20 20
21config SUSPEND_SKIP_SYNC
22 bool "Skip kernel's sys_sync() on suspend to RAM/standby"
23 depends on SUSPEND
24 depends on EXPERT
25 help
26 Skip the kernel sys_sync() before freezing user processes.
27 Some systems prefer not to pay this cost on every invocation
28 of suspend, or they are content with invoking sync() from
 29 user-space before invoking suspend. Say Y if that applies to your system.
30
21config HIBERNATE_CALLBACKS 31config HIBERNATE_CALLBACKS
22 bool 32 bool
23 33
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 53266b729fd9..7e4cda4a8dd9 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -484,11 +484,13 @@ static int enter_state(suspend_state_t state)
484 if (state == PM_SUSPEND_FREEZE) 484 if (state == PM_SUSPEND_FREEZE)
485 freeze_begin(); 485 freeze_begin();
486 486
487#ifndef CONFIG_SUSPEND_SKIP_SYNC
487 trace_suspend_resume(TPS("sync_filesystems"), 0, true); 488 trace_suspend_resume(TPS("sync_filesystems"), 0, true);
488 printk(KERN_INFO "PM: Syncing filesystems ... "); 489 printk(KERN_INFO "PM: Syncing filesystems ... ");
489 sys_sync(); 490 sys_sync();
490 printk("done.\n"); 491 printk("done.\n");
491 trace_suspend_resume(TPS("sync_filesystems"), 0, false); 492 trace_suspend_resume(TPS("sync_filesystems"), 0, false);
493#endif
492 494
493 pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]); 495 pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]);
494 error = suspend_prepare(state); 496 error = suspend_prepare(state);
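The new CONFIG_SUSPEND_SKIP_SYNC option simply compiles the filesystem sync out of enter_state() via the #ifndef above. For comparison only (this is not what the patch does), the same effect could be expressed with IS_ENABLED(), which keeps the code visible to the compiler in both configurations while still letting dead-code elimination drop the branch:

        if (!IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC)) {
                trace_suspend_resume(TPS("sync_filesystems"), 0, true);
                printk(KERN_INFO "PM: Syncing filesystems ... ");
                sys_sync();
                printk("done.\n");
                trace_suspend_resume(TPS("sync_filesystems"), 0, false);
        }

The preprocessor form chosen by the patch avoids even the IS_ENABLED() test, at the cost of one more #ifdef block in the function body.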
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 2f30ca91e4fa..b2066fb5b10f 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -227,27 +227,23 @@ static void hib_init_batch(struct hib_bio_batch *hb)
227 hb->error = 0; 227 hb->error = 0;
228} 228}
229 229
230static void hib_end_io(struct bio *bio, int error) 230static void hib_end_io(struct bio *bio)
231{ 231{
232 struct hib_bio_batch *hb = bio->bi_private; 232 struct hib_bio_batch *hb = bio->bi_private;
233 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
234 struct page *page = bio->bi_io_vec[0].bv_page; 233 struct page *page = bio->bi_io_vec[0].bv_page;
235 234
236 if (!uptodate || error) { 235 if (bio->bi_error) {
237 printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", 236 printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
238 imajor(bio->bi_bdev->bd_inode), 237 imajor(bio->bi_bdev->bd_inode),
239 iminor(bio->bi_bdev->bd_inode), 238 iminor(bio->bi_bdev->bd_inode),
240 (unsigned long long)bio->bi_iter.bi_sector); 239 (unsigned long long)bio->bi_iter.bi_sector);
241
242 if (!error)
243 error = -EIO;
244 } 240 }
245 241
246 if (bio_data_dir(bio) == WRITE) 242 if (bio_data_dir(bio) == WRITE)
247 put_page(page); 243 put_page(page);
248 244
249 if (error && !hb->error) 245 if (bio->bi_error && !hb->error)
250 hb->error = error; 246 hb->error = bio->bi_error;
251 if (atomic_dec_and_test(&hb->count)) 247 if (atomic_dec_and_test(&hb->count))
252 wake_up(&hb->wait); 248 wake_up(&hb->wait);
253 249
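The hib_end_io() change tracks the block layer's new completion convention: a ->bi_end_io callback no longer receives an error argument or consults BIO_UPTODATE, it reads the status from bio->bi_error instead. A hedged skeleton of a completion handler written against that interface; the batch structure and all names are invented for illustration:

        struct my_batch {                       /* hypothetical batch tracker */
                atomic_t                count;
                wait_queue_head_t       wait;
                int                     error;
        };

        /* Completion callback in the bio->bi_error style (sketch only). */
        static void my_end_io(struct bio *bio)
        {
                struct my_batch *batch = bio->bi_private;

                if (bio->bi_error && !batch->error)
                        batch->error = bio->bi_error;   /* remember the first failure */

                if (atomic_dec_and_test(&batch->count))
                        wake_up(&batch->wait);          /* last bio in the batch */
        }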
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index 019069c84ff6..1896386e16bb 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -17,6 +17,7 @@
17#include <linux/list.h> 17#include <linux/list.h>
18#include <linux/rbtree.h> 18#include <linux/rbtree.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/workqueue.h>
20 21
21#include "power.h" 22#include "power.h"
22 23
@@ -83,7 +84,9 @@ static inline void decrement_wakelocks_number(void) {}
83#define WL_GC_COUNT_MAX 100 84#define WL_GC_COUNT_MAX 100
84#define WL_GC_TIME_SEC 300 85#define WL_GC_TIME_SEC 300
85 86
87static void __wakelocks_gc(struct work_struct *work);
86static LIST_HEAD(wakelocks_lru_list); 88static LIST_HEAD(wakelocks_lru_list);
89static DECLARE_WORK(wakelock_work, __wakelocks_gc);
87static unsigned int wakelocks_gc_count; 90static unsigned int wakelocks_gc_count;
88 91
89static inline void wakelocks_lru_add(struct wakelock *wl) 92static inline void wakelocks_lru_add(struct wakelock *wl)
@@ -96,13 +99,12 @@ static inline void wakelocks_lru_most_recent(struct wakelock *wl)
96 list_move(&wl->lru, &wakelocks_lru_list); 99 list_move(&wl->lru, &wakelocks_lru_list);
97} 100}
98 101
99static void wakelocks_gc(void) 102static void __wakelocks_gc(struct work_struct *work)
100{ 103{
101 struct wakelock *wl, *aux; 104 struct wakelock *wl, *aux;
102 ktime_t now; 105 ktime_t now;
103 106
104 if (++wakelocks_gc_count <= WL_GC_COUNT_MAX) 107 mutex_lock(&wakelocks_lock);
105 return;
106 108
107 now = ktime_get(); 109 now = ktime_get();
108 list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) { 110 list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) {
@@ -127,6 +129,16 @@ static void wakelocks_gc(void)
127 } 129 }
128 } 130 }
129 wakelocks_gc_count = 0; 131 wakelocks_gc_count = 0;
132
133 mutex_unlock(&wakelocks_lock);
134}
135
136static void wakelocks_gc(void)
137{
138 if (++wakelocks_gc_count <= WL_GC_COUNT_MAX)
139 return;
140
141 schedule_work(&wakelock_work);
130} 142}
131#else /* !CONFIG_PM_WAKELOCKS_GC */ 143#else /* !CONFIG_PM_WAKELOCKS_GC */
132static inline void wakelocks_lru_add(struct wakelock *wl) {} 144static inline void wakelocks_lru_add(struct wakelock *wl) {}
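The wakelock change splits garbage collection in two: the fast path only bumps a counter and calls schedule_work(), while the actual scan runs later in process context under wakelocks_lock. The general deferral pattern, as a minimal sketch with invented names:

        #include <linux/workqueue.h>
        #include <linux/mutex.h>

        static DEFINE_MUTEX(my_lock);                   /* hypothetical lock */
        static void my_gc_worker(struct work_struct *work);
        static DECLARE_WORK(my_gc_work, my_gc_worker);

        static void my_gc_worker(struct work_struct *work)
        {
                mutex_lock(&my_lock);
                /* ... walk the LRU list and free stale entries here ... */
                mutex_unlock(&my_lock);
        }

        /* Callable from contexts where taking my_lock directly is not allowed. */
        static void my_gc_kick(void)
        {
                schedule_work(&my_gc_work);
        }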
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index cf8c24203368..8f0324ef72ab 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -835,7 +835,7 @@ const struct file_operations kmsg_fops = {
835 .release = devkmsg_release, 835 .release = devkmsg_release,
836}; 836};
837 837
838#ifdef CONFIG_KEXEC 838#ifdef CONFIG_KEXEC_CORE
839/* 839/*
840 * This appends the listed symbols to /proc/vmcore 840 * This appends the listed symbols to /proc/vmcore
841 * 841 *
diff --git a/kernel/profile.c b/kernel/profile.c
index a7bcd28d6e9f..99513e1160e5 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -339,7 +339,7 @@ static int profile_cpu_callback(struct notifier_block *info,
339 node = cpu_to_mem(cpu); 339 node = cpu_to_mem(cpu);
340 per_cpu(cpu_profile_flip, cpu) = 0; 340 per_cpu(cpu_profile_flip, cpu) = 0;
341 if (!per_cpu(cpu_profile_hits, cpu)[1]) { 341 if (!per_cpu(cpu_profile_hits, cpu)[1]) {
342 page = alloc_pages_exact_node(node, 342 page = __alloc_pages_node(node,
343 GFP_KERNEL | __GFP_ZERO, 343 GFP_KERNEL | __GFP_ZERO,
344 0); 344 0);
345 if (!page) 345 if (!page)
@@ -347,7 +347,7 @@ static int profile_cpu_callback(struct notifier_block *info,
347 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); 347 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
348 } 348 }
349 if (!per_cpu(cpu_profile_hits, cpu)[0]) { 349 if (!per_cpu(cpu_profile_hits, cpu)[0]) {
350 page = alloc_pages_exact_node(node, 350 page = __alloc_pages_node(node,
351 GFP_KERNEL | __GFP_ZERO, 351 GFP_KERNEL | __GFP_ZERO,
352 0); 352 0);
353 if (!page) 353 if (!page)
@@ -543,14 +543,14 @@ static int create_hash_tables(void)
543 int node = cpu_to_mem(cpu); 543 int node = cpu_to_mem(cpu);
544 struct page *page; 544 struct page *page;
545 545
546 page = alloc_pages_exact_node(node, 546 page = __alloc_pages_node(node,
547 GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, 547 GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
548 0); 548 0);
549 if (!page) 549 if (!page)
550 goto out_cleanup; 550 goto out_cleanup;
551 per_cpu(cpu_profile_hits, cpu)[1] 551 per_cpu(cpu_profile_hits, cpu)[1]
552 = (struct profile_hit *)page_address(page); 552 = (struct profile_hit *)page_address(page);
553 page = alloc_pages_exact_node(node, 553 page = __alloc_pages_node(node,
554 GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, 554 GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
555 0); 555 0);
556 if (!page) 556 if (!page)
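alloc_pages_exact_node() is replaced by __alloc_pages_node() with the same argument order (node, GFP mask, order), so the four conversions above are mechanical. A short hedged sketch of the call as it would appear in a caller:

        /* Allocate one zeroed page on a specific NUMA node (sketch). */
        struct page *page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
        if (!page)
                return -ENOMEM;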
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c8e0e050a36a..787320de68e0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -556,6 +556,19 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data)
556 if (data & ~(unsigned long)PTRACE_O_MASK) 556 if (data & ~(unsigned long)PTRACE_O_MASK)
557 return -EINVAL; 557 return -EINVAL;
558 558
559 if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
560 if (!config_enabled(CONFIG_CHECKPOINT_RESTORE) ||
561 !config_enabled(CONFIG_SECCOMP))
562 return -EINVAL;
563
564 if (!capable(CAP_SYS_ADMIN))
565 return -EPERM;
566
567 if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
568 current->ptrace & PT_SUSPEND_SECCOMP)
569 return -EPERM;
570 }
571
559 /* Avoid intermediate state when all opts are cleared */ 572 /* Avoid intermediate state when all opts are cleared */
560 flags = child->ptrace; 573 flags = child->ptrace;
561 flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT); 574 flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT);
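The new check gates PTRACE_O_SUSPEND_SECCOMP behind CONFIG_CHECKPOINT_RESTORE plus CONFIG_SECCOMP, CAP_SYS_ADMIN, and a tracer that neither runs under seccomp nor is itself suspended. From user space, a checkpoint/restore tool would request the option roughly as below; this is a hedged sketch, not code taken from any existing tool, and the fallback #define assumes the uapi value:

        #include <sys/ptrace.h>
        #include <sys/types.h>
        #include <errno.h>
        #include <stdio.h>

        #ifndef PTRACE_O_SUSPEND_SECCOMP
        #define PTRACE_O_SUSPEND_SECCOMP (1 << 21)      /* value assumed from uapi headers */
        #endif

        /* Ask the kernel to suspend seccomp filtering in an already-attached tracee. */
        static int suspend_seccomp(pid_t tracee)
        {
                if (ptrace(PTRACE_SETOPTIONS, tracee, 0,
                           PTRACE_O_TRACESYSGOOD | PTRACE_O_SUSPEND_SECCOMP) == -1) {
                        /* -EINVAL: kernel lacks support; -EPERM: capability/seccomp checks failed */
                        perror("PTRACE_SETOPTIONS");
                        return -errno;
                }
                return 0;
        }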
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 59e32684c23b..77192953dee5 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -635,6 +635,8 @@ static struct rcu_torture_ops sched_ops = {
635 .deferred_free = rcu_sched_torture_deferred_free, 635 .deferred_free = rcu_sched_torture_deferred_free,
636 .sync = synchronize_sched, 636 .sync = synchronize_sched,
637 .exp_sync = synchronize_sched_expedited, 637 .exp_sync = synchronize_sched_expedited,
638 .get_state = get_state_synchronize_sched,
639 .cond_sync = cond_synchronize_sched,
638 .call = call_rcu_sched, 640 .call = call_rcu_sched,
639 .cb_barrier = rcu_barrier_sched, 641 .cb_barrier = rcu_barrier_sched,
640 .fqs = rcu_sched_force_quiescent_state, 642 .fqs = rcu_sched_force_quiescent_state,
@@ -684,10 +686,20 @@ static struct rcu_torture_ops tasks_ops = {
684 686
685#define RCUTORTURE_TASKS_OPS &tasks_ops, 687#define RCUTORTURE_TASKS_OPS &tasks_ops,
686 688
689static bool __maybe_unused torturing_tasks(void)
690{
691 return cur_ops == &tasks_ops;
692}
693
687#else /* #ifdef CONFIG_TASKS_RCU */ 694#else /* #ifdef CONFIG_TASKS_RCU */
688 695
689#define RCUTORTURE_TASKS_OPS 696#define RCUTORTURE_TASKS_OPS
690 697
698static bool torturing_tasks(void)
699{
700 return false;
701}
702
691#endif /* #else #ifdef CONFIG_TASKS_RCU */ 703#endif /* #else #ifdef CONFIG_TASKS_RCU */
692 704
693/* 705/*
@@ -823,9 +835,7 @@ rcu_torture_cbflood(void *arg)
823 } 835 }
824 if (err) { 836 if (err) {
825 VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM"); 837 VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM");
826 while (!torture_must_stop()) 838 goto wait_for_stop;
827 schedule_timeout_interruptible(HZ);
828 return 0;
829 } 839 }
830 VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started"); 840 VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started");
831 do { 841 do {
@@ -844,6 +854,7 @@ rcu_torture_cbflood(void *arg)
844 stutter_wait("rcu_torture_cbflood"); 854 stutter_wait("rcu_torture_cbflood");
845 } while (!torture_must_stop()); 855 } while (!torture_must_stop());
846 vfree(rhp); 856 vfree(rhp);
857wait_for_stop:
847 torture_kthread_stopping("rcu_torture_cbflood"); 858 torture_kthread_stopping("rcu_torture_cbflood");
848 return 0; 859 return 0;
849} 860}
@@ -1088,7 +1099,8 @@ static void rcu_torture_timer(unsigned long unused)
1088 p = rcu_dereference_check(rcu_torture_current, 1099 p = rcu_dereference_check(rcu_torture_current,
1089 rcu_read_lock_bh_held() || 1100 rcu_read_lock_bh_held() ||
1090 rcu_read_lock_sched_held() || 1101 rcu_read_lock_sched_held() ||
1091 srcu_read_lock_held(srcu_ctlp)); 1102 srcu_read_lock_held(srcu_ctlp) ||
1103 torturing_tasks());
1092 if (p == NULL) { 1104 if (p == NULL) {
1093 /* Leave because rcu_torture_writer is not yet underway */ 1105 /* Leave because rcu_torture_writer is not yet underway */
1094 cur_ops->readunlock(idx); 1106 cur_ops->readunlock(idx);
@@ -1162,7 +1174,8 @@ rcu_torture_reader(void *arg)
1162 p = rcu_dereference_check(rcu_torture_current, 1174 p = rcu_dereference_check(rcu_torture_current,
1163 rcu_read_lock_bh_held() || 1175 rcu_read_lock_bh_held() ||
1164 rcu_read_lock_sched_held() || 1176 rcu_read_lock_sched_held() ||
1165 srcu_read_lock_held(srcu_ctlp)); 1177 srcu_read_lock_held(srcu_ctlp) ||
1178 torturing_tasks());
1166 if (p == NULL) { 1179 if (p == NULL) {
1167 /* Wait for rcu_torture_writer to get underway */ 1180 /* Wait for rcu_torture_writer to get underway */
1168 cur_ops->readunlock(idx); 1181 cur_ops->readunlock(idx);
@@ -1507,7 +1520,7 @@ static int rcu_torture_barrier_init(void)
1507 int i; 1520 int i;
1508 int ret; 1521 int ret;
1509 1522
1510 if (n_barrier_cbs == 0) 1523 if (n_barrier_cbs <= 0)
1511 return 0; 1524 return 0;
1512 if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { 1525 if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) {
1513 pr_alert("%s" TORTURE_FLAG 1526 pr_alert("%s" TORTURE_FLAG
@@ -1786,12 +1799,15 @@ rcu_torture_init(void)
1786 writer_task); 1799 writer_task);
1787 if (firsterr) 1800 if (firsterr)
1788 goto unwind; 1801 goto unwind;
1789 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), 1802 if (nfakewriters > 0) {
1790 GFP_KERNEL); 1803 fakewriter_tasks = kzalloc(nfakewriters *
1791 if (fakewriter_tasks == NULL) { 1804 sizeof(fakewriter_tasks[0]),
1792 VERBOSE_TOROUT_ERRSTRING("out of memory"); 1805 GFP_KERNEL);
1793 firsterr = -ENOMEM; 1806 if (fakewriter_tasks == NULL) {
1794 goto unwind; 1807 VERBOSE_TOROUT_ERRSTRING("out of memory");
1808 firsterr = -ENOMEM;
1809 goto unwind;
1810 }
1795 } 1811 }
1796 for (i = 0; i < nfakewriters; i++) { 1812 for (i = 0; i < nfakewriters; i++) {
1797 firsterr = torture_create_kthread(rcu_torture_fakewriter, 1813 firsterr = torture_create_kthread(rcu_torture_fakewriter,
@@ -1818,7 +1834,7 @@ rcu_torture_init(void)
1818 if (firsterr) 1834 if (firsterr)
1819 goto unwind; 1835 goto unwind;
1820 } 1836 }
1821 if (test_no_idle_hz) { 1837 if (test_no_idle_hz && shuffle_interval > 0) {
1822 firsterr = torture_shuffle_init(shuffle_interval * HZ); 1838 firsterr = torture_shuffle_init(shuffle_interval * HZ);
1823 if (firsterr) 1839 if (firsterr)
1824 goto unwind; 1840 goto unwind;
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index fb33d35ee0b7..d3fcb2ec8536 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -252,14 +252,15 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
252} 252}
253 253
254/** 254/**
 255 * srcu_readers_active - returns approximate number of readers. 255 * srcu_readers_active - returns true if there are readers, and false
 256 * otherwise
256 * @sp: which srcu_struct to count active readers (holding srcu_read_lock). 257 * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
257 * 258 *
258 * Note that this is not an atomic primitive, and can therefore suffer 259 * Note that this is not an atomic primitive, and can therefore suffer
259 * severe errors when invoked on an active srcu_struct. That said, it 260 * severe errors when invoked on an active srcu_struct. That said, it
260 * can be useful as an error check at cleanup time. 261 * can be useful as an error check at cleanup time.
261 */ 262 */
262static int srcu_readers_active(struct srcu_struct *sp) 263static bool srcu_readers_active(struct srcu_struct *sp)
263{ 264{
264 int cpu; 265 int cpu;
265 unsigned long sum = 0; 266 unsigned long sum = 0;
@@ -414,11 +415,11 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
414 struct rcu_head *head = &rcu.head; 415 struct rcu_head *head = &rcu.head;
415 bool done = false; 416 bool done = false;
416 417
417 rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && 418 RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) ||
418 !lock_is_held(&rcu_bh_lock_map) && 419 lock_is_held(&rcu_bh_lock_map) ||
419 !lock_is_held(&rcu_lock_map) && 420 lock_is_held(&rcu_lock_map) ||
420 !lock_is_held(&rcu_sched_lock_map), 421 lock_is_held(&rcu_sched_lock_map),
421 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); 422 "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
422 423
423 might_sleep(); 424 might_sleep();
424 init_completion(&rcu.completion); 425 init_completion(&rcu.completion);
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index c291bd65d2cb..d0471056d0af 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -191,10 +191,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)
191 */ 191 */
192void synchronize_sched(void) 192void synchronize_sched(void)
193{ 193{
194 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && 194 RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
195 !lock_is_held(&rcu_lock_map) && 195 lock_is_held(&rcu_lock_map) ||
196 !lock_is_held(&rcu_sched_lock_map), 196 lock_is_held(&rcu_sched_lock_map),
197 "Illegal synchronize_sched() in RCU read-side critical section"); 197 "Illegal synchronize_sched() in RCU read-side critical section");
198 cond_resched(); 198 cond_resched();
199} 199}
200EXPORT_SYMBOL_GPL(synchronize_sched); 200EXPORT_SYMBOL_GPL(synchronize_sched);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 65137bc28b2b..9f75f25cc5d9 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -70,6 +70,8 @@ MODULE_ALIAS("rcutree");
70 70
71static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; 71static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
72static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; 72static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
73static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
74static struct lock_class_key rcu_exp_sched_class[RCU_NUM_LVLS];
73 75
74/* 76/*
75 * In order to export the rcu_state name to the tracing tools, it 77 * In order to export the rcu_state name to the tracing tools, it
@@ -124,13 +126,8 @@ module_param(rcu_fanout_exact, bool, 0444);
124static int rcu_fanout_leaf = RCU_FANOUT_LEAF; 126static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
125module_param(rcu_fanout_leaf, int, 0444); 127module_param(rcu_fanout_leaf, int, 0444);
126int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; 128int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
127static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ 129/* Number of rcu_nodes at specified level. */
128 NUM_RCU_LVL_0, 130static int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
129 NUM_RCU_LVL_1,
130 NUM_RCU_LVL_2,
131 NUM_RCU_LVL_3,
132 NUM_RCU_LVL_4,
133};
134int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ 131int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
135 132
136/* 133/*
@@ -649,12 +646,12 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
649 * It is illegal to enter an extended quiescent state while 646 * It is illegal to enter an extended quiescent state while
650 * in an RCU read-side critical section. 647 * in an RCU read-side critical section.
651 */ 648 */
652 rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), 649 RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map),
653 "Illegal idle entry in RCU read-side critical section."); 650 "Illegal idle entry in RCU read-side critical section.");
654 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map), 651 RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map),
655 "Illegal idle entry in RCU-bh read-side critical section."); 652 "Illegal idle entry in RCU-bh read-side critical section.");
656 rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map), 653 RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map),
657 "Illegal idle entry in RCU-sched read-side critical section."); 654 "Illegal idle entry in RCU-sched read-side critical section.");
658} 655}
659 656
660/* 657/*
@@ -701,7 +698,7 @@ void rcu_idle_enter(void)
701} 698}
702EXPORT_SYMBOL_GPL(rcu_idle_enter); 699EXPORT_SYMBOL_GPL(rcu_idle_enter);
703 700
704#ifdef CONFIG_RCU_USER_QS 701#ifdef CONFIG_NO_HZ_FULL
705/** 702/**
706 * rcu_user_enter - inform RCU that we are resuming userspace. 703 * rcu_user_enter - inform RCU that we are resuming userspace.
707 * 704 *
@@ -714,7 +711,7 @@ void rcu_user_enter(void)
714{ 711{
715 rcu_eqs_enter(1); 712 rcu_eqs_enter(1);
716} 713}
717#endif /* CONFIG_RCU_USER_QS */ 714#endif /* CONFIG_NO_HZ_FULL */
718 715
719/** 716/**
720 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle 717 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
@@ -828,7 +825,7 @@ void rcu_idle_exit(void)
828} 825}
829EXPORT_SYMBOL_GPL(rcu_idle_exit); 826EXPORT_SYMBOL_GPL(rcu_idle_exit);
830 827
831#ifdef CONFIG_RCU_USER_QS 828#ifdef CONFIG_NO_HZ_FULL
832/** 829/**
833 * rcu_user_exit - inform RCU that we are exiting userspace. 830 * rcu_user_exit - inform RCU that we are exiting userspace.
834 * 831 *
@@ -839,7 +836,7 @@ void rcu_user_exit(void)
839{ 836{
840 rcu_eqs_exit(1); 837 rcu_eqs_exit(1);
841} 838}
842#endif /* CONFIG_RCU_USER_QS */ 839#endif /* CONFIG_NO_HZ_FULL */
843 840
844/** 841/**
845 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle 842 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
@@ -978,9 +975,9 @@ bool notrace rcu_is_watching(void)
978{ 975{
979 bool ret; 976 bool ret;
980 977
981 preempt_disable(); 978 preempt_disable_notrace();
982 ret = __rcu_is_watching(); 979 ret = __rcu_is_watching();
983 preempt_enable(); 980 preempt_enable_notrace();
984 return ret; 981 return ret;
985} 982}
986EXPORT_SYMBOL_GPL(rcu_is_watching); 983EXPORT_SYMBOL_GPL(rcu_is_watching);
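rcu_is_watching() is notrace and can be called from the function tracer itself, so the plain preempt_disable()/preempt_enable() pair is switched to the _notrace variants to avoid recursing back into tracing through preemption accounting. The pattern for any helper reachable from tracing callbacks, sketched with invented names:

        static DEFINE_PER_CPU(bool, my_state);          /* hypothetical per-CPU flag */

        /* Hypothetical helper that must be safe to call from ftrace callbacks. */
        static bool notrace my_state_sample(void)
        {
                bool ret;

                preempt_disable_notrace();      /* no tracing hooks taken here */
                ret = __this_cpu_read(my_state);
                preempt_enable_notrace();
                return ret;
        }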
@@ -1178,9 +1175,11 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
1178 j = jiffies; 1175 j = jiffies;
1179 gpa = READ_ONCE(rsp->gp_activity); 1176 gpa = READ_ONCE(rsp->gp_activity);
1180 if (j - gpa > 2 * HZ) 1177 if (j - gpa > 2 * HZ)
1181 pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x\n", 1178 pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n",
1182 rsp->name, j - gpa, 1179 rsp->name, j - gpa,
1183 rsp->gpnum, rsp->completed, rsp->gp_flags); 1180 rsp->gpnum, rsp->completed,
1181 rsp->gp_flags, rsp->gp_state,
1182 rsp->gp_kthread ? rsp->gp_kthread->state : 0);
1184} 1183}
1185 1184
1186/* 1185/*
@@ -1906,6 +1905,26 @@ static int rcu_gp_init(struct rcu_state *rsp)
1906} 1905}
1907 1906
1908/* 1907/*
1908 * Helper function for wait_event_interruptible_timeout() wakeup
1909 * at force-quiescent-state time.
1910 */
1911static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
1912{
1913 struct rcu_node *rnp = rcu_get_root(rsp);
1914
1915 /* Someone like call_rcu() requested a force-quiescent-state scan. */
1916 *gfp = READ_ONCE(rsp->gp_flags);
1917 if (*gfp & RCU_GP_FLAG_FQS)
1918 return true;
1919
1920 /* The current grace period has completed. */
1921 if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp))
1922 return true;
1923
1924 return false;
1925}
1926
1927/*
1909 * Do one round of quiescent-state forcing. 1928 * Do one round of quiescent-state forcing.
1910 */ 1929 */
1911static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) 1930static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
@@ -2041,6 +2060,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
2041 wait_event_interruptible(rsp->gp_wq, 2060 wait_event_interruptible(rsp->gp_wq,
2042 READ_ONCE(rsp->gp_flags) & 2061 READ_ONCE(rsp->gp_flags) &
2043 RCU_GP_FLAG_INIT); 2062 RCU_GP_FLAG_INIT);
2063 rsp->gp_state = RCU_GP_DONE_GPS;
2044 /* Locking provides needed memory barrier. */ 2064 /* Locking provides needed memory barrier. */
2045 if (rcu_gp_init(rsp)) 2065 if (rcu_gp_init(rsp))
2046 break; 2066 break;
@@ -2068,11 +2088,8 @@ static int __noreturn rcu_gp_kthread(void *arg)
2068 TPS("fqswait")); 2088 TPS("fqswait"));
2069 rsp->gp_state = RCU_GP_WAIT_FQS; 2089 rsp->gp_state = RCU_GP_WAIT_FQS;
2070 ret = wait_event_interruptible_timeout(rsp->gp_wq, 2090 ret = wait_event_interruptible_timeout(rsp->gp_wq,
2071 ((gf = READ_ONCE(rsp->gp_flags)) & 2091 rcu_gp_fqs_check_wake(rsp, &gf), j);
2072 RCU_GP_FLAG_FQS) || 2092 rsp->gp_state = RCU_GP_DOING_FQS;
2073 (!READ_ONCE(rnp->qsmask) &&
2074 !rcu_preempt_blocked_readers_cgp(rnp)),
2075 j);
2076 /* Locking provides needed memory barriers. */ 2093 /* Locking provides needed memory barriers. */
2077 /* If grace period done, leave loop. */ 2094 /* If grace period done, leave loop. */
2078 if (!READ_ONCE(rnp->qsmask) && 2095 if (!READ_ONCE(rnp->qsmask) &&
@@ -2110,7 +2127,9 @@ static int __noreturn rcu_gp_kthread(void *arg)
2110 } 2127 }
2111 2128
2112 /* Handle grace-period end. */ 2129 /* Handle grace-period end. */
2130 rsp->gp_state = RCU_GP_CLEANUP;
2113 rcu_gp_cleanup(rsp); 2131 rcu_gp_cleanup(rsp);
2132 rsp->gp_state = RCU_GP_CLEANED;
2114 } 2133 }
2115} 2134}
2116 2135
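The force-quiescent-state wait previously open-coded its wake-up condition inside wait_event_interruptible_timeout(); the patch hoists it into rcu_gp_fqs_check_wake(), which also reports the sampled ->gp_flags back through a pointer. The shape of that refactoring, as a hedged sketch with invented names and types:

        struct my_state {                       /* hypothetical state block */
                wait_queue_head_t wq;
                int flags;
                int work_left;
        };
        #define MY_FLAG_KICK 0x1

        /* Predicate helper: re-evaluated each time the waiter is woken (sketch). */
        static bool my_wake_check(struct my_state *st, int *flags_out)
        {
                *flags_out = READ_ONCE(st->flags);
                if (*flags_out & MY_FLAG_KICK)          /* explicit request */
                        return true;
                return READ_ONCE(st->work_left) == 0;   /* or nothing left to do */
        }

        static void my_wait(struct my_state *st, unsigned long timeout)
        {
                int flags;

                wait_event_interruptible_timeout(st->wq,
                                                 my_wake_check(st, &flags),
                                                 timeout);
                /* 'flags' now holds the value sampled by the predicate */
        }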
@@ -3161,10 +3180,10 @@ static inline int rcu_blocking_is_gp(void)
3161 */ 3180 */
3162void synchronize_sched(void) 3181void synchronize_sched(void)
3163{ 3182{
3164 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && 3183 RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
3165 !lock_is_held(&rcu_lock_map) && 3184 lock_is_held(&rcu_lock_map) ||
3166 !lock_is_held(&rcu_sched_lock_map), 3185 lock_is_held(&rcu_sched_lock_map),
3167 "Illegal synchronize_sched() in RCU-sched read-side critical section"); 3186 "Illegal synchronize_sched() in RCU-sched read-side critical section");
3168 if (rcu_blocking_is_gp()) 3187 if (rcu_blocking_is_gp())
3169 return; 3188 return;
3170 if (rcu_gp_is_expedited()) 3189 if (rcu_gp_is_expedited())
@@ -3188,10 +3207,10 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
3188 */ 3207 */
3189void synchronize_rcu_bh(void) 3208void synchronize_rcu_bh(void)
3190{ 3209{
3191 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && 3210 RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
3192 !lock_is_held(&rcu_lock_map) && 3211 lock_is_held(&rcu_lock_map) ||
3193 !lock_is_held(&rcu_sched_lock_map), 3212 lock_is_held(&rcu_sched_lock_map),
3194 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); 3213 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
3195 if (rcu_blocking_is_gp()) 3214 if (rcu_blocking_is_gp())
3196 return; 3215 return;
3197 if (rcu_gp_is_expedited()) 3216 if (rcu_gp_is_expedited())
@@ -3253,23 +3272,247 @@ void cond_synchronize_rcu(unsigned long oldstate)
3253} 3272}
3254EXPORT_SYMBOL_GPL(cond_synchronize_rcu); 3273EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
3255 3274
3256static int synchronize_sched_expedited_cpu_stop(void *data) 3275/**
3276 * get_state_synchronize_sched - Snapshot current RCU-sched state
3277 *
3278 * Returns a cookie that is used by a later call to cond_synchronize_sched()
3279 * to determine whether or not a full grace period has elapsed in the
3280 * meantime.
3281 */
3282unsigned long get_state_synchronize_sched(void)
3257{ 3283{
3258 /* 3284 /*
3259 * There must be a full memory barrier on each affected CPU 3285 * Any prior manipulation of RCU-protected data must happen
3260 * between the time that try_stop_cpus() is called and the 3286 * before the load from ->gpnum.
3261 * time that it returns. 3287 */
3262 * 3288 smp_mb(); /* ^^^ */
3263 * In the current initial implementation of cpu_stop, the 3289
3264 * above condition is already met when the control reaches 3290 /*
3265 * this point and the following smp_mb() is not strictly 3291 * Make sure this load happens before the purportedly
3266 * necessary. Do smp_mb() anyway for documentation and 3292 * time-consuming work between get_state_synchronize_sched()
3267 * robustness against future implementation changes. 3293 * and cond_synchronize_sched().
3294 */
3295 return smp_load_acquire(&rcu_sched_state.gpnum);
3296}
3297EXPORT_SYMBOL_GPL(get_state_synchronize_sched);
3298
3299/**
3300 * cond_synchronize_sched - Conditionally wait for an RCU-sched grace period
3301 *
3302 * @oldstate: return value from earlier call to get_state_synchronize_sched()
3303 *
3304 * If a full RCU-sched grace period has elapsed since the earlier call to
3305 * get_state_synchronize_sched(), just return. Otherwise, invoke
3306 * synchronize_sched() to wait for a full grace period.
3307 *
3308 * Yes, this function does not take counter wrap into account. But
3309 * counter wrap is harmless. If the counter wraps, we have waited for
3310 * more than 2 billion grace periods (and way more on a 64-bit system!),
3311 * so waiting for one additional grace period should be just fine.
3312 */
3313void cond_synchronize_sched(unsigned long oldstate)
3314{
3315 unsigned long newstate;
3316
3317 /*
3318 * Ensure that this load happens before any RCU-destructive
3319 * actions the caller might carry out after we return.
3268 */ 3320 */
3269 smp_mb(); /* See above comment block. */ 3321 newstate = smp_load_acquire(&rcu_sched_state.completed);
3322 if (ULONG_CMP_GE(oldstate, newstate))
3323 synchronize_sched();
3324}
3325EXPORT_SYMBOL_GPL(cond_synchronize_sched);
3326
3327/* Adjust sequence number for start of update-side operation. */
3328static void rcu_seq_start(unsigned long *sp)
3329{
3330 WRITE_ONCE(*sp, *sp + 1);
3331 smp_mb(); /* Ensure update-side operation after counter increment. */
3332 WARN_ON_ONCE(!(*sp & 0x1));
3333}
3334
3335/* Adjust sequence number for end of update-side operation. */
3336static void rcu_seq_end(unsigned long *sp)
3337{
3338 smp_mb(); /* Ensure update-side operation before counter increment. */
3339 WRITE_ONCE(*sp, *sp + 1);
3340 WARN_ON_ONCE(*sp & 0x1);
3341}
3342
3343/* Take a snapshot of the update side's sequence number. */
3344static unsigned long rcu_seq_snap(unsigned long *sp)
3345{
3346 unsigned long s;
3347
3348 smp_mb(); /* Caller's modifications seen first by other CPUs. */
3349 s = (READ_ONCE(*sp) + 3) & ~0x1;
3350 smp_mb(); /* Above access must not bleed into critical section. */
3351 return s;
3352}
3353
3354/*
3355 * Given a snapshot from rcu_seq_snap(), determine whether or not a
3356 * full update-side operation has occurred.
3357 */
3358static bool rcu_seq_done(unsigned long *sp, unsigned long s)
3359{
3360 return ULONG_CMP_GE(READ_ONCE(*sp), s);
3361}
3362
3363/* Wrapper functions for expedited grace periods. */
3364static void rcu_exp_gp_seq_start(struct rcu_state *rsp)
3365{
3366 rcu_seq_start(&rsp->expedited_sequence);
3367}
3368static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
3369{
3370 rcu_seq_end(&rsp->expedited_sequence);
3371 smp_mb(); /* Ensure that consecutive grace periods serialize. */
3372}
3373static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
3374{
3375 return rcu_seq_snap(&rsp->expedited_sequence);
3376}
3377static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
3378{
3379 return rcu_seq_done(&rsp->expedited_sequence, s);
3380}
3381
3382/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
3383static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp,
3384 struct rcu_data *rdp,
3385 atomic_long_t *stat, unsigned long s)
3386{
3387 if (rcu_exp_gp_seq_done(rsp, s)) {
3388 if (rnp)
3389 mutex_unlock(&rnp->exp_funnel_mutex);
3390 else if (rdp)
3391 mutex_unlock(&rdp->exp_funnel_mutex);
3392 /* Ensure test happens before caller kfree(). */
3393 smp_mb__before_atomic(); /* ^^^ */
3394 atomic_long_inc(stat);
3395 return true;
3396 }
3397 return false;
3398}
3399
3400/*
3401 * Funnel-lock acquisition for expedited grace periods. Returns a
3402 * pointer to the root rcu_node structure, or NULL if some other
3403 * task did the expedited grace period for us.
3404 */
3405static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
3406{
3407 struct rcu_data *rdp;
3408 struct rcu_node *rnp0;
3409 struct rcu_node *rnp1 = NULL;
3410
3411 /*
3412 * First try directly acquiring the root lock in order to reduce
3413 * latency in the common case where expedited grace periods are
3414 * rare. We check mutex_is_locked() to avoid pathological levels of
3415 * memory contention on ->exp_funnel_mutex in the heavy-load case.
3416 */
3417 rnp0 = rcu_get_root(rsp);
3418 if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) {
3419 if (mutex_trylock(&rnp0->exp_funnel_mutex)) {
3420 if (sync_exp_work_done(rsp, rnp0, NULL,
3421 &rsp->expedited_workdone0, s))
3422 return NULL;
3423 return rnp0;
3424 }
3425 }
3426
3427 /*
3428 * Each pass through the following loop works its way
3429 * up the rcu_node tree, returning if others have done the
3430 * work or otherwise falls through holding the root rnp's
3431 * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure
3432 * can be inexact, as it is just promoting locality and is not
3433 * strictly needed for correctness.
3434 */
3435 rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
3436 if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s))
3437 return NULL;
3438 mutex_lock(&rdp->exp_funnel_mutex);
3439 rnp0 = rdp->mynode;
3440 for (; rnp0 != NULL; rnp0 = rnp0->parent) {
3441 if (sync_exp_work_done(rsp, rnp1, rdp,
3442 &rsp->expedited_workdone2, s))
3443 return NULL;
3444 mutex_lock(&rnp0->exp_funnel_mutex);
3445 if (rnp1)
3446 mutex_unlock(&rnp1->exp_funnel_mutex);
3447 else
3448 mutex_unlock(&rdp->exp_funnel_mutex);
3449 rnp1 = rnp0;
3450 }
3451 if (sync_exp_work_done(rsp, rnp1, rdp,
3452 &rsp->expedited_workdone3, s))
3453 return NULL;
3454 return rnp1;
3455}
3456
3457/* Invoked on each online non-idle CPU for expedited quiescent state. */
3458static int synchronize_sched_expedited_cpu_stop(void *data)
3459{
3460 struct rcu_data *rdp = data;
3461 struct rcu_state *rsp = rdp->rsp;
3462
3463 /* We are here: If we are last, do the wakeup. */
3464 rdp->exp_done = true;
3465 if (atomic_dec_and_test(&rsp->expedited_need_qs))
3466 wake_up(&rsp->expedited_wq);
3270 return 0; 3467 return 0;
3271} 3468}
3272 3469
3470static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
3471{
3472 int cpu;
3473 unsigned long jiffies_stall;
3474 unsigned long jiffies_start;
3475 struct rcu_data *rdp;
3476 int ret;
3477
3478 jiffies_stall = rcu_jiffies_till_stall_check();
3479 jiffies_start = jiffies;
3480
3481 for (;;) {
3482 ret = wait_event_interruptible_timeout(
3483 rsp->expedited_wq,
3484 !atomic_read(&rsp->expedited_need_qs),
3485 jiffies_stall);
3486 if (ret > 0)
3487 return;
3488 if (ret < 0) {
3489 /* Hit a signal, disable CPU stall warnings. */
3490 wait_event(rsp->expedited_wq,
3491 !atomic_read(&rsp->expedited_need_qs));
3492 return;
3493 }
3494 pr_err("INFO: %s detected expedited stalls on CPUs: {",
3495 rsp->name);
3496 for_each_online_cpu(cpu) {
3497 rdp = per_cpu_ptr(rsp->rda, cpu);
3498
3499 if (rdp->exp_done)
3500 continue;
3501 pr_cont(" %d", cpu);
3502 }
3503 pr_cont(" } %lu jiffies s: %lu\n",
3504 jiffies - jiffies_start, rsp->expedited_sequence);
3505 for_each_online_cpu(cpu) {
3506 rdp = per_cpu_ptr(rsp->rda, cpu);
3507
3508 if (rdp->exp_done)
3509 continue;
3510 dump_cpu_task(cpu);
3511 }
3512 jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
3513 }
3514}
3515
3273/** 3516/**
3274 * synchronize_sched_expedited - Brute-force RCU-sched grace period 3517 * synchronize_sched_expedited - Brute-force RCU-sched grace period
3275 * 3518 *
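The rcu_seq_start()/rcu_seq_end()/rcu_seq_snap()/rcu_seq_done() quartet added in the hunk above is a tiny sequence-count protocol: the counter is odd while an update is in flight and even otherwise, and a snapshot taken at value s is satisfied once the counter reaches (s + 3) & ~0x1, that is, the end of the first update that begins strictly after the snapshot. A hedged userspace re-implementation, with the kernel's memory barriers and ULONG_CMP_GE wrap handling omitted, just to see the arithmetic in action:

        #include <stdbool.h>
        #include <stdio.h>

        static unsigned long seq;

        static unsigned long seq_snap(void)   { return (seq + 3) & ~0x1UL; }
        static void seq_start(void)           { seq++; }  /* now odd: update running */
        static void seq_end(void)             { seq++; }  /* now even: update done   */
        static bool seq_done(unsigned long s) { return seq >= s; }

        int main(void)
        {
                unsigned long s = seq_snap();   /* seq==0 -> s==2 */

                printf("snap=%lu done=%d\n", s, seq_done(s));   /* not done yet */
                seq_start();                                    /* seq==1 */
                seq_end();                                      /* seq==2 */
                printf("snap=%lu done=%d\n", s, seq_done(s));   /* done now */
                return 0;
        }

The same protocol backs both expedited_sequence and, later in this diff, barrier_sequence, which is why the open-coded n_barrier_done bookkeeping can be deleted.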
@@ -3281,58 +3524,21 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
3281 * restructure your code to batch your updates, and then use a single 3524 * restructure your code to batch your updates, and then use a single
3282 * synchronize_sched() instead. 3525 * synchronize_sched() instead.
3283 * 3526 *
3284 * This implementation can be thought of as an application of ticket 3527 * This implementation can be thought of as an application of sequence
3285 * locking to RCU, with sync_sched_expedited_started and 3528 * locking to expedited grace periods, but using the sequence counter to
3286 * sync_sched_expedited_done taking on the roles of the halves 3529 * determine when someone else has already done the work instead of for
3287 * of the ticket-lock word. Each task atomically increments 3530 * retrying readers.
3288 * sync_sched_expedited_started upon entry, snapshotting the old value,
3289 * then attempts to stop all the CPUs. If this succeeds, then each
3290 * CPU will have executed a context switch, resulting in an RCU-sched
3291 * grace period. We are then done, so we use atomic_cmpxchg() to
3292 * update sync_sched_expedited_done to match our snapshot -- but
3293 * only if someone else has not already advanced past our snapshot.
3294 *
3295 * On the other hand, if try_stop_cpus() fails, we check the value
3296 * of sync_sched_expedited_done. If it has advanced past our
3297 * initial snapshot, then someone else must have forced a grace period
3298 * some time after we took our snapshot. In this case, our work is
3299 * done for us, and we can simply return. Otherwise, we try again,
3300 * but keep our initial snapshot for purposes of checking for someone
3301 * doing our work for us.
3302 *
3303 * If we fail too many times in a row, we fall back to synchronize_sched().
3304 */ 3531 */
3305void synchronize_sched_expedited(void) 3532void synchronize_sched_expedited(void)
3306{ 3533{
3307 cpumask_var_t cm;
3308 bool cma = false;
3309 int cpu; 3534 int cpu;
3310 long firstsnap, s, snap; 3535 unsigned long s;
3311 int trycount = 0; 3536 struct rcu_node *rnp;
3312 struct rcu_state *rsp = &rcu_sched_state; 3537 struct rcu_state *rsp = &rcu_sched_state;
3313 3538
3314 /* 3539 /* Take a snapshot of the sequence number. */
3315 * If we are in danger of counter wrap, just do synchronize_sched(). 3540 s = rcu_exp_gp_seq_snap(rsp);
3316 * By allowing sync_sched_expedited_started to advance no more than
3317 * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
3318 * that more than 3.5 billion CPUs would be required to force a
3319 * counter wrap on a 32-bit system. Quite a few more CPUs would of
3320 * course be required on a 64-bit system.
3321 */
3322 if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
3323 (ulong)atomic_long_read(&rsp->expedited_done) +
3324 ULONG_MAX / 8)) {
3325 wait_rcu_gp(call_rcu_sched);
3326 atomic_long_inc(&rsp->expedited_wrap);
3327 return;
3328 }
3329 3541
3330 /*
3331 * Take a ticket. Note that atomic_inc_return() implies a
3332 * full memory barrier.
3333 */
3334 snap = atomic_long_inc_return(&rsp->expedited_start);
3335 firstsnap = snap;
3336 if (!try_get_online_cpus()) { 3542 if (!try_get_online_cpus()) {
3337 /* CPU hotplug operation in flight, fall back to normal GP. */ 3543 /* CPU hotplug operation in flight, fall back to normal GP. */
3338 wait_rcu_gp(call_rcu_sched); 3544 wait_rcu_gp(call_rcu_sched);
@@ -3341,100 +3547,38 @@ void synchronize_sched_expedited(void)
3341 } 3547 }
3342 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); 3548 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
3343 3549
3344 /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */ 3550 rnp = exp_funnel_lock(rsp, s);
3345 cma = zalloc_cpumask_var(&cm, GFP_KERNEL); 3551 if (rnp == NULL) {
3346 if (cma) { 3552 put_online_cpus();
3347 cpumask_copy(cm, cpu_online_mask); 3553 return; /* Someone else did our work for us. */
3348 cpumask_clear_cpu(raw_smp_processor_id(), cm);
3349 for_each_cpu(cpu, cm) {
3350 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
3351
3352 if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
3353 cpumask_clear_cpu(cpu, cm);
3354 }
3355 if (cpumask_weight(cm) == 0)
3356 goto all_cpus_idle;
3357 } 3554 }
3358 3555
3359 /* 3556 rcu_exp_gp_seq_start(rsp);
3360 * Each pass through the following loop attempts to force a
3361 * context switch on each CPU.
3362 */
3363 while (try_stop_cpus(cma ? cm : cpu_online_mask,
3364 synchronize_sched_expedited_cpu_stop,
3365 NULL) == -EAGAIN) {
3366 put_online_cpus();
3367 atomic_long_inc(&rsp->expedited_tryfail);
3368
3369 /* Check to see if someone else did our work for us. */
3370 s = atomic_long_read(&rsp->expedited_done);
3371 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
3372 /* ensure test happens before caller kfree */
3373 smp_mb__before_atomic(); /* ^^^ */
3374 atomic_long_inc(&rsp->expedited_workdone1);
3375 free_cpumask_var(cm);
3376 return;
3377 }
3378 3557
3379 /* No joy, try again later. Or just synchronize_sched(). */ 3558 /* Stop each CPU that is online, non-idle, and not us. */
3380 if (trycount++ < 10) { 3559 init_waitqueue_head(&rsp->expedited_wq);
3381 udelay(trycount * num_online_cpus()); 3560 atomic_set(&rsp->expedited_need_qs, 1); /* Extra count avoids race. */
3382 } else { 3561 for_each_online_cpu(cpu) {
3383 wait_rcu_gp(call_rcu_sched); 3562 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
3384 atomic_long_inc(&rsp->expedited_normal); 3563 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
3385 free_cpumask_var(cm);
3386 return;
3387 }
3388 3564
3389 /* Recheck to see if someone else did our work for us. */ 3565 rdp->exp_done = false;
3390 s = atomic_long_read(&rsp->expedited_done);
3391 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
3392 /* ensure test happens before caller kfree */
3393 smp_mb__before_atomic(); /* ^^^ */
3394 atomic_long_inc(&rsp->expedited_workdone2);
3395 free_cpumask_var(cm);
3396 return;
3397 }
3398 3566
3399 /* 3567 /* Skip our CPU and any idle CPUs. */
3400 * Refetching sync_sched_expedited_started allows later 3568 if (raw_smp_processor_id() == cpu ||
3401 * callers to piggyback on our grace period. We retry 3569 !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
3402 * after they started, so our grace period works for them, 3570 continue;
3403 * and they started after our first try, so their grace 3571 atomic_inc(&rsp->expedited_need_qs);
3404 * period works for us. 3572 stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop,
3405 */ 3573 rdp, &rdp->exp_stop_work);
3406 if (!try_get_online_cpus()) {
3407 /* CPU hotplug operation in flight, use normal GP. */
3408 wait_rcu_gp(call_rcu_sched);
3409 atomic_long_inc(&rsp->expedited_normal);
3410 free_cpumask_var(cm);
3411 return;
3412 }
3413 snap = atomic_long_read(&rsp->expedited_start);
3414 smp_mb(); /* ensure read is before try_stop_cpus(). */
3415 } 3574 }
3416 atomic_long_inc(&rsp->expedited_stoppedcpus);
3417 3575
3418all_cpus_idle: 3576 /* Remove extra count and, if necessary, wait for CPUs to stop. */
3419 free_cpumask_var(cm); 3577 if (!atomic_dec_and_test(&rsp->expedited_need_qs))
3578 synchronize_sched_expedited_wait(rsp);
3420 3579
3421 /* 3580 rcu_exp_gp_seq_end(rsp);
3422 * Everyone up to our most recent fetch is covered by our grace 3581 mutex_unlock(&rnp->exp_funnel_mutex);
3423 * period. Update the counter, but only if our work is still
3424 * relevant -- which it won't be if someone who started later
3425 * than we did already did their update.
3426 */
3427 do {
3428 atomic_long_inc(&rsp->expedited_done_tries);
3429 s = atomic_long_read(&rsp->expedited_done);
3430 if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
3431 /* ensure test happens before caller kfree */
3432 smp_mb__before_atomic(); /* ^^^ */
3433 atomic_long_inc(&rsp->expedited_done_lost);
3434 break;
3435 }
3436 } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
3437 atomic_long_inc(&rsp->expedited_done_exit);
3438 3582
3439 put_online_cpus(); 3583 put_online_cpus();
3440} 3584}
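With get_state_synchronize_sched() and cond_synchronize_sched() exported above, a caller can snapshot the grace-period state, perform unrelated slow work, and only pay for a full synchronize_sched() if no grace period completed in the meantime. A hedged caller-side sketch; the object and the unpublish/cleanup helpers are hypothetical:

        /* Hypothetical teardown path: the slow cleanup work between the snapshot
         * and the conditional wait often lets a grace period elapse "for free". */
        static void my_teardown(struct my_object *obj)
        {
                unsigned long cookie;

                unpublish_object(obj);                  /* hypothetical: no new readers  */
                cookie = get_state_synchronize_sched(); /* snapshot current GP state     */

                do_expensive_cleanup(obj);              /* slow work; GP may complete now */

                cond_synchronize_sched(cookie);         /* waits only if it did not      */
                kfree(obj);
        }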
@@ -3571,10 +3715,10 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
3571 struct rcu_state *rsp = rdp->rsp; 3715 struct rcu_state *rsp = rdp->rsp;
3572 3716
3573 if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { 3717 if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
3574 _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done); 3718 _rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence);
3575 complete(&rsp->barrier_completion); 3719 complete(&rsp->barrier_completion);
3576 } else { 3720 } else {
3577 _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done); 3721 _rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence);
3578 } 3722 }
3579} 3723}
3580 3724
@@ -3586,7 +3730,7 @@ static void rcu_barrier_func(void *type)
3586 struct rcu_state *rsp = type; 3730 struct rcu_state *rsp = type;
3587 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); 3731 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
3588 3732
3589 _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); 3733 _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence);
3590 atomic_inc(&rsp->barrier_cpu_count); 3734 atomic_inc(&rsp->barrier_cpu_count);
3591 rsp->call(&rdp->barrier_head, rcu_barrier_callback); 3735 rsp->call(&rdp->barrier_head, rcu_barrier_callback);
3592} 3736}
@@ -3599,55 +3743,24 @@ static void _rcu_barrier(struct rcu_state *rsp)
3599{ 3743{
3600 int cpu; 3744 int cpu;
3601 struct rcu_data *rdp; 3745 struct rcu_data *rdp;
3602 unsigned long snap = READ_ONCE(rsp->n_barrier_done); 3746 unsigned long s = rcu_seq_snap(&rsp->barrier_sequence);
3603 unsigned long snap_done;
3604 3747
3605 _rcu_barrier_trace(rsp, "Begin", -1, snap); 3748 _rcu_barrier_trace(rsp, "Begin", -1, s);
3606 3749
3607 /* Take mutex to serialize concurrent rcu_barrier() requests. */ 3750 /* Take mutex to serialize concurrent rcu_barrier() requests. */
3608 mutex_lock(&rsp->barrier_mutex); 3751 mutex_lock(&rsp->barrier_mutex);
3609 3752
3610 /* 3753 /* Did someone else do our work for us? */
3611 * Ensure that all prior references, including to ->n_barrier_done, 3754 if (rcu_seq_done(&rsp->barrier_sequence, s)) {
3612 * are ordered before the _rcu_barrier() machinery. 3755 _rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence);
3613 */
3614 smp_mb(); /* See above block comment. */
3615
3616 /*
3617 * Recheck ->n_barrier_done to see if others did our work for us.
3618 * This means checking ->n_barrier_done for an even-to-odd-to-even
3619 * transition. The "if" expression below therefore rounds the old
3620 * value up to the next even number and adds two before comparing.
3621 */
3622 snap_done = rsp->n_barrier_done;
3623 _rcu_barrier_trace(rsp, "Check", -1, snap_done);
3624
3625 /*
3626 * If the value in snap is odd, we needed to wait for the current
3627 * rcu_barrier() to complete, then wait for the next one, in other
3628 * words, we need the value of snap_done to be three larger than
3629 * the value of snap. On the other hand, if the value in snap is
3630 * even, we only had to wait for the next rcu_barrier() to complete,
3631 * in other words, we need the value of snap_done to be only two
3632 * greater than the value of snap. The "(snap + 3) & ~0x1" computes
3633 * this for us (thank you, Linus!).
3634 */
3635 if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) {
3636 _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
3637 smp_mb(); /* caller's subsequent code after above check. */ 3756 smp_mb(); /* caller's subsequent code after above check. */
3638 mutex_unlock(&rsp->barrier_mutex); 3757 mutex_unlock(&rsp->barrier_mutex);
3639 return; 3758 return;
3640 } 3759 }
3641 3760
3642 /* 3761 /* Mark the start of the barrier operation. */
3643 * Increment ->n_barrier_done to avoid duplicate work. Use 3762 rcu_seq_start(&rsp->barrier_sequence);
3644 * WRITE_ONCE() to prevent the compiler from speculating 3763 _rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence);
3645 * the increment to precede the early-exit check.
3646 */
3647 WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1);
3648 WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
3649 _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
3650 smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
3651 3764
3652 /* 3765 /*
3653 * Initialize the count to one rather than to zero in order to 3766 * Initialize the count to one rather than to zero in order to
@@ -3671,10 +3784,10 @@ static void _rcu_barrier(struct rcu_state *rsp)
3671 if (rcu_is_nocb_cpu(cpu)) { 3784 if (rcu_is_nocb_cpu(cpu)) {
3672 if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { 3785 if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) {
3673 _rcu_barrier_trace(rsp, "OfflineNoCB", cpu, 3786 _rcu_barrier_trace(rsp, "OfflineNoCB", cpu,
3674 rsp->n_barrier_done); 3787 rsp->barrier_sequence);
3675 } else { 3788 } else {
3676 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, 3789 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
3677 rsp->n_barrier_done); 3790 rsp->barrier_sequence);
3678 smp_mb__before_atomic(); 3791 smp_mb__before_atomic();
3679 atomic_inc(&rsp->barrier_cpu_count); 3792 atomic_inc(&rsp->barrier_cpu_count);
3680 __call_rcu(&rdp->barrier_head, 3793 __call_rcu(&rdp->barrier_head,
@@ -3682,11 +3795,11 @@ static void _rcu_barrier(struct rcu_state *rsp)
3682 } 3795 }
3683 } else if (READ_ONCE(rdp->qlen)) { 3796 } else if (READ_ONCE(rdp->qlen)) {
3684 _rcu_barrier_trace(rsp, "OnlineQ", cpu, 3797 _rcu_barrier_trace(rsp, "OnlineQ", cpu,
3685 rsp->n_barrier_done); 3798 rsp->barrier_sequence);
3686 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); 3799 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
3687 } else { 3800 } else {
3688 _rcu_barrier_trace(rsp, "OnlineNQ", cpu, 3801 _rcu_barrier_trace(rsp, "OnlineNQ", cpu,
3689 rsp->n_barrier_done); 3802 rsp->barrier_sequence);
3690 } 3803 }
3691 } 3804 }
3692 put_online_cpus(); 3805 put_online_cpus();
@@ -3698,16 +3811,13 @@ static void _rcu_barrier(struct rcu_state *rsp)
3698 if (atomic_dec_and_test(&rsp->barrier_cpu_count)) 3811 if (atomic_dec_and_test(&rsp->barrier_cpu_count))
3699 complete(&rsp->barrier_completion); 3812 complete(&rsp->barrier_completion);
3700 3813
3701 /* Increment ->n_barrier_done to prevent duplicate work. */
3702 smp_mb(); /* Keep increment after above mechanism. */
3703 WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1);
3704 WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
3705 _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
3706 smp_mb(); /* Keep increment before caller's subsequent code. */
3707
3708 /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ 3814 /* Wait for all rcu_barrier_callback() callbacks to be invoked. */
3709 wait_for_completion(&rsp->barrier_completion); 3815 wait_for_completion(&rsp->barrier_completion);
3710 3816
3817 /* Mark the end of the barrier operation. */
3818 _rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence);
3819 rcu_seq_end(&rsp->barrier_sequence);
3820
3711 /* Other rcu_barrier() invocations can now safely proceed. */ 3821 /* Other rcu_barrier() invocations can now safely proceed. */
3712 mutex_unlock(&rsp->barrier_mutex); 3822 mutex_unlock(&rsp->barrier_mutex);
3713} 3823}
@@ -3770,6 +3880,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
3770 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); 3880 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
3771 rdp->cpu = cpu; 3881 rdp->cpu = cpu;
3772 rdp->rsp = rsp; 3882 rdp->rsp = rsp;
3883 mutex_init(&rdp->exp_funnel_mutex);
3773 rcu_boot_init_nocb_percpu_data(rdp); 3884 rcu_boot_init_nocb_percpu_data(rdp);
3774 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3885 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3775} 3886}
@@ -3961,22 +4072,22 @@ void rcu_scheduler_starting(void)
3961 * Compute the per-level fanout, either using the exact fanout specified 4072 * Compute the per-level fanout, either using the exact fanout specified
3962 * or balancing the tree, depending on the rcu_fanout_exact boot parameter. 4073 * or balancing the tree, depending on the rcu_fanout_exact boot parameter.
3963 */ 4074 */
3964static void __init rcu_init_levelspread(struct rcu_state *rsp) 4075static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt)
3965{ 4076{
3966 int i; 4077 int i;
3967 4078
3968 if (rcu_fanout_exact) { 4079 if (rcu_fanout_exact) {
3969 rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; 4080 levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
3970 for (i = rcu_num_lvls - 2; i >= 0; i--) 4081 for (i = rcu_num_lvls - 2; i >= 0; i--)
3971 rsp->levelspread[i] = RCU_FANOUT; 4082 levelspread[i] = RCU_FANOUT;
3972 } else { 4083 } else {
3973 int ccur; 4084 int ccur;
3974 int cprv; 4085 int cprv;
3975 4086
3976 cprv = nr_cpu_ids; 4087 cprv = nr_cpu_ids;
3977 for (i = rcu_num_lvls - 1; i >= 0; i--) { 4088 for (i = rcu_num_lvls - 1; i >= 0; i--) {
3978 ccur = rsp->levelcnt[i]; 4089 ccur = levelcnt[i];
3979 rsp->levelspread[i] = (cprv + ccur - 1) / ccur; 4090 levelspread[i] = (cprv + ccur - 1) / ccur;
3980 cprv = ccur; 4091 cprv = ccur;
3981 } 4092 }
3982 } 4093 }
@@ -3988,23 +4099,20 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
3988static void __init rcu_init_one(struct rcu_state *rsp, 4099static void __init rcu_init_one(struct rcu_state *rsp,
3989 struct rcu_data __percpu *rda) 4100 struct rcu_data __percpu *rda)
3990{ 4101{
3991 static const char * const buf[] = { 4102 static const char * const buf[] = RCU_NODE_NAME_INIT;
3992 "rcu_node_0", 4103 static const char * const fqs[] = RCU_FQS_NAME_INIT;
3993 "rcu_node_1", 4104 static const char * const exp[] = RCU_EXP_NAME_INIT;
3994 "rcu_node_2", 4105 static const char * const exp_sched[] = RCU_EXP_SCHED_NAME_INIT;
3995 "rcu_node_3" }; /* Match MAX_RCU_LVLS */
3996 static const char * const fqs[] = {
3997 "rcu_node_fqs_0",
3998 "rcu_node_fqs_1",
3999 "rcu_node_fqs_2",
4000 "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
4001 static u8 fl_mask = 0x1; 4106 static u8 fl_mask = 0x1;
4107
4108 int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */
4109 int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
4002 int cpustride = 1; 4110 int cpustride = 1;
4003 int i; 4111 int i;
4004 int j; 4112 int j;
4005 struct rcu_node *rnp; 4113 struct rcu_node *rnp;
4006 4114
4007 BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ 4115 BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
4008 4116
4009 /* Silence gcc 4.8 false positive about array index out of range. */ 4117 /* Silence gcc 4.8 false positive about array index out of range. */
4010 if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS) 4118 if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS)
@@ -4013,19 +4121,19 @@ static void __init rcu_init_one(struct rcu_state *rsp,
4013 /* Initialize the level-tracking arrays. */ 4121 /* Initialize the level-tracking arrays. */
4014 4122
4015 for (i = 0; i < rcu_num_lvls; i++) 4123 for (i = 0; i < rcu_num_lvls; i++)
4016 rsp->levelcnt[i] = num_rcu_lvl[i]; 4124 levelcnt[i] = num_rcu_lvl[i];
4017 for (i = 1; i < rcu_num_lvls; i++) 4125 for (i = 1; i < rcu_num_lvls; i++)
4018 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; 4126 rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1];
4019 rcu_init_levelspread(rsp); 4127 rcu_init_levelspread(levelspread, levelcnt);
4020 rsp->flavor_mask = fl_mask; 4128 rsp->flavor_mask = fl_mask;
4021 fl_mask <<= 1; 4129 fl_mask <<= 1;
4022 4130
4023 /* Initialize the elements themselves, starting from the leaves. */ 4131 /* Initialize the elements themselves, starting from the leaves. */
4024 4132
4025 for (i = rcu_num_lvls - 1; i >= 0; i--) { 4133 for (i = rcu_num_lvls - 1; i >= 0; i--) {
4026 cpustride *= rsp->levelspread[i]; 4134 cpustride *= levelspread[i];
4027 rnp = rsp->level[i]; 4135 rnp = rsp->level[i];
4028 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 4136 for (j = 0; j < levelcnt[i]; j++, rnp++) {
4029 raw_spin_lock_init(&rnp->lock); 4137 raw_spin_lock_init(&rnp->lock);
4030 lockdep_set_class_and_name(&rnp->lock, 4138 lockdep_set_class_and_name(&rnp->lock,
4031 &rcu_node_class[i], buf[i]); 4139 &rcu_node_class[i], buf[i]);
@@ -4045,14 +4153,23 @@ static void __init rcu_init_one(struct rcu_state *rsp,
4045 rnp->grpmask = 0; 4153 rnp->grpmask = 0;
4046 rnp->parent = NULL; 4154 rnp->parent = NULL;
4047 } else { 4155 } else {
4048 rnp->grpnum = j % rsp->levelspread[i - 1]; 4156 rnp->grpnum = j % levelspread[i - 1];
4049 rnp->grpmask = 1UL << rnp->grpnum; 4157 rnp->grpmask = 1UL << rnp->grpnum;
4050 rnp->parent = rsp->level[i - 1] + 4158 rnp->parent = rsp->level[i - 1] +
4051 j / rsp->levelspread[i - 1]; 4159 j / levelspread[i - 1];
4052 } 4160 }
4053 rnp->level = i; 4161 rnp->level = i;
4054 INIT_LIST_HEAD(&rnp->blkd_tasks); 4162 INIT_LIST_HEAD(&rnp->blkd_tasks);
4055 rcu_init_one_nocb(rnp); 4163 rcu_init_one_nocb(rnp);
4164 mutex_init(&rnp->exp_funnel_mutex);
4165 if (rsp == &rcu_sched_state)
4166 lockdep_set_class_and_name(
4167 &rnp->exp_funnel_mutex,
4168 &rcu_exp_sched_class[i], exp_sched[i]);
4169 else
4170 lockdep_set_class_and_name(
4171 &rnp->exp_funnel_mutex,
4172 &rcu_exp_class[i], exp[i]);
4056 } 4173 }
4057 } 4174 }
4058 4175
@@ -4076,9 +4193,7 @@ static void __init rcu_init_geometry(void)
4076{ 4193{
4077 ulong d; 4194 ulong d;
4078 int i; 4195 int i;
4079 int j; 4196 int rcu_capacity[RCU_NUM_LVLS];
4080 int n = nr_cpu_ids;
4081 int rcu_capacity[MAX_RCU_LVLS + 1];
4082 4197
4083 /* 4198 /*
4084 * Initialize any unspecified boot parameters. 4199 * Initialize any unspecified boot parameters.
@@ -4101,47 +4216,49 @@ static void __init rcu_init_geometry(void)
4101 rcu_fanout_leaf, nr_cpu_ids); 4216 rcu_fanout_leaf, nr_cpu_ids);
4102 4217
4103 /* 4218 /*
4104 * Compute number of nodes that can be handled an rcu_node tree
4105 * with the given number of levels. Setting rcu_capacity[0] makes
4106 * some of the arithmetic easier.
4107 */
4108 rcu_capacity[0] = 1;
4109 rcu_capacity[1] = rcu_fanout_leaf;
4110 for (i = 2; i <= MAX_RCU_LVLS; i++)
4111 rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT;
4112
4113 /*
4114 * The boot-time rcu_fanout_leaf parameter is only permitted 4219 * The boot-time rcu_fanout_leaf parameter is only permitted
4115 * to increase the leaf-level fanout, not decrease it. Of course, 4220 * to increase the leaf-level fanout, not decrease it. Of course,
4116 * the leaf-level fanout cannot exceed the number of bits in 4221 * the leaf-level fanout cannot exceed the number of bits in
4117 * the rcu_node masks. Finally, the tree must be able to accommodate 4222 * the rcu_node masks. Complain and fall back to the compile-
4118 * the configured number of CPUs. Complain and fall back to the 4223 * time values if these limits are exceeded.
4119 * compile-time values if these limits are exceeded.
4120 */ 4224 */
4121 if (rcu_fanout_leaf < RCU_FANOUT_LEAF || 4225 if (rcu_fanout_leaf < RCU_FANOUT_LEAF ||
4122 rcu_fanout_leaf > sizeof(unsigned long) * 8 || 4226 rcu_fanout_leaf > sizeof(unsigned long) * 8) {
4123 n > rcu_capacity[MAX_RCU_LVLS]) { 4227 rcu_fanout_leaf = RCU_FANOUT_LEAF;
4124 WARN_ON(1); 4228 WARN_ON(1);
4125 return; 4229 return;
4126 } 4230 }
4127 4231
4232 /*
 4233 * Compute number of nodes that can be handled by an rcu_node tree
4234 * with the given number of levels.
4235 */
4236 rcu_capacity[0] = rcu_fanout_leaf;
4237 for (i = 1; i < RCU_NUM_LVLS; i++)
4238 rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT;
4239
4240 /*
4241 * The tree must be able to accommodate the configured number of CPUs.
 4242 * If this limit is exceeded then we have a serious problem elsewhere.
4243 */
4244 if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1])
4245 panic("rcu_init_geometry: rcu_capacity[] is too small");
4246
4247 /* Calculate the number of levels in the tree. */
4248 for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) {
4249 }
4250 rcu_num_lvls = i + 1;
4251
4128 /* Calculate the number of rcu_nodes at each level of the tree. */ 4252 /* Calculate the number of rcu_nodes at each level of the tree. */
4129 for (i = 1; i <= MAX_RCU_LVLS; i++) 4253 for (i = 0; i < rcu_num_lvls; i++) {
4130 if (n <= rcu_capacity[i]) { 4254 int cap = rcu_capacity[(rcu_num_lvls - 1) - i];
4131 for (j = 0; j <= i; j++) 4255 num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap);
4132 num_rcu_lvl[j] = 4256 }
4133 DIV_ROUND_UP(n, rcu_capacity[i - j]);
4134 rcu_num_lvls = i;
4135 for (j = i + 1; j <= MAX_RCU_LVLS; j++)
4136 num_rcu_lvl[j] = 0;
4137 break;
4138 }
4139 4257
4140 /* Calculate the total number of rcu_node structures. */ 4258 /* Calculate the total number of rcu_node structures. */
4141 rcu_num_nodes = 0; 4259 rcu_num_nodes = 0;
4142 for (i = 0; i <= MAX_RCU_LVLS; i++) 4260 for (i = 0; i < rcu_num_lvls; i++)
4143 rcu_num_nodes += num_rcu_lvl[i]; 4261 rcu_num_nodes += num_rcu_lvl[i];
4144 rcu_num_nodes -= n;
4145} 4262}
4146 4263
4147/* 4264/*
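The rewritten rcu_init_geometry() above builds the tree shape in three steps: it fills rcu_capacity[] bottom-up (index 0 is now the leaf capacity rather than a sentinel 1), panics outright if nr_cpu_ids cannot fit instead of silently falling back, then picks the smallest number of levels and derives num_rcu_lvl[] top-down, no longer counting the per-CPU leaves that the old code subtracted back out. A minimal stand-alone sketch of the same arithmetic follows; it is user-space C for illustration only, and the fanout and CPU-count values are invented for the example.

/* Stand-alone model of the reworked geometry computation (user space,
 * illustrative only; fanout and CPU-count values are invented). */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define RCU_FANOUT		64	/* assumed interior fanout */
#define RCU_FANOUT_LEAF		16	/* assumed leaf fanout */
#define RCU_NUM_LVLS		4

int main(void)
{
	int nr_cpu_ids = 6000;			/* example CPU count */
	int rcu_capacity[RCU_NUM_LVLS];
	int num_rcu_lvl[RCU_NUM_LVLS];
	int rcu_num_lvls, rcu_num_nodes = 0;
	int i;

	/* Capacity of a tree with i+1 levels, the leaf level included. */
	rcu_capacity[0] = RCU_FANOUT_LEAF;
	for (i = 1; i < RCU_NUM_LVLS; i++)
		rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT;

	/* The kernel panics here instead of silently falling back. */
	if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1])
		return 1;

	/* Smallest number of levels that can hold nr_cpu_ids CPUs. */
	for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++)
		;
	rcu_num_lvls = i + 1;

	/* Nodes per level, root (level 0) first. */
	for (i = 0; i < rcu_num_lvls; i++) {
		int cap = rcu_capacity[(rcu_num_lvls - 1) - i];

		num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap);
		rcu_num_nodes += num_rcu_lvl[i];
		printf("level %d: %d node(s)\n", i, num_rcu_lvl[i]);
	}
	printf("levels=%d, rcu_node structures=%d\n", rcu_num_lvls, rcu_num_nodes);
	return 0;
}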
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 4adb7ca0bf47..2e991f8361e4 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -27,6 +27,7 @@
27#include <linux/threads.h> 27#include <linux/threads.h>
28#include <linux/cpumask.h> 28#include <linux/cpumask.h>
29#include <linux/seqlock.h> 29#include <linux/seqlock.h>
30#include <linux/stop_machine.h>
30 31
31/* 32/*
32 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and 33 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
@@ -36,8 +37,6 @@
36 * Of course, your mileage may vary. 37 * Of course, your mileage may vary.
37 */ 38 */
38 39
39#define MAX_RCU_LVLS 4
40
41#ifdef CONFIG_RCU_FANOUT 40#ifdef CONFIG_RCU_FANOUT
42#define RCU_FANOUT CONFIG_RCU_FANOUT 41#define RCU_FANOUT CONFIG_RCU_FANOUT
43#else /* #ifdef CONFIG_RCU_FANOUT */ 42#else /* #ifdef CONFIG_RCU_FANOUT */
@@ -66,38 +65,53 @@
66#if NR_CPUS <= RCU_FANOUT_1 65#if NR_CPUS <= RCU_FANOUT_1
67# define RCU_NUM_LVLS 1 66# define RCU_NUM_LVLS 1
68# define NUM_RCU_LVL_0 1 67# define NUM_RCU_LVL_0 1
69# define NUM_RCU_LVL_1 (NR_CPUS) 68# define NUM_RCU_NODES NUM_RCU_LVL_0
70# define NUM_RCU_LVL_2 0 69# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 }
71# define NUM_RCU_LVL_3 0 70# define RCU_NODE_NAME_INIT { "rcu_node_0" }
72# define NUM_RCU_LVL_4 0 71# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" }
72# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" }
73# define RCU_EXP_SCHED_NAME_INIT \
74 { "rcu_node_exp_sched_0" }
73#elif NR_CPUS <= RCU_FANOUT_2 75#elif NR_CPUS <= RCU_FANOUT_2
74# define RCU_NUM_LVLS 2 76# define RCU_NUM_LVLS 2
75# define NUM_RCU_LVL_0 1 77# define NUM_RCU_LVL_0 1
76# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) 78# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
77# define NUM_RCU_LVL_2 (NR_CPUS) 79# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1)
78# define NUM_RCU_LVL_3 0 80# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
79# define NUM_RCU_LVL_4 0 81# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" }
82# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" }
83# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" }
84# define RCU_EXP_SCHED_NAME_INIT \
85 { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1" }
80#elif NR_CPUS <= RCU_FANOUT_3 86#elif NR_CPUS <= RCU_FANOUT_3
81# define RCU_NUM_LVLS 3 87# define RCU_NUM_LVLS 3
82# define NUM_RCU_LVL_0 1 88# define NUM_RCU_LVL_0 1
83# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) 89# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
84# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) 90# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
85# define NUM_RCU_LVL_3 (NR_CPUS) 91# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2)
86# define NUM_RCU_LVL_4 0 92# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
93# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
94# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
95# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" }
96# define RCU_EXP_SCHED_NAME_INIT \
97 { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2" }
87#elif NR_CPUS <= RCU_FANOUT_4 98#elif NR_CPUS <= RCU_FANOUT_4
88# define RCU_NUM_LVLS 4 99# define RCU_NUM_LVLS 4
89# define NUM_RCU_LVL_0 1 100# define NUM_RCU_LVL_0 1
90# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) 101# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
91# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) 102# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
92# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) 103# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
93# define NUM_RCU_LVL_4 (NR_CPUS) 104# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
105# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
106# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
107# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
108# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" }
109# define RCU_EXP_SCHED_NAME_INIT \
110 { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2", "rcu_node_exp_sched_3" }
94#else 111#else
95# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" 112# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
96#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ 113#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
97 114
98#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
99#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
100
101extern int rcu_num_lvls; 115extern int rcu_num_lvls;
102extern int rcu_num_nodes; 116extern int rcu_num_nodes;
103 117
@@ -236,6 +250,8 @@ struct rcu_node {
236 int need_future_gp[2]; 250 int need_future_gp[2];
237 /* Counts of upcoming no-CB GP requests. */ 251 /* Counts of upcoming no-CB GP requests. */
238 raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; 252 raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
253
254 struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp;
239} ____cacheline_internodealigned_in_smp; 255} ____cacheline_internodealigned_in_smp;
240 256
241/* 257/*
@@ -287,12 +303,13 @@ struct rcu_data {
287 bool gpwrap; /* Possible gpnum/completed wrap. */ 303 bool gpwrap; /* Possible gpnum/completed wrap. */
288 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 304 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
289 unsigned long grpmask; /* Mask to apply to leaf qsmask. */ 305 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
290#ifdef CONFIG_RCU_CPU_STALL_INFO
291 unsigned long ticks_this_gp; /* The number of scheduling-clock */ 306 unsigned long ticks_this_gp; /* The number of scheduling-clock */
292 /* ticks this CPU has handled */ 307 /* ticks this CPU has handled */
293 /* during and after the last grace */ 308 /* during and after the last grace */
294 /* period it is aware of. */ 309 /* period it is aware of. */
295#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ 310 struct cpu_stop_work exp_stop_work;
311 /* Expedited grace-period control */
312 /* for CPU stopping. */
296 313
297 /* 2) batch handling */ 314 /* 2) batch handling */
298 /* 315 /*
@@ -355,11 +372,13 @@ struct rcu_data {
355 unsigned long n_rp_nocb_defer_wakeup; 372 unsigned long n_rp_nocb_defer_wakeup;
356 unsigned long n_rp_need_nothing; 373 unsigned long n_rp_need_nothing;
357 374
358 /* 6) _rcu_barrier() and OOM callbacks. */ 375 /* 6) _rcu_barrier(), OOM callbacks, and expediting. */
359 struct rcu_head barrier_head; 376 struct rcu_head barrier_head;
360#ifdef CONFIG_RCU_FAST_NO_HZ 377#ifdef CONFIG_RCU_FAST_NO_HZ
361 struct rcu_head oom_head; 378 struct rcu_head oom_head;
362#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 379#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
380 struct mutex exp_funnel_mutex;
381 bool exp_done; /* Expedited QS for this CPU? */
363 382
364 /* 7) Callback offloading. */ 383 /* 7) Callback offloading. */
365#ifdef CONFIG_RCU_NOCB_CPU 384#ifdef CONFIG_RCU_NOCB_CPU
@@ -387,9 +406,7 @@ struct rcu_data {
387#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 406#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
388 407
389 /* 8) RCU CPU stall data. */ 408 /* 8) RCU CPU stall data. */
390#ifdef CONFIG_RCU_CPU_STALL_INFO
391 unsigned int softirq_snap; /* Snapshot of softirq activity. */ 409 unsigned int softirq_snap; /* Snapshot of softirq activity. */
392#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
393 410
394 int cpu; 411 int cpu;
395 struct rcu_state *rsp; 412 struct rcu_state *rsp;
@@ -442,9 +459,9 @@ do { \
442 */ 459 */
443struct rcu_state { 460struct rcu_state {
444 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ 461 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
445 struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ 462 struct rcu_node *level[RCU_NUM_LVLS + 1];
446 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ 463 /* Hierarchy levels (+1 to */
447 u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ 464 /* shut bogus gcc warning) */
448 u8 flavor_mask; /* bit in flavor mask. */ 465 u8 flavor_mask; /* bit in flavor mask. */
449 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ 466 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
450 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ 467 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
@@ -479,21 +496,18 @@ struct rcu_state {
479 struct mutex barrier_mutex; /* Guards barrier fields. */ 496 struct mutex barrier_mutex; /* Guards barrier fields. */
480 atomic_t barrier_cpu_count; /* # CPUs waiting on. */ 497 atomic_t barrier_cpu_count; /* # CPUs waiting on. */
481 struct completion barrier_completion; /* Wake at barrier end. */ 498 struct completion barrier_completion; /* Wake at barrier end. */
482 unsigned long n_barrier_done; /* ++ at start and end of */ 499 unsigned long barrier_sequence; /* ++ at start and end of */
483 /* _rcu_barrier(). */ 500 /* _rcu_barrier(). */
484 /* End of fields guarded by barrier_mutex. */ 501 /* End of fields guarded by barrier_mutex. */
485 502
486 atomic_long_t expedited_start; /* Starting ticket. */ 503 unsigned long expedited_sequence; /* Take a ticket. */
487 atomic_long_t expedited_done; /* Done ticket. */ 504 atomic_long_t expedited_workdone0; /* # done by others #0. */
488 atomic_long_t expedited_wrap; /* # near-wrap incidents. */
489 atomic_long_t expedited_tryfail; /* # acquisition failures. */
490 atomic_long_t expedited_workdone1; /* # done by others #1. */ 505 atomic_long_t expedited_workdone1; /* # done by others #1. */
491 atomic_long_t expedited_workdone2; /* # done by others #2. */ 506 atomic_long_t expedited_workdone2; /* # done by others #2. */
507 atomic_long_t expedited_workdone3; /* # done by others #3. */
492 atomic_long_t expedited_normal; /* # fallbacks to normal. */ 508 atomic_long_t expedited_normal; /* # fallbacks to normal. */
493 atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */ 509 atomic_t expedited_need_qs; /* # CPUs left to check in. */
494 atomic_long_t expedited_done_tries; /* # tries to update _done. */ 510 wait_queue_head_t expedited_wq; /* Wait for check-ins. */
495 atomic_long_t expedited_done_lost; /* # times beaten to _done. */
496 atomic_long_t expedited_done_exit; /* # times exited _done loop. */
497 511
498 unsigned long jiffies_force_qs; /* Time at which to invoke */ 512 unsigned long jiffies_force_qs; /* Time at which to invoke */
499 /* force_quiescent_state(). */ 513 /* force_quiescent_state(). */
@@ -527,7 +541,11 @@ struct rcu_state {
527/* Values for rcu_state structure's gp_flags field. */ 541/* Values for rcu_state structure's gp_flags field. */
528#define RCU_GP_WAIT_INIT 0 /* Initial state. */ 542#define RCU_GP_WAIT_INIT 0 /* Initial state. */
529#define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ 543#define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */
530#define RCU_GP_WAIT_FQS 2 /* Wait for force-quiescent-state time. */ 544#define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */
545#define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */
546#define RCU_GP_DOING_FQS 4 /* Wait done for force-quiescent-state time. */
547#define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */
548#define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */
531 549
532extern struct list_head rcu_struct_flavors; 550extern struct list_head rcu_struct_flavors;
533 551
@@ -635,3 +653,15 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
635#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ 653#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
636} 654}
637#endif /* #ifdef CONFIG_RCU_TRACE */ 655#endif /* #ifdef CONFIG_RCU_TRACE */
656
657/*
658 * Place this after a lock-acquisition primitive to guarantee that
659 * an UNLOCK+LOCK pair act as a full barrier. This guarantee applies
660 * if the UNLOCK and LOCK are executed by the same CPU or if the
661 * UNLOCK and LOCK operate on the same lock variable.
662 */
663#ifdef CONFIG_PPC
664#define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */
665#else /* #ifdef CONFIG_PPC */
666#define smp_mb__after_unlock_lock() do { } while (0)
667#endif /* #else #ifdef CONFIG_PPC */
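In the rcu_state changes above, the expedited_start/expedited_done ticket pair is replaced by a single expedited_sequence counter plus a wait queue. Assuming the usual even/odd convention (odd while an expedited grace period is in flight, even otherwise), a caller can snapshot the value the counter must reach before a grace period starting after the snapshot has certainly completed. The toy model below is not kernel code: the helper names are invented, and counter wraparound (ULONG_CMP_GE in the kernel) is ignored for brevity.

/* Toy model of the even/odd expedited sequence counter (illustrative only). */
#include <stdio.h>

static unsigned long expedited_sequence;	/* even: idle, odd: GP running */

static unsigned long exp_seq_snap(void)
{
	/* Value the counter must reach before a full grace period has
	 * elapsed since this call: the next even value that lies beyond
	 * any grace period already in flight. */
	return (expedited_sequence + 3) & ~0x1UL;
}

static int exp_seq_done(unsigned long snap)
{
	return expedited_sequence >= snap;	/* kernel uses ULONG_CMP_GE */
}

static void exp_seq_start(void) { expedited_sequence++; }	/* now odd */
static void exp_seq_end(void)   { expedited_sequence++; }	/* now even */

int main(void)
{
	unsigned long s = exp_seq_snap();

	printf("snap=%lu done=%d\n", s, exp_seq_done(s));	/* not yet */
	exp_seq_start();
	exp_seq_end();			/* one full grace period elapses */
	printf("snap=%lu done=%d\n", s, exp_seq_done(s));	/* done now */
	return 0;
}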
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 013485fb2b06..b2bf3963a0ae 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -82,10 +82,8 @@ static void __init rcu_bootup_announce_oddness(void)
82 pr_info("\tRCU lockdep checking is enabled.\n"); 82 pr_info("\tRCU lockdep checking is enabled.\n");
83 if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE)) 83 if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE))
84 pr_info("\tRCU torture testing starts during boot.\n"); 84 pr_info("\tRCU torture testing starts during boot.\n");
85 if (IS_ENABLED(CONFIG_RCU_CPU_STALL_INFO)) 85 if (RCU_NUM_LVLS >= 4)
86 pr_info("\tAdditional per-CPU info printed with stalls.\n"); 86 pr_info("\tFour(or more)-level hierarchy is enabled.\n");
87 if (NUM_RCU_LVL_4 != 0)
88 pr_info("\tFour-level hierarchy is enabled.\n");
89 if (RCU_FANOUT_LEAF != 16) 87 if (RCU_FANOUT_LEAF != 16)
90 pr_info("\tBuild-time adjustment of leaf fanout to %d.\n", 88 pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
91 RCU_FANOUT_LEAF); 89 RCU_FANOUT_LEAF);
@@ -418,8 +416,6 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
418 rcu_print_detail_task_stall_rnp(rnp); 416 rcu_print_detail_task_stall_rnp(rnp);
419} 417}
420 418
421#ifdef CONFIG_RCU_CPU_STALL_INFO
422
423static void rcu_print_task_stall_begin(struct rcu_node *rnp) 419static void rcu_print_task_stall_begin(struct rcu_node *rnp)
424{ 420{
425 pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", 421 pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
@@ -431,18 +427,6 @@ static void rcu_print_task_stall_end(void)
431 pr_cont("\n"); 427 pr_cont("\n");
432} 428}
433 429
434#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
435
436static void rcu_print_task_stall_begin(struct rcu_node *rnp)
437{
438}
439
440static void rcu_print_task_stall_end(void)
441{
442}
443
444#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
445
446/* 430/*
447 * Scan the current list of tasks blocked within RCU read-side critical 431 * Scan the current list of tasks blocked within RCU read-side critical
448 * sections, printing out the tid of each. 432 * sections, printing out the tid of each.
@@ -538,10 +522,10 @@ EXPORT_SYMBOL_GPL(call_rcu);
538 */ 522 */
539void synchronize_rcu(void) 523void synchronize_rcu(void)
540{ 524{
541 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && 525 RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
542 !lock_is_held(&rcu_lock_map) && 526 lock_is_held(&rcu_lock_map) ||
543 !lock_is_held(&rcu_sched_lock_map), 527 lock_is_held(&rcu_sched_lock_map),
544 "Illegal synchronize_rcu() in RCU read-side critical section"); 528 "Illegal synchronize_rcu() in RCU read-side critical section");
545 if (!rcu_scheduler_active) 529 if (!rcu_scheduler_active)
546 return; 530 return;
547 if (rcu_gp_is_expedited()) 531 if (rcu_gp_is_expedited())
@@ -552,8 +536,6 @@ void synchronize_rcu(void)
552EXPORT_SYMBOL_GPL(synchronize_rcu); 536EXPORT_SYMBOL_GPL(synchronize_rcu);
553 537
554static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); 538static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
555static unsigned long sync_rcu_preempt_exp_count;
556static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
557 539
558/* 540/*
559 * Return non-zero if there are any tasks in RCU read-side critical 541 * Return non-zero if there are any tasks in RCU read-side critical
@@ -573,7 +555,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp)
573 * for the current expedited grace period. Works only for preemptible 555 * for the current expedited grace period. Works only for preemptible
574 * RCU -- other RCU implementation use other means. 556 * RCU -- other RCU implementation use other means.
575 * 557 *
576 * Caller must hold sync_rcu_preempt_exp_mutex. 558 * Caller must hold the root rcu_node's exp_funnel_mutex.
577 */ 559 */
578static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) 560static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
579{ 561{
@@ -589,7 +571,7 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
589 * recursively up the tree. (Calm down, calm down, we do the recursion 571 * recursively up the tree. (Calm down, calm down, we do the recursion
590 * iteratively!) 572 * iteratively!)
591 * 573 *
592 * Caller must hold sync_rcu_preempt_exp_mutex. 574 * Caller must hold the root rcu_node's exp_funnel_mutex.
593 */ 575 */
594static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, 576static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
595 bool wake) 577 bool wake)
@@ -628,7 +610,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
628 * set the ->expmask bits on the leaf rcu_node structures to tell phase 2 610 * set the ->expmask bits on the leaf rcu_node structures to tell phase 2
629 * that work is needed here. 611 * that work is needed here.
630 * 612 *
631 * Caller must hold sync_rcu_preempt_exp_mutex. 613 * Caller must hold the root rcu_node's exp_funnel_mutex.
632 */ 614 */
633static void 615static void
634sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp) 616sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp)
@@ -671,7 +653,7 @@ sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp)
671 * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits, 653 * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits,
672 * enabling rcu_read_unlock_special() to do the bit-clearing. 654 * enabling rcu_read_unlock_special() to do the bit-clearing.
673 * 655 *
674 * Caller must hold sync_rcu_preempt_exp_mutex. 656 * Caller must hold the root rcu_node's exp_funnel_mutex.
675 */ 657 */
676static void 658static void
677sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp) 659sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp)
@@ -719,51 +701,17 @@ sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp)
719void synchronize_rcu_expedited(void) 701void synchronize_rcu_expedited(void)
720{ 702{
721 struct rcu_node *rnp; 703 struct rcu_node *rnp;
704 struct rcu_node *rnp_unlock;
722 struct rcu_state *rsp = rcu_state_p; 705 struct rcu_state *rsp = rcu_state_p;
723 unsigned long snap; 706 unsigned long s;
724 int trycount = 0;
725 707
726 smp_mb(); /* Caller's modifications seen first by other CPUs. */ 708 s = rcu_exp_gp_seq_snap(rsp);
727 snap = READ_ONCE(sync_rcu_preempt_exp_count) + 1;
728 smp_mb(); /* Above access cannot bleed into critical section. */
729 709
730 /* 710 rnp_unlock = exp_funnel_lock(rsp, s);
731 * Block CPU-hotplug operations. This means that any CPU-hotplug 711 if (rnp_unlock == NULL)
732 * operation that finds an rcu_node structure with tasks in the 712 return; /* Someone else did our work for us. */
733 * process of being boosted will know that all tasks blocking
734 * this expedited grace period will already be in the process of
735 * being boosted. This simplifies the process of moving tasks
736 * from leaf to root rcu_node structures.
737 */
738 if (!try_get_online_cpus()) {
739 /* CPU-hotplug operation in flight, fall back to normal GP. */
740 wait_rcu_gp(call_rcu);
741 return;
742 }
743 713
744 /* 714 rcu_exp_gp_seq_start(rsp);
745 * Acquire lock, falling back to synchronize_rcu() if too many
746 * lock-acquisition failures. Of course, if someone does the
747 * expedited grace period for us, just leave.
748 */
749 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
750 if (ULONG_CMP_LT(snap,
751 READ_ONCE(sync_rcu_preempt_exp_count))) {
752 put_online_cpus();
753 goto mb_ret; /* Others did our work for us. */
754 }
755 if (trycount++ < 10) {
756 udelay(trycount * num_online_cpus());
757 } else {
758 put_online_cpus();
759 wait_rcu_gp(call_rcu);
760 return;
761 }
762 }
763 if (ULONG_CMP_LT(snap, READ_ONCE(sync_rcu_preempt_exp_count))) {
764 put_online_cpus();
765 goto unlock_mb_ret; /* Others did our work for us. */
766 }
767 715
768 /* force all RCU readers onto ->blkd_tasks lists. */ 716 /* force all RCU readers onto ->blkd_tasks lists. */
769 synchronize_sched_expedited(); 717 synchronize_sched_expedited();
@@ -779,20 +727,14 @@ void synchronize_rcu_expedited(void)
779 rcu_for_each_leaf_node(rsp, rnp) 727 rcu_for_each_leaf_node(rsp, rnp)
780 sync_rcu_preempt_exp_init2(rsp, rnp); 728 sync_rcu_preempt_exp_init2(rsp, rnp);
781 729
782 put_online_cpus();
783
784 /* Wait for snapshotted ->blkd_tasks lists to drain. */ 730 /* Wait for snapshotted ->blkd_tasks lists to drain. */
785 rnp = rcu_get_root(rsp); 731 rnp = rcu_get_root(rsp);
786 wait_event(sync_rcu_preempt_exp_wq, 732 wait_event(sync_rcu_preempt_exp_wq,
787 sync_rcu_preempt_exp_done(rnp)); 733 sync_rcu_preempt_exp_done(rnp));
788 734
789 /* Clean up and exit. */ 735 /* Clean up and exit. */
790 smp_mb(); /* ensure expedited GP seen before counter increment. */ 736 rcu_exp_gp_seq_end(rsp);
791 WRITE_ONCE(sync_rcu_preempt_exp_count, sync_rcu_preempt_exp_count + 1); 737 mutex_unlock(&rnp_unlock->exp_funnel_mutex);
792unlock_mb_ret:
793 mutex_unlock(&sync_rcu_preempt_exp_mutex);
794mb_ret:
795 smp_mb(); /* ensure subsequent action seen after grace period. */
796} 738}
797EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 739EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
798 740
@@ -1061,8 +1003,7 @@ static int rcu_boost(struct rcu_node *rnp)
1061} 1003}
1062 1004
1063/* 1005/*
1064 * Priority-boosting kthread. One per leaf rcu_node and one for the 1006 * Priority-boosting kthread, one per leaf rcu_node.
1065 * root rcu_node.
1066 */ 1007 */
1067static int rcu_boost_kthread(void *arg) 1008static int rcu_boost_kthread(void *arg)
1068{ 1009{
@@ -1680,12 +1621,10 @@ static int rcu_oom_notify(struct notifier_block *self,
1680 */ 1621 */
1681 atomic_set(&oom_callback_count, 1); 1622 atomic_set(&oom_callback_count, 1);
1682 1623
1683 get_online_cpus();
1684 for_each_online_cpu(cpu) { 1624 for_each_online_cpu(cpu) {
1685 smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); 1625 smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
1686 cond_resched_rcu_qs(); 1626 cond_resched_rcu_qs();
1687 } 1627 }
1688 put_online_cpus();
1689 1628
1690 /* Unconditionally decrement: no need to wake ourselves up. */ 1629 /* Unconditionally decrement: no need to wake ourselves up. */
1691 atomic_dec(&oom_callback_count); 1630 atomic_dec(&oom_callback_count);
@@ -1706,8 +1645,6 @@ early_initcall(rcu_register_oom_notifier);
1706 1645
1707#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1646#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1708 1647
1709#ifdef CONFIG_RCU_CPU_STALL_INFO
1710
1711#ifdef CONFIG_RCU_FAST_NO_HZ 1648#ifdef CONFIG_RCU_FAST_NO_HZ
1712 1649
1713static void print_cpu_stall_fast_no_hz(char *cp, int cpu) 1650static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
@@ -1796,33 +1733,6 @@ static void increment_cpu_stall_ticks(void)
1796 raw_cpu_inc(rsp->rda->ticks_this_gp); 1733 raw_cpu_inc(rsp->rda->ticks_this_gp);
1797} 1734}
1798 1735
1799#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
1800
1801static void print_cpu_stall_info_begin(void)
1802{
1803 pr_cont(" {");
1804}
1805
1806static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
1807{
1808 pr_cont(" %d", cpu);
1809}
1810
1811static void print_cpu_stall_info_end(void)
1812{
1813 pr_cont("} ");
1814}
1815
1816static void zero_cpu_stall_ticks(struct rcu_data *rdp)
1817{
1818}
1819
1820static void increment_cpu_stall_ticks(void)
1821{
1822}
1823
1824#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
1825
1826#ifdef CONFIG_RCU_NOCB_CPU 1736#ifdef CONFIG_RCU_NOCB_CPU
1827 1737
1828/* 1738/*
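The synchronize_rcu_expedited() rewrite above leans on exp_funnel_lock() (added earlier in tree.c, not shown in this hunk): callers climb the rcu_node tree from a leaf toward the root on the new exp_funnel_mutex locks, and drop out at any level where the sequence snapshot shows that another caller's completed grace period already covers them. The skeleton below is a rough user-space rendering of that walk under simplifying assumptions: a single chain of mutexes instead of a tree, and a stub sequence check.

/* Sketch of the funnel walk: climb toward the root, bailing out early when
 * someone else's completed grace period already covers our snapshot.
 * Simplified to a single chain of mutexes; not kernel code. */
#include <pthread.h>

#define NLEVELS 3

static pthread_mutex_t funnel[NLEVELS] = {	/* index 0 plays the root */
	PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER,
};

static unsigned long expedited_sequence;	/* stub, see the model above */

static int seq_done(unsigned long snap)
{
	return expedited_sequence >= snap;
}

/*
 * Returns 0 when the caller holds the root mutex and must run the grace
 * period itself, or -1 when the work was already done (no mutex held).
 */
static int funnel_lock(unsigned long snap)
{
	int i = NLEVELS - 1;			/* start at the leaf */

	pthread_mutex_lock(&funnel[i]);
	for (;;) {
		if (seq_done(snap)) {		/* someone did our work */
			pthread_mutex_unlock(&funnel[i]);
			return -1;
		}
		if (i == 0)
			return 0;		/* holding the root */
		pthread_mutex_lock(&funnel[i - 1]);	/* take the parent... */
		pthread_mutex_unlock(&funnel[i]);	/* ...then drop the child */
		i--;
	}
}

int main(void)
{
	unsigned long snap = expedited_sequence + 2;	/* pretend snapshot */

	if (funnel_lock(snap) == 0) {
		expedited_sequence += 2;	/* "run" the grace period */
		pthread_mutex_unlock(&funnel[0]);
	}
	return 0;
}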
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 3ea7ffc7d5c4..6fc4c5ff3bb5 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -81,9 +81,9 @@ static void r_stop(struct seq_file *m, void *v)
81static int show_rcubarrier(struct seq_file *m, void *v) 81static int show_rcubarrier(struct seq_file *m, void *v)
82{ 82{
83 struct rcu_state *rsp = (struct rcu_state *)m->private; 83 struct rcu_state *rsp = (struct rcu_state *)m->private;
84 seq_printf(m, "bcc: %d nbd: %lu\n", 84 seq_printf(m, "bcc: %d bseq: %lu\n",
85 atomic_read(&rsp->barrier_cpu_count), 85 atomic_read(&rsp->barrier_cpu_count),
86 rsp->n_barrier_done); 86 rsp->barrier_sequence);
87 return 0; 87 return 0;
88} 88}
89 89
@@ -185,18 +185,15 @@ static int show_rcuexp(struct seq_file *m, void *v)
185{ 185{
186 struct rcu_state *rsp = (struct rcu_state *)m->private; 186 struct rcu_state *rsp = (struct rcu_state *)m->private;
187 187
188 seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n", 188 seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n",
189 atomic_long_read(&rsp->expedited_start), 189 rsp->expedited_sequence,
190 atomic_long_read(&rsp->expedited_done), 190 atomic_long_read(&rsp->expedited_workdone0),
191 atomic_long_read(&rsp->expedited_wrap),
192 atomic_long_read(&rsp->expedited_tryfail),
193 atomic_long_read(&rsp->expedited_workdone1), 191 atomic_long_read(&rsp->expedited_workdone1),
194 atomic_long_read(&rsp->expedited_workdone2), 192 atomic_long_read(&rsp->expedited_workdone2),
193 atomic_long_read(&rsp->expedited_workdone3),
195 atomic_long_read(&rsp->expedited_normal), 194 atomic_long_read(&rsp->expedited_normal),
196 atomic_long_read(&rsp->expedited_stoppedcpus), 195 atomic_read(&rsp->expedited_need_qs),
197 atomic_long_read(&rsp->expedited_done_tries), 196 rsp->expedited_sequence / 2);
198 atomic_long_read(&rsp->expedited_done_lost),
199 atomic_long_read(&rsp->expedited_done_exit));
200 return 0; 197 return 0;
201} 198}
202 199
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index afaecb7a799a..7a0b3bc7c5ed 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -62,6 +62,55 @@ MODULE_ALIAS("rcupdate");
62 62
63module_param(rcu_expedited, int, 0); 63module_param(rcu_expedited, int, 0);
64 64
65#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT)
66/**
67 * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
68 *
69 * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an
70 * RCU-sched read-side critical section. In absence of
71 * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side
72 * critical section unless it can prove otherwise. Note that disabling
73 * of preemption (including disabling irqs) counts as an RCU-sched
74 * read-side critical section. This is useful for debug checks in functions
 75 * that require that they be called within an RCU-sched read-side
76 * critical section.
77 *
78 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot
79 * and while lockdep is disabled.
80 *
81 * Note that if the CPU is in the idle loop from an RCU point of
82 * view (ie: that we are in the section between rcu_idle_enter() and
83 * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU
84 * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs
85 * that are in such a section, considering these as in extended quiescent
86 * state, so such a CPU is effectively never in an RCU read-side critical
87 * section regardless of what RCU primitives it invokes. This state of
88 * affairs is required --- we need to keep an RCU-free window in idle
89 * where the CPU may possibly enter into low power mode. This way we can
90 * notice an extended quiescent state to other CPUs that started a grace
91 * period. Otherwise we would delay any grace period as long as we run in
92 * the idle task.
93 *
94 * Similarly, we avoid claiming an SRCU read lock held if the current
95 * CPU is offline.
96 */
97int rcu_read_lock_sched_held(void)
98{
99 int lockdep_opinion = 0;
100
101 if (!debug_lockdep_rcu_enabled())
102 return 1;
103 if (!rcu_is_watching())
104 return 0;
105 if (!rcu_lockdep_current_cpu_online())
106 return 0;
107 if (debug_locks)
108 lockdep_opinion = lock_is_held(&rcu_sched_lock_map);
109 return lockdep_opinion || preempt_count() != 0 || irqs_disabled();
110}
111EXPORT_SYMBOL(rcu_read_lock_sched_held);
112#endif
113
65#ifndef CONFIG_TINY_RCU 114#ifndef CONFIG_TINY_RCU
66 115
67static atomic_t rcu_expedited_nesting = 116static atomic_t rcu_expedited_nesting =
@@ -269,20 +318,37 @@ void wakeme_after_rcu(struct rcu_head *head)
269 rcu = container_of(head, struct rcu_synchronize, head); 318 rcu = container_of(head, struct rcu_synchronize, head);
270 complete(&rcu->completion); 319 complete(&rcu->completion);
271} 320}
321EXPORT_SYMBOL_GPL(wakeme_after_rcu);
272 322
273void wait_rcu_gp(call_rcu_func_t crf) 323void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
324 struct rcu_synchronize *rs_array)
274{ 325{
275 struct rcu_synchronize rcu; 326 int i;
276 327
277 init_rcu_head_on_stack(&rcu.head); 328 /* Initialize and register callbacks for each flavor specified. */
278 init_completion(&rcu.completion); 329 for (i = 0; i < n; i++) {
279 /* Will wake me after RCU finished. */ 330 if (checktiny &&
280 crf(&rcu.head, wakeme_after_rcu); 331 (crcu_array[i] == call_rcu ||
281 /* Wait for it. */ 332 crcu_array[i] == call_rcu_bh)) {
282 wait_for_completion(&rcu.completion); 333 might_sleep();
283 destroy_rcu_head_on_stack(&rcu.head); 334 continue;
335 }
336 init_rcu_head_on_stack(&rs_array[i].head);
337 init_completion(&rs_array[i].completion);
338 (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu);
339 }
340
341 /* Wait for all callbacks to be invoked. */
342 for (i = 0; i < n; i++) {
343 if (checktiny &&
344 (crcu_array[i] == call_rcu ||
345 crcu_array[i] == call_rcu_bh))
346 continue;
347 wait_for_completion(&rs_array[i].completion);
348 destroy_rcu_head_on_stack(&rs_array[i].head);
349 }
284} 350}
285EXPORT_SYMBOL_GPL(wait_rcu_gp); 351EXPORT_SYMBOL_GPL(__wait_rcu_gp);
286 352
287#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD 353#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
288void init_rcu_head(struct rcu_head *head) 354void init_rcu_head(struct rcu_head *head)
@@ -523,8 +589,8 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks);
523void synchronize_rcu_tasks(void) 589void synchronize_rcu_tasks(void)
524{ 590{
525 /* Complain if the scheduler has not started. */ 591 /* Complain if the scheduler has not started. */
526 rcu_lockdep_assert(!rcu_scheduler_active, 592 RCU_LOCKDEP_WARN(!rcu_scheduler_active,
527 "synchronize_rcu_tasks called too soon"); 593 "synchronize_rcu_tasks called too soon");
528 594
529 /* Wait for the grace period. */ 595 /* Wait for the grace period. */
530 wait_rcu_gp(call_rcu_tasks); 596 wait_rcu_gp(call_rcu_tasks);
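The new __wait_rcu_gp() above queues a callback with every requested flavor first and only then waits on the completions, so the total latency is bounded by the slowest flavor's grace period rather than a sum of back-to-back waits; when checktiny is set (presumably the Tiny RCU case), call_rcu()/call_rcu_bh() waiters reduce to might_sleep(). A loose user-space analogue using threads and semaphores is sketched below; all names and timings are invented for the illustration.

/* Thread/semaphore analogue of waiting on several grace periods at once
 * (illustrative only; nothing here is kernel API). */
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>
#include <unistd.h>

#define NFLAVORS 2

static sem_t done[NFLAVORS];

static void *fake_grace_period(void *arg)
{
	int i = *(int *)arg;

	usleep(1000 * (i + 1));		/* pretend the grace period takes a while */
	sem_post(&done[i]);		/* plays the role of wakeme_after_rcu() */
	return NULL;
}

int main(void)
{
	pthread_t tid[NFLAVORS];
	int idx[NFLAVORS];
	int i;

	/* Phase 1: register a waiter with every "flavor" before waiting. */
	for (i = 0; i < NFLAVORS; i++) {
		idx[i] = i;
		sem_init(&done[i], 0, 0);
		pthread_create(&tid[i], NULL, fake_grace_period, &idx[i]);
	}

	/* Phase 2: wait for each one; the total wait is bounded by the
	 * slowest flavor, not the sum of sequential grace periods. */
	for (i = 0; i < NFLAVORS; i++) {
		sem_wait(&done[i]);
		pthread_join(tid[i], NULL);
	}
	printf("all flavors done\n");
	return 0;
}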
diff --git a/kernel/reboot.c b/kernel/reboot.c
index d20c85d9f8c0..bd30a973fe94 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -346,7 +346,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
346 kernel_restart(buffer); 346 kernel_restart(buffer);
347 break; 347 break;
348 348
349#ifdef CONFIG_KEXEC 349#ifdef CONFIG_KEXEC_CORE
350 case LINUX_REBOOT_CMD_KEXEC: 350 case LINUX_REBOOT_CMD_KEXEC:
351 ret = kernel_kexec(); 351 ret = kernel_kexec();
352 break; 352 break;
diff --git a/kernel/resource.c b/kernel/resource.c
index 90552aab5f2d..f150dbbe6f62 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -492,40 +492,51 @@ int __weak page_is_ram(unsigned long pfn)
492} 492}
493EXPORT_SYMBOL_GPL(page_is_ram); 493EXPORT_SYMBOL_GPL(page_is_ram);
494 494
495/* 495/**
496 * Search for a resouce entry that fully contains the specified region. 496 * region_intersects() - determine intersection of region with known resources
497 * If found, return 1 if it is RAM, 0 if not. 497 * @start: region start address
498 * If not found, or region is not fully contained, return -1 498 * @size: size of region
499 * @name: name of resource (in iomem_resource)
499 * 500 *
500 * Used by the ioremap functions to ensure the user is not remapping RAM and is 501 * Check if the specified region partially overlaps or fully eclipses a
501 * a vast speed up over walking through the resource table page by page. 502 * resource identified by @name. Return REGION_DISJOINT if the region
503 * does not overlap @name, return REGION_MIXED if the region overlaps
504 * @type and another resource, and return REGION_INTERSECTS if the
505 * region overlaps @type and no other defined resource. Note, that
506 * REGION_INTERSECTS is also returned in the case when the specified
507 * region overlaps RAM and undefined memory holes.
508 *
 509 * region_intersects() is used by memory remapping functions to ensure
510 * the user is not remapping RAM and is a vast speed up over walking
511 * through the resource table page by page.
502 */ 512 */
503int region_is_ram(resource_size_t start, unsigned long size) 513int region_intersects(resource_size_t start, size_t size, const char *name)
504{ 514{
505 struct resource *p; 515 unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
506 resource_size_t end = start + size - 1; 516 resource_size_t end = start + size - 1;
507 int flags = IORESOURCE_MEM | IORESOURCE_BUSY; 517 int type = 0; int other = 0;
508 const char *name = "System RAM"; 518 struct resource *p;
509 int ret = -1;
510 519
511 read_lock(&resource_lock); 520 read_lock(&resource_lock);
512 for (p = iomem_resource.child; p ; p = p->sibling) { 521 for (p = iomem_resource.child; p ; p = p->sibling) {
513 if (end < p->start) 522 bool is_type = strcmp(p->name, name) == 0 && p->flags == flags;
514 continue; 523
515 524 if (start >= p->start && start <= p->end)
516 if (p->start <= start && end <= p->end) { 525 is_type ? type++ : other++;
517 /* resource fully contains region */ 526 if (end >= p->start && end <= p->end)
518 if ((p->flags != flags) || strcmp(p->name, name)) 527 is_type ? type++ : other++;
519 ret = 0; 528 if (p->start >= start && p->end <= end)
520 else 529 is_type ? type++ : other++;
521 ret = 1;
522 break;
523 }
524 if (p->end < start)
525 break; /* not found */
526 } 530 }
527 read_unlock(&resource_lock); 531 read_unlock(&resource_lock);
528 return ret; 532
533 if (other == 0)
534 return type ? REGION_INTERSECTS : REGION_DISJOINT;
535
536 if (type)
537 return REGION_MIXED;
538
539 return REGION_DISJOINT;
529} 540}
530 541
531void __weak arch_remove_reservations(struct resource *avail) 542void __weak arch_remove_reservations(struct resource *avail)
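region_intersects() above classifies a region by counting, for every resource walked, whether the region touches a resource carrying the requested name ("type") or anything else, then folding the two counts into REGION_DISJOINT, REGION_INTERSECTS or REGION_MIXED. The stand-alone model below reproduces that folding over a made-up table; it collapses the three kernel overlap tests into one (only whether each count is non-zero matters) and ignores the resource flags check.

/* Stand-alone model of the REGION_* classification (illustrative only). */
#include <stdio.h>
#include <string.h>

enum { REGION_DISJOINT, REGION_INTERSECTS, REGION_MIXED };

struct res { unsigned long start, end; const char *name; };

static const struct res table[] = {
	{ 0x00000000, 0x0009ffff, "System RAM" },
	{ 0x000a0000, 0x000fffff, "Reserved"   },
	{ 0x00100000, 0x3fffffff, "System RAM" },
};

static int region_intersects(unsigned long start, unsigned long size,
			     const char *name)
{
	unsigned long end = start + size - 1;
	int type = 0, other = 0;
	size_t i;

	for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
		const struct res *p = &table[i];
		int is_type = strcmp(p->name, name) == 0;

		/* Overlap: either endpoint inside p, or p inside the region. */
		if ((start >= p->start && start <= p->end) ||
		    (end   >= p->start && end   <= p->end) ||
		    (p->start >= start && p->end <= end))
			is_type ? type++ : other++;
	}

	if (other == 0)
		return type ? REGION_INTERSECTS : REGION_DISJOINT;
	return type ? REGION_MIXED : REGION_DISJOINT;
}

int main(void)
{
	printf("%d\n", region_intersects(0x1000,     0x1000,  "System RAM")); /* REGION_INTERSECTS */
	printf("%d\n", region_intersects(0x0009f000, 0x2000,  "System RAM")); /* REGION_MIXED */
	printf("%d\n", region_intersects(0x000a0000, 0x10000, "System RAM")); /* REGION_DISJOINT */
	return 0;
}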
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 78b4bad10081..3595403921bd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -164,14 +164,12 @@ struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
164 164
165static void sched_feat_disable(int i) 165static void sched_feat_disable(int i)
166{ 166{
167 if (static_key_enabled(&sched_feat_keys[i])) 167 static_key_disable(&sched_feat_keys[i]);
168 static_key_slow_dec(&sched_feat_keys[i]);
169} 168}
170 169
171static void sched_feat_enable(int i) 170static void sched_feat_enable(int i)
172{ 171{
173 if (!static_key_enabled(&sched_feat_keys[i])) 172 static_key_enable(&sched_feat_keys[i]);
174 static_key_slow_inc(&sched_feat_keys[i]);
175} 173}
176#else 174#else
177static void sched_feat_disable(int i) { }; 175static void sched_feat_disable(int i) { };
@@ -1151,15 +1149,45 @@ static int migration_cpu_stop(void *data)
1151 return 0; 1149 return 0;
1152} 1150}
1153 1151
1154void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 1152/*
1153 * sched_class::set_cpus_allowed must do the below, but is not required to
1154 * actually call this function.
1155 */
1156void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
1155{ 1157{
1156 if (p->sched_class->set_cpus_allowed)
1157 p->sched_class->set_cpus_allowed(p, new_mask);
1158
1159 cpumask_copy(&p->cpus_allowed, new_mask); 1158 cpumask_copy(&p->cpus_allowed, new_mask);
1160 p->nr_cpus_allowed = cpumask_weight(new_mask); 1159 p->nr_cpus_allowed = cpumask_weight(new_mask);
1161} 1160}
1162 1161
1162void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
1163{
1164 struct rq *rq = task_rq(p);
1165 bool queued, running;
1166
1167 lockdep_assert_held(&p->pi_lock);
1168
1169 queued = task_on_rq_queued(p);
1170 running = task_current(rq, p);
1171
1172 if (queued) {
1173 /*
1174 * Because __kthread_bind() calls this on blocked tasks without
1175 * holding rq->lock.
1176 */
1177 lockdep_assert_held(&rq->lock);
1178 dequeue_task(rq, p, 0);
1179 }
1180 if (running)
1181 put_prev_task(rq, p);
1182
1183 p->sched_class->set_cpus_allowed(p, new_mask);
1184
1185 if (running)
1186 p->sched_class->set_curr_task(rq);
1187 if (queued)
1188 enqueue_task(rq, p, 0);
1189}
1190
1163/* 1191/*
1164 * Change a given task's CPU affinity. Migrate the thread to a 1192 * Change a given task's CPU affinity. Migrate the thread to a
1165 * proper CPU and schedule it away if the CPU it's executing on 1193 * proper CPU and schedule it away if the CPU it's executing on
@@ -1169,7 +1197,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
1169 * task must not exit() & deallocate itself prematurely. The 1197 * task must not exit() & deallocate itself prematurely. The
1170 * call is not atomic; no spinlocks may be held. 1198 * call is not atomic; no spinlocks may be held.
1171 */ 1199 */
1172int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 1200static int __set_cpus_allowed_ptr(struct task_struct *p,
1201 const struct cpumask *new_mask, bool check)
1173{ 1202{
1174 unsigned long flags; 1203 unsigned long flags;
1175 struct rq *rq; 1204 struct rq *rq;
@@ -1178,6 +1207,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
1178 1207
1179 rq = task_rq_lock(p, &flags); 1208 rq = task_rq_lock(p, &flags);
1180 1209
1210 /*
1211 * Must re-check here, to close a race against __kthread_bind(),
1212 * sched_setaffinity() is not guaranteed to observe the flag.
1213 */
1214 if (check && (p->flags & PF_NO_SETAFFINITY)) {
1215 ret = -EINVAL;
1216 goto out;
1217 }
1218
1181 if (cpumask_equal(&p->cpus_allowed, new_mask)) 1219 if (cpumask_equal(&p->cpus_allowed, new_mask))
1182 goto out; 1220 goto out;
1183 1221
@@ -1214,6 +1252,11 @@ out:
1214 1252
1215 return ret; 1253 return ret;
1216} 1254}
1255
1256int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
1257{
1258 return __set_cpus_allowed_ptr(p, new_mask, false);
1259}
1217EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 1260EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
1218 1261
1219void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1262void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
@@ -1595,6 +1638,15 @@ static void update_avg(u64 *avg, u64 sample)
1595 s64 diff = sample - *avg; 1638 s64 diff = sample - *avg;
1596 *avg += diff >> 3; 1639 *avg += diff >> 3;
1597} 1640}
1641
1642#else
1643
1644static inline int __set_cpus_allowed_ptr(struct task_struct *p,
1645 const struct cpumask *new_mask, bool check)
1646{
1647 return set_cpus_allowed_ptr(p, new_mask);
1648}
1649
1598#endif /* CONFIG_SMP */ 1650#endif /* CONFIG_SMP */
1599 1651
1600static void 1652static void
@@ -1654,9 +1706,9 @@ static void
1654ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) 1706ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1655{ 1707{
1656 check_preempt_curr(rq, p, wake_flags); 1708 check_preempt_curr(rq, p, wake_flags);
1657 trace_sched_wakeup(p, true);
1658
1659 p->state = TASK_RUNNING; 1709 p->state = TASK_RUNNING;
1710 trace_sched_wakeup(p);
1711
1660#ifdef CONFIG_SMP 1712#ifdef CONFIG_SMP
1661 if (p->sched_class->task_woken) { 1713 if (p->sched_class->task_woken) {
1662 /* 1714 /*
@@ -1874,6 +1926,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1874 if (!(p->state & state)) 1926 if (!(p->state & state))
1875 goto out; 1927 goto out;
1876 1928
1929 trace_sched_waking(p);
1930
1877 success = 1; /* we're going to change ->state */ 1931 success = 1; /* we're going to change ->state */
1878 cpu = task_cpu(p); 1932 cpu = task_cpu(p);
1879 1933
@@ -1949,6 +2003,8 @@ static void try_to_wake_up_local(struct task_struct *p)
1949 if (!(p->state & TASK_NORMAL)) 2003 if (!(p->state & TASK_NORMAL))
1950 goto out; 2004 goto out;
1951 2005
2006 trace_sched_waking(p);
2007
1952 if (!task_on_rq_queued(p)) 2008 if (!task_on_rq_queued(p))
1953 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 2009 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1954 2010
@@ -2016,9 +2072,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
2016 p->se.prev_sum_exec_runtime = 0; 2072 p->se.prev_sum_exec_runtime = 0;
2017 p->se.nr_migrations = 0; 2073 p->se.nr_migrations = 0;
2018 p->se.vruntime = 0; 2074 p->se.vruntime = 0;
2019#ifdef CONFIG_SMP
2020 p->se.avg.decay_count = 0;
2021#endif
2022 INIT_LIST_HEAD(&p->se.group_node); 2075 INIT_LIST_HEAD(&p->se.group_node);
2023 2076
2024#ifdef CONFIG_SCHEDSTATS 2077#ifdef CONFIG_SCHEDSTATS
@@ -2200,8 +2253,8 @@ unsigned long to_ratio(u64 period, u64 runtime)
2200#ifdef CONFIG_SMP 2253#ifdef CONFIG_SMP
2201inline struct dl_bw *dl_bw_of(int i) 2254inline struct dl_bw *dl_bw_of(int i)
2202{ 2255{
2203 rcu_lockdep_assert(rcu_read_lock_sched_held(), 2256 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
2204 "sched RCU must be held"); 2257 "sched RCU must be held");
2205 return &cpu_rq(i)->rd->dl_bw; 2258 return &cpu_rq(i)->rd->dl_bw;
2206} 2259}
2207 2260
@@ -2210,8 +2263,8 @@ static inline int dl_bw_cpus(int i)
2210 struct root_domain *rd = cpu_rq(i)->rd; 2263 struct root_domain *rd = cpu_rq(i)->rd;
2211 int cpus = 0; 2264 int cpus = 0;
2212 2265
2213 rcu_lockdep_assert(rcu_read_lock_sched_held(), 2266 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
2214 "sched RCU must be held"); 2267 "sched RCU must be held");
2215 for_each_cpu_and(i, rd->span, cpu_active_mask) 2268 for_each_cpu_and(i, rd->span, cpu_active_mask)
2216 cpus++; 2269 cpus++;
2217 2270
@@ -2303,11 +2356,11 @@ void wake_up_new_task(struct task_struct *p)
2303#endif 2356#endif
2304 2357
2305 /* Initialize new task's runnable average */ 2358 /* Initialize new task's runnable average */
2306 init_task_runnable_average(p); 2359 init_entity_runnable_average(&p->se);
2307 rq = __task_rq_lock(p); 2360 rq = __task_rq_lock(p);
2308 activate_task(rq, p, 0); 2361 activate_task(rq, p, 0);
2309 p->on_rq = TASK_ON_RQ_QUEUED; 2362 p->on_rq = TASK_ON_RQ_QUEUED;
2310 trace_sched_wakeup_new(p, true); 2363 trace_sched_wakeup_new(p);
2311 check_preempt_curr(rq, p, WF_FORK); 2364 check_preempt_curr(rq, p, WF_FORK);
2312#ifdef CONFIG_SMP 2365#ifdef CONFIG_SMP
2313 if (p->sched_class->task_woken) 2366 if (p->sched_class->task_woken)
@@ -2469,7 +2522,6 @@ static struct rq *finish_task_switch(struct task_struct *prev)
2469 */ 2522 */
2470 prev_state = prev->state; 2523 prev_state = prev->state;
2471 vtime_task_switch(prev); 2524 vtime_task_switch(prev);
2472 finish_arch_switch(prev);
2473 perf_event_task_sched_in(prev, current); 2525 perf_event_task_sched_in(prev, current);
2474 finish_lock_switch(rq, prev); 2526 finish_lock_switch(rq, prev);
2475 finish_arch_post_lock_switch(); 2527 finish_arch_post_lock_switch();
@@ -2489,7 +2541,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
2489 put_task_struct(prev); 2541 put_task_struct(prev);
2490 } 2542 }
2491 2543
2492 tick_nohz_task_switch(current); 2544 tick_nohz_task_switch();
2493 return rq; 2545 return rq;
2494} 2546}
2495 2547
@@ -4340,7 +4392,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4340 } 4392 }
4341#endif 4393#endif
4342again: 4394again:
4343 retval = set_cpus_allowed_ptr(p, new_mask); 4395 retval = __set_cpus_allowed_ptr(p, new_mask, true);
4344 4396
4345 if (!retval) { 4397 if (!retval) {
4346 cpuset_cpus_allowed(p, cpus_allowed); 4398 cpuset_cpus_allowed(p, cpus_allowed);
@@ -4492,7 +4544,7 @@ SYSCALL_DEFINE0(sched_yield)
4492 4544
4493int __sched _cond_resched(void) 4545int __sched _cond_resched(void)
4494{ 4546{
4495 if (should_resched()) { 4547 if (should_resched(0)) {
4496 preempt_schedule_common(); 4548 preempt_schedule_common();
4497 return 1; 4549 return 1;
4498 } 4550 }
@@ -4510,7 +4562,7 @@ EXPORT_SYMBOL(_cond_resched);
4510 */ 4562 */
4511int __cond_resched_lock(spinlock_t *lock) 4563int __cond_resched_lock(spinlock_t *lock)
4512{ 4564{
4513 int resched = should_resched(); 4565 int resched = should_resched(PREEMPT_LOCK_OFFSET);
4514 int ret = 0; 4566 int ret = 0;
4515 4567
4516 lockdep_assert_held(lock); 4568 lockdep_assert_held(lock);
@@ -4532,7 +4584,7 @@ int __sched __cond_resched_softirq(void)
4532{ 4584{
4533 BUG_ON(!in_softirq()); 4585 BUG_ON(!in_softirq());
4534 4586
4535 if (should_resched()) { 4587 if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
4536 local_bh_enable(); 4588 local_bh_enable();
4537 preempt_schedule_common(); 4589 preempt_schedule_common();
4538 local_bh_disable(); 4590 local_bh_disable();
@@ -4865,7 +4917,8 @@ void init_idle(struct task_struct *idle, int cpu)
4865 struct rq *rq = cpu_rq(cpu); 4917 struct rq *rq = cpu_rq(cpu);
4866 unsigned long flags; 4918 unsigned long flags;
4867 4919
4868 raw_spin_lock_irqsave(&rq->lock, flags); 4920 raw_spin_lock_irqsave(&idle->pi_lock, flags);
4921 raw_spin_lock(&rq->lock);
4869 4922
4870 __sched_fork(0, idle); 4923 __sched_fork(0, idle);
4871 idle->state = TASK_RUNNING; 4924 idle->state = TASK_RUNNING;
@@ -4891,7 +4944,8 @@ void init_idle(struct task_struct *idle, int cpu)
4891#if defined(CONFIG_SMP) 4944#if defined(CONFIG_SMP)
4892 idle->on_cpu = 1; 4945 idle->on_cpu = 1;
4893#endif 4946#endif
4894 raw_spin_unlock_irqrestore(&rq->lock, flags); 4947 raw_spin_unlock(&rq->lock);
4948 raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
4895 4949
4896 /* Set the preempt count _outside_ the spinlocks! */ 4950 /* Set the preempt count _outside_ the spinlocks! */
4897 init_idle_preempt_count(idle, cpu); 4951 init_idle_preempt_count(idle, cpu);
@@ -5311,8 +5365,7 @@ static void register_sched_domain_sysctl(void)
5311/* may be called multiple times per register */ 5365/* may be called multiple times per register */
5312static void unregister_sched_domain_sysctl(void) 5366static void unregister_sched_domain_sysctl(void)
5313{ 5367{
5314 if (sd_sysctl_header) 5368 unregister_sysctl_table(sd_sysctl_header);
5315 unregister_sysctl_table(sd_sysctl_header);
5316 sd_sysctl_header = NULL; 5369 sd_sysctl_header = NULL;
5317 if (sd_ctl_dir[0].child) 5370 if (sd_ctl_dir[0].child)
5318 sd_free_ctl_entry(&sd_ctl_dir[0].child); 5371 sd_free_ctl_entry(&sd_ctl_dir[0].child);
@@ -5433,6 +5486,14 @@ static int sched_cpu_active(struct notifier_block *nfb,
5433 case CPU_STARTING: 5486 case CPU_STARTING:
5434 set_cpu_rq_start_time(); 5487 set_cpu_rq_start_time();
5435 return NOTIFY_OK; 5488 return NOTIFY_OK;
5489 case CPU_ONLINE:
5490 /*
5491 * At this point a starting CPU has marked itself as online via
5492 * set_cpu_online(). But it might not yet have marked itself
5493 * as active, which is essential from here on.
5494 *
5495 * Thus, fall-through and help the starting CPU along.
5496 */
5436 case CPU_DOWN_FAILED: 5497 case CPU_DOWN_FAILED:
5437 set_cpu_active((long)hcpu, true); 5498 set_cpu_active((long)hcpu, true);
5438 return NOTIFY_OK; 5499 return NOTIFY_OK;
@@ -6445,8 +6506,10 @@ static void init_numa_topology_type(void)
6445 6506
6446 n = sched_max_numa_distance; 6507 n = sched_max_numa_distance;
6447 6508
6448 if (n <= 1) 6509 if (sched_domains_numa_levels <= 1) {
6449 sched_numa_topology_type = NUMA_DIRECT; 6510 sched_numa_topology_type = NUMA_DIRECT;
6511 return;
6512 }
6450 6513
6451 for_each_online_node(a) { 6514 for_each_online_node(a) {
6452 for_each_online_node(b) { 6515 for_each_online_node(b) {
@@ -8068,7 +8131,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
8068 sched_offline_group(tg); 8131 sched_offline_group(tg);
8069} 8132}
8070 8133
8071static void cpu_cgroup_fork(struct task_struct *task) 8134static void cpu_cgroup_fork(struct task_struct *task, void *private)
8072{ 8135{
8073 sched_move_task(task); 8136 sched_move_task(task);
8074} 8137}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f5a64ffad176..8cbc3db671df 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -555,48 +555,43 @@ drop_precision:
555} 555}
556 556
557/* 557/*
558 * Atomically advance counter to the new value. Interrupts, vcpu 558 * Adjust tick based cputime random precision against scheduler runtime
559 * scheduling, and scaling inaccuracies can cause cputime_advance 559 * accounting.
560 * to be occasionally called with a new value smaller than counter.
561 * Let's enforce atomicity.
562 * 560 *
563 * Normally a caller will only go through this loop once, or not 561 * Tick based cputime accounting depend on random scheduling timeslices of a
564 * at all in case a previous caller updated counter the same jiffy. 562 * task to be interrupted or not by the timer. Depending on these
565 */ 563 * circumstances, the number of these interrupts may be over or
566static void cputime_advance(cputime_t *counter, cputime_t new) 564 * under-optimistic, matching the real user and system cputime with a variable
567{ 565 * precision.
568 cputime_t old; 566 *
569 567 * Fix this by scaling these tick based values against the total runtime
570 while (new > (old = READ_ONCE(*counter))) 568 * accounted by the CFS scheduler.
571 cmpxchg_cputime(counter, old, new); 569 *
572} 570 * This code provides the following guarantees:
573 571 *
574/* 572 * stime + utime == rtime
575 * Adjust tick based cputime random precision against scheduler 573 * stime_i+1 >= stime_i, utime_i+1 >= utime_i
576 * runtime accounting. 574 *
575 * Assuming that rtime_i+1 >= rtime_i.
577 */ 576 */
578static void cputime_adjust(struct task_cputime *curr, 577static void cputime_adjust(struct task_cputime *curr,
579 struct cputime *prev, 578 struct prev_cputime *prev,
580 cputime_t *ut, cputime_t *st) 579 cputime_t *ut, cputime_t *st)
581{ 580{
582 cputime_t rtime, stime, utime; 581 cputime_t rtime, stime, utime;
582 unsigned long flags;
583 583
584 /* 584 /* Serialize concurrent callers such that we can honour our guarantees */
585 * Tick based cputime accounting depend on random scheduling 585 raw_spin_lock_irqsave(&prev->lock, flags);
586 * timeslices of a task to be interrupted or not by the timer.
587 * Depending on these circumstances, the number of these interrupts
588 * may be over or under-optimistic, matching the real user and system
589 * cputime with a variable precision.
590 *
591 * Fix this by scaling these tick based values against the total
592 * runtime accounted by the CFS scheduler.
593 */
594 rtime = nsecs_to_cputime(curr->sum_exec_runtime); 586 rtime = nsecs_to_cputime(curr->sum_exec_runtime);
595 587
596 /* 588 /*
597 * Update userspace visible utime/stime values only if actual execution 589 * This is possible under two circumstances:
598 * time is bigger than already exported. Note that can happen, that we 590 * - rtime isn't monotonic after all (a bug);
599 * provided bigger values due to scaling inaccuracy on big numbers. 591 * - we got reordered by the lock.
592 *
593 * In both cases this acts as a filter such that the rest of the code
594 * can assume it is monotonic regardless of anything else.
600 */ 595 */
601 if (prev->stime + prev->utime >= rtime) 596 if (prev->stime + prev->utime >= rtime)
602 goto out; 597 goto out;
@@ -606,22 +601,46 @@ static void cputime_adjust(struct task_cputime *curr,
606 601
607 if (utime == 0) { 602 if (utime == 0) {
608 stime = rtime; 603 stime = rtime;
609 } else if (stime == 0) { 604 goto update;
610 utime = rtime; 605 }
611 } else {
612 cputime_t total = stime + utime;
613 606
614 stime = scale_stime((__force u64)stime, 607 if (stime == 0) {
615 (__force u64)rtime, (__force u64)total); 608 utime = rtime;
616 utime = rtime - stime; 609 goto update;
617 } 610 }
618 611
619 cputime_advance(&prev->stime, stime); 612 stime = scale_stime((__force u64)stime, (__force u64)rtime,
620 cputime_advance(&prev->utime, utime); 613 (__force u64)(stime + utime));
614
615 /*
616 * Make sure stime doesn't go backwards; this preserves monotonicity
617 * for utime because rtime is monotonic.
618 *
619 * utime_i+1 = rtime_i+1 - stime_i
620 * = rtime_i+1 - (rtime_i - utime_i)
621 * = (rtime_i+1 - rtime_i) + utime_i
622 * >= utime_i
623 */
624 if (stime < prev->stime)
625 stime = prev->stime;
626 utime = rtime - stime;
627
628 /*
629 * Make sure utime doesn't go backwards; this still preserves
630 * monotonicity for stime, analogous argument to above.
631 */
632 if (utime < prev->utime) {
633 utime = prev->utime;
634 stime = rtime - utime;
635 }
621 636
637update:
638 prev->stime = stime;
639 prev->utime = utime;
622out: 640out:
623 *ut = prev->utime; 641 *ut = prev->utime;
624 *st = prev->stime; 642 *st = prev->stime;
643 raw_spin_unlock_irqrestore(&prev->lock, flags);
625} 644}
626 645
627void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) 646void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
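The reworked cputime_adjust() above serializes callers on a per-prev_cputime lock and clamps the scaled stime (and, symmetrically, utime) against the values reported last time, which keeps stime + utime == rtime while making both components monotonic. The small numeric model below walks through that clamping with plain integers; it substitutes a naive scaling expression for scale_stime() and is illustrative only.

/* Numeric model of the monotonic stime/utime split (illustrative only). */
#include <stdio.h>

struct prev_cputime_model { unsigned long stime, utime; };

static void adjust(struct prev_cputime_model *prev, unsigned long rtime,
		   unsigned long stime_ticks, unsigned long utime_ticks)
{
	unsigned long stime, utime;

	if (prev->stime + prev->utime >= rtime)
		goto out;			/* nothing new to report */

	if (utime_ticks == 0)
		stime = rtime;
	else if (stime_ticks == 0)
		stime = 0;
	else	/* naive stand-in for scale_stime(); may overflow for real values */
		stime = stime_ticks * rtime / (stime_ticks + utime_ticks);

	if (stime < prev->stime)		/* keep stime monotonic... */
		stime = prev->stime;
	utime = rtime - stime;			/* ...so utime stays monotonic too */

	if (utime < prev->utime) {		/* and the symmetric clamp */
		utime = prev->utime;
		stime = rtime - utime;
	}
	prev->stime = stime;
	prev->utime = utime;
out:
	printf("stime=%lu utime=%lu (sum=%lu)\n",
	       prev->stime, prev->utime, prev->stime + prev->utime);
}

int main(void)
{
	struct prev_cputime_model p = { 0, 0 };

	adjust(&p, 100, 30, 70);	/* reports stime=30 utime=70 */
	adjust(&p, 110, 20, 80);	/* scaled stime would be 22; clamped to 30 */
	return 0;
}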
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0a17af35670a..fc8f01083527 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -953,7 +953,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
953 953
954 /* 954 /*
955 * Use the scheduling parameters of the top pi-waiter 955 * Use the scheduling parameters of the top pi-waiter
956 * task if we have one and its (relative) deadline is 956 * task if we have one and its (absolute) deadline is
957 * smaller than our one... OTW we keep our runtime and 957 * smaller than our one... OTW we keep our runtime and
958 * deadline. 958 * deadline.
959 */ 959 */
@@ -1563,7 +1563,7 @@ out:
1563 1563
1564static void push_dl_tasks(struct rq *rq) 1564static void push_dl_tasks(struct rq *rq)
1565{ 1565{
1566 /* Terminates as it moves a -deadline task */ 1566 /* push_dl_task() will return true if it moved a -deadline task */
1567 while (push_dl_task(rq)) 1567 while (push_dl_task(rq))
1568 ; 1568 ;
1569} 1569}
@@ -1657,7 +1657,6 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
1657{ 1657{
1658 if (!task_running(rq, p) && 1658 if (!task_running(rq, p) &&
1659 !test_tsk_need_resched(rq->curr) && 1659 !test_tsk_need_resched(rq->curr) &&
1660 has_pushable_dl_tasks(rq) &&
1661 p->nr_cpus_allowed > 1 && 1660 p->nr_cpus_allowed > 1 &&
1662 dl_task(rq->curr) && 1661 dl_task(rq->curr) &&
1663 (rq->curr->nr_cpus_allowed < 2 || 1662 (rq->curr->nr_cpus_allowed < 2 ||
@@ -1669,9 +1668,8 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
1669static void set_cpus_allowed_dl(struct task_struct *p, 1668static void set_cpus_allowed_dl(struct task_struct *p,
1670 const struct cpumask *new_mask) 1669 const struct cpumask *new_mask)
1671{ 1670{
1672 struct rq *rq;
1673 struct root_domain *src_rd; 1671 struct root_domain *src_rd;
1674 int weight; 1672 struct rq *rq;
1675 1673
1676 BUG_ON(!dl_task(p)); 1674 BUG_ON(!dl_task(p));
1677 1675
@@ -1697,37 +1695,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
1697 raw_spin_unlock(&src_dl_b->lock); 1695 raw_spin_unlock(&src_dl_b->lock);
1698 } 1696 }
1699 1697
1700 /* 1698 set_cpus_allowed_common(p, new_mask);
1701 * Update only if the task is actually running (i.e.,
1702 * it is on the rq AND it is not throttled).
1703 */
1704 if (!on_dl_rq(&p->dl))
1705 return;
1706
1707 weight = cpumask_weight(new_mask);
1708
1709 /*
1710 * Only update if the process changes its state from whether it
1711 * can migrate or not.
1712 */
1713 if ((p->nr_cpus_allowed > 1) == (weight > 1))
1714 return;
1715
1716 /*
1717 * The process used to be able to migrate OR it can now migrate
1718 */
1719 if (weight <= 1) {
1720 if (!task_current(rq, p))
1721 dequeue_pushable_dl_task(rq, p);
1722 BUG_ON(!rq->dl.dl_nr_migratory);
1723 rq->dl.dl_nr_migratory--;
1724 } else {
1725 if (!task_current(rq, p))
1726 enqueue_pushable_dl_task(rq, p);
1727 rq->dl.dl_nr_migratory++;
1728 }
1729
1730 update_dl_migration(&rq->dl);
1731} 1699}
1732 1700
1733/* Assumes rq->lock is held */ 1701/* Assumes rq->lock is held */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4222ec50ab88..641511771ae6 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -68,13 +68,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
68#define PN(F) \ 68#define PN(F) \
69 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) 69 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
70 70
71 if (!se) { 71 if (!se)
72 struct sched_avg *avg = &cpu_rq(cpu)->avg;
73 P(avg->runnable_avg_sum);
74 P(avg->avg_period);
75 return; 72 return;
76 }
77
78 73
79 PN(se->exec_start); 74 PN(se->exec_start);
80 PN(se->vruntime); 75 PN(se->vruntime);
@@ -93,12 +88,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
93#endif 88#endif
94 P(se->load.weight); 89 P(se->load.weight);
95#ifdef CONFIG_SMP 90#ifdef CONFIG_SMP
96 P(se->avg.runnable_avg_sum); 91 P(se->avg.load_avg);
97 P(se->avg.running_avg_sum); 92 P(se->avg.util_avg);
98 P(se->avg.avg_period);
99 P(se->avg.load_avg_contrib);
100 P(se->avg.utilization_avg_contrib);
101 P(se->avg.decay_count);
102#endif 93#endif
103#undef PN 94#undef PN
104#undef P 95#undef P
@@ -214,21 +205,21 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
214 SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); 205 SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
215 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
216#ifdef CONFIG_SMP 207#ifdef CONFIG_SMP
217 SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg", 208 SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
209 cfs_rq->avg.load_avg);
210 SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg",
218 cfs_rq->runnable_load_avg); 211 cfs_rq->runnable_load_avg);
219 SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", 212 SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
220 cfs_rq->blocked_load_avg); 213 cfs_rq->avg.util_avg);
221 SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg", 214 SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg",
222 cfs_rq->utilization_load_avg); 215 atomic_long_read(&cfs_rq->removed_load_avg));
216 SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg",
217 atomic_long_read(&cfs_rq->removed_util_avg));
223#ifdef CONFIG_FAIR_GROUP_SCHED 218#ifdef CONFIG_FAIR_GROUP_SCHED
224 SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", 219 SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
225 cfs_rq->tg_load_contrib); 220 cfs_rq->tg_load_avg_contrib);
226 SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
227 cfs_rq->tg_runnable_contrib);
228 SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", 221 SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
229 atomic_long_read(&cfs_rq->tg->load_avg)); 222 atomic_long_read(&cfs_rq->tg->load_avg));
230 SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
231 atomic_read(&cfs_rq->tg->runnable_avg));
232#endif 223#endif
233#endif 224#endif
234#ifdef CONFIG_CFS_BANDWIDTH 225#ifdef CONFIG_CFS_BANDWIDTH
@@ -636,12 +627,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
636 627
637 P(se.load.weight); 628 P(se.load.weight);
638#ifdef CONFIG_SMP 629#ifdef CONFIG_SMP
639 P(se.avg.runnable_avg_sum); 630 P(se.avg.load_sum);
640 P(se.avg.running_avg_sum); 631 P(se.avg.util_sum);
641 P(se.avg.avg_period); 632 P(se.avg.load_avg);
642 P(se.avg.load_avg_contrib); 633 P(se.avg.util_avg);
643 P(se.avg.utilization_avg_contrib); 634 P(se.avg.last_update_time);
644 P(se.avg.decay_count);
645#endif 635#endif
646 P(policy); 636 P(policy);
647 P(prio); 637 P(prio);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 65c8f3ebdc3c..6e2e3483b1ec 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -283,9 +283,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
283 return grp->my_q; 283 return grp->my_q;
284} 284}
285 285
286static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
287 int force_update);
288
289static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 286static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
290{ 287{
291 if (!cfs_rq->on_list) { 288 if (!cfs_rq->on_list) {
@@ -305,8 +302,6 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
305 } 302 }
306 303
307 cfs_rq->on_list = 1; 304 cfs_rq->on_list = 1;
308 /* We should have no load, but we need to update last_decay. */
309 update_cfs_rq_blocked_load(cfs_rq, 0);
310 } 305 }
311} 306}
312 307
@@ -616,15 +611,10 @@ static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
616 */ 611 */
617static u64 __sched_period(unsigned long nr_running) 612static u64 __sched_period(unsigned long nr_running)
618{ 613{
619 u64 period = sysctl_sched_latency; 614 if (unlikely(nr_running > sched_nr_latency))
620 unsigned long nr_latency = sched_nr_latency; 615 return nr_running * sysctl_sched_min_granularity;
621 616 else
622 if (unlikely(nr_running > nr_latency)) { 617 return sysctl_sched_latency;
623 period = sysctl_sched_min_granularity;
624 period *= nr_running;
625 }
626
627 return period;
628} 618}
629 619
630/* 620/*
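The flattened __sched_period() computes the same thing as the old branch-and-multiply version: one latency period while nr_running fits under sched_nr_latency, and nr_running * min_granularity beyond that so every task still gets a full slice. A quick standalone check with illustrative defaults (the real values are tunables scaled by CPU count):

#include <stdio.h>

/* Illustrative defaults only; the real values are sysctls. */
static const unsigned long long sysctl_sched_latency         = 6000000ULL; /* 6 ms    */
static const unsigned long long sysctl_sched_min_granularity =  750000ULL; /* 0.75 ms */
static const unsigned long      sched_nr_latency             = 8;

static unsigned long long sched_period(unsigned long nr_running)
{
        if (nr_running > sched_nr_latency)
                return nr_running * sysctl_sched_min_granularity;
        return sysctl_sched_latency;
}

int main(void)
{
        /* 8 tasks share one 6 ms period; 12 tasks stretch it to 9 ms. */
        printf("%llu %llu\n", sched_period(8), sched_period(12));
        return 0;
}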
@@ -669,22 +659,37 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
669static int select_idle_sibling(struct task_struct *p, int cpu); 659static int select_idle_sibling(struct task_struct *p, int cpu);
670static unsigned long task_h_load(struct task_struct *p); 660static unsigned long task_h_load(struct task_struct *p);
671 661
672static inline void __update_task_entity_contrib(struct sched_entity *se); 662/*
673static inline void __update_task_entity_utilization(struct sched_entity *se); 663 * We choose a half-life close to 1 scheduling period.
664 * Note: The tables below are dependent on this value.
665 */
666#define LOAD_AVG_PERIOD 32
667#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
668#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
674 669
675/* Give new task start runnable values to heavy its load in infant time */ 670/* Give new sched_entity start runnable values so its load weighs heavily in its infancy */
676void init_task_runnable_average(struct task_struct *p) 671void init_entity_runnable_average(struct sched_entity *se)
677{ 672{
678 u32 slice; 673 struct sched_avg *sa = &se->avg;
679 674
680 slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; 675 sa->last_update_time = 0;
681 p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice; 676 /*
682 p->se.avg.avg_period = slice; 677 * sched_avg's period_contrib should be strictly less than 1024, so
683 __update_task_entity_contrib(&p->se); 678 * we give it 1023 to make sure it is almost a period (1024us), and
684 __update_task_entity_utilization(&p->se); 679 * will definitely be updated (after enqueue).
680 */
681 sa->period_contrib = 1023;
682 sa->load_avg = scale_load_down(se->load.weight);
683 sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
684 sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
685 sa->util_sum = LOAD_AVG_MAX;
686 /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
685} 687}
688
689static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
690static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
686#else 691#else
687void init_task_runnable_average(struct task_struct *p) 692void init_entity_runnable_average(struct sched_entity *se)
688{ 693{
689} 694}
690#endif 695#endif
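The three constants hoisted here encode the PELT decay: y is chosen so that y^32 = 0.5, a full 1024 us period contributes 1024, and the geometric series therefore saturates after roughly LOAD_AVG_MAX_N periods. A double-precision sanity check of those magnitudes (illustrative only; the kernel's 47742 comes from evaluating the same series with its truncating fixed-point tables, so it sits a little below the real-valued limit):

#include <math.h>
#include <stdio.h>

int main(void)
{
        double y = pow(0.5, 1.0 / 32.0);        /* half-life of 32 periods */

        printf("y              = %.6f\n", y);
        printf("1024 / (1 - y) = %.0f   (order of LOAD_AVG_MAX)\n",
               1024.0 / (1.0 - y));
        printf("1024 * y^345   = %.2f   (a contribution from ~345 periods ago\n"
               "                          has decayed below one unit)\n",
               1024.0 * pow(y, 345));
        return 0;
}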
@@ -1415,8 +1420,9 @@ static bool numa_has_capacity(struct task_numa_env *env)
1415 * --------------------- vs --------------------- 1420 * --------------------- vs ---------------------
1416 * src->compute_capacity dst->compute_capacity 1421 * src->compute_capacity dst->compute_capacity
1417 */ 1422 */
1418 if (src->load * dst->compute_capacity > 1423 if (src->load * dst->compute_capacity * env->imbalance_pct >
1419 dst->load * src->compute_capacity) 1424
1425 dst->load * src->compute_capacity * 100)
1420 return true; 1426 return true;
1421 1427
1422 return false; 1428 return false;
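Folding env->imbalance_pct into the capacity comparison gives the NUMA placement code the same hysteresis the regular load balancer uses. A toy version of the inequality with made-up numbers (imbalance_pct is typically 125 for non-SMT domains):

#include <stdbool.h>
#include <stdio.h>

/* Same inequality as the hunk above, with illustrative parameter names. */
static bool src_busier(unsigned long src_load, unsigned long src_cap,
                       unsigned long dst_load, unsigned long dst_cap,
                       unsigned int imbalance_pct)
{
        return src_load * dst_cap * imbalance_pct >
               dst_load * src_cap * 100;
}

int main(void)
{
        /* Equal capacities: 850*1024*125 > 1000*1024*100, so the source
         * already counts as busier at 85% of the destination's load... */
        printf("%d\n", src_busier(850, 1024, 1000, 1024, 125));
        /* ...whereas the old form (no margin, i.e. 100) still says no. */
        printf("%d\n", src_busier(850, 1024, 1000, 1024, 100));
        return 0;
}

With a 25% margin the break-even point is 80% of the destination's load; the old form required the source's normalized load to be strictly higher.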
@@ -1702,8 +1708,8 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1702 delta = runtime - p->last_sum_exec_runtime; 1708 delta = runtime - p->last_sum_exec_runtime;
1703 *period = now - p->last_task_numa_placement; 1709 *period = now - p->last_task_numa_placement;
1704 } else { 1710 } else {
1705 delta = p->se.avg.runnable_avg_sum; 1711 delta = p->se.avg.load_sum / p->se.load.weight;
1706 *period = p->se.avg.avg_period; 1712 *period = LOAD_AVG_MAX;
1707 } 1713 }
1708 1714
1709 p->last_sum_exec_runtime = runtime; 1715 p->last_sum_exec_runtime = runtime;
@@ -2351,13 +2357,13 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
2351 long tg_weight; 2357 long tg_weight;
2352 2358
2353 /* 2359 /*
2354 * Use this CPU's actual weight instead of the last load_contribution 2360 * Use this CPU's real-time load instead of the last load contribution
2355 * to gain a more accurate current total weight. See 2361 * as the updating of the contribution is delayed, and we will use the
2356 * update_cfs_rq_load_contribution(). 2362 * real-time load to calc the share. See update_tg_load_avg().
2357 */ 2363 */
2358 tg_weight = atomic_long_read(&tg->load_avg); 2364 tg_weight = atomic_long_read(&tg->load_avg);
2359 tg_weight -= cfs_rq->tg_load_contrib; 2365 tg_weight -= cfs_rq->tg_load_avg_contrib;
2360 tg_weight += cfs_rq->load.weight; 2366 tg_weight += cfs_rq_load_avg(cfs_rq);
2361 2367
2362 return tg_weight; 2368 return tg_weight;
2363} 2369}
@@ -2367,7 +2373,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2367 long tg_weight, load, shares; 2373 long tg_weight, load, shares;
2368 2374
2369 tg_weight = calc_tg_weight(tg, cfs_rq); 2375 tg_weight = calc_tg_weight(tg, cfs_rq);
2370 load = cfs_rq->load.weight; 2376 load = cfs_rq_load_avg(cfs_rq);
2371 2377
2372 shares = (tg->shares * load); 2378 shares = (tg->shares * load);
2373 if (tg_weight) 2379 if (tg_weight)
@@ -2429,14 +2435,6 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
2429#endif /* CONFIG_FAIR_GROUP_SCHED */ 2435#endif /* CONFIG_FAIR_GROUP_SCHED */
2430 2436
2431#ifdef CONFIG_SMP 2437#ifdef CONFIG_SMP
2432/*
2433 * We choose a half-life close to 1 scheduling period.
2434 * Note: The tables below are dependent on this value.
2435 */
2436#define LOAD_AVG_PERIOD 32
2437#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
2438#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
2439
2440/* Precomputed fixed inverse multiplies for multiplication by y^n */ 2438/* Precomputed fixed inverse multiplies for multiplication by y^n */
2441static const u32 runnable_avg_yN_inv[] = { 2439static const u32 runnable_avg_yN_inv[] = {
2442 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, 2440 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
@@ -2485,9 +2483,8 @@ static __always_inline u64 decay_load(u64 val, u64 n)
2485 local_n %= LOAD_AVG_PERIOD; 2483 local_n %= LOAD_AVG_PERIOD;
2486 } 2484 }
2487 2485
2488 val *= runnable_avg_yN_inv[local_n]; 2486 val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
2489 /* We don't use SRR here since we always want to round down. */ 2487 return val;
2490 return val >> 32;
2491} 2488}
2492 2489
2493/* 2490/*
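With per-entity load_sum now carrying the weight, the 64-bit value fed to decay_load() can be large enough that the old "val *= inv; val >>= 32" pattern would overflow, hence the switch to mul_u64_u32_shr(). A portable sketch of that helper for the 32-bit shift used here (plain C, not the kernel's math64.h implementation):

#include <stdint.h>

/* Full 64x32-bit product shifted right by 32, without ever forming a value
 * wider than 64 bits: split val into 32-bit halves and recombine. */
static uint64_t mul_u64_u32_shr32(uint64_t val, uint32_t mul)
{
        uint64_t lo = (uint32_t)val;
        uint64_t hi = val >> 32;

        return hi * mul + (lo * mul >> 32);
}

decay_load() still handles whole half-lives with val >>= n / 32 first; only the remaining n % 32 periods go through the fixed-point multiply against runnable_avg_yN_inv[].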
@@ -2546,23 +2543,22 @@ static u32 __compute_runnable_contrib(u64 n)
2546 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) 2543 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
2547 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] 2544 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
2548 */ 2545 */
2549static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, 2546static __always_inline int
2550 struct sched_avg *sa, 2547__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2551 int runnable, 2548 unsigned long weight, int running, struct cfs_rq *cfs_rq)
2552 int running)
2553{ 2549{
2554 u64 delta, periods; 2550 u64 delta, periods;
2555 u32 runnable_contrib; 2551 u32 contrib;
2556 int delta_w, decayed = 0; 2552 int delta_w, decayed = 0;
2557 unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); 2553 unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);
2558 2554
2559 delta = now - sa->last_runnable_update; 2555 delta = now - sa->last_update_time;
2560 /* 2556 /*
2561 * This should only happen when time goes backwards, which it 2557 * This should only happen when time goes backwards, which it
2562 * unfortunately does during sched clock init when we swap over to TSC. 2558 * unfortunately does during sched clock init when we swap over to TSC.
2563 */ 2559 */
2564 if ((s64)delta < 0) { 2560 if ((s64)delta < 0) {
2565 sa->last_runnable_update = now; 2561 sa->last_update_time = now;
2566 return 0; 2562 return 0;
2567 } 2563 }
2568 2564
@@ -2573,26 +2569,29 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
2573 delta >>= 10; 2569 delta >>= 10;
2574 if (!delta) 2570 if (!delta)
2575 return 0; 2571 return 0;
2576 sa->last_runnable_update = now; 2572 sa->last_update_time = now;
2577 2573
2578 /* delta_w is the amount already accumulated against our next period */ 2574 /* delta_w is the amount already accumulated against our next period */
2579 delta_w = sa->avg_period % 1024; 2575 delta_w = sa->period_contrib;
2580 if (delta + delta_w >= 1024) { 2576 if (delta + delta_w >= 1024) {
2581 /* period roll-over */
2582 decayed = 1; 2577 decayed = 1;
2583 2578
2579 /* how much carries over into the next period is not known yet; start from 0 */
2580 sa->period_contrib = 0;
2581
2584 /* 2582 /*
2585 * Now that we know we're crossing a period boundary, figure 2583 * Now that we know we're crossing a period boundary, figure
2586 * out how much from delta we need to complete the current 2584 * out how much from delta we need to complete the current
2587 * period and accrue it. 2585 * period and accrue it.
2588 */ 2586 */
2589 delta_w = 1024 - delta_w; 2587 delta_w = 1024 - delta_w;
2590 if (runnable) 2588 if (weight) {
2591 sa->runnable_avg_sum += delta_w; 2589 sa->load_sum += weight * delta_w;
2590 if (cfs_rq)
2591 cfs_rq->runnable_load_sum += weight * delta_w;
2592 }
2592 if (running) 2593 if (running)
2593 sa->running_avg_sum += delta_w * scale_freq 2594 sa->util_sum += delta_w * scale_freq >> SCHED_CAPACITY_SHIFT;
2594 >> SCHED_CAPACITY_SHIFT;
2595 sa->avg_period += delta_w;
2596 2595
2597 delta -= delta_w; 2596 delta -= delta_w;
2598 2597
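The accrual bookkeeping is unchanged in shape: a new delta is split into the slice that completes the current 1024 us period, some number of whole periods, and a remainder that becomes the new period_contrib; only the per-slice contributions are now multiplied by the entity weight. A worked split with made-up numbers:

#include <stdio.h>

int main(void)
{
        /* Illustrative numbers only: 800 us already accrued in this period,
         * 1500 us of new delta to account (1 period = 1024 us). */
        unsigned int period_contrib = 800, delta = 1500;

        unsigned int delta_w = 1024 - period_contrib;   /* 224 us closes the period */
        delta -= delta_w;                               /* 1276 us left             */
        unsigned int periods = delta / 1024;            /* 1 full period            */
        delta %= 1024;                                  /* 252 us starts the next   */

        printf("%u + %u*1024 + %u\n", delta_w, periods, delta);
        return 0;
}

224 us closes the period that was already 800 us full, one whole period follows, and 252 us is carried as the next period_contrib, matching the delta_w/periods/delta variables in the hunk.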
@@ -2600,341 +2599,186 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
2600 periods = delta / 1024; 2599 periods = delta / 1024;
2601 delta %= 1024; 2600 delta %= 1024;
2602 2601
2603 sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, 2602 sa->load_sum = decay_load(sa->load_sum, periods + 1);
2604 periods + 1); 2603 if (cfs_rq) {
2605 sa->running_avg_sum = decay_load(sa->running_avg_sum, 2604 cfs_rq->runnable_load_sum =
2606 periods + 1); 2605 decay_load(cfs_rq->runnable_load_sum, periods + 1);
2607 sa->avg_period = decay_load(sa->avg_period, 2606 }
2608 periods + 1); 2607 sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
2609 2608
2610 /* Efficiently calculate \sum (1..n_period) 1024*y^i */ 2609 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
2611 runnable_contrib = __compute_runnable_contrib(periods); 2610 contrib = __compute_runnable_contrib(periods);
2612 if (runnable) 2611 if (weight) {
2613 sa->runnable_avg_sum += runnable_contrib; 2612 sa->load_sum += weight * contrib;
2613 if (cfs_rq)
2614 cfs_rq->runnable_load_sum += weight * contrib;
2615 }
2614 if (running) 2616 if (running)
2615 sa->running_avg_sum += runnable_contrib * scale_freq 2617 sa->util_sum += contrib * scale_freq >> SCHED_CAPACITY_SHIFT;
2616 >> SCHED_CAPACITY_SHIFT;
2617 sa->avg_period += runnable_contrib;
2618 } 2618 }
2619 2619
2620 /* Remainder of delta accrued against u_0` */ 2620 /* Remainder of delta accrued against u_0` */
2621 if (runnable) 2621 if (weight) {
2622 sa->runnable_avg_sum += delta; 2622 sa->load_sum += weight * delta;
2623 if (cfs_rq)
2624 cfs_rq->runnable_load_sum += weight * delta;
2625 }
2623 if (running) 2626 if (running)
2624 sa->running_avg_sum += delta * scale_freq 2627 sa->util_sum += delta * scale_freq >> SCHED_CAPACITY_SHIFT;
2625 >> SCHED_CAPACITY_SHIFT;
2626 sa->avg_period += delta;
2627
2628 return decayed;
2629}
2630 2628
2631/* Synchronize an entity's decay with its parenting cfs_rq.*/ 2629 sa->period_contrib += delta;
2632static inline u64 __synchronize_entity_decay(struct sched_entity *se)
2633{
2634 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2635 u64 decays = atomic64_read(&cfs_rq->decay_counter);
2636
2637 decays -= se->avg.decay_count;
2638 se->avg.decay_count = 0;
2639 if (!decays)
2640 return 0;
2641 2630
2642 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); 2631 if (decayed) {
2643 se->avg.utilization_avg_contrib = 2632 sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
2644 decay_load(se->avg.utilization_avg_contrib, decays); 2633 if (cfs_rq) {
2634 cfs_rq->runnable_load_avg =
2635 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
2636 }
2637 sa->util_avg = (sa->util_sum << SCHED_LOAD_SHIFT) / LOAD_AVG_MAX;
2638 }
2645 2639
2646 return decays; 2640 return decayed;
2647} 2641}
2648 2642
2649#ifdef CONFIG_FAIR_GROUP_SCHED 2643#ifdef CONFIG_FAIR_GROUP_SCHED
2650static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
2651 int force_update)
2652{
2653 struct task_group *tg = cfs_rq->tg;
2654 long tg_contrib;
2655
2656 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
2657 tg_contrib -= cfs_rq->tg_load_contrib;
2658
2659 if (!tg_contrib)
2660 return;
2661
2662 if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
2663 atomic_long_add(tg_contrib, &tg->load_avg);
2664 cfs_rq->tg_load_contrib += tg_contrib;
2665 }
2666}
2667
2668/* 2644/*
2669 * Aggregate cfs_rq runnable averages into an equivalent task_group 2645 * Updating tg's load_avg is necessary before update_cfs_share (which is done)
2670 * representation for computing load contributions. 2646 * and effective_load (which is not done because it is too costly).
2671 */ 2647 */
2672static inline void __update_tg_runnable_avg(struct sched_avg *sa, 2648static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
2673 struct cfs_rq *cfs_rq)
2674{ 2649{
2675 struct task_group *tg = cfs_rq->tg; 2650 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
2676 long contrib;
2677 2651
2678 /* The fraction of a cpu used by this cfs_rq */ 2652 if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
2679 contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, 2653 atomic_long_add(delta, &cfs_rq->tg->load_avg);
2680 sa->avg_period + 1); 2654 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
2681 contrib -= cfs_rq->tg_runnable_contrib;
2682
2683 if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
2684 atomic_add(contrib, &tg->runnable_avg);
2685 cfs_rq->tg_runnable_contrib += contrib;
2686 } 2655 }
2687} 2656}
2688 2657
2689static inline void __update_group_entity_contrib(struct sched_entity *se)
2690{
2691 struct cfs_rq *cfs_rq = group_cfs_rq(se);
2692 struct task_group *tg = cfs_rq->tg;
2693 int runnable_avg;
2694
2695 u64 contrib;
2696
2697 contrib = cfs_rq->tg_load_contrib * tg->shares;
2698 se->avg.load_avg_contrib = div_u64(contrib,
2699 atomic_long_read(&tg->load_avg) + 1);
2700
2701 /*
2702 * For group entities we need to compute a correction term in the case
2703 * that they are consuming <1 cpu so that we would contribute the same
2704 * load as a task of equal weight.
2705 *
2706 * Explicitly co-ordinating this measurement would be expensive, but
2707 * fortunately the sum of each cpus contribution forms a usable
2708 * lower-bound on the true value.
2709 *
2710 * Consider the aggregate of 2 contributions. Either they are disjoint
2711 * (and the sum represents true value) or they are disjoint and we are
2712 * understating by the aggregate of their overlap.
2713 *
2714 * Extending this to N cpus, for a given overlap, the maximum amount we
2715 * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
2716 * cpus that overlap for this interval and w_i is the interval width.
2717 *
2718 * On a small machine; the first term is well-bounded which bounds the
2719 * total error since w_i is a subset of the period. Whereas on a
2720 * larger machine, while this first term can be larger, if w_i is the
2721 * of consequential size guaranteed to see n_i*w_i quickly converge to
2722 * our upper bound of 1-cpu.
2723 */
2724 runnable_avg = atomic_read(&tg->runnable_avg);
2725 if (runnable_avg < NICE_0_LOAD) {
2726 se->avg.load_avg_contrib *= runnable_avg;
2727 se->avg.load_avg_contrib >>= NICE_0_SHIFT;
2728 }
2729}
2730
2731static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
2732{
2733 __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg,
2734 runnable, runnable);
2735 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
2736}
2737#else /* CONFIG_FAIR_GROUP_SCHED */ 2658#else /* CONFIG_FAIR_GROUP_SCHED */
2738static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, 2659static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
2739 int force_update) {}
2740static inline void __update_tg_runnable_avg(struct sched_avg *sa,
2741 struct cfs_rq *cfs_rq) {}
2742static inline void __update_group_entity_contrib(struct sched_entity *se) {}
2743static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
2744#endif /* CONFIG_FAIR_GROUP_SCHED */ 2660#endif /* CONFIG_FAIR_GROUP_SCHED */
2745 2661
2746static inline void __update_task_entity_contrib(struct sched_entity *se) 2662static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
2747{
2748 u32 contrib;
2749
2750 /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
2751 contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
2752 contrib /= (se->avg.avg_period + 1);
2753 se->avg.load_avg_contrib = scale_load(contrib);
2754}
2755 2663
2756/* Compute the current contribution to load_avg by se, return any delta */ 2664/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
2757static long __update_entity_load_avg_contrib(struct sched_entity *se) 2665static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
2758{ 2666{
2759 long old_contrib = se->avg.load_avg_contrib; 2667 int decayed;
2668 struct sched_avg *sa = &cfs_rq->avg;
2760 2669
2761 if (entity_is_task(se)) { 2670 if (atomic_long_read(&cfs_rq->removed_load_avg)) {
2762 __update_task_entity_contrib(se); 2671 long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
2763 } else { 2672 sa->load_avg = max_t(long, sa->load_avg - r, 0);
2764 __update_tg_runnable_avg(&se->avg, group_cfs_rq(se)); 2673 sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
2765 __update_group_entity_contrib(se);
2766 } 2674 }
2767 2675
2768 return se->avg.load_avg_contrib - old_contrib; 2676 if (atomic_long_read(&cfs_rq->removed_util_avg)) {
2769} 2677 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
2770 2678 sa->util_avg = max_t(long, sa->util_avg - r, 0);
2771 2679 sa->util_sum = max_t(s32, sa->util_sum -
2772static inline void __update_task_entity_utilization(struct sched_entity *se) 2680 ((r * LOAD_AVG_MAX) >> SCHED_LOAD_SHIFT), 0);
2773{ 2681 }
2774 u32 contrib;
2775 2682
2776 /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ 2683 decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
2777 contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE); 2684 scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
2778 contrib /= (se->avg.avg_period + 1);
2779 se->avg.utilization_avg_contrib = scale_load(contrib);
2780}
2781 2685
2782static long __update_entity_utilization_avg_contrib(struct sched_entity *se) 2686#ifndef CONFIG_64BIT
2783{ 2687 smp_wmb();
2784 long old_contrib = se->avg.utilization_avg_contrib; 2688 cfs_rq->load_last_update_time_copy = sa->last_update_time;
2785 2689#endif
2786 if (entity_is_task(se))
2787 __update_task_entity_utilization(se);
2788 else
2789 se->avg.utilization_avg_contrib =
2790 group_cfs_rq(se)->utilization_load_avg;
2791 2690
2792 return se->avg.utilization_avg_contrib - old_contrib; 2691 return decayed;
2793} 2692}
2794 2693
2795static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, 2694/* Update task and its cfs_rq load average */
2796 long load_contrib) 2695static inline void update_load_avg(struct sched_entity *se, int update_tg)
2797{
2798 if (likely(load_contrib < cfs_rq->blocked_load_avg))
2799 cfs_rq->blocked_load_avg -= load_contrib;
2800 else
2801 cfs_rq->blocked_load_avg = 0;
2802}
2803
2804static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
2805
2806/* Update a sched_entity's runnable average */
2807static inline void update_entity_load_avg(struct sched_entity *se,
2808 int update_cfs_rq)
2809{ 2696{
2810 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2697 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2811 long contrib_delta, utilization_delta;
2812 int cpu = cpu_of(rq_of(cfs_rq)); 2698 int cpu = cpu_of(rq_of(cfs_rq));
2813 u64 now; 2699 u64 now = cfs_rq_clock_task(cfs_rq);
2814 2700
2815 /* 2701 /*
2816 * For a group entity we need to use their owned cfs_rq_clock_task() in 2702 * Track task load average for carrying it to the new CPU after it is migrated, and
2817 * case they are the parent of a throttled hierarchy. 2703 * track group sched_entity load average for task_h_load calc in migration
2818 */ 2704 */
2819 if (entity_is_task(se)) 2705 __update_load_avg(now, cpu, &se->avg,
2820 now = cfs_rq_clock_task(cfs_rq); 2706 se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL);
2821 else
2822 now = cfs_rq_clock_task(group_cfs_rq(se));
2823
2824 if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq,
2825 cfs_rq->curr == se))
2826 return;
2827
2828 contrib_delta = __update_entity_load_avg_contrib(se);
2829 utilization_delta = __update_entity_utilization_avg_contrib(se);
2830
2831 if (!update_cfs_rq)
2832 return;
2833 2707
2834 if (se->on_rq) { 2708 if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
2835 cfs_rq->runnable_load_avg += contrib_delta; 2709 update_tg_load_avg(cfs_rq, 0);
2836 cfs_rq->utilization_load_avg += utilization_delta;
2837 } else {
2838 subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
2839 }
2840} 2710}
2841 2711
2842/* 2712/* Add the load generated by se into cfs_rq's load average */
2843 * Decay the load contributed by all blocked children and account this so that 2713static inline void
2844 * their contribution may appropriately discounted when they wake up. 2714enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2845 */
2846static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
2847{ 2715{
2848 u64 now = cfs_rq_clock_task(cfs_rq) >> 20; 2716 struct sched_avg *sa = &se->avg;
2849 u64 decays; 2717 u64 now = cfs_rq_clock_task(cfs_rq);
2850 2718 int migrated = 0, decayed;
2851 decays = now - cfs_rq->last_decay;
2852 if (!decays && !force_update)
2853 return;
2854 2719
2855 if (atomic_long_read(&cfs_rq->removed_load)) { 2720 if (sa->last_update_time == 0) {
2856 unsigned long removed_load; 2721 sa->last_update_time = now;
2857 removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0); 2722 migrated = 1;
2858 subtract_blocked_load_contrib(cfs_rq, removed_load);
2859 } 2723 }
2724 else {
2725 __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
2726 se->on_rq * scale_load_down(se->load.weight),
2727 cfs_rq->curr == se, NULL);
2728 }
2729
2730 decayed = update_cfs_rq_load_avg(now, cfs_rq);
2860 2731
2861 if (decays) { 2732 cfs_rq->runnable_load_avg += sa->load_avg;
2862 cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, 2733 cfs_rq->runnable_load_sum += sa->load_sum;
2863 decays); 2734
2864 atomic64_add(decays, &cfs_rq->decay_counter); 2735 if (migrated) {
2865 cfs_rq->last_decay = now; 2736 cfs_rq->avg.load_avg += sa->load_avg;
2737 cfs_rq->avg.load_sum += sa->load_sum;
2738 cfs_rq->avg.util_avg += sa->util_avg;
2739 cfs_rq->avg.util_sum += sa->util_sum;
2866 } 2740 }
2867 2741
2868 __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); 2742 if (decayed || migrated)
2743 update_tg_load_avg(cfs_rq, 0);
2869} 2744}
2870 2745
2871/* Add the load generated by se into cfs_rq's child load-average */ 2746/* Remove the runnable load generated by se from cfs_rq's runnable load average */
2872static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, 2747static inline void
2873 struct sched_entity *se, 2748dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2874 int wakeup)
2875{ 2749{
2876 /* 2750 update_load_avg(se, 1);
2877 * We track migrations using entity decay_count <= 0, on a wake-up
2878 * migration we use a negative decay count to track the remote decays
2879 * accumulated while sleeping.
2880 *
2881 * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
2882 * are seen by enqueue_entity_load_avg() as a migration with an already
2883 * constructed load_avg_contrib.
2884 */
2885 if (unlikely(se->avg.decay_count <= 0)) {
2886 se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
2887 if (se->avg.decay_count) {
2888 /*
2889 * In a wake-up migration we have to approximate the
2890 * time sleeping. This is because we can't synchronize
2891 * clock_task between the two cpus, and it is not
2892 * guaranteed to be read-safe. Instead, we can
2893 * approximate this using our carried decays, which are
2894 * explicitly atomically readable.
2895 */
2896 se->avg.last_runnable_update -= (-se->avg.decay_count)
2897 << 20;
2898 update_entity_load_avg(se, 0);
2899 /* Indicate that we're now synchronized and on-rq */
2900 se->avg.decay_count = 0;
2901 }
2902 wakeup = 0;
2903 } else {
2904 __synchronize_entity_decay(se);
2905 }
2906 2751
2907 /* migrated tasks did not contribute to our blocked load */ 2752 cfs_rq->runnable_load_avg =
2908 if (wakeup) { 2753 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
2909 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); 2754 cfs_rq->runnable_load_sum =
2910 update_entity_load_avg(se, 0); 2755 max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
2911 }
2912
2913 cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
2914 cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib;
2915 /* we force update consideration on load-balancer moves */
2916 update_cfs_rq_blocked_load(cfs_rq, !wakeup);
2917} 2756}
2918 2757
2919/* 2758/*
2920 * Remove se's load from this cfs_rq child load-average, if the entity is 2759 * Task first catches up with cfs_rq, and then subtract
2921 * transitioning to a blocked state we track its projected decay using 2760 * itself from the cfs_rq (task must be off the queue now).
2922 * blocked_load_avg.
2923 */ 2761 */
2924static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, 2762void remove_entity_load_avg(struct sched_entity *se)
2925 struct sched_entity *se,
2926 int sleep)
2927{ 2763{
2928 update_entity_load_avg(se, 1); 2764 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2929 /* we force update consideration on load-balancer moves */ 2765 u64 last_update_time;
2930 update_cfs_rq_blocked_load(cfs_rq, !sleep); 2766
2767#ifndef CONFIG_64BIT
2768 u64 last_update_time_copy;
2931 2769
2932 cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; 2770 do {
2933 cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib; 2771 last_update_time_copy = cfs_rq->load_last_update_time_copy;
2934 if (sleep) { 2772 smp_rmb();
2935 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; 2773 last_update_time = cfs_rq->avg.last_update_time;
2936 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); 2774 } while (last_update_time != last_update_time_copy);
2937 } /* migrations, e.g. sleep=0 leave decay_count == 0 */ 2775#else
2776 last_update_time = cfs_rq->avg.last_update_time;
2777#endif
2778
2779 __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
2780 atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
2781 atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
2938} 2782}
2939 2783
2940/* 2784/*
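remove_entity_load_avg() runs without the destination runqueue's lock, so a departing entity parks its contribution in removed_load_avg/removed_util_avg and update_cfs_rq_load_avg() folds it in on the next locked update. A compressed sketch of that handoff with C11 atomics (toy types and names, not the kernel API):

#include <stdatomic.h>

struct toy_cfs_rq {
        long load_avg;                  /* written only under the queue's lock  */
        atomic_long removed_load_avg;   /* written by remote, migrating tasks   */
};

static void toy_remove_entity(struct toy_cfs_rq *cfs_rq, long entity_load_avg)
{
        atomic_fetch_add(&cfs_rq->removed_load_avg, entity_load_avg);
}

static void toy_fold_removed(struct toy_cfs_rq *cfs_rq)
{
        long r = atomic_exchange(&cfs_rq->removed_load_avg, 0);

        /* clamp at zero: rounding means the sum of parts can exceed the whole */
        cfs_rq->load_avg = cfs_rq->load_avg > r ? cfs_rq->load_avg - r : 0;
}

The clamp mirrors the max_t() calls above: because the averages are maintained with rounding, the removed parts can momentarily add up to more than the aggregate.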
@@ -2944,7 +2788,6 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2944 */ 2788 */
2945void idle_enter_fair(struct rq *this_rq) 2789void idle_enter_fair(struct rq *this_rq)
2946{ 2790{
2947 update_rq_runnable_avg(this_rq, 1);
2948} 2791}
2949 2792
2950/* 2793/*
@@ -2954,24 +2797,28 @@ void idle_enter_fair(struct rq *this_rq)
2954 */ 2797 */
2955void idle_exit_fair(struct rq *this_rq) 2798void idle_exit_fair(struct rq *this_rq)
2956{ 2799{
2957 update_rq_runnable_avg(this_rq, 0); 2800}
2801
2802static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
2803{
2804 return cfs_rq->runnable_load_avg;
2805}
2806
2807static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
2808{
2809 return cfs_rq->avg.load_avg;
2958} 2810}
2959 2811
2960static int idle_balance(struct rq *this_rq); 2812static int idle_balance(struct rq *this_rq);
2961 2813
2962#else /* CONFIG_SMP */ 2814#else /* CONFIG_SMP */
2963 2815
2964static inline void update_entity_load_avg(struct sched_entity *se, 2816static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
2965 int update_cfs_rq) {} 2817static inline void
2966static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} 2818enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2967static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, 2819static inline void
2968 struct sched_entity *se, 2820dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2969 int wakeup) {} 2821static inline void remove_entity_load_avg(struct sched_entity *se) {}
2970static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2971 struct sched_entity *se,
2972 int sleep) {}
2973static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
2974 int force_update) {}
2975 2822
2976static inline int idle_balance(struct rq *rq) 2823static inline int idle_balance(struct rq *rq)
2977{ 2824{
@@ -3103,7 +2950,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3103 * Update run-time statistics of the 'current'. 2950 * Update run-time statistics of the 'current'.
3104 */ 2951 */
3105 update_curr(cfs_rq); 2952 update_curr(cfs_rq);
3106 enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); 2953 enqueue_entity_load_avg(cfs_rq, se);
3107 account_entity_enqueue(cfs_rq, se); 2954 account_entity_enqueue(cfs_rq, se);
3108 update_cfs_shares(cfs_rq); 2955 update_cfs_shares(cfs_rq);
3109 2956
@@ -3178,7 +3025,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3178 * Update run-time statistics of the 'current'. 3025 * Update run-time statistics of the 'current'.
3179 */ 3026 */
3180 update_curr(cfs_rq); 3027 update_curr(cfs_rq);
3181 dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); 3028 dequeue_entity_load_avg(cfs_rq, se);
3182 3029
3183 update_stats_dequeue(cfs_rq, se); 3030 update_stats_dequeue(cfs_rq, se);
3184 if (flags & DEQUEUE_SLEEP) { 3031 if (flags & DEQUEUE_SLEEP) {
@@ -3268,7 +3115,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
3268 */ 3115 */
3269 update_stats_wait_end(cfs_rq, se); 3116 update_stats_wait_end(cfs_rq, se);
3270 __dequeue_entity(cfs_rq, se); 3117 __dequeue_entity(cfs_rq, se);
3271 update_entity_load_avg(se, 1); 3118 update_load_avg(se, 1);
3272 } 3119 }
3273 3120
3274 update_stats_curr_start(cfs_rq, se); 3121 update_stats_curr_start(cfs_rq, se);
@@ -3368,7 +3215,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
3368 /* Put 'current' back into the tree. */ 3215 /* Put 'current' back into the tree. */
3369 __enqueue_entity(cfs_rq, prev); 3216 __enqueue_entity(cfs_rq, prev);
3370 /* in !on_rq case, update occurred at dequeue */ 3217 /* in !on_rq case, update occurred at dequeue */
3371 update_entity_load_avg(prev, 1); 3218 update_load_avg(prev, 0);
3372 } 3219 }
3373 cfs_rq->curr = NULL; 3220 cfs_rq->curr = NULL;
3374} 3221}
@@ -3384,8 +3231,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
3384 /* 3231 /*
3385 * Ensure that runnable average is periodically updated. 3232 * Ensure that runnable average is periodically updated.
3386 */ 3233 */
3387 update_entity_load_avg(curr, 1); 3234 update_load_avg(curr, 1);
3388 update_cfs_rq_blocked_load(cfs_rq, 1);
3389 update_cfs_shares(cfs_rq); 3235 update_cfs_shares(cfs_rq);
3390 3236
3391#ifdef CONFIG_SCHED_HRTICK 3237#ifdef CONFIG_SCHED_HRTICK
@@ -3683,7 +3529,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3683 cfs_rq->throttled = 1; 3529 cfs_rq->throttled = 1;
3684 cfs_rq->throttled_clock = rq_clock(rq); 3530 cfs_rq->throttled_clock = rq_clock(rq);
3685 raw_spin_lock(&cfs_b->lock); 3531 raw_spin_lock(&cfs_b->lock);
3686 empty = list_empty(&cfs_rq->throttled_list); 3532 empty = list_empty(&cfs_b->throttled_cfs_rq);
3687 3533
3688 /* 3534 /*
3689 * Add to the _head_ of the list, so that an already-started 3535 * Add to the _head_ of the list, so that an already-started
@@ -4258,14 +4104,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4258 if (cfs_rq_throttled(cfs_rq)) 4104 if (cfs_rq_throttled(cfs_rq))
4259 break; 4105 break;
4260 4106
4107 update_load_avg(se, 1);
4261 update_cfs_shares(cfs_rq); 4108 update_cfs_shares(cfs_rq);
4262 update_entity_load_avg(se, 1);
4263 } 4109 }
4264 4110
4265 if (!se) { 4111 if (!se)
4266 update_rq_runnable_avg(rq, rq->nr_running);
4267 add_nr_running(rq, 1); 4112 add_nr_running(rq, 1);
4268 } 4113
4269 hrtick_update(rq); 4114 hrtick_update(rq);
4270} 4115}
4271 4116
@@ -4319,14 +4164,13 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4319 if (cfs_rq_throttled(cfs_rq)) 4164 if (cfs_rq_throttled(cfs_rq))
4320 break; 4165 break;
4321 4166
4167 update_load_avg(se, 1);
4322 update_cfs_shares(cfs_rq); 4168 update_cfs_shares(cfs_rq);
4323 update_entity_load_avg(se, 1);
4324 } 4169 }
4325 4170
4326 if (!se) { 4171 if (!se)
4327 sub_nr_running(rq, 1); 4172 sub_nr_running(rq, 1);
4328 update_rq_runnable_avg(rq, 1); 4173
4329 }
4330 hrtick_update(rq); 4174 hrtick_update(rq);
4331} 4175}
4332 4176
@@ -4439,6 +4283,12 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
4439 sched_avg_update(this_rq); 4283 sched_avg_update(this_rq);
4440} 4284}
4441 4285
4286/* Used instead of source_load when we know the type == 0 */
4287static unsigned long weighted_cpuload(const int cpu)
4288{
4289 return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
4290}
4291
4442#ifdef CONFIG_NO_HZ_COMMON 4292#ifdef CONFIG_NO_HZ_COMMON
4443/* 4293/*
4444 * There is no sane way to deal with nohz on smp when using jiffies because the 4294 * There is no sane way to deal with nohz on smp when using jiffies because the
@@ -4460,7 +4310,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
4460static void update_idle_cpu_load(struct rq *this_rq) 4310static void update_idle_cpu_load(struct rq *this_rq)
4461{ 4311{
4462 unsigned long curr_jiffies = READ_ONCE(jiffies); 4312 unsigned long curr_jiffies = READ_ONCE(jiffies);
4463 unsigned long load = this_rq->cfs.runnable_load_avg; 4313 unsigned long load = weighted_cpuload(cpu_of(this_rq));
4464 unsigned long pending_updates; 4314 unsigned long pending_updates;
4465 4315
4466 /* 4316 /*
@@ -4506,7 +4356,7 @@ void update_cpu_load_nohz(void)
4506 */ 4356 */
4507void update_cpu_load_active(struct rq *this_rq) 4357void update_cpu_load_active(struct rq *this_rq)
4508{ 4358{
4509 unsigned long load = this_rq->cfs.runnable_load_avg; 4359 unsigned long load = weighted_cpuload(cpu_of(this_rq));
4510 /* 4360 /*
4511 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). 4361 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
4512 */ 4362 */
@@ -4514,12 +4364,6 @@ void update_cpu_load_active(struct rq *this_rq)
4514 __update_cpu_load(this_rq, load, 1); 4364 __update_cpu_load(this_rq, load, 1);
4515} 4365}
4516 4366
4517/* Used instead of source_load when we know the type == 0 */
4518static unsigned long weighted_cpuload(const int cpu)
4519{
4520 return cpu_rq(cpu)->cfs.runnable_load_avg;
4521}
4522
4523/* 4367/*
4524 * Return a low guess at the load of a migration-source cpu weighted 4368 * Return a low guess at the load of a migration-source cpu weighted
4525 * according to the scheduling class and "nice" value. 4369 * according to the scheduling class and "nice" value.
@@ -4567,7 +4411,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
4567{ 4411{
4568 struct rq *rq = cpu_rq(cpu); 4412 struct rq *rq = cpu_rq(cpu);
4569 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); 4413 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
4570 unsigned long load_avg = rq->cfs.runnable_load_avg; 4414 unsigned long load_avg = weighted_cpuload(cpu);
4571 4415
4572 if (nr_running) 4416 if (nr_running)
4573 return load_avg / nr_running; 4417 return load_avg / nr_running;
@@ -4686,7 +4530,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4686 /* 4530 /*
4687 * w = rw_i + @wl 4531 * w = rw_i + @wl
4688 */ 4532 */
4689 w = se->my_q->load.weight + wl; 4533 w = cfs_rq_load_avg(se->my_q) + wl;
4690 4534
4691 /* 4535 /*
4692 * wl = S * s'_i; see (2) 4536 * wl = S * s'_i; see (2)
@@ -4707,7 +4551,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4707 /* 4551 /*
4708 * wl = dw_i = S * (s'_i - s_i); see (3) 4552 * wl = dw_i = S * (s'_i - s_i); see (3)
4709 */ 4553 */
4710 wl -= se->load.weight; 4554 wl -= se->avg.load_avg;
4711 4555
4712 /* 4556 /*
4713 * Recursively apply this logic to all parent groups to compute 4557 * Recursively apply this logic to all parent groups to compute
@@ -4730,26 +4574,29 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4730 4574
4731#endif 4575#endif
4732 4576
4577/*
4578 * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
4579 * A waker of many should wake a different task than the one last awakened
4580 * at a frequency roughly N times higher than one of its wakees. In order
4581 * to determine whether we should let the load spread vs consolidating to
4582 * shared cache, we look for a minimum 'flip' frequency of llc_size in one
4583 * partner, and a factor of llc_size higher frequency in the other. With
4584 * both conditions met, we can be relatively sure that the relationship is
4585 * non-monogamous, with partner count exceeding socket size. Waker/wakee
4586 * being client/server, worker/dispatcher, interrupt source or whatever is
4587 * irrelevant; the spread criterion is that apparent partner count exceeds socket size.
4588 */
4733static int wake_wide(struct task_struct *p) 4589static int wake_wide(struct task_struct *p)
4734{ 4590{
4591 unsigned int master = current->wakee_flips;
4592 unsigned int slave = p->wakee_flips;
4735 int factor = this_cpu_read(sd_llc_size); 4593 int factor = this_cpu_read(sd_llc_size);
4736 4594
4737 /* 4595 if (master < slave)
4738 * Yeah, it's the switching-frequency, could means many wakee or 4596 swap(master, slave);
4739 * rapidly switch, use factor here will just help to automatically 4597 if (slave < factor || master < slave * factor)
4740 * adjust the loose-degree, so bigger node will lead to more pull. 4598 return 0;
4741 */ 4599 return 1;
4742 if (p->wakee_flips > factor) {
4743 /*
4744 * wakee is somewhat hot, it needs certain amount of cpu
4745 * resource, so if waker is far more hot, prefer to leave
4746 * it alone.
4747 */
4748 if (current->wakee_flips > (factor * p->wakee_flips))
4749 return 1;
4750 }
4751
4752 return 0;
4753} 4600}
4754 4601
4755static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 4602static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
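The new wake_wide() reduces the old two-branch test to a symmetric one: order the waker/wakee flip counts, then require the smaller to reach llc_size and the larger to exceed it by another factor of llc_size. A standalone restatement (factor stands in for this_cpu_read(sd_llc_size)):

#include <stdbool.h>

static bool wake_wide_toy(unsigned int master, unsigned int slave,
                          unsigned int factor)
{
        if (master < slave) {           /* order the two flip counts */
                unsigned int tmp = master;
                master = slave;
                slave = tmp;
        }
        /* "wide" only if the smaller partner flips at least llc_size times
         * and the larger flips at least llc_size times more often again */
        return slave >= factor && master >= slave * factor;
}

For example, with factor = 8, a dispatcher at 200 flips waking a worker at 12 flips is wide (12 >= 8 and 200 >= 96), so select_task_rq_fair() below skips the affine fast path for it, while a low-flip 1:1 pair is not.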
@@ -4761,13 +4608,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4761 unsigned long weight; 4608 unsigned long weight;
4762 int balanced; 4609 int balanced;
4763 4610
4764 /*
4765 * If we wake multiple tasks be careful to not bounce
4766 * ourselves around too much.
4767 */
4768 if (wake_wide(p))
4769 return 0;
4770
4771 idx = sd->wake_idx; 4611 idx = sd->wake_idx;
4772 this_cpu = smp_processor_id(); 4612 this_cpu = smp_processor_id();
4773 prev_cpu = task_cpu(p); 4613 prev_cpu = task_cpu(p);
@@ -4781,14 +4621,14 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4781 */ 4621 */
4782 if (sync) { 4622 if (sync) {
4783 tg = task_group(current); 4623 tg = task_group(current);
4784 weight = current->se.load.weight; 4624 weight = current->se.avg.load_avg;
4785 4625
4786 this_load += effective_load(tg, this_cpu, -weight, -weight); 4626 this_load += effective_load(tg, this_cpu, -weight, -weight);
4787 load += effective_load(tg, prev_cpu, 0, -weight); 4627 load += effective_load(tg, prev_cpu, 0, -weight);
4788 } 4628 }
4789 4629
4790 tg = task_group(p); 4630 tg = task_group(p);
4791 weight = p->se.load.weight; 4631 weight = p->se.avg.load_avg;
4792 4632
4793 /* 4633 /*
4794 * In low-load situations, where prev_cpu is idle and this_cpu is idle 4634 * In low-load situations, where prev_cpu is idle and this_cpu is idle
@@ -4981,12 +4821,12 @@ done:
4981 * tasks. The unit of the return value must be the one of capacity so we can 4821 * tasks. The unit of the return value must be the one of capacity so we can
4982 * compare the usage with the capacity of the CPU that is available for CFS 4822 * compare the usage with the capacity of the CPU that is available for CFS
4983 * task (ie cpu_capacity). 4823 * task (ie cpu_capacity).
4984 * cfs.utilization_load_avg is the sum of running time of runnable tasks on a 4824 * cfs.avg.util_avg is the sum of running time of runnable tasks on a
4985 * CPU. It represents the amount of utilization of a CPU in the range 4825 * CPU. It represents the amount of utilization of a CPU in the range
4986 * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full 4826 * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full
4987 * capacity of the CPU because it's about the running time on this CPU. 4827 * capacity of the CPU because it's about the running time on this CPU.
4988 * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE 4828 * Nevertheless, cfs.avg.util_avg can be higher than SCHED_LOAD_SCALE
4989 * because of unfortunate rounding in avg_period and running_load_avg or just 4829 * because of unfortunate rounding in util_avg or just
4990 * after migrating tasks until the average stabilizes with the new running 4830 * after migrating tasks until the average stabilizes with the new running
4991 * time. So we need to check that the usage stays into the range 4831 * time. So we need to check that the usage stays into the range
4992 * [0..cpu_capacity_orig] and cap if necessary. 4832 * [0..cpu_capacity_orig] and cap if necessary.
@@ -4995,7 +4835,7 @@ done:
4995 */ 4835 */
4996static int get_cpu_usage(int cpu) 4836static int get_cpu_usage(int cpu)
4997{ 4837{
4998 unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg; 4838 unsigned long usage = cpu_rq(cpu)->cfs.avg.util_avg;
4999 unsigned long capacity = capacity_orig_of(cpu); 4839 unsigned long capacity = capacity_orig_of(cpu);
5000 4840
5001 if (usage >= SCHED_LOAD_SCALE) 4841 if (usage >= SCHED_LOAD_SCALE)
@@ -5021,17 +4861,17 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
5021{ 4861{
5022 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 4862 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
5023 int cpu = smp_processor_id(); 4863 int cpu = smp_processor_id();
5024 int new_cpu = cpu; 4864 int new_cpu = prev_cpu;
5025 int want_affine = 0; 4865 int want_affine = 0;
5026 int sync = wake_flags & WF_SYNC; 4866 int sync = wake_flags & WF_SYNC;
5027 4867
5028 if (sd_flag & SD_BALANCE_WAKE) 4868 if (sd_flag & SD_BALANCE_WAKE)
5029 want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); 4869 want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
5030 4870
5031 rcu_read_lock(); 4871 rcu_read_lock();
5032 for_each_domain(cpu, tmp) { 4872 for_each_domain(cpu, tmp) {
5033 if (!(tmp->flags & SD_LOAD_BALANCE)) 4873 if (!(tmp->flags & SD_LOAD_BALANCE))
5034 continue; 4874 break;
5035 4875
5036 /* 4876 /*
5037 * If both cpu and prev_cpu are part of this domain, 4877 * If both cpu and prev_cpu are part of this domain,
@@ -5045,17 +4885,21 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
5045 4885
5046 if (tmp->flags & sd_flag) 4886 if (tmp->flags & sd_flag)
5047 sd = tmp; 4887 sd = tmp;
4888 else if (!want_affine)
4889 break;
5048 } 4890 }
5049 4891
5050 if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync)) 4892 if (affine_sd) {
5051 prev_cpu = cpu; 4893 sd = NULL; /* Prefer wake_affine over balance flags */
5052 4894 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
5053 if (sd_flag & SD_BALANCE_WAKE) { 4895 new_cpu = cpu;
5054 new_cpu = select_idle_sibling(p, prev_cpu);
5055 goto unlock;
5056 } 4896 }
5057 4897
5058 while (sd) { 4898 if (!sd) {
4899 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
4900 new_cpu = select_idle_sibling(p, new_cpu);
4901
4902 } else while (sd) {
5059 struct sched_group *group; 4903 struct sched_group *group;
5060 int weight; 4904 int weight;
5061 4905
@@ -5089,7 +4933,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
5089 } 4933 }
5090 /* while loop will break here if sd == NULL */ 4934 /* while loop will break here if sd == NULL */
5091 } 4935 }
5092unlock:
5093 rcu_read_unlock(); 4936 rcu_read_unlock();
5094 4937
5095 return new_cpu; 4938 return new_cpu;
@@ -5101,26 +4944,27 @@ unlock:
5101 * previous cpu. However, the caller only guarantees p->pi_lock is held; no 4944 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
5102 * other assumptions, including the state of rq->lock, should be made. 4945 * other assumptions, including the state of rq->lock, should be made.
5103 */ 4946 */
5104static void 4947static void migrate_task_rq_fair(struct task_struct *p, int next_cpu)
5105migrate_task_rq_fair(struct task_struct *p, int next_cpu)
5106{ 4948{
5107 struct sched_entity *se = &p->se;
5108 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5109
5110 /* 4949 /*
5111 * Load tracking: accumulate removed load so that it can be processed 4950 * We are supposed to update the task to "current" time, then its up to date
5112 * when we next update owning cfs_rq under rq->lock. Tasks contribute 4951 * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
5113 * to blocked load iff they have a positive decay-count. It can never 4952 * what current time is, so simply throw away the out-of-date time. This
5114 * be negative here since on-rq tasks have decay-count == 0. 4953 * will result in the wakee task is less decayed, but giving the wakee more
4954 * load sounds not bad.
5115 */ 4955 */
5116 if (se->avg.decay_count) { 4956 remove_entity_load_avg(&p->se);
5117 se->avg.decay_count = -__synchronize_entity_decay(se); 4957
5118 atomic_long_add(se->avg.load_avg_contrib, 4958 /* Tell new CPU we are migrated */
5119 &cfs_rq->removed_load); 4959 p->se.avg.last_update_time = 0;
5120 }
5121 4960
5122 /* We have migrated, no longer consider this task hot */ 4961 /* We have migrated, no longer consider this task hot */
5123 se->exec_start = 0; 4962 p->se.exec_start = 0;
4963}
4964
4965static void task_dead_fair(struct task_struct *p)
4966{
4967 remove_entity_load_avg(&p->se);
5124} 4968}
5125#endif /* CONFIG_SMP */ 4969#endif /* CONFIG_SMP */
5126 4970
@@ -5670,72 +5514,39 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
5670 5514
5671#ifdef CONFIG_NUMA_BALANCING 5515#ifdef CONFIG_NUMA_BALANCING
5672/* 5516/*
5673 * Returns true if the destination node is the preferred node. 5517 * Returns 1, if task migration degrades locality
5674 * Needs to match fbq_classify_rq(): if there is a runnable task 5518 * Returns 0, if task migration improves locality i.e migration preferred.
5675 * that is not on its preferred node, we should identify it. 5519 * Returns -1, if task migration is not affected by locality.
5676 */ 5520 */
5677static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) 5521static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5678{ 5522{
5679 struct numa_group *numa_group = rcu_dereference(p->numa_group); 5523 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5680 unsigned long src_faults, dst_faults; 5524 unsigned long src_faults, dst_faults;
5681 int src_nid, dst_nid; 5525 int src_nid, dst_nid;
5682 5526
5683 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
5684 !(env->sd->flags & SD_NUMA)) {
5685 return false;
5686 }
5687
5688 src_nid = cpu_to_node(env->src_cpu);
5689 dst_nid = cpu_to_node(env->dst_cpu);
5690
5691 if (src_nid == dst_nid)
5692 return false;
5693
5694 /* Encourage migration to the preferred node. */
5695 if (dst_nid == p->numa_preferred_nid)
5696 return true;
5697
5698 /* Migrating away from the preferred node is bad. */
5699 if (src_nid == p->numa_preferred_nid)
5700 return false;
5701
5702 if (numa_group) {
5703 src_faults = group_faults(p, src_nid);
5704 dst_faults = group_faults(p, dst_nid);
5705 } else {
5706 src_faults = task_faults(p, src_nid);
5707 dst_faults = task_faults(p, dst_nid);
5708 }
5709
5710 return dst_faults > src_faults;
5711}
5712
5713
5714static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5715{
5716 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5717 unsigned long src_faults, dst_faults;
5718 int src_nid, dst_nid;
5719
5720 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
5721 return false;
5722
5723 if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) 5527 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
5724 return false; 5528 return -1;
5529
5530 if (!sched_feat(NUMA))
5531 return -1;
5725 5532
5726 src_nid = cpu_to_node(env->src_cpu); 5533 src_nid = cpu_to_node(env->src_cpu);
5727 dst_nid = cpu_to_node(env->dst_cpu); 5534 dst_nid = cpu_to_node(env->dst_cpu);
5728 5535
5729 if (src_nid == dst_nid) 5536 if (src_nid == dst_nid)
5730 return false; 5537 return -1;
5731 5538
5732 /* Migrating away from the preferred node is bad. */ 5539 /* Migrating away from the preferred node is always bad. */
5733 if (src_nid == p->numa_preferred_nid) 5540 if (src_nid == p->numa_preferred_nid) {
5734 return true; 5541 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
5542 return 1;
5543 else
5544 return -1;
5545 }
5735 5546
5736 /* Encourage migration to the preferred node. */ 5547 /* Encourage migration to the preferred node. */
5737 if (dst_nid == p->numa_preferred_nid) 5548 if (dst_nid == p->numa_preferred_nid)
5738 return false; 5549 return 0;
5739 5550
5740 if (numa_group) { 5551 if (numa_group) {
5741 src_faults = group_faults(p, src_nid); 5552 src_faults = group_faults(p, src_nid);
@@ -5749,16 +5560,10 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5749} 5560}
5750 5561
5751#else 5562#else
5752static inline bool migrate_improves_locality(struct task_struct *p, 5563static inline int migrate_degrades_locality(struct task_struct *p,
5753 struct lb_env *env) 5564 struct lb_env *env)
5754{ 5565{
5755 return false; 5566 return -1;
5756}
5757
5758static inline bool migrate_degrades_locality(struct task_struct *p,
5759 struct lb_env *env)
5760{
5761 return false;
5762} 5567}
5763#endif 5568#endif
5764 5569
@@ -5768,7 +5573,7 @@ static inline bool migrate_degrades_locality(struct task_struct *p,
5768static 5573static
5769int can_migrate_task(struct task_struct *p, struct lb_env *env) 5574int can_migrate_task(struct task_struct *p, struct lb_env *env)
5770{ 5575{
5771 int tsk_cache_hot = 0; 5576 int tsk_cache_hot;
5772 5577
5773 lockdep_assert_held(&env->src_rq->lock); 5578 lockdep_assert_held(&env->src_rq->lock);
5774 5579
@@ -5826,13 +5631,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
5826 * 2) task is cache cold, or 5631 * 2) task is cache cold, or
5827 * 3) too many balance attempts have failed. 5632 * 3) too many balance attempts have failed.
5828 */ 5633 */
5829 tsk_cache_hot = task_hot(p, env); 5634 tsk_cache_hot = migrate_degrades_locality(p, env);
5830 if (!tsk_cache_hot) 5635 if (tsk_cache_hot == -1)
5831 tsk_cache_hot = migrate_degrades_locality(p, env); 5636 tsk_cache_hot = task_hot(p, env);
5832 5637
5833 if (migrate_improves_locality(p, env) || !tsk_cache_hot || 5638 if (tsk_cache_hot <= 0 ||
5834 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 5639 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
5835 if (tsk_cache_hot) { 5640 if (tsk_cache_hot == 1) {
5836 schedstat_inc(env->sd, lb_hot_gained[env->idle]); 5641 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
5837 schedstat_inc(p, se.statistics.nr_forced_migrations); 5642 schedstat_inc(p, se.statistics.nr_forced_migrations);
5838 } 5643 }
@@ -5906,6 +5711,13 @@ static int detach_tasks(struct lb_env *env)
5906 return 0; 5711 return 0;
5907 5712
5908 while (!list_empty(tasks)) { 5713 while (!list_empty(tasks)) {
5714 /*
5715 * We don't want to steal all, otherwise we may be treated likewise,
5716 * which could at worst lead to a livelock crash.
5717 */
5718 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
5719 break;
5720
5909 p = list_first_entry(tasks, struct task_struct, se.group_node); 5721 p = list_first_entry(tasks, struct task_struct, se.group_node);
5910 5722
5911 env->loop++; 5723 env->loop++;
@@ -6015,39 +5827,6 @@ static void attach_tasks(struct lb_env *env)
6015} 5827}
6016 5828
6017#ifdef CONFIG_FAIR_GROUP_SCHED 5829#ifdef CONFIG_FAIR_GROUP_SCHED
6018/*
6019 * update tg->load_weight by folding this cpu's load_avg
6020 */
6021static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
6022{
6023 struct sched_entity *se = tg->se[cpu];
6024 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
6025
6026 /* throttled entities do not contribute to load */
6027 if (throttled_hierarchy(cfs_rq))
6028 return;
6029
6030 update_cfs_rq_blocked_load(cfs_rq, 1);
6031
6032 if (se) {
6033 update_entity_load_avg(se, 1);
6034 /*
6035 * We pivot on our runnable average having decayed to zero for
6036 * list removal. This generally implies that all our children
6037 * have also been removed (modulo rounding error or bandwidth
6038 * control); however, such cases are rare and we can fix these
6039 * at enqueue.
6040 *
6041 * TODO: fix up out-of-order children on enqueue.
6042 */
6043 if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
6044 list_del_leaf_cfs_rq(cfs_rq);
6045 } else {
6046 struct rq *rq = rq_of(cfs_rq);
6047 update_rq_runnable_avg(rq, rq->nr_running);
6048 }
6049}
6050
6051static void update_blocked_averages(int cpu) 5830static void update_blocked_averages(int cpu)
6052{ 5831{
6053 struct rq *rq = cpu_rq(cpu); 5832 struct rq *rq = cpu_rq(cpu);
@@ -6056,19 +5835,19 @@ static void update_blocked_averages(int cpu)
6056 5835
6057 raw_spin_lock_irqsave(&rq->lock, flags); 5836 raw_spin_lock_irqsave(&rq->lock, flags);
6058 update_rq_clock(rq); 5837 update_rq_clock(rq);
5838
6059 /* 5839 /*
6060 * Iterates the task_group tree in a bottom up fashion, see 5840 * Iterates the task_group tree in a bottom up fashion, see
6061 * list_add_leaf_cfs_rq() for details. 5841 * list_add_leaf_cfs_rq() for details.
6062 */ 5842 */
6063 for_each_leaf_cfs_rq(rq, cfs_rq) { 5843 for_each_leaf_cfs_rq(rq, cfs_rq) {
6064 /* 5844 /* throttled entities do not contribute to load */
6065 * Note: We may want to consider periodically releasing 5845 if (throttled_hierarchy(cfs_rq))
6066 * rq->lock about these updates so that creating many task 5846 continue;
6067 * groups does not result in continually extending hold time.
6068 */
6069 __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
6070 }
6071 5847
5848 if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
5849 update_tg_load_avg(cfs_rq, 0);
5850 }
6072 raw_spin_unlock_irqrestore(&rq->lock, flags); 5851 raw_spin_unlock_irqrestore(&rq->lock, flags);
6073} 5852}
6074 5853
@@ -6096,14 +5875,14 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
6096 } 5875 }
6097 5876
6098 if (!se) { 5877 if (!se) {
6099 cfs_rq->h_load = cfs_rq->runnable_load_avg; 5878 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
6100 cfs_rq->last_h_load_update = now; 5879 cfs_rq->last_h_load_update = now;
6101 } 5880 }
6102 5881
6103 while ((se = cfs_rq->h_load_next) != NULL) { 5882 while ((se = cfs_rq->h_load_next) != NULL) {
6104 load = cfs_rq->h_load; 5883 load = cfs_rq->h_load;
6105 load = div64_ul(load * se->avg.load_avg_contrib, 5884 load = div64_ul(load * se->avg.load_avg,
6106 cfs_rq->runnable_load_avg + 1); 5885 cfs_rq_load_avg(cfs_rq) + 1);
6107 cfs_rq = group_cfs_rq(se); 5886 cfs_rq = group_cfs_rq(se);
6108 cfs_rq->h_load = load; 5887 cfs_rq->h_load = load;
6109 cfs_rq->last_h_load_update = now; 5888 cfs_rq->last_h_load_update = now;
@@ -6115,17 +5894,25 @@ static unsigned long task_h_load(struct task_struct *p)
6115 struct cfs_rq *cfs_rq = task_cfs_rq(p); 5894 struct cfs_rq *cfs_rq = task_cfs_rq(p);
6116 5895
6117 update_cfs_rq_h_load(cfs_rq); 5896 update_cfs_rq_h_load(cfs_rq);
6118 return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, 5897 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
6119 cfs_rq->runnable_load_avg + 1); 5898 cfs_rq_load_avg(cfs_rq) + 1);
6120} 5899}
6121#else 5900#else
6122static inline void update_blocked_averages(int cpu) 5901static inline void update_blocked_averages(int cpu)
6123{ 5902{
5903 struct rq *rq = cpu_rq(cpu);
5904 struct cfs_rq *cfs_rq = &rq->cfs;
5905 unsigned long flags;
5906
5907 raw_spin_lock_irqsave(&rq->lock, flags);
5908 update_rq_clock(rq);
5909 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
5910 raw_spin_unlock_irqrestore(&rq->lock, flags);
6124} 5911}
6125 5912
6126static unsigned long task_h_load(struct task_struct *p) 5913static unsigned long task_h_load(struct task_struct *p)
6127{ 5914{
6128 return p->se.avg.load_avg_contrib; 5915 return p->se.avg.load_avg;
6129} 5916}
6130#endif 5917#endif
6131 5918
@@ -8025,8 +7812,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
8025 7812
8026 if (numabalancing_enabled) 7813 if (numabalancing_enabled)
8027 task_tick_numa(rq, curr); 7814 task_tick_numa(rq, curr);
8028
8029 update_rq_runnable_avg(rq, 1);
8030} 7815}
8031 7816
8032/* 7817/*
@@ -8125,15 +7910,18 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
8125 } 7910 }
8126 7911
8127#ifdef CONFIG_SMP 7912#ifdef CONFIG_SMP
8128 /* 7913 /* Catch up with the cfs_rq and remove our load when we leave */
8129 * Remove our load from contribution when we leave sched_fair 7914 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq), &se->avg,
8130 * and ensure we don't carry in an old decay_count if we 7915 se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL);
8131 * switch back. 7916
8132 */ 7917 cfs_rq->avg.load_avg =
8133 if (se->avg.decay_count) { 7918 max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
8134 __synchronize_entity_decay(se); 7919 cfs_rq->avg.load_sum =
8135 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); 7920 max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
8136 } 7921 cfs_rq->avg.util_avg =
7922 max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
7923 cfs_rq->avg.util_sum =
7924 max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
8137#endif 7925#endif
8138} 7926}
8139 7927
@@ -8142,16 +7930,31 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
8142 */ 7930 */
8143static void switched_to_fair(struct rq *rq, struct task_struct *p) 7931static void switched_to_fair(struct rq *rq, struct task_struct *p)
8144{ 7932{
8145#ifdef CONFIG_FAIR_GROUP_SCHED
8146 struct sched_entity *se = &p->se; 7933 struct sched_entity *se = &p->se;
7934
7935#ifdef CONFIG_FAIR_GROUP_SCHED
8147 /* 7936 /*
8148 * Since the real-depth could have been changed (only FAIR 7937 * Since the real-depth could have been changed (only FAIR
8149 * class maintain depth value), reset depth properly. 7938 * class maintain depth value), reset depth properly.
8150 */ 7939 */
8151 se->depth = se->parent ? se->parent->depth + 1 : 0; 7940 se->depth = se->parent ? se->parent->depth + 1 : 0;
8152#endif 7941#endif
8153 if (!task_on_rq_queued(p)) 7942
7943 if (!task_on_rq_queued(p)) {
7944
7945 /*
7946 * Ensure the task has a non-normalized vruntime when it is switched
7947 * back to the fair class with !queued, so that enqueue_entity() at
7948 * wake-up time will do the right thing.
7949 *
7950 * If it's queued, then the enqueue_entity(.flags=0) makes the task
7951 * has non-normalized vruntime, if it's !queued, then it still has
7952 * normalized vruntime.
7953 */
7954 if (p->state != TASK_RUNNING)
7955 se->vruntime += cfs_rq_of(se)->min_vruntime;
8154 return; 7956 return;
7957 }
8155 7958
8156 /* 7959 /*
8157 * We were most likely switched from sched_rt, so 7960 * We were most likely switched from sched_rt, so
@@ -8190,8 +7993,8 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
8190 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; 7993 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
8191#endif 7994#endif
8192#ifdef CONFIG_SMP 7995#ifdef CONFIG_SMP
8193 atomic64_set(&cfs_rq->decay_counter, 1); 7996 atomic_long_set(&cfs_rq->removed_load_avg, 0);
8194 atomic_long_set(&cfs_rq->removed_load, 0); 7997 atomic_long_set(&cfs_rq->removed_util_avg, 0);
8195#endif 7998#endif
8196} 7999}
8197 8000
@@ -8236,14 +8039,14 @@ static void task_move_group_fair(struct task_struct *p, int queued)
8236 if (!queued) { 8039 if (!queued) {
8237 cfs_rq = cfs_rq_of(se); 8040 cfs_rq = cfs_rq_of(se);
8238 se->vruntime += cfs_rq->min_vruntime; 8041 se->vruntime += cfs_rq->min_vruntime;
8042
8239#ifdef CONFIG_SMP 8043#ifdef CONFIG_SMP
8240 /* 8044 /* Virtually synchronize task with its new cfs_rq */
8241 * migrate_task_rq_fair() will have removed our previous 8045 p->se.avg.last_update_time = cfs_rq->avg.last_update_time;
8242 * contribution, but we must synchronize for ongoing future 8046 cfs_rq->avg.load_avg += p->se.avg.load_avg;
8243 * decay. 8047 cfs_rq->avg.load_sum += p->se.avg.load_sum;
8244 */ 8048 cfs_rq->avg.util_avg += p->se.avg.util_avg;
8245 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); 8049 cfs_rq->avg.util_sum += p->se.avg.util_sum;
8246 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
8247#endif 8050#endif
8248 } 8051 }
8249} 8052}
@@ -8257,8 +8060,11 @@ void free_fair_sched_group(struct task_group *tg)
8257 for_each_possible_cpu(i) { 8060 for_each_possible_cpu(i) {
8258 if (tg->cfs_rq) 8061 if (tg->cfs_rq)
8259 kfree(tg->cfs_rq[i]); 8062 kfree(tg->cfs_rq[i]);
8260 if (tg->se) 8063 if (tg->se) {
8064 if (tg->se[i])
8065 remove_entity_load_avg(tg->se[i]);
8261 kfree(tg->se[i]); 8066 kfree(tg->se[i]);
8067 }
8262 } 8068 }
8263 8069
8264 kfree(tg->cfs_rq); 8070 kfree(tg->cfs_rq);
@@ -8295,6 +8101,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8295 8101
8296 init_cfs_rq(cfs_rq); 8102 init_cfs_rq(cfs_rq);
8297 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); 8103 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8104 init_entity_runnable_average(se);
8298 } 8105 }
8299 8106
8300 return 1; 8107 return 1;
@@ -8444,6 +8251,8 @@ const struct sched_class fair_sched_class = {
8444 .rq_offline = rq_offline_fair, 8251 .rq_offline = rq_offline_fair,
8445 8252
8446 .task_waking = task_waking_fair, 8253 .task_waking = task_waking_fair,
8254 .task_dead = task_dead_fair,
8255 .set_cpus_allowed = set_cpus_allowed_common,
8447#endif 8256#endif
8448 8257
8449 .set_curr_task = set_curr_task_fair, 8258 .set_curr_task = set_curr_task_fair,
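The fair.c hunks above retire the old decay_count/blocked-load bookkeeping in favour of the rewritten per-entity load tracking, where each entity and each cfs_rq carries a sched_avg whose load/util sums decay geometrically and can simply be added to or subtracted from the queue totals (see remove_entity_load_avg(), update_cfs_rq_load_avg() and the switched_from_fair()/task_move_group_fair() hunks). A minimal userspace sketch of that geometric-decay idea follows; it is only an illustration of the shape of the computation, not the kernel's fixed-point __update_load_avg(), and the period length, weight and half-life constant are assumptions.

#include <math.h>
#include <stdio.h>

/*
 * Toy per-entity load average: each fixed period the accumulated sum is
 * scaled by y, chosen so that y^32 == 0.5 (half-life of 32 periods), and
 * the weight of the current period is added when the entity was runnable.
 */
struct toy_avg {
	double load_sum;	/* decayed sum of per-period contributions */
	double load_avg;	/* load_sum normalised to the series maximum */
};

static const double half_life_periods = 32.0;

static void toy_update(struct toy_avg *a, unsigned int periods,
		       double weight, int runnable)
{
	double y = pow(0.5, 1.0 / half_life_periods);
	double max_sum = 1.0 / (1.0 - y);	/* limit of the geometric series */

	/* Decay what was already accumulated over the elapsed periods. */
	a->load_sum *= pow(y, periods);

	/* Add this period's contribution if the entity was runnable. */
	if (runnable)
		a->load_sum += weight;

	a->load_avg = a->load_sum / max_sum;
}

int main(void)
{
	struct toy_avg a = { 0.0, 0.0 };
	int i;

	for (i = 0; i < 64; i++)		/* run for 64 periods */
		toy_update(&a, 1, 1.0, 1);
	printf("after running:  load_avg = %.3f\n", a.load_avg);

	for (i = 0; i < 64; i++)		/* then sleep for 64 periods */
		toy_update(&a, 1, 1.0, 0);
	printf("after sleeping: load_avg = %.3f\n", a.load_avg);
	return 0;
}

Migrating or removing an entity then reduces to subtracting its load_avg/load_sum from the owning queue, which is what the max_t() clamping in switched_from_fair() above is doing.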
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 91e33cd485f6..83a50e7ca533 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -79,20 +79,12 @@ SCHED_FEAT(LB_MIN, false)
79 * numa_balancing= 79 * numa_balancing=
80 */ 80 */
81#ifdef CONFIG_NUMA_BALANCING 81#ifdef CONFIG_NUMA_BALANCING
82SCHED_FEAT(NUMA, false)
83 82
84/* 83/*
85 * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a 84 * NUMA will favor moving tasks towards nodes where a higher number of
86 * higher number of hinting faults are recorded during active load 85 * hinting faults are recorded during active load balancing. It will
87 * balancing. 86 * resist moving tasks towards nodes where a lower number of hinting
87 * faults have been recorded.
88 */ 88 */
89SCHED_FEAT(NUMA_FAVOUR_HIGHER, true) 89SCHED_FEAT(NUMA, true)
90
91/*
92 * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a
93 * lower number of hinting faults have been recorded. As this has
94 * the potential to prevent a task ever migrating to a new node
95 * due to CPU overload it is disabled by default.
96 */
97SCHED_FEAT(NUMA_RESIST_LOWER, false)
98#endif 90#endif
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 594275ed2620..8f177c73ae19 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -83,10 +83,13 @@ void __weak arch_cpu_idle(void)
83 */ 83 */
84void default_idle_call(void) 84void default_idle_call(void)
85{ 85{
86 if (current_clr_polling_and_test()) 86 if (current_clr_polling_and_test()) {
87 local_irq_enable(); 87 local_irq_enable();
88 else 88 } else {
89 stop_critical_timings();
89 arch_cpu_idle(); 90 arch_cpu_idle();
91 start_critical_timings();
92 }
90} 93}
91 94
92static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, 95static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
@@ -141,12 +144,6 @@ static void cpuidle_idle_call(void)
141 } 144 }
142 145
143 /* 146 /*
144 * During the idle period, stop measuring the disabled irqs
145 * critical sections latencies
146 */
147 stop_critical_timings();
148
149 /*
150 * Tell the RCU framework we are entering an idle section, 147 * Tell the RCU framework we are entering an idle section,
151 * so no more rcu read side critical sections and one more 148 * so no more rcu read side critical sections and one more
152 * step to the grace period 149 * step to the grace period
@@ -198,7 +195,6 @@ exit_idle:
198 local_irq_enable(); 195 local_irq_enable();
199 196
200 rcu_idle_exit(); 197 rcu_idle_exit();
201 start_critical_timings();
202} 198}
203 199
204DEFINE_PER_CPU(bool, cpu_dead_idle); 200DEFINE_PER_CPU(bool, cpu_dead_idle);
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index c65dac8c97cd..c4ae0f1fdf9b 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -96,6 +96,7 @@ const struct sched_class idle_sched_class = {
96 96
97#ifdef CONFIG_SMP 97#ifdef CONFIG_SMP
98 .select_task_rq = select_task_rq_idle, 98 .select_task_rq = select_task_rq_idle,
99 .set_cpus_allowed = set_cpus_allowed_common,
99#endif 100#endif
100 101
101 .set_curr_task = set_curr_task_idle, 102 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 0d193a243e96..d2ea59364a1c 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2069,7 +2069,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
2069{ 2069{
2070 if (!task_running(rq, p) && 2070 if (!task_running(rq, p) &&
2071 !test_tsk_need_resched(rq->curr) && 2071 !test_tsk_need_resched(rq->curr) &&
2072 has_pushable_tasks(rq) &&
2073 p->nr_cpus_allowed > 1 && 2072 p->nr_cpus_allowed > 1 &&
2074 (dl_task(rq->curr) || rt_task(rq->curr)) && 2073 (dl_task(rq->curr) || rt_task(rq->curr)) &&
2075 (rq->curr->nr_cpus_allowed < 2 || 2074 (rq->curr->nr_cpus_allowed < 2 ||
@@ -2077,45 +2076,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
2077 push_rt_tasks(rq); 2076 push_rt_tasks(rq);
2078} 2077}
2079 2078
2080static void set_cpus_allowed_rt(struct task_struct *p,
2081 const struct cpumask *new_mask)
2082{
2083 struct rq *rq;
2084 int weight;
2085
2086 BUG_ON(!rt_task(p));
2087
2088 if (!task_on_rq_queued(p))
2089 return;
2090
2091 weight = cpumask_weight(new_mask);
2092
2093 /*
2094 * Only update if the process changes its state from whether it
2095 * can migrate or not.
2096 */
2097 if ((p->nr_cpus_allowed > 1) == (weight > 1))
2098 return;
2099
2100 rq = task_rq(p);
2101
2102 /*
2103 * The process used to be able to migrate OR it can now migrate
2104 */
2105 if (weight <= 1) {
2106 if (!task_current(rq, p))
2107 dequeue_pushable_task(rq, p);
2108 BUG_ON(!rq->rt.rt_nr_migratory);
2109 rq->rt.rt_nr_migratory--;
2110 } else {
2111 if (!task_current(rq, p))
2112 enqueue_pushable_task(rq, p);
2113 rq->rt.rt_nr_migratory++;
2114 }
2115
2116 update_rt_migration(&rq->rt);
2117}
2118
2119/* Assumes rq->lock is held */ 2079/* Assumes rq->lock is held */
2120static void rq_online_rt(struct rq *rq) 2080static void rq_online_rt(struct rq *rq)
2121{ 2081{
@@ -2324,7 +2284,7 @@ const struct sched_class rt_sched_class = {
2324#ifdef CONFIG_SMP 2284#ifdef CONFIG_SMP
2325 .select_task_rq = select_task_rq_rt, 2285 .select_task_rq = select_task_rq_rt,
2326 2286
2327 .set_cpus_allowed = set_cpus_allowed_rt, 2287 .set_cpus_allowed = set_cpus_allowed_common,
2328 .rq_online = rq_online_rt, 2288 .rq_online = rq_online_rt,
2329 .rq_offline = rq_offline_rt, 2289 .rq_offline = rq_offline_rt,
2330 .task_woken = task_woken_rt, 2290 .task_woken = task_woken_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 84d48790bb6d..68cda117574c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -245,7 +245,6 @@ struct task_group {
245 245
246#ifdef CONFIG_SMP 246#ifdef CONFIG_SMP
247 atomic_long_t load_avg; 247 atomic_long_t load_avg;
248 atomic_t runnable_avg;
249#endif 248#endif
250#endif 249#endif
251 250
@@ -366,27 +365,20 @@ struct cfs_rq {
366 365
367#ifdef CONFIG_SMP 366#ifdef CONFIG_SMP
368 /* 367 /*
369 * CFS Load tracking 368 * CFS load tracking
370 * Under CFS, load is tracked on a per-entity basis and aggregated up.
371 * This allows for the description of both thread and group usage (in
372 * the FAIR_GROUP_SCHED case).
373 * runnable_load_avg is the sum of the load_avg_contrib of the
374 * sched_entities on the rq.
375 * blocked_load_avg is similar to runnable_load_avg except that its
376 * the blocked sched_entities on the rq.
377 * utilization_load_avg is the sum of the average running time of the
378 * sched_entities on the rq.
379 */ 369 */
380 unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg; 370 struct sched_avg avg;
381 atomic64_t decay_counter; 371 u64 runnable_load_sum;
382 u64 last_decay; 372 unsigned long runnable_load_avg;
383 atomic_long_t removed_load;
384
385#ifdef CONFIG_FAIR_GROUP_SCHED 373#ifdef CONFIG_FAIR_GROUP_SCHED
386 /* Required to track per-cpu representation of a task_group */ 374 unsigned long tg_load_avg_contrib;
387 u32 tg_runnable_contrib; 375#endif
388 unsigned long tg_load_contrib; 376 atomic_long_t removed_load_avg, removed_util_avg;
377#ifndef CONFIG_64BIT
378 u64 load_last_update_time_copy;
379#endif
389 380
381#ifdef CONFIG_FAIR_GROUP_SCHED
390 /* 382 /*
391 * h_load = weight * f(tg) 383 * h_load = weight * f(tg)
392 * 384 *
@@ -595,8 +587,6 @@ struct rq {
595#ifdef CONFIG_FAIR_GROUP_SCHED 587#ifdef CONFIG_FAIR_GROUP_SCHED
596 /* list of leaf cfs_rq on this cpu: */ 588 /* list of leaf cfs_rq on this cpu: */
597 struct list_head leaf_cfs_rq_list; 589 struct list_head leaf_cfs_rq_list;
598
599 struct sched_avg avg;
600#endif /* CONFIG_FAIR_GROUP_SCHED */ 590#endif /* CONFIG_FAIR_GROUP_SCHED */
601 591
602 /* 592 /*
@@ -1065,9 +1055,6 @@ static inline int task_on_rq_migrating(struct task_struct *p)
1065#ifndef prepare_arch_switch 1055#ifndef prepare_arch_switch
1066# define prepare_arch_switch(next) do { } while (0) 1056# define prepare_arch_switch(next) do { } while (0)
1067#endif 1057#endif
1068#ifndef finish_arch_switch
1069# define finish_arch_switch(prev) do { } while (0)
1070#endif
1071#ifndef finish_arch_post_lock_switch 1058#ifndef finish_arch_post_lock_switch
1072# define finish_arch_post_lock_switch() do { } while (0) 1059# define finish_arch_post_lock_switch() do { } while (0)
1073#endif 1060#endif
@@ -1268,6 +1255,8 @@ extern void trigger_load_balance(struct rq *rq);
1268extern void idle_enter_fair(struct rq *this_rq); 1255extern void idle_enter_fair(struct rq *this_rq);
1269extern void idle_exit_fair(struct rq *this_rq); 1256extern void idle_exit_fair(struct rq *this_rq);
1270 1257
1258extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
1259
1271#else 1260#else
1272 1261
1273static inline void idle_enter_fair(struct rq *rq) { } 1262static inline void idle_enter_fair(struct rq *rq) { }
@@ -1319,7 +1308,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
1319 1308
1320unsigned long to_ratio(u64 period, u64 runtime); 1309unsigned long to_ratio(u64 period, u64 runtime);
1321 1310
1322extern void init_task_runnable_average(struct task_struct *p); 1311extern void init_entity_runnable_average(struct sched_entity *se);
1323 1312
1324static inline void add_nr_running(struct rq *rq, unsigned count) 1313static inline void add_nr_running(struct rq *rq, unsigned count)
1325{ 1314{
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 79ffec45a6ac..cbc67da10954 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -123,6 +123,7 @@ const struct sched_class stop_sched_class = {
123 123
124#ifdef CONFIG_SMP 124#ifdef CONFIG_SMP
125 .select_task_rq = select_task_rq_stop, 125 .select_task_rq = select_task_rq_stop,
126 .set_cpus_allowed = set_cpus_allowed_common,
126#endif 127#endif
127 128
128 .set_curr_task = set_curr_task_stop, 129 .set_curr_task = set_curr_task_stop,
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 052e02672d12..272d9322bc5d 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -106,9 +106,10 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
106} 106}
107EXPORT_SYMBOL_GPL(__wake_up_locked); 107EXPORT_SYMBOL_GPL(__wake_up_locked);
108 108
109void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 109void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr,
110 void *key)
110{ 111{
111 __wake_up_common(q, mode, 1, 0, key); 112 __wake_up_common(q, mode, nr, 0, key);
112} 113}
113EXPORT_SYMBOL_GPL(__wake_up_locked_key); 114EXPORT_SYMBOL_GPL(__wake_up_locked_key);
114 115
@@ -283,7 +284,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
283 if (!list_empty(&wait->task_list)) 284 if (!list_empty(&wait->task_list))
284 list_del_init(&wait->task_list); 285 list_del_init(&wait->task_list);
285 else if (waitqueue_active(q)) 286 else if (waitqueue_active(q))
286 __wake_up_locked_key(q, mode, key); 287 __wake_up_locked_key(q, mode, 1, key);
287 spin_unlock_irqrestore(&q->lock, flags); 288 spin_unlock_irqrestore(&q->lock, flags);
288} 289}
289EXPORT_SYMBOL(abort_exclusive_wait); 290EXPORT_SYMBOL(abort_exclusive_wait);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 245df6b32b81..5bd4779282df 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -175,17 +175,16 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
175 */ 175 */
176static u32 seccomp_run_filters(struct seccomp_data *sd) 176static u32 seccomp_run_filters(struct seccomp_data *sd)
177{ 177{
178 struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);
179 struct seccomp_data sd_local; 178 struct seccomp_data sd_local;
180 u32 ret = SECCOMP_RET_ALLOW; 179 u32 ret = SECCOMP_RET_ALLOW;
180 /* Make sure cross-thread synced filter points somewhere sane. */
181 struct seccomp_filter *f =
182 lockless_dereference(current->seccomp.filter);
181 183
182 /* Ensure unexpected behavior doesn't result in failing open. */ 184 /* Ensure unexpected behavior doesn't result in failing open. */
183 if (unlikely(WARN_ON(f == NULL))) 185 if (unlikely(WARN_ON(f == NULL)))
184 return SECCOMP_RET_KILL; 186 return SECCOMP_RET_KILL;
185 187
186 /* Make sure cross-thread synced filter points somewhere sane. */
187 smp_read_barrier_depends();
188
189 if (!sd) { 188 if (!sd) {
190 populate_seccomp_data(&sd_local); 189 populate_seccomp_data(&sd_local);
191 sd = &sd_local; 190 sd = &sd_local;
@@ -549,7 +548,11 @@ void secure_computing_strict(int this_syscall)
549{ 548{
550 int mode = current->seccomp.mode; 549 int mode = current->seccomp.mode;
551 550
552 if (mode == 0) 551 if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
552 unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
553 return;
554
555 if (mode == SECCOMP_MODE_DISABLED)
553 return; 556 return;
554 else if (mode == SECCOMP_MODE_STRICT) 557 else if (mode == SECCOMP_MODE_STRICT)
555 __secure_computing_strict(this_syscall); 558 __secure_computing_strict(this_syscall);
@@ -650,6 +653,10 @@ u32 seccomp_phase1(struct seccomp_data *sd)
650 int this_syscall = sd ? sd->nr : 653 int this_syscall = sd ? sd->nr :
651 syscall_get_nr(current, task_pt_regs(current)); 654 syscall_get_nr(current, task_pt_regs(current));
652 655
656 if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
657 unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
658 return SECCOMP_PHASE1_OK;
659
653 switch (mode) { 660 switch (mode) {
654 case SECCOMP_MODE_STRICT: 661 case SECCOMP_MODE_STRICT:
655 __secure_computing_strict(this_syscall); /* may call do_exit */ 662 __secure_computing_strict(this_syscall); /* may call do_exit */
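The seccomp_run_filters() hunk replaces an ACCESS_ONCE() load of the filter pointer plus an explicit smp_read_barrier_depends() with a single lockless_dereference(). In portable C11 terms the same "load the shared pointer once, with enough ordering to safely dereference what it points to" pattern is a consume/acquire-ordered atomic load; the userspace sketch below is only an analogy, and the struct and field names are invented for illustration.

#include <stdatomic.h>
#include <stdio.h>

struct filter {
	int ret_action;
};

/* Published pointer; a writer installs it with a release store. */
static _Atomic(struct filter *) current_filter;

static int run_filters(void)
{
	/*
	 * One ordered load of the pointer; anything reached through it is
	 * at least as new as the pointer itself.  This is the role the
	 * single lockless_dereference() plays in the hunk above.
	 */
	struct filter *f = atomic_load_explicit(&current_filter,
						memory_order_consume);

	if (!f)
		return -1;	/* fail closed, in the spirit of RET_KILL */
	return f->ret_action;
}

int main(void)
{
	static struct filter f = { .ret_action = 0 };

	atomic_store_explicit(&current_filter, &f, memory_order_release);
	printf("filter returned %d\n", run_filters());
	return 0;
}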
diff --git a/kernel/signal.c b/kernel/signal.c
index 836df8dac6cc..0f6bbbe77b46 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2748,12 +2748,15 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
2748 * Other callers might not initialize the si_lsb field, 2748 * Other callers might not initialize the si_lsb field,
2749 * so check explicitly for the right codes here. 2749 * so check explicitly for the right codes here.
2750 */ 2750 */
2751 if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) 2751 if (from->si_signo == SIGBUS &&
2752 (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO))
2752 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); 2753 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
2753#endif 2754#endif
2754#ifdef SEGV_BNDERR 2755#ifdef SEGV_BNDERR
2755 err |= __put_user(from->si_lower, &to->si_lower); 2756 if (from->si_signo == SIGSEGV && from->si_code == SEGV_BNDERR) {
2756 err |= __put_user(from->si_upper, &to->si_upper); 2757 err |= __put_user(from->si_lower, &to->si_lower);
2758 err |= __put_user(from->si_upper, &to->si_upper);
2759 }
2757#endif 2760#endif
2758 break; 2761 break;
2759 case __SI_CHLD: 2762 case __SI_CHLD:
@@ -3017,7 +3020,7 @@ COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo,
3017 int, sig, 3020 int, sig,
3018 struct compat_siginfo __user *, uinfo) 3021 struct compat_siginfo __user *, uinfo)
3019{ 3022{
3020 siginfo_t info; 3023 siginfo_t info = {};
3021 int ret = copy_siginfo_from_user32(&info, uinfo); 3024 int ret = copy_siginfo_from_user32(&info, uinfo);
3022 if (unlikely(ret)) 3025 if (unlikely(ret))
3023 return ret; 3026 return ret;
@@ -3061,7 +3064,7 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
3061 int, sig, 3064 int, sig,
3062 struct compat_siginfo __user *, uinfo) 3065 struct compat_siginfo __user *, uinfo)
3063{ 3066{
3064 siginfo_t info; 3067 siginfo_t info = {};
3065 3068
3066 if (copy_siginfo_from_user32(&info, uinfo)) 3069 if (copy_siginfo_from_user32(&info, uinfo))
3067 return -EFAULT; 3070 return -EFAULT;
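The two compat syscall hunks above initialise the local siginfo_t with "= {}" before copy_siginfo_from_user32() fills it, so fields the 32-bit layout does not cover are not left holding stale kernel stack bytes that could later be copied back out. The underlying pattern is plain aggregate zero-initialisation; the struct and helper below are made up purely to show the difference.

#include <stdio.h>
#include <string.h>

struct info {
	int signo;
	int code;
	unsigned long field_not_filled_by_compat_copy;
};

/* Stand-in for copy_siginfo_from_user32(): fills only the first two fields. */
static void partial_fill(struct info *dst)
{
	dst->signo = 11;
	dst->code = 1;
}

int main(void)
{
	struct info leaky;
	struct info safe = {0};			/* every byte zeroed up front */

	memset(&leaky, 0xAA, sizeof(leaky));	/* simulate stale stack data */
	partial_fill(&leaky);
	partial_fill(&safe);

	printf("leaky field: %lx\n", leaky.field_not_filled_by_compat_copy);
	printf("safe  field: %lx\n", safe.field_not_filled_by_compat_copy);
	return 0;
}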
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 7c434c39f02a..a818cbc73e14 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -113,7 +113,8 @@ static int smpboot_thread_fn(void *data)
113 if (kthread_should_stop()) { 113 if (kthread_should_stop()) {
114 __set_current_state(TASK_RUNNING); 114 __set_current_state(TASK_RUNNING);
115 preempt_enable(); 115 preempt_enable();
116 if (ht->cleanup) 116 /* cleanup must mirror setup */
117 if (ht->cleanup && td->status != HP_THREAD_NONE)
117 ht->cleanup(td->cpu, cpu_online(td->cpu)); 118 ht->cleanup(td->cpu, cpu_online(td->cpu));
118 kfree(td); 119 kfree(td);
119 return 0; 120 return 0;
@@ -259,15 +260,6 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
259{ 260{
260 unsigned int cpu; 261 unsigned int cpu;
261 262
262 /* Unpark any threads that were voluntarily parked. */
263 for_each_cpu_not(cpu, ht->cpumask) {
264 if (cpu_online(cpu)) {
265 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
266 if (tsk)
267 kthread_unpark(tsk);
268 }
269 }
270
271 /* We need to destroy also the parked threads of offline cpus */ 263 /* We need to destroy also the parked threads of offline cpus */
272 for_each_possible_cpu(cpu) { 264 for_each_possible_cpu(cpu) {
273 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); 265 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
@@ -281,19 +273,22 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
281} 273}
282 274
283/** 275/**
284 * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug 276 * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related
277 * to hotplug
285 * @plug_thread: Hotplug thread descriptor 278 * @plug_thread: Hotplug thread descriptor
279 * @cpumask: The cpumask where threads run
286 * 280 *
287 * Creates and starts the threads on all online cpus. 281 * Creates and starts the threads on all online cpus.
288 */ 282 */
289int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) 283int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
284 const struct cpumask *cpumask)
290{ 285{
291 unsigned int cpu; 286 unsigned int cpu;
292 int ret = 0; 287 int ret = 0;
293 288
294 if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL)) 289 if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
295 return -ENOMEM; 290 return -ENOMEM;
296 cpumask_copy(plug_thread->cpumask, cpu_possible_mask); 291 cpumask_copy(plug_thread->cpumask, cpumask);
297 292
298 get_online_cpus(); 293 get_online_cpus();
299 mutex_lock(&smpboot_threads_lock); 294 mutex_lock(&smpboot_threads_lock);
@@ -301,9 +296,11 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
301 ret = __smpboot_create_thread(plug_thread, cpu); 296 ret = __smpboot_create_thread(plug_thread, cpu);
302 if (ret) { 297 if (ret) {
303 smpboot_destroy_threads(plug_thread); 298 smpboot_destroy_threads(plug_thread);
299 free_cpumask_var(plug_thread->cpumask);
304 goto out; 300 goto out;
305 } 301 }
306 smpboot_unpark_thread(plug_thread, cpu); 302 if (cpumask_test_cpu(cpu, cpumask))
303 smpboot_unpark_thread(plug_thread, cpu);
307 } 304 }
308 list_add(&plug_thread->list, &hotplug_threads); 305 list_add(&plug_thread->list, &hotplug_threads);
309out: 306out:
@@ -311,7 +308,7 @@ out:
311 put_online_cpus(); 308 put_online_cpus();
312 return ret; 309 return ret;
313} 310}
314EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); 311EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask);
315 312
316/** 313/**
317 * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug 314 * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index fd643d8c4b42..12484e5d5c88 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -35,13 +35,16 @@ struct cpu_stop_done {
35 35
36/* the actual stopper, one per every possible cpu, enabled on online cpus */ 36/* the actual stopper, one per every possible cpu, enabled on online cpus */
37struct cpu_stopper { 37struct cpu_stopper {
38 struct task_struct *thread;
39
38 spinlock_t lock; 40 spinlock_t lock;
39 bool enabled; /* is this stopper enabled? */ 41 bool enabled; /* is this stopper enabled? */
40 struct list_head works; /* list of pending works */ 42 struct list_head works; /* list of pending works */
43
44 struct cpu_stop_work stop_work; /* for stop_cpus */
41}; 45};
42 46
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); 47static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
44static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
45static bool stop_machine_initialized = false; 48static bool stop_machine_initialized = false;
46 49
47/* 50/*
@@ -74,7 +77,6 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
74static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) 77static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
75{ 78{
76 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 79 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
77 struct task_struct *p = per_cpu(cpu_stopper_task, cpu);
78 80
79 unsigned long flags; 81 unsigned long flags;
80 82
@@ -82,7 +84,7 @@ static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
82 84
83 if (stopper->enabled) { 85 if (stopper->enabled) {
84 list_add_tail(&work->list, &stopper->works); 86 list_add_tail(&work->list, &stopper->works);
85 wake_up_process(p); 87 wake_up_process(stopper->thread);
86 } else 88 } else
87 cpu_stop_signal_done(work->done, false); 89 cpu_stop_signal_done(work->done, false);
88 90
@@ -139,7 +141,7 @@ enum multi_stop_state {
139}; 141};
140 142
141struct multi_stop_data { 143struct multi_stop_data {
142 int (*fn)(void *); 144 cpu_stop_fn_t fn;
143 void *data; 145 void *data;
144 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ 146 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
145 unsigned int num_threads; 147 unsigned int num_threads;
@@ -293,7 +295,6 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
293 295
294/* static data for stop_cpus */ 296/* static data for stop_cpus */
295static DEFINE_MUTEX(stop_cpus_mutex); 297static DEFINE_MUTEX(stop_cpus_mutex);
296static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
297 298
298static void queue_stop_cpus_work(const struct cpumask *cpumask, 299static void queue_stop_cpus_work(const struct cpumask *cpumask,
299 cpu_stop_fn_t fn, void *arg, 300 cpu_stop_fn_t fn, void *arg,
@@ -302,22 +303,19 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
302 struct cpu_stop_work *work; 303 struct cpu_stop_work *work;
303 unsigned int cpu; 304 unsigned int cpu;
304 305
305 /* initialize works and done */
306 for_each_cpu(cpu, cpumask) {
307 work = &per_cpu(stop_cpus_work, cpu);
308 work->fn = fn;
309 work->arg = arg;
310 work->done = done;
311 }
312
313 /* 306 /*
314 * Disable preemption while queueing to avoid getting 307 * Disable preemption while queueing to avoid getting
315 * preempted by a stopper which might wait for other stoppers 308 * preempted by a stopper which might wait for other stoppers
316 * to enter @fn which can lead to deadlock. 309 * to enter @fn which can lead to deadlock.
317 */ 310 */
318 lg_global_lock(&stop_cpus_lock); 311 lg_global_lock(&stop_cpus_lock);
319 for_each_cpu(cpu, cpumask) 312 for_each_cpu(cpu, cpumask) {
320 cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu)); 313 work = &per_cpu(cpu_stopper.stop_work, cpu);
314 work->fn = fn;
315 work->arg = arg;
316 work->done = done;
317 cpu_stop_queue_work(cpu, work);
318 }
321 lg_global_unlock(&stop_cpus_lock); 319 lg_global_unlock(&stop_cpus_lock);
322} 320}
323 321
@@ -458,19 +456,21 @@ extern void sched_set_stop_task(int cpu, struct task_struct *stop);
458 456
459static void cpu_stop_create(unsigned int cpu) 457static void cpu_stop_create(unsigned int cpu)
460{ 458{
461 sched_set_stop_task(cpu, per_cpu(cpu_stopper_task, cpu)); 459 sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu));
462} 460}
463 461
464static void cpu_stop_park(unsigned int cpu) 462static void cpu_stop_park(unsigned int cpu)
465{ 463{
466 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 464 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
467 struct cpu_stop_work *work; 465 struct cpu_stop_work *work, *tmp;
468 unsigned long flags; 466 unsigned long flags;
469 467
470 /* drain remaining works */ 468 /* drain remaining works */
471 spin_lock_irqsave(&stopper->lock, flags); 469 spin_lock_irqsave(&stopper->lock, flags);
472 list_for_each_entry(work, &stopper->works, list) 470 list_for_each_entry_safe(work, tmp, &stopper->works, list) {
471 list_del_init(&work->list);
473 cpu_stop_signal_done(work->done, false); 472 cpu_stop_signal_done(work->done, false);
473 }
474 stopper->enabled = false; 474 stopper->enabled = false;
475 spin_unlock_irqrestore(&stopper->lock, flags); 475 spin_unlock_irqrestore(&stopper->lock, flags);
476} 476}
@@ -485,7 +485,7 @@ static void cpu_stop_unpark(unsigned int cpu)
485} 485}
486 486
487static struct smp_hotplug_thread cpu_stop_threads = { 487static struct smp_hotplug_thread cpu_stop_threads = {
488 .store = &cpu_stopper_task, 488 .store = &cpu_stopper.thread,
489 .thread_should_run = cpu_stop_should_run, 489 .thread_should_run = cpu_stop_should_run,
490 .thread_fn = cpu_stopper_thread, 490 .thread_fn = cpu_stopper_thread,
491 .thread_comm = "migration/%u", 491 .thread_comm = "migration/%u",
@@ -515,7 +515,7 @@ early_initcall(cpu_stop_init);
515 515
516#ifdef CONFIG_STOP_MACHINE 516#ifdef CONFIG_STOP_MACHINE
517 517
518int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 518static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
519{ 519{
520 struct multi_stop_data msdata = { 520 struct multi_stop_data msdata = {
521 .fn = fn, 521 .fn = fn,
@@ -548,7 +548,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
548 return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata); 548 return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
549} 549}
550 550
551int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 551int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
552{ 552{
553 int ret; 553 int ret;
554 554
@@ -582,7 +582,7 @@ EXPORT_SYMBOL_GPL(stop_machine);
582 * 0 if all executions of @fn returned 0, any non zero return value if any 582 * 0 if all executions of @fn returned 0, any non zero return value if any
583 * returned non zero. 583 * returned non zero.
584 */ 584 */
585int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, 585int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
586 const struct cpumask *cpus) 586 const struct cpumask *cpus)
587{ 587{
588 struct multi_stop_data msdata = { .fn = fn, .data = data, 588 struct multi_stop_data msdata = { .fn = fn, .data = data,
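The stop_machine.c hunks fold the separate per-CPU cpu_stopper_task and stop_cpus_work variables into the cpu_stopper structure itself, so each stopper owns its thread handle and its stop_cpus work slot in one object. Outside the kernel the same consolidation is just "array of structs instead of parallel arrays"; a small sketch with invented names:

#include <stdio.h>

#define NCPUS 4

struct stop_work {
	int (*fn)(void *);
	void *arg;
};

/* Before: parallel per-CPU arrays kept in sync by hand.          */
/*   int              thread_id[NCPUS];                            */
/*   struct stop_work stop_work[NCPUS];                            */
/* After: one per-CPU object owning both pieces of state.          */
struct stopper {
	int thread_id;
	struct stop_work stop_work;
};

static struct stopper stoppers[NCPUS];

static int say_hello(void *arg)
{
	printf("stop work on cpu %d\n", *(int *)arg);
	return 0;
}

int main(void)
{
	static int cpus[NCPUS] = { 0, 1, 2, 3 };
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++) {
		struct stopper *st = &stoppers[cpu];

		st->thread_id = cpu;		/* stands in for the kthread */
		st->stop_work.fn = say_hello;	/* queue work in its owner   */
		st->stop_work.arg = &cpus[cpu];
		st->stop_work.fn(st->stop_work.arg);
	}
	return 0;
}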
diff --git a/kernel/sys.c b/kernel/sys.c
index 259fda25eb6b..fa2f2f671a5c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1668,8 +1668,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1668 * overall picture. 1668 * overall picture.
1669 */ 1669 */
1670 err = -EACCES; 1670 err = -EACCES;
1671 if (!S_ISREG(inode->i_mode) || 1671 if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
1672 exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)
1673 goto exit; 1672 goto exit;
1674 1673
1675 err = inode_permission(inode, MAY_EXEC); 1674 err = inode_permission(inode, MAY_EXEC);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7995ef5868d8..a02decf15583 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -140,6 +140,7 @@ cond_syscall(sys_sgetmask);
140cond_syscall(sys_ssetmask); 140cond_syscall(sys_ssetmask);
141cond_syscall(sys_vm86old); 141cond_syscall(sys_vm86old);
142cond_syscall(sys_vm86); 142cond_syscall(sys_vm86);
143cond_syscall(sys_modify_ldt);
143cond_syscall(sys_ipc); 144cond_syscall(sys_ipc);
144cond_syscall(compat_sys_ipc); 145cond_syscall(compat_sys_ipc);
145cond_syscall(compat_sys_sysctl); 146cond_syscall(compat_sys_sysctl);
@@ -218,6 +219,7 @@ cond_syscall(compat_sys_timerfd_gettime);
218cond_syscall(sys_eventfd); 219cond_syscall(sys_eventfd);
219cond_syscall(sys_eventfd2); 220cond_syscall(sys_eventfd2);
220cond_syscall(sys_memfd_create); 221cond_syscall(sys_memfd_create);
222cond_syscall(sys_userfaultfd);
221 223
222/* performance counters: */ 224/* performance counters: */
223cond_syscall(sys_perf_event_open); 225cond_syscall(sys_perf_event_open);
@@ -243,3 +245,6 @@ cond_syscall(sys_bpf);
243 245
244/* execveat */ 246/* execveat */
245cond_syscall(sys_execveat); 247cond_syscall(sys_execveat);
248
249/* membarrier */
250cond_syscall(sys_membarrier);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 19b62b522158..e69201d8094e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -621,7 +621,7 @@ static struct ctl_table kern_table[] = {
621 .proc_handler = proc_dointvec, 621 .proc_handler = proc_dointvec,
622 }, 622 },
623#endif 623#endif
624#ifdef CONFIG_KEXEC 624#ifdef CONFIG_KEXEC_CORE
625 { 625 {
626 .procname = "kexec_load_disabled", 626 .procname = "kexec_load_disabled",
627 .data = &kexec_load_disabled, 627 .data = &kexec_load_disabled,
@@ -1995,7 +1995,7 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
1995 int val = *valp; 1995 int val = *valp;
1996 if (val < 0) { 1996 if (val < 0) {
1997 *negp = true; 1997 *negp = true;
1998 *lvalp = (unsigned long)-val; 1998 *lvalp = -(unsigned long)val;
1999 } else { 1999 } else {
2000 *negp = false; 2000 *negp = false;
2001 *lvalp = (unsigned long)val; 2001 *lvalp = (unsigned long)val;
@@ -2201,7 +2201,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
2201 int val = *valp; 2201 int val = *valp;
2202 if (val < 0) { 2202 if (val < 0) {
2203 *negp = true; 2203 *negp = true;
2204 *lvalp = (unsigned long)-val; 2204 *lvalp = -(unsigned long)val;
2205 } else { 2205 } else {
2206 *negp = false; 2206 *negp = false;
2207 *lvalp = (unsigned long)val; 2207 *lvalp = (unsigned long)val;
@@ -2436,7 +2436,7 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
2436 unsigned long lval; 2436 unsigned long lval;
2437 if (val < 0) { 2437 if (val < 0) {
2438 *negp = true; 2438 *negp = true;
2439 lval = (unsigned long)-val; 2439 lval = -(unsigned long)val;
2440 } else { 2440 } else {
2441 *negp = false; 2441 *negp = false;
2442 lval = (unsigned long)val; 2442 lval = (unsigned long)val;
@@ -2459,7 +2459,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp
2459 unsigned long lval; 2459 unsigned long lval;
2460 if (val < 0) { 2460 if (val < 0) {
2461 *negp = true; 2461 *negp = true;
2462 lval = (unsigned long)-val; 2462 lval = -(unsigned long)val;
2463 } else { 2463 } else {
2464 *negp = false; 2464 *negp = false;
2465 lval = (unsigned long)val; 2465 lval = (unsigned long)val;
@@ -2484,7 +2484,7 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
2484 unsigned long lval; 2484 unsigned long lval;
2485 if (val < 0) { 2485 if (val < 0) {
2486 *negp = true; 2486 *negp = true;
2487 lval = (unsigned long)-val; 2487 lval = -(unsigned long)val;
2488 } else { 2488 } else {
2489 *negp = false; 2489 *negp = false;
2490 lval = (unsigned long)val; 2490 lval = (unsigned long)val;
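The repeated sysctl.c change from "(unsigned long)-val" to "-(unsigned long)val" matters when val is INT_MIN: negating the int first overflows signed int, which is undefined behaviour, whereas converting to unsigned long first and then negating is fully defined and yields the intended magnitude. A minimal demonstration:

#include <limits.h>
#include <stdio.h>

int main(void)
{
	int val = INT_MIN;

	/* Undefined behaviour: -val overflows int before the conversion.   */
	/* unsigned long bad = (unsigned long)-val;                          */

	/* Well defined: convert first, then negate in unsigned arithmetic. */
	unsigned long good = -(unsigned long)val;

	printf("val  = %d\n", val);
	printf("-(unsigned long)val = %lu\n", good);
	return 0;
}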
diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S
deleted file mode 100644
index 3e9868d47535..000000000000
--- a/kernel/system_certificates.S
+++ /dev/null
@@ -1,20 +0,0 @@
1#include <linux/export.h>
2#include <linux/init.h>
3
4 __INITRODATA
5
6 .align 8
7 .globl VMLINUX_SYMBOL(system_certificate_list)
8VMLINUX_SYMBOL(system_certificate_list):
9__cert_list_start:
10 .incbin "kernel/x509_certificate_list"
11__cert_list_end:
12
13 .align 8
14 .globl VMLINUX_SYMBOL(system_certificate_list_size)
15VMLINUX_SYMBOL(system_certificate_list_size):
16#ifdef CONFIG_64BIT
17 .quad __cert_list_end - __cert_list_start
18#else
19 .long __cert_list_end - __cert_list_start
20#endif
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c
deleted file mode 100644
index 875f64e8935b..000000000000
--- a/kernel/system_keyring.c
+++ /dev/null
@@ -1,106 +0,0 @@
1/* System trusted keyring for trusted public keys
2 *
3 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/export.h>
13#include <linux/kernel.h>
14#include <linux/sched.h>
15#include <linux/cred.h>
16#include <linux/err.h>
17#include <keys/asymmetric-type.h>
18#include <keys/system_keyring.h>
19#include "module-internal.h"
20
21struct key *system_trusted_keyring;
22EXPORT_SYMBOL_GPL(system_trusted_keyring);
23
24extern __initconst const u8 system_certificate_list[];
25extern __initconst const unsigned long system_certificate_list_size;
26
27/*
28 * Load the compiled-in keys
29 */
30static __init int system_trusted_keyring_init(void)
31{
32 pr_notice("Initialise system trusted keyring\n");
33
34 system_trusted_keyring =
35 keyring_alloc(".system_keyring",
36 KUIDT_INIT(0), KGIDT_INIT(0), current_cred(),
37 ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
38 KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH),
39 KEY_ALLOC_NOT_IN_QUOTA, NULL);
40 if (IS_ERR(system_trusted_keyring))
41 panic("Can't allocate system trusted keyring\n");
42
43 set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags);
44 return 0;
45}
46
47/*
48 * Must be initialised before we try and load the keys into the keyring.
49 */
50device_initcall(system_trusted_keyring_init);
51
52/*
53 * Load the compiled-in list of X.509 certificates.
54 */
55static __init int load_system_certificate_list(void)
56{
57 key_ref_t key;
58 const u8 *p, *end;
59 size_t plen;
60
61 pr_notice("Loading compiled-in X.509 certificates\n");
62
63 p = system_certificate_list;
64 end = p + system_certificate_list_size;
65 while (p < end) {
66 /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
67 * than 256 bytes in size.
68 */
69 if (end - p < 4)
70 goto dodgy_cert;
71 if (p[0] != 0x30 &&
72 p[1] != 0x82)
73 goto dodgy_cert;
74 plen = (p[2] << 8) | p[3];
75 plen += 4;
76 if (plen > end - p)
77 goto dodgy_cert;
78
79 key = key_create_or_update(make_key_ref(system_trusted_keyring, 1),
80 "asymmetric",
81 NULL,
82 p,
83 plen,
84 ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
85 KEY_USR_VIEW | KEY_USR_READ),
86 KEY_ALLOC_NOT_IN_QUOTA |
87 KEY_ALLOC_TRUSTED);
88 if (IS_ERR(key)) {
89 pr_err("Problem loading in-kernel X.509 certificate (%ld)\n",
90 PTR_ERR(key));
91 } else {
92 set_bit(KEY_FLAG_BUILTIN, &key_ref_to_ptr(key)->flags);
93 pr_notice("Loaded X.509 cert '%s'\n",
94 key_ref_to_ptr(key)->description);
95 key_ref_put(key);
96 }
97 p += plen;
98 }
99
100 return 0;
101
102dodgy_cert:
103 pr_err("Problem parsing in-kernel X.509 certificate list\n");
104 return 0;
105}
106late_initcall(load_system_certificate_list);
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 8727032e3a6f..53fa971d000d 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -18,6 +18,8 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */
18 * This is like the signal handler which runs in kernel mode, but it doesn't 18 * This is like the signal handler which runs in kernel mode, but it doesn't
19 * try to wake up the @task. 19 * try to wake up the @task.
20 * 20 *
21 * Note: there is no ordering guarantee on works queued here.
22 *
21 * RETURNS: 23 * RETURNS:
22 * 0 if succeeds or -ESRCH. 24 * 0 if succeeds or -ESRCH.
23 */ 25 */
@@ -108,16 +110,6 @@ void task_work_run(void)
108 raw_spin_unlock_wait(&task->pi_lock); 110 raw_spin_unlock_wait(&task->pi_lock);
109 smp_mb(); 111 smp_mb();
110 112
111 /* Reverse the list to run the works in fifo order */
112 head = NULL;
113 do {
114 next = work->next;
115 work->next = head;
116 head = work;
117 work = next;
118 } while (work);
119
120 work = head;
121 do { 113 do {
122 next = work->next; 114 next = work->next;
123 work->func(work); 115 work->func(work);
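The task_work.c hunk drops the in-place reversal of the callback list, and the added comment records that queued works therefore no longer run in FIFO order. The removed loop is the classic singly-linked-list reversal; a standalone version of just that step, with illustrative names:

#include <stdio.h>

struct callback {
	struct callback *next;
	const char *name;
};

/* Reverse a NULL-terminated singly linked list; this is the loop the hunk
 * above removes from task_work_run(). */
static struct callback *reverse(struct callback *work)
{
	struct callback *head = NULL;

	while (work) {
		struct callback *next = work->next;

		work->next = head;
		head = work;
		work = next;
	}
	return head;
}

int main(void)
{
	struct callback c = { NULL, "first queued" };
	struct callback b = { &c,   "second queued" };
	struct callback a = { &b,   "third queued" };	/* head = newest, LIFO */
	struct callback *w;

	for (w = reverse(&a); w; w = w->next)
		printf("running %s\n", w->name);	/* now FIFO order */
	return 0;
}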
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 579ce1b929af..4008d9f95dd7 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -92,12 +92,10 @@ config NO_HZ_FULL
92 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS 92 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
93 # We need at least one periodic CPU for timekeeping 93 # We need at least one periodic CPU for timekeeping
94 depends on SMP 94 depends on SMP
95 # RCU_USER_QS dependency
96 depends on HAVE_CONTEXT_TRACKING 95 depends on HAVE_CONTEXT_TRACKING
97 # VIRT_CPU_ACCOUNTING_GEN dependency 96 # VIRT_CPU_ACCOUNTING_GEN dependency
98 depends on HAVE_VIRT_CPU_ACCOUNTING_GEN 97 depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
99 select NO_HZ_COMMON 98 select NO_HZ_COMMON
100 select RCU_USER_QS
101 select RCU_NOCB_CPU 99 select RCU_NOCB_CPU
102 select VIRT_CPU_ACCOUNTING_GEN 100 select VIRT_CPU_ACCOUNTING_GEN
103 select IRQ_WORK 101 select IRQ_WORK
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 5c7ae4b641c4..457a373e2181 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -183,7 +183,7 @@ struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
183 int pinned) 183 int pinned)
184{ 184{
185 if (pinned || !base->migration_enabled) 185 if (pinned || !base->migration_enabled)
186 return this_cpu_ptr(&hrtimer_bases); 186 return base;
187 return &per_cpu(hrtimer_bases, get_nohz_timer_target()); 187 return &per_cpu(hrtimer_bases, get_nohz_timer_target());
188} 188}
189#else 189#else
@@ -191,23 +191,32 @@ static inline
191struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, 191struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
192 int pinned) 192 int pinned)
193{ 193{
194 return this_cpu_ptr(&hrtimer_bases); 194 return base;
195} 195}
196#endif 196#endif
197 197
198/* 198/*
199 * Switch the timer base to the current CPU when possible. 199 * We switch the timer base to a power-optimized selected CPU target,
200 * if:
201 * - NO_HZ_COMMON is enabled
202 * - timer migration is enabled
203 * - the timer callback is not running
204 * - the timer is not the first expiring timer on the new target
205 *
206 * If one of the above requirements is not fulfilled we move the timer
207 * to the current CPU or leave it on the previously assigned CPU if
208 * the timer callback is currently running.
200 */ 209 */
201static inline struct hrtimer_clock_base * 210static inline struct hrtimer_clock_base *
202switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, 211switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
203 int pinned) 212 int pinned)
204{ 213{
205 struct hrtimer_cpu_base *new_cpu_base, *this_base; 214 struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base;
206 struct hrtimer_clock_base *new_base; 215 struct hrtimer_clock_base *new_base;
207 int basenum = base->index; 216 int basenum = base->index;
208 217
209 this_base = this_cpu_ptr(&hrtimer_bases); 218 this_cpu_base = this_cpu_ptr(&hrtimer_bases);
210 new_cpu_base = get_target_base(this_base, pinned); 219 new_cpu_base = get_target_base(this_cpu_base, pinned);
211again: 220again:
212 new_base = &new_cpu_base->clock_base[basenum]; 221 new_base = &new_cpu_base->clock_base[basenum];
213 222
@@ -229,19 +238,19 @@ again:
229 raw_spin_unlock(&base->cpu_base->lock); 238 raw_spin_unlock(&base->cpu_base->lock);
230 raw_spin_lock(&new_base->cpu_base->lock); 239 raw_spin_lock(&new_base->cpu_base->lock);
231 240
232 if (new_cpu_base != this_base && 241 if (new_cpu_base != this_cpu_base &&
233 hrtimer_check_target(timer, new_base)) { 242 hrtimer_check_target(timer, new_base)) {
234 raw_spin_unlock(&new_base->cpu_base->lock); 243 raw_spin_unlock(&new_base->cpu_base->lock);
235 raw_spin_lock(&base->cpu_base->lock); 244 raw_spin_lock(&base->cpu_base->lock);
236 new_cpu_base = this_base; 245 new_cpu_base = this_cpu_base;
237 timer->base = base; 246 timer->base = base;
238 goto again; 247 goto again;
239 } 248 }
240 timer->base = new_base; 249 timer->base = new_base;
241 } else { 250 } else {
242 if (new_cpu_base != this_base && 251 if (new_cpu_base != this_cpu_base &&
243 hrtimer_check_target(timer, new_base)) { 252 hrtimer_check_target(timer, new_base)) {
244 new_cpu_base = this_base; 253 new_cpu_base = this_cpu_base;
245 goto again; 254 goto again;
246 } 255 }
247 } 256 }
@@ -679,14 +688,14 @@ static void retrigger_next_event(void *arg)
679/* 688/*
680 * Switch to high resolution mode 689 * Switch to high resolution mode
681 */ 690 */
682static int hrtimer_switch_to_hres(void) 691static void hrtimer_switch_to_hres(void)
683{ 692{
684 struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); 693 struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
685 694
686 if (tick_init_highres()) { 695 if (tick_init_highres()) {
687 printk(KERN_WARNING "Could not switch to high resolution " 696 printk(KERN_WARNING "Could not switch to high resolution "
688 "mode on CPU %d\n", base->cpu); 697 "mode on CPU %d\n", base->cpu);
689 return 0; 698 return;
690 } 699 }
691 base->hres_active = 1; 700 base->hres_active = 1;
692 hrtimer_resolution = HIGH_RES_NSEC; 701 hrtimer_resolution = HIGH_RES_NSEC;
@@ -694,7 +703,6 @@ static int hrtimer_switch_to_hres(void)
694 tick_setup_sched_timer(); 703 tick_setup_sched_timer();
695 /* "Retrigger" the interrupt to get things going */ 704 /* "Retrigger" the interrupt to get things going */
696 retrigger_next_event(NULL); 705 retrigger_next_event(NULL);
697 return 1;
698} 706}
699 707
700static void clock_was_set_work(struct work_struct *work) 708static void clock_was_set_work(struct work_struct *work)
@@ -718,7 +726,7 @@ void clock_was_set_delayed(void)
718static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; } 726static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; }
719static inline int hrtimer_hres_active(void) { return 0; } 727static inline int hrtimer_hres_active(void) { return 0; }
720static inline int hrtimer_is_hres_enabled(void) { return 0; } 728static inline int hrtimer_is_hres_enabled(void) { return 0; }
721static inline int hrtimer_switch_to_hres(void) { return 0; } 729static inline void hrtimer_switch_to_hres(void) { }
722static inline void 730static inline void
723hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } 731hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
724static inline int hrtimer_reprogram(struct hrtimer *timer, 732static inline int hrtimer_reprogram(struct hrtimer *timer,
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index fb4d98c7fd43..df68cb875248 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -487,6 +487,11 @@ out:
487} 487}
488 488
489#ifdef CONFIG_GENERIC_CMOS_UPDATE 489#ifdef CONFIG_GENERIC_CMOS_UPDATE
490int __weak update_persistent_clock(struct timespec now)
491{
492 return -ENODEV;
493}
494
490int __weak update_persistent_clock64(struct timespec64 now64) 495int __weak update_persistent_clock64(struct timespec64 now64)
491{ 496{
492 struct timespec now; 497 struct timespec now;
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index 3e7db49a2381..53d7184da0be 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -18,30 +18,23 @@
18 18
19static struct hrtimer bctimer; 19static struct hrtimer bctimer;
20 20
21static void bc_set_mode(enum clock_event_mode mode, 21static int bc_shutdown(struct clock_event_device *evt)
22 struct clock_event_device *bc)
23{ 22{
24 switch (mode) { 23 /*
25 case CLOCK_EVT_MODE_UNUSED: 24 * Note, we cannot cancel the timer here as we might
26 case CLOCK_EVT_MODE_SHUTDOWN: 25 * run into the following live lock scenario:
27 /* 26 *
28 * Note, we cannot cancel the timer here as we might 27 * cpu 0 cpu1
29 * run into the following live lock scenario: 28 * lock(broadcast_lock);
30 * 29 * hrtimer_interrupt()
31 * cpu 0 cpu1 30 * bc_handler()
32 * lock(broadcast_lock); 31 * tick_handle_oneshot_broadcast();
33 * hrtimer_interrupt() 32 * lock(broadcast_lock);
34 * bc_handler() 33 * hrtimer_cancel()
35 * tick_handle_oneshot_broadcast(); 34 * wait_for_callback()
36 * lock(broadcast_lock); 35 */
37 * hrtimer_cancel() 36 hrtimer_try_to_cancel(&bctimer);
38 * wait_for_callback() 37 return 0;
39 */
40 hrtimer_try_to_cancel(&bctimer);
41 break;
42 default:
43 break;
44 }
45} 38}
46 39
47/* 40/*
@@ -82,7 +75,7 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
82} 75}
83 76
84static struct clock_event_device ce_broadcast_hrtimer = { 77static struct clock_event_device ce_broadcast_hrtimer = {
85 .set_mode = bc_set_mode, 78 .set_state_shutdown = bc_shutdown,
86 .set_next_ktime = bc_set_next, 79 .set_next_ktime = bc_set_next,
87 .features = CLOCK_EVT_FEAT_ONESHOT | 80 .features = CLOCK_EVT_FEAT_ONESHOT |
88 CLOCK_EVT_FEAT_KTIME | 81 CLOCK_EVT_FEAT_KTIME |
@@ -102,13 +95,11 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t)
102{ 95{
103 ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer); 96 ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer);
104 97
105 switch (ce_broadcast_hrtimer.mode) { 98 if (clockevent_state_oneshot(&ce_broadcast_hrtimer))
106 case CLOCK_EVT_MODE_ONESHOT:
107 if (ce_broadcast_hrtimer.next_event.tv64 != KTIME_MAX) 99 if (ce_broadcast_hrtimer.next_event.tv64 != KTIME_MAX)
108 return HRTIMER_RESTART; 100 return HRTIMER_RESTART;
109 default: 101
110 return HRTIMER_NORESTART; 102 return HRTIMER_NORESTART;
111 }
112} 103}
113 104
114void tick_setup_hrtimer_broadcast(void) 105void tick_setup_hrtimer_broadcast(void)
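
The hunk above replaces the single bc_set_mode() switch with a dedicated set_state_shutdown callback, following the clockevents conversion to per-state callbacks. As a rough standalone illustration of that shape (the struct and functions below are simplified mocks, not the kernel's clock_event_device or clockevents API):

/*
 * Minimal userspace mock of "one callback per state" versus the old
 * single set_mode() switch.  Types are stand-ins only.
 */
#include <stdio.h>

struct mock_clockevent {
	const char *name;
	/* new style: one optional callback per state */
	int (*set_state_shutdown)(struct mock_clockevent *evt);
	int (*set_state_oneshot)(struct mock_clockevent *evt);
};

static int mock_shutdown(struct mock_clockevent *evt)
{
	printf("%s: shutdown\n", evt->name);
	return 0;
}

static int mock_oneshot(struct mock_clockevent *evt)
{
	printf("%s: oneshot\n", evt->name);
	return 0;
}

int main(void)
{
	struct mock_clockevent bc = {
		.name = "bc-hrtimer-mock",
		.set_state_shutdown = mock_shutdown,
		.set_state_oneshot = mock_oneshot,
	};

	/* The core calls exactly the callback for the requested state; a   */
	/* device that cannot handle a state leaves the pointer NULL rather */
	/* than falling through a default: branch of a mode switch.         */
	if (bc.set_state_oneshot)
		bc.set_state_oneshot(&bc);
	if (bc.set_state_shutdown)
		bc.set_state_shutdown(&bc);
	return 0;
}
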
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 52b9e199b5ac..f6aae7977824 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -839,7 +839,6 @@ out:
839 raw_spin_unlock(&tick_broadcast_lock); 839 raw_spin_unlock(&tick_broadcast_lock);
840 return ret; 840 return ret;
841} 841}
842EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control);
843 842
844/* 843/*
845 * Reset the one shot broadcast for a cpu 844 * Reset the one shot broadcast for a cpu
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 55e13efff1ab..d11c55b6ab7d 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -304,9 +304,6 @@ void tick_check_new_device(struct clock_event_device *newdev)
304 int cpu; 304 int cpu;
305 305
306 cpu = smp_processor_id(); 306 cpu = smp_processor_id();
307 if (!cpumask_test_cpu(cpu, newdev->cpumask))
308 goto out_bc;
309
310 td = &per_cpu(tick_cpu_device, cpu); 307 td = &per_cpu(tick_cpu_device, cpu);
311 curdev = td->evtdev; 308 curdev = td->evtdev;
312 309
@@ -363,6 +360,7 @@ int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
363 360
364 return __tick_broadcast_oneshot_control(state); 361 return __tick_broadcast_oneshot_control(state);
365} 362}
363EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control);
366 364
367#ifdef CONFIG_HOTPLUG_CPU 365#ifdef CONFIG_HOTPLUG_CPU
368/* 366/*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c792429e98c6..3319e16f31e5 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -197,27 +197,9 @@ static bool can_stop_full_tick(void)
197 return true; 197 return true;
198} 198}
199 199
200static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
201
202/*
203 * Re-evaluate the need for the tick on the current CPU
204 * and restart it if necessary.
205 */
206void __tick_nohz_full_check(void)
207{
208 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
209
210 if (tick_nohz_full_cpu(smp_processor_id())) {
211 if (ts->tick_stopped && !is_idle_task(current)) {
212 if (!can_stop_full_tick())
213 tick_nohz_restart_sched_tick(ts, ktime_get());
214 }
215 }
216}
217
218static void nohz_full_kick_work_func(struct irq_work *work) 200static void nohz_full_kick_work_func(struct irq_work *work)
219{ 201{
220 __tick_nohz_full_check(); 202 /* Empty, the tick restart happens on tick_nohz_irq_exit() */
221} 203}
222 204
223static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { 205static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
@@ -252,7 +234,7 @@ void tick_nohz_full_kick_cpu(int cpu)
252 234
253static void nohz_full_kick_ipi(void *info) 235static void nohz_full_kick_ipi(void *info)
254{ 236{
255 __tick_nohz_full_check(); 237 /* Empty, the tick restart happens on tick_nohz_irq_exit() */
256} 238}
257 239
258/* 240/*
@@ -276,7 +258,7 @@ void tick_nohz_full_kick_all(void)
276 * It might need the tick due to per task/process properties: 258 * It might need the tick due to per task/process properties:
277 * perf events, posix cpu timers, ... 259 * perf events, posix cpu timers, ...
278 */ 260 */
279void __tick_nohz_task_switch(struct task_struct *tsk) 261void __tick_nohz_task_switch(void)
280{ 262{
281 unsigned long flags; 263 unsigned long flags;
282 264
@@ -705,21 +687,38 @@ out:
705 return tick; 687 return tick;
706} 688}
707 689
708static void tick_nohz_full_stop_tick(struct tick_sched *ts) 690static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
691{
692 /* Update jiffies first */
693 tick_do_update_jiffies64(now);
694 update_cpu_load_nohz();
695
696 calc_load_exit_idle();
697 touch_softlockup_watchdog();
698 /*
699 * Cancel the scheduled timer and restore the tick
700 */
701 ts->tick_stopped = 0;
702 ts->idle_exittime = now;
703
704 tick_nohz_restart(ts, now);
705}
706
707static void tick_nohz_full_update_tick(struct tick_sched *ts)
709{ 708{
710#ifdef CONFIG_NO_HZ_FULL 709#ifdef CONFIG_NO_HZ_FULL
711 int cpu = smp_processor_id(); 710 int cpu = smp_processor_id();
712 711
713 if (!tick_nohz_full_cpu(cpu) || is_idle_task(current)) 712 if (!tick_nohz_full_cpu(cpu))
714 return; 713 return;
715 714
716 if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) 715 if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
717 return; 716 return;
718 717
719 if (!can_stop_full_tick()) 718 if (can_stop_full_tick())
720 return; 719 tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
721 720 else if (ts->tick_stopped)
722 tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); 721 tick_nohz_restart_sched_tick(ts, ktime_get());
723#endif 722#endif
724} 723}
725 724
@@ -849,7 +848,7 @@ void tick_nohz_irq_exit(void)
849 if (ts->inidle) 848 if (ts->inidle)
850 __tick_nohz_idle_enter(ts); 849 __tick_nohz_idle_enter(ts);
851 else 850 else
852 tick_nohz_full_stop_tick(ts); 851 tick_nohz_full_update_tick(ts);
853} 852}
854 853
855/** 854/**
@@ -864,23 +863,6 @@ ktime_t tick_nohz_get_sleep_length(void)
864 return ts->sleep_length; 863 return ts->sleep_length;
865} 864}
866 865
867static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
868{
869 /* Update jiffies first */
870 tick_do_update_jiffies64(now);
871 update_cpu_load_nohz();
872
873 calc_load_exit_idle();
874 touch_softlockup_watchdog();
875 /*
876 * Cancel the scheduled timer and restore the tick
877 */
878 ts->tick_stopped = 0;
879 ts->idle_exittime = now;
880
881 tick_nohz_restart(ts, now);
882}
883
884static void tick_nohz_account_idle_ticks(struct tick_sched *ts) 866static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
885{ 867{
886#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 868#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
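
The tick-sched.c changes above remove __tick_nohz_full_check(): the nohz-full kick handlers become empty and the restart decision moves into tick_nohz_full_update_tick(), called from tick_nohz_irq_exit(). The new helper either stops the tick when it can, restarts it when it is currently stopped but no longer allowed to be, or does nothing. A trivial mock of that branch structure, with booleans standing in for can_stop_full_tick() and ts->tick_stopped:

#include <stdio.h>
#include <stdbool.h>

/* Mock of the decision made in tick_nohz_full_update_tick(). */
static const char *update_tick(bool can_stop, bool tick_stopped)
{
	if (can_stop)
		return "stop_sched_tick";
	if (tick_stopped)
		return "restart_sched_tick";	/* restart now happens on irq exit */
	return "nothing to do";
}

int main(void)
{
	printf("%s\n", update_tick(true,  false));
	printf("%s\n", update_tick(false, true));
	printf("%s\n", update_tick(false, false));
	return 0;
}
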
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 85d5bb1d67eb..86751c68e08d 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -268,10 +268,14 @@ EXPORT_SYMBOL(jiffies_to_msecs);
268 268
269unsigned int jiffies_to_usecs(const unsigned long j) 269unsigned int jiffies_to_usecs(const unsigned long j)
270{ 270{
271#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) 271 /*
 272 * Hz usually doesn't go much further than MSEC_PER_SEC.
273 * jiffies_to_usecs() and usecs_to_jiffies() depend on that.
274 */
275 BUILD_BUG_ON(HZ > USEC_PER_SEC);
276
277#if !(USEC_PER_SEC % HZ)
272 return (USEC_PER_SEC / HZ) * j; 278 return (USEC_PER_SEC / HZ) * j;
273#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
274 return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
275#else 279#else
276# if BITS_PER_LONG == 32 280# if BITS_PER_LONG == 32
277 return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; 281 return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
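
With the added BUILD_BUG_ON(HZ > USEC_PER_SEC), the old HZ > USEC_PER_SEC branch is unreachable and is dropped, leaving the exact-division fast path as a single multiply whenever USEC_PER_SEC % HZ == 0. A throwaway userspace check of that arithmetic; the HZ value here is only an example config, not anything mandated by the patch:

#include <assert.h>
#include <stdio.h>

#define USEC_PER_SEC 1000000U
#define HZ 250U			/* example value */

static unsigned int jiffies_to_usecs_fast(unsigned long j)
{
	/* valid only because HZ <= USEC_PER_SEC and USEC_PER_SEC % HZ == 0 */
	return (USEC_PER_SEC / HZ) * j;
}

int main(void)
{
	assert(USEC_PER_SEC % HZ == 0);
	/* one tick at HZ=250 is 4000 us, ten ticks are 40000 us */
	printf("%u %u\n", jiffies_to_usecs_fast(1), jiffies_to_usecs_fast(10));
	return 0;
}
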
@@ -287,26 +291,20 @@ EXPORT_SYMBOL(jiffies_to_usecs);
287 * @t: Timespec 291 * @t: Timespec
288 * @gran: Granularity in ns. 292 * @gran: Granularity in ns.
289 * 293 *
290 * Truncate a timespec to a granularity. gran must be smaller than a second. 294 * Truncate a timespec to a granularity. Always rounds down. gran must
291 * Always rounds down. 295 * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns).
292 *
293 * This function should be only used for timestamps returned by
294 * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because
295 * it doesn't handle the better resolution of the latter.
296 */ 296 */
297struct timespec timespec_trunc(struct timespec t, unsigned gran) 297struct timespec timespec_trunc(struct timespec t, unsigned gran)
298{ 298{
299 /* 299 /* Avoid division in the common cases 1 ns and 1 s. */
300 * Division is pretty slow so avoid it for common cases. 300 if (gran == 1) {
301 * Currently current_kernel_time() never returns better than
302 * jiffies resolution. Exploit that.
303 */
304 if (gran <= jiffies_to_usecs(1) * 1000) {
305 /* nothing */ 301 /* nothing */
306 } else if (gran == 1000000000) { 302 } else if (gran == NSEC_PER_SEC) {
307 t.tv_nsec = 0; 303 t.tv_nsec = 0;
308 } else { 304 } else if (gran > 1 && gran < NSEC_PER_SEC) {
309 t.tv_nsec -= t.tv_nsec % gran; 305 t.tv_nsec -= t.tv_nsec % gran;
306 } else {
307 WARN(1, "illegal file time granularity: %u", gran);
310 } 308 }
311 return t; 309 return t;
312} 310}
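
The rewritten timespec_trunc() now handles exactly three cases: gran == 1 is a no-op, gran == NSEC_PER_SEC drops the nanosecond part, anything in between rounds tv_nsec down, and everything else triggers a warning. A small userspace sketch of the same rounding, with a plain struct in place of the kernel's timespec:

#include <stdio.h>

#define NSEC_PER_SEC 1000000000L

struct ts {
	long tv_sec;
	long tv_nsec;
};

static struct ts trunc_ts(struct ts t, unsigned gran)
{
	if (gran == 1) {
		/* nothing to do, full nanosecond resolution */
	} else if (gran == NSEC_PER_SEC) {
		t.tv_nsec = 0;
	} else if (gran > 1 && gran < NSEC_PER_SEC) {
		t.tv_nsec -= t.tv_nsec % gran;	/* always rounds down */
	} else {
		fprintf(stderr, "illegal granularity: %u\n", gran);
	}
	return t;
}

int main(void)
{
	struct ts t = { .tv_sec = 5, .tv_nsec = 123456789 };

	/* 1 ms granularity keeps 123000000 ns, 1 s granularity keeps 0 */
	printf("%ld\n", trunc_ts(t, 1000000).tv_nsec);
	printf("%ld\n", trunc_ts(t, NSEC_PER_SEC).tv_nsec);
	return 0;
}
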
@@ -546,7 +544,7 @@ EXPORT_SYMBOL(__usecs_to_jiffies);
546 * value to a scaled second value. 544 * value to a scaled second value.
547 */ 545 */
548static unsigned long 546static unsigned long
549__timespec_to_jiffies(unsigned long sec, long nsec) 547__timespec64_to_jiffies(u64 sec, long nsec)
550{ 548{
551 nsec = nsec + TICK_NSEC - 1; 549 nsec = nsec + TICK_NSEC - 1;
552 550
@@ -554,22 +552,27 @@ __timespec_to_jiffies(unsigned long sec, long nsec)
554 sec = MAX_SEC_IN_JIFFIES; 552 sec = MAX_SEC_IN_JIFFIES;
555 nsec = 0; 553 nsec = 0;
556 } 554 }
557 return (((u64)sec * SEC_CONVERSION) + 555 return ((sec * SEC_CONVERSION) +
558 (((u64)nsec * NSEC_CONVERSION) >> 556 (((u64)nsec * NSEC_CONVERSION) >>
559 (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; 557 (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
560 558
561} 559}
562 560
563unsigned long 561static unsigned long
564timespec_to_jiffies(const struct timespec *value) 562__timespec_to_jiffies(unsigned long sec, long nsec)
565{ 563{
566 return __timespec_to_jiffies(value->tv_sec, value->tv_nsec); 564 return __timespec64_to_jiffies((u64)sec, nsec);
567} 565}
568 566
569EXPORT_SYMBOL(timespec_to_jiffies); 567unsigned long
568timespec64_to_jiffies(const struct timespec64 *value)
569{
570 return __timespec64_to_jiffies(value->tv_sec, value->tv_nsec);
571}
572EXPORT_SYMBOL(timespec64_to_jiffies);
570 573
571void 574void
572jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) 575jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value)
573{ 576{
574 /* 577 /*
575 * Convert jiffies to nanoseconds and separate with 578 * Convert jiffies to nanoseconds and separate with
@@ -580,7 +583,7 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
580 NSEC_PER_SEC, &rem); 583 NSEC_PER_SEC, &rem);
581 value->tv_nsec = rem; 584 value->tv_nsec = rem;
582} 585}
583EXPORT_SYMBOL(jiffies_to_timespec); 586EXPORT_SYMBOL(jiffies_to_timespec64);
584 587
585/* 588/*
586 * We could use a similar algorithm to timespec_to_jiffies (with a 589 * We could use a similar algorithm to timespec_to_jiffies (with a
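
timespec_to_jiffies() is now a thin wrapper around a timespec64-based helper; the scaled SEC_CONVERSION/NSEC_CONVERSION arithmetic is unchanged apart from taking a 64-bit seconds value. Conceptually the conversion rounds the nanosecond part up to whole ticks. A simplified, non-scaled equivalent (local struct, example HZ, and the MAX_SEC_IN_JIFFIES clamp omitted), only to show the rounding behaviour:

#include <stdio.h>
#include <stdint.h>

#define HZ 100ULL			/* example value */
#define NSEC_PER_SEC 1000000000ULL
#define TICK_NSEC (NSEC_PER_SEC / HZ)	/* 10 ms per tick at HZ=100 */

struct ts64 {
	int64_t tv_sec;
	long tv_nsec;
};

/* Round the nanosecond part up so a caller never sleeps too short. */
static unsigned long ts64_to_jiffies(const struct ts64 *v)
{
	uint64_t nsec = (uint64_t)v->tv_nsec + TICK_NSEC - 1;

	return (unsigned long)(v->tv_sec * HZ + nsec / TICK_NSEC);
}

int main(void)
{
	struct ts64 a = { .tv_sec = 2, .tv_nsec = 1 };		/* 201 ticks */
	struct ts64 b = { .tv_sec = 0, .tv_nsec = 10000000 };	/* exactly 1 */

	printf("%lu %lu\n", ts64_to_jiffies(&a), ts64_to_jiffies(&b));
	return 0;
}
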
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index bca3667a2de1..f6ee2e6b6f5d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -911,6 +911,7 @@ int do_settimeofday64(const struct timespec64 *ts)
911 struct timekeeper *tk = &tk_core.timekeeper; 911 struct timekeeper *tk = &tk_core.timekeeper;
912 struct timespec64 ts_delta, xt; 912 struct timespec64 ts_delta, xt;
913 unsigned long flags; 913 unsigned long flags;
914 int ret = 0;
914 915
915 if (!timespec64_valid_strict(ts)) 916 if (!timespec64_valid_strict(ts))
916 return -EINVAL; 917 return -EINVAL;
@@ -924,10 +925,15 @@ int do_settimeofday64(const struct timespec64 *ts)
924 ts_delta.tv_sec = ts->tv_sec - xt.tv_sec; 925 ts_delta.tv_sec = ts->tv_sec - xt.tv_sec;
925 ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec; 926 ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec;
926 927
928 if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) {
929 ret = -EINVAL;
930 goto out;
931 }
932
927 tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); 933 tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta));
928 934
929 tk_set_xtime(tk, ts); 935 tk_set_xtime(tk, ts);
930 936out:
931 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); 937 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
932 938
933 write_seqcount_end(&tk_core.seq); 939 write_seqcount_end(&tk_core.seq);
@@ -936,7 +942,7 @@ int do_settimeofday64(const struct timespec64 *ts)
936 /* signal hrtimers about time change */ 942 /* signal hrtimers about time change */
937 clock_was_set(); 943 clock_was_set();
938 944
939 return 0; 945 return ret;
940} 946}
941EXPORT_SYMBOL(do_settimeofday64); 947EXPORT_SYMBOL(do_settimeofday64);
942 948
@@ -965,7 +971,8 @@ int timekeeping_inject_offset(struct timespec *ts)
965 971
966 /* Make sure the proposed value is valid */ 972 /* Make sure the proposed value is valid */
967 tmp = timespec64_add(tk_xtime(tk), ts64); 973 tmp = timespec64_add(tk_xtime(tk), ts64);
968 if (!timespec64_valid_strict(&tmp)) { 974 if (timespec64_compare(&tk->wall_to_monotonic, &ts64) > 0 ||
975 !timespec64_valid_strict(&tmp)) {
969 ret = -EINVAL; 976 ret = -EINVAL;
970 goto error; 977 goto error;
971 } 978 }
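
Both timekeeping hunks above reject changes that would leave wall_to_monotonic positive, i.e. a requested wall time that lies before the monotonic epoch: do_settimeofday64() now fails with -EINVAL when the delta compares below wall_to_monotonic, and timekeeping_inject_offset() applies the same comparison to the injected offset. A rough sketch of that comparison with plain structs (not the kernel's timekeeper):

#include <stdio.h>

struct ts64 {
	long long tv_sec;
	long tv_nsec;
};

static int ts64_compare(const struct ts64 *lhs, const struct ts64 *rhs)
{
	if (lhs->tv_sec < rhs->tv_sec)
		return -1;
	if (lhs->tv_sec > rhs->tv_sec)
		return 1;
	return (int)(lhs->tv_nsec - rhs->tv_nsec);
}

/*
 * wall_to_monotonic is kept non-positive (wall time runs ahead of the
 * monotonic clock).  The new value would be wall_to_monotonic - delta,
 * so a delta that compares below it would flip the result positive,
 * meaning the requested wall time lies before the monotonic epoch.
 * The hunks above reject that case with -EINVAL; this helper mirrors
 * the same comparison.
 */
static int offset_ok(const struct ts64 *wall_to_mono, const struct ts64 *delta)
{
	return ts64_compare(wall_to_mono, delta) <= 0;
}

int main(void)
{
	struct ts64 w2m   = { .tv_sec = -1000, .tv_nsec = 0 };
	struct ts64 small = { .tv_sec = 10,    .tv_nsec = 0 };
	struct ts64 back  = { .tv_sec = -2000, .tv_nsec = 0 };

	printf("forward by 10s: %s\n", offset_ok(&w2m, &small) ? "ok" : "rejected");
	printf("back before boot: %s\n", offset_ok(&w2m, &back) ? "ok" : "rejected");
	return 0;
}
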
@@ -1874,7 +1881,7 @@ struct timespec __current_kernel_time(void)
1874 return timespec64_to_timespec(tk_xtime(tk)); 1881 return timespec64_to_timespec(tk_xtime(tk));
1875} 1882}
1876 1883
1877struct timespec current_kernel_time(void) 1884struct timespec64 current_kernel_time64(void)
1878{ 1885{
1879 struct timekeeper *tk = &tk_core.timekeeper; 1886 struct timekeeper *tk = &tk_core.timekeeper;
1880 struct timespec64 now; 1887 struct timespec64 now;
@@ -1886,9 +1893,9 @@ struct timespec current_kernel_time(void)
1886 now = tk_xtime(tk); 1893 now = tk_xtime(tk);
1887 } while (read_seqcount_retry(&tk_core.seq, seq)); 1894 } while (read_seqcount_retry(&tk_core.seq, seq));
1888 1895
1889 return timespec64_to_timespec(now); 1896 return now;
1890} 1897}
1891EXPORT_SYMBOL(current_kernel_time); 1898EXPORT_SYMBOL(current_kernel_time64);
1892 1899
1893struct timespec64 get_monotonic_coarse64(void) 1900struct timespec64 get_monotonic_coarse64(void)
1894{ 1901{
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 5e097fa9faf7..84190f02b521 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -807,8 +807,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
807 spin_unlock(&base->lock); 807 spin_unlock(&base->lock);
808 base = new_base; 808 base = new_base;
809 spin_lock(&base->lock); 809 spin_lock(&base->lock);
810 timer->flags &= ~TIMER_BASEMASK; 810 WRITE_ONCE(timer->flags,
811 timer->flags |= base->cpu; 811 (timer->flags & ~TIMER_BASEMASK) | base->cpu);
812 } 812 }
813 } 813 }
814 814
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index a4536e1e3e2a..129c96033e46 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -137,7 +137,7 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
137 (unsigned long long) ktime_to_ns(base->offset)); 137 (unsigned long long) ktime_to_ns(base->offset));
138#endif 138#endif
139 SEQ_printf(m, "active timers:\n"); 139 SEQ_printf(m, "active timers:\n");
140 print_active_timers(m, base, now); 140 print_active_timers(m, base, now + ktime_to_ns(base->offset));
141} 141}
142 142
143static void print_cpu(struct seq_file *m, int cpu, u64 now) 143static void print_cpu(struct seq_file *m, int cpu, u64 now)
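
The timer_list.c change above adds the clock base's offset to "now" before printing, so the expiry values of offset bases (CLOCK_REALTIME and friends) are compared against a now expressed in the same time domain instead of raw CLOCK_MONOTONIC. A one-shot arithmetic illustration with made-up nanosecond values:

#include <stdio.h>

int main(void)
{
	long long now_mono    = 1000;	/* "now" on CLOCK_MONOTONIC */
	long long base_offset = 5000;	/* base time minus monotonic time */
	long long expires     = 5500;	/* expiry stored in the base's domain */

	/* Mixing domains makes this already-expired timer look 4500 ns away. */
	printf("wrong: %lld ns until expiry\n", expires - now_mono);
	/* The fix converts now into the base's domain first. */
	printf("right: %lld ns until expiry\n", expires - (now_mono + base_offset));
	return 0;
}
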
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 3b9a48ae153a..1153c43428f3 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -434,7 +434,7 @@ config UPROBE_EVENT
434 434
435config BPF_EVENTS 435config BPF_EVENTS
436 depends on BPF_SYSCALL 436 depends on BPF_SYSCALL
437 depends on KPROBE_EVENT 437 depends on KPROBE_EVENT || UPROBE_EVENT
438 bool 438 bool
439 default y 439 default y
440 help 440 help
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b3e6b39b6cf9..90e72a0c3047 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -778,9 +778,6 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
778 if (likely(!bt)) 778 if (likely(!bt))
779 return; 779 return;
780 780
781 if (!error && !bio_flagged(bio, BIO_UPTODATE))
782 error = EIO;
783
784 __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, 781 __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
785 bio->bi_rw, what, error, 0, NULL); 782 bio->bi_rw, what, error, 0, NULL);
786} 783}
@@ -887,8 +884,7 @@ static void blk_add_trace_split(void *ignore,
887 884
888 __blk_add_trace(bt, bio->bi_iter.bi_sector, 885 __blk_add_trace(bt, bio->bi_iter.bi_sector,
889 bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT, 886 bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT,
890 !bio_flagged(bio, BIO_UPTODATE), 887 bio->bi_error, sizeof(rpdu), &rpdu);
891 sizeof(rpdu), &rpdu);
892 } 888 }
893} 889}
894 890
@@ -920,8 +916,8 @@ static void blk_add_trace_bio_remap(void *ignore,
920 r.sector_from = cpu_to_be64(from); 916 r.sector_from = cpu_to_be64(from);
921 917
922 __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, 918 __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
923 bio->bi_rw, BLK_TA_REMAP, 919 bio->bi_rw, BLK_TA_REMAP, bio->bi_error,
924 !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); 920 sizeof(r), &r);
925} 921}
926 922
927/** 923/**
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 88a041adee90..0fe96c7c8803 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -81,13 +81,16 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
81 81
82/* 82/*
83 * limited trace_printk() 83 * limited trace_printk()
84 * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed 84 * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
85 */ 85 */
86static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) 86static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
87{ 87{
88 char *fmt = (char *) (long) r1; 88 char *fmt = (char *) (long) r1;
89 bool str_seen = false;
89 int mod[3] = {}; 90 int mod[3] = {};
90 int fmt_cnt = 0; 91 int fmt_cnt = 0;
92 u64 unsafe_addr;
93 char buf[64];
91 int i; 94 int i;
92 95
93 /* 96 /*
@@ -114,12 +117,37 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
114 if (fmt[i] == 'l') { 117 if (fmt[i] == 'l') {
115 mod[fmt_cnt]++; 118 mod[fmt_cnt]++;
116 i++; 119 i++;
117 } else if (fmt[i] == 'p') { 120 } else if (fmt[i] == 'p' || fmt[i] == 's') {
118 mod[fmt_cnt]++; 121 mod[fmt_cnt]++;
119 i++; 122 i++;
120 if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0) 123 if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0)
121 return -EINVAL; 124 return -EINVAL;
122 fmt_cnt++; 125 fmt_cnt++;
126 if (fmt[i - 1] == 's') {
127 if (str_seen)
128 /* allow only one '%s' per fmt string */
129 return -EINVAL;
130 str_seen = true;
131
132 switch (fmt_cnt) {
133 case 1:
134 unsafe_addr = r3;
135 r3 = (long) buf;
136 break;
137 case 2:
138 unsafe_addr = r4;
139 r4 = (long) buf;
140 break;
141 case 3:
142 unsafe_addr = r5;
143 r5 = (long) buf;
144 break;
145 }
146 buf[0] = 0;
147 strncpy_from_unsafe(buf,
148 (void *) (long) unsafe_addr,
149 sizeof(buf));
150 }
123 continue; 151 continue;
124 } 152 }
125 153
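
The hunk above teaches bpf_trace_printk() to accept a single %s by redirecting the matching argument to a bounded on-stack buffer filled with strncpy_from_unsafe(). Below is a loose userspace approximation of that rule (at most one %s, the string argument routed through a fixed 64-byte buffer, at most three conversions); it is only a sketch of the validation shape, not the BPF helper or verifier behaviour:

#include <stdio.h>
#include <string.h>
#include <stdbool.h>

/*
 * Return 0 if fmt contains at most one %s and at most three conversions,
 * copying the %s argument into buf so later printing never chases an
 * unverified pointer.  Loosely mirrors the checks added above (it even
 * counts "%%" as a conversion, a simplification).
 */
static int prep_fmt_args(const char *fmt, const char *str_arg,
			 char *buf, size_t buflen)
{
	bool str_seen = false;
	int fmt_cnt = 0;

	for (size_t i = 0; fmt[i]; i++) {
		if (fmt[i] != '%')
			continue;
		if (fmt[i + 1] == 's') {
			if (str_seen)
				return -1;	/* only one %s allowed */
			str_seen = true;
			buf[0] = '\0';
			if (str_arg)
				strncpy(buf, str_arg, buflen - 1);
			buf[buflen - 1] = '\0';
		}
		if (++fmt_cnt > 3)
			return -1;		/* at most three conversions */
	}
	return 0;
}

int main(void)
{
	char buf[64];

	if (!prep_fmt_args("pid %d comm %s\n", "worker/0", buf, sizeof(buf)))
		printf("pid %d comm %s\n", 42, buf);

	if (prep_fmt_args("%s %s\n", "x", buf, sizeof(buf)))
		printf("rejected: more than one %%s\n");
	return 0;
}
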
@@ -158,6 +186,35 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
158 return &bpf_trace_printk_proto; 186 return &bpf_trace_printk_proto;
159} 187}
160 188
189static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
190{
191 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
192 struct bpf_array *array = container_of(map, struct bpf_array, map);
193 struct perf_event *event;
194
195 if (unlikely(index >= array->map.max_entries))
196 return -E2BIG;
197
198 event = (struct perf_event *)array->ptrs[index];
199 if (!event)
200 return -ENOENT;
201
202 /*
203 * we don't know if the function is run successfully by the
204 * return value. It can be judged in other places, such as
205 * eBPF programs.
206 */
207 return perf_event_read_local(event);
208}
209
210const struct bpf_func_proto bpf_perf_event_read_proto = {
211 .func = bpf_perf_event_read,
212 .gpl_only = false,
213 .ret_type = RET_INTEGER,
214 .arg1_type = ARG_CONST_MAP_PTR,
215 .arg2_type = ARG_ANYTHING,
216};
217
161static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) 218static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
162{ 219{
163 switch (func_id) { 220 switch (func_id) {
@@ -183,6 +240,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
183 return bpf_get_trace_printk_proto(); 240 return bpf_get_trace_printk_proto();
184 case BPF_FUNC_get_smp_processor_id: 241 case BPF_FUNC_get_smp_processor_id:
185 return &bpf_get_smp_processor_id_proto; 242 return &bpf_get_smp_processor_id_proto;
243 case BPF_FUNC_perf_event_read:
244 return &bpf_perf_event_read_proto;
186 default: 245 default:
187 return NULL; 246 return NULL;
188 } 247 }
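
The new bpf_perf_event_read() helper follows a common map-lookup shape: bound-check the index, return -E2BIG past the end, -ENOENT for an empty slot, and otherwise forward to the reader. Note that the same return channel carries both negative errno values and the counter value, which is what the comment in the hunk is warning about. A generic standalone version of that shape (plain arrays and errno constants, nothing BPF- or perf-specific):

#include <errno.h>
#include <stdio.h>

#define MAX_ENTRIES 4

static long counters[MAX_ENTRIES]        = { 100, 0, 300, 0 };
static const int populated[MAX_ENTRIES]  = { 1,   0, 1,   0 };

static long read_slot(unsigned long index)
{
	if (index >= MAX_ENTRIES)
		return -E2BIG;		/* index outside the map */
	if (!populated[index])
		return -ENOENT;		/* no event stored in this slot */
	return counters[index];		/* the value itself on success */
}

int main(void)
{
	printf("%ld %ld %ld\n", read_slot(0), read_slot(1), read_slot(9));
	return 0;
}
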
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 02bece4a99ea..b0623ac785a2 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -98,6 +98,13 @@ struct ftrace_pid {
98 struct pid *pid; 98 struct pid *pid;
99}; 99};
100 100
101static bool ftrace_pids_enabled(void)
102{
103 return !list_empty(&ftrace_pids);
104}
105
106static void ftrace_update_trampoline(struct ftrace_ops *ops);
107
101/* 108/*
102 * ftrace_disabled is set when an anomaly is discovered. 109 * ftrace_disabled is set when an anomaly is discovered.
103 * ftrace_disabled is much stronger than ftrace_enabled. 110 * ftrace_disabled is much stronger than ftrace_enabled.
@@ -109,7 +116,6 @@ static DEFINE_MUTEX(ftrace_lock);
109static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; 116static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
110static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; 117static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
111ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 118ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
112ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
113static struct ftrace_ops global_ops; 119static struct ftrace_ops global_ops;
114static struct ftrace_ops control_ops; 120static struct ftrace_ops control_ops;
115 121
@@ -183,14 +189,7 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
183 if (!test_tsk_trace_trace(current)) 189 if (!test_tsk_trace_trace(current))
184 return; 190 return;
185 191
186 ftrace_pid_function(ip, parent_ip, op, regs); 192 op->saved_func(ip, parent_ip, op, regs);
187}
188
189static void set_ftrace_pid_function(ftrace_func_t func)
190{
191 /* do not set ftrace_pid_function to itself! */
192 if (func != ftrace_pid_func)
193 ftrace_pid_function = func;
194} 193}
195 194
196/** 195/**
@@ -202,7 +201,6 @@ static void set_ftrace_pid_function(ftrace_func_t func)
202void clear_ftrace_function(void) 201void clear_ftrace_function(void)
203{ 202{
204 ftrace_trace_function = ftrace_stub; 203 ftrace_trace_function = ftrace_stub;
205 ftrace_pid_function = ftrace_stub;
206} 204}
207 205
208static void control_ops_disable_all(struct ftrace_ops *ops) 206static void control_ops_disable_all(struct ftrace_ops *ops)
@@ -436,6 +434,12 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
436 } else 434 } else
437 add_ftrace_ops(&ftrace_ops_list, ops); 435 add_ftrace_ops(&ftrace_ops_list, ops);
438 436
437 /* Always save the function, and reset at unregistering */
438 ops->saved_func = ops->func;
439
440 if (ops->flags & FTRACE_OPS_FL_PID && ftrace_pids_enabled())
441 ops->func = ftrace_pid_func;
442
439 ftrace_update_trampoline(ops); 443 ftrace_update_trampoline(ops);
440 444
441 if (ftrace_enabled) 445 if (ftrace_enabled)
@@ -463,15 +467,28 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
463 if (ftrace_enabled) 467 if (ftrace_enabled)
464 update_ftrace_function(); 468 update_ftrace_function();
465 469
470 ops->func = ops->saved_func;
471
466 return 0; 472 return 0;
467} 473}
468 474
469static void ftrace_update_pid_func(void) 475static void ftrace_update_pid_func(void)
470{ 476{
477 bool enabled = ftrace_pids_enabled();
478 struct ftrace_ops *op;
479
471 /* Only do something if we are tracing something */ 480 /* Only do something if we are tracing something */
472 if (ftrace_trace_function == ftrace_stub) 481 if (ftrace_trace_function == ftrace_stub)
473 return; 482 return;
474 483
484 do_for_each_ftrace_op(op, ftrace_ops_list) {
485 if (op->flags & FTRACE_OPS_FL_PID) {
486 op->func = enabled ? ftrace_pid_func :
487 op->saved_func;
488 ftrace_update_trampoline(op);
489 }
490 } while_for_each_ftrace_op(op);
491
475 update_ftrace_function(); 492 update_ftrace_function();
476} 493}
477 494
@@ -613,13 +630,18 @@ static int function_stat_show(struct seq_file *m, void *v)
613 goto out; 630 goto out;
614 } 631 }
615 632
633#ifdef CONFIG_FUNCTION_GRAPH_TRACER
634 avg = rec->time;
635 do_div(avg, rec->counter);
636 if (tracing_thresh && (avg < tracing_thresh))
637 goto out;
638#endif
639
616 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 640 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
617 seq_printf(m, " %-30.30s %10lu", str, rec->counter); 641 seq_printf(m, " %-30.30s %10lu", str, rec->counter);
618 642
619#ifdef CONFIG_FUNCTION_GRAPH_TRACER 643#ifdef CONFIG_FUNCTION_GRAPH_TRACER
620 seq_puts(m, " "); 644 seq_puts(m, " ");
621 avg = rec->time;
622 do_div(avg, rec->counter);
623 645
624 /* Sample standard deviation (s^2) */ 646 /* Sample standard deviation (s^2) */
625 if (rec->counter <= 1) 647 if (rec->counter <= 1)
@@ -1133,7 +1155,8 @@ static struct ftrace_ops global_ops = {
1133 .local_hash.filter_hash = EMPTY_HASH, 1155 .local_hash.filter_hash = EMPTY_HASH,
1134 INIT_OPS_HASH(global_ops) 1156 INIT_OPS_HASH(global_ops)
1135 .flags = FTRACE_OPS_FL_RECURSION_SAFE | 1157 .flags = FTRACE_OPS_FL_RECURSION_SAFE |
1136 FTRACE_OPS_FL_INITIALIZED, 1158 FTRACE_OPS_FL_INITIALIZED |
1159 FTRACE_OPS_FL_PID,
1137}; 1160};
1138 1161
1139/* 1162/*
@@ -5023,7 +5046,9 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops)
5023 5046
5024static struct ftrace_ops global_ops = { 5047static struct ftrace_ops global_ops = {
5025 .func = ftrace_stub, 5048 .func = ftrace_stub,
5026 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, 5049 .flags = FTRACE_OPS_FL_RECURSION_SAFE |
5050 FTRACE_OPS_FL_INITIALIZED |
5051 FTRACE_OPS_FL_PID,
5027}; 5052};
5028 5053
5029static int __init ftrace_nodyn_init(void) 5054static int __init ftrace_nodyn_init(void)
@@ -5080,11 +5105,6 @@ void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func)
5080 if (WARN_ON(tr->ops->func != ftrace_stub)) 5105 if (WARN_ON(tr->ops->func != ftrace_stub))
5081 printk("ftrace ops had %pS for function\n", 5106 printk("ftrace ops had %pS for function\n",
5082 tr->ops->func); 5107 tr->ops->func);
5083 /* Only the top level instance does pid tracing */
5084 if (!list_empty(&ftrace_pids)) {
5085 set_ftrace_pid_function(func);
5086 func = ftrace_pid_func;
5087 }
5088 } 5108 }
5089 tr->ops->func = func; 5109 tr->ops->func = func;
5090 tr->ops->private = tr; 5110 tr->ops->private = tr;
@@ -5371,7 +5391,7 @@ static void *fpid_start(struct seq_file *m, loff_t *pos)
5371{ 5391{
5372 mutex_lock(&ftrace_lock); 5392 mutex_lock(&ftrace_lock);
5373 5393
5374 if (list_empty(&ftrace_pids) && (!*pos)) 5394 if (!ftrace_pids_enabled() && (!*pos))
5375 return (void *) 1; 5395 return (void *) 1;
5376 5396
5377 return seq_list_start(&ftrace_pids, *pos); 5397 return seq_list_start(&ftrace_pids, *pos);
@@ -5610,6 +5630,7 @@ static struct ftrace_ops graph_ops = {
5610 .func = ftrace_stub, 5630 .func = ftrace_stub,
5611 .flags = FTRACE_OPS_FL_RECURSION_SAFE | 5631 .flags = FTRACE_OPS_FL_RECURSION_SAFE |
5612 FTRACE_OPS_FL_INITIALIZED | 5632 FTRACE_OPS_FL_INITIALIZED |
5633 FTRACE_OPS_FL_PID |
5613 FTRACE_OPS_FL_STUB, 5634 FTRACE_OPS_FL_STUB,
5614#ifdef FTRACE_GRAPH_TRAMP_ADDR 5635#ifdef FTRACE_GRAPH_TRAMP_ADDR
5615 .trampoline = FTRACE_GRAPH_TRAMP_ADDR, 5636 .trampoline = FTRACE_GRAPH_TRAMP_ADDR,
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6260717c18e3..fc347f8b1bca 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -400,6 +400,17 @@ struct rb_irq_work {
400}; 400};
401 401
402/* 402/*
403 * Structure to hold event state and handle nested events.
404 */
405struct rb_event_info {
406 u64 ts;
407 u64 delta;
408 unsigned long length;
409 struct buffer_page *tail_page;
410 int add_timestamp;
411};
412
413/*
403 * Used for which event context the event is in. 414 * Used for which event context the event is in.
404 * NMI = 0 415 * NMI = 0
405 * IRQ = 1 416 * IRQ = 1
@@ -1876,73 +1887,6 @@ rb_event_index(struct ring_buffer_event *event)
1876 return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; 1887 return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
1877} 1888}
1878 1889
1879static inline int
1880rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
1881 struct ring_buffer_event *event)
1882{
1883 unsigned long addr = (unsigned long)event;
1884 unsigned long index;
1885
1886 index = rb_event_index(event);
1887 addr &= PAGE_MASK;
1888
1889 return cpu_buffer->commit_page->page == (void *)addr &&
1890 rb_commit_index(cpu_buffer) == index;
1891}
1892
1893static void
1894rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1895{
1896 unsigned long max_count;
1897
1898 /*
1899 * We only race with interrupts and NMIs on this CPU.
1900 * If we own the commit event, then we can commit
1901 * all others that interrupted us, since the interruptions
1902 * are in stack format (they finish before they come
1903 * back to us). This allows us to do a simple loop to
1904 * assign the commit to the tail.
1905 */
1906 again:
1907 max_count = cpu_buffer->nr_pages * 100;
1908
1909 while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
1910 if (RB_WARN_ON(cpu_buffer, !(--max_count)))
1911 return;
1912 if (RB_WARN_ON(cpu_buffer,
1913 rb_is_reader_page(cpu_buffer->tail_page)))
1914 return;
1915 local_set(&cpu_buffer->commit_page->page->commit,
1916 rb_page_write(cpu_buffer->commit_page));
1917 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
1918 cpu_buffer->write_stamp =
1919 cpu_buffer->commit_page->page->time_stamp;
1920 /* add barrier to keep gcc from optimizing too much */
1921 barrier();
1922 }
1923 while (rb_commit_index(cpu_buffer) !=
1924 rb_page_write(cpu_buffer->commit_page)) {
1925
1926 local_set(&cpu_buffer->commit_page->page->commit,
1927 rb_page_write(cpu_buffer->commit_page));
1928 RB_WARN_ON(cpu_buffer,
1929 local_read(&cpu_buffer->commit_page->page->commit) &
1930 ~RB_WRITE_MASK);
1931 barrier();
1932 }
1933
1934 /* again, keep gcc from optimizing */
1935 barrier();
1936
1937 /*
1938 * If an interrupt came in just after the first while loop
1939 * and pushed the tail page forward, we will be left with
1940 * a dangling commit that will never go forward.
1941 */
1942 if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page))
1943 goto again;
1944}
1945
1946static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) 1890static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1947{ 1891{
1948 cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; 1892 cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp;
@@ -1968,64 +1912,6 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1968 iter->head = 0; 1912 iter->head = 0;
1969} 1913}
1970 1914
1971/* Slow path, do not inline */
1972static noinline struct ring_buffer_event *
1973rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
1974{
1975 event->type_len = RINGBUF_TYPE_TIME_EXTEND;
1976
1977 /* Not the first event on the page? */
1978 if (rb_event_index(event)) {
1979 event->time_delta = delta & TS_MASK;
1980 event->array[0] = delta >> TS_SHIFT;
1981 } else {
1982 /* nope, just zero it */
1983 event->time_delta = 0;
1984 event->array[0] = 0;
1985 }
1986
1987 return skip_time_extend(event);
1988}
1989
1990/**
1991 * rb_update_event - update event type and data
1992 * @event: the event to update
1993 * @type: the type of event
1994 * @length: the size of the event field in the ring buffer
1995 *
1996 * Update the type and data fields of the event. The length
1997 * is the actual size that is written to the ring buffer,
1998 * and with this, we can determine what to place into the
1999 * data field.
2000 */
2001static void
2002rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
2003 struct ring_buffer_event *event, unsigned length,
2004 int add_timestamp, u64 delta)
2005{
2006 /* Only a commit updates the timestamp */
2007 if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
2008 delta = 0;
2009
2010 /*
2011 * If we need to add a timestamp, then we
2012 * add it to the start of the resevered space.
2013 */
2014 if (unlikely(add_timestamp)) {
2015 event = rb_add_time_stamp(event, delta);
2016 length -= RB_LEN_TIME_EXTEND;
2017 delta = 0;
2018 }
2019
2020 event->time_delta = delta;
2021 length -= RB_EVNT_HDR_SIZE;
2022 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
2023 event->type_len = 0;
2024 event->array[0] = length;
2025 } else
2026 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
2027}
2028
2029/* 1915/*
2030 * rb_handle_head_page - writer hit the head page 1916 * rb_handle_head_page - writer hit the head page
2031 * 1917 *
@@ -2184,29 +2070,13 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
2184 return 0; 2070 return 0;
2185} 2071}
2186 2072
2187static unsigned rb_calculate_event_length(unsigned length)
2188{
2189 struct ring_buffer_event event; /* Used only for sizeof array */
2190
2191 /* zero length can cause confusions */
2192 if (!length)
2193 length++;
2194
2195 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
2196 length += sizeof(event.array[0]);
2197
2198 length += RB_EVNT_HDR_SIZE;
2199 length = ALIGN(length, RB_ARCH_ALIGNMENT);
2200
2201 return length;
2202}
2203
2204static inline void 2073static inline void
2205rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, 2074rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
2206 struct buffer_page *tail_page, 2075 unsigned long tail, struct rb_event_info *info)
2207 unsigned long tail, unsigned long length)
2208{ 2076{
2077 struct buffer_page *tail_page = info->tail_page;
2209 struct ring_buffer_event *event; 2078 struct ring_buffer_event *event;
2079 unsigned long length = info->length;
2210 2080
2211 /* 2081 /*
2212 * Only the event that crossed the page boundary 2082 * Only the event that crossed the page boundary
@@ -2276,13 +2146,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
2276 */ 2146 */
2277static noinline struct ring_buffer_event * 2147static noinline struct ring_buffer_event *
2278rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, 2148rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
2279 unsigned long length, unsigned long tail, 2149 unsigned long tail, struct rb_event_info *info)
2280 struct buffer_page *tail_page, u64 ts)
2281{ 2150{
2151 struct buffer_page *tail_page = info->tail_page;
2282 struct buffer_page *commit_page = cpu_buffer->commit_page; 2152 struct buffer_page *commit_page = cpu_buffer->commit_page;
2283 struct ring_buffer *buffer = cpu_buffer->buffer; 2153 struct ring_buffer *buffer = cpu_buffer->buffer;
2284 struct buffer_page *next_page; 2154 struct buffer_page *next_page;
2285 int ret; 2155 int ret;
2156 u64 ts;
2286 2157
2287 next_page = tail_page; 2158 next_page = tail_page;
2288 2159
@@ -2368,74 +2239,120 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
2368 2239
2369 out_again: 2240 out_again:
2370 2241
2371 rb_reset_tail(cpu_buffer, tail_page, tail, length); 2242 rb_reset_tail(cpu_buffer, tail, info);
2372 2243
2373 /* fail and let the caller try again */ 2244 /* fail and let the caller try again */
2374 return ERR_PTR(-EAGAIN); 2245 return ERR_PTR(-EAGAIN);
2375 2246
2376 out_reset: 2247 out_reset:
2377 /* reset write */ 2248 /* reset write */
2378 rb_reset_tail(cpu_buffer, tail_page, tail, length); 2249 rb_reset_tail(cpu_buffer, tail, info);
2379 2250
2380 return NULL; 2251 return NULL;
2381} 2252}
2382 2253
2383static struct ring_buffer_event * 2254/* Slow path, do not inline */
2384__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, 2255static noinline struct ring_buffer_event *
2385 unsigned long length, u64 ts, 2256rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
2386 u64 delta, int add_timestamp)
2387{ 2257{
2388 struct buffer_page *tail_page; 2258 event->type_len = RINGBUF_TYPE_TIME_EXTEND;
2389 struct ring_buffer_event *event;
2390 unsigned long tail, write;
2391 2259
2392 /* 2260 /* Not the first event on the page? */
2393 * If the time delta since the last event is too big to 2261 if (rb_event_index(event)) {
2394 * hold in the time field of the event, then we append a 2262 event->time_delta = delta & TS_MASK;
2395 * TIME EXTEND event ahead of the data event. 2263 event->array[0] = delta >> TS_SHIFT;
2396 */ 2264 } else {
2397 if (unlikely(add_timestamp)) 2265 /* nope, just zero it */
2398 length += RB_LEN_TIME_EXTEND; 2266 event->time_delta = 0;
2267 event->array[0] = 0;
2268 }
2399 2269
2400 tail_page = cpu_buffer->tail_page; 2270 return skip_time_extend(event);
2401 write = local_add_return(length, &tail_page->write); 2271}
2402 2272
2403 /* set write to only the index of the write */ 2273static inline int rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
2404 write &= RB_WRITE_MASK; 2274 struct ring_buffer_event *event);
2405 tail = write - length; 2275
2276/**
2277 * rb_update_event - update event type and data
2278 * @event: the event to update
2279 * @type: the type of event
2280 * @length: the size of the event field in the ring buffer
2281 *
2282 * Update the type and data fields of the event. The length
2283 * is the actual size that is written to the ring buffer,
2284 * and with this, we can determine what to place into the
2285 * data field.
2286 */
2287static void
2288rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
2289 struct ring_buffer_event *event,
2290 struct rb_event_info *info)
2291{
2292 unsigned length = info->length;
2293 u64 delta = info->delta;
2294
2295 /* Only a commit updates the timestamp */
2296 if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
2297 delta = 0;
2406 2298
2407 /* 2299 /*
2408 * If this is the first commit on the page, then it has the same 2300 * If we need to add a timestamp, then we
2409 * timestamp as the page itself. 2301 * add it to the start of the resevered space.
2410 */ 2302 */
2411 if (!tail) 2303 if (unlikely(info->add_timestamp)) {
2304 event = rb_add_time_stamp(event, delta);
2305 length -= RB_LEN_TIME_EXTEND;
2412 delta = 0; 2306 delta = 0;
2307 }
2413 2308
2414 /* See if we shot pass the end of this buffer page */ 2309 event->time_delta = delta;
2415 if (unlikely(write > BUF_PAGE_SIZE)) 2310 length -= RB_EVNT_HDR_SIZE;
2416 return rb_move_tail(cpu_buffer, length, tail, 2311 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
2417 tail_page, ts); 2312 event->type_len = 0;
2313 event->array[0] = length;
2314 } else
2315 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
2316}
2418 2317
2419 /* We reserved something on the buffer */ 2318static unsigned rb_calculate_event_length(unsigned length)
2319{
2320 struct ring_buffer_event event; /* Used only for sizeof array */
2420 2321
2421 event = __rb_page_index(tail_page, tail); 2322 /* zero length can cause confusions */
2422 kmemcheck_annotate_bitfield(event, bitfield); 2323 if (!length)
2423 rb_update_event(cpu_buffer, event, length, add_timestamp, delta); 2324 length++;
2424 2325
2425 local_inc(&tail_page->entries); 2326 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
2327 length += sizeof(event.array[0]);
2328
2329 length += RB_EVNT_HDR_SIZE;
2330 length = ALIGN(length, RB_ARCH_ALIGNMENT);
2426 2331
2427 /* 2332 /*
2428 * If this is the first commit on the page, then update 2333 * In case the time delta is larger than the 27 bits for it
2429 * its timestamp. 2334 * in the header, we need to add a timestamp. If another
2335 * event comes in when trying to discard this one to increase
2336 * the length, then the timestamp will be added in the allocated
2337 * space of this event. If length is bigger than the size needed
2338 * for the TIME_EXTEND, then padding has to be used. The events
2339 * length must be either RB_LEN_TIME_EXTEND, or greater than or equal
2340 * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding.
2341 * As length is a multiple of 4, we only need to worry if it
2342 * is 12 (RB_LEN_TIME_EXTEND + 4).
2430 */ 2343 */
2431 if (!tail) 2344 if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT)
2432 tail_page->page->time_stamp = ts; 2345 length += RB_ALIGNMENT;
2433 2346
2434 /* account for these added bytes */ 2347 return length;
2435 local_add(length, &cpu_buffer->entries_bytes); 2348}
2436 2349
2437 return event; 2350#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
2351static inline bool sched_clock_stable(void)
2352{
2353 return true;
2438} 2354}
2355#endif
2439 2356
2440static inline int 2357static inline int
2441rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, 2358rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
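
The relocated rb_calculate_event_length() above gains one rule: a computed length of RB_LEN_TIME_EXTEND + RB_ALIGNMENT is bumped by one alignment unit, so a later discard can always be replaced by either a lone TIME_EXTEND event or a TIME_EXTEND plus legal padding. A quick arithmetic check of that rule; the constant values (4-byte header, 4-byte alignment, 8-byte TIME_EXTEND) are the commonly used ones and are treated here as assumptions, and the large-event branch that reserves an extra length word is omitted:

#include <assert.h>
#include <stdio.h>

#define RB_EVNT_HDR_SIZE	4U
#define RB_ALIGNMENT		4U
#define RB_LEN_TIME_EXTEND	8U

#define ALIGN_UP(x, a)		(((x) + (a) - 1) & ~((a) - 1))

/* Simplified: shows only the new TIME_EXTEND padding rule. */
static unsigned int calc_event_length(unsigned int data_len)
{
	unsigned int length = data_len ? data_len : 1;	/* zero is confusing */

	length += RB_EVNT_HDR_SIZE;
	length = ALIGN_UP(length, RB_ALIGNMENT);

	/*
	 * 12 is neither exactly RB_LEN_TIME_EXTEND (8) nor at least
	 * RB_LEN_TIME_EXTEND + 8, the smallest size that still leaves
	 * room for legal padding, so bump it by one alignment unit.
	 */
	if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT)
		length += RB_ALIGNMENT;

	return length;
}

int main(void)
{
	assert(calc_event_length(3)  == 8);	/* 3 + 4 -> 7 -> aligned to 8 */
	assert(calc_event_length(8)  == 16);	/* 8 + 4 -> 12 -> bumped to 16 */
	assert(calc_event_length(12) == 16);	/* 12 + 4 -> 16, no bump needed */
	printf("ok\n");
	return 0;
}
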
@@ -2483,6 +2400,59 @@ static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
2483 local_inc(&cpu_buffer->commits); 2400 local_inc(&cpu_buffer->commits);
2484} 2401}
2485 2402
2403static void
2404rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
2405{
2406 unsigned long max_count;
2407
2408 /*
2409 * We only race with interrupts and NMIs on this CPU.
2410 * If we own the commit event, then we can commit
2411 * all others that interrupted us, since the interruptions
2412 * are in stack format (they finish before they come
2413 * back to us). This allows us to do a simple loop to
2414 * assign the commit to the tail.
2415 */
2416 again:
2417 max_count = cpu_buffer->nr_pages * 100;
2418
2419 while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
2420 if (RB_WARN_ON(cpu_buffer, !(--max_count)))
2421 return;
2422 if (RB_WARN_ON(cpu_buffer,
2423 rb_is_reader_page(cpu_buffer->tail_page)))
2424 return;
2425 local_set(&cpu_buffer->commit_page->page->commit,
2426 rb_page_write(cpu_buffer->commit_page));
2427 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
2428 cpu_buffer->write_stamp =
2429 cpu_buffer->commit_page->page->time_stamp;
2430 /* add barrier to keep gcc from optimizing too much */
2431 barrier();
2432 }
2433 while (rb_commit_index(cpu_buffer) !=
2434 rb_page_write(cpu_buffer->commit_page)) {
2435
2436 local_set(&cpu_buffer->commit_page->page->commit,
2437 rb_page_write(cpu_buffer->commit_page));
2438 RB_WARN_ON(cpu_buffer,
2439 local_read(&cpu_buffer->commit_page->page->commit) &
2440 ~RB_WRITE_MASK);
2441 barrier();
2442 }
2443
2444 /* again, keep gcc from optimizing */
2445 barrier();
2446
2447 /*
2448 * If an interrupt came in just after the first while loop
2449 * and pushed the tail page forward, we will be left with
2450 * a dangling commit that will never go forward.
2451 */
2452 if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page))
2453 goto again;
2454}
2455
2486static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) 2456static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
2487{ 2457{
2488 unsigned long commits; 2458 unsigned long commits;
@@ -2515,91 +2485,94 @@ static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
2515 } 2485 }
2516} 2486}
2517 2487
2518static struct ring_buffer_event * 2488static inline void rb_event_discard(struct ring_buffer_event *event)
2519rb_reserve_next_event(struct ring_buffer *buffer,
2520 struct ring_buffer_per_cpu *cpu_buffer,
2521 unsigned long length)
2522{ 2489{
2523 struct ring_buffer_event *event; 2490 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
2524 u64 ts, delta; 2491 event = skip_time_extend(event);
2525 int nr_loops = 0;
2526 int add_timestamp;
2527 u64 diff;
2528 2492
2529 rb_start_commit(cpu_buffer); 2493 /* array[0] holds the actual length for the discarded event */
2494 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
2495 event->type_len = RINGBUF_TYPE_PADDING;
2496 /* time delta must be non zero */
2497 if (!event->time_delta)
2498 event->time_delta = 1;
2499}
2530 2500
2531#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP 2501static inline int
2532 /* 2502rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
2533 * Due to the ability to swap a cpu buffer from a buffer 2503 struct ring_buffer_event *event)
2534 * it is possible it was swapped before we committed. 2504{
2535 * (committing stops a swap). We check for it here and 2505 unsigned long addr = (unsigned long)event;
2536 * if it happened, we have to fail the write. 2506 unsigned long index;
2537 */
2538 barrier();
2539 if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
2540 local_dec(&cpu_buffer->committing);
2541 local_dec(&cpu_buffer->commits);
2542 return NULL;
2543 }
2544#endif
2545 2507
2546 length = rb_calculate_event_length(length); 2508 index = rb_event_index(event);
2547 again: 2509 addr &= PAGE_MASK;
2548 add_timestamp = 0; 2510
2549 delta = 0; 2511 return cpu_buffer->commit_page->page == (void *)addr &&
2512 rb_commit_index(cpu_buffer) == index;
2513}
2514
2515static void
2516rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2517 struct ring_buffer_event *event)
2518{
2519 u64 delta;
2550 2520
2551 /* 2521 /*
2552 * We allow for interrupts to reenter here and do a trace. 2522 * The event first in the commit queue updates the
2553 * If one does, it will cause this original code to loop 2523 * time stamp.
2554 * back here. Even with heavy interrupts happening, this
2555 * should only happen a few times in a row. If this happens
2556 * 1000 times in a row, there must be either an interrupt
2557 * storm or we have something buggy.
2558 * Bail!
2559 */ 2524 */
2560 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) 2525 if (rb_event_is_commit(cpu_buffer, event)) {
2561 goto out_fail; 2526 /*
2527 * A commit event that is first on a page
2528 * updates the write timestamp with the page stamp
2529 */
2530 if (!rb_event_index(event))
2531 cpu_buffer->write_stamp =
2532 cpu_buffer->commit_page->page->time_stamp;
2533 else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
2534 delta = event->array[0];
2535 delta <<= TS_SHIFT;
2536 delta += event->time_delta;
2537 cpu_buffer->write_stamp += delta;
2538 } else
2539 cpu_buffer->write_stamp += event->time_delta;
2540 }
2541}
2562 2542
2563 ts = rb_time_stamp(cpu_buffer->buffer); 2543static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
2564 diff = ts - cpu_buffer->write_stamp; 2544 struct ring_buffer_event *event)
2545{
2546 local_inc(&cpu_buffer->entries);
2547 rb_update_write_stamp(cpu_buffer, event);
2548 rb_end_commit(cpu_buffer);
2549}
2565 2550
2566 /* make sure this diff is calculated here */ 2551static __always_inline void
2567 barrier(); 2552rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
2553{
2554 bool pagebusy;
2568 2555
2569 /* Did the write stamp get updated already? */ 2556 if (buffer->irq_work.waiters_pending) {
2570 if (likely(ts >= cpu_buffer->write_stamp)) { 2557 buffer->irq_work.waiters_pending = false;
2571 delta = diff; 2558 /* irq_work_queue() supplies it's own memory barriers */
2572 if (unlikely(test_time_stamp(delta))) { 2559 irq_work_queue(&buffer->irq_work.work);
2573 int local_clock_stable = 1;
2574#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
2575 local_clock_stable = sched_clock_stable();
2576#endif
2577 WARN_ONCE(delta > (1ULL << 59),
2578 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
2579 (unsigned long long)delta,
2580 (unsigned long long)ts,
2581 (unsigned long long)cpu_buffer->write_stamp,
2582 local_clock_stable ? "" :
2583 "If you just came from a suspend/resume,\n"
2584 "please switch to the trace global clock:\n"
2585 " echo global > /sys/kernel/debug/tracing/trace_clock\n");
2586 add_timestamp = 1;
2587 }
2588 } 2560 }
2589 2561
2590 event = __rb_reserve_next(cpu_buffer, length, ts, 2562 if (cpu_buffer->irq_work.waiters_pending) {
2591 delta, add_timestamp); 2563 cpu_buffer->irq_work.waiters_pending = false;
2592 if (unlikely(PTR_ERR(event) == -EAGAIN)) 2564 /* irq_work_queue() supplies it's own memory barriers */
2593 goto again; 2565 irq_work_queue(&cpu_buffer->irq_work.work);
2594 2566 }
2595 if (!event)
2596 goto out_fail;
2597 2567
2598 return event; 2568 pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
2599 2569
2600 out_fail: 2570 if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) {
2601 rb_end_commit(cpu_buffer); 2571 cpu_buffer->irq_work.wakeup_full = true;
2602 return NULL; 2572 cpu_buffer->irq_work.full_waiters_pending = false;
2573 /* irq_work_queue() supplies it's own memory barriers */
2574 irq_work_queue(&cpu_buffer->irq_work.work);
2575 }
2603} 2576}
2604 2577
2605/* 2578/*
@@ -2672,6 +2645,178 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
2672} 2645}
2673 2646
2674/** 2647/**
2648 * ring_buffer_unlock_commit - commit a reserved
2649 * @buffer: The buffer to commit to
2650 * @event: The event pointer to commit.
2651 *
2652 * This commits the data to the ring buffer, and releases any locks held.
2653 *
2654 * Must be paired with ring_buffer_lock_reserve.
2655 */
2656int ring_buffer_unlock_commit(struct ring_buffer *buffer,
2657 struct ring_buffer_event *event)
2658{
2659 struct ring_buffer_per_cpu *cpu_buffer;
2660 int cpu = raw_smp_processor_id();
2661
2662 cpu_buffer = buffer->buffers[cpu];
2663
2664 rb_commit(cpu_buffer, event);
2665
2666 rb_wakeups(buffer, cpu_buffer);
2667
2668 trace_recursive_unlock(cpu_buffer);
2669
2670 preempt_enable_notrace();
2671
2672 return 0;
2673}
2674EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
2675
2676static noinline void
2677rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
2678 struct rb_event_info *info)
2679{
2680 WARN_ONCE(info->delta > (1ULL << 59),
2681 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
2682 (unsigned long long)info->delta,
2683 (unsigned long long)info->ts,
2684 (unsigned long long)cpu_buffer->write_stamp,
2685 sched_clock_stable() ? "" :
2686 "If you just came from a suspend/resume,\n"
2687 "please switch to the trace global clock:\n"
2688 " echo global > /sys/kernel/debug/tracing/trace_clock\n");
2689 info->add_timestamp = 1;
2690}
2691
2692static struct ring_buffer_event *
2693__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
2694 struct rb_event_info *info)
2695{
2696 struct ring_buffer_event *event;
2697 struct buffer_page *tail_page;
2698 unsigned long tail, write;
2699
2700 /*
2701 * If the time delta since the last event is too big to
2702 * hold in the time field of the event, then we append a
2703 * TIME EXTEND event ahead of the data event.
2704 */
2705 if (unlikely(info->add_timestamp))
2706 info->length += RB_LEN_TIME_EXTEND;
2707
2708 tail_page = info->tail_page = cpu_buffer->tail_page;
2709 write = local_add_return(info->length, &tail_page->write);
2710
2711 /* set write to only the index of the write */
2712 write &= RB_WRITE_MASK;
2713 tail = write - info->length;
2714
2715 /*
2716 * If this is the first commit on the page, then it has the same
2717 * timestamp as the page itself.
2718 */
2719 if (!tail)
2720 info->delta = 0;
2721
2722 /* See if we shot pass the end of this buffer page */
2723 if (unlikely(write > BUF_PAGE_SIZE))
2724 return rb_move_tail(cpu_buffer, tail, info);
2725
2726 /* We reserved something on the buffer */
2727
2728 event = __rb_page_index(tail_page, tail);
2729 kmemcheck_annotate_bitfield(event, bitfield);
2730 rb_update_event(cpu_buffer, event, info);
2731
2732 local_inc(&tail_page->entries);
2733
2734 /*
2735 * If this is the first commit on the page, then update
2736 * its timestamp.
2737 */
2738 if (!tail)
2739 tail_page->page->time_stamp = info->ts;
2740
2741 /* account for these added bytes */
2742 local_add(info->length, &cpu_buffer->entries_bytes);
2743
2744 return event;
2745}
2746
2747static struct ring_buffer_event *
2748rb_reserve_next_event(struct ring_buffer *buffer,
2749 struct ring_buffer_per_cpu *cpu_buffer,
2750 unsigned long length)
2751{
2752 struct ring_buffer_event *event;
2753 struct rb_event_info info;
2754 int nr_loops = 0;
2755 u64 diff;
2756
2757 rb_start_commit(cpu_buffer);
2758
2759#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2760 /*
2761 * Due to the ability to swap a cpu buffer from a buffer
2762 * it is possible it was swapped before we committed.
2763 * (committing stops a swap). We check for it here and
2764 * if it happened, we have to fail the write.
2765 */
2766 barrier();
2767 if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
2768 local_dec(&cpu_buffer->committing);
2769 local_dec(&cpu_buffer->commits);
2770 return NULL;
2771 }
2772#endif
2773
2774 info.length = rb_calculate_event_length(length);
2775 again:
2776 info.add_timestamp = 0;
2777 info.delta = 0;
2778
2779 /*
2780 * We allow for interrupts to reenter here and do a trace.
2781 * If one does, it will cause this original code to loop
2782 * back here. Even with heavy interrupts happening, this
2783 * should only happen a few times in a row. If this happens
2784 * 1000 times in a row, there must be either an interrupt
2785 * storm or we have something buggy.
2786 * Bail!
2787 */
2788 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
2789 goto out_fail;
2790
2791 info.ts = rb_time_stamp(cpu_buffer->buffer);
2792 diff = info.ts - cpu_buffer->write_stamp;
2793
2794 /* make sure this diff is calculated here */
2795 barrier();
2796
2797 /* Did the write stamp get updated already? */
2798 if (likely(info.ts >= cpu_buffer->write_stamp)) {
2799 info.delta = diff;
2800 if (unlikely(test_time_stamp(info.delta)))
2801 rb_handle_timestamp(cpu_buffer, &info);
2802 }
2803
2804 event = __rb_reserve_next(cpu_buffer, &info);
2805
2806 if (unlikely(PTR_ERR(event) == -EAGAIN))
2807 goto again;
2808
2809 if (!event)
2810 goto out_fail;
2811
2812 return event;
2813
2814 out_fail:
2815 rb_end_commit(cpu_buffer);
2816 return NULL;
2817}
2818
2819/**
2675 * ring_buffer_lock_reserve - reserve a part of the buffer 2820 * ring_buffer_lock_reserve - reserve a part of the buffer
2676 * @buffer: the ring buffer to reserve from 2821 * @buffer: the ring buffer to reserve from
2677 * @length: the length of the data to reserve (excluding event header) 2822 * @length: the length of the data to reserve (excluding event header)
@@ -2729,111 +2874,6 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2729} 2874}
2730EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); 2875EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
2731 2876
2732static void
2733rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2734 struct ring_buffer_event *event)
2735{
2736 u64 delta;
2737
2738 /*
2739 * The first event in the commit queue updates the
2740 * time stamp.
2741 */
2742 if (rb_event_is_commit(cpu_buffer, event)) {
2743 /*
2744 * A commit event that is first on a page
2745 * updates the write timestamp with the page stamp
2746 */
2747 if (!rb_event_index(event))
2748 cpu_buffer->write_stamp =
2749 cpu_buffer->commit_page->page->time_stamp;
2750 else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
2751 delta = event->array[0];
2752 delta <<= TS_SHIFT;
2753 delta += event->time_delta;
2754 cpu_buffer->write_stamp += delta;
2755 } else
2756 cpu_buffer->write_stamp += event->time_delta;
2757 }
2758}
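
When rb_update_write_stamp() above sees a TIME_EXTEND event, the full delta is rebuilt from two pieces: the low bits kept in the event's time_delta field and the high bits kept in array[0]. A small runnable sketch of that recombination; TS_SHIFT = 27 matches the ring buffer's definition but is treated here as an assumption:

#include <stdint.h>
#include <stdio.h>

#define TS_SHIFT 27     /* bits held by the event's time_delta field (assumed) */

int main(void)
{
        uint32_t time_delta = 0x3ffffff;  /* low bits from the event header */
        uint32_t array0     = 0x5;        /* high bits from array[0]        */
        uint64_t delta = ((uint64_t)array0 << TS_SHIFT) + time_delta;

        printf("reconstructed delta: %llu ns\n", (unsigned long long)delta);
        return 0;
}
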
2759
2760static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
2761 struct ring_buffer_event *event)
2762{
2763 local_inc(&cpu_buffer->entries);
2764 rb_update_write_stamp(cpu_buffer, event);
2765 rb_end_commit(cpu_buffer);
2766}
2767
2768static __always_inline void
2769rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
2770{
2771 bool pagebusy;
2772
2773 if (buffer->irq_work.waiters_pending) {
2774 buffer->irq_work.waiters_pending = false;
2775 /* irq_work_queue() supplies its own memory barriers */
2776 irq_work_queue(&buffer->irq_work.work);
2777 }
2778
2779 if (cpu_buffer->irq_work.waiters_pending) {
2780 cpu_buffer->irq_work.waiters_pending = false;
2781 /* irq_work_queue() supplies its own memory barriers */
2782 irq_work_queue(&cpu_buffer->irq_work.work);
2783 }
2784
2785 pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
2786
2787 if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) {
2788 cpu_buffer->irq_work.wakeup_full = true;
2789 cpu_buffer->irq_work.full_waiters_pending = false;
2790 /* irq_work_queue() supplies its own memory barriers */
2791 irq_work_queue(&cpu_buffer->irq_work.work);
2792 }
2793}
2794
2795/**
2796 * ring_buffer_unlock_commit - commit a reserved event
2797 * @buffer: The buffer to commit to
2798 * @event: The event pointer to commit.
2799 *
2800 * This commits the data to the ring buffer, and releases any locks held.
2801 *
2802 * Must be paired with ring_buffer_lock_reserve.
2803 */
2804int ring_buffer_unlock_commit(struct ring_buffer *buffer,
2805 struct ring_buffer_event *event)
2806{
2807 struct ring_buffer_per_cpu *cpu_buffer;
2808 int cpu = raw_smp_processor_id();
2809
2810 cpu_buffer = buffer->buffers[cpu];
2811
2812 rb_commit(cpu_buffer, event);
2813
2814 rb_wakeups(buffer, cpu_buffer);
2815
2816 trace_recursive_unlock(cpu_buffer);
2817
2818 preempt_enable_notrace();
2819
2820 return 0;
2821}
2822EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
2823
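ring_buffer_unlock_commit() is the second half of the reserve/commit pairing documented above. A hedged kernel-context usage sketch; struct my_entry and the wrapper are illustrative, only the ring_buffer_* calls are the exported API:

#include <linux/ring_buffer.h>

struct my_entry {
        unsigned long ip;
        unsigned long val;
};

static int write_my_entry(struct ring_buffer *buffer,
                          unsigned long ip, unsigned long val)
{
        struct ring_buffer_event *event;
        struct my_entry *entry;

        event = ring_buffer_lock_reserve(buffer, sizeof(*entry));
        if (!event)
                return -EBUSY;          /* buffer full or recursion detected */

        entry = ring_buffer_event_data(event);
        entry->ip = ip;
        entry->val = val;

        return ring_buffer_unlock_commit(buffer, event);
}
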
2824static inline void rb_event_discard(struct ring_buffer_event *event)
2825{
2826 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
2827 event = skip_time_extend(event);
2828
2829 /* array[0] holds the actual length for the discarded event */
2830 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
2831 event->type_len = RINGBUF_TYPE_PADDING;
2832 /* time delta must be non zero */
2833 if (!event->time_delta)
2834 event->time_delta = 1;
2835}
2836
2837/* 2877/*
2838 * Decrement the entries to the page that an event is on. 2878 * Decrement the entries to the page that an event is on.
2839 * The event does not even need to exist, only the pointer 2879 * The event does not even need to exist, only the pointer
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index abcbf7ff8743..6e79408674aa 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3035,7 +3035,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
3035 if (!iter) 3035 if (!iter)
3036 return ERR_PTR(-ENOMEM); 3036 return ERR_PTR(-ENOMEM);
3037 3037
3038 iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(), 3038 iter->buffer_iter = kcalloc(nr_cpu_ids, sizeof(*iter->buffer_iter),
3039 GFP_KERNEL); 3039 GFP_KERNEL);
3040 if (!iter->buffer_iter) 3040 if (!iter->buffer_iter)
3041 goto release; 3041 goto release;
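
The hunk above replaces an open-coded kzalloc(n * size) with kcalloc(), which takes the element count and size separately and returns NULL on multiplication overflow instead of silently wrapping (it also sizes the array by nr_cpu_ids rather than num_possible_cpus()). A minimal kernel-context sketch of the preferred form; the helper name is illustrative:

#include <linux/ring_buffer.h>
#include <linux/slab.h>

/* Zeroed, overflow-checked array of per-cpu iterator pointers. */
static struct ring_buffer_iter **alloc_buffer_iters(int nr)
{
        return kcalloc(nr, sizeof(struct ring_buffer_iter *), GFP_KERNEL);
}
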
@@ -6990,7 +6990,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
6990 trace_init_global_iter(&iter); 6990 trace_init_global_iter(&iter);
6991 6991
6992 for_each_tracing_cpu(cpu) { 6992 for_each_tracing_cpu(cpu) {
6993 atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled); 6993 atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
6994 } 6994 }
6995 6995
6996 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; 6996 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f060716b02ae..74bde81601a9 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -444,6 +444,7 @@ enum {
444 444
445 TRACE_CONTROL_BIT, 445 TRACE_CONTROL_BIT,
446 446
447 TRACE_BRANCH_BIT,
447/* 448/*
448 * Abuse of the trace_recursion. 449 * Abuse of the trace_recursion.
449 * As we need a way to maintain state if we are tracing the function 450 * As we need a way to maintain state if we are tracing the function
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index a87b43f49eb4..e2e12ad3186f 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -36,9 +36,12 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
36 struct trace_branch *entry; 36 struct trace_branch *entry;
37 struct ring_buffer *buffer; 37 struct ring_buffer *buffer;
38 unsigned long flags; 38 unsigned long flags;
39 int cpu, pc; 39 int pc;
40 const char *p; 40 const char *p;
41 41
42 if (current->trace_recursion & TRACE_BRANCH_BIT)
43 return;
44
42 /* 45 /*
43 * I would love to save just the ftrace_likely_data pointer, but 46 * I would love to save just the ftrace_likely_data pointer, but
44 * this code can also be used by modules. Ugly things can happen 47 * this code can also be used by modules. Ugly things can happen
@@ -49,10 +52,10 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
49 if (unlikely(!tr)) 52 if (unlikely(!tr))
50 return; 53 return;
51 54
52 local_irq_save(flags); 55 raw_local_irq_save(flags);
53 cpu = raw_smp_processor_id(); 56 current->trace_recursion |= TRACE_BRANCH_BIT;
54 data = per_cpu_ptr(tr->trace_buffer.data, cpu); 57 data = this_cpu_ptr(tr->trace_buffer.data);
55 if (atomic_inc_return(&data->disabled) != 1) 58 if (atomic_read(&data->disabled))
56 goto out; 59 goto out;
57 60
58 pc = preempt_count(); 61 pc = preempt_count();
@@ -81,8 +84,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
81 __buffer_unlock_commit(buffer, event); 84 __buffer_unlock_commit(buffer, event);
82 85
83 out: 86 out:
84 atomic_dec(&data->disabled); 87 current->trace_recursion &= ~TRACE_BRANCH_BIT;
85 local_irq_restore(flags); 88 raw_local_irq_restore(flags);
86} 89}
87 90
88static inline 91static inline
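
TRACE_BRANCH_BIT above acts as a per-task recursion guard: the probe sets the bit on entry and bails out immediately if it is already set, so branch-annotated code reached from inside the probe cannot recurse into it. A runnable userspace sketch of the same pattern using a thread-local flag:

#include <stdbool.h>
#include <stdio.h>

static _Thread_local bool in_branch_probe;

static void probe(const char *what)
{
        if (in_branch_probe)            /* already inside: do nothing */
                return;
        in_branch_probe = true;

        printf("recording branch event: %s\n", what);
        /* instrumented code called from here re-enters probe() and returns early */

        in_branch_probe = false;
}

int main(void)
{
        probe("example");
        return 0;
}
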
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 404a372ad85a..7ca09cdc20c2 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -30,6 +30,7 @@
30DEFINE_MUTEX(event_mutex); 30DEFINE_MUTEX(event_mutex);
31 31
32LIST_HEAD(ftrace_events); 32LIST_HEAD(ftrace_events);
33static LIST_HEAD(ftrace_generic_fields);
33static LIST_HEAD(ftrace_common_fields); 34static LIST_HEAD(ftrace_common_fields);
34 35
35#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) 36#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO)
@@ -94,6 +95,10 @@ trace_find_event_field(struct trace_event_call *call, char *name)
94 struct ftrace_event_field *field; 95 struct ftrace_event_field *field;
95 struct list_head *head; 96 struct list_head *head;
96 97
98 field = __find_event_field(&ftrace_generic_fields, name);
99 if (field)
100 return field;
101
97 field = __find_event_field(&ftrace_common_fields, name); 102 field = __find_event_field(&ftrace_common_fields, name);
98 if (field) 103 if (field)
99 return field; 104 return field;
@@ -144,6 +149,13 @@ int trace_define_field(struct trace_event_call *call, const char *type,
144} 149}
145EXPORT_SYMBOL_GPL(trace_define_field); 150EXPORT_SYMBOL_GPL(trace_define_field);
146 151
152#define __generic_field(type, item, filter_type) \
153 ret = __trace_define_field(&ftrace_generic_fields, #type, \
154 #item, 0, 0, is_signed_type(type), \
155 filter_type); \
156 if (ret) \
157 return ret;
158
147#define __common_field(type, item) \ 159#define __common_field(type, item) \
148 ret = __trace_define_field(&ftrace_common_fields, #type, \ 160 ret = __trace_define_field(&ftrace_common_fields, #type, \
149 "common_" #item, \ 161 "common_" #item, \
@@ -153,6 +165,16 @@ EXPORT_SYMBOL_GPL(trace_define_field);
153 if (ret) \ 165 if (ret) \
154 return ret; 166 return ret;
155 167
168static int trace_define_generic_fields(void)
169{
170 int ret;
171
172 __generic_field(int, cpu, FILTER_OTHER);
173 __generic_field(char *, comm, FILTER_PTR_STRING);
174
175 return ret;
176}
177
156static int trace_define_common_fields(void) 178static int trace_define_common_fields(void)
157{ 179{
158 int ret; 180 int ret;
@@ -2671,6 +2693,9 @@ static __init int event_trace_init(void)
2671 if (!entry) 2693 if (!entry)
2672 pr_warn("Could not create tracefs 'available_events' entry\n"); 2694 pr_warn("Could not create tracefs 'available_events' entry\n");
2673 2695
2696 if (trace_define_generic_fields())
2697 pr_warn("tracing: Failed to allocate generic fields");
2698
2674 if (trace_define_common_fields()) 2699 if (trace_define_common_fields())
2675 pr_warn("tracing: Failed to allocate common fields"); 2700 pr_warn("tracing: Failed to allocate common fields");
2676 2701
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index d81d6f302b14..bd1bf184c5c9 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -252,6 +252,50 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event)
252 return match; 252 return match;
253} 253}
254 254
255/* Filter predicate for CPUs. */
256static int filter_pred_cpu(struct filter_pred *pred, void *event)
257{
258 int cpu, cmp;
259 int match = 0;
260
261 cpu = raw_smp_processor_id();
262 cmp = pred->val;
263
264 switch (pred->op) {
265 case OP_EQ:
266 match = cpu == cmp;
267 break;
268 case OP_LT:
269 match = cpu < cmp;
270 break;
271 case OP_LE:
272 match = cpu <= cmp;
273 break;
274 case OP_GT:
275 match = cpu > cmp;
276 break;
277 case OP_GE:
278 match = cpu >= cmp;
279 break;
280 default:
281 break;
282 }
283
284 return !!match == !pred->not;
285}
286
287/* Filter predicate for COMM. */
288static int filter_pred_comm(struct filter_pred *pred, void *event)
289{
290 int cmp, match;
291
292 cmp = pred->regex.match(current->comm, &pred->regex,
293 pred->regex.field_len);
294 match = cmp ^ pred->not;
295
296 return match;
297}
298
255static int filter_pred_none(struct filter_pred *pred, void *event) 299static int filter_pred_none(struct filter_pred *pred, void *event)
256{ 300{
257 return 0; 301 return 0;
@@ -1002,7 +1046,10 @@ static int init_pred(struct filter_parse_state *ps,
1002 if (is_string_field(field)) { 1046 if (is_string_field(field)) {
1003 filter_build_regex(pred); 1047 filter_build_regex(pred);
1004 1048
1005 if (field->filter_type == FILTER_STATIC_STRING) { 1049 if (!strcmp(field->name, "comm")) {
1050 fn = filter_pred_comm;
1051 pred->regex.field_len = TASK_COMM_LEN;
1052 } else if (field->filter_type == FILTER_STATIC_STRING) {
1006 fn = filter_pred_string; 1053 fn = filter_pred_string;
1007 pred->regex.field_len = field->size; 1054 pred->regex.field_len = field->size;
1008 } else if (field->filter_type == FILTER_DYN_STRING) 1055 } else if (field->filter_type == FILTER_DYN_STRING)
@@ -1025,7 +1072,10 @@ static int init_pred(struct filter_parse_state *ps,
1025 } 1072 }
1026 pred->val = val; 1073 pred->val = val;
1027 1074
1028 fn = select_comparison_fn(pred->op, field->size, 1075 if (!strcmp(field->name, "cpu"))
1076 fn = filter_pred_cpu;
1077 else
1078 fn = select_comparison_fn(pred->op, field->size,
1029 field->is_signed); 1079 field->is_signed);
1030 if (!fn) { 1080 if (!fn) {
1031 parse_error(ps, FILT_ERR_INVALID_OP, 0); 1081 parse_error(ps, FILT_ERR_INVALID_OP, 0);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 8968bf720c12..ca98445782ac 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -715,13 +715,13 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
715 715
716 snprintf(nsecs_str, slen, "%03lu", nsecs_rem); 716 snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
717 trace_seq_printf(s, ".%s", nsecs_str); 717 trace_seq_printf(s, ".%s", nsecs_str);
718 len += strlen(nsecs_str); 718 len += strlen(nsecs_str) + 1;
719 } 719 }
720 720
721 trace_seq_puts(s, " us "); 721 trace_seq_puts(s, " us ");
722 722
723 /* Print remaining spaces to fit the row's width */ 723 /* Print remaining spaces to fit the row's width */
724 for (i = len; i < 7; i++) 724 for (i = len; i < 8; i++)
725 trace_seq_putc(s, ' '); 725 trace_seq_putc(s, ' ');
726} 726}
727 727
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b7d0cdd9906c..c9956440d0e6 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -165,11 +165,9 @@ DEFINE_BASIC_FETCH_FUNCS(memory)
165static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, 165static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
166 void *addr, void *dest) 166 void *addr, void *dest)
167{ 167{
168 long ret;
169 int maxlen = get_rloc_len(*(u32 *)dest); 168 int maxlen = get_rloc_len(*(u32 *)dest);
170 u8 *dst = get_rloc_data(dest); 169 u8 *dst = get_rloc_data(dest);
171 u8 *src = addr; 170 long ret;
172 mm_segment_t old_fs = get_fs();
173 171
174 if (!maxlen) 172 if (!maxlen)
175 return; 173 return;
@@ -178,23 +176,13 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
178 * Try to get string again, since the string can be changed while 176 * Try to get string again, since the string can be changed while
179 * probing. 177 * probing.
180 */ 178 */
181 set_fs(KERNEL_DS); 179 ret = strncpy_from_unsafe(dst, addr, maxlen);
182 pagefault_disable();
183
184 do
185 ret = __copy_from_user_inatomic(dst++, src++, 1);
186 while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
187
188 dst[-1] = '\0';
189 pagefault_enable();
190 set_fs(old_fs);
191 180
192 if (ret < 0) { /* Failed to fetch string */ 181 if (ret < 0) { /* Failed to fetch string */
193 ((u8 *)get_rloc_data(dest))[0] = '\0'; 182 dst[0] = '\0';
194 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); 183 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
195 } else { 184 } else {
196 *(u32 *)dest = make_data_rloc(src - (u8 *)addr, 185 *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest));
197 get_rloc_offs(*(u32 *)dest));
198 } 186 }
199} 187}
200NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string)); 188NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string));
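
strncpy_from_unsafe() replaces the open-coded, pagefault-disabled byte loop above. A hedged kernel-context sketch of the call-site pattern; the helper name is illustrative, the header is assumed, and the return convention (copied length on success, negative on fault) is inferred from how the diff uses the value rather than asserted:

#include <linux/uaccess.h>      /* assumed location of the declaration */

static long fetch_string_example(char *dst, const void *unsafe_src, long maxlen)
{
        long ret = strncpy_from_unsafe(dst, unsafe_src, maxlen);

        if (ret < 0)            /* fault while reading the string */
                dst[0] = '\0';
        return ret;
}
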
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index dfab253727dc..8e481a84aeea 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -496,6 +496,8 @@ static const struct trace_mark {
496 char sym; 496 char sym;
497} mark[] = { 497} mark[] = {
498 MARK(1000000000ULL , '$'), /* 1 sec */ 498 MARK(1000000000ULL , '$'), /* 1 sec */
499 MARK(100000000ULL , '@'), /* 100 msec */
500 MARK(10000000ULL , '*'), /* 10 msec */
499 MARK(1000000ULL , '#'), /* 1000 usecs */ 501 MARK(1000000ULL , '#'), /* 1000 usecs */
500 MARK(100000ULL , '!'), /* 100 usecs */ 502 MARK(100000ULL , '!'), /* 100 usecs */
501 MARK(10000ULL , '+'), /* 10 usecs */ 503 MARK(10000ULL , '+'), /* 10 usecs */
@@ -508,7 +510,7 @@ char trace_find_mark(unsigned long long d)
508 int size = ARRAY_SIZE(mark); 510 int size = ARRAY_SIZE(mark);
509 511
510 for (i = 0; i < size; i++) { 512 for (i = 0; i < size; i++) {
511 if (d >= mark[i].val) 513 if (d > mark[i].val)
512 break; 514 break;
513 } 515 }
514 516
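
The two new thresholds above give latencies in the 10 ms to 1 s range their own markers. A runnable sketch of the lookup, mirroring the table and the '>' comparison from the diff; find_mark() is a simplified stand-in for trace_find_mark():

#include <stdio.h>

static const struct { unsigned long long val; char sym; } mark[] = {
        { 1000000000ULL, '$' }, /* 1 sec   */
        {  100000000ULL, '@' }, /* 100 ms  */
        {   10000000ULL, '*' }, /* 10 ms   */
        {    1000000ULL, '#' }, /* 1000 us */
        {     100000ULL, '!' }, /* 100 us  */
        {      10000ULL, '+' }, /* 10 us   */
};
#define NMARKS (sizeof(mark) / sizeof(mark[0]))

static char find_mark(unsigned long long d)
{
        unsigned int i;

        for (i = 0; i < NMARKS; i++)
                if (d > mark[i].val)
                        break;
        return i < NMARKS ? mark[i].sym : ' ';
}

int main(void)
{
        printf("150 ms -> '%c'\n", find_mark(150000000ULL));    /* '@' */
        printf("50 ns  -> '%c'\n", find_mark(50ULL));           /* ' ' */
        return 0;
}
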
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 419ca37e72c9..f270088e9929 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -26,7 +26,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n
26} 26}
27 27
28static void 28static void
29probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) 29probe_sched_wakeup(void *ignore, struct task_struct *wakee)
30{ 30{
31 if (unlikely(!sched_ref)) 31 if (unlikely(!sched_ref))
32 return; 32 return;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 9b33dd117f3f..12cbe77b4136 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -514,7 +514,7 @@ static void wakeup_reset(struct trace_array *tr)
514} 514}
515 515
516static void 516static void
517probe_wakeup(void *ignore, struct task_struct *p, int success) 517probe_wakeup(void *ignore, struct task_struct *p)
518{ 518{
519 struct trace_array_cpu *data; 519 struct trace_array_cpu *data;
520 int cpu = smp_processor_id(); 520 int cpu = smp_processor_id();
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 3f34496244e9..b746399ab59c 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -18,12 +18,6 @@
18 18
19#define STACK_TRACE_ENTRIES 500 19#define STACK_TRACE_ENTRIES 500
20 20
21#ifdef CC_USING_FENTRY
22# define fentry 1
23#else
24# define fentry 0
25#endif
26
27static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = 21static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
28 { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; 22 { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
29static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; 23static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
@@ -35,7 +29,7 @@ static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
35 */ 29 */
36static struct stack_trace max_stack_trace = { 30static struct stack_trace max_stack_trace = {
37 .max_entries = STACK_TRACE_ENTRIES - 1, 31 .max_entries = STACK_TRACE_ENTRIES - 1,
38 .entries = &stack_dump_trace[1], 32 .entries = &stack_dump_trace[0],
39}; 33};
40 34
41static unsigned long max_stack_size; 35static unsigned long max_stack_size;
@@ -55,7 +49,7 @@ static inline void print_max_stack(void)
55 49
56 pr_emerg(" Depth Size Location (%d entries)\n" 50 pr_emerg(" Depth Size Location (%d entries)\n"
57 " ----- ---- --------\n", 51 " ----- ---- --------\n",
58 max_stack_trace.nr_entries - 1); 52 max_stack_trace.nr_entries);
59 53
60 for (i = 0; i < max_stack_trace.nr_entries; i++) { 54 for (i = 0; i < max_stack_trace.nr_entries; i++) {
61 if (stack_dump_trace[i] == ULONG_MAX) 55 if (stack_dump_trace[i] == ULONG_MAX)
@@ -77,7 +71,7 @@ check_stack(unsigned long ip, unsigned long *stack)
77 unsigned long this_size, flags; unsigned long *p, *top, *start; 71 unsigned long this_size, flags; unsigned long *p, *top, *start;
78 static int tracer_frame; 72 static int tracer_frame;
79 int frame_size = ACCESS_ONCE(tracer_frame); 73 int frame_size = ACCESS_ONCE(tracer_frame);
80 int i; 74 int i, x;
81 75
82 this_size = ((unsigned long)stack) & (THREAD_SIZE-1); 76 this_size = ((unsigned long)stack) & (THREAD_SIZE-1);
83 this_size = THREAD_SIZE - this_size; 77 this_size = THREAD_SIZE - this_size;
@@ -105,26 +99,20 @@ check_stack(unsigned long ip, unsigned long *stack)
105 max_stack_size = this_size; 99 max_stack_size = this_size;
106 100
107 max_stack_trace.nr_entries = 0; 101 max_stack_trace.nr_entries = 0;
108 102 max_stack_trace.skip = 3;
109 if (using_ftrace_ops_list_func())
110 max_stack_trace.skip = 4;
111 else
112 max_stack_trace.skip = 3;
113 103
114 save_stack_trace(&max_stack_trace); 104 save_stack_trace(&max_stack_trace);
115 105
116 /* 106 /* Skip over the overhead of the stack tracer itself */
117 * Add the passed in ip from the function tracer. 107 for (i = 0; i < max_stack_trace.nr_entries; i++) {
118 * Searching for this on the stack will skip over 108 if (stack_dump_trace[i] == ip)
119 * most of the overhead from the stack tracer itself. 109 break;
120 */ 110 }
121 stack_dump_trace[0] = ip;
122 max_stack_trace.nr_entries++;
123 111
124 /* 112 /*
125 * Now find where in the stack these are. 113 * Now find where in the stack these are.
126 */ 114 */
127 i = 0; 115 x = 0;
128 start = stack; 116 start = stack;
129 top = (unsigned long *) 117 top = (unsigned long *)
130 (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); 118 (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE);
@@ -139,12 +127,15 @@ check_stack(unsigned long ip, unsigned long *stack)
139 while (i < max_stack_trace.nr_entries) { 127 while (i < max_stack_trace.nr_entries) {
140 int found = 0; 128 int found = 0;
141 129
142 stack_dump_index[i] = this_size; 130 stack_dump_index[x] = this_size;
143 p = start; 131 p = start;
144 132
145 for (; p < top && i < max_stack_trace.nr_entries; p++) { 133 for (; p < top && i < max_stack_trace.nr_entries; p++) {
134 if (stack_dump_trace[i] == ULONG_MAX)
135 break;
146 if (*p == stack_dump_trace[i]) { 136 if (*p == stack_dump_trace[i]) {
147 this_size = stack_dump_index[i++] = 137 stack_dump_trace[x] = stack_dump_trace[i++];
138 this_size = stack_dump_index[x++] =
148 (top - p) * sizeof(unsigned long); 139 (top - p) * sizeof(unsigned long);
149 found = 1; 140 found = 1;
150 /* Start the search from here */ 141 /* Start the search from here */
@@ -156,7 +147,7 @@ check_stack(unsigned long ip, unsigned long *stack)
156 * out what that is, then figure it out 147 * out what that is, then figure it out
157 * now. 148 * now.
158 */ 149 */
159 if (unlikely(!tracer_frame) && i == 1) { 150 if (unlikely(!tracer_frame)) {
160 tracer_frame = (p - stack) * 151 tracer_frame = (p - stack) *
161 sizeof(unsigned long); 152 sizeof(unsigned long);
162 max_stack_size -= tracer_frame; 153 max_stack_size -= tracer_frame;
@@ -168,6 +159,10 @@ check_stack(unsigned long ip, unsigned long *stack)
168 i++; 159 i++;
169 } 160 }
170 161
162 max_stack_trace.nr_entries = x;
163 for (; x < i; x++)
164 stack_dump_trace[x] = ULONG_MAX;
165
171 if (task_stack_end_corrupted(current)) { 166 if (task_stack_end_corrupted(current)) {
172 print_max_stack(); 167 print_max_stack();
173 BUG(); 168 BUG();
@@ -192,24 +187,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
192 if (per_cpu(trace_active, cpu)++ != 0) 187 if (per_cpu(trace_active, cpu)++ != 0)
193 goto out; 188 goto out;
194 189
195 /* 190 ip += MCOUNT_INSN_SIZE;
196 * When fentry is used, the traced function does not get
197 * its stack frame set up, and we lose the parent.
198 * The ip is pretty useless because the function tracer
199 * was called before that function set up its stack frame.
200 * In this case, we use the parent ip.
201 *
202 * By adding the return address of either the parent ip
203 * or the current ip we can disregard most of the stack usage
204 * caused by the stack tracer itself.
205 *
206 * The function tracer always reports the address of where the
207 * mcount call was, but the stack will hold the return address.
208 */
209 if (fentry)
210 ip = parent_ip;
211 else
212 ip += MCOUNT_INSN_SIZE;
213 191
214 check_stack(ip, &stack); 192 check_stack(ip, &stack);
215 193
@@ -284,7 +262,7 @@ __next(struct seq_file *m, loff_t *pos)
284{ 262{
285 long n = *pos - 1; 263 long n = *pos - 1;
286 264
287 if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX) 265 if (n > max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX)
288 return NULL; 266 return NULL;
289 267
290 m->private = (void *)n; 268 m->private = (void *)n;
@@ -354,7 +332,7 @@ static int t_show(struct seq_file *m, void *v)
354 seq_printf(m, " Depth Size Location" 332 seq_printf(m, " Depth Size Location"
355 " (%d entries)\n" 333 " (%d entries)\n"
356 " ----- ---- --------\n", 334 " ----- ---- --------\n",
357 max_stack_trace.nr_entries - 1); 335 max_stack_trace.nr_entries);
358 336
359 if (!stack_tracer_enabled && !max_stack_size) 337 if (!stack_tracer_enabled && !max_stack_size)
360 print_disabled(m); 338 print_disabled(m);
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index aa1ea7b36fa8..d2f6d0be3503 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -601,7 +601,22 @@ static int probes_seq_show(struct seq_file *m, void *v)
601 601
602 seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, 602 seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system,
603 trace_event_name(&tu->tp.call)); 603 trace_event_name(&tu->tp.call));
604 seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); 604 seq_printf(m, " %s:", tu->filename);
605
606 /* Don't print "0x (null)" when offset is 0 */
607 if (tu->offset) {
608 seq_printf(m, "0x%p", (void *)tu->offset);
609 } else {
610 switch (sizeof(void *)) {
611 case 4:
612 seq_printf(m, "0x00000000");
613 break;
614 case 8:
615 default:
616 seq_printf(m, "0x0000000000000000");
617 break;
618 }
619 }
605 620
606 for (i = 0; i < tu->tp.nr_args; i++) 621 for (i = 0; i < tu->tp.nr_args; i++)
607 seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); 622 seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
@@ -1095,11 +1110,15 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
1095{ 1110{
1096 struct trace_event_call *call = &tu->tp.call; 1111 struct trace_event_call *call = &tu->tp.call;
1097 struct uprobe_trace_entry_head *entry; 1112 struct uprobe_trace_entry_head *entry;
1113 struct bpf_prog *prog = call->prog;
1098 struct hlist_head *head; 1114 struct hlist_head *head;
1099 void *data; 1115 void *data;
1100 int size, esize; 1116 int size, esize;
1101 int rctx; 1117 int rctx;
1102 1118
1119 if (prog && !trace_call_bpf(prog, regs))
1120 return;
1121
1103 esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); 1122 esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
1104 1123
1105 size = esize + tu->tp.size + dsize; 1124 size = esize + tu->tp.size + dsize;
@@ -1289,6 +1308,7 @@ static int register_uprobe_event(struct trace_uprobe *tu)
1289 return -ENODEV; 1308 return -ENODEV;
1290 } 1309 }
1291 1310
1311 call->flags = TRACE_EVENT_FL_UPROBE;
1292 call->class->reg = trace_uprobe_register; 1312 call->class->reg = trace_uprobe_register;
1293 call->data = tu; 1313 call->data = tu;
1294 ret = trace_add_event_call(call); 1314 ret = trace_add_event_call(call);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 4109f8320684..88fefa68c516 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -39,6 +39,7 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
39 cred->cap_inheritable = CAP_EMPTY_SET; 39 cred->cap_inheritable = CAP_EMPTY_SET;
40 cred->cap_permitted = CAP_FULL_SET; 40 cred->cap_permitted = CAP_FULL_SET;
41 cred->cap_effective = CAP_FULL_SET; 41 cred->cap_effective = CAP_FULL_SET;
42 cred->cap_ambient = CAP_EMPTY_SET;
42 cred->cap_bset = CAP_FULL_SET; 43 cred->cap_bset = CAP_FULL_SET;
43#ifdef CONFIG_KEYS 44#ifdef CONFIG_KEYS
44 key_put(cred->request_key_auth); 45 key_put(cred->request_key_auth);
@@ -976,8 +977,8 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
976 if (user_ns == current_user_ns()) 977 if (user_ns == current_user_ns())
977 return -EINVAL; 978 return -EINVAL;
978 979
979 /* Threaded processes may not enter a different user namespace */ 980 /* Tasks that share a thread group must share a user namespace */
980 if (atomic_read(&current->mm->mm_users) > 1) 981 if (!thread_group_empty(current))
981 return -EINVAL; 982 return -EINVAL;
982 983
983 if (current->fs->users != 1) 984 if (current->fs->users != 1)
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index a6ffa43f2993..64ed1c37bd1f 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -24,6 +24,7 @@
24#include <asm/irq_regs.h> 24#include <asm/irq_regs.h>
25#include <linux/kvm_para.h> 25#include <linux/kvm_para.h>
26#include <linux/perf_event.h> 26#include <linux/perf_event.h>
27#include <linux/kthread.h>
27 28
28/* 29/*
29 * The run state of the lockup detectors is controlled by the content of the 30 * The run state of the lockup detectors is controlled by the content of the
@@ -66,7 +67,26 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
66#define for_each_watchdog_cpu(cpu) \ 67#define for_each_watchdog_cpu(cpu) \
67 for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) 68 for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
68 69
70/*
71 * The 'watchdog_running' variable is set to 1 when the watchdog threads
72 * are registered/started and is set to 0 when the watchdog threads are
73 * unregistered/stopped, so it is an indicator of whether the threads exist.
74 */
69static int __read_mostly watchdog_running; 75static int __read_mostly watchdog_running;
76/*
77 * If a subsystem has a need to deactivate the watchdog temporarily, it
78 * can use the suspend/resume interface to achieve this. The content of
79 * the 'watchdog_suspended' variable reflects this state. Existing threads
80 * are parked/unparked by the lockup_detector_{suspend|resume} functions
81 * (see comment blocks pertaining to those functions for further details).
82 *
83 * 'watchdog_suspended' also prevents threads from being registered/started
84 * or unregistered/stopped via parameters in /proc/sys/kernel, so the state
85 * of 'watchdog_running' cannot change while the watchdog is deactivated
86 * temporarily (see related code in 'proc' handlers).
87 */
88static int __read_mostly watchdog_suspended;
89
70static u64 __read_mostly sample_period; 90static u64 __read_mostly sample_period;
71 91
72static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 92static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -613,46 +633,9 @@ static void watchdog_nmi_disable(unsigned int cpu)
613 } 633 }
614} 634}
615 635
616void watchdog_nmi_enable_all(void)
617{
618 int cpu;
619
620 mutex_lock(&watchdog_proc_mutex);
621
622 if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
623 goto unlock;
624
625 get_online_cpus();
626 for_each_watchdog_cpu(cpu)
627 watchdog_nmi_enable(cpu);
628 put_online_cpus();
629
630unlock:
631 mutex_unlock(&watchdog_proc_mutex);
632}
633
634void watchdog_nmi_disable_all(void)
635{
636 int cpu;
637
638 mutex_lock(&watchdog_proc_mutex);
639
640 if (!watchdog_running)
641 goto unlock;
642
643 get_online_cpus();
644 for_each_watchdog_cpu(cpu)
645 watchdog_nmi_disable(cpu);
646 put_online_cpus();
647
648unlock:
649 mutex_unlock(&watchdog_proc_mutex);
650}
651#else 636#else
652static int watchdog_nmi_enable(unsigned int cpu) { return 0; } 637static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
653static void watchdog_nmi_disable(unsigned int cpu) { return; } 638static void watchdog_nmi_disable(unsigned int cpu) { return; }
654void watchdog_nmi_enable_all(void) {}
655void watchdog_nmi_disable_all(void) {}
656#endif /* CONFIG_HARDLOCKUP_DETECTOR */ 639#endif /* CONFIG_HARDLOCKUP_DETECTOR */
657 640
658static struct smp_hotplug_thread watchdog_threads = { 641static struct smp_hotplug_thread watchdog_threads = {
@@ -666,46 +649,89 @@ static struct smp_hotplug_thread watchdog_threads = {
666 .unpark = watchdog_enable, 649 .unpark = watchdog_enable,
667}; 650};
668 651
669static void restart_watchdog_hrtimer(void *info) 652/*
653 * park all watchdog threads that are specified in 'watchdog_cpumask'
654 */
655static int watchdog_park_threads(void)
670{ 656{
671 struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); 657 int cpu, ret = 0;
672 int ret;
673 658
659 get_online_cpus();
660 for_each_watchdog_cpu(cpu) {
661 ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
662 if (ret)
663 break;
664 }
665 if (ret) {
666 for_each_watchdog_cpu(cpu)
667 kthread_unpark(per_cpu(softlockup_watchdog, cpu));
668 }
669 put_online_cpus();
670
671 return ret;
672}
673
674/*
675 * unpark all watchdog threads that are specified in 'watchdog_cpumask'
676 */
677static void watchdog_unpark_threads(void)
678{
679 int cpu;
680
681 get_online_cpus();
682 for_each_watchdog_cpu(cpu)
683 kthread_unpark(per_cpu(softlockup_watchdog, cpu));
684 put_online_cpus();
685}
686
687/*
688 * Suspend the hard and soft lockup detector by parking the watchdog threads.
689 */
690int lockup_detector_suspend(void)
691{
692 int ret = 0;
693
694 mutex_lock(&watchdog_proc_mutex);
674 /* 695 /*
675 * No need to cancel and restart hrtimer if it is currently executing 696 * Multiple suspend requests can be active in parallel (counted by
676 * because it will reprogram itself with the new period now. 697 * the 'watchdog_suspended' variable). If the watchdog threads are
677 * We should never see it unqueued here because we are running per-cpu 698 * running, the first caller takes care that they will be parked.
678 * with interrupts disabled. 699 * The state of 'watchdog_running' cannot change while a suspend
700 * request is active (see related code in 'proc' handlers).
679 */ 701 */
680 ret = hrtimer_try_to_cancel(hrtimer); 702 if (watchdog_running && !watchdog_suspended)
681 if (ret == 1) 703 ret = watchdog_park_threads();
682 hrtimer_start(hrtimer, ns_to_ktime(sample_period), 704
683 HRTIMER_MODE_REL_PINNED); 705 if (ret == 0)
706 watchdog_suspended++;
707
708 mutex_unlock(&watchdog_proc_mutex);
709
710 return ret;
684} 711}
685 712
686static void update_watchdog(int cpu) 713/*
714 * Resume the hard and soft lockup detector by unparking the watchdog threads.
715 */
716void lockup_detector_resume(void)
687{ 717{
718 mutex_lock(&watchdog_proc_mutex);
719
720 watchdog_suspended--;
688 /* 721 /*
689 * Make sure that perf event counter will adopt to a new 722 * The watchdog threads are unparked if they were previously running
690 * sampling period. Updating the sampling period directly would 723 * and if there is no more active suspend request.
691 * be much nicer but we do not have an API for that now so
692 * let's use a big hammer.
693 * Hrtimer will adopt the new period on the next tick but this
694 * might be late already so we have to restart the timer as well.
695 */ 724 */
696 watchdog_nmi_disable(cpu); 725 if (watchdog_running && !watchdog_suspended)
697 smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1); 726 watchdog_unpark_threads();
698 watchdog_nmi_enable(cpu); 727
728 mutex_unlock(&watchdog_proc_mutex);
699} 729}
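
lockup_detector_suspend() and lockup_detector_resume() give other subsystems a way to quiesce the watchdog threads temporarily. A hedged kernel-context usage sketch; the caller and the header location are assumptions, the two lockup_detector_* calls are the functions introduced above:

#include <linux/nmi.h>          /* assumed location of the declarations */

static int do_watchdog_unfriendly_work(void)
{
        int err;

        err = lockup_detector_suspend();        /* parks the watchdog threads */
        if (err)
                return err;                     /* could not park; bail out */

        /* ... long-running work that would otherwise trip the detectors ... */

        lockup_detector_resume();               /* unparks them again */
        return 0;
}
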
700 730
701static void update_watchdog_all_cpus(void) 731static void update_watchdog_all_cpus(void)
702{ 732{
703 int cpu; 733 watchdog_park_threads();
704 734 watchdog_unpark_threads();
705 get_online_cpus();
706 for_each_watchdog_cpu(cpu)
707 update_watchdog(cpu);
708 put_online_cpus();
709} 735}
710 736
711static int watchdog_enable_all_cpus(void) 737static int watchdog_enable_all_cpus(void)
@@ -713,15 +739,12 @@ static int watchdog_enable_all_cpus(void)
713 int err = 0; 739 int err = 0;
714 740
715 if (!watchdog_running) { 741 if (!watchdog_running) {
716 err = smpboot_register_percpu_thread(&watchdog_threads); 742 err = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
743 &watchdog_cpumask);
717 if (err) 744 if (err)
718 pr_err("Failed to create watchdog threads, disabled\n"); 745 pr_err("Failed to create watchdog threads, disabled\n");
719 else { 746 else
720 if (smpboot_update_cpumask_percpu_thread(
721 &watchdog_threads, &watchdog_cpumask))
722 pr_err("Failed to set cpumask for watchdog threads\n");
723 watchdog_running = 1; 747 watchdog_running = 1;
724 }
725 } else { 748 } else {
726 /* 749 /*
727 * Enable/disable the lockup detectors or 750 * Enable/disable the lockup detectors or
@@ -787,6 +810,12 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
787 810
788 mutex_lock(&watchdog_proc_mutex); 811 mutex_lock(&watchdog_proc_mutex);
789 812
813 if (watchdog_suspended) {
814 /* no parameter changes allowed while watchdog is suspended */
815 err = -EAGAIN;
816 goto out;
817 }
818
790 /* 819 /*
791 * If the parameter is being read return the state of the corresponding 820 * If the parameter is being read return the state of the corresponding
792 * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the 821 * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
@@ -872,6 +901,12 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
872 901
873 mutex_lock(&watchdog_proc_mutex); 902 mutex_lock(&watchdog_proc_mutex);
874 903
904 if (watchdog_suspended) {
905 /* no parameter changes allowed while watchdog is suspended */
906 err = -EAGAIN;
907 goto out;
908 }
909
875 old = ACCESS_ONCE(watchdog_thresh); 910 old = ACCESS_ONCE(watchdog_thresh);
876 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 911 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
877 912
@@ -903,6 +938,13 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
903 int err; 938 int err;
904 939
905 mutex_lock(&watchdog_proc_mutex); 940 mutex_lock(&watchdog_proc_mutex);
941
942 if (watchdog_suspended) {
943 /* no parameter changes allowed while watchdog is suspended */
944 err = -EAGAIN;
945 goto out;
946 }
947
906 err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); 948 err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
907 if (!err && write) { 949 if (!err && write) {
908 /* Remove impossible cpus to keep sysctl output cleaner. */ 950 /* Remove impossible cpus to keep sysctl output cleaner. */
@@ -920,6 +962,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
920 pr_err("cpumask update failed\n"); 962 pr_err("cpumask update failed\n");
921 } 963 }
922 } 964 }
965out:
923 mutex_unlock(&watchdog_proc_mutex); 966 mutex_unlock(&watchdog_proc_mutex);
924 return err; 967 return err;
925} 968}
@@ -932,10 +975,8 @@ void __init lockup_detector_init(void)
932 975
933#ifdef CONFIG_NO_HZ_FULL 976#ifdef CONFIG_NO_HZ_FULL
934 if (tick_nohz_full_enabled()) { 977 if (tick_nohz_full_enabled()) {
935 if (!cpumask_empty(tick_nohz_full_mask)) 978 pr_info("Disabling watchdog on nohz_full cores by default\n");
936 pr_info("Disabling watchdog on nohz_full cores by default\n"); 979 cpumask_copy(&watchdog_cpumask, housekeeping_mask);
937 cpumask_andnot(&watchdog_cpumask, cpu_possible_mask,
938 tick_nohz_full_mask);
939 } else 980 } else
940 cpumask_copy(&watchdog_cpumask, cpu_possible_mask); 981 cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
941#else 982#else
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4c4f06176f74..ca71582fcfab 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -338,20 +338,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
338#include <trace/events/workqueue.h> 338#include <trace/events/workqueue.h>
339 339
340#define assert_rcu_or_pool_mutex() \ 340#define assert_rcu_or_pool_mutex() \
341 rcu_lockdep_assert(rcu_read_lock_sched_held() || \ 341 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
342 lockdep_is_held(&wq_pool_mutex), \ 342 !lockdep_is_held(&wq_pool_mutex), \
343 "sched RCU or wq_pool_mutex should be held") 343 "sched RCU or wq_pool_mutex should be held")
344 344
345#define assert_rcu_or_wq_mutex(wq) \ 345#define assert_rcu_or_wq_mutex(wq) \
346 rcu_lockdep_assert(rcu_read_lock_sched_held() || \ 346 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
347 lockdep_is_held(&wq->mutex), \ 347 !lockdep_is_held(&wq->mutex), \
348 "sched RCU or wq->mutex should be held") 348 "sched RCU or wq->mutex should be held")
349 349
350#define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ 350#define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \
351 rcu_lockdep_assert(rcu_read_lock_sched_held() || \ 351 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
352 lockdep_is_held(&wq->mutex) || \ 352 !lockdep_is_held(&wq->mutex) && \
353 lockdep_is_held(&wq_pool_mutex), \ 353 !lockdep_is_held(&wq_pool_mutex), \
354 "sched RCU, wq->mutex or wq_pool_mutex should be held") 354 "sched RCU, wq->mutex or wq_pool_mutex should be held")
355 355
356#define for_each_cpu_worker_pool(pool, cpu) \ 356#define for_each_cpu_worker_pool(pool, cpu) \
357 for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ 357 for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
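
The conversion above flips the assertion's polarity: rcu_lockdep_assert() warned when its condition was false, while RCU_LOCKDEP_WARN() warns when its condition is true, so "A || B" (one of these must be held) becomes "!A && !B" (neither is held). A minimal sketch of the new form against an assumed example lock:

#include <linux/mutex.h>
#include <linux/rcupdate.h>

static DEFINE_MUTEX(my_mutex);          /* assumed example lock */

#define assert_rcu_or_my_mutex()                                        \
        RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
                         !lockdep_is_held(&my_mutex),                   \
                         "RCU or my_mutex should be held")
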
@@ -1714,9 +1714,7 @@ static struct worker *create_worker(struct worker_pool *pool)
1714 goto fail; 1714 goto fail;
1715 1715
1716 set_user_nice(worker->task, pool->attrs->nice); 1716 set_user_nice(worker->task, pool->attrs->nice);
1717 1717 kthread_bind_mask(worker->task, pool->attrs->cpumask);
1718 /* prevent userland from meddling with cpumask of workqueue workers */
1719 worker->task->flags |= PF_NO_SETAFFINITY;
1720 1718
1721 /* successful, attach the worker to the pool */ 1719 /* successful, attach the worker to the pool */
1722 worker_attach_to_pool(worker, pool); 1720 worker_attach_to_pool(worker, pool);
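
kthread_bind_mask() replaces the manual PF_NO_SETAFFINITY poking above: it pins a not-yet-running kthread to a cpumask and marks it so userspace cannot change the affinity later. A hedged kernel-context sketch of the pattern; everything except the kthread_* and wake_up_process() calls is illustrative:

#include <linux/cpumask.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static struct task_struct *start_pinned_worker(int (*fn)(void *), void *data,
                                               const struct cpumask *mask)
{
        struct task_struct *task;

        task = kthread_create(fn, data, "pinned_worker");
        if (IS_ERR(task))
                return task;

        kthread_bind_mask(task, mask);  /* affinity fixed before first run */
        wake_up_process(task);
        return task;
}
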
@@ -2614,7 +2612,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2614out_unlock: 2612out_unlock:
2615 mutex_unlock(&wq->mutex); 2613 mutex_unlock(&wq->mutex);
2616} 2614}
2617EXPORT_SYMBOL_GPL(flush_workqueue); 2615EXPORT_SYMBOL(flush_workqueue);
2618 2616
2619/** 2617/**
2620 * drain_workqueue - drain a workqueue 2618 * drain_workqueue - drain a workqueue
@@ -3856,7 +3854,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3856 } 3854 }
3857 3855
3858 wq->rescuer = rescuer; 3856 wq->rescuer = rescuer;
3859 rescuer->task->flags |= PF_NO_SETAFFINITY; 3857 kthread_bind_mask(rescuer->task, cpu_possible_mask);
3860 wake_up_process(rescuer->task); 3858 wake_up_process(rescuer->task);
3861 } 3859 }
3862 3860